1542 lines
62 KiB
C
1542 lines
62 KiB
C
/* Olivier Couvreur : 3/98 */
|
||
|
||
#if !defined(MTH_ASM_H)
|
||
#define MTH_ASM_H
|
||
#include "acp_base.h"
|
||
|
||
/* force OPTIMIZED_FOR_PC_FLOATS if OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
#if !defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define OPTIMIZED_FOR_PC_FLOATS
|
||
#endif
|
||
#endif
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vAddVector*/
|
||
/************************************************************************************************************************/
|
||
/* register load + 3 * (fld fadd) + 3 fstp = 2 + 3 * (1+1) + 3 * (2) = 14 clocks 13 instructions*/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
INLINE void MTH3D_M_vAddVectorASM(struct MTH3D_tdstVector_ *VectDest,struct MTH3D_tdstVector_ *VectA, struct MTH3D_tdstVector_ *VectB)
|
||
{
|
||
__asm
|
||
{
|
||
mov eax,VectA
|
||
mov ebx,VectB
|
||
mov ecx,VectDest
|
||
fld dword ptr [eax] /*; (VectA)->xX*/
|
||
fadd dword ptr [ebx] /*; (VectB)->xX*/
|
||
fld dword ptr [eax+4] /*; (VectA)->xY*/
|
||
fadd dword ptr [ebx+4] /*; (VectB)->xY*/
|
||
fld dword ptr [eax+8] /*; (VectA)->xZ*/
|
||
fadd dword ptr [ebx+8] /*; (VectB)->xZ*/
|
||
fxch st(2)
|
||
fstp dword ptr [ecx] /*; (VectDest)->xX*/
|
||
fstp dword ptr [ecx+4] /*; (VectDest)->xY*/
|
||
fstp dword ptr [ecx+8] /*; (VectDest)->xZ*/
|
||
}
|
||
}
|
||
#endif /* #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define MTH3D_M_vAddVectorC MTH3D_M_vAddVectorORG
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* 3 * (fld fadd fstp) = 3 * (1+1+2+3pen) = 21 clocks 9 instructions*/
|
||
/* Portable form: VectDest = VectA + VectB, one MTH_M_xAdd per component. */
/* Each argument is evaluated several times; avoid side effects in them.  */
#define MTH3D_M_vAddVectorORG( VectDest, VectA, VectB)                  \
{                                                                       \
    (VectDest)->xX = MTH_M_xAdd((VectA)->xX, (VectB)->xX);              \
    (VectDest)->xY = MTH_M_xAdd((VectA)->xY, (VectB)->xY);              \
    (VectDest)->xZ = MTH_M_xAdd((VectA)->xZ, (VectB)->xZ);              \
}
|
||
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vSubVector*/
|
||
/************************************************************************************************************************/
|
||
/* register load + 3 * (fld fsub) + 3 fstp = 2 + 3 * (1+1) + 3 * (2) = 14 clocks 13 instructions*/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
INLINE void MTH3D_M_vSubVectorASM(struct MTH3D_tdstVector_ *VectDest,struct MTH3D_tdstVector_ *VectA, struct MTH3D_tdstVector_ *VectB)
|
||
{
|
||
__asm
|
||
{
|
||
mov eax,VectA
|
||
mov ebx,VectB
|
||
mov ecx,VectDest
|
||
fld dword ptr [eax] /*; (VectA)->xX*/
|
||
fsub dword ptr [ebx] /*; (VectB)->xX*/
|
||
fld dword ptr [eax+4] /*; (VectA)->xY*/
|
||
fsub dword ptr [ebx+4] /*; (VectB)->xY*/
|
||
fld dword ptr [eax+8] /*; (VectA)->xZ*/
|
||
fsub dword ptr [ebx+8] /*; (VectB)->xZ*/
|
||
fxch st(2)
|
||
fstp dword ptr [ecx] /*; (VectDest)->xX*/
|
||
fstp dword ptr [ecx+4] /*; (VectDest)->xY*/
|
||
fstp dword ptr [ecx+8] /*; (VectDest)->xZ*/
|
||
}
|
||
}
|
||
#endif /* #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define MTH3D_M_vSubVectorC MTH3D_M_vSubVectorORG
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* 3 * (fld fsub fstp) = 3 * (1+1+2+3pen) = 21 clocks 9 instructions*/
|
||
/* Portable form: VectDest = VectA - VectB, one MTH_M_xSub per component. */
/* Each argument is evaluated several times; avoid side effects in them.  */
#define MTH3D_M_vSubVectorORG( VectDest, VectA, VectB)                  \
{                                                                       \
    (VectDest)->xX = MTH_M_xSub((VectA)->xX, (VectB)->xX);              \
    (VectDest)->xY = MTH_M_xSub((VectA)->xY, (VectB)->xY);              \
    (VectDest)->xZ = MTH_M_xSub((VectA)->xZ, (VectB)->xZ);              \
}
|
||
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vNegVector*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
#define MTH3D_M_vNegVectorASM MTH3D_M_vNegVectorC
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
/* only toggle sign bit : No fpu*/
|
||
/* 10 pairables instructions => 5 clocks*/
|
||
/* VectDest = -VectA without touching the FPU: each component is reinterpreted */
/* as a long and XORed with 0x80000000 to flip its top bit.                    */
/* NOTE(review): assumes long and the component type are both 32 bits wide and */
/* that bit 31 is the sign bit - confirm for every target that defines         */
/* OPTIMIZED_FOR_PC_FLOATS.                                                    */
#define MTH3D_M_vNegVectorC( VectDest, VectA)                                                           \
{                                                                                                       \
    register long lSignBitMTH3D_M_vNegVectorC = 0x80000000;                                             \
    *((long*) &((VectDest)->xX)) = *((long*) &((VectA)->xX)) ^ lSignBitMTH3D_M_vNegVectorC;             \
    *((long*) &((VectDest)->xY)) = *((long*) &((VectA)->xY)) ^ lSignBitMTH3D_M_vNegVectorC;             \
    *((long*) &((VectDest)->xZ)) = *((long*) &((VectA)->xZ)) ^ lSignBitMTH3D_M_vNegVectorC;             \
}
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* 3 * ( fld fchs fstp) = 3 * (1 + 1 + 1+1pen) = 12 clocks 9 instructions*/
|
||
/* Portable form: VectDest = -VectA, one MTH_M_xNeg per component. */
#define MTH3D_M_vNegVectorORG( VectDest, VectA)                         \
{                                                                       \
    (VectDest)->xX = MTH_M_xNeg( (VectA)->xX );                         \
    (VectDest)->xY = MTH_M_xNeg( (VectA)->xY );                         \
    (VectDest)->xZ = MTH_M_xNeg( (VectA)->xZ );                         \
}
|
||
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vAdd3ScalarVector*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
/* 14 clocks*/
|
||
/*
 * VectDest = VectA + (x, y, z): adds one scalar to each component
 * (x87 inline assembly). The scalars are read directly from the argument
 * slots as 32-bit floats; all sums are formed before the first store.
 */
INLINE void MTH3D_M_vAdd3ScalarVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA,MTH_tdxReal x,MTH_tdxReal y,MTH_tdxReal z)
{
    __asm
    {
        mov     eax, VectA
        mov     ecx, VectDest

        fld     dword ptr [eax]         /* Ax                    */
        fadd    dword ptr [x]           /* X = Ax + x            */
        fld     dword ptr [eax+4]       /* Ay X                  */
        fadd    dword ptr [y]           /* Y  X                  */
        fld     dword ptr [eax+8]       /* Az Y X                */
        fadd    dword ptr [z]           /* Z  Y X                */
        fxch    st(2)                   /* X  Y Z : store order  */

        fstp    dword ptr [ecx]
        fstp    dword ptr [ecx+4]
        fstp    dword ptr [ecx+8]
    }
}
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define MTH3D_M_vAdd3ScalarVectorC MTH3D_M_vAdd3ScalarVectorORG
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* 21 clocks*/
|
||
/* Portable form: VectDest = VectA + (x, y, z), one MTH_M_xAdd per component. */
#define MTH3D_M_vAdd3ScalarVectorORG( VectDest, VectA, x, y, z)         \
{                                                                       \
    (VectDest)->xX = MTH_M_xAdd( (VectA)->xX, (x) );                    \
    (VectDest)->xY = MTH_M_xAdd( (VectA)->xY, (y) );                    \
    (VectDest)->xZ = MTH_M_xAdd( (VectA)->xZ, (z) );                    \
}
|
||
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vMulScalarVector*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
/* 14 clocks*/
|
||
/*
 * VectDest = a * VectA (x87 inline assembly).
 * The scalar is read from its argument slot as a 32-bit float for each
 * component; the three products are formed before the first store.
 */
INLINE void MTH3D_M_vMulScalarVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal a,MTH3D_tdstVector *VectA)
{
    __asm
    {
        mov     eax, VectA
        mov     ebx, VectDest

        fld     dword ptr [eax]         /* Ax                    */
        fmul    dword ptr [a]           /* X = a.Ax              */
        fld     dword ptr [eax+4]       /* Ay X                  */
        fmul    dword ptr [a]           /* Y  X                  */
        fld     dword ptr [eax+8]       /* Az Y X                */
        fmul    dword ptr [a]           /* Z  Y X                */
        fxch    st(2)                   /* X  Y Z : store order  */

        fstp    dword ptr [ebx]
        fstp    dword ptr [ebx+4]
        fstp    dword ptr [ebx+8]
    }
}
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
/* VectDest = a * VectA. The scalar expression is evaluated once into a     */
/* local temporary, then multiplied into each component with MTH_M_xMul.    */
#define MTH3D_M_vMulScalarVectorC( VectDest, a, VectA)                                      \
{                                                                                           \
    register MTH_tdxReal xTempMTH3D_M_vMulScalarVectorC=(a);                                \
    (VectDest)->xX = MTH_M_xMul(xTempMTH3D_M_vMulScalarVectorC, (VectA)->xX);               \
    (VectDest)->xY = MTH_M_xMul(xTempMTH3D_M_vMulScalarVectorC, (VectA)->xY);               \
    (VectDest)->xZ = MTH_M_xMul(xTempMTH3D_M_vMulScalarVectorC, (VectA)->xZ);               \
}
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
|
||
/* Portable form: VectDest = a * VectA. The scalar expression (a) is */
/* expanded, and therefore evaluated, once per component.            */
#define MTH3D_M_vMulScalarVectorORG( VectDest, a, VectA)                \
{                                                                       \
    (VectDest)->xX = MTH_M_xMul((a), (VectA)->xX);                      \
    (VectDest)->xY = MTH_M_xMul((a), (VectA)->xY);                      \
    (VectDest)->xZ = MTH_M_xMul((a), (VectA)->xZ);                      \
}
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vDivScalarVector*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
#define MTH3D_M_vDivScalarVectorASM MTH3D_M_vDivScalarVectorC
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
/* Only one division*/
|
||
/* VectDest = VectA / a, done as one division (MTH_C_ONE / a) followed by */
/* a scalar multiplication through MTH3D_M_vMulScalarVector.              */
#define MTH3D_M_vDivScalarVectorC( VectDest, VectA, a)                                              \
{                                                                                                   \
    register MTH_tdxReal xTempMTH3D_M_vDivScalarVectorC=MTH_M_xDiv(MTH_C_ONE, (a));                 \
    MTH3D_M_vMulScalarVector( VectDest ,xTempMTH3D_M_vDivScalarVectorC, VectA);                     \
}
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* Portable form: VectDest = VectA / a, one MTH_M_xDiv per component. */
#define MTH3D_M_vDivScalarVectorORG( VectDest, VectA, a)                \
{                                                                       \
    (VectDest)->xX = MTH_M_xDiv((VectA)->xX, (a));                      \
    (VectDest)->xY = MTH_M_xDiv((VectA)->xY, (a));                      \
    (VectDest)->xZ = MTH_M_xDiv((VectA)->xZ, (a));                      \
}
|
||
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vScaleVector*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
/* 14 clocks*/
|
||
/*
 * VectDest = VectA * VectB, component-wise product (x87 inline assembly).
 * The three products are formed on the FPU stack before the first store.
 */
INLINE void MTH3D_M_vScaleVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA,MTH3D_tdstVector *VectB)
{
    __asm
    {
        mov     eax, VectA
        mov     ebx, VectB
        mov     ecx, VectDest

        fld     dword ptr [eax]         /* Ax                    */
        fmul    dword ptr [ebx]         /* X = Ax.Bx             */
        fld     dword ptr [eax+4]       /* Ay X                  */
        fmul    dword ptr [ebx+4]       /* Y  X                  */
        fld     dword ptr [eax+8]       /* Az Y X                */
        fmul    dword ptr [ebx+8]       /* Z  Y X                */
        fxch    st(2)                   /* X  Y Z : store order  */

        fstp    dword ptr [ecx]
        fstp    dword ptr [ecx+4]
        fstp    dword ptr [ecx+8]
    }
}
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define MTH3D_M_vScaleVectorC MTH3D_M_vScaleVectorORG
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* 21 clocks*/
|
||
/* Portable form: VectDest = VectA * VectB component-wise (MTH_M_xMul). */
#define MTH3D_M_vScaleVectorORG( VectDest, VectA, VectB )               \
{                                                                       \
    (VectDest)->xX = MTH_M_xMul( (VectA)->xX, (VectB)->xX);             \
    (VectDest)->xY = MTH_M_xMul( (VectA)->xY, (VectB)->xY);             \
    (VectDest)->xZ = MTH_M_xMul( (VectA)->xZ, (VectB)->xZ);             \
}
|
||
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vMulAddVector*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
/* 17 clocks : D=xA+B*/
|
||
/*
 * VectDest = x * VectA + VectB (x87 inline assembly).
 * The three products are started first; fxch then rotates each one to the
 * top of the stack so the matching VectB component can be added to it.
 * Stack comments list st(0) first.
 */
INLINE void MTH3D_M_vMulAddVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal x,MTH3D_tdstVector *VectA,MTH3D_tdstVector *VectB)
{
    __asm
    {
        mov     eax, VectA
        mov     ebx, VectB
        mov     ecx, VectDest

        fld     dword ptr [eax]         /* Ax                    */
        fmul    dword ptr [x]           /* xAx                   */
        fld     dword ptr [eax+4]       /* Ay  xAx               */
        fmul    dword ptr [x]           /* xAy xAx               */
        fxch    st(1)                   /* xAx xAy               */
        fld     dword ptr [eax+8]       /* Az  xAx xAy           */
        fmul    dword ptr [x]           /* xAz xAx xAy           */
        fxch    st(1)                   /* xAx xAz xAy           */
        fadd    dword ptr [ebx]         /* X   xAz xAy           */
        fxch    st(2)                   /* xAy xAz X             */
        fadd    dword ptr [ebx+4]       /* Y   xAz X             */
        fxch    st(1)                   /* xAz Y   X             */
        fadd    dword ptr [ebx+8]       /* Z   Y   X             */
        fxch    st(2)                   /* X   Y   Z             */

        fstp    dword ptr [ecx]
        fstp    dword ptr [ecx+4]
        fstp    dword ptr [ecx+8]
    }
}
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define MTH3D_M_vMulAddVectorC MTH3D_M_vMulAddVectorORG
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* 31 clocks*/
|
||
/* Portable form: VectDest = x * VectA + VectB, built from MTH_M_xMul and */
/* MTH_M_xAdd for each component.                                         */
#define MTH3D_M_vMulAddVectorORG( VectDest, x, VectA, VectB)                            \
{                                                                                       \
    (VectDest)->xX = MTH_M_xAdd(MTH_M_xMul((x),(VectA)->xX), (VectB)->xX);              \
    (VectDest)->xY = MTH_M_xAdd(MTH_M_xMul((x),(VectA)->xY), (VectB)->xY);              \
    (VectDest)->xZ = MTH_M_xAdd(MTH_M_xMul((x),(VectA)->xZ), (VectB)->xZ);              \
}
|
||
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vMul3AddVector*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
/* 32 clocks (2 penalties) : D=xA+yB+zC*/
|
||
/*
 * VectDest = x * VectA + y * VectB + z * VectC (x87 inline assembly).
 * Three running sums (Sx, Sy, Sz) stay on the FPU stack while the products
 * of the next vector are computed and folded in with faddp.
 * Stack comments list st(0) first.
 */
INLINE void MTH3D_M_vMul3AddVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal x,MTH3D_tdstVector *VectA,MTH_tdxReal y,MTH3D_tdstVector *VectB,MTH_tdxReal z,MTH3D_tdstVector *VectC)
{
    __asm
    {
        mov     eax, VectA
        mov     ebx, VectB
        mov     ecx, VectC
        mov     edx, VectDest

        /* x * VectA */
        fld     dword ptr [eax]
        fmul    dword ptr [x]
        fld     dword ptr [eax+4]
        fmul    dword ptr [x]
        fld     dword ptr [eax+8]
        fmul    dword ptr [x]           /* xAz xAy xAx                   */
        fxch    st(2)                   /* xAx xAy xAz                   */

        /* y * VectB */
        fld     dword ptr [ebx]
        fmul    dword ptr [y]
        fld     dword ptr [ebx+4]
        fmul    dword ptr [y]
        fld     dword ptr [ebx+8]
        fmul    dword ptr [y]           /* yBz yBy yBx xAx xAy xAz       */
        fxch    st(2)                   /* yBx yBy yBz xAx xAy xAz       */

        faddp   st(3), st               /* yBy yBz Sx  xAy xAz           */
        faddp   st(3), st               /* yBz Sx  Sy  xAz               */
        fld     dword ptr [ecx]
        fmul    dword ptr [z]           /* zCx yBz Sx  Sy  xAz           */
        fxch    st(1)                   /* yBz zCx Sx  Sy  xAz           */
        faddp   st(4), st               /* zCx Sx  Sy  Sz                */

        /* remaining z * VectC terms */
        fld     dword ptr [ecx+4]
        fmul    dword ptr [z]
        fld     dword ptr [ecx+8]
        fmul    dword ptr [z]           /* zCz zCy zCx Sx  Sy  Sz        */
        fxch    st(2)                   /* zCx zCy zCz Sx  Sy  Sz        */

        faddp   st(3), st               /* zCy zCz X   Sy  Sz            */
        faddp   st(3), st               /* zCz X   Y   Sz                */
        faddp   st(3), st               /* X   Y   Z                     */

        fstp    dword ptr [edx]
        fstp    dword ptr [edx+4]
        fstp    dword ptr [edx+8]
    }
}
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define MTH3D_M_vMul3AddVectorC MTH3D_M_vMul3AddVectorORG
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* at least 42 clocks*/
|
||
/* Portable form: VectDest = x * VectA + y * VectB + z * VectC.             */
/* Each component is one MTH_M_xMulAddMulAddMul call taking three           */
/* (factor, factor) pairs, i.e. six arguments, as used by the matrix        */
/* product macro in this file.                                              */
/* Fix: the previous text closed a parenthesis after every pair, so the     */
/* call was unbalanced and the macro could not expand to valid C.           */
#define MTH3D_M_vMul3AddVectorORG( VectDest, x, VectA, y, VectB, z, VectC)                                      \
{                                                                                                               \
    (VectDest)->xX = MTH_M_xMulAddMulAddMul((x),(VectA)->xX, (y),(VectB)->xX, (z),(VectC)->xX);                 \
    (VectDest)->xY = MTH_M_xMulAddMulAddMul((x),(VectA)->xY, (y),(VectB)->xY, (z),(VectC)->xY);                 \
    (VectDest)->xZ = MTH_M_xMulAddMulAddMul((x),(VectA)->xZ, (y),(VectB)->xZ, (z),(VectC)->xZ);                 \
}
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vMul4AddVector*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
/* 40 clocks (0 penalties) : E=xA+yB+zC+wD*/
|
||
/*
 * VectDest = x * VectA + y * VectB + z * VectC + w * VectD
 * (x87 inline assembly).
 * Three running sums (Sx, Sy, Sz) stay on the FPU stack while the products
 * of the next vector are computed and folded in with faddp. edx first holds
 * VectD and is reloaded with VectDest once VectD has been fully read.
 * Stack comments list st(0) first.
 */
INLINE void MTH3D_M_vMul4AddVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal x,MTH3D_tdstVector *VectA,MTH_tdxReal y,MTH3D_tdstVector *VectB,MTH_tdxReal z,MTH3D_tdstVector *VectC,MTH_tdxReal w,MTH3D_tdstVector *VectD)
{
    __asm
    {
        mov     eax, VectA
        mov     ebx, VectB
        mov     ecx, VectC
        mov     edx, VectD

        /* x * VectA */
        fld     dword ptr [eax]
        fmul    dword ptr [x]
        fld     dword ptr [eax+4]
        fmul    dword ptr [x]
        fld     dword ptr [eax+8]
        fmul    dword ptr [x]           /* xAz xAy xAx                   */
        fxch    st(2)                   /* xAx xAy xAz                   */

        /* y * VectB */
        fld     dword ptr [ebx]
        fmul    dword ptr [y]
        fld     dword ptr [ebx+4]
        fmul    dword ptr [y]
        fld     dword ptr [ebx+8]
        fmul    dword ptr [y]           /* yBz yBy yBx xAx xAy xAz       */
        fxch    st(2)                   /* yBx yBy yBz xAx xAy xAz       */

        faddp   st(3), st               /* yBy yBz Sx  xAy xAz           */
        faddp   st(3), st               /* yBz Sx  Sy  xAz               */
        fld     dword ptr [ecx]
        fmul    dword ptr [z]           /* zCx yBz Sx  Sy  xAz           */
        fxch    st(1)                   /* yBz zCx Sx  Sy  xAz           */
        faddp   st(4), st               /* zCx Sx  Sy  Sz                */

        /* remaining z * VectC terms */
        fld     dword ptr [ecx+4]
        fmul    dword ptr [z]
        fld     dword ptr [ecx+8]
        fmul    dword ptr [z]           /* zCz zCy zCx Sx  Sy  Sz        */
        fxch    st(2)                   /* zCx zCy zCz Sx  Sy  Sz        */

        faddp   st(3), st               /* zCy zCz Sx  Sy  Sz            */
        faddp   st(3), st               /* zCz Sx  Sy  Sz                */
        fld     dword ptr [edx]
        fmul    dword ptr [w]           /* wDx zCz Sx  Sy  Sz            */
        fxch    st(1)                   /* zCz wDx Sx  Sy  Sz            */
        faddp   st(4), st               /* wDx Sx  Sy  Sz                */

        /* remaining w * VectD terms */
        fld     dword ptr [edx+4]
        fmul    dword ptr [w]
        fld     dword ptr [edx+8]
        fmul    dword ptr [w]           /* wDz wDy wDx Sx  Sy  Sz        */
        fxch    st(2)                   /* wDx wDy wDz Sx  Sy  Sz        */

        faddp   st(3), st               /* wDy wDz X   Sy  Sz            */
        faddp   st(3), st               /* wDz X   Y   Sz                */
        mov     edx, VectDest           /* VectD no longer needed        */
        faddp   st(3), st               /* X   Y   Z                     */

        fstp    dword ptr [edx]
        fstp    dword ptr [edx+4]
        fstp    dword ptr [edx+8]
    }
}
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define MTH3D_M_vMul4AddVectorC MTH3D_M_vMul4AddVectorORG
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* at least 56 clocks*/
|
||
/* Portable form: VectDest = x * VectA + y * VectB + z * VectC + w * VectD. */
/* The first three products go through one six-argument                     */
/* MTH_M_xMulAddMulAddMul call; the w * VectD product is then added with    */
/* MTH_M_xAdd.                                                              */
/* Fix: the previous text closed a parenthesis after every pair, so the     */
/* calls were unbalanced and the macro could not expand to valid C.         */
#define MTH3D_M_vMul4AddVectorORG( VectDest, x, VectA, y, VectB, z, VectC, w, VectD)                            \
{                                                                                                               \
    (VectDest)->xX = MTH_M_xAdd(                                                                                \
        MTH_M_xMulAddMulAddMul((x),(VectA)->xX, (y),(VectB)->xX, (z),(VectC)->xX),                              \
        MTH_M_xMul((w),(VectD)->xX));                                                                           \
    (VectDest)->xY = MTH_M_xAdd(                                                                                \
        MTH_M_xMulAddMulAddMul((x),(VectA)->xY, (y),(VectB)->xY, (z),(VectC)->xY),                              \
        MTH_M_xMul((w),(VectD)->xY));                                                                           \
    (VectDest)->xZ = MTH_M_xAdd(                                                                                \
        MTH_M_xMulAddMulAddMul((x),(VectA)->xZ, (y),(VectB)->xZ, (z),(VectC)->xZ),                              \
        MTH_M_xMul((w),(VectD)->xZ));                                                                           \
}
|
||
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vLinearInterpolVector*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
/* 21 clocks*/
|
||
/*
 * VectDest = VectA + t * (VectB - VectA), per component (x87 inline asm).
 * With D = B - A and C the result: Cx = Ax + t.Dx, Cy = Ay + t.Dy,
 * Cz = Az + t.Dz. Every load from VectA/VectB happens before the first
 * store to VectDest. Stack comments list st(0) first.
 */
INLINE void MTH3D_M_vLinearInterpolVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA, MTH3D_tdstVector *VectB, MTH_tdxReal t)
{
    __asm
    {
        mov     ebx, VectB
        mov     eax, VectA
        mov     ecx, VectDest

        /* differences D = B - A */
        fld     dword ptr [ebx]         /* Bx                            */
        fsub    dword ptr [eax]         /* Dx                            */
        fld     dword ptr [ebx+4]       /* By  Dx                        */
        fsub    dword ptr [eax+4]       /* Dy  Dx                        */
        fld     dword ptr [ebx+8]       /* Bz  Dy  Dx                    */
        fsub    dword ptr [eax+8]       /* Dz  Dy  Dx                    */

        /* scale by t, interleaved with reloading A */
        fxch    st(2)                   /* Dx  Dy  Dz                    */
        fmul    dword ptr [t]           /* tDx Dy  Dz                    */
        fld     dword ptr [eax+4]       /* Ay  tDx Dy  Dz                */
        fxch    st(2)                   /* Dy  tDx Ay  Dz                */
        fmul    dword ptr [t]           /* tDy tDx Ay  Dz                */
        fld     dword ptr [eax]         /* Ax  tDy tDx Ay  Dz            */
        fxch    st(4)                   /* Dz  tDy tDx Ay  Ax            */
        fmul    dword ptr [t]           /* tDz tDy tDx Ay  Ax            */

        /* C = A + tD */
        fxch    st(2)                   /* tDx tDy tDz Ay  Ax            */
        faddp   st(4), st               /* tDy tDz Ay  Cx                */
        faddp   st(2), st               /* tDz Cy  Cx                    */
        fld     dword ptr [eax+8]       /* Az  tDz Cy  Cx                */
        faddp   st(1), st               /* Cz  Cy  Cx                    */
        fxch    st(2)                   /* Cx  Cy  Cz                    */

        fstp    dword ptr [ecx]         /* Cy  Cz                        */
        fstp    dword ptr [ecx+4]       /* Cz                            */
        fstp    dword ptr [ecx+8]
    }
}
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define MTH3D_M_vLinearInterpolVectorC MTH3D_M_vLinearInterpolVectorORG
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* 39 clocks */
|
||
/* Portable form: each component of VectDest is                          */
/* MTH_M_xLinearInterpol(A component, B component, t).                   */
#define MTH3D_M_vLinearInterpolVectorORG( VectDest, VectA, VectB, t )                   \
{                                                                                       \
    (VectDest)->xX = MTH_M_xLinearInterpol( (VectA)->xX, (VectB)->xX, (t) );            \
    (VectDest)->xY = MTH_M_xLinearInterpol( (VectA)->xY, (VectB)->xY, (t) );            \
    (VectDest)->xZ = MTH_M_xLinearInterpol( (VectA)->xZ, (VectB)->xZ, (t) );            \
}
|
||
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vLinearScaleVector*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
/* 21 clocks*/
|
||
/*
 * Per-component interpolation with a per-component weight (x87 inline asm):
 *   x = Ax + Cx.(Bx - Ax),  y = Ay + Cy.(By - Ay),  z = Az + Cz.(Bz - Az)
 * D denotes B - A below. Every load from the sources happens before the
 * first store to VectDest. Stack comments list st(0) first.
 */
INLINE void MTH3D_M_vLinearScaleVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA, MTH3D_tdstVector *VectB, MTH3D_tdstVector *VectC)
{
    __asm
    {
        mov     ebx, VectB
        mov     eax, VectA
        mov     ecx, VectDest
        mov     edx, VectC

        /* differences D = B - A */
        fld     dword ptr [ebx]         /* Bx                            */
        fsub    dword ptr [eax]         /* Dx                            */
        fld     dword ptr [ebx+4]       /* By   Dx                       */
        fsub    dword ptr [eax+4]       /* Dy   Dx                       */
        fld     dword ptr [ebx+8]       /* Bz   Dy   Dx                  */
        fsub    dword ptr [eax+8]       /* Dz   Dy   Dx                  */

        /* weight by C, interleaved with reloading A */
        fxch    st(2)                   /* Dx   Dy   Dz                  */
        fmul    dword ptr [edx]         /* CxDx Dy   Dz                  */
        fld     dword ptr [eax+4]       /* Ay   CxDx Dy   Dz             */
        fxch    st(2)                   /* Dy   CxDx Ay   Dz             */
        fmul    dword ptr [edx+4]       /* CyDy CxDx Ay   Dz             */
        fld     dword ptr [eax]         /* Ax   CyDy CxDx Ay   Dz        */
        fxch    st(4)                   /* Dz   CyDy CxDx Ay   Ax        */
        fmul    dword ptr [edx+8]       /* CzDz CyDy CxDx Ay   Ax        */

        /* result = A + C.D */
        fxch    st(2)                   /* CxDx CyDy CzDz Ay   Ax        */
        faddp   st(4), st               /* CyDy CzDz Ay   x              */
        faddp   st(2), st               /* CzDz y    x                   */
        fld     dword ptr [eax+8]       /* Az   CzDz y    x              */
        faddp   st(1), st               /* z    y    x                   */
        fxch    st(2)                   /* x    y    z                   */

        fstp    dword ptr [ecx]         /* y    z                        */
        fstp    dword ptr [ecx+4]       /* z                             */
        fstp    dword ptr [ecx+8]
    }
}
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define MTH3D_M_vLinearScaleVectorC MTH3D_M_vLinearScaleVectorORG
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* 39 clocks */
|
||
/* Portable form: each component of VectDest is                              */
/* MTH_M_xLinearInterpol(A component, B component, C component).             */
#define MTH3D_M_vLinearScaleVectorORG( VectDest, VectA, VectB, VectC )                          \
{                                                                                               \
    (VectDest)->xX = MTH_M_xLinearInterpol( (VectA)->xX, (VectB)->xX, (VectC)->xX );            \
    (VectDest)->xY = MTH_M_xLinearInterpol( (VectA)->xY, (VectB)->xY, (VectC)->xY );            \
    (VectDest)->xZ = MTH_M_xLinearInterpol( (VectA)->xZ, (VectB)->xZ, (VectC)->xZ );            \
}
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_xDotProductVector*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
/* 12 clocks (2 penalties) 11 instructions*/
|
||
#pragma warning(disable:4035)
|
||
INLINE
|
||
MTH_tdxReal MTH3D_M_xDotProductVectorASM(struct MTH3D_tdstVector_ *VectA,struct MTH3D_tdstVector_ *VectB)
|
||
{
|
||
register MTH_tdxReal xDot;
|
||
__asm
|
||
{
|
||
mov eax,VectA
|
||
mov ebx,VectB
|
||
fld dword ptr [eax]
|
||
fmul dword ptr [ebx]
|
||
fld dword ptr [eax+4]
|
||
fmul dword ptr [ebx+4]
|
||
fld dword ptr [eax+8]
|
||
fmul dword ptr [ebx+8]
|
||
fxch st(1)
|
||
faddp st(2),st
|
||
faddp st(1),st /* 2 unavoidable penalties*/
|
||
fstp [xDot]
|
||
}
|
||
return(xDot);
|
||
}
|
||
#pragma warning(default:4035)
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define MTH3D_M_xDotProductVectorC MTH3D_M_xDotProductVectorORG
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* 15 clocks (5 penalties) 12 instructions*/
|
||
/* Portable form (expression macro): (Ax.Bx + Ay.By) + Az.Bz, built from */
/* MTH_M_xMul and MTH_M_xAdd in that association order.                  */
#define MTH3D_M_xDotProductVectorORG( VectA, VectB)                     \
    MTH_M_xAdd(                                                         \
        MTH_M_xAdd( MTH_M_xMul((VectA)->xX, (VectB)->xX),               \
                    MTH_M_xMul((VectA)->xY, (VectB)->xY) ),             \
        MTH_M_xMul((VectA)->xZ, (VectB)->xZ) )
|
||
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vCrossProductVectorWithoutBuffer*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
/* requires no temp buffer if VectA or VectB == VectDest*/
|
||
/* 24 clocks (1 penalty) 23 instructions : 8.70 % pairing*/
|
||
INLINE void MTH3D_M_vCrossProductVectorWithoutBufferASM(struct MTH3D_tdstVector_ *VectDest,struct MTH3D_tdstVector_ *VectA,struct MTH3D_tdstVector_ *VectB)
|
||
{
|
||
__asm
|
||
{
|
||
mov eax,VectA
|
||
mov ebx,VectB
|
||
mov edx,VectDest
|
||
|
||
fld dword ptr [eax+4]
|
||
fmul dword ptr [ebx+8]
|
||
fld dword ptr [eax+8]
|
||
fmul dword ptr [ebx]
|
||
fld dword ptr [eax]
|
||
fmul dword ptr [ebx+4]
|
||
fld dword ptr [eax+8]
|
||
fmul dword ptr [ebx+4]
|
||
fld dword ptr [eax]
|
||
fmul dword ptr [ebx+8]
|
||
fld dword ptr [eax+4]
|
||
fmul dword ptr [ebx]
|
||
fxch st(2)
|
||
fsubp st(5),st
|
||
fsubp st(3),st
|
||
fsubp st(1),st
|
||
fxch st(2)
|
||
fstp dword ptr [edx] /* 1 penalty here : unavoidable !*/
|
||
fstp dword ptr [edx+4]
|
||
fstp dword ptr [edx+8]
|
||
}
|
||
}
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define MTH3D_M_vCrossProductVectorWithoutBufferC MTH3D_M_vCrossProductVectorWithoutBufferORG
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* 42 clocks (18 penalties) 25 instructions : 8.00 % pairing*/
|
||
/* Portable form: VectDest = VectA x VectB, one MTH_M_xMulSubMul per         */
/* component. Components are written one after another, so later lines read  */
/* already-overwritten values if VectDest is the same vector as VectA or     */
/* VectB.                                                                    */
#define MTH3D_M_vCrossProductVectorWithoutBufferORG(VectDest, VectA, VectB)                             \
{                                                                                                       \
    (VectDest)->xX=MTH_M_xMulSubMul((VectA)->xY,(VectB)->xZ,(VectA)->xZ,(VectB)->xY);                   \
    (VectDest)->xY=MTH_M_xMulSubMul((VectA)->xZ,(VectB)->xX,(VectA)->xX,(VectB)->xZ);                   \
    (VectDest)->xZ=MTH_M_xMulSubMul((VectA)->xX,(VectB)->xY,(VectA)->xY,(VectB)->xX);                   \
}
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vCrossProductVector*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
#define MTH3D_M_vCrossProductVectorASM MTH3D_M_vCrossProductVectorWithoutBufferASM
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define MTH3D_M_vCrossProductVectorC MTH3D_M_vCrossProductVectorWithoutBufferC
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
|
||
/* Portable cross product that tolerates VectDest being the same vector as   */
/* VectA or VectB: in that case the result is computed into a local          */
/* MTH3D_tdstVector and copied out with MTH3D_M_vCopyVector; otherwise the   */
/* WithoutBuffer variant writes straight into VectDest.                      */
/* Fix: the macro arguments are now parenthesized wherever they are used, so */
/* an expression argument (e.g. a conditional or pointer arithmetic) cannot  */
/* re-associate with the == comparisons.                                     */
#define MTH3D_M_vCrossProductVectorORG(VectDest, VectA, VectB)                          \
{                                                                                       \
    if( ((VectDest)==(VectA)) || ((VectDest)==(VectB)) )                                \
    {                                                                                   \
        MTH3D_tdstVector VectTmp;                                                       \
        MTH3D_M_vCrossProductVectorWithoutBuffer(&VectTmp, (VectA), (VectB));           \
        MTH3D_M_vCopyVector((VectDest), &VectTmp);                                      \
    }                                                                                   \
    else                                                                                \
    {                                                                                   \
        MTH3D_M_vCrossProductVectorWithoutBuffer((VectDest), (VectA), (VectB));         \
    }                                                                                   \
}
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vMulMatrixMatrixWithoutBuffer*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
|
||
/* requires a buffer only if MatDest==A*/
|
||
/* 91 clocks (0 penalty) 95 instructions : 4.21 % pairing*/
|
||
/* tricks remove penalties*/
|
||
INLINE void MTH3D_M_vMulMatrixMatrixWithoutBufferASM(struct MTH3D_tdstMatrix_ *MatDest,struct MTH3D_tdstMatrix_ *MatA,struct MTH3D_tdstMatrix_ *MatB)
|
||
{
|
||
__asm
|
||
{
|
||
mov ebx,MatB
|
||
mov eax,MatA
|
||
mov ecx,MatDest
|
||
/**/
|
||
fld dword ptr [ebx]
|
||
fmul dword ptr [eax]
|
||
fld dword ptr [ebx]
|
||
fmul dword ptr [eax+4]
|
||
fld dword ptr [ebx]
|
||
fmul dword ptr [eax+8]
|
||
|
||
fld dword ptr [ebx+4]
|
||
fmul dword ptr [eax+12]
|
||
fld dword ptr [ebx+4]
|
||
fmul dword ptr [eax+16]
|
||
fld dword ptr [ebx+4]
|
||
fmul dword ptr [eax+20]
|
||
|
||
fxch st(2)
|
||
faddp st(5),st
|
||
faddp st(3),st
|
||
faddp st(1),st
|
||
|
||
fld dword ptr [ebx+8]
|
||
fmul dword ptr [eax+24]
|
||
fld dword ptr [ebx+8]
|
||
fmul dword ptr [eax+28]
|
||
fld dword ptr [ebx+8]
|
||
fmul dword ptr [eax+32]
|
||
|
||
fxch st(2)
|
||
faddp st(5),st
|
||
faddp st(3),st
|
||
faddp st(1),st
|
||
|
||
fxch st(1) /* trick A : preload next value*/
|
||
fld dword ptr [ebx+12]
|
||
fxch st(3)
|
||
|
||
fstp dword ptr [ecx] /* A no more penalty here*/
|
||
fstp dword ptr [ecx+4]
|
||
fstp dword ptr [ecx+8]
|
||
/**/
|
||
fmul dword ptr [eax]
|
||
fld dword ptr [ebx+12]
|
||
fmul dword ptr [eax+4]
|
||
fld dword ptr [ebx+12]
|
||
fmul dword ptr [eax+8]
|
||
|
||
fld dword ptr [ebx+16]
|
||
fmul dword ptr [eax+12]
|
||
fld dword ptr [ebx+16]
|
||
fmul dword ptr [eax+16]
|
||
fld dword ptr [ebx+16]
|
||
fmul dword ptr [eax+20]
|
||
|
||
fxch st(2)
|
||
faddp st(5),st
|
||
faddp st(3),st
|
||
faddp st(1),st
|
||
|
||
fld dword ptr [ebx+20]
|
||
fmul dword ptr [eax+24]
|
||
fld dword ptr [ebx+20]
|
||
fmul dword ptr [eax+28]
|
||
fld dword ptr [ebx+20]
|
||
fmul dword ptr [eax+32]
|
||
|
||
fxch st(2)
|
||
faddp st(5),st
|
||
faddp st(3),st
|
||
faddp st(1),st
|
||
|
||
fxch st(2) /* trick B : preload next value*/
|
||
/* trick C : replace fxch st(1)*/
|
||
fld dword ptr [ebx+24]
|
||
fmul dword ptr [eax]
|
||
fxch st(2) /* trick C : replace fxch st(3)*/
|
||
|
||
fstp dword ptr [ecx+16]
|
||
fstp dword ptr [ecx+12] /* B: no more penalty here*/
|
||
|
||
/*fstp dword ptr [ecx+20] // trick C : store it later*/
|
||
/**/
|
||
fld dword ptr [ebx+24]
|
||
fmul dword ptr [eax+4]
|
||
fld dword ptr [ebx+24]
|
||
fmul dword ptr [eax+8]
|
||
|
||
fld dword ptr [ebx+28]
|
||
fmul dword ptr [eax+12]
|
||
fld dword ptr [ebx+28]
|
||
fmul dword ptr [eax+16]
|
||
fld dword ptr [ebx+28]
|
||
fmul dword ptr [eax+20]
|
||
|
||
fxch st(2)
|
||
faddp st(5),st
|
||
faddp st(3),st
|
||
faddp st(1),st
|
||
|
||
fxch st(3) /* trick C :added*/
|
||
fld dword ptr [ebx+32]
|
||
fmul dword ptr [eax+24]
|
||
fld dword ptr [ebx+32]
|
||
fmul dword ptr [eax+28]
|
||
fld dword ptr [ebx+32]
|
||
fmul dword ptr [eax+32]
|
||
|
||
fxch st(2)
|
||
faddp st(5),st
|
||
faddp st(3),st
|
||
faddp st(4),st /* trick C : replace faddp st(1),st*/
|
||
|
||
fstp dword ptr [ecx+20] /* trick C : store it later*/
|
||
|
||
/* no more penalty here*/
|
||
fstp dword ptr [ecx+28] /* trick C : swapped stores*/
|
||
fstp dword ptr [ecx+24]
|
||
fstp dword ptr [ecx+32]
|
||
}
|
||
}
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
|
||
#define MTH3D_M_vMulMatrixMatrixWithoutBufferC MTH3D_M_vMulMatrixMatrixWithoutBufferORG
|
||
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* 174 clocks (69 penalties) 119 instructions : 1.68 % pairing*/
|
||
/* Portable 3x3 matrix product. For each column j of MatB and each component  */
/* r (xX, xY, xZ):                                                            */
/*   MatDest.col_j.r = A.col_0.r * B.col_j.xX                                 */
/*                   + A.col_1.r * B.col_j.xY                                 */
/*                   + A.col_2.r * B.col_j.xZ                                 */
/* evaluated by one MTH_M_xMulAddMulAddMul call per element. Elements of      */
/* MatDest are written while MatA and MatB are still being read, hence the    */
/* "WithoutBuffer" name.                                                      */
#define MTH3D_M_vMulMatrixMatrixWithoutBufferORG(MatDest, MatA, MatB)           \
{                                                                               \
    /* column 0 */                                                              \
    (MatDest)->stCol_0.xX = MTH_M_xMulAddMulAddMul(                             \
        (MatA)->stCol_0.xX, (MatB)->stCol_0.xX,                                 \
        (MatA)->stCol_1.xX, (MatB)->stCol_0.xY,                                 \
        (MatA)->stCol_2.xX, (MatB)->stCol_0.xZ );                               \
    (MatDest)->stCol_0.xY = MTH_M_xMulAddMulAddMul(                             \
        (MatA)->stCol_0.xY, (MatB)->stCol_0.xX,                                 \
        (MatA)->stCol_1.xY, (MatB)->stCol_0.xY,                                 \
        (MatA)->stCol_2.xY, (MatB)->stCol_0.xZ );                               \
    (MatDest)->stCol_0.xZ = MTH_M_xMulAddMulAddMul(                             \
        (MatA)->stCol_0.xZ, (MatB)->stCol_0.xX,                                 \
        (MatA)->stCol_1.xZ, (MatB)->stCol_0.xY,                                 \
        (MatA)->stCol_2.xZ, (MatB)->stCol_0.xZ );                               \
                                                                                \
    /* column 1 */                                                              \
    (MatDest)->stCol_1.xX = MTH_M_xMulAddMulAddMul(                             \
        (MatA)->stCol_0.xX, (MatB)->stCol_1.xX,                                 \
        (MatA)->stCol_1.xX, (MatB)->stCol_1.xY,                                 \
        (MatA)->stCol_2.xX, (MatB)->stCol_1.xZ );                               \
    (MatDest)->stCol_1.xY = MTH_M_xMulAddMulAddMul(                             \
        (MatA)->stCol_0.xY, (MatB)->stCol_1.xX,                                 \
        (MatA)->stCol_1.xY, (MatB)->stCol_1.xY,                                 \
        (MatA)->stCol_2.xY, (MatB)->stCol_1.xZ );                               \
    (MatDest)->stCol_1.xZ = MTH_M_xMulAddMulAddMul(                             \
        (MatA)->stCol_0.xZ, (MatB)->stCol_1.xX,                                 \
        (MatA)->stCol_1.xZ, (MatB)->stCol_1.xY,                                 \
        (MatA)->stCol_2.xZ, (MatB)->stCol_1.xZ );                               \
                                                                                \
    /* column 2 */                                                              \
    (MatDest)->stCol_2.xX = MTH_M_xMulAddMulAddMul(                             \
        (MatA)->stCol_0.xX, (MatB)->stCol_2.xX,                                 \
        (MatA)->stCol_1.xX, (MatB)->stCol_2.xY,                                 \
        (MatA)->stCol_2.xX, (MatB)->stCol_2.xZ );                               \
    (MatDest)->stCol_2.xY = MTH_M_xMulAddMulAddMul(                             \
        (MatA)->stCol_0.xY, (MatB)->stCol_2.xX,                                 \
        (MatA)->stCol_1.xY, (MatB)->stCol_2.xY,                                 \
        (MatA)->stCol_2.xY, (MatB)->stCol_2.xZ );                               \
    (MatDest)->stCol_2.xZ = MTH_M_xMulAddMulAddMul(                             \
        (MatA)->stCol_0.xZ, (MatB)->stCol_2.xX,                                 \
        (MatA)->stCol_1.xZ, (MatB)->stCol_2.xY,                                 \
        (MatA)->stCol_2.xZ, (MatB)->stCol_2.xZ );                               \
}
|
||
|
||
|
||
#if defined(OPTIMIZED_FOR_U64_ASM)
/* Asm text computing ONE column of MatDest = MatA x MatB on the MIPS FPU.*/
/* OFF_X / OFF_Y / OFF_Z are string literals holding the byte offsets of the xX / xY / xZ*/
/* components of the column; they are used both to load the column from MatB (%2) and to store*/
/* the result into MatDest (%0). In the asm comments Mat[r,c] is component r of column c of*/
/* MatA (%1) and Vect[] is the current MatB column.*/
/* Every load of the sequence is issued before its first store.*/
#define MTH3D_M_U64_MUL_MATRIX_COLUMN(OFF_X, OFF_Y, OFF_Z) \
    "  lwc1    $f6," OFF_X "(%2)                                                                              \n" \
    "  lwc1    $f8," OFF_Y "(%2)                                                                              \n" \
    "  lwc1    $f10," OFF_Z "(%2)                                                                             \n" \
    "  lwc1    $f0,0(%1)        # f0  <- Mat[0,0]                                                        \n" \
    "  lwc1    $f2,12(%1)       # f2  <- Mat[0,1]                                                        \n" \
    "  mul.s   $f0,$f0,$f6      # f0  <- Mat[0,0] x Vect[0]                                              \n" \
    "  lwc1    $f4,24(%1)       # f4  <- Mat[0,2]                                                        \n" \
    "  mul.s   $f2,$f2,$f8      # f2  <- Mat[0,1] x Vect[1]                                              \n" \
    "  lwc1    $f12,4(%1)       # f12 <- Mat[1,0]                                                        \n" \
    "  mul.s   $f4,$f4,$f10     # f4  <- Mat[0,2] x Vect[2]                                              \n" \
    "  lwc1    $f14,16(%1)      # f14 <- Mat[1,1]                                                        \n" \
    "  mul.s   $f12,$f12,$f6    # f12 <- Mat[1,0] x Vect[0]                                              \n" \
    "  add.s   $f0,$f0,$f2      # f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1]                         \n" \
    "  lwc1    $f16,28(%1)      # f16 <- Mat[1,2]                                                        \n" \
    "  mul.s   $f14,$f14,$f8    # f14 <- Mat[1,1] x Vect[1]                                              \n" \
    "  add.s   $f0,$f0,$f4      # f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2]    \n" \
    "  lwc1    $f18,8(%1)       # f18 <- Mat[2,0]                                                        \n" \
    "  mul.s   $f16,$f16,$f10   # f16 <- Mat[1,2] x Vect[2]                                              \n" \
    "  add.s   $f12,$f12,$f14   # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1]                         \n" \
    "  lwc1    $f2,20(%1)       # f2  <- Mat[2,1]                                                        \n" \
    "  mul.s   $f18,$f18,$f6    # f18 <- Mat[2,0] x Vect[0]                                              \n" \
    "  add.s   $f12,$f12,$f16   # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2]    \n" \
    "  lwc1    $f4,32(%1)       # f4  <- Mat[2,2]                                                        \n" \
    "  mul.s   $f2,$f2,$f8      # f2  <- Mat[2,1] x Vect[1]                                              \n" \
    "  swc1    $f0," OFF_X "(%0)     # f0  -> Dest[0]                                                         \n" \
    "  mul.s   $f4,$f4,$f10     # f4  <- Mat[2,2] x Vect[2]                                              \n" \
    "  add.s   $f18,$f18,$f2    # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1]                         \n" \
    "  swc1    $f12," OFF_Y "(%0)    # f12 -> Dest[1]                                                         \n" \
    "  add.s   $f18,$f18,$f4    # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2]    \n" \
    "  swc1    $f18," OFF_Z "(%0)    # f18 -> Dest[2]                                                         \n"

/* MatDest = MatA x MatB, computed as three successive matrix x column products.*/
/* MatDest must not be MatA : MatA is read again for columns 1 and 2 after column 0 of MatDest*/
/* has been stored. Each MatB column is fully loaded before the matching MatDest column is stored.*/
static inline void MTH3D_M_vMulMatrixMatrixWithoutBufferU64ASM(struct MTH3D_tdstMatrix_ *MatDest,struct MTH3D_tdstMatrix_ *MatA,struct MTH3D_tdstMatrix_ *MatB)
{
    /* GROUMPF !!  PLEASE DO NOT REMOVE ASM IN LOWER CASE !!!!!*/

    /* A single asm statement : the compiler gives no guarantee that separate asm statements stay*/
    /* adjacent, and nothing generated by the compiler may land inside the noreorder region.*/
    /* The "memory" clobber declares that the code reads and writes memory through its pointer*/
    /* operands, so cached values of *MatA, *MatB and *MatDest are not kept across it.*/
    asm(
        "  .set    noreorder                               \n"
        "  # Begin MulMatrixMatrixWithoutBufferU64ASM      \n"
        "  # First MulMatrixVertex  (column 0)             \n"
        MTH3D_M_U64_MUL_MATRIX_COLUMN("0",  "4",  "8")
        "  # Second MulMatrixVertex (column 1)             \n"
        MTH3D_M_U64_MUL_MATRIX_COLUMN("12", "16", "20")
        "  # Third MulMatrixVertex  (column 2)             \n"
        MTH3D_M_U64_MUL_MATRIX_COLUMN("24", "28", "32")
        "  # EndOf MulMatrixMatrixWithoutBufferU64ASM      \n"
        "  .set    reorder                                 \n"
        : /* no register output : results are stored through %0 */
        : "r" (MatDest), "r" (MatA), "r" (MatB)
        : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18","memory" );
}
#undef MTH3D_M_U64_MUL_MATRIX_COLUMN
#endif /* OPTIMIZED_FOR_U64_ASM*/
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vMulMatrixMatrix*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* Mat_Dest = Mat_A x Mat_B. When Mat_Dest is Mat_A the product is built in a temporary matrix*/
/* and copied back; otherwise it is computed directly into Mat_Dest.*/
/* NOTE(review): unlike MTH3D_M_vMulMatrixMatrixORG, Mat_Dest==Mat_B is NOT buffered here. This*/
/* presumably relies on the asm kernel loading each column of Mat_B before storing the matching*/
/* column of Mat_Dest - confirm against the kernel selected by MTH3D_M_vMulMatrixMatrixWithoutBuffer.*/
/* The arguments are parenthesized in the comparison so that expression arguments compare as a whole.*/
#define MTH3D_M_vMulMatrixMatrixASM(Mat_Dest, Mat_A, Mat_B)         \
{   if ((Mat_Dest)==(Mat_A))                                        \
    {                                                               \
        MTH3D_tdstMatrix Mtemp;                                     \
                                                                    \
        MTH3D_M_vMulMatrixMatrixWithoutBuffer(                      \
                &Mtemp, Mat_A, Mat_B);                              \
        MTH3D_M_vCopyMatrix( Mat_Dest, &Mtemp);                     \
    }                                                               \
    else                                                            \
    {                                                               \
        MTH3D_M_vMulMatrixMatrixWithoutBuffer(Mat_Dest,             \
                Mat_A, Mat_B);                                      \
    }                                                               \
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* no dedicated C variant : the reference macro below is used as is*/
#define MTH3D_M_vMulMatrixMatrixC MTH3D_M_vMulMatrixMatrixORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Mat_Dest = Mat_A x Mat_B, usable when Mat_Dest is one of the operands : in that case the*/
/* product is built in a temporary matrix and copied into Mat_Dest afterwards; otherwise it is*/
/* computed directly into Mat_Dest.*/
/* The arguments are parenthesized in the comparisons so that expression arguments compare as a whole.*/
/* Each argument is evaluated more than once : do not pass expressions with side effects.*/
#define MTH3D_M_vMulMatrixMatrixORG(Mat_Dest, Mat_A, Mat_B)         \
{   if( ((Mat_Dest)==(Mat_A)) || ((Mat_Dest)==(Mat_B)) )            \
    {                                                               \
        MTH3D_tdstMatrix Mtemp;                                     \
                                                                    \
        MTH3D_M_vMulMatrixMatrixWithoutBuffer(                      \
                &Mtemp, Mat_A, Mat_B);                              \
        MTH3D_M_vCopyMatrix( Mat_Dest, &Mtemp);                     \
    }                                                               \
    else                                                            \
    {                                                               \
        MTH3D_M_vMulMatrixMatrixWithoutBuffer(Mat_Dest,             \
                Mat_A, Mat_B);                                      \
    }                                                               \
}
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_xDetMatrix*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* x87 determinant of a 3x3 matrix, expanded along column 0.*/
/* Notation : nA is component A of stCol_n, read at byte offset 12*n + (0, 4 or 8 for X, Y, Z).*/
/* Returns the determinant of MatA ; MatA is only read.*/
/* only 9 muls instead of 12 !!! */
/* 31 clocks but 5 penalties*/
#pragma warning(disable:4035)
INLINE MTH_tdxReal MTH3D_M_xDetMatrixASM(struct MTH3D_tdstMatrix_ *MatA)
{
    /* The result is stored to this local and returned from C. According to the original author*/
    /* the detour is not useful in itself but stops the compiler from hanging in release mode.*/
    register MTH_tdxReal xTempMatrix;

    __asm
    {
        /* det = 0X.(1Y.2Z-1Z.2Y) | = 0X.(A-B) | = X*/
        /*     + 0Y.(1Z.2X-1X.2Z) | + 0Y.(C-D) | + Y*/
        /*     + 0Z.(1X.2Y-1Y.2X) | + 0Z.(E-F) | + Z*/
        /* The right-hand comments give the FPU stack, st(0) first.*/
        mov     ecx,MatA

        fld     dword ptr [ecx+16]
        fmul    dword ptr [ecx+32]      /* A                 A = 1Y.2Z*/
        fld     dword ptr [ecx+20]
        fmul    dword ptr [ecx+28]      /* B A               B = 1Z.2Y*/
        fld     dword ptr [ecx+20]
        fmul    dword ptr [ecx+24]      /* C B A             C = 1Z.2X*/
        fld     dword ptr [ecx+12]
        fmul    dword ptr [ecx+32]      /* D C B A           D = 1X.2Z*/

        fxch    st(2)                   /* B C D A*/
        fsubp   st(3),st                /* C D A-B*/

        fld     dword ptr [ecx+12]
        fmul    dword ptr [ecx+28]      /* E C D A-B         E = 1X.2Y*/

        fxch    st(2)                   /* D C E A-B*/
        fsubp   st(1),st                /* C-D E A-B*/

        fxch    st(2)                   /* A-B E C-D*/
        fmul    dword ptr [ecx]         /* X E C-D           X = 0X.(A-B)*/

        fld     dword ptr [ecx+16]
        fmul    dword ptr [ecx+24]      /* F X E C-D         F = 1Y.2X*/

        fxch    st(3)                   /* C-D X E F*/
        fmul    dword ptr [ecx+4]       /* Y X E F           Y = 0Y.(C-D)*/

        fxch    st(3)                   /* F X E Y*/
        fsubp   st(2),st                /* X E-F Y*/
        faddp   st(2),st                /* E-F X+Y*/
        fmul    dword ptr [ecx+8]       /* Z X+Y             Z = 0Z.(E-F)*/
        faddp   st(1),st                /* X+Y+Z*/

        fstp    dword ptr [xTempMatrix] /* FPU stack empty again (see note on xTempMatrix)*/
    }
    return(xTempMatrix);
}
#pragma warning(default:4035)
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* Determinant of a 3x3 matrix as an expression, expanded along column 0 :*/
/*   0X.(1Y.2Z - 1Z.2Y) + 0Y.(1Z.2X - 1X.2Z) + 0Z.(1X.2Y - 1Y.2X)   (nA = stCol_n.xA)*/
/* MatA is evaluated many times : do not pass an expression with side effects.*/
/* only 9 muls instead of 12 !!! */
/* 37 clocks but 11 penalties*/
#define MTH3D_M_xDetMatrixC( MatA )                                             \
    MTH_M_xAdd3(                                                                \
        MTH_M_xMul( (MatA)->stCol_0.xX,                                         \
                    MTH_M_xMulSubMul( (MatA)->stCol_1.xY, (MatA)->stCol_2.xZ,   \
                                      (MatA)->stCol_1.xZ, (MatA)->stCol_2.xY )  \
        ),                                                                      \
        MTH_M_xMul( (MatA)->stCol_0.xY,                                         \
                    MTH_M_xMulSubMul( (MatA)->stCol_1.xZ, (MatA)->stCol_2.xX,   \
                                      (MatA)->stCol_1.xX, (MatA)->stCol_2.xZ )  \
        ),                                                                      \
        MTH_M_xMul( (MatA)->stCol_0.xZ,                                         \
                    MTH_M_xMulSubMul( (MatA)->stCol_1.xX, (MatA)->stCol_2.xY,   \
                                      (MatA)->stCol_1.xY, (MatA)->stCol_2.xX )  \
        )                                                                       \
    )
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* Reference determinant of a 3x3 matrix as an expression :*/
/* (sum of the three "positive" triple products) - (sum of the three "negative" ones).*/
/* MatA is evaluated many times : do not pass an expression with side effects.*/
/* 41 clocks but 16 penalties*/
#define MTH3D_M_xDetMatrixORG(MatA)                                                         \
    MTH_M_xSub(                                                                             \
        MTH_M_xAdd3(                                                                        \
            MTH_M_xMul3( (MatA)->stCol_0.xX, (MatA)->stCol_1.xY, (MatA)->stCol_2.xZ ),      \
            MTH_M_xMul3( (MatA)->stCol_0.xY, (MatA)->stCol_1.xZ, (MatA)->stCol_2.xX ),      \
            MTH_M_xMul3( (MatA)->stCol_0.xZ, (MatA)->stCol_1.xX, (MatA)->stCol_2.xY )),     \
        MTH_M_xAdd3(                                                                        \
            MTH_M_xMul3( (MatA)->stCol_0.xZ, (MatA)->stCol_1.xY, (MatA)->stCol_2.xX ),      \
            MTH_M_xMul3( (MatA)->stCol_1.xZ, (MatA)->stCol_2.xY, (MatA)->stCol_0.xX ),      \
            MTH_M_xMul3( (MatA)->stCol_2.xZ, (MatA)->stCol_0.xY, (MatA)->stCol_1.xX )))
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vMulMatrixVectorWithoutBuffer*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* x87 product VectDest = MatA x VectA (3x3 matrix stored as three column vectors).*/
/* The three results stay on the FPU stack until every input has been read and are only stored*/
/* at the very end, hence :*/
/* requires no temporary buffer if VectDest==VectA*/
/* 32 clocks (1 penalty) 33 instructions : 12,12 % pairing*/
INLINE void MTH3D_M_vMulMatrixVectorWithoutBufferASM( struct MTH3D_tdstVector_ *VectDest, struct MTH3D_tdstMatrix_ *MatA, struct MTH3D_tdstVector_ *VectA)
{
    __asm
    {
        mov     ebx,VectA
        mov     eax,MatA
        mov     ecx,VectDest

        /* column 0 scaled by VectA->xX             stack (st(0) first) : z y x*/
        fld     dword ptr [ebx]
        fmul    dword ptr [eax]
        fld     dword ptr [ebx]
        fmul    dword ptr [eax+4]
        fld     dword ptr [ebx]
        fmul    dword ptr [eax+8]

        /* column 1 scaled by VectA->xY             stack : z1 y1 x1 z y x*/
        fld     dword ptr [ebx+4]
        fmul    dword ptr [eax+12]
        fld     dword ptr [ebx+4]
        fmul    dword ptr [eax+16]
        fld     dword ptr [ebx+4]
        fmul    dword ptr [eax+20]

        /* fold the new products into the running sums      stack : z y x*/
        fxch    st(2)
        faddp   st(5),st
        faddp   st(3),st
        faddp   st(1),st

        /* column 2 scaled by VectA->xZ             stack : z2 y2 x2 z y x*/
        fld     dword ptr [ebx+8]
        fmul    dword ptr [eax+24]
        fld     dword ptr [ebx+8]
        fmul    dword ptr [eax+28]
        fld     dword ptr [ebx+8]
        fmul    dword ptr [eax+32]

        /* fold again                               stack : z y x*/
        fxch    st(2)
        faddp   st(5),st
        faddp   st(3),st
        faddp   st(1),st

        /* bring x on top and store x, y, z*/
        fxch    st(2)
        fstp    dword ptr [ecx]
        fstp    dword ptr [ecx+4]
        fstp    dword ptr [ecx+8]
    }
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* no dedicated C variant : the reference macro below is used as is*/
#define MTH3D_M_vMulMatrixVectorWithoutBufferC MTH3D_M_vMulMatrixVectorWithoutBufferORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference C product VectDest = MatA x VectA (3x3 matrix stored as three column vectors).*/
/* VectDest->xX is written before VectA->xX is read again for xY and xZ, so VectDest must not*/
/* be VectA (MTH3D_M_vMulMatrixVectorORG copies VectA first in that case).*/
/* 56 clocks (20 penalties) 44 instructions : 4.55 % pairing*/
#define MTH3D_M_vMulMatrixVectorWithoutBufferORG( VectDest, MatA, VectA)    \
{                                                                           \
    (VectDest)->xX = MTH_M_xAdd3(                                           \
                        MTH_M_xMul( (MatA)->stCol_0.xX, (VectA)->xX),       \
                        MTH_M_xMul( (MatA)->stCol_1.xX, (VectA)->xY),       \
                        MTH_M_xMul( (MatA)->stCol_2.xX, (VectA)->xZ));      \
    (VectDest)->xY = MTH_M_xAdd3(                                           \
                        MTH_M_xMul( (MatA)->stCol_0.xY, (VectA)->xX),       \
                        MTH_M_xMul( (MatA)->stCol_1.xY, (VectA)->xY),       \
                        MTH_M_xMul( (MatA)->stCol_2.xY, (VectA)->xZ));      \
    (VectDest)->xZ = MTH_M_xAdd3(                                           \
                        MTH_M_xMul( (MatA)->stCol_0.xZ, (VectA)->xX),       \
                        MTH_M_xMul( (MatA)->stCol_1.xZ, (VectA)->xY),       \
                        MTH_M_xMul( (MatA)->stCol_2.xZ, (VectA)->xZ));      \
}
|
||
|
||
#if defined(OPTIMIZED_FOR_U64_ASM)
/* MIPS FPU product VectDest = MatA x VectA (3x3 matrix stored as three column vectors).*/
/* In the asm comments Mat[r,c] is component r of column c of MatA and Vect[] is VectA.*/
/* All loads are issued before the first store, hence :*/
/* requires no temporary buffer if VectDest==VectA*/
static inline void MTH3D_M_vMulMatrixVectorWithoutBufferU64ASM( struct MTH3D_tdstVector_ *VectDest, struct MTH3D_tdstMatrix_ *MatA, struct MTH3D_tdstVector_ *VectA)
{
    /* GROUMPF !!  PLEASE DO NOT REMOVE ASM IN LOWER CASE !!!!!*/

    /* A single asm statement : the compiler gives no guarantee that separate asm statements stay*/
    /* adjacent, and nothing generated by the compiler may land inside the noreorder region.*/
    /* The "memory" clobber declares that the code reads and writes memory through its pointer*/
    /* operands, so cached values of *MatA, *VectA and *VectDest are not kept across it.*/
    asm(
        "  .set    noreorder                                                                              \n"
        "  # Begin MulMatrixVectorWithoutBufferU64ASM                                                     \n"
        "  lwc1    $f6,0(%2)                                                                              \n"
        "  lwc1    $f8,4(%2)                                                                              \n"
        "  lwc1    $f10,8(%2)                                                                             \n"
        "  lwc1    $f0,0(%1)        # f0  <- Mat[0,0]                                                     \n"
        "  lwc1    $f2,12(%1)       # f2  <- Mat[0,1]                                                     \n"
        "  mul.s   $f0,$f0,$f6      # f0  <- Mat[0,0] x Vect[0]                                           \n"
        "  lwc1    $f4,24(%1)       # f4  <- Mat[0,2]                                                     \n"
        "  mul.s   $f2,$f2,$f8      # f2  <- Mat[0,1] x Vect[1]                                           \n"
        "  lwc1    $f12,4(%1)       # f12 <- Mat[1,0]                                                     \n"
        "  mul.s   $f4,$f4,$f10     # f4  <- Mat[0,2] x Vect[2]                                           \n"
        "  lwc1    $f14,16(%1)      # f14 <- Mat[1,1]                                                     \n"
        "  mul.s   $f12,$f12,$f6    # f12 <- Mat[1,0] x Vect[0]                                           \n"
        "  add.s   $f0,$f0,$f2      # f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1]                      \n"
        "  lwc1    $f16,28(%1)      # f16 <- Mat[1,2]                                                     \n"
        "  mul.s   $f14,$f14,$f8    # f14 <- Mat[1,1] x Vect[1]                                           \n"
        "  add.s   $f0,$f0,$f4      # f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n"
        "  lwc1    $f18,8(%1)       # f18 <- Mat[2,0]                                                     \n"
        "  mul.s   $f16,$f16,$f10   # f16 <- Mat[1,2] x Vect[2]                                           \n"
        "  add.s   $f12,$f12,$f14   # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1]                      \n"
        "  lwc1    $f2,20(%1)       # f2  <- Mat[2,1]                                                     \n"
        "  mul.s   $f18,$f18,$f6    # f18 <- Mat[2,0] x Vect[0]                                           \n"
        "  add.s   $f12,$f12,$f16   # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n"
        "  lwc1    $f4,32(%1)       # f4  <- Mat[2,2]                                                     \n"
        "  mul.s   $f2,$f2,$f8      # f2  <- Mat[2,1] x Vect[1]                                           \n"
        "  swc1    $f0,0(%0)        # f0  -> Dest[0]                                                      \n"
        "  mul.s   $f4,$f4,$f10     # f4  <- Mat[2,2] x Vect[2]                                           \n"
        "  add.s   $f18,$f18,$f2    # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1]                      \n"
        "  swc1    $f12,4(%0)       # f12 -> Dest[1]                                                      \n"
        "  add.s   $f18,$f18,$f4    # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n"
        "  swc1    $f18,8(%0)       # f18 -> Dest[2]                                                      \n"
        "  # EndOf MulMatrixVectorWithoutBufferU64ASM                                                     \n"
        "  .set    reorder                                                                                \n"
        : /* no register output : results are stored through %0 */
        : "r" (VectDest), "r" (MatA), "r" (VectA)
        : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18","memory" );
}
#endif /* OPTIMIZED_FOR_U64_ASM*/
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vMulMatrixVector*/
|
||
/************************************************************************************************************************/
|
||
/* The asm kernels are documented as needing no temporary buffer when VectDest==VectA,*/
/* so the buffered entry points map straight onto them.*/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
#define MTH3D_M_vMulMatrixVectorASM MTH3D_M_vMulMatrixVectorWithoutBufferASM
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/

#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vMulMatrixVectorC MTH3D_M_vMulMatrixVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

#if defined(OPTIMIZED_FOR_U64_ASM)
#define MTH3D_M_vMulMatrixVectorU64ASM MTH3D_M_vMulMatrixVectorWithoutBufferU64ASM
#endif /* OPTIMIZED_FOR_U64_ASM*/

/* VectDest = MatA x VectA, usable when VectDest is VectA : in that case VectA is first copied*/
/* to a local vector which is then used as the source; otherwise the product is computed directly.*/
/* The arguments are parenthesized in the comparison so that expression arguments compare as a whole.*/
/* Each argument is evaluated more than once : do not pass expressions with side effects.*/
#define MTH3D_M_vMulMatrixVectorORG( VectDest, MatA, VectA)                     \
{   if( (VectA)==(VectDest) )                                                   \
    {                                                                           \
        MTH3D_tdstVector Vtmp;                                                  \
                                                                                \
        MTH3D_M_vCopyVector( &Vtmp, VectA);                                     \
        MTH3D_M_vMulMatrixVectorWithoutBuffer( VectDest, MatA, &Vtmp);          \
    }                                                                           \
    else                                                                        \
    {                                                                           \
        MTH3D_M_vMulMatrixVectorWithoutBuffer( VectDest, MatA, VectA);          \
    }                                                                           \
}
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vInverMatrix*/
|
||
/************************************************************************************************************************/
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* x87 inverse of a 3x3 matrix : MatDest = (matrix of cofactors, already laid out transposed) x 1/det.*/
/* Notation : components of MatA are read at byte offset 12*column + (0, 4 or 8 for X, Y, Z).*/
/*   P1 P2 P3 : cofactors stored to MatDest offsets  0, 12, 24 ; det = [0].P1 + [4].P2 + [8].P3*/
/*   N1 N2 N3 : cofactors stored to MatDest offsets  4, 16, 28*/
/*   M1 M2 M3 : cofactors stored to MatDest offsets  8, 20, 32*/
/*   upper case = not yet scaled, lower case = already multiplied by D = 1/det*/
/* The right-hand comments give the FPU stack, st(0) first ; all 8 slots are used at the peak.*/
/* MatDest must not be MatA : stores into MatDest begin while MatA is still being read.*/
/* The determinant is not tested against zero.*/
/* avoid transposition, stack, 9 divisions by det, 4 negs*/
/* 131 clocks (38+1 penalties) 98 instructions : 2.04 % pairing*/
#pragma warning(disable:4725)
INLINE void MTH3D_M_vInverMatrixASM(struct MTH3D_tdstMatrix_ *MatDest, struct MTH3D_tdstMatrix_ *MatA)
{
    static MTH_tdxReal ONE=MTH_C_ONE;   /* memory operand for the reciprocal below*/
    __asm
    {
        mov     eax,MatA
        mov     ebx,MatDest

        /* six products feeding the first three cofactors*/
        fld     dword ptr [eax+16]      /*<====== 1 penalty : AGI stall because of eax load*/
        fmul    dword ptr [eax+32]      /* load A1*/
        fld     dword ptr [eax+20]
        fmul    dword ptr [eax+24]      /* load A2*/
        fld     dword ptr [eax+12]
        fmul    dword ptr [eax+28]      /* load A3*/
        fld     dword ptr [eax+20]
        fmul    dword ptr [eax+28]      /* load B1*/
        fld     dword ptr [eax+12]
        fmul    dword ptr [eax+32]      /* load B2*/
        fld     dword ptr [eax+16]
        fmul    dword ptr [eax+24]      /* load B3*/
                                        /* B3 B2 B1 A3 A2 A1*/
        fxch    st(2)                   /* B1 B2 B3 A3 A2 A1*/
        fsubp   st(5),st                /* B2 B3 A3 A2 P1            P1 = A1-B1*/
        fsubp   st(3),st                /* B3 A3 P2 P1               P2 = A2-B2*/
        fsubp   st(1),st                /* P3 P2 P1                  P3 = A3-B3*/

        fld     dword ptr [eax+8]
        fmul    dword ptr [eax+28]      /* C1 P3 P2 P1*/

        /* the three terms of the determinant*/
        fld     st(3)
        fmul    dword ptr [eax]         /* X C1 P3 P2 P1*/
        fld     st(3)
        fmul    dword ptr [eax+4]       /* Y X C1 P3 P2 P1*/
        fld     st(3)
        fmul    dword ptr [eax+8]       /* Z Y X C1 P3 P2 P1*/

        fld     dword ptr [eax+4]
        fmul    dword ptr [eax+32]      /* D1 Z Y X C1 P3 P2 P1*/

        fxch    st(2)                   /* Y Z D1 X C1 P3 P2 P1*/
        faddp   st(3),st                /* Z D1 X+Y C1 P3 P2 P1*/

        fld     dword ptr [eax]
        fmul    dword ptr [eax+32]      /* C2 Z D1 X+Y C1 P3 P2 P1*/

        fxch    st(2)                   /* D1 Z C2 X+Y C1 P3 P2 P1*/
        fsubp   st(4),st                /* Z C2 X+Y N1 P3 P2 P1      N1 = C1-D1*/
        faddp   st(2),st                /* C2 X+Y+Z N1 P3 P2 P1*/

        fld     dword ptr [eax+8]
        fmul    dword ptr [eax+24]      /* D2 C2 X+Y+Z N1 P3 P2 P1*/

        fld     dword ptr [ONE]         /* 1 D2 C2 det=X+Y+Z N1 P3 P2 P1*/
        fdivrp  st(3),st                /* D2 C2 D=1/det N1 P3 P2 P1 <====== unavoidable 38 clocks penalties*/
        /* equivalent sequence kept from the original source, disabled there :*/
        /*     fld dword ptr [ONE] ; fxch st(3) ; fdivp st(3),st*/

        fsubp   st(1),st                /* N2 D N1 P3 P2 P1          N2 = C2-D2*/
        fxch    st(1)                   /* D N2 N1 P3 P2 P1*/

        /* scale what is already known by D (D is duplicated, then multiplied in place)*/
        fld     st(0)
        fmulp   st(6),st                /* D N2 N1 P3 P2 p1*/
        fld     st(0)
        fmulp   st(5),st                /* D N2 N1 P3 p2 p1*/
        fld     st(0)
        fmulp   st(4),st                /* D N2 N1 p3 p2 p1*/
        fld     st(0)
        fmulp   st(3),st                /* D N2 n1 p3 p2 p1*/
        fld     st(0)
        fmulp   st(2),st                /* D n2 n1 p3 p2 p1*/

        fld     dword ptr [eax+4]
        fmul    dword ptr [eax+24]      /* C3 D n2 n1 p3 p2 p1*/

        fld     dword ptr [eax]
        fmul    dword ptr [eax+28]      /* D3 C3 D n2 n1 p3 p2 p1*/

        /* from here on finished values are stored to free stack slots for new products*/
        fxch    st(7)                   /* p1 C3 D n2 n1 p3 p2 D3*/
        fstp    dword ptr [ebx]         /* C3 D n2 n1 p3 p2 D3*/

        fld     dword ptr [eax+4]
        fmul    dword ptr [eax+20]      /* E1 C3 D n2 n1 p3 p2 D3*/

        fxch    st(1)                   /* C3 E1 D n2 n1 p3 p2 D3*/
        fsubrp  st(7),st                /* E1 D n2 n1 p3 p2 N3       N3 = C3-D3*/

        fld     dword ptr [eax+8]
        fmul    dword ptr [eax+16]      /* F1 E1 D n2 n1 p3 p2 N3*/

        fxch    st(6)                   /* p2 E1 D n2 n1 p3 F1 N3*/
        fstp    dword ptr [ebx+12]      /* E1 D n2 n1 p3 F1 N3*/

        fld     dword ptr [eax+8]
        fmul    dword ptr [eax+12]      /* E2 E1 D n2 n1 p3 F1 N3*/

        fxch    st(1)                   /* E1 E2 D n2 n1 p3 F1 N3*/
        fsubrp  st(6),st                /* E2 D n2 n1 p3 M1 N3       M1 = E1-F1*/

        fld     dword ptr [eax]
        fmul    dword ptr [eax+20]      /* F2 E2 D n2 n1 p3 M1 N3*/

        fxch    st(5)                   /* p3 E2 D n2 n1 F2 M1 N3*/
        fstp    dword ptr [ebx+24]      /* E2 D n2 n1 F2 M1 N3*/

        fld     dword ptr [eax]
        fmul    dword ptr [eax+16]      /* E3 E2 D n2 n1 F2 M1 N3*/

        fxch    st(1)                   /* E2 E3 D n2 n1 F2 M1 N3*/
        fsubrp  st(5),st                /* E3 D n2 n1 M2 M1 N3       M2 = E2-F2*/

        fld     dword ptr [eax+4]
        fmul    dword ptr [eax+12]      /* F3 E3 D n2 n1 M2 M1 N3*/

        fxch    st(4)                   /* n1 E3 D n2 F3 M2 M1 N3*/
        fstp    dword ptr [ebx+4]       /* E3 D n2 F3 M2 M1 N3*/

        fxch    st(1)                   /* D E3 n2 F3 M2 M1 N3*/
        fld     st(0)                   /* D D E3 n2 F3 M2 M1 N3*/
        fmulp   st(7),st                /* D E3 n2 F3 M2 M1 n3*/
        fxch    st(1)                   /* E3 D n2 F3 M2 M1 n3*/
        fsubrp  st(3),st                /* D n2 M3 M2 M1 n3          M3 = E3-F3*/

        fld     st(0)                   /* D D n2 M3 M2 M1 n3*/
        fmulp   st(5),st                /* D n2 M3 M2 m1 n3*/
        fld     st(0)                   /* D D n2 M3 M2 m1 n3*/
        fmulp   st(4),st                /* D n2 M3 m2 m1 n3*/

        fxch    st(1)                   /* n2 D M3 m2 m1 n3*/
        fstp    dword ptr [ebx+16]      /* D M3 m2 m1 n3*/
        fmulp   st(1),st                /* m3 m2 m1 n3*/
        fxch    st(3)                   /* n3 m2 m1 m3*/

        fstp    dword ptr [ebx+28]      /* m2 m1 m3*/
        fstp    dword ptr [ebx+20]      /* m1 m3*/
        fstp    dword ptr [ebx+8]       /* m3*/
        fstp    dword ptr [ebx+32]      /* FPU stack empty again*/
    }
}
#pragma warning(default:4725)
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* MatDest = inverse of MatA : the cofactors are computed directly in transposed position, the*/
/* determinant is accumulated from the first three of them, and everything is finally scaled by*/
/* 1/det (one division instead of nine).*/
/* The cofactors are built in a local matrix and MatDest is only written by the final scaling,*/
/* so MatDest may be the same matrix as MatA, as with MTH3D_M_vInverMatrixORG.*/
/* The determinant is not tested against zero.*/
/* avoid transposition, 9 divisions by det, 4 negs*/
/* 214 clocks (81 penalties) 147 instructions : 1.36 % pairing (measured on the former version,*/
/* which wrote the cofactors straight into MatDest)*/
INLINE void MTH3D_M_vInverMatrixC(MTH3D_tdstMatrix *MatDest, MTH3D_tdstMatrix *MatA)
{
    MTH3D_tdstMatrix stAdj;             /* cofactors, not yet scaled*/
    register MTH_tdxReal det;

    /* first three cofactors, each immediately reused for its term of the determinant*/
    stAdj.stCol_0.xX = MTH_M_xMulSubMul((MatA)->stCol_1.xY,(MatA)->stCol_2.xZ,(MatA)->stCol_1.xZ,(MatA)->stCol_2.xY);
    det = MTH_M_xMul( (MatA)->stCol_0.xX ,stAdj.stCol_0.xX );
    stAdj.stCol_1.xX = MTH_M_xMulSubMul((MatA)->stCol_1.xZ,(MatA)->stCol_2.xX,(MatA)->stCol_1.xX,(MatA)->stCol_2.xZ);
    det = MTH_M_xAdd(MTH_M_xMul( (MatA)->stCol_0.xY ,stAdj.stCol_1.xX ), det);
    stAdj.stCol_2.xX = MTH_M_xMulSubMul((MatA)->stCol_1.xX,(MatA)->stCol_2.xY,(MatA)->stCol_1.xY,(MatA)->stCol_2.xX);
    det = MTH_M_xAdd(MTH_M_xMul( (MatA)->stCol_0.xZ ,stAdj.stCol_2.xX ), det);

    /* the division is started early, as in the former version ; det now holds 1/det*/
    stAdj.stCol_0.xY = MTH_M_xMulSubMul((MatA)->stCol_0.xZ,(MatA)->stCol_2.xY,(MatA)->stCol_0.xY,(MatA)->stCol_2.xZ);
    det = MTH_M_xDiv( MTH_C_ONE, det );

    /* remaining cofactors*/
    stAdj.stCol_0.xZ = MTH_M_xMulSubMul((MatA)->stCol_0.xY,(MatA)->stCol_1.xZ,(MatA)->stCol_0.xZ,(MatA)->stCol_1.xY);
    stAdj.stCol_1.xY = MTH_M_xMulSubMul((MatA)->stCol_0.xX,(MatA)->stCol_2.xZ,(MatA)->stCol_0.xZ,(MatA)->stCol_2.xX);
    stAdj.stCol_1.xZ = MTH_M_xMulSubMul((MatA)->stCol_0.xZ,(MatA)->stCol_1.xX,(MatA)->stCol_0.xX,(MatA)->stCol_1.xZ);
    stAdj.stCol_2.xY = MTH_M_xMulSubMul((MatA)->stCol_0.xY,(MatA)->stCol_2.xX,(MatA)->stCol_0.xX,(MatA)->stCol_2.xY);
    stAdj.stCol_2.xZ = MTH_M_xMulSubMul((MatA)->stCol_0.xX,(MatA)->stCol_1.xY,(MatA)->stCol_0.xY,(MatA)->stCol_1.xX);

    /* MatDest = (1/det) x cofactors : the only write to MatDest, after the last read of MatA*/
    MTH3D_M_vMulScalarMatrix( MatDest, det, &stAdj );
}
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
|
||
|
||
/* Reference inverse : MatDest = transpose(comatrix(MatA)) / det(MatA).*/
/* Everything is computed from MatA into locals before MatDest is written by the final division,*/
/* so MatDest may be the same matrix as MatA. The determinant is not tested against zero.*/
/* Can surely be optimized in avoiding many recomputations done for nothing*/
/* Before modifying MTH3D_M_vDivScalarMatrix, it took 540 clocks 120 instructions !?!*/
#define MTH3D_M_vInverMatrixORG(MatDest, MatA)                  \
{                                                               \
    MTH3D_tdstMatrix    Mat_Tmp={0};                            \
    MTH3D_tdstMatrix    Mat_Com;                                \
    MTH_tdxReal         det;                                    \
                                                                \
    det=MTH3D_M_xDetMatrix( MatA );                             \
    MTH3D_M_vComMatrixWithoutBuffer(&Mat_Com, MatA);            \
    MTH3D_M_vTranspMatrix(&Mat_Tmp, &Mat_Com );                 \
    MTH3D_M_vDivScalarMatrix(MatDest, &Mat_Tmp, det );          \
}
|
||
|
||
/************************************************************************************************************************/
|
||
/* MTH3D_M_vTransformVectorWithoutBuffer*/
|
||
/************************************************************************************************************************/
|
||
/* x87 affine transform : VectDest = MatA x VectA + VectB.*/
/* VectB is loaded first and serves as the initial value of the three running sums ; the results*/
/* are stored only after every input has been read, so VectDest may be VectA or VectB.*/
/* 39 clocks (1 penalty) 41 instructions : 9.72 % pairing*/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
INLINE void MTH3D_M_vTransformVectorWithoutBufferASM(MTH3D_tdstVector *VectDest,MTH3D_tdstMatrix *MatA,MTH3D_tdstVector *VectA,MTH3D_tdstVector *VectB)
{
    __asm
    {
        mov     ecx,VectB
        mov     edx,VectDest
        mov     ebx,VectA
        mov     eax,MatA

        /* running sums start as VectB              stack (st(0) first) : z y x*/
        fld     dword ptr [ecx]
        fld     dword ptr [ecx+4]
        fld     dword ptr [ecx+8]

        /* column 0 scaled by VectA->xX             stack : z0 y0 x0 z y x*/
        fld     dword ptr [ebx]
        fmul    dword ptr [eax]
        fld     dword ptr [ebx]
        fmul    dword ptr [eax+4]
        fld     dword ptr [ebx]
        fmul    dword ptr [eax+8]

        /* fold the new products into the running sums      stack : z y x*/
        fxch    st(2)
        faddp   st(5),st
        faddp   st(3),st
        faddp   st(1),st

        /* column 1 scaled by VectA->xY*/
        fld     dword ptr [ebx+4]
        fmul    dword ptr [eax+12]
        fld     dword ptr [ebx+4]
        fmul    dword ptr [eax+16]
        fld     dword ptr [ebx+4]
        fmul    dword ptr [eax+20]

        fxch    st(2)
        faddp   st(5),st
        faddp   st(3),st
        faddp   st(1),st

        /* column 2 scaled by VectA->xZ*/
        fld     dword ptr [ebx+8]
        fmul    dword ptr [eax+24]
        fld     dword ptr [ebx+8]
        fmul    dword ptr [eax+28]
        fld     dword ptr [ebx+8]
        fmul    dword ptr [eax+32]

        fxch    st(2)
        faddp   st(5),st
        faddp   st(3),st
        faddp   st(1),st

        /* bring x on top and store x, y, z*/
        fxch    st(2)

        fstp    dword ptr [edx]         /* 1 penalty : unavoidable*/
        fstp    dword ptr [edx+4]
        fstp    dword ptr [edx+8]
    }
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
|
||
|
||
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* no dedicated C variant : the reference macro below is used as is*/
#define MTH3D_M_vTransformVectorWithoutBufferC MTH3D_M_vTransformVectorWithoutBufferORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference affine transform : VectDest = MatA x VectA, then VectDest = VectDest + VectB.*/
/* VectDest is written by the product before VectB is read, so with this version VectDest must*/
/* not be VectB ; the restrictions of MTH3D_M_vMulMatrixVectorWithoutBuffer on VectA apply too.*/
/* 79 clocks (30 penalties) 54 instructions : 3.70 % pairing*/
#define MTH3D_M_vTransformVectorWithoutBufferORG( VectDest, MatA, VectA, VectB)     \
{                                                                                   \
    MTH3D_M_vMulMatrixVectorWithoutBuffer(VectDest, MatA, VectA);                   \
    MTH3D_M_vAddVector(VectDest,VectDest,VectB);                                    \
}
|
||
|
||
#if defined(OPTIMIZED_FOR_U64_ASM)
/* MIPS FPU affine transform : VectDest = MatA x VectA + VectB.*/
/* In the asm comments Mat[r,c] is component r of column c of MatA, Vect[] is VectA and VctB[]*/
/* is VectB. $f6/$f8/$f10 first hold VectA and are reloaded with VectB once their last product*/
/* has been issued. All loads are issued before the first store, hence :*/
/* requires no temporary buffer if VectDest==VectA*/
static inline void MTH3D_M_vTransformVectorWithoutBufferU64ASM( struct MTH3D_tdstVector_ *VectDest, struct MTH3D_tdstMatrix_ *MatA, struct MTH3D_tdstVector_ *VectA, struct MTH3D_tdstVector_ *VectB)
{
    /* GROUMPF !!  PLEASE DO NOT REMOVE ASM IN LOWER CASE !!!!!*/

    /* A single asm statement : the compiler gives no guarantee that separate asm statements stay*/
    /* adjacent, and nothing generated by the compiler may land inside the noreorder region.*/
    /* The "memory" clobber declares that the code reads and writes memory through its pointer*/
    /* operands, so cached values of the pointed-to data are not kept across it.*/
    asm(
        "  .set    noreorder                                                                                        \n"
        "  # Begin TransformVectorWithoutBufferU64ASM                                                               \n"
        "  lwc1    $f6,0(%2)                                                                                        \n"
        "  lwc1    $f8,4(%2)                                                                                        \n"
        "  lwc1    $f10,8(%2)                                                                                       \n"
        "  lwc1    $f0,0(%1)        # f0  <- Mat[0,0]                                                               \n"
        "  lwc1    $f2,12(%1)       # f2  <- Mat[0,1]                                                               \n"
        "  mul.s   $f0,$f0,$f6      # f0  <- Mat[0,0] x Vect[0]                                                     \n"
        "  lwc1    $f4,24(%1)       # f4  <- Mat[0,2]                                                               \n"
        "  mul.s   $f2,$f2,$f8      # f2  <- Mat[0,1] x Vect[1]                                                     \n"
        "  lwc1    $f12,4(%1)       # f12 <- Mat[1,0]                                                               \n"
        "  mul.s   $f4,$f4,$f10     # f4  <- Mat[0,2] x Vect[2]                                                     \n"
        "  lwc1    $f14,16(%1)      # f14 <- Mat[1,1]                                                               \n"
        "  mul.s   $f12,$f12,$f6    # f12 <- Mat[1,0] x Vect[0]                                                     \n"
        "  add.s   $f0,$f0,$f2      # f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1]                                \n"
        "  lwc1    $f16,28(%1)      # f16 <- Mat[1,2]                                                               \n"
        "  mul.s   $f14,$f14,$f8    # f14 <- Mat[1,1] x Vect[1]                                                     \n"
        "  add.s   $f0,$f0,$f4      # f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2]           \n"
        "  lwc1    $f18,8(%1)       # f18 <- Mat[2,0]                                                               \n"
        "  mul.s   $f16,$f16,$f10   # f16 <- Mat[1,2] x Vect[2]                                                     \n"
        "  add.s   $f12,$f12,$f14   # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1]                                \n"
        "  lwc1    $f2,20(%1)       # f2  <- Mat[2,1]                                                               \n"
        "  mul.s   $f18,$f18,$f6    # f18 <- Mat[2,0] x Vect[0]                                                     \n"
        "  add.s   $f12,$f12,$f16   # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2]           \n"
        "  lwc1    $f4,32(%1)       # f4  <- Mat[2,2]                                                               \n"
        "  mul.s   $f2,$f2,$f8      # f2  <- Mat[2,1] x Vect[1]                                                     \n"
        "  lwc1    $f6,0(%3)        # f6  <- VctB[0]                                                                \n"
        "  mul.s   $f4,$f4,$f10     # f4  <- Mat[2,2] x Vect[2]                                                     \n"
        "  add.s   $f18,$f18,$f2    # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1]                                \n"
        "  lwc1    $f8,4(%3)        # f8  <- VctB[1]                                                                \n"
        "  add.s   $f18,$f18,$f4    # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2]           \n"
        "  lwc1    $f10,8(%3)       # f10 <- VctB[2]                                                                \n"
        "  add.s   $f0,$f0,$f6      # f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] + VctB[0] \n"
        "  add.s   $f12,$f12,$f8    # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] + VctB[1] \n"
        "  swc1    $f0,0(%0)        # f0  -> Dest[0]                                                                \n"
        "  add.s   $f18,$f18,$f10   # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] + VctB[2] \n"
        "  swc1    $f12,4(%0)       # f12 -> Dest[1]                                                                \n"
        "  swc1    $f18,8(%0)       # f18 -> Dest[2]                                                                \n"
        "  # EndOf TransformVectorWithoutBufferU64ASM                                                               \n"
        "  .set    reorder                                                                                          \n"
        : /* no register output : results are stored through %0 */
        : "r" (VectDest), "r" (MatA), "r" (VectA) , "r" (VectB)
        : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18","memory" );
}
#endif /* OPTIMIZED_FOR_U64_ASM*/
|
||
|
||
#endif /* MTH_ASM_H*/
|