/* reman3/Rayman_X/cpa/public/MTH/MTH3dopt.h
 * (1542 lines, 62 KiB, C)
 */

/* Olivier Couvreur : 3/98 */
#if !defined(MTH_ASM_H)
#define MTH_ASM_H
#include "acp_base.h"
/* force OPTIMIZED_FOR_PC_FLOATS if OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
#if !defined(OPTIMIZED_FOR_PC_FLOATS)
#define OPTIMIZED_FOR_PC_FLOATS
#endif
#endif
/************************************************************************************************************************/
/* MTH3D_M_vAddVector*/
/************************************************************************************************************************/
/* register load + 3 * (fld fadd) + 3 fstp = 2 + 3 * (1+1) + 3 * (2) = 14 clocks 13 instructions*/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/*
 * MTH3D_M_vAddVectorASM : VectDest = VectA + VectB, component by component.
 *
 * VectDest : receives the three sums (4-byte floats at byte offsets 0, 4, 8).
 * VectA    : first operand, read at offsets 0, 4, 8.
 * VectB    : second operand, read at offsets 0, 4, 8.
 *
 * The three sums are built on the x87 stack before the first store, so
 * VectDest may be the same object as VectA or VectB.
 */
INLINE void MTH3D_M_vAddVectorASM(struct MTH3D_tdstVector_ *VectDest,struct MTH3D_tdstVector_ *VectA, struct MTH3D_tdstVector_ *VectB)
{
__asm
{
mov eax,VectA
mov ebx,VectB
mov ecx,VectDest
fld dword ptr [eax] /*; (VectA)->xX*/
fadd dword ptr [ebx] /*; (VectB)->xX*/
fld dword ptr [eax+4] /*; (VectA)->xY*/
fadd dword ptr [ebx+4] /*; (VectB)->xY*/
fld dword ptr [eax+8] /*; (VectA)->xZ*/
fadd dword ptr [ebx+8] /*; (VectB)->xZ*/
/* stack (top first) is Z Y X : bring X to the top so the pops store X, Y, Z in order */
fxch st(2)
fstp dword ptr [ecx] /*; (VectDest)->xX*/
fstp dword ptr [ecx+4] /*; (VectDest)->xY*/
fstp dword ptr [ecx+8] /*; (VectDest)->xZ*/
}
}
#endif /* #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)*/
#ifdef OPTIMIZED_FOR_PC_FLOATS
/* No dedicated plain-C variant: the C name is the reference macro. */
#define MTH3D_M_vAddVectorC MTH3D_M_vAddVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS */

/*
 * MTH3D_M_vAddVectorORG : reference version, VectDest = VectA + VectB.
 * Every component goes through MTH_M_xAdd. Each argument is expanded three
 * times, so arguments with side effects must be avoided.
 * Original timing note: 3 * (fld fadd fstp) = 3 * (1+1+2+3pen) = 21 clocks, 9 instructions.
 */
#define MTH3D_M_vAddVectorORG( VectDest, VectA, VectB) \
{ \
    (VectDest)->xX = MTH_M_xAdd((VectA)->xX, (VectB)->xX); \
    (VectDest)->xY = MTH_M_xAdd((VectA)->xY, (VectB)->xY); \
    (VectDest)->xZ = MTH_M_xAdd((VectA)->xZ, (VectB)->xZ); \
}
/************************************************************************************************************************/
/* MTH3D_M_vSubVector*/
/************************************************************************************************************************/
/* register load + 3 * (fld fsub) + 3 fstp = 2 + 3 * (1+1) + 3 * (2) = 14 clocks 13 instructions*/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/*
 * MTH3D_M_vSubVectorASM : VectDest = VectA - VectB, component by component.
 *
 * VectDest : receives the three differences (4-byte floats at offsets 0, 4, 8).
 * VectA    : minuend, read at offsets 0, 4, 8.
 * VectB    : subtrahend, read at offsets 0, 4, 8.
 *
 * All three differences are on the x87 stack before the first store, so
 * VectDest may be the same object as VectA or VectB.
 */
INLINE void MTH3D_M_vSubVectorASM(struct MTH3D_tdstVector_ *VectDest,struct MTH3D_tdstVector_ *VectA, struct MTH3D_tdstVector_ *VectB)
{
__asm
{
mov eax,VectA
mov ebx,VectB
mov ecx,VectDest
fld dword ptr [eax] /*; (VectA)->xX*/
fsub dword ptr [ebx] /*; (VectB)->xX*/
fld dword ptr [eax+4] /*; (VectA)->xY*/
fsub dword ptr [ebx+4] /*; (VectB)->xY*/
fld dword ptr [eax+8] /*; (VectA)->xZ*/
fsub dword ptr [ebx+8] /*; (VectB)->xZ*/
/* stack (top first) is Z Y X : bring X to the top so the pops store X, Y, Z in order */
fxch st(2)
fstp dword ptr [ecx] /*; (VectDest)->xX*/
fstp dword ptr [ecx+4] /*; (VectDest)->xY*/
fstp dword ptr [ecx+8] /*; (VectDest)->xZ*/
}
}
#endif /* #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)*/
#ifdef OPTIMIZED_FOR_PC_FLOATS
/* No dedicated plain-C variant: the C name is the reference macro. */
#define MTH3D_M_vSubVectorC MTH3D_M_vSubVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS */

/*
 * MTH3D_M_vSubVectorORG : reference version, VectDest = VectA - VectB.
 * Every component goes through MTH_M_xSub. Each argument is expanded three
 * times, so arguments with side effects must be avoided.
 * Original timing note: 3 * (fld fsub fstp) = 3 * (1+1+2+3pen) = 21 clocks, 9 instructions.
 */
#define MTH3D_M_vSubVectorORG( VectDest, VectA, VectB) \
{ \
    (VectDest)->xX = MTH_M_xSub((VectA)->xX, (VectB)->xX); \
    (VectDest)->xY = MTH_M_xSub((VectA)->xY, (VectB)->xY); \
    (VectDest)->xZ = MTH_M_xSub((VectA)->xZ, (VectB)->xZ); \
}
/************************************************************************************************************************/
/* MTH3D_M_vNegVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* Negation has no assembly version: the ASM name maps to the C macro MTH3D_M_vNegVectorC. */
#define MTH3D_M_vNegVectorASM MTH3D_M_vNegVectorC
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* only toggle sign bit : No fpu*/
/* 10 pairables instructions => 5 clocks*/
/*
 * MTH3D_M_vNegVectorC : VectDest = -VectA, done by flipping bit 31 (the sign
 * bit) of each component with integer operations instead of the FPU.
 *
 * VectDest : pointer to the destination vector (xX, xY, xZ are written).
 * VectA    : pointer to the source vector (may be the same as VectDest).
 *
 * The components are accessed as 32-bit unsigned integers. The previous
 * version used 'long', which is 8 bytes on LP64 targets (so it touched memory
 * past each component) and could not hold 0x80000000 as a signed 32-bit value.
 * NOTE(review): this still reinterprets a float through an integer pointer and
 * assumes components are 4 bytes wide, as the assembly in this file also does.
 * Each argument is expanded three times: avoid arguments with side effects.
 */
#define MTH3D_M_vNegVectorC( VectDest, VectA) \
{ register unsigned int NegMask=0x80000000u; \
*((unsigned int*) &((VectDest)->xX)) = *((unsigned int*) &((VectA)->xX )) ^ NegMask; \
*((unsigned int*) &((VectDest)->xY)) = *((unsigned int*) &((VectA)->xY )) ^ NegMask; \
*((unsigned int*) &((VectDest)->xZ)) = *((unsigned int*) &((VectA)->xZ )) ^ NegMask; \
}
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
/*
 * MTH3D_M_vNegVectorORG : reference version, VectDest = -VectA.
 * Every component goes through MTH_M_xNeg. Each argument is expanded three
 * times, so arguments with side effects must be avoided.
 * Original timing note: 3 * (fld fchs fstp) = 3 * (1 + 1 + 1+1pen) = 12 clocks, 9 instructions.
 */
#define MTH3D_M_vNegVectorORG( VectDest, VectA) \
{ \
    (VectDest)->xX = MTH_M_xNeg( (VectA)->xX ); \
    (VectDest)->xY = MTH_M_xNeg( (VectA)->xY ); \
    (VectDest)->xZ = MTH_M_xNeg( (VectA)->xZ ); \
}
/************************************************************************************************************************/
/* MTH3D_M_vAdd3*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* 14 clocks*/
/*
 * MTH3D_M_vAdd3ScalarVectorASM : VectDest = VectA + (x, y, z).
 *
 * VectDest : receives the result (4-byte floats at offsets 0, 4, 8).
 * VectA    : source vector, read at offsets 0, 4, 8.
 * x, y, z  : scalars added to the first, second and third component; they are
 *            read from memory as 4-byte floats.
 *
 * All three sums are on the x87 stack before the first store, so VectDest may
 * be the same object as VectA.
 */
INLINE void MTH3D_M_vAdd3ScalarVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA,MTH_tdxReal x,MTH_tdxReal y,MTH_tdxReal z)
{
__asm
{
mov eax,VectA
mov ecx,VectDest
fld dword ptr [eax]
fadd dword ptr [x]
fld dword ptr [eax+4]
fadd dword ptr [y]
fld dword ptr [eax+8]
fadd dword ptr [z]
/* stack (top first) is Z Y X : bring X to the top so the pops store X, Y, Z in order */
fxch st(2)
fstp dword ptr [ecx]
fstp dword ptr [ecx+4]
fstp dword ptr [ecx+8]
}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#ifdef OPTIMIZED_FOR_PC_FLOATS
/* No dedicated plain-C variant: the C name is the reference macro. */
#define MTH3D_M_vAdd3ScalarVectorC MTH3D_M_vAdd3ScalarVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS */

/*
 * MTH3D_M_vAdd3ScalarVectorORG : reference version, VectDest = VectA + (x, y, z).
 * Every component goes through MTH_M_xAdd. VectDest and VectA are expanded
 * three times each.
 * Original timing note: 21 clocks.
 */
#define MTH3D_M_vAdd3ScalarVectorORG( VectDest, VectA, x, y, z) \
{ \
    (VectDest)->xX = MTH_M_xAdd( (VectA)->xX, (x) ); \
    (VectDest)->xY = MTH_M_xAdd( (VectA)->xY, (y) ); \
    (VectDest)->xZ = MTH_M_xAdd( (VectA)->xZ, (z) ); \
}
/************************************************************************************************************************/
/* MTH3D_M_vMulScalarVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* 14 clocks*/
/*
 * MTH3D_M_vMulScalarVectorASM : VectDest = a * VectA.
 *
 * VectDest : receives the scaled vector (4-byte floats at offsets 0, 4, 8).
 * a        : scale factor, read from memory as a 4-byte float for each component.
 * VectA    : source vector, read at offsets 0, 4, 8.
 *
 * All three products are on the x87 stack before the first store, so VectDest
 * may be the same object as VectA.
 */
INLINE void MTH3D_M_vMulScalarVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal a,MTH3D_tdstVector *VectA)
{
__asm
{
mov eax,VectA
mov ebx,VectDest
fld dword ptr [eax]
fmul dword ptr [a]
fld dword ptr [eax+4]
fmul dword ptr [a]
fld dword ptr [eax+8]
fmul dword ptr [a]
/* stack (top first) is Z Y X : bring X to the top so the pops store X, Y, Z in order */
fxch st(2)
fstp dword ptr [ebx]
fstp dword ptr [ebx+4]
fstp dword ptr [ebx+8]
}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#ifdef OPTIMIZED_FOR_PC_FLOATS
/*
 * MTH3D_M_vMulScalarVectorC : VectDest = a * VectA.
 * The scalar expression 'a' is evaluated once into a local before the three
 * multiplications; VectDest and VectA are still expanded three times each.
 */
#define MTH3D_M_vMulScalarVectorC( VectDest, a, VectA) \
{ \
    register MTH_tdxReal xTempMTH3D_M_vMulScalarVectorC=(a); \
    (VectDest)->xX = MTH_M_xMul(xTempMTH3D_M_vMulScalarVectorC, (VectA)->xX); \
    (VectDest)->xY = MTH_M_xMul(xTempMTH3D_M_vMulScalarVectorC, (VectA)->xY); \
    (VectDest)->xZ = MTH_M_xMul(xTempMTH3D_M_vMulScalarVectorC, (VectA)->xZ); \
}
#endif /* OPTIMIZED_FOR_PC_FLOATS */

/*
 * MTH3D_M_vMulScalarVectorORG : reference version, VectDest = a * VectA.
 * 'a' is expanded (and therefore evaluated) once per component.
 */
#define MTH3D_M_vMulScalarVectorORG( VectDest, a, VectA) \
{ \
    (VectDest)->xX = MTH_M_xMul((a), (VectA)->xX); \
    (VectDest)->xY = MTH_M_xMul((a), (VectA)->xY); \
    (VectDest)->xZ = MTH_M_xMul((a), (VectA)->xZ); \
}
/************************************************************************************************************************/
/* MTH3D_M_vDivScalarVector*/
/************************************************************************************************************************/
#ifdef OPTIMIZED_FOR_PC_FLOATS_WITH_ASM
/* Division has no assembly version: the ASM name maps to the C macro. */
#define MTH3D_M_vDivScalarVectorASM MTH3D_M_vDivScalarVectorC
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM */

#ifdef OPTIMIZED_FOR_PC_FLOATS
/*
 * MTH3D_M_vDivScalarVectorC : VectDest = VectA / a, using a single division.
 * The reciprocal MTH_C_ONE / a is computed once and handed to
 * MTH3D_M_vMulScalarVector, so the three components are multiplied by the
 * reciprocal rather than divided by 'a'.
 */
#define MTH3D_M_vDivScalarVectorC( VectDest, VectA, a) \
{ \
    register MTH_tdxReal xTempMTH3D_M_vDivScalarVectorC=MTH_M_xDiv(MTH_C_ONE, (a)); \
    MTH3D_M_vMulScalarVector( VectDest ,xTempMTH3D_M_vDivScalarVectorC, VectA); \
}
#endif /* OPTIMIZED_FOR_PC_FLOATS */

/*
 * MTH3D_M_vDivScalarVectorORG : reference version, VectDest = VectA / a.
 * One MTH_M_xDiv per component; 'a' is expanded three times.
 */
#define MTH3D_M_vDivScalarVectorORG( VectDest, VectA, a) \
{ \
    (VectDest)->xX = MTH_M_xDiv((VectA)->xX, (a)); \
    (VectDest)->xY = MTH_M_xDiv((VectA)->xY, (a)); \
    (VectDest)->xZ = MTH_M_xDiv((VectA)->xZ, (a)); \
}
/************************************************************************************************************************/
/* MTH3D_M_vScaleVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* 14 clocks*/
/*
 * MTH3D_M_vScaleVectorASM : component-wise product,
 * VectDest = (Ax*Bx, Ay*By, Az*Bz).
 *
 * VectDest : receives the products (4-byte floats at offsets 0, 4, 8).
 * VectA    : first operand, read at offsets 0, 4, 8.
 * VectB    : second operand, read at offsets 0, 4, 8.
 *
 * All three products are on the x87 stack before the first store, so VectDest
 * may be the same object as VectA or VectB.
 */
INLINE void MTH3D_M_vScaleVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA,MTH3D_tdstVector *VectB)
{
__asm
{
mov eax,VectA
mov ebx,VectB
mov ecx,VectDest
fld dword ptr [eax]
fmul dword ptr [ebx]
fld dword ptr [eax+4]
fmul dword ptr [ebx+4]
fld dword ptr [eax+8]
fmul dword ptr [ebx+8]
/* stack (top first) is Z Y X : bring X to the top so the pops store X, Y, Z in order */
fxch st(2)
fstp dword ptr [ecx]
fstp dword ptr [ecx+4]
fstp dword ptr [ecx+8]
}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#ifdef OPTIMIZED_FOR_PC_FLOATS
/* No dedicated plain-C variant: the C name is the reference macro. */
#define MTH3D_M_vScaleVectorC MTH3D_M_vScaleVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS */

/*
 * MTH3D_M_vScaleVectorORG : reference version of the component-wise product,
 * VectDest = (Ax*Bx, Ay*By, Az*Bz), one MTH_M_xMul per component.
 * Original timing note: 21 clocks.
 */
#define MTH3D_M_vScaleVectorORG( VectDest, VectA, VectB ) \
{ \
    (VectDest)->xX = MTH_M_xMul( (VectA)->xX, (VectB)->xX); \
    (VectDest)->xY = MTH_M_xMul( (VectA)->xY, (VectB)->xY); \
    (VectDest)->xZ = MTH_M_xMul( (VectA)->xZ, (VectB)->xZ); \
}
/************************************************************************************************************************/
/* MTH3D_M_vMulAddVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* 17 clocks : D=xA+B*/
/*
 * MTH3D_M_vMulAddVectorASM : VectDest = x * VectA + VectB.
 *
 * VectDest : receives the result (4-byte floats at offsets 0, 4, 8).
 * x        : scale factor applied to VectA, read from memory as a 4-byte float.
 * VectA    : scaled operand, read at offsets 0, 4, 8.
 * VectB    : added operand, read at offsets 0, 4, 8.
 *
 * All loads precede the first store, so VectDest may be the same object as
 * VectA or VectB. Stack comments below list the x87 stack top first, with
 * xAx = x*Ax and Dx the finished X component.
 */
INLINE void MTH3D_M_vMulAddVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal x,MTH3D_tdstVector *VectA,MTH3D_tdstVector *VectB)
{
__asm
{
mov eax,VectA
mov ebx,VectB
mov ecx,VectDest
fld dword ptr [eax]
fmul dword ptr [x] /* xAx*/
fld dword ptr [eax+4]
fmul dword ptr [x] /* xAy xAx*/
fxch st(1) /* xAx xAy*/
fld dword ptr [eax+8]
fmul dword ptr [x] /* xAz xAx xAy*/
fxch st(1) /* xAx xAz xAy*/
fadd dword ptr [ebx] /* Dx xAz xAy*/
fxch st(2) /* xAy xAz Dx*/
fadd dword ptr [ebx+4] /* Dy xAz Dx*/
fxch st(1) /* xAz Dy Dx*/
fadd dword ptr [ebx+8] /* Dz Dy Dx*/
fxch st(2) /* Dx Dy Dz*/
fstp dword ptr [ecx] /* 1 pen*/
fstp dword ptr [ecx+4]
fstp dword ptr [ecx+8]
}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#ifdef OPTIMIZED_FOR_PC_FLOATS
/* No dedicated plain-C variant: the C name is the reference macro. */
#define MTH3D_M_vMulAddVectorC MTH3D_M_vMulAddVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS */

/*
 * MTH3D_M_vMulAddVectorORG : reference version, VectDest = x * VectA + VectB.
 * Per component: MTH_M_xMul of x and the VectA component, then MTH_M_xAdd with
 * the VectB component. 'x' is expanded three times.
 * Original timing note: 31 clocks.
 */
#define MTH3D_M_vMulAddVectorORG( VectDest, x, VectA, VectB) \
{ \
    (VectDest)->xX = MTH_M_xAdd(MTH_M_xMul((x),(VectA)->xX), (VectB)->xX); \
    (VectDest)->xY = MTH_M_xAdd(MTH_M_xMul((x),(VectA)->xY), (VectB)->xY); \
    (VectDest)->xZ = MTH_M_xAdd(MTH_M_xMul((x),(VectA)->xZ), (VectB)->xZ); \
}
/************************************************************************************************************************/
/* MTH3D_M_vMul3AddVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* 32 clocks (2 penalties) : D=xA+yB+zC*/
/*
 * MTH3D_M_vMul3AddVectorASM : VectDest = x*VectA + y*VectB + z*VectC.
 *
 * VectDest            : receives the result (4-byte floats at offsets 0, 4, 8).
 * x, y, z             : weights, read from memory as 4-byte floats.
 * VectA, VectB, VectC : weighted operands, read at offsets 0, 4, 8.
 *
 * All loads precede the first store, so VectDest may be the same object as any
 * source vector. Stack comments list the x87 stack top first; a0..a2, b0..b2,
 * c0..c2 are the weighted X, Y, Z components of A, B, C; s = a+b; r = a+b+c.
 */
INLINE void MTH3D_M_vMul3AddVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal x,MTH3D_tdstVector *VectA,MTH_tdxReal y,MTH3D_tdstVector *VectB,MTH_tdxReal z,MTH3D_tdstVector *VectC)
{
__asm
{
mov eax,VectA
mov ebx,VectB
mov ecx,VectC
mov edx,VectDest
fld dword ptr [eax]
fmul dword ptr [x]
fld dword ptr [eax+4]
fmul dword ptr [x]
fld dword ptr [eax+8]
fmul dword ptr [x] /* a2 a1 a0*/
fxch st(2) /* a0 a1 a2*/
fld dword ptr [ebx]
fmul dword ptr [y]
fld dword ptr [ebx+4]
fmul dword ptr [y]
fld dword ptr [ebx+8]
fmul dword ptr [y] /* b2 b1 b0 a0 a1 a2*/
fxch st(2) /* b0 b1 b2 a0 a1 a2*/
faddp st(3),st /* b1 b2 s0 a1 a2*/
faddp st(3),st /* b2 s0 s1 a2*/
fld dword ptr [ecx]
fmul dword ptr [z] /* c0 b2 s0 s1 a2*/
fxch st(1) /* b2 c0 s0 s1 a2*/
faddp st(4),st /* c0 s0 s1 s2*/
fld dword ptr [ecx+4]
fmul dword ptr [z]
fld dword ptr [ecx+8]
fmul dword ptr [z] /* c2 c1 c0 s0 s1 s2*/
fxch st(2) /* c0 c1 c2 s0 s1 s2*/
faddp st(3),st /* c1 c2 r0 s1 s2*/
faddp st(3),st /* c2 r0 r1 s2*/
faddp st(3),st /* 1 pen*/
fstp dword ptr [edx] /* 1 pen*/
fstp dword ptr [edx+4]
fstp dword ptr [edx+8]
}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated plain-C variant: the C name is the reference macro. */
#define MTH3D_M_vMul3AddVectorC MTH3D_M_vMul3AddVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
/*
 * MTH3D_M_vMul3AddVectorORG : reference version, VectDest = x*VectA + y*VectB + z*VectC.
 *
 * VectDest            : pointer to the destination vector.
 * x, y, z             : weights of VectA, VectB, VectC.
 * VectA, VectB, VectC : pointers to the source vectors.
 *
 * Each component is one six-argument MTH_M_xMulAddMulAddMul call, passing the
 * (weight, component) pairs in A, B, C order. The previous text closed a
 * parenthesis after every pair, which left the call unbalanced and impossible
 * to expand; the arguments are now passed as a flat list, the same way the
 * matrix product macro of this file uses MTH_M_xMulAddMulAddMul.
 * Every argument is expanded three times: avoid arguments with side effects.
 * Original timing note: at least 42 clocks.
 */
#define MTH3D_M_vMul3AddVectorORG( VectDest, x, VectA, y, VectB, z, VectC) \
{ (VectDest)->xX = MTH_M_xMulAddMulAddMul((x),(VectA)->xX, (y),(VectB)->xX, (z),(VectC)->xX); \
(VectDest)->xY = MTH_M_xMulAddMulAddMul((x),(VectA)->xY, (y),(VectB)->xY, (z),(VectC)->xY); \
(VectDest)->xZ = MTH_M_xMulAddMulAddMul((x),(VectA)->xZ, (y),(VectB)->xZ, (z),(VectC)->xZ); }
/************************************************************************************************************************/
/* MTH3D_M_vMul4AddVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* 40 clocks (0 penalties) : E=xA+yB+zC+wD*/
/*
 * MTH3D_M_vMul4AddVectorASM : VectDest = x*VectA + y*VectB + z*VectC + w*VectD.
 *
 * VectDest                   : receives the result (4-byte floats at offsets 0, 4, 8).
 * x, y, z, w                 : weights, read from memory as 4-byte floats.
 * VectA, VectB, VectC, VectD : weighted operands, read at offsets 0, 4, 8.
 *
 * edx first addresses VectD and is reloaded with VectDest only after the last
 * VectD load. All loads precede the first store, so VectDest may be the same
 * object as any source vector. Stack comments list the x87 stack top first.
 */
INLINE void MTH3D_M_vMul4AddVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal x,MTH3D_tdstVector *VectA,MTH_tdxReal y,MTH3D_tdstVector *VectB,MTH_tdxReal z,MTH3D_tdstVector *VectC,MTH_tdxReal w,MTH3D_tdstVector *VectD)
{
__asm
{
mov eax,VectA
mov ebx,VectB
mov ecx,VectC
mov edx,VectD
fld dword ptr [eax]
fmul dword ptr [x]
fld dword ptr [eax+4]
fmul dword ptr [x]
fld dword ptr [eax+8]
fmul dword ptr [x] /* a2 a1 a0*/
fxch st(2) /* a0 a1 a2*/
fld dword ptr [ebx]
fmul dword ptr [y]
fld dword ptr [ebx+4]
fmul dword ptr [y]
fld dword ptr [ebx+8]
fmul dword ptr [y] /* b2 b1 b0 a0 a1 a2*/
fxch st(2) /* b0 b1 b2 a0 a1 a2*/
faddp st(3),st
faddp st(3),st /* b2 (a0+b0) (a1+b1) a2*/
fld dword ptr [ecx]
fmul dword ptr [z]
fxch st(1)
faddp st(4),st /* c0 (a0+b0) (a1+b1) (a2+b2)*/
fld dword ptr [ecx+4]
fmul dword ptr [z]
fld dword ptr [ecx+8]
fmul dword ptr [z]
fxch st(2) /* c0 c1 c2 then the three partial sums*/
faddp st(3),st
faddp st(3),st /* c2 and partial sums X, Y (A+B+C) and Z (A+B)*/
fld dword ptr [edx]
fmul dword ptr [w]
fxch st(1)
faddp st(4),st /* d0 and the three A+B+C sums*/
fld dword ptr [edx+4]
fmul dword ptr [w]
fld dword ptr [edx+8]
fmul dword ptr [w]
fxch st(2) /* d0 d1 d2 then the three A+B+C sums*/
faddp st(3),st
faddp st(3),st
/* VectD is fully read : edx can now address the destination*/
mov edx,VectDest
faddp st(3),st /* X Y Z*/
fstp dword ptr [edx]
fstp dword ptr [edx+4]
fstp dword ptr [edx+8]
}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated plain-C variant: the C name is the reference macro. */
#define MTH3D_M_vMul4AddVectorC MTH3D_M_vMul4AddVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
/*
 * MTH3D_M_vMul4AddVectorORG : reference version,
 * VectDest = x*VectA + y*VectB + z*VectC + w*VectD.
 *
 * VectDest                   : pointer to the destination vector.
 * x, y, z, w                 : weights of VectA, VectB, VectC, VectD.
 * VectA, VectB, VectC, VectD : pointers to the source vectors.
 *
 * Each component is MTH_M_xAdd of a six-argument MTH_M_xMulAddMulAddMul call
 * (the A, B, C terms) and MTH_M_xMul of w with the VectD component. The
 * previous text closed a parenthesis after every (weight, component) pair,
 * which left the calls unbalanced and impossible to expand.
 * Every argument is expanded three times: avoid arguments with side effects.
 * Original timing note: at least 56 clocks.
 */
#define MTH3D_M_vMul4AddVectorORG( VectDest, x, VectA, y, VectB, z, VectC, w, VectD) \
{ (VectDest)->xX = MTH_M_xAdd(MTH_M_xMulAddMulAddMul((x),(VectA)->xX, (y),(VectB)->xX, (z),(VectC)->xX),MTH_M_xMul((w),(VectD)->xX)); \
(VectDest)->xY = MTH_M_xAdd(MTH_M_xMulAddMulAddMul((x),(VectA)->xY, (y),(VectB)->xY, (z),(VectC)->xY),MTH_M_xMul((w),(VectD)->xY)); \
(VectDest)->xZ = MTH_M_xAdd(MTH_M_xMulAddMulAddMul((x),(VectA)->xZ, (y),(VectB)->xZ, (z),(VectC)->xZ),MTH_M_xMul((w),(VectD)->xZ)); }
/************************************************************************************************************************/
/* MTH3D_M_vLinearInterpolVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* 21 clocks*/
/*
 * MTH3D_M_vLinearInterpolVectorASM : VectDest = VectA + t * (VectB - VectA).
 *
 * VectDest : receives the result (4-byte floats at offsets 0, 4, 8).
 * VectA    : value obtained for t = 0, read at offsets 0, 4, 8.
 * VectB    : value obtained for t = 1, read at offsets 0, 4, 8.
 * t        : interpolation factor, read from memory as a 4-byte float; the
 *            code does not clamp it.
 *
 * All loads precede the first store, so VectDest may be the same object as
 * VectA or VectB. Trailing comments give the x87 stack, top first, after the
 * instruction; D is the difference B - A and C the result.
 */
INLINE void MTH3D_M_vLinearInterpolVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA, MTH3D_tdstVector *VectB, MTH_tdxReal t)
{
/* Cx=Ax+t(Bx-Ax)=Ax+tDx*/
/* Cy=Ay+t(By-Ay)=Ay+tDy*/
/* Cz=Az+t(Bz-Az)=Az+tDz*/
__asm
{
mov ebx,VectB
mov eax,VectA
mov ecx,VectDest
fld dword ptr [ebx] /* Bx*/
fsub dword ptr [eax] /* Dx*/
fld dword ptr [ebx+4] /* By Dx*/
fsub dword ptr [eax+4] /* Dy Dx*/
fld dword ptr [ebx+8] /* Bz Dy Dx*/
fsub dword ptr [eax+8] /* Dz Dy Dx*/
fxch st(2) /* Dx Dy Dz*/
fmul dword ptr [t] /* tDx Dy Dz*/
fld dword ptr [eax+4] /* Ay tDx Dy Dz*/
fxch st(2) /* Dy tDx Ay Dz*/
fmul dword ptr [t] /* tDy tDx Ay Dz*/
fld dword ptr [eax] /* Ax tDy tDx Ay Dz*/
fxch st(4) /* Dz tDy tDx Ay Ax*/
fmul dword ptr [t] /* tDz tDy tDx Ay Ax*/
fxch st(2) /* tDx tDy tDz Ay Ax*/
faddp st(4),st /* tDy tDz Ay Cx*/
faddp st(2),st /* tDz Cy Cx*/
fld dword ptr [eax+8] /* Az tDz Cy Cx */
faddp st(1),st /* Cz Cy Cx*/
fxch st(2) /* Cx Cy Cz*/
fstp dword ptr [ecx] /* Cy Cz*/
fstp dword ptr [ecx+4] /* Cz*/
fstp dword ptr [ecx+8]
}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#ifdef OPTIMIZED_FOR_PC_FLOATS
/* No dedicated plain-C variant: the C name is the reference macro. */
#define MTH3D_M_vLinearInterpolVectorC MTH3D_M_vLinearInterpolVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS */

/*
 * MTH3D_M_vLinearInterpolVectorORG : reference version; each component of
 * VectDest is MTH_M_xLinearInterpol of the matching VectA and VectB
 * components with the same factor t. 't' is expanded three times.
 * Original timing note: 39 clocks.
 */
#define MTH3D_M_vLinearInterpolVectorORG( VectDest, VectA, VectB, t ) \
{ \
    (VectDest)->xX = MTH_M_xLinearInterpol( (VectA)->xX, (VectB)->xX, (t) ); \
    (VectDest)->xY = MTH_M_xLinearInterpol( (VectA)->xY, (VectB)->xY, (t) ); \
    (VectDest)->xZ = MTH_M_xLinearInterpol( (VectA)->xZ, (VectB)->xZ, (t) ); \
}
/************************************************************************************************************************/
/* MTH3D_M_vLinearScaleVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* 21 clocks*/
/*
 * MTH3D_M_vLinearScaleVectorASM : per-component linear interpolation where the
 * factor of each component comes from VectC:
 * VectDest = VectA + VectC * (VectB - VectA), taken component by component.
 *
 * VectDest : receives the result (4-byte floats at offsets 0, 4, 8).
 * VectA    : value obtained where the factor is 0.
 * VectB    : value obtained where the factor is 1.
 * VectC    : one interpolation factor per component; not clamped by the code.
 *
 * All loads precede the first store, so VectDest may be the same object as any
 * source vector. Trailing comments give the x87 stack, top first, after the
 * instruction; D is the difference B - A.
 */
INLINE void MTH3D_M_vLinearScaleVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA, MTH3D_tdstVector *VectB, MTH3D_tdstVector *VectC)
{
/* x=Ax+Cx.(Bx-Ax)=Ax+Cx.Dx*/
/* y=Ay+Cy.(By-Ay)=Ay+Cy.Dy*/
/* z=Az+Cz.(Bz-Az)=Az+Cz.Dz*/
__asm
{
mov ebx,VectB
mov eax,VectA
mov ecx,VectDest
mov edx,VectC
fld dword ptr [ebx] /* Bx*/
fsub dword ptr [eax] /* Dx*/
fld dword ptr [ebx+4] /* By Dx*/
fsub dword ptr [eax+4] /* Dy Dx*/
fld dword ptr [ebx+8] /* Bz Dy Dx*/
fsub dword ptr [eax+8] /* Dz Dy Dx*/
fxch st(2) /* Dx Dy Dz*/
fmul dword ptr [edx] /* CxDx Dy Dz*/
fld dword ptr [eax+4] /* Ay CxDx Dy Dz*/
fxch st(2) /* Dy CxDx Ay Dz*/
fmul dword ptr [edx+4] /* CyDy CxDx Ay Dz*/
fld dword ptr [eax] /* Ax CyDy CxDx Ay Dz*/
fxch st(4) /* Dz CyDy CxDx Ay Ax*/
fmul dword ptr [edx+8] /* CzDz CyDy CxDx Ay Ax*/
fxch st(2) /* CxDx CyDy CzDz Ay Ax*/
faddp st(4),st /* CyDy CzDz Ay x*/
faddp st(2),st /* CzDz y x*/
fld dword ptr [eax+8] /* Az CzDz y x */
faddp st(1),st /* z y x*/
fxch st(2) /* x y z*/
fstp dword ptr [ecx] /* y z*/
fstp dword ptr [ecx+4] /* z*/
fstp dword ptr [ecx+8]
}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#ifdef OPTIMIZED_FOR_PC_FLOATS
/* No dedicated plain-C variant: the C name is the reference macro. */
#define MTH3D_M_vLinearScaleVectorC MTH3D_M_vLinearScaleVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS */

/*
 * MTH3D_M_vLinearScaleVectorORG : reference version; each component of
 * VectDest is MTH_M_xLinearInterpol of the matching VectA and VectB
 * components, using the matching VectC component as the factor.
 * Original timing note: 39 clocks.
 */
#define MTH3D_M_vLinearScaleVectorORG( VectDest, VectA, VectB, VectC ) \
{ \
    (VectDest)->xX = MTH_M_xLinearInterpol( (VectA)->xX, (VectB)->xX, (VectC)->xX ); \
    (VectDest)->xY = MTH_M_xLinearInterpol( (VectA)->xY, (VectB)->xY, (VectC)->xY ); \
    (VectDest)->xZ = MTH_M_xLinearInterpol( (VectA)->xZ, (VectB)->xZ, (VectC)->xZ ); \
}
/************************************************************************************************************************/
/* MTH3D_M_xDotProductVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* 12 clocks (2 penalties) 11 instructions*/
/*
 * MTH3D_M_xDotProductVectorASM : returns Ax*Bx + Ay*By + Az*Bz.
 *
 * VectA, VectB : operands, read as 4-byte floats at offsets 0, 4, 8.
 * Returns      : the dot product, stored into the local xDot by the assembly
 *                and returned from C.
 *
 * NOTE(review): warning 4035 is switched off around the function although the
 * result is returned explicitly; presumably a leftover from a version that
 * left the value in st(0). "default" resets the warning instead of restoring
 * the state it had before this header.
 */
#pragma warning(disable:4035)
INLINE
MTH_tdxReal MTH3D_M_xDotProductVectorASM(struct MTH3D_tdstVector_ *VectA,struct MTH3D_tdstVector_ *VectB)
{
register MTH_tdxReal xDot;
__asm
{
mov eax,VectA
mov ebx,VectB
fld dword ptr [eax]
fmul dword ptr [ebx]
fld dword ptr [eax+4]
fmul dword ptr [ebx+4]
fld dword ptr [eax+8]
fmul dword ptr [ebx+8]
/* stack (top first) is AzBz AyBy AxBx : swap so AyBy is added to AxBx first*/
fxch st(1)
faddp st(2),st
faddp st(1),st /* 2 unavoidable penalties*/
fstp [xDot]
}
return(xDot);
}
#pragma warning(default:4035)
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#ifdef OPTIMIZED_FOR_PC_FLOATS
/* No dedicated plain-C variant: the C name is the reference macro. */
#define MTH3D_M_xDotProductVectorC MTH3D_M_xDotProductVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS */

/*
 * MTH3D_M_xDotProductVectorORG : reference version; expands to an expression
 * whose value is (Ax*Bx + Ay*By) + Az*Bz, built from MTH_M_xMul and MTH_M_xAdd.
 * VectA and VectB are expanded three times each.
 * Original timing note: 15 clocks (5 penalties), 12 instructions.
 */
#define MTH3D_M_xDotProductVectorORG( VectA, VectB) \
    MTH_M_xAdd( \
        MTH_M_xAdd( MTH_M_xMul((VectA)->xX, (VectB)->xX), \
                    MTH_M_xMul((VectA)->xY, (VectB)->xY) ), \
        MTH_M_xMul((VectA)->xZ, (VectB)->xZ) )
/************************************************************************************************************************/
/* MTH3D_M_vCrossProductVectorWithoutBuffer*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* requires no temp buffer if VectA or VectB == VectDest*/
/* 24 clocks (1 penalty) 23 instructions : 8.70 % pairing*/
/*
 * MTH3D_M_vCrossProductVectorWithoutBufferASM : VectDest = VectA x VectB.
 *
 *   X = Ay*Bz - Az*By
 *   Y = Az*Bx - Ax*Bz
 *   Z = Ax*By - Ay*Bx
 *
 * VectDest     : receives the result (4-byte floats at offsets 0, 4, 8).
 * VectA, VectB : operands, read at offsets 0, 4, 8.
 *
 * The six products are all on the x87 stack before the first store, which is
 * why VectDest may be the same object as VectA or VectB.
 */
INLINE void MTH3D_M_vCrossProductVectorWithoutBufferASM(struct MTH3D_tdstVector_ *VectDest,struct MTH3D_tdstVector_ *VectA,struct MTH3D_tdstVector_ *VectB)
{
__asm
{
mov eax,VectA
mov ebx,VectB
mov edx,VectDest
fld dword ptr [eax+4]
fmul dword ptr [ebx+8] /* AyBz*/
fld dword ptr [eax+8]
fmul dword ptr [ebx] /* AzBx AyBz*/
fld dword ptr [eax]
fmul dword ptr [ebx+4] /* AxBy AzBx AyBz*/
fld dword ptr [eax+8]
fmul dword ptr [ebx+4] /* AzBy AxBy AzBx AyBz*/
fld dword ptr [eax]
fmul dword ptr [ebx+8] /* AxBz AzBy AxBy AzBx AyBz*/
fld dword ptr [eax+4]
fmul dword ptr [ebx] /* AyBx AxBz AzBy AxBy AzBx AyBz*/
fxch st(2) /* AzBy AxBz AyBx AxBy AzBx AyBz*/
fsubp st(5),st /* AxBz AyBx AxBy AzBx X*/
fsubp st(3),st /* AyBx AxBy Y X*/
fsubp st(1),st /* Z Y X*/
fxch st(2) /* X Y Z*/
fstp dword ptr [edx] /* 1 penalty here : unavoidable !*/
fstp dword ptr [edx+4]
fstp dword ptr [edx+8]
}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#ifdef OPTIMIZED_FOR_PC_FLOATS
/* No dedicated plain-C variant: the C name is the reference macro. */
#define MTH3D_M_vCrossProductVectorWithoutBufferC MTH3D_M_vCrossProductVectorWithoutBufferORG
#endif /* OPTIMIZED_FOR_PC_FLOATS */

/*
 * MTH3D_M_vCrossProductVectorWithoutBufferORG : reference cross product,
 * VectDest = VectA x VectB, one MTH_M_xMulSubMul per component.
 * The components are written one after another, so xX is already overwritten
 * when xY and xZ are computed: VectDest must not be VectA or VectB here.
 * Original timing note: 42 clocks (18 penalties), 25 instructions, 8.00 % pairing.
 */
#define MTH3D_M_vCrossProductVectorWithoutBufferORG(VectDest, VectA, VectB) \
{ \
    (VectDest)->xX=MTH_M_xMulSubMul((VectA)->xY,(VectB)->xZ,(VectA)->xZ,(VectB)->xY); \
    (VectDest)->xY=MTH_M_xMulSubMul((VectA)->xZ,(VectB)->xX,(VectA)->xX,(VectB)->xZ); \
    (VectDest)->xZ=MTH_M_xMulSubMul((VectA)->xX,(VectB)->xY,(VectA)->xY,(VectB)->xX); \
}
/************************************************************************************************************************/
/* MTH3D_M_vCrossProductVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* The assembly cross product tolerates VectDest == VectA or VectB, so it is used directly. */
#define MTH3D_M_vCrossProductVectorASM MTH3D_M_vCrossProductVectorWithoutBufferASM
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* NOTE(review): this maps to the unbuffered C macro, which writes the components one by one; confirm callers never alias VectDest with a source. */
#define MTH3D_M_vCrossProductVectorC MTH3D_M_vCrossProductVectorWithoutBufferC
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
/*
 * MTH3D_M_vCrossProductVectorORG : alias-safe cross product, VectDest = VectA x VectB.
 *
 * VectDest     : pointer to the destination vector.
 * VectA, VectB : pointers to the operands.
 *
 * When VectDest is one of the operands the product is computed into a local
 * vector and then copied, otherwise it is computed in place. The pointer
 * arguments are now parenthesized inside the comparison, so expressions such
 * as "p + 1" are compared as a whole instead of being split by operator
 * precedence. Arguments are expanded several times: avoid side effects.
 */
#define MTH3D_M_vCrossProductVectorORG(VectDest, VectA, VectB) \
{ if( ((VectDest)==(VectA)) || ((VectDest)==(VectB)) ) \
{ \
MTH3D_tdstVector VectTmp; \
MTH3D_M_vCrossProductVectorWithoutBuffer(&VectTmp, VectA, VectB); \
MTH3D_M_vCopyVector(VectDest, &VectTmp); \
} \
else \
{ \
MTH3D_M_vCrossProductVectorWithoutBuffer(VectDest, VectA, VectB); \
} \
}
/************************************************************************************************************************/
/* MTH3D_M_vMulMatrixMatrixWithoutBuffer*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* requires a buffer only if MatDest==A*/
/* 91 clocks (0 penalty) 95 instructions : 4.21 % pairing*/
/* tricks remove penalties*/
/*
 * MTH3D_M_vMulMatrixMatrixWithoutBufferASM : MatDest = MatA * MatB for 3x3
 * matrices addressed as nine 4-byte floats at byte offsets 0..32.
 *
 * For each group k = 0, 1, 2 of three consecutive floats of MatB (offsets
 * 12k, 12k+4, 12k+8) the code computes, for c = 0, 1, 2:
 *   Dest[12k+4c] = B[12k]*A[4c] + B[12k+4]*A[12+4c] + B[12k+8]*A[24+4c]
 * which is the same arithmetic as the stCol_0/1/2 reference macro if the three
 * column vectors are laid out contiguously in that order (presumably; confirm
 * against the MTH3D_tdstMatrix_ declaration).
 *
 * MatDest : receives the product.
 * MatA    : left operand; re-read for every group, after earlier groups of
 *           MatDest have been stored, so MatDest must not be MatA.
 * MatB    : right operand; each group is fully loaded before the matching
 *           group of MatDest is stored, so MatDest may be MatB.
 *
 * The "trick" comments are the original author's scheduling notes: the first
 * load of the next group is issued before the stores of the current group,
 * and the last value of the second group stays on the x87 stack until the end.
 */
INLINE void MTH3D_M_vMulMatrixMatrixWithoutBufferASM(struct MTH3D_tdstMatrix_ *MatDest,struct MTH3D_tdstMatrix_ *MatA,struct MTH3D_tdstMatrix_ *MatB)
{
__asm
{
mov ebx,MatB
mov eax,MatA
mov ecx,MatDest
/* first group : B[0], B[4], B[8] -> Dest[0], Dest[4], Dest[8]*/
fld dword ptr [ebx]
fmul dword ptr [eax]
fld dword ptr [ebx]
fmul dword ptr [eax+4]
fld dword ptr [ebx]
fmul dword ptr [eax+8]
fld dword ptr [ebx+4]
fmul dword ptr [eax+12]
fld dword ptr [ebx+4]
fmul dword ptr [eax+16]
fld dword ptr [ebx+4]
fmul dword ptr [eax+20]
fxch st(2)
faddp st(5),st
faddp st(3),st
faddp st(1),st
fld dword ptr [ebx+8]
fmul dword ptr [eax+24]
fld dword ptr [ebx+8]
fmul dword ptr [eax+28]
fld dword ptr [ebx+8]
fmul dword ptr [eax+32]
fxch st(2)
faddp st(5),st
faddp st(3),st
faddp st(1),st
fxch st(1) /* trick A : preload next value*/
fld dword ptr [ebx+12]
fxch st(3)
fstp dword ptr [ecx] /* A no more penalty here*/
fstp dword ptr [ecx+4]
fstp dword ptr [ecx+8]
/* second group : B[12], B[16], B[20] ; B[12] is already on the stack*/
fmul dword ptr [eax]
fld dword ptr [ebx+12]
fmul dword ptr [eax+4]
fld dword ptr [ebx+12]
fmul dword ptr [eax+8]
fld dword ptr [ebx+16]
fmul dword ptr [eax+12]
fld dword ptr [ebx+16]
fmul dword ptr [eax+16]
fld dword ptr [ebx+16]
fmul dword ptr [eax+20]
fxch st(2)
faddp st(5),st
faddp st(3),st
faddp st(1),st
fld dword ptr [ebx+20]
fmul dword ptr [eax+24]
fld dword ptr [ebx+20]
fmul dword ptr [eax+28]
fld dword ptr [ebx+20]
fmul dword ptr [eax+32]
fxch st(2)
faddp st(5),st
faddp st(3),st
faddp st(1),st
fxch st(2) /* trick B : preload next value*/
/* trick C : replace fxch st(1)*/
fld dword ptr [ebx+24]
fmul dword ptr [eax]
fxch st(2) /* trick C : replace fxch st(3)*/
fstp dword ptr [ecx+16]
fstp dword ptr [ecx+12] /* B: no more penalty here*/
/*fstp dword ptr [ecx+20] // trick C : store it later*/
/* third group : B[24], B[28], B[32] ; B[24]*A[0] and the pending Dest[20] value are on the stack*/
fld dword ptr [ebx+24]
fmul dword ptr [eax+4]
fld dword ptr [ebx+24]
fmul dword ptr [eax+8]
fld dword ptr [ebx+28]
fmul dword ptr [eax+12]
fld dword ptr [ebx+28]
fmul dword ptr [eax+16]
fld dword ptr [ebx+28]
fmul dword ptr [eax+20]
fxch st(2)
faddp st(5),st
faddp st(3),st
faddp st(1),st
fxch st(3) /* trick C :added*/
fld dword ptr [ebx+32]
fmul dword ptr [eax+24]
fld dword ptr [ebx+32]
fmul dword ptr [eax+28]
fld dword ptr [ebx+32]
fmul dword ptr [eax+32]
fxch st(2)
faddp st(5),st
faddp st(3),st
faddp st(4),st /* trick C : replace faddp st(1),st*/
fstp dword ptr [ecx+20] /* trick C : store it later*/
/* no more penalty here*/
fstp dword ptr [ecx+28] /* trick C : swapped stores*/
fstp dword ptr [ecx+24]
fstp dword ptr [ecx+32]
}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#ifdef OPTIMIZED_FOR_PC_FLOATS
/* No dedicated plain-C variant: the C name is the reference macro. */
#define MTH3D_M_vMulMatrixMatrixWithoutBufferC MTH3D_M_vMulMatrixMatrixWithoutBufferORG
#endif /* OPTIMIZED_FOR_PC_FLOATS */

/*
 * MTH3D_M_vMulMatrixMatrixWithoutBufferORG : reference 3x3 product,
 * MatDest = MatA * MatB, on matrices stored as three column vectors
 * stCol_0, stCol_1, stCol_2.
 *
 * Column k of MatDest is MatA applied to column k of MatB; each of the nine
 * components is one MTH_M_xMulAddMulAddMul call. Results are written while
 * MatA and MatB are still being read, so MatDest must be distinct from both.
 * Original timing note: 174 clocks (69 penalties), 119 instructions, 1.68 % pairing.
 */
#define MTH3D_M_vMulMatrixMatrixWithoutBufferORG(MatDest, MatA, MatB) \
{ \
    (MatDest)->stCol_0.xX = MTH_M_xMulAddMulAddMul( (MatA)->stCol_0.xX, (MatB)->stCol_0.xX, \
                                                    (MatA)->stCol_1.xX, (MatB)->stCol_0.xY, \
                                                    (MatA)->stCol_2.xX, (MatB)->stCol_0.xZ ); \
    (MatDest)->stCol_0.xY = MTH_M_xMulAddMulAddMul( (MatA)->stCol_0.xY, (MatB)->stCol_0.xX, \
                                                    (MatA)->stCol_1.xY, (MatB)->stCol_0.xY, \
                                                    (MatA)->stCol_2.xY, (MatB)->stCol_0.xZ ); \
    (MatDest)->stCol_0.xZ = MTH_M_xMulAddMulAddMul( (MatA)->stCol_0.xZ, (MatB)->stCol_0.xX, \
                                                    (MatA)->stCol_1.xZ, (MatB)->stCol_0.xY, \
                                                    (MatA)->stCol_2.xZ, (MatB)->stCol_0.xZ ); \
    \
    (MatDest)->stCol_1.xX = MTH_M_xMulAddMulAddMul( (MatA)->stCol_0.xX, (MatB)->stCol_1.xX, \
                                                    (MatA)->stCol_1.xX, (MatB)->stCol_1.xY, \
                                                    (MatA)->stCol_2.xX, (MatB)->stCol_1.xZ ); \
    (MatDest)->stCol_1.xY = MTH_M_xMulAddMulAddMul( (MatA)->stCol_0.xY, (MatB)->stCol_1.xX, \
                                                    (MatA)->stCol_1.xY, (MatB)->stCol_1.xY, \
                                                    (MatA)->stCol_2.xY, (MatB)->stCol_1.xZ ); \
    (MatDest)->stCol_1.xZ = MTH_M_xMulAddMulAddMul( (MatA)->stCol_0.xZ, (MatB)->stCol_1.xX, \
                                                    (MatA)->stCol_1.xZ, (MatB)->stCol_1.xY, \
                                                    (MatA)->stCol_2.xZ, (MatB)->stCol_1.xZ ); \
    \
    (MatDest)->stCol_2.xX = MTH_M_xMulAddMulAddMul( (MatA)->stCol_0.xX, (MatB)->stCol_2.xX, \
                                                    (MatA)->stCol_1.xX, (MatB)->stCol_2.xY, \
                                                    (MatA)->stCol_2.xX, (MatB)->stCol_2.xZ ); \
    (MatDest)->stCol_2.xY = MTH_M_xMulAddMulAddMul( (MatA)->stCol_0.xY, (MatB)->stCol_2.xX, \
                                                    (MatA)->stCol_1.xY, (MatB)->stCol_2.xY, \
                                                    (MatA)->stCol_2.xY, (MatB)->stCol_2.xZ ); \
    (MatDest)->stCol_2.xZ = MTH_M_xMulAddMulAddMul( (MatA)->stCol_0.xZ, (MatB)->stCol_2.xX, \
                                                    (MatA)->stCol_1.xZ, (MatB)->stCol_2.xY, \
                                                    (MatA)->stCol_2.xZ, (MatB)->stCol_2.xZ ); \
}
#if defined(OPTIMIZED_FOR_U64_ASM)
/*
 * MTH3D_M_vMulMatrixMatrixWithoutBufferU64ASM : MatDest = MatA * MatB for 3x3
 * matrices addressed as nine 4-byte floats at byte offsets 0..32, written in
 * MIPS FPU assembly (lwc1 / mul.s / add.s / swc1).
 *
 * The body is three copies of a matrix-by-vector product: each one loads three
 * consecutive floats of MatB (offsets 0, 12 and 24 onward), multiplies them by
 * MatA and stores three floats of MatDest at the same offsets.
 *
 * MatDest : receives the product (operand %0).
 * MatA    : left operand (operand %1); reloaded by every copy after the
 *           previous results were stored, so MatDest must not be MatA.
 * MatB    : right operand (operand %2).
 *
 * The asm statement reads and writes memory through its pointer operands, so
 * "memory" is listed in the clobbers: without it the compiler is allowed to
 * keep matrix values cached in registers across the statement or to move
 * surrounding loads and stores past it.
 */
static inline void MTH3D_M_vMulMatrixMatrixWithoutBufferU64ASM(struct MTH3D_tdstMatrix_ *MatDest,struct MTH3D_tdstMatrix_ *MatA,struct MTH3D_tdstMatrix_ *MatB)
{
/* GGG RRR OOO U U M M PPP FFFF !! !!
G G R R O O U U MM MM P P F !! !!
G RRR O O U U M M M PPP FFF !! !!
G GG R R O O U U M M P F
GGG R R OOO UUU M M P F oo oo
PLEASE DO NOT REMOVE ASM IN LOWER CASE !!!!!*/
asm(" .set noreorder ");
asm(" # Begin MulMatrixMatrixWithoutBufferU64ASM ");
asm(
" # Premier MulMatrixVertex \n"
" lwc1 $f6,0(%2) \n"
" lwc1 $f8,4(%2) \n"
" lwc1 $f10,8(%2) \n"
" lwc1 $f0,0(%1) # f0 <- Mat[0,0] \n"
" lwc1 $f2,12(%1) # f2 <- Mat[0,1] \n"
" mul.s $f0,$f0,$f6 # f0 <- Mat[0,0] x Vect[0] \n"
" lwc1 $f4,24(%1) # f4 <- Mat[0,2] \n"
" mul.s $f2,$f2,$f8 # f2 <- Mat[0,1] x Vect[1] \n"
" lwc1 $f12,4(%1) # f12 <- Mat[1,0] \n"
" mul.s $f4,$f4,$f10 # f4 <- Mat[0,2] x Vect[2] \n"
" lwc1 $f14,16(%1) # f14 <- Mat[1,1] \n"
" mul.s $f12,$f12,$f6 # f12 <- Mat[1,0] x Vect[0] \n"
" add.s $f0,$f0,$f2 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n"
" lwc1 $f16,28(%1) # f16 <- Mat[1,2] \n"
" mul.s $f14,$f14,$f8 # f14 <- Mat[1,1] x Vect[1] \n"
" add.s $f0,$f0,$f4 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n"
" lwc1 $f18,8(%1) # f18 <- Mat[2,0] \n"
" mul.s $f16,$f16,$f10 # f16 <- Mat[1,2] x Vect[2] \n"
" add.s $f12,$f12,$f14 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n"
" lwc1 $f2,20(%1) # f2 <- Mat[2,1] \n"
" mul.s $f18,$f18,$f6 # f18 <- Mat[2,0] x Vect[0] \n"
" add.s $f12,$f12,$f16 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n"
" lwc1 $f4,32(%1) # f4 <- Mat[2,2] \n"
" mul.s $f2,$f2,$f8 # f2 <- Mat[2,1] x Vect[1] \n"
" swc1 $f0,0(%0) # f0 -> Dest[0] \n"
" mul.s $f4,$f4,$f10 # f4 <- Mat[2,2] x Vect[2] \n"
" add.s $f18,$f18,$f2 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n"
" swc1 $f12,4(%0) # f12 -> Dest[1] \n"
" add.s $f18,$f18,$f4 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n"
" swc1 $f18,8(%0) # f18 -> Dest[2] \n"
" # Deuxi<78>me MulMatrixVertex \n"
" lwc1 $f6,12(%2) \n"
" lwc1 $f8,16(%2) \n"
" lwc1 $f10,20(%2) \n"
" lwc1 $f0,0(%1) # f0 <- Mat[0,0] \n"
" lwc1 $f2,12(%1) # f2 <- Mat[0,1] \n"
" mul.s $f0,$f0,$f6 # f0 <- Mat[0,0] x Vect[0] \n"
" lwc1 $f4,24(%1) # f4 <- Mat[0,2] \n"
" mul.s $f2,$f2,$f8 # f2 <- Mat[0,1] x Vect[1] \n"
" lwc1 $f12,4(%1) # f12 <- Mat[1,0] \n"
" mul.s $f4,$f4,$f10 # f4 <- Mat[0,2] x Vect[2] \n"
" lwc1 $f14,16(%1) # f14 <- Mat[1,1] \n"
" mul.s $f12,$f12,$f6 # f12 <- Mat[1,0] x Vect[0] \n"
" add.s $f0,$f0,$f2 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n"
" lwc1 $f16,28(%1) # f16 <- Mat[1,2] \n"
" mul.s $f14,$f14,$f8 # f14 <- Mat[1,1] x Vect[1] \n"
" add.s $f0,$f0,$f4 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n"
" lwc1 $f18,8(%1) # f18 <- Mat[2,0] \n"
" mul.s $f16,$f16,$f10 # f16 <- Mat[1,2] x Vect[2] \n"
" add.s $f12,$f12,$f14 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n"
" lwc1 $f2,20(%1) # f2 <- Mat[2,1] \n"
" mul.s $f18,$f18,$f6 # f18 <- Mat[2,0] x Vect[0] \n"
" add.s $f12,$f12,$f16 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n"
" lwc1 $f4,32(%1) # f4 <- Mat[2,2] \n"
" mul.s $f2,$f2,$f8 # f2 <- Mat[2,1] x Vect[1] \n"
" swc1 $f0,12(%0) # f0 -> Dest[0] \n"
" mul.s $f4,$f4,$f10 # f4 <- Mat[2,2] x Vect[2] \n"
" add.s $f18,$f18,$f2 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n"
" swc1 $f12,16(%0) # f12 -> Dest[1] \n"
" add.s $f18,$f18,$f4 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n"
" swc1 $f18,20(%0) # f18 -> Dest[2] \n"
" # Troisi<73>me MulMatrixVertex \n"
" lwc1 $f6,24(%2) \n"
" lwc1 $f8,28(%2) \n"
" lwc1 $f10,32(%2) \n"
" lwc1 $f0,0(%1) # f0 <- Mat[0,0] \n"
" lwc1 $f2,12(%1) # f2 <- Mat[0,1] \n"
" mul.s $f0,$f0,$f6 # f0 <- Mat[0,0] x Vect[0] \n"
" lwc1 $f4,24(%1) # f4 <- Mat[0,2] \n"
" mul.s $f2,$f2,$f8 # f2 <- Mat[0,1] x Vect[1] \n"
" lwc1 $f12,4(%1) # f12 <- Mat[1,0] \n"
" mul.s $f4,$f4,$f10 # f4 <- Mat[0,2] x Vect[2] \n"
" lwc1 $f14,16(%1) # f14 <- Mat[1,1] \n"
" mul.s $f12,$f12,$f6 # f12 <- Mat[1,0] x Vect[0] \n"
" add.s $f0,$f0,$f2 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n"
" lwc1 $f16,28(%1) # f16 <- Mat[1,2] \n"
" mul.s $f14,$f14,$f8 # f14 <- Mat[1,1] x Vect[1] \n"
" add.s $f0,$f0,$f4 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n"
" lwc1 $f18,8(%1) # f18 <- Mat[2,0] \n"
" mul.s $f16,$f16,$f10 # f16 <- Mat[1,2] x Vect[2] \n"
" add.s $f12,$f12,$f14 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n"
" lwc1 $f2,20(%1) # f2 <- Mat[2,1] \n"
" mul.s $f18,$f18,$f6 # f18 <- Mat[2,0] x Vect[0] \n"
" add.s $f12,$f12,$f16 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n"
" lwc1 $f4,32(%1) # f4 <- Mat[2,2] \n"
" mul.s $f2,$f2,$f8 # f2 <- Mat[2,1] x Vect[1] \n"
" swc1 $f0,24(%0) # f0 -> Dest[0] \n"
" mul.s $f4,$f4,$f10 # f4 <- Mat[2,2] x Vect[2] \n"
" add.s $f18,$f18,$f2 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n"
" swc1 $f12,28(%0) # f12 -> Dest[1] \n"
" add.s $f18,$f18,$f4 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n"
" swc1 $f18,32(%0) # f18 -> Dest[2] \n"
: : "r" (MatDest), "r" (MatA), "r" (MatB) : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18","memory" );
asm(" # EndOf MulMatrixMatrixWithoutBufferU64ASM ");
asm(" .set reorder ");
}
#endif /* OPTIMIZED_FOR_U64_ASM*/
/************************************************************************************************************************/
/* MTH3D_M_vMulMatrixMatrix*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/*
 * MTH3D_M_vMulMatrixMatrixASM(Mat_Dest, Mat_A, Mat_B)
 *
 * Stores the product computed by MTH3D_M_vMulMatrixMatrixWithoutBuffer
 * (Mat_A, Mat_B) into Mat_Dest. When Mat_Dest and Mat_A are the same
 * pointer the product is first built in a local MTH3D_tdstMatrix and then
 * copied to Mat_Dest with MTH3D_M_vCopyMatrix.
 *
 * Only Mat_Dest==Mat_A is detected here.
 * NOTE(review): presumably the ASM kernel tolerates Mat_Dest==Mat_B - confirm.
 *
 * All three arguments are pointer expressions; each is expanded more than
 * once, so they must be free of side effects. Every use is parenthesized so
 * that arguments such as "c ? p : q" compare and pass correctly.
 * The macro expands to a braced block (no trailing ';' required).
 */
#define MTH3D_M_vMulMatrixMatrixASM(Mat_Dest, Mat_A, Mat_B) \
{ if ((Mat_Dest)==(Mat_A)) \
  { \
    /* long, prefixed name: must not capture a caller variable */ \
    MTH3D_tdstMatrix MTH3D_MulMatMatASM_stTmp; \
 \
    MTH3D_M_vMulMatrixMatrixWithoutBuffer( \
      &MTH3D_MulMatMatASM_stTmp, (Mat_A), (Mat_B)); \
    MTH3D_M_vCopyMatrix( (Mat_Dest), &MTH3D_MulMatMatASM_stTmp); \
  } \
  else \
  { \
    MTH3D_M_vMulMatrixMatrixWithoutBuffer((Mat_Dest), \
      (Mat_A), (Mat_B)); \
  } \
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vMulMatrixMatrixC MTH3D_M_vMulMatrixMatrixORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
/*
 * MTH3D_M_vMulMatrixMatrixORG(Mat_Dest, Mat_A, Mat_B)
 *
 * Stores the product computed by MTH3D_M_vMulMatrixMatrixWithoutBuffer
 * (Mat_A, Mat_B) into Mat_Dest. When Mat_Dest is the same pointer as Mat_A
 * or as Mat_B, the product is first built in a local MTH3D_tdstMatrix and
 * then copied to Mat_Dest with MTH3D_M_vCopyMatrix; otherwise it is written
 * to Mat_Dest directly.
 *
 * All three arguments are pointer expressions; each is expanded more than
 * once, so they must be free of side effects. Every use is parenthesized so
 * that arguments such as "c ? p : q" compare and pass correctly.
 * The macro expands to a braced block (no trailing ';' required).
 */
#define MTH3D_M_vMulMatrixMatrixORG(Mat_Dest, Mat_A, Mat_B) \
{ if( ((Mat_Dest)==(Mat_A)) || ((Mat_Dest)==(Mat_B)) ) \
  { \
    /* long, prefixed name: must not capture a caller variable */ \
    MTH3D_tdstMatrix MTH3D_MulMatMatORG_stTmp; \
 \
    MTH3D_M_vMulMatrixMatrixWithoutBuffer( \
      &MTH3D_MulMatMatORG_stTmp, (Mat_A), (Mat_B)); \
    MTH3D_M_vCopyMatrix( (Mat_Dest), &MTH3D_MulMatMatORG_stTmp); \
  } \
  else \
  { \
    MTH3D_M_vMulMatrixMatrixWithoutBuffer((Mat_Dest), \
      (Mat_A), (Mat_B)); \
  } \
}
/************************************************************************************************************************/
/* MTH3D_M_xDetMatrix*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* only 9 muls instead of 12 !!! */
/* 31 clocks but 5 penalties*/
/*
 * MTH3D_M_xDetMatrixASM: x87 inline-assembly determinant of the matrix MatA.
 *
 * Computes 0X.(1Y.2Z-1Z.2Y) + 0Y.(1Z.2X-1X.2Z) + 0Z.(1X.2Y-1Y.2X), where
 * "cK" denotes component K of column c, and returns it.
 * The byte offsets used below pair up like the operands of
 * MTH3D_M_xDetMatrixC if stCol_0/1/2 sit at +0/+12/+24 with 4-byte
 * xX/xY/xZ at +0/+4/+8 - confirm against the struct definition.
 * Stack comments list the x87 stack from st(0) leftmost.
 * Only ecx is used as an integer register.
 * C4035 is disabled around the function.
 * NOTE(review): C4035 is presumably MSVC's "no return value" warning, a
 * leftover from returning the result directly in st(0) - confirm.
 */
#pragma warning(disable:4035)
INLINE MTH_tdxReal MTH3D_M_xDetMatrixASM(struct MTH3D_tdstMatrix_ *MatA)
{
register MTH_tdxReal xTempMatrix; /* not needed in principle, but (per the original author) without it the compiler hangs in release mode !?!*/
__asm
{
/* = 0X.(1Y.2Z-1Z.2Y) | = 0X.(A-B) | = X*/
/* + 0Y.(1Z.2X-1X.2Z) | + 0Y.(C-D) | + Y*/
/* + 0Z.(1X.2Y-1Y.2X) | + 0Z.(E-F) | + Z*/
mov ecx,MatA
fld dword ptr [ecx+16] /* load 1Y*/
fmul dword ptr [ecx+32] /* A*/
fld dword ptr [ecx+20] /* load 1Z*/
fmul dword ptr [ecx+28] /* B A*/
fld dword ptr [ecx+20] /* load 1Z*/
fmul dword ptr [ecx+24] /* C B A*/
fld dword ptr [ecx+12] /* load 1X*/
fmul dword ptr [ecx+32] /* D C B A*/
fxch st(2) /* B C D A*/
fsubp st(3),st /* C D A-B*/
fld dword ptr [ecx+12] /* load 1X*/
fmul dword ptr [ecx+28] /* E C D A-B*/
fxch st(2) /* D C E A-B*/
fsubp st(1),st /* C-D E A-B*/
fxch st(2) /* A-B E C-D*/
fmul dword ptr [ecx] /* X E C-D*/
fld dword ptr [ecx+16] /* load 1Y*/
fmul dword ptr [ecx+24] /* F X E C-D*/
fxch st(3) /* C-D X E F */
fmul dword ptr [ecx+4] /* Y X E F*/
fxch st(3) /* F X E Y*/
fsubp st(2),st /* X E-F Y*/
faddp st(2),st /* E-F X+Y*/
fmul dword ptr [ecx+8] /* Z X+Y*/
faddp st(1),st /* X+Y+Z*/
fstp dword ptr [xTempMatrix] /* spill the result to the local (see the note on xTempMatrix); leaves the x87 stack empty*/
}
return(xTempMatrix); /* returned through the local rather than directly in st(0) (see the note on xTempMatrix)*/
}
#pragma warning(default:4035)
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* only 9 muls instead of 12 !!! */
/* 37 clocks but 11 penalties*/
/*
 * MTH3D_M_xDetMatrixC(MatA): expression macro yielding the determinant of
 * the matrix pointed to by MatA, expanded along column 0:
 *   0X.(1Y.2Z-1Z.2Y) + 0Y.(1Z.2X-1X.2Z) + 0Z.(1X.2Y-1Y.2X)
 * where "cK" is (MatA)->stCol_c.xK.
 * MatA is expanded many times, so it must be free of side effects.
 * NOTE(review): MTH_M_xMulSubMul(a,b,c,d) is presumably a*b - c*d - confirm.
 */
#define MTH3D_M_xDetMatrixC( MatA ) \
MTH_M_xAdd3( \
MTH_M_xMul( (MatA)->stCol_0.xX, \
MTH_M_xMulSubMul( (MatA)->stCol_1.xY, (MatA)->stCol_2.xZ, (MatA)->stCol_1.xZ, (MatA)->stCol_2.xY ) \
), \
MTH_M_xMul( (MatA)->stCol_0.xY, \
MTH_M_xMulSubMul( (MatA)->stCol_1.xZ, (MatA)->stCol_2.xX, (MatA)->stCol_1.xX, (MatA)->stCol_2.xZ ) \
), \
MTH_M_xMul( (MatA)->stCol_0.xZ, \
MTH_M_xMulSubMul( (MatA)->stCol_1.xX, (MatA)->stCol_2.xY, (MatA)->stCol_1.xY, (MatA)->stCol_2.xX ) \
) \
)
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
/* 41 clocks but 16 penalties*/
/*
 * MTH3D_M_xDetMatrixORG(MatA): generic expression macro yielding the
 * determinant of the matrix pointed to by MatA as
 *   (0X.1Y.2Z + 0Y.1Z.2X + 0Z.1X.2Y) - (0Z.1Y.2X + 1Z.2Y.0X + 2Z.0Y.1X)
 * where "cK" is (MatA)->stCol_c.xK, i.e. six triple products.
 * MatA is expanded many times, so it must be free of side effects.
 */
#define MTH3D_M_xDetMatrixORG(MatA) \
MTH_M_xSub( \
MTH_M_xAdd3( \
MTH_M_xMul3( (MatA)->stCol_0.xX, (MatA)->stCol_1.xY, (MatA)->stCol_2.xZ ), \
MTH_M_xMul3( (MatA)->stCol_0.xY, (MatA)->stCol_1.xZ, (MatA)->stCol_2.xX ), \
MTH_M_xMul3( (MatA)->stCol_0.xZ, (MatA)->stCol_1.xX, (MatA)->stCol_2.xY )), \
MTH_M_xAdd3( \
MTH_M_xMul3( (MatA)->stCol_0.xZ, (MatA)->stCol_1.xY, (MatA)->stCol_2.xX ), \
MTH_M_xMul3( (MatA)->stCol_1.xZ, (MatA)->stCol_2.xY, (MatA)->stCol_0.xX ), \
MTH_M_xMul3( (MatA)->stCol_2.xZ, (MatA)->stCol_0.xY, (MatA)->stCol_1.xX )))
/************************************************************************************************************************/
/* MTH3D_M_vMulMatrixVectorWithoutBuffer*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* requires no temporary buffer if VectDest==VectA*/
/* 32 clocks (1 penalty) 33 instructions : 12,12 % pairing*/
/*
 * MTH3D_M_vMulMatrixVectorWithoutBufferASM: x87 inline-assembly product of
 * the matrix MatA by the vector VectA, stored in VectDest.
 *
 * With v = VectA (floats at +0/+4/+8) and m = MatA (floats at +0..+32):
 *   Dest[+0] = v[+0].m[+0] + v[+4].m[+12] + v[+8].m[+24]
 *   Dest[+4] = v[+0].m[+4] + v[+4].m[+16] + v[+8].m[+28]
 *   Dest[+8] = v[+0].m[+8] + v[+4].m[+20] + v[+8].m[+32]
 * Every load of VectA happens before the three final stores, which is why
 * VectDest may be the same vector as VectA.
 * Stack comments list the x87 stack from st(0) leftmost: xN/yN/zN are the
 * products of vector component N with the X/Y/Z entry of matrix column N,
 * sX/sY/sZ are the running sums.
 * Uses eax, ebx and ecx.
 */
INLINE void MTH3D_M_vMulMatrixVectorWithoutBufferASM( struct MTH3D_tdstVector_ *VectDest, struct MTH3D_tdstMatrix_ *MatA, struct MTH3D_tdstVector_ *VectA)
{
__asm
{
mov ebx,VectA
mov eax,MatA
mov ecx,VectDest
fld dword ptr [ebx]
fmul dword ptr [eax] /* x0*/
fld dword ptr [ebx]
fmul dword ptr [eax+4] /* y0 x0*/
fld dword ptr [ebx]
fmul dword ptr [eax+8] /* z0 y0 x0*/
fld dword ptr [ebx+4]
fmul dword ptr [eax+12] /* x1 z0 y0 x0*/
fld dword ptr [ebx+4]
fmul dword ptr [eax+16] /* y1 x1 z0 y0 x0*/
fld dword ptr [ebx+4]
fmul dword ptr [eax+20] /* z1 y1 x1 z0 y0 x0*/
fxch st(2) /* x1 y1 z1 z0 y0 x0*/
faddp st(5),st /* y1 z1 z0 y0 sX*/
faddp st(3),st /* z1 z0 sY sX*/
faddp st(1),st /* sZ sY sX*/
fld dword ptr [ebx+8]
fmul dword ptr [eax+24] /* x2 sZ sY sX*/
fld dword ptr [ebx+8]
fmul dword ptr [eax+28] /* y2 x2 sZ sY sX*/
fld dword ptr [ebx+8]
fmul dword ptr [eax+32] /* z2 y2 x2 sZ sY sX*/
fxch st(2) /* x2 y2 z2 sZ sY sX*/
faddp st(5),st /* y2 z2 sZ sY sX*/
faddp st(3),st /* z2 sZ sY sX*/
faddp st(1),st /* sZ sY sX*/
fxch st(2) /* sX sY sZ*/
fstp dword ptr [ecx] /* Dest[+0] <- sX*/
fstp dword ptr [ecx+4] /* Dest[+4] <- sY*/
fstp dword ptr [ecx+8] /* Dest[+8] <- sZ, x87 stack empty again*/
}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vMulMatrixVectorWithoutBufferC MTH3D_M_vMulMatrixVectorWithoutBufferORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
/* 56 clocks (20 penalties) 44 instructions : 4.55 % pairing*/
/*
 * MTH3D_M_vMulMatrixVectorWithoutBufferORG(VectDest, MatA, VectA)
 *
 * Generic matrix-by-vector product, written component by component:
 *   Dest.xK = Col_0.xK * A.xX + Col_1.xK * A.xY + Col_2.xK * A.xZ   (K = X, Y, Z)
 * (VectDest)->xX is assigned before (VectA)->xX is read again for the xY and
 * xZ components, so this variant must not be used with VectDest==VectA;
 * MTH3D_M_vMulMatrixVectorORG copies VectA first in that case.
 * Each argument is expanded several times and must be free of side effects.
 * Expands to a braced block.
 */
#define MTH3D_M_vMulMatrixVectorWithoutBufferORG( VectDest, MatA, VectA) \
{ (VectDest)->xX = MTH_M_xAdd3( \
MTH_M_xMul( (MatA)->stCol_0.xX, (VectA)->xX), \
MTH_M_xMul( (MatA)->stCol_1.xX, (VectA)->xY), \
MTH_M_xMul( (MatA)->stCol_2.xX, (VectA)->xZ)); \
(VectDest)->xY = MTH_M_xAdd3( \
MTH_M_xMul( (MatA)->stCol_0.xY, (VectA)->xX), \
MTH_M_xMul( (MatA)->stCol_1.xY, (VectA)->xY), \
MTH_M_xMul( (MatA)->stCol_2.xY, (VectA)->xZ)); \
(VectDest)->xZ = MTH_M_xAdd3( \
MTH_M_xMul( (MatA)->stCol_0.xZ, (VectA)->xX), \
MTH_M_xMul( (MatA)->stCol_1.xZ, (VectA)->xY), \
MTH_M_xMul( (MatA)->stCol_2.xZ, (VectA)->xZ)); }
#if defined(OPTIMIZED_FOR_U64_ASM)
/* requires no temporary buffer if VectDest==VectA*/
/*
 * MTH3D_M_vMulMatrixVectorWithoutBufferU64ASM: MIPS FPU inline-assembly
 * product of the matrix MatA by the vector VectA, stored in VectDest.
 *
 * Operands: %0 = VectDest, %1 = MatA, %2 = VectA (all in general registers).
 * The three components of VectA are loaded into $f6/$f8/$f10 before any
 * store to VectDest, which is why VectDest may be the same vector as VectA.
 * In the assembler comments Mat[r,c] is the float at byte offset 12*c + 4*r.
 *
 * The asm reads *MatA / *VectA and writes *VectDest through pointers that
 * are only passed as "r" inputs, so "memory" is listed in the clobbers:
 * without it GCC may keep values of those objects cached in registers, or
 * move loads/stores of them, across the asm statement once this function
 * is inlined.
 */
static inline void MTH3D_M_vMulMatrixVectorWithoutBufferU64ASM( struct MTH3D_tdstVector_ *VectDest, struct MTH3D_tdstMatrix_ *MatA, struct MTH3D_tdstVector_ *VectA)
{
 /* GGG RRR OOO U U M M PPP FFFF !! !!
 G G R R O O U U MM MM P P F !! !!
 G RRR O O U U M M M PPP FFF !! !!
 G GG R R O O U U M M P F
 GGG R R OOO UUU M M P F oo oo
 PLEASE DO NOT REMOVE ASM IN LOWER CASE !!!!!*/
asm(" .set noreorder "); /* the instruction schedule below is hand-written: keep the assembler from reordering it */
asm(" # Begin MulMatrixVectorWithoutBufferU64ASM " );
asm(
" lwc1 $f6,0(%2) \n"
" lwc1 $f8,4(%2) \n"
" lwc1 $f10,8(%2) \n"
" lwc1 $f0,0(%1) # f0 <- Mat[0,0] \n"
" lwc1 $f2,12(%1) # f2 <- Mat[0,1] \n"
" mul.s $f0,$f0,$f6 # f0 <- Mat[0,0] x Vect[0] \n"
" lwc1 $f4,24(%1) # f4 <- Mat[0,2] \n"
" mul.s $f2,$f2,$f8 # f2 <- Mat[0,1] x Vect[1] \n"
" lwc1 $f12,4(%1) # f12 <- Mat[1,0] \n"
" mul.s $f4,$f4,$f10 # f4 <- Mat[0,2] x Vect[2] \n"
" lwc1 $f14,16(%1) # f14 <- Mat[1,1] \n"
" mul.s $f12,$f12,$f6 # f12 <- Mat[1,0] x Vect[0] \n"
" add.s $f0,$f0,$f2 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n"
" lwc1 $f16,28(%1) # f16 <- Mat[1,2] \n"
" mul.s $f14,$f14,$f8 # f14 <- Mat[1,1] x Vect[1] \n"
" add.s $f0,$f0,$f4 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n"
" lwc1 $f18,8(%1) # f18 <- Mat[2,0] \n"
" mul.s $f16,$f16,$f10 # f16 <- Mat[1,2] x Vect[2] \n"
" add.s $f12,$f12,$f14 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n"
" lwc1 $f2,20(%1) # f2 <- Mat[2,1] \n"
" mul.s $f18,$f18,$f6 # f18 <- Mat[2,0] x Vect[0] \n"
" add.s $f12,$f12,$f16 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n"
" lwc1 $f4,32(%1) # f4 <- Mat[2,2] \n"
" mul.s $f2,$f2,$f8 # f2 <- Mat[2,1] x Vect[1] \n"
" swc1 $f0,0(%0) # f0 -> Dest[0] \n"
" mul.s $f4,$f4,$f10 # f4 <- Mat[2,2] x Vect[2] \n"
" add.s $f18,$f18,$f2 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n"
" swc1 $f12,4(%0) # f12 -> Dest[1] \n"
" add.s $f18,$f18,$f4 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n"
" swc1 $f18,8(%0) # f18 -> Dest[2] \n"
: : "r" (VectDest), "r" (MatA), "r" (VectA) : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18","memory" );
asm(" # EndOf MulMatrixVectorWithoutBufferU64ASM ");
asm(" .set reorder ");
}
#endif /* OPTIMIZED_FOR_U64_ASM*/
/************************************************************************************************************************/
/* MTH3D_M_vMulMatrixVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
#define MTH3D_M_vMulMatrixVectorASM MTH3D_M_vMulMatrixVectorWithoutBufferASM
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vMulMatrixVectorC MTH3D_M_vMulMatrixVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
#if defined(OPTIMIZED_FOR_U64_ASM)
#define MTH3D_M_vMulMatrixVectorU64ASM MTH3D_M_vMulMatrixVectorWithoutBufferU64ASM
#endif /* OPTIMIZED_FOR_U64_ASM*/
/*
 * MTH3D_M_vMulMatrixVectorORG(VectDest, MatA, VectA)
 *
 * Matrix-by-vector product that is safe when VectDest and VectA are the
 * same pointer: in that case VectA is first copied to a local
 * MTH3D_tdstVector with MTH3D_M_vCopyVector and the copy is used as the
 * source; otherwise MTH3D_M_vMulMatrixVectorWithoutBuffer is called
 * directly.
 *
 * The arguments are pointer expressions; each is expanded more than once,
 * so they must be free of side effects. Every use is parenthesized so that
 * arguments such as "c ? p : q" compare and pass correctly.
 * The macro expands to a braced block (no trailing ';' required).
 */
#define MTH3D_M_vMulMatrixVectorORG( VectDest, MatA, VectA) \
{ if( (VectA)==(VectDest) ) \
  { \
    /* long, prefixed name: must not capture a caller variable */ \
    MTH3D_tdstVector MTH3D_MulMatVecORG_stTmp; \
 \
    MTH3D_M_vCopyVector( &MTH3D_MulMatVecORG_stTmp, (VectA)); \
    MTH3D_M_vMulMatrixVectorWithoutBuffer( (VectDest), (MatA), &MTH3D_MulMatVecORG_stTmp); \
  } \
  else \
  { \
    MTH3D_M_vMulMatrixVectorWithoutBuffer( (VectDest), (MatA), (VectA)); \
  } \
}
/************************************************************************************************************************/
/* MTH3D_M_vInverMatrix*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* avoid transposition, stack, 9 divisions by det, 4 negs*/
/* 131 clocks (38+1 penalties) 98 instructions : 2.04 % pairing*/
/*
 * MTH3D_M_vInverMatrixASM: x87 inline-assembly inverse of MatA, written to
 * MatDest. Nine 2x2 sub-determinants are formed, the determinant is
 * accumulated from three of them, and all nine are multiplied by
 * D = 1/det (a single division).
 *
 * Notation: "cK" is the float of MatA at byte offset 12*c + 4*K' (K' = 0/1/2
 * for X/Y/Z), i.e. presumably stCol_c.xK - confirm against the struct.
 * Upper-case names are raw values, lower-case the same value times D:
 *   P1 = 1Y.2Z-1Z.2Y -> Dest+0     N1 = 0Z.2Y-0Y.2Z -> Dest+4     M1 = 0Y.1Z-0Z.1Y -> Dest+8
 *   P2 = 1Z.2X-1X.2Z -> Dest+12    N2 = 0X.2Z-0Z.2X -> Dest+16    M2 = 0Z.1X-0X.1Z -> Dest+20
 *   P3 = 1X.2Y-1Y.2X -> Dest+24    N3 = 0Y.2X-0X.2Y -> Dest+28    M3 = 0X.1Y-0Y.1X -> Dest+32
 *   det = 0X.P1 + 0Y.P2 + 0Z.P3
 * Stack comments list the x87 stack from st(0) leftmost; all 8 x87
 * registers are in use at the deepest points.
 *
 * MatDest must not be the same matrix as MatA: the first store to MatDest
 * is issued while MatA is still being read.
 * No test for a zero determinant is made.
 * Uses eax and ebx. C4725 is disabled around the function.
 * NOTE(review): C4725 is presumably MSVC's Pentium FDIV-accuracy warning,
 * triggered by the fdivrp below - confirm.
 */
#pragma warning(disable:4725)
INLINE void MTH3D_M_vInverMatrixASM(struct MTH3D_tdstMatrix_ *MatDest, struct MTH3D_tdstMatrix_ *MatA)
{
static MTH_tdxReal ONE=MTH_C_ONE; /* memory operand holding MTH_C_ONE for the fld below*/
__asm
{
mov eax,MatA
mov ebx,MatDest
fld dword ptr [eax+16] /*<====== 1 penalty : AGI stall because of eax load*/
fmul dword ptr [eax+32] /* load A1*/
fld dword ptr [eax+20]
fmul dword ptr [eax+24] /* load A2 */
fld dword ptr [eax+12]
fmul dword ptr [eax+28] /* load A3*/
fld dword ptr [eax+20]
fmul dword ptr [eax+28] /* load B1*/
fld dword ptr [eax+12]
fmul dword ptr [eax+32] /* load B2*/
fld dword ptr [eax+16]
fmul dword ptr [eax+24] /* load B3*/
/* B3 B2 B1 A3 A2 A1*/
fxch st(2) /* B1 B2 B3 A3 A2 A1*/
fsubp st(5),st /* B2 B3 A3 A2 P1*/
fsubp st(3),st /* B3 A3 P2 P1 */
fsubp st(1),st /* P3 P2 P1*/
fld dword ptr [eax+8]
fmul dword ptr [eax+28] /* C1 P3 P2 P1*/
fld st(3)
fmul dword ptr [eax] /* X C1 P3 P2 P1*/
fld st(3)
fmul dword ptr [eax+4] /* Y X C1 P3 P2 P1*/
fld st(3)
fmul dword ptr [eax+8] /* Z Y X C1 P3 P2 P1*/
fld dword ptr [eax+4]
fmul dword ptr [eax+32] /* D1 Z Y X C1 P3 P2 P1*/
fxch st(2)
faddp st(3),st /* Z D1 X+Y C1 P3 P2 P1*/
fld dword ptr [eax]
fmul dword ptr [eax+32] /* C2 Z D1 X+Y C1 P3 P2 P1*/
fxch st(2) /* D1 Z C2 X+Y C1 P3 P2 P1*/
fsubp st(4),st /* Z C2 X+Y N1 P3 P2 P1*/
faddp st(2),st /* C2 X+Y+Z N1 P3 P2 P1*/
fld dword ptr [eax+8]
fmul dword ptr [eax+24] /* D2 C2 X+Y+Z N1 P3 P2 P1*/
fld dword ptr [ONE] /* 1 D2 C2 det=X+Y+Z N1 P3 P2 P1*/
fdivrp st(3),st /* D2 C2 D=1/det N1 P3 P2 P1 <====== unavoidable 38 clocks penalties*/
/* alternative division sequence, kept for reference:
fld dword ptr [ONE] // 1 D2 C2 det=X+Y+Z N1 P3 P2 P1
fxch st(3) // det D2 C2 1 N1 P3 P2 P1
fdivp st(3),st // D2 C2 D=1/det N1 P3 P2 P1 <====== unavoidable 38 clocks penalties
*/
fsubp st(1),st /* N2 D N1 P3 P2 P1*/
fxch st(1) /* D N2 N1 P3 P2 P1*/
fld st(0)
fmulp st(6),st /* D N2 N1 P3 P2 p1*/
fld st(0)
fmulp st(5),st /* D N2 N1 P3 p2 p1*/
fld st(0)
fmulp st(4),st /* D N2 N1 p3 p2 p1*/
fld st(0)
fmulp st(3),st /* D N2 n1 p3 p2 p1*/
fld st(0)
fmulp st(2),st /* D n2 n1 p3 p2 p1*/
fld dword ptr [eax+4]
fmul dword ptr [eax+24] /* C3 D n2 n1 p3 p2 p1*/
fld dword ptr [eax]
fmul dword ptr [eax+28] /* D3 C3 D n2 n1 p3 p2 p1*/
fxch st(7) /* p1 C3 D n2 n1 p3 p2 D3*/
fstp dword ptr [ebx] /* C3 D n2 n1 p3 p2 D3*/
/* first store to MatDest; MatA is still read below*/
fld dword ptr [eax+4]
fmul dword ptr [eax+20] /* E1 C3 D n2 n1 p3 p2 D3*/
fxch st(1) /* C3 E1 D n2 n1 p3 p2 D3*/
fsubrp st(7),st /* E1 D n2 n1 p3 p2 N3*/
/**/
fld dword ptr [eax+8]
fmul dword ptr [eax+16] /* F1 E1 D n2 n1 p3 p2 N3*/
fxch st(6) /* p2 E1 D n2 n1 p3 F1 N3*/
fstp dword ptr [ebx+12] /* E1 D n2 n1 p3 F1 N3*/
/**/
fld dword ptr [eax+8]
fmul dword ptr [eax+12] /* E2 E1 D n2 n1 p3 F1 N3*/
fxch st(1) /* E1 E2 D n2 n1 p3 F1 N3*/
fsubrp st(6),st /* E2 D n2 n1 p3 M1 N3*/
/**/
fld dword ptr [eax]
fmul dword ptr [eax+20] /* F2 E2 D n2 n1 p3 M1 N3*/
fxch st(5) /* p3 E2 D n2 n1 F2 M1 N3*/
fstp dword ptr [ebx+24] /* E2 D n2 n1 F2 M1 N3*/
/**/
fld dword ptr [eax]
fmul dword ptr [eax+16] /* E3 E2 D n2 n1 F2 M1 N3*/
fxch st(1) /* E2 E3 D n2 n1 F2 M1 N3*/
fsubrp st(5),st /* E3 D n2 n1 M2 M1 N3*/
/**/
fld dword ptr [eax+4]
fmul dword ptr [eax+12] /* F3 E3 D n2 n1 M2 M1 N3*/
fxch st(4) /* n1 E3 D n2 F3 M2 M1 N3*/
fstp dword ptr [ebx+4] /* E3 D n2 F3 M2 M1 N3*/
/**/
fxch st(1) /* D E3 n2 F3 M2 M1 N3*/
fld st(0) /* D D E3 n2 F3 M2 M1 N3*/
fmulp st(7),st /* D E3 n2 F3 M2 M1 n3*/
fxch st(1) /* E3 D n2 F3 M2 M1 n3*/
fsubrp st(3),st /* D n2 M3 M2 M1 n3*/
/**/
fld st(0) /* D D n2 M3 M2 M1 n3*/
fmulp st(5),st /* D n2 M3 M2 m1 n3*/
fld st(0) /* D D n2 M3 M2 m1 n3*/
fmulp st(4),st /* D n2 M3 m2 m1 n3*/
/**/
fxch st(1) /* n2 D M3 m2 m1 n3*/
fstp dword ptr [ebx+16] /* D M3 m2 m1 n3*/
fmulp st(1),st /* m3 m2 m1 n3 */
fxch st(3) /* n3 m2 m1 m3*/
/**/
fstp dword ptr [ebx+28] /* m2 m1 m3*/
fstp dword ptr [ebx+20] /* m1 m3*/
fstp dword ptr [ebx+8] /* m3*/
fstp dword ptr [ebx+32] /* x87 stack empty again*/
}
}
#pragma warning(default:4725)
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* avoid transposition, stack, 9 divisions by det, 4 negs*/
/* 214 clocks (81 penalties) 147 instructions : 1.36 % pairing*/
/*
 * MTH3D_M_vInverMatrixC: C version of the matrix inverse.
 *
 * Each entry of MatDest is first filled with a 2x2 sub-determinant of MatA
 * (MTH_M_xMulSubMul); the determinant is accumulated from the first three
 * of them while they are produced, and the whole of MatDest is finally
 * scaled by 1/det with MTH3D_M_vMulScalarMatrix.
 *
 * MatDest must not be the same matrix as MatA: MatDest->stCol_0.xX is
 * written before MatA->stCol_0.xX is read for the determinant.
 * No test for a zero determinant is made here.
 * NOTE(review): MTH_M_xMulSubMul(a,b,c,d) is presumably a*b - c*d - confirm.
 */
INLINE void MTH3D_M_vInverMatrixC(MTH3D_tdstMatrix *MatDest, MTH3D_tdstMatrix *MatA)
{
  register MTH_tdxReal xDet;    /* determinant, accumulated term by term */
  register MTH_tdxReal xInvDet; /* MTH_C_ONE / xDet */

  /* The three entries that also feed the determinant. */
  MatDest->stCol_0.xX = MTH_M_xMulSubMul(MatA->stCol_1.xY, MatA->stCol_2.xZ, MatA->stCol_1.xZ, MatA->stCol_2.xY);
  xDet = MTH_M_xMul(MatA->stCol_0.xX, MatDest->stCol_0.xX);

  MatDest->stCol_1.xX = MTH_M_xMulSubMul(MatA->stCol_1.xZ, MatA->stCol_2.xX, MatA->stCol_1.xX, MatA->stCol_2.xZ);
  xDet = MTH_M_xAdd(MTH_M_xMul(MatA->stCol_0.xY, MatDest->stCol_1.xX), xDet);

  MatDest->stCol_2.xX = MTH_M_xMulSubMul(MatA->stCol_1.xX, MatA->stCol_2.xY, MatA->stCol_1.xY, MatA->stCol_2.xX);
  xDet = MTH_M_xAdd(MTH_M_xMul(MatA->stCol_0.xZ, MatDest->stCol_2.xX), xDet);

  /* The reciprocal is requested early, between two independent entries. */
  MatDest->stCol_0.xY = MTH_M_xMulSubMul(MatA->stCol_0.xZ, MatA->stCol_2.xY, MatA->stCol_0.xY, MatA->stCol_2.xZ);
  xInvDet = MTH_M_xDiv(MTH_C_ONE, xDet);

  /* Remaining five entries. */
  MatDest->stCol_0.xZ = MTH_M_xMulSubMul(MatA->stCol_0.xY, MatA->stCol_1.xZ, MatA->stCol_0.xZ, MatA->stCol_1.xY);
  MatDest->stCol_1.xY = MTH_M_xMulSubMul(MatA->stCol_0.xX, MatA->stCol_2.xZ, MatA->stCol_0.xZ, MatA->stCol_2.xX);
  MatDest->stCol_1.xZ = MTH_M_xMulSubMul(MatA->stCol_0.xZ, MatA->stCol_1.xX, MatA->stCol_0.xX, MatA->stCol_1.xZ);
  MatDest->stCol_2.xY = MTH_M_xMulSubMul(MatA->stCol_0.xY, MatA->stCol_2.xX, MatA->stCol_0.xX, MatA->stCol_2.xY);
  MatDest->stCol_2.xZ = MTH_M_xMulSubMul(MatA->stCol_0.xX, MatA->stCol_1.xY, MatA->stCol_0.xY, MatA->stCol_1.xX);

  /* Scale every entry by 1/det, in place. */
  MTH3D_M_vMulScalarMatrix(MatDest, xInvDet, MatDest);
}
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
/* Could surely be optimized by avoiding many needless recomputations*/
/* Before modifying MTH3D_M_vDivScalarMatrix, it took 540 clocks 120 instructions !?!*/
/*
 * MTH3D_M_vInverMatrixORG(MatDest, MatA)
 *
 * Generic matrix inverse: MTH3D_M_vComMatrixWithoutBuffer(MatA) is
 * transposed with MTH3D_M_vTranspMatrix, and the result is divided by
 * MTH3D_M_xDetMatrix(MatA) into MatDest with MTH3D_M_vDivScalarMatrix.
 * No test for a zero determinant is made here.
 *
 * MatDest and MatA are pointer expressions; MatA is expanded twice, so it
 * must be free of side effects. The locals carry long prefixed names so
 * that an argument such as "&Mat_Tmp" or "&Mat_Com" in the caller is not
 * captured by the macro's own temporaries.
 * The macro expands to a braced block (no trailing ';' required).
 */
#define MTH3D_M_vInverMatrixORG(MatDest, MatA) \
{ \
  MTH3D_tdstMatrix MTH3D_InverORG_stComTransp={0}; \
  MTH3D_tdstMatrix MTH3D_InverORG_stCom; \
  MTH_tdxReal MTH3D_InverORG_xDet; \
 \
  MTH3D_M_vComMatrixWithoutBuffer(&MTH3D_InverORG_stCom, (MatA)); \
  MTH3D_M_vTranspMatrix(&MTH3D_InverORG_stComTransp, &MTH3D_InverORG_stCom ); \
  MTH3D_InverORG_xDet=MTH3D_M_xDetMatrix( (MatA) ); \
  MTH3D_M_vDivScalarMatrix((MatDest), &MTH3D_InverORG_stComTransp, MTH3D_InverORG_xDet ); \
}
/************************************************************************************************************************/
/* MTH3D_M_vTransformVectorWithoutBuffer*/
/************************************************************************************************************************/
/* 39 clocks (1 penalty) 41 instructions : 9.72 % pairing*/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/*
 * MTH3D_M_vTransformVectorWithoutBufferASM: x87 inline-assembly computation
 * of VectDest = MatA x VectA + VectB.
 *
 * With v = VectA, b = VectB (floats at +0/+4/+8) and m = MatA (floats at
 * +0..+32):
 *   Dest[+0] = b[+0] + v[+0].m[+0] + v[+4].m[+12] + v[+8].m[+24]
 *   Dest[+4] = b[+4] + v[+0].m[+4] + v[+4].m[+16] + v[+8].m[+28]
 *   Dest[+8] = b[+8] + v[+0].m[+8] + v[+4].m[+20] + v[+8].m[+32]
 * The three running sums start from the components of VectB, and every
 * load happens before the three final stores, so VectDest may be the same
 * vector as VectA or VectB.
 * Stack comments list the x87 stack from st(0) leftmost: xN/yN/zN are the
 * products of vector component N with the X/Y/Z entry of matrix column N,
 * sX/sY/sZ are the running sums.
 * Uses eax, ebx, ecx and edx.
 */
INLINE void MTH3D_M_vTransformVectorWithoutBufferASM(MTH3D_tdstVector *VectDest,MTH3D_tdstMatrix *MatA,MTH3D_tdstVector *VectA,MTH3D_tdstVector *VectB)
{
__asm
{
mov ecx,VectB
mov edx,VectDest
mov ebx,VectA
mov eax,MatA
fld dword ptr [ecx]
fld dword ptr [ecx+4]
fld dword ptr [ecx+8] /* sZ sY sX (= the three components of VectB)*/
fld dword ptr [ebx]
fmul dword ptr [eax] /* x0 sZ sY sX*/
fld dword ptr [ebx]
fmul dword ptr [eax+4] /* y0 x0 sZ sY sX*/
fld dword ptr [ebx]
fmul dword ptr [eax+8] /* z0 y0 x0 sZ sY sX*/
fxch st(2) /* x0 y0 z0 sZ sY sX*/
faddp st(5),st /* y0 z0 sZ sY sX*/
faddp st(3),st /* z0 sZ sY sX*/
faddp st(1),st /* sZ sY sX*/
fld dword ptr [ebx+4]
fmul dword ptr [eax+12] /* x1 sZ sY sX*/
fld dword ptr [ebx+4]
fmul dword ptr [eax+16] /* y1 x1 sZ sY sX*/
fld dword ptr [ebx+4]
fmul dword ptr [eax+20] /* z1 y1 x1 sZ sY sX*/
fxch st(2) /* x1 y1 z1 sZ sY sX*/
faddp st(5),st /* y1 z1 sZ sY sX*/
faddp st(3),st /* z1 sZ sY sX*/
faddp st(1),st /* sZ sY sX*/
fld dword ptr [ebx+8]
fmul dword ptr [eax+24] /* x2 sZ sY sX*/
fld dword ptr [ebx+8]
fmul dword ptr [eax+28] /* y2 x2 sZ sY sX*/
fld dword ptr [ebx+8]
fmul dword ptr [eax+32] /* z2 y2 x2 sZ sY sX*/
fxch st(2) /* x2 y2 z2 sZ sY sX*/
faddp st(5),st /* y2 z2 sZ sY sX*/
faddp st(3),st /* z2 sZ sY sX*/
faddp st(1),st /* sZ sY sX*/
fxch st(2) /* sX sY sZ*/
fstp dword ptr [edx] /* 1 penalty : unavoidable*/
fstp dword ptr [edx+4]
fstp dword ptr [edx+8] /* x87 stack empty again*/
}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vTransformVectorWithoutBufferC MTH3D_M_vTransformVectorWithoutBufferORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
/* 79 clocks (30 penalties) 54 instructions : 3.70 % pairing*/
/*
 * MTH3D_M_vTransformVectorWithoutBufferORG(VectDest, MatA, VectA, VectB)
 *
 * Generic vector transform in two steps:
 *   1. VectDest <- MatA x VectA   (MTH3D_M_vMulMatrixVectorWithoutBuffer)
 *   2. VectDest <- VectDest + VectB   (MTH3D_M_vAddVector)
 * VectDest is expanded three times, so it must be free of side effects.
 * Expands to a braced block.
 */
#define MTH3D_M_vTransformVectorWithoutBufferORG( VectDest, MatA, VectA, VectB) \
{ \
  MTH3D_M_vMulMatrixVectorWithoutBuffer( VectDest, MatA, VectA ); \
  MTH3D_M_vAddVector( VectDest, VectDest, VectB ); \
}
#if defined(OPTIMIZED_FOR_U64_ASM)
/* requires no temporary buffer if VectDest==VectA*/
/*
 * MTH3D_M_vTransformVectorWithoutBufferU64ASM: MIPS FPU inline-assembly
 * computation of VectDest = MatA x VectA + VectB.
 *
 * Operands: %0 = VectDest, %1 = MatA, %2 = VectA, %3 = VectB (all in
 * general registers). $f6/$f8/$f10 first hold the components of VectA and
 * are reloaded with the components of VectB once the last product using
 * them has been issued. Every load precedes the three stores, which is why
 * VectDest may be the same vector as VectA.
 * In the assembler comments Mat[r,c] is the float at byte offset 12*c + 4*r.
 *
 * The asm reads *MatA / *VectA / *VectB and writes *VectDest through
 * pointers that are only passed as "r" inputs, so "memory" is listed in the
 * clobbers: without it GCC may keep values of those objects cached in
 * registers, or move loads/stores of them, across the asm statement once
 * this function is inlined.
 */
static inline void MTH3D_M_vTransformVectorWithoutBufferU64ASM( struct MTH3D_tdstVector_ *VectDest, struct MTH3D_tdstMatrix_ *MatA, struct MTH3D_tdstVector_ *VectA, struct MTH3D_tdstVector_ *VectB)
{
 /* GGG RRR OOO U U M M PPP FFFF !! !!
 G G R R O O U U MM MM P P F !! !!
 G RRR O O U U M M M PPP FFF !! !!
 G GG R R O O U U M M P F
 GGG R R OOO UUU M M P F oo oo
 PLEASE DO NOT REMOVE ASM IN LOWER CASE !!!!!*/
asm(" .set noreorder "); /* the instruction schedule below is hand-written: keep the assembler from reordering it */
asm(" # Begin TransformVectorWithoutBufferU64ASM ");
asm(
" lwc1 $f6,0(%2) \n"
" lwc1 $f8,4(%2) \n"
" lwc1 $f10,8(%2) \n"
" lwc1 $f0,0(%1) # f0 <- Mat[0,0] \n"
" lwc1 $f2,12(%1) # f2 <- Mat[0,1] \n"
" mul.s $f0,$f0,$f6 # f0 <- Mat[0,0] x Vect[0] \n"
" lwc1 $f4,24(%1) # f4 <- Mat[0,2] \n"
" mul.s $f2,$f2,$f8 # f2 <- Mat[0,1] x Vect[1] \n"
" lwc1 $f12,4(%1) # f12 <- Mat[1,0] \n"
" mul.s $f4,$f4,$f10 # f4 <- Mat[0,2] x Vect[2] \n"
" lwc1 $f14,16(%1) # f14 <- Mat[1,1] \n"
" mul.s $f12,$f12,$f6 # f12 <- Mat[1,0] x Vect[0] \n"
" add.s $f0,$f0,$f2 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n"
" lwc1 $f16,28(%1) # f16 <- Mat[1,2] \n"
" mul.s $f14,$f14,$f8 # f14 <- Mat[1,1] x Vect[1] \n"
" add.s $f0,$f0,$f4 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n"
" lwc1 $f18,8(%1) # f18 <- Mat[2,0] \n"
" mul.s $f16,$f16,$f10 # f16 <- Mat[1,2] x Vect[2] \n"
" add.s $f12,$f12,$f14 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n"
" lwc1 $f2,20(%1) # f2 <- Mat[2,1] \n"
" mul.s $f18,$f18,$f6 # f18 <- Mat[2,0] x Vect[0] \n"
" add.s $f12,$f12,$f16 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n"
" lwc1 $f4,32(%1) # f4 <- Mat[2,2] \n"
" mul.s $f2,$f2,$f8 # f2 <- Mat[2,1] x Vect[1] \n"
" lwc1 $f6,0(%3) # f6 <- VctB[0] \n"
" mul.s $f4,$f4,$f10 # f4 <- Mat[2,2] x Vect[2] \n"
" add.s $f18,$f18,$f2 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n"
" lwc1 $f8,4(%3) # f8 <- VctB[1] \n"
" add.s $f18,$f18,$f4 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n"
" lwc1 $f10,8(%3) # f10 <- VctB[2] \n"
" add.s $f0,$f0,$f6 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] + VctB[0] \n"
" add.s $f12,$f12,$f8 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] + VctB[1] \n"
" swc1 $f0,0(%0) # f0 -> Dest[0] \n"
" add.s $f18,$f18,$f10 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] + VctB[2] \n"
" swc1 $f12,4(%0) # f12 -> Dest[1] \n"
" swc1 $f18,8(%0) # f18 -> Dest[2] \n"
: : "r" (VectDest), "r" (MatA), "r" (VectA) , "r" (VectB) : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18","memory" );
asm(" # EndOf TransformVectorWithoutBufferU64ASM ");
asm(" .set reorder ");
}
#endif /* OPTIMIZED_FOR_U64_ASM*/
#endif /* MTH_ASM_H*/