/* Olivier Couvreur : 3/98 */
#if !defined(MTH_ASM_H)
#define MTH_ASM_H

#include "acp_base.h"

/*
 * Variant naming used in this header:
 *   xxxASM : inline-assembly (x87) version, compiled only when
 *            OPTIMIZED_FOR_PC_FLOATS_WITH_ASM is defined
 *   xxxC   : version selected when OPTIMIZED_FOR_PC_FLOATS is defined
 *   xxxORG : unconditional macro written with the MTH_M_x* scalar macros
 */

/* Force OPTIMIZED_FOR_PC_FLOATS whenever OPTIMIZED_FOR_PC_FLOATS_WITH_ASM is defined. */
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
#if !defined(OPTIMIZED_FOR_PC_FLOATS)
#define OPTIMIZED_FOR_PC_FLOATS
#endif
#endif

/************************************************************************************************************************/
/* MTH3D_M_vAddVector*/
/************************************************************************************************************************/
/*
 * MTH3D_M_vAddVectorASM : VectDest = VectA + VectB, component by component.
 *
 *   VectDest : receives the three sums (stored at byte offsets 0, 4 and 8)
 *   VectA    : first operand  (read at byte offsets 0, 4 and 8)
 *   VectB    : second operand (read at byte offsets 0, 4 and 8)
 *
 * Components are accessed as 32-bit values ("dword ptr"); offsets 0/4/8 are
 * the xX/xY/xZ members. Every load happens before the first store, so
 * VectDest may be the same object as VectA or VectB.
 * Registers used: eax, ebx, ecx.
 *
 * Author's timing estimate:
 * register load + 3 * (fld fadd) + 3 fstp = 2 + 3 * (1+1) + 3 * (2) = 14 clocks, 13 instructions
 */
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
INLINE void MTH3D_M_vAddVectorASM(struct MTH3D_tdstVector_ *VectDest,struct MTH3D_tdstVector_ *VectA, struct MTH3D_tdstVector_ *VectB)
{
  __asm {
    mov eax,VectA
    mov ebx,VectB
    mov ecx,VectDest
    fld dword ptr [eax]       /* push (VectA)->xX */
    fadd dword ptr [ebx]      /* + (VectB)->xX */
    fld dword ptr [eax+4]     /* push (VectA)->xY */
    fadd dword ptr [ebx+4]    /* + (VectB)->xY */
    fld dword ptr [eax+8]     /* push (VectA)->xZ */
    fadd dword ptr [ebx+8]    /* + (VectB)->xZ ; stack is now Z Y X */
    fxch st(2)                /* bring X to the top : X Y Z */
    fstp dword ptr [ecx]      /* (VectDest)->xX */
    fstp dword ptr [ecx+4]    /* (VectDest)->xY */
    fstp dword ptr [ecx+8]    /* (VectDest)->xZ */
  }
}
#endif /* #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)*/

/* The PC-float C variant is simply the ORG macro. */
#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vAddVectorC MTH3D_M_vAddVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/*
 * MTH3D_M_vAddVectorORG : VectDest = VectA + VectB using MTH_M_xAdd on each
 * component. Expands to a brace-enclosed block; each argument is evaluated
 * three times.
 *
 * Author's timing estimate:
 * 3 * (fld fadd fstp) = 3 * (1+1+2+3pen) = 21 clocks, 9 instructions
 */
#define MTH3D_M_vAddVectorORG( VectDest, VectA, VectB) \
{ (VectDest)->xX = MTH_M_xAdd((VectA)->xX, (VectB)->xX); \
  (VectDest)->xY = MTH_M_xAdd((VectA)->xY, (VectB)->xY); \
  (VectDest)->xZ = MTH_M_xAdd((VectA)->xZ, (VectB)->xZ); }

/************************************************************************************************************************/
/* MTH3D_M_vSubVector*/
/************************************************************************************************************************/
/*
 * MTH3D_M_vSubVectorASM : VectDest = VectA - VectB, component by component.
 *
 *   VectDest : receives the three differences (stored at byte offsets 0, 4, 8)
 *   VectA    : minuend    (read at byte offsets 0, 4, 8)
 *   VectB    : subtrahend (read at byte offsets 0, 4, 8)
 *
 * Every load happens before the first store, so VectDest may be the same
 * object as VectA or VectB. Registers used: eax, ebx, ecx.
 *
 * Author's timing estimate:
 * register load + 3 * (fld fsub) + 3 fstp = 2 + 3 * (1+1) + 3 * (2) = 14 clocks, 13 instructions
 */
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
INLINE void MTH3D_M_vSubVectorASM(struct MTH3D_tdstVector_ *VectDest,struct MTH3D_tdstVector_ *VectA, struct MTH3D_tdstVector_ *VectB)
{
  __asm {
    mov eax,VectA
    mov ebx,VectB
    mov ecx,VectDest
    fld dword ptr [eax]       /* push (VectA)->xX */
    fsub dword ptr [ebx]      /* - (VectB)->xX */
    fld dword ptr [eax+4]     /* push (VectA)->xY */
    fsub dword ptr [ebx+4]    /* - (VectB)->xY */
    fld dword ptr [eax+8]     /* push (VectA)->xZ */
    fsub dword ptr [ebx+8]    /* - (VectB)->xZ ; stack is now Z Y X */
    fxch st(2)                /* bring X to the top : X Y Z */
    fstp dword ptr [ecx]      /* (VectDest)->xX */
    fstp dword ptr [ecx+4]    /* (VectDest)->xY */
    fstp dword ptr [ecx+8]    /* (VectDest)->xZ */
  }
}
#endif /* #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)*/

/* The PC-float C variant is simply the ORG macro. */
#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vSubVectorC MTH3D_M_vSubVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/*
 * MTH3D_M_vSubVectorORG : VectDest = VectA - VectB using MTH_M_xSub on each
 * component. Expands to a brace-enclosed block.
 *
 * Author's timing estimate:
 * 3 * (fld fsub fstp) = 3 * (1+1+2+3pen) = 21 clocks, 9 instructions
 */
#define MTH3D_M_vSubVectorORG( VectDest, VectA, VectB) \
{ (VectDest)->xX = MTH_M_xSub((VectA)->xX, (VectB)->xX); \
  (VectDest)->xY = MTH_M_xSub((VectA)->xY, (VectB)->xY); \
  (VectDest)->xZ = MTH_M_xSub((VectA)->xZ, (VectB)->xZ); }

/************************************************************************************************************************/
/* MTH3D_M_vNegVector*/
/************************************************************************************************************************/
/* There is no dedicated assembly version : the ASM name maps to the C macro. */
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
#define MTH3D_M_vNegVectorASM MTH3D_M_vNegVectorC
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/

#if defined(OPTIMIZED_FOR_PC_FLOATS)
/*
 * MTH3D_M_vNegVectorC : VectDest = -VectA without using the FPU.
 * Each component's storage is reinterpreted as a long and XORed with
 * 0x80000000, i.e. only the top bit of the 32-bit pattern is toggled.
 *
 * NOTE(review): this relies on the components being 32-bit floats whose sign
 * is the top bit and on "long" being 32 bits wide; it also reads a float
 * through a long pointer. Confirm before building for another target.
 *
 * Author's timing estimate: 10 pairable instructions => 5 clocks
 */
#define MTH3D_M_vNegVectorC( VectDest, VectA) \
{ long register NegMask=0x80000000; \
  *((long*) &((VectDest)->xX)) = *((long*) &((VectA)->xX )) ^ NegMask; \
  *((long*) &((VectDest)->xY)) = *((long*) &((VectA)->xY )) ^ NegMask; \
  *((long*) &((VectDest)->xZ)) = *((long*) &((VectA)->xZ )) ^ NegMask; \
}
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/*
 * MTH3D_M_vNegVectorORG : VectDest = -VectA using MTH_M_xNeg on each component.
 *
 * Author's timing estimate:
 * 3 * ( fld fchs fstp) = 3 * (1 + 1 + 1+1pen) = 12 clocks, 9 instructions
 */
#define MTH3D_M_vNegVectorORG( VectDest, VectA) \
{ (VectDest)->xX = MTH_M_xNeg( (VectA)->xX ); \
  (VectDest)->xY = MTH_M_xNeg( (VectA)->xY ); \
  (VectDest)->xZ = MTH_M_xNeg( (VectA)->xZ ); }

/************************************************************************************************************************/
/* MTH3D_M_vAdd3ScalarVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/*
 * MTH3D_M_vAdd3ScalarVectorASM : VectDest = VectA + (x, y, z).
 *
 *   VectDest : receives the result (stored at byte offsets 0, 4, 8)
 *   VectA    : vector operand      (read at byte offsets 0, 4, 8)
 *   x, y, z  : scalars added to the first, second and third component;
 *              they are read as 32-bit values ("dword ptr")
 *
 * Every load happens before the first store, so VectDest may be VectA.
 * Registers used: eax, ecx.
 *
 * Author's timing estimate: 14 clocks
 */
INLINE void MTH3D_M_vAdd3ScalarVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA,MTH_tdxReal x,MTH_tdxReal y,MTH_tdxReal z)
{
  __asm {
    mov eax,VectA
    mov ecx,VectDest
    fld dword ptr [eax]
    fadd dword ptr [x]
    fld dword ptr [eax+4]
    fadd dword ptr [y]
    fld dword ptr [eax+8]
    fadd dword ptr [z]
    fxch st(2)                /* reorder so the first component is stored first */
    fstp dword ptr [ecx]
    fstp dword ptr [ecx+4]
    fstp dword ptr [ecx+8]
  }
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/

/* The PC-float C variant is simply the ORG macro. */
#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vAdd3ScalarVectorC MTH3D_M_vAdd3ScalarVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/*
 * MTH3D_M_vAdd3ScalarVectorORG : VectDest = VectA + (x, y, z) using MTH_M_xAdd.
 *
 * Author's timing estimate: 21 clocks
 */
#define MTH3D_M_vAdd3ScalarVectorORG( VectDest, VectA, x, y, z) \
{ (VectDest)->xX = MTH_M_xAdd( (VectA)->xX, (x) ); \
  (VectDest)->xY = MTH_M_xAdd( (VectA)->xY, (y) ); \
  (VectDest)->xZ = MTH_M_xAdd( (VectA)->xZ, (z) ); }

/************************************************************************************************************************/
/* MTH3D_M_vMulScalarVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/*
 * MTH3D_M_vMulScalarVectorASM : VectDest = a * VectA.
 *
 *   VectDest : receives the result (stored at byte offsets 0, 4, 8)
 *   a        : scalar factor, read as a 32-bit value ("dword ptr")
 *   VectA    : vector operand (read at byte offsets 0, 4, 8)
 *
 * Every load happens before the first store, so VectDest may be VectA.
 * Registers used: eax, ebx.
 *
 * Author's timing estimate: 14 clocks
 */
INLINE void MTH3D_M_vMulScalarVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal a,MTH3D_tdstVector *VectA)
{
  __asm {
    mov eax,VectA
    mov ebx,VectDest
    fld dword ptr [eax]
    fmul dword ptr [a]
    fld dword ptr [eax+4]
    fmul dword ptr [a]
    fld dword ptr [eax+8]
    fmul dword ptr [a]
    fxch st(2)                /* reorder so the first component is stored first */
    fstp dword ptr [ebx]
    fstp dword ptr [ebx+4]
    fstp dword ptr [ebx+8]
  }
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/

#if defined(OPTIMIZED_FOR_PC_FLOATS)
/*
 * MTH3D_M_vMulScalarVectorC : VectDest = a * VectA.
 * The scalar expression (a) is evaluated once into a local temporary and that
 * temporary is reused for the three MTH_M_xMul products.
 */
#define MTH3D_M_vMulScalarVectorC( VectDest, a, VectA) \
{ register MTH_tdxReal xTempMTH3D_M_vMulScalarVectorC=(a); \
  (VectDest)->xX = MTH_M_xMul(xTempMTH3D_M_vMulScalarVectorC, (VectA)->xX); \
  (VectDest)->xY = MTH_M_xMul(xTempMTH3D_M_vMulScalarVectorC, (VectA)->xY); \
  (VectDest)->xZ = MTH_M_xMul(xTempMTH3D_M_vMulScalarVectorC, (VectA)->xZ); }
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/*
 * MTH3D_M_vMulScalarVectorORG : VectDest = a * VectA.
 * Unlike the C variant, (a) appears in all three MTH_M_xMul calls and is
 * therefore evaluated three times.
 */
#define MTH3D_M_vMulScalarVectorORG( VectDest, a, VectA) \
{ (VectDest)->xX = MTH_M_xMul((a), (VectA)->xX); \
  (VectDest)->xY = MTH_M_xMul((a), (VectA)->xY); \
  (VectDest)->xZ = MTH_M_xMul((a), (VectA)->xZ); }

/************************************************************************************************************************/
/* MTH3D_M_vDivScalarVector*/
/************************************************************************************************************************/
/* There is no dedicated assembly version : the ASM name maps to the C macro. */
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
#define MTH3D_M_vDivScalarVectorASM MTH3D_M_vDivScalarVectorC
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/

#if defined(OPTIMIZED_FOR_PC_FLOATS)
/*
 * MTH3D_M_vDivScalarVectorC : VectDest = VectA / a with only one division.
 * The reciprocal MTH_M_xDiv(MTH_C_ONE, a) is computed once and handed to
 * MTH3D_M_vMulScalarVector (defined outside this section).
 *
 * NOTE(review): multiplying by a reciprocal may round differently from the
 * three direct divisions performed by the ORG macro.
 */
#define MTH3D_M_vDivScalarVectorC( VectDest, VectA, a) \
{ register MTH_tdxReal xTempMTH3D_M_vDivScalarVectorC=MTH_M_xDiv(MTH_C_ONE, (a)); \
  MTH3D_M_vMulScalarVector( VectDest ,xTempMTH3D_M_vDivScalarVectorC, VectA); }
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/*
 * MTH3D_M_vDivScalarVectorORG : VectDest = VectA / a, dividing each component
 * with MTH_M_xDiv. (a) is evaluated three times.
 */
#define MTH3D_M_vDivScalarVectorORG( VectDest, VectA, a) \
{ (VectDest)->xX = MTH_M_xDiv((VectA)->xX, (a)); \
  (VectDest)->xY = MTH_M_xDiv((VectA)->xY, (a)); \
  (VectDest)->xZ = MTH_M_xDiv((VectA)->xZ, (a)); }
/************************************************************************************************************************/
/* MTH3D_M_vScaleVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/*
 * MTH3D_M_vScaleVectorASM : VectDest = VectA * VectB, component by component.
 *
 *   VectDest : receives the three products (stored at byte offsets 0, 4, 8)
 *   VectA    : first operand  (read at byte offsets 0, 4, 8)
 *   VectB    : second operand (read at byte offsets 0, 4, 8)
 *
 * Every load happens before the first store, so VectDest may be the same
 * object as VectA or VectB. Registers used: eax, ebx, ecx.
 *
 * Author's timing estimate: 14 clocks
 */
INLINE void MTH3D_M_vScaleVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA,MTH3D_tdstVector *VectB)
{
  __asm {
    mov eax,VectA
    mov ebx,VectB
    mov ecx,VectDest
    fld dword ptr [eax]
    fmul dword ptr [ebx]
    fld dword ptr [eax+4]
    fmul dword ptr [ebx+4]
    fld dword ptr [eax+8]
    fmul dword ptr [ebx+8]
    fxch st(2)                /* reorder so the first component is stored first */
    fstp dword ptr [ecx]
    fstp dword ptr [ecx+4]
    fstp dword ptr [ecx+8]
  }
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/

/* The PC-float C variant is simply the ORG macro. */
#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vScaleVectorC MTH3D_M_vScaleVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/*
 * MTH3D_M_vScaleVectorORG : VectDest = VectA * VectB component by component,
 * using MTH_M_xMul.
 *
 * Author's timing estimate: 21 clocks
 */
#define MTH3D_M_vScaleVectorORG( VectDest, VectA, VectB ) \
{ (VectDest)->xX = MTH_M_xMul( (VectA)->xX, (VectB)->xX); \
  (VectDest)->xY = MTH_M_xMul( (VectA)->xY, (VectB)->xY); \
  (VectDest)->xZ = MTH_M_xMul( (VectA)->xZ, (VectB)->xZ); }

/************************************************************************************************************************/
/* MTH3D_M_vMulAddVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/*
 * MTH3D_M_vMulAddVectorASM : VectDest = x * VectA + VectB  (D = xA + B).
 *
 *   VectDest : receives the result (stored at byte offsets 0, 4, 8)
 *   x        : scalar factor, read as a 32-bit value ("dword ptr")
 *   VectA    : vector that is scaled by x
 *   VectB    : vector that is added to the scaled VectA
 *
 * Every load happens before the first store, so VectDest may be the same
 * object as VectA or VectB. Registers used: eax, ebx, ecx.
 * Stack comments below list st(0) first.
 *
 * Author's timing estimate: 17 clocks
 */
INLINE void MTH3D_M_vMulAddVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal x,MTH3D_tdstVector *VectA,MTH3D_tdstVector *VectB)
{
  __asm {
    mov eax,VectA
    mov ebx,VectB
    mov ecx,VectDest
    fld dword ptr [eax]
    fmul dword ptr [x]        /* xAx */
    fld dword ptr [eax+4]
    fmul dword ptr [x]        /* xAy xAx */
    fxch st(1)                /* xAx xAy */
    fld dword ptr [eax+8]
    fmul dword ptr [x]        /* xAz xAx xAy */
    fxch st(1)                /* xAx xAz xAy */
    fadd dword ptr [ebx]      /* Dx xAz xAy */
    fxch st(2)                /* xAy xAz Dx */
    fadd dword ptr [ebx+4]    /* Dy xAz Dx */
    fxch st(1)                /* xAz Dy Dx */
    fadd dword ptr [ebx+8]    /* Dz Dy Dx */
    fxch st(2)                /* Dx Dy Dz */
    fstp dword ptr [ecx] /* 1 pen*/
    fstp dword ptr [ecx+4]
    fstp dword ptr [ecx+8]
  }
}
#endif /*
OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/ #if defined(OPTIMIZED_FOR_PC_FLOATS) #define MTH3D_M_vMulAddVectorC MTH3D_M_vMulAddVectorORG #endif /* OPTIMIZED_FOR_PC_FLOATS*/ /* 31 clocks*/ #define MTH3D_M_vMulAddVectorORG( VectDest, x, VectA, VectB) \ { (VectDest)->xX = MTH_M_xAdd(MTH_M_xMul((x),(VectA)->xX), (VectB)->xX); \ (VectDest)->xY = MTH_M_xAdd(MTH_M_xMul((x),(VectA)->xY), (VectB)->xY); \ (VectDest)->xZ = MTH_M_xAdd(MTH_M_xMul((x),(VectA)->xZ), (VectB)->xZ); } /************************************************************************************************************************/ /* MTH3D_M_vMul3AddVector*/ /************************************************************************************************************************/ #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM) /* 32 clocks (2 penalties) : D=xA+yB+zC*/ INLINE void MTH3D_M_vMul3AddVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal x,MTH3D_tdstVector *VectA,MTH_tdxReal y,MTH3D_tdstVector *VectB,MTH_tdxReal z,MTH3D_tdstVector *VectC) { __asm { mov eax,VectA mov ebx,VectB mov ecx,VectC mov edx,VectDest fld dword ptr [eax] fmul dword ptr [x] fld dword ptr [eax+4] fmul dword ptr [x] fld dword ptr [eax+8] fmul dword ptr [x] fxch st(2) fld dword ptr [ebx] fmul dword ptr [y] fld dword ptr [ebx+4] fmul dword ptr [y] fld dword ptr [ebx+8] fmul dword ptr [y] fxch st(2) faddp st(3),st faddp st(3),st fld dword ptr [ecx] fmul dword ptr [z] fxch st(1) faddp st(4),st fld dword ptr [ecx+4] fmul dword ptr [z] fld dword ptr [ecx+8] fmul dword ptr [z] fxch st(2) faddp st(3),st faddp st(3),st faddp st(3),st /* 1 pen*/ fstp dword ptr [edx] /* 1 pen*/ fstp dword ptr [edx+4] fstp dword ptr [edx+8] } } #endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/ #if defined(OPTIMIZED_FOR_PC_FLOATS) #define MTH3D_M_vMul3AddVectorC MTH3D_M_vMul3AddVectorORG #endif /* OPTIMIZED_FOR_PC_FLOATS*/ /* au moins 42 clocks*/ #define MTH3D_M_vMul3AddVectorORG( VectDest, x, VectA, y, VectB, z, VectC) \ { (VectDest)->xX = 
MTH_M_xMulAddMulAddMul((x),(VectA)->xX), (y),(VectB)->xX), (z),(VectC)->xX)); \ (VectDest)->xY = MTH_M_xMulAddMulAddMul((x),(VectA)->xY), (y),(VectB)->xY), (z),(VectC)->xY)); \ (VectDest)->xZ = MTH_M_xMulAddMulAddMul((x),(VectA)->xZ), (y),(VectB)->xZ), (z),(VectC)->xZ)); } /************************************************************************************************************************/ /* MTH3D_M_vMul4AddVector*/ /************************************************************************************************************************/ #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM) /* 40 clocks (0 penalties) : E=xA+yB+zC+wD*/ INLINE void MTH3D_M_vMul4AddVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal x,MTH3D_tdstVector *VectA,MTH_tdxReal y,MTH3D_tdstVector *VectB,MTH_tdxReal z,MTH3D_tdstVector *VectC,MTH_tdxReal w,MTH3D_tdstVector *VectD) { __asm { mov eax,VectA mov ebx,VectB mov ecx,VectC mov edx,VectD fld dword ptr [eax] fmul dword ptr [x] fld dword ptr [eax+4] fmul dword ptr [x] fld dword ptr [eax+8] fmul dword ptr [x] fxch st(2) fld dword ptr [ebx] fmul dword ptr [y] fld dword ptr [ebx+4] fmul dword ptr [y] fld dword ptr [ebx+8] fmul dword ptr [y] fxch st(2) faddp st(3),st faddp st(3),st fld dword ptr [ecx] fmul dword ptr [z] fxch st(1) faddp st(4),st fld dword ptr [ecx+4] fmul dword ptr [z] fld dword ptr [ecx+8] fmul dword ptr [z] fxch st(2) faddp st(3),st faddp st(3),st fld dword ptr [edx] fmul dword ptr [w] fxch st(1) faddp st(4),st fld dword ptr [edx+4] fmul dword ptr [w] fld dword ptr [edx+8] fmul dword ptr [w] fxch st(2) faddp st(3),st faddp st(3),st mov edx,VectDest faddp st(3),st fstp dword ptr [edx] fstp dword ptr [edx+4] fstp dword ptr [edx+8] } } #endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/ #if defined(OPTIMIZED_FOR_PC_FLOATS) #define MTH3D_M_vMul4AddVectorC MTH3D_M_vMul4AddVectorORG #endif /* OPTIMIZED_FOR_PC_FLOATS*/ /* au moins 56 clocks*/ #define MTH3D_M_vMul4AddVectorORG( VectDest, x, VectA, y, VectB, z, VectC, w, VectD) \ { (VectDest)->xX 
= MTH_M_xAdd(MTH_M_xMulAddMulAddMul((x),(VectA)->xX), (y),(VectB)->xX), (z),(VectC)->xX)),MTH_M_xMul((w),(VectD)->xX)); \ (VectDest)->xY = MTH_M_xAdd(MTH_M_xMulAddMulAddMul((x),(VectA)->xY), (y),(VectB)->xY), (z),(VectC)->xY)),MTH_M_xMul((w),(VectD)->xY)); \ (VectDest)->xZ = MTH_M_xAdd(MTH_M_xMulAddMulAddMul((x),(VectA)->xZ), (y),(VectB)->xZ), (z),(VectC)->xZ)),MTH_M_xMul((w),(VectD)->xZ)); } /************************************************************************************************************************/ /* MTH3D_M_vLinearInterpolVector*/ /************************************************************************************************************************/ #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM) /* 21 clocks*/ INLINE void MTH3D_M_vLinearInterpolVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA, MTH3D_tdstVector *VectB, MTH_tdxReal t) { /* Cx=Ax+t(Bx-Ax)=Ax+tDx*/ /* Cy=Ay+t(By-Ay)=Ay+tDy*/ /* Cz=Az+t(Bz-Az)=Az+tDz*/ __asm { mov ebx,VectB mov eax,VectA mov ecx,VectDest fld dword ptr [ebx] /* Bx*/ fsub dword ptr [eax] /* Dx*/ fld dword ptr [ebx+4] /* By Dx*/ fsub dword ptr [eax+4] /* Dy Dx*/ fld dword ptr [ebx+8] /* Bz Dy Dx*/ fsub dword ptr [eax+8] /* Dz Dy Dx*/ fxch st(2) /* Dx Dy Dz*/ fmul dword ptr [t] /* tDx Dy Dz*/ fld dword ptr [eax+4] /* Ay tDx Dy Dz*/ fxch st(2) /* Dy tDx Ay Dz*/ fmul dword ptr [t] /* tDy tDx Ay Dz*/ fld dword ptr [eax] /* Ax tDy tDx Ay Dz*/ fxch st(4) /* Dz tDy tDx Ay Ax*/ fmul dword ptr [t] /* tDz tDy tDx Ay Ax*/ fxch st(2) /* tDx tDy tDz Ay Ax*/ faddp st(4),st /* tDy tDz Ay Cx*/ faddp st(2),st /* tDz Cy Cx*/ fld dword ptr [eax+8] /* Az tDz Cy Cx */ faddp st(1),st /* Cz Cy Cx*/ fxch st(2) /* Cx Cy Cz*/ fstp dword ptr [ecx] /* Cy Cz*/ fstp dword ptr [ecx+4] /* Cz*/ fstp dword ptr [ecx+8] } } #endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/ #if defined(OPTIMIZED_FOR_PC_FLOATS) #define MTH3D_M_vLinearInterpolVectorC MTH3D_M_vLinearInterpolVectorORG #endif /* OPTIMIZED_FOR_PC_FLOATS*/ /* 39 clocks */ #define 
MTH3D_M_vLinearInterpolVectorORG( VectDest, VectA, VectB, t ) \ { (VectDest)->xX = MTH_M_xLinearInterpol( (VectA)->xX, (VectB)->xX, (t) ); \ (VectDest)->xY = MTH_M_xLinearInterpol( (VectA)->xY, (VectB)->xY, (t) ); \ (VectDest)->xZ = MTH_M_xLinearInterpol( (VectA)->xZ, (VectB)->xZ, (t) ); \ } /************************************************************************************************************************/ /* MTH3D_M_vLinearScaleVector*/ /************************************************************************************************************************/ #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM) /* 21 clocks*/ INLINE void MTH3D_M_vLinearScaleVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA, MTH3D_tdstVector *VectB, MTH3D_tdstVector *VectC) { /* x=Ax+Cx.(Bx-Ax)=Ax+Cx.Dx*/ /* y=Ay+Cy.(By-Ay)=Ay+Cy.Dy*/ /* z=Az+Cz.(Bz-Az)=Az+Cz.Dz*/ __asm { mov ebx,VectB mov eax,VectA mov ecx,VectDest mov edx,VectC fld dword ptr [ebx] /* Bx*/ fsub dword ptr [eax] /* Dx*/ fld dword ptr [ebx+4] /* By Dx*/ fsub dword ptr [eax+4] /* Dy Dx*/ fld dword ptr [ebx+8] /* Bz Dy Dx*/ fsub dword ptr [eax+8] /* Dz Dy Dx*/ fxch st(2) /* Dx Dy Dz*/ fmul dword ptr [edx] /* CxDx Dy Dz*/ fld dword ptr [eax+4] /* Ay CxDx Dy Dz*/ fxch st(2) /* Dy CxDx Ay Dz*/ fmul dword ptr [edx+4] /* CyDy CxDx Ay Dz*/ fld dword ptr [eax] /* Ax CyDy CxDx Ay Dz*/ fxch st(4) /* Dz CyDy CxDx Ay Ax*/ fmul dword ptr [edx+8] /* CzDz CyDy CxDx Ay Ax*/ fxch st(2) /* CxDx CyDy CzDz Ay Ax*/ faddp st(4),st /* CyDy CzDz Ay x*/ faddp st(2),st /* CzDz y x*/ fld dword ptr [eax+8] /* Az CzDz y x */ faddp st(1),st /* z y x*/ fxch st(2) /* x y z*/ fstp dword ptr [ecx] /* y z*/ fstp dword ptr [ecx+4] /* z*/ fstp dword ptr [ecx+8] } } #endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/ #if defined(OPTIMIZED_FOR_PC_FLOATS) #define MTH3D_M_vLinearScaleVectorC MTH3D_M_vLinearScaleVectorORG #endif /* OPTIMIZED_FOR_PC_FLOATS*/ /* 39 clocks */ #define MTH3D_M_vLinearScaleVectorORG( VectDest, VectA, VectB, VectC ) \ { 
(VectDest)->xX = MTH_M_xLinearInterpol( (VectA)->xX, (VectB)->xX, (VectC)->xX ); \ (VectDest)->xY = MTH_M_xLinearInterpol( (VectA)->xY, (VectB)->xY, (VectC)->xY ); \ (VectDest)->xZ = MTH_M_xLinearInterpol( (VectA)->xZ, (VectB)->xZ, (VectC)->xZ ); \ } /************************************************************************************************************************/ /* MTH3D_M_xDotProductVector*/ /************************************************************************************************************************/ #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM) /* 12 clocks (2 penalties) 11 instructions*/ #pragma warning(disable:4035) INLINE MTH_tdxReal MTH3D_M_xDotProductVectorASM(struct MTH3D_tdstVector_ *VectA,struct MTH3D_tdstVector_ *VectB) { register MTH_tdxReal xDot; __asm { mov eax,VectA mov ebx,VectB fld dword ptr [eax] fmul dword ptr [ebx] fld dword ptr [eax+4] fmul dword ptr [ebx+4] fld dword ptr [eax+8] fmul dword ptr [ebx+8] fxch st(1) faddp st(2),st faddp st(1),st /* 2 unavoidable penalties*/ fstp [xDot] } return(xDot); } #pragma warning(default:4035) #endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/ #if defined(OPTIMIZED_FOR_PC_FLOATS) #define MTH3D_M_xDotProductVectorC MTH3D_M_xDotProductVectorORG #endif /* OPTIMIZED_FOR_PC_FLOATS*/ /* 15 clocks (5 penalties) 12 instructions*/ #define MTH3D_M_xDotProductVectorORG( VectA, VectB) \ MTH_M_xAdd( \ MTH_M_xAdd( \ MTH_M_xMul((VectA)->xX, (VectB)->xX), \ MTH_M_xMul((VectA)->xY, (VectB)->xY) \ ), \ MTH_M_xMul((VectA)->xZ, (VectB)->xZ) ) /************************************************************************************************************************/ /* MTH3D_M_vCrossProductVectorWithoutBuffer*/ /************************************************************************************************************************/ #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM) /* requires no temp buffer if VectA or VectB == VectDest*/ /* 24 clocks (1 penalty) 23 instructions : 8.70 % pairing*/ INLINE void 
MTH3D_M_vCrossProductVectorWithoutBufferASM(struct MTH3D_tdstVector_ *VectDest,struct MTH3D_tdstVector_ *VectA,struct MTH3D_tdstVector_ *VectB) { __asm { mov eax,VectA mov ebx,VectB mov edx,VectDest fld dword ptr [eax+4] fmul dword ptr [ebx+8] fld dword ptr [eax+8] fmul dword ptr [ebx] fld dword ptr [eax] fmul dword ptr [ebx+4] fld dword ptr [eax+8] fmul dword ptr [ebx+4] fld dword ptr [eax] fmul dword ptr [ebx+8] fld dword ptr [eax+4] fmul dword ptr [ebx] fxch st(2) fsubp st(5),st fsubp st(3),st fsubp st(1),st fxch st(2) fstp dword ptr [edx] /* 1 penalty here : unavoidable !*/ fstp dword ptr [edx+4] fstp dword ptr [edx+8] } } #endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/ #if defined(OPTIMIZED_FOR_PC_FLOATS) #define MTH3D_M_vCrossProductVectorWithoutBufferC MTH3D_M_vCrossProductVectorWithoutBufferORG #endif /* OPTIMIZED_FOR_PC_FLOATS*/ /* 42 clocks (18 penalties) 25 instructions : 8.00 % pairing*/ #define MTH3D_M_vCrossProductVectorWithoutBufferORG(VectDest, VectA, VectB) \ { (VectDest)->xX=MTH_M_xMulSubMul((VectA)->xY,(VectB)->xZ,(VectA)->xZ,(VectB)->xY); \ (VectDest)->xY=MTH_M_xMulSubMul((VectA)->xZ,(VectB)->xX,(VectA)->xX,(VectB)->xZ); \ (VectDest)->xZ=MTH_M_xMulSubMul((VectA)->xX,(VectB)->xY,(VectA)->xY,(VectB)->xX); } /************************************************************************************************************************/ /* MTH3D_M_vCrossProductVector*/ /************************************************************************************************************************/ #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM) #define MTH3D_M_vCrossProductVectorASM MTH3D_M_vCrossProductVectorWithoutBufferASM #endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/ #if defined(OPTIMIZED_FOR_PC_FLOATS) #define MTH3D_M_vCrossProductVectorC MTH3D_M_vCrossProductVectorWithoutBufferC #endif /* OPTIMIZED_FOR_PC_FLOATS*/ #define MTH3D_M_vCrossProductVectorORG(VectDest, VectA, VectB) \ { if( (VectDest==VectA) || (VectDest==VectB) ) \ { \ MTH3D_tdstVector VectTmp; \ 
MTH3D_M_vCrossProductVectorWithoutBuffer(&VectTmp, VectA, VectB); \ MTH3D_M_vCopyVector(VectDest, &VectTmp); \ } \ else \ { \ MTH3D_M_vCrossProductVectorWithoutBuffer(VectDest, VectA, VectB); \ } \ } /************************************************************************************************************************/ /* MTH3D_M_vMulMatrixMatrixWithoutBuffer*/ /************************************************************************************************************************/ #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM) /* requires a buffer only if MatDest==A*/ /* 91 clocks (0 penalty) 95 instructions : 4.21 % pairing*/ /* tricks remove penalties*/ INLINE void MTH3D_M_vMulMatrixMatrixWithoutBufferASM(struct MTH3D_tdstMatrix_ *MatDest,struct MTH3D_tdstMatrix_ *MatA,struct MTH3D_tdstMatrix_ *MatB) { __asm { mov ebx,MatB mov eax,MatA mov ecx,MatDest /**/ fld dword ptr [ebx] fmul dword ptr [eax] fld dword ptr [ebx] fmul dword ptr [eax+4] fld dword ptr [ebx] fmul dword ptr [eax+8] fld dword ptr [ebx+4] fmul dword ptr [eax+12] fld dword ptr [ebx+4] fmul dword ptr [eax+16] fld dword ptr [ebx+4] fmul dword ptr [eax+20] fxch st(2) faddp st(5),st faddp st(3),st faddp st(1),st fld dword ptr [ebx+8] fmul dword ptr [eax+24] fld dword ptr [ebx+8] fmul dword ptr [eax+28] fld dword ptr [ebx+8] fmul dword ptr [eax+32] fxch st(2) faddp st(5),st faddp st(3),st faddp st(1),st fxch st(1) /* trick A : preload next value*/ fld dword ptr [ebx+12] fxch st(3) fstp dword ptr [ecx] /* A no more penalty here*/ fstp dword ptr [ecx+4] fstp dword ptr [ecx+8] /**/ fmul dword ptr [eax] fld dword ptr [ebx+12] fmul dword ptr [eax+4] fld dword ptr [ebx+12] fmul dword ptr [eax+8] fld dword ptr [ebx+16] fmul dword ptr [eax+12] fld dword ptr [ebx+16] fmul dword ptr [eax+16] fld dword ptr [ebx+16] fmul dword ptr [eax+20] fxch st(2) faddp st(5),st faddp st(3),st faddp st(1),st fld dword ptr [ebx+20] fmul dword ptr [eax+24] fld dword ptr [ebx+20] fmul dword ptr [eax+28] fld dword ptr [ebx+20] fmul 
dword ptr [eax+32] fxch st(2) faddp st(5),st faddp st(3),st faddp st(1),st fxch st(2) /* trick B : preload next value*/ /* trick C : replace fxch st(1)*/ fld dword ptr [ebx+24] fmul dword ptr [eax] fxch st(2) /* trick C : replace fxch st(3)*/ fstp dword ptr [ecx+16] fstp dword ptr [ecx+12] /* B: no more penalty here*/ /*fstp dword ptr [ecx+20] // trick C : store it later*/ /**/ fld dword ptr [ebx+24] fmul dword ptr [eax+4] fld dword ptr [ebx+24] fmul dword ptr [eax+8] fld dword ptr [ebx+28] fmul dword ptr [eax+12] fld dword ptr [ebx+28] fmul dword ptr [eax+16] fld dword ptr [ebx+28] fmul dword ptr [eax+20] fxch st(2) faddp st(5),st faddp st(3),st faddp st(1),st fxch st(3) /* trick C :added*/ fld dword ptr [ebx+32] fmul dword ptr [eax+24] fld dword ptr [ebx+32] fmul dword ptr [eax+28] fld dword ptr [ebx+32] fmul dword ptr [eax+32] fxch st(2) faddp st(5),st faddp st(3),st faddp st(4),st /* trick C : replace faddp st(1),st*/ fstp dword ptr [ecx+20] /* trick C : store it later*/ /* no more penalty here*/ fstp dword ptr [ecx+28] /* trick C : swapped stores*/ fstp dword ptr [ecx+24] fstp dword ptr [ecx+32] } } #endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/ #if defined(OPTIMIZED_FOR_PC_FLOATS) #define MTH3D_M_vMulMatrixMatrixWithoutBufferC MTH3D_M_vMulMatrixMatrixWithoutBufferORG #endif /* OPTIMIZED_FOR_PC_FLOATS*/ /* 174 clocks (69 penalties) 119 instructions : 1.68 % pairing*/ #define MTH3D_M_vMulMatrixMatrixWithoutBufferORG(MatDest, MatA, MatB) \ { \ (MatDest)->stCol_0.xX = MTH_M_xMulAddMulAddMul( \ (MatA)->stCol_0.xX, (MatB)->stCol_0.xX, \ (MatA)->stCol_1.xX, (MatB)->stCol_0.xY, \ (MatA)->stCol_2.xX, (MatB)->stCol_0.xZ ); \ (MatDest)->stCol_0.xY = MTH_M_xMulAddMulAddMul( \ (MatA)->stCol_0.xY, (MatB)->stCol_0.xX, \ (MatA)->stCol_1.xY, (MatB)->stCol_0.xY, \ (MatA)->stCol_2.xY, (MatB)->stCol_0.xZ ); \ (MatDest)->stCol_0.xZ = MTH_M_xMulAddMulAddMul( \ (MatA)->stCol_0.xZ, (MatB)->stCol_0.xX, \ (MatA)->stCol_1.xZ, (MatB)->stCol_0.xY, \ (MatA)->stCol_2.xZ, (MatB)->stCol_0.xZ 
); \ \ (MatDest)->stCol_1.xX = MTH_M_xMulAddMulAddMul( \ (MatA)->stCol_0.xX, (MatB)->stCol_1.xX, \ (MatA)->stCol_1.xX, (MatB)->stCol_1.xY, \ (MatA)->stCol_2.xX, (MatB)->stCol_1.xZ ); \ (MatDest)->stCol_1.xY = MTH_M_xMulAddMulAddMul( \ (MatA)->stCol_0.xY, (MatB)->stCol_1.xX, \ (MatA)->stCol_1.xY, (MatB)->stCol_1.xY, \ (MatA)->stCol_2.xY, (MatB)->stCol_1.xZ ); \ (MatDest)->stCol_1.xZ = MTH_M_xMulAddMulAddMul( \ (MatA)->stCol_0.xZ, (MatB)->stCol_1.xX, \ (MatA)->stCol_1.xZ, (MatB)->stCol_1.xY, \ (MatA)->stCol_2.xZ, (MatB)->stCol_1.xZ ); \ \ (MatDest)->stCol_2.xX = MTH_M_xMulAddMulAddMul( \ (MatA)->stCol_0.xX, (MatB)->stCol_2.xX, \ (MatA)->stCol_1.xX, (MatB)->stCol_2.xY, \ (MatA)->stCol_2.xX, (MatB)->stCol_2.xZ ); \ (MatDest)->stCol_2.xY = MTH_M_xMulAddMulAddMul( \ (MatA)->stCol_0.xY, (MatB)->stCol_2.xX, \ (MatA)->stCol_1.xY, (MatB)->stCol_2.xY, \ (MatA)->stCol_2.xY, (MatB)->stCol_2.xZ ); \ (MatDest)->stCol_2.xZ = MTH_M_xMulAddMulAddMul( \ (MatA)->stCol_0.xZ, (MatB)->stCol_2.xX, \ (MatA)->stCol_1.xZ, (MatB)->stCol_2.xY, \ (MatA)->stCol_2.xZ, (MatB)->stCol_2.xZ ); \ } #if defined(OPTIMIZED_FOR_U64_ASM) static inline void MTH3D_M_vMulMatrixMatrixWithoutBufferU64ASM(struct MTH3D_tdstMatrix_ *MatDest,struct MTH3D_tdstMatrix_ *MatA,struct MTH3D_tdstMatrix_ *MatB) { /* GGG RRR OOO U U M M PPP FFFF !! !! G G R R O O U U MM MM P P F !! !! G RRR O O U U M M M PPP FFF !! !! 
G GG R R O O U U M M P F GGG R R OOO UUU M M P F oo oo PLEASE DO NOT REMOVE ASM IN LOWER CASE !!!!!*/ asm(" .set noreorder "); asm(" # Begin MulMatrixMatrixWithoutBufferU64ASM "); asm( " # Premier MulMatrixVertex \n" " lwc1 $f6,0(%2) \n" " lwc1 $f8,4(%2) \n" " lwc1 $f10,8(%2) \n" " lwc1 $f0,0(%1) # f0 <- Mat[0,0] \n" " lwc1 $f2,12(%1) # f2 <- Mat[0,1] \n" " mul.s $f0,$f0,$f6 # f0 <- Mat[0,0] x Vect[0] \n" " lwc1 $f4,24(%1) # f4 <- Mat[0,2] \n" " mul.s $f2,$f2,$f8 # f2 <- Mat[0,1] x Vect[1] \n" " lwc1 $f12,4(%1) # f12 <- Mat[1,0] \n" " mul.s $f4,$f4,$f10 # f4 <- Mat[0,2] x Vect[2] \n" " lwc1 $f14,16(%1) # f14 <- Mat[1,1] \n" " mul.s $f12,$f12,$f6 # f12 <- Mat[1,0] x Vect[0] \n" " add.s $f0,$f0,$f2 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n" " lwc1 $f16,28(%1) # f16 <- Mat[1,2] \n" " mul.s $f14,$f14,$f8 # f14 <- Mat[1,1] x Vect[1] \n" " add.s $f0,$f0,$f4 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n" " lwc1 $f18,8(%1) # f18 <- Mat[2,0] \n" " mul.s $f16,$f16,$f10 # f16 <- Mat[1,2] x Vect[2] \n" " add.s $f12,$f12,$f14 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n" " lwc1 $f2,20(%1) # f2 <- Mat[2,1] \n" " mul.s $f18,$f18,$f6 # f18 <- Mat[2,0] x Vect[0] \n" " add.s $f12,$f12,$f16 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n" " lwc1 $f4,32(%1) # f4 <- Mat[2,2] \n" " mul.s $f2,$f2,$f8 # f2 <- Mat[2,1] x Vect[1] \n" " swc1 $f0,0(%0) # f0 -> Dest[0] \n" " mul.s $f4,$f4,$f10 # f4 <- Mat[2,2] x Vect[2] \n" " add.s $f18,$f18,$f2 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n" " swc1 $f12,4(%0) # f12 -> Dest[1] \n" " add.s $f18,$f18,$f4 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n" " swc1 $f18,8(%0) # f18 -> Dest[2] \n" " # Deuxième MulMatrixVertex \n" " lwc1 $f6,12(%2) \n" " lwc1 $f8,16(%2) \n" " lwc1 $f10,20(%2) \n" " lwc1 $f0,0(%1) # f0 <- Mat[0,0] \n" " lwc1 $f2,12(%1) # f2 <- Mat[0,1] \n" " mul.s $f0,$f0,$f6 # f0 <- Mat[0,0] x Vect[0] \n" " lwc1 $f4,24(%1) # f4 <- Mat[0,2] 
\n" " mul.s $f2,$f2,$f8 # f2 <- Mat[0,1] x Vect[1] \n" " lwc1 $f12,4(%1) # f12 <- Mat[1,0] \n" " mul.s $f4,$f4,$f10 # f4 <- Mat[0,2] x Vect[2] \n" " lwc1 $f14,16(%1) # f14 <- Mat[1,1] \n" " mul.s $f12,$f12,$f6 # f12 <- Mat[1,0] x Vect[0] \n" " add.s $f0,$f0,$f2 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n" " lwc1 $f16,28(%1) # f16 <- Mat[1,2] \n" " mul.s $f14,$f14,$f8 # f14 <- Mat[1,1] x Vect[1] \n" " add.s $f0,$f0,$f4 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n" " lwc1 $f18,8(%1) # f18 <- Mat[2,0] \n" " mul.s $f16,$f16,$f10 # f16 <- Mat[1,2] x Vect[2] \n" " add.s $f12,$f12,$f14 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n" " lwc1 $f2,20(%1) # f2 <- Mat[2,1] \n" " mul.s $f18,$f18,$f6 # f18 <- Mat[2,0] x Vect[0] \n" " add.s $f12,$f12,$f16 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n" " lwc1 $f4,32(%1) # f4 <- Mat[2,2] \n" " mul.s $f2,$f2,$f8 # f2 <- Mat[2,1] x Vect[1] \n" " swc1 $f0,12(%0) # f0 -> Dest[0] \n" " mul.s $f4,$f4,$f10 # f4 <- Mat[2,2] x Vect[2] \n" " add.s $f18,$f18,$f2 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n" " swc1 $f12,16(%0) # f12 -> Dest[1] \n" " add.s $f18,$f18,$f4 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n" " swc1 $f18,20(%0) # f18 -> Dest[2] \n" " # Troisième MulMatrixVertex \n" " lwc1 $f6,24(%2) \n" " lwc1 $f8,28(%2) \n" " lwc1 $f10,32(%2) \n" " lwc1 $f0,0(%1) # f0 <- Mat[0,0] \n" " lwc1 $f2,12(%1) # f2 <- Mat[0,1] \n" " mul.s $f0,$f0,$f6 # f0 <- Mat[0,0] x Vect[0] \n" " lwc1 $f4,24(%1) # f4 <- Mat[0,2] \n" " mul.s $f2,$f2,$f8 # f2 <- Mat[0,1] x Vect[1] \n" " lwc1 $f12,4(%1) # f12 <- Mat[1,0] \n" " mul.s $f4,$f4,$f10 # f4 <- Mat[0,2] x Vect[2] \n" " lwc1 $f14,16(%1) # f14 <- Mat[1,1] \n" " mul.s $f12,$f12,$f6 # f12 <- Mat[1,0] x Vect[0] \n" " add.s $f0,$f0,$f2 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n" " lwc1 $f16,28(%1) # f16 <- Mat[1,2] \n" " mul.s $f14,$f14,$f8 # f14 <- Mat[1,1] x Vect[1] \n" " add.s $f0,$f0,$f4 # f0 <- 
Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n" " lwc1 $f18,8(%1) # f18 <- Mat[2,0] \n" " mul.s $f16,$f16,$f10 # f16 <- Mat[1,2] x Vect[2] \n" " add.s $f12,$f12,$f14 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n" " lwc1 $f2,20(%1) # f2 <- Mat[2,1] \n" " mul.s $f18,$f18,$f6 # f18 <- Mat[2,0] x Vect[0] \n" " add.s $f12,$f12,$f16 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n" " lwc1 $f4,32(%1) # f4 <- Mat[2,2] \n" " mul.s $f2,$f2,$f8 # f2 <- Mat[2,1] x Vect[1] \n" " swc1 $f0,24(%0) # f0 -> Dest[0] \n" " mul.s $f4,$f4,$f10 # f4 <- Mat[2,2] x Vect[2] \n" " add.s $f18,$f18,$f2 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n" " swc1 $f12,28(%0) # f12 -> Dest[1] \n" " add.s $f18,$f18,$f4 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n" " swc1 $f18,32(%0) # f18 -> Dest[2] \n" : : "r" (MatDest), "r" (MatA), "r" (MatB) : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18" ); asm(" # EndOf MulMatrixMatrixWithoutBufferU64ASM ");\ asm(" .set reorder "); } #endif /* OPTIMIZED_FOR_U64_ASM*/ /************************************************************************************************************************/ /* MTH3D_M_vMulMatrixMatrix*/ /************************************************************************************************************************/ #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM) #define MTH3D_M_vMulMatrixMatrixASM(Mat_Dest, Mat_A, Mat_B) \ { if (Mat_Dest==Mat_A) \ { \ MTH3D_tdstMatrix Mtemp; \ \ MTH3D_M_vMulMatrixMatrixWithoutBuffer( \ &Mtemp, Mat_A, Mat_B); \ MTH3D_M_vCopyMatrix( Mat_Dest, &Mtemp); \ } \ else \ { \ MTH3D_M_vMulMatrixMatrixWithoutBuffer(Mat_Dest, \ Mat_A, Mat_B); \ } \ } #endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/ #if defined(OPTIMIZED_FOR_PC_FLOATS) #define MTH3D_M_vMulMatrixMatrixC MTH3D_M_vMulMatrixMatrixORG #endif /* OPTIMIZED_FOR_PC_FLOATS*/ #define MTH3D_M_vMulMatrixMatrixORG(Mat_Dest, Mat_A, Mat_B) \ { if( (Mat_Dest==Mat_A) || 
(Mat_Dest==Mat_B) ) \
    { /* Mat_Dest is also an operand: build the product in a temporary, then copy it */ \
        MTH3D_tdstMatrix Mtemp; \
        \
        MTH3D_M_vMulMatrixMatrixWithoutBuffer( \
            &Mtemp, Mat_A, Mat_B); \
        MTH3D_M_vCopyMatrix( Mat_Dest, &Mtemp); \
    } \
    else \
    { /* distinct destination: multiply straight into Mat_Dest */ \
        MTH3D_M_vMulMatrixMatrixWithoutBuffer(Mat_Dest, \
            Mat_A, Mat_B); \
    } \
}

/************************************************************************************************************************/
/* MTH3D_M_xDetMatrix*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/*
 * MTH3D_M_xDetMatrixASM : determinant of the 3x3 matrix *MatA, computed on the x87 FPU.
 *
 * Evaluates
 *      0X.(1Y.2Z-1Z.2Y) + 0Y.(1Z.2X-1X.2Z) + 0Z.(1X.2Y-1Y.2X)
 * where "cR" is component R of column c, read at byte offset 12*c + 4*R from MatA
 * (for instance [ecx+16] is 1Y and [ecx+32] is 2Z).
 * Every access is a "dword ptr" one, so this assumes MTH_tdxReal is a 32-bit float
 * - TODO confirm against the definition of MTH_tdxReal.
 *
 * The comments on the right of the instructions give the FPU stack after the
 * instruction, top of stack first.
 *
 * The result is popped into the local xTempMatrix and returned from C.
 * NOTE(review): warning 4035 is, as far as I know, MSVC's "no return value" warning;
 * the function has an explicit return, so the pragma pair looks like a leftover - confirm.
 */
/* only 9 muls instead of 12 !!! */
/* 31 clocks but 5 penalties*/
#pragma warning(disable:4035)
INLINE MTH_tdxReal MTH3D_M_xDetMatrixASM(struct MTH3D_tdstMatrix_ *MatA)
{
    register MTH_tdxReal xTempMatrix; /* not useful but necessary to no hang compiler in release mode !?!*/
    __asm
    {
        /*   = 0X.(1Y.2Z-1Z.2Y) | = 0X.(A-B) | = X*/
        /*   + 0Y.(1Z.2X-1X.2Z) | + 0Y.(C-D) | + Y*/
        /*   + 0Z.(1X.2Y-1Y.2X) | + 0Z.(E-F) | + Z*/
        mov ecx,MatA
        fld dword ptr [ecx+16]
        fmul dword ptr [ecx+32]             /* A*/
        fld dword ptr [ecx+20]
        fmul dword ptr [ecx+28]             /* B A*/
        fld dword ptr [ecx+20]
        fmul dword ptr [ecx+24]             /* C B A*/
        fld dword ptr [ecx+12]
        fmul dword ptr [ecx+32]             /* D C B A*/
        fxch st(2)                          /* B C D A*/
        fsubp st(3),st                      /* C D A-B*/
        fld dword ptr [ecx+12]
        fmul dword ptr [ecx+28]             /* E C D A-B*/
        fxch st(2)                          /* D C E A-B*/
        fsubp st(1),st                      /* C-D E A-B*/
        fxch st(2)                          /* A-B E C-D*/
        fmul dword ptr [ecx]                /* X E C-D*/
        fld dword ptr [ecx+16]
        fmul dword ptr [ecx+24]             /* F X E C-D*/
        fxch st(3)                          /* C-D X E F */
        fmul dword ptr [ecx+4]              /* Y X E F*/
        fxch st(3)                          /* F X E Y*/
        fsubp st(2),st                      /* X E-F Y*/
        faddp st(2),st                      /* E-F X+Y*/
        fmul dword ptr [ecx+8]              /* Z X+Y*/
        faddp st(1),st                      /* X+Y+Z*/
        fstp dword ptr [xTempMatrix]        /* not useful but necessary to no hang compiler in release mode !?!*/
    }
    return(xTempMatrix); /* not useful but necessary to no hang compiler in release mode !?!*/
}
#pragma warning(default:4035)
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/

#if defined(OPTIMIZED_FOR_PC_FLOATS)
/*
 * MTH3D_M_xDetMatrixC : expression macro giving the determinant of *MatA by expansion
 * along column 0 (stCol_0.xX, stCol_0.xY, stCol_0.xZ times their cofactors).
 * MTH_M_xMulSubMul(a,b,c,d) is defined elsewhere; from its use here it is presumably
 * a*b - c*d - confirm. MatA is expanded many times: pass a side-effect free expression.
 */
/* only 9 muls instead of 12 !!! */
/* 37 clocks but 11 penalties*/
#define MTH3D_M_xDetMatrixC( MatA ) \
    MTH_M_xAdd3( \
        MTH_M_xMul( (MatA)->stCol_0.xX, \
            MTH_M_xMulSubMul( (MatA)->stCol_1.xY, (MatA)->stCol_2.xZ, (MatA)->stCol_1.xZ, (MatA)->stCol_2.xY ) \
        ), \
        MTH_M_xMul( (MatA)->stCol_0.xY, \
            MTH_M_xMulSubMul( (MatA)->stCol_1.xZ, (MatA)->stCol_2.xX, (MatA)->stCol_1.xX, (MatA)->stCol_2.xZ ) \
        ), \
        MTH_M_xMul( (MatA)->stCol_0.xZ, \
            MTH_M_xMulSubMul( (MatA)->stCol_1.xX, (MatA)->stCol_2.xY, (MatA)->stCol_1.xY, (MatA)->stCol_2.xX ) \
        ) \
    )
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/*
 * MTH3D_M_xDetMatrixORG : reference expression macro for the determinant of *MatA,
 * written as (sum of three triple products) - (sum of three triple products),
 * i.e. six MTH_M_xMul3 calls. MatA is expanded many times.
 */
/* 41 clocks but 16 penalties*/
#define MTH3D_M_xDetMatrixORG(MatA) \
    MTH_M_xSub( \
        MTH_M_xAdd3( \
            MTH_M_xMul3( (MatA)->stCol_0.xX, (MatA)->stCol_1.xY, (MatA)->stCol_2.xZ ), \
            MTH_M_xMul3( (MatA)->stCol_0.xY, (MatA)->stCol_1.xZ, (MatA)->stCol_2.xX ), \
            MTH_M_xMul3( (MatA)->stCol_0.xZ, (MatA)->stCol_1.xX, (MatA)->stCol_2.xY )), \
        MTH_M_xAdd3( \
            MTH_M_xMul3( (MatA)->stCol_0.xZ, (MatA)->stCol_1.xY, (MatA)->stCol_2.xX ), \
            MTH_M_xMul3( (MatA)->stCol_1.xZ, (MatA)->stCol_2.xY, (MatA)->stCol_0.xX ), \
            MTH_M_xMul3( (MatA)->stCol_2.xZ, (MatA)->stCol_0.xY, (MatA)->stCol_1.xX )))

/************************************************************************************************************************/
/* MTH3D_M_vMulMatrixVectorWithoutBuffer*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/*
 * MTH3D_M_vMulMatrixVectorWithoutBufferASM : *VectDest = (*MatA) x (*VectA) on the x87 FPU.
 *
 * Each component of VectA (offsets 0, 4, 8 from ebx) is multiplied by the three
 * components of the matching matrix column (offsets 0..8, 12..20, 24..32 from eax)
 * and the products are accumulated on the FPU stack. The three stores to VectDest
 * are the last instructions, after every load, which is why VectDest may be the
 * same object as VectA.
 * Accesses are "dword ptr" ones: assumes 32-bit float components - TODO confirm.
 */
/* requires no temporary buffer if VectDest==VectA*/
/* 32 clocks (1 penalty) 33 instructions : 12,12 % pairing*/
INLINE void MTH3D_M_vMulMatrixVectorWithoutBufferASM( struct MTH3D_tdstVector_ *VectDest, struct MTH3D_tdstMatrix_ *MatA, struct MTH3D_tdstVector_ *VectA)
{
    __asm
    {
        mov ebx,VectA
        mov eax,MatA
        mov ecx,VectDest

        /* VectA->xX times column 0 */
        fld dword ptr [ebx]
        fmul dword ptr [eax]
        fld dword ptr [ebx]
        fmul dword ptr [eax+4]
        fld dword ptr [ebx]
        fmul dword ptr [eax+8]

        /* VectA->xY times column 1, added to the three running sums */
        fld dword ptr [ebx+4]
        fmul dword ptr [eax+12]
        fld dword ptr [ebx+4]
        fmul dword ptr [eax+16]
        fld dword ptr [ebx+4]
        fmul dword ptr [eax+20]
        fxch st(2)
        faddp st(5),st
        faddp st(3),st
        faddp st(1),st

        /* VectA->xZ times column 2, added to the three running sums */
        fld dword ptr [ebx+8]
        fmul dword ptr [eax+24]
        fld dword ptr [ebx+8]
        fmul dword ptr [eax+28]
        fld dword ptr [ebx+8]
        fmul dword ptr [eax+32]
        fxch st(2)
        faddp st(5),st
        faddp st(3),st
        faddp st(1),st

        /* bring the X sum on top, then pop X, Y, Z into VectDest */
        fxch st(2)
        fstp dword ptr [ecx]
        fstp dword ptr [ecx+4]
        fstp dword ptr [ecx+8]
    }
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/

#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* no dedicated C variant: the plain-C optimised name maps onto the reference macro */
#define MTH3D_M_vMulMatrixVectorWithoutBufferC MTH3D_M_vMulMatrixVectorWithoutBufferORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/*
 * MTH3D_M_vMulMatrixVectorWithoutBufferORG : reference macro, *VectDest = (*MatA) x (*VectA).
 * VectDest->xX is assigned before VectA->xX is read again for the Y and Z rows, so,
 * unlike the assembly variants, VectDest must not be the same object as VectA.
 * Every argument is expanded several times.
 */
/* 56 clocks (20 penalties) 44 instructions : 4.55 % pairing*/
#define MTH3D_M_vMulMatrixVectorWithoutBufferORG( VectDest, MatA, VectA) \
{   (VectDest)->xX = MTH_M_xAdd3( \
        MTH_M_xMul( (MatA)->stCol_0.xX, (VectA)->xX), \
        MTH_M_xMul( (MatA)->stCol_1.xX, (VectA)->xY), \
        MTH_M_xMul( (MatA)->stCol_2.xX, (VectA)->xZ)); \
    (VectDest)->xY = MTH_M_xAdd3( \
        MTH_M_xMul( (MatA)->stCol_0.xY, (VectA)->xX), \
        MTH_M_xMul( (MatA)->stCol_1.xY, (VectA)->xY), \
        MTH_M_xMul( (MatA)->stCol_2.xY, (VectA)->xZ)); \
    (VectDest)->xZ = MTH_M_xAdd3( \
        MTH_M_xMul( (MatA)->stCol_0.xZ, (VectA)->xX), \
        MTH_M_xMul( (MatA)->stCol_1.xZ, (VectA)->xY), \
        MTH_M_xMul( (MatA)->stCol_2.xZ, (VectA)->xZ)); }

#if defined(OPTIMIZED_FOR_U64_ASM)
/*
 * MTH3D_M_vMulMatrixVectorWithoutBufferU64ASM : *VectDest = (*MatA) x (*VectA) in MIPS
 * floating-point assembly (GCC extended asm; %0 = VectDest, %1 = MatA, %2 = VectA).
 *
 * The three components of VectA are loaded into $f6/$f8/$f10 first and the first
 * store to VectDest comes after every load from VectA, hence the remark below.
 * In the assembler comments Mat[r,c] is read at byte offset 12*c + 4*r from MatA.
 * The block is bracketed by ".set noreorder" / ".set reorder", presumably so that
 * the assembler keeps this hand-scheduled instruction order - confirm.
 *
 * NOTE(review): the asm stores through %0 and loads through %1/%2 but declares
 * neither outputs nor a "memory" clobber; check that the compiler in use cannot
 * cache or reorder accesses to these objects around the statement.
 */
/* requires no temporary buffer if VectDest==VectA*/
static inline void MTH3D_M_vMulMatrixVectorWithoutBufferU64ASM( struct MTH3D_tdstVector_ *VectDest, struct MTH3D_tdstMatrix_ *MatA, struct MTH3D_tdstVector_ *VectA)
{
    /* GROUMPF !!  PLEASE DO NOT REMOVE ASM IN LOWER CASE !!!!! (original author's warning; the reason is not visible here) */
    asm(" .set noreorder ");
    asm(" # Begin MulMatrixVectorWithoutBufferU64ASM " );
    asm(
        " lwc1 $f6,0(%2) \n"
        " lwc1 $f8,4(%2) \n"
        " lwc1 $f10,8(%2) \n"
        " lwc1 $f0,0(%1) # f0 <- Mat[0,0] \n"
        " lwc1 $f2,12(%1) # f2 <- Mat[0,1] \n"
        " mul.s $f0,$f0,$f6 # f0 <- Mat[0,0] x Vect[0] \n"
        " lwc1 $f4,24(%1) # f4 <- Mat[0,2] \n"
        " mul.s $f2,$f2,$f8 # f2 <- Mat[0,1] x Vect[1] \n"
        " lwc1 $f12,4(%1) # f12 <- Mat[1,0] \n"
        " mul.s $f4,$f4,$f10 # f4 <- Mat[0,2] x Vect[2] \n"
        " lwc1 $f14,16(%1) # f14 <- Mat[1,1] \n"
        " mul.s $f12,$f12,$f6 # f12 <- Mat[1,0] x Vect[0] \n"
        " add.s $f0,$f0,$f2 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n"
        " lwc1 $f16,28(%1) # f16 <- Mat[1,2] \n"
        " mul.s $f14,$f14,$f8 # f14 <- Mat[1,1] x Vect[1] \n"
        " add.s $f0,$f0,$f4 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n"
        " lwc1 $f18,8(%1) # f18 <- Mat[2,0] \n"
        " mul.s $f16,$f16,$f10 # f16 <- Mat[1,2] x Vect[2] \n"
        " add.s $f12,$f12,$f14 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n"
        " lwc1 $f2,20(%1) # f2 <- Mat[2,1] \n"
        " mul.s $f18,$f18,$f6 # f18 <- Mat[2,0] x Vect[0] \n"
        " add.s $f12,$f12,$f16 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n"
        " lwc1 $f4,32(%1) # f4 <- Mat[2,2] \n"
        " mul.s $f2,$f2,$f8 # f2 <- Mat[2,1] x Vect[1] \n"
        " swc1 $f0,0(%0) # f0 -> Dest[0] \n"
        " mul.s $f4,$f4,$f10 # f4 <- Mat[2,2] x Vect[2] \n"
        " add.s $f18,$f18,$f2 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n"
        " swc1 $f12,4(%0) # f12 -> Dest[1] \n"
        " add.s $f18,$f18,$f4 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n"
        " swc1 $f18,8(%0) # f18 -> Dest[2] \n"
        :
        : "r" (VectDest), "r" (MatA), "r" (VectA)
        : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18" );
    asm(" # EndOf MulMatrixVectorWithoutBufferU64ASM ");
    asm(" .set reorder ");
}
#endif /* OPTIMIZED_FOR_U64_ASM*/
/************************************************************************************************************************/
/* MTH3D_M_vMulMatrixVector*/
/************************************************************************************************************************/
/*
 * Alias-tolerant matrix x vector product. The two assembly "WithoutBuffer" routines
 * are documented as needing no temporary when VectDest==VectA, so their names are
 * reused directly; the reference macro below adds the temporary itself.
 */
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
#define MTH3D_M_vMulMatrixVectorASM MTH3D_M_vMulMatrixVectorWithoutBufferASM
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/

#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vMulMatrixVectorC MTH3D_M_vMulMatrixVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

#if defined(OPTIMIZED_FOR_U64_ASM)
#define MTH3D_M_vMulMatrixVectorU64ASM MTH3D_M_vMulMatrixVectorWithoutBufferU64ASM
#endif /* OPTIMIZED_FOR_U64_ASM*/

/*
 * MTH3D_M_vMulMatrixVectorORG : *VectDest = (*MatA) x (*VectA).
 * When VectA and VectDest compare equal, VectA is first copied into a local vector
 * which is then used as the source; otherwise the product is written directly.
 * The arguments appear unparenthesised in the comparison and are expanded several
 * times: pass plain pointer expressions without side effects.
 */
#define MTH3D_M_vMulMatrixVectorORG( VectDest, MatA, VectA) \
{ if( VectA==VectDest ) \
    { \
        MTH3D_tdstVector Vtmp; \
        \
        MTH3D_M_vCopyVector( &Vtmp, VectA); \
        MTH3D_M_vMulMatrixVectorWithoutBuffer( VectDest, MatA, &Vtmp); \
    } \
    else \
    { \
        MTH3D_M_vMulMatrixVectorWithoutBuffer( VectDest, MatA, VectA); \
    } \
}

/************************************************************************************************************************/
/* MTH3D_M_vInverMatrix*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/*
 * MTH3D_M_vInverMatrixASM : *MatDest = inverse of the 3x3 matrix *MatA, on the x87 FPU.
 *
 * eax = MatA, ebx = MatDest; element offsets are the same 0..32 byte offsets used by
 * the other routines of this file. Nine two-product differences are formed
 * (P1..P3, N1..N3, M1..M3 in the stack comments); the determinant is accumulated as
 * X+Y+Z from P1..P3 and the first three matrix elements; a single fdivrp produces
 * D = 1/det, and each difference is multiplied by D (lower-case p/n/m) before being
 * stored. ONE is a static so that the constant can be loaded from memory by name.
 *
 * The comments on the right give the FPU stack after the instruction, top first.
 * At its peak the routine holds eight values, i.e. it needs the whole x87 register
 * stack to be free on entry.
 *
 * NOTE(review): results are stored to MatDest ([ebx], [ebx+12], ...) while elements of
 * MatA are still to be loaded, so MatDest must not be the same matrix as MatA.
 * NOTE(review): det is not tested against zero before the division.
 * NOTE(review): warning 4725 is, as far as I know, MSVC's "instruction may be
 * inaccurate on some Pentiums" warning - confirm that silencing it is still wanted.
 */
/* avoid transposition, stack, 9 divisions by det, 4 negs*/
/* 131 clocks (38+1 penalties) 98 instructions : 2.04 % pairing*/
#pragma warning(disable:4725)
INLINE void MTH3D_M_vInverMatrixASM(struct MTH3D_tdstMatrix_ *MatDest, struct MTH3D_tdstMatrix_ *MatA)
{
    static MTH_tdxReal ONE=MTH_C_ONE;
    __asm
    {
        mov eax,MatA
        mov ebx,MatDest
        fld dword ptr [eax+16]              /*<====== 1 penalty : AGI stall because of eax load*/
        fmul dword ptr [eax+32]             /* load A1*/
        fld dword ptr [eax+20]
        fmul dword ptr [eax+24]             /* load A2 */
        fld dword ptr [eax+12]
        fmul dword ptr [eax+28]             /* load A3*/
        fld dword ptr [eax+20]
        fmul dword ptr [eax+28]             /* load B1*/
        fld dword ptr [eax+12]
        fmul dword ptr [eax+32]             /* load B2*/
        fld dword ptr [eax+16]
        fmul dword ptr [eax+24]             /* load B3*/
                                            /* B3 B2 B1 A3 A2 A1*/
        fxch st(2)                          /* B1 B2 B3 A3 A2 A1*/
        fsubp st(5),st                      /* B2 B3 A3 A2 P1*/
        fsubp st(3),st                      /* B3 A3 P2 P1 */
        fsubp st(1),st                      /* P3 P2 P1*/
        fld dword ptr [eax+8]
        fmul dword ptr [eax+28]             /* C1 P3 P2 P1*/
        fld st(3)
        fmul dword ptr [eax]                /* X C1 P3 P2 P1*/
        fld st(3)
        fmul dword ptr [eax+4]              /* Y X C1 P3 P2 P1*/
        fld st(3)
        fmul dword ptr [eax+8]              /* Z Y X C1 P3 P2 P1*/
        fld dword ptr [eax+4]
        fmul dword ptr [eax+32]             /* D1 Z Y X C1 P3 P2 P1*/
        fxch st(2)
        faddp st(3),st                      /* Z D1 X+Y C1 P3 P2 P1*/
        fld dword ptr [eax]
        fmul dword ptr [eax+32]             /* C2 Z D1 X+Y C1 P3 P2 P1*/
        fxch st(2)                          /* D1 Z C2 X+Y C1 P3 P2 P1*/
        fsubp st(4),st                      /* Z C2 X+Y N1 P3 P2 P1*/
        faddp st(2),st                      /* C2 X+Y+Z N1 P3 P2 P1*/
        fld dword ptr [eax+8]
        fmul dword ptr [eax+24]             /* D2 C2 X+Y+Z N1 P3 P2 P1*/
        fld dword ptr [ONE]                 /* 1 D2 C2 det=X+Y+Z N1 P3 P2 P1*/
        fdivrp st(3),st                     /* D2 C2 D=1/det N1 P3 P2 P1 <====== unavoidable 38 clocks penalties*/
        /* Disabled alternative for the two instructions above, kept for reference:
               fld   dword ptr [ONE]        -> 1 D2 C2 det=X+Y+Z N1 P3 P2 P1
               fxch  st(3)                  -> det D2 C2 1 N1 P3 P2 P1
               fdivp st(3),st               -> D2 C2 D=1/det N1 P3 P2 P1 <====== unavoidable 38 clocks penalties
        */
        fsubp st(1),st                      /* N2 D N1 P3 P2 P1*/
        fxch st(1)                          /* D N2 N1 P3 P2 P1*/

        /* scale the five differences already available by D (the copy of D is consumed by fmulp) */
        fld st(0)
        fmulp st(6),st                      /* D N2 N1 P3 P2 p1*/
        fld st(0)
        fmulp st(5),st                      /* D N2 N1 P3 p2 p1*/
        fld st(0)
        fmulp st(4),st                      /* D N2 N1 p3 p2 p1*/
        fld st(0)
        fmulp st(3),st                      /* D N2 n1 p3 p2 p1*/
        fld st(0)
        fmulp st(2),st                      /* D n2 n1 p3 p2 p1*/

        fld dword ptr [eax+4]
        fmul dword ptr [eax+24]             /* C3 D n2 n1 p3 p2 p1*/
        fld dword ptr [eax]
        fmul dword ptr [eax+28]             /* D3 C3 D n2 n1 p3 p2 p1*/
        fxch st(7)                          /* p1 C3 D n2 n1 p3 p2 D3*/
        fstp dword ptr [ebx]                /* C3 D n2 n1 p3 p2 D3*/

        fld dword ptr [eax+4]
        fmul dword ptr [eax+20]             /* E1 C3 D n2 n1 p3 p2 D3*/
        fxch st(1)                          /* C3 E1 D n2 n1 p3 p2 D3*/
        fsubrp st(7),st                     /* E1 D n2 n1 p3 p2 N3*/

        fld dword ptr [eax+8]
        fmul dword ptr [eax+16]             /* F1 E1 D n2 n1 p3 p2 N3*/
        fxch st(6)                          /* p2 E1 D n2 n1 p3 F1 N3*/
        fstp dword ptr [ebx+12]             /* E1 D n2 n1 p3 F1 N3*/

        fld dword ptr [eax+8]
        fmul dword ptr [eax+12]             /* E2 E1 D n2 n1 p3 F1 N3*/
        fxch st(1)                          /* E1 E2 D n2 n1 p3 F1 N3*/
        fsubrp st(6),st                     /* E2 D n2 n1 p3 M1 N3*/

        fld dword ptr [eax]
        fmul dword ptr [eax+20]             /* F2 E2 D n2 n1 p3 M1 N3*/
        fxch st(5)                          /* p3 E2 D n2 n1 F2 M1 N3*/
        fstp dword ptr [ebx+24]             /* E2 D n2 n1 F2 M1 N3*/

        fld dword ptr [eax]
        fmul dword ptr [eax+16]             /* E3 E2 D n2 n1 F2 M1 N3*/
        fxch st(1)                          /* E2 E3 D n2 n1 F2 M1 N3*/
        fsubrp st(5),st                     /* E3 D n2 n1 M2 M1 N3*/

        fld dword ptr [eax+4]
        fmul dword ptr [eax+12]             /* F3 E3 D n2 n1 M2 M1 N3*/
        fxch st(4)                          /* n1 E3 D n2 F3 M2 M1 N3*/
        fstp dword ptr [ebx+4]              /* E3 D n2 F3 M2 M1 N3*/

        fxch st(1)                          /* D E3 n2 F3 M2 M1 N3*/
        fld st(0)                           /* D D E3 n2 F3 M2 M1 N3*/
        fmulp st(7),st                      /* D E3 n2 F3 M2 M1 n3*/
        fxch st(1)                          /* E3 D n2 F3 M2 M1 n3*/
        fsubrp st(3),st                     /* D n2 M3 M2 M1 n3*/

        fld st(0)                           /* D D n2 M3 M2 M1 n3*/
        fmulp st(5),st                      /* D n2 M3 M2 m1 n3*/
        fld st(0)                           /* D D n2 M3 M2 m1 n3*/
        fmulp st(4),st                      /* D n2 M3 m2 m1 n3*/

        fxch st(1)                          /* n2 D M3 m2 m1 n3*/
        fstp dword ptr [ebx+16]             /* D M3 m2 m1 n3*/
        fmulp st(1),st                      /* m3 m2 m1 n3 */
        fxch st(3)                          /* n3 m2 m1 m3*/

        fstp dword ptr [ebx+28]             /* m2 m1 m3*/
        fstp dword ptr [ebx+20]             /* m1 m3*/
        fstp dword ptr [ebx+8]              /* m3*/
        fstp dword ptr [ebx+32]
    }
}
#pragma warning(default:4725)
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/

#if defined(OPTIMIZED_FOR_PC_FLOATS)
/*
 * MTH3D_M_vInverMatrixC : *MatDest = inverse of the 3x3 matrix *MatA, in plain C.
 *
 * The nine two-product differences are written straight into their final slot of
 * MatDest. The first three (row xX of MatDest) are also multiplied by stCol_0.xX,
 * stCol_0.xY and stCol_0.xZ of MatA and summed into det; det is then replaced by
 * MTH_C_ONE / det and the whole of MatDest is scaled by it in place with
 * MTH3D_M_vMulScalarMatrix.
 *
 * NOTE(review): elements of MatDest are written while elements of MatA are still
 * to be read (MatDest->stCol_0.xX is assigned on the first statement, MatA->stCol_0.xX
 * is read on the second), so MatDest must not be the same matrix as MatA.
 * NOTE(review): det is not tested against zero before the division.
 */
/* avoid transposition, stack, 9 divisions by det, 4 negs*/
/* 214 clocks (81 penalties) 147 instructions : 1.36 % pairing*/
INLINE void MTH3D_M_vInverMatrixC(MTH3D_tdstMatrix *MatDest, MTH3D_tdstMatrix *MatA)
{
    register MTH_tdxReal det;

    /* row xX of the result, accumulating the determinant on the way */
    (MatDest)->stCol_0.xX= MTH_M_xMulSubMul((MatA)->stCol_1.xY,(MatA)->stCol_2.xZ,(MatA)->stCol_1.xZ,(MatA)->stCol_2.xY);
    det = MTH_M_xMul( (MatA)->stCol_0.xX ,(MatDest)->stCol_0.xX );
    (MatDest)->stCol_1.xX= MTH_M_xMulSubMul((MatA)->stCol_1.xZ,(MatA)->stCol_2.xX,(MatA)->stCol_1.xX,(MatA)->stCol_2.xZ);
    det = MTH_M_xAdd(MTH_M_xMul( (MatA)->stCol_0.xY ,(MatDest)->stCol_1.xX ), det);
    (MatDest)->stCol_2.xX= MTH_M_xMulSubMul((MatA)->stCol_1.xX,(MatA)->stCol_2.xY,(MatA)->stCol_1.xY,(MatA)->stCol_2.xX);
    det = MTH_M_xAdd(MTH_M_xMul( (MatA)->stCol_0.xZ ,(MatDest)->stCol_2.xX ), det);

    (MatDest)->stCol_0.xY= MTH_M_xMulSubMul((MatA)->stCol_0.xZ,(MatA)->stCol_2.xY,(MatA)->stCol_0.xY,(MatA)->stCol_2.xZ);
    /* from here on det holds 1/determinant */
    det = MTH_M_xDiv( MTH_C_ONE, det );
    (MatDest)->stCol_0.xZ= MTH_M_xMulSubMul((MatA)->stCol_0.xY,(MatA)->stCol_1.xZ,(MatA)->stCol_0.xZ,(MatA)->stCol_1.xY);
    (MatDest)->stCol_1.xY= MTH_M_xMulSubMul((MatA)->stCol_0.xX,(MatA)->stCol_2.xZ,(MatA)->stCol_0.xZ,(MatA)->stCol_2.xX);
    (MatDest)->stCol_1.xZ= MTH_M_xMulSubMul((MatA)->stCol_0.xZ,(MatA)->stCol_1.xX,(MatA)->stCol_0.xX,(MatA)->stCol_1.xZ);
    (MatDest)->stCol_2.xY= MTH_M_xMulSubMul((MatA)->stCol_0.xY,(MatA)->stCol_2.xX,(MatA)->stCol_0.xX,(MatA)->stCol_2.xY);
    (MatDest)->stCol_2.xZ= MTH_M_xMulSubMul((MatA)->stCol_0.xX,(MatA)->stCol_1.xY,(MatA)->stCol_0.xY,(MatA)->stCol_1.xX);

    /* scale every element by 1/determinant, in place */
    MTH3D_M_vMulScalarMatrix( MatDest, det, MatDest );
}
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/*
 * MTH3D_M_vInverMatrixORG : reference macro, *MatDest = inverse of *MatA.
 * Calls MTH3D_M_vComMatrixWithoutBuffer into Mat_Com, transposes that into Mat_Tmp,
 * then divides Mat_Tmp by MTH3D_M_xDetMatrix(MatA) into MatDest. MatDest is only
 * written by the last statement, after MatA has been read for the last time.
 * NOTE(review): det is not tested against zero before the division.
 */
/* Can surely be optimized in avoiding many recomputations done for nothing*/
/* Before modifying MTH3D_M_vDivScalarMatrix, it took 540 clocks 120 instructions !?!*/
#define MTH3D_M_vInverMatrixORG(MatDest, MatA) \
{ \
    MTH3D_tdstMatrix Mat_Tmp={0}; \
    MTH3D_tdstMatrix Mat_Com; \
    MTH_tdxReal det; \
    \
    MTH3D_M_vComMatrixWithoutBuffer(&Mat_Com, MatA); \
    MTH3D_M_vTranspMatrix(&Mat_Tmp, &Mat_Com ); \
    det=MTH3D_M_xDetMatrix( MatA ); \
    MTH3D_M_vDivScalarMatrix(MatDest, &Mat_Tmp, det ); \
}

/************************************************************************************************************************/
/* MTH3D_M_vTransformVectorWithoutBuffer*/
/************************************************************************************************************************/
/*
 * MTH3D_M_vTransformVectorWithoutBufferASM : *VectDest = (*MatA) x (*VectA) + (*VectB),
 * on the x87 FPU.
 *
 * The three components of VectB are loaded first and serve as the initial values
 * of the three running sums; the nine matrix x vector products are then added to
 * them exactly as in MTH3D_M_vMulMatrixVectorWithoutBufferASM. The three stores to
 * VectDest are the last instructions, after every load from VectA and VectB.
 * Accesses are "dword ptr" ones: assumes 32-bit float components - TODO confirm.
 */
/* 39 clocks (1 penalty) 41 instructions : 9.72 % pairing*/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
INLINE void MTH3D_M_vTransformVectorWithoutBufferASM(MTH3D_tdstVector *VectDest,MTH3D_tdstMatrix *MatA,MTH3D_tdstVector *VectA,MTH3D_tdstVector *VectB)
{
    __asm
    {
        mov ecx,VectB
        mov edx,VectDest
        mov ebx,VectA
        mov eax,MatA

        /* running sums start as VectB : stack = Bz By Bx */
        fld dword ptr [ecx]
        fld dword ptr [ecx+4]
        fld dword ptr [ecx+8]

        /* VectA->xX times column 0 */
        fld dword ptr [ebx]
        fmul dword ptr [eax]
        fld dword ptr [ebx]
        fmul dword ptr [eax+4]
        fld dword ptr [ebx]
        fmul dword ptr [eax+8]
        fxch st(2)
        faddp st(5),st
        faddp st(3),st
        faddp st(1),st

        /* VectA->xY times column 1 */
        fld dword ptr [ebx+4]
        fmul dword ptr [eax+12]
        fld dword ptr [ebx+4]
        fmul dword ptr [eax+16]
        fld dword ptr [ebx+4]
        fmul dword ptr [eax+20]
        fxch st(2)
        faddp st(5),st
        faddp st(3),st
        faddp st(1),st

        /* VectA->xZ times column 2 */
        fld dword ptr [ebx+8]
        fmul dword ptr [eax+24]
        fld dword ptr [ebx+8]
        fmul dword ptr [eax+28]
        fld dword ptr [ebx+8]
        fmul dword ptr [eax+32]
        fxch st(2)
        faddp st(5),st
        faddp st(3),st
        faddp st(1),st

        /* bring the X sum on top, then pop X, Y, Z into VectDest */
        fxch st(2)
        fstp dword ptr [edx]                /* 1 penalty : unavoidable*/
        fstp dword ptr [edx+4]
        fstp dword ptr [edx+8]
    }
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/

#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* no dedicated C variant: the plain-C optimised name maps onto the reference macro */
#define MTH3D_M_vTransformVectorWithoutBufferC MTH3D_M_vTransformVectorWithoutBufferORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/*
 * MTH3D_M_vTransformVectorWithoutBufferORG : reference macro,
 * *VectDest = (*MatA) x (*VectA), then *VectDest += *VectB.
 * VectB is read only after VectDest has been written, so VectDest must not be the
 * same object as VectB; whether it may be VectA depends on the
 * MTH3D_M_vMulMatrixVectorWithoutBuffer variant selected elsewhere.
 */
/* 79 clocks (30 penalties) 54 instructions : 3.70 % pairing*/
#define MTH3D_M_vTransformVectorWithoutBufferORG( VectDest, MatA, VectA, VectB) \
{   MTH3D_M_vMulMatrixVectorWithoutBuffer(VectDest, MatA, VectA); \
    MTH3D_M_vAddVector(VectDest,VectDest,VectB); \
}

#if defined(OPTIMIZED_FOR_U64_ASM)
/*
 * MTH3D_M_vTransformVectorWithoutBufferU64ASM : *VectDest = (*MatA) x (*VectA) + (*VectB)
 * in MIPS floating-point assembly (GCC extended asm; %0 = VectDest, %1 = MatA,
 * %2 = VectA, %3 = VectB).
 *
 * VectA is loaded into $f6/$f8/$f10; once each of these registers has been used for
 * the last time it is reloaded with the matching component of VectB. The first store
 * to VectDest comes after every load from VectA and VectB.
 * In the assembler comments Mat[r,c] is read at byte offset 12*c + 4*r from MatA.
 * The comment on the last add.s says "+ VctB[1]" but $f10 holds VctB[2] at that
 * point (loaded from 8(%3)); the string is left untouched.
 *
 * NOTE(review): the asm stores through %0 and loads through %1..%3 but declares
 * neither outputs nor a "memory" clobber; check that the compiler in use cannot
 * cache or reorder accesses to these objects around the statement.
 */
/* requires no temporary buffer if VectDest==VectA*/
static inline void MTH3D_M_vTransformVectorWithoutBufferU64ASM( struct MTH3D_tdstVector_ *VectDest, struct MTH3D_tdstMatrix_ *MatA, struct MTH3D_tdstVector_ *VectA, struct MTH3D_tdstVector_ *VectB)
{
    /* GROUMPF !!  PLEASE DO NOT REMOVE ASM IN LOWER CASE !!!!! (original author's warning; the reason is not visible here) */
    asm(" .set noreorder ");
    asm(" # Begin TransformVectorWithoutBufferU64ASM ");
    asm(
        " lwc1 $f6,0(%2) \n"
        " lwc1 $f8,4(%2) \n"
        " lwc1 $f10,8(%2) \n"
        " lwc1 $f0,0(%1) # f0 <- Mat[0,0] \n"
        " lwc1 $f2,12(%1) # f2 <- Mat[0,1] \n"
        " mul.s $f0,$f0,$f6 # f0 <- Mat[0,0] x Vect[0] \n"
        " lwc1 $f4,24(%1) # f4 <- Mat[0,2] \n"
        " mul.s $f2,$f2,$f8 # f2 <- Mat[0,1] x Vect[1] \n"
        " lwc1 $f12,4(%1) # f12 <- Mat[1,0] \n"
        " mul.s $f4,$f4,$f10 # f4 <- Mat[0,2] x Vect[2] \n"
        " lwc1 $f14,16(%1) # f14 <- Mat[1,1] \n"
        " mul.s $f12,$f12,$f6 # f12 <- Mat[1,0] x Vect[0] \n"
        " add.s $f0,$f0,$f2 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n"
        " lwc1 $f16,28(%1) # f16 <- Mat[1,2] \n"
        " mul.s $f14,$f14,$f8 # f14 <- Mat[1,1] x Vect[1] \n"
        " add.s $f0,$f0,$f4 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n"
        " lwc1 $f18,8(%1) # f18 <- Mat[2,0] \n"
        " mul.s $f16,$f16,$f10 # f16 <- Mat[1,2] x Vect[2] \n"
        " add.s $f12,$f12,$f14 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n"
        " lwc1 $f2,20(%1) # f2 <- Mat[2,1] \n"
        " mul.s $f18,$f18,$f6 # f18 <- Mat[2,0] x Vect[0] \n"
        " add.s $f12,$f12,$f16 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n"
        " lwc1 $f4,32(%1) # f4 <- Mat[2,2] \n"
        " mul.s $f2,$f2,$f8 # f2 <- Mat[2,1] x Vect[1] \n"
        " lwc1 $f6,0(%3) # f6 <- VctB[0] \n"
        " mul.s $f4,$f4,$f10 # f4 <- Mat[2,2] x Vect[2] \n"
        " add.s $f18,$f18,$f2 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n"
        " lwc1 $f8,4(%3) # f8 <- VctB[1] \n"
        " add.s $f18,$f18,$f4 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n"
        " lwc1 $f10,8(%3) # f10 <- VctB[2] \n"
        " add.s $f0,$f0,$f6 # f0 <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] + VctB[0] \n"
        " add.s $f12,$f12,$f8 # f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] + VctB[1] \n"
        " swc1 $f0,0(%0) # f0 -> Dest[0] \n"
        " add.s $f18,$f18,$f10 # f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] + VctB[1] \n"
        " swc1 $f12,4(%0) # f12 -> Dest[1] \n"
        " swc1 $f18,8(%0) # f18 -> Dest[2] \n"
        :
        : "r" (VectDest), "r" (MatA), "r" (VectA) , "r" (VectB)
        : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18" );
    asm(" # EndOf TransformVectorWithoutBufferU64ASM ");
    asm(" .set reorder ");
}
#endif /* OPTIMIZED_FOR_U64_ASM*/

#endif /* MTH_ASM_H*/