/* Olivier Couvreur : 3/98 */
 
#if !defined(MTH_ASM_H)
#define MTH_ASM_H
#include "acp_base.h"
 
/* force OPTIMIZED_FOR_PC_FLOATS if OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
	#if !defined(OPTIMIZED_FOR_PC_FLOATS)
		#define OPTIMIZED_FOR_PC_FLOATS
	#endif
#endif
 
/************************************************************************************************************************/
/* MTH3D_M_vAddVector*/
/************************************************************************************************************************/
/* MTH3D_M_vAddVectorASM : stores VectA + VectB, component by component, into VectDest (x87 inline assembly).*/
/* The three sums are all computed on the FPU stack before the first store, so VectDest may be the same object as VectA or VectB.*/
/* register load + 3 * (fld fadd) + 3 fstp = 2 + 3 * (1+1) + 3 * (2) = 14 clocks 13 instructions*/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
INLINE void MTH3D_M_vAddVectorASM(struct MTH3D_tdstVector_ *VectDest,struct MTH3D_tdstVector_ *VectA, struct MTH3D_tdstVector_ *VectB) 
{
	/* FPU stack states below are listed top of stack first.*/
	__asm						
	{					
		mov eax,VectA				/* eax = VectA*/
		mov ebx,VectB				/* ebx = VectB*/
		mov ecx,VectDest			/* ecx = VectDest*/
		fld  dword ptr [eax]		/*; (VectA)->xX*/
		fadd dword ptr [ebx]		/*; (VectB)->xX          st: X*/
		fld  dword ptr [eax+4]		/*; (VectA)->xY*/
		fadd dword ptr [ebx+4]		/*; (VectB)->xY          st: Y X*/
		fld  dword ptr [eax+8]		/*; (VectA)->xZ*/
		fadd dword ptr [ebx+8]		/*; (VectB)->xZ          st: Z Y X*/
		fxch st(2)					/* st: X Y Z : bring X on top so the pops store x, y, z in order*/
		fstp dword ptr [ecx]		/*; (VectDest)->xX*/
		fstp dword ptr [ecx+4]		/*; (VectDest)->xY*/
		fstp dword ptr [ecx+8]		/*; (VectDest)->xZ*/
	}
}
#endif /* #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated C optimisation : the C name maps to the original macro.*/
#define MTH3D_M_vAddVectorC MTH3D_M_vAddVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference version : each component (xX, xY, xZ) of VectDest receives MTH_M_xAdd of the matching components of VectA and VectB.*/
/* Expands to a brace-enclosed block; every argument appears three times in the expansion, so an argument with side effects is evaluated three times.*/
/* 3 * (fld fadd fstp) = 3 * (1+1+2+3pen) = 21 clocks 9 instructions*/
#define MTH3D_M_vAddVectorORG( VectDest, VectA, VectB)                          \
        {       (VectDest)->xX = MTH_M_xAdd((VectA)->xX, (VectB)->xX);          \
                (VectDest)->xY = MTH_M_xAdd((VectA)->xY, (VectB)->xY);          \
                (VectDest)->xZ = MTH_M_xAdd((VectA)->xZ, (VectB)->xZ); }
 
 
/************************************************************************************************************************/
/* MTH3D_M_vSubVector*/
/************************************************************************************************************************/
/* MTH3D_M_vSubVectorASM : stores VectA - VectB, component by component, into VectDest (x87 inline assembly).*/
/* The three differences are all computed on the FPU stack before the first store, so VectDest may be the same object as VectA or VectB.*/
/* register load + 3 * (fld fsub) + 3 fstp = 2 + 3 * (1+1) + 3 * (2) = 14 clocks 13 instructions*/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
INLINE void MTH3D_M_vSubVectorASM(struct MTH3D_tdstVector_ *VectDest,struct MTH3D_tdstVector_ *VectA, struct MTH3D_tdstVector_ *VectB) 
{
	/* FPU stack states below are listed top of stack first.*/
	__asm						
	{					
		mov eax,VectA				/* eax = VectA*/
		mov ebx,VectB				/* ebx = VectB*/
		mov ecx,VectDest			/* ecx = VectDest*/
		fld  dword ptr [eax]		/*; (VectA)->xX*/
		fsub dword ptr [ebx]		/*; (VectB)->xX          st: X*/
		fld  dword ptr [eax+4]		/*; (VectA)->xY*/
		fsub dword ptr [ebx+4]		/*; (VectB)->xY          st: Y X*/
		fld  dword ptr [eax+8]		/*; (VectA)->xZ*/
		fsub dword ptr [ebx+8]		/*; (VectB)->xZ          st: Z Y X*/
		fxch st(2)					/* st: X Y Z : bring X on top so the pops store x, y, z in order*/
		fstp dword ptr [ecx]		/*; (VectDest)->xX*/
		fstp dword ptr [ecx+4]		/*; (VectDest)->xY*/
		fstp dword ptr [ecx+8]		/*; (VectDest)->xZ*/
	}
}
#endif /* #if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated C optimisation : the C name maps to the original macro.*/
#define MTH3D_M_vSubVectorC		MTH3D_M_vSubVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference version : each component (xX, xY, xZ) of VectDest receives MTH_M_xSub of the matching components of VectA and VectB.*/
/* Expands to a brace-enclosed block; every argument appears three times in the expansion, so an argument with side effects is evaluated three times.*/
/* 3 * (fld fsub fstp) = 3 * (1+1+2+3pen) = 21 clocks 9 instructions*/
#define MTH3D_M_vSubVectorORG( VectDest, VectA, VectB)                          \
        {       (VectDest)->xX = MTH_M_xSub((VectA)->xX, (VectB)->xX);          \
                (VectDest)->xY = MTH_M_xSub((VectA)->xY, (VectB)->xY);          \
                (VectDest)->xZ = MTH_M_xSub((VectA)->xZ, (VectB)->xZ); }
 
 
/************************************************************************************************************************/
/* MTH3D_M_vNegVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* No dedicated assembly version : the ASM name maps to the integer-XOR macro MTH3D_M_vNegVectorC.*/
#define MTH3D_M_vNegVectorASM    MTH3D_M_vNegVectorC                                   
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/

#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* Each component of VectA is read through a long pointer, XORed with 0x80000000 and written to VectDest through a long pointer.*/
/* only toggle sign bit : No fpu*/
/* NOTE(review): assumes long and the component type are both 32 bits wide with the sign in bit 31 - confirm for every target that defines OPTIMIZED_FOR_PC_FLOATS.*/
/* Expands to a brace-enclosed block that declares a local named NegMask; VectDest and VectA each appear three times in the expansion.*/
/* 10 pairable instructions => 5 clocks*/
#define MTH3D_M_vNegVectorC( VectDest, VectA)                                        \
		{		long register NegMask=0x80000000;								     \
				*((long*) &((VectDest)->xX)) = *((long*) &((VectA)->xX )) ^ NegMask; \
				*((long*) &((VectDest)->xY)) = *((long*) &((VectA)->xY )) ^ NegMask; \
				*((long*) &((VectDest)->xZ)) = *((long*) &((VectA)->xZ )) ^ NegMask; \
		}                
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
 
/* Reference version : each component (xX, xY, xZ) of VectDest receives MTH_M_xNeg of the matching component of VectA.*/
/* Expands to a brace-enclosed block; VectDest and VectA each appear three times in the expansion.*/
/* 3 * ( fld fchs fstp) = 3 * (1 + 1 + 1+1pen) = 12 clocks 9 instructions*/
#define MTH3D_M_vNegVectorORG( VectDest, VectA)                                 \
        {       (VectDest)->xX = MTH_M_xNeg( (VectA)->xX );                     \
                (VectDest)->xY = MTH_M_xNeg( (VectA)->xY );                     \
                (VectDest)->xZ = MTH_M_xNeg( (VectA)->xZ ); }
 
 
/************************************************************************************************************************/
/* MTH3D_M_vAdd3ScalarVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* MTH3D_M_vAdd3ScalarVectorASM : adds x, y and z to the dwords at offsets 0, 4 and 8 of VectA and stores the sums at the same offsets of VectDest.*/
/* The scalars are read as 32-bit memory operands (dword ptr). All sums are computed before the first store, so VectDest may be the same object as VectA.*/
/* 14 clocks*/
INLINE void MTH3D_M_vAdd3ScalarVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA,MTH_tdxReal x,MTH_tdxReal y,MTH_tdxReal z)
{
	/* FPU stack states below are listed top of stack first.*/
	__asm
	{
		mov eax,VectA				/* eax = VectA*/
		mov ecx,VectDest			/* ecx = VectDest*/
		fld  dword ptr [eax]		
		fadd dword ptr [x]			/* st: X*/
		fld  dword ptr [eax+4]		
		fadd dword ptr [y]			/* st: Y X*/
		fld  dword ptr [eax+8]		
		fadd dword ptr [z]			/* st: Z Y X*/
		fxch st(2)					/* st: X Y Z*/
		fstp dword ptr [ecx]		/* store X at offset 0*/
		fstp dword ptr [ecx+4]		/* store Y at offset 4*/
		fstp dword ptr [ecx+8]		/* store Z at offset 8*/

	}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated C optimisation : the C name maps to the original macro.*/
#define MTH3D_M_vAdd3ScalarVectorC			MTH3D_M_vAdd3ScalarVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference version : VectDest->xX/xY/xZ receive MTH_M_xAdd of VectA->xX/xY/xZ with x/y/z respectively.*/
/* Expands to a brace-enclosed block; VectDest and VectA appear three times in the expansion, x, y and z once each.*/
/*  21 clocks*/
#define MTH3D_M_vAdd3ScalarVectorORG( VectDest, VectA, x, y, z)                 \
        {       (VectDest)->xX = MTH_M_xAdd( (VectA)->xX, (x) );                \
                (VectDest)->xY = MTH_M_xAdd( (VectA)->xY, (y) );                \
                (VectDest)->xZ = MTH_M_xAdd( (VectA)->xZ, (z) ); }
 
 
/************************************************************************************************************************/
/* MTH3D_M_vMulScalarVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* MTH3D_M_vMulScalarVectorASM : multiplies the dwords at offsets 0, 4 and 8 of VectA by a and stores the products at the same offsets of VectDest.*/
/* a is read as a 32-bit memory operand (dword ptr). All products are computed before the first store, so VectDest may be the same object as VectA.*/
/* 14 clocks*/
INLINE void MTH3D_M_vMulScalarVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal a,MTH3D_tdstVector *VectA)
{
	/* FPU stack states below are listed top of stack first.*/
	__asm
	{
		mov eax,VectA				/* eax = VectA*/
		mov ebx,VectDest			/* ebx = VectDest (destination is in ebx here, not ecx)*/
		fld  dword ptr [eax]
		fmul dword ptr [a]			/* st: X*/
		fld  dword ptr [eax+4]
		fmul dword ptr [a]			/* st: Y X*/
		fld  dword ptr [eax+8]
		fmul dword ptr [a]			/* st: Z Y X*/
		fxch st(2)					/* st: X Y Z*/
		fstp dword ptr [ebx]		/* store X at offset 0*/
		fstp dword ptr [ebx+4]		/* store Y at offset 4*/
		fstp dword ptr [ebx+8]		/* store Z at offset 8*/
	}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* C version : copies (a) once into a local temporary, then assigns MTH_M_xMul(temporary, component) to each of xX, xY, xZ.*/
/* Unlike MTH3D_M_vMulScalarVectorORG, the argument a is evaluated only once. VectDest and VectA still appear three times in the expansion.*/
/* Expands to a brace-enclosed block declaring a local named xTempMTH3D_M_vMulScalarVectorC.*/
#define MTH3D_M_vMulScalarVectorC( VectDest, a, VectA)                          \
				{ register MTH_tdxReal xTempMTH3D_M_vMulScalarVectorC=(a);		\
		        (VectDest)->xX = MTH_M_xMul(xTempMTH3D_M_vMulScalarVectorC, (VectA)->xX);                \
                (VectDest)->xY = MTH_M_xMul(xTempMTH3D_M_vMulScalarVectorC, (VectA)->xY);                \
                (VectDest)->xZ = MTH_M_xMul(xTempMTH3D_M_vMulScalarVectorC, (VectA)->xZ); }
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
 
 
/* Reference version : each component (xX, xY, xZ) of VectDest receives MTH_M_xMul of (a) and the matching component of VectA.*/
/* Expands to a brace-enclosed block; every argument, including a, appears three times in the expansion.*/
#define MTH3D_M_vMulScalarVectorORG( VectDest, a, VectA)                        \
        {       (VectDest)->xX = MTH_M_xMul((a), (VectA)->xX);                  \
                (VectDest)->xY = MTH_M_xMul((a), (VectA)->xY);                  \
                (VectDest)->xZ = MTH_M_xMul((a), (VectA)->xZ); }
 
/************************************************************************************************************************/
/* MTH3D_M_vDivScalarVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* No dedicated assembly version : the ASM name maps to the one-division macro MTH3D_M_vDivScalarVectorC.*/
#define MTH3D_M_vDivScalarVectorASM		MTH3D_M_vDivScalarVectorC
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/

#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* Computes MTH_M_xDiv(MTH_C_ONE, (a)) once into a local temporary, then passes it to MTH3D_M_vMulScalarVector (defined outside this section).*/
/* Note the argument order differs : (VectDest, VectA, a) here versus (VectDest, a, VectA) for the multiply.*/
/* NOTE(review): multiplying by a reciprocal presumably rounds differently from the per-component division in MTH3D_M_vDivScalarVectorORG - confirm callers tolerate it.*/
/* Only one division*/
#define MTH3D_M_vDivScalarVectorC( VectDest, VectA, a)                          \
        {  register MTH_tdxReal xTempMTH3D_M_vDivScalarVectorC=MTH_M_xDiv(MTH_C_ONE, (a));				\
		   MTH3D_M_vMulScalarVector( VectDest ,xTempMTH3D_M_vDivScalarVectorC, VectA);	}
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
 
/* Reference version : each component (xX, xY, xZ) of VectDest receives MTH_M_xDiv of the matching component of VectA and (a).*/
/* Expands to a brace-enclosed block; every argument, including a, appears three times in the expansion.*/
#define MTH3D_M_vDivScalarVectorORG( VectDest, VectA, a)                        \
        {       (VectDest)->xX = MTH_M_xDiv((VectA)->xX, (a));                  \
                (VectDest)->xY = MTH_M_xDiv((VectA)->xY, (a));                  \
                (VectDest)->xZ = MTH_M_xDiv((VectA)->xZ, (a)); }
 
 
/************************************************************************************************************************/
/* MTH3D_M_vScaleVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* MTH3D_M_vScaleVectorASM : component-wise product. The dwords at offsets 0, 4 and 8 of VectA are multiplied by those of VectB and stored in VectDest.*/
/* All products are computed before the first store, so VectDest may be the same object as VectA or VectB.*/
/* 14 clocks*/
INLINE void MTH3D_M_vScaleVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA,MTH3D_tdstVector *VectB)
{
	/* FPU stack states below are listed top of stack first.*/
	__asm
	{
		mov eax,VectA				/* eax = VectA*/
		mov ebx,VectB				/* ebx = VectB*/
		mov ecx,VectDest			/* ecx = VectDest*/

		fld  dword ptr [eax]
		fmul dword ptr [ebx]		/* st: X*/
		fld  dword ptr [eax+4]
		fmul dword ptr [ebx+4]		/* st: Y X*/
		fld  dword ptr [eax+8]
		fmul dword ptr [ebx+8]		/* st: Z Y X*/
		fxch st(2)					/* st: X Y Z*/
		fstp dword ptr [ecx]		/* store X at offset 0*/
		fstp dword ptr [ecx+4]		/* store Y at offset 4*/
		fstp dword ptr [ecx+8]		/* store Z at offset 8*/
	}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated C optimisation : the C name maps to the original macro.*/
#define MTH3D_M_vScaleVectorC			MTH3D_M_vScaleVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference version : each component (xX, xY, xZ) of VectDest receives MTH_M_xMul of the matching components of VectA and VectB.*/
/* Expands to a brace-enclosed block; every argument appears three times in the expansion.*/
/* 21 clocks*/
#define MTH3D_M_vScaleVectorORG( VectDest, VectA, VectB )               \
        {       (VectDest)->xX = MTH_M_xMul( (VectA)->xX, (VectB)->xX); \
                (VectDest)->xY = MTH_M_xMul( (VectA)->xY, (VectB)->xY); \
                (VectDest)->xZ = MTH_M_xMul( (VectA)->xZ, (VectB)->xZ); }            
 
 
/************************************************************************************************************************/
/* MTH3D_M_vMulAddVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* MTH3D_M_vMulAddVectorASM : for each of the dwords at offsets 0, 4 and 8, stores x * VectA + VectB into VectDest.*/
/* x is read as a 32-bit memory operand (dword ptr). All results are computed before the first store, so VectDest may be the same object as VectA or VectB.*/
/* 17 clocks : D=xA+B*/
INLINE void MTH3D_M_vMulAddVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal x,MTH3D_tdstVector *VectA,MTH3D_tdstVector *VectB)
{
	/* FPU stack states below are listed top of stack first; xAx = x * A.x, Dx = xAx + B.x, and so on.*/
	__asm
	{
		mov eax,VectA				/* eax = VectA*/
		mov ebx,VectB				/* ebx = VectB*/
		mov ecx,VectDest			/* ecx = VectDest*/

		fld  dword ptr [eax]
		fmul dword ptr [x]			/* xAx*/
		fld  dword ptr [eax+4]
		fmul dword ptr [x]			/* xAy xAx*/
		fxch st(1)					/* xAx xAy*/
		fld  dword ptr [eax+8]
		fmul dword ptr [x]			/* xAz xAx xAy*/
		fxch st(1)					/* xAx xAz xAy*/
		fadd dword ptr [ebx]		/* Dx xAz xAy*/
		fxch st(2)					/* xAy xAz Dx*/
		fadd dword ptr [ebx+4]		/* Dy xAz Dx*/
		fxch st(1)					/* xAz Dy Dx*/
		fadd dword ptr [ebx+8]		/* Dz Dy Dx*/
		fxch st(2)					/* Dx Dy Dz*/
		fstp dword ptr [ecx]		/* 1 pen*/
		fstp dword ptr [ecx+4]		
		fstp dword ptr [ecx+8]
	}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated C optimisation : the C name maps to the original macro.*/
#define MTH3D_M_vMulAddVectorC			MTH3D_M_vMulAddVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference version : each component of VectDest receives MTH_M_xAdd(MTH_M_xMul((x), VectA component), VectB component).*/
/* Expands to a brace-enclosed block; every argument, including x, appears three times in the expansion.*/
/* 31 clocks*/
#define MTH3D_M_vMulAddVectorORG( VectDest, x,  VectA, VectB)                           \
      { (VectDest)->xX = MTH_M_xAdd(MTH_M_xMul((x),(VectA)->xX), (VectB)->xX);          \
        (VectDest)->xY = MTH_M_xAdd(MTH_M_xMul((x),(VectA)->xY), (VectB)->xY);          \
        (VectDest)->xZ = MTH_M_xAdd(MTH_M_xMul((x),(VectA)->xZ), (VectB)->xZ); }
 
 
/************************************************************************************************************************/
/* MTH3D_M_vMul3AddVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* MTH3D_M_vMul3AddVectorASM : for each of the dwords at offsets 0, 4 and 8, stores x*VectA + y*VectB + z*VectC into VectDest.*/
/* x, y and z are read as 32-bit memory operands (dword ptr). All results are computed before the first store, so VectDest may be the same object as any source vector.*/
/* The FPU stack holds at most six values at any point.*/
/*  32 clocks (2 penalties) : D=xA+yB+zC*/
INLINE void MTH3D_M_vMul3AddVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal x,MTH3D_tdstVector *VectA,MTH_tdxReal y,MTH3D_tdstVector *VectB,MTH_tdxReal z,MTH3D_tdstVector *VectC)
{
	/* FPU stack states below are listed top of stack first; S* are the partial sums xA+yB, D* the final results.*/
	__asm
	{
		mov eax,VectA				/* eax = VectA*/
		mov ebx,VectB				/* ebx = VectB*/
		mov ecx,VectC				/* ecx = VectC*/
		mov edx,VectDest			/* edx = VectDest*/

		fld  dword ptr [eax]
		fmul dword ptr [x]			/* xAx*/
		fld  dword ptr [eax+4]
		fmul dword ptr [x]			/* xAy xAx*/
		fld  dword ptr [eax+8]
		fmul dword ptr [x]			/* xAz xAy xAx*/
		fxch st(2)					/* xAx xAy xAz*/

		fld  dword ptr [ebx]
		fmul dword ptr [y]
		fld  dword ptr [ebx+4]
		fmul dword ptr [y]
		fld  dword ptr [ebx+8]
		fmul dword ptr [y]			/* yBz yBy yBx xAx xAy xAz*/
		fxch st(2)					/* yBx yBy yBz xAx xAy xAz*/

		faddp st(3),st				/* yBy yBz Sx xAy xAz*/
		faddp st(3),st				/* yBz Sx Sy xAz*/
		fld  dword ptr [ecx]
		fmul dword ptr [z]			/* zCx yBz Sx Sy xAz*/
		fxch st(1)					/* yBz zCx Sx Sy xAz*/
		faddp st(4),st				/* zCx Sx Sy Sz*/
		fld  dword ptr [ecx+4]
		fmul dword ptr [z]			/* zCy zCx Sx Sy Sz*/
		fld  dword ptr [ecx+8]
		fmul dword ptr [z]			/* zCz zCy zCx Sx Sy Sz*/
		fxch st(2)					/* zCx zCy zCz Sx Sy Sz*/

		faddp st(3),st				/* zCy zCz Dx Sy Sz*/
		faddp st(3),st				/* zCz Dx Dy Sz*/
		faddp st(3),st				/* 1 pen      Dx Dy Dz*/

		fstp dword ptr [edx]		/* 1 pen*/
		fstp dword ptr [edx+4]		
		fstp dword ptr [edx+8]
	}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated C optimisation : the C name maps to the original macro.*/
#define MTH3D_M_vMul3AddVectorC			MTH3D_M_vMul3AddVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference version : D = xA + yB + zC, one MTH_M_xMulAddMulAddMul call per component (xX, xY, xZ).*/
/* MTH_M_xMulAddMulAddMul takes six arguments (a1,b1, a2,b2, a3,b3), as in MTH3D_M_vMulMatrixMatrixWithoutBufferORG.*/
/* Expands to a brace-enclosed block; every argument appears three times in the expansion, so an argument with side effects is evaluated three times.*/
/* at least 42 clocks*/
#define MTH3D_M_vMul3AddVectorORG( VectDest, x,  VectA, y, VectB, z, VectC)									\
      { (VectDest)->xX = MTH_M_xMulAddMulAddMul((x),(VectA)->xX, (y),(VectB)->xX, (z),(VectC)->xX);      \
        (VectDest)->xY = MTH_M_xMulAddMulAddMul((x),(VectA)->xY, (y),(VectB)->xY, (z),(VectC)->xY);      \
        (VectDest)->xZ = MTH_M_xMulAddMulAddMul((x),(VectA)->xZ, (y),(VectB)->xZ, (z),(VectC)->xZ); }
 
/************************************************************************************************************************/
/* MTH3D_M_vMul4AddVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* MTH3D_M_vMul4AddVectorASM : for each of the dwords at offsets 0, 4 and 8, stores x*VectA + y*VectB + z*VectC + w*VectD into VectDest.*/
/* The scalars are read as 32-bit memory operands (dword ptr). All results are computed before the first store, so VectDest may be the same object as any source vector.*/
/* edx first holds VectD and is reloaded with VectDest once the last VectD component has been read.*/
/*  40 clocks (0 penalties) : E=xA+yB+zC+wD*/
INLINE void MTH3D_M_vMul4AddVectorASM(MTH3D_tdstVector *VectDest,MTH_tdxReal x,MTH3D_tdstVector *VectA,MTH_tdxReal y,MTH3D_tdstVector *VectB,MTH_tdxReal z,MTH3D_tdstVector *VectC,MTH_tdxReal w,MTH3D_tdstVector *VectD)
{
	/* FPU stack states below are listed top of stack first; S* = xA+yB, T* = xA+yB+zC, E* = final results.*/
	__asm
	{
		mov eax,VectA				/* eax = VectA*/
		mov ebx,VectB				/* ebx = VectB*/
		mov ecx,VectC				/* ecx = VectC*/
		mov edx,VectD				/* edx = VectD (for now)*/

		fld  dword ptr [eax]
		fmul dword ptr [x]
		fld  dword ptr [eax+4]
		fmul dword ptr [x]
		fld  dword ptr [eax+8]
		fmul dword ptr [x]			/* xAz xAy xAx*/
		fxch st(2)					/* xAx xAy xAz*/

		fld  dword ptr [ebx]
		fmul dword ptr [y]
		fld  dword ptr [ebx+4]
		fmul dword ptr [y]
		fld  dword ptr [ebx+8]
		fmul dword ptr [y]			/* yBz yBy yBx xAx xAy xAz*/
		fxch st(2)					/* yBx yBy yBz xAx xAy xAz*/

		faddp st(3),st				/* yBy yBz Sx xAy xAz*/
		faddp st(3),st				/* yBz Sx Sy xAz*/
		fld  dword ptr [ecx]
		fmul dword ptr [z]			/* zCx yBz Sx Sy xAz*/
		fxch st(1)					/* yBz zCx Sx Sy xAz*/
		faddp st(4),st				/* zCx Sx Sy Sz*/
		fld  dword ptr [ecx+4]
		fmul dword ptr [z]
		fld  dword ptr [ecx+8]
		fmul dword ptr [z]			/* zCz zCy zCx Sx Sy Sz*/
		fxch st(2)					/* zCx zCy zCz Sx Sy Sz*/

		faddp st(3),st				/* zCy zCz Tx Sy Sz*/
		faddp st(3),st				/* zCz Tx Ty Sz*/
		fld  dword ptr [edx]
		fmul dword ptr [w]			/* wDx zCz Tx Ty Sz*/
		fxch st(1)					/* zCz wDx Tx Ty Sz*/
		faddp st(4),st				/* wDx Tx Ty Tz*/
		fld  dword ptr [edx+4]
		fmul dword ptr [w]
		fld  dword ptr [edx+8]
		fmul dword ptr [w]			/* wDz wDy wDx Tx Ty Tz*/
		fxch st(2)					/* wDx wDy wDz Tx Ty Tz*/

		faddp st(3),st				/* wDy wDz Ex Ty Tz*/
		faddp st(3),st				/* wDz Ex Ey Tz*/
		mov edx,VectDest			/* VectD fully read : edx now holds the destination*/
		faddp st(3),st				/* Ex Ey Ez*/
		
		fstp dword ptr [edx]		
		fstp dword ptr [edx+4]		
		fstp dword ptr [edx+8]
	}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated C optimisation : the C name maps to the original macro.*/
#define MTH3D_M_vMul4AddVectorC			MTH3D_M_vMul4AddVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference version : E = xA + yB + zC + wD. Per component, a six-argument MTH_M_xMulAddMulAddMul call covers the first three terms*/
/* and MTH_M_xAdd adds MTH_M_xMul((w), VectD component) to it.*/
/* Expands to a brace-enclosed block; every argument appears three times in the expansion, so an argument with side effects is evaluated three times.*/
/* at least 56 clocks*/
#define MTH3D_M_vMul4AddVectorORG( VectDest, x,  VectA, y, VectB, z, VectC, w, VectD)																\
      { (VectDest)->xX = MTH_M_xAdd(MTH_M_xMulAddMulAddMul((x),(VectA)->xX, (y),(VectB)->xX, (z),(VectC)->xX),MTH_M_xMul((w),(VectD)->xX));      \
        (VectDest)->xY = MTH_M_xAdd(MTH_M_xMulAddMulAddMul((x),(VectA)->xY, (y),(VectB)->xY, (z),(VectC)->xY),MTH_M_xMul((w),(VectD)->xY));      \
        (VectDest)->xZ = MTH_M_xAdd(MTH_M_xMulAddMulAddMul((x),(VectA)->xZ, (y),(VectB)->xZ, (z),(VectC)->xZ),MTH_M_xMul((w),(VectD)->xZ)); }
 
 
/************************************************************************************************************************/
/* MTH3D_M_vLinearInterpolVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* MTH3D_M_vLinearInterpolVectorASM : stores A + t * (B - A) into VectDest for each of the dwords at offsets 0, 4 and 8.*/
/* t is read as a 32-bit memory operand (dword ptr). All results are computed before the first store, so VectDest may be the same object as VectA or VectB.*/
/* 21 clocks*/
INLINE void MTH3D_M_vLinearInterpolVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA, MTH3D_tdstVector *VectB, MTH_tdxReal t)
{
	/* Cx=Ax+t(Bx-Ax)=Ax+tDx*/
	/* Cy=Ay+t(By-Ay)=Ay+tDy*/
	/* Cz=Az+t(Bz-Az)=Az+tDz*/
	/* The trailing comments give the FPU stack after each instruction, top of stack first.*/
	__asm
	{
		mov ebx,VectB				/* ebx = VectB*/
		mov eax,VectA				/* eax = VectA*/
		mov ecx,VectDest			/* ecx = VectDest*/

		fld  dword ptr [ebx]		/* Bx*/
		fsub dword ptr [eax]		/* Dx*/
		fld  dword ptr [ebx+4]		/* By Dx*/
		fsub dword ptr [eax+4]		/* Dy Dx*/
		fld  dword ptr [ebx+8]		/* Bz Dy Dx*/
		fsub dword ptr [eax+8]		/* Dz Dy Dx*/
		fxch st(2)					/* Dx Dy Dz*/
		fmul dword ptr [t]			/* tDx Dy Dz*/
		fld  dword ptr [eax+4]		/* Ay tDx Dy Dz*/
		fxch st(2)					/* Dy tDx Ay Dz*/
		fmul dword ptr [t]			/* tDy tDx Ay Dz*/
		fld  dword ptr [eax]		/* Ax tDy tDx Ay Dz*/
		fxch st(4)					/* Dz tDy tDx Ay Ax*/
		fmul dword ptr [t]			/* tDz tDy tDx Ay Ax*/
		fxch st(2)					/* tDx tDy tDz Ay Ax*/
		faddp st(4),st				/* tDy tDz Ay Cx*/
		faddp st(2),st				/* tDz Cy Cx*/
		fld  dword ptr [eax+8]      /* Az tDz Cy Cx	*/
		faddp st(1),st				/* Cz Cy Cx*/
		fxch st(2)					/* Cx Cy Cz*/
		fstp dword ptr [ecx]		/* Cy Cz*/
		fstp dword ptr [ecx+4]		/* Cz*/
		fstp dword ptr [ecx+8]		/* FPU stack back to its entry depth*/
	}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated C optimisation : the C name maps to the original macro.*/
#define MTH3D_M_vLinearInterpolVectorC			MTH3D_M_vLinearInterpolVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference version : each component of VectDest receives MTH_M_xLinearInterpol(VectA component, VectB component, (t)).*/
/* MTH_M_xLinearInterpol is defined elsewhere; presumably A + t * (B - A), as in the assembly version - confirm.*/
/* Expands to a brace-enclosed block; every argument, including t, appears three times in the expansion.*/
/* 39 clocks */
#define MTH3D_M_vLinearInterpolVectorORG( VectDest, VectA, VectB, t )				    \
	{	(VectDest)->xX =	MTH_M_xLinearInterpol( (VectA)->xX, (VectB)->xX, (t) );		\
		(VectDest)->xY =	MTH_M_xLinearInterpol( (VectA)->xY, (VectB)->xY, (t) );		\
		(VectDest)->xZ =	MTH_M_xLinearInterpol( (VectA)->xZ, (VectB)->xZ, (t) );		\
	}
 
 
/************************************************************************************************************************/
/* MTH3D_M_vLinearScaleVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* MTH3D_M_vLinearScaleVectorASM : per-component interpolation A + C * (B - A), with a separate factor per component taken from VectC.*/
/* All results are computed before the first store, so VectDest may be the same object as any source vector.*/
/* 21 clocks*/
INLINE void MTH3D_M_vLinearScaleVectorASM(MTH3D_tdstVector *VectDest,MTH3D_tdstVector *VectA, MTH3D_tdstVector *VectB, MTH3D_tdstVector *VectC)
{
	/* x=Ax+Cx.(Bx-Ax)=Ax+Cx.Dx*/
	/* y=Ay+Cy.(By-Ay)=Ay+Cy.Dy*/
	/* z=Az+Cz.(Bz-Az)=Az+Cz.Dz*/
	/* The trailing comments give the FPU stack after each instruction, top of stack first.*/
	__asm
	{
		mov ebx,VectB				/* ebx = VectB*/
		mov eax,VectA				/* eax = VectA*/
		mov ecx,VectDest			/* ecx = VectDest*/
		mov edx,VectC				/* edx = VectC*/

		fld  dword ptr [ebx]		/* Bx*/
		fsub dword ptr [eax]		/* Dx*/
		fld  dword ptr [ebx+4]		/* By Dx*/
		fsub dword ptr [eax+4]		/* Dy Dx*/
		fld  dword ptr [ebx+8]		/* Bz Dy Dx*/
		fsub dword ptr [eax+8]		/* Dz Dy Dx*/
		fxch st(2)					/* Dx Dy Dz*/
		fmul dword ptr [edx]		/* CxDx Dy Dz*/
		fld  dword ptr [eax+4]		/* Ay CxDx Dy Dz*/
		fxch st(2)					/* Dy CxDx Ay Dz*/
		fmul dword ptr [edx+4]		/* CyDy CxDx Ay Dz*/
		fld  dword ptr [eax]		/* Ax CyDy CxDx Ay Dz*/
		fxch st(4)					/* Dz CyDy CxDx Ay Ax*/
		fmul dword ptr [edx+8]		/* CzDz CyDy CxDx Ay Ax*/
		fxch st(2)					/* CxDx CyDy CzDz Ay Ax*/
		faddp st(4),st				/* CyDy CzDz Ay x*/
		faddp st(2),st				/* CzDz y x*/
		fld  dword ptr [eax+8]      /* Az CzDz y x	*/
		faddp st(1),st				/* z y x*/
		fxch st(2)					/* x y z*/
		fstp dword ptr [ecx]		/* y z*/
		fstp dword ptr [ecx+4]		/* z*/
		fstp dword ptr [ecx+8]		/* FPU stack back to its entry depth*/
	}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated C optimisation : the C name maps to the original macro.*/
#define MTH3D_M_vLinearScaleVectorC			MTH3D_M_vLinearScaleVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference version : each component of VectDest receives MTH_M_xLinearInterpol(VectA component, VectB component, VectC component).*/
/* The interpolation factor therefore differs per component and comes from VectC.*/
/* Expands to a brace-enclosed block; every argument appears three times in the expansion.*/
/* 39 clocks */
#define MTH3D_M_vLinearScaleVectorORG( VectDest, VectA, VectB, VectC )				    \
	{	(VectDest)->xX =	MTH_M_xLinearInterpol( (VectA)->xX, (VectB)->xX, (VectC)->xX );		\
		(VectDest)->xY =	MTH_M_xLinearInterpol( (VectA)->xY, (VectB)->xY, (VectC)->xY );		\
		(VectDest)->xZ =	MTH_M_xLinearInterpol( (VectA)->xZ, (VectB)->xZ, (VectC)->xZ );		\
	}
 
/************************************************************************************************************************/
/* MTH3D_M_xDotProductVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* MTH3D_M_xDotProductVectorASM : returns the sum of the three products of the dwords at offsets 0, 4 and 8 of VectA and VectB.*/
/* The result is popped from the FPU stack into the local xDot and returned through the ordinary C return statement.*/
/* NOTE(review): warning 4035 is disabled around the function although it does execute return(xDot) - the pragma pair looks unnecessary; confirm before removing.*/
/* 12 clocks (2 penalties) 11 instructions*/
#pragma warning(disable:4035)
INLINE
MTH_tdxReal MTH3D_M_xDotProductVectorASM(struct MTH3D_tdstVector_ *VectA,struct MTH3D_tdstVector_ *VectB)
{
	register MTH_tdxReal xDot;
	/* FPU stack states below are listed top of stack first; Px = Ax*Bx, Py = Ay*By, Pz = Az*Bz.*/
	__asm
	{
		mov eax,VectA				/* eax = VectA*/
		mov ebx,VectB				/* ebx = VectB*/
		fld  dword ptr [eax]
		fmul dword ptr [ebx]		/* Px*/
		fld  dword ptr [eax+4]
		fmul dword ptr [ebx+4]		/* Py Px*/
		fld  dword ptr [eax+8]
		fmul dword ptr [ebx+8]		/* Pz Py Px*/
		fxch st(1)					/* Py Pz Px*/
		faddp st(2),st				/* Pz (Px+Py)*/
		faddp st(1),st	/* 2 unavoidable penalties      (Px+Py+Pz)*/
		fstp [xDot]					/* pop the result into xDot; operand size comes from the variable's type*/
	}
	return(xDot);
}
#pragma warning(default:4035)
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated C optimisation : the C name maps to the original macro.*/
#define MTH3D_M_xDotProductVectorC	MTH3D_M_xDotProductVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference version : an expression (not a block) of the form xAdd( xAdd( xMul(Ax,Bx), xMul(Ay,By) ), xMul(Az,Bz) ).*/
/* VectA and VectB each appear three times in the expansion.*/
/* 15 clocks (5 penalties) 12 instructions*/
#define MTH3D_M_xDotProductVectorORG( VectA, VectB)                             \
	      MTH_M_xAdd(                                                     \
                 MTH_M_xAdd(                                                    \
                MTH_M_xMul((VectA)->xX, (VectB)->xX),                           \
            MTH_M_xMul((VectA)->xY, (VectB)->xY)                                \
                           ),                                                   \
			MTH_M_xMul((VectA)->xZ, (VectB)->xZ) )   
 
 
/************************************************************************************************************************/
/* MTH3D_M_vCrossProductVectorWithoutBuffer*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* MTH3D_M_vCrossProductVectorWithoutBufferASM : stores the cross product VectA x VectB into VectDest :*/
/*   X = Ay*Bz - Az*By,  Y = Az*Bx - Ax*Bz,  Z = Ax*By - Ay*Bx*/
/* All six products and three differences are computed on the FPU stack before the first store, hence :*/
/* requires no temp buffer if VectA or VectB == VectDest*/
/* 24 clocks (1 penalty) 23 instructions : 8.70 % pairing*/
INLINE void MTH3D_M_vCrossProductVectorWithoutBufferASM(struct MTH3D_tdstVector_ *VectDest,struct MTH3D_tdstVector_ *VectA,struct MTH3D_tdstVector_ *VectB)
{
	/* FPU stack states below are listed top of stack first.*/
	__asm
	{
		mov eax,VectA				/* eax = VectA*/
		mov ebx,VectB				/* ebx = VectB*/
		mov edx,VectDest			/* edx = VectDest*/

		fld  dword ptr [eax+4]
		fmul dword ptr [ebx+8]		/* AyBz*/
		fld  dword ptr [eax+8]
		fmul dword ptr [ebx]		/* AzBx AyBz*/
		fld  dword ptr [eax]
		fmul dword ptr [ebx+4]		/* AxBy AzBx AyBz*/
		fld  dword ptr [eax+8]
		fmul dword ptr [ebx+4]		/* AzBy AxBy AzBx AyBz*/
		fld  dword ptr [eax]
		fmul dword ptr [ebx+8]		/* AxBz AzBy AxBy AzBx AyBz*/
		fld  dword ptr [eax+4]
		fmul dword ptr [ebx]		/* AyBx AxBz AzBy AxBy AzBx AyBz*/
		fxch st(2)					/* AzBy AxBz AyBx AxBy AzBx AyBz*/
		fsubp st(5),st				/* AxBz AyBx AxBy AzBx X*/
		fsubp st(3),st				/* AyBx AxBy Y X*/
		fsubp st(1),st				/* Z Y X*/
		fxch st(2)					/* X Y Z*/
		fstp  dword ptr [edx]		/* 1 penalty here : unavoidable !*/
		fstp  dword ptr [edx+4]
		fstp  dword ptr [edx+8]
	}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated C optimisation : the C name maps to the original macro.*/
#define MTH3D_M_vCrossProductVectorWithoutBufferC MTH3D_M_vCrossProductVectorWithoutBufferORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference version : cross product VectA x VectB, one MTH_M_xMulSubMul call per component, assigned in the order xX, xY, xZ.*/
/* VectDest->xX is written before VectA->xX and VectB->xX are read for the xY and xZ lines, so VectDest must not be the same object as VectA or VectB.*/
/* Expands to a brace-enclosed block; VectA and VectB each appear six times in the expansion, VectDest three times.*/
/* 42 clocks (18 penalties) 25 instructions : 8.00 % pairing*/
#define MTH3D_M_vCrossProductVectorWithoutBufferORG(VectDest, VectA, VectB)               \
    { (VectDest)->xX=MTH_M_xMulSubMul((VectA)->xY,(VectB)->xZ,(VectA)->xZ,(VectB)->xY);   \
      (VectDest)->xY=MTH_M_xMulSubMul((VectA)->xZ,(VectB)->xX,(VectA)->xX,(VectB)->xZ);   \
      (VectDest)->xZ=MTH_M_xMulSubMul((VectA)->xX,(VectB)->xY,(VectA)->xY,(VectB)->xX); }
 
/************************************************************************************************************************/
/* MTH3D_M_vCrossProductVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
#define MTH3D_M_vCrossProductVectorASM	  MTH3D_M_vCrossProductVectorWithoutBufferASM
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vCrossProductVectorC	  MTH3D_M_vCrossProductVectorWithoutBufferC
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
 
 
/* Reference version of the general cross product : VectDest = VectA x VectB.*/
/* When VectDest is the same pointer as VectA or VectB, the product is computed into a local temporary and then copied, because*/
/* MTH3D_M_vCrossProductVectorWithoutBuffer may overwrite a source component before it has been fully read; otherwise it is computed in place.*/
/* The macro arguments are parenthesized in the pointer comparisons so that arguments containing lower-precedence operators*/
/* (for example a conditional expression) are compared as a whole.*/
/* Expands to a brace-enclosed block; each argument appears several times in the expansion, so arguments must be free of side effects.*/
#define MTH3D_M_vCrossProductVectorORG(VectDest, VectA, VectB)                      \
	{	if( ((VectDest)==(VectA)) || ((VectDest)==(VectB)) )                        \
            {                                                                       \
             MTH3D_tdstVector VectTmp;                                              \
             MTH3D_M_vCrossProductVectorWithoutBuffer(&VectTmp, (VectA), (VectB));  \
             MTH3D_M_vCopyVector((VectDest), &VectTmp);                             \
            }                                                                       \
        else                                                                        \
            {                                                                       \
              MTH3D_M_vCrossProductVectorWithoutBuffer((VectDest), (VectA), (VectB)); \
            }																		\
	}	
 
/************************************************************************************************************************/
/* MTH3D_M_vMulMatrixMatrixWithoutBuffer*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* MTH3D_M_vMulMatrixMatrixWithoutBufferASM : 3x3 matrix product stored into MatDest.*/
/* Notation : A[n], B[n], D[n] are the dwords at byte offset 4*n of MatA, MatB and MatDest (n = 0..8).*/
/* For j = 0..2 and i = 0..2 the code computes  D[3j+i] = A[i]*B[3j] + A[3+i]*B[3j+1] + A[6+i]*B[3j+2].*/
/* NOTE(review): this matches MTH3D_M_vMulMatrixMatrixWithoutBufferORG if stCol_0/1/2 sit at offsets 0/12/24 with xX,xY,xZ consecutive - confirm against the struct declaration.*/
/* Each group of three B values is read before the matching group of D is stored, whereas A is re-read for every group, hence :*/
/* requires a buffer only if MatDest==A*/
/* The FPU stack holds at most seven values at any point.*/
/* 91 clocks (0 penalty) 95 instructions : 4.21 % pairing*/
/* tricks remove penalties*/
INLINE void MTH3D_M_vMulMatrixMatrixWithoutBufferASM(struct MTH3D_tdstMatrix_ *MatDest,struct MTH3D_tdstMatrix_ *MatA,struct MTH3D_tdstMatrix_ *MatB)
{
	/* FPU stack states below are listed top of stack first. ai, bi, ci are the three products feeding result i of the current group,*/
	/* si = ai+bi, and dn is the finished value destined for D[n].*/
	__asm
	{
		mov ebx,MatB				/* ebx = MatB*/
		mov eax,MatA				/* eax = MatA*/
		mov ecx,MatDest				/* ecx = MatDest*/
/* ---- group 0 : D[0..2] from B[0..2] ----*/
		fld  dword ptr [ebx]
		fmul dword ptr [eax]
		fld	 dword ptr [ebx]
		fmul dword ptr [eax+4]
		fld	 dword ptr [ebx]
		fmul dword ptr [eax+8]		/* a2 a1 a0*/

		fld  dword ptr [ebx+4]
		fmul dword ptr [eax+12]
		fld	 dword ptr [ebx+4]
		fmul dword ptr [eax+16]
		fld	 dword ptr [ebx+4]
		fmul dword ptr [eax+20]		/* b2 b1 b0 a2 a1 a0*/

		fxch st(2)					/* b0 b1 b2 a2 a1 a0*/
		faddp st(5),st				/* b1 b2 a2 a1 s0*/
		faddp st(3),st				/* b2 a2 s1 s0*/
		faddp st(1),st				/* s2 s1 s0*/

		fld  dword ptr [ebx+8]
		fmul dword ptr [eax+24]
		fld	 dword ptr [ebx+8]
		fmul dword ptr [eax+28]
		fld	 dword ptr [ebx+8]
		fmul dword ptr [eax+32]		/* c2 c1 c0 s2 s1 s0*/

		fxch st(2)					/* c0 c1 c2 s2 s1 s0*/
		faddp st(5),st				/* c1 c2 s2 s1 d0*/
		faddp st(3),st				/* c2 s2 d1 d0*/
		faddp st(1),st				/* d2 d1 d0*/

		fxch st(1)					/* trick A : preload next value      d1 d2 d0*/
		fld  dword ptr [ebx+12]		/* B[3] d1 d2 d0*/
		fxch st(3)					/* d0 d1 d2 B[3]*/

		fstp dword ptr [ecx]		/* A no more penalty here*/
		fstp dword ptr [ecx+4]
		fstp dword ptr [ecx+8]		/* B[3] stays on the stack*/
/* ---- group 1 : D[3..5] from B[3..5] ----*/
		fmul dword ptr [eax]		/* a0 (uses the preloaded B[3])*/
		fld	 dword ptr [ebx+12]
		fmul dword ptr [eax+4]
		fld	 dword ptr [ebx+12]
		fmul dword ptr [eax+8]		/* a2 a1 a0*/

		fld  dword ptr [ebx+16]
		fmul dword ptr [eax+12]
		fld	 dword ptr [ebx+16]
		fmul dword ptr [eax+16]
		fld	 dword ptr [ebx+16]
		fmul dword ptr [eax+20]		/* b2 b1 b0 a2 a1 a0*/

		fxch st(2)					/* b0 b1 b2 a2 a1 a0*/
		faddp st(5),st				/* b1 b2 a2 a1 s0*/
		faddp st(3),st				/* b2 a2 s1 s0*/
		faddp st(1),st				/* s2 s1 s0*/

		fld  dword ptr [ebx+20]
		fmul dword ptr [eax+24]
		fld	 dword ptr [ebx+20]
		fmul dword ptr [eax+28]
		fld	 dword ptr [ebx+20]
		fmul dword ptr [eax+32]		/* c2 c1 c0 s2 s1 s0*/

		fxch st(2)					/* c0 c1 c2 s2 s1 s0*/
		faddp st(5),st				/* c1 c2 s2 s1 d3*/
		faddp st(3),st				/* c2 s2 d4 d3*/
		faddp st(1),st				/* d5 d4 d3*/

		fxch st(2)					/* trick B : preload next value      d3 d4 d5*/
									/* trick C : replace fxch st(1)*/
		fld  dword ptr [ebx+24]	
		fmul dword ptr [eax]		/* a0' d3 d4 d5   (a0' = A[0]*B[6], first product of group 2)*/
		fxch st(2)					/* trick C : replace fxch st(3)      d4 d3 a0' d5*/

		fstp dword ptr [ecx+16]		/* store d4      d3 a0' d5*/
		fstp dword ptr [ecx+12]		/* B: no more penalty here      a0' d5*/

		/*fstp dword ptr [ecx+20]		// trick C : store it later*/
/* ---- group 2 : D[6..8] from B[6..8]; d5 is still on the stack ----*/
		fld	 dword ptr [ebx+24]
		fmul dword ptr [eax+4]
		fld	 dword ptr [ebx+24]
		fmul dword ptr [eax+8]		/* a2 a1 a0 d5*/

		fld  dword ptr [ebx+28]
		fmul dword ptr [eax+12]
		fld	 dword ptr [ebx+28]
		fmul dword ptr [eax+16]
		fld	 dword ptr [ebx+28]
		fmul dword ptr [eax+20]		/* b2 b1 b0 a2 a1 a0 d5*/

		fxch st(2)					/* b0 b1 b2 a2 a1 a0 d5*/
		faddp st(5),st				/* b1 b2 a2 a1 s0 d5*/
		faddp st(3),st				/* b2 a2 s1 s0 d5*/
		faddp st(1),st				/* s2 s1 s0 d5*/

		fxch st(3)					/* trick C :added      d5 s1 s0 s2*/
		fld  dword ptr [ebx+32]
		fmul dword ptr [eax+24]
		fld	 dword ptr [ebx+32]
		fmul dword ptr [eax+28]
		fld	 dword ptr [ebx+32]
		fmul dword ptr [eax+32]		/* c2 c1 c0 d5 s1 s0 s2*/

		fxch st(2)					/* c0 c1 c2 d5 s1 s0 s2*/
		faddp st(5),st				/* c1 c2 d5 s1 d6 s2*/
		faddp st(3),st				/* c2 d5 d7 d6 s2*/
		faddp st(4),st				/* trick C : replace faddp st(1),st      d5 d7 d6 d8*/

		fstp dword ptr [ecx+20]		/* trick C : store it later*/
		
	    /* no more penalty here*/
		fstp dword ptr [ecx+28]		/* trick C : swapped stores*/
		fstp dword ptr [ecx+24]
		fstp dword ptr [ecx+32]
	}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* No dedicated C optimisation : the C name maps to the original macro.*/
#define MTH3D_M_vMulMatrixMatrixWithoutBufferC      MTH3D_M_vMulMatrixMatrixWithoutBufferORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/

/* Reference version of the 3x3 matrix product. For each destination column j (stCol_0..stCol_2) and component c (xX, xY, xZ) :*/
/*   MatDest.stCol_j.c = MTH_M_xMulAddMulAddMul( MatA.stCol_0.c, MatB.stCol_j.xX,  MatA.stCol_1.c, MatB.stCol_j.xY,  MatA.stCol_2.c, MatB.stCol_j.xZ )*/
/* Destination components are assigned one at a time while MatA and MatB are still being read, so MatDest must be distinct from both sources.*/
/* Expands to a brace-enclosed block; every argument appears many times in the expansion, so arguments must be free of side effects.*/
/* 174 clocks (69 penalties) 119 instructions : 1.68 % pairing*/
#define MTH3D_M_vMulMatrixMatrixWithoutBufferORG(MatDest, MatA, MatB)                   \
    {                                                                                   \
                        (MatDest)->stCol_0.xX =  MTH_M_xMulAddMulAddMul(                \
                (MatA)->stCol_0.xX, (MatB)->stCol_0.xX,                                 \
                (MatA)->stCol_1.xX, (MatB)->stCol_0.xY,                                 \
                (MatA)->stCol_2.xX, (MatB)->stCol_0.xZ );                               \
            (MatDest)->stCol_0.xY =  MTH_M_xMulAddMulAddMul(                            \
                (MatA)->stCol_0.xY, (MatB)->stCol_0.xX,                                 \
                (MatA)->stCol_1.xY, (MatB)->stCol_0.xY,                                 \
                (MatA)->stCol_2.xY, (MatB)->stCol_0.xZ );                               \
            (MatDest)->stCol_0.xZ =  MTH_M_xMulAddMulAddMul(                            \
                (MatA)->stCol_0.xZ, (MatB)->stCol_0.xX,                                 \
                (MatA)->stCol_1.xZ, (MatB)->stCol_0.xY,                                 \
                (MatA)->stCol_2.xZ, (MatB)->stCol_0.xZ );                               \
                                                                                        \
                        (MatDest)->stCol_1.xX =  MTH_M_xMulAddMulAddMul(                \
                (MatA)->stCol_0.xX, (MatB)->stCol_1.xX,                                 \
                (MatA)->stCol_1.xX, (MatB)->stCol_1.xY,                                 \
                (MatA)->stCol_2.xX, (MatB)->stCol_1.xZ );                               \
            (MatDest)->stCol_1.xY =  MTH_M_xMulAddMulAddMul(                            \
                (MatA)->stCol_0.xY, (MatB)->stCol_1.xX,                                 \
                (MatA)->stCol_1.xY, (MatB)->stCol_1.xY,                                 \
                (MatA)->stCol_2.xY, (MatB)->stCol_1.xZ );                               \
            (MatDest)->stCol_1.xZ =  MTH_M_xMulAddMulAddMul(                            \
                (MatA)->stCol_0.xZ, (MatB)->stCol_1.xX,                                 \
                (MatA)->stCol_1.xZ, (MatB)->stCol_1.xY,                                 \
                (MatA)->stCol_2.xZ, (MatB)->stCol_1.xZ );                               \
                                                                                        \
                        (MatDest)->stCol_2.xX =  MTH_M_xMulAddMulAddMul(                \
                (MatA)->stCol_0.xX, (MatB)->stCol_2.xX,                                 \
                (MatA)->stCol_1.xX, (MatB)->stCol_2.xY,                                 \
                (MatA)->stCol_2.xX, (MatB)->stCol_2.xZ );                               \
            (MatDest)->stCol_2.xY =  MTH_M_xMulAddMulAddMul(                            \
                (MatA)->stCol_0.xY, (MatB)->stCol_2.xX,                                 \
                (MatA)->stCol_1.xY, (MatB)->stCol_2.xY,                                 \
                (MatA)->stCol_2.xY, (MatB)->stCol_2.xZ );                               \
            (MatDest)->stCol_2.xZ =  MTH_M_xMulAddMulAddMul(                            \
                (MatA)->stCol_0.xZ, (MatB)->stCol_2.xX,                                 \
                (MatA)->stCol_1.xZ, (MatB)->stCol_2.xY,                                 \
                (MatA)->stCol_2.xZ, (MatB)->stCol_2.xZ );                               \
      }
 
 
#if defined(OPTIMIZED_FOR_U64_ASM)
/*
 * MatDest = MatA x MatB, computed as three matrix-by-column products
 * (one per column of MatB), in MIPS FPU assembly.
 *
 * Each section loads one column of MatB (3 floats) into $f6/$f8/$f10 before
 * anything is stored, then reads MatA at byte offsets 0..32 and stores the
 * matching column of MatDest. Consequently:
 *   - MatDest == MatB is tolerated (a column is fully read before it is written);
 *   - MatDest == MatA is NOT: later sections re-read MatA after MatDest has
 *     already been written.
 *
 * Scratch registers: $f0..$f18 (even registers only), declared as clobbers.
 * "memory" is declared as well because the matrices are read and written
 * through the pointer operands only, which the compiler cannot see otherwise.
 * The .set noreorder/.set reorder directives live in the same asm statement as
 * the code they protect: separate asm statements are not guaranteed to stay
 * adjacent, and compiler-generated instructions must not end up in between.
 */
static inline void MTH3D_M_vMulMatrixMatrixWithoutBufferU64ASM(struct MTH3D_tdstMatrix_ *MatDest,struct MTH3D_tdstMatrix_ *MatA,struct MTH3D_tdstMatrix_ *MatB)
{
/*   GGG   RRR    OOO   U   U  M   M  PPP   FFFF  !!  !!
    G   G  R  R  O   O  U   U  MM MM  P  P  F     !!  !!
    G      RRR   O   O  U   U  M M M  PPP   FFF   !!  !!
    G  GG  R R   O   O  U   U  M   M  P     F
     GGG   R  R   OOO    UUU   M   M  P     F     oo  oo

     PLEASE DO NOT REMOVE ASM IN LOWER CASE !!!!!*/
	asm(
	" .set noreorder \n"
	" # Begin MulMatrixMatrixWithoutBufferU64ASM \n"
	" # Premier MulMatrixVertex \n"
	" lwc1 $f6,0(%2) \n"
	" lwc1 $f8,4(%2) \n"
	" lwc1 $f10,8(%2) \n"
	" lwc1 $f0,0(%1)				# f0  <- Mat[0,0] \n"
	" lwc1 $f2,12(%1)				# f2  <- Mat[0,1] \n"
	" mul.s $f0,$f0,$f6			# f0  <- Mat[0,0] x Vect[0] \n"
	" lwc1 $f4,24(%1)				# f4  <- Mat[0,2] \n"
	" mul.s $f2,$f2,$f8			# f2  <- Mat[0,1] x Vect[1] \n"
	" lwc1 $f12,4(%1)				# f12 <- Mat[1,0] \n"
	" mul.s $f4,$f4,$f10		# f4  <- Mat[0,2] x Vect[2] \n"
	" lwc1 $f14,16(%1)			# f14 <- Mat[1,1] \n"
	" mul.s $f12,$f12,$f6		# f12 <- Mat[1,0] x Vect[0] \n"
	" add.s $f0,$f0,$f2			# f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n"
	" lwc1 $f16,28(%1)			# f16 <- Mat[1,2] \n"
	" mul.s $f14,$f14,$f8		# f14 <- Mat[1,1] x Vect[1] \n"
	" add.s $f0,$f0,$f4			# f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n"
	" lwc1 $f18,8(%1)				# f18 <- Mat[2,0] \n"
	" mul.s $f16,$f16,$f10	# f16 <- Mat[1,2] x Vect[2] \n"
	" add.s $f12,$f12,$f14	# f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n"
	" lwc1 $f2,20(%1)				# f2  <- Mat[2,1] \n"
	" mul.s $f18,$f18,$f6		# f18 <- Mat[2,0] x Vect[0] \n"
	" add.s $f12,$f12,$f16	# f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n"
	" lwc1 $f4,32(%1)				# f4  <- Mat[2,2] \n"
	" mul.s $f2,$f2,$f8			# f2  <- Mat[2,1] x Vect[1] \n"
	" swc1 $f0,0(%0)				# f0  -> Dest[0] \n"
	" mul.s $f4,$f4,$f10		# f4  <- Mat[2,2] x Vect[2] \n"
	" add.s $f18,$f18,$f2		# f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n"
	" swc1 $f12,4(%0)				# f12 -> Dest[1] \n"
	" add.s $f18,$f18,$f4		# f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n"
	" swc1 $f18,8(%0)				# f18 -> Dest[2] \n"
	" # Deuxieme MulMatrixVertex \n"
	" lwc1 $f6,12(%2) \n"
	" lwc1 $f8,16(%2) \n"
	" lwc1 $f10,20(%2) \n"
	" lwc1 $f0,0(%1)				# f0  <- Mat[0,0] \n"
	" lwc1 $f2,12(%1)				# f2  <- Mat[0,1] \n"
	" mul.s $f0,$f0,$f6			# f0  <- Mat[0,0] x Vect[0] \n"
	" lwc1 $f4,24(%1)				# f4  <- Mat[0,2] \n"
	" mul.s $f2,$f2,$f8			# f2  <- Mat[0,1] x Vect[1] \n"
	" lwc1 $f12,4(%1)				# f12 <- Mat[1,0] \n"
	" mul.s $f4,$f4,$f10		# f4  <- Mat[0,2] x Vect[2] \n"
	" lwc1 $f14,16(%1)			# f14 <- Mat[1,1] \n"
	" mul.s $f12,$f12,$f6		# f12 <- Mat[1,0] x Vect[0] \n"
	" add.s $f0,$f0,$f2			# f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n"
	" lwc1 $f16,28(%1)			# f16 <- Mat[1,2] \n"
	" mul.s $f14,$f14,$f8		# f14 <- Mat[1,1] x Vect[1] \n"
	" add.s $f0,$f0,$f4			# f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n"
	" lwc1 $f18,8(%1)				# f18 <- Mat[2,0] \n"
	" mul.s $f16,$f16,$f10	# f16 <- Mat[1,2] x Vect[2] \n"
	" add.s $f12,$f12,$f14	# f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n"
	" lwc1 $f2,20(%1)				# f2  <- Mat[2,1] \n"
	" mul.s $f18,$f18,$f6		# f18 <- Mat[2,0] x Vect[0] \n"
	" add.s $f12,$f12,$f16	# f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n"
	" lwc1 $f4,32(%1)				# f4  <- Mat[2,2] \n"
	" mul.s $f2,$f2,$f8			# f2  <- Mat[2,1] x Vect[1] \n"
	" swc1 $f0,12(%0)				# f0  -> Dest[0] \n"
	" mul.s $f4,$f4,$f10		# f4  <- Mat[2,2] x Vect[2] \n"
	" add.s $f18,$f18,$f2		# f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n"
	" swc1 $f12,16(%0)			# f12 -> Dest[1] \n"
	" add.s $f18,$f18,$f4		# f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n"
	" swc1 $f18,20(%0)			# f18 -> Dest[2] \n"
	" # Troisieme MulMatrixVertex \n"
	" lwc1 $f6,24(%2) \n"
	" lwc1 $f8,28(%2) \n"
	" lwc1 $f10,32(%2) \n"
	" lwc1 $f0,0(%1)				# f0  <- Mat[0,0] \n"
	" lwc1 $f2,12(%1)				# f2  <- Mat[0,1] \n"
	" mul.s $f0,$f0,$f6			# f0  <- Mat[0,0] x Vect[0] \n"
	" lwc1 $f4,24(%1)				# f4  <- Mat[0,2] \n"
	" mul.s $f2,$f2,$f8			# f2  <- Mat[0,1] x Vect[1] \n"
	" lwc1 $f12,4(%1)				# f12 <- Mat[1,0] \n"
	" mul.s $f4,$f4,$f10		# f4  <- Mat[0,2] x Vect[2] \n"
	" lwc1 $f14,16(%1)			# f14 <- Mat[1,1] \n"
	" mul.s $f12,$f12,$f6		# f12 <- Mat[1,0] x Vect[0] \n"
	" add.s $f0,$f0,$f2			# f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n"
	" lwc1 $f16,28(%1)			# f16 <- Mat[1,2] \n"
	" mul.s $f14,$f14,$f8		# f14 <- Mat[1,1] x Vect[1] \n"
	" add.s $f0,$f0,$f4			# f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n"
	" lwc1 $f18,8(%1)				# f18 <- Mat[2,0] \n"
	" mul.s $f16,$f16,$f10	# f16 <- Mat[1,2] x Vect[2] \n"
	" add.s $f12,$f12,$f14	# f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n"
	" lwc1 $f2,20(%1)				# f2  <- Mat[2,1] \n"
	" mul.s $f18,$f18,$f6		# f18 <- Mat[2,0] x Vect[0] \n"
	" add.s $f12,$f12,$f16	# f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n"
	" lwc1 $f4,32(%1)				# f4  <- Mat[2,2] \n"
	" mul.s $f2,$f2,$f8			# f2  <- Mat[2,1] x Vect[1] \n"
	" swc1 $f0,24(%0)				# f0  -> Dest[0] \n"
	" mul.s $f4,$f4,$f10		# f4  <- Mat[2,2] x Vect[2] \n"
	" add.s $f18,$f18,$f2		# f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n"
	" swc1 $f12,28(%0)			# f12 -> Dest[1] \n"
	" add.s $f18,$f18,$f4		# f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n"
	" swc1 $f18,32(%0)			# f18 -> Dest[2] \n"
	" # EndOf MulMatrixMatrixWithoutBufferU64ASM \n"
	" .set reorder \n"
	: : "r" (MatDest), "r" (MatA), "r" (MatB) : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18","memory" );
}
#endif /* OPTIMIZED_FOR_U64_ASM*/
 
/************************************************************************************************************************/
/* MTH3D_M_vMulMatrixMatrix*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/*
 * Mat_Dest = Mat_A x Mat_B, with an automatic temporary when the destination
 * is also the left operand.
 *
 * The pointer arguments are parenthesized inside the comparison so that
 * expression arguments (casts, conditional expressions, ...) compare as a whole.
 *
 * NOTE(review): unlike MTH3D_M_vMulMatrixMatrixORG, Mat_Dest==Mat_B is not
 * buffered here; presumably the ASM "WithoutBuffer" kernel reads each column of
 * Mat_B completely before writing the same column of Mat_Dest - confirm against
 * the kernel selected by MTH3D_M_vMulMatrixMatrixWithoutBuffer.
 *
 * The macro expands to a braced block declaring a local named Mtemp; do not
 * pass an expression that refers to a caller variable of that name.
 */
#define MTH3D_M_vMulMatrixMatrixASM(Mat_Dest, Mat_A, Mat_B)             \
	{       if ((Mat_Dest)==(Mat_A))									\
            {                                                           \
              MTH3D_tdstMatrix Mtemp;                                   \
                                                                        \
              MTH3D_M_vMulMatrixMatrixWithoutBuffer(                    \
                                        &Mtemp, Mat_A, Mat_B);          \
              MTH3D_M_vCopyMatrix( Mat_Dest, &Mtemp);                   \
            }                                                           \
            else                                                        \
            {                                                           \
             MTH3D_M_vMulMatrixMatrixWithoutBuffer(Mat_Dest,            \
                                              Mat_A,  Mat_B);           \
             }															\
	}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vMulMatrixMatrixC MTH3D_M_vMulMatrixMatrixORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
 
 
/*
 * Mat_Dest = Mat_A x Mat_B (generic version).
 *
 * When the destination is the same object as either operand, the product is
 * computed into a local temporary and copied back; otherwise it is written
 * straight into Mat_Dest.
 *
 * The pointer arguments are parenthesized inside the comparisons so that
 * expression arguments (casts, conditional expressions, ...) compare as a whole.
 *
 * The macro expands to a braced block declaring a local named Mtemp; do not
 * pass an expression that refers to a caller variable of that name.
 */
#define MTH3D_M_vMulMatrixMatrixORG(Mat_Dest, Mat_A, Mat_B)             \
    {       if( ((Mat_Dest)==(Mat_A)) || ((Mat_Dest)==(Mat_B)) )        \
            {                                                           \
              MTH3D_tdstMatrix Mtemp;                                   \
                                                                        \
              MTH3D_M_vMulMatrixMatrixWithoutBuffer(                    \
                                        &Mtemp, Mat_A, Mat_B);          \
              MTH3D_M_vCopyMatrix( Mat_Dest, &Mtemp);                   \
            }                                                           \
            else                                                        \
            {                                                           \
             MTH3D_M_vMulMatrixMatrixWithoutBuffer(Mat_Dest,            \
                                              Mat_A,  Mat_B);           \
             }															\
	}
 
/************************************************************************************************************************/
/* MTH3D_M_xDetMatrix*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* only 9 muls instead of 12 !!! */
/* 31 clocks but 5 penalties*/
/*
 * Returns the determinant of the 3x3 matrix MatA (x87 inline assembly).
 *
 * The matrix is read as nine consecutive 32-bit floats:
 *   [ecx+0..8]   = stCol_0.xX/xY/xZ   ("0X 0Y 0Z" in the comments below)
 *   [ecx+12..20] = stCol_1.xX/xY/xZ   ("1X 1Y 1Z")
 *   [ecx+24..32] = stCol_2.xX/xY/xZ   ("2X 2Y 2Z")
 * The determinant is expanded along stCol_0. The stack comments on each line
 * list the x87 register stack after the instruction, st(0) first.
 * At most 4 x87 stack slots are in use at any time; the stack is balanced on exit.
 *
 * Warning 4035 is disabled around the function - presumably a leftover from a
 * version that returned the value directly in st(0); TODO confirm.
 */
#pragma warning(disable:4035)
INLINE MTH_tdxReal MTH3D_M_xDetMatrixASM(struct MTH3D_tdstMatrix_ *MatA) 
{
	register MTH_tdxReal xTempMatrix; /* not strictly needed, but required to keep the compiler from hanging in release mode !?!*/
	__asm						
	{					
		/*  = 0X.(1Y.2Z-1Z.2Y) | =  0X.(A-B) |	=	X*/
		/*  + 0Y.(1Z.2X-1X.2Z) | +  0Y.(C-D) |	+	Y*/
		/*  + 0Z.(1X.2Y-1Y.2X) | +  0Z.(E-F) |	+	Z*/
		mov	ecx,MatA
		fld  dword ptr [ecx+16]			/* 1Y*/
		fmul dword ptr [ecx+32]			/* A = 1Y.2Z            | A*/
		fld  dword ptr [ecx+20]			/* 1Z*/
		fmul dword ptr [ecx+28]			/* B = 1Z.2Y            | B A*/
		fld  dword ptr [ecx+20]			/* 1Z*/
		fmul dword ptr [ecx+24]			/* C = 1Z.2X            | C B A*/
		fld  dword ptr [ecx+12]			/* 1X*/
		fmul dword ptr [ecx+32]			/* D = 1X.2Z            | D C B A*/
		fxch st(2)						/* B C D A*/
		fsubp st(3),st					/* C D A-B*/
		fld  dword ptr [ecx+12]			/* 1X*/
		fmul dword ptr [ecx+28]			/* E = 1X.2Y            | E C D A-B*/
		fxch st(2)						/* D C E A-B*/
		fsubp st(1),st					/* C-D E A-B*/
		fxch st(2)						/* A-B E C-D*/
		fmul dword ptr [ecx]			/* X = 0X.(A-B)         | X E C-D*/
		fld  dword ptr [ecx+16]			/* 1Y*/
		fmul dword ptr [ecx+24]			/* F = 1Y.2X            | F X E C-D*/
		fxch st(3)						/* C-D X E F  */
		fmul dword ptr [ecx+4]			/* Y = 0Y.(C-D)         | Y X E F*/
		fxch st(3)						/* F X E Y*/
		fsubp st(2),st					/* X E-F Y*/
		faddp st(2),st					/* E-F X+Y*/
		fmul dword ptr [ecx+8]			/* Z = 0Z.(E-F)         | Z X+Y*/
		faddp st(1),st					/* X+Y+Z*/
		fstp dword ptr [xTempMatrix]	/* spill the result to the local (see the declaration of xTempMatrix)*/
	}
	return(xTempMatrix);				/* returned through the local rather than left in st(0) (see the declaration of xTempMatrix)*/
}
#pragma warning(default:4035)
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/*
 * Determinant of the 3x3 matrix MatA, expanded along stCol_0:
 *     0X.(1Y.2Z - 1Z.2Y) + 0Y.(1Z.2X - 1X.2Z) + 0Z.(1X.2Y - 1Y.2X)
 * i.e. 9 multiplications instead of the 12 of the ORG version.
 * Original measurement: 37 clocks but 11 penalties.
 * MatA is evaluated several times: pass an expression without side effects.
 */
#define MTH3D_M_xDetMatrixC( MatA )                                                 \
    MTH_M_xAdd3(                                                                    \
        MTH_M_xMul( (MatA)->stCol_0.xX,                                             \
                    MTH_M_xMulSubMul( (MatA)->stCol_1.xY, (MatA)->stCol_2.xZ,       \
                                      (MatA)->stCol_1.xZ, (MatA)->stCol_2.xY ) ),   \
        MTH_M_xMul( (MatA)->stCol_0.xY,                                             \
                    MTH_M_xMulSubMul( (MatA)->stCol_1.xZ, (MatA)->stCol_2.xX,       \
                                      (MatA)->stCol_1.xX, (MatA)->stCol_2.xZ ) ),   \
        MTH_M_xMul( (MatA)->stCol_0.xZ,                                             \
                    MTH_M_xMulSubMul( (MatA)->stCol_1.xX, (MatA)->stCol_2.xY,       \
                                      (MatA)->stCol_1.xY, (MatA)->stCol_2.xX ) ) )
#endif /*  OPTIMIZED_FOR_PC_FLOATS*/
 
/*
 * Determinant of the 3x3 matrix MatA (generic version): the sum of the three
 * "positive" triple products minus the sum of the three "negative" ones,
 * 12 multiplications in total.
 * Original measurement: 41 clocks but 16 penalties.
 * MatA is evaluated several times: pass an expression without side effects.
 */
#define MTH3D_M_xDetMatrixORG(MatA)                                                         \
    MTH_M_xSub(                                                                             \
        MTH_M_xAdd3(                                                                        \
            MTH_M_xMul3( (MatA)->stCol_0.xX, (MatA)->stCol_1.xY, (MatA)->stCol_2.xZ ),      \
            MTH_M_xMul3( (MatA)->stCol_0.xY, (MatA)->stCol_1.xZ, (MatA)->stCol_2.xX ),      \
            MTH_M_xMul3( (MatA)->stCol_0.xZ, (MatA)->stCol_1.xX, (MatA)->stCol_2.xY ) ),    \
        MTH_M_xAdd3(                                                                        \
            MTH_M_xMul3( (MatA)->stCol_0.xZ, (MatA)->stCol_1.xY, (MatA)->stCol_2.xX ),      \
            MTH_M_xMul3( (MatA)->stCol_1.xZ, (MatA)->stCol_2.xY, (MatA)->stCol_0.xX ),      \
            MTH_M_xMul3( (MatA)->stCol_2.xZ, (MatA)->stCol_0.xY, (MatA)->stCol_1.xX ) ) )
 
/************************************************************************************************************************/
/* MTH3D_M_vMulMatrixVectorWithoutBuffer*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* requires no temporary buffer if VectDest==VectA*/
/* 32 clocks (1 penalty) 33 instructions : 12,12 % pairing*/
/*
 * VectDest = MatA x VectA (x87 inline assembly).
 *
 * MatA is read as nine consecutive floats (three columns of xX,xY,xZ at byte
 * offsets 0, 12 and 24); VectA and VectDest as three consecutive floats.
 * Every load from VectA and MatA happens before the first store to VectDest,
 * which is why VectDest may be the same object as VectA.
 * Up to 6 x87 stack slots are used; the stack is balanced on exit.
 */
INLINE void MTH3D_M_vMulMatrixVectorWithoutBufferASM( struct MTH3D_tdstVector_ *VectDest, struct MTH3D_tdstMatrix_ *MatA, struct MTH3D_tdstVector_ *VectA)           
{
	__asm
	{
		mov ebx,VectA
		mov eax,MatA
		mov ecx,VectDest
 
		/* VectA.xX times column 0 : stack becomes  a2 a1 a0 */
		fld  dword ptr [ebx]
		fmul dword ptr [eax]
		fld	 dword ptr [ebx]
		fmul dword ptr [eax+4]
		fld	 dword ptr [ebx]
		fmul dword ptr [eax+8]
 
		/* VectA.xY times column 1 : stack becomes  b2 b1 b0 a2 a1 a0 */
		fld  dword ptr [ebx+4]
		fmul dword ptr [eax+12]
		fld	 dword ptr [ebx+4]
		fmul dword ptr [eax+16]
		fld	 dword ptr [ebx+4]
		fmul dword ptr [eax+20]
 
		/* accumulate component-wise : stack becomes  r2 r1 r0 */
		fxch st(2)
		faddp st(5),st
		faddp st(3),st
		faddp st(1),st
 
		/* VectA.xZ times column 2 : stack becomes  c2 c1 c0 r2 r1 r0 */
		fld  dword ptr [ebx+8]
		fmul dword ptr [eax+24]
		fld	 dword ptr [ebx+8]
		fmul dword ptr [eax+28]
		fld	 dword ptr [ebx+8]
		fmul dword ptr [eax+32]
 
		/* accumulate component-wise : stack becomes  z y x */
		fxch st(2)
		faddp st(5),st
		faddp st(3),st
		faddp st(1),st
 
		/* bring x to the top, then store x, y, z in order */
		fxch st(2)
		fstp dword ptr [ecx]	
		fstp dword ptr [ecx+4]
		fstp dword ptr [ecx+8]
	}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vMulMatrixVectorWithoutBufferC        MTH3D_M_vMulMatrixVectorWithoutBufferORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
 
/*
 * VectDest = MatA x VectA (generic version), one component after the other.
 * VectDest->xX is written before VectA is read again for xY and xZ, so
 * VectDest must not be the same object as VectA (hence "WithoutBuffer").
 * Original measurement: 56 clocks (20 penalties)  44 instructions : 4.55 % pairing.
 */
#define MTH3D_M_vMulMatrixVectorWithoutBufferORG( VectDest, MatA, VectA)    \
    {                                                                       \
        (VectDest)->xX = MTH_M_xAdd3(                                       \
            MTH_M_xMul( (MatA)->stCol_0.xX, (VectA)->xX),                   \
            MTH_M_xMul( (MatA)->stCol_1.xX, (VectA)->xY),                   \
            MTH_M_xMul( (MatA)->stCol_2.xX, (VectA)->xZ));                  \
        (VectDest)->xY = MTH_M_xAdd3(                                       \
            MTH_M_xMul( (MatA)->stCol_0.xY, (VectA)->xX),                   \
            MTH_M_xMul( (MatA)->stCol_1.xY, (VectA)->xY),                   \
            MTH_M_xMul( (MatA)->stCol_2.xY, (VectA)->xZ));                  \
        (VectDest)->xZ = MTH_M_xAdd3(                                       \
            MTH_M_xMul( (MatA)->stCol_0.xZ, (VectA)->xX),                   \
            MTH_M_xMul( (MatA)->stCol_1.xZ, (VectA)->xY),                   \
            MTH_M_xMul( (MatA)->stCol_2.xZ, (VectA)->xZ));                  \
    }
 
#if defined(OPTIMIZED_FOR_U64_ASM)
/* requires no temporary buffer if VectDest==VectA*/
/*
 * VectDest = MatA x VectA, in MIPS FPU assembly.
 *
 * The three components of VectA are loaded into $f6/$f8/$f10 before anything
 * is stored, so VectDest may be the same object as VectA.
 * MatA is read at byte offsets 0..32 (nine consecutive floats).
 *
 * Scratch registers: $f0..$f18 (even registers only), declared as clobbers.
 * "memory" is declared as well because the operands are read and written
 * through pointers only, which the compiler cannot see otherwise.
 * The .set noreorder/.set reorder directives live in the same asm statement as
 * the code they protect: separate asm statements are not guaranteed to stay
 * adjacent, and compiler-generated instructions must not end up in between.
 */
static inline void MTH3D_M_vMulMatrixVectorWithoutBufferU64ASM( struct MTH3D_tdstVector_ *VectDest, struct MTH3D_tdstMatrix_ *MatA, struct MTH3D_tdstVector_ *VectA)
{
/*   GGG   RRR    OOO   U   U  M   M  PPP   FFFF  !!  !!
    G   G  R  R  O   O  U   U  MM MM  P  P  F     !!  !!
    G      RRR   O   O  U   U  M M M  PPP   FFF   !!  !!
    G  GG  R R   O   O  U   U  M   M  P     F
     GGG   R  R   OOO    UUU   M   M  P     F     oo  oo

     PLEASE DO NOT REMOVE ASM IN LOWER CASE !!!!!*/
	asm(
	" .set noreorder \n"
	" # Begin MulMatrixVectorWithoutBufferU64ASM \n"
	" lwc1 $f6,0(%2) \n"
	" lwc1 $f8,4(%2) \n"
	" lwc1 $f10,8(%2) \n"
	" lwc1 $f0,0(%1)				# f0  <- Mat[0,0] \n"
	" lwc1 $f2,12(%1)				# f2  <- Mat[0,1] \n"
	" mul.s $f0,$f0,$f6			# f0  <- Mat[0,0] x Vect[0] \n"
	" lwc1 $f4,24(%1)				# f4  <- Mat[0,2] \n"
	" mul.s $f2,$f2,$f8			# f2  <- Mat[0,1] x Vect[1] \n"
	" lwc1 $f12,4(%1)				# f12 <- Mat[1,0] \n"
	" mul.s $f4,$f4,$f10		# f4  <- Mat[0,2] x Vect[2] \n"
	" lwc1 $f14,16(%1)			# f14 <- Mat[1,1] \n"
	" mul.s $f12,$f12,$f6		# f12 <- Mat[1,0] x Vect[0] \n"
	" add.s $f0,$f0,$f2			# f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n"
	" lwc1 $f16,28(%1)			# f16 <- Mat[1,2] \n"
	" mul.s $f14,$f14,$f8		# f14 <- Mat[1,1] x Vect[1] \n"
	" add.s $f0,$f0,$f4			# f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n"
	" lwc1 $f18,8(%1)				# f18 <- Mat[2,0] \n"
	" mul.s $f16,$f16,$f10	# f16 <- Mat[1,2] x Vect[2] \n"
	" add.s $f12,$f12,$f14	# f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n"
	" lwc1 $f2,20(%1)				# f2  <- Mat[2,1] \n"
	" mul.s $f18,$f18,$f6		# f18 <- Mat[2,0] x Vect[0] \n"
	" add.s $f12,$f12,$f16	# f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n"
	" lwc1 $f4,32(%1)				# f4  <- Mat[2,2] \n"
	" mul.s $f2,$f2,$f8			# f2  <- Mat[2,1] x Vect[1] \n"
	" swc1 $f0,0(%0)				# f0  -> Dest[0] \n"
	" mul.s $f4,$f4,$f10		# f4  <- Mat[2,2] x Vect[2] \n"
	" add.s $f18,$f18,$f2		# f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n"
	" swc1 $f12,4(%0)				# f12 -> Dest[1] \n"
	" add.s $f18,$f18,$f4		# f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n"
	" swc1 $f18,8(%0)				# f18 -> Dest[2] \n"
	" # EndOf MulMatrixVectorWithoutBufferU64ASM \n"
	" .set reorder \n"
	: : "r" (VectDest), "r" (MatA), "r" (VectA) : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18","memory" );
}
#endif /* OPTIMIZED_FOR_U64_ASM*/
 
/************************************************************************************************************************/
/* MTH3D_M_vMulMatrixVector*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
#define MTH3D_M_vMulMatrixVectorASM                        MTH3D_M_vMulMatrixVectorWithoutBufferASM
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
#define MTH3D_M_vMulMatrixVectorC                          MTH3D_M_vMulMatrixVectorORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
 
#if defined(OPTIMIZED_FOR_U64_ASM)
#define MTH3D_M_vMulMatrixVectorU64ASM                     MTH3D_M_vMulMatrixVectorWithoutBufferU64ASM
#endif /* OPTIMIZED_FOR_U64_ASM*/
 
/*
 * VectDest = MatA x VectA (generic version).
 *
 * When the destination is the same object as the source vector, the source is
 * first copied into a local temporary so that the "WithoutBuffer" kernel never
 * reads a component it has already overwritten.
 *
 * The pointer arguments are parenthesized inside the comparison so that
 * expression arguments (casts, conditional expressions, ...) compare as a whole.
 *
 * The macro expands to a braced block declaring a local named Vtmp; do not
 * pass an expression that refers to a caller variable of that name.
 */
#define MTH3D_M_vMulMatrixVectorORG( VectDest, MatA, VectA)                     \
	{   if( (VectA)==(VectDest) )                                               \
        {                                                                       \
          MTH3D_tdstVector Vtmp;                                                \
                                                                                \
          MTH3D_M_vCopyVector( &Vtmp, VectA);                                   \
          MTH3D_M_vMulMatrixVectorWithoutBuffer( VectDest, MatA, &Vtmp);        \
        }                                                                       \
        else                                                                    \
        {                                                                       \
          MTH3D_M_vMulMatrixVectorWithoutBuffer( VectDest, MatA, VectA);        \
        }																		\
	}
          
/************************************************************************************************************************/
/* MTH3D_M_vInverMatrix*/
/************************************************************************************************************************/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/* avoid transposition, stack, 9 divisions by det, 4 negs*/
/* 131 clocks (38+1 penalties) 98 instructions : 2.04 % pairing*/
/*
 * MatDest = inverse of the 3x3 matrix MatA (x87 inline assembly).
 *
 * Both matrices are accessed as nine consecutive floats (columns at byte
 * offsets 0, 12 and 24, each xX,xY,xZ). The nine 2x2 cofactors are computed
 * directly in their final (already transposed) position, the determinant is
 * accumulated from the first three (P1,P2,P3), and every cofactor is multiplied
 * by D = 1/det (a single division).
 *
 * Notation in the stack comments: st(0) is listed first; an upper-case name is
 * a raw cofactor, the matching lower-case name is the same value times D.
 *
 * Constraints visible in the code:
 *   - all 8 x87 stack slots are used at the deepest points, so the FPU stack
 *     must be empty on entry;
 *   - stores to MatDest are interleaved with later loads from MatA, so MatDest
 *     must not be the same object as MatA;
 *   - there is no test for a zero determinant.
 * Warning 4725 is disabled around the function - presumably the MSVC warning
 * about the division instruction on early Pentiums; TODO confirm.
 */
#pragma warning(disable:4725)
INLINE void MTH3D_M_vInverMatrixASM(struct MTH3D_tdstMatrix_ *MatDest, struct MTH3D_tdstMatrix_ *MatA)                     
{   
	static MTH_tdxReal ONE=MTH_C_ONE;	/* memory operand for the 1/det division*/
		__asm
	{
		mov eax,MatA
		mov ebx,MatDest
 
		fld  dword ptr [eax+16]									/*<======  1 penalty : AGI stall because of eax load*/
		fmul dword ptr [eax+32]		/* load A1*/
		fld  dword ptr [eax+20]
		fmul dword ptr [eax+24]		/* load A2 */
		fld  dword ptr [eax+12]
		fmul dword ptr [eax+28]		/* load A3*/
		fld  dword ptr [eax+20]
		fmul dword ptr [eax+28]		/* load B1*/
		fld  dword ptr [eax+12]
		fmul dword ptr [eax+32]		/* load B2*/
		fld  dword ptr [eax+16]
		fmul dword ptr [eax+24]		/* load B3*/
									/* B3 B2 B1 A3 A2 A1*/
		fxch st(2)					/* B1 B2 B3 A3 A2 A1*/
		fsubp st(5),st				/* B2 B3 A3 A2 P1      (P1 = A1-B1)*/
		fsubp st(3),st				/* B3 A3 P2	P1	       (P2 = A2-B2)*/
		fsubp st(1),st				/* P3 P2 P1            (P3 = A3-B3)*/
				
		fld  dword ptr [eax+8]
		fmul dword ptr [eax+28]		/* C1 P3 P2 P1*/
		
		/* determinant terms: X = [eax]*P1, Y = [eax+4]*P2, Z = [eax+8]*P3 */
		fld  st(3)	
		fmul dword ptr [eax]		/* X C1 P3 P2 P1*/
		fld  st(3)	
		fmul dword ptr [eax+4]		/* Y X C1 P3 P2 P1*/
		fld  st(3)	
		fmul dword ptr [eax+8]		/* Z Y X C1 P3 P2 P1*/
 
		fld  dword ptr [eax+4]
		fmul dword ptr [eax+32]		/* D1 Z Y X C1 P3 P2 P1   (8 slots in use)*/
		
		fxch st(2)
		faddp st(3),st				/* Z D1 X+Y C1 P3 P2 P1*/
 
		fld  dword ptr [eax]
		fmul dword ptr [eax+32]		/* C2 Z D1 X+Y C1 P3 P2 P1*/
 
		fxch st(2)					/* D1 Z C2 X+Y C1 P3 P2 P1*/
		fsubp st(4),st				/* Z C2 X+Y N1 P3 P2 P1   (N1 = C1-D1)*/
		faddp st(2),st				/* C2 X+Y+Z N1 P3 P2 P1*/
 
		fld  dword ptr [eax+8]
		fmul dword ptr [eax+24]		/* D2 C2 X+Y+Z N1 P3 P2 P1*/
 
		fld dword ptr [ONE]			/* 1 D2 C2 det=X+Y+Z N1 P3 P2 P1*/
		fdivrp st(3),st				/* D2 C2 D=1/det N1 P3 P2 P1		<====== unavoidable 38 clocks penalties*/
/*
		fld dword ptr [ONE]			// 1 D2 C2 det=X+Y+Z N1 P3 P2 P1
		fxch st(3)					// det D2 C2 1 N1 P3 P2 P1
		fdivp st(3),st				// D2 C2 D=1/det N1 P3 P2 P1		<====== unavoidable 38 clocks penalties
*/
		fsubp st(1),st				/* N2 D N1 P3 P2 P1       (N2 = C2-D2)*/
		fxch st(1)					/* D N2 N1 P3 P2 P1*/
		
		/* scale the five cofactors already available by D (each fld duplicates D, fmulp consumes the copy) */
		fld st(0)
		fmulp st(6),st				/* D N2 N1 P3 P2 p1*/
		fld st(0)
		fmulp st(5),st				/* D N2 N1 P3 p2 p1*/
		fld st(0)
		fmulp st(4),st				/* D N2 N1 p3 p2 p1*/
		fld st(0)
		fmulp st(3),st				/* D N2 n1 p3 p2 p1*/
		fld st(0)
		fmulp st(2),st				/* D n2 n1 p3 p2 p1*/
 
		fld  dword ptr [eax+4]
		fmul dword ptr [eax+24]		/* C3 D n2 n1 p3 p2 p1*/
 
		fld  dword ptr [eax]
		fmul dword ptr [eax+28]     /* D3 C3 D n2 n1 p3 p2 p1   (8 slots in use)*/
 
		fxch st(7)					/* p1 C3 D n2 n1 p3 p2 D3*/
		fstp dword ptr [ebx]		/* C3 D n2 n1 p3 p2 D3      (first store to MatDest)*/
/**/
		fld  dword ptr [eax+4]
		fmul dword ptr [eax+20]     /* E1 C3 D n2 n1 p3 p2 D3*/
 
		fxch st(1)					/* C3 E1 D n2 n1 p3 p2 D3*/
		fsubrp st(7),st				/* E1 D n2 n1 p3 p2 N3      (N3 = C3-D3)*/
/**/
		fld  dword ptr [eax+8]
		fmul dword ptr [eax+16]     /* F1 E1 D n2 n1 p3 p2 N3*/
 
		fxch st(6)					/* p2 E1 D n2 n1 p3 F1 N3*/
		fstp dword ptr [ebx+12]		/* E1 D n2 n1 p3 F1 N3*/
/**/
		fld  dword ptr [eax+8]
		fmul dword ptr [eax+12]     /* E2 E1 D n2 n1 p3 F1 N3*/
 
		fxch st(1)					/* E1 E2 D n2 n1 p3 F1 N3*/
		fsubrp st(6),st				/* E2 D n2 n1 p3 M1 N3      (M1 = E1-F1)*/
/**/
		fld  dword ptr [eax]
		fmul dword ptr [eax+20]     /* F2 E2 D n2 n1 p3 M1 N3*/
 
		fxch st(5)					/* p3 E2 D n2 n1 F2 M1 N3*/
		fstp dword ptr [ebx+24]		/* E2 D n2 n1 F2 M1 N3*/
/**/
		fld  dword ptr [eax]
		fmul dword ptr [eax+16]     /* E3 E2 D n2 n1 F2 M1 N3*/
 
		fxch st(1)					/* E2 E3 D n2 n1 F2 M1 N3*/
		fsubrp st(5),st				/* E3 D n2 n1 M2 M1 N3      (M2 = E2-F2)*/
/**/
		fld  dword ptr [eax+4]
		fmul dword ptr [eax+12]     /* F3 E3 D n2 n1 M2 M1 N3*/
 
		fxch st(4)					/* n1 E3 D n2 F3 M2 M1 N3*/
		fstp dword ptr [ebx+4]		/* E3 D n2 F3 M2 M1 N3*/
/**/
		fxch st(1)					/* D E3 n2 F3 M2 M1 N3*/
		fld st(0)					/* D D E3 n2 F3 M2 M1 N3*/
		fmulp st(7),st				/* D E3 n2 F3 M2 M1 n3*/
		fxch st(1)					/* E3 D n2 F3 M2 M1 n3*/
		fsubrp st(3),st				/* D n2 M3 M2 M1 n3         (M3 = E3-F3)*/
/**/
		fld st(0)					/* D D n2 M3 M2 M1 n3*/
		fmulp st(5),st				/* D n2 M3 M2 m1 n3*/
		fld st(0)					/* D D n2 M3 M2 m1 n3*/
		fmulp st(4),st				/* D n2 M3 m2 m1 n3*/
/**/
		fxch st(1)					/* n2 D M3 m2 m1 n3*/
		fstp dword ptr [ebx+16]		/* D M3 m2 m1 n3*/
		fmulp st(1),st				/* m3 m2 m1 n3	*/
		fxch st(3)					/* n3 m2 m1 m3*/
/**/
		fstp dword ptr [ebx+28]		/* m2 m1 m3*/
		fstp dword ptr [ebx+20]		/* m1 m3*/
		fstp dword ptr [ebx+8]		/* m3*/
		fstp dword ptr [ebx+32]		/* x87 stack empty again*/
	}
}
#pragma warning(default:4725)
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
/* avoid transposition, stack, 9 divisions by det, 4 negs*/
/* 214 clocks (81 penalties) 147 instructions : 1.36 % pairing (measured on the original version)*/
/*
 * MatDest = inverse of the 3x3 matrix MatA.
 *
 * The nine cofactors are written directly at their final (transposed)
 * position in MatDest, the determinant is accumulated from the first three
 * of them, and the whole matrix is finally scaled by 1/det.
 *
 * The nine elements of MatA are copied into locals before the first store to
 * MatDest, so the function also works in place (MatDest == MatA), like the
 * generic MTH3D_M_vInverMatrixORG macro. When the two matrices are distinct the
 * arithmetic, and its order, is the same as before.
 *
 * There is no test for a zero determinant.
 */
INLINE void MTH3D_M_vInverMatrixC(MTH3D_tdstMatrix *MatDest, MTH3D_tdstMatrix *MatA)                     
{                                                               
      /* snapshot of the source: <column><component> */
      MTH_tdxReal x0X = MatA->stCol_0.xX;
      MTH_tdxReal x0Y = MatA->stCol_0.xY;
      MTH_tdxReal x0Z = MatA->stCol_0.xZ;
      MTH_tdxReal x1X = MatA->stCol_1.xX;
      MTH_tdxReal x1Y = MatA->stCol_1.xY;
      MTH_tdxReal x1Z = MatA->stCol_1.xZ;
      MTH_tdxReal x2X = MatA->stCol_2.xX;
      MTH_tdxReal x2Y = MatA->stCol_2.xY;
      MTH_tdxReal x2Z = MatA->stCol_2.xZ;
      register MTH_tdxReal det;

      /* first three cofactors, interleaved with the determinant accumulation */
      MatDest->stCol_0.xX = MTH_M_xMulSubMul(x1Y, x2Z, x1Z, x2Y);
      det = MTH_M_xMul( x0X, MatDest->stCol_0.xX );
      MatDest->stCol_1.xX = MTH_M_xMulSubMul(x1Z, x2X, x1X, x2Z);
      det = MTH_M_xAdd(MTH_M_xMul( x0Y, MatDest->stCol_1.xX ), det);
      MatDest->stCol_2.xX = MTH_M_xMulSubMul(x1X, x2Y, x1Y, x2X);
      det = MTH_M_xAdd(MTH_M_xMul( x0Z, MatDest->stCol_2.xX ), det);
      MatDest->stCol_0.xY = MTH_M_xMulSubMul(x0Z, x2Y, x0Y, x2Z);

      /* from here on "det" holds 1/determinant */
      det = MTH_M_xDiv( MTH_C_ONE, det );

      /* remaining cofactors */
      MatDest->stCol_0.xZ = MTH_M_xMulSubMul(x0Y, x1Z, x0Z, x1Y);
      MatDest->stCol_1.xY = MTH_M_xMulSubMul(x0X, x2Z, x0Z, x2X);
      MatDest->stCol_1.xZ = MTH_M_xMulSubMul(x0Z, x1X, x0X, x1Z);
      MatDest->stCol_2.xY = MTH_M_xMulSubMul(x0Y, x2X, x0X, x2Y);
      MatDest->stCol_2.xZ = MTH_M_xMulSubMul(x0X, x1Y, x0Y, x1X);

      /* scale every cofactor by 1/determinant, in place */
      MTH3D_M_vMulScalarMatrix( MatDest, det, MatDest );
}
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
 
/*
 * MatDest = inverse of MatA (generic version): cofactor matrix, transposed,
 * then divided by the determinant of MatA.
 *
 * Everything is computed into local temporaries and MatDest is only written by
 * the final MTH3D_M_vDivScalarMatrix call. There is no test for a zero
 * determinant.
 *
 * The macro expands to a braced block declaring locals named Mat_Tmp, Mat_Com
 * and det; do not pass expressions that refer to caller variables with those
 * names.
 */
/* Can surely be optimized by avoiding many recomputations done for nothing*/
/* Before modifying MTH3D_M_vDivScalarMatrix, it took 540 clocks  120 instructions !?!*/
#define MTH3D_M_vInverMatrixORG(MatDest, MatA)                       \
{																	 \
	  MTH3D_tdstMatrix Mat_Tmp={0};                                   \
      MTH3D_tdstMatrix Mat_Com;                                      \
      MTH_tdxReal det;                                               \
                                                                     \
      MTH3D_M_vComMatrixWithoutBuffer(&Mat_Com, MatA);               \
      MTH3D_M_vTranspMatrix(&Mat_Tmp, &Mat_Com );                    \
      det=MTH3D_M_xDetMatrix( MatA );                                \
      MTH3D_M_vDivScalarMatrix(MatDest, &Mat_Tmp, det );             \
}
 
/************************************************************************************************************************/
/* MTH3D_M_vTransformVectorWithoutBuffer*/
/************************************************************************************************************************/
/* 39 clocks (1 penalty) 41 instructions : 9.72 % pairing*/
#if defined(OPTIMIZED_FOR_PC_FLOATS_WITH_ASM)
/*
 * VectDest = MatA x VectA + VectB (x87 inline assembly).
 *
 * VectB is loaded first and the three column products are accumulated onto
 * it, so the summation order is ((VectB + col0) + col1) + col2; the ORG macro
 * instead adds VectB after the matrix-vector product, so the last bits of the
 * result may differ between the two variants.
 * Every load happens before the first store to VectDest, so VectDest may be
 * the same object as VectA or VectB.
 * Up to 6 x87 stack slots are used; the stack is balanced on exit.
 */
INLINE void MTH3D_M_vTransformVectorWithoutBufferASM(MTH3D_tdstVector *VectDest,MTH3D_tdstMatrix *MatA,MTH3D_tdstVector *VectA,MTH3D_tdstVector *VectB)
{
	__asm
	{
		mov ecx,VectB
		mov edx,VectDest
		mov ebx,VectA
		mov eax,MatA
 
		/* accumulators start as VectB : stack becomes  Bz By Bx */
		fld  dword ptr [ecx]
		fld  dword ptr [ecx+4]
		fld  dword ptr [ecx+8]
 
		/* VectA.xX times column 0 */
		fld  dword ptr [ebx]
		fmul dword ptr [eax]
		fld	 dword ptr [ebx]
		fmul dword ptr [eax+4]
		fld	 dword ptr [ebx]
		fmul dword ptr [eax+8]
 
		/* add component-wise onto the accumulators : stack is again  z y x */
		fxch st(2)
		faddp st(5),st
		faddp st(3),st
		faddp st(1),st
 
		/* VectA.xY times column 1 */
		fld  dword ptr [ebx+4]
		fmul dword ptr [eax+12]
		fld	 dword ptr [ebx+4]
		fmul dword ptr [eax+16]
		fld	 dword ptr [ebx+4]
		fmul dword ptr [eax+20]
 
		fxch st(2)
		faddp st(5),st
		faddp st(3),st
		faddp st(1),st
 
		/* VectA.xZ times column 2 */
		fld  dword ptr [ebx+8]
		fmul dword ptr [eax+24]
		fld	 dword ptr [ebx+8]
		fmul dword ptr [eax+28]
		fld	 dword ptr [ebx+8]
		fmul dword ptr [eax+32]
 
		fxch st(2)
		faddp st(5),st
		faddp st(3),st
		faddp st(1),st
 
		/* bring x to the top, then store x, y, z in order */
		fxch st(2)
 
		fstp dword ptr [edx]		/* 1 penalty : unavoidable*/
		fstp dword ptr [edx+4]
		fstp dword ptr [edx+8]
 
	}
}
#endif /* OPTIMIZED_FOR_PC_FLOATS_WITH_ASM*/
 
#if defined(OPTIMIZED_FOR_PC_FLOATS)
	#define MTH3D_M_vTransformVectorWithoutBufferC	MTH3D_M_vTransformVectorWithoutBufferORG
#endif /* OPTIMIZED_FOR_PC_FLOATS*/
 
/*
 * VectDest = MatA x VectA + VectB (generic version): a matrix-vector product
 * into VectDest followed by an in-place addition of VectB.
 * VectDest is written by the first step and VectB is only read by the second,
 * so VectDest must not be the same object as VectB.
 * Original measurement: 79 clocks (30 penalties) 54 instructions : 3.70 % pairing.
 */
#define MTH3D_M_vTransformVectorWithoutBufferORG( VectDest, MatA, VectA, VectB)    \
    {                                                                              \
        MTH3D_M_vMulMatrixVectorWithoutBuffer(VectDest, MatA, VectA);              \
        MTH3D_M_vAddVector(VectDest,VectDest,VectB);                               \
    }
 
#if defined(OPTIMIZED_FOR_U64_ASM)
/* requires no temporary buffer if VectDest==VectA*/
/*
 * VectDest = MatA x VectA + VectB, in MIPS FPU assembly.
 *
 * VectA is loaded into $f6/$f8/$f10 first; once the products no longer need
 * them, the same registers are reloaded with VectB. All loads precede the
 * first store, so VectDest may be the same object as VectA or VectB.
 * MatA is read at byte offsets 0..32 (nine consecutive floats).
 *
 * Scratch registers: $f0..$f18 (even registers only), declared as clobbers.
 * "memory" is declared as well because the operands are read and written
 * through pointers only, which the compiler cannot see otherwise.
 * The .set noreorder/.set reorder directives live in the same asm statement as
 * the code they protect: separate asm statements are not guaranteed to stay
 * adjacent, and compiler-generated instructions must not end up in between.
 */
static inline void MTH3D_M_vTransformVectorWithoutBufferU64ASM( struct MTH3D_tdstVector_ *VectDest, struct MTH3D_tdstMatrix_ *MatA, struct MTH3D_tdstVector_ *VectA, struct MTH3D_tdstVector_ *VectB)
{
/*   GGG   RRR    OOO   U   U  M   M  PPP   FFFF  !!  !!
    G   G  R  R  O   O  U   U  MM MM  P  P  F     !!  !!
    G      RRR   O   O  U   U  M M M  PPP   FFF   !!  !!
    G  GG  R R   O   O  U   U  M   M  P     F
     GGG   R  R   OOO    UUU   M   M  P     F     oo  oo

     PLEASE DO NOT REMOVE ASM IN LOWER CASE !!!!!*/
	asm(
	" .set noreorder \n"
	" # Begin TransformVectorWithoutBufferU64ASM \n"
	" lwc1 $f6,0(%2) \n"
	" lwc1 $f8,4(%2) \n"
	" lwc1 $f10,8(%2) \n"
	" lwc1 $f0,0(%1)				# f0  <- Mat[0,0] \n"
	" lwc1 $f2,12(%1)				# f2  <- Mat[0,1] \n"
	" mul.s $f0,$f0,$f6			# f0  <- Mat[0,0] x Vect[0] \n"
	" lwc1 $f4,24(%1)				# f4  <- Mat[0,2] \n"
	" mul.s $f2,$f2,$f8			# f2  <- Mat[0,1] x Vect[1] \n"
	" lwc1 $f12,4(%1)				# f12 <- Mat[1,0] \n"
	" mul.s $f4,$f4,$f10		# f4  <- Mat[0,2] x Vect[2] \n"
	" lwc1 $f14,16(%1)			# f14 <- Mat[1,1] \n"
	" mul.s $f12,$f12,$f6		# f12 <- Mat[1,0] x Vect[0] \n"
	" add.s $f0,$f0,$f2			# f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] \n"
	" lwc1 $f16,28(%1)			# f16 <- Mat[1,2] \n"
	" mul.s $f14,$f14,$f8		# f14 <- Mat[1,1] x Vect[1] \n"
	" add.s $f0,$f0,$f4			# f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] \n"
	" lwc1 $f18,8(%1)				# f18 <- Mat[2,0] \n"
	" mul.s $f16,$f16,$f10	# f16 <- Mat[1,2] x Vect[2] \n"
	" add.s $f12,$f12,$f14	# f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] \n"
	" lwc1 $f2,20(%1)				# f2  <- Mat[2,1] \n"
	" mul.s $f18,$f18,$f6		# f18 <- Mat[2,0] x Vect[0] \n"
	" add.s $f12,$f12,$f16	# f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] \n"
	" lwc1 $f4,32(%1)				# f4  <- Mat[2,2] \n"
	" mul.s $f2,$f2,$f8			# f2  <- Mat[2,1] x Vect[1] \n"
	" lwc1 $f6,0(%3)				# f6  <- VctB[0] \n"
	" mul.s $f4,$f4,$f10		# f4  <- Mat[2,2] x Vect[2] \n"
	" add.s $f18,$f18,$f2		# f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] \n"
	" lwc1 $f8,4(%3)				# f8  <- VctB[1] \n"
	" add.s $f18,$f18,$f4		# f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] \n"
	" lwc1 $f10,8(%3)				# f10 <- VctB[2] \n"
	" add.s $f0,$f0,$f6			# f0  <- Mat[0,0] x Vect[0] + Mat[0,1] x Vect[1] + Mat[0,2] x Vect[2] + VctB[0] \n"
	" add.s $f12,$f12,$f8		# f12 <- Mat[1,0] x Vect[0] + Mat[1,1] x Vect[1] + Mat[1,2] x Vect[2] + VctB[1] \n"
	" swc1 $f0,0(%0)				# f0  -> Dest[0] \n"
	" add.s $f18,$f18,$f10	# f18 <- Mat[2,0] x Vect[0] + Mat[2,1] x Vect[1] + Mat[2,2] x Vect[2] + VctB[2] \n"
	" swc1 $f12,4(%0)				# f12 -> Dest[1] \n"
	" swc1 $f18,8(%0)				# f18 -> Dest[2] \n"
	" # EndOf TransformVectorWithoutBufferU64ASM \n"
	" .set reorder \n"
	: : "r" (VectDest), "r" (MatA), "r" (VectA) , "r" (VectB) : "$f0","$f2","$f4","$f6","$f8","$f10","$f12","$f14","$f16","$f18","memory" );
}
#endif /* OPTIMIZED_FOR_U64_ASM*/
 
#endif /* MTH_ASM_H*/