TITLE   vprod_mmx
;***************************************************************************/
;*
;*    INTEL Corporation Proprietary Information
;*
;*    This listing is supplied under the terms of a CNDA agreement
;*    with INTEL Corporation and may not be disclosed except
;*    in accordance with the terms of that agreement.
;*
;*    Copyright (c) 1995 Intel Corporation.
;*    All rights reserved.
;*
;***************************************************************************/
;
; Syntax: MASM (Intel operand order), 32-bit flat model, C calling convention.
; MMX mnemonics are supplied by the iammx.inc emulator macros.
;
;****************************************************************************
; prevent listing of iammx.inc file
.nolist
INCLUDE iammx.inc                       ; IAMMX Emulator Macros
.list

.586
.model FLAT

;****************************************************************************
;   Data Segment Declarations
;****************************************************************************
.data

;****************************************************************************
;   Constant Segment Declarations
;****************************************************************************
.const

;****************************************************************************
;   Code Segment Declarations
;****************************************************************************
.code

VDP_BLOCK       EQU     16              ; elements consumed per MMX loop pass
VDP_BLOCK_BYTES EQU     VDP_BLOCK * 2   ; same, in bytes (16-bit elements)

;----------------------------------------------------------------------------
; void vprod_mmx ( int16 *src1, int16 *src2, int prdsize, int16 *result );
;
; Computes the dot product of two arrays of signed 16-bit values and writes
; it, saturated to signed 16 bits, to *result.
;
; In:    s1Ptr   -> first source array  (at least prdsize elements)
;        s2Ptr   -> second source array (at least prdsize elements)
;        prdsize  = number of elements; any value is accepted, a value
;                   <= 0 yields a result of 0
;        resPtr  -> one int16 that receives the result
; Out:   *resPtr  = saturate16( sum of src1[i] * src2[i] )
;                   Exactly 2 bytes are written.
; Notes: Products are summed in 32-bit lanes; those sums wrap on overflow
;        and only the final value is saturated.
;        Full groups of 16 elements go through the MMX loop; the remaining
;        0..15 elements are handled by a scalar loop.
; Clobb: eax, ecx, edx, flags, mm0-mm4, mm7.  edi, esi and ebx are
;        preserved by the USES clause.  EMMS is executed before returning,
;        so the caller may use x87 floating point immediately afterwards.
;----------------------------------------------------------------------------
vprod_mmx PROC NEAR C USES edi esi ebx, \
        s1Ptr:PTR SWORD, s2Ptr:PTR SWORD, prdsize:SDWORD, resPtr:PTR SWORD

        mov     eax, s1Ptr              ; eax -> next src1 element
        mov     ebx, s2Ptr              ; ebx -> next src2 element
        mov     ecx, prdsize            ; ecx  = elements still to process
        pxor    mm7, mm7                ; mm7  = two 32-bit partial sums
        pxor    mm4, mm4                ; mm4  = 0, upper half of final pack

        cmp     ecx, VDP_BLOCK
        jl      vdp_tail                ; signed: short or non-positive count

; Invariant at vdp16: ecx >= VDP_BLOCK, so 16 elements are readable from
; both eax and ebx.  Loads are interleaved with multiplies as in the
; original schedule.
vdp16:
        movq    mm0, [eax]              ; s1[0..3]
        movq    mm1, [ebx]              ; s2[0..3]
        movq    mm2, [eax+8]            ; s1[4..7]
        pmaddwd mm0, mm1                ; s1[0]*s2[0]+s1[1]*s2[1] :: s1[2]*s2[2]+s1[3]*s2[3]
        paddd   mm7, mm0                ; accumulate
        movq    mm3, [ebx+8]            ; s2[4..7]
        movq    mm0, [eax+16]           ; s1[8..11]
        pmaddwd mm2, mm3                ; s1[4]*s2[4]+s1[5]*s2[5] :: s1[6]*s2[6]+s1[7]*s2[7]
        paddd   mm7, mm2                ; accumulate
        movq    mm1, [ebx+16]           ; s2[8..11]
        movq    mm2, [eax+24]           ; s1[12..15]
        pmaddwd mm0, mm1                ; s1[8]*s2[8]+s1[9]*s2[9] :: s1[10]*s2[10]+s1[11]*s2[11]
        paddd   mm7, mm0                ; accumulate
        pmaddwd mm2, [ebx+24]           ; s1[12]*s2[12]+s1[13]*s2[13] :: s1[14]*s2[14]+s1[15]*s2[15]
        paddd   mm7, mm2                ; accumulate
        add     eax, VDP_BLOCK_BYTES    ; advance src1
        add     ebx, VDP_BLOCK_BYTES    ; advance src2
        sub     ecx, VDP_BLOCK
        cmp     ecx, VDP_BLOCK
        jge     vdp16                   ; another full group available

; Here ecx < VDP_BLOCK.  Handle the leftover elements one at a time.
vdp_tail:
        test    ecx, ecx
        jle     store                   ; nothing left (or prdsize <= 0)
        xor     edi, edi                ; edi = scalar partial sum
vdp1:
        movsx   esi, WORD PTR [eax]     ; sign-extend s1[i]
        movsx   edx, WORD PTR [ebx]     ; sign-extend s2[i]
        imul    esi, edx                ; 16x16 product always fits in 32 bits
        add     edi, esi
        add     eax, 2
        add     ebx, 2
        dec     ecx
        jnz     vdp1
        movdt   mm0, edi                ; low dword = tail sum, high dword = 0
        paddd   mm7, mm0                ; fold tail into the low lane

; Reduce the two 32-bit lanes to one value; each step depends on the
; previous one.
store:
        movq    mm0, mm7                ; copy accumulator
        psrlq   mm7, 32                 ; move high lane down to the low lane
        paddd   mm7, mm0                ; low dword = high + low partial sums
        packssdw mm7, mm4               ; low word = sum with signed saturation
        movdf   eax, mm7                ; ax = saturated result
        mov     edx, resPtr
        mov     [edx], ax               ; store 16 bits only: *result is one int16
        emms                            ; leave MMX state so x87 code works after return
        ret

vprod_mmx ENDP

END