TITLE   vprod_mmx
;***************************************************************************/
;*
;*                INTEL Corporation Proprietary Information  
;*
;*     This listing is supplied under the terms of a CNDA agreement
;*     with INTEL Corporation and may not be disclosed except 
;*     in accordance with the terms of that agreement.
;*      
;*                  Copyright (c) 1995 Intel Corporation.
;*                         All rights reserved.
;*
;***************************************************************************/
;
;  
;  
;****************************************************************************
; prevent listing of iammx.inc file
.nolist
INCLUDE iammx.inc                   ; IAMMX Emulator Macros
.list

.586
.model FLAT

;****************************************************************************
;    Data Segment Declarations
;****************************************************************************
.data

;****************************************************************************
;    Constant Segment Declarations
;****************************************************************************
.const


;****************************************************************************
;    Code Segment Declarations
;****************************************************************************
.code
COMMENT ^
void vprod_mmx (
    int16 *src1,
    int16 *src2,
    int   prdsize,
    int16 *result ) ;
^

;----------------------------------------------------------------------------
; vprod_mmx - dot product of two signed 16-bit vectors, saturated to int16
;
; C prototype:
;   void vprod_mmx(int16 *src1, int16 *src2, int prdsize, int16 *result);
;
; In:    s1Ptr   -> first source vector  (prdsize signed words)
;        s2Ptr   -> second source vector (prdsize signed words)
;        prdsize =  element count; any value is accepted.  Full blocks of
;                   16 elements go through the MMX loop, the 0..15 leftover
;                   elements through a scalar loop.  prdsize <= 0 reads
;                   nothing and stores 0.
;        resPtr  -> receives exactly ONE signed word
; Out:   *resPtr =  sum(src1[i] * src2[i]) packed with signed saturation.
;                   The sum itself is kept in 32 bits and wraps on overflow
;                   (paddd does not saturate).
; Uses:  eax = src1 cursor, ebx = src2 cursor, ecx = element counter,
;        edx/esi = scratch, mm0-mm3 = operands, mm4 = zero, mm7 = accumulator
; Clobb: eax, ecx, edx, flags, mm0-mm4, mm7 (ebx and esi are preserved)
;
; NOTE(review): no EMMS is executed here; the caller must issue one before
;               running x87 floating-point code.
; NOTE(review): the register-operand forms of the movdt/movdf macros are
;               assumed to be provided by iammx.inc - confirm.
;----------------------------------------------------------------------------
vprod_mmx PROC NEAR C USES esi ebx,
            s1Ptr:PTR SWORD, s2Ptr:PTR SWORD,
            prdsize:SDWORD, resPtr:PTR SWORD

    pxor     mm4, mm4           ; zero: second operand of the final pack
    mov      eax, s1Ptr         ; src1 pointer
    mov      ebx, s2Ptr         ; src2 pointer
    mov      ecx, prdsize       ; number of elements in each array
    pxor     mm7, mm7           ; accumulator = 0 : 0

    ; Bias the counter by one block so the loop needs a single sub/jcc:
    ; inside the loop ecx = elements left AFTER the block being processed.
    sub      ecx, 16
    jl       vdp_tail           ; signed prdsize < 16: no full block at all

vdp16:                          ; ---- 16 elements per iteration ----
    movq     mm0, [eax]         ; s1[0..3]

    movq     mm1, [ebx]         ; s2[0..3]

    movq     mm2, [eax+8]       ; s1[4..7]
    pmaddwd  mm0, mm1           ; s1[0]*s2[0]+s1[1]*s2[1] : s1[2]*s2[2]+s1[3]*s2[3]

    paddd    mm7, mm0           ; accumulate

    movq     mm3, [ebx+8]       ; s2[4..7]

    movq     mm0, [eax+16]      ; s1[8..11]
    pmaddwd  mm2, mm3           ; products of elements 4..7

    paddd    mm7, mm2           ; accumulate

    movq     mm1, [ebx+16]      ; s2[8..11]

    movq     mm2, [eax+24]      ; s1[12..15]
    pmaddwd  mm0, mm1           ; products of elements 8..11

    paddd    mm7, mm0           ; accumulate

    pmaddwd  mm2, [ebx+24]      ; products of elements 12..15
    paddd    mm7, mm2           ; accumulate

    add      eax, 32            ; advance src1 by 16 words
    add      ebx, 32            ; advance src2 by 16 words

    sub      ecx, 16
    jge      vdp16              ; at least one more full block remains

vdp_tail:
    add      ecx, 16            ; undo the bias: leftover count (or prdsize < 16)
    test     ecx, ecx           ; re-test: flags of the add are unreliable if it wrapped
    jle      store              ; nothing left, or prdsize was <= 0

vdp1:                           ; ---- leftover elements, one at a time ----
    movsx    edx, WORD PTR [eax]
    movsx    esi, WORD PTR [ebx]
    imul     edx, esi           ; s1[i] * s2[i], exact in 32 bits
    movdt    mm0, edx           ; mm0 = 0 : product
    paddd    mm7, mm0           ; add into the low accumulator lane
    add      eax, 2
    add      ebx, 2
    dec      ecx
    jnz      vdp1

store:
    movq     mm0, mm7           ; copy accumulator
    psrlq    mm7, 32            ; bring the high 32-bit partial sum down
    paddd    mm7, mm0           ; low dword = high partial + low partial
    packssdw mm7, mm4           ; low word = total with signed saturation

    ; Store one word only: the second packed word holds the saturated high
    ; partial sum, which must not be written through an int16 pointer.
    movdf    eax, mm7
    mov      edx, resPtr        ; pointer to the result
    mov      WORD PTR [edx], ax

    ret

vprod_mmx ENDP
END