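; Source path: reman3/Rayman_X/cpa/masm/VDPMMX16.ASM
;
; Listing metadata from the code viewer: 122 lines, 3.9 KiB.
; The viewer tagged this file as NASM, but the directives used below
; (TITLE, .model, PROC/ENDP, COMMENT) are MASM syntax.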

TITLE vprod_mmx
;***************************************************************************/
;*
;* INTEL Corporation Proprietary Information
;*
;* This listing is supplied under the terms of a CNDA agreement
;* with INTEL Corporation and may not be disclosed except
;* in accordance with the terms of that agreement.
;*
;* Copyright (c) 1995 Intel Corporation.
;* All rights reserved.
;*
;***************************************************************************/
;
;
;
;****************************************************************************
; prevent listing of iammx.inc file
; (.nolist/.list bracket the include so the macro bodies do not appear in
;  the assembler listing output)
.nolist
INCLUDE iammx.inc ; IAMMX Emulator Macros
                  ; NOTE(review): presumably this is what provides the MMX
                  ; mnemonics used in the code segment below - confirm
                  ; against the include file.
.list
.586              ; accept Pentium-class instructions
.model FLAT       ; 32-bit flat memory model
;****************************************************************************
; Data Segment Declarations
;****************************************************************************
; (intentionally empty: this file declares no initialized data)
.data
;****************************************************************************
; Constant Segment Declarations
;****************************************************************************
; (intentionally empty: this file declares no constants)
.const
;****************************************************************************
; Code Segment Declarations
;****************************************************************************
.code
;----------------------------------------------------------------------------
; void vprod_mmx (
;         int16 *src1,
;         int16 *src2,
;         int    prdsize,
;         int16 *result ) ;
;
; Dot product of two arrays of signed 16-bit values, computed with MMX.
;
; ABI:    32-bit C calling convention (PROC NEAR C), flat model.
; In:     s1Ptr   - first source array
;         s2Ptr   - second source array
;         prdsize - number of elements in each array.  The loop consumes
;                   16 elements per pass, so only whole groups of 16 are
;                   processed: any trailing (prdsize mod 16) elements are
;                   NOT included in the sum.  A value <= 0 yields 0.
;         resPtr  - receives exactly one signed 16-bit result
; Out:    *resPtr = sum of s1[i]*s2[i], accumulated in two 32-bit lanes
;                   (paddd wraps on overflow), the lanes added together,
;                   then saturated to the signed 16-bit range.
; Clobb:  eax, ecx, edx, flags, mm0-mm4, mm7.  ebx is saved/restored by
;         USES.  EMMS is executed before returning, so the caller may use
;         x87 floating point immediately afterwards.
; Memory: reads 32 bytes from each source per pass (no alignment required
;         by the instructions used); writes 2 bytes at resPtr.
;----------------------------------------------------------------------------
vprod_mmx PROC NEAR C USES ebx,
        s1Ptr:PTR SWORD, s2Ptr:PTR SWORD,
        prdsize:SDWORD, resPtr:PTR SWORD

        mov     eax, s1Ptr              ; eax -> current position in src1
        mov     ebx, s2Ptr              ; ebx -> current position in src2
        mov     ecx, prdsize            ; ecx =  elements still to process
        mov     edx, resPtr             ; edx -> result word
        pxor    mm4, mm4                ; zero: upper-half filler for packssdw
        pxor    mm7, mm7                ; mm7 = two 32-bit partial sums = 0

        ; Keep only whole 16-element groups.  This guarantees the loop
        ; counter reaches exactly zero and that nothing is read past the
        ; last complete group.  AND leaves OF=0, so JLE is taken when the
        ; rounded count is zero or negative.
        and     ecx, -16
        jle     store                   ; nothing to do -> result is 0

vdp16:
        ; One pass = 16 elements = four 4-element multiply-accumulates.
        ; Loads for the next group are interleaved with the arithmetic of
        ; the current one.
        movq    mm0, [eax]              ; s1[0..3]
        movq    mm1, [ebx]              ; s2[0..3]
        movq    mm2, [eax+8]            ; s1[4..7]
        pmaddwd mm0, mm1                ; s1[0]*s2[0]+s1[1]*s2[1] :: s1[2]*s2[2]+s1[3]*s2[3]
        paddd   mm7, mm0                ; accumulate
        movq    mm3, [ebx+8]            ; s2[4..7]
        movq    mm0, [eax+16]           ; s1[8..11]
        pmaddwd mm2, mm3                ; s1[4]*s2[4]+s1[5]*s2[5] :: s1[6]*s2[6]+s1[7]*s2[7]
        paddd   mm7, mm2                ; accumulate
        movq    mm1, [ebx+16]           ; s2[8..11]
        movq    mm2, [eax+24]           ; s1[12..15]
        pmaddwd mm0, mm1                ; s1[8]*s2[8]+s1[9]*s2[9] :: s1[10]*s2[10]+s1[11]*s2[11]
        paddd   mm7, mm0                ; accumulate
        pmaddwd mm2, [ebx+24]           ; s1[12]*s2[12]+s1[13]*s2[13] :: s1[14]*s2[14]+s1[15]*s2[15]
        paddd   mm7, mm2                ; accumulate
        add     eax, 32                 ; advance src1 by 16 words
        add     ebx, 32                 ; advance src2 by 16 words
        sub     ecx, 16                 ; 16 fewer elements remaining
        jnz     vdp16                   ; ecx is a positive multiple of 16 here

store:
        movq    mm0, mm7                ; copy both partial sums
        psrlq   mm7, 32                 ; bring the high partial sum down to the low dword
        paddd   mm7, mm0                ; low dword = high + low partial sums
        packssdw mm7, mm4               ; saturate to 16 bits; word 0 is the result
        movdf   eax, mm7                ; low 32 bits of the packed value -> eax
        emms                            ; release MMX state so x87 code works after return
        mov     WORD PTR [edx], ax      ; store only the 16-bit result (word 0)
        ret
vprod_mmx ENDP
END