TITLE vprod_mmx
;***************************************************************************/
;*
;* INTEL Corporation Proprietary Information
;*
;* This listing is supplied under the terms of a CNDA agreement
;* with INTEL Corporation and may not be disclosed except
;* in accordance with the terms of that agreement.
;*
;* Copyright (c) 1995 Intel Corporation.
;* All rights reserved.
;*
;***************************************************************************/
;
; Assembler dialect: MASM (PROC/USES, .model FLAT, COMMENT blocks).
; MMX mnemonics are supplied by the macros in iammx.inc.
;
;****************************************************************************
; prevent listing of iammx.inc file
.nolist
INCLUDE iammx.inc ; IAMMX Emulator Macros
.list

.586
.model FLAT

;****************************************************************************
; Data Segment Declarations
;****************************************************************************
.data

;****************************************************************************
; Constant Segment Declarations
;****************************************************************************
.const


;****************************************************************************
; Code Segment Declarations
;****************************************************************************
.code
COMMENT ^
void vprod_mmx (
int16 *src1,
int16 *src2,
int prdsize,
int16 *result ) ;
^

;----------------------------------------------------------------------------
; void vprod_mmx(int16 *src1, int16 *src2, int prdsize, int16 *result)
;
; Dot product of two signed 16-bit vectors.
;
; ABI:    32-bit C calling convention (MASM "PROC NEAR C"); the arguments
;         are read from the stack frame MASM builds for the PROC.
; In:     s1Ptr, s2Ptr = the two input vectors, prdsize elements each
;         prdsize      = element count; values <= 0 produce a result of 0
;         resPtr       = receives exactly one 16-bit value
; Out:    *resPtr = sum(s1[i] * s2[i]), saturated to signed 16 bits.
;         The running sum is kept in 32 bits and wraps on overflow
;         (paddd / add) before the final saturation.
; Clobb:  eax, ecx, edx, flags, mm0-mm4, mm7.
;         ebx, esi, edi are saved/restored by the USES clause.
;         emms is issued before returning, so the caller may use x87
;         floating point immediately afterwards.
;
; Elements are processed 16 per iteration with pmaddwd; any remaining
; (prdsize mod 16) elements are handled by a scalar loop.
;----------------------------------------------------------------------------
vprod_mmx PROC NEAR C USES edi esi ebx,
        s1Ptr:PTR SWORD, s2Ptr:PTR SWORD,
        prdsize:SDWORD, resPtr:PTR SWORD
; LOCAL src:PTR SWORD, num:PTR SDWORD
; above (commented) line is an example of declaring local parameters

; int 3

        pxor    mm7, mm7                ; mm7 = two 32-bit partial sums (accumulator)
        pxor    mm4, mm4                ; mm4 = 0, second source for packssdw
        xor     esi, esi                ; esi = sum of the leftover (scalar) products

        mov     ecx, prdsize            ; number of elements in each array
        test    ecx, ecx
        jle     store                   ; empty / negative length: result is 0

        mov     eax, s1Ptr              ; src1 pointer
        mov     ebx, s2Ptr              ; src2 pointer
        mov     edi, ecx
        and     edi, 15                 ; edi = elements left over after the 16-wide loop
        and     ecx, 0FFFFFFF0h         ; ecx = elements handled 16 at a time
        jz      tail                    ; fewer than 16 elements in total

vdp16:                                  ; invariant: ecx > 0 and a multiple of 16
        movq    mm0, [eax]              ; s1[0..3]
        movq    mm1, [ebx]              ; s2[0..3]
        movq    mm2, [eax+8]            ; s1[4..7]
        pmaddwd mm0, mm1                ; s1[0]*s2[0] + s1[1]*s2[1]::s1[2]*s2[2] + s1[3]*s2[3]
        paddd   mm7, mm0                ; Accumulate

        movq    mm3, [ebx+8]            ; s2[4..7]
        movq    mm0, [eax+16]           ; s1[8..11]
        pmaddwd mm2, mm3                ; s1[4]*s2[4] + s1[5]*s2[5]::s1[6]*s2[6] + s1[7]*s2[7]
        paddd   mm7, mm2                ; Accumulate

        movq    mm1, [ebx+16]           ; s2[8..11]
        movq    mm2, [eax+24]           ; s1[12..15]
        pmaddwd mm0, mm1                ; s1[8]*s2[8] + s1[9]*s2[9]::s1[10]*s2[10] + s1[11]*s2[11]
        paddd   mm7, mm0                ; Accumulate

        pmaddwd mm2, [ebx+24]           ; s1[12]*s2[12] + s1[13]*s2[13]::s1[14]*s2[14] + s1[15]*s2[15]
        paddd   mm7, mm2                ; Accumulate

        add     eax, 32                 ; advance src1 by 16 elements
        add     ebx, 32                 ; advance src2 by 16 elements
        sub     ecx, 16
        jnz     vdp16

tail:
        test    edi, edi
        jz      store                   ; length was a multiple of 16

vdp1:                                   ; invariant: edi = leftover elements still to do, > 0
        movsx   ecx, WORD PTR [eax]     ; sign-extend s1[i]
        movsx   edx, WORD PTR [ebx]     ; sign-extend s2[i]
        imul    ecx, edx
        add     esi, ecx                ; 32-bit wrapping add, same as paddd
        add     eax, 2
        add     ebx, 2
        dec     edi
        jnz     vdp1

store:
        movdt   mm0, esi                ; mm0 = 0 : leftover sum
        paddd   mm7, mm0                ; fold leftover sum into the low partial sum

        movq    mm0, mm7                ; Copy accumulator
        psrlq   mm7, 32                 ; Move high order 32 bits down to the low dword
        paddd   mm7, mm0                ; Add high and low order 32 bits of accumulation
        packssdw mm7, mm4               ; Pack the result back to 16 bits, w/ signed saturation

        movdf   ecx, mm7                ; cx = saturated dot product
        mov     edx, resPtr             ; pointer to the result
        mov     [edx], cx               ; store 16 bits only: *result is a single int16

        emms                            ; leave the FPU usable for the caller

; int 3

        ret

vprod_mmx ENDP

END