1273 lines
18 KiB
NASM
1273 lines
18 KiB
NASM
![]() |
; want 8-byte alignment really!!
|
||
|
_DATA SEGMENT DWORD PUBLIC 'DATA'
|
||
|
|
||
|
|
||
|
PUBLIC _use_mmx_math
|
||
|
PUBLIC _mmx_sign_mask
|
||
|
PUBLIC _mmx_one_fixed_h
|
||
|
|
||
|
align
|
||
|
_mmx_sign_mask:QWORD 0000800000008000h
|
||
|
_mmx_one_fixed_h:QWORD 0001000000000000h
|
||
|
_mmx_one_fixed_hl:QWORD 0001000000010000h
|
||
|
_mmx_one_hl:QWORD 0000000100000001h
|
||
|
store1:QWORD ?
|
||
|
_use_mmx_math:DWORD 1
|
||
|
|
||
|
|
||
|
|
||
|
_DATA ENDS
|
||
|
|
||
|
|
||
|
|
||
|
; want 16-byte alignment really!!
|
||
|
_TEXT SEGMENT DWORD PUBLIC 'CODE'
|
||
|
ASSUME cs:_TEXT, ds:_DATA
|
||
|
|
||
|
.586
|
||
|
|
||
|
PUBLIC MMXAsm_VectorDot_
|
||
|
PUBLIC MMXAsm_VectorDot16_
|
||
|
PUBLIC MMXAsm_VectorTransformed_
|
||
|
PUBLIC MMXAsm_VectorTransform_
|
||
|
PUBLIC MMXAsm_VectorTransformedAndAdd_
|
||
|
PUBLIC MMXAsm_VectorTransformAndAdd_
|
||
|
|
||
|
PUBLIC _MMXAsm_VectorDot
|
||
|
PUBLIC _MMXAsm_VectorDot16
|
||
|
PUBLIC _MMXAsm_VectorTransformed
|
||
|
PUBLIC _MMXAsm_VectorTransform
|
||
|
PUBLIC _MMXAsm_VectorTransformedAndAdd
|
||
|
PUBLIC _MMXAsm_VectorTransformAndAdd
|
||
|
|
||
|
align
|
||
|
_MMXAsm_VectorDot:
|
||
|
MMXAsm_VectorDot_:
|
||
|
|
||
|
if 0
|
||
|
; This is the unoptimized version
|
||
|
|
||
|
; get the data
|
||
|
movq mm0,[edx]
|
||
|
movq mm1,[eax]
|
||
|
movd mm2,[edx+08h]
|
||
|
movd mm3,[eax+08h]
|
||
|
|
||
|
|
||
|
; get it into signed fixed format
|
||
|
movq mm4,mm0
|
||
|
movq mm5,mm1
|
||
|
movq mm6,mm2
|
||
|
movq mm7,mm3
|
||
|
|
||
|
pand mm4,_mmx_sign_mask
|
||
|
pand mm5,_mmx_sign_mask
|
||
|
pand mm6,_mmx_sign_mask
|
||
|
pand mm7,_mmx_sign_mask
|
||
|
|
||
|
paddd mm4,mm4
|
||
|
paddd mm5,mm5
|
||
|
paddd mm6,mm6
|
||
|
paddd mm7,mm7
|
||
|
|
||
|
paddd mm0,mm4
|
||
|
paddd mm1,mm5
|
||
|
paddd mm2,mm6
|
||
|
paddd mm3,mm7
|
||
|
|
||
|
; at this point we have split all 32 bit values
|
||
|
; into 16-bit pairs, high and low, both signed
|
||
|
|
||
|
; mm0: y1h y1l x1h x1l
|
||
|
; mm1: y2h y2l x2h x2l
|
||
|
; mm2: 0 0 z1h z1l
|
||
|
; mm3: 0 0 z2h z2l
|
||
|
|
||
|
; swap 1st and 2nd words in mm0,mm1,mm2,mm3 ??
|
||
|
movq mm4,mm2
|
||
|
movq mm5,mm3
|
||
|
punpcklwd mm4,mm0
|
||
|
; mm4: x1h z1h x1l z1l
|
||
|
punpcklwd mm5,mm1
|
||
|
; mm5: x2h z2h x2l z2l
|
||
|
punpckhwd mm2,mm0
|
||
|
; mm2: y1h 0 y1l 0
|
||
|
punpckhwd mm3,mm1
|
||
|
; mm3: y2h 0 y2l 0
|
||
|
|
||
|
; get the high and low products: x1h*x2h, x1l*x2l, etc
|
||
|
movq mm0,mm2
|
||
|
pmaddwd mm0,mm3
|
||
|
; mm0: y1h*y2h y1l*y2l
|
||
|
movq mm1,mm4
|
||
|
pmaddwd mm1,mm5
|
||
|
; mm1: x1h*x2h+z1h*z2h x1l*x2l+z1l*z2l
|
||
|
|
||
|
; exchange dwords in mm3 and mm5
|
||
|
movq mm6,mm3
|
||
|
movq mm7,mm5
|
||
|
psrlq mm3,32
|
||
|
psrlq mm5,32
|
||
|
punpckldq mm3,mm6
|
||
|
punpckldq mm5,mm7
|
||
|
; mm5: x2l z2l x2h z2h
|
||
|
; mm3: y2l 0 y2h 0
|
||
|
|
||
|
; compute the products x1h*x2l, x1l*x2h, etc
|
||
|
pmaddwd mm2,mm3
|
||
|
; mm2: y1h*y2l y1l*y2h
|
||
|
pmaddwd mm4,mm5
|
||
|
; mm4: x1h*x2l+z1h*z2l x1l*x2h+z1l*z2h
|
||
|
|
||
|
paddd mm2,mm4
|
||
|
; mm2: x1h*x2l+y1h*y2l+z1h*z2l x1l*x2h+y1l*y2h+z1l*z2h
|
||
|
|
||
|
; get the low order dwords of mm0,mm1
|
||
|
movq mm3,mm0
|
||
|
punpckldq mm0,mm1
|
||
|
; mm0: x1l*x2l+z1l*z2l y1l*y2l
|
||
|
|
||
|
; unfortunately, at this point it is possible to have the
|
||
|
; wrong value in mm0: if x1l,x2l,x1l,x2l
|
||
|
; are all -0x8000, the result should
|
||
|
; be +0x80000000, but of course this becomes
|
||
|
; -0x80000000
|
||
|
; in fact the largest +ve value we could have is
|
||
|
; +0x80000000
|
||
|
; and the lowest -ve value we could have is
|
||
|
; -0x7fff0000
|
||
|
; = 0x80010000
|
||
|
; so subtracting ONE at this stage gives us a value
|
||
|
; which is out by ONE, but twos-complement correct
|
||
|
psubd mm0,_mmx_one_fixed_h
|
||
|
|
||
|
; and the high order dwords
|
||
|
punpckhdq mm1,mm3
|
||
|
; mm1: x1h*x2h+z1h*z2h y1h*y2h
|
||
|
; in fact it is swapped, but it doesn't matter
|
||
|
|
||
|
; shift the low order dwords down
|
||
|
psrad mm0,16
|
||
|
; and the high order dwords up
|
||
|
pslld mm1,16
|
||
|
; mm0: x1l*x2l+z1l*z2l>>16 -1 y1l*y2l>>16
|
||
|
; mm1: x1h*x2h+z1h*z2h<<16 y1h*y2h<<16
|
||
|
;(mm2) x1h*x2l+y1h*y2l+z1h*z2l x1l*x2h+y1l*y2h+z1l*z2h
|
||
|
|
||
|
; sum up
|
||
|
paddd mm2,mm0
|
||
|
paddd mm2,mm1
|
||
|
movq mm1,mm2
|
||
|
psrlq mm2,32
|
||
|
paddd mm1,mm2
|
||
|
movd eax,mm1
|
||
|
|
||
|
emms
|
||
|
inc eax
|
||
|
ret
|
||
|
|
||
|
else
|
||
|
;
|
||
|
; Now the optimized version
|
||
|
|
||
|
movq mm0,[edx]
|
||
|
|
||
|
movd mm2,[edx+08h]
|
||
|
movq mm4,mm0
|
||
|
|
||
|
|
||
|
pand mm4,_mmx_sign_mask
|
||
|
movq mm6,mm2
|
||
|
|
||
|
movq mm1,[eax]
|
||
|
paddd mm4,mm4
|
||
|
|
||
|
movd mm3,[eax+08h]
|
||
|
movq mm5,mm1
|
||
|
|
||
|
pand mm6,_mmx_sign_mask
|
||
|
movq mm7,mm3
|
||
|
|
||
|
pand mm5,_mmx_sign_mask
|
||
|
paddd mm6,mm6
|
||
|
|
||
|
pand mm7,_mmx_sign_mask
|
||
|
paddd mm5,mm5
|
||
|
|
||
|
paddd mm0,mm4
|
||
|
paddd mm2,mm6
|
||
|
|
||
|
paddd mm7,mm7
|
||
|
movq mm4,mm2
|
||
|
|
||
|
punpcklwd mm4,mm0
|
||
|
paddd mm1,mm5
|
||
|
|
||
|
punpckhwd mm2,mm0
|
||
|
paddd mm3,mm7
|
||
|
|
||
|
movq mm5,mm3
|
||
|
punpckhwd mm3,mm1
|
||
|
|
||
|
punpcklwd mm5,mm1
|
||
|
movq mm0,mm2
|
||
|
|
||
|
movq mm1,mm4
|
||
|
pmaddwd mm0,mm3
|
||
|
|
||
|
movq mm6,mm3
|
||
|
psrlq mm3,32
|
||
|
|
||
|
movq mm7,mm5
|
||
|
punpckldq mm3,mm6
|
||
|
|
||
|
pmaddwd mm1,mm5
|
||
|
psrlq mm5,32
|
||
|
|
||
|
punpckldq mm5,mm7
|
||
|
pmaddwd mm2,mm3
|
||
|
|
||
|
pmaddwd mm4,mm5
|
||
|
movq mm3,mm0
|
||
|
|
||
|
; these instructions won't pair and I have no instructions I can pair them with
|
||
|
punpckldq mm0,mm1
|
||
|
|
||
|
psubd mm0,_mmx_one_fixed_h
|
||
|
punpckhdq mm1,mm3
|
||
|
|
||
|
psrad mm0,16
|
||
|
paddd mm2,mm4
|
||
|
|
||
|
pslld mm1,16
|
||
|
paddd mm2,mm0
|
||
|
|
||
|
; complete pairing is not possible at this stage - there are too many dependencies
|
||
|
paddd mm2,mm1
|
||
|
|
||
|
movq mm1,mm2
|
||
|
psrlq mm2,32
|
||
|
|
||
|
paddd mm1,mm2
|
||
|
|
||
|
movd eax,mm1
|
||
|
|
||
|
emms
|
||
|
|
||
|
inc eax
|
||
|
ret
|
||
|
|
||
|
endif
|
||
|
|
||
|
; This takes 33 cycles, the orignal C -> nonMMX version takes 80 cycles
|
||
|
|
||
|
align
|
||
|
_MMXAsm_VectorDot16:
|
||
|
MMXAsm_VectorDot16_:
|
||
|
|
||
|
movd mm0,[edx+08h]
|
||
|
|
||
|
packssdw mm0,[edx]
|
||
|
|
||
|
movd mm1,[eax+08h]
|
||
|
|
||
|
packssdw mm1,[eax]
|
||
|
|
||
|
pmaddwd mm0,mm1
|
||
|
|
||
|
movq mm1,mm0
|
||
|
psrlq mm0,32
|
||
|
|
||
|
paddd mm0,mm1
|
||
|
|
||
|
movd eax,mm0
|
||
|
|
||
|
emms
|
||
|
|
||
|
ret
|
||
|
; taking 14 cycles but assuming 16bit input vector fields
|
||
|
|
||
|
|
||
|
align
|
||
|
_MMXAsm_VectorTransformed:
|
||
|
MMXAsm_VectorTransformed_:
|
||
|
|
||
|
if 0
|
||
|
; eax ptr to result
|
||
|
; edx ptr to vector xh, xl, yh, yl, zh, zl
|
||
|
; ecx ptr to matrix a11h, a11l, a12h, etc
|
||
|
|
||
|
; unoptimized version
|
||
|
|
||
|
; NOTE: in the Dot Product there was a problem
|
||
|
; of an internal overflow where -32768*-32768 + -32768*-32768 gave 0x80000000
|
||
|
; which is -ve in two's complement
|
||
|
; the additions and subtractions of ONE to resolve this problem
|
||
|
; are marked '******'
|
||
|
|
||
|
movq mm0,[edx]
|
||
|
movq mm1,mm0
|
||
|
pand mm1,_mmx_sign_mask
|
||
|
paddd mm1,mm1
|
||
|
paddd mm0,mm1
|
||
|
; mm0: yh yl xh xl
|
||
|
|
||
|
movq mm2,[ecx]
|
||
|
movq mm3,mm2
|
||
|
pand mm3,_mmx_sign_mask
|
||
|
paddd mm3,mm3
|
||
|
paddd mm2,mm3
|
||
|
; mm2: a21h a21l a11h a11l
|
||
|
|
||
|
movd mm4,[edx+08h]
|
||
|
movq mm5,mm4
|
||
|
pand mm5,_mmx_sign_mask
|
||
|
paddd mm5,mm5
|
||
|
paddd mm4,mm5
|
||
|
; mm4: 0 0 zh zl
|
||
|
|
||
|
movq mm6,[ecx+18h]
|
||
|
movq mm7,mm6
|
||
|
pand mm7,_mmx_sign_mask
|
||
|
paddd mm7,mm7
|
||
|
paddd mm6,mm7
|
||
|
; mm6: a23h a23l a13h a13l
|
||
|
|
||
|
; interleave
|
||
|
|
||
|
movq mm1,mm0
|
||
|
punpckhwd mm0,mm4
|
||
|
; mm0: 0 yh 0 yl
|
||
|
punpcklwd mm1,mm4
|
||
|
; mm1: zh xh zl xl
|
||
|
|
||
|
movq mm3,mm2
|
||
|
punpckhwd mm2,mm6
|
||
|
; mm2: a23h a21h a23l a21l
|
||
|
punpcklwd mm3,mm6
|
||
|
; mm3: a13h a11h a13l a11l
|
||
|
|
||
|
; get a13*z, a11*x; a23*z a21*x, high and low products
|
||
|
movq mm4,mm1
|
||
|
pmaddwd mm1,mm2
|
||
|
movq mm6,mm4
|
||
|
pmaddwd mm4,mm3
|
||
|
; mm0: 0 yh 0 yl
|
||
|
; mm6: zh xh zl xl
|
||
|
; mm2: a23h a21h a23l a21l
|
||
|
; mm3: a13h a11h a13l a11l
|
||
|
; mm1: zh*a23h+xh*a21h zl*a23l+xl*a21l
|
||
|
; mm4: zh*a13h+xh*a11h zl*a13l+xl*a11l
|
||
|
|
||
|
; exchange dwords in mm6
|
||
|
movq mm7,mm6
|
||
|
psrlq mm6,32
|
||
|
punpckldq mm6,mm7
|
||
|
; mm6: zl xl zh xh
|
||
|
; mm7: zh xh zl xl
|
||
|
|
||
|
; get the high-low 'cross' products
|
||
|
pmaddwd mm2,mm6
|
||
|
pmaddwd mm3,mm6
|
||
|
; mm2: a23h*zl+a21h*xl a23l*zh+a21l*xh
|
||
|
; mm3: a13h*zl+a11h*xl a13l*zh+a11l*xh
|
||
|
|
||
|
; interleave mm1,mm4 and mm2,mm3
|
||
|
movq mm5,mm4
|
||
|
punpckldq mm4,mm1
|
||
|
punpckhdq mm5,mm1
|
||
|
; mm4: zl*a23l+xl*a21l zl*a13l+xl*a11l ******
|
||
|
; mm5: zh*a23h+xh*a21h zh*a13h+xh*a11h
|
||
|
|
||
|
; ******
|
||
|
psubd mm4,_mmx_one_fixed_hl
|
||
|
|
||
|
|
||
|
movq mm1,mm3
|
||
|
punpckldq mm3,mm2
|
||
|
punpckhdq mm1,mm2
|
||
|
; mm1: zl*a23h+xl*a21h zl*a13h+xl*a11h
|
||
|
; mm3: zh*a23l+xh*a21l zh*a13l+xh*a11l
|
||
|
; sum
|
||
|
paddd mm1,mm3
|
||
|
; shift the low order dwords down
|
||
|
psrad mm4,16
|
||
|
; and the high order dwords up
|
||
|
pslld mm5,16
|
||
|
; sum
|
||
|
paddd mm1,mm4
|
||
|
paddd mm1,mm5
|
||
|
; mm1 holding x and y of the result
|
||
|
; mm0: 0 yh 0 yl
|
||
|
; mm1: z*a23+x*a21 z*a13+x*a11
|
||
|
; mm2:
|
||
|
; mm3:
|
||
|
; mm4:
|
||
|
; mm5:
|
||
|
; mm6: zl xl zh xh
|
||
|
; mm7: zh xh zl xl
|
||
|
|
||
|
; grab some more of the matrix
|
||
|
movq mm2,[ecx+08h]
|
||
|
movq mm3,mm2
|
||
|
pand mm3,_mmx_sign_mask
|
||
|
paddd mm3,mm3
|
||
|
paddd mm2,mm3 ; mm7 not mm2 in optimized version
|
||
|
; mm2: a12h a12l a31h a31l
|
||
|
|
||
|
movd mm4,[ecx+20h]
|
||
|
movq mm5,mm4
|
||
|
pand mm5,_mmx_sign_mask
|
||
|
paddd mm5,mm5
|
||
|
paddd mm4,mm5
|
||
|
; mm4: 0 0 a33h a33l
|
||
|
|
||
|
; interleave
|
||
|
movq mm3,mm2
|
||
|
punpcklwd mm2,mm4
|
||
|
; mm2: a33h a31h a33l a31l
|
||
|
psrlq mm3,32
|
||
|
; mm3: 0 0 a12h a12l
|
||
|
|
||
|
; compute mm2 * mm6/7
|
||
|
movq mm4,mm2
|
||
|
pmaddwd mm2,mm7
|
||
|
pmaddwd mm4,mm6
|
||
|
; mm2: a33h*zh+a31h*xh a33l*zl+a31l*xl ******
|
||
|
; mm4: a33h*zl+a31h*xl a33l*zh+a31l*xh
|
||
|
movq mm7,mm2
|
||
|
|
||
|
; ******
|
||
|
psubd mm7,_mmx_one_fixed_hl
|
||
|
|
||
|
pslld mm2,16
|
||
|
psrad mm7,16
|
||
|
paddd mm2,mm4
|
||
|
paddd mm7,mm4
|
||
|
psrlq mm2,32
|
||
|
paddd mm2,mm7
|
||
|
; mm2: ? a33*z+a31*x
|
||
|
|
||
|
|
||
|
|
||
|
; get the rest of the matrix
|
||
|
movq mm5,[ecx+010h]
|
||
|
movq mm6,mm5
|
||
|
pand mm6,_mmx_sign_mask
|
||
|
paddd mm6,mm6
|
||
|
paddd mm5,mm6
|
||
|
; mm5: a32h a32l a22h a22l
|
||
|
; mm3: 0 0 a12h a12l
|
||
|
|
||
|
; mm0: 0 yh 0 yl
|
||
|
movq mm7,mm0
|
||
|
psrlq mm0,32
|
||
|
punpcklwd mm0,mm7
|
||
|
; mm0: 0 0 yl yh
|
||
|
punpckldq mm0,mm0
|
||
|
|
||
|
; mm0: yl yh yl yh
|
||
|
movq mm7,mm0
|
||
|
pmaddwd mm0,mm3
|
||
|
movq mm6,mm7
|
||
|
pmaddwd mm7,mm5
|
||
|
; mm0: 0 yl*a12h+yh*a12l
|
||
|
; mm7: yl*a32h+yh*a32l yl*a22h+yh*a22l
|
||
|
; mm6: yl yh yl yh
|
||
|
punpckldq mm0,mm7
|
||
|
; mm0: yl*a22h+yh*a22l yl*a12h+yh*a12l
|
||
|
paddd mm1,mm0
|
||
|
; mm1: z*a23+x*a21+yl*a22h+yh*a22l z*a13+x*a11+yl*a12h+yh*a12l
|
||
|
psrlq mm7,32
|
||
|
paddd mm2,mm7
|
||
|
; mm2: ? a33*z+a31*x+yl*a32h+yh*a32l
|
||
|
|
||
|
|
||
|
|
||
|
; mm5: a32h a32l a22h a22l
|
||
|
; mm3: 0 0 a12h a12l
|
||
|
; mm6: yl yh yl yh
|
||
|
|
||
|
|
||
|
|
||
|
; get all h and l separate
|
||
|
movq mm4,mm3
|
||
|
punpcklwd mm3,mm5
|
||
|
; mm3: a22h a12h a22l a12l
|
||
|
punpckhwd mm5,mm4
|
||
|
; mm5: 0 a32h 0 a32l
|
||
|
movq mm4,mm3
|
||
|
punpckhdq mm3,mm5
|
||
|
; mm3: 0 a32h a22h a12h
|
||
|
punpckldq mm4,mm5
|
||
|
; mm4: 0 a32l a22l a12l
|
||
|
punpckhwd mm6,mm6
|
||
|
; mm6: yl yl yh yh
|
||
|
movq mm0,mm6
|
||
|
punpckhdq mm6,mm6
|
||
|
; mm6: yl yl yl yl
|
||
|
punpckldq mm0,mm0
|
||
|
; mm0: yh yh yh yh
|
||
|
pmullw mm3,mm0
|
||
|
pmulhw mm4,mm6
|
||
|
; mm3: 0 a32h*yh a22h*yh a12h*yh
|
||
|
; mm4: 0 a32l*yl>>16 a22l*yl>>16 a12l*yl>>16
|
||
|
pxor mm7,mm7
|
||
|
pcmpgtw mm7,mm4
|
||
|
paddw mm3,mm7
|
||
|
|
||
|
movq mm5,mm4
|
||
|
punpcklwd mm4,mm3
|
||
|
punpckhwd mm5,mm3
|
||
|
paddd mm1,mm4
|
||
|
paddd mm2,mm5
|
||
|
|
||
|
; ******
|
||
|
paddd mm1,_mmx_one_hl
|
||
|
paddd mm2,_mmx_one_hl
|
||
|
|
||
|
movq [eax],mm1
|
||
|
movd [eax+08h],mm2
|
||
|
|
||
|
emms
|
||
|
ret
|
||
|
|
||
|
else
|
||
|
;
|
||
|
; optimized version
|
||
|
|
||
|
movq mm0,[edx]
|
||
|
|
||
|
movd mm4,[edx+08h]
|
||
|
movq mm1,mm0
|
||
|
|
||
|
movq mm2,[ecx]
|
||
|
movq mm5,mm4
|
||
|
|
||
|
pand mm1,_mmx_sign_mask
|
||
|
movq mm3,mm2
|
||
|
|
||
|
pand mm5,_mmx_sign_mask
|
||
|
paddd mm1,mm1
|
||
|
|
||
|
movq mm6,[ecx+18h]
|
||
|
paddd mm5,mm5
|
||
|
|
||
|
pand mm3,_mmx_sign_mask
|
||
|
movq mm7,mm6
|
||
|
|
||
|
paddd mm0,mm1
|
||
|
paddd mm3,mm3
|
||
|
|
||
|
pand mm7,_mmx_sign_mask
|
||
|
paddd mm2,mm3
|
||
|
|
||
|
movq mm1,mm0
|
||
|
punpckhwd mm0,mm4
|
||
|
|
||
|
paddd mm4,mm5
|
||
|
paddd mm7,mm7
|
||
|
|
||
|
paddd mm6,mm7
|
||
|
punpcklwd mm1,mm4
|
||
|
|
||
|
movq mm3,mm2
|
||
|
punpckhwd mm2,mm6
|
||
|
|
||
|
punpcklwd mm3,mm6
|
||
|
movq mm4,mm1
|
||
|
|
||
|
movq mm6,mm1
|
||
|
pmaddwd mm4,mm3
|
||
|
|
||
|
movq mm7,mm6
|
||
|
psrlq mm6,32
|
||
|
|
||
|
pmaddwd mm1,mm2
|
||
|
punpckldq mm6,mm7
|
||
|
|
||
|
movq store1,mm7
|
||
|
pmaddwd mm3,mm6
|
||
|
|
||
|
movq mm7,[ecx+08h]
|
||
|
pmaddwd mm2,mm6
|
||
|
|
||
|
movq mm5,mm4
|
||
|
punpckldq mm4,mm1
|
||
|
|
||
|
psubd mm4,_mmx_one_fixed_hl
|
||
|
punpckhdq mm5,mm1
|
||
|
|
||
|
movq mm1,mm7
|
||
|
psrad mm4,16
|
||
|
|
||
|
pand mm1,_mmx_sign_mask
|
||
|
pslld mm5,16
|
||
|
|
||
|
paddd mm1,mm1
|
||
|
paddd mm5,mm4
|
||
|
|
||
|
paddd mm7,mm1
|
||
|
movq mm1,mm3
|
||
|
|
||
|
movd mm4,[ecx+20h]
|
||
|
punpckldq mm3,mm2
|
||
|
|
||
|
paddd mm3,mm5
|
||
|
movq mm5,mm4
|
||
|
|
||
|
pand mm5,_mmx_sign_mask
|
||
|
punpckhdq mm1,mm2
|
||
|
|
||
|
paddd mm1,mm3
|
||
|
paddd mm5,mm5
|
||
|
|
||
|
movq mm2,[ecx+010h]
|
||
|
movq mm3,mm7
|
||
|
|
||
|
paddd mm4,mm5
|
||
|
movq mm5,mm2
|
||
|
|
||
|
pand mm2,_mmx_sign_mask
|
||
|
punpcklwd mm7,mm4
|
||
|
|
||
|
movq mm4,mm7
|
||
|
psrlq mm3,32
|
||
|
|
||
|
pmaddwd mm7,store1
|
||
|
paddd mm2,mm2
|
||
|
|
||
|
pmaddwd mm4,mm6
|
||
|
movq mm6,mm0
|
||
|
|
||
|
psrlq mm0,32
|
||
|
paddd mm5,mm2
|
||
|
|
||
|
punpcklwd mm0,mm6
|
||
|
movq mm2,mm7
|
||
|
|
||
|
psubd mm7,_mmx_one_fixed_hl
|
||
|
pslld mm2,16
|
||
|
|
||
|
psrad mm7,16
|
||
|
paddd mm2,mm4
|
||
|
|
||
|
paddd mm7,mm4
|
||
|
punpckldq mm0,mm0
|
||
|
|
||
|
movq mm6,mm0
|
||
|
psrlq mm2,32
|
||
|
|
||
|
paddd mm2,mm7
|
||
|
movq mm7,mm6
|
||
|
|
||
|
pmaddwd mm0,mm3
|
||
|
punpckhwd mm7,mm7
|
||
|
|
||
|
pmaddwd mm6,mm5
|
||
|
movq mm4,mm3
|
||
|
|
||
|
punpcklwd mm3,mm5
|
||
|
|
||
|
punpckhwd mm5,mm4
|
||
|
movq mm4,mm7
|
||
|
|
||
|
punpckldq mm0,mm6
|
||
|
|
||
|
paddd mm1,mm0
|
||
|
punpckhdq mm7,mm7
|
||
|
|
||
|
movq mm0,mm3
|
||
|
punpckldq mm3,mm5
|
||
|
|
||
|
pmulhw mm3,mm7
|
||
|
punpckhdq mm0,mm5
|
||
|
|
||
|
punpckldq mm4,mm4
|
||
|
|
||
|
pmullw mm0,mm4
|
||
|
psrlq mm6,32
|
||
|
|
||
|
paddd mm2,mm6
|
||
|
pxor mm6,mm6
|
||
|
|
||
|
pcmpgtw mm6,mm3
|
||
|
movq mm5,mm3
|
||
|
|
||
|
paddd mm1,_mmx_one_hl
|
||
|
paddw mm0,mm6
|
||
|
|
||
|
paddd mm2,_mmx_one_hl
|
||
|
punpcklwd mm3,mm0
|
||
|
|
||
|
paddd mm1,mm3
|
||
|
punpckhwd mm5,mm0
|
||
|
|
||
|
paddd mm2,mm5
|
||
|
|
||
|
movq [eax],mm1
|
||
|
|
||
|
movd [eax+08h],mm2
|
||
|
|
||
|
emms
|
||
|
ret
|
||
|
; 63 cycles compared with 204 for the C-nonMMX version
|
||
|
endif
|
||
|
|
||
|
align
|
||
|
_MMXAsm_VectorTransform:
|
||
|
MMXAsm_VectorTransform_:
|
||
|
|
||
|
movq mm0,[eax]
|
||
|
|
||
|
movd mm4,[eax+08h]
|
||
|
movq mm1,mm0
|
||
|
|
||
|
movq mm2,[edx]
|
||
|
movq mm5,mm4
|
||
|
|
||
|
pand mm1,_mmx_sign_mask
|
||
|
movq mm3,mm2
|
||
|
|
||
|
pand mm5,_mmx_sign_mask
|
||
|
paddd mm1,mm1
|
||
|
|
||
|
movq mm6,[edx+18h]
|
||
|
paddd mm5,mm5
|
||
|
|
||
|
pand mm3,_mmx_sign_mask
|
||
|
movq mm7,mm6
|
||
|
|
||
|
paddd mm0,mm1
|
||
|
paddd mm3,mm3
|
||
|
|
||
|
pand mm7,_mmx_sign_mask
|
||
|
paddd mm2,mm3
|
||
|
|
||
|
movq mm1,mm0
|
||
|
punpckhwd mm0,mm4
|
||
|
|
||
|
paddd mm4,mm5
|
||
|
paddd mm7,mm7
|
||
|
|
||
|
paddd mm6,mm7
|
||
|
punpcklwd mm1,mm4
|
||
|
|
||
|
movq mm3,mm2
|
||
|
punpckhwd mm2,mm6
|
||
|
|
||
|
punpcklwd mm3,mm6
|
||
|
movq mm4,mm1
|
||
|
|
||
|
movq mm6,mm1
|
||
|
pmaddwd mm4,mm3
|
||
|
|
||
|
movq mm7,mm6
|
||
|
psrlq mm6,32
|
||
|
|
||
|
pmaddwd mm1,mm2
|
||
|
punpckldq mm6,mm7
|
||
|
|
||
|
movq store1,mm7
|
||
|
pmaddwd mm3,mm6
|
||
|
|
||
|
movq mm7,[edx+08h]
|
||
|
pmaddwd mm2,mm6
|
||
|
|
||
|
movq mm5,mm4
|
||
|
punpckldq mm4,mm1
|
||
|
|
||
|
psubd mm4,_mmx_one_fixed_hl
|
||
|
punpckhdq mm5,mm1
|
||
|
|
||
|
movq mm1,mm7
|
||
|
psrad mm4,16
|
||
|
|
||
|
pand mm1,_mmx_sign_mask
|
||
|
pslld mm5,16
|
||
|
|
||
|
paddd mm1,mm1
|
||
|
paddd mm5,mm4
|
||
|
|
||
|
paddd mm7,mm1
|
||
|
movq mm1,mm3
|
||
|
|
||
|
movd mm4,[edx+20h]
|
||
|
punpckldq mm3,mm2
|
||
|
|
||
|
paddd mm3,mm5
|
||
|
movq mm5,mm4
|
||
|
|
||
|
pand mm5,_mmx_sign_mask
|
||
|
punpckhdq mm1,mm2
|
||
|
|
||
|
paddd mm1,mm3
|
||
|
paddd mm5,mm5
|
||
|
|
||
|
movq mm2,[edx+010h]
|
||
|
movq mm3,mm7
|
||
|
|
||
|
paddd mm4,mm5
|
||
|
movq mm5,mm2
|
||
|
|
||
|
pand mm2,_mmx_sign_mask
|
||
|
punpcklwd mm7,mm4
|
||
|
|
||
|
movq mm4,mm7
|
||
|
psrlq mm3,32
|
||
|
|
||
|
pmaddwd mm7,store1
|
||
|
paddd mm2,mm2
|
||
|
|
||
|
pmaddwd mm4,mm6
|
||
|
movq mm6,mm0
|
||
|
|
||
|
psrlq mm0,32
|
||
|
paddd mm5,mm2
|
||
|
|
||
|
punpcklwd mm0,mm6
|
||
|
movq mm2,mm7
|
||
|
|
||
|
psubd mm7,_mmx_one_fixed_hl
|
||
|
pslld mm2,16
|
||
|
|
||
|
psrad mm7,16
|
||
|
paddd mm2,mm4
|
||
|
|
||
|
paddd mm7,mm4
|
||
|
punpckldq mm0,mm0
|
||
|
|
||
|
movq mm6,mm0
|
||
|
psrlq mm2,32
|
||
|
|
||
|
paddd mm2,mm7
|
||
|
movq mm7,mm6
|
||
|
|
||
|
pmaddwd mm0,mm3
|
||
|
punpckhwd mm7,mm7
|
||
|
|
||
|
pmaddwd mm6,mm5
|
||
|
movq mm4,mm3
|
||
|
|
||
|
punpcklwd mm3,mm5
|
||
|
|
||
|
punpckhwd mm5,mm4
|
||
|
movq mm4,mm7
|
||
|
|
||
|
punpckldq mm0,mm6
|
||
|
|
||
|
paddd mm1,mm0
|
||
|
punpckhdq mm7,mm7
|
||
|
|
||
|
movq mm0,mm3
|
||
|
punpckldq mm3,mm5
|
||
|
|
||
|
pmulhw mm3,mm7
|
||
|
punpckhdq mm0,mm5
|
||
|
|
||
|
punpckldq mm4,mm4
|
||
|
|
||
|
pmullw mm0,mm4
|
||
|
psrlq mm6,32
|
||
|
|
||
|
paddd mm2,mm6
|
||
|
pxor mm6,mm6
|
||
|
|
||
|
pcmpgtw mm6,mm3
|
||
|
movq mm5,mm3
|
||
|
|
||
|
paddd mm1,_mmx_one_hl
|
||
|
paddw mm0,mm6
|
||
|
|
||
|
paddd mm2,_mmx_one_hl
|
||
|
punpcklwd mm3,mm0
|
||
|
|
||
|
paddd mm1,mm3
|
||
|
punpckhwd mm5,mm0
|
||
|
|
||
|
paddd mm2,mm5
|
||
|
|
||
|
movq [eax],mm1
|
||
|
|
||
|
movd [eax+08h],mm2
|
||
|
|
||
|
emms
|
||
|
ret
|
||
|
; 63 cycles compared with 204 for the C-nonMMX version
|
||
|
|
||
|
|
||
|
align
|
||
|
_MMXAsm_VectorTransformedAndAdd:
|
||
|
MMXAsm_VectorTransformedAndAdd_:
|
||
|
|
||
|
movq mm0,[edx]
|
||
|
|
||
|
movd mm4,[edx+08h]
|
||
|
movq mm1,mm0
|
||
|
|
||
|
movq mm2,[ecx]
|
||
|
movq mm5,mm4
|
||
|
|
||
|
pand mm1,_mmx_sign_mask
|
||
|
movq mm3,mm2
|
||
|
|
||
|
pand mm5,_mmx_sign_mask
|
||
|
paddd mm1,mm1
|
||
|
|
||
|
movq mm6,[ecx+18h]
|
||
|
paddd mm5,mm5
|
||
|
|
||
|
pand mm3,_mmx_sign_mask
|
||
|
movq mm7,mm6
|
||
|
|
||
|
paddd mm0,mm1
|
||
|
paddd mm3,mm3
|
||
|
|
||
|
pand mm7,_mmx_sign_mask
|
||
|
paddd mm2,mm3
|
||
|
|
||
|
movq mm1,mm0
|
||
|
punpckhwd mm0,mm4
|
||
|
|
||
|
paddd mm4,mm5
|
||
|
paddd mm7,mm7
|
||
|
|
||
|
paddd mm6,mm7
|
||
|
punpcklwd mm1,mm4
|
||
|
|
||
|
movq mm3,mm2
|
||
|
punpckhwd mm2,mm6
|
||
|
|
||
|
punpcklwd mm3,mm6
|
||
|
movq mm4,mm1
|
||
|
|
||
|
movq mm6,mm1
|
||
|
pmaddwd mm4,mm3
|
||
|
|
||
|
movq mm7,mm6
|
||
|
psrlq mm6,32
|
||
|
|
||
|
pmaddwd mm1,mm2
|
||
|
punpckldq mm6,mm7
|
||
|
|
||
|
movq store1,mm7
|
||
|
pmaddwd mm3,mm6
|
||
|
|
||
|
movq mm7,[ecx+08h]
|
||
|
pmaddwd mm2,mm6
|
||
|
|
||
|
movq mm5,mm4
|
||
|
punpckldq mm4,mm1
|
||
|
|
||
|
psubd mm4,_mmx_one_fixed_hl
|
||
|
punpckhdq mm5,mm1
|
||
|
|
||
|
movq mm1,mm7
|
||
|
psrad mm4,16
|
||
|
|
||
|
pand mm1,_mmx_sign_mask
|
||
|
pslld mm5,16
|
||
|
|
||
|
paddd mm1,mm1
|
||
|
paddd mm5,mm4
|
||
|
|
||
|
paddd mm7,mm1
|
||
|
movq mm1,mm3
|
||
|
|
||
|
movd mm4,[ecx+20h]
|
||
|
punpckldq mm3,mm2
|
||
|
|
||
|
paddd mm3,mm5
|
||
|
movq mm5,mm4
|
||
|
|
||
|
pand mm5,_mmx_sign_mask
|
||
|
punpckhdq mm1,mm2
|
||
|
|
||
|
paddd mm1,mm3
|
||
|
paddd mm5,mm5
|
||
|
|
||
|
movq mm2,[ecx+010h]
|
||
|
movq mm3,mm7
|
||
|
|
||
|
paddd mm4,mm5
|
||
|
movq mm5,mm2
|
||
|
|
||
|
pand mm2,_mmx_sign_mask
|
||
|
punpcklwd mm7,mm4
|
||
|
|
||
|
movq mm4,mm7
|
||
|
psrlq mm3,32
|
||
|
|
||
|
pmaddwd mm7,store1
|
||
|
paddd mm2,mm2
|
||
|
|
||
|
pmaddwd mm4,mm6
|
||
|
movq mm6,mm0
|
||
|
|
||
|
psrlq mm0,32
|
||
|
paddd mm5,mm2
|
||
|
|
||
|
punpcklwd mm0,mm6
|
||
|
movq mm2,mm7
|
||
|
|
||
|
psubd mm7,_mmx_one_fixed_hl
|
||
|
pslld mm2,16
|
||
|
|
||
|
psrad mm7,16
|
||
|
paddd mm2,mm4
|
||
|
|
||
|
paddd mm7,mm4
|
||
|
punpckldq mm0,mm0
|
||
|
|
||
|
movq mm6,mm0
|
||
|
psrlq mm2,32
|
||
|
|
||
|
paddd mm2,mm7
|
||
|
movq mm7,mm6
|
||
|
|
||
|
pmaddwd mm0,mm3
|
||
|
punpckhwd mm7,mm7
|
||
|
|
||
|
pmaddwd mm6,mm5
|
||
|
movq mm4,mm3
|
||
|
|
||
|
paddd mm1,_mmx_one_hl
|
||
|
punpcklwd mm3,mm5
|
||
|
|
||
|
punpckhwd mm5,mm4
|
||
|
movq mm4,mm7
|
||
|
|
||
|
paddd mm2,_mmx_one_hl
|
||
|
punpckldq mm0,mm6
|
||
|
|
||
|
paddd mm1,mm0
|
||
|
punpckhdq mm7,mm7
|
||
|
|
||
|
movq mm0,mm3
|
||
|
punpckldq mm3,mm5
|
||
|
|
||
|
pmulhw mm3,mm7
|
||
|
punpckhdq mm0,mm5
|
||
|
|
||
|
paddd mm1,[ebx]
|
||
|
punpckldq mm4,mm4
|
||
|
|
||
|
pmullw mm0,mm4
|
||
|
psrlq mm6,32
|
||
|
|
||
|
paddd mm2,mm6
|
||
|
pxor mm6,mm6
|
||
|
|
||
|
pcmpgtw mm6,mm3
|
||
|
movq mm5,mm3
|
||
|
|
||
|
movd mm4,[ebx+08h]
|
||
|
paddw mm0,mm6
|
||
|
|
||
|
paddd mm2,mm4
|
||
|
punpcklwd mm3,mm0
|
||
|
|
||
|
paddd mm1,mm3
|
||
|
punpckhwd mm5,mm0
|
||
|
|
||
|
paddd mm2,mm5
|
||
|
|
||
|
movq [eax],mm1
|
||
|
|
||
|
movd [eax+08h],mm2
|
||
|
|
||
|
emms
|
||
|
ret
|
||
|
; 63 cycles compared with 204 for the C-nonMMX version
|
||
|
|
||
|
|
||
|
align
|
||
|
_MMXAsm_VectorTransformAndAdd:
|
||
|
MMXAsm_VectorTransformAndAdd_:
|
||
|
|
||
|
movq mm0,[eax]
|
||
|
|
||
|
movd mm4,[eax+08h]
|
||
|
movq mm1,mm0
|
||
|
|
||
|
movq mm2,[edx]
|
||
|
movq mm5,mm4
|
||
|
|
||
|
pand mm1,_mmx_sign_mask
|
||
|
movq mm3,mm2
|
||
|
|
||
|
pand mm5,_mmx_sign_mask
|
||
|
paddd mm1,mm1
|
||
|
|
||
|
movq mm6,[edx+18h]
|
||
|
paddd mm5,mm5
|
||
|
|
||
|
pand mm3,_mmx_sign_mask
|
||
|
movq mm7,mm6
|
||
|
|
||
|
paddd mm0,mm1
|
||
|
paddd mm3,mm3
|
||
|
|
||
|
pand mm7,_mmx_sign_mask
|
||
|
paddd mm2,mm3
|
||
|
|
||
|
movq mm1,mm0
|
||
|
punpckhwd mm0,mm4
|
||
|
|
||
|
paddd mm4,mm5
|
||
|
paddd mm7,mm7
|
||
|
|
||
|
paddd mm6,mm7
|
||
|
punpcklwd mm1,mm4
|
||
|
|
||
|
movq mm3,mm2
|
||
|
punpckhwd mm2,mm6
|
||
|
|
||
|
punpcklwd mm3,mm6
|
||
|
movq mm4,mm1
|
||
|
|
||
|
movq mm6,mm1
|
||
|
pmaddwd mm4,mm3
|
||
|
|
||
|
movq mm7,mm6
|
||
|
psrlq mm6,32
|
||
|
|
||
|
pmaddwd mm1,mm2
|
||
|
punpckldq mm6,mm7
|
||
|
|
||
|
movq store1,mm7
|
||
|
pmaddwd mm3,mm6
|
||
|
|
||
|
movq mm7,[edx+08h]
|
||
|
pmaddwd mm2,mm6
|
||
|
|
||
|
movq mm5,mm4
|
||
|
punpckldq mm4,mm1
|
||
|
|
||
|
psubd mm4,_mmx_one_fixed_hl
|
||
|
punpckhdq mm5,mm1
|
||
|
|
||
|
movq mm1,mm7
|
||
|
psrad mm4,16
|
||
|
|
||
|
pand mm1,_mmx_sign_mask
|
||
|
pslld mm5,16
|
||
|
|
||
|
paddd mm1,mm1
|
||
|
paddd mm5,mm4
|
||
|
|
||
|
paddd mm7,mm1
|
||
|
movq mm1,mm3
|
||
|
|
||
|
movd mm4,[edx+20h]
|
||
|
punpckldq mm3,mm2
|
||
|
|
||
|
paddd mm3,mm5
|
||
|
movq mm5,mm4
|
||
|
|
||
|
pand mm5,_mmx_sign_mask
|
||
|
punpckhdq mm1,mm2
|
||
|
|
||
|
paddd mm1,mm3
|
||
|
paddd mm5,mm5
|
||
|
|
||
|
movq mm2,[edx+010h]
|
||
|
movq mm3,mm7
|
||
|
|
||
|
paddd mm4,mm5
|
||
|
movq mm5,mm2
|
||
|
|
||
|
pand mm2,_mmx_sign_mask
|
||
|
punpcklwd mm7,mm4
|
||
|
|
||
|
movq mm4,mm7
|
||
|
psrlq mm3,32
|
||
|
|
||
|
pmaddwd mm7,store1
|
||
|
paddd mm2,mm2
|
||
|
|
||
|
pmaddwd mm4,mm6
|
||
|
movq mm6,mm0
|
||
|
|
||
|
psrlq mm0,32
|
||
|
paddd mm5,mm2
|
||
|
|
||
|
punpcklwd mm0,mm6
|
||
|
movq mm2,mm7
|
||
|
|
||
|
psubd mm7,_mmx_one_fixed_hl
|
||
|
pslld mm2,16
|
||
|
|
||
|
psrad mm7,16
|
||
|
paddd mm2,mm4
|
||
|
|
||
|
paddd mm7,mm4
|
||
|
punpckldq mm0,mm0
|
||
|
|
||
|
movq mm6,mm0
|
||
|
psrlq mm2,32
|
||
|
|
||
|
paddd mm2,mm7
|
||
|
movq mm7,mm6
|
||
|
|
||
|
pmaddwd mm0,mm3
|
||
|
punpckhwd mm7,mm7
|
||
|
|
||
|
pmaddwd mm6,mm5
|
||
|
movq mm4,mm3
|
||
|
|
||
|
paddd mm1,_mmx_one_hl
|
||
|
punpcklwd mm3,mm5
|
||
|
|
||
|
punpckhwd mm5,mm4
|
||
|
movq mm4,mm7
|
||
|
|
||
|
paddd mm2,_mmx_one_hl
|
||
|
punpckldq mm0,mm6
|
||
|
|
||
|
paddd mm1,mm0
|
||
|
punpckhdq mm7,mm7
|
||
|
|
||
|
movq mm0,mm3
|
||
|
punpckldq mm3,mm5
|
||
|
|
||
|
pmulhw mm3,mm7
|
||
|
punpckhdq mm0,mm5
|
||
|
|
||
|
paddd mm1,[ecx]
|
||
|
punpckldq mm4,mm4
|
||
|
|
||
|
pmullw mm0,mm4
|
||
|
psrlq mm6,32
|
||
|
|
||
|
paddd mm2,mm6
|
||
|
pxor mm6,mm6
|
||
|
|
||
|
pcmpgtw mm6,mm3
|
||
|
movq mm5,mm3
|
||
|
|
||
|
movd mm4,[ecx+08h]
|
||
|
paddw mm0,mm6
|
||
|
|
||
|
paddd mm2,mm4
|
||
|
punpcklwd mm3,mm0
|
||
|
|
||
|
paddd mm1,mm3
|
||
|
punpckhwd mm5,mm0
|
||
|
|
||
|
paddd mm2,mm5
|
||
|
|
||
|
movq [eax],mm1
|
||
|
|
||
|
movd [eax+08h],mm2
|
||
|
|
||
|
emms
|
||
|
ret
|
||
|
; 63 cycles compared with 204 for the C-nonMMX version
|
||
|
|
||
|
|
||
|
_TEXT ENDS
|
||
|
|
||
|
END
|
||
|
|