home *** CD-ROM | disk | FTP | other *** search
- ; VirtualDub - Video processing and capture application
- ; Graphics support library
- ; Copyright (C) 1998-2004 Avery Lee
- ;
- ; This program is free software; you can redistribute it and/or modify
- ; it under the terms of the GNU General Public License as published by
- ; the Free Software Foundation; either version 2 of the License, or
- ; (at your option) any later version.
- ;
- ; This program is distributed in the hope that it will be useful,
- ; but WITHOUT ANY WARRANTY; without even the implied warranty of
- ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ; GNU General Public License for more details.
- ;
- ; You should have received a copy of the GNU General Public License
- ; along with this program; if not, write to the Free Software
- ; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- ;
- section .rdata, rdata, align=16 ;read-only constants shared by the routines in this file
-
- x0002000200020002 dq 0002000200020002h ;2 in every word; not referenced by the code visible in this file
- x0004000400040004 dq 0004000400040004h ;4 in every word; not referenced by the code visible in this file
- x0008000800080008 dq 0008000800080008h ;8 in every word; not referenced by the code visible in this file
- x0000200000002000 dq 0000200000002000h ;2000h in each dword: bias added before the "psrad 14" in the ccint routines
-
- align 16
- MMX_roundval dq 0000200000002000h, 0000200000002000h ;same 2000h bias, 16 bytes wide and 16-byte aligned (used via movq, movdqa and paddd xmm,[mem])
-
-
- ;**************************************************************************
-
- x0000FFFF0000FFFF dq 0000FFFF0000FFFFh ;xor mask for words 0 and 2 (used by the interp_col weight setup)
- x0000010100000101 dq 0000010100000101h ;adds 0101h to words 0 and 2 (used by the interp_col weight setup)
- x0100010001000100 dq 0100010001000100h ;256 in every word (used by interp_row to form 256 - frac)
-
- section .text
-
- ;--------------------------------------------------------------------------
- ;_vdasm_resize_interp_row_run_MMX(
- ; [esp+ 4] void *dst,
- ; [esp+ 8] void *src,
- ; [esp+12] ulong width,
- ; [esp+16] __int64 xaccum,
- ; [esp+24] __int64 x_inc);
- ; Writes width dwords to dst; each is ((256-f)*src[i] + f*src[i+1]) >> 8 per byte, where f = top 8 bits of the low dword of xaccum and i advances by x_inc (high dword + carry) per output. No emms is executed before returning.
- global _vdasm_resize_interp_row_run_MMX
- _vdasm_resize_interp_row_run_MMX:
- push ebp ;four registers saved: arguments are now at [esp+N+16]
- push edi
- push esi
- push ebx
-
- mov esi, [esp+8+16] ;esi = src
- mov edi, [esp+4+16] ;edi = dst
- mov ebp, [esp+12+16] ;ebp = width
-
- movd mm4, dword [esp+16+16] ;mm4 = low dword of xaccum (fraction)
- pxor mm7, mm7 ;mm7 = 0, used to widen bytes to words
- movd mm6, dword [esp+24+16] ;mm6 = low dword of x_inc (fractional step)
- punpckldq mm4, mm4 ;replicate fraction into both dwords
- punpckldq mm6, mm6 ;replicate fractional step into both dwords
-
- shr esi, 2 ;esi = src / 4; the loads below scale it back with [esi*4]
-
- mov eax, [esp+16+16] ;eax = low dword of xaccum
- mov ebx, [esp+20+16] ;ebx = high dword of xaccum
- add esi, ebx ;esi = src/4 + high dword of xaccum (starting dword index)
- mov ebx, [esp+24+16] ;ebx = low dword of x_inc
- mov ecx, [esp+28+16] ;ecx = high dword of x_inc
-
- shl ebp,2 ;ebp = width*4 (bytes)
- add edi,ebp ;edi = end of destination run
- neg ebp ;ebp = negative byte offset that counts up to zero
-
- .colloop:
- movd mm1, dword [esi*4+4] ;mm1 = source dword at index esi+1
- movq mm5, mm4 ;mm5 = current fraction
-
- movd mm0, dword [esi*4] ;mm0 = source dword at index esi
- punpcklbw mm1, mm7 ;widen four bytes to four words
-
- punpcklbw mm0, mm7 ;widen four bytes to four words
- psrld mm5, 24 ;keep the top 8 bits of the fraction (0..255) in each dword
-
- movq mm3, [x0100010001000100] ;mm3 = 256 in every word
- packssdw mm5, mm5 ;mm5 = fraction replicated into all four words
-
- pmullw mm1, mm5 ;mm1 = src[i+1] * f
- psubw mm3, mm5 ;mm3 = 256 - f
-
- pmullw mm0, mm3 ;mm0 = src[i] * (256 - f)
- paddd mm4, mm6 ;advance the MMX copy of the fraction
-
- ;stall
- ;stall
-
- ;stall
- ;stall
-
- paddw mm0, mm1 ;sum of the two weighted pixels
-
- psrlw mm0, 8 ;divide by 256
- add eax, ebx ;advance the integer-register copy of the fraction; CF feeds the adc below
-
- adc esi, ecx ;advance source index by integer step plus fraction carry
- packuswb mm0, mm0 ;words back to bytes
-
- movd dword [edi+ebp],mm0 ;store one output dword
-
- add ebp, 4
- jnz .colloop ;loop until the offset reaches zero
-
- pop ebx
- pop esi
- pop edi
- pop ebp
- ret
-
-
-
- ;**************************************************************************
-
- ;vdasm_resize_interp_col_run_MMX(
- ; [esp+ 4] void *dst,
- ; [esp+ 8] void *src1,
- ; [esp+12] void *src2,
- ; [esp+16] ulong width,
- ; [esp+20] ulong yaccum);
- ; Writes width dwords to dst; each byte is (src1*(256-f) + src2*f) >> 8 with f = (low word of yaccum) >> 8. No emms is executed before returning.
-
- global _vdasm_resize_interp_col_run_MMX
- _vdasm_resize_interp_col_run_MMX:
- push ebp ;four registers saved: arguments are now at [esp+N+16]
- push edi
- push esi
- push ebx
-
- mov esi, [esp+8+16] ;esi = src1
- mov edx, [esp+12+16] ;edx = src2
- mov edi, [esp+4+16] ;edi = dst
- mov ebp, [esp+16+16] ;ebp = width
-
- movd mm4, dword [esp+20+16] ;mm4 = yaccum
- pxor mm7, mm7 ;mm7 = 0, used to widen bytes to words
- punpcklwd mm4, mm4 ;replicate the low word of yaccum ...
- punpckldq mm4, mm4 ;... into all four words
- psrlw mm4, 8 ;each word = f = top 8 bits of that 16-bit value
- pxor mm4, [x0000FFFF0000FFFF] ;words 0 and 2 become FFFFh - f
- paddw mm4, [x0000010100000101] ;words 0 and 2 wrap to 256 - f; words 1 and 3 stay f
-
- shl ebp, 2 ;ebp = width*4 (bytes)
- add edi, ebp ;point dst/src1/src2 at the end of the run
- add esi, ebp
- add edx, ebp
- neg ebp ;ebp = negative byte offset that counts up to zero
-
- .colloop:
- movd mm0, dword [esi+ebp] ;mm0 = src1 dword
- movd mm2, dword [edx+ebp] ;mm2 = src2 dword
-
- punpcklbw mm0, mm7 ;widen bytes to words
- punpcklbw mm2, mm7
-
- movq mm1, mm0
- punpcklwd mm0, mm2 ;mm0 = (src1,src2) word pairs for bytes 0 and 1
- punpckhwd mm1, mm2 ;mm1 = (src1,src2) word pairs for bytes 2 and 3
-
- pmaddwd mm0, mm4 ;src1*(256-f) + src2*f per pair
- pmaddwd mm1, mm4
-
- psrad mm0, 8 ;divide by 256
- psrad mm1, 8
-
- packssdw mm0, mm1 ;dwords -> words
- packuswb mm0, mm0 ;words -> bytes
-
- movd dword [edi+ebp],mm0 ;store one output dword
-
- add ebp, 4
- jnz .colloop ;loop until the offset reaches zero
-
- pop ebx
- pop esi
- pop edi
- pop ebp
- ret
-
-
- ;--------------------------------------------------------------------------
- ;vdasm_resize_ccint_row_MMX(dst, src, count, xaccum, xinc, tbl);
- ; Horizontal 4-tap filter: each of count outputs is a weighted sum of four consecutive source dwords; weights come from tbl at 16 bytes per 8-bit fraction step; result is biased by 2000h, shifted right by 14 and saturated. xaccum/xinc are 16:16 fixed point. ESP is temporarily repurposed inside the loop. No emms is executed.
- global _vdasm_resize_ccint_row_MMX
- _vdasm_resize_ccint_row_MMX:
- push ebx ;four registers saved: arguments are now at [esp+N+16]
- push esi
- push edi
- push ebp
-
- mov ebx, [esp+4+16] ;ebx = dest addr
- mov ecx, [esp+12+16] ;ecx = count
-
- mov ebp, [esp+20+16] ;ebp = increment
- mov edi, ebp ;edi = increment
- shl ebp, 16 ;ebp = fractional increment
- mov esi, [esp+16+16] ;esi = 16:16 position
- sar edi, 16 ;edi = integer increment
- mov [esp+20+16], ebp ;xinc = fractional increment
- mov ebp, esi ;ebp = 16:16 position
- shr esi, 16 ;esi = integer position
- shl ebp, 16 ;ebp = fraction
- mov [esp+16+16], ebp ;xaccum = fraction
-
- mov eax, [esp+8+16] ;eax = src
-
- shr ebp, 24 ;ebp = fraction (0...255)
- mov [esp+8+16], edi ;src argument slot now holds the integer increment
- shl ebp, 4 ;ebp = fraction*16
- mov edi, ebp ;edi = byte offset of the current coefficient entry
- mov ebp, [esp+4+16] ;ebp = destination
-
- shr eax, 2 ;eax = src / 4
- add eax, esi ;eax = src/4 + integer position
- shl ecx, 2 ;ecx = count*4
- lea ebp, [ebp+ecx-4] ;ebp = address of the last output dword
- neg ecx ;ecx = -count*4
-
- movq mm6, [x0000200000002000] ;mm6 = rounding bias
- pxor mm7, mm7 ;mm7 = 0, used to widen bytes to words
-
- mov edx,[esp+16+16] ;edx = fractional accumulator
- mov esi,[esp+20+16] ;esi = fractional increment
-
- mov ebx,[esp+24+16] ;ebx = coefficient pointer
-
- movd mm0,dword [eax*4] ;preload the first two source pixels
- movd mm1,dword [eax*4+4]
- punpcklbw mm0,mm7 ;mm0 = [a1][r1][g1][b1]
-
- ;borrow stack pointer
- push 0 ;don't crash
- push dword [fs:0] ;save the current [fs:0] value on the real stack
- mov dword [fs:0], esp ;stash the real ESP in [fs:0] (presumably the Win32 exception-list slot -- confirm)
- mov esp, [esp+8+24] ;esp = integer increment
- jmp short ccint_loop_MMX_start
-
- ;EAX source pointer / 4
- ;EBX coefficient pointer
- ;ECX count
- ;EDX fractional accumulator
- ;ESI fractional increment
- ;EDI coefficient offset
- ;ESP integer increment
- ;EBP destination pointer
-
- align 16
- ccint_loop_MMX:
- movd mm0,dword [eax*4]
- packuswb mm2,mm2 ;mm2 = [a][r][g][b][a][r][g][b] (previous result)
-
- movd mm1,dword [eax*4+4]
- punpcklbw mm0,mm7 ;mm0 = [a1][r1][g1][b1]
-
- movd dword [ebp+ecx],mm2 ;store the previous result (ecx was already advanced)
- ccint_loop_MMX_start:
- movq mm4,mm0 ;mm4 = [a1][r1][g1][b1]
-
- movd mm2,dword [eax*4+8]
- punpcklbw mm1,mm7 ;mm1 = [a2][r2][g2][b2]
-
- movd mm3,dword [eax*4+12]
- punpcklbw mm2,mm7 ;mm2 = [a3][r3][g3][b3]
-
- punpcklbw mm3,mm7 ;mm3 = [a4][r4][g4][b4]
- movq mm5,mm2 ;mm5 = [a3][r3][g3][b3]
-
- add edx,esi ;add fractional increment
- punpcklwd mm0,mm1 ;mm0 = [g2][g1][b2][b1]
-
- pmaddwd mm0,[ebx+edi]
- punpcklwd mm2,mm3 ;mm2 = [g4][g3][b4][b3]
-
- pmaddwd mm2,[ebx+edi+8]
- punpckhwd mm4,mm1 ;mm4 = [a2][a1][r2][r1]
-
- pmaddwd mm4,[ebx+edi]
- punpckhwd mm5,mm3 ;mm5 = [a4][a3][r4][r3]
-
- pmaddwd mm5,[ebx+edi+8]
- paddd mm0,mm6
-
- adc eax,esp ;add integer increment and fractional bump to offset
- mov edi,0ff000000h
-
- paddd mm2,mm0 ;mm2 = [ g ][ b ]
- paddd mm4,mm6
-
- psrad mm2,14
- paddd mm4,mm5 ;mm4 = [ a ][ r ]
-
- and edi,edx
- psrad mm4,14
-
- shr edi,20 ;edi = fraction (0...255)*16
- add ecx,4 ;carry is set when the counter wraps to zero
-
- packssdw mm2,mm4 ;mm2 = [ a ][ r ][ g ][ b ]
- jnc ccint_loop_MMX
-
- packuswb mm2,mm2 ;mm2 = [a][r][g][b][a][r][g][b]
- movd dword [ebp],mm2 ;store the final output dword
-
- mov esp, dword [fs:0] ;recover the real stack pointer
- pop dword [fs:0] ;restore the saved [fs:0] value
- pop eax ;discard the dummy 0 pushed above
-
- pop ebp
- pop edi
- pop esi
- pop ebx
- ret
-
- ;--------------------------------------------------------------------------
- ;vdasm_resize_ccint_col_MMX(dst, src1, src2, src3, src4, count, tbl);
- ; Vertical 4-tap filter: each of count outputs combines the matching dwords of src1..src4 using qword [tbl] (src1/src2 pairs) and qword [tbl+8] (src3/src4 pairs), plus the 2000h bias, shifted right by 14 and saturated. Each result is stored one iteration late. No emms is executed.
- global _vdasm_resize_ccint_col_MMX
- _vdasm_resize_ccint_col_MMX:
- push ebx ;four registers saved: arguments are now at [esp+N+16]
- push esi
- push edi
- push ebp
-
- mov ebp, [esp+4+16] ;ebp = dest addr
- mov esi, [esp+24+16] ;esi = count
- add esi, esi
- add esi, esi ;esi = count*4
-
- mov eax, [esp+8+16] ;eax = row 1
- mov ebx, [esp+12+16] ;ebx = row 2
- mov ecx, [esp+16+16] ;ecx = row 3
- mov edx, [esp+20+16] ;edx = row 4
- mov edi, [esp+28+16] ;edi = coefficient ptr
-
- add eax, esi ;point all rows and dst at their ends
- add ebx, esi
- add ecx, esi
- add edx, esi
- add ebp, esi
- neg esi ;esi = negative byte offset that counts up to zero
-
- movq mm4,[edi] ;mm4 = coefficients applied to row 1/row 2 pairs
- movq mm5,[edi+8] ;mm5 = coefficients applied to row 3/row 4 pairs
- movq mm6,[x0000200000002000] ;mm6 = rounding bias
- pxor mm7,mm7 ;mm7 = 0
-
- movd mm2,dword [eax+esi] ;mm2 = pixel0 (first iteration preload)
- movd mm1,dword [ebx+esi] ;mm1 = pixel1
- punpcklbw mm2,mm7
- jmp short ccint_col_loop_MMX.entry
-
- align 16
- ccint_col_loop_MMX:
- movd mm2,dword [eax+esi] ;mm2 = pixel0
- packuswb mm0,mm0 ;finish the previous result
-
- movd mm1,dword [ebx+esi] ;mm1 = pixel1
- pxor mm7,mm7 ;re-zero mm7 (it is used as a temporary later in the loop)
-
- movd dword [ebp+esi-4],mm0 ;store the previous result
- punpcklbw mm2,mm7
-
- ccint_col_loop_MMX.entry:
- punpcklbw mm1,mm7
- movq mm0,mm2
-
- movd mm3,dword [edx+esi] ;mm3 = pixel3
- punpcklwd mm0,mm1 ;mm0 = [g1][g0][b1][b0]
-
- pmaddwd mm0,mm4
- punpckhwd mm2,mm1 ;mm2 = [a1][a0][r1][r0]
-
- movd mm1,dword [ecx+esi] ;mm1 = pixel2
- punpcklbw mm3,mm7
-
- pmaddwd mm2,mm4
- punpcklbw mm1,mm7
-
- movq mm7,mm1 ;mm7 now holds pixel data, no longer zero
- punpcklwd mm1,mm3 ;mm1 = [g3][g2][b3][b2]
-
- punpckhwd mm7,mm3 ;mm7 = [a3][a2][r3][r2]
- pmaddwd mm1,mm5
-
- pmaddwd mm7,mm5
- paddd mm0,mm6 ;add rounding bias to green/blue
-
- paddd mm2,mm6 ;add rounding bias to alpha/red
- paddd mm0,mm1
-
- paddd mm2,mm7
- psrad mm0,14
-
- psrad mm2,14
- add esi,4
-
- packssdw mm0,mm2
- jne ccint_col_loop_MMX ;loop until the offset reaches zero
-
- packuswb mm0,mm0
- movd dword [ebp-4],mm0 ;store the final output dword
-
- pop ebp
- pop edi
- pop esi
- pop ebx
- ret
-
- ;--------------------------------------------------------------------------
- ;vdasm_resize_ccint_col_SSE2(dst, src1, src2, src3, src4, count, tbl);
- ;
- ; Vertical 4-tap filter over 32-bit pixels. For each of count pixels the
- ; matching bytes of src1..src4 are combined with the 16-bit coefficients
- ; at tbl (qword [tbl] is applied to the src1/src2 pairs, qword [tbl+8] to
- ; the src3/src4 pairs), the 2000h rounding bias is added, and the sum is
- ; shifted right by 14 and saturated to a byte.
- ;
- ; Pixels are processed two at a time with SSE2; a single trailing pixel
- ; is finished with MMX. Stores are non-temporal (movntq / movnti). No
- ; emms or sfence is executed here.
- ;
- ; Arguments after the four pushes:
- ; [esp+ 4+16] dst
- ; [esp+ 8+16] src1 .. [esp+20+16] src4
- ; [esp+24+16] count
- ; [esp+28+16] tbl
- ;
- ; Loop counter: esi = 4 - count*4. Each two-pixel iteration adds 8; the
- ; loop ends when the add carries. A zero result means exactly one pixel
- ; is left for the odd tail.
-
- global _vdasm_resize_ccint_col_SSE2
- _vdasm_resize_ccint_col_SSE2:
- push ebx
- push esi
- push edi
- push ebp
-
- mov ebp,[esp + 4 + 16] ;ebp = dest addr
- mov esi,[esp + 24 + 16] ;esi = count
- add esi,esi
- add esi,esi ;esi = count*4
-
- mov eax,[esp + 8 + 16] ;eax = row 1
- mov ebx,[esp + 12 + 16] ;ebx = row 2
- mov ecx,[esp + 16 + 16] ;ecx = row 3
- mov edx,[esp + 20 + 16] ;edx = row 4
- mov edi,[esp + 28 + 16] ;edi = coefficient ptr
-
- ;Load coefficients and rounding bias before the count == 1 early-out:
- ;the odd-pixel tail copies them out of xmm4/xmm5/xmm6, so they must be
- ;valid even when the two-pixel loop is skipped entirely.
- movq xmm4,qword [edi] ;xmm4 = coefficients for row 1/row 2 pairs
- movq xmm5,qword [edi+8] ;xmm5 = coefficients for row 3/row 4 pairs
- punpcklqdq xmm4,xmm4 ;replicate into the high qword
- punpcklqdq xmm5,xmm5
- movq xmm6,[x0000200000002000] ;xmm6 = rounding bias
- punpcklqdq xmm6,xmm6
- pxor xmm7,xmm7 ;xmm7 = 0, used to widen bytes to words
-
- neg esi
-
- add esi,4 ;esi = 4 - count*4
- jz ccint_col_SSE2_odd ;count == 1: only the odd tail runs
-
- ccint_col_loop_SSE2:
- movq xmm0, qword [eax] ;two pixels from each of the four rows
- add eax, 8
- movq xmm1, qword [ebx]
- add ebx, 8
- movq xmm2, qword [ecx]
- add ecx, 8
- movq xmm3, qword [edx]
- add edx, 8
- punpcklbw xmm0,xmm1 ;interleave row 1/row 2 bytes
- punpcklbw xmm2,xmm3 ;interleave row 3/row 4 bytes
- movdqa xmm1,xmm0
- movdqa xmm3,xmm2
- punpcklbw xmm0,xmm7 ;first pixel, rows 1/2, as words
- punpckhbw xmm1,xmm7 ;second pixel, rows 1/2, as words
- punpcklbw xmm2,xmm7 ;first pixel, rows 3/4, as words
- punpckhbw xmm3,xmm7 ;second pixel, rows 3/4, as words
- pmaddwd xmm0,xmm4
- pmaddwd xmm1,xmm4
- pmaddwd xmm2,xmm5
- pmaddwd xmm3,xmm5
- paddd xmm0,xmm6 ;rounding bias, once per accumulator
- paddd xmm1,xmm6
- paddd xmm0,xmm2
- paddd xmm1,xmm3
- psrad xmm0,14
- psrad xmm1,14
- packssdw xmm0,xmm1
- packuswb xmm0,xmm0
- movdq2q mm0,xmm0
- movntq [ebp],mm0 ;non-temporal store of two pixels
- add ebp,8
- add esi,8
- jnc ccint_col_loop_SSE2 ;no carry: at least two pixels remain
- jnz ccint_col_SSE2_noodd ;carry and non-zero: count was even
- ccint_col_SSE2_odd:
- movd mm0, dword [eax] ;one pixel from each of the four rows
- pxor mm7,mm7
- movd mm1, dword [ebx]
- movdq2q mm4,xmm4 ;mm4 = coefficients for row 1/row 2 pairs
- movd mm2, dword [ecx]
- movdq2q mm5,xmm5 ;mm5 = coefficients for row 3/row 4 pairs
- movd mm3, dword [edx]
- movdq2q mm6,xmm6 ;mm6 = rounding bias
- punpcklbw mm0,mm1 ;interleave row 1/row 2 bytes
- punpcklbw mm2,mm3 ;interleave row 3/row 4 bytes
- movq mm1,mm0
- movq mm3,mm2
- punpcklbw mm0,mm7 ;low two channels, rows 1/2
- punpckhbw mm1,mm7 ;high two channels, rows 1/2
- punpcklbw mm2,mm7 ;low two channels, rows 3/4
- punpckhbw mm3,mm7 ;high two channels, rows 3/4
- pmaddwd mm0,mm4
- pmaddwd mm1,mm4
- pmaddwd mm2,mm5
- pmaddwd mm3,mm5
- paddd mm0,mm6 ;rounding bias for the low two channels
- paddd mm1,mm6 ;rounding bias for the high two channels (matches the SSE2 loop)
- paddd mm0,mm2
- paddd mm1,mm3
- psrad mm0,14
- psrad mm1,14
- packssdw mm0,mm1
- packuswb mm0,mm0
- movd eax,mm0
- movnti [ebp],eax ;non-temporal store of the last pixel
-
- ccint_col_SSE2_noodd:
- pop ebp
- pop edi
- pop esi
- pop ebx
- ret
-
-
-
- ;-------------------------------------------------------------------------
- ;
- ; long resize_table_row_MMX(Pixel *out, Pixel *in, int *filter, int filter_width, PixDim w, long accum, long frac);
- ; Horizontal table-driven filter: for each of w outputs, source offset = (accum >> 16) pixels, filter row = bits 8..15 of accum, then accum += frac. Dedicated paths exist for filter_width 4, 6 and 8. Sums are biased by 2000h, shifted right by 14 and saturated. The code never sets EAX as a return value and executes no emms.
- .code ;NOTE(review): MASM-style directive; NASM would presumably parse this as a label -- confirm it is intended
-
- global _vdasm_resize_table_row_MMX
- _vdasm_resize_table_row_MMX:
- push ebp ;four registers saved: arguments are now at [esp+N+16]
- push esi
- push edi
- push ebx
-
- cmp dword [esp+16+16], 4 ;dispatch on filter_width
- jz .accel_4coeff
- cmp dword [esp+16+16], 6
- jz .accel_6coeff
- cmp dword [esp+16+16], 8
- jz .accel_8coeff
-
- mov eax,[esp + 24 + 16] ;eax = accum
- mov ebp,[esp + 20 + 16] ;ebp = w (pixel counter)
- mov ebx,[esp + 8 + 16] ;ebx = in
- mov edi,[esp + 4 + 16] ;edi = out
-
- mov esi,eax
- mov edx,eax
-
- pxor mm5,mm5 ;mm5 = 0, used to widen bytes to words
-
- mov ecx,[esp + 16 + 16]
- shr ecx,1 ;ecx = filter_width / 2 (coefficient pairs)
- mov [esp+16+16],ecx ;the filter_width argument slot now holds the pair count
- test ecx,1
- jnz .pixelloop_odd_pairs
-
- .pixelloop_even_pairs: ;NOTE(review): the inner loop subtracts 2 until zero, so a pair count of 0 would not terminate as written -- confirm callers never pass filter_width < 2
- shr esi,14
- and edx,0000ff00h ;edx = bits 8..15 of accum (filter phase << 8)
- and esi,byte -4 ;esi = (accum >> 16) * 4 = byte offset of the first source pixel
-
- mov ecx,[esp + 16 + 16] ;ecx = pair count
- shr edx,5 ;edx = phase * 8
- add esi,ebx ;esi = source address
- imul edx,ecx ;edx = phase * pairs * 8 = byte offset of the filter row
- add eax,[esp + 28 + 16] ;accum += frac
- add edx,[esp + 12 + 16] ;edx = filter row address
-
- movq mm6,[MMX_roundval] ;mm6 = green/blue accumulator, seeded with the rounding bias
- pxor mm3,mm3
- movq mm7,mm6 ;mm7 = alpha/red accumulator, seeded with the rounding bias
- pxor mm2,mm2
-
- .coeffloop_unaligned_even_pairs:
- movd mm0,dword [esi+0]
- paddd mm7,mm2 ;accumulate alpha/red (pixels 2/3)
-
- punpcklbw mm0,[esi+4] ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
- paddd mm6,mm3 ;accumulate green/blue (pixels 2/3)
-
- movd mm2,dword [esi+8]
- movq mm1,mm0 ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
-
- punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
-
- punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
- movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
-
- pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
- punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
-
- pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
- punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r2 ][ r3 ]
-
- pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
- punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
-
- pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
- paddd mm7,mm0 ;accumulate alpha/red (pixels 0/1)
-
- paddd mm6,mm1 ;accumulate green/blue (pixels 0/1)
- add edx,16 ;two coefficient pairs consumed
-
- add esi,16 ;four source pixels consumed
- sub ecx,2
-
- jne .coeffloop_unaligned_even_pairs
-
- paddd mm7,mm2 ;accumulate alpha/red (pixels 2/3)
- paddd mm6,mm3 ;accumulate green/blue (pixels 2/3)
-
- psrad mm7,14
- psrad mm6,14
-
- packssdw mm6,mm7
- add edi,4
-
- packuswb mm6,mm6
- sub ebp,1 ;flags from this sub drive the jne below (mov/movd do not change them)
-
- mov esi,eax
- mov edx,eax
-
- movd dword [edi-4],mm6 ;store one output pixel
- jne .pixelloop_even_pairs
-
- pop ebx
- pop edi
- pop esi
- pop ebp
-
- ret
-
- ;----------------------------------------------------------------
-
- .pixelloop_odd_pairs: ;NOTE(review): with a pair count of 1, ecx is 0 on entry to the inner loop, which subtracts 2 until zero -- confirm callers never pass filter_width 2 or 3
- shr esi,14
- and edx,0000ff00h ;edx = bits 8..15 of accum (filter phase << 8)
- and esi,byte -4 ;esi = (accum >> 16) * 4 = byte offset of the first source pixel
-
- mov ecx,[esp + 16 + 16] ;ecx = pair count
- shr edx,5 ;edx = phase * 8
- add esi,ebx ;esi = source address
- imul edx,ecx ;edx = phase * pairs * 8 = byte offset of the filter row
- add eax,[esp + 28 + 16] ;accum += frac
- sub ecx,1 ;the last pair is handled after the loop
- add edx,[esp + 12 + 16] ;edx = filter row address
-
- movq mm6,[MMX_roundval] ;mm6 = green/blue accumulator, seeded with the rounding bias
- pxor mm3,mm3
- pxor mm2,mm2
- movq mm7,mm6 ;mm7 = alpha/red accumulator, seeded with the rounding bias
-
- .coeffloop_unaligned_odd_pairs:
- movd mm0,dword [esi+0]
- paddd mm7,mm2 ;accumulate alpha/red (pixels 2/3)
-
- punpcklbw mm0,[esi+4] ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
- paddd mm6,mm3 ;accumulate green/blue (pixels 2/3)
-
- movd mm2,dword [esi+8]
- movq mm1,mm0 ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
-
- punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
-
- punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
- movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
-
- pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
- punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
-
- pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
- punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r2 ][ r3 ]
-
- pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
- punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
-
- pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
- paddd mm7,mm0 ;accumulate alpha/red (pixels 0/1)
-
- paddd mm6,mm1 ;accumulate green/blue (pixels 0/1)
- add edx,16 ;two coefficient pairs consumed
-
- add esi,16 ;four source pixels consumed
- sub ecx,2
-
- jne .coeffloop_unaligned_odd_pairs
-
- paddd mm7,mm2 ;accumulate alpha/red (pixels 2/3)
- paddd mm6,mm3 ;accumulate green/blue (pixels 2/3)
-
- ;finish up odd pair
-
- movd mm0,dword [esi] ;mm0 = [x0][r0][g0][b0]
- punpcklbw mm0,[esi+4] ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
- movq mm1,mm0
- punpcklbw mm0,mm5 ;mm0 = [g0][g1][b0][b1]
- punpckhbw mm1,mm5 ;mm1 = [x0][x1][r0][r1]
-
- pmaddwd mm0,[edx]
- pmaddwd mm1,[edx]
-
- paddd mm6,mm0
- paddd mm7,mm1
-
- ;combine into pixel
-
- psrad mm6,14
-
- psrad mm7,14
-
- packssdw mm6,mm7
- add edi,4
-
- packuswb mm6,mm6
- sub ebp,1 ;flags from this sub drive the jne below (mov/movd do not change them)
-
- mov esi,eax
- mov edx,eax
-
- movd dword [edi-4],mm6 ;store one output pixel
- jne .pixelloop_odd_pairs
-
- pop ebx
- pop edi
- pop esi
- pop ebp
-
- ret
-
- ;----------------------------------------------------------------
-
- .accel_4coeff: ;filter_width == 4: fully unrolled, 16 bytes of coefficients per phase
- mov eax,[esp + 24 + 16] ;eax = accum
- mov ebp,[esp + 20 + 16] ;ebp = w
- add ebp,ebp
- add ebp,ebp ;ebp = w*4
- mov ebx,[esp + 8 + 16] ;ebx = in
- mov edi,[esp + 4 + 16] ;edi = out
- add edi,ebp ;edi = end of output
- neg ebp ;ebp = negative byte offset that counts up to zero
-
- mov esi,eax
- mov edx,eax
-
- movq mm4,[MMX_roundval] ;mm4 = rounding bias
- pxor mm5,mm5 ;mm5 = 0
-
- mov ecx,[esp+12+16] ;ecx = filter base
-
- .pixelloop_4coeff:
- shr esi,14
- and edx,0000ff00h ;edx = filter phase << 8
- and esi,byte -4 ;esi = (accum >> 16) * 4
-
- shr edx,4 ;edx = phase * 16
- add esi,ebx ;esi = source address
- add eax,[esp+28+16] ;accum += frac
- add edx,ecx ;edx = filter row address
-
- movd mm0,dword [esi+0]
- movd mm2,dword [esi+8]
- punpcklbw mm0,[esi+4] ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
-
- movq mm1,mm0 ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
-
- punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
-
- pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
- punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
-
- movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
- punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
-
- pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
- punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r2 ][ r3 ]
-
- pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
- punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
-
- pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
- paddd mm0,mm4 ;add rounding bias to alpha/red (pixels 0/1)
-
- paddd mm1,mm4 ;add rounding bias to green/blue (pixels 0/1)
-
- paddd mm0,mm2 ;accumulate alpha/red (pixels 2/3)
- paddd mm1,mm3 ;accumulate green/blue (pixels 2/3)
-
- psrad mm0,14
- psrad mm1,14
-
- packssdw mm1,mm0
- mov esi,eax
-
- packuswb mm1,mm1
- mov edx,eax
-
- movd dword [edi+ebp],mm1 ;store one output pixel
- add ebp,4
- jne .pixelloop_4coeff
-
- pop ebx
- pop edi
- pop esi
- pop ebp
-
- ret
-
-
- ;----------------------------------------------------------------
-
- .accel_6coeff: ;filter_width == 6: fully unrolled, 24 bytes of coefficients per phase
- mov eax,[esp + 24 + 16] ;eax = accum
- mov ebp,[esp + 20 + 16] ;ebp = w
- add ebp,ebp
- add ebp,ebp ;ebp = w*4
- mov ebx,[esp + 8 + 16] ;ebx = in
- mov edi,[esp + 4 + 16] ;edi = out
- add edi,ebp ;edi = end of output
- neg ebp ;ebp = negative byte offset that counts up to zero
-
- mov esi,eax
- mov edx,eax
-
- movq mm4,[MMX_roundval] ;mm4 = rounding bias
- pxor mm5,mm5 ;mm5 = 0
-
- mov ecx,[esp+12+16] ;ecx = filter base
-
- .pixelloop_6coeff:
- shr esi,14
- and edx,0000ff00h ;edx = filter phase << 8
- and esi,byte -4 ;esi = (accum >> 16) * 4
-
- shr edx,5 ;edx = phase * 8
- lea edx,[edx+edx*2] ;edx = phase * 24
- add esi,ebx ;esi = source address
- add eax,[esp+28+16] ;accum += frac
- add edx,ecx ;edx = filter row address
-
- movd mm0,dword [esi+0]
- movd mm2,dword [esi+8]
- punpcklbw mm0,[esi+4] ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
-
- movq mm1,mm0 ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
-
- punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
-
- pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
- punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
-
- movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
- punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
-
- pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
- punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r2 ][ r3 ]
-
- pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
- punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
-
- pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
- paddd mm0,mm4 ;add rounding bias to alpha/red (pixels 0/1)
-
- paddd mm1,mm4 ;add rounding bias to green/blue (pixels 0/1)
-
- paddd mm0,mm2 ;accumulate alpha/red (pixels 2/3)
- paddd mm1,mm3 ;accumulate green/blue (pixels 2/3)
-
- movd mm6,dword [esi+16]
-
- punpcklbw mm6,[esi+20] ;mm6=[a4][a5][r4][r5][g4][g5][b4][b5]
-
- movq mm7,mm6 ;mm7=[a4][a5][r4][r5][g4][g5][b4][b5]
-
- punpckhbw mm6,mm5 ;mm6=[ a4 ][ a5 ][ r4 ][ r5 ]
-
- pmaddwd mm6,[edx+16] ;mm6=[a4*f4+a5*f5][r4*f4+r5*f5]
- punpcklbw mm7,mm5 ;mm7=[ g4 ][ g5 ][ b4 ][ b5 ]
-
- pmaddwd mm7,[edx+16] ;mm7=[g4*f4+g5*f5][b4*f4+b5*f5]
- paddd mm0,mm6 ;accumulate alpha/red (pixels 4/5)
-
- paddd mm1,mm7 ;accumulate green/blue (pixels 4/5)
-
-
- psrad mm0,14
- psrad mm1,14
-
- packssdw mm1,mm0
- mov esi,eax
-
- packuswb mm1,mm1
- mov edx,eax
-
- movd dword [edi+ebp],mm1 ;store one output pixel
- add ebp,4
- jne .pixelloop_6coeff
-
- pop ebx
- pop edi
- pop esi
- pop ebp
-
- ret
-
- ;----------------------------------------------------------------
-
- .accel_8coeff: ;filter_width == 8: fully unrolled, 32 bytes of coefficients per phase
- mov eax,[esp + 24 + 16] ;eax = accum
- mov ebp,[esp + 20 + 16] ;ebp = w
- add ebp,ebp
- add ebp,ebp ;ebp = w*4
- mov ebx,[esp + 8 + 16] ;ebx = in
- mov edi,[esp + 4 + 16] ;edi = out
- add edi,ebp ;edi = end of output
- neg ebp ;ebp = negative byte offset that counts up to zero
-
- mov esi,eax
- mov edx,eax
-
- movq mm4,[MMX_roundval] ;mm4 = rounding bias
- pxor mm5,mm5 ;mm5 = 0
-
- mov ecx,[esp+12+16] ;ecx = filter base
-
- .pixelloop_8coeff:
- shr esi,14
- and edx,0000ff00h ;edx = filter phase << 8
- and esi,byte -4 ;esi = (accum >> 16) * 4
-
- shr edx,3 ;edx = phase * 32
- add esi,ebx ;esi = source address
- add eax,[esp+28+16] ;accum += frac
- add edx,ecx ;edx = filter row address
-
- movd mm0,dword [esi+0]
- movd mm2,dword [esi+8]
- punpcklbw mm0,[esi+4] ;mm0=[a0][a1][r0][r1][g0][g1][b0][b1]
-
- movq mm1,mm0 ;mm1=[a0][a1][r0][r1][g0][g1][b0][b1]
-
- punpckhbw mm0,mm5 ;mm0=[ a0 ][ a1 ][ r0 ][ r1 ]
-
- pmaddwd mm0,[edx] ;mm0=[a0*f0+a1*f1][r0*f0+r1*f1]
- punpcklbw mm2,[esi+12] ;mm2=[a2][a3][r2][r3][g2][g3][b2][b3]
-
- movq mm3,mm2 ;mm3=[a2][a3][r2][r3][g2][g3][b2][b3]
- punpcklbw mm1,mm5 ;mm1=[ g0 ][ g1 ][ b0 ][ b1 ]
-
- pmaddwd mm1,[edx] ;mm1=[g0*f0+g1*f1][b0*f0+b1*f1]
- punpckhbw mm2,mm5 ;mm2=[ a2 ][ a3 ][ r2 ][ r3 ]
-
- pmaddwd mm2,[edx+8] ;mm2=[a2*f2+a3*f3][r2*f2+r3*f3]
- punpcklbw mm3,mm5 ;mm3=[ g2 ][ g3 ][ b2 ][ b3 ]
-
- pmaddwd mm3,[edx+8] ;mm3=[g2*f2+g3*f3][b2*f2+b3*f3]
- paddd mm0,mm4 ;add rounding bias to alpha/red (pixels 0/1)
-
- paddd mm1,mm4 ;add rounding bias to green/blue (pixels 0/1)
-
- paddd mm0,mm2 ;accumulate alpha/red (pixels 2/3)
- paddd mm1,mm3 ;accumulate green/blue (pixels 2/3)
-
-
- movd mm6,dword [esi+16]
-
- punpcklbw mm6,[esi+20] ;mm6=[a4][a5][r4][r5][g4][g5][b4][b5]
-
- movd mm2,dword [esi+24]
-
- punpcklbw mm2,[esi+28] ;mm2=[a6][a7][r6][r7][g6][g7][b6][b7]
- movq mm7,mm6 ;mm7=[a4][a5][r4][r5][g4][g5][b4][b5]
-
- punpckhbw mm6,mm5 ;mm6=[ a4 ][ a5 ][ r4 ][ r5 ]
- movq mm3,mm2 ;mm3=[a6][a7][r6][r7][g6][g7][b6][b7]
-
- pmaddwd mm6,[edx+16] ;mm6=[a4*f4+a5*f5][r4*f4+r5*f5]
- punpcklbw mm7,mm5 ;mm7=[ g4 ][ g5 ][ b4 ][ b5 ]
-
- pmaddwd mm7,[edx+16] ;mm7=[g4*f4+g5*f5][b4*f4+b5*f5]
- punpckhbw mm2,mm5 ;mm2=[ a6 ][ a7 ][ r6 ][ r7 ]
-
- pmaddwd mm2,[edx+24] ;mm2=[a6*f6+a7*f7][r6*f6+r7*f7]
- punpcklbw mm3,mm5 ;mm3=[ g6 ][ g7 ][ b6 ][ b7 ]
-
- pmaddwd mm3,[edx+24] ;mm3=[g6*f6+g7*f7][b6*f6+b7*f7]
- paddd mm0,mm6 ;accumulate alpha/red (pixels 4/5)
-
- paddd mm1,mm7 ;accumulate green/blue (pixels 4/5)
- paddd mm0,mm2 ;accumulate alpha/red (pixels 6/7)
-
- paddd mm1,mm3 ;accumulate green/blue (pixels 6/7)
-
-
- psrad mm0,14
- psrad mm1,14
-
- packssdw mm1,mm0
- mov esi,eax
-
- packuswb mm1,mm1
- mov edx,eax
-
- movd dword [edi+ebp],mm1 ;store one output pixel
- add ebp,4
- jne .pixelloop_8coeff
-
- pop ebx
- pop edi
- pop esi
- pop ebp
-
- ret
-
-
-
-
-
-
-
- ;-------------------------------------------------------------------------
- ;
- ; long resize_table_col_MMX(Pixel *out, Pixel **in_table, int *filter, int filter_width, PixDim w, long frac);
- ; Vertical table-driven filter: each of w outputs is the sum over filter_width rows (pointers in in_table) of pixel * coefficient, using the filter row at filter + frac*4*filter_width; biased by 2000h, shifted right by 14 and saturated. Dedicated paths exist for filter_width 4 and 6. The code never sets EAX as a return value and executes no emms.
- global _vdasm_resize_table_col_MMX
- _vdasm_resize_table_col_MMX:
- push ebp ;four registers saved: arguments are now at [esp+N+16]
- push esi
- push edi
- push ebx
-
- mov edx,[esp + 12 + 16] ;edx = filter base
- mov eax,[esp + 24 + 16] ;eax = frac
- shl eax,2
- imul eax,[esp + 16 + 16] ;eax = frac * 4 * filter_width
- add edx,eax ;edx = address of the selected filter row
- mov [esp + 12 + 16], edx ;[esp+12+16] = filter pointer
-
- mov ebp,[esp + 20 + 16] ;ebp = pixel counter
- mov edi,[esp + 4 + 16] ;edi = destination pointer
-
- pxor mm5,mm5 ;mm5 = 0, used to widen bytes to words
-
- cmp dword [esp+16+16], 4 ;dispatch on filter_width
- jz .accel_4coeff
- cmp dword [esp+16+16], 6
- jz .accel_6coeff
-
- mov ecx,[esp + 16 + 16]
- shr ecx,1
- mov [esp + 16 + 16],ecx ;ecx = filter pair count
-
- xor ebx,ebx ;ebx = source offset
-
- mov ecx,[esp + 16 + 16] ;ecx = filter width counter
- .pixelloop:
- mov eax,[esp + 8 + 16] ;eax = row pointer table
- movq mm6,[MMX_roundval] ;mm6 = green/blue accumulator, seeded with the rounding bias
- movq mm7,mm6 ;mm7 = alpha/red accumulator, seeded with the rounding bias
- pxor mm0,mm0
- pxor mm1,mm1
- .coeffloop:
- mov esi,[eax] ;esi = first row of this pair
- paddd mm6,mm0 ;add the products left over from the previous iteration
-
- movd mm0,dword [esi+ebx] ;mm0 = [0][0][0][0][x0][r0][g0][b0]
- paddd mm7,mm1
-
- mov esi,[eax+4] ;esi = second row of this pair
- add eax,8
-
- movd mm1,dword [esi+ebx] ;mm1 = [0][0][0][0][x1][r1][g1][b1]
- punpcklbw mm0,mm1 ;mm0 = [x0][x1][r0][r1][g0][g1][b0][b1]
-
- movq mm1,mm0
- punpcklbw mm0,mm5 ;mm0 = [g1][g0][b1][b0]
-
- pmaddwd mm0,[edx]
- punpckhbw mm1,mm5 ;mm1 = [x1][x0][r1][r0]
-
- pmaddwd mm1,[edx]
- add edx,8 ;next coefficient pair
-
- sub ecx,1
- jne .coeffloop
-
- paddd mm6,mm0 ;add the products of the final pair
- paddd mm7,mm1
-
- psrad mm6,14
- psrad mm7,14
- add edi,4
- packssdw mm6,mm7
- add ebx,4
- packuswb mm6,mm6
- sub ebp,1 ;flags from this sub drive the jne below (mov/movd do not change them)
-
- mov ecx,[esp + 16 + 16] ;ecx = filter width counter
- mov edx,[esp + 12 + 16] ;edx = filter bank pointer
-
- movd dword [edi-4],mm6 ;store one output pixel
- jne .pixelloop
-
- .xit:
- pop ebx
- pop edi
- pop esi
- pop ebp
- ret
-
-
-
- .accel_4coeff: ;filter_width == 4: coefficients held in mm2/mm3, four row pointers in registers
- movq mm2,[edx] ;mm2 = coefficients for rows 0/1
- movq mm3,[edx+8] ;mm3 = coefficients for rows 2/3
-
- mov esi,[esp+8+16] ;esi = row pointer table
- mov eax,[esi]
- add ebp,ebp
- mov ebx,[esi+4]
- add ebp,ebp ;ebp = w*4
- mov ecx,[esi+8]
- mov esi,[esi+12]
- add eax,ebp ;point all rows and the destination at their ends
- add ebx,ebp
- add ecx,ebp
- add esi,ebp
- add edi,ebp
- neg ebp ;ebp = negative byte offset that counts up to zero
-
- ;EAX source 0
- ;EBX source 1
- ;ECX source 2
- ;ESI source 3
- ;EDI destination
- ;EBP counter
-
- movq mm4,[MMX_roundval] ;mm4 = rounding bias
-
- .pixelloop4:
- movd mm6,dword [eax+ebp] ;mm6 = [0][0][0][0][x0][r0][g0][b0]
-
- punpcklbw mm6,[ebx+ebp] ;mm6 = [x0][x1][r0][r1][g0][g1][b0][b1]
-
- movq mm7,mm6
- punpcklbw mm6,mm5 ;mm6 = [g1][g0][b1][b0]
-
- pmaddwd mm6,mm2
- punpckhbw mm7,mm5 ;mm7 = [x1][x0][r1][r0]
-
- movd mm0,dword [ecx+ebp] ;mm0 = [0][0][0][0][x2][r2][g2][b2]
- pmaddwd mm7,mm2
-
- punpcklbw mm0,[esi+ebp] ;mm0 = [x2][x3][r2][r3][g2][g3][b2][b3]
- paddd mm6,mm4 ;add rounding bias to green/blue
-
- movq mm1,mm0
- punpcklbw mm0,mm5 ;mm0 = [g3][g2][b3][b2]
-
- pmaddwd mm0,mm3
- punpckhbw mm1,mm5 ;mm1 = [x3][x2][r3][r2]
-
- pmaddwd mm1,mm3
- paddd mm7,mm4 ;add rounding bias to alpha/red
-
- paddd mm6,mm0
- paddd mm7,mm1
-
- psrad mm6,14
- psrad mm7,14
- packssdw mm6,mm7
- packuswb mm6,mm6
-
- movd dword [edi+ebp],mm6 ;store one output pixel
-
- add ebp,4
- jne .pixelloop4
- jmp .xit
-
- .accel_6coeff: ;filter_width == 6: six row pointers need ESP as a seventh pointer register
- movq mm2,[edx] ;mm2 = coefficients for rows 0/1
- movq mm3,[edx+8] ;mm3 = coefficients for rows 2/3
- movq mm4,[edx+16] ;mm4 = coefficients for rows 4/5
-
- push 0 ;dummy slot, popped into eax on exit
- push dword [fs:0] ;save the current [fs:0] value on the real stack
- mov dword [fs:0],esp ;stash the real ESP in [fs:0] (presumably the Win32 exception-list slot -- confirm)
-
- mov esp,[esp+8+24] ;esp = row pointer table
- mov eax,[esp]
- add ebp,ebp
- mov ebx,[esp+4]
- add ebp,ebp ;ebp = w*4
- mov ecx,[esp+8]
- mov edx,[esp+12]
- mov esi,[esp+16]
- mov esp,[esp+20] ;esp = sixth row pointer
- add eax,ebp ;point all rows and the destination at their ends
- add ebx,ebp
- add ecx,ebp
- add edx,ebp
- add esi,ebp
- add edi,ebp
- add esp,ebp
- neg ebp ;ebp = negative byte offset that counts up to zero
-
- ;EAX source 0
- ;EBX source 1
- ;ECX source 2
- ;EDX source 3
- ;ESI source 4
- ;EDI destination
- ;ESP source 5
- ;EBP counter
-
- .pixelloop6:
- movd mm6,dword [eax+ebp] ;mm6 = [0][0][0][0][x0][r0][g0][b0]
-
- punpcklbw mm6,[ebx+ebp] ;mm6 = [x0][x1][r0][r1][g0][g1][b0][b1]
-
- movq mm7,mm6
- punpcklbw mm6,mm5 ;mm6 = [g1][g0][b1][b0]
-
- movd mm0,dword [ecx+ebp] ;mm0 = [0][0][0][0][x2][r2][g2][b2]
- punpckhbw mm7,mm5 ;mm7 = [x1][x0][r1][r0]
-
- punpcklbw mm0,[edx+ebp] ;mm0 = [x2][x3][r2][r3][g2][g3][b2][b3]
- pmaddwd mm6,mm2
-
- movq mm1,mm0
- punpcklbw mm0,mm5 ;mm0 = [g3][g2][b3][b2]
-
- pmaddwd mm7,mm2
- punpckhbw mm1,mm5 ;mm1 = [x3][x2][r3][r2]
-
- paddd mm6,[MMX_roundval] ;rounding bias comes from memory: mm4 holds coefficients on this path
- pmaddwd mm0,mm3
-
- paddd mm7,[MMX_roundval]
- pmaddwd mm1,mm3
-
- paddd mm6,mm0
-
- movd mm0,dword [esi+ebp] ;mm0 = [0][0][0][0][x4][r4][g4][b4]
- paddd mm7,mm1
-
- punpcklbw mm0,[esp+ebp] ;mm0 = [x4][x5][r4][r5][g4][g5][b4][b5]
- movq mm1,mm0
- punpcklbw mm0,mm5 ;mm0 = [g5][g4][b5][b4]
- punpckhbw mm1,mm5 ;mm1 = [x5][x4][r5][r4]
- pmaddwd mm0,mm4
- pmaddwd mm1,mm4
- paddd mm6,mm0
- paddd mm7,mm1
-
- psrad mm6,14
- psrad mm7,14
- packssdw mm6,mm7
- packuswb mm6,mm6
-
- movd dword [edi+ebp],mm6 ;store one output pixel
-
- add ebp,4
- jne .pixelloop6
-
- mov esp, dword [fs:0] ;recover the real stack pointer
- pop dword [fs:0] ;restore the saved [fs:0] value
- pop eax ;discard the dummy 0 pushed above
-
- jmp .xit
-
-
- global _vdasm_resize_table_col_SSE2 ;SSE2 vertical table-driven filter; reads the same stack argument slots (out, row pointer table, filter, filter_width, w, frac) as _vdasm_resize_table_col_MMX
- _vdasm_resize_table_col_SSE2:
- push ebp ;four registers saved: arguments are now at [esp+N+16]
- push esi
- push edi
- push ebx
-
- mov edx,[esp+12+16] ;edx = filter base
- mov eax,[esp+24+16] ;eax = frac
- shl eax,2
- imul eax,[esp+16+16] ;eax = frac * 4 * filter_width
- add edx,eax ;edx = address of the selected filter row
- mov [esp+12+16], edx ;[esp+12+16] = filter pointer
-
- mov ebp,[esp+20+16] ;ebp = pixel counter
- mov edi,[esp+4+16] ;edi = destination pointer
-
- pxor xmm7, xmm7 ;xmm7 = 0, used to widen bytes to words
- movdqa xmm6, [MMX_roundval] ;xmm6 = rounding bias (aligned 16-byte load)
-
- cmp dword [esp+16+16], 4 ;dispatch on filter_width
- jz .accel_4coeff
- cmp dword [esp+16+16], 6
- jz .accel_6coeff
-
- mov ecx,[esp+16+16]
- shr ecx,1
- mov [esp+16+16],ecx ;ecx = filter pair count
-
- xor ebx,ebx ;ebx = source offset
-
- mov ecx,[esp+16+16] ;ecx = filter width counter
- .pixelloop:
- mov eax, [esp+8+16] ;eax = row pointer table
- movdqa xmm4, xmm6 ;xmm4 = accumulator, seeded with the rounding bias
- .coeffloop:
- mov esi,[eax] ;esi = first row of this pair
-
- movd xmm0, dword [esi+ebx]
-
- mov esi,[eax+4] ;esi = second row of this pair
- add eax,8
-
- movd xmm1, dword [esi+ebx]
- punpcklbw xmm0, xmm1 ;interleave the two rows' bytes
-
- punpcklbw xmm0, xmm7 ;widen to eight words (all four channels)
-
- movq xmm2, qword [edx] ;one coefficient qword for this row pair
- pshufd xmm2, xmm2, 01000100b ;replicate the low qword into the high qword
-
- pmaddwd xmm0, xmm2
-
- paddd xmm4, xmm0
-
- add edx,8 ;next coefficient pair
-
- sub ecx,1
- jne .coeffloop
-
- psrad xmm4,14
- add edi,4
- packssdw xmm4,xmm4
- add ebx,4
- packuswb xmm4,xmm4
- sub ebp,1 ;flags from this sub drive the jne below (mov/movd do not change them)
-
- mov ecx,[esp+16+16] ;ecx = filter width counter
- mov edx,[esp+12+16] ;edx = filter bank pointer
-
- movd dword [edi-4],xmm4 ;store one output pixel
- jne .pixelloop
-
- .xit:
- pop ebx
- pop edi
- pop esi
- pop ebp
- ret
-
- .accel_4coeff: ;filter_width == 4: two pixels per iteration, plus an odd-pixel tail
- shl ebp, 2 ;ebp = w*4
- mov eax, [esp+8+16] ;eax = row pointer table
- mov esi, [eax+12]
- mov ecx, [eax+8]
- mov ebx, [eax+4]
- mov eax, [eax]
- lea edi, [edi+ebp-4] ;edi = address of the last output pixel
- neg ebp
-
- ;registers:
- ;
- ;EAX source 0
- ;EBX source 1
- ;ECX source 2
- ;ESI source 3
- ;EDI destination
- ;EBP counter
- ;
- movq xmm4, qword [edx] ;xmm4 = coeff 0/1
- movq xmm5, qword [edx+8] ;xmm5 = coeff 2/3
- punpcklqdq xmm4, xmm4
- punpcklqdq xmm5, xmm5
-
- add ebp, 4 ;ebp = 4 - w*4; zero means a single pixel
- jz .oddpixel_4coeff
-
- .pixelloop_4coeff_dualpel:
- movq xmm0, qword [eax] ;two pixels from each of the four rows
- movq xmm1, qword [ebx]
- movq xmm2, qword [ecx]
- movq xmm3, qword [esi]
- add eax,8
- add ebx,8
- add ecx,8
- add esi,8
- punpcklbw xmm0, xmm1 ;interleave rows 0/1
- punpcklbw xmm2, xmm3 ;interleave rows 2/3
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- punpcklbw xmm0, xmm7 ;first pixel, rows 0/1
- punpckhbw xmm1, xmm7 ;second pixel, rows 0/1
- punpcklbw xmm2, xmm7 ;first pixel, rows 2/3
- punpckhbw xmm3, xmm7 ;second pixel, rows 2/3
- pmaddwd xmm0, xmm4
- pmaddwd xmm1, xmm4
- pmaddwd xmm2, xmm5
- pmaddwd xmm3, xmm5
- paddd xmm0, xmm2
- paddd xmm1, xmm3
- paddd xmm0, xmm6 ;rounding bias
- paddd xmm1, xmm6
- psrad xmm0, 14
- psrad xmm1, 14
- packssdw xmm0, xmm1
- packuswb xmm0, xmm0
- movq qword [edi+ebp],xmm0 ;store two output pixels
- add ebp, 8
- jae .pixelloop_4coeff_dualpel ;no carry: at least two pixels remain
- jnz .xit ;carry and non-zero: w was even, done
-
- .oddpixel_4coeff:
- movd xmm0, dword [eax]
- movd xmm1, dword [ebx]
- movd xmm2, dword [ecx]
- movd xmm3, dword [esi]
- punpcklbw xmm0, xmm1
- punpcklbw xmm2, xmm3
- punpcklbw xmm0, xmm7
- punpcklbw xmm2, xmm7
- pmaddwd xmm0, xmm4
- pmaddwd xmm2, xmm5
- paddd xmm0, xmm2
- paddd xmm0, xmm6 ;rounding bias
- psrad xmm0, 14
- packssdw xmm0, xmm0
- packuswb xmm0, xmm0
- movd dword [edi],xmm0 ;store the last output pixel
- jmp .xit
-
-
- .accel_6coeff: ;filter_width == 6: two pixels per iteration; ESP is borrowed as the sixth row pointer
- movq xmm4, qword [edx] ;xmm4 = coeff 0/1
- movq xmm5, qword [edx+8] ;xmm5 = coeff 2/3
- movq xmm6, qword [edx+16] ;xmm6 = coeff 4/5 (overwrites the rounding bias; [MMX_roundval] is used from memory below)
- punpcklqdq xmm4, xmm4
- punpcklqdq xmm5, xmm5
- punpcklqdq xmm6, xmm6
-
- push 0 ;dummy slot, popped into eax on exit
- push dword [fs:0] ;save the current [fs:0] value on the real stack
- mov dword [fs:0],esp ;stash the real ESP in [fs:0] (presumably the Win32 exception-list slot -- confirm)
-
- shl ebp, 2 ;ebp = w*4
- mov eax, [esp+8+24] ;eax = row pointer table
- mov esp, [eax+20]
- mov esi, [eax+16]
- mov edx, [eax+12]
- mov ecx, [eax+8]
- mov ebx, [eax+4]
- mov eax, [eax]
- lea edi, [edi+ebp-4] ;edi = address of the last output pixel
- neg ebp
-
- ;registers:
- ;
- ;EAX source 0
- ;EBX source 1
- ;ECX source 2
- ;EDX source 3
- ;ESI source 4
- ;EDI destination
- ;ESP source 5
- ;EBP counter
- ;
-
- add ebp, 4 ;ebp = 4 - w*4; zero means a single pixel
- jz .oddpixel_6coeff
-
- .pixelloop_6coeff_dualpel:
- movq xmm0, qword [eax] ;two pixels from each of rows 0..3
- movq xmm1, qword [ebx]
- movq xmm2, qword [ecx]
- movq xmm3, qword [edx]
- add eax,8
- add ebx,8
- add ecx,8
- add edx,8
- punpcklbw xmm0, xmm1 ;interleave rows 0/1
- punpcklbw xmm2, xmm3 ;interleave rows 2/3
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- punpcklbw xmm0, xmm7 ;first pixel, rows 0/1
- punpckhbw xmm1, xmm7 ;second pixel, rows 0/1
- punpcklbw xmm2, xmm7 ;first pixel, rows 2/3
- punpckhbw xmm3, xmm7 ;second pixel, rows 2/3
- pmaddwd xmm0, xmm4
- pmaddwd xmm1, xmm4
- pmaddwd xmm2, xmm5
- pmaddwd xmm3, xmm5
- paddd xmm0, xmm2
- paddd xmm1, xmm3
-
- movq xmm2, qword [esi] ;two pixels from rows 4 and 5
- movq xmm3, qword [esp]
- add esi, 8
- add esp, 8
- punpcklbw xmm2, xmm3 ;interleave rows 4/5
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm7 ;first pixel, rows 4/5
- punpckhbw xmm3, xmm7 ;second pixel, rows 4/5
- pmaddwd xmm2, xmm6
- pmaddwd xmm3, xmm6
- paddd xmm0, xmm2
- paddd xmm1, xmm3
- paddd xmm0, [MMX_roundval] ;rounding bias (16-byte aligned memory operand)
- paddd xmm1, [MMX_roundval]
- psrad xmm0, 14
- psrad xmm1, 14
- packssdw xmm0, xmm1
- packuswb xmm0, xmm0
- movq qword [edi+ebp],xmm0 ;store two output pixels
- add ebp, 8
- jae .pixelloop_6coeff_dualpel ;no carry: at least two pixels remain
- jnz .xit_6coeff ;carry and non-zero: w was even, done
-
- .oddpixel_6coeff:
- movd xmm0, dword [eax]
- movd xmm1, dword [ebx]
- movd xmm2, dword [ecx]
- movd xmm3, dword [edx]
- punpcklbw xmm0, xmm1
- punpcklbw xmm2, xmm3
- movd xmm1, dword [esi]
- movd xmm3, dword [esp]
- punpcklbw xmm0, xmm7
- punpcklbw xmm2, xmm7
- pmaddwd xmm0, xmm4
- punpcklbw xmm1, xmm3
- pmaddwd xmm2, xmm5
- punpcklbw xmm1, xmm7
- pmaddwd xmm1, xmm6
- paddd xmm0, xmm2
- paddd xmm1, [MMX_roundval] ;rounding bias
- paddd xmm0, xmm1
- psrad xmm0, 14
- packssdw xmm0, xmm0
- packuswb xmm0, xmm0
- movd dword [edi],xmm0 ;store the last output pixel
-
- .xit_6coeff:
- mov esp, dword [fs:0] ;recover the real stack pointer
- pop dword [fs:0] ;restore the saved [fs:0] value
- pop eax ;discard the dummy 0 pushed above
- jmp .xit
-
-
- end
-