home *** CD-ROM | disk | FTP | other *** search
- ; VirtualDub - Video processing and capture application
- ; Graphics support library
- ; Copyright (C) 1998-2004 Avery Lee
- ;
- ; This program is free software; you can redistribute it and/or modify
- ; it under the terms of the GNU General Public License as published by
- ; the Free Software Foundation; either version 2 of the License, or
- ; (at your option) any later version.
- ;
- ; This program is distributed in the hope that it will be useful,
- ; but WITHOUT ANY WARRANTY; without even the implied warranty of
- ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ; GNU General Public License for more details.
- ;
- ; You should have received a copy of the GNU General Public License
- ; along with this program; if not, write to the Free Software
- ; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- ;
-
- default rel ;address static data RIP-relative by default
-
- segment .rdata, align=16
-
- ;Rounding bias added to every 32-bit filter sum before the arithmetic
- ;shift right by 14 used throughout this file: 2000h = 1 << 13, i.e. one
- ;half of the final unit. One copy per 32-bit lane; 16-byte aligned so it
- ;can be fetched with aligned 128-bit loads.
- align 16
- roundval: times 4 dd 00002000h
-
-
- segment .text
-
-
- %macro VDSAVE 1-* ;VDSAVE reg[, reg...]: push each listed register and emit the matching unwind directive
- ;the list is walked back-to-front (last argument is pushed first), so VDRESTORE with the same list pops in mirror order
- %rep %0
- %rotate -1 ;rotate right: the previous argument (initially the last one) becomes the first parameter
- push %1
- [pushreg %1] ;record the push above in the unwind data
- %endrep
-
- %endmacro
-
- %macro VDRESTORE 1-* ;VDRESTORE reg[, reg...]: pop the listed registers, first argument first
- ;pass the same list that was given to VDSAVE; VDSAVE pushes it back-to-front, so these pops undo it exactly
- %rep %0
- pop %1
- ;rotate left so the next argument becomes the first parameter
- %rotate 1
- %endrep
-
- %endmacro
-
- ;VDSAVEXMM128 first, last
- ;
- ;Prolog helper: reserves stack space and spills xmm<first>..xmm<last>
- ;(inclusive) into it, emitting the matching unwind directives.
- ;
- ;The reservation is count*16 bytes of save area plus 8 bytes of padding.
- ;As used in this file (right after an even number of pushes, with rsp at
- ;8 mod 16 on function entry) the padding brings rsp back to a 16-byte
- ;boundary, which the aligned movdqa stores below require.
- ;
- ;The allocstack directive must describe the full adjustment, padding
- ;included; otherwise the unwinder restores rsp 8 bytes short and reads
- ;every pushed register and the return address from the wrong slot.
- ;
- ;Must be paired with VDRESTOREXMM128 using the same arguments.
- %macro VDSAVEXMM128 2
- %assign %%count %2 + 1 - %1
- %assign %%stkoffset 0
- %assign %%reg %1
-
- sub rsp, %%count*16+8
- [allocstack %%count*16+8]
-
- %rep %%count
- movdqa oword [rsp+%%stkoffset], xmm %+ %%reg
- [savexmm128 xmm %+ %%reg, %%stkoffset]
-
- %assign %%stkoffset %%stkoffset + 16
- %assign %%reg %%reg + 1
- %endrep
- %endmacro
-
- ;VDRESTOREXMM128 first, last
- ;
- ;Epilog counterpart of VDSAVEXMM128: reloads xmm<first>..xmm<last> from
- ;the save area at the bottom of the stack (xmm<first> at [rsp], each
- ;following register 16 bytes higher) and then releases the area together
- ;with its 8 bytes of padding. The reloads are independent of each other,
- ;so they are issued in ascending register order.
- %macro VDRESTOREXMM128 2
- %assign %%total %2 - %1 + 1
- %assign %%slot 0
- %assign %%index %1
-
- %rep %%total
- movdqa xmm %+ %%index, oword [rsp+%%slot]
-
- %assign %%slot %%slot + 16
- %assign %%index %%index + 1
- %endrep
-
- add rsp, %%total*16+8
- %endmacro
-
- ;-------------------------------------------------------------------------
- ;
- ; long vdasm_resize_table_row_SSE2(
- ; Pixel *out, // rcx
- ; Pixel *in, // rdx
- ; int *filter, // r8
- ; int filter_width, // r9d
- ; PixDim w, // [rsp+40]
- ; long accum, // [rsp+48]
- ; long frac); // [rsp+56]
- ;
- ; Horizontal, table-driven resampler for one row of 32-bit pixels
- ; (Microsoft x64 calling convention, SSE2).
- ;
- ; accum and frac are 16.16 fixed-point values: accum is the source position
- ; of the first output pixel, frac the step per output pixel. Both integer
- ; parts are treated as signed. For every output pixel:
- ; - the integer part of the position selects the first source pixel;
- ; - the top 8 bits of the 16-bit fraction select one of 256 kernels, each
- ; filter_width*4 bytes long, starting at filter;
- ; - a kernel is a sequence of 8-byte entries, each holding one pair of
- ; 16-bit coefficients twice (c0,c1,c0,c1), applied to two adjacent
- ; source pixels;
- ; - the four byte channels are filtered independently, biased by roundval,
- ; shifted right by 14 and saturated to 0..255.
- ;
- ; The low two bits of the source pointer are discarded, i.e. the source is
- ; expected to be 4-byte aligned. Nothing is written when w <= 0.
- ;
- ; NOTE(review): the prototype says long, but rax is only used as scratch and
- ; is never set to a meaningful result - confirm callers ignore it.
- ;
- global vdasm_resize_table_row_SSE2
- proc_frame vdasm_resize_table_row_SSE2
-
- VDSAVE rbx, rsi, rdi, rbp, r12, r13, r14, r15
- VDSAVEXMM128 6, 15
- end_prolog
-
- .parms equ rsp+168+64 ;entry rsp = rsp + 168 (XMM save area) + 64 (eight pushes)
-
- mov r10d, dword [.parms+40] ;r10 = w (zero-extended)
- test r10d, r10d
- jle .xit ;every pixel loop below is do-while, so w <= 0 must leave here
- shl r10, 2 ;r10 = w*4 = output row size in bytes
- add rcx, r10 ;rcx = end of output row
- neg r10 ;r10 = -w*4, counts up to zero
- shl r9d, 2 ;filter_width <<= 2
-
- movaps xmm6, oword [roundval]
- pxor xmm5, xmm5 ;xmm5 = 0 for the whole function (byte -> word unpack)
- mov rsi, rdx
- shr rsi, 2 ;rsi = source/4, so that adc can step it in whole pixels
-
- mov edi, [.parms+48]
- movsxd rax, edi ;sign-extend accum so that a negative start position stays negative
- shl edi, 16 ;edi = fraction of accum in the upper 16 bits
- sar rax, 16 ;rax = signed integer part of accum
- add rsi, rax
- mov ebp, [.parms+56]
- movsxd r11, ebp
- shl ebp, 16 ;ebp = fraction of frac in the upper 16 bits
- sar r11, 16 ;r11 = signed integer part of frac
-
- ;register map
- ;
- ;eax temp coefficient pair counter
- ;rbx temp coefficient pointer
- ;rcx destination (end of row, indexed by r10)
- ;rdx temp source
- ;rsi source/4
- ;edi accumulator (fraction in the upper 16 bits)
- ;ebp fractional increment (in the upper 16 bits)
- ;r8 filter
- ;r9 filter_width*4 (bytes per kernel)
- ;r10 -width*4, counts up to zero
- ;r11 integral increment
- ;r12 unused
- ;r13 unused
- ;r14 unused
- ;r15 unused
- ;xmm5 zero
- ;xmm6 rounding bias
-
- ;dispatch on kernel size: 8 bytes per coefficient pair
- cmp r9d, 16
- jz .accel_4coeff ;2 pairs
- cmp r9d, 24
- jz .accel_6coeff ;3 pairs
-
- test r9d, 8
- jz .pixelloop_even_pairs ;even number of pairs
- cmp r9d, 8
- jnz .pixelloop_odd_pairs ;odd number of pairs, more than one
-
- .pixelloop_single_pairs:
- mov eax, edi
- shr eax, 24 ;eax = kernel index (top 8 bits of the fraction)
- imul eax, r9d ;eax = byte offset of the kernel
-
- lea rdx, [rsi*4]
-
- movd xmm0, dword [rdx] ;xmm0 = p0
- movd xmm1, dword [rdx+4] ;xmm1 = p1
- punpcklbw xmm0, xmm1 ;interleave the channels of p0 and p1
- punpcklbw xmm0, xmm5 ;widen to 16 bits
- movq xmm1, qword [r8+rax]
- pshufd xmm1, xmm1, 01000100b ;replicate the coefficient pair over all four channels
- pmaddwd xmm0, xmm1 ;per channel: p0*c0 + p1*c1
-
- movdqa xmm4, xmm6
- paddd xmm4, xmm0
-
- psrad xmm4, 14
- packssdw xmm4, xmm4
- packuswb xmm4, xmm4 ;saturate to bytes
-
- add edi, ebp ;advance the fraction...
- adc rsi, r11 ;...and carry into the source pixel index
-
- movd dword [rcx+r10], xmm4
- add r10, 4
- jnz .pixelloop_single_pairs
- jmp .xit
-
- .pixelloop_odd_pairs:
- movdqa xmm4, xmm6 ;start the sum from the rounding bias
-
- mov eax, edi
- shr eax, 24
- imul eax, r9d
- lea rbx, [r8+rax] ;rbx = kernel for this phase
-
- lea rdx, [rsi*4]
- lea rax, [r9-8] ;all pairs but the last are consumed two at a time
- .coeffloop_odd_pairs:
- movd xmm0, dword [rdx] ;xmm0 = p0
- movd xmm1, dword [rdx+4] ;xmm1 = p1
- movd xmm2, dword [rdx+8] ;xmm2 = p2
- movd xmm3, dword [rdx+12] ;xmm3 = p3
- add rdx, 16
- punpcklbw xmm0, xmm1
- punpcklbw xmm2, xmm3
- punpcklbw xmm0, xmm5
- punpcklbw xmm2, xmm5
- movq xmm1, qword [rbx]
- movq xmm3, qword [rbx+8]
- add rbx, 16
- pshufd xmm1, xmm1, 01000100b
- pshufd xmm3, xmm3, 01000100b
- pmaddwd xmm0, xmm1
- pmaddwd xmm2, xmm3
- paddd xmm0, xmm2
- paddd xmm4, xmm0
- sub eax, 16
- jnz .coeffloop_odd_pairs
-
- ;remaining single pair
- movd xmm0, dword [rdx] ;xmm0 = p0
- movd xmm1, dword [rdx+4] ;xmm1 = p1
- punpcklbw xmm0, xmm1
- punpcklbw xmm0, xmm5
- movq xmm1, qword [rbx]
- pshufd xmm1, xmm1, 01000100b
- pmaddwd xmm0, xmm1
- paddd xmm4, xmm0
-
- psrad xmm4, 14
- packssdw xmm4, xmm4
- packuswb xmm4, xmm4
-
- add edi, ebp
- adc rsi, r11
-
- movd dword [rcx+r10], xmm4
- add r10, 4
- jnz .pixelloop_odd_pairs
- jmp .xit
-
- .pixelloop_even_pairs:
- movdqa xmm4, xmm6 ;start the sum from the rounding bias
-
- mov eax, edi
- shr eax, 24
- imul eax, r9d
- lea rbx, [r8+rax] ;rbx = kernel for this phase
-
- lea rdx, [rsi*4]
- mov eax, r9d ;kernel bytes left, 16 (two pairs) per iteration
- .coeffloop_even_pairs:
- movd xmm0, dword [rdx] ;xmm0 = p0
- movd xmm1, dword [rdx+4] ;xmm1 = p1
- movd xmm2, dword [rdx+8] ;xmm2 = p2
- movd xmm3, dword [rdx+12] ;xmm3 = p3
- add rdx, 16
- punpcklbw xmm0, xmm1
- punpcklbw xmm2, xmm3
- punpcklbw xmm0, xmm5
- punpcklbw xmm2, xmm5
- movq xmm1, qword [rbx]
- movq xmm3, qword [rbx+8]
- add rbx, 16
- pshufd xmm1, xmm1, 01000100b
- pshufd xmm3, xmm3, 01000100b
- pmaddwd xmm0, xmm1
- pmaddwd xmm2, xmm3
- paddd xmm0, xmm2
- paddd xmm4, xmm0
- sub eax, 16
- jnz .coeffloop_even_pairs
-
- psrad xmm4, 14
- packssdw xmm4, xmm4
- packuswb xmm4, xmm4
-
- add edi, ebp
- adc rsi, r11
-
- movd dword [rcx+r10], xmm4
- add r10, 4
- jnz .pixelloop_even_pairs
-
- .xit:
- VDRESTOREXMM128 6, 15
- VDRESTORE rbx, rsi, rdi, rbp, r12, r13, r14, r15
- ret
-
- ;fixed 4-tap kernel (16 bytes per phase); xmm5 is still zero from the setup above
- .accel_4coeff:
- .pixelloop_4coeff:
- movdqa xmm4, xmm6
-
- mov eax, 0ff000000h
- lea rdx, [rsi*4]
- and eax, edi
- shr eax, 20 ;eax = kernel index * 16
- lea rbx, [r8+rax]
-
- movd xmm0, dword [rdx] ;xmm0 = p0
- movd xmm1, dword [rdx+4] ;xmm1 = p1
- movd xmm2, dword [rdx+8] ;xmm2 = p2
- movd xmm3, dword [rdx+12] ;xmm3 = p3
- punpcklbw xmm0, xmm1
- punpcklbw xmm2, xmm3
- punpcklbw xmm0, xmm5
- punpcklbw xmm2, xmm5
- movq xmm1, qword [rbx]
- movq xmm3, qword [rbx+8]
- pshufd xmm1, xmm1, 01000100b
- pshufd xmm3, xmm3, 01000100b
- pmaddwd xmm0, xmm1
- pmaddwd xmm2, xmm3
- paddd xmm0, xmm2
- paddd xmm4, xmm0
-
- psrad xmm4, 14
- packssdw xmm4, xmm4
- packuswb xmm4, xmm4
-
- add edi, ebp
- adc rsi, r11
-
- movd dword [rcx+r10], xmm4
- add r10, 4
- jnz .pixelloop_4coeff
- jmp .xit
-
- ;fixed 6-tap kernel (24 bytes per phase); xmm5 is still zero from the setup above
- .accel_6coeff:
- .pixelloop_6coeff:
- movdqa xmm4, xmm6
-
- lea rdx, [rsi*4]
- mov eax, edi
- shr eax, 24
- lea rax, [rax+rax*2]
- lea rbx, [r8+rax*8] ;rbx = filter + kernel index * 24
-
- movd xmm0, dword [rdx] ;xmm0 = p0
- movd xmm1, dword [rdx+4] ;xmm1 = p1
- movd xmm2, dword [rdx+8] ;xmm2 = p2
- movd xmm3, dword [rdx+12] ;xmm3 = p3
- movd xmm8, dword [rdx+16] ;xmm8 = p4
- movd xmm9, dword [rdx+20] ;xmm9 = p5
- punpcklbw xmm0, xmm1
- punpcklbw xmm2, xmm3
- punpcklbw xmm8, xmm9
- punpcklbw xmm0, xmm5
- punpcklbw xmm2, xmm5
- punpcklbw xmm8, xmm5
- movq xmm1, qword [rbx]
- movq xmm3, qword [rbx+8]
- movq xmm9, qword [rbx+16]
- pshufd xmm1, xmm1, 01000100b
- pshufd xmm3, xmm3, 01000100b
- pshufd xmm9, xmm9, 01000100b
- pmaddwd xmm0, xmm1
- pmaddwd xmm2, xmm3
- pmaddwd xmm8, xmm9
- paddd xmm0, xmm2
- paddd xmm4, xmm0
- paddd xmm4, xmm8
-
- psrad xmm4, 14
- packssdw xmm4, xmm4
- packuswb xmm4, xmm4
-
- add edi, ebp
- adc rsi, r11
-
- movd dword [rcx+r10], xmm4
- add r10, 4
- jnz .pixelloop_6coeff
- jmp .xit
- endproc_frame
-
-
- ;--------------------------------------------------------------------------
- ;
- ; vdasm_resize_table_col_SSE2(
- ; uint32 *dst, // rcx
- ; const uint32 *const *srcs, // rdx
- ; int *filter, // r8
- ; int filter_width, // r9d
- ; PixDim w, // [rsp+40] -> r10d
- ; );
- ;
- ; Vertical resampler for one output row of 32-bit pixels
- ; (Microsoft x64 calling convention, SSE2).
- ;
- ; srcs is a table of filter_width row pointers. Output pixel i is the
- ; weighted sum of pixel i of every source row; the four byte channels are
- ; filtered independently, biased by roundval, shifted right by 14 and
- ; saturated to 0..255.
- ;
- ; filter is a sequence of 8-byte entries, one per pair of rows; the low two
- ; 16-bit words of an entry are the coefficients for the first and second
- ; row of the pair. Every path consumes rows in pairs (the generic path uses
- ; filter_width/2 pairs), so filter_width is expected to be even and at
- ; least 2. Widths 4 and 6 take dedicated paths that keep the row pointers
- ; and coefficients in registers and produce two pixels per iteration.
- ;
- ; Nothing is written when w <= 0.
- ;
- global vdasm_resize_table_col_SSE2
- proc_frame vdasm_resize_table_col_SSE2
- VDSAVE rbx, rsi, rdi, rbp, r12, r13, r14, r15
- VDSAVEXMM128 6, 15
- end_prolog
-
- .parms equ rsp+168+64 ;entry rsp = rsp + 168 (XMM save area) + 64 (eight pushes)
-
- mov r10d, [.parms+40] ;r10d = w
- test r10d, r10d
- jle .xit ;all loops below run at least once, so w <= 0 must leave here
-
- pxor xmm5, xmm5 ;xmm5 = 0 (byte -> word unpack)
- movdqa xmm4, oword [roundval] ;xmm4 = rounding bias
- xor ebx, ebx ;rbx = source offset
-
- cmp r9d, 4
- jz .accel_4coeff
- cmp r9d, 6
- jz .accel_6coeff
-
- shr r9d, 1 ;r9d = filter pair count
-
- .pixelloop:
- mov rax, rdx ;rax = row pointer table
- mov rdi, r8 ;rdi = filter
- mov r11d, r9d ;r11d = filter width counter
- movdqa xmm2, xmm4 ;start the sum from the rounding bias
- .coeffloop:
- mov rsi, [rax]
-
- movd xmm0, dword [rsi+rbx] ;pixel from the first row of the pair
-
- mov rsi, [rax+8]
- add rax, 16
-
- movd xmm1, dword [rsi+rbx] ;pixel from the second row of the pair
- punpcklbw xmm0, xmm1 ;interleave the channels of both pixels
-
- punpcklbw xmm0, xmm5 ;widen to 16 bits
-
- movq xmm1, qword [rdi]
- pshufd xmm1, xmm1, 01000100b ;replicate the coefficient pair over all four channels
-
- pmaddwd xmm0, xmm1 ;per channel: a*c0 + b*c1
-
- paddd xmm2, xmm0
-
- add rdi,8
-
- sub r11d,1
- jne .coeffloop
-
- psrad xmm2,14
- packssdw xmm2,xmm2
- add rbx,4
- packuswb xmm2,xmm2 ;saturate to bytes
-
- movd dword [rcx],xmm2
- add rcx,4
- sub r10d,1
- jne .pixelloop
-
- .xit:
- VDRESTOREXMM128 6, 15
- VDRESTORE rbx, rsi, rdi, rbp, r12, r13, r14, r15
- ret
-
- ;---- 4 rows: row pointers in r12-r15, coefficient pairs in xmm8/xmm9
- .accel_4coeff:
- mov r12, [rdx]
- mov r13, [rdx+8]
- mov r14, [rdx+16]
- mov r15, [rdx+24]
- movq xmm8, qword [r8]
- punpcklqdq xmm8, xmm8
- movq xmm9, qword [r8+8]
- punpcklqdq xmm9, xmm9
-
- ;r10d = w-1 from here on: above zero means at least two pixels remain,
- ;zero means exactly one, a borrow means none. w >= 1 is guaranteed by the
- ;check at the top, so a single-pixel row has to skip the two-pixel loop,
- ;which would otherwise read and write 8 bytes where only 4 are valid.
- sub r10d, 1
- jz .oddpixel_4coeff
- .pixelloop_4coeff:
- movq xmm0, qword [r12+rbx] ;two pixels from each row
- movq xmm1, qword [r13+rbx]
- movq xmm2, qword [r14+rbx]
- movq xmm3, qword [r15+rbx]
-
- punpcklbw xmm0, xmm1
- punpcklbw xmm2, xmm3
-
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
-
- punpcklbw xmm0, xmm5 ;low halves = first pixel
- punpckhbw xmm1, xmm5 ;high halves = second pixel
- punpcklbw xmm2, xmm5
- punpckhbw xmm3, xmm5
-
- pmaddwd xmm0, xmm8
- pmaddwd xmm1, xmm8
- pmaddwd xmm2, xmm9
- pmaddwd xmm3, xmm9
-
- paddd xmm0, xmm4
- paddd xmm1, xmm4
- paddd xmm0, xmm2
- paddd xmm1, xmm3
-
- psrad xmm0, 14
- psrad xmm1, 14
- packssdw xmm0, xmm1
- packuswb xmm0, xmm0
-
- movq qword [rcx], xmm0
- add rcx, 8
- add rbx, 8
- sub r10d, 2
- ja .pixelloop_4coeff ;two or more pixels left
- jnz .xit ;borrow: the row is complete
- .oddpixel_4coeff: ;exactly one pixel left
- movd xmm0, dword [r12+rbx]
- movd xmm1, dword [r13+rbx]
- movd xmm2, dword [r14+rbx]
- movd xmm3, dword [r15+rbx]
-
- punpcklbw xmm0, xmm1
- punpcklbw xmm2, xmm3
- punpcklbw xmm0, xmm5
- punpcklbw xmm2, xmm5
-
- pmaddwd xmm0, xmm8
- pmaddwd xmm2, xmm9
-
- paddd xmm0, xmm4
- paddd xmm0, xmm2
-
- psrad xmm0, 14
- packssdw xmm0, xmm0
- packuswb xmm0, xmm0
-
- movd dword [rcx], xmm0
-
- jmp .xit
-
- ;---- 6 rows: row pointers in r12-r15, rsi, rdx; coefficient pairs in xmm10-xmm12
- .accel_6coeff:
- mov r12, [rdx]
- mov r13, [rdx+8]
- mov r14, [rdx+16]
- mov r15, [rdx+24]
- mov rsi, [rdx+32]
- mov rdx, [rdx+40] ;the table pointer is not needed after this load
- movq xmm10, qword [r8]
- punpcklqdq xmm10, xmm10
- movq xmm11, qword [r8+8]
- punpcklqdq xmm11, xmm11
- movq xmm12, qword [r8+16]
- punpcklqdq xmm12, xmm12
-
- ;same counting scheme as the 4-row path: r10d = w-1, and a single-pixel
- ;row goes straight to the one-pixel tail
- sub r10d, 1
- jz .oddpixel_6coeff
- .pixelloop_6coeff:
- movq xmm0, qword [r12+rbx] ;two pixels from each row
- movq xmm1, qword [r13+rbx]
- movq xmm2, qword [r14+rbx]
- movq xmm3, qword [r15+rbx]
- movq xmm8, qword [rsi+rbx]
- movq xmm9, qword [rdx+rbx]
-
- punpcklbw xmm0, xmm1
- punpcklbw xmm2, xmm3
- punpcklbw xmm8, xmm9
-
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
- movdqa xmm9, xmm8
-
- punpcklbw xmm0, xmm5 ;low halves = first pixel
- punpckhbw xmm1, xmm5 ;high halves = second pixel
- punpcklbw xmm2, xmm5
- punpckhbw xmm3, xmm5
- punpcklbw xmm8, xmm5
- punpckhbw xmm9, xmm5
-
- pmaddwd xmm0, xmm10
- pmaddwd xmm1, xmm10
- pmaddwd xmm2, xmm11
- pmaddwd xmm3, xmm11
- pmaddwd xmm8, xmm12
- pmaddwd xmm9, xmm12
-
- paddd xmm0, xmm4
- paddd xmm1, xmm4
- paddd xmm2, xmm8
- paddd xmm3, xmm9
- paddd xmm0, xmm2
- paddd xmm1, xmm3
-
- psrad xmm0, 14
- psrad xmm1, 14
- packssdw xmm0, xmm1
- packuswb xmm0, xmm0
-
- movq qword [rcx], xmm0
- add rcx, 8
- add rbx, 8
- sub r10d, 2
- ja .pixelloop_6coeff ;two or more pixels left
- jnz .xit ;borrow: the row is complete
- .oddpixel_6coeff: ;exactly one pixel left
- movd xmm0, dword [r12+rbx]
- movd xmm1, dword [r13+rbx]
- movd xmm2, dword [r14+rbx]
- movd xmm3, dword [r15+rbx]
- movd xmm8, dword [rsi+rbx]
- movd xmm9, dword [rdx+rbx]
-
- punpcklbw xmm0, xmm1
- punpcklbw xmm2, xmm3
- punpcklbw xmm8, xmm9
- punpcklbw xmm0, xmm5
- punpcklbw xmm2, xmm5
- punpcklbw xmm8, xmm5
-
- pmaddwd xmm0, xmm10
- pmaddwd xmm2, xmm11
- pmaddwd xmm8, xmm12
-
- paddd xmm0, xmm4
- paddd xmm2, xmm8
- paddd xmm0, xmm2
-
- psrad xmm0, 14
- packssdw xmm0, xmm0
- packuswb xmm0, xmm0
-
- movd dword [rcx], xmm0
-
- jmp .xit
- endproc_frame
-
- end
-