home *** CD-ROM | disk | FTP | other *** search
- section .rdata, rdata ; read-only 64-bit constants loaded by the MMX converters in this file
-
- x07b dq 00707070707070707h ; 07h in every byte: keeps the 3 fill bits per channel in the 16-bit to 32-bit expanders
- x0200w dq 00200020002000200h ; bit 9 of every word: top green bit of a 1555 pixel (used by 1555 to 565)
- x001fw dq 0001f001f001f001fh ; bits 0-4 of every word: blue field (used by 565 to 1555)
- xffc0w dq 0ffc0ffc0ffc0ffc0h ; bits 6-15 of every word: 565 red plus the upper 5 green bits (used by 565 to 1555)
- xffe0w dq 0ffe0ffe0ffe0ffe0h ; bits 5-15 of every word: everything above blue (used by 1555 to 565)
- x2080w dq 02080208020802080h ; pmulhw factor: masked 565 green (07e0h) becomes an 8-bit value (g6*4 + g6/16)
- x4200w dq 04200420042004200h ; pmulhw factor: masked 1555 green (03e0h) becomes an 8-bit value (g5*8 + g5/4)
- rb_mask5 dq 000f800f800f800f8h ; top 5 bits of bytes 0 and 2 (blue, red) of two 32-bit pixels
- g_mask5 dq 00000f8000000f800h ; top 5 bits of byte 1 (green) of two 32-bit pixels
- g_mask6 dq 00000fc000000fc00h ; top 6 bits of byte 1 (green) of two 32-bit pixels
- rb_mul_565 dq 02000000420000004h ; pmaddwd weights per pixel: low word * 4, high word * 2000h (8888 to 565)
- rb_mul_555 dq 02000000820000008h ; pmaddwd weights per pixel: low word * 8, high word * 2000h (8888 to 1555)
- r_mask_555 dq 07c007c007c007c00h ; 1555 red field, bits 10-14 of every word
- g_mask_555 dq 003e003e003e003e0h ; 1555 green field, bits 5-9 of every word
- b_mask_555 dq 0001f001f001f001fh ; 1555 blue field, bits 0-4 of every word
- r_mask_565 dq 0f800f800f800f800h ; 565 red field, bits 11-15 of every word
- g_mask_565 dq 007e007e007e007e0h ; 565 green field, bits 5-10 of every word
- b_mask_565 dq 0001f001f001f001fh ; 565 blue field, bits 0-4 of every word
-
- %macro prologue 1
- push ebx ; four pushes = 16 bytes between esp and the return address,
- push esi ; which is why every routine below addresses its stack
- push edi ; arguments as [esp+N+16]
- push ebp
- ;.fpo (0,%1,4,4,1,0)   (commented out; this is the only place the macro's single parameter is referenced)
- %endmacro
-
- %macro epilogue 0
- pop ebp ; restores the four registers saved by prologue, in reverse push order
- pop edi
- pop esi
- pop ebx
- %endmacro
-
- section .text
-
- ;-----------------------------------------------------------------------
- ; _vdasm_pixblt_RGB565_to_XRGB1555_MMX
- ;
- ; Converts a rectangle of 16-bit RGB565 pixels to 16-bit XRGB1555 by
- ; dropping the lowest green bit:
- ;     out = ((in & 0ffc0h) shifted right by 1) + (in & 0001fh)
- ;
- ; Stack arguments, as addressed after prologue has pushed 16 bytes:
- ;     [esp+ 4+16]  destination pointer
- ;     [esp+ 8+16]  bytes added to the destination pointer after each row
- ;     [esp+12+16]  source pointer
- ;     [esp+16+16]  bytes added to the source pointer after each row
- ;     [esp+20+16]  width in pixels (this slot is overwritten with -2*width)
- ;     [esp+24+16]  number of rows
- ; Width and row count are not checked against zero.
- ;
- ; Register use: ecx/edx = source/destination row start biased to
- ; (row end - 6), ebp = negative byte index counting up towards zero,
- ; edi = rows left, mm4/mm5 = field masks. eax, ecx, edx, mm0, mm1, mm4
- ; and mm5 are modified; ebx, esi, edi and ebp are restored by epilogue.
- ; emms is executed before returning.
- ;
- ; Fix relative to the previous version: rows narrower than four pixels
- ; used to jump past the "sub ebp, 6" that rebases the index for the
- ; scalar loop, so the loop started at the wrong address with a positive
- ; index and ran far out of bounds. The .odd entry point now performs the
- ; rebasing itself, matching the other 16-bit converters in this file.
- ;-----------------------------------------------------------------------
- global _vdasm_pixblt_RGB565_to_XRGB1555_MMX
- _vdasm_pixblt_RGB565_to_XRGB1555_MMX:
- prologue 6
-
- mov ebp, [esp+20+16] ; ebp = width in pixels
- mov edi, [esp+24+16] ; edi = rows remaining
- add ebp, ebp ; ebp = row length in bytes (2 per pixel)
- mov edx, [esp+4+16] ; edx = destination
- mov ecx, [esp+12+16] ; ecx = source
- lea edx, [edx+ebp-6] ; bias both pointers to (row end - 6) so a negative
- lea ecx, [ecx+ebp-6] ; index that counts up to zero can address the row
- neg ebp
- mov [esp+20+16], ebp ; keep -2*width in the width slot; reloaded for every row
-
- movq mm5, [x001fw] ; mm5 = blue mask
- movq mm4, [xffc0w] ; mm4 = red + upper five green bits
-
- .yloop:
- mov ebp, [esp+20+16]
- add ebp, 6 ; carry or zero is produced for widths 1..3
- jbe .odd ; no full group of four pixels: scalar loop only
-
- .xloop: ; four pixels per iteration
- movq mm0, [ecx+ebp]
- movq mm1, mm5
- pand mm1, mm0 ; mm1 = blue fields
- pand mm0, mm4 ; mm0 = red and upper green fields
- psrlq mm0, 1 ; down one bit; bit 0 of every word is already clear, so nothing crosses a word boundary
- paddw mm0, mm1
- movq [edx+ebp], mm0
- add ebp, 8
- jnc .xloop ; carry means the index is no longer negative: fewer than four pixels left
-
- .odd:
- sub ebp, 6 ; ebp = -2 * (pixels still to convert in this row)
- jz .noodd
- .oddloop: ; one pixel per iteration
- movzx eax, word [ecx+ebp+6]
- mov ebx, 0001f001fh ; only the low word of each mask matters: eax is zero-extended
- and ebx, eax
- and eax, 0ffc0ffc0h
- shr eax, 1
- add eax, ebx
- mov [edx+ebp+6], ax
- add ebp, 2
- jnz .oddloop
- .noodd:
- add ecx, [esp+16+16] ; advance source by its per-row step
- add edx, [esp+8+16] ; advance destination by its per-row step
- dec edi
- jne .yloop
-
- emms
- epilogue
- ret
-
- global _vdasm_pixblt_XRGB8888_to_XRGB1555_MMX ; 32-bit XRGB8888 to 16-bit XRGB1555: keeps the top 5 bits of each colour byte
- _vdasm_pixblt_XRGB8888_to_XRGB1555_MMX: ; stack args (each +16 after prologue): +4 dst, +8 dst row step, +12 src, +16 src row step, +20 width, +24 rows
- prologue 6
- ; setup: edx/ecx are biased to the row end (dst - 14 bytes, src - 28 bytes); ebp is a negative destination byte index counting up to zero
- mov ebp, [esp+20+16] ; ebp = width in pixels
- mov edi, [esp+24+16] ; edi = rows remaining
- add ebp, ebp ; ebp = destination row length in bytes
- mov edx, [esp+4+16] ; edx = destination
- mov ecx, [esp+12+16] ; ecx = source
- lea edx, [edx+ebp-14]
- lea ecx, [ecx+ebp*2-28] ; source uses twice the index (4 bytes per pixel)
- neg ebp
- mov [esp+20+16], ebp ; width slot now holds -2*width; reloaded for every row
-
- movq mm5,[rb_mul_555] ; pmaddwd weights: blue * 8, red * 2000h
- movq mm6,[rb_mask5] ; red/blue top-5-bit mask
- movq mm7,[g_mask5] ; green top-5-bit mask
-
- .yloop:
- mov ebp, [esp+20+16]
- add ebp, 14 ; carry or zero is produced for widths 1..7
- jbe .odd ; fewer than eight pixels: scalar loop only
-
- ;This code uses the "pmaddwd" trick for 32->16 conversions from Intel's MMX
- ;Application Notes.
- ; The loop is software-pipelined, 8 pixels per pass; the allocate/free notes appear to track which of mm0-mm4 are live.
- movq mm0,[ecx+ebp*2] ;allocate 0 (0123)
- movq mm2,mm0 ;allocate 2 (0 23)
-
- movq mm1,[ecx+ebp*2+8] ;allocate 1 (0123)
- movq mm3,mm1 ;allocate 3 (0123)
- pand mm0,mm6 ; red/blue of pixels 0-1
- pmaddwd mm0,mm5 ; blue*8 + red*2000h: both fields land around the green bits
- pand mm1,mm6 ; red/blue of pixels 2-3
- pmaddwd mm1,mm5
- pand mm2,mm7 ; green of pixels 0-1
- pand mm3,mm7 ; green of pixels 2-3
- jmp .xloopstart ; enter the pipeline in the middle: nothing from a previous pass to finish yet
-
- align 16
- .xloop:
- movq mm0,[ecx+ebp*2] ;allocate 0 (01234)
- por mm4,mm2 ;free 2 (01 34)
-
- por mm3,mm1 ;free 3 (01 34)
- movq mm2,mm0 ;allocate 2 (0 234)
-
- movq mm1,[ecx+ebp*2+8] ;allocate 1 (01234)
- psrld mm4,6 ; drop the 6 unused low bits: one 15-bit pixel per dword
-
- psrld mm3,6
- pand mm0,mm6
-
- packssdw mm4,mm3 ;free 3 (012 4)
- movq mm3,mm1 ;allocate 3 (01234)
-
- pmaddwd mm0,mm5
- pand mm1,mm6
-
- pmaddwd mm1,mm5
- pand mm2,mm7
-
- movq [edx+ebp-8],mm4 ;free 4 (0123 )
- pand mm3,mm7
-
- .xloopstart:
- movq mm4,[ecx+ebp*2+16] ;allocate 4 (01234)
- por mm0,mm2 ;free 2 (01 34)
-
- por mm1,mm3 ;free 3 (01 4)
- psrld mm0,6
-
- movq mm3,[ecx+ebp*2+24] ;allocate 3 (01 34)
- movq mm2,mm4 ;allocate 2 (01234)
-
- psrld mm1,6
- pand mm4,mm6
-
- packssdw mm0,mm1 ;free 1 (0 234)
- movq mm1,mm3 ;allocate 1 (01234)
-
- movq [edx+ebp],mm0 ;free 0 ( 1234)
- pand mm3,mm6
-
- pmaddwd mm4,mm5
- add ebp,16 ; sets the carry tested by jnc below; the MMX instructions in between do not touch flags
-
- pmaddwd mm3,mm5
- pand mm2,mm7
-
- pand mm1,mm7
- jnc .xloop ; carry means the index is no longer negative: fewer than eight pixels left
- ; drain the pipeline: finish and store the second group of four pixels from the last pass
- por mm4,mm2 ;free 2 (01 34)
- por mm3,mm1 ;free 3 (01 34)
- psrld mm4,6
- psrld mm3,6
- packssdw mm4,mm3 ;free 3 (012 4)
- movq [edx+ebp-8],mm4 ;free 4 (0123 )
-
- .odd:
- sub ebp, 14 ; ebp = -2 * (pixels still to convert in this row)
- jz .noodd
- .oddloop: ; one pixel per iteration
- mov eax, [ecx+ebp*2+28]
- mov ebx, 00f80000h
- mov esi, eax
- and ebx, eax ; red, top 5 bits
- shr ebx, 9 ; to bits 10-14
- and esi, 0000f800h ; green, top 5 bits
- shr esi, 6 ; to bits 5-9
- and eax, 000000f8h ; blue, top 5 bits
- shr eax, 3 ; to bits 0-4
- add esi, ebx
- add eax, esi
- mov [edx+ebp+14], ax
- add ebp, 2
- jnz .oddloop
- .noodd:
- add ecx, [esp+16+16] ; advance source by its per-row step
- add edx, [esp+8+16] ; advance destination by its per-row step
- dec edi
- jne .yloop
-
- emms
- epilogue
- ret
-
- global _vdasm_pixblt_XRGB1555_to_RGB565_MMX ; 16-bit XRGB1555 to 16-bit RGB565: out = in + (in & ffe0h) + ((in & 0200h) shifted right by 4)
- _vdasm_pixblt_XRGB1555_to_RGB565_MMX: ; stack args (each +16 after prologue): +4 dst, +8 dst row step, +12 src, +16 src row step, +20 width, +24 rows
- prologue 6
- ; setup: edx/ecx are biased to (row end - 6); ebp is a negative byte index counting up to zero
- mov ebp, [esp+20+16] ; ebp = width in pixels
- mov edi, [esp+24+16] ; edi = rows remaining
- add ebp, ebp ; ebp = row length in bytes
- mov edx, [esp+4+16] ; edx = destination
- mov ecx, [esp+12+16] ; ecx = source
- lea edx, [edx+ebp-6]
- lea ecx, [ecx+ebp-6]
- neg ebp
- mov [esp+20+16], ebp ; width slot now holds -2*width; reloaded for every row
-
- movq mm5, [x0200w] ; mm5 = top green bit of each 1555 pixel
- movq mm4, [xffe0w] ; mm4 = everything above the blue field
-
- .yloop:
- mov ebp, [esp+20+16]
- add ebp, 6 ; carry or zero is produced for widths 1..3
- jbe .odd ; fewer than four pixels: scalar loop only
-
- .xloop: ; four pixels per iteration
- movq mm0, [ecx+ebp]
- movq mm1, mm4
- movq mm2, mm0
- pand mm1, mm0 ; mm1 = pixel & ffe0h
- pand mm0, mm5 ; mm0 = top green bit (bit 9)
- paddw mm1, mm2 ; pixel + upper part: red/green move up one bit, blue is unchanged
- psrlq mm0, 4 ; top green bit to bit 5, the new lowest green bit
- paddw mm0, mm1
- movq [edx+ebp], mm0
- add ebp, 8
- jnc .xloop ; carry means the index is no longer negative: fewer than four pixels left
-
- .odd:
- sub ebp, 6 ; ebp = -2 * (pixels still to convert in this row)
- jz .noodd
- .oddloop: ; one pixel per iteration, same arithmetic as the MMX loop
- movzx eax, word [ecx+ebp+6]
- mov ebx, 02000200h ; only the low word of each mask matters: eax is zero-extended
- mov esi, eax
- and ebx, eax
- shr ebx, 4
- and esi, 0ffe0ffe0h
- add eax, esi
- add eax, ebx
- mov [edx+ebp+6], ax
- add ebp, 2
- jnz .oddloop
- .noodd:
- add ecx, [esp+16+16] ; advance source by its per-row step
- add edx, [esp+8+16] ; advance destination by its per-row step
- dec edi
- jne .yloop
-
- emms
- epilogue
- ret
-
-
- global _vdasm_pixblt_XRGB8888_to_RGB565_MMX ; 32-bit XRGB8888 to 16-bit RGB565: keeps the top 5/6/5 bits of red/green/blue
- _vdasm_pixblt_XRGB8888_to_RGB565_MMX: ; stack args (each +16 after prologue): +4 dst, +8 dst row step, +12 src, +16 src row step, +20 width, +24 rows
- prologue 6
- ; setup: edx/ecx are biased to the row end (dst - 14 bytes, src - 28 bytes); ebp is a negative destination byte index counting up to zero
- mov ebp, [esp+20+16] ; ebp = width in pixels
- mov edi, [esp+24+16] ; edi = rows remaining
- add ebp, ebp ; ebp = destination row length in bytes
- mov edx, [esp+4+16] ; edx = destination
- mov ecx, [esp+12+16] ; ecx = source
- lea edx, [edx+ebp-14]
- lea ecx, [ecx+ebp*2-28] ; source uses twice the index (4 bytes per pixel)
- neg ebp
- mov [esp+20+16], ebp ; width slot now holds -2*width; reloaded for every row
-
- movq mm5,[rb_mul_565] ; pmaddwd weights: blue * 4, red * 2000h
- movq mm6,[rb_mask5] ; red/blue top-5-bit mask
- movq mm7,[g_mask6] ; green top-6-bit mask
-
- .yloop:
- mov ebp, [esp+20+16]
- add ebp, 14 ; carry or zero is produced for widths 1..7
- jbe .odd ; fewer than eight pixels: scalar loop only
-
- ;This code uses the "pmaddwd" trick for 32->16 conversions from Intel's MMX
- ;Application Notes.
- ; The loop is software-pipelined, 8 pixels per pass; the allocate/free notes appear to track which of mm0-mm4 are live.
- movq mm0,[ecx+ebp*2] ;allocate 0 (0123)
- movq mm2,mm0 ;allocate 2 (0 23)
-
- movq mm1,[ecx+ebp*2+8] ;allocate 1 (0123)
- movq mm3,mm1 ;allocate 3 (0123)
- pand mm0,mm6 ; red/blue of pixels 0-1
- pmaddwd mm0,mm5 ; blue*4 + red*2000h: blue in bits 5-9, red in bits 16-20
- pand mm1,mm6 ; red/blue of pixels 2-3
- pmaddwd mm1,mm5
- pand mm2,mm7 ; green of pixels 0-1 (bits 10-15)
- pand mm3,mm7 ; green of pixels 2-3
- jmp .xloopstart ; enter the pipeline in the middle: nothing from a previous pass to finish yet
-
- align 16
- .xloop:
- movq mm0,[ecx+ebp*2] ;allocate 0 (01234)
- por mm4,mm2 ;free 2 (01 34)
-
- por mm3,mm1 ;free 3 (01 34)
- pslld mm4,16-5 ; pixel (bits 5-20) up to bits 16-31 ...
-
- pslld mm3,16-5
- movq mm2,mm0 ;allocate 2 (0 234)
-
- movq mm1,[ecx+ebp*2+8] ;allocate 1 (01234)
- psrad mm4,16 ; ... then arithmetic shift down so the signed pack below keeps all 16 bits
-
- psrad mm3,16
- pand mm0,mm6
-
- packssdw mm4,mm3 ;free 3 (012 4)
- movq mm3,mm1 ;allocate 3 (01234)
-
- pmaddwd mm0,mm5
- pand mm1,mm6
-
- pmaddwd mm1,mm5
- pand mm2,mm7
-
- movq [edx+ebp-8],mm4 ;free 4 (0123 )
- pand mm3,mm7
-
- .xloopstart:
- movq mm4,[ecx+ebp*2+16] ;allocate 4 (01234)
- por mm0,mm2 ;free 2 (01 34)
-
- por mm1,mm3 ;free 3 (01 4)
- pslld mm0,16-5
-
- movq mm3,[ecx+ebp*2+24] ;allocate 3 (01 34)
- pslld mm1,16-5
-
- psrad mm0,16
- movq mm2,mm4 ;allocate 2 (01234)
-
- psrad mm1,16
- pand mm4,mm6
-
- packssdw mm0,mm1 ;free 1 (0 234)
- movq mm1,mm3 ;allocate 1 (01234)
-
- movq [edx+ebp],mm0 ;free 0 ( 1234)
- pand mm3,mm6
-
- pmaddwd mm4,mm5
- add ebp,16 ; sets the carry tested by jnc below; the MMX instructions in between do not touch flags
-
- pmaddwd mm3,mm5
- pand mm2,mm7
-
- pand mm1,mm7
- jnc .xloop ; carry means the index is no longer negative: fewer than eight pixels left
- ; drain the pipeline: finish and store the second group of four pixels from the last pass
- por mm4,mm2 ;free 2 (01 34)
- por mm3,mm1 ;free 3 (01 34)
- psllq mm4,16-5 ; 64-bit shift here versus pslld in the loop: same result, since each dword only has bits 5-20 set
- psllq mm3,16-5
- psrad mm4,16
- psrad mm3,16
- packssdw mm4,mm3 ;free 3 (012 4)
- movq [edx+ebp-8],mm4 ;free 4 (0123 )
-
- .odd:
- sub ebp, 14 ; ebp = -2 * (pixels still to convert in this row)
- jz .noodd
- .oddloop: ; one pixel per iteration
- mov eax, [ecx+ebp*2+28]
- mov ebx, 00f80000h
- mov esi, eax
- and ebx, eax ; red, top 5 bits
- and eax, 000000f8h ; blue, top 5 bits
- shr eax, 3 ; to bits 0-4
- and esi, 0000fc00h ; green, top 6 bits
- shr ebx, 8 ; red to bits 11-15
- shr esi, 5 ; green to bits 5-10
- add eax, ebx
- add eax, esi
- mov [edx+ebp+14], ax
- add ebp, 2
- jnz .oddloop
- .noodd:
- add ecx, [esp+16+16] ; advance source by its per-row step
- add edx, [esp+8+16] ; advance destination by its per-row step
- dec edi
- jne .yloop
-
- emms
- epilogue
- ret
-
- global _vdasm_pixblt_XRGB8888_to_RGB888_MMX ; 32-bit XRGB8888 to packed 24-bit RGB888: drops the top byte of every pixel
- _vdasm_pixblt_XRGB8888_to_RGB888_MMX: ; stack args (each +16 after prologue): +4 dst, +8 dst row step, +12 src, +16 src row step, +20 width, +24 rows
- prologue 6
- ; unlike the 16-bit converters, esi/edi are advanced while converting, so the row steps are reduced to end-of-row adjustments first
- mov esi,[esp+12+16] ; esi = source
- mov edi,[esp+4+16] ; edi = destination
-
- mov ecx,[esp+20+16] ; ecx = width in pixels
- lea eax,[ecx+ecx*2] ; eax = 3*width = destination bytes written per row
- lea ebx,[ecx*4] ; ebx = 4*width = source bytes read per row
- sub [esp+8+16],eax ; destination row step minus bytes written (stored back into the argument slot)
- sub [esp+16+16],ebx ; source row step minus bytes read (stored back into the argument slot)
-
- pcmpeqb mm7,mm7 ; all ones
- psrld mm7,8 ; 00FFFFFFh in both dwords
- movq mm6,mm7
- psllq mm7,32 ;mm7 = high rgb mask
- psrlq mm6,32 ;mm6 = low rgb mask
-
- mov ebp,[esp+20+16] ; ebp = width
- mov edx,[esp+24+16] ; edx = rows remaining
- mov eax,[esp+16+16] ; eax = adjusted source row step
- mov ebx,[esp+ 8+16] ; ebx = adjusted destination row step
- .yloop:
- mov ecx,ebp
- shr ecx,3 ; ecx = number of 8-pixel groups (32 source bytes to 24 destination bytes each)
- jz .checkodd
- .xloop:
- movq mm0,[esi] ;mm0 = a1r1g1b1a0r0g0b0
- movq mm1,mm6
-
- movq mm2,[esi+8] ;mm2 = a3r3g3b3a2r2g2b2
- pand mm1,mm0 ;mm1 = ----------r0g0b0
-
- movq mm3,mm6
- pand mm0,mm7 ;mm0 = --r1g1b1--------
-
- movq mm4,mm2
- pand mm3,mm2 ;mm3 = ----------r2g2b2
-
- psrlq mm0,8 ;mm0 = ----r1g1b1------
- pand mm2,mm7 ;mm2 = --r3g3b3--------
-
- movq mm5,[esi+16] ;mm5 = a5r5g5b5a4r4g4b4
- psllq mm4,48 ;mm4 = g2b2------------
-
- por mm0,mm1 ;mm0 = ----r1g1b1r0g0b0
- psrlq mm3,16 ;mm3 = --------------r2
-
- por mm0,mm4 ;mm0 = g2b2r1g1b1r0g0b0
- movq mm1,mm6
-
- pand mm1,mm5 ;mm1 = ----------r4g4b4
- psrlq mm2,24 ;mm2 = --------r3g3b3--
-
- movq [edi],mm0 ; first output qword
- pand mm5,mm7 ;mm5 = --r5g5b5--------
-
- psllq mm1,32 ;mm1 = --r4g4b4--------
- movq mm4,mm5 ;mm4 = --r5g5b5--------
-
- por mm2,mm3 ;mm2 = --------r3g3b3r2
- psllq mm5,24 ;mm5 = b5--------------
-
- movq mm3,[esi+24] ;mm3 = a7r7g7b7a6r6g6b6
- por mm2,mm1 ;mm2 = --r4g4b4r3g3b3r2
-
- movq mm1,mm6
- por mm2,mm5 ;mm2 = b5r4g4b4r3g3b3r2
-
- psrlq mm4,40 ;mm4 = ------------r5g5
- pand mm1,mm3 ;mm1 = ----------r6g6b6
-
- psllq mm1,16 ;mm1 = ------r6g6b6----
- pand mm3,mm7 ;mm3 = --r7g7b7--------
-
- por mm4,mm1 ;mm4 = ------r6g6b6r5g5
- psllq mm3,8 ;mm3 = r7g7b7----------
-
- movq [edi+8],mm2 ; second output qword
- por mm4,mm3 ;mm4 = r7g7b7r6g6b6r5g5
-
- add esi,32
- sub ecx,1 ; sets the flags tested by jne below; movq and lea in between leave them intact
-
- movq [edi+16],mm4 ; third output qword
-
- lea edi,[edi+24]
- jne .xloop
-
- .checkodd:
- mov ecx,ebp
- and ecx,7 ; ecx = pixels left over after the 8-pixel groups
- jz .noodd
- movd mm0,eax ; park the source row step in mm0 while eax is used as scratch
- .oddloop: ; one pixel per iteration: copy the low three bytes
- mov eax,[esi]
- add esi,4
- mov [edi],ax
- shr eax,16
- mov [edi+2],al
- add edi,3
- sub ecx,1
- jnz .oddloop
- movd eax,mm0 ; restore the source row step
- .noodd:
- add esi,eax ; move both pointers to the start of the next row
- add edi,ebx
-
- sub edx,1
- jne .yloop
-
- emms
-
- epilogue
- ret
-
- global _vdasm_pixblt_XRGB1555_to_XRGB8888_MMX ; 16-bit XRGB1555 to 32-bit XRGB8888: each 5-bit field is widened to 8 bits by repeating its top 3 bits
- _vdasm_pixblt_XRGB1555_to_XRGB8888_MMX: ; stack args (each +16 after prologue): +4 dst, +8 dst row step, +12 src, +16 src row step, +20 width, +24 rows
- prologue 6
- ; setup: ecx/edx are biased to the row end (src - 6 bytes, dst - 12 bytes); ebp is a negative source byte index counting up to zero
- mov ebp, [esp+20+16] ; ebp = width in pixels
- mov edi, [esp+24+16] ; edi = rows remaining
- add ebp, ebp ; ebp = source row length in bytes
- mov edx, [esp+4+16] ; edx = destination
- mov ecx, [esp+12+16] ; ecx = source
- lea edx, [edx+ebp*2-12] ; destination uses twice the index (4 bytes per pixel)
- lea ecx, [ecx+ebp-6]
- neg ebp
- mov [esp+20+16], ebp ; width slot now holds -2*width; reloaded for every row
-
- movq mm5, [r_mask_555]
- movq mm6, [g_mask_555]
- movq mm7, [b_mask_555]
-
- .yloop:
- mov ebp, [esp+20+16]
- add ebp, 6 ; carry or zero is produced for widths 1..3
- jbe .odd ; fewer than four pixels: scalar loop only
-
- .xloop: ; four pixels per iteration
- movq mm0, [ecx+ebp]
- movq mm1, mm6
- movq mm2, mm7
- pand mm1, mm0 ; mm1 = green fields
- pand mm2, mm0 ; mm2 = blue fields
- pand mm0, mm5 ; mm0 = red fields
- ; build words of red:blue bytes in mm0 and 8-bit green in the low byte of each word of mm1
- paddw mm0, mm0 ; red up one bit: top 5 bits of each word's high byte
- pmulhw mm1, [x4200w] ; green to 8 bits: g5*8 + g5/4
- psllq mm2, 3 ; blue to the top 5 bits of each word's low byte
- paddw mm0, mm2
- movq mm2, mm0
- psrlw mm0, 5
- pand mm0, [x07b] ; top 3 bits of red and of blue, moved to the low 3 bits of their bytes
- paddw mm0, mm2 ; red and blue are now full 8-bit values
- movq mm2, mm0
- punpcklbw mm0, mm1 ; interleave into blue, green, red, zero bytes for pixels 0-1
- punpckhbw mm2, mm1 ; same for pixels 2-3
-
- movq [edx+ebp*2], mm0
- movq [edx+ebp*2+8], mm2
- add ebp, 8
- jnc .xloop ; carry means the index is no longer negative: fewer than four pixels left
- .odd:
- sub ebp, 6 ; ebp = -2 * (pixels still to convert in this row)
- jz .noodd
- .oddloop: ; one pixel per iteration
- movzx eax, word [ecx+ebp+6]
- mov ebx, 03e0h
- mov esi, 001fh
- and ebx, eax ; green field
- and esi, eax ; blue field
- and eax, 07c00h ; red field
- shl esi, 3 ; blue to bits 3-7
- shl ebx, 6 ; green to bits 11-15
- shl eax, 9 ; red to bits 19-23
- add ebx, esi
- add eax, ebx
- mov ebx, eax
- shr eax, 5
- and eax, 070707h ; top 3 bits of each channel as the low 3 bits of its byte
- add eax, ebx
- mov [edx+ebp*2+12], eax
- add ebp, 2
- jnz .oddloop
- .noodd:
- add ecx, [esp+16+16] ; advance source by its per-row step
- add edx, [esp+8+16] ; advance destination by its per-row step
- dec edi
- jne .yloop
-
- emms
- epilogue
- ret
-
-
- global _vdasm_pixblt_RGB565_to_XRGB8888_MMX ; 16-bit RGB565 to 32-bit XRGB8888: fields are widened to 8 bits by repeating their top bits
- _vdasm_pixblt_RGB565_to_XRGB8888_MMX: ; stack args (each +16 after prologue): +4 dst, +8 dst row step, +12 src, +16 src row step, +20 width, +24 rows
- prologue 6
- ; setup: ecx/edx are biased to the row end (src - 6 bytes, dst - 12 bytes); ebp is a negative source byte index counting up to zero
- mov ebp, [esp+20+16] ; ebp = width in pixels
- mov edi, [esp+24+16] ; edi = rows remaining
- add ebp, ebp ; ebp = source row length in bytes
- mov edx, [esp+4+16] ; edx = destination
- mov ecx, [esp+12+16] ; ecx = source
- lea edx, [edx+ebp*2-12] ; destination uses twice the index (4 bytes per pixel)
- lea ecx, [ecx+ebp-6]
- neg ebp
- mov [esp+20+16], ebp ; width slot now holds -2*width; reloaded for every row
-
- movq mm5, [r_mask_565]
- movq mm6, [g_mask_565]
- movq mm7, [b_mask_565]
-
- .yloop:
- mov ebp, [esp+20+16]
- add ebp, 6 ; carry or zero is produced for widths 1..3
- jbe .odd ; fewer than four pixels: scalar loop only
-
- .xloop: ; four pixels per iteration
- movq mm0, [ecx+ebp]
- movq mm1, mm6
- movq mm2, mm7
- pand mm1, mm0 ; mm1 = green fields
- pand mm2, mm0 ; mm2 = blue fields
- pand mm0, mm5 ; mm0 = red fields, already the top 5 bits of each word's high byte
- ; build words of red:blue bytes in mm0 and 8-bit green in the low byte of each word of mm1
- pmulhw mm1, [x2080w] ; green to 8 bits: g6*4 + g6/16
- psllq mm2, 3 ; blue to the top 5 bits of each word's low byte
- paddw mm0, mm2
- movq mm2, mm0
- psrlw mm0, 5
- pand mm0, [x07b] ; top 3 bits of red and of blue, moved to the low 3 bits of their bytes
- paddw mm0, mm2 ; red and blue are now full 8-bit values
- movq mm2, mm0
- punpcklbw mm0, mm1 ; interleave into blue, green, red, zero bytes for pixels 0-1
- punpckhbw mm2, mm1 ; same for pixels 2-3
-
- movq [edx+ebp*2], mm0
- movq [edx+ebp*2+8], mm2
- add ebp, 8
- jnc .xloop ; carry means the index is no longer negative: fewer than four pixels left
-
- .odd:
- sub ebp, 6 ; ebp = -2 * (pixels still to convert in this row)
- jz .noodd
- push edi ; edi (the row counter) is used as scratch below; [esp+N+16] offsets are not used until after the pop
- .oddloop: ; one pixel per iteration
- movzx eax, word [ecx+ebp+6]
- mov ebx, 0000f800h
- and ebx, eax ; red field
- mov esi, eax
- shl ebx, 8 ; red to bits 19-23
- mov edi, eax
- shl eax, 3
- and esi, 000007e0h ; green field
- and eax, 000000f8h ; blue in bits 3-7
- add ebx, eax ; ebx = red and blue, 5 significant bits each
- shl esi, 5 ; green to bits 10-15
- mov eax, ebx
- shr ebx, 5
- and edi, 00000600h ; top two green bits
- shr edi, 1 ; moved to bits 8-9, the two low bits of the green byte
- and ebx, 00070007h ; top 3 bits of red and of blue as the low 3 bits of their bytes
- add esi, edi ; full 8-bit green
- add eax, ebx
- add eax, esi
- mov [edx+ebp*2+12], eax
- add ebp, 2
- jnz .oddloop
- pop edi ; restore the row counter
- .noodd:
- add ecx, [esp+16+16] ; advance source by its per-row step
- add edx, [esp+8+16] ; advance destination by its per-row step
- dec edi
- jne .yloop
-
- emms
- epilogue
- ret
-
-
- global _vdasm_pixblt_RGB888_to_XRGB8888_MMX ; packed 24-bit RGB888 to 32-bit XRGB8888
- _vdasm_pixblt_RGB888_to_XRGB8888_MMX: ; stack args (each +16 after prologue): +4 dst, +8 dst row step, +12 src, +16 src row step, +20 width, +24 rows
- prologue 6
- ; esi/edi are advanced while converting, so the row steps are reduced to end-of-row adjustments first
- mov esi,[esp+12+16] ; esi = source
- mov edi,[esp+4+16] ; edi = destination
-
- mov ecx,[esp+20+16] ; ecx = width in pixels
- lea eax,[ecx+ecx*2] ; eax = 3*width = source bytes read per row
- lea ebx,[ecx*4] ; ebx = 4*width = destination bytes written per row
- sub [esp+8+16],ebx ; destination row step minus bytes written (stored back into the argument slot)
- sub [esp+16+16],eax ; source row step minus bytes read (stored back into the argument slot)
-
- mov edx,[esp+24+16]
- mov ebx,[esp+20+16]
- mov ecx,[esp+16+16]
- mov eax,[esp+ 8+16]
- ;eax destination modulo
- ;ebx horizontal count backup
- ;ecx source modulo
- ;edx vertical count
- ;esi source
- ;edi destination
- ;ebp horizontal count
- ; in the MMX loop the top byte of each output pixel receives a copy of a neighbouring blue byte (see the register pictures)
- .yloop:
- mov ebp,ebx
- shr ebp,3 ; ebp = number of 8-pixel groups (24 source bytes to 32 destination bytes each)
- jz .checkodd
- .xloop:
- movq mm0,[esi] ;mm0: g2b2r1g1b1r0g0b0
- movq mm1,mm0 ;
-
- psrlq mm1,24 ;mm1: ------g2b2r1g1b1
- movq mm2,mm0 ;
-
- movq mm3,[esi+8] ;mm3: b5r4g4b4r3g3b3r2
- punpckldq mm0,mm1 ;mm0: b2r1g1b1b1r0g0b0 [qword 0 ready]
-
- movq mm4,mm3 ;mm4: b5r4g4b4r3g3b3r2
- psllq mm3,48 ;mm3: b3r2------------
-
- movq mm5,mm4 ;mm5: b5r4g4b4r3g3b3r2
- psrlq mm2,16 ;mm2: ----g2b2 in the high dword (low dword is not used)
-
- movq mm1,[esi+16] ;mm1: r7g7b7r6g6b6r5g5
- por mm2,mm3 ;mm2: b3r2g2b2--------
-
- movq [edi],mm0 ;
- psllq mm4,24 ;mm4: b4r3g3b3r2------
-
- movq mm3,mm5 ;mm3: b5r4g4b4r3g3b3r2
- psrlq mm5,24 ;mm5: ------b5r4g4b4r3
-
- movq mm0,mm1 ;mm0: r7g7b7r6g6b6r5g5
- psllq mm1,40 ;mm1: b6r5g5----------
-
- punpckhdq mm2,mm4 ;mm2: b4r3g3b3b3r2g2b2 [qword 1 ready]
- por mm1,mm5 ;mm1: b6r5g5b5r4g4b4r3
-
- movq mm4,mm0 ;mm4: r7g7b7r6g6b6r5g5
- punpckhdq mm3,mm1 ;mm3: b6r5g5b5b5r4g4b4 [qword 2 ready]
-
- movq [edi+8],mm2
- psrlq mm0,16 ;mm0: ----r7g7b7r6g6b6
-
- movq [edi+16],mm3
- psrlq mm4,40 ;mm4: ----------r7g7b7
-
- punpckldq mm0,mm4 ;mm0: --r7g7b7b7r6g6b6 [qword 3 ready]
- add esi,24
-
- movq [edi+24],mm0
-
- add edi,32
- sub ebp,1
- jne .xloop
-
- .checkodd:
- mov ebp,ebx
- and ebp,7 ; ebp = pixels left over after the 8-pixel groups
- jz .noodd
- movd mm7,eax ; park the destination modulo in mm7 while eax is used as scratch
- .oddloop: ; one pixel per iteration: copies three bytes, the fourth destination byte is not written
- mov ax,[esi]
- mov [edi],ax
- mov al,[esi+2]
- mov [edi+2],al
- add esi,3
- add edi,4
- sub ebp,1
- jne .oddloop
-
- movd eax,mm7 ; restore the destination modulo
- .noodd:
- add esi,ecx ; move both pointers to the start of the next row
- add edi,eax
-
- sub edx,1
- jne .yloop
- emms
- epilogue
- ret
-
- end
-