home *** CD-ROM | disk | FTP | other *** search
- #include "uberblit_swizzle_x86.h"
-
- #ifdef VD_COMPILER_MSVC
- #pragma warning(disable: 4799) // warning C4799: function 'vdasm_extract_8in16_even_MMX' has no EMMS instruction
- #endif
-
- void __declspec(naked) __fastcall vdasm_extract_8in16_even_MMX(void *dst, const void *src, uint32 count) {  // Writes count bytes to dst, taking the byte at even offsets (0, 2, 4, ...) of src.
- __asm {  // naked __fastcall entry: the code reads dst from ecx, src from edx and count from the stack
- mov eax, [esp+4]             // eax = count, the number of output bytes still to produce
- pcmpeqb mm2, mm2             // mm2 = all ones
- psrlw mm2, 8                 // mm2 = 0x00FF in every 16-bit word: mask that keeps the low byte of each word
- sub eax, 8                   // bias the counter by one full batch of 8 outputs
- jc xtra                      // borrow means count was below 8: skip the MMX loop
- xloop:                       // MMX loop: 16 source bytes in, 8 destination bytes out per pass
- movq mm0, [edx]
- movq mm1, [edx+8]
- pand mm0, mm2                // clear the odd-offset (high) byte of each word
- pand mm1, mm2
- packuswb mm0, mm1            // narrow the 8 words (each now 0..255) to 8 bytes
- add edx, 16
- movq [ecx], mm0
- add ecx, 8
- sub eax, 8
- jns xloop                    // repeat while another full batch of 8 is pending
- xtra:
- add eax, 8                   // remove the bias: eax = outputs left for the scalar tail
- jz fin
- push ebx                     // bl is used as the byte scratch register; ebx is restored below
- xtraloop:                    // scalar tail: one output byte per pass
- mov bl, [edx]                // byte at the even offset of the current 2-byte group
- add edx, 2
- mov [ecx], bl
- add ecx, 1
- sub eax, 1
- jnz xtraloop
-
- pop ebx
- fin:
- ret 4                        // pops the 4-byte stack argument. NOTE(review): no EMMS is issued here; presumably the caller clears MMX state - confirm
- }
- }
-
- void __declspec(naked) __fastcall vdasm_extract_8in16_odd_MMX(void *dst, const void *src, uint32 count) {  // Writes count bytes to dst, taking the byte at odd offsets (1, 3, 5, ...) of src.
- __asm {  // naked __fastcall entry: the code reads dst from ecx, src from edx and count from the stack
- mov eax, [esp+4]             // eax = count, the number of output bytes still to produce
- sub eax, 8                   // bias the counter by one full batch of 8 outputs
- jc xtra                      // borrow means count was below 8: skip the MMX loop
- xloop:                       // MMX loop: 16 source bytes in, 8 destination bytes out per pass
- movq mm0, [edx]
- movq mm1, [edx+8]
- psrlw mm0, 8                 // move the high (odd-offset) byte of each word down; the upper byte becomes zero
- psrlw mm1, 8
- add edx, 16
- packuswb mm0, mm1            // narrow the 8 words (each now 0..255) to 8 bytes
- movq [ecx], mm0
- add ecx, 8
- sub eax, 8
- jns xloop                    // repeat while another full batch of 8 is pending
- xtra:
- add eax, 8                   // remove the bias: eax = outputs left for the scalar tail
- jz fin
- push ebx                     // bl is used as the byte scratch register; ebx is restored below
- xtraloop:                    // scalar tail: one output byte per pass
- mov bl, [edx+1]              // byte at the odd offset of the current 2-byte group
- add edx, 2
- mov [ecx], bl
- add ecx, 1
- sub eax, 1
- jnz xtraloop
-
- pop ebx
- fin:
- ret 4                        // pops the 4-byte stack argument. NOTE(review): no EMMS is issued here; presumably the caller clears MMX state - confirm
- }
- }
-
- void __declspec(naked) __fastcall vdasm_extract_8in32_MMX(void *dst, const void *src, uint32 count, int byteshift) {  // Writes count bytes to dst, taking one byte (selected by byteshift) out of every 4-byte group of src.
- __asm {  // naked __fastcall entry: dst in ecx, src in edx; count at [esp+4] and byteshift at [esp+8]
- movd mm4, [esp+8]            // mm4 = byteshift (upper half of the register is zeroed by movd)
- pcmpeqb mm5, mm5             // mm5 = all ones
- pslld mm4, 3                 // mm4 = byteshift * 8 = right-shift amount in bits
- mov eax, [esp+4]             // eax = count, the number of output bytes still to produce
- psrld mm5, 24                // mm5 = 0x000000FF in every dword: mask that keeps the lowest byte
- sub eax, 8                   // bias the counter by one full batch of 8 outputs
- jc xtra                      // borrow means count was below 8: skip the MMX loop
- xloop:                       // MMX loop: 32 source bytes (8 dwords) in, 8 destination bytes out per pass
- movq mm0, [edx]
- movq mm1, [edx+8]
- psrld mm0, mm4               // bring the selected byte of each dword down to bits 0..7
- movq mm2, [edx+16]
- psrld mm1, mm4
- pand mm0, mm5                // drop everything except the selected byte
- movq mm3, [edx+24]
- psrld mm2, mm4
- pand mm1, mm5
- packssdw mm0, mm1            // dwords to words; values are 0..255 so signed saturation never triggers
- psrld mm3, mm4
- pand mm2, mm5
- pand mm3, mm5
- add edx, 32
- packssdw mm2, mm3
- packuswb mm0, mm2            // words to bytes: 8 results in source order
- movq [ecx], mm0
- add ecx, 8
- sub eax, 8
- jns xloop                    // repeat while another full batch of 8 is pending
- xtra:
- add eax, 8                   // remove the bias: eax = outputs left for the scalar tail
- jz fin
- add edx, dword ptr [esp+8]   // src += byteshift so [edx] addresses the selected byte; done before push ebx while the esp offset is still valid. NOTE(review): matches the MMX path only for byteshift 0..3 - confirm callers
- push ebx                     // bl is used as the byte scratch register; ebx is restored below
- xtraloop:                    // scalar tail: one output byte per pass, source stride of 4 bytes
- mov bl, [edx]
- add edx, 4
- mov [ecx], bl
- add ecx, 1
- sub eax, 1
- jnz xtraloop
-
- pop ebx
- fin:
- ret 8                        // pops the two 4-byte stack arguments. NOTE(review): no EMMS is issued here; presumably the caller clears MMX state - confirm
- }
- }
-
- void __declspec(naked) __fastcall vdasm_swap_8in16_MMX(void *dst, const void *src, uint32 count) {  // Copies count bytes from src to dst with the two bytes of every 2-byte pair exchanged; a trailing unpaired byte is copied as is.
- __asm {  // naked __fastcall entry: the code reads dst from ecx, src from edx and count from the stack
- mov eax, [esp+4]             // eax = count in bytes
- sub eax, 8                   // bias the counter by one full batch of 8 bytes
- js xtra                      // negative result: fewer than 8 bytes, skip the MMX loop
- xloop:                       // MMX loop: 8 bytes (4 pairs) per pass
- movq mm0, [edx]
- add edx, 8
- movq mm1, mm0
- psllw mm0, 8                 // low byte of each word moved to the high position
- psrlw mm1, 8                 // high byte of each word moved to the low position
- paddb mm0, mm1               // the two halves occupy disjoint bytes, so the add simply merges them
- movq [ecx], mm0
- add ecx, 8
- sub eax, 8
- jns xloop                    // repeat while another full batch of 8 is pending
- xtra:
- add eax, 6                   // eax held (bytes left - 8); it now holds (bytes left - 2)
- js nopairs                   // fewer than 2 bytes left: no whole pair to swap
- push ebx                     // bl and bh hold the pair being swapped; ebx is restored below
- pairloop:                    // scalar loop: one 2-byte pair per pass
- mov bl, [edx]
- mov bh, [edx+1]
- add edx, 2
- mov [ecx], bh                // stored in exchanged order
- mov [ecx+1], bl
- add ecx, 2
- sub eax, 2
- jns pairloop                 // repeat while another whole pair is pending
- pop ebx
- nopairs:
- add eax, 2                   // remove the remaining bias: eax = 0, or 1 when a single byte is left over
- jz noodd
- mov al, [edx]                // leftover unpaired byte is copied without modification
- mov [ecx], al
- noodd:
- ret 4                        // pops the 4-byte stack argument. NOTE(review): no EMMS is issued here; presumably the caller clears MMX state - confirm
- }
- }
-
- void __declspec(naked) __fastcall vdasm_interleave_BGRG_MMX(void *dst, const void *srcR, const void *srcG, const void *srcB, uint32 count) {  // Emits count 4-byte groups laid out B, G, R, G; each group consumes 1 byte of srcB, 1 byte of srcR and 2 bytes of srcG.
- __asm {  // naked __fastcall entry: dst in ecx, srcR in edx; srcG, srcB and count are on the stack
- push edi
- push esi
- push ebx
- mov esi, [esp+12+12]         // esi = count (third stack argument); the second +12 skips the three registers pushed above
- mov edi, [esp+8+12]          // edi = srcB
- mov ebx, [esp+4+12]          // ebx = srcG
- sub esi, 4                   // bias the counter by one full batch of 4 groups
- jc xtra                      // borrow means count was below 4: skip the MMX loop
- ; ecx = dst
- ; edx = srcR
- ; ebx = srcG
- ; edi = srcB
- xloop:                       // MMX loop: 4 groups (16 output bytes) per pass
- movd mm0, [edi]              // 4 bytes of B
- movd mm1, [edx]              // 4 bytes of R
- punpcklbw mm0, mm1           // mm0 = B0 R0 B1 R1 B2 R2 B3 R3
- movq mm1, [ebx]              // 8 bytes of G
- movq mm2, mm0
- punpcklbw mm0, mm1           // mm0 = B0 G0 R0 G1 B1 G2 R1 G3
- add edx, 4
- punpckhbw mm2, mm1           // mm2 = B2 G4 R2 G5 B3 G6 R3 G7
- add edi, 4
- movq [ecx], mm0
- add ebx, 8
- movq [ecx+8], mm2
- add ecx, 16
- sub esi, 4
- jns xloop                    // repeat while another full batch of 4 groups is pending
- xtra:
- add esi, 4                   // remove the bias: esi = groups left for the scalar tail
- jz fin
- xtraloop:                    // scalar tail: one B, G, R, G group per pass
- mov al, [edi]
- mov [ecx], al
- mov al, [ebx]
- mov [ecx+1], al
- mov al, [edx]
- mov [ecx+2], al
- mov al, [ebx+1]
- mov [ecx+3], al
- add ebx, 2
- add edx, 1
- add edi, 1
- add ecx, 4
- sub esi, 1
- jnz xtraloop
- fin:
- pop ebx
- pop esi
- pop edi
- ret 12                       // pops the three 4-byte stack arguments. NOTE(review): no EMMS is issued here; presumably the caller clears MMX state - confirm
- }
- }
-
- void __declspec(naked) __fastcall vdasm_interleave_GBGR_MMX(void *dst, const void *srcR, const void *srcG, const void *srcB, uint32 count) {  // Emits count 4-byte groups laid out G, B, G, R; each group consumes 2 bytes of srcG, 1 byte of srcB and 1 byte of srcR.
- __asm {  // naked __fastcall entry: dst in ecx, srcR in edx; srcG, srcB and count are on the stack
- push edi
- push esi
- push ebx
- mov esi, [esp+12+12]         // esi = count (third stack argument); the second +12 skips the three registers pushed above
- mov edi, [esp+8+12]          // edi = srcB
- mov ebx, [esp+4+12]          // ebx = srcG
- sub esi, 4                   // bias the counter by one full batch of 4 groups
- jc xtra                      // borrow means count was below 4: skip the MMX loop
- ; ecx = dst
- ; edx = srcR
- ; ebx = srcG
- ; edi = srcB
- xloop:                       // MMX loop: 4 groups (16 output bytes) per pass
- movd mm0, [edi]              // 4 bytes of B
- movd mm1, [edx]              // 4 bytes of R
- punpcklbw mm0, mm1           // mm0 = B0 R0 B1 R1 B2 R2 B3 R3
- movq mm2, [ebx]              // 8 bytes of G
- movq mm1, mm2
- punpcklbw mm2, mm0           // mm2 = G0 B0 G1 R0 G2 B1 G3 R1
- add edx, 4
- punpckhbw mm1, mm0           // mm1 = G4 B2 G5 R2 G6 B3 G7 R3
- add edi, 4
- movq [ecx], mm2
- add ebx, 8
- movq [ecx+8], mm1
- add ecx, 16
- sub esi, 4
- jns xloop                    // repeat while another full batch of 4 groups is pending
- xtra:
- add esi, 4                   // remove the bias: esi = groups left for the scalar tail
- jz fin
- xtraloop:                    // scalar tail: one G, B, G, R group per pass
- mov al, [ebx]
- mov [ecx], al
- mov al, [edi]
- mov [ecx+1], al
- mov al, [ebx+1]
- mov [ecx+2], al
- mov al, [edx]
- mov [ecx+3], al
- add ebx, 2
- add edx, 1
- add edi, 1
- add ecx, 4
- sub esi, 1
- jnz xtraloop
- fin:
- pop ebx
- pop esi
- pop edi
- ret 12                       // pops the three 4-byte stack arguments. NOTE(review): no EMMS is issued here; presumably the caller clears MMX state - confirm
- }
- }
-
- void __declspec(naked) __fastcall vdasm_interleave_BR_MMX(void *dst, const void *srcB, const void *srcR, uint32 count) {  // Emits count 2-byte pairs laid out B, R, taking one byte from srcB and one byte from srcR per pair.
- __asm {  // naked __fastcall entry: dst in ecx, srcB in edx; srcR and count are on the stack
- push edi                     // saved and restored, although no instruction below uses edi
- push esi
- push ebx
- mov esi, [esp+8+12]          // esi = count (second stack argument); +12 skips the three registers pushed above
- mov ebx, [esp+4+12]          // ebx = srcR (first stack argument)
- sub esi, 8                   // bias the counter by one full batch of 8 pairs
- jc xtra                      // borrow means count was below 8: skip the MMX loop
- ; ecx = dst
- ; edx = srcB
- ; ebx = srcG                 // NOTE(review): this label looks stale; ebx is loaded from the srcR argument in this routine
- xloop:                       // MMX loop: 8 pairs (16 output bytes) per pass
- movq mm0, [edx]              // 8 bytes of B
- movq mm1, [ebx]              // 8 bytes of R
- movq mm2, mm0
- punpcklbw mm0, mm1           // mm0 = B0 R0 B1 R1 B2 R2 B3 R3
- punpckhbw mm2, mm1           // mm2 = B4 R4 B5 R5 B6 R6 B7 R7
- add edx, 8
- movq [ecx], mm0
- add ebx, 8
- movq [ecx+8], mm2
- add ecx, 16
- sub esi, 8
- jns xloop                    // repeat while another full batch of 8 pairs is pending
- xtra:
- add esi, 8                   // remove the bias: esi = pairs left for the scalar tail
- jz fin
- xtraloop:                    // scalar tail: one B, R pair per pass
- mov al, [edx]
- mov [ecx], al
- mov al, [ebx]
- mov [ecx+1], al
- add ebx, 1
- add edx, 1
- add ecx, 2
- sub esi, 1
- jnz xtraloop
- fin:
- pop ebx
- pop esi
- pop edi
- ret 8                        // pops the two 4-byte stack arguments. NOTE(review): no EMMS is issued here; presumably the caller clears MMX state - confirm
- }
- }
-
- ///////////////////////////////////////////////////////////////////////////////
-
- // Produces output row y: obtains the row returned by mpSrc->GetRow(y, mSrcIndex)
- // and passes it, with mWidth as the output count, to vdasm_extract_8in16_even_MMX.
- void VDPixmapGen_8In16_Even_MMX::Compute(void *dst, sint32 y) {
-     const uint8 *const rowIn = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
-     vdasm_extract_8in16_even_MMX(dst, rowIn, mWidth);
- }
-
- // Produces output row y: obtains the row returned by mpSrc->GetRow(y, mSrcIndex)
- // and passes it, with mWidth as the output count, to vdasm_extract_8in16_odd_MMX.
- void VDPixmapGen_8In16_Odd_MMX::Compute(void *dst, sint32 y) {
-     const uint8 *const rowIn = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
-     vdasm_extract_8in16_odd_MMX(dst, rowIn, mWidth);
- }
-
- // Produces output row y: obtains the row returned by mpSrc->GetRow(y, mSrcIndex)
- // and passes it to vdasm_extract_8in32_MMX with mWidth as the output count and
- // mOffset as the byte selector.
- void VDPixmapGen_8In32_MMX::Compute(void *dst, sint32 y) {
-     const uint8 *const rowIn = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
-     vdasm_extract_8in32_MMX(dst, rowIn, mWidth, mOffset);
- }
-
- // Produces output row y: obtains the row returned by mpSrc->GetRow(y, mSrcIndex)
- // and passes it, with mRowLength as the count, to vdasm_swap_8in16_MMX.
- void VDPixmapGen_Swap8In16_MMX::Compute(void *dst, sint32 y) {
-     const uint8 *const rowIn = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
-     vdasm_swap_8in16_MMX(dst, rowIn, mRowLength);
- }
-
- // Produces output row y: fetches row y from the Cb source and then from the Cr
- // source, and interleaves them into dst0 via vdasm_interleave_BR_MMX, with mWidth
- // as the pair count.
- void VDPixmapGen_B8x2_To_B8R8_MMX::Compute(void *dst0, sint32 y) {
-     const uint8 *VDRESTRICT cbRow = (const uint8 *VDRESTRICT)mpSrcCb->GetRow(y, mSrcIndexCb);
-     const uint8 *VDRESTRICT crRow = (const uint8 *VDRESTRICT)mpSrcCr->GetRow(y, mSrcIndexCr);
-     vdasm_interleave_BR_MMX((uint8 *VDRESTRICT)dst0, cbRow, crRow, mWidth);
- }
-
- // Produces output row y as 4-byte groups ordered Y, Cb, Y, Cr.
- // All complete two-pixel groups (mWidth >> 1 of them) are written by
- // vdasm_interleave_GBGR_MMX. When mWidth is odd, one extra group is written here
- // for the last pixel, with the second Y slot forced to zero.
- void VDPixmapGen_B8x3_To_G8B8_G8R8_MMX::Compute(void *VDRESTRICT dst0, sint32 y) {
-     uint8 *VDRESTRICT out = (uint8 *VDRESTRICT)dst0;
-     const uint8 *VDRESTRICT yRow = (const uint8 *VDRESTRICT)mpSrcY->GetRow(y, mSrcIndexY);
-     const uint8 *VDRESTRICT cbRow = (const uint8 *VDRESTRICT)mpSrcCb->GetRow(y, mSrcIndexCb);
-     const uint8 *VDRESTRICT crRow = (const uint8 *VDRESTRICT)mpSrcCr->GetRow(y, mSrcIndexCr);
-
-     const int pairCount = mWidth >> 1;
-     vdasm_interleave_GBGR_MMX(out, crRow, yRow, cbRow, pairCount);
-
-     // An even width is fully covered by the call above.
-     if (!(mWidth & 1))
-         return;
-
-     // Odd width: the final pixel sits right after the pairCount complete groups.
-     uint8 *const tail = out + pairCount * 4;
-     tail[0] = yRow[pairCount * 2];
-     tail[1] = cbRow[pairCount];
-     tail[2] = 0; // must be zero for QuickTime compatibility
-     tail[3] = crRow[pairCount];
- }
-
- // Produces output row y as 4-byte groups ordered Cb, Y, Cr, Y.
- // All complete two-pixel groups (mWidth >> 1 of them) are written by
- // vdasm_interleave_BGRG_MMX. When mWidth is odd, one extra group is written here
- // for the last pixel, with the second Y slot forced to zero.
- void VDPixmapGen_B8x3_To_B8G8_R8G8_MMX::Compute(void *VDRESTRICT dst0, sint32 y) {
-     uint8 *VDRESTRICT out = (uint8 *VDRESTRICT)dst0;
-     const uint8 *VDRESTRICT yRow = (const uint8 * VDRESTRICT)mpSrcY->GetRow(y, mSrcIndexY);
-     const uint8 *VDRESTRICT cbRow = (const uint8 * VDRESTRICT)mpSrcCb->GetRow(y, mSrcIndexCb);
-     const uint8 *VDRESTRICT crRow = (const uint8 * VDRESTRICT)mpSrcCr->GetRow(y, mSrcIndexCr);
-
-     const int pairCount = mWidth >> 1;
-     vdasm_interleave_BGRG_MMX(out, crRow, yRow, cbRow, pairCount);
-
-     // An even width is fully covered by the call above.
-     if (!(mWidth & 1))
-         return;
-
-     // Odd width: the final pixel sits right after the pairCount complete groups.
-     uint8 *const tail = out + pairCount * 4;
-     tail[0] = cbRow[pairCount];
-     tail[1] = yRow[pairCount * 2];
-     tail[2] = crRow[pairCount];
-     tail[3] = 0; // must be zero for QuickTime compatibility
- }
-