home *** CD-ROM | disk | FTP | other *** search
/ Computer Shopper 275 / DPCS0111DVD.ISO / Toolkit / Audio-Visual / VirtualDub / Source / VirtualDub-1.9.10-src.7z / src / Meia / source / convert_mmx.cpp < prev    next >
Encoding:
C/C++ Source or Header  |  2009-09-14  |  8.8 KB  |  301 lines

  1. #include <vd2/Meia/MPEGConvert.h>
  2. #include "tables.h"
  3.  
  4. ///////////////////////////////////////////////////////////////////////////
  5.  
  6. using namespace nsVDMPEGTables;
  7.  
  // MMX row converters with C linkage. They are only declared here; the
  // definitions live outside this file (presumably in an assembly module).
  // The DecodeRGB15/24/32 wrappers in this file call them as
  //   (dst row 1, dst row 2, luma row 1, luma row 2, Cb row, Cr row, mbw*8)
  // i.e. each call is handed two destination rows, two luma rows and one row
  // from each chroma plane.
  8. extern "C" void asm_YUVtoRGB16_row_MMX(
  9.         void *ARGB1_pointer,
  10.         void *ARGB2_pointer,
  11.         const void *Y1_pointer,
  12.         const void *Y2_pointer,
  13.         const void *U_pointer,
  14.         const void *V_pointer,
  15.         long width
  16.         );
  17.  
  // Same parameter layout as above; used by DecodeRGB24.
  18. extern "C" void asm_YUVtoRGB24_row_MMX(
  19.         void *ARGB1_pointer,
  20.         void *ARGB2_pointer,
  21.         const void *Y1_pointer,
  22.         const void *Y2_pointer,
  23.         const void *U_pointer,
  24.         const void *V_pointer,
  25.         long width
  26.         );
  27.  
  // Same parameter layout as above; used by DecodeRGB32.
  28. extern "C" void asm_YUVtoRGB32_row_MMX(
  29.         void *ARGB1_pointer,
  30.         void *ARGB2_pointer,
  31.         const void *Y1_pointer,
  32.         const void *Y2_pointer,
  33.         const void *U_pointer,
  34.         const void *V_pointer,
  35.         long width
  36.         );
  37.  
  38. namespace nsVDMPEGConvertMMX {
  39.  
  40.     void __declspec(naked) DecodeUYVY(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height) {
              // Interleaves planar Y/Cb/Cr into packed bytes in the memory order
              // U0 Y0 V0 Y1 U1 Y2 V1 Y3 ... (UYVY) using MMX unpack instructions.
              //
              // Naked function: there is no compiler prologue, so arguments are read
              // straight off the stack. After the four pushes below every argument
              // sits 16 bytes further away, hence the "+16" in each operand:
              //   _dst +4, dpitch +8, srcY1 +12, ypitch +16, srcCr +20,
              //   srcCb +24, cpitch +28, mbw +32, height +36.
              //
              // Per output row the loop reads mbw*16 luma bytes and mbw*8 bytes from
              // each chroma plane, and writes mbw*32 bytes. The chroma pointers move
              // by cpitch only after every second row, so two consecutive luma rows
              // share one chroma row.
              //
              // Both loops test their counter after the body (add/jne, dec/jne), so
              // mbw and height must be at least 1. The mbw argument slot on the
              // stack is overwritten with -mbw*8.
  41.         __asm {
  42.             push        ebp
  43.             push        edi
  44.             push        esi
  45.             push        ebx
  46.  
  47.             mov            ebx,[esp+36+16]        ;ebx = height
  48.             mov            esi,[esp+32+16]        ;esi = mbw
  49.             xor            esi,-1                ;esi = -mbw-1
  50.             mov            eax,[esp+12+16]        ;eax = ysrc
  51.             mov            ecx,[esp+24+16]        ;ecx = cbsrc
  52.             lea            esi,[esi*8+8]        ;esi = -mbw*8
  53.             mov            edx,[esp+20+16]        ;edx = crsrc
  54.             sub            ecx,esi                ;ecx = cbsrc + mbw*8
  55.             sub            edx,esi                ;edx = crsrc + mbw*8
                  // Park -mbw*8 in the mbw argument slot; it is reloaded into ebp as
                  // the horizontal counter at the top of every row.
  56.             mov            [esp+32+16],esi
  57.             add            esi,esi
                  // NOTE: the trailing comment on the next line is stale. The
                  // instruction actually leaves eax = ysrc + mbw*16.
  58.             sub            eax,esi                ;edx = crsrc + mbw*16
  59.             add            esi,esi
  60.             mov            edi,[esp+ 4+16]        ;edi = dst
                  // NOTE: the trailing comment on the next line is stale. The
                  // instruction actually leaves edi = dst + mbw*32.
  61.             sub            edi,esi                ;edx = crsrc + mbw*32
  62.             mov            esi,[esp+28+16]        ;esi = cpitch
  63. yloop:
  64.             mov            ebp,[esp+32+16]        ;ebp = -mbw*8
                  // All four base pointers were biased to the END of their row above,
                  // so the negative index in ebp walks each row forward until it
                  // reaches zero. One pass handles 4 chroma pairs and 8 luma samples.
  65. xloop:
  66.             movd        mm0,[ecx+ebp]        ;mm0 =  0 |  0 |  0 |  0 | U3 | U2 | U1 | U0
  67.             punpcklbw    mm0,[edx+ebp]        ;mm0 = V3 | U3 | V2 | U2 | V1 | U1 | V0 | U0
  68.             movq        mm2,[eax+ebp*2]        ;mm2 = Y7 | Y6 | Y5 | Y4 | Y3 | Y2 | Y1 | Y0
  69.             movq        mm1,mm0
                  // NOTE: the lowest byte in the two trailing comments below is
                  // mislabelled. The actual results are
                  //   mm0 = Y3 | V1 | Y2 | U1 | Y1 | V0 | Y0 | U0
                  //   mm1 = Y7 | V3 | Y6 | U3 | Y5 | V2 | Y4 | U2
  70.             punpcklbw    mm0,mm2                ;mm0 = Y3 | V1 | Y2 | U1 | Y1 | V0 | Y0 | V0
  71.             punpckhbw    mm1,mm2                ;mm1 = Y7 | V3 | Y6 | U3 | Y5 | V2 | Y4 | V2
  72.             movq        [edi+ebp*4],mm0
  73.             movq        [edi+ebp*4+8],mm1
  74.             add            ebp,4
  75.             jne            xloop
  76.  
  77.             add            eax,[esp+16+16]        ;ysrc += ypitch
  78.             add            edi,[esp+ 8+16]        ;dst += dpitch
                  // esi starts out as cpitch, so the xor makes it 0 after the first
                  // row, cpitch after the second, 0 after the third, and so on.
  79.             xor            esi,[esp+28+16]        ;only add chroma bump every other line
  80.             add            ecx,esi                ;usrc += (y&1) ? cpitch : 0;
  81.             add            edx,esi                ;vsrc += (y&1) ? cpitch : 0;
  82.  
  83.             dec            ebx
  84.             jne            yloop
  85.             
                  // Clear MMX state before returning, then restore the saved
                  // registers in reverse push order.
  86.             emms
  87.             pop            ebx
  88.             pop            esi
  89.             pop            edi
  90.             pop            ebp
  91.  
  92.             ret
  93.         };
  94.     }
  95.  
  96.     void __declspec(naked) DecodeYUYV(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height) {
              // Interleaves planar Y/Cb/Cr into packed bytes in the memory order
              // Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... (YUYV) using MMX unpack instructions.
              //
              // Naked function: there is no compiler prologue, so arguments are read
              // straight off the stack. After the four pushes below every argument
              // sits 16 bytes further away, hence the "+16" in each operand:
              //   _dst +4, dpitch +8, srcY1 +12, ypitch +16, srcCr +20,
              //   srcCb +24, cpitch +28, mbw +32, height +36.
              //
              // Per output row the loop reads mbw*16 luma bytes and mbw*8 bytes from
              // each chroma plane, and writes mbw*32 bytes. The chroma pointers move
              // by cpitch only after every second row, so two consecutive luma rows
              // share one chroma row.
              //
              // Both loops test their counter after the body (add/jne, dec/jne), so
              // mbw and height must be at least 1. The mbw argument slot on the
              // stack is overwritten with -mbw*8.
  97.         __asm {
  98.             push        ebp
  99.             push        edi
  100.             push        esi
  101.             push        ebx
  102.  
  103.             mov            ebx,[esp+36+16]        ;ebx = height
  104.             mov            esi,[esp+32+16]        ;esi = mbw
  105.             xor            esi,-1                ;esi = -mbw-1
  106.             mov            eax,[esp+12+16]        ;eax = ysrc
  107.             mov            ecx,[esp+24+16]        ;ecx = cbsrc
  108.             lea            esi,[esi*8+8]        ;esi = -mbw*8
  109.             mov            edx,[esp+20+16]        ;edx = crsrc
  110.             sub            ecx,esi                ;ecx = cbsrc + mbw*8
  111.             sub            edx,esi                ;edx = crsrc + mbw*8
                  // Park -mbw*8 in the mbw argument slot; it is reloaded into ebp as
                  // the horizontal counter at the top of every row.
  112.             mov            [esp+32+16],esi
  113.             add            esi,esi
                  // NOTE: the trailing comment on the next line is stale. The
                  // instruction actually leaves eax = ysrc + mbw*16.
  114.             sub            eax,esi                ;edx = crsrc + mbw*16
  115.             add            esi,esi
  116.             mov            edi,[esp+ 4+16]        ;edi = dst
                  // NOTE: the trailing comment on the next line is stale. The
                  // instruction actually leaves edi = dst + mbw*32.
  117.             sub            edi,esi                ;edx = crsrc + mbw*32
  118.             mov            esi,[esp+28+16]        ;esi = cpitch
  119. yloop:
  120.             mov            ebp,[esp+32+16]        ;ebp = -mbw*8
                  // All four base pointers were biased to the END of their row above,
                  // so the negative index in ebp walks each row forward until it
                  // reaches zero. One pass handles 4 chroma pairs and 8 luma samples.
  121. xloop:
  122.             movq        mm1,[eax+ebp*2]        ;mm1 = Y7 | Y6 | Y5 | Y4 | Y3 | Y2 | Y1 | Y0
  123.             movd        mm0,[ecx+ebp]        ;mm0 =  0 |  0 |  0 |  0 | U3 | U2 | U1 | U0
  124.             movq        mm2,mm1
  125.             punpcklbw    mm0,[edx+ebp]        ;mm0 = V3 | U3 | V2 | U2 | V1 | U1 | V0 | U0
                  // Luma is the unpack destination here, so each Y byte lands in the
                  // lower address of its pair and the chroma byte follows it.
  126.             punpcklbw    mm1,mm0                ;mm1 = V1 | Y3 | U1 | Y2 | V0 | Y1 | U0 | Y0
  127.             punpckhbw    mm2,mm0                ;mm2 = V3 | Y7 | U3 | Y6 | V2 | Y5 | U2 | Y4
  128.             movq        [edi+ebp*4],mm1
  129.             movq        [edi+ebp*4+8],mm2
  130.             add            ebp,4
  131.             jne            xloop
  132.  
  133.             add            eax,[esp+16+16]        ;ysrc += ypitch
  134.             add            edi,[esp+ 8+16]        ;dst += dpitch
                  // esi starts out as cpitch, so the xor makes it 0 after the first
                  // row, cpitch after the second, 0 after the third, and so on.
  135.             xor            esi,[esp+28+16]        ;only add chroma bump every other line
  136.             add            ecx,esi                ;usrc += (y&1) ? cpitch : 0;
  137.             add            edx,esi                ;vsrc += (y&1) ? cpitch : 0;
  138.  
  139.             dec            ebx
  140.             jne            yloop
  141.             
                  // Clear MMX state before returning, then restore the saved
                  // registers in reverse push order.
  142.             emms
  143.             pop            ebx
  144.             pop            esi
  145.             pop            edi
  146.             pop            ebp
  147.  
  148.             ret
  149.         };
  150.     }
  151.  
  152.     void __declspec(naked) DecodeYVYU(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height) {
              // Interleaves planar Y/Cb/Cr into packed bytes in the memory order
              // Y0 V0 Y1 U0 Y2 V1 Y3 U1 ... (YVYU) using MMX unpack instructions.
              // Identical to DecodeYUYV except that the Cr plane (edx) is loaded
              // first and the Cb plane (ecx) is unpacked into it.
              //
              // Naked function: there is no compiler prologue, so arguments are read
              // straight off the stack. After the four pushes below every argument
              // sits 16 bytes further away, hence the "+16" in each operand:
              //   _dst +4, dpitch +8, srcY1 +12, ypitch +16, srcCr +20,
              //   srcCb +24, cpitch +28, mbw +32, height +36.
              //
              // Per output row the loop reads mbw*16 luma bytes and mbw*8 bytes from
              // each chroma plane, and writes mbw*32 bytes. The chroma pointers move
              // by cpitch only after every second row, so two consecutive luma rows
              // share one chroma row.
              //
              // Both loops test their counter after the body (add/jne, dec/jne), so
              // mbw and height must be at least 1. The mbw argument slot on the
              // stack is overwritten with -mbw*8.
  153.         __asm {
  154.             push        ebp
  155.             push        edi
  156.             push        esi
  157.             push        ebx
  158.  
  159.             mov            ebx,[esp+36+16]        ;ebx = height
  160.             mov            esi,[esp+32+16]        ;esi = mbw
  161.             xor            esi,-1                ;esi = -mbw-1
  162.             mov            eax,[esp+12+16]        ;eax = ysrc
  163.             mov            ecx,[esp+24+16]        ;ecx = cbsrc
  164.             lea            esi,[esi*8+8]        ;esi = -mbw*8
  165.             mov            edx,[esp+20+16]        ;edx = crsrc
  166.             sub            ecx,esi                ;ecx = cbsrc + mbw*8
  167.             sub            edx,esi                ;edx = crsrc + mbw*8
                  // Park -mbw*8 in the mbw argument slot; it is reloaded into ebp as
                  // the horizontal counter at the top of every row.
  168.             mov            [esp+32+16],esi
  169.             add            esi,esi
                  // NOTE: the trailing comment on the next line is stale. The
                  // instruction actually leaves eax = ysrc + mbw*16.
  170.             sub            eax,esi                ;edx = crsrc + mbw*16
  171.             add            esi,esi
  172.             mov            edi,[esp+ 4+16]        ;edi = dst
                  // NOTE: the trailing comment on the next line is stale. The
                  // instruction actually leaves edi = dst + mbw*32.
  173.             sub            edi,esi                ;edx = crsrc + mbw*32
  174.             mov            esi,[esp+28+16]        ;esi = cpitch
  175. yloop:
  176.             mov            ebp,[esp+32+16]        ;ebp = -mbw*8
                  // All four base pointers were biased to the END of their row above,
                  // so the negative index in ebp walks each row forward until it
                  // reaches zero. One pass handles 4 chroma pairs and 8 luma samples.
  177. xloop:
  178.             movq        mm1,[eax+ebp*2]        ;mm1 = Y7 | Y6 | Y5 | Y4 | Y3 | Y2 | Y1 | Y0
  179.             movd        mm0,[edx+ebp]        ;mm0 =  0 |  0 |  0 |  0 | V3 | V2 | V1 | V0
  180.             movq        mm2,mm1
  181.             punpcklbw    mm0,[ecx+ebp]        ;mm0 = U3 | V3 | U2 | V2 | U1 | V1 | U0 | V0
                  // Luma is the unpack destination here, so each Y byte lands in the
                  // lower address of its pair and the chroma byte follows it.
  182.             punpcklbw    mm1,mm0                ;mm1 = U1 | Y3 | V1 | Y2 | U0 | Y1 | V0 | Y0
  183.             punpckhbw    mm2,mm0                ;mm2 = U3 | Y7 | V3 | Y6 | U2 | Y5 | V2 | Y4
  184.             movq        [edi+ebp*4],mm1
  185.             movq        [edi+ebp*4+8],mm2
  186.             add            ebp,4
  187.             jne            xloop
  188.  
  189.             add            eax,[esp+16+16]        ;ysrc += ypitch
  190.             add            edi,[esp+ 8+16]        ;dst += dpitch
                  // esi starts out as cpitch, so the xor makes it 0 after the first
                  // row, cpitch after the second, 0 after the third, and so on.
  191.             xor            esi,[esp+28+16]        ;only add chroma bump every other line
  192.             add            ecx,esi                ;usrc += (y&1) ? cpitch : 0;
  193.             add            edx,esi                ;vsrc += (y&1) ? cpitch : 0;
  194.  
  195.             dec            ebx
  196.             jne            yloop
  197.             
                  // Clear MMX state before returning, then restore the saved
                  // registers in reverse push order.
  198.             emms
  199.             pop            ebx
  200.             pop            esi
  201.             pop            edi
  202.             pop            ebp
  203.  
  204.             ret
  205.         };
  206.     }
  207.  
  208.     void DecodeRGB15(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height) {
  209.         char *dst1 = (char *)_dst;
  210.         char *dst2 = (char *)_dst + dpitch;
  211.         const unsigned char *srcY2 = srcY1 + ypitch;
  212.  
  213.         dpitch *= 2;
  214.         ypitch *= 2;
  215.  
  216.         do {
  217.             if (height == 1) {
  218.                 srcY2 = srcY1;
  219.                 dst2 = dst1;
  220.             }
  221.             asm_YUVtoRGB16_row_MMX(dst1, dst2, srcY1, srcY2, srcCb, srcCr, mbw*8);
  222.             dst1 += dpitch;
  223.             dst2 += dpitch;
  224.             srcY1 += ypitch;
  225.             srcY2 += ypitch;
  226.             srcCr += cpitch;
  227.             srcCb += cpitch;
  228.         } while((height-=2)>0);
  229.  
  230.         __asm emms
  231.     }
  232.  
  233.     void DecodeRGB24(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height) {
  234.         char *dst1 = (char *)_dst;
  235.         char *dst2 = (char *)_dst + dpitch;
  236.         const unsigned char *srcY2 = srcY1 + ypitch;
  237.  
  238.         dpitch *= 2;
  239.         ypitch *= 2;
  240.  
  241.         do {
  242.             if (height == 1) {
  243.                 srcY2 = srcY1;
  244.                 dst2 = dst1;
  245.             }
  246.             asm_YUVtoRGB24_row_MMX(dst1, dst2, srcY1, srcY2, srcCb, srcCr, mbw*8);
  247.             dst1 += dpitch;
  248.             dst2 += dpitch;
  249.             srcY1 += ypitch;
  250.             srcY2 += ypitch;
  251.             srcCr += cpitch;
  252.             srcCb += cpitch;
  253.         } while((height-=2)>0);
  254.  
  255.         __asm emms
  256.     }
  257.  
  258.     void DecodeRGB32(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height) {
  259.         char *dst1 = (char *)_dst;
  260.         char *dst2 = (char *)_dst + dpitch;
  261.         const unsigned char *srcY2 = srcY1 + ypitch;
  262.  
  263.         dpitch *= 2;
  264.         ypitch *= 2;
  265.  
  266.         do {
  267.             if (height == 1) {
  268.                 srcY2 = srcY1;
  269.                 dst2 = dst1;
  270.             }
  271.             asm_YUVtoRGB32_row_MMX(dst1, dst2, srcY1, srcY2, srcCb, srcCr, mbw*8);
  272.             dst1 += dpitch;
  273.             dst2 += dpitch;
  274.             srcY1 += ypitch;
  275.             srcY2 += ypitch;
  276.             srcCr += cpitch;
  277.             srcCb += cpitch;
  278.         } while((height-=2)>0);
  279.  
  280.         __asm emms
  281.     }
  282. };
  283.  
  284. ///////////////////////////////////////////////////////////////////////////
  285.  
  // Forward declarations of two converters that are defined outside this file
  // (presumably the plain C++ reference implementations - confirm). They share
  // the parameter list of the MMX converters above and fill the Y41P and RGB16
  // slots of the converter table below, for which this file has no MMX version.
  286. namespace nsVDMPEGConvertReference {
  287.     extern void DecodeY41P(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height);
  288.     extern void DecodeRGB16(void *dst, ptrdiff_t dpitch, const unsigned char *srcY, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height);
  289. }
  290.  
  // Converter table for the MMX code path. The initializer is positional, so
  // the entry order must match the member order of VDMPEGConverterSet, which is
  // declared outside this file (presumably in <vd2/Meia/MPEGConvert.h>).
  // Y41P and RGB16 have no MMX version here and use the reference routines.
  291. const struct VDMPEGConverterSet g_VDMPEGConvert_mmx = {
  292.     nsVDMPEGConvertMMX::DecodeUYVY,
  293.     nsVDMPEGConvertMMX::DecodeYUYV,
  294.     nsVDMPEGConvertMMX::DecodeYVYU,
  295.     nsVDMPEGConvertReference::DecodeY41P,        // no MMX version in this file
  296.     nsVDMPEGConvertMMX::DecodeRGB15,
  297.     nsVDMPEGConvertReference::DecodeRGB16,        // no MMX version in this file
  298.     nsVDMPEGConvertMMX::DecodeRGB24,
  299.     nsVDMPEGConvertMMX::DecodeRGB32,
  300. };
  301.