home *** CD-ROM | disk | FTP | other *** search
/ Computer Shopper 275 / DPCS0111DVD.ISO / Toolkit / Audio-Visual / VirtualDub / Source / VirtualDub-1.9.10-src.7z / src / Meia / source / convert_isse.cpp < prev    next >
Encoding:
C/C++ Source or Header  |  2009-09-14  |  9.4 KB  |  329 lines

  1. #include <vd2/Meia/MPEGConvert.h>
  2. #include "tables.h"
  3.  
  4. ///////////////////////////////////////////////////////////////////////////
  5.  
  6. using namespace nsVDMPEGTables;
  7.  
  8. extern "C" void asm_YUVtoRGB16_row_ISSE(
  9.         void *ARGB1_pointer,
  10.         void *ARGB2_pointer,
  11.         const void *Y1_pointer,
  12.         const void *Y2_pointer,
  13.         const void *U_pointer,
  14.         const void *V_pointer,
  15.         long width
  16.         );
  17.  
  18. extern "C" void asm_YUVtoRGB16565_row_ISSE(
  19.         void *ARGB1_pointer,
  20.         void *ARGB2_pointer,
  21.         const void *Y1_pointer,
  22.         const void *Y2_pointer,
  23.         const void *U_pointer,
  24.         const void *V_pointer,
  25.         long width
  26.         );
  27.  
  28. extern "C" void asm_YUVtoRGB24_row_ISSE(
  29.         void *ARGB1_pointer,
  30.         void *ARGB2_pointer,
  31.         const void *Y1_pointer,
  32.         const void *Y2_pointer,
  33.         const void *U_pointer,
  34.         const void *V_pointer,
  35.         long width
  36.         );
  37.  
  38. extern "C" void asm_YUVtoRGB32_row_ISSE(
  39.         void *ARGB1_pointer,
  40.         void *ARGB2_pointer,
  41.         const void *Y1_pointer,
  42.         const void *Y2_pointer,
  43.         const void *U_pointer,
  44.         const void *V_pointer,
  45.         long width
  46.         );
  47.  
  48. namespace nsVDMPEGConvertISSE {
  49.  
  50.     void __declspec(naked) DecodeUYVY(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height) {
  51.         __asm {
  52.             push        ebp
  53.             push        edi
  54.             push        esi
  55.             push        ebx
  56.  
  57.             mov            ebx,[esp+36+16]        ;ebx = height
  58.             mov            esi,[esp+32+16]        ;esi = mbw
  59.             xor            esi,-1                ;esi = -mbw-1
  60.             mov            eax,[esp+12+16]        ;eax = ysrc
  61.             mov            ecx,[esp+24+16]        ;ecx = cbsrc
  62.             lea            esi,[esi*8+8]        ;esi = -mbw*8
  63.             mov            edx,[esp+20+16]        ;edx = crsrc
  64.             sub            ecx,esi                ;ecx = cbsrc + mbw*8
  65.             sub            edx,esi                ;edx = crsrc + mbw*8
  66.             mov            [esp+32+16],esi
  67.             add            esi,esi
  68.             sub            eax,esi                ;edx = crsrc + mbw*16
  69.             add            esi,esi
  70.             mov            edi,[esp+ 4+16]        ;edi = dst
  71.             sub            edi,esi                ;edx = crsrc + mbw*32
  72.             mov            esi,[esp+28+16]        ;esi = cpitch
  73. yloop:
  74.             mov            ebp,[esp+32+16]        ;ebp = -mbw*8
  75. xloop:
  76.             prefetchnta    [ecx+ebp+32]
  77.             movd        mm0,[ecx+ebp]        ;mm0 =  0 |  0 |  0 |  0 | U3 | U2 | U1 | U0
  78.             prefetchnta    [edx+ebp+32]
  79.             punpcklbw    mm0,[edx+ebp]        ;mm0 = V3 | U3 | V2 | U2 | V1 | U1 | V0 | U0
  80.             prefetchnta    [eax+ebp*2+64]
  81.             movq        mm2,[eax+ebp*2]        ;mm2 = Y7 | Y6 | Y5 | Y4 | Y3 | Y2 | Y1 | Y0
  82.             movq        mm1,mm0
  83.             punpcklbw    mm0,mm2                ;mm0 = Y3 | V1 | Y2 | U1 | Y1 | V0 | Y0 | V0
  84.             punpckhbw    mm1,mm2                ;mm1 = Y7 | V3 | Y6 | U3 | Y5 | V2 | Y4 | V2
  85.             movntq        [edi+ebp*4],mm0
  86.             movntq        [edi+ebp*4+8],mm1
  87.             add            ebp,4
  88.             jne            xloop
  89.  
  90.             add            eax,[esp+16+16]        ;ysrc += ypitch
  91.             add            edi,[esp+ 8+16]        ;dst += dpitch
  92.             xor            esi,[esp+28+16]        ;only add chroma bump every other line
  93.             add            ecx,esi                ;usrc += (y&1) ? cpitch : 0;
  94.             add            edx,esi                ;vsrc += (y&1) ? cpitch : 0;
  95.  
  96.             dec            ebx
  97.             jne            yloop
  98.             
  99.             sfence
  100.             emms
  101.             pop            ebx
  102.             pop            esi
  103.             pop            edi
  104.             pop            ebp
  105.  
  106.             or            eax,-1
  107.             ret
  108.         };
  109.     }
  110.  
  111.     void __declspec(naked) DecodeYUYV(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height) {
  112.         __asm {
  113.             push        ebp
  114.             push        edi
  115.             push        esi
  116.             push        ebx
  117.  
  118.             mov            ebx,[esp+36+16]        ;ebx = height
  119.             mov            esi,[esp+32+16]        ;esi = mbw
  120.             xor            esi,-1                ;esi = -mbw-1
  121.             mov            eax,[esp+12+16]        ;eax = ysrc
  122.             mov            ecx,[esp+24+16]        ;ecx = cbsrc
  123.             lea            esi,[esi*8+8]        ;esi = -mbw*8
  124.             mov            edx,[esp+20+16]        ;edx = crsrc
  125.             sub            ecx,esi                ;ecx = cbsrc + mbw*8
  126.             sub            edx,esi                ;edx = crsrc + mbw*8
  127.             mov            [esp+32+16],esi
  128.             add            esi,esi
  129.             sub            eax,esi                ;edx = crsrc + mbw*16
  130.             add            esi,esi
  131.             mov            edi,[esp+ 4+16]        ;edi = dst
  132.             sub            edi,esi                ;edx = crsrc + mbw*32
  133.             mov            esi,[esp+28+16]        ;esi = cpitch
  134. yloop:
  135.             mov            ebp,[esp+32+16]        ;ebp = -mbw*8
  136. xloop:
  137.             prefetchnta    [eax+ebp*2+64]
  138.             movq        mm1,[eax+ebp*2]        ;mm1 = Y7 | Y6 | Y5 | Y4 | Y3 | Y2 | Y1 | Y0
  139.             prefetchnta    [ecx+ebp+32]
  140.             movd        mm0,[ecx+ebp]        ;mm0 =  0 |  0 |  0 |  0 | U3 | U2 | U1 | U0
  141.             movq        mm2,mm1
  142.             prefetchnta    [edx+ebp+32]
  143.             punpcklbw    mm0,[edx+ebp]        ;mm0 = V3 | U3 | V2 | U2 | V1 | U1 | V0 | U0
  144.             punpcklbw    mm1,mm0                ;mm1 = V1 | Y3 | U1 | Y2 | V0 | Y1 | U0 | Y0
  145.             punpckhbw    mm2,mm0                ;mm2 = V3 | Y7 | U3 | Y6 | V2 | Y5 | U2 | Y4
  146.             movntq        [edi+ebp*4],mm1
  147.             movntq        [edi+ebp*4+8],mm2
  148.             add            ebp,4
  149.             jne            xloop
  150.  
  151.             add            eax,[esp+16+16]        ;ysrc += ypitch
  152.             add            edi,[esp+ 8+16]        ;dst += dpitch
  153.             xor            esi,[esp+28+16]        ;only add chroma bump every other line
  154.             add            ecx,esi                ;usrc += (y&1) ? cpitch : 0;
  155.             add            edx,esi                ;vsrc += (y&1) ? cpitch : 0;
  156.  
  157.             dec            ebx
  158.             jne            yloop
  159.             
  160.             sfence
  161.             emms
  162.             pop            ebx
  163.             pop            esi
  164.             pop            edi
  165.             pop            ebp
  166.  
  167.             or            eax,-1
  168.             ret
  169.         };
  170.     }
  171.  
  172.     void __declspec(naked) DecodeYVYU(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height) {
  173.         __asm {
  174.             push        ebp
  175.             push        edi
  176.             push        esi
  177.             push        ebx
  178.  
  179.             mov            ebx,[esp+36+16]        ;ebx = height
  180.             mov            esi,[esp+32+16]        ;esi = mbw
  181.             xor            esi,-1                ;esi = -mbw-1
  182.             mov            eax,[esp+12+16]        ;eax = ysrc
  183.             mov            ecx,[esp+24+16]        ;ecx = cbsrc
  184.             lea            esi,[esi*8+8]        ;esi = -mbw*8
  185.             mov            edx,[esp+20+16]        ;edx = crsrc
  186.             sub            ecx,esi                ;ecx = cbsrc + mbw*8
  187.             sub            edx,esi                ;edx = crsrc + mbw*8
  188.             mov            [esp+32+16],esi
  189.             add            esi,esi
  190.             sub            eax,esi                ;edx = crsrc + mbw*16
  191.             add            esi,esi
  192.             mov            edi,[esp+ 4+16]        ;edi = dst
  193.             sub            edi,esi                ;edx = crsrc + mbw*32
  194.             mov            esi,[esp+28+16]        ;esi = cpitch
  195. yloop:
  196.             mov            ebp,[esp+32+16]        ;ebp = -mbw*8
  197. xloop:
  198.             prefetchnta    [eax+ebp*2+64]
  199.             movq        mm1,[eax+ebp*2]        ;mm1 = Y7 | Y6 | Y5 | Y4 | Y3 | Y2 | Y1 | Y0
  200.             prefetchnta    [edx+ebp+32]
  201.             movd        mm0,[edx+ebp]        ;mm0 =  0 |  0 |  0 |  0 | V3 | V2 | V1 | V0
  202.             movq        mm2,mm1
  203.             prefetchnta    [ecx+ebp+32]
  204.             punpcklbw    mm0,[ecx+ebp]        ;mm0 = U3 | V3 | U2 | V2 | U1 | V1 | U0 | V0
  205.             punpcklbw    mm1,mm0                ;mm1 = U1 | Y3 | V1 | Y2 | U0 | Y1 | V0 | Y0
  206.             punpckhbw    mm2,mm0                ;mm2 = U3 | Y7 | V3 | Y6 | U2 | Y5 | V2 | Y4
  207.             movntq        [edi+ebp*4],mm1
  208.             movntq        [edi+ebp*4+8],mm2
  209.             add            ebp,4
  210.             jne            xloop
  211.  
  212.             add            eax,[esp+16+16]        ;ysrc += ypitch
  213.             add            edi,[esp+ 8+16]        ;dst += dpitch
  214.             xor            esi,[esp+28+16]        ;only add chroma bump every other line
  215.             add            ecx,esi                ;usrc += (y&1) ? cpitch : 0;
  216.             add            edx,esi                ;vsrc += (y&1) ? cpitch : 0;
  217.  
  218.             dec            ebx
  219.             jne            yloop
  220.  
  221.             sfence
  222.             emms
  223.             pop            ebx
  224.             pop            esi
  225.             pop            edi
  226.             pop            ebp
  227.  
  228.             or            eax,-1
  229.             ret
  230.         };
  231.     }
  232.  
  233.     void DecodeRGB15(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height) {
  234.         char *dst1 = (char *)_dst;
  235.         char *dst2 = (char *)_dst + dpitch;
  236.         const unsigned char *srcY2 = srcY1 + ypitch;
  237.  
  238.         dpitch *= 2;
  239.         ypitch *= 2;
  240.  
  241.         do {
  242.             if (height == 1) {
  243.                 srcY2 = srcY1;
  244.                 dst2 = dst1;
  245.             }
  246.             asm_YUVtoRGB16_row_ISSE(dst1, dst2, srcY1, srcY2, srcCb, srcCr, mbw*8);
  247.             dst1 += dpitch;
  248.             dst2 += dpitch;
  249.             srcY1 += ypitch;
  250.             srcY2 += ypitch;
  251.             srcCr += cpitch;
  252.             srcCb += cpitch;
  253.         } while((height-=2)>0);
  254.  
  255.         __asm sfence
  256.         __asm emms
  257.     }
  258.  
  259.     void DecodeRGB24(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height) {
  260.         char *dst1 = (char *)_dst;
  261.         char *dst2 = (char *)_dst + dpitch;
  262.         const unsigned char *srcY2 = srcY1 + ypitch;
  263.  
  264.         dpitch *= 2;
  265.         ypitch *= 2;
  266.  
  267.         do {
  268.             if (height == 1) {
  269.                 srcY2 = srcY1;
  270.                 dst2 = dst1;
  271.             }
  272.             asm_YUVtoRGB24_row_ISSE(dst1, dst2, srcY1, srcY2, srcCb, srcCr, mbw*8);
  273.             dst1 += dpitch;
  274.             dst2 += dpitch;
  275.             srcY1 += ypitch;
  276.             srcY2 += ypitch;
  277.             srcCr += cpitch;
  278.             srcCb += cpitch;
  279.         } while((height-=2)>0);
  280.  
  281.         __asm sfence
  282.         __asm emms
  283.     }
  284.  
  285.     void DecodeRGB32(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height) {
  286.         char *dst1 = (char *)_dst;
  287.         char *dst2 = (char *)_dst + dpitch;
  288.         const unsigned char *srcY2 = srcY1 + ypitch;
  289.  
  290.         dpitch *= 2;
  291.         ypitch *= 2;
  292.  
  293.         do {
  294.             if (height == 1) {
  295.                 srcY2 = srcY1;
  296.                 dst2 = dst1;
  297.             }
  298.             asm_YUVtoRGB32_row_ISSE(dst1, dst2, srcY1, srcY2, srcCb, srcCr, mbw*8);
  299.             dst1 += dpitch;
  300.             dst2 += dpitch;
  301.             srcY1 += ypitch;
  302.             srcY2 += ypitch;
  303.             srcCr += cpitch;
  304.             srcCb += cpitch;
  305.         } while((height-=2)>0);
  306.  
  307.         __asm sfence
  308.         __asm emms
  309.     }
  310. };
  311.  
  312. ///////////////////////////////////////////////////////////////////////////
  313.  
  314. namespace nsVDMPEGConvertReference {
  315.     extern void DecodeY41P(void *_dst, ptrdiff_t dpitch, const unsigned char *srcY1, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height);
  316.     extern void DecodeRGB16(void *dst, ptrdiff_t dpitch, const unsigned char *srcY, ptrdiff_t ypitch, const unsigned char *srcCr, const unsigned char *srcCb, ptrdiff_t cpitch, int mbw, int height);
  317. }
  318.  
  319. const struct VDMPEGConverterSet g_VDMPEGConvert_isse = {
  320.     nsVDMPEGConvertISSE::DecodeUYVY,
  321.     nsVDMPEGConvertISSE::DecodeYUYV,
  322.     nsVDMPEGConvertISSE::DecodeYVYU,
  323.     nsVDMPEGConvertReference::DecodeY41P,
  324.     nsVDMPEGConvertISSE::DecodeRGB15,
  325.     nsVDMPEGConvertReference::DecodeRGB16,
  326.     nsVDMPEGConvertISSE::DecodeRGB24,
  327.     nsVDMPEGConvertISSE::DecodeRGB32,
  328. };
  329.