home *** CD-ROM | disk | FTP | other *** search
/ Computer Shopper 275 / DPCS0111DVD.ISO / Toolkit / Audio-Visual / VirtualDub / Source / VirtualDub-1.9.10-src.7z / src / Kasumi / source / a_resample_sse41.asm < prev    next >
Encoding:
Assembly Source File  |  2009-09-14  |  6.6 KB  |  359 lines

  1.         segment    .rdata, align=16
  2.  
  3. round        dq        0000000000002000h
  4. colround    dq        0000200000002000h
  5.  
  6.         segment    .text
  7.         
  8.         global        _vdasm_resize_table_row_8_k8_4x_SSE41
  9. _vdasm_resize_table_row_8_k8_4x_SSE41:
  10.         push        ebp
  11.         push        edi
  12.         push        esi
  13.         push        ebx
  14.  
  15.         movq        xmm6, [round]
  16.         pshufd        xmm6, xmm6, 0
  17.  
  18.         mov            ebp, [esp +  4 + 16]        ;ebp = dst
  19.         mov            esi, [esp + 12 + 16]        ;esi = width
  20.         mov            edi, [esp + 16 + 16]        ;edi = kernel
  21. .yloop:
  22.         ;eax = temp
  23.         ;ebx = temp
  24.         ;ecx = temp
  25.         ;edx = temp
  26.         ;esi = horiz counter
  27.         ;edi = filter list
  28.         ;ebp = destination
  29.  
  30.         mov            eax, [edi+0]
  31.         mov            ebx, [edi+4]
  32.         mov            ecx, [edi+8]
  33.         mov            edx, [esp+8+16]
  34.         add            eax, edx
  35.         add            ebx, edx
  36.         add            ecx, edx
  37.         add            edx, [edi+12]
  38.  
  39.         pmovzxbw    xmm0, [eax]
  40.         pmaddwd        xmm0, [edi+10h]
  41.         pmovzxbw    xmm1, [ebx]
  42.         pmaddwd        xmm1, [edi+20h]
  43.         pmovzxbw    xmm2, [ecx]
  44.         pmaddwd        xmm2, [edi+30h]
  45.         pmovzxbw    xmm3, [edx]
  46.         pmaddwd        xmm3, [edi+40h]
  47.         add            edi, 50h
  48.         phaddd        xmm0, xmm1
  49.         phaddd        xmm2, xmm3
  50.         phaddd        xmm0, xmm2
  51.         paddd        xmm0, xmm6
  52.         psrad        xmm0, 14
  53.         packssdw    xmm0, xmm0
  54.         packuswb    xmm0, xmm0
  55.         movd        [ebp], xmm0
  56.  
  57.         add            ebp, 4
  58.         sub            esi, 1
  59.         jne            .yloop
  60.  
  61.         pop            ebx
  62.         pop            esi
  63.         pop            edi
  64.         pop            ebp
  65.         ret
  66.  
  67.         global        _vdasm_resize_table_row_8_k16_4x_SSE41
  68. _vdasm_resize_table_row_8_k16_4x_SSE41:
  69.         push        ebp
  70.         push        edi
  71.         push        esi
  72.         push        ebx
  73.  
  74.         movq        xmm6, [round]
  75.         pshufd        xmm6, xmm6, 0
  76.  
  77.         mov            ebp, [esp +  4 + 16]        ;ebp = dst
  78.         mov            esi, [esp + 12 + 16]        ;esi = width
  79.         mov            edi, [esp + 16 + 16]        ;edi = kernel
  80. .yloop:
  81.         ;eax = temp
  82.         ;ebx = temp
  83.         ;ecx = temp
  84.         ;edx = temp
  85.         ;esi = horiz counter
  86.         ;edi = filter list
  87.         ;ebp = destination
  88.  
  89.         mov            eax, [edi+0]
  90.         mov            ebx, [edi+4]
  91.         mov            ecx, [edi+8]
  92.         mov            edx, [esp+8+16]
  93.         add            eax, edx
  94.         add            ebx, edx
  95.         add            ecx, edx
  96.         add            edx, [edi+12]
  97.  
  98.         pmovzxbw    xmm0, [eax]
  99.         pmaddwd        xmm0, [edi+10h]
  100.         pmovzxbw    xmm1, [ebx]
  101.         pmaddwd        xmm1, [edi+20h]
  102.         pmovzxbw    xmm2, [ecx]
  103.         pmaddwd        xmm2, [edi+30h]
  104.         pmovzxbw    xmm3, [edx]
  105.         pmaddwd        xmm3, [edi+40h]
  106.         pmovzxbw    xmm4, [eax+8]
  107.         pmaddwd        xmm4, [edi+50h]
  108.         pmovzxbw    xmm5, [ebx+8]
  109.         pmaddwd        xmm5, [edi+60h]
  110.         paddd        xmm0, xmm4
  111.         pmovzxbw    xmm4, [ecx+8]
  112.         pmaddwd        xmm4, [edi+70h]
  113.         paddd        xmm1, xmm5
  114.         pmovzxbw    xmm5, [edx+8]
  115.         pmaddwd        xmm5, [edi+80h]
  116.         paddd        xmm2, xmm4
  117.         paddd        xmm3, xmm5
  118.         add            edi, 90h
  119.         phaddd        xmm0, xmm1
  120.         phaddd        xmm2, xmm3
  121.         phaddd        xmm0, xmm2
  122.         paddd        xmm0, xmm6
  123.         psrad        xmm0, 14
  124.         packssdw    xmm0, xmm0
  125.         packuswb    xmm0, xmm0
  126.         movd        [ebp], xmm0
  127.  
  128.         add            ebp, 4
  129.         sub            esi, 1
  130.         jne            .yloop
  131.  
  132.         pop            ebx
  133.         pop            esi
  134.         pop            edi
  135.         pop            ebp
  136.         ret
  137.  
  138.         global        _vdasm_resize_table_row_8_SSE41
  139. _vdasm_resize_table_row_8_SSE41:
  140.         push        ebp
  141.         push        edi
  142.         push        esi
  143.         push        ebx
  144.  
  145.         pxor        xmm7, xmm7
  146.         movq        xmm6, [round]
  147.  
  148.         mov            edi, [esp +  4 + 16]        ;edi = dst
  149.         mov            ebx, [esp +  8 + 16]        ;ebx = src
  150.         mov            ebp, [esp + 12 + 16]        ;ebp = width
  151.         mov            edx, [esp + 16 + 16]        ;edx = kernel
  152. .yloop:
  153.         ;eax = temp
  154.         ;ebx = source base address
  155.         ;ecx = (temp) source
  156.         ;edx = filter list
  157.         ;esi = (temp) kernel width
  158.         ;edi = destination
  159.         ;ebp = horiz counter
  160.  
  161.         mov            eax, [edx]
  162.         add            edx, 16
  163.         lea            ecx, [ebx + eax]
  164.         mov            esi, [esp + 20 + 16]        ;esi = kernel width
  165.  
  166.         movq        xmm2, xmm6
  167. .xloop:
  168.         pmovzxbw    xmm0, [ecx]
  169.         add            ecx, 8
  170.         pmaddwd        xmm0, [edx]
  171.         paddd        xmm2, xmm0
  172.         add            edx, 16
  173.         sub            esi, 8
  174.         jne            .xloop
  175.  
  176.         phaddd        xmm2, xmm2
  177.         phaddd        xmm2, xmm2
  178.         psrad        xmm2, 14
  179.         packssdw    xmm2, xmm2
  180.         packuswb    xmm2, xmm2
  181.         movd        eax, xmm2
  182.         mov            [edi], al
  183.         add            edi, 1
  184.         sub            ebp, 1
  185.         jne            .yloop
  186.  
  187.         pop            ebx
  188.         pop            esi
  189.         pop            edi
  190.         pop            ebp
  191.         ret
  192.         
  193.  
  194.         global        _vdasm_resize_table_col_8_k2_SSE41
  195. _vdasm_resize_table_col_8_k2_SSE41:
  196.         push        ebp
  197.         push        edi
  198.         push        esi
  199.         push        ebx
  200.  
  201.         movq        xmm6, [colround]
  202.         pshufd        xmm6, xmm6, 0
  203.  
  204.         mov            esi, [esp +  4 + 16]        ;esi = dst
  205.         mov            edi, [esp + 16 + 16]        ;edi = kernel
  206.         mov            ebp, [esp + 12 + 16]        ;ebp = width
  207.  
  208.         movq        xmm7, [edi]
  209.         pshufd        xmm7, xmm7, 0
  210.  
  211.         mov            edx, [esp +  8 + 16]        ;ebx = srcs
  212.         mov            eax, [edx+0]
  213.         mov            ebx, [edx+4]
  214.         add            eax, ebp
  215.         add            ebx, ebp
  216.         neg            ebp
  217.         
  218. .yloop:
  219.         ;eax = row0
  220.         ;ebx = row1
  221.         ;ecx =
  222.         ;edx =
  223.         ;edi = kernel
  224.         ;esi = dest
  225.         ;ebp = width counter
  226.  
  227.         movd        xmm0, [eax+ebp]
  228.         movd        xmm2, [ebx+ebp]
  229.         punpcklbw    xmm0, xmm2
  230.         pmovzxbw    xmm0, xmm0
  231.         pmaddwd        xmm0, xmm7
  232.  
  233.         paddd        xmm0, xmm6
  234.  
  235.         psrad        xmm0, 14
  236.         packssdw    xmm0, xmm0
  237.         packuswb    xmm0, xmm0
  238.         movd        [esi], xmm0
  239.         add            esi, 4
  240.         add            ebp, 4
  241.         jnz            .yloop
  242.  
  243.         pop            ebx
  244.         pop            esi
  245.         pop            edi
  246.         pop            ebp
  247.         ret
  248.  
  249.         global        _vdasm_resize_table_col_8_k4_SSE41
  250. _vdasm_resize_table_col_8_k4_SSE41:
  251.         push        ebp
  252.         push        edi
  253.         push        esi
  254.         push        ebx
  255.  
  256.         movq        xmm7, [colround]
  257.         pshufd        xmm7, xmm7, 0
  258.  
  259.         mov            esi, [esp +  4 + 16]        ;esi = dst
  260.         mov            edi, [esp + 16 + 16]        ;edi = kernel
  261.  
  262.         movdqu        xmm6, [edi]
  263.         pshufd        xmm5, xmm6, 0
  264.         pshufd        xmm6, xmm6, 0aah
  265.  
  266.         mov            edx, [esp +  8 + 16]        ;ebx = srcs
  267.         mov            ebp, [esp + 12 + 16]
  268.         mov            eax, [edx+0]
  269.         mov            ebx, [edx+4]
  270.         mov            ecx, [edx+8]
  271.         mov            edx, [edx+12]
  272.         lea            eax, [eax+ebp-4]
  273.         lea            ebx, [ebx+ebp-4]
  274.         lea            ecx, [ecx+ebp-4]
  275.         lea            edx, [edx+ebp-4]
  276.         lea            esi, [esi+ebp-4]
  277.         neg            ebp
  278.         add            ebp,4
  279.         jz            .odd
  280. .yloop:
  281.         ;eax = row0
  282.         ;ebx = row1
  283.         ;ecx = row2
  284.         ;edx = row3
  285.         ;edi = kernel
  286.         ;esi = dest
  287.         ;ebp = width counter
  288.  
  289.         movd        xmm0, [eax+ebp]
  290.         movd        xmm1, [ebx+ebp]
  291.         punpcklbw    xmm0, xmm1
  292.  
  293.         movd        xmm1, [ecx+ebp]
  294.         movd        xmm2, [edx+ebp]
  295.         punpcklbw    xmm1, xmm2
  296.  
  297.         movd        xmm2, [eax+ebp+4]
  298.         movd        xmm3, [ebx+ebp+4]
  299.         punpcklbw    xmm2, xmm3
  300.         
  301.         movd        xmm3, [ecx+ebp+4]
  302.         movd        xmm4, [edx+ebp+4]
  303.         punpcklbw    xmm3, xmm4
  304.         
  305.         pmovzxbw    xmm0, xmm0
  306.         pmaddwd        xmm0, xmm5
  307.         
  308.         pmovzxbw    xmm1, xmm1
  309.         pmaddwd        xmm1, xmm6
  310.         
  311.         pmovzxbw    xmm2, xmm2
  312.         pmaddwd        xmm2, xmm5
  313.         
  314.         pmovzxbw    xmm3, xmm3
  315.         pmaddwd        xmm3, xmm6
  316.  
  317.         paddd        xmm0, xmm1
  318.         paddd        xmm2, xmm3
  319.  
  320.         paddd        xmm0, xmm7
  321.         paddd        xmm2, xmm7
  322.  
  323.         psrad        xmm0, 14
  324.         psrad        xmm2, 14
  325.         
  326.         packssdw    xmm0, xmm2
  327.         packuswb    xmm0, xmm0
  328.         movq        [esi+ebp], xmm0
  329.         add            ebp, 8
  330.         js            .yloop
  331.         jnz            .noodd
  332.  
  333. .odd:
  334.         movd        xmm0, [eax]
  335.         movd        xmm1, [ebx]
  336.         movd        xmm2, [ecx]
  337.         movd        xmm3, [edx]
  338.         punpcklbw    xmm0, xmm1
  339.         punpcklbw    xmm2, xmm3
  340.         pmovzxbw    xmm0, xmm0
  341.         pmovzxbw    xmm2, xmm2
  342.         pmaddwd        xmm0, xmm5
  343.         pmaddwd        xmm2, xmm6
  344.         paddd        xmm0, xmm2
  345.         paddd        xmm0, xmm7
  346.         psrad        xmm0, 14
  347.         packssdw    xmm0, xmm0
  348.         packuswb    xmm0, xmm0
  349.         movd        [esi], xmm0
  350. .noodd:
  351.  
  352.         pop            ebx
  353.         pop            esi
  354.         pop            edi
  355.         pop            ebp
  356.         ret
  357.  
  358.         end
  359.