home *** CD-ROM | disk | FTP | other *** search
/ Computer Shopper 275 / DPCS0111DVD.ISO / Toolkit / Audio-Visual / VirtualDub / Source / VirtualDub-1.9.10-src.7z / src / Kasumi / source / uberblit_resample.cpp < prev    next >
Encoding:
C/C++ Source or Header  |  2009-09-14  |  23.0 KB  |  624 lines

  1. #include <float.h>
  2. #include <math.h>
  3. #include <vd2/system/vdstl.h>
  4. #include <vd2/system/memory.h>
  5. #include <vd2/system/math.h>
  6. #include <vd2/system/cpuaccel.h>
  7. #include <vd2/Kasumi/pixmap.h>
  8. #include <vd2/Kasumi/pixmaputils.h>
  9. #include <vd2/Kasumi/resample.h>
  10.  
  11. #include <vd2/Kasumi/resample_kernels.h>
  12. #include "resample_stages_x86.h"
  13. #include "uberblit_resample.h"
  14.  
  15. namespace {
  16.     sint32 scale32x32_fp16(sint32 x, sint32 y) {
  17.         return (sint32)(((sint64)x * y + 0x8000) >> 16);
  18.     }
  19.  
  20.     template<class T>
  21.     IVDResamplerSeparableRowStage *RowFactory(double cutoff, float filterFactor) {
  22.         return new T;
  23.     }
  24.  
  25.     template<class T>
  26.     IVDResamplerSeparableRowStage *RowFactoryLinear(double cutoff, float filterFactor) {
  27.         return new T(VDResamplerLinearFilter(cutoff));
  28.     }
  29.  
  30.     template<class T>
  31.     IVDResamplerSeparableRowStage *RowFactoryCubic(double cutoff, float filterFactor) {
  32.         return new T(VDResamplerCubicFilter(cutoff, filterFactor));
  33.     }
  34.  
  35.     template<class T>
  36.     IVDResamplerSeparableRowStage *RowFactoryCubic2(double cutoff, float filterFactor) {
  37.         return new T(filterFactor);
  38.     }
  39.  
  40.     template<class T>
  41.     IVDResamplerSeparableRowStage *RowFactoryLanczos3(double cutoff, float filterFactor) {
  42.         return new T(VDResamplerLanczos3Filter(cutoff));
  43.     }
  44.  
  45.     template<class T>
  46.     IVDResamplerSeparableColStage *ColFactory(double cutoff, float filterFactor) {
  47.         return new T;
  48.     }
  49.  
  50.     template<class T>
  51.     IVDResamplerSeparableColStage *ColFactoryLinear(double cutoff, float filterFactor) {
  52.         return new T(VDResamplerLinearFilter(cutoff));
  53.     }
  54.  
  55.     template<class T>
  56.     IVDResamplerSeparableColStage *ColFactoryCubic(double cutoff, float filterFactor) {
  57.         return new T(VDResamplerCubicFilter(cutoff, filterFactor));
  58.     }
  59.  
  60.     template<class T>
  61.     IVDResamplerSeparableColStage *ColFactoryCubic2(double cutoff, float filterFactor) {
  62.         return new T(filterFactor);
  63.     }
  64.  
  65.     template<class T>
  66.     IVDResamplerSeparableColStage *ColFactoryLanczos3(double cutoff, float filterFactor) {
  67.         return new T(VDResamplerLanczos3Filter(cutoff));
  68.     }
  69. }
  70.  
  71. ///////////////////////////////////////////////////////////////////////////
  72. //
  73. // VDPixmapGenResampleRow
  74. //
  75. ///////////////////////////////////////////////////////////////////////////
  76.  
  77. VDPixmapGenResampleRow::VDPixmapGenResampleRow()
  78.     : mpRowStage(NULL)
  79.     , mpRowStage2(NULL)
  80. {
  81. }
  82.  
  83. VDPixmapGenResampleRow::~VDPixmapGenResampleRow() {
  84.     if (mpRowStage)
  85.         delete mpRowStage;
  86. }
  87.  
  88. void VDPixmapGenResampleRow::Init(IVDPixmapGen *src, uint32 srcIndex, uint32 width, float offset, float step, nsVDPixmap::FilterMode filterMode, float filterFactor, bool interpolationOnly) {
  89.     InitSource(src, srcIndex);
  90.  
  91.     sint32 u0 = (sint32)(offset * 65536.0);
  92.     sint32 dudx = (sint32)(step * 65536.0);
  93.  
  94.     mAxis.Init(dudx);
  95.  
  96.     double x_2fc = 1.0;
  97.     if (!interpolationOnly && step > 1.0f)
  98.         x_2fc = 1.0 / step;
  99.  
  100.     struct SpecialCaseSpanRoutine {
  101.         sint32        mPhase;
  102.         sint32        mStep;
  103.         uint32        mType;
  104.         nsVDPixmap::FilterMode mFilterMode;
  105.         uint32 mCPUFlags;
  106.         IVDResamplerSeparableRowStage *(*mpClassFactory)(double filterCutoff, float filterFactor);
  107.     };
  108.  
  109.     static const SpecialCaseSpanRoutine kSpecialCaseSpanRoutines[]={
  110.         // Generic
  111. #if defined _M_IX86
  112.         { +0x0000, 0x008000, kVDPixType_8,        nsVDPixmap::kFilterLinear,        CPUF_SUPPORTS_INTEGER_SSE,    RowFactory<VDResamplerRowStageSeparableLinear8_phaseZeroStepHalf_ISSE> },
  113. #endif
  114.  
  115.         { +0x0000, 0x008000, kVDPixType_8,        nsVDPixmap::kFilterLinear,        0,                            RowFactory<VDResamplerRowStageSeparableLinear8_phaseZeroStepHalf> },
  116.     };
  117.  
  118.     long flags = CPUGetEnabledExtensions();
  119.     uint32 type = mpSrc->GetType(mSrcIndex) & kVDPixType_Mask;
  120.  
  121.     for(int i=0; i<sizeof(kSpecialCaseSpanRoutines)/sizeof(kSpecialCaseSpanRoutines[0]); ++i) {
  122.         const SpecialCaseSpanRoutine& rout = kSpecialCaseSpanRoutines[i];
  123.  
  124.         if (rout.mType != type)
  125.             continue;
  126.  
  127.         if (x_2fc < 1.0)
  128.             continue;
  129.  
  130.         if (rout.mStep != dudx)
  131.             continue;
  132.  
  133.         if (rout.mPhase != u0)
  134.             continue;
  135.  
  136.         if (rout.mFilterMode != filterMode)
  137.             continue;
  138.  
  139.         if ((rout.mCPUFlags & flags) != rout.mCPUFlags)
  140.             continue;
  141.  
  142.         mpRowStage = rout.mpClassFactory(x_2fc, filterFactor);
  143.         mpRowStage2 = mpRowStage->AsRowStage2();
  144.         break;
  145.     }
  146.  
  147.     if (!mpRowStage) {
  148.         struct SpanRoutine {
  149.             uint32        mType;
  150.             bool mbInterpOnly;
  151.             nsVDPixmap::FilterMode mFilterMode;
  152.             uint32 mCPUFlags;
  153.             IVDResamplerSeparableRowStage *(*mpClassFactory)(double filterCutoff, float filterFactor);
  154.         };
  155.         
  156.         static const SpanRoutine kSpanRoutines[]={
  157. #if defined _M_IX86
  158.             // X86
  159.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterPoint,        CPUF_SUPPORTS_MMX,    RowFactory<VDResamplerSeparablePointRowStageMMX> },
  160.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterPoint,        0,                    RowFactory<VDResamplerSeparablePointRowStageX86> },
  161.             { kVDPixType_8,            false,    nsVDPixmap::kFilterLinear,        CPUF_SUPPORTS_SSE41,    RowFactoryLinear<VDResamplerSeparableTableRowStage8SSE41> },
  162.             { kVDPixType_8,            false,    nsVDPixmap::kFilterLinear,        CPUF_SUPPORTS_MMX,    RowFactoryLinear<VDResamplerSeparableTableRowStage8MMX> },
  163.             { kVDPixType_8888,        true,    nsVDPixmap::kFilterLinear,        CPUF_SUPPORTS_MMX,    RowFactory<VDResamplerSeparableLinearRowStageMMX> },
  164.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterLinear,        CPUF_SUPPORTS_SSE2,    RowFactoryLinear<VDResamplerSeparableTableRowStageSSE2> },
  165.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterLinear,        CPUF_SUPPORTS_MMX,    RowFactoryLinear<VDResamplerSeparableTableRowStageMMX> },
  166.             { kVDPixType_8,            false,    nsVDPixmap::kFilterCubic,        CPUF_SUPPORTS_SSE41,    RowFactoryCubic<VDResamplerSeparableTableRowStage8SSE41> },
  167.             { kVDPixType_8,            false,    nsVDPixmap::kFilterCubic,        CPUF_SUPPORTS_MMX,    RowFactoryCubic<VDResamplerSeparableTableRowStage8MMX> },
  168.             { kVDPixType_8888,        true,    nsVDPixmap::kFilterCubic,        CPUF_SUPPORTS_MMX,    RowFactoryCubic2<VDResamplerSeparableCubicRowStageMMX> },
  169.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterCubic,        CPUF_SUPPORTS_SSE2,    RowFactoryCubic<VDResamplerSeparableTableRowStageSSE2> },
  170.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterCubic,        CPUF_SUPPORTS_MMX,    RowFactoryCubic<VDResamplerSeparableTableRowStageMMX> },
  171.             { kVDPixType_8,            false,    nsVDPixmap::kFilterLanczos3,        CPUF_SUPPORTS_SSE41,    RowFactoryLanczos3<VDResamplerSeparableTableRowStage8SSE41> },
  172.             { kVDPixType_8,            false,    nsVDPixmap::kFilterLanczos3,    CPUF_SUPPORTS_MMX,    RowFactoryLanczos3<VDResamplerSeparableTableRowStage8MMX> },
  173.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterLanczos3,    CPUF_SUPPORTS_SSE2,    RowFactoryLanczos3<VDResamplerSeparableTableRowStageSSE2> },
  174.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterLanczos3,    CPUF_SUPPORTS_MMX,    RowFactoryLanczos3<VDResamplerSeparableTableRowStageMMX> },
  175. #elif defined _M_AMD64
  176.             // AMD64
  177.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterLinear,        CPUF_SUPPORTS_SSE2,    RowFactoryLinear<VDResamplerSeparableTableRowStageSSE2> },
  178.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterCubic,        CPUF_SUPPORTS_SSE2,    RowFactoryCubic<VDResamplerSeparableTableRowStageSSE2> },
  179.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterLanczos3,    CPUF_SUPPORTS_SSE2,    RowFactoryLanczos3<VDResamplerSeparableTableRowStageSSE2> },
  180. #endif
  181.             // Generic
  182.             { kVDPixType_8,            false,    nsVDPixmap::kFilterPoint,        0,                    RowFactory<VDResamplerRowStageSeparablePoint8> },
  183.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterPoint,        0,                    RowFactory<VDResamplerRowStageSeparablePoint32> },
  184.             { kVDPixType_8,            true,    nsVDPixmap::kFilterLinear,        0,                    RowFactory<VDResamplerRowStageSeparableLinear8> },
  185.             { kVDPixType_8888,        true,    nsVDPixmap::kFilterLinear,        0,                    RowFactory<VDResamplerRowStageSeparableLinear32> },
  186.             { kVDPixType_8,            false,    nsVDPixmap::kFilterLinear,        0,                    RowFactoryLinear<VDResamplerRowStageSeparableTable8> },
  187.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterLinear,        0,                    RowFactoryLinear<VDResamplerRowStageSeparableTable32> },
  188.             { kVDPixType_32F_LE,    false,    nsVDPixmap::kFilterLinear,        0,                    RowFactoryLinear<VDResamplerRowStageSeparableTable32F> },
  189.             { kVDPixType_32Fx4_LE,    false,    nsVDPixmap::kFilterLinear,        0,                    RowFactoryLinear<VDResamplerRowStageSeparableTable32Fx4> },
  190.             { kVDPixType_8,            false,    nsVDPixmap::kFilterCubic,        0,                    RowFactoryCubic<VDResamplerRowStageSeparableTable8> },
  191.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterCubic,        0,                    RowFactoryCubic<VDResamplerRowStageSeparableTable32> },
  192.             { kVDPixType_32F_LE,    false,    nsVDPixmap::kFilterCubic,        0,                    RowFactoryCubic<VDResamplerRowStageSeparableTable32F> },
  193.             { kVDPixType_32Fx4_LE,    false,    nsVDPixmap::kFilterCubic,        0,                    RowFactoryCubic<VDResamplerRowStageSeparableTable32Fx4> },
  194.             { kVDPixType_8,            false,    nsVDPixmap::kFilterLanczos3,    0,                    RowFactoryLanczos3<VDResamplerRowStageSeparableTable8> },
  195.             { kVDPixType_8888,        false,    nsVDPixmap::kFilterLanczos3,    0,                    RowFactoryLanczos3<VDResamplerRowStageSeparableTable32> },
  196.             { kVDPixType_32F_LE,    false,    nsVDPixmap::kFilterLanczos3,    0,                    RowFactoryLanczos3<VDResamplerRowStageSeparableTable32F> },
  197.             { kVDPixType_32Fx4_LE,    false,    nsVDPixmap::kFilterLanczos3,    0,                    RowFactoryLanczos3<VDResamplerRowStageSeparableTable32Fx4> },
  198.         };
  199.  
  200.         for(int i=0; i<sizeof(kSpanRoutines)/sizeof(kSpanRoutines[0]); ++i) {
  201.             const SpanRoutine& rout = kSpanRoutines[i];
  202.  
  203.             if (rout.mType != type)
  204.                 continue;
  205.  
  206.             if (rout.mbInterpOnly && x_2fc < 1.0)
  207.                 continue;
  208.  
  209.             if (rout.mFilterMode != filterMode)
  210.                 continue;
  211.  
  212.             if ((rout.mCPUFlags & flags) != rout.mCPUFlags)
  213.                 continue;
  214.  
  215.             mpRowStage = rout.mpClassFactory(x_2fc, filterFactor);
  216.             mpRowStage2 = mpRowStage->AsRowStage2();
  217.             break;
  218.         }
  219.     }
  220.  
  221.     VDASSERT(mpRowStage);
  222.  
  223.     mRowFiltW = mpRowStage->GetWindowSize();
  224.  
  225.     mpSrc->AddWindowRequest(0, 0);
  226.  
  227.     sint32 fsx1 = (sint32)(offset * 65536.0) - ((mRowFiltW-1) << 15);
  228.     mAxis.Compute(width, fsx1, mSrcWidth, mRowFiltW);
  229.     mWidth = width;
  230.  
  231.     switch(type) {
  232.         case kVDPixType_8:
  233.             mBytesPerSample = 1;
  234.             break;
  235.         case kVDPixType_8888:
  236.         case kVDPixType_32F_LE:
  237.             mBytesPerSample = 4;
  238.             break;
  239.         case kVDPixType_32Fx4_LE:
  240.             mBytesPerSample = 16;
  241.             break;
  242.  
  243.         default:
  244.             VDASSERT(false);
  245.     }
  246. }
  247.  
  248. void VDPixmapGenResampleRow::Start() {
  249.     StartWindow(mWidth * mBytesPerSample);
  250.  
  251.     uint32 clipSpace = ((mRowFiltW*3*mBytesPerSample + 15) >> 4) << 2;
  252.     mTempSpace.resize(clipSpace);
  253.  
  254.     if (mpRowStage2)
  255.         mpRowStage2->Init(mAxis, mSrcWidth);
  256. }
  257.  
  258. void VDPixmapGenResampleRow::Compute(void *dst0, sint32 y) {
  259.     switch(mBytesPerSample) {
  260.         case 1:
  261.             Compute8(dst0, y);
  262.             break;
  263.         case 4:
  264.             Compute32(dst0, y);
  265.             break;
  266.         case 16:
  267.             Compute128(dst0, y);
  268.             break;
  269.     }
  270. }
  271.  
  272. void VDPixmapGenResampleRow::Compute8(void *dst0, sint32 y) {
  273.     const uint8 *src = (const uint8 *)mpSrc->GetRow(y, mSrcIndex);
  274.     uint8 *dst = (uint8 *)dst0;
  275.  
  276.     // process pre-copy region
  277.     if (uint32 count = mAxis.dx_precopy) {
  278.         VDMemset8(dst, src[0], count);
  279.         dst += count;
  280.     }
  281.  
  282.     uint8 *p = (uint8*)mTempSpace.data();
  283.     sint32 u = mAxis.u;
  284.     const sint32 dudx = mAxis.dudx;
  285.  
  286.     // process dual-clip region
  287.     if (mpRowStage2) {
  288.         uint32 count = mAxis.dx_preclip + mAxis.dx_active + mAxis.dx_postclip + mAxis.dx_dualclip;
  289.         mpRowStage2->Process(dst, src, count);
  290.         dst += count;
  291.     } else if (uint32 count = mAxis.dx_dualclip) {
  292.         VDMemset8(p, src[0], mRowFiltW);
  293.         memcpy(p + mRowFiltW, src+1, (mSrcWidth-2));
  294.         VDMemset8(p + mRowFiltW + (mSrcWidth-2), src[mSrcWidth-1], mRowFiltW);
  295.  
  296.         mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
  297.         u += dudx*count;
  298.         dst += count;
  299.     } else {
  300.         // process pre-clip region
  301.         if (uint32 count = mAxis.dx_preclip) {
  302.             VDMemset8(p, src[0], mRowFiltW);
  303.             memcpy(p + mRowFiltW, src+1, (mRowFiltW-1));
  304.  
  305.             mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
  306.             u += dudx*count;
  307.             dst += count;
  308.         }
  309.  
  310.         // process active region
  311.         if (uint32 count = mAxis.dx_active) {
  312.             mpRowStage->Process(dst, src, count, u, dudx);
  313.             u += dudx*count;
  314.             dst += count;
  315.         }
  316.  
  317.         // process post-clip region
  318.         if (uint32 count = mAxis.dx_postclip) {
  319.             uint32 offset = mSrcWidth + 1 - mRowFiltW;
  320.  
  321.             memcpy(p, src+offset, (mRowFiltW-1));
  322.             VDMemset8(p + (mRowFiltW-1), src[mSrcWidth-1], mRowFiltW);
  323.  
  324.             mpRowStage->Process(dst, p, count, u - (offset<<16), dudx);
  325.             dst += count;
  326.         }
  327.     }
  328.  
  329.     // process post-copy region
  330.     if (uint32 count = mAxis.dx_postcopy) {
  331.         VDMemset8(dst, src[mSrcWidth-1], count);
  332.     }
  333. }
  334.  
  335. void VDPixmapGenResampleRow::Compute32(void *dst0, sint32 y) {
  336.     const uint32 *src = (const uint32 *)mpSrc->GetRow(y, mSrcIndex);
  337.     uint32 *dst = (uint32 *)dst0;
  338.  
  339.     // process pre-copy region
  340.     if (uint32 count = mAxis.dx_precopy) {
  341.         VDMemset32(dst, src[0], count);
  342.         dst += count;
  343.     }
  344.  
  345.     uint32 *p = mTempSpace.data();
  346.     sint32 u = mAxis.u;
  347.     const sint32 dudx = mAxis.dudx;
  348.  
  349.     // process dual-clip region
  350.     if (uint32 count = mAxis.dx_dualclip) {
  351.         VDMemset32(p, src[0], mRowFiltW);
  352.         memcpy(p + mRowFiltW, src+1, (mSrcWidth-2)*sizeof(uint32));
  353.         VDMemset32(p + mRowFiltW + (mSrcWidth-2), src[mSrcWidth-1], mRowFiltW);
  354.  
  355.         mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
  356.         u += dudx*count;
  357.         dst += count;
  358.     } else if (mpRowStage2) {
  359.         mpRowStage2->Process(dst, p, mAxis.dx_preclip + mAxis.dx_active + mAxis.dx_postclip);
  360.     } else {
  361.         // process pre-clip region
  362.         if (uint32 count = mAxis.dx_preclip) {
  363.             VDMemset32(p, src[0], mRowFiltW);
  364.             memcpy(p + mRowFiltW, src+1, (mRowFiltW-1)*sizeof(uint32));
  365.  
  366.             mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
  367.             u += dudx*count;
  368.             dst += count;
  369.         }
  370.  
  371.         // process active region
  372.         if (uint32 count = mAxis.dx_active) {
  373.             mpRowStage->Process(dst, src, count, u, dudx);
  374.             u += dudx*count;
  375.             dst += count;
  376.         }
  377.  
  378.         // process post-clip region
  379.         if (uint32 count = mAxis.dx_postclip) {
  380.             uint32 offset = mSrcWidth + 1 - mRowFiltW;
  381.  
  382.             memcpy(p, src+offset, (mRowFiltW-1)*sizeof(uint32));
  383.             VDMemset32(p + (mRowFiltW-1), src[mSrcWidth-1], mRowFiltW);
  384.  
  385.             mpRowStage->Process(dst, p, count, u - (offset<<16), dudx);
  386.             dst += count;
  387.         }
  388.     }
  389.  
  390.     // process post-copy region
  391.     if (uint32 count = mAxis.dx_postcopy) {
  392.         VDMemset32(dst, src[mSrcWidth-1], count);
  393.     }
  394. }
  395.  
  396. void VDPixmapGenResampleRow::Compute128(void *dst0, sint32 y) {
  397.     const uint32 *src = (const uint32 *)mpSrc->GetRow(y, mSrcIndex);
  398.     uint32 *dst = (uint32 *)dst0;
  399.  
  400.     // process pre-copy region
  401.     if (uint32 count = mAxis.dx_precopy) {
  402.         VDMemset128(dst, src, count);
  403.         dst += 4*count;
  404.     }
  405.  
  406.     uint32 *p = mTempSpace.data();
  407.     sint32 u = mAxis.u;
  408.     const sint32 dudx = mAxis.dudx;
  409.  
  410.     // process dual-clip region
  411.     if (uint32 count = mAxis.dx_dualclip) {
  412.         VDMemset128(p, src, mRowFiltW);
  413.         memcpy(p + 4*mRowFiltW, src+1, (mSrcWidth-2)*sizeof(uint32)*4);
  414.         VDMemset128(p + 4*(mRowFiltW + (mSrcWidth-2)), src + 4*(mSrcWidth-1), mRowFiltW);
  415.  
  416.         mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
  417.         u += dudx*count;
  418.         dst += count * 4;
  419.     } else if (mpRowStage2) {
  420.         mpRowStage2->Process(dst, p, mAxis.dx_preclip + mAxis.dx_active + mAxis.dx_postclip);
  421.     } else {
  422.         // process pre-clip region
  423.         if (uint32 count = mAxis.dx_preclip) {
  424.             VDMemset128(p, src, mRowFiltW);
  425.             memcpy(p + 4*mRowFiltW, src+1, (mRowFiltW-1)*sizeof(uint32)*4);
  426.  
  427.             mpRowStage->Process(dst, p, count, u + ((mRowFiltW-1)<<16), dudx);
  428.             u += dudx*count;
  429.             dst += count*4;
  430.         }
  431.  
  432.         // process active region
  433.         if (uint32 count = mAxis.dx_active) {
  434.             mpRowStage->Process(dst, src, count, u, dudx);
  435.             u += dudx*count;
  436.             dst += count*4;
  437.         }
  438.  
  439.         // process post-clip region
  440.         if (uint32 count = mAxis.dx_postclip) {
  441.             uint32 offset = mSrcWidth + 1 - mRowFiltW;
  442.  
  443.             memcpy(p, src+offset*4, (mRowFiltW-1)*sizeof(uint32)*4);
  444.             VDMemset128(p + 4*(mRowFiltW-1), src + 4*(mSrcWidth-1), mRowFiltW);
  445.  
  446.             mpRowStage->Process(dst, p, count, u - (offset<<16), dudx);
  447.             dst += count*4;
  448.         }
  449.     }
  450.  
  451.     // process post-copy region
  452.     if (uint32 count = mAxis.dx_postcopy) {
  453.         VDMemset128(dst, src + 4*(mSrcWidth-1), count);
  454.     }
  455. }
  456.  
  457. ///////////////////////////////////////////////////////////////////////////
  458. //
  459. // VDPixmapGenResampleCol
  460. //
  461. ///////////////////////////////////////////////////////////////////////////
  462.  
  463. VDPixmapGenResampleCol::VDPixmapGenResampleCol()
  464.     : mpColStage(NULL)
  465. {
  466. }
  467.  
  468. VDPixmapGenResampleCol::~VDPixmapGenResampleCol() {
  469.     if (mpColStage)
  470.         delete mpColStage;
  471. }
  472.  
  473. void VDPixmapGenResampleCol::Init(IVDPixmapGen *src, uint32 srcIndex, uint32 height, float offset, float step, nsVDPixmap::FilterMode filterMode, float filterFactor, bool interpolationOnly) {
  474.     InitSource(src, srcIndex);
  475.  
  476.     sint32 dvdy = (sint32)(step * 65536.0);
  477.  
  478.     mAxis.Init(dvdy);
  479.  
  480.     // construct stages
  481.     double y_2fc = 1.0;
  482.     if (!interpolationOnly && step > 1.0f)
  483.         y_2fc = 1.0 / step;
  484.  
  485.     struct SpanRoutine {
  486.         uint32 mType;
  487.         bool mbInterpOnly;
  488.         nsVDPixmap::FilterMode mFilterMode;
  489.         uint32 mCPUFlags;
  490.         IVDResamplerSeparableColStage *(*mpClassFactory)(double filterCutoff, float filterFactor);
  491.     };
  492.     
  493.     static const SpanRoutine kSpanRoutines[]={
  494. #if defined _M_IX86
  495.         // X86
  496.         { kVDPixType_8,            false,    nsVDPixmap::kFilterLinear,        CPUF_SUPPORTS_SSE41,    ColFactoryLinear<VDResamplerSeparableTableColStage8SSE41> },
  497.         { kVDPixType_8,            false,    nsVDPixmap::kFilterLinear,        CPUF_SUPPORTS_MMX,    ColFactoryLinear<VDResamplerSeparableTableColStage8MMX> },
  498.         { kVDPixType_8888,        true,    nsVDPixmap::kFilterLinear,        CPUF_SUPPORTS_MMX,    ColFactory<VDResamplerSeparableLinearColStageMMX> },
  499.         { kVDPixType_8888,        false,    nsVDPixmap::kFilterLinear,        CPUF_SUPPORTS_SSE2,    ColFactoryLinear<VDResamplerSeparableTableColStageSSE2> },
  500.         { kVDPixType_8888,        false,    nsVDPixmap::kFilterLinear,        CPUF_SUPPORTS_MMX,    ColFactoryLinear<VDResamplerSeparableTableColStageMMX> },
  501.         { kVDPixType_8,            false,    nsVDPixmap::kFilterCubic,        CPUF_SUPPORTS_SSE41,    ColFactoryCubic<VDResamplerSeparableTableColStage8SSE41> },
  502.         { kVDPixType_8,            false,    nsVDPixmap::kFilterCubic,        CPUF_SUPPORTS_MMX,    ColFactoryCubic<VDResamplerSeparableTableColStage8MMX> },
  503.         { kVDPixType_8888,        true,    nsVDPixmap::kFilterCubic,        CPUF_SUPPORTS_SSE2,    ColFactoryCubic2<VDResamplerSeparableCubicColStageSSE2> },
  504.         { kVDPixType_8888,        true,    nsVDPixmap::kFilterCubic,        CPUF_SUPPORTS_MMX,    ColFactoryCubic2<VDResamplerSeparableCubicColStageMMX> },
  505.         { kVDPixType_8888,        false,    nsVDPixmap::kFilterCubic,        CPUF_SUPPORTS_SSE2,    ColFactoryCubic<VDResamplerSeparableTableColStageSSE2> },
  506.         { kVDPixType_8888,        false,    nsVDPixmap::kFilterCubic,        CPUF_SUPPORTS_MMX,    ColFactoryCubic<VDResamplerSeparableTableColStageMMX> },
  507.         { kVDPixType_8,            false,    nsVDPixmap::kFilterLanczos3,    CPUF_SUPPORTS_SSE41,    ColFactoryLanczos3<VDResamplerSeparableTableColStage8SSE41> },
  508.         { kVDPixType_8,            false,    nsVDPixmap::kFilterLanczos3,    CPUF_SUPPORTS_MMX,    ColFactoryLanczos3<VDResamplerSeparableTableColStage8MMX> },
  509.         { kVDPixType_8888,        false,    nsVDPixmap::kFilterLanczos3,    CPUF_SUPPORTS_SSE2,    ColFactoryLanczos3<VDResamplerSeparableTableColStageSSE2> },
  510.         { kVDPixType_8888,        false,    nsVDPixmap::kFilterLanczos3,    CPUF_SUPPORTS_MMX,    ColFactoryLanczos3<VDResamplerSeparableTableColStageMMX> },
  511. #elif defined _M_AMD64
  512.         // AMD64
  513.         { kVDPixType_8888,        false,    nsVDPixmap::kFilterLinear,        CPUF_SUPPORTS_SSE2,    ColFactoryLinear<VDResamplerSeparableTableColStageSSE2> },
  514.         { kVDPixType_8888,        false,    nsVDPixmap::kFilterCubic,        CPUF_SUPPORTS_SSE2,    ColFactoryCubic<VDResamplerSeparableTableColStageSSE2> },
  515.         { kVDPixType_8888,        false,    nsVDPixmap::kFilterLanczos3,    CPUF_SUPPORTS_SSE2,    ColFactoryLanczos3<VDResamplerSeparableTableColStageSSE2> },
  516. #endif
  517.         // Generic
  518.         { kVDPixType_8,            true,    nsVDPixmap::kFilterLinear,        0,                    ColFactory<VDResamplerColStageSeparableLinear8> },
  519.         { kVDPixType_8888,        true,    nsVDPixmap::kFilterLinear,        0,                    ColFactory<VDResamplerColStageSeparableLinear32> },
  520.         { kVDPixType_8,            false,    nsVDPixmap::kFilterLinear,        0,                    ColFactoryLinear<VDResamplerColStageSeparableTable8> },
  521.         { kVDPixType_8888,        false,    nsVDPixmap::kFilterLinear,        0,                    ColFactoryLinear<VDResamplerColStageSeparableTable32> },
  522.         { kVDPixType_32F_LE,    false,    nsVDPixmap::kFilterLinear,        0,                    ColFactoryLinear<VDResamplerColStageSeparableTable32F> },
  523.         { kVDPixType_32Fx4_LE,    false,    nsVDPixmap::kFilterLinear,        0,                    ColFactoryLinear<VDResamplerColStageSeparableTable32Fx4> },
  524.         { kVDPixType_8,            false,    nsVDPixmap::kFilterCubic,        0,                    ColFactoryCubic<VDResamplerColStageSeparableTable8> },
  525.         { kVDPixType_8888,        false,    nsVDPixmap::kFilterCubic,        0,                    ColFactoryCubic<VDResamplerColStageSeparableTable32> },
  526.         { kVDPixType_32F_LE,    false,    nsVDPixmap::kFilterCubic,        0,                    ColFactoryCubic<VDResamplerColStageSeparableTable32F> },
  527.         { kVDPixType_32Fx4_LE,    false,    nsVDPixmap::kFilterCubic,        0,                    ColFactoryCubic<VDResamplerColStageSeparableTable32Fx4> },
  528.         { kVDPixType_8,            false,    nsVDPixmap::kFilterLanczos3,    0,                    ColFactoryLanczos3<VDResamplerColStageSeparableTable8> },
  529.         { kVDPixType_8888,        false,    nsVDPixmap::kFilterLanczos3,    0,                    ColFactoryLanczos3<VDResamplerColStageSeparableTable32> },
  530.         { kVDPixType_32F_LE,    false,    nsVDPixmap::kFilterLanczos3,    0,                    ColFactoryLanczos3<VDResamplerColStageSeparableTable32F> },
  531.         { kVDPixType_32Fx4_LE,    false,    nsVDPixmap::kFilterLanczos3,    0,                    ColFactoryLanczos3<VDResamplerColStageSeparableTable32Fx4> },
  532.     };
  533.  
  534.     long flags = CPUGetEnabledExtensions();
  535.     uint32 type = src->GetType(srcIndex) & kVDPixType_Mask;
  536.     for(int i=0; i<sizeof(kSpanRoutines)/sizeof(kSpanRoutines[0]); ++i) {
  537.         const SpanRoutine& rout = kSpanRoutines[i];
  538.  
  539.         if (rout.mType != type)
  540.             continue;
  541.  
  542.         if (rout.mbInterpOnly && y_2fc < 1.0)
  543.             continue;
  544.  
  545.         if (rout.mFilterMode != filterMode)
  546.             continue;
  547.  
  548.         if ((rout.mCPUFlags & flags) != rout.mCPUFlags)
  549.             continue;
  550.  
  551.         mpColStage = rout.mpClassFactory(y_2fc, filterFactor);
  552.         break;
  553.     }
  554.  
  555.     mWinSize = mpColStage ? mpColStage->GetWindowSize() : 1;
  556.     mWindow.resize(mWinSize);
  557.  
  558.     int delta = (mWinSize + 1) >> 1;
  559.     mpSrc->AddWindowRequest(-delta, delta);
  560.  
  561.     sint32 fsy1 = (sint32)(offset * 65536.0) - ((mWinSize-1)<<15);
  562.     mAxis.Compute(height, fsy1, mSrcHeight, mWinSize);
  563.     mHeight = height;
  564.  
  565.     switch(type) {
  566.         case kVDPixType_8:
  567.             mBytesPerSample = 1;
  568.             break;
  569.         case kVDPixType_8888:
  570.         case kVDPixType_32F_LE:
  571.             mBytesPerSample = 4;
  572.             break;
  573.         case kVDPixType_32Fx4_LE:
  574.             mBytesPerSample = 16;
  575.             break;
  576.  
  577.         default:
  578.             VDASSERT(false);
  579.     }
  580. }
  581.  
  582. void VDPixmapGenResampleCol::Start() {
  583.     mBytesPerRow = mWidth * mBytesPerSample;
  584.     StartWindow(mBytesPerRow);
  585. }
  586.  
  587. void VDPixmapGenResampleCol::Compute(void *dst0, sint32 y) {
  588.     const uint32 winsize = mWinSize;
  589.     const uint32 dx = mSrcWidth;
  590.  
  591.     y -= (sint32)mAxis.dx_precopy;
  592.  
  593.     if (y < 0) {
  594.         const void *srcrow0 = mpSrc->GetRow(0, mSrcIndex);
  595.         memcpy(dst0, srcrow0, mBytesPerRow);
  596.         return;
  597.     }
  598.  
  599.     uint32 midrange = mAxis.dx_preclip + mAxis.dx_active + mAxis.dx_postclip + mAxis.dx_dualclip;
  600.  
  601.     if (y < (sint32)midrange) {
  602.         sint32 v = mAxis.u + mAxis.dudx * y;
  603.  
  604.         if (mpColStage) {
  605.             for(uint32 i=0; i<winsize; ++i) {
  606.                 int sy = (v >> 16) + i;
  607.  
  608.                 if ((unsigned)sy >= (unsigned)mSrcHeight)
  609.                     sy = (~sy >> 31) & (mSrcHeight - 1);
  610.  
  611.                 mWindow[i] = mpSrc->GetRow(sy, mSrcIndex);
  612.             }
  613.  
  614.             mpColStage->Process(dst0, mWindow.data(), dx, v);
  615.         } else
  616.             memcpy(dst0, mpSrc->GetRow(v >> 16, mSrcIndex), mBytesPerRow);
  617.         return;
  618.     }
  619.  
  620.     const void *p = mpSrc->GetRow(mSrcHeight - 1, mSrcIndex);
  621.  
  622.     memcpy(dst0, p, mBytesPerRow);
  623. }
  624.