home *** CD-ROM | disk | FTP | other *** search
/ Computer Shopper 275 / DPCS0111DVD.ISO / Toolkit / Audio-Visual / VirtualDub / Source / VirtualDub-1.9.10-src.7z / src / Priss / source / layer3.cpp < prev    next >
Encoding:
C/C++ Source or Header  |  2009-09-14  |  46.0 KB  |  1,585 lines

  1. //    Priss (NekoAmp 2.0) - MPEG-1/2 audio decoding library
  2. //    Copyright (C) 2003 Avery Lee
  3. //
  4. //    This program is free software; you can redistribute it and/or modify
  5. //    it under the terms of the GNU General Public License as published by
  6. //    the Free Software Foundation; either version 2 of the License, or
  7. //    (at your option) any later version.
  8. //
  9. //    This program is distributed in the hope that it will be useful,
  10. //    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12. //    GNU General Public License for more details.
  13. //
  14. //    You should have received a copy of the GNU General Public License
  15. //    along with this program; if not, write to the Free Software
  16. //    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17. //
  18. ///////////////////////////////////////////////////////////////////////////
  19. //
  20. // MPEG-1/2 layer III decoding
  21. //
  22. // Layer III is fugly compared to layer I/II.  The general stages of decoding
  23. // are:
  24. //
  25. // 1) Sliding window (primitive VBR)
  26. //
  27. //    Data regions of each frame are interpreted as a continuous stream and
  28. //    the encoded values for a frame start 0-511 bytes behind the data region
  29. //    for that frame.
  30. //
  31. // 2) Huffman decoding
  32. //
  33. //    Layer III granules encode 576 frequency lines, composed of 32 subbands
  34. //    of 18 samples each.  The 576 values are broken into four regions: two
  35. //    encoded in pairs using two huffman tables, a third composed of only
  36. //    -1/0/+1 in quadruples, and the highest part of the spectrum all zeroes.
  37. //    When this stage fails, the whole frame is trashed.
  38. //
  39. // 3) Dequantization
  40. //
  41. //    Decoded values are pushed through a nonlinear power ramp and then
  42. //    modulated by scalefactors.  Scalefactors group frequency lines -- not
  43. //    necessarily dividing cleanly between subbands -- differently depending
  44. //    on the block type:
  45. //
  46. //    A) long blocks (usual case) - 21 scalefactor bands
  47. //    B) short blocks (transients) - 12 scalefactor bands for three time
  48. //                                   windows (6 samples each)
  49. //    C) mixed blocks - 2 long bands and 3 windows of 9 bands each.
  50. //
  51. //    Short blocks seldom occur and are not really worth optimizing (one
  52. //    problem is that they only encode one-third of the frequency range).
  53. //    I've never seen mixed blocks.
  54. //
  55. // 4) Stereo processing
  56. //
  57. //    Layer III supports two types of joint stereo, mid/side (MS) and
  58. //    intensity stereo (IS).  In MS, a simple butterfly is used to convert
  59. //    sum and difference signals into stereo; in IS, a mono channel is
  60. //    cast to stereo with different amplitudes on the left and right.
  61. //    When both MS and IS are active, the switch from MS to IS occurs when
  62. //    the data for the right channel ends.
  63. //
  64. //    Priss currently has a small bug in MPEG-2 IS processing: unlike
  65. //    MPEG-1, short blocks in MPEG-2 have the MS-to-IS switch point tracked
  66. //    per window.  MPEG-2 audio with intensity stereo is a bit hard to
  67. //    come by (the LAME build I have doesn't do it), and even harder to get
  68. //    into an MPEG-1 video file, so I don't have a test case for this.
  69. //
  70. // 5) Antialiasing
  71. //
  72. //    I don't really understand this, but rotations are performed between
  73. //    frequency lines in adjacent subbands.  Without it, a sort of "halo"
  74. //    echo effect is heard.  Next.
  75. //
  76. // 6) IMDCT
  77. //
  78. //    The IMDCT, like other DCT transforms, has infinite basis functions and
  79. //    thus performs poorly with transients.  Short blocks combat this by
  80. //    slicing the IMDCT into thirds, thus reducing the length of preecho.
  81. //    Slower transients can also be modeled with the regular 18-point IMDCT
  82. //    by using the attack and release windows (types 1 and 3) instead of
  83. //    the regular sinusoidal window (type 0).
  84. //
  85. // 7) Polyphase filter
  86. //
  87. //    As with layer I and II, it all ends here.  MPEG-1 produces two
  88. //    granules for 1152 samples and MPEG-2 produces one granule of 576
  89. //    samples.
  90.  
  91. #include <math.h>
  92. #include <float.h>
  93. #include "engine.h"
  94. #include "bitreader.h"
  95.  
  96. //#define RDTSC_PROFILE
  97.  
  98. #ifdef RDTSC_PROFILE
  99.  
  100.     #include <windows.h>
  101.  
  102.     static long p_lasttime;
  103.     static long p_frames=0;
  104.     static __int64 p_total=0;
  105.     static __int64 p_scalefac=0;
  106.     static __int64 p_huffdec=0;
  107.     static __int64 p_huffdec1=0;
  108.     static __int64 p_dequan=0;
  109.     static __int64 p_stereo=0;
  110.     static __int64 p_hybrid=0;
  111.     static __int64 p_polyphase=0;
  112.  
  113.     static void __inline profile_set(int) {
  114.         __asm {
  115.             rdtsc
  116.             mov        p_lasttime,eax
  117.         };
  118.     }
  119.  
  120.     static void __inline profile_add(__int64& counter) {
  121.         long diff;
  122.  
  123.         __asm {
  124.             rdtsc
  125.             sub        eax,p_lasttime
  126.             mov        diff,eax
  127.         }
  128.  
  129.         counter += diff;
  130.         p_total += diff;
  131.  
  132.         __asm {
  133.             rdtsc
  134.             mov        p_lasttime,eax
  135.  
  136.         }
  137.     }
  138. #else
  139.  
  140.     #define profile_set(x)
  141.     #define profile_add(x)
  142.  
  143. #endif
  144.  
  145.  
  146. namespace {
  147.     struct LayerIIIRegionSideInfo {
  148.         unsigned    part2_3_length;
  149.         unsigned    big_values;
  150.         unsigned    global_gain;
  151.         unsigned    scalefac_compress;
  152.         bool        window_switching_flag;
  153.         uint8    table_select[3];    // 3 for unswitched, 2 for switched
  154.         union {
  155.             struct {
  156.                 uint8    block_type;
  157.                 bool    mixed_block_flag;
  158.                 uint8    subblock_gain[3];
  159.             } switched;
  160.             struct {
  161.                 uint8    region0_count;
  162.                 uint8    region1_count;
  163.             } unswitched;
  164.         };
  165.         bool        preflag;
  166.         bool        scalefac_scale;
  167.         bool        count1table_select;
  168.     };
  169.     struct LayerIIISideInfo {
  170.         unsigned    main_data_begin;
  171.         bool        scfsi[2][4];
  172.         LayerIIIRegionSideInfo gr[2][2];
  173.     };
  174.  
  175.     void DecodeSideInfoMPEG1(LayerIIISideInfo& si, const uint8 *src, unsigned nch) {
  176.         VDMPEGAudioBitReader bits(src, 33);
  177.         unsigned gr, ch, i;
  178.  
  179.         si.main_data_begin    = bits.get(9);
  180.  
  181.         bits.get(nch>1 ? 3 : 5);        // skip private bits
  182.  
  183.         // read scale factor selectors (ch x 4 bits)
  184.         for(ch=0; ch<nch; ++ch)
  185.             for(i=0; i<4; ++i)
  186.                 si.scfsi[ch][i] = bits.getbool();
  187.  
  188.         // read global regions (2 x ch x 56 bits)
  189.         for(gr=0; gr<2; ++gr) {
  190.             for(ch=0; ch<nch; ++ch) {
  191.                 LayerIIIRegionSideInfo& rsi = si.gr[gr][ch];
  192.  
  193.                 rsi.part2_3_length            = bits.get(12);
  194.                 rsi.big_values                = bits.get(9);
  195.                 rsi.global_gain                = bits.get(8);
  196.                 rsi.scalefac_compress        = bits.get(4);
  197.                 rsi.window_switching_flag    = bits.getbool();
  198.  
  199.                 if (rsi.window_switching_flag) {
  200.                     rsi.switched.block_type            = bits.get(2);
  201.                     rsi.switched.mixed_block_flag    = bits.getbool();
  202.                     for(unsigned r=0; r<2; ++r)
  203.                         rsi.table_select[r] = bits.get(5);
  204.                     rsi.table_select[2] = 0;
  205.                     for(unsigned w=0; w<3; ++w)
  206.                         rsi.switched.subblock_gain[w] = bits.get(3);
  207.                 } else {
  208.                     for(unsigned r=0; r<3; ++r)
  209.                         rsi.table_select[r] = bits.get(5);
  210.                     rsi.unswitched.region0_count = bits.get(4);
  211.                     rsi.unswitched.region1_count = bits.get(3);
  212.                 }
  213.  
  214.                 rsi.preflag = bits.getbool();
  215.                 rsi.scalefac_scale    = bits.getbool();
  216.                 rsi.count1table_select = bits.getbool();
  217.             }
  218.         }
  219.     }
  220.     
  221.     void DecodeSideInfoMPEG2(LayerIIISideInfo& si, const uint8 *src, unsigned nch) {
  222.         VDMPEGAudioBitReader bits(src, 33);
  223.  
  224.         si.main_data_begin    = bits.get(8);
  225.  
  226.         bits.get(nch>1 ? 2 : 1);        // skip private bits
  227.  
  228.         for(unsigned ch=0; ch<nch; ++ch) {        // 63 bits per channel
  229.             LayerIIIRegionSideInfo& rsi = si.gr[0][ch];
  230.  
  231.             rsi.part2_3_length            = bits.get(12);
  232.             rsi.big_values                = bits.get(9);
  233.             rsi.global_gain                = bits.get(8);
  234.             rsi.scalefac_compress        = bits.get(9);
  235.             rsi.window_switching_flag    = bits.getbool();
  236.  
  237.             if (rsi.window_switching_flag) {
  238.                 rsi.switched.block_type            = bits.get(2);
  239.                 rsi.switched.mixed_block_flag    = bits.getbool();
  240.                 for(unsigned r=0; r<2; ++r)
  241.                     rsi.table_select[r] = bits.get(5);
  242.                 rsi.table_select[2] = 0;
  243.                 for(unsigned w=0; w<3; ++w)
  244.                     rsi.switched.subblock_gain[w] = bits.get(3);
  245.             } else {
  246.                 for(unsigned r=0; r<3; ++r)
  247.                     rsi.table_select[r] = bits.get(5);
  248.                 rsi.unswitched.region0_count = bits.get(4);
  249.                 rsi.unswitched.region1_count = bits.get(3);
  250.             }
  251.  
  252.             rsi.scalefac_scale    = bits.getbool();
  253.             rsi.count1table_select = bits.getbool();
  254.         }
  255.     }
  256.  
  257.     static struct costable_t {
  258.         costable_t() {
  259.             for(unsigned i=0; i<36; ++i) {
  260.                 for(unsigned j=0; j<18; ++j) {
  261.                     v[i][j] = (float)cos((3.1415926535897932 / 72) * (i+i+1+18) * (j+j+1));
  262.                 }
  263.             }
  264.         }
  265.  
  266.         float v[36][18];
  267.     } costable;
  268.  
  269.     ////////////////////////////////////
  270.  
  271. #if 0
  272.     void IMDCT_18(const float *in, float (*out)[2][32], float *overlap, const float *window) {
  273.         float t[36];
  274.  
  275.         for(unsigned i=0; i<36; ++i) {
  276.             double s=0;
  277.             for(unsigned j=0; j<18; ++j) {
  278.                 s += in[j] * costable.v[i][j]; //cos((3.1415926535897932 / 72) * (i+i+1+18) * (j+j+1));
  279.             }
  280.  
  281.             t[i] = (float)s;
  282.         }
  283.  
  284.         for(unsigned k=0; k<18; ++k) {
  285.             out[k][0][0] = overlap[k] + t[k]*window[k];
  286.             overlap[k] = t[k+18]*window[k+18];
  287.         }
  288.     }
  289. #else
  290.  
  291.     // The algorithm for this IMDCT comes from HP Laboratories report
  292.     // HPL-2000-66, "Faster MPEG-1 Layer III Audio Decoding" by
  293.     // Scott B. Marovich.  It first uses a technique from the Lee
  294.     // decomposition to shift the IMDCT into an IDCT, then factors
  295.     // the 18-point IDCT down to 4-point and 5-point IDCTs.  Profiling
  296.     // seems to indicate that this is not as fast as FreeAmp's IMDCT,
  297.     // which appears to use a direct IMDCT factorization instead, but
  298.     // layer III decoding is rather rare in MPEG-1 video files and
  299.     // this runs fast enough.
  300.  
  301.     void IMDCT_9(const float (*b)[2], float *d) {
  302.         static const float c1 = 0.93969262078591f;    // cos(1*(pi/9))
  303.         static const float c2 = 0.76604444311898f;    // cos(2*(pi/9))
  304.         static const float c4 = 0.17364817766693f;    // cos(4*(pi/9))
  305.  
  306.         // 'g' stage (5-point IDCT)
  307.         const float G0 = b[0][0];
  308.         const float G1 = b[2][0];
  309.         const float G2 = b[4][0];
  310.         const float G3 = b[6][0];
  311.         const float G4 = b[8][0];
  312.         const float x0 = G3*0.5f + G0;
  313.         const float x1 = G0 - G3;
  314.         const float x2 = G1 - G2 - G4;
  315.         const float g0 = x0 + c1*G1 + c2*G2 + c4*G4;
  316.         const float g1 = x2*0.5f + x1;
  317.         const float g2 = x0 - c4*G1 - c1*G2 + c2*G4;
  318.         const float g3 = x0 - c2*G1 + c4*G2 - c1*G4;
  319.         const float g4 = x1 - x2;
  320.  
  321.         // 'h prime' stage (4-point IDCT)
  322.         static const float odd[4]={
  323.             0.50771330594287f,
  324.             0.57735026918963f,
  325.             0.77786191343021f,
  326.             1.46190220008154f
  327.         };
  328.  
  329.         static const float covals[4]={    // odd[i] * sin((pi/18)*(2i+1))*(-1)^i
  330.             0.08816349035423f,
  331.             -0.28867513459481f,
  332.             0.59587679629710f,
  333.             -1.37373870972731f,
  334.         };
  335.  
  336.         const float H0 = b[1][0];
  337.         const float H1 = b[1][0] + b[3][0];
  338.         const float H2 = b[3][0] + b[5][0];
  339.         const float H3 = b[5][0] + b[7][0];
  340.         const float y0 = H3*0.5f + H0;
  341.         const float y1 = H1 - H2;
  342.         const float h0 = odd[0]*(y0 + c1*H1 + c2*H2) + b[7][0]*covals[0];
  343.         const float h1 = odd[1]*(0.5f*y1 + H0 - H3)  + b[7][0]*covals[1];
  344.         const float h2 = odd[2]*(y0 - c4*H1 - c1*H2) + b[7][0]*covals[2];
  345.         const float h3 = odd[3]*(y0 - c2*H1 + c4*H2) + b[7][0]*covals[3];
  346.  
  347.         d[0] = g0 + h0;
  348.         d[1] = g1 + h1;
  349.         d[2] = g2 + h2;
  350.         d[3] = g3 + h3;
  351.         d[4] = g4;
  352.         d[5] = g3 - h3;
  353.         d[6] = g2 - h2;
  354.         d[7] = g1 - h1;
  355.         d[8] = g0 - h0;
  356.     }
  357.  
  358.     void IMDCT_18(const float *in, float (*out)[2][32], float *overlap, const float *window) {
  359.         static const float pi = 3.1415926535897932384626433832795f;
  360.  
  361.         float a[18];
  362.         float t[18];
  363.  
  364.         a[0] = in[0];
  365.         a[1] = in[0]+in[1];
  366.         a[2] = in[1]+in[2];
  367.         a[3] = in[2]+in[3];
  368.         a[4] = in[3]+in[4];
  369.         a[5] = in[4]+in[5];
  370.         a[6] = in[5]+in[6];
  371.         a[7] = in[6]+in[7];
  372.         a[8] = in[7]+in[8];
  373.         a[9] = in[8]+in[9];
  374.         a[10] = in[9]+in[10];
  375.         a[11] = in[10]+in[11];
  376.         a[12] = in[11]+in[12];
  377.         a[13] = in[12]+in[13];
  378.         a[14] = in[13]+in[14];
  379.         a[15] = in[14]+in[15];
  380.         a[16] = in[15]+in[16];
  381.         a[17] = in[16]+in[17];
  382.  
  383.         a[17] += a[15];
  384.         a[15] += a[13];
  385.         a[13] += a[11];
  386.         a[11] += a[9];
  387.         a[9] += a[7];
  388.         a[7] += a[5];
  389.         a[5] += a[3];
  390.         a[3] += a[1];
  391.  
  392.         float d[18];
  393.  
  394.         IMDCT_9((float(*)[2])&a[0], d);
  395.         IMDCT_9((float(*)[2])&a[1], d+9);
  396.         static const float coeff_9_to_18[9]={        // 1 / (2 cos((pi/36)(2i+1)))
  397.             0.50190991877167f,
  398.             0.51763809020504f,
  399.             0.55168895948125f,
  400.             0.61038729438073f,
  401.             0.70710678118655f,
  402.             0.87172339781055f,
  403.             1.18310079157625f,
  404.             1.93185165257814f,
  405.             5.73685662283492f,
  406.         };
  407.  
  408.         const float y[9]={
  409.             d[9+0] * coeff_9_to_18[0],
  410.             d[9+1] * coeff_9_to_18[1],
  411.             d[9+2] * coeff_9_to_18[2],
  412.             d[9+3] * coeff_9_to_18[3],
  413.             d[9+4] * coeff_9_to_18[4],
  414.             d[9+5] * coeff_9_to_18[5],
  415.             d[9+6] * coeff_9_to_18[6],
  416.             d[9+7] * coeff_9_to_18[7],
  417.             d[9+8] * coeff_9_to_18[8],
  418.         };
  419.  
  420.         // Note: The second half of this butterfly has been reversed.
  421.         t[ 0] = d[0] + y[0];
  422.         t[ 9] = d[0] - y[0];
  423.         t[ 1] = d[1] + y[1];
  424.         t[10] = d[1] - y[1];
  425.         t[ 2] = d[2] + y[2];
  426.         t[11] = d[2] - y[2];
  427.         t[ 3] = d[3] + y[3];
  428.         t[12] = d[3] - y[3];
  429.         t[ 4] = d[4] + y[4];
  430.         t[13] = d[4] - y[4];
  431.         t[ 5] = d[5] + y[5];
  432.         t[14] = d[5] - y[5];
  433.         t[ 6] = d[6] + y[6];
  434.         t[15] = d[6] - y[6];
  435.         t[ 7] = d[7] + y[7];
  436.         t[16] = d[7] - y[7];
  437.         t[ 8] = d[8] + y[8];
  438.         t[17] = d[8] - y[8];
  439.  
  440.         // multiplication to convert idct to imdct has already been folded
  441.         // into the windows
  442.  
  443.         // y[0..8]   =  x[9..17]  = t[17..9]
  444.         // y[9..17]  = -x[17..9]  = -t[9..17]
  445.         // y[18..26] = -x[8..0]   = -t[8..0]        (negation folded into window)
  446.         // y[27..35] = -x[0..8]   = -t[0..8]        (negation folded into window)
  447.  
  448. #if 0
  449.         for(unsigned k=0; k<9; ++k) {
  450.             out[k][0][0] = overlap[k] + t[17-k]*window[k];
  451.             out[k+9][0][0] = overlap[k+9] - t[k+9]*window[k+9];
  452.             overlap[k] = t[8-k]*window[k+18];
  453.             overlap[k+9] = t[k]*window[k+27];
  454.         }
  455. #else
  456.         out[0][0][0] = overlap[0] + t[17]*window[0];
  457.         out[1][0][0] = overlap[1] + t[16]*window[1];
  458.         out[2][0][0] = overlap[2] + t[15]*window[2];
  459.         out[3][0][0] = overlap[3] + t[14]*window[3];
  460.         out[4][0][0] = overlap[4] + t[13]*window[4];
  461.         out[5][0][0] = overlap[5] + t[12]*window[5];
  462.         out[6][0][0] = overlap[6] + t[11]*window[6];
  463.         out[7][0][0] = overlap[7] + t[10]*window[7];
  464.         out[8][0][0] = overlap[8] + t[ 9]*window[8];
  465.         out[9][0][0] = overlap[9] - t[ 9]*window[9];
  466.         out[10][0][0] = overlap[10] - t[10]*window[10];
  467.         out[11][0][0] = overlap[11] - t[11]*window[11];
  468.         out[12][0][0] = overlap[12] - t[12]*window[12];
  469.         out[13][0][0] = overlap[13] - t[13]*window[13];
  470.         out[14][0][0] = overlap[14] - t[14]*window[14];
  471.         out[15][0][0] = overlap[15] - t[15]*window[15];
  472.         out[16][0][0] = overlap[16] - t[16]*window[16];
  473.         out[17][0][0] = overlap[17] - t[17]*window[17];
  474.         overlap[0] = t[8]*window[18];
  475.         overlap[1] = t[7]*window[19];
  476.         overlap[2] = t[6]*window[20];
  477.         overlap[3] = t[5]*window[21];
  478.         overlap[4] = t[4]*window[22];
  479.         overlap[5] = t[3]*window[23];
  480.         overlap[6] = t[2]*window[24];
  481.         overlap[7] = t[1]*window[25];
  482.         overlap[8] = t[0]*window[26];
  483.         overlap[9] = t[0]*window[27];
  484.         overlap[10] = t[1]*window[28];
  485.         overlap[11] = t[2]*window[29];
  486.         overlap[12] = t[3]*window[30];
  487.         overlap[13] = t[4]*window[31];
  488.         overlap[14] = t[5]*window[32];
  489.         overlap[15] = t[6]*window[33];
  490.         overlap[16] = t[7]*window[34];
  491.         overlap[17] = t[8]*window[35];
  492. #endif
  493.     }
  494.  
  495.     void IMDCT_18_Null(float (*out)[2][32], float *overlap) {
  496.         for(unsigned k=0; k<18; ++k) {
  497.             out[k][0][0] = overlap[k];
  498.             overlap[k] = 0;
  499.         }
  500.     }
  501. #endif
  502.  
  503.     ////////////////////////////////////////////////////////////
  504.  
  505. #if 0
  506.     void IMDCT_6_3(const float *in, float (*out)[2][32], float *overlap, const float *window) {
  507.         float t[24]={0};
  508.  
  509.         unsigned k;
  510.  
  511.         for(unsigned w=0; w<3; ++w) {
  512.             for(unsigned i=0; i<12; ++i) {
  513.                 double s=0;
  514.                 for(unsigned j=0; j<6; ++j) {
  515.                     s += in[j*3] * cos((3.1415926535897932 / 24) * (i+i+1+6) * (j+j+1));
  516.                 }
  517.  
  518.                 t[i+w*6] += (float)s * window[i];
  519.             }
  520.  
  521.             ++in;
  522.         }
  523.  
  524.         for(k=0; k<6; ++k)
  525.             out[k][0][0] = overlap[k];
  526.  
  527.         for(k=0; k<12; ++k) {
  528.             out[k+6][0][0] = overlap[k+6] + t[k];
  529.             overlap[k] = t[k+12];
  530.         }
  531.  
  532.         for(k=0; k<6; ++k)
  533.             overlap[k+12] = 0;
  534.     }
  535. #else
  536.  
  537.     void IMDCT_6(float dst[12], const float src[18]) {
  538. #if 0
  539.         {
  540.             for(unsigned i=0; i<12; ++i) {
  541.                 double s=0;
  542.                 for(unsigned j=0; j<6; ++j) {
  543.                     s += src[j*3] * cos((3.1415926535897932 / 24) * (i+i+1+6) * (j+j+1));
  544.                 }
  545.  
  546.                 dst[i] = (float)s;
  547.             }
  548.         }
  549. #else
  550.         //////////////
  551.  
  552.         static const float root3div2 = 0.86602540378443864676372317075294f;
  553.         static const float pi = 3.1415926535897932384626433832795f;
  554.  
  555.         static const float oddscale[3]={        // 1/(2*cos(pi/12*[1 3 5])) - part of Lee decomposition of 6pt IDCT
  556.             0.51763809020504f,
  557.             0.70710678118655f,
  558.             1.93185165257814f,
  559.         };
  560.  
  561.         static const float finalscale[6]={        // 1/(2*cos(pi/24*[1:2:11])) - part of conversion from IDCT to IMDCT
  562.             0.504314480290076f,
  563.             0.541196100146197f,
  564.             0.630236207005132f,
  565.             0.821339815852291f,
  566.             1.30656296487638f,
  567.             3.83064878777019f,
  568.         };
  569.  
  570.         float a[6]={src[0], src[0]+src[3], src[3]+src[6], src[6]+src[9], src[9]+src[12], src[12]+src[15]};
  571.         float c1[6] = { a[0], a[2], a[4], a[1], a[1]+a[3], a[3]+a[5] };
  572.         float c2[6]={
  573.             c1[0] + c1[1]*root3div2 + c1[2]*0.5f,
  574.             c1[0] - c1[2],
  575.             c1[0] - c1[1]*root3div2 + c1[2]*0.5f,
  576.  
  577.             (c1[3] + c1[4]*root3div2 + c1[5]*0.5f) * oddscale[0],
  578.             (c1[3]                     - c1[5]     ) * oddscale[1],
  579.             (c1[3] - c1[4]*root3div2 + c1[5]*0.5f) * oddscale[2],
  580.         };
  581.         float b[6]={
  582.             (c2[0]+c2[3]) * finalscale[0],
  583.             (c2[1]+c2[4]) * finalscale[1],
  584.             (c2[2]+c2[5]) * finalscale[2],
  585.             (c2[2]-c2[5]) * finalscale[3],
  586.             (c2[1]-c2[4]) * finalscale[4],
  587.             (c2[0]-c2[3]) * finalscale[5],
  588.         };
  589.  
  590.         dst[0] = b[3];
  591.         dst[1] = b[4];
  592.         dst[2] = b[5];
  593.         dst[3] = -b[5];
  594.         dst[4] = -b[4];
  595.         dst[5] = -b[3];
  596.         dst[6] = -b[2];
  597.         dst[7] = -b[1];
  598.         dst[8] = -b[0];
  599.         dst[9] = -b[0];
  600.         dst[10] = -b[1];
  601.         dst[11] = -b[2];
  602. #endif
  603.     }
  604.  
  605.     // This also comes from the Marovich paper.  It's not as optimized, but
  606.     // short blocks are comparatively rare.
  607.  
  608.     void IMDCT_6_3(const float *in, float (*out)[2][32], float *overlap, const float *window) {
  609.         float t[24]={0};
  610.  
  611.         unsigned k;
  612.  
  613.         for(unsigned w=0; w<3; ++w) {
  614.             float u[12];
  615.  
  616.             IMDCT_6(u, in+w);
  617.  
  618.             for(unsigned i=0; i<12; ++i)
  619.                 t[i+w*6] += (float)(u[i] * window[i]);
  620.         }
  621.  
  622.         for(k=0; k<6; ++k)
  623.             out[k][0][0] = overlap[k];
  624.  
  625.         for(k=0; k<12; ++k) {
  626.             out[k+6][0][0] = overlap[k+6] + t[k];
  627.             overlap[k] = t[k+12];
  628.         }
  629.  
  630.         for(k=0; k<6; ++k)
  631.             overlap[k+12] = 0;
  632.     }
  633. #endif
  634.  
  635.     ////////////////////////////////////
  636.  
  637.     static void DecodeScalefactorsMPEG1(const LayerIIISideInfo& sideinfo, VDMPEGAudioHuffBitReader& bits, uint8 *scalefac_l, uint8 *scalefac_s, unsigned gr, unsigned ch) {
  638.         const LayerIIIRegionSideInfo& rsi = sideinfo.gr[gr][ch];
  639.  
  640.         // read in scalefactors
  641.         static const uint8 slen1_table[16]={0,0,0,0,3,1,1,1,2,2,2,3,3,3,4,4};
  642.         static const uint8 slen2_table[16]={0,1,2,3,0,1,2,3,1,2,3,1,2,3,2,3};
  643.         unsigned i;
  644.         const unsigned slen1 = slen1_table[rsi.scalefac_compress];
  645.         const unsigned slen2 = slen2_table[rsi.scalefac_compress];
  646.  
  647.         if (rsi.window_switching_flag && rsi.switched.block_type == 2) {    // short blocks
  648.             // Scalefactors for short blocks are ordered as
  649.             // scalefac_s[sfb][window].  We store them as one big array for
  650.             // convenience.
  651.  
  652.             if (rsi.switched.mixed_block_flag) {
  653.                 for(i=0; i<8; ++i)                        // long bands 0-7
  654.                     *scalefac_l++ = bits.get(slen1);
  655.  
  656.                 scalefac_s += 3*3;
  657.                 for(i=0; i<9; ++i)                        // short bands 3-5
  658.                     *scalefac_s++ = slen1?bits.get(slen1):0;
  659.  
  660.                 for(i=0; i<18; ++i)                        // short bands 6-11
  661.                     *scalefac_s++ = slen2?bits.get(slen2):0;
  662.             } else {
  663.                 for(i=0; i<18; ++i)                        // short bands 0-5
  664.                     *scalefac_s++ = slen1?bits.get(slen1):0;
  665.  
  666.                 for(i=0; i<18; ++i)
  667.                     *scalefac_s++ = slen2?bits.get(slen2):0;
  668.             }
  669.         } else {        // long blocks
  670.             // Long blocks are split into four bands.  Depending on
  671.             // the scalefactor selectors, some bands will share
  672.             // scalefactors between regions, and some won't.
  673.  
  674.             unsigned sfb = 0;
  675.  
  676.             static const unsigned sfblimit[4]={6,11,16,21};
  677.             unsigned slen[4]={slen1,slen1,slen2,slen2};
  678.  
  679.             for(unsigned r=0; r<4; ++r) {
  680.                 if (!gr || !sideinfo.scfsi[ch][r]) {
  681.                     if (unsigned slen_bits = slen[r]) {
  682.                         for(; sfb<sfblimit[r]; ++sfb)
  683.                             scalefac_l[sfb] = bits.get(slen_bits);
  684.                     } else {
  685.                         for(; sfb<sfblimit[r]; ++sfb)
  686.                             scalefac_l[sfb] = 0;
  687.                     }
  688.                 } else
  689.                     sfb = sfblimit[r];
  690.             }
  691.         }
  692.     }
  693.  
  694.     static void DecodeScalefactorsMPEG2(LayerIIIRegionSideInfo& rsi, VDMPEGAudioHuffBitReader& bits, uint8 *scalefac_l, uint8 *scalefac_s, unsigned mode_ext, unsigned ch) {
  695.         // Tables of bit allocations per scalefactor breakdown type,
  696.         // three cases per type (long, short, mixed).  Note that for
  697.         // the mixed case the first six scalefactor bands (long bands)
  698.         // have been subtracted from the table.
  699.  
  700.         static const uint8 is_on_566_tab[3][4] = {{7,7,7,0}, {12,12,12,0}, {0,15,12,0}};
  701.         static const uint8 is_on_444_tab[3][4] = {{6,6,6,3}, {12,9,9,6}, {0,12,9,6}};
  702.         static const uint8 is_on_430_tab[3][4] = {{8,8,5,0}, {15,12,9,0}, {0,18,9,0}};
  703.         static const uint8 is_off_5544_tab[3][4] = {{6,5,5,5}, {9,9,9,9}, {0,9,9,9}};
  704.         static const uint8 is_off_5540_tab[3][4] = {{6,5,7,3}, {9,9,12,6}, {0,9,12,6}};
  705.         static const uint8 is_off_4300_tab[3][4] = {{11,10,0,0}, {18,18,0,0}, {9,18,0,0}};
  706.  
  707.         unsigned v = rsi.scalefac_compress;
  708.         unsigned set_bits[4];
  709.  
  710.         rsi.preflag = 0;
  711.  
  712.         const uint8 *set_count;
  713.         const unsigned windowtype = rsi.window_switching_flag && rsi.switched.block_type==2 ? rsi.switched.mixed_block_flag ? 2 : 1 : 0;
  714.  
  715.         if ((mode_ext & 1) && ch) {    // second channel w/ intensity stereo enabled
  716.             v >>= 1;            // LSB is used for intensity scale.
  717.  
  718.             if (v < 180) {            // 5 x 6 x 6 = 180 types
  719.                 set_bits[0] = v / 36;
  720.                 set_bits[1] = (v % 36) / 6;
  721.                 set_bits[2] = v % 6;
  722.                 set_bits[3] = 0;
  723.                 set_count = is_on_566_tab[windowtype];
  724.             } else if (v < 244) {    // 4 x 4 x 4 = 64 types
  725.                 v -= 180;
  726.                 set_bits[0] = (v>>4) & 3;
  727.                 set_bits[1] = (v>>2) & 3;
  728.                 set_bits[2] = v & 3;
  729.                 set_bits[3] = 0;
  730.                 set_count = is_on_444_tab[windowtype];
  731.             } else {                // 4 x 3 = 12 types
  732.                 v -= 244;
  733.                 set_bits[0] = v / 3;
  734.                 set_bits[1] = v % 3;
  735.                 set_bits[2] = 0;
  736.                 set_bits[3] = 0;
  737.                 set_count = is_on_430_tab[windowtype];
  738.             }
  739.         } else {                    // first channel or intensity stereo disabled
  740.             if (v < 400) {            // 5 x 5 x 4 x 4 = 400 types
  741.                 set_bits[0] = (v>>4) / 5;
  742.                 set_bits[1] = (v>>4) % 5;
  743.                 set_bits[2] = (v>>2) & 3;
  744.                 set_bits[3] = v & 3;
  745.                 set_count = is_off_5544_tab[windowtype];
  746.             } else if (v < 500) {    // 5 x 5 x 4 = 100 types
  747.                 v -= 400;
  748.                 set_bits[0] = (v>>2) / 5;
  749.                 set_bits[1] = (v>>2) % 5;
  750.                 set_bits[2] = v & 3;
  751.                 set_bits[3] = 0;
  752.                 set_count = is_off_5540_tab[windowtype];
  753.             } else {                // 4 x 3 = 12 types
  754.                 v -= 500;
  755.                 set_bits[0] = v / 3;
  756.                 set_bits[1] = v % 3;
  757.                 set_bits[2] = 0;
  758.                 set_bits[3] = 0;
  759.                 set_count = is_off_4300_tab[windowtype];
  760.                 rsi.preflag = 1;
  761.             }
  762.         }
  763.  
  764.         uint8 *dst = scalefac_l;
  765.  
  766.         if (rsi.window_switching_flag && rsi.switched.block_type == 2) {
  767.             if (rsi.switched.mixed_block_flag) {
  768.                 // The first six scalefactors, which are in the long band of
  769.                 // mixed blocks, are always in the first set of scalefactors
  770.                 // so we just read them now.
  771.  
  772.                 for(unsigned i=0; i<6; ++i)
  773.                     scalefac_l[i] = set_bits[0] ? bits.get(set_bits[0]) : 0;
  774.  
  775.                 scalefac_s += 9;    // skip first three short bands
  776.             }
  777.  
  778.             dst = scalefac_s;
  779.         }
  780.  
  781.         for(unsigned set=0; set<4; ++set) {
  782.             for(unsigned i=0; i<set_count[set]; ++i)
  783.                 *dst++ = set_bits[set] ? bits.get(set_bits[set]) : 0;
  784.         }
  785.     }
  786.  
  787.     void MidSideButterfly(float *left, float *right, uint32 n) {
  788.         VDASSERT(!(n & 1));
  789.  
  790.         static const float invsqrt2 = 0.70710678118654752440084436210485f;
  791.  
  792.         for(unsigned i=0; i<n; i += 2) {
  793.             const float x0 = left [i+0] * invsqrt2;
  794.             const float y0 = right[i+0] * invsqrt2;
  795.             const float x1 = left [i+1] * invsqrt2;
  796.             const float y1 = right[i+1] * invsqrt2;
  797.  
  798.             left [i+0] = (float)(x0+y0);
  799.             right[i+0] = (float)(x0-y0);
  800.             left [i+1] = (float)(x1+y1);
  801.             right[i+1] = (float)(x1-y1);
  802.         }
  803.     }
  804. }
  805.  
  806. void VDMPEGAudioDecoder::PrereadLayerIII() {
  807.     const bool is_mpeg2 = mHeader.nMPEGVer > 1;
  808.     const unsigned nch = mMode != 3 ? 2 : 1;
  809.  
  810.     // fill Huffman buffer
  811.  
  812.     unsigned sidelen = is_mpeg2 ? (nch>1 ? 17 : 9) : (nch>1 ? 32 : 17);
  813.     unsigned left = mFrameDataSize - sidelen;
  814.     const uint8 *src = mFrameBuffer + sidelen;
  815.  
  816.     while(left > 0) {
  817.         unsigned tc = kL3BufferSize - mL3BufferPos;
  818.         if (tc > left)
  819.             tc = left;
  820.         memcpy(mL3Buffer+mL3BufferPos, src, tc);
  821.         mL3BufferPos = (mL3BufferPos + tc) & (kL3BufferSize-1);
  822.         mL3BufferLevel += tc;
  823.         src += tc;
  824.         left -= tc;
  825.     }
  826.  
  827.     if (mL3BufferLevel > kL3BufferSize)
  828.         mL3BufferLevel = kL3BufferSize;
  829. }
  830.  
  831. bool VDMPEGAudioDecoder::DecodeLayerIII() {
  832.     LayerIIISideInfo sideinfo;
  833.     const unsigned nch = mMode != 3 ? 2 : 1;
  834.     const bool is_mpeg2 = mHeader.nMPEGVer > 1;
  835.  
  836.     if (is_mpeg2)
  837.         DecodeSideInfoMPEG2(sideinfo, mFrameBuffer, nch);
  838.     else
  839.         DecodeSideInfoMPEG1(sideinfo, mFrameBuffer, nch);
  840.  
  841.     const unsigned sidelen = is_mpeg2 ? (nch>1 ? 17 : 9) : (nch>1 ? 32 : 17);
  842.  
  843.     profile_set(0);
  844.  
  845.     uint32 reservoirOffset = sideinfo.main_data_begin + (mFrameDataSize - sidelen);
  846.  
  847.     if (mL3BufferLevel < reservoirOffset)
  848.         return false;
  849.  
  850.     VDMPEGAudioHuffBitReader bits(mL3Buffer, mL3BufferPos - reservoirOffset);
  851.  
  852.     uint8 scalefac_l[2][22]={0};
  853.     uint8 scalefac_s[2][13][3]={0};
  854.  
  855.     // There are only 21 long bands and 12 short bands in MPEG audio, but
  856.     // data above the tables is considered to have a scalefactor of zero.
  857.     // The standard doesn't say whether short block reordering occurs --
  858.     // reference appears to do it, so we'll just pretend there is an
  859.     // extra band when dequantizing.
  860.  
  861.     static const unsigned sLongScalefactorBands[3][3][23]={
  862.         {
  863.             {0,4,8,12,16,20,24,30,36,44,52,62,74,90,110,134,162,196,238,288,342,418,576},    // 44.1KHz
  864.             {0,4,8,12,16,20,24,30,36,42,50,60,72,88,106,128,156,190,230,276,330,384,576},    // 48KHz
  865.             {0,4,8,12,16,20,24,30,36,44,54,66,82,102,126,156,194,240,296,364,448,550,576},    // 32KHz
  866.         },
  867.         {
  868.             {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576},    // 22KHz
  869.             {0,6,12,18,24,30,36,44,54,66,80,96,114,136,162,194,232,278,332,394,464,540,576},    // 24KHz
  870.             {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576},    // 16KHz
  871.         },
  872.         {
  873.             {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576},        // 11KHz
  874.             {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576},        // 12KHz
  875.             {0,12,24,36,48,60,72,88,108,132,160,192,232,280,336,400,476,566,568,570,572,574,576},    // 8KHz
  876.         }
  877.     };
  878.  
  879.     static const unsigned sShortScalefactorBands[3][3][14]={
  880.         {
  881.             {0,4,8,12,16,22,30,40,52,66,84,106,136,192},    // 44.1KHz
  882.             {0,4,8,12,16,22,28,38,50,64,80,100,126,192},    // 48KHz
  883.             {0,4,8,12,16,22,30,42,58,78,104,138,180,192},    // 32KHz
  884.         },
  885.         {
  886.             {0,4,8,12,18,24,32,42,56,74,100,132,174,192},    // 22KHz
  887.             {0,4,8,12,18,26,36,48,62,80,104,136,180,192},    // 24KHz
  888.             {0,4,8,12,18,26,36,48,62,80,104,134,174,192},    // 16KHz
  889.         },
  890.         {
  891.             {0,4,8,12,18,26,36,48,62,80,104,136,174,192},    // 11KHz
  892.             {0,4,8,12,18,26,36,48,62,80,104,136,174,192},    // 12KHz
  893.             {0,8,16,24,36,52,72,96,124,160,162,164,166,192},    // 8KHz
  894.         }
  895.     };
  896.  
  897.     unsigned mpegtype = mHeader.lSamplingFreq < 16000 ? 2 : is_mpeg2 ? 1 : 0;
  898.  
  899.     const unsigned *const pLongBands = sLongScalefactorBands[mpegtype][mSamplingRateIndex];
  900.     const unsigned *const pShortBands = sShortScalefactorBands[mpegtype][mSamplingRateIndex];
  901.  
  902.     for(unsigned gr=0; gr<(is_mpeg2 ? 1U : 2U); ++gr) {
  903.         float (&recon)[2][576] = mL3Data.recon;
  904.  
  905.         unsigned ch;
  906.         unsigned ms_bound;
  907.         unsigned is_band;
  908.         unsigned gr_zero_bound = 0;
  909.  
  910.         for(ch=0; ch<nch; ++ch) {
  911.             const LayerIIIRegionSideInfo& rsi = sideinfo.gr[gr][ch];
  912.             unsigned region_end = bits.pos() + rsi.part2_3_length;
  913.             float (&reconch)[576] = recon[ch];
  914.  
  915.             if (is_mpeg2)
  916.                 DecodeScalefactorsMPEG2(sideinfo.gr[gr][ch], bits, &scalefac_l[ch][0], &scalefac_s[ch][0][0], mModeExtension, ch);
  917.             else
  918.                 DecodeScalefactorsMPEG1(sideinfo, bits, &scalefac_l[ch][0], &scalefac_s[ch][0][0], gr, ch);
  919.  
  920.             VDASSERT(scalefac_l[ch][21] == 0);
  921.             VDASSERT(scalefac_s[ch][12][0] == 0);
  922.  
  923.             profile_add(p_scalefac);
  924.  
  925.             // decode big huffman-encoded samples -- three regions
  926.             sint32 freq[576+4];
  927.             unsigned region1_start;
  928.             unsigned region2_start;
  929.             unsigned count1_start    = 2*rsi.big_values;
  930.  
  931.             if (count1_start > 576)
  932.                 throw (int)ERR_INVALIDDATA;
  933.  
  934.             if (rsi.window_switching_flag) {
  935.                 const unsigned block_type = rsi.switched.block_type;
  936.  
  937.                 // Sadly, although the breakpoint is conveniently at frequency
  938.                 // line 36 in both cases in MPEG-1, in MPEG-2 the long window
  939.                 // has band 8 at line 56 instead, and in MPEG-2.5 the short
  940.                 // window breakpoint is at line 72.
  941.  
  942.                 region1_start = block_type == 2 ? pShortBands[3]*3 : pLongBands[8];
  943.                 region2_start = count1_start;
  944.             } else {
  945.                 // check for invalid regions
  946.                 if (rsi.unswitched.region0_count + rsi.unswitched.region1_count + 2 >= 23)
  947.                     throw (int)ERR_INVALIDDATA;
  948.  
  949.                 region1_start = pLongBands[rsi.unswitched.region0_count + 1];
  950.                 region2_start = pLongBands[rsi.unswitched.region0_count + rsi.unswitched.region1_count + 2];
  951.             }
  952.  
  953.             if (region1_start > count1_start)
  954.                 region1_start = count1_start;
  955.             if (region2_start > count1_start)
  956.                 region2_start = count1_start;
  957.  
  958.             // reject invalid table 14
  959.             if ((rsi.table_select[0] == 4 || rsi.table_select[0] == 14) && region1_start > 0)
  960.                 throw (int)ERR_INVALIDDATA;
  961.             if ((rsi.table_select[1] == 4 || rsi.table_select[1] == 14) && region1_start != region2_start)
  962.                 throw (int)ERR_INVALIDDATA;
  963.             if ((rsi.table_select[2] == 4 || rsi.table_select[2] == 14) && region2_start != count1_start)
  964.                 throw (int)ERR_INVALIDDATA;
  965.  
  966.             if (bits.pos() > region_end)
  967.                 throw (int)ERR_INVALIDDATA;
  968.  
  969.             DecodeHuffmanValues(bits, freq, rsi.table_select[0], region1_start >> 1);
  970.  
  971.             if (bits.pos() > region_end)
  972.                 throw (int)ERR_INVALIDDATA;
  973.  
  974.             DecodeHuffmanValues(bits, freq + region1_start, rsi.table_select[1], (region2_start - region1_start) >> 1);
  975.  
  976.             if (bits.pos() > region_end)
  977.                 throw (int)ERR_INVALIDDATA;
  978.  
  979.             DecodeHuffmanValues(bits, freq + region2_start, rsi.table_select[2], (count1_start - region2_start) >> 1);
  980.  
  981.             //if (bits.pos() > region_end)
  982.             //    throw (int)ERR_INVALIDDATA;
  983.  
  984.             profile_add(p_huffdec);
  985.  
  986.             // decode little huffman-encoded samples
  987.             
  988.             uint32 val = bits.peek(32);
  989.  
  990.             unsigned zero_bound;
  991.             {
  992.                 unsigned i = count1_start;
  993.  
  994.                 if (rsi.count1table_select) {        // inverted 4-bit table
  995.                     while(i<=576-4 && bits.pos() < region_end) {
  996.                         unsigned c = bits.get(4);
  997.                         int w=0, x=0, y=0, z=0;
  998.  
  999.                         if (!(c & 8))
  1000.                             w = bits.get(1) ? -1 : 1;
  1001.                         if (!(c & 4))
  1002.                             x = bits.get(1) ? -1 : 1;
  1003.                         if (!(c & 2))
  1004.                             y = bits.get(1) ? -1 : 1;
  1005.                         if (!(c & 1))
  1006.                             z = bits.get(1) ? -1 : 1;
  1007.  
  1008.                         freq[i++] = w;
  1009.                         freq[i++] = x;
  1010.                         freq[i++] = y;
  1011.                         freq[i++] = z;
  1012.                     }
  1013.                 } else {
  1014.                     const L3HuffmanTableDescriptor& tablec1 = sL3HuffmanTables[32];
  1015.  
  1016.                     while(i<=576-4 && bits.pos() < region_end) {
  1017.                         unsigned idx = 0;
  1018.                         
  1019.                         while(tablec1.table[idx][0])
  1020.                             idx += tablec1.table[idx][bits.get(1)];
  1021.  
  1022.                         unsigned c = tablec1.table[idx][1];
  1023.                         int w=0, x=0, y=0, z=0;
  1024.  
  1025.                         if (c & 8)
  1026.                             w = bits.get(1) ? -1 : 1;
  1027.                         if (c & 4)
  1028.                             x = bits.get(1) ? -1 : 1;
  1029.                         if (c & 2)
  1030.                             y = bits.get(1) ? -1 : 1;
  1031.                         if (c & 1)
  1032.                             z = bits.get(1) ? -1 : 1;
  1033.  
  1034.                         freq[i++] = w;
  1035.                         freq[i++] = x;
  1036.                         freq[i++] = y;
  1037.                         freq[i++] = z;
  1038.                     }
  1039.                 }
  1040.  
  1041.                 zero_bound = i;
  1042.                 while(zero_bound > 0 && !freq[zero_bound-1])
  1043.                     --zero_bound;
  1044.  
  1045.                 if (zero_bound > gr_zero_bound)
  1046.                     gr_zero_bound = zero_bound;
  1047.  
  1048.                 if (ch) {
  1049.                     unsigned sfb;
  1050.  
  1051.                     // hmm... what band is it in?
  1052.  
  1053.                     if (rsi.window_switching_flag && rsi.switched.block_type==2) {
  1054.                         if (rsi.switched.mixed_block_flag && zero_bound <= 36) {    // mixed block - 35 scalefactor bands
  1055.                             const unsigned long_bands = is_mpeg2 ? 6 : 8;
  1056.  
  1057.                             for(sfb = 0; sfb < long_bands; ++sfb)
  1058.                                 if (zero_bound <= pLongBands[sfb])
  1059.                                     break;
  1060.  
  1061.                             ms_bound = pLongBands[sfb];
  1062.                             is_band = sfb;
  1063.                         } else {                                // short block - 12 scalefactor bands + end band
  1064.                             for(sfb = 0; sfb < 13; ++sfb)
  1065.                                 if (zero_bound <= pShortBands[sfb]*3)
  1066.                                     break;
  1067.  
  1068.                             ms_bound = pShortBands[sfb]*3;
  1069.                             is_band = sfb;
  1070.                         }
  1071.                     } else {                                    // long block - 21 scalefactor bands + end band
  1072.                         for(sfb = 0; sfb < 22; ++sfb)
  1073.                             if (zero_bound <= pLongBands[sfb])
  1074.                                 break;
  1075.  
  1076.                         ms_bound = pLongBands[sfb];
  1077.                         is_band = sfb;
  1078.                     }
  1079.                 }
  1080.  
  1081.                 if (i < 576)
  1082.                     memset(freq+i, 0, sizeof(freq[0])*(576-i));
  1083.  
  1084.                 VDASSERT(i >= 574 || bits.pos() == region_end);
  1085.  
  1086.                 bits.seek(region_end);
  1087.             }
  1088.  
  1089.             profile_add(p_huffdec1);
  1090.  
  1091.             // requantization
  1092.  
  1093.             {
  1094.                 static const uint8 pretab[22]={0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,2,2,3,3,3,2};
  1095.                 static const float log2 = 0.693147180559945309417232f;
  1096.  
  1097.                 static const float inv_pow2[36]={
  1098.                     1.0f,                    0.7071067691f,
  1099.                     0.5f,                    0.3535533845f,
  1100.                     0.25f,                    0.1767766923f,
  1101.                     0.125f,                    0.08838834614f,
  1102.                     0.0625f,                0.04419417307f,
  1103.                     0.03125f,                0.02209708653f,
  1104.                     0.015625f,                0.01104854327f,
  1105.                     0.0078125f,                0.005524271633f,
  1106.                     0.00390625f,            0.002762135817f,
  1107.                     0.001953125f,            0.001381067908f,
  1108.                     0.0009765625f,            0.0006905339542f,
  1109.                     0.00048828125f,            0.0003452669771f,
  1110.                     0.000244140625f,        0.0001726334885f,
  1111.                     0.0001220703125f,        8.631674427e-005f,
  1112.                     6.103515625e-005f,        4.315837214e-005f,
  1113.                     3.051757813e-005f,        2.157918607e-005f,
  1114.                     1.525878906e-005f,        1.078959303e-005f,
  1115.                     7.629394531e-006f,        5.394796517e-006f,
  1116.                 };
  1117.  
  1118.                 bool short_blocks = rsi.window_switching_flag && rsi.switched.block_type == 2;
  1119.                 unsigned long_bands = short_blocks ? rsi.switched.mixed_block_flag ? is_mpeg2 ? 6 : 8 : 0 : 21;
  1120.                 float global_gain = expf(log2 * 0.25f*(float)(int)(rsi.global_gain - 210));
  1121.                 int scalefac_shift = rsi.scalefac_scale ? 1 : 0;
  1122.                 float scalefac_scale = rsi.scalefac_scale ? -1.0f : -0.5f;
  1123.                 float gain;
  1124.  
  1125.                 // long bands
  1126.                 unsigned i = 0;
  1127.  
  1128.                 for(unsigned sfb=0; sfb<long_bands; ++sfb) {
  1129.                     unsigned band_end = pLongBands[sfb+1];
  1130.  
  1131.                     if (band_end > zero_bound)
  1132.                         band_end = zero_bound;
  1133.  
  1134.                     int sf = scalefac_l[ch][sfb];        // 0-15
  1135.  
  1136.                     if (rsi.preflag)
  1137.                         sf += pretab[sfb];
  1138.  
  1139.                     gain = global_gain * inv_pow2[sf << scalefac_shift];
  1140.  
  1141.                     if (i >= count1_start) {
  1142.                         if (i >= zero_bound)
  1143.                             break;
  1144.  
  1145.                         const float recon_tab[3]={-gain, 0.f, +gain};
  1146.  
  1147.                         for(; i < band_end; i += 2) {
  1148.                             reconch[i  ] = recon_tab[freq[i  ]+1];
  1149.                             reconch[i+1] = recon_tab[freq[i+1]+1];
  1150.                         }
  1151.                     } else {
  1152.                         for(; i < band_end; ++i) {
  1153.                             VDASSERT(abs(freq[i]) < 8192);
  1154.  
  1155.                             sint32 x = freq[i];
  1156.                             float y;
  1157.  
  1158.                             if ((unsigned)(x+128) < 256)
  1159.                                 y = mL3Pow43Tab[x+128];
  1160.                             else {
  1161. //                                y = powf(fabsf((float)x), 4.0f/3.0f);
  1162.  
  1163.                                 // compute c = |x|^4
  1164.                                 float c = (float)x;
  1165.                                 c = c*c;
  1166.                                 c = c*c;
  1167.  
  1168.                                 // compute initial approximation to c^(-1/3) = x^(-4^3)
  1169.                                 union {
  1170.                                     float f;
  1171.                                     int i;
  1172.                                 } conv = { c };
  1173.  
  1174.                                 conv.i = 0x54a21cf0 - conv.i / 3;
  1175.  
  1176.                                 // refine using two Newton-Raphson iterations -- maximum relative error over
  1177.                                 // [0,8192] is 0.00221%.
  1178.                                 float x = conv.f;
  1179.                                 float x2;
  1180.  
  1181.                                 x2 = x*x;
  1182.                                 x = (4.0f / 3.0f)*x - (c*(1.0f/3.0f))*(x2*x2);
  1183.                                 x2 = x*x;
  1184.                                 x = (4.0f / 3.0f)*x - (c*(1.0f/3.0f))*(x2*x2);
  1185.  
  1186.                                 // compute y ~= x^(4/3)
  1187.                                 y = c * x * x;
  1188.  
  1189.                                 // invert as necessary
  1190.  
  1191.                                 static const float signs[2]={-1,1};
  1192.                                 y *= signs[(freq[i] >> 31) + 1];
  1193.                             }
  1194.  
  1195.                             reconch[i] = y * gain;
  1196.                         }
  1197.                     }
  1198.                 }
  1199.  
  1200.                 if (i < 576)
  1201.                     memset(reconch + i, 0, sizeof(reconch[0])*(576 - i));
  1202.  
  1203.                 if (short_blocks) {
  1204.                     int sfb;
  1205.  
  1206.                     if (rsi.switched.mixed_block_flag)
  1207.                         sfb = 3;
  1208.                     else
  1209.                         sfb = 0;
  1210.  
  1211.                     for(; sfb < 13; ++sfb) {
  1212.                         gain = global_gain;
  1213.  
  1214.                         unsigned i = pShortBands[sfb]*3;
  1215.  
  1216.                         for(unsigned window=0; window<3; ++window) {
  1217.                             int sf = scalefac_s[ch][sfb][window];
  1218.  
  1219.                             gain = global_gain * powf(2.0f, scalefac_scale * sf - 2*rsi.switched.subblock_gain[window]);
  1220.  
  1221.                             unsigned j = pShortBands[sfb]*3 + window;
  1222.  
  1223.                             while(j < pShortBands[sfb+1]*3) {
  1224.                                 float y = (float)pow(fabs((double)freq[i]), 4.0/3.0) * gain;
  1225.  
  1226.                                 VDASSERT(abs(freq[i]) < 8192);
  1227.  
  1228.                                 if (freq[i]<0)
  1229.                                     y = -y;
  1230.  
  1231.                                 reconch[j] = (float)y;
  1232.                                 ++i;
  1233.                                 j += 3;
  1234.                             }
  1235.                         }
  1236.                     }
  1237.                 }
  1238.             }
  1239.  
  1240.             profile_add(p_dequan);
  1241.  
  1242.         }
  1243.  
  1244.         // stereo processing
  1245.         //
  1246.         // We can do this after reordering because short block reordering
  1247.         // swaps around samples within a scalefactor band, and the switch
  1248.         // from MS to IS only occurs between bands.
  1249.         //
  1250.         // XXX: There is still a bug here with MPEG-2 intensity stereo
  1251.         //        decoding -- specifically, the zero bound that switches
  1252.         //        IS on is per-window in MPEG-2, but this code only handles
  1253.         //        MPEG-1 mode. MPEG-2 files with intensity stereo are a
  1254.         //        bit hard to come by since LAME doesn't use IS....
  1255.  
  1256.         if (mMode == 1) {                    // joint stereo mode -- mode_ext = MS/IS
  1257.             static const float invsqrt2 = 0.70710678118654752440084436210485f;
  1258.  
  1259.             if (mModeExtension & 2) {        // mid/side stereo mode enabled
  1260.                 if (mModeExtension == 2)
  1261.                     ms_bound = 576;
  1262.  
  1263.                 MidSideButterfly(recon[0], recon[1], ms_bound);
  1264.             }
  1265.  
  1266.             if (mModeExtension & 1) {        // intensity stereo mode enabled
  1267.  
  1268.                 // Unfortunately, 11172-3 doesn't say whether IS positions
  1269.                 // above 7 are valid; they are reachable in low subbands
  1270.                 // with a low ms_bound.  Most of them are benign except for
  1271.                 // 9, which has infinite gain -- presumably this is the
  1272.                 // "I'm yelling inside your head" position.  Oh well.
  1273.  
  1274.                 static const float is_left_tab[2][16]={
  1275.                     {    // MPEG-1
  1276.                         0.0f,
  1277.                         0.211324865405187f,
  1278.                         0.366025403784439f,
  1279.                         0.5f,
  1280.                         0.633974596215561f,
  1281.                         0.788675134594813f,
  1282.                         1.0f,
  1283.                         1.36602540378444f,        // 7 (illegal)
  1284.                         2.36602540378444f,
  1285.                         0.0f,                    // 9 - -1/0.  Ugh.
  1286.                         -1.36602540378444f,
  1287.                         -0.366025403784439f,
  1288.                         0.0f,
  1289.                         0.211324865405187f,
  1290.                         0.366025403784439f,
  1291.                         0.5f,
  1292.                     },
  1293.                     {    // MPEG-2            powers of 1/root-root-2
  1294.                         1.00000000000000,
  1295.                         0.84089641525371,
  1296.                         1.0,
  1297.                         0.70710678118655,
  1298.                         1.0,
  1299.                         0.59460355750136,
  1300.                         1.0,
  1301.                         0.50000000000000,
  1302.                         1.0,
  1303.                         0.42044820762686,
  1304.                         1.0,
  1305.                         0.35355339059327,
  1306.                         1.0,
  1307.                         0.29730177875068,
  1308.                         1.0,
  1309.                         0.21022410381343,
  1310.                     }
  1311.                 };
  1312.  
  1313.                 static const float is_right_tab[2][16]={
  1314.                     {    // MPEG-1
  1315.                         1.0f,
  1316.                         0.788675134594813f,
  1317.                         0.633974596215561f,
  1318.                         0.5f,
  1319.                         0.366025403784439f,
  1320.                         0.211324865405187f,
  1321.                         0.0f,
  1322.                         -0.366025403784438f,        // 7 (illegal)
  1323.                         -1.36602540378444f,
  1324.                         0.0f,                    // 9 - 1/0.  Ugh.
  1325.                         2.36602540378444f,
  1326.                         1.36602540378444f,
  1327.                         1.0f,
  1328.                         0.788675134594813f,
  1329.                         0.633974596215562f,
  1330.                         0.5f,
  1331.                     },
  1332.                     {    // MPEG-2            powers of 1/root-root-2
  1333.                         1.00000000000000,
  1334.                         1.0,
  1335.                         0.84089641525371,
  1336.                         1.0,
  1337.                         0.70710678118655,
  1338.                         1.0,
  1339.                         0.59460355750136,
  1340.                         1.0,
  1341.                         0.50000000000000,
  1342.                         1.0,
  1343.                         0.42044820762686,
  1344.                         1.0,
  1345.                         0.35355339059327,
  1346.                         1.0,
  1347.                         0.29730177875068,
  1348.                         1.0,
  1349.                     }
  1350.                 };
  1351.  
  1352.                 // PITA: IS can occur in short blocks, and we need to know that
  1353.                 // since the scalefactors are involved.  The right channel
  1354.                 // determines the switch.
  1355.  
  1356.                 const LayerIIIRegionSideInfo& rsi = sideinfo.gr[gr][1];
  1357.  
  1358.                 unsigned long_band_start = 0;
  1359.                 unsigned long_band_end = 0;
  1360.                 unsigned short_band_start = 0;
  1361.                 unsigned short_band_end = 0;
  1362.  
  1363.                 // Frequency lines above the last scalefactor band inherit is_pos[]
  1364.                 // from it.
  1365.  
  1366.                 if (rsi.window_switching_flag && rsi.switched.block_type == 2) {
  1367.                     if (rsi.switched.mixed_block_flag) {
  1368.                         if (ms_bound <= pLongBands[8]) {
  1369.                             long_band_start        = is_band;
  1370.                             long_band_end        = 8;
  1371.                             short_band_start    = 3;
  1372.                         } else {
  1373.                             short_band_start    = is_band;
  1374.                         }
  1375.                         short_band_end        = 13;
  1376.                     } else {
  1377.                         short_band_start    = is_band;
  1378.                         short_band_end        = 13;
  1379.                     }
  1380.                     scalefac_s[1][12][0] = scalefac_s[1][11][0];
  1381.                     scalefac_s[1][12][1] = scalefac_s[1][11][1];
  1382.                     scalefac_s[1][12][2] = scalefac_s[1][11][2];
  1383.                 } else {
  1384.                     // dist10 appears to have a bug -- long blocks can't have IS in sfband 0.
  1385.                     if (!is_band)
  1386.                         ++is_band;
  1387.  
  1388.                     long_band_start = is_band;
  1389.                     long_band_end    = 22;
  1390.  
  1391.                     scalefac_l[1][21] = scalefac_l[1][20];
  1392.                 }
  1393.  
  1394.                 unsigned sfb;
  1395.  
  1396.                 for(sfb=long_band_start; sfb<long_band_end; ++sfb) {
  1397.                     const unsigned is_pos = scalefac_l[1][sfb];
  1398.  
  1399.                     if (is_pos != 7) {
  1400.                         const unsigned band_start = pLongBands[sfb];
  1401.                         const unsigned band_end = pLongBands[sfb+1];
  1402.                         float coleft = is_left_tab[is_mpeg2][is_pos];
  1403.                         float coright = is_right_tab[is_mpeg2][is_pos];
  1404.  
  1405.                         if (is_mpeg2 && (sideinfo.gr[gr][1].scalefac_compress & 1)) {
  1406.                             coleft *= coleft;
  1407.                             coright *= coright;
  1408.                         }
  1409.  
  1410.                         for(unsigned i = band_start; i<band_end; ++i) {
  1411.                             const float x = recon[0][i];
  1412.  
  1413.                             recon[0][i] = x*coleft;
  1414.                             recon[1][i] = x*coright;
  1415.                         }
  1416.                     } else if (mModeExtension & 2) {
  1417.                         const unsigned band_start = pLongBands[sfb];
  1418.                         const unsigned band_end = pLongBands[sfb+1];
  1419.  
  1420.                         for(unsigned i = band_start; i<band_end; ++i) {
  1421.                             const float x = recon[0][i] * invsqrt2;
  1422.                             const float y = recon[1][i] * invsqrt2;
  1423.  
  1424.                             recon[0][i] = x+y;
  1425.                             recon[1][i] = x-y;
  1426.                         }
  1427.                     }
  1428.                 }
  1429.  
  1430.                 for(sfb=short_band_start; sfb<short_band_end; ++sfb) {
  1431.                     for(unsigned window=0; window<3; ++window) {
  1432.                         const unsigned is_pos = scalefac_s[1][sfb][window];
  1433.  
  1434.                         if (is_pos != 7) {
  1435.                             const unsigned band_start = pShortBands[sfb]*3 + window;
  1436.                             const unsigned band_end = pShortBands[sfb+1]*3 + window;
  1437.                             float coleft = is_left_tab[is_mpeg2][is_pos];
  1438.                             float coright = is_right_tab[is_mpeg2][is_pos];
  1439.  
  1440.                             if (is_mpeg2 && (sideinfo.gr[gr][1].scalefac_compress & 1)) {
  1441.                                 coleft *= coleft;
  1442.                                 coright *= coright;
  1443.                             }
  1444.  
  1445.                             for(unsigned i=band_start; i<band_end; i+=3) {
  1446.                                 const float x = recon[0][i];
  1447.  
  1448.                                 recon[0][i] = x*coleft;
  1449.                                 recon[1][i] = x*coright;
  1450.                             }
  1451.                         } else if (mModeExtension & 2) {
  1452.                             const unsigned band_start = pShortBands[sfb]*3 + window;
  1453.                             const unsigned band_end = pShortBands[sfb+1]*3 + window;
  1454.  
  1455.                             for(unsigned i=band_start; i<band_end; i+=3) {
  1456.                                 const float x = recon[0][i] * invsqrt2;
  1457.                                 const float y = recon[1][i] * invsqrt2;
  1458.  
  1459.                                 recon[0][i] = (float)(x+y);
  1460.                                 recon[1][i] = (float)(x-y);
  1461.                             }
  1462.                         }
  1463.                     }
  1464.                 }
  1465.             }
  1466.         }
  1467.  
  1468.         profile_add(p_stereo);
  1469.  
  1470.         // alias reduction, IMDCT and polyphase
  1471.  
  1472.         float subbands[18][2][32];
  1473.  
  1474.         for(ch=0; ch<nch; ++ch) {
  1475.             const LayerIIIRegionSideInfo& rsi = sideinfo.gr[gr][ch];
  1476.             unsigned wintype = 0;
  1477.             
  1478.             if (rsi.window_switching_flag)
  1479.                 wintype = rsi.switched.block_type;
  1480.  
  1481.             for(unsigned sb=0; sb<32; ++sb)    {
  1482.                 if (wintype == 2 && (!rsi.switched.mixed_block_flag || sb >= 2)) {
  1483.                     IMDCT_6_3(&recon[ch][sb*18], (float(*)[2][32])&subbands[0][ch][sb], mL3OverlapBuffer[ch][sb], mL3Windows[wintype]);
  1484.                 } else {
  1485.                     if (sb < 31) {
  1486.                         static const float cs_tab[8]={
  1487.                             0.85749292571254f,
  1488.                             0.88174199731771f,
  1489.                             0.94962864910273f,
  1490.                             0.98331459249179f,
  1491.                             0.99551781606759f,
  1492.                             0.99916055817815f,
  1493.                             0.99989919524445f,
  1494.                             0.99999315507028f
  1495.                         };
  1496.  
  1497.                         static const float ca_tab[8]={
  1498.                             -0.51449575542753f,
  1499.                             -0.47173196856497f,
  1500.                             -0.31337745420390f,
  1501.                             -0.18191319961098f,
  1502.                             -0.09457419252642f,
  1503.                             -0.04096558288530f,
  1504.                             -0.01419856857247f,
  1505.                             -0.00369997467376f
  1506.                         };
  1507.  
  1508.                         float *p = &recon[ch][sb*18+18];
  1509.  
  1510.                         struct local {
  1511.                             static inline void antialias(float& x, float& y, float cs, float ca) {
  1512.                                 const float xt = x, yt = y;
  1513.  
  1514.                                 x = xt*cs - yt*ca;
  1515.                                 y = xt*ca + yt*cs;
  1516.                             }
  1517.                         };
  1518.  
  1519.                         local::antialias(p[-1], p[0], cs_tab[0], ca_tab[0]);
  1520.                         local::antialias(p[-2], p[1], cs_tab[1], ca_tab[1]);
  1521.                         local::antialias(p[-3], p[2], cs_tab[2], ca_tab[2]);
  1522.                         local::antialias(p[-4], p[3], cs_tab[3], ca_tab[3]);
  1523.                         local::antialias(p[-5], p[4], cs_tab[4], ca_tab[4]);
  1524.                         local::antialias(p[-6], p[5], cs_tab[5], ca_tab[5]);
  1525.                         local::antialias(p[-7], p[6], cs_tab[6], ca_tab[6]);
  1526.                         local::antialias(p[-8], p[7], cs_tab[7], ca_tab[7]);
  1527.                     }
  1528.  
  1529.                     if (sb*18 >= gr_zero_bound+35) {
  1530.                         IMDCT_18_Null((float(*)[2][32])&subbands[0][ch][sb], mL3OverlapBuffer[ch][sb]);
  1531.                     } else {
  1532.                         IMDCT_18(&recon[ch][sb*18], (float(*)[2][32])&subbands[0][ch][sb], mL3OverlapBuffer[ch][sb], mL3Windows[wintype]);
  1533.                     }
  1534.                 }
  1535.             }
  1536.         }
  1537.  
  1538.         profile_add(p_hybrid);
  1539.  
  1540.         for(unsigned s=0; s<18; ++s) {
  1541.             if (s & 1) {
  1542.                 for(unsigned sb=1; sb<32; sb+=2) {
  1543.                     subbands[s][0][sb] = -subbands[s][0][sb];
  1544.                     subbands[s][1][sb] = -subbands[s][1][sb];
  1545.                 }
  1546.             }
  1547.  
  1548.             if (nch>1) {
  1549.                 mpPolyphaseFilter->Generate(subbands[s][0], subbands[s][1], mpSampleDst);
  1550.                 mpSampleDst += 64;
  1551.             } else {
  1552.                 mpPolyphaseFilter->Generate(subbands[s][0], NULL, mpSampleDst);
  1553.                 mpSampleDst += 32;
  1554.             }
  1555.         }
  1556.  
  1557.         profile_add(p_polyphase);
  1558.     }
  1559.  
  1560. #ifdef RDTSC_PROFILE
  1561.  
  1562.     if (!(++p_frames & 127)) {
  1563.         static char buf[256];
  1564.  
  1565.         sprintf(buf, "%d frames: total %I64d, scalefac %d%%, huffdec %d%%/%d%%, dequan %d%%, stereo %d%%, hybrid %d%% (%lu), poly %d%%\n"
  1566.                 ,p_frames
  1567.                 ,p_total
  1568.                 ,(long)((p_scalefac*100)/p_total)
  1569.                 ,(long)((p_huffdec*100)/p_total)
  1570.                 ,(long)((p_huffdec1*100)/p_total)
  1571.                 ,(long)((p_dequan*100)/p_total)
  1572.                 ,(long)((p_stereo*100)/p_total)
  1573.                 ,(long)((p_hybrid*100)/p_total)
  1574.                 ,(long)p_hybrid
  1575.                 ,(long)((p_polyphase*100)/p_total)
  1576.                 );
  1577.         OutputDebugString(buf);
  1578.     }
  1579. #endif
  1580.  
  1581.     mSamplesDecoded += (is_mpeg2 ? 576 : 1152) * nch;
  1582.  
  1583.     return true;
  1584. }
  1585.