shithub: aacdec

--- a/libfaad/cfft.c

+++ b/libfaad/cfft.c

@@ -22,7 +22,7 @@

 ** Commercial non-GPL licensing of this software is possible.

 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.

**

-** $Id: cfft.c,v 1.21 2003/12/17 14:43:16 menno Exp $

+** $Id: cfft.c,v 1.22 2003/12/23 18:41:42 menno Exp $

**/

/*

@@ -47,8 +47,9 @@

    passf2, passf3, passf4, passf5. Complex FFT passes fwd and bwd.

   ----------------------------------------------------------------------*/

-static void passf2(const uint16_t ido, const uint16_t l1, const complex_t *cc,

-                   complex_t *ch, const complex_t *wa, const int8_t isign)

+#ifdef USE_SSE

+static void passf2pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc,

+                          complex_t *ch, const complex_t *wa)

     uint16_t i, k, ah, ac;

@@ -60,51 +61,146 @@

             ac = 4*k;

             RE(ch[ah])    = RE(cc[ac]) + RE(cc[ac+1]);

-            RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);

             IM(ch[ah])    = IM(cc[ac]) + IM(cc[ac+1]);

+            RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);

             IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);

     } else {

-        if (isign == 1)

+        for (k = 0; k < l1; k++)

-            for (k = 0; k < l1; k++)

+            ah = k*ido;

+            ac = 2*k*ido;

+            for (i = 0; i < ido; i+=4)

-                ah = k*ido;

-                ac = 2*k*ido;

+                __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14;

+                __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24;

+                __m128 w1, w2, w3, w4;

-                for (i = 0; i < ido; i++)

-                {

-                    complex_t t2;

+                m1 = _mm_load_ps(&RE(cc[ac+i]));

+                m2 = _mm_load_ps(&RE(cc[ac+ido+i]));

+                m5 = _mm_load_ps(&RE(cc[ac+i+2]));

+                m6 = _mm_load_ps(&RE(cc[ac+ido+i+2]));

+                w1 = _mm_load_ps(&RE(wa[i]));

+                w3 = _mm_load_ps(&RE(wa[i+2]));

-                    RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);

-                    RE(t2)       = RE(cc[ac+i]) - RE(cc[ac+i+ido]);

+                m3 = _mm_add_ps(m1, m2);

+                m15 = _mm_add_ps(m5, m6);

-                    IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);

-                    IM(t2)       = IM(cc[ac+i]) - IM(cc[ac+i+ido]);

+                m4 = _mm_sub_ps(m1, m2);

+                m16 = _mm_sub_ps(m5, m6);

-                    ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),

-                        IM(t2), RE(t2), RE(wa[i]), IM(wa[i]));

-                }

+                _mm_store_ps(&RE(ch[ah+i]), m3);

+                _mm_store_ps(&RE(ch[ah+i+2]), m15);

+                w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));

+                w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));

+                m7 = _mm_mul_ps(m4, w1);

+                m17 = _mm_mul_ps(m16, w3);

+                m8 = _mm_mul_ps(m4, w2);

+                m18 = _mm_mul_ps(m16, w4);

+                m9  = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));

+                m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0));

+                m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));

+                m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1));

+                m11 = _mm_add_ps(m9, m10);

+                m21 = _mm_add_ps(m19, m20);

+                m12 = _mm_sub_ps(m9, m10);

+                m22 = _mm_sub_ps(m19, m20);

+                m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));

+                m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2));

+                m14 = _mm_unpacklo_ps(m12, m13);

+                m24 = _mm_unpacklo_ps(m22, m23);

+                _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);

+                _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24);

-        } else {

-            for (k = 0; k < l1; k++)

-            {

-                ah = k*ido;

-                ac = 2*k*ido;

+        }

+    }

+}

+#endif

-                for (i = 0; i < ido; i++)

-                {

-                    complex_t t2;

+static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,

+                      complex_t *ch, const complex_t *wa)

+{

+    uint16_t i, k, ah, ac;

-                    RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);

-                    RE(t2)       = RE(cc[ac+i]) - RE(cc[ac+i+ido]);

+    if (ido == 1)

+    {

+        for (k = 0; k < l1; k++)

+        {

+            ah = 2*k;

+            ac = 4*k;

-                    IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);

-                    IM(t2)       = IM(cc[ac+i]) - IM(cc[ac+i+ido]);

+            RE(ch[ah])    = RE(cc[ac]) + RE(cc[ac+1]);

+            RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);

+            IM(ch[ah])    = IM(cc[ac]) + IM(cc[ac+1]);

+            IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);

+        }

+    } else {

+        for (k = 0; k < l1; k++)

+        {

+            ah = k*ido;

+            ac = 2*k*ido;

-                    ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),

-                        RE(t2), IM(t2), RE(wa[i]), IM(wa[i]));

-                }

+            for (i = 0; i < ido; i++)

+            {

+                complex_t t2;

+                RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);

+                RE(t2)       = RE(cc[ac+i]) - RE(cc[ac+i+ido]);

+                IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);

+                IM(t2)       = IM(cc[ac+i]) - IM(cc[ac+i+ido]);

+                ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),

+                    IM(t2), RE(t2), RE(wa[i]), IM(wa[i]));

+            }

+        }

+    }

+}

+static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,

+                      complex_t *ch, const complex_t *wa)

+{

+    uint16_t i, k, ah, ac;

+    if (ido == 1)

+    {

+        for (k = 0; k < l1; k++)

+        {

+            ah = 2*k;

+            ac = 4*k;

+            RE(ch[ah])    = RE(cc[ac]) + RE(cc[ac+1]);

+            RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);

+            IM(ch[ah])    = IM(cc[ac]) + IM(cc[ac+1]);

+            IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);

+        }

+    } else {

+        for (k = 0; k < l1; k++)

+        {

+            ah = k*ido;

+            ac = 2*k*ido;

+            for (i = 0; i < ido; i++)

+            {

+                complex_t t2;

+                RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);

+                RE(t2)       = RE(cc[ac+i]) - RE(cc[ac+i+ido]);

+                IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);

+                IM(t2)       = IM(cc[ac+i]) - IM(cc[ac+i+ido]);

+                ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),

+                    RE(t2), IM(t2), RE(wa[i]), IM(wa[i]));

@@ -234,153 +330,315 @@

-static void passf4(const uint16_t ido, const uint16_t l1, const complex_t *cc,

-                   complex_t *ch, const complex_t *wa1, const complex_t *wa2,

-                   const complex_t *wa3, const int8_t isign)

+#ifdef USE_SSE

+static void passf4pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc,

+                          complex_t *ch, const complex_t *wa1, const complex_t *wa2,

+                          const complex_t *wa3)

     uint16_t i, k, ac, ah;

     if (ido == 1)

-        if (isign == 1)

+        for (k = 0; k < l1; k+=2)

-            for (k = 0; k < l1; k++)

-            {

-                complex_t t1, t2, t3, t4;

+            __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10;

+            __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, n10;

+            __m128 neg1 = _mm_set_ps(-1.0, 1.0, 1.0, 1.0);

-                ac = 4*k;

-                ah = k;

+            m1 = _mm_load_ps(&RE(cc[4*k]));

+            m2 = _mm_load_ps(&RE(cc[4*k+2]));

+            n1 = _mm_load_ps(&RE(cc[4*k+4]));

+            n2 = _mm_load_ps(&RE(cc[4*k+6]));

-                RE(t2) = RE(cc[ac])   + RE(cc[ac+2]);

-                RE(t1) = RE(cc[ac])   - RE(cc[ac+2]);

-                IM(t2) = IM(cc[ac])   + IM(cc[ac+2]);

-                IM(t1) = IM(cc[ac])   - IM(cc[ac+2]);

-                RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);

-                IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);

-                IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);

-                RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);

+            m3 = _mm_add_ps(m1, m2);

-                RE(ch[ah])      = RE(t2) + RE(t3);

-                RE(ch[ah+2*l1]) = RE(t2) - RE(t3);

+            n4 = _mm_mul_ps(neg1, n1);

+            n5 = _mm_mul_ps(neg1, n2);

+            m4 = _mm_mul_ps(neg1, m1);

+            m5 = _mm_mul_ps(neg1, m2);

-                IM(ch[ah])      = IM(t2) + IM(t3);

-                IM(ch[ah+2*l1]) = IM(t2) - IM(t3);

+            n3 = _mm_add_ps(n1, n2);

+            m6 = _mm_sub_ps(m4, m5);

-                RE(ch[ah+l1])   = RE(t1) + RE(t4);

-                RE(ch[ah+3*l1]) = RE(t1) - RE(t4);

+            m7 = _mm_shuffle_ps(m3, n3, _MM_SHUFFLE(1, 0, 1, 0));

+            n6 = _mm_sub_ps(n4, n5);

+            m8 = _mm_shuffle_ps(m3, n3, _MM_SHUFFLE(3, 2, 3, 2));

-                IM(ch[ah+l1])   = IM(t1) + IM(t4);

-                IM(ch[ah+3*l1]) = IM(t1) - IM(t4);

-            }

-        } else {

-            for (k = 0; k < l1; k++)

+            n7 = _mm_shuffle_ps(m6, n6, _MM_SHUFFLE(1, 0, 1, 0));

+            m9 = _mm_add_ps(m7, m8);

+            n8 = _mm_shuffle_ps(m6, n6, _MM_SHUFFLE(2, 3, 2, 3));

+            m10 = _mm_sub_ps(m7, m8);

+            n9 = _mm_add_ps(n7, n8);

+            _mm_store_ps(&RE(ch[k]), m9);

+            n10 = _mm_sub_ps(n7, n8);

+            _mm_store_ps(&RE(ch[k+l1]), n9);

+            _mm_store_ps(&RE(ch[k+2*l1]), m10);

+            _mm_store_ps(&RE(ch[k+3*l1]), n10);

+        }

+    } else {

+        for (k = 0; k < l1; k++)

+        {

+            ac = 4*k*ido;

+            ah = k*ido;

+            for (i = 0; i < ido; i+=2)

-                complex_t t1, t2, t3, t4;

+                __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, m16;

+                __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, m17, m18, m19, m20, m21, m22, m23;

+                __m128 w1, w2, w3, w4, w5, w6, m24, m25, m26, m27, m28, m29, m30;

+                __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0);

-                ac = 4*k;

-                ah = k;

+                m1 = _mm_load_ps(&RE(cc[ac+i]));

+                m2 = _mm_load_ps(&RE(cc[ac+i+2*ido]));

+                m3 = _mm_add_ps(m1, m2);

+                m4 = _mm_sub_ps(m1, m2);

-                RE(t2) = RE(cc[ac])   + RE(cc[ac+2]);

-                RE(t1) = RE(cc[ac])   - RE(cc[ac+2]);

-                IM(t2) = IM(cc[ac])   + IM(cc[ac+2]);

-                IM(t1) = IM(cc[ac])   - IM(cc[ac+2]);

-                RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);

-                IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);

-                IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);

-                RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);

+                n1 = _mm_load_ps(&RE(cc[ac+i+ido]));

+                n2 = _mm_load_ps(&RE(cc[ac+i+3*ido]));

+                n3 = _mm_add_ps(n1, n2);

-                RE(ch[ah])      = RE(t2) + RE(t3);

-                RE(ch[ah+2*l1]) = RE(t2) - RE(t3);

+                n4 = _mm_mul_ps(neg1, n1);

+                n5 = _mm_mul_ps(neg1, n2);

+                n6 = _mm_sub_ps(n4, n5);

-                IM(ch[ah])      = IM(t2) + IM(t3);

-                IM(ch[ah+2*l1]) = IM(t2) - IM(t3);

+                m5 = _mm_add_ps(m3, n3);

-                RE(ch[ah+l1])   = RE(t1) - RE(t4);

-                RE(ch[ah+3*l1]) = RE(t1) + RE(t4);

+                n7 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2, 3, 0, 1));

+                n8 = _mm_add_ps(m4, n7);

-                IM(ch[ah+l1])   = IM(t1) - IM(t4);

-                IM(ch[ah+3*l1]) = IM(t1) + IM(t4);

+                m6 = _mm_sub_ps(m3, n3);

+                n9 = _mm_sub_ps(m4, n7);

+                _mm_store_ps(&RE(ch[ah+i]), m5);

+#if 0

+  static INLINE void ComplexMult(real_t *y1, real_t *y2,

+      real_t x1, real_t x2, real_t c1, real_t c2)

+  {

+      *y1 = MUL_F(x1, c1) + MUL_F(x2, c2);

+      *y2 = MUL_F(x2, c1) - MUL_F(x1, c2);

+  }

+  m7.0 = RE(c2)*RE(wa1[i])

+  m7.1 = IM(c2)*IM(wa1[i])

+  m7.2 = RE(c6)*RE(wa1[i+1])

+  m7.3 = IM(c6)*IM(wa1[i+1])

+  m8.0 = RE(c2)*IM(wa1[i])

+  m8.1 = IM(c2)*RE(wa1[i])

+  m8.2 = RE(c6)*IM(wa1[i+1])

+  m8.3 = IM(c6)*RE(wa1[i+1])

+  RE(0) = m7.0 - m7.1

+  IM(0) = m8.0 + m8.1

+  RE(1) = m7.2 - m7.3

+  IM(1) = m8.2 + m8.3

+////

+  RE(0) = RE(c2)*RE(wa1[i])   - IM(c2)*IM(wa1[i])

+  IM(0) = RE(c2)*IM(wa1[i])   + IM(c2)*RE(wa1[i])

+  RE(1) = RE(c6)*RE(wa1[i+1]) - IM(c6)*IM(wa1[i+1])

+  IM(1) = RE(c6)*IM(wa1[i+1]) + IM(c6)*RE(wa1[i+1])

+#endif

+                w1 = _mm_load_ps(&RE(wa1[i]));

+                w3 = _mm_load_ps(&RE(wa2[i]));

+                w5 = _mm_load_ps(&RE(wa3[i]));

+                w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));

+                w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));

+                w6 = _mm_shuffle_ps(w5, w5, _MM_SHUFFLE(2, 3, 0, 1));

+                m7 = _mm_mul_ps(n8, w1);

+                m15 = _mm_mul_ps(m6, w3);

+                m23 = _mm_mul_ps(n9, w5);

+                m8 = _mm_mul_ps(n8, w2);

+                m16 = _mm_mul_ps(m6, w4);

+                m24 = _mm_mul_ps(n9, w6);

+                m9  = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));

+                m17 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(2, 0, 2, 0));

+                m25 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(2, 0, 2, 0));

+                m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));

+                m18 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(3, 1, 3, 1));

+                m26 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(3, 1, 3, 1));

+                m11 = _mm_add_ps(m9, m10);

+                m19 = _mm_add_ps(m17, m18);

+                m27 = _mm_add_ps(m25, m26);

+                m12 = _mm_sub_ps(m9, m10);

+                m20 = _mm_sub_ps(m17, m18);

+                m28 = _mm_sub_ps(m25, m26);

+                m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));

+                m21 = _mm_shuffle_ps(m19, m19, _MM_SHUFFLE(0, 0, 3, 2));

+                m29 = _mm_shuffle_ps(m27, m27, _MM_SHUFFLE(0, 0, 3, 2));

+                m14 = _mm_unpacklo_ps(m12, m13);

+                m22 = _mm_unpacklo_ps(m20, m21);

+                m30 = _mm_unpacklo_ps(m28, m29);

+                _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);

+                _mm_store_ps(&RE(ch[ah+i+2*l1*ido]), m22);

+                _mm_store_ps(&RE(ch[ah+i+3*l1*ido]), m30);

+    }

+}

+#endif

+static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,

+                      complex_t *ch, const complex_t *wa1, const complex_t *wa2,

+                      const complex_t *wa3)

+{

+    uint16_t i, k, ac, ah;

+    if (ido == 1)

+    {

+        for (k = 0; k < l1; k++)

+        {

+            complex_t t1, t2, t3, t4;

+            ac = 4*k;

+            ah = k;

+            RE(t2) = RE(cc[ac])   + RE(cc[ac+2]);

+            RE(t1) = RE(cc[ac])   - RE(cc[ac+2]);

+            IM(t2) = IM(cc[ac])   + IM(cc[ac+2]);

+            IM(t1) = IM(cc[ac])   - IM(cc[ac+2]);

+            RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);

+            IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);

+            IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);

+            RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);

+            RE(ch[ah])      = RE(t2) + RE(t3);

+            RE(ch[ah+2*l1]) = RE(t2) - RE(t3);

+            IM(ch[ah])      = IM(t2) + IM(t3);

+            IM(ch[ah+2*l1]) = IM(t2) - IM(t3);

+            RE(ch[ah+l1])   = RE(t1) + RE(t4);

+            RE(ch[ah+3*l1]) = RE(t1) - RE(t4);

+            IM(ch[ah+l1])   = IM(t1) + IM(t4);

+            IM(ch[ah+3*l1]) = IM(t1) - IM(t4);

+        }

     } else {

-        if (isign == 1)

+        for (k = 0; k < l1; k++)

-            for (k = 0; k < l1; k++)

+            ac = 4*k*ido;

+            ah = k*ido;

+            for (i = 0; i < ido; i++)

-                ac = 4*k*ido;

-                ah = k*ido;

+                complex_t c2, c3, c4, t1, t2, t3, t4;

-                for (i = 0; i < ido; i++)

-                {

-                    complex_t c2, c3, c4, t1, t2, t3, t4;

+                RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);

+                RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);

+                IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);

+                IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);

+                RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);

+                IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);

+                IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);

+                RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);

-                    RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);

-                    RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);

-                    IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);

-                    IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);

-                    RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);

-                    IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);

-                    IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);

-                    RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);

+                RE(c2) = RE(t1) + RE(t4);

+                RE(c4) = RE(t1) - RE(t4);

-                    RE(c2) = RE(t1) + RE(t4);

-                    RE(c4) = RE(t1) - RE(t4);

+                IM(c2) = IM(t1) + IM(t4);

+                IM(c4) = IM(t1) - IM(t4);

-                    IM(c2) = IM(t1) + IM(t4);

-                    IM(c4) = IM(t1) - IM(t4);

+                RE(ch[ah+i]) = RE(t2) + RE(t3);

+                RE(c3)       = RE(t2) - RE(t3);

-                    RE(ch[ah+i]) = RE(t2) + RE(t3);

-                    RE(c3)       = RE(t2) - RE(t3);

+                IM(ch[ah+i]) = IM(t2) + IM(t3);

+                IM(c3)       = IM(t2) - IM(t3);

-                    IM(ch[ah+i]) = IM(t2) + IM(t3);

-                    IM(c3)       = IM(t2) - IM(t3);

-                    ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),

-                        IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i]));

-                    ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]),

-                        IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i]));

-                    ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]),

-                        IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));

-                }

+                ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),

+                    IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i]));

+                ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]),

+                    IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i]));

+                ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]),

+                    IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));

-        } else {

-            for (k = 0; k < l1; k++)

-            {

-                ac = 4*k*ido;

-                ah = k*ido;

+        }

+    }

+}

-                for (i = 0; i < ido; i++)

-                {

-                    complex_t c2, c3, c4, t1, t2, t3, t4;

+static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,

+                      complex_t *ch, const complex_t *wa1, const complex_t *wa2,

+                      const complex_t *wa3)

+{

+    uint16_t i, k, ac, ah;

-                    RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);

-                    RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);

-                    IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);

-                    IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);

-                    RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);

-                    IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);

-                    IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);

-                    RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);

+    if (ido == 1)

+    {

+        for (k = 0; k < l1; k++)

+        {

+            complex_t t1, t2, t3, t4;

-                    RE(c2) = RE(t1) - RE(t4);

-                    RE(c4) = RE(t1) + RE(t4);

+            ac = 4*k;

+            ah = k;

-                    IM(c2) = IM(t1) - IM(t4);

-                    IM(c4) = IM(t1) + IM(t4);

+            RE(t2) = RE(cc[ac])   + RE(cc[ac+2]);

+            RE(t1) = RE(cc[ac])   - RE(cc[ac+2]);

+            IM(t2) = IM(cc[ac])   + IM(cc[ac+2]);

+            IM(t1) = IM(cc[ac])   - IM(cc[ac+2]);

+            RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);

+            IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);

+            IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);

+            RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);

-                    RE(ch[ah+i]) = RE(t2) + RE(t3);

-                    RE(c3)       = RE(t2) - RE(t3);

+            RE(ch[ah])      = RE(t2) + RE(t3);

+            RE(ch[ah+2*l1]) = RE(t2) - RE(t3);

-                    IM(ch[ah+i]) = IM(t2) + IM(t3);

-                    IM(c3)       = IM(t2) - IM(t3);

+            IM(ch[ah])      = IM(t2) + IM(t3);

+            IM(ch[ah+2*l1]) = IM(t2) - IM(t3);

-                    ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),

-                        RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i]));

-                    ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]),

-                        RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i]));

-                    ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]),

-                        RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));

-                }

+            RE(ch[ah+l1])   = RE(t1) - RE(t4);

+            RE(ch[ah+3*l1]) = RE(t1) + RE(t4);

+            IM(ch[ah+l1])   = IM(t1) - IM(t4);

+            IM(ch[ah+3*l1]) = IM(t1) + IM(t4);

+        }

+    } else {

+        for (k = 0; k < l1; k++)

+        {

+            ac = 4*k*ido;

+            ah = k*ido;

+            for (i = 0; i < ido; i++)

+            {

+                complex_t c2, c3, c4, t1, t2, t3, t4;

+                RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);

+                RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);

+                IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);

+                IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);

+                RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);

+                IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);

+                IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);

+                RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);

+                RE(c2) = RE(t1) - RE(t4);

+                RE(c4) = RE(t1) + RE(t4);

+                IM(c2) = IM(t1) - IM(t4);

+                IM(c4) = IM(t1) + IM(t4);

+                RE(ch[ah+i]) = RE(t2) + RE(t3);

+                RE(c3)       = RE(t2) - RE(t3);

+                IM(ch[ah+i]) = IM(t2) + IM(t3);

+                IM(c3)       = IM(t2) - IM(t3);

+                ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),

+                    RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i]));

+                ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]),

+                    RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i]));

+                ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]),

+                    RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));

@@ -584,8 +842,9 @@

    cfftf1, cfftf, cfftb, cffti1, cffti. Complex FFTs.

   ----------------------------------------------------------------------*/

-INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch,

-                   const uint16_t *ifac, const complex_t *wa, const int8_t isign)

+#ifdef USE_SSE

+INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch,

+                          const uint16_t *ifac, const complex_t *wa, const int8_t isign)

     uint16_t i;

     uint16_t k1, l1, l2;

@@ -610,17 +869,17 @@

             ix3 = ix2 + ido;

             if (na == 0)

-                passf4((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], isign);

+                passf4pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);

             else

-                passf4((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], isign);

+                passf4pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);

             na = 1 - na;

             break;

         case 2:

             if (na == 0)

-                passf2((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], isign);

+                passf2pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);

             else

-                passf2((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], isign);

+                passf2pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);

             na = 1 - na;

             break;

@@ -661,16 +920,180 @@

         IM(c[i]) = IM(ch[i]);

+#endif

+INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch,

+                      const uint16_t *ifac, const complex_t *wa, const int8_t isign)

+{

+    uint16_t i;

+    uint16_t k1, l1, l2;

+    uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;

+    nf = ifac[1];

+    na = 0;

+    l1 = 1;

+    iw = 0;

+    for (k1 = 2; k1 <= nf+1; k1++)

+    {

+        ip = ifac[k1];

+        l2 = ip*l1;

+        ido = n / l2;

+        idl1 = ido*l1;

+        switch (ip)

+        {

+        case 4:

+            ix2 = iw + ido;

+            ix3 = ix2 + ido;

+            if (na == 0)

+                passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);

+            else

+                passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);

+            na = 1 - na;

+            break;

+        case 2:

+            if (na == 0)

+                passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);

+            else

+                passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);

+            na = 1 - na;

+            break;

+        case 3:

+            ix2 = iw + ido;

+            if (na == 0)

+                passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);

+            else

+                passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);

+            na = 1 - na;

+            break;

+        case 5:

+            ix2 = iw + ido;

+            ix3 = ix2 + ido;

+            ix4 = ix3 + ido;

+            if (na == 0)

+                passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);

+            else

+                passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);

+            na = 1 - na;

+            break;

+        }

+        l1 = l2;

+        iw += (ip-1) * ido;

+    }

+    if (na == 0)

+        return;

+    for (i = 0; i < n; i++)

+    {

+        RE(c[i]) = RE(ch[i]);

+        IM(c[i]) = IM(ch[i]);

+    }

+}

+INLINE void cfftf1neg(uint16_t n, complex_t *c, complex_t *ch,

+                      const uint16_t *ifac, const complex_t *wa, const int8_t isign)

+{

+    uint16_t i;

+    uint16_t k1, l1, l2;

+    uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;

+    nf = ifac[1];

+    na = 0;

+    l1 = 1;

+    iw = 0;

+    for (k1 = 2; k1 <= nf+1; k1++)

+    {

+        ip = ifac[k1];

+        l2 = ip*l1;

+        ido = n / l2;

+        idl1 = ido*l1;

+        switch (ip)

+        {

+        case 4:

+            ix2 = iw + ido;

+            ix3 = ix2 + ido;

+            if (na == 0)

+                passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);

+            else

+                passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);

+            na = 1 - na;

+            break;

+        case 2:

+            if (na == 0)

+                passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);

+            else

+                passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);

+            na = 1 - na;

+            break;

+        case 3:

+            ix2 = iw + ido;

+            if (na == 0)

+                passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);

+            else

+                passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);

+            na = 1 - na;

+            break;

+        case 5:

+            ix2 = iw + ido;

+            ix3 = ix2 + ido;

+            ix4 = ix3 + ido;

+            if (na == 0)

+                passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);

+            else

+                passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);

+            na = 1 - na;

+            break;

+        }

+        l1 = l2;

+        iw += (ip-1) * ido;

+    }

+    if (na == 0)

+        return;

+    for (i = 0; i < n; i++)

+    {

+        RE(c[i]) = RE(ch[i]);

+        IM(c[i]) = IM(ch[i]);

+    }

+}

 void cfftf(cfft_info *cfft, complex_t *c)

-    cfftf1(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, -1);

+    cfftf1neg(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, -1);

 void cfftb(cfft_info *cfft, complex_t *c)

-    cfftf1(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);

+    cfftf1pos(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);

+#ifdef USE_SSE

+void cfftb_sse(cfft_info *cfft, complex_t *c)

+{

+    cfftf1pos_sse(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);

+}

+#endif

 static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac)

--- a/libfaad/cfft.h

+++ b/libfaad/cfft.h

@@ -22,7 +22,7 @@

 ** Commercial non-GPL licensing of this software is possible.

 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.

**

-** $Id: cfft.h,v 1.14 2003/12/17 14:43:16 menno Exp $

+** $Id: cfft.h,v 1.15 2003/12/23 18:41:42 menno Exp $

**/

 #ifndef __CFFT_H__

@@ -47,13 +47,23 @@

 void cfftu(cfft_info *cfft);

-static void passf2(const uint16_t ido, const uint16_t l1, const complex_t *cc,

-                   complex_t *ch, const complex_t *wa, const int8_t isign);

+#ifdef USE_SSE

+void cfftb_sse(cfft_info *cfft, complex_t *c);

+static void passf2pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc,

+                          complex_t *ch, const complex_t *wa);

+static void passf4pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,

+                          const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);

+#endif

+static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,

+                      complex_t *ch, const complex_t *wa);

+static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,

+                      complex_t *ch, const complex_t *wa);

 static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc,

                    complex_t *ch, const complex_t *wa1, const complex_t *wa2, const int8_t isign);

-static void passf4(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,

-                   const complex_t *wa1, const complex_t *wa2, const complex_t *wa3,

-                   const int8_t isign);

+static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,

+                      const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);

+static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,

+                      const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);

 static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,

                    const complex_t *wa1, const complex_t *wa2, const complex_t *wa3,

                    const complex_t *wa4, const int8_t isign);

--- a/libfaad/common.c

+++ b/libfaad/common.c

@@ -22,7 +22,7 @@

 ** Commercial non-GPL licensing of this software is possible.

 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.

**

-** $Id: common.c,v 1.13 2003/12/17 14:43:16 menno Exp $

+** $Id: common.c,v 1.14 2003/12/23 18:41:42 menno Exp $

**/

 /* just some common functions that could be used anywhere */

@@ -30,37 +30,60 @@

 #include "common.h"

 #include "structs.h"

-#include <malloc.h>

 #include <stdlib.h>

 #include "syntax.h"

 #ifdef USE_SSE

-uint8_t cpu_has_sse()

+__declspec(naked) static int32_t __fastcall test_cpuid()

-    uint32_t feature;

-    __try

+    __asm

-        __asm

-        {

-            xor eax, eax

-            cpuid

-        }

+        pushf

+        pop eax

+        mov ecx,eax

+        xor eax,(1<<21)

+        push eax

+        popf

+        pushf

+        pop eax

+        push ecx

+        popf

+        cmp eax,ecx

+        mov eax,0

+        setne al

+        ret

-    __except (1)

-    {

-        return 0;

-    }

+}

+__declspec(naked) static void __fastcall run_cpuid(int32_t param, int32_t out[4])

+{

     __asm

-        mov eax, 1

+        pushad

+        push edx

+        mov eax,ecx

         cpuid

-        mov feature, edx

+        pop edi

+        mov [edi+0],eax

+        mov [edi+4],ebx

+        mov [edi+8],ecx

+        mov [edi+12],edx

+        popad

+        ret

+}

+uint8_t cpu_has_sse()

+{

+    int32_t features[4];

+    if (test_cpuid())

+    {

+        run_cpuid(1, features);

+    }

     /* check for SSE */

-    if (feature & 0x02000000)

+    if (features[3] & 0x02000000)

         return 1;

     return 0;

--- a/libfaad/common.h

+++ b/libfaad/common.h

@@ -22,7 +22,7 @@

 ** Commercial non-GPL licensing of this software is possible.

 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.

**

-** $Id: common.h,v 1.40 2003/12/17 16:37:34 menno Exp $

+** $Id: common.h,v 1.41 2003/12/23 18:41:42 menno Exp $

**/

 #ifndef __COMMON_H__

@@ -117,7 +117,7 @@

 # endif

 #endif

-#if ((defined(_WIN32) && !defined(_WIN32_WCE)) || ((__GNUC__ >= 3) && defined(i386)))

+#if ((defined(_WIN32) && !defined(_WIN32_WCE)) /* || ((__GNUC__ >= 3) && defined(__i386__)) */ )

 #ifndef FIXED_POINT

 /* includes <xmmintrin.h> to enable SSE intrinsics */

 #define USE_SSE

@@ -300,6 +300,19 @@

             fld   f

             fistp i

+        return i;

+    }

+  #elif (defined(__i386__) && defined(__GNUC__))

+    #define HAS_LRINTF

+    // from http://www.stereopsis.com/FPU.html

+    static INLINE int lrintf(float f)

+    {

+        int i;

+        __asm__ __volatile__ (

+            "flds %1        \n\t"

+            "fistpl %0      \n\t"

+            : "=m" (i)

+            : "m" (f));

         return i;

   #endif

--- a/libfaad/error.c

+++ b/libfaad/error.c

@@ -22,7 +22,7 @@

 ** Commercial non-GPL licensing of this software is possible.

 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.

**

-** $Id: error.c,v 1.17 2003/12/17 14:43:16 menno Exp $

+** $Id: error.c,v 1.18 2003/12/23 18:41:42 menno Exp $

**/

 #include "common.h"

@@ -45,5 +45,7 @@

     "Maximum number of bitstream elements exceeded",

     "Input data buffer too small",

     "Array index out of range",

-    "Maximum number of scalefactor bands exceeded"

+    "Maximum number of scalefactor bands exceeded",

+    "Quantised value out of range",

+    "LTP lag out of range"

};

\ No newline at end of file

--- a/libfaad/error.h

+++ b/libfaad/error.h

@@ -22,7 +22,7 @@

 ** Commercial non-GPL licensing of this software is possible.

 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.

**

-** $Id: error.h,v 1.12 2003/12/17 14:43:16 menno Exp $

+** $Id: error.h,v 1.13 2003/12/23 18:41:42 menno Exp $

**/

 #ifndef __ERROR_H__

@@ -32,7 +32,7 @@

 extern "C" {

 #endif

-#define NUM_ERROR_MESSAGES 17

+#define NUM_ERROR_MESSAGES 19

 extern int8_t *err_msg[];

 #ifdef __cplusplus

--- a/libfaad/mdct.c

+++ b/libfaad/mdct.c

@@ -22,7 +22,7 @@

 ** Commercial non-GPL licensing of this software is possible.

 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.

**

-** $Id: mdct.c,v 1.35 2003/12/17 14:43:16 menno Exp $

+** $Id: mdct.c,v 1.36 2003/12/23 18:41:42 menno Exp $

**/

/*

@@ -192,6 +192,7 @@

 #ifdef PROFILE

     mdct->cycles = 0;

+    mdct->fft_cycles = 0;

 #endif

     return mdct;

@@ -203,6 +204,7 @@

 #ifdef PROFILE

         printf("MDCT[%.4d]:         %I64d cycles\n", mdct->N, mdct->cycles);

+        printf("CFFT[%.4d]:         %I64d cycles\n", mdct->N/4, mdct->fft_cycles);

 #endif

         cfftu(mdct->cfft);

@@ -287,6 +289,7 @@

 #ifdef PROFILE

     count2 = faad_get_ts() - count2;

+    mdct->fft_cycles += count1;

     mdct->cycles += (count2 - count1);

 #endif

@@ -359,7 +362,7 @@

 #endif

     /* complex IFFT, any non-scaling FFT can be used here */

-    cfftb(mdct->cfft, Z1);

+    cfftb_sse(mdct->cfft, Z1);

 #ifdef PROFILE

     count1 = faad_get_ts() - count1;

@@ -445,6 +448,7 @@

 #ifdef PROFILE

     count2 = faad_get_ts() - count2;

+    mdct->fft_cycles += count1;

     mdct->cycles += (count2 - count1);

 #endif

--- a/libfaad/output.c

+++ b/libfaad/output.c

@@ -22,7 +22,7 @@

 ** Commercial non-GPL licensing of this software is possible.

 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.

**

-** $Id: output.c,v 1.32 2003/12/17 14:43:16 menno Exp $

+** $Id: output.c,v 1.33 2003/12/23 18:41:42 menno Exp $

**/

 #include "common.h"

@@ -39,10 +39,11 @@

 #define DM_MUL REAL_CONST(0.4142135623730950488) // 1/(1+sqrt(2))

 #define RSQRT2 REAL_CONST(0.7071067811865475244) // 1/sqrt(2)

 static INLINE real_t get_sample(real_t **input, uint8_t channel, uint16_t sample,

-                                uint8_t downMatrix, uint8_t *internal_channel)

+                                uint8_t down_matrix, uint8_t *internal_channel)

-    if (!downMatrix)

+    if (!down_matrix)

         return input[internal_channel[channel]][sample];

     if (channel == 0)

@@ -57,127 +58,309 @@

-void* output_to_PCM(faacDecHandle hDecoder,

-                    real_t **input, void *sample_buffer, uint8_t channels,

-                    uint16_t frame_len, uint8_t format)

-{

-    uint8_t ch;

-    uint16_t i, j = 0;

-    uint8_t internal_channel;

+#ifndef HAS_LRINTF

+#define CLIP(sample, max, min) \

+if (sample >= 0.0f)            \

+{                              \

+    sample += 0.5f;            \

+    if (sample >= max)         \

+        sample = max;          \

+} else {                       \

+    sample += -0.5f;           \

+    if (sample <= min)         \

+        sample = min;          \

+}

+#else

+#define CLIP(sample, max, min) \

+if (sample >= 0.0f)            \

+{                              \

+    if (sample >= max)         \

+        sample = max;          \

+} else {                       \

+    if (sample <= min)         \

+        sample = min;          \

+}

+#endif

-    int16_t   *short_sample_buffer = (int16_t*)sample_buffer;

-    int32_t   *int_sample_buffer = (int32_t*)sample_buffer;

-    float32_t *float_sample_buffer = (float32_t*)sample_buffer;

-    double    *double_sample_buffer = (double*)sample_buffer;

+#define CONV(a,b) ((a<<1)|(b&0x1))

-#ifdef PROFILE

-    int64_t count = faad_get_ts();

-#endif

+static void to_PCM_16bit(faacDecHandle hDecoder, real_t **input,

+                         uint8_t channels, uint16_t frame_len,

+                         int16_t **sample_buffer)

+{

+    uint8_t ch, ch1;

+    uint16_t i;

-    /* Copy output to a standard PCM buffer */

-    for (ch = 0; ch < channels; ch++)

+    switch (CONV(channels,hDecoder->downMatrix))

-        internal_channel = hDecoder->internal_channel[ch];

+    case CONV(1,0):

+    case CONV(1,1):

+        for(i = 0; i < frame_len; i++)

+        {

+            real_t inp = input[hDecoder->internal_channel[0]][i];

-        switch (format)

+            CLIP(inp, 32767.0f, -32768.0f);

+            (*sample_buffer)[i] = (int16_t)lrintf(inp);

+        }

+        break;

+    case CONV(2,0):

+        ch  = hDecoder->internal_channel[0];

+        ch1 = hDecoder->internal_channel[1];

+        for(i = 0; i < frame_len; i++)

-        case FAAD_FMT_16BIT:

+            real_t inp0 = input[ch ][i];

+            real_t inp1 = input[ch1][i];

+            CLIP(inp0, 32767.0f, -32768.0f);

+            CLIP(inp1, 32767.0f, -32768.0f);

+            (*sample_buffer)[(i*2)+0] = (int16_t)lrintf(inp0);

+            (*sample_buffer)[(i*2)+1] = (int16_t)lrintf(inp1);

+        }

+        break;

+    default:

+        for (ch = 0; ch < channels; ch++)

+        {

             for(i = 0; i < frame_len; i++)

                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);

-                if (inp >= 0.0f)

-                {

-#ifndef HAS_LRINTF

-                    inp += 0.5f;

-#endif

-                    if (inp >= 32767.0f)

-                    {

-                        inp = 32767.0f;

-                    }

-                } else {

-#ifndef HAS_LRINTF

-                    inp += -0.5f;

-#endif

-                    if (inp <= -32768.0f)

-                    {

-                        inp = -32768.0f;

-                    }

-                }

-                short_sample_buffer[(i*channels)+ch] = (int16_t)lrintf(inp);

-            }

-            break;

-        case FAAD_FMT_24BIT:

+                CLIP(inp, 32767.0f, -32768.0f);

+                (*sample_buffer)[(i*channels)+ch] = (int16_t)lrintf(inp);

+            }

+        }

+        break;

+    }

+}

+static void to_PCM_24bit(faacDecHandle hDecoder, real_t **input,

+                         uint8_t channels, uint16_t frame_len,

+                         int32_t **sample_buffer)

+{

+    uint8_t ch, ch1;

+    uint16_t i;

+    switch (CONV(channels,hDecoder->downMatrix))

+    {

+    case CONV(1,0):

+    case CONV(1,1):

+        for(i = 0; i < frame_len; i++)

+        {

+            real_t inp = input[hDecoder->internal_channel[0]][i];

+            inp *= 256.0f;

+            CLIP(inp, 8388607.0f, -8388608.0f);

+            (*sample_buffer)[i] = (int32_t)lrintf(inp);

+        }

+        break;

+    case CONV(2,0):

+        ch  = hDecoder->internal_channel[0];

+        ch1 = hDecoder->internal_channel[1];

+        for(i = 0; i < frame_len; i++)

+        {

+            real_t inp0 = input[ch ][i];

+            real_t inp1 = input[ch1][i];

+            inp0 *= 256.0f;

+            inp1 *= 256.0f;

+            CLIP(inp0, 8388607.0f, -8388608.0f);

+            CLIP(inp1, 8388607.0f, -8388608.0f);

+            (*sample_buffer)[(i*2)+0] = (int32_t)lrintf(inp0);

+            (*sample_buffer)[(i*2)+1] = (int32_t)lrintf(inp1);

+        }

+        break;

+    default:

+        for (ch = 0; ch < channels; ch++)

+        {

             for(i = 0; i < frame_len; i++)

                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);

                 inp *= 256.0f;

-                if (inp >= 0.0f)

-                {

-#ifndef HAS_LRINTF

-                    inp += 0.5f;

-#endif

-                    if (inp >= 8388607.0f)

-                    {

-                        inp = 8388607.0f;

-                    }

-                } else {

-#ifndef HAS_LRINTF

-                    inp += -0.5f;

-#endif

-                    if (inp <= -8388608.0f)

-                    {

-                        inp = -8388608.0f;

-                    }

-                }

-                int_sample_buffer[(i*channels)+ch] = lrintf(inp);

-            }

-            break;

-        case FAAD_FMT_32BIT:

+                CLIP(inp, 8388607.0f, -8388608.0f);

+                (*sample_buffer)[(i*channels)+ch] = (int32_t)lrintf(inp);

+            }

+        }

+        break;

+    }

+}

+static void to_PCM_32bit(faacDecHandle hDecoder, real_t **input,

+                         uint8_t channels, uint16_t frame_len,

+                         int32_t **sample_buffer)

+{

+    uint8_t ch, ch1;

+    uint16_t i;

+    switch (CONV(channels,hDecoder->downMatrix))

+    {

+    case CONV(1,0):

+    case CONV(1,1):

+        for(i = 0; i < frame_len; i++)

+        {

+            real_t inp = input[hDecoder->internal_channel[0]][i];

+            inp *= 65536.0f;

+            CLIP(inp, 2147483647.0f, -2147483648.0f);

+            (*sample_buffer)[i] = (int32_t)lrintf(inp);

+        }

+        break;

+    case CONV(2,0):

+        ch  = hDecoder->internal_channel[0];

+        ch1 = hDecoder->internal_channel[1];

+        for(i = 0; i < frame_len; i++)

+        {

+            real_t inp0 = input[ch ][i];

+            real_t inp1 = input[ch1][i];

+            inp0 *= 65536.0f;

+            inp1 *= 65536.0f;

+            CLIP(inp0, 2147483647.0f, -2147483648.0f);

+            CLIP(inp1, 2147483647.0f, -2147483648.0f);

+            (*sample_buffer)[(i*2)+0] = (int32_t)lrintf(inp0);

+            (*sample_buffer)[(i*2)+1] = (int32_t)lrintf(inp1);

+        }

+        break;

+    default:

+        for (ch = 0; ch < channels; ch++)

+        {

             for(i = 0; i < frame_len; i++)

                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);

                 inp *= 65536.0f;

-                if (inp >= 0.0f)

-                {

-#ifndef HAS_LRINTF

-                    inp += 0.5f;

-#endif

-                    if (inp >= 2147483647.0f)

-                    {

-                        inp = 2147483647.0f;

-                    }

-                } else {

-#ifndef HAS_LRINTF

-                    inp += -0.5f;

-#endif

-                    if (inp <= -2147483648.0f)

-                    {

-                        inp = -2147483648.0f;

-                    }

-                }

-                int_sample_buffer[(i*channels)+ch] = lrintf(inp);

-            }

-            break;

-        case FAAD_FMT_FLOAT:

+                CLIP(inp, 2147483647.0f, -2147483648.0f);

+                (*sample_buffer)[(i*channels)+ch] = (int32_t)lrintf(inp);

+            }

+        }

+        break;

+    }

+}

+static void to_PCM_float(faacDecHandle hDecoder, real_t **input,

+                         uint8_t channels, uint16_t frame_len,

+                         float32_t **sample_buffer)

+{

+    uint8_t ch, ch1;

+    uint16_t i;

+    switch (CONV(channels,hDecoder->downMatrix))

+    {

+    case CONV(1,0):

+    case CONV(1,1):

+        for(i = 0; i < frame_len; i++)

+        {

+            real_t inp = input[hDecoder->internal_channel[0]][i];

+            (*sample_buffer)[i] = inp*FLOAT_SCALE;

+        }

+        break;

+    case CONV(2,0):

+        ch  = hDecoder->internal_channel[0];

+        ch1 = hDecoder->internal_channel[1];

+        for(i = 0; i < frame_len; i++)

+        {

+            real_t inp0 = input[ch ][i];

+            real_t inp1 = input[ch1][i];

+            (*sample_buffer)[(i*2)+0] = inp0*FLOAT_SCALE;

+            (*sample_buffer)[(i*2)+1] = inp1*FLOAT_SCALE;

+        }

+        break;

+    default:

+        for (ch = 0; ch < channels; ch++)

+        {

             for(i = 0; i < frame_len; i++)

-                //real_t inp = input[internal_channel][i];

                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);

-                float_sample_buffer[(i*channels)+ch] = inp*FLOAT_SCALE;

+                (*sample_buffer)[(i*channels)+ch] = inp*FLOAT_SCALE;

-            break;

-        case FAAD_FMT_DOUBLE:

+        }

+        break;

+    }

+}

+static void to_PCM_double(faacDecHandle hDecoder, real_t **input,

+                          uint8_t channels, uint16_t frame_len,

+                          double **sample_buffer)

+{

+    uint8_t ch, ch1;

+    uint16_t i;

+    switch (CONV(channels,hDecoder->downMatrix))

+    {

+    case CONV(1,0):

+    case CONV(1,1):

+        for(i = 0; i < frame_len; i++)

+        {

+            real_t inp = input[hDecoder->internal_channel[0]][i];

+            (*sample_buffer)[i] = (double)inp*FLOAT_SCALE;

+        }

+        break;

+    case CONV(2,0):

+        ch  = hDecoder->internal_channel[0];

+        ch1 = hDecoder->internal_channel[1];

+        for(i = 0; i < frame_len; i++)

+        {

+            real_t inp0 = input[ch ][i];

+            real_t inp1 = input[ch1][i];

+            (*sample_buffer)[(i*2)+0] = (double)inp0*FLOAT_SCALE;

+            (*sample_buffer)[(i*2)+1] = (double)inp1*FLOAT_SCALE;

+        }

+        break;

+    default:

+        for (ch = 0; ch < channels; ch++)

+        {

             for(i = 0; i < frame_len; i++)

-                //real_t inp = input[internal_channel][i];

                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);

-                double_sample_buffer[(i*channels)+ch] = (double)inp*FLOAT_SCALE;

+                (*sample_buffer)[(i*channels)+ch] = (double)inp*FLOAT_SCALE;

-            break;

+        break;

+}

+void *output_to_PCM(faacDecHandle hDecoder,

+                    real_t **input, void *sample_buffer, uint8_t channels,

+                    uint16_t frame_len, uint8_t format)

+{

+    int16_t   *short_sample_buffer = (int16_t*)sample_buffer;

+    int32_t   *int_sample_buffer = (int32_t*)sample_buffer;

+    float32_t *float_sample_buffer = (float32_t*)sample_buffer;

+    double    *double_sample_buffer = (double*)sample_buffer;

 #ifdef PROFILE

+    int64_t count = faad_get_ts();

+#endif

+    /* Copy output to a standard PCM buffer */

+    switch (format)

+    {

+    case FAAD_FMT_16BIT:

+        to_PCM_16bit(hDecoder, input, channels, frame_len, &short_sample_buffer);

+        break;

+    case FAAD_FMT_24BIT:

+        to_PCM_24bit(hDecoder, input, channels, frame_len, &int_sample_buffer);

+        break;

+    case FAAD_FMT_32BIT:

+        to_PCM_32bit(hDecoder, input, channels, frame_len, &int_sample_buffer);

+        break;

+    case FAAD_FMT_FLOAT:

+        to_PCM_float(hDecoder, input, channels, frame_len, &float_sample_buffer);

+        break;

+    case FAAD_FMT_DOUBLE:

+        to_PCM_double(hDecoder, input, channels, frame_len, &double_sample_buffer);

+        break;

+    }

+#ifdef PROFILE

     count = faad_get_ts() - count;

     hDecoder->output_cycles += count;

 #endif

@@ -208,13 +391,13 @@

                 if (tmp >= 0)

                     tmp += (1 << (REAL_BITS-1));

-                    if (tmp >= REAL_CONST(32768))

+                    if (tmp >= REAL_CONST(32767))

                         tmp = REAL_CONST(32767);

                 } else {

                     tmp += -(1 << (REAL_BITS-1));

-                    if (tmp <= REAL_CONST(-32769))

+                    if (tmp <= REAL_CONST(-32768))

                         tmp = REAL_CONST(-32768);

@@ -231,7 +414,7 @@

                     tmp += (1 << (REAL_BITS-9));

                     tmp >>= (REAL_BITS-8);

-                    if (tmp >= 8388608)

+                    if (tmp >= 8388607)

                         tmp = 8388607;

@@ -238,7 +421,7 @@

                 } else {

                     tmp += -(1 << (REAL_BITS-9));

                     tmp >>= (REAL_BITS-8);

-                    if (tmp <= -8388609)

+                    if (tmp <= -8388608)

                         tmp = -8388608;

--- a/libfaad/specrec.c

+++ b/libfaad/specrec.c

@@ -22,7 +22,7 @@

 ** Commercial non-GPL licensing of this software is possible.

 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.

**

-** $Id: specrec.c,v 1.35 2003/12/17 16:37:34 menno Exp $

+** $Id: specrec.c,v 1.36 2003/12/23 18:41:42 menno Exp $

**/

/*

@@ -461,7 +461,7 @@

     memcpy(spec_data, tmp_spec, frame_len*sizeof(real_t));

-static INLINE real_t iquant(int16_t q, const real_t *tab)

+static INLINE real_t iquant(int16_t q, const real_t *tab, uint8_t *error)

 #ifdef FIXED_POINT

     static const real_t errcorr[] = {

@@ -488,35 +488,41 @@

 #else

     if (q < 0)

-        if (-q >= IQ_TABLE_SIZE)

-            return 0;

         /* tab contains a value for all possible q [0,8192] */

-        return -tab[-q];

-    }

+        if (-q < IQ_TABLE_SIZE)

+            return -tab[-q];

-    if (q >= IQ_TABLE_SIZE)

+        *error = 17;

         return 0;

+    } else {

+        /* tab contains a value for all possible q [0,8192] */

+        if (q < IQ_TABLE_SIZE)

+            return tab[q];

-    /* tab contains a value for all possible q [0,8192] */

-    return tab[q];

+        *error = 17;

+        return 0;

+    }

 #endif

-static void inverse_quantization(real_t *x_invquant, const int16_t *x_quant, const uint16_t frame_len)

+static uint8_t inverse_quantization(real_t *x_invquant, const int16_t *x_quant, const uint16_t frame_len)

     int16_t i;

+    uint8_t error = 0; /* Init error flag */

     const real_t *tab = iq_table;

     for(i = 0; i < frame_len; i+=4)

-        x_invquant[i] = iquant(x_quant[i], tab);

-        x_invquant[i+1] = iquant(x_quant[i+1], tab);

-        x_invquant[i+2] = iquant(x_quant[i+2], tab);

-        x_invquant[i+3] = iquant(x_quant[i+3], tab);

+        x_invquant[i] = iquant(x_quant[i], tab, &error);

+        x_invquant[i+1] = iquant(x_quant[i+1], tab, &error);

+        x_invquant[i+2] = iquant(x_quant[i+2], tab, &error);

+        x_invquant[i+3] = iquant(x_quant[i+3], tab, &error);

+    return error;

+#ifndef FIXED_POINT

 ALIGN static const real_t pow2sf_tab[] = {

     2.9802322387695313E-008, 5.9604644775390625E-008, 1.1920928955078125E-007,

     2.384185791015625E-007, 4.76837158203125E-007, 9.5367431640625E-007,

@@ -540,12 +546,15 @@

     8589934592, 17179869184, 34359738368,

     68719476736, 137438953472, 274877906944

};

+#endif

 ALIGN static real_t pow2_table[] =

+#if 0

     COEF_CONST(0.59460355750136053335874998528024), /* 2^-0.75 */

     COEF_CONST(0.70710678118654752440084436210485), /* 2^-0.5 */

     COEF_CONST(0.84089641525371454303112547623321), /* 2^-0.25 */

+#endif

     COEF_CONST(1.0),

     COEF_CONST(1.1892071150027210667174999705605), /* 2^0.25 */

     COEF_CONST(1.4142135623730950488016887242097), /* 2^0.5 */

@@ -573,10 +582,11 @@

             top = ics->sect_sfb_offset[g][sfb+1];

-            exp = (ics->scale_factors[g][sfb] - 100) >> 2;

-            frac = (ics->scale_factors[g][sfb] - 100) & 3;

+            exp = (ics->scale_factors[g][sfb] /* - 100 */) >> 2;

+            frac = (ics->scale_factors[g][sfb] /* - 100 */) & 3;

 #ifdef FIXED_POINT

+            exp -= 25;

             /* IMDCT pre-scaling */

             if (hDecoder->object_type == LD)

@@ -606,16 +616,16 @@

                     x_invquant[k+(groups*nshort)+3] <<= exp;

 #else

-                x_invquant[k+(groups*nshort)]   = x_invquant[k+(groups*nshort)]   * pow2sf_tab[exp+25];

-                x_invquant[k+(groups*nshort)+1] = x_invquant[k+(groups*nshort)+1] * pow2sf_tab[exp+25];

-                x_invquant[k+(groups*nshort)+2] = x_invquant[k+(groups*nshort)+2] * pow2sf_tab[exp+25];

-                x_invquant[k+(groups*nshort)+3] = x_invquant[k+(groups*nshort)+3] * pow2sf_tab[exp+25];

+                x_invquant[k+(groups*nshort)]   = x_invquant[k+(groups*nshort)]   * pow2sf_tab[exp/*+25*/];

+                x_invquant[k+(groups*nshort)+1] = x_invquant[k+(groups*nshort)+1] * pow2sf_tab[exp/*+25*/];

+                x_invquant[k+(groups*nshort)+2] = x_invquant[k+(groups*nshort)+2] * pow2sf_tab[exp/*+25*/];

+                x_invquant[k+(groups*nshort)+3] = x_invquant[k+(groups*nshort)+3] * pow2sf_tab[exp/*+25*/];

 #endif

-                x_invquant[k+(groups*nshort)]   = MUL_C(x_invquant[k+(groups*nshort)],pow2_table[frac + 3]);

-                x_invquant[k+(groups*nshort)+1] = MUL_C(x_invquant[k+(groups*nshort)+1],pow2_table[frac + 3]);

-                x_invquant[k+(groups*nshort)+2] = MUL_C(x_invquant[k+(groups*nshort)+2],pow2_table[frac + 3]);

-                x_invquant[k+(groups*nshort)+3] = MUL_C(x_invquant[k+(groups*nshort)+3],pow2_table[frac + 3]);

+                x_invquant[k+(groups*nshort)]   = MUL_C(x_invquant[k+(groups*nshort)],pow2_table[frac /* + 3*/]);

+                x_invquant[k+(groups*nshort)+1] = MUL_C(x_invquant[k+(groups*nshort)+1],pow2_table[frac /* + 3*/]);

+                x_invquant[k+(groups*nshort)+2] = MUL_C(x_invquant[k+(groups*nshort)+2],pow2_table[frac /* + 3*/]);

+                x_invquant[k+(groups*nshort)+3] = MUL_C(x_invquant[k+(groups*nshort)+3],pow2_table[frac /* + 3*/]);

         groups += ics->window_group_length[g];

@@ -644,16 +654,16 @@

             top = ics->sect_sfb_offset[g][sfb+1];

-            exp = (ics->scale_factors[g][sfb] - 100) >> 2;

-            frac = (ics->scale_factors[g][sfb] - 100) & 3;

+            exp = (ics->scale_factors[g][sfb] /* - 100 */) >> 2;

+            frac = (ics->scale_factors[g][sfb] /* - 100 */) & 3;

             /* minimum size of a sf band is 4 and always a multiple of 4 */

             for ( ; k < top; k += 4)

                 __m128 m1 = _mm_load_ps(&x_invquant[k+(groups*nshort)]);

-                __m128 m2 = _mm_load_ps1(&pow2sf_tab[exp+25]);

+                __m128 m2 = _mm_load_ps1(&pow2sf_tab[exp /*+25*/]);

+                __m128 m3 = _mm_load_ps1(&pow2_table[frac /* + 3*/]);

                 __m128 m4 = _mm_mul_ps(m1, m2);

-                __m128 m3 = _mm_load_ps1(&pow2_table[frac + 3]);

                 __m128 m5 = _mm_mul_ps(m3, m4);

                 _mm_store_ps(&x_invquant[k+(groups*nshort)], m5);

@@ -663,9 +673,10 @@

 #endif

-void reconstruct_single_channel(faacDecHandle hDecoder, ic_stream *ics,

-                                element *sce, int16_t *spec_data)

+uint8_t reconstruct_single_channel(faacDecHandle hDecoder, ic_stream *ics,

+                                   element *sce, int16_t *spec_data)

+    uint8_t retval;

     ALIGN real_t spec_coef[1024];

 #ifdef PROFILE

@@ -673,7 +684,9 @@

 #endif

     /* inverse quantization */

-    inverse_quantization(spec_coef, spec_data, hDecoder->frameLength);

+    retval = inverse_quantization(spec_coef, spec_data, hDecoder->frameLength);

+    if (retval > 0)

+        return retval;

     /* apply scalefactors */

 #ifndef USE_SSE

@@ -682,16 +695,16 @@

     hDecoder->apply_sf_func(hDecoder, ics, spec_coef, hDecoder->frameLength);

 #endif

+    /* deinterleave short block grouping */

+    if (ics->window_sequence == EIGHT_SHORT_SEQUENCE)

+        quant_to_spec(ics, spec_coef, hDecoder->frameLength);

 #ifdef PROFILE

     count = faad_get_ts() - count;

     hDecoder->requant_cycles += count;

 #endif

-    /* deinterleave short block grouping */

-    if (ics->window_sequence == EIGHT_SHORT_SEQUENCE)

-        quant_to_spec(ics, spec_coef, hDecoder->frameLength);

     /* pns decoding */

     pns_decode(ics, NULL, spec_coef, NULL, hDecoder->frameLength, 0, hDecoder->object_type);

@@ -810,11 +823,14 @@

             hDecoder->time_out[sce->channel]+hDecoder->frameLength, hDecoder->frameLength, hDecoder->object_type);

 #endif

+    return 0;

-void reconstruct_channel_pair(faacDecHandle hDecoder, ic_stream *ics1, ic_stream *ics2,

-                              element *cpe, int16_t *spec_data1, int16_t *spec_data2)

+uint8_t reconstruct_channel_pair(faacDecHandle hDecoder, ic_stream *ics1, ic_stream *ics2,

+                                 element *cpe, int16_t *spec_data1, int16_t *spec_data2)

+    uint8_t retval;

     ALIGN real_t spec_coef1[1024];

     ALIGN real_t spec_coef2[1024];

@@ -823,9 +839,14 @@

 #endif

     /* inverse quantization */

-    inverse_quantization(spec_coef1, spec_data1, hDecoder->frameLength);

-    inverse_quantization(spec_coef2, spec_data2, hDecoder->frameLength);

+    retval = inverse_quantization(spec_coef1, spec_data1, hDecoder->frameLength);

+    if (retval > 0)

+        return retval;

+    retval = inverse_quantization(spec_coef2, spec_data2, hDecoder->frameLength);

+    if (retval > 0)

+        return retval;

     /* apply scalefactors */

 #ifndef USE_SSE

     apply_scalefactors(hDecoder, ics1, spec_coef1, hDecoder->frameLength);

@@ -835,11 +856,6 @@

     hDecoder->apply_sf_func(hDecoder, ics2, spec_coef2, hDecoder->frameLength);

 #endif

-#ifdef PROFILE

-    count = faad_get_ts() - count;

-    hDecoder->requant_cycles += count;

-#endif

     /* deinterleave short block grouping */

     if (ics1->window_sequence == EIGHT_SHORT_SEQUENCE)

         quant_to_spec(ics1, spec_coef1, hDecoder->frameLength);

@@ -846,7 +862,12 @@

     if (ics2->window_sequence == EIGHT_SHORT_SEQUENCE)

         quant_to_spec(ics2, spec_coef2, hDecoder->frameLength);

+#ifdef PROFILE

+    count = faad_get_ts() - count;

+    hDecoder->requant_cycles += count;

+#endif

     /* pns decoding */

     if (ics1->ms_mask_present)

@@ -1036,4 +1057,6 @@

             hDecoder->object_type);

 #endif

+    return 0;

--- a/libfaad/specrec.h

+++ b/libfaad/specrec.h

@@ -22,7 +22,7 @@

 ** Commercial non-GPL licensing of this software is possible.

 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.

**

-** $Id: specrec.h,v 1.21 2003/12/17 14:43:16 menno Exp $

+** $Id: specrec.h,v 1.22 2003/12/23 18:41:42 menno Exp $

**/

 #ifndef __SPECREC_H__

@@ -36,7 +36,7 @@

 uint8_t window_grouping_info(faacDecHandle hDecoder, ic_stream *ics);

 static void quant_to_spec(ic_stream *ics, real_t *spec_data, uint16_t frame_len);

-static void inverse_quantization(real_t *x_invquant, const int16_t *x_quant, const uint16_t frame_len);

+static uint8_t inverse_quantization(real_t *x_invquant, const int16_t *x_quant, const uint16_t frame_len);

 void apply_scalefactors(faacDecHandle hDecoder, ic_stream *ics, real_t *x_invquant,

                         uint16_t frame_len);

 #ifdef USE_SSE

@@ -43,9 +43,9 @@

 void apply_scalefactors_sse(faacDecHandle hDecoder, ic_stream *ics, real_t *x_invquant,

                             uint16_t frame_len);

 #endif

-void reconstruct_channel_pair(faacDecHandle hDecoder, ic_stream *ics1, ic_stream *ics2,

-                              element *cpe, int16_t *spec_data1, int16_t *spec_data2);

-void reconstruct_single_channel(faacDecHandle hDecoder, ic_stream *ics, element *sce,

+uint8_t reconstruct_channel_pair(faacDecHandle hDecoder, ic_stream *ics1, ic_stream *ics2,

+                                 element *cpe, int16_t *spec_data1, int16_t *spec_data2);

+uint8_t reconstruct_single_channel(faacDecHandle hDecoder, ic_stream *ics, element *sce,

                                 int16_t *spec_data);

 #ifdef __cplusplus

--- a/libfaad/syntax.c

+++ b/libfaad/syntax.c

@@ -22,7 +22,7 @@

 ** Commercial non-GPL licensing of this software is possible.

 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.

**

-** $Id: syntax.c,v 1.62 2003/12/17 14:43:16 menno Exp $

+** $Id: syntax.c,v 1.63 2003/12/23 18:41:42 menno Exp $

**/

/*

@@ -530,7 +530,9 @@

         return retval;

     /* noiseless coding is done, spectral reconstruction is done now */

-    reconstruct_single_channel(hDecoder, ics, &sce, spec_data);

+    retval = reconstruct_single_channel(hDecoder, ics, &sce, spec_data);

+    if (retval > 0)

+        return retval;

     return 0;

@@ -581,7 +583,10 @@

             if ((ics1->ltp.data_present = faad_get1bit(ld

                 DEBUGVAR(1,50,"channel_pair_element(): ltp.data_present"))) & 1)

-                ltp_data(hDecoder, ics1, &(ics1->ltp), ld);

+                if ((result = ltp_data(hDecoder, ics1, &(ics1->ltp), ld)) > 0)

+                {

+                    return result;

+                }

 #endif

@@ -604,7 +609,10 @@

         if ((ics1->ltp2.data_present = faad_get1bit(ld

             DEBUGVAR(1,50,"channel_pair_element(): ltp.data_present"))) & 1)

-            ltp_data(hDecoder, ics1, &(ics1->ltp2), ld);

+            if ((result = ltp_data(hDecoder, ics1, &(ics1->ltp2), ld)) > 0)

+            {

+                return result;

+            }

 #endif

@@ -616,7 +624,11 @@

     /* noiseless coding is done, spectral reconstruction is done now */

-    reconstruct_channel_pair(hDecoder, ics1, ics2, &cpe, spec_data1, spec_data2);

+    if ((result = reconstruct_channel_pair(hDecoder, ics1, ics2, &cpe,

+        spec_data1, spec_data2)) > 0)

+    {

+        return result;

+    }

     return 0;

@@ -685,7 +697,10 @@

                     if ((ics->ltp.data_present = faad_get1bit(ld

                         DEBUGVAR(1,50,"ics_info(): ltp.data_present"))) & 1)

-                        ltp_data(hDecoder, ics, &(ics->ltp), ld);

+                        if ((retval = ltp_data(hDecoder, ics, &(ics->ltp), ld)) > 0)

+                        {

+                            return retval;

+                        }

                     if (common_window)

@@ -692,7 +707,10 @@

                         if ((ics->ltp2.data_present = faad_get1bit(ld

                             DEBUGVAR(1,51,"ics_info(): ltp2.data_present"))) & 1)

-                            ltp_data(hDecoder, ics, &(ics->ltp2), ld);

+                            if ((retval = ltp_data(hDecoder, ics, &(ics->ltp2), ld)) > 0)

+                            {

+                                return retval;

+                            }

@@ -1064,9 +1082,13 @@

     if (this_layer_stereo)

-        reconstruct_channel_pair(hDecoder, ics1, ics2, &cpe, spec_data1, spec_data2);

+        hInfo->error = reconstruct_channel_pair(hDecoder, ics1, ics2, &cpe, spec_data1, spec_data2);

+        if (hInfo->error > 0)

+            return;

     } else {

-        reconstruct_single_channel(hDecoder, ics1, &cpe, spec_data1);

+        hInfo->error = reconstruct_single_channel(hDecoder, ics1, &cpe, spec_data1);

+        if (hInfo->error > 0)

+            return;

     hDecoder->element_id[hDecoder->fr_ch_ele] = cpe.ele_id;

@@ -1173,7 +1195,10 @@

             if ((ics->ltp.data_present = faad_get1bit(ld

                 DEBUGVAR(1,310,"aac_scalable_main_header(): ltp.data_present"))) & 1)

-                ltp_data(hDecoder, ics, &(ics->ltp), ld);

+                if ((retval = ltp_data(hDecoder, ics, &(ics->ltp), ld)) > 0)

+                {

+                    return retval;

+                }

 #if 0

@@ -1594,7 +1619,7 @@

 #ifdef LTP_DEC

 /* Table 4.4.28 */

-static void ltp_data(faacDecHandle hDecoder, ic_stream *ics, ltp_info *ltp, bitfile *ld)

+static uint8_t ltp_data(faacDecHandle hDecoder, ic_stream *ics, ltp_info *ltp, bitfile *ld)

     uint8_t sfb, w;

@@ -1621,7 +1646,7 @@

     /* Check length of lag */

     if (ltp->lag > (hDecoder->frameLength << 1))

-        ltp->lag = 0; // FIXME: Error handling

+        return 18;

     ltp->coef = (uint8_t)faad_getbits(ld, 3

         DEBUGVAR(1,82,"ltp_data(): coef"));

@@ -1651,6 +1676,8 @@

                 DEBUGVAR(1,86,"ltp_data(): long_used"));

+    return 0;

 #endif

--- a/libfaad/syntax.h

+++ b/libfaad/syntax.h

@@ -22,7 +22,7 @@

 ** Commercial non-GPL licensing of this software is possible.

 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.

**

-** $Id: syntax.h,v 1.43 2003/12/17 14:43:17 menno Exp $

+** $Id: syntax.h,v 1.44 2003/12/23 18:41:42 menno Exp $

**/

 #ifndef __SYNTAX_H__

@@ -149,7 +149,7 @@

 #endif

 static uint8_t pulse_data(ic_stream *ics, pulse_info *pul, bitfile *ld);

 static void tns_data(ic_stream *ics, tns_info *tns, bitfile *ld);

-static void ltp_data(faacDecHandle hDecoder, ic_stream *ics, ltp_info *ltp, bitfile *ld);

+static uint8_t ltp_data(faacDecHandle hDecoder, ic_stream *ics, ltp_info *ltp, bitfile *ld);

 static uint8_t adts_fixed_header(adts_header *adts, bitfile *ld);

 static void adts_variable_header(adts_header *adts, bitfile *ld);

 static void adts_error_check(adts_header *adts, bitfile *ld);

--

⑨