shithub: aacdec

Download patch

ref: 922513e1151f9cfa68768e4d7d0d914d5b1248a2
parent: 178d2f4e52662b325a770a58dee562f7461e9ce0
author: menno <menno>
date: Tue Dec 23 13:41:42 EST 2003

cfft optimised with SSE
output.c optimised (helps specifically with ICL)
some error handling changes for DRM

--- a/libfaad/cfft.c
+++ b/libfaad/cfft.c
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: cfft.c,v 1.21 2003/12/17 14:43:16 menno Exp $
+** $Id: cfft.c,v 1.22 2003/12/23 18:41:42 menno Exp $
 **/
 
 /*
@@ -47,8 +47,9 @@
    passf2, passf3, passf4, passf5. Complex FFT passes fwd and bwd.
   ----------------------------------------------------------------------*/
 
-static void passf2(const uint16_t ido, const uint16_t l1, const complex_t *cc,
-                   complex_t *ch, const complex_t *wa, const int8_t isign)
+#ifdef USE_SSE
+static void passf2pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+                          complex_t *ch, const complex_t *wa)
 {
     uint16_t i, k, ah, ac;
 
@@ -60,51 +61,146 @@
             ac = 4*k;
 
             RE(ch[ah])    = RE(cc[ac]) + RE(cc[ac+1]);
-            RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
             IM(ch[ah])    = IM(cc[ac]) + IM(cc[ac+1]);
+
+            RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
             IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
         }
     } else {
-        if (isign == 1)
+        for (k = 0; k < l1; k++)
         {
-            for (k = 0; k < l1; k++)
+            ah = k*ido;
+            ac = 2*k*ido;
+
+            for (i = 0; i < ido; i+=4)
             {
-                ah = k*ido;
-                ac = 2*k*ido;
+                __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14;
+                __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24;
+                __m128 w1, w2, w3, w4;
 
-                for (i = 0; i < ido; i++)
-                {
-                    complex_t t2;
+                m1 = _mm_load_ps(&RE(cc[ac+i]));
+                m2 = _mm_load_ps(&RE(cc[ac+ido+i]));
+                m5 = _mm_load_ps(&RE(cc[ac+i+2]));
+                m6 = _mm_load_ps(&RE(cc[ac+ido+i+2]));
+                w1 = _mm_load_ps(&RE(wa[i]));
+                w3 = _mm_load_ps(&RE(wa[i+2]));
 
-                    RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);
-                    RE(t2)       = RE(cc[ac+i]) - RE(cc[ac+i+ido]);
+                m3 = _mm_add_ps(m1, m2);
+                m15 = _mm_add_ps(m5, m6);
 
-                    IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
-                    IM(t2)       = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
+                m4 = _mm_sub_ps(m1, m2);
+                m16 = _mm_sub_ps(m5, m6);
 
-                    ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
-                        IM(t2), RE(t2), RE(wa[i]), IM(wa[i]));
-                }
+                _mm_store_ps(&RE(ch[ah+i]), m3);
+                _mm_store_ps(&RE(ch[ah+i+2]), m15);
+
+                w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
+                w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
+
+                m7 = _mm_mul_ps(m4, w1);
+                m17 = _mm_mul_ps(m16, w3);
+                m8 = _mm_mul_ps(m4, w2);
+                m18 = _mm_mul_ps(m16, w4);
+
+                m9  = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
+                m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0));
+                m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
+                m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1));
+
+                m11 = _mm_add_ps(m9, m10);
+                m21 = _mm_add_ps(m19, m20);
+                m12 = _mm_sub_ps(m9, m10);
+                m22 = _mm_sub_ps(m19, m20);
+
+                m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
+                m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2));
+
+                m14 = _mm_unpacklo_ps(m12, m13);
+                m24 = _mm_unpacklo_ps(m22, m23);
+
+                _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
+                _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24);
             }
-        } else {
-            for (k = 0; k < l1; k++)
-            {
-                ah = k*ido;
-                ac = 2*k*ido;
+        }
+    }
+}
+#endif
 
-                for (i = 0; i < ido; i++)
-                {
-                    complex_t t2;
+static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+                      complex_t *ch, const complex_t *wa)
+{
+    uint16_t i, k, ah, ac;
 
-                    RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);
-                    RE(t2)       = RE(cc[ac+i]) - RE(cc[ac+i+ido]);
+    if (ido == 1)
+    {
+        for (k = 0; k < l1; k++)
+        {
+            ah = 2*k;
+            ac = 4*k;
 
-                    IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
-                    IM(t2)       = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
+            RE(ch[ah])    = RE(cc[ac]) + RE(cc[ac+1]);
+            RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
+            IM(ch[ah])    = IM(cc[ac]) + IM(cc[ac+1]);
+            IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
+        }
+    } else {
+        for (k = 0; k < l1; k++)
+        {
+            ah = k*ido;
+            ac = 2*k*ido;
 
-                    ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
-                        RE(t2), IM(t2), RE(wa[i]), IM(wa[i]));
-                }
+            for (i = 0; i < ido; i++)
+            {
+                complex_t t2;
+
+                RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);
+                RE(t2)       = RE(cc[ac+i]) - RE(cc[ac+i+ido]);
+
+                IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
+                IM(t2)       = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
+
+                ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
+                    IM(t2), RE(t2), RE(wa[i]), IM(wa[i]));
+            }
+        }
+    }
+}
+
+static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+                      complex_t *ch, const complex_t *wa)
+{
+    uint16_t i, k, ah, ac;
+
+    if (ido == 1)
+    {
+        for (k = 0; k < l1; k++)
+        {
+            ah = 2*k;
+            ac = 4*k;
+
+            RE(ch[ah])    = RE(cc[ac]) + RE(cc[ac+1]);
+            RE(ch[ah+l1]) = RE(cc[ac]) - RE(cc[ac+1]);
+            IM(ch[ah])    = IM(cc[ac]) + IM(cc[ac+1]);
+            IM(ch[ah+l1]) = IM(cc[ac]) - IM(cc[ac+1]);
+        }
+    } else {
+        for (k = 0; k < l1; k++)
+        {
+            ah = k*ido;
+            ac = 2*k*ido;
+
+            for (i = 0; i < ido; i++)
+            {
+                complex_t t2;
+
+                RE(ch[ah+i]) = RE(cc[ac+i]) + RE(cc[ac+i+ido]);
+                RE(t2)       = RE(cc[ac+i]) - RE(cc[ac+i+ido]);
+
+                IM(ch[ah+i]) = IM(cc[ac+i]) + IM(cc[ac+i+ido]);
+                IM(t2)       = IM(cc[ac+i]) - IM(cc[ac+i+ido]);
+
+                ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
+                    RE(t2), IM(t2), RE(wa[i]), IM(wa[i]));
             }
         }
     }
@@ -234,153 +330,315 @@
     }
 }
 
-static void passf4(const uint16_t ido, const uint16_t l1, const complex_t *cc,
-                   complex_t *ch, const complex_t *wa1, const complex_t *wa2,
-                   const complex_t *wa3, const int8_t isign)
+#ifdef USE_SSE
+static void passf4pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+                          complex_t *ch, const complex_t *wa1, const complex_t *wa2,
+                          const complex_t *wa3)
 {
     uint16_t i, k, ac, ah;
 
     if (ido == 1)
     {
-        if (isign == 1)
+        for (k = 0; k < l1; k+=2)
         {
-            for (k = 0; k < l1; k++)
-            {
-                complex_t t1, t2, t3, t4;
+            __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10;
+            __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, n10;
+            __m128 neg1 = _mm_set_ps(-1.0, 1.0, 1.0, 1.0);
 
-                ac = 4*k;
-                ah = k;
+            m1 = _mm_load_ps(&RE(cc[4*k]));
+            m2 = _mm_load_ps(&RE(cc[4*k+2]));
+            n1 = _mm_load_ps(&RE(cc[4*k+4]));
+            n2 = _mm_load_ps(&RE(cc[4*k+6]));
 
-                RE(t2) = RE(cc[ac])   + RE(cc[ac+2]);
-                RE(t1) = RE(cc[ac])   - RE(cc[ac+2]);
-                IM(t2) = IM(cc[ac])   + IM(cc[ac+2]);
-                IM(t1) = IM(cc[ac])   - IM(cc[ac+2]);
-                RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);
-                IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);
-                IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);
-                RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);
+            m3 = _mm_add_ps(m1, m2);
 
-                RE(ch[ah])      = RE(t2) + RE(t3);
-                RE(ch[ah+2*l1]) = RE(t2) - RE(t3);
+            n4 = _mm_mul_ps(neg1, n1);
+            n5 = _mm_mul_ps(neg1, n2);
+            m4 = _mm_mul_ps(neg1, m1);
+            m5 = _mm_mul_ps(neg1, m2);
 
-                IM(ch[ah])      = IM(t2) + IM(t3);
-                IM(ch[ah+2*l1]) = IM(t2) - IM(t3);
+            n3 = _mm_add_ps(n1, n2);
+            m6 = _mm_sub_ps(m4, m5);
 
-                RE(ch[ah+l1])   = RE(t1) + RE(t4);
-                RE(ch[ah+3*l1]) = RE(t1) - RE(t4);
+            m7 = _mm_shuffle_ps(m3, n3, _MM_SHUFFLE(1, 0, 1, 0));
+            n6 = _mm_sub_ps(n4, n5);
+            m8 = _mm_shuffle_ps(m3, n3, _MM_SHUFFLE(3, 2, 3, 2));
 
-                IM(ch[ah+l1])   = IM(t1) + IM(t4);
-                IM(ch[ah+3*l1]) = IM(t1) - IM(t4);
-            }
-        } else {
-            for (k = 0; k < l1; k++)
+            n7 = _mm_shuffle_ps(m6, n6, _MM_SHUFFLE(1, 0, 1, 0));
+            m9 = _mm_add_ps(m7, m8);
+            n8 = _mm_shuffle_ps(m6, n6, _MM_SHUFFLE(2, 3, 2, 3));
+
+            m10 = _mm_sub_ps(m7, m8);
+            n9 = _mm_add_ps(n7, n8);
+
+            _mm_store_ps(&RE(ch[k]), m9);
+            n10 = _mm_sub_ps(n7, n8);
+            _mm_store_ps(&RE(ch[k+l1]), n9);
+            _mm_store_ps(&RE(ch[k+2*l1]), m10);
+            _mm_store_ps(&RE(ch[k+3*l1]), n10);
+        }
+    } else {
+        for (k = 0; k < l1; k++)
+        {
+            ac = 4*k*ido;
+            ah = k*ido;
+
+            for (i = 0; i < ido; i+=2)
             {
-                complex_t t1, t2, t3, t4;
+                __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15, m16;
+                __m128 n1, n2, n3, n4, n5, n6, n7, n8, n9, m17, m18, m19, m20, m21, m22, m23;
+                __m128 w1, w2, w3, w4, w5, w6, m24, m25, m26, m27, m28, m29, m30;
+                __m128 neg1 = _mm_set_ps(-1.0, 1.0, -1.0, 1.0);
 
-                ac = 4*k;
-                ah = k;
+                m1 = _mm_load_ps(&RE(cc[ac+i]));
+                m2 = _mm_load_ps(&RE(cc[ac+i+2*ido]));
+                m3 = _mm_add_ps(m1, m2);
+                m4 = _mm_sub_ps(m1, m2);
 
-                RE(t2) = RE(cc[ac])   + RE(cc[ac+2]);
-                RE(t1) = RE(cc[ac])   - RE(cc[ac+2]);
-                IM(t2) = IM(cc[ac])   + IM(cc[ac+2]);
-                IM(t1) = IM(cc[ac])   - IM(cc[ac+2]);
-                RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);
-                IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);
-                IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);
-                RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);
+                n1 = _mm_load_ps(&RE(cc[ac+i+ido]));
+                n2 = _mm_load_ps(&RE(cc[ac+i+3*ido]));
+                n3 = _mm_add_ps(n1, n2);
 
-                RE(ch[ah])      = RE(t2) + RE(t3);
-                RE(ch[ah+2*l1]) = RE(t2) - RE(t3);
+                n4 = _mm_mul_ps(neg1, n1);
+                n5 = _mm_mul_ps(neg1, n2);
+                n6 = _mm_sub_ps(n4, n5);
 
-                IM(ch[ah])      = IM(t2) + IM(t3);
-                IM(ch[ah+2*l1]) = IM(t2) - IM(t3);
+                m5 = _mm_add_ps(m3, n3);
 
-                RE(ch[ah+l1])   = RE(t1) - RE(t4);
-                RE(ch[ah+3*l1]) = RE(t1) + RE(t4);
+                n7 = _mm_shuffle_ps(n6, n6, _MM_SHUFFLE(2, 3, 0, 1));
+                n8 = _mm_add_ps(m4, n7);
 
-                IM(ch[ah+l1])   = IM(t1) - IM(t4);
-                IM(ch[ah+3*l1]) = IM(t1) + IM(t4);
+                m6 = _mm_sub_ps(m3, n3);
+                n9 = _mm_sub_ps(m4, n7);
+
+                _mm_store_ps(&RE(ch[ah+i]), m5);
+
+#if 0
+  static INLINE void ComplexMult(real_t *y1, real_t *y2,
+      real_t x1, real_t x2, real_t c1, real_t c2)
+  {
+      *y1 = MUL_F(x1, c1) + MUL_F(x2, c2);
+      *y2 = MUL_F(x2, c1) - MUL_F(x1, c2);
+  }
+
+  m7.0 = RE(c2)*RE(wa1[i])
+  m7.1 = IM(c2)*IM(wa1[i])
+  m7.2 = RE(c6)*RE(wa1[i+1])
+  m7.3 = IM(c6)*IM(wa1[i+1])
+
+  m8.0 = RE(c2)*IM(wa1[i])
+  m8.1 = IM(c2)*RE(wa1[i])
+  m8.2 = RE(c6)*IM(wa1[i+1])
+  m8.3 = IM(c6)*RE(wa1[i+1])
+
+  RE(0) = m7.0 - m7.1
+  IM(0) = m8.0 + m8.1
+  RE(1) = m7.2 - m7.3
+  IM(1) = m8.2 + m8.3
+
+////
+  RE(0) = RE(c2)*RE(wa1[i])   - IM(c2)*IM(wa1[i])
+  IM(0) = RE(c2)*IM(wa1[i])   + IM(c2)*RE(wa1[i])
+  RE(1) = RE(c6)*RE(wa1[i+1]) - IM(c6)*IM(wa1[i+1])
+  IM(1) = RE(c6)*IM(wa1[i+1]) + IM(c6)*RE(wa1[i+1])
+#endif
+
+                w1 = _mm_load_ps(&RE(wa1[i]));
+                w3 = _mm_load_ps(&RE(wa2[i]));
+                w5 = _mm_load_ps(&RE(wa3[i]));
+
+                w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
+                w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));
+                w6 = _mm_shuffle_ps(w5, w5, _MM_SHUFFLE(2, 3, 0, 1));
+
+                m7 = _mm_mul_ps(n8, w1);
+                m15 = _mm_mul_ps(m6, w3);
+                m23 = _mm_mul_ps(n9, w5);
+                m8 = _mm_mul_ps(n8, w2);
+                m16 = _mm_mul_ps(m6, w4);
+                m24 = _mm_mul_ps(n9, w6);
+
+                m9  = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
+                m17 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(2, 0, 2, 0));
+                m25 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(2, 0, 2, 0));
+                m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
+                m18 = _mm_shuffle_ps(m15, m16, _MM_SHUFFLE(3, 1, 3, 1));
+                m26 = _mm_shuffle_ps(m23, m24, _MM_SHUFFLE(3, 1, 3, 1));
+
+                m11 = _mm_add_ps(m9, m10);
+                m19 = _mm_add_ps(m17, m18);
+                m27 = _mm_add_ps(m25, m26);
+                m12 = _mm_sub_ps(m9, m10);
+                m20 = _mm_sub_ps(m17, m18);
+                m28 = _mm_sub_ps(m25, m26);
+
+                m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
+                m21 = _mm_shuffle_ps(m19, m19, _MM_SHUFFLE(0, 0, 3, 2));
+                m29 = _mm_shuffle_ps(m27, m27, _MM_SHUFFLE(0, 0, 3, 2));
+                m14 = _mm_unpacklo_ps(m12, m13);
+                m22 = _mm_unpacklo_ps(m20, m21);
+                m30 = _mm_unpacklo_ps(m28, m29);
+
+                _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
+                _mm_store_ps(&RE(ch[ah+i+2*l1*ido]), m22);
+                _mm_store_ps(&RE(ch[ah+i+3*l1*ido]), m30);
             }
         }
+    }
+}
+#endif
+
+static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+                      complex_t *ch, const complex_t *wa1, const complex_t *wa2,
+                      const complex_t *wa3)
+{
+    uint16_t i, k, ac, ah;
+
+    if (ido == 1)
+    {
+        for (k = 0; k < l1; k++)
+        {
+            complex_t t1, t2, t3, t4;
+
+            ac = 4*k;
+            ah = k;
+
+            RE(t2) = RE(cc[ac])   + RE(cc[ac+2]);
+            RE(t1) = RE(cc[ac])   - RE(cc[ac+2]);
+            IM(t2) = IM(cc[ac])   + IM(cc[ac+2]);
+            IM(t1) = IM(cc[ac])   - IM(cc[ac+2]);
+            RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);
+            IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);
+            IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);
+            RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);
+
+            RE(ch[ah])      = RE(t2) + RE(t3);
+            RE(ch[ah+2*l1]) = RE(t2) - RE(t3);
+
+            IM(ch[ah])      = IM(t2) + IM(t3);
+            IM(ch[ah+2*l1]) = IM(t2) - IM(t3);
+
+            RE(ch[ah+l1])   = RE(t1) + RE(t4);
+            RE(ch[ah+3*l1]) = RE(t1) - RE(t4);
+
+            IM(ch[ah+l1])   = IM(t1) + IM(t4);
+            IM(ch[ah+3*l1]) = IM(t1) - IM(t4);
+        }
     } else {
-        if (isign == 1)
+        for (k = 0; k < l1; k++)
         {
-            for (k = 0; k < l1; k++)
+            ac = 4*k*ido;
+            ah = k*ido;
+
+            for (i = 0; i < ido; i++)
             {
-                ac = 4*k*ido;
-                ah = k*ido;
+                complex_t c2, c3, c4, t1, t2, t3, t4;
 
-                for (i = 0; i < ido; i++)
-                {
-                    complex_t c2, c3, c4, t1, t2, t3, t4;
+                RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);
+                RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);
+                IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);
+                IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);
+                RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);
+                IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);
+                IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);
+                RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);
 
-                    RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);
-                    RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);
-                    IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);
-                    IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);
-                    RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);
-                    IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);
-                    IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);
-                    RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);
+                RE(c2) = RE(t1) + RE(t4);
+                RE(c4) = RE(t1) - RE(t4);
 
-                    RE(c2) = RE(t1) + RE(t4);
-                    RE(c4) = RE(t1) - RE(t4);
+                IM(c2) = IM(t1) + IM(t4);
+                IM(c4) = IM(t1) - IM(t4);
 
-                    IM(c2) = IM(t1) + IM(t4);
-                    IM(c4) = IM(t1) - IM(t4);
+                RE(ch[ah+i]) = RE(t2) + RE(t3);
+                RE(c3)       = RE(t2) - RE(t3);
 
-                    RE(ch[ah+i]) = RE(t2) + RE(t3);
-                    RE(c3)       = RE(t2) - RE(t3);
+                IM(ch[ah+i]) = IM(t2) + IM(t3);
+                IM(c3)       = IM(t2) - IM(t3);
 
-                    IM(ch[ah+i]) = IM(t2) + IM(t3);
-                    IM(c3)       = IM(t2) - IM(t3);
-
-                    ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
-                        IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i]));
-                    ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]),
-                        IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i]));
-                    ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]),
-                        IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));
-                }
+                ComplexMult(&IM(ch[ah+i+l1*ido]), &RE(ch[ah+i+l1*ido]),
+                    IM(c2), RE(c2), RE(wa1[i]), IM(wa1[i]));
+                ComplexMult(&IM(ch[ah+i+2*l1*ido]), &RE(ch[ah+i+2*l1*ido]),
+                    IM(c3), RE(c3), RE(wa2[i]), IM(wa2[i]));
+                ComplexMult(&IM(ch[ah+i+3*l1*ido]), &RE(ch[ah+i+3*l1*ido]),
+                    IM(c4), RE(c4), RE(wa3[i]), IM(wa3[i]));
             }
-        } else {
-            for (k = 0; k < l1; k++)
-            {
-                ac = 4*k*ido;
-                ah = k*ido;
+        }
+    }
+}
 
-                for (i = 0; i < ido; i++)
-                {
-                    complex_t c2, c3, c4, t1, t2, t3, t4;
+static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+                      complex_t *ch, const complex_t *wa1, const complex_t *wa2,
+                      const complex_t *wa3)
+{
+    uint16_t i, k, ac, ah;
 
-                    RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);
-                    RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);
-                    IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);
-                    IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);
-                    RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);
-                    IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);
-                    IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);
-                    RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);
+    if (ido == 1)
+    {
+        for (k = 0; k < l1; k++)
+        {
+            complex_t t1, t2, t3, t4;
 
-                    RE(c2) = RE(t1) - RE(t4);
-                    RE(c4) = RE(t1) + RE(t4);
+            ac = 4*k;
+            ah = k;
 
-                    IM(c2) = IM(t1) - IM(t4);
-                    IM(c4) = IM(t1) + IM(t4);
+            RE(t2) = RE(cc[ac])   + RE(cc[ac+2]);
+            RE(t1) = RE(cc[ac])   - RE(cc[ac+2]);
+            IM(t2) = IM(cc[ac])   + IM(cc[ac+2]);
+            IM(t1) = IM(cc[ac])   - IM(cc[ac+2]);
+            RE(t3) = RE(cc[ac+1]) + RE(cc[ac+3]);
+            IM(t4) = RE(cc[ac+1]) - RE(cc[ac+3]);
+            IM(t3) = IM(cc[ac+3]) + IM(cc[ac+1]);
+            RE(t4) = IM(cc[ac+3]) - IM(cc[ac+1]);
 
-                    RE(ch[ah+i]) = RE(t2) + RE(t3);
-                    RE(c3)       = RE(t2) - RE(t3);
+            RE(ch[ah])      = RE(t2) + RE(t3);
+            RE(ch[ah+2*l1]) = RE(t2) - RE(t3);
 
-                    IM(ch[ah+i]) = IM(t2) + IM(t3);
-                    IM(c3)       = IM(t2) - IM(t3);
+            IM(ch[ah])      = IM(t2) + IM(t3);
+            IM(ch[ah+2*l1]) = IM(t2) - IM(t3);
 
-                    ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
-                        RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i]));
-                    ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]),
-                        RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i]));
-                    ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]),
-                        RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));
-                }
+            RE(ch[ah+l1])   = RE(t1) - RE(t4);
+            RE(ch[ah+3*l1]) = RE(t1) + RE(t4);
+
+            IM(ch[ah+l1])   = IM(t1) - IM(t4);
+            IM(ch[ah+3*l1]) = IM(t1) + IM(t4);
+        }
+    } else {
+        for (k = 0; k < l1; k++)
+        {
+            ac = 4*k*ido;
+            ah = k*ido;
+
+            for (i = 0; i < ido; i++)
+            {
+                complex_t c2, c3, c4, t1, t2, t3, t4;
+
+                RE(t2) = RE(cc[ac+i]) + RE(cc[ac+i+2*ido]);
+                RE(t1) = RE(cc[ac+i]) - RE(cc[ac+i+2*ido]);
+                IM(t2) = IM(cc[ac+i]) + IM(cc[ac+i+2*ido]);
+                IM(t1) = IM(cc[ac+i]) - IM(cc[ac+i+2*ido]);
+                RE(t3) = RE(cc[ac+i+ido]) + RE(cc[ac+i+3*ido]);
+                IM(t4) = RE(cc[ac+i+ido]) - RE(cc[ac+i+3*ido]);
+                IM(t3) = IM(cc[ac+i+3*ido]) + IM(cc[ac+i+ido]);
+                RE(t4) = IM(cc[ac+i+3*ido]) - IM(cc[ac+i+ido]);
+
+                RE(c2) = RE(t1) - RE(t4);
+                RE(c4) = RE(t1) + RE(t4);
+
+                IM(c2) = IM(t1) - IM(t4);
+                IM(c4) = IM(t1) + IM(t4);
+
+                RE(ch[ah+i]) = RE(t2) + RE(t3);
+                RE(c3)       = RE(t2) - RE(t3);
+
+                IM(ch[ah+i]) = IM(t2) + IM(t3);
+                IM(c3)       = IM(t2) - IM(t3);
+
+                ComplexMult(&RE(ch[ah+i+l1*ido]), &IM(ch[ah+i+l1*ido]),
+                    RE(c2), IM(c2), RE(wa1[i]), IM(wa1[i]));
+                ComplexMult(&RE(ch[ah+i+2*l1*ido]), &IM(ch[ah+i+2*l1*ido]),
+                    RE(c3), IM(c3), RE(wa2[i]), IM(wa2[i]));
+                ComplexMult(&RE(ch[ah+i+3*l1*ido]), &IM(ch[ah+i+3*l1*ido]),
+                    RE(c4), IM(c4), RE(wa3[i]), IM(wa3[i]));
             }
         }
     }
@@ -584,8 +842,9 @@
    cfftf1, cfftf, cfftb, cffti1, cffti. Complex FFTs.
   ----------------------------------------------------------------------*/
 
-INLINE void cfftf1(uint16_t n, complex_t *c, complex_t *ch,
-                   const uint16_t *ifac, const complex_t *wa, const int8_t isign)
+#ifdef USE_SSE
+INLINE void cfftf1pos_sse(uint16_t n, complex_t *c, complex_t *ch,
+                          const uint16_t *ifac, const complex_t *wa, const int8_t isign)
 {
     uint16_t i;
     uint16_t k1, l1, l2;
@@ -610,17 +869,17 @@
             ix3 = ix2 + ido;
 
             if (na == 0)
-                passf4((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], isign);
+                passf4pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
             else
-                passf4((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], isign);
+                passf4pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
 
             na = 1 - na;
             break;
         case 2:
             if (na == 0)
-                passf2((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], isign);
+                passf2pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
             else
-                passf2((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], isign);
+                passf2pos_sse((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
 
             na = 1 - na;
             break;
@@ -661,16 +920,180 @@
         IM(c[i]) = IM(ch[i]);
     }
 }
+#endif
 
+INLINE void cfftf1pos(uint16_t n, complex_t *c, complex_t *ch,
+                      const uint16_t *ifac, const complex_t *wa, const int8_t isign)
+{
+    uint16_t i;
+    uint16_t k1, l1, l2;
+    uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
+
+    nf = ifac[1];
+    na = 0;
+    l1 = 1;
+    iw = 0;
+
+    for (k1 = 2; k1 <= nf+1; k1++)
+    {
+        ip = ifac[k1];
+        l2 = ip*l1;
+        ido = n / l2;
+        idl1 = ido*l1;
+
+        switch (ip)
+        {
+        case 4:
+            ix2 = iw + ido;
+            ix3 = ix2 + ido;
+
+            if (na == 0)
+                passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
+            else
+                passf4pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
+
+            na = 1 - na;
+            break;
+        case 2:
+            if (na == 0)
+                passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
+            else
+                passf2pos((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
+
+            na = 1 - na;
+            break;
+        case 3:
+            ix2 = iw + ido;
+
+            if (na == 0)
+                passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
+            else
+                passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
+
+            na = 1 - na;
+            break;
+        case 5:
+            ix2 = iw + ido;
+            ix3 = ix2 + ido;
+            ix4 = ix3 + ido;
+
+            if (na == 0)
+                passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+            else
+                passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+
+            na = 1 - na;
+            break;
+        }
+
+        l1 = l2;
+        iw += (ip-1) * ido;
+    }
+
+    if (na == 0)
+        return;
+
+    for (i = 0; i < n; i++)
+    {
+        RE(c[i]) = RE(ch[i]);
+        IM(c[i]) = IM(ch[i]);
+    }
+}
+
+INLINE void cfftf1neg(uint16_t n, complex_t *c, complex_t *ch,
+                      const uint16_t *ifac, const complex_t *wa, const int8_t isign)
+{
+    uint16_t i;
+    uint16_t k1, l1, l2;
+    uint16_t na, nf, ip, iw, ix2, ix3, ix4, ido, idl1;
+
+    nf = ifac[1];
+    na = 0;
+    l1 = 1;
+    iw = 0;
+
+    for (k1 = 2; k1 <= nf+1; k1++)
+    {
+        ip = ifac[k1];
+        l2 = ip*l1;
+        ido = n / l2;
+        idl1 = ido*l1;
+
+        switch (ip)
+        {
+        case 4:
+            ix2 = iw + ido;
+            ix3 = ix2 + ido;
+
+            if (na == 0)
+                passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3]);
+            else
+                passf4neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3]);
+
+            na = 1 - na;
+            break;
+        case 2:
+            if (na == 0)
+                passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw]);
+            else
+                passf2neg((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw]);
+
+            na = 1 - na;
+            break;
+        case 3:
+            ix2 = iw + ido;
+
+            if (na == 0)
+                passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], isign);
+            else
+                passf3((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], isign);
+
+            na = 1 - na;
+            break;
+        case 5:
+            ix2 = iw + ido;
+            ix3 = ix2 + ido;
+            ix4 = ix3 + ido;
+
+            if (na == 0)
+                passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)c, ch, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+            else
+                passf5((const uint16_t)ido, (const uint16_t)l1, (const complex_t*)ch, c, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
+
+            na = 1 - na;
+            break;
+        }
+
+        l1 = l2;
+        iw += (ip-1) * ido;
+    }
+
+    if (na == 0)
+        return;
+
+    for (i = 0; i < n; i++)
+    {
+        RE(c[i]) = RE(ch[i]);
+        IM(c[i]) = IM(ch[i]);
+    }
+}
+
 void cfftf(cfft_info *cfft, complex_t *c)
 {
-    cfftf1(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, -1);
+    cfftf1neg(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, -1);
 }
 
 void cfftb(cfft_info *cfft, complex_t *c)
 {
-    cfftf1(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);
+    cfftf1pos(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);
 }
+
+#ifdef USE_SSE
+void cfftb_sse(cfft_info *cfft, complex_t *c)
+{
+    cfftf1pos_sse(cfft->n, c, cfft->work, (const uint16_t*)cfft->ifac, (const complex_t*)cfft->tab, +1);
+}
+#endif
 
 static void cffti1(uint16_t n, complex_t *wa, uint16_t *ifac)
 {
--- a/libfaad/cfft.h
+++ b/libfaad/cfft.h
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: cfft.h,v 1.14 2003/12/17 14:43:16 menno Exp $
+** $Id: cfft.h,v 1.15 2003/12/23 18:41:42 menno Exp $
 **/
 
 #ifndef __CFFT_H__
@@ -47,13 +47,23 @@
 void cfftu(cfft_info *cfft);
 
 
-static void passf2(const uint16_t ido, const uint16_t l1, const complex_t *cc,
-                   complex_t *ch, const complex_t *wa, const int8_t isign);
+#ifdef USE_SSE
+void cfftb_sse(cfft_info *cfft, complex_t *c);
+static void passf2pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+                          complex_t *ch, const complex_t *wa);
+static void passf4pos_sse(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
+                          const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
+#endif
+static void passf2pos(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+                      complex_t *ch, const complex_t *wa);
+static void passf2neg(const uint16_t ido, const uint16_t l1, const complex_t *cc,
+                      complex_t *ch, const complex_t *wa);
 static void passf3(const uint16_t ido, const uint16_t l1, const complex_t *cc,
                    complex_t *ch, const complex_t *wa1, const complex_t *wa2, const int8_t isign);
-static void passf4(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
-                   const complex_t *wa1, const complex_t *wa2, const complex_t *wa3,
-                   const int8_t isign);
+static void passf4pos(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
+                      const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
+static void passf4neg(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
+                      const complex_t *wa1, const complex_t *wa2, const complex_t *wa3);
 static void passf5(const uint16_t ido, const uint16_t l1, const complex_t *cc, complex_t *ch,
                    const complex_t *wa1, const complex_t *wa2, const complex_t *wa3,
                    const complex_t *wa4, const int8_t isign);
--- a/libfaad/common.c
+++ b/libfaad/common.c
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: common.c,v 1.13 2003/12/17 14:43:16 menno Exp $
+** $Id: common.c,v 1.14 2003/12/23 18:41:42 menno Exp $
 **/
 
 /* just some common functions that could be used anywhere */
@@ -30,37 +30,60 @@
 #include "common.h"
 #include "structs.h"
 
-#include <malloc.h>
 #include <stdlib.h>
 #include "syntax.h"
 
 #ifdef USE_SSE
-uint8_t cpu_has_sse()
+__declspec(naked) static int32_t __fastcall test_cpuid()
 {
-    uint32_t feature;
-
-    __try
+    __asm
     {
-        __asm
-        {
-            xor eax, eax
-            cpuid
-        }
+        pushf
+        pop eax
+        mov ecx,eax
+        xor eax,(1<<21)
+        push eax
+        popf
+        pushf
+        pop eax
+        push ecx
+        popf
+        cmp eax,ecx
+        mov eax,0
+        setne al
+        ret
     }
-    __except (1)
-    {
-        return 0;
-    }
+}
 
+__declspec(naked) static void __fastcall run_cpuid(int32_t param, int32_t out[4])
+{
     __asm
     {
-        mov eax, 1
+        pushad
+        push edx
+        mov eax,ecx
         cpuid
-        mov feature, edx
+        pop edi
+        mov [edi+0],eax
+        mov [edi+4],ebx
+        mov [edi+8],ecx
+        mov [edi+12],edx
+        popad
+        ret
     }
+}
 
+uint8_t cpu_has_sse()
+{
+    int32_t features[4] = { 0, 0, 0, 0 }; /* zeroed: if CPUID is unavailable the SSE bit below reads as clear */
+
+    if (test_cpuid())
+    {
+        run_cpuid(1, features);
+    }
+
     /* check for SSE */
-    if (feature & 0x02000000)
+    if (features[3] & 0x02000000)
         return 1;
 
     return 0;
--- a/libfaad/common.h
+++ b/libfaad/common.h
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: common.h,v 1.40 2003/12/17 16:37:34 menno Exp $
+** $Id: common.h,v 1.41 2003/12/23 18:41:42 menno Exp $
 **/
 
 #ifndef __COMMON_H__
@@ -117,7 +117,7 @@
 # endif
 #endif
 
-#if ((defined(_WIN32) && !defined(_WIN32_WCE)) || ((__GNUC__ >= 3) && defined(i386)))
+#if ((defined(_WIN32) && !defined(_WIN32_WCE)) /* || ((__GNUC__ >= 3) && defined(__i386__)) */ )
 #ifndef FIXED_POINT
 /* includes <xmmintrin.h> to enable SSE intrinsics */
 #define USE_SSE
@@ -300,6 +300,19 @@
             fld   f
             fistp i
         }
+        return i;
+    }
+  #elif (defined(__i386__) && defined(__GNUC__))
+    #define HAS_LRINTF
+    // from http://www.stereopsis.com/FPU.html
+    static INLINE int lrintf(float f)
+    {
+        int i;
+        __asm__ __volatile__ (
+            "flds %1        \n\t"
+            "fistpl %0      \n\t"
+            : "=m" (i)
+            : "m" (f));
         return i;
     }
   #endif
--- a/libfaad/error.c
+++ b/libfaad/error.c
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: error.c,v 1.17 2003/12/17 14:43:16 menno Exp $
+** $Id: error.c,v 1.18 2003/12/23 18:41:42 menno Exp $
 **/
 
 #include "common.h"
@@ -45,5 +45,7 @@
     "Maximum number of bitstream elements exceeded",
     "Input data buffer too small",
     "Array index out of range",
-    "Maximum number of scalefactor bands exceeded"
+    "Maximum number of scalefactor bands exceeded",
+    "Quantised value out of range",
+    "LTP lag out of range"
 };
\ No newline at end of file
--- a/libfaad/error.h
+++ b/libfaad/error.h
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: error.h,v 1.12 2003/12/17 14:43:16 menno Exp $
+** $Id: error.h,v 1.13 2003/12/23 18:41:42 menno Exp $
 **/
 
 #ifndef __ERROR_H__
@@ -32,7 +32,7 @@
 extern "C" {
 #endif
 
-#define NUM_ERROR_MESSAGES 17
+#define NUM_ERROR_MESSAGES 19
 extern int8_t *err_msg[];
 
 #ifdef __cplusplus
--- a/libfaad/mdct.c
+++ b/libfaad/mdct.c
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: mdct.c,v 1.35 2003/12/17 14:43:16 menno Exp $
+** $Id: mdct.c,v 1.36 2003/12/23 18:41:42 menno Exp $
 **/
 
 /*
@@ -192,6 +192,7 @@
 
 #ifdef PROFILE
     mdct->cycles = 0;
+    mdct->fft_cycles = 0;
 #endif
 
     return mdct;
@@ -203,6 +204,7 @@
     {
 #ifdef PROFILE
         printf("MDCT[%.4d]:         %I64d cycles\n", mdct->N, mdct->cycles);
+        printf("CFFT[%.4d]:         %I64d cycles\n", mdct->N/4, mdct->fft_cycles);
 #endif
 
         cfftu(mdct->cfft);
@@ -287,6 +289,7 @@
 
 #ifdef PROFILE
     count2 = faad_get_ts() - count2;
+    mdct->fft_cycles += count1;
     mdct->cycles += (count2 - count1);
 #endif
 }
@@ -359,7 +362,7 @@
 #endif
 
     /* complex IFFT, any non-scaling FFT can be used here */
-    cfftb(mdct->cfft, Z1);
+    cfftb_sse(mdct->cfft, Z1);
 
 #ifdef PROFILE
     count1 = faad_get_ts() - count1;
@@ -445,6 +448,7 @@
 
 #ifdef PROFILE
     count2 = faad_get_ts() - count2;
+    mdct->fft_cycles += count1;
     mdct->cycles += (count2 - count1);
 #endif
 }
--- a/libfaad/output.c
+++ b/libfaad/output.c
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: output.c,v 1.32 2003/12/17 14:43:16 menno Exp $
+** $Id: output.c,v 1.33 2003/12/23 18:41:42 menno Exp $
 **/
 
 #include "common.h"
@@ -39,10 +39,11 @@
 #define DM_MUL REAL_CONST(0.4142135623730950488) // 1/(1+sqrt(2))
 #define RSQRT2 REAL_CONST(0.7071067811865475244) // 1/sqrt(2)
 
+
 static INLINE real_t get_sample(real_t **input, uint8_t channel, uint16_t sample,
-                                uint8_t downMatrix, uint8_t *internal_channel)
+                                uint8_t down_matrix, uint8_t *internal_channel)
 {
-    if (!downMatrix)
+    if (!down_matrix)
         return input[internal_channel[channel]][sample];
 
     if (channel == 0)
@@ -57,127 +58,309 @@
     }
 }
 
-void* output_to_PCM(faacDecHandle hDecoder,
-                    real_t **input, void *sample_buffer, uint8_t channels,
-                    uint16_t frame_len, uint8_t format)
-{
-    uint8_t ch;
-    uint16_t i, j = 0;
-    uint8_t internal_channel;
+#ifndef HAS_LRINTF
+#define CLIP(sample, max, min) \
+if (sample >= 0.0f)            \
+{                              \
+    sample += 0.5f;            \
+    if (sample >= max)         \
+        sample = max;          \
+} else {                       \
+    sample += -0.5f;           \
+    if (sample <= min)         \
+        sample = min;          \
+}
+#else
+#define CLIP(sample, max, min) \
+if (sample >= 0.0f)            \
+{                              \
+    if (sample >= max)         \
+        sample = max;          \
+} else {                       \
+    if (sample <= min)         \
+        sample = min;          \
+}
+#endif
 
-    int16_t   *short_sample_buffer = (int16_t*)sample_buffer;
-    int32_t   *int_sample_buffer = (int32_t*)sample_buffer;
-    float32_t *float_sample_buffer = (float32_t*)sample_buffer;
-    double    *double_sample_buffer = (double*)sample_buffer;
+#define CONV(a,b) (((a)<<1)|((b)&0x1)) /* switch key: first argument shifted left by one, low bit of second argument in bit 0 */
 
-#ifdef PROFILE
-    int64_t count = faad_get_ts();
-#endif
+static void to_PCM_16bit(faacDecHandle hDecoder, real_t **input,
+                         uint8_t channels, uint16_t frame_len,
+                         int16_t **sample_buffer)
+{
+    uint8_t ch, ch1;
+    uint16_t i;
 
-    /* Copy output to a standard PCM buffer */
-    for (ch = 0; ch < channels; ch++)
+    switch (CONV(channels,hDecoder->downMatrix))
     {
-        internal_channel = hDecoder->internal_channel[ch];
+    case CONV(1,0):
+    case CONV(1,1):
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp = input[hDecoder->internal_channel[0]][i];
 
-        switch (format)
+            CLIP(inp, 32767.0f, -32768.0f);
+
+            (*sample_buffer)[i] = (int16_t)lrintf(inp);
+        }
+        break;
+    case CONV(2,0):
+        ch  = hDecoder->internal_channel[0];
+        ch1 = hDecoder->internal_channel[1];
+        for(i = 0; i < frame_len; i++)
         {
-        case FAAD_FMT_16BIT:
+            real_t inp0 = input[ch ][i];
+            real_t inp1 = input[ch1][i];
+
+            CLIP(inp0, 32767.0f, -32768.0f);
+            CLIP(inp1, 32767.0f, -32768.0f);
+
+            (*sample_buffer)[(i*2)+0] = (int16_t)lrintf(inp0);
+            (*sample_buffer)[(i*2)+1] = (int16_t)lrintf(inp1);
+        }
+        break;
+    default:
+        for (ch = 0; ch < channels; ch++)
+        {
             for(i = 0; i < frame_len; i++)
             {
                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
-                if (inp >= 0.0f)
-                {
-#ifndef HAS_LRINTF
-                    inp += 0.5f;
-#endif
-                    if (inp >= 32767.0f)
-                    {
-                        inp = 32767.0f;
-                    }
-                } else {
-#ifndef HAS_LRINTF
-                    inp += -0.5f;
-#endif
-                    if (inp <= -32768.0f)
-                    {
-                        inp = -32768.0f;
-                    }
-                }
-                short_sample_buffer[(i*channels)+ch] = (int16_t)lrintf(inp);
-            }
-            break;
-        case FAAD_FMT_24BIT:
+
+                CLIP(inp, 32767.0f, -32768.0f);
+
+                (*sample_buffer)[(i*channels)+ch] = (int16_t)lrintf(inp);
+            }
+        }
+        break;
+    }
+}
+
+static void to_PCM_24bit(faacDecHandle hDecoder, real_t **input,
+                         uint8_t channels, uint16_t frame_len,
+                         int32_t **sample_buffer)
+{
+    uint8_t ch, ch1;
+    uint16_t i;
+
+    switch (CONV(channels,hDecoder->downMatrix))
+    {
+    case CONV(1,0):
+    case CONV(1,1):
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp = input[hDecoder->internal_channel[0]][i];
+
+            inp *= 256.0f;
+            CLIP(inp, 8388607.0f, -8388608.0f);
+
+            (*sample_buffer)[i] = (int32_t)lrintf(inp);
+        }
+        break;
+    case CONV(2,0):
+        ch  = hDecoder->internal_channel[0];
+        ch1 = hDecoder->internal_channel[1];
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp0 = input[ch ][i];
+            real_t inp1 = input[ch1][i];
+
+            inp0 *= 256.0f;
+            inp1 *= 256.0f;
+            CLIP(inp0, 8388607.0f, -8388608.0f);
+            CLIP(inp1, 8388607.0f, -8388608.0f);
+
+            (*sample_buffer)[(i*2)+0] = (int32_t)lrintf(inp0);
+            (*sample_buffer)[(i*2)+1] = (int32_t)lrintf(inp1);
+        }
+        break;
+    default:
+        for (ch = 0; ch < channels; ch++)
+        {
             for(i = 0; i < frame_len; i++)
             {
                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
+
                 inp *= 256.0f;
-                if (inp >= 0.0f)
-                {
-#ifndef HAS_LRINTF
-                    inp += 0.5f;
-#endif
-                    if (inp >= 8388607.0f)
-                    {
-                        inp = 8388607.0f;
-                    }
-                } else {
-#ifndef HAS_LRINTF
-                    inp += -0.5f;
-#endif
-                    if (inp <= -8388608.0f)
-                    {
-                        inp = -8388608.0f;
-                    }
-                }
-                int_sample_buffer[(i*channels)+ch] = lrintf(inp);
-            }
-            break;
-        case FAAD_FMT_32BIT:
+                CLIP(inp, 8388607.0f, -8388608.0f);
+
+                (*sample_buffer)[(i*channels)+ch] = (int32_t)lrintf(inp);
+            }
+        }
+        break;
+    }
+}
+
+static void to_PCM_32bit(faacDecHandle hDecoder, real_t **input,
+                         uint8_t channels, uint16_t frame_len,
+                         int32_t **sample_buffer)
+{
+    uint8_t ch, ch1;
+    uint16_t i;
+
+    switch (CONV(channels,hDecoder->downMatrix))
+    {
+    case CONV(1,0):
+    case CONV(1,1):
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp = input[hDecoder->internal_channel[0]][i];
+
+            inp *= 65536.0f;
+            CLIP(inp, 2147483520.0f, -2147483648.0f); /* 2147483520 is the largest float below 2^31; 2147483647.0f rounds up to 2^31 and overflows int32_t */
+
+            (*sample_buffer)[i] = (int32_t)lrintf(inp);
+        }
+        break;
+    case CONV(2,0):
+        ch  = hDecoder->internal_channel[0];
+        ch1 = hDecoder->internal_channel[1];
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp0 = input[ch ][i];
+            real_t inp1 = input[ch1][i];
+
+            inp0 *= 65536.0f;
+            inp1 *= 65536.0f;
+            CLIP(inp0, 2147483520.0f, -2147483648.0f); /* upper bound must be a float strictly below 2^31 */
+            CLIP(inp1, 2147483520.0f, -2147483648.0f);
+
+            (*sample_buffer)[(i*2)+0] = (int32_t)lrintf(inp0);
+            (*sample_buffer)[(i*2)+1] = (int32_t)lrintf(inp1);
+        }
+        break;
+    default:
+        for (ch = 0; ch < channels; ch++)
+        {
             for(i = 0; i < frame_len; i++)
             {
                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
+
                 inp *= 65536.0f;
-                if (inp >= 0.0f)
-                {
-#ifndef HAS_LRINTF
-                    inp += 0.5f;
-#endif
-                    if (inp >= 2147483647.0f)
-                    {
-                        inp = 2147483647.0f;
-                    }
-                } else {
-#ifndef HAS_LRINTF
-                    inp += -0.5f;
-#endif
-                    if (inp <= -2147483648.0f)
-                    {
-                        inp = -2147483648.0f;
-                    }
-                }
-                int_sample_buffer[(i*channels)+ch] = lrintf(inp);
-            }
-            break;
-        case FAAD_FMT_FLOAT:
+                CLIP(inp, 2147483520.0f, -2147483648.0f); /* upper bound must be a float strictly below 2^31 */
+
+                (*sample_buffer)[(i*channels)+ch] = (int32_t)lrintf(inp);
+            }
+        }
+        break;
+    }
+}
+
+static void to_PCM_float(faacDecHandle hDecoder, real_t **input,
+                         uint8_t channels, uint16_t frame_len,
+                         float32_t **sample_buffer)
+{
+    uint8_t ch, ch1;
+    uint16_t i;
+
+    switch (CONV(channels,hDecoder->downMatrix))
+    {
+    case CONV(1,0):
+    case CONV(1,1):
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp = input[hDecoder->internal_channel[0]][i];
+            (*sample_buffer)[i] = inp*FLOAT_SCALE;
+        }
+        break;
+    case CONV(2,0):
+        ch  = hDecoder->internal_channel[0];
+        ch1 = hDecoder->internal_channel[1];
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp0 = input[ch ][i];
+            real_t inp1 = input[ch1][i];
+            (*sample_buffer)[(i*2)+0] = inp0*FLOAT_SCALE;
+            (*sample_buffer)[(i*2)+1] = inp1*FLOAT_SCALE;
+        }
+        break;
+    default:
+        for (ch = 0; ch < channels; ch++)
+        {
             for(i = 0; i < frame_len; i++)
             {
-                //real_t inp = input[internal_channel][i];
                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
-                float_sample_buffer[(i*channels)+ch] = inp*FLOAT_SCALE;
+                (*sample_buffer)[(i*channels)+ch] = inp*FLOAT_SCALE;
             }
-            break;
-        case FAAD_FMT_DOUBLE:
+        }
+        break;
+    }
+}
+
+static void to_PCM_double(faacDecHandle hDecoder, real_t **input,
+                          uint8_t channels, uint16_t frame_len,
+                          double **sample_buffer)
+{
+    uint8_t ch, ch1;
+    uint16_t i;
+
+    switch (CONV(channels,hDecoder->downMatrix))
+    {
+    case CONV(1,0):
+    case CONV(1,1):
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp = input[hDecoder->internal_channel[0]][i];
+            (*sample_buffer)[i] = (double)inp*FLOAT_SCALE;
+        }
+        break;
+    case CONV(2,0):
+        ch  = hDecoder->internal_channel[0];
+        ch1 = hDecoder->internal_channel[1];
+        for(i = 0; i < frame_len; i++)
+        {
+            real_t inp0 = input[ch ][i];
+            real_t inp1 = input[ch1][i];
+            (*sample_buffer)[(i*2)+0] = (double)inp0*FLOAT_SCALE;
+            (*sample_buffer)[(i*2)+1] = (double)inp1*FLOAT_SCALE;
+        }
+        break;
+    default:
+        for (ch = 0; ch < channels; ch++)
+        {
             for(i = 0; i < frame_len; i++)
             {
-                //real_t inp = input[internal_channel][i];
                 real_t inp = get_sample(input, ch, i, hDecoder->downMatrix, hDecoder->internal_channel);
-                double_sample_buffer[(i*channels)+ch] = (double)inp*FLOAT_SCALE;
+                (*sample_buffer)[(i*channels)+ch] = (double)inp*FLOAT_SCALE;
             }
-            break;
         }
+        break;
     }
+}
 
+void *output_to_PCM(faacDecHandle hDecoder,
+                    real_t **input, void *sample_buffer, uint8_t channels,
+                    uint16_t frame_len, uint8_t format)
+{
+    int16_t   *short_sample_buffer = (int16_t*)sample_buffer;
+    int32_t   *int_sample_buffer = (int32_t*)sample_buffer;
+    float32_t *float_sample_buffer = (float32_t*)sample_buffer;
+    double    *double_sample_buffer = (double*)sample_buffer;
+
 #ifdef PROFILE
+    int64_t count = faad_get_ts();
+#endif
+
+    /* Copy output to a standard PCM buffer */
+    switch (format)
+    {
+    case FAAD_FMT_16BIT:
+        to_PCM_16bit(hDecoder, input, channels, frame_len, &short_sample_buffer);
+        break;
+    case FAAD_FMT_24BIT:
+        to_PCM_24bit(hDecoder, input, channels, frame_len, &int_sample_buffer);
+        break;
+    case FAAD_FMT_32BIT:
+        to_PCM_32bit(hDecoder, input, channels, frame_len, &int_sample_buffer);
+        break;
+    case FAAD_FMT_FLOAT:
+        to_PCM_float(hDecoder, input, channels, frame_len, &float_sample_buffer);
+        break;
+    case FAAD_FMT_DOUBLE:
+        to_PCM_double(hDecoder, input, channels, frame_len, &double_sample_buffer);
+        break;
+    }
+
+#ifdef PROFILE
     count = faad_get_ts() - count;
     hDecoder->output_cycles += count;
 #endif
@@ -208,13 +391,13 @@
                 if (tmp >= 0)
                 {
                     tmp += (1 << (REAL_BITS-1));
-                    if (tmp >= REAL_CONST(32768))
+                    if (tmp >= REAL_CONST(32767))
                     {
                         tmp = REAL_CONST(32767);
                     }
                 } else {
                     tmp += -(1 << (REAL_BITS-1));
-                    if (tmp <= REAL_CONST(-32769))
+                    if (tmp <= REAL_CONST(-32768))
                     {
                         tmp = REAL_CONST(-32768);
                     }
@@ -231,7 +414,7 @@
                 {
                     tmp += (1 << (REAL_BITS-9));
                     tmp >>= (REAL_BITS-8);
-                    if (tmp >= 8388608)
+                    if (tmp >= 8388607)
                     {
                         tmp = 8388607;
                     }
@@ -238,7 +421,7 @@
                 } else {
                     tmp += -(1 << (REAL_BITS-9));
                     tmp >>= (REAL_BITS-8);
-                    if (tmp <= -8388609)
+                    if (tmp <= -8388608)
                     {
                         tmp = -8388608;
                     }
--- a/libfaad/specrec.c
+++ b/libfaad/specrec.c
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: specrec.c,v 1.35 2003/12/17 16:37:34 menno Exp $
+** $Id: specrec.c,v 1.36 2003/12/23 18:41:42 menno Exp $
 **/
 
 /*
@@ -461,7 +461,7 @@
     memcpy(spec_data, tmp_spec, frame_len*sizeof(real_t));
 }
 
-static INLINE real_t iquant(int16_t q, const real_t *tab)
+static INLINE real_t iquant(int16_t q, const real_t *tab, uint8_t *error)
 {
 #ifdef FIXED_POINT
     static const real_t errcorr[] = {
@@ -488,35 +488,41 @@
 #else
     if (q < 0)
     {
-        if (-q >= IQ_TABLE_SIZE)
-            return 0;
-
         /* tab contains a value for all possible q [0,8192] */
-        return -tab[-q];
-    }
+        if (-q < IQ_TABLE_SIZE)
+            return -tab[-q];
 
-    if (q >= IQ_TABLE_SIZE)
+        *error = 17;
         return 0;
+    } else {
+        /* tab contains a value for all possible q [0,8192] */
+        if (q < IQ_TABLE_SIZE)
+            return tab[q];
 
-    /* tab contains a value for all possible q [0,8192] */
-    return tab[q];
+        *error = 17;
+        return 0;
+    }
 #endif
 }
 
-static void inverse_quantization(real_t *x_invquant, const int16_t *x_quant, const uint16_t frame_len)
+static uint8_t inverse_quantization(real_t *x_invquant, const int16_t *x_quant, const uint16_t frame_len)
 {
     int16_t i;
+    uint8_t error = 0; /* Init error flag */
     const real_t *tab = iq_table;
 
     for(i = 0; i < frame_len; i+=4)
     {
-        x_invquant[i] = iquant(x_quant[i], tab);
-        x_invquant[i+1] = iquant(x_quant[i+1], tab);
-        x_invquant[i+2] = iquant(x_quant[i+2], tab);
-        x_invquant[i+3] = iquant(x_quant[i+3], tab);
+        x_invquant[i] = iquant(x_quant[i], tab, &error);
+        x_invquant[i+1] = iquant(x_quant[i+1], tab, &error);
+        x_invquant[i+2] = iquant(x_quant[i+2], tab, &error);
+        x_invquant[i+3] = iquant(x_quant[i+3], tab, &error);
     }
+
+    return error;
 }
 
+#ifndef FIXED_POINT
 ALIGN static const real_t pow2sf_tab[] = {
     2.9802322387695313E-008, 5.9604644775390625E-008, 1.1920928955078125E-007,
     2.384185791015625E-007, 4.76837158203125E-007, 9.5367431640625E-007,
@@ -540,12 +546,15 @@
     8589934592, 17179869184, 34359738368,
     68719476736, 137438953472, 274877906944
 };
+#endif
 
 ALIGN static real_t pow2_table[] =
 {
+#if 0
     COEF_CONST(0.59460355750136053335874998528024), /* 2^-0.75 */
     COEF_CONST(0.70710678118654752440084436210485), /* 2^-0.5 */
     COEF_CONST(0.84089641525371454303112547623321), /* 2^-0.25 */
+#endif
     COEF_CONST(1.0),
     COEF_CONST(1.1892071150027210667174999705605), /* 2^0.25 */
     COEF_CONST(1.4142135623730950488016887242097), /* 2^0.5 */
@@ -573,10 +582,11 @@
         {
             top = ics->sect_sfb_offset[g][sfb+1];
 
-            exp = (ics->scale_factors[g][sfb] - 100) >> 2;
-            frac = (ics->scale_factors[g][sfb] - 100) & 3;
+            exp = (ics->scale_factors[g][sfb] /* - 100 */) >> 2;
+            frac = (ics->scale_factors[g][sfb] /* - 100 */) & 3;
 
 #ifdef FIXED_POINT
+            exp -= 25;
             /* IMDCT pre-scaling */
             if (hDecoder->object_type == LD)
             {
@@ -606,16 +616,16 @@
                     x_invquant[k+(groups*nshort)+3] <<= exp;
                 }
 #else
-                x_invquant[k+(groups*nshort)]   = x_invquant[k+(groups*nshort)]   * pow2sf_tab[exp+25];
-                x_invquant[k+(groups*nshort)+1] = x_invquant[k+(groups*nshort)+1] * pow2sf_tab[exp+25];
-                x_invquant[k+(groups*nshort)+2] = x_invquant[k+(groups*nshort)+2] * pow2sf_tab[exp+25];
-                x_invquant[k+(groups*nshort)+3] = x_invquant[k+(groups*nshort)+3] * pow2sf_tab[exp+25];
+                x_invquant[k+(groups*nshort)]   = x_invquant[k+(groups*nshort)]   * pow2sf_tab[exp/*+25*/];
+                x_invquant[k+(groups*nshort)+1] = x_invquant[k+(groups*nshort)+1] * pow2sf_tab[exp/*+25*/];
+                x_invquant[k+(groups*nshort)+2] = x_invquant[k+(groups*nshort)+2] * pow2sf_tab[exp/*+25*/];
+                x_invquant[k+(groups*nshort)+3] = x_invquant[k+(groups*nshort)+3] * pow2sf_tab[exp/*+25*/];
 #endif
 
-                x_invquant[k+(groups*nshort)]   = MUL_C(x_invquant[k+(groups*nshort)],pow2_table[frac + 3]);
-                x_invquant[k+(groups*nshort)+1] = MUL_C(x_invquant[k+(groups*nshort)+1],pow2_table[frac + 3]);
-                x_invquant[k+(groups*nshort)+2] = MUL_C(x_invquant[k+(groups*nshort)+2],pow2_table[frac + 3]);
-                x_invquant[k+(groups*nshort)+3] = MUL_C(x_invquant[k+(groups*nshort)+3],pow2_table[frac + 3]);
+                x_invquant[k+(groups*nshort)]   = MUL_C(x_invquant[k+(groups*nshort)],pow2_table[frac /* + 3*/]);
+                x_invquant[k+(groups*nshort)+1] = MUL_C(x_invquant[k+(groups*nshort)+1],pow2_table[frac /* + 3*/]);
+                x_invquant[k+(groups*nshort)+2] = MUL_C(x_invquant[k+(groups*nshort)+2],pow2_table[frac /* + 3*/]);
+                x_invquant[k+(groups*nshort)+3] = MUL_C(x_invquant[k+(groups*nshort)+3],pow2_table[frac /* + 3*/]);
             }
         }
         groups += ics->window_group_length[g];
@@ -644,16 +654,16 @@
         {
             top = ics->sect_sfb_offset[g][sfb+1];
 
-            exp = (ics->scale_factors[g][sfb] - 100) >> 2;
-            frac = (ics->scale_factors[g][sfb] - 100) & 3;
+            exp = (ics->scale_factors[g][sfb] /* - 100 */) >> 2;
+            frac = (ics->scale_factors[g][sfb] /* - 100 */) & 3;
 
             /* minimum size of a sf band is 4 and always a multiple of 4 */
             for ( ; k < top; k += 4)
             {
                 __m128 m1 = _mm_load_ps(&x_invquant[k+(groups*nshort)]);
-                __m128 m2 = _mm_load_ps1(&pow2sf_tab[exp+25]);
+                __m128 m2 = _mm_load_ps1(&pow2sf_tab[exp /*+25*/]);
+                __m128 m3 = _mm_load_ps1(&pow2_table[frac /* + 3*/]);
                 __m128 m4 = _mm_mul_ps(m1, m2);
-                __m128 m3 = _mm_load_ps1(&pow2_table[frac + 3]);
                 __m128 m5 = _mm_mul_ps(m3, m4);
                 _mm_store_ps(&x_invquant[k+(groups*nshort)], m5);
             }
@@ -663,9 +673,10 @@
 }
 #endif
 
-void reconstruct_single_channel(faacDecHandle hDecoder, ic_stream *ics,
-                                element *sce, int16_t *spec_data)
+uint8_t reconstruct_single_channel(faacDecHandle hDecoder, ic_stream *ics,
+                                   element *sce, int16_t *spec_data)
 {
+    uint8_t retval;
     ALIGN real_t spec_coef[1024];
 
 #ifdef PROFILE
@@ -673,7 +684,9 @@
 #endif
 
     /* inverse quantization */
-    inverse_quantization(spec_coef, spec_data, hDecoder->frameLength);
+    retval = inverse_quantization(spec_coef, spec_data, hDecoder->frameLength);
+    if (retval > 0)
+        return retval;
 
     /* apply scalefactors */
 #ifndef USE_SSE
@@ -682,16 +695,16 @@
     hDecoder->apply_sf_func(hDecoder, ics, spec_coef, hDecoder->frameLength);
 #endif
 
+    /* deinterleave short block grouping */
+    if (ics->window_sequence == EIGHT_SHORT_SEQUENCE)
+        quant_to_spec(ics, spec_coef, hDecoder->frameLength);
+
 #ifdef PROFILE
     count = faad_get_ts() - count;
     hDecoder->requant_cycles += count;
 #endif
 
-    /* deinterleave short block grouping */
-    if (ics->window_sequence == EIGHT_SHORT_SEQUENCE)
-        quant_to_spec(ics, spec_coef, hDecoder->frameLength);
 
-
     /* pns decoding */
     pns_decode(ics, NULL, spec_coef, NULL, hDecoder->frameLength, 0, hDecoder->object_type);
 
@@ -810,11 +823,14 @@
             hDecoder->time_out[sce->channel]+hDecoder->frameLength, hDecoder->frameLength, hDecoder->object_type);
     }
 #endif
+
+    return 0;
 }
 
-void reconstruct_channel_pair(faacDecHandle hDecoder, ic_stream *ics1, ic_stream *ics2,
-                              element *cpe, int16_t *spec_data1, int16_t *spec_data2)
+uint8_t reconstruct_channel_pair(faacDecHandle hDecoder, ic_stream *ics1, ic_stream *ics2,
+                                 element *cpe, int16_t *spec_data1, int16_t *spec_data2)
 {
+    uint8_t retval;
     ALIGN real_t spec_coef1[1024];
     ALIGN real_t spec_coef2[1024];
 
@@ -823,9 +839,14 @@
 #endif
 
     /* inverse quantization */
-    inverse_quantization(spec_coef1, spec_data1, hDecoder->frameLength);
-    inverse_quantization(spec_coef2, spec_data2, hDecoder->frameLength);
+    retval = inverse_quantization(spec_coef1, spec_data1, hDecoder->frameLength);
+    if (retval > 0)
+        return retval;
 
+    retval = inverse_quantization(spec_coef2, spec_data2, hDecoder->frameLength);
+    if (retval > 0)
+        return retval;
+
     /* apply scalefactors */
 #ifndef USE_SSE
     apply_scalefactors(hDecoder, ics1, spec_coef1, hDecoder->frameLength);
@@ -835,11 +856,6 @@
     hDecoder->apply_sf_func(hDecoder, ics2, spec_coef2, hDecoder->frameLength);
 #endif
 
-#ifdef PROFILE
-    count = faad_get_ts() - count;
-    hDecoder->requant_cycles += count;
-#endif
-
     /* deinterleave short block grouping */
     if (ics1->window_sequence == EIGHT_SHORT_SEQUENCE)
         quant_to_spec(ics1, spec_coef1, hDecoder->frameLength);
@@ -846,7 +862,12 @@
     if (ics2->window_sequence == EIGHT_SHORT_SEQUENCE)
         quant_to_spec(ics2, spec_coef2, hDecoder->frameLength);
 
+#ifdef PROFILE
+    count = faad_get_ts() - count;
+    hDecoder->requant_cycles += count;
+#endif
 
+
     /* pns decoding */
     if (ics1->ms_mask_present)
     {
@@ -1036,4 +1057,6 @@
             hDecoder->object_type);
     }
 #endif
+
+    return 0;
 }
--- a/libfaad/specrec.h
+++ b/libfaad/specrec.h
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: specrec.h,v 1.21 2003/12/17 14:43:16 menno Exp $
+** $Id: specrec.h,v 1.22 2003/12/23 18:41:42 menno Exp $
 **/
 
 #ifndef __SPECREC_H__
@@ -36,7 +36,7 @@
 
 uint8_t window_grouping_info(faacDecHandle hDecoder, ic_stream *ics);
 static void quant_to_spec(ic_stream *ics, real_t *spec_data, uint16_t frame_len);
-static void inverse_quantization(real_t *x_invquant, const int16_t *x_quant, const uint16_t frame_len);
+static uint8_t inverse_quantization(real_t *x_invquant, const int16_t *x_quant, const uint16_t frame_len);
 void apply_scalefactors(faacDecHandle hDecoder, ic_stream *ics, real_t *x_invquant,
                         uint16_t frame_len);
 #ifdef USE_SSE
@@ -43,9 +43,9 @@
 void apply_scalefactors_sse(faacDecHandle hDecoder, ic_stream *ics, real_t *x_invquant,
                             uint16_t frame_len);
 #endif
-void reconstruct_channel_pair(faacDecHandle hDecoder, ic_stream *ics1, ic_stream *ics2,
-                              element *cpe, int16_t *spec_data1, int16_t *spec_data2);
-void reconstruct_single_channel(faacDecHandle hDecoder, ic_stream *ics, element *sce,
+uint8_t reconstruct_channel_pair(faacDecHandle hDecoder, ic_stream *ics1, ic_stream *ics2,
+                                 element *cpe, int16_t *spec_data1, int16_t *spec_data2);
+uint8_t reconstruct_single_channel(faacDecHandle hDecoder, ic_stream *ics, element *sce,
                                 int16_t *spec_data);
 
 #ifdef __cplusplus
--- a/libfaad/syntax.c
+++ b/libfaad/syntax.c
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: syntax.c,v 1.62 2003/12/17 14:43:16 menno Exp $
+** $Id: syntax.c,v 1.63 2003/12/23 18:41:42 menno Exp $
 **/
 
 /*
@@ -530,7 +530,9 @@
         return retval;
 
     /* noiseless coding is done, spectral reconstruction is done now */
-    reconstruct_single_channel(hDecoder, ics, &sce, spec_data);
+    retval = reconstruct_single_channel(hDecoder, ics, &sce, spec_data);
+    if (retval > 0)
+        return retval;
 
     return 0;
 }
@@ -581,7 +583,10 @@
             if ((ics1->ltp.data_present = faad_get1bit(ld
                 DEBUGVAR(1,50,"channel_pair_element(): ltp.data_present"))) & 1)
             {
-                ltp_data(hDecoder, ics1, &(ics1->ltp), ld);
+                if ((result = ltp_data(hDecoder, ics1, &(ics1->ltp), ld)) > 0)
+                {
+                    return result;
+                }
             }
         }
 #endif
@@ -604,7 +609,10 @@
         if ((ics1->ltp2.data_present = faad_get1bit(ld
             DEBUGVAR(1,50,"channel_pair_element(): ltp.data_present"))) & 1)
         {
-            ltp_data(hDecoder, ics1, &(ics1->ltp2), ld);
+            if ((result = ltp_data(hDecoder, ics1, &(ics1->ltp2), ld)) > 0)
+            {
+                return result;
+            }
         }
     }
 #endif
@@ -616,7 +624,11 @@
     }
 
     /* noiseless coding is done, spectral reconstruction is done now */
-    reconstruct_channel_pair(hDecoder, ics1, ics2, &cpe, spec_data1, spec_data2);
+    if ((result = reconstruct_channel_pair(hDecoder, ics1, ics2, &cpe,
+        spec_data1, spec_data2)) > 0)
+    {
+        return result;
+    }
 
     return 0;
 }
@@ -685,7 +697,10 @@
                     if ((ics->ltp.data_present = faad_get1bit(ld
                         DEBUGVAR(1,50,"ics_info(): ltp.data_present"))) & 1)
                     {
-                        ltp_data(hDecoder, ics, &(ics->ltp), ld);
+                        if ((retval = ltp_data(hDecoder, ics, &(ics->ltp), ld)) > 0)
+                        {
+                            return retval;
+                        }
                     }
                     if (common_window)
                     {
@@ -692,7 +707,10 @@
                         if ((ics->ltp2.data_present = faad_get1bit(ld
                             DEBUGVAR(1,51,"ics_info(): ltp2.data_present"))) & 1)
                         {
-                            ltp_data(hDecoder, ics, &(ics->ltp2), ld);
+                            if ((retval = ltp_data(hDecoder, ics, &(ics->ltp2), ld)) > 0)
+                            {
+                                return retval;
+                            }
                         }
                     }
                 }
@@ -1064,9 +1082,13 @@
 
     if (this_layer_stereo)
     {
-        reconstruct_channel_pair(hDecoder, ics1, ics2, &cpe, spec_data1, spec_data2);
+        hInfo->error = reconstruct_channel_pair(hDecoder, ics1, ics2, &cpe, spec_data1, spec_data2);
+        if (hInfo->error > 0)
+            return;
     } else {
-        reconstruct_single_channel(hDecoder, ics1, &cpe, spec_data1);
+        hInfo->error = reconstruct_single_channel(hDecoder, ics1, &cpe, spec_data1);
+        if (hInfo->error > 0)
+            return;
     }
 
     hDecoder->element_id[hDecoder->fr_ch_ele] = cpe.ele_id;
@@ -1173,7 +1195,10 @@
             if ((ics->ltp.data_present = faad_get1bit(ld
                 DEBUGVAR(1,310,"aac_scalable_main_header(): ltp.data_present"))) & 1)
             {
-                ltp_data(hDecoder, ics, &(ics->ltp), ld);
+                if ((retval = ltp_data(hDecoder, ics, &(ics->ltp), ld)) > 0)
+                {
+                    return retval;
+                }
             }
 #if 0
         }
@@ -1594,7 +1619,7 @@
 
 #ifdef LTP_DEC
 /* Table 4.4.28 */
-static void ltp_data(faacDecHandle hDecoder, ic_stream *ics, ltp_info *ltp, bitfile *ld)
+static uint8_t ltp_data(faacDecHandle hDecoder, ic_stream *ics, ltp_info *ltp, bitfile *ld)
 {
     uint8_t sfb, w;
 
@@ -1621,7 +1646,7 @@
 
     /* Check length of lag */
     if (ltp->lag > (hDecoder->frameLength << 1))
-        ltp->lag = 0; // FIXME: Error handling
+        return 18;
 
     ltp->coef = (uint8_t)faad_getbits(ld, 3
         DEBUGVAR(1,82,"ltp_data(): coef"));
@@ -1651,6 +1676,8 @@
                 DEBUGVAR(1,86,"ltp_data(): long_used"));
         }
     }
+
+    return 0;
 }
 #endif
 
--- a/libfaad/syntax.h
+++ b/libfaad/syntax.h
@@ -22,7 +22,7 @@
 ** Commercial non-GPL licensing of this software is possible.
 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
 **
-** $Id: syntax.h,v 1.43 2003/12/17 14:43:17 menno Exp $
+** $Id: syntax.h,v 1.44 2003/12/23 18:41:42 menno Exp $
 **/
 
 #ifndef __SYNTAX_H__
@@ -149,7 +149,7 @@
 #endif
 static uint8_t pulse_data(ic_stream *ics, pulse_info *pul, bitfile *ld);
 static void tns_data(ic_stream *ics, tns_info *tns, bitfile *ld);
-static void ltp_data(faacDecHandle hDecoder, ic_stream *ics, ltp_info *ltp, bitfile *ld);
+static uint8_t ltp_data(faacDecHandle hDecoder, ic_stream *ics, ltp_info *ltp, bitfile *ld);
 static uint8_t adts_fixed_header(adts_header *adts, bitfile *ld);
 static void adts_variable_header(adts_header *adts, bitfile *ld);
 static void adts_error_check(adts_header *adts, bitfile *ld);