shithub: libvpx

Download patch

ref: 99c611fea60100be888363ea0e08481247de59b3
parent: 14b322e466fb7e20be2fa4b302440457704acae7
parent: b7dc9398f2b5b749a1369312ed4a4666e4e326cf
author: John Koleszar <jkoleszar@google.com>
date: Mon Sep 20 20:05:03 EDT 2010

Merge remote branch 'internal/upstream' into HEAD

--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -11,6 +11,8 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
+; Use of pmaxub instead of psubusb to compute filter mask was seen
+; in ffvp8
 
 %macro LFH_FILTER_MASK 1
 %if %1
@@ -33,8 +35,6 @@
         psubusb     xmm2,                   xmm6              ; q3-=q2
         por         xmm1,                   xmm2              ; abs(q3-q2)
 
-        psubusb     xmm1,                   xmm7
-
 %if %1
         movdqa      xmm4,                   [rsi+rax]         ; q1
 %else
@@ -49,10 +49,8 @@
         psubusb     xmm4,                   xmm6              ; q1-=q2
         psubusb     xmm6,                   xmm3              ; q2-=q1
         por         xmm4,                   xmm6              ; abs(q2-q1)
-        psubusb     xmm4,                   xmm7
+        pmaxub      xmm1,                   xmm4
 
-        por         xmm1,                   xmm4
-
 %if %1
         movdqa      xmm4,                   [rsi]             ; q0
 %else
@@ -67,10 +65,8 @@
         psubusb     xmm3,                   xmm0              ; q1-=q0
         por         xmm4,                   xmm3              ; abs(q0-q1)
         movdqa      t0,                     xmm4              ; save to t0
+        pmaxub      xmm1,                   xmm4
 
-        psubusb     xmm4,                   xmm7
-        por         xmm1,                   xmm4
-
 %if %1
         neg         rax                     ; negate pitch to deal with above border
 
@@ -95,10 +91,8 @@
         psubusb     xmm4,                   xmm2              ; p2-=p3
         psubusb     xmm2,                   xmm5              ; p3-=p2
         por         xmm4,                   xmm2              ; abs(p3 - p2)
+        pmaxub      xmm1,                   xmm4
 
-        psubusb     xmm4,                   xmm7
-        por         xmm1,                   xmm4
-
 %if %1
         movdqa      xmm4,                   [rsi+2*rax]       ; p1
 %else
@@ -113,9 +107,8 @@
         psubusb     xmm4,                   xmm5              ; p1-=p2
         psubusb     xmm5,                   xmm3              ; p2-=p1
         por         xmm4,                   xmm5              ; abs(p2 - p1)
-        psubusb     xmm4,                   xmm7
+        pmaxub      xmm1,                   xmm4
 
-        por         xmm1,                   xmm4
         movdqa      xmm2,                   xmm3              ; p1
 
 %if %1
@@ -133,8 +126,8 @@
         por         xmm4,                   xmm3              ; abs(p1 - p0)
         movdqa        t1,                   xmm4              ; save to t1
 
-        psubusb     xmm4,                   xmm7
-        por         xmm1,                   xmm4
+        pmaxub      xmm1,                   xmm4
+        psubusb     xmm1,                   xmm7
 
 %if %1
         movdqa      xmm3,                   [rdi]             ; q1
@@ -872,19 +865,18 @@
         psubusb     xmm0,               xmm7            ; q2-q3
 
         psubusb     xmm7,               xmm6            ; q3-q2
-        por         xmm7,               xmm0            ; abs (q3-q2)
-
         movdqa      xmm4,               xmm5            ; q1
-        psubusb     xmm4,               xmm6            ; q1-q2
 
-        psubusb     xmm6,               xmm5            ; q2-q1
-        por         xmm6,               xmm4            ; abs (q2-q1)
+        por         xmm7,               xmm0            ; abs (q3-q2)
+        psubusb     xmm4,               xmm6            ; q1-q2
 
         movdqa      xmm0,               xmm1
+        psubusb     xmm6,               xmm5            ; q2-q1
 
+        por         xmm6,               xmm4            ; abs (q2-q1)
         psubusb     xmm0,               xmm2            ; p2 - p3;
-        psubusb     xmm2,               xmm1            ; p3 - p2;
 
+        psubusb     xmm2,               xmm1            ; p3 - p2;
         por         xmm0,               xmm2            ; abs(p2-p3)
 %if %1
         movdqa      xmm2,               [rdx]           ; p1
@@ -892,31 +884,20 @@
         movdqa      xmm2,               [rdx+32]        ; p1
 %endif
         movdqa      xmm5,               xmm2            ; p1
+        pmaxub      xmm0,               xmm7
 
         psubusb     xmm5,               xmm1            ; p1-p2
         psubusb     xmm1,               xmm2            ; p2-p1
 
+        movdqa      xmm7,               xmm3            ; p0
+        psubusb     xmm7,               xmm2            ; p0-p1
+
         por         xmm1,               xmm5            ; abs(p2-p1)
+        pmaxub      xmm0,               xmm6
 
-        mov         rdx,                arg(3)          ; limit
-        movdqa      xmm4,               [rdx]           ; limit
-
-        psubusb     xmm7,               xmm4
-
-        psubusb     xmm0,               xmm4            ; abs(p3-p2) > limit
-        psubusb     xmm1,               xmm4            ; abs(p2-p1) > limit
-
-        psubusb     xmm6,               xmm4            ; abs(q2-q1) > limit
-        por         xmm7,               xmm6            ; or
-
-        por         xmm0,               xmm1
-        por         xmm0,               xmm7            ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit
-
+        pmaxub      xmm0,               xmm1
         movdqa      xmm1,               xmm2            ; p1
 
-        movdqa      xmm7,               xmm3            ; p0
-        psubusb     xmm7,               xmm2            ; p0-p1
-
         psubusb     xmm2,               xmm3            ; p1-p0
         por         xmm2,               xmm7            ; abs(p1-p0)
 
@@ -923,8 +904,8 @@
         movdqa      t0,                 xmm2            ; save abs(p1-p0)
         lea         rdx,                srct
 
-        psubusb     xmm2,               xmm4            ; abs(p1-p0)>limit
-        por         xmm0,               xmm2            ; mask
+        pmaxub      xmm0,               xmm2
+
 %if %1
         movdqa      xmm5,               [rdx+32]        ; q0
         movdqa      xmm7,               [rdx+48]        ; q1
@@ -940,9 +921,12 @@
         por         xmm7,               xmm5            ; abs(q1-q0)
 
         movdqa      t1,                 xmm7            ; save abs(q1-q0)
-        psubusb     xmm7,               xmm4            ; abs(q1-q0)> limit
 
-        por         xmm0,               xmm7            ; mask
+        mov         rdx,                arg(3)          ; limit
+        movdqa      xmm4,               [rdx]           ; limit
+
+        pmaxub      xmm0,               xmm7
+        psubusb     xmm0,               xmm4
 
         movdqa      xmm5,               xmm2            ; q1
         psubusb     xmm5,               xmm1            ; q1-=p1
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -70,28 +70,36 @@
     sub         rdi, rdx
 ;xmm3 free
 filter_block1d8_h6_rowloop_ssse3:
-    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
 
-    movdqa      xmm1, xmm0
-    pshufb      xmm0, [shuf1b GLOBAL]
+    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
 
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, [shuf2b GLOBAL]
-    pmaddubsw   xmm0, xmm4
-    pmaddubsw   xmm1, xmm5
+    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
 
-    pshufb      xmm2, [shuf3b GLOBAL]
-    add         rdi, rdx
-    pmaddubsw   xmm2, xmm6
+    movdqa      xmm1,   xmm0
+    pmaddubsw   xmm0,   xmm4
 
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [shuf2bfrom1 GLOBAL]
+
+    pshufb      xmm2,   [shuf3bfrom1 GLOBAL]
+    pmaddubsw   xmm1,   xmm5
+
+    lea         rdi,    [rdi + rdx]
+    pmaddubsw   xmm2,   xmm6
+
     lea         rsi,    [rsi + rax]
     dec         rcx
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm7
-    paddsw      xmm0, xmm2
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
 
+    paddsw      xmm0,   xmm1
+    paddsw      xmm2,   xmm7
+
+    paddsw      xmm0,   xmm2
+
+    psraw       xmm0,   7
+
+    packuswb    xmm0,   xmm0
+
     movq        MMWORD Ptr [rdi], xmm0
     jnz         filter_block1d8_h6_rowloop_ssse3
 
@@ -107,8 +115,8 @@
     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
 
-    movdqa      xmm3, XMMWORD PTR [shuf2b GLOBAL]
-    movdqa      xmm4, XMMWORD PTR [shuf3b GLOBAL]
+    movdqa      xmm3, XMMWORD PTR [shuf2bfrom1 GLOBAL]
+    movdqa      xmm4, XMMWORD PTR [shuf3bfrom1 GLOBAL]
 
     mov         rsi, arg(0)             ;src_ptr
 
@@ -118,25 +126,34 @@
     movsxd      rdx, dword ptr arg(3)   ;output_pitch
 
     sub         rdi, rdx
-;xmm3 free
+
 filter_block1d8_h4_rowloop_ssse3:
-    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
 
-    movdqa      xmm2, xmm0
-    pshufb      xmm0, xmm3 ;[shuf2b GLOBAL]
-    pshufb      xmm2, xmm4 ;[shuf3b GLOBAL]
+    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
 
-    pmaddubsw   xmm0, xmm5
-    add         rdi, rdx
-    pmaddubsw   xmm2, xmm6
+    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
 
+    movdqa      xmm2,   xmm0
+    pshufb      xmm0,   xmm3
+
+    pshufb      xmm2,   xmm4
+    pmaddubsw   xmm0,   xmm5
+
+    lea         rdi,    [rdi + rdx]
+    pmaddubsw   xmm2,   xmm6
+
     lea         rsi,    [rsi + rax]
     dec         rcx
-    paddsw      xmm0, xmm7
-    paddsw      xmm0, xmm2
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
 
+    paddsw      xmm0,   xmm7
+
+    paddsw      xmm0,   xmm2
+
+    psraw       xmm0,   7
+
+    packuswb    xmm0,   xmm0
+
     movq        MMWORD Ptr [rdi], xmm0
 
     jnz         filter_block1d8_h4_rowloop_ssse3
@@ -168,7 +185,7 @@
     push        rdi
     ; end prolog
 
-    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    movsxd      rdx, DWORD PTR arg(5)           ;table index
     xor         rsi, rsi
     shl         rdx, 4      ;
 
@@ -175,67 +192,81 @@
     lea         rax, [k0_k5 GLOBAL]
     add         rax, rdx
 
-    mov         rdi, arg(2)             ;output_ptr
-    movdqa      xmm7, [rd GLOBAL]
+    mov         rdi, arg(2)                     ;output_ptr
 
 ;;
 ;;    cmp         esi, DWORD PTR [rax]
 ;;    je          vp8_filter_block1d16_h4_ssse3
 
-    mov         rsi, arg(0)             ;src_ptr
+    mov         rsi, arg(0)                     ;src_ptr
 
     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
 
-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
-    movsxd      rcx, dword ptr arg(4)   ;output_height
-    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)           ;output_height
+    movsxd      rdx, dword ptr arg(3)           ;output_pitch
 
 filter_block1d16_h6_rowloop_ssse3:
-    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
+    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
 
-    movdqa      xmm1, xmm0
-    pshufb      xmm0, [shuf1b GLOBAL]
-    movdqa      xmm2, xmm1
-    pmaddubsw   xmm0, xmm4
-    pshufb      xmm1, [shuf2b GLOBAL]
-    pshufb      xmm2, [shuf3b GLOBAL]
-    pmaddubsw   xmm1, xmm5
+    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
 
-    movdqu      xmm3,   XMMWORD PTR [rsi + 6]
+    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
 
-    pmaddubsw   xmm2, xmm6
-    paddsw      xmm0, xmm1
-    movdqa      xmm1, xmm3
-    pshufb      xmm3, [shuf1b GLOBAL]
-    paddsw      xmm0, xmm7
-    pmaddubsw   xmm3, xmm4
-    paddsw      xmm0, xmm2
-    movdqa      xmm2, xmm1
-    pshufb      xmm1, [shuf2b GLOBAL]
-    pshufb      xmm2, [shuf3b GLOBAL]
-    pmaddubsw   xmm1, xmm5
-    pmaddubsw   xmm2, xmm6
+    movdqa      xmm1,   xmm0
+    pmaddubsw   xmm0,   xmm4
 
-    psraw       xmm0, 7
-    packuswb    xmm0, xmm0
+    movdqa      xmm2,   xmm1
+    pshufb      xmm1,   [shuf2bfrom1 GLOBAL]
+
+    pshufb      xmm2,   [shuf3bfrom1 GLOBAL]
+    movq        xmm3,   MMWORD PTR [rsi +  6]
+
+    pmaddubsw   xmm1,   xmm5
+    movq        xmm7,   MMWORD PTR [rsi + 11]
+
+    pmaddubsw   xmm2,   xmm6
+    punpcklbw   xmm3,   xmm7
+
+    paddsw      xmm0,   xmm1
+    movdqa      xmm1,   xmm3
+
+    pmaddubsw   xmm3,   xmm4
+    paddsw      xmm0,   xmm2
+
+    movdqa      xmm2,   xmm1
+    paddsw      xmm0,   [rd GLOBAL]
+
+    pshufb      xmm1,   [shuf2bfrom1 GLOBAL]
+    pshufb      xmm2,   [shuf3bfrom1 GLOBAL]
+
+    psraw       xmm0,   7
+    pmaddubsw   xmm1,   xmm5
+
+    pmaddubsw   xmm2,   xmm6
+    packuswb    xmm0,   xmm0
+
     lea         rsi,    [rsi + rax]
-    paddsw      xmm3, xmm1
-    paddsw      xmm3, xmm7
-    paddsw      xmm3, xmm2
-    psraw       xmm3, 7
-    packuswb    xmm3, xmm3
+    paddsw      xmm3,   xmm1
 
-    punpcklqdq  xmm0, xmm3
+    paddsw      xmm3,   xmm2
 
+    paddsw      xmm3,   [rd GLOBAL]
+
+    psraw       xmm3,   7
+
+    packuswb    xmm3,   xmm3
+
+    punpcklqdq  xmm0,   xmm3
+
     movdqa      XMMWORD Ptr [rdi], xmm0
 
-    add         rdi, rdx
+    lea         rdi,    [rdi + rdx]
     dec         rcx
     jnz         filter_block1d16_h6_rowloop_ssse3
 
-
     ; begin epilog
     pop rdi
     pop rsi
@@ -268,7 +299,7 @@
     pshufb      xmm3, [shuf3b GLOBAL]
     pshufb      xmm0, [shuf2b GLOBAL]
 
-    paddsw      xmm1, xmm7
+    paddsw      xmm1, [rd GLOBAL]
     paddsw      xmm1, xmm2
 
     pmaddubsw   xmm0, xmm5
@@ -278,7 +309,7 @@
     packuswb    xmm1, xmm1
     lea         rsi,    [rsi + rax]
     paddsw      xmm3, xmm0
-    paddsw      xmm3, xmm7
+    paddsw      xmm3, [rd GLOBAL]
     psraw       xmm3, 7
     packuswb    xmm3, xmm3
 
@@ -939,17 +970,19 @@
 %if ABI_IS_32BIT=0
         movsxd      r8,         dword ptr arg(5)    ; dst_pitch
 %endif
-        movdqu      xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
 
-        movdqa      xmm4,       xmm3
+        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
+        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
 
-        movdqu      xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16
+        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
+
         lea         rsi,        [rsi + rdx]         ; next line
 
-        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
         pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14
 
-        punpckhbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
+        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
         pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15
 
         paddw       xmm3,       [rd GLOBAL]         ; xmm3 += round value
@@ -962,17 +995,18 @@
         packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
 
 .next_row:
-        movdqu      xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
 
-        movdqa      xmm4,       xmm6
+        punpcklbw   xmm6,       xmm5
+        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
 
-        movdqu      xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16
+        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
         lea         rsi,        [rsi + rdx]         ; next line
 
-        punpcklbw   xmm6,       xmm5
         pmaddubsw   xmm6,       xmm1
 
-        punpckhbw   xmm4,       xmm5
+        punpcklbw   xmm4,       xmm5
         pmaddubsw   xmm4,       xmm1
 
         paddw       xmm6,       [rd GLOBAL]         ; xmm6 += round value
@@ -1027,49 +1061,51 @@
         movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
 
         ; get the first horizontal line done
-        movdqu      xmm2,       [rsi]               ; load row 0
+        movq        xmm4,       [rsi]               ; load row 0
+        movq        xmm2,       [rsi + 8]           ; load row 0
 
         lea         rsi,        [rsi + rax]         ; next line
 .next_row:
-        movdqu      xmm3,       [rsi]               ; load row + 1
+        movq        xmm3,       [rsi]               ; load row + 1
+        movq        xmm5,       [rsi + 8]           ; load row + 1
 
-        movdqu      xmm4,       xmm2
         punpcklbw   xmm4,       xmm3
+        punpcklbw   xmm2,       xmm5
 
         pmaddubsw   xmm4,       xmm1
-        movdqu      xmm7,       [rsi + rax]         ; load row + 2
+        movq        xmm7,       [rsi + rax]         ; load row + 2
 
-        punpckhbw   xmm2,       xmm3
-        movdqu      xmm6,       xmm3
-
         pmaddubsw   xmm2,       xmm1
-        punpcklbw   xmm6,       xmm7
+        movq        xmm6,       [rsi + rax + 8]     ; load row + 2
 
+        punpcklbw   xmm3,       xmm7
+        punpcklbw   xmm5,       xmm6
+
+        pmaddubsw   xmm3,       xmm1
         paddw       xmm4,       [rd GLOBAL]
-        pmaddubsw   xmm6,       xmm1
 
-        psraw       xmm4,       VP8_FILTER_SHIFT
-        punpckhbw   xmm3,       xmm7
-
+        pmaddubsw   xmm5,       xmm1
         paddw       xmm2,       [rd GLOBAL]
-        pmaddubsw   xmm3,       xmm1
 
+        psraw       xmm4,       VP8_FILTER_SHIFT
         psraw       xmm2,       VP8_FILTER_SHIFT
-        paddw       xmm6,       [rd GLOBAL]
 
         packuswb    xmm4,       xmm2
-        psraw       xmm6,       VP8_FILTER_SHIFT
+        paddw       xmm3,       [rd GLOBAL]
 
         movdqa      [rdi],      xmm4                ; store row 0
-        paddw       xmm3,       [rd GLOBAL]
+        paddw       xmm5,       [rd GLOBAL]
 
         psraw       xmm3,       VP8_FILTER_SHIFT
-        lea         rsi,        [rsi + 2*rax]
+        psraw       xmm5,       VP8_FILTER_SHIFT
 
-        packuswb    xmm6,       xmm3
-        movdqa      xmm2,       xmm7
+        packuswb    xmm3,       xmm5
+        movdqa      xmm4,       xmm7
 
-        movdqa      [rdi + rdx],xmm6                ; store row 1
+        movdqa      [rdi + rdx],xmm3                ; store row 1
+        lea         rsi,        [rsi + 2*rax]
+
+        movdqa      xmm2,       xmm6
         lea         rdi,        [rdi + 2*rdx]
 
         cmp         rdi,        rcx
@@ -1083,32 +1119,35 @@
         movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
 
 .next_row:
-        movdqu      xmm2,       [rsi]               ; row 0
-        movdqa      xmm3,       xmm2
+        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
+        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
 
-        movdqu      xmm4,       [rsi + 1]           ; row 0 + 1
-        lea         rsi,        [rsi + rax]         ; next line
-
         punpcklbw   xmm2,       xmm4
-        movdqu      xmm5,       [rsi]               ; row 1
+        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
 
         pmaddubsw   xmm2,       xmm1
-        movdqa      xmm6,       xmm5
+        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
 
-        punpckhbw   xmm3,       xmm4
-        movdqu      xmm7,       [rsi + 1]           ; row 1 + 1
+        lea         rsi,        [rsi + rax]         ; next line
+        punpcklbw   xmm3,       xmm4
 
         pmaddubsw   xmm3,       xmm1
+        movq        xmm5,       [rsi]
+
         paddw       xmm2,       [rd GLOBAL]
+        movq        xmm7,       [rsi+1]
 
+        movq        xmm6,       [rsi+8]
         psraw       xmm2,       VP8_FILTER_SHIFT
+
         punpcklbw   xmm5,       xmm7
+        movq        xmm7,       [rsi+9]
 
         paddw       xmm3,       [rd GLOBAL]
         pmaddubsw   xmm5,       xmm1
 
         psraw       xmm3,       VP8_FILTER_SHIFT
-        punpckhbw   xmm6,       xmm7
+        punpcklbw   xmm6,       xmm7
 
         packuswb    xmm2,       xmm3
         pmaddubsw   xmm6,       xmm1
@@ -1462,6 +1501,13 @@
     db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
 shuf3b:
     db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
+
+align 16
+shuf2bfrom1:
+    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
+align 16
+shuf3bfrom1:
+    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
 
 align 16
 rd:
--- a/vp8/decoder/arm/neon/dequant_dc_idct_neon.asm
+++ /dev/null
@@ -1,136 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_dequant_dc_idct_add_neon|
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_dc_idct_add_neon(short *input, short *dq, unsigned char *pred,
-;                                  unsigned char *dest, int pitch, int stride,
-;                                  int Dc);
-; r0    short *input,
-; r1    short *dq,
-; r2    unsigned char *pred
-; r3    unsigned char *dest
-; sp    int pitch
-; sp+4  int stride
-; sp+8  int Dc
-|vp8_dequant_dc_idct_add_neon| PROC
-    vld1.16         {q3, q4}, [r0]
-    vld1.16         {q5, q6}, [r1]
-
-    ldr             r1, [sp, #8]            ;load Dc from stack
-
-    ldr             r12, _CONSTANTS_
-
-    vmul.i16        q1, q3, q5              ;input for short_idct4x4llm_neon
-    vmul.i16        q2, q4, q6
-
-    vmov.16         d2[0], r1
-
-    ldr             r1, [sp]                ; pitch
-    vld1.32         {d14[0]}, [r2], r1
-    vld1.32         {d14[1]}, [r2], r1
-    vld1.32         {d15[0]}, [r2], r1
-    vld1.32         {d15[1]}, [r2]
-
-    ldr             r1, [sp, #4]            ; stride
-
-;|short_idct4x4llm_neon| PROC
-    vld1.16         {d0}, [r12]
-    vswp            d3, d4                  ;q2(vp[4] vp[12])
-
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-; memset(input, 0, 32) -- 32bytes
-    vmov.i16        q14, #0
-
-    vswp            d3, d4
-    vqdmulh.s16     q3, q2, d0[2]
-    vqdmulh.s16     q4, q2, d0[0]
-
-    vqadd.s16       d12, d2, d3             ;a1
-    vqsub.s16       d13, d2, d3             ;b1
-
-    vmov            q15, q14
-
-    vshr.s16        q3, q3, #1
-    vshr.s16        q4, q4, #1
-
-    vqadd.s16       q3, q3, q2
-    vqadd.s16       q4, q4, q2
-
-    vqsub.s16       d10, d6, d9             ;c1
-    vqadd.s16       d11, d7, d8             ;d1
-
-    vqadd.s16       d2, d12, d11
-    vqadd.s16       d3, d13, d10
-    vqsub.s16       d4, d13, d10
-    vqsub.s16       d5, d12, d11
-
-    vst1.16         {q14, q15}, [r0]
-
-    vrshr.s16       d2, d2, #3
-    vrshr.s16       d3, d3, #3
-    vrshr.s16       d4, d4, #3
-    vrshr.s16       d5, d5, #3
-
-    vtrn.32         d2, d4
-    vtrn.32         d3, d5
-    vtrn.16         d2, d3
-    vtrn.16         d4, d5
-
-    vaddw.u8        q1, q1, d14
-    vaddw.u8        q2, q2, d15
-
-    vqmovun.s16     d0, q1
-    vqmovun.s16     d1, q2
-
-    vst1.32         {d0[0]}, [r3], r1
-    vst1.32         {d0[1]}, [r3], r1
-    vst1.32         {d1[0]}, [r3], r1
-    vst1.32         {d1[1]}, [r3]
-
-    bx             lr
-
-    ENDP           ; |vp8_dequant_dc_idct_add_neon|
-
-; Constant Pool
-_CONSTANTS_       DCD cospi8sqrt2minus1
-cospi8sqrt2minus1 DCD 0x4e7b4e7b
-sinpi8sqrt2       DCD 0x8a8c8a8c
-
-    END
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ b/vp8/decoder/arm/neon/idct_blk_neon.c
@@ -12,6 +12,21 @@
 #include "idct.h"
 #include "dequantize.h"
 
+/* place these declarations here because we don't want to maintain them
+ * outside of this scope
+ */
+void idct_dequant_dc_full_2x_neon
+            (short *input, short *dq, unsigned char *pre, unsigned char *dst,
+             int stride, short *dc);
+void idct_dequant_dc_0_2x_neon
+            (short *dc, unsigned char *pre, unsigned char *dst, int stride);
+void idct_dequant_full_2x_neon
+            (short *q, short *dq, unsigned char *pre, unsigned char *dst,
+             int pitch, int stride);
+void idct_dequant_0_2x_neon
+            (short *q, short dq, unsigned char *pre, int pitch,
+             unsigned char *dst, int stride);
+
 void vp8_dequant_dc_idct_add_y_block_neon
             (short *q, short *dq, unsigned char *pre,
              unsigned char *dst, int stride, char *eobs, short *dc)
@@ -20,26 +35,16 @@
 
     for (i = 0; i < 4; i++)
     {
-        if (eobs[0] > 1)
-            vp8_dequant_dc_idct_add_neon (q, dq, pre, dst, 16, stride, dc[0]);
+        if (((short *)eobs)[0] & 0xfefe)
+            idct_dequant_dc_full_2x_neon (q, dq, pre, dst, stride, dc);
         else
-            vp8_dc_only_idct_add_neon (dc[0], pre, dst, 16, stride);
+            idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
 
-        if (eobs[1] > 1)
-            vp8_dequant_dc_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride, dc[1]);
+        if (((short *)eobs)[1] & 0xfefe)
+            idct_dequant_dc_full_2x_neon (q+32, dq, pre+8, dst+8, stride, dc+2);
         else
-            vp8_dc_only_idct_add_neon (dc[1], pre+4, dst+4, 16, stride);
+            idct_dequant_dc_0_2x_neon(dc+2, pre+8, dst+8, stride);
 
-        if (eobs[2] > 1)
-            vp8_dequant_dc_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
-        else
-            vp8_dc_only_idct_add_neon (dc[2], pre+8, dst+8, 16, stride);
-
-        if (eobs[3] > 1)
-            vp8_dequant_dc_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
-        else
-            vp8_dc_only_idct_add_neon (dc[3], pre+12, dst+12, 16, stride);
-
         q    += 64;
         dc   += 4;
         pre  += 64;
@@ -56,38 +61,16 @@
 
     for (i = 0; i < 4; i++)
     {
-        if (eobs[0] > 1)
-            vp8_dequant_idct_add_neon (q, dq, pre, dst, 16, stride);
+        if (((short *)eobs)[0] & 0xfefe)
+            idct_dequant_full_2x_neon (q, dq, pre, dst, 16, stride);
         else
-        {
-            vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dst, 16, stride);
-            ((int *)q)[0] = 0;
-        }
+            idct_dequant_0_2x_neon (q, dq[0], pre, 16, dst, stride);
 
-        if (eobs[1] > 1)
-            vp8_dequant_idct_add_neon (q+16, dq, pre+4, dst+4, 16, stride);
+        if (((short *)eobs)[1] & 0xfefe)
+            idct_dequant_full_2x_neon (q+32, dq, pre+8, dst+8, 16, stride);
         else
-        {
-            vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dst+4, 16, stride);
-            ((int *)(q+16))[0] = 0;
-        }
+            idct_dequant_0_2x_neon (q+32, dq[0], pre+8, 16, dst+8, stride);
 
-        if (eobs[2] > 1)
-            vp8_dequant_idct_add_neon (q+32, dq, pre+8, dst+8, 16, stride);
-        else
-        {
-            vp8_dc_only_idct_add_neon (q[32]*dq[0], pre+8, dst+8, 16, stride);
-            ((int *)(q+32))[0] = 0;
-        }
-
-        if (eobs[3] > 1)
-            vp8_dequant_idct_add_neon (q+48, dq, pre+12, dst+12, 16, stride);
-        else
-        {
-            vp8_dc_only_idct_add_neon (q[48]*dq[0], pre+12, dst+12, 16, stride);
-            ((int *)(q+48))[0] = 0;
-        }
-
         q    += 64;
         pre  += 64;
         dst  += 4*stride;
@@ -99,53 +82,34 @@
             (short *q, short *dq, unsigned char *pre,
              unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
 {
-    int i;
+    if (((short *)eobs)[0] & 0xfefe)
+        idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
+    else
+        idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
 
-    for (i = 0; i < 2; i++)
-    {
-        if (eobs[0] > 1)
-            vp8_dequant_idct_add_neon (q, dq, pre, dstu, 8, stride);
-        else
-        {
-            vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstu, 8, stride);
-            ((int *)q)[0] = 0;
-        }
+    q    += 32;
+    pre  += 32;
+    dstu += 4*stride;
 
-        if (eobs[1] > 1)
-            vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstu+4, 8, stride);
-        else
-        {
-            vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstu+4, 8, stride);
-            ((int *)(q+16))[0] = 0;
-        }
+    if (((short *)eobs)[1] & 0xfefe)
+        idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
+    else
+        idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
 
-        q    += 32;
-        pre  += 32;
-        dstu += 4*stride;
-        eobs += 2;
-    }
+    q += 32;
+    pre += 32;
 
-    for (i = 0; i < 2; i++)
-    {
-        if (eobs[0] > 1)
-            vp8_dequant_idct_add_neon (q, dq, pre, dstv, 8, stride);
-        else
-        {
-            vp8_dc_only_idct_add_neon (q[0]*dq[0], pre, dstv, 8, stride);
-            ((int *)q)[0] = 0;
-        }
+    if (((short *)eobs)[2] & 0xfefe)
+        idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
+    else
+        idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
 
-        if (eobs[1] > 1)
-            vp8_dequant_idct_add_neon (q+16, dq, pre+4, dstv+4, 8, stride);
-        else
-        {
-            vp8_dc_only_idct_add_neon (q[16]*dq[0], pre+4, dstv+4, 8, stride);
-            ((int *)(q+16))[0] = 0;
-        }
+    q    += 32;
+    pre  += 32;
+    dstv += 4*stride;
 
-        q    += 32;
-        pre  += 32;
-        dstv += 4*stride;
-        eobs += 2;
-    }
+    if (((short *)eobs)[3] & 0xfefe)
+        idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
+    else
+        idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
 }
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
@@ -1,0 +1,79 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_0_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
+;                            int pitch, unsigned char *dst, int stride);
+; r0   *q
+; r1   dq
+; r2   *pre
+; r3   pitch
+; sp   *dst
+; sp+4 stride
+|idct_dequant_0_2x_neon| PROC
+    add             r12, r2, #4
+    vld1.32         {d2[0]}, [r2], r3
+    vld1.32         {d2[1]}, [r2], r3
+    vld1.32         {d4[0]}, [r2], r3
+    vld1.32         {d4[1]}, [r2]
+    vld1.32         {d8[0]}, [r12], r3
+    vld1.32         {d8[1]}, [r12], r3
+    vld1.32         {d10[0]}, [r12], r3
+    vld1.32         {d10[1]}, [r12]
+
+    ldrh            r12, [r0]               ; lo q
+    ldrh            r2, [r0, #32]           ; hi q
+    mov             r3, #0
+    strh            r3, [r0]
+    strh            r3, [r0, #32]
+
+    sxth            r12, r12                ; lo
+    mul             r0, r12, r1
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q0, r0
+    sxth            r2, r2                  ; hi
+    mul             r0, r2, r1
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q3, r0
+
+    vaddw.u8        q1, q0, d2              ; lo
+    vaddw.u8        q2, q0, d4
+    vaddw.u8        q4, q3, d8              ; hi
+    vaddw.u8        q5, q3, d10
+
+    ldr             r2, [sp]                ; dst
+    ldr             r3, [sp, #4]            ; stride
+
+    vqmovun.s16     d2, q1                  ; lo
+    vqmovun.s16     d4, q2
+    vqmovun.s16     d8, q4                  ; hi
+    vqmovun.s16     d10, q5
+
+    add             r0, r2, #4
+    vst1.32         {d2[0]}, [r2], r3       ; lo
+    vst1.32         {d2[1]}, [r2], r3
+    vst1.32         {d4[0]}, [r2], r3
+    vst1.32         {d4[1]}, [r2]
+    vst1.32         {d8[0]}, [r0], r3       ; hi
+    vst1.32         {d8[1]}, [r0], r3
+    vst1.32         {d10[0]}, [r0], r3
+    vst1.32         {d10[1]}, [r0]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_0_2x_neon|
+    END
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
@@ -1,0 +1,69 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_dc_0_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
+;                               unsigned char *dst, int stride);
+; r0  *dc
+; r1  *pre
+; r2  *dst
+; r3  stride
+|idct_dequant_dc_0_2x_neon| PROC
+    ldr             r0, [r0]                ; *dc
+    mov             r12, #16
+
+    vld1.32         {d2[0]}, [r1], r12      ; lo
+    vld1.32         {d2[1]}, [r1], r12
+    vld1.32         {d4[0]}, [r1], r12
+    vld1.32         {d4[1]}, [r1]
+    sub             r1, r1, #44
+    vld1.32         {d8[0]}, [r1], r12      ; hi
+    vld1.32         {d8[1]}, [r1], r12
+    vld1.32         {d10[0]}, [r1], r12
+    vld1.32         {d10[1]}, [r1]
+
+    sxth            r1, r0                  ; lo *dc
+    add             r1, r1, #4
+    asr             r1, r1, #3
+    vdup.16         q0, r1
+    sxth            r0, r0, ror #16         ; hi *dc
+    add             r0, r0, #4
+    asr             r0, r0, #3
+    vdup.16         q3, r0
+
+    vaddw.u8        q1, q0, d2              ; lo
+    vaddw.u8        q2, q0, d4
+    vaddw.u8        q4, q3, d8              ; hi
+    vaddw.u8        q5, q3, d10
+
+    vqmovun.s16     d2, q1                  ; lo
+    vqmovun.s16     d4, q2
+    vqmovun.s16     d8, q4                  ; hi
+    vqmovun.s16     d10, q5
+
+    add             r0, r2, #4
+    vst1.32         {d2[0]}, [r2], r3       ; lo
+    vst1.32         {d2[1]}, [r2], r3
+    vst1.32         {d4[0]}, [r2], r3
+    vst1.32         {d4[1]}, [r2]
+    vst1.32         {d8[0]}, [r0], r3       ; hi
+    vst1.32         {d8[1]}, [r0], r3
+    vst1.32         {d10[0]}, [r0], r3
+    vst1.32         {d10[1]}, [r0]
+
+    bx             lr
+
+    ENDP           ;|idct_dequant_dc_0_2x_neon|
+    END
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
@@ -1,0 +1,206 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_dc_full_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
+;                                  unsigned char *dst, int stride, short *dc);
+; r0    *q,
+; r1    *dq,
+; r2    *pre
+; r3    *dst
+; sp    stride
+; sp+4  *dc
+|idct_dequant_dc_full_2x_neon| PROC
+    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
+    vld1.16         {q2, q3}, [r0]          ; l q
+    mov             r1, #16                 ; pitch
+    add             r0, r0, #32
+    vld1.16         {q4, q5}, [r0]          ; r q
+    add             r12, r2, #4
+    ; interleave the predictors
+    vld1.32         {d28[0]}, [r2], r1      ; l pre
+    vld1.32         {d28[1]}, [r12], r1     ; r pre
+    vld1.32         {d29[0]}, [r2], r1
+    vld1.32         {d29[1]}, [r12], r1
+    vld1.32         {d30[0]}, [r2], r1
+    vld1.32         {d30[1]}, [r12], r1
+    vld1.32         {d31[0]}, [r2]
+    ldr             r1, [sp, #4]
+    vld1.32         {d31[1]}, [r12]
+
+    ldr             r2, _CONSTANTS_
+
+    ldrh            r12, [r1], #2           ; lo *dc
+    ldrh            r1, [r1]                ; hi *dc
+
+    ; dequant: q[i] = q[i] * dq[i]
+    vmul.i16        q2, q2, q0
+    vmul.i16        q3, q3, q1
+    vmul.i16        q4, q4, q0
+    vmul.i16        q5, q5, q1
+
+    ; move dc up to neon and overwrite first element
+    vmov.16         d4[0], r12
+    vmov.16         d8[0], r1
+
+    vld1.16         {d0}, [r2]
+
+    ; q2: l0r0  q3: l8r8
+    ; q4: l4r4  q5: l12r12
+    vswp            d5, d8
+    vswp            d7, d10
+
+    ; _CONSTANTS_ * 4,12 >> 16
+    ; q6:  4 * sinpi : c1/temp1
+    ; q7: 12 * sinpi : d1/temp2
+    ; q8:  4 * cospi
+    ; q9: 12 * cospi
+    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q7, q5, d0[2]
+    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
+    vqdmulh.s16     q9, q5, d0[0]
+
+    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
+    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
+
+    ; vqdmulh only accepts signed values. this was a problem because
+    ; our constant had the high bit set, and was treated as a negative value.
+    ; vqdmulh also doubles the value before it shifts by 16. we need to
+    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+    ; so we can shift the constant without losing precision. this avoids
+    ; shift again afterward, but also avoids the sign issue. win win!
+    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
+    ; pre-shift it
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    ; q4:  4 +  4 * cospi : d1/temp1
+    ; q5: 12 + 12 * cospi : c1/temp2
+    vqadd.s16       q4, q4, q8
+    vqadd.s16       q5, q5, q9
+
+    ; c1 = temp1 - temp2
+    ; d1 = temp1 + temp2
+    vqsub.s16       q2, q6, q5
+    vqadd.s16       q3, q4, q7
+
+    ; [0]: a1+d1
+    ; [1]: b1+c1
+    ; [2]: b1-c1
+    ; [3]: a1-d1
+    vqadd.s16       q4, q10, q3
+    vqadd.s16       q5, q11, q2
+    vqsub.s16       q6, q11, q2
+    vqsub.s16       q7, q10, q3
+
+    ; rotate
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+    ; idct loop 2
+    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+    ; q6: l 2, 6,10,14 r 2, 6,10,14
+    ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+    ; q8:  1 * sinpi : c1/temp1
+    ; q9:  3 * sinpi : d1/temp2
+    ; q10: 1 * cospi
+    ; q11: 3 * cospi
+    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q9, q7, d0[2]
+    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
+    vqdmulh.s16     q11, q7, d0[0]
+
+    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
+    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
+
+    ; see note on shifting above
+    vshr.s16        q10, q10, #1
+    vshr.s16        q11, q11, #1
+
+    ; q10: 1 + 1 * cospi : d1/temp1
+    ; q11: 3 + 3 * cospi : c1/temp2
+    vqadd.s16       q10, q5, q10
+    vqadd.s16       q11, q7, q11
+
+    ; q8: c1 = temp1 - temp2
+    ; q9: d1 = temp1 + temp2
+    vqsub.s16       q8, q8, q11
+    vqadd.s16       q9, q10, q9
+
+    ; a1+d1
+    ; b1+c1
+    ; b1-c1
+    ; a1-d1
+    vqadd.s16       q4, q2, q9
+    vqadd.s16       q5, q3, q8
+    vqsub.s16       q6, q3, q8
+    vqsub.s16       q7, q2, q9
+
+    ; +4 >> 3 (rounding)
+    vrshr.s16       q4, q4, #3              ; lo
+    vrshr.s16       q5, q5, #3
+    vrshr.s16       q6, q6, #3              ; hi
+    vrshr.s16       q7, q7, #3
+
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+
+    ; adding pre
+    ; input is still packed. pre was read interleaved
+    vaddw.u8        q4, q4, d28
+    vaddw.u8        q5, q5, d29
+    vaddw.u8        q6, q6, d30
+    vaddw.u8        q7, q7, d31
+
+    vmov.i16        q14, #0
+    vmov            q15, q14
+    vst1.16         {q14, q15}, [r0]        ; write over high input
+    sub             r0, r0, #32
+    vst1.16         {q14, q15}, [r0]        ; write over low input
+
+    ;saturate and narrow
+    vqmovun.s16     d0, q4                  ; lo
+    vqmovun.s16     d1, q5
+    vqmovun.s16     d2, q6                  ; hi
+    vqmovun.s16     d3, q7
+
+    ldr             r1, [sp]                ; stride
+    add             r2, r3, #4              ; hi
+    vst1.32         {d0[0]}, [r3], r1       ; lo
+    vst1.32         {d0[1]}, [r2], r1       ; hi
+    vst1.32         {d1[0]}, [r3], r1
+    vst1.32         {d1[1]}, [r2], r1
+    vst1.32         {d2[0]}, [r3], r1
+    vst1.32         {d2[1]}, [r2], r1
+    vst1.32         {d3[0]}, [r3]
+    vst1.32         {d3[1]}, [r2]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_dc_full_2x_neon|
+
+; Constant Pool
+_CONSTANTS_       DCD cospi8sqrt2minus1
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2       DCD 0x4546
+
+    END
--- /dev/null
+++ b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
@@ -1,0 +1,198 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |idct_dequant_full_2x_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
+;                               unsigned char *dst, int pitch, int stride);
+; r0    *q,
+; r1    *dq,
+; r2    *pre
+; r3    *dst
+; sp    pitch
+; sp+4  stride
+|idct_dequant_full_2x_neon| PROC
+    vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
+    vld1.16         {q2, q3}, [r0]          ; l q
+    ldr             r1, [sp]                ; pitch
+    add             r0, r0, #32
+    vld1.16         {q4, q5}, [r0]          ; r q
+    add             r12, r2, #4
+    ; interleave the predictors
+    vld1.32         {d28[0]}, [r2], r1      ; l pre
+    vld1.32         {d28[1]}, [r12], r1     ; r pre
+    vld1.32         {d29[0]}, [r2], r1
+    vld1.32         {d29[1]}, [r12], r1
+    vld1.32         {d30[0]}, [r2], r1
+    vld1.32         {d30[1]}, [r12], r1
+    vld1.32         {d31[0]}, [r2]
+    vld1.32         {d31[1]}, [r12]
+
+    ldr             r2, _CONSTANTS_
+
+    ; dequant: q[i] = q[i] * dq[i]
+    vmul.i16        q2, q2, q0
+    vmul.i16        q3, q3, q1
+    vmul.i16        q4, q4, q0
+    vmul.i16        q5, q5, q1
+
+    vld1.16         {d0}, [r2]
+
+    ; q2: l0r0  q3: l8r8
+    ; q4: l4r4  q5: l12r12
+    vswp            d5, d8
+    vswp            d7, d10
+
+    ; _CONSTANTS_ * 4,12 >> 16
+    ; q6:  4 * sinpi : c1/temp1
+    ; q7: 12 * sinpi : d1/temp2
+    ; q8:  4 * cospi
+    ; q9: 12 * cospi
+    vqdmulh.s16     q6, q4, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q7, q5, d0[2]
+    vqdmulh.s16     q8, q4, d0[0]           ; cospi8sqrt2minus1
+    vqdmulh.s16     q9, q5, d0[0]
+
+    vqadd.s16       q10, q2, q3             ; a1 = 0 + 8
+    vqsub.s16       q11, q2, q3             ; b1 = 0 - 8
+
+    ; vqdmulh only accepts signed values. this was a problem because
+    ; our constant had the high bit set, and was treated as a negative value.
+    ; vqdmulh also doubles the value before it shifts by 16. we need to
+    ; compensate for this. in the case of sinpi8sqrt2, the lowest bit is 0,
+    ; so we can shift the constant without losing precision. this avoids
+    ; shift again afterward, but also avoids the sign issue. win win!
+    ; for cospi8sqrt2minus1 the lowest bit is 1, so we lose precision if we
+    ; pre-shift it
+    vshr.s16        q8, q8, #1
+    vshr.s16        q9, q9, #1
+
+    ; q4:  4 +  4 * cospi : d1/temp1
+    ; q5: 12 + 12 * cospi : c1/temp2
+    vqadd.s16       q4, q4, q8
+    vqadd.s16       q5, q5, q9
+
+    ; c1 = temp1 - temp2
+    ; d1 = temp1 + temp2
+    vqsub.s16       q2, q6, q5
+    vqadd.s16       q3, q4, q7
+
+    ; [0]: a1+d1
+    ; [1]: b1+c1
+    ; [2]: b1-c1
+    ; [3]: a1-d1
+    vqadd.s16       q4, q10, q3
+    vqadd.s16       q5, q11, q2
+    vqsub.s16       q6, q11, q2
+    vqsub.s16       q7, q10, q3
+
+    ; rotate
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+    ; idct loop 2
+    ; q4: l 0, 4, 8,12 r 0, 4, 8,12
+    ; q5: l 1, 5, 9,13 r 1, 5, 9,13
+    ; q6: l 2, 6,10,14 r 2, 6,10,14
+    ; q7: l 3, 7,11,15 r 3, 7,11,15
+
+    ; q8:  1 * sinpi : c1/temp1
+    ; q9:  3 * sinpi : d1/temp2
+    ; q10: 1 * cospi
+    ; q11: 3 * cospi
+    vqdmulh.s16     q8, q5, d0[2]           ; sinpi8sqrt2
+    vqdmulh.s16     q9, q7, d0[2]
+    vqdmulh.s16     q10, q5, d0[0]          ; cospi8sqrt2minus1
+    vqdmulh.s16     q11, q7, d0[0]
+
+    vqadd.s16       q2, q4, q6             ; a1 = 0 + 2
+    vqsub.s16       q3, q4, q6             ; b1 = 0 - 2
+
+    ; see note on shifting above
+    vshr.s16        q10, q10, #1
+    vshr.s16        q11, q11, #1
+
+    ; q10: 1 + 1 * cospi : d1/temp1
+    ; q11: 3 + 3 * cospi : c1/temp2
+    vqadd.s16       q10, q5, q10
+    vqadd.s16       q11, q7, q11
+
+    ; q8: c1 = temp1 - temp2
+    ; q9: d1 = temp1 + temp2
+    vqsub.s16       q8, q8, q11
+    vqadd.s16       q9, q10, q9
+
+    ; a1+d1
+    ; b1+c1
+    ; b1-c1
+    ; a1-d1
+    vqadd.s16       q4, q2, q9
+    vqadd.s16       q5, q3, q8
+    vqsub.s16       q6, q3, q8
+    vqsub.s16       q7, q2, q9
+
+    ; +4 >> 3 (rounding)
+    vrshr.s16       q4, q4, #3              ; lo
+    vrshr.s16       q5, q5, #3
+    vrshr.s16       q6, q6, #3              ; hi
+    vrshr.s16       q7, q7, #3
+
+    vtrn.32         q4, q6
+    vtrn.32         q5, q7
+    vtrn.16         q4, q5
+    vtrn.16         q6, q7
+
+    ; adding pre
+    ; input is still packed. pre was read interleaved
+    vaddw.u8        q4, q4, d28
+    vaddw.u8        q5, q5, d29
+    vaddw.u8        q6, q6, d30
+    vaddw.u8        q7, q7, d31
+
+    vmov.i16        q14, #0
+    vmov            q15, q14
+    vst1.16         {q14, q15}, [r0]        ; write over high input
+    sub             r0, r0, #32
+    vst1.16         {q14, q15}, [r0]        ; write over low input
+
+    ;saturate and narrow
+    vqmovun.s16     d0, q4                  ; lo
+    vqmovun.s16     d1, q5
+    vqmovun.s16     d2, q6                  ; hi
+    vqmovun.s16     d3, q7
+
+    ldr             r1, [sp, #4]            ; stride
+    add             r2, r3, #4              ; hi
+    vst1.32         {d0[0]}, [r3], r1       ; lo
+    vst1.32         {d0[1]}, [r2], r1       ; hi
+    vst1.32         {d1[0]}, [r3], r1
+    vst1.32         {d1[1]}, [r2], r1
+    vst1.32         {d2[0]}, [r3], r1
+    vst1.32         {d2[1]}, [r2], r1
+    vst1.32         {d3[0]}, [r3]
+    vst1.32         {d3[1]}, [r2]
+
+    bx             lr
+
+    ENDP           ; |idct_dequant_full_2x_neon|
+
+; Constant Pool
+_CONSTANTS_       DCD cospi8sqrt2minus1
+cospi8sqrt2minus1 DCD 0x4e7b
+; because the lowest bit in 0x8a8c is 0, we can pre-shift this
+sinpi8sqrt2       DCD 0x4546
+
+    END
--- a/vp8/vp8dx_arm.mk
+++ b/vp8/vp8dx_arm.mk
@@ -25,7 +25,10 @@
 VP8_DX_SRCS-$(HAVE_ARMV6)  += decoder/arm/armv6/idct_blk_v6.c
 
 #File list for neon
-VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequant_dc_idct_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_full_2x_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_dc_0_2x_neon$(ASM)
 VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequant_idct_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_full_2x_neon$(ASM)
+VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_dequant_0_2x_neon$(ASM)
 VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/dequantizeb_neon$(ASM)
 VP8_DX_SRCS-$(HAVE_ARMV7)  += decoder/arm/neon/idct_blk_neon.c
--