shithub: libvpx

--- a/vp8/common/copy_c.c

+++ /dev/null

@@ -1,27 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include <string.h>

-#include "./vp8_rtcd.h"

-#include "vpx/vpx_integer.h"

-/* Copy 2 macroblocks to a buffer */

-void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride,

-                    unsigned char *dst_ptr, int dst_stride, int height) {

-  int r;

-  for (r = 0; r < height; ++r) {

-    memcpy(dst_ptr, src_ptr, 32);

-    src_ptr += src_stride;

-    dst_ptr += dst_stride;

-  }

-}

--- a/vp8/common/x86/copy_sse2.asm

+++ /dev/null

@@ -1,94 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-SECTION .text

-;void vp8_copy32xn_sse2(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *dst_ptr,

-;    int  dst_stride,

-;    int height);

-global sym(vp8_copy32xn_sse2) PRIVATE

-sym(vp8_copy32xn_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 5

-    SAVE_XMM 7

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov             rsi,        arg(0) ;src_ptr

-        mov             rdi,        arg(2) ;dst_ptr

-        movsxd          rax,        dword ptr arg(1) ;src_stride

-        movsxd          rdx,        dword ptr arg(3) ;dst_stride

-        movsxd          rcx,        dword ptr arg(4) ;height

-.block_copy_sse2_loopx4:

-        movdqu          xmm0,       XMMWORD PTR [rsi]

-        movdqu          xmm1,       XMMWORD PTR [rsi + 16]

-        movdqu          xmm2,       XMMWORD PTR [rsi + rax]

-        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]

-        lea             rsi,        [rsi+rax*2]

-        movdqu          xmm4,       XMMWORD PTR [rsi]

-        movdqu          xmm5,       XMMWORD PTR [rsi + 16]

-        movdqu          xmm6,       XMMWORD PTR [rsi + rax]

-        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]

-        lea             rsi,    [rsi+rax*2]

-        movdqa          XMMWORD PTR [rdi], xmm0

-        movdqa          XMMWORD PTR [rdi + 16], xmm1

-        movdqa          XMMWORD PTR [rdi + rdx], xmm2

-        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3

-        lea             rdi,    [rdi+rdx*2]

-        movdqa          XMMWORD PTR [rdi], xmm4

-        movdqa          XMMWORD PTR [rdi + 16], xmm5

-        movdqa          XMMWORD PTR [rdi + rdx], xmm6

-        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7

-        lea             rdi,    [rdi+rdx*2]

-        sub             rcx,     4

-        cmp             rcx,     4

-        jge             .block_copy_sse2_loopx4

-        cmp             rcx, 0

-        je              .copy_is_done

-.block_copy_sse2_loop:

-        movdqu          xmm0,       XMMWORD PTR [rsi]

-        movdqu          xmm1,       XMMWORD PTR [rsi + 16]

-        lea             rsi,    [rsi+rax]

-        movdqa          XMMWORD PTR [rdi], xmm0

-        movdqa          XMMWORD PTR [rdi + 16], xmm1

-        lea             rdi,    [rdi+rdx]

-        sub             rcx,     1

-        jne             .block_copy_sse2_loop

-.copy_is_done:

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

--- a/vp8/common/x86/copy_sse3.asm

+++ /dev/null

@@ -1,147 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-%macro STACK_FRAME_CREATE_X3 0

-%if ABI_IS_32BIT

-  %define     src_ptr       rsi

-  %define     src_stride    rax

-  %define     ref_ptr       rdi

-  %define     ref_stride    rdx

-  %define     end_ptr       rcx

-  %define     ret_var       rbx

-  %define     result_ptr    arg(4)

-  %define     max_sad       arg(4)

-  %define     height        dword ptr arg(4)

-    push        rbp

-    mov         rbp,        rsp

-    push        rsi

-    push        rdi

-    push        rbx

-    mov         rsi,        arg(0)              ; src_ptr

-    mov         rdi,        arg(2)              ; ref_ptr

-    movsxd      rax,        dword ptr arg(1)    ; src_stride

-    movsxd      rdx,        dword ptr arg(3)    ; ref_stride

-%else

-  %if LIBVPX_YASM_WIN64

-    SAVE_XMM 7, u

-    %define     src_ptr     rcx

-    %define     src_stride  rdx

-    %define     ref_ptr     r8

-    %define     ref_stride  r9

-    %define     end_ptr     r10

-    %define     ret_var     r11

-    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]

-    %define     max_sad     [rsp+xmm_stack_space+8+4*8]

-    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

-  %else

-    %define     src_ptr     rdi

-    %define     src_stride  rsi

-    %define     ref_ptr     rdx

-    %define     ref_stride  rcx

-    %define     end_ptr     r9

-    %define     ret_var     r10

-    %define     result_ptr  r8

-    %define     max_sad     r8

-    %define     height      r8

-  %endif

-%endif

-%endmacro

-%macro STACK_FRAME_DESTROY_X3 0

-  %define     src_ptr

-  %define     src_stride

-  %define     ref_ptr

-  %define     ref_stride

-  %define     end_ptr

-  %define     ret_var

-  %define     result_ptr

-  %define     max_sad

-  %define     height

-%if ABI_IS_32BIT

-    pop         rbx

-    pop         rdi

-    pop         rsi

-    pop         rbp

-%else

-  %if LIBVPX_YASM_WIN64

-    RESTORE_XMM

-  %endif

-%endif

-    ret

-%endmacro

-SECTION .text

-;void vp8_copy32xn_sse3(

-;    unsigned char *src_ptr,

-;    int  src_stride,

-;    unsigned char *dst_ptr,

-;    int  dst_stride,

-;    int height);

-global sym(vp8_copy32xn_sse3) PRIVATE

-sym(vp8_copy32xn_sse3):

-    STACK_FRAME_CREATE_X3

-.block_copy_sse3_loopx4:

-        lea             end_ptr,    [src_ptr+src_stride*2]

-        movdqu          xmm0,       XMMWORD PTR [src_ptr]

-        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]

-        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]

-        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]

-        movdqu          xmm4,       XMMWORD PTR [end_ptr]

-        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]

-        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]

-        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

-        lea             src_ptr,    [src_ptr+src_stride*4]

-        lea             end_ptr,    [ref_ptr+ref_stride*2]

-        movdqa          XMMWORD PTR [ref_ptr], xmm0

-        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1

-        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2

-        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3

-        movdqa          XMMWORD PTR [end_ptr], xmm4

-        movdqa          XMMWORD PTR [end_ptr + 16], xmm5

-        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6

-        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

-        lea             ref_ptr,    [ref_ptr+ref_stride*4]

-        sub             height,     4

-        cmp             height,     4

-        jge             .block_copy_sse3_loopx4

-        ;Check to see if there is more rows need to be copied.

-        cmp             height, 0

-        je              .copy_is_done

-.block_copy_sse3_loop:

-        movdqu          xmm0,       XMMWORD PTR [src_ptr]

-        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]

-        lea             src_ptr,    [src_ptr+src_stride]

-        movdqa          XMMWORD PTR [ref_ptr], xmm0

-        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1

-        lea             ref_ptr,    [ref_ptr+ref_stride]

-        sub             height,     1

-        jne             .block_copy_sse3_loop

-.copy_is_done:

-    STACK_FRAME_DESTROY_X3

--- /dev/null

+++ b/vp8/encoder/copy_c.c

@@ -1,0 +1,27 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <string.h>

+#include "./vp8_rtcd.h"

+#include "vpx/vpx_integer.h"

+/* Copy 2 macroblocks to a buffer */

+void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride,

+                    unsigned char *dst_ptr, int dst_stride, int height) {

+  int rows_left = height; /* a non-positive height copies nothing */

+  for (; rows_left > 0; --rows_left) {

+    memcpy(dst_ptr, src_ptr, 32); /* one 32-byte row */

+    dst_ptr += dst_stride;

+    src_ptr += src_stride;

+  }

+}

--- /dev/null

+++ b/vp8/encoder/x86/copy_sse2.asm

@@ -1,0 +1,102 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "vpx_ports/x86_abi_support.asm"

+SECTION .text

+;void vp8_copy32xn_sse2(

+;    unsigned char *src_ptr,

+;    int  src_stride,

+;    unsigned char *dst_ptr,

+;    int  dst_stride,

+;    int height);

+;

+; Copies `height` rows of 32 bytes from src_ptr to dst_ptr, stepping by

+; src_stride / dst_stride bytes per row. Rows are loaded with movdqu and

+; stored with movdqa, so each destination row must be 16-byte aligned.

+; A height <= 0 copies nothing, matching vp8_copy32xn_c.

+global sym(vp8_copy32xn_sse2) PRIVATE

+sym(vp8_copy32xn_sse2):

+    push        rbp

+    mov         rbp, rsp

+    SHADOW_ARGS_TO_STACK 5

+    SAVE_XMM 7

+    push        rsi

+    push        rdi

+    ; end prolog

+        mov             rsi,        arg(0) ;src_ptr

+        mov             rdi,        arg(2) ;dst_ptr

+        movsxd          rax,        dword ptr arg(1) ;src_stride

+        movsxd          rdx,        dword ptr arg(3) ;dst_stride

+        movsxd          rcx,        dword ptr arg(4) ;height (rows remaining)

+        cmp             rcx,     4      ; fewer than 4 rows requested?

+        jl              .block_copy_sse2_tail ; then skip the unrolled loop

+.block_copy_sse2_loopx4:                ; invariant: rcx >= 4 here

+        movdqu          xmm0,       XMMWORD PTR [rsi]

+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]

+        movdqu          xmm2,       XMMWORD PTR [rsi + rax]

+        movdqu          xmm3,       XMMWORD PTR [rsi + rax + 16]

+        lea             rsi,        [rsi+rax*2]

+        movdqu          xmm4,       XMMWORD PTR [rsi]

+        movdqu          xmm5,       XMMWORD PTR [rsi + 16]

+        movdqu          xmm6,       XMMWORD PTR [rsi + rax]

+        movdqu          xmm7,       XMMWORD PTR [rsi + rax + 16]

+        lea             rsi,    [rsi+rax*2]

+        movdqa          XMMWORD PTR [rdi], xmm0

+        movdqa          XMMWORD PTR [rdi + 16], xmm1

+        movdqa          XMMWORD PTR [rdi + rdx], xmm2

+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm3

+        lea             rdi,    [rdi+rdx*2]

+        movdqa          XMMWORD PTR [rdi], xmm4

+        movdqa          XMMWORD PTR [rdi + 16], xmm5

+        movdqa          XMMWORD PTR [rdi + rdx], xmm6

+        movdqa          XMMWORD PTR [rdi + rdx + 16], xmm7

+        lea             rdi,    [rdi+rdx*2]

+        sub             rcx,     4

+        cmp             rcx,     4

+        jge             .block_copy_sse2_loopx4

+.block_copy_sse2_tail:                  ; rcx < 4: copy leftovers row by row

+        cmp             rcx, 0

+        jle             .copy_is_done   ; nothing left (or height was <= 0)

+.block_copy_sse2_loop:                  ; invariant: rcx > 0 here

+        movdqu          xmm0,       XMMWORD PTR [rsi]

+        movdqu          xmm1,       XMMWORD PTR [rsi + 16]

+        lea             rsi,    [rsi+rax]

+        movdqa          XMMWORD PTR [rdi], xmm0

+        movdqa          XMMWORD PTR [rdi + 16], xmm1

+        lea             rdi,    [rdi+rdx]

+        sub             rcx,     1

+        jne             .block_copy_sse2_loop

+.copy_is_done:

+    ; begin epilog

+    pop rdi

+    pop rsi

+    RESTORE_XMM

+    UNSHADOW_ARGS

+    pop         rbp

+    ret

--- /dev/null

+++ b/vp8/encoder/x86/copy_sse3.asm

@@ -1,0 +1,166 @@

+;

+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+;

+;  Use of this source code is governed by a BSD-style license

+;  that can be found in the LICENSE file in the root of the source

+;  tree. An additional intellectual property rights grant can be found

+;  in the file PATENTS.  All contributing project authors may

+;  be found in the AUTHORS file in the root of the source tree.

+;

+%include "vpx_ports/x86_abi_support.asm"

+; Prologue: binds the operand names used by vp8_copy32xn_sse3 to the

+; registers / stack slots of the active ABI (32-bit, Win64 or other 64-bit).

+%macro STACK_FRAME_CREATE_X3 0

+%if ABI_IS_32BIT

+  %define     src_ptr       rsi

+  %define     src_stride    rax

+  %define     ref_ptr       rdi

+  %define     ref_stride    rdx

+  %define     end_ptr       rcx

+  %define     ret_var       rbx

+  %define     result_ptr    arg(4)

+  %define     max_sad       arg(4)

+  %define     height        dword ptr arg(4)

+    push        rbp

+    mov         rbp,        rsp

+    push        rsi

+    push        rdi

+    push        rbx

+    mov         rsi,        arg(0)              ; src_ptr

+    mov         rdi,        arg(2)              ; ref_ptr

+    movsxd      rax,        dword ptr arg(1)    ; src_stride

+    movsxd      rdx,        dword ptr arg(3)    ; ref_stride

+%else

+  %if LIBVPX_YASM_WIN64

+    SAVE_XMM 7, u

+    %define     src_ptr     rcx

+    %define     src_stride  rdx

+    %define     ref_ptr     r8

+    %define     ref_stride  r9

+    %define     end_ptr     r10

+    %define     ret_var     r11

+    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]

+    %define     max_sad     [rsp+xmm_stack_space+8+4*8]

+    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

+    ; The strides are 32-bit ints; widen them before they are used in addressing.

+    movsxd      rdx,        edx                 ; src_stride

+    movsxd      r9,         r9d                 ; ref_stride

+  %else

+    %define     src_ptr     rdi

+    %define     src_stride  rsi

+    %define     ref_ptr     rdx

+    %define     ref_stride  rcx

+    %define     end_ptr     r9

+    %define     ret_var     r10

+    %define     result_ptr  r8

+    %define     max_sad     r8

+    ; height is an int: only the low 32 bits of r8 are defined, so use r8d.

+    %define     height      r8d

+    ; The strides are 32-bit ints; widen them before they are used in addressing.

+    movsxd      rsi,        esi                 ; src_stride

+    movsxd      rcx,        ecx                 ; ref_stride

+  %endif

+%endif

+%endmacro

+; Epilogue: clears the operand names, undoes what STACK_FRAME_CREATE_X3

+; pushed / saved for the active ABI, and returns to the caller.

+%macro STACK_FRAME_DESTROY_X3 0

+  %define     src_ptr

+  %define     src_stride

+  %define     ref_ptr

+  %define     ref_stride

+  %define     end_ptr

+  %define     ret_var

+  %define     result_ptr

+  %define     max_sad

+  %define     height

+%if ABI_IS_32BIT

+    pop         rbx

+    pop         rdi

+    pop         rsi

+    pop         rbp

+%else

+  %if LIBVPX_YASM_WIN64

+    RESTORE_XMM

+  %endif

+%endif

+    ret

+%endmacro

+SECTION .text

+;void vp8_copy32xn_sse3(

+;    unsigned char *src_ptr,

+;    int  src_stride,

+;    unsigned char *dst_ptr,

+;    int  dst_stride,

+;    int height);

+;

+; Copies `height` rows of 32 bytes from src_ptr to dst_ptr (ref_ptr below),

+; stepping by src_stride / dst_stride bytes per row. Rows are loaded with

+; movdqu and stored with movdqa, so each destination row must be 16-byte

+; aligned. A height <= 0 copies nothing, matching vp8_copy32xn_c.

+global sym(vp8_copy32xn_sse3) PRIVATE

+sym(vp8_copy32xn_sse3):

+    STACK_FRAME_CREATE_X3

+        cmp             height,     4           ; fewer than 4 rows requested?

+        jl              .block_copy_sse3_tail   ; then skip the unrolled loop

+.block_copy_sse3_loopx4:                        ; invariant: height >= 4 here

+        lea             end_ptr,    [src_ptr+src_stride*2]

+        movdqu          xmm0,       XMMWORD PTR [src_ptr]

+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]

+        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]

+        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]

+        movdqu          xmm4,       XMMWORD PTR [end_ptr]

+        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]

+        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]

+        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

+        lea             src_ptr,    [src_ptr+src_stride*4]

+        lea             end_ptr,    [ref_ptr+ref_stride*2]

+        movdqa          XMMWORD PTR [ref_ptr], xmm0

+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1

+        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2

+        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3

+        movdqa          XMMWORD PTR [end_ptr], xmm4

+        movdqa          XMMWORD PTR [end_ptr + 16], xmm5

+        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6

+        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

+        lea             ref_ptr,    [ref_ptr+ref_stride*4]

+        sub             height,     4

+        cmp             height,     4

+        jge             .block_copy_sse3_loopx4

+.block_copy_sse3_tail:

+        ; Check whether any rows remain to be copied one at a time.

+        cmp             height, 0

+        jle             .copy_is_done           ; nothing left (or height was <= 0)

+.block_copy_sse3_loop:                          ; invariant: height > 0 here

+        movdqu          xmm0,       XMMWORD PTR [src_ptr]

+        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]

+        lea             src_ptr,    [src_ptr+src_stride]

+        movdqa          XMMWORD PTR [ref_ptr], xmm0

+        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1

+        lea             ref_ptr,    [ref_ptr+ref_stride]

+        sub             height,     1

+        jne             .block_copy_sse3_loop

+.copy_is_done:

+    STACK_FRAME_DESTROY_X3

--- a/vp8/vp8_common.mk

+++ b/vp8/vp8_common.mk

@@ -15,7 +15,6 @@

 VP8_COMMON_SRCS-yes += common/alloccommon.c

 VP8_COMMON_SRCS-yes += common/blockd.c

 VP8_COMMON_SRCS-yes += common/coefupdateprobs.h

-VP8_COMMON_SRCS-yes += common/copy_c.c

 # VP8_COMMON_SRCS-yes += common/debugmodes.c

 VP8_COMMON_SRCS-yes += common/default_coef_probs.h

 VP8_COMMON_SRCS-yes += common/dequantize.c

@@ -80,7 +79,6 @@

 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm

 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm

 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm

-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm

 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c

 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm

 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm

@@ -88,7 +86,6 @@

 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm

 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm

 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm

-VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm

 VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm

 ifeq ($(CONFIG_POSTPROC),yes)

--- a/vp8/vp8cx.mk

+++ b/vp8/vp8cx.mk

@@ -23,6 +23,7 @@

 VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h

 VP8_CX_SRCS-yes += encoder/bitstream.c

 VP8_CX_SRCS-yes += encoder/boolhuff.c

+VP8_CX_SRCS-yes += encoder/copy_c.c

 VP8_CX_SRCS-yes += encoder/dct.c

 VP8_CX_SRCS-yes += encoder/encodeframe.c

 VP8_CX_SRCS-yes += encoder/encodeframe.h

@@ -82,6 +83,8 @@

 VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h

 endif

+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/copy_sse2.asm

+VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/copy_sse3.asm

 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm

 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm

 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c

--

⑨