ref: c17a58efdfa8c03e3a1ae8e7f78483d48700499c
dir: /codec/encoder/core/x86/matrix_transpose.asm/
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*************************************************************************/

%include "asm_inc.asm"

;-----------------------------------------------------------------------
; TRANSPOSE_8x8B_MMX  r1, r2, r3, r4, r5, r6, r7, r8, row8_mem, tmp_mem
;
;   %1..%8 : eight MMX registers
;   %9     : memory operand loaded into %6 part-way through the bw pass
;            (both call sites in this file pass the address of the
;            eighth source row)
;   %10    : 8-byte scratch memory operand; one register is spilled to it
;            and reloaded in every pass (call sites pass [r7])
;
; MMX_XSwap comes from asm_inc.asm; its first argument (bw / wd / dq)
; is forwarded unchanged.
;
;in:  m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
;-----------------------------------------------------------------------
%macro TRANSPOSE_8x8B_MMX 10
    ; ---- bw pass ----
    MMX_XSwap   bw, %1, %2, %8
    MMX_XSwap   bw, %3, %4, %2
    MMX_XSwap   bw, %5, %6, %4
    movq        %6, %9              ; bring in the operand supplied as %9
    movq        %10, %4             ; spill %4 so it can serve as the next temp
    MMX_XSwap   bw, %7, %6, %4

    ; ---- wd pass ----
    MMX_XSwap   wd, %1, %3, %6
    MMX_XSwap   wd, %8, %2, %3
    MMX_XSwap   wd, %5, %7, %2
    movq        %7, %10             ; reload the value spilled in the bw pass
    movq        %10, %3             ; spill %3
    MMX_XSwap   wd, %7, %4, %3

    ; ---- dq pass ----
    MMX_XSwap   dq, %1, %5, %4
    MMX_XSwap   dq, %6, %2, %5
    MMX_XSwap   dq, %8, %7, %2
    movq        %7, %10             ; reload the value spilled in the wd pass
    movq        %10, %5             ; spill %5
    MMX_XSwap   dq, %7, %3, %5

    movq        %3, %10             ; final reload from scratch
%endmacro

;-----------------------------------------------------------------------
; TRANSPOSE8x8_WRITE_MMX  dst, dst_stride
;
; Stores mm0, mm3, mm5, mm2, mm7, mm1, mm6, mm4 (8 bytes each) to eight
; consecutive rows starting at [dst].
; Side effect: dst is advanced by 6*dst_stride.
;
;in:  m0, m3, m5, m2, m7, m1, m6, m4
;-----------------------------------------------------------------------
%macro TRANSPOSE8x8_WRITE_MMX 2     ; dst, dst_stride
    movq    [%1],    mm0            ; row 1
    movq    [%1+%2], mm3            ; row 2
    lea     %1, [%1+2*%2]
    movq    [%1],    mm5            ; row 3
    movq    [%1+%2], mm2            ; row 4
    lea     %1, [%1+2*%2]
    movq    [%1],    mm7            ; row 5
    movq    [%1+%2], mm1            ; row 6
    lea     %1, [%1+2*%2]
    movq    [%1],    mm6            ; row 7
    movq    [%1+%2], mm4            ; row 8
%endmacro

;-----------------------------------------------------------------------
; TRANSPOSE8x8_WRITE_ALT_MMX  dst, dst_stride, tmp
;
; Same stores as TRANSPOSE8x8_WRITE_MMX, but dst is left untouched; the
; running row pointer is kept in tmp, which is overwritten.
;
;in:  m0, m3, m5, m2, m7, m1, m6, m4
;-----------------------------------------------------------------------
%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
    movq    [%1],    mm0            ; row 1
    movq    [%1+%2], mm3            ; row 2
    lea     %3, [%1+2*%2]
    movq    [%3],    mm5            ; row 3
    movq    [%3+%2], mm2            ; row 4
    lea     %3, [%3+2*%2]
    movq    [%3],    mm7            ; row 5
    movq    [%3+%2], mm1            ; row 6
    lea     %3, [%3+2*%2]
    movq    [%3],    mm6            ; row 7
    movq    [%3+%2], mm4            ; row 8
%endmacro   ; end of TRANSPOSE8x8_WRITE_ALT_MMX

;-----------------------------------------------------------------------
; TRANSPOSE_8x16B_SSE2  r1, r2, r3, r4, r5, r6, r7, r8, row8_mem, tmp_mem
; (for transpose 16x8)
;
;   %1..%8 : eight XMM registers
;   %9     : memory operand loaded into %6 part-way through the bw pass
;            (call sites pass the address of the eighth source row)
;   %10    : 16-byte scratch memory operand used as a spill slot
;            (call sites pass [r7])
;
; %9 and %10 are accessed with movdqa, so both must be 16-byte aligned.
; SSE2_XSawp comes from asm_inc.asm; its first argument
; (bw / wd / dq / qdq) is forwarded unchanged.
;
;in:  m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
;-----------------------------------------------------------------------
%macro TRANSPOSE_8x16B_SSE2 10
    ; ---- bw pass ----
    SSE2_XSawp  bw, %1, %2, %8
    SSE2_XSawp  bw, %3, %4, %2
    SSE2_XSawp  bw, %5, %6, %4
    movdqa      %6, %9              ; bring in the operand supplied as %9
    movdqa      %10, %4             ; spill %4 so it can serve as the next temp
    SSE2_XSawp  bw, %7, %6, %4

    ; ---- wd pass ----
    SSE2_XSawp  wd, %1, %3, %6
    SSE2_XSawp  wd, %8, %2, %3
    SSE2_XSawp  wd, %5, %7, %2
    movdqa      %7, %10             ; reload the value spilled in the bw pass
    movdqa      %10, %3             ; spill %3
    SSE2_XSawp  wd, %7, %4, %3

    ; ---- dq pass ----
    SSE2_XSawp  dq, %1, %5, %4
    SSE2_XSawp  dq, %6, %2, %5
    SSE2_XSawp  dq, %8, %7, %2
    movdqa      %7, %10             ; reload the value spilled in the wd pass
    movdqa      %10, %5             ; spill %5
    SSE2_XSawp  dq, %7, %3, %5

    ; ---- qdq pass ----
    SSE2_XSawp  qdq, %1, %8, %3
    SSE2_XSawp  qdq, %4, %2, %8
    SSE2_XSawp  qdq, %6, %7, %2
    movdqa      %7, %10             ; reload the value spilled in the dq pass
    movdqa      %10, %1             ; spill %1
    SSE2_XSawp  qdq, %7, %5, %1

    movdqa      %5, %10             ; final reload from scratch
%endmacro   ; end of TRANSPOSE_8x16B_SSE2

;-----------------------------------------------------------------------
; TRANSPOSE8x16_WRITE_SSE2  dst, dst_stride
;
; Writes sixteen 8-byte rows starting at [dst]:
;   rows 1-8  <- low  quadwords of xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0
;   rows 9-16 <- high quadwords of the same registers, in the same order
; Side effect: dst is advanced by 14*dst_stride.
;-----------------------------------------------------------------------
%macro TRANSPOSE8x16_WRITE_SSE2 2   ; dst, dst_stride
    movq    [%1],    xmm4           ; row 1
    movq    [%1+%2], xmm2           ; row 2
    lea     %1, [%1+2*%2]
    movq    [%1],    xmm3           ; row 3
    movq    [%1+%2], xmm7           ; row 4
    lea     %1, [%1+2*%2]
    movq    [%1],    xmm5           ; row 5
    movq    [%1+%2], xmm1           ; row 6
    lea     %1, [%1+2*%2]
    movq    [%1],    xmm6           ; row 7
    movq    [%1+%2], xmm0           ; row 8

    lea     %1, [%1+2*%2]
    movhpd  [%1],    xmm4           ; row 9
    movhpd  [%1+%2], xmm2           ; row 10
    lea     %1, [%1+2*%2]
    movhpd  [%1],    xmm3           ; row 11
    movhpd  [%1+%2], xmm7           ; row 12
    lea     %1, [%1+2*%2]
    movhpd  [%1],    xmm5           ; row 13
    movhpd  [%1+%2], xmm1           ; row 14
    lea     %1, [%1+2*%2]
    movhpd  [%1],    xmm6           ; row 15
    movhpd  [%1+%2], xmm0           ; row 16
%endmacro   ; end of TRANSPOSE8x16_WRITE_SSE2

;-----------------------------------------------------------------------
; TRANSPOSE8x16_WRITE_ALT_SSE2  dst, dst_stride, tmp
;
; Same stores as TRANSPOSE8x16_WRITE_SSE2, but dst is left untouched; the
; running row pointer is kept in tmp, which is overwritten.
;-----------------------------------------------------------------------
%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3   ; dst, dst_stride, reg32
    movq    [%1],    xmm4           ; row 1
    movq    [%1+%2], xmm2           ; row 2
    lea     %3, [%1+2*%2]
    movq    [%3],    xmm3           ; row 3
    movq    [%3+%2], xmm7           ; row 4
    lea     %3, [%3+2*%2]
    movq    [%3],    xmm5           ; row 5
    movq    [%3+%2], xmm1           ; row 6
    lea     %3, [%3+2*%2]
    movq    [%3],    xmm6           ; row 7
    movq    [%3+%2], xmm0           ; row 8

    lea     %3, [%3+2*%2]
    movhpd  [%3],    xmm4           ; row 9
    movhpd  [%3+%2], xmm2           ; row 10
    lea     %3, [%3+2*%2]
    movhpd  [%3],    xmm3           ; row 11
    movhpd  [%3+%2], xmm7           ; row 12
    lea     %3, [%3+2*%2]
    movhpd  [%3],    xmm5           ; row 13
    movhpd  [%3+%2], xmm1           ; row 14
    lea     %3, [%3+2*%2]
    movhpd  [%3],    xmm6           ; row 15
    movhpd  [%3+%2], xmm0           ; row 16
%endmacro   ; end of TRANSPOSE8x16_WRITE_ALT_SSE2


SECTION .text

;-----------------------------------------------------------------------
; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride,
;                                      void *src/*16x16*/, const int32_t src_stride );
;
; Register use below (argument loading is done by LOAD_4_PARA from
; asm_inc.asm):
;   r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride
;   r4 = number of bytes by which the stack was realigned
;   r5 = 3*src_stride, later 16*dst_stride
;   r7 = stack pointer; [r7] is a 16-byte aligned spill slot
;
; Source rows are read with movdqa, so every row address
; (src + k*src_stride) must be 16-byte aligned.
;-----------------------------------------------------------------------
WELS_EXTERN TransposeMatrixBlock16x16_sse2
    push    r4
    push    r5
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    ; Reserve 16 bytes of scratch and round the stack pointer down to a
    ; 16-byte boundary; r4 remembers the rounding amount for the epilogue.
    mov     r4, r7
    and     r4, 15
    sub     r7, 16
    sub     r7, r4

    lea     r5, [r3+r3*2]           ; r5 = 3 * src_stride

    ; top 8x16 block: source rows 0..6 into xmm0..xmm6, row 7 via [r2+r5]
    movdqa  xmm0, [r2]
    movdqa  xmm1, [r2+r3]
    movdqa  xmm2, [r2+r3*2]
    movdqa  xmm3, [r2+r5]
    lea     r2, [r2+r3*4]
    movdqa  xmm4, [r2]
    movdqa  xmm5, [r2+r3]
    movdqa  xmm6, [r2+r3*2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]

    TRANSPOSE8x16_WRITE_SSE2 r0, r1 ; leaves r0 = dst + 14*dst_stride

    ; bottom 8x16 block: source rows 8..14 into xmm0..xmm6, row 15 via [r2+r5]
    lea     r2, [r2+r3*4]
    movdqa  xmm0, [r2]
    movdqa  xmm1, [r2+r3]
    movdqa  xmm2, [r2+r3*2]
    movdqa  xmm3, [r2+r5]
    lea     r2, [r2+r3*4]
    movdqa  xmm4, [r2]
    movdqa  xmm5, [r2+r3]
    movdqa  xmm6, [r2+r3*2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]

    ; Rewind r0 from dst + 14*dst_stride back to the first row and step
    ; 8 bytes right: r0 = r0 - 16*dst_stride + 2*dst_stride + 8 = dst + 8.
    mov     r5, r1
    sal     r5, 4
    sub     r0, r5
    lea     r0, [r0+r1*2+8]
    TRANSPOSE8x16_WRITE_SSE2 r0, r1

    ; undo the stack adjustment made in the prologue
    add     r7, r4
    add     r7, 16
    POP_XMM
    LOAD_4_PARA_POP
    pop     r5
    pop     r4
    ret

;-----------------------------------------------------------------------
; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride,
;                                     void *src/*16xW16*/, const int32_t src_stride,
;                                     const int32_t num_blocks );
;
; Register use below (argument loading is done by LOAD_5_PARA from
; asm_inc.asm):
;   r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride, r4 = num_blocks
;   r5 = number of bytes by which the stack was realigned
;   r6 = scratch pointer
;   r7 = stack pointer; [r7] is a 16-byte aligned spill slot
;
; Each iteration consumes 16 source rows (r2 advances by 16*src_stride)
; and produces a 16-byte wide, 16-row destination tile (r0 advances by 16).
; Source rows are read with movdqa and must be 16-byte aligned.
;
; NOTE(review): the loop is bottom-tested (dec / jg), so the body runs
; once even when num_blocks <= 0 -- confirm callers never pass that.
;-----------------------------------------------------------------------
WELS_EXTERN TransposeMatrixBlocksx16_sse2
    push    r5
    push    r6
    %assign push_num 2
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d

    ; Reserve 16 bytes of scratch and round the stack pointer down to a
    ; 16-byte boundary; r5 remembers the rounding amount for the epilogue.
    mov     r5, r7
    and     r5, 15
    sub     r7, 16
    sub     r7, r5

.next_block_x16:
    ; Explicitly touch upcoming source data: dummy loads (results are
    ; discarded) from 16 rows starting 8 rows below r2. r4 is borrowed
    ; as the load target and preserved around the sequence.
    ; NOTE(review): on the last iteration this reads rows that lie past
    ; the final block -- confirm that memory is always readable.
    lea     r6, [r2+r3*8]
    push    r4
%rep 8
    mov     r4, [r6]
    mov     r4, [r6+r3]
    lea     r6, [r6+r3*2]
%endrep
    pop     r4

    ; top 8x16 block: rows 0..6 into xmm0..xmm6, row 7 via [r2+r3]
    movdqa  xmm0, [r2]
    movdqa  xmm1, [r2+r3]
    lea     r2, [r2+r3*2]
    movdqa  xmm2, [r2]
    movdqa  xmm3, [r2+r3]
    lea     r2, [r2+r3*2]
    movdqa  xmm4, [r2]
    movdqa  xmm5, [r2+r3]
    lea     r2, [r2+r3*2]
    movdqa  xmm6, [r2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
    TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
    lea     r2, [r2+r3*2]

    ; bottom 8x16 block: rows 8..14 into xmm0..xmm6, row 15 via [r2+r3]
    movdqa  xmm0, [r2]
    movdqa  xmm1, [r2+r3]
    lea     r2, [r2+r3*2]
    movdqa  xmm2, [r2]
    movdqa  xmm3, [r2+r3]
    lea     r2, [r2+r3*2]
    movdqa  xmm4, [r2]
    movdqa  xmm5, [r2+r3]
    lea     r2, [r2+r3*2]
    movdqa  xmm6, [r2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
    TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6   ; right-hand 8 columns of the tile
    lea     r2, [r2+r3*2]           ; r2 -> first row of the next source block

    lea     r0, [r0+16]             ; next destination tile
    dec     r4
    jg      near .next_block_x16

    ; undo the stack adjustment made in the prologue
    add     r7, r5
    add     r7, 16
    POP_XMM
    LOAD_5_PARA_POP
    pop     r6
    pop     r5
    ret

;-----------------------------------------------------------------------
; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride,
;                                   void *src/*8x8*/, const int32_t src_stride );
;
; Register use below (argument loading is done by LOAD_4_PARA from
; asm_inc.asm):
;   r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride
;   r7 = stack pointer; [r7] is an 8-byte spill slot
;-----------------------------------------------------------------------
WELS_EXTERN TransposeMatrixBlock8x8_mmx
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    sub     r7, 8                   ; 8 bytes of stack scratch for the transpose

    ; rows 0..6 into mm0..mm6, row 7 is read by the macro via [r2+r3]
    movq    mm0, [r2]
    movq    mm1, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm2, [r2]
    movq    mm3, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm4, [r2]
    movq    mm5, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm6, [r2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m0, m3, m5, m2, m7, m1, m6, m4
    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]

    TRANSPOSE8x8_WRITE_MMX r0, r1

    emms                            ; leave MMX state before returning
    add     r7, 8
    LOAD_4_PARA_POP
    ret

;-----------------------------------------------------------------------
; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride,
;                                   void *src/*W8x8*/, const int32_t src_stride,
;                                   const int32_t num_blocks );
;
; Register use below (argument loading is done by LOAD_5_PARA from
; asm_inc.asm):
;   r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride, r4 = num_blocks
;   r5 = look-ahead pointer, always 8 rows ahead of the block start
;   r6 = scratch (dummy-load target, then row pointer for the stores)
;   r7 = stack pointer; [r7] is an 8-byte spill slot
;
; Each iteration consumes 8 source rows (r2 advances by 8*src_stride) and
; produces an 8-byte wide, 8-row destination tile (r0 advances by 8).
;
; NOTE(review): the loop is bottom-tested (dec / jg), so the body runs
; once even when num_blocks <= 0 -- confirm callers never pass that.
;-----------------------------------------------------------------------
WELS_EXTERN TransposeMatrixBlocksx8_mmx
    push    r5
    push    r6
    %assign push_num 2
    LOAD_5_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d

    sub     r7, 8                   ; 8 bytes of stack scratch for the transpose

    lea     r5, [r2+r3*8]           ; r5 -> first row of the following block

.next_block_x8:
    ; Explicitly touch the following block's 8 rows with dummy loads
    ; (results are discarded).
    ; NOTE(review): on the last iteration this reads rows that lie past
    ; the final block -- confirm that memory is always readable.
%rep 4
    mov     r6, [r5]
    mov     r6, [r5+r3]
    lea     r5, [r5+r3*2]
%endrep

    ; rows 0..6 into mm0..mm6, row 7 is read by the macro via [r2+r3]
    movq    mm0, [r2]
    movq    mm1, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm2, [r2]
    movq    mm3, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm4, [r2]
    movq    mm5, [r2+r3]
    lea     r2, [r2+2*r3]
    movq    mm6, [r2]

    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m0, m3, m5, m2, m7, m1, m6, m4
    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]

    TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6

    lea     r0, [r0+8]              ; next destination tile
    lea     r2, [r2+2*r3]           ; r2 -> first row of the next source block
    dec     r4
    jg      near .next_block_x8

    emms                            ; leave MMX state before returning
    add     r7, 8
    LOAD_5_PARA_POP
    pop     r6
    pop     r5
    ret