ref: 04dba61d22ab43c1f428302f2abbaecca723fe30
parent: 3761901ed4785ce0c85557362c857f506b4072ce
author: Martin Storsjö <martin@martin.st>
date: Tue Jan 28 08:47:51 EST 2014
Remove an unused assembly source file. Nothing within processing uses functions from this file.
--- a/codec/processing/build/win32/WelsVP_2008.vcproj
+++ b/codec/processing/build/win32/WelsVP_2008.vcproj
@@ -594,46 +594,6 @@
</FileConfiguration>
</File>
<File
- RelativePath="..\..\src\asm\intra_pred.asm"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|x64"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|x64"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- </File>
- <File
RelativePath="..\..\..\common\satd_sad.asm"
>
<FileConfiguration
--- a/codec/processing/src/asm/intra_pred.asm
+++ /dev/null
@@ -1,1505 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* intra_pred.asm
-;*
-;* Abstract
-;* sse2 function for intra predict operations
-;*
-;* History
-;* 18/09/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Local Data (Read Only): constant tables shared by the predictors below
-;***********************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-align 16
-sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0 ; (x - 7) for columns 0..7 in the 16x16 plane loop
-align 16
-sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8 ; (x - 7) for columns 8..15; also the rising H/V gradient weights
-align 16
-sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1 ; falling H/V gradient weights of the 16x16 plane mode
-
-; for chroma plane mode (two 8-byte tables, loaded with movq)
-sse2_plane_inc_c dw 1, 2, 3, 4 ; rising gradient weights
-sse2_plane_dec_c dw 4, 3, 2, 1 ; falling gradient weights
-align 16
-sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4 ; (x - 3) for the eight chroma columns
-
-align 16
-mmx_01bytes: times 16 db 1 ; 0x01 in every byte: byte-replication multiplier and low-bit mask
-;align 16
-;sse_0x0004bytes: times 8 dw 4
-;ALIGN 16
-;sse_f000 db 255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-
-align 16
-mmx_0x02: dw 0x02, 0x00, 0x00, 0x00 ; 64-bit constant 2, the rounding term used by WelsIChromaPredDc_sse2
-
-
-;***********************************************************************
-; macros
-;***********************************************************************
-; SSE_DB_1_2REG dst, tmp: set every byte of dst to 0x01 without a memory load
-; %1 keeps the result; %2 is scratch and is left with all bits set
-%macro SSE_DB_1_2REG 2
- pxor %1, %1 ; %1 = 0
- pcmpeqw %2, %2 ; %2 = 0xFF in every byte (-1)
- psubb %1, %2 ; 0 - (-1) = 1 in every byte
-%endmacro
-
-; SSE2_PRED_H_4X4_TWO_LINE dst, tmp1, tmp2, ptr, offset (originally xmm0, xmm1, xmm2, eax, ecx)
-; low 64 bits of %1 = byte [%4-1] repeated 4 times followed by byte [%4+%5-1] repeated 4 times
-%macro SSE2_PRED_H_4X4_TWO_LINE 5
- movd %1, [%4-1] ; byte 0 = the byte just before %4
- movdqa %3, %1
- punpcklbw %1, %3 ; every byte doubled
- movdqa %3, %1
- punpcklbw %1, %3 ; every byte x4: low dword = first byte x4
-
- ;add %4, %5
- movd %2, [%4+%5-1] ; same expansion for the byte just before %4+%5
- movdqa %3, %2
- punpcklbw %2, %3
- movdqa %3, %2
- punpcklbw %2, %3
- punpckldq %1, %2 ; low qword = [first byte x4 | second byte x4]
-%endmacro
-
-%macro SUMW_HORIZON1 2 ; %1 = eight words in, their sum ends in word 0; %2 = scratch
- movdqa %2, %1
- psrldq %2, 8 ; bring words 4..7 down to 0..3
- paddusw %1, %2 ; unsigned saturating add: four partial sums
- movdqa %2, %1
- psrldq %2, 4 ; bring words 2..3 down to 0..1
- paddusw %1, %2 ; two partial sums
- movdqa %2, %1
- psrldq %2, 2 ; bring word 1 down to word 0
- paddusw %1, %2 ; word 0 = saturated sum of all eight words
-%endmacro
-
-%macro LOAD_COLUMN 6 ; %1 = dst, %2-%4 = scratch, %5 = pointer (advanced by 8*%6), %6 = row pitch
- movd %1, [%5] ; 4 bytes of row 0
- movd %2, [%5+%6] ; 4 bytes of row 1
- punpcklbw %1, %2 ; interleave rows 0/1 byte by byte
- lea %5, [%5+2*%6]
- movd %3, [%5] ; row 2
- movd %2, [%5+%6] ; row 3
- punpcklbw %3, %2 ; interleave rows 2/3
- punpcklwd %1, %3 ; dword k of %1 = byte k of rows 0..3
- lea %5, [%5+2*%6]
- movd %4, [%5] ; row 4
- movd %2, [%5+%6] ; row 5
- punpcklbw %4, %2 ; interleave rows 4/5
- lea %5, [%5+2*%6]
- movd %3, [%5] ; row 6
- movd %2, [%5+%6] ; row 7
- lea %5, [%5+2*%6] ; %5 now points 8 rows past its starting value
- punpcklbw %3, %2 ; interleave rows 6/7
- punpcklwd %4, %3 ; dword k of %4 = byte k of rows 4..7
- punpckhdq %1, %4 ; high qword of %1 = byte 3 of rows 0..7, low qword = byte 2 of rows 0..7
-%endmacro
-
-%macro SUMW_HORIZON 3 ; sum the 8 words of %1; only the low 16 bits of the final dword are exact; %2 = scratch
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04, each in the low half of a dword (words of %3 fill the high halves)
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
-%endmacro
-
-
-%macro COPY_16_TIMES 2 ; %2 = byte [%1-1] repeated 16 times; %1-16 must be 16-byte aligned (movdqa)
- movdqa %2, [%1-16] ; the 16 bytes that end just before %1
- psrldq %2, 15 ; keep only the last of them, in byte 0
- pmuludq %2, [mmx_01bytes] ; x 0x01010101: low dword = that byte x4
- pshufd %2, %2, 0 ; broadcast the low dword to all four dwords
-%endmacro
-
-%macro COPY_16_TIMESS 3 ; as COPY_16_TIMES, but for byte [%1+%3-1]; %1+%3-16 must be 16-byte aligned
- movdqa %2, [%1+%3-16] ; the 16 bytes that end just before %1+%3
- psrldq %2, 15 ; keep only the last of them, in byte 0
- pmuludq %2, [mmx_01bytes] ; x 0x01010101: low dword = that byte x4
- pshufd %2, %2, 0 ; broadcast the low dword to all four dwords
-%endmacro
-
-%macro LOAD_COLUMN_C 6 ; 4-row variant of LOAD_COLUMN: %1 = dst, %2/%3 = scratch, %4 unused, %5 = pointer (+4*%6), %6 = row pitch
- movd %1, [%5] ; 4 bytes of row 0
- movd %2, [%5+%6] ; 4 bytes of row 1
- punpcklbw %1,%2 ; interleave rows 0/1 byte by byte
- lea %5, [%5+2*%6]
- movd %3, [%5] ; row 2
- movd %2, [%5+%6] ; row 3
- punpcklbw %3, %2 ; interleave rows 2/3
- punpckhwd %1, %3 ; with 64-bit registers: high dword = byte 3 of rows 0..3, low dword = byte 2
- lea %5, [%5+2*%6] ; %5 now points 4 rows past its starting value
-%endmacro
-
-%macro LOAD_2_LEFT_AND_ADD 0 ; step r1 down two rows and add both left-neighbour bytes to r3 (clobbers r4)
- lea r1, [r1+2*r2] ; r1 += 2 * r2
- movzx r4, byte [r1-0x01] ; byte just left of r1
- add r3, r4
- movzx r4, byte [r1+r2-0x01] ; byte just left of r1 on the following row
- add r3, r4
-%endmacro
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-WELS_EXTERN WelsI4x4LumaPredH_sse2
-WELS_EXTERN WelsI4x4LumaPredDDR_mmx
-WELS_EXTERN WelsI4x4LumaPredDc_sse2
-WELS_EXTERN WelsI16x16LumaPredPlane_sse2
-
-ALIGN 16
-;***********************************************************************
-; void __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-; Horizontal 4x4 prediction: each of the 4 output rows is its left neighbour pRef[row*stride-1] x4.
-; pred receives 16 contiguous bytes and must align to 16 (stored with movdqa)
-;***********************************************************************
-WelsI4x4LumaPredH_sse2:
- push r3 ; r3 is used as scratch below
- %assign push_num 1
- LOAD_3_PARA ; from asm_inc.asm; r0/r1/r2 are used below as pred/pRef/stride
- %ifndef X86_32
- movsx r2, r2d ; widen the 32-bit stride for 64-bit addressing
- %endif
- movzx r3, byte [r1-1] ; left neighbour of row 0
- movd xmm0, r3d
- pmuludq xmm0, [mmx_01bytes] ; x 0x01010101: low dword = pixel x4
-
- movzx r3, byte [r1+r2-1] ; left neighbour of row 1
- movd xmm1, r3d
- pmuludq xmm1, [mmx_01bytes]
-
- unpcklps xmm0, xmm1 ; low qword of xmm0 = [row0 | row1]
-
- lea r1, [r1+r2*2]
- movzx r3, byte [r1-1] ; left neighbour of row 2
- movd xmm2, r3d
- pmuludq xmm2, [mmx_01bytes]
-
- movzx r3, byte [r1+r2-1] ; left neighbour of row 3
- movd xmm3, r3d
- pmuludq xmm3, [mmx_01bytes]
-
- unpcklps xmm2, xmm3 ; low qword of xmm2 = [row2 | row3]
- unpcklpd xmm0, xmm2 ; xmm0 = [row0 | row1 | row2 | row3]
-
- movdqa [r0], xmm0 ; write all four rows at once
- pop r3
- ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WelsI16x16LumaPredPlane_sse2:
- ; 16x16 plane prediction: pred[y][x] = clip8((a + 16 + b*(x-7) + c*(y-7)) >> 5),
- ; written as 16 rows of 16 bytes (movdqa, so pred must be 16-byte aligned).
- ; r0/r1/r2 are used as pred/pRef/stride after LOAD_3_PARA (defined in asm_inc.asm).
- ; NOTE(review): xmm6/xmm7 are overwritten and no save is visible here - confirm for Win64 builds.
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d ; widen the 32-bit stride for 64-bit addressing
- %endif
- sub r1, 1
- sub r1, r2 ; r1 -> top-left neighbour (row -1, column -1)
-
- ;for H
- pxor xmm7, xmm7
- movq xmm0, [r1] ; top-left and top[0..6]
- movdqa xmm5, [sse2_plane_dec]
- punpcklbw xmm0, xmm7 ; bytes -> words
- pmullw xmm0, xmm5 ; weights 8..1
- movq xmm1, [r1 + 9] ; top[8..15]
- movdqa xmm6, [sse2_plane_inc]
- punpcklbw xmm1, xmm7
- pmullw xmm1, xmm6 ; weights 1..8
- psubw xmm1, xmm0
-
- SUMW_HORIZON xmm1,xmm0,xmm2 ; xmm2 is not cleared; only the low word is read below
- movd r3d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
- movsx r3, r3w ; keep the signed 16-bit sum
- imul r3, 5
- add r3, 32
- sar r3, 6 ; b = (5 * H + 32) >> 6;
- SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
-
- movzx r4, BYTE [r1+16] ; r4 = top[15]
- sub r1, 3 ; so that byte 3 of each 4-byte load is column -1
- LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2 ; high qword of xmm0 = left column, rows -1..6
-
- add r1, 3 ; r1 -> column -1 of row 7
- movzx r3, BYTE [r1+8*r2] ; r3 = left[15]
- add r4, r3
- shl r4, 4 ; a = (left[15] + top[15]) << 4;
-
- sub r1, 3
- add r1, r2 ; start at row 8
- LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2 ; high qword of xmm7 = left column, rows 8..15
- pxor xmm4, xmm4
- punpckhbw xmm0, xmm4 ; bytes -> words
- pmullw xmm0, xmm5 ; weights 8..1
- punpckhbw xmm7, xmm4
- pmullw xmm7, xmm6 ; weights 1..8
- psubw xmm7, xmm0
-
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r3d, xmm7 ; V
- movsx r3, r3w
- imul r3, 5
- add r3, 32
- sar r3, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
-
- ; row 0 starts from s = a + 16 - 7*c; c is added once per row in the loop
- add r4, 16
- imul r3, -7
- add r3, r4 ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
-
- xor r3, r3 ; row counter
- movdqa xmm5, [sse2_plane_inc_minus]
-
-get_i16x16_luma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5 ; b * (x - 7), columns 0..7
- paddw xmm2, xmm0
- psraw xmm2, 5
- movdqa xmm3, xmm1
- pmullw xmm3, xmm6 ; b * (x - 7), columns 8..15
- paddw xmm3, xmm0
- psraw xmm3, 5
- packuswb xmm2, xmm3 ; saturate to 0..255 and join both halves
- movdqa [r0], xmm2 ; one 16-byte output row
- paddw xmm0, xmm4 ; s += c for the next row
- add r0, 16
- inc r3
- cmp r3, 16
- jnz get_i16x16_luma_pred_plane_sse2_1
- pop r4
- pop r3
- ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_ONE_LINE 0 ; next output row: r0 += 16, r1 += r2, store byte [r1] through SSE2_Copy16Times
- add r0, 16
- add r1, r2
- movzx r3, byte [r1] ; left neighbour of this row (r1 walks column -1)
- SSE2_Copy16Times xmm0, r3d ; from asm_inc.asm; presumably replicates the byte 16 times
- movdqa [r0], xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
-WelsI16x16LumaPredH_sse2:
- push r3 ; r3 is used as scratch below
- %assign push_num 1
- LOAD_3_PARA ; from asm_inc.asm; r0/r1/r2 are used below as pred/pRef/stride
- %ifndef X86_32
- movsx r2, r2d ; widen the 32-bit stride for 64-bit addressing
- %endif
- dec r1 ; r1 -> left neighbour of row 0
- movzx r3, byte [r1]
- SSE2_Copy16Times xmm0, r3d
- movdqa [r0], xmm0 ; row 0 (pred must be 16-byte aligned for movdqa)
- SSE2_PRED_H_16X16_ONE_LINE ; rows 1..15 follow
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- SSE2_PRED_H_16X16_ONE_LINE
- pop r3
- ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
-WelsI16x16LumaPredV_sse2:
- ; Vertical 16x16 prediction: the 16 bytes directly above pRef are copied
- ; into each of the 16 rows of pred (16 bytes per row, 256 bytes in total).
- ; Both accesses use movdqa, so pRef-stride and pred must be 16-byte aligned.
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d ; widen the 32-bit stride for 64-bit addressing
- %endif
- sub r1, r2 ; r1 -> row above the block
- movdqa xmm0, [r1] ; the 16 top neighbours
-
- movdqa [r0], xmm0 ; row 0
- movdqa [r0+10h], xmm0
- movdqa [r0+20h], xmm0
- movdqa [r0+30h], xmm0
- movdqa [r0+40h], xmm0
- movdqa [r0+50h], xmm0
- movdqa [r0+60h], xmm0
- movdqa [r0+70h], xmm0
- movdqa [r0+80h], xmm0
- movdqa [r0+90h], xmm0
- movdqa [r0+160], xmm0 ; offsets continue in decimal: 160 = 0A0h
- movdqa [r0+176], xmm0
- movdqa [r0+192], xmm0
- movdqa [r0+208], xmm0
- movdqa [r0+224], xmm0
- movdqa [r0+240], xmm0 ; row 15
-
- ret
-
-;***********************************************************************
-; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredPlane_sse2
-WelsIChromaPredPlane_sse2:
- ; 8x8 chroma plane prediction: pred[y][x] = clip8((a + 16 + b*(x-3) + c*(y-3)) >> 5),
- ; written as 8 rows of 8 bytes. Uses both MMX and SSE2 registers.
- ; r0/r1/r2 are used as pred/pRef/stride after LOAD_3_PARA (defined in asm_inc.asm).
- ; NOTE(review): xmm7 is overwritten and no save is visible here - confirm for Win64 builds.
- push r3
- push r4
- %assign push_num 2
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d ; widen the 32-bit stride for 64-bit addressing
- %endif
- sub r1, 1
- sub r1, r2 ; r1 -> top-left neighbour (row -1, column -1)
-
- pxor mm7, mm7
- movq mm0, [r1] ; top-left and top[0..6]
- movq mm5, [sse2_plane_dec_c]
- punpcklbw mm0, mm7 ; low 4 bytes -> words: top-left, top[0..2]
- pmullw mm0, mm5 ; weights 4..1
- movq mm1, [r1 + 5] ; top[4..]
- movq mm6, [sse2_plane_inc_c]
- punpcklbw mm1, mm7 ; top[4..7] as words
- pmullw mm1, mm6 ; weights 1..4
- psubw mm1, mm0
-
- movq2dq xmm1, mm1
- pxor xmm2, xmm2
- SUMW_HORIZON xmm1,xmm0,xmm2
- movd r3d, xmm1 ; H
- movsx r3, r3w ; keep the signed 16-bit sum
- imul r3, 17
- add r3, 16
- sar r3, 5 ; b = (17 * H + 16) >> 5;
- SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
-
- movzx r3, BYTE [r1+8] ; r3 = top[7]
- sub r1, 3 ; so that byte 3 of each 4-byte load is column -1
- LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2 ; high dword of mm0 = left column, rows -1..2
-
- add r1, 3 ; r1 -> column -1 of row 3
- movzx r4, BYTE [r1+4*r2] ; r4 = left[7]
- add r4, r3
- shl r4, 4 ; a = (left[7] + top[7]) << 4;
-
- sub r1, 3
- add r1, r2 ; start at row 4
- LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2 ; high dword of mm7 = left column, rows 4..7
- pxor mm4, mm4
- punpckhbw mm0, mm4 ; bytes -> words
- pmullw mm0, mm5 ; weights 4..1
- punpckhbw mm7, mm4
- pmullw mm7, mm6 ; weights 1..4
- psubw mm7, mm0
-
- movq2dq xmm7, mm7
- pxor xmm2, xmm2
- SUMW_HORIZON xmm7,xmm0,xmm2
- movd r3d, xmm7 ; V
- movsx r3, r3w
- imul r3, 17
- add r3, 16
- sar r3, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
-
- ; row 0 starts from s = a + 16 - 3*c; c is added once per row in the loop
- add r4, 16
- imul r3, -3
- add r3, r4 ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
-
- xor r3, r3 ; row counter
- movdqa xmm5, [sse2_plane_mul_b_c]
-
-get_i_chroma_pred_plane_sse2_1:
- movdqa xmm2, xmm1
- pmullw xmm2, xmm5 ; b * (x - 3)
- paddw xmm2, xmm0
- psraw xmm2, 5
- packuswb xmm2, xmm2 ; saturate to 0..255
- movq [r0], xmm2 ; one 8-byte output row
- paddw xmm0, xmm4 ; s += c for the next row
- add r0, 8
- inc r3
- cmp r3, 8
- jnz get_i_chroma_pred_plane_sse2_1
- pop r4
- pop r3
- WELSEMMS ; from asm_inc.asm; presumably leaves MMX state (emms)
- ret
-
-ALIGN 16
-;***********************************************************************
-; 0 |1 |2 |3 |4 |
-; 6 |7 |8 |9 |10|
-; 11|12|13|14|15|
-; 16|17|18|19|20|
-; 21|22|23|24|25|
-; 7 is the start pixel of current 4x4 block
-; pred[7] = ([6]+[0]*2+[1]+2)/4
-;
-; void __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-; In the comments below mmN[k] is the k-th byte of mmN counted from 1 (lowest byte) to 8.
-;***********************************************************************
-WelsI4x4LumaPredDDR_mmx:
- ; Diagonal-down-right 4x4 prediction: the neighbours [21],[16],[11],[6],[0],[1],[2],[3],[4]
- ; are lined up in one register, filtered with (a + 2*b + c + 2) >> 2, and the four
- ; output rows are overlapping 4-byte windows of the result. pred gets 16 contiguous bytes.
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- movq mm1,[r1+r2-8] ;load the 8 bytes ending at [11] (left of row 1): mm1[8] = [11]
- movq mm2,[r1-8] ;get value of 6 mm2[8] = 6
- sub r1, r2 ;move r1 to the line above the current block (position of 1)
- punpckhbw mm2,[r1-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
- movd mm3,[r1] ;mm3[1]=[1],mm3[2]=[2],mm3[3]=[3],mm3[4]=[4]
- punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
- psllq mm3,18h ;mm3[4]=[1],mm3[5]=[2],mm3[6]=[3],mm3[7]=[4]
- psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- por mm3,mm1 ;mm3[7]=[4],mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
- movq mm1,mm3 ;mm1 = copy: the "next" neighbour of every position
- lea r1,[r1+r2*2-8h] ;set r1 to 8 bytes before 12
- movq mm4,[r1+r2] ;get value of 16, mm4[8]=[16]
- psllq mm3,8 ;mm3[8]=[4],mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[16]
- por mm3,mm4 ;mm3[8]=[4],mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
- movq mm2,mm3 ;mm2 = copy: the centre sample of every position
- movq mm4,[r1+r2*2] ;mm4[8]=[21]
- psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
- psrlq mm4,38h ;mm4[1]=[21]
- por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
- movq mm4,mm3 ;mm4 = copy: the "previous" neighbour of every position
- pavgb mm3,mm1 ;per byte (prev+next+1)/2, e.g. mm3[1]=([21]+[11]+1)/2
- pxor mm1,mm4 ;find odd value in the lowest bit of each byte
- pand mm1,[mmx_01bytes] ;set the odd bit
- psubusb mm3,mm1 ;decrease 1 from odd bytes: mm3 = (prev+next)/2 rounded down
- pavgb mm2,mm3 ;per byte (prev + 2*centre + next + 2)/4, e.g. mm2[1] from [21],[16],[11]
-
- movd [r0+12],mm2 ;row 3 = bytes 1..4
- psrlq mm2,8
- movd [r0+8],mm2 ;row 2 = bytes 2..5
- psrlq mm2,8
- movd [r0+4],mm2 ;row 1 = bytes 3..6
- psrlq mm2,8
- movd [r0],mm2 ;row 0 = bytes 4..7
- WELSEMMS
- ret
-
-ALIGN 16
-;***********************************************************************
-; 0 |1 |2 |3 |4 |
-; 5 |6 |7 |8 |9 |
-; 10|11|12|13|14|
-; 15|16|17|18|19|
-; 20|21|22|23|24|
-; 6 is the start pixel of current 4x4 block
-; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
-; The same value fills all 16 bytes of pred (stored with movdqa, so pred must be 16-byte aligned).
-; void __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;
-;***********************************************************************
-WelsI4x4LumaPredDc_sse2:
- push r3 ; r3 = running sum, r4 = scratch
- push r4
- %assign push_num 2
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- movzx r4, byte [r1-1h] ; [5]: left neighbour of row 0
- sub r1, r2 ; r1 -> row above the block
- movd xmm0, [r1] ; [1],[2],[3],[4]
- pxor xmm1, xmm1
- psadbw xmm0, xmm1 ; sum of the four top bytes
- xor r3, r3
- movd r3d, xmm0
- add r3, r4
- movzx r4, byte [r1+r2*2-1h] ; [10]: left neighbour of row 1
- add r3, r4
-
- lea r1, [r1+r2*2-1] ; r1 -> [10]
- movzx r4, byte [r1+r2] ; [15]
- add r3, r4
-
- movzx r4, byte [r1+r2*2] ; [20]
- add r3, r4
- add r3, 4 ; rounding
- sar r3, 3 ; average of the 8 neighbours
- imul r3, 0x01010101 ; replicate the byte across a dword
-
- movd xmm0, r3d
- pshufd xmm0, xmm0, 0 ; ... and across all 16 bytes
- movdqa [r0], xmm0
- pop r4
- pop r3
- ret
-
-ALIGN 16
-;***********************************************************************
-; void __cdecl WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
-; copy 8 pixel of 8 line from left
-;***********************************************************************
-%macro MMX_PRED_H_8X8_ONE_LINE 4 ; [%4] = byte [%3-1] x8; %1 = work register, %2 unused
- movq %1, [%3-8] ; the 8 bytes ending at the left neighbour
- psrlq %1, 38h ; keep the left neighbour in byte 0
-
- ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
- pmullw %1, [mmx_01bytes] ; x 0x0101: low word = pixel x2
- pshufw %1, %1, 0 ; broadcast the low word -> pixel x8
- movq [%4], %1
-%endmacro
-
-%macro MMX_PRED_H_8X8_ONE_LINEE 4 ; same as above for the row at %3+r2: [%4] = byte [%3+r2-1] x8
- movq %1, [%3+r2-8]
- psrlq %1, 38h
-
- ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
- pmullw %1, [mmx_01bytes]
- pshufw %1, %1, 0
- movq [%4], %1
-%endmacro
-
-WELS_EXTERN WelsIChromaPredH_mmx
-WelsIChromaPredH_mmx:
- ; Horizontal 8x8 chroma prediction: output row k (8 bytes at pred+8*k)
- ; is the left neighbour of source row k repeated 8 times.
- ; r0/r1/r2 are used as pred/pRef/stride after LOAD_3_PARA (defined in asm_inc.asm).
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- movq mm0, [r1-8] ; row 0, written out inline (same steps as the macros)
- psrlq mm0, 38h
-
- ;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
- pmullw mm0, [mmx_01bytes]
- pshufw mm0, mm0, 0
- movq [r0], mm0
-
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+8 ; row 1
-
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16 ; row 2
-
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+24 ; row 3
-
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32 ; row 4
-
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+40 ; row 5
-
- lea r1,[r1+r2*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48 ; row 6
-
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56 ; row 7
- WELSEMMS
- ret
-
-ALIGN 16
-;***********************************************************************
-; void __cdecl WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-; copy the 4 pixels above the block into each of the 4 rows (16 bytes, pred 16-byte aligned)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredV_sse2
-WelsI4x4LumaPredV_sse2:
- %assign push_num 0
- LOAD_3_PARA ; from asm_inc.asm; r0/r1/r2 are used below as pred/pRef/stride
- %ifndef X86_32
- movsx r2, r2d ; widen the 32-bit stride for 64-bit addressing
- %endif
- sub r1, r2 ; r1 -> row above the block
- movd xmm0, [r1] ; the 4 top neighbours
- pshufd xmm0, xmm0, 0 ; repeat that dword four times
- movdqa [r0], xmm0
- ret
-
-ALIGN 16
-;***********************************************************************
-; void __cdecl WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-; copy the 8 pixels above the block into each of the 8 rows (64 bytes, pred 16-byte aligned)
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredV_sse2
-WelsIChromaPredV_sse2:
- %assign push_num 0
- LOAD_3_PARA ; from asm_inc.asm; r0/r1/r2 are used below as pred/pRef/stride
- %ifndef X86_32
- movsx r2, r2d ; widen the 32-bit stride for 64-bit addressing
- %endif
- sub r1, r2 ; r1 -> row above the block
- movq xmm0, [r1] ; the 8 top neighbours
- movdqa xmm1, xmm0
- punpcklqdq xmm0, xmm1 ; xmm0 = two copies of the top row
- movdqa [r0], xmm0 ; rows 0-1
- movdqa [r0+16], xmm0 ; rows 2-3
- movdqa [r0+32], xmm0 ; rows 4-5
- movdqa [r0+48], xmm0 ; rows 6-7
- ret
-
- ALIGN 16
-;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; t3 is never used
-; destination:
-; |a |b |c |d |
-; |e |f |a |b |
-; |g |h |e |f |
-; |i |j |g |h |
-
-; a = (1 + lt + l0)>>1
-; e = (1 + l0 + l1)>>1
-; g = (1 + l1 + l2)>>1
-; i = (1 + l2 + l3)>>1
-
-; d = (2 + t0 + (t1<<1) + t2)>>2
-; c = (2 + lt + (t0<<1) + t1)>>2
-; b = (2 + l0 + (lt<<1) + t0)>>2
-
-; f = (2 + l1 + (l0<<1) + lt)>>2
-; h = (2 + l2 + (l1<<1) + l0)>>2
-; j = (2 + l3 + (l2<<1) + l1)>>2
-; [b a f e h g j i] + [d c b a] --> mov to memory
-; Register pictures below list bytes from most significant (left) to least significant (right).
-; void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- sub r1, r2 ; r1 -> row above the block
- movd mm0, [r1-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
- psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-
- movd mm1, [r1+2*r2-4]
- punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
- lea r1, [r1+2*r2]
- movd mm2, [r1+2*r2-4]
- punpcklbw mm2, [r1+r2-4] ; mm2[7] = l2, mm2[6] = l3
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
- psrlq mm2, 20h
- pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-
- movq mm1, mm0
- psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
- movq mm2, mm0
- psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
- movq mm3, mm2
- movq mm4, mm1
- pavgb mm1, mm0 ; rounded-up average of the two outer taps
-
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm4 ; decrease 1 from odd bytes
-
- pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
-
- movq mm4, mm0
- pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
- punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
-
- psrlq mm2, 20h
- psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
- movq mm4, mm3
- psrlq mm4, 10h ; mm4 = [0 0 b a f e h g]
- pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
- psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
-
- movd [r0], mm2 ; row 0 = a b c d
- movd [r0+12], mm3 ; row 3 = i j g h
- psrlq mm3, 10h
- movd [r0+8], mm3 ; row 2 = g h e f
- psrlq mm3, 10h
- movd [r0+4], mm3 ; row 1 = e f a b
- WELSEMMS
- ret
-
-ALIGN 16
-;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; lt and t0..t3 are never used: only the left column l0..l3 is read
-; destination:
-; |a |b |c |d |
-; |c |d |e |f |
-; |e |f |g |g |
-; |g |g |g |g |
-
-; a = (1 + l0 + l1)>>1
-; c = (1 + l1 + l2)>>1
-; e = (1 + l2 + l3)>>1
-; g = l3
-
-; b = (2 + l0 + (l1<<1) + l2)>>2
-; d = (2 + l1 + (l2<<1) + l3)>>2
-; f = (2 + l2 + (l3<<1) + l3)>>2
-
-; [g g f e d c b a] + [g g g g] --> mov to memory
-; Register pictures below list bytes from most significant (left) to least significant (right).
-; void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- movd mm0, [r1-4] ; mm0[3] = l0
- punpcklbw mm0, [r1+r2-4] ; mm0[7] = l1, mm0[6] = l0
- lea r1, [r1+2*r2]
- movd mm2, [r1-4] ; mm2[3] = l2
- movd mm4, [r1+r2-4] ; mm4[3] = l3
- punpcklbw mm2, mm4
- punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-
- psrlq mm4, 18h
- psllq mm4, 38h ; mm4 = [l3 0 0 0 0 0 0 0]
- psrlq mm0, 8h
- pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
- movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
- pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
-
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
- movq mm5, mm2
- pavgb mm2, mm0 ; rounded-up average of the two outer taps
-
- pxor mm5, mm0 ; find odd value in the lowest bit of each byte
- pand mm5, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm5 ; decrease 1 from odd bytes
-
- pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
-
- psrlq mm2, 8h
- pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
-
- punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
- punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
- punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
-
- psrlq mm4, 20h
- movd [r0+12], mm4 ; row 3 = g g g g
-
- movd [r0], mm1 ; row 0 = a b c d
- psrlq mm1, 10h
- movd [r0+4], mm1 ; row 1 = c d e f
- psrlq mm1, 10h
- movd [r0+8], mm1 ; row 2 = e f g g
- WELSEMMS
- ret
-
-
-
-ALIGN 16
-;***********************************************************************
-; lt|t0|t1|t2|t3|
-; l0|
-; l1|
-; l2|
-; l3|
-; l3 is never used
-; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |i |a |b |c |
-; |j |e |f |g |
-
-; a = (1 + lt + t0)>>1
-; b = (1 + t0 + t1)>>1
-; c = (1 + t1 + t2)>>1
-; d = (1 + t2 + t3)>>1
-
-; e = (2 + l0 + (lt<<1) + t0)>>2
-; f = (2 + lt + (t0<<1) + t1)>>2
-; g = (2 + t0 + (t1<<1) + t2)>>2
-
-; h = (2 + t1 + (t2<<1) + t3)>>2
-; i = (2 + lt + (l0<<1) + l1)>>2
-; j = (2 + l0 + (l1<<1) + l2)>>2
-; Register pictures below list bytes from most significant (left) to least significant (right).
-; void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- sub r1, r2 ; r1 -> row above the block
- movq mm0, [r1-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
- psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-
- movd mm1, [r1+2*r2-4]
- punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
- lea r1, [r1+2*r2]
- movq mm2, [r1+r2-8] ; mm2[7] = l2
- punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
- psrlq mm2, 28h
- pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-
- movq mm1, mm0
- psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
-
- movq mm2, mm0
- psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
- movq mm3, mm2
- pavgb mm2, mm0 ; rounded-up average of the two outer taps
-
- pxor mm3, mm0 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm3 ; decrease 1 from odd bytes
-
- movq mm3, mm0
- psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
- pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
- movq mm2, mm3
-
- psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
- movd [r0], mm1 ; row 0 = a b c d
-
- psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
- movd [r0+4], mm2 ; row 1 = e f g h
-
- movq mm4, mm3
- psllq mm4, 20h
- psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
-
- movq mm5, mm3
- psllq mm5, 28h
- psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
-
- psllq mm1, 8h
- pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
- movd [r0+8], mm4 ; row 2 = i a b c
-
- psllq mm2, 8h
- pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
- movd [r0+12], mm5 ; row 3 = j e f g
- WELSEMMS
- ret
-
-ALIGN 16
-;***********************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt and l0..l3 are never used: only the top row t0..t7 is read
-; destination:
-; |a |b |c |d |
-; |b |c |d |e |
-; |c |d |e |f |
-; |d |e |f |g |
-
-; a = (2 + t0 + t2 + (t1<<1))>>2
-; b = (2 + t1 + t3 + (t2<<1))>>2
-; c = (2 + t2 + t4 + (t3<<1))>>2
-; d = (2 + t3 + t5 + (t4<<1))>>2
-
-; e = (2 + t4 + t6 + (t5<<1))>>2
-; f = (2 + t5 + t7 + (t6<<1))>>2
-; g = (2 + t6 + t7 + (t7<<1))>>2
-
-; [g f e d c b a] --> mov to memory
-; Register pictures below list bytes from most significant (left) to least significant (right).
-; void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- sub r1, r2 ; r1 -> row above the block
- movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
-
- movq mm3, mm0
- psrlq mm3, 38h
- psllq mm3, 38h ; mm3 = [t7 0 0 0 0 0 0 0]
-
- psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
- psrlq mm2, 8h
- pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
-
- movq mm3, mm1
- pavgb mm1, mm2 ; rounded-up average of the two outer taps
- pxor mm3, mm2 ; find odd value in the lowest bit of each byte
- pand mm3, [mmx_01bytes] ; set the odd bit
- psubusb mm1, mm3 ; decrease 1 from odd bytes
-
- pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
-
- psrlq mm0, 8h
- movd [r0], mm0 ; row 0 = a b c d
- psrlq mm0, 8h
- movd [r0+4], mm0 ; row 1 = b c d e
- psrlq mm0, 8h
- movd [r0+8], mm0 ; row 2 = c d e f
- psrlq mm0, 8h
- movd [r0+12], mm0 ; row 3 = d e f g
- WELSEMMS
- ret
-
-
-ALIGN 16
-;***********************************************************************
-; lt|t0|t1|t2|t3|t4|t5|t6|t7
-; l0|
-; l1|
-; l2|
-; l3|
-; lt and l0..l3 are never used: only the top row is read, and the formulas need t0..t6
-; destination:
-; |a |b |c |d |
-; |e |f |g |h |
-; |b |c |d |i |
-; |f |g |h |j |
-
-; a = (1 + t0 + t1)>>1
-; b = (1 + t1 + t2)>>1
-; c = (1 + t2 + t3)>>1
-; d = (1 + t3 + t4)>>1
-; i = (1 + t4 + t5)>>1
-
-; e = (2 + t0 + (t1<<1) + t2)>>2
-; f = (2 + t1 + (t2<<1) + t3)>>2
-; g = (2 + t2 + (t3<<1) + t4)>>2
-; h = (2 + t3 + (t4<<1) + t5)>>2
-; j = (2 + t4 + (t5<<1) + t6)>>2
-
-; [i d c b a] + [j h g f e] --> mov to memory
-; Register pictures below list bytes from most significant (left) to least significant (right).
-; void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:
- %assign push_num 0
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- sub r1, r2 ; r1 -> row above the block
- movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
- movq mm1, mm0
- movq mm2, mm0
-
- psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
- psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
-
- movq mm3, mm1
- pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
-
- movq mm4, mm2
- pavgb mm2, mm0 ; rounded-up average of the two outer taps
- pxor mm4, mm0 ; find odd value in the lowest bit of each byte
- pand mm4, [mmx_01bytes] ; set the odd bit
- psubusb mm2, mm4 ; decrease 1 from odd bytes
-
- pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
-
- movd [r0], mm3 ; row 0 = a b c d
- psrlq mm3, 8h
- movd [r0+8], mm3 ; row 2 = b c d i
-
- movd [r0+4], mm2 ; row 1 = e f g h
- psrlq mm2, 8h
- movd [r0+12], mm2 ; row 3 = f g h j
- WELSEMMS
- ret
-
-ALIGN 16
-;***********************************************************************
-; 8x8 chroma DC prediction: one DC value per 4x4 quadrant, written as 8 rows of 8 bytes.
-; void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:
- push r3 ; r3 = running sum, r4 = scratch
- push r4
- %assign push_num 2
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- sub r1, r2 ; r1 -> row above the block
- movq mm0, [r1] ; the 8 top neighbours
-
- movzx r3, byte [r1+r2-0x01] ; l1
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l2
- add r3, r4
- movzx r4, byte [r1+r2-0x01] ; l3
- add r3, r4
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l4
- add r3, r4
- movd mm1, r3d ; mm1 = l1+l2+l3+l4
-
- movzx r3, byte [r1+r2-0x01] ; l5
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l6
- add r3, r4
- movzx r4, byte [r1+r2-0x01] ; l7
- add r3, r4
- lea r1, [r1+2*r2]
- movzx r4, byte [r1-0x01] ; l8
- add r3, r4
- movd mm2, r3d ; mm2 = l5+l6+l7+l8
-
- movq mm3, mm0
- psrlq mm0, 0x20 ; mm0 = top bytes 4..7
- psllq mm3, 0x20
- psrlq mm3, 0x20 ; mm3 = top bytes 0..3
- pxor mm4, mm4
- psadbw mm0, mm4 ; sum of top bytes 4..7
- psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
-
- paddq mm3, mm1
- movq mm1, mm2
- paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-
- movq mm4, [mmx_0x02] ; rounding constant 2
-
- paddq mm0, mm4
- psrlq mm0, 0x02 ; (sum2 + 2) >> 2
-
- paddq mm2, mm4
- psrlq mm2, 0x02 ; (sum3 + 2) >> 2
-
- paddq mm3, mm4
- paddq mm3, mm4
- psrlq mm3, 0x03 ; (sum1 + 4) >> 3
-
- paddq mm1, mm4
- paddq mm1, mm4
- psrlq mm1, 0x03 ; (sum4 + 4) >> 3
-
- pmuludq mm0, [mmx_01bytes] ; x 0x01010101: replicate each DC byte x4
- pmuludq mm3, [mmx_01bytes]
- psllq mm0, 0x20
- pxor mm0, mm3 ; mm0 = m_up
-
- pmuludq mm2, [mmx_01bytes]
- pmuludq mm1, [mmx_01bytes]
- psllq mm1, 0x20
- pxor mm1, mm2 ; mm1 = m_down
-
- movq [r0], mm0 ; rows 0..3
- movq [r0+0x08], mm0
- movq [r0+0x10], mm0
- movq [r0+0x18], mm0
-
- movq [r0+0x20], mm1 ; rows 4..7
- movq [r0+0x28], mm1
- movq [r0+0x30], mm1
- movq [r0+0x38], mm1
-
- pop r4
- pop r3
- WELSEMMS
- ret
-
-
-
-ALIGN 16
-;***********************************************************************
-; 16x16 DC prediction: (16 top + 16 left neighbours + 16) >> 5 fills all 256 bytes of pred.
-; void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:
- push r3 ; r3 = running sum of the left column, r4 = scratch
- push r4
- %assign push_num 2
- LOAD_3_PARA
- %ifndef X86_32
- movsx r2, r2d
- %endif
- sub r1, r2 ; r1 -> row above the block
- movdqa xmm0, [r1] ; read one row
- pxor xmm1, xmm1
- psadbw xmm0, xmm1 ; two partial sums, one per 8-byte half
- movdqa xmm1, xmm0
- psrldq xmm1, 0x08 ; upper-half sum moved down
- pslldq xmm0, 0x08
- psrldq xmm0, 0x08 ; upper half of xmm0 cleared
- paddw xmm0, xmm1 ; xmm0 = sum of the 16 top neighbours
-
- movzx r3, byte [r1+r2-0x01] ; left neighbour of row 0
- movzx r4, byte [r1+2*r2-0x01] ; left neighbour of row 1
- add r3, r4
- lea r1, [r1+r2] ; r1 -> row 0; each macro below steps two rows
- LOAD_2_LEFT_AND_ADD ; rows 2..15, two per invocation
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- LOAD_2_LEFT_AND_ADD
- add r3, 0x10 ; rounding
- movd xmm1, r3d
- paddw xmm0, xmm1
- psrld xmm0, 0x05 ; divide by 32
- pmuludq xmm0, [mmx_01bytes] ; x 0x01010101: DC byte x4
- pshufd xmm0, xmm0, 0 ; ... x16
-
- movdqa [r0], xmm0 ; 16 rows of 16 bytes
- movdqa [r0+0x10], xmm0
- movdqa [r0+0x20], xmm0
- movdqa [r0+0x30], xmm0
- movdqa [r0+0x40], xmm0
- movdqa [r0+0x50], xmm0
- movdqa [r0+0x60], xmm0
- movdqa [r0+0x70], xmm0
- movdqa [r0+0x80], xmm0
- movdqa [r0+0x90], xmm0
- movdqa [r0+0xa0], xmm0
- movdqa [r0+0xb0], xmm0
- movdqa [r0+0xc0], xmm0
- movdqa [r0+0xd0], xmm0
- movdqa [r0+0xe0], xmm0
- movdqa [r0+0xf0], xmm0
-
- pop r4
- pop r3
- ret
-
-;***********************************************************************
-; Picks the cheapest of the V, H and DC 4x4 intra predictions by SATD cost, writes that 4x4 prediction (16 bytes) to pRed and the mode index (0 = V, 1 = H, 2 = DC) to *pBestMode.
-;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
-; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
-; The three unnamed int32_t arguments are added to the halved DC, H and V SATD values (in that order) before the comparison; eax holds the selected total on return. All arguments are read esp-relative (32-bit stack-argument code).
-;***********************************************************************
-%ifdef X86_ASM
-WELS_EXTERN WelsSmpleSatdThree4x4_sse2
-align 16
-WelsSmpleSatdThree4x4_sse2:
- push ebx ; three pushes + return address: the first argument (pDec) is at [esp+16]
- push esi
- push edi
- mov eax, [esp+24];p_enc
- mov ebx, [esp+28];linesize_enc
-
- ; load source 4x4 samples and Hadamard transform
- movd xmm0, [eax] ; source row 0 (4 bytes)
- movd xmm1, [eax+ebx] ; source row 1
- lea eax , [eax+2*ebx]
- movd xmm2, [eax] ; source row 2
- movd xmm3, [eax+ebx] ; source row 3
- punpckldq xmm0, xmm2 ; xmm0 = rows 0 and 2
- punpckldq xmm1, xmm3 ; xmm1 = rows 1 and 3
-
- pxor xmm6, xmm6
- punpcklbw xmm0, xmm6 ; widen the bytes to 16-bit words
- punpcklbw xmm1, xmm6
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1 ; butterfly: sums in xmm0, differences in xmm2
- psubw xmm2, xmm1
- SSE2_XSawp qdq, xmm0, xmm2, xmm3 ; macro defined outside this chunk (presumably an interleave/swap helper)
-
- movdqa xmm4, xmm0
- paddw xmm0, xmm3
- psubw xmm4, xmm3
-
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm4
- punpckhwd xmm4, xmm2
-
- SSE2_XSawp dq, xmm0, xmm4, xmm3
- SSE2_XSawp qdq, xmm0, xmm3, xmm5
-
- movdqa xmm7, xmm0
- paddw xmm0, xmm5
- psubw xmm7, xmm5
-
- SSE2_XSawp qdq, xmm0, xmm7, xmm1
-
- ; Hadamard transform results are saved in xmm0 and xmm2
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
-
- ; load top boundary samples: [a b c d]
- mov eax, [esp+16];p_dec
- sub eax, [esp+20];linesize_dec
- movzx ecx, byte [eax] ; a
- movzx edx, byte [eax+1] ; b
- movzx esi, byte [eax+2] ; c
- movzx edi, byte [eax+3] ; d
-
- ; get the transform results of top boundary samples: [a b c d]
- add edx, ecx ; edx = a + b
- add edi, esi ; edi = c + d
- add ecx, ecx ; ecx = a + a
- add esi, esi ; esi = c + c
- sub ecx, edx ; ecx = a + a - a - b = a - b
- sub esi, edi ; esi = c + c - c - d = c - d
- add edi, edx ; edi = (a + b) + (c + d)
- add edx, edx
- sub edx, edi ; edx = (a + b) - (c + d)
- add esi, ecx ; esi = (a - b) + (c - d)
- add ecx, ecx
- sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
-
- movdqa xmm6, xmm0 ; keep copies of the source transform for the H and DC evaluations below
- movdqa xmm7, xmm2
- movd xmm5, edi ; keep the top-row sum (a + b + c + d) for the DC computation
- pxor xmm3, xmm3
- pxor xmm4, xmm4
- pinsrw xmm3, edi, 0 ; place the four top-row transform terms in words 0 and 4 of xmm3 / xmm4
- pinsrw xmm3, esi, 4
- psllw xmm3, 2 ; scale by 4
- pinsrw xmm4, edx, 0
- pinsrw xmm4, ecx, 4
- psllw xmm4, 2 ; scale by 4
-
- ; get the satd of V (prediction built from the top boundary samples)
- psubw xmm0, xmm3
- psubw xmm2, xmm4
-
- WELS_AbsW xmm0, xmm1 ; macro defined outside this chunk; presumably per-word absolute value with xmm1 as scratch
- WELS_AbsW xmm2, xmm1
- paddusw xmm0, xmm2
- SUMW_HORIZON1 xmm0, xmm1 ; satd of V is stored in xmm0
-
- ; load left boundary samples: [a b c d]'
- mov eax, [esp+16] ; pDec
- mov ebx, [esp+20] ; iLineSizeDec
- movzx ecx, byte [eax-1] ; a: left neighbour of row 0
- movzx edx, byte [eax+ebx-1] ; b: left neighbour of row 1
- lea eax , [eax+2*ebx]
- movzx esi, byte [eax-1] ; c: left neighbour of row 2
- movzx edi, byte [eax+ebx-1] ; d: left neighbour of row 3
-
- ; get the transform results of left boundary samples: [a b c d]'
- add edx, ecx ; edx = a + b
- add edi, esi ; edi = c + d
- add ecx, ecx ; ecx = a + a
- add esi, esi ; esi = c + c
- sub ecx, edx ; ecx = a + a - a - b = a - b
- sub esi, edi ; esi = c + c - c - d = c - d
- add edi, edx ; edi = (a + b) + (c + d)
- add edx, edx
- sub edx, edi ; edx = (a + b) - (c + d)
- add esi, ecx ; esi = (a - b) + (c - d)
- add ecx, ecx
- sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
-
- ; store the transform results in xmm3
- movd xmm3, edi
- pinsrw xmm3, edx, 1
- pinsrw xmm3, ecx, 2
- pinsrw xmm3, esi, 3
- psllw xmm3, 2 ; scale by 4
-
- ; get the satd of H (prediction built from the left boundary samples)
- movdqa xmm2, xmm6 ; work on fresh copies of the saved source transform
- movdqa xmm4, xmm7
- psubw xmm2, xmm3 ; only xmm2 has prediction terms subtracted; xmm4 is used as saved
- WELS_AbsW xmm2, xmm1
- WELS_AbsW xmm4, xmm1
- paddusw xmm2, xmm4
- SUMW_HORIZON1 xmm2, xmm1 ; satd of H is stored in xmm2
-
- ; DC result is stored in xmm1
- add edi, 4 ; edi = left-column sum + 4 (rounding term for the >> 3 below)
- movd xmm1, edi
- paddw xmm1, xmm5 ; + top-row sum saved earlier
- psrlw xmm1, 3 ; DC value = (top sum + left sum + 4) >> 3
- movdqa xmm5, xmm1 ; keep the DC value for building the prediction
- psllw xmm1, 4 ; DC * 16, held in the lowest word only
-
- ; get the satd of DC
- psubw xmm6, xmm1 ; only the lowest word of xmm6 changes (the other words of xmm1 are zero)
- WELS_AbsW xmm6, xmm1
- WELS_AbsW xmm7, xmm1
- paddusw xmm6, xmm7
- SUMW_HORIZON1 xmm6, xmm1 ; satd of DC is stored in xmm6
-
- ; comparing order: DC H V
- mov edx, [esp+32] ; edx = pRed (destination of the chosen prediction)
- movd eax, xmm6 ; eax = DC sum
- movd edi, xmm2 ; edi = H sum
- movd esi, xmm0 ; esi = V sum
- and eax, 0xffff ; keep the low 16 bits of each sum, then halve it
- shr eax, 1
- and edi, 0xffff
- shr edi, 1
- and esi, 0xffff
- shr esi, 1
- add eax, [esp+40] ; add the 7th argument to the DC value
- add edi, [esp+44] ; add the 8th argument to the H value
- add esi, [esp+48] ; add the 9th argument to the V value
- cmp ax, di ; comparisons use the low 16 bits, signed (jg)
- jg near not_dc ; DC > H: DC is out, decide between H and V
- cmp ax, si
- jg near not_dc_h ; DC <= H but DC > V: V is chosen
-
- ; for DC mode
- movd ebx, xmm5 ; DC value
- imul ebx, 0x01010101 ; replicate the DC byte into all four bytes of the dword
- movd xmm5, ebx
- pshufd xmm5, xmm5, 0 ; broadcast to 16 bytes
- movdqa [edx], xmm5 ; aligned store: pRed must be 16-byte aligned
- mov ebx, [esp+36] ; ebx = pBestMode
- mov dword [ebx], 0x02 ; mode index 2 = DC; eax still holds the DC total
- pop edi
- pop esi
- pop ebx
- ret
-
-not_dc:
- cmp di, si
- jg near not_dc_h ; H > V: V is chosen
-
- ; for H mode
- SSE_DB_1_2REG xmm6, xmm7 ; macro defined outside this chunk; presumably fills xmm6 with 0x01 bytes (xmm7 as scratch) for the byte-replicating multiplies below
- mov eax, [esp+16] ; pDec
- mov ebx, [esp+20] ; iLineSizeDec
- movzx ecx, byte [eax-1] ; left neighbour of row 0
- movd xmm0, ecx
- pmuludq xmm0, xmm6
-
- movzx ecx, byte [eax+ebx-1] ; left neighbour of row 1
- movd xmm1, ecx
- pmuludq xmm1, xmm6
-%if 1
- punpckldq xmm0, xmm1 ; xmm0 = prediction rows 0 and 1
-%else
- unpcklps xmm0, xmm1 ; disabled alternative to punpckldq
-%endif
- lea eax, [eax+ebx*2]
- movzx ecx, byte [eax-1] ; left neighbour of row 2
- movd xmm2, ecx
- pmuludq xmm2, xmm6
-
- movzx ecx, byte [eax+ebx-1] ; left neighbour of row 3
- movd xmm3, ecx
- pmuludq xmm3, xmm6
-%if 1
- punpckldq xmm2, xmm3 ; xmm2 = prediction rows 2 and 3
- punpcklqdq xmm0, xmm2 ; xmm0 = all four prediction rows
-%else
- unpcklps xmm2, xmm3 ; disabled alternative to the two integer unpacks above
- unpcklpd xmm0, xmm2
-%endif
- movdqa [edx],xmm0 ; aligned store of the 16 prediction bytes to pRed
-
- mov eax, edi ; eax = H total
- mov ebx, [esp+36] ; ebx = pBestMode
- mov dword [ebx], 0x01 ; mode index 1 = H
-
- pop edi
- pop esi
- pop ebx
- ret
-not_dc_h:
- ; for V mode
- mov eax, [esp+16] ; pDec
- sub eax, [esp+20] ; eax = pDec - iLineSizeDec (row above the block)
- movd xmm0, [eax] ; the four top boundary bytes
- pshufd xmm0, xmm0, 0 ; repeat them for all four rows
- movdqa [edx],xmm0 ; aligned store of the 16 prediction bytes to pRed
-
- mov eax, esi ; eax = V total
- mov ebx, [esp+36] ; ebx = pBestMode
- mov dword [ebx], 0x00 ; mode index 0 = V
-
- pop edi
- pop esi
- pop ebx
- ret
-%endif
-
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -23,7 +23,6 @@
PROCESSING_ASM_SRCS=\
$(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm\
$(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm\
- $(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm\
$(PROCESSING_SRCDIR)/./src/asm/vaa.asm\
PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.o)