ref: f9dea467123fbff2c74422a8634b20af4026de49
parent: 8f9a5469beb962c22b6d8bbe78f01ec79fb33a55
author: Martin Storsjö <martin@martin.st>
date: Fri Dec 13 05:06:44 EST 2013
Remove trailing whitespace

Most of the trailing whitespace was removed from C++ source files in
ff6b669176, but other files were left unchanged.
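For reference, a sweep like this is easy to script. The sketch below is a hypothetical helper (not part of this commit; the file name is illustrative) that strips trailing spaces and tabs in place while leaving LF/CRLF line endings untouched; a GNU sed one-liner such as sed -i 's/[ \t]*$//' <file> does much the same. Running a tool like this over the files that ff6b669176 missed produces the kind of diff that follows.

    #!/usr/bin/env python
    # strip_trailing_ws.py -- hypothetical helper, not part of this commit.
    # Removes trailing spaces/tabs from every line of the given files,
    # rewriting them in place without altering LF or CRLF line endings.
    import sys

    def clean_line(line):
        # For CRLF files, keep the trailing CR and strip whitespace before it.
        if line.endswith(b"\r"):
            return line[:-1].rstrip(b" \t") + b"\r"
        return line.rstrip(b" \t")

    def strip_trailing_whitespace(path):
        with open(path, "rb") as f:
            lines = f.read().split(b"\n")
        with open(path, "wb") as f:
            f.write(b"\n".join(clean_line(l) for l in lines))

    if __name__ == "__main__":
        for path in sys.argv[1:]:
            strip_trailing_whitespace(path)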
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
OpenH264
=======
OpenH264 is a codec library which supports H.264 encoding and decoding. It is suitable for use in real time applications such as WebRTC. See http://www.openh264.org/ for more details.
-
+
Encoder Features
------------------------
- Constrained Baseline Profile up to Level 5.2 (4096x2304)
@@ -17,10 +17,10 @@
- Single reference frame for inter prediction
- Multiple reference frames when using LTR and/or 3-4 temporal layers
- Periodic and on-demand Instantaneous Decoder Refresh (IDR) frame insertion
-- Dynamic changes to bit rate, frame rate, and resolution
+- Dynamic changes to bit rate, frame rate, and resolution
- Annex B byte stream output
- YUV 4:2:0 planar input
-
+
Decoder Features
------------------------
- Constrained Baseline Profile up to Level 5.2 (4096x2304)
@@ -32,7 +32,7 @@
- Multiple reference frames when specified in Sequence Parameter Set (SPS)
- Annex B byte stream input
- YUV 4:2:0 planar output
-
+
OS Support
----------------
- Windows 64-bit and 32-bit (initial release is only 32-bit, 64-bit will follow soon)
@@ -40,7 +40,7 @@
- Linux 64-bit and 32-bit (initial release is only 32-bit, 64-bit will follow soon)
- Android 32-bit (initial release does not include this target, will follow soon)
- iOS 64-bit and 32-bit (not supported yet, may be added in the future)
-
+
Processor Support
-------------------------
- Intel x86 optionally with MMX/SSE (no AVX yet, help is welcome)
@@ -53,30 +53,30 @@
: build the decoder library and executable via codec/build/linux/dec/makefile
: build the encoder library and executable via codec/build/linux/enc/makefile
: build the encoder shared library via processing/build/linux/makefile
-
+
Windows Visual Studio 2008/2010/2012 projects are available:
: build the decoder via the Visual Studio projects in codec/build/win32/dec
: build the encoder via the Visual Studio projects in codec/build/win32/enc
: build the encoder shared library via the Visual Studio projects in processing/build/win32/
-
+
NASM needs to be installed for the assembly code: workable versions are 2.07 and above; NASM can be downloaded from http://www.nasm.us/
-
+
API details to be provided later.
-
+
Using the Test App
-------------------------
Linux shell scripts to build the test apps:
: build via testbin/AutoBuild_Linux.sh
: clean via testbin/AutoClean_Linux.sh
-
+
Windows batch files to build the test apps:
: Visual Studio 2008 use testbin/AutoBuild_Windows_VS2008.bat
: Visual Studio 2010 use testbin/AutoBuild_Windows_VS2010.bat
: Visual Studio 2012 use testbin/AutoBuild_Windows_VS2012.bat
-
+
Usage information can be found in testbin/CmdLineReadMe
Command line options and details to be provided later.
-
+
Using the Source
-----------------------
codec - encoder, decoder, console (test app), build (makefile, vcproj)
@@ -83,7 +83,7 @@
processing - raw pixel processing (used by encoder)
testbin - autobuild scripts, test app config files, yuv test files
bin - binaries for library and test app
-
+
Known Issues
-------------------
See the issue tracker on https://github.com/cisco/openh264/issues
@@ -91,7 +91,7 @@
- Encoder errors when compressed frame size exceeds half uncompressed size
- Encoder console app only supports widths/heights that are multiples of 16 for now
- Decoder errors when compressed frame size exceeds 1MB
-
+
License
----------
BSD, see LICENSE file for details.
--- a/build/mktargets.py
+++ b/build/mktargets.py
@@ -19,7 +19,7 @@
def write_cpp_rule(f, x):
src = "$(%s_SRCDIR)/%s"%(PREFIX, x)
dst = "$(%s_SRCDIR)/%s"%(PREFIX, make_o(x))
-
+
f.write("%s: %s\n"%(dst, src))
f.write('\t$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(' + PREFIX + '_CFLAGS) $(' + PREFIX + '_INCLUDES) -c -o ' + dst + ' ' + src + '\n');
f.write("\n")
@@ -27,7 +27,7 @@
def write_asm_rule(f, x):
src = "$(%s_SRCDIR)/%s"%(PREFIX, x)
dst = "$(%s_SRCDIR)/%s"%(PREFIX, make_o(x))
-
+
f.write("%s: %s\n"%(dst, src))
f.write('\t$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(' + PREFIX + '_ASMFLAGS) $(' + PREFIX + '_ASM_INCLUDES) -o ' + dst + ' ' + src + '\n');
f.write("\n")
@@ -70,7 +70,7 @@
f.write("%s_CPP_SRCS=\\\n"%(PREFIX))
for c in cpp:
f.write("\t$(%s_SRCDIR)/%s\\\n"%(PREFIX, c))
-f.write("\n")
+f.write("\n")
f.write("%s_OBJS += $(%s_CPP_SRCS:.cpp=.o)\n"%(PREFIX, PREFIX))
f.write("ifeq ($(USE_ASM), Yes)\n");
--- a/codec/build/linux/dec/makefile
+++ b/codec/build/linux/dec/makefile
@@ -25,7 +25,7 @@
ASFLAGS= -f elf -DNOPREFIX -I ../../../decoder/core/asm/
LIBS= -lstdc++ -ldl
-#-lm
+#-lm
CFLAGS= $(INCLUDE) -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DHAVE_CACHE_LINE_ALIGN
ifeq ($(DBG),1)
@@ -65,7 +65,7 @@
$(CORESRCDIR)/utils.cpp \
$(PLUSSRCDIR)/welsDecoderExt.cpp \
$(PLUSSRCDIR)/welsCodecTrace.cpp \
-$(COMMONSRCDIR)/logging.cpp
+$(COMMONSRCDIR)/logging.cpp
ASMSRC= $(ASMSRCDIR)/block_add.asm \
$(ASMSRCDIR)/cpuid.asm \
@@ -78,7 +78,7 @@
$(ASMSRCDIR)/mc_luma.asm \
$(ASMSRCDIR)/memzero.asm \
$(ASMSRCDIR)/asm_inc.asm \
-
+
MAINSRC= $(MAINSRCDIR)/d3d9_utils.cpp \
$(MAINSRCDIR)/h264dec.cpp \
$(MAINSRCDIR)/read_config.cpp
@@ -119,7 +119,7 @@
$(OBJDIR)/mb_copy.o \
$(OBJDIR)/mc_luma.o \
$(OBJDIR)/memzero.o \
-$(OBJDIR)/asm_inc.o
+$(OBJDIR)/asm_inc.o
endif
OBJBIN= $(OBJDIR)/d3d9_utils.o \
@@ -134,7 +134,7 @@
dependencies:
@echo "" >dependencies
-
+
checkdir:
@echo 'checkdir..'
@if test ! -d $(BINDIR) ; \
@@ -154,7 +154,7 @@
mkdir -p $(OBJDIR) ; \
fi
@echo
-
+
release:
@echo 'release..'
@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
@@ -169,14 +169,14 @@
@rm -f $(OBJBIN)
@rm -f $(BINLIB)
@rm -f $(SHAREDLIB)
- @rm -f $(BIN)
+ @rm -f $(BIN)
tags:
@echo update tag table
@etags $(CORESRCDIR)/*.c $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
-
-
-lib: $(OBJDEC)
+
+
+lib: $(OBJDEC)
@echo '$(OBJDEC)'
@echo
@echo 'ar cr $(BINLIB) $(OBJDEC)'
@@ -197,15 +197,15 @@
@$(CXX) -shared -Wl,-Bsymbolic -o $(SHAREDLIB) $(OBJDEC) $(LIBS)
@echo '... done'
@echo
-
+
exe: $(OBJBIN)
- @echo
+ @echo
@echo '$(OBJBIN)'
@echo
@echo '$(CXX) $(LIBS) $(OBJBIN) $(BINLIB) -o $(BIN)'
@echo 'creating binary "$(BIN)"'
- @$(CXX) $(OBJBIN) $(BINLIB) -o $(BIN) $(LIBS)
+ @$(CXX) $(OBJBIN) $(BINLIB) -o $(BIN) $(LIBS)
@echo '... done'
@echo
@@ -223,31 +223,31 @@
$(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.c
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
$(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
@$(CC) -m32 -c $(CFLAGS) -o $@ $<
-
+
$(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
-
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
$(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
@echo 'compiling object file "$@" ...'
- @$(AS) $(ASFLAGS) -o $@ $<
+ @$(AS) $(ASFLAGS) -o $@ $<
#$(OBJDIR)/%.o$(SUFFIX): $(ASMCOMDIR)/%.asm
# @echo 'compiling object file "$@" ...'
# @$(AS) $(ASFLAGS) -o $@ $<
-
+
$(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
$(OBJDIR)/%.o$(SUFFIX): $(COMMONSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
@$(CC) -m32 -c $(CFLAGS) -o $@ $<
-
+
include $(DEPEND)
--- a/codec/build/linux/enc/makefile
+++ b/codec/build/linux/enc/makefile
@@ -26,8 +26,8 @@
ASFLAGS= -f elf -DNOPREFIX -I ../../../encoder/core/asm/
LIBS= -lstdc++ -ldl -lpthread -lm
-#-lm
-CFLAGS= $(INCLUDE) -m32 -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DWELS_SVC -DENCODER_CORE -DHAVE_CACHE_LINE_ALIGN -DWELS_TESTBED -DMT_ENABLED
+#-lm
+CFLAGS= $(INCLUDE) -m32 -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DWELS_SVC -DENCODER_CORE -DHAVE_CACHE_LINE_ALIGN -DWELS_TESTBED -DMT_ENABLED
ifeq ($(DBG),1)
#SUFFIX= .dbg
@@ -150,7 +150,7 @@
$(OBJDIR)/satd_sad.o \
$(OBJDIR)/score.o \
$(OBJDIR)/asm_inc.o \
-$(OBJDIR)/vaa.o
+$(OBJDIR)/vaa.o
endif
OBJBIN= $(OBJDIR)/read_config.o \
$(OBJDIR)/welsenc.o
@@ -163,7 +163,7 @@
dependencies:
@echo "" >dependencies
-
+
checkdir:
@echo 'checkdir..'
@if test ! -d $(OUTDIR) ; \
@@ -195,9 +195,9 @@
tags:
@echo update tag table
@etags $(THREADLIBSRCDIR)/*.cpp $(COMMSRCDIR)/*.cpp $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
-
-
-lib: $(OBJENC)
+
+
+lib: $(OBJENC)
@echo '$(OBJENC)'
@echo
@echo 'ar cr $(BINLIB) $(OBJENC)'
@@ -218,7 +218,7 @@
@$(GCC) -shared -Wl,-Bsymbolic -m32 -o $(SHAREDLIB) $(OBJENC) $(LIBS)
@echo '... done'
@echo
-
+
release:
@echo 'release..'
@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
@@ -228,7 +228,7 @@
@echo
exe: $(OBJBIN)
- @echo
+ @echo
@echo '$(OBJBIN)'
@echo
@echo '$(GCC) $(LIBS) $(OBJBIN) $(BINLIB) -m32 -o $(BIN)'
@@ -251,24 +251,24 @@
$(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
-
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
$(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
$(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
@echo 'compiling object file "$@" ...'
- @$(AS) $(ASFLAGS) -o $@ $<
-
+ @$(AS) $(ASFLAGS) -o $@ $<
+
$(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
-
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
$(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
- @$(CC) -m32 -c $(CFLAGS) -o $@ $<
-
+ @$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
$(OBJDIR)/%.o$(SUFFIX): $(COMMONSRCDIR)/%.cpp
@echo 'compiling object file "$@" ...'
@$(CC) -m32 -c $(CFLAGS) -o $@ $<
--- a/codec/decoder/core/asm/asm_inc.asm
+++ b/codec/decoder/core/asm/asm_inc.asm
@@ -43,7 +43,7 @@
; Options, for DEBUG
;***********************************************************************
-%if 1
+%if 1
%define MOVDQ movdqa
%else
%define MOVDQ movdqu
@@ -58,7 +58,7 @@
BITS 32
;***********************************************************************
-; Macros
+; Macros
;***********************************************************************
%macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
pxor %2, %2
psubw %2, %1
pmaxsw %1, %2
-%endmacro
+%endmacro
%macro MMX_XSwap 4
movq %4, %2
@@ -105,7 +105,7 @@
SSE2_XSawp qdq, %5, %2, %3
%endmacro
-;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
+;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
%macro SSE2_TransTwo4x4W 5
SSE2_XSawp wd, %1, %2, %5
SSE2_XSawp wd, %3, %4, %2
@@ -125,26 +125,26 @@
movdqa %6, %9
movdqa %9, %4
SSE2_XSawp bw, %7, %6, %4
-
- SSE2_XSawp wd, %1, %3, %6
+
+ SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2
movdqa %7, %9
- movdqa %9, %3
+ movdqa %9, %3
SSE2_XSawp wd, %7, %4, %3
-
- SSE2_XSawp dq, %1, %5, %4
+
+ SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2
movdqa %7, %9
- movdqa %9, %5
+ movdqa %9, %5
SSE2_XSawp dq, %7, %3, %5
-
+
SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %9
- movdqa %9, %1
+ movdqa %9, %1
SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %9
%endmacro
@@ -170,9 +170,9 @@
%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
mov %3h, %3l
movd %1, e%3x ; i.e, 1% = eax (=b0)
- pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
- pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
+ pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
+ pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
;copy a dw into a xmm for 8 times
%macro SSE2_Copy8Times 2
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -48,7 +48,7 @@
; Macros and other preprocessor constants
;*******************************************************************************
-%macro BLOCK_ADD_16_SSE2 4
+%macro BLOCK_ADD_16_SSE2 4
movdqa xmm0, [%2]
movdqa xmm1, [%3]
movdqa xmm2, [%3+10h]
@@ -65,7 +65,7 @@
lea %2, [%2+%4]
lea %3, [%3+%4*2]
- lea %1, [%1+%4]
+ lea %1, [%1+%4]
%endmacro
%macro BLOCK_ADD_8_MMXEXT 4
@@ -106,7 +106,7 @@
lea %2, [%2+%4]
lea %3, [%3+%5*2]
- lea %1, [%1+%4]
+ lea %1, [%1+%4]
%endmacro
@@ -130,24 +130,24 @@
lea %1, [%1+%4]
%endmacro
-%macro BLOCK_ADD_8_STRIDE_2_LINES_SSE2 5
+%macro BLOCK_ADD_8_STRIDE_2_LINES_SSE2 5
movdqa xmm1, [%3]
movq xmm0, [%2]
punpcklbw xmm0, xmm7
paddw xmm0, xmm1
packuswb xmm0, xmm7
- movq [%1], xmm0
-
+ movq [%1], xmm0
+
movdqa xmm3, [%3+%5*2]
movq xmm2, [%2+%4]
punpcklbw xmm2, xmm7
paddw xmm2, xmm3
- packuswb xmm2, xmm7
- movq [%1+%4], xmm2
-
+ packuswb xmm2, xmm7
+ movq [%1+%4], xmm2
+
lea %1, [%1+%4*2]
lea %2, [%2+%4*2]
- lea %3, [%3+%5*4]
+ lea %3, [%3+%5*4]
%endmacro
%macro CHECK_DATA_16_ZERO_SSE4 3
@@ -159,7 +159,7 @@
por xmm0, xmm1
ptest xmm7, xmm0
cmovae eax, %3
-
+
add %1, 20h
add ecx, 04h
mov byte [%2+ebx], al
@@ -170,12 +170,12 @@
movdqa xmm1, [%1+%3]
movdqa xmm2, [%1+%3*2]
movdqa xmm3, [%1+%4]
-
+
mov eax, 0h
mov ebx, 0h
movdqa xmm4, xmm0
movdqa xmm5, xmm2
-
+
punpcklqdq xmm0, xmm1
punpckhqdq xmm4, xmm1
punpcklqdq xmm2, xmm3
@@ -183,12 +183,12 @@
por xmm0, xmm2
por xmm4, xmm5
-
+
ptest xmm7, xmm0
cmovae eax, %5
ptest xmm7, xmm4
- cmovae ebx, %5
-
+ cmovae ebx, %5
+
mov byte [%2], al
mov byte [%2+1], bl
%endmacro
@@ -230,45 +230,45 @@
movdqa xmm0, [%1]
movdqa xmm1, [%1+10h]
mov ebx, [ecx]
-
+
pcmpeqw xmm0, xmm7
pcmpeqw xmm1, xmm7
packsswb xmm0, xmm1
- pmovmskb edx, xmm0
+ pmovmskb edx, xmm0
sub edx, 0ffffh
-
- cmovb eax, ebp
+
+ cmovb eax, ebp
add ecx, 4
add %1, 20h
mov byte [%2+ebx], al
%endmacro
-
+
%macro CHECK_RS_4x4_BLOCK_2_ZERO_SSE2 5
movdqa xmm0, [%1]
movdqa xmm1, [%1 + %3]
movdqa xmm2, [%1 + %3*2]
- movdqa xmm3, [%1 + %4]
-
+ movdqa xmm3, [%1 + %4]
+
movdqa xmm4, xmm0
movdqa xmm5, xmm2
-
+
punpcklqdq xmm0, xmm1
punpckhqdq xmm4, xmm1
punpcklqdq xmm2, xmm3
punpckhqdq xmm5, xmm3
-
+
pcmpeqw xmm0, xmm7
pcmpeqw xmm2, xmm7
pcmpeqw xmm4, xmm7
pcmpeqw xmm5, xmm7
-
+
packsswb xmm0, xmm2
packsswb xmm4, xmm5
pmovmskb eax, xmm0
pmovmskb ebx, xmm4
-
+
sub eax, 0ffffh
mov eax, 0
cmovb eax, %5
@@ -276,7 +276,7 @@
mov ebx, 0
cmovb ebx, %5
mov byte [%2], al
- mov byte [%2+1], bl
+ mov byte [%2+1], bl
%endmacro
;*******************************************************************************
@@ -291,12 +291,12 @@
ALIGN 16
SubMbScanIdx:
- dd 0x0, 0x1, 0x4, 0x5,
+ dd 0x0, 0x1, 0x4, 0x5,
dd 0x2, 0x3, 0x6, 0x7,
dd 0x8, 0x9, 0xc, 0xd,
dd 0xa, 0xb, 0xe, 0xf,
dd 0x10, 0x11, 0x14, 0x15,
- dd 0x12, 0x13, 0x16, 0x17,
+ dd 0x12, 0x13, 0x16, 0x17,
;*******************************************************************************
; Code
@@ -312,10 +312,10 @@
; void_t WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
;*******************************************************************************
WelsResBlockZero16x16_sse2:
- push esi
+ push esi
mov esi, [esp+08h]
- mov ecx, [esp+0ch]
+ mov ecx, [esp+0ch]
lea ecx, [ecx*2]
lea eax, [ecx*3]
@@ -375,7 +375,7 @@
movdqa [esi+eax], xmm7
movdqa [esi+eax+10h], xmm7
-
+
pop esi
ret
@@ -386,7 +386,7 @@
;*******************************************************************************
; void_t WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
;*******************************************************************************
-WelsResBlockZero8x8_sse2:
+WelsResBlockZero8x8_sse2:
push esi
mov esi, [esp+08h]
@@ -407,7 +407,7 @@
movdqa [esi+ecx*2], xmm7
movdqa [esi+eax], xmm7
-
+
pop esi
ret
--- a/codec/decoder/core/asm/cpuid.asm
+++ b/codec/decoder/core/asm/cpuid.asm
@@ -84,12 +84,12 @@
; void WelsCPUId( int32_t index, int32_t *uiFeatureA, int32_t *uiFeatureB, int32_t *uiFeatureC, int32_t *uiFeatureD )
;****************************************************************************************************
WelsCPUId:
- push ebx
+ push ebx
push edi
-
+
mov eax, [esp+12] ; operating index
cpuid ; cpuid
-
+
; processing various information return
mov edi, [esp+16]
mov [edi], eax
@@ -100,10 +100,10 @@
mov edi, [esp+28]
mov [edi], edx
- pop edi
+ pop edi
pop ebx
ret
-
+
WELS_EXTERN WelsCPUSupportAVX
; need call after cpuid=1 and eax, ecx flag got then
ALIGN 16
@@ -139,7 +139,7 @@
WelsCPUSupportFMA:
mov eax, [esp+4]
mov ecx, [esp+8]
-
+
; refer to detection of FMA addressed in INTEL AVX manual document
and ecx, 018001000H
cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
mov eax, 1
ret
fma_not_supported:
- mov eax, 0
+ mov eax, 0
ret
WELS_EXTERN WelsEmms
--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@@ -99,9 +99,9 @@
%define kiStride esp+pushsize+8
%define pRs esp+pushsize+12
- mov eax, [pRs ]
- mov edx, [pPred ]
- mov ecx, [kiStride]
+ mov eax, [pRs ]
+ mov edx, [pPred ]
+ mov ecx, [kiStride]
movq mm0, [eax+ 0]
movq mm1, [eax+ 8]
movq mm2, [eax+16]
@@ -114,13 +114,13 @@
WELS_Zero mm7
WELS_DW32 mm6
-
+
MMX_StoreDiff4P mm3, mm0, mm6, mm7, [edx]
MMX_StoreDiff4P mm4, mm0, mm6, mm7, [edx+ecx]
lea edx, [edx+2*ecx]
MMX_StoreDiff4P mm1, mm0, mm6, mm7, [edx]
MMX_StoreDiff4P mm2, mm0, mm6, mm7, [edx+ecx]
-
+
%undef pushsize
%undef pPred
%undef kiStride
--- a/codec/decoder/core/asm/deblock.asm
+++ b/codec/decoder/core/asm/deblock.asm
@@ -62,169 +62,169 @@
ALIGN 16
DeblockChromaEq4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,68h
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,68h
mov edx,[ebp+10h] ; iStride
mov eax,[ebp+8] ; pPixCb
mov ecx,[ebp+0Ch] ; pPixCr
- movq xmm4,[ecx]
- movq xmm5,[edx+ecx]
- push esi
- push edi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- movq xmm1,[edi]
- mov edi,ecx
- sub edi,esi
- movq xmm2,[edi]
- punpcklqdq xmm1,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm2,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm3,[edi]
- punpcklqdq xmm2,xmm3
- movq xmm3,[eax]
- punpcklqdq xmm3,xmm4
- movq xmm4,[edx+eax]
- mov edx, [ebp + 14h]
- punpcklqdq xmm4,xmm5
- movd xmm5,edx
- mov edx, [ebp + 18h]
- pxor xmm0,xmm0
- movdqa xmm6,xmm5
- punpcklwd xmm6,xmm5
- pshufd xmm5,xmm6,0
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,xmm1
- punpckhbw xmm1,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+40h],xmm1
- movdqa [esp+60h],xmm7
- movdqa xmm7,xmm2
- punpcklbw xmm7,xmm0
- movdqa [esp+10h],xmm7
- movdqa xmm7,xmm3
- punpcklbw xmm7,xmm0
- punpckhbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm7,xmm4
- punpckhbw xmm4,xmm0
- punpckhbw xmm2,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+30h],xmm3
- movdqa xmm3,[esp+10h]
- movdqa xmm1,xmm3
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa [esp+20h],xmm4
- movdqa xmm0,xmm5
- pcmpgtw xmm0,xmm1
- movdqa xmm1,[esp+60h]
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- pand xmm0,xmm4
- movdqa xmm1,xmm7
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,xmm2
- psubw xmm1,[esp+30h]
- pabsw xmm1,xmm1
- pcmpgtw xmm5,xmm1
- movdqa xmm1,[esp+40h]
- pand xmm0,xmm4
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,[esp+20h]
- psubw xmm1,[esp+30h]
- pand xmm5,xmm4
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- pand xmm5,xmm6
- mov edx,2
- movsx edx,dx
- movd xmm1,edx
- movdqa xmm4,xmm1
- punpcklwd xmm4,xmm1
- pshufd xmm1,xmm4,0
- movdqa xmm4,[esp+60h]
- movdqa xmm6,xmm4
- paddw xmm6,xmm4
- paddw xmm6,xmm3
- paddw xmm6,xmm7
- movdqa [esp+10h],xmm1
- paddw xmm6,[esp+10h]
- psraw xmm6,2
- movdqa xmm4,xmm0
- pandn xmm4,xmm3
- movdqa xmm3,[esp+40h]
- movdqa xmm1,xmm0
- pand xmm1,xmm6
- por xmm1,xmm4
- movdqa xmm6,xmm3
- paddw xmm6,xmm3
- movdqa xmm3,[esp+10h]
- paddw xmm6,xmm2
- paddw xmm6,[esp+20h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm4,xmm5
- pand xmm4,xmm6
- movdqa xmm6,xmm5
- pandn xmm6,xmm2
- por xmm4,xmm6
- packuswb xmm1,xmm4
- movdqa xmm4,[esp+50h]
- movdqa xmm6,xmm7
- paddw xmm6,xmm7
- paddw xmm6,xmm4
- paddw xmm6,[esp+60h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm2,xmm0
- pand xmm2,xmm6
- pandn xmm0,xmm4
- por xmm2,xmm0
- movdqa xmm0,[esp+20h]
- movdqa xmm6,xmm0
- paddw xmm6,xmm0
- movdqa xmm0,[esp+30h]
- paddw xmm6,xmm0
- paddw xmm6,[esp+40h]
- movdqa xmm4,xmm5
- paddw xmm6,xmm3
- movq [esi],xmm1
- psraw xmm6,2
- pand xmm4,xmm6
- pandn xmm5,xmm0
- por xmm4,xmm5
- packuswb xmm2,xmm4
- movq [eax],xmm2
- psrldq xmm1,8
- movq [edi],xmm1
- pop edi
- psrldq xmm2,8
- movq [ecx],xmm2
- pop esi
- mov esp,ebp
- pop ebp
- ret
+ movq xmm4,[ecx]
+ movq xmm5,[edx+ecx]
+ push esi
+ push edi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ movq xmm1,[edi]
+ mov edi,ecx
+ sub edi,esi
+ movq xmm2,[edi]
+ punpcklqdq xmm1,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm2,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm3,[edi]
+ punpcklqdq xmm2,xmm3
+ movq xmm3,[eax]
+ punpcklqdq xmm3,xmm4
+ movq xmm4,[edx+eax]
+ mov edx, [ebp + 14h]
+ punpcklqdq xmm4,xmm5
+ movd xmm5,edx
+ mov edx, [ebp + 18h]
+ pxor xmm0,xmm0
+ movdqa xmm6,xmm5
+ punpcklwd xmm6,xmm5
+ pshufd xmm5,xmm6,0
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,xmm1
+ punpckhbw xmm1,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+40h],xmm1
+ movdqa [esp+60h],xmm7
+ movdqa xmm7,xmm2
+ punpcklbw xmm7,xmm0
+ movdqa [esp+10h],xmm7
+ movdqa xmm7,xmm3
+ punpcklbw xmm7,xmm0
+ punpckhbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm7,xmm4
+ punpckhbw xmm4,xmm0
+ punpckhbw xmm2,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+30h],xmm3
+ movdqa xmm3,[esp+10h]
+ movdqa xmm1,xmm3
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa [esp+20h],xmm4
+ movdqa xmm0,xmm5
+ pcmpgtw xmm0,xmm1
+ movdqa xmm1,[esp+60h]
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ pand xmm0,xmm4
+ movdqa xmm1,xmm7
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,xmm2
+ psubw xmm1,[esp+30h]
+ pabsw xmm1,xmm1
+ pcmpgtw xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ pand xmm0,xmm4
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,[esp+20h]
+ psubw xmm1,[esp+30h]
+ pand xmm5,xmm4
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ pand xmm5,xmm6
+ mov edx,2
+ movsx edx,dx
+ movd xmm1,edx
+ movdqa xmm4,xmm1
+ punpcklwd xmm4,xmm1
+ pshufd xmm1,xmm4,0
+ movdqa xmm4,[esp+60h]
+ movdqa xmm6,xmm4
+ paddw xmm6,xmm4
+ paddw xmm6,xmm3
+ paddw xmm6,xmm7
+ movdqa [esp+10h],xmm1
+ paddw xmm6,[esp+10h]
+ psraw xmm6,2
+ movdqa xmm4,xmm0
+ pandn xmm4,xmm3
+ movdqa xmm3,[esp+40h]
+ movdqa xmm1,xmm0
+ pand xmm1,xmm6
+ por xmm1,xmm4
+ movdqa xmm6,xmm3
+ paddw xmm6,xmm3
+ movdqa xmm3,[esp+10h]
+ paddw xmm6,xmm2
+ paddw xmm6,[esp+20h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm4,xmm5
+ pand xmm4,xmm6
+ movdqa xmm6,xmm5
+ pandn xmm6,xmm2
+ por xmm4,xmm6
+ packuswb xmm1,xmm4
+ movdqa xmm4,[esp+50h]
+ movdqa xmm6,xmm7
+ paddw xmm6,xmm7
+ paddw xmm6,xmm4
+ paddw xmm6,[esp+60h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm2,xmm0
+ pand xmm2,xmm6
+ pandn xmm0,xmm4
+ por xmm2,xmm0
+ movdqa xmm0,[esp+20h]
+ movdqa xmm6,xmm0
+ paddw xmm6,xmm0
+ movdqa xmm0,[esp+30h]
+ paddw xmm6,xmm0
+ paddw xmm6,[esp+40h]
+ movdqa xmm4,xmm5
+ paddw xmm6,xmm3
+ movq [esi],xmm1
+ psraw xmm6,2
+ pand xmm4,xmm6
+ pandn xmm5,xmm0
+ por xmm4,xmm5
+ packuswb xmm2,xmm4
+ movq [eax],xmm2
+ psrldq xmm1,8
+ movq [edi],xmm1
+ pop edi
+ psrldq xmm2,8
+ movq [ecx],xmm2
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************
@@ -231,203 +231,203 @@
WELS_EXTERN DeblockChromaLt4V_sse2
DeblockChromaLt4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0E4h
- push ebx
- push esi
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0E4h
+ push ebx
+ push esi
mov esi, [ebp+1Ch] ; pTC
- movsx ebx, byte [esi+2]
- push edi
- movsx di,byte [esi+3]
- mov word [esp+0Ch],bx
- movsx bx,byte [esi+1]
- movsx esi,byte [esi]
- mov word [esp+0Eh],si
- movzx esi,di
- movd xmm1,esi
- movzx esi,di
- movd xmm2,esi
- mov si,word [esp+0Ch]
- mov edx, [ebp + 10h]
- mov eax, [ebp + 08h]
- movzx edi,si
- movzx esi,si
- mov ecx, [ebp + 0Ch]
- movd xmm4,esi
- movzx esi,bx
- movd xmm5,esi
- movd xmm3,edi
- movzx esi,bx
- movd xmm6,esi
- mov si,word [esp+0Eh]
- movzx edi,si
- movzx esi,si
- punpcklwd xmm6,xmm2
- pxor xmm0,xmm0
- movdqa [esp+40h],xmm0
- movd xmm7,edi
- movd xmm0,esi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+40h]
- punpcklwd xmm0,xmm4
- movq xmm4,[edx+ecx]
- punpcklwd xmm7,xmm3
- movq xmm3,[eax]
- punpcklwd xmm0,xmm6
- movq xmm6,[edi]
- punpcklwd xmm7,xmm5
- punpcklwd xmm0,xmm7
- mov edi,ecx
- sub edi,esi
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+60h],xmm2
- movq xmm2, [edi]
- punpcklqdq xmm6,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm7,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm2,[edi]
- punpcklqdq xmm7,xmm2
- movq xmm2,[ecx]
- punpcklqdq xmm3,xmm2
- movq xmm2,[edx+eax]
- movsx edx,word [ebp + 14h]
- punpcklqdq xmm2,xmm4
- movdqa [esp+0E0h],xmm2
- movd xmm2,edx
- movsx edx,word [ebp + 18h]
- movdqa xmm4,xmm2
- punpcklwd xmm4,xmm2
- movd xmm2,edx
- movdqa xmm5,xmm2
- punpcklwd xmm5,xmm2
- pshufd xmm2,xmm5,0
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- movdqa [esp+0D0h],xmm3
- pshufd xmm4,xmm4,0
- movdqa [esp+30h],xmm2
- punpckhbw xmm6,xmm1
- movdqa [esp+80h],xmm6
- movdqa xmm6,[esp+0D0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+70h],xmm6
- movdqa xmm6, [esp+0E0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+90h],xmm6
- movdqa xmm5, [esp+0E0h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0A0h],xmm7
- punpcklbw xmm3,xmm1
- mov edx,4
- punpcklbw xmm2,xmm1
- movsx edx,dx
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,[esp+30h]
- movdqa [esp+20h],xmm6
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa xmm1,[esp+60h]
- movdqa [esp+40h],xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6, [esp+20h]
- movdqa xmm7, [esp+50h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa [esp+10h],xmm0
- movdqa xmm6, [esp+10h]
- pminsw xmm6,xmm1
- movdqa [esp+10h],xmm6
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm6,xmm4
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+30h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1,[esp+50h]
- pand xmm6,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5,[esp+80h]
- psubw xmm5,[esp+90h]
- pand xmm6,xmm1
- pand xmm6,[esp+40h]
- movdqa xmm1,[esp+10h]
- pand xmm1,xmm6
- movdqa xmm6,[esp+70h]
- movdqa [esp+30h],xmm1
- movdqa xmm1,[esp+0A0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6,[esp+20h]
- movdqa xmm5,[esp+60h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+70h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+80h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+90h]
- pand xmm4,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+40h]
- pand xmm0,xmm4
- movdqa xmm4,[esp+30h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- packuswb xmm2,xmm1
- movq [esi],xmm2
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm3,xmm5
- movq [eax],xmm3
- psrldq xmm2,8
- movq [edi],xmm2
- pop edi
- pop esi
- psrldq xmm3,8
- movq [ecx],xmm3
- pop ebx
- mov esp,ebp
- pop ebp
- ret
-
+ movsx ebx, byte [esi+2]
+ push edi
+ movsx di,byte [esi+3]
+ mov word [esp+0Ch],bx
+ movsx bx,byte [esi+1]
+ movsx esi,byte [esi]
+ mov word [esp+0Eh],si
+ movzx esi,di
+ movd xmm1,esi
+ movzx esi,di
+ movd xmm2,esi
+ mov si,word [esp+0Ch]
+ mov edx, [ebp + 10h]
+ mov eax, [ebp + 08h]
+ movzx edi,si
+ movzx esi,si
+ mov ecx, [ebp + 0Ch]
+ movd xmm4,esi
+ movzx esi,bx
+ movd xmm5,esi
+ movd xmm3,edi
+ movzx esi,bx
+ movd xmm6,esi
+ mov si,word [esp+0Eh]
+ movzx edi,si
+ movzx esi,si
+ punpcklwd xmm6,xmm2
+ pxor xmm0,xmm0
+ movdqa [esp+40h],xmm0
+ movd xmm7,edi
+ movd xmm0,esi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ punpcklwd xmm0,xmm4
+ movq xmm4,[edx+ecx]
+ punpcklwd xmm7,xmm3
+ movq xmm3,[eax]
+ punpcklwd xmm0,xmm6
+ movq xmm6,[edi]
+ punpcklwd xmm7,xmm5
+ punpcklwd xmm0,xmm7
+ mov edi,ecx
+ sub edi,esi
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+60h],xmm2
+ movq xmm2, [edi]
+ punpcklqdq xmm6,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm7,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm2,[edi]
+ punpcklqdq xmm7,xmm2
+ movq xmm2,[ecx]
+ punpcklqdq xmm3,xmm2
+ movq xmm2,[edx+eax]
+ movsx edx,word [ebp + 14h]
+ punpcklqdq xmm2,xmm4
+ movdqa [esp+0E0h],xmm2
+ movd xmm2,edx
+ movsx edx,word [ebp + 18h]
+ movdqa xmm4,xmm2
+ punpcklwd xmm4,xmm2
+ movd xmm2,edx
+ movdqa xmm5,xmm2
+ punpcklwd xmm5,xmm2
+ pshufd xmm2,xmm5,0
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ movdqa [esp+0D0h],xmm3
+ pshufd xmm4,xmm4,0
+ movdqa [esp+30h],xmm2
+ punpckhbw xmm6,xmm1
+ movdqa [esp+80h],xmm6
+ movdqa xmm6,[esp+0D0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+70h],xmm6
+ movdqa xmm6, [esp+0E0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+90h],xmm6
+ movdqa xmm5, [esp+0E0h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0A0h],xmm7
+ punpcklbw xmm3,xmm1
+ mov edx,4
+ punpcklbw xmm2,xmm1
+ movsx edx,dx
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,[esp+30h]
+ movdqa [esp+20h],xmm6
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1,[esp+60h]
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6, [esp+20h]
+ movdqa xmm7, [esp+50h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa [esp+10h],xmm0
+ movdqa xmm6, [esp+10h]
+ pminsw xmm6,xmm1
+ movdqa [esp+10h],xmm6
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm6,xmm4
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+30h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1,[esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5,[esp+80h]
+ psubw xmm5,[esp+90h]
+ pand xmm6,xmm1
+ pand xmm6,[esp+40h]
+ movdqa xmm1,[esp+10h]
+ pand xmm1,xmm6
+ movdqa xmm6,[esp+70h]
+ movdqa [esp+30h],xmm1
+ movdqa xmm1,[esp+0A0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6,[esp+20h]
+ movdqa xmm5,[esp+60h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+70h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+80h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+90h]
+ pand xmm4,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+40h]
+ pand xmm0,xmm4
+ movdqa xmm4,[esp+30h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ packuswb xmm2,xmm1
+ movq [esi],xmm2
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm3,xmm5
+ movq [eax],xmm3
+ psrldq xmm2,8
+ movq [edi],xmm2
+ pop edi
+ pop esi
+ psrldq xmm3,8
+ movq [ecx],xmm3
+ pop ebx
+ mov esp,ebp
+ pop ebp
+ ret
+
;***************************************************************************
-; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta)
;***************************************************************************
@@ -434,606 +434,606 @@
WELS_EXTERN DeblockChromaEq4H_sse2
ALIGN 16
-
+
DeblockChromaEq4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0C8h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+18h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+7Ch]
- push edi
- mov dword [esp+14h],esi
- mov dword [esp+18h],ecx
- mov dword [esp+0Ch],edx
- mov dword [esp+10h],eax
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+0Ch]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+10h]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- movsx ecx,word [ebp+14h]
- movsx edx,word [ebp+18h]
- movdqa xmm6,[esp+80h]
- movdqa xmm4,[esp+90h]
- movdqa xmm5,[esp+0A0h]
- movdqa xmm7,[esp+0B0h]
- pxor xmm0,xmm0
- movd xmm1,ecx
- movdqa xmm2,xmm1
- punpcklwd xmm2,xmm1
- pshufd xmm1,xmm2,0
- movd xmm2,edx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3,xmm6
- punpckhbw xmm6,xmm0
- movdqa [esp+60h],xmm6
- movdqa xmm6,[esp+90h]
- punpckhbw xmm6,xmm0
- movdqa [esp+30h],xmm6
- movdqa xmm6,[esp+0A0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+40h],xmm6
- movdqa xmm6,[esp+0B0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+70h],xmm6
- punpcklbw xmm7,xmm0
- punpcklbw xmm4,xmm0
- punpcklbw xmm5,xmm0
- punpcklbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm6,xmm4
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- movdqa xmm0,xmm1
- pcmpgtw xmm0,xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm4
- pabsw xmm6,xmm6
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+30h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pcmpgtw xmm1,xmm6
- movdqa xmm6,[esp+60h]
- psubw xmm6,[esp+30h]
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+70h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pand xmm1,xmm7
- pcmpgtw xmm2,xmm6
- pand xmm1,xmm2
- mov eax,2
- movsx ecx,ax
- movd xmm2,ecx
- movdqa xmm6,xmm2
- punpcklwd xmm6,xmm2
- pshufd xmm2,xmm6,0
- movdqa [esp+20h],xmm2
- movdqa xmm2,xmm3
- paddw xmm2,xmm3
- paddw xmm2,xmm4
- paddw xmm2,[esp+50h]
- paddw xmm2,[esp+20h]
- psraw xmm2,2
- movdqa xmm6,xmm0
- pand xmm6,xmm2
- movdqa xmm2,xmm0
- pandn xmm2,xmm4
- por xmm6,xmm2
- movdqa xmm2,[esp+60h]
- movdqa xmm7,xmm2
- paddw xmm7,xmm2
- paddw xmm7,[esp+30h]
- paddw xmm7,[esp+70h]
- paddw xmm7,[esp+20h]
- movdqa xmm4,xmm1
- movdqa xmm2,xmm1
- pandn xmm2,[esp+30h]
- psraw xmm7,2
- pand xmm4,xmm7
- por xmm4,xmm2
- movdqa xmm2,[esp+50h]
- packuswb xmm6,xmm4
- movdqa [esp+90h],xmm6
- movdqa xmm6,xmm2
- paddw xmm6,xmm2
- movdqa xmm2,[esp+20h]
- paddw xmm6,xmm5
- paddw xmm6,xmm3
- movdqa xmm4,xmm0
- pandn xmm0,xmm5
- paddw xmm6,xmm2
- psraw xmm6,2
- pand xmm4,xmm6
- por xmm4,xmm0
- movdqa xmm0,[esp+70h]
- movdqa xmm5,xmm0
- paddw xmm5,xmm0
- movdqa xmm0,[esp+40h]
- paddw xmm5,xmm0
- paddw xmm5,[esp+60h]
- movdqa xmm3,xmm1
- paddw xmm5,xmm2
- psraw xmm5,2
- pand xmm3,xmm5
- pandn xmm1,xmm0
- por xmm3,xmm1
- packuswb xmm4,xmm3
- movdqa [esp+0A0h],xmm4
- mov esi,dword [esp+10h]
- movdqa xmm0,[esi]
- movdqa xmm1,[esi+10h]
- movdqa xmm2,[esi+20h]
- movdqa xmm3,[esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+0Ch]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0C8h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+18h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+7Ch]
+ push edi
+ mov dword [esp+14h],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+0Ch],edx
+ mov dword [esp+10h],eax
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+0Ch]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+10h]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ movsx ecx,word [ebp+14h]
+ movsx edx,word [ebp+18h]
+ movdqa xmm6,[esp+80h]
+ movdqa xmm4,[esp+90h]
+ movdqa xmm5,[esp+0A0h]
+ movdqa xmm7,[esp+0B0h]
+ pxor xmm0,xmm0
+ movd xmm1,ecx
+ movdqa xmm2,xmm1
+ punpcklwd xmm2,xmm1
+ pshufd xmm1,xmm2,0
+ movd xmm2,edx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3,xmm6
+ punpckhbw xmm6,xmm0
+ movdqa [esp+60h],xmm6
+ movdqa xmm6,[esp+90h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+30h],xmm6
+ movdqa xmm6,[esp+0A0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,[esp+0B0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+70h],xmm6
+ punpcklbw xmm7,xmm0
+ punpcklbw xmm4,xmm0
+ punpcklbw xmm5,xmm0
+ punpcklbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm6,xmm4
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ movdqa xmm0,xmm1
+ pcmpgtw xmm0,xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm4
+ pabsw xmm6,xmm6
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+30h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pcmpgtw xmm1,xmm6
+ movdqa xmm6,[esp+60h]
+ psubw xmm6,[esp+30h]
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+70h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pand xmm1,xmm7
+ pcmpgtw xmm2,xmm6
+ pand xmm1,xmm2
+ mov eax,2
+ movsx ecx,ax
+ movd xmm2,ecx
+ movdqa xmm6,xmm2
+ punpcklwd xmm6,xmm2
+ pshufd xmm2,xmm6,0
+ movdqa [esp+20h],xmm2
+ movdqa xmm2,xmm3
+ paddw xmm2,xmm3
+ paddw xmm2,xmm4
+ paddw xmm2,[esp+50h]
+ paddw xmm2,[esp+20h]
+ psraw xmm2,2
+ movdqa xmm6,xmm0
+ pand xmm6,xmm2
+ movdqa xmm2,xmm0
+ pandn xmm2,xmm4
+ por xmm6,xmm2
+ movdqa xmm2,[esp+60h]
+ movdqa xmm7,xmm2
+ paddw xmm7,xmm2
+ paddw xmm7,[esp+30h]
+ paddw xmm7,[esp+70h]
+ paddw xmm7,[esp+20h]
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ pandn xmm2,[esp+30h]
+ psraw xmm7,2
+ pand xmm4,xmm7
+ por xmm4,xmm2
+ movdqa xmm2,[esp+50h]
+ packuswb xmm6,xmm4
+ movdqa [esp+90h],xmm6
+ movdqa xmm6,xmm2
+ paddw xmm6,xmm2
+ movdqa xmm2,[esp+20h]
+ paddw xmm6,xmm5
+ paddw xmm6,xmm3
+ movdqa xmm4,xmm0
+ pandn xmm0,xmm5
+ paddw xmm6,xmm2
+ psraw xmm6,2
+ pand xmm4,xmm6
+ por xmm4,xmm0
+ movdqa xmm0,[esp+70h]
+ movdqa xmm5,xmm0
+ paddw xmm5,xmm0
+ movdqa xmm0,[esp+40h]
+ paddw xmm5,xmm0
+ paddw xmm5,[esp+60h]
+ movdqa xmm3,xmm1
+ paddw xmm5,xmm2
+ psraw xmm5,2
+ pand xmm3,xmm5
+ pandn xmm1,xmm0
+ por xmm3,xmm1
+ packuswb xmm4,xmm3
+ movdqa [esp+0A0h],xmm4
+ mov esi,dword [esp+10h]
+ movdqa xmm0,[esi]
+ movdqa xmm1,[esi+10h]
+ movdqa xmm2,[esi+20h]
+ movdqa xmm3,[esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+0Ch]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
;*******************************************************************************
-; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************
-
+
WELS_EXTERN DeblockChromaLt4H_sse2
-
+
ALIGN 16
DeblockChromaLt4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,108h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+10h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+6Ch]
- push edi
- mov dword [esp+0Ch],esi
- mov dword [esp+18h],ecx
- mov dword [esp+10h],edx
- mov dword [esp+1Ch],eax
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+10h]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+1Ch]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- mov eax,dword [ebp+1Ch]
- movsx cx,byte [eax+3]
- movsx dx,byte [eax+2]
- movsx si,byte [eax+1]
- movsx ax,byte [eax]
- movzx edi,cx
- movzx ecx,cx
- movd xmm2,ecx
- movzx ecx,dx
- movzx edx,dx
- movd xmm3,ecx
- movd xmm4,edx
- movzx ecx,si
- movzx edx,si
- movd xmm5,ecx
- pxor xmm0,xmm0
- movd xmm6,edx
- movzx ecx,ax
- movdqa [esp+60h],xmm0
- movzx edx,ax
- movsx eax,word [ebp+14h]
- punpcklwd xmm6,xmm2
- movd xmm1,edi
- movd xmm7,ecx
- movsx ecx,word [ebp+18h]
- movd xmm0,edx
- punpcklwd xmm7,xmm3
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+60h]
- punpcklwd xmm7,xmm5
- movdqa xmm5,[esp+0A0h]
- punpcklwd xmm0,xmm4
- punpcklwd xmm0,xmm6
- movdqa xmm6, [esp+70h]
- punpcklwd xmm0,xmm7
- movdqa xmm7,[esp+80h]
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+0D0h],xmm2
- movd xmm2,eax
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm4,xmm3,0
- movd xmm2,ecx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3, [esp+90h]
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- movdqa [esp+40h],xmm2
- movdqa [esp+0B0h],xmm6
- movdqa xmm6,[esp+90h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm2,xmm1
- punpcklbw xmm3,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0F0h],xmm7
- movdqa [esp+0C0h],xmm6
- movdqa xmm6, [esp+0A0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+0E0h],xmm6
- mov edx,4
- movsx eax,dx
- movd xmm6,eax
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa [esp+30h],xmm6
- movdqa xmm7, [esp+40h]
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa [esp+60h],xmm6
- movdqa xmm1, [esp+0D0h]
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6,[esp+30h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa xmm7,[esp+50h]
- movdqa [esp+20h],xmm0
- movdqa xmm6, [esp+20h]
- pminsw xmm6,xmm1
- movdqa [esp+20h],xmm6
- movdqa xmm6,xmm4
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+40h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1, [esp+50h]
- pand xmm6,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5, [esp+0B0h]
- psubw xmm5,[esp+0E0h]
- pand xmm6,xmm1
- pand xmm6, [esp+60h]
- movdqa xmm1, [esp+20h]
- pand xmm1,xmm6
- movdqa xmm6, [esp+0C0h]
- movdqa [esp+40h],xmm1
- movdqa xmm1, [esp+0F0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6, [esp+30h]
- movdqa xmm5, [esp+0D0h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+0C0h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+0B0h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6, [esp+0E0h]
- pand xmm4,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+60h]
- pand xmm0,xmm4
- movdqa xmm4, [esp+40h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm2,xmm1
- packuswb xmm3,xmm5
- movdqa [esp+80h],xmm2
- movdqa [esp+90h],xmm3
- mov esi,dword [esp+1Ch]
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+10h]
- movdqa xmm2, [esi+20h]
- movdqa xmm3, [esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+10h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-
-
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,108h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+10h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+6Ch]
+ push edi
+ mov dword [esp+0Ch],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+10h],edx
+ mov dword [esp+1Ch],eax
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+10h]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+1Ch]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ mov eax,dword [ebp+1Ch]
+ movsx cx,byte [eax+3]
+ movsx dx,byte [eax+2]
+ movsx si,byte [eax+1]
+ movsx ax,byte [eax]
+ movzx edi,cx
+ movzx ecx,cx
+ movd xmm2,ecx
+ movzx ecx,dx
+ movzx edx,dx
+ movd xmm3,ecx
+ movd xmm4,edx
+ movzx ecx,si
+ movzx edx,si
+ movd xmm5,ecx
+ pxor xmm0,xmm0
+ movd xmm6,edx
+ movzx ecx,ax
+ movdqa [esp+60h],xmm0
+ movzx edx,ax
+ movsx eax,word [ebp+14h]
+ punpcklwd xmm6,xmm2
+ movd xmm1,edi
+ movd xmm7,ecx
+ movsx ecx,word [ebp+18h]
+ movd xmm0,edx
+ punpcklwd xmm7,xmm3
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+60h]
+ punpcklwd xmm7,xmm5
+ movdqa xmm5,[esp+0A0h]
+ punpcklwd xmm0,xmm4
+ punpcklwd xmm0,xmm6
+ movdqa xmm6, [esp+70h]
+ punpcklwd xmm0,xmm7
+ movdqa xmm7,[esp+80h]
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+0D0h],xmm2
+ movd xmm2,eax
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm4,xmm3,0
+ movd xmm2,ecx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3, [esp+90h]
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa [esp+40h],xmm2
+ movdqa [esp+0B0h],xmm6
+ movdqa xmm6,[esp+90h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm2,xmm1
+ punpcklbw xmm3,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0F0h],xmm7
+ movdqa [esp+0C0h],xmm6
+ movdqa xmm6, [esp+0A0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+0E0h],xmm6
+ mov edx,4
+ movsx eax,dx
+ movd xmm6,eax
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa [esp+30h],xmm6
+ movdqa xmm7, [esp+40h]
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa [esp+60h],xmm6
+ movdqa xmm1, [esp+0D0h]
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6,[esp+30h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa xmm7,[esp+50h]
+ movdqa [esp+20h],xmm0
+ movdqa xmm6, [esp+20h]
+ pminsw xmm6,xmm1
+ movdqa [esp+20h],xmm6
+ movdqa xmm6,xmm4
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+40h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1, [esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5, [esp+0B0h]
+ psubw xmm5,[esp+0E0h]
+ pand xmm6,xmm1
+ pand xmm6, [esp+60h]
+ movdqa xmm1, [esp+20h]
+ pand xmm1,xmm6
+ movdqa xmm6, [esp+0C0h]
+ movdqa [esp+40h],xmm1
+ movdqa xmm1, [esp+0F0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6, [esp+30h]
+ movdqa xmm5, [esp+0D0h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+0C0h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+0B0h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6, [esp+0E0h]
+ pand xmm4,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+60h]
+ pand xmm0,xmm4
+ movdqa xmm4, [esp+40h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm2,xmm1
+ packuswb xmm3,xmm5
+ movdqa [esp+80h],xmm2
+ movdqa [esp+90h],xmm3
+ mov esi,dword [esp+1Ch]
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi+10h]
+ movdqa xmm2, [esi+20h]
+ movdqa xmm3, [esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+10h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+
+
;*******************************************************************************
-; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
; int32_t iBeta, int8_t * pTC)
;*******************************************************************************
-
+
WELS_EXTERN DeblockLumaLt4V_sse2
-
+
ALIGN 16
DeblockLumaLt4V_sse2:
@@ -1419,12 +1419,12 @@
;*******************************************************************************
-; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
; int32_t iBeta)
;*******************************************************************************
WELS_EXTERN DeblockLumaEq4V_sse2
-
+
ALIGN 16
DeblockLumaEq4V_sse2:
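DeblockLumaEq4V_sse2 implements the bS==4 strong filter. After the basic |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta gate (not repeated here), the p-side arithmetic reduces to the following scalar sketch; the q side is symmetric, names are illustrative, and the asm evaluates all edge positions in parallel:

    #include <stdint.h>
    #include <stdlib.h>

    /* strong filter, p side; pPix points at q0, rows iStride apart */
    static void FilterEq4P(uint8_t *pPix, int iStride, int iAlpha, int iBeta) {
        int q1 = pPix[iStride], q0 = pPix[0];
        int p0 = pPix[-iStride], p1 = pPix[-2 * iStride], p2 = pPix[-3 * iStride];
        if (abs(p0 - q0) < (iAlpha >> 2) + 2 && abs(p2 - p0) < iBeta) {
            pPix[-iStride]     = (uint8_t)((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3);
            pPix[-2 * iStride] = (uint8_t)((p2 + p1 + p0 + q0 + 2) >> 2);
            /* p2' additionally pulls in p3: (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 */
        } else {
            pPix[-iStride] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); /* weak fallback */
        }
    }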
@@ -1965,11 +1965,11 @@
mov esp, ebp
pop ebp
ret
-
-
+
+
;********************************************************************************
;
-; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
;********************************************************************************
@@ -1982,49 +1982,49 @@
push ebx
mov ebp, esp
and esp,0FFFFFFF0h
- sub esp, 10h
-
- mov eax, [ebp + 0Ch]
+ sub esp, 10h
+
+ mov eax, [ebp + 0Ch]
mov ecx, [ebp + 10h]
lea edx, [eax + ecx * 8]
lea ebx, [ecx*3]
-
- movq xmm0, [eax]
+
+ movq xmm0, [eax]
movq xmm7, [edx]
- punpcklqdq xmm0, xmm7
+ punpcklqdq xmm0, xmm7
movq xmm1, [eax + ecx]
movq xmm7, [edx + ecx]
punpcklqdq xmm1, xmm7
- movq xmm2, [eax + ecx*2]
+ movq xmm2, [eax + ecx*2]
movq xmm7, [edx + ecx*2]
punpcklqdq xmm2, xmm7
movq xmm3, [eax + ebx]
movq xmm7, [edx + ebx]
punpcklqdq xmm3, xmm7
-
+
lea eax, [eax + ecx * 4]
lea edx, [edx + ecx * 4]
- movq xmm4, [eax]
+ movq xmm4, [eax]
movq xmm7, [edx]
- punpcklqdq xmm4, xmm7
+ punpcklqdq xmm4, xmm7
movq xmm5, [eax + ecx]
movq xmm7, [edx + ecx]
punpcklqdq xmm5, xmm7
- movq xmm6, [eax + ecx*2]
+ movq xmm6, [eax + ecx*2]
movq xmm7, [edx + ecx*2]
punpcklqdq xmm6, xmm7
-
+
movdqa [esp], xmm0
movq xmm7, [eax + ebx]
movq xmm0, [edx + ebx]
punpcklqdq xmm7, xmm0
movdqa xmm0, [esp]
-
+
SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
+
mov eax, [ebp + 14h]
- movdqa [eax], xmm4
+ movdqa [eax], xmm4
movdqa [eax + 10h], xmm2
movdqa [eax + 20h], xmm3
movdqa [eax + 30h], xmm7
@@ -2031,15 +2031,15 @@
movdqa [eax + 40h], xmm5
movdqa [eax + 50h], xmm1
movdqa [eax + 60h], xmm6
- movdqa [eax + 70h], xmm0
-
+ movdqa [eax + 70h], xmm0
+
mov esp, ebp
pop ebx
pop ebp
ret
-
-
-
+
+
+
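The transpose helpers that follow bracket the horizontal-edge case: DeblockLumaTransposeH2V_sse2 gathers the 16x8 pixel block straddling a horizontal edge and transposes it into a contiguous 8x16 buffer, so the vertical-edge filter (which reads along rows) can be reused unchanged, and DeblockLumaTransposeV2H_sse2 scatters the filtered buffer back. A scalar sketch of the forward step, which the asm performs with punpck stages via SSE2_TransTwo8x8B:

    #include <stdint.h>

    static void TransposeH2V(const uint8_t *pPixY, int iStride, uint8_t *pDst) {
        for (int y = 0; y < 16; ++y)        /* 16 source rows...   */
            for (int x = 0; x < 8; ++x)     /* ...of 8 pixels each */
                pDst[x * 16 + y] = pPixY[y * iStride + x];
    }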
;*******************************************************************************************
;
; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
@@ -2053,14 +2053,14 @@
DeblockLumaTransposeV2H_sse2:
push ebp
mov ebp, esp
-
+
and esp, 0FFFFFFF0h
- sub esp, 10h
-
- mov eax, [ebp + 10h]
+ sub esp, 10h
+
+ mov eax, [ebp + 10h]
mov ecx, [ebp + 0Ch]
mov edx, [ebp + 08h]
-
+
movdqa xmm0, [eax]
movdqa xmm1, [eax + 10h]
movdqa xmm2, [eax + 20h]
@@ -2069,23 +2069,23 @@
movdqa xmm5, [eax + 50h]
movdqa xmm6, [eax + 60h]
movdqa xmm7, [eax + 70h]
-
+
SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
+
lea eax, [ecx * 3]
-
- movq [edx], xmm4
+
+ movq [edx], xmm4
movq [edx + ecx], xmm2
movq [edx + ecx*2], xmm3
movq [edx + eax], xmm7
-
+
lea edx, [edx + ecx*4]
- movq [edx], xmm5
+ movq [edx], xmm5
movq [edx + ecx], xmm1
movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
+ movq [edx + eax], xmm0
+
psrldq xmm4, 8
psrldq xmm2, 8
psrldq xmm3, 8
@@ -2094,20 +2094,20 @@
psrldq xmm1, 8
psrldq xmm6, 8
psrldq xmm0, 8
-
+
lea edx, [edx + ecx*4]
- movq [edx], xmm4
+ movq [edx], xmm4
movq [edx + ecx], xmm2
movq [edx + ecx*2], xmm3
movq [edx + eax], xmm7
-
+
lea edx, [edx + ecx*4]
- movq [edx], xmm5
+ movq [edx], xmm5
movq [edx + ecx], xmm1
movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
-
+ movq [edx + eax], xmm0
+
+
mov esp, ebp
pop ebp
ret
\ No newline at end of file

--- a/codec/decoder/core/asm/expand_picture.asm
+++ b/codec/decoder/core/asm/expand_picture.asm
@@ -155,11 +155,11 @@
lea %1, [%1+%2]
%endmacro
-%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
+%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
; ebx [width/16(8)]
; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16) ; top
; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16) ; bottom
-
+
%if %1 == 32 ; for luma
sar ebx, 04h ; width / 16(8) pixels
.top_bottom_loops:
@@ -173,7 +173,7 @@
mov_line_16x4_sse2 edi, ecx, xmm0, a
mov_line_16x4_sse2 edi, ecx, xmm0, a
mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
+
; bottom
movdqa xmm1, [eax] ; last line of picture pData
mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
@@ -184,15 +184,15 @@
mov_line_16x4_sse2 ebp, ecx, xmm1, a
mov_line_16x4_sse2 ebp, ecx, xmm1, a
mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
+
lea esi, [esi+16] ; top pSrc
lea edi, [edi+16] ; top dst
lea eax, [eax+16] ; bottom pSrc
lea ebp, [ebp+16] ; bottom dst
- neg ecx ; positive/negative stride need for next loop?
-
+ neg ecx ; positive/negative stride need for next loop?
+
dec ebx
- jnz near .top_bottom_loops
+ jnz near .top_bottom_loops
%elif %1 == 16 ; for chroma ??
mov edx, ebx
sar ebx, 04h ; (width / 16) pixels
@@ -202,21 +202,21 @@
mov_line_16x4_sse2 edi, ecx, xmm0, a ; dst, stride, xmm?
mov_line_16x4_sse2 edi, ecx, xmm0, a
mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
+ mov_line_end16x4_sse2 edi, ecx, xmm0, a
+
; bottom
movdqa xmm1, [eax] ; last line of picture pData
mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
mov_line_16x4_sse2 ebp, ecx, xmm1, a
mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
+ mov_line_end16x4_sse2 ebp, ecx, xmm1, a
+
lea esi, [esi+16] ; top pSrc
lea edi, [edi+16] ; top dst
lea eax, [eax+16] ; bottom pSrc
lea ebp, [ebp+16] ; bottom dst
- neg ecx ; positive/negative stride need for next loop?
-
+ neg ecx ; positive/negative stride need for next loop?
+
dec ebx
jnz near .top_bottom_loops
@@ -243,13 +243,13 @@
%endif
%endmacro
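exp_top_bottom_sse2 replicates the first and last picture rows into the vertical padding band (32 rows for luma, 16 for chroma), 16 columns per loop iteration. In scalar terms (illustrative sketch):

    #include <stdint.h>
    #include <string.h>

    static void ExpandTopBottom(uint8_t *pPic, int32_t kiStride, int32_t kiWidth,
                                int32_t kiHeight, int32_t kiPadding) {
        uint8_t *pBottom = pPic + (kiHeight - 1) * kiStride;
        for (int32_t i = 1; i <= kiPadding; ++i) {
            memcpy(pPic - i * kiStride, pPic, kiWidth);        /* rows above */
            memcpy(pBottom + i * kiStride, pBottom, kiWidth);  /* rows below */
        }
    }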
-%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
+%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
; ecx [height]
; esi [pSrc+0], edi [pSrc-32], edx [stride], 32(16) ; left
; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16) ; right
	; xor eax, eax ; for pixel pData (uint8_t) ; ensure at least the high 24 bits of eax are 0
-
-%if %1 == 32 ; for luma
+
+%if %1 == 32 ; for luma
.left_right_loops:
; left
mov al, byte [esi] ; pixel pData for left border
@@ -256,37 +256,37 @@
butterfly_1to16_sse xmm0, xmm1, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [edi], xmm0
movdqa [edi+16], xmm0
-
+
; right
mov al, byte [ebx]
butterfly_1to16_sse xmm1, xmm2, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [ebp], xmm1
movdqa [ebp+16], xmm1
-
+
lea esi, [esi+edx] ; left pSrc
lea edi, [edi+edx] ; left dst
lea ebx, [ebx+edx] ; right pSrc
- lea ebp, [ebp+edx] ; right dst
-
+ lea ebp, [ebp+edx] ; right dst
+
dec ecx
- jnz near .left_right_loops
-%elif %1 == 16 ; for chroma ??
+ jnz near .left_right_loops
+%elif %1 == 16 ; for chroma ??
.left_right_loops:
; left
mov al, byte [esi] ; pixel pData for left border
butterfly_1to16_sse xmm0, xmm1, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [edi], xmm0
-
+ movdqa [edi], xmm0
+
; right
mov al, byte [ebx]
butterfly_1to16_sse xmm1, xmm2, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
	movdq%2 [ebp], xmm1 ; may not be 16-byte aligned for chroma planes
-
+
lea esi, [esi+edx] ; left pSrc
lea edi, [edi+edx] ; left dst
lea ebx, [ebx+edx] ; right pSrc
- lea ebp, [ebp+edx] ; right dst
-
+ lea ebp, [ebp+edx] ; right dst
+
dec ecx
jnz near .left_right_loops
%endif
@@ -339,25 +339,25 @@
; TL
mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
mov_line_end16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
; TR
mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
mov_line_end16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
; BL
mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
mov_line_end16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
; BR
mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
mov_line_end16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
%endif
%endmacro
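exp_left_right_sse2 broadcasts each row's edge pixel sideways (butterfly_1to16_sse is the 1-byte-to-16 splat) and exp_cross_sse2 flood-fills the four corner rectangles from the corner pixels prepared in xmm3..xmm6. A scalar sketch of both, with illustrative names:

    #include <stdint.h>
    #include <string.h>

    static void ExpandLeftRightCross(uint8_t *pPic, int32_t kiStride,
                                     int32_t kiWidth, int32_t kiHeight,
                                     int32_t kiPadding) {
        for (int32_t y = 0; y < kiHeight; ++y) {               /* left/right */
            uint8_t *pRow = pPic + y * kiStride;
            memset(pRow - kiPadding, pRow[0], kiPadding);
            memset(pRow + kiWidth, pRow[kiWidth - 1], kiPadding);
        }
        uint8_t *pLast = pPic + (kiHeight - 1) * kiStride;
        for (int32_t y = 1; y <= kiPadding; ++y) {             /* 4 corners  */
            memset(pPic - y * kiStride - kiPadding, pPic[0], kiPadding);
            memset(pPic - y * kiStride + kiWidth, pPic[kiWidth - 1], kiPadding);
            memset(pLast + y * kiStride - kiPadding, pLast[0], kiPadding);
            memset(pLast + y * kiStride + kiWidth, pLast[kiWidth - 1], kiPadding);
        }
    }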
@@ -375,7 +375,7 @@
push esi
push edi
push ebp
-
+
; for both top and bottom border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; pDst
@@ -387,10 +387,10 @@
mov cl, byte [esi]
butterfly_1to16_sse xmm3, xmm4, c ; pDst, tmp, pSrc [generic register name: a/b/c/d]
; load top border
- mov ecx, edx ; kiStride
+ mov ecx, edx ; kiStride
neg ecx ; -kiStride
lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
+ ; load bottom border
dec eax ; h-1
imul eax, edx ; (h-1)*kiStride
lea eax, [esi+eax] ; last line of picture pData
@@ -398,16 +398,16 @@
lea ebp, [eax+edx] ; last line of bottom border, (h-1)*stride + 32 * stride
; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
dec ebx ; kiWidth-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
+ lea ebx, [eax+ebx] ; dst[w-1][h-1]
; xor edx, edx
mov dl, byte [eax] ; bottom-left
butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
mov dl, byte [ebx] ; bottom-right
butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
+ ; for top & bottom expanding
mov ebx, [esp+32] ; kiWidth
- exp_top_bottom_sse2 32
-
+ exp_top_bottom_sse2 32
+
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst: left border pSrc
@@ -419,7 +419,7 @@
lea edi, [esi+eax] ; left border dst
dec ebx
lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
+ lea ebp, [ebx+1] ; right border dst
; prepare for cross border pData: top-right with xmm4
; xor eax, eax
mov al, byte [ebx] ; top-right
@@ -426,7 +426,7 @@
butterfly_1to16_sse xmm4, xmm0, a ; pDst, tmp, pSrc [generic register name: a/b/c/d]
; for left & right border expanding
exp_left_right_sse2 32, a
-
+
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; pDst
@@ -436,7 +436,7 @@
	; xmm3,..,xmm6 cross pData initialized above; perform padding as below
mov eax, -32 ; luma=-32, chroma=-16
neg ecx ; -stride
- lea edi, [esi+eax]
+ lea edi, [esi+eax]
lea edi, [edi+ecx] ; last line of top-left border
lea ebp, [esi+ebx]
lea ebp, [ebp+ecx] ; last line of top-right border
@@ -444,19 +444,19 @@
mov ecx, [esp+28] ; kiStride
imul edx, ecx ; (height+32(16)) * stride
lea eax, [edi+edx] ; last line of bottom-left border
- lea ebx, [ebp+edx] ; last line of bottom-right border
+ lea ebx, [ebp+edx] ; last line of bottom-right border
neg ecx ; -kiStride
; for left & right border expanding
- exp_cross_sse2 32, a
-
+ exp_cross_sse2 32, a
+
; sfence ; commit cache write back memory
-
+
pop ebp
pop edi
pop esi
pop edx
pop ebx
-
+
ret
ALIGN 16
@@ -472,7 +472,7 @@
push esi
push edi
push ebp
-
+
; for both top and bottom border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; pDst
@@ -484,10 +484,10 @@
mov cl, byte [esi]
butterfly_1to16_sse xmm3, xmm4, c ; pDst, tmp, pSrc [generic register name: a/b/c/d]
; load top border
- mov ecx, edx ; kiStride
+ mov ecx, edx ; kiStride
neg ecx ; -kiStride
lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
+ ; load bottom border
dec eax ; h-1
imul eax, edx ; (h-1)*kiStride
lea eax, [esi+eax] ; last line of picture pData
@@ -495,16 +495,16 @@
lea ebp, [eax+edx] ; last line of bottom border, (h-1)*kiStride + 16 * kiStride
; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
dec ebx ; kiWidth-1
- lea ebx, [eax+ebx] ; pDst[w-1][h-1]
+ lea ebx, [eax+ebx] ; pDst[w-1][h-1]
; xor edx, edx
mov dl, byte [eax] ; bottom-left
butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
mov dl, byte [ebx] ; bottom-right
butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
+ ; for top & bottom expanding
mov ebx, [esp+32] ; kiWidth
- exp_top_bottom_sse2 16
-
+ exp_top_bottom_sse2 16
+
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; pDst: left border pSrc
@@ -516,7 +516,7 @@
lea edi, [esi+eax] ; left border dst
dec ebx
lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
+ lea ebp, [ebx+1] ; right border dst
; prepare for cross border pData: top-right with xmm4
; xor eax, eax
mov al, byte [ebx] ; top-right
@@ -523,7 +523,7 @@
butterfly_1to16_sse xmm4, xmm0, a ; pDst, tmp, pSrc [generic register name: a/b/c/d]
; for left & right border expanding
exp_left_right_sse2 16, a
-
+
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; pDst
@@ -533,9 +533,9 @@
	; xmm3,..,xmm6 cross pData initialized above; perform padding as below
mov eax, -16 ; chroma=-16
neg ecx ; -stride
- lea edi, [esi+eax]
+ lea edi, [esi+eax]
lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
+ lea ebp, [esi+ebx]
lea ebp, [ebp+ecx] ; last line of top-right border
mov ecx, [esp+28] ; kiStride
add edx, 16 ; height+16, luma=32, chroma=16
@@ -545,15 +545,15 @@
neg ecx ; -kiStride
; for left & right border expanding
exp_cross_sse2 16, a
-
+
; sfence ; commit cache write back memory
-
+
pop ebp
pop edi
pop esi
pop edx
pop ebx
-
+
ret
ALIGN 16
@@ -569,7 +569,7 @@
push esi
push edi
push ebp
-
+
; for both top and bottom border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; pDst
@@ -581,10 +581,10 @@
mov cl, byte [esi]
butterfly_1to16_sse xmm3, xmm4, c ; pDst, tmp, pSrc [generic register name: a/b/c/d]
; load top border
- mov ecx, edx ; kiStride
+ mov ecx, edx ; kiStride
neg ecx ; -kiStride
lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
+ ; load bottom border
dec eax ; h-1
imul eax, edx ; (h-1)*kiStride
lea eax, [esi+eax] ; last line of picture pData
@@ -592,16 +592,16 @@
lea ebp, [eax+edx] ; last line of bottom border, (h-1)*kiStride + 16 * kiStride
; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
dec ebx ; kiWidth-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
+ lea ebx, [eax+ebx] ; dst[w-1][h-1]
; xor edx, edx
mov dl, byte [eax] ; bottom-left
butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
mov dl, byte [ebx] ; bottom-right
butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
+ ; for top & bottom expanding
mov ebx, [esp+32] ; kiWidth
- exp_top_bottom_sse2 16
-
+ exp_top_bottom_sse2 16
+
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst: left border pSrc
@@ -613,7 +613,7 @@
lea edi, [esi+eax] ; left border dst
dec ebx
lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
+ lea ebp, [ebx+1] ; right border dst
; prepare for cross border pData: top-right with xmm4
; xor eax, eax
mov al, byte [ebx] ; top-right
@@ -620,7 +620,7 @@
butterfly_1to16_sse xmm4, xmm0, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
; for left & right border expanding
exp_left_right_sse2 16, u
-
+
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -630,9 +630,9 @@
	; xmm3,..,xmm6 cross pData initialized above; perform padding as below
neg ecx ; -kiStride
mov eax, -16 ; chroma=-16
- lea edi, [esi+eax]
+ lea edi, [esi+eax]
lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
+ lea ebp, [esi+ebx]
lea ebp, [ebp+ecx] ; last line of top-right border
mov ecx, [esp+28] ; kiStride
add edx, 16 ; kiHeight+16, luma=32, chroma=16
@@ -642,14 +642,14 @@
neg ecx ; -kiStride
; for left & right border expanding
exp_cross_sse2 16, u
-
+
; sfence ; commit cache write back memory
-
+
pop ebp
pop edi
pop esi
pop edx
pop ebx
-
+
ret
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -38,7 +38,7 @@
;* 18/09/2009 Created
;* 19/11/2010 Added
;* WelsI16x16LumaPredDcTop_sse2, WelsI16x16LumaPredDcNA_sse2,
-;* WelsIChromaPredDcLeft_mmx, WelsIChromaPredDcTop_sse2
+;* WelsIChromaPredDcLeft_mmx, WelsIChromaPredDcTop_sse2
;* and WelsIChromaPredDcNA_mmx
;*
;*
@@ -96,13 +96,13 @@
punpcklbw %1, %3
movdqa %3, %1
punpcklbw %1, %3
-
+
;add %4, %5
movd %2, [%4+%5-1]
movdqa %3, %2
punpcklbw %2, %3
movdqa %3, %2
- punpcklbw %2, %3
+ punpcklbw %2, %3
punpckldq %1, %2
%endmacro
@@ -116,24 +116,24 @@
movd %2, [%5+%6]
punpcklbw %3, %2
punpcklwd %1, %3
- lea %5, [%5+2*%6]
+ lea %5, [%5+2*%6]
movd %4, [%5]
movd %2, [%5+%6]
punpcklbw %4, %2
- lea %5, [%5+2*%6]
+ lea %5, [%5+2*%6]
movd %3, [%5]
movd %2, [%5+%6]
lea %5, [%5+2*%6]
punpcklbw %3, %2
punpcklwd %4, %3
- punpckhdq %1, %4
-%endmacro
+ punpckhdq %1, %4
+%endmacro
%macro SUMW_HORIZON 3
movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
@@ -162,7 +162,7 @@
movd %2, [%5+%6]
punpcklbw %3, %2
punpckhwd %1, %3
- lea %5, [%5+2*%6]
+ lea %5, [%5+2*%6]
%endmacro
%macro LOAD_2_LEFT_AND_ADD 0
@@ -186,7 +186,7 @@
ALIGN 16
;*******************************************************************************
; void_t __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
-;
+;
; pPred must align to 16
;*******************************************************************************
WelsI4x4LumaPredH_sse2:
@@ -196,7 +196,7 @@
movzx edx, byte [eax-1]
movd xmm0, edx
pmuludq xmm0, [mmx_01bytes]
-
+
movzx edx, byte [eax+ecx-1]
movd xmm1, edx
pmuludq xmm1, [mmx_01bytes]
@@ -205,11 +205,11 @@
movzx edx, byte [eax+ecx-1]
movd xmm2, edx
pmuludq xmm2, [mmx_01bytes]
-
+
movzx edx, byte [eax+2*ecx-1]
- movd xmm3, edx
+ movd xmm3, edx
pmuludq xmm3, [mmx_01bytes]
-
+
sub eax, ecx
movd [eax], xmm0
movd [eax+ecx], xmm1
@@ -216,9 +216,9 @@
lea eax, [eax+2*ecx]
movd [eax], xmm2
movd [eax+ecx], xmm3
-
+
ret
-
+
;*******************************************************************************
; void_t WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
@@ -229,9 +229,9 @@
mov ecx, [esp + pushsize + 8]
sub esi, 1
sub esi, ecx
-
+
;for H
- pxor xmm7, xmm7
+ pxor xmm7, xmm7
movq xmm0, [esi]
movdqa xmm5, [sse2_plane_dec]
punpcklbw xmm0, xmm7
@@ -241,7 +241,7 @@
punpcklbw xmm1, xmm7
pmullw xmm1, xmm6
psubw xmm1, xmm0
-
+
SUMW_HORIZON xmm1,xmm0,xmm2
movd eax, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
movsx eax, ax
@@ -249,26 +249,26 @@
add eax, 32
sar eax, 6 ; b = (5 * H + 32) >> 6;
SSE2_Copy8Times xmm1, eax ; xmm1 = b,b,b,b,b,b,b,b
-
- movzx edx, BYTE [esi+16]
+
+ movzx edx, BYTE [esi+16]
sub esi, 3
LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, esi, ecx
-
+
add esi, 3
movzx eax, BYTE [esi+8*ecx]
add edx, eax
shl edx, 4 ; a = (left[15*kiStride] + top[15]) << 4;
-
+
sub esi, 3
add esi, ecx
LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, esi, ecx
- pxor xmm4, xmm4
+ pxor xmm4, xmm4
punpckhbw xmm0, xmm4
pmullw xmm0, xmm5
punpckhbw xmm7, xmm4
pmullw xmm7, xmm6
psubw xmm7, xmm0
-
+
SUMW_HORIZON xmm7,xmm0,xmm2
movd eax, xmm7 ; V
movsx eax, ax
@@ -276,17 +276,17 @@
imul eax, 5
add eax, 32
sar eax, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, eax ; xmm4 = c,c,c,c,c,c,c,c
-
+ SSE2_Copy8Times xmm4, eax ; xmm4 = c,c,c,c,c,c,c,c
+
mov esi, [esp + pushsize + 4]
add edx, 16
imul eax, -7
- add edx, eax ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
-
+ add edx, eax ; s = a + 16 + (-7)*c
+ SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
+
xor eax, eax
movdqa xmm5, [sse2_plane_inc_minus]
-
+
get_i16x16_luma_pred_plane_sse2_1:
movdqa xmm2, xmm1
pmullw xmm2, xmm5
@@ -295,7 +295,7 @@
movdqa xmm3, xmm1
pmullw xmm3, xmm6
paddw xmm3, xmm0
- psraw xmm3, 5
+ psraw xmm3, 5
packuswb xmm2, xmm3
movdqa [esi], xmm2
paddw xmm0, xmm4
@@ -302,13 +302,13 @@
add esi, ecx
inc eax
cmp eax, 16
- jnz get_i16x16_luma_pred_plane_sse2_1
-
+ jnz get_i16x16_luma_pred_plane_sse2_1
+
pop esi
ret
-
-
-
+
+
+
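The plane predictor above follows the H.264 16x16 plane mode, and the inline comments already name the intermediate terms. Pulled together into scalar form (illustrative sketch; the psraw 5 plus packuswb supply the final shift and saturation):

    #include <stdint.h>

    static void I16x16PredPlane(uint8_t *pPred, int32_t kiStride) {
        const uint8_t *pTop = pPred - kiStride;
        const uint8_t *pLeft = pPred - 1;
        int32_t iH = 0, iV = 0;
        for (int i = 0; i < 8; ++i) {
            iH += (i + 1) * (pTop[8 + i] - pTop[6 - i]);
            iV += (i + 1) * (pLeft[(8 + i) * kiStride] - pLeft[(6 - i) * kiStride]);
        }
        int32_t iA = (pTop[15] + pLeft[15 * kiStride]) << 4;
        int32_t iB = (5 * iH + 32) >> 6;   /* b = (5*H + 32) >> 6 */
        int32_t iC = (5 * iV + 32) >> 6;   /* c = (5*V + 32) >> 6 */
        for (int y = 0; y < 16; ++y)
            for (int x = 0; x < 16; ++x) {
                int32_t iVal = (iA + 16 + iB * (x - 7) + iC * (y - 7)) >> 5;
                pPred[y * kiStride + x] =
                    (uint8_t)(iVal < 0 ? 0 : iVal > 255 ? 255 : iVal);
            }
    }

Note that pTop[6 - i] and pLeft[(6 - i) * kiStride] reach the corner sample at i == 7, matching the esi = pPred - 1 - kiStride setup in the asm.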
;*******************************************************************************
; void_t WelsI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
@@ -315,7 +315,7 @@
%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 0
lea eax, [eax+ecx*2]
-
+
COPY_16_TIMES eax, xmm0
movdqa [eax], xmm0
COPY_16_TIMESS eax, xmm0, ecx
@@ -326,13 +326,12 @@
WelsI16x16LumaPredH_sse2:
mov eax, [esp+4] ; pPred
mov ecx, [esp+8] ; kiStride
-
+
COPY_16_TIMES eax, xmm0
movdqa [eax], xmm0
COPY_16_TIMESS eax, xmm0, ecx
movdqa [eax+ecx], xmm0
-
- SSE2_PRED_H_16X16_TWO_LINE_DEC
+
SSE2_PRED_H_16X16_TWO_LINE_DEC
SSE2_PRED_H_16X16_TWO_LINE_DEC
SSE2_PRED_H_16X16_TWO_LINE_DEC
@@ -339,9 +338,10 @@
SSE2_PRED_H_16X16_TWO_LINE_DEC
SSE2_PRED_H_16X16_TWO_LINE_DEC
SSE2_PRED_H_16X16_TWO_LINE_DEC
-
+ SSE2_PRED_H_16X16_TWO_LINE_DEC
+
ret
-
+
;*******************************************************************************
; void_t WelsI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
@@ -349,10 +349,10 @@
WelsI16x16LumaPredV_sse2:
mov edx, [esp+4] ; pPred
mov ecx, [esp+8] ; kiStride
-
+
sub edx, ecx
movdqa xmm0, [edx]
-
+
movdqa [edx+ecx], xmm0
lea edx, [edx+2*ecx]
movdqa [edx], xmm0
@@ -377,9 +377,9 @@
movdqa [edx+ecx], xmm0
lea edx, [edx+2*ecx]
movdqa [edx], xmm0
-
+
ret
-
+
;*******************************************************************************
; void_t WelsIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
@@ -391,8 +391,8 @@
mov ecx, [esp + pushsize + 8] ;kiStride
sub esi, 1
sub esi, ecx
-
- pxor mm7, mm7
+
+ pxor mm7, mm7
movq mm0, [esi]
movq mm5, [sse2_plane_dec_c]
punpcklbw mm0, mm7
@@ -402,7 +402,7 @@
punpcklbw mm1, mm7
pmullw mm1, mm6
psubw mm1, mm0
-
+
movq2dq xmm1, mm1
pxor xmm2, xmm2
SUMW_HORIZON xmm1,xmm0,xmm2
@@ -412,7 +412,7 @@
add eax, 16
sar eax, 5 ; b = (17 * H + 16) >> 5;
SSE2_Copy8Times xmm1, eax ; mm1 = b,b,b,b,b,b,b,b
-
+
movzx edx, BYTE [esi+8]
sub esi, 3
LOAD_COLUMN_C mm0, mm2, mm3, mm4, esi, ecx
@@ -421,17 +421,17 @@
movzx eax, BYTE [esi+4*ecx]
add edx, eax
shl edx, 4 ; a = (left[7*kiStride] + top[7]) << 4;
-
+
sub esi, 3
add esi, ecx
LOAD_COLUMN_C mm7, mm2, mm3, mm4, esi, ecx
- pxor mm4, mm4
+ pxor mm4, mm4
punpckhbw mm0, mm4
pmullw mm0, mm5
punpckhbw mm7, mm4
pmullw mm7, mm6
psubw mm7, mm0
-
+
movq2dq xmm7, mm7
pxor xmm2, xmm2
SUMW_HORIZON xmm7,xmm0,xmm2
@@ -441,17 +441,17 @@
imul eax, 17
add eax, 16
sar eax, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, eax ; mm4 = c,c,c,c,c,c,c,c
-
+ SSE2_Copy8Times xmm4, eax ; mm4 = c,c,c,c,c,c,c,c
+
mov esi, [esp + pushsize + 4]
add edx, 16
imul eax, -3
- add edx, eax ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
-
+ add edx, eax ; s = a + 16 + (-3)*c
+ SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
+
xor eax, eax
movdqa xmm5, [sse2_plane_mul_b_c]
-
+
get_i_chroma_pred_plane_sse2_1:
movdqa xmm2, xmm1
pmullw xmm2, xmm5
@@ -463,12 +463,12 @@
add esi, ecx
inc eax
cmp eax, 8
- jnz get_i_chroma_pred_plane_sse2_1
-
+ jnz get_i_chroma_pred_plane_sse2_1
+
pop esi
WELSEMMS
- ret
-
+ ret
+
ALIGN 16
;*******************************************************************************
; 0 |1 |2 |3 |4 |
@@ -480,13 +480,13 @@
; pPred[7] = ([6]+[0]*2+[1]+2)/4
;
; void_t __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
-;
+;
;*******************************************************************************
-WelsI4x4LumaPredDDR_mmx:
+WelsI4x4LumaPredDDR_mmx:
mov edx,[esp+4] ;pPred
mov eax,edx
mov ecx,[esp+8] ;kiStride
-
+
	movq mm1,[eax+ecx-8]       ;load pixel 11 (the -8 offset is meant to speed up the movq), mm1[8] = 11
	movq mm2,[eax-8]           ;load pixel 6, mm2[8] = 6
	sub eax, ecx               ;move eax to the line above the current block (position of 1)
@@ -513,19 +513,19 @@
pand mm1,[mmx_01bytes] ;set the odd bit
psubusb mm3,mm1 ;decrease 1 from odd bytes
pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
-
+
lea edx,[edx+ecx]
- movd [edx+2*ecx],mm2
+ movd [edx+2*ecx],mm2
sub edx,ecx
- psrlq mm2,8
- movd [edx+2*ecx],mm2
- psrlq mm2,8
- movd [edx+ecx],mm2
- psrlq mm2,8
+ psrlq mm2,8
+ movd [edx+2*ecx],mm2
+ psrlq mm2,8
+ movd [edx+ecx],mm2
+ psrlq mm2,8
movd [edx],mm2
WELSEMMS
ret
-
+
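The "find odd value / set the odd bit / decrease 1 from odd bytes" sequences in WelsI4x4LumaPredDDR_mmx above (and in the HD/HU/VR/DDL/VL predictors further down) are the standard pavgb trick for evaluating the 3-tap rounding filter (a + 2*b + c + 2) >> 2 without widening to 16 bits. A scalar model of the identity being exploited:

    /* pavgb(x, y) = (x + y + 1) >> 1; subtracting the low bit of (a ^ c)
       turns the first average into floor((a + c) / 2), after which a
       second average against b yields exactly (a + 2*b + c + 2) >> 2. */
    static unsigned char Filt3(unsigned char a, unsigned char b, unsigned char c) {
        unsigned t = ((a + c + 1u) >> 1) - ((a ^ c) & 1u);
        return (unsigned char)((b + t + 1u) >> 1);
    }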
ALIGN 16
;*******************************************************************************
; 0 |1 |2 |3 |4 |
@@ -537,36 +537,36 @@
; pPred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
;
; void_t __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
-;
+;
;*******************************************************************************
-WelsI4x4LumaPredDc_sse2:
+WelsI4x4LumaPredDc_sse2:
mov eax,[esp+4] ;pPred
mov ecx,[esp+8] ;kiStride
push ebx
-
+
movzx edx, byte [eax-1h]
-
+
sub eax, ecx
movd xmm0, [eax]
pxor xmm1, xmm1
psadbw xmm0, xmm1
-
+
movd ebx, xmm0
add ebx, edx
-
+
movzx edx, byte [eax+ecx*2-1h]
add ebx, edx
-
+
lea eax, [eax+ecx*2-1]
movzx edx, byte [eax+ecx]
add ebx, edx
-
+
movzx edx, byte [eax+ecx*2]
add ebx, edx
add ebx, 4
sar ebx, 3
imul ebx, 0x01010101
-
+
mov edx, [esp+8] ;pPred
mov [edx], ebx
mov [edx+ecx], ebx
@@ -575,8 +575,8 @@
mov [edx+ecx], ebx
pop ebx
- ret
-
+ ret
+
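WelsI4x4LumaPredDc_sse2 above sums the four top neighbours with a single psadbw, adds the four left neighbours byte by byte, then broadcasts the rounded DC across each row with imul 0x01010101. Scalar sketch (illustrative names):

    #include <stdint.h>
    #include <string.h>

    static void I4x4PredDc(uint8_t *pPred, int32_t kiStride) {
        int32_t iSum = 4;                       /* rounding term */
        for (int i = 0; i < 4; ++i) {
            iSum += pPred[i - kiStride];        /* top row       */
            iSum += pPred[i * kiStride - 1];    /* left column   */
        }
        uint8_t uiDc = (uint8_t)(iSum >> 3);
        for (int y = 0; y < 4; ++y)
            memset(pPred + y * kiStride, uiDc, 4);
    }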
ALIGN 16
;*******************************************************************************
; void_t __cdecl WelsIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -585,7 +585,7 @@
%macro MMX_PRED_H_8X8_ONE_LINE 4
movq %1, [%3-8]
psrlq %1, 38h
-
+
pmullw %1, [mmx_01bytes]
pshufw %1, %1, 0
movq [%4], %1
@@ -594,7 +594,7 @@
%macro MMX_PRED_H_8X8_ONE_LINEE 4
movq %1, [%3+ecx-8]
psrlq %1, 38h
-
+
pmullw %1, [mmx_01bytes]
pshufw %1, %1, 0
movq [%4], %1
@@ -605,37 +605,37 @@
mov edx, [esp+4] ;pPred
mov eax, edx
mov ecx, [esp+8] ;kiStride
-
+
movq mm0, [eax-8]
psrlq mm0, 38h
-
+
pmullw mm0, [mmx_01bytes]
pshufw mm0, mm0, 0
movq [edx], mm0
-
+
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-
+
lea eax, [eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx
-
+
lea edx, [edx+2*ecx]
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-
+
lea eax, [eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx
-
+
lea edx, [edx+2*ecx]
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-
+
lea eax, [eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx
lea edx, [edx+2*ecx]
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-
+
WELSEMMS
- ret
-
+ ret
+
ALIGN 16
;*******************************************************************************
; void_t __cdecl get_i4x4_luma_pred_v_asm(uint8_t *pPred, const int32_t kiStride)
@@ -645,7 +645,7 @@
get_i4x4_luma_pred_v_asm:
mov eax, [esp+4] ;pPred
mov ecx, [esp+8] ;kiStride
-
+
sub eax, ecx
mov edx, [eax]
mov [eax+ecx], edx
@@ -653,9 +653,9 @@
lea eax, [eax+2*ecx]
mov [eax+ecx], edx
mov [eax+2*ecx], edx
-
- ret
+ ret
+
ALIGN 16
;*******************************************************************************
; void_t __cdecl WelsIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -665,7 +665,7 @@
WelsIChromaPredV_mmx:
mov eax, [esp+4] ;pPred
mov ecx, [esp+8] ;kiStride
-
+
sub eax, ecx
movq mm0, [eax]
@@ -680,11 +680,11 @@
lea eax, [eax+2*ecx]
movq [eax+ecx], mm0
movq [eax+2*ecx], mm0
-
+
WELSEMMS
ret
-
-
+
+
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|
@@ -710,13 +710,13 @@
; f = (2 + l1 + (l0<<1) + lt)>>2
; h = (2 + l2 + (l1<<1) + l0)>>2
-; j = (2 + l3 + (l2<<1) + l1)>>2
+; j = (2 + l3 + (l2<<1) + l1)>>2
; [b a f e h g j i] + [d c b a] --> mov to memory
-;
+;
; void_t WelsI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:
+WelsI4x4LumaPredHD_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
@@ -723,16 +723,16 @@
sub eax, ecx
movd mm0, [eax-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-
- movd mm1, [eax+2*ecx-4]
- punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
+
+ movd mm1, [eax+2*ecx-4]
+ punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
lea eax, [eax+2*ecx]
- movd mm2, [eax+2*ecx-4]
+ movd mm2, [eax+2*ecx-4]
punpcklbw mm2, [eax+ecx-4] ; mm2[7] = l2, mm2[6] = l3
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
psrlq mm2, 20h
pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-
+
movq mm1, mm0
psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
movq mm2, mm0
@@ -740,17 +740,17 @@
movq mm3, mm2
movq mm4, mm1
pavgb mm1, mm0
-
+
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
pand mm4, [mmx_01bytes] ; set the odd bit
psubusb mm1, mm4 ; decrease 1 from odd bytes
-
+
pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
-
+
movq mm4, mm0
pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
-
+
psrlq mm2, 20h
psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
movq mm4, mm3
@@ -757,7 +757,7 @@
psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
-
+
movd [edx], mm2
lea edx, [edx+ecx]
movd [edx+2*ecx], mm3
@@ -768,9 +768,9 @@
movd [edx+ecx], mm3
WELSEMMS
ret
-
-
-
+
+
+
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|
@@ -793,17 +793,17 @@
; b = (2 + l0 + (l1<<1) + l2)>>2
; d = (2 + l1 + (l2<<1) + l3)>>2
; f = (2 + l2 + (l3<<1) + l3)>>2
-
+
; [g g f e d c b a] + [g g g g] --> mov to memory
-;
+;
; void_t WelsI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:
+WelsI4x4LumaPredHU_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
-
+
movd mm0, [eax-4] ; mm0[3] = l0
punpcklbw mm0, [eax+ecx-4] ; mm0[7] = l1, mm0[6] = l0
lea eax, [eax+2*ecx]
@@ -811,39 +811,39 @@
movd mm4, [eax+ecx-4] ; mm4[3] = l3
punpcklbw mm2, mm4
punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-
+
psrlq mm4, 18h
psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
psrlq mm0, 8h
pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-
+
movq mm1, mm0
psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
-
+
movq mm2, mm0
psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
movq mm5, mm2
pavgb mm2, mm0
-
+
pxor mm5, mm0 ; find odd value in the lowest bit of each byte
pand mm5, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm5 ; decrease 1 from odd bytes
-
+
pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
-
+
psrlq mm2, 8h
pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
-
+
punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
-
+
psrlq mm4, 20h
lea edx, [edx+ecx]
movd [edx+2*ecx], mm4
-
+
sub edx, ecx
movd [edx], mm1
psrlq mm1, 10h
@@ -852,9 +852,9 @@
movd [edx+2*ecx], mm1
WELSEMMS
ret
-
-
-
+
+
+
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|
@@ -880,12 +880,12 @@
; h = (2 + t1 + (t2<<1) + t3)>>2
; i = (2 + lt + (l0<<1) + l1)>>2
-; j = (2 + l0 + (l1<<1) + l2)>>2
-;
+; j = (2 + l0 + (l1<<1) + l2)>>2
+;
; void_t WelsI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:
+WelsI4x4LumaPredVR_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
@@ -892,51 +892,51 @@
sub eax, ecx
movq mm0, [eax-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-
- movd mm1, [eax+2*ecx-4]
- punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
+
+ movd mm1, [eax+2*ecx-4]
+ punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
lea eax, [eax+2*ecx]
movq mm2, [eax+ecx-8] ; mm2[7] = l2
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
psrlq mm2, 28h
pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-
+
movq mm1, mm0
psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
-
+
movq mm2, mm0
psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
movq mm3, mm2
pavgb mm2, mm0
-
+
pxor mm3, mm0 ; find odd value in the lowest bit of each byte
pand mm3, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm3 ; decrease 1 from odd bytes
-
+
movq mm3, mm0
psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
movq mm2, mm3
-
+
psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
movd [edx], mm1
-
+
psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
movd [edx+ecx], mm2
-
+
movq mm4, mm3
psllq mm4, 20h
psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
-
+
movq mm5, mm3
psllq mm5, 28h
psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
-
+
psllq mm1, 8h
pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
movd [edx+2*ecx], mm4
-
+
psllq mm2, 8h
pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
lea edx, [edx+2*ecx]
@@ -943,7 +943,7 @@
movd [edx+ecx], mm5
WELSEMMS
ret
-
+
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -966,13 +966,13 @@
; e = (2 + t4 + t6 + (t5<<1))>>2
; f = (2 + t5 + t7 + (t6<<1))>>2
; g = (2 + t6 + t7 + (t7<<1))>>2
-
+
; [g f e d c b a] --> mov to memory
-;
+;
; void_t WelsI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:
+WelsI4x4LumaPredDDL_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
@@ -980,11 +980,11 @@
movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
-
+
movq mm3, mm0
psrlq mm3, 38h
psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
-
+
psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
psrlq mm2, 8h
pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
@@ -994,9 +994,9 @@
pxor mm3, mm2 ; find odd value in the lowest bit of each byte
pand mm3, [mmx_01bytes] ; set the odd bit
psubusb mm1, mm3 ; decrease 1 from odd bytes
-
+
pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
-
+
psrlq mm0, 8h
movd [edx], mm0
psrlq mm0, 8h
@@ -1008,8 +1008,8 @@
movd [edx+ecx], mm0
WELSEMMS
ret
-
-
+
+
ALIGN 16
;*******************************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -1035,40 +1035,40 @@
; g = (2 + t2 + (t3<<1) + t4)>>2
; h = (2 + t3 + (t4<<1) + t5)>>2
; j = (2 + t4 + (t5<<1) + t6)>>2
-
+
; [i d c b a] + [j h g f e] --> mov to memory
-;
+;
; void_t WelsI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:
+WelsI4x4LumaPredVL_mmx:
mov edx, [esp+4] ; pPred
mov eax, edx
mov ecx, [esp+8] ; kiStride
-
+
sub eax, ecx
movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
-
+
psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
movq mm3, mm1
pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
-
+
movq mm4, mm2
- pavgb mm2, mm0
+ pavgb mm2, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
pand mm4, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm4 ; decrease 1 from odd bytes
-
+
pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
-
+
movd [edx], mm3
psrlq mm3, 8h
movd [edx+2*ecx], mm3
-
+
movd [edx+ecx], mm2
psrlq mm2, 8h
lea edx, [edx+2*ecx]
@@ -1075,7 +1075,7 @@
movd [edx+ecx], mm2
WELSEMMS
ret
-
+
ALIGN 16
;*******************************************************************************
;
@@ -1082,11 +1082,11 @@
; void_t WelsIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:
+WelsIChromaPredDc_sse2:
push ebx
mov eax, [esp+8] ; pPred
mov ecx, [esp+12] ; kiStride
-
+
sub eax, ecx
movq mm0, [eax]
@@ -1100,7 +1100,7 @@
movzx edx, byte [eax-0x01] ; l4
add ebx, edx
movd mm1, ebx ; mm1 = l1+l2+l3+l4
-
+
movzx ebx, byte [eax+ecx-0x01] ; l5
lea eax, [eax+2*ecx]
movzx edx, byte [eax-0x01] ; l6
@@ -1111,7 +1111,7 @@
movzx edx, byte [eax-0x01] ; l8
add ebx, edx
movd mm2, ebx ; mm2 = l5+l6+l7+l8
-
+
movq mm3, mm0
psrlq mm0, 0x20
psllq mm3, 0x20
@@ -1118,46 +1118,46 @@
psrlq mm3, 0x20
pxor mm4, mm4
psadbw mm0, mm4
- psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
-
+ psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
paddq mm3, mm1
movq mm1, mm2
paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-
+
movq mm4, [mmx_0x02]
-
+
paddq mm0, mm4
psrlq mm0, 0x02
-
+
paddq mm2, mm4
psrlq mm2, 0x02
-
+
paddq mm3, mm4
paddq mm3, mm4
psrlq mm3, 0x03
-
+
paddq mm1, mm4
paddq mm1, mm4
psrlq mm1, 0x03
-
+
pmuludq mm0, [mmx_01bytes]
pmuludq mm3, [mmx_01bytes]
psllq mm0, 0x20
pxor mm0, mm3 ; mm0 = m_up
-
+
pmuludq mm2, [mmx_01bytes]
pmuludq mm1, [mmx_01bytes]
psllq mm1, 0x20
pxor mm1, mm2 ; mm2 = m_down
-
+
mov edx, [esp+8] ; pPred
-
+
movq [edx], mm0
movq [edx+ecx], mm0
movq [edx+2*ecx], mm0
lea edx, [edx+2*ecx]
movq [edx+ecx], mm0
-
+
movq [edx+2*ecx], mm1
lea edx, [edx+2*ecx]
movq [edx+ecx], mm1
@@ -1164,13 +1164,13 @@
movq [edx+2*ecx], mm1
lea edx, [edx+2*ecx]
movq [edx+ecx], mm1
-
+
pop ebx
WELSEMMS
ret
-
-
-
+
+
+
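In WelsIChromaPredDc_sse2 the four accumulators map to the four 4x4 quadrants of the 8x8 chroma block, each with its own DC rule; the pmuludq/psllq/pxor pairs then splat the two per-quadrant bytes into the m_up and m_down row patterns written out at the end. Scalar sketch of the quadrant rules (illustrative names):

    #include <stdint.h>
    #include <string.h>

    static void IChromaPredDc(uint8_t *pPred, int32_t kiStride) {
        const uint8_t *pTop = pPred - kiStride;
        int32_t iT0 = 0, iT1 = 0, iL0 = 0, iL1 = 0;
        for (int i = 0; i < 4; ++i) {
            iT0 += pTop[i];                    iT1 += pTop[4 + i];
            iL0 += pPred[i * kiStride - 1];    iL1 += pPred[(4 + i) * kiStride - 1];
        }
        uint8_t uiDc[2][2] = {
            { (uint8_t)((iT0 + iL0 + 4) >> 3), (uint8_t)((iT1 + 2) >> 2) },
            { (uint8_t)((iL1 + 2) >> 2), (uint8_t)((iT1 + iL1 + 4) >> 3) },
        };
        for (int y = 0; y < 8; ++y) {
            memset(pPred + y * kiStride,     uiDc[y >> 2][0], 4);
            memset(pPred + y * kiStride + 4, uiDc[y >> 2][1], 4);
        }
    }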
ALIGN 16
;*******************************************************************************
;
@@ -1177,11 +1177,11 @@
; void_t WelsI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
;*******************************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:
+WelsI16x16LumaPredDc_sse2:
push ebx
mov eax, [esp+8] ; pPred
mov ecx, [esp+12] ; kiStride
-
+
sub eax, ecx
movdqa xmm0, [eax] ; read one row
pxor xmm1, xmm1
@@ -1191,7 +1191,7 @@
pslldq xmm0, 0x08
psrldq xmm0, 0x08
paddw xmm0, xmm1
-
+
movzx ebx, byte [eax+ecx-0x01]
movzx edx, byte [eax+2*ecx-0x01]
add ebx, edx
@@ -1209,44 +1209,44 @@
psrld xmm0, 0x05
pmuludq xmm0, [mmx_01bytes]
pshufd xmm0, xmm0, 0
-
+
mov edx, [esp+8] ; pPred
-
+
movdqa [edx], xmm0
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
movdqa [edx+2*ecx], xmm0
lea edx, [edx+2*ecx]
-
+
movdqa [edx+ecx], xmm0
pop ebx
ret
-
+
;*******************************************************************************
; for intra prediction as follows, 11/19/2010
;*******************************************************************************
@@ -1258,12 +1258,12 @@
WELS_EXTERN WelsI16x16LumaPredDcTop_sse2
WelsI16x16LumaPredDcTop_sse2:
push ebx
-
+
%define PUSH_SIZE 4
-
+
mov eax, [esp+PUSH_SIZE+4] ; pPred
mov ebx, [esp+PUSH_SIZE+8] ; kiStride
-
+
mov ecx, ebx
neg ecx
movdqa xmm0, [eax+ecx] ; pPred-kiStride, top line
@@ -1278,10 +1278,10 @@
pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
pshuflw xmm1, xmm0, 0b1h ; 10110001
- paddw xmm0, xmm1 ; sum in word unit (x8)
+ paddw xmm0, xmm1 ; sum in word unit (x8)
movd edx, xmm0
and edx, 0ffffh
-
+
add edx, 08h
sar edx, 04h
mov dh, dl
@@ -1288,35 +1288,35 @@
mov ecx, edx
shl ecx, 010h
or edx, ecx
- movd xmm1, edx
+ movd xmm1, edx
pshufd xmm0, xmm1, 00h
movdqa xmm1, xmm0
-
+
lea ecx, [2*ebx+ebx] ; 3*kiStride
-
+
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
-
+
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
-
+
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
-
+
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
-
+
%undef PUSH_SIZE
pop ebx
ret
@@ -1328,41 +1328,41 @@
WELS_EXTERN WelsI16x16LumaPredDcNA_sse2
WelsI16x16LumaPredDcNA_sse2:
push ebx
-
+
%define PUSH_SIZE 4
-
+
mov eax, [esp+PUSH_SIZE+4] ; pPred
- mov ebx, [esp+PUSH_SIZE+8] ; kiStride
-
+ mov ebx, [esp+PUSH_SIZE+8] ; kiStride
+
lea ecx, [2*ebx+ebx] ; 3*kiStride
-
+
movdqa xmm0, [sse2_dc_0x80]
- movdqa xmm1, xmm0
+ movdqa xmm1, xmm0
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
+ movdqa [eax+ecx], xmm1
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
+ movdqa [eax+ecx], xmm1
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
+ movdqa [eax+ecx], xmm1
lea eax, [eax+4*ebx]
movdqa [eax], xmm0
movdqa [eax+ebx], xmm1
movdqa [eax+2*ebx], xmm0
movdqa [eax+ecx], xmm1
-
+
%undef PUSH_SIZE
-
+
pop ebx
ret
-
+
ALIGN 16
;*******************************************************************************
; void_t WelsIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -1370,12 +1370,12 @@
WELS_EXTERN WelsIChromaPredDcLeft_mmx
WelsIChromaPredDcLeft_mmx:
push ebx
- push esi
+ push esi
%define PUSH_SIZE 8
mov esi, [esp+PUSH_SIZE+4] ; pPred
mov ecx, [esp+PUSH_SIZE+8] ; kiStride
mov eax, esi
- ; for left
+ ; for left
dec eax
xor ebx, ebx
xor edx, edx
@@ -1384,7 +1384,7 @@
add ebx, edx
lea eax, [eax+2*ecx]
mov dl, [eax]
- add ebx, edx
+ add ebx, edx
mov dl, [eax+ecx]
add ebx, edx
add ebx, 02h
@@ -1451,7 +1451,7 @@
movdqa xmm6, [sse2_wd_0x02]
paddw xmm0, xmm6
psraw xmm0, 02h
- packuswb xmm0, xmm7
+ packuswb xmm0, xmm7
lea ebx, [2*ecx+ecx]
movq [eax], xmm0
movq [eax+ecx], xmm0
@@ -1463,10 +1463,10 @@
movq [eax+2*ecx], xmm0
movq [eax+ebx], xmm0
%undef PUSH_SIZE
- pop ebx
+ pop ebx
ret
-
+
ALIGN 16
;*******************************************************************************
; void_t WelsIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -1495,4 +1495,4 @@
ret
-
+
--- a/codec/decoder/core/asm/mb_copy.asm
+++ b/codec/decoder/core/asm/mb_copy.asm
@@ -37,7 +37,7 @@
;* History
;* 15/09/2009 Created
;* 12/28/2009 Modified with larger throughput
-;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
+;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
;*
;*
@@ -84,7 +84,7 @@
; int iHeight );
;*******************************************************************************
PixelAvgWidthEq4_mmx:
-
+
push esi
push edi
push ebp
@@ -102,7 +102,7 @@
movd mm0, [ebp]
pavgb mm0, [esi]
movd [edi], mm0
-
+
dec ebx
lea edi, [edi+eax]
lea esi, [esi+ecx]
@@ -115,7 +115,7 @@
pop edi
pop esi
ret
-
+
ALIGN 16
;*******************************************************************************
; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride,
@@ -124,7 +124,7 @@
; int iHeight );
;*******************************************************************************
PixelAvgWidthEq8_mmx:
-
+
push esi
push edi
push ebp
@@ -145,14 +145,14 @@
movq mm0, [esi+ecx]
pavgb mm0, [ebp+edx]
movq [edi+eax], mm0
-
+
lea esi, [esi+2*ecx]
lea ebp, [ebp+2*edx]
lea edi, [edi+2*eax]
-
+
sub ebx, 2
jnz .height_loop
-
+
WELSEMMS
pop ebx
pop ebp
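The PixelAvgWidthEq4/Eq8 routines above and the Eq16 variant below all compute the same thing at different widths: the pavgb-rounded average of two prediction sources, as used for bi-prediction and quarter-pel interpolation. Scalar equivalent (illustrative sketch):

    #include <stdint.h>

    static void PixelAvg(uint8_t *pDst, int32_t iDstStride,
                         const uint8_t *pSrcA, int32_t iSrcAStride,
                         const uint8_t *pSrcB, int32_t iSrcBStride,
                         int32_t iWidth, int32_t iHeight) {
        for (int32_t y = 0; y < iHeight; ++y)
            for (int32_t x = 0; x < iWidth; ++x)
                pDst[y * iDstStride + x] = (uint8_t)
                    ((pSrcA[y * iSrcAStride + x] + pSrcB[y * iSrcBStride + x] + 1) >> 1);
    }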
@@ -174,8 +174,8 @@
push edi
push ebp
push ebx
-
+
mov edi, [esp+20] ; pDst
mov eax, [esp+24] ; iDstStride
mov esi, [esp+28] ; pSrcA
@@ -188,28 +188,28 @@
movdqu xmm0, [esi]
pavgb xmm0, [ebp]
movdqu [edi], xmm0
-
+
movdqu xmm0, [esi+ecx]
pavgb xmm0, [ebp+edx]
movdqu [edi+eax], xmm0
-
+
movdqu xmm0, [esi+2*ecx]
pavgb xmm0, [ebp+2*edx]
movdqu [edi+2*eax], xmm0
-
+
lea esi, [esi+2*ecx]
lea ebp, [ebp+2*edx]
lea edi, [edi+2*eax]
-
+
movdqu xmm0, [esi+ecx]
pavgb xmm0, [ebp+edx]
movdqu [edi+eax], xmm0
-
+
lea esi, [esi+2*ecx]
lea ebp, [ebp+2*edx]
lea edi, [edi+2*eax]
-
-
+
+
sub ebx, 4
jne .height_loop
@@ -232,7 +232,7 @@
push edi
push ebx
-
+
mov esi, [esp+16]
mov eax, [esp+20]
mov edi, [esp+24]
@@ -242,12 +242,12 @@
.height_loop:
mov ebx, [esi]
mov [edi], ebx
-
+
add esi, eax
add edi, ecx
dec edx
jnz .height_loop
- WELSEMMS
+ WELSEMMS
pop ebx
pop edi
pop esi
@@ -275,12 +275,11 @@
add edi, ecx
dec edx
jnz .height_loop
-
- WELSEMMS
+
+ WELSEMMS
pop edi
pop esi
ret
-
@@ -288,6 +287,7 @@
+
ALIGN 16
;*******************************************************************************
; void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
@@ -308,7 +308,7 @@
push edi
mov esi, [esp+12] ; pSrc
- mov eax, [esp+16] ; iSrcStride
+ mov eax, [esp+16] ; iSrcStride
mov edi, [esp+20] ; pDst
mov edx, [esp+24] ; iDstStride
mov ecx, [esp+28] ; iHeight
@@ -324,7 +324,7 @@
lea esi, [esi+eax*2]
lea edi, [edi+edx*2]
jnz .height_loop
-
+
pop edi
pop esi
ret
--- a/codec/decoder/core/asm/mc_chroma.asm
+++ b/codec/decoder/core/asm/mc_chroma.asm
@@ -69,11 +69,11 @@
ALIGN 16
;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
+; void McChromaWidthEq4_mmx( uint8_t *src,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
;						int32_t iHeight );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
@@ -81,29 +81,29 @@
push esi
push edi
push ebx
-
+
mov eax, [esp +12 + 20]
movd mm3, [eax]
WELS_Zero mm7
punpcklbw mm3, mm3
movq mm4, mm3
- punpcklwd mm3, mm3
- punpckhwd mm4, mm4
-
+ punpcklwd mm3, mm3
+ punpckhwd mm4, mm4
+
movq mm5, mm3
punpcklbw mm3, mm7
punpckhbw mm5, mm7
-
+
movq mm6, mm4
punpcklbw mm4, mm7
punpckhbw mm6, mm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
+
+ mov esi, [esp +12+ 4]
+ mov eax, [esp + 12 + 8]
+ mov edi, [esp + 12 + 12]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
lea ebx, [esi + eax]
movd mm0, [esi]
movd mm1, [esi+1]
@@ -110,17 +110,17 @@
punpcklbw mm0, mm7
punpcklbw mm1, mm7
.xloop:
-
+
pmullw mm0, mm3
pmullw mm1, mm5
paddw mm0, mm1
-
+
movd mm1, [ebx]
punpcklbw mm1, mm7
movq mm2, mm1
pmullw mm1, mm4
paddw mm0, mm1
-
+
movd mm1, [ebx+1]
punpcklbw mm1, mm7
movq mm7, mm1
@@ -130,13 +130,13 @@
paddw mm0, [h264_d0x20_mmx]
psrlw mm0, 6
-
+
WELS_Zero mm7
packuswb mm0, mm7
- movd [edi], mm0
+ movd [edi], mm0
movq mm0, mm2
-
+
lea edi, [edi +edx ]
lea ebx, [ebx + eax]
@@ -151,11 +151,11 @@
ALIGN 16
;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
;						int32_t iHeight );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
@@ -163,30 +163,30 @@
push esi
push edi
push ebx
-
+
mov eax, [esp +12 + 20]
movd xmm3, [eax]
WELS_Zero xmm7
punpcklbw xmm3, xmm3
punpcklwd xmm3, xmm3
-
+
movdqa xmm4, xmm3
punpckldq xmm3, xmm3
punpckhdq xmm4, xmm4
movdqa xmm5, xmm3
movdqa xmm6, xmm4
-
+
punpcklbw xmm3, xmm7
punpckhbw xmm5, xmm7
punpcklbw xmm4, xmm7
punpckhbw xmm6, xmm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
+
+ mov esi, [esp +12+ 4]
+ mov eax, [esp + 12 + 8]
+ mov edi, [esp + 12 + 12]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
lea ebx, [esi + eax]
movq xmm0, [esi]
movq xmm1, [esi+1]
@@ -193,17 +193,17 @@
punpcklbw xmm0, xmm7
punpcklbw xmm1, xmm7
.xloop:
-
+
pmullw xmm0, xmm3
pmullw xmm1, xmm5
paddw xmm0, xmm1
-
+
movq xmm1, [ebx]
punpcklbw xmm1, xmm7
movdqa xmm2, xmm1
pmullw xmm1, xmm4
paddw xmm0, xmm1
-
+
movq xmm1, [ebx+1]
punpcklbw xmm1, xmm7
movdqa xmm7, xmm1
@@ -213,19 +213,19 @@
paddw xmm0, [h264_d0x20_sse2]
psrlw xmm0, 6
-
+
WELS_Zero xmm7
packuswb xmm0, xmm7
- movq [edi], xmm0
+ movq [edi], xmm0
movdqa xmm0, xmm2
-
+
lea edi, [edi +edx ]
lea ebx, [ebx + eax]
dec ecx
jnz near .xloop
-
+
pop ebx
pop edi
pop esi
@@ -237,8 +237,8 @@
ALIGN 16
;***********************************************************************
; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
+; int32_t iSrcStride,
+; uint8_t *pDst,
; int32_t iDstStride,
; uint8_t *pABCD,
;				int32_t iHeight);
@@ -248,23 +248,23 @@
push ebx
push esi
push edi
-
+
mov eax, [esp + 12 + 20]
pxor xmm7, xmm7
- movd xmm5, [eax]
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
+ movd xmm5, [eax]
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
movdqa xmm6, xmm5
punpcklqdq xmm5, xmm5
- punpckhqdq xmm6, xmm6
-
- mov eax, [esp + 12 + 4]
- mov edx, [esp + 12 + 8]
- mov esi, [esp + 12 + 12]
- mov edi, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
+ punpckhqdq xmm6, xmm6
+
+ mov eax, [esp + 12 + 4]
+ mov edx, [esp + 12 + 8]
+ mov esi, [esp + 12 + 12]
+ mov edi, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
sub esi, edi
sub esi, edi
movdqa xmm7, [h264_d0x20_sse2]
@@ -273,16 +273,16 @@
movdqa xmm1, xmm0
psrldq xmm1, 1
punpcklbw xmm0, xmm1
-
-.hloop_chroma:
+
+.hloop_chroma:
lea esi, [esi+2*edi]
-
+
movdqu xmm2, [eax+edx]
movdqa xmm3, xmm2
psrldq xmm3, 1
punpcklbw xmm2, xmm3
movdqa xmm4, xmm2
-
+
pmaddubsw xmm0, xmm5
pmaddubsw xmm2, xmm6
paddw xmm0, xmm2
@@ -289,8 +289,8 @@
paddw xmm0, xmm7
psrlw xmm0, 6
packuswb xmm0, xmm0
- movq [esi],xmm0
-
+ movq [esi],xmm0
+
lea eax, [eax+2*edx]
movdqu xmm2, [eax]
movdqa xmm3, xmm2
@@ -297,7 +297,7 @@
psrldq xmm3, 1
punpcklbw xmm2, xmm3
movdqa xmm0, xmm2
-
+
pmaddubsw xmm4, xmm5
pmaddubsw xmm2, xmm6
paddw xmm4, xmm2
@@ -304,8 +304,8 @@
paddw xmm4, xmm7
psrlw xmm4, 6
packuswb xmm4, xmm4
- movq [esi+edi],xmm4
-
+ movq [esi+edi],xmm4
+
sub ecx, 2
jnz .hloop_chroma
pop edi
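The ssse3 variant above differs from the sse2 one mainly in data layout: psrldq 1 plus punpcklbw interleaves each sample with its right neighbour, so a single pmaddubsw against interleaved (A,B) or (C,D) weight pairs produces the horizontal partial sums directly. Per 16-bit lane it computes the following (a model; the instruction's saturating add cannot trigger with these small weights):

    #include <stdint.h>

    /* one pmaddubsw lane: unsigned source bytes times signed weight bytes */
    static int16_t PmaddubswLane(uint8_t uiS0, uint8_t uiS1,
                                 int8_t iW0, int8_t iW1) {
        return (int16_t)(uiS0 * iW0 + uiS1 * iW1);
    }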
--- a/codec/decoder/core/asm/mc_luma.asm
+++ b/codec/decoder/core/asm/mc_luma.asm
@@ -69,16 +69,16 @@
ALIGN 16
;*******************************************************************************
-; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
+; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
; int iHeight)
;*******************************************************************************
McHorVer20WidthEq4_mmx:
push esi
push edi
-
+
mov esi, [esp+12]
mov eax, [esp+16]
mov edi, [esp+20]
@@ -100,7 +100,7 @@
punpcklbw mm4, mm7
movd mm5, [esi+3]
punpcklbw mm5, mm7
-
+
paddw mm2, mm3
paddw mm4, mm5
psllw mm4, 2
@@ -113,12 +113,12 @@
psraw mm0, 5
packuswb mm0, mm7
movd [edi], mm0
-
+
add esi, eax
add edi, ecx
dec edx
jnz .height_loop
-
+
WELSEMMS
pop edi
pop esi
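McHorVer20WidthEq4_mmx above (and the WidthEq8/WidthEq16 variants later in this file) evaluate the H.264 6-tap half-pel filter; the psllw/psubw ladder is a factored form of 20*(s2+s3) - 5*(s1+s4): with w = 4*(s2+s3) - (s1+s4), the sum (s0+s5) + w + 4*w equals the full tap expression. Scalar sketch:

    #include <stdint.h>

    static uint8_t HalfPelH(const uint8_t *pS) {   /* pS points at pSrc - 2 */
        int32_t iV = (pS[0] + pS[5]) - 5 * (pS[1] + pS[4]) + 20 * (pS[2] + pS[3]);
        iV = (iV + 16) >> 5;
        return (uint8_t)(iV < 0 ? 0 : iV > 255 ? 255 : iV);
    }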
@@ -181,8 +181,8 @@
ALIGN 16
;***********************************************************************
-; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
-; int16_t iSrcStride,
+; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
+; int16_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride
; int32_t iHeight
@@ -197,11 +197,11 @@
mov edi, [esp+24] ;pDst
mov edx, [esp+28] ;iDstStride
mov ebx, [esp+32] ;iHeight
- pxor xmm7, xmm7
-
+ pxor xmm7, xmm7
+
	sub esi, eax ; the vertical 6-tap needs 5 extra rows; step back 2 rows here
sub esi, eax
-
+
.yloop_width_8:
movq xmm0, [esi]
punpcklbw xmm0, xmm7
@@ -215,7 +215,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -225,7 +225,7 @@
psllw xmm4, 2
paddw xmm0, xmm4
movdqa [edi], xmm0
-
+
add esi, eax
add edi, edx
dec ebx
@@ -238,8 +238,8 @@
ALIGN 16
;***********************************************************************
;void_t McHorVer22VerLast_sse2(
-; uint8_t *pSrc,
-; int32_t pSrcStride,
+; uint8_t *pSrc,
+; int32_t pSrcStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
@@ -250,17 +250,17 @@
paddw %1, %6
movdqa %7, %2
movdqa %8, %3
-
-
+
+
paddw %7, %5
paddw %8, %4
-
- psubw %1, %7
- psraw %1, 2
- paddw %1, %8
- psubw %1, %7
- psraw %1, 2
- paddw %8, %1
+
+ psubw %1, %7
+ psraw %1, 2
+ paddw %1, %8
+ psubw %1, %7
+ psraw %1, 2
+ paddw %8, %1
paddw %8, [h264_mc_hc_32]
psraw %8, 6
packuswb %8, %8
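FILTER_VER applies the vertical 6-tap to the 16-bit horizontal intermediates and folds in the combined rounding; the two psraw 2 stages are an overflow-safe factorization. With s1 = a+f, s2 = b+e, s3 = c+d, the macro computes

    (((((s1 - s2) >> 2) + s3 - s2) >> 2) + s3 + 32) >> 6

which tracks (s1 - 5*s2 + 20*s3 + 512) >> 10 while never forming the 20x product in a 16-bit lane. A sketch of the lane arithmetic (exactness assumed only for the value ranges the first pass produces; packuswb then saturates):

    #include <stdint.h>

    static int16_t FilterVerLane(int32_t iS1, int32_t iS2, int32_t iS3) {
        int32_t iX = ((iS1 - iS2) >> 2) + iS3 - iS2;
        return (int16_t)(((iX >> 2) + iS3 + 32) >> 6);
    }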
@@ -272,15 +272,15 @@
push edi
push ebx
push ebp
-
+
mov esi, [esp+20]
mov eax, [esp+24]
mov edi, [esp+28]
mov edx, [esp+32]
mov ebx, [esp+36]
- mov ecx, [esp+40]
- shr ebx, 3
-
+ mov ecx, [esp+40]
+ shr ebx, 3
+
.width_loop:
movdqa xmm0, [esi]
movdqa xmm1, [esi+eax]
@@ -290,12 +290,12 @@
lea esi, [esi+2*eax]
movdqa xmm4, [esi]
movdqa xmm5, [esi+eax]
-
+
FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
lea esi, [esi+2*eax]
movdqa xmm6, [esi]
-
+
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
@@ -302,61 +302,61 @@
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
-
+
add edi, edx
- sub esi, eax
-
+ sub esi, eax
+
.start:
FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm6, [esi]
FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm7, [esi+eax]
FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm0, [esi]
FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm1, [esi+eax]
FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm2, [esi]
FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm3, [esi+eax]
FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm4, [esi]
FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm5, [esi+eax]
jmp near .start
-
+
.x_loop_dec:
dec ebx
jz near .exit
@@ -366,9 +366,9 @@
add esi, 16
add edi, 8
jmp .width_loop
-
-
-
+
+
+
.exit:
pop ebp
pop ebx
@@ -379,10 +379,10 @@
ALIGN 16
;*******************************************************************************
-; void_t McHorVer20WidthEq8_sse2( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
+; void_t McHorVer20WidthEq8_sse2( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
; int iHeight,
; );
;*******************************************************************************
@@ -389,18 +389,18 @@
McHorVer20WidthEq8_sse2:
push esi
push edi
-
+
mov esi, [esp + 12] ;pSrc
mov eax, [esp + 16] ;iSrcStride
mov edi, [esp + 20] ;pDst
mov ecx, [esp + 28] ;iHeight
mov edx, [esp + 24] ;iDstStride
-
+
lea esi, [esi-2] ;pSrc -= 2;
-
+
pxor xmm7, xmm7
movdqa xmm6, [h264_w0x10_1]
-.y_loop:
+.y_loop:
movq xmm0, [esi]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5]
@@ -413,7 +413,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -424,7 +424,7 @@
paddw xmm0, xmm4
paddw xmm0, xmm6
psraw xmm0, 5
-
+
packuswb xmm0, xmm7
movq [edi], xmm0
@@ -432,17 +432,17 @@
lea esi, [esi+eax]
dec ecx
jnz near .y_loop
-
+
pop edi
pop esi
ret
-
+
ALIGN 16
;*******************************************************************************
-; void_t McHorVer20WidthEq16_sse2( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
+; void_t McHorVer20WidthEq16_sse2( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
; int iHeight,
; );
;*******************************************************************************
@@ -449,20 +449,20 @@
McHorVer20WidthEq16_sse2:
push esi
push edi
-
+
mov esi, [esp + 12] ;pSrc
mov eax, [esp + 16] ;iSrcStride
mov edi, [esp + 20] ;pDst
mov ecx, [esp + 28] ;iHeight
mov edx, [esp + 24] ;iDstStride
-
+
lea esi, [esi-2] ;pSrc -= 2;
-
+
pxor xmm7, xmm7
movdqa xmm6, [h264_w0x10_1]
.y_loop:
-
+
movq xmm0, [esi]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5]
@@ -475,7 +475,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -501,7 +501,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3+8]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -514,9 +514,9 @@
psraw xmm0, 5
packuswb xmm0, xmm7
movq [edi+8], xmm0
-
- lea edi, [edi+edx]
- lea esi, [esi+eax]
+
+ lea edi, [edi+edx]
+ lea esi, [esi+eax]
dec ecx
jnz near .y_loop
pop edi
@@ -525,10 +525,10 @@
;*******************************************************************************
-; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
+; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
; int iHeight )
;*******************************************************************************
ALIGN 16
@@ -535,7 +535,7 @@
McHorVer02WidthEq8_sse2:
push esi
push edi
-
+
mov esi, [esp + 12] ;pSrc
mov edx, [esp + 16] ;iSrcStride
mov edi, [esp + 20] ;pDst
@@ -546,7 +546,7 @@
sub esi, edx
WELS_Zero xmm7
-
+
SSE_LOAD_8P xmm0, xmm7, [esi]
SSE_LOAD_8P xmm1, xmm7, [esi+edx]
lea esi, [esi+2*edx]
@@ -555,8 +555,8 @@
lea esi, [esi+2*edx]
SSE_LOAD_8P xmm4, xmm7, [esi]
SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-
-.start:
+
+.start:
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
jz near .xx_exit
@@ -566,7 +566,7 @@
FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
dec ecx
jz near .xx_exit
-
+
lea edi, [edi+2*eax]
SSE_LOAD_8P xmm7, xmm0, [esi+edx]
FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
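
For reference, the hunks above run through the half-pel interpolation
routines. The psllw/psubw/paddw sequence in McHorVer20WidthEq8_sse2 and
McHorVer20WidthEq16_sse2 is a multiplier-free evaluation of the standard
H.264 six-tap filter (1, -5, 20, 20, -5, 1) with rounding constant 16 and
shift 5. A minimal scalar sketch of the same computation (illustrative
only, not a shipped entry point; the _ref name is hypothetical):

    #include <stdint.h>

    /* Scalar model of the horizontal half-pel filter vectorized above.
     * Taps 1 -5 20 20 -5 1, +16 rounding, >>5, clip to [0,255]. */
    static uint8_t Clip255(int v) {
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void McHorVer20_ref(const uint8_t *pSrc, int iSrcStride,
                               uint8_t *pDst, int iDstStride,
                               int iWidth, int iHeight) {
        pSrc -= 2;                          /* same pre-adjust as the asm */
        for (int y = 0; y < iHeight; y++) {
            for (int x = 0; x < iWidth; x++) {
                int a = pSrc[x]     + pSrc[x + 5];  /* taps  1,  1 */
                int b = pSrc[x + 1] + pSrc[x + 4];  /* taps -5, -5 */
                int c = pSrc[x + 2] + pSrc[x + 3];  /* taps 20, 20 */
                pDst[x] = Clip255((a - 5 * b + 20 * c + 16) >> 5);
            }
            pSrc += iSrcStride;
            pDst += iDstStride;
        }
    }
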
--- a/codec/decoder/core/asm/memzero.asm
+++ b/codec/decoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
;* memzero.asm
;*
;* Abstract
-;*
;*
+;*
;* History
;* 9/16/2009 Created
;*
@@ -47,8 +47,8 @@
; Code
;***********************************************************************
-SECTION .text
-
+SECTION .text
+
ALIGN 16
;***********************************************************************
;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
WelsPrefetchZero_mmx:
mov eax,[esp+4]
prefetchnta [eax]
- ret
+ ret
ALIGN 16
@@ -69,7 +69,7 @@
mov eax, [esp + 4] ; dst
mov ecx, [esp + 8]
neg ecx
-
+
pxor xmm0, xmm0
.memzeroa64_sse2_loops:
movdqa [eax], xmm0
@@ -77,12 +77,12 @@
movdqa [eax+32], xmm0
movdqa [eax+48], xmm0
add eax, 0x40
-
+
add ecx, 0x40
jnz near .memzeroa64_sse2_loops
-
- ret
+ ret
+
ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -92,7 +92,7 @@
mov eax, [esp + 4] ; dst
mov ecx, [esp + 8]
neg ecx
-
+
pxor mm0, mm0
.memzero64_mmx_loops:
movq [eax], mm0
@@ -102,16 +102,16 @@
movq [eax+32], mm0
movq [eax+40], mm0
movq [eax+48], mm0
- movq [eax+56], mm0
+ movq [eax+56], mm0
add eax, 0x40
-
+
add ecx, 0x40
jnz near .memzero64_mmx_loops
-
- WELSEMMS
- ret
-
-ALIGN 16
+
+ WELSEMMS
+ ret
+
+ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
;***********************************************************************
@@ -119,17 +119,17 @@
WelsSetMemZeroSize8_mmx:
mov eax, [esp + 4] ; dst
mov ecx, [esp + 8] ; size
- neg ecx
+ neg ecx
pxor mm0, mm0
-
+
.memzero8_mmx_loops:
movq [eax], mm0
add eax, 0x08
-
+
add ecx, 0x08
jnz near .memzero8_mmx_loops
-
- WELSEMMS
- ret
-
+ WELSEMMS
+ ret
+
+
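
The memzero.asm hunks above are whitespace-only; for orientation, each
routine negates the byte count and counts up toward zero, clearing one
fixed-size block per iteration (64 bytes for the size-64 variants, 8
bytes for the size-8 variant). A plain-C model of that loop shape,
assuming as the asm does that size is a multiple of the block width
(sketch only; the _ref name is hypothetical):

    #include <stdint.h>
    #include <string.h>

    /* Model of WelsSetMemZeroSize64_mmx's loop structure: negate the
     * size, then add the block width back until the counter hits zero. */
    static void WelsSetMemZeroSize64_ref(void *dst, int32_t size) {
        uint8_t *p = (uint8_t *)dst;
        for (int32_t i = -size; i != 0; i += 64) {  /* asm: neg ecx */
            memset(p, 0, 64);                       /* eight movq stores */
            p += 64;
        }
    }
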
--- a/codec/decoder/plus/res/welsdec.rc
+++ b/codec/decoder/plus/res/welsdec.rc
@@ -27,18 +27,18 @@
// TEXTINCLUDE
//
-1 TEXTINCLUDE
+1 TEXTINCLUDE
BEGIN
"resource.h\0"
END
-2 TEXTINCLUDE
+2 TEXTINCLUDE
BEGIN
"#include ""afxres.h""\r\n"
"\0"
END
-3 TEXTINCLUDE
+3 TEXTINCLUDE
BEGIN
"\r\n"
"\0"
--- a/codec/encoder/core/asm/asm_inc.asm
+++ b/codec/encoder/core/asm/asm_inc.asm
@@ -43,7 +43,7 @@
; Options, for DEBUG
;***********************************************************************
-%if 1
+%if 1
%define MOVDQ movdqa
%else
%define MOVDQ movdqu
@@ -58,7 +58,7 @@
BITS 32
;***********************************************************************
-; Macros
+; Macros
;***********************************************************************
%macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
pxor %2, %2
psubw %2, %1
pmaxsw %1, %2
-%endmacro
+%endmacro
%macro MMX_XSwap 4
movq %4, %2
@@ -105,7 +105,7 @@
SSE2_XSawp qdq, %5, %2, %3
%endmacro
-;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
+;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
%macro SSE2_TransTwo4x4W 5
SSE2_XSawp wd, %1, %2, %5
SSE2_XSawp wd, %3, %4, %2
@@ -125,26 +125,26 @@
movdqa %6, %9
movdqa %9, %4
SSE2_XSawp bw, %7, %6, %4
-
- SSE2_XSawp wd, %1, %3, %6
+
+ SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2
movdqa %7, %9
- movdqa %9, %3
+ movdqa %9, %3
SSE2_XSawp wd, %7, %4, %3
-
- SSE2_XSawp dq, %1, %5, %4
+
+ SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2
movdqa %7, %9
- movdqa %9, %5
+ movdqa %9, %5
SSE2_XSawp dq, %7, %3, %5
-
+
SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %9
- movdqa %9, %1
+ movdqa %9, %1
SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %9
%endmacro
@@ -170,9 +170,9 @@
%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
mov %3h, %3l
movd %1, e%3x ; i.e, 1% = eax (=b0)
- pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
- pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
+ pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
+ pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
;copy a dw into a xmm for 8 times
%macro SSE2_Copy8Times 2
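
Both macros above are lane broadcasts: butterfly_1to16_sse splats one
byte across all 16 bytes of an xmm register, and SSE2_Copy8Times splats
one 16-bit word across all 8 lanes. The equivalent SSE2 intrinsics, for
readers more at home in C (a sketch, not part of the tree):

    #include <emmintrin.h>   /* SSE2 intrinsics */
    #include <stdint.h>

    /* byte broadcast, as butterfly_1to16_sse builds with pshuflw/pshufd */
    static __m128i SplatByte(uint8_t b) { return _mm_set1_epi8((char)b); }

    /* 16-bit word broadcast, as SSE2_Copy8Times performs */
    static __m128i SplatWord(int16_t w) { return _mm_set1_epi16(w); }
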
--- a/codec/encoder/core/asm/coeff.asm
+++ b/codec/encoder/core/asm/coeff.asm
@@ -318,9 +318,9 @@
SECTION .text
-
+
;***********************************************************************
-;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
+;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse2
CavlcParamCal_sse2:
@@ -327,16 +327,16 @@
push ebx
push edi
push esi
-
+
mov eax, [esp+16] ;coffLevel
mov edi, [esp+24] ;Level
mov ebx, [esp+32] ;endIdx
cmp ebx, 3
- jne .Level16
+ jne .Level16
pxor xmm1, xmm1
movq xmm0, [eax] ; removed QWORD
- jmp .Cal_begin
-.Level16:
+ jmp .Cal_begin
+.Level16:
movdqa xmm0, [eax]
movdqa xmm1, [eax+16]
.Cal_begin:
@@ -354,7 +354,7 @@
pcmpeqw xmm7, xmm7 ;generate -1
mov ebx, 0xff
;pinsrw xmm6, ebx, 3
-
+
mov bl, dh
lea ebx, [byte_1pos_table+8*ebx]
@@ -362,7 +362,7 @@
pextrw ecx, xmm0, 3
shr ecx, 8
mov dh, cl
-
+
.loopHighFind0:
cmp ecx, 0
je .loopHighFind0End
@@ -372,7 +372,7 @@
add esi, 8
mov esi, [eax+2*esi]
mov [edi], si
- add edi, 2
+ add edi, 2
;add ebx, 1
inc ebx
dec ecx
@@ -403,8 +403,8 @@
;and edx, 0xff
movzx edx, byte [ebx]
mov edx, [eax+2*edx]
- mov [edi], dx
- add edi, 2
+ mov [edi], dx
+ add edi, 2
;add ebx, 1
inc ebx
dec esi
@@ -436,8 +436,8 @@
psllq xmm0, xmm3
psrlq xmm0, xmm3
movdqa xmm4, xmm1
- psllq xmm1, xmm2
- psrlq xmm4, xmm3
+ psllq xmm1, xmm2
+ psrlq xmm4, xmm3
punpcklqdq xmm1, xmm4
por xmm0, xmm1
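
The routine above gathers CAVLC run/level pairs with table-driven bit
scans. As a rough scalar model of what such a scan produces (a sketch
only; the exact output conventions of CavlcParamCal_sse2 are not fully
visible in these hunks): walk from endIdx down to 0, record each nonzero
coefficient as a level, and record the count of zeros between
consecutive nonzero coefficients as the run.

    #include <stdint.h>

    /* Hypothetical scalar reference for a run/level scan. */
    static int32_t CavlcParamCal_ref(const int16_t *coffLevel, uint8_t *run,
                                     int16_t *level, int32_t *total_coeffs,
                                     int32_t endIdx) {
        int32_t n = 0;      /* nonzero coefficients found            */
        int32_t zeros = 0;  /* zeros since the previous nonzero coef */
        for (int32_t i = endIdx; i >= 0; --i) {
            if (coffLevel[i] != 0) {
                level[n] = coffLevel[i];
                if (n > 0)
                    run[n - 1] = (uint8_t)zeros;
                zeros = 0;
                ++n;
            } else if (n > 0) {
                ++zeros;
            }
        }
        if (n > 0)
            run[n - 1] = (uint8_t)zeros;  /* zeros below the last level */
        *total_coeffs = n;
        return 0;
    }
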
--- a/codec/encoder/core/asm/cpuid.asm
+++ b/codec/encoder/core/asm/cpuid.asm
@@ -84,12 +84,12 @@
; void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
;****************************************************************************************************
WelsCPUId:
- push ebx
+ push ebx
push edi
-
+
mov eax, [esp+12] ; operating index
cpuid ; cpuid
-
+
; processing various information return
mov edi, [esp+16]
mov [edi], eax
@@ -100,10 +100,10 @@
mov edi, [esp+28]
mov [edi], edx
- pop edi
+ pop edi
pop ebx
ret
-
+
WELS_EXTERN WelsCPUSupportAVX
; need call after cpuid=1 and eax, ecx flag got then
ALIGN 16
@@ -139,7 +139,7 @@
WelsCPUSupportFMA:
mov eax, [esp+4]
mov ecx, [esp+8]
-
+
; refer to detection of FMA addressed in INTEL AVX manual document
and ecx, 018001000H
cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
mov eax, 1
ret
fma_not_supported:
- mov eax, 0
+ mov eax, 0
ret
WELS_EXTERN WelsEmms
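
The FMA probe above masks ECX from CPUID leaf 1 with 018001000H, i.e.
OSXSAVE (bit 27), AVX (bit 28) and FMA (bit 12) together. A C caller
might exercise it like this (a sketch; it reproduces only the visible
mask test, not any further checks between these hunks):

    #include <stdint.h>

    /* asm entry point declared in this file (cdecl on 32-bit x86) */
    extern void WelsCPUId(int32_t uiIndex, int32_t *pFeatureA,
                          int32_t *pFeatureB, int32_t *pFeatureC,
                          int32_t *pFeatureD);

    static int SupportsFmaMask(void) {
        int32_t a, b, c, d;
        WelsCPUId(1, &a, &b, &c, &d);         /* leaf 1: feature flags */
        return ((uint32_t)c & 0x18001000u) == 0x18001000u;
    }
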
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -48,26 +48,26 @@
;***********************************************************************
; Constant
-;***********************************************************************
-
+;***********************************************************************
+
align 16
-SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
+SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
dw 10, 13, 10, 13, 13, 16, 13, 16,
- dw 11, 14, 11, 14, 14, 18, 14, 18,
+ dw 11, 14, 11, 14, 14, 18, 14, 18,
dw 11, 14, 11, 14, 14, 18, 14, 18,
- dw 13, 16, 13, 16, 16, 20, 16, 20,
dw 13, 16, 13, 16, 16, 20, 16, 20,
- dw 14, 18, 14, 18, 18, 23, 18, 23,
+ dw 13, 16, 13, 16, 16, 20, 16, 20,
+ dw 14, 18, 14, 18, 18, 23, 18, 23,
dw 14, 18, 14, 18, 18, 23, 18, 23,
- dw 16, 20, 16, 20, 20, 25, 20, 25,
dw 16, 20, 16, 20, 20, 25, 20, 25,
- dw 18, 23, 18, 23, 23, 29, 23, 29,
+ dw 16, 20, 16, 20, 20, 25, 20, 25,
+ dw 18, 23, 18, 23, 23, 29, 23, 29,
dw 18, 23, 18, 23, 23, 29, 23, 29
-
+
;***********************************************************************
; MMX functions
-;***********************************************************************
+;***********************************************************************
%macro MMX_LoadDiff4P 5
movd %1, [%3]
@@ -112,7 +112,7 @@
MMX_SumSub %4, %1, %6
MMX_SumSub %3, %2, %6
MMX_SumSub %3, %4, %6
- MMX_SumSubMul2 %1, %2, %5
+ MMX_SumSubMul2 %1, %2, %5
%endmacro
%macro MMX_IDCT 6
@@ -145,13 +145,13 @@
mov edx, [esp+24] ; i_pix2
WELS_Zero mm7
-
+
MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, eax, ebx, ecx, edx, mm0, mm7
- MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
+ MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
-
- MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
+
+ MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
mov eax, [esp+ 8] ; pDct
@@ -178,15 +178,15 @@
%define i_pred esp+pushsize+16
%define pDct esp+pushsize+20
- mov eax, [pDct ]
+ mov eax, [pDct ]
movq mm0, [eax+ 0]
movq mm1, [eax+ 8]
movq mm2, [eax+16]
movq mm3, [eax+24]
- mov edx, [p_dst ]
- mov ecx, [i_dst ]
+ mov edx, [p_dst ]
+ mov ecx, [i_dst ]
mov eax, [p_pred]
- mov ebx, [i_pred]
+ mov ebx, [i_pred]
MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
@@ -195,7 +195,7 @@
WELS_Zero mm7
WELS_DW32 mm6
-
+
MMX_StoreDiff4P mm3, mm0, mm6, mm7, [edx], [eax]
MMX_StoreDiff4P mm4, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
lea edx, [edx+2*ecx]
@@ -202,7 +202,7 @@
lea eax, [eax+2*ebx]
MMX_StoreDiff4P mm1, mm0, mm6, mm7, [edx], [eax]
MMX_StoreDiff4P mm2, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
-
+
WELSEMMS
%undef pushsize
%undef p_dst
@@ -220,17 +220,17 @@
%macro SSE2_Store4x8p 6
SSE2_XSawp qdq, %2, %3, %6
SSE2_XSawp qdq, %4, %5, %3
- MOVDQ [%1+0x00], %2
- MOVDQ [%1+0x10], %4
- MOVDQ [%1+0x20], %6
- MOVDQ [%1+0x30], %3
+ MOVDQ [%1+0x00], %2
+ MOVDQ [%1+0x10], %4
+ MOVDQ [%1+0x20], %6
+ MOVDQ [%1+0x30], %3
%endmacro
%macro SSE2_Load4x8p 6
MOVDQ %2, [%1+0x00]
- MOVDQ %4, [%1+0x10]
- MOVDQ %6, [%1+0x20]
- MOVDQ %3, [%1+0x30]
+ MOVDQ %4, [%1+0x10]
+ MOVDQ %6, [%1+0x20]
+ MOVDQ %3, [%1+0x30]
SSE2_XSawp qdq, %4, %3, %5
SSE2_XSawp qdq, %2, %6, %3
%endmacro
@@ -271,40 +271,40 @@
%endmacro
%macro SSE2_Load8DC 6
- movdqa %1, %6 ; %1 = dc0 dc1
+ movdqa %1, %6 ; %1 = dc0 dc1
paddw %1, %5
- psraw %1, $6 ; (dc + 32) >> 6
-
+ psraw %1, $6 ; (dc + 32) >> 6
+
movdqa %2, %1
psrldq %2, 4
punpcklwd %2, %2
- punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
+ punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
movdqa %3, %1
psrldq %3, 8
punpcklwd %3, %3
punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
-
+
movdqa %4, %1
psrldq %4, 12
punpcklwd %4, %4
punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
-
+
punpcklwd %1, %1
- punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
+ punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
%endmacro
%macro SSE2_DCT 6
- SSE2_SumSub %6, %3, %5
- SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %2, %5
- SSE2_SumSubMul2 %6, %1, %4
+ SSE2_SumSub %6, %3, %5
+ SSE2_SumSub %1, %2, %5
+ SSE2_SumSub %3, %2, %5
+ SSE2_SumSubMul2 %6, %1, %4
%endmacro
%macro SSE2_IDCT 7
- SSE2_SumSub %7, %2, %6
- SSE2_SumSubDiv2 %1, %3, %5, %4
- SSE2_SumSub %2, %1, %5
+ SSE2_SumSub %7, %2, %6
+ SSE2_SumSubDiv2 %1, %3, %5, %4
+ SSE2_SumSub %2, %1, %5
SSE2_SumSub %7, %4, %5
%endmacro
@@ -316,12 +316,12 @@
WelsDctFourT4_sse2:
push ebx
push esi
- mov esi, [esp+12]
+ mov esi, [esp+12]
mov eax, [esp+16] ; pix1
mov ebx, [esp+20] ; i_pix1
mov ecx, [esp+24] ; pix2
- mov edx, [esp+28] ; i_pix2
-
+ mov edx, [esp+28] ; i_pix2
+
pxor xmm7, xmm7
;Load 4x8
@@ -331,33 +331,33 @@
lea ecx, [ecx + 2 * edx]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [eax], [ecx]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
-
+
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
- SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+ SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
-
- SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
-
+
+ SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+
lea eax, [eax + 2 * ebx]
lea ecx, [ecx + 2 * edx]
-
+
;Load 4x8
SSE2_LoadDiff8P xmm0, xmm6, xmm7, [eax ], [ecx ]
SSE2_LoadDiff8P xmm1, xmm6, xmm7, [eax+ebx ], [ecx+edx]
lea eax, [eax + 2 * ebx]
- lea ecx, [ecx + 2 * edx]
+ lea ecx, [ecx + 2 * edx]
SSE2_LoadDiff8P xmm2, xmm6, xmm7, [eax], [ecx]
SSE2_LoadDiff8P xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
-
+
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
- SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
- SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+ SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
+ SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
-
+
lea esi, [esi+64]
- SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
-
+ SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+
pop esi
pop ebx
ret
@@ -377,21 +377,21 @@
%define pushsize 8
push ebx
push esi
-
- mov eax, [rec]
- mov ebx, [stride]
- mov ecx, [pred]
- mov edx, [pred_stride]
- mov esi, [rs]
+ mov eax, [rec]
+ mov ebx, [stride]
+ mov ecx, [pred]
+ mov edx, [pred_stride]
+ mov esi, [rs]
+
;Load 4x8
- SSE2_Load4x8p esi, xmm0, xmm1, xmm4, xmm2, xmm5
-
+ SSE2_Load4x8p esi, xmm0, xmm1, xmm4, xmm2, xmm5
+
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
-
+
WELS_Zero xmm7
WELS_DW32 xmm6
@@ -398,41 +398,41 @@
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [eax ], [ecx]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [eax + ebx ], [ecx + edx]
lea eax, [eax + 2 * ebx]
- lea ecx, [ecx + 2 * edx]
+ lea ecx, [ecx + 2 * edx]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [eax], [ecx]
SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [eax + ebx ], [ecx + edx]
-
+
add esi, 64
lea eax, [eax + 2 * ebx]
lea ecx, [ecx + 2 * edx]
- SSE2_Load4x8p esi, xmm0, xmm1, xmm4, xmm2, xmm5
-
+ SSE2_Load4x8p esi, xmm0, xmm1, xmm4, xmm2, xmm5
+
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
- SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
+ SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
WELS_Zero xmm7
WELS_DW32 xmm6
-
+
SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [eax ], [ecx]
SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [eax + ebx ], [ecx + edx]
lea eax, [eax + 2 * ebx]
- lea ecx, [ecx + 2 * edx]
+ lea ecx, [ecx + 2 * edx]
SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [eax], [ecx]
- SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [eax + ebx], [ecx + edx]
+ SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [eax + ebx], [ecx + edx]
pop esi
pop ebx
ret
-
+
%macro SSE2_StoreDiff4x8p 8
SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
- SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
+ SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
- SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
+ SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
%endmacro
-
+
;***********************************************************************
; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
;***********************************************************************
@@ -443,47 +443,47 @@
WelsIDctRecI16x16Dc_sse2:
push esi
push edi
-
+
mov ecx, [luma_dc]
- mov eax, [rec]
- mov edx, [stride]
- mov esi, [pred]
- mov edi, [pred_stride]
+ mov eax, [rec]
+ mov edx, [stride]
+ mov esi, [pred]
+ mov edi, [pred_stride]
pxor xmm7, xmm7
WELS_DW32 xmm6
-
+
SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [ecx]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-
+
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-
+ lea esi, [esi + 2 * edi]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
+ lea esi, [esi + 2 * edi]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-
+
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
+ lea esi, [esi + 2 * edi]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-
- SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]
+
+ SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
+ lea esi, [esi + 2 * edi]
SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-
+
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-
+ lea esi, [esi + 2 * edi]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
+ lea esi, [esi + 2 * edi]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-
+
lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
+ lea esi, [esi + 2 * edi]
SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-
+
pop edi
pop esi
ret
@@ -517,7 +517,7 @@
punpckldq %3, %4
punpcklqdq %1, %3
%endmacro
-
+
;***********************************************************************
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
;***********************************************************************
@@ -525,23 +525,23 @@
WelsHadamardT4Dc_sse2:
mov eax, [esp + 4] ; luma_dc
mov ecx, [esp + 8] ; pDct
-
+
SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, ecx
SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, ecx + 0x40
SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, ecx + 0x100
SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, ecx + 0x140
-
+
SSE2_SumSubD xmm1, xmm2, xmm7
SSE2_SumSubD xmm3, xmm4, xmm7
SSE2_SumSubD xmm2, xmm4, xmm7
- SSE2_SumSubD xmm1, xmm3, xmm7
+ SSE2_SumSubD xmm1, xmm3, xmm7
SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1
-
+
SSE2_SumSubD xmm4, xmm3, xmm7
SSE2_SumSubD xmm5, xmm1, xmm7
- WELS_DD1 xmm6
+ WELS_DD1 xmm6
SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1
@@ -550,7 +550,7 @@
packssdw xmm2, xmm1
movdqa [eax+ 0], xmm3
movdqa [eax+16], xmm2
-
- ret
+
+ ret
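
The SSE2_DCT / SSE2_TransTwo4x4W pipeline in this file implements the
butterfly form of the 4x4 H.264 core transform, two blocks per pass: a
sum/difference stage, a transpose, and a second stage. The textbook
scalar form of that transform, for reference (a sketch, not the shipped
code path):

    #include <stdint.h>

    /* In-place 4x4 H.264 forward core transform: rows, transpose,
     * rows again (i.e. columns), transpose back. */
    static void Dct4x4_ref(int16_t d[4][4]) {
        for (int pass = 0; pass < 2; ++pass) {
            for (int i = 0; i < 4; ++i) {
                int x0 = d[i][0], x1 = d[i][1], x2 = d[i][2], x3 = d[i][3];
                int s0 = x0 + x3, s3 = x0 - x3;   /* outer butterfly */
                int s1 = x1 + x2, s2 = x1 - x2;   /* inner butterfly */
                d[i][0] = (int16_t)(s0 + s1);
                d[i][2] = (int16_t)(s0 - s1);
                d[i][1] = (int16_t)(2 * s3 + s2);
                d[i][3] = (int16_t)(s3 - 2 * s2);
            }
            for (int i = 0; i < 4; ++i)           /* transpose */
                for (int j = i + 1; j < 4; ++j) {
                    int16_t t = d[i][j]; d[i][j] = d[j][i]; d[j][i] = t;
                }
        }
    }
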
--- a/codec/encoder/core/asm/deblock.asm
+++ b/codec/encoder/core/asm/deblock.asm
@@ -62,169 +62,169 @@
ALIGN 16
DeblockChromaEq4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,68h
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,68h
mov edx,[ebp+10h] ; iStride
mov eax,[ebp+8] ; pPixCb
mov ecx,[ebp+0Ch] ; pPixCr
- movq xmm4,[ecx]
- movq xmm5,[edx+ecx]
- push esi
- push edi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- movq xmm1,[edi]
- mov edi,ecx
- sub edi,esi
- movq xmm2,[edi]
- punpcklqdq xmm1,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm2,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm3,[edi]
- punpcklqdq xmm2,xmm3
- movq xmm3,[eax]
- punpcklqdq xmm3,xmm4
- movq xmm4,[edx+eax]
- mov edx, [ebp + 14h]
- punpcklqdq xmm4,xmm5
- movd xmm5,edx
- mov edx, [ebp + 18h]
- pxor xmm0,xmm0
- movdqa xmm6,xmm5
- punpcklwd xmm6,xmm5
- pshufd xmm5,xmm6,0
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,xmm1
- punpckhbw xmm1,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+40h],xmm1
- movdqa [esp+60h],xmm7
- movdqa xmm7,xmm2
- punpcklbw xmm7,xmm0
- movdqa [esp+10h],xmm7
- movdqa xmm7,xmm3
- punpcklbw xmm7,xmm0
- punpckhbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm7,xmm4
- punpckhbw xmm4,xmm0
- punpckhbw xmm2,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+30h],xmm3
- movdqa xmm3,[esp+10h]
- movdqa xmm1,xmm3
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa [esp+20h],xmm4
- movdqa xmm0,xmm5
- pcmpgtw xmm0,xmm1
- movdqa xmm1,[esp+60h]
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- pand xmm0,xmm4
- movdqa xmm1,xmm7
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,xmm2
- psubw xmm1,[esp+30h]
- pabsw xmm1,xmm1
- pcmpgtw xmm5,xmm1
- movdqa xmm1,[esp+40h]
- pand xmm0,xmm4
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,[esp+20h]
- psubw xmm1,[esp+30h]
- pand xmm5,xmm4
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- pand xmm5,xmm6
- mov edx,2
- movsx edx,dx
- movd xmm1,edx
- movdqa xmm4,xmm1
- punpcklwd xmm4,xmm1
- pshufd xmm1,xmm4,0
- movdqa xmm4,[esp+60h]
- movdqa xmm6,xmm4
- paddw xmm6,xmm4
- paddw xmm6,xmm3
- paddw xmm6,xmm7
- movdqa [esp+10h],xmm1
- paddw xmm6,[esp+10h]
- psraw xmm6,2
- movdqa xmm4,xmm0
- pandn xmm4,xmm3
- movdqa xmm3,[esp+40h]
- movdqa xmm1,xmm0
- pand xmm1,xmm6
- por xmm1,xmm4
- movdqa xmm6,xmm3
- paddw xmm6,xmm3
- movdqa xmm3,[esp+10h]
- paddw xmm6,xmm2
- paddw xmm6,[esp+20h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm4,xmm5
- pand xmm4,xmm6
- movdqa xmm6,xmm5
- pandn xmm6,xmm2
- por xmm4,xmm6
- packuswb xmm1,xmm4
- movdqa xmm4,[esp+50h]
- movdqa xmm6,xmm7
- paddw xmm6,xmm7
- paddw xmm6,xmm4
- paddw xmm6,[esp+60h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm2,xmm0
- pand xmm2,xmm6
- pandn xmm0,xmm4
- por xmm2,xmm0
- movdqa xmm0,[esp+20h]
- movdqa xmm6,xmm0
- paddw xmm6,xmm0
- movdqa xmm0,[esp+30h]
- paddw xmm6,xmm0
- paddw xmm6,[esp+40h]
- movdqa xmm4,xmm5
- paddw xmm6,xmm3
- movq [esi],xmm1
- psraw xmm6,2
- pand xmm4,xmm6
- pandn xmm5,xmm0
- por xmm4,xmm5
- packuswb xmm2,xmm4
- movq [eax],xmm2
- psrldq xmm1,8
- movq [edi],xmm1
- pop edi
- psrldq xmm2,8
- movq [ecx],xmm2
- pop esi
- mov esp,ebp
- pop ebp
- ret
+ movq xmm4,[ecx]
+ movq xmm5,[edx+ecx]
+ push esi
+ push edi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ movq xmm1,[edi]
+ mov edi,ecx
+ sub edi,esi
+ movq xmm2,[edi]
+ punpcklqdq xmm1,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm2,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm3,[edi]
+ punpcklqdq xmm2,xmm3
+ movq xmm3,[eax]
+ punpcklqdq xmm3,xmm4
+ movq xmm4,[edx+eax]
+ mov edx, [ebp + 14h]
+ punpcklqdq xmm4,xmm5
+ movd xmm5,edx
+ mov edx, [ebp + 18h]
+ pxor xmm0,xmm0
+ movdqa xmm6,xmm5
+ punpcklwd xmm6,xmm5
+ pshufd xmm5,xmm6,0
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,xmm1
+ punpckhbw xmm1,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+40h],xmm1
+ movdqa [esp+60h],xmm7
+ movdqa xmm7,xmm2
+ punpcklbw xmm7,xmm0
+ movdqa [esp+10h],xmm7
+ movdqa xmm7,xmm3
+ punpcklbw xmm7,xmm0
+ punpckhbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm7,xmm4
+ punpckhbw xmm4,xmm0
+ punpckhbw xmm2,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+30h],xmm3
+ movdqa xmm3,[esp+10h]
+ movdqa xmm1,xmm3
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa [esp+20h],xmm4
+ movdqa xmm0,xmm5
+ pcmpgtw xmm0,xmm1
+ movdqa xmm1,[esp+60h]
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ pand xmm0,xmm4
+ movdqa xmm1,xmm7
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,xmm2
+ psubw xmm1,[esp+30h]
+ pabsw xmm1,xmm1
+ pcmpgtw xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ pand xmm0,xmm4
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,[esp+20h]
+ psubw xmm1,[esp+30h]
+ pand xmm5,xmm4
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ pand xmm5,xmm6
+ mov edx,2
+ movsx edx,dx
+ movd xmm1,edx
+ movdqa xmm4,xmm1
+ punpcklwd xmm4,xmm1
+ pshufd xmm1,xmm4,0
+ movdqa xmm4,[esp+60h]
+ movdqa xmm6,xmm4
+ paddw xmm6,xmm4
+ paddw xmm6,xmm3
+ paddw xmm6,xmm7
+ movdqa [esp+10h],xmm1
+ paddw xmm6,[esp+10h]
+ psraw xmm6,2
+ movdqa xmm4,xmm0
+ pandn xmm4,xmm3
+ movdqa xmm3,[esp+40h]
+ movdqa xmm1,xmm0
+ pand xmm1,xmm6
+ por xmm1,xmm4
+ movdqa xmm6,xmm3
+ paddw xmm6,xmm3
+ movdqa xmm3,[esp+10h]
+ paddw xmm6,xmm2
+ paddw xmm6,[esp+20h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm4,xmm5
+ pand xmm4,xmm6
+ movdqa xmm6,xmm5
+ pandn xmm6,xmm2
+ por xmm4,xmm6
+ packuswb xmm1,xmm4
+ movdqa xmm4,[esp+50h]
+ movdqa xmm6,xmm7
+ paddw xmm6,xmm7
+ paddw xmm6,xmm4
+ paddw xmm6,[esp+60h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm2,xmm0
+ pand xmm2,xmm6
+ pandn xmm0,xmm4
+ por xmm2,xmm0
+ movdqa xmm0,[esp+20h]
+ movdqa xmm6,xmm0
+ paddw xmm6,xmm0
+ movdqa xmm0,[esp+30h]
+ paddw xmm6,xmm0
+ paddw xmm6,[esp+40h]
+ movdqa xmm4,xmm5
+ paddw xmm6,xmm3
+ movq [esi],xmm1
+ psraw xmm6,2
+ pand xmm4,xmm6
+ pandn xmm5,xmm0
+ por xmm4,xmm5
+ packuswb xmm2,xmm4
+ movq [eax],xmm2
+ psrldq xmm1,8
+ movq [edi],xmm1
+ pop edi
+ psrldq xmm2,8
+ movq [ecx],xmm2
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************
@@ -231,203 +231,203 @@
WELS_EXTERN DeblockChromaLt4V_sse2
DeblockChromaLt4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0E4h
- push ebx
- push esi
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0E4h
+ push ebx
+ push esi
mov esi, [ebp+1Ch] ; pTC
- movsx ebx, byte [esi+2]
- push edi
- movsx di,byte [esi+3]
- mov word [esp+0Ch],bx
- movsx bx,byte [esi+1]
- movsx esi,byte [esi]
- mov word [esp+0Eh],si
- movzx esi,di
- movd xmm1,esi
- movzx esi,di
- movd xmm2,esi
- mov si,word [esp+0Ch]
- mov edx, [ebp + 10h]
- mov eax, [ebp + 08h]
- movzx edi,si
- movzx esi,si
- mov ecx, [ebp + 0Ch]
- movd xmm4,esi
- movzx esi,bx
- movd xmm5,esi
- movd xmm3,edi
- movzx esi,bx
- movd xmm6,esi
- mov si,word [esp+0Eh]
- movzx edi,si
- movzx esi,si
- punpcklwd xmm6,xmm2
- pxor xmm0,xmm0
- movdqa [esp+40h],xmm0
- movd xmm7,edi
- movd xmm0,esi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+40h]
- punpcklwd xmm0,xmm4
- movq xmm4,[edx+ecx]
- punpcklwd xmm7,xmm3
- movq xmm3,[eax]
- punpcklwd xmm0,xmm6
- movq xmm6,[edi]
- punpcklwd xmm7,xmm5
- punpcklwd xmm0,xmm7
- mov edi,ecx
- sub edi,esi
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+60h],xmm2
- movq xmm2, [edi]
- punpcklqdq xmm6,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm7,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm2,[edi]
- punpcklqdq xmm7,xmm2
- movq xmm2,[ecx]
- punpcklqdq xmm3,xmm2
- movq xmm2,[edx+eax]
- movsx edx,word [ebp + 14h]
- punpcklqdq xmm2,xmm4
- movdqa [esp+0E0h],xmm2
- movd xmm2,edx
- movsx edx,word [ebp + 18h]
- movdqa xmm4,xmm2
- punpcklwd xmm4,xmm2
- movd xmm2,edx
- movdqa xmm5,xmm2
- punpcklwd xmm5,xmm2
- pshufd xmm2,xmm5,0
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- movdqa [esp+0D0h],xmm3
- pshufd xmm4,xmm4,0
- movdqa [esp+30h],xmm2
- punpckhbw xmm6,xmm1
- movdqa [esp+80h],xmm6
- movdqa xmm6,[esp+0D0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+70h],xmm6
- movdqa xmm6, [esp+0E0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+90h],xmm6
- movdqa xmm5, [esp+0E0h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0A0h],xmm7
- punpcklbw xmm3,xmm1
- mov edx,4
- punpcklbw xmm2,xmm1
- movsx edx,dx
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,[esp+30h]
- movdqa [esp+20h],xmm6
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa xmm1,[esp+60h]
- movdqa [esp+40h],xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6, [esp+20h]
- movdqa xmm7, [esp+50h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa [esp+10h],xmm0
- movdqa xmm6, [esp+10h]
- pminsw xmm6,xmm1
- movdqa [esp+10h],xmm6
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm6,xmm4
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+30h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1,[esp+50h]
- pand xmm6,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5,[esp+80h]
- psubw xmm5,[esp+90h]
- pand xmm6,xmm1
- pand xmm6,[esp+40h]
- movdqa xmm1,[esp+10h]
- pand xmm1,xmm6
- movdqa xmm6,[esp+70h]
- movdqa [esp+30h],xmm1
- movdqa xmm1,[esp+0A0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6,[esp+20h]
- movdqa xmm5,[esp+60h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+70h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+80h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+90h]
- pand xmm4,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+40h]
- pand xmm0,xmm4
- movdqa xmm4,[esp+30h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- packuswb xmm2,xmm1
- movq [esi],xmm2
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm3,xmm5
- movq [eax],xmm3
- psrldq xmm2,8
- movq [edi],xmm2
- pop edi
- pop esi
- psrldq xmm3,8
- movq [ecx],xmm3
- pop ebx
- mov esp,ebp
- pop ebp
- ret
-
+ movsx ebx, byte [esi+2]
+ push edi
+ movsx di,byte [esi+3]
+ mov word [esp+0Ch],bx
+ movsx bx,byte [esi+1]
+ movsx esi,byte [esi]
+ mov word [esp+0Eh],si
+ movzx esi,di
+ movd xmm1,esi
+ movzx esi,di
+ movd xmm2,esi
+ mov si,word [esp+0Ch]
+ mov edx, [ebp + 10h]
+ mov eax, [ebp + 08h]
+ movzx edi,si
+ movzx esi,si
+ mov ecx, [ebp + 0Ch]
+ movd xmm4,esi
+ movzx esi,bx
+ movd xmm5,esi
+ movd xmm3,edi
+ movzx esi,bx
+ movd xmm6,esi
+ mov si,word [esp+0Eh]
+ movzx edi,si
+ movzx esi,si
+ punpcklwd xmm6,xmm2
+ pxor xmm0,xmm0
+ movdqa [esp+40h],xmm0
+ movd xmm7,edi
+ movd xmm0,esi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ punpcklwd xmm0,xmm4
+ movq xmm4,[edx+ecx]
+ punpcklwd xmm7,xmm3
+ movq xmm3,[eax]
+ punpcklwd xmm0,xmm6
+ movq xmm6,[edi]
+ punpcklwd xmm7,xmm5
+ punpcklwd xmm0,xmm7
+ mov edi,ecx
+ sub edi,esi
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+60h],xmm2
+ movq xmm2, [edi]
+ punpcklqdq xmm6,xmm2
+ mov esi,eax
+ sub esi,edx
+ movq xmm7,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm2,[edi]
+ punpcklqdq xmm7,xmm2
+ movq xmm2,[ecx]
+ punpcklqdq xmm3,xmm2
+ movq xmm2,[edx+eax]
+ movsx edx,word [ebp + 14h]
+ punpcklqdq xmm2,xmm4
+ movdqa [esp+0E0h],xmm2
+ movd xmm2,edx
+ movsx edx,word [ebp + 18h]
+ movdqa xmm4,xmm2
+ punpcklwd xmm4,xmm2
+ movd xmm2,edx
+ movdqa xmm5,xmm2
+ punpcklwd xmm5,xmm2
+ pshufd xmm2,xmm5,0
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ movdqa [esp+0D0h],xmm3
+ pshufd xmm4,xmm4,0
+ movdqa [esp+30h],xmm2
+ punpckhbw xmm6,xmm1
+ movdqa [esp+80h],xmm6
+ movdqa xmm6,[esp+0D0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+70h],xmm6
+ movdqa xmm6, [esp+0E0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+90h],xmm6
+ movdqa xmm5, [esp+0E0h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0A0h],xmm7
+ punpcklbw xmm3,xmm1
+ mov edx,4
+ punpcklbw xmm2,xmm1
+ movsx edx,dx
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,[esp+30h]
+ movdqa [esp+20h],xmm6
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1,[esp+60h]
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6, [esp+20h]
+ movdqa xmm7, [esp+50h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa [esp+10h],xmm0
+ movdqa xmm6, [esp+10h]
+ pminsw xmm6,xmm1
+ movdqa [esp+10h],xmm6
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm6,xmm4
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+30h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1,[esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5,[esp+80h]
+ psubw xmm5,[esp+90h]
+ pand xmm6,xmm1
+ pand xmm6,[esp+40h]
+ movdqa xmm1,[esp+10h]
+ pand xmm1,xmm6
+ movdqa xmm6,[esp+70h]
+ movdqa [esp+30h],xmm1
+ movdqa xmm1,[esp+0A0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6,[esp+20h]
+ movdqa xmm5,[esp+60h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+70h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+80h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+90h]
+ pand xmm4,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+40h]
+ pand xmm0,xmm4
+ movdqa xmm4,[esp+30h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ packuswb xmm2,xmm1
+ movq [esi],xmm2
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm3,xmm5
+ movq [eax],xmm3
+ psrldq xmm2,8
+ movq [edi],xmm2
+ pop edi
+ pop esi
+ psrldq xmm3,8
+ movq [ecx],xmm3
+ pop ebx
+ mov esp,ebp
+ pop ebp
+ ret
+
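
Algorithmically, the function above is the H.264 weak (tc0-clipped)
chroma edge filter applied across eight pixel pairs at once: the
pmaxsw/pminsw pair clips the filter delta to [-tc, tc], and the pcmpgtw
masks gate filtering on the alpha/beta activity thresholds. The
per-pixel form, as a scalar sketch (names are illustrative; tc is the
already-derived clipping bound taken from pTC):

    #include <stdint.h>
    #include <stdlib.h>

    static int Clip3(int lo, int hi, int v) {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    /* p1 p0 | q0 q1 straddle the edge; negative tc disables filtering */
    static void ChromaEdgeLt4_ref(uint8_t *p1, uint8_t *p0,
                                  uint8_t *q0, uint8_t *q1,
                                  int tc, int alpha, int beta) {
        if (tc < 0)
            return;
        if (abs(*p0 - *q0) >= alpha ||
            abs(*p1 - *p0) >= beta || abs(*q1 - *q0) >= beta)
            return;                        /* edge not active */
        int delta = Clip3(-tc, tc,
                          ((*q0 - *p0) * 4 + (*p1 - *q1) + 4) >> 3);
        *p0 = (uint8_t)Clip3(0, 255, *p0 + delta);
        *q0 = (uint8_t)Clip3(0, 255, *q0 - delta);
    }
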
;***************************************************************************
-; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta)
;***************************************************************************
@@ -434,606 +434,606 @@
WELS_EXTERN DeblockChromaEq4H_sse2
ALIGN 16
-
+
DeblockChromaEq4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0C8h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+18h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+7Ch]
- push edi
- mov dword [esp+14h],esi
- mov dword [esp+18h],ecx
- mov dword [esp+0Ch],edx
- mov dword [esp+10h],eax
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+0Ch]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+10h]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- movsx ecx,word [ebp+14h]
- movsx edx,word [ebp+18h]
- movdqa xmm6,[esp+80h]
- movdqa xmm4,[esp+90h]
- movdqa xmm5,[esp+0A0h]
- movdqa xmm7,[esp+0B0h]
- pxor xmm0,xmm0
- movd xmm1,ecx
- movdqa xmm2,xmm1
- punpcklwd xmm2,xmm1
- pshufd xmm1,xmm2,0
- movd xmm2,edx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3,xmm6
- punpckhbw xmm6,xmm0
- movdqa [esp+60h],xmm6
- movdqa xmm6,[esp+90h]
- punpckhbw xmm6,xmm0
- movdqa [esp+30h],xmm6
- movdqa xmm6,[esp+0A0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+40h],xmm6
- movdqa xmm6,[esp+0B0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+70h],xmm6
- punpcklbw xmm7,xmm0
- punpcklbw xmm4,xmm0
- punpcklbw xmm5,xmm0
- punpcklbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm6,xmm4
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- movdqa xmm0,xmm1
- pcmpgtw xmm0,xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm4
- pabsw xmm6,xmm6
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+30h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pcmpgtw xmm1,xmm6
- movdqa xmm6,[esp+60h]
- psubw xmm6,[esp+30h]
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+70h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pand xmm1,xmm7
- pcmpgtw xmm2,xmm6
- pand xmm1,xmm2
- mov eax,2
- movsx ecx,ax
- movd xmm2,ecx
- movdqa xmm6,xmm2
- punpcklwd xmm6,xmm2
- pshufd xmm2,xmm6,0
- movdqa [esp+20h],xmm2
- movdqa xmm2,xmm3
- paddw xmm2,xmm3
- paddw xmm2,xmm4
- paddw xmm2,[esp+50h]
- paddw xmm2,[esp+20h]
- psraw xmm2,2
- movdqa xmm6,xmm0
- pand xmm6,xmm2
- movdqa xmm2,xmm0
- pandn xmm2,xmm4
- por xmm6,xmm2
- movdqa xmm2,[esp+60h]
- movdqa xmm7,xmm2
- paddw xmm7,xmm2
- paddw xmm7,[esp+30h]
- paddw xmm7,[esp+70h]
- paddw xmm7,[esp+20h]
- movdqa xmm4,xmm1
- movdqa xmm2,xmm1
- pandn xmm2,[esp+30h]
- psraw xmm7,2
- pand xmm4,xmm7
- por xmm4,xmm2
- movdqa xmm2,[esp+50h]
- packuswb xmm6,xmm4
- movdqa [esp+90h],xmm6
- movdqa xmm6,xmm2
- paddw xmm6,xmm2
- movdqa xmm2,[esp+20h]
- paddw xmm6,xmm5
- paddw xmm6,xmm3
- movdqa xmm4,xmm0
- pandn xmm0,xmm5
- paddw xmm6,xmm2
- psraw xmm6,2
- pand xmm4,xmm6
- por xmm4,xmm0
- movdqa xmm0,[esp+70h]
- movdqa xmm5,xmm0
- paddw xmm5,xmm0
- movdqa xmm0,[esp+40h]
- paddw xmm5,xmm0
- paddw xmm5,[esp+60h]
- movdqa xmm3,xmm1
- paddw xmm5,xmm2
- psraw xmm5,2
- pand xmm3,xmm5
- pandn xmm1,xmm0
- por xmm3,xmm1
- packuswb xmm4,xmm3
- movdqa [esp+0A0h],xmm4
- mov esi,dword [esp+10h]
- movdqa xmm0,[esi]
- movdqa xmm1,[esi+10h]
- movdqa xmm2,[esi+20h]
- movdqa xmm3,[esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+0Ch]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0C8h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+18h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+7Ch]
+ push edi
+ mov dword [esp+14h],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+0Ch],edx
+ mov dword [esp+10h],eax
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+0Ch]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+10h]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ movsx ecx,word [ebp+14h]
+ movsx edx,word [ebp+18h]
+ movdqa xmm6,[esp+80h]
+ movdqa xmm4,[esp+90h]
+ movdqa xmm5,[esp+0A0h]
+ movdqa xmm7,[esp+0B0h]
+ pxor xmm0,xmm0
+ movd xmm1,ecx
+ movdqa xmm2,xmm1
+ punpcklwd xmm2,xmm1
+ pshufd xmm1,xmm2,0
+ movd xmm2,edx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3,xmm6
+ punpckhbw xmm6,xmm0
+ movdqa [esp+60h],xmm6
+ movdqa xmm6,[esp+90h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+30h],xmm6
+ movdqa xmm6,[esp+0A0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,[esp+0B0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+70h],xmm6
+ punpcklbw xmm7,xmm0
+ punpcklbw xmm4,xmm0
+ punpcklbw xmm5,xmm0
+ punpcklbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm6,xmm4
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ movdqa xmm0,xmm1
+ pcmpgtw xmm0,xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm4
+ pabsw xmm6,xmm6
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+30h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pcmpgtw xmm1,xmm6
+ movdqa xmm6,[esp+60h]
+ psubw xmm6,[esp+30h]
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+70h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pand xmm1,xmm7
+ pcmpgtw xmm2,xmm6
+ pand xmm1,xmm2
+ mov eax,2
+ movsx ecx,ax
+ movd xmm2,ecx
+ movdqa xmm6,xmm2
+ punpcklwd xmm6,xmm2
+ pshufd xmm2,xmm6,0
+ movdqa [esp+20h],xmm2
+ movdqa xmm2,xmm3
+ paddw xmm2,xmm3
+ paddw xmm2,xmm4
+ paddw xmm2,[esp+50h]
+ paddw xmm2,[esp+20h]
+ psraw xmm2,2
+ movdqa xmm6,xmm0
+ pand xmm6,xmm2
+ movdqa xmm2,xmm0
+ pandn xmm2,xmm4
+ por xmm6,xmm2
+ movdqa xmm2,[esp+60h]
+ movdqa xmm7,xmm2
+ paddw xmm7,xmm2
+ paddw xmm7,[esp+30h]
+ paddw xmm7,[esp+70h]
+ paddw xmm7,[esp+20h]
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ pandn xmm2,[esp+30h]
+ psraw xmm7,2
+ pand xmm4,xmm7
+ por xmm4,xmm2
+ movdqa xmm2,[esp+50h]
+ packuswb xmm6,xmm4
+ movdqa [esp+90h],xmm6
+ movdqa xmm6,xmm2
+ paddw xmm6,xmm2
+ movdqa xmm2,[esp+20h]
+ paddw xmm6,xmm5
+ paddw xmm6,xmm3
+ movdqa xmm4,xmm0
+ pandn xmm0,xmm5
+ paddw xmm6,xmm2
+ psraw xmm6,2
+ pand xmm4,xmm6
+ por xmm4,xmm0
+ movdqa xmm0,[esp+70h]
+ movdqa xmm5,xmm0
+ paddw xmm5,xmm0
+ movdqa xmm0,[esp+40h]
+ paddw xmm5,xmm0
+ paddw xmm5,[esp+60h]
+ movdqa xmm3,xmm1
+ paddw xmm5,xmm2
+ psraw xmm5,2
+ pand xmm3,xmm5
+ pandn xmm1,xmm0
+ por xmm3,xmm1
+ packuswb xmm4,xmm3
+ movdqa [esp+0A0h],xmm4
+ mov esi,dword [esp+10h]
+ movdqa xmm0,[esi]
+ movdqa xmm1,[esi+10h]
+ movdqa xmm2,[esi+20h]
+ movdqa xmm3,[esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+0Ch]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
;*******************************************************************************
-; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************
-
+
WELS_EXTERN DeblockChromaLt4H_sse2
-
+
ALIGN 16
DeblockChromaLt4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,108h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+10h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+6Ch]
- push edi
- mov dword [esp+0Ch],esi
- mov dword [esp+18h],ecx
- mov dword [esp+10h],edx
- mov dword [esp+1Ch],eax
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+10h]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+1Ch]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- mov eax,dword [ebp+1Ch]
- movsx cx,byte [eax+3]
- movsx dx,byte [eax+2]
- movsx si,byte [eax+1]
- movsx ax,byte [eax]
- movzx edi,cx
- movzx ecx,cx
- movd xmm2,ecx
- movzx ecx,dx
- movzx edx,dx
- movd xmm3,ecx
- movd xmm4,edx
- movzx ecx,si
- movzx edx,si
- movd xmm5,ecx
- pxor xmm0,xmm0
- movd xmm6,edx
- movzx ecx,ax
- movdqa [esp+60h],xmm0
- movzx edx,ax
- movsx eax,word [ebp+14h]
- punpcklwd xmm6,xmm2
- movd xmm1,edi
- movd xmm7,ecx
- movsx ecx,word [ebp+18h]
- movd xmm0,edx
- punpcklwd xmm7,xmm3
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+60h]
- punpcklwd xmm7,xmm5
- movdqa xmm5,[esp+0A0h]
- punpcklwd xmm0,xmm4
- punpcklwd xmm0,xmm6
- movdqa xmm6, [esp+70h]
- punpcklwd xmm0,xmm7
- movdqa xmm7,[esp+80h]
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+0D0h],xmm2
- movd xmm2,eax
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm4,xmm3,0
- movd xmm2,ecx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3, [esp+90h]
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- movdqa [esp+40h],xmm2
- movdqa [esp+0B0h],xmm6
- movdqa xmm6,[esp+90h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm2,xmm1
- punpcklbw xmm3,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0F0h],xmm7
- movdqa [esp+0C0h],xmm6
- movdqa xmm6, [esp+0A0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+0E0h],xmm6
- mov edx,4
- movsx eax,dx
- movd xmm6,eax
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa [esp+30h],xmm6
- movdqa xmm7, [esp+40h]
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa [esp+60h],xmm6
- movdqa xmm1, [esp+0D0h]
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6,[esp+30h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa xmm7,[esp+50h]
- movdqa [esp+20h],xmm0
- movdqa xmm6, [esp+20h]
- pminsw xmm6,xmm1
- movdqa [esp+20h],xmm6
- movdqa xmm6,xmm4
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+40h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1, [esp+50h]
- pand xmm6,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5, [esp+0B0h]
- psubw xmm5,[esp+0E0h]
- pand xmm6,xmm1
- pand xmm6, [esp+60h]
- movdqa xmm1, [esp+20h]
- pand xmm1,xmm6
- movdqa xmm6, [esp+0C0h]
- movdqa [esp+40h],xmm1
- movdqa xmm1, [esp+0F0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6, [esp+30h]
- movdqa xmm5, [esp+0D0h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+0C0h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+0B0h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6, [esp+0E0h]
- pand xmm4,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+60h]
- pand xmm0,xmm4
- movdqa xmm4, [esp+40h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm2,xmm1
- packuswb xmm3,xmm5
- movdqa [esp+80h],xmm2
- movdqa [esp+90h],xmm3
- mov esi,dword [esp+1Ch]
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+10h]
- movdqa xmm2, [esi+20h]
- movdqa xmm3, [esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+10h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-
-
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,108h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+ lea esi,[eax+eax*2]
+ mov dword [esp+10h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+6Ch]
+ push edi
+ mov dword [esp+0Ch],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+10h],edx
+ mov dword [esp+1Ch],eax
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+10h]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov edi,dword [esp+1Ch]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ mov eax,dword [ebp+1Ch]
+ movsx cx,byte [eax+3]
+ movsx dx,byte [eax+2]
+ movsx si,byte [eax+1]
+ movsx ax,byte [eax]
+ movzx edi,cx
+ movzx ecx,cx
+ movd xmm2,ecx
+ movzx ecx,dx
+ movzx edx,dx
+ movd xmm3,ecx
+ movd xmm4,edx
+ movzx ecx,si
+ movzx edx,si
+ movd xmm5,ecx
+ pxor xmm0,xmm0
+ movd xmm6,edx
+ movzx ecx,ax
+ movdqa [esp+60h],xmm0
+ movzx edx,ax
+ movsx eax,word [ebp+14h]
+ punpcklwd xmm6,xmm2
+ movd xmm1,edi
+ movd xmm7,ecx
+ movsx ecx,word [ebp+18h]
+ movd xmm0,edx
+ punpcklwd xmm7,xmm3
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+60h]
+ punpcklwd xmm7,xmm5
+ movdqa xmm5,[esp+0A0h]
+ punpcklwd xmm0,xmm4
+ punpcklwd xmm0,xmm6
+ movdqa xmm6, [esp+70h]
+ punpcklwd xmm0,xmm7
+ movdqa xmm7,[esp+80h]
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+0D0h],xmm2
+ movd xmm2,eax
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm4,xmm3,0
+ movd xmm2,ecx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3, [esp+90h]
+ movdqa [esp+50h],xmm2
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa [esp+40h],xmm2
+ movdqa [esp+0B0h],xmm6
+ movdqa xmm6,[esp+90h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm2,xmm1
+ punpcklbw xmm3,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0F0h],xmm7
+ movdqa [esp+0C0h],xmm6
+ movdqa xmm6, [esp+0A0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+0E0h],xmm6
+ mov edx,4
+ movsx eax,dx
+ movd xmm6,eax
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa [esp+30h],xmm6
+ movdqa xmm7, [esp+40h]
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa [esp+60h],xmm6
+ movdqa xmm1, [esp+0D0h]
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6,[esp+30h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa xmm7,[esp+50h]
+ movdqa [esp+20h],xmm0
+ movdqa xmm6, [esp+20h]
+ pminsw xmm6,xmm1
+ movdqa [esp+20h],xmm6
+ movdqa xmm6,xmm4
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+40h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1, [esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5, [esp+0B0h]
+ psubw xmm5,[esp+0E0h]
+ pand xmm6,xmm1
+ pand xmm6, [esp+60h]
+ movdqa xmm1, [esp+20h]
+ pand xmm1,xmm6
+ movdqa xmm6, [esp+0C0h]
+ movdqa [esp+40h],xmm1
+ movdqa xmm1, [esp+0F0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6, [esp+30h]
+ movdqa xmm5, [esp+0D0h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+ movdqa xmm5,[esp+0C0h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+0B0h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6, [esp+0E0h]
+ pand xmm4,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+60h]
+ pand xmm0,xmm4
+ movdqa xmm4, [esp+40h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm2,xmm1
+ packuswb xmm3,xmm5
+ movdqa [esp+80h],xmm2
+ movdqa [esp+90h],xmm3
+ mov esi,dword [esp+1Ch]
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi+10h]
+ movdqa xmm2, [esi+20h]
+ movdqa xmm3, [esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+10h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+
+
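
For readers tracing the hunk above: it appears to be the transposed (horizontal-edge) chroma deblocking path for boundary strength < 4 — columns are gathered with movd/punpck, eight pixels are filtered per xmm register, and the result is transposed back. A minimal scalar sketch of the per-pixel filter that the pcmpgtw masks and the pmaxsw/pminsw clamp implement (names are illustrative; tc is the already-adjusted clipping value taken from pTC):

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of the bs<4 chroma filter vectorized above. */
    static void chroma_filter_lt4(uint8_t *p1, uint8_t *p0,
                                  uint8_t *q0, uint8_t *q1,
                                  int alpha, int beta, int tc) {
        if (tc <= 0) return;                    /* pcmpgtw tc > 0 mask */
        if (abs(*p0 - *q0) >= alpha ||
            abs(*p1 - *p0) >= beta ||
            abs(*q1 - *q0) >= beta) return;     /* alpha/beta edge masks */
        int d = (((*q0 - *p0) << 2) + (*p1 - *q1) + 4) >> 3; /* psllw/psraw */
        if (d < -tc) d = -tc;                   /* pmaxsw with -tc */
        if (d >  tc) d =  tc;                   /* pminsw with tc */
        int np0 = *p0 + d, nq0 = *q0 - d;
        *p0 = (uint8_t)(np0 < 0 ? 0 : np0 > 255 ? 255 : np0); /* packuswb */
        *q0 = (uint8_t)(nq0 < 0 ? 0 : nq0 > 255 ? 255 : nq0);
    }
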
;*******************************************************************************
-; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
; int32_t iBeta, int8_t * pTC)
;*******************************************************************************
-
+
WELS_EXTERN DeblockLumaLt4V_sse2
-
+
ALIGN 16
DeblockLumaLt4V_sse2:
@@ -1419,12 +1419,12 @@
;*******************************************************************************
-; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
; int32_t iBeta)
;*******************************************************************************
WELS_EXTERN DeblockLumaEq4V_sse2
-
+
ALIGN 16
DeblockLumaEq4V_sse2:
@@ -1965,11 +1965,11 @@
mov esp, ebp
pop ebp
ret
-
-
+
+
;********************************************************************************
;
-; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
;********************************************************************************
@@ -1982,49 +1982,49 @@
push ebx
mov ebp, esp
and esp,0FFFFFFF0h
- sub esp, 10h
-
- mov eax, [ebp + 0Ch]
+ sub esp, 10h
+
+ mov eax, [ebp + 0Ch]
mov ecx, [ebp + 10h]
lea edx, [eax + ecx * 8]
lea ebx, [ecx*3]
-
- movq xmm0, [eax]
+
+ movq xmm0, [eax]
movq xmm7, [edx]
- punpcklqdq xmm0, xmm7
+ punpcklqdq xmm0, xmm7
movq xmm1, [eax + ecx]
movq xmm7, [edx + ecx]
punpcklqdq xmm1, xmm7
- movq xmm2, [eax + ecx*2]
+ movq xmm2, [eax + ecx*2]
movq xmm7, [edx + ecx*2]
punpcklqdq xmm2, xmm7
movq xmm3, [eax + ebx]
movq xmm7, [edx + ebx]
punpcklqdq xmm3, xmm7
-
+
lea eax, [eax + ecx * 4]
lea edx, [edx + ecx * 4]
- movq xmm4, [eax]
+ movq xmm4, [eax]
movq xmm7, [edx]
- punpcklqdq xmm4, xmm7
+ punpcklqdq xmm4, xmm7
movq xmm5, [eax + ecx]
movq xmm7, [edx + ecx]
punpcklqdq xmm5, xmm7
- movq xmm6, [eax + ecx*2]
+ movq xmm6, [eax + ecx*2]
movq xmm7, [edx + ecx*2]
punpcklqdq xmm6, xmm7
-
+
movdqa [esp], xmm0
movq xmm7, [eax + ebx]
movq xmm0, [edx + ebx]
punpcklqdq xmm7, xmm0
movdqa xmm0, [esp]
-
+
SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
+
mov eax, [ebp + 14h]
- movdqa [eax], xmm4
+ movdqa [eax], xmm4
movdqa [eax + 10h], xmm2
movdqa [eax + 20h], xmm3
movdqa [eax + 30h], xmm7
@@ -2031,15 +2031,15 @@
movdqa [eax + 40h], xmm5
movdqa [eax + 50h], xmm1
movdqa [eax + 60h], xmm6
- movdqa [eax + 70h], xmm0
-
+ movdqa [eax + 70h], xmm0
+
mov esp, ebp
pop ebx
pop ebp
ret
-
-
-
+
+
+
;*******************************************************************************************
;
; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
@@ -2053,14 +2053,14 @@
DeblockLumaTransposeV2H_sse2:
push ebp
mov ebp, esp
-
+
and esp, 0FFFFFFF0h
- sub esp, 10h
-
- mov eax, [ebp + 10h]
+ sub esp, 10h
+
+ mov eax, [ebp + 10h]
mov ecx, [ebp + 0Ch]
mov edx, [ebp + 08h]
-
+
movdqa xmm0, [eax]
movdqa xmm1, [eax + 10h]
movdqa xmm2, [eax + 20h]
@@ -2069,23 +2069,23 @@
movdqa xmm5, [eax + 50h]
movdqa xmm6, [eax + 60h]
movdqa xmm7, [eax + 70h]
-
+
SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
+
lea eax, [ecx * 3]
-
- movq [edx], xmm4
+
+ movq [edx], xmm4
movq [edx + ecx], xmm2
movq [edx + ecx*2], xmm3
movq [edx + eax], xmm7
-
+
lea edx, [edx + ecx*4]
- movq [edx], xmm5
+ movq [edx], xmm5
movq [edx + ecx], xmm1
movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
+ movq [edx + eax], xmm0
+
psrldq xmm4, 8
psrldq xmm2, 8
psrldq xmm3, 8
@@ -2094,20 +2094,20 @@
psrldq xmm1, 8
psrldq xmm6, 8
psrldq xmm0, 8
-
+
lea edx, [edx + ecx*4]
- movq [edx], xmm4
+ movq [edx], xmm4
movq [edx + ecx], xmm2
movq [edx + ecx*2], xmm3
movq [edx + eax], xmm7
-
+
lea edx, [edx + ecx*4]
- movq [edx], xmm5
+ movq [edx], xmm5
movq [edx + ecx], xmm1
movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
-
+ movq [edx + eax], xmm0
+
+
mov esp, ebp
pop ebp
ret
\ No newline at end of file
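
The two transpose helpers above exist so the vertical-edge filter can be reused on horizontal data: rows are gathered, turned into columns by the punpckl/punpckh ladder inside SSE2_TransTwo8x8B, filtered, and written back transposed. A scalar sketch of the effect (illustrative only):

    #include <stdint.h>

    /* What the punpck* ladder computes: an 8x8 byte transpose. */
    static void transpose8x8(uint8_t *dst, int dst_stride,
                             const uint8_t *src, int src_stride) {
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                dst[x * dst_stride + y] = src[y * src_stride + x];
    }
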
--- a/codec/encoder/core/asm/expand_picture.asm
+++ b/codec/encoder/core/asm/expand_picture.asm
@@ -153,11 +153,11 @@
lea %1, [%1+%2]
%endmacro
-%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
+%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
; ebx [width/16(8)]
; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16) ; top
; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16) ; bottom
-
+
%if %1 == 32 ; for luma
sar ebx, 04h ; width / 16(8) pixels
.top_bottom_loops:
@@ -171,7 +171,7 @@
mov_line_16x4_sse2 edi, ecx, xmm0, a
mov_line_16x4_sse2 edi, ecx, xmm0, a
mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
+
; bottom
movdqa xmm1, [eax] ; last line of picture pData
mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
@@ -182,15 +182,15 @@
mov_line_16x4_sse2 ebp, ecx, xmm1, a
mov_line_16x4_sse2 ebp, ecx, xmm1, a
mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
+
lea esi, [esi+16] ; top pSrc
lea edi, [edi+16] ; top dst
lea eax, [eax+16] ; bottom pSrc
lea ebp, [ebp+16] ; bottom dst
- neg ecx ; positive/negative stride need for next loop?
-
+ neg ecx ; positive/negative stride need for next loop?
+
dec ebx
- jnz near .top_bottom_loops
+ jnz near .top_bottom_loops
%elif %1 == 16 ; for chroma ??
mov edx, ebx
sar ebx, 04h ; (width / 16) pixels
@@ -200,21 +200,21 @@
mov_line_16x4_sse2 edi, ecx, xmm0, a ; dst, stride, xmm?
mov_line_16x4_sse2 edi, ecx, xmm0, a
mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
+ mov_line_end16x4_sse2 edi, ecx, xmm0, a
+
; bottom
movdqa xmm1, [eax] ; last line of picture pData
mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
mov_line_16x4_sse2 ebp, ecx, xmm1, a
mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
+ mov_line_end16x4_sse2 ebp, ecx, xmm1, a
+
lea esi, [esi+16] ; top pSrc
lea edi, [edi+16] ; top dst
lea eax, [eax+16] ; bottom pSrc
lea ebp, [ebp+16] ; bottom dst
- neg ecx ; positive/negative stride need for next loop?
-
+ neg ecx ; positive/negative stride need for next loop?
+
dec ebx
jnz near .top_bottom_loops
@@ -241,13 +241,13 @@
%endif
%endmacro
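
The macro replicates the first and last picture rows into the padding band; negating the stride lets the same mov_line_16x4_sse2 sequence walk upward for the top border and downward for the bottom one. In scalar form (a sketch, with hypothetical names):

    #include <stdint.h>
    #include <string.h>

    /* Scalar model of exp_top_bottom_sse2: pad = 32 (luma) or 16 (chroma). */
    static void expand_top_bottom(uint8_t *pic, int width, int height,
                                  int stride, int pad) {
        for (int i = 1; i <= pad; i++) {
            memcpy(pic - i * stride, pic, width);           /* top band */
            memcpy(pic + (height - 1 + i) * stride,
                   pic + (height - 1) * stride, width);     /* bottom band */
        }
    }
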
-%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
+%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
; ecx [height]
; esi [pSrc+0], edi [pSrc-32], edx [stride], 32(16) ; left
; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16) ; right
; xor eax, eax ; for pixel pData (uint8_t) ; make sure eax=0 at least high 24 bits of eax = 0
-
-%if %1 == 32 ; for luma
+
+%if %1 == 32 ; for luma
.left_right_loops:
; left
mov al, byte [esi] ; pixel pData for left border
@@ -254,37 +254,37 @@
butterfly_1to16_sse xmm0, xmm1, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [edi], xmm0
movdqa [edi+16], xmm0
-
+
; right
mov al, byte [ebx]
butterfly_1to16_sse xmm1, xmm2, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdqa [ebp], xmm1
movdqa [ebp+16], xmm1
-
+
lea esi, [esi+edx] ; left pSrc
lea edi, [edi+edx] ; left dst
lea ebx, [ebx+edx] ; right pSrc
- lea ebp, [ebp+edx] ; right dst
-
+ lea ebp, [ebp+edx] ; right dst
+
dec ecx
- jnz near .left_right_loops
-%elif %1 == 16 ; for chroma ??
+ jnz near .left_right_loops
+%elif %1 == 16 ; for chroma ??
.left_right_loops:
; left
mov al, byte [esi] ; pixel pData for left border
butterfly_1to16_sse xmm0, xmm1, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [edi], xmm0
-
+ movdqa [edi], xmm0
+
; right
mov al, byte [ebx]
butterfly_1to16_sse xmm1, xmm2, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
movdq%2 [ebp], xmm1 ; might not be aligned 16 bytes in case chroma planes
-
+
lea esi, [esi+edx] ; left pSrc
lea edi, [edi+edx] ; left dst
lea ebx, [ebx+edx] ; right pSrc
- lea ebp, [ebp+edx] ; right dst
-
+ lea ebp, [ebp+edx] ; right dst
+
dec ecx
jnz near .left_right_loops
%endif
@@ -337,25 +337,25 @@
; TL
mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
mov_line_end16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
; TR
mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
mov_line_end16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
; BL
mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
+ mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
mov_line_end16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
; BR
mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
+ mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
mov_line_end16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
%endif
%endmacro
@@ -373,7 +373,7 @@
push esi
push edi
push ebp
-
+
; for both top and bottom border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -385,10 +385,10 @@
mov cl, byte [esi]
butterfly_1to16_sse xmm3, xmm4, c ; dst, tmp, pSrc [generic register name: a/b/c/d]
; load top border
- mov ecx, edx ; stride
+ mov ecx, edx ; stride
neg ecx ; -stride
lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
+ ; load bottom border
dec eax ; h-1
imul eax, edx ; (h-1)*stride
lea eax, [esi+eax] ; last line of picture pData
@@ -396,16 +396,16 @@
lea ebp, [eax+edx] ; last line of bottom border, (h-1)*stride + 32 * stride
; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
dec ebx ; width-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
+ lea ebx, [eax+ebx] ; dst[w-1][h-1]
; xor edx, edx
mov dl, byte [eax] ; bottom-left
butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
mov dl, byte [ebx] ; bottom-right
butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
+ ; for top & bottom expanding
mov ebx, [esp+32] ; width
- exp_top_bottom_sse2 32
-
+ exp_top_bottom_sse2 32
+
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst: left border pSrc
@@ -417,7 +417,7 @@
lea edi, [esi+eax] ; left border dst
dec ebx
lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
+ lea ebp, [ebx+1] ; right border dst
; prepare for cross border pData: top-right with xmm4
; xor eax, eax
mov al, byte [ebx] ; top-right
@@ -424,7 +424,7 @@
butterfly_1to16_sse xmm4, xmm0, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
; for left & right border expanding
exp_left_right_sse2 32, a
-
+
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -434,7 +434,7 @@
	; xmm3..xmm6 cross-border pData were initialized above; perform padding below
mov eax, -32 ; luma=-32, chroma=-16
neg ecx ; -stride
- lea edi, [esi+eax]
+ lea edi, [esi+eax]
lea edi, [edi+ecx] ; last line of top-left border
lea ebp, [esi+ebx]
lea ebp, [ebp+ecx] ; last line of top-right border
@@ -442,19 +442,19 @@
mov ecx, [esp+28] ; stride
imul edx, ecx ; (height+32(16)) * stride
lea eax, [edi+edx] ; last line of bottom-left border
- lea ebx, [ebp+edx] ; last line of bottom-right border
+ lea ebx, [ebp+edx] ; last line of bottom-right border
neg ecx ; -stride
; for left & right border expanding
- exp_cross_sse2 32, a
-
+ exp_cross_sse2 32, a
+
; sfence ; commit cache write back memory
-
+
pop ebp
pop edi
pop esi
pop edx
pop ebx
-
+
ret
ALIGN 16
@@ -470,7 +470,7 @@
push esi
push edi
push ebp
-
+
; for both top and bottom border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -482,10 +482,10 @@
mov cl, byte [esi]
butterfly_1to16_sse xmm3, xmm4, c ; dst, tmp, pSrc [generic register name: a/b/c/d]
; load top border
- mov ecx, edx ; stride
+ mov ecx, edx ; stride
neg ecx ; -stride
lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
+ ; load bottom border
dec eax ; h-1
imul eax, edx ; (h-1)*stride
lea eax, [esi+eax] ; last line of picture pData
@@ -493,16 +493,16 @@
lea ebp, [eax+edx] ; last line of bottom border, (h-1)*stride + 16 * stride
; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
dec ebx ; width-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
+ lea ebx, [eax+ebx] ; dst[w-1][h-1]
; xor edx, edx
mov dl, byte [eax] ; bottom-left
butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
mov dl, byte [ebx] ; bottom-right
butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
+ ; for top & bottom expanding
mov ebx, [esp+32] ; width
- exp_top_bottom_sse2 16
-
+ exp_top_bottom_sse2 16
+
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst: left border pSrc
@@ -514,7 +514,7 @@
lea edi, [esi+eax] ; left border dst
dec ebx
lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
+ lea ebp, [ebx+1] ; right border dst
; prepare for cross border pData: top-right with xmm4
; xor eax, eax
mov al, byte [ebx] ; top-right
@@ -521,7 +521,7 @@
butterfly_1to16_sse xmm4, xmm0, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
; for left & right border expanding
exp_left_right_sse2 16, a
-
+
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -531,9 +531,9 @@
	; xmm3..xmm6 cross-border pData were initialized above; perform padding below
mov eax, -16 ; chroma=-16
neg ecx ; -stride
- lea edi, [esi+eax]
+ lea edi, [esi+eax]
lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
+ lea ebp, [esi+ebx]
lea ebp, [ebp+ecx] ; last line of top-right border
mov ecx, [esp+28] ; stride
add edx, 16 ; height+16, luma=32, chroma=16
@@ -543,15 +543,15 @@
neg ecx ; -stride
; for left & right border expanding
exp_cross_sse2 16, a
-
+
; sfence ; commit cache write back memory
-
+
pop ebp
pop edi
pop esi
pop edx
pop ebx
-
+
ret
ALIGN 16
@@ -567,7 +567,7 @@
push esi
push edi
push ebp
-
+
; for both top and bottom border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -579,10 +579,10 @@
mov cl, byte [esi]
butterfly_1to16_sse xmm3, xmm4, c ; dst, tmp, pSrc [generic register name: a/b/c/d]
; load top border
- mov ecx, edx ; stride
+ mov ecx, edx ; stride
neg ecx ; -stride
lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
+ ; load bottom border
dec eax ; h-1
imul eax, edx ; (h-1)*stride
lea eax, [esi+eax] ; last line of picture pData
@@ -590,16 +590,16 @@
lea ebp, [eax+edx] ; last line of bottom border, (h-1)*stride + 16 * stride
; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
dec ebx ; width-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
+ lea ebx, [eax+ebx] ; dst[w-1][h-1]
; xor edx, edx
mov dl, byte [eax] ; bottom-left
butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
mov dl, byte [ebx] ; bottom-right
butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
+ ; for top & bottom expanding
mov ebx, [esp+32] ; width
- exp_top_bottom_sse2 16
-
+ exp_top_bottom_sse2 16
+
; for both left and right border
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst: left border pSrc
@@ -611,7 +611,7 @@
lea edi, [esi+eax] ; left border dst
dec ebx
lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
+ lea ebp, [ebx+1] ; right border dst
; prepare for cross border pData: top-right with xmm4
; xor eax, eax
mov al, byte [ebx] ; top-right
@@ -618,7 +618,7 @@
butterfly_1to16_sse xmm4, xmm0, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
; for left & right border expanding
exp_left_right_sse2 16, u
-
+
; for cross border [top-left, top-right, bottom-left, bottom-right]
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
mov esi, [esp+24] ; p_dst
@@ -628,9 +628,9 @@
	; xmm3..xmm6 cross-border pData were initialized above; perform padding below
neg ecx ; -stride
mov eax, -16 ; chroma=-16
- lea edi, [esi+eax]
+ lea edi, [esi+eax]
lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
+ lea ebp, [esi+ebx]
lea ebp, [ebp+ecx] ; last line of top-right border
mov ecx, [esp+28] ; stride
add edx, 16 ; height+16, luma=32, chroma=16
@@ -640,14 +640,14 @@
neg ecx ; -stride
; for left & right border expanding
exp_cross_sse2 16, u
-
+
; sfence ; commit cache write back memory
-
+
pop ebp
pop edi
pop esi
pop edx
pop ebx
-
+
ret
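
Across all three expand routines the recipe is the same: smear the first and last rows vertically, smear each row's edge pixel sideways (butterfly_1to16_sse builds the 16-byte splat that the stores then write), and fill the four corners from the corner pixels via exp_cross_sse2. A scalar model of the left/right step (illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Scalar model of exp_left_right_sse2; exp_cross_sse2 extends the
       same idea to the four pad x pad corner blocks. */
    static void expand_left_right(uint8_t *pic, int width, int height,
                                  int stride, int pad) {
        for (int y = 0; y < height; y++) {
            uint8_t *row = pic + y * stride;
            memset(row - pad, row[0], pad);            /* left border */
            memset(row + width, row[width - 1], pad);  /* right border */
        }
    }
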
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -95,13 +95,13 @@
punpcklbw %1, %3
movdqa %3, %1
punpcklbw %1, %3
-
+
;add %4, %5
movd %2, [%4+%5-1]
movdqa %3, %2
punpcklbw %2, %3
movdqa %3, %2
- punpcklbw %2, %3
+ punpcklbw %2, %3
punpckldq %1, %2
%endmacro
@@ -126,24 +126,24 @@
movd %2, [%5+%6]
punpcklbw %3, %2
punpcklwd %1, %3
- lea %5, [%5+2*%6]
+ lea %5, [%5+2*%6]
movd %4, [%5]
movd %2, [%5+%6]
punpcklbw %4, %2
- lea %5, [%5+2*%6]
+ lea %5, [%5+2*%6]
movd %3, [%5]
movd %2, [%5+%6]
lea %5, [%5+2*%6]
punpcklbw %3, %2
punpcklwd %4, %3
- punpckhdq %1, %4
-%endmacro
+ punpckhdq %1, %4
+%endmacro
%macro SUMW_HORIZON 3
movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
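
SUMW_HORIZON folds the register onto itself until a single lane holds the total; the lane comments track what is live after each step. For the value ranges used here, its scalar meaning is simply:

    #include <stdint.h>

    /* Scalar equivalent of SUMW_HORIZON: sum eight signed words. */
    static int32_t sum8_words(const int16_t v[8]) {
        int32_t s = 0;
        for (int i = 0; i < 8; i++)
            s += v[i];
        return s;
    }
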
@@ -173,7 +173,7 @@
movd %2, [%5+%6]
punpcklbw %3, %2
punpckhwd %1, %3
- lea %5, [%5+2*%6]
+ lea %5, [%5+2*%6]
%endmacro
%macro LOAD_2_LEFT_AND_ADD 0
@@ -197,7 +197,7 @@
ALIGN 16
;***********************************************************************
; void __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;
+;
; pred must align to 16
;***********************************************************************
WelsI4x4LumaPredH_sse2:
@@ -207,11 +207,11 @@
movzx edx, byte [eax-1]
movd xmm0, edx
pmuludq xmm0, [mmx_01bytes]
-
+
movzx edx, byte [eax+ecx-1]
movd xmm1, edx
pmuludq xmm1, [mmx_01bytes]
-
+
unpcklps xmm0, xmm1
lea eax, [eax+ecx*2]
@@ -218,19 +218,19 @@
movzx edx, byte [eax-1]
movd xmm2, edx
pmuludq xmm2, [mmx_01bytes]
-
+
movzx edx, byte [eax+ecx-1]
- movd xmm3, edx
+ movd xmm3, edx
pmuludq xmm3, [mmx_01bytes]
-
+
unpcklps xmm2, xmm3
unpcklpd xmm0, xmm2
-
+
mov edx, [esp+4] ;pred
movdqa [edx], xmm0
-
+
ret
-
+
;***********************************************************************
; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
@@ -241,9 +241,9 @@
mov ecx, [esp + pushsize + 12]
sub esi, 1
sub esi, ecx
-
+
;for H
- pxor xmm7, xmm7
+ pxor xmm7, xmm7
movq xmm0, [esi]
movdqa xmm5, [sse2_plane_dec]
punpcklbw xmm0, xmm7
@@ -253,7 +253,7 @@
punpcklbw xmm1, xmm7
pmullw xmm1, xmm6
psubw xmm1, xmm0
-
+
SUMW_HORIZON xmm1,xmm0,xmm2
movd eax, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
movsx eax, ax
@@ -261,26 +261,26 @@
add eax, 32
sar eax, 6 ; b = (5 * H + 32) >> 6;
SSE2_Copy8Times xmm1, eax ; xmm1 = b,b,b,b,b,b,b,b
-
- movzx edx, BYTE [esi+16]
+
+ movzx edx, BYTE [esi+16]
sub esi, 3
LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, esi, ecx
-
+
add esi, 3
movzx eax, BYTE [esi+8*ecx]
add edx, eax
shl edx, 4 ; a = (left[15*stride] + top[15]) << 4;
-
+
sub esi, 3
add esi, ecx
LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, esi, ecx
- pxor xmm4, xmm4
+ pxor xmm4, xmm4
punpckhbw xmm0, xmm4
pmullw xmm0, xmm5
punpckhbw xmm7, xmm4
pmullw xmm7, xmm6
psubw xmm7, xmm0
-
+
SUMW_HORIZON xmm7,xmm0,xmm2
movd eax, xmm7 ; V
movsx eax, ax
@@ -288,17 +288,17 @@
imul eax, 5
add eax, 32
sar eax, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, eax ; xmm4 = c,c,c,c,c,c,c,c
-
+ SSE2_Copy8Times xmm4, eax ; xmm4 = c,c,c,c,c,c,c,c
+
mov esi, [esp + pushsize + 4]
add edx, 16
imul eax, -7
- add edx, eax ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
-
+ add edx, eax ; s = a + 16 + (-7)*c
+ SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
+
xor eax, eax
movdqa xmm5, [sse2_plane_inc_minus]
-
+
get_i16x16_luma_pred_plane_sse2_1:
movdqa xmm2, xmm1
pmullw xmm2, xmm5
@@ -307,7 +307,7 @@
movdqa xmm3, xmm1
pmullw xmm3, xmm6
paddw xmm3, xmm0
- psraw xmm3, 5
+ psraw xmm3, 5
packuswb xmm2, xmm3
movdqa [esi], xmm2
paddw xmm0, xmm4
@@ -314,13 +314,13 @@
add esi, 16
inc eax
cmp eax, 16
- jnz get_i16x16_luma_pred_plane_sse2_1
-
+ jnz get_i16x16_luma_pred_plane_sse2_1
+
pop esi
ret
-
-
-
+
+
+
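
The plane predictor computes the H/V gradients and the a/b/c parameters named in the comments, then streams 16 rows with two pmullw/psraw passes per row. A scalar sketch of the formula being vectorized (clipping matches packuswb); the chroma version later in this file is the same shape on an 8x8 block with (17*H+16)>>5 and (17*V+16)>>5 scaling:

    #include <stdint.h>

    /* Scalar model: top points at the row above the block (top[-1] must be
       the corner sample), left at the column to its left, stepped by stride;
       pred is 16x16 with stride 16. */
    static void pred16x16_plane(uint8_t *pred, const uint8_t *top,
                                const uint8_t *left, int stride) {
        int H = 0, V = 0;
        for (int i = 0; i < 8; i++) {
            H += (i + 1) * (top[8 + i] - top[6 - i]);
            V += (i + 1) * (left[(8 + i) * stride] - left[(6 - i) * stride]);
        }
        int a = (left[15 * stride] + top[15]) << 4;
        int b = (5 * H + 32) >> 6;
        int c = (5 * V + 32) >> 6;
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++) {
                int s = (a + b * (x - 7) + c * (y - 7) + 16) >> 5;
                pred[y * 16 + x] = (uint8_t)(s < 0 ? 0 : s > 255 ? 255 : s);
            }
    }
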
;***********************************************************************
; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
@@ -327,7 +327,7 @@
%macro SSE2_PRED_H_16X16_TWO_LINE 1
lea eax, [eax+ecx*2]
-
+
COPY_16_TIMES eax, xmm0
movdqa [edx+%1], xmm0
COPY_16_TIMESS eax, xmm0, ecx
@@ -340,13 +340,13 @@
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
-
+
COPY_16_TIMES eax, xmm0
movdqa [edx], xmm0
COPY_16_TIMESS eax, xmm0, ecx
movdqa [edx+0x10], xmm0
-
- SSE2_PRED_H_16X16_TWO_LINE 0x20
+
+ SSE2_PRED_H_16X16_TWO_LINE 0x20
SSE2_PRED_H_16X16_TWO_LINE 0x40
SSE2_PRED_H_16X16_TWO_LINE 0x60
SSE2_PRED_H_16X16_TWO_LINE 0x80
@@ -353,9 +353,9 @@
SSE2_PRED_H_16X16_TWO_LINE 0xa0
SSE2_PRED_H_16X16_TWO_LINE 0xc0
SSE2_PRED_H_16X16_TWO_LINE 0xe0
-
+
ret
-
+
;***********************************************************************
; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
@@ -364,10 +364,10 @@
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
-
+
sub eax, ecx
movdqa xmm0, [eax]
-
+
movdqa [edx], xmm0
movdqa [edx+10h], xmm0
movdqa [edx+20h], xmm0
@@ -378,15 +378,15 @@
movdqa [edx+70h], xmm0
movdqa [edx+80h], xmm0
movdqa [edx+90h], xmm0
- movdqa [edx+160], xmm0
+ movdqa [edx+160], xmm0
movdqa [edx+176], xmm0
movdqa [edx+192], xmm0
movdqa [edx+208], xmm0
movdqa [edx+224], xmm0
movdqa [edx+240], xmm0
-
+
ret
-
+
;***********************************************************************
; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
@@ -398,8 +398,8 @@
mov ecx, [esp + pushsize + 12] ;stride
sub esi, 1
sub esi, ecx
-
- pxor mm7, mm7
+
+ pxor mm7, mm7
movq mm0, [esi]
movq mm5, [sse2_plane_dec_c]
punpcklbw mm0, mm7
@@ -409,7 +409,7 @@
punpcklbw mm1, mm7
pmullw mm1, mm6
psubw mm1, mm0
-
+
movq2dq xmm1, mm1
pxor xmm2, xmm2
SUMW_HORIZON xmm1,xmm0,xmm2
@@ -419,7 +419,7 @@
add eax, 16
sar eax, 5 ; b = (17 * H + 16) >> 5;
SSE2_Copy8Times xmm1, eax ; mm1 = b,b,b,b,b,b,b,b
-
+
movzx edx, BYTE [esi+8]
sub esi, 3
LOAD_COLUMN_C mm0, mm2, mm3, mm4, esi, ecx
@@ -428,17 +428,17 @@
movzx eax, BYTE [esi+4*ecx]
add edx, eax
shl edx, 4 ; a = (left[7*stride] + top[7]) << 4;
-
+
sub esi, 3
add esi, ecx
LOAD_COLUMN_C mm7, mm2, mm3, mm4, esi, ecx
- pxor mm4, mm4
+ pxor mm4, mm4
punpckhbw mm0, mm4
pmullw mm0, mm5
punpckhbw mm7, mm4
pmullw mm7, mm6
psubw mm7, mm0
-
+
movq2dq xmm7, mm7
pxor xmm2, xmm2
SUMW_HORIZON xmm7,xmm0,xmm2
@@ -448,17 +448,17 @@
imul eax, 17
add eax, 16
sar eax, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, eax ; mm4 = c,c,c,c,c,c,c,c
-
+ SSE2_Copy8Times xmm4, eax ; mm4 = c,c,c,c,c,c,c,c
+
mov esi, [esp + pushsize + 4]
add edx, 16
imul eax, -3
- add edx, eax ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
-
+ add edx, eax ; s = a + 16 + (-3)*c
+ SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
+
xor eax, eax
movdqa xmm5, [sse2_plane_mul_b_c]
-
+
get_i_chroma_pred_plane_sse2_1:
movdqa xmm2, xmm1
pmullw xmm2, xmm5
@@ -470,12 +470,12 @@
add esi, 8
inc eax
cmp eax, 8
- jnz get_i_chroma_pred_plane_sse2_1
-
+ jnz get_i_chroma_pred_plane_sse2_1
+
pop esi
WELSEMMS
- ret
-
+ ret
+
ALIGN 16
;***********************************************************************
; 0 |1 |2 |3 |4 |
@@ -487,13 +487,13 @@
; pred[7] = ([6]+[0]*2+[1]+2)/4
;
; void __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;
+;
;***********************************************************************
-WelsI4x4LumaPredDDR_mmx:
+WelsI4x4LumaPredDDR_mmx:
mov edx,[esp+4] ;pred
mov eax,[esp+8] ;pRef
mov ecx,[esp+12] ;stride
-
+
	movq mm1,[eax+ecx-8]	;load pixel 11; reading 8 bytes back keeps it in mm1's top byte: mm1[8] = 11
movq mm2,[eax-8] ;get value of 6 mm2[8] = 6
sub eax, ecx ;mov eax to above line of current block(postion of 1)
@@ -520,17 +520,17 @@
pand mm1,[mmx_01bytes] ;set the odd bit
psubusb mm3,mm1 ;decrease 1 from odd bytes
pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
-
- movd [edx+12],mm2
- psrlq mm2,8
- movd [edx+8],mm2
- psrlq mm2,8
- movd [edx+4],mm2
- psrlq mm2,8
+
+ movd [edx+12],mm2
+ psrlq mm2,8
+ movd [edx+8],mm2
+ psrlq mm2,8
+ movd [edx+4],mm2
+ psrlq mm2,8
movd [edx],mm2
WELSEMMS
ret
-
+
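
Every directional 4x4 predictor in this file (DDR here, then HD, HU, VR, DDL, VL) is assembled from one 3-tap rounding filter; the pavgb/pxor/pand [mmx_01bytes]/psubusb dance computes it without widening to words, because pavgb rounds upward and the odd-bit correction cancels the surplus rounding. A hedged scalar sketch of the filter and the DDR fill (the edge layout is illustrative):

    #include <stdint.h>

    /* The 3-tap filter behind all the diagonal modes. */
    static uint8_t f3(int a, int b, int c) {
        return (uint8_t)((a + 2 * b + c + 2) >> 2);
    }

    /* Diagonal-down-right. edge[] = { l3, l2, l1, l0, lt, t0, t1, t2, t3 }:
       left column bottom-up, the corner, then the top row. */
    static void pred4x4_ddr(uint8_t pred[16], const uint8_t edge[9]) {
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++) {
                int i = 4 + x - y;   /* which diagonal ray (x,y) sits on */
                pred[y * 4 + x] = f3(edge[i - 1], edge[i], edge[i + 1]);
            }
    }
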
ALIGN 16
;***********************************************************************
; 0 |1 |2 |3 |4 |
@@ -542,44 +542,44 @@
; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
;
; void __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;
+;
;***********************************************************************
-WelsI4x4LumaPredDc_sse2:
+WelsI4x4LumaPredDc_sse2:
mov eax,[esp+8] ;pRef
mov ecx,[esp+12] ;stride
push ebx
-
+
movzx edx, byte [eax-1h]
-
+
sub eax, ecx
movd xmm0, [eax]
pxor xmm1, xmm1
psadbw xmm0, xmm1
-
+
movd ebx, xmm0
add ebx, edx
-
+
movzx edx, byte [eax+ecx*2-1h]
add ebx, edx
-
+
lea eax, [eax+ecx*2-1]
movzx edx, byte [eax+ecx]
add ebx, edx
-
+
movzx edx, byte [eax+ecx*2]
add ebx, edx
add ebx, 4
sar ebx, 3
imul ebx, 0x01010101
-
+
mov edx, [esp+8] ;pred
movd xmm0, ebx
pshufd xmm0, xmm0, 0
movdqa [edx], xmm0
-
+
pop ebx
- ret
-
+ ret
+
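
DC prediction is the sum-and-shift spelled out in the comment header; the imul by 0x01010101 plus pshufd is just a 16-byte splat of the resulting average. Scalar sketch (names illustrative):

    #include <stdint.h>
    #include <string.h>

    /* (top0..3 + left0..3 + 4) >> 3, replicated over the 4x4 block. */
    static void pred4x4_dc(uint8_t pred[16], const uint8_t *top,
                           const uint8_t *left, int stride) {
        int sum = 4;
        for (int i = 0; i < 4; i++)
            sum += top[i] + left[i * stride];
        memset(pred, (uint8_t)(sum >> 3), 16);
    }
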
ALIGN 16
;***********************************************************************
; void __cdecl WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
@@ -588,7 +588,7 @@
%macro MMX_PRED_H_8X8_ONE_LINE 4
movq %1, [%3-8]
psrlq %1, 38h
-
+
;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
pmullw %1, [mmx_01bytes]
pshufw %1, %1, 0
@@ -598,7 +598,7 @@
%macro MMX_PRED_H_8X8_ONE_LINEE 4
movq %1, [%3+ecx-8]
psrlq %1, 38h
-
+
;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
pmullw %1, [mmx_01bytes]
pshufw %1, %1, 0
@@ -610,34 +610,34 @@
mov edx, [esp+4] ;pred
mov eax, [esp+8] ;pRef
mov ecx, [esp+12] ;stride
-
+
movq mm0, [eax-8]
psrlq mm0, 38h
-
+
;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
pmullw mm0, [mmx_01bytes]
pshufw mm0, mm0, 0
movq [edx], mm0
-
+
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+8
-
+
lea eax,[eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax,edx+16
-
+
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+24
-
+
lea eax,[eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax,edx+32
-
+
MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+40
-
+
lea eax,[eax+ecx*2]
MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax,edx+48
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+56
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+56
WELSEMMS
- ret
-
+ ret
+
ALIGN 16
;***********************************************************************
; void __cdecl WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
@@ -648,12 +648,12 @@
mov edx, [esp+4] ;pred
mov eax, [esp+8] ;pRef
mov ecx, [esp+12] ;stride
-
+
sub eax, ecx
movd xmm0, [eax]
pshufd xmm0, xmm0, 0
movdqa [edx], xmm0
- ret
+ ret
ALIGN 16
;***********************************************************************
@@ -665,7 +665,7 @@
mov edx, [esp+4] ;pred
mov eax, [esp+8] ;pRef
mov ecx, [esp+12] ;stride
-
+
sub eax, ecx
movq xmm0, [eax]
movdqa xmm1, xmm0
@@ -676,8 +676,8 @@
movdqa [edx+32], xmm0
movdqa [edx+48], xmm0
ret
-
-
+
+
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
@@ -703,13 +703,13 @@
; f = (2 + l1 + (l0<<1) + lt)>>2
; h = (2 + l2 + (l1<<1) + l0)>>2
-; j = (2 + l3 + (l2<<1) + l1)>>2
+; j = (2 + l3 + (l2<<1) + l1)>>2
; [b a f e h g j i] + [d c b a] --> mov to memory
-;
+;
; void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:
+WelsI4x4LumaPredHD_mmx:
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
@@ -716,16 +716,16 @@
sub eax, ecx
movd mm0, [eax-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-
- movd mm1, [eax+2*ecx-4]
- punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
+
+ movd mm1, [eax+2*ecx-4]
+ punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
lea eax, [eax+2*ecx]
- movd mm2, [eax+2*ecx-4]
+ movd mm2, [eax+2*ecx-4]
punpcklbw mm2, [eax+ecx-4] ; mm2[7] = l2, mm2[6] = l3
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
psrlq mm2, 20h
pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-
+
movq mm1, mm0
psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
movq mm2, mm0
@@ -733,17 +733,17 @@
movq mm3, mm2
movq mm4, mm1
pavgb mm1, mm0
-
+
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
pand mm4, [mmx_01bytes] ; set the odd bit
psubusb mm1, mm4 ; decrease 1 from odd bytes
-
+
pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
-
+
movq mm4, mm0
pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
-
+
psrlq mm2, 20h
psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
movq mm4, mm3
@@ -750,7 +750,7 @@
psrlq mm4, 10h ; mm4 = [0 0 b a f e h j]
pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
-
+
movd [edx], mm2
movd [edx+12], mm3
psrlq mm3, 10h
@@ -759,9 +759,9 @@
movd [edx+4], mm3
WELSEMMS
ret
-
-
-
+
+
+
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
@@ -784,17 +784,17 @@
; b = (2 + l0 + (l1<<1) + l2)>>2
; d = (2 + l1 + (l2<<1) + l3)>>2
; f = (2 + l2 + (l3<<1) + l3)>>2
-
+
; [g g f e d c b a] + [g g g g] --> mov to memory
-;
+;
; void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:
+WelsI4x4LumaPredHU_mmx:
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
-
+
movd mm0, [eax-4] ; mm0[3] = l0
punpcklbw mm0, [eax+ecx-4] ; mm0[7] = l1, mm0[6] = l0
lea eax, [eax+2*ecx]
@@ -802,38 +802,38 @@
movd mm4, [eax+ecx-4] ; mm4[3] = l3
punpcklbw mm2, mm4
punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-
+
psrlq mm4, 18h
psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
psrlq mm0, 8h
pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-
+
movq mm1, mm0
psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
-
+
movq mm2, mm0
psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
movq mm5, mm2
pavgb mm2, mm0
-
+
pxor mm5, mm0 ; find odd value in the lowest bit of each byte
pand mm5, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm5 ; decrease 1 from odd bytes
-
+
pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
-
+
psrlq mm2, 8h
pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
-
+
punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
-
+
psrlq mm4, 20h
movd [edx+12], mm4
-
+
movd [edx], mm1
psrlq mm1, 10h
movd [edx+4], mm1
@@ -841,9 +841,9 @@
movd [edx+8], mm1
WELSEMMS
ret
-
-
-
+
+
+
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
@@ -869,12 +869,12 @@
; h = (2 + t1 + (t2<<1) + t3)>>2
; i = (2 + lt + (l0<<1) + l1)>>2
-; j = (2 + l0 + (l1<<1) + l2)>>2
-;
+; j = (2 + l0 + (l1<<1) + l2)>>2
+;
; void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:
+WelsI4x4LumaPredVR_mmx:
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
@@ -881,57 +881,57 @@
sub eax, ecx
movq mm0, [eax-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-
- movd mm1, [eax+2*ecx-4]
- punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
+
+ movd mm1, [eax+2*ecx-4]
+ punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
lea eax, [eax+2*ecx]
movq mm2, [eax+ecx-8] ; mm2[7] = l2
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
psrlq mm2, 28h
pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-
+
movq mm1, mm0
psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
-
+
movq mm2, mm0
psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
movq mm3, mm2
pavgb mm2, mm0
-
+
pxor mm3, mm0 ; find odd value in the lowest bit of each byte
pand mm3, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm3 ; decrease 1 from odd bytes
-
+
movq mm3, mm0
psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
movq mm2, mm3
-
+
psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
movd [edx], mm1
-
+
psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
movd [edx+4], mm2
-
+
movq mm4, mm3
psllq mm4, 20h
psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
-
+
movq mm5, mm3
psllq mm5, 28h
psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
-
+
psllq mm1, 8h
pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
movd [edx+8], mm4
-
+
psllq mm2, 8h
pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
movd [edx+12], mm5
WELSEMMS
ret
-
+
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -954,13 +954,13 @@
; e = (2 + t4 + t6 + (t5<<1))>>2
; f = (2 + t5 + t7 + (t6<<1))>>2
; g = (2 + t6 + t7 + (t7<<1))>>2
-
+
; [g f e d c b a] --> mov to memory
-;
+;
; void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:
+WelsI4x4LumaPredDDL_mmx:
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
@@ -968,11 +968,11 @@
movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
-
+
movq mm3, mm0
psrlq mm3, 38h
psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
-
+
psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
psrlq mm2, 8h
pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
@@ -982,9 +982,9 @@
pxor mm3, mm2 ; find odd value in the lowest bit of each byte
pand mm3, [mmx_01bytes] ; set the odd bit
psubusb mm1, mm3 ; decrease 1 from odd bytes
-
+
pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
-
+
psrlq mm0, 8h
movd [edx], mm0
psrlq mm0, 8h
@@ -995,8 +995,8 @@
movd [edx+12], mm0
WELSEMMS
ret
-
-
+
+
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -1022,46 +1022,46 @@
; g = (2 + t2 + (t3<<1) + t4)>>2
; h = (2 + t3 + (t4<<1) + t5)>>2
; j = (2 + t4 + (t5<<1) + t6)>>2
-
+
; [i d c b a] + [j h g f e] --> mov to memory
-;
+;
; void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:
+WelsI4x4LumaPredVL_mmx:
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
-
+
sub eax, ecx
movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
-
+
psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
movq mm3, mm1
pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
-
+
movq mm4, mm2
- pavgb mm2, mm0
+ pavgb mm2, mm0
pxor mm4, mm0 ; find odd value in the lowest bit of each byte
pand mm4, [mmx_01bytes] ; set the odd bit
psubusb mm2, mm4 ; decrease 1 from odd bytes
-
+
pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
-
+
movd [edx], mm3
psrlq mm3, 8h
movd [edx+8], mm3
-
+
movd [edx+4], mm2
psrlq mm2, 8h
movd [edx+12], mm2
WELSEMMS
ret
-
+
ALIGN 16
;***********************************************************************
;
@@ -1068,14 +1068,14 @@
; void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;***********************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:
+WelsIChromaPredDc_sse2:
push ebx
mov eax, [esp+12] ; pRef
mov ecx, [esp+16] ; stride
-
+
sub eax, ecx
movq mm0, [eax]
-
+
;xor ebx, ebx
;movzx edx, byte [eax+ecx-0x01] ; l1
movzx ebx, byte [eax+ecx-0x01] ; l1
@@ -1089,7 +1089,7 @@
movzx edx, byte [eax-0x01] ; l4
add ebx, edx
movd mm1, ebx ; mm1 = l1+l2+l3+l4
-
+
;xor ebx, ebx
;movzx edx, byte [eax+ecx-0x01] ; l5
movzx ebx, byte [eax+ecx-0x01] ; l5
@@ -1103,7 +1103,7 @@
movzx edx, byte [eax-0x01] ; l8
add ebx, edx
movd mm2, ebx ; mm2 = l5+l6+l7+l8
-
+
movq mm3, mm0
psrlq mm0, 0x20
psllq mm3, 0x20
@@ -1110,56 +1110,56 @@
psrlq mm3, 0x20
pxor mm4, mm4
psadbw mm0, mm4
- psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
-
+ psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
paddq mm3, mm1
movq mm1, mm2
paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-
+
movq mm4, [mmx_0x02]
-
+
paddq mm0, mm4
psrlq mm0, 0x02
-
+
paddq mm2, mm4
psrlq mm2, 0x02
-
+
paddq mm3, mm4
paddq mm3, mm4
psrlq mm3, 0x03
-
+
paddq mm1, mm4
paddq mm1, mm4
psrlq mm1, 0x03
-
+
pmuludq mm0, [mmx_01bytes]
pmuludq mm3, [mmx_01bytes]
psllq mm0, 0x20
pxor mm0, mm3 ; mm0 = m_up
-
+
pmuludq mm2, [mmx_01bytes]
pmuludq mm1, [mmx_01bytes]
psllq mm1, 0x20
pxor mm1, mm2 ; mm2 = m_down
-
+
mov edx, [esp+8] ; pRef
-
+
movq [edx], mm0
movq [edx+0x08], mm0
movq [edx+0x10], mm0
movq [edx+0x18], mm0
-
+
movq [edx+0x20], mm1
movq [edx+0x28], mm1
movq [edx+0x30], mm1
movq [edx+0x38], mm1
-
+
pop ebx
WELSEMMS
ret
-
-
-
+
+
+
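
The chroma DC routine computes four DCs at once — psadbw collapses each 4-byte half of the top row while the scalar adds accumulate the two halves of the left column — because each 4x4 quadrant of the 8x8 block averages only the neighbours it borders. A scalar model (layout inferred from the rounding constants, so treat it as a sketch):

    #include <stdint.h>

    static void pred8x8_chroma_dc(uint8_t pred[64], const uint8_t *top,
                                  const uint8_t *left, int stride) {
        int t0 = 0, t1 = 0, l0 = 0, l1 = 0;
        for (int i = 0; i < 4; i++) {
            t0 += top[i];           t1 += top[i + 4];
            l0 += left[i * stride]; l1 += left[(i + 4) * stride];
        }
        int dc[4] = { (t0 + l0 + 4) >> 3, (t1 + 2) >> 2,        /* TL, TR */
                      (l1 + 2) >> 2,      (t1 + l1 + 4) >> 3 }; /* BL, BR */
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                pred[y * 8 + x] = (uint8_t)dc[(y / 4) * 2 + (x / 4)];
    }
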
ALIGN 16
;***********************************************************************
;
@@ -1166,11 +1166,11 @@
; void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:
+WelsI16x16LumaPredDc_sse2:
push ebx
mov eax, [esp+12] ; pRef
mov ecx, [esp+16] ; stride
-
+
sub eax, ecx
movdqa xmm0, [eax] ; read one row
pxor xmm1, xmm1
@@ -1180,7 +1180,7 @@
pslldq xmm0, 0x08
psrldq xmm0, 0x08
paddw xmm0, xmm1
-
+
;xor ebx, ebx
;movzx edx, byte [eax+ecx-0x01]
movzx ebx, byte [eax+ecx-0x01]
@@ -1201,7 +1201,7 @@
psrld xmm0, 0x05
pmuludq xmm0, [mmx_01bytes]
pshufd xmm0, xmm0, 0
-
+
mov edx, [esp+8] ; pred
movdqa [edx], xmm0
movdqa [edx+0x10], xmm0
@@ -1219,7 +1219,7 @@
movdqa [edx+0xd0], xmm0
movdqa [edx+0xe0], xmm0
movdqa [edx+0xf0], xmm0
-
+
pop ebx
ret
@@ -1226,7 +1226,7 @@
;***********************************************************************
;
-;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
+;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
;
;***********************************************************************
@@ -1238,7 +1238,7 @@
push edi
mov eax, [esp+24];p_enc
mov ebx, [esp+28];linesize_enc
-
+
; load source 4x4 samples and Hadamard transform
movd xmm0, [eax]
movd xmm1, [eax+ebx]
@@ -1247,16 +1247,16 @@
movd xmm3, [eax+ebx]
punpckldq xmm0, xmm2
punpckldq xmm1, xmm3
-
+
pxor xmm6, xmm6
punpcklbw xmm0, xmm6
punpcklbw xmm1, xmm6
-
+
movdqa xmm2, xmm0
paddw xmm0, xmm1
psubw xmm2, xmm1
SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
+
movdqa xmm4, xmm0
paddw xmm0, xmm3
psubw xmm4, xmm3
@@ -1264,7 +1264,7 @@
movdqa xmm2, xmm0
punpcklwd xmm0, xmm4
punpckhwd xmm4, xmm2
-
+
SSE2_XSawp dq, xmm0, xmm4, xmm3
SSE2_XSawp qdq, xmm0, xmm3, xmm5
@@ -1271,14 +1271,14 @@
movdqa xmm7, xmm0
paddw xmm0, xmm5
psubw xmm7, xmm5
-
+
SSE2_XSawp qdq, xmm0, xmm7, xmm1
-
+
; Hadamard transform results are saved in xmm0 and xmm2
movdqa xmm2, xmm0
paddw xmm0, xmm1
psubw xmm2, xmm1
-
+
; load top boundary samples: [a b c d]
mov eax, [esp+16];p_dec
sub eax, [esp+20];linesize_dec
@@ -1286,7 +1286,7 @@
movzx edx, byte [eax+1]
movzx esi, byte [eax+2]
movzx edi, byte [eax+3]
-
+
; get the transform results of top boundary samples: [a b c d]
add edx, ecx ; edx = a + b
add edi, esi ; edi = c + d
@@ -1300,7 +1300,7 @@
add esi, ecx ; esi = (a - b) + (c - d)
add ecx, ecx
sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
-
+
movdqa xmm6, xmm0
movdqa xmm7, xmm2
movd xmm5, edi ; store the edi for DC mode
@@ -1312,16 +1312,16 @@
pinsrw xmm4, edx, 0
pinsrw xmm4, ecx, 4
psllw xmm4, 2
-
+
; get the satd of H
psubw xmm0, xmm3
psubw xmm2, xmm4
-
+
WELS_AbsW xmm0, xmm1
WELS_AbsW xmm2, xmm1
paddusw xmm0, xmm2
SUMW_HORIZON1 xmm0, xmm1 ; satd of V is stored in xmm0
-
+
; load left boundary samples: [a b c d]'
mov eax, [esp+16]
mov ebx, [esp+20]
@@ -1330,7 +1330,7 @@
lea eax , [eax+2*ebx]
movzx esi, byte [eax-1]
movzx edi, byte [eax+ebx-1]
-
+
; get the transform results of left boundary samples: [a b c d]'
add edx, ecx ; edx = a + b
add edi, esi ; edi = c + d
@@ -1344,14 +1344,14 @@
add esi, ecx ; esi = (a - b) + (c - d)
add ecx, ecx
sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
-
- ; store the transform results in xmm3
+
+ ; store the transform results in xmm3
movd xmm3, edi
pinsrw xmm3, edx, 1
pinsrw xmm3, ecx, 2
pinsrw xmm3, esi, 3
psllw xmm3, 2
-
+
; get the satd of V
movdqa xmm2, xmm6
movdqa xmm4, xmm7
@@ -1368,7 +1368,7 @@
psrlw xmm1, 3
movdqa xmm5, xmm1
psllw xmm1, 4
-
+
; get the satd of DC
psubw xmm6, xmm1
WELS_AbsW xmm6, xmm1
@@ -1375,7 +1375,7 @@
WELS_AbsW xmm7, xmm1
paddusw xmm6, xmm7
SUMW_HORIZON1 xmm6, xmm1 ; satd of DC is stored in xmm6
-
+
; comparing order: DC H V
mov edx, [esp+32]
movd eax, xmm6
@@ -1394,9 +1394,9 @@
jg near not_dc
cmp ax, si
jg near not_dc_h
-
+
; for DC mode
- movd ebx, xmm5
+ movd ebx, xmm5
imul ebx, 0x01010101
movd xmm5, ebx
pshufd xmm5, xmm5, 0
@@ -1407,11 +1407,11 @@
pop esi
pop ebx
ret
-
+
not_dc:
cmp di, si
jg near not_dc_h
-
+
; for H mode
SSE_DB_1_2REG xmm6, xmm7
mov eax, [esp+16]
@@ -1422,20 +1422,20 @@
movzx ecx, byte [eax+ebx-1]
movd xmm1, ecx
- pmuludq xmm1, xmm6
+ pmuludq xmm1, xmm6
%if 1
punpckldq xmm0, xmm1
-%else
+%else
unpcklps xmm0, xmm1
%endif
lea eax, [eax+ebx*2]
movzx ecx, byte [eax-1]
movd xmm2, ecx
- pmuludq xmm2, xmm6
+ pmuludq xmm2, xmm6
movzx ecx, byte [eax+ebx-1]
- movd xmm3, ecx
- pmuludq xmm3, xmm6
+ movd xmm3, ecx
+ pmuludq xmm3, xmm6
%if 1
punpckldq xmm2, xmm3
punpcklqdq xmm0, xmm2
@@ -1442,13 +1442,13 @@
%else
unpcklps xmm2, xmm3
unpcklpd xmm0, xmm2
-%endif
+%endif
movdqa [edx],xmm0
-
+
mov eax, edi
mov ebx, [esp+36]
mov dword [ebx], 0x01
-
+
pop edi
pop esi
pop ebx
@@ -1460,14 +1460,14 @@
movd xmm0, [eax]
pshufd xmm0, xmm0, 0
movdqa [edx],xmm0
-
+
mov eax, esi
mov ebx, [esp+36]
mov dword [ebx], 0x00
-
+
pop edi
pop esi
pop ebx
ret
-
+
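
The SATD routine avoids predicting and differencing three whole blocks: it Hadamard-transforms the source once, then uses the fact that a V prediction transforms to a single nonzero row (four times the transformed top samples — hence the psllw by 2), H to a single nonzero column, and DC to a single coefficient. That is why only the four boundary samples [a b c d] get the small scalar butterfly. A hedged sketch of the two pieces:

    #include <stdint.h>
    #include <stdlib.h>

    /* 4x4 Hadamard, rows then columns, matching the SSE2 butterflies. */
    static void hadamard4x4(const int16_t in[16], int16_t out[16]) {
        int16_t t[16];
        for (int i = 0; i < 4; i++) {
            int a = in[i*4+0] + in[i*4+2], b = in[i*4+1] + in[i*4+3];
            int c = in[i*4+0] - in[i*4+2], d = in[i*4+1] - in[i*4+3];
            t[i*4+0] = (int16_t)(a + b); t[i*4+1] = (int16_t)(a - b);
            t[i*4+2] = (int16_t)(c + d); t[i*4+3] = (int16_t)(c - d);
        }
        for (int i = 0; i < 4; i++) {
            int a = t[0*4+i] + t[2*4+i], b = t[1*4+i] + t[3*4+i];
            int c = t[0*4+i] - t[2*4+i], d = t[1*4+i] - t[3*4+i];
            out[0*4+i] = (int16_t)(a + b); out[1*4+i] = (int16_t)(a - b);
            out[2*4+i] = (int16_t)(c + d); out[3*4+i] = (int16_t)(c - d);
        }
    }

    /* Mode cost: sum of absolute differences in the transform domain. */
    static int satd(const int16_t hsrc[16], const int16_t hpred[16]) {
        int s = 0;
        for (int i = 0; i < 16; i++)
            s += abs(hsrc[i] - hpred[i]);
        return s;
    }
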
--- a/codec/encoder/core/asm/intra_pred_util.asm
+++ b/codec/encoder/core/asm/intra_pred_util.asm
@@ -32,7 +32,7 @@
;* intra_pred_util.asm
;*
;* Abstract
-;* mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and
+;* mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and
;* WelsFillingPred1to16 etc.
;*
;* History
@@ -84,7 +84,7 @@
movq mm0, [ecx]
movq [eax ], mm0
movq [eax+8], mm0
-
+
WELSEMMS
ret
@@ -100,16 +100,16 @@
movq mm1, [ecx+8]
movq [eax ], mm0
movq [eax+8], mm1
-
+
WELSEMMS
ret
%macro butterfly_1to8_mmx 3 ; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
- mov %3h, %3l
- movd %2, e%3x ; i.e, 1% = eax (=b0)
- pshufw %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
+ mov %3h, %3l
+ movd %2, e%3x ; i.e, 1% = eax (=b0)
+ pshufw %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
ALIGN 16
;***********************************************************************----------------
@@ -120,10 +120,10 @@
mov cl, byte [esp+8] ; v
butterfly_1to8_mmx mm0, mm1, c ; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-
+
movq [eax ], mm0
movq [eax+8], mm0
-
+
WELSEMMS
ret
@@ -136,9 +136,9 @@
mov eax, [esp+4] ; pred
mov ecx, [esp+8] ; v
- movdqa xmm0, [ecx]
- movdqa [eax], xmm0
-
+ movdqa xmm0, [ecx]
+ movdqa [eax], xmm0
+
ret
ALIGN 16
@@ -150,7 +150,7 @@
mov cl, byte [esp+8] ; v
butterfly_1to16_sse xmm0, xmm1, c ; dst, tmp, pSrc [generic register name: a/b/c/d]
-
+
movdqa [eax], xmm0
-
+
ret
--- a/codec/encoder/core/asm/mb_copy.asm
+++ b/codec/encoder/core/asm/mb_copy.asm
@@ -32,7 +32,7 @@
;* mb_copy.asm
;*
;* Abstract
-;* mb_copy
+;* mb_copy
;*
;*
;*********************************************************************************************/
@@ -52,9 +52,9 @@
WELS_EXTERN WelsCopy16x16_sse2
WELS_EXTERN WelsCopy16x16NotAligned_sse2
WELS_EXTERN WelsCopy8x8_mmx
-WELS_EXTERN WelsCopy16x8NotAligned_sse2 ;
-WELS_EXTERN WelsCopy8x16_mmx ;
-WELS_EXTERN UpdateMbMv_sse2 ;
+WELS_EXTERN WelsCopy16x8NotAligned_sse2 ;
+WELS_EXTERN WelsCopy8x16_mmx ;
+WELS_EXTERN UpdateMbMv_sse2 ;
;***********************************************************************
; void WelsCopy16x16_sse2( uint8_t* Dst,
@@ -66,7 +66,7 @@
WelsCopy16x16_sse2:
push esi
push edi
- push ebx
+ push ebx
mov edi, [esp+16] ; Dst
mov eax, [esp+20] ; iStrideD
@@ -107,7 +107,7 @@
movdqa xmm5, [esi+ecx]
movdqa xmm6, [esi+2*ecx]
movdqa xmm7, [esi+edx]
-
+
movdqa [edi], xmm0
movdqa [edi+eax], xmm1
movdqa [edi+2*eax], xmm2
@@ -116,7 +116,7 @@
movdqa [edi], xmm4
movdqa [edi+eax], xmm5
movdqa [edi+2*eax], xmm6
- movdqa [edi+ebx], xmm7
+ movdqa [edi+ebx], xmm7
pop ebx
pop edi
@@ -134,7 +134,7 @@
WelsCopy16x16NotAligned_sse2:
push esi
push edi
- push ebx
+ push ebx
mov edi, [esp+16] ; Dst
mov eax, [esp+20] ; iStrideD
@@ -175,7 +175,7 @@
movdqu xmm5, [esi+ecx]
movdqu xmm6, [esi+2*ecx]
movdqu xmm7, [esi+edx]
-
+
movdqa [edi], xmm0
movdqa [edi+eax], xmm1
movdqa [edi+2*eax], xmm2
@@ -184,8 +184,8 @@
movdqa [edi], xmm4
movdqa [edi+eax], xmm5
movdqa [edi+2*eax], xmm6
- movdqa [edi+ebx], xmm7
-
+ movdqa [edi+ebx], xmm7
+
pop ebx
pop edi
pop esi
@@ -202,7 +202,7 @@
WelsCopy16x8NotAligned_sse2:
push esi
push edi
- push ebx
+ push ebx
mov edi, [esp+16] ; Dst
mov eax, [esp+20] ; iStrideD
@@ -220,7 +220,7 @@
movdqu xmm4, [esi]
movdqu xmm5, [esi+ecx]
movdqu xmm6, [esi+2*ecx]
- movdqu xmm7, [esi+edx]
+ movdqu xmm7, [esi+edx]
movdqa [edi], xmm0
movdqa [edi+eax], xmm1
@@ -231,7 +231,7 @@
movdqa [edi+eax], xmm5
movdqa [edi+2*eax], xmm6
movdqa [edi+ebx], xmm7
-
+
pop ebx
pop edi
pop esi
@@ -245,7 +245,7 @@
; int32_t iStrideS )
;***********************************************************************
ALIGN 16
-WelsCopy8x16_mmx:
+WelsCopy8x16_mmx:
push ebx
mov eax, [esp + 8 ] ;Dst
@@ -253,60 +253,60 @@
mov ebx, [esp + 16] ;Src
mov edx, [esp + 20] ;iStrideS
- movq mm0, [ebx]
- movq mm1, [ebx+edx]
+ movq mm0, [ebx]
+ movq mm1, [ebx+edx]
lea ebx, [ebx+2*edx]
- movq mm2, [ebx]
- movq mm3, [ebx+edx]
+ movq mm2, [ebx]
+ movq mm3, [ebx+edx]
lea ebx, [ebx+2*edx]
- movq mm4, [ebx]
- movq mm5, [ebx+edx]
+ movq mm4, [ebx]
+ movq mm5, [ebx+edx]
lea ebx, [ebx+2*edx]
- movq mm6, [ebx]
- movq mm7, [ebx+edx]
+ movq mm6, [ebx]
+ movq mm7, [ebx+edx]
lea ebx, [ebx+2*edx]
-
- movq [eax], mm0
- movq [eax+ecx], mm1
+
+ movq [eax], mm0
+ movq [eax+ecx], mm1
lea eax, [eax+2*ecx]
- movq [eax], mm2
+ movq [eax], mm2
movq [eax+ecx], mm3
lea eax, [eax+2*ecx]
- movq [eax], mm4
+ movq [eax], mm4
movq [eax+ecx], mm5
lea eax, [eax+2*ecx]
- movq [eax], mm6
+ movq [eax], mm6
movq [eax+ecx], mm7
lea eax, [eax+2*ecx]
- movq mm0, [ebx]
- movq mm1, [ebx+edx]
+ movq mm0, [ebx]
+ movq mm1, [ebx+edx]
lea ebx, [ebx+2*edx]
- movq mm2, [ebx]
- movq mm3, [ebx+edx]
+ movq mm2, [ebx]
+ movq mm3, [ebx+edx]
lea ebx, [ebx+2*edx]
- movq mm4, [ebx]
- movq mm5, [ebx+edx]
+ movq mm4, [ebx]
+ movq mm5, [ebx+edx]
lea ebx, [ebx+2*edx]
- movq mm6, [ebx]
- movq mm7, [ebx+edx]
-
- movq [eax], mm0
- movq [eax+ecx], mm1
+ movq mm6, [ebx]
+ movq mm7, [ebx+edx]
+
+ movq [eax], mm0
+ movq [eax+ecx], mm1
lea eax, [eax+2*ecx]
- movq [eax], mm2
+ movq [eax], mm2
movq [eax+ecx], mm3
lea eax, [eax+2*ecx]
- movq [eax], mm4
+ movq [eax], mm4
movq [eax+ecx], mm5
lea eax, [eax+2*ecx]
- movq [eax], mm6
- movq [eax+ecx], mm7
+ movq [eax], mm6
+ movq [eax+ecx], mm7
WELSEMMS
- pop ebx
+ pop ebx
ret
-
+
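
WelsCopy8x16_mmx above and its SSE2 siblings are plain strided rectangle copies; the only real variation is the load instruction, since movdqa faults on pointers that are not 16-byte aligned while movdqu accepts anything — hence the separate NotAligned variants. Scalar shape, for reference:

    #include <stdint.h>
    #include <string.h>

    /* Illustrative equivalent of the WelsCopy* family. */
    static void copy_block(uint8_t *dst, int dst_stride,
                           const uint8_t *src, int src_stride,
                           int width, int height) {
        for (int y = 0; y < height; y++)
            memcpy(dst + y * dst_stride, src + y * src_stride, width);
    }
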
;***********************************************************************
; void WelsCopy8x8_mmx( uint8_t* Dst,
; int32_t iStrideD,
@@ -314,7 +314,7 @@
; int32_t iStrideS )
;***********************************************************************
ALIGN 16
-WelsCopy8x8_mmx:
+WelsCopy8x8_mmx:
push ebx
push esi
mov eax, [esp + 12] ;Dst
@@ -343,7 +343,7 @@
lea esi, [esi+2*ebx]
movq mm6, [esi]
movq mm7, [esi+ebx]
-
+
movq [eax], mm0
movq [eax+ecx], mm1
lea eax, [eax+2*ecx]
@@ -355,12 +355,12 @@
lea eax, [eax+2*ecx]
movq [eax], mm6
movq [eax+ecx], mm7
-
+
WELSEMMS
- pop esi
+ pop esi
pop ebx
ret
-
+
; (dunhuang@cisco), 12/21/2011
;***********************************************************************
; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
@@ -417,8 +417,8 @@
WELS_EXTERN McCopyWidthEq4_mmx
WELS_EXTERN McCopyWidthEq8_mmx
WELS_EXTERN McCopyWidthEq16_sse2
-
+
ALIGN 16
;***********************************************************************
; void PixelAvgWidthEq8_mmx( uint8_t *dst, int32_t iDstStride,
@@ -432,19 +432,19 @@
push esi
push edi
- mov edi, [esp+20]
- mov esi, [esp+28]
- mov edx, [esp+36]
- mov ebp, [esp+24]
- mov eax, [esp+32]
- mov ebx, [esp+40]
- mov ecx, [esp+44]
+ mov edi, [esp+20]
+ mov esi, [esp+28]
+ mov edx, [esp+36]
+ mov ebp, [esp+24]
+ mov eax, [esp+32]
+ mov ebx, [esp+40]
+ mov ecx, [esp+44]
sar ecx, 2
.height_loop:
- movq mm0, [esi]
+ movq mm0, [esi]
pavgb mm0, [edx]
movq [edi], mm0
- movq mm1, [esi+eax]
+ movq mm1, [esi+eax]
pavgb mm1, [edx+ebx]
movq [edi+ebp], mm1
lea edi, [edi+2*ebp]
@@ -451,19 +451,19 @@
lea esi, [esi+2*eax]
lea edx, [edx+2*ebx]
- movq mm2, [esi]
+ movq mm2, [esi]
pavgb mm2, [edx]
movq [edi], mm2
- movq mm3, [esi+eax]
+ movq mm3, [esi+eax]
pavgb mm3, [edx+ebx]
movq [edi+ebp], mm3
lea edi, [edi+2*ebp]
lea esi, [esi+2*eax]
lea edx, [edx+2*ebx]
-
+
dec ecx
jne .height_loop
-
+
WELSEMMS
pop edi
pop esi
@@ -485,19 +485,19 @@
push esi
push edi
- mov edi, [esp+20]
- mov esi, [esp+28]
- mov edx, [esp+36]
- mov ebp, [esp+24]
- mov eax, [esp+32]
- mov ebx, [esp+40]
- mov ecx, [esp+44]
+ mov edi, [esp+20]
+ mov esi, [esp+28]
+ mov edx, [esp+36]
+ mov ebp, [esp+24]
+ mov eax, [esp+32]
+ mov ebx, [esp+40]
+ mov ecx, [esp+44]
sar ecx, 2
.height_loop:
movdqu xmm0, [esi]
movdqu xmm1, [edx]
movdqu xmm2, [esi+eax]
- movdqu xmm3, [edx+ebx]
+ movdqu xmm3, [edx+ebx]
pavgb xmm0, xmm1
pavgb xmm2, xmm3
movdqu [edi], xmm0
@@ -504,12 +504,12 @@
movdqu [edi+ebp], xmm2
lea edi, [edi+2*ebp]
lea esi, [esi+2*eax]
- lea edx, [edx+2*ebx]
+ lea edx, [edx+2*ebx]
movdqu xmm4, [esi]
movdqu xmm5, [edx]
movdqu xmm6, [esi+eax]
- movdqu xmm7, [edx+ebx]
+ movdqu xmm7, [edx+ebx]
pavgb xmm4, xmm5
pavgb xmm6, xmm7
movdqu [edi], xmm4
@@ -516,11 +516,11 @@
movdqu [edi+ebp], xmm6
lea edi, [edi+2*ebp]
lea esi, [esi+2*eax]
- lea edx, [edx+2*ebx]
-
+ lea edx, [edx+2*ebx]
+
dec ecx
jne .height_loop
-
+
pop edi
pop esi
pop ebx
@@ -540,7 +540,7 @@
dec dword [esp+4]
jg avg_w16_align_0_ssse3
ret
-
+
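
All the PixelAvg* kernels reduce to pavgb, the rounded unsigned byte average; the _ssse3 variant only adds a dispatch between loop bodies based on the first source's alignment (the align_0/align_1 entry points). In scalar terms (a sketch):

    #include <stdint.h>

    /* pavgb per byte: average rounded upward. */
    static inline uint8_t pavgb1(uint8_t a, uint8_t b) {
        return (uint8_t)((a + b + 1) >> 1);
    }

    /* Illustrative bi-prediction average over a block. */
    static void pixel_avg(uint8_t *dst, int dst_stride,
                          const uint8_t *s1, int s1_stride,
                          const uint8_t *s2, int s2_stride,
                          int width, int height) {
        for (int y = 0; y < height; y++)
            for (int x = 0; x < width; x++)
                dst[y * dst_stride + x] =
                    pavgb1(s1[y * s1_stride + x], s2[y * s2_stride + x]);
    }
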
ALIGN 64
avg_w16_align_1_ssse3:
movdqa xmm1, [ebx+16]
@@ -555,7 +555,7 @@
jg avg_w16_align_1_ssse3
ret
-
+
ALIGN 16
;***********************************************************************
; void PixelAvgWidthEq16_ssse3(uint8_t *pDst, int32_t iDstStride,
@@ -574,7 +574,7 @@
mov ebx, [esp+28] ; src1
mov ecx, [esp+36] ; src2
mov esi, [esp+24] ; i_dst_stride
-
+
%define avg_w16_offset (avg_w16_align_1_ssse3-avg_w16_align_0_ssse3)
mov edx, ebx
and edx, 0x01
@@ -582,11 +582,11 @@
lea ebp, [avg_w16_offset]
imul ebp, edx
lea edx, [ebp+eax]
-
- mov eax, [esp+32]
- mov ebp, [esp+44]
+
+ mov eax, [esp+32]
+ mov ebp, [esp+44]
push ebp
- mov ebp, [esp+44]
+ mov ebp, [esp+44]
and ebx, 0xfffffff0
call edx
pop ebp
@@ -607,7 +607,7 @@
push edi
push ebx
-
+
mov esi, [esp+16]
mov eax, [esp+20]
mov edi, [esp+24]
@@ -617,12 +617,12 @@
.height_loop:
mov ebx, [esi]
mov [edi], ebx
-
+
add esi, eax
add edi, ecx
dec edx
jnz .height_loop
- WELSEMMS
+ WELSEMMS
pop ebx
pop edi
pop esi
@@ -650,12 +650,12 @@
add edi, ecx
dec edx
jnz .height_loop
-
- WELSEMMS
+
+ WELSEMMS
pop edi
pop esi
ret
-
+
ALIGN 16
;***********************************************************************
; void McCopyWidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
@@ -664,11 +664,11 @@
push esi
push edi
- mov esi, [esp+12]
- mov eax, [esp+16]
- mov edi, [esp+20]
- mov edx, [esp+24]
- mov ecx, [esp+28]
+ mov esi, [esp+12]
+ mov eax, [esp+16]
+ mov edi, [esp+20]
+ mov edx, [esp+24]
+ mov ecx, [esp+28]
ALIGN 4
.height_loop:
@@ -681,7 +681,7 @@
lea esi, [esi+eax*2]
lea edi, [edi+edx*2]
jnz .height_loop
-
+
pop edi
pop esi
ret
--- a/codec/encoder/core/asm/mc_chroma.asm
+++ b/codec/encoder/core/asm/mc_chroma.asm
@@ -69,11 +69,11 @@
ALIGN 16
;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
+; void McChromaWidthEq4_mmx( uint8_t *src,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
; int32_t iHeigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
@@ -81,29 +81,29 @@
push esi
push edi
push ebx
-
+
mov eax, [esp +12 + 20]
movd mm3, [eax]
WELS_Zero mm7
punpcklbw mm3, mm3
movq mm4, mm3
- punpcklwd mm3, mm3
- punpckhwd mm4, mm4
-
+ punpcklwd mm3, mm3
+ punpckhwd mm4, mm4
+
movq mm5, mm3
punpcklbw mm3, mm7
punpckhbw mm5, mm7
-
+
movq mm6, mm4
punpcklbw mm4, mm7
punpckhbw mm6, mm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
+
+ mov esi, [esp +12+ 4]
+ mov eax, [esp + 12 + 8]
+ mov edi, [esp + 12 + 12]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
lea ebx, [esi + eax]
movd mm0, [esi]
movd mm1, [esi+1]
@@ -110,17 +110,17 @@
punpcklbw mm0, mm7
punpcklbw mm1, mm7
.xloop:
-
+
pmullw mm0, mm3
pmullw mm1, mm5
paddw mm0, mm1
-
+
movd mm1, [ebx]
punpcklbw mm1, mm7
movq mm2, mm1
pmullw mm1, mm4
paddw mm0, mm1
-
+
movd mm1, [ebx+1]
punpcklbw mm1, mm7
movq mm7, mm1
@@ -130,13 +130,13 @@
paddw mm0, [h264_d0x20_mmx]
psrlw mm0, 6
-
+
WELS_Zero mm7
packuswb mm0, mm7
- movd [edi], mm0
+ movd [edi], mm0
movq mm0, mm2
-
+
lea edi, [edi +edx ]
lea ebx, [ebx + eax]
@@ -151,11 +151,11 @@
ALIGN 16
;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD,
;                           int32_t iHeight );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
@@ -163,30 +163,30 @@
push esi
push edi
push ebx
-
+
mov eax, [esp +12 + 20]
movd xmm3, [eax]
WELS_Zero xmm7
punpcklbw xmm3, xmm3
punpcklwd xmm3, xmm3
-
+
movdqa xmm4, xmm3
punpckldq xmm3, xmm3
punpckhdq xmm4, xmm4
movdqa xmm5, xmm3
movdqa xmm6, xmm4
-
+
punpcklbw xmm3, xmm7
punpckhbw xmm5, xmm7
punpcklbw xmm4, xmm7
punpckhbw xmm6, xmm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
+
+ mov esi, [esp +12+ 4]
+ mov eax, [esp + 12 + 8]
+ mov edi, [esp + 12 + 12]
+ mov edx, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
lea ebx, [esi + eax]
movq xmm0, [esi]
movq xmm1, [esi+1]
@@ -193,17 +193,17 @@
punpcklbw xmm0, xmm7
punpcklbw xmm1, xmm7
.xloop:
-
+
pmullw xmm0, xmm3
pmullw xmm1, xmm5
paddw xmm0, xmm1
-
+
movq xmm1, [ebx]
punpcklbw xmm1, xmm7
movdqa xmm2, xmm1
pmullw xmm1, xmm4
paddw xmm0, xmm1
-
+
movq xmm1, [ebx+1]
punpcklbw xmm1, xmm7
movdqa xmm7, xmm1
@@ -213,19 +213,19 @@
paddw xmm0, [h264_d0x20_sse2]
psrlw xmm0, 6
-
+
WELS_Zero xmm7
packuswb xmm0, xmm7
- movq [edi], xmm0
+ movq [edi], xmm0
movdqa xmm0, xmm2
-
+
lea edi, [edi +edx ]
lea ebx, [ebx + eax]
dec ecx
jnz near .xloop
-
+
pop ebx
pop edi
pop esi
@@ -237,8 +237,8 @@
ALIGN 16
;***********************************************************************
; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
+; int32_t iSrcStride,
+; uint8_t *pDst,
; int32_t iDstStride,
; uint8_t *pABCD,
; int32_t iHeight);
@@ -248,23 +248,23 @@
push ebx
push esi
push edi
-
+
mov eax, [esp + 12 + 20]
pxor xmm7, xmm7
- movd xmm5, [eax]
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
+ movd xmm5, [eax]
+ punpcklwd xmm5, xmm5
+ punpckldq xmm5, xmm5
movdqa xmm6, xmm5
punpcklqdq xmm5, xmm5
- punpckhqdq xmm6, xmm6
-
- mov eax, [esp + 12 + 4]
- mov edx, [esp + 12 + 8]
- mov esi, [esp + 12 + 12]
- mov edi, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
+ punpckhqdq xmm6, xmm6
+
+ mov eax, [esp + 12 + 4]
+ mov edx, [esp + 12 + 8]
+ mov esi, [esp + 12 + 12]
+ mov edi, [esp + 12 + 16]
+ mov ecx, [esp + 12 + 24]
+
sub esi, edi
sub esi, edi
movdqa xmm7, [h264_d0x20_sse2]
@@ -273,16 +273,16 @@
movdqa xmm1, xmm0
psrldq xmm1, 1
punpcklbw xmm0, xmm1
-
-.hloop_chroma:
+
+.hloop_chroma:
lea esi, [esi+2*edi]
-
+
movdqu xmm2, [eax+edx]
movdqa xmm3, xmm2
psrldq xmm3, 1
punpcklbw xmm2, xmm3
movdqa xmm4, xmm2
-
+
pmaddubsw xmm0, xmm5
pmaddubsw xmm2, xmm6
paddw xmm0, xmm2
@@ -289,8 +289,8 @@
paddw xmm0, xmm7
psrlw xmm0, 6
packuswb xmm0, xmm0
- movq [esi],xmm0
-
+ movq [esi],xmm0
+
lea eax, [eax+2*edx]
movdqu xmm2, [eax]
movdqa xmm3, xmm2
@@ -297,7 +297,7 @@
psrldq xmm3, 1
punpcklbw xmm2, xmm3
movdqa xmm0, xmm2
-
+
pmaddubsw xmm4, xmm5
pmaddubsw xmm2, xmm6
paddw xmm4, xmm2
@@ -304,8 +304,8 @@
paddw xmm4, xmm7
psrlw xmm4, 6
packuswb xmm4, xmm4
- movq [esi+edi],xmm4
-
+ movq [esi+edi],xmm4
+
sub ecx, 2
jnz .hloop_chroma
pop edi
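
The McChromaWidthEq* kernels in this file implement the standard H.264 bilinear chroma interpolation; the [h264_d0x20_*] constants hold the rounding value 32, and the psrlw by 6 is the /64 normalization. A scalar sketch, assuming pABCD stores the four weights in A, B, C, D order (the generalized iWidth parameter is illustrative; the assembly fixes the width at 4 or 8):

#include <stdint.h>

/* A=(8-dx)(8-dy), B=dx(8-dy), C=(8-dx)dy, D=dx*dy in the usual
 * H.264 chroma weighting. */
static void McChroma_c(const uint8_t *pSrc, int32_t iSrcStride,
                       uint8_t *pDst, int32_t iDstStride,
                       const uint8_t *pABCD, int32_t iWidth, int32_t iHeight) {
  const int32_t kA = pABCD[0], kB = pABCD[1], kC = pABCD[2], kD = pABCD[3];
  for (int32_t y = 0; y < iHeight; y++) {
    for (int32_t x = 0; x < iWidth; x++) {
      pDst[x] = (uint8_t)((kA * pSrc[x] + kB * pSrc[x + 1] +
                           kC * pSrc[x + iSrcStride] +
                           kD * pSrc[x + iSrcStride + 1] + 32) >> 6);
    }
    pSrc += iSrcStride;
    pDst += iDstStride;
  }
}
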
--- a/codec/encoder/core/asm/mc_luma.asm
+++ b/codec/encoder/core/asm/mc_luma.asm
@@ -91,10 +91,10 @@
ALIGN 16
;***********************************************************************
-; void McHorVer20WidthEq16_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
+; void McHorVer20WidthEq16_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
; int32_t iHeight,
; );
;***********************************************************************
@@ -101,19 +101,19 @@
McHorVer20WidthEq16_sse2:
push esi
push edi
-
- mov esi, [esp + 12]
- mov eax, [esp + 16]
- mov edi, [esp + 20]
- mov ecx, [esp + 28]
- mov edx, [esp + 24]
- sub esi, 2
-
+
+ mov esi, [esp + 12]
+ mov eax, [esp + 16]
+ mov edi, [esp + 20]
+ mov ecx, [esp + 28]
+ mov edx, [esp + 24]
+ sub esi, 2
+
WELS_Zero xmm7
movdqa xmm6, [h264_w0x10_1]
.y_loop:
-
+
movq xmm0, [esi]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5]
@@ -126,7 +126,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -152,7 +152,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3+8]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -165,8 +165,8 @@
psraw xmm0, 5
packuswb xmm0, xmm7
movq [edi+8], xmm0
-
-
+
+
add esi, eax
add edi, edx
dec ecx
@@ -178,9 +178,9 @@
ALIGN 16
;***********************************************************************
-; void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc,
-; int32_t iSrcStride,
-; uint8_t* pTap,
+; void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc,
+; int32_t iSrcStride,
+; uint8_t* pTap,
; int32_t iTapStride,
; int32_t iHeight);
;***********************************************************************
@@ -193,11 +193,11 @@
mov edi, [esp+24] ;tap
mov edx, [esp+28] ;tap_stride
mov ebx, [esp+32] ;i_height
- pxor xmm7, xmm7
-
+ pxor xmm7, xmm7
+
sub esi, eax ; the filter needs 5 extra source lines (2 above, 3 below)
sub esi, eax
-
+
.yloop_width_8:
movq xmm0, [esi]
punpcklbw xmm0, xmm7
@@ -211,7 +211,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -221,7 +221,7 @@
psllw xmm4, 2
paddw xmm0, xmm4
movdqa [edi], xmm0
-
+
add esi, eax
add edi, edx
dec ebx
@@ -230,12 +230,12 @@
pop edi
pop esi
ret
-
+
;***********************************************************************
-; void McHorVer02WidthEq8_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
+; void McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
; int32_t iHeight )
;***********************************************************************
ALIGN 16
@@ -242,18 +242,18 @@
McHorVer02WidthEq8_sse2:
push esi
push edi
-
- mov esi, [esp + 12]
- mov edx, [esp + 16]
- mov edi, [esp + 20]
- mov eax, [esp + 24]
- mov ecx, [esp + 28]
+ mov esi, [esp + 12]
+ mov edx, [esp + 16]
+ mov edi, [esp + 20]
+ mov eax, [esp + 24]
+ mov ecx, [esp + 28]
+
sub esi, edx
sub esi, edx
WELS_Zero xmm7
-
+
SSE_LOAD_8P xmm0, xmm7, [esi]
SSE_LOAD_8P xmm1, xmm7, [esi+edx]
lea esi, [esi+2*edx]
@@ -262,8 +262,8 @@
lea esi, [esi+2*edx]
SSE_LOAD_8P xmm4, xmm7, [esi]
SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-
-.start:
+
+.start:
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
jz near .xx_exit
@@ -273,7 +273,7 @@
FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
dec ecx
jz near .xx_exit
-
+
lea edi, [edi+2*eax]
SSE_LOAD_8P xmm7, xmm0, [esi+edx]
FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
@@ -356,11 +356,11 @@
;***********************************************************************
-; void McHorVer02_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
+; void McHorVer02_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
; int32_t iDstStride,
-; int32_t iWidth,
+; int32_t iWidth,
; int32_t iHeight )
;***********************************************************************
ALIGN 16
@@ -368,19 +368,19 @@
push esi
push edi
push ebx
-
- mov esi, [esp + 16]
- mov edx, [esp + 20]
- mov edi, [esp + 24]
- mov eax, [esp + 28]
- mov ecx, [esp + 36]
- mov ebx, [esp + 32]
+
+ mov esi, [esp + 16]
+ mov edx, [esp + 20]
+ mov edi, [esp + 24]
+ mov eax, [esp + 28]
+ mov ecx, [esp + 36]
+ mov ebx, [esp + 32]
shr ebx, 3
sub esi, edx
sub esi, edx
-
-.xloop:
- WELS_Zero xmm7
+
+.xloop:
+ WELS_Zero xmm7
SSE_LOAD_8P xmm0, xmm7, [esi]
SSE_LOAD_8P xmm1, xmm7, [esi+edx]
lea esi, [esi+2*edx]
@@ -389,7 +389,7 @@
lea esi, [esi+2*edx]
SSE_LOAD_8P xmm4, xmm7, [esi]
SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-
+
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
lea esi, [esi+2*edx]
@@ -402,8 +402,8 @@
movdqa xmm5,xmm6
add edi, eax
sub esi, edx
-
-.start:
+
+.start:
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
jz near .x_loop_dec
@@ -413,7 +413,7 @@
FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*eax]
SSE_LOAD_8P xmm7, xmm0, [esi+edx]
FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
@@ -454,16 +454,16 @@
SSE_LOAD_8P xmm5, xmm6, [esi+edx]
jmp near .start
-.x_loop_dec:
+.x_loop_dec:
dec ebx
jz near .xx_exit
- mov esi, [esp + 16]
- mov edi, [esp + 24]
+ mov esi, [esp + 16]
+ mov edi, [esp + 24]
sub esi, edx
sub esi, edx
add esi, 8
add edi, 8
- mov ecx, [esp + 36]
+ mov ecx, [esp + 36]
jmp near .xloop
.xx_exit:
@@ -473,12 +473,12 @@
ret
-ALIGN 16
+ALIGN 16
;***********************************************************************
-; void McHorVer20_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
+; void McHorVer20_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight
; );
@@ -487,19 +487,19 @@
push esi
push edi
push ebx
- mov esi, [esp+16]
- mov eax, [esp+20]
- mov edi, [esp+24]
- mov edx, [esp+28]
- mov ecx, [esp+32]
- mov ebx, [esp+36]
+ mov esi, [esp+16]
+ mov eax, [esp+20]
+ mov edi, [esp+24]
+ mov edx, [esp+28]
+ mov ecx, [esp+32]
+ mov ebx, [esp+36]
sub esi, 2
- pxor xmm7, xmm7
-
+ pxor xmm7, xmm7
+
cmp ecx, 9
- jne near .width_17
-
-.yloop_width_9:
+ jne near .width_17
+
+.yloop_width_9:
movq xmm0, [esi]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5]
@@ -512,7 +512,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
@@ -526,12 +526,12 @@
paddw xmm0, [h264_w0x10_1]
psraw xmm0, 5
packuswb xmm0, xmm0
- movd [edi], xmm0
-
+ movd [edi], xmm0
+
pxor xmm7, xmm7
movq xmm0, [esi+6]
punpcklbw xmm0, xmm7
-
+
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
@@ -543,8 +543,8 @@
paddw xmm2, [h264_w0x10_1]
psraw xmm2, 5
packuswb xmm2, xmm2
- movq [edi+1], xmm2
-
+ movq [edi+1], xmm2
+
add esi, eax
add edi, edx
dec ebx
@@ -553,8 +553,8 @@
pop edi
pop esi
ret
-
-
+
+
.width_17:
.yloop_width_17:
movq xmm0, [esi]
@@ -569,7 +569,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -582,7 +582,7 @@
psraw xmm0, 5
packuswb xmm0, xmm0
movq [edi], xmm0
-
+
movq xmm0, [esi+8]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5+8]
@@ -595,7 +595,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3+8]
punpcklbw xmm5, xmm7
-
+
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
@@ -610,12 +610,12 @@
psraw xmm0, 5
packuswb xmm0, xmm0
movd [edi+8], xmm0
-
-
+
+
pxor xmm7, xmm7
movq xmm0, [esi+6+8]
punpcklbw xmm0, xmm7
-
+
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
@@ -627,7 +627,7 @@
paddw xmm2, [h264_w0x10_1]
psraw xmm2, 5
packuswb xmm2, xmm2
- movq [edi+9], xmm2
+ movq [edi+9], xmm2
add esi, eax
add edi, edx
dec ebx
@@ -636,14 +636,14 @@
pop edi
pop esi
ret
-
-
+
+
ALIGN 16
;***********************************************************************
;void McHorVer22HorFirst_sse2
-; (uint8_t *pSrc,
-; int32_t iSrcStride,
+; (uint8_t *pSrc,
+; int32_t iSrcStride,
; uint8_t * pTap,
; int32_t iTapStride,
; int32_t iWidth,int32_t iHeight);
@@ -652,21 +652,21 @@
push esi
push edi
push ebx
- mov esi, [esp+16]
- mov eax, [esp+20]
- mov edi, [esp+24]
- mov edx, [esp+28]
- mov ecx, [esp+32]
- mov ebx, [esp+36]
- pxor xmm7, xmm7
-
+ mov esi, [esp+16]
+ mov eax, [esp+20]
+ mov edi, [esp+24]
+ mov edx, [esp+28]
+ mov ecx, [esp+32]
+ mov ebx, [esp+36]
+ pxor xmm7, xmm7
+
sub esi, eax ; the filter needs 5 extra source lines (2 above, 3 below)
sub esi, eax
-
+
cmp ecx, 9
- jne near .width_17
-
-.yloop_width_9:
+ jne near .width_17
+
+.yloop_width_9:
movq xmm0, [esi]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5]
@@ -679,7 +679,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
@@ -690,12 +690,12 @@
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
- movd [edi], xmm0
-
+ movd [edi], xmm0
+
pxor xmm7, xmm7
movq xmm0, [esi+6]
punpcklbw xmm0, xmm7
-
+
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
@@ -704,9 +704,9 @@
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
- movq [edi+2], xmm2
- movhps [edi+2+8], xmm2
-
+ movq [edi+2], xmm2
+ movhps [edi+2+8], xmm2
+
add esi, eax
add edi, edx
dec ebx
@@ -715,8 +715,8 @@
pop edi
pop esi
ret
-
-
+
+
.width_17:
.yloop_width_17:
movq xmm0, [esi]
@@ -731,7 +731,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3]
punpcklbw xmm5, xmm7
-
+
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
@@ -741,7 +741,7 @@
psllw xmm4, 2
paddw xmm0, xmm4
movdqa [edi], xmm0
-
+
movq xmm0, [esi+8]
punpcklbw xmm0, xmm7
movq xmm1, [esi+5+8]
@@ -754,7 +754,7 @@
punpcklbw xmm4, xmm7
movq xmm5, [esi+3+8]
punpcklbw xmm5, xmm7
-
+
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
@@ -766,12 +766,12 @@
psllw xmm6, 2
paddw xmm0, xmm6
movd [edi+16], xmm0
-
-
+
+
pxor xmm7, xmm7
movq xmm0, [esi+6+8]
punpcklbw xmm0, xmm7
-
+
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
@@ -780,9 +780,9 @@
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
- movq [edi+18], xmm2
- movhps [edi+18+8], xmm2
-
+ movq [edi+18], xmm2
+ movhps [edi+18+8], xmm2
+
add esi, eax
add edi, edx
dec ebx
@@ -791,23 +791,23 @@
pop edi
pop esi
ret
-
-
+
+
%macro FILTER_VER 9
paddw %1, %6
movdqa %7, %2
movdqa %8, %3
-
-
+
+
paddw %7, %5
paddw %8, %4
-
- psubw %1, %7
- psraw %1, 2
- paddw %1, %8
- psubw %1, %7
- psraw %1, 2
- paddw %8, %1
+
+ psubw %1, %7
+ psraw %1, 2
+ paddw %1, %8
+ psubw %1, %7
+ psraw %1, 2
+ paddw %8, %1
paddw %8, [h264_mc_hc_32]
psraw %8, 6
packuswb %8, %8
@@ -815,8 +815,8 @@
%endmacro
;***********************************************************************
;void McHorVer22VerLastAlign_sse2(
-; uint8_t *pTap,
-; int32_t iTapStride,
+; uint8_t *pTap,
+; int32_t iTapStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
@@ -828,15 +828,15 @@
push edi
push ebx
push ebp
-
+
mov esi, [esp+20]
mov eax, [esp+24]
mov edi, [esp+28]
mov edx, [esp+32]
mov ebx, [esp+36]
- mov ecx, [esp+40]
- shr ebx, 3
-
+ mov ecx, [esp+40]
+ shr ebx, 3
+
.width_loop:
movdqa xmm0, [esi]
movdqa xmm1, [esi+eax]
@@ -846,12 +846,12 @@
lea esi, [esi+2*eax]
movdqa xmm4, [esi]
movdqa xmm5, [esi+eax]
-
+
FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
lea esi, [esi+2*eax]
movdqa xmm6, [esi]
-
+
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
@@ -858,61 +858,61 @@
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
-
+
add edi, edx
- sub esi, eax
-
+ sub esi, eax
+
.start:
FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm6, [esi]
FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm7, [esi+eax]
FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm0, [esi]
FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm1, [esi+eax]
FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm2, [esi]
FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm3, [esi+eax]
FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqa xmm4, [esi]
FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqa xmm5, [esi+eax]
jmp near .start
-
+
.x_loop_dec:
dec ebx
jz near .exit
@@ -922,9 +922,9 @@
add esi, 16
add edi, 8
jmp .width_loop
-
-
-
+
+
+
.exit:
pop ebp
pop ebx
@@ -934,8 +934,8 @@
;***********************************************************************
;void McHorVer22VerLastUnAlign_sse2(
-; uint8_t *pTap,
-; int32_t iTapStride,
+; uint8_t *pTap,
+; int32_t iTapStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
@@ -947,15 +947,15 @@
push edi
push ebx
push ebp
-
+
mov esi, [esp+20]
mov eax, [esp+24]
mov edi, [esp+28]
mov edx, [esp+32]
mov ebx, [esp+36]
- mov ecx, [esp+40]
- shr ebx, 3
-
+ mov ecx, [esp+40]
+ shr ebx, 3
+
.width_loop:
movdqu xmm0, [esi]
movdqu xmm1, [esi+eax]
@@ -965,12 +965,12 @@
lea esi, [esi+2*eax]
movdqu xmm4, [esi]
movdqu xmm5, [esi+eax]
-
+
FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
lea esi, [esi+2*eax]
movdqu xmm6, [esi]
-
+
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
@@ -977,61 +977,61 @@
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
-
+
add edi, edx
- sub esi, eax
-
+ sub esi, eax
+
.start:
FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqu xmm6, [esi]
FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqu xmm7, [esi+eax]
FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqu xmm0, [esi]
FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqu xmm1, [esi+eax]
FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqu xmm2, [esi]
FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqu xmm3, [esi+eax]
FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
dec ecx
jz near .x_loop_dec
-
+
lea esi, [esi+2*eax]
movdqu xmm4, [esi]
FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
dec ecx
jz near .x_loop_dec
-
+
lea edi, [edi+2*edx]
movdqu xmm5, [esi+eax]
jmp near .start
-
+
.x_loop_dec:
dec ebx
jz near .exit
@@ -1041,9 +1041,9 @@
add esi, 16
add edi, 8
jmp .width_loop
-
-
-
+
+
+
.exit:
pop ebp
pop ebx
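
The half-sample luma kernels above (McHorVer20*, McHorVer02*, and the two-pass 22 variants) all apply the standard H.264 6-tap filter (1, -5, 20, 20, -5, 1); [h264_w0x10_1] is the rounding constant 16 for the +16 >> 5 normalization, and the initial sub esi, 2 positions the source two samples before the block. A scalar sketch of the horizontal case (helper names and the unified iWidth are illustrative):

#include <stdint.h>

static uint8_t Clip1(int32_t v) { /* same range clamp packuswb gives */
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void McHorVer20_c(const uint8_t *pSrc, int32_t iSrcStride,
                         uint8_t *pDst, int32_t iDstStride,
                         int32_t iWidth, int32_t iHeight) {
  for (int32_t y = 0; y < iHeight; y++) {
    for (int32_t x = 0; x < iWidth; x++) {
      int32_t v = pSrc[x - 2] + pSrc[x + 3]
                - 5 * (pSrc[x - 1] + pSrc[x + 2])
                + 20 * (pSrc[x] + pSrc[x + 1]);
      pDst[x] = Clip1((v + 16) >> 5);
    }
    pSrc += iSrcStride;
    pDst += iDstStride;
  }
}
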
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
;* memzero.asm
;*
;* Abstract
-;*
;*
+;*
;* History
;* 9/16/2009 Created
;*
@@ -47,8 +47,8 @@
; Code
;***********************************************************************
-SECTION .text
-
+SECTION .text
+
ALIGN 16
;***********************************************************************
;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
WelsPrefetchZero_mmx:
mov eax,[esp+4]
prefetchnta [eax]
- ret
+ ret
ALIGN 16
@@ -69,7 +69,7 @@
mov eax, [esp + 4] ; dst
mov ecx, [esp + 8]
neg ecx
-
+
pxor xmm0, xmm0
.memzeroa64_sse2_loops:
movdqa [eax], xmm0
@@ -77,12 +77,12 @@
movdqa [eax+32], xmm0
movdqa [eax+48], xmm0
add eax, 0x40
-
+
add ecx, 0x40
jnz near .memzeroa64_sse2_loops
-
- ret
+ ret
+
ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -92,7 +92,7 @@
mov eax, [esp + 4] ; dst
mov ecx, [esp + 8]
neg ecx
-
+
pxor mm0, mm0
.memzero64_mmx_loops:
movq [eax], mm0
@@ -102,16 +102,16 @@
movq [eax+32], mm0
movq [eax+40], mm0
movq [eax+48], mm0
- movq [eax+56], mm0
+ movq [eax+56], mm0
add eax, 0x40
-
+
add ecx, 0x40
jnz near .memzero64_mmx_loops
-
- WELSEMMS
- ret
-
-ALIGN 16
+
+ WELSEMMS
+ ret
+
+ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
;***********************************************************************
@@ -119,17 +119,17 @@
WelsSetMemZeroSize8_mmx:
mov eax, [esp + 4] ; dst
mov ecx, [esp + 8] ; size
- neg ecx
+ neg ecx
pxor mm0, mm0
-
+
.memzero8_mmx_loops:
movq [eax], mm0
add eax, 0x08
-
+
add ecx, 0x08
jnz near .memzero8_mmx_loops
-
- WELSEMMS
- ret
-
+ WELSEMMS
+ ret
+
+
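
Behaviorally, the WelsSetMemZero* routines above are strided memset(0) loops: the _sse2 and _mmx size-64 variants clear 64 bytes per iteration (the _sse2 one with movdqa, so it assumes a 16-byte-aligned destination) and WelsSetMemZeroSize8_mmx clears 8 bytes per iteration, so callers are expected to pass sizes that are multiples of the block width. A one-byte-per-step C model:

#include <stdint.h>

/* Same contract, minus the SIMD blocking and alignment requirements. */
static void WelsSetMemZero_c(void *dst, int32_t size) {
  uint8_t *p = (uint8_t *)dst;
  while (size > 0) {
    *p++ = 0;
    size--;
  }
}
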
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -44,17 +44,17 @@
BITS 32
-SECTION .text
+SECTION .text
;************************************************
-;NEW_QUANT
+;NEW_QUANT
;************************************************
%macro SSE2_Quant8 5
MOVDQ %1, %5
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
paddusw %1, %3
pmulhuw %1, %4
pxor %1, %2
@@ -64,10 +64,10 @@
%macro SSE2_QuantMax8 6
MOVDQ %1, %5
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
paddusw %1, %3
pmulhuw %1, %4
pmaxsw %6, %1
@@ -86,17 +86,17 @@
WELS_EXTERN WelsQuant4x4_sse2
align 16
WelsQuant4x4_sse2:
- mov eax, [ff]
- mov ecx, [mf]
+ mov eax, [ff]
+ mov ecx, [mf]
MOVDQ xmm2, [eax]
MOVDQ xmm3, [ecx]
-
+
mov edx, [pDct]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
ret
-
+
;***********************************************************************
;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
;***********************************************************************
@@ -104,17 +104,17 @@
align 16
WelsQuant4x4Dc_sse2:
mov ax, [mf]
- SSE2_Copy8Times xmm3, eax
-
+ SSE2_Copy8Times xmm3, eax
+
mov cx, [ff]
- SSE2_Copy8Times xmm2, ecx
+ SSE2_Copy8Times xmm2, ecx
mov edx, [pDct]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
-
- ret
-
+
+ ret
+
;***********************************************************************
; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
@@ -121,20 +121,20 @@
WELS_EXTERN WelsQuantFour4x4_sse2
align 16
WelsQuantFour4x4_sse2:
- mov eax, [ff]
- mov ecx, [mf]
+ mov eax, [ff]
+ mov ecx, [mf]
MOVDQ xmm2, [eax]
MOVDQ xmm3, [ecx]
-
- mov edx, [pDct]
+
+ mov edx, [pDct]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x20]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x30]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x40]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x50]
SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x60]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x70]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x70]
ret
@@ -144,17 +144,17 @@
WELS_EXTERN WelsQuantFour4x4Max_sse2
align 16
WelsQuantFour4x4Max_sse2:
- mov eax, [ff]
- mov ecx, [mf]
+ mov eax, [ff]
+ mov ecx, [mf]
MOVDQ xmm2, [eax]
MOVDQ xmm3, [ecx]
-
- mov edx, [pDct]
+
+ mov edx, [pDct]
pxor xmm4, xmm4
pxor xmm5, xmm5
pxor xmm6, xmm6
pxor xmm7, xmm7
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx ], xmm4
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx ], xmm4
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10], xmm4
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x20], xmm5
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x30], xmm5
@@ -162,20 +162,20 @@
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x50], xmm6
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x60], xmm7
SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x70], xmm7
-
+
SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
- pmaxsw xmm0, xmm4
+ pmaxsw xmm0, xmm4
pmaxsw xmm0, xmm5
- pmaxsw xmm0, xmm7
+ pmaxsw xmm0, xmm7
movdqa xmm1, xmm0
punpckhqdq xmm0, xmm1
pmaxsw xmm0, xmm1
- mov edx, [max]
- movq [edx], xmm0
-
- ret
+ mov edx, [max]
+ movq [edx], xmm0
+ ret
+
%macro MMX_Copy4Times 2
movd %1, %2
punpcklwd %1, %1
@@ -185,10 +185,10 @@
SECTION .text
%macro MMX_Quant4 4
- pxor %2, %2
- pcmpgtw %2, %1
- pxor %1, %2
- psubw %1, %2
+ pxor %2, %2
+ pcmpgtw %2, %1
+ pxor %1, %2
+ psubw %1, %2
paddusw %1, %3
pmulhuw %1, %4
pxor %1, %2
@@ -211,13 +211,13 @@
movd mm3, [eax + 0x40]
movd mm1, [eax + 0x60]
punpcklwd mm3, mm1
-
+
mov cx, 0
mov [eax], cx
mov [eax + 0x20], cx
mov [eax + 0x40], cx
mov [eax + 0x60], cx
-
+
;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
movq mm5, mm3
paddw mm3, mm0
@@ -229,22 +229,22 @@
paddw mm1, mm3
psubw mm3, mm5
punpcklwd mm1, mm3
-
+
;quant_2x2_dc
mov ax, [mf]
- MMX_Copy4Times mm3, eax
+ MMX_Copy4Times mm3, eax
mov cx, [ff]
MMX_Copy4Times mm2, ecx
MMX_Quant4 mm1, mm0, mm2, mm3
-
+
; store dct_2x2
- mov edx, [dct2x2]
+ mov edx, [dct2x2]
movq [edx], mm1
mov ecx, [iChromaDc]
movq [ecx], mm1
-
+
; pNonZeroCount of dct_2x2
- pcmpeqb mm2, mm2 ; mm2 = FF
+ pcmpeqb mm2, mm2 ; mm2 = FF
pxor mm3, mm3
packsswb mm1, mm3
pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
@@ -251,10 +251,10 @@
psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
psadbw mm1, mm3 ;
movd eax, mm1
-
+
WELSEMMS
ret
-
+
;***********************************************************************
;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf);
;***********************************************************************
@@ -269,7 +269,7 @@
movd mm3, [eax + 0x40]
movd mm1, [eax + 0x60]
punpcklwd mm3, mm1
-
+
;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
movq mm5, mm3
paddw mm3, mm0
@@ -281,16 +281,16 @@
paddw mm1, mm3
psubw mm3, mm5
punpcklwd mm1, mm3
-
+
;quant_2x2_dc
mov ax, [mf]
- MMX_Copy4Times mm3, eax
+ MMX_Copy4Times mm3, eax
mov cx, [ff]
MMX_Copy4Times mm2, ecx
MMX_Quant4 mm1, mm0, mm2, mm3
-
+
; pNonZeroCount of dct_2x2
- pcmpeqb mm2, mm2 ; mm2 = FF
+ pcmpeqb mm2, mm2 ; mm2 = FF
pxor mm3, mm3
packsswb mm1, mm3
pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
@@ -297,16 +297,16 @@
psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
psadbw mm1, mm3 ;
movd eax, mm1
-
- WELSEMMS
- ret
-
-
-%macro SSE2_DeQuant8 3
+
+ WELSEMMS
+ ret
+
+
+%macro SSE2_DeQuant8 3
MOVDQ %2, %1
pmullw %2, %3
MOVDQ %1, %2
-%endmacro
+%endmacro
ALIGN 16
@@ -329,7 +329,7 @@
;***********************************************************************
;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
;***********************************************************************
-
+
align 16
WELS_EXTERN WelsDequantFour4x4_sse2
@@ -356,15 +356,15 @@
WELS_EXTERN WelsDequantIHadamard4x4_sse2
align 16
WelsDequantIHadamard4x4_sse2:
- mov eax, [esp + 4]
+ mov eax, [esp + 4]
mov cx, [esp + 8]
-
+
; WelsDequantLumaDc4x4
- SSE2_Copy8Times xmm1, ecx
+ SSE2_Copy8Times xmm1, ecx
;psrlw xmm1, 2 ; for the (>>2) in ihdm
MOVDQ xmm0, [eax]
MOVDQ xmm2, [eax+0x10]
- pmullw xmm0, xmm1
+ pmullw xmm0, xmm1
pmullw xmm2, xmm1
; ihdm_4x4
@@ -371,24 +371,23 @@
movdqa xmm1, xmm0
psrldq xmm1, 8
movdqa xmm3, xmm2
- psrldq xmm3, 8
-
- SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
- SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
+ psrldq xmm3, 8
+
+ SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
+ SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
- SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
- SSE2_SumSub xmm2, xmm4, xmm5
- SSE2_SumSub xmm1, xmm0, xmm5
- SSE2_SumSub xmm4, xmm0, xmm5
- SSE2_SumSub xmm2, xmm1, xmm5
+ SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
+ SSE2_SumSub xmm2, xmm4, xmm5
+ SSE2_SumSub xmm1, xmm0, xmm5
+ SSE2_SumSub xmm4, xmm0, xmm5
+ SSE2_SumSub xmm2, xmm1, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
-
+
punpcklqdq xmm0, xmm1
MOVDQ [eax], xmm0
-
+
punpcklqdq xmm2, xmm3
- MOVDQ [eax+16], xmm2
+ MOVDQ [eax+16], xmm2
ret
-
\ No newline at end of file
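
The SSE2_Quant8/MMX_Quant4 macros above quantize in sign-magnitude form: the pxor/pcmpgtw/psubw prologue takes |c|, paddusw adds the rounding offset ff, pmulhuw keeps the high 16 bits of the multiply by mf, and the trailing pxor/psubw restores the sign. A scalar model; the cyclic ff/mf indexing mirrors the reuse of xmm2/xmm3 for every group of eight coefficients, and plain addition stands in for the saturating paddusw:

#include <stdint.h>
#include <stdlib.h>

/* q = sign(c) * (((|c| + ff) * mf) >> 16) */
static void WelsQuant_c(int16_t *pDct, const uint16_t *ff,
                        const uint16_t *mf, int32_t n) {
  for (int32_t i = 0; i < n; i++) {
    int32_t sign = pDct[i] < 0 ? -1 : 1;
    uint32_t level = (((uint32_t)abs(pDct[i]) + ff[i & 7]) * mf[i & 7]) >> 16;
    pDct[i] = (int16_t)(sign * (int32_t)level);
  }
}
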
--- a/codec/encoder/core/asm/satd_sad.asm
+++ b/codec/encoder/core/asm/satd_sad.asm
@@ -37,7 +37,7 @@
;* WelsSampleSatd16x8_sse2
;* WelsSampleSatd8x16_sse2
;* WelsSampleSatd16x16_sse2
-;*
+;*
;* WelsSampleSad16x8_sse2
;* WelsSampleSad16x16_sse2
;*
@@ -99,12 +99,12 @@
%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %4, %5
- SSE2_SumSub %2, %4, %5
- SSE2_SumSub %1, %3, %5
-%endmacro
+ SSE2_SumSub %3, %4, %5
+ SSE2_SumSub %2, %4, %5
+ SSE2_SumSub %1, %3, %5
+%endmacro
-%macro SSE2_SumAbs4 7
+%macro SSE2_SumAbs4 7
WELS_AbsW %1, %3
WELS_AbsW %2, %3
WELS_AbsW %4, %6
@@ -113,13 +113,13 @@
paddusw %4, %5
paddusw %7, %1
paddusw %7, %4
-%endmacro
+%endmacro
%macro SSE2_SumWHorizon 3
movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
@@ -132,12 +132,12 @@
lea ecx, [ecx+2*edx]
SSE2_LoadDiff8P xmm2,xmm4,xmm7,[eax],[ecx]
SSE2_LoadDiff8P xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-
+
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
SSE2_LoadDiff8P xmm0,xmm4,xmm7,[eax],[ecx]
@@ -146,11 +146,11 @@
lea ecx, [ecx+2*edx]
SSE2_LoadDiff8P xmm2,xmm4,xmm7,[eax],[ecx]
SSE2_LoadDiff8P xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-
+
SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
%endmacro
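
SSE2_GetSatd8x8 accumulates a standard Hadamard-transform SATD: difference the two blocks, apply the 4x4 Hadamard butterflies in both directions (SSE2_HDMTwo4x4 plus the transpose), sum the absolute coefficients, and halve the total at the end. A plain-C model of the underlying 4x4 kernel; larger block sizes tile it, and the final >> 1 matches the psrlw/shr steps below:

#include <stdint.h>
#include <stdlib.h>

static int32_t Satd4x4_c(const uint8_t *pA, int32_t iStrideA,
                         const uint8_t *pB, int32_t iStrideB) {
  int32_t d[16], m[16], satd = 0;
  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 4; j++)
      d[4 * i + j] = pA[i * iStrideA + j] - pB[i * iStrideB + j];
  for (int i = 0; i < 4; i++) {            /* horizontal butterflies */
    int s01 = d[4*i+0] + d[4*i+1], s23 = d[4*i+2] + d[4*i+3];
    int t01 = d[4*i+0] - d[4*i+1], t23 = d[4*i+2] - d[4*i+3];
    m[4*i+0] = s01 + s23; m[4*i+1] = t01 + t23;
    m[4*i+2] = s01 - s23; m[4*i+3] = t01 - t23;
  }
  for (int j = 0; j < 4; j++) {            /* vertical butterflies */
    int s01 = m[j] + m[4+j], s23 = m[8+j] + m[12+j];
    int t01 = m[j] - m[4+j], t23 = m[8+j] - m[12+j];
    satd += abs(s01 + s23) + abs(t01 + t23)
          + abs(s01 - s23) + abs(t01 - t23);
  }
  return satd >> 1;
}
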
;***********************************************************************
@@ -165,8 +165,8 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
-
+ mov edx, [esp+20]
+
movd xmm0, [eax]
movd xmm1, [eax+ebx]
lea eax , [eax+2*ebx]
@@ -174,7 +174,7 @@
movd xmm3, [eax+ebx]
punpckldq xmm0, xmm2
punpckldq xmm1, xmm3
-
+
movd xmm4, [ecx]
movd xmm5, [ecx+edx]
lea ecx , [ecx+2*edx]
@@ -188,7 +188,7 @@
punpcklbw xmm1, xmm6
punpcklbw xmm4, xmm6
punpcklbw xmm5, xmm6
-
+
psubw xmm0, xmm4
psubw xmm1, xmm5
@@ -196,7 +196,7 @@
paddw xmm0, xmm1
psubw xmm2, xmm1
SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
+
movdqa xmm4, xmm0
paddw xmm0, xmm3
psubw xmm4, xmm3
@@ -204,7 +204,7 @@
movdqa xmm2, xmm0
punpcklwd xmm0, xmm4
punpckhwd xmm4, xmm2
-
+
SSE2_XSawp dq, xmm0, xmm4, xmm3
SSE2_XSawp qdq, xmm0, xmm3, xmm5
@@ -211,16 +211,16 @@
movdqa xmm7, xmm0
paddw xmm0, xmm5
psubw xmm7, xmm5
-
+
SSE2_XSawp qdq, xmm0, xmm7, xmm1
movdqa xmm2, xmm0
paddw xmm0, xmm1
psubw xmm2, xmm1
-
- WELS_AbsW xmm0, xmm3
+
+ WELS_AbsW xmm0, xmm3
paddusw xmm6, xmm0
- WELS_AbsW xmm2, xmm4
+ WELS_AbsW xmm2, xmm4
paddusw xmm6, xmm2
SSE2_SumWHorizon1 xmm6, xmm4
movd eax, xmm6
@@ -228,7 +228,7 @@
shr eax, 1
pop ebx
ret
-
+
;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -241,16 +241,16 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
- pxor xmm7, xmm7
- SSE2_GetSatd8x8
+ pxor xmm7, xmm7
+ SSE2_GetSatd8x8
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd eax, xmm6
pop ebx
ret
-
+
;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -263,15 +263,15 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
+ pxor xmm7, xmm7
+
+ SSE2_GetSatd8x8
lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- SSE2_GetSatd8x8
-
+ lea ecx, [ecx+2*edx]
+ SSE2_GetSatd8x8
+
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
movd eax, xmm6
@@ -290,15 +290,15 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
pxor xmm7, xmm7
-
+
SSE2_GetSatd8x8
mov eax, [esp+8]
mov ecx, [esp+16]
add eax, 8
- add ecx, 8
+ add ecx, 8
SSE2_GetSatd8x8
psrlw xmm6, 1
@@ -319,25 +319,25 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
+
+ SSE2_GetSatd8x8
lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
SSE2_GetSatd8x8
-
+
mov eax, [esp+8]
mov ecx, [esp+16]
add eax, 8
add ecx, 8
-
- SSE2_GetSatd8x8
+
+ SSE2_GetSatd8x8
lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
SSE2_GetSatd8x8
-
+
; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
psrlw xmm6, 1
SSE2_SumWHorizon xmm6,xmm4,xmm7
@@ -353,18 +353,18 @@
;***********************************************************************
;
-;Pixel_satd_intra_sse2 BEGIN
+;Pixel_satd_intra_sse2 BEGIN
;
;***********************************************************************
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
pmaddubsw %1, xmm5
movdqa %2, %1
pmaddwd %1, xmm7
pmaddwd %2, xmm6
movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
+ punpckldq %1, %2
+ punpckhdq %2, %3
movdqa %3, %1
punpcklqdq %1, %2
punpckhqdq %3, %2
@@ -373,14 +373,14 @@
packssdw %1, %3
psllw %1, 2
%endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
pmaddubsw %1, xmm5
movdqa %2, %1
pmaddwd %1, xmm7
pmaddwd %2, xmm6
movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
+ punpckldq %1, %2
+ punpckhdq %2, %3
movdqa %3, %1
punpcklqdq %1, %2
punpckhqdq %3, %2
@@ -387,7 +387,7 @@
; paddd xmm4, %1 ;for dc
; paddd xmm4, %3 ;for dc
movdqa %4, %1
- punpcklqdq %4, %3
+ punpcklqdq %4, %3
packssdw %1, %3
psllw %1, 2
%endmacro
@@ -415,25 +415,25 @@
pinsrw xmm0, word[esi+%2+8], 4
psubsw xmm0, xmm7
pabsw xmm0, xmm0
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
pxor xmm0, xmm0
pinsrw xmm0, word[esi+%2+2], 0
pinsrw xmm0, word[esi+%2+10], 4
psubsw xmm0, xmm1
pabsw xmm0, xmm0
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
pxor xmm0, xmm0
pinsrw xmm0, word[esi+%2+4], 0
pinsrw xmm0, word[esi+%2+12], 4
psubsw xmm0, xmm3
pabsw xmm0, xmm0
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
pxor xmm0, xmm0
pinsrw xmm0, word[esi+%2+6], 0
pinsrw xmm0, word[esi+%2+14], 4
psubsw xmm0, xmm2
pabsw xmm0, xmm0
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
%endmacro
%macro SSE41_GetX38x4SatdH 3
movq xmm0, [esi+%3+8*%1]
@@ -455,7 +455,7 @@
psubsw xmm0, xmm7
pabsw xmm0, xmm0
paddw xmm6, xmm0
- paddw xmm6, xmm2
+ paddw xmm6, xmm2
%endmacro
%macro SSE41_ChromaGetX38x4SatdDC 1
shl %1, 4
@@ -463,13 +463,13 @@
psubsw xmm0, xmm7
pabsw xmm0, xmm0
paddw xmm6, xmm0
- paddw xmm6, xmm2
+ paddw xmm6, xmm2
%endmacro
%macro SSE41_I16x16GetX38x4Satd 2
SSE41_GetX38x4SatdDec
SSE41_GetX38x4SatdV %1, %2
SSE41_GetX38x4SatdH %1, %2, 32
- SSE41_I16X16GetX38x4SatdDC
+ SSE41_I16X16GetX38x4SatdDC
%endmacro
%macro SSE41_ChromaGetX38x4Satd 2
SSE41_GetX38x4SatdDec
@@ -478,11 +478,11 @@
SSE41_ChromaGetX38x4SatdDC %1
%endmacro
%macro SSE41_HSum8W 3
- pmaddwd %1, %2
- movhlps %3, %1
- paddd %1, %3
- pshuflw %3, %1,0Eh
- paddd %1, %3
+ pmaddwd %1, %2
+ movhlps %3, %1
+ paddd %1, %3
+ pshuflw %3, %1,0Eh
+ paddd %1, %3
%endmacro
WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
@@ -493,7 +493,7 @@
mov ecx, [esp+16]
mov edx, [esp+20]
mov eax, [esp+24]
- mov ebx, [esp+28]
+ mov ebx, [esp+28]
mov esi, [esp+40] ;temp_satd
pxor xmm4, xmm4
movdqa xmm5, [HSumSubDB1]
@@ -507,29 +507,29 @@
SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
movdqa [esi], xmm0 ;V
- movdqa [esi+16], xmm1
+ movdqa [esi+16], xmm1
add ecx, edx
pinsrb xmm0, byte[ecx-1], 0
pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 2
pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 4
pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 6
pinsrb xmm0, byte[ecx+edx-1], 7
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 8
pinsrb xmm0, byte[ecx+edx-1], 9
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 10
pinsrb xmm0, byte[ecx+edx-1], 11
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 12
pinsrb xmm0, byte[ecx+edx-1], 13
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 14
pinsrb xmm0, byte[ecx+edx-1], 15
movhlps xmm1, xmm0
@@ -549,7 +549,7 @@
pxor xmm6, xmm6 ;DC
mov ecx, 0
mov edi, 0
-.loop16x16_get_satd:
+.loop16x16_get_satd:
.loopStart1:
SSE41_I16x16GetX38x4Satd ecx, edi
inc ecx
@@ -562,8 +562,8 @@
mov ecx, 0
add edi, 16
jmp .loop16x16_get_satd
- .loop16x16_get_satd_end:
- MMX_DW_1_2REG xmm0, xmm1
+ .loop16x16_get_satd_end:
+ MMX_DW_1_2REG xmm0, xmm1
psrlw xmm4, 1 ;/2
psrlw xmm5, 1 ;/2
psrlw xmm6, 1 ;/2
@@ -570,7 +570,7 @@
SSE41_HSum8W xmm4, xmm0, xmm1
SSE41_HSum8W xmm5, xmm0, xmm1
SSE41_HSum8W xmm6, xmm0, xmm1
-
+
; comparing order: DC H V
movd ebx, xmm6 ;DC
movd edi, xmm5 ;H
@@ -577,33 +577,33 @@
movd ecx, xmm4 ;V
mov edx, [esp+36]
shl edx, 1
- add edi, edx
- add ebx, edx
+ add edi, edx
+ add ebx, edx
mov edx, [esp+32]
cmp ebx, edi
jge near not_dc_16x16
cmp ebx, ecx
jge near not_dc_h_16x16
-
+
; for DC mode
mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
+ mov eax, ebx
jmp near return_satd_intra_16x16_x3
not_dc_16x16:
- ; for H mode
+ ; for H mode
cmp edi, ecx
jge near not_dc_h_16x16
mov dword[edx], 1;I16_PRED_H
- mov eax, edi
+ mov eax, edi
jmp near return_satd_intra_16x16_x3
not_dc_h_16x16:
; for V mode
mov dword[edx], 0;I16_PRED_V
mov eax, ecx
-return_satd_intra_16x16_x3:
+return_satd_intra_16x16_x3:
WELSEMMS
- pop edi
- pop esi
+ pop edi
+ pop esi
pop ebx
ret
@@ -619,13 +619,13 @@
add ecx, edx
pinsrb xmm0, byte[ecx-1], 0
pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 2
pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 4
pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
+ lea ecx, [ecx+2*edx]
pinsrb xmm0, byte[ecx-1], 6
pinsrb xmm0, byte[ecx+edx-1], 7
punpcklqdq xmm0, xmm0
@@ -634,10 +634,10 @@
;(sum+2)>>2
movdqa xmm6, [PDQ2]
movdqa xmm5, xmm4
- punpckhqdq xmm5, xmm1
+ punpckhqdq xmm5, xmm1
paddd xmm5, xmm6
psrld xmm5, 2
-;(sum1+sum2+4)>>3
+;(sum1+sum2+4)>>3
paddd xmm6, xmm6
paddd xmm4, xmm1
paddd xmm4, xmm6
@@ -644,8 +644,8 @@
psrld xmm4, 3
;satd *16
pslld xmm5, 4
- pslld xmm4, 4
-;temp satd
+ pslld xmm4, 4
+;temp satd
movdqa xmm6, xmm4
punpcklqdq xmm4, xmm5
psllq xmm4, 32
@@ -655,12 +655,12 @@
psllq xmm5, 32
psrlq xmm5, 32
movdqa [esi+48], xmm5
-
+
pxor xmm4, xmm4 ;V
pxor xmm5, xmm5 ;H
pxor xmm6, xmm6 ;DC
mov ecx, 0
-loop_chroma_satdx3_cb_cr:
+loop_chroma_satdx3_cb_cr:
SSE41_ChromaGetX38x4Satd ecx, 0
inc ecx
cmp ecx, 2
@@ -668,13 +668,13 @@
%endmacro
%macro SSEReg2MMX 3
- movdq2q %2, %1
- movhlps %1, %1
- movdq2q %3, %1
+ movdq2q %2, %1
+ movhlps %1, %1
+ movdq2q %3, %1
%endmacro
%macro MMXReg2SSE 4
- movq2dq %1, %3
- movq2dq %2, %4
+ movq2dq %1, %3
+ movq2dq %2, %4
punpcklqdq %1, %2
%endmacro
;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
@@ -687,10 +687,10 @@
mov ecx, [esp+16]
mov edx, [esp+20]
mov eax, [esp+24]
- mov ebx, [esp+28]
+ mov ebx, [esp+28]
mov esi, [esp+40] ;temp_satd
xor edi, edi
-loop_chroma_satdx3:
+loop_chroma_satdx3:
SSE41_ChromaGetX38x8Satd
cmp edi, 1
je loop_chroma_satdx3end
@@ -701,16 +701,16 @@
mov ecx, [esp+44]
mov eax, [esp+48]
jmp loop_chroma_satdx3
-loop_chroma_satdx3end:
+loop_chroma_satdx3end:
MMXReg2SSE xmm0, xmm3, mm0, mm1
MMXReg2SSE xmm1, xmm3, mm2, mm3
MMXReg2SSE xmm2, xmm3, mm5, mm6
-
+
paddw xmm4, xmm0
paddw xmm5, xmm1
paddw xmm6, xmm2
-
- MMX_DW_1_2REG xmm0, xmm1
+
+ MMX_DW_1_2REG xmm0, xmm1
psrlw xmm4, 1 ;/2
psrlw xmm5, 1 ;/2
psrlw xmm6, 1 ;/2
@@ -730,57 +730,57 @@
jge near not_dc_8x8
cmp ebx, ecx
jge near not_dc_h_8x8
-
+
; for DC mode
mov dword[edx], 0;I8_PRED_DC
- mov eax, ebx
+ mov eax, ebx
jmp near return_satd_intra_8x8_x3
not_dc_8x8:
- ; for H mode
+ ; for H mode
cmp edi, ecx
jge near not_dc_h_8x8
mov dword[edx], 1;I8_PRED_H
- mov eax, edi
+ mov eax, edi
jmp near return_satd_intra_8x8_x3
not_dc_h_8x8:
; for V mode
mov dword[edx], 2;I8_PRED_V
mov eax, ecx
-return_satd_intra_8x8_x3:
+return_satd_intra_8x8_x3:
WELSEMMS
- pop edi
- pop esi
+ pop edi
+ pop esi
pop ebx
ret
-
+
;***********************************************************************
;
-;Pixel_satd_intra_sse2 END
+;Pixel_satd_intra_sse2 END
;
;***********************************************************************
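
The tail of WelsIntra16x16Combined3Satd_sse41 (and its 8x8 chroma sibling) is a three-way mode decision: the H and DC costs are biased by 2*lambda (the shl 1 / add sequence) before the cheapest of DC, H and V wins, and the chosen mode is written through the output pointer. A scalar model using the I16 mode constants stored by the assembly:

#include <stdint.h>

enum { I16_PRED_V = 0, I16_PRED_H = 1, I16_PRED_DC = 2 };

static int32_t PickIntra16x16Mode(int32_t iCostV, int32_t iCostH,
                                  int32_t iCostDc, int32_t iLambda,
                                  int32_t *pBestMode) {
  iCostH += 2 * iLambda;   /* bias against H and DC, as above */
  iCostDc += 2 * iLambda;
  if (iCostDc < iCostH && iCostDc < iCostV) {
    *pBestMode = I16_PRED_DC;
    return iCostDc;
  }
  if (iCostH < iCostV) {
    *pBestMode = I16_PRED_H;
    return iCostH;
  }
  *pBestMode = I16_PRED_V;
  return iCostV;
}
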
%macro SSSE3_Get16BSadHVDC 2
- movd xmm6,%1
- pshufb xmm6,xmm1
+ movd xmm6,%1
+ pshufb xmm6,xmm1
movdqa %1, xmm6
- movdqa xmm0,%2
- psadbw xmm0,xmm7
- paddw xmm4,xmm0
movdqa xmm0,%2
- psadbw xmm0,xmm5
- paddw xmm2,xmm0
+ psadbw xmm0,xmm7
+ paddw xmm4,xmm0
+ movdqa xmm0,%2
+ psadbw xmm0,xmm5
+ paddw xmm2,xmm0
psadbw xmm6,%2
- paddw xmm3,xmm6
+ paddw xmm3,xmm6
%endmacro
%macro WelsAddDCValue 4
movzx %2, byte %1
- mov %3, %2
+ mov %3, %2
add %4, %2
-%endmacro
+%endmacro
;***********************************************************************
;
-;Pixel_sad_intra_ssse3 BEGIN
+;Pixel_sad_intra_ssse3 BEGIN
;
;***********************************************************************
WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
@@ -792,14 +792,14 @@
mov edx, [esp+20]
mov edi, [esp+40] ;temp_sad
sub ecx, edx
- movdqa xmm5,[ecx]
+ movdqa xmm5,[ecx]
pxor xmm0,xmm0
- psadbw xmm0,xmm5
- movhlps xmm1,xmm0
- paddw xmm0,xmm1
+ psadbw xmm0,xmm5
+ movhlps xmm1,xmm0
+ paddw xmm0,xmm1
movd eax,xmm0
-
- add ecx,edx
+
+ add ecx,edx
lea ebx, [edx+2*edx]
WelsAddDCValue [ecx-1 ], esi, [edi ], eax
WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
@@ -824,45 +824,45 @@
WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
sub edi, 192
- add eax,10h
- shr eax,5
- movd xmm7,eax
+ add eax,10h
+ shr eax,5
+ movd xmm7,eax
pxor xmm1,xmm1
pshufb xmm7,xmm1
- pxor xmm4,xmm4
- pxor xmm3,xmm3
- pxor xmm2,xmm2
-;sad begin
+ pxor xmm4,xmm4
+ pxor xmm3,xmm3
+ pxor xmm2,xmm2
+;sad begin
mov eax, [esp+24]
- mov ebx, [esp+28]
+ mov ebx, [esp+28]
lea esi, [ebx+2*ebx]
SSSE3_Get16BSadHVDC [edi], [eax]
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
+ add edi, 64
lea eax, [eax+4*ebx]
SSSE3_Get16BSadHVDC [edi], [eax]
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
+ add edi, 64
lea eax, [eax+4*ebx]
SSSE3_Get16BSadHVDC [edi], [eax]
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
+ add edi, 64
lea eax, [eax+4*ebx]
SSSE3_Get16BSadHVDC [edi], [eax]
SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-
- pslldq xmm3,4
- por xmm3,xmm2
- movhlps xmm1,xmm3
- paddw xmm3,xmm1
+
+ pslldq xmm3,4
+ por xmm3,xmm2
+ movhlps xmm1,xmm3
+ paddw xmm3,xmm1
movhlps xmm0,xmm4
paddw xmm4,xmm0
; comparing order: DC H V
@@ -872,8 +872,8 @@
movd esi, xmm3 ;H
mov eax, [esp+36] ;lamda
shl eax, 1
- add esi, eax
- add ebx, eax
+ add esi, eax
+ add ebx, eax
mov edx, [esp+32]
cmp ebx, esi
jge near not_dc_16x16_sad
@@ -881,7 +881,7 @@
jge near not_dc_h_16x16_sad
; for DC mode
mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
+ mov eax, ebx
sub edi, 192
%assign x 0
%rep 16
@@ -890,11 +890,11 @@
%endrep
jmp near return_sad_intra_16x16_x3
not_dc_16x16_sad:
- ; for H mode
+ ; for H mode
cmp esi, ecx
jge near not_dc_h_16x16_sad
mov dword[edx], 1;I16_PRED_H
- mov eax, esi
+ mov eax, esi
jmp near return_sad_intra_16x16_x3
not_dc_h_16x16_sad:
; for V mode
@@ -914,12 +914,12 @@
;***********************************************************************
;
-;Pixel_sad_intra_ssse3 END
+;Pixel_sad_intra_ssse3 END
;
;***********************************************************************
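
Inside WelsIntra16x16Combined3Sad_ssse3, the psadbw of the top row plus the WelsAddDCValue accumulation of the left column feed the add eax,10h / shr eax,5 pair, i.e. the usual 16x16 DC predictor, which pshufb then broadcasts across the block. A scalar sketch, assuming both neighbour edges are available as this routine requires:

#include <stdint.h>

/* dc = (sum of 16 top + 16 left neighbours + 16) >> 5 */
static uint8_t Intra16x16DcValue(const uint8_t *pTop, const uint8_t *pLeft) {
  int32_t sum = 0;
  for (int i = 0; i < 16; i++)
    sum += pTop[i] + pLeft[i];
  return (uint8_t)((sum + 16) >> 5);
}
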
;***********************************************************************
;
-;Pixel_satd_wxh_sse41 BEGIN
+;Pixel_satd_wxh_sse41 BEGIN
;
;***********************************************************************
@@ -934,9 +934,9 @@
movq xmm2, [ecx]
punpcklqdq xmm2, xmm2
pmaddubsw xmm2, xmm7
- movq xmm3, [ecx+edx]
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
+ movq xmm3, [ecx+edx]
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
psubsw xmm0, xmm2
psubsw xmm1, xmm3
movq xmm2, [eax+2*ebx]
@@ -948,12 +948,12 @@
movq xmm4, [ecx+2*edx]
punpcklqdq xmm4, xmm4
pmaddubsw xmm4, xmm7
- movq xmm5, [ecx+edi]
- punpcklqdq xmm5, xmm5
+ movq xmm5, [ecx+edi]
+ punpcklqdq xmm5, xmm5
pmaddubsw xmm5, xmm7
psubsw xmm2, xmm4
psubsw xmm3, xmm5
- SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
+ SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
pabsw xmm0, xmm0
pabsw xmm2, xmm2
pabsw xmm1, xmm1
@@ -970,18 +970,18 @@
pslld xmm2, 16
psrld xmm4, 16
por xmm2, xmm4
- pmaxuw xmm0, xmm2
+ pmaxuw xmm0, xmm2
paddw xmm6, xmm0
%endmacro
%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
- MMX_DW_1_2REG %3, %4
- pmaddwd %2, %3
- movhlps %4, %2
- paddd %2, %4
- pshuflw %4, %2,0Eh
- paddd %2, %4
- movd %1, %2
+ MMX_DW_1_2REG %3, %4
+ pmaddwd %2, %3
+ movhlps %4, %2
+ paddd %2, %4
+ pshuflw %4, %2,0Eh
+ paddd %2, %4
+ movd %1, %2
%endmacro
;***********************************************************************
;
@@ -990,53 +990,53 @@
;***********************************************************************
WELS_EXTERN WelsSampleSatd4x4_sse41
WelsSampleSatd4x4_sse41:
- push ebx
- mov eax,[esp+8]
- mov ebx,[esp+12]
- mov ecx,[esp+16]
- mov edx,[esp+20]
- movdqa xmm4,[HSwapSumSubDB1]
- movd xmm2,[ecx]
- movd xmm5,[ecx+edx]
- shufps xmm2,xmm5,0
- movd xmm3,[ecx+edx*2]
+ push ebx
+ mov eax,[esp+8]
+ mov ebx,[esp+12]
+ mov ecx,[esp+16]
+ mov edx,[esp+20]
+ movdqa xmm4,[HSwapSumSubDB1]
+ movd xmm2,[ecx]
+ movd xmm5,[ecx+edx]
+ shufps xmm2,xmm5,0
+ movd xmm3,[ecx+edx*2]
lea ecx, [edx*2+ecx]
- movd xmm5,[ecx+edx]
- shufps xmm3,xmm5,0
- movd xmm0,[eax]
- movd xmm5,[eax+ebx]
- shufps xmm0,xmm5,0
- movd xmm1,[eax+ebx*2]
+ movd xmm5,[ecx+edx]
+ shufps xmm3,xmm5,0
+ movd xmm0,[eax]
+ movd xmm5,[eax+ebx]
+ shufps xmm0,xmm5,0
+ movd xmm1,[eax+ebx*2]
lea eax, [ebx*2+eax]
- movd xmm5,[eax+ebx]
- shufps xmm1,xmm5,0
- pmaddubsw xmm0,xmm4
- pmaddubsw xmm1,xmm4
- pmaddubsw xmm2,xmm4
- pmaddubsw xmm3,xmm4
- psubw xmm0,xmm2
- psubw xmm1,xmm3
- movdqa xmm2,xmm0
- paddw xmm0,xmm1
- psubw xmm1,xmm2
- movdqa xmm2,xmm0
- punpcklqdq xmm0,xmm1
- punpckhqdq xmm2,xmm1
- movdqa xmm1,xmm0
- paddw xmm0,xmm2
- psubw xmm2,xmm1
- movdqa xmm1,xmm0
- pblendw xmm0,xmm2,0AAh
- pslld xmm2,16
- psrld xmm1,16
- por xmm2,xmm1
- pabsw xmm0,xmm0
- pabsw xmm2,xmm2
- pmaxsw xmm0,xmm2
+ movd xmm5,[eax+ebx]
+ shufps xmm1,xmm5,0
+ pmaddubsw xmm0,xmm4
+ pmaddubsw xmm1,xmm4
+ pmaddubsw xmm2,xmm4
+ pmaddubsw xmm3,xmm4
+ psubw xmm0,xmm2
+ psubw xmm1,xmm3
+ movdqa xmm2,xmm0
+ paddw xmm0,xmm1
+ psubw xmm1,xmm2
+ movdqa xmm2,xmm0
+ punpcklqdq xmm0,xmm1
+ punpckhqdq xmm2,xmm1
+ movdqa xmm1,xmm0
+ paddw xmm0,xmm2
+ psubw xmm2,xmm1
+ movdqa xmm1,xmm0
+ pblendw xmm0,xmm2,0AAh
+ pslld xmm2,16
+ psrld xmm1,16
+ por xmm2,xmm1
+ pabsw xmm0,xmm0
+ pabsw xmm2,xmm2
+ pmaxsw xmm0,xmm2
SSSE3_SumWHorizon eax, xmm0, xmm5, xmm7
- pop ebx
- ret
-
+ pop ebx
+ ret
+
;***********************************************************************
;
;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -1051,10 +1051,10 @@
mov eax, [esp+16]
mov ebx, [esp+20]
mov ecx, [esp+24]
- mov edx, [esp+28]
+ mov edx, [esp+28]
movdqa xmm7, [HSumSubDB1]
- lea esi, [ebx+ebx*2]
- lea edi, [edx+edx*2]
+ lea esi, [ebx+ebx*2]
+ lea edi, [edx+edx*2]
pxor xmm6, xmm6
SSE41_GetSatd8x4
lea eax, [eax+4*ebx]
@@ -1065,7 +1065,7 @@
pop esi
pop ebx
ret
-
+
;***********************************************************************
;
;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -1078,17 +1078,17 @@
push esi
push edi
push ebp
-%define pushsize 16
+%define pushsize 16
mov eax, [esp+pushsize+4]
mov ebx, [esp+pushsize+8]
mov ecx, [esp+pushsize+12]
- mov edx, [esp+pushsize+16]
+ mov edx, [esp+pushsize+16]
movdqa xmm7, [HSumSubDB1]
- lea esi, [ebx+ebx*2]
- lea edi, [edx+edx*2]
+ lea esi, [ebx+ebx*2]
+ lea edi, [edx+edx*2]
pxor xmm6, xmm6
mov ebp, 0
-loop_get_satd_8x16:
+loop_get_satd_8x16:
SSE41_GetSatd8x4
lea eax, [eax+4*ebx]
lea ecx, [ecx+4*edx]
@@ -1116,10 +1116,10 @@
mov eax, [esp+16]
mov ebx, [esp+20]
mov ecx, [esp+24]
- mov edx, [esp+28]
+ mov edx, [esp+28]
movdqa xmm7, [HSumSubDB1]
- lea esi, [ebx+ebx*2]
- lea edi, [edx+edx*2]
+ lea esi, [ebx+ebx*2]
+ lea edi, [edx+edx*2]
pxor xmm6, xmm6
SSE41_GetSatd8x4
lea eax, [eax+4*ebx]
@@ -1144,7 +1144,7 @@
;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
;
;***********************************************************************
-
+
WELS_EXTERN WelsSampleSatd16x16_sse41
align 16
WelsSampleSatd16x16_sse41:
@@ -1152,17 +1152,17 @@
push esi
push edi
push ebp
- %define pushsize 16
+ %define pushsize 16
mov eax, [esp+pushsize+4]
mov ebx, [esp+pushsize+8]
mov ecx, [esp+pushsize+12]
- mov edx, [esp+pushsize+16]
+ mov edx, [esp+pushsize+16]
movdqa xmm7, [HSumSubDB1]
- lea esi, [ebx+ebx*2]
- lea edi, [edx+edx*2]
+ lea esi, [ebx+ebx*2]
+ lea edi, [edx+edx*2]
pxor xmm6, xmm6
mov ebp, 0
-loop_get_satd_16x16_left:
+loop_get_satd_16x16_left:
SSE41_GetSatd8x4
lea eax, [eax+4*ebx]
lea ecx, [ecx+4*edx]
@@ -1206,8 +1206,8 @@
lea ecx, [ecx+2*edx]
movdqu xmm1, [ecx]
MOVDQ xmm2, [eax] ; [eax] must be 16-byte aligned
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
movdqu xmm1, [ecx+edx]
MOVDQ xmm2, [eax+ebx]
psadbw xmm1, xmm2
@@ -1218,7 +1218,7 @@
%macro SSE2_GetSad4x16 0
movdqu xmm0, [ecx]
MOVDQ xmm2, [eax]
- psadbw xmm0, xmm2
+ psadbw xmm0, xmm2
paddw xmm7, xmm0
movdqu xmm1, [ecx+edx]
MOVDQ xmm2, [eax+ebx]
@@ -1226,8 +1226,8 @@
paddw xmm7, xmm1
movdqu xmm1, [ecx+2*edx]
MOVDQ xmm2, [eax+2*ebx] ; [eax] must be 16-byte aligned
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
movdqu xmm1, [ecx+edi]
MOVDQ xmm2, [eax+esi]
psadbw xmm1, xmm2
@@ -1265,17 +1265,17 @@
WelsSampleSad16x16_sse2:
push ebx
push edi
- push esi
-
+ push esi
+
%define _STACK_SIZE 12
-
+
mov eax, [esp+_STACK_SIZE+4 ]
mov ebx, [esp+_STACK_SIZE+8 ]
lea esi, [3*ebx]
mov ecx, [esp+_STACK_SIZE+12]
- mov edx, [esp+_STACK_SIZE+16]
- lea edi, [3*edx]
-
+ mov edx, [esp+_STACK_SIZE+16]
+ lea edi, [3*edx]
+
pxor xmm7, xmm7
SSE2_GetSad4x16
lea eax, [eax+4*ebx]
@@ -1290,14 +1290,14 @@
movhlps xmm0, xmm7
paddw xmm0, xmm7
movd eax, xmm0
-
- %undef _STACK_SIZE
-
+
+ %undef _STACK_SIZE
+
pop esi
pop edi
pop ebx
ret
-
+
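
The WelsSampleSad* routines are psadbw-based sums of absolute differences over fixed block sizes; only the row tiling differs between variants. The scalar equivalent (generalized iWidth/iHeight are illustrative):

#include <stdint.h>
#include <stdlib.h>

static int32_t SampleSad_c(const uint8_t *pA, int32_t iStrideA,
                           const uint8_t *pB, int32_t iStrideB,
                           int32_t iWidth, int32_t iHeight) {
  int32_t sad = 0;
  for (int32_t y = 0; y < iHeight; y++) {
    for (int32_t x = 0; x < iWidth; x++)
      sad += abs(pA[x] - pB[x]);   /* what psadbw accumulates 16 at a time */
    pA += iStrideA;
    pB += iStrideB;
  }
  return sad;
}
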
;***********************************************************************
;
;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
@@ -1312,10 +1312,10 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
+ mov edx, [esp+20]
movdqu xmm0, [ecx]
MOVDQ xmm2, [eax]
- psadbw xmm0, xmm2
+ psadbw xmm0, xmm2
movdqu xmm1, [ecx+edx]
MOVDQ xmm2, [eax+ebx]
psadbw xmm1, xmm2
@@ -1339,19 +1339,19 @@
mov eax, [esp+8]
mov ebx, [esp+12]
mov ecx, [esp+16]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
-
+
SSE2_GetSad8x4
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- SSE2_GetSad8x4
+ SSE2_GetSad8x4
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
SSE2_GetSad8x4
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- SSE2_GetSad8x4
+ SSE2_GetSad8x4
movhlps xmm0, xmm6
paddw xmm0, xmm6
@@ -1375,15 +1375,15 @@
push edi
mov eax, [esp+12]
mov ebx, [esp+16]
-
+
pxor xmm7, xmm7
-
+
mov edi, ecx
and edi, 0x07
- sub ecx, edi
+ sub ecx, edi
mov edx, 8
sub edx, edi
-
+
shl edi, 3
shl edx, 3
movd xmm5, edi
@@ -1391,10 +1391,10 @@
mov edi, 8
add edi, ecx
mov edx, [esp+24]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -1402,17 +1402,17 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -1420,7 +1420,7 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
@@ -1427,10 +1427,10 @@
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -1438,17 +1438,17 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -1456,10 +1456,10 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
-
+
movhlps xmm0, xmm7
paddw xmm0, xmm7
movd eax, xmm0
@@ -1469,12 +1469,12 @@
push ebx
mov eax, [esp+8]
mov ebx, [esp+12]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
SSE2_GetSad8x4
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- SSE2_GetSad8x4
+ SSE2_GetSad8x4
movhlps xmm0, xmm6
paddw xmm0, xmm6
movd eax, xmm0
@@ -1485,7 +1485,7 @@
;***********************************************************************
;
-;Pixel_sad_wxh_sse2 END
+;Pixel_sad_wxh_sse2 END
;
;***********************************************************************
@@ -1492,7 +1492,7 @@
;***********************************************************************
;
-;Pixel_sad_4_wxh_sse2 BEGIN
+;Pixel_sad_4_wxh_sse2 BEGIN
;
;***********************************************************************
@@ -1525,20 +1525,20 @@
movdqu xmm3, [ecx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movdqa xmm1, [eax+ebx]
movdqu xmm3, [ecx+edx]
psadbw xmm3, xmm1
paddw xmm4, xmm3
-
+
movdqu xmm2, [ecx+edx-1]
psadbw xmm2, xmm0
paddw xmm6, xmm2
-
+
movdqu xmm3, [ecx+edx+1]
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
movdqa xmm2, [eax]
@@ -1599,30 +1599,30 @@
movdqu xmm3, [ecx]
psadbw xmm2, xmm3
paddw xmm5, xmm2
-
+
movdqu xmm2, [ecx-1]
psadbw xmm2, xmm0
paddw xmm6, xmm2
-
+
movdqu xmm3, [ecx+1]
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movdqu xmm3, [ecx+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
mov ecx, [esp+24]
movhlps xmm0, xmm4
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
movhlps xmm0, xmm5
- paddw xmm5, xmm0
+ paddw xmm5, xmm0
movhlps xmm0, xmm6
- paddw xmm6, xmm0
+ paddw xmm6, xmm0
movhlps xmm0, xmm7
paddw xmm7, xmm0
punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
+ punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [ecx],xmm4
pop ebx
@@ -1646,20 +1646,20 @@
movdqu xmm3, [edi]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movdqa xmm1, [eax+ebx]
movdqu xmm3, [edi+edx]
psadbw xmm3, xmm1
paddw xmm4, xmm3
-
+
movdqu xmm2, [edi+edx-1]
psadbw xmm2, xmm0
paddw xmm6, xmm2
-
+
movdqu xmm3, [edi+edx+1]
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movdqa xmm2, [eax]
@@ -1688,36 +1688,36 @@
movdqu xmm3, [edi]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movdqu xmm0, [edi-1]
psadbw xmm0, xmm1
paddw xmm6, xmm0
-
+
movdqu xmm3, [edi+1]
psadbw xmm3, xmm1
paddw xmm7, xmm3
-
+
movdqu xmm3, [edi+edx]
psadbw xmm1, xmm3
paddw xmm5, xmm1
-
+
mov edi, [esp+28]
movhlps xmm0, xmm4
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
movhlps xmm0, xmm5
- paddw xmm5, xmm0
+ paddw xmm5, xmm0
movhlps xmm0, xmm6
- paddw xmm6, xmm0
+ paddw xmm6, xmm0
movhlps xmm0, xmm7
paddw xmm7, xmm0
punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
+ punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [edi],xmm4
pop edi
pop ebx
ret
-
+
WELS_EXTERN WelsSampleSadFour8x16_sse2
WelsSampleSadFour8x16_sse2:
push ebx
@@ -1737,10 +1737,10 @@
movhps xmm3, [edi+edx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
@@ -1749,191 +1749,191 @@
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
mov edi, [esp+28]
movhlps xmm0, xmm4
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
movhlps xmm0, xmm5
- paddw xmm5, xmm0
+ paddw xmm5, xmm0
movhlps xmm0, xmm6
- paddw xmm6, xmm0
+ paddw xmm6, xmm0
movhlps xmm0, xmm7
paddw xmm7, xmm0
punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
+ punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [edi], xmm4
pop edi
pop ebx
ret
-
-
+
+
WELS_EXTERN WelsSampleSadFour8x8_sse2
WelsSampleSadFour8x8_sse2:
push ebx
@@ -1953,10 +1953,10 @@
movhps xmm3, [edi+edx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
@@ -1965,99 +1965,99 @@
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
psadbw xmm3, xmm0
paddw xmm4, xmm3
-
-
+
+
movq xmm1, [edi+edx-1]
movq xmm3, [edi+edx+1]
-
+
lea eax, [eax+2*ebx]
lea edi, [edi+2*edx]
movhps xmm1, [edi-1]
movhps xmm3, [edi+1]
-
+
psadbw xmm1, xmm0
paddw xmm6, xmm1
psadbw xmm3, xmm0
paddw xmm7, xmm3
-
+
movq xmm3, [edi]
movhps xmm3, [edi+edx]
psadbw xmm0, xmm3
paddw xmm5, xmm0
-
+
mov edi, [esp+28]
movhlps xmm0, xmm4
- paddw xmm4, xmm0
+ paddw xmm4, xmm0
movhlps xmm0, xmm5
- paddw xmm5, xmm0
+ paddw xmm5, xmm0
movhlps xmm0, xmm6
- paddw xmm6, xmm0
+ paddw xmm6, xmm0
movhlps xmm0, xmm7
paddw xmm7, xmm0
punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
+ punpckldq xmm6, xmm7
punpcklqdq xmm4, xmm6
movdqa [edi], xmm4
pop edi
pop ebx
ret
-
+
WELS_EXTERN WelsSampleSadFour4x4_sse2
WelsSampleSadFour4x4_sse2:
push ebx
@@ -2080,23 +2080,23 @@
punpckldq xmm1, xmm2
movd xmm2, [edi+edx-1]
movd xmm3, [edi+edx+1]
-
+
lea edi, [edi+2*edx]
-
+
movd xmm4, [edi]
movd xmm5, [edi-1]
punpckldq xmm2, xmm5
movd xmm5, [edi+1]
punpckldq xmm3, xmm5
-
+
movd xmm5, [edi+edx]
punpckldq xmm4, xmm5
-
+
punpcklqdq xmm1, xmm4 ;-L
-
+
movd xmm5, [edi+edx-1]
movd xmm6, [edi+edx+1]
-
+
lea edi, [edi+2*edx]
movd xmm7, [edi-1]
punpckldq xmm5, xmm7
@@ -2107,12 +2107,12 @@
movd xmm6, [edi]
movd xmm7, [edi+edx]
punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6 ;+L
+ punpcklqdq xmm4, xmm6 ;+L
psadbw xmm1, xmm0
psadbw xmm2, xmm0
psadbw xmm3, xmm0
psadbw xmm4, xmm0
-
+
movhlps xmm0, xmm1
paddw xmm1, xmm0
movhlps xmm0, xmm2
@@ -2123,13 +2123,13 @@
paddw xmm4, xmm0
mov edi, [esp+28]
punpckldq xmm1, xmm4
- punpckldq xmm2, xmm3
+ punpckldq xmm2, xmm3
punpcklqdq xmm1, xmm2
movdqa [edi], xmm1
pop edi
pop ebx
ret
-
+
;***********************************************************************
;
;Pixel_sad_4_wxh_sse2 END
@@ -2150,40 +2150,40 @@
%define pix2address esp+pushsize+12
%define pix2stride esp+pushsize+16
- mov eax, [pix1address]
- mov ebx, [pix1stride ]
- mov ecx, [pix2address]
- mov edx, [pix2stride ]
+ mov eax, [pix1address]
+ mov ebx, [pix1stride ]
+ mov ecx, [pix2address]
+ mov edx, [pix2stride ]
movd mm0, [eax]
movd mm1, [eax+ebx]
punpckldq mm0, mm1
-
+
movd mm3, [ecx]
movd mm4, [ecx+edx]
punpckldq mm3, mm4
psadbw mm0, mm3
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
-
+
movd mm1, [eax]
movd mm2, [eax+ebx]
punpckldq mm1, mm2
-
+
movd mm3, [ecx]
movd mm4, [ecx+edx]
punpckldq mm3, mm4
psadbw mm1, mm3
paddw mm0, mm1
-
+
movd eax, mm0
WELSEMMS
pop ebx
-%undef pushsize
-%undef pix1address
-%undef pix1stride
-%undef pix2address
-%undef pix2stride
+%undef pushsize
+%undef pix1address
+%undef pix1stride
+%undef pix2address
+%undef pix2stride
ret
\ No newline at end of file
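The hunk above closes the MMX 4x4 SAD kernel: movd plus punpckldq packs two 4-byte rows into one 8-byte register, so each psadbw covers two rows at once. As a plain-C reference (a sketch, not the project's official C fallback):

    #include <stdint.h>
    #include <stdlib.h>

    /* 4x4 sum of absolute differences: what the movd/punpckldq/psadbw
     * sequence computes two rows at a time. */
    static int32_t Sad4x4(const uint8_t *p1, int s1,
                          const uint8_t *p2, int s2) {
        int32_t sad = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                sad += abs(p1[y * s1 + x] - p2[y * s2 + x]);
        return sad;
    }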
--- a/codec/encoder/core/asm/score.asm
+++ b/codec/encoder/core/asm/score.asm
@@ -45,7 +45,7 @@
bits 32
;***********************************************************************
-; Macros
+; Macros
;***********************************************************************
;***********************************************************************
@@ -59,7 +59,7 @@
sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
align 16
sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
align 16
sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
align 16
@@ -139,7 +139,7 @@
db 4, 8, 5, 8, 8,12, 1, 4, 4, 8
db 4, 7, 7,11, 4, 8, 7,11, 8,11
db 11,15, 1, 4, 3, 7, 4, 7, 7,11
- db 3, 7, 6,10, 7,10,10,14, 4, 7
+ db 3, 7, 6,10, 7,10,10,14, 4, 7
db 7,11, 7,10,10,14, 7,11,10,14
db 11,14,14,18, 0, 4, 3, 7, 3, 6
db 6,10, 3, 7, 6,10, 7,10,10,14
@@ -191,7 +191,7 @@
movdqa [eax], xmm0
movdqa [eax+16], xmm1
ret
-
+
;***********************************************************************
;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
;***********************************************************************
@@ -206,7 +206,7 @@
pinsrw xmm0, eax, 7 ; xmm0[7] = [8]
pinsrw xmm1, ecx, 0 ; xmm1[0] = [7]
pshufb xmm1, [pb_scanacdc_maskb]
- pshufb xmm0, [pb_scanacdc_maska]
+ pshufb xmm0, [pb_scanacdc_maska]
mov eax, [esp+4]
movdqa [eax], xmm0
@@ -224,7 +224,7 @@
movdqa xmm2, xmm0
punpcklqdq xmm0, xmm1
punpckhqdq xmm2, xmm1
-
+
movdqa xmm3, xmm0
punpckldq xmm0, xmm2
punpckhdq xmm3, xmm2
@@ -236,10 +236,10 @@
pextrw edx, xmm3, 0
pinsrw xmm3, eax, 0
pinsrw xmm0, edx, 3
-
+
pshufhw xmm1, xmm0, 0x93
pshuflw xmm2, xmm3, 0x39
-
+
movdqa xmm3, xmm2
psrldq xmm1, 2
pslldq xmm3, 14
@@ -255,13 +255,13 @@
;int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;***********************************************************************
ALIGN 16
-WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
+WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
WelsCalculateSingleCtr4x4_sse2:
push ebx
mov eax, [esp+8]
movdqa xmm0, [eax]
movdqa xmm1, [eax+16]
-
+
packsswb xmm0, xmm1
pxor xmm3, xmm3
@@ -317,7 +317,7 @@
and edx, 0xff
shr ecx, 8
; and ecx, 0xff ; not needed: the high 16 bits are already 0
- xor eax, eax
+ xor eax, eax
add al, [nozero_count_table+ecx]
add al, [nozero_count_table+edx]
ret
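The epilogue above indexes nozero_count_table with the two halves of a 16-bit mask. Assuming the table is a per-byte popcount (a guess consistent with the code, not confirmed by this diff), the scalar equivalent is:

    #include <stdint.h>

    /* Assumption: popcount_table[b] plays the role of nozero_count_table,
     * holding the number of set bits in byte b, so two lookups count the
     * nonzero-coefficient bits of a 16-bit mask. */
    static uint8_t popcount_table[256];

    static void InitPopcountTable(void) {
        for (int i = 0; i < 256; i++) {
            uint8_t n = 0;
            for (int b = i; b; b >>= 1)
                n = (uint8_t)(n + (b & 1));
            popcount_table[i] = n;
        }
    }

    static int CountNonZero(uint16_t mask) {
        return popcount_table[mask & 0xff] + popcount_table[mask >> 8];
    }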
--- a/codec/encoder/core/asm/vaa.asm
+++ b/codec/encoder/core/asm/vaa.asm
@@ -38,7 +38,7 @@
;* 04/14/2010 Created
;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
-;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
+;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;*
;*************************************************************************/
%include "asm_inc.asm"
@@ -167,7 +167,7 @@
mov ebp, esp
and ebp, 0fh
sub esp, ebp
- sub esp, 32
+ sub esp, 32
%define PUSH_SIZE 52 ; 20 + 32
mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
@@ -179,31 +179,31 @@
add edx, ecx ; iLineSize x 3 [edx]
mov eax, ebx
sal eax, $1 ; iLineSize x 4 [eax]
-
+
pxor xmm7, xmm7
-
+
; loops
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0
+ movq [esp], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+8], xmm0
+ movq [esp+8], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0
+ movq [esp+16], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [esp+24], xmm0
-
+
movdqa xmm0, [esp] ; block 0~7
movdqa xmm1, [esp+16] ; block 8~15
movdqa xmm2, xmm0
paddw xmm0, xmm1
SUM_WORD_8x2_SSE2 xmm0, xmm3
-
+
pmullw xmm1, xmm1
pmullw xmm2, xmm2
movdqa xmm3, xmm1
@@ -219,7 +219,7 @@
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
-
+
movd ebx, xmm0
and ebx, 0ffffh ; effective low word truncated
mov ecx, ebx
@@ -227,7 +227,7 @@
sar ebx, $4
movd eax, xmm1
sub eax, ebx
-
+
%undef PUSH_SIZE
add esp, 32
add esp, ebp
@@ -253,7 +253,7 @@
mov ebp, esp
and ebp, 0fh
sub esp, ebp
- sub esp, 32
+ sub esp, 32
%define PUSH_SIZE 52 ; 20 + 32
mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
@@ -265,25 +265,25 @@
add edx, ecx ; iLineSize x 3 [edx]
mov eax, ebx
sal eax, $1 ; iLineSize x 4 [eax]
-
+
pxor xmm7, xmm7
-
+
; loops
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0
+ movq [esp], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
- movq [esp+8], xmm1
+ movq [esp+8], xmm1
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0
+ movq [esp+16], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [esp+24], xmm1
-
+
movdqa xmm0, [esp] ; block 0~7
movdqa xmm1, [esp+16] ; block 8~15
movdqa xmm2, xmm0
@@ -305,7 +305,7 @@
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
-
+
movd ebx, xmm0
and ebx, 0ffffh ; effective low word truncated
mov ecx, ebx
@@ -313,7 +313,7 @@
sar ebx, $4
movd eax, xmm1
sub eax, ebx
-
+
%undef PUSH_SIZE
add esp, 32
add esp, ebp
@@ -323,7 +323,7 @@
pop edx
pop ebx
ret
-
+
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
;***********************************************************************
; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
@@ -331,11 +331,11 @@
ALIGN 16
MdInterAnalysisVaaInfo_sse41:
mov eax, [esp+4]
- movdqa xmm0, [eax] ; load 4 sad_8x8
+ movdqa xmm0, [eax] ; load 4 sad_8x8
pshufd xmm1, xmm0, 01Bh
paddd xmm1, xmm0
pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
+ paddd xmm1, xmm2
psrad xmm1, 02h ; iAverageSad
movdqa xmm2, xmm1
psrad xmm2, 06h
@@ -342,7 +342,7 @@
movdqa xmm3, xmm0 ; iSadBlock
psrad xmm3, 06h
psubd xmm3, xmm2
- pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
+ pmulld xmm3, xmm3 ; pmulld is an SSE4.1 instruction
pshufd xmm4, xmm3, 01Bh
paddd xmm4, xmm3
pshufd xmm3, xmm4, 0B1h
@@ -354,7 +354,7 @@
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
movmskps eax, xmm0
ret
-.threshold_exit:
+.threshold_exit:
mov eax, 15
ret
@@ -365,11 +365,11 @@
ALIGN 16
MdInterAnalysisVaaInfo_sse2:
mov eax, [esp+4]
- movdqa xmm0, [eax] ; load 4 sad_8x8
+ movdqa xmm0, [eax] ; load 4 sad_8x8
pshufd xmm1, xmm0, 01Bh
paddd xmm1, xmm0
pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
+ paddd xmm1, xmm2
psrad xmm1, 02h ; iAverageSad
movdqa xmm2, xmm1
psrad xmm2, 06h
@@ -376,9 +376,9 @@
movdqa xmm3, xmm0 ; iSadBlock
psrad xmm3, 06h
psubd xmm3, xmm2
-
+
; to replace pmulld functionality as below
- movdqa xmm2, xmm3
+ movdqa xmm2, xmm3
pmuludq xmm2, xmm3
pshufd xmm4, xmm3, 0B1h
pmuludq xmm4, xmm4
@@ -385,8 +385,8 @@
movdqa xmm5, xmm2
punpckldq xmm5, xmm4
punpckhdq xmm2, xmm4
- punpcklqdq xmm5, xmm2
-
+ punpcklqdq xmm5, xmm2
+
pshufd xmm4, xmm5, 01Bh
paddd xmm4, xmm5
pshufd xmm5, xmm4, 0B1h
@@ -398,6 +398,6 @@
pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
movmskps eax, xmm0
ret
-.threshold_exit:
+.threshold_exit:
mov eax, 15
ret
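Both MdInterAnalysisVaaInfo variants above implement the same decision: average the four 8x8 SADs, measure how much they spread, and either bail out through .threshold_exit with 15 or return the movmskps mask of blocks whose SAD exceeds the average. A hedged scalar model; the >>6 scaling is read off the asm, while the threshold constant below is invented purely for illustration:

    #include <stdint.h>

    /* Sketch only. iThreshold is a placeholder; the real cutoff is not
     * visible in this diff. */
    static uint8_t MdInterAnalysisVaaInfo_c(const int32_t pSad8x8[4]) {
        const int32_t iThreshold = 20; /* assumed value */
        int32_t iAvg = (pSad8x8[0] + pSad8x8[1] + pSad8x8[2] + pSad8x8[3]) >> 2;
        int32_t iVar = 0;
        for (int i = 0; i < 4; i++) {
            int32_t d = (pSad8x8[i] >> 6) - (iAvg >> 6);
            iVar += d * d;  /* the pmulld (or its SSE2 emulation) above */
        }
        if (iVar <= iThreshold)
            return 15;      /* .threshold_exit path */
        uint8_t mask = 0;   /* movmskps of (iSadBlock > iAverageSad) */
        for (int i = 0; i < 4; i++)
            if (pSad8x8[i] > iAvg)
                mask = (uint8_t)(mask | (1u << i));
        return mask;
    }

The SSE2 variant's pmuludq/punpckldq dance is exactly the d * d step, rebuilt from two 32x32 unsigned multiplies because pmulld only exists from SSE4.1 on.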
--- a/codec/encoder/plus/res/welsenc.rc
+++ b/codec/encoder/plus/res/welsenc.rc
@@ -27,18 +27,18 @@
// TEXTINCLUDE
//
-1 TEXTINCLUDE
+1 TEXTINCLUDE
BEGIN
"resource.h\0"
END
-2 TEXTINCLUDE
+2 TEXTINCLUDE
BEGIN
"#include ""afxres.h""\r\n"
"\0"
END
-3 TEXTINCLUDE
+3 TEXTINCLUDE
BEGIN
"\r\n"
"\0"
--- a/processing/build/linux/makefile
+++ b/processing/build/linux/makefile
@@ -3,7 +3,7 @@
OUTDIR = ../../../bin/linux
BINDIR = ../../bin
-OBJDIR = ../../obj
+OBJDIR = ../../obj
SRCDIRS = ../../src/asm \
../../src/common \
../../src/adaptivequantization \
@@ -12,7 +12,7 @@
../../src/downsample \
../../src/scenechangedetection \
../../src/vaacalc \
- ../../src/complexityanalysis
+ ../../src/complexityanalysis
SRCDIRS += ../../src/imagerotate
@@ -28,7 +28,7 @@
endif
ASMFLAGS = -f elf -DNOPREFIX -I ../../src/asm/
LDFLAGS = -lstdc++ -ldl
-
+
SRCEXTS = .cpp
ifeq ($(NASM), 1)
SRCEXTS += .asm
@@ -54,11 +54,11 @@
.SUFFIXES:
all: $(TARGETLIB)
-
+
%.d:%.cpp
@echo -n $(dir $<) > $@
@$(DEPEND_cpp.d) $< >> $@
-
+
%.d:%.asm
@echo -n $(dir $<) > $@
@$(DEPEND_asm.d) $< >> $@
@@ -67,9 +67,9 @@
%.o:%.cpp
$(COMPILE.cpp) $< -o $@
-
+
%.o:%.asm
- $(COMPILE.asm) $< -o $@
+ $(COMPILE.asm) $< -o $@
tags: $(HEADERS) $(SOURCES)
etags $(HEADERS) $(SOURCES)
--- a/processing/src/asm/asm_inc.asm
+++ b/processing/src/asm/asm_inc.asm
@@ -43,7 +43,7 @@
; Options, for DEBUG
;***********************************************************************
-%if 1
+%if 1
%define MOVDQ movdqa
%else
%define MOVDQ movdqu
@@ -58,7 +58,7 @@
BITS 32
;***********************************************************************
-; Macros
+; Macros
;***********************************************************************
%macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
pxor %2, %2
psubw %2, %1
pmaxsw %1, %2
-%endmacro
+%endmacro
%macro MMX_XSwap 4
movq %4, %2
@@ -105,7 +105,7 @@
SSE2_XSawp qdq, %5, %2, %3
%endmacro
-;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
+;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
%macro SSE2_TransTwo4x4W 5
SSE2_XSawp wd, %1, %2, %5
SSE2_XSawp wd, %3, %4, %2
@@ -125,26 +125,26 @@
movdqa %6, %9
movdqa %9, %4
SSE2_XSawp bw, %7, %6, %4
-
- SSE2_XSawp wd, %1, %3, %6
+
+ SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2
movdqa %7, %9
- movdqa %9, %3
+ movdqa %9, %3
SSE2_XSawp wd, %7, %4, %3
-
- SSE2_XSawp dq, %1, %5, %4
+
+ SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2
movdqa %7, %9
- movdqa %9, %5
+ movdqa %9, %5
SSE2_XSawp dq, %7, %3, %5
-
+
SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %9
- movdqa %9, %1
+ movdqa %9, %1
SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %9
%endmacro
@@ -170,9 +170,9 @@
%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
mov %3h, %3l
movd %1, e%3x ; i.e., %1 = eax (=b0)
- pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
- pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
+ pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
+ pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
;copy a dw into a xmm for 8 times
%macro SSE2_Copy8Times 2
--- a/processing/src/asm/cpuid.asm
+++ b/processing/src/asm/cpuid.asm
@@ -84,12 +84,12 @@
; void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
;****************************************************************************************************
WelsCPUId:
- push ebx
+ push ebx
push edi
-
+
mov eax, [esp+12] ; operating index
cpuid ; cpuid
-
+
; processing various information return
mov edi, [esp+16]
mov [edi], eax
@@ -100,10 +100,10 @@
mov edi, [esp+28]
mov [edi], edx
- pop edi
+ pop edi
pop ebx
ret
-
+
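WelsCPUId above is a thin wrapper: execute CPUID for the requested leaf and scatter EAX/EBX/ECX/EDX through the four out-pointers. On a GCC/Clang toolchain the same thing is available without hand-written asm (a sketch; MSVC would use __cpuidex from <intrin.h> instead):

    #include <stdint.h>
    #include <cpuid.h>  /* GCC/Clang intrinsic header, assumed toolchain */

    static void WelsCPUId_c(int32_t uiIndex, int32_t *pA, int32_t *pB,
                            int32_t *pC, int32_t *pD) {
        unsigned int a = 0, b = 0, c = 0, d = 0;
        /* __get_cpuid returns 0 if the leaf is unsupported; outputs stay 0 */
        __get_cpuid((unsigned int)uiIndex, &a, &b, &c, &d);
        *pA = (int32_t)a; *pB = (int32_t)b;
        *pC = (int32_t)c; *pD = (int32_t)d;
    }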
WELS_EXTERN WelsCPUSupportAVX
; need call after cpuid=1 and eax, ecx flag got then
ALIGN 16
@@ -139,7 +139,7 @@
WelsCPUSupportFMA:
mov eax, [esp+4]
mov ecx, [esp+8]
-
+
; refer to the FMA detection procedure described in the Intel AVX manual
and ecx, 018001000H
cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
mov eax, 1
ret
fma_not_supported:
- mov eax, 0
+ mov eax, 0
ret
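The constant 018001000H above packs three CPUID.1:ECX feature bits: 12 (FMA), 27 (OSXSAVE) and 28 (AVX), and the routine requires all three. A sketch of just the mask test (the full check may also validate the XSAVE-enabled state, which this hunk does not show):

    #include <stdint.h>

    static int WelsCPUSupportFMA_c(uint32_t ecx) {
        /* (1<<12) | (1<<27) | (1<<28) == 0x18001000 */
        const uint32_t kMask = (1u << 12) | (1u << 27) | (1u << 28);
        return (ecx & kMask) == kMask;
    }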
WELS_EXTERN WelsEmms
--- a/processing/src/asm/denoisefilter.asm
+++ b/processing/src/asm/denoisefilter.asm
@@ -55,25 +55,25 @@
; Code
;***********************************************************************
SECTION .text
-
+
%macro WEIGHT_LINE 9
movq %2, %9
punpcklbw %2, %7
movdqa %8, %2
-
+
movdqa %1, %6
psubusb %1, %8
psubusb %8, %6
por %8, %1 ; ABS(curPixel - centerPixel);
-
+
movdqa %1, %3
psubusb %1, %8
pmullw %1, %1
psrlw %1, 5
- pmullw %2, %1
+ pmullw %2, %1
paddusw %4, %1
- paddusw %5, %2
+ paddusw %5, %2
%endmacro
%macro WEIGHT_LINE1_UV 4
@@ -91,12 +91,12 @@
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2
-
+
movdqa %2, %1
psrldq %2, 3
punpcklbw %2, %4
paddw %3, %2
-
+
movdqa %2, %1
psrldq %2, 4
punpcklbw %2, %4
@@ -119,13 +119,13 @@
punpcklbw %2, %4
psllw %2, 2
paddw %3, %2
-
+
movdqa %2, %1
psrldq %2, 3
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2
-
+
movdqa %2, %1
psrldq %2, 4
punpcklbw %2, %4
@@ -149,13 +149,13 @@
punpcklbw %2, %4
pmullw %2, [sse2_20]
paddw %3, %2
-
+
movdqa %2, %1
psrldq %2, 3
punpcklbw %2, %4
psllw %2, 2
paddw %3, %2
-
+
movdqa %2, %1
psrldq %2, 4
punpcklbw %2, %4
@@ -177,7 +177,7 @@
%define stride esp + pushsize + 8
BilateralLumaFilter8_sse2:
push ebx
-
+
pxor xmm7, xmm7
mov eax, [pixel]
mov ebx, eax
@@ -186,23 +186,23 @@
movdqa xmm3, [sse2_32]
pxor xmm4, xmm4 ; nTotWeight
pxor xmm5, xmm5 ; nSum
-
+
dec eax
mov ecx, [stride]
-
+
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 4
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 5
-
+
sub eax, ecx
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 1
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 2
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 3
-
+
lea eax, [eax + ecx * 2]
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 6
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 7
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 8
-
+
pcmpeqw xmm0, xmm0
psrlw xmm0, 15
psllw xmm0, 8
@@ -211,10 +211,10 @@
paddusw xmm5, xmm0
psrlw xmm5, 8
packuswb xmm5, xmm5
- movq [ebx], xmm5
-
+ movq [ebx], xmm5
+
pop ebx
- ret
+ ret
WELS_EXTERN WaverageChromaFilter8_sse2
;***********************************************************************
@@ -231,33 +231,33 @@
WaverageChromaFilter8_sse2:
mov edx, [esp + 4] ; pixels
mov ecx, [esp + 8] ; stride
-
+
mov eax, ecx
add eax, eax
sub edx, eax ; pixels - 2 * stride
sub edx, 2
-
- pxor xmm0, xmm0
+
+ pxor xmm0, xmm0
pxor xmm3, xmm3
-
+
movdqu xmm1, [edx]
WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
-
+
movdqu xmm1, [edx + ecx]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
-
- add edx, eax
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+
+ add edx, eax
movdqu xmm1, [edx]
WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
-
+
movdqu xmm1, [edx + ecx]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
-
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+
movdqu xmm1, [edx + ecx * 2]
- WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
-
+ WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
+
psrlw xmm3, 6
packuswb xmm3, xmm3
- movq [edx + 2], xmm3
+ movq [edx + 2], xmm3
- ret
\ No newline at end of file
+ ret
\ No newline at end of file
--- a/processing/src/asm/downsample_bilinear.asm
+++ b/processing/src/asm/downsample_bilinear.asm
@@ -92,11 +92,11 @@
mov edi, [esp+24] ; pDst
mov edx, [esp+28] ; iDstStride
mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
+ mov ecx, [esp+36] ; iSrcStride
mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
+ sar ebp, $1 ; iSrcHeight >> 1
+
.yloops:
mov eax, [esp+40] ; iSrcWidth
sar eax, $1 ; iSrcWidth >> 1
@@ -112,7 +112,7 @@
;=> target:
;: H G F E D C B A, P O N M L K J I
;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
movq mm0, [esi] ; 1st pSrc line
movq mm1, [esi+8] ; 1st pSrc line + 8
movq mm2, [esi+ecx] ; 2nd pSrc line
@@ -140,7 +140,7 @@
pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
+ movq mm0, mm4 ;
punpckldq mm0, mm5 ; H G F E D C B A
punpckhdq mm4, mm5 ; h g f e d c b a
@@ -152,7 +152,7 @@
pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1; result kept pending until the other horizontal half is done, then written to memory once
-
+
; 2nd part horizontal loop: x16 bytes
; mem hi<- ->lo
;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
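The mm4..mm7 shuffling above implements a dyadic (2:1) downsample: horizontal byte pairs are averaged with pavgb, then the two resulting rows are averaged again. A scalar model of one output pixel; note the chained rounding averages only approximate the exact (a+b+c+d+2)>>2:

    #include <stdint.h>

    /* Scalar sketch of the pavgb-based half downsample in this file. */
    static void DyadicDownsample_c(uint8_t *dst, int dstStride,
                                   const uint8_t *src, int srcStride,
                                   int srcWidth, int srcHeight) {
        for (int y = 0; y < srcHeight / 2; y++) {
            const uint8_t *s0 = src + 2 * y * srcStride;
            const uint8_t *s1 = s0 + srcStride;
            for (int x = 0; x < srcWidth / 2; x++) {
                int a = s0[2 * x], b = s0[2 * x + 1];
                int c = s1[2 * x], d = s1[2 * x + 1];
                /* pavgb chain: avg(avg(a,b), avg(c,d)), each with rounding */
                dst[y * dstStride + x] = (uint8_t)
                    ((((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1);
            }
        }
    }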
@@ -245,11 +245,11 @@
mov edi, [esp+24] ; pDst
mov edx, [esp+28] ; iDstStride
mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
+ mov ecx, [esp+36] ; iSrcStride
mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
+ sar ebp, $1 ; iSrcHeight >> 1
+
.yloops:
mov eax, [esp+40] ; iSrcWidth
sar eax, $1 ; iSrcWidth >> 1
@@ -265,7 +265,7 @@
;=> target:
;: H G F E D C B A, P O N M L K J I
;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
movq mm0, [esi] ; 1st pSrc line
movq mm1, [esi+8] ; 1st pSrc line + 8
movq mm2, [esi+ecx] ; 2nd pSrc line
@@ -293,7 +293,7 @@
pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
+ movq mm0, mm4 ;
punpckldq mm0, mm5 ; H G F E D C B A
punpckhdq mm4, mm5 ; h g f e d c b a
@@ -306,7 +306,7 @@
pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1; result kept pending until the other horizontal half is done, then written to memory once
- movq [edi ], mm0
+ movq [edi ], mm0
; next SMB
lea esi, [esi+16]
@@ -349,11 +349,11 @@
mov edi, [esp+24] ; pDst
mov edx, [esp+28] ; iDstStride
mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
+ mov ecx, [esp+36] ; iSrcStride
mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
+ sar ebp, $1 ; iSrcHeight >> 1
+
.yloops:
mov eax, [esp+40] ; iSrcWidth
sar eax, $1 ; iSrcWidth >> 1
@@ -369,9 +369,9 @@
;=> target:
;: H G F E D C B A
;: h g f e d c b a
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+ecx] ; 2nd pSrc line
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+ecx] ; 2nd pSrc line
; to handle mm0, mm1, mm2, mm3
pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B
@@ -382,19 +382,19 @@
pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B
pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B
punpcklbw mm4, mm5 ; h g H G f e F E
- pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+ pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5
; to handle mm2, mm4
- movq mm0, mm2 ;
+ movq mm0, mm2 ;
punpckldq mm0, mm4 ; H G F E D C B A
punpckhdq mm2, mm4 ; h g f e d c b a
; avg within MB horizon width (16 x 2 lines)
pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
- pshufw mm1, mm0, 04eh ; 01001110 B
+ pshufw mm1, mm0, 04eh ; 01001110 B
pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1; result kept pending until the other horizontal half is done, then written to memory once
- movd [edi], mm0
+ movd [edi], mm0
; next unit
lea esi, [esi+8]
@@ -440,11 +440,11 @@
mov edi, [esp+24] ; pDst
mov edx, [esp+28] ; iDstStride
mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
+ mov ecx, [esp+36] ; iSrcStride
mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
+ sar ebp, $1 ; iSrcHeight >> 1
+
movdqa xmm7, [shufb_mask_low] ; mask low
movdqa xmm6, [shufb_mask_high] ; mask high
@@ -467,13 +467,13 @@
;: p o n m l k j i h g f e d c b a
;: P .. A
;: p .. a
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
movdqa xmm0, [esi] ; 1st_src_line
movdqa xmm1, [esi+16] ; 1st_src_line + 16
movdqa xmm2, [esi+ecx] ; 2nd_src_line
- movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
-
+ movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+
; packing & avg
movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
@@ -487,7 +487,7 @@
pshufb xmm1, xmm7
pshufb xmm5, xmm6
; psubb xmm5, xmm1
-; psrlw xmm5, 8
+; psrlw xmm5, 8
pavgb xmm1, xmm5
movdqa xmm4, xmm2
@@ -494,7 +494,7 @@
pshufb xmm2, xmm7
pshufb xmm4, xmm6
; psubb xmm4, xmm2
-; psrlw xmm4, 8
+; psrlw xmm4, 8
pavgb xmm2, xmm4
movdqa xmm5, xmm3
@@ -501,13 +501,13 @@
pshufb xmm3, xmm7
pshufb xmm5, xmm6
; psubb xmm5, xmm3
-; psrlw xmm5, 8
+; psrlw xmm5, 8
pavgb xmm3, xmm5
-
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
+ packuswb xmm0, xmm1
+ packuswb xmm2, xmm3
+ pavgb xmm0, xmm2
+
; write pDst
movdqa [edi], xmm0
@@ -526,7 +526,7 @@
dec ebp
jg near .yloops
-
+
pop ebp
pop edi
pop esi
@@ -551,11 +551,11 @@
mov edi, [esp+24] ; pDst
mov edx, [esp+28] ; iDstStride
mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
+ mov ecx, [esp+36] ; iSrcStride
mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
+
+ sar ebp, $1 ; iSrcHeight >> 1
+ movdqa xmm7, [shufb_mask_low] ; mask low
movdqa xmm6, [shufb_mask_high] ; mask high
.yloops:
@@ -574,10 +574,10 @@
;: H G F E D C B A, P O N M L K J I
;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [esi] ; 1st_src_line
- movdqa xmm1, [esi+ecx] ; 2nd_src_line
-
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [esi] ; 1st_src_line
+ movdqa xmm1, [esi+ecx] ; 2nd_src_line
+
; packing & avg
movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
@@ -584,7 +584,7 @@
pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
; another implementation for xmm2 high bits
; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
pavgb xmm0, xmm2
movdqa xmm3, xmm1
@@ -591,14 +591,14 @@
pshufb xmm1, xmm7
pshufb xmm3, xmm6
; psubb xmm3, xmm1
-; psrlw xmm3, 8
+; psrlw xmm3, 8
pavgb xmm1, xmm3
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
+ pavgb xmm0, xmm1
+ packuswb xmm0, xmm1
; write pDst
- movq [edi], xmm0
+ movq [edi], xmm0
; next SMB
lea esi, [esi+16]
@@ -615,7 +615,7 @@
dec ebp
jg near .yloops
-
+
pop ebp
pop edi
pop esi
@@ -641,12 +641,12 @@
mov edi, [esp+24] ; pDst
mov edx, [esp+28] ; iDstStride
mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
+ mov ecx, [esp+36] ; iSrcStride
mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
+ sar ebp, $1 ; iSrcHeight >> 1
+
+ movdqa xmm7, [shufb_mask_low] ; mask low
movdqa xmm6, [shufb_mask_high] ; mask high
.yloops:
@@ -668,13 +668,13 @@
;: p o n m l k j i h g f e d c b a
;: P .. A
;: p .. a
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
movntdqa xmm0, [esi] ; 1st_src_line
movntdqa xmm1, [esi+16] ; 1st_src_line + 16
movntdqa xmm2, [esi+ecx] ; 2nd_src_line
- movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
-
+ movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+
; packing & avg
movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
@@ -703,11 +703,11 @@
; psubb xmm5, xmm3
; psrlw xmm5, 8
pavgb xmm3, xmm5
-
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
+ packuswb xmm0, xmm1
+ packuswb xmm2, xmm3
+ pavgb xmm0, xmm2
+
; write pDst
movdqa [edi], xmm0
@@ -726,7 +726,7 @@
dec ebp
jg near .yloops
-
+
pop ebp
pop edi
pop esi
@@ -751,10 +751,10 @@
mov edi, [esp+24] ; pDst
mov edx, [esp+28] ; iDstStride
mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
+ mov ecx, [esp+36] ; iSrcStride
mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
+
+ sar ebp, $1 ; iSrcHeight >> 1
movdqa xmm7, [shufb_mask_low] ; mask low
movdqa xmm6, [shufb_mask_high] ; mask high
@@ -774,10 +774,10 @@
;: H G F E D C B A, P O N M L K J I
;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [esi] ; 1st_src_line
- movntdqa xmm1, [esi+ecx] ; 2nd_src_line
-
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [esi] ; 1st_src_line
+ movntdqa xmm1, [esi+ecx] ; 2nd_src_line
+
; packing & avg
movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
@@ -793,11 +793,11 @@
; psrlw xmm3, 8
pavgb xmm1, xmm3
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
+ pavgb xmm0, xmm1
+ packuswb xmm0, xmm1
; write pDst
- movq [edi], xmm0
+ movq [edi], xmm0
; next SMB
lea esi, [esi+16]
@@ -814,7 +814,7 @@
dec ebp
jg near .yloops
-
+
pop ebp
pop edi
pop esi
@@ -858,7 +858,7 @@
%define xInverse esp + 20
%define dstStep esp + 24
sub esp, localsize
-
+
pxor xmm0, xmm0
mov edx, 32767
mov eax, [uiScaleX]
@@ -871,7 +871,7 @@
psllq xmm1, 32
por xmm1, xmm2 ; 0 0 uinc -uinc (dword)
pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc
-
+
mov eax, [uiScaleY]
and eax, 32767
mov ebx, eax
@@ -882,15 +882,15 @@
psllq xmm6, 32
por xmm6, xmm2 ; 0 0 vinc -vinc (dword)
pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc
-
+
mov edx, 40003fffh
movd xmm5, edx
punpcklwd xmm5, xmm0 ; 16384 16383
pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383
-
+
DOWNSAMPLE:
-
+
mov eax, [dwDstHeight]
mov edi, [pDstData]
mov edx, [dwDstStride]
@@ -901,10 +901,10 @@
mov [tmpHeight], eax
mov eax, 16384
mov [yInverse], eax
-
+
pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383
-
-HEIGHT:
+
+HEIGHT:
mov eax, [yInverse]
mov esi, [pSrcData]
shr eax, 15
@@ -912,18 +912,18 @@
add esi, eax ; get current row address
mov ebp, esi
add ebp, [dwSrcStride]
-
+
mov eax, 16384
mov [xInverse], eax
mov ecx, [dwDstWidth]
dec ecx
-
+
movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383
-
+
WIDTH:
mov eax, [xInverse]
shr eax, 15
-
+
movd xmm1, [esi+eax] ; xxxxxxba
movd xmm2, [ebp+eax] ; xxxxxxdc
pxor xmm0, xmm0
@@ -930,7 +930,7 @@
punpcklwd xmm1, xmm2 ; xxxxdcba
punpcklbw xmm1, xmm0 ; 0d0c0b0a
punpcklwd xmm1, xmm0 ; 000d000c000b000a
-
+
movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
movdqa xmm0, xmm2
@@ -942,20 +942,20 @@
pshufd xmm1, xmm2, 00001110b
paddq xmm2, xmm1
psrlq xmm2, 29
-
+
movd eax, xmm2
inc eax
shr eax, 1
mov [edi], al
inc edi
-
+
mov eax, [uiScaleX]
add [xInverse], eax
-
+
paddw xmm3, xmm7 ; inc u
psllw xmm3, 1
psrlw xmm3, 1
-
+
loop WIDTH
WIDTH_END:
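The WIDTH loop above evaluates one bilinear tap in 15-bit fixed point: pmaddwd combines the u/(1-u) and v/(1-v) weights, and the psrlq 29 / inc / shr 1 epilogue rounds the 2^30-scaled accumulator, i.e. ((acc >> 29) + 1) >> 1. A scalar sketch of the same tap:

    #include <stdint.h>

    /* u and v are fractional positions in units of 1/32768 (so the
     * accumulator carries a 2^30 scale). */
    static uint8_t BilinearTap(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
                               uint32_t u, uint32_t v) {
        uint64_t inv_u = 32768 - u, inv_v = 32768 - v;
        uint64_t acc = inv_u * inv_v * a + (uint64_t)u * inv_v * b
                     + inv_u * (uint64_t)v * c + (uint64_t)u * v * d;
        return (uint8_t)((acc + (1u << 29)) >> 30); /* round to nearest */
    }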
@@ -964,41 +964,41 @@
mov cl, [esi+eax]
mov [edi], cl
inc edi
-
+
mov eax, [uiScaleY]
add [yInverse], eax
add edi, [dstStep]
-
+
paddw xmm4, xmm6 ; inc v
psllw xmm4, 1
psrlw xmm4, 1
-
+
dec dword [tmpHeight]
jg HEIGHT
-LAST_ROW:
+LAST_ROW:
mov eax, [yInverse]
mov esi, [pSrcData]
shr eax, 15
mul dword [dwSrcStride]
add esi, eax ; get current row address
-
+
mov eax, 16384
mov [xInverse], eax
mov ecx, [dwDstWidth]
-
+
LAST_ROW_WIDTH:
mov eax, [xInverse]
shr eax, 15
-
+
mov al, [esi+eax]
mov [edi], al
inc edi
-
+
mov eax, [uiScaleX]
add [xInverse], eax
-
+
loop LAST_ROW_WIDTH
LAST_ROW_END:
@@ -1026,10 +1026,10 @@
%undef xInverse
%undef dstStep
ret
-
-
-
-
+
+
+
+
WELS_EXTERN GeneralBilinearFastDownsampler_sse2
;**************************************************************************************************************
;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
@@ -1062,7 +1062,7 @@
%define xInverse esp + 20
%define dstStep esp + 24
sub esp, localsize
-
+
pxor xmm0, xmm0
mov edx, 65535
mov eax, [uiScaleX]
@@ -1075,7 +1075,7 @@
psllq xmm1, 32
por xmm1, xmm2 ; 0 uinc 0 -uinc
pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc
-
+
mov eax, [uiScaleY]
and eax, 32767
mov ebx, eax
@@ -1086,15 +1086,15 @@
psllq xmm6, 32
por xmm6, xmm2 ; 0 vinc 0 -vinc
pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc
-
+
mov edx, 80007fffh ; 32768 32767
- movd xmm5, edx
+ movd xmm5, edx
pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767
mov ebx, 16384
-
+
FAST_DOWNSAMPLE:
-
+
mov eax, [dwDstHeight]
mov edi, [pDstData]
mov edx, [dwDstStride]
@@ -1105,11 +1105,11 @@
mov [tmpHeight], eax
mov eax, 16384
mov [yInverse], eax
-
+
pshuflw xmm4, xmm5, 01010000b
psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383
-
-FAST_HEIGHT:
+
+FAST_HEIGHT:
mov eax, [yInverse]
mov esi, [pSrcData]
shr eax, 15
@@ -1117,23 +1117,23 @@
add esi, eax ; get current row address
mov ebp, esi
add ebp, [dwSrcStride]
-
+
mov eax, 32768
mov [xInverse], eax
mov ecx, [dwDstWidth]
dec ecx
-
+
movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767
-
+
FAST_WIDTH:
mov eax, [xInverse]
shr eax, 16
-
+
movd xmm1, [esi+eax] ; xxxxxxba
movd xmm2, [ebp+eax] ; xxxxxxdc
punpcklwd xmm1, xmm2 ; xxxxdcba
punpcklbw xmm1, xmm0 ; 0d0c0b0a
-
+
movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
pmaddwd xmm2, xmm1
@@ -1142,17 +1142,17 @@
movd xmm1, ebx
paddd xmm2, xmm1
psrld xmm2, 15
-
+
packuswb xmm2, xmm0
movd eax, xmm2
mov [edi], al
inc edi
-
+
mov eax, [uiScaleX]
add [xInverse], eax
-
+
paddw xmm3, xmm7 ; inc u
-
+
loop FAST_WIDTH
FAST_WIDTH_END:
@@ -1161,41 +1161,41 @@
mov cl, [esi+eax]
mov [edi], cl
inc edi
-
+
mov eax, [uiScaleY]
add [yInverse], eax
add edi, [dstStep]
-
+
paddw xmm4, xmm6 ; inc v
psllw xmm4, 1
psrlw xmm4, 1
-
+
dec dword [tmpHeight]
jg FAST_HEIGHT
-FAST_LAST_ROW:
+FAST_LAST_ROW:
mov eax, [yInverse]
mov esi, [pSrcData]
shr eax, 15
mul dword [dwSrcStride]
add esi, eax ; get current row address
-
+
mov eax, 32768
mov [xInverse], eax
mov ecx, [dwDstWidth]
-
+
FAST_LAST_ROW_WIDTH:
mov eax, [xInverse]
shr eax, 16
-
+
mov al, [esi+eax]
mov [edi], al
inc edi
-
+
mov eax, [uiScaleX]
add [xInverse], eax
-
+
loop FAST_LAST_ROW_WIDTH
FAST_LAST_ROW_END:
--- a/processing/src/asm/intra_pred.asm
+++ b/processing/src/asm/intra_pred.asm
@@ -85,7 +85,7 @@
%macro SSE2_PRED_H_16X16_TWO_LINE 1
lea eax, [eax+ecx*2]
-
+
COPY_16_TIMES eax, xmm0
movdqa [edx+%1], xmm0
COPY_16_TIMESS eax, xmm0, ecx
@@ -97,13 +97,13 @@
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
-
+
COPY_16_TIMES eax, xmm0
movdqa [edx], xmm0
COPY_16_TIMESS eax, xmm0, ecx
movdqa [edx+0x10], xmm0
-
- SSE2_PRED_H_16X16_TWO_LINE 0x20
+
+ SSE2_PRED_H_16X16_TWO_LINE 0x20
SSE2_PRED_H_16X16_TWO_LINE 0x40
SSE2_PRED_H_16X16_TWO_LINE 0x60
SSE2_PRED_H_16X16_TWO_LINE 0x80
@@ -110,9 +110,9 @@
SSE2_PRED_H_16X16_TWO_LINE 0xa0
SSE2_PRED_H_16X16_TWO_LINE 0xc0
SSE2_PRED_H_16X16_TWO_LINE 0xe0
-
+
ret
-
+
;***********************************************************************
; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
@@ -121,10 +121,10 @@
mov edx, [esp+4] ; pred
mov eax, [esp+8] ; pRef
mov ecx, [esp+12] ; stride
-
+
sub eax, ecx
movdqa xmm0, [eax]
-
+
movdqa [edx], xmm0
movdqa [edx+10h], xmm0
movdqa [edx+20h], xmm0
@@ -135,11 +135,11 @@
movdqa [edx+70h], xmm0
movdqa [edx+80h], xmm0
movdqa [edx+90h], xmm0
- movdqa [edx+160], xmm0
+ movdqa [edx+160], xmm0
movdqa [edx+176], xmm0
movdqa [edx+192], xmm0
movdqa [edx+208], xmm0
movdqa [edx+224], xmm0
movdqa [edx+240], xmm0
-
+
ret
\ No newline at end of file
--- a/processing/src/asm/sad.asm
+++ b/processing/src/asm/sad.asm
@@ -67,7 +67,7 @@
%endmacro
-
+
%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
and %1, 0x1f|(%3>>1)
cmp %1, (32-%2)|(%3>>1)
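CACHE_SPLIT_CHECK folds the question "does a width-byte access starting at this address straddle a cache line?" into a single and/cmp pair; the |(%3>>1) term widens the 32-byte mask and comparison bound when the cache line is 64 bytes. Scalar model:

    #include <stdint.h>

    /* Nonzero when bytes [addr, addr+width) cross a cache-line boundary;
     * cacheline is assumed to be a power of two (32 or 64 here). */
    static int CrossesCacheLine(uintptr_t addr, unsigned width,
                                unsigned cacheline) {
        return (addr & (cacheline - 1)) > cacheline - width;
    }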
@@ -108,15 +108,15 @@
push edi
mov eax, [esp+12]
mov ebx, [esp+16]
-
+
pxor xmm7, xmm7
-
+
mov edi, ecx
and edi, 0x07
- sub ecx, edi
+ sub ecx, edi
mov edx, 8
sub edx, edi
-
+
shl edi, 3
shl edx, 3
movd xmm5, edi
@@ -124,10 +124,10 @@
mov edi, 8
add edi, ecx
mov edx, [esp+24]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -135,17 +135,17 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -153,7 +153,7 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
@@ -160,10 +160,10 @@
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -171,17 +171,17 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
-
+
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
lea edi, [edi+2*edx]
-
+
movq xmm0, [eax]
movhps xmm0, [eax+ebx]
-
+
movq xmm1, [ecx]
movq xmm2, [edi]
movhps xmm1, [ecx+edx]
@@ -189,10 +189,10 @@
psrlq xmm1, xmm5
psllq xmm2, xmm6
por xmm1, xmm2
-
+
psadbw xmm0, xmm1
paddw xmm7, xmm0
-
+
movhlps xmm0, xmm7
paddw xmm0, xmm7
movd eax, xmm0
@@ -202,12 +202,12 @@
push ebx
mov eax, [esp+8]
mov ebx, [esp+12]
- mov edx, [esp+20]
+ mov edx, [esp+20]
pxor xmm6, xmm6
SSE2_GetSad8x4
lea eax, [eax+2*ebx]
lea ecx, [ecx+2*edx]
- SSE2_GetSad8x4
+ SSE2_GetSad8x4
movhlps xmm0, xmm6
paddw xmm0, xmm6
movd eax, xmm0
--- a/processing/src/asm/vaa.asm
+++ b/processing/src/asm/vaa.asm
@@ -163,7 +163,7 @@
paddd xmm6, xmm1
paddd xmm6, xmm3
lea esi, [esi+ebx*2]
- lea edi, [edi+ebx*2]
+ lea edi, [edi+ebx*2]
%endmacro
%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 0
@@ -172,11 +172,11 @@
movdqa xmm3, xmm1
psadbw xmm3, xmm2
paddd xmm6, xmm3
-
+
movdqa xmm3, xmm1
psadbw xmm3, xmm0
paddd xmm5, xmm3
-
+
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
@@ -184,7 +184,7 @@
pmaddwd xmm2, xmm2
paddd xmm4, xmm1
paddd xmm4, xmm2
-
+
add esi, ebx
add edi, ebx
%endmacro
@@ -195,16 +195,16 @@
movdqa xmm3, xmm1
psadbw xmm3, xmm2
paddd xmm7, xmm3 ; sad
-
+
movdqa xmm3, xmm1
pmaxub xmm3, xmm2
pminub xmm2, xmm1
psubb xmm3, xmm2 ; diff
-
+
movdqa xmm2, xmm1
psadbw xmm2, xmm0
paddd xmm6, xmm2 ; sum
-
+
movdqa xmm2, xmm1
punpcklbw xmm1, xmm0
punpckhbw xmm2, xmm0
@@ -212,7 +212,7 @@
pmaddwd xmm2, xmm2
paddd xmm5, xmm1
paddd xmm5, xmm2 ; sqsum
-
+
movdqa xmm1, xmm3
punpcklbw xmm1, xmm0
punpckhbw xmm3, xmm0
@@ -220,7 +220,7 @@
pmaddwd xmm3, xmm3
paddd xmm4, xmm1
paddd xmm4, xmm3 ; sqdiff
-
+
add esi, ebx
add edi, ebx
%endmacro
@@ -238,16 +238,16 @@
movdqa xmm3, xmm2
psadbw xmm3, xmm0
paddd sum_ref_reg, xmm3 ; sum_ref
-
+
movdqa xmm3, xmm1
pmaxub xmm3, xmm2
pminub xmm2, xmm1
psubb xmm3, xmm2 ; abs diff
pmaxub mad_reg, xmm3 ; max abs diff
-
+
psadbw xmm3, xmm0
paddd sad_reg, xmm3 ; sad
-
+
add esi, ebx
add edi, ebx
%endmacro
@@ -285,7 +285,7 @@
psllq xmm3, 32
paddd xmm2, xmm3
paddd sad_reg, xmm2 ; sqsum
-
+
movdqa xmm2, [edi]
movdqa xmm3, xmm1
psadbw xmm3, xmm0
@@ -294,13 +294,13 @@
psadbw xmm3, xmm0
pslldq xmm3, 4
paddd sum_reg, xmm3 ; sum_ref
-
+
movdqa xmm3, xmm1
pmaxub xmm3, xmm2
pminub xmm2, xmm1
psubb xmm3, xmm2 ; abs diff
pmaxub mad_reg, xmm3 ; max abs diff
-
+
movdqa xmm1, xmm3
psadbw xmm3, xmm0
paddd sad_reg, xmm3 ; sad
@@ -312,7 +312,7 @@
pmaddwd xmm3, xmm3
paddd sqdiff_reg, xmm1
paddd sqdiff_reg, xmm3 ; sqdiff
-
+
add esi, ebx
add edi, ebx
%endmacro
@@ -351,7 +351,7 @@
mov ebx, [esp+32]
mov ecx, [esp+36]
mov edx, [esp+40]
- pxor xmm0, xmm0
+ pxor xmm0, xmm0
.hloop:
mov eax, ebx
mov ebp, $0
@@ -361,7 +361,7 @@
psadbw xmm1, xmm2
pshufd xmm2, xmm1, 0f6h ; 11110110 B ; movhlps for float
paddd xmm1, xmm2
- paddd xmm0, xmm1
+ paddd xmm0, xmm1
add ebp, 010h
dec eax
jnz near .wloop
@@ -384,20 +384,20 @@
; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
;***********************************************************************
ALIGN 16
-SampleVariance16x16_sse2:
+SampleVariance16x16_sse2:
push esi
push edi
push ebx
-
+
sub esp, 16
%define SUM [esp]
%define SUM_CUR [esp+4]
%define SQR [esp+8]
%define SQR_CUR [esp+12]
- %define PUSH_SIZE 28 ; 12 + 16
+ %define PUSH_SIZE 28 ; 12 + 16
mov edi, [esp+PUSH_SIZE+4] ; y_ref
- mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
+ mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
mov esi, [esp+PUSH_SIZE+12] ; y_src
mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
mov ecx, 010h ; height = 16
@@ -422,7 +422,7 @@
; sqr += diff * diff;
pmaxub xmm0, xmm1
pminub xmm1, xmm2
- psubb xmm0, xmm1 ; diff
+ psubb xmm0, xmm1 ; diff
SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
movd ebx, xmm1
add SQR, ebx
@@ -433,7 +433,7 @@
punpcklbw xmm0, xmm7
punpckhbw xmm1, xmm7
paddw xmm0, xmm1 ; 8x2
- SUM_WORD_8x2_SSE2 xmm0, xmm1
+ SUM_WORD_8x2_SSE2 xmm0, xmm1
movd ebx, xmm0
and ebx, 0ffffh
add SUM_CUR, ebx
@@ -442,12 +442,12 @@
SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
movd ebx, xmm0
add SQR_CUR, ebx
-
+
lea edi, [edi+edx]
lea esi, [esi+eax]
dec ecx
jnz near .hloops
-
+
mov ebx, 0
mov bx, word SUM
sar ebx, 8
@@ -465,7 +465,7 @@
sar ecx, 8
sub ecx, ebx
mov [edi+2], cx ; to store uiTextureIndex
-
+
%undef SUM
%undef SUM_CUR
%undef SQR
@@ -472,10 +472,10 @@
%undef SQR_CUR
%undef PUSH_SIZE
- add esp, 16
+ add esp, 16
pop ebx
pop edi
- pop esi
+ pop esi
ret
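SampleVariance16x16 above keeps two sum/square-sum pairs (the ref-vs-src difference for the motion index, the source alone for the texture index) and derives each index as E[x^2] - E[x]^2, with the >>8 shifts standing in for /256. A scalar model of one such pair; the field layout of SMotionTextureUnit is assumed, not shown in this diff:

    #include <stdint.h>

    static uint16_t BlockVariance16x16(const uint8_t *p, int stride) {
        uint32_t sum = 0, sqr = 0;
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++) {
                uint32_t v = p[y * stride + x];
                sum += v;
                sqr += v * v;
            }
        uint32_t mean = sum >> 8;                  /* /256 pixels */
        return (uint16_t)((sqr >> 8) - mean * mean);
    }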
@@ -497,7 +497,7 @@
mov ebp, esp
and ebp, 0fh
sub esp, ebp
- sub esp, 32
+ sub esp, 32
%define PUSH_SIZE 52 ; 20 + 32
mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
@@ -509,31 +509,31 @@
add edx, ecx ; linesize x 3 [edx]
mov eax, ebx
sal eax, $1 ; linesize x 4 [eax]
-
+
pxor xmm7, xmm7
-
+
; loops
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0
+ movq [esp], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+8], xmm0
+ movq [esp+8], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0
+ movq [esp+16], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
movq [esp+24], xmm0
-
+
movdqa xmm0, [esp] ; block 0~7
movdqa xmm1, [esp+16] ; block 8~15
movdqa xmm2, xmm0
paddw xmm0, xmm1
SUM_WORD_8x2_SSE2 xmm0, xmm3
-
+
pmullw xmm1, xmm1
pmullw xmm2, xmm2
movdqa xmm3, xmm1
@@ -549,7 +549,7 @@
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
-
+
movd ebx, xmm0
and ebx, 0ffffh ; effective low word truncated
mov ecx, ebx
@@ -557,7 +557,7 @@
sar ebx, $4
movd eax, xmm1
sub eax, ebx
-
+
%undef PUSH_SIZE
add esp, 32
add esp, ebp
@@ -567,7 +567,7 @@
pop edx
pop ebx
ret
-
+
WELS_EXTERN AnalysisVaaInfoIntra_ssse3
;***********************************************************************
; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t linesize );
@@ -583,7 +583,7 @@
mov ebp, esp
and ebp, 0fh
sub esp, ebp
- sub esp, 32
+ sub esp, 32
%define PUSH_SIZE 52 ; 20 + 32
mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
@@ -595,25 +595,25 @@
add edx, ecx ; linesize x 3 [edx]
mov eax, ebx
sal eax, $1 ; linesize x 4 [eax]
-
+
pxor xmm7, xmm7
-
+
; loops
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0
+ movq [esp], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
- movq [esp+8], xmm1
+ movq [esp+8], xmm1
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0
+ movq [esp+16], xmm0
lea esi, [esi+eax]
VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movq [esp+24], xmm1
-
+
movdqa xmm0, [esp] ; block 0~7
movdqa xmm1, [esp+16] ; block 8~15
movdqa xmm2, xmm0
@@ -635,7 +635,7 @@
paddd xmm1, xmm2
pshufd xmm2, xmm1, 0B1h
paddd xmm1, xmm2
-
+
movd ebx, xmm0
and ebx, 0ffffh ; effective low word truncated
mov ecx, ebx
@@ -643,7 +643,7 @@
sar ebx, $4
movd eax, xmm1
sub eax, ebx
-
+
%undef PUSH_SIZE
add esp, 32
add esp, ebp
@@ -654,12 +654,12 @@
pop ebx
ret
%endif
-
-
+
+
WELS_EXTERN abs_difference_mbrow_sse2
;*************************************************************************************************************
-;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
+;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
; int32_t gom_pixel_num, int32_t *pSum)
;*************************************************************************************************************
ALIGN 16
@@ -691,13 +691,13 @@
add edi, 16
cmp esi, edx
jl gom_row_loop_p
-
+
sub esi, eax
sub edi, eax
add esi, ebx
add edi, ebx
loop mb_width_loop_p
-
+
movdqa xmm1, xmm0
psrldq xmm1, 8
paddd xmm1, xmm0
@@ -710,7 +710,7 @@
%undef iPicStride
%undef gom_pixel_num
%undef pSum
-%undef pushsize
+%undef pushsize
pop ebx
pop edi
pop esi
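abs_difference_mbrow and the sum_sqrsum variant below both walk a 16-row band (one row of macroblocks). A hedged scalar model of the first; reading gom_pixel_num as the band's pixel width is an assumption:

    #include <stdint.h>
    #include <stdlib.h>

    /* Accumulates the absolute cur/ref difference over a 16-row band of
     * gomPixelNum pixels into *pSum, as the SSE2 loop appears to do. */
    static void AbsDifferenceMbrow(const uint8_t *ref, const uint8_t *cur,
                                   int32_t stride, int32_t gomPixelNum,
                                   int32_t *pSum) {
        int32_t sum = 0;
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < gomPixelNum; x++)
                sum += abs(ref[y * stride + x] - cur[y * stride + x]);
        *pSum += sum;
    }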
@@ -721,7 +721,7 @@
WELS_EXTERN sum_sqrsum_mbrow_sse2
;*************************************************************************************************************
-;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
+;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
; int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
;*************************************************************************************************************
ALIGN 16
@@ -759,11 +759,11 @@
add esi, 16
cmp esi, edx
jl gom_row_loop_i
-
+
sub esi, eax
add esi, ebx
loop mb_width_loop_i
-
+
movdqa xmm3, xmm1
psrldq xmm3, 8
paddd xmm1, xmm3
@@ -770,7 +770,7 @@
movd eax, xmm1
mov edx, [pSum]
add [edx], eax
-
+
movdqa xmm3, xmm2
psrldq xmm3, 8
paddd xmm2, xmm3
@@ -787,7 +787,7 @@
%undef gom_pixel_num
%undef pSum
%undef pSqrSum
-%undef pushsize
+%undef pushsize
pop ebx
pop esi
ret
@@ -819,7 +819,7 @@
mov ebx, [iPicStride]
mov edx, [psad8x8]
mov eax, ebx
-
+
shr dword [iPicWidth], 4 ; iPicWidth/16
shr dword [iPicHeight], 4 ; iPicHeight/16
shl eax, 4 ; iPicStride*16
@@ -839,7 +839,7 @@
movd [edx], xmm6
psrldq xmm6, 8
movd [edx+4], xmm6
-
+
pxor xmm6, xmm6
WELS_SAD_16x2_SSE2
WELS_SAD_16x2_SSE2
@@ -849,24 +849,24 @@
movd [edx+8], xmm6
psrldq xmm6, 8
movd [edx+12], xmm6
-
+
add edx, 16
sub esi, eax
sub edi, eax
add esi, 16
add edi, 16
-
+
dec ecx
jnz width_loop
-
+
pop edi
pop esi
add esi, eax
add edi, eax
-
+
dec dword [iPicHeight]
jnz height_loop
-
+
mov edx, [psadframe]
movdqa xmm5, xmm7
psrldq xmm7, 8
@@ -880,16 +880,16 @@
%undef iPicStride
%undef psadframe
%undef psad8x8
-%undef pushsize
+%undef pushsize
pop ebx
pop edi
pop esi
ret
-
-
+
+
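The VAACalcSad loop above emits one SAD per 8x8 block plus a whole-frame total. A hedged C reference; the order in which a macroblock's four 8x8 sums land in psad8x8 is assumed raster order, which this diff alone does not confirm:

    #include <stdint.h>
    #include <stdlib.h>

    static void VAACalcSad_c(const uint8_t *cur, const uint8_t *ref,
                             int32_t picWidth, int32_t picHeight,
                             int32_t stride, int32_t *psadframe,
                             int32_t *psad8x8) {
        int32_t frameSad = 0, idx = 0;
        for (int my = 0; my < picHeight; my += 16)
            for (int mx = 0; mx < picWidth; mx += 16)
                for (int by = 0; by < 2; by++)
                    for (int bx = 0; bx < 2; bx++) {
                        const uint8_t *c = cur + (my + 8 * by) * stride
                                               + mx + 8 * bx;
                        const uint8_t *r = ref + (my + 8 * by) * stride
                                               + mx + 8 * bx;
                        int32_t sad = 0;
                        for (int y = 0; y < 8; y++)
                            for (int x = 0; x < 8; x++)
                                sad += abs(c[y * stride + x] - r[y * stride + x]);
                        psad8x8[idx++] = sad;  /* ordering assumed */
                        frameSad += sad;
                    }
        *psadframe = frameSad;
    }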
WELS_EXTERN VAACalcSadVar_sse2
;*************************************************************************************************************
-;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
+;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
;*************************************************************************************************************
@@ -919,7 +919,7 @@
mov ebx, [iPicStride]
mov edx, [psad8x8]
mov eax, ebx
-
+
shr dword [iPicWidth], 4 ; iPicWidth/16
shr dword [iPicHeight], 4 ; iPicHeight/16
shl eax, 4 ; iPicStride*16
@@ -945,7 +945,7 @@
movd [edx], xmm6
psrldq xmm6, 8
movd [edx+4], xmm6
-
+
pxor xmm6, xmm6
WELS_SAD_SUM_SQSUM_16x1_SSE2
WELS_SAD_SUM_SQSUM_16x1_SSE2
@@ -959,7 +959,7 @@
movd [edx+8], xmm6
psrldq xmm6, 8
movd [edx+12], xmm6
-
+
mov ebp, [psum16x16]
movdqa xmm1, xmm5
psrldq xmm1, 8
@@ -966,7 +966,7 @@
paddd xmm5, xmm1
movd [ebp], xmm5
add dword [psum16x16], 4
-
+
movdqa xmm5, xmm4
psrldq xmm5, 8
paddd xmm4, xmm5
@@ -973,28 +973,28 @@
movdqa xmm3, xmm4
psrldq xmm3, 4
paddd xmm4, xmm3
-
+
mov ebp, [psqsum16x16]
movd [ebp], xmm4
add dword [psqsum16x16], 4
-
+
add edx, 16
sub esi, eax
sub edi, eax
add esi, 16
add edi, 16
-
+
dec ecx
jnz var_width_loop
-
+
mov esi, [tmp_esi]
mov edi, [tmp_edi]
add esi, eax
add edi, eax
-
+
dec dword [iPicHeight]
jnz var_height_loop
-
+
mov edx, [psadframe]
movdqa xmm5, xmm7
psrldq xmm7, 8
@@ -1001,7 +1001,7 @@
paddd xmm7, xmm5
movd [edx], xmm7
- add esp, localsize
+ add esp, localsize
pop ebx
pop edi
pop esi
@@ -1020,12 +1020,12 @@
%undef pushsize
%undef localsize
ret
-
-
+
+
WELS_EXTERN VAACalcSadSsd_sse2
;*************************************************************************************************************
-;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
;*************************************************************************************************************
@@ -1059,7 +1059,7 @@
mov ebx, [iPicStride]
mov edx, [psad8x8]
mov eax, ebx
-
+
shr dword [iPicWidth], 4 ; iPicWidth/16
shr dword [iPicHeight], 4 ; iPicHeight/16
shl eax, 4 ; iPicStride*16
@@ -1091,7 +1091,7 @@
movd [edx+4], xmm7
movd ebp, xmm1
add [tmp_sadframe], ebp
-
+
pxor xmm7, xmm7
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
@@ -1108,7 +1108,7 @@
movd [edx+12], xmm7
movd ebp, xmm1
add [tmp_sadframe], ebp
-
+
mov ebp, [psum16x16]
movdqa xmm1, xmm6
psrldq xmm1, 8
@@ -1115,7 +1115,7 @@
paddd xmm6, xmm1
movd [ebp], xmm6
add dword [psum16x16], 4
-
+
mov ebp, [psqsum16x16]
pshufd xmm6, xmm5, 14 ;00001110
paddd xmm6, xmm5
@@ -1123,7 +1123,7 @@
paddd xmm5, xmm6
movd [ebp], xmm5
add dword [psqsum16x16], 4
-
+
mov ebp, [psqdiff16x16]
pshufd xmm5, xmm4, 14 ; 00001110
paddd xmm5, xmm4
@@ -1131,29 +1131,29 @@
paddd xmm4, xmm5
movd [ebp], xmm4
add dword [psqdiff16x16], 4
-
+
add edx, 16
sub esi, eax
sub edi, eax
add esi, 16
add edi, 16
-
+
dec ecx
jnz sqdiff_width_loop
-
+
mov esi, [tmp_esi]
mov edi, [tmp_edi]
add esi, eax
add edi, eax
-
+
dec dword [iPicHeight]
jnz sqdiff_height_loop
-
+
mov ebx, [tmp_sadframe]
mov eax, [psadframe]
mov [eax], ebx
- add esp, localsize
+ add esp, localsize
pop ebx
pop edi
pop esi
@@ -1174,14 +1174,14 @@
%undef pushsize
%undef localsize
ret
-
-
-
-
+
+
+
+
WELS_EXTERN VAACalcSadBgd_sse2
;*************************************************************************************************************
-;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
;*************************************************************************************************************
@@ -1211,7 +1211,7 @@
mov edi, [ref_data]
mov ebx, [iPicStride]
mov eax, ebx
-
+
shr dword [iPicWidth], 4 ; iPicWidth/16
shr dword [iPicHeight], 4 ; iPicHeight/16
shl eax, 4 ; iPicStride*16
@@ -1234,11 +1234,11 @@
WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
-
+
+
mov edx, [p_mad8x8]
WELS_MAX_REG_SSE2 xmm4
-
+
;movdqa xmm1, xmm4
;punpcklbw xmm1, xmm0
;punpcklwd xmm1, xmm0
@@ -1247,7 +1247,7 @@
;punpcklwd xmm4, xmm0
;movd [edx+4], xmm4
;add edx, 8
- ;mov [p_mad8x8], edx
+ ;mov [p_mad8x8], edx
mov [tmp_ecx], ecx
movhlps xmm1, xmm4
movd ecx, xmm4
@@ -1257,12 +1257,12 @@
add edx, 2
mov [p_mad8x8], edx
-
+
pslldq xmm7, 4
pslldq xmm6, 4
pslldq xmm5, 4
-
-
+
+
pxor xmm4, xmm4 ; pMad8x8
WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
@@ -1272,10 +1272,10 @@
WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
+
mov edx, [p_mad8x8]
WELS_MAX_REG_SSE2 xmm4
-
+
;movdqa xmm1, xmm4
;punpcklbw xmm1, xmm0
;punpcklwd xmm1, xmm0
@@ -1284,7 +1284,7 @@
;punpcklwd xmm4, xmm0
;movd [edx+4], xmm4
;add edx, 8
- ;mov [p_mad8x8], edx
+ ;mov [p_mad8x8], edx
movhlps xmm1, xmm4
movd ecx, xmm4
mov [edx], cl
@@ -1292,21 +1292,21 @@
mov [edx+1],cl
add edx, 2
mov [p_mad8x8], edx
-
+
; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
-
+
mov edx, [psad8x8]
pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
- movdqa [edx], xmm1
+ movdqa [edx], xmm1
add edx, 16
mov [psad8x8], edx ; sad8x8
-
+
paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
pshufd xmm2, xmm1, 00000011b
paddd xmm1, xmm2
movd edx, xmm1
add ebp, edx ; sad frame
-
+
mov edx, [p_sd8x8]
psubd xmm6, xmm5
pshufd xmm1, xmm6, 10001101b
@@ -1313,30 +1313,30 @@
movdqa [edx], xmm1
add edx, 16
mov [p_sd8x8], edx
-
-
+
+
add edx, 16
sub esi, eax
sub edi, eax
add esi, 16
add edi, 16
-
+
mov ecx, [tmp_ecx]
dec ecx
jnz bgd_width_loop
-
+
mov esi, [tmp_esi]
mov edi, [tmp_edi]
add esi, eax
add edi, eax
-
+
dec dword [iPicHeight]
jnz bgd_height_loop
-
+
mov edx, [psadframe]
mov [edx], ebp
- add esp, localsize
+ add esp, localsize
pop ebx
pop edi
pop esi
@@ -1360,8 +1360,8 @@
WELS_EXTERN VAACalcSadSsdBgd_sse2
;*************************************************************************************************************
-;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
;*************************************************************************************************************
@@ -1395,7 +1395,7 @@
mov edi, [ref_data]
mov ebx, [iPicStride]
mov eax, ebx
-
+
shr dword [iPicWidth], 4 ; iPicWidth/16
shr dword [iPicHeight], 4 ; iPicHeight/16
shl eax, 4 ; iPicStride*16
@@ -1418,7 +1418,7 @@
WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
+
mov edx, [psad8x8]
movdqa xmm2, xmm7
pshufd xmm1, xmm2, 00001110b
@@ -1426,17 +1426,17 @@
movd [edx+4], xmm1
add edx, 8
mov [psad8x8], edx ; sad8x8
-
+
paddd xmm1, xmm2
movd edx, xmm1
add [tmp_sadframe], edx ; iFrameSad
-
+
mov edx, [psum16x16]
movdqa xmm1, xmm6
pshufd xmm2, xmm1, 00001110b
paddd xmm1, xmm2
movd [edx], xmm1 ; sum
-
+
mov edx, [p_sd8x8]
pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
psubd xmm6, xmm1 ; 00 diff1 00 diff0
@@ -1444,7 +1444,7 @@
movq [edx], xmm1
add edx, 8
mov [p_sd8x8], edx
-
+
mov edx, [p_mad8x8]
WELS_MAX_REG_SSE2 xmm5
;movdqa xmm1, xmm5
@@ -1464,7 +1464,7 @@
mov [edx+1],cl
add edx, 2
mov [p_mad8x8], edx
-
+
psrlq xmm7, 32
psllq xmm7, 32 ; clear sad
pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
@@ -1477,7 +1477,7 @@
WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
+
mov edx, [psad8x8]
movdqa xmm2, xmm7
pshufd xmm1, xmm2, 00001110b
@@ -1485,11 +1485,11 @@
movd [edx+4], xmm1
add edx, 8
mov [psad8x8], edx ; sad8x8
-
+
paddd xmm1, xmm2
movd edx, xmm1
add [tmp_sadframe], edx ; iFrameSad
-
+
mov edx, [psum16x16]
movdqa xmm1, xmm6
pshufd xmm2, xmm1, 00001110b
@@ -1498,7 +1498,7 @@
add [edx], ebp
add edx, 4
mov [psum16x16], edx
-
+
mov edx, [psqsum16x16]
psrlq xmm7, 32
pshufd xmm2, xmm7, 00001110b
@@ -1506,7 +1506,7 @@
movd [edx], xmm2 ; sqsum
add edx, 4
mov [psqsum16x16], edx
-
+
mov edx, [p_sd8x8]
pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
psubd xmm6, xmm1 ; 00 diff1 00 diff0
@@ -1514,7 +1514,7 @@
movq [edx], xmm1
add edx, 8
mov [p_sd8x8], edx
-
+
mov edx, [p_mad8x8]
WELS_MAX_REG_SSE2 xmm5
;movdqa xmm1, xmm5
@@ -1525,7 +1525,7 @@
;punpcklwd xmm5, xmm0
;movd [edx+4], xmm5
;add edx, 8
- ;mov [p_mad8x8], edx
+ ;mov [p_mad8x8], edx
movhlps xmm1, xmm5
movd ecx, xmm5
mov [edx], cl
@@ -1533,7 +1533,7 @@
mov [edx+1],cl
add edx, 2
mov [p_mad8x8], edx
-
+
mov edx, [psqdiff16x16]
pshufd xmm1, xmm4, 00001110b
paddd xmm4, xmm1
@@ -1542,30 +1542,30 @@
movd [edx], xmm4
add edx, 4
mov [psqdiff16x16], edx
-
+
add edx, 16
sub esi, eax
sub edi, eax
add esi, 16
add edi, 16
-
+
mov ecx, [tmp_ecx]
dec ecx
jnz sqdiff_bgd_width_loop
-
+
mov esi, [tmp_esi]
mov edi, [tmp_edi]
add esi, eax
add edi, eax
-
+
dec dword [iPicHeight]
jnz sqdiff_bgd_height_loop
-
+
mov edx, [psadframe]
mov ebp, [tmp_sadframe]
mov [edx], ebp
- add esp, localsize
+ add esp, localsize
pop ebx
pop edi
pop esi
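
For reference, the prototype in the comment block above is the routine these hunks reflow. Below is a minimal caller sketch in C; the per-block output layout (one entry per 8x8 block for psad8x8/p_sd8x8/p_mad8x8, one per 16x16 block for psum16x16/psqsum16x16/psqdiff16x16) is an inference from the parameter names and the 8/16 shifts in the code, not something this patch states:

#include <stdint.h>
#include <stdlib.h>

/* Prototype as given in the assembly comment above. */
void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data,
                           int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
                           int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16,
                           int32_t *psqsum16x16, int32_t *psqdiff16x16,
                           int32_t *p_sd8x8, uint8_t *p_mad8x8);

int main(void) {
  /* The routine shifts iPicWidth/iPicHeight right by 4, so both are in
     pixels and should be multiples of 16. */
  int32_t w = 320, h = 240, stride = 320;
  int32_t n8 = (w / 8) * (h / 8), n16 = (w / 16) * (h / 16); /* assumed layout */
  uint8_t *cur = calloc((size_t)stride * h, 1);
  uint8_t *ref = calloc((size_t)stride * h, 1);
  int32_t sadframe = 0;
  int32_t *sad8x8   = calloc(n8,  sizeof(int32_t));
  int32_t *sd8x8    = calloc(n8,  sizeof(int32_t));
  uint8_t *mad8x8   = calloc(n8,  1);
  int32_t *sum16    = calloc(n16, sizeof(int32_t));
  int32_t *sqsum16  = calloc(n16, sizeof(int32_t));
  int32_t *sqdiff16 = calloc(n16, sizeof(int32_t));
  VAACalcSadSsdBgd_sse2(cur, ref, w, h, stride, &sadframe,
                        sad8x8, sum16, sqsum16, sqdiff16, sd8x8, mad8x8);
  /* ... consume the per-block statistics; frees omitted for brevity ... */
  return 0;
}
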
--- a/processing/src/common/WelsVP.def
+++ b/processing/src/common/WelsVP.def
@@ -33,4 +33,4 @@
LIBRARY welsvp.dll
EXPORTS
CreateVpInterface PRIVATE
- DestroyVpInterface PRIVATE
\ No newline at end of file
+ DestroyVpInterface PRIVATE
\ No newline at end of file
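
The module-definition file above pins welsvp.dll's two exports by name. A small sketch of resolving them at runtime with the Win32 loader; the entry points' actual parameter lists are not part of this patch, so they are left as opaque FARPROC values here:

#include <windows.h>
#include <stdio.h>

int main(void) {
  HMODULE mod = LoadLibraryA("welsvp.dll");
  if (!mod) {
    fprintf(stderr, "failed to load welsvp.dll\n");
    return 1;
  }
  /* Names match the EXPORTS section of WelsVP.def. */
  FARPROC create  = GetProcAddress(mod, "CreateVpInterface");
  FARPROC destroy = GetProcAddress(mod, "DestroyVpInterface");
  printf("CreateVpInterface=%p, DestroyVpInterface=%p\n",
         (void *)create, (void *)destroy);
  /* Cast to the real prototypes (declared in the WelsVP headers) before calling. */
  FreeLibrary(mod);
  return 0;
}
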
--- a/processing/src/common/WelsVP.rc
+++ b/processing/src/common/WelsVP.rc
@@ -27,18 +27,18 @@
// TEXTINCLUDE
//
-1 TEXTINCLUDE
+1 TEXTINCLUDE
BEGIN
"resource.h\0"
END
-2 TEXTINCLUDE
+2 TEXTINCLUDE
BEGIN
"#include ""afxres.h""\r\n"
"\0"
END
-3 TEXTINCLUDE
+3 TEXTINCLUDE
BEGIN
"\r\n"
"\0"
--- a/testbin/AutoBuild_Windows_VS2008.bat
+++ b/testbin/AutoBuild_Windows_VS2008.bat
@@ -23,7 +23,7 @@
rem call VP build
echo "Welsvp Building....."
cd %VPProjectDir%
-rem vcclean
+rem vcclean
%VCBUILDEXE% WelsVP_2008.vcproj
@@ -33,7 +33,7 @@
cd %CurDir%
cd %EncoderProjectDir%
-rem vcclean
+rem vcclean
%VCBUILDEXE% WelsEncCore.vcproj
%VCBUILDEXE% WelsEncPlus.vcproj
%VCBUILDEXE% encConsole.vcproj
@@ -44,7 +44,7 @@
cd %CurDir%
cd %DecoderProjectDir%
-rem vcclean
+rem vcclean
%VCBUILDEXE% WelsDecCore.vcproj
%VCBUILDEXE% WelsDecPlus.vcproj
%VCBUILDEXE% decConsole.vcproj
--- a/testbin/AutoBuild_Windows_VS2010.bat
+++ b/testbin/AutoBuild_Windows_VS2010.bat
@@ -36,7 +36,7 @@
cd %CurDir%
cd %EncoderProjectDir%
echo current directory is %EncoderProjectDir%
-rem vcclean
+rem vcclean
echo %VCMSBUILDEXE_RELEASE% WelsEncoder_2010.sln
%VCMSBUILDEXE_RELEASE% WelsEncoder_2010.sln
@@ -49,7 +49,7 @@
cd %CurDir%
cd %DecoderProjectDir%
echo current directory is %DecoderProjectDir%
-rem vcclean
+rem vcclean
echo %VCMSBUILDEXE_RELEASE% WelsDecoder_2010.sln
--- a/testbin/AutoBuild_Windows_VS2012.bat
+++ b/testbin/AutoBuild_Windows_VS2012.bat
@@ -36,7 +36,7 @@
cd %CurDir%
cd %EncoderProjectDir%
echo current directory is %EncoderProjectDir%
-rem vcclean
+rem vcclean
echo %VCMSBUILDEXE_RELEASE% WelsEncoder_2012.sln
%VCMSBUILDEXE_RELEASE% WelsEncoder_2012.sln
@@ -49,7 +49,7 @@
cd %CurDir%
cd %DecoderProjectDir%
echo current directory is %DecoderProjectDir%
-rem vcclean
+rem vcclean
echo %VCMSBUILDEXE_RELEASE% WelsDecoder_2012.sln
--- a/testbin/welsenc.cfg
+++ b/testbin/welsenc.cfg
@@ -12,19 +12,19 @@
EnableFrameCropping 1 # enable frame cropping flag
#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
+LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
# 2: on except for slice boundaries,
                                        # 3: two stage. slice boundaries on in second stage
- # 4: Luma on but Chroma off (w.r.t. idc=0)
+ # 4: Luma on but Chroma off (w.r.t. idc=0)
# 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
                                        # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
LoopFilterAlphaC0Offset 0 # AlphaOffset(-6..+6): valid range
LoopFilterBetaOffset 0 # BetaOffset (-6..+6): valid range
-InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
+InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
# 2: on except for slice boundaries,
                                        # 3: two stage. slice boundaries on in second stage
- # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
+ # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
# 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
                                        # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
@@ -51,7 +51,7 @@
#============================== LONG TERM REFERENCE CONTROL ==============================
EnableLongTermReference 0 # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod 30 # Long Term Reference Marking Period
+LtrMarkPeriod 30 # Long Term Reference Marking Period
#============================== LAYER DEFINITION ==============================
PrefixNALAddingCtrl 0 # Control flag of adding prefix unit (0: off, 1: on)
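
The idc comment blocks above enumerate seven deblocking modes. As a reading aid only — the enum and helper below use hypothetical names, not part of the encoder's API — the legal values map out like this:

/* Hypothetical names; values and meanings follow the cfg comments above. */
typedef enum {
  LF_ON                = 0, /* loop filter on */
  LF_OFF               = 1, /* loop filter off */
  LF_ON_EXCEPT_SLICE   = 2, /* on except for slice boundaries */
  LF_TWO_STAGE         = 3, /* two stage; slice boundaries on in second stage */
  LF_LUMA_ONLY         = 4, /* Luma on but Chroma off (w.r.t. idc=0) */
  LF_LUMA_EXCEPT_SLICE = 5, /* Luma on except slice boundaries, Chroma off (w.r.t. idc=2) */
  LF_LUMA_TWO_STAGE    = 6  /* Luma on in two stage, Chroma off (w.r.t. idc=3) */
} LoopFilterIdc;

static int IsValidLoopFilterIdc(int v) { return v >= 0 && v <= 6; }
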
--- a/testbin/welsenc_vd_1d.cfg
+++ b/testbin/welsenc_vd_1d.cfg
@@ -12,19 +12,19 @@
EnableFrameCropping 1 # enable frame cropping flag
#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
+LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
# 2: on except for slice boundaries,
                                        # 3: two stage. slice boundaries on in second stage
- # 4: Luma on but Chroma off (w.r.t. idc=0)
+ # 4: Luma on but Chroma off (w.r.t. idc=0)
# 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
                                        # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
LoopFilterAlphaC0Offset 0 # AlphaOffset(-6..+6): valid range
LoopFilterBetaOffset 0 # BetaOffset (-6..+6): valid range
-InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
+InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
# 2: on except for slice boundaries,
                                        # 3: two stage. slice boundaries on in second stage
- # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
+ # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
# 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
                                        # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
@@ -51,7 +51,7 @@
#============================== LONG TERM REFERENCE CONTROL ==============================
EnableLongTermReference 1 # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod 30 # Long Term Reference Marking Period
+LtrMarkPeriod 30 # Long Term Reference Marking Period
#============================== LAYER DEFINITION ==============================
PrefixNALAddingCtrl 0 # Control flag of adding prefix unit (0: off, 1: on)
--- a/testbin/welsenc_vd_rc.cfg
+++ b/testbin/welsenc_vd_rc.cfg
@@ -12,19 +12,19 @@
EnableFrameCropping 1 # enable frame cropping flag
#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
+LoopFilterDisableIDC 0 # Loop filter idc (0: on, 1: off,
# 2: on except for slice boundaries,
                                        # 3: two stage. slice boundaries on in second stage
- # 4: Luma on but Chroma off (w.r.t. idc=0)
+ # 4: Luma on but Chroma off (w.r.t. idc=0)
# 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
                                        # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
LoopFilterAlphaC0Offset 0 # AlphaOffset(-6..+6): valid range
LoopFilterBetaOffset 0 # BetaOffset (-6..+6): valid range
-InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
+InterLayerLoopFilterDisableIDC 0 # filter idc for inter-layer deblocking (0: on, 1: off,
# 2: on except for slice boundaries,
                                        # 3: two stage. slice boundaries on in second stage
- # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
+ # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
# 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
                                        # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
InterLayerLoopFilterAlphaC0Offset 0 # AlphaOffset for inter-layer deblocking
@@ -51,7 +51,7 @@
#============================== LONG TERM REFERENCE CONTROL ==============================
EnableLongTermReference 1 # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod 30 # Long Term Reference Marking Period
+LtrMarkPeriod 30 # Long Term Reference Marking Period
#============================== LAYER DEFINITION ==============================
PrefixNALAddingCtrl 0 # Control flag of adding prefix unit (0: off, 1: on)