ref: bfe9da47cfa2f51ebef78c59332b5ba2deba306d
parent: 98c1cd7ae022efe276123898af6b892eade0732c
author: grobe0ba <grobe0ba@tcp80.org>
date: Sat Apr 13 10:04:39 EDT 2024
use generated build files for boringssl as well; let's use some assembly where we've got it
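
Note: eureka.mk and the linux-*/ assembly added below are generated output; the header in eureka.mk says it comes from BoringSSL's generate_build_files.py. As a sketch only (not part of this commit), a helper target along these lines could regenerate them from inside third_party/boringssl/, assuming the upstream script still lives at src/util/generate_build_files.py and still offers the "eureka" output format:

.PHONY: regen-build-files
regen-build-files:
	# run from third_party/boringssl/ so the generated files land next to src/
	python3 src/util/generate_build_files.py eureka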
--- a/third_party/boringssl/Makefile
+++ b/third_party/boringssl/Makefile
@@ -1,278 +1,39 @@
.PHONY: all clean
-CRYPTO_srcs= err_data.c \
- src/crypto/asn1/a_bitstr.c \
- src/crypto/asn1/a_bool.c \
- src/crypto/asn1/a_d2i_fp.c \
- src/crypto/asn1/a_dup.c \
- src/crypto/asn1/a_enum.c \
- src/crypto/asn1/a_gentm.c \
- src/crypto/asn1/a_i2d_fp.c \
- src/crypto/asn1/a_int.c \
- src/crypto/asn1/a_mbstr.c \
- src/crypto/asn1/a_object.c \
- src/crypto/asn1/a_octet.c \
- src/crypto/asn1/a_print.c \
- src/crypto/asn1/a_strex.c \
- src/crypto/asn1/a_strnid.c \
- src/crypto/asn1/a_time.c \
- src/crypto/asn1/a_type.c \
- src/crypto/asn1/a_utctm.c \
- src/crypto/asn1/a_utf8.c \
- src/crypto/asn1/asn1_lib.c \
- src/crypto/asn1/asn1_par.c \
- src/crypto/asn1/asn_pack.c \
- src/crypto/asn1/f_int.c \
- src/crypto/asn1/f_string.c \
- src/crypto/asn1/tasn_dec.c \
- src/crypto/asn1/tasn_enc.c \
- src/crypto/asn1/tasn_fre.c \
- src/crypto/asn1/tasn_new.c \
- src/crypto/asn1/tasn_typ.c \
- src/crypto/asn1/tasn_utl.c \
- src/crypto/asn1/time_support.c \
- src/crypto/base64/base64.c \
- src/crypto/bio/bio.c \
- src/crypto/bio/bio_mem.c \
- src/crypto/bio/connect.c \
- src/crypto/bio/fd.c \
- src/crypto/bio/file.c \
- src/crypto/bio/hexdump.c \
- src/crypto/bio/pair.c \
- src/crypto/bio/printf.c \
- src/crypto/bio/socket.c \
- src/crypto/bio/socket_helper.c \
- src/crypto/blake2/blake2.c \
- src/crypto/bn_extra/bn_asn1.c \
- src/crypto/bn_extra/convert.c \
- src/crypto/buf/buf.c \
- src/crypto/bytestring/asn1_compat.c \
- src/crypto/bytestring/ber.c \
- src/crypto/bytestring/cbb.c \
- src/crypto/bytestring/cbs.c \
- src/crypto/bytestring/unicode.c \
- src/crypto/chacha/chacha.c \
- src/crypto/cipher_extra/cipher_extra.c \
- src/crypto/cipher_extra/derive_key.c \
- src/crypto/cipher_extra/e_aesccm.c \
- src/crypto/cipher_extra/e_aesctrhmac.c \
- src/crypto/cipher_extra/e_aesgcmsiv.c \
- src/crypto/cipher_extra/e_chacha20poly1305.c \
- src/crypto/cipher_extra/e_null.c \
- src/crypto/cipher_extra/e_rc2.c \
- src/crypto/cipher_extra/e_rc4.c \
- src/crypto/cipher_extra/e_tls.c \
- src/crypto/cipher_extra/tls_cbc.c \
- src/crypto/cmac/cmac.c \
- src/crypto/conf/conf.c \
- src/crypto/cpu-aarch64-fuchsia.c \
- src/crypto/cpu-aarch64-linux.c \
- src/crypto/cpu-aarch64-win.c \
- src/crypto/cpu-arm-linux.c \
- src/crypto/cpu-arm.c \
- src/crypto/cpu-intel.c \
- src/crypto/cpu-ppc64le.c \
- src/crypto/crypto.c \
- src/crypto/curve25519/curve25519.c \
- src/crypto/curve25519/spake25519.c \
- src/crypto/dh_extra/dh_asn1.c \
- src/crypto/dh_extra/params.c \
- src/crypto/digest_extra/digest_extra.c \
- src/crypto/dsa/dsa.c \
- src/crypto/dsa/dsa_asn1.c \
- src/crypto/ec_extra/ec_asn1.c \
- src/crypto/ec_extra/ec_derive.c \
- src/crypto/ec_extra/hash_to_curve.c \
- src/crypto/ecdh_extra/ecdh_extra.c \
- src/crypto/ecdsa_extra/ecdsa_asn1.c \
- src/crypto/engine/engine.c \
- src/crypto/err/err.c \
- src/crypto/evp/digestsign.c \
- src/crypto/evp/evp.c \
- src/crypto/evp/evp_asn1.c \
- src/crypto/evp/evp_ctx.c \
- src/crypto/evp/p_dsa_asn1.c \
- src/crypto/evp/p_ec.c \
- src/crypto/evp/p_ec_asn1.c \
- src/crypto/evp/p_ed25519.c \
- src/crypto/evp/p_ed25519_asn1.c \
- src/crypto/evp/p_rsa.c \
- src/crypto/evp/p_rsa_asn1.c \
- src/crypto/evp/p_x25519.c \
- src/crypto/evp/p_x25519_asn1.c \
- src/crypto/evp/pbkdf.c \
- src/crypto/evp/print.c \
- src/crypto/evp/scrypt.c \
- src/crypto/evp/sign.c \
- src/crypto/ex_data.c \
- src/crypto/fipsmodule/bcm.c \
- src/crypto/fipsmodule/fips_shared_support.c \
- src/crypto/hkdf/hkdf.c \
- src/crypto/hpke/hpke.c \
- src/crypto/hrss/hrss.c \
- src/crypto/lhash/lhash.c \
- src/crypto/mem.c \
- src/crypto/obj/obj.c \
- src/crypto/obj/obj_xref.c \
- src/crypto/pem/pem_all.c \
- src/crypto/pem/pem_info.c \
- src/crypto/pem/pem_lib.c \
- src/crypto/pem/pem_oth.c \
- src/crypto/pem/pem_pk8.c \
- src/crypto/pem/pem_pkey.c \
- src/crypto/pem/pem_x509.c \
- src/crypto/pem/pem_xaux.c \
- src/crypto/pkcs7/pkcs7.c \
- src/crypto/pkcs7/pkcs7_x509.c \
- src/crypto/pkcs8/p5_pbev2.c \
- src/crypto/pkcs8/pkcs8.c \
- src/crypto/pkcs8/pkcs8_x509.c \
- src/crypto/poly1305/poly1305.c \
- src/crypto/poly1305/poly1305_arm.c \
- src/crypto/poly1305/poly1305_vec.c \
- src/crypto/pool/pool.c \
- src/crypto/rand_extra/deterministic.c \
- src/crypto/rand_extra/forkunsafe.c \
- src/crypto/rand_extra/fuchsia.c \
- src/crypto/rand_extra/passive.c \
- src/crypto/rand_extra/rand_extra.c \
- src/crypto/rand_extra/windows.c \
- src/crypto/rc4/rc4.c \
- src/crypto/refcount_c11.c \
- src/crypto/refcount_lock.c \
- src/crypto/rsa_extra/rsa_asn1.c \
- src/crypto/rsa_extra/rsa_print.c \
- src/crypto/siphash/siphash.c \
- src/crypto/stack/stack.c \
- src/crypto/thread.c \
- src/crypto/thread_none.c \
- src/crypto/thread_pthread.c \
- src/crypto/thread_win.c \
- src/crypto/trust_token/pmbtoken.c \
- src/crypto/trust_token/trust_token.c \
- src/crypto/trust_token/voprf.c \
- src/crypto/x509/a_digest.c \
- src/crypto/x509/a_sign.c \
- src/crypto/x509/a_verify.c \
- src/crypto/x509/algorithm.c \
- src/crypto/x509/asn1_gen.c \
- src/crypto/x509/by_dir.c \
- src/crypto/x509/by_file.c \
- src/crypto/x509/i2d_pr.c \
- src/crypto/x509/name_print.c \
- src/crypto/x509/rsa_pss.c \
- src/crypto/x509/t_crl.c \
- src/crypto/x509/t_req.c \
- src/crypto/x509/t_x509.c \
- src/crypto/x509/t_x509a.c \
- src/crypto/x509/x509.c \
- src/crypto/x509/x509_att.c \
- src/crypto/x509/x509_cmp.c \
- src/crypto/x509/x509_d2.c \
- src/crypto/x509/x509_def.c \
- src/crypto/x509/x509_ext.c \
- src/crypto/x509/x509_lu.c \
- src/crypto/x509/x509_obj.c \
- src/crypto/x509/x509_req.c \
- src/crypto/x509/x509_set.c \
- src/crypto/x509/x509_trs.c \
- src/crypto/x509/x509_txt.c \
- src/crypto/x509/x509_v3.c \
- src/crypto/x509/x509_vfy.c \
- src/crypto/x509/x509_vpm.c \
- src/crypto/x509/x509cset.c \
- src/crypto/x509/x509name.c \
- src/crypto/x509/x509rset.c \
- src/crypto/x509/x509spki.c \
- src/crypto/x509/x_algor.c \
- src/crypto/x509/x_all.c \
- src/crypto/x509/x_attrib.c \
- src/crypto/x509/x_crl.c \
- src/crypto/x509/x_exten.c \
- src/crypto/x509/x_info.c \
- src/crypto/x509/x_name.c \
- src/crypto/x509/x_pkey.c \
- src/crypto/x509/x_pubkey.c \
- src/crypto/x509/x_req.c \
- src/crypto/x509/x_sig.c \
- src/crypto/x509/x_spki.c \
- src/crypto/x509/x_val.c \
- src/crypto/x509/x_x509.c \
- src/crypto/x509/x_x509a.c \
- src/crypto/x509v3/pcy_cache.c \
- src/crypto/x509v3/pcy_data.c \
- src/crypto/x509v3/pcy_lib.c \
- src/crypto/x509v3/pcy_map.c \
- src/crypto/x509v3/pcy_node.c \
- src/crypto/x509v3/pcy_tree.c \
- src/crypto/x509v3/v3_akey.c \
- src/crypto/x509v3/v3_akeya.c \
- src/crypto/x509v3/v3_alt.c \
- src/crypto/x509v3/v3_bcons.c \
- src/crypto/x509v3/v3_bitst.c \
- src/crypto/x509v3/v3_conf.c \
- src/crypto/x509v3/v3_cpols.c \
- src/crypto/x509v3/v3_crld.c \
- src/crypto/x509v3/v3_enum.c \
- src/crypto/x509v3/v3_extku.c \
- src/crypto/x509v3/v3_genn.c \
- src/crypto/x509v3/v3_ia5.c \
- src/crypto/x509v3/v3_info.c \
- src/crypto/x509v3/v3_int.c \
- src/crypto/x509v3/v3_lib.c \
- src/crypto/x509v3/v3_ncons.c \
- src/crypto/x509v3/v3_ocsp.c \
- src/crypto/x509v3/v3_pci.c \
- src/crypto/x509v3/v3_pcia.c \
- src/crypto/x509v3/v3_pcons.c \
- src/crypto/x509v3/v3_pmaps.c \
- src/crypto/x509v3/v3_prn.c \
- src/crypto/x509v3/v3_purp.c \
- src/crypto/x509v3/v3_skey.c \
- src/crypto/x509v3/v3_utl.c
+include eureka.mk
-SSL_srcs= src/ssl/bio_ssl.cc \
- src/ssl/d1_both.cc \
- src/ssl/d1_lib.cc \
- src/ssl/d1_pkt.cc \
- src/ssl/d1_srtp.cc \
- src/ssl/dtls_method.cc \
- src/ssl/dtls_record.cc \
- src/ssl/encrypted_client_hello.cc \
- src/ssl/extensions.cc \
- src/ssl/handoff.cc \
- src/ssl/handshake.cc \
- src/ssl/handshake_client.cc \
- src/ssl/handshake_server.cc \
- src/ssl/s3_both.cc \
- src/ssl/s3_lib.cc \
- src/ssl/s3_pkt.cc \
- src/ssl/ssl_aead_ctx.cc \
- src/ssl/ssl_asn1.cc \
- src/ssl/ssl_buffer.cc \
- src/ssl/ssl_cert.cc \
- src/ssl/ssl_cipher.cc \
- src/ssl/ssl_file.cc \
- src/ssl/ssl_key_share.cc \
- src/ssl/ssl_lib.cc \
- src/ssl/ssl_privkey.cc \
- src/ssl/ssl_session.cc \
- src/ssl/ssl_stat.cc \
- src/ssl/ssl_transcript.cc \
- src/ssl/ssl_versions.cc \
- src/ssl/ssl_x509.cc \
- src/ssl/t1_enc.cc \
- src/ssl/tls13_both.cc \
- src/ssl/tls13_client.cc \
- src/ssl/tls13_enc.cc \
- src/ssl/tls13_server.cc \
- src/ssl/tls_method.cc \
- src/ssl/tls_record.cc \
+CFLAGS += -Isrc/include
-CRYPTO_objs= $(CRYPTO_srcs:.c=.o)
-SSL_objs= $(SSL_srcs:.cc=.o)
+CRYPTO_objs= $(crypto_sources:.c=.o)
+SSL_objs= $(ssl_sources:.cc=.o)
+MACH := $(shell $(CC) -dumpmachine 2>/dev/null)
+USE_ASM := false
+
+ifneq (,$(findstring linux,$(MACH)))
+ ifneq (,$(findstring x86_64,$(MACH)))
+ CRYPTO_objs += $(linux_x86_64_sources:.S=.o)
+ USE_ASM := true
+ endif
+ ifneq (,$(findstring i686,$(MACH)))
+ CRYPTO_objs += $(linux_x86_sources:.S=.o)
+ USE_ASM := true
+ endif
+ ifneq (,$(findstring aarch64,$(MACH)))
+ CRYPTO_objs += $(linux_aarch64_sources:.S=.o)
+ USE_ASM := true
+ endif
+ # common toolchains report this target as powerpc64le-*; match both spellings
+ ifneq (,$(findstring powerpc64le,$(MACH))$(findstring ppc64le,$(MACH)))
+ CRYPTO_objs += $(linux_ppc64le_sources:.S=.o)
+ USE_ASM := true
+ endif
+endif
+
+ifeq (,$(findstring true,$(USE_ASM)))
+ CFLAGS += -DOPENSSL_NO_ASM
+endif
+
all: libssl.a libcrypto.a
libcrypto.a: $(CRYPTO_objs)
@@ -284,10 +45,13 @@
.SUFFIXES: .c .o
%.o: %.c
- $(CC) -DOPENSSL_NO_ASM -Isrc/include -c -o $@ $<
+ $(CC) $(CFLAGS) -c -o $@ $<
%.o: %.cc
- $(CXX) -DOPENSSL_NO_ASM -Isrc/include -c -o $@ $<
+ $(CXX) $(CFLAGS) -c -o $@ $<
+
+%.o: %.S
+ $(CC) $(CFLAGS) -c -o $@ $<
clean:
rm -f $(CRYPTO_objs)
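
The assembly selection in the Makefile above keys off the target triplet reported by $(CC) -dumpmachine. As a hypothetical sanity check (not part of this commit), a target like the following could be added to print what the detection computed, for example when building with a cross toolchain:

.PHONY: print-asm-config
print-asm-config:
	@echo "target triplet:   $(MACH)"
	@echo "assembly enabled: $(USE_ASM)"
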
--- /dev/null
+++ b/third_party/boringssl/eureka.mk
@@ -1,0 +1,375 @@
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is created by generate_build_files.py. Do not edit manually.
+
+crypto_sources := \
+ err_data.c\
+ src/crypto/asn1/a_bitstr.c\
+ src/crypto/asn1/a_bool.c\
+ src/crypto/asn1/a_d2i_fp.c\
+ src/crypto/asn1/a_dup.c\
+ src/crypto/asn1/a_enum.c\
+ src/crypto/asn1/a_gentm.c\
+ src/crypto/asn1/a_i2d_fp.c\
+ src/crypto/asn1/a_int.c\
+ src/crypto/asn1/a_mbstr.c\
+ src/crypto/asn1/a_object.c\
+ src/crypto/asn1/a_octet.c\
+ src/crypto/asn1/a_print.c\
+ src/crypto/asn1/a_strex.c\
+ src/crypto/asn1/a_strnid.c\
+ src/crypto/asn1/a_time.c\
+ src/crypto/asn1/a_type.c\
+ src/crypto/asn1/a_utctm.c\
+ src/crypto/asn1/a_utf8.c\
+ src/crypto/asn1/asn1_lib.c\
+ src/crypto/asn1/asn1_par.c\
+ src/crypto/asn1/asn_pack.c\
+ src/crypto/asn1/f_int.c\
+ src/crypto/asn1/f_string.c\
+ src/crypto/asn1/tasn_dec.c\
+ src/crypto/asn1/tasn_enc.c\
+ src/crypto/asn1/tasn_fre.c\
+ src/crypto/asn1/tasn_new.c\
+ src/crypto/asn1/tasn_typ.c\
+ src/crypto/asn1/tasn_utl.c\
+ src/crypto/asn1/time_support.c\
+ src/crypto/base64/base64.c\
+ src/crypto/bio/bio.c\
+ src/crypto/bio/bio_mem.c\
+ src/crypto/bio/connect.c\
+ src/crypto/bio/fd.c\
+ src/crypto/bio/file.c\
+ src/crypto/bio/hexdump.c\
+ src/crypto/bio/pair.c\
+ src/crypto/bio/printf.c\
+ src/crypto/bio/socket.c\
+ src/crypto/bio/socket_helper.c\
+ src/crypto/blake2/blake2.c\
+ src/crypto/bn_extra/bn_asn1.c\
+ src/crypto/bn_extra/convert.c\
+ src/crypto/buf/buf.c\
+ src/crypto/bytestring/asn1_compat.c\
+ src/crypto/bytestring/ber.c\
+ src/crypto/bytestring/cbb.c\
+ src/crypto/bytestring/cbs.c\
+ src/crypto/bytestring/unicode.c\
+ src/crypto/chacha/chacha.c\
+ src/crypto/cipher_extra/cipher_extra.c\
+ src/crypto/cipher_extra/derive_key.c\
+ src/crypto/cipher_extra/e_aesccm.c\
+ src/crypto/cipher_extra/e_aesctrhmac.c\
+ src/crypto/cipher_extra/e_aesgcmsiv.c\
+ src/crypto/cipher_extra/e_chacha20poly1305.c\
+ src/crypto/cipher_extra/e_null.c\
+ src/crypto/cipher_extra/e_rc2.c\
+ src/crypto/cipher_extra/e_rc4.c\
+ src/crypto/cipher_extra/e_tls.c\
+ src/crypto/cipher_extra/tls_cbc.c\
+ src/crypto/cmac/cmac.c\
+ src/crypto/conf/conf.c\
+ src/crypto/cpu-aarch64-fuchsia.c\
+ src/crypto/cpu-aarch64-linux.c\
+ src/crypto/cpu-aarch64-win.c\
+ src/crypto/cpu-arm-linux.c\
+ src/crypto/cpu-arm.c\
+ src/crypto/cpu-intel.c\
+ src/crypto/cpu-ppc64le.c\
+ src/crypto/crypto.c\
+ src/crypto/curve25519/curve25519.c\
+ src/crypto/curve25519/spake25519.c\
+ src/crypto/dh_extra/dh_asn1.c\
+ src/crypto/dh_extra/params.c\
+ src/crypto/digest_extra/digest_extra.c\
+ src/crypto/dsa/dsa.c\
+ src/crypto/dsa/dsa_asn1.c\
+ src/crypto/ec_extra/ec_asn1.c\
+ src/crypto/ec_extra/ec_derive.c\
+ src/crypto/ec_extra/hash_to_curve.c\
+ src/crypto/ecdh_extra/ecdh_extra.c\
+ src/crypto/ecdsa_extra/ecdsa_asn1.c\
+ src/crypto/engine/engine.c\
+ src/crypto/err/err.c\
+ src/crypto/evp/digestsign.c\
+ src/crypto/evp/evp.c\
+ src/crypto/evp/evp_asn1.c\
+ src/crypto/evp/evp_ctx.c\
+ src/crypto/evp/p_dsa_asn1.c\
+ src/crypto/evp/p_ec.c\
+ src/crypto/evp/p_ec_asn1.c\
+ src/crypto/evp/p_ed25519.c\
+ src/crypto/evp/p_ed25519_asn1.c\
+ src/crypto/evp/p_rsa.c\
+ src/crypto/evp/p_rsa_asn1.c\
+ src/crypto/evp/p_x25519.c\
+ src/crypto/evp/p_x25519_asn1.c\
+ src/crypto/evp/pbkdf.c\
+ src/crypto/evp/print.c\
+ src/crypto/evp/scrypt.c\
+ src/crypto/evp/sign.c\
+ src/crypto/ex_data.c\
+ src/crypto/fipsmodule/bcm.c\
+ src/crypto/fipsmodule/fips_shared_support.c\
+ src/crypto/hkdf/hkdf.c\
+ src/crypto/hpke/hpke.c\
+ src/crypto/hrss/hrss.c\
+ src/crypto/lhash/lhash.c\
+ src/crypto/mem.c\
+ src/crypto/obj/obj.c\
+ src/crypto/obj/obj_xref.c\
+ src/crypto/pem/pem_all.c\
+ src/crypto/pem/pem_info.c\
+ src/crypto/pem/pem_lib.c\
+ src/crypto/pem/pem_oth.c\
+ src/crypto/pem/pem_pk8.c\
+ src/crypto/pem/pem_pkey.c\
+ src/crypto/pem/pem_x509.c\
+ src/crypto/pem/pem_xaux.c\
+ src/crypto/pkcs7/pkcs7.c\
+ src/crypto/pkcs7/pkcs7_x509.c\
+ src/crypto/pkcs8/p5_pbev2.c\
+ src/crypto/pkcs8/pkcs8.c\
+ src/crypto/pkcs8/pkcs8_x509.c\
+ src/crypto/poly1305/poly1305.c\
+ src/crypto/poly1305/poly1305_arm.c\
+ src/crypto/poly1305/poly1305_vec.c\
+ src/crypto/pool/pool.c\
+ src/crypto/rand_extra/deterministic.c\
+ src/crypto/rand_extra/forkunsafe.c\
+ src/crypto/rand_extra/fuchsia.c\
+ src/crypto/rand_extra/passive.c\
+ src/crypto/rand_extra/rand_extra.c\
+ src/crypto/rand_extra/windows.c\
+ src/crypto/rc4/rc4.c\
+ src/crypto/refcount_c11.c\
+ src/crypto/refcount_lock.c\
+ src/crypto/rsa_extra/rsa_asn1.c\
+ src/crypto/rsa_extra/rsa_print.c\
+ src/crypto/siphash/siphash.c\
+ src/crypto/stack/stack.c\
+ src/crypto/thread.c\
+ src/crypto/thread_none.c\
+ src/crypto/thread_pthread.c\
+ src/crypto/thread_win.c\
+ src/crypto/trust_token/pmbtoken.c\
+ src/crypto/trust_token/trust_token.c\
+ src/crypto/trust_token/voprf.c\
+ src/crypto/x509/a_digest.c\
+ src/crypto/x509/a_sign.c\
+ src/crypto/x509/a_verify.c\
+ src/crypto/x509/algorithm.c\
+ src/crypto/x509/asn1_gen.c\
+ src/crypto/x509/by_dir.c\
+ src/crypto/x509/by_file.c\
+ src/crypto/x509/i2d_pr.c\
+ src/crypto/x509/name_print.c\
+ src/crypto/x509/rsa_pss.c\
+ src/crypto/x509/t_crl.c\
+ src/crypto/x509/t_req.c\
+ src/crypto/x509/t_x509.c\
+ src/crypto/x509/t_x509a.c\
+ src/crypto/x509/x509.c\
+ src/crypto/x509/x509_att.c\
+ src/crypto/x509/x509_cmp.c\
+ src/crypto/x509/x509_d2.c\
+ src/crypto/x509/x509_def.c\
+ src/crypto/x509/x509_ext.c\
+ src/crypto/x509/x509_lu.c\
+ src/crypto/x509/x509_obj.c\
+ src/crypto/x509/x509_req.c\
+ src/crypto/x509/x509_set.c\
+ src/crypto/x509/x509_trs.c\
+ src/crypto/x509/x509_txt.c\
+ src/crypto/x509/x509_v3.c\
+ src/crypto/x509/x509_vfy.c\
+ src/crypto/x509/x509_vpm.c\
+ src/crypto/x509/x509cset.c\
+ src/crypto/x509/x509name.c\
+ src/crypto/x509/x509rset.c\
+ src/crypto/x509/x509spki.c\
+ src/crypto/x509/x_algor.c\
+ src/crypto/x509/x_all.c\
+ src/crypto/x509/x_attrib.c\
+ src/crypto/x509/x_crl.c\
+ src/crypto/x509/x_exten.c\
+ src/crypto/x509/x_info.c\
+ src/crypto/x509/x_name.c\
+ src/crypto/x509/x_pkey.c\
+ src/crypto/x509/x_pubkey.c\
+ src/crypto/x509/x_req.c\
+ src/crypto/x509/x_sig.c\
+ src/crypto/x509/x_spki.c\
+ src/crypto/x509/x_val.c\
+ src/crypto/x509/x_x509.c\
+ src/crypto/x509/x_x509a.c\
+ src/crypto/x509v3/pcy_cache.c\
+ src/crypto/x509v3/pcy_data.c\
+ src/crypto/x509v3/pcy_lib.c\
+ src/crypto/x509v3/pcy_map.c\
+ src/crypto/x509v3/pcy_node.c\
+ src/crypto/x509v3/pcy_tree.c\
+ src/crypto/x509v3/v3_akey.c\
+ src/crypto/x509v3/v3_akeya.c\
+ src/crypto/x509v3/v3_alt.c\
+ src/crypto/x509v3/v3_bcons.c\
+ src/crypto/x509v3/v3_bitst.c\
+ src/crypto/x509v3/v3_conf.c\
+ src/crypto/x509v3/v3_cpols.c\
+ src/crypto/x509v3/v3_crld.c\
+ src/crypto/x509v3/v3_enum.c\
+ src/crypto/x509v3/v3_extku.c\
+ src/crypto/x509v3/v3_genn.c\
+ src/crypto/x509v3/v3_ia5.c\
+ src/crypto/x509v3/v3_info.c\
+ src/crypto/x509v3/v3_int.c\
+ src/crypto/x509v3/v3_lib.c\
+ src/crypto/x509v3/v3_ncons.c\
+ src/crypto/x509v3/v3_ocsp.c\
+ src/crypto/x509v3/v3_pci.c\
+ src/crypto/x509v3/v3_pcia.c\
+ src/crypto/x509v3/v3_pcons.c\
+ src/crypto/x509v3/v3_pmaps.c\
+ src/crypto/x509v3/v3_prn.c\
+ src/crypto/x509v3/v3_purp.c\
+ src/crypto/x509v3/v3_skey.c\
+ src/crypto/x509v3/v3_utl.c\
+
+ssl_sources := \
+ src/ssl/bio_ssl.cc\
+ src/ssl/d1_both.cc\
+ src/ssl/d1_lib.cc\
+ src/ssl/d1_pkt.cc\
+ src/ssl/d1_srtp.cc\
+ src/ssl/dtls_method.cc\
+ src/ssl/dtls_record.cc\
+ src/ssl/encrypted_client_hello.cc\
+ src/ssl/extensions.cc\
+ src/ssl/handoff.cc\
+ src/ssl/handshake.cc\
+ src/ssl/handshake_client.cc\
+ src/ssl/handshake_server.cc\
+ src/ssl/s3_both.cc\
+ src/ssl/s3_lib.cc\
+ src/ssl/s3_pkt.cc\
+ src/ssl/ssl_aead_ctx.cc\
+ src/ssl/ssl_asn1.cc\
+ src/ssl/ssl_buffer.cc\
+ src/ssl/ssl_cert.cc\
+ src/ssl/ssl_cipher.cc\
+ src/ssl/ssl_file.cc\
+ src/ssl/ssl_key_share.cc\
+ src/ssl/ssl_lib.cc\
+ src/ssl/ssl_privkey.cc\
+ src/ssl/ssl_session.cc\
+ src/ssl/ssl_stat.cc\
+ src/ssl/ssl_transcript.cc\
+ src/ssl/ssl_versions.cc\
+ src/ssl/ssl_x509.cc\
+ src/ssl/t1_enc.cc\
+ src/ssl/tls13_both.cc\
+ src/ssl/tls13_client.cc\
+ src/ssl/tls13_enc.cc\
+ src/ssl/tls13_server.cc\
+ src/ssl/tls_method.cc\
+ src/ssl/tls_record.cc\
+
+tool_sources := \
+ src/tool/args.cc\
+ src/tool/ciphers.cc\
+ src/tool/client.cc\
+ src/tool/const.cc\
+ src/tool/digest.cc\
+ src/tool/fd.cc\
+ src/tool/file.cc\
+ src/tool/generate_ech.cc\
+ src/tool/generate_ed25519.cc\
+ src/tool/genrsa.cc\
+ src/tool/pkcs12.cc\
+ src/tool/rand.cc\
+ src/tool/server.cc\
+ src/tool/sign.cc\
+ src/tool/speed.cc\
+ src/tool/tool.cc\
+ src/tool/transport_common.cc\
+
+linux_aarch64_sources := \
+ linux-aarch64/crypto/chacha/chacha-armv8.S\
+ linux-aarch64/crypto/fipsmodule/aesv8-armx64.S\
+ linux-aarch64/crypto/fipsmodule/armv8-mont.S\
+ linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S\
+ linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S\
+ linux-aarch64/crypto/fipsmodule/sha1-armv8.S\
+ linux-aarch64/crypto/fipsmodule/sha256-armv8.S\
+ linux-aarch64/crypto/fipsmodule/sha512-armv8.S\
+ linux-aarch64/crypto/fipsmodule/vpaes-armv8.S\
+ linux-aarch64/crypto/test/trampoline-armv8.S\
+
+linux_arm_sources := \
+ linux-arm/crypto/chacha/chacha-armv4.S\
+ linux-arm/crypto/fipsmodule/aesv8-armx32.S\
+ linux-arm/crypto/fipsmodule/armv4-mont.S\
+ linux-arm/crypto/fipsmodule/bsaes-armv7.S\
+ linux-arm/crypto/fipsmodule/ghash-armv4.S\
+ linux-arm/crypto/fipsmodule/ghashv8-armx32.S\
+ linux-arm/crypto/fipsmodule/sha1-armv4-large.S\
+ linux-arm/crypto/fipsmodule/sha256-armv4.S\
+ linux-arm/crypto/fipsmodule/sha512-armv4.S\
+ linux-arm/crypto/fipsmodule/vpaes-armv7.S\
+ linux-arm/crypto/test/trampoline-armv4.S\
+ src/crypto/curve25519/asm/x25519-asm-arm.S\
+ src/crypto/poly1305/poly1305_arm_asm.S\
+
+linux_ppc64le_sources := \
+ linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S\
+ linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S\
+ linux-ppc64le/crypto/test/trampoline-ppc.S\
+
+linux_x86_sources := \
+ linux-x86/crypto/chacha/chacha-x86.S\
+ linux-x86/crypto/fipsmodule/aesni-x86.S\
+ linux-x86/crypto/fipsmodule/bn-586.S\
+ linux-x86/crypto/fipsmodule/co-586.S\
+ linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S\
+ linux-x86/crypto/fipsmodule/ghash-x86.S\
+ linux-x86/crypto/fipsmodule/md5-586.S\
+ linux-x86/crypto/fipsmodule/sha1-586.S\
+ linux-x86/crypto/fipsmodule/sha256-586.S\
+ linux-x86/crypto/fipsmodule/sha512-586.S\
+ linux-x86/crypto/fipsmodule/vpaes-x86.S\
+ linux-x86/crypto/fipsmodule/x86-mont.S\
+ linux-x86/crypto/test/trampoline-x86.S\
+
+linux_x86_64_sources := \
+ linux-x86_64/crypto/chacha/chacha-x86_64.S\
+ linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S\
+ linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S\
+ linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S\
+ linux-x86_64/crypto/fipsmodule/aesni-x86_64.S\
+ linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S\
+ linux-x86_64/crypto/fipsmodule/ghash-x86_64.S\
+ linux-x86_64/crypto/fipsmodule/md5-x86_64.S\
+ linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S\
+ linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S\
+ linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S\
+ linux-x86_64/crypto/fipsmodule/rsaz-avx2.S\
+ linux-x86_64/crypto/fipsmodule/sha1-x86_64.S\
+ linux-x86_64/crypto/fipsmodule/sha256-x86_64.S\
+ linux-x86_64/crypto/fipsmodule/sha512-x86_64.S\
+ linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S\
+ linux-x86_64/crypto/fipsmodule/x86_64-mont.S\
+ linux-x86_64/crypto/fipsmodule/x86_64-mont5.S\
+ linux-x86_64/crypto/test/trampoline-x86_64.S\
+ src/crypto/hrss/asm/poly_rq_mul.S\
+
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S
@@ -1,0 +1,1995 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+
+.hidden OPENSSL_armcap_P
+
+.section .rodata
+
+.align 5
+.Lsigma:
+.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
+.Lone:
+.long 1,0,0,0
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+
+.text
+
+.globl ChaCha20_ctr32
+.hidden ChaCha20_ctr32
+.type ChaCha20_ctr32,%function
+.align 5
+ChaCha20_ctr32:
+ AARCH64_VALID_CALL_TARGET
+ cbz x2,.Labort
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+ adrp x5,:pg_hi21_nc:OPENSSL_armcap_P
+#else
+ adrp x5,OPENSSL_armcap_P
+#endif
+ cmp x2,#192
+ b.lo .Lshort
+ ldr w17,[x5,:lo12:OPENSSL_armcap_P]
+ tst w17,#ARMV7_NEON
+ b.ne ChaCha20_neon
+
+.Lshort:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,.Lsigma
+ add x5,x5,:lo12:.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ldp x28,x30,[x4] // load counter
+#ifdef __ARMEB__
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+
+.Loop_outer:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov w7,w23
+ lsr x8,x23,#32
+ mov w9,w24
+ lsr x10,x24,#32
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#64
+.Loop:
+ sub x4,x4,#1
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ ror w21,w21,#16
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#20
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ add w5,w5,w9
+ add w6,w6,w10
+ add w7,w7,w11
+ add w8,w8,w12
+ eor w17,w17,w5
+ eor w19,w19,w6
+ eor w20,w20,w7
+ eor w21,w21,w8
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ ror w21,w21,#24
+ add w13,w13,w17
+ add w14,w14,w19
+ add w15,w15,w20
+ add w16,w16,w21
+ eor w9,w9,w13
+ eor w10,w10,w14
+ eor w11,w11,w15
+ eor w12,w12,w16
+ ror w9,w9,#25
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#16
+ ror w17,w17,#16
+ ror w19,w19,#16
+ ror w20,w20,#16
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#20
+ ror w11,w11,#20
+ ror w12,w12,#20
+ ror w9,w9,#20
+ add w5,w5,w10
+ add w6,w6,w11
+ add w7,w7,w12
+ add w8,w8,w9
+ eor w21,w21,w5
+ eor w17,w17,w6
+ eor w19,w19,w7
+ eor w20,w20,w8
+ ror w21,w21,#24
+ ror w17,w17,#24
+ ror w19,w19,#24
+ ror w20,w20,#24
+ add w15,w15,w21
+ add w16,w16,w17
+ add w13,w13,w19
+ add w14,w14,w20
+ eor w10,w10,w15
+ eor w11,w11,w16
+ eor w12,w12,w13
+ eor w9,w9,w14
+ ror w10,w10,#25
+ ror w11,w11,#25
+ ror w12,w12,#25
+ ror w9,w9,#25
+ cbnz x4,.Loop
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ b.lo .Ltail
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+
+ b.hi .Loop_outer
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+.Labort:
+ ret
+
+.align 4
+.Ltail:
+ add x2,x2,#64
+.Less_than_64:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ stp x5,x7,[sp,#0]
+ stp x9,x11,[sp,#16]
+ stp x13,x15,[sp,#32]
+ stp x17,x20,[sp,#48]
+
+.Loop_tail:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ChaCha20_ctr32,.-ChaCha20_ctr32
+
+.type ChaCha20_neon,%function
+.align 5
+ChaCha20_neon:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,.Lsigma
+ add x5,x5,:lo12:.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ cmp x2,#512
+ b.hs .L512_or_more_neon
+
+ sub sp,sp,#64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __ARMEB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+.Loop_outer_neon:
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ mov v0.16b,v24.16b
+ mov w7,w23
+ lsr x8,x23,#32
+ mov v4.16b,v24.16b
+ mov w9,w24
+ lsr x10,x24,#32
+ mov v16.16b,v24.16b
+ mov w11,w25
+ mov v1.16b,v25.16b
+ lsr x12,x25,#32
+ mov v5.16b,v25.16b
+ mov w13,w26
+ mov v17.16b,v25.16b
+ lsr x14,x26,#32
+ mov v3.16b,v27.16b
+ mov w15,w27
+ mov v7.16b,v28.16b
+ lsr x16,x27,#32
+ mov v19.16b,v29.16b
+ mov w17,w28
+ mov v2.16b,v26.16b
+ lsr x19,x28,#32
+ mov v6.16b,v26.16b
+ mov w20,w30
+ mov v18.16b,v26.16b
+ lsr x21,x30,#32
+
+ mov x4,#10
+ subs x2,x2,#256
+.Loop_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w11
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w12
+ eor v7.16b,v7.16b,v4.16b
+ eor w17,w17,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w19,w19,w6
+ rev32 v3.8h,v3.8h
+ eor w20,w20,w7
+ rev32 v7.8h,v7.8h
+ eor w21,w21,w8
+ rev32 v19.8h,v19.8h
+ ror w17,w17,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#20
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#20
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#20
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#12
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#12
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#12
+ ror w9,w9,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w10,w10,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w11,w11,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w12,w12,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w9
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w10
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w11
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w12
+ ushr v7.4s,v21.4s,#24
+ eor w17,w17,w5
+ ushr v19.4s,v22.4s,#24
+ eor w19,w19,w6
+ sli v3.4s,v20.4s,#8
+ eor w20,w20,w7
+ sli v7.4s,v21.4s,#8
+ eor w21,w21,w8
+ sli v19.4s,v22.4s,#8
+ ror w17,w17,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w19,w19,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w20,w20,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w21,w21,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w13,w13,w17
+ eor v21.16b,v5.16b,v6.16b
+ add w14,w14,w19
+ eor v22.16b,v17.16b,v18.16b
+ add w15,w15,w20
+ ushr v1.4s,v20.4s,#25
+ add w16,w16,w21
+ ushr v5.4s,v21.4s,#25
+ eor w9,w9,w13
+ ushr v17.4s,v22.4s,#25
+ eor w10,w10,w14
+ sli v1.4s,v20.4s,#7
+ eor w11,w11,w15
+ sli v5.4s,v21.4s,#7
+ eor w12,w12,w16
+ sli v17.4s,v22.4s,#7
+ ror w9,w9,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w10,w10,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w10
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w11
+ add v16.4s,v16.4s,v17.4s
+ add w7,w7,w12
+ eor v3.16b,v3.16b,v0.16b
+ add w8,w8,w9
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w5
+ eor v19.16b,v19.16b,v16.16b
+ eor w17,w17,w6
+ rev32 v3.8h,v3.8h
+ eor w19,w19,w7
+ rev32 v7.8h,v7.8h
+ eor w20,w20,w8
+ rev32 v19.8h,v19.8h
+ ror w21,w21,#16
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#16
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#16
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#16
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#20
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#20
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#20
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#12
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#12
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#12
+ ror w10,w10,#20
+ add v0.4s,v0.4s,v1.4s
+ ror w11,w11,#20
+ add v4.4s,v4.4s,v5.4s
+ ror w12,w12,#20
+ add v16.4s,v16.4s,v17.4s
+ ror w9,w9,#20
+ eor v20.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v21.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v22.16b,v19.16b,v16.16b
+ add w7,w7,w12
+ ushr v3.4s,v20.4s,#24
+ add w8,w8,w9
+ ushr v7.4s,v21.4s,#24
+ eor w21,w21,w5
+ ushr v19.4s,v22.4s,#24
+ eor w17,w17,w6
+ sli v3.4s,v20.4s,#8
+ eor w19,w19,w7
+ sli v7.4s,v21.4s,#8
+ eor w20,w20,w8
+ sli v19.4s,v22.4s,#8
+ ror w21,w21,#24
+ add v2.4s,v2.4s,v3.4s
+ ror w17,w17,#24
+ add v6.4s,v6.4s,v7.4s
+ ror w19,w19,#24
+ add v18.4s,v18.4s,v19.4s
+ ror w20,w20,#24
+ eor v20.16b,v1.16b,v2.16b
+ add w15,w15,w21
+ eor v21.16b,v5.16b,v6.16b
+ add w16,w16,w17
+ eor v22.16b,v17.16b,v18.16b
+ add w13,w13,w19
+ ushr v1.4s,v20.4s,#25
+ add w14,w14,w20
+ ushr v5.4s,v21.4s,#25
+ eor w10,w10,w15
+ ushr v17.4s,v22.4s,#25
+ eor w11,w11,w16
+ sli v1.4s,v20.4s,#7
+ eor w12,w12,w13
+ sli v5.4s,v21.4s,#7
+ eor w9,w9,w14
+ sli v17.4s,v22.4s,#7
+ ror w10,w10,#25
+ ext v2.16b,v2.16b,v2.16b,#8
+ ror w11,w11,#25
+ ext v6.16b,v6.16b,v6.16b,#8
+ ror w12,w12,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ cbnz x4,.Loop_neon
+
+ add w5,w5,w22 // accumulate key block
+ add v0.4s,v0.4s,v24.4s
+ add x6,x6,x22,lsr#32
+ add v4.4s,v4.4s,v24.4s
+ add w7,w7,w23
+ add v16.4s,v16.4s,v24.4s
+ add x8,x8,x23,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w9,w9,w24
+ add v6.4s,v6.4s,v26.4s
+ add x10,x10,x24,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w11,w11,w25
+ add v3.4s,v3.4s,v27.4s
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add v7.4s,v7.4s,v28.4s
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add v19.4s,v19.4s,v29.4s
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add v1.4s,v1.4s,v25.4s
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add v5.4s,v5.4s,v25.4s
+ add x21,x21,x30,lsr#32
+ add v17.4s,v17.4s,v25.4s
+
+ b.lo .Ltail_neon
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v20.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v21.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v22.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v23.16b
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ add v27.4s,v27.4s,v31.4s // += 4
+ stp x13,x15,[x0,#32]
+ add v28.4s,v28.4s,v31.4s
+ stp x17,x20,[x0,#48]
+ add v29.4s,v29.4s,v31.4s
+ add x0,x0,#64
+
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ eor v16.16b,v16.16b,v0.16b
+ eor v17.16b,v17.16b,v1.16b
+ eor v18.16b,v18.16b,v2.16b
+ eor v19.16b,v19.16b,v3.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ b.hi .Loop_outer_neon
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+.Ltail_neon:
+ add x2,x2,#256
+ cmp x2,#64
+ b.lo .Less_than_64
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#4 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_128
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v0.16b,v0.16b,v20.16b
+ eor v1.16b,v1.16b,v21.16b
+ eor v2.16b,v2.16b,v22.16b
+ eor v3.16b,v3.16b,v23.16b
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+ cmp x2,#64
+ b.lo .Less_than_192
+
+ ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
+ eor v4.16b,v4.16b,v20.16b
+ eor v5.16b,v5.16b,v21.16b
+ eor v6.16b,v6.16b,v22.16b
+ eor v7.16b,v7.16b,v23.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+ b.eq .Ldone_neon
+ sub x2,x2,#64
+
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
+ b .Last_neon
+
+.Less_than_128:
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
+ b .Last_neon
+.Less_than_192:
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
+ b .Last_neon
+
+.align 4
+.Last_neon:
+ sub x0,x0,#1
+ add x1,x1,x2
+ add x0,x0,x2
+ add x4,sp,x2
+ neg x2,x2
+
+.Loop_tail_neon:
+ ldrb w10,[x1,x2]
+ ldrb w11,[x4,x2]
+ add x2,x2,#1
+ eor w10,w10,w11
+ strb w10,[x0,x2]
+ cbnz x2,.Loop_tail_neon
+
+ stp xzr,xzr,[sp,#0]
+ stp xzr,xzr,[sp,#16]
+ stp xzr,xzr,[sp,#32]
+ stp xzr,xzr,[sp,#48]
+
+.Ldone_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ChaCha20_neon,.-ChaCha20_neon
+.type ChaCha20_512_neon,%function
+.align 5
+ChaCha20_512_neon:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+ adrp x5,.Lsigma
+ add x5,x5,:lo12:.Lsigma
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+.L512_or_more_neon:
+ sub sp,sp,#128+64
+
+ ldp x22,x23,[x5] // load sigma
+ ld1 {v24.4s},[x5],#16
+ ldp x24,x25,[x3] // load key
+ ldp x26,x27,[x3,#16]
+ ld1 {v25.4s,v26.4s},[x3]
+ ldp x28,x30,[x4] // load counter
+ ld1 {v27.4s},[x4]
+ ld1 {v31.4s},[x5]
+#ifdef __ARMEB__
+ rev64 v24.4s,v24.4s
+ ror x24,x24,#32
+ ror x25,x25,#32
+ ror x26,x26,#32
+ ror x27,x27,#32
+ ror x28,x28,#32
+ ror x30,x30,#32
+#endif
+ add v27.4s,v27.4s,v31.4s // += 1
+ stp q24,q25,[sp,#0] // off-load key block, invariant part
+ add v27.4s,v27.4s,v31.4s // not typo
+ str q26,[sp,#32]
+ add v28.4s,v27.4s,v31.4s
+ add v29.4s,v28.4s,v31.4s
+ add v30.4s,v29.4s,v31.4s
+ shl v31.4s,v31.4s,#2 // 1 -> 4
+
+ stp d8,d9,[sp,#128+0] // meet ABI requirements
+ stp d10,d11,[sp,#128+16]
+ stp d12,d13,[sp,#128+32]
+ stp d14,d15,[sp,#128+48]
+
+ sub x2,x2,#512 // not typo
+
+.Loop_outer_512_neon:
+ mov v0.16b,v24.16b
+ mov v4.16b,v24.16b
+ mov v8.16b,v24.16b
+ mov v12.16b,v24.16b
+ mov v16.16b,v24.16b
+ mov v20.16b,v24.16b
+ mov v1.16b,v25.16b
+ mov w5,w22 // unpack key block
+ mov v5.16b,v25.16b
+ lsr x6,x22,#32
+ mov v9.16b,v25.16b
+ mov w7,w23
+ mov v13.16b,v25.16b
+ lsr x8,x23,#32
+ mov v17.16b,v25.16b
+ mov w9,w24
+ mov v21.16b,v25.16b
+ lsr x10,x24,#32
+ mov v3.16b,v27.16b
+ mov w11,w25
+ mov v7.16b,v28.16b
+ lsr x12,x25,#32
+ mov v11.16b,v29.16b
+ mov w13,w26
+ mov v15.16b,v30.16b
+ lsr x14,x26,#32
+ mov v2.16b,v26.16b
+ mov w15,w27
+ mov v6.16b,v26.16b
+ lsr x16,x27,#32
+ add v19.4s,v3.4s,v31.4s // +4
+ mov w17,w28
+ add v23.4s,v7.4s,v31.4s // +4
+ lsr x19,x28,#32
+ mov v10.16b,v26.16b
+ mov w20,w30
+ mov v14.16b,v26.16b
+ lsr x21,x30,#32
+ mov v18.16b,v26.16b
+ stp q27,q28,[sp,#48] // off-load key block, variable part
+ mov v22.16b,v26.16b
+ str q29,[sp,#80]
+
+ mov x4,#5
+ subs x2,x2,#512
+.Loop_upper_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_upper_neon
+
+ add w5,w5,w22 // accumulate key block
+ add x6,x6,x22,lsr#32
+ add w7,w7,w23
+ add x8,x8,x23,lsr#32
+ add w9,w9,w24
+ add x10,x10,x24,lsr#32
+ add w11,w11,w25
+ add x12,x12,x25,lsr#32
+ add w13,w13,w26
+ add x14,x14,x26,lsr#32
+ add w15,w15,w27
+ add x16,x16,x27,lsr#32
+ add w17,w17,w28
+ add x19,x19,x28,lsr#32
+ add w20,w20,w30
+ add x21,x21,x30,lsr#32
+
+ add x5,x5,x6,lsl#32 // pack
+ add x7,x7,x8,lsl#32
+ ldp x6,x8,[x1,#0] // load input
+ add x9,x9,x10,lsl#32
+ add x11,x11,x12,lsl#32
+ ldp x10,x12,[x1,#16]
+ add x13,x13,x14,lsl#32
+ add x15,x15,x16,lsl#32
+ ldp x14,x16,[x1,#32]
+ add x17,x17,x19,lsl#32
+ add x20,x20,x21,lsl#32
+ ldp x19,x21,[x1,#48]
+ add x1,x1,#64
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor x15,x15,x16
+ eor x17,x17,x19
+ eor x20,x20,x21
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#1 // increment counter
+ mov w5,w22 // unpack key block
+ lsr x6,x22,#32
+ stp x9,x11,[x0,#16]
+ mov w7,w23
+ lsr x8,x23,#32
+ stp x13,x15,[x0,#32]
+ mov w9,w24
+ lsr x10,x24,#32
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ mov w11,w25
+ lsr x12,x25,#32
+ mov w13,w26
+ lsr x14,x26,#32
+ mov w15,w27
+ lsr x16,x27,#32
+ mov w17,w28
+ lsr x19,x28,#32
+ mov w20,w30
+ lsr x21,x30,#32
+
+ mov x4,#5
+.Loop_lower_neon:
+ sub x4,x4,#1
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#12
+ ext v7.16b,v7.16b,v7.16b,#12
+ ext v11.16b,v11.16b,v11.16b,#12
+ ext v15.16b,v15.16b,v15.16b,#12
+ ext v19.16b,v19.16b,v19.16b,#12
+ ext v23.16b,v23.16b,v23.16b,#12
+ ext v1.16b,v1.16b,v1.16b,#4
+ ext v5.16b,v5.16b,v5.16b,#4
+ ext v9.16b,v9.16b,v9.16b,#4
+ ext v13.16b,v13.16b,v13.16b,#4
+ ext v17.16b,v17.16b,v17.16b,#4
+ ext v21.16b,v21.16b,v21.16b,#4
+ add v0.4s,v0.4s,v1.4s
+ add w5,w5,w9
+ add v4.4s,v4.4s,v5.4s
+ add w6,w6,w10
+ add v8.4s,v8.4s,v9.4s
+ add w7,w7,w11
+ add v12.4s,v12.4s,v13.4s
+ add w8,w8,w12
+ add v16.4s,v16.4s,v17.4s
+ eor w17,w17,w5
+ add v20.4s,v20.4s,v21.4s
+ eor w19,w19,w6
+ eor v3.16b,v3.16b,v0.16b
+ eor w20,w20,w7
+ eor v7.16b,v7.16b,v4.16b
+ eor w21,w21,w8
+ eor v11.16b,v11.16b,v8.16b
+ ror w17,w17,#16
+ eor v15.16b,v15.16b,v12.16b
+ ror w19,w19,#16
+ eor v19.16b,v19.16b,v16.16b
+ ror w20,w20,#16
+ eor v23.16b,v23.16b,v20.16b
+ ror w21,w21,#16
+ rev32 v3.8h,v3.8h
+ add w13,w13,w17
+ rev32 v7.8h,v7.8h
+ add w14,w14,w19
+ rev32 v11.8h,v11.8h
+ add w15,w15,w20
+ rev32 v15.8h,v15.8h
+ add w16,w16,w21
+ rev32 v19.8h,v19.8h
+ eor w9,w9,w13
+ rev32 v23.8h,v23.8h
+ eor w10,w10,w14
+ add v2.4s,v2.4s,v3.4s
+ eor w11,w11,w15
+ add v6.4s,v6.4s,v7.4s
+ eor w12,w12,w16
+ add v10.4s,v10.4s,v11.4s
+ ror w9,w9,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w10,w10,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w11,w11,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w12,w12,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w9
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w10
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w11
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w12
+ eor v28.16b,v17.16b,v18.16b
+ eor w17,w17,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w19,w19,w6
+ ushr v1.4s,v24.4s,#20
+ eor w20,w20,w7
+ ushr v5.4s,v25.4s,#20
+ eor w21,w21,w8
+ ushr v9.4s,v26.4s,#20
+ ror w17,w17,#24
+ ushr v13.4s,v27.4s,#20
+ ror w19,w19,#24
+ ushr v17.4s,v28.4s,#20
+ ror w20,w20,#24
+ ushr v21.4s,v29.4s,#20
+ ror w21,w21,#24
+ sli v1.4s,v24.4s,#12
+ add w13,w13,w17
+ sli v5.4s,v25.4s,#12
+ add w14,w14,w19
+ sli v9.4s,v26.4s,#12
+ add w15,w15,w20
+ sli v13.4s,v27.4s,#12
+ add w16,w16,w21
+ sli v17.4s,v28.4s,#12
+ eor w9,w9,w13
+ sli v21.4s,v29.4s,#12
+ eor w10,w10,w14
+ add v0.4s,v0.4s,v1.4s
+ eor w11,w11,w15
+ add v4.4s,v4.4s,v5.4s
+ eor w12,w12,w16
+ add v8.4s,v8.4s,v9.4s
+ ror w9,w9,#25
+ add v12.4s,v12.4s,v13.4s
+ ror w10,w10,#25
+ add v16.4s,v16.4s,v17.4s
+ ror w11,w11,#25
+ add v20.4s,v20.4s,v21.4s
+ ror w12,w12,#25
+ eor v24.16b,v3.16b,v0.16b
+ add w5,w5,w10
+ eor v25.16b,v7.16b,v4.16b
+ add w6,w6,w11
+ eor v26.16b,v11.16b,v8.16b
+ add w7,w7,w12
+ eor v27.16b,v15.16b,v12.16b
+ add w8,w8,w9
+ eor v28.16b,v19.16b,v16.16b
+ eor w21,w21,w5
+ eor v29.16b,v23.16b,v20.16b
+ eor w17,w17,w6
+ ushr v3.4s,v24.4s,#24
+ eor w19,w19,w7
+ ushr v7.4s,v25.4s,#24
+ eor w20,w20,w8
+ ushr v11.4s,v26.4s,#24
+ ror w21,w21,#16
+ ushr v15.4s,v27.4s,#24
+ ror w17,w17,#16
+ ushr v19.4s,v28.4s,#24
+ ror w19,w19,#16
+ ushr v23.4s,v29.4s,#24
+ ror w20,w20,#16
+ sli v3.4s,v24.4s,#8
+ add w15,w15,w21
+ sli v7.4s,v25.4s,#8
+ add w16,w16,w17
+ sli v11.4s,v26.4s,#8
+ add w13,w13,w19
+ sli v15.4s,v27.4s,#8
+ add w14,w14,w20
+ sli v19.4s,v28.4s,#8
+ eor w10,w10,w15
+ sli v23.4s,v29.4s,#8
+ eor w11,w11,w16
+ add v2.4s,v2.4s,v3.4s
+ eor w12,w12,w13
+ add v6.4s,v6.4s,v7.4s
+ eor w9,w9,w14
+ add v10.4s,v10.4s,v11.4s
+ ror w10,w10,#20
+ add v14.4s,v14.4s,v15.4s
+ ror w11,w11,#20
+ add v18.4s,v18.4s,v19.4s
+ ror w12,w12,#20
+ add v22.4s,v22.4s,v23.4s
+ ror w9,w9,#20
+ eor v24.16b,v1.16b,v2.16b
+ add w5,w5,w10
+ eor v25.16b,v5.16b,v6.16b
+ add w6,w6,w11
+ eor v26.16b,v9.16b,v10.16b
+ add w7,w7,w12
+ eor v27.16b,v13.16b,v14.16b
+ add w8,w8,w9
+ eor v28.16b,v17.16b,v18.16b
+ eor w21,w21,w5
+ eor v29.16b,v21.16b,v22.16b
+ eor w17,w17,w6
+ ushr v1.4s,v24.4s,#25
+ eor w19,w19,w7
+ ushr v5.4s,v25.4s,#25
+ eor w20,w20,w8
+ ushr v9.4s,v26.4s,#25
+ ror w21,w21,#24
+ ushr v13.4s,v27.4s,#25
+ ror w17,w17,#24
+ ushr v17.4s,v28.4s,#25
+ ror w19,w19,#24
+ ushr v21.4s,v29.4s,#25
+ ror w20,w20,#24
+ sli v1.4s,v24.4s,#7
+ add w15,w15,w21
+ sli v5.4s,v25.4s,#7
+ add w16,w16,w17
+ sli v9.4s,v26.4s,#7
+ add w13,w13,w19
+ sli v13.4s,v27.4s,#7
+ add w14,w14,w20
+ sli v17.4s,v28.4s,#7
+ eor w10,w10,w15
+ sli v21.4s,v29.4s,#7
+ eor w11,w11,w16
+ ext v2.16b,v2.16b,v2.16b,#8
+ eor w12,w12,w13
+ ext v6.16b,v6.16b,v6.16b,#8
+ eor w9,w9,w14
+ ext v10.16b,v10.16b,v10.16b,#8
+ ror w10,w10,#25
+ ext v14.16b,v14.16b,v14.16b,#8
+ ror w11,w11,#25
+ ext v18.16b,v18.16b,v18.16b,#8
+ ror w12,w12,#25
+ ext v22.16b,v22.16b,v22.16b,#8
+ ror w9,w9,#25
+ ext v3.16b,v3.16b,v3.16b,#4
+ ext v7.16b,v7.16b,v7.16b,#4
+ ext v11.16b,v11.16b,v11.16b,#4
+ ext v15.16b,v15.16b,v15.16b,#4
+ ext v19.16b,v19.16b,v19.16b,#4
+ ext v23.16b,v23.16b,v23.16b,#4
+ ext v1.16b,v1.16b,v1.16b,#12
+ ext v5.16b,v5.16b,v5.16b,#12
+ ext v9.16b,v9.16b,v9.16b,#12
+ ext v13.16b,v13.16b,v13.16b,#12
+ ext v17.16b,v17.16b,v17.16b,#12
+ ext v21.16b,v21.16b,v21.16b,#12
+ cbnz x4,.Loop_lower_neon
+
+ add w5,w5,w22 // accumulate key block
+ ldp q24,q25,[sp,#0]
+ add x6,x6,x22,lsr#32
+ ldp q26,q27,[sp,#32]
+ add w7,w7,w23
+ ldp q28,q29,[sp,#64]
+ add x8,x8,x23,lsr#32
+ add v0.4s,v0.4s,v24.4s
+ add w9,w9,w24
+ add v4.4s,v4.4s,v24.4s
+ add x10,x10,x24,lsr#32
+ add v8.4s,v8.4s,v24.4s
+ add w11,w11,w25
+ add v12.4s,v12.4s,v24.4s
+ add x12,x12,x25,lsr#32
+ add v16.4s,v16.4s,v24.4s
+ add w13,w13,w26
+ add v20.4s,v20.4s,v24.4s
+ add x14,x14,x26,lsr#32
+ add v2.4s,v2.4s,v26.4s
+ add w15,w15,w27
+ add v6.4s,v6.4s,v26.4s
+ add x16,x16,x27,lsr#32
+ add v10.4s,v10.4s,v26.4s
+ add w17,w17,w28
+ add v14.4s,v14.4s,v26.4s
+ add x19,x19,x28,lsr#32
+ add v18.4s,v18.4s,v26.4s
+ add w20,w20,w30
+ add v22.4s,v22.4s,v26.4s
+ add x21,x21,x30,lsr#32
+ add v19.4s,v19.4s,v31.4s // +4
+ add x5,x5,x6,lsl#32 // pack
+ add v23.4s,v23.4s,v31.4s // +4
+ add x7,x7,x8,lsl#32
+ add v3.4s,v3.4s,v27.4s
+ ldp x6,x8,[x1,#0] // load input
+ add v7.4s,v7.4s,v28.4s
+ add x9,x9,x10,lsl#32
+ add v11.4s,v11.4s,v29.4s
+ add x11,x11,x12,lsl#32
+ add v15.4s,v15.4s,v30.4s
+ ldp x10,x12,[x1,#16]
+ add v19.4s,v19.4s,v27.4s
+ add x13,x13,x14,lsl#32
+ add v23.4s,v23.4s,v28.4s
+ add x15,x15,x16,lsl#32
+ add v1.4s,v1.4s,v25.4s
+ ldp x14,x16,[x1,#32]
+ add v5.4s,v5.4s,v25.4s
+ add x17,x17,x19,lsl#32
+ add v9.4s,v9.4s,v25.4s
+ add x20,x20,x21,lsl#32
+ add v13.4s,v13.4s,v25.4s
+ ldp x19,x21,[x1,#48]
+ add v17.4s,v17.4s,v25.4s
+ add x1,x1,#64
+ add v21.4s,v21.4s,v25.4s
+
+#ifdef __ARMEB__
+ rev x5,x5
+ rev x7,x7
+ rev x9,x9
+ rev x11,x11
+ rev x13,x13
+ rev x15,x15
+ rev x17,x17
+ rev x20,x20
+#endif
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+ eor x5,x5,x6
+ eor x7,x7,x8
+ eor x9,x9,x10
+ eor x11,x11,x12
+ eor x13,x13,x14
+ eor v0.16b,v0.16b,v24.16b
+ eor x15,x15,x16
+ eor v1.16b,v1.16b,v25.16b
+ eor x17,x17,x19
+ eor v2.16b,v2.16b,v26.16b
+ eor x20,x20,x21
+ eor v3.16b,v3.16b,v27.16b
+ ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
+
+ stp x5,x7,[x0,#0] // store output
+ add x28,x28,#7 // increment counter
+ stp x9,x11,[x0,#16]
+ stp x13,x15,[x0,#32]
+ stp x17,x20,[x0,#48]
+ add x0,x0,#64
+ st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
+
+ ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
+ eor v4.16b,v4.16b,v24.16b
+ eor v5.16b,v5.16b,v25.16b
+ eor v6.16b,v6.16b,v26.16b
+ eor v7.16b,v7.16b,v27.16b
+ st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
+
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ eor v8.16b,v8.16b,v0.16b
+ ldp q24,q25,[sp,#0]
+ eor v9.16b,v9.16b,v1.16b
+ ldp q26,q27,[sp,#32]
+ eor v10.16b,v10.16b,v2.16b
+ eor v11.16b,v11.16b,v3.16b
+ st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
+
+ ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
+ eor v12.16b,v12.16b,v4.16b
+ eor v13.16b,v13.16b,v5.16b
+ eor v14.16b,v14.16b,v6.16b
+ eor v15.16b,v15.16b,v7.16b
+ st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
+
+ ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
+ eor v16.16b,v16.16b,v8.16b
+ eor v17.16b,v17.16b,v9.16b
+ eor v18.16b,v18.16b,v10.16b
+ eor v19.16b,v19.16b,v11.16b
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
+
+ shl v0.4s,v31.4s,#1 // 4 -> 8
+ eor v20.16b,v20.16b,v12.16b
+ eor v21.16b,v21.16b,v13.16b
+ eor v22.16b,v22.16b,v14.16b
+ eor v23.16b,v23.16b,v15.16b
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
+
+ add v27.4s,v27.4s,v0.4s // += 8
+ add v28.4s,v28.4s,v0.4s
+ add v29.4s,v29.4s,v0.4s
+ add v30.4s,v30.4s,v0.4s
+
+ b.hs .Loop_outer_512_neon
+
+ adds x2,x2,#512
+ ushr v0.4s,v31.4s,#2 // 4 -> 1
+
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements
+ ldp d10,d11,[sp,#128+16]
+ ldp d12,d13,[sp,#128+32]
+ ldp d14,d15,[sp,#128+48]
+
+ stp q24,q31,[sp,#0] // wipe off-load area
+ stp q24,q31,[sp,#32]
+ stp q24,q31,[sp,#64]
+
+ b.eq .Ldone_512_neon
+
+ cmp x2,#192
+ sub v27.4s,v27.4s,v0.4s // -= 1
+ sub v28.4s,v28.4s,v0.4s
+ sub v29.4s,v29.4s,v0.4s
+ add sp,sp,#128
+ b.hs .Loop_outer_neon
+
+ eor v25.16b,v25.16b,v25.16b
+ eor v26.16b,v26.16b,v26.16b
+ eor v27.16b,v27.16b,v27.16b
+ eor v28.16b,v28.16b,v28.16b
+ eor v29.16b,v29.16b,v29.16b
+ eor v30.16b,v30.16b,v30.16b
+ b .Loop_outer
+
+.Ldone_512_neon:
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#128+64
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size ChaCha20_512_neon,.-ChaCha20_512_neon
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S
@@ -1,0 +1,802 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch armv8-a+crypto
+.section .rodata
+.align 5
+.Lrcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl aes_hw_set_encrypt_key
+.hidden aes_hw_set_encrypt_key
+.type aes_hw_set_encrypt_key,%function
+.align 5
+aes_hw_set_encrypt_key:
+.Lenc_key:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ mov x3,#-1
+ cmp x0,#0
+ b.eq .Lenc_key_abort
+ cmp x2,#0
+ b.eq .Lenc_key_abort
+ mov x3,#-2
+ cmp w1,#128
+ b.lt .Lenc_key_abort
+ cmp w1,#256
+ b.gt .Lenc_key_abort
+ tst w1,#0x3f
+ b.ne .Lenc_key_abort
+
+ adrp x3,.Lrcon
+ add x3,x3,:lo12:.Lrcon
+ cmp w1,#192
+
+ eor v0.16b,v0.16b,v0.16b
+ ld1 {v3.16b},[x0],#16
+ mov w1,#8 // reuse w1
+ ld1 {v1.4s,v2.4s},[x3],#32
+
+ b.lt .Loop128
+ b.eq .L192
+ b .L256
+
+.align 4
+.Loop128:
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+ b.ne .Loop128
+
+ ld1 {v1.4s},[x3]
+
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+
+ tbl v6.16b,{v3.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v3.4s},[x2],#16
+ aese v6.16b,v0.16b
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v3.4s},[x2]
+ add x2,x2,#0x50
+
+ mov w12,#10
+ b .Ldone
+
+.align 4
+.L192:
+ ld1 {v4.8b},[x0],#8
+ movi v6.16b,#8 // borrow v6.16b
+ st1 {v3.4s},[x2],#16
+ sub v2.16b,v2.16b,v6.16b // adjust the mask
+
+.Loop192:
+ tbl v6.16b,{v4.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v4.8b},[x2],#8
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+
+ dup v5.4s,v3.s[3]
+ eor v5.16b,v5.16b,v4.16b
+ eor v6.16b,v6.16b,v1.16b
+ ext v4.16b,v0.16b,v4.16b,#12
+ shl v1.16b,v1.16b,#1
+ eor v4.16b,v4.16b,v5.16b
+ eor v3.16b,v3.16b,v6.16b
+ eor v4.16b,v4.16b,v6.16b
+ st1 {v3.4s},[x2],#16
+ b.ne .Loop192
+
+ mov w12,#12
+ add x2,x2,#0x20
+ b .Ldone
+
+.align 4
+.L256:
+ ld1 {v4.16b},[x0]
+ mov w1,#7
+ mov w12,#14
+ st1 {v3.4s},[x2],#16
+
+.Loop256:
+ tbl v6.16b,{v4.16b},v2.16b
+ ext v5.16b,v0.16b,v3.16b,#12
+ st1 {v4.4s},[x2],#16
+ aese v6.16b,v0.16b
+ subs w1,w1,#1
+
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v3.16b,v3.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v6.16b,v6.16b,v1.16b
+ eor v3.16b,v3.16b,v5.16b
+ shl v1.16b,v1.16b,#1
+ eor v3.16b,v3.16b,v6.16b
+ st1 {v3.4s},[x2],#16
+ b.eq .Ldone
+
+ dup v6.4s,v3.s[3] // just splat
+ ext v5.16b,v0.16b,v4.16b,#12
+ aese v6.16b,v0.16b
+
+ eor v4.16b,v4.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v4.16b,v4.16b,v5.16b
+ ext v5.16b,v0.16b,v5.16b,#12
+ eor v4.16b,v4.16b,v5.16b
+
+ eor v4.16b,v4.16b,v6.16b
+ b .Loop256
+
+.Ldone:
+ str w12,[x2]
+ mov x3,#0
+
+.Lenc_key_abort:
+ mov x0,x3 // return value
+ ldr x29,[sp],#16
+ ret
+.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+
+.globl aes_hw_set_decrypt_key
+.hidden aes_hw_set_decrypt_key
+.type aes_hw_set_decrypt_key,%function
+.align 5
+aes_hw_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ bl .Lenc_key
+
+ cmp x0,#0
+ b.ne .Ldec_key_abort
+
+ sub x2,x2,#240 // restore original x2
+ mov x4,#-16
+ add x0,x2,x12,lsl#4 // end of key schedule
+
+ ld1 {v0.4s},[x2]
+ ld1 {v1.4s},[x0]
+ st1 {v0.4s},[x0],x4
+ st1 {v1.4s},[x2],#16
+
+.Loop_imc:
+ ld1 {v0.4s},[x2]
+ ld1 {v1.4s},[x0]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ st1 {v0.4s},[x0],x4
+ st1 {v1.4s},[x2],#16
+ cmp x0,x2
+ b.hi .Loop_imc
+
+ ld1 {v0.4s},[x2]
+ aesimc v0.16b,v0.16b
+ st1 {v0.4s},[x0]
+
+ eor x0,x0,x0 // return value
+.Ldec_key_abort:
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl aes_hw_encrypt
+.hidden aes_hw_encrypt
+.type aes_hw_encrypt,%function
+.align 5
+aes_hw_encrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr w3,[x2,#240]
+ ld1 {v0.4s},[x2],#16
+ ld1 {v2.16b},[x0]
+ sub w3,w3,#2
+ ld1 {v1.4s},[x2],#16
+
+.Loop_enc:
+ aese v2.16b,v0.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
+ subs w3,w3,#2
+ aese v2.16b,v1.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
+ b.gt .Loop_enc
+
+ aese v2.16b,v0.16b
+ aesmc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
+ aese v2.16b,v1.16b
+ eor v2.16b,v2.16b,v0.16b
+
+ st1 {v2.16b},[x1]
+ ret
+.size aes_hw_encrypt,.-aes_hw_encrypt
+.globl aes_hw_decrypt
+.hidden aes_hw_decrypt
+.type aes_hw_decrypt,%function
+.align 5
+aes_hw_decrypt:
+ AARCH64_VALID_CALL_TARGET
+ ldr w3,[x2,#240]
+ ld1 {v0.4s},[x2],#16
+ ld1 {v2.16b},[x0]
+ sub w3,w3,#2
+ ld1 {v1.4s},[x2],#16
+
+.Loop_dec:
+ aesd v2.16b,v0.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2],#16
+ subs w3,w3,#2
+ aesd v2.16b,v1.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v1.4s},[x2],#16
+ b.gt .Loop_dec
+
+ aesd v2.16b,v0.16b
+ aesimc v2.16b,v2.16b
+ ld1 {v0.4s},[x2]
+ aesd v2.16b,v1.16b
+ eor v2.16b,v2.16b,v0.16b
+
+ st1 {v2.16b},[x1]
+ ret
+.size aes_hw_decrypt,.-aes_hw_decrypt
+.globl aes_hw_cbc_encrypt
+.hidden aes_hw_cbc_encrypt
+.type aes_hw_cbc_encrypt,%function
+.align 5
+aes_hw_cbc_encrypt:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ subs x2,x2,#16
+ mov x8,#16
+ b.lo .Lcbc_abort
+ csel x8,xzr,x8,eq
+
+ cmp w5,#0 // en- or decrypting?
+ ldr w5,[x3,#240]
+ and x2,x2,#-16
+ ld1 {v6.16b},[x4]
+ ld1 {v0.16b},[x0],x8
+
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
+ sub w5,w5,#6
+ add x7,x3,x5,lsl#4 // pointer to last 7 round keys
+ sub w5,w5,#2
+ ld1 {v18.4s,v19.4s},[x7],#32
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+
+ add x7,x3,#32
+ mov w6,w5
+ b.eq .Lcbc_dec
+
+ cmp w5,#2
+ eor v0.16b,v0.16b,v6.16b
+ eor v5.16b,v16.16b,v7.16b
+ b.eq .Lcbc_enc128
+
+ ld1 {v2.4s,v3.4s},[x7]
+ add x7,x3,#16
+ add x6,x3,#16*4
+ add x12,x3,#16*5
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ add x14,x3,#16*6
+ add x3,x3,#16*7
+ b .Lenter_cbc_enc
+
+.align 4
+.Loop_cbc_enc:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ st1 {v6.16b},[x1],#16
+.Lenter_cbc_enc:
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x6]
+ cmp w5,#4
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x12]
+ b.eq .Lcbc_enc192
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.4s},[x14]
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x3]
+ nop
+
+.Lcbc_enc192:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs .Loop_cbc_enc
+
+ st1 {v6.16b},[x1],#16
+ b .Lcbc_done
+
+.align 5
+.Lcbc_enc128:
+ ld1 {v2.4s,v3.4s},[x7]
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ b .Lenter_cbc_enc128
+.Loop_cbc_enc128:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ st1 {v6.16b},[x1],#16
+.Lenter_cbc_enc128:
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ subs x2,x2,#16
+ aese v0.16b,v2.16b
+ aesmc v0.16b,v0.16b
+ csel x8,xzr,x8,eq
+ aese v0.16b,v3.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v18.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v19.16b
+ aesmc v0.16b,v0.16b
+ ld1 {v16.16b},[x0],x8
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ eor v16.16b,v16.16b,v5.16b
+ aese v0.16b,v23.16b
+ eor v6.16b,v0.16b,v7.16b
+ b.hs .Loop_cbc_enc128
+
+ st1 {v6.16b},[x1],#16
+ b .Lcbc_done
+.align 5
+.Lcbc_dec:
+ ld1 {v18.16b},[x0],#16
+ subs x2,x2,#32 // bias
+ add w6,w5,#2
+ orr v3.16b,v0.16b,v0.16b
+ orr v1.16b,v0.16b,v0.16b
+ orr v19.16b,v18.16b,v18.16b
+ b.lo .Lcbc_dec_tail
+
+ orr v1.16b,v18.16b,v18.16b
+ ld1 {v18.16b},[x0],#16
+ orr v2.16b,v0.16b,v0.16b
+ orr v3.16b,v1.16b,v1.16b
+ orr v19.16b,v18.16b,v18.16b
+
+.Loop3x_cbc_dec:
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop3x_cbc_dec
+
+ aesd v0.16b,v16.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ eor v4.16b,v6.16b,v7.16b
+ subs x2,x2,#0x30
+ eor v5.16b,v2.16b,v7.16b
+ csel x6,x2,x6,lo // x6, w6, is zero at this point
+ aesd v0.16b,v17.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ eor v17.16b,v3.16b,v7.16b
+	add	x0,x0,x6		// x0 is adjusted in such a way that
+ // at exit from the loop v1.16b-v18.16b
+ // are loaded with last "words"
+ orr v6.16b,v19.16b,v19.16b
+ mov x7,x3
+ aesd v0.16b,v20.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v2.16b},[x0],#16
+ aesd v0.16b,v21.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ aesd v0.16b,v22.16b
+ aesimc v0.16b,v0.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v19.16b},[x0],#16
+ aesd v0.16b,v23.16b
+ aesd v1.16b,v23.16b
+ aesd v18.16b,v23.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ add w6,w5,#2
+ eor v4.16b,v4.16b,v0.16b
+ eor v5.16b,v5.16b,v1.16b
+ eor v18.16b,v18.16b,v17.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ st1 {v4.16b},[x1],#16
+ orr v0.16b,v2.16b,v2.16b
+ st1 {v5.16b},[x1],#16
+ orr v1.16b,v3.16b,v3.16b
+ st1 {v18.16b},[x1],#16
+ orr v18.16b,v19.16b,v19.16b
+ b.hs .Loop3x_cbc_dec
+
+ cmn x2,#0x30
+ b.eq .Lcbc_done
+ nop
+
+.Lcbc_dec_tail:
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Lcbc_dec_tail
+
+ aesd v1.16b,v16.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v16.16b
+ aesimc v18.16b,v18.16b
+ aesd v1.16b,v17.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v17.16b
+ aesimc v18.16b,v18.16b
+ aesd v1.16b,v20.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v20.16b
+ aesimc v18.16b,v18.16b
+ cmn x2,#0x20
+ aesd v1.16b,v21.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v21.16b
+ aesimc v18.16b,v18.16b
+ eor v5.16b,v6.16b,v7.16b
+ aesd v1.16b,v22.16b
+ aesimc v1.16b,v1.16b
+ aesd v18.16b,v22.16b
+ aesimc v18.16b,v18.16b
+ eor v17.16b,v3.16b,v7.16b
+ aesd v1.16b,v23.16b
+ aesd v18.16b,v23.16b
+ b.eq .Lcbc_dec_one
+ eor v5.16b,v5.16b,v1.16b
+ eor v17.16b,v17.16b,v18.16b
+ orr v6.16b,v19.16b,v19.16b
+ st1 {v5.16b},[x1],#16
+ st1 {v17.16b},[x1],#16
+ b .Lcbc_done
+
+.Lcbc_dec_one:
+ eor v5.16b,v5.16b,v18.16b
+ orr v6.16b,v19.16b,v19.16b
+ st1 {v5.16b},[x1],#16
+
+.Lcbc_done:
+ st1 {v6.16b},[x4]
+.Lcbc_abort:
+ ldr x29,[sp],#16
+ ret
+.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl aes_hw_ctr32_encrypt_blocks
+.hidden aes_hw_ctr32_encrypt_blocks
+.type aes_hw_ctr32_encrypt_blocks,%function
+.align 5
+aes_hw_ctr32_encrypt_blocks:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ldr w5,[x3,#240]
+
+ ldr w8, [x4, #12]
+ ld1 {v0.4s},[x4]
+
+ ld1 {v16.4s,v17.4s},[x3] // load key schedule...
+ sub w5,w5,#4
+ mov x12,#16
+ cmp x2,#2
+ add x7,x3,x5,lsl#4 // pointer to last 5 round keys
+ sub w5,w5,#2
+ ld1 {v20.4s,v21.4s},[x7],#32
+ ld1 {v22.4s,v23.4s},[x7],#32
+ ld1 {v7.4s},[x7]
+ add x7,x3,#32
+ mov w6,w5
+ csel x12,xzr,x12,lo
+
+ // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+ // affected by silicon errata #1742098 [0] and #1655431 [1],
+ // respectively, where the second instruction of an aese/aesmc
+ // instruction pair may execute twice if an interrupt is taken right
+ // after the first instruction consumes an input register of which a
+ // single 32-bit lane has been updated the last time it was modified.
+ //
+ // This function uses a counter in one 32-bit lane. The vmov lines
+	// could write to v1.16b and v18.16b directly, but that trips these bugs.
+ // We write to v6.16b and copy to the final register as a workaround.
+ //
+ // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+ // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
+#ifndef __ARMEB__
+ rev w8, w8
+#endif
+ add w10, w8, #1
+ orr v6.16b,v0.16b,v0.16b
+ rev w10, w10
+ mov v6.s[3],w10
+ add w8, w8, #2
+ orr v1.16b,v6.16b,v6.16b
+ b.ls .Lctr32_tail
+ rev w12, w8
+ mov v6.s[3],w12
+ sub x2,x2,#3 // bias
+ orr v18.16b,v6.16b,v6.16b
+ b .Loop3x_ctr32
+
+.align 4
+.Loop3x_ctr32:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ aese v18.16b,v17.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Loop3x_ctr32
+
+ aese v0.16b,v16.16b
+ aesmc v4.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v5.16b,v1.16b
+ ld1 {v2.16b},[x0],#16
+ add w9,w8,#1
+ aese v18.16b,v16.16b
+ aesmc v18.16b,v18.16b
+ ld1 {v3.16b},[x0],#16
+ rev w9,w9
+ aese v4.16b,v17.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v17.16b
+ aesmc v5.16b,v5.16b
+ ld1 {v19.16b},[x0],#16
+ mov x7,x3
+ aese v18.16b,v17.16b
+ aesmc v17.16b,v18.16b
+ aese v4.16b,v20.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v20.16b
+ aesmc v5.16b,v5.16b
+ eor v2.16b,v2.16b,v7.16b
+ add w10,w8,#2
+ aese v17.16b,v20.16b
+ aesmc v17.16b,v17.16b
+ eor v3.16b,v3.16b,v7.16b
+ add w8,w8,#3
+ aese v4.16b,v21.16b
+ aesmc v4.16b,v4.16b
+ aese v5.16b,v21.16b
+ aesmc v5.16b,v5.16b
+	// Note the logic to update v0.16b, v1.16b, and v18.16b is written to work
+ // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+ // 32-bit mode. See the comment above.
+ eor v19.16b,v19.16b,v7.16b
+ mov v6.s[3], w9
+ aese v17.16b,v21.16b
+ aesmc v17.16b,v17.16b
+ orr v0.16b,v6.16b,v6.16b
+ rev w10,w10
+ aese v4.16b,v22.16b
+ aesmc v4.16b,v4.16b
+ mov v6.s[3], w10
+ rev w12,w8
+ aese v5.16b,v22.16b
+ aesmc v5.16b,v5.16b
+ orr v1.16b,v6.16b,v6.16b
+ mov v6.s[3], w12
+ aese v17.16b,v22.16b
+ aesmc v17.16b,v17.16b
+ orr v18.16b,v6.16b,v6.16b
+ subs x2,x2,#3
+ aese v4.16b,v23.16b
+ aese v5.16b,v23.16b
+ aese v17.16b,v23.16b
+
+ eor v2.16b,v2.16b,v4.16b
+ ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
+ st1 {v2.16b},[x1],#16
+ eor v3.16b,v3.16b,v5.16b
+ mov w6,w5
+ st1 {v3.16b},[x1],#16
+ eor v19.16b,v19.16b,v17.16b
+ ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
+ st1 {v19.16b},[x1],#16
+ b.hs .Loop3x_ctr32
+
+ adds x2,x2,#3
+ b.eq .Lctr32_done
+ cmp x2,#1
+ mov x12,#16
+ csel x12,xzr,x12,eq
+
+.Lctr32_tail:
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v16.4s},[x7],#16
+ subs w6,w6,#2
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v17.4s},[x7],#16
+ b.gt .Lctr32_tail
+
+ aese v0.16b,v16.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v16.16b
+ aesmc v1.16b,v1.16b
+ aese v0.16b,v17.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v17.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v2.16b},[x0],x12
+ aese v0.16b,v20.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v20.16b
+ aesmc v1.16b,v1.16b
+ ld1 {v3.16b},[x0]
+ aese v0.16b,v21.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v21.16b
+ aesmc v1.16b,v1.16b
+ eor v2.16b,v2.16b,v7.16b
+ aese v0.16b,v22.16b
+ aesmc v0.16b,v0.16b
+ aese v1.16b,v22.16b
+ aesmc v1.16b,v1.16b
+ eor v3.16b,v3.16b,v7.16b
+ aese v0.16b,v23.16b
+ aese v1.16b,v23.16b
+
+ cmp x2,#1
+ eor v2.16b,v2.16b,v0.16b
+ eor v3.16b,v3.16b,v1.16b
+ st1 {v2.16b},[x1],#16
+ b.eq .Lctr32_done
+ st1 {v3.16b},[x1]
+
+.Lctr32_done:
+ ldr x29,[sp],#16
+ ret
+.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+#endif
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S
@@ -1,0 +1,1436 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl bn_mul_mont
+.hidden bn_mul_mont
+.type bn_mul_mont,%function
+.align 5
+bn_mul_mont:
+ AARCH64_SIGN_LINK_REGISTER
+ tst x5,#7
+ b.eq __bn_sqr8x_mont
+ tst x5,#3
+ b.eq __bn_mul4x_mont
+.Lmul_mont:
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+
+ ldr x9,[x2],#8 // bp[0]
+ sub x22,sp,x5,lsl#3
+ ldp x7,x8,[x1],#16 // ap[0..1]
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ and x22,x22,#-16 // ABI says so
+ ldp x13,x14,[x3],#16 // np[0..1]
+
+ mul x6,x7,x9 // ap[0]*bp[0]
+ sub x21,x5,#16 // j=num-2
+ umulh x7,x7,x9
+ mul x10,x8,x9 // ap[1]*bp[0]
+ umulh x11,x8,x9
+
+ mul x15,x6,x4 // "tp[0]"*n0
+ mov sp,x22 // alloca
+
+ // (*) mul x12,x13,x15 // np[0]*m1
+ umulh x13,x13,x15
+ mul x16,x14,x15 // np[1]*m1
+ // (*) adds x12,x12,x6 // discarded
+	// (*)	As for the removal of the first multiplication and addition
+	//	instructions: the outcome of the first addition is
+	//	guaranteed to be zero, which leaves two computationally
+	//	significant outcomes: it either carries or it does not.
+	//	The question then is: when does it carry? And is there an
+	//	alternative way to deduce it? If you follow the operations,
+	//	you can observe that the condition for carry is quite
+	//	simple: x6 being non-zero. So the carry can be calculated
+	//	by adding -1 to x6. That is what the next instruction does.
+ subs xzr,x6,#1 // (*)
+ umulh x17,x14,x15
+ adc x13,x13,xzr
+ cbz x21,.L1st_skip
+
+.L1st:
+ ldr x8,[x1],#8
+ adds x6,x10,x7
+ sub x21,x21,#8 // j--
+ adc x7,x11,xzr
+
+ ldr x14,[x3],#8
+ adds x12,x16,x13
+ mul x10,x8,x9 // ap[j]*bp[0]
+ adc x13,x17,xzr
+ umulh x11,x8,x9
+
+ adds x12,x12,x6
+ mul x16,x14,x15 // np[j]*m1
+ adc x13,x13,xzr
+ umulh x17,x14,x15
+ str x12,[x22],#8 // tp[j-1]
+ cbnz x21,.L1st
+
+.L1st_skip:
+ adds x6,x10,x7
+ sub x1,x1,x5 // rewind x1
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ sub x3,x3,x5 // rewind x3
+ adc x13,x17,xzr
+
+ adds x12,x12,x6
+ sub x20,x5,#8 // i=num-1
+ adcs x13,x13,x7
+
+ adc x19,xzr,xzr // upmost overflow bit
+ stp x12,x13,[x22]
+
+.Louter:
+ ldr x9,[x2],#8 // bp[i]
+ ldp x7,x8,[x1],#16
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+
+ mul x6,x7,x9 // ap[0]*bp[i]
+ sub x21,x5,#16 // j=num-2
+ umulh x7,x7,x9
+ ldp x13,x14,[x3],#16
+ mul x10,x8,x9 // ap[1]*bp[i]
+ adds x6,x6,x23
+ umulh x11,x8,x9
+ adc x7,x7,xzr
+
+ mul x15,x6,x4
+ sub x20,x20,#8 // i--
+
+ // (*) mul x12,x13,x15 // np[0]*m1
+ umulh x13,x13,x15
+ mul x16,x14,x15 // np[1]*m1
+ // (*) adds x12,x12,x6
+ subs xzr,x6,#1 // (*)
+ umulh x17,x14,x15
+ cbz x21,.Linner_skip
+
+.Linner:
+ ldr x8,[x1],#8
+ adc x13,x13,xzr
+ ldr x23,[x22],#8 // tp[j]
+ adds x6,x10,x7
+ sub x21,x21,#8 // j--
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ ldr x14,[x3],#8
+ adc x13,x17,xzr
+
+ mul x10,x8,x9 // ap[j]*bp[i]
+ adds x6,x6,x23
+ umulh x11,x8,x9
+ adc x7,x7,xzr
+
+ mul x16,x14,x15 // np[j]*m1
+ adds x12,x12,x6
+ umulh x17,x14,x15
+ str x12,[x22,#-16] // tp[j-1]
+ cbnz x21,.Linner
+
+.Linner_skip:
+ ldr x23,[x22],#8 // tp[j]
+ adc x13,x13,xzr
+ adds x6,x10,x7
+ sub x1,x1,x5 // rewind x1
+ adc x7,x11,xzr
+
+ adds x12,x16,x13
+ sub x3,x3,x5 // rewind x3
+ adcs x13,x17,x19
+ adc x19,xzr,xzr
+
+ adds x6,x6,x23
+ adc x7,x7,xzr
+
+ adds x12,x12,x6
+ adcs x13,x13,x7
+ adc x19,x19,xzr // upmost overflow bit
+ stp x12,x13,[x22,#-16]
+
+ cbnz x20,.Louter
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+ ldr x14,[x3],#8 // np[0]
+ subs x21,x5,#8 // j=num-1 and clear borrow
+ mov x1,x0
+.Lsub:
+ sbcs x8,x23,x14 // tp[j]-np[j]
+ ldr x23,[x22],#8
+ sub x21,x21,#8 // j--
+ ldr x14,[x3],#8
+ str x8,[x1],#8 // rp[j]=tp[j]-np[j]
+ cbnz x21,.Lsub
+
+ sbcs x8,x23,x14
+ sbcs x19,x19,xzr // did it borrow?
+ str x8,[x1],#8 // rp[num-1]
+
+ ldr x23,[sp] // tp[0]
+ add x22,sp,#8
+ ldr x8,[x0],#8 // rp[0]
+ sub x5,x5,#8 // num--
+ nop
+.Lcond_copy:
+ sub x5,x5,#8 // num--
+ csel x14,x23,x8,lo // did it borrow?
+ ldr x23,[x22],#8
+ ldr x8,[x0],#8
+ str xzr,[x22,#-16] // wipe tp
+ str x14,[x0,#-16]
+ cbnz x5,.Lcond_copy
+
+ csel x14,x23,x8,lo
+ str xzr,[x22,#-8] // wipe tp
+ str x14,[x0,#-8]
+
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldr x29,[sp],#64
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size bn_mul_mont,.-bn_mul_mont
+.type __bn_sqr8x_mont,%function
+.align 5
+__bn_sqr8x_mont:
+ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
+ // only from bn_mul_mont which has already signed the return address.
+ cmp x1,x2
+ b.ne __bn_mul4x_mont
+.Lsqr8x_mont:
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ stp x0,x3,[sp,#96] // offload rp and np
+
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ ldp x10,x11,[x1,#8*4]
+ ldp x12,x13,[x1,#8*6]
+
+ sub x2,sp,x5,lsl#4
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ mov sp,x2 // alloca
+ sub x27,x5,#8*8
+ b .Lsqr8x_zero_start
+
+.Lsqr8x_zero:
+ sub x27,x27,#8*8
+ stp xzr,xzr,[x2,#8*0]
+ stp xzr,xzr,[x2,#8*2]
+ stp xzr,xzr,[x2,#8*4]
+ stp xzr,xzr,[x2,#8*6]
+.Lsqr8x_zero_start:
+ stp xzr,xzr,[x2,#8*8]
+ stp xzr,xzr,[x2,#8*10]
+ stp xzr,xzr,[x2,#8*12]
+ stp xzr,xzr,[x2,#8*14]
+ add x2,x2,#8*16
+ cbnz x27,.Lsqr8x_zero
+
+ add x3,x1,x5
+ add x1,x1,#8*8
+ mov x19,xzr
+ mov x20,xzr
+ mov x21,xzr
+ mov x22,xzr
+ mov x23,xzr
+ mov x24,xzr
+ mov x25,xzr
+ mov x26,xzr
+ mov x2,sp
+ str x4,[x29,#112] // offload n0
+
+ // Multiply everything but a[i]*a[i]
+.align 4
+.Lsqr8x_outer_loop:
+ // a[1]a[0] (i)
+ // a[2]a[0]
+ // a[3]a[0]
+ // a[4]a[0]
+ // a[5]a[0]
+ // a[6]a[0]
+ // a[7]a[0]
+ // a[2]a[1] (ii)
+ // a[3]a[1]
+ // a[4]a[1]
+ // a[5]a[1]
+ // a[6]a[1]
+ // a[7]a[1]
+ // a[3]a[2] (iii)
+ // a[4]a[2]
+ // a[5]a[2]
+ // a[6]a[2]
+ // a[7]a[2]
+ // a[4]a[3] (iv)
+ // a[5]a[3]
+ // a[6]a[3]
+ // a[7]a[3]
+ // a[5]a[4] (v)
+ // a[6]a[4]
+ // a[7]a[4]
+ // a[6]a[5] (vi)
+ // a[7]a[5]
+ // a[7]a[6] (vii)
+
+ mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
+ mul x15,x8,x6
+ mul x16,x9,x6
+ mul x17,x10,x6
+ adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
+ mul x14,x11,x6
+ adcs x21,x21,x15
+ mul x15,x12,x6
+ adcs x22,x22,x16
+ mul x16,x13,x6
+ adcs x23,x23,x17
+ umulh x17,x7,x6 // hi(a[1..7]*a[0])
+ adcs x24,x24,x14
+ umulh x14,x8,x6
+ adcs x25,x25,x15
+ umulh x15,x9,x6
+ adcs x26,x26,x16
+ umulh x16,x10,x6
+ stp x19,x20,[x2],#8*2 // t[0..1]
+ adc x19,xzr,xzr // t[8]
+	adds	x21,x21,x17	// t[2]+hi(a[1]*a[0])
+ umulh x17,x11,x6
+ adcs x22,x22,x14
+ umulh x14,x12,x6
+ adcs x23,x23,x15
+ umulh x15,x13,x6
+ adcs x24,x24,x16
+ mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
+ adcs x25,x25,x17
+ mul x17,x9,x7
+ adcs x26,x26,x14
+ mul x14,x10,x7
+ adc x19,x19,x15
+
+ mul x15,x11,x7
+ adds x22,x22,x16
+ mul x16,x12,x7
+ adcs x23,x23,x17
+ mul x17,x13,x7
+ adcs x24,x24,x14
+ umulh x14,x8,x7 // hi(a[2..7]*a[1])
+ adcs x25,x25,x15
+ umulh x15,x9,x7
+ adcs x26,x26,x16
+ umulh x16,x10,x7
+ adcs x19,x19,x17
+ umulh x17,x11,x7
+ stp x21,x22,[x2],#8*2 // t[2..3]
+ adc x20,xzr,xzr // t[9]
+ adds x23,x23,x14
+ umulh x14,x12,x7
+ adcs x24,x24,x15
+ umulh x15,x13,x7
+ adcs x25,x25,x16
+ mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
+ adcs x26,x26,x17
+ mul x17,x10,x8
+ adcs x19,x19,x14
+ mul x14,x11,x8
+ adc x20,x20,x15
+
+ mul x15,x12,x8
+ adds x24,x24,x16
+ mul x16,x13,x8
+ adcs x25,x25,x17
+ umulh x17,x9,x8 // hi(a[3..7]*a[2])
+ adcs x26,x26,x14
+ umulh x14,x10,x8
+ adcs x19,x19,x15
+ umulh x15,x11,x8
+ adcs x20,x20,x16
+ umulh x16,x12,x8
+ stp x23,x24,[x2],#8*2 // t[4..5]
+ adc x21,xzr,xzr // t[10]
+ adds x25,x25,x17
+ umulh x17,x13,x8
+ adcs x26,x26,x14
+ mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
+ adcs x19,x19,x15
+ mul x15,x11,x9
+ adcs x20,x20,x16
+ mul x16,x12,x9
+ adc x21,x21,x17
+
+ mul x17,x13,x9
+ adds x26,x26,x14
+ umulh x14,x10,x9 // hi(a[4..7]*a[3])
+ adcs x19,x19,x15
+ umulh x15,x11,x9
+ adcs x20,x20,x16
+ umulh x16,x12,x9
+ adcs x21,x21,x17
+ umulh x17,x13,x9
+ stp x25,x26,[x2],#8*2 // t[6..7]
+ adc x22,xzr,xzr // t[11]
+ adds x19,x19,x14
+ mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
+ adcs x20,x20,x15
+ mul x15,x12,x10
+ adcs x21,x21,x16
+ mul x16,x13,x10
+ adc x22,x22,x17
+
+ umulh x17,x11,x10 // hi(a[5..7]*a[4])
+ adds x20,x20,x14
+ umulh x14,x12,x10
+ adcs x21,x21,x15
+ umulh x15,x13,x10
+ adcs x22,x22,x16
+ mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
+ adc x23,xzr,xzr // t[12]
+ adds x21,x21,x17
+ mul x17,x13,x11
+ adcs x22,x22,x14
+ umulh x14,x12,x11 // hi(a[6..7]*a[5])
+ adc x23,x23,x15
+
+ umulh x15,x13,x11
+ adds x22,x22,x16
+ mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
+ adcs x23,x23,x17
+ umulh x17,x13,x12 // hi(a[7]*a[6])
+ adc x24,xzr,xzr // t[13]
+ adds x23,x23,x14
+ sub x27,x3,x1 // done yet?
+ adc x24,x24,x15
+
+ adds x24,x24,x16
+ sub x14,x3,x5 // rewinded ap
+ adc x25,xzr,xzr // t[14]
+ add x25,x25,x17
+
+ cbz x27,.Lsqr8x_outer_break
+
+ mov x4,x6
+ ldp x6,x7,[x2,#8*0]
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ adds x19,x19,x6
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x0,x1
+ adcs x26,xzr,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved below
+ mov x27,#-8*8
+
+ // a[8]a[0]
+ // a[9]a[0]
+ // a[a]a[0]
+ // a[b]a[0]
+ // a[c]a[0]
+ // a[d]a[0]
+ // a[e]a[0]
+ // a[f]a[0]
+ // a[8]a[1]
+ // a[f]a[1]........................
+ // a[8]a[2]
+ // a[f]a[2]........................
+ // a[8]a[3]
+ // a[f]a[3]........................
+ // a[8]a[4]
+ // a[f]a[4]........................
+ // a[8]a[5]
+ // a[f]a[5]........................
+ // a[8]a[6]
+ // a[f]a[6]........................
+ // a[8]a[7]
+ // a[f]a[7]........................
+.Lsqr8x_mul:
+ mul x14,x6,x4
+ adc x28,xzr,xzr // carry bit, modulo-scheduled
+ mul x15,x7,x4
+ add x27,x27,#8
+ mul x16,x8,x4
+ mul x17,x9,x4
+ adds x19,x19,x14
+ mul x14,x10,x4
+ adcs x20,x20,x15
+ mul x15,x11,x4
+ adcs x21,x21,x16
+ mul x16,x12,x4
+ adcs x22,x22,x17
+ mul x17,x13,x4
+ adcs x23,x23,x14
+ umulh x14,x6,x4
+ adcs x24,x24,x15
+ umulh x15,x7,x4
+ adcs x25,x25,x16
+ umulh x16,x8,x4
+ adcs x26,x26,x17
+ umulh x17,x9,x4
+ adc x28,x28,xzr
+ str x19,[x2],#8
+ adds x19,x20,x14
+ umulh x14,x10,x4
+ adcs x20,x21,x15
+ umulh x15,x11,x4
+ adcs x21,x22,x16
+ umulh x16,x12,x4
+ adcs x22,x23,x17
+ umulh x17,x13,x4
+ ldr x4,[x0,x27]
+ adcs x23,x24,x14
+ adcs x24,x25,x15
+ adcs x25,x26,x16
+ adcs x26,x28,x17
+ //adc x28,xzr,xzr // moved above
+ cbnz x27,.Lsqr8x_mul
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ cmp x1,x3 // done yet?
+ b.eq .Lsqr8x_break
+
+ ldp x6,x7,[x2,#8*0]
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ adds x19,x19,x6
+ ldr x4,[x0,#-8*8]
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x27,#-8*8
+ adcs x26,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved above
+ b .Lsqr8x_mul
+
+.align 4
+.Lsqr8x_break:
+ ldp x6,x7,[x0,#8*0]
+ add x1,x0,#8*8
+ ldp x8,x9,[x0,#8*2]
+ sub x14,x3,x1 // is it last iteration?
+ ldp x10,x11,[x0,#8*4]
+ sub x15,x2,x14
+ ldp x12,x13,[x0,#8*6]
+ cbz x14,.Lsqr8x_outer_loop
+
+ stp x19,x20,[x2,#8*0]
+ ldp x19,x20,[x15,#8*0]
+ stp x21,x22,[x2,#8*2]
+ ldp x21,x22,[x15,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[x15,#8*4]
+ stp x25,x26,[x2,#8*6]
+ mov x2,x15
+ ldp x25,x26,[x15,#8*6]
+ b .Lsqr8x_outer_loop
+
+.align 4
+.Lsqr8x_outer_break:
+ // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
+ ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
+ ldp x15,x16,[sp,#8*1]
+ ldp x11,x13,[x14,#8*2]
+ add x1,x14,#8*4
+ ldp x17,x14,[sp,#8*3]
+
+ stp x19,x20,[x2,#8*0]
+ mul x19,x7,x7
+ stp x21,x22,[x2,#8*2]
+ umulh x7,x7,x7
+ stp x23,x24,[x2,#8*4]
+ mul x8,x9,x9
+ stp x25,x26,[x2,#8*6]
+ mov x2,sp
+ umulh x9,x9,x9
+ adds x20,x7,x15,lsl#1
+ extr x15,x16,x15,#63
+ sub x27,x5,#8*4
+
+.Lsqr4x_shift_n_add:
+ adcs x21,x8,x15
+ extr x16,x17,x16,#63
+ sub x27,x27,#8*4
+ adcs x22,x9,x16
+ ldp x15,x16,[x2,#8*5]
+ mul x10,x11,x11
+ ldp x7,x9,[x1],#8*2
+ umulh x11,x11,x11
+ mul x12,x13,x13
+ umulh x13,x13,x13
+ extr x17,x14,x17,#63
+ stp x19,x20,[x2,#8*0]
+ adcs x23,x10,x17
+ extr x14,x15,x14,#63
+ stp x21,x22,[x2,#8*2]
+ adcs x24,x11,x14
+ ldp x17,x14,[x2,#8*7]
+ extr x15,x16,x15,#63
+ adcs x25,x12,x15
+ extr x16,x17,x16,#63
+ adcs x26,x13,x16
+ ldp x15,x16,[x2,#8*9]
+ mul x6,x7,x7
+ ldp x11,x13,[x1],#8*2
+ umulh x7,x7,x7
+ mul x8,x9,x9
+ umulh x9,x9,x9
+ stp x23,x24,[x2,#8*4]
+ extr x17,x14,x17,#63
+ stp x25,x26,[x2,#8*6]
+ add x2,x2,#8*8
+ adcs x19,x6,x17
+ extr x14,x15,x14,#63
+ adcs x20,x7,x14
+ ldp x17,x14,[x2,#8*3]
+ extr x15,x16,x15,#63
+ cbnz x27,.Lsqr4x_shift_n_add
+ ldp x1,x4,[x29,#104] // pull np and n0
+
+ adcs x21,x8,x15
+ extr x16,x17,x16,#63
+ adcs x22,x9,x16
+ ldp x15,x16,[x2,#8*5]
+ mul x10,x11,x11
+ umulh x11,x11,x11
+ stp x19,x20,[x2,#8*0]
+ mul x12,x13,x13
+ umulh x13,x13,x13
+ stp x21,x22,[x2,#8*2]
+ extr x17,x14,x17,#63
+ adcs x23,x10,x17
+ extr x14,x15,x14,#63
+ ldp x19,x20,[sp,#8*0]
+ adcs x24,x11,x14
+ extr x15,x16,x15,#63
+ ldp x6,x7,[x1,#8*0]
+ adcs x25,x12,x15
+ extr x16,xzr,x16,#63
+ ldp x8,x9,[x1,#8*2]
+ adc x26,x13,x16
+ ldp x10,x11,[x1,#8*4]
+
+ // Reduce by 512 bits per iteration
+ mul x28,x4,x19 // t[0]*n0
+ ldp x12,x13,[x1,#8*6]
+ add x3,x1,x5
+ ldp x21,x22,[sp,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[sp,#8*4]
+ stp x25,x26,[x2,#8*6]
+ ldp x25,x26,[sp,#8*6]
+ add x1,x1,#8*8
+ mov x30,xzr // initial top-most carry
+ mov x2,sp
+ mov x27,#8
+
+.Lsqr8x_reduction:
+ // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
+ mul x15,x7,x28
+ sub x27,x27,#1
+ mul x16,x8,x28
+ str x28,[x2],#8 // put aside t[0]*n0 for tail processing
+ mul x17,x9,x28
+ // (*) adds xzr,x19,x14
+ subs xzr,x19,#1 // (*)
+ mul x14,x10,x28
+ adcs x19,x20,x15
+ mul x15,x11,x28
+ adcs x20,x21,x16
+ mul x16,x12,x28
+ adcs x21,x22,x17
+ mul x17,x13,x28
+ adcs x22,x23,x14
+ umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
+ adcs x23,x24,x15
+ umulh x15,x7,x28
+ adcs x24,x25,x16
+ umulh x16,x8,x28
+ adcs x25,x26,x17
+ umulh x17,x9,x28
+ adc x26,xzr,xzr
+ adds x19,x19,x14
+ umulh x14,x10,x28
+ adcs x20,x20,x15
+ umulh x15,x11,x28
+ adcs x21,x21,x16
+ umulh x16,x12,x28
+ adcs x22,x22,x17
+ umulh x17,x13,x28
+ mul x28,x4,x19 // next t[0]*n0
+ adcs x23,x23,x14
+ adcs x24,x24,x15
+ adcs x25,x25,x16
+ adc x26,x26,x17
+ cbnz x27,.Lsqr8x_reduction
+
+ ldp x14,x15,[x2,#8*0]
+ ldp x16,x17,[x2,#8*2]
+ mov x0,x2
+ sub x27,x3,x1 // done yet?
+ adds x19,x19,x14
+ adcs x20,x20,x15
+ ldp x14,x15,[x2,#8*4]
+ adcs x21,x21,x16
+ adcs x22,x22,x17
+ ldp x16,x17,[x2,#8*6]
+ adcs x23,x23,x14
+ adcs x24,x24,x15
+ adcs x25,x25,x16
+ adcs x26,x26,x17
+ //adc x28,xzr,xzr // moved below
+ cbz x27,.Lsqr8x8_post_condition
+
+ ldr x4,[x2,#-8*8]
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ ldp x10,x11,[x1,#8*4]
+ mov x27,#-8*8
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+
+.Lsqr8x_tail:
+ mul x14,x6,x4
+ adc x28,xzr,xzr // carry bit, modulo-scheduled
+ mul x15,x7,x4
+ add x27,x27,#8
+ mul x16,x8,x4
+ mul x17,x9,x4
+ adds x19,x19,x14
+ mul x14,x10,x4
+ adcs x20,x20,x15
+ mul x15,x11,x4
+ adcs x21,x21,x16
+ mul x16,x12,x4
+ adcs x22,x22,x17
+ mul x17,x13,x4
+ adcs x23,x23,x14
+ umulh x14,x6,x4
+ adcs x24,x24,x15
+ umulh x15,x7,x4
+ adcs x25,x25,x16
+ umulh x16,x8,x4
+ adcs x26,x26,x17
+ umulh x17,x9,x4
+ adc x28,x28,xzr
+ str x19,[x2],#8
+ adds x19,x20,x14
+ umulh x14,x10,x4
+ adcs x20,x21,x15
+ umulh x15,x11,x4
+ adcs x21,x22,x16
+ umulh x16,x12,x4
+ adcs x22,x23,x17
+ umulh x17,x13,x4
+ ldr x4,[x0,x27]
+ adcs x23,x24,x14
+ adcs x24,x25,x15
+ adcs x25,x26,x16
+ adcs x26,x28,x17
+ //adc x28,xzr,xzr // moved above
+ cbnz x27,.Lsqr8x_tail
+ // note that carry flag is guaranteed
+ // to be zero at this point
+ ldp x6,x7,[x2,#8*0]
+ sub x27,x3,x1 // done yet?
+ sub x16,x3,x5 // rewinded np
+ ldp x8,x9,[x2,#8*2]
+ ldp x10,x11,[x2,#8*4]
+ ldp x12,x13,[x2,#8*6]
+ cbz x27,.Lsqr8x_tail_break
+
+ ldr x4,[x0,#-8*8]
+ adds x19,x19,x6
+ adcs x20,x20,x7
+ ldp x6,x7,[x1,#8*0]
+ adcs x21,x21,x8
+ adcs x22,x22,x9
+ ldp x8,x9,[x1,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x1,#8*4]
+ adcs x25,x25,x12
+ mov x27,#-8*8
+ adcs x26,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ //adc x28,xzr,xzr // moved above
+ b .Lsqr8x_tail
+
+.align 4
+.Lsqr8x_tail_break:
+ ldr x4,[x29,#112] // pull n0
+ add x27,x2,#8*8 // end of current t[num] window
+
+ subs xzr,x30,#1 // "move" top-most carry to carry bit
+ adcs x14,x19,x6
+ adcs x15,x20,x7
+ ldp x19,x20,[x0,#8*0]
+ adcs x21,x21,x8
+ ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
+ adcs x22,x22,x9
+ ldp x8,x9,[x16,#8*2]
+ adcs x23,x23,x10
+ adcs x24,x24,x11
+ ldp x10,x11,[x16,#8*4]
+ adcs x25,x25,x12
+ adcs x26,x26,x13
+ ldp x12,x13,[x16,#8*6]
+ add x1,x16,#8*8
+ adc x30,xzr,xzr // top-most carry
+ mul x28,x4,x19
+ stp x14,x15,[x2,#8*0]
+ stp x21,x22,[x2,#8*2]
+ ldp x21,x22,[x0,#8*2]
+ stp x23,x24,[x2,#8*4]
+ ldp x23,x24,[x0,#8*4]
+ cmp x27,x29 // did we hit the bottom?
+ stp x25,x26,[x2,#8*6]
+ mov x2,x0 // slide the window
+ ldp x25,x26,[x0,#8*6]
+ mov x27,#8
+ b.ne .Lsqr8x_reduction
+
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ ldr x0,[x29,#96] // pull rp
+ add x2,x2,#8*8
+ subs x14,x19,x6
+ sbcs x15,x20,x7
+ sub x27,x5,#8*8
+ mov x3,x0 // x0 copy
+
+.Lsqr8x_sub:
+ sbcs x16,x21,x8
+ ldp x6,x7,[x1,#8*0]
+ sbcs x17,x22,x9
+ stp x14,x15,[x0,#8*0]
+ sbcs x14,x23,x10
+ ldp x8,x9,[x1,#8*2]
+ sbcs x15,x24,x11
+ stp x16,x17,[x0,#8*2]
+ sbcs x16,x25,x12
+ ldp x10,x11,[x1,#8*4]
+ sbcs x17,x26,x13
+ ldp x12,x13,[x1,#8*6]
+ add x1,x1,#8*8
+ ldp x19,x20,[x2,#8*0]
+ sub x27,x27,#8*8
+ ldp x21,x22,[x2,#8*2]
+ ldp x23,x24,[x2,#8*4]
+ ldp x25,x26,[x2,#8*6]
+ add x2,x2,#8*8
+ stp x14,x15,[x0,#8*4]
+ sbcs x14,x19,x6
+ stp x16,x17,[x0,#8*6]
+ add x0,x0,#8*8
+ sbcs x15,x20,x7
+ cbnz x27,.Lsqr8x_sub
+
+ sbcs x16,x21,x8
+ mov x2,sp
+ add x1,sp,x5
+ ldp x6,x7,[x3,#8*0]
+ sbcs x17,x22,x9
+ stp x14,x15,[x0,#8*0]
+ sbcs x14,x23,x10
+ ldp x8,x9,[x3,#8*2]
+ sbcs x15,x24,x11
+ stp x16,x17,[x0,#8*2]
+ sbcs x16,x25,x12
+ ldp x19,x20,[x1,#8*0]
+ sbcs x17,x26,x13
+ ldp x21,x22,[x1,#8*2]
+ sbcs xzr,x30,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+ stp x14,x15,[x0,#8*4]
+ stp x16,x17,[x0,#8*6]
+
+ sub x27,x5,#8*4
+.Lsqr4x_cond_copy:
+ sub x27,x27,#8*4
+ csel x14,x19,x6,lo
+ stp xzr,xzr,[x2,#8*0]
+ csel x15,x20,x7,lo
+ ldp x6,x7,[x3,#8*4]
+ ldp x19,x20,[x1,#8*4]
+ csel x16,x21,x8,lo
+ stp xzr,xzr,[x2,#8*2]
+ add x2,x2,#8*4
+ csel x17,x22,x9,lo
+ ldp x8,x9,[x3,#8*6]
+ ldp x21,x22,[x1,#8*6]
+ add x1,x1,#8*4
+ stp x14,x15,[x3,#8*0]
+ stp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ stp xzr,xzr,[x1,#8*0]
+ stp xzr,xzr,[x1,#8*2]
+ cbnz x27,.Lsqr4x_cond_copy
+
+ csel x14,x19,x6,lo
+ stp xzr,xzr,[x2,#8*0]
+ csel x15,x20,x7,lo
+ stp xzr,xzr,[x2,#8*2]
+ csel x16,x21,x8,lo
+ csel x17,x22,x9,lo
+ stp x14,x15,[x3,#8*0]
+ stp x16,x17,[x3,#8*2]
+
+ b .Lsqr8x_done
+
+.align 4
+.Lsqr8x8_post_condition:
+ adc x28,xzr,xzr
+ ldr x30,[x29,#8] // pull return address
+	// x19-x26,x28 hold result, x6-x13 hold modulus
+ subs x6,x19,x6
+ ldr x1,[x29,#96] // pull rp
+ sbcs x7,x20,x7
+ stp xzr,xzr,[sp,#8*0]
+ sbcs x8,x21,x8
+ stp xzr,xzr,[sp,#8*2]
+ sbcs x9,x22,x9
+ stp xzr,xzr,[sp,#8*4]
+ sbcs x10,x23,x10
+ stp xzr,xzr,[sp,#8*6]
+ sbcs x11,x24,x11
+ stp xzr,xzr,[sp,#8*8]
+ sbcs x12,x25,x12
+ stp xzr,xzr,[sp,#8*10]
+ sbcs x13,x26,x13
+ stp xzr,xzr,[sp,#8*12]
+ sbcs x28,x28,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*14]
+
+	// x6-x13 hold result-modulus
+ csel x6,x19,x6,lo
+ csel x7,x20,x7,lo
+ csel x8,x21,x8,lo
+ csel x9,x22,x9,lo
+ stp x6,x7,[x1,#8*0]
+ csel x10,x23,x10,lo
+ csel x11,x24,x11,lo
+ stp x8,x9,[x1,#8*2]
+ csel x12,x25,x12,lo
+ csel x13,x26,x13,lo
+ stp x10,x11,[x1,#8*4]
+ stp x12,x13,[x1,#8*6]
+
+.Lsqr8x_done:
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ // x30 is popped earlier
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
+.type __bn_mul4x_mont,%function
+.align 5
+__bn_mul4x_mont:
+ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
+ // only from bn_mul_mont or __bn_mul8x_mont which have already signed the
+ // return address.
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+ sub x26,sp,x5,lsl#3
+ lsl x5,x5,#3
+ ldr x4,[x4] // *n0
+ sub sp,x26,#8*4 // alloca
+
+ add x10,x2,x5
+ add x27,x1,x5
+ stp x0,x10,[x29,#96] // offload rp and &b[num]
+
+ ldr x24,[x2,#8*0] // b[0]
+ ldp x6,x7,[x1,#8*0] // a[0..3]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ mov x19,xzr
+ mov x20,xzr
+ mov x21,xzr
+ mov x22,xzr
+ ldp x14,x15,[x3,#8*0] // n[0..3]
+ ldp x16,x17,[x3,#8*2]
+ adds x3,x3,#8*4 // clear carry bit
+ mov x0,xzr
+ mov x28,#0
+ mov x26,sp
+
+.Loop_mul4x_1st_reduction:
+ mul x10,x6,x24 // lo(a[0..3]*b[0])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[0..3]*b[0])
+ adcs x20,x20,x11
+ mul x25,x19,x4 // t[0]*n0
+ adcs x21,x21,x12
+ umulh x11,x7,x24
+ adcs x22,x22,x13
+ umulh x12,x8,x24
+ adc x23,xzr,xzr
+ umulh x13,x9,x24
+ ldr x24,[x2,x28] // next b[i] (or b[0])
+ adds x20,x20,x10
+ // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
+ str x25,[x26],#8 // put aside t[0]*n0 for tail processing
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ // (*) adds xzr,x19,x10
+ subs xzr,x19,#1 // (*)
+ umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
+ adcs x19,x20,x11
+ umulh x11,x15,x25
+ adcs x20,x21,x12
+ umulh x12,x16,x25
+ adcs x21,x22,x13
+ umulh x13,x17,x25
+ adcs x22,x23,x0
+ adc x0,xzr,xzr
+ adds x19,x19,x10
+ sub x10,x27,x1
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ cbnz x28,.Loop_mul4x_1st_reduction
+
+ cbz x10,.Lmul4x4_post_condition
+
+ ldp x6,x7,[x1,#8*0] // a[4..7]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ ldr x25,[sp] // a[0]*n0
+ ldp x14,x15,[x3,#8*0] // n[4..7]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+
+.Loop_mul4x_1st_tail:
+ mul x10,x6,x24 // lo(a[4..7]*b[i])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[4..7]*b[i])
+ adcs x20,x20,x11
+ umulh x11,x7,x24
+ adcs x21,x21,x12
+ umulh x12,x8,x24
+ adcs x22,x22,x13
+ umulh x13,x9,x24
+ adc x23,xzr,xzr
+ ldr x24,[x2,x28] // next b[i] (or b[0])
+ adds x20,x20,x10
+ mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ adds x19,x19,x10
+ umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
+ adcs x20,x20,x11
+ umulh x11,x15,x25
+ adcs x21,x21,x12
+ umulh x12,x16,x25
+ adcs x22,x22,x13
+ adcs x23,x23,x0
+ umulh x13,x17,x25
+ adc x0,xzr,xzr
+ ldr x25,[sp,x28] // next t[0]*n0
+ str x19,[x26],#8 // result!!!
+ adds x19,x20,x10
+ sub x10,x27,x1 // done yet?
+ adcs x20,x21,x11
+ adcs x21,x22,x12
+ adcs x22,x23,x13
+ //adc x0,x0,xzr
+ cbnz x28,.Loop_mul4x_1st_tail
+
+ sub x11,x27,x5 // rewinded x1
+ cbz x10,.Lmul4x_proceed
+
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ ldp x14,x15,[x3,#8*0]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ b .Loop_mul4x_1st_tail
+
+.align 5
+.Lmul4x_proceed:
+ ldr x24,[x2,#8*4]! // *++b
+ adc x30,x0,xzr
+ ldp x6,x7,[x11,#8*0] // a[0..3]
+ sub x3,x3,x5 // rewind np
+ ldp x8,x9,[x11,#8*2]
+ add x1,x11,#8*4
+
+ stp x19,x20,[x26,#8*0] // result!!!
+ ldp x19,x20,[sp,#8*4] // t[0..3]
+ stp x21,x22,[x26,#8*2] // result!!!
+ ldp x21,x22,[sp,#8*6]
+
+ ldp x14,x15,[x3,#8*0] // n[0..3]
+ mov x26,sp
+ ldp x16,x17,[x3,#8*2]
+ adds x3,x3,#8*4 // clear carry bit
+ mov x0,xzr
+
+.align 4
+.Loop_mul4x_reduction:
+ mul x10,x6,x24 // lo(a[0..3]*b[4])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[0..3]*b[4])
+ adcs x20,x20,x11
+ mul x25,x19,x4 // t[0]*n0
+ adcs x21,x21,x12
+ umulh x11,x7,x24
+ adcs x22,x22,x13
+ umulh x12,x8,x24
+ adc x23,xzr,xzr
+ umulh x13,x9,x24
+ ldr x24,[x2,x28] // next b[i]
+ adds x20,x20,x10
+ // (*) mul x10,x14,x25
+ str x25,[x26],#8 // put aside t[0]*n0 for tail processing
+ adcs x21,x21,x11
+	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ // (*) adds xzr,x19,x10
+ subs xzr,x19,#1 // (*)
+	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
+ adcs x19,x20,x11
+ umulh x11,x15,x25
+ adcs x20,x21,x12
+ umulh x12,x16,x25
+ adcs x21,x22,x13
+ umulh x13,x17,x25
+ adcs x22,x23,x0
+ adc x0,xzr,xzr
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ cbnz x28,.Loop_mul4x_reduction
+
+ adc x0,x0,xzr
+ ldp x10,x11,[x26,#8*4] // t[4..7]
+ ldp x12,x13,[x26,#8*6]
+ ldp x6,x7,[x1,#8*0] // a[4..7]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+
+ ldr x25,[sp] // t[0]*n0
+ ldp x14,x15,[x3,#8*0] // n[4..7]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+
+.align 4
+.Loop_mul4x_tail:
+ mul x10,x6,x24 // lo(a[4..7]*b[4])
+ adc x0,x0,xzr // modulo-scheduled
+ mul x11,x7,x24
+ add x28,x28,#8
+ mul x12,x8,x24
+ and x28,x28,#31
+ mul x13,x9,x24
+ adds x19,x19,x10
+ umulh x10,x6,x24 // hi(a[4..7]*b[4])
+ adcs x20,x20,x11
+ umulh x11,x7,x24
+ adcs x21,x21,x12
+ umulh x12,x8,x24
+ adcs x22,x22,x13
+ umulh x13,x9,x24
+ adc x23,xzr,xzr
+ ldr x24,[x2,x28] // next b[i]
+ adds x20,x20,x10
+ mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
+ adcs x21,x21,x11
+ mul x11,x15,x25
+ adcs x22,x22,x12
+ mul x12,x16,x25
+ adc x23,x23,x13 // can't overflow
+ mul x13,x17,x25
+ adds x19,x19,x10
+ umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
+ adcs x20,x20,x11
+ umulh x11,x15,x25
+ adcs x21,x21,x12
+ umulh x12,x16,x25
+ adcs x22,x22,x13
+ umulh x13,x17,x25
+ adcs x23,x23,x0
+ ldr x25,[sp,x28] // next a[0]*n0
+ adc x0,xzr,xzr
+ str x19,[x26],#8 // result!!!
+ adds x19,x20,x10
+ sub x10,x27,x1 // done yet?
+ adcs x20,x21,x11
+ adcs x21,x22,x12
+ adcs x22,x23,x13
+ //adc x0,x0,xzr
+ cbnz x28,.Loop_mul4x_tail
+
+ sub x11,x3,x5 // rewinded np?
+ adc x0,x0,xzr
+ cbz x10,.Loop_mul4x_break
+
+ ldp x10,x11,[x26,#8*4]
+ ldp x12,x13,[x26,#8*6]
+ ldp x6,x7,[x1,#8*0]
+ ldp x8,x9,[x1,#8*2]
+ add x1,x1,#8*4
+ adds x19,x19,x10
+ adcs x20,x20,x11
+ adcs x21,x21,x12
+ adcs x22,x22,x13
+ //adc x0,x0,xzr
+ ldp x14,x15,[x3,#8*0]
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ b .Loop_mul4x_tail
+
+.align 4
+.Loop_mul4x_break:
+ ldp x12,x13,[x29,#96] // pull rp and &b[num]
+ adds x19,x19,x30
+ add x2,x2,#8*4 // bp++
+ adcs x20,x20,xzr
+ sub x1,x1,x5 // rewind ap
+ adcs x21,x21,xzr
+ stp x19,x20,[x26,#8*0] // result!!!
+ adcs x22,x22,xzr
+ ldp x19,x20,[sp,#8*4] // t[0..3]
+ adc x30,x0,xzr
+ stp x21,x22,[x26,#8*2] // result!!!
+ cmp x2,x13 // done yet?
+ ldp x21,x22,[sp,#8*6]
+ ldp x14,x15,[x11,#8*0] // n[0..3]
+ ldp x16,x17,[x11,#8*2]
+ add x3,x11,#8*4
+ b.eq .Lmul4x_post
+
+ ldr x24,[x2]
+ ldp x6,x7,[x1,#8*0] // a[0..3]
+ ldp x8,x9,[x1,#8*2]
+ adds x1,x1,#8*4 // clear carry bit
+ mov x0,xzr
+ mov x26,sp
+ b .Loop_mul4x_reduction
+
+.align 4
+.Lmul4x_post:
+ // Final step. We see if result is larger than modulus, and
+ // if it is, subtract the modulus. But comparison implies
+ // subtraction. So we subtract modulus, see if it borrowed,
+ // and conditionally copy original value.
+ mov x0,x12
+ mov x27,x12 // x0 copy
+ subs x10,x19,x14
+ add x26,sp,#8*8
+ sbcs x11,x20,x15
+ sub x28,x5,#8*4
+
+.Lmul4x_sub:
+ sbcs x12,x21,x16
+ ldp x14,x15,[x3,#8*0]
+ sub x28,x28,#8*4
+ ldp x19,x20,[x26,#8*0]
+ sbcs x13,x22,x17
+ ldp x16,x17,[x3,#8*2]
+ add x3,x3,#8*4
+ ldp x21,x22,[x26,#8*2]
+ add x26,x26,#8*4
+ stp x10,x11,[x0,#8*0]
+ sbcs x10,x19,x14
+ stp x12,x13,[x0,#8*2]
+ add x0,x0,#8*4
+ sbcs x11,x20,x15
+ cbnz x28,.Lmul4x_sub
+
+ sbcs x12,x21,x16
+ mov x26,sp
+ add x1,sp,#8*4
+ ldp x6,x7,[x27,#8*0]
+ sbcs x13,x22,x17
+ stp x10,x11,[x0,#8*0]
+ ldp x8,x9,[x27,#8*2]
+ stp x12,x13,[x0,#8*2]
+ ldp x19,x20,[x1,#8*0]
+ ldp x21,x22,[x1,#8*2]
+ sbcs xzr,x30,xzr // did it borrow?
+ ldr x30,[x29,#8] // pull return address
+
+ sub x28,x5,#8*4
+.Lmul4x_cond_copy:
+ sub x28,x28,#8*4
+ csel x10,x19,x6,lo
+ stp xzr,xzr,[x26,#8*0]
+ csel x11,x20,x7,lo
+ ldp x6,x7,[x27,#8*4]
+ ldp x19,x20,[x1,#8*4]
+ csel x12,x21,x8,lo
+ stp xzr,xzr,[x26,#8*2]
+ add x26,x26,#8*4
+ csel x13,x22,x9,lo
+ ldp x8,x9,[x27,#8*6]
+ ldp x21,x22,[x1,#8*6]
+ add x1,x1,#8*4
+ stp x10,x11,[x27,#8*0]
+ stp x12,x13,[x27,#8*2]
+ add x27,x27,#8*4
+ cbnz x28,.Lmul4x_cond_copy
+
+ csel x10,x19,x6,lo
+ stp xzr,xzr,[x26,#8*0]
+ csel x11,x20,x7,lo
+ stp xzr,xzr,[x26,#8*2]
+ csel x12,x21,x8,lo
+ stp xzr,xzr,[x26,#8*3]
+ csel x13,x22,x9,lo
+ stp xzr,xzr,[x26,#8*4]
+ stp x10,x11,[x27,#8*0]
+ stp x12,x13,[x27,#8*2]
+
+ b .Lmul4x_done
+
+.align 4
+.Lmul4x4_post_condition:
+ adc x0,x0,xzr
+ ldr x1,[x29,#96] // pull rp
+	// x19-x22,x0 hold result, x14-x17 hold modulus
+ subs x6,x19,x14
+ ldr x30,[x29,#8] // pull return address
+ sbcs x7,x20,x15
+ stp xzr,xzr,[sp,#8*0]
+ sbcs x8,x21,x16
+ stp xzr,xzr,[sp,#8*2]
+ sbcs x9,x22,x17
+ stp xzr,xzr,[sp,#8*4]
+ sbcs xzr,x0,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*6]
+
+	// x6-x9 hold result-modulus
+ csel x6,x19,x6,lo
+ csel x7,x20,x7,lo
+ csel x8,x21,x8,lo
+ csel x9,x22,x9,lo
+ stp x6,x7,[x1,#8*0]
+ stp x8,x9,[x1,#8*2]
+
+.Lmul4x_done:
+ ldp x19,x20,[x29,#16]
+ mov sp,x29
+ ldp x21,x22,[x29,#32]
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+ // x30 is popped earlier
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size __bn_mul4x_mont,.-__bn_mul4x_mont
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 4
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S
@@ -1,0 +1,346 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+.globl gcm_init_neon
+.hidden gcm_init_neon
+.type gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+ AARCH64_VALID_CALL_TARGET
+ // This function is adapted from gcm_init_v8. xC2 is t3.
+ ld1 {v17.2d}, [x1] // load H
+ movi v19.16b, #0xe1
+ shl v19.2d, v19.2d, #57 // 0xc2.0
+ ext v3.16b, v17.16b, v17.16b, #8
+ ushr v18.2d, v19.2d, #63
+ dup v17.4s, v17.s[1]
+ ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01
+ ushr v18.2d, v3.2d, #63
+ sshr v17.4s, v17.4s, #31 // broadcast carry bit
+ and v18.16b, v18.16b, v16.16b
+ shl v3.2d, v3.2d, #1
+ ext v18.16b, v18.16b, v18.16b, #8
+ and v16.16b, v16.16b, v17.16b
+ orr v3.16b, v3.16b, v18.16b // H<<<=1
+ eor v5.16b, v3.16b, v16.16b // twisted H
+ st1 {v5.2d}, [x0] // store Htable[0]
+ ret
+.size gcm_init_neon,.-gcm_init_neon
+
+.globl gcm_gmult_neon
+.hidden gcm_gmult_neon
+.type gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v3.16b}, [x0] // load Xi
+ ld1 {v5.1d}, [x1], #8 // load twisted H
+ ld1 {v6.1d}, [x1]
+ adrp x9, .Lmasks // load constants
+ add x9, x9, :lo12:.Lmasks
+ ld1 {v24.2d, v25.2d}, [x9]
+ rev64 v3.16b, v3.16b // byteswap Xi
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
+
+ mov x3, #16
+ b .Lgmult_neon
+.size gcm_gmult_neon,.-gcm_gmult_neon
+
+.globl gcm_ghash_neon
+.hidden gcm_ghash_neon
+.type gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v0.16b}, [x0] // load Xi
+ ld1 {v5.1d}, [x1], #8 // load twisted H
+ ld1 {v6.1d}, [x1]
+ adrp x9, .Lmasks // load constants
+ add x9, x9, :lo12:.Lmasks
+ ld1 {v24.2d, v25.2d}, [x9]
+ rev64 v0.16b, v0.16b // byteswap Xi
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing
+
+.Loop_neon:
+ ld1 {v3.16b}, [x2], #16 // load inp
+ rev64 v3.16b, v3.16b // byteswap inp
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v3.16b, v3.16b, v0.16b // inp ^= Xi
+
+.Lgmult_neon:
+ // Split the input into v3 and v4. (The upper halves are unused,
+ // so it is okay to leave them alone.)
+ ins v4.d[0], v3.d[1]
+ ext v16.8b, v5.8b, v5.8b, #1 // A1
+ pmull v16.8h, v16.8b, v3.8b // F = A1*B
+ ext v0.8b, v3.8b, v3.8b, #1 // B1
+ pmull v0.8h, v5.8b, v0.8b // E = A*B1
+ ext v17.8b, v5.8b, v5.8b, #2 // A2
+ pmull v17.8h, v17.8b, v3.8b // H = A2*B
+ ext v19.8b, v3.8b, v3.8b, #2 // B2
+ pmull v19.8h, v5.8b, v19.8b // G = A*B2
+ ext v18.8b, v5.8b, v5.8b, #3 // A3
+ eor v16.16b, v16.16b, v0.16b // L = E + F
+ pmull v18.8h, v18.8b, v3.8b // J = A3*B
+ ext v0.8b, v3.8b, v3.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v0.8h, v5.8b, v0.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v3.8b, v3.8b, #4 // B4
+ eor v18.16b, v18.16b, v0.16b // N = I + J
+ pmull v19.8h, v5.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v0.8h, v5.8b, v3.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v0.16b, v0.16b, v16.16b
+ eor v0.16b, v0.16b, v18.16b
+ eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing
+ ext v16.8b, v7.8b, v7.8b, #1 // A1
+ pmull v16.8h, v16.8b, v3.8b // F = A1*B
+ ext v1.8b, v3.8b, v3.8b, #1 // B1
+ pmull v1.8h, v7.8b, v1.8b // E = A*B1
+ ext v17.8b, v7.8b, v7.8b, #2 // A2
+ pmull v17.8h, v17.8b, v3.8b // H = A2*B
+ ext v19.8b, v3.8b, v3.8b, #2 // B2
+ pmull v19.8h, v7.8b, v19.8b // G = A*B2
+ ext v18.8b, v7.8b, v7.8b, #3 // A3
+ eor v16.16b, v16.16b, v1.16b // L = E + F
+ pmull v18.8h, v18.8b, v3.8b // J = A3*B
+ ext v1.8b, v3.8b, v3.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v1.8h, v7.8b, v1.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v3.8b, v3.8b, #4 // B4
+ eor v18.16b, v18.16b, v1.16b // N = I + J
+ pmull v19.8h, v7.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v1.8h, v7.8b, v3.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v1.16b, v1.16b, v16.16b
+ eor v1.16b, v1.16b, v18.16b
+ ext v16.8b, v6.8b, v6.8b, #1 // A1
+ pmull v16.8h, v16.8b, v4.8b // F = A1*B
+ ext v2.8b, v4.8b, v4.8b, #1 // B1
+ pmull v2.8h, v6.8b, v2.8b // E = A*B1
+ ext v17.8b, v6.8b, v6.8b, #2 // A2
+ pmull v17.8h, v17.8b, v4.8b // H = A2*B
+ ext v19.8b, v4.8b, v4.8b, #2 // B2
+ pmull v19.8h, v6.8b, v19.8b // G = A*B2
+ ext v18.8b, v6.8b, v6.8b, #3 // A3
+ eor v16.16b, v16.16b, v2.16b // L = E + F
+ pmull v18.8h, v18.8b, v4.8b // J = A3*B
+ ext v2.8b, v4.8b, v4.8b, #3 // B3
+ eor v17.16b, v17.16b, v19.16b // M = G + H
+ pmull v2.8h, v6.8b, v2.8b // I = A*B3
+
+ // Here we diverge from the 32-bit version. It computes the following
+ // (instructions reordered for clarity):
+ //
+ // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L)
+ // vand $t0#hi, $t0#hi, $k48
+ // veor $t0#lo, $t0#lo, $t0#hi
+ //
+ // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M)
+ // vand $t1#hi, $t1#hi, $k32
+ // veor $t1#lo, $t1#lo, $t1#hi
+ //
+ // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N)
+ // vand $t2#hi, $t2#hi, $k16
+ // veor $t2#lo, $t2#lo, $t2#hi
+ //
+ // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K)
+ // vmov.i64 $t3#hi, #0
+ //
+ // $kN is a mask with the bottom N bits set. AArch64 cannot compute on
+ // upper halves of SIMD registers, so we must split each half into
+ // separate registers. To compensate, we pair computations up and
+ // parallelize.
+
+ ext v19.8b, v4.8b, v4.8b, #4 // B4
+ eor v18.16b, v18.16b, v2.16b // N = I + J
+ pmull v19.8h, v6.8b, v19.8b // K = A*B4
+
+ // This can probably be scheduled more efficiently. For now, we just
+ // pair up independent instructions.
+ zip1 v20.2d, v16.2d, v17.2d
+ zip1 v22.2d, v18.2d, v19.2d
+ zip2 v21.2d, v16.2d, v17.2d
+ zip2 v23.2d, v18.2d, v19.2d
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ and v21.16b, v21.16b, v24.16b
+ and v23.16b, v23.16b, v25.16b
+ eor v20.16b, v20.16b, v21.16b
+ eor v22.16b, v22.16b, v23.16b
+ zip1 v16.2d, v20.2d, v21.2d
+ zip1 v18.2d, v22.2d, v23.2d
+ zip2 v17.2d, v20.2d, v21.2d
+ zip2 v19.2d, v22.2d, v23.2d
+
+ ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8
+ ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16
+ pmull v2.8h, v6.8b, v4.8b // D = A*B
+ ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32
+ ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24
+ eor v16.16b, v16.16b, v17.16b
+ eor v18.16b, v18.16b, v19.16b
+ eor v2.16b, v2.16b, v16.16b
+ eor v2.16b, v2.16b, v18.16b
+ ext v16.16b, v0.16b, v2.16b, #8
+ eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing
+ eor v1.16b, v1.16b, v2.16b
+ eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi
+ ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result
+ // This is a no-op due to the ins instruction below.
+ // ins v2.d[0], v1.d[1]
+
+ // equivalent of reduction_avx from ghash-x86_64.pl
+ shl v17.2d, v0.2d, #57 // 1st phase
+ shl v18.2d, v0.2d, #62
+ eor v18.16b, v18.16b, v17.16b //
+ shl v17.2d, v0.2d, #63
+ eor v18.16b, v18.16b, v17.16b //
+ // Note Xm contains {Xl.d[1], Xh.d[0]}.
+ eor v18.16b, v18.16b, v1.16b
+ ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0]
+ ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1]
+
+ ushr v18.2d, v0.2d, #1 // 2nd phase
+ eor v2.16b, v2.16b,v0.16b
+ eor v0.16b, v0.16b,v18.16b //
+ ushr v18.2d, v18.2d, #6
+ ushr v0.2d, v0.2d, #1 //
+ eor v0.16b, v0.16b, v2.16b //
+ eor v0.16b, v0.16b, v18.16b //
+
+ subs x3, x3, #16
+ bne .Loop_neon
+
+ rev64 v0.16b, v0.16b // byteswap Xi and write
+ ext v0.16b, v0.16b, v0.16b, #8
+ st1 {v0.16b}, [x0]
+
+ ret
+.size gcm_ghash_neon,.-gcm_ghash_neon
+
+.section .rodata
+.align 4
+.Lmasks:
+.quad 0x0000ffffffffffff // k48
+.quad 0x00000000ffffffff // k32
+.quad 0x000000000000ffff // k16
+.quad 0x0000000000000000 // k0
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
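
The "Karatsuba pre-processing" / "post-processing" comments in gcm_gmult_neon
and gcm_ghash_neon above refer to building a 128x128-bit carry-less product out
of three 64x64-bit ones; addition in GF(2)[x] is XOR, so the middle term comes
from multiplying the XORed halves and folding the low and high products back
out. A small C model of just that decomposition, with a hypothetical
bit-at-a-time clmul64 standing in for the PMULL-based multiplies the assembly
actually performs:

#include <stdint.h>

/* Hypothetical helper: carry-less 64x64 -> 128-bit multiply, bit by bit. */
static void clmul64(uint64_t a, uint64_t b, uint64_t out[2]) {
  uint64_t lo = 0, hi = 0;
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) {
      lo ^= a << i;
      if (i != 0) {
        hi ^= a >> (64 - i);
      }
    }
  }
  out[0] = lo;
  out[1] = hi;
}

/* Karatsuba split used by the NEON GHASH code: three 64x64 products give
 * the full 128x128 product; XOR plays the role of addition. */
static void clmul128_karatsuba(const uint64_t a[2], const uint64_t b[2],
                               uint64_t out[4]) {
  uint64_t lo[2], hi[2], mid[2];
  clmul64(a[0], b[0], lo);                 /* Xl = A.lo * B.lo */
  clmul64(a[1], b[1], hi);                 /* Xh = A.hi * B.hi */
  clmul64(a[0] ^ a[1], b[0] ^ b[1], mid);  /* "Karatsuba pre-processing" */
  mid[0] ^= lo[0] ^ hi[0];                 /* "Karatsuba post-processing" */
  mid[1] ^= lo[1] ^ hi[1];
  out[0] = lo[0];
  out[1] = lo[1] ^ mid[0];                 /* Xm overlaps Xl.hi ... */
  out[2] = hi[0] ^ mid[1];                 /* ... and Xh.lo */
  out[3] = hi[1];
}

The reduction back to 128 bits (the shl/ushr sequence after "equivalent of
reduction_avx") is a separate step and is not modelled here.
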
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S
@@ -1,0 +1,576 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch armv8-a+crypto
+.globl gcm_init_v8
+.hidden gcm_init_v8
+.type gcm_init_v8,%function
+.align 4
+gcm_init_v8:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v17.2d},[x1] //load input H
+ movi v19.16b,#0xe1
+ shl v19.2d,v19.2d,#57 //0xc2.0
+ ext v3.16b,v17.16b,v17.16b,#8
+ ushr v18.2d,v19.2d,#63
+ dup v17.4s,v17.s[1]
+ ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01
+ ushr v18.2d,v3.2d,#63
+ sshr v17.4s,v17.4s,#31 //broadcast carry bit
+ and v18.16b,v18.16b,v16.16b
+ shl v3.2d,v3.2d,#1
+ ext v18.16b,v18.16b,v18.16b,#8
+ and v16.16b,v16.16b,v17.16b
+ orr v3.16b,v3.16b,v18.16b //H<<<=1
+ eor v20.16b,v3.16b,v16.16b //twisted H
+ st1 {v20.2d},[x0],#16 //store Htable[0]
+
+ //calculate H^2
+ ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing
+ pmull v0.1q,v20.1d,v20.1d
+ eor v16.16b,v16.16b,v20.16b
+ pmull2 v2.1q,v20.2d,v20.2d
+ pmull v1.1q,v16.1d,v16.1d
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v22.16b,v0.16b,v18.16b
+
+ ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing
+ eor v17.16b,v17.16b,v22.16b
+ ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
+ st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2]
+ //calculate H^3 and H^4
+ pmull v0.1q,v20.1d, v22.1d
+ pmull v5.1q,v22.1d,v22.1d
+ pmull2 v2.1q,v20.2d, v22.2d
+ pmull2 v7.1q,v22.2d,v22.2d
+ pmull v1.1q,v16.1d,v17.1d
+ pmull v6.1q,v17.1d,v17.1d
+
+ ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ ext v17.16b,v5.16b,v7.16b,#8
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v16.16b
+ eor v4.16b,v5.16b,v7.16b
+ eor v6.16b,v6.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase
+ eor v6.16b,v6.16b,v4.16b
+ pmull v4.1q,v5.1d,v19.1d
+
+ ins v2.d[0],v1.d[1]
+ ins v7.d[0],v6.d[1]
+ ins v1.d[1],v0.d[0]
+ ins v6.d[1],v5.d[0]
+ eor v0.16b,v1.16b,v18.16b
+ eor v5.16b,v6.16b,v4.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase
+ ext v4.16b,v5.16b,v5.16b,#8
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v5.1q,v5.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v4.16b,v4.16b,v7.16b
+ eor v20.16b, v0.16b,v18.16b //H^3
+ eor v22.16b,v5.16b,v4.16b //H^4
+
+ ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing
+ ext v17.16b,v22.16b,v22.16b,#8
+ eor v16.16b,v16.16b,v20.16b
+ eor v17.16b,v17.16b,v22.16b
+ ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed
+ st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5]
+ ret
+.size gcm_init_v8,.-gcm_init_v8
+.globl gcm_gmult_v8
+.hidden gcm_gmult_v8
+.type gcm_gmult_v8,%function
+.align 4
+gcm_gmult_v8:
+ AARCH64_VALID_CALL_TARGET
+ ld1 {v17.2d},[x0] //load Xi
+ movi v19.16b,#0xe1
+ ld1 {v20.2d,v21.2d},[x1] //load twisted H, ...
+ shl v19.2d,v19.2d,#57
+#ifndef __ARMEB__
+ rev64 v17.16b,v17.16b
+#endif
+ ext v3.16b,v17.16b,v17.16b,#8
+
+ pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
+ eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
+ pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+
+#ifndef __ARMEB__
+ rev64 v0.16b,v0.16b
+#endif
+ ext v0.16b,v0.16b,v0.16b,#8
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+.size gcm_gmult_v8,.-gcm_gmult_v8
+.globl gcm_ghash_v8
+.hidden gcm_ghash_v8
+.type gcm_ghash_v8,%function
+.align 4
+gcm_ghash_v8:
+ AARCH64_VALID_CALL_TARGET
+ cmp x3,#64
+ b.hs .Lgcm_ghash_v8_4x
+ ld1 {v0.2d},[x0] //load [rotated] Xi
+ //"[rotated]" means that
+ //loaded value would have
+ //to be rotated in order to
+ //make it appear as in
+ //algorithm specification
+ subs x3,x3,#32 //see if x3 is 32 or larger
+ mov x12,#16 //x12 is used as post-
+ //increment for input pointer;
+ //as loop is modulo-scheduled
+ //x12 is zeroed just in time
+ //to preclude overstepping
+ //inp[len], which means that
+ //last block[s] are actually
+ //loaded twice, but last
+ //copy is not processed
+ ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2
+ movi v19.16b,#0xe1
+ ld1 {v22.2d},[x1]
+ csel x12,xzr,x12,eq //is it time to zero x12?
+ ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi
+ ld1 {v16.2d},[x2],#16 //load [rotated] I[0]
+ shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
+#ifndef __ARMEB__
+ rev64 v16.16b,v16.16b
+ rev64 v0.16b,v0.16b
+#endif
+ ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0]
+ b.lo .Lodd_tail_v8 //x3 was less than 32
+ ld1 {v17.2d},[x2],x12 //load [rotated] I[1]
+#ifndef __ARMEB__
+ rev64 v17.16b,v17.16b
+#endif
+ ext v7.16b,v17.16b,v17.16b,#8
+ eor v3.16b,v3.16b,v0.16b //I[i]^=Xi
+ pmull v4.1q,v20.1d,v7.1d //H·Ii+1
+ eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
+ pmull2 v6.1q,v20.2d,v7.2d
+ b .Loop_mod2x_v8
+
+.align 4
+.Loop_mod2x_v8:
+ ext v18.16b,v3.16b,v3.16b,#8
+ subs x3,x3,#32 //is there more data?
+ pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo
+ csel x12,xzr,x12,lo //is it time to zero x12?
+
+ pmull v5.1q,v21.1d,v17.1d
+ eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi
+ eor v0.16b,v0.16b,v4.16b //accumulate
+ pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2]
+
+ eor v2.16b,v2.16b,v6.16b
+ csel x12,xzr,x12,eq //is it time to zero x12?
+ eor v1.16b,v1.16b,v5.16b
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3]
+#ifndef __ARMEB__
+ rev64 v16.16b,v16.16b
+#endif
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+#ifndef __ARMEB__
+ rev64 v17.16b,v17.16b
+#endif
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v7.16b,v17.16b,v17.16b,#8
+ ext v3.16b,v16.16b,v16.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+ pmull v4.1q,v20.1d,v7.1d //H·Ii+1
+ eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v3.16b,v3.16b,v18.16b
+ eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing
+ eor v3.16b,v3.16b,v0.16b
+ pmull2 v6.1q,v20.2d,v7.2d
+ b.hs .Loop_mod2x_v8 //there was at least 32 more bytes
+
+ eor v2.16b,v2.16b,v18.16b
+ ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b
+ adds x3,x3,#32 //re-construct x3
+ eor v0.16b,v0.16b,v2.16b //re-construct v0.16b
+ b.eq .Ldone_v8 //is x3 zero?
+.Lodd_tail_v8:
+ ext v18.16b,v0.16b,v0.16b,#8
+ eor v3.16b,v3.16b,v0.16b //inp^=Xi
+ eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi
+
+ pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo
+ eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing
+ pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi
+ pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+
+.Ldone_v8:
+#ifndef __ARMEB__
+ rev64 v0.16b,v0.16b
+#endif
+ ext v0.16b,v0.16b,v0.16b,#8
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+.size gcm_ghash_v8,.-gcm_ghash_v8
+.type gcm_ghash_v8_4x,%function
+.align 4
+gcm_ghash_v8_4x:
+.Lgcm_ghash_v8_4x:
+ ld1 {v0.2d},[x0] //load [rotated] Xi
+ ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2
+ movi v19.16b,#0xe1
+ ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4
+ shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant
+
+ ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+#ifndef __ARMEB__
+ rev64 v0.16b,v0.16b
+ rev64 v5.16b,v5.16b
+ rev64 v6.16b,v6.16b
+ rev64 v7.16b,v7.16b
+ rev64 v4.16b,v4.16b
+#endif
+ ext v25.16b,v7.16b,v7.16b,#8
+ ext v24.16b,v6.16b,v6.16b,#8
+ ext v23.16b,v5.16b,v5.16b,#8
+
+ pmull v29.1q,v20.1d,v25.1d //H·Ii+3
+ eor v7.16b,v7.16b,v25.16b
+ pmull2 v31.1q,v20.2d,v25.2d
+ pmull v30.1q,v21.1d,v7.1d
+
+ pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
+ eor v6.16b,v6.16b,v24.16b
+ pmull2 v24.1q,v22.2d,v24.2d
+ pmull2 v6.1q,v21.2d,v6.2d
+
+ eor v29.16b,v29.16b,v16.16b
+ eor v31.16b,v31.16b,v24.16b
+ eor v30.16b,v30.16b,v6.16b
+
+ pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+ pmull2 v23.1q,v26.2d,v23.2d
+ pmull v5.1q,v27.1d,v5.1d
+
+ eor v29.16b,v29.16b,v7.16b
+ eor v31.16b,v31.16b,v23.16b
+ eor v30.16b,v30.16b,v5.16b
+
+ subs x3,x3,#128
+ b.lo .Ltail4x
+
+ b .Loop4x
+
+.align 4
+.Loop4x:
+ eor v16.16b,v4.16b,v0.16b
+ ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
+ ext v3.16b,v16.16b,v16.16b,#8
+#ifndef __ARMEB__
+ rev64 v5.16b,v5.16b
+ rev64 v6.16b,v6.16b
+ rev64 v7.16b,v7.16b
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v28.2d,v3.2d
+ ext v25.16b,v7.16b,v7.16b,#8
+ pmull2 v1.1q,v27.2d,v16.2d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ ext v24.16b,v6.16b,v6.16b,#8
+ eor v1.16b,v1.16b,v30.16b
+ ext v23.16b,v5.16b,v5.16b,#8
+
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ pmull v29.1q,v20.1d,v25.1d //H·Ii+3
+ eor v7.16b,v7.16b,v25.16b
+ eor v1.16b,v1.16b,v17.16b
+ pmull2 v31.1q,v20.2d,v25.2d
+ eor v1.16b,v1.16b,v18.16b
+ pmull v30.1q,v21.1d,v7.1d
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2
+ eor v6.16b,v6.16b,v24.16b
+ pmull2 v24.1q,v22.2d,v24.2d
+ eor v0.16b,v1.16b,v18.16b
+ pmull2 v6.1q,v21.2d,v6.2d
+
+ eor v29.16b,v29.16b,v16.16b
+ eor v31.16b,v31.16b,v24.16b
+ eor v30.16b,v30.16b,v6.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+ eor v18.16b,v18.16b,v2.16b
+ pmull2 v23.1q,v26.2d,v23.2d
+ pmull v5.1q,v27.1d,v5.1d
+
+ eor v0.16b,v0.16b,v18.16b
+ eor v29.16b,v29.16b,v7.16b
+ eor v31.16b,v31.16b,v23.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+ eor v30.16b,v30.16b,v5.16b
+
+ subs x3,x3,#64
+ b.hs .Loop4x
+
+.Ltail4x:
+ eor v16.16b,v4.16b,v0.16b
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v28.2d,v3.2d
+ pmull2 v1.1q,v27.2d,v16.2d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ eor v1.16b,v1.16b,v30.16b
+
+ adds x3,x3,#64
+ b.eq .Ldone4x
+
+ cmp x3,#32
+ b.lo .Lone
+ b.eq .Ltwo
+.Lthree:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v4.2d,v5.2d,v6.2d},[x2]
+ eor v1.16b,v1.16b,v18.16b
+#ifndef __ARMEB__
+ rev64 v5.16b,v5.16b
+ rev64 v6.16b,v6.16b
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v24.16b,v6.16b,v6.16b,#8
+ ext v23.16b,v5.16b,v5.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+
+ pmull v29.1q,v20.1d,v24.1d //H·Ii+2
+ eor v6.16b,v6.16b,v24.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ pmull2 v31.1q,v20.2d,v24.2d
+ pmull v30.1q,v21.1d,v6.1d
+ eor v0.16b,v0.16b,v18.16b
+ pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+ pmull2 v23.1q,v22.2d,v23.2d
+ eor v16.16b,v4.16b,v0.16b
+ pmull2 v5.1q,v21.2d,v5.2d
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ eor v29.16b,v29.16b,v7.16b
+ eor v31.16b,v31.16b,v23.16b
+ eor v30.16b,v30.16b,v5.16b
+
+ pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v26.2d,v3.2d
+ pmull v1.1q,v27.1d,v16.1d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ eor v1.16b,v1.16b,v30.16b
+ b .Ldone4x
+
+.align 4
+.Ltwo:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v4.2d,v5.2d},[x2]
+ eor v1.16b,v1.16b,v18.16b
+#ifndef __ARMEB__
+ rev64 v5.16b,v5.16b
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ ext v23.16b,v5.16b,v5.16b,#8
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+ pmull v29.1q,v20.1d,v23.1d //H·Ii+1
+ eor v5.16b,v5.16b,v23.16b
+
+ eor v16.16b,v4.16b,v0.16b
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ pmull2 v31.1q,v20.2d,v23.2d
+ pmull v30.1q,v21.1d,v5.1d
+
+ pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii)
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v22.2d,v3.2d
+ pmull2 v1.1q,v21.2d,v16.2d
+
+ eor v0.16b,v0.16b,v29.16b
+ eor v2.16b,v2.16b,v31.16b
+ eor v1.16b,v1.16b,v30.16b
+ b .Ldone4x
+
+.align 4
+.Lone:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ ld1 {v4.2d},[x2]
+ eor v1.16b,v1.16b,v18.16b
+#ifndef __ARMEB__
+ rev64 v4.16b,v4.16b
+#endif
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+ eor v16.16b,v4.16b,v0.16b
+ ext v3.16b,v16.16b,v16.16b,#8
+
+ pmull v0.1q,v20.1d,v3.1d
+ eor v16.16b,v16.16b,v3.16b
+ pmull2 v2.1q,v20.2d,v3.2d
+ pmull v1.1q,v21.1d,v16.1d
+
+.Ldone4x:
+ ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing
+ eor v18.16b,v0.16b,v2.16b
+ eor v1.16b,v1.16b,v17.16b
+ eor v1.16b,v1.16b,v18.16b
+
+ pmull v18.1q,v0.1d,v19.1d //1st phase of reduction
+ ins v2.d[0],v1.d[1]
+ ins v1.d[1],v0.d[0]
+ eor v0.16b,v1.16b,v18.16b
+
+ ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction
+ pmull v0.1q,v0.1d,v19.1d
+ eor v18.16b,v18.16b,v2.16b
+ eor v0.16b,v0.16b,v18.16b
+ ext v0.16b,v0.16b,v0.16b,#8
+
+#ifndef __ARMEB__
+ rev64 v0.16b,v0.16b
+#endif
+ st1 {v0.2d},[x0] //write out Xi
+
+ ret
+.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
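
ghashv8-armx64.S is the PMULL path: gcm_init_v8 precomputes the "twisted" H,
H^2, H^3 and H^4 (plus the packed Karatsuba halves) into Htable so that
gcm_ghash_v8_4x can fold four input blocks per reduction, falling back to the
2x/1x code above for short inputs. Application code never calls these symbols
directly; they sit underneath BoringSSL's AES-GCM. For context, a usage sketch
through the public EVP_AEAD interface (assuming <openssl/aead.h>; error
handling trimmed, and whether this assembly is actually reached depends on
runtime CPU capability detection):

#include <stddef.h>
#include <stdint.h>
#include <openssl/aead.h>

/* Seal one message with AES-128-GCM. The GHASH over AAD and ciphertext is
 * what ends up in gcm_ghash_v8 / gcm_ghash_v8_4x on CPUs with PMULL. */
static int seal_example(uint8_t *out, size_t *out_len, size_t max_out,
                        const uint8_t key[16], const uint8_t nonce[12],
                        const uint8_t *msg, size_t msg_len,
                        const uint8_t *aad, size_t aad_len) {
  EVP_AEAD_CTX ctx;
  if (!EVP_AEAD_CTX_init(&ctx, EVP_aead_aes_128_gcm(), key, 16,
                         EVP_AEAD_DEFAULT_TAG_LENGTH, NULL)) {
    return 0;
  }
  int ok = EVP_AEAD_CTX_seal(&ctx, out, out_len, max_out,
                             nonce, 12, msg, msg_len, aad, aad_len);
  EVP_AEAD_CTX_cleanup(&ctx);
  return ok;
}
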
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S
@@ -1,0 +1,1238 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+
+.hidden OPENSSL_armcap_P
+.globl sha1_block_data_order
+.hidden sha1_block_data_order
+.type sha1_block_data_order,%function
+.align 6
+sha1_block_data_order:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+ adrp x16,:pg_hi21_nc:OPENSSL_armcap_P
+#else
+ adrp x16,OPENSSL_armcap_P
+#endif
+ ldr w16,[x16,:lo12:OPENSSL_armcap_P]
+ tst w16,#ARMV8_SHA1
+ b.ne .Lv8_entry
+
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+
+ ldp w20,w21,[x0]
+ ldp w22,w23,[x0,#8]
+ ldr w24,[x0,#16]
+
+.Loop:
+ ldr x3,[x1],#64
+ movz w28,#0x7999
+ sub x2,x2,#1
+ movk w28,#0x5a82,lsl#16
+#ifdef __ARMEB__
+ ror x3,x3,#32
+#else
+ rev32 x3,x3
+#endif
+ add w24,w24,w28 // warm it up
+ add w24,w24,w3
+ lsr x4,x3,#32
+ ldr x5,[x1,#-56]
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w4 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+#ifdef __ARMEB__
+ ror x5,x5,#32
+#else
+ rev32 x5,x5
+#endif
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w5 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ lsr x6,x5,#32
+ ldr x7,[x1,#-48]
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w6 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+#ifdef __ARMEB__
+ ror x7,x7,#32
+#else
+ rev32 x7,x7
+#endif
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w7 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ lsr x8,x7,#32
+ ldr x9,[x1,#-40]
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ add w24,w24,w8 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+#ifdef __ARMEB__
+ ror x9,x9,#32
+#else
+ rev32 x9,x9
+#endif
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w9 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ lsr x10,x9,#32
+ ldr x11,[x1,#-32]
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w10 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+#ifdef __ARMEB__
+ ror x11,x11,#32
+#else
+ rev32 x11,x11
+#endif
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w11 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ lsr x12,x11,#32
+ ldr x13,[x1,#-24]
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w12 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+#ifdef __ARMEB__
+ ror x13,x13,#32
+#else
+ rev32 x13,x13
+#endif
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ add w24,w24,w13 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ lsr x14,x13,#32
+ ldr x15,[x1,#-16]
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ add w23,w23,w14 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+#ifdef __ARMEB__
+ ror x15,x15,#32
+#else
+ rev32 x15,x15
+#endif
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ add w22,w22,w15 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ lsr x16,x15,#32
+ ldr x17,[x1,#-8]
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ add w21,w21,w16 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+#ifdef __ARMEB__
+ ror x17,x17,#32
+#else
+ rev32 x17,x17
+#endif
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w17 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ lsr x19,x17,#32
+ eor w3,w3,w5
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ eor w3,w3,w11
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ eor w3,w3,w16
+ ror w22,w22,#2
+ add w24,w24,w19 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ bic w25,w23,w21
+ and w26,w22,w21
+ ror w27,w20,#27
+ eor w4,w4,w12
+ add w23,w23,w28 // future e+=K
+ orr w25,w25,w26
+ add w24,w24,w27 // e+=rot(a,5)
+ eor w4,w4,w17
+ ror w21,w21,#2
+ add w23,w23,w3 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ bic w25,w22,w20
+ and w26,w21,w20
+ ror w27,w24,#27
+ eor w5,w5,w13
+ add w22,w22,w28 // future e+=K
+ orr w25,w25,w26
+ add w23,w23,w27 // e+=rot(a,5)
+ eor w5,w5,w19
+ ror w20,w20,#2
+ add w22,w22,w4 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ bic w25,w21,w24
+ and w26,w20,w24
+ ror w27,w23,#27
+ eor w6,w6,w14
+ add w21,w21,w28 // future e+=K
+ orr w25,w25,w26
+ add w22,w22,w27 // e+=rot(a,5)
+ eor w6,w6,w3
+ ror w24,w24,#2
+ add w21,w21,w5 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ bic w25,w20,w23
+ and w26,w24,w23
+ ror w27,w22,#27
+ eor w7,w7,w15
+ add w20,w20,w28 // future e+=K
+ orr w25,w25,w26
+ add w21,w21,w27 // e+=rot(a,5)
+ eor w7,w7,w4
+ ror w23,w23,#2
+ add w20,w20,w6 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ movz w28,#0xeba1
+ movk w28,#0x6ed9,lsl#16
+ eor w8,w8,w10
+ bic w25,w24,w22
+ and w26,w23,w22
+ ror w27,w21,#27
+ eor w8,w8,w16
+ add w24,w24,w28 // future e+=K
+ orr w25,w25,w26
+ add w20,w20,w27 // e+=rot(a,5)
+ eor w8,w8,w5
+ ror w22,w22,#2
+ add w24,w24,w7 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w9,w9,w6
+ add w23,w23,w8 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w10,w10,w7
+ add w22,w22,w9 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w11,w11,w8
+ add w21,w21,w10 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ eor w12,w12,w14
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w12,w12,w9
+ add w20,w20,w11 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ eor w13,w13,w15
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w13,w13,w5
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w13,w13,w10
+ add w24,w24,w12 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ eor w14,w14,w16
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w14,w14,w6
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w14,w14,w11
+ add w23,w23,w13 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ eor w15,w15,w17
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w15,w15,w7
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w15,w15,w12
+ add w22,w22,w14 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ eor w16,w16,w19
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w16,w16,w8
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w16,w16,w13
+ add w21,w21,w15 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w17,w17,w14
+ add w20,w20,w16 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w19,w19,w15
+ add w24,w24,w17 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ eor w3,w3,w5
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w3,w3,w11
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w3,w3,w16
+ add w23,w23,w19 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w4,w4,w12
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w4,w4,w17
+ add w22,w22,w3 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w5,w5,w13
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w5,w5,w19
+ add w21,w21,w4 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w6,w6,w14
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w6,w6,w3
+ add w20,w20,w5 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w7,w7,w15
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w7,w7,w4
+ add w24,w24,w6 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ eor w8,w8,w10
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w8,w8,w16
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w8,w8,w5
+ add w23,w23,w7 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w9,w9,w6
+ add w22,w22,w8 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w10,w10,w7
+ add w21,w21,w9 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w11,w11,w8
+ add w20,w20,w10 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ movz w28,#0xbcdc
+ movk w28,#0x8f1b,lsl#16
+ eor w12,w12,w14
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w12,w12,w9
+ add w24,w24,w11 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w13,w13,w15
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w13,w13,w5
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w13,w13,w10
+ add w23,w23,w12 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w14,w14,w16
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w14,w14,w6
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w14,w14,w11
+ add w22,w22,w13 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w15,w15,w17
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w15,w15,w7
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w15,w15,w12
+ add w21,w21,w14 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w16,w16,w19
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w16,w16,w8
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w16,w16,w13
+ add w20,w20,w15 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w17,w17,w3
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w17,w17,w9
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w17,w17,w14
+ add w24,w24,w16 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w19,w19,w4
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w19,w19,w10
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w19,w19,w15
+ add w23,w23,w17 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w3,w3,w5
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w3,w3,w11
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w3,w3,w16
+ add w22,w22,w19 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w4,w4,w6
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w4,w4,w12
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w4,w4,w17
+ add w21,w21,w3 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w5,w5,w7
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w5,w5,w13
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w5,w5,w19
+ add w20,w20,w4 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w6,w6,w8
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w6,w6,w14
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w6,w6,w3
+ add w24,w24,w5 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w7,w7,w9
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w7,w7,w15
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w7,w7,w4
+ add w23,w23,w6 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w8,w8,w10
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w8,w8,w16
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w8,w8,w5
+ add w22,w22,w7 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w9,w9,w11
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w9,w9,w17
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w9,w9,w6
+ add w21,w21,w8 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w10,w10,w12
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w10,w10,w19
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w10,w10,w7
+ add w20,w20,w9 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w11,w11,w13
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w11,w11,w3
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w11,w11,w8
+ add w24,w24,w10 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ orr w25,w21,w22
+ and w26,w21,w22
+ eor w12,w12,w14
+ ror w27,w20,#27
+ and w25,w25,w23
+ add w23,w23,w28 // future e+=K
+ eor w12,w12,w4
+ add w24,w24,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w21,w21,#2
+ eor w12,w12,w9
+ add w23,w23,w11 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ orr w25,w20,w21
+ and w26,w20,w21
+ eor w13,w13,w15
+ ror w27,w24,#27
+ and w25,w25,w22
+ add w22,w22,w28 // future e+=K
+ eor w13,w13,w5
+ add w23,w23,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w20,w20,#2
+ eor w13,w13,w10
+ add w22,w22,w12 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ orr w25,w24,w20
+ and w26,w24,w20
+ eor w14,w14,w16
+ ror w27,w23,#27
+ and w25,w25,w21
+ add w21,w21,w28 // future e+=K
+ eor w14,w14,w6
+ add w22,w22,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w24,w24,#2
+ eor w14,w14,w11
+ add w21,w21,w13 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ orr w25,w23,w24
+ and w26,w23,w24
+ eor w15,w15,w17
+ ror w27,w22,#27
+ and w25,w25,w20
+ add w20,w20,w28 // future e+=K
+ eor w15,w15,w7
+ add w21,w21,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w23,w23,#2
+ eor w15,w15,w12
+ add w20,w20,w14 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ movz w28,#0xc1d6
+ movk w28,#0xca62,lsl#16
+ orr w25,w22,w23
+ and w26,w22,w23
+ eor w16,w16,w19
+ ror w27,w21,#27
+ and w25,w25,w24
+ add w24,w24,w28 // future e+=K
+ eor w16,w16,w8
+ add w20,w20,w27 // e+=rot(a,5)
+ orr w25,w25,w26
+ ror w22,w22,#2
+ eor w16,w16,w13
+ add w24,w24,w15 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w17,w17,w14
+ add w23,w23,w16 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w19,w19,w15
+ add w22,w22,w17 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ eor w3,w3,w5
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w3,w3,w11
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w3,w3,w16
+ add w21,w21,w19 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w3,w3,#31
+ eor w4,w4,w6
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w4,w4,w12
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w4,w4,w17
+ add w20,w20,w3 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w4,w4,#31
+ eor w5,w5,w7
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w5,w5,w13
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w5,w5,w19
+ add w24,w24,w4 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w5,w5,#31
+ eor w6,w6,w8
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w6,w6,w14
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w6,w6,w3
+ add w23,w23,w5 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w6,w6,#31
+ eor w7,w7,w9
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w7,w7,w15
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w7,w7,w4
+ add w22,w22,w6 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w7,w7,#31
+ eor w8,w8,w10
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w8,w8,w16
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w8,w8,w5
+ add w21,w21,w7 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w8,w8,#31
+ eor w9,w9,w11
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w9,w9,w17
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w9,w9,w6
+ add w20,w20,w8 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w9,w9,#31
+ eor w10,w10,w12
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w10,w10,w19
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w10,w10,w7
+ add w24,w24,w9 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w10,w10,#31
+ eor w11,w11,w13
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w11,w11,w3
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w11,w11,w8
+ add w23,w23,w10 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w11,w11,#31
+ eor w12,w12,w14
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w12,w12,w4
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w12,w12,w9
+ add w22,w22,w11 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w12,w12,#31
+ eor w13,w13,w15
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w13,w13,w5
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w13,w13,w10
+ add w21,w21,w12 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w13,w13,#31
+ eor w14,w14,w16
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w14,w14,w6
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ eor w14,w14,w11
+ add w20,w20,w13 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ror w14,w14,#31
+ eor w15,w15,w17
+ eor w25,w24,w22
+ ror w27,w21,#27
+ add w24,w24,w28 // future e+=K
+ eor w15,w15,w7
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ eor w15,w15,w12
+ add w24,w24,w14 // future e+=X[i]
+ add w20,w20,w25 // e+=F(b,c,d)
+ ror w15,w15,#31
+ eor w16,w16,w19
+ eor w25,w23,w21
+ ror w27,w20,#27
+ add w23,w23,w28 // future e+=K
+ eor w16,w16,w8
+ eor w25,w25,w22
+ add w24,w24,w27 // e+=rot(a,5)
+ ror w21,w21,#2
+ eor w16,w16,w13
+ add w23,w23,w15 // future e+=X[i]
+ add w24,w24,w25 // e+=F(b,c,d)
+ ror w16,w16,#31
+ eor w17,w17,w3
+ eor w25,w22,w20
+ ror w27,w24,#27
+ add w22,w22,w28 // future e+=K
+ eor w17,w17,w9
+ eor w25,w25,w21
+ add w23,w23,w27 // e+=rot(a,5)
+ ror w20,w20,#2
+ eor w17,w17,w14
+ add w22,w22,w16 // future e+=X[i]
+ add w23,w23,w25 // e+=F(b,c,d)
+ ror w17,w17,#31
+ eor w19,w19,w4
+ eor w25,w21,w24
+ ror w27,w23,#27
+ add w21,w21,w28 // future e+=K
+ eor w19,w19,w10
+ eor w25,w25,w20
+ add w22,w22,w27 // e+=rot(a,5)
+ ror w24,w24,#2
+ eor w19,w19,w15
+ add w21,w21,w17 // future e+=X[i]
+ add w22,w22,w25 // e+=F(b,c,d)
+ ror w19,w19,#31
+ ldp w4,w5,[x0]
+ eor w25,w20,w23
+ ror w27,w22,#27
+ add w20,w20,w28 // future e+=K
+ eor w25,w25,w24
+ add w21,w21,w27 // e+=rot(a,5)
+ ror w23,w23,#2
+ add w20,w20,w19 // future e+=X[i]
+ add w21,w21,w25 // e+=F(b,c,d)
+ ldp w6,w7,[x0,#8]
+ eor w25,w24,w22
+ ror w27,w21,#27
+ eor w25,w25,w23
+ add w20,w20,w27 // e+=rot(a,5)
+ ror w22,w22,#2
+ ldr w8,[x0,#16]
+ add w20,w20,w25 // e+=F(b,c,d)
+ add w21,w21,w5
+ add w22,w22,w6
+ add w20,w20,w4
+ add w23,w23,w7
+ add w24,w24,w8
+ stp w20,w21,[x0]
+ stp w22,w23,[x0,#8]
+ str w24,[x0,#16]
+ cbnz x2,.Loop
+
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x23,x24,[sp,#48]
+ ldp x25,x26,[sp,#64]
+ ldp x27,x28,[sp,#80]
+ ldr x29,[sp],#96
+ ret
+.size sha1_block_data_order,.-sha1_block_data_order
+.type sha1_block_armv8,%function
+.align 6
+sha1_block_armv8:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ AARCH64_VALID_CALL_TARGET
+.Lv8_entry:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ adrp x4,.Lconst
+ add x4,x4,:lo12:.Lconst
+ eor v1.16b,v1.16b,v1.16b
+ ld1 {v0.4s},[x0],#16
+ ld1 {v1.s}[0],[x0]
+ sub x0,x0,#16
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4]
+
+.Loop_hw:
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ sub x2,x2,#1
+ rev32 v4.16b,v4.16b
+ rev32 v5.16b,v5.16b
+
+ add v20.4s,v16.4s,v4.4s
+ rev32 v6.16b,v6.16b
+ orr v22.16b,v0.16b,v0.16b // offload
+
+ add v21.4s,v16.4s,v5.4s
+ rev32 v7.16b,v7.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b
+.inst 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0
+ add v20.4s,v16.4s,v6.4s
+.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 1
+.inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s
+ add v21.4s,v16.4s,v7.4s
+.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 2
+.inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s
+ add v20.4s,v16.4s,v4.4s
+.inst 0x5e281885 //sha1su1 v5.16b,v4.16b
+.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 3
+.inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v5.4s
+.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 4
+.inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s
+ add v20.4s,v17.4s,v6.4s
+.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 5
+.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v7.4s
+.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 6
+.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v17.4s,v4.4s
+.inst 0x5e281885 //sha1su1 v5.16b,v4.16b
+.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 7
+.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v17.4s,v5.4s
+.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 8
+.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v6.4s
+.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 9
+.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v18.4s,v7.4s
+.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 10
+.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v4.4s
+.inst 0x5e281885 //sha1su1 v5.16b,v4.16b
+.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 11
+.inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s
+ add v21.4s,v18.4s,v5.4s
+.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 12
+.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v18.4s,v6.4s
+.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 13
+.inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v7.4s
+.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b
+.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 14
+.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s
+ add v20.4s,v19.4s,v4.4s
+.inst 0x5e281885 //sha1su1 v5.16b,v4.16b
+.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 15
+.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v5.4s
+.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b
+.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 16
+.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+ add v20.4s,v19.4s,v6.4s
+.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 17
+.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+ add v21.4s,v19.4s,v7.4s
+
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 18
+.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s
+
+.inst 0x5e280802 //sha1h v2.16b,v0.16b // 19
+.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s
+
+ add v1.4s,v1.4s,v2.4s
+ add v0.4s,v0.4s,v22.4s
+
+ cbnz x2,.Loop_hw
+
+ st1 {v0.4s},[x0],#16
+ st1 {v1.s}[0],[x0]
+
+ ldr x29,[sp],#16
+ ret
+.size sha1_block_armv8,.-sha1_block_armv8
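
The .inst words in sha1_block_armv8 are the SHA-1 crypto-extension instructions
(sha1h, sha1c, sha1p, sha1m, sha1su0, sha1su1) emitted as raw encodings, with
the mnemonic kept in the trailing comment, so the file still assembles with
toolchains that do not know those mnemonics. Which of the two implementations
in this file runs is decided once, at the top of sha1_block_data_order, by
testing OPENSSL_armcap_P against ARMV8_SHA1 and branching to .Lv8_entry. A
rough C rendering of that dispatch (the prototypes and names below are
illustrative only; in the real tree the selection happens in the assembly
itself):

#include <stddef.h>
#include <stdint.h>
#include <openssl/arm_arch.h>     /* ARMV8_SHA1 */

extern uint32_t OPENSSL_armcap_P;  /* capability bits set at library init */

/* Illustrative stand-ins for the two entry points in this file. */
void sha1_scalar(uint32_t state[5], const uint8_t *data, size_t num_blocks);
void sha1_crypto_ext(uint32_t state[5], const uint8_t *data, size_t num_blocks);

static void sha1_block_dispatch(uint32_t state[5], const uint8_t *data,
                                size_t num_blocks) {
  if (OPENSSL_armcap_P & ARMV8_SHA1) {          /* "tst w16,#ARMV8_SHA1" */
    sha1_crypto_ext(state, data, num_blocks);   /* .Lv8_entry path */
  } else {
    sha1_scalar(state, data, num_blocks);
  }
}
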
+.section .rodata
+.align 6
+.Lconst:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
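
The scalar path of sha1_block_data_order above is an 80-round unroll of the
textbook SHA-1 compression function: the movz/movk pairs materialize the round
constants 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc and 0xca62c1d6 (the same values
stored at .Lconst for the crypto-extension path), the bic/and/orr, eor and
orr/and sequences compute the Ch, parity and Maj choice functions, and
"ror wN,wN,#2" is the rotate of b by 30 written as a right rotate. For
reference, a compact C version of the same round structure (the function name
is illustrative; the assembly loads the message as 64-bit pairs, fixes byte
order with rev32 or ror first, and interleaves the schedule with the rounds):

#include <stdint.h>

static uint32_t rotl32(uint32_t x, int n) {
  return (x << n) | (x >> (32 - n));
}

/* One 512-bit block; W_in holds the 16 message words already in host order. */
static void sha1_compress(uint32_t state[5], const uint32_t W_in[16]) {
  static const uint32_t K[4] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
  uint32_t W[80];
  for (int i = 0; i < 16; i++) W[i] = W_in[i];
  for (int i = 16; i < 80; i++)
    W[i] = rotl32(W[i - 3] ^ W[i - 8] ^ W[i - 14] ^ W[i - 16], 1);

  uint32_t a = state[0], b = state[1], c = state[2], d = state[3], e = state[4];
  for (int i = 0; i < 80; i++) {
    uint32_t f;
    if (i < 20)       f = (b & c) | (~b & d);        /* bic/and/orr rounds */
    else if (i < 40)  f = b ^ c ^ d;                 /* eor rounds */
    else if (i < 60)  f = (b & c) | ((b | c) & d);   /* orr/and/and/orr rounds */
    else              f = b ^ c ^ d;
    uint32_t t = rotl32(a, 5) + f + e + K[i / 20] + W[i];  /* e+=rot(a,5)+F+K+X[i] */
    e = d; d = c; c = rotl32(b, 30); b = a; a = t;
  }
  state[0] += a; state[1] += b; state[2] += c;
  state[3] += d; state[4] += e;
}
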
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S
@@ -1,0 +1,1214 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+// SHA256-hw SHA256(*) SHA512
+// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+// Denver 2.01 10.5 (+26%) 6.70 (+8%)
+// X-Gene 20.0 (+100%) 12.8 (+300%(***))
+// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+//
+// (*) Software SHA256 results are of lesser relevance, presented
+// mostly for informational purposes.
+// (**) The result is a trade-off: it's possible to improve it by
+// 10% (or by 1 cycle per round), but at the cost of 20% loss
+// on Cortex-A53 (or by 4 cycles per round).
+// (***) Super-impressive coefficients over gcc-generated code are
+// indication of some compiler "pathology", most notably code
+// generated with -mgeneral-regs-only is significantly faster
+// and the gap is only 40-90%.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+
+.hidden OPENSSL_armcap_P
+.globl sha256_block_data_order
+.hidden sha256_block_data_order
+.type sha256_block_data_order,%function
+.align 6
+sha256_block_data_order:
+ AARCH64_VALID_CALL_TARGET
+#ifndef __KERNEL__
+#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
+ adrp x16,:pg_hi21_nc:OPENSSL_armcap_P
+#else
+ adrp x16,OPENSSL_armcap_P
+#endif
+ ldr w16,[x16,:lo12:OPENSSL_armcap_P]
+ tst w16,#ARMV8_SHA256
+ b.ne .Lv8_entry
+#endif
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*4
+
+ ldp w20,w21,[x0] // load context
+ ldp w22,w23,[x0,#2*4]
+ ldp w24,w25,[x0,#4*4]
+ add x2,x1,x2,lsl#6 // end of input
+ ldp w26,w27,[x0,#6*4]
+ adrp x30,.LK256
+ add x30,x30,:lo12:.LK256
+ stp x0,x2,[x29,#96]
+
+.Loop:
+ ldp w3,w4,[x1],#2*4
+ ldr w19,[x30],#4 // *K++
+ eor w28,w21,w22 // magic seed
+ str x1,[x29,#112]
+#ifndef __ARMEB__
+ rev w3,w3 // 0
+#endif
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ eor w6,w24,w24,ror#14
+ and w17,w25,w24
+ bic w19,w26,w24
+ add w27,w27,w3 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w6,ror#11 // Sigma1(e)
+ ror w6,w20,#2
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ eor w17,w20,w20,ror#9
+ add w27,w27,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w23,w23,w27 // d+=h
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w6,w17,ror#13 // Sigma0(a)
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w27,w27,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w4,w4 // 1
+#endif
+ ldp w5,w6,[x1],#2*4
+ add w27,w27,w17 // h+=Sigma0(a)
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ eor w7,w23,w23,ror#14
+ and w17,w24,w23
+ bic w28,w25,w23
+ add w26,w26,w4 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w7,ror#11 // Sigma1(e)
+ ror w7,w27,#2
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ eor w17,w27,w27,ror#9
+ add w26,w26,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w22,w22,w26 // d+=h
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w7,w17,ror#13 // Sigma0(a)
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w26,w26,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w5,w5 // 2
+#endif
+ add w26,w26,w17 // h+=Sigma0(a)
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ eor w8,w22,w22,ror#14
+ and w17,w23,w22
+ bic w19,w24,w22
+ add w25,w25,w5 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w8,ror#11 // Sigma1(e)
+ ror w8,w26,#2
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ eor w17,w26,w26,ror#9
+ add w25,w25,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w21,w21,w25 // d+=h
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w8,w17,ror#13 // Sigma0(a)
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w25,w25,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w6,w6 // 3
+#endif
+ ldp w7,w8,[x1],#2*4
+ add w25,w25,w17 // h+=Sigma0(a)
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ eor w9,w21,w21,ror#14
+ and w17,w22,w21
+ bic w28,w23,w21
+ add w24,w24,w6 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w9,ror#11 // Sigma1(e)
+ ror w9,w25,#2
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ eor w17,w25,w25,ror#9
+ add w24,w24,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w20,w20,w24 // d+=h
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w9,w17,ror#13 // Sigma0(a)
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w24,w24,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w7,w7 // 4
+#endif
+ add w24,w24,w17 // h+=Sigma0(a)
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ eor w10,w20,w20,ror#14
+ and w17,w21,w20
+ bic w19,w22,w20
+ add w23,w23,w7 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w10,ror#11 // Sigma1(e)
+ ror w10,w24,#2
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ eor w17,w24,w24,ror#9
+ add w23,w23,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w27,w27,w23 // d+=h
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w10,w17,ror#13 // Sigma0(a)
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w23,w23,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w8,w8 // 5
+#endif
+ ldp w9,w10,[x1],#2*4
+ add w23,w23,w17 // h+=Sigma0(a)
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ eor w11,w27,w27,ror#14
+ and w17,w20,w27
+ bic w28,w21,w27
+ add w22,w22,w8 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w11,ror#11 // Sigma1(e)
+ ror w11,w23,#2
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ eor w17,w23,w23,ror#9
+ add w22,w22,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w26,w26,w22 // d+=h
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w11,w17,ror#13 // Sigma0(a)
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w22,w22,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w9,w9 // 6
+#endif
+ add w22,w22,w17 // h+=Sigma0(a)
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ eor w12,w26,w26,ror#14
+ and w17,w27,w26
+ bic w19,w20,w26
+ add w21,w21,w9 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w12,ror#11 // Sigma1(e)
+ ror w12,w22,#2
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ eor w17,w22,w22,ror#9
+ add w21,w21,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w25,w25,w21 // d+=h
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w12,w17,ror#13 // Sigma0(a)
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w21,w21,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w10,w10 // 7
+#endif
+ ldp w11,w12,[x1],#2*4
+ add w21,w21,w17 // h+=Sigma0(a)
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ eor w13,w25,w25,ror#14
+ and w17,w26,w25
+ bic w28,w27,w25
+ add w20,w20,w10 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w13,ror#11 // Sigma1(e)
+ ror w13,w21,#2
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ eor w17,w21,w21,ror#9
+ add w20,w20,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w24,w24,w20 // d+=h
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w13,w17,ror#13 // Sigma0(a)
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w20,w20,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w11,w11 // 8
+#endif
+ add w20,w20,w17 // h+=Sigma0(a)
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ eor w14,w24,w24,ror#14
+ and w17,w25,w24
+ bic w19,w26,w24
+ add w27,w27,w11 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w14,ror#11 // Sigma1(e)
+ ror w14,w20,#2
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ eor w17,w20,w20,ror#9
+ add w27,w27,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w23,w23,w27 // d+=h
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w14,w17,ror#13 // Sigma0(a)
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w27,w27,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w12,w12 // 9
+#endif
+ ldp w13,w14,[x1],#2*4
+ add w27,w27,w17 // h+=Sigma0(a)
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ eor w15,w23,w23,ror#14
+ and w17,w24,w23
+ bic w28,w25,w23
+ add w26,w26,w12 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w15,ror#11 // Sigma1(e)
+ ror w15,w27,#2
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ eor w17,w27,w27,ror#9
+ add w26,w26,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w22,w22,w26 // d+=h
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w15,w17,ror#13 // Sigma0(a)
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w26,w26,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w13,w13 // 10
+#endif
+ add w26,w26,w17 // h+=Sigma0(a)
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ eor w0,w22,w22,ror#14
+ and w17,w23,w22
+ bic w19,w24,w22
+ add w25,w25,w13 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w0,ror#11 // Sigma1(e)
+ ror w0,w26,#2
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ eor w17,w26,w26,ror#9
+ add w25,w25,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w21,w21,w25 // d+=h
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w0,w17,ror#13 // Sigma0(a)
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w25,w25,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w14,w14 // 11
+#endif
+ ldp w15,w0,[x1],#2*4
+ add w25,w25,w17 // h+=Sigma0(a)
+ str w6,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ eor w6,w21,w21,ror#14
+ and w17,w22,w21
+ bic w28,w23,w21
+ add w24,w24,w14 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w6,ror#11 // Sigma1(e)
+ ror w6,w25,#2
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ eor w17,w25,w25,ror#9
+ add w24,w24,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w20,w20,w24 // d+=h
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w6,w17,ror#13 // Sigma0(a)
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w24,w24,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w15,w15 // 12
+#endif
+ add w24,w24,w17 // h+=Sigma0(a)
+ str w7,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ eor w7,w20,w20,ror#14
+ and w17,w21,w20
+ bic w19,w22,w20
+ add w23,w23,w15 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w7,ror#11 // Sigma1(e)
+ ror w7,w24,#2
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ eor w17,w24,w24,ror#9
+ add w23,w23,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w27,w27,w23 // d+=h
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w7,w17,ror#13 // Sigma0(a)
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w23,w23,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w0,w0 // 13
+#endif
+ ldp w1,w2,[x1]
+ add w23,w23,w17 // h+=Sigma0(a)
+ str w8,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ eor w8,w27,w27,ror#14
+ and w17,w20,w27
+ bic w28,w21,w27
+ add w22,w22,w0 // h+=X[i]
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w8,ror#11 // Sigma1(e)
+ ror w8,w23,#2
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ eor w17,w23,w23,ror#9
+ add w22,w22,w16 // h+=Sigma1(e)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ add w26,w26,w22 // d+=h
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w8,w17,ror#13 // Sigma0(a)
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ //add w22,w22,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w1,w1 // 14
+#endif
+ ldr w6,[sp,#12]
+ add w22,w22,w17 // h+=Sigma0(a)
+ str w9,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ eor w9,w26,w26,ror#14
+ and w17,w27,w26
+ bic w19,w20,w26
+ add w21,w21,w1 // h+=X[i]
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w9,ror#11 // Sigma1(e)
+ ror w9,w22,#2
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ eor w17,w22,w22,ror#9
+ add w21,w21,w16 // h+=Sigma1(e)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ add w25,w25,w21 // d+=h
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w9,w17,ror#13 // Sigma0(a)
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ //add w21,w21,w17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev w2,w2 // 15
+#endif
+ ldr w7,[sp,#0]
+ add w21,w21,w17 // h+=Sigma0(a)
+ str w10,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w9,w4,#7
+ and w17,w26,w25
+ ror w8,w1,#17
+ bic w28,w27,w25
+ ror w10,w21,#2
+ add w20,w20,w2 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w9,w9,w4,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w10,w10,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w8,w8,w1,ror#19
+ eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w10,w21,ror#22 // Sigma0(a)
+ eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
+ add w3,w3,w12
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w3,w3,w9
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w3,w3,w8
+.Loop_16_xx:
+ ldr w8,[sp,#4]
+ str w11,[sp,#0]
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ ror w10,w5,#7
+ and w17,w25,w24
+ ror w9,w2,#17
+ bic w19,w26,w24
+ ror w11,w20,#2
+ add w27,w27,w3 // h+=X[i]
+ eor w16,w16,w24,ror#11
+ eor w10,w10,w5,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w24,ror#25 // Sigma1(e)
+ eor w11,w11,w20,ror#13
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w9,w9,w2,ror#19
+ eor w10,w10,w5,lsr#3 // sigma0(X[i+1])
+ add w27,w27,w16 // h+=Sigma1(e)
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w11,w20,ror#22 // Sigma0(a)
+ eor w9,w9,w2,lsr#10 // sigma1(X[i+14])
+ add w4,w4,w13
+ add w23,w23,w27 // d+=h
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w4,w4,w10
+ add w27,w27,w17 // h+=Sigma0(a)
+ add w4,w4,w9
+ ldr w9,[sp,#8]
+ str w12,[sp,#4]
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ ror w11,w6,#7
+ and w17,w24,w23
+ ror w10,w3,#17
+ bic w28,w25,w23
+ ror w12,w27,#2
+ add w26,w26,w4 // h+=X[i]
+ eor w16,w16,w23,ror#11
+ eor w11,w11,w6,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w23,ror#25 // Sigma1(e)
+ eor w12,w12,w27,ror#13
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w10,w10,w3,ror#19
+ eor w11,w11,w6,lsr#3 // sigma0(X[i+1])
+ add w26,w26,w16 // h+=Sigma1(e)
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w12,w27,ror#22 // Sigma0(a)
+ eor w10,w10,w3,lsr#10 // sigma1(X[i+14])
+ add w5,w5,w14
+ add w22,w22,w26 // d+=h
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w5,w5,w11
+ add w26,w26,w17 // h+=Sigma0(a)
+ add w5,w5,w10
+ ldr w10,[sp,#12]
+ str w13,[sp,#8]
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ ror w12,w7,#7
+ and w17,w23,w22
+ ror w11,w4,#17
+ bic w19,w24,w22
+ ror w13,w26,#2
+ add w25,w25,w5 // h+=X[i]
+ eor w16,w16,w22,ror#11
+ eor w12,w12,w7,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w22,ror#25 // Sigma1(e)
+ eor w13,w13,w26,ror#13
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w11,w11,w4,ror#19
+ eor w12,w12,w7,lsr#3 // sigma0(X[i+1])
+ add w25,w25,w16 // h+=Sigma1(e)
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w13,w26,ror#22 // Sigma0(a)
+ eor w11,w11,w4,lsr#10 // sigma1(X[i+14])
+ add w6,w6,w15
+ add w21,w21,w25 // d+=h
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w6,w6,w12
+ add w25,w25,w17 // h+=Sigma0(a)
+ add w6,w6,w11
+ ldr w11,[sp,#0]
+ str w14,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ ror w13,w8,#7
+ and w17,w22,w21
+ ror w12,w5,#17
+ bic w28,w23,w21
+ ror w14,w25,#2
+ add w24,w24,w6 // h+=X[i]
+ eor w16,w16,w21,ror#11
+ eor w13,w13,w8,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w21,ror#25 // Sigma1(e)
+ eor w14,w14,w25,ror#13
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w12,w12,w5,ror#19
+ eor w13,w13,w8,lsr#3 // sigma0(X[i+1])
+ add w24,w24,w16 // h+=Sigma1(e)
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w14,w25,ror#22 // Sigma0(a)
+ eor w12,w12,w5,lsr#10 // sigma1(X[i+14])
+ add w7,w7,w0
+ add w20,w20,w24 // d+=h
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w7,w7,w13
+ add w24,w24,w17 // h+=Sigma0(a)
+ add w7,w7,w12
+ ldr w12,[sp,#4]
+ str w15,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ ror w14,w9,#7
+ and w17,w21,w20
+ ror w13,w6,#17
+ bic w19,w22,w20
+ ror w15,w24,#2
+ add w23,w23,w7 // h+=X[i]
+ eor w16,w16,w20,ror#11
+ eor w14,w14,w9,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w20,ror#25 // Sigma1(e)
+ eor w15,w15,w24,ror#13
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w13,w13,w6,ror#19
+ eor w14,w14,w9,lsr#3 // sigma0(X[i+1])
+ add w23,w23,w16 // h+=Sigma1(e)
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w15,w24,ror#22 // Sigma0(a)
+ eor w13,w13,w6,lsr#10 // sigma1(X[i+14])
+ add w8,w8,w1
+ add w27,w27,w23 // d+=h
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w8,w8,w14
+ add w23,w23,w17 // h+=Sigma0(a)
+ add w8,w8,w13
+ ldr w13,[sp,#8]
+ str w0,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ ror w15,w10,#7
+ and w17,w20,w27
+ ror w14,w7,#17
+ bic w28,w21,w27
+ ror w0,w23,#2
+ add w22,w22,w8 // h+=X[i]
+ eor w16,w16,w27,ror#11
+ eor w15,w15,w10,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w27,ror#25 // Sigma1(e)
+ eor w0,w0,w23,ror#13
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w14,w14,w7,ror#19
+ eor w15,w15,w10,lsr#3 // sigma0(X[i+1])
+ add w22,w22,w16 // h+=Sigma1(e)
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w0,w23,ror#22 // Sigma0(a)
+ eor w14,w14,w7,lsr#10 // sigma1(X[i+14])
+ add w9,w9,w2
+ add w26,w26,w22 // d+=h
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w9,w9,w15
+ add w22,w22,w17 // h+=Sigma0(a)
+ add w9,w9,w14
+ ldr w14,[sp,#12]
+ str w1,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ ror w0,w11,#7
+ and w17,w27,w26
+ ror w15,w8,#17
+ bic w19,w20,w26
+ ror w1,w22,#2
+ add w21,w21,w9 // h+=X[i]
+ eor w16,w16,w26,ror#11
+ eor w0,w0,w11,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w26,ror#25 // Sigma1(e)
+ eor w1,w1,w22,ror#13
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w15,w15,w8,ror#19
+ eor w0,w0,w11,lsr#3 // sigma0(X[i+1])
+ add w21,w21,w16 // h+=Sigma1(e)
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w1,w22,ror#22 // Sigma0(a)
+ eor w15,w15,w8,lsr#10 // sigma1(X[i+14])
+ add w10,w10,w3
+ add w25,w25,w21 // d+=h
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w10,w10,w0
+ add w21,w21,w17 // h+=Sigma0(a)
+ add w10,w10,w15
+ ldr w15,[sp,#0]
+ str w2,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w1,w12,#7
+ and w17,w26,w25
+ ror w0,w9,#17
+ bic w28,w27,w25
+ ror w2,w21,#2
+ add w20,w20,w10 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w1,w1,w12,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w2,w2,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w0,w0,w9,ror#19
+ eor w1,w1,w12,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w2,w21,ror#22 // Sigma0(a)
+ eor w0,w0,w9,lsr#10 // sigma1(X[i+14])
+ add w11,w11,w4
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w11,w11,w1
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w11,w11,w0
+ ldr w0,[sp,#4]
+ str w3,[sp,#0]
+ ror w16,w24,#6
+ add w27,w27,w19 // h+=K[i]
+ ror w2,w13,#7
+ and w17,w25,w24
+ ror w1,w10,#17
+ bic w19,w26,w24
+ ror w3,w20,#2
+ add w27,w27,w11 // h+=X[i]
+ eor w16,w16,w24,ror#11
+ eor w2,w2,w13,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w20,w21 // a^b, b^c in next round
+ eor w16,w16,w24,ror#25 // Sigma1(e)
+ eor w3,w3,w20,ror#13
+ add w27,w27,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w1,w1,w10,ror#19
+ eor w2,w2,w13,lsr#3 // sigma0(X[i+1])
+ add w27,w27,w16 // h+=Sigma1(e)
+ eor w28,w28,w21 // Maj(a,b,c)
+ eor w17,w3,w20,ror#22 // Sigma0(a)
+ eor w1,w1,w10,lsr#10 // sigma1(X[i+14])
+ add w12,w12,w5
+ add w23,w23,w27 // d+=h
+ add w27,w27,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w12,w12,w2
+ add w27,w27,w17 // h+=Sigma0(a)
+ add w12,w12,w1
+ ldr w1,[sp,#8]
+ str w4,[sp,#4]
+ ror w16,w23,#6
+ add w26,w26,w28 // h+=K[i]
+ ror w3,w14,#7
+ and w17,w24,w23
+ ror w2,w11,#17
+ bic w28,w25,w23
+ ror w4,w27,#2
+ add w26,w26,w12 // h+=X[i]
+ eor w16,w16,w23,ror#11
+ eor w3,w3,w14,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w27,w20 // a^b, b^c in next round
+ eor w16,w16,w23,ror#25 // Sigma1(e)
+ eor w4,w4,w27,ror#13
+ add w26,w26,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w2,w2,w11,ror#19
+ eor w3,w3,w14,lsr#3 // sigma0(X[i+1])
+ add w26,w26,w16 // h+=Sigma1(e)
+ eor w19,w19,w20 // Maj(a,b,c)
+ eor w17,w4,w27,ror#22 // Sigma0(a)
+ eor w2,w2,w11,lsr#10 // sigma1(X[i+14])
+ add w13,w13,w6
+ add w22,w22,w26 // d+=h
+ add w26,w26,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w13,w13,w3
+ add w26,w26,w17 // h+=Sigma0(a)
+ add w13,w13,w2
+ ldr w2,[sp,#12]
+ str w5,[sp,#8]
+ ror w16,w22,#6
+ add w25,w25,w19 // h+=K[i]
+ ror w4,w15,#7
+ and w17,w23,w22
+ ror w3,w12,#17
+ bic w19,w24,w22
+ ror w5,w26,#2
+ add w25,w25,w13 // h+=X[i]
+ eor w16,w16,w22,ror#11
+ eor w4,w4,w15,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w26,w27 // a^b, b^c in next round
+ eor w16,w16,w22,ror#25 // Sigma1(e)
+ eor w5,w5,w26,ror#13
+ add w25,w25,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w3,w3,w12,ror#19
+ eor w4,w4,w15,lsr#3 // sigma0(X[i+1])
+ add w25,w25,w16 // h+=Sigma1(e)
+ eor w28,w28,w27 // Maj(a,b,c)
+ eor w17,w5,w26,ror#22 // Sigma0(a)
+ eor w3,w3,w12,lsr#10 // sigma1(X[i+14])
+ add w14,w14,w7
+ add w21,w21,w25 // d+=h
+ add w25,w25,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w14,w14,w4
+ add w25,w25,w17 // h+=Sigma0(a)
+ add w14,w14,w3
+ ldr w3,[sp,#0]
+ str w6,[sp,#12]
+ ror w16,w21,#6
+ add w24,w24,w28 // h+=K[i]
+ ror w5,w0,#7
+ and w17,w22,w21
+ ror w4,w13,#17
+ bic w28,w23,w21
+ ror w6,w25,#2
+ add w24,w24,w14 // h+=X[i]
+ eor w16,w16,w21,ror#11
+ eor w5,w5,w0,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w25,w26 // a^b, b^c in next round
+ eor w16,w16,w21,ror#25 // Sigma1(e)
+ eor w6,w6,w25,ror#13
+ add w24,w24,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w4,w4,w13,ror#19
+ eor w5,w5,w0,lsr#3 // sigma0(X[i+1])
+ add w24,w24,w16 // h+=Sigma1(e)
+ eor w19,w19,w26 // Maj(a,b,c)
+ eor w17,w6,w25,ror#22 // Sigma0(a)
+ eor w4,w4,w13,lsr#10 // sigma1(X[i+14])
+ add w15,w15,w8
+ add w20,w20,w24 // d+=h
+ add w24,w24,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w15,w15,w5
+ add w24,w24,w17 // h+=Sigma0(a)
+ add w15,w15,w4
+ ldr w4,[sp,#4]
+ str w7,[sp,#0]
+ ror w16,w20,#6
+ add w23,w23,w19 // h+=K[i]
+ ror w6,w1,#7
+ and w17,w21,w20
+ ror w5,w14,#17
+ bic w19,w22,w20
+ ror w7,w24,#2
+ add w23,w23,w15 // h+=X[i]
+ eor w16,w16,w20,ror#11
+ eor w6,w6,w1,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w24,w25 // a^b, b^c in next round
+ eor w16,w16,w20,ror#25 // Sigma1(e)
+ eor w7,w7,w24,ror#13
+ add w23,w23,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w5,w5,w14,ror#19
+ eor w6,w6,w1,lsr#3 // sigma0(X[i+1])
+ add w23,w23,w16 // h+=Sigma1(e)
+ eor w28,w28,w25 // Maj(a,b,c)
+ eor w17,w7,w24,ror#22 // Sigma0(a)
+ eor w5,w5,w14,lsr#10 // sigma1(X[i+14])
+ add w0,w0,w9
+ add w27,w27,w23 // d+=h
+ add w23,w23,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w0,w0,w6
+ add w23,w23,w17 // h+=Sigma0(a)
+ add w0,w0,w5
+ ldr w5,[sp,#8]
+ str w8,[sp,#4]
+ ror w16,w27,#6
+ add w22,w22,w28 // h+=K[i]
+ ror w7,w2,#7
+ and w17,w20,w27
+ ror w6,w15,#17
+ bic w28,w21,w27
+ ror w8,w23,#2
+ add w22,w22,w0 // h+=X[i]
+ eor w16,w16,w27,ror#11
+ eor w7,w7,w2,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w23,w24 // a^b, b^c in next round
+ eor w16,w16,w27,ror#25 // Sigma1(e)
+ eor w8,w8,w23,ror#13
+ add w22,w22,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w6,w6,w15,ror#19
+ eor w7,w7,w2,lsr#3 // sigma0(X[i+1])
+ add w22,w22,w16 // h+=Sigma1(e)
+ eor w19,w19,w24 // Maj(a,b,c)
+ eor w17,w8,w23,ror#22 // Sigma0(a)
+ eor w6,w6,w15,lsr#10 // sigma1(X[i+14])
+ add w1,w1,w10
+ add w26,w26,w22 // d+=h
+ add w22,w22,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w1,w1,w7
+ add w22,w22,w17 // h+=Sigma0(a)
+ add w1,w1,w6
+ ldr w6,[sp,#12]
+ str w9,[sp,#8]
+ ror w16,w26,#6
+ add w21,w21,w19 // h+=K[i]
+ ror w8,w3,#7
+ and w17,w27,w26
+ ror w7,w0,#17
+ bic w19,w20,w26
+ ror w9,w22,#2
+ add w21,w21,w1 // h+=X[i]
+ eor w16,w16,w26,ror#11
+ eor w8,w8,w3,ror#18
+ orr w17,w17,w19 // Ch(e,f,g)
+ eor w19,w22,w23 // a^b, b^c in next round
+ eor w16,w16,w26,ror#25 // Sigma1(e)
+ eor w9,w9,w22,ror#13
+ add w21,w21,w17 // h+=Ch(e,f,g)
+ and w28,w28,w19 // (b^c)&=(a^b)
+ eor w7,w7,w0,ror#19
+ eor w8,w8,w3,lsr#3 // sigma0(X[i+1])
+ add w21,w21,w16 // h+=Sigma1(e)
+ eor w28,w28,w23 // Maj(a,b,c)
+ eor w17,w9,w22,ror#22 // Sigma0(a)
+ eor w7,w7,w0,lsr#10 // sigma1(X[i+14])
+ add w2,w2,w11
+ add w25,w25,w21 // d+=h
+ add w21,w21,w28 // h+=Maj(a,b,c)
+ ldr w28,[x30],#4 // *K++, w19 in next round
+ add w2,w2,w8
+ add w21,w21,w17 // h+=Sigma0(a)
+ add w2,w2,w7
+ ldr w7,[sp,#0]
+ str w10,[sp,#12]
+ ror w16,w25,#6
+ add w20,w20,w28 // h+=K[i]
+ ror w9,w4,#7
+ and w17,w26,w25
+ ror w8,w1,#17
+ bic w28,w27,w25
+ ror w10,w21,#2
+ add w20,w20,w2 // h+=X[i]
+ eor w16,w16,w25,ror#11
+ eor w9,w9,w4,ror#18
+ orr w17,w17,w28 // Ch(e,f,g)
+ eor w28,w21,w22 // a^b, b^c in next round
+ eor w16,w16,w25,ror#25 // Sigma1(e)
+ eor w10,w10,w21,ror#13
+ add w20,w20,w17 // h+=Ch(e,f,g)
+ and w19,w19,w28 // (b^c)&=(a^b)
+ eor w8,w8,w1,ror#19
+ eor w9,w9,w4,lsr#3 // sigma0(X[i+1])
+ add w20,w20,w16 // h+=Sigma1(e)
+ eor w19,w19,w22 // Maj(a,b,c)
+ eor w17,w10,w21,ror#22 // Sigma0(a)
+ eor w8,w8,w1,lsr#10 // sigma1(X[i+14])
+ add w3,w3,w12
+ add w24,w24,w20 // d+=h
+ add w20,w20,w19 // h+=Maj(a,b,c)
+ ldr w19,[x30],#4 // *K++, w28 in next round
+ add w3,w3,w9
+ add w20,w20,w17 // h+=Sigma0(a)
+ add w3,w3,w8
+ cbnz w19,.Loop_16_xx
+
+ ldp x0,x2,[x29,#96]
+ ldr x1,[x29,#112]
+ sub x30,x30,#260 // rewind
+
+ ldp w3,w4,[x0]
+ ldp w5,w6,[x0,#2*4]
+ add x1,x1,#14*4 // advance input pointer
+ ldp w7,w8,[x0,#4*4]
+ add w20,w20,w3
+ ldp w9,w10,[x0,#6*4]
+ add w21,w21,w4
+ add w22,w22,w5
+ add w23,w23,w6
+ stp w20,w21,[x0]
+ add w24,w24,w7
+ add w25,w25,w8
+ stp w22,w23,[x0,#2*4]
+ add w26,w26,w9
+ add w27,w27,w10
+ cmp x1,x2
+ stp w24,w25,[x0,#4*4]
+ stp w26,w27,[x0,#6*4]
+ b.ne .Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*4
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size sha256_block_data_order,.-sha256_block_data_order
+
+.section .rodata
+.align 6
+.type .LK256,%object
+.LK256:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0 //terminator
+.size .LK256,.-.LK256
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+.text
+#ifndef __KERNEL__
+.type sha256_block_armv8,%function
+.align 6
+sha256_block_armv8:
+.Lv8_entry:
+ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v0.4s,v1.4s},[x0]
+ adrp x3,.LK256
+ add x3,x3,:lo12:.LK256
+
+.Loop_hw:
+ ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
+ sub x2,x2,#1
+ ld1 {v16.4s},[x3],#16
+ rev32 v4.16b,v4.16b
+ rev32 v5.16b,v5.16b
+ rev32 v6.16b,v6.16b
+ rev32 v7.16b,v7.16b
+ orr v18.16b,v0.16b,v0.16b // offload
+ orr v19.16b,v1.16b,v1.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.inst 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.inst 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v6.4s
+.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v7.4s
+.inst 0x5e282887 //sha256su0 v7.16b,v4.16b
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b
+ ld1 {v17.4s},[x3],#16
+ add v16.4s,v16.4s,v4.4s
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ ld1 {v16.4s},[x3],#16
+ add v17.4s,v17.4s,v5.4s
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ ld1 {v17.4s},[x3]
+ add v16.4s,v16.4s,v6.4s
+ sub x3,x3,#64*4-16 // rewind
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s
+.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s
+
+ add v17.4s,v17.4s,v7.4s
+ orr v2.16b,v0.16b,v0.16b
+.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s
+.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s
+
+ add v0.4s,v0.4s,v18.4s
+ add v1.4s,v1.4s,v19.4s
+
+ cbnz x2,.Loop_hw
+
+ st1 {v0.4s,v1.4s},[x0]
+
+ ldr x29,[sp],#16
+ ret
+.size sha256_block_armv8,.-sha256_block_armv8
+#endif
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S
@@ -1,0 +1,1084 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the OpenSSL license (the "License"). You may not use
+// this file except in compliance with the License. You can obtain a copy
+// in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+
+// ====================================================================
+// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// project. The module is, however, dual licensed under OpenSSL and
+// CRYPTOGAMS licenses depending on where you obtain it. For further
+// details see http://www.openssl.org/~appro/cryptogams/.
+//
+// Permission to use under GPLv2 terms is granted.
+// ====================================================================
+//
+// SHA256/512 for ARMv8.
+//
+// Performance in cycles per processed byte and improvement coefficient
+// over code generated with "default" compiler:
+//
+// SHA256-hw SHA256(*) SHA512
+// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
+// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
+// Denver 2.01 10.5 (+26%) 6.70 (+8%)
+// X-Gene 20.0 (+100%) 12.8 (+300%(***))
+// Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+//
+// (*) Software SHA256 results are of lesser relevance, presented
+// mostly for informational purposes.
+// (**) The result is a trade-off: it's possible to improve it by
+// 10% (or by 1 cycle per round), but at the cost of 20% loss
+// on Cortex-A53 (or by 4 cycles per round).
+// (***) Super-impressive coefficients over gcc-generated code are
+// indication of some compiler "pathology", most notably code
+// generated with -mgeneral-regs-only is significantly faster
+// and the gap is only 40-90%.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#endif
+
+.text
+
+
+.hidden OPENSSL_armcap_P
+.globl sha512_block_data_order
+.hidden sha512_block_data_order
+.type sha512_block_data_order,%function
+.align 6
+sha512_block_data_order:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+ stp x19,x20,[sp,#16]
+ stp x21,x22,[sp,#32]
+ stp x23,x24,[sp,#48]
+ stp x25,x26,[sp,#64]
+ stp x27,x28,[sp,#80]
+ sub sp,sp,#4*8
+
+ ldp x20,x21,[x0] // load context
+ ldp x22,x23,[x0,#2*8]
+ ldp x24,x25,[x0,#4*8]
+ add x2,x1,x2,lsl#7 // end of input
+ ldp x26,x27,[x0,#6*8]
+ adrp x30,.LK512
+ add x30,x30,:lo12:.LK512
+ stp x0,x2,[x29,#96]
+
+.Loop:
+ ldp x3,x4,[x1],#2*8
+ ldr x19,[x30],#8 // *K++
+ eor x28,x21,x22 // magic seed
+ str x1,[x29,#112]
+#ifndef __ARMEB__
+ rev x3,x3 // 0
+#endif
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ eor x6,x24,x24,ror#23
+ and x17,x25,x24
+ bic x19,x26,x24
+ add x27,x27,x3 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x6,ror#18 // Sigma1(e)
+ ror x6,x20,#28
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ eor x17,x20,x20,ror#5
+ add x27,x27,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x23,x23,x27 // d+=h
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x6,x17,ror#34 // Sigma0(a)
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x27,x27,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x4,x4 // 1
+#endif
+ ldp x5,x6,[x1],#2*8
+ add x27,x27,x17 // h+=Sigma0(a)
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ eor x7,x23,x23,ror#23
+ and x17,x24,x23
+ bic x28,x25,x23
+ add x26,x26,x4 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x7,ror#18 // Sigma1(e)
+ ror x7,x27,#28
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ eor x17,x27,x27,ror#5
+ add x26,x26,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x22,x22,x26 // d+=h
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x7,x17,ror#34 // Sigma0(a)
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x26,x26,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x5,x5 // 2
+#endif
+ add x26,x26,x17 // h+=Sigma0(a)
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ eor x8,x22,x22,ror#23
+ and x17,x23,x22
+ bic x19,x24,x22
+ add x25,x25,x5 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x8,ror#18 // Sigma1(e)
+ ror x8,x26,#28
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ eor x17,x26,x26,ror#5
+ add x25,x25,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x21,x21,x25 // d+=h
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x8,x17,ror#34 // Sigma0(a)
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x25,x25,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x6,x6 // 3
+#endif
+ ldp x7,x8,[x1],#2*8
+ add x25,x25,x17 // h+=Sigma0(a)
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ eor x9,x21,x21,ror#23
+ and x17,x22,x21
+ bic x28,x23,x21
+ add x24,x24,x6 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x9,ror#18 // Sigma1(e)
+ ror x9,x25,#28
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ eor x17,x25,x25,ror#5
+ add x24,x24,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x20,x20,x24 // d+=h
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x9,x17,ror#34 // Sigma0(a)
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x24,x24,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x7,x7 // 4
+#endif
+ add x24,x24,x17 // h+=Sigma0(a)
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ eor x10,x20,x20,ror#23
+ and x17,x21,x20
+ bic x19,x22,x20
+ add x23,x23,x7 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x10,ror#18 // Sigma1(e)
+ ror x10,x24,#28
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ eor x17,x24,x24,ror#5
+ add x23,x23,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x27,x27,x23 // d+=h
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x10,x17,ror#34 // Sigma0(a)
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x23,x23,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x8,x8 // 5
+#endif
+ ldp x9,x10,[x1],#2*8
+ add x23,x23,x17 // h+=Sigma0(a)
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ eor x11,x27,x27,ror#23
+ and x17,x20,x27
+ bic x28,x21,x27
+ add x22,x22,x8 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x11,ror#18 // Sigma1(e)
+ ror x11,x23,#28
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ eor x17,x23,x23,ror#5
+ add x22,x22,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x26,x26,x22 // d+=h
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x11,x17,ror#34 // Sigma0(a)
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x22,x22,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x9,x9 // 6
+#endif
+ add x22,x22,x17 // h+=Sigma0(a)
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ eor x12,x26,x26,ror#23
+ and x17,x27,x26
+ bic x19,x20,x26
+ add x21,x21,x9 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x12,ror#18 // Sigma1(e)
+ ror x12,x22,#28
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ eor x17,x22,x22,ror#5
+ add x21,x21,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x25,x25,x21 // d+=h
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x12,x17,ror#34 // Sigma0(a)
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x21,x21,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x10,x10 // 7
+#endif
+ ldp x11,x12,[x1],#2*8
+ add x21,x21,x17 // h+=Sigma0(a)
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ eor x13,x25,x25,ror#23
+ and x17,x26,x25
+ bic x28,x27,x25
+ add x20,x20,x10 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x13,ror#18 // Sigma1(e)
+ ror x13,x21,#28
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ eor x17,x21,x21,ror#5
+ add x20,x20,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x24,x24,x20 // d+=h
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x13,x17,ror#34 // Sigma0(a)
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x20,x20,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x11,x11 // 8
+#endif
+ add x20,x20,x17 // h+=Sigma0(a)
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ eor x14,x24,x24,ror#23
+ and x17,x25,x24
+ bic x19,x26,x24
+ add x27,x27,x11 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x14,ror#18 // Sigma1(e)
+ ror x14,x20,#28
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ eor x17,x20,x20,ror#5
+ add x27,x27,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x23,x23,x27 // d+=h
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x14,x17,ror#34 // Sigma0(a)
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x27,x27,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x12,x12 // 9
+#endif
+ ldp x13,x14,[x1],#2*8
+ add x27,x27,x17 // h+=Sigma0(a)
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ eor x15,x23,x23,ror#23
+ and x17,x24,x23
+ bic x28,x25,x23
+ add x26,x26,x12 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x15,ror#18 // Sigma1(e)
+ ror x15,x27,#28
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ eor x17,x27,x27,ror#5
+ add x26,x26,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x22,x22,x26 // d+=h
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x15,x17,ror#34 // Sigma0(a)
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x26,x26,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x13,x13 // 10
+#endif
+ add x26,x26,x17 // h+=Sigma0(a)
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ eor x0,x22,x22,ror#23
+ and x17,x23,x22
+ bic x19,x24,x22
+ add x25,x25,x13 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x0,ror#18 // Sigma1(e)
+ ror x0,x26,#28
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ eor x17,x26,x26,ror#5
+ add x25,x25,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x21,x21,x25 // d+=h
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x0,x17,ror#34 // Sigma0(a)
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x25,x25,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x14,x14 // 11
+#endif
+ ldp x15,x0,[x1],#2*8
+ add x25,x25,x17 // h+=Sigma0(a)
+ str x6,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ eor x6,x21,x21,ror#23
+ and x17,x22,x21
+ bic x28,x23,x21
+ add x24,x24,x14 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x6,ror#18 // Sigma1(e)
+ ror x6,x25,#28
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ eor x17,x25,x25,ror#5
+ add x24,x24,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x20,x20,x24 // d+=h
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x6,x17,ror#34 // Sigma0(a)
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x24,x24,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x15,x15 // 12
+#endif
+ add x24,x24,x17 // h+=Sigma0(a)
+ str x7,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ eor x7,x20,x20,ror#23
+ and x17,x21,x20
+ bic x19,x22,x20
+ add x23,x23,x15 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x7,ror#18 // Sigma1(e)
+ ror x7,x24,#28
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ eor x17,x24,x24,ror#5
+ add x23,x23,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x27,x27,x23 // d+=h
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x7,x17,ror#34 // Sigma0(a)
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x23,x23,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x0,x0 // 13
+#endif
+ ldp x1,x2,[x1]
+ add x23,x23,x17 // h+=Sigma0(a)
+ str x8,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ eor x8,x27,x27,ror#23
+ and x17,x20,x27
+ bic x28,x21,x27
+ add x22,x22,x0 // h+=X[i]
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x8,ror#18 // Sigma1(e)
+ ror x8,x23,#28
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ eor x17,x23,x23,ror#5
+ add x22,x22,x16 // h+=Sigma1(e)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ add x26,x26,x22 // d+=h
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x8,x17,ror#34 // Sigma0(a)
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ //add x22,x22,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x1,x1 // 14
+#endif
+ ldr x6,[sp,#24]
+ add x22,x22,x17 // h+=Sigma0(a)
+ str x9,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ eor x9,x26,x26,ror#23
+ and x17,x27,x26
+ bic x19,x20,x26
+ add x21,x21,x1 // h+=X[i]
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x9,ror#18 // Sigma1(e)
+ ror x9,x22,#28
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ eor x17,x22,x22,ror#5
+ add x21,x21,x16 // h+=Sigma1(e)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ add x25,x25,x21 // d+=h
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x9,x17,ror#34 // Sigma0(a)
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ //add x21,x21,x17 // h+=Sigma0(a)
+#ifndef __ARMEB__
+ rev x2,x2 // 15
+#endif
+ ldr x7,[sp,#0]
+ add x21,x21,x17 // h+=Sigma0(a)
+ str x10,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x9,x4,#1
+ and x17,x26,x25
+ ror x8,x1,#19
+ bic x28,x27,x25
+ ror x10,x21,#28
+ add x20,x20,x2 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x9,x9,x4,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x10,x10,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x8,x8,x1,ror#61
+ eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x10,x21,ror#39 // Sigma0(a)
+ eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
+ add x3,x3,x12
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x3,x3,x9
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x3,x3,x8
+.Loop_16_xx:
+ ldr x8,[sp,#8]
+ str x11,[sp,#0]
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ ror x10,x5,#1
+ and x17,x25,x24
+ ror x9,x2,#19
+ bic x19,x26,x24
+ ror x11,x20,#28
+ add x27,x27,x3 // h+=X[i]
+ eor x16,x16,x24,ror#18
+ eor x10,x10,x5,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x24,ror#41 // Sigma1(e)
+ eor x11,x11,x20,ror#34
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x9,x9,x2,ror#61
+ eor x10,x10,x5,lsr#7 // sigma0(X[i+1])
+ add x27,x27,x16 // h+=Sigma1(e)
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x11,x20,ror#39 // Sigma0(a)
+ eor x9,x9,x2,lsr#6 // sigma1(X[i+14])
+ add x4,x4,x13
+ add x23,x23,x27 // d+=h
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x4,x4,x10
+ add x27,x27,x17 // h+=Sigma0(a)
+ add x4,x4,x9
+ ldr x9,[sp,#16]
+ str x12,[sp,#8]
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ ror x11,x6,#1
+ and x17,x24,x23
+ ror x10,x3,#19
+ bic x28,x25,x23
+ ror x12,x27,#28
+ add x26,x26,x4 // h+=X[i]
+ eor x16,x16,x23,ror#18
+ eor x11,x11,x6,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x23,ror#41 // Sigma1(e)
+ eor x12,x12,x27,ror#34
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x10,x10,x3,ror#61
+ eor x11,x11,x6,lsr#7 // sigma0(X[i+1])
+ add x26,x26,x16 // h+=Sigma1(e)
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x12,x27,ror#39 // Sigma0(a)
+ eor x10,x10,x3,lsr#6 // sigma1(X[i+14])
+ add x5,x5,x14
+ add x22,x22,x26 // d+=h
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x5,x5,x11
+ add x26,x26,x17 // h+=Sigma0(a)
+ add x5,x5,x10
+ ldr x10,[sp,#24]
+ str x13,[sp,#16]
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ ror x12,x7,#1
+ and x17,x23,x22
+ ror x11,x4,#19
+ bic x19,x24,x22
+ ror x13,x26,#28
+ add x25,x25,x5 // h+=X[i]
+ eor x16,x16,x22,ror#18
+ eor x12,x12,x7,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x22,ror#41 // Sigma1(e)
+ eor x13,x13,x26,ror#34
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x11,x11,x4,ror#61
+ eor x12,x12,x7,lsr#7 // sigma0(X[i+1])
+ add x25,x25,x16 // h+=Sigma1(e)
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x13,x26,ror#39 // Sigma0(a)
+ eor x11,x11,x4,lsr#6 // sigma1(X[i+14])
+ add x6,x6,x15
+ add x21,x21,x25 // d+=h
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x6,x6,x12
+ add x25,x25,x17 // h+=Sigma0(a)
+ add x6,x6,x11
+ ldr x11,[sp,#0]
+ str x14,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ ror x13,x8,#1
+ and x17,x22,x21
+ ror x12,x5,#19
+ bic x28,x23,x21
+ ror x14,x25,#28
+ add x24,x24,x6 // h+=X[i]
+ eor x16,x16,x21,ror#18
+ eor x13,x13,x8,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x21,ror#41 // Sigma1(e)
+ eor x14,x14,x25,ror#34
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x12,x12,x5,ror#61
+ eor x13,x13,x8,lsr#7 // sigma0(X[i+1])
+ add x24,x24,x16 // h+=Sigma1(e)
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x14,x25,ror#39 // Sigma0(a)
+ eor x12,x12,x5,lsr#6 // sigma1(X[i+14])
+ add x7,x7,x0
+ add x20,x20,x24 // d+=h
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x7,x7,x13
+ add x24,x24,x17 // h+=Sigma0(a)
+ add x7,x7,x12
+ ldr x12,[sp,#8]
+ str x15,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ ror x14,x9,#1
+ and x17,x21,x20
+ ror x13,x6,#19
+ bic x19,x22,x20
+ ror x15,x24,#28
+ add x23,x23,x7 // h+=X[i]
+ eor x16,x16,x20,ror#18
+ eor x14,x14,x9,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x20,ror#41 // Sigma1(e)
+ eor x15,x15,x24,ror#34
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x13,x13,x6,ror#61
+ eor x14,x14,x9,lsr#7 // sigma0(X[i+1])
+ add x23,x23,x16 // h+=Sigma1(e)
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x15,x24,ror#39 // Sigma0(a)
+ eor x13,x13,x6,lsr#6 // sigma1(X[i+14])
+ add x8,x8,x1
+ add x27,x27,x23 // d+=h
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x8,x8,x14
+ add x23,x23,x17 // h+=Sigma0(a)
+ add x8,x8,x13
+ ldr x13,[sp,#16]
+ str x0,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ ror x15,x10,#1
+ and x17,x20,x27
+ ror x14,x7,#19
+ bic x28,x21,x27
+ ror x0,x23,#28
+ add x22,x22,x8 // h+=X[i]
+ eor x16,x16,x27,ror#18
+ eor x15,x15,x10,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x27,ror#41 // Sigma1(e)
+ eor x0,x0,x23,ror#34
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x14,x14,x7,ror#61
+ eor x15,x15,x10,lsr#7 // sigma0(X[i+1])
+ add x22,x22,x16 // h+=Sigma1(e)
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x0,x23,ror#39 // Sigma0(a)
+ eor x14,x14,x7,lsr#6 // sigma1(X[i+14])
+ add x9,x9,x2
+ add x26,x26,x22 // d+=h
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x9,x9,x15
+ add x22,x22,x17 // h+=Sigma0(a)
+ add x9,x9,x14
+ ldr x14,[sp,#24]
+ str x1,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ ror x0,x11,#1
+ and x17,x27,x26
+ ror x15,x8,#19
+ bic x19,x20,x26
+ ror x1,x22,#28
+ add x21,x21,x9 // h+=X[i]
+ eor x16,x16,x26,ror#18
+ eor x0,x0,x11,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x26,ror#41 // Sigma1(e)
+ eor x1,x1,x22,ror#34
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x15,x15,x8,ror#61
+ eor x0,x0,x11,lsr#7 // sigma0(X[i+1])
+ add x21,x21,x16 // h+=Sigma1(e)
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x1,x22,ror#39 // Sigma0(a)
+ eor x15,x15,x8,lsr#6 // sigma1(X[i+14])
+ add x10,x10,x3
+ add x25,x25,x21 // d+=h
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x10,x10,x0
+ add x21,x21,x17 // h+=Sigma0(a)
+ add x10,x10,x15
+ ldr x15,[sp,#0]
+ str x2,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x1,x12,#1
+ and x17,x26,x25
+ ror x0,x9,#19
+ bic x28,x27,x25
+ ror x2,x21,#28
+ add x20,x20,x10 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x1,x1,x12,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x2,x2,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x0,x0,x9,ror#61
+ eor x1,x1,x12,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x2,x21,ror#39 // Sigma0(a)
+ eor x0,x0,x9,lsr#6 // sigma1(X[i+14])
+ add x11,x11,x4
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x11,x11,x1
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x11,x11,x0
+ ldr x0,[sp,#8]
+ str x3,[sp,#0]
+ ror x16,x24,#14
+ add x27,x27,x19 // h+=K[i]
+ ror x2,x13,#1
+ and x17,x25,x24
+ ror x1,x10,#19
+ bic x19,x26,x24
+ ror x3,x20,#28
+ add x27,x27,x11 // h+=X[i]
+ eor x16,x16,x24,ror#18
+ eor x2,x2,x13,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x20,x21 // a^b, b^c in next round
+ eor x16,x16,x24,ror#41 // Sigma1(e)
+ eor x3,x3,x20,ror#34
+ add x27,x27,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x1,x1,x10,ror#61
+ eor x2,x2,x13,lsr#7 // sigma0(X[i+1])
+ add x27,x27,x16 // h+=Sigma1(e)
+ eor x28,x28,x21 // Maj(a,b,c)
+ eor x17,x3,x20,ror#39 // Sigma0(a)
+ eor x1,x1,x10,lsr#6 // sigma1(X[i+14])
+ add x12,x12,x5
+ add x23,x23,x27 // d+=h
+ add x27,x27,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x12,x12,x2
+ add x27,x27,x17 // h+=Sigma0(a)
+ add x12,x12,x1
+ ldr x1,[sp,#16]
+ str x4,[sp,#8]
+ ror x16,x23,#14
+ add x26,x26,x28 // h+=K[i]
+ ror x3,x14,#1
+ and x17,x24,x23
+ ror x2,x11,#19
+ bic x28,x25,x23
+ ror x4,x27,#28
+ add x26,x26,x12 // h+=X[i]
+ eor x16,x16,x23,ror#18
+ eor x3,x3,x14,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x27,x20 // a^b, b^c in next round
+ eor x16,x16,x23,ror#41 // Sigma1(e)
+ eor x4,x4,x27,ror#34
+ add x26,x26,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x2,x2,x11,ror#61
+ eor x3,x3,x14,lsr#7 // sigma0(X[i+1])
+ add x26,x26,x16 // h+=Sigma1(e)
+ eor x19,x19,x20 // Maj(a,b,c)
+ eor x17,x4,x27,ror#39 // Sigma0(a)
+ eor x2,x2,x11,lsr#6 // sigma1(X[i+14])
+ add x13,x13,x6
+ add x22,x22,x26 // d+=h
+ add x26,x26,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x13,x13,x3
+ add x26,x26,x17 // h+=Sigma0(a)
+ add x13,x13,x2
+ ldr x2,[sp,#24]
+ str x5,[sp,#16]
+ ror x16,x22,#14
+ add x25,x25,x19 // h+=K[i]
+ ror x4,x15,#1
+ and x17,x23,x22
+ ror x3,x12,#19
+ bic x19,x24,x22
+ ror x5,x26,#28
+ add x25,x25,x13 // h+=X[i]
+ eor x16,x16,x22,ror#18
+ eor x4,x4,x15,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x26,x27 // a^b, b^c in next round
+ eor x16,x16,x22,ror#41 // Sigma1(e)
+ eor x5,x5,x26,ror#34
+ add x25,x25,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x3,x3,x12,ror#61
+ eor x4,x4,x15,lsr#7 // sigma0(X[i+1])
+ add x25,x25,x16 // h+=Sigma1(e)
+ eor x28,x28,x27 // Maj(a,b,c)
+ eor x17,x5,x26,ror#39 // Sigma0(a)
+ eor x3,x3,x12,lsr#6 // sigma1(X[i+14])
+ add x14,x14,x7
+ add x21,x21,x25 // d+=h
+ add x25,x25,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x14,x14,x4
+ add x25,x25,x17 // h+=Sigma0(a)
+ add x14,x14,x3
+ ldr x3,[sp,#0]
+ str x6,[sp,#24]
+ ror x16,x21,#14
+ add x24,x24,x28 // h+=K[i]
+ ror x5,x0,#1
+ and x17,x22,x21
+ ror x4,x13,#19
+ bic x28,x23,x21
+ ror x6,x25,#28
+ add x24,x24,x14 // h+=X[i]
+ eor x16,x16,x21,ror#18
+ eor x5,x5,x0,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x25,x26 // a^b, b^c in next round
+ eor x16,x16,x21,ror#41 // Sigma1(e)
+ eor x6,x6,x25,ror#34
+ add x24,x24,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x4,x4,x13,ror#61
+ eor x5,x5,x0,lsr#7 // sigma0(X[i+1])
+ add x24,x24,x16 // h+=Sigma1(e)
+ eor x19,x19,x26 // Maj(a,b,c)
+ eor x17,x6,x25,ror#39 // Sigma0(a)
+ eor x4,x4,x13,lsr#6 // sigma1(X[i+14])
+ add x15,x15,x8
+ add x20,x20,x24 // d+=h
+ add x24,x24,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x15,x15,x5
+ add x24,x24,x17 // h+=Sigma0(a)
+ add x15,x15,x4
+ ldr x4,[sp,#8]
+ str x7,[sp,#0]
+ ror x16,x20,#14
+ add x23,x23,x19 // h+=K[i]
+ ror x6,x1,#1
+ and x17,x21,x20
+ ror x5,x14,#19
+ bic x19,x22,x20
+ ror x7,x24,#28
+ add x23,x23,x15 // h+=X[i]
+ eor x16,x16,x20,ror#18
+ eor x6,x6,x1,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x24,x25 // a^b, b^c in next round
+ eor x16,x16,x20,ror#41 // Sigma1(e)
+ eor x7,x7,x24,ror#34
+ add x23,x23,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x5,x5,x14,ror#61
+ eor x6,x6,x1,lsr#7 // sigma0(X[i+1])
+ add x23,x23,x16 // h+=Sigma1(e)
+ eor x28,x28,x25 // Maj(a,b,c)
+ eor x17,x7,x24,ror#39 // Sigma0(a)
+ eor x5,x5,x14,lsr#6 // sigma1(X[i+14])
+ add x0,x0,x9
+ add x27,x27,x23 // d+=h
+ add x23,x23,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x0,x0,x6
+ add x23,x23,x17 // h+=Sigma0(a)
+ add x0,x0,x5
+ ldr x5,[sp,#16]
+ str x8,[sp,#8]
+ ror x16,x27,#14
+ add x22,x22,x28 // h+=K[i]
+ ror x7,x2,#1
+ and x17,x20,x27
+ ror x6,x15,#19
+ bic x28,x21,x27
+ ror x8,x23,#28
+ add x22,x22,x0 // h+=X[i]
+ eor x16,x16,x27,ror#18
+ eor x7,x7,x2,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x23,x24 // a^b, b^c in next round
+ eor x16,x16,x27,ror#41 // Sigma1(e)
+ eor x8,x8,x23,ror#34
+ add x22,x22,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x6,x6,x15,ror#61
+ eor x7,x7,x2,lsr#7 // sigma0(X[i+1])
+ add x22,x22,x16 // h+=Sigma1(e)
+ eor x19,x19,x24 // Maj(a,b,c)
+ eor x17,x8,x23,ror#39 // Sigma0(a)
+ eor x6,x6,x15,lsr#6 // sigma1(X[i+14])
+ add x1,x1,x10
+ add x26,x26,x22 // d+=h
+ add x22,x22,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x1,x1,x7
+ add x22,x22,x17 // h+=Sigma0(a)
+ add x1,x1,x6
+ ldr x6,[sp,#24]
+ str x9,[sp,#16]
+ ror x16,x26,#14
+ add x21,x21,x19 // h+=K[i]
+ ror x8,x3,#1
+ and x17,x27,x26
+ ror x7,x0,#19
+ bic x19,x20,x26
+ ror x9,x22,#28
+ add x21,x21,x1 // h+=X[i]
+ eor x16,x16,x26,ror#18
+ eor x8,x8,x3,ror#8
+ orr x17,x17,x19 // Ch(e,f,g)
+ eor x19,x22,x23 // a^b, b^c in next round
+ eor x16,x16,x26,ror#41 // Sigma1(e)
+ eor x9,x9,x22,ror#34
+ add x21,x21,x17 // h+=Ch(e,f,g)
+ and x28,x28,x19 // (b^c)&=(a^b)
+ eor x7,x7,x0,ror#61
+ eor x8,x8,x3,lsr#7 // sigma0(X[i+1])
+ add x21,x21,x16 // h+=Sigma1(e)
+ eor x28,x28,x23 // Maj(a,b,c)
+ eor x17,x9,x22,ror#39 // Sigma0(a)
+ eor x7,x7,x0,lsr#6 // sigma1(X[i+14])
+ add x2,x2,x11
+ add x25,x25,x21 // d+=h
+ add x21,x21,x28 // h+=Maj(a,b,c)
+ ldr x28,[x30],#8 // *K++, x19 in next round
+ add x2,x2,x8
+ add x21,x21,x17 // h+=Sigma0(a)
+ add x2,x2,x7
+ ldr x7,[sp,#0]
+ str x10,[sp,#24]
+ ror x16,x25,#14
+ add x20,x20,x28 // h+=K[i]
+ ror x9,x4,#1
+ and x17,x26,x25
+ ror x8,x1,#19
+ bic x28,x27,x25
+ ror x10,x21,#28
+ add x20,x20,x2 // h+=X[i]
+ eor x16,x16,x25,ror#18
+ eor x9,x9,x4,ror#8
+ orr x17,x17,x28 // Ch(e,f,g)
+ eor x28,x21,x22 // a^b, b^c in next round
+ eor x16,x16,x25,ror#41 // Sigma1(e)
+ eor x10,x10,x21,ror#34
+ add x20,x20,x17 // h+=Ch(e,f,g)
+ and x19,x19,x28 // (b^c)&=(a^b)
+ eor x8,x8,x1,ror#61
+ eor x9,x9,x4,lsr#7 // sigma0(X[i+1])
+ add x20,x20,x16 // h+=Sigma1(e)
+ eor x19,x19,x22 // Maj(a,b,c)
+ eor x17,x10,x21,ror#39 // Sigma0(a)
+ eor x8,x8,x1,lsr#6 // sigma1(X[i+14])
+ add x3,x3,x12
+ add x24,x24,x20 // d+=h
+ add x20,x20,x19 // h+=Maj(a,b,c)
+ ldr x19,[x30],#8 // *K++, x28 in next round
+ add x3,x3,x9
+ add x20,x20,x17 // h+=Sigma0(a)
+ add x3,x3,x8
+ cbnz x19,.Loop_16_xx
+
+ ldp x0,x2,[x29,#96]
+ ldr x1,[x29,#112]
+ sub x30,x30,#648 // rewind
+
+ ldp x3,x4,[x0]
+ ldp x5,x6,[x0,#2*8]
+ add x1,x1,#14*8 // advance input pointer
+ ldp x7,x8,[x0,#4*8]
+ add x20,x20,x3
+ ldp x9,x10,[x0,#6*8]
+ add x21,x21,x4
+ add x22,x22,x5
+ add x23,x23,x6
+ stp x20,x21,[x0]
+ add x24,x24,x7
+ add x25,x25,x8
+ stp x22,x23,[x0,#2*8]
+ add x26,x26,x9
+ add x27,x27,x10
+ cmp x1,x2
+ stp x24,x25,[x0,#4*8]
+ stp x26,x27,[x0,#6*8]
+ b.ne .Loop
+
+ ldp x19,x20,[x29,#16]
+ add sp,sp,#4*8
+ ldp x21,x22,[x29,#32]
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size sha512_block_data_order,.-sha512_block_data_order
+
+.section .rodata
+.align 6
+.type .LK512,%object
+.LK512:
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad 0 // terminator
+.size .LK512,.-.LK512
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S
@@ -1,0 +1,1235 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.section .rodata
+
+.type _vpaes_consts,%object
+.align 7 // totally strategic alignment
+_vpaes_consts:
+.Lk_mc_forward: // mc_forward
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+.Lk_mc_backward: // mc_backward
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+.Lk_sr: // sr
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+//
+// "Hot" constants
+//
+.Lk_inv: // inv, inva
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+.Lk_ipt: // input transform (lo, hi)
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+.Lk_sbo: // sbou, sbot
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+.Lk_sb1: // sb1u, sb1t
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.Lk_sb2: // sb2u, sb2t
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+//
+// Decryption stuff
+//
+.Lk_dipt: // decryption input transform
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+.Lk_dsbo: // decryption sbox final output
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.Lk_dsb9: // decryption sbox output *9*u, *9*t
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd: // decryption sbox output *D*u, *D*t
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb: // decryption sbox output *B*u, *B*t
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe: // decryption sbox output *E*u, *E*t
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+
+//
+// Key schedule constants
+//
+.Lk_dksd: // decryption key schedule: invskew x*D
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb: // decryption key schedule: invskew x*B
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse: // decryption key schedule: invskew x*E + 0x63
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9: // decryption key schedule: invskew x*9
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+.Lk_rcon: // rcon
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt: // output transform
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew: // deskew tables: inverts the sbox's "skew"
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 2
+.size _vpaes_consts,.-_vpaes_consts
+.align 6
+
+.text
+##
+## _aes_preheat
+##
+## Fills register %r10 -> .aes_consts (so you can -fPIC)
+## and %xmm9-%xmm15 as specified below.
+##
+.type _vpaes_encrypt_preheat,%function
+.align 4
+_vpaes_encrypt_preheat:
+ adrp x10, .Lk_inv
+ add x10, x10, :lo12:.Lk_inv
+ movi v17.16b, #0x0f
+ ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
+ ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2
+ ret
+.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
+
+##
+## _aes_encrypt_core
+##
+## AES-encrypt %xmm0.
+##
+## Inputs:
+## %xmm0 = input
+## %xmm9-%xmm15 as in _vpaes_preheat
+## (%rdx) = scheduled keys
+##
+## Output in %xmm0
+## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
+## Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.type _vpaes_encrypt_core,%function
+.align 4
+_vpaes_encrypt_core:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+ adrp x11, .Lk_mc_forward+16
+ add x11, x11, :lo12:.Lk_mc_forward+16
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ b .Lenc_entry
+
+.align 4
+.Lenc_loop:
+ // middle of middle round
+ add x10, x11, #0x40
+ tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ sub w8, w8, #1 // nr--
+
+.Lenc_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
+ cbnz w8, .Lenc_loop
+
+ // middle of last round
+ add x10, x11, #0x80
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+.globl vpaes_encrypt
+.hidden vpaes_encrypt
+.type vpaes_encrypt,%function
+.align 4
+vpaes_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v7.16b}, [x0]
+ bl _vpaes_encrypt_preheat
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [x1]
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_encrypt,.-vpaes_encrypt
+
+.type _vpaes_encrypt_2x,%function
+.align 4
+_vpaes_encrypt_2x:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+ adrp x11, .Lk_mc_forward+16
+ add x11, x11, :lo12:.Lk_mc_forward+16
+ // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ and v9.16b, v15.16b, v17.16b
+ ushr v8.16b, v15.16b, #4
+ tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
+ tbl v9.16b, {v20.16b}, v9.16b
+ // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
+ tbl v10.16b, {v21.16b}, v8.16b
+ eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
+ eor v8.16b, v9.16b, v16.16b
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ eor v8.16b, v8.16b, v10.16b
+ b .Lenc_2x_entry
+
+.align 4
+.Lenc_2x_loop:
+ // middle of middle round
+ add x10, x11, #0x40
+ tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ tbl v12.16b, {v25.16b}, v10.16b
+ ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ tbl v8.16b, {v24.16b}, v11.16b
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ tbl v13.16b, {v27.16b}, v10.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ eor v8.16b, v8.16b, v12.16b
+ tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ tbl v10.16b, {v26.16b}, v11.16b
+ ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ tbl v11.16b, {v8.16b}, v1.16b
+ eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ eor v10.16b, v10.16b, v13.16b
+ tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ tbl v8.16b, {v8.16b}, v4.16b
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ eor v11.16b, v11.16b, v10.16b
+ tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ tbl v12.16b, {v11.16b},v1.16b
+ eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ eor v8.16b, v8.16b, v11.16b
+ and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ eor v8.16b, v8.16b, v12.16b
+ sub w8, w8, #1 // nr--
+
+.Lenc_2x_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ and v9.16b, v8.16b, v17.16b
+ ushr v8.16b, v8.16b, #4
+ tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ tbl v13.16b, {v19.16b},v9.16b
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ eor v9.16b, v9.16b, v8.16b
+ tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v11.16b, {v18.16b},v8.16b
+ tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ tbl v12.16b, {v18.16b},v9.16b
+ eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v11.16b, v11.16b, v13.16b
+ eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ eor v12.16b, v12.16b, v13.16b
+ tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v10.16b, {v18.16b},v11.16b
+ tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ tbl v11.16b, {v18.16b},v12.16b
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v10.16b, v10.16b, v9.16b
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ eor v11.16b, v11.16b, v8.16b
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
+ cbnz w8, .Lenc_2x_loop
+
+ // middle of last round
+ add x10, x11, #0x80
+ // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
+ // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ tbl v12.16b, {v22.16b}, v10.16b
+ ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ tbl v8.16b, {v23.16b}, v11.16b
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ eor v8.16b, v8.16b, v12.16b
+ tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
+ tbl v1.16b, {v8.16b},v1.16b
+ ret
+.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
+
+.type _vpaes_decrypt_preheat,%function
+.align 4
+_vpaes_decrypt_preheat:
+ adrp x10, .Lk_inv
+ add x10, x10, :lo12:.Lk_inv
+ movi v17.16b, #0x0f
+ adrp x11, .Lk_dipt
+ add x11, x11, :lo12:.Lk_dipt
+ ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv
+ ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd
+ ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe
+ ret
+.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
+
+##
+## Decryption core
+##
+## Same API as encryption core.
+##
+.type _vpaes_decrypt_core,%function
+.align 4
+_vpaes_decrypt_core:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+
+ // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
+ eor x11, x11, #0x30 // xor $0x30, %r11
+ adrp x10, .Lk_sr
+ add x10, x10, :lo12:.Lk_sr
+ and x11, x11, #0x30 // and $0x30, %r11
+ add x11, x11, x10
+ adrp x10, .Lk_mc_forward+48
+ add x10, x10, :lo12:.Lk_mc_forward+48
+
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
+ and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
+ // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ b .Ldec_entry
+
+.align 4
+.Ldec_loop:
+//
+// Inverse mix columns
+//
+ // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
+ // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ sub w8, w8, #1 // sub $1,%rax # nr--
+
+.Ldec_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
+ cbnz w8, .Ldec_loop
+
+ // middle of last round
+ // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+.globl vpaes_decrypt
+.hidden vpaes_decrypt
+.type vpaes_decrypt,%function
+.align 4
+vpaes_decrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ ld1 {v7.16b}, [x0]
+ bl _vpaes_decrypt_preheat
+ bl _vpaes_decrypt_core
+ st1 {v0.16b}, [x1]
+
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_decrypt,.-vpaes_decrypt
+
+// v14-v15 input, v0-v1 output
+.type _vpaes_decrypt_2x,%function
+.align 4
+_vpaes_decrypt_2x:
+ mov x9, x2
+ ldr w8, [x2,#240] // pull rounds
+
+ // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11
+ eor x11, x11, #0x30 // xor $0x30, %r11
+ adrp x10, .Lk_sr
+ add x10, x10, :lo12:.Lk_sr
+ and x11, x11, #0x30 // and $0x30, %r11
+ add x11, x11, x10
+ adrp x10, .Lk_mc_forward+48
+ add x10, x10, :lo12:.Lk_mc_forward+48
+
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
+ and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ and v9.16b, v15.16b, v17.16b
+ ushr v8.16b, v15.16b, #4
+ tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ tbl v10.16b, {v20.16b},v9.16b
+ ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
+ // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ tbl v8.16b, {v21.16b},v8.16b
+ eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
+ eor v10.16b, v10.16b, v16.16b
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ eor v8.16b, v8.16b, v10.16b
+ b .Ldec_2x_entry
+
+.align 4
+.Ldec_2x_loop:
+//
+// Inverse mix columns
+//
+ // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ tbl v12.16b, {v24.16b}, v10.16b
+ tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ tbl v9.16b, {v25.16b}, v11.16b
+ eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
+ eor v8.16b, v12.16b, v16.16b
+ // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+
+ tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ tbl v12.16b, {v26.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ tbl v9.16b, {v27.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+ tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ tbl v12.16b, {v28.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ tbl v9.16b, {v29.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
+
+ tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ tbl v12.16b, {v30.16b}, v10.16b
+ tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ tbl v8.16b, {v8.16b},v5.16b
+ tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ tbl v9.16b, {v31.16b}, v11.16b
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ eor v8.16b, v8.16b, v12.16b
+ ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
+ eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ eor v8.16b, v8.16b, v9.16b
+ sub w8, w8, #1 // sub $1,%rax # nr--
+
+.Ldec_2x_entry:
+ // top of round
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ and v9.16b, v8.16b, v17.16b
+ ushr v8.16b, v8.16b, #4
+ tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ tbl v10.16b, {v19.16b},v9.16b
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ eor v9.16b, v9.16b, v8.16b
+ tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ tbl v11.16b, {v18.16b},v8.16b
+ tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ tbl v12.16b, {v18.16b},v9.16b
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ eor v11.16b, v11.16b, v10.16b
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ eor v12.16b, v12.16b, v10.16b
+ tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ tbl v10.16b, {v18.16b},v11.16b
+ tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ tbl v11.16b, {v18.16b},v12.16b
+ eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ eor v10.16b, v10.16b, v9.16b
+ eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ eor v11.16b, v11.16b, v8.16b
+ ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
+ cbnz w8, .Ldec_2x_loop
+
+ // middle of last round
+ // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ tbl v12.16b, {v22.16b}, v10.16b
+ // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ tbl v9.16b, {v23.16b}, v11.16b
+ ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ eor v12.16b, v12.16b, v16.16b
+ eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ eor v8.16b, v9.16b, v12.16b
+ tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
+ tbl v1.16b, {v8.16b},v2.16b
+ ret
+.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x
+########################################################
+## ##
+## AES key schedule ##
+## ##
+########################################################
+.type _vpaes_key_preheat,%function
+.align 4
+_vpaes_key_preheat:
+ adrp x10, .Lk_inv
+ add x10, x10, :lo12:.Lk_inv
+ movi v16.16b, #0x5b // .Lk_s63
+ adrp x11, .Lk_sb1
+ add x11, x11, :lo12:.Lk_sb1
+ movi v17.16b, #0x0f // .Lk_s0F
+ ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt
+ adrp x10, .Lk_dksd
+ add x10, x10, :lo12:.Lk_dksd
+ ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1
+ adrp x11, .Lk_mc_forward
+ add x11, x11, :lo12:.Lk_mc_forward
+ ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
+ ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
+ ld1 {v8.2d}, [x10] // .Lk_rcon
+ ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
+ ret
+.size _vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type _vpaes_schedule_core,%function
+.align 4
+_vpaes_schedule_core:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp,#-16]!
+ add x29,sp,#0
+
+ bl _vpaes_key_preheat // load the tables
+
+ ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
+
+ // input transform
+ mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
+ bl _vpaes_schedule_transform
+ mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
+
+ adrp x10, .Lk_sr // lea .Lk_sr(%rip),%r10
+ add x10, x10, :lo12:.Lk_sr
+
+ add x8, x8, x10
+ cbnz w3, .Lschedule_am_decrypting
+
+ // encrypting, output zeroth round key after transform
+ st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx)
+ b .Lschedule_go
+
+.Lschedule_am_decrypting:
+ // decrypting, output zeroth round key after shiftrows
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
+ eor x8, x8, #0x30 // xor $0x30, %r8
+
+.Lschedule_go:
+ cmp w1, #192 // cmp $192, %esi
+ b.hi .Lschedule_256
+ b.eq .Lschedule_192
+ // 128: fall though
+
+##
+## .schedule_128
+##
+## 128-bit specific part of key schedule.
+##
+## This schedule is really simple, because all its parts
+## are accomplished by the subroutines.
+##
+.Lschedule_128:
+ mov x0, #10 // mov $10, %esi
+
+.Loop_schedule_128:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ cbz x0, .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // write output
+ b .Loop_schedule_128
+
+##
+## .aes_schedule_192
+##
+## 192-bit specific part of key schedule.
+##
+## The main body of this schedule is the same as the 128-bit
+## schedule, but with more smearing. The long, high side is
+## stored in %xmm7 as before, and the short, low side is in
+## the high bits of %xmm6.
+##
+## This schedule is somewhat nastier, however, because each
+## round produces 192 bits of key material, or 1.5 round keys.
+## Therefore, on each cycle we do 2 rounds and produce 3 round
+## keys.
+##
+.align 4
+.Lschedule_192:
+ sub x0, x0, #8
+ ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
+ ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
+ mov x0, #4 // mov $4, %esi
+
+.Loop_schedule_192:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_round
+ ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
+ bl _vpaes_schedule_mangle // save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle // save key n+1
+ bl _vpaes_schedule_round
+ cbz x0, .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle // save key n+2
+ bl _vpaes_schedule_192_smear
+ b .Loop_schedule_192
+
+##
+## .aes_schedule_256
+##
+## 256-bit specific part of key schedule.
+##
+## The structure here is very similar to the 128-bit
+## schedule, but with an additional "low side" in
+## %xmm6. The low side's rounds are the same as the
+## high side's, except no rcon and no rotation.
+##
+.align 4
+.Lschedule_256:
+ ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ bl _vpaes_schedule_transform // input transform
+ mov x0, #7 // mov $7, %esi
+
+.Loop_schedule_256:
+ sub x0, x0, #1 // dec %esi
+ bl _vpaes_schedule_mangle // output low result
+ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ // high round
+ bl _vpaes_schedule_round
+ cbz x0, .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle
+
+ // low round. swap xmm7 and xmm6
+ dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
+ movi v4.16b, #0
+ mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
+ mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
+ bl _vpaes_schedule_low_round
+ mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
+
+ b .Loop_schedule_256
+
+##
+## .aes_schedule_mangle_last
+##
+## Mangler for last round of key schedule
+## Mangles %xmm0
+## when encrypting, outputs out(%xmm0) ^ 63
+## when decrypting, outputs unskew(%xmm0)
+##
+## Always called right before return... jumps to cleanup and exits
+##
+.align 4
+.Lschedule_mangle_last:
+ // schedule last round key from xmm0
+ adrp x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew
+ add x11, x11, :lo12:.Lk_deskew
+
+ cbnz w3, .Lschedule_mangle_last_dec
+
+ // encrypting
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
+ adrp x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform
+ add x11, x11, :lo12:.Lk_opt
+ add x2, x2, #32 // add $32, %rdx
+ tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
+
+.Lschedule_mangle_last_dec:
+ ld1 {v20.2d,v21.2d}, [x11] // reload constants
+ sub x2, x2, #16 // add $-16, %rdx
+ eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
+ bl _vpaes_schedule_transform // output transform
+ st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key
+
+ // cleanup
+ eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
+ eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
+ eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
+ eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
+ eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
+ eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
+ eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
+ ldp x29, x30, [sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+## .aes_schedule_192_smear
+##
+## Smear the short, low side in the 192-bit key schedule.
+##
+## Inputs:
+## %xmm7: high side, b a x y
+## %xmm6: low side, d c 0 0
+## %xmm13: 0
+##
+## Outputs:
+## %xmm6: b+c+d b+c 0 0
+## %xmm0: b+c+d b+c b a
+##
+.type _vpaes_schedule_192_smear,%function
+.align 4
+_vpaes_schedule_192_smear:
+ movi v1.16b, #0
+ dup v0.4s, v7.s[3]
+ ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
+ ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
+ eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
+ eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
+ mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
+ ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+## .aes_schedule_round
+##
+## Runs one main round of the key schedule on %xmm0, %xmm7
+##
+## Specifically, runs subbytes on the high dword of %xmm0
+## then rotates it by one byte and xors into the low dword of
+## %xmm7.
+##
+## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+## next rcon.
+##
+## Smears the dwords of %xmm7 by xoring the low into the
+## second low, result into third, result into highest.
+##
+## Returns results in %xmm7 = %xmm0.
+## Clobbers %xmm1-%xmm4, %r11.
+##
+.type _vpaes_schedule_round,%function
+.align 4
+_vpaes_schedule_round:
+ // extract rcon from xmm8
+ movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
+ ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
+ ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
+
+ // rotate
+ dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0
+ ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
+
+ // fall through...
+
+ // low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ // smear xmm7
+ ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
+ eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
+ ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
+
+ // subbytes
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
+ tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
+ tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
+ eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
+ eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
+ eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
+ tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
+ tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
+ eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
+
+ // add in smeared stuff
+ eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
+ eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+## .aes_schedule_transform
+##
+## Linear-transform %xmm0 according to tables at (%r11)
+##
+## Requires that %xmm9 = 0x0F0F... as in preheat
+## Output in %xmm0
+## Clobbers %xmm1, %xmm2
+##
+.type _vpaes_schedule_transform,%function
+.align 4
+_vpaes_schedule_transform:
+ and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
+ ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
+ // vmovdqa (%r11), %xmm2 # lo
+ tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
+ // vmovdqa 16(%r11), %xmm1 # hi
+ tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
+ eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+## .aes_schedule_mangle
+##
+## Mangle xmm0 from (basis-transformed) standard version
+## to our version.
+##
+## On encrypt,
+## xor with 0x63
+## multiply by circulant 0,1,1,1
+## apply shiftrows transform
+##
+## On decrypt,
+## xor with 0x63
+## multiply by "inverse mixcolumns" circulant E,B,D,9
+## deskew
+## apply shiftrows transform
+##
+##
+## Writes out to (%rdx), and increments or decrements it
+## Keeps track of round number mod 4 in %r8
+## Preserves xmm0
+## Clobbers xmm1-xmm5
+##
+.type _vpaes_schedule_mangle,%function
+.align 4
+_vpaes_schedule_mangle:
+ mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
+ // vmovdqa .Lk_mc_forward(%rip),%xmm5
+ cbnz w3, .Lschedule_mangle_dec
+
+ // encrypting
+ eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
+ add x2, x2, #16 // add $16, %rdx
+ tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
+ tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
+ tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
+ eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
+
+ b .Lschedule_mangle_both
+.align 4
+.Lschedule_mangle_dec:
+ // inverse mix columns
+ // lea .Lk_dksd(%rip),%r11
+ ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
+ and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
+
+ // vmovdqa 0x00(%r11), %xmm2
+ tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ // vmovdqa 0x10(%r11), %xmm3
+ tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+
+ // vmovdqa 0x20(%r11), %xmm2
+ tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ // vmovdqa 0x30(%r11), %xmm3
+ tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+
+ // vmovdqa 0x40(%r11), %xmm2
+ tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ // vmovdqa 0x50(%r11), %xmm3
+ tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
+
+ // vmovdqa 0x60(%r11), %xmm2
+ tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
+ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
+ // vmovdqa 0x70(%r11), %xmm4
+ tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
+ ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
+ eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
+ eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
+
+ sub x2, x2, #16 // add $-16, %rdx
+
+.Lschedule_mangle_both:
+ tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
+ add x8, x8, #48 // add $-16, %r8
+ and x8, x8, #~(1<<6) // and $0x30, %r8
+ st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl vpaes_set_encrypt_key
+.hidden vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,%function
+.align 4
+vpaes_set_encrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+
+ lsr w9, w1, #5 // shr $5,%eax
+ add w9, w9, #5 // $5,%eax
+ str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ mov w3, #0 // mov $0,%ecx
+ mov x8, #0x30 // mov $0x30,%r8d
+ bl _vpaes_schedule_core
+ eor x0, x0, x0
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl vpaes_set_decrypt_key
+.hidden vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,%function
+.align 4
+vpaes_set_decrypt_key:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+
+ lsr w9, w1, #5 // shr $5,%eax
+ add w9, w9, #5 // $5,%eax
+ str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ lsl w9, w9, #4 // shl $4,%eax
+ add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
+ add x2, x2, x9
+
+ mov w3, #1 // mov $1,%ecx
+ lsr w8, w1, #1 // shr $1,%r8d
+ and x8, x8, #32 // and $32,%r8d
+ eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32
+ bl _vpaes_schedule_core
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+.globl vpaes_cbc_encrypt
+.hidden vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,%function
+.align 4
+vpaes_cbc_encrypt:
+ AARCH64_SIGN_LINK_REGISTER
+ cbz x2, .Lcbc_abort
+ cmp w5, #0 // check direction
+ b.eq vpaes_cbc_decrypt
+
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+ mov x17, x2 // reassign
+ mov x2, x3 // reassign
+
+ ld1 {v0.16b}, [x4] // load ivec
+ bl _vpaes_encrypt_preheat
+ b .Lcbc_enc_loop
+
+.align 4
+.Lcbc_enc_loop:
+ ld1 {v7.16b}, [x0],#16 // load input
+ eor v7.16b, v7.16b, v0.16b // xor with ivec
+ bl _vpaes_encrypt_core
+ st1 {v0.16b}, [x1],#16 // save output
+ subs x17, x17, #16
+ b.hi .Lcbc_enc_loop
+
+ st1 {v0.16b}, [x4] // write ivec
+
+ ldp x29,x30,[sp],#16
+.Lcbc_abort:
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+.type vpaes_cbc_decrypt,%function
+.align 4
+vpaes_cbc_decrypt:
+ // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
+ // only from vpaes_cbc_encrypt which has already signed the return address.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ mov x17, x2 // reassign
+ mov x2, x3 // reassign
+ ld1 {v6.16b}, [x4] // load ivec
+ bl _vpaes_decrypt_preheat
+ tst x17, #16
+ b.eq .Lcbc_dec_loop2x
+
+ ld1 {v7.16b}, [x0], #16 // load input
+ bl _vpaes_decrypt_core
+ eor v0.16b, v0.16b, v6.16b // xor with ivec
+ orr v6.16b, v7.16b, v7.16b // next ivec value
+ st1 {v0.16b}, [x1], #16
+ subs x17, x17, #16
+ b.ls .Lcbc_dec_done
+
+.align 4
+.Lcbc_dec_loop2x:
+ ld1 {v14.16b,v15.16b}, [x0], #32
+ bl _vpaes_decrypt_2x
+ eor v0.16b, v0.16b, v6.16b // xor with ivec
+ eor v1.16b, v1.16b, v14.16b
+ orr v6.16b, v15.16b, v15.16b
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #32
+ b.hi .Lcbc_dec_loop2x
+
+.Lcbc_dec_done:
+ st1 {v6.16b}, [x4]
+
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
+.globl vpaes_ctr32_encrypt_blocks
+.hidden vpaes_ctr32_encrypt_blocks
+.type vpaes_ctr32_encrypt_blocks,%function
+.align 4
+vpaes_ctr32_encrypt_blocks:
+ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+ stp d10,d11,[sp,#-16]!
+ stp d12,d13,[sp,#-16]!
+ stp d14,d15,[sp,#-16]!
+
+ cbz x2, .Lctr32_done
+
+ // Note, unlike the other functions, x2 here is measured in blocks,
+ // not bytes.
+ mov x17, x2
+ mov x2, x3
+
+ // Load the IV and counter portion.
+ ldr w6, [x4, #12]
+ ld1 {v7.16b}, [x4]
+
+ bl _vpaes_encrypt_preheat
+ tst x17, #1
+ rev w6, w6 // The counter is big-endian.
+ b.eq .Lctr32_prep_loop
+
+ // Handle one block so the remaining block count is even for
+ // _vpaes_encrypt_2x.
+ ld1 {v6.16b}, [x0], #16 // .Load input ahead of time
+ bl _vpaes_encrypt_core
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
+ st1 {v0.16b}, [x1], #16
+ subs x17, x17, #1
+ // Update the counter.
+ add w6, w6, #1
+ rev w7, w6
+ mov v7.s[3], w7
+ b.ls .Lctr32_done
+
+.Lctr32_prep_loop:
+ // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
+ // uses v14 and v15.
+ mov v15.16b, v7.16b
+ mov v14.16b, v7.16b
+ add w6, w6, #1
+ rev w7, w6
+ mov v15.s[3], w7
+
+.Lctr32_loop:
+ ld1 {v6.16b,v7.16b}, [x0], #32 // .Load input ahead of time
+ bl _vpaes_encrypt_2x
+ eor v0.16b, v0.16b, v6.16b // XOR input and result
+ eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
+ st1 {v0.16b,v1.16b}, [x1], #32
+ subs x17, x17, #2
+ // Update the counter.
+ add w7, w6, #1
+ add w6, w6, #2
+ rev w7, w7
+ mov v14.s[3], w7
+ rev w7, w6
+ mov v15.s[3], w7
+ b.hi .Lctr32_loop
+
+.Lctr32_done:
+ ldp d14,d15,[sp],#16
+ ldp d12,d13,[sp],#16
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-aarch64/crypto/test/trampoline-armv8.S
@@ -1,0 +1,761 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__aarch64__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+
+// abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+// with |argv|, then saves the callee-saved registers into |state|. It returns
+// the result of |func|. The |unwind| argument is unused.
+// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state,
+// const uint64_t *argv, size_t argc,
+// uint64_t unwind);
+.type abi_test_trampoline, %function
+.globl abi_test_trampoline
+.hidden abi_test_trampoline
+.align 4
+abi_test_trampoline:
+.Labi_test_trampoline_begin:
+ AARCH64_SIGN_LINK_REGISTER
+ // Stack layout (low to high addresses)
+ // x29,x30 (16 bytes)
+ // d8-d15 (64 bytes)
+ // x19-x28 (80 bytes)
+ // x1 (8 bytes)
+ // padding (8 bytes)
+ stp x29, x30, [sp, #-176]!
+ mov x29, sp
+
+ // Saved callee-saved registers and |state|.
+ stp d8, d9, [sp, #16]
+ stp d10, d11, [sp, #32]
+ stp d12, d13, [sp, #48]
+ stp d14, d15, [sp, #64]
+ stp x19, x20, [sp, #80]
+ stp x21, x22, [sp, #96]
+ stp x23, x24, [sp, #112]
+ stp x25, x26, [sp, #128]
+ stp x27, x28, [sp, #144]
+ str x1, [sp, #160]
+
+ // Load registers from |state|, with the exception of x29. x29 is the
+ // frame pointer and also callee-saved, but AAPCS64 allows platforms to
+ // mandate that x29 always point to a frame. iOS64 does so, which means
+ // we cannot fill x29 with entropy without violating ABI rules
+ // ourselves. x29 is tested separately below.
+ ldp d8, d9, [x1], #16
+ ldp d10, d11, [x1], #16
+ ldp d12, d13, [x1], #16
+ ldp d14, d15, [x1], #16
+ ldp x19, x20, [x1], #16
+ ldp x21, x22, [x1], #16
+ ldp x23, x24, [x1], #16
+ ldp x25, x26, [x1], #16
+ ldp x27, x28, [x1], #16
+
+ // Move parameters into temporary registers.
+ mov x9, x0
+ mov x10, x2
+ mov x11, x3
+
+ // Load parameters into registers.
+ cbz x11, .Largs_done
+ ldr x0, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x1, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x2, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x3, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x4, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x5, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x6, [x10], #8
+ subs x11, x11, #1
+ b.eq .Largs_done
+ ldr x7, [x10], #8
+
+.Largs_done:
+ blr x9
+
+ // Reload |state| and store registers.
+ ldr x1, [sp, #160]
+ stp d8, d9, [x1], #16
+ stp d10, d11, [x1], #16
+ stp d12, d13, [x1], #16
+ stp d14, d15, [x1], #16
+ stp x19, x20, [x1], #16
+ stp x21, x22, [x1], #16
+ stp x23, x24, [x1], #16
+ stp x25, x26, [x1], #16
+ stp x27, x28, [x1], #16
+
+ // |func| is required to preserve x29, the frame pointer. We cannot load
+ // random values into x29 (see comment above), so compare it against the
+ // expected value and zero the field of |state| if corrupted.
+ mov x9, sp
+ cmp x29, x9
+ b.eq .Lx29_ok
+ str xzr, [x1]
+
+.Lx29_ok:
+ // Restore callee-saved registers.
+ ldp d8, d9, [sp, #16]
+ ldp d10, d11, [sp, #32]
+ ldp d12, d13, [sp, #48]
+ ldp d14, d15, [sp, #64]
+ ldp x19, x20, [sp, #80]
+ ldp x21, x22, [sp, #96]
+ ldp x23, x24, [sp, #112]
+ ldp x25, x26, [sp, #128]
+ ldp x27, x28, [sp, #144]
+
+ ldp x29, x30, [sp], #176
+ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+.size abi_test_trampoline,.-abi_test_trampoline
+.type abi_test_clobber_x0, %function
+.globl abi_test_clobber_x0
+.hidden abi_test_clobber_x0
+.align 4
+abi_test_clobber_x0:
+ AARCH64_VALID_CALL_TARGET
+ mov x0, xzr
+ ret
+.size abi_test_clobber_x0,.-abi_test_clobber_x0
+.type abi_test_clobber_x1, %function
+.globl abi_test_clobber_x1
+.hidden abi_test_clobber_x1
+.align 4
+abi_test_clobber_x1:
+ AARCH64_VALID_CALL_TARGET
+ mov x1, xzr
+ ret
+.size abi_test_clobber_x1,.-abi_test_clobber_x1
+.type abi_test_clobber_x2, %function
+.globl abi_test_clobber_x2
+.hidden abi_test_clobber_x2
+.align 4
+abi_test_clobber_x2:
+ AARCH64_VALID_CALL_TARGET
+ mov x2, xzr
+ ret
+.size abi_test_clobber_x2,.-abi_test_clobber_x2
+.type abi_test_clobber_x3, %function
+.globl abi_test_clobber_x3
+.hidden abi_test_clobber_x3
+.align 4
+abi_test_clobber_x3:
+ AARCH64_VALID_CALL_TARGET
+ mov x3, xzr
+ ret
+.size abi_test_clobber_x3,.-abi_test_clobber_x3
+.type abi_test_clobber_x4, %function
+.globl abi_test_clobber_x4
+.hidden abi_test_clobber_x4
+.align 4
+abi_test_clobber_x4:
+ AARCH64_VALID_CALL_TARGET
+ mov x4, xzr
+ ret
+.size abi_test_clobber_x4,.-abi_test_clobber_x4
+.type abi_test_clobber_x5, %function
+.globl abi_test_clobber_x5
+.hidden abi_test_clobber_x5
+.align 4
+abi_test_clobber_x5:
+ AARCH64_VALID_CALL_TARGET
+ mov x5, xzr
+ ret
+.size abi_test_clobber_x5,.-abi_test_clobber_x5
+.type abi_test_clobber_x6, %function
+.globl abi_test_clobber_x6
+.hidden abi_test_clobber_x6
+.align 4
+abi_test_clobber_x6:
+ AARCH64_VALID_CALL_TARGET
+ mov x6, xzr
+ ret
+.size abi_test_clobber_x6,.-abi_test_clobber_x6
+.type abi_test_clobber_x7, %function
+.globl abi_test_clobber_x7
+.hidden abi_test_clobber_x7
+.align 4
+abi_test_clobber_x7:
+ AARCH64_VALID_CALL_TARGET
+ mov x7, xzr
+ ret
+.size abi_test_clobber_x7,.-abi_test_clobber_x7
+.type abi_test_clobber_x8, %function
+.globl abi_test_clobber_x8
+.hidden abi_test_clobber_x8
+.align 4
+abi_test_clobber_x8:
+ AARCH64_VALID_CALL_TARGET
+ mov x8, xzr
+ ret
+.size abi_test_clobber_x8,.-abi_test_clobber_x8
+.type abi_test_clobber_x9, %function
+.globl abi_test_clobber_x9
+.hidden abi_test_clobber_x9
+.align 4
+abi_test_clobber_x9:
+ AARCH64_VALID_CALL_TARGET
+ mov x9, xzr
+ ret
+.size abi_test_clobber_x9,.-abi_test_clobber_x9
+.type abi_test_clobber_x10, %function
+.globl abi_test_clobber_x10
+.hidden abi_test_clobber_x10
+.align 4
+abi_test_clobber_x10:
+ AARCH64_VALID_CALL_TARGET
+ mov x10, xzr
+ ret
+.size abi_test_clobber_x10,.-abi_test_clobber_x10
+.type abi_test_clobber_x11, %function
+.globl abi_test_clobber_x11
+.hidden abi_test_clobber_x11
+.align 4
+abi_test_clobber_x11:
+ AARCH64_VALID_CALL_TARGET
+ mov x11, xzr
+ ret
+.size abi_test_clobber_x11,.-abi_test_clobber_x11
+.type abi_test_clobber_x12, %function
+.globl abi_test_clobber_x12
+.hidden abi_test_clobber_x12
+.align 4
+abi_test_clobber_x12:
+ AARCH64_VALID_CALL_TARGET
+ mov x12, xzr
+ ret
+.size abi_test_clobber_x12,.-abi_test_clobber_x12
+.type abi_test_clobber_x13, %function
+.globl abi_test_clobber_x13
+.hidden abi_test_clobber_x13
+.align 4
+abi_test_clobber_x13:
+ AARCH64_VALID_CALL_TARGET
+ mov x13, xzr
+ ret
+.size abi_test_clobber_x13,.-abi_test_clobber_x13
+.type abi_test_clobber_x14, %function
+.globl abi_test_clobber_x14
+.hidden abi_test_clobber_x14
+.align 4
+abi_test_clobber_x14:
+ AARCH64_VALID_CALL_TARGET
+ mov x14, xzr
+ ret
+.size abi_test_clobber_x14,.-abi_test_clobber_x14
+.type abi_test_clobber_x15, %function
+.globl abi_test_clobber_x15
+.hidden abi_test_clobber_x15
+.align 4
+abi_test_clobber_x15:
+ AARCH64_VALID_CALL_TARGET
+ mov x15, xzr
+ ret
+.size abi_test_clobber_x15,.-abi_test_clobber_x15
+.type abi_test_clobber_x16, %function
+.globl abi_test_clobber_x16
+.hidden abi_test_clobber_x16
+.align 4
+abi_test_clobber_x16:
+ AARCH64_VALID_CALL_TARGET
+ mov x16, xzr
+ ret
+.size abi_test_clobber_x16,.-abi_test_clobber_x16
+.type abi_test_clobber_x17, %function
+.globl abi_test_clobber_x17
+.hidden abi_test_clobber_x17
+.align 4
+abi_test_clobber_x17:
+ AARCH64_VALID_CALL_TARGET
+ mov x17, xzr
+ ret
+.size abi_test_clobber_x17,.-abi_test_clobber_x17
+.type abi_test_clobber_x19, %function
+.globl abi_test_clobber_x19
+.hidden abi_test_clobber_x19
+.align 4
+abi_test_clobber_x19:
+ AARCH64_VALID_CALL_TARGET
+ mov x19, xzr
+ ret
+.size abi_test_clobber_x19,.-abi_test_clobber_x19
+.type abi_test_clobber_x20, %function
+.globl abi_test_clobber_x20
+.hidden abi_test_clobber_x20
+.align 4
+abi_test_clobber_x20:
+ AARCH64_VALID_CALL_TARGET
+ mov x20, xzr
+ ret
+.size abi_test_clobber_x20,.-abi_test_clobber_x20
+.type abi_test_clobber_x21, %function
+.globl abi_test_clobber_x21
+.hidden abi_test_clobber_x21
+.align 4
+abi_test_clobber_x21:
+ AARCH64_VALID_CALL_TARGET
+ mov x21, xzr
+ ret
+.size abi_test_clobber_x21,.-abi_test_clobber_x21
+.type abi_test_clobber_x22, %function
+.globl abi_test_clobber_x22
+.hidden abi_test_clobber_x22
+.align 4
+abi_test_clobber_x22:
+ AARCH64_VALID_CALL_TARGET
+ mov x22, xzr
+ ret
+.size abi_test_clobber_x22,.-abi_test_clobber_x22
+.type abi_test_clobber_x23, %function
+.globl abi_test_clobber_x23
+.hidden abi_test_clobber_x23
+.align 4
+abi_test_clobber_x23:
+ AARCH64_VALID_CALL_TARGET
+ mov x23, xzr
+ ret
+.size abi_test_clobber_x23,.-abi_test_clobber_x23
+.type abi_test_clobber_x24, %function
+.globl abi_test_clobber_x24
+.hidden abi_test_clobber_x24
+.align 4
+abi_test_clobber_x24:
+ AARCH64_VALID_CALL_TARGET
+ mov x24, xzr
+ ret
+.size abi_test_clobber_x24,.-abi_test_clobber_x24
+.type abi_test_clobber_x25, %function
+.globl abi_test_clobber_x25
+.hidden abi_test_clobber_x25
+.align 4
+abi_test_clobber_x25:
+ AARCH64_VALID_CALL_TARGET
+ mov x25, xzr
+ ret
+.size abi_test_clobber_x25,.-abi_test_clobber_x25
+.type abi_test_clobber_x26, %function
+.globl abi_test_clobber_x26
+.hidden abi_test_clobber_x26
+.align 4
+abi_test_clobber_x26:
+ AARCH64_VALID_CALL_TARGET
+ mov x26, xzr
+ ret
+.size abi_test_clobber_x26,.-abi_test_clobber_x26
+.type abi_test_clobber_x27, %function
+.globl abi_test_clobber_x27
+.hidden abi_test_clobber_x27
+.align 4
+abi_test_clobber_x27:
+ AARCH64_VALID_CALL_TARGET
+ mov x27, xzr
+ ret
+.size abi_test_clobber_x27,.-abi_test_clobber_x27
+.type abi_test_clobber_x28, %function
+.globl abi_test_clobber_x28
+.hidden abi_test_clobber_x28
+.align 4
+abi_test_clobber_x28:
+ AARCH64_VALID_CALL_TARGET
+ mov x28, xzr
+ ret
+.size abi_test_clobber_x28,.-abi_test_clobber_x28
+.type abi_test_clobber_x29, %function
+.globl abi_test_clobber_x29
+.hidden abi_test_clobber_x29
+.align 4
+abi_test_clobber_x29:
+ AARCH64_VALID_CALL_TARGET
+ mov x29, xzr
+ ret
+.size abi_test_clobber_x29,.-abi_test_clobber_x29
+.type abi_test_clobber_d0, %function
+.globl abi_test_clobber_d0
+.hidden abi_test_clobber_d0
+.align 4
+abi_test_clobber_d0:
+ AARCH64_VALID_CALL_TARGET
+ fmov d0, xzr
+ ret
+.size abi_test_clobber_d0,.-abi_test_clobber_d0
+.type abi_test_clobber_d1, %function
+.globl abi_test_clobber_d1
+.hidden abi_test_clobber_d1
+.align 4
+abi_test_clobber_d1:
+ AARCH64_VALID_CALL_TARGET
+ fmov d1, xzr
+ ret
+.size abi_test_clobber_d1,.-abi_test_clobber_d1
+.type abi_test_clobber_d2, %function
+.globl abi_test_clobber_d2
+.hidden abi_test_clobber_d2
+.align 4
+abi_test_clobber_d2:
+ AARCH64_VALID_CALL_TARGET
+ fmov d2, xzr
+ ret
+.size abi_test_clobber_d2,.-abi_test_clobber_d2
+.type abi_test_clobber_d3, %function
+.globl abi_test_clobber_d3
+.hidden abi_test_clobber_d3
+.align 4
+abi_test_clobber_d3:
+ AARCH64_VALID_CALL_TARGET
+ fmov d3, xzr
+ ret
+.size abi_test_clobber_d3,.-abi_test_clobber_d3
+.type abi_test_clobber_d4, %function
+.globl abi_test_clobber_d4
+.hidden abi_test_clobber_d4
+.align 4
+abi_test_clobber_d4:
+ AARCH64_VALID_CALL_TARGET
+ fmov d4, xzr
+ ret
+.size abi_test_clobber_d4,.-abi_test_clobber_d4
+.type abi_test_clobber_d5, %function
+.globl abi_test_clobber_d5
+.hidden abi_test_clobber_d5
+.align 4
+abi_test_clobber_d5:
+ AARCH64_VALID_CALL_TARGET
+ fmov d5, xzr
+ ret
+.size abi_test_clobber_d5,.-abi_test_clobber_d5
+.type abi_test_clobber_d6, %function
+.globl abi_test_clobber_d6
+.hidden abi_test_clobber_d6
+.align 4
+abi_test_clobber_d6:
+ AARCH64_VALID_CALL_TARGET
+ fmov d6, xzr
+ ret
+.size abi_test_clobber_d6,.-abi_test_clobber_d6
+.type abi_test_clobber_d7, %function
+.globl abi_test_clobber_d7
+.hidden abi_test_clobber_d7
+.align 4
+abi_test_clobber_d7:
+ AARCH64_VALID_CALL_TARGET
+ fmov d7, xzr
+ ret
+.size abi_test_clobber_d7,.-abi_test_clobber_d7
+.type abi_test_clobber_d8, %function
+.globl abi_test_clobber_d8
+.hidden abi_test_clobber_d8
+.align 4
+abi_test_clobber_d8:
+ AARCH64_VALID_CALL_TARGET
+ fmov d8, xzr
+ ret
+.size abi_test_clobber_d8,.-abi_test_clobber_d8
+.type abi_test_clobber_d9, %function
+.globl abi_test_clobber_d9
+.hidden abi_test_clobber_d9
+.align 4
+abi_test_clobber_d9:
+ AARCH64_VALID_CALL_TARGET
+ fmov d9, xzr
+ ret
+.size abi_test_clobber_d9,.-abi_test_clobber_d9
+.type abi_test_clobber_d10, %function
+.globl abi_test_clobber_d10
+.hidden abi_test_clobber_d10
+.align 4
+abi_test_clobber_d10:
+ AARCH64_VALID_CALL_TARGET
+ fmov d10, xzr
+ ret
+.size abi_test_clobber_d10,.-abi_test_clobber_d10
+.type abi_test_clobber_d11, %function
+.globl abi_test_clobber_d11
+.hidden abi_test_clobber_d11
+.align 4
+abi_test_clobber_d11:
+ AARCH64_VALID_CALL_TARGET
+ fmov d11, xzr
+ ret
+.size abi_test_clobber_d11,.-abi_test_clobber_d11
+.type abi_test_clobber_d12, %function
+.globl abi_test_clobber_d12
+.hidden abi_test_clobber_d12
+.align 4
+abi_test_clobber_d12:
+ AARCH64_VALID_CALL_TARGET
+ fmov d12, xzr
+ ret
+.size abi_test_clobber_d12,.-abi_test_clobber_d12
+.type abi_test_clobber_d13, %function
+.globl abi_test_clobber_d13
+.hidden abi_test_clobber_d13
+.align 4
+abi_test_clobber_d13:
+ AARCH64_VALID_CALL_TARGET
+ fmov d13, xzr
+ ret
+.size abi_test_clobber_d13,.-abi_test_clobber_d13
+.type abi_test_clobber_d14, %function
+.globl abi_test_clobber_d14
+.hidden abi_test_clobber_d14
+.align 4
+abi_test_clobber_d14:
+ AARCH64_VALID_CALL_TARGET
+ fmov d14, xzr
+ ret
+.size abi_test_clobber_d14,.-abi_test_clobber_d14
+.type abi_test_clobber_d15, %function
+.globl abi_test_clobber_d15
+.hidden abi_test_clobber_d15
+.align 4
+abi_test_clobber_d15:
+ AARCH64_VALID_CALL_TARGET
+ fmov d15, xzr
+ ret
+.size abi_test_clobber_d15,.-abi_test_clobber_d15
+.type abi_test_clobber_d16, %function
+.globl abi_test_clobber_d16
+.hidden abi_test_clobber_d16
+.align 4
+abi_test_clobber_d16:
+ AARCH64_VALID_CALL_TARGET
+ fmov d16, xzr
+ ret
+.size abi_test_clobber_d16,.-abi_test_clobber_d16
+.type abi_test_clobber_d17, %function
+.globl abi_test_clobber_d17
+.hidden abi_test_clobber_d17
+.align 4
+abi_test_clobber_d17:
+ AARCH64_VALID_CALL_TARGET
+ fmov d17, xzr
+ ret
+.size abi_test_clobber_d17,.-abi_test_clobber_d17
+.type abi_test_clobber_d18, %function
+.globl abi_test_clobber_d18
+.hidden abi_test_clobber_d18
+.align 4
+abi_test_clobber_d18:
+ AARCH64_VALID_CALL_TARGET
+ fmov d18, xzr
+ ret
+.size abi_test_clobber_d18,.-abi_test_clobber_d18
+.type abi_test_clobber_d19, %function
+.globl abi_test_clobber_d19
+.hidden abi_test_clobber_d19
+.align 4
+abi_test_clobber_d19:
+ AARCH64_VALID_CALL_TARGET
+ fmov d19, xzr
+ ret
+.size abi_test_clobber_d19,.-abi_test_clobber_d19
+.type abi_test_clobber_d20, %function
+.globl abi_test_clobber_d20
+.hidden abi_test_clobber_d20
+.align 4
+abi_test_clobber_d20:
+ AARCH64_VALID_CALL_TARGET
+ fmov d20, xzr
+ ret
+.size abi_test_clobber_d20,.-abi_test_clobber_d20
+.type abi_test_clobber_d21, %function
+.globl abi_test_clobber_d21
+.hidden abi_test_clobber_d21
+.align 4
+abi_test_clobber_d21:
+ AARCH64_VALID_CALL_TARGET
+ fmov d21, xzr
+ ret
+.size abi_test_clobber_d21,.-abi_test_clobber_d21
+.type abi_test_clobber_d22, %function
+.globl abi_test_clobber_d22
+.hidden abi_test_clobber_d22
+.align 4
+abi_test_clobber_d22:
+ AARCH64_VALID_CALL_TARGET
+ fmov d22, xzr
+ ret
+.size abi_test_clobber_d22,.-abi_test_clobber_d22
+.type abi_test_clobber_d23, %function
+.globl abi_test_clobber_d23
+.hidden abi_test_clobber_d23
+.align 4
+abi_test_clobber_d23:
+ AARCH64_VALID_CALL_TARGET
+ fmov d23, xzr
+ ret
+.size abi_test_clobber_d23,.-abi_test_clobber_d23
+.type abi_test_clobber_d24, %function
+.globl abi_test_clobber_d24
+.hidden abi_test_clobber_d24
+.align 4
+abi_test_clobber_d24:
+ AARCH64_VALID_CALL_TARGET
+ fmov d24, xzr
+ ret
+.size abi_test_clobber_d24,.-abi_test_clobber_d24
+.type abi_test_clobber_d25, %function
+.globl abi_test_clobber_d25
+.hidden abi_test_clobber_d25
+.align 4
+abi_test_clobber_d25:
+ AARCH64_VALID_CALL_TARGET
+ fmov d25, xzr
+ ret
+.size abi_test_clobber_d25,.-abi_test_clobber_d25
+.type abi_test_clobber_d26, %function
+.globl abi_test_clobber_d26
+.hidden abi_test_clobber_d26
+.align 4
+abi_test_clobber_d26:
+ AARCH64_VALID_CALL_TARGET
+ fmov d26, xzr
+ ret
+.size abi_test_clobber_d26,.-abi_test_clobber_d26
+.type abi_test_clobber_d27, %function
+.globl abi_test_clobber_d27
+.hidden abi_test_clobber_d27
+.align 4
+abi_test_clobber_d27:
+ AARCH64_VALID_CALL_TARGET
+ fmov d27, xzr
+ ret
+.size abi_test_clobber_d27,.-abi_test_clobber_d27
+.type abi_test_clobber_d28, %function
+.globl abi_test_clobber_d28
+.hidden abi_test_clobber_d28
+.align 4
+abi_test_clobber_d28:
+ AARCH64_VALID_CALL_TARGET
+ fmov d28, xzr
+ ret
+.size abi_test_clobber_d28,.-abi_test_clobber_d28
+.type abi_test_clobber_d29, %function
+.globl abi_test_clobber_d29
+.hidden abi_test_clobber_d29
+.align 4
+abi_test_clobber_d29:
+ AARCH64_VALID_CALL_TARGET
+ fmov d29, xzr
+ ret
+.size abi_test_clobber_d29,.-abi_test_clobber_d29
+.type abi_test_clobber_d30, %function
+.globl abi_test_clobber_d30
+.hidden abi_test_clobber_d30
+.align 4
+abi_test_clobber_d30:
+ AARCH64_VALID_CALL_TARGET
+ fmov d30, xzr
+ ret
+.size abi_test_clobber_d30,.-abi_test_clobber_d30
+.type abi_test_clobber_d31, %function
+.globl abi_test_clobber_d31
+.hidden abi_test_clobber_d31
+.align 4
+abi_test_clobber_d31:
+ AARCH64_VALID_CALL_TARGET
+ fmov d31, xzr
+ ret
+.size abi_test_clobber_d31,.-abi_test_clobber_d31
+.type abi_test_clobber_v8_upper, %function
+.globl abi_test_clobber_v8_upper
+.hidden abi_test_clobber_v8_upper
+.align 4
+abi_test_clobber_v8_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v8.d[1], xzr
+ ret
+.size abi_test_clobber_v8_upper,.-abi_test_clobber_v8_upper
+.type abi_test_clobber_v9_upper, %function
+.globl abi_test_clobber_v9_upper
+.hidden abi_test_clobber_v9_upper
+.align 4
+abi_test_clobber_v9_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v9.d[1], xzr
+ ret
+.size abi_test_clobber_v9_upper,.-abi_test_clobber_v9_upper
+.type abi_test_clobber_v10_upper, %function
+.globl abi_test_clobber_v10_upper
+.hidden abi_test_clobber_v10_upper
+.align 4
+abi_test_clobber_v10_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v10.d[1], xzr
+ ret
+.size abi_test_clobber_v10_upper,.-abi_test_clobber_v10_upper
+.type abi_test_clobber_v11_upper, %function
+.globl abi_test_clobber_v11_upper
+.hidden abi_test_clobber_v11_upper
+.align 4
+abi_test_clobber_v11_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v11.d[1], xzr
+ ret
+.size abi_test_clobber_v11_upper,.-abi_test_clobber_v11_upper
+.type abi_test_clobber_v12_upper, %function
+.globl abi_test_clobber_v12_upper
+.hidden abi_test_clobber_v12_upper
+.align 4
+abi_test_clobber_v12_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v12.d[1], xzr
+ ret
+.size abi_test_clobber_v12_upper,.-abi_test_clobber_v12_upper
+.type abi_test_clobber_v13_upper, %function
+.globl abi_test_clobber_v13_upper
+.hidden abi_test_clobber_v13_upper
+.align 4
+abi_test_clobber_v13_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v13.d[1], xzr
+ ret
+.size abi_test_clobber_v13_upper,.-abi_test_clobber_v13_upper
+.type abi_test_clobber_v14_upper, %function
+.globl abi_test_clobber_v14_upper
+.hidden abi_test_clobber_v14_upper
+.align 4
+abi_test_clobber_v14_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v14.d[1], xzr
+ ret
+.size abi_test_clobber_v14_upper,.-abi_test_clobber_v14_upper
+.type abi_test_clobber_v15_upper, %function
+.globl abi_test_clobber_v15_upper
+.hidden abi_test_clobber_v15_upper
+.align 4
+abi_test_clobber_v15_upper:
+ AARCH64_VALID_CALL_TARGET
+ fmov v15.d[1], xzr
+ ret
+.size abi_test_clobber_v15_upper,.-abi_test_clobber_v15_upper
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S
@@ -1,0 +1,1493 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch armv7-a
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax unified
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+
+#if defined(__thumb2__) || defined(__clang__)
+#define ldrhsb ldrbhs
+#endif
+
+.align 5
+.Lsigma:
+.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral
+.Lone:
+.long 1,0,0,0
+#if __ARM_MAX_ARCH__>=7
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.LChaCha20_ctr32
+#else
+.word -1
+#endif
+
+.globl ChaCha20_ctr32
+.hidden ChaCha20_ctr32
+.type ChaCha20_ctr32,%function
+.align 5
+ChaCha20_ctr32:
+.LChaCha20_ctr32:
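+ @ Arguments: r0 = out, r1 = in, r2 = len, r3 = key, [sp] = counter and nonce.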
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+ sub r14,pc,#16 @ ChaCha20_ctr32
+#else
+ adr r14,.LChaCha20_ctr32
+#endif
+ cmp r2,#0 @ len==0?
+#ifdef __thumb2__
+ itt eq
+#endif
+ addeq sp,sp,#4*3
+ beq .Lno_data
+#if __ARM_MAX_ARCH__>=7
+ cmp r2,#192 @ test len
+ bls .Lshort
+ ldr r4,[r14,#-32]
+ ldr r4,[r14,r4]
+# ifdef __APPLE__
+ ldr r4,[r4]
+# endif
+ tst r4,#ARMV7_NEON
+ bne .LChaCha20_neon
+.Lshort:
+#endif
+ ldmia r12,{r4,r5,r6,r7} @ load counter and nonce
+ sub sp,sp,#4*(16) @ off-load area
+ sub r14,r14,#64 @ .Lsigma
+ stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce
+ ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key
+ ldmia r14,{r0,r1,r2,r3} @ load sigma
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key
+ stmdb sp!,{r0,r1,r2,r3} @ copy sigma
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ b .Loop_outer_enter
+
+.align 4
+.Loop_outer:
+ ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material
+ str r11,[sp,#4*(32+2)] @ save len
+ str r12, [sp,#4*(32+1)] @ save inp
+ str r14, [sp,#4*(32+0)] @ save out
+.Loop_outer_enter:
+ ldr r11, [sp,#4*(15)]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ ldr r10, [sp,#4*(13)]
+ ldr r14,[sp,#4*(14)]
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ b .Loop
+
+.align 4
+.Loop:
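+ @ One pass through this loop is a ChaCha20 double round (column round then
+ @ diagonal round), with some state words shuttled through the stack; r11
+ @ counts ten passes for the full twenty rounds.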
+ subs r11,r11,#1
+ add r0,r0,r4
+ mov r12,r12,ror#16
+ add r1,r1,r5
+ mov r10,r10,ror#16
+ eor r12,r12,r0,ror#16
+ eor r10,r10,r1,ror#16
+ add r8,r8,r12
+ mov r4,r4,ror#20
+ add r9,r9,r10
+ mov r5,r5,ror#20
+ eor r4,r4,r8,ror#20
+ eor r5,r5,r9,ror#20
+ add r0,r0,r4
+ mov r12,r12,ror#24
+ add r1,r1,r5
+ mov r10,r10,ror#24
+ eor r12,r12,r0,ror#24
+ eor r10,r10,r1,ror#24
+ add r8,r8,r12
+ mov r4,r4,ror#25
+ add r9,r9,r10
+ mov r5,r5,ror#25
+ str r10,[sp,#4*(16+13)]
+ ldr r10,[sp,#4*(16+15)]
+ eor r4,r4,r8,ror#25
+ eor r5,r5,r9,ror#25
+ str r8,[sp,#4*(16+8)]
+ ldr r8,[sp,#4*(16+10)]
+ add r2,r2,r6
+ mov r14,r14,ror#16
+ str r9,[sp,#4*(16+9)]
+ ldr r9,[sp,#4*(16+11)]
+ add r3,r3,r7
+ mov r10,r10,ror#16
+ eor r14,r14,r2,ror#16
+ eor r10,r10,r3,ror#16
+ add r8,r8,r14
+ mov r6,r6,ror#20
+ add r9,r9,r10
+ mov r7,r7,ror#20
+ eor r6,r6,r8,ror#20
+ eor r7,r7,r9,ror#20
+ add r2,r2,r6
+ mov r14,r14,ror#24
+ add r3,r3,r7
+ mov r10,r10,ror#24
+ eor r14,r14,r2,ror#24
+ eor r10,r10,r3,ror#24
+ add r8,r8,r14
+ mov r6,r6,ror#25
+ add r9,r9,r10
+ mov r7,r7,ror#25
+ eor r6,r6,r8,ror#25
+ eor r7,r7,r9,ror#25
+ add r0,r0,r5
+ mov r10,r10,ror#16
+ add r1,r1,r6
+ mov r12,r12,ror#16
+ eor r10,r10,r0,ror#16
+ eor r12,r12,r1,ror#16
+ add r8,r8,r10
+ mov r5,r5,ror#20
+ add r9,r9,r12
+ mov r6,r6,ror#20
+ eor r5,r5,r8,ror#20
+ eor r6,r6,r9,ror#20
+ add r0,r0,r5
+ mov r10,r10,ror#24
+ add r1,r1,r6
+ mov r12,r12,ror#24
+ eor r10,r10,r0,ror#24
+ eor r12,r12,r1,ror#24
+ add r8,r8,r10
+ mov r5,r5,ror#25
+ str r10,[sp,#4*(16+15)]
+ ldr r10,[sp,#4*(16+13)]
+ add r9,r9,r12
+ mov r6,r6,ror#25
+ eor r5,r5,r8,ror#25
+ eor r6,r6,r9,ror#25
+ str r8,[sp,#4*(16+10)]
+ ldr r8,[sp,#4*(16+8)]
+ add r2,r2,r7
+ mov r10,r10,ror#16
+ str r9,[sp,#4*(16+11)]
+ ldr r9,[sp,#4*(16+9)]
+ add r3,r3,r4
+ mov r14,r14,ror#16
+ eor r10,r10,r2,ror#16
+ eor r14,r14,r3,ror#16
+ add r8,r8,r10
+ mov r7,r7,ror#20
+ add r9,r9,r14
+ mov r4,r4,ror#20
+ eor r7,r7,r8,ror#20
+ eor r4,r4,r9,ror#20
+ add r2,r2,r7
+ mov r10,r10,ror#24
+ add r3,r3,r4
+ mov r14,r14,ror#24
+ eor r10,r10,r2,ror#24
+ eor r14,r14,r3,ror#24
+ add r8,r8,r10
+ mov r7,r7,ror#25
+ add r9,r9,r14
+ mov r4,r4,ror#25
+ eor r7,r7,r8,ror#25
+ eor r4,r4,r9,ror#25
+ bne .Loop
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ cmp r11,#64 @ done yet?
+#ifdef __thumb2__
+ itete lo
+#endif
+ addlo r12,sp,#4*(0) @ shortcut or ...
+ ldrhs r12,[sp,#4*(32+1)] @ ... load inp
+ addlo r14,sp,#4*(0) @ shortcut or ...
+ ldrhs r14,[sp,#4*(32+0)] @ ... load out
+
+ ldr r8,[sp,#4*(0)] @ load key material
+ ldr r9,[sp,#4*(1)]
+
+#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
+# if __ARM_ARCH__<7
+ orr r10,r12,r14
+ tst r10,#3 @ are input and output aligned?
+ ldr r10,[sp,#4*(2)]
+ bne .Lunaligned
+ cmp r11,#64 @ restore flags
+# else
+ ldr r10,[sp,#4*(2)]
+# endif
+ ldr r11,[sp,#4*(3)]
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+
+ add r2,r2,r10
+ add r3,r3,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r0,r0,r8 @ xor with input
+ eorhs r1,r1,r9
+ add r8,sp,#4*(4)
+ str r0,[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r1,[r14,#-12]
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r6,r10
+ add r7,r7,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+ add r8,sp,#4*(8)
+ str r4,[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r5,[r14,#-12]
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+# ifdef __thumb2__
+ itt hi
+# endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r2,r2,r10
+ add r3,r3,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r0,r0,r8
+ eorhs r1,r1,r9
+ add r8,sp,#4*(12)
+ str r0,[r14],#16 @ store output
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r2,r2,r10
+ eorhs r3,r3,r11
+ str r1,[r14,#-12]
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+# ifdef __thumb2__
+ itt hi
+# endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r8,[r12],#16 @ load input
+ ldrhs r9,[r12,#-12]
+ add r6,r6,r10
+ add r7,r7,r11
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhs r10,[r12,#-8]
+ ldrhs r11,[r12,#-4]
+# if __ARM_ARCH__>=6 && defined(__ARMEB__)
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r4,r4,r8
+ eorhs r5,r5,r9
+# ifdef __thumb2__
+ it ne
+# endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+# ifdef __thumb2__
+ itt hs
+# endif
+ eorhs r6,r6,r10
+ eorhs r7,r7,r11
+ str r4,[r14],#16 @ store output
+ str r5,[r14,#-12]
+# ifdef __thumb2__
+ it hs
+# endif
+ subhs r11,r8,#64 @ len-=64
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi .Loop_outer
+
+ beq .Ldone
+# if __ARM_ARCH__<7
+ b .Ltail
+
+.align 4
+.Lunaligned:@ unaligned endian-neutral path
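+ @ Taken on pre-ARMv7 cores when the buffers are not word-aligned: the
+ @ keystream is xored with the input one byte at a time, reassembling each
+ @ word with shifts.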
+ cmp r11,#64 @ restore flags
+# endif
+#endif
+#if __ARM_ARCH__<7
+ ldr r11,[sp,#4*(3)]
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+ add r2,r2,r10
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r3,r11
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+0)
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ add r0,sp,#4*(16+8)
+ add r4,r4,r8 @ accumulate key material
+ add r5,r5,r9
+ add r6,r6,r10
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r7,r11
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+ add r8,sp,#4*(4+4)
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
+# ifdef __thumb2__
+ itt hi
+# endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx"
+ strhi r11,[sp,#4*(16+11)] @ copy "rx"
+ add r0,r0,r8 @ accumulate key material
+ add r1,r1,r9
+ add r2,r2,r10
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r3,r3,r11
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r0,r8,r0 @ xor with input (or zero)
+ eor r1,r9,r1
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r2,r10,r2
+ strb r0,[r14],#16 @ store output
+ eor r3,r11,r3
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r1,[r14,#-12]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-8]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r3,[r14,#-4]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-15]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r1,[r14,#-11]
+ eor r0,r8,r0,lsr#8
+ strb r2,[r14,#-7]
+ eor r1,r9,r1,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r3,[r14,#-3]
+ eor r2,r10,r2,lsr#8
+ strb r0,[r14,#-14]
+ eor r3,r11,r3,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r1,[r14,#-10]
+ strb r2,[r14,#-6]
+ eor r0,r8,r0,lsr#8
+ strb r3,[r14,#-2]
+ eor r1,r9,r1,lsr#8
+ strb r0,[r14,#-13]
+ eor r2,r10,r2,lsr#8
+ strb r1,[r14,#-9]
+ eor r3,r11,r3,lsr#8
+ strb r2,[r14,#-5]
+ strb r3,[r14,#-1]
+ add r8,sp,#4*(4+8)
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ add r4,r4,r8 @ accumulate key material
+# ifdef __thumb2__
+ itt hi
+# endif
+ addhi r8,r8,#1 @ next counter value
+ strhi r8,[sp,#4*(12)] @ save next counter value
+ add r5,r5,r9
+ add r6,r6,r10
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r8,r8,r8 @ zero or ...
+ ldrhsb r8,[r12],#16 @ ... load input
+ eorlo r9,r9,r9
+ ldrhsb r9,[r12,#-12]
+
+ add r7,r7,r11
+# ifdef __thumb2__
+ itete lo
+# endif
+ eorlo r10,r10,r10
+ ldrhsb r10,[r12,#-8]
+ eorlo r11,r11,r11
+ ldrhsb r11,[r12,#-4]
+
+ eor r4,r8,r4 @ xor with input (or zero)
+ eor r5,r9,r5
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-15] @ load more input
+ ldrhsb r9,[r12,#-11]
+ eor r6,r10,r6
+ strb r4,[r14],#16 @ store output
+ eor r7,r11,r7
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-7]
+ ldrhsb r11,[r12,#-3]
+ strb r5,[r14,#-12]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-8]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-14] @ load more input
+ ldrhsb r9,[r12,#-10]
+ strb r7,[r14,#-4]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-15]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-6]
+ ldrhsb r11,[r12,#-2]
+ strb r5,[r14,#-11]
+ eor r4,r8,r4,lsr#8
+ strb r6,[r14,#-7]
+ eor r5,r9,r5,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r8,[r12,#-13] @ load more input
+ ldrhsb r9,[r12,#-9]
+ strb r7,[r14,#-3]
+ eor r6,r10,r6,lsr#8
+ strb r4,[r14,#-14]
+ eor r7,r11,r7,lsr#8
+# ifdef __thumb2__
+ itt hs
+# endif
+ ldrhsb r10,[r12,#-5]
+ ldrhsb r11,[r12,#-1]
+ strb r5,[r14,#-10]
+ strb r6,[r14,#-6]
+ eor r4,r8,r4,lsr#8
+ strb r7,[r14,#-2]
+ eor r5,r9,r5,lsr#8
+ strb r4,[r14,#-13]
+ eor r6,r10,r6,lsr#8
+ strb r5,[r14,#-9]
+ eor r7,r11,r7,lsr#8
+ strb r6,[r14,#-5]
+ strb r7,[r14,#-1]
+# ifdef __thumb2__
+ it ne
+# endif
+ ldrne r8,[sp,#4*(32+2)] @ re-load len
+# ifdef __thumb2__
+ it hs
+# endif
+ subhs r11,r8,#64 @ len-=64
+ bhi .Loop_outer
+
+ beq .Ldone
+#endif
+
+.Ltail:
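+ @ Fewer than 64 bytes remain: xor them byte by byte against the keystream
+ @ block buffered on the stack.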
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ add r9,sp,#4*(0)
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+.Loop_tail:
+ ldrb r10,[r9],#1 @ read buffer on stack
+ ldrb r11,[r12],#1 @ read input
+ subs r8,r8,#1
+ eor r11,r11,r10
+ strb r11,[r14],#1 @ store output
+ bne .Loop_tail
+
+.Ldone:
+ add sp,sp,#4*(32+3)
+.Lno_data:
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+.size ChaCha20_ctr32,.-ChaCha20_ctr32
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
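+@ The NEON path below produces four 64-byte blocks per outer iteration:
+@ three in q0-q11 and a fourth in the integer registers, switching back to
+@ the integer-only loop when little data remains.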
+.type ChaCha20_neon,%function
+.align 5
+ChaCha20_neon:
+ ldr r12,[sp,#0] @ pull pointer to counter and nonce
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+.LChaCha20_neon:
+ adr r14,.Lsigma
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so
+ stmdb sp!,{r0,r1,r2,r3}
+
+ vld1.32 {q1,q2},[r3] @ load key
+ ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key
+
+ sub sp,sp,#4*(16+16)
+ vld1.32 {q3},[r12] @ load counter and nonce
+ add r12,sp,#4*8
+ ldmia r14,{r0,r1,r2,r3} @ load sigma
+ vld1.32 {q0},[r14]! @ load sigma
+ vld1.32 {q12},[r14] @ one
+ vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce
+ vst1.32 {q0,q1},[sp] @ copy sigma|1/2key
+
+ str r10,[sp,#4*(16+10)] @ off-load "rx"
+ str r11,[sp,#4*(16+11)] @ off-load "rx"
+ vshl.i32 d26,d24,#1 @ two
+ vstr d24,[sp,#4*(16+0)]
+ vshl.i32 d28,d24,#2 @ four
+ vstr d26,[sp,#4*(16+2)]
+ vmov q4,q0
+ vstr d28,[sp,#4*(16+4)]
+ vmov q8,q0
+ vmov q5,q1
+ vmov q9,q1
+ b .Loop_neon_enter
+
+.align 4
+.Loop_neon_outer:
+ ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material
+ cmp r11,#64*2 @ if len<=64*2
+ bls .Lbreak_neon @ switch to integer-only
+ vmov q4,q0
+ str r11,[sp,#4*(32+2)] @ save len
+ vmov q8,q0
+ str r12, [sp,#4*(32+1)] @ save inp
+ vmov q5,q1
+ str r14, [sp,#4*(32+0)] @ save out
+ vmov q9,q1
+.Loop_neon_enter:
+ ldr r11, [sp,#4*(15)]
+ vadd.i32 q7,q3,q12 @ counter+1
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ vmov q6,q2
+ ldr r10, [sp,#4*(13)]
+ vmov q10,q2
+ ldr r14,[sp,#4*(14)]
+ vadd.i32 q11,q7,q12 @ counter+2
+ str r11, [sp,#4*(16+15)]
+ mov r11,#10
+ add r12,r12,#3 @ counter+3
+ b .Loop_neon
+
+.align 4
+.Loop_neon:
+ subs r11,r11,#1
+ vadd.i32 q0,q0,q1
+ add r0,r0,r4
+ vadd.i32 q4,q4,q5
+ mov r12,r12,ror#16
+ vadd.i32 q8,q8,q9
+ add r1,r1,r5
+ veor q3,q3,q0
+ mov r10,r10,ror#16
+ veor q7,q7,q4
+ eor r12,r12,r0,ror#16
+ veor q11,q11,q8
+ eor r10,r10,r1,ror#16
+ vrev32.16 q3,q3
+ add r8,r8,r12
+ vrev32.16 q7,q7
+ mov r4,r4,ror#20
+ vrev32.16 q11,q11
+ add r9,r9,r10
+ vadd.i32 q2,q2,q3
+ mov r5,r5,ror#20
+ vadd.i32 q6,q6,q7
+ eor r4,r4,r8,ror#20
+ vadd.i32 q10,q10,q11
+ eor r5,r5,r9,ror#20
+ veor q12,q1,q2
+ add r0,r0,r4
+ veor q13,q5,q6
+ mov r12,r12,ror#24
+ veor q14,q9,q10
+ add r1,r1,r5
+ vshr.u32 q1,q12,#20
+ mov r10,r10,ror#24
+ vshr.u32 q5,q13,#20
+ eor r12,r12,r0,ror#24
+ vshr.u32 q9,q14,#20
+ eor r10,r10,r1,ror#24
+ vsli.32 q1,q12,#12
+ add r8,r8,r12
+ vsli.32 q5,q13,#12
+ mov r4,r4,ror#25
+ vsli.32 q9,q14,#12
+ add r9,r9,r10
+ vadd.i32 q0,q0,q1
+ mov r5,r5,ror#25
+ vadd.i32 q4,q4,q5
+ str r10,[sp,#4*(16+13)]
+ vadd.i32 q8,q8,q9
+ ldr r10,[sp,#4*(16+15)]
+ veor q12,q3,q0
+ eor r4,r4,r8,ror#25
+ veor q13,q7,q4
+ eor r5,r5,r9,ror#25
+ veor q14,q11,q8
+ str r8,[sp,#4*(16+8)]
+ vshr.u32 q3,q12,#24
+ ldr r8,[sp,#4*(16+10)]
+ vshr.u32 q7,q13,#24
+ add r2,r2,r6
+ vshr.u32 q11,q14,#24
+ mov r14,r14,ror#16
+ vsli.32 q3,q12,#8
+ str r9,[sp,#4*(16+9)]
+ vsli.32 q7,q13,#8
+ ldr r9,[sp,#4*(16+11)]
+ vsli.32 q11,q14,#8
+ add r3,r3,r7
+ vadd.i32 q2,q2,q3
+ mov r10,r10,ror#16
+ vadd.i32 q6,q6,q7
+ eor r14,r14,r2,ror#16
+ vadd.i32 q10,q10,q11
+ eor r10,r10,r3,ror#16
+ veor q12,q1,q2
+ add r8,r8,r14
+ veor q13,q5,q6
+ mov r6,r6,ror#20
+ veor q14,q9,q10
+ add r9,r9,r10
+ vshr.u32 q1,q12,#25
+ mov r7,r7,ror#20
+ vshr.u32 q5,q13,#25
+ eor r6,r6,r8,ror#20
+ vshr.u32 q9,q14,#25
+ eor r7,r7,r9,ror#20
+ vsli.32 q1,q12,#7
+ add r2,r2,r6
+ vsli.32 q5,q13,#7
+ mov r14,r14,ror#24
+ vsli.32 q9,q14,#7
+ add r3,r3,r7
+ vext.8 q2,q2,q2,#8
+ mov r10,r10,ror#24
+ vext.8 q6,q6,q6,#8
+ eor r14,r14,r2,ror#24
+ vext.8 q10,q10,q10,#8
+ eor r10,r10,r3,ror#24
+ vext.8 q1,q1,q1,#4
+ add r8,r8,r14
+ vext.8 q5,q5,q5,#4
+ mov r6,r6,ror#25
+ vext.8 q9,q9,q9,#4
+ add r9,r9,r10
+ vext.8 q3,q3,q3,#12
+ mov r7,r7,ror#25
+ vext.8 q7,q7,q7,#12
+ eor r6,r6,r8,ror#25
+ vext.8 q11,q11,q11,#12
+ eor r7,r7,r9,ror#25
+ vadd.i32 q0,q0,q1
+ add r0,r0,r5
+ vadd.i32 q4,q4,q5
+ mov r10,r10,ror#16
+ vadd.i32 q8,q8,q9
+ add r1,r1,r6
+ veor q3,q3,q0
+ mov r12,r12,ror#16
+ veor q7,q7,q4
+ eor r10,r10,r0,ror#16
+ veor q11,q11,q8
+ eor r12,r12,r1,ror#16
+ vrev32.16 q3,q3
+ add r8,r8,r10
+ vrev32.16 q7,q7
+ mov r5,r5,ror#20
+ vrev32.16 q11,q11
+ add r9,r9,r12
+ vadd.i32 q2,q2,q3
+ mov r6,r6,ror#20
+ vadd.i32 q6,q6,q7
+ eor r5,r5,r8,ror#20
+ vadd.i32 q10,q10,q11
+ eor r6,r6,r9,ror#20
+ veor q12,q1,q2
+ add r0,r0,r5
+ veor q13,q5,q6
+ mov r10,r10,ror#24
+ veor q14,q9,q10
+ add r1,r1,r6
+ vshr.u32 q1,q12,#20
+ mov r12,r12,ror#24
+ vshr.u32 q5,q13,#20
+ eor r10,r10,r0,ror#24
+ vshr.u32 q9,q14,#20
+ eor r12,r12,r1,ror#24
+ vsli.32 q1,q12,#12
+ add r8,r8,r10
+ vsli.32 q5,q13,#12
+ mov r5,r5,ror#25
+ vsli.32 q9,q14,#12
+ str r10,[sp,#4*(16+15)]
+ vadd.i32 q0,q0,q1
+ ldr r10,[sp,#4*(16+13)]
+ vadd.i32 q4,q4,q5
+ add r9,r9,r12
+ vadd.i32 q8,q8,q9
+ mov r6,r6,ror#25
+ veor q12,q3,q0
+ eor r5,r5,r8,ror#25
+ veor q13,q7,q4
+ eor r6,r6,r9,ror#25
+ veor q14,q11,q8
+ str r8,[sp,#4*(16+10)]
+ vshr.u32 q3,q12,#24
+ ldr r8,[sp,#4*(16+8)]
+ vshr.u32 q7,q13,#24
+ add r2,r2,r7
+ vshr.u32 q11,q14,#24
+ mov r10,r10,ror#16
+ vsli.32 q3,q12,#8
+ str r9,[sp,#4*(16+11)]
+ vsli.32 q7,q13,#8
+ ldr r9,[sp,#4*(16+9)]
+ vsli.32 q11,q14,#8
+ add r3,r3,r4
+ vadd.i32 q2,q2,q3
+ mov r14,r14,ror#16
+ vadd.i32 q6,q6,q7
+ eor r10,r10,r2,ror#16
+ vadd.i32 q10,q10,q11
+ eor r14,r14,r3,ror#16
+ veor q12,q1,q2
+ add r8,r8,r10
+ veor q13,q5,q6
+ mov r7,r7,ror#20
+ veor q14,q9,q10
+ add r9,r9,r14
+ vshr.u32 q1,q12,#25
+ mov r4,r4,ror#20
+ vshr.u32 q5,q13,#25
+ eor r7,r7,r8,ror#20
+ vshr.u32 q9,q14,#25
+ eor r4,r4,r9,ror#20
+ vsli.32 q1,q12,#7
+ add r2,r2,r7
+ vsli.32 q5,q13,#7
+ mov r10,r10,ror#24
+ vsli.32 q9,q14,#7
+ add r3,r3,r4
+ vext.8 q2,q2,q2,#8
+ mov r14,r14,ror#24
+ vext.8 q6,q6,q6,#8
+ eor r10,r10,r2,ror#24
+ vext.8 q10,q10,q10,#8
+ eor r14,r14,r3,ror#24
+ vext.8 q1,q1,q1,#12
+ add r8,r8,r10
+ vext.8 q5,q5,q5,#12
+ mov r7,r7,ror#25
+ vext.8 q9,q9,q9,#12
+ add r9,r9,r14
+ vext.8 q3,q3,q3,#4
+ mov r4,r4,ror#25
+ vext.8 q7,q7,q7,#4
+ eor r7,r7,r8,ror#25
+ vext.8 q11,q11,q11,#4
+ eor r4,r4,r9,ror#25
+ bne .Loop_neon
+
+ add r11,sp,#32
+ vld1.32 {q12,q13},[sp] @ load key material
+ vld1.32 {q14,q15},[r11]
+
+ ldr r11,[sp,#4*(32+2)] @ load len
+
+ str r8, [sp,#4*(16+8)] @ modulo-scheduled store
+ str r9, [sp,#4*(16+9)]
+ str r12,[sp,#4*(16+12)]
+ str r10, [sp,#4*(16+13)]
+ str r14,[sp,#4*(16+14)]
+
+ @ at this point we have first half of 512-bit result in
+ @ rx and second half at sp+4*(16+8)
+
+ ldr r12,[sp,#4*(32+1)] @ load inp
+ ldr r14,[sp,#4*(32+0)] @ load out
+
+ vadd.i32 q0,q0,q12 @ accumulate key material
+ vadd.i32 q4,q4,q12
+ vadd.i32 q8,q8,q12
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ vadd.i32 q1,q1,q13
+ vadd.i32 q5,q5,q13
+ vadd.i32 q9,q9,q13
+ vldr d26,[sp,#4*(16+2)] @ two
+
+ vadd.i32 q2,q2,q14
+ vadd.i32 q6,q6,q14
+ vadd.i32 q10,q10,q14
+ vadd.i32 d14,d14,d24 @ counter+1
+ vadd.i32 d22,d22,d26 @ counter+2
+
+ vadd.i32 q3,q3,q15
+ vadd.i32 q7,q7,q15
+ vadd.i32 q11,q11,q15
+
+ cmp r11,#64*4
+ blo .Ltail_neon
+
+ vld1.8 {q12,q13},[r12]! @ load input
+ mov r11,sp
+ vld1.8 {q14,q15},[r12]!
+ veor q0,q0,q12 @ xor with input
+ veor q1,q1,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q4,q4,q12
+ vst1.8 {q0,q1},[r14]! @ store output
+ veor q5,q5,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q2,q3},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q8,q8,q12
+ vld1.32 {q0,q1},[r11]! @ load for next iteration
+ veor d25,d25,d25
+ vldr d24,[sp,#4*(16+4)] @ four
+ veor q9,q9,q13
+ vld1.32 {q2,q3},[r11]
+ veor q10,q10,q14
+ vst1.8 {q4,q5},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q6,q7},[r14]!
+
+ vadd.i32 d6,d6,d24 @ next counter value
+ vldr d24,[sp,#4*(16+0)] @ one
+
+ ldmia sp,{r8,r9,r10,r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ vst1.8 {q8,q9},[r14]!
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+ vst1.8 {q10,q11},[r14]!
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ eor r0,r0,r8 @ xor with input
+ add r8,sp,#4*(4)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r5,r5,r9
+ ldr r9,[r12,#-12]
+ add r6,r6,r10
+ ldr r10,[r12,#-8]
+ add r7,r7,r11
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ eor r4,r4,r8
+ add r8,sp,#4*(8)
+ eor r5,r5,r9
+ str r4,[r14],#16 @ store output
+ eor r6,r6,r10
+ str r5,[r14,#-12]
+ eor r7,r7,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r6,[r14,#-8]
+ add r0,sp,#4*(16+8)
+ str r7,[r14,#-4]
+
+ ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ ldr r8,[r12],#16 @ load input
+ add r1,r1,r9
+ ldr r9,[r12,#-12]
+# ifdef __thumb2__
+ it hi
+# endif
+ strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it
+ add r2,r2,r10
+ ldr r10,[r12,#-8]
+# ifdef __thumb2__
+ it hi
+# endif
+ strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it
+ add r3,r3,r11
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+# endif
+ eor r0,r0,r8
+ add r8,sp,#4*(12)
+ eor r1,r1,r9
+ str r0,[r14],#16 @ store output
+ eor r2,r2,r10
+ str r1,[r14,#-12]
+ eor r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+ str r2,[r14,#-8]
+ str r3,[r14,#-4]
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,r8,#4 @ next counter value
+ add r5,r5,r9
+ str r8,[sp,#4*(12)] @ save next counter value
+ ldr r8,[r12],#16 @ load input
+ add r6,r6,r10
+ add r4,r4,#3 @ counter+3
+ ldr r9,[r12,#-12]
+ add r7,r7,r11
+ ldr r10,[r12,#-8]
+ ldr r11,[r12,#-4]
+# ifdef __ARMEB__
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ eor r4,r4,r8
+# ifdef __thumb2__
+ it hi
+# endif
+ ldrhi r8,[sp,#4*(32+2)] @ re-load len
+ eor r5,r5,r9
+ eor r6,r6,r10
+ str r4,[r14],#16 @ store output
+ eor r7,r7,r11
+ str r5,[r14,#-12]
+ sub r11,r8,#64*4 @ len-=64*4
+ str r6,[r14,#-8]
+ str r7,[r14,#-4]
+ bhi .Loop_neon_outer
+
+ b .Ldone_neon
+
+.align 4
+.Lbreak_neon:
+ @ harmonize NEON and integer-only stack frames: load data
+ @ from NEON frame, but save to integer-only one; distance
+ @ between the two is 4*(32+4+16-32)=4*(20).
+
+ str r11, [sp,#4*(20+32+2)] @ save len
+ add r11,sp,#4*(32+4)
+ str r12, [sp,#4*(20+32+1)] @ save inp
+ str r14, [sp,#4*(20+32+0)] @ save out
+
+ ldr r12,[sp,#4*(16+10)]
+ ldr r14,[sp,#4*(16+11)]
+ vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement
+ str r12,[sp,#4*(20+16+10)] @ copy "rx"
+ str r14,[sp,#4*(20+16+11)] @ copy "rx"
+
+ ldr r11, [sp,#4*(15)]
+ ldr r12,[sp,#4*(12)] @ modulo-scheduled load
+ ldr r10, [sp,#4*(13)]
+ ldr r14,[sp,#4*(14)]
+ str r11, [sp,#4*(20+16+15)]
+ add r11,sp,#4*(20)
+ vst1.32 {q0,q1},[r11]! @ copy key
+ add sp,sp,#4*(20) @ switch frame
+ vst1.32 {q2,q3},[r11]
+ mov r11,#10
+ b .Loop @ go integer-only
+
+.align 4
+.Ltail_neon:
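+ @ Fewer than 256 bytes remain: store whole 64-byte blocks directly, then
+ @ spill the next keystream block to the stack for the byte-wise tail loop.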
+ cmp r11,#64*3
+ bhs .L192_or_more_neon
+ cmp r11,#64*2
+ bhs .L128_or_more_neon
+ cmp r11,#64*1
+ bhs .L64_or_more_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q0,q1},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q2,q3},[r8]
+ b .Loop_tail_neon
+
+.align 4
+.L64_or_more_neon:
+ vld1.8 {q12,q13},[r12]!
+ vld1.8 {q14,q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vst1.8 {q0,q1},[r14]!
+ vst1.8 {q2,q3},[r14]!
+
+ beq .Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q4,q5},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q6,q7},[r8]
+ sub r11,r11,#64*1 @ len-=64*1
+ b .Loop_tail_neon
+
+.align 4
+.L128_or_more_neon:
+ vld1.8 {q12,q13},[r12]!
+ vld1.8 {q14,q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vst1.8 {q0,q1},[r14]!
+ veor q6,q6,q14
+ vst1.8 {q2,q3},[r14]!
+ veor q7,q7,q15
+ vst1.8 {q4,q5},[r14]!
+ vst1.8 {q6,q7},[r14]!
+
+ beq .Ldone_neon
+
+ add r8,sp,#4*(8)
+ vst1.8 {q8,q9},[sp]
+ add r10,sp,#4*(0)
+ vst1.8 {q10,q11},[r8]
+ sub r11,r11,#64*2 @ len-=64*2
+ b .Loop_tail_neon
+
+.align 4
+.L192_or_more_neon:
+ vld1.8 {q12,q13},[r12]!
+ vld1.8 {q14,q15},[r12]!
+ veor q0,q0,q12
+ veor q1,q1,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q2,q2,q14
+ veor q3,q3,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q4,q4,q12
+ veor q5,q5,q13
+ vld1.8 {q12,q13},[r12]!
+ veor q6,q6,q14
+ vst1.8 {q0,q1},[r14]!
+ veor q7,q7,q15
+ vld1.8 {q14,q15},[r12]!
+
+ veor q8,q8,q12
+ vst1.8 {q2,q3},[r14]!
+ veor q9,q9,q13
+ vst1.8 {q4,q5},[r14]!
+ veor q10,q10,q14
+ vst1.8 {q6,q7},[r14]!
+ veor q11,q11,q15
+ vst1.8 {q8,q9},[r14]!
+ vst1.8 {q10,q11},[r14]!
+
+ beq .Ldone_neon
+
+ ldmia sp,{r8,r9,r10,r11} @ load key material
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(4)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r5,r9
+ add r6,r6,r10
+ add r7,r7,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7}
+ add r0,sp,#4*(16+8)
+
+ ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half
+
+ add r0,r0,r8 @ accumulate key material
+ add r8,sp,#4*(12)
+ add r1,r1,r9
+ add r2,r2,r10
+ add r3,r3,r11
+ ldmia r8,{r8,r9,r10,r11} @ load key material
+
+ add r4,r4,r8 @ accumulate key material
+ add r8,sp,#4*(8)
+ add r5,r5,r9
+ add r4,r4,#3 @ counter+3
+ add r6,r6,r10
+ add r7,r7,r11
+ ldr r11,[sp,#4*(32+2)] @ re-load len
+# ifdef __ARMEB__
+ rev r0,r0
+ rev r1,r1
+ rev r2,r2
+ rev r3,r3
+ rev r4,r4
+ rev r5,r5
+ rev r6,r6
+ rev r7,r7
+# endif
+ stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7}
+ add r10,sp,#4*(0)
+ sub r11,r11,#64*3 @ len-=64*3
+
+.Loop_tail_neon:
+ ldrb r8,[r10],#1 @ read buffer on stack
+ ldrb r9,[r12],#1 @ read input
+ subs r11,r11,#1
+ eor r8,r8,r9
+ strb r8,[r14],#1 @ store output
+ bne .Loop_tail_neon
+
+.Ldone_neon:
+ add sp,sp,#4*(32+4)
+ vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15}
+ add sp,sp,#4*(16+3)
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+.size ChaCha20_neon,.-ChaCha20_neon
+.comm OPENSSL_armcap_P,4,4
+#endif
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S
@@ -1,0 +1,800 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.arch armv7-a @ don't confuse not-so-latest binutils with armv8 :-)
+.fpu neon
+.code 32
+#undef __thumb2__
+.align 5
+.Lrcon:
+.long 0x01,0x01,0x01,0x01
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat
+.long 0x1b,0x1b,0x1b,0x1b
+
+.text
+
+.globl aes_hw_set_encrypt_key
+.hidden aes_hw_set_encrypt_key
+.type aes_hw_set_encrypt_key,%function
+.align 5
+aes_hw_set_encrypt_key:
+.Lenc_key:
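+ @ Arguments: r0 = user key, r1 = bits (128/192/256), r2 = output key
+ @ schedule; returns 0 on success, -1 for NULL pointers, -2 for an
+ @ unsupported key length.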
+ mov r3,#-1
+ cmp r0,#0
+ beq .Lenc_key_abort
+ cmp r2,#0
+ beq .Lenc_key_abort
+ mov r3,#-2
+ cmp r1,#128
+ blt .Lenc_key_abort
+ cmp r1,#256
+ bgt .Lenc_key_abort
+ tst r1,#0x3f
+ bne .Lenc_key_abort
+
+ adr r3,.Lrcon
+ cmp r1,#192
+
+ veor q0,q0,q0
+ vld1.8 {q3},[r0]!
+ mov r1,#8 @ reuse r1
+ vld1.32 {q1,q2},[r3]!
+
+ blt .Loop128
+ beq .L192
+ b .L256
+
+.align 4
+.Loop128:
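+ @ Each iteration emits one round key: vtbl rotates and splats the last key
+ @ word, aese against the all-zero q0 applies SubBytes, and the round
+ @ constant in q1 (doubled each pass by vshl) is folded in.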
+ vtbl.8 d20,{q3},d4
+ vtbl.8 d21,{q3},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q3},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+ subs r1,r1,#1
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ vshl.u8 q1,q1,#1
+ veor q3,q3,q10
+ bne .Loop128
+
+ vld1.32 {q1},[r3]
+
+ vtbl.8 d20,{q3},d4
+ vtbl.8 d21,{q3},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q3},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ vshl.u8 q1,q1,#1
+ veor q3,q3,q10
+
+ vtbl.8 d20,{q3},d4
+ vtbl.8 d21,{q3},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q3},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ veor q3,q3,q10
+ vst1.32 {q3},[r2]
+ add r2,r2,#0x50
+
+ mov r12,#10
+ b .Ldone
+
+.align 4
+.L192:
+ vld1.8 {d16},[r0]!
+ vmov.i8 q10,#8 @ borrow q10
+ vst1.32 {q3},[r2]!
+ vsub.i8 q2,q2,q10 @ adjust the mask
+
+.Loop192:
+ vtbl.8 d20,{q8},d4
+ vtbl.8 d21,{q8},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {d16},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+ subs r1,r1,#1
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+
+ vdup.32 q9,d7[1]
+ veor q9,q9,q8
+ veor q10,q10,q1
+ vext.8 q8,q0,q8,#12
+ vshl.u8 q1,q1,#1
+ veor q8,q8,q9
+ veor q3,q3,q10
+ veor q8,q8,q10
+ vst1.32 {q3},[r2]!
+ bne .Loop192
+
+ mov r12,#12
+ add r2,r2,#0x20
+ b .Ldone
+
+.align 4
+.L256:
+ vld1.8 {q8},[r0]
+ mov r1,#7
+ mov r12,#14
+ vst1.32 {q3},[r2]!
+
+.Loop256:
+ vtbl.8 d20,{q8},d4
+ vtbl.8 d21,{q8},d5
+ vext.8 q9,q0,q3,#12
+ vst1.32 {q8},[r2]!
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+ subs r1,r1,#1
+
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q3,q3,q9
+ vext.8 q9,q0,q9,#12
+ veor q10,q10,q1
+ veor q3,q3,q9
+ vshl.u8 q1,q1,#1
+ veor q3,q3,q10
+ vst1.32 {q3},[r2]!
+ beq .Ldone
+
+ vdup.32 q10,d7[1]
+ vext.8 q9,q0,q8,#12
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0
+
+ veor q8,q8,q9
+ vext.8 q9,q0,q9,#12
+ veor q8,q8,q9
+ vext.8 q9,q0,q9,#12
+ veor q8,q8,q9
+
+ veor q8,q8,q10
+ b .Loop256
+
+.Ldone:
+ str r12,[r2]
+ mov r3,#0
+
+.Lenc_key_abort:
+ mov r0,r3 @ return value
+
+ bx lr
+.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+
+.globl aes_hw_set_decrypt_key
+.hidden aes_hw_set_decrypt_key
+.type aes_hw_set_decrypt_key,%function
+.align 5
+aes_hw_set_decrypt_key:
+ stmdb sp!,{r4,lr}
+ bl .Lenc_key
+
+ cmp r0,#0
+ bne .Ldec_key_abort
+
+ sub r2,r2,#240 @ restore original r2
+ mov r4,#-16
+ add r0,r2,r12,lsl#4 @ end of key schedule
+
+ vld1.32 {q0},[r2]
+ vld1.32 {q1},[r0]
+ vst1.32 {q0},[r0],r4
+ vst1.32 {q1},[r2]!
+
+.Loop_imc:
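+ @ Convert the encryption schedule in place: walk it from both ends,
+ @ applying InverseMixColumns (aesimc) and swapping entries; the first and
+ @ last round keys were already swapped above without aesimc.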
+ vld1.32 {q0},[r2]
+ vld1.32 {q1},[r0]
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+ vst1.32 {q0},[r0],r4
+ vst1.32 {q1},[r2]!
+ cmp r0,r2
+ bhi .Loop_imc
+
+ vld1.32 {q0},[r2]
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+ vst1.32 {q0},[r0]
+
+ eor r0,r0,r0 @ return value
+.Ldec_key_abort:
+ ldmia sp!,{r4,pc}
+.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl aes_hw_encrypt
+.hidden aes_hw_encrypt
+.type aes_hw_encrypt,%function
+.align 5
+aes_hw_encrypt:
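+ @ Single-block encrypt: r0 = in, r1 = out, r2 = key schedule
+ @ (round count at offset 240).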
+ AARCH64_VALID_CALL_TARGET
+ ldr r3,[r2,#240]
+ vld1.32 {q0},[r2]!
+ vld1.8 {q2},[r0]
+ sub r3,r3,#2
+ vld1.32 {q1},[r2]!
+
+.Loop_enc:
+.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
+.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q0},[r2]!
+ subs r3,r3,#2
+.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
+.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q1},[r2]!
+ bgt .Loop_enc
+
+.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0
+.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2
+ vld1.32 {q0},[r2]
+.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1
+ veor q2,q2,q0
+
+ vst1.8 {q2},[r1]
+ bx lr
+.size aes_hw_encrypt,.-aes_hw_encrypt
+.globl aes_hw_decrypt
+.hidden aes_hw_decrypt
+.type aes_hw_decrypt,%function
+.align 5
+aes_hw_decrypt:
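+ @ Single-block decrypt: r0 = in, r1 = out, r2 = key schedule
+ @ (round count at offset 240).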
+ AARCH64_VALID_CALL_TARGET
+ ldr r3,[r2,#240]
+ vld1.32 {q0},[r2]!
+ vld1.8 {q2},[r0]
+ sub r3,r3,#2
+ vld1.32 {q1},[r2]!
+
+.Loop_dec:
+.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
+.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q0},[r2]!
+ subs r3,r3,#2
+.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
+.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q1},[r2]!
+ bgt .Loop_dec
+
+.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0
+.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2
+ vld1.32 {q0},[r2]
+.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1
+ veor q2,q2,q0
+
+ vst1.8 {q2},[r1]
+ bx lr
+.size aes_hw_decrypt,.-aes_hw_decrypt
+.globl aes_hw_cbc_encrypt
+.hidden aes_hw_cbc_encrypt
+.type aes_hw_cbc_encrypt,%function
+.align 5
+aes_hw_cbc_encrypt:
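+ @ Arguments: r0 = in, r1 = out, r2 = length, r3 = key schedule,
+ @ [sp] = ivec, [sp,#4] = enc flag (non-zero for encryption).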
+ mov ip,sp
+ stmdb sp!,{r4,r5,r6,r7,r8,lr}
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
+ ldmia ip,{r4,r5} @ load remaining args
+ subs r2,r2,#16
+ mov r8,#16
+ blo .Lcbc_abort
+ moveq r8,#0
+
+ cmp r5,#0 @ en- or decrypting?
+ ldr r5,[r3,#240]
+ and r2,r2,#-16
+ vld1.8 {q6},[r4]
+ vld1.8 {q0},[r0],r8
+
+ vld1.32 {q8,q9},[r3] @ load key schedule...
+ sub r5,r5,#6
+ add r7,r3,r5,lsl#4 @ pointer to last 7 round keys
+ sub r5,r5,#2
+ vld1.32 {q10,q11},[r7]!
+ vld1.32 {q12,q13},[r7]!
+ vld1.32 {q14,q15},[r7]!
+ vld1.32 {q7},[r7]
+
+ add r7,r3,#32
+ mov r6,r5
+ beq .Lcbc_dec
+
+ cmp r5,#2
+ veor q0,q0,q6
+ veor q5,q8,q7
+ beq .Lcbc_enc128
+
+ vld1.32 {q2,q3},[r7]
+ add r7,r3,#16
+ add r6,r3,#16*4
+ add r12,r3,#16*5
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ add r14,r3,#16*6
+ add r3,r3,#16*7
+ b .Lenter_cbc_enc
+
+.align 4
+.Loop_cbc_enc:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vst1.8 {q6},[r1]!
+.Lenter_cbc_enc:
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q8},[r6]
+ cmp r5,#4
+.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r12]
+ beq .Lcbc_enc192
+
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q8},[r14]
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r3]
+ nop
+
+.Lcbc_enc192:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ subs r2,r2,#16
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ moveq r8,#0
+.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.8 {q8},[r0],r8
+.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ veor q8,q8,q5
+.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.32 {q9},[r7] @ re-pre-load rndkey[1]
+.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
+ veor q6,q0,q7
+ bhs .Loop_cbc_enc
+
+ vst1.8 {q6},[r1]!
+ b .Lcbc_done
+
+.align 5
+.Lcbc_enc128:
+ vld1.32 {q2,q3},[r7]
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ b .Lenter_cbc_enc128
+.Loop_cbc_enc128:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vst1.8 {q6},[r1]!
+.Lenter_cbc_enc128:
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ subs r2,r2,#16
+.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ moveq r8,#0
+.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ vld1.8 {q8},[r0],r8
+.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+ veor q8,q8,q5
+.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
+ veor q6,q0,q7
+ bhs .Loop_cbc_enc128
+
+ vst1.8 {q6},[r1]!
+ b .Lcbc_done
+.align 5
+.Lcbc_dec:
+ vld1.8 {q10},[r0]!
+ subs r2,r2,#32 @ bias
+ add r6,r5,#2
+ vorr q3,q0,q0
+ vorr q1,q0,q0
+ vorr q11,q10,q10
+ blo .Lcbc_dec_tail
+
+ vorr q1,q10,q10
+ vld1.8 {q10},[r0]!
+ vorr q2,q0,q0
+ vorr q3,q1,q1
+ vorr q11,q10,q10
+
+.Loop3x_cbc_dec:
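+ @ Decrypt three blocks (q0, q1, q10) in parallel; interleaving the
+ @ aesd/aesimc pairs hides their latency.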
+.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q8},[r7]!
+ subs r6,r6,#2
+.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q9},[r7]!
+ bgt .Loop3x_cbc_dec
+
+.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q4,q6,q7
+ subs r2,r2,#0x30
+ veor q5,q2,q7
+ movlo r6,r2 @ r6 is zero at this point
+.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q9,q3,q7
+ add r0,r0,r6 @ r0 is adjusted in such a way that
+ @ at exit from the loop q1-q10
+ @ are loaded with last "words"
+ vorr q6,q11,q11
+ mov r7,r3
+.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.8 {q2},[r0]!
+.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.8 {q3},[r0]!
+.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14
+.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0
+.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.8 {q11},[r0]!
+.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15
+.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
+.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
+ vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
+ add r6,r5,#2
+ veor q4,q4,q0
+ veor q5,q5,q1
+ veor q10,q10,q9
+ vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
+ vst1.8 {q4},[r1]!
+ vorr q0,q2,q2
+ vst1.8 {q5},[r1]!
+ vorr q1,q3,q3
+ vst1.8 {q10},[r1]!
+ vorr q10,q11,q11
+ bhs .Loop3x_cbc_dec
+
+ cmn r2,#0x30
+ beq .Lcbc_done
+ nop
+
+.Lcbc_dec_tail:
+.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q8},[r7]!
+ subs r6,r6,#2
+.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ vld1.32 {q9},[r7]!
+ bgt .Lcbc_dec_tail
+
+.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ cmn r2,#0x20
+.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q5,q6,q7
+.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14
+.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1
+.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14
+.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10
+ veor q9,q3,q7
+.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15
+.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15
+ beq .Lcbc_dec_one
+ veor q5,q5,q1
+ veor q9,q9,q10
+ vorr q6,q11,q11
+ vst1.8 {q5},[r1]!
+ vst1.8 {q9},[r1]!
+ b .Lcbc_done
+
+.Lcbc_dec_one:
+ veor q5,q5,q10
+ vorr q6,q11,q11
+ vst1.8 {q5},[r1]!
+
+.Lcbc_done:
+ vst1.8 {q6},[r4]
+.Lcbc_abort:
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!,{r4,r5,r6,r7,r8,pc}
+.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl aes_hw_ctr32_encrypt_blocks
+.hidden aes_hw_ctr32_encrypt_blocks
+.type aes_hw_ctr32_encrypt_blocks,%function
+.align 5
+aes_hw_ctr32_encrypt_blocks:
+ mov ip,sp
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr}
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
+ ldr r4, [ip] @ load remaining arg
+ ldr r5,[r3,#240]
+
+ ldr r8, [r4, #12]
+ vld1.32 {q0},[r4]
+
+ vld1.32 {q8,q9},[r3] @ load key schedule...
+ sub r5,r5,#4
+ mov r12,#16
+ cmp r2,#2
+ add r7,r3,r5,lsl#4 @ pointer to last 5 round keys
+ sub r5,r5,#2
+ vld1.32 {q12,q13},[r7]!
+ vld1.32 {q14,q15},[r7]!
+ vld1.32 {q7},[r7]
+ add r7,r3,#32
+ mov r6,r5
+ movlo r12,#0
+
+ @ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are
+ @ affected by silicon errata #1742098 [0] and #1655431 [1],
+ @ respectively, where the second instruction of an aese/aesmc
+ @ instruction pair may execute twice if an interrupt is taken right
+ @ after the first instruction consumes an input register of which a
+ @ single 32-bit lane has been updated the last time it was modified.
+ @
+ @ This function uses a counter in one 32-bit lane. The code
+ @ could write to q1 and q10 directly, but that trips these bugs.
+ @ We write to q6 and copy to the final register as a workaround.
+ @
+ @ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice
+ @ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice
+#ifndef __ARMEB__
+ rev r8, r8
+#endif
+ add r10, r8, #1
+ vorr q6,q0,q0
+ rev r10, r10
+ vmov.32 d13[1],r10
+ add r8, r8, #2
+ vorr q1,q6,q6
+ bls .Lctr32_tail
+ rev r12, r8
+ vmov.32 d13[1],r12
+ sub r2,r2,#3 @ bias
+ vorr q10,q6,q6
+ b .Loop3x_ctr32
+
+.align 4
+.Loop3x_ctr32:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
+.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.32 {q8},[r7]!
+ subs r6,r6,#2
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
+.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.32 {q9},[r7]!
+ bgt .Loop3x_ctr32
+
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0
+.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1
+ vld1.8 {q2},[r0]!
+ add r9,r8,#1
+.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8
+.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10
+ vld1.8 {q3},[r0]!
+ rev r9,r9
+.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9
+.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9
+.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ vld1.8 {q11},[r0]!
+ mov r7,r3
+.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9
+.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10
+.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12
+.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12
+.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ veor q2,q2,q7
+ add r10,r8,#2
+.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12
+.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
+ veor q3,q3,q7
+ add r8,r8,#3
+.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13
+.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13
+.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ @ Note the logic to update q0, q1, and q10 is written to work
+ @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in
+ @ 32-bit mode. See the comment above.
+ veor q11,q11,q7
+ vmov.32 d13[1], r9
+.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13
+.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
+ vorr q0,q6,q6
+ rev r10,r10
+.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14
+.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4
+ vmov.32 d13[1], r10
+ rev r12,r8
+.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14
+.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5
+ vorr q1,q6,q6
+ vmov.32 d13[1], r12
+.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14
+.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9
+ vorr q10,q6,q6
+ subs r2,r2,#3
+.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15
+.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15
+.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15
+
+ veor q2,q2,q4
+ vld1.32 {q8},[r7]! @ re-pre-load rndkey[0]
+ vst1.8 {q2},[r1]!
+ veor q3,q3,q5
+ mov r6,r5
+ vst1.8 {q3},[r1]!
+ veor q11,q11,q9
+ vld1.32 {q9},[r7]! @ re-pre-load rndkey[1]
+ vst1.8 {q11},[r1]!
+ bhs .Loop3x_ctr32
+
+ adds r2,r2,#3
+ beq .Lctr32_done
+ cmp r2,#1
+ mov r12,#16
+ moveq r12,#0
+
+.Lctr32_tail:
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.32 {q8},[r7]!
+ subs r6,r6,#2
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.32 {q9},[r7]!
+ bgt .Lctr32_tail
+
+.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.8 {q2},[r0],r12
+.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ vld1.8 {q3},[r0]
+.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ veor q2,q2,q7
+.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14
+.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0
+.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14
+.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1
+ veor q3,q3,q7
+.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15
+.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15
+
+ cmp r2,#1
+ veor q2,q2,q0
+ veor q3,q3,q1
+ vst1.8 {q2},[r1]!
+ beq .Lctr32_done
+ vst1.8 {q3},[r1]
+
+.Lctr32_done:
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc}
+.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+#endif
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S
@@ -1,0 +1,977 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch armv7-a
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+.align 5
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.Lbn_mul_mont
+#endif
+
+.globl bn_mul_mont
+.hidden bn_mul_mont
+.type bn_mul_mont,%function
+
+.align 5
+bn_mul_mont:
+.Lbn_mul_mont:
+ ldr ip,[sp,#4] @ load num
+ stmdb sp!,{r0,r2} @ sp points at argument block
+#if __ARM_MAX_ARCH__>=7
+ tst ip,#7
+ bne .Lialu
+ adr r0,.Lbn_mul_mont
+ ldr r2,.LOPENSSL_armcap
+ ldr r0,[r0,r2]
+#ifdef __APPLE__
+ ldr r0,[r0]
+#endif
+ tst r0,#ARMV7_NEON @ NEON available?
+ ldmia sp, {r0,r2}
+ beq .Lialu
+ add sp,sp,#8
+ b bn_mul8x_mont_neon
+.align 4
+.Lialu:
+#endif
+ cmp ip,#2
+ mov r0,ip @ load num
+#ifdef __thumb2__
+ ittt lt
+#endif
+ movlt r0,#0
+ addlt sp,sp,#2*4
+ blt .Labrt
+
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers
+
+ mov r0,r0,lsl#2 @ rescale r0 for byte count
+ sub sp,sp,r0 @ alloca(4*num)
+ sub sp,sp,#4 @ +extra dword
+ sub r0,r0,#4 @ "num=num-1"
+ add r4,r2,r0 @ &bp[num-1]
+
+ add r0,sp,r0 @ r0 to point at &tp[num-1]
+ ldr r8,[r0,#14*4] @ &n0
+ ldr r2,[r2] @ bp[0]
+ ldr r5,[r1],#4 @ ap[0],ap++
+ ldr r6,[r3],#4 @ np[0],np++
+ ldr r8,[r8] @ *n0
+ str r4,[r0,#15*4] @ save &bp[num]
+
+ umull r10,r11,r5,r2 @ ap[0]*bp[0]
+ str r8,[r0,#14*4] @ save n0 value
+ mul r8,r10,r8 @ "tp[0]"*n0
+ mov r12,#0
+ umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]"
+ mov r4,sp
+
+.L1st:
+ ldr r5,[r1],#4 @ ap[j],ap++
+ mov r10,r11
+ ldr r6,[r3],#4 @ np[j],np++
+ mov r11,#0
+ umlal r10,r11,r5,r2 @ ap[j]*bp[0]
+ mov r14,#0
+ umlal r12,r14,r6,r8 @ np[j]*n0
+ adds r12,r12,r10
+ str r12,[r4],#4 @ tp[j-1]=,tp++
+ adc r12,r14,#0
+ cmp r4,r0
+ bne .L1st
+
+ adds r12,r12,r11
+ ldr r4,[r0,#13*4] @ restore bp
+ mov r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
+ adc r14,r14,#0
+ str r12,[r0] @ tp[num-1]=
+ mov r7,sp
+ str r14,[r0,#4] @ tp[num]=
+
+.Louter:
+ sub r7,r0,r7 @ "original" r0-1 value
+ sub r1,r1,r7 @ "rewind" ap to &ap[1]
+ ldr r2,[r4,#4]! @ *(++bp)
+ sub r3,r3,r7 @ "rewind" np to &np[1]
+ ldr r5,[r1,#-4] @ ap[0]
+ ldr r10,[sp] @ tp[0]
+ ldr r6,[r3,#-4] @ np[0]
+ ldr r7,[sp,#4] @ tp[1]
+
+ mov r11,#0
+ umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0]
+ str r4,[r0,#13*4] @ save bp
+ mul r8,r10,r8
+ mov r12,#0
+ umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]"
+ mov r4,sp
+
+.Linner:
+ ldr r5,[r1],#4 @ ap[j],ap++
+ adds r10,r11,r7 @ +=tp[j]
+ ldr r6,[r3],#4 @ np[j],np++
+ mov r11,#0
+ umlal r10,r11,r5,r2 @ ap[j]*bp[i]
+ mov r14,#0
+ umlal r12,r14,r6,r8 @ np[j]*n0
+ adc r11,r11,#0
+ ldr r7,[r4,#8] @ tp[j+1]
+ adds r12,r12,r10
+ str r12,[r4],#4 @ tp[j-1]=,tp++
+ adc r12,r14,#0
+ cmp r4,r0
+ bne .Linner
+
+ adds r12,r12,r11
+ mov r14,#0
+ ldr r4,[r0,#13*4] @ restore bp
+ adc r14,r14,#0
+ ldr r8,[r0,#14*4] @ restore n0
+ adds r12,r12,r7
+ ldr r7,[r0,#15*4] @ restore &bp[num]
+ adc r14,r14,#0
+ str r12,[r0] @ tp[num-1]=
+ str r14,[r0,#4] @ tp[num]=
+
+ cmp r4,r7
+#ifdef __thumb2__
+ itt ne
+#endif
+ movne r7,sp
+ bne .Louter
+
+ ldr r2,[r0,#12*4] @ pull rp
+ mov r5,sp
+ add r0,r0,#4 @ r0 to point at &tp[num]
+ sub r5,r0,r5 @ "original" num value
+ mov r4,sp @ "rewind" r4
+ mov r1,r4 @ "borrow" r1
+ sub r3,r3,r5 @ "rewind" r3 to &np[0]
+
+ subs r7,r7,r7 @ "clear" carry flag
+.Lsub: ldr r7,[r4],#4
+ ldr r6,[r3],#4
+ sbcs r7,r7,r6 @ tp[j]-np[j]
+ str r7,[r2],#4 @ rp[j]=
+ teq r4,r0 @ preserve carry
+ bne .Lsub
+ sbcs r14,r14,#0 @ upmost carry
+ mov r4,sp @ "rewind" r4
+ sub r2,r2,r5 @ "rewind" r2
+
+.Lcopy: ldr r7,[r4] @ conditional copy
+ ldr r5,[r2]
+ str sp,[r4],#4 @ zap tp
+#ifdef __thumb2__
+ it cc
+#endif
+ movcc r5,r7
+ str r5,[r2],#4
+ teq r4,r0 @ preserve carry
+ bne .Lcopy
+
+ mov sp,r0
+ add sp,sp,#4 @ skip over tp[num+1]
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers
+ add sp,sp,#2*4 @ skip over {r0,r2}
+ mov r0,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+ bx lr @ bx lr
+#else
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size bn_mul_mont,.-bn_mul_mont
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.type bn_mul8x_mont_neon,%function
+.align 5
+bn_mul8x_mont_neon:
+ mov ip,sp
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
+ ldmia ip,{r4,r5} @ load rest of parameter block
+ mov ip,sp
+
+ cmp r5,#8
+ bhi .LNEON_8n
+
+ @ special case for r5==8, everything is in register bank...
+
+ vld1.32 {d28[0]}, [r2,:32]!
+ veor d8,d8,d8
+ sub r7,sp,r5,lsl#4
+ vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-(
+ and r7,r7,#-64
+ vld1.32 {d30[0]}, [r4,:32]
+ mov sp,r7 @ alloca
+ vzip.16 d28,d8
+
+ vmull.u32 q6,d28,d0[0]
+ vmull.u32 q7,d28,d0[1]
+ vmull.u32 q8,d28,d1[0]
+ vshl.i64 d29,d13,#16
+ vmull.u32 q9,d28,d1[1]
+
+ vadd.u64 d29,d29,d12
+ veor d8,d8,d8
+ vmul.u32 d29,d29,d30
+
+ vmull.u32 q10,d28,d2[0]
+ vld1.32 {d4,d5,d6,d7}, [r3]!
+ vmull.u32 q11,d28,d2[1]
+ vmull.u32 q12,d28,d3[0]
+ vzip.16 d29,d8
+ vmull.u32 q13,d28,d3[1]
+
+ vmlal.u32 q6,d29,d4[0]
+ sub r9,r5,#1
+ vmlal.u32 q7,d29,d4[1]
+ vmlal.u32 q8,d29,d5[0]
+ vmlal.u32 q9,d29,d5[1]
+
+ vmlal.u32 q10,d29,d6[0]
+ vmov q5,q6
+ vmlal.u32 q11,d29,d6[1]
+ vmov q6,q7
+ vmlal.u32 q12,d29,d7[0]
+ vmov q7,q8
+ vmlal.u32 q13,d29,d7[1]
+ vmov q8,q9
+ vmov q9,q10
+ vshr.u64 d10,d10,#16
+ vmov q10,q11
+ vmov q11,q12
+ vadd.u64 d10,d10,d11
+ vmov q12,q13
+ veor q13,q13
+ vshr.u64 d10,d10,#16
+
+ b .LNEON_outer8
+
+.align 4
+.LNEON_outer8:
+ vld1.32 {d28[0]}, [r2,:32]!
+ veor d8,d8,d8
+ vzip.16 d28,d8
+ vadd.u64 d12,d12,d10
+
+ vmlal.u32 q6,d28,d0[0]
+ vmlal.u32 q7,d28,d0[1]
+ vmlal.u32 q8,d28,d1[0]
+ vshl.i64 d29,d13,#16
+ vmlal.u32 q9,d28,d1[1]
+
+ vadd.u64 d29,d29,d12
+ veor d8,d8,d8
+ subs r9,r9,#1
+ vmul.u32 d29,d29,d30
+
+ vmlal.u32 q10,d28,d2[0]
+ vmlal.u32 q11,d28,d2[1]
+ vmlal.u32 q12,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q13,d28,d3[1]
+
+ vmlal.u32 q6,d29,d4[0]
+ vmlal.u32 q7,d29,d4[1]
+ vmlal.u32 q8,d29,d5[0]
+ vmlal.u32 q9,d29,d5[1]
+
+ vmlal.u32 q10,d29,d6[0]
+ vmov q5,q6
+ vmlal.u32 q11,d29,d6[1]
+ vmov q6,q7
+ vmlal.u32 q12,d29,d7[0]
+ vmov q7,q8
+ vmlal.u32 q13,d29,d7[1]
+ vmov q8,q9
+ vmov q9,q10
+ vshr.u64 d10,d10,#16
+ vmov q10,q11
+ vmov q11,q12
+ vadd.u64 d10,d10,d11
+ vmov q12,q13
+ veor q13,q13
+ vshr.u64 d10,d10,#16
+
+ bne .LNEON_outer8
+
+ vadd.u64 d12,d12,d10
+ mov r7,sp
+ vshr.u64 d10,d12,#16
+ mov r8,r5
+ vadd.u64 d13,d13,d10
+ add r6,sp,#96
+ vshr.u64 d10,d13,#16
+ vzip.16 d12,d13
+
+ b .LNEON_tail_entry
+
+.align 4
+.LNEON_8n:
+ veor q6,q6,q6
+ sub r7,sp,#128
+ veor q7,q7,q7
+ sub r7,r7,r5,lsl#4
+ veor q8,q8,q8
+ and r7,r7,#-64
+ veor q9,q9,q9
+ mov sp,r7 @ alloca
+ veor q10,q10,q10
+ add r7,r7,#256
+ veor q11,q11,q11
+ sub r8,r5,#8
+ veor q12,q12,q12
+ veor q13,q13,q13
+
+.LNEON_8n_init:
+ vst1.64 {q6,q7},[r7,:256]!
+ subs r8,r8,#8
+ vst1.64 {q8,q9},[r7,:256]!
+ vst1.64 {q10,q11},[r7,:256]!
+ vst1.64 {q12,q13},[r7,:256]!
+ bne .LNEON_8n_init
+
+ add r6,sp,#256
+ vld1.32 {d0,d1,d2,d3},[r1]!
+ add r10,sp,#8
+ vld1.32 {d30[0]},[r4,:32]
+ mov r9,r5
+ b .LNEON_8n_outer
+
+.align 4
+.LNEON_8n_outer:
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ veor d8,d8,d8
+ vzip.16 d28,d8
+ add r7,sp,#128
+ vld1.32 {d4,d5,d6,d7},[r3]!
+
+ vmlal.u32 q6,d28,d0[0]
+ vmlal.u32 q7,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q8,d28,d1[0]
+ vshl.i64 d29,d13,#16
+ vmlal.u32 q9,d28,d1[1]
+ vadd.u64 d29,d29,d12
+ vmlal.u32 q10,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q11,d28,d2[1]
+ vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0]
+ vmlal.u32 q12,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q13,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q6,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q7,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q8,d29,d5[0]
+ vshr.u64 d12,d12,#16
+ vmlal.u32 q9,d29,d5[1]
+ vmlal.u32 q10,d29,d6[0]
+ vadd.u64 d12,d12,d13
+ vmlal.u32 q11,d29,d6[1]
+ vshr.u64 d12,d12,#16
+ vmlal.u32 q12,d29,d7[0]
+ vmlal.u32 q13,d29,d7[1]
+ vadd.u64 d14,d14,d12
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0]
+ vmlal.u32 q7,d28,d0[0]
+ vld1.64 {q6},[r6,:128]!
+ vmlal.u32 q8,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q9,d28,d1[0]
+ vshl.i64 d29,d15,#16
+ vmlal.u32 q10,d28,d1[1]
+ vadd.u64 d29,d29,d14
+ vmlal.u32 q11,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q12,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1]
+ vmlal.u32 q13,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q6,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q7,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q8,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q9,d29,d5[0]
+ vshr.u64 d14,d14,#16
+ vmlal.u32 q10,d29,d5[1]
+ vmlal.u32 q11,d29,d6[0]
+ vadd.u64 d14,d14,d15
+ vmlal.u32 q12,d29,d6[1]
+ vshr.u64 d14,d14,#16
+ vmlal.u32 q13,d29,d7[0]
+ vmlal.u32 q6,d29,d7[1]
+ vadd.u64 d16,d16,d14
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1]
+ vmlal.u32 q8,d28,d0[0]
+ vld1.64 {q7},[r6,:128]!
+ vmlal.u32 q9,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q10,d28,d1[0]
+ vshl.i64 d29,d17,#16
+ vmlal.u32 q11,d28,d1[1]
+ vadd.u64 d29,d29,d16
+ vmlal.u32 q12,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q13,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+2]
+ vmlal.u32 q6,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q7,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q8,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q9,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q10,d29,d5[0]
+ vshr.u64 d16,d16,#16
+ vmlal.u32 q11,d29,d5[1]
+ vmlal.u32 q12,d29,d6[0]
+ vadd.u64 d16,d16,d17
+ vmlal.u32 q13,d29,d6[1]
+ vshr.u64 d16,d16,#16
+ vmlal.u32 q6,d29,d7[0]
+ vmlal.u32 q7,d29,d7[1]
+ vadd.u64 d18,d18,d16
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2]
+ vmlal.u32 q9,d28,d0[0]
+ vld1.64 {q8},[r6,:128]!
+ vmlal.u32 q10,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q11,d28,d1[0]
+ vshl.i64 d29,d19,#16
+ vmlal.u32 q12,d28,d1[1]
+ vadd.u64 d29,d29,d18
+ vmlal.u32 q13,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q6,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3]
+ vmlal.u32 q7,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q8,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q9,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q10,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q11,d29,d5[0]
+ vshr.u64 d18,d18,#16
+ vmlal.u32 q12,d29,d5[1]
+ vmlal.u32 q13,d29,d6[0]
+ vadd.u64 d18,d18,d19
+ vmlal.u32 q6,d29,d6[1]
+ vshr.u64 d18,d18,#16
+ vmlal.u32 q7,d29,d7[0]
+ vmlal.u32 q8,d29,d7[1]
+ vadd.u64 d20,d20,d18
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3]
+ vmlal.u32 q10,d28,d0[0]
+ vld1.64 {q9},[r6,:128]!
+ vmlal.u32 q11,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q12,d28,d1[0]
+ vshl.i64 d29,d21,#16
+ vmlal.u32 q13,d28,d1[1]
+ vadd.u64 d29,d29,d20
+ vmlal.u32 q6,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q7,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+4]
+ vmlal.u32 q8,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q9,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q10,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q11,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q12,d29,d5[0]
+ vshr.u64 d20,d20,#16
+ vmlal.u32 q13,d29,d5[1]
+ vmlal.u32 q6,d29,d6[0]
+ vadd.u64 d20,d20,d21
+ vmlal.u32 q7,d29,d6[1]
+ vshr.u64 d20,d20,#16
+ vmlal.u32 q8,d29,d7[0]
+ vmlal.u32 q9,d29,d7[1]
+ vadd.u64 d22,d22,d20
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4]
+ vmlal.u32 q11,d28,d0[0]
+ vld1.64 {q10},[r6,:128]!
+ vmlal.u32 q12,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q13,d28,d1[0]
+ vshl.i64 d29,d23,#16
+ vmlal.u32 q6,d28,d1[1]
+ vadd.u64 d29,d29,d22
+ vmlal.u32 q7,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q8,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5]
+ vmlal.u32 q9,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q10,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q11,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q12,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q13,d29,d5[0]
+ vshr.u64 d22,d22,#16
+ vmlal.u32 q6,d29,d5[1]
+ vmlal.u32 q7,d29,d6[0]
+ vadd.u64 d22,d22,d23
+ vmlal.u32 q8,d29,d6[1]
+ vshr.u64 d22,d22,#16
+ vmlal.u32 q9,d29,d7[0]
+ vmlal.u32 q10,d29,d7[1]
+ vadd.u64 d24,d24,d22
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5]
+ vmlal.u32 q12,d28,d0[0]
+ vld1.64 {q11},[r6,:128]!
+ vmlal.u32 q13,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q6,d28,d1[0]
+ vshl.i64 d29,d25,#16
+ vmlal.u32 q7,d28,d1[1]
+ vadd.u64 d29,d29,d24
+ vmlal.u32 q8,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q9,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6]
+ vmlal.u32 q10,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q11,d28,d3[1]
+ vld1.32 {d28[0]},[r2,:32]! @ *b++
+ vmlal.u32 q12,d29,d4[0]
+ veor d10,d10,d10
+ vmlal.u32 q13,d29,d4[1]
+ vzip.16 d28,d10
+ vmlal.u32 q6,d29,d5[0]
+ vshr.u64 d24,d24,#16
+ vmlal.u32 q7,d29,d5[1]
+ vmlal.u32 q8,d29,d6[0]
+ vadd.u64 d24,d24,d25
+ vmlal.u32 q9,d29,d6[1]
+ vshr.u64 d24,d24,#16
+ vmlal.u32 q10,d29,d7[0]
+ vmlal.u32 q11,d29,d7[1]
+ vadd.u64 d26,d26,d24
+ vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6]
+ vmlal.u32 q13,d28,d0[0]
+ vld1.64 {q12},[r6,:128]!
+ vmlal.u32 q6,d28,d0[1]
+ veor d8,d8,d8
+ vmlal.u32 q7,d28,d1[0]
+ vshl.i64 d29,d27,#16
+ vmlal.u32 q8,d28,d1[1]
+ vadd.u64 d29,d29,d26
+ vmlal.u32 q9,d28,d2[0]
+ vmul.u32 d29,d29,d30
+ vmlal.u32 q10,d28,d2[1]
+ vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7]
+ vmlal.u32 q11,d28,d3[0]
+ vzip.16 d29,d8
+ vmlal.u32 q12,d28,d3[1]
+ vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0]
+ vmlal.u32 q13,d29,d4[0]
+ vld1.32 {d0,d1,d2,d3},[r1]!
+ vmlal.u32 q6,d29,d4[1]
+ vmlal.u32 q7,d29,d5[0]
+ vshr.u64 d26,d26,#16
+ vmlal.u32 q8,d29,d5[1]
+ vmlal.u32 q9,d29,d6[0]
+ vadd.u64 d26,d26,d27
+ vmlal.u32 q10,d29,d6[1]
+ vshr.u64 d26,d26,#16
+ vmlal.u32 q11,d29,d7[0]
+ vmlal.u32 q12,d29,d7[1]
+ vadd.u64 d12,d12,d26
+ vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7]
+ add r10,sp,#8 @ rewind
+ sub r8,r5,#8
+ b .LNEON_8n_inner
+
+.align 4
+.LNEON_8n_inner:
+ subs r8,r8,#8
+ vmlal.u32 q6,d28,d0[0]
+ vld1.64 {q13},[r6,:128]
+ vmlal.u32 q7,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0]
+ vmlal.u32 q8,d28,d1[0]
+ vld1.32 {d4,d5,d6,d7},[r3]!
+ vmlal.u32 q9,d28,d1[1]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q10,d28,d2[0]
+ vmlal.u32 q11,d28,d2[1]
+ vmlal.u32 q12,d28,d3[0]
+ vmlal.u32 q13,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+1]
+ vmlal.u32 q6,d29,d4[0]
+ vmlal.u32 q7,d29,d4[1]
+ vmlal.u32 q8,d29,d5[0]
+ vmlal.u32 q9,d29,d5[1]
+ vmlal.u32 q10,d29,d6[0]
+ vmlal.u32 q11,d29,d6[1]
+ vmlal.u32 q12,d29,d7[0]
+ vmlal.u32 q13,d29,d7[1]
+ vst1.64 {q6},[r7,:128]!
+ vmlal.u32 q7,d28,d0[0]
+ vld1.64 {q6},[r6,:128]
+ vmlal.u32 q8,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1]
+ vmlal.u32 q9,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q10,d28,d1[1]
+ vmlal.u32 q11,d28,d2[0]
+ vmlal.u32 q12,d28,d2[1]
+ vmlal.u32 q13,d28,d3[0]
+ vmlal.u32 q6,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2]
+ vmlal.u32 q7,d29,d4[0]
+ vmlal.u32 q8,d29,d4[1]
+ vmlal.u32 q9,d29,d5[0]
+ vmlal.u32 q10,d29,d5[1]
+ vmlal.u32 q11,d29,d6[0]
+ vmlal.u32 q12,d29,d6[1]
+ vmlal.u32 q13,d29,d7[0]
+ vmlal.u32 q6,d29,d7[1]
+ vst1.64 {q7},[r7,:128]!
+ vmlal.u32 q8,d28,d0[0]
+ vld1.64 {q7},[r6,:128]
+ vmlal.u32 q9,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2]
+ vmlal.u32 q10,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q11,d28,d1[1]
+ vmlal.u32 q12,d28,d2[0]
+ vmlal.u32 q13,d28,d2[1]
+ vmlal.u32 q6,d28,d3[0]
+ vmlal.u32 q7,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3]
+ vmlal.u32 q8,d29,d4[0]
+ vmlal.u32 q9,d29,d4[1]
+ vmlal.u32 q10,d29,d5[0]
+ vmlal.u32 q11,d29,d5[1]
+ vmlal.u32 q12,d29,d6[0]
+ vmlal.u32 q13,d29,d6[1]
+ vmlal.u32 q6,d29,d7[0]
+ vmlal.u32 q7,d29,d7[1]
+ vst1.64 {q8},[r7,:128]!
+ vmlal.u32 q9,d28,d0[0]
+ vld1.64 {q8},[r6,:128]
+ vmlal.u32 q10,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3]
+ vmlal.u32 q11,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q12,d28,d1[1]
+ vmlal.u32 q13,d28,d2[0]
+ vmlal.u32 q6,d28,d2[1]
+ vmlal.u32 q7,d28,d3[0]
+ vmlal.u32 q8,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4]
+ vmlal.u32 q9,d29,d4[0]
+ vmlal.u32 q10,d29,d4[1]
+ vmlal.u32 q11,d29,d5[0]
+ vmlal.u32 q12,d29,d5[1]
+ vmlal.u32 q13,d29,d6[0]
+ vmlal.u32 q6,d29,d6[1]
+ vmlal.u32 q7,d29,d7[0]
+ vmlal.u32 q8,d29,d7[1]
+ vst1.64 {q9},[r7,:128]!
+ vmlal.u32 q10,d28,d0[0]
+ vld1.64 {q9},[r6,:128]
+ vmlal.u32 q11,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4]
+ vmlal.u32 q12,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q13,d28,d1[1]
+ vmlal.u32 q6,d28,d2[0]
+ vmlal.u32 q7,d28,d2[1]
+ vmlal.u32 q8,d28,d3[0]
+ vmlal.u32 q9,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5]
+ vmlal.u32 q10,d29,d4[0]
+ vmlal.u32 q11,d29,d4[1]
+ vmlal.u32 q12,d29,d5[0]
+ vmlal.u32 q13,d29,d5[1]
+ vmlal.u32 q6,d29,d6[0]
+ vmlal.u32 q7,d29,d6[1]
+ vmlal.u32 q8,d29,d7[0]
+ vmlal.u32 q9,d29,d7[1]
+ vst1.64 {q10},[r7,:128]!
+ vmlal.u32 q11,d28,d0[0]
+ vld1.64 {q10},[r6,:128]
+ vmlal.u32 q12,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5]
+ vmlal.u32 q13,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q6,d28,d1[1]
+ vmlal.u32 q7,d28,d2[0]
+ vmlal.u32 q8,d28,d2[1]
+ vmlal.u32 q9,d28,d3[0]
+ vmlal.u32 q10,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6]
+ vmlal.u32 q11,d29,d4[0]
+ vmlal.u32 q12,d29,d4[1]
+ vmlal.u32 q13,d29,d5[0]
+ vmlal.u32 q6,d29,d5[1]
+ vmlal.u32 q7,d29,d6[0]
+ vmlal.u32 q8,d29,d6[1]
+ vmlal.u32 q9,d29,d7[0]
+ vmlal.u32 q10,d29,d7[1]
+ vst1.64 {q11},[r7,:128]!
+ vmlal.u32 q12,d28,d0[0]
+ vld1.64 {q11},[r6,:128]
+ vmlal.u32 q13,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+6]
+ vmlal.u32 q6,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q7,d28,d1[1]
+ vmlal.u32 q8,d28,d2[0]
+ vmlal.u32 q9,d28,d2[1]
+ vmlal.u32 q10,d28,d3[0]
+ vmlal.u32 q11,d28,d3[1]
+ vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7]
+ vmlal.u32 q12,d29,d4[0]
+ vmlal.u32 q13,d29,d4[1]
+ vmlal.u32 q6,d29,d5[0]
+ vmlal.u32 q7,d29,d5[1]
+ vmlal.u32 q8,d29,d6[0]
+ vmlal.u32 q9,d29,d6[1]
+ vmlal.u32 q10,d29,d7[0]
+ vmlal.u32 q11,d29,d7[1]
+ vst1.64 {q12},[r7,:128]!
+ vmlal.u32 q13,d28,d0[0]
+ vld1.64 {q12},[r6,:128]
+ vmlal.u32 q6,d28,d0[1]
+ vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7]
+ vmlal.u32 q7,d28,d1[0]
+ it ne
+ addne r6,r6,#16 @ don't advance in last iteration
+ vmlal.u32 q8,d28,d1[1]
+ vmlal.u32 q9,d28,d2[0]
+ vmlal.u32 q10,d28,d2[1]
+ vmlal.u32 q11,d28,d3[0]
+ vmlal.u32 q12,d28,d3[1]
+ it eq
+ subeq r1,r1,r5,lsl#2 @ rewind
+ vmlal.u32 q13,d29,d4[0]
+ vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0]
+ vmlal.u32 q6,d29,d4[1]
+ vld1.32 {d0,d1,d2,d3},[r1]!
+ vmlal.u32 q7,d29,d5[0]
+ add r10,sp,#8 @ rewind
+ vmlal.u32 q8,d29,d5[1]
+ vmlal.u32 q9,d29,d6[0]
+ vmlal.u32 q10,d29,d6[1]
+ vmlal.u32 q11,d29,d7[0]
+ vst1.64 {q13},[r7,:128]!
+ vmlal.u32 q12,d29,d7[1]
+
+ bne .LNEON_8n_inner
+ add r6,sp,#128
+ vst1.64 {q6,q7},[r7,:256]!
+ veor q2,q2,q2 @ d4-d5
+ vst1.64 {q8,q9},[r7,:256]!
+ veor q3,q3,q3 @ d6-d7
+ vst1.64 {q10,q11},[r7,:256]!
+ vst1.64 {q12},[r7,:128]
+
+ subs r9,r9,#8
+ vld1.64 {q6,q7},[r6,:256]!
+ vld1.64 {q8,q9},[r6,:256]!
+ vld1.64 {q10,q11},[r6,:256]!
+ vld1.64 {q12,q13},[r6,:256]!
+
+ itt ne
+ subne r3,r3,r5,lsl#2 @ rewind
+ bne .LNEON_8n_outer
+
+ add r7,sp,#128
+ vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame
+ vshr.u64 d10,d12,#16
+ vst1.64 {q2,q3},[sp,:256]!
+ vadd.u64 d13,d13,d10
+ vst1.64 {q2,q3}, [sp,:256]!
+ vshr.u64 d10,d13,#16
+ vst1.64 {q2,q3}, [sp,:256]!
+ vzip.16 d12,d13
+
+ mov r8,r5
+ b .LNEON_tail_entry
+
+.align 4
+.LNEON_tail:
+ vadd.u64 d12,d12,d10
+ vshr.u64 d10,d12,#16
+ vld1.64 {q8,q9}, [r6, :256]!
+ vadd.u64 d13,d13,d10
+ vld1.64 {q10,q11}, [r6, :256]!
+ vshr.u64 d10,d13,#16
+ vld1.64 {q12,q13}, [r6, :256]!
+ vzip.16 d12,d13
+
+.LNEON_tail_entry:
+ vadd.u64 d14,d14,d10
+ vst1.32 {d12[0]}, [r7, :32]!
+ vshr.u64 d10,d14,#16
+ vadd.u64 d15,d15,d10
+ vshr.u64 d10,d15,#16
+ vzip.16 d14,d15
+ vadd.u64 d16,d16,d10
+ vst1.32 {d14[0]}, [r7, :32]!
+ vshr.u64 d10,d16,#16
+ vadd.u64 d17,d17,d10
+ vshr.u64 d10,d17,#16
+ vzip.16 d16,d17
+ vadd.u64 d18,d18,d10
+ vst1.32 {d16[0]}, [r7, :32]!
+ vshr.u64 d10,d18,#16
+ vadd.u64 d19,d19,d10
+ vshr.u64 d10,d19,#16
+ vzip.16 d18,d19
+ vadd.u64 d20,d20,d10
+ vst1.32 {d18[0]}, [r7, :32]!
+ vshr.u64 d10,d20,#16
+ vadd.u64 d21,d21,d10
+ vshr.u64 d10,d21,#16
+ vzip.16 d20,d21
+ vadd.u64 d22,d22,d10
+ vst1.32 {d20[0]}, [r7, :32]!
+ vshr.u64 d10,d22,#16
+ vadd.u64 d23,d23,d10
+ vshr.u64 d10,d23,#16
+ vzip.16 d22,d23
+ vadd.u64 d24,d24,d10
+ vst1.32 {d22[0]}, [r7, :32]!
+ vshr.u64 d10,d24,#16
+ vadd.u64 d25,d25,d10
+ vshr.u64 d10,d25,#16
+ vzip.16 d24,d25
+ vadd.u64 d26,d26,d10
+ vst1.32 {d24[0]}, [r7, :32]!
+ vshr.u64 d10,d26,#16
+ vadd.u64 d27,d27,d10
+ vshr.u64 d10,d27,#16
+ vzip.16 d26,d27
+ vld1.64 {q6,q7}, [r6, :256]!
+ subs r8,r8,#8
+ vst1.32 {d26[0]}, [r7, :32]!
+ bne .LNEON_tail
+
+ vst1.32 {d10[0]}, [r7, :32] @ top-most bit
+ sub r3,r3,r5,lsl#2 @ rewind r3
+ subs r1,sp,#0 @ clear carry flag
+ add r2,sp,r5,lsl#2
+
+.LNEON_sub:
+ ldmia r1!, {r4,r5,r6,r7}
+ ldmia r3!, {r8,r9,r10,r11}
+ sbcs r8, r4,r8
+ sbcs r9, r5,r9
+ sbcs r10,r6,r10
+ sbcs r11,r7,r11
+ teq r1,r2 @ preserves carry
+ stmia r0!, {r8,r9,r10,r11}
+ bne .LNEON_sub
+
+ ldr r10, [r1] @ load top-most bit
+ mov r11,sp
+ veor q0,q0,q0
+ sub r11,r2,r11 @ this is num*4
+ veor q1,q1,q1
+ mov r1,sp
+ sub r0,r0,r11 @ rewind r0
+ mov r3,r2 @ second 3/4th of frame
+ sbcs r10,r10,#0 @ result is carry flag
+
+.LNEON_copy_n_zap:
+ ldmia r1!, {r4,r5,r6,r7}
+ ldmia r0, {r8,r9,r10,r11}
+ it cc
+ movcc r8, r4
+ vst1.64 {q0,q1}, [r3,:256]! @ wipe
+ itt cc
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0,q1}, [r3,:256]! @ wipe
+ it cc
+ movcc r11,r7
+ ldmia r1, {r4,r5,r6,r7}
+ stmia r0!, {r8,r9,r10,r11}
+ sub r1,r1,#16
+ ldmia r0, {r8,r9,r10,r11}
+ it cc
+ movcc r8, r4
+ vst1.64 {q0,q1}, [r1,:256]! @ wipe
+ itt cc
+ movcc r9, r5
+ movcc r10,r6
+ vst1.64 {q0,q1}, [r3,:256]! @ wipe
+ it cc
+ movcc r11,r7
+ teq r1,r2 @ preserves carry
+ stmia r0!, {r8,r9,r10,r11}
+ bne .LNEON_copy_n_zap
+
+ mov sp,ip
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+ bx lr @ bx lr
+.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+#endif
+.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#if __ARM_MAX_ARCH__>=7
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S
@@ -1,0 +1,1529 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License"). You may not use
+@ this file except in compliance with the License. You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+@ of Linaro. Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ Bit-sliced AES for ARM NEON
+@
+@ February 2012.
+@
+@ This implementation is direct adaptation of bsaes-x86_64 module for
+@ ARM NEON. Except that this module is endian-neutral [in sense that
+@ it can be compiled for either endianness] by courtesy of vld1.8's
+@ neutrality. Initial version doesn't implement interface to OpenSSL,
+@ only low-level primitives and unsupported entry points, just enough
+@ to collect performance results, which for Cortex-A8 core are:
+@
+@ encrypt 19.5 cycles per byte processed with 128-bit key
+@ decrypt 22.1 cycles per byte processed with 128-bit key
+@ key conv. 440 cycles per 128-bit key/0.18 of 8x block
+@
+@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+@ which is [much] worse than anticipated (for further details see
+@ http://www.openssl.org/~appro/Snapdragon-S4.html).
+@
+@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+@ manages in 20.0 cycles].
+@
+@ When comparing to x86_64 results keep in mind that NEON unit is
+@ [mostly] single-issue and thus can't [fully] benefit from
+@ instruction-level parallelism. And when comparing to aes-armv4
+@ results keep in mind key schedule conversion overhead (see
+@ bsaes-x86_64.pl for further details)...
+@
+@ <appro@openssl.org>
+
+@ April-August 2013
+@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+# define VFP_ABI_FRAME 0x40
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+# define VFP_ABI_FRAME 0
+# define BSAES_ASM_EXTENDED_KEY
+# define XTS_CHAIN_TWEAK
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+#ifdef __thumb__
+# define adrl adr
+#endif
+
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.text
+.syntax unified @ ARMv7-capable assembler is expected to handle this
+#if defined(__thumb2__) && !defined(__APPLE__)
+.thumb
+#else
+.code 32
+# undef __thumb2__
+#endif
+
+.type _bsaes_decrypt8,%function
+.align 4
+_bsaes_decrypt8:
+ adr r6,.
+ vldmia r4!, {q9} @ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr r6,.LM0ISR
+#else
+ add r6,r6,#.LM0ISR-_bsaes_decrypt8
+#endif
+
+ vldmia r6!, {q8} @ .LM0ISR
+ veor q10, q0, q9 @ xor with round0 key
+ veor q11, q1, q9
+ vtbl.8 d0, {q10}, d16
+ vtbl.8 d1, {q10}, d17
+ veor q12, q2, q9
+ vtbl.8 d2, {q11}, d16
+ vtbl.8 d3, {q11}, d17
+ veor q13, q3, q9
+ vtbl.8 d4, {q12}, d16
+ vtbl.8 d5, {q12}, d17
+ veor q14, q4, q9
+ vtbl.8 d6, {q13}, d16
+ vtbl.8 d7, {q13}, d17
+ veor q15, q5, q9
+ vtbl.8 d8, {q14}, d16
+ vtbl.8 d9, {q14}, d17
+ veor q10, q6, q9
+ vtbl.8 d10, {q15}, d16
+ vtbl.8 d11, {q15}, d17
+ veor q11, q7, q9
+ vtbl.8 d12, {q10}, d16
+ vtbl.8 d13, {q10}, d17
+ vtbl.8 d14, {q11}, d16
+ vtbl.8 d15, {q11}, d17
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q4, #1
+ veor q10, q10, q7
+ veor q11, q11, q5
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #1
+ veor q5, q5, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q3
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q5, #2
+ vshr.u64 q11, q4, #2
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q5, q5, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q3
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q3, #4
+ vshr.u64 q11, q2, #4
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #4
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q5
+ veor q11, q11, q4
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ sub r5,r5,#1
+ b .Ldec_sbox
+.align 4
+.Ldec_loop:
+ vldmia r4!, {q8,q9,q10,q11}
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vtbl.8 d0, {q8}, d24
+ vtbl.8 d1, {q8}, d25
+ vldmia r4!, {q8}
+ veor q10, q10, q2
+ vtbl.8 d2, {q9}, d24
+ vtbl.8 d3, {q9}, d25
+ vldmia r4!, {q9}
+ veor q11, q11, q3
+ vtbl.8 d4, {q10}, d24
+ vtbl.8 d5, {q10}, d25
+ vldmia r4!, {q10}
+ vtbl.8 d6, {q11}, d24
+ vtbl.8 d7, {q11}, d25
+ vldmia r4!, {q11}
+ veor q8, q8, q4
+ veor q9, q9, q5
+ vtbl.8 d8, {q8}, d24
+ vtbl.8 d9, {q8}, d25
+ veor q10, q10, q6
+ vtbl.8 d10, {q9}, d24
+ vtbl.8 d11, {q9}, d25
+ veor q11, q11, q7
+ vtbl.8 d12, {q10}, d24
+ vtbl.8 d13, {q10}, d25
+ vtbl.8 d14, {q11}, d24
+ vtbl.8 d15, {q11}, d25
+.Ldec_sbox:
+ veor q1, q1, q4
+ veor q3, q3, q4
+
+ veor q4, q4, q7
+ veor q1, q1, q6
+ veor q2, q2, q7
+ veor q6, q6, q4
+
+ veor q0, q0, q1
+ veor q2, q2, q5
+ veor q7, q7, q6
+ veor q3, q3, q0
+ veor q5, q5, q0
+ veor q1, q1, q3
+ veor q11, q3, q0
+ veor q10, q7, q4
+ veor q9, q1, q6
+ veor q13, q4, q0
+ vmov q8, q10
+ veor q12, q5, q2
+
+ vorr q10, q10, q9
+ veor q15, q11, q8
+ vand q14, q11, q12
+ vorr q11, q11, q12
+ veor q12, q12, q9
+ vand q8, q8, q9
+ veor q9, q6, q2
+ vand q15, q15, q12
+ vand q13, q13, q9
+ veor q9, q3, q7
+ veor q12, q1, q5
+ veor q11, q11, q13
+ veor q10, q10, q13
+ vand q13, q9, q12
+ vorr q9, q9, q12
+ veor q11, q11, q15
+ veor q8, q8, q13
+ veor q10, q10, q14
+ veor q9, q9, q15
+ veor q8, q8, q14
+ vand q12, q4, q6
+ veor q9, q9, q14
+ vand q13, q0, q2
+ vand q14, q7, q1
+ vorr q15, q3, q5
+ veor q11, q11, q12
+ veor q9, q9, q14
+ veor q8, q8, q15
+ veor q10, q10, q13
+
+ @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
+
+ @ new smaller inversion
+
+ vand q14, q11, q9
+ vmov q12, q8
+
+ veor q13, q10, q14
+ veor q15, q8, q14
+ veor q14, q8, q14 @ q14=q15
+
+ vbsl q13, q9, q8
+ vbsl q15, q11, q10
+ veor q11, q11, q10
+
+ vbsl q12, q13, q14
+ vbsl q8, q14, q13
+
+ vand q14, q12, q15
+ veor q9, q9, q8
+
+ veor q14, q14, q11
+ veor q12, q5, q2
+ veor q8, q1, q6
+ veor q10, q15, q14
+ vand q10, q10, q5
+ veor q5, q5, q1
+ vand q11, q1, q15
+ vand q5, q5, q14
+ veor q1, q11, q10
+ veor q5, q5, q11
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q2
+ veor q12, q12, q8
+ veor q2, q2, q6
+ vand q8, q8, q15
+ vand q6, q6, q13
+ vand q12, q12, q14
+ vand q2, q2, q9
+ veor q8, q8, q12
+ veor q2, q2, q6
+ veor q12, q12, q11
+ veor q6, q6, q10
+ veor q5, q5, q12
+ veor q2, q2, q12
+ veor q1, q1, q8
+ veor q6, q6, q8
+
+ veor q12, q3, q0
+ veor q8, q7, q4
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q0
+ veor q12, q12, q8
+ veor q0, q0, q4
+ vand q8, q8, q15
+ vand q4, q4, q13
+ vand q12, q12, q14
+ vand q0, q0, q9
+ veor q8, q8, q12
+ veor q0, q0, q4
+ veor q12, q12, q11
+ veor q4, q4, q10
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q10, q15, q14
+ vand q10, q10, q3
+ veor q3, q3, q7
+ vand q11, q7, q15
+ vand q3, q3, q14
+ veor q7, q11, q10
+ veor q3, q3, q11
+ veor q3, q3, q12
+ veor q0, q0, q12
+ veor q7, q7, q8
+ veor q4, q4, q8
+ veor q1, q1, q7
+ veor q6, q6, q5
+
+ veor q4, q4, q1
+ veor q2, q2, q7
+ veor q5, q5, q7
+ veor q4, q4, q2
+ veor q7, q7, q0
+ veor q4, q4, q5
+ veor q3, q3, q6
+ veor q6, q6, q1
+ veor q3, q3, q4
+
+ veor q4, q4, q0
+ veor q7, q7, q3
+ subs r5,r5,#1
+ bcc .Ldec_done
+ @ multiplication by 0x05-0x00-0x04-0x00
+ vext.8 q8, q0, q0, #8
+ vext.8 q14, q3, q3, #8
+ vext.8 q15, q5, q5, #8
+ veor q8, q8, q0
+ vext.8 q9, q1, q1, #8
+ veor q14, q14, q3
+ vext.8 q10, q6, q6, #8
+ veor q15, q15, q5
+ vext.8 q11, q4, q4, #8
+ veor q9, q9, q1
+ vext.8 q12, q2, q2, #8
+ veor q10, q10, q6
+ vext.8 q13, q7, q7, #8
+ veor q11, q11, q4
+ veor q12, q12, q2
+ veor q13, q13, q7
+
+ veor q0, q0, q14
+ veor q1, q1, q14
+ veor q6, q6, q8
+ veor q2, q2, q10
+ veor q4, q4, q9
+ veor q1, q1, q15
+ veor q6, q6, q15
+ veor q2, q2, q14
+ veor q7, q7, q11
+ veor q4, q4, q14
+ veor q3, q3, q12
+ veor q2, q2, q15
+ veor q7, q7, q15
+ veor q5, q5, q13
+ vext.8 q8, q0, q0, #12 @ x0 <<< 32
+ vext.8 q9, q1, q1, #12
+ veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
+ vext.8 q10, q6, q6, #12
+ veor q1, q1, q9
+ vext.8 q11, q4, q4, #12
+ veor q6, q6, q10
+ vext.8 q12, q2, q2, #12
+ veor q4, q4, q11
+ vext.8 q13, q7, q7, #12
+ veor q2, q2, q12
+ vext.8 q14, q3, q3, #12
+ veor q7, q7, q13
+ vext.8 q15, q5, q5, #12
+ veor q3, q3, q14
+
+ veor q9, q9, q0
+ veor q5, q5, q15
+ vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
+ veor q10, q10, q1
+ veor q8, q8, q5
+ veor q9, q9, q5
+ vext.8 q1, q1, q1, #8
+ veor q13, q13, q2
+ veor q0, q0, q8
+ veor q14, q14, q7
+ veor q1, q1, q9
+ vext.8 q8, q2, q2, #8
+ veor q12, q12, q4
+ vext.8 q9, q7, q7, #8
+ veor q15, q15, q3
+ vext.8 q2, q4, q4, #8
+ veor q11, q11, q6
+ vext.8 q7, q5, q5, #8
+ veor q12, q12, q5
+ vext.8 q4, q3, q3, #8
+ veor q11, q11, q5
+ vext.8 q3, q6, q6, #8
+ veor q5, q9, q13
+ veor q11, q11, q2
+ veor q7, q7, q15
+ veor q6, q4, q14
+ veor q4, q8, q12
+ veor q2, q3, q10
+ vmov q3, q11
+ @ vmov q5, q9
+ vldmia r6, {q12} @ .LISR
+ ite eq @ Thumb2 thing, sanity check in ARM
+ addeq r6,r6,#0x10
+ bne .Ldec_loop
+ vldmia r6, {q12} @ .LISRM0
+ b .Ldec_loop
+.align 4
+.Ldec_done:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q3, #1
+ vshr.u64 q11, q2, #1
+ veor q10, q10, q5
+ veor q11, q11, q7
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #1
+ veor q7, q7, q11
+ vshl.u64 q11, q11, #1
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q4
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q4, q4, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q7, #2
+ vshr.u64 q11, q2, #2
+ veor q10, q10, q5
+ veor q11, q11, q3
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #2
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #2
+ veor q7, q7, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q4
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q4, q4, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q4, #4
+ vshr.u64 q11, q6, #4
+ veor q10, q10, q5
+ veor q11, q11, q3
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #4
+ veor q4, q4, q10
+ veor q6, q6, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q7
+ veor q11, q11, q2
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vldmia r4, {q8} @ last round key
+ veor q6, q6, q8
+ veor q4, q4, q8
+ veor q2, q2, q8
+ veor q7, q7, q8
+ veor q3, q3, q8
+ veor q5, q5, q8
+ veor q0, q0, q8
+ veor q1, q1, q8
+ bx lr
+.size _bsaes_decrypt8,.-_bsaes_decrypt8
+
+.type _bsaes_const,%object
+.align 6
+_bsaes_const:
+.LM0ISR:@ InvShiftRows constants
+.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISR:
+.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
+.LISRM0:
+.quad 0x01040b0e0205080f, 0x0306090c00070a0d
+.LM0SR:@ ShiftRows constants
+.quad 0x0a0e02060f03070b, 0x0004080c05090d01
+.LSR:
+.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+.quad 0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:
+.quad 0x02060a0e03070b0f, 0x0004080c0105090d
+.LREVM0SR:
+.quad 0x090d01050c000408, 0x03070b0f060a0e02
+.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 6
+.size _bsaes_const,.-_bsaes_const
+
+.type _bsaes_encrypt8,%function
+.align 4
+_bsaes_encrypt8:
+ adr r6,.
+ vldmia r4!, {q9} @ round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr r6,.LM0SR
+#else
+ sub r6,r6,#_bsaes_encrypt8-.LM0SR
+#endif
+
+ vldmia r6!, {q8} @ .LM0SR
+_bsaes_encrypt8_alt:
+ veor q10, q0, q9 @ xor with round0 key
+ veor q11, q1, q9
+ vtbl.8 d0, {q10}, d16
+ vtbl.8 d1, {q10}, d17
+ veor q12, q2, q9
+ vtbl.8 d2, {q11}, d16
+ vtbl.8 d3, {q11}, d17
+ veor q13, q3, q9
+ vtbl.8 d4, {q12}, d16
+ vtbl.8 d5, {q12}, d17
+ veor q14, q4, q9
+ vtbl.8 d6, {q13}, d16
+ vtbl.8 d7, {q13}, d17
+ veor q15, q5, q9
+ vtbl.8 d8, {q14}, d16
+ vtbl.8 d9, {q14}, d17
+ veor q10, q6, q9
+ vtbl.8 d10, {q15}, d16
+ vtbl.8 d11, {q15}, d17
+ veor q11, q7, q9
+ vtbl.8 d12, {q10}, d16
+ vtbl.8 d13, {q10}, d17
+ vtbl.8 d14, {q11}, d16
+ vtbl.8 d15, {q11}, d17
+_bsaes_encrypt8_bitslice:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q6, #1
+ vshr.u64 q11, q4, #1
+ veor q10, q10, q7
+ veor q11, q11, q5
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #1
+ veor q5, q5, q11
+ vshl.u64 q11, q11, #1
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q3
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q5, #2
+ vshr.u64 q11, q4, #2
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #2
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #2
+ veor q5, q5, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q3
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q3, q3, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q3, #4
+ vshr.u64 q11, q2, #4
+ veor q10, q10, q7
+ veor q11, q11, q6
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q6, q6, q11
+ vshl.u64 q11, q11, #4
+ veor q3, q3, q10
+ veor q2, q2, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q5
+ veor q11, q11, q4
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ sub r5,r5,#1
+ b .Lenc_sbox
+.align 4
+.Lenc_loop:
+ vldmia r4!, {q8,q9,q10,q11}
+ veor q8, q8, q0
+ veor q9, q9, q1
+ vtbl.8 d0, {q8}, d24
+ vtbl.8 d1, {q8}, d25
+ vldmia r4!, {q8}
+ veor q10, q10, q2
+ vtbl.8 d2, {q9}, d24
+ vtbl.8 d3, {q9}, d25
+ vldmia r4!, {q9}
+ veor q11, q11, q3
+ vtbl.8 d4, {q10}, d24
+ vtbl.8 d5, {q10}, d25
+ vldmia r4!, {q10}
+ vtbl.8 d6, {q11}, d24
+ vtbl.8 d7, {q11}, d25
+ vldmia r4!, {q11}
+ veor q8, q8, q4
+ veor q9, q9, q5
+ vtbl.8 d8, {q8}, d24
+ vtbl.8 d9, {q8}, d25
+ veor q10, q10, q6
+ vtbl.8 d10, {q9}, d24
+ vtbl.8 d11, {q9}, d25
+ veor q11, q11, q7
+ vtbl.8 d12, {q10}, d24
+ vtbl.8 d13, {q10}, d25
+ vtbl.8 d14, {q11}, d24
+ vtbl.8 d15, {q11}, d25
+.Lenc_sbox:
+ veor q2, q2, q1
+ veor q5, q5, q6
+ veor q3, q3, q0
+ veor q6, q6, q2
+ veor q5, q5, q0
+
+ veor q6, q6, q3
+ veor q3, q3, q7
+ veor q7, q7, q5
+ veor q3, q3, q4
+ veor q4, q4, q5
+
+ veor q2, q2, q7
+ veor q3, q3, q1
+ veor q1, q1, q5
+ veor q11, q7, q4
+ veor q10, q1, q2
+ veor q9, q5, q3
+ veor q13, q2, q4
+ vmov q8, q10
+ veor q12, q6, q0
+
+ vorr q10, q10, q9
+ veor q15, q11, q8
+ vand q14, q11, q12
+ vorr q11, q11, q12
+ veor q12, q12, q9
+ vand q8, q8, q9
+ veor q9, q3, q0
+ vand q15, q15, q12
+ vand q13, q13, q9
+ veor q9, q7, q1
+ veor q12, q5, q6
+ veor q11, q11, q13
+ veor q10, q10, q13
+ vand q13, q9, q12
+ vorr q9, q9, q12
+ veor q11, q11, q15
+ veor q8, q8, q13
+ veor q10, q10, q14
+ veor q9, q9, q15
+ veor q8, q8, q14
+ vand q12, q2, q3
+ veor q9, q9, q14
+ vand q13, q4, q0
+ vand q14, q1, q5
+ vorr q15, q7, q6
+ veor q11, q11, q12
+ veor q9, q9, q14
+ veor q8, q8, q15
+ veor q10, q10, q13
+
+ @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3
+
+ @ new smaller inversion
+
+ vand q14, q11, q9
+ vmov q12, q8
+
+ veor q13, q10, q14
+ veor q15, q8, q14
+ veor q14, q8, q14 @ q14=q15
+
+ vbsl q13, q9, q8
+ vbsl q15, q11, q10
+ veor q11, q11, q10
+
+ vbsl q12, q13, q14
+ vbsl q8, q14, q13
+
+ vand q14, q12, q15
+ veor q9, q9, q8
+
+ veor q14, q14, q11
+ veor q12, q6, q0
+ veor q8, q5, q3
+ veor q10, q15, q14
+ vand q10, q10, q6
+ veor q6, q6, q5
+ vand q11, q5, q15
+ vand q6, q6, q14
+ veor q5, q11, q10
+ veor q6, q6, q11
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q0
+ veor q12, q12, q8
+ veor q0, q0, q3
+ vand q8, q8, q15
+ vand q3, q3, q13
+ vand q12, q12, q14
+ vand q0, q0, q9
+ veor q8, q8, q12
+ veor q0, q0, q3
+ veor q12, q12, q11
+ veor q3, q3, q10
+ veor q6, q6, q12
+ veor q0, q0, q12
+ veor q5, q5, q8
+ veor q3, q3, q8
+
+ veor q12, q7, q4
+ veor q8, q1, q2
+ veor q11, q15, q14
+ veor q10, q13, q9
+ vand q11, q11, q12
+ vand q10, q10, q4
+ veor q12, q12, q8
+ veor q4, q4, q2
+ vand q8, q8, q15
+ vand q2, q2, q13
+ vand q12, q12, q14
+ vand q4, q4, q9
+ veor q8, q8, q12
+ veor q4, q4, q2
+ veor q12, q12, q11
+ veor q2, q2, q10
+ veor q15, q15, q13
+ veor q14, q14, q9
+ veor q10, q15, q14
+ vand q10, q10, q7
+ veor q7, q7, q1
+ vand q11, q1, q15
+ vand q7, q7, q14
+ veor q1, q11, q10
+ veor q7, q7, q11
+ veor q7, q7, q12
+ veor q4, q4, q12
+ veor q1, q1, q8
+ veor q2, q2, q8
+ veor q7, q7, q0
+ veor q1, q1, q6
+ veor q6, q6, q0
+ veor q4, q4, q7
+ veor q0, q0, q1
+
+ veor q1, q1, q5
+ veor q5, q5, q2
+ veor q2, q2, q3
+ veor q3, q3, q5
+ veor q4, q4, q5
+
+ veor q6, q6, q3
+ subs r5,r5,#1
+ bcc .Lenc_done
+ vext.8 q8, q0, q0, #12 @ x0 <<< 32
+ vext.8 q9, q1, q1, #12
+ veor q0, q0, q8 @ x0 ^ (x0 <<< 32)
+ vext.8 q10, q4, q4, #12
+ veor q1, q1, q9
+ vext.8 q11, q6, q6, #12
+ veor q4, q4, q10
+ vext.8 q12, q3, q3, #12
+ veor q6, q6, q11
+ vext.8 q13, q7, q7, #12
+ veor q3, q3, q12
+ vext.8 q14, q2, q2, #12
+ veor q7, q7, q13
+ vext.8 q15, q5, q5, #12
+ veor q2, q2, q14
+
+ veor q9, q9, q0
+ veor q5, q5, q15
+ vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64
+ veor q10, q10, q1
+ veor q8, q8, q5
+ veor q9, q9, q5
+ vext.8 q1, q1, q1, #8
+ veor q13, q13, q3
+ veor q0, q0, q8
+ veor q14, q14, q7
+ veor q1, q1, q9
+ vext.8 q8, q3, q3, #8
+ veor q12, q12, q6
+ vext.8 q9, q7, q7, #8
+ veor q15, q15, q2
+ vext.8 q3, q6, q6, #8
+ veor q11, q11, q4
+ vext.8 q7, q5, q5, #8
+ veor q12, q12, q5
+ vext.8 q6, q2, q2, #8
+ veor q11, q11, q5
+ vext.8 q2, q4, q4, #8
+ veor q5, q9, q13
+ veor q4, q8, q12
+ veor q3, q3, q11
+ veor q7, q7, q15
+ veor q6, q6, q14
+ @ vmov q4, q8
+ veor q2, q2, q10
+ @ vmov q5, q9
+ vldmia r6, {q12} @ .LSR
+ ite eq @ Thumb2 thing, sanity check in ARM
+ addeq r6,r6,#0x10
+ bne .Lenc_loop
+ vldmia r6, {q12} @ .LSRM0
+ b .Lenc_loop
+.align 4
+.Lenc_done:
+ vmov.i8 q8,#0x55 @ compose .LBS0
+ vmov.i8 q9,#0x33 @ compose .LBS1
+ vshr.u64 q10, q2, #1
+ vshr.u64 q11, q3, #1
+ veor q10, q10, q5
+ veor q11, q11, q7
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #1
+ veor q7, q7, q11
+ vshl.u64 q11, q11, #1
+ veor q2, q2, q10
+ veor q3, q3, q11
+ vshr.u64 q10, q4, #1
+ vshr.u64 q11, q0, #1
+ veor q10, q10, q6
+ veor q11, q11, q1
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q6, q6, q10
+ vshl.u64 q10, q10, #1
+ veor q1, q1, q11
+ vshl.u64 q11, q11, #1
+ veor q4, q4, q10
+ veor q0, q0, q11
+ vmov.i8 q8,#0x0f @ compose .LBS2
+ vshr.u64 q10, q7, #2
+ vshr.u64 q11, q3, #2
+ veor q10, q10, q5
+ veor q11, q11, q2
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #2
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #2
+ veor q7, q7, q10
+ veor q3, q3, q11
+ vshr.u64 q10, q1, #2
+ vshr.u64 q11, q0, #2
+ veor q10, q10, q6
+ veor q11, q11, q4
+ vand q10, q10, q9
+ vand q11, q11, q9
+ veor q6, q6, q10
+ vshl.u64 q10, q10, #2
+ veor q4, q4, q11
+ vshl.u64 q11, q11, #2
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vshr.u64 q10, q6, #4
+ vshr.u64 q11, q4, #4
+ veor q10, q10, q5
+ veor q11, q11, q2
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q5, q5, q10
+ vshl.u64 q10, q10, #4
+ veor q2, q2, q11
+ vshl.u64 q11, q11, #4
+ veor q6, q6, q10
+ veor q4, q4, q11
+ vshr.u64 q10, q1, #4
+ vshr.u64 q11, q0, #4
+ veor q10, q10, q7
+ veor q11, q11, q3
+ vand q10, q10, q8
+ vand q11, q11, q8
+ veor q7, q7, q10
+ vshl.u64 q10, q10, #4
+ veor q3, q3, q11
+ vshl.u64 q11, q11, #4
+ veor q1, q1, q10
+ veor q0, q0, q11
+ vldmia r4, {q8} @ last round key
+ veor q4, q4, q8
+ veor q6, q6, q8
+ veor q3, q3, q8
+ veor q7, q7, q8
+ veor q2, q2, q8
+ veor q5, q5, q8
+ veor q0, q0, q8
+ veor q1, q1, q8
+ bx lr
+.size _bsaes_encrypt8,.-_bsaes_encrypt8
+.type _bsaes_key_convert,%function
+.align 4
+_bsaes_key_convert:
+ adr r6,.
+ vld1.8 {q7}, [r4]! @ load round 0 key
+#if defined(__thumb2__) || defined(__APPLE__)
+ adr r6,.LM0
+#else
+ sub r6,r6,#_bsaes_key_convert-.LM0
+#endif
+ vld1.8 {q15}, [r4]! @ load round 1 key
+
+ vmov.i8 q8, #0x01 @ bit masks
+ vmov.i8 q9, #0x02
+ vmov.i8 q10, #0x04
+ vmov.i8 q11, #0x08
+ vmov.i8 q12, #0x10
+ vmov.i8 q13, #0x20
+ vldmia r6, {q14} @ .LM0
+
+#ifdef __ARMEL__
+ vrev32.8 q7, q7
+ vrev32.8 q15, q15
+#endif
+ sub r5,r5,#1
+ vstmia r12!, {q7} @ save round 0 key
+ b .Lkey_loop
+
+.align 4
+.Lkey_loop:
+ vtbl.8 d14,{q15},d28
+ vtbl.8 d15,{q15},d29
+ vmov.i8 q6, #0x40
+ vmov.i8 q15, #0x80
+
+ vtst.8 q0, q7, q8
+ vtst.8 q1, q7, q9
+ vtst.8 q2, q7, q10
+ vtst.8 q3, q7, q11
+ vtst.8 q4, q7, q12
+ vtst.8 q5, q7, q13
+ vtst.8 q6, q7, q6
+ vtst.8 q7, q7, q15
+ vld1.8 {q15}, [r4]! @ load next round key
+ vmvn q0, q0 @ "pnot"
+ vmvn q1, q1
+ vmvn q5, q5
+ vmvn q6, q6
+#ifdef __ARMEL__
+ vrev32.8 q15, q15
+#endif
+ subs r5,r5,#1
+ vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key
+ bne .Lkey_loop
+
+ vmov.i8 q7,#0x63 @ compose .L63
+ @ don't save last round key
+ bx lr
+.size _bsaes_key_convert,.-_bsaes_key_convert
+.globl bsaes_cbc_encrypt
+.hidden bsaes_cbc_encrypt
+.type bsaes_cbc_encrypt,%function
+.align 5
+bsaes_cbc_encrypt:
+ @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for
+ @ short inputs. We patch this out, using bsaes for all input sizes.
+
+ @ it is up to the caller to make sure we are called with enc == 0
+
+ mov ip, sp
+ stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+ VFP_ABI_PUSH
+ ldr r8, [ip] @ IV is 1st arg on the stack
+ mov r2, r2, lsr#4 @ len in 16 byte blocks
+ sub sp, #0x10 @ scratch space to carry over the IV
+ mov r9, sp @ save sp
+
+ ldr r10, [r3, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
+ add r12, #96 @ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ mov sp, r12 @ sp is sp
+ bl _bsaes_key_convert
+ vldmia sp, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia sp, {q7}
+#else
+ ldr r12, [r3, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [r3, #244]
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ add r12, r3, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ add r4, r3, #248
+ vldmia r4, {q6}
+ vstmia r12, {q15} @ save last round key
+ veor q7, q7, q6 @ fix up round 0 key
+ vstmia r4, {q7}
+
+.align 2
+
+#endif
+
+ vld1.8 {q15}, [r8] @ load IV
+ b .Lcbc_dec_loop
+
+.align 4
+.Lcbc_dec_loop:
+ subs r2, r2, #0x8
+ bmi .Lcbc_dec_loop_finish
+
+ vld1.8 {q0,q1}, [r0]! @ load input
+ vld1.8 {q2,q3}, [r0]!
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, sp @ pass the key
+#else
+ add r4, r3, #248
+#endif
+ vld1.8 {q4,q5}, [r0]!
+ mov r5, r10
+ vld1.8 {q6,q7}, [r0]
+ sub r0, r0, #0x60
+ vstmia r9, {q15} @ put aside IV
+
+ bl _bsaes_decrypt8
+
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10,q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12,q13}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q14,q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0,q1}, [r1]! @ write output
+ veor q3, q3, q13
+ vst1.8 {q6}, [r1]!
+ veor q5, q5, q14
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ vst1.8 {q3}, [r1]!
+ vst1.8 {q5}, [r1]!
+
+ b .Lcbc_dec_loop
+
+.Lcbc_dec_loop_finish:
+ adds r2, r2, #8
+ beq .Lcbc_dec_done
+
+ @ Set up most parameters for the _bsaes_decrypt8 call.
+#ifndef BSAES_ASM_EXTENDED_KEY
+ mov r4, sp @ pass the key
+#else
+ add r4, r3, #248
+#endif
+ mov r5, r10
+ vstmia r9, {q15} @ put aside IV
+
+ vld1.8 {q0}, [r0]! @ load input
+ cmp r2, #2
+ blo .Lcbc_dec_one
+ vld1.8 {q1}, [r0]!
+ beq .Lcbc_dec_two
+ vld1.8 {q2}, [r0]!
+ cmp r2, #4
+ blo .Lcbc_dec_three
+ vld1.8 {q3}, [r0]!
+ beq .Lcbc_dec_four
+ vld1.8 {q4}, [r0]!
+ cmp r2, #6
+ blo .Lcbc_dec_five
+ vld1.8 {q5}, [r0]!
+ beq .Lcbc_dec_six
+ vld1.8 {q6}, [r0]!
+ sub r0, r0, #0x70
+
+ bl _bsaes_decrypt8
+
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10,q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12,q13}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0,q1}, [r1]! @ write output
+ veor q3, q3, q13
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ vst1.8 {q3}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_six:
+ sub r0, r0, #0x60
+ bl _bsaes_decrypt8
+ vldmia r9,{q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10,q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q12}, [r0]!
+ veor q4, q4, q10
+ veor q2, q2, q11
+ vld1.8 {q15}, [r0]!
+ veor q7, q7, q12
+ vst1.8 {q0,q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ vst1.8 {q7}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_five:
+ sub r0, r0, #0x50
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10,q11}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q15}, [r0]!
+ veor q4, q4, q10
+ vst1.8 {q0,q1}, [r1]! @ write output
+ veor q2, q2, q11
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ vst1.8 {q2}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_four:
+ sub r0, r0, #0x40
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q10}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vld1.8 {q15}, [r0]!
+ veor q4, q4, q10
+ vst1.8 {q0,q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ vst1.8 {q4}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_three:
+ sub r0, r0, #0x30
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8,q9}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q15}, [r0]!
+ veor q1, q1, q8
+ veor q6, q6, q9
+ vst1.8 {q0,q1}, [r1]! @ write output
+ vst1.8 {q6}, [r1]!
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_two:
+ sub r0, r0, #0x20
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q8}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vld1.8 {q15}, [r0]! @ reload input
+ veor q1, q1, q8
+ vst1.8 {q0,q1}, [r1]! @ write output
+ b .Lcbc_dec_done
+.align 4
+.Lcbc_dec_one:
+ sub r0, r0, #0x10
+ bl _bsaes_decrypt8
+ vldmia r9, {q14} @ reload IV
+ vld1.8 {q15}, [r0]! @ reload input
+ veor q0, q0, q14 @ ^= IV
+ vst1.8 {q0}, [r1]! @ write output
+
+.Lcbc_dec_done:
+#ifndef BSAES_ASM_EXTENDED_KEY
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+.Lcbc_dec_bzero:@ wipe key schedule [if any]
+ vstmia sp!, {q0,q1}
+ cmp sp, r9
+ bne .Lcbc_dec_bzero
+#endif
+
+ mov sp, r9
+ add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
+ vst1.8 {q15}, [r8] @ return IV
+ VFP_ABI_POP
+ ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
+.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+.globl bsaes_ctr32_encrypt_blocks
+.hidden bsaes_ctr32_encrypt_blocks
+.type bsaes_ctr32_encrypt_blocks,%function
+.align 5
+bsaes_ctr32_encrypt_blocks:
+ @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this
+ @ out to retain a constant-time implementation.
+ mov ip, sp
+ stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
+ VFP_ABI_PUSH
+ ldr r8, [ip] @ ctr is 1st arg on the stack
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
+ mov r9, sp @ save sp
+
+ ldr r10, [r3, #240] @ get # of rounds
+#ifndef BSAES_ASM_EXTENDED_KEY
+ @ allocate the key schedule on the stack
+ sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key
+ add r12, #96 @ size of bit-sliced key schedule
+
+ @ populate the key schedule
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ mov sp, r12 @ sp is sp
+ bl _bsaes_key_convert
+ veor q7,q7,q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+
+ vld1.8 {q0}, [r8] @ load counter
+#ifdef __APPLE__
+ mov r8, #:lower16:(.LREVM0SR-.LM0)
+ add r8, r6, r8
+#else
+ add r8, r6, #.LREVM0SR-.LM0 @ borrow r8
+#endif
+ vldmia sp, {q4} @ load round0 key
+#else
+ ldr r12, [r3, #244]
+ eors r12, #1
+ beq 0f
+
+ @ populate the key schedule
+ str r12, [r3, #244]
+ mov r4, r3 @ pass key
+ mov r5, r10 @ pass # of rounds
+ add r12, r3, #248 @ pass key schedule
+ bl _bsaes_key_convert
+ veor q7,q7,q15 @ fix up last round key
+ vstmia r12, {q7} @ save last round key
+
+.align 2
+ add r12, r3, #248
+ vld1.8 {q0}, [r8] @ load counter
+ adrl r8, .LREVM0SR @ borrow r8
+ vldmia r12, {q4} @ load round0 key
+ sub sp, #0x10 @ place for adjusted round0 key
+#endif
+
+ vmov.i32 q8,#1 @ compose 1<<96
+ veor q9,q9,q9
+ vrev32.8 q0,q0
+ vext.8 q8,q9,q8,#4
+ vrev32.8 q4,q4
+ vadd.u32 q9,q8,q8 @ compose 2<<96
+ vstmia sp, {q4} @ save adjusted round0 key
+ b .Lctr_enc_loop
+
+.align 4
+.Lctr_enc_loop:
+ vadd.u32 q10, q8, q9 @ compose 3<<96
+ vadd.u32 q1, q0, q8 @ +1
+ vadd.u32 q2, q0, q9 @ +2
+ vadd.u32 q3, q0, q10 @ +3
+ vadd.u32 q4, q1, q10
+ vadd.u32 q5, q2, q10
+ vadd.u32 q6, q3, q10
+ vadd.u32 q7, q4, q10
+ vadd.u32 q10, q5, q10 @ next counter
+
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
+ @ to flip byte order in 32-bit counter
+
+ vldmia sp, {q9} @ load round0 key
+#ifndef BSAES_ASM_EXTENDED_KEY
+ add r4, sp, #0x10 @ pass next round key
+#else
+ add r4, r3, #264
+#endif
+ vldmia r8, {q8} @ .LREVM0SR
+ mov r5, r10 @ pass rounds
+ vstmia r9, {q10} @ save next counter
+#ifdef __APPLE__
+ mov r6, #:lower16:(.LREVM0SR-.LSR)
+ sub r6, r8, r6
+#else
+ sub r6, r8, #.LREVM0SR-.LSR @ pass constants
+#endif
+
+ bl _bsaes_encrypt8_alt
+
+ subs r2, r2, #8
+ blo .Lctr_enc_loop_done
+
+ vld1.8 {q8,q9}, [r0]! @ load input
+ vld1.8 {q10,q11}, [r0]!
+ veor q0, q8
+ veor q1, q9
+ vld1.8 {q12,q13}, [r0]!
+ veor q4, q10
+ veor q6, q11
+ vld1.8 {q14,q15}, [r0]!
+ veor q3, q12
+ vst1.8 {q0,q1}, [r1]! @ write output
+ veor q7, q13
+ veor q2, q14
+ vst1.8 {q4}, [r1]!
+ veor q5, q15
+ vst1.8 {q6}, [r1]!
+ vmov.i32 q8, #1 @ compose 1<<96
+ vst1.8 {q3}, [r1]!
+ veor q9, q9, q9
+ vst1.8 {q7}, [r1]!
+ vext.8 q8, q9, q8, #4
+ vst1.8 {q2}, [r1]!
+ vadd.u32 q9,q8,q8 @ compose 2<<96
+ vst1.8 {q5}, [r1]!
+ vldmia r9, {q0} @ load counter
+
+ bne .Lctr_enc_loop
+ b .Lctr_enc_done
+
+.align 4
+.Lctr_enc_loop_done:
+ add r2, r2, #8
+ vld1.8 {q8}, [r0]! @ load input
+ veor q0, q8
+ vst1.8 {q0}, [r1]! @ write output
+ cmp r2, #2
+ blo .Lctr_enc_done
+ vld1.8 {q9}, [r0]!
+ veor q1, q9
+ vst1.8 {q1}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q10}, [r0]!
+ veor q4, q10
+ vst1.8 {q4}, [r1]!
+ cmp r2, #4
+ blo .Lctr_enc_done
+ vld1.8 {q11}, [r0]!
+ veor q6, q11
+ vst1.8 {q6}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q12}, [r0]!
+ veor q3, q12
+ vst1.8 {q3}, [r1]!
+ cmp r2, #6
+ blo .Lctr_enc_done
+ vld1.8 {q13}, [r0]!
+ veor q7, q13
+ vst1.8 {q7}, [r1]!
+ beq .Lctr_enc_done
+ vld1.8 {q14}, [r0]
+ veor q2, q14
+ vst1.8 {q2}, [r1]!
+
+.Lctr_enc_done:
+ vmov.i32 q0, #0
+ vmov.i32 q1, #0
+#ifndef BSAES_ASM_EXTENDED_KEY
+.Lctr_enc_bzero:@ wipe key schedule [if any]
+ vstmia sp!, {q0,q1}
+ cmp sp, r9
+ bne .Lctr_enc_bzero
+#else
+ vstmia sp, {q0,q1}
+#endif
+
+ mov sp, r9
+ add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb
+ VFP_ABI_POP
+ ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return
+
+ @ OpenSSL contains aes_nohw_* fallback code here. We patch this
+ @ out to retain a constant-time implementation.
+.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+#endif
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S
@@ -1,0 +1,255 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL
+@ instructions are in ghashv8-armx.pl.)
+.arch armv7-a
+
+.text
+#if defined(__thumb2__) || defined(__clang__)
+.syntax unified
+#define ldrplb ldrbpl
+#define ldrneb ldrbne
+#endif
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.globl gcm_init_neon
+.hidden gcm_init_neon
+.type gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+ vld1.64 d7,[r1]! @ load H
+ vmov.i8 q8,#0xe1
+ vld1.64 d6,[r1]
+ vshl.i64 d17,#57
+ vshr.u64 d16,#63 @ t0=0xc2....01
+ vdup.8 q9,d7[7]
+ vshr.u64 d26,d6,#63
+ vshr.s8 q9,#7 @ broadcast carry bit
+ vshl.i64 q3,q3,#1
+ vand q8,q8,q9
+ vorr d7,d26 @ H<<<=1
+ veor q3,q3,q8 @ twisted H
+ vstmia r0,{q3}
+
+ bx lr @ bx lr
+.size gcm_init_neon,.-gcm_init_neon
+
+.globl gcm_gmult_neon
+.hidden gcm_gmult_neon
+.type gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+ vld1.64 d7,[r0]! @ load Xi
+ vld1.64 d6,[r0]!
+ vmov.i64 d29,#0x0000ffffffffffff
+ vldmia r1,{d26,d27} @ load twisted H
+ vmov.i64 d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+ vrev64.8 q3,q3
+#endif
+ vmov.i64 d31,#0x000000000000ffff
+ veor d28,d26,d27 @ Karatsuba pre-processing
+ mov r3,#16
+ b .Lgmult_neon
+.size gcm_gmult_neon,.-gcm_gmult_neon
+
+.globl gcm_ghash_neon
+.hidden gcm_ghash_neon
+.type gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+ vld1.64 d1,[r0]! @ load Xi
+ vld1.64 d0,[r0]!
+ vmov.i64 d29,#0x0000ffffffffffff
+ vldmia r1,{d26,d27} @ load twisted H
+ vmov.i64 d30,#0x00000000ffffffff
+#ifdef __ARMEL__
+ vrev64.8 q0,q0
+#endif
+ vmov.i64 d31,#0x000000000000ffff
+ veor d28,d26,d27 @ Karatsuba pre-processing
+
+.Loop_neon:
+ vld1.64 d7,[r2]! @ load inp
+ vld1.64 d6,[r2]!
+#ifdef __ARMEL__
+ vrev64.8 q3,q3
+#endif
+ veor q3,q0 @ inp^=Xi
+.Lgmult_neon:
+ vext.8 d16, d26, d26, #1 @ A1
+ vmull.p8 q8, d16, d6 @ F = A1*B
+ vext.8 d0, d6, d6, #1 @ B1
+ vmull.p8 q0, d26, d0 @ E = A*B1
+ vext.8 d18, d26, d26, #2 @ A2
+ vmull.p8 q9, d18, d6 @ H = A2*B
+ vext.8 d22, d6, d6, #2 @ B2
+ vmull.p8 q11, d26, d22 @ G = A*B2
+ vext.8 d20, d26, d26, #3 @ A3
+ veor q8, q8, q0 @ L = E + F
+ vmull.p8 q10, d20, d6 @ J = A3*B
+ vext.8 d0, d6, d6, #3 @ B3
+ veor q9, q9, q11 @ M = G + H
+ vmull.p8 q0, d26, d0 @ I = A*B3
+ veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
+ vand d17, d17, d29
+ vext.8 d22, d6, d6, #4 @ B4
+ veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
+ vand d19, d19, d30
+ vmull.p8 q11, d26, d22 @ K = A*B4
+ veor q10, q10, q0 @ N = I + J
+ veor d16, d16, d17
+ veor d18, d18, d19
+ veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
+ vand d21, d21, d31
+ vext.8 q8, q8, q8, #15
+ veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 d23, #0
+ vext.8 q9, q9, q9, #14
+ veor d20, d20, d21
+ vmull.p8 q0, d26, d6 @ D = A*B
+ vext.8 q11, q11, q11, #12
+ vext.8 q10, q10, q10, #13
+ veor q8, q8, q9
+ veor q10, q10, q11
+ veor q0, q0, q8
+ veor q0, q0, q10
+ veor d6,d6,d7 @ Karatsuba pre-processing
+ vext.8 d16, d28, d28, #1 @ A1
+ vmull.p8 q8, d16, d6 @ F = A1*B
+ vext.8 d2, d6, d6, #1 @ B1
+ vmull.p8 q1, d28, d2 @ E = A*B1
+ vext.8 d18, d28, d28, #2 @ A2
+ vmull.p8 q9, d18, d6 @ H = A2*B
+ vext.8 d22, d6, d6, #2 @ B2
+ vmull.p8 q11, d28, d22 @ G = A*B2
+ vext.8 d20, d28, d28, #3 @ A3
+ veor q8, q8, q1 @ L = E + F
+ vmull.p8 q10, d20, d6 @ J = A3*B
+ vext.8 d2, d6, d6, #3 @ B3
+ veor q9, q9, q11 @ M = G + H
+ vmull.p8 q1, d28, d2 @ I = A*B3
+ veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
+ vand d17, d17, d29
+ vext.8 d22, d6, d6, #4 @ B4
+ veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
+ vand d19, d19, d30
+ vmull.p8 q11, d28, d22 @ K = A*B4
+ veor q10, q10, q1 @ N = I + J
+ veor d16, d16, d17
+ veor d18, d18, d19
+ veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
+ vand d21, d21, d31
+ vext.8 q8, q8, q8, #15
+ veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 d23, #0
+ vext.8 q9, q9, q9, #14
+ veor d20, d20, d21
+ vmull.p8 q1, d28, d6 @ D = A*B
+ vext.8 q11, q11, q11, #12
+ vext.8 q10, q10, q10, #13
+ veor q8, q8, q9
+ veor q10, q10, q11
+ veor q1, q1, q8
+ veor q1, q1, q10
+ vext.8 d16, d27, d27, #1 @ A1
+ vmull.p8 q8, d16, d7 @ F = A1*B
+ vext.8 d4, d7, d7, #1 @ B1
+ vmull.p8 q2, d27, d4 @ E = A*B1
+ vext.8 d18, d27, d27, #2 @ A2
+ vmull.p8 q9, d18, d7 @ H = A2*B
+ vext.8 d22, d7, d7, #2 @ B2
+ vmull.p8 q11, d27, d22 @ G = A*B2
+ vext.8 d20, d27, d27, #3 @ A3
+ veor q8, q8, q2 @ L = E + F
+ vmull.p8 q10, d20, d7 @ J = A3*B
+ vext.8 d4, d7, d7, #3 @ B3
+ veor q9, q9, q11 @ M = G + H
+ vmull.p8 q2, d27, d4 @ I = A*B3
+ veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
+ vand d17, d17, d29
+ vext.8 d22, d7, d7, #4 @ B4
+ veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
+ vand d19, d19, d30
+ vmull.p8 q11, d27, d22 @ K = A*B4
+ veor q10, q10, q2 @ N = I + J
+ veor d16, d16, d17
+ veor d18, d18, d19
+ veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
+ vand d21, d21, d31
+ vext.8 q8, q8, q8, #15
+ veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
+ vmov.i64 d23, #0
+ vext.8 q9, q9, q9, #14
+ veor d20, d20, d21
+ vmull.p8 q2, d27, d7 @ D = A*B
+ vext.8 q11, q11, q11, #12
+ vext.8 q10, q10, q10, #13
+ veor q8, q8, q9
+ veor q10, q10, q11
+ veor q2, q2, q8
+ veor q2, q2, q10
+ veor q1,q1,q0 @ Karatsuba post-processing
+ veor q1,q1,q2
+ veor d1,d1,d2
+ veor d4,d4,d3 @ Xh|Xl - 256-bit result
+
+ @ equivalent of reduction_avx from ghash-x86_64.pl
+ vshl.i64 q9,q0,#57 @ 1st phase
+ vshl.i64 q10,q0,#62
+ veor q10,q10,q9 @
+ vshl.i64 q9,q0,#63
+ veor q10, q10, q9 @
+ veor d1,d1,d20 @
+ veor d4,d4,d21
+
+ vshr.u64 q10,q0,#1 @ 2nd phase
+ veor q2,q2,q0
+ veor q0,q0,q10 @
+ vshr.u64 q10,q10,#6
+ vshr.u64 q0,q0,#1 @
+ veor q0,q0,q2 @
+ veor q0,q0,q10 @
+
+ subs r3,#16
+ bne .Loop_neon
+
+#ifdef __ARMEL__
+ vrev64.8 q0,q0
+#endif
+ sub r0,#16
+ vst1.64 d1,[r0]! @ write out Xi
+ vst1.64 d0,[r0]
+
+ bx lr @ bx lr
+.size gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S
@@ -1,0 +1,257 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+#if __ARM_MAX_ARCH__>=7
+.text
+.fpu neon
+.code 32
+#undef __thumb2__
+.globl gcm_init_v8
+.hidden gcm_init_v8
+.type gcm_init_v8,%function
+.align 4
+gcm_init_v8:
+ AARCH64_VALID_CALL_TARGET
+ vld1.64 {q9},[r1] @ load input H
+ vmov.i8 q11,#0xe1
+ vshl.i64 q11,q11,#57 @ 0xc2.0
+ vext.8 q3,q9,q9,#8
+ vshr.u64 q10,q11,#63
+ vdup.32 q9,d18[1]
+ vext.8 q8,q10,q11,#8 @ t0=0xc2....01
+ vshr.u64 q10,q3,#63
+ vshr.s32 q9,q9,#31 @ broadcast carry bit
+ vand q10,q10,q8
+ vshl.i64 q3,q3,#1
+ vext.8 q10,q10,q10,#8
+ vand q8,q8,q9
+ vorr q3,q3,q10 @ H<<<=1
+ veor q12,q3,q8 @ twisted H
+ vst1.64 {q12},[r0]! @ store Htable[0]
+
+ @ calculate H^2
+ vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing
+.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12
+ veor q8,q8,q12
+.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12
+.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ veor q1,q1,q10
+.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase
+
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ veor q0,q1,q10
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase
+.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q10,q10,q2
+ veor q14,q0,q10
+
+ vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing
+ veor q9,q9,q14
+ vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed
+ vst1.64 {q13,q14},[r0]! @ store Htable[1..2]
+ bx lr
+.size gcm_init_v8,.-gcm_init_v8
+.globl gcm_gmult_v8
+.hidden gcm_gmult_v8
+.type gcm_gmult_v8,%function
+.align 4
+gcm_gmult_v8:
+ AARCH64_VALID_CALL_TARGET
+ vld1.64 {q9},[r0] @ load Xi
+ vmov.i8 q11,#0xe1
+ vld1.64 {q12,q13},[r1] @ load twisted H, ...
+ vshl.u64 q11,q11,#57
+#ifndef __ARMEB__
+ vrev64.8 q9,q9
+#endif
+ vext.8 q3,q9,q9,#8
+
+.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
+ veor q9,q9,q3 @ Karatsuba pre-processing
+.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
+.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ veor q1,q1,q10
+.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
+
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ veor q0,q1,q10
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
+.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q10,q10,q2
+ veor q0,q0,q10
+
+#ifndef __ARMEB__
+ vrev64.8 q0,q0
+#endif
+ vext.8 q0,q0,q0,#8
+ vst1.64 {q0},[r0] @ write out Xi
+
+ bx lr
+.size gcm_gmult_v8,.-gcm_gmult_v8
+.globl gcm_ghash_v8
+.hidden gcm_ghash_v8
+.type gcm_ghash_v8,%function
+.align 4
+gcm_ghash_v8:
+ AARCH64_VALID_CALL_TARGET
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
+ vld1.64 {q0},[r0] @ load [rotated] Xi
+ @ "[rotated]" means that
+ @ loaded value would have
+ @ to be rotated in order to
+ @ make it appear as in
+ @ algorithm specification
+ subs r3,r3,#32 @ see if r3 is 32 or larger
+ mov r12,#16 @ r12 is used as post-
+ @ increment for input pointer;
+ @ as loop is modulo-scheduled
+ @ r12 is zeroed just in time
+ @ to preclude overstepping
+ @ inp[len], which means that
+ @ last block[s] are actually
+ @ loaded twice, but last
+ @ copy is not processed
+ vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2
+ vmov.i8 q11,#0xe1
+ vld1.64 {q14},[r1]
+ moveq r12,#0 @ is it time to zero r12?
+ vext.8 q0,q0,q0,#8 @ rotate Xi
+ vld1.64 {q8},[r2]! @ load [rotated] I[0]
+ vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant
+#ifndef __ARMEB__
+ vrev64.8 q8,q8
+ vrev64.8 q0,q0
+#endif
+ vext.8 q3,q8,q8,#8 @ rotate I[0]
+ blo .Lodd_tail_v8 @ r3 was less than 32
+ vld1.64 {q9},[r2],r12 @ load [rotated] I[1]
+#ifndef __ARMEB__
+ vrev64.8 q9,q9
+#endif
+ vext.8 q7,q9,q9,#8
+ veor q3,q3,q0 @ I[i]^=Xi
+.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
+ veor q9,q9,q7 @ Karatsuba pre-processing
+.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
+ b .Loop_mod2x_v8
+
+.align 4
+.Loop_mod2x_v8:
+ vext.8 q10,q3,q3,#8
+ subs r3,r3,#32 @ is there more data?
+.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo
+ movlo r12,#0 @ is it time to zero r12?
+
+.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9
+ veor q10,q10,q3 @ Karatsuba pre-processing
+.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi
+ veor q0,q0,q4 @ accumulate
+.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+ vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2]
+
+ veor q2,q2,q6
+ moveq r12,#0 @ is it time to zero r12?
+ veor q1,q1,q5
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3]
+#ifndef __ARMEB__
+ vrev64.8 q8,q8
+#endif
+ veor q1,q1,q10
+.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
+
+#ifndef __ARMEB__
+ vrev64.8 q9,q9
+#endif
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ vext.8 q7,q9,q9,#8
+ vext.8 q3,q8,q8,#8
+ veor q0,q1,q10
+.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1
+ veor q3,q3,q2 @ accumulate q3 early
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
+.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q3,q3,q10
+ veor q9,q9,q7 @ Karatsuba pre-processing
+ veor q3,q3,q0
+.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7
+ bhs .Loop_mod2x_v8 @ there was at least 32 more bytes
+
+ veor q2,q2,q10
+ vext.8 q3,q8,q8,#8 @ re-construct q3
+ adds r3,r3,#32 @ re-construct r3
+ veor q0,q0,q2 @ re-construct q0
+ beq .Ldone_v8 @ is r3 zero?
+.Lodd_tail_v8:
+ vext.8 q10,q0,q0,#8
+ veor q3,q3,q0 @ inp^=Xi
+ veor q9,q8,q10 @ q9 is rotated inp^Xi
+
+.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo
+ veor q9,q9,q3 @ Karatsuba pre-processing
+.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi
+.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+
+ vext.8 q9,q0,q2,#8 @ Karatsuba post-processing
+ veor q10,q0,q2
+ veor q1,q1,q9
+ veor q1,q1,q10
+.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction
+
+ vmov d4,d3 @ Xh|Xm - 256-bit result
+ vmov d3,d0 @ Xm is rotated Xl
+ veor q0,q1,q10
+
+ vext.8 q10,q0,q0,#8 @ 2nd phase of reduction
+.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11
+ veor q10,q10,q2
+ veor q0,q0,q10
+
+.Ldone_v8:
+#ifndef __ARMEB__
+ vrev64.8 q0,q0
+#endif
+ vext.8 q0,q0,q0,#8
+ vst1.64 {q0},[r0] @ write out Xi
+
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so
+ bx lr
+.size gcm_ghash_v8,.-gcm_ghash_v8
+.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S
@@ -1,0 +1,1511 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+#include <openssl/arm_arch.h>
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.globl sha1_block_data_order
+.hidden sha1_block_data_order
+.type sha1_block_data_order,%function
+
+.align 5
+sha1_block_data_order:
+#if __ARM_MAX_ARCH__>=7
+.Lsha1_block:
+ adr r3,.Lsha1_block
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
+ tst r12,#ARMV8_SHA1
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
+ ldmia r0,{r3,r4,r5,r6,r7}
+.Lloop:
+ ldr r8,.LK_00_19
+ mov r14,sp
+ sub sp,sp,#15*4
+ mov r5,r5,ror#30
+ mov r6,r6,ror#30
+ mov r7,r7,ror#30 @ [6]
+.L_00_15:
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ eor r10,r5,r6 @ F_xx_xx
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r4,r10,ror#2
+ add r7,r7,r9 @ E+=X[i]
+ eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r7,r7,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r6,r8,r6,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r4,r5 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r6,r8,r6,ror#2 @ E+=K_00_19
+ eor r10,r4,r5 @ F_xx_xx
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r3,r10,ror#2
+ add r6,r6,r9 @ E+=X[i]
+ eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r6,r6,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r5,r8,r5,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r3,r4 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r5,r8,r5,ror#2 @ E+=K_00_19
+ eor r10,r3,r4 @ F_xx_xx
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r7,r10,ror#2
+ add r5,r5,r9 @ E+=X[i]
+ eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r5,r5,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r4,r8,r4,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r7,r3 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r4,r8,r4,ror#2 @ E+=K_00_19
+ eor r10,r7,r3 @ F_xx_xx
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r6,r10,ror#2
+ add r4,r4,r9 @ E+=X[i]
+ eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r4,r4,r10 @ E+=F_00_19(B,C,D)
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r3,r8,r3,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r6,r7 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r3,r8,r3,ror#2 @ E+=K_00_19
+ eor r10,r6,r7 @ F_xx_xx
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r5,r10,ror#2
+ add r3,r3,r9 @ E+=X[i]
+ eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r3,r3,r10 @ E+=F_00_19(B,C,D)
+#if defined(__thumb2__)
+ mov r12,sp
+ teq r14,r12
+#else
+ teq r14,sp
+#endif
+ bne .L_00_15 @ [((11+4)*5+2)*3]
+ sub sp,sp,#25*4
+#if __ARM_ARCH__<7
+ ldrb r10,[r1,#2]
+ ldrb r9,[r1,#3]
+ ldrb r11,[r1,#1]
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ ldrb r12,[r1],#4
+ orr r9,r9,r10,lsl#8
+ eor r10,r5,r6 @ F_xx_xx
+ orr r9,r9,r11,lsl#16
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ orr r9,r9,r12,lsl#24
+#else
+ ldr r9,[r1],#4 @ handles unaligned
+ add r7,r8,r7,ror#2 @ E+=K_00_19
+ eor r10,r5,r6 @ F_xx_xx
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+#ifdef __ARMEL__
+ rev r9,r9 @ byte swap
+#endif
+#endif
+ and r10,r4,r10,ror#2
+ add r7,r7,r9 @ E+=X[i]
+ eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
+ str r9,[r14,#-4]!
+ add r7,r7,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r3,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
+ add r6,r6,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r7,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
+ add r5,r5,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r6,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
+ add r4,r4,r10 @ E+=F_00_19(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r5,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
+ add r3,r3,r10 @ E+=F_00_19(B,C,D)
+
+ ldr r8,.LK_20_39 @ [+15+16*4]
+ cmn sp,#0 @ [+3], clear carry to denote 20_39
+.L_20_39_or_60_79:
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r7,r8,r7,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r5,r6 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r4,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r7,r7,r9 @ E+=X[i]
+ add r7,r7,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r3,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ add r6,r6,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r7,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ add r5,r5,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r6,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ add r4,r4,r10 @ E+=F_20_39(B,C,D)
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ eor r10,r5,r10,ror#2 @ F_xx_xx
+ @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ add r3,r3,r10 @ E+=F_20_39(B,C,D)
+#if defined(__thumb2__)
+ mov r12,sp
+ teq r14,r12
+#else
+ teq r14,sp @ preserve carry
+#endif
+ bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
+ bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
+
+ ldr r8,.LK_40_59
+ sub sp,sp,#20*4 @ [+2]
+.L_40_59:
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r7,r8,r7,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r5,r6 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r7,r7,r3,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r4,r10,ror#2 @ F_xx_xx
+ and r11,r5,r6 @ F_xx_xx
+ add r7,r7,r9 @ E+=X[i]
+ add r7,r7,r10 @ E+=F_40_59(B,C,D)
+ add r7,r7,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r6,r8,r6,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r4,r5 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r6,r6,r7,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r3,r10,ror#2 @ F_xx_xx
+ and r11,r4,r5 @ F_xx_xx
+ add r6,r6,r9 @ E+=X[i]
+ add r6,r6,r10 @ E+=F_40_59(B,C,D)
+ add r6,r6,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r5,r8,r5,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r3,r4 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r5,r5,r6,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r7,r10,ror#2 @ F_xx_xx
+ and r11,r3,r4 @ F_xx_xx
+ add r5,r5,r9 @ E+=X[i]
+ add r5,r5,r10 @ E+=F_40_59(B,C,D)
+ add r5,r5,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r4,r8,r4,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r7,r3 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r4,r4,r5,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r6,r10,ror#2 @ F_xx_xx
+ and r11,r7,r3 @ F_xx_xx
+ add r4,r4,r9 @ E+=X[i]
+ add r4,r4,r10 @ E+=F_40_59(B,C,D)
+ add r4,r4,r11,ror#2
+ ldr r9,[r14,#15*4]
+ ldr r10,[r14,#13*4]
+ ldr r11,[r14,#7*4]
+ add r3,r8,r3,ror#2 @ E+=K_xx_xx
+ ldr r12,[r14,#2*4]
+ eor r9,r9,r10
+ eor r11,r11,r12 @ 1 cycle stall
+ eor r10,r6,r7 @ F_xx_xx
+ mov r9,r9,ror#31
+ add r3,r3,r4,ror#27 @ E+=ROR(A,27)
+ eor r9,r9,r11,ror#31
+ str r9,[r14,#-4]!
+ and r10,r5,r10,ror#2 @ F_xx_xx
+ and r11,r6,r7 @ F_xx_xx
+ add r3,r3,r9 @ E+=X[i]
+ add r3,r3,r10 @ E+=F_40_59(B,C,D)
+ add r3,r3,r11,ror#2
+#if defined(__thumb2__)
+ mov r12,sp
+ teq r14,r12
+#else
+ teq r14,sp
+#endif
+ bne .L_40_59 @ [+((12+5)*5+2)*4]
+
+ ldr r8,.LK_60_79
+ sub sp,sp,#20*4
+ cmp sp,#0 @ set carry to denote 60_79
+ b .L_20_39_or_60_79 @ [+4], spare 300 bytes
+.L_done:
+ add sp,sp,#80*4 @ "deallocate" stack frame
+ ldmia r0,{r8,r9,r10,r11,r12}
+ add r3,r8,r3
+ add r4,r9,r4
+ add r5,r10,r5,ror#2
+ add r6,r11,r6,ror#2
+ add r7,r12,r7,ror#2
+ stmia r0,{r3,r4,r5,r6,r7}
+ teq r1,r2
+ bne .Lloop @ [+18], total 1307
+
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size sha1_block_data_order,.-sha1_block_data_order
+
+.align 5
+.LK_00_19:.word 0x5a827999
+.LK_20_39:.word 0x6ed9eba1
+.LK_40_59:.word 0x8f1bbcdc
+.LK_60_79:.word 0xca62c1d6
+#if __ARM_MAX_ARCH__>=7
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.Lsha1_block
+#endif
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 5
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.type sha1_block_data_order_neon,%function
+.align 4
+sha1_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
+ @ dmb @ errata #451034 on early Cortex A8
+ @ vstmdb sp!,{d8-d15} @ ABI specification says so
+ mov r14,sp
+ sub r12,sp,#64
+ adr r8,.LK_00_19
+ bic r12,r12,#15 @ align for 128-bit stores
+
+ ldmia r0,{r3,r4,r5,r6,r7} @ load context
+ mov sp,r12 @ alloca
+
+ vld1.8 {q0,q1},[r1]! @ handles unaligned
+ veor q15,q15,q15
+ vld1.8 {q2,q3},[r1]!
+ vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19
+ vrev32.8 q0,q0 @ yes, even on
+ vrev32.8 q1,q1 @ big-endian...
+ vrev32.8 q2,q2
+ vadd.i32 q8,q0,q14
+ vrev32.8 q3,q3
+ vadd.i32 q9,q1,q14
+ vst1.32 {q8},[r12,:128]!
+ vadd.i32 q10,q2,q14
+ vst1.32 {q9},[r12,:128]!
+ vst1.32 {q10},[r12,:128]!
+ ldr r9,[sp] @ big RAW stall
+
+.Loop_neon:
+ vext.8 q8,q0,q1,#8
+ bic r10,r6,r4
+ add r7,r7,r9
+ and r11,r5,r4
+ vadd.i32 q13,q3,q14
+ ldr r9,[sp,#4]
+ add r7,r7,r3,ror#27
+ vext.8 q12,q3,q15,#4
+ eor r11,r11,r10
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ veor q8,q8,q0
+ bic r10,r5,r3
+ add r6,r6,r9
+ veor q12,q12,q2
+ and r11,r4,r3
+ ldr r9,[sp,#8]
+ veor q12,q12,q8
+ add r6,r6,r7,ror#27
+ eor r11,r11,r10
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vext.8 q13,q15,q12,#4
+ bic r10,r4,r7
+ add r5,r5,r9
+ vadd.i32 q8,q12,q12
+ and r11,r3,r7
+ ldr r9,[sp,#12]
+ vsri.32 q8,q12,#31
+ add r5,r5,r6,ror#27
+ eor r11,r11,r10
+ mov r7,r7,ror#2
+ vshr.u32 q12,q13,#30
+ add r5,r5,r11
+ bic r10,r3,r6
+ vshl.u32 q13,q13,#2
+ add r4,r4,r9
+ and r11,r7,r6
+ veor q8,q8,q12
+ ldr r9,[sp,#16]
+ add r4,r4,r5,ror#27
+ veor q8,q8,q13
+ eor r11,r11,r10
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vext.8 q9,q1,q2,#8
+ bic r10,r7,r5
+ add r3,r3,r9
+ and r11,r6,r5
+ vadd.i32 q13,q8,q14
+ ldr r9,[sp,#20]
+ vld1.32 {d28[],d29[]},[r8,:32]!
+ add r3,r3,r4,ror#27
+ vext.8 q12,q8,q15,#4
+ eor r11,r11,r10
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ veor q9,q9,q1
+ bic r10,r6,r4
+ add r7,r7,r9
+ veor q12,q12,q3
+ and r11,r5,r4
+ ldr r9,[sp,#24]
+ veor q12,q12,q9
+ add r7,r7,r3,ror#27
+ eor r11,r11,r10
+ vst1.32 {q13},[r12,:128]!
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vext.8 q13,q15,q12,#4
+ bic r10,r5,r3
+ add r6,r6,r9
+ vadd.i32 q9,q12,q12
+ and r11,r4,r3
+ ldr r9,[sp,#28]
+ vsri.32 q9,q12,#31
+ add r6,r6,r7,ror#27
+ eor r11,r11,r10
+ mov r3,r3,ror#2
+ vshr.u32 q12,q13,#30
+ add r6,r6,r11
+ bic r10,r4,r7
+ vshl.u32 q13,q13,#2
+ add r5,r5,r9
+ and r11,r3,r7
+ veor q9,q9,q12
+ ldr r9,[sp,#32]
+ add r5,r5,r6,ror#27
+ veor q9,q9,q13
+ eor r11,r11,r10
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vext.8 q10,q2,q3,#8
+ bic r10,r3,r6
+ add r4,r4,r9
+ and r11,r7,r6
+ vadd.i32 q13,q9,q14
+ ldr r9,[sp,#36]
+ add r4,r4,r5,ror#27
+ vext.8 q12,q9,q15,#4
+ eor r11,r11,r10
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ veor q10,q10,q2
+ bic r10,r7,r5
+ add r3,r3,r9
+ veor q12,q12,q8
+ and r11,r6,r5
+ ldr r9,[sp,#40]
+ veor q12,q12,q10
+ add r3,r3,r4,ror#27
+ eor r11,r11,r10
+ vst1.32 {q13},[r12,:128]!
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vext.8 q13,q15,q12,#4
+ bic r10,r6,r4
+ add r7,r7,r9
+ vadd.i32 q10,q12,q12
+ and r11,r5,r4
+ ldr r9,[sp,#44]
+ vsri.32 q10,q12,#31
+ add r7,r7,r3,ror#27
+ eor r11,r11,r10
+ mov r4,r4,ror#2
+ vshr.u32 q12,q13,#30
+ add r7,r7,r11
+ bic r10,r5,r3
+ vshl.u32 q13,q13,#2
+ add r6,r6,r9
+ and r11,r4,r3
+ veor q10,q10,q12
+ ldr r9,[sp,#48]
+ add r6,r6,r7,ror#27
+ veor q10,q10,q13
+ eor r11,r11,r10
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vext.8 q11,q3,q8,#8
+ bic r10,r4,r7
+ add r5,r5,r9
+ and r11,r3,r7
+ vadd.i32 q13,q10,q14
+ ldr r9,[sp,#52]
+ add r5,r5,r6,ror#27
+ vext.8 q12,q10,q15,#4
+ eor r11,r11,r10
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ veor q11,q11,q3
+ bic r10,r3,r6
+ add r4,r4,r9
+ veor q12,q12,q9
+ and r11,r7,r6
+ ldr r9,[sp,#56]
+ veor q12,q12,q11
+ add r4,r4,r5,ror#27
+ eor r11,r11,r10
+ vst1.32 {q13},[r12,:128]!
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vext.8 q13,q15,q12,#4
+ bic r10,r7,r5
+ add r3,r3,r9
+ vadd.i32 q11,q12,q12
+ and r11,r6,r5
+ ldr r9,[sp,#60]
+ vsri.32 q11,q12,#31
+ add r3,r3,r4,ror#27
+ eor r11,r11,r10
+ mov r5,r5,ror#2
+ vshr.u32 q12,q13,#30
+ add r3,r3,r11
+ bic r10,r6,r4
+ vshl.u32 q13,q13,#2
+ add r7,r7,r9
+ and r11,r5,r4
+ veor q11,q11,q12
+ ldr r9,[sp,#0]
+ add r7,r7,r3,ror#27
+ veor q11,q11,q13
+ eor r11,r11,r10
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vext.8 q12,q10,q11,#8
+ bic r10,r5,r3
+ add r6,r6,r9
+ and r11,r4,r3
+ veor q0,q0,q8
+ ldr r9,[sp,#4]
+ add r6,r6,r7,ror#27
+ veor q0,q0,q1
+ eor r11,r11,r10
+ mov r3,r3,ror#2
+ vadd.i32 q13,q11,q14
+ add r6,r6,r11
+ bic r10,r4,r7
+ veor q12,q12,q0
+ add r5,r5,r9
+ and r11,r3,r7
+ vshr.u32 q0,q12,#30
+ ldr r9,[sp,#8]
+ add r5,r5,r6,ror#27
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ eor r11,r11,r10
+ mov r7,r7,ror#2
+ vsli.32 q0,q12,#2
+ add r5,r5,r11
+ bic r10,r3,r6
+ add r4,r4,r9
+ and r11,r7,r6
+ ldr r9,[sp,#12]
+ add r4,r4,r5,ror#27
+ eor r11,r11,r10
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ bic r10,r7,r5
+ add r3,r3,r9
+ and r11,r6,r5
+ ldr r9,[sp,#16]
+ add r3,r3,r4,ror#27
+ eor r11,r11,r10
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vext.8 q12,q11,q0,#8
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#20]
+ veor q1,q1,q9
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ veor q1,q1,q2
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vadd.i32 q13,q0,q14
+ eor r10,r3,r5
+ add r6,r6,r9
+ veor q12,q12,q1
+ ldr r9,[sp,#24]
+ eor r11,r10,r4
+ vshr.u32 q1,q12,#30
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r6,r6,r11
+ eor r10,r7,r4
+ vsli.32 q1,q12,#2
+ add r5,r5,r9
+ ldr r9,[sp,#28]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#32]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vext.8 q12,q0,q1,#8
+ eor r10,r5,r7
+ add r3,r3,r9
+ ldr r9,[sp,#36]
+ veor q2,q2,q10
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ veor q2,q2,q3
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vadd.i32 q13,q1,q14
+ eor r10,r4,r6
+ vld1.32 {d28[],d29[]},[r8,:32]!
+ add r7,r7,r9
+ veor q12,q12,q2
+ ldr r9,[sp,#40]
+ eor r11,r10,r5
+ vshr.u32 q2,q12,#30
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r7,r7,r11
+ eor r10,r3,r5
+ vsli.32 q2,q12,#2
+ add r6,r6,r9
+ ldr r9,[sp,#44]
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ eor r10,r7,r4
+ add r5,r5,r9
+ ldr r9,[sp,#48]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vext.8 q12,q1,q2,#8
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#52]
+ veor q3,q3,q11
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ veor q3,q3,q8
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vadd.i32 q13,q2,q14
+ eor r10,r5,r7
+ add r3,r3,r9
+ veor q12,q12,q3
+ ldr r9,[sp,#56]
+ eor r11,r10,r6
+ vshr.u32 q3,q12,#30
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r3,r3,r11
+ eor r10,r4,r6
+ vsli.32 q3,q12,#2
+ add r7,r7,r9
+ ldr r9,[sp,#60]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ eor r10,r3,r5
+ add r6,r6,r9
+ ldr r9,[sp,#0]
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vext.8 q12,q2,q3,#8
+ eor r10,r7,r4
+ add r5,r5,r9
+ ldr r9,[sp,#4]
+ veor q8,q8,q0
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ veor q8,q8,q9
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vadd.i32 q13,q3,q14
+ eor r10,r6,r3
+ add r4,r4,r9
+ veor q12,q12,q8
+ ldr r9,[sp,#8]
+ eor r11,r10,r7
+ vshr.u32 q8,q12,#30
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ add r4,r4,r11
+ eor r10,r5,r7
+ vsli.32 q8,q12,#2
+ add r3,r3,r9
+ ldr r9,[sp,#12]
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#16]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vext.8 q12,q3,q8,#8
+ eor r10,r3,r5
+ add r6,r6,r9
+ ldr r9,[sp,#20]
+ veor q9,q9,q1
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ veor q9,q9,q10
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vadd.i32 q13,q8,q14
+ eor r10,r7,r4
+ add r5,r5,r9
+ veor q12,q12,q9
+ ldr r9,[sp,#24]
+ eor r11,r10,r3
+ vshr.u32 q9,q12,#30
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r5,r5,r11
+ eor r10,r6,r3
+ vsli.32 q9,q12,#2
+ add r4,r4,r9
+ ldr r9,[sp,#28]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ eor r10,r5,r7
+ add r3,r3,r9
+ ldr r9,[sp,#32]
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vext.8 q12,q8,q9,#8
+ add r7,r7,r9
+ and r10,r5,r6
+ ldr r9,[sp,#36]
+ veor q10,q10,q2
+ add r7,r7,r3,ror#27
+ eor r11,r5,r6
+ veor q10,q10,q11
+ add r7,r7,r10
+ and r11,r11,r4
+ vadd.i32 q13,q9,q14
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ veor q12,q12,q10
+ add r6,r6,r9
+ and r10,r4,r5
+ vshr.u32 q10,q12,#30
+ ldr r9,[sp,#40]
+ add r6,r6,r7,ror#27
+ vst1.32 {q13},[r12,:128]!
+ eor r11,r4,r5
+ add r6,r6,r10
+ vsli.32 q10,q12,#2
+ and r11,r11,r3
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ add r5,r5,r9
+ and r10,r3,r4
+ ldr r9,[sp,#44]
+ add r5,r5,r6,ror#27
+ eor r11,r3,r4
+ add r5,r5,r10
+ and r11,r11,r7
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ add r4,r4,r9
+ and r10,r7,r3
+ ldr r9,[sp,#48]
+ add r4,r4,r5,ror#27
+ eor r11,r7,r3
+ add r4,r4,r10
+ and r11,r11,r6
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vext.8 q12,q9,q10,#8
+ add r3,r3,r9
+ and r10,r6,r7
+ ldr r9,[sp,#52]
+ veor q11,q11,q3
+ add r3,r3,r4,ror#27
+ eor r11,r6,r7
+ veor q11,q11,q0
+ add r3,r3,r10
+ and r11,r11,r5
+ vadd.i32 q13,q10,q14
+ mov r5,r5,ror#2
+ vld1.32 {d28[],d29[]},[r8,:32]!
+ add r3,r3,r11
+ veor q12,q12,q11
+ add r7,r7,r9
+ and r10,r5,r6
+ vshr.u32 q11,q12,#30
+ ldr r9,[sp,#56]
+ add r7,r7,r3,ror#27
+ vst1.32 {q13},[r12,:128]!
+ eor r11,r5,r6
+ add r7,r7,r10
+ vsli.32 q11,q12,#2
+ and r11,r11,r4
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ add r6,r6,r9
+ and r10,r4,r5
+ ldr r9,[sp,#60]
+ add r6,r6,r7,ror#27
+ eor r11,r4,r5
+ add r6,r6,r10
+ and r11,r11,r3
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ add r5,r5,r9
+ and r10,r3,r4
+ ldr r9,[sp,#0]
+ add r5,r5,r6,ror#27
+ eor r11,r3,r4
+ add r5,r5,r10
+ and r11,r11,r7
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vext.8 q12,q10,q11,#8
+ add r4,r4,r9
+ and r10,r7,r3
+ ldr r9,[sp,#4]
+ veor q0,q0,q8
+ add r4,r4,r5,ror#27
+ eor r11,r7,r3
+ veor q0,q0,q1
+ add r4,r4,r10
+ and r11,r11,r6
+ vadd.i32 q13,q11,q14
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ veor q12,q12,q0
+ add r3,r3,r9
+ and r10,r6,r7
+ vshr.u32 q0,q12,#30
+ ldr r9,[sp,#8]
+ add r3,r3,r4,ror#27
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ eor r11,r6,r7
+ add r3,r3,r10
+ vsli.32 q0,q12,#2
+ and r11,r11,r5
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ add r7,r7,r9
+ and r10,r5,r6
+ ldr r9,[sp,#12]
+ add r7,r7,r3,ror#27
+ eor r11,r5,r6
+ add r7,r7,r10
+ and r11,r11,r4
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ add r6,r6,r9
+ and r10,r4,r5
+ ldr r9,[sp,#16]
+ add r6,r6,r7,ror#27
+ eor r11,r4,r5
+ add r6,r6,r10
+ and r11,r11,r3
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vext.8 q12,q11,q0,#8
+ add r5,r5,r9
+ and r10,r3,r4
+ ldr r9,[sp,#20]
+ veor q1,q1,q9
+ add r5,r5,r6,ror#27
+ eor r11,r3,r4
+ veor q1,q1,q2
+ add r5,r5,r10
+ and r11,r11,r7
+ vadd.i32 q13,q0,q14
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ veor q12,q12,q1
+ add r4,r4,r9
+ and r10,r7,r3
+ vshr.u32 q1,q12,#30
+ ldr r9,[sp,#24]
+ add r4,r4,r5,ror#27
+ vst1.32 {q13},[r12,:128]!
+ eor r11,r7,r3
+ add r4,r4,r10
+ vsli.32 q1,q12,#2
+ and r11,r11,r6
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ add r3,r3,r9
+ and r10,r6,r7
+ ldr r9,[sp,#28]
+ add r3,r3,r4,ror#27
+ eor r11,r6,r7
+ add r3,r3,r10
+ and r11,r11,r5
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ add r7,r7,r9
+ and r10,r5,r6
+ ldr r9,[sp,#32]
+ add r7,r7,r3,ror#27
+ eor r11,r5,r6
+ add r7,r7,r10
+ and r11,r11,r4
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vext.8 q12,q0,q1,#8
+ add r6,r6,r9
+ and r10,r4,r5
+ ldr r9,[sp,#36]
+ veor q2,q2,q10
+ add r6,r6,r7,ror#27
+ eor r11,r4,r5
+ veor q2,q2,q3
+ add r6,r6,r10
+ and r11,r11,r3
+ vadd.i32 q13,q1,q14
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ veor q12,q12,q2
+ add r5,r5,r9
+ and r10,r3,r4
+ vshr.u32 q2,q12,#30
+ ldr r9,[sp,#40]
+ add r5,r5,r6,ror#27
+ vst1.32 {q13},[r12,:128]!
+ eor r11,r3,r4
+ add r5,r5,r10
+ vsli.32 q2,q12,#2
+ and r11,r11,r7
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ add r4,r4,r9
+ and r10,r7,r3
+ ldr r9,[sp,#44]
+ add r4,r4,r5,ror#27
+ eor r11,r7,r3
+ add r4,r4,r10
+ and r11,r11,r6
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ add r3,r3,r9
+ and r10,r6,r7
+ ldr r9,[sp,#48]
+ add r3,r3,r4,ror#27
+ eor r11,r6,r7
+ add r3,r3,r10
+ and r11,r11,r5
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ vext.8 q12,q1,q2,#8
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#52]
+ veor q3,q3,q11
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ veor q3,q3,q8
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vadd.i32 q13,q2,q14
+ eor r10,r3,r5
+ add r6,r6,r9
+ veor q12,q12,q3
+ ldr r9,[sp,#56]
+ eor r11,r10,r4
+ vshr.u32 q3,q12,#30
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ vst1.32 {q13},[r12,:128]!
+ add r6,r6,r11
+ eor r10,r7,r4
+ vsli.32 q3,q12,#2
+ add r5,r5,r9
+ ldr r9,[sp,#60]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#0]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ vadd.i32 q13,q3,q14
+ eor r10,r5,r7
+ add r3,r3,r9
+ vst1.32 {q13},[r12,:128]!
+ sub r12,r12,#64
+ teq r1,r2
+ sub r8,r8,#16
+ it eq
+ subeq r1,r1,#64
+ vld1.8 {q0,q1},[r1]!
+ ldr r9,[sp,#4]
+ eor r11,r10,r6
+ vld1.8 {q2,q3},[r1]!
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ vld1.32 {d28[],d29[]},[r8,:32]!
+ add r3,r3,r11
+ eor r10,r4,r6
+ vrev32.8 q0,q0
+ add r7,r7,r9
+ ldr r9,[sp,#8]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ eor r10,r3,r5
+ add r6,r6,r9
+ ldr r9,[sp,#12]
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ eor r10,r7,r4
+ add r5,r5,r9
+ ldr r9,[sp,#16]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ vrev32.8 q1,q1
+ eor r10,r6,r3
+ add r4,r4,r9
+ vadd.i32 q8,q0,q14
+ ldr r9,[sp,#20]
+ eor r11,r10,r7
+ vst1.32 {q8},[r12,:128]!
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ eor r10,r5,r7
+ add r3,r3,r9
+ ldr r9,[sp,#24]
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#28]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ eor r10,r3,r5
+ add r6,r6,r9
+ ldr r9,[sp,#32]
+ eor r11,r10,r4
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ vrev32.8 q2,q2
+ eor r10,r7,r4
+ add r5,r5,r9
+ vadd.i32 q9,q1,q14
+ ldr r9,[sp,#36]
+ eor r11,r10,r3
+ vst1.32 {q9},[r12,:128]!
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#40]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ eor r10,r5,r7
+ add r3,r3,r9
+ ldr r9,[sp,#44]
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ eor r10,r4,r6
+ add r7,r7,r9
+ ldr r9,[sp,#48]
+ eor r11,r10,r5
+ add r7,r7,r3,ror#27
+ mov r4,r4,ror#2
+ add r7,r7,r11
+ vrev32.8 q3,q3
+ eor r10,r3,r5
+ add r6,r6,r9
+ vadd.i32 q10,q2,q14
+ ldr r9,[sp,#52]
+ eor r11,r10,r4
+ vst1.32 {q10},[r12,:128]!
+ add r6,r6,r7,ror#27
+ mov r3,r3,ror#2
+ add r6,r6,r11
+ eor r10,r7,r4
+ add r5,r5,r9
+ ldr r9,[sp,#56]
+ eor r11,r10,r3
+ add r5,r5,r6,ror#27
+ mov r7,r7,ror#2
+ add r5,r5,r11
+ eor r10,r6,r3
+ add r4,r4,r9
+ ldr r9,[sp,#60]
+ eor r11,r10,r7
+ add r4,r4,r5,ror#27
+ mov r6,r6,ror#2
+ add r4,r4,r11
+ eor r10,r5,r7
+ add r3,r3,r9
+ eor r11,r10,r6
+ add r3,r3,r4,ror#27
+ mov r5,r5,ror#2
+ add r3,r3,r11
+ ldmia r0,{r9,r10,r11,r12} @ accumulate context
+ add r3,r3,r9
+ ldr r9,[r0,#16]
+ add r4,r4,r10
+ add r5,r5,r11
+ add r6,r6,r12
+ it eq
+ moveq sp,r14
+ add r7,r7,r9
+ it ne
+ ldrne r9,[sp]
+ stmia r0,{r3,r4,r5,r6,r7}
+ itt ne
+ addne r12,sp,#3*16
+ bne .Loop_neon
+
+ @ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+.size sha1_block_data_order_neon,.-sha1_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7
+
+# if defined(__thumb2__)
+# define INST(a,b,c,d) .byte c,d|0xf,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d|0x10
+# endif
+
+.type sha1_block_data_order_armv8,%function
+.align 5
+sha1_block_data_order_armv8:
+.LARMv8:
+ vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so
+
+ veor q1,q1,q1
+ adr r3,.LK_00_19
+ vld1.32 {q0},[r0]!
+ vld1.32 {d2[0]},[r0]
+ sub r0,r0,#16
+ vld1.32 {d16[],d17[]},[r3,:32]!
+ vld1.32 {d18[],d19[]},[r3,:32]!
+ vld1.32 {d20[],d21[]},[r3,:32]!
+ vld1.32 {d22[],d23[]},[r3,:32]
+
+.Loop_v8:
+ vld1.8 {q4,q5},[r1]!
+ vld1.8 {q6,q7},[r1]!
+ vrev32.8 q4,q4
+ vrev32.8 q5,q5
+
+ vadd.i32 q12,q8,q4
+ vrev32.8 q6,q6
+ vmov q14,q0 @ offload
+ subs r2,r2,#1
+
+ vadd.i32 q13,q8,q5
+ vrev32.8 q7,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0
+ INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12
+ vadd.i32 q12,q8,q6
+ INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1
+ INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13
+ vadd.i32 q13,q8,q7
+ INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
+ INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2
+ INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12
+ vadd.i32 q12,q8,q4
+ INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
+ INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3
+ INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13
+ vadd.i32 q13,q9,q5
+ INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
+ INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4
+ INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12
+ vadd.i32 q12,q9,q6
+ INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
+ INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q9,q7
+ INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
+ INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6
+ INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
+ vadd.i32 q12,q9,q4
+ INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
+ INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q9,q5
+ INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
+ INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8
+ INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
+ vadd.i32 q12,q10,q6
+ INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
+ INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q10,q7
+ INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
+ INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10
+ INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
+ vadd.i32 q12,q10,q4
+ INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
+ INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11
+ INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13
+ vadd.i32 q13,q10,q5
+ INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
+ INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12
+ INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
+ vadd.i32 q12,q10,q6
+ INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
+ INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13
+ INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13
+ vadd.i32 q13,q11,q7
+ INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7
+ INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14
+ INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12
+ vadd.i32 q12,q11,q4
+ INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4
+ INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q11,q5
+ INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5
+ INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16
+ INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
+ vadd.i32 q12,q11,q6
+ INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+ vadd.i32 q13,q11,q7
+
+ INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18
+ INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12
+
+ INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19
+ INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13
+
+ vadd.i32 q1,q1,q2
+ vadd.i32 q0,q0,q14
+ bne .Loop_v8
+
+ vst1.32 {q0},[r0]!
+ vst1.32 {d2[0]},[r0]
+
+ vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
+ bx lr @ bx lr
+.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
+#endif
+#if __ARM_MAX_ARCH__>=7
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
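
Reading aid (not part of the generated sources, and placed between the two added files rather than inside them): the next file, sha256-armv4.S, is the generated ARMv4/NEON/ARMv8 implementation of the SHA-256 compression function. The register comments in its round bodies — Sigma1(e), Ch(e,f,g), Maj(a,b,c), Sigma0(a), sigma0(X[i+1]), sigma1(X[i+14]) — are the standard FIPS 180-4 quantities. A minimal C sketch of one compression round is below; the helper names are illustrative only and do not exist in BoringSSL.

/*
 * Minimal sketch of one SHA-256 compression round (FIPS 180-4).
 * Not BoringSSL code; names are hypothetical, for cross-reference with the
 * Sigma/Ch/Maj/sigma annotations in the generated assembly that follows.
 */
#include <stdint.h>

static uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

static uint32_t Sigma0(uint32_t a) { return rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22); }
static uint32_t Sigma1(uint32_t e) { return rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25); }
static uint32_t sigma0(uint32_t x) { return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3); }
static uint32_t sigma1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }
static uint32_t Ch(uint32_t e, uint32_t f, uint32_t g)  { return (e & f) ^ (~e & g); }
static uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }

/* One of the 64 rounds: s[0..7] holds a..h, w is the schedule word W[i],
 * k is the round constant K256[i]. */
static void sha256_round(uint32_t s[8], uint32_t w, uint32_t k)
{
    uint32_t t1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + k + w;  /* h += Sigma1(e)+Ch+K+X */
    uint32_t t2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);                /* Sigma0(a)+Maj(a,b,c)  */
    s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + t1;           /* d += h                */
    s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = t1 + t2;             /* h += Sigma0+Maj       */
}

/* Rounds 16..63 extend the message schedule as
 * W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16],
 * which is what the .Lrounds_16_xx block computes in registers. */

In the integer code path the eight state words live in r4-r11 and the sixteen-word schedule lives on the stack frame set up by "sub sp,sp,#16*4", which is why each round stores its schedule word with "str r2,[sp,#i*4]"; the NEON and ARMv8 paths keep the schedule in q registers instead.
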
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S
@@ -1,0 +1,2839 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License"). You may not use
+@ this file except in compliance with the License. You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA256 block procedure for ARMv4. May 2007.
+
+@ Performance is ~2x better than gcc 3.4 generated code and in "abso-
+@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
+@ byte [on single-issue Xscale PXA250 core].
+
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 22% improvement on
+@ Cortex A8 core and ~20 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 16%
+@ improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+@ September 2013.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process one
+@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+@ code (meaning that latter performs sub-optimally, nothing was done
+@ about it).
+
+@ May 2014.
+@
+@ Add ARMv8 code path performing at 2.0 cpb on Apple A7.
+
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
+@ instructions are manually-encoded. (See unsha256.)
+.arch armv7-a
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code 32
+#endif
+
+.type K256,%object
+.align 5
+K256:
+.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size K256,.-K256
+.word 0 @ terminator
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.Lsha256_block_data_order
+#endif
+.align 5
+
+.globl sha256_block_data_order
+.hidden sha256_block_data_order
+.type sha256_block_data_order,%function
+sha256_block_data_order:
+.Lsha256_block_data_order:
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+ sub r3,pc,#8 @ sha256_block_data_order
+#else
+ adr r3,.Lsha256_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
+ tst r12,#ARMV8_SHA256
+ bne .LARMv8
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+ stmdb sp!,{r0,r1,r2,r4-r11,lr}
+ ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+ sub r14,r3,#256+32 @ K256
+ sub sp,sp,#16*4 @ alloca(X[16])
+.Loop:
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ magic
+ eor r12,r12,r12
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 0
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 0
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 0==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 0==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 1
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 1
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 1==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 1==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 2
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 2
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 2==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 2==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 3
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 3
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 3==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 3==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 4
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 4
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 4==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 4==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 5
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 5==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 5==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 6
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 6
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 6==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 6==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 7
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 7==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 7==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 8
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 8
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 8==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r8,r8,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 8==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 9
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 9
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 9==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r7,r7,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 9==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 10
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 10
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 10==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r6,r6,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 10==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 11
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 11
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 11==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r5,r5,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 11==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 12
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 12
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 12==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r4,r4,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 12==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 13
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 13
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 13==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r11,r11,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 13==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 14
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 14
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ ldrb r12,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r12,lsl#8
+ ldrb r12,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 14==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r10,r10,ror#5
+ orr r2,r2,r12,lsl#24
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+#endif
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 14==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ @ ldr r2,[r1],#4 @ 15
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+ rev r2,r2
+# endif
+#else
+ @ ldrb r2,[r1,#3] @ 15
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ ldrb r3,[r1,#2]
+ ldrb r0,[r1,#1]
+ orr r2,r2,r3,lsl#8
+ ldrb r3,[r1],#4
+ orr r2,r2,r0,lsl#16
+# if 15==15
+ str r1,[sp,#17*4] @ make room for r1
+# endif
+ eor r0,r9,r9,ror#5
+ orr r2,r2,r3,lsl#24
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 15==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+.Lrounds_16_xx:
+ @ ldr r2,[sp,#1*4] @ 16
+ @ ldr r1,[sp,#14*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#0*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#9*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#0*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 16==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#2*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#15*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#2*4] @ 17
+ @ ldr r1,[sp,#15*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#1*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#10*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#1*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 17==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#3*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#0*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#3*4] @ 18
+ @ ldr r1,[sp,#0*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#2*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#11*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#2*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 18==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#4*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#1*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#4*4] @ 19
+ @ ldr r1,[sp,#1*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#3*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#12*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#3*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 19==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#5*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#2*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#5*4] @ 20
+ @ ldr r1,[sp,#2*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#4*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#13*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#4*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 20==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#6*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#3*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#6*4] @ 21
+ @ ldr r1,[sp,#3*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#5*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#14*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#5*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 21==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#7*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#4*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#7*4] @ 22
+ @ ldr r1,[sp,#4*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#6*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#15*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#6*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 22==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#8*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#5*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#8*4] @ 23
+ @ ldr r1,[sp,#5*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#7*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#0*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#7*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 23==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#9*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#6*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#9*4] @ 24
+ @ ldr r1,[sp,#6*4]
+ mov r0,r2,ror#7
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#8*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#1*4]
+
+ add r12,r12,r0
+ eor r0,r8,r8,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r8,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r11,r11,r2 @ h+=X[i]
+ str r2,[sp,#8*4]
+ eor r2,r9,r10
+ add r11,r11,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r8
+ add r11,r11,r12 @ h+=K256[i]
+ eor r2,r2,r10 @ Ch(e,f,g)
+ eor r0,r4,r4,ror#11
+ add r11,r11,r2 @ h+=Ch(e,f,g)
+#if 24==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r4,r5 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#10*4] @ from future BODY_16_xx
+ eor r12,r4,r5 @ a^b, b^c in next round
+ ldr r1,[sp,#7*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r4,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r7,r7,r11 @ d+=h
+ eor r3,r3,r5 @ Maj(a,b,c)
+ add r11,r11,r0,ror#2 @ h+=Sigma0(a)
+ @ add r11,r11,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#10*4] @ 25
+ @ ldr r1,[sp,#7*4]
+ mov r0,r2,ror#7
+ add r11,r11,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#9*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#2*4]
+
+ add r3,r3,r0
+ eor r0,r7,r7,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r7,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r10,r10,r2 @ h+=X[i]
+ str r2,[sp,#9*4]
+ eor r2,r8,r9
+ add r10,r10,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r7
+ add r10,r10,r3 @ h+=K256[i]
+ eor r2,r2,r9 @ Ch(e,f,g)
+ eor r0,r11,r11,ror#11
+ add r10,r10,r2 @ h+=Ch(e,f,g)
+#if 25==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r11,r4 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#11*4] @ from future BODY_16_xx
+ eor r3,r11,r4 @ a^b, b^c in next round
+ ldr r1,[sp,#8*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r11,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r6,r6,r10 @ d+=h
+ eor r12,r12,r4 @ Maj(a,b,c)
+ add r10,r10,r0,ror#2 @ h+=Sigma0(a)
+ @ add r10,r10,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#11*4] @ 26
+ @ ldr r1,[sp,#8*4]
+ mov r0,r2,ror#7
+ add r10,r10,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#10*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#3*4]
+
+ add r12,r12,r0
+ eor r0,r6,r6,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r6,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r9,r9,r2 @ h+=X[i]
+ str r2,[sp,#10*4]
+ eor r2,r7,r8
+ add r9,r9,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r6
+ add r9,r9,r12 @ h+=K256[i]
+ eor r2,r2,r8 @ Ch(e,f,g)
+ eor r0,r10,r10,ror#11
+ add r9,r9,r2 @ h+=Ch(e,f,g)
+#if 26==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r10,r11 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#12*4] @ from future BODY_16_xx
+ eor r12,r10,r11 @ a^b, b^c in next round
+ ldr r1,[sp,#9*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r10,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r5,r5,r9 @ d+=h
+ eor r3,r3,r11 @ Maj(a,b,c)
+ add r9,r9,r0,ror#2 @ h+=Sigma0(a)
+ @ add r9,r9,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#12*4] @ 27
+ @ ldr r1,[sp,#9*4]
+ mov r0,r2,ror#7
+ add r9,r9,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#11*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#4*4]
+
+ add r3,r3,r0
+ eor r0,r5,r5,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r5,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r8,r8,r2 @ h+=X[i]
+ str r2,[sp,#11*4]
+ eor r2,r6,r7
+ add r8,r8,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r5
+ add r8,r8,r3 @ h+=K256[i]
+ eor r2,r2,r7 @ Ch(e,f,g)
+ eor r0,r9,r9,ror#11
+ add r8,r8,r2 @ h+=Ch(e,f,g)
+#if 27==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r9,r10 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#13*4] @ from future BODY_16_xx
+ eor r3,r9,r10 @ a^b, b^c in next round
+ ldr r1,[sp,#10*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r9,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r4,r4,r8 @ d+=h
+ eor r12,r12,r10 @ Maj(a,b,c)
+ add r8,r8,r0,ror#2 @ h+=Sigma0(a)
+ @ add r8,r8,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#13*4] @ 28
+ @ ldr r1,[sp,#10*4]
+ mov r0,r2,ror#7
+ add r8,r8,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#12*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#5*4]
+
+ add r12,r12,r0
+ eor r0,r4,r4,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r4,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r7,r7,r2 @ h+=X[i]
+ str r2,[sp,#12*4]
+ eor r2,r5,r6
+ add r7,r7,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r4
+ add r7,r7,r12 @ h+=K256[i]
+ eor r2,r2,r6 @ Ch(e,f,g)
+ eor r0,r8,r8,ror#11
+ add r7,r7,r2 @ h+=Ch(e,f,g)
+#if 28==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r8,r9 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#14*4] @ from future BODY_16_xx
+ eor r12,r8,r9 @ a^b, b^c in next round
+ ldr r1,[sp,#11*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r8,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r11,r11,r7 @ d+=h
+ eor r3,r3,r9 @ Maj(a,b,c)
+ add r7,r7,r0,ror#2 @ h+=Sigma0(a)
+ @ add r7,r7,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#14*4] @ 29
+ @ ldr r1,[sp,#11*4]
+ mov r0,r2,ror#7
+ add r7,r7,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#13*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#6*4]
+
+ add r3,r3,r0
+ eor r0,r11,r11,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r11,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r6,r6,r2 @ h+=X[i]
+ str r2,[sp,#13*4]
+ eor r2,r4,r5
+ add r6,r6,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r11
+ add r6,r6,r3 @ h+=K256[i]
+ eor r2,r2,r5 @ Ch(e,f,g)
+ eor r0,r7,r7,ror#11
+ add r6,r6,r2 @ h+=Ch(e,f,g)
+#if 29==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r7,r8 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#15*4] @ from future BODY_16_xx
+ eor r3,r7,r8 @ a^b, b^c in next round
+ ldr r1,[sp,#12*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r7,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r10,r10,r6 @ d+=h
+ eor r12,r12,r8 @ Maj(a,b,c)
+ add r6,r6,r0,ror#2 @ h+=Sigma0(a)
+ @ add r6,r6,r12 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#15*4] @ 30
+ @ ldr r1,[sp,#12*4]
+ mov r0,r2,ror#7
+ add r6,r6,r12 @ h+=Maj(a,b,c) from the past
+ mov r12,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r12,r12,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#14*4]
+ eor r12,r12,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#7*4]
+
+ add r12,r12,r0
+ eor r0,r10,r10,ror#5 @ from BODY_00_15
+ add r2,r2,r12
+ eor r0,r0,r10,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r12,[r14],#4 @ *K256++
+ add r5,r5,r2 @ h+=X[i]
+ str r2,[sp,#14*4]
+ eor r2,r11,r4
+ add r5,r5,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r10
+ add r5,r5,r12 @ h+=K256[i]
+ eor r2,r2,r4 @ Ch(e,f,g)
+ eor r0,r6,r6,ror#11
+ add r5,r5,r2 @ h+=Ch(e,f,g)
+#if 30==31
+ and r12,r12,#0xff
+ cmp r12,#0xf2 @ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r12,r6,r7 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#0*4] @ from future BODY_16_xx
+ eor r12,r6,r7 @ a^b, b^c in next round
+ ldr r1,[sp,#13*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r6,ror#20 @ Sigma0(a)
+ and r3,r3,r12 @ (b^c)&=(a^b)
+ add r9,r9,r5 @ d+=h
+ eor r3,r3,r7 @ Maj(a,b,c)
+ add r5,r5,r0,ror#2 @ h+=Sigma0(a)
+ @ add r5,r5,r3 @ h+=Maj(a,b,c)
+ @ ldr r2,[sp,#0*4] @ 31
+ @ ldr r1,[sp,#13*4]
+ mov r0,r2,ror#7
+ add r5,r5,r3 @ h+=Maj(a,b,c) from the past
+ mov r3,r1,ror#17
+ eor r0,r0,r2,ror#18
+ eor r3,r3,r1,ror#19
+ eor r0,r0,r2,lsr#3 @ sigma0(X[i+1])
+ ldr r2,[sp,#15*4]
+ eor r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+ ldr r1,[sp,#8*4]
+
+ add r3,r3,r0
+ eor r0,r9,r9,ror#5 @ from BODY_00_15
+ add r2,r2,r3
+ eor r0,r0,r9,ror#19 @ Sigma1(e)
+ add r2,r2,r1 @ X[i]
+ ldr r3,[r14],#4 @ *K256++
+ add r4,r4,r2 @ h+=X[i]
+ str r2,[sp,#15*4]
+ eor r2,r10,r11
+ add r4,r4,r0,ror#6 @ h+=Sigma1(e)
+ and r2,r2,r9
+ add r4,r4,r3 @ h+=K256[i]
+ eor r2,r2,r11 @ Ch(e,f,g)
+ eor r0,r5,r5,ror#11
+ add r4,r4,r2 @ h+=Ch(e,f,g)
+#if 31==31
+ and r3,r3,#0xff
+ cmp r3,#0xf2 @ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+ ldr r2,[r1],#4 @ prefetch
+# else
+ ldrb r2,[r1,#3]
+# endif
+ eor r3,r5,r6 @ a^b, b^c in next round
+#else
+ ldr r2,[sp,#1*4] @ from future BODY_16_xx
+ eor r3,r5,r6 @ a^b, b^c in next round
+ ldr r1,[sp,#14*4] @ from future BODY_16_xx
+#endif
+ eor r0,r0,r5,ror#20 @ Sigma0(a)
+ and r12,r12,r3 @ (b^c)&=(a^b)
+ add r8,r8,r4 @ d+=h
+ eor r12,r12,r6 @ Maj(a,b,c)
+ add r4,r4,r0,ror#2 @ h+=Sigma0(a)
+ @ add r4,r4,r12 @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+ ite eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq r3,[sp,#16*4] @ pull ctx
+ bne .Lrounds_16_xx
+
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r0,[r3,#0]
+ ldr r2,[r3,#4]
+ ldr r12,[r3,#8]
+ add r4,r4,r0
+ ldr r0,[r3,#12]
+ add r5,r5,r2
+ ldr r2,[r3,#16]
+ add r6,r6,r12
+ ldr r12,[r3,#20]
+ add r7,r7,r0
+ ldr r0,[r3,#24]
+ add r8,r8,r2
+ ldr r2,[r3,#28]
+ add r9,r9,r12
+ ldr r1,[sp,#17*4] @ pull inp
+ ldr r12,[sp,#18*4] @ pull inp+len
+ add r10,r10,r0
+ add r11,r11,r2
+ stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+ cmp r1,r12
+ sub r14,r14,#256 @ rewind Ktbl
+ bne .Loop
+
+ add sp,sp,#19*4 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
+#else
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size sha256_block_data_order,.-sha256_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.globl sha256_block_data_order_neon
+.hidden sha256_block_data_order_neon
+.type sha256_block_data_order_neon,%function
+.align 5
+.skip 16
+sha256_block_data_order_neon:
+.LNEON:
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+
+ sub r11,sp,#16*4+16
+ adr r14,K256
+ bic r11,r11,#15 @ align for 128-bit stores
+ mov r12,sp
+ mov sp,r11 @ alloca
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+
+ vld1.8 {q0},[r1]!
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ vld1.32 {q8},[r14,:128]!
+ vld1.32 {q9},[r14,:128]!
+ vld1.32 {q10},[r14,:128]!
+ vld1.32 {q11},[r14,:128]!
+ vrev32.8 q0,q0 @ yes, even on
+ str r0,[sp,#64]
+ vrev32.8 q1,q1 @ big-endian
+ str r1,[sp,#68]
+ mov r1,sp
+ vrev32.8 q2,q2
+ str r2,[sp,#72]
+ vrev32.8 q3,q3
+ str r12,[sp,#76] @ save original sp
+ vadd.i32 q8,q8,q0
+ vadd.i32 q9,q9,q1
+ vst1.32 {q8},[r1,:128]!
+ vadd.i32 q10,q10,q2
+ vst1.32 {q9},[r1,:128]!
+ vadd.i32 q11,q11,q3
+ vst1.32 {q10},[r1,:128]!
+ vst1.32 {q11},[r1,:128]!
+
+ ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+ sub r1,r1,#64
+ ldr r2,[sp,#0]
+ eor r12,r12,r12
+ eor r3,r5,r6
+ b .L_00_48
+
+.align 4
+.L_00_48:
+ vext.8 q8,q0,q1,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q2,q3,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q0,q0,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d7,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d7,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d7,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q0,q0,q9
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d7,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d7,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d0,d0,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d0,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d0,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d0,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ vshr.u32 d24,d0,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d0,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d1,d1,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q0
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q1,q2,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q3,q0,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q1,q1,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d1,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d1,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d1,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q1,q1,q9
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d1,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d1,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d2,d2,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d2,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d2,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d2,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ vshr.u32 d24,d2,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d2,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d3,d3,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q1
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vext.8 q8,q2,q3,#4
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ vext.8 q9,q0,q1,#4
+ add r4,r4,r12
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vadd.i32 q2,q2,q9
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ veor q9,q9,q10
+ add r10,r10,r2
+ vsli.32 q11,q8,#14
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ vshr.u32 d24,d3,#17
+ add r11,r11,r3
+ and r2,r2,r7
+ veor q9,q9,q11
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ vsli.32 d24,d3,#15
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ vshr.u32 d25,d3,#10
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ vadd.i32 q2,q2,q9
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r6,r6,r10
+ vshr.u32 d24,d3,#19
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ vsli.32 d24,d3,#13
+ add r9,r9,r2
+ eor r2,r7,r8
+ veor d25,d25,d24
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ vadd.i32 d4,d4,d25
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ vshr.u32 d24,d4,#17
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ vsli.32 d24,d4,#15
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ vshr.u32 d25,d4,#10
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ vshr.u32 d24,d4,#19
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ vld1.32 {q8},[r14,:128]!
+ add r8,r8,r2
+ vsli.32 d24,d4,#13
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ veor d25,d25,d24
+ add r9,r9,r3
+ and r2,r2,r5
+ vadd.i32 d5,d5,d25
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ vadd.i32 q8,q8,q2
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ vst1.32 {q8},[r1,:128]!
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vext.8 q8,q3,q0,#4
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ vext.8 q9,q1,q2,#4
+ add r8,r8,r12
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ vshr.u32 q10,q8,#7
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vadd.i32 q3,q3,q9
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ vshr.u32 q9,q8,#3
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vsli.32 q10,q8,#25
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ vshr.u32 q11,q8,#18
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ veor q9,q9,q10
+ add r6,r6,r2
+ vsli.32 q11,q8,#14
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ vshr.u32 d24,d5,#17
+ add r7,r7,r3
+ and r2,r2,r11
+ veor q9,q9,q11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ vsli.32 d24,d5,#15
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ vshr.u32 d25,d5,#10
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ vadd.i32 q3,q3,q9
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ veor d25,d25,d24
+ and r12,r12,r3
+ add r10,r10,r6
+ vshr.u32 d24,d5,#19
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ vsli.32 d24,d5,#13
+ add r5,r5,r2
+ eor r2,r11,r4
+ veor d25,d25,d24
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ vadd.i32 d6,d6,d25
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ vshr.u32 d24,d6,#17
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ vsli.32 d24,d6,#15
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ vshr.u32 d25,d6,#10
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ veor d25,d25,d24
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ vshr.u32 d24,d6,#19
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ vld1.32 {q8},[r14,:128]!
+ add r4,r4,r2
+ vsli.32 d24,d6,#13
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ veor d25,d25,d24
+ add r5,r5,r3
+ and r2,r2,r9
+ vadd.i32 d7,d7,d25
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ vadd.i32 q8,q8,q3
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[r14]
+ and r12,r12,r3
+ add r8,r8,r4
+ vst1.32 {q8},[r1,:128]!
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ teq r2,#0 @ check for K256 terminator
+ ldr r2,[sp,#0]
+ sub r1,r1,#64
+ bne .L_00_48
+
+ ldr r1,[sp,#68]
+ ldr r0,[sp,#72]
+ sub r14,r14,#256 @ rewind r14
+ teq r1,r0
+ it eq
+ subeq r1,r1,#64 @ avoid SEGV
+ vld1.8 {q0},[r1]! @ load next input block
+ vld1.8 {q1},[r1]!
+ vld1.8 {q2},[r1]!
+ vld1.8 {q3},[r1]!
+ it ne
+ strne r1,[sp,#68]
+ mov r1,sp
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q0,q0
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q0
+ ldr r2,[sp,#4]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#8]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#12]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#16]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q1,q1
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q1
+ ldr r2,[sp,#20]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#24]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#28]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#32]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ add r11,r11,r2
+ eor r2,r9,r10
+ eor r0,r8,r8,ror#5
+ add r4,r4,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r8
+ eor r12,r0,r8,ror#19
+ eor r0,r4,r4,ror#11
+ eor r2,r2,r10
+ vrev32.8 q2,q2
+ add r11,r11,r12,ror#6
+ eor r12,r4,r5
+ eor r0,r0,r4,ror#20
+ add r11,r11,r2
+ vadd.i32 q8,q8,q2
+ ldr r2,[sp,#36]
+ and r3,r3,r12
+ add r7,r7,r11
+ add r11,r11,r0,ror#2
+ eor r3,r3,r5
+ add r10,r10,r2
+ eor r2,r8,r9
+ eor r0,r7,r7,ror#5
+ add r11,r11,r3
+ and r2,r2,r7
+ eor r3,r0,r7,ror#19
+ eor r0,r11,r11,ror#11
+ eor r2,r2,r9
+ add r10,r10,r3,ror#6
+ eor r3,r11,r4
+ eor r0,r0,r11,ror#20
+ add r10,r10,r2
+ ldr r2,[sp,#40]
+ and r12,r12,r3
+ add r6,r6,r10
+ add r10,r10,r0,ror#2
+ eor r12,r12,r4
+ add r9,r9,r2
+ eor r2,r7,r8
+ eor r0,r6,r6,ror#5
+ add r10,r10,r12
+ and r2,r2,r6
+ eor r12,r0,r6,ror#19
+ eor r0,r10,r10,ror#11
+ eor r2,r2,r8
+ add r9,r9,r12,ror#6
+ eor r12,r10,r11
+ eor r0,r0,r10,ror#20
+ add r9,r9,r2
+ ldr r2,[sp,#44]
+ and r3,r3,r12
+ add r5,r5,r9
+ add r9,r9,r0,ror#2
+ eor r3,r3,r11
+ add r8,r8,r2
+ eor r2,r6,r7
+ eor r0,r5,r5,ror#5
+ add r9,r9,r3
+ and r2,r2,r5
+ eor r3,r0,r5,ror#19
+ eor r0,r9,r9,ror#11
+ eor r2,r2,r7
+ add r8,r8,r3,ror#6
+ eor r3,r9,r10
+ eor r0,r0,r9,ror#20
+ add r8,r8,r2
+ ldr r2,[sp,#48]
+ and r12,r12,r3
+ add r4,r4,r8
+ add r8,r8,r0,ror#2
+ eor r12,r12,r10
+ vst1.32 {q8},[r1,:128]!
+ add r7,r7,r2
+ eor r2,r5,r6
+ eor r0,r4,r4,ror#5
+ add r8,r8,r12
+ vld1.32 {q8},[r14,:128]!
+ and r2,r2,r4
+ eor r12,r0,r4,ror#19
+ eor r0,r8,r8,ror#11
+ eor r2,r2,r6
+ vrev32.8 q3,q3
+ add r7,r7,r12,ror#6
+ eor r12,r8,r9
+ eor r0,r0,r8,ror#20
+ add r7,r7,r2
+ vadd.i32 q8,q8,q3
+ ldr r2,[sp,#52]
+ and r3,r3,r12
+ add r11,r11,r7
+ add r7,r7,r0,ror#2
+ eor r3,r3,r9
+ add r6,r6,r2
+ eor r2,r4,r5
+ eor r0,r11,r11,ror#5
+ add r7,r7,r3
+ and r2,r2,r11
+ eor r3,r0,r11,ror#19
+ eor r0,r7,r7,ror#11
+ eor r2,r2,r5
+ add r6,r6,r3,ror#6
+ eor r3,r7,r8
+ eor r0,r0,r7,ror#20
+ add r6,r6,r2
+ ldr r2,[sp,#56]
+ and r12,r12,r3
+ add r10,r10,r6
+ add r6,r6,r0,ror#2
+ eor r12,r12,r8
+ add r5,r5,r2
+ eor r2,r11,r4
+ eor r0,r10,r10,ror#5
+ add r6,r6,r12
+ and r2,r2,r10
+ eor r12,r0,r10,ror#19
+ eor r0,r6,r6,ror#11
+ eor r2,r2,r4
+ add r5,r5,r12,ror#6
+ eor r12,r6,r7
+ eor r0,r0,r6,ror#20
+ add r5,r5,r2
+ ldr r2,[sp,#60]
+ and r3,r3,r12
+ add r9,r9,r5
+ add r5,r5,r0,ror#2
+ eor r3,r3,r7
+ add r4,r4,r2
+ eor r2,r10,r11
+ eor r0,r9,r9,ror#5
+ add r5,r5,r3
+ and r2,r2,r9
+ eor r3,r0,r9,ror#19
+ eor r0,r5,r5,ror#11
+ eor r2,r2,r11
+ add r4,r4,r3,ror#6
+ eor r3,r5,r6
+ eor r0,r0,r5,ror#20
+ add r4,r4,r2
+ ldr r2,[sp,#64]
+ and r12,r12,r3
+ add r8,r8,r4
+ add r4,r4,r0,ror#2
+ eor r12,r12,r6
+ vst1.32 {q8},[r1,:128]!
+ ldr r0,[r2,#0]
+ add r4,r4,r12 @ h+=Maj(a,b,c) from the past
+ ldr r12,[r2,#4]
+ ldr r3,[r2,#8]
+ ldr r1,[r2,#12]
+ add r4,r4,r0 @ accumulate
+ ldr r0,[r2,#16]
+ add r5,r5,r12
+ ldr r12,[r2,#20]
+ add r6,r6,r3
+ ldr r3,[r2,#24]
+ add r7,r7,r1
+ ldr r1,[r2,#28]
+ add r8,r8,r0
+ str r4,[r2],#4
+ add r9,r9,r12
+ str r5,[r2],#4
+ add r10,r10,r3
+ str r6,[r2],#4
+ add r11,r11,r1
+ str r7,[r2],#4
+ stmia r2,{r8,r9,r10,r11}
+
+ ittte ne
+ movne r1,sp
+ ldrne r2,[sp,#0]
+ eorne r12,r12,r12
+ ldreq sp,[sp,#76] @ restore original sp
+ itt ne
+ eorne r3,r5,r6
+ bne .L_00_48
+
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+
+# if defined(__thumb2__)
+# define INST(a,b,c,d) .byte c,d|0xc,a,b
+# else
+# define INST(a,b,c,d) .byte a,b,c,d
+# endif
+
+.type sha256_block_data_order_armv8,%function
+.align 5
+sha256_block_data_order_armv8:
+.LARMv8:
+ vld1.32 {q0,q1},[r0]
+ sub r3,r3,#256+32
+ add r2,r1,r2,lsl#6 @ len to point at the end of inp
+ b .Loop_v8
+
+.align 4
+.Loop_v8:
+ vld1.8 {q8,q9},[r1]!
+ vld1.8 {q10,q11},[r1]!
+ vld1.32 {q12},[r3]!
+ vrev32.8 q8,q8
+ vrev32.8 q9,q9
+ vrev32.8 q10,q10
+ vrev32.8 q11,q11
+ vmov q14,q0 @ offload
+ vmov q15,q1
+ teq r1,r2
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q10
+ INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+ INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q11
+ INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+ INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10
+ vld1.32 {q13},[r3]!
+ vadd.i32 q12,q12,q8
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vld1.32 {q12},[r3]!
+ vadd.i32 q13,q13,q9
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vld1.32 {q13},[r3]
+ vadd.i32 q12,q12,q10
+ sub r3,r3,#256-16 @ rewind
+ vmov q2,q0
+ INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12
+ INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12
+
+ vadd.i32 q13,q13,q11
+ vmov q2,q0
+ INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13
+ INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13
+
+ vadd.i32 q0,q0,q14
+ vadd.i32 q1,q1,q15
+ it ne
+ bne .Loop_v8
+
+ vst1.32 {q0,q1},[r0]
+
+ bx lr @ bx lr
+.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
+#endif
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S
@@ -1,0 +1,1894 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+@
+@ Licensed under the OpenSSL license (the "License"). You may not use
+@ this file except in compliance with the License. You can obtain a copy
+@ in the file LICENSE in the source distribution or at
+@ https://www.openssl.org/source/license.html
+
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA512 block procedure for ARMv4. September 2007.
+
+@ This code is ~4.5 (four and a half) times faster than code generated
+@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+@ Xscale PXA250 core].
+@
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 6% improvement on
+@ Cortex A8 core and ~40 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 7%
+@ improvement on Cortex A8 core and ~38 cycles per byte.
+
+@ March 2011.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process
+@ one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+@ August 2012.
+@
+@ Improve NEON performance by 12% on Snapdragon S4. In absolute
+@ terms it's 22.6 cycles per byte, which is a disappointing result.
+@ Technical writers asserted that the 3-way S4 pipeline can sustain
+@ multiple NEON instructions per cycle, but dual NEON issue could
+@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+@ for further details. On a side note, Cortex-A15 processes one byte
+@ in 16 cycles.
+
+@ Byte order [in]dependence. =========================================
+@
+@ Originally the caller was expected to maintain a specific *dword* order
+@ in h[0-7], namely with the most significant dword at the *lower* address,
+@ which was reflected in the two parameters below as 0 and 4. Now the caller
+@ is expected to maintain native byte order for whole 64-bit values.
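+@
+@ As a hedged illustration of that convention (illustrative value and
+@ placeholder register names, not taken from the code below): suppose
+@ h[4] = 0x0123456789abcdef sits at byte offset 32 in native order. Its
+@ halves are fetched as ldr rL,[r0,#32+LO] and ldr rH,[r0,#32+HI]. On
+@ little-endian builds (LO=0, HI=4) this gives rL=0x89abcdef and
+@ rH=0x01234567; on big-endian builds (LO=4, HI=0) the same register
+@ values are obtained from the byte-swapped layout.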
+#ifndef __KERNEL__
+# include <openssl/arm_arch.h>
+# define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
+# define VFP_ABI_POP vldmia sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
+@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions.
+.arch armv7-a
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
+#endif
+
+.text
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+# define adrl adr
+#else
+.code 32
+#endif
+
+.type K512,%object
+.align 5
+K512:
+ WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+ WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+ WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+ WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+ WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+ WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+ WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+ WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+ WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+ WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+ WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+ WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+ WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+ WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+ WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+ WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+ WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+ WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+ WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+ WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+ WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+ WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+ WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+ WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+ WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+ WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+ WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+ WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+ WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+ WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+ WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+ WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+ WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+ WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+ WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+ WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+ WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+ WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+ WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
+.size K512,.-K512
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word OPENSSL_armcap_P-.Lsha512_block_data_order
+.skip 32-4
+#else
+.skip 32
+#endif
+
+.globl sha512_block_data_order
+.hidden sha512_block_data_order
+.type sha512_block_data_order,%function
+sha512_block_data_order:
+.Lsha512_block_data_order:
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+ sub r3,pc,#8 @ sha512_block_data_order
+#else
+ adr r3,.Lsha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+ ldr r12,.LOPENSSL_armcap
+ ldr r12,[r3,r12] @ OPENSSL_armcap_P
+#ifdef __APPLE__
+ ldr r12,[r12]
+#endif
+ tst r12,#ARMV7_NEON
+ bne .LNEON
+#endif
+ add r2,r1,r2,lsl#7 @ len to point at the end of inp
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ sub r14,r3,#672 @ K512
+ sub sp,sp,#9*8
+
+ ldr r7,[r0,#32+LO]
+ ldr r8,[r0,#32+HI]
+ ldr r9, [r0,#48+LO]
+ ldr r10, [r0,#48+HI]
+ ldr r11, [r0,#56+LO]
+ ldr r12, [r0,#56+HI]
+.Loop:
+ str r9, [sp,#48+0]
+ str r10, [sp,#48+4]
+ str r11, [sp,#56+0]
+ str r12, [sp,#56+4]
+ ldr r5,[r0,#0+LO]
+ ldr r6,[r0,#0+HI]
+ ldr r3,[r0,#8+LO]
+ ldr r4,[r0,#8+HI]
+ ldr r9, [r0,#16+LO]
+ ldr r10, [r0,#16+HI]
+ ldr r11, [r0,#24+LO]
+ ldr r12, [r0,#24+HI]
+ str r3,[sp,#8+0]
+ str r4,[sp,#8+4]
+ str r9, [sp,#16+0]
+ str r10, [sp,#16+4]
+ str r11, [sp,#24+0]
+ str r12, [sp,#24+4]
+ ldr r3,[r0,#40+LO]
+ ldr r4,[r0,#40+HI]
+ str r3,[sp,#40+0]
+ str r4,[sp,#40+4]
+
+.L00_15:
+#if __ARM_ARCH__<7
+ ldrb r3,[r1,#7]
+ ldrb r9, [r1,#6]
+ ldrb r10, [r1,#5]
+ ldrb r11, [r1,#4]
+ ldrb r4,[r1,#3]
+ ldrb r12, [r1,#2]
+ orr r3,r3,r9,lsl#8
+ ldrb r9, [r1,#1]
+ orr r3,r3,r10,lsl#16
+ ldrb r10, [r1],#8
+ orr r3,r3,r11,lsl#24
+ orr r4,r4,r12,lsl#8
+ orr r4,r4,r9,lsl#16
+ orr r4,r4,r10,lsl#24
+#else
+ ldr r3,[r1,#4]
+ ldr r4,[r1],#8
+#ifdef __ARMEL__
+ rev r3,r3
+ rev r4,r4
+#endif
+#endif
+ @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
+ @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+ @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+ mov r9,r7,lsr#14
+ str r3,[sp,#64+0]
+ mov r10,r8,lsr#14
+ str r4,[sp,#64+4]
+ eor r9,r9,r8,lsl#18
+ ldr r11,[sp,#56+0] @ h.lo
+ eor r10,r10,r7,lsl#18
+ ldr r12,[sp,#56+4] @ h.hi
+ eor r9,r9,r7,lsr#18
+ eor r10,r10,r8,lsr#18
+ eor r9,r9,r8,lsl#14
+ eor r10,r10,r7,lsl#14
+ eor r9,r9,r8,lsr#9
+ eor r10,r10,r7,lsr#9
+ eor r9,r9,r7,lsl#23
+ eor r10,r10,r8,lsl#23 @ Sigma1(e)
+ adds r3,r3,r9
+ ldr r9,[sp,#40+0] @ f.lo
+ adc r4,r4,r10 @ T += Sigma1(e)
+ ldr r10,[sp,#40+4] @ f.hi
+ adds r3,r3,r11
+ ldr r11,[sp,#48+0] @ g.lo
+ adc r4,r4,r12 @ T += h
+ ldr r12,[sp,#48+4] @ g.hi
+
+ eor r9,r9,r11
+ str r7,[sp,#32+0]
+ eor r10,r10,r12
+ str r8,[sp,#32+4]
+ and r9,r9,r7
+ str r5,[sp,#0+0]
+ and r10,r10,r8
+ str r6,[sp,#0+4]
+ eor r9,r9,r11
+ ldr r11,[r14,#LO] @ K[i].lo
+ eor r10,r10,r12 @ Ch(e,f,g)
+ ldr r12,[r14,#HI] @ K[i].hi
+
+ adds r3,r3,r9
+ ldr r7,[sp,#24+0] @ d.lo
+ adc r4,r4,r10 @ T += Ch(e,f,g)
+ ldr r8,[sp,#24+4] @ d.hi
+ adds r3,r3,r11
+ and r9,r11,#0xff
+ adc r4,r4,r12 @ T += K[i]
+ adds r7,r7,r3
+ ldr r11,[sp,#8+0] @ b.lo
+ adc r8,r8,r4 @ d += T
+ teq r9,#148
+
+ ldr r12,[sp,#16+0] @ c.lo
+#if __ARM_ARCH__>=7
+ it eq @ Thumb2 thing, sanity check in ARM
+#endif
+ orreq r14,r14,#1
+ @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+ @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+ @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+ mov r9,r5,lsr#28
+ mov r10,r6,lsr#28
+ eor r9,r9,r6,lsl#4
+ eor r10,r10,r5,lsl#4
+ eor r9,r9,r6,lsr#2
+ eor r10,r10,r5,lsr#2
+ eor r9,r9,r5,lsl#30
+ eor r10,r10,r6,lsl#30
+ eor r9,r9,r6,lsr#7
+ eor r10,r10,r5,lsr#7
+ eor r9,r9,r5,lsl#25
+ eor r10,r10,r6,lsl#25 @ Sigma0(a)
+ adds r3,r3,r9
+ and r9,r5,r11
+ adc r4,r4,r10 @ T += Sigma0(a)
+
+ ldr r10,[sp,#8+4] @ b.hi
+ orr r5,r5,r11
+ ldr r11,[sp,#16+4] @ c.hi
+ and r5,r5,r12
+ and r12,r6,r10
+ orr r6,r6,r10
+ orr r5,r5,r9 @ Maj(a,b,c).lo
+ and r6,r6,r11
+ adds r5,r5,r3
+ orr r6,r6,r12 @ Maj(a,b,c).hi
+ sub sp,sp,#8
+ adc r6,r6,r4 @ h += T
+ tst r14,#1
+ add r14,r14,#8
+ tst r14,#1
+ beq .L00_15
+ ldr r9,[sp,#184+0]
+ ldr r10,[sp,#184+4]
+ bic r14,r14,#1
+.L16_79:
+ @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
+ @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+ @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
+ mov r3,r9,lsr#1
+ ldr r11,[sp,#80+0]
+ mov r4,r10,lsr#1
+ ldr r12,[sp,#80+4]
+ eor r3,r3,r10,lsl#31
+ eor r4,r4,r9,lsl#31
+ eor r3,r3,r9,lsr#8
+ eor r4,r4,r10,lsr#8
+ eor r3,r3,r10,lsl#24
+ eor r4,r4,r9,lsl#24
+ eor r3,r3,r9,lsr#7
+ eor r4,r4,r10,lsr#7
+ eor r3,r3,r10,lsl#25
+
+ @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+ @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+ @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+ mov r9,r11,lsr#19
+ mov r10,r12,lsr#19
+ eor r9,r9,r12,lsl#13
+ eor r10,r10,r11,lsl#13
+ eor r9,r9,r12,lsr#29
+ eor r10,r10,r11,lsr#29
+ eor r9,r9,r11,lsl#3
+ eor r10,r10,r12,lsl#3
+ eor r9,r9,r11,lsr#6
+ eor r10,r10,r12,lsr#6
+ ldr r11,[sp,#120+0]
+ eor r9,r9,r12,lsl#26
+
+ ldr r12,[sp,#120+4]
+ adds r3,r3,r9
+ ldr r9,[sp,#192+0]
+ adc r4,r4,r10
+
+ ldr r10,[sp,#192+4]
+ adds r3,r3,r11
+ adc r4,r4,r12
+ adds r3,r3,r9
+ adc r4,r4,r10
+ @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
+ @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+ @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+ mov r9,r7,lsr#14
+ str r3,[sp,#64+0]
+ mov r10,r8,lsr#14
+ str r4,[sp,#64+4]
+ eor r9,r9,r8,lsl#18
+ ldr r11,[sp,#56+0] @ h.lo
+ eor r10,r10,r7,lsl#18
+ ldr r12,[sp,#56+4] @ h.hi
+ eor r9,r9,r7,lsr#18
+ eor r10,r10,r8,lsr#18
+ eor r9,r9,r8,lsl#14
+ eor r10,r10,r7,lsl#14
+ eor r9,r9,r8,lsr#9
+ eor r10,r10,r7,lsr#9
+ eor r9,r9,r7,lsl#23
+ eor r10,r10,r8,lsl#23 @ Sigma1(e)
+ adds r3,r3,r9
+ ldr r9,[sp,#40+0] @ f.lo
+ adc r4,r4,r10 @ T += Sigma1(e)
+ ldr r10,[sp,#40+4] @ f.hi
+ adds r3,r3,r11
+ ldr r11,[sp,#48+0] @ g.lo
+ adc r4,r4,r12 @ T += h
+ ldr r12,[sp,#48+4] @ g.hi
+
+ eor r9,r9,r11
+ str r7,[sp,#32+0]
+ eor r10,r10,r12
+ str r8,[sp,#32+4]
+ and r9,r9,r7
+ str r5,[sp,#0+0]
+ and r10,r10,r8
+ str r6,[sp,#0+4]
+ eor r9,r9,r11
+ ldr r11,[r14,#LO] @ K[i].lo
+ eor r10,r10,r12 @ Ch(e,f,g)
+ ldr r12,[r14,#HI] @ K[i].hi
+
+ adds r3,r3,r9
+ ldr r7,[sp,#24+0] @ d.lo
+ adc r4,r4,r10 @ T += Ch(e,f,g)
+ ldr r8,[sp,#24+4] @ d.hi
+ adds r3,r3,r11
+ and r9,r11,#0xff
+ adc r4,r4,r12 @ T += K[i]
+ adds r7,r7,r3
+ ldr r11,[sp,#8+0] @ b.lo
+ adc r8,r8,r4 @ d += T
+ teq r9,#23
+
+ ldr r12,[sp,#16+0] @ c.lo
+#if __ARM_ARCH__>=7
+ it eq @ Thumb2 thing, sanity check in ARM
+#endif
+ orreq r14,r14,#1
+ @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+ @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+ @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+ mov r9,r5,lsr#28
+ mov r10,r6,lsr#28
+ eor r9,r9,r6,lsl#4
+ eor r10,r10,r5,lsl#4
+ eor r9,r9,r6,lsr#2
+ eor r10,r10,r5,lsr#2
+ eor r9,r9,r5,lsl#30
+ eor r10,r10,r6,lsl#30
+ eor r9,r9,r6,lsr#7
+ eor r10,r10,r5,lsr#7
+ eor r9,r9,r5,lsl#25
+ eor r10,r10,r6,lsl#25 @ Sigma0(a)
+ adds r3,r3,r9
+ and r9,r5,r11
+ adc r4,r4,r10 @ T += Sigma0(a)
+
+ ldr r10,[sp,#8+4] @ b.hi
+ orr r5,r5,r11
+ ldr r11,[sp,#16+4] @ c.hi
+ and r5,r5,r12
+ and r12,r6,r10
+ orr r6,r6,r10
+ orr r5,r5,r9 @ Maj(a,b,c).lo
+ and r6,r6,r11
+ adds r5,r5,r3
+ orr r6,r6,r12 @ Maj(a,b,c).hi
+ sub sp,sp,#8
+ adc r6,r6,r4 @ h += T
+ tst r14,#1
+ add r14,r14,#8
+#if __ARM_ARCH__>=7
+ ittt eq @ Thumb2 thing, sanity check in ARM
+#endif
+ ldreq r9,[sp,#184+0]
+ ldreq r10,[sp,#184+4]
+ beq .L16_79
+ bic r14,r14,#1
+
+ ldr r3,[sp,#8+0]
+ ldr r4,[sp,#8+4]
+ ldr r9, [r0,#0+LO]
+ ldr r10, [r0,#0+HI]
+ ldr r11, [r0,#8+LO]
+ ldr r12, [r0,#8+HI]
+ adds r9,r5,r9
+ str r9, [r0,#0+LO]
+ adc r10,r6,r10
+ str r10, [r0,#0+HI]
+ adds r11,r3,r11
+ str r11, [r0,#8+LO]
+ adc r12,r4,r12
+ str r12, [r0,#8+HI]
+
+ ldr r5,[sp,#16+0]
+ ldr r6,[sp,#16+4]
+ ldr r3,[sp,#24+0]
+ ldr r4,[sp,#24+4]
+ ldr r9, [r0,#16+LO]
+ ldr r10, [r0,#16+HI]
+ ldr r11, [r0,#24+LO]
+ ldr r12, [r0,#24+HI]
+ adds r9,r5,r9
+ str r9, [r0,#16+LO]
+ adc r10,r6,r10
+ str r10, [r0,#16+HI]
+ adds r11,r3,r11
+ str r11, [r0,#24+LO]
+ adc r12,r4,r12
+ str r12, [r0,#24+HI]
+
+ ldr r3,[sp,#40+0]
+ ldr r4,[sp,#40+4]
+ ldr r9, [r0,#32+LO]
+ ldr r10, [r0,#32+HI]
+ ldr r11, [r0,#40+LO]
+ ldr r12, [r0,#40+HI]
+ adds r7,r7,r9
+ str r7,[r0,#32+LO]
+ adc r8,r8,r10
+ str r8,[r0,#32+HI]
+ adds r11,r3,r11
+ str r11, [r0,#40+LO]
+ adc r12,r4,r12
+ str r12, [r0,#40+HI]
+
+ ldr r5,[sp,#48+0]
+ ldr r6,[sp,#48+4]
+ ldr r3,[sp,#56+0]
+ ldr r4,[sp,#56+4]
+ ldr r9, [r0,#48+LO]
+ ldr r10, [r0,#48+HI]
+ ldr r11, [r0,#56+LO]
+ ldr r12, [r0,#56+HI]
+ adds r9,r5,r9
+ str r9, [r0,#48+LO]
+ adc r10,r6,r10
+ str r10, [r0,#48+HI]
+ adds r11,r3,r11
+ str r11, [r0,#56+LO]
+ adc r12,r4,r12
+ str r12, [r0,#56+HI]
+
+ add sp,sp,#640
+ sub r14,r14,#640
+
+ teq r1,r2
+ bne .Loop
+
+ add sp,sp,#8*9 @ destroy frame
+#if __ARM_ARCH__>=5
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+ ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+ tst lr,#1
+ moveq pc,lr @ be binary compatible with V4, yet
+.word 0xe12fff1e @ interoperable with Thumb ISA:-)
+#endif
+.size sha512_block_data_order,.-sha512_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch armv7-a
+.fpu neon
+
+.globl sha512_block_data_order_neon
+.hidden sha512_block_data_order_neon
+.type sha512_block_data_order_neon,%function
+.align 4
+sha512_block_data_order_neon:
+.LNEON:
+ dmb @ errata #451034 on early Cortex A8
+ add r2,r1,r2,lsl#7 @ len to point at the end of inp
+ adr r3,K512
+ VFP_ABI_PUSH
+ vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context
+.Loop_neon:
+ vshr.u64 d24,d20,#14 @ 0
+#if 0<16
+ vld1.64 {d0},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d20,#18
+#if 0>0
+ vadd.i64 d16,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d20,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 0<16 && defined(__ARMEL__)
+ vrev64.8 d0,d0
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d0
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 1
+#if 1<16
+ vld1.64 {d1},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 1>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 1<16 && defined(__ARMEL__)
+ vrev64.8 d1,d1
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d1
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 d24,d18,#14 @ 2
+#if 2<16
+ vld1.64 {d2},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d18,#18
+#if 2>0
+ vadd.i64 d22,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d18,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 2<16 && defined(__ARMEL__)
+ vrev64.8 d2,d2
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d2
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 3
+#if 3<16
+ vld1.64 {d3},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 3>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 3<16 && defined(__ARMEL__)
+ vrev64.8 d3,d3
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d3
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 d24,d16,#14 @ 4
+#if 4<16
+ vld1.64 {d4},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d16,#18
+#if 4>0
+ vadd.i64 d20,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d16,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 4<16 && defined(__ARMEL__)
+ vrev64.8 d4,d4
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d4
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 5
+#if 5<16
+ vld1.64 {d5},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 5>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 5<16 && defined(__ARMEL__)
+ vrev64.8 d5,d5
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d5
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 d24,d22,#14 @ 6
+#if 6<16
+ vld1.64 {d6},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d22,#18
+#if 6>0
+ vadd.i64 d18,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d22,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 6<16 && defined(__ARMEL__)
+ vrev64.8 d6,d6
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d6
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 7
+#if 7<16
+ vld1.64 {d7},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 7>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 7<16 && defined(__ARMEL__)
+ vrev64.8 d7,d7
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d7
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ vshr.u64 d24,d20,#14 @ 8
+#if 8<16
+ vld1.64 {d8},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d20,#18
+#if 8>0
+ vadd.i64 d16,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d20,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 8<16 && defined(__ARMEL__)
+ vrev64.8 d8,d8
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d8
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 9
+#if 9<16
+ vld1.64 {d9},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 9>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 9<16 && defined(__ARMEL__)
+ vrev64.8 d9,d9
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d9
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 d24,d18,#14 @ 10
+#if 10<16
+ vld1.64 {d10},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d18,#18
+#if 10>0
+ vadd.i64 d22,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d18,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 10<16 && defined(__ARMEL__)
+ vrev64.8 d10,d10
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d10
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 11
+#if 11<16
+ vld1.64 {d11},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 11>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 11<16 && defined(__ARMEL__)
+ vrev64.8 d11,d11
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d11
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 d24,d16,#14 @ 12
+#if 12<16
+ vld1.64 {d12},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d16,#18
+#if 12>0
+ vadd.i64 d20,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d16,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 12<16 && defined(__ARMEL__)
+ vrev64.8 d12,d12
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d12
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 13
+#if 13<16
+ vld1.64 {d13},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 13>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 13<16 && defined(__ARMEL__)
+ vrev64.8 d13,d13
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d13
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 d24,d22,#14 @ 14
+#if 14<16
+ vld1.64 {d14},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d22,#18
+#if 14>0
+ vadd.i64 d18,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d22,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 14<16 && defined(__ARMEL__)
+ vrev64.8 d14,d14
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d14
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 15
+#if 15<16
+ vld1.64 {d15},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 15>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 15<16 && defined(__ARMEL__)
+ vrev64.8 d15,d15
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d15
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ mov r12,#4
+.L16_79_neon:
+ subs r12,#1
+ vshr.u64 q12,q7,#19
+ vshr.u64 q13,q7,#61
+ vadd.i64 d16,d30 @ h+=Maj from the past
+ vshr.u64 q15,q7,#6
+ vsli.64 q12,q7,#45
+ vext.8 q14,q0,q1,#8 @ X[i+1]
+ vsli.64 q13,q7,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q0,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q4,q5,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d20,#14 @ from NEON_00_15
+ vadd.i64 q0,q14
+ vshr.u64 d25,d20,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d20,#41 @ from NEON_00_15
+ vadd.i64 q0,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 16<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d0
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 17
+#if 17<16
+ vld1.64 {d1},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 17>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 17<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d1
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 q12,q0,#19
+ vshr.u64 q13,q0,#61
+ vadd.i64 d22,d30 @ h+=Maj from the past
+ vshr.u64 q15,q0,#6
+ vsli.64 q12,q0,#45
+ vext.8 q14,q1,q2,#8 @ X[i+1]
+ vsli.64 q13,q0,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q1,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q5,q6,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d18,#14 @ from NEON_00_15
+ vadd.i64 q1,q14
+ vshr.u64 d25,d18,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d18,#41 @ from NEON_00_15
+ vadd.i64 q1,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 18<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d2
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 19
+#if 19<16
+ vld1.64 {d3},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 19>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 19<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d3
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 q12,q1,#19
+ vshr.u64 q13,q1,#61
+ vadd.i64 d20,d30 @ h+=Maj from the past
+ vshr.u64 q15,q1,#6
+ vsli.64 q12,q1,#45
+ vext.8 q14,q2,q3,#8 @ X[i+1]
+ vsli.64 q13,q1,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q2,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q6,q7,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d16,#14 @ from NEON_00_15
+ vadd.i64 q2,q14
+ vshr.u64 d25,d16,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d16,#41 @ from NEON_00_15
+ vadd.i64 q2,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 20<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d4
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 21
+#if 21<16
+ vld1.64 {d5},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 21>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 21<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d5
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 q12,q2,#19
+ vshr.u64 q13,q2,#61
+ vadd.i64 d18,d30 @ h+=Maj from the past
+ vshr.u64 q15,q2,#6
+ vsli.64 q12,q2,#45
+ vext.8 q14,q3,q4,#8 @ X[i+1]
+ vsli.64 q13,q2,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q3,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q7,q0,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d22,#14 @ from NEON_00_15
+ vadd.i64 q3,q14
+ vshr.u64 d25,d22,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d22,#41 @ from NEON_00_15
+ vadd.i64 q3,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 22<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d6
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 23
+#if 23<16
+ vld1.64 {d7},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 23>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 23<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d7
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ vshr.u64 q12,q3,#19
+ vshr.u64 q13,q3,#61
+ vadd.i64 d16,d30 @ h+=Maj from the past
+ vshr.u64 q15,q3,#6
+ vsli.64 q12,q3,#45
+ vext.8 q14,q4,q5,#8 @ X[i+1]
+ vsli.64 q13,q3,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q4,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q0,q1,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d20,#14 @ from NEON_00_15
+ vadd.i64 q4,q14
+ vshr.u64 d25,d20,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d20,#41 @ from NEON_00_15
+ vadd.i64 q4,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d20,#50
+ vsli.64 d25,d20,#46
+ vmov d29,d20
+ vsli.64 d26,d20,#23
+#if 24<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d21,d22 @ Ch(e,f,g)
+ vshr.u64 d24,d16,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d23
+ vshr.u64 d25,d16,#34
+ vsli.64 d24,d16,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d16,#39
+ vadd.i64 d28,d8
+ vsli.64 d25,d16,#30
+ veor d30,d16,d17
+ vsli.64 d26,d16,#25
+ veor d23,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d18,d17 @ Maj(a,b,c)
+ veor d23,d26 @ Sigma0(a)
+ vadd.i64 d19,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d23,d30
+ vshr.u64 d24,d19,#14 @ 25
+#if 25<16
+ vld1.64 {d9},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d19,#18
+#if 25>0
+ vadd.i64 d23,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d19,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d19,#50
+ vsli.64 d25,d19,#46
+ vmov d29,d19
+ vsli.64 d26,d19,#23
+#if 25<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d20,d21 @ Ch(e,f,g)
+ vshr.u64 d24,d23,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d22
+ vshr.u64 d25,d23,#34
+ vsli.64 d24,d23,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d23,#39
+ vadd.i64 d28,d9
+ vsli.64 d25,d23,#30
+ veor d30,d23,d16
+ vsli.64 d26,d23,#25
+ veor d22,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d17,d16 @ Maj(a,b,c)
+ veor d22,d26 @ Sigma0(a)
+ vadd.i64 d18,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d22,d30
+ vshr.u64 q12,q4,#19
+ vshr.u64 q13,q4,#61
+ vadd.i64 d22,d30 @ h+=Maj from the past
+ vshr.u64 q15,q4,#6
+ vsli.64 q12,q4,#45
+ vext.8 q14,q5,q6,#8 @ X[i+1]
+ vsli.64 q13,q4,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q5,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q1,q2,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d18,#14 @ from NEON_00_15
+ vadd.i64 q5,q14
+ vshr.u64 d25,d18,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d18,#41 @ from NEON_00_15
+ vadd.i64 q5,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d18,#50
+ vsli.64 d25,d18,#46
+ vmov d29,d18
+ vsli.64 d26,d18,#23
+#if 26<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d19,d20 @ Ch(e,f,g)
+ vshr.u64 d24,d22,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d21
+ vshr.u64 d25,d22,#34
+ vsli.64 d24,d22,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d22,#39
+ vadd.i64 d28,d10
+ vsli.64 d25,d22,#30
+ veor d30,d22,d23
+ vsli.64 d26,d22,#25
+ veor d21,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d16,d23 @ Maj(a,b,c)
+ veor d21,d26 @ Sigma0(a)
+ vadd.i64 d17,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d21,d30
+ vshr.u64 d24,d17,#14 @ 27
+#if 27<16
+ vld1.64 {d11},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d17,#18
+#if 27>0
+ vadd.i64 d21,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d17,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d17,#50
+ vsli.64 d25,d17,#46
+ vmov d29,d17
+ vsli.64 d26,d17,#23
+#if 27<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d18,d19 @ Ch(e,f,g)
+ vshr.u64 d24,d21,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d20
+ vshr.u64 d25,d21,#34
+ vsli.64 d24,d21,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d21,#39
+ vadd.i64 d28,d11
+ vsli.64 d25,d21,#30
+ veor d30,d21,d22
+ vsli.64 d26,d21,#25
+ veor d20,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d23,d22 @ Maj(a,b,c)
+ veor d20,d26 @ Sigma0(a)
+ vadd.i64 d16,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d20,d30
+ vshr.u64 q12,q5,#19
+ vshr.u64 q13,q5,#61
+ vadd.i64 d20,d30 @ h+=Maj from the past
+ vshr.u64 q15,q5,#6
+ vsli.64 q12,q5,#45
+ vext.8 q14,q6,q7,#8 @ X[i+1]
+ vsli.64 q13,q5,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q6,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q2,q3,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d16,#14 @ from NEON_00_15
+ vadd.i64 q6,q14
+ vshr.u64 d25,d16,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d16,#41 @ from NEON_00_15
+ vadd.i64 q6,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d16,#50
+ vsli.64 d25,d16,#46
+ vmov d29,d16
+ vsli.64 d26,d16,#23
+#if 28<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d17,d18 @ Ch(e,f,g)
+ vshr.u64 d24,d20,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d19
+ vshr.u64 d25,d20,#34
+ vsli.64 d24,d20,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d20,#39
+ vadd.i64 d28,d12
+ vsli.64 d25,d20,#30
+ veor d30,d20,d21
+ vsli.64 d26,d20,#25
+ veor d19,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d22,d21 @ Maj(a,b,c)
+ veor d19,d26 @ Sigma0(a)
+ vadd.i64 d23,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d19,d30
+ vshr.u64 d24,d23,#14 @ 29
+#if 29<16
+ vld1.64 {d13},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d23,#18
+#if 29>0
+ vadd.i64 d19,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d23,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d23,#50
+ vsli.64 d25,d23,#46
+ vmov d29,d23
+ vsli.64 d26,d23,#23
+#if 29<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d16,d17 @ Ch(e,f,g)
+ vshr.u64 d24,d19,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d18
+ vshr.u64 d25,d19,#34
+ vsli.64 d24,d19,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d19,#39
+ vadd.i64 d28,d13
+ vsli.64 d25,d19,#30
+ veor d30,d19,d20
+ vsli.64 d26,d19,#25
+ veor d18,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d21,d20 @ Maj(a,b,c)
+ veor d18,d26 @ Sigma0(a)
+ vadd.i64 d22,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d18,d30
+ vshr.u64 q12,q6,#19
+ vshr.u64 q13,q6,#61
+ vadd.i64 d18,d30 @ h+=Maj from the past
+ vshr.u64 q15,q6,#6
+ vsli.64 q12,q6,#45
+ vext.8 q14,q7,q0,#8 @ X[i+1]
+ vsli.64 q13,q6,#3
+ veor q15,q12
+ vshr.u64 q12,q14,#1
+ veor q15,q13 @ sigma1(X[i+14])
+ vshr.u64 q13,q14,#8
+ vadd.i64 q7,q15
+ vshr.u64 q15,q14,#7
+ vsli.64 q12,q14,#63
+ vsli.64 q13,q14,#56
+ vext.8 q14,q3,q4,#8 @ X[i+9]
+ veor q15,q12
+ vshr.u64 d24,d22,#14 @ from NEON_00_15
+ vadd.i64 q7,q14
+ vshr.u64 d25,d22,#18 @ from NEON_00_15
+ veor q15,q13 @ sigma0(X[i+1])
+ vshr.u64 d26,d22,#41 @ from NEON_00_15
+ vadd.i64 q7,q15
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d22,#50
+ vsli.64 d25,d22,#46
+ vmov d29,d22
+ vsli.64 d26,d22,#23
+#if 30<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d23,d16 @ Ch(e,f,g)
+ vshr.u64 d24,d18,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d17
+ vshr.u64 d25,d18,#34
+ vsli.64 d24,d18,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d18,#39
+ vadd.i64 d28,d14
+ vsli.64 d25,d18,#30
+ veor d30,d18,d19
+ vsli.64 d26,d18,#25
+ veor d17,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d20,d19 @ Maj(a,b,c)
+ veor d17,d26 @ Sigma0(a)
+ vadd.i64 d21,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d17,d30
+ vshr.u64 d24,d21,#14 @ 31
+#if 31<16
+ vld1.64 {d15},[r1]! @ handles unaligned
+#endif
+ vshr.u64 d25,d21,#18
+#if 31>0
+ vadd.i64 d17,d30 @ h+=Maj from the past
+#endif
+ vshr.u64 d26,d21,#41
+ vld1.64 {d28},[r3,:64]! @ K[i++]
+ vsli.64 d24,d21,#50
+ vsli.64 d25,d21,#46
+ vmov d29,d21
+ vsli.64 d26,d21,#23
+#if 31<16 && defined(__ARMEL__)
+ vrev64.8 ,
+#endif
+ veor d25,d24
+ vbsl d29,d22,d23 @ Ch(e,f,g)
+ vshr.u64 d24,d17,#28
+ veor d26,d25 @ Sigma1(e)
+ vadd.i64 d27,d29,d16
+ vshr.u64 d25,d17,#34
+ vsli.64 d24,d17,#36
+ vadd.i64 d27,d26
+ vshr.u64 d26,d17,#39
+ vadd.i64 d28,d15
+ vsli.64 d25,d17,#30
+ veor d30,d17,d18
+ vsli.64 d26,d17,#25
+ veor d16,d24,d25
+ vadd.i64 d27,d28
+ vbsl d30,d19,d18 @ Maj(a,b,c)
+ veor d16,d26 @ Sigma0(a)
+ vadd.i64 d20,d27
+ vadd.i64 d30,d27
+ @ vadd.i64 d16,d30
+ bne .L16_79_neon
+
+ vadd.i64 d16,d30 @ h+=Maj from the past
+ vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp
+ vadd.i64 q8,q12 @ vectorized accumulate
+ vadd.i64 q9,q13
+ vadd.i64 q10,q14
+ vadd.i64 q11,q15
+ vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context
+ teq r1,r2
+ sub r3,#640 @ rewind K512
+ bne .Loop_neon
+
+ VFP_ABI_POP
+ bx lr @ .word 0xe12fff1e
+.size sha512_block_data_order_neon,.-sha512_block_data_order_neon
+#endif
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm OPENSSL_armcap_P,4,4
+.hidden OPENSSL_armcap_P
+#endif
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/fipsmodule/vpaes-armv7.S
@@ -1,0 +1,1236 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.syntax unified
+
+.arch armv7-a
+.fpu neon
+
+#if defined(__thumb2__)
+.thumb
+#else
+.code 32
+#endif
+
+.text
+
+.type _vpaes_consts,%object
+.align 7 @ totally strategic alignment
+_vpaes_consts:
+.Lk_mc_forward:@ mc_forward
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+.Lk_mc_backward:@ mc_backward
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+.Lk_sr:@ sr
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+@
+@ "Hot" constants
+@
+.Lk_inv:@ inv, inva
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+.Lk_ipt:@ input transform (lo, hi)
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+.Lk_sbo:@ sbou, sbot
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+.Lk_sb1:@ sb1u, sb1t
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.Lk_sb2:@ sb2u, sb2t
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 2
+.size _vpaes_consts,.-_vpaes_consts
+.align 6
+@@
+@@ _aes_preheat
+@@
+@@ Fills q9-q15 as specified below.
+@@
+.type _vpaes_preheat,%function
+.align 4
+_vpaes_preheat:
+ adr r10, .Lk_inv
+ vmov.i8 q9, #0x0f @ .Lk_s0F
+ vld1.64 {q10,q11}, [r10]! @ .Lk_inv
+ add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo
+ vld1.64 {q12,q13}, [r10]! @ .Lk_sb1
+ vld1.64 {q14,q15}, [r10] @ .Lk_sb2
+ bx lr
+
+@@
+@@ _aes_encrypt_core
+@@
+@@ AES-encrypt q0.
+@@
+@@ Inputs:
+@@ q0 = input
+@@ q9-q15 as in _vpaes_preheat
+@@ [r2] = scheduled keys
+@@
+@@ Output in q0
+@@ Clobbers q1-q5, r8-r11
+@@ Preserves q6-q8 so you get some local vectors
+@@
+@@
+.type _vpaes_encrypt_core,%function
+.align 4
+_vpaes_encrypt_core:
+ mov r9, r2
+ ldr r8, [r2,#240] @ pull rounds
+ adr r11, .Lk_ipt
+ @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
+ @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
+ vld1.64 {q2, q3}, [r11]
+ adr r11, .Lk_mc_forward+16
+ vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
+ vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1
+ vtbl.8 d3, {q2}, d3
+ vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2
+ vtbl.8 d5, {q3}, d1
+ veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0
+ veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
+
+	@ .Lenc_entry ends with a bne instruction which is normally paired with
+ @ subs in .Lenc_loop.
+ tst r8, r8
+ b .Lenc_entry
+
+.align 4
+.Lenc_loop:
+ @ middle of middle round
+ add r10, r11, #0x40
+ vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
+ vtbl.8 d9, {q13}, d5
+ vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
+ vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
+ vtbl.8 d1, {q12}, d7
+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
+ vtbl.8 d11, {q15}, d5
+ veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
+ vtbl.8 d5, {q14}, d7
+ vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
+ vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
+ vtbl.8 d7, {q0}, d3
+ veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
+ @ Write to q5 instead of q0, so the table and destination registers do
+ @ not overlap.
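+	@ (Each 128-bit shuffle is issued as two 64-bit vtbl.8 instructions, so
+	@ writing the result into the table register would corrupt the table
+	@ before the second half is looked up.)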
+ vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
+ vtbl.8 d11, {q0}, d9
+ veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
+ vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
+ vtbl.8 d9, {q3}, d3
+ @ Here we restore the original q0/q5 usage.
+ veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
+ and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... mod 4
+ veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
+ subs r8, r8, #1 @ nr--
+
+.Lenc_entry:
+ @ top of round
+ vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
+ vtbl.8 d11, {q11}, d3
+ veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vtbl.8 d7, {q10}, d1
+ vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vtbl.8 d9, {q10}, d3
+ veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ vtbl.8 d5, {q10}, d7
+ vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ vtbl.8 d7, {q10}, d9
+ veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5
+ bne .Lenc_loop
+
+ @ middle of last round
+ add r10, r11, #0x80
+
+ adr r11, .Lk_sbo
+ @ Read to q1 instead of q4, so the vtbl.8 instruction below does not
+ @ overlap table and destination registers.
+ vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou
+ vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
+ vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ vtbl.8 d9, {q1}, d5
+ vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
+ @ Write to q2 instead of q0 below, to avoid overlapping table and
+ @ destination registers.
+ vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
+ vtbl.8 d5, {q0}, d7
+ veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
+ veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A
+ @ Here we restore the original q0/q2 usage.
+ vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0
+ vtbl.8 d1, {q2}, d3
+ bx lr
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+.globl vpaes_encrypt
+.hidden vpaes_encrypt
+.type vpaes_encrypt,%function
+.align 4
+vpaes_encrypt:
+ @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack
+ @ alignment.
+ stmdb sp!, {r7,r8,r9,r10,r11,lr}
+ @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved.
+ vstmdb sp!, {d8,d9,d10,d11}
+
+ vld1.64 {q0}, [r0]
+ bl _vpaes_preheat
+ bl _vpaes_encrypt_core
+ vst1.64 {q0}, [r1]
+
+ vldmia sp!, {d8,d9,d10,d11}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+.size vpaes_encrypt,.-vpaes_encrypt
+
+@
+@ Decryption stuff
+@
+.type _vpaes_decrypt_consts,%object
+.align 4
+_vpaes_decrypt_consts:
+.Lk_dipt:@ decryption input transform
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+.Lk_dsbo:@ decryption sbox final output
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.Lk_dsb9:@ decryption sbox output *9*u, *9*t
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:@ decryption sbox output *D*u, *D*t
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:@ decryption sbox output *B*u, *B*t
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:@ decryption sbox output *E*u, *E*t
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.size _vpaes_decrypt_consts,.-_vpaes_decrypt_consts
+
+@@
+@@ Decryption core
+@@
+@@ Same API as encryption core, except it clobbers q12-q15 rather than using
+@@ the values from _vpaes_preheat. q9-q11 must still be set from
+@@ _vpaes_preheat.
+@@
+.type _vpaes_decrypt_core,%function
+.align 4
+_vpaes_decrypt_core:
+ mov r9, r2
+ ldr r8, [r2,#240] @ pull rounds
+
+ @ This function performs shuffles with various constants. The x86_64
+ @ version loads them on-demand into %xmm0-%xmm5. This does not work well
+ @ for ARMv7 because those registers are shuffle destinations. The ARMv8
+ @ version preloads those constants into registers, but ARMv7 has half
+ @ the registers to work with. Instead, we load them on-demand into
+	@ q12-q15, registers normally used for preloaded constants. This is fine
+ @ because decryption doesn't use those constants. The values are
+ @ constant, so this does not interfere with potential 2x optimizations.
+ adr r7, .Lk_dipt
+
+ vld1.64 {q12,q13}, [r7] @ vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
+ lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11
+ eor r11, r11, #0x30 @ xor $0x30, %r11
+ adr r10, .Lk_sr
+ and r11, r11, #0x30 @ and $0x30, %r11
+ add r11, r11, r10
+ adr r10, .Lk_mc_forward+48
+
+ vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
+ vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2
+ vtbl.8 d5, {q12}, d3
+ vld1.64 {q5}, [r10] @ vmovdqa .Lk_mc_forward+48(%rip), %xmm5
+ @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
+ vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0
+ vtbl.8 d1, {q13}, d1
+ veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2
+ veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
+
+	@ .Ldec_entry ends with a bne instruction which is normally paired with
+ @ subs in .Ldec_loop.
+ tst r8, r8
+ b .Ldec_entry
+
+.align 4
+.Ldec_loop:
+@
+@ Inverse mix columns
+@
+
+ @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of
+ @ the function.
+ adr r10, .Lk_dsb9
+ vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
+ @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
+ @ Load sbd* ahead of time.
+ vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
+ @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
+ vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
+ vtbl.8 d9, {q12}, d5
+ vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
+ vtbl.8 d3, {q13}, d7
+ veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0
+
+ veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+
+ @ Load sbb* ahead of time.
+ vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu
+ @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt
+
+ vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
+ vtbl.8 d9, {q14}, d5
+ @ Write to q1 instead of q0, so the table and destination registers do
+ @ not overlap.
+ vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vtbl.8 d3, {q0}, d11
+ @ Here we restore the original q0/q1 usage. This instruction is
+ @ reordered from the ARMv8 version so we do not clobber the vtbl.8
+ @ below.
+ veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
+ vtbl.8 d3, {q15}, d7
+ @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
+ veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
+
+	@ Load sbe* ahead of time.
+ vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu
+ @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet
+
+ vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
+ vtbl.8 d9, {q12}, d5
+ @ Write to q1 instead of q0, so the table and destination registers do
+ @ not overlap.
+ vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vtbl.8 d3, {q0}, d11
+ @ Here we restore the original q0/q1 usage. This instruction is
+ @ reordered from the ARMv8 version so we do not clobber the vtbl.8
+ @ below.
+ veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
+ vtbl.8 d3, {q13}, d7
+ veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+
+ vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
+ vtbl.8 d9, {q14}, d5
+ @ Write to q1 instead of q0, so the table and destination registers do
+ @ not overlap.
+ vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch
+ vtbl.8 d3, {q0}, d11
+ @ Here we restore the original q0/q1 usage. This instruction is
+ @ reordered from the ARMv8 version so we do not clobber the vtbl.8
+ @ below.
+ veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
+ vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
+ vtbl.8 d3, {q15}, d7
+ vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5
+ veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
+ subs r8, r8, #1 @ sub $1,%rax # nr--
+
+.Ldec_entry:
+ @ top of round
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ vtbl.8 d5, {q11}, d3
+ veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vtbl.8 d7, {q10}, d1
+ vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vtbl.8 d9, {q10}, d3
+ veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
+ vtbl.8 d5, {q10}, d7
+ vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
+ vtbl.8 d7, {q10}, d9
+ veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io
+ veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
+ vld1.64 {q0}, [r9]! @ vmovdqu (%r9), %xmm0
+ bne .Ldec_loop
+
+ @ middle of last round
+
+ adr r10, .Lk_dsbo
+
+ @ Write to q1 rather than q4 to avoid overlapping table and destination.
+ vld1.64 {q1}, [r10]! @ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
+ vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
+ vtbl.8 d9, {q1}, d5
+ @ Write to q2 rather than q1 to avoid overlapping table and destination.
+ vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
+ vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
+ vtbl.8 d3, {q2}, d7
+ vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
+ veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
+ @ Write to q1 rather than q0 so the table and destination registers
+ @ below do not overlap.
+ veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A
+ vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0
+ vtbl.8 d1, {q1}, d5
+ bx lr
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+.globl vpaes_decrypt
+.hidden vpaes_decrypt
+.type vpaes_decrypt,%function
+.align 4
+vpaes_decrypt:
+ @ _vpaes_decrypt_core uses r7-r11.
+ stmdb sp!, {r7,r8,r9,r10,r11,lr}
+ @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved.
+ vstmdb sp!, {d8,d9,d10,d11}
+
+ vld1.64 {q0}, [r0]
+ bl _vpaes_preheat
+ bl _vpaes_decrypt_core
+ vst1.64 {q0}, [r1]
+
+ vldmia sp!, {d8,d9,d10,d11}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+.size vpaes_decrypt,.-vpaes_decrypt
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+@@ @@
+@@ AES key schedule @@
+@@ @@
+@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+
+@ This function diverges from both x86_64 and aarch64 in which constants are
+@ pinned. x86_64 has a common preheat function for all operations. aarch64
+@ separates them because it has enough registers to pin nearly all constants.
+@ armv7 does not have enough registers, but needing explicit loads and stores
+@ also complicates using x86_64's register allocation directly.
+@
+@ We pin some constants for convenience and leave q14 and q15 free to load
+@ others on demand.
+
+@
+@ Key schedule constants
+@
+.type _vpaes_key_consts,%object
+.align 4
+_vpaes_key_consts:
+.Lk_dksd:@ decryption key schedule: invskew x*D
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:@ decryption key schedule: invskew x*B
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:@ decryption key schedule: invskew x*E + 0x63
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:@ decryption key schedule: invskew x*9
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+.Lk_rcon:@ rcon
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt:@ output transform
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew:@ deskew tables: inverts the sbox's "skew"
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+.size _vpaes_key_consts,.-_vpaes_key_consts
+
+.type _vpaes_key_preheat,%function
+.align 4
+_vpaes_key_preheat:
+ adr r11, .Lk_rcon
+ vmov.i8 q12, #0x5b @ .Lk_s63
+ adr r10, .Lk_inv @ Must be aligned to 8 mod 16.
+ vmov.i8 q9, #0x0f @ .Lk_s0F
+ vld1.64 {q10,q11}, [r10] @ .Lk_inv
+ vld1.64 {q8}, [r11] @ .Lk_rcon
+ bx lr
+.size _vpaes_key_preheat,.-_vpaes_key_preheat
+
+.type _vpaes_schedule_core,%function
+.align 4
+_vpaes_schedule_core:
+ @ We only need to save lr, but ARM requires an 8-byte stack alignment,
+ @ so save an extra register.
+ stmdb sp!, {r3,lr}
+
+ bl _vpaes_key_preheat @ load the tables
+
+ adr r11, .Lk_ipt @ Must be aligned to 8 mod 16.
+ vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned)
+
+ @ input transform
+ @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
+ @ overlap table and destination.
+ vmov q4, q0 @ vmovdqa %xmm0, %xmm3
+ bl _vpaes_schedule_transform
+ adr r10, .Lk_sr @ Must be aligned to 8 mod 16.
+ vmov q7, q0 @ vmovdqa %xmm0, %xmm7
+
+ add r8, r8, r10
+ tst r3, r3
+ bne .Lschedule_am_decrypting
+
+ @ encrypting, output zeroth round key after transform
+ vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx)
+ b .Lschedule_go
+
+.Lschedule_am_decrypting:
+ @ decrypting, output zeroth round key after shiftrows
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
+ vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d7, {q4}, d3
+ vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx)
+ eor r8, r8, #0x30 @ xor $0x30, %r8
+
+.Lschedule_go:
+ cmp r1, #192 @ cmp $192, %esi
+ bhi .Lschedule_256
+ beq .Lschedule_192
+	@ 128: fall through
+
+@@
+@@ .schedule_128
+@@
+@@ 128-bit specific part of key schedule.
+@@
+@@ This schedule is really simple, because all its parts
+@@ are accomplished by the subroutines.
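+@@ (Concretely: ten iterations of _vpaes_schedule_round, each followed by
+@@ _vpaes_schedule_mangle except the last, which branches to
+@@ .Lschedule_mangle_last instead.)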
+@@
+.Lschedule_128:
+ mov r0, #10 @ mov $10, %esi
+
+.Loop_schedule_128:
+ bl _vpaes_schedule_round
+ subs r0, r0, #1 @ dec %esi
+ beq .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle @ write output
+ b .Loop_schedule_128
+
+@@
+@@ .aes_schedule_192
+@@
+@@ 192-bit specific part of key schedule.
+@@
+@@ The main body of this schedule is the same as the 128-bit
+@@ schedule, but with more smearing. The long, high side is
+@@ stored in q7 as before, and the short, low side is in
+@@ the high bits of q6.
+@@
+@@ This schedule is somewhat nastier, however, because each
+@@ round produces 192 bits of key material, or 1.5 round keys.
+@@ Therefore, on each cycle we do 2 rounds and produce 3 round
+@@ keys.
+@@
+.align 4
+.Lschedule_192:
+ sub r0, r0, #8
+ vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+ bl _vpaes_schedule_transform @ input transform
+ vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part
+ vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4
+ @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
+ mov r0, #4 @ mov $4, %esi
+
+.Loop_schedule_192:
+ bl _vpaes_schedule_round
+ vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0
+ bl _vpaes_schedule_mangle @ save key n
+ bl _vpaes_schedule_192_smear
+ bl _vpaes_schedule_mangle @ save key n+1
+ bl _vpaes_schedule_round
+ subs r0, r0, #1 @ dec %esi
+ beq .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle @ save key n+2
+ bl _vpaes_schedule_192_smear
+ b .Loop_schedule_192
+
+@@
+@@ .aes_schedule_256
+@@
+@@ 256-bit specific part of key schedule.
+@@
+@@ The structure here is very similar to the 128-bit
+@@ schedule, but with an additional "low side" in
+@@ q6. The low side's rounds are the same as the
+@@ high side's, except no rcon and no rotation.
+@@
+.align 4
+.Lschedule_256:
+ vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
+ bl _vpaes_schedule_transform @ input transform
+ mov r0, #7 @ mov $7, %esi
+
+.Loop_schedule_256:
+ bl _vpaes_schedule_mangle @ output low result
+ vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
+
+ @ high round
+ bl _vpaes_schedule_round
+ subs r0, r0, #1 @ dec %esi
+ beq .Lschedule_mangle_last
+ bl _vpaes_schedule_mangle
+
+ @ low round. swap xmm7 and xmm6
+ vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
+ vmov.i8 q4, #0
+ vmov q5, q7 @ vmovdqa %xmm7, %xmm5
+ vmov q7, q6 @ vmovdqa %xmm6, %xmm7
+ bl _vpaes_schedule_low_round
+ vmov q7, q5 @ vmovdqa %xmm5, %xmm7
+
+ b .Loop_schedule_256
+
+@@
+@@ .aes_schedule_mangle_last
+@@
+@@ Mangler for last round of key schedule
+@@ Mangles q0
+@@ when encrypting, outputs out(q0) ^ 63
+@@ when decrypting, outputs unskew(q0)
+@@
+@@ Always called right before return... jumps to cleanup and exits
+@@
+.align 4
+.Lschedule_mangle_last:
+ @ schedule last round key from xmm0
+ adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew
+ tst r3, r3
+ bne .Lschedule_mangle_last_dec
+
+ @ encrypting
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1
+ adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform
+ add r2, r2, #32 @ add $32, %rdx
+ vmov q2, q0
+ vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute
+ vtbl.8 d1, {q2}, d3
+
+.Lschedule_mangle_last_dec:
+ sub r2, r2, #16 @ add $-16, %rdx
+ veor q0, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0
+ bl _vpaes_schedule_transform @ output transform
+ vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key
+
+ @ cleanup
+ veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0
+ veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
+ veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2
+ veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3
+ veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4
+ veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5
+ veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6
+ veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7
+ ldmia sp!, {r3,pc} @ return
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+@@
+@@ .aes_schedule_192_smear
+@@
+@@ Smear the short, low side in the 192-bit key schedule.
+@@
+@@ Inputs:
+@@ q7: high side, b a x y
+@@ q6: low side, d c 0 0
+@@
+@@ Outputs:
+@@ q6: b+c+d b+c 0 0
+@@ q0: b+c+d b+c b a
+@@
+.type _vpaes_schedule_192_smear,%function
+.align 4
+_vpaes_schedule_192_smear:
+ vmov.i8 q1, #0
+ vdup.32 q0, d15[1]
+ vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
+ vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+ veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
+ veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1
+ veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
+ vmov q0, q6 @ vmovdqa %xmm6, %xmm0
+ vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
+ bx lr
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+@@
+@@ .aes_schedule_round
+@@
+@@ Runs one main round of the key schedule on q0, q7
+@@
+@@ Specifically, runs subbytes on the high dword of q0
+@@ then rotates it by one byte and xors into the low dword of
+@@ q7.
+@@
+@@ Adds rcon from low byte of q8, then rotates q8 for
+@@ next rcon.
+@@
+@@ Smears the dwords of q7 by xoring the low into the
+@@ second low, result into third, result into highest.
+@@
+@@ Returns results in q7 = q0.
+@@ Clobbers q1-q4, r11.
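+@@
+@@ In textbook AES key-expansion terms, the smear implements the running
+@@ XOR of each word into the next across the round key, and the
+@@ rotate/subbytes/rcon steps correspond to SubWord(RotWord(w)) ^ Rcon
+@@ (a rough correspondence, not an exact mapping of the register usage).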
+@@
+.type _vpaes_schedule_round,%function
+.align 4
+_vpaes_schedule_round:
+ @ extract rcon from xmm8
+ vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4
+ vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1
+ vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8
+ veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
+
+ @ rotate
+ vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0
+ vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0
+
+ @ fall through...
+
+ @ low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+ @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12.
+ @ We pin other values in _vpaes_key_preheat, so load them now.
+ adr r11, .Lk_sb1
+ vld1.64 {q14,q15}, [r11]
+
+ @ smear xmm7
+ vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1
+ veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7
+ vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4
+
+ @ subbytes
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i
+ veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7
+ vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
+ vtbl.8 d5, {q11}, d3
+ veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j
+ vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
+ vtbl.8 d7, {q10}, d1
+ veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
+ vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
+ vtbl.8 d9, {q10}, d3
+ veor q7, q7, q12 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7
+ vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
+ vtbl.8 d7, {q10}, d7
+ veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
+ vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
+ vtbl.8 d5, {q10}, d9
+ veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io
+ veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
+ vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
+ vtbl.8 d9, {q15}, d7
+ vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
+ vtbl.8 d3, {q14}, d5
+ veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
+
+ @ add in smeared stuff
+ veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0
+ veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7
+ bx lr
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+@@
+@@ .aes_schedule_transform
+@@
+@@ Linear-transform q0 according to tables at [r11]
+@@
+@@ Requires that q9 = 0x0F0F... as in preheat
+@@ Output in q0
+@@ Clobbers q1, q2, q14, q15
+@@
+.type _vpaes_schedule_transform,%function
+.align 4
+_vpaes_schedule_transform:
+ vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo
+ @ vmovdqa 16(%r11), %xmm1 # hi
+ vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1
+ vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0
+ vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d3
+ vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0
+ vtbl.8 d1, {q15}, d1
+ veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0
+ bx lr
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+@@
+@@ .aes_schedule_mangle
+@@
+@@ Mangles q0 from (basis-transformed) standard version
+@@ to our version.
+@@
+@@ On encrypt,
+@@ xor with 0x63
+@@ multiply by circulant 0,1,1,1
+@@ apply shiftrows transform
+@@
+@@ On decrypt,
+@@ xor with 0x63
+@@ multiply by "inverse mixcolumns" circulant E,B,D,9
+@@ deskew
+@@ apply shiftrows transform
+@@
+@@
+@@ Writes out to [r2], and increments or decrements it
+@@ Keeps track of round number mod 4 in r8
+@@ Preserves q0
+@@ Clobbers q1-q5
+@@
+.type _vpaes_schedule_mangle,%function
+.align 4
+_vpaes_schedule_mangle:
+ tst r3, r3
+ vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later
+ adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16.
+ vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5
+ bne .Lschedule_mangle_dec
+
+ @ encrypting
+ @ Write to q2 so we do not overlap table and destination below.
+ veor q2, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4
+ add r2, r2, #16 @ add $16, %rdx
+ vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4
+ vtbl.8 d9, {q2}, d11
+ vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1
+ vtbl.8 d3, {q4}, d11
+ vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3
+ vtbl.8 d7, {q1}, d11
+ veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
+ veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3
+
+ b .Lschedule_mangle_both
+.align 4
+.Lschedule_mangle_dec:
+ @ inverse mix columns
+ adr r11, .Lk_dksd @ lea .Lk_dksd(%rip),%r11
+ vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi
+ vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo
+
+ vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2
+ @ vmovdqa 0x10(%r11), %xmm3
+ vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d9
+ vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d7, {q15}, d3
+ @ Load .Lk_dksb ahead of time.
+ vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2
+ @ vmovdqa 0x30(%r11), %xmm3
+ @ Write to q13 so we do not overlap table and destination.
+ veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
+ vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
+ vtbl.8 d7, {q13}, d11
+
+ vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d9
+ veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
+ vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d7, {q15}, d3
+ @ Load .Lk_dkse ahead of time.
+ vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2
+ @ vmovdqa 0x50(%r11), %xmm3
+ @ Write to q13 so we do not overlap table and destination.
+ veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
+ vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
+ vtbl.8 d7, {q13}, d11
+
+ vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d9
+ veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
+ vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d7, {q15}, d3
+	@ Load .Lk_dks9 ahead of time.
+ vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2
+ @ vmovdqa 0x70(%r11), %xmm4
+ @ Write to q13 so we do not overlap table and destination.
+ veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3
+
+ vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2
+ vtbl.8 d5, {q14}, d9
+ vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3
+ vtbl.8 d7, {q13}, d11
+ vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4
+ vtbl.8 d9, {q15}, d3
+ vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1
+ veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2
+ veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3
+
+ sub r2, r2, #16 @ add $-16, %rdx
+
+.Lschedule_mangle_both:
+ @ Write to q2 so table and destination do not overlap.
+ vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3
+ vtbl.8 d5, {q3}, d3
+ add r8, r8, #64-16 @ add $-16, %r8
+ and r8, r8, #~(1<<6) @ and $0x30, %r8
+ vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx)
+ bx lr
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+.globl vpaes_set_encrypt_key
+.hidden vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,%function
+.align 4
+vpaes_set_encrypt_key:
+ stmdb sp!, {r7,r8,r9,r10,r11, lr}
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+ lsr r9, r1, #5 @ shr $5,%eax
+ add r9, r9, #5 @ $5,%eax
+ str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+
+ mov r3, #0 @ mov $0,%ecx
+ mov r8, #0x30 @ mov $0x30,%r8d
+ bl _vpaes_schedule_core
+ eor r0, r0, r0
+
+ vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl vpaes_set_decrypt_key
+.hidden vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,%function
+.align 4
+vpaes_set_decrypt_key:
+ stmdb sp!, {r7,r8,r9,r10,r11, lr}
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+ lsr r9, r1, #5 @ shr $5,%eax
+ add r9, r9, #5 @ $5,%eax
+ str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
+ lsl r9, r9, #4 @ shl $4,%eax
+ add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx
+ add r2, r2, r9
+
+ mov r3, #1 @ mov $1,%ecx
+ lsr r8, r1, #1 @ shr $1,%r8d
+ and r8, r8, #32 @ and $32,%r8d
+ eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32
+ bl _vpaes_schedule_core
+
+ vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+@ Additional constants for converting to bsaes.
+.type _vpaes_convert_consts,%object
+.align 4
+_vpaes_convert_consts:
+@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
+@ transform in the AES S-box. 0x63 is incorporated into the low half of the
+@ table. This was computed with the following script:
+@
+@ def u64s_to_u128(x, y):
+@ return x | (y << 64)
+@ def u128_to_u64s(w):
+@ return w & ((1<<64)-1), w >> 64
+@ def get_byte(w, i):
+@ return (w >> (i*8)) & 0xff
+@ def apply_table(table, b):
+@ lo = b & 0xf
+@ hi = b >> 4
+@ return get_byte(table[0], lo) ^ get_byte(table[1], hi)
+@ def opt(b):
+@ table = [
+@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
+@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
+@ ]
+@ return apply_table(table, b)
+@ def rot_byte(b, n):
+@ return 0xff & ((b << n) | (b >> (8-n)))
+@ def skew(x):
+@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
+@ rot_byte(x, 4))
+@ table = [0, 0]
+@ for i in range(16):
+@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
+@ table[1] |= skew(opt(i<<4)) << (i*8)
+@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0]))
+@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1]))
+.Lk_opt_then_skew:
+.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b
+.quad 0x1f30062936192f00, 0xb49bad829db284ab
+
+@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
+@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
+@ becomes 0x22334411 and then 0x11443322.
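+@ As a sanity check of the table below: its first four vtbl indices are
+@ 02 01 00 03, which send the little-endian source bytes {44,33,22,11} of
+@ 0x11223344 to {22,33,44,11}, i.e. 0x11443322 as described above.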
+.Lk_decrypt_transform:
+.quad 0x0704050603000102, 0x0f0c0d0e0b08090a
+.size _vpaes_convert_consts,.-_vpaes_convert_consts
+
+@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
+.globl vpaes_encrypt_key_to_bsaes
+.hidden vpaes_encrypt_key_to_bsaes
+.type vpaes_encrypt_key_to_bsaes,%function
+.align 4
+vpaes_encrypt_key_to_bsaes:
+ stmdb sp!, {r11, lr}
+
+ @ See _vpaes_schedule_core for the key schedule logic. In particular,
+ @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper),
+ @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last
+ @ contain the transformations not in the bsaes representation. This
+ @ function inverts those transforms.
+ @
+ @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+ @ representation, which does not match the other aes_nohw_*
+ @ implementations. The ARM aes_nohw_* stores each 32-bit word
+ @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+ @ cost of extra REV and VREV32 operations in little-endian ARM.
+
+ vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform
+ adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16.
+ add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
+
+ vld1.64 {q12}, [r2]
+ vmov.i8 q10, #0x5b @ .Lk_s63 from vpaes-x86_64
+ adr r11, .Lk_opt @ Must be aligned to 8 mod 16.
+ vmov.i8 q11, #0x63 @ .LK_s63 without .Lk_ipt applied
+
+ @ vpaes stores one fewer round count than bsaes, but the number of keys
+ @ is the same.
+ ldr r2, [r1,#240]
+ add r2, r2, #1
+ str r2, [r0,#240]
+
+ @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt).
+ @ Invert this with .Lk_opt.
+ vld1.64 {q0}, [r1]!
+ bl _vpaes_schedule_transform
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]!
+
+ @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied,
+ @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63,
+ @ multiplies by the circulant 0,1,1,1, then applies ShiftRows.
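+	@ The loop below inverts that, roughly:
+	@   bsaes_key = rev32(opt(0x5b ^ circulant(shiftrows_inv(vpaes_key)))),
+	@ where opt is _vpaes_schedule_transform with .Lk_opt, the circulant
+	@ multiply (via .Lk_mc_forward) is its own inverse, and 0x5b is .Lk_s63
+	@ in vpaes's transformed basis.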
+.Loop_enc_key_to_bsaes:
+ vld1.64 {q0}, [r1]!
+
+ @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle
+ @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30.
+ @ We use r3 rather than r8 to avoid a callee-saved register.
+ vld1.64 {q1}, [r3]
+ vtbl.8 d4, {q0}, d2
+ vtbl.8 d5, {q0}, d3
+ add r3, r3, #16
+ and r3, r3, #~(1<<6)
+ vmov q0, q2
+
+ @ Handle the last key differently.
+ subs r2, r2, #1
+ beq .Loop_enc_key_to_bsaes_last
+
+ @ Multiply by the circulant. This is its own inverse.
+ vtbl.8 d2, {q0}, d24
+ vtbl.8 d3, {q0}, d25
+ vmov q0, q1
+ vtbl.8 d4, {q1}, d24
+ vtbl.8 d5, {q1}, d25
+ veor q0, q0, q2
+ vtbl.8 d2, {q2}, d24
+ vtbl.8 d3, {q2}, d25
+ veor q0, q0, q1
+
+ @ XOR and finish.
+ veor q0, q0, q10
+ bl _vpaes_schedule_transform
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]!
+ b .Loop_enc_key_to_bsaes
+
+.Loop_enc_key_to_bsaes_last:
+ @ The final key does not have a basis transform (note
+ @ .Lschedule_mangle_last inverts the original transform). It only XORs
+ @ 0x63 and applies ShiftRows. The latter was already inverted in the
+ @ loop. Note that, because we act on the original representation, we use
+ @ q11, not q10.
+ veor q0, q0, q11
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]
+
+ @ Wipe registers which contained key material.
+ veor q0, q0, q0
+ veor q1, q1, q1
+ veor q2, q2, q2
+
+ ldmia sp!, {r11, pc} @ return
+.size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes
+
+@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes);
+.globl vpaes_decrypt_key_to_bsaes
+.hidden vpaes_decrypt_key_to_bsaes
+.type vpaes_decrypt_key_to_bsaes,%function
+.align 4
+vpaes_decrypt_key_to_bsaes:
+ stmdb sp!, {r11, lr}
+
+ @ See _vpaes_schedule_core for the key schedule logic. Note vpaes
+ @ computes the decryption key schedule in reverse. Additionally,
+ @ aes-x86_64.pl shares some transformations, so we must only partially
+ @ invert vpaes's transformations. In general, vpaes computes in a
+ @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of
+ @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is
+ @ split into a linear skew and XOR of 0x63). We undo all but MixColumns.
+ @
+ @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key
+ @ representation, which does not match the other aes_nohw_*
+ @ implementations. The ARM aes_nohw_* stores each 32-bit word
+ @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
+ @ cost of extra REV and VREV32 operations in little-endian ARM.
+
+ adr r2, .Lk_decrypt_transform
+ adr r3, .Lk_sr+0x30
+ adr r11, .Lk_opt_then_skew @ Input to _vpaes_schedule_transform.
+ vld1.64 {q12}, [r2] @ Reuse q12 from encryption.
+ vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform
+
+ @ vpaes stores one fewer round count than bsaes, but the number of keys
+ @ is the same.
+ ldr r2, [r1,#240]
+ add r2, r2, #1
+ str r2, [r0,#240]
+
+ @ Undo the basis change and reapply the S-box affine transform. See
+ @ .Lschedule_mangle_last.
+ vld1.64 {q0}, [r1]!
+ bl _vpaes_schedule_transform
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]!
+
+ @ See _vpaes_schedule_mangle for the transform on the middle keys. Note
+ @ it simultaneously inverts MixColumns and the S-box affine transform.
+ @ See .Lk_dksd through .Lk_dks9.
+.Loop_dec_key_to_bsaes:
+ vld1.64 {q0}, [r1]!
+
+	@ Invert the ShiftRows step (see .Lschedule_mangle_both). Because the
+	@ decryption schedule is stored in reverse, walking the keys forwards
+	@ cancels that inversion, so r3 cycles in the same direction as in
+	@ _vpaes_schedule_mangle. We use r3 rather than r8 to avoid a
+	@ callee-saved register.
+ vld1.64 {q1}, [r3]
+ vtbl.8 d4, {q0}, d2
+ vtbl.8 d5, {q0}, d3
+ add r3, r3, #64-16
+ and r3, r3, #~(1<<6)
+ vmov q0, q2
+
+ @ Handle the last key differently.
+ subs r2, r2, #1
+ beq .Loop_dec_key_to_bsaes_last
+
+ @ Undo the basis change and reapply the S-box affine transform.
+ bl _vpaes_schedule_transform
+
+ @ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. We
+ @ combine the two operations in .Lk_decrypt_transform.
+ @
+ @ TODO(davidben): Where does the rotation come from?
+ vtbl.8 d2, {q0}, d24
+ vtbl.8 d3, {q0}, d25
+
+ vst1.64 {q1}, [r0]!
+ b .Loop_dec_key_to_bsaes
+
+.Loop_dec_key_to_bsaes_last:
+ @ The final key only inverts ShiftRows (already done in the loop). See
+ @ .Lschedule_am_decrypting. Its basis is not transformed.
+ vrev32.8 q0, q0
+ vst1.64 {q0}, [r0]!
+
+ @ Wipe registers which contained key material.
+ veor q0, q0, q0
+ veor q1, q1, q1
+ veor q2, q2, q2
+
+ ldmia sp!, {r11, pc} @ return
+.size vpaes_decrypt_key_to_bsaes,.-vpaes_decrypt_key_to_bsaes
+.globl vpaes_ctr32_encrypt_blocks
+.hidden vpaes_ctr32_encrypt_blocks
+.type vpaes_ctr32_encrypt_blocks,%function
+.align 4
+vpaes_ctr32_encrypt_blocks:
+ mov ip, sp
+ stmdb sp!, {r7,r8,r9,r10,r11, lr}
+ @ This function uses q4-q7 (d8-d15), which are callee-saved.
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+ cmp r2, #0
+ @ r8 is passed on the stack.
+ ldr r8, [ip]
+ beq .Lctr32_done
+
+ @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3.
+ mov r9, r3
+ mov r3, r2
+ mov r2, r9
+
+ @ Load the IV and counter portion.
+ ldr r7, [r8, #12]
+ vld1.8 {q7}, [r8]
+
+ bl _vpaes_preheat
+ rev r7, r7 @ The counter is big-endian.
+
+.Lctr32_loop:
+ vmov q0, q7
+	vld1.8	{q6}, [r0]!	@ Load input ahead of time
+ bl _vpaes_encrypt_core
+ veor q0, q0, q6 @ XOR input and result
+ vst1.8 {q0}, [r1]!
+ subs r3, r3, #1
+ @ Update the counter.
+ add r7, r7, #1
+ rev r9, r7
+ vmov.32 d15[1], r9
+ bne .Lctr32_loop
+
+.Lctr32_done:
+ vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+ ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return
+.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-arm/crypto/test/trampoline-armv4.S
@@ -1,0 +1,379 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+
+#if !defined(OPENSSL_NO_ASM)
+#if defined(__arm__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.syntax unified
+
+.arch armv7-a
+.fpu vfp
+
+.text
+
+@ abi_test_trampoline loads callee-saved registers from |state|, calls |func|
+@ with |argv|, then saves the callee-saved registers into |state|. It returns
+@ the result of |func|. The |unwind| argument is unused.
+@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state,
+@ const uint32_t *argv, size_t argc,
+@ int unwind);
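+@ The first four words of |argv| are passed in r0-r3 per AAPCS; any further
+@ words are copied onto the stack before the call (see .Lstack_args_loop
+@ below).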
+.type abi_test_trampoline, %function
+.globl abi_test_trampoline
+.hidden abi_test_trampoline
+.align 4
+abi_test_trampoline:
+ @ Save parameters and all callee-saved registers. For convenience, we
+ @ save r9 on iOS even though it's volatile.
+ vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+ stmdb sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
+
+ @ Reserve stack space for six (10-4) stack parameters, plus an extra 4
+ @ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3).
+ sub sp, sp, #28
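+	@ (6 slots * 4 bytes) + 4 bytes of padding = 28, i.e. room for at most
+	@ 10 arguments in total.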
+
+	@ Every register in AAPCS is either non-volatile or a parameter (except
+	@ r9 on iOS), so by the time of the actual call this code has no scratch
+	@ registers left. First fill in stack parameters while there are
+	@ registers to spare.
+ cmp r3, #4
+ bls .Lstack_args_done
+ mov r4, sp @ r4 is the output pointer.
+ add r5, r2, r3, lsl #2 @ Set r5 to the end of argv.
+ add r2, r2, #16 @ Skip four arguments.
+.Lstack_args_loop:
+ ldr r6, [r2], #4
+ cmp r2, r5
+ str r6, [r4], #4
+ bne .Lstack_args_loop
+
+.Lstack_args_done:
+ @ Load registers from |r1|.
+ vldmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
+#if defined(__APPLE__)
+ @ r9 is not volatile on iOS.
+ ldmia r1!, {r4,r5,r6,r7,r8,r10-r11}
+#else
+ ldmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
+#endif
+
+ @ Load register parameters. This uses up our remaining registers, so we
+ @ repurpose lr as scratch space.
+ ldr r3, [sp, #40] @ Reload argc.
+	ldr	lr, [sp, #36]	@ Load argv into lr.
+ cmp r3, #3
+ bhi .Larg_r3
+ beq .Larg_r2
+ cmp r3, #1
+ bhi .Larg_r1
+ beq .Larg_r0
+ b .Largs_done
+
+.Larg_r3:
+ ldr r3, [lr, #12] @ argv[3]
+.Larg_r2:
+ ldr r2, [lr, #8] @ argv[2]
+.Larg_r1:
+ ldr r1, [lr, #4] @ argv[1]
+.Larg_r0:
+ ldr r0, [lr] @ argv[0]
+.Largs_done:
+
+ @ With every other register in use, load the function pointer into lr
+ @ and call the function.
+ ldr lr, [sp, #28]
+ blx lr
+
+ @ r1-r3 are free for use again. The trampoline only supports
+ @ single-return functions. Pass r4-r11 to the caller.
+ ldr r1, [sp, #32]
+ vstmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15}
+#if defined(__APPLE__)
+ @ r9 is not volatile on iOS.
+ stmia r1!, {r4,r5,r6,r7,r8,r10-r11}
+#else
+ stmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11}
+#endif
+
+ @ Unwind the stack and restore registers.
+ add sp, sp, #44 @ 44 = 28+16
+ ldmia sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} @ Skip r0-r3 (see +16 above).
+ vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15}
+
+ bx lr
+.size abi_test_trampoline,.-abi_test_trampoline
+.type abi_test_clobber_r0, %function
+.globl abi_test_clobber_r0
+.hidden abi_test_clobber_r0
+.align 4
+abi_test_clobber_r0:
+ mov r0, #0
+ bx lr
+.size abi_test_clobber_r0,.-abi_test_clobber_r0
+.type abi_test_clobber_r1, %function
+.globl abi_test_clobber_r1
+.hidden abi_test_clobber_r1
+.align 4
+abi_test_clobber_r1:
+ mov r1, #0
+ bx lr
+.size abi_test_clobber_r1,.-abi_test_clobber_r1
+.type abi_test_clobber_r2, %function
+.globl abi_test_clobber_r2
+.hidden abi_test_clobber_r2
+.align 4
+abi_test_clobber_r2:
+ mov r2, #0
+ bx lr
+.size abi_test_clobber_r2,.-abi_test_clobber_r2
+.type abi_test_clobber_r3, %function
+.globl abi_test_clobber_r3
+.hidden abi_test_clobber_r3
+.align 4
+abi_test_clobber_r3:
+ mov r3, #0
+ bx lr
+.size abi_test_clobber_r3,.-abi_test_clobber_r3
+.type abi_test_clobber_r4, %function
+.globl abi_test_clobber_r4
+.hidden abi_test_clobber_r4
+.align 4
+abi_test_clobber_r4:
+ mov r4, #0
+ bx lr
+.size abi_test_clobber_r4,.-abi_test_clobber_r4
+.type abi_test_clobber_r5, %function
+.globl abi_test_clobber_r5
+.hidden abi_test_clobber_r5
+.align 4
+abi_test_clobber_r5:
+ mov r5, #0
+ bx lr
+.size abi_test_clobber_r5,.-abi_test_clobber_r5
+.type abi_test_clobber_r6, %function
+.globl abi_test_clobber_r6
+.hidden abi_test_clobber_r6
+.align 4
+abi_test_clobber_r6:
+ mov r6, #0
+ bx lr
+.size abi_test_clobber_r6,.-abi_test_clobber_r6
+.type abi_test_clobber_r7, %function
+.globl abi_test_clobber_r7
+.hidden abi_test_clobber_r7
+.align 4
+abi_test_clobber_r7:
+ mov r7, #0
+ bx lr
+.size abi_test_clobber_r7,.-abi_test_clobber_r7
+.type abi_test_clobber_r8, %function
+.globl abi_test_clobber_r8
+.hidden abi_test_clobber_r8
+.align 4
+abi_test_clobber_r8:
+ mov r8, #0
+ bx lr
+.size abi_test_clobber_r8,.-abi_test_clobber_r8
+.type abi_test_clobber_r9, %function
+.globl abi_test_clobber_r9
+.hidden abi_test_clobber_r9
+.align 4
+abi_test_clobber_r9:
+ mov r9, #0
+ bx lr
+.size abi_test_clobber_r9,.-abi_test_clobber_r9
+.type abi_test_clobber_r10, %function
+.globl abi_test_clobber_r10
+.hidden abi_test_clobber_r10
+.align 4
+abi_test_clobber_r10:
+ mov r10, #0
+ bx lr
+.size abi_test_clobber_r10,.-abi_test_clobber_r10
+.type abi_test_clobber_r11, %function
+.globl abi_test_clobber_r11
+.hidden abi_test_clobber_r11
+.align 4
+abi_test_clobber_r11:
+ mov r11, #0
+ bx lr
+.size abi_test_clobber_r11,.-abi_test_clobber_r11
+.type abi_test_clobber_r12, %function
+.globl abi_test_clobber_r12
+.hidden abi_test_clobber_r12
+.align 4
+abi_test_clobber_r12:
+ mov r12, #0
+ bx lr
+.size abi_test_clobber_r12,.-abi_test_clobber_r12
+.type abi_test_clobber_d0, %function
+.globl abi_test_clobber_d0
+.hidden abi_test_clobber_d0
+.align 4
+abi_test_clobber_d0:
+ mov r0, #0
+ vmov s0, r0
+ vmov s1, r0
+ bx lr
+.size abi_test_clobber_d0,.-abi_test_clobber_d0
+.type abi_test_clobber_d1, %function
+.globl abi_test_clobber_d1
+.hidden abi_test_clobber_d1
+.align 4
+abi_test_clobber_d1:
+ mov r0, #0
+ vmov s2, r0
+ vmov s3, r0
+ bx lr
+.size abi_test_clobber_d1,.-abi_test_clobber_d1
+.type abi_test_clobber_d2, %function
+.globl abi_test_clobber_d2
+.hidden abi_test_clobber_d2
+.align 4
+abi_test_clobber_d2:
+ mov r0, #0
+ vmov s4, r0
+ vmov s5, r0
+ bx lr
+.size abi_test_clobber_d2,.-abi_test_clobber_d2
+.type abi_test_clobber_d3, %function
+.globl abi_test_clobber_d3
+.hidden abi_test_clobber_d3
+.align 4
+abi_test_clobber_d3:
+ mov r0, #0
+ vmov s6, r0
+ vmov s7, r0
+ bx lr
+.size abi_test_clobber_d3,.-abi_test_clobber_d3
+.type abi_test_clobber_d4, %function
+.globl abi_test_clobber_d4
+.hidden abi_test_clobber_d4
+.align 4
+abi_test_clobber_d4:
+ mov r0, #0
+ vmov s8, r0
+ vmov s9, r0
+ bx lr
+.size abi_test_clobber_d4,.-abi_test_clobber_d4
+.type abi_test_clobber_d5, %function
+.globl abi_test_clobber_d5
+.hidden abi_test_clobber_d5
+.align 4
+abi_test_clobber_d5:
+ mov r0, #0
+ vmov s10, r0
+ vmov s11, r0
+ bx lr
+.size abi_test_clobber_d5,.-abi_test_clobber_d5
+.type abi_test_clobber_d6, %function
+.globl abi_test_clobber_d6
+.hidden abi_test_clobber_d6
+.align 4
+abi_test_clobber_d6:
+ mov r0, #0
+ vmov s12, r0
+ vmov s13, r0
+ bx lr
+.size abi_test_clobber_d6,.-abi_test_clobber_d6
+.type abi_test_clobber_d7, %function
+.globl abi_test_clobber_d7
+.hidden abi_test_clobber_d7
+.align 4
+abi_test_clobber_d7:
+ mov r0, #0
+ vmov s14, r0
+ vmov s15, r0
+ bx lr
+.size abi_test_clobber_d7,.-abi_test_clobber_d7
+.type abi_test_clobber_d8, %function
+.globl abi_test_clobber_d8
+.hidden abi_test_clobber_d8
+.align 4
+abi_test_clobber_d8:
+ mov r0, #0
+ vmov s16, r0
+ vmov s17, r0
+ bx lr
+.size abi_test_clobber_d8,.-abi_test_clobber_d8
+.type abi_test_clobber_d9, %function
+.globl abi_test_clobber_d9
+.hidden abi_test_clobber_d9
+.align 4
+abi_test_clobber_d9:
+ mov r0, #0
+ vmov s18, r0
+ vmov s19, r0
+ bx lr
+.size abi_test_clobber_d9,.-abi_test_clobber_d9
+.type abi_test_clobber_d10, %function
+.globl abi_test_clobber_d10
+.hidden abi_test_clobber_d10
+.align 4
+abi_test_clobber_d10:
+ mov r0, #0
+ vmov s20, r0
+ vmov s21, r0
+ bx lr
+.size abi_test_clobber_d10,.-abi_test_clobber_d10
+.type abi_test_clobber_d11, %function
+.globl abi_test_clobber_d11
+.hidden abi_test_clobber_d11
+.align 4
+abi_test_clobber_d11:
+ mov r0, #0
+ vmov s22, r0
+ vmov s23, r0
+ bx lr
+.size abi_test_clobber_d11,.-abi_test_clobber_d11
+.type abi_test_clobber_d12, %function
+.globl abi_test_clobber_d12
+.hidden abi_test_clobber_d12
+.align 4
+abi_test_clobber_d12:
+ mov r0, #0
+ vmov s24, r0
+ vmov s25, r0
+ bx lr
+.size abi_test_clobber_d12,.-abi_test_clobber_d12
+.type abi_test_clobber_d13, %function
+.globl abi_test_clobber_d13
+.hidden abi_test_clobber_d13
+.align 4
+abi_test_clobber_d13:
+ mov r0, #0
+ vmov s26, r0
+ vmov s27, r0
+ bx lr
+.size abi_test_clobber_d13,.-abi_test_clobber_d13
+.type abi_test_clobber_d14, %function
+.globl abi_test_clobber_d14
+.hidden abi_test_clobber_d14
+.align 4
+abi_test_clobber_d14:
+ mov r0, #0
+ vmov s28, r0
+ vmov s29, r0
+ bx lr
+.size abi_test_clobber_d14,.-abi_test_clobber_d14
+.type abi_test_clobber_d15, %function
+.globl abi_test_clobber_d15
+.hidden abi_test_clobber_d15
+.align 4
+abi_test_clobber_d15:
+ mov r0, #0
+ vmov s30, r0
+ vmov s31, r0
+ bx lr
+.size abi_test_clobber_d15,.-abi_test_clobber_d15
+#endif
+#endif // !OPENSSL_NO_ASM
+.section .note.GNU-stack,"",%progbits
--- /dev/null
+++ b/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S
@@ -1,0 +1,3670 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__)
+.machine "any"
+
+.abiversion 2
+.text
+
+.align 7
+.Lrcon:
+.byte 0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01
+.byte 0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b
+.byte 0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d
+.byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.Lconsts:
+ mflr 0
+ bcl 20,31,$+4
+ mflr 6
+ addi 6,6,-0x48
+ mtlr 0
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+.byte 65,69,83,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+
+.globl aes_hw_set_encrypt_key
+.type aes_hw_set_encrypt_key,@function
+.align 5
+aes_hw_set_encrypt_key:
+.localentry aes_hw_set_encrypt_key,0
+
+.Lset_encrypt_key:
+ mflr 11
+ std 11,16(1)
+
+ li 6,-1
+ cmpldi 3,0
+ beq- .Lenc_key_abort
+ cmpldi 5,0
+ beq- .Lenc_key_abort
+ li 6,-2
+ cmpwi 4,128
+ blt- .Lenc_key_abort
+ cmpwi 4,256
+ bgt- .Lenc_key_abort
+ andi. 0,4,0x3f
+ bne- .Lenc_key_abort
+
+ lis 0,0xfff0
+ li 12,-1
+ or 0,0,0
+
+ bl .Lconsts
+ mtlr 11
+
+ neg 9,3
+ lvx 1,0,3
+ addi 3,3,15
+ lvsr 3,0,9
+ li 8,0x20
+ cmpwi 4,192
+ lvx 2,0,3
+ vspltisb 5,0x0f
+ lvx 4,0,6
+ vxor 3,3,5
+ lvx 5,8,6
+ addi 6,6,0x10
+ vperm 1,1,2,3
+ li 7,8
+ vxor 0,0,0
+ mtctr 7
+
+ lvsl 8,0,5
+ vspltisb 9,-1
+ lvx 10,0,5
+ vperm 9,9,0,8
+
+ blt .Loop128
+ addi 3,3,8
+ beq .L192
+ addi 3,3,8
+ b .L256
+
+.align 4
+.Loop128:
+ vperm 3,1,1,5
+ vsldoi 6,0,1,12
+ vperm 11,1,1,8
+ vsel 7,10,11,9
+ vor 10,11,11
+ .long 0x10632509
+ stvx 7,0,5
+ addi 5,5,16
+
+ vxor 1,1,6
+ vsldoi 6,0,6,12
+ vxor 1,1,6
+ vsldoi 6,0,6,12
+ vxor 1,1,6
+ vadduwm 4,4,4
+ vxor 1,1,3
+ bdnz .Loop128
+
+ lvx 4,0,6
+
+ vperm 3,1,1,5
+ vsldoi 6,0,1,12
+ vperm 11,1,1,8
+ vsel 7,10,11,9
+ vor 10,11,11
+ .long 0x10632509
+ stvx 7,0,5
+ addi 5,5,16
+
+ vxor 1,1,6
+ vsldoi 6,0,6,12
+ vxor 1,1,6
+ vsldoi 6,0,6,12
+ vxor 1,1,6
+ vadduwm 4,4,4
+ vxor 1,1,3
+
+ vperm 3,1,1,5
+ vsldoi 6,0,1,12
+ vperm 11,1,1,8
+ vsel 7,10,11,9
+ vor 10,11,11
+ .long 0x10632509
+ stvx 7,0,5
+ addi 5,5,16
+
+ vxor 1,1,6
+ vsldoi 6,0,6,12
+ vxor 1,1,6
+ vsldoi 6,0,6,12
+ vxor 1,1,6
+ vxor 1,1,3
+ vperm 11,1,1,8
+ vsel 7,10,11,9
+ vor 10,11,11
+ stvx 7,0,5
+
+ addi 3,5,15
+ addi 5,5,0x50
+
+ li 8,10
+ b .Ldone
+
+.align 4
+.L192:
+ lvx 6,0,3
+ li 7,4
+ vperm 11,1,1,8
+ vsel 7,10,11,9
+ vor 10,11,11
+ stvx 7,0,5
+ addi 5,5,16
+ vperm 2,2,6,3
+ vspltisb 3,8
+ mtctr 7
+ vsububm 5,5,3
+
+.Loop192:
+ vperm 3,2,2,5
+ vsldoi 6,0,1,12
+ .long 0x10632509
+
+ vxor 1,1,6
+ vsldoi 6,0,6,12
+ vxor 1,1,6
+ vsldoi 6,0,6,12
+ vxor 1,1,6
+
+ vsldoi 7,0,2,8
+ vspltw 6,1,3
+ vxor 6,6,2
+ vsldoi 2,0,2,12
+ vadduwm 4,4,4
+ vxor 2,2,6
+ vxor 1,1,3
+ vxor 2,2,3
+ vsldoi 7,7,1,8
+
+ vperm 3,2,2,5
+ vsldoi 6,0,1,12
+ vperm 11,7,7,8
+ vsel 7,10,11,9
+ vor 10,11,11
+ .long 0x10632509
+ stvx 7,0,5
+ addi 5,5,16
+
+ vsldoi 7,1,2,8
+ vxor 1,1,6
+ vsldoi 6,0,6,12
+ vperm 11,7,7,8
+ vsel 7,10,11,9
+ vor 10,11,11
+ vxor 1,1,6
+ vsldoi 6,0,6,12
+ vxor 1,1,6
+ stvx 7,0,5
+ addi 5,5,16
+
+ vspltw 6,1,3
+ vxor 6,6,2
+ vsldoi 2,0,2,12
+ vadduwm 4,4,4
+ vxor 2,2,6
+ vxor 1,1,3
+ vxor 2,2,3
+ vperm 11,1,1,8
+ vsel 7,10,11,9
+ vor 10,11,11
+ stvx 7,0,5
+ addi 3,5,15
+ addi 5,5,16
+ bdnz .Loop192
+
+ li 8,12
+ addi 5,5,0x20
+ b .Ldone
+
+.align 4
+.L256:
+ lvx 6,0,3
+ li 7,7
+ li 8,14
+ vperm 11,1,1,8
+ vsel 7,10,11,9
+ vor 10,11,11
+ stvx 7,0,5
+ addi 5,5,16
+ vperm 2,2,6,3
+ mtctr 7
+
+.Loop256:
+ vperm 3,2,2,5
+ vsldoi 6,0,1,12
+ vperm 11,2,2,8
+ vsel 7,10,11,9
+ vor 10,11,11
+ .long 0x10632509
+ stvx 7,0,5
+ addi 5,5,16
+
+ vxor 1,1,6
+ vsldoi 6,0,6,12
+ vxor 1,1,6
+ vsldoi 6,0,6,12
+ vxor 1,1,6
+ vadduwm 4,4,4
+ vxor 1,1,3
+ vperm 11,1,1,8
+ vsel 7,10,11,9
+ vor 10,11,11
+ stvx 7,0,5
+ addi 3,5,15
+ addi 5,5,16
+ bdz .Ldone
+
+ vspltw 3,1,3
+ vsldoi 6,0,2,12
+ .long 0x106305C8
+
+ vxor 2,2,6
+ vsldoi 6,0,6,12
+ vxor 2,2,6
+ vsldoi 6,0,6,12
+ vxor 2,2,6
+
+ vxor 2,2,3
+ b .Loop256
+
+.align 4
+.Ldone:
+ lvx 2,0,3
+ vsel 2,10,2,9
+ stvx 2,0,3
+ li 6,0
+ or 12,12,12
+ stw 8,0(5)
+
+.Lenc_key_abort:
+ mr 3,6
+ blr
+.long 0
+.byte 0,12,0x14,1,0,0,3,0
+.long 0
+.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+
+.globl aes_hw_set_decrypt_key
+.type aes_hw_set_decrypt_key,@function
+.align 5
+aes_hw_set_decrypt_key:
+.localentry aes_hw_set_decrypt_key,0
+
+ stdu 1,-64(1)
+ mflr 10
+ std 10,80(1)
+ bl .Lset_encrypt_key
+ mtlr 10
+
+ cmpwi 3,0
+ bne- .Ldec_key_abort
+
+ slwi 7,8,4
+ subi 3,5,240
+ srwi 8,8,1
+ add 5,3,7
+ mtctr 8
+
+.Ldeckey:
+ lwz 0, 0(3)
+ lwz 6, 4(3)
+ lwz 7, 8(3)
+ lwz 8, 12(3)
+ addi 3,3,16
+ lwz 9, 0(5)
+ lwz 10,4(5)
+ lwz 11,8(5)
+ lwz 12,12(5)
+ stw 0, 0(5)
+ stw 6, 4(5)
+ stw 7, 8(5)
+ stw 8, 12(5)
+ subi 5,5,16
+ stw 9, -16(3)
+ stw 10,-12(3)
+ stw 11,-8(3)
+ stw 12,-4(3)
+ bdnz .Ldeckey
+
+ xor 3,3,3
+.Ldec_key_abort:
+ addi 1,1,64
+ blr
+.long 0
+.byte 0,12,4,1,0x80,0,3,0
+.long 0
+.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl aes_hw_encrypt
+.type aes_hw_encrypt,@function
+.align 5
+aes_hw_encrypt:
+.localentry aes_hw_encrypt,0
+
+ lwz 6,240(5)
+ lis 0,0xfc00
+ li 12,-1
+ li 7,15
+ or 0,0,0
+
+ lvx 0,0,3
+ neg 11,4
+ lvx 1,7,3
+ lvsl 2,0,3
+ vspltisb 4,0x0f
+ lvsr 3,0,11
+ vxor 2,2,4
+ li 7,16
+ vperm 0,0,1,2
+ lvx 1,0,5
+ lvsr 5,0,5
+ srwi 6,6,1
+ lvx 2,7,5
+ addi 7,7,16
+ subi 6,6,1
+ vperm 1,2,1,5
+
+ vxor 0,0,1
+ lvx 1,7,5
+ addi 7,7,16
+ mtctr 6
+
+.Loop_enc:
+ vperm 2,1,2,5
+ .long 0x10001508
+ lvx 2,7,5
+ addi 7,7,16
+ vperm 1,2,1,5
+ .long 0x10000D08
+ lvx 1,7,5
+ addi 7,7,16
+ bdnz .Loop_enc
+
+ vperm 2,1,2,5
+ .long 0x10001508
+ lvx 2,7,5
+ vperm 1,2,1,5
+ .long 0x10000D09
+
+ vspltisb 2,-1
+ vxor 1,1,1
+ li 7,15
+ vperm 2,2,1,3
+ vxor 3,3,4
+ lvx 1,0,4
+ vperm 0,0,0,3
+ vsel 1,1,0,2
+ lvx 4,7,4
+ stvx 1,0,4
+ vsel 0,0,4,2
+ stvx 0,7,4
+
+ or 12,12,12
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,3,0
+.long 0
+.size aes_hw_encrypt,.-aes_hw_encrypt
+.globl aes_hw_decrypt
+.type aes_hw_decrypt,@function
+.align 5
+aes_hw_decrypt:
+.localentry aes_hw_decrypt,0
+
+ lwz 6,240(5)
+ lis 0,0xfc00
+ li 12,-1
+ li 7,15
+ or 0,0,0
+
+ lvx 0,0,3
+ neg 11,4
+ lvx 1,7,3
+ lvsl 2,0,3
+ vspltisb 4,0x0f
+ lvsr 3,0,11
+ vxor 2,2,4
+ li 7,16
+ vperm 0,0,1,2
+ lvx 1,0,5
+ lvsr 5,0,5
+ srwi 6,6,1
+ lvx 2,7,5
+ addi 7,7,16
+ subi 6,6,1
+ vperm 1,2,1,5
+
+ vxor 0,0,1
+ lvx 1,7,5
+ addi 7,7,16
+ mtctr 6
+
+.Loop_dec:
+ vperm 2,1,2,5
+ .long 0x10001548
+ lvx 2,7,5
+ addi 7,7,16
+ vperm 1,2,1,5
+ .long 0x10000D48
+ lvx 1,7,5
+ addi 7,7,16
+ bdnz .Loop_dec
+
+ vperm 2,1,2,5
+ .long 0x10001548
+ lvx 2,7,5
+ vperm 1,2,1,5
+ .long 0x10000D49
+
+ vspltisb 2,-1
+ vxor 1,1,1
+ li 7,15
+ vperm 2,2,1,3
+ vxor 3,3,4
+ lvx 1,0,4
+ vperm 0,0,0,3
+ vsel 1,1,0,2
+ lvx 4,7,4
+ stvx 1,0,4
+ vsel 0,0,4,2
+ stvx 0,7,4
+
+ or 12,12,12
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,3,0
+.long 0
+.size aes_hw_decrypt,.-aes_hw_decrypt
+.globl aes_hw_cbc_encrypt
+.type aes_hw_cbc_encrypt,@function
+.align 5
+aes_hw_cbc_encrypt:
+.localentry aes_hw_cbc_encrypt,0
+
+ cmpldi 5,16
+ .long 0x4dc00020
+
+ cmpwi 8,0
+ lis 0,0xffe0
+ li 12,-1
+ or 0,0,0
+
+ li 10,15
+ vxor 0,0,0
+ vspltisb 3,0x0f
+
+ lvx 4,0,7
+ lvsl 6,0,7
+ lvx 5,10,7
+ vxor 6,6,3
+ vperm 4,4,5,6
+
+ neg 11,3
+ lvsr 10,0,6
+ lwz 9,240(6)
+
+ lvsr 6,0,11
+ lvx 5,0,3
+ addi 3,3,15
+ vxor 6,6,3
+
+ lvsl 8,0,4
+ vspltisb 9,-1
+ lvx 7,0,4
+ vperm 9,9,0,8
+ vxor 8,8,3
+
+ srwi 9,9,1
+ li 10,16
+ subi 9,9,1
+ beq .Lcbc_dec
+
+.Lcbc_enc:
+ vor 2,5,5
+ lvx 5,0,3
+ addi 3,3,16
+ mtctr 9
+ subi 5,5,16
+
+ lvx 0,0,6
+ vperm 2,2,5,6
+ lvx 1,10,6
+ addi 10,10,16
+ vperm 0,1,0,10
+ vxor 2,2,0
+ lvx 0,10,6
+ addi 10,10,16
+ vxor 2,2,4
+
+.Loop_cbc_enc:
+ vperm 1,0,1,10
+ .long 0x10420D08
+ lvx 1,10,6
+ addi 10,10,16
+ vperm 0,1,0,10
+ .long 0x10420508
+ lvx 0,10,6
+ addi 10,10,16
+ bdnz .Loop_cbc_enc
+
+ vperm 1,0,1,10
+ .long 0x10420D08
+ lvx 1,10,6
+ li 10,16
+ vperm 0,1,0,10
+ .long 0x10820509
+ cmpldi 5,16
+
+ vperm 3,4,4,8
+ vsel 2,7,3,9
+ vor 7,3,3
+ stvx 2,0,4
+ addi 4,4,16
+ bge .Lcbc_enc
+
+ b .Lcbc_done
+
+.align 4
+.Lcbc_dec:
+ cmpldi 5,128
+ bge _aesp8_cbc_decrypt8x
+ vor 3,5,5
+ lvx 5,0,3
+ addi 3,3,16
+ mtctr 9
+ subi 5,5,16
+
+ lvx 0,0,6
+ vperm 3,3,5,6
+ lvx 1,10,6
+ addi 10,10,16
+ vperm 0,1,0,10
+ vxor 2,3,0
+ lvx 0,10,6
+ addi 10,10,16
+
+.Loop_cbc_dec:
+ vperm 1,0,1,10
+ .long 0x10420D48
+ lvx 1,10,6
+ addi 10,10,16
+ vperm 0,1,0,10
+ .long 0x10420548
+ lvx 0,10,6
+ addi 10,10,16
+ bdnz .Loop_cbc_dec
+
+ vperm 1,0,1,10
+ .long 0x10420D48
+ lvx 1,10,6
+ li 10,16
+ vperm 0,1,0,10
+ .long 0x10420549
+ cmpldi 5,16
+
+ vxor 2,2,4
+ vor 4,3,3
+ vperm 3,2,2,8
+ vsel 2,7,3,9
+ vor 7,3,3
+ stvx 2,0,4
+ addi 4,4,16
+ bge .Lcbc_dec
+
+.Lcbc_done:
+ addi 4,4,-1
+ lvx 2,0,4
+ vsel 2,7,2,9
+ stvx 2,0,4
+
+ neg 8,7
+ li 10,15
+ vxor 0,0,0
+ vspltisb 9,-1
+ vspltisb 3,0x0f
+ lvsr 8,0,8
+ vperm 9,9,0,8
+ vxor 8,8,3
+ lvx 7,0,7
+ vperm 4,4,4,8
+ vsel 2,7,4,9
+ lvx 5,10,7
+ stvx 2,0,7
+ vsel 2,4,5,9
+ stvx 2,10,7
+
+ or 12,12,12
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,6,0
+.long 0
+.align 5
+_aesp8_cbc_decrypt8x:
+ stdu 1,-448(1)
+ li 10,207
+ li 11,223
+ stvx 20,10,1
+ addi 10,10,32
+ stvx 21,11,1
+ addi 11,11,32
+ stvx 22,10,1
+ addi 10,10,32
+ stvx 23,11,1
+ addi 11,11,32
+ stvx 24,10,1
+ addi 10,10,32
+ stvx 25,11,1
+ addi 11,11,32
+ stvx 26,10,1
+ addi 10,10,32
+ stvx 27,11,1
+ addi 11,11,32
+ stvx 28,10,1
+ addi 10,10,32
+ stvx 29,11,1
+ addi 11,11,32
+ stvx 30,10,1
+ stvx 31,11,1
+ li 0,-1
+ stw 12,396(1)
+ li 8,0x10
+ std 26,400(1)
+ li 26,0x20
+ std 27,408(1)
+ li 27,0x30
+ std 28,416(1)
+ li 28,0x40
+ std 29,424(1)
+ li 29,0x50
+ std 30,432(1)
+ li 30,0x60
+ std 31,440(1)
+ li 31,0x70
+ or 0,0,0
+
+ subi 9,9,3
+ subi 5,5,128
+
+ lvx 23,0,6
+ lvx 30,8,6
+ addi 6,6,0x20
+ lvx 31,0,6
+ vperm 23,30,23,10
+ addi 11,1,79
+ mtctr 9
+
+.Load_cbc_dec_key:
+ vperm 24,31,30,10
+ lvx 30,8,6
+ addi 6,6,0x20
+ stvx 24,0,11
+ vperm 25,30,31,10
+ lvx 31,0,6
+ stvx 25,8,11
+ addi 11,11,0x20
+ bdnz .Load_cbc_dec_key
+
+ lvx 26,8,6
+ vperm 24,31,30,10
+ lvx 27,26,6
+ stvx 24,0,11
+ vperm 25,26,31,10
+ lvx 28,27,6
+ stvx 25,8,11
+ addi 11,1,79
+ vperm 26,27,26,10
+ lvx 29,28,6
+ vperm 27,28,27,10
+ lvx 30,29,6
+ vperm 28,29,28,10
+ lvx 31,30,6
+ vperm 29,30,29,10
+ lvx 14,31,6
+ vperm 30,31,30,10
+ lvx 24,0,11
+ vperm 31,14,31,10
+ lvx 25,8,11
+
+
+
+ subi 3,3,15
+
+ li 10,8
+ .long 0x7C001E99
+ lvsl 6,0,10
+ vspltisb 3,0x0f
+ .long 0x7C281E99
+ vxor 6,6,3
+ .long 0x7C5A1E99
+ vperm 0,0,0,6
+ .long 0x7C7B1E99
+ vperm 1,1,1,6
+ .long 0x7D5C1E99
+ vperm 2,2,2,6
+ vxor 14,0,23
+ .long 0x7D7D1E99
+ vperm 3,3,3,6
+ vxor 15,1,23
+ .long 0x7D9E1E99
+ vperm 10,10,10,6
+ vxor 16,2,23
+ .long 0x7DBF1E99
+ addi 3,3,0x80
+ vperm 11,11,11,6
+ vxor 17,3,23
+ vperm 12,12,12,6
+ vxor 18,10,23
+ vperm 13,13,13,6
+ vxor 19,11,23
+ vxor 20,12,23
+ vxor 21,13,23
+
+ mtctr 9
+ b .Loop_cbc_dec8x
+.align 5
+.Loop_cbc_dec8x:
+ .long 0x11CEC548
+ .long 0x11EFC548
+ .long 0x1210C548
+ .long 0x1231C548
+ .long 0x1252C548
+ .long 0x1273C548
+ .long 0x1294C548
+ .long 0x12B5C548
+ lvx 24,26,11
+ addi 11,11,0x20
+
+ .long 0x11CECD48
+ .long 0x11EFCD48
+ .long 0x1210CD48
+ .long 0x1231CD48
+ .long 0x1252CD48
+ .long 0x1273CD48
+ .long 0x1294CD48
+ .long 0x12B5CD48
+ lvx 25,8,11
+ bdnz .Loop_cbc_dec8x
+
+ subic 5,5,128
+ .long 0x11CEC548
+ .long 0x11EFC548
+ .long 0x1210C548
+ .long 0x1231C548
+ .long 0x1252C548
+ .long 0x1273C548
+ .long 0x1294C548
+ .long 0x12B5C548
+
+ subfe. 0,0,0
+ .long 0x11CECD48
+ .long 0x11EFCD48
+ .long 0x1210CD48
+ .long 0x1231CD48
+ .long 0x1252CD48
+ .long 0x1273CD48
+ .long 0x1294CD48
+ .long 0x12B5CD48
+
+ and 0,0,5
+ .long 0x11CED548
+ .long 0x11EFD548
+ .long 0x1210D548
+ .long 0x1231D548
+ .long 0x1252D548
+ .long 0x1273D548
+ .long 0x1294D548
+ .long 0x12B5D548
+
+ add 3,3,0
+
+
+
+ .long 0x11CEDD48
+ .long 0x11EFDD48
+ .long 0x1210DD48
+ .long 0x1231DD48
+ .long 0x1252DD48
+ .long 0x1273DD48
+ .long 0x1294DD48
+ .long 0x12B5DD48
+
+ addi 11,1,79
+ .long 0x11CEE548
+ .long 0x11EFE548
+ .long 0x1210E548
+ .long 0x1231E548
+ .long 0x1252E548
+ .long 0x1273E548
+ .long 0x1294E548
+ .long 0x12B5E548
+ lvx 24,0,11
+
+ .long 0x11CEED48
+ .long 0x11EFED48
+ .long 0x1210ED48
+ .long 0x1231ED48
+ .long 0x1252ED48
+ .long 0x1273ED48
+ .long 0x1294ED48
+ .long 0x12B5ED48
+ lvx 25,8,11
+
+ .long 0x11CEF548
+ vxor 4,4,31
+ .long 0x11EFF548
+ vxor 0,0,31
+ .long 0x1210F548
+ vxor 1,1,31
+ .long 0x1231F548
+ vxor 2,2,31
+ .long 0x1252F548
+ vxor 3,3,31
+ .long 0x1273F548
+ vxor 10,10,31
+ .long 0x1294F548
+ vxor 11,11,31
+ .long 0x12B5F548
+ vxor 12,12,31
+
+ .long 0x11CE2549
+ .long 0x11EF0549
+ .long 0x7C001E99
+ .long 0x12100D49
+ .long 0x7C281E99
+ .long 0x12311549
+ vperm 0,0,0,6
+ .long 0x7C5A1E99
+ .long 0x12521D49
+ vperm 1,1,1,6
+ .long 0x7C7B1E99
+ .long 0x12735549
+ vperm 2,2,2,6
+ .long 0x7D5C1E99
+ .long 0x12945D49
+ vperm 3,3,3,6
+ .long 0x7D7D1E99
+ .long 0x12B56549
+ vperm 10,10,10,6
+ .long 0x7D9E1E99
+ vor 4,13,13
+ vperm 11,11,11,6
+ .long 0x7DBF1E99
+ addi 3,3,0x80
+
+ vperm 14,14,14,6
+ vperm 15,15,15,6
+ .long 0x7DC02799
+ vperm 12,12,12,6
+ vxor 14,0,23
+ vperm 16,16,16,6
+ .long 0x7DE82799
+ vperm 13,13,13,6
+ vxor 15,1,23
+ vperm 17,17,17,6
+ .long 0x7E1A2799
+ vxor 16,2,23
+ vperm 18,18,18,6
+ .long 0x7E3B2799
+ vxor 17,3,23
+ vperm 19,19,19,6
+ .long 0x7E5C2799
+ vxor 18,10,23
+ vperm 20,20,20,6
+ .long 0x7E7D2799
+ vxor 19,11,23
+ vperm 21,21,21,6
+ .long 0x7E9E2799
+ vxor 20,12,23
+ .long 0x7EBF2799
+ addi 4,4,0x80
+ vxor 21,13,23
+
+ mtctr 9
+ beq .Loop_cbc_dec8x
+
+ addic. 5,5,128
+ beq .Lcbc_dec8x_done
+ nop
+ nop
+
+.Loop_cbc_dec8x_tail:
+ .long 0x11EFC548
+ .long 0x1210C548
+ .long 0x1231C548
+ .long 0x1252C548
+ .long 0x1273C548
+ .long 0x1294C548
+ .long 0x12B5C548
+ lvx 24,26,11
+ addi 11,11,0x20
+
+ .long 0x11EFCD48
+ .long 0x1210CD48
+ .long 0x1231CD48
+ .long 0x1252CD48
+ .long 0x1273CD48
+ .long 0x1294CD48
+ .long 0x12B5CD48
+ lvx 25,8,11
+ bdnz .Loop_cbc_dec8x_tail
+
+ .long 0x11EFC548
+ .long 0x1210C548
+ .long 0x1231C548
+ .long 0x1252C548
+ .long 0x1273C548
+ .long 0x1294C548
+ .long 0x12B5C548
+
+ .long 0x11EFCD48
+ .long 0x1210CD48
+ .long 0x1231CD48
+ .long 0x1252CD48
+ .long 0x1273CD48
+ .long 0x1294CD48
+ .long 0x12B5CD48
+
+ .long 0x11EFD548
+ .long 0x1210D548
+ .long 0x1231D548
+ .long 0x1252D548
+ .long 0x1273D548
+ .long 0x1294D548
+ .long 0x12B5D548
+
+ .long 0x11EFDD48
+ .long 0x1210DD48
+ .long 0x1231DD48
+ .long 0x1252DD48
+ .long 0x1273DD48
+ .long 0x1294DD48
+ .long 0x12B5DD48
+
+ .long 0x11EFE548
+ .long 0x1210E548
+ .long 0x1231E548
+ .long 0x1252E548
+ .long 0x1273E548
+ .long 0x1294E548
+ .long 0x12B5E548
+
+ .long 0x11EFED48
+ .long 0x1210ED48
+ .long 0x1231ED48
+ .long 0x1252ED48
+ .long 0x1273ED48
+ .long 0x1294ED48
+ .long 0x12B5ED48
+
+ .long 0x11EFF548
+ vxor 4,4,31
+ .long 0x1210F548
+ vxor 1,1,31
+ .long 0x1231F548
+ vxor 2,2,31
+ .long 0x1252F548
+ vxor 3,3,31
+ .long 0x1273F548
+ vxor 10,10,31
+ .long 0x1294F548
+ vxor 11,11,31
+ .long 0x12B5F548
+ vxor 12,12,31
+
+ cmplwi 5,32
+ blt .Lcbc_dec8x_one
+ nop
+ beq .Lcbc_dec8x_two
+ cmplwi 5,64
+ blt .Lcbc_dec8x_three
+ nop
+ beq .Lcbc_dec8x_four
+ cmplwi 5,96
+ blt .Lcbc_dec8x_five
+ nop
+ beq .Lcbc_dec8x_six
+
+.Lcbc_dec8x_seven:
+ .long 0x11EF2549
+ .long 0x12100D49
+ .long 0x12311549
+ .long 0x12521D49
+ .long 0x12735549
+ .long 0x12945D49
+ .long 0x12B56549
+ vor 4,13,13
+
+ vperm 15,15,15,6
+ vperm 16,16,16,6
+ .long 0x7DE02799
+ vperm 17,17,17,6
+ .long 0x7E082799
+ vperm 18,18,18,6
+ .long 0x7E3A2799
+ vperm 19,19,19,6
+ .long 0x7E5B2799
+ vperm 20,20,20,6
+ .long 0x7E7C2799
+ vperm 21,21,21,6
+ .long 0x7E9D2799
+ .long 0x7EBE2799
+ addi 4,4,0x70
+ b .Lcbc_dec8x_done
+
+.align 5
+.Lcbc_dec8x_six:
+ .long 0x12102549
+ .long 0x12311549
+ .long 0x12521D49
+ .long 0x12735549
+ .long 0x12945D49
+ .long 0x12B56549
+ vor 4,13,13
+
+ vperm 16,16,16,6
+ vperm 17,17,17,6
+ .long 0x7E002799
+ vperm 18,18,18,6
+ .long 0x7E282799
+ vperm 19,19,19,6
+ .long 0x7E5A2799
+ vperm 20,20,20,6
+ .long 0x7E7B2799
+ vperm 21,21,21,6
+ .long 0x7E9C2799
+ .long 0x7EBD2799
+ addi 4,4,0x60
+ b .Lcbc_dec8x_done
+
+.align 5
+.Lcbc_dec8x_five:
+ .long 0x12312549
+ .long 0x12521D49
+ .long 0x12735549
+ .long 0x12945D49
+ .long 0x12B56549
+ vor 4,13,13
+
+ vperm 17,17,17,6
+ vperm 18,18,18,6
+ .long 0x7E202799
+ vperm 19,19,19,6
+ .long 0x7E482799
+ vperm 20,20,20,6
+ .long 0x7E7A2799
+ vperm 21,21,21,6
+ .long 0x7E9B2799
+ .long 0x7EBC2799
+ addi 4,4,0x50
+ b .Lcbc_dec8x_done
+
+.align 5
+.Lcbc_dec8x_four:
+ .long 0x12522549
+ .long 0x12735549
+ .long 0x12945D49
+ .long 0x12B56549
+ vor 4,13,13
+
+ vperm 18,18,18,6
+ vperm 19,19,19,6
+ .long 0x7E402799
+ vperm 20,20,20,6
+ .long 0x7E682799
+ vperm 21,21,21,6
+ .long 0x7E9A2799
+ .long 0x7EBB2799
+ addi 4,4,0x40
+ b .Lcbc_dec8x_done
+
+.align 5
+.Lcbc_dec8x_three:
+ .long 0x12732549
+ .long 0x12945D49
+ .long 0x12B56549
+ vor 4,13,13
+
+ vperm 19,19,19,6
+ vperm 20,20,20,6
+ .long 0x7E602799
+ vperm 21,21,21,6
+ .long 0x7E882799
+ .long 0x7EBA2799
+ addi 4,4,0x30
+ b .Lcbc_dec8x_done
+
+.align 5
+.Lcbc_dec8x_two:
+ .long 0x12942549
+ .long 0x12B56549
+ vor 4,13,13
+
+ vperm 20,20,20,6
+ vperm 21,21,21,6
+ .long 0x7E802799
+ .long 0x7EA82799
+ addi 4,4,0x20
+ b .Lcbc_dec8x_done
+
+.align 5
+.Lcbc_dec8x_one:
+ .long 0x12B52549
+ vor 4,13,13
+
+ vperm 21,21,21,6
+ .long 0x7EA02799
+ addi 4,4,0x10
+
+.Lcbc_dec8x_done:
+ vperm 4,4,4,6
+ .long 0x7C803F99
+
+ li 10,79
+ li 11,95
+ stvx 6,10,1
+ addi 10,10,32
+ stvx 6,11,1
+ addi 11,11,32
+ stvx 6,10,1
+ addi 10,10,32
+ stvx 6,11,1
+ addi 11,11,32
+ stvx 6,10,1
+ addi 10,10,32
+ stvx 6,11,1
+ addi 11,11,32
+ stvx 6,10,1
+ addi 10,10,32
+ stvx 6,11,1
+ addi 11,11,32
+
+ or 12,12,12
+ lvx 20,10,1
+ addi 10,10,32
+ lvx 21,11,1
+ addi 11,11,32
+ lvx 22,10,1
+ addi 10,10,32
+ lvx 23,11,1
+ addi 11,11,32
+ lvx 24,10,1
+ addi 10,10,32
+ lvx 25,11,1
+ addi 11,11,32
+ lvx 26,10,1
+ addi 10,10,32
+ lvx 27,11,1
+ addi 11,11,32
+ lvx 28,10,1
+ addi 10,10,32
+ lvx 29,11,1
+ addi 11,11,32
+ lvx 30,10,1
+ lvx 31,11,1
+ ld 26,400(1)
+ ld 27,408(1)
+ ld 28,416(1)
+ ld 29,424(1)
+ ld 30,432(1)
+ ld 31,440(1)
+ addi 1,1,448
+ blr
+.long 0
+.byte 0,12,0x04,0,0x80,6,6,0
+.long 0
+.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl aes_hw_ctr32_encrypt_blocks
+.type aes_hw_ctr32_encrypt_blocks,@function
+.align 5
+aes_hw_ctr32_encrypt_blocks:
+.localentry aes_hw_ctr32_encrypt_blocks,0
+
+ cmpldi 5,1
+ .long 0x4dc00020
+
+ lis 0,0xfff0
+ li 12,-1
+ or 0,0,0
+
+ li 10,15
+ vxor 0,0,0
+ vspltisb 3,0x0f
+
+ lvx 4,0,7
+ lvsl 6,0,7
+ lvx 5,10,7
+ vspltisb 11,1
+ vxor 6,6,3
+ vperm 4,4,5,6
+ vsldoi 11,0,11,1
+
+ neg 11,3
+ lvsr 10,0,6
+ lwz 9,240(6)
+
+ lvsr 6,0,11
+ lvx 5,0,3
+ addi 3,3,15
+ vxor 6,6,3
+
+ srwi 9,9,1
+ li 10,16
+ subi 9,9,1
+
+ cmpldi 5,8
+ bge _aesp8_ctr32_encrypt8x
+
+ lvsl 8,0,4
+ vspltisb 9,-1
+ lvx 7,0,4
+ vperm 9,9,0,8
+ vxor 8,8,3
+
+ lvx 0,0,6
+ mtctr 9
+ lvx 1,10,6
+ addi 10,10,16
+ vperm 0,1,0,10
+ vxor 2,4,0
+ lvx 0,10,6
+ addi 10,10,16
+ b .Loop_ctr32_enc
+
+.align 5
+.Loop_ctr32_enc:
+ vperm 1,0,1,10
+ .long 0x10420D08
+ lvx 1,10,6
+ addi 10,10,16
+ vperm 0,1,0,10
+ .long 0x10420508
+ lvx 0,10,6
+ addi 10,10,16
+ bdnz .Loop_ctr32_enc
+
+ vadduwm 4,4,11
+ vor 3,5,5
+ lvx 5,0,3
+ addi 3,3,16
+ subic. 5,5,1
+
+ vperm 1,0,1,10
+ .long 0x10420D08
+ lvx 1,10,6
+ vperm 3,3,5,6
+ li 10,16
+ vperm 1,1,0,10
+ lvx 0,0,6
+ vxor 3,3,1
+ .long 0x10421D09
+
+ lvx 1,10,6
+ addi 10,10,16
+ vperm 2,2,2,8
+ vsel 3,7,2,9
+ mtctr 9
+ vperm 0,1,0,10
+ vor 7,2,2
+ vxor 2,4,0
+ lvx 0,10,6
+ addi 10,10,16
+ stvx 3,0,4
+ addi 4,4,16
+ bne .Loop_ctr32_enc
+
+ addi 4,4,-1
+ lvx 2,0,4
+ vsel 2,7,2,9
+ stvx 2,0,4
+
+ or 12,12,12
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,6,0
+.long 0
+.align 5
+_aesp8_ctr32_encrypt8x:
+ stdu 1,-448(1)
+ li 10,207
+ li 11,223
+ stvx 20,10,1
+ addi 10,10,32
+ stvx 21,11,1
+ addi 11,11,32
+ stvx 22,10,1
+ addi 10,10,32
+ stvx 23,11,1
+ addi 11,11,32
+ stvx 24,10,1
+ addi 10,10,32
+ stvx 25,11,1
+ addi 11,11,32
+ stvx 26,10,1
+ addi 10,10,32
+ stvx 27,11,1
+ addi 11,11,32
+ stvx 28,10,1
+ addi 10,10,32
+ stvx 29,11,1
+ addi 11,11,32
+ stvx 30,10,1
+ stvx 31,11,1
+ li 0,-1
+ stw 12,396(1)
+ li 8,0x10
+ std 26,400(1)
+ li 26,0x20
+ std 27,408(1)
+ li 27,0x30
+ std 28,416(1)
+ li 28,0x40
+ std 29,424(1)
+ li 29,0x50
+ std 30,432(1)
+ li 30,0x60
+ std 31,440(1)
+ li 31,0x70
+ or 0,0,0
+
+ subi 9,9,3
+
+ lvx 23,0,6
+ lvx 30,8,6
+ addi 6,6,0x20
+ lvx 31,0,6
+ vperm 23,30,23,10
+ addi 11,1,79
+ mtctr 9
+
+.Load_ctr32_enc_key:
+ vperm 24,31,30,10
+ lvx 30,8,6
+ addi 6,6,0x20
+ stvx 24,0,11
+ vperm 25,30,31,10
+ lvx 31,0,6
+ stvx 25,8,11
+ addi 11,11,0x20
+ bdnz .Load_ctr32_enc_key
+
+ lvx 26,8,6
+ vperm 24,31,30,10
+ lvx 27,26,6
+ stvx 24,0,11
+ vperm 25,26,31,10
+ lvx 28,27,6
+ stvx 25,8,11
+ addi 11,1,79
+ vperm 26,27,26,10
+ lvx 29,28,6
+ vperm 27,28,27,10
+ lvx 30,29,6
+ vperm 28,29,28,10
+ lvx 31,30,6
+ vperm 29,30,29,10
+ lvx 15,31,6
+ vperm 30,31,30,10
+ lvx 24,0,11
+ vperm 31,15,31,10
+ lvx 25,8,11
+
+ vadduwm 7,11,11
+ subi 3,3,15
+ sldi 5,5,4
+
+ vadduwm 16,4,11
+ vadduwm 17,4,7
+ vxor 15,4,23
+ li 10,8
+ vadduwm 18,16,7
+ vxor 16,16,23
+ lvsl 6,0,10
+ vadduwm 19,17,7
+ vxor 17,17,23
+ vspltisb 3,0x0f
+ vadduwm 20,18,7
+ vxor 18,18,23
+ vxor 6,6,3
+ vadduwm 21,19,7
+ vxor 19,19,23
+ vadduwm 22,20,7
+ vxor 20,20,23
+ vadduwm 4,21,7
+ vxor 21,21,23
+ vxor 22,22,23
+
+ mtctr 9
+ b .Loop_ctr32_enc8x
+.align 5
+.Loop_ctr32_enc8x:
+ .long 0x11EFC508
+ .long 0x1210C508
+ .long 0x1231C508
+ .long 0x1252C508
+ .long 0x1273C508
+ .long 0x1294C508
+ .long 0x12B5C508
+ .long 0x12D6C508
+.Loop_ctr32_enc8x_middle:
+ lvx 24,26,11
+ addi 11,11,0x20
+
+ .long 0x11EFCD08
+ .long 0x1210CD08
+ .long 0x1231CD08
+ .long 0x1252CD08
+ .long 0x1273CD08
+ .long 0x1294CD08
+ .long 0x12B5CD08
+ .long 0x12D6CD08
+ lvx 25,8,11
+ bdnz .Loop_ctr32_enc8x
+
+ subic 11,5,256
+ .long 0x11EFC508
+ .long 0x1210C508
+ .long 0x1231C508
+ .long 0x1252C508
+ .long 0x1273C508
+ .long 0x1294C508
+ .long 0x12B5C508
+ .long 0x12D6C508
+
+ subfe 0,0,0
+ .long 0x11EFCD08
+ .long 0x1210CD08
+ .long 0x1231CD08
+ .long 0x1252CD08
+ .long 0x1273CD08
+ .long 0x1294CD08
+ .long 0x12B5CD08
+ .long 0x12D6CD08
+
+ and 0,0,11
+ addi 11,1,79
+ .long 0x11EFD508
+ .long 0x1210D508
+ .long 0x1231D508
+ .long 0x1252D508
+ .long 0x1273D508
+ .long 0x1294D508
+ .long 0x12B5D508
+ .long 0x12D6D508
+ lvx 24,0,11
+
+ subic 5,5,129
+ .long 0x11EFDD08
+ addi 5,5,1
+ .long 0x1210DD08
+ .long 0x1231DD08
+ .long 0x1252DD08
+ .long 0x1273DD08
+ .long 0x1294DD08
+ .long 0x12B5DD08
+ .long 0x12D6DD08
+ lvx 25,8,11
+
+ .long 0x11EFE508
+ .long 0x7C001E99
+ .long 0x1210E508
+ .long 0x7C281E99
+ .long 0x1231E508
+ .long 0x7C5A1E99
+ .long 0x1252E508
+ .long 0x7C7B1E99
+ .long 0x1273E508
+ .long 0x7D5C1E99
+ .long 0x1294E508
+ .long 0x7D9D1E99
+ .long 0x12B5E508
+ .long 0x7DBE1E99
+ .long 0x12D6E508
+ .long 0x7DDF1E99
+ addi 3,3,0x80
+
+ .long 0x11EFED08
+ vperm 0,0,0,6
+ .long 0x1210ED08
+ vperm 1,1,1,6
+ .long 0x1231ED08
+ vperm 2,2,2,6
+ .long 0x1252ED08
+ vperm 3,3,3,6
+ .long 0x1273ED08
+ vperm 10,10,10,6
+ .long 0x1294ED08
+ vperm 12,12,12,6
+ .long 0x12B5ED08
+ vperm 13,13,13,6
+ .long 0x12D6ED08
+ vperm 14,14,14,6
+
+ add 3,3,0
+
+
+
+ subfe. 0,0,0
+ .long 0x11EFF508
+ vxor 0,0,31
+ .long 0x1210F508
+ vxor 1,1,31
+ .long 0x1231F508
+ vxor 2,2,31
+ .long 0x1252F508
+ vxor 3,3,31
+ .long 0x1273F508
+ vxor 10,10,31
+ .long 0x1294F508
+ vxor 12,12,31
+ .long 0x12B5F508
+ vxor 13,13,31
+ .long 0x12D6F508
+ vxor 14,14,31
+
+ bne .Lctr32_enc8x_break
+
+ .long 0x100F0509
+ .long 0x10300D09
+ vadduwm 16,4,11
+ .long 0x10511509
+ vadduwm 17,4,7
+ vxor 15,4,23
+ .long 0x10721D09
+ vadduwm 18,16,7
+ vxor 16,16,23
+ .long 0x11535509
+ vadduwm 19,17,7
+ vxor 17,17,23
+ .long 0x11946509
+ vadduwm 20,18,7
+ vxor 18,18,23
+ .long 0x11B56D09
+ vadduwm 21,19,7
+ vxor 19,19,23
+ .long 0x11D67509
+ vadduwm 22,20,7
+ vxor 20,20,23
+ vperm 0,0,0,6
+ vadduwm 4,21,7
+ vxor 21,21,23
+ vperm 1,1,1,6
+ vxor 22,22,23
+ mtctr 9
+
+ .long 0x11EFC508
+ .long 0x7C002799
+ vperm 2,2,2,6
+ .long 0x1210C508
+ .long 0x7C282799
+ vperm 3,3,3,6
+ .long 0x1231C508
+ .long 0x7C5A2799
+ vperm 10,10,10,6
+ .long 0x1252C508
+ .long 0x7C7B2799
+ vperm 12,12,12,6
+ .long 0x1273C508
+ .long 0x7D5C2799
+ vperm 13,13,13,6
+ .long 0x1294C508
+ .long 0x7D9D2799
+ vperm 14,14,14,6
+ .long 0x12B5C508
+ .long 0x7DBE2799
+ .long 0x12D6C508
+ .long 0x7DDF2799
+ addi 4,4,0x80
+
+ b .Loop_ctr32_enc8x_middle
+
+.align 5
+.Lctr32_enc8x_break:
+ cmpwi 5,-0x60
+ blt .Lctr32_enc8x_one
+ nop
+ beq .Lctr32_enc8x_two
+ cmpwi 5,-0x40
+ blt .Lctr32_enc8x_three
+ nop
+ beq .Lctr32_enc8x_four
+ cmpwi 5,-0x20
+ blt .Lctr32_enc8x_five
+ nop
+ beq .Lctr32_enc8x_six
+ cmpwi 5,0x00
+ blt .Lctr32_enc8x_seven
+
+.Lctr32_enc8x_eight:
+ .long 0x11EF0509
+ .long 0x12100D09
+ .long 0x12311509
+ .long 0x12521D09
+ .long 0x12735509
+ .long 0x12946509
+ .long 0x12B56D09
+ .long 0x12D67509
+
+ vperm 15,15,15,6
+ vperm 16,16,16,6
+ .long 0x7DE02799
+ vperm 17,17,17,6
+ .long 0x7E082799
+ vperm 18,18,18,6
+ .long 0x7E3A2799
+ vperm 19,19,19,6
+ .long 0x7E5B2799
+ vperm 20,20,20,6
+ .long 0x7E7C2799
+ vperm 21,21,21,6
+ .long 0x7E9D2799
+ vperm 22,22,22,6
+ .long 0x7EBE2799
+ .long 0x7EDF2799
+ addi 4,4,0x80
+ b .Lctr32_enc8x_done
+
+.align 5
+.Lctr32_enc8x_seven:
+ .long 0x11EF0D09
+ .long 0x12101509
+ .long 0x12311D09
+ .long 0x12525509
+ .long 0x12736509
+ .long 0x12946D09
+ .long 0x12B57509
+
+ vperm 15,15,15,6
+ vperm 16,16,16,6
+ .long 0x7DE02799
+ vperm 17,17,17,6
+ .long 0x7E082799
+ vperm 18,18,18,6
+ .long 0x7E3A2799
+ vperm 19,19,19,6
+ .long 0x7E5B2799
+ vperm 20,20,20,6
+ .long 0x7E7C2799
+ vperm 21,21,21,6
+ .long 0x7E9D2799
+ .long 0x7EBE2799
+ addi 4,4,0x70
+ b .Lctr32_enc8x_done
+
+.align 5
+.Lctr32_enc8x_six:
+ .long 0x11EF1509
+ .long 0x12101D09
+ .long 0x12315509
+ .long 0x12526509
+ .long 0x12736D09
+ .long 0x12947509
+
+ vperm 15,15,15,6
+ vperm 16,16,16,6
+ .long 0x7DE02799
+ vperm 17,17,17,6
+ .long 0x7E082799
+ vperm 18,18,18,6
+ .long 0x7E3A2799
+ vperm 19,19,19,6
+ .long 0x7E5B2799
+ vperm 20,20,20,6
+ .long 0x7E7C2799
+ .long 0x7E9D2799
+ addi 4,4,0x60
+ b .Lctr32_enc8x_done
+
+.align 5
+.Lctr32_enc8x_five:
+ .long 0x11EF1D09
+ .long 0x12105509
+ .long 0x12316509
+ .long 0x12526D09
+ .long 0x12737509
+
+ vperm 15,15,15,6
+ vperm 16,16,16,6
+ .long 0x7DE02799
+ vperm 17,17,17,6
+ .long 0x7E082799
+ vperm 18,18,18,6
+ .long 0x7E3A2799
+ vperm 19,19,19,6
+ .long 0x7E5B2799
+ .long 0x7E7C2799
+ addi 4,4,0x50
+ b .Lctr32_enc8x_done
+
+.align 5
+.Lctr32_enc8x_four:
+ .long 0x11EF5509
+ .long 0x12106509
+ .long 0x12316D09
+ .long 0x12527509
+
+ vperm 15,15,15,6
+ vperm 16,16,16,6
+ .long 0x7DE02799
+ vperm 17,17,17,6
+ .long 0x7E082799
+ vperm 18,18,18,6
+ .long 0x7E3A2799
+ .long 0x7E5B2799
+ addi 4,4,0x40
+ b .Lctr32_enc8x_done
+
+.align 5
+.Lctr32_enc8x_three:
+ .long 0x11EF6509
+ .long 0x12106D09
+ .long 0x12317509
+
+ vperm 15,15,15,6
+ vperm 16,16,16,6
+ .long 0x7DE02799
+ vperm 17,17,17,6
+ .long 0x7E082799
+ .long 0x7E3A2799
+ addi 4,4,0x30
+ b .Lctr32_enc8x_done
+
+.align 5
+.Lctr32_enc8x_two:
+ .long 0x11EF6D09
+ .long 0x12107509
+
+ vperm 15,15,15,6
+ vperm 16,16,16,6
+ .long 0x7DE02799
+ .long 0x7E082799
+ addi 4,4,0x20
+ b .Lctr32_enc8x_done
+
+.align 5
+.Lctr32_enc8x_one:
+ .long 0x11EF7509
+
+ vperm 15,15,15,6
+ .long 0x7DE02799
+ addi 4,4,0x10
+
+.Lctr32_enc8x_done:
+ li 10,79
+ li 11,95
+ stvx 6,10,1
+ addi 10,10,32
+ stvx 6,11,1
+ addi 11,11,32
+ stvx 6,10,1
+ addi 10,10,32
+ stvx 6,11,1
+ addi 11,11,32
+ stvx 6,10,1
+ addi 10,10,32
+ stvx 6,11,1
+ addi 11,11,32
+ stvx 6,10,1
+ addi 10,10,32
+ stvx 6,11,1
+ addi 11,11,32
+
+ or 12,12,12
+ lvx 20,10,1
+ addi 10,10,32
+ lvx 21,11,1
+ addi 11,11,32
+ lvx 22,10,1
+ addi 10,10,32
+ lvx 23,11,1
+ addi 11,11,32
+ lvx 24,10,1
+ addi 10,10,32
+ lvx 25,11,1
+ addi 11,11,32
+ lvx 26,10,1
+ addi 10,10,32
+ lvx 27,11,1
+ addi 11,11,32
+ lvx 28,10,1
+ addi 10,10,32
+ lvx 29,11,1
+ addi 11,11,32
+ lvx 30,10,1
+ lvx 31,11,1
+ ld 26,400(1)
+ ld 27,408(1)
+ ld 28,416(1)
+ ld 29,424(1)
+ ld 30,432(1)
+ ld 31,440(1)
+ addi 1,1,448
+ blr
+.long 0
+.byte 0,12,0x04,0,0x80,6,6,0
+.long 0
+.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+.globl aes_hw_xts_encrypt
+.type aes_hw_xts_encrypt,@function
+.align 5
+aes_hw_xts_encrypt:
+.localentry aes_hw_xts_encrypt,0
+
+ mr 10,3
+ li 3,-1
+ cmpldi 5,16
+ .long 0x4dc00020
+
+ lis 0,0xfff0
+ li 12,-1
+ li 11,0
+ or 0,0,0
+
+ vspltisb 9,0x07
+ lvsl 6,11,11
+ vspltisb 11,0x0f
+ vxor 6,6,9
+
+ li 3,15
+ lvx 8,0,8
+ lvsl 5,0,8
+ lvx 4,3,8
+ vxor 5,5,11
+ vperm 8,8,4,5
+
+ neg 11,10
+ lvsr 5,0,11
+ lvx 2,0,10
+ addi 10,10,15
+ vxor 5,5,11
+
+ cmpldi 7,0
+ beq .Lxts_enc_no_key2
+
+ lvsr 7,0,7
+ lwz 9,240(7)
+ srwi 9,9,1
+ subi 9,9,1
+ li 3,16
+
+ lvx 0,0,7
+ lvx 1,3,7
+ addi 3,3,16
+ vperm 0,1,0,7
+ vxor 8,8,0
+ lvx 0,3,7
+ addi 3,3,16
+ mtctr 9
+
+.Ltweak_xts_enc:
+ vperm 1,0,1,7
+ .long 0x11080D08
+ lvx 1,3,7
+ addi 3,3,16
+ vperm 0,1,0,7
+ .long 0x11080508
+ lvx 0,3,7
+ addi 3,3,16
+ bdnz .Ltweak_xts_enc
+
+ vperm 1,0,1,7
+ .long 0x11080D08
+ lvx 1,3,7
+ vperm 0,1,0,7
+ .long 0x11080509
+
+ li 8,0
+ b .Lxts_enc
+
+.Lxts_enc_no_key2:
+ li 3,-16
+ and 5,5,3
+
+
+.Lxts_enc:
+ lvx 4,0,10
+ addi 10,10,16
+
+ lvsr 7,0,6
+ lwz 9,240(6)
+ srwi 9,9,1
+ subi 9,9,1
+ li 3,16
+
+ vslb 10,9,9
+ vor 10,10,9
+ vspltisb 11,1
+ vsldoi 10,10,11,15
+
+ cmpldi 5,96
+ bge _aesp8_xts_encrypt6x
+
+ andi. 7,5,15
+ subic 0,5,32
+ subi 7,7,16
+ subfe 0,0,0
+ and 0,0,7
+ add 10,10,0
+
+ lvx 0,0,6
+ lvx 1,3,6
+ addi 3,3,16
+ vperm 2,2,4,5
+ vperm 0,1,0,7
+ vxor 2,2,8
+ vxor 2,2,0
+ lvx 0,3,6
+ addi 3,3,16
+ mtctr 9
+ b .Loop_xts_enc
+
+.align 5
+.Loop_xts_enc:
+ vperm 1,0,1,7
+ .long 0x10420D08
+ lvx 1,3,6
+ addi 3,3,16
+ vperm 0,1,0,7
+ .long 0x10420508
+ lvx 0,3,6
+ addi 3,3,16
+ bdnz .Loop_xts_enc
+
+ vperm 1,0,1,7
+ .long 0x10420D08
+ lvx 1,3,6
+ li 3,16
+ vperm 0,1,0,7
+ vxor 0,0,8
+ .long 0x10620509
+
+ vperm 11,3,3,6
+
+ .long 0x7D602799
+
+ addi 4,4,16
+
+ subic. 5,5,16
+ beq .Lxts_enc_done
+
+ vor 2,4,4
+ lvx 4,0,10
+ addi 10,10,16
+ lvx 0,0,6
+ lvx 1,3,6
+ addi 3,3,16
+
+ subic 0,5,32
+ subfe 0,0,0
+ and 0,0,7
+ add 10,10,0
+
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vand 11,11,10
+ vxor 8,8,11
+
+ vperm 2,2,4,5
+ vperm 0,1,0,7
+ vxor 2,2,8
+ vxor 3,3,0
+ vxor 2,2,0
+ lvx 0,3,6
+ addi 3,3,16
+
+ mtctr 9
+ cmpldi 5,16
+ bge .Loop_xts_enc
+
+ vxor 3,3,8
+ lvsr 5,0,5
+ vxor 4,4,4
+ vspltisb 11,-1
+ vperm 4,4,11,5
+ vsel 2,2,3,4
+
+ subi 11,4,17
+ subi 4,4,16
+ mtctr 5
+ li 5,16
+.Loop_xts_enc_steal:
+ lbzu 0,1(11)
+ stb 0,16(11)
+ bdnz .Loop_xts_enc_steal
+
+ mtctr 9
+ b .Loop_xts_enc
+
+.Lxts_enc_done:
+ cmpldi 8,0
+ beq .Lxts_enc_ret
+
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vand 11,11,10
+ vxor 8,8,11
+
+ vperm 8,8,8,6
+ .long 0x7D004799
+
+.Lxts_enc_ret:
+ or 12,12,12
+ li 3,0
+ blr
+.long 0
+.byte 0,12,0x04,0,0x80,6,6,0
+.long 0
+.size aes_hw_xts_encrypt,.-aes_hw_xts_encrypt
+
+.globl aes_hw_xts_decrypt
+.type aes_hw_xts_decrypt,@function
+.align 5
+aes_hw_xts_decrypt:
+.localentry aes_hw_xts_decrypt,0
+
+ mr 10,3
+ li 3,-1
+ cmpldi 5,16
+ .long 0x4dc00020
+
+ lis 0,0xfff8
+ li 12,-1
+ li 11,0
+ or 0,0,0
+
+ andi. 0,5,15
+ neg 0,0
+ andi. 0,0,16
+ sub 5,5,0
+
+ vspltisb 9,0x07
+ lvsl 6,11,11
+ vspltisb 11,0x0f
+ vxor 6,6,9
+
+ li 3,15
+ lvx 8,0,8
+ lvsl 5,0,8
+ lvx 4,3,8
+ vxor 5,5,11
+ vperm 8,8,4,5
+
+ neg 11,10
+ lvsr 5,0,11
+ lvx 2,0,10
+ addi 10,10,15
+ vxor 5,5,11
+
+ cmpldi 7,0
+ beq .Lxts_dec_no_key2
+
+ lvsr 7,0,7
+ lwz 9,240(7)
+ srwi 9,9,1
+ subi 9,9,1
+ li 3,16
+
+ lvx 0,0,7
+ lvx 1,3,7
+ addi 3,3,16
+ vperm 0,1,0,7
+ vxor 8,8,0
+ lvx 0,3,7
+ addi 3,3,16
+ mtctr 9
+
+.Ltweak_xts_dec:
+ vperm 1,0,1,7
+ .long 0x11080D08
+ lvx 1,3,7
+ addi 3,3,16
+ vperm 0,1,0,7
+ .long 0x11080508
+ lvx 0,3,7
+ addi 3,3,16
+ bdnz .Ltweak_xts_dec
+
+ vperm 1,0,1,7
+ .long 0x11080D08
+ lvx 1,3,7
+ vperm 0,1,0,7
+ .long 0x11080509
+
+ li 8,0
+ b .Lxts_dec
+
+.Lxts_dec_no_key2:
+ neg 3,5
+ andi. 3,3,15
+ add 5,5,3
+
+
+.Lxts_dec:
+ lvx 4,0,10
+ addi 10,10,16
+
+ lvsr 7,0,6
+ lwz 9,240(6)
+ srwi 9,9,1
+ subi 9,9,1
+ li 3,16
+
+ vslb 10,9,9
+ vor 10,10,9
+ vspltisb 11,1
+ vsldoi 10,10,11,15
+
+ cmpldi 5,96
+ bge _aesp8_xts_decrypt6x
+
+ lvx 0,0,6
+ lvx 1,3,6
+ addi 3,3,16
+ vperm 2,2,4,5
+ vperm 0,1,0,7
+ vxor 2,2,8
+ vxor 2,2,0
+ lvx 0,3,6
+ addi 3,3,16
+ mtctr 9
+
+ cmpldi 5,16
+ blt .Ltail_xts_dec
+
+
+.align 5
+.Loop_xts_dec:
+ vperm 1,0,1,7
+ .long 0x10420D48
+ lvx 1,3,6
+ addi 3,3,16
+ vperm 0,1,0,7
+ .long 0x10420548
+ lvx 0,3,6
+ addi 3,3,16
+ bdnz .Loop_xts_dec
+
+ vperm 1,0,1,7
+ .long 0x10420D48
+ lvx 1,3,6
+ li 3,16
+ vperm 0,1,0,7
+ vxor 0,0,8
+ .long 0x10620549
+
+ vperm 11,3,3,6
+
+ .long 0x7D602799
+
+ addi 4,4,16
+
+ subic. 5,5,16
+ beq .Lxts_dec_done
+
+ vor 2,4,4
+ lvx 4,0,10
+ addi 10,10,16
+ lvx 0,0,6
+ lvx 1,3,6
+ addi 3,3,16
+
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vand 11,11,10
+ vxor 8,8,11
+
+ vperm 2,2,4,5
+ vperm 0,1,0,7
+ vxor 2,2,8
+ vxor 2,2,0
+ lvx 0,3,6
+ addi 3,3,16
+
+ mtctr 9
+ cmpldi 5,16
+ bge .Loop_xts_dec
+
+.Ltail_xts_dec:
+ vsrab 11,8,9
+ vaddubm 12,8,8
+ vsldoi 11,11,11,15
+ vand 11,11,10
+ vxor 12,12,11
+
+ subi 10,10,16
+ add 10,10,5
+
+ vxor 2,2,8
+ vxor 2,2,12
+
+.Loop_xts_dec_short:
+ vperm 1,0,1,7
+ .long 0x10420D48
+ lvx 1,3,6
+ addi 3,3,16
+ vperm 0,1,0,7
+ .long 0x10420548
+ lvx 0,3,6
+ addi 3,3,16
+ bdnz .Loop_xts_dec_short
+
+ vperm 1,0,1,7
+ .long 0x10420D48
+ lvx 1,3,6
+ li 3,16
+ vperm 0,1,0,7
+ vxor 0,0,12
+ .long 0x10620549
+
+ vperm 11,3,3,6
+
+ .long 0x7D602799
+
+
+ vor 2,4,4
+ lvx 4,0,10
+
+ lvx 0,0,6
+ lvx 1,3,6
+ addi 3,3,16
+ vperm 2,2,4,5
+ vperm 0,1,0,7
+
+ lvsr 5,0,5
+ vxor 4,4,4
+ vspltisb 11,-1
+ vperm 4,4,11,5
+ vsel 2,2,3,4
+
+ vxor 0,0,8
+ vxor 2,2,0
+ lvx 0,3,6
+ addi 3,3,16
+
+ subi 11,4,1
+ mtctr 5
+ li 5,16
+.Loop_xts_dec_steal:
+ lbzu 0,1(11)
+ stb 0,16(11)
+ bdnz .Loop_xts_dec_steal
+
+ mtctr 9
+ b .Loop_xts_dec
+
+.Lxts_dec_done:
+ cmpldi 8,0
+ beq .Lxts_dec_ret
+
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vand 11,11,10
+ vxor 8,8,11
+
+ vperm 8,8,8,6
+ .long 0x7D004799
+
+.Lxts_dec_ret:
+ or 12,12,12
+ li 3,0
+ blr
+.long 0
+.byte 0,12,0x04,0,0x80,6,6,0
+.long 0
+.size aes_hw_xts_decrypt,.-aes_hw_xts_decrypt
+.align 5
+_aesp8_xts_encrypt6x:
+ stdu 1,-448(1)
+ mflr 11
+ li 7,207
+ li 3,223
+ std 11,464(1)
+ stvx 20,7,1
+ addi 7,7,32
+ stvx 21,3,1
+ addi 3,3,32
+ stvx 22,7,1
+ addi 7,7,32
+ stvx 23,3,1
+ addi 3,3,32
+ stvx 24,7,1
+ addi 7,7,32
+ stvx 25,3,1
+ addi 3,3,32
+ stvx 26,7,1
+ addi 7,7,32
+ stvx 27,3,1
+ addi 3,3,32
+ stvx 28,7,1
+ addi 7,7,32
+ stvx 29,3,1
+ addi 3,3,32
+ stvx 30,7,1
+ stvx 31,3,1
+ li 0,-1
+ stw 12,396(1)
+ li 3,0x10
+ std 26,400(1)
+ li 26,0x20
+ std 27,408(1)
+ li 27,0x30
+ std 28,416(1)
+ li 28,0x40
+ std 29,424(1)
+ li 29,0x50
+ std 30,432(1)
+ li 30,0x60
+ std 31,440(1)
+ li 31,0x70
+ or 0,0,0
+
+ subi 9,9,3
+
+ lvx 23,0,6
+ lvx 30,3,6
+ addi 6,6,0x20
+ lvx 31,0,6
+ vperm 23,30,23,7
+ addi 7,1,79
+ mtctr 9
+
+.Load_xts_enc_key:
+ vperm 24,31,30,7
+ lvx 30,3,6
+ addi 6,6,0x20
+ stvx 24,0,7
+ vperm 25,30,31,7
+ lvx 31,0,6
+ stvx 25,3,7
+ addi 7,7,0x20
+ bdnz .Load_xts_enc_key
+
+ lvx 26,3,6
+ vperm 24,31,30,7
+ lvx 27,26,6
+ stvx 24,0,7
+ vperm 25,26,31,7
+ lvx 28,27,6
+ stvx 25,3,7
+ addi 7,1,79
+ vperm 26,27,26,7
+ lvx 29,28,6
+ vperm 27,28,27,7
+ lvx 30,29,6
+ vperm 28,29,28,7
+ lvx 31,30,6
+ vperm 29,30,29,7
+ lvx 22,31,6
+ vperm 30,31,30,7
+ lvx 24,0,7
+ vperm 31,22,31,7
+ lvx 25,3,7
+
+ vperm 0,2,4,5
+ subi 10,10,31
+ vxor 17,8,23
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vand 11,11,10
+ vxor 7,0,17
+ vxor 8,8,11
+
+ .long 0x7C235699
+ vxor 18,8,23
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vperm 1,1,1,6
+ vand 11,11,10
+ vxor 12,1,18
+ vxor 8,8,11
+
+ .long 0x7C5A5699
+ andi. 31,5,15
+ vxor 19,8,23
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vperm 2,2,2,6
+ vand 11,11,10
+ vxor 13,2,19
+ vxor 8,8,11
+
+ .long 0x7C7B5699
+ sub 5,5,31
+ vxor 20,8,23
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vperm 3,3,3,6
+ vand 11,11,10
+ vxor 14,3,20
+ vxor 8,8,11
+
+ .long 0x7C9C5699
+ subi 5,5,0x60
+ vxor 21,8,23
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vperm 4,4,4,6
+ vand 11,11,10
+ vxor 15,4,21
+ vxor 8,8,11
+
+ .long 0x7CBD5699
+ addi 10,10,0x60
+ vxor 22,8,23
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vperm 5,5,5,6
+ vand 11,11,10
+ vxor 16,5,22
+ vxor 8,8,11
+
+ vxor 31,31,23
+ mtctr 9
+ b .Loop_xts_enc6x
+
+.align 5
+.Loop_xts_enc6x:
+ .long 0x10E7C508
+ .long 0x118CC508
+ .long 0x11ADC508
+ .long 0x11CEC508
+ .long 0x11EFC508
+ .long 0x1210C508
+ lvx 24,26,7
+ addi 7,7,0x20
+
+ .long 0x10E7CD08
+ .long 0x118CCD08
+ .long 0x11ADCD08
+ .long 0x11CECD08
+ .long 0x11EFCD08
+ .long 0x1210CD08
+ lvx 25,3,7
+ bdnz .Loop_xts_enc6x
+
+ subic 5,5,96
+ vxor 0,17,31
+ .long 0x10E7C508
+ .long 0x118CC508
+ vsrab 11,8,9
+ vxor 17,8,23
+ vaddubm 8,8,8
+ .long 0x11ADC508
+ .long 0x11CEC508
+ vsldoi 11,11,11,15
+ .long 0x11EFC508
+ .long 0x1210C508
+
+ subfe. 0,0,0
+ vand 11,11,10
+ .long 0x10E7CD08
+ .long 0x118CCD08
+ vxor 8,8,11
+ .long 0x11ADCD08
+ .long 0x11CECD08
+ vxor 1,18,31
+ vsrab 11,8,9
+ vxor 18,8,23
+ .long 0x11EFCD08
+ .long 0x1210CD08
+
+ and 0,0,5
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ .long 0x10E7D508
+ .long 0x118CD508
+ vand 11,11,10
+ .long 0x11ADD508
+ .long 0x11CED508
+ vxor 8,8,11
+ .long 0x11EFD508
+ .long 0x1210D508
+
+ add 10,10,0
+
+
+
+ vxor 2,19,31
+ vsrab 11,8,9
+ vxor 19,8,23
+ vaddubm 8,8,8
+ .long 0x10E7DD08
+ .long 0x118CDD08
+ vsldoi 11,11,11,15
+ .long 0x11ADDD08
+ .long 0x11CEDD08
+ vand 11,11,10
+ .long 0x11EFDD08
+ .long 0x1210DD08
+
+ addi 7,1,79
+ vxor 8,8,11
+ .long 0x10E7E508
+ .long 0x118CE508
+ vxor 3,20,31
+ vsrab 11,8,9
+ vxor 20,8,23
+ .long 0x11ADE508
+ .long 0x11CEE508
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ .long 0x11EFE508
+ .long 0x1210E508
+ lvx 24,0,7
+ vand 11,11,10
+
+ .long 0x10E7ED08
+ .long 0x118CED08
+ vxor 8,8,11
+ .long 0x11ADED08
+ .long 0x11CEED08
+ vxor 4,21,31
+ vsrab 11,8,9
+ vxor 21,8,23
+ .long 0x11EFED08
+ .long 0x1210ED08
+ lvx 25,3,7
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+
+ .long 0x10E7F508
+ .long 0x118CF508
+ vand 11,11,10
+ .long 0x11ADF508
+ .long 0x11CEF508
+ vxor 8,8,11
+ .long 0x11EFF508
+ .long 0x1210F508
+ vxor 5,22,31
+ vsrab 11,8,9
+ vxor 22,8,23
+
+ .long 0x10E70509
+ .long 0x7C005699
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ .long 0x118C0D09
+ .long 0x7C235699
+ .long 0x11AD1509
+ vperm 0,0,0,6
+ .long 0x7C5A5699
+ vand 11,11,10
+ .long 0x11CE1D09
+ vperm 1,1,1,6
+ .long 0x7C7B5699
+ .long 0x11EF2509
+ vperm 2,2,2,6
+ .long 0x7C9C5699
+ vxor 8,8,11
+ .long 0x11702D09
+
+ vperm 3,3,3,6
+ .long 0x7CBD5699
+ addi 10,10,0x60
+ vperm 4,4,4,6
+ vperm 5,5,5,6
+
+ vperm 7,7,7,6
+ vperm 12,12,12,6
+ .long 0x7CE02799
+ vxor 7,0,17
+ vperm 13,13,13,6
+ .long 0x7D832799
+ vxor 12,1,18
+ vperm 14,14,14,6
+ .long 0x7DBA2799
+ vxor 13,2,19
+ vperm 15,15,15,6
+ .long 0x7DDB2799
+ vxor 14,3,20
+ vperm 16,11,11,6
+ .long 0x7DFC2799
+ vxor 15,4,21
+ .long 0x7E1D2799
+
+ vxor 16,5,22
+ addi 4,4,0x60
+
+ mtctr 9
+ beq .Loop_xts_enc6x
+
+ addic. 5,5,0x60
+ beq .Lxts_enc6x_zero
+ cmpwi 5,0x20
+ blt .Lxts_enc6x_one
+ nop
+ beq .Lxts_enc6x_two
+ cmpwi 5,0x40
+ blt .Lxts_enc6x_three
+ nop
+ beq .Lxts_enc6x_four
+
+.Lxts_enc6x_five:
+ vxor 7,1,17
+ vxor 12,2,18
+ vxor 13,3,19
+ vxor 14,4,20
+ vxor 15,5,21
+
+ bl _aesp8_xts_enc5x
+
+ vperm 7,7,7,6
+ vor 17,22,22
+ vperm 12,12,12,6
+ .long 0x7CE02799
+ vperm 13,13,13,6
+ .long 0x7D832799
+ vperm 14,14,14,6
+ .long 0x7DBA2799
+ vxor 11,15,22
+ vperm 15,15,15,6
+ .long 0x7DDB2799
+ .long 0x7DFC2799
+ addi 4,4,0x50
+ bne .Lxts_enc6x_steal
+ b .Lxts_enc6x_done
+
+.align 4
+.Lxts_enc6x_four:
+ vxor 7,2,17
+ vxor 12,3,18
+ vxor 13,4,19
+ vxor 14,5,20
+ vxor 15,15,15
+
+ bl _aesp8_xts_enc5x
+
+ vperm 7,7,7,6
+ vor 17,21,21
+ vperm 12,12,12,6
+ .long 0x7CE02799
+ vperm 13,13,13,6
+ .long 0x7D832799
+ vxor 11,14,21
+ vperm 14,14,14,6
+ .long 0x7DBA2799
+ .long 0x7DDB2799
+ addi 4,4,0x40
+ bne .Lxts_enc6x_steal
+ b .Lxts_enc6x_done
+
+.align 4
+.Lxts_enc6x_three:
+ vxor 7,3,17
+ vxor 12,4,18
+ vxor 13,5,19
+ vxor 14,14,14
+ vxor 15,15,15
+
+ bl _aesp8_xts_enc5x
+
+ vperm 7,7,7,6
+ vor 17,20,20
+ vperm 12,12,12,6
+ .long 0x7CE02799
+ vxor 11,13,20
+ vperm 13,13,13,6
+ .long 0x7D832799
+ .long 0x7DBA2799
+ addi 4,4,0x30
+ bne .Lxts_enc6x_steal
+ b .Lxts_enc6x_done
+
+.align 4
+.Lxts_enc6x_two:
+ vxor 7,4,17
+ vxor 12,5,18
+ vxor 13,13,13
+ vxor 14,14,14
+ vxor 15,15,15
+
+ bl _aesp8_xts_enc5x
+
+ vperm 7,7,7,6
+ vor 17,19,19
+ vxor 11,12,19
+ vperm 12,12,12,6
+ .long 0x7CE02799
+ .long 0x7D832799
+ addi 4,4,0x20
+ bne .Lxts_enc6x_steal
+ b .Lxts_enc6x_done
+
+.align 4
+.Lxts_enc6x_one:
+ vxor 7,5,17
+ nop
+.Loop_xts_enc1x:
+ .long 0x10E7C508
+ lvx 24,26,7
+ addi 7,7,0x20
+
+ .long 0x10E7CD08
+ lvx 25,3,7
+ bdnz .Loop_xts_enc1x
+
+ add 10,10,31
+ cmpwi 31,0
+ .long 0x10E7C508
+
+ subi 10,10,16
+ .long 0x10E7CD08
+
+ lvsr 5,0,31
+ .long 0x10E7D508
+
+ .long 0x7C005699
+ .long 0x10E7DD08
+
+ addi 7,1,79
+ .long 0x10E7E508
+ lvx 24,0,7
+
+ .long 0x10E7ED08
+ lvx 25,3,7
+ vxor 17,17,31
+
+ vperm 0,0,0,6
+ .long 0x10E7F508
+
+ vperm 0,0,0,5
+ .long 0x10E78D09
+
+ vor 17,18,18
+ vxor 11,7,18
+ vperm 7,7,7,6
+ .long 0x7CE02799
+ addi 4,4,0x10
+ bne .Lxts_enc6x_steal
+ b .Lxts_enc6x_done
+
+.align 4
+.Lxts_enc6x_zero:
+ cmpwi 31,0
+ beq .Lxts_enc6x_done
+
+ add 10,10,31
+ subi 10,10,16
+ .long 0x7C005699
+ lvsr 5,0,31
+ vperm 0,0,0,6
+ vperm 0,0,0,5
+ vxor 11,11,17
+.Lxts_enc6x_steal:
+ vxor 0,0,17
+ vxor 7,7,7
+ vspltisb 12,-1
+ vperm 7,7,12,5
+ vsel 7,0,11,7
+
+ subi 30,4,17
+ subi 4,4,16
+ mtctr 31
+.Loop_xts_enc6x_steal:
+ lbzu 0,1(30)
+ stb 0,16(30)
+ bdnz .Loop_xts_enc6x_steal
+
+ li 31,0
+ mtctr 9
+ b .Loop_xts_enc1x
+
+.align 4
+.Lxts_enc6x_done:
+ cmpldi 8,0
+ beq .Lxts_enc6x_ret
+
+ vxor 8,17,23
+ vperm 8,8,8,6
+ .long 0x7D004799
+
+.Lxts_enc6x_ret:
+ mtlr 11
+ li 10,79
+ li 11,95
+ stvx 9,10,1
+ addi 10,10,32
+ stvx 9,11,1
+ addi 11,11,32
+ stvx 9,10,1
+ addi 10,10,32
+ stvx 9,11,1
+ addi 11,11,32
+ stvx 9,10,1
+ addi 10,10,32
+ stvx 9,11,1
+ addi 11,11,32
+ stvx 9,10,1
+ addi 10,10,32
+ stvx 9,11,1
+ addi 11,11,32
+
+ or 12,12,12
+ lvx 20,10,1
+ addi 10,10,32
+ lvx 21,11,1
+ addi 11,11,32
+ lvx 22,10,1
+ addi 10,10,32
+ lvx 23,11,1
+ addi 11,11,32
+ lvx 24,10,1
+ addi 10,10,32
+ lvx 25,11,1
+ addi 11,11,32
+ lvx 26,10,1
+ addi 10,10,32
+ lvx 27,11,1
+ addi 11,11,32
+ lvx 28,10,1
+ addi 10,10,32
+ lvx 29,11,1
+ addi 11,11,32
+ lvx 30,10,1
+ lvx 31,11,1
+ ld 26,400(1)
+ ld 27,408(1)
+ ld 28,416(1)
+ ld 29,424(1)
+ ld 30,432(1)
+ ld 31,440(1)
+ addi 1,1,448
+ blr
+.long 0
+.byte 0,12,0x04,1,0x80,6,6,0
+.long 0
+
+.align 5
+_aesp8_xts_enc5x:
+ .long 0x10E7C508
+ .long 0x118CC508
+ .long 0x11ADC508
+ .long 0x11CEC508
+ .long 0x11EFC508
+ lvx 24,26,7
+ addi 7,7,0x20
+
+ .long 0x10E7CD08
+ .long 0x118CCD08
+ .long 0x11ADCD08
+ .long 0x11CECD08
+ .long 0x11EFCD08
+ lvx 25,3,7
+ bdnz _aesp8_xts_enc5x
+
+ add 10,10,31
+ cmpwi 31,0
+ .long 0x10E7C508
+ .long 0x118CC508
+ .long 0x11ADC508
+ .long 0x11CEC508
+ .long 0x11EFC508
+
+ subi 10,10,16
+ .long 0x10E7CD08
+ .long 0x118CCD08
+ .long 0x11ADCD08
+ .long 0x11CECD08
+ .long 0x11EFCD08
+ vxor 17,17,31
+
+ .long 0x10E7D508
+ lvsr 5,0,31
+ .long 0x118CD508
+ .long 0x11ADD508
+ .long 0x11CED508
+ .long 0x11EFD508
+ vxor 1,18,31
+
+ .long 0x10E7DD08
+ .long 0x7C005699
+ .long 0x118CDD08
+ .long 0x11ADDD08
+ .long 0x11CEDD08
+ .long 0x11EFDD08
+ vxor 2,19,31
+
+ addi 7,1,79
+ .long 0x10E7E508
+ .long 0x118CE508
+ .long 0x11ADE508
+ .long 0x11CEE508
+ .long 0x11EFE508
+ lvx 24,0,7
+ vxor 3,20,31
+
+ .long 0x10E7ED08
+ vperm 0,0,0,6
+ .long 0x118CED08
+ .long 0x11ADED08
+ .long 0x11CEED08
+ .long 0x11EFED08
+ lvx 25,3,7
+ vxor 4,21,31
+
+ .long 0x10E7F508
+ vperm 0,0,0,5
+ .long 0x118CF508
+ .long 0x11ADF508
+ .long 0x11CEF508
+ .long 0x11EFF508
+
+ .long 0x10E78D09
+ .long 0x118C0D09
+ .long 0x11AD1509
+ .long 0x11CE1D09
+ .long 0x11EF2509
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+
+.align 5
+_aesp8_xts_decrypt6x:
+ stdu 1,-448(1)
+ mflr 11
+ li 7,207
+ li 3,223
+ std 11,464(1)
+ stvx 20,7,1
+ addi 7,7,32
+ stvx 21,3,1
+ addi 3,3,32
+ stvx 22,7,1
+ addi 7,7,32
+ stvx 23,3,1
+ addi 3,3,32
+ stvx 24,7,1
+ addi 7,7,32
+ stvx 25,3,1
+ addi 3,3,32
+ stvx 26,7,1
+ addi 7,7,32
+ stvx 27,3,1
+ addi 3,3,32
+ stvx 28,7,1
+ addi 7,7,32
+ stvx 29,3,1
+ addi 3,3,32
+ stvx 30,7,1
+ stvx 31,3,1
+ li 0,-1
+ stw 12,396(1)
+ li 3,0x10
+ std 26,400(1)
+ li 26,0x20
+ std 27,408(1)
+ li 27,0x30
+ std 28,416(1)
+ li 28,0x40
+ std 29,424(1)
+ li 29,0x50
+ std 30,432(1)
+ li 30,0x60
+ std 31,440(1)
+ li 31,0x70
+ or 0,0,0
+
+ subi 9,9,3
+
+ lvx 23,0,6
+ lvx 30,3,6
+ addi 6,6,0x20
+ lvx 31,0,6
+ vperm 23,30,23,7
+ addi 7,1,79
+ mtctr 9
+
+.Load_xts_dec_key:
+ vperm 24,31,30,7
+ lvx 30,3,6
+ addi 6,6,0x20
+ stvx 24,0,7
+ vperm 25,30,31,7
+ lvx 31,0,6
+ stvx 25,3,7
+ addi 7,7,0x20
+ bdnz .Load_xts_dec_key
+
+ lvx 26,3,6
+ vperm 24,31,30,7
+ lvx 27,26,6
+ stvx 24,0,7
+ vperm 25,26,31,7
+ lvx 28,27,6
+ stvx 25,3,7
+ addi 7,1,79
+ vperm 26,27,26,7
+ lvx 29,28,6
+ vperm 27,28,27,7
+ lvx 30,29,6
+ vperm 28,29,28,7
+ lvx 31,30,6
+ vperm 29,30,29,7
+ lvx 22,31,6
+ vperm 30,31,30,7
+ lvx 24,0,7
+ vperm 31,22,31,7
+ lvx 25,3,7
+
+ vperm 0,2,4,5
+ subi 10,10,31
+ vxor 17,8,23
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vand 11,11,10
+ vxor 7,0,17
+ vxor 8,8,11
+
+ .long 0x7C235699
+ vxor 18,8,23
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vperm 1,1,1,6
+ vand 11,11,10
+ vxor 12,1,18
+ vxor 8,8,11
+
+ .long 0x7C5A5699
+ andi. 31,5,15
+ vxor 19,8,23
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vperm 2,2,2,6
+ vand 11,11,10
+ vxor 13,2,19
+ vxor 8,8,11
+
+ .long 0x7C7B5699
+ sub 5,5,31
+ vxor 20,8,23
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vperm 3,3,3,6
+ vand 11,11,10
+ vxor 14,3,20
+ vxor 8,8,11
+
+ .long 0x7C9C5699
+ subi 5,5,0x60
+ vxor 21,8,23
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vperm 4,4,4,6
+ vand 11,11,10
+ vxor 15,4,21
+ vxor 8,8,11
+
+ .long 0x7CBD5699
+ addi 10,10,0x60
+ vxor 22,8,23
+ vsrab 11,8,9
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ vperm 5,5,5,6
+ vand 11,11,10
+ vxor 16,5,22
+ vxor 8,8,11
+
+ vxor 31,31,23
+ mtctr 9
+ b .Loop_xts_dec6x
+
+.align 5
+.Loop_xts_dec6x:
+ .long 0x10E7C548
+ .long 0x118CC548
+ .long 0x11ADC548
+ .long 0x11CEC548
+ .long 0x11EFC548
+ .long 0x1210C548
+ lvx 24,26,7
+ addi 7,7,0x20
+
+ .long 0x10E7CD48
+ .long 0x118CCD48
+ .long 0x11ADCD48
+ .long 0x11CECD48
+ .long 0x11EFCD48
+ .long 0x1210CD48
+ lvx 25,3,7
+ bdnz .Loop_xts_dec6x
+
+ subic 5,5,96
+ vxor 0,17,31
+ .long 0x10E7C548
+ .long 0x118CC548
+ vsrab 11,8,9
+ vxor 17,8,23
+ vaddubm 8,8,8
+ .long 0x11ADC548
+ .long 0x11CEC548
+ vsldoi 11,11,11,15
+ .long 0x11EFC548
+ .long 0x1210C548
+
+ subfe. 0,0,0
+ vand 11,11,10
+ .long 0x10E7CD48
+ .long 0x118CCD48
+ vxor 8,8,11
+ .long 0x11ADCD48
+ .long 0x11CECD48
+ vxor 1,18,31
+ vsrab 11,8,9
+ vxor 18,8,23
+ .long 0x11EFCD48
+ .long 0x1210CD48
+
+ and 0,0,5
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ .long 0x10E7D548
+ .long 0x118CD548
+ vand 11,11,10
+ .long 0x11ADD548
+ .long 0x11CED548
+ vxor 8,8,11
+ .long 0x11EFD548
+ .long 0x1210D548
+
+ add 10,10,0
+
+
+
+ vxor 2,19,31
+ vsrab 11,8,9
+ vxor 19,8,23
+ vaddubm 8,8,8
+ .long 0x10E7DD48
+ .long 0x118CDD48
+ vsldoi 11,11,11,15
+ .long 0x11ADDD48
+ .long 0x11CEDD48
+ vand 11,11,10
+ .long 0x11EFDD48
+ .long 0x1210DD48
+
+ addi 7,1,79
+ vxor 8,8,11
+ .long 0x10E7E548
+ .long 0x118CE548
+ vxor 3,20,31
+ vsrab 11,8,9
+ vxor 20,8,23
+ .long 0x11ADE548
+ .long 0x11CEE548
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ .long 0x11EFE548
+ .long 0x1210E548
+ lvx 24,0,7
+ vand 11,11,10
+
+ .long 0x10E7ED48
+ .long 0x118CED48
+ vxor 8,8,11
+ .long 0x11ADED48
+ .long 0x11CEED48
+ vxor 4,21,31
+ vsrab 11,8,9
+ vxor 21,8,23
+ .long 0x11EFED48
+ .long 0x1210ED48
+ lvx 25,3,7
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+
+ .long 0x10E7F548
+ .long 0x118CF548
+ vand 11,11,10
+ .long 0x11ADF548
+ .long 0x11CEF548
+ vxor 8,8,11
+ .long 0x11EFF548
+ .long 0x1210F548
+ vxor 5,22,31
+ vsrab 11,8,9
+ vxor 22,8,23
+
+ .long 0x10E70549
+ .long 0x7C005699
+ vaddubm 8,8,8
+ vsldoi 11,11,11,15
+ .long 0x118C0D49
+ .long 0x7C235699
+ .long 0x11AD1549
+ vperm 0,0,0,6
+ .long 0x7C5A5699
+ vand 11,11,10
+ .long 0x11CE1D49
+ vperm 1,1,1,6
+ .long 0x7C7B5699
+ .long 0x11EF2549
+ vperm 2,2,2,6
+ .long 0x7C9C5699
+ vxor 8,8,11
+ .long 0x12102D49
+ vperm 3,3,3,6
+ .long 0x7CBD5699
+ addi 10,10,0x60
+ vperm 4,4,4,6
+ vperm 5,5,5,6
+
+ vperm 7,7,7,6
+ vperm 12,12,12,6
+ .long 0x7CE02799
+ vxor 7,0,17
+ vperm 13,13,13,6
+ .long 0x7D832799
+ vxor 12,1,18
+ vperm 14,14,14,6
+ .long 0x7DBA2799
+ vxor 13,2,19
+ vperm 15,15,15,6
+ .long 0x7DDB2799
+ vxor 14,3,20
+ vperm 16,16,16,6
+ .long 0x7DFC2799
+ vxor 15,4,21
+ .long 0x7E1D2799
+ vxor 16,5,22
+ addi 4,4,0x60
+
+ mtctr 9
+ beq .Loop_xts_dec6x
+
+ addic. 5,5,0x60
+ beq .Lxts_dec6x_zero
+ cmpwi 5,0x20
+ blt .Lxts_dec6x_one
+ nop
+ beq .Lxts_dec6x_two
+ cmpwi 5,0x40
+ blt .Lxts_dec6x_three
+ nop
+ beq .Lxts_dec6x_four
+
+.Lxts_dec6x_five:
+ vxor 7,1,17
+ vxor 12,2,18
+ vxor 13,3,19
+ vxor 14,4,20
+ vxor 15,5,21
+
+ bl _aesp8_xts_dec5x
+
+ vperm 7,7,7,6
+ vor 17,22,22
+ vxor 18,8,23
+ vperm 12,12,12,6
+ .long 0x7CE02799
+ vxor 7,0,18
+ vperm 13,13,13,6
+ .long 0x7D832799
+ vperm 14,14,14,6
+ .long 0x7DBA2799
+ vperm 15,15,15,6
+ .long 0x7DDB2799
+ .long 0x7DFC2799
+ addi 4,4,0x50
+ bne .Lxts_dec6x_steal
+ b .Lxts_dec6x_done
+
+.align 4
+.Lxts_dec6x_four:
+ vxor 7,2,17
+ vxor 12,3,18
+ vxor 13,4,19
+ vxor 14,5,20
+ vxor 15,15,15
+
+ bl _aesp8_xts_dec5x
+
+ vperm 7,7,7,6
+ vor 17,21,21
+ vor 18,22,22
+ vperm 12,12,12,6
+ .long 0x7CE02799
+ vxor 7,0,22
+ vperm 13,13,13,6
+ .long 0x7D832799
+ vperm 14,14,14,6
+ .long 0x7DBA2799
+ .long 0x7DDB2799
+ addi 4,4,0x40
+ bne .Lxts_dec6x_steal
+ b .Lxts_dec6x_done
+
+.align 4
+.Lxts_dec6x_three:
+ vxor 7,3,17
+ vxor 12,4,18
+ vxor 13,5,19
+ vxor 14,14,14
+ vxor 15,15,15
+
+ bl _aesp8_xts_dec5x
+
+ vperm 7,7,7,6
+ vor 17,20,20
+ vor 18,21,21
+ vperm 12,12,12,6
+ .long 0x7CE02799
+ vxor 7,0,21
+ vperm 13,13,13,6
+ .long 0x7D832799
+ .long 0x7DBA2799
+ addi 4,4,0x30
+ bne .Lxts_dec6x_steal
+ b .Lxts_dec6x_done
+
+.align 4
+.Lxts_dec6x_two:
+ vxor 7,4,17
+ vxor 12,5,18
+ vxor 13,13,13
+ vxor 14,14,14
+ vxor 15,15,15
+
+ bl _aesp8_xts_dec5x
+
+ vperm 7,7,7,6
+ vor 17,19,19
+ vor 18,20,20
+ vperm 12,12,12,6
+ .long 0x7CE02799
+ vxor 7,0,20
+ .long 0x7D832799
+ addi 4,4,0x20
+ bne .Lxts_dec6x_steal
+ b .Lxts_dec6x_done
+
+.align 4
+.Lxts_dec6x_one:
+ vxor 7,5,17
+ nop
+.Loop_xts_dec1x:
+ .long 0x10E7C548
+ lvx 24,26,7
+ addi 7,7,0x20
+
+ .long 0x10E7CD48
+ lvx 25,3,7
+ bdnz .Loop_xts_dec1x
+
+ subi 0,31,1
+ .long 0x10E7C548
+
+ andi. 0,0,16
+ cmpwi 31,0
+ .long 0x10E7CD48
+
+ sub 10,10,0
+ .long 0x10E7D548
+
+ .long 0x7C005699
+ .long 0x10E7DD48
+
+ addi 7,1,79
+ .long 0x10E7E548
+ lvx 24,0,7
+
+ .long 0x10E7ED48
+ lvx 25,3,7
+ vxor 17,17,31
+
+ vperm 0,0,0,6
+ .long 0x10E7F548
+
+ mtctr 9
+ .long 0x10E78D49
+
+ vor 17,18,18
+ vor 18,19,19
+ vperm 7,7,7,6
+ .long 0x7CE02799
+ addi 4,4,0x10
+ vxor 7,0,19
+ bne .Lxts_dec6x_steal
+ b .Lxts_dec6x_done
+
+.align 4
+.Lxts_dec6x_zero:
+ cmpwi 31,0
+ beq .Lxts_dec6x_done
+
+ .long 0x7C005699
+ vperm 0,0,0,6
+ vxor 7,0,18
+.Lxts_dec6x_steal:
+ .long 0x10E7C548
+ lvx 24,26,7
+ addi 7,7,0x20
+
+ .long 0x10E7CD48
+ lvx 25,3,7
+ bdnz .Lxts_dec6x_steal
+
+ add 10,10,31
+ .long 0x10E7C548
+
+ cmpwi 31,0
+ .long 0x10E7CD48
+
+ .long 0x7C005699
+ .long 0x10E7D548
+
+ lvsr 5,0,31
+ .long 0x10E7DD48
+
+ addi 7,1,79
+ .long 0x10E7E548
+ lvx 24,0,7
+
+ .long 0x10E7ED48
+ lvx 25,3,7
+ vxor 18,18,31
+
+ vperm 0,0,0,6
+ .long 0x10E7F548
+
+ vperm 0,0,0,5
+ .long 0x11679549
+
+ vperm 7,11,11,6
+ .long 0x7CE02799
+
+
+ vxor 7,7,7
+ vspltisb 12,-1
+ vperm 7,7,12,5
+ vsel 7,0,11,7
+ vxor 7,7,17
+
+ subi 30,4,1
+ mtctr 31
+.Loop_xts_dec6x_steal:
+ lbzu 0,1(30)
+ stb 0,16(30)
+ bdnz .Loop_xts_dec6x_steal
+
+ li 31,0
+ mtctr 9
+ b .Loop_xts_dec1x
+
+.align 4
+.Lxts_dec6x_done:
+ cmpldi 8,0
+ beq .Lxts_dec6x_ret
+
+ vxor 8,17,23
+ vperm 8,8,8,6
+ .long 0x7D004799
+
+.Lxts_dec6x_ret:
+ mtlr 11
+ li 10,79
+ li 11,95
+ stvx 9,10,1
+ addi 10,10,32
+ stvx 9,11,1
+ addi 11,11,32
+ stvx 9,10,1
+ addi 10,10,32
+ stvx 9,11,1
+ addi 11,11,32
+ stvx 9,10,1
+ addi 10,10,32
+ stvx 9,11,1
+ addi 11,11,32
+ stvx 9,10,1
+ addi 10,10,32
+ stvx 9,11,1
+ addi 11,11,32
+
+ or 12,12,12
+ lvx 20,10,1
+ addi 10,10,32
+ lvx 21,11,1
+ addi 11,11,32
+ lvx 22,10,1
+ addi 10,10,32
+ lvx 23,11,1
+ addi 11,11,32
+ lvx 24,10,1
+ addi 10,10,32
+ lvx 25,11,1
+ addi 11,11,32
+ lvx 26,10,1
+ addi 10,10,32
+ lvx 27,11,1
+ addi 11,11,32
+ lvx 28,10,1
+ addi 10,10,32
+ lvx 29,11,1
+ addi 11,11,32
+ lvx 30,10,1
+ lvx 31,11,1
+ ld 26,400(1)
+ ld 27,408(1)
+ ld 28,416(1)
+ ld 29,424(1)
+ ld 30,432(1)
+ ld 31,440(1)
+ addi 1,1,448
+ blr
+.long 0
+.byte 0,12,0x04,1,0x80,6,6,0
+.long 0
+
+.align 5
+_aesp8_xts_dec5x:
+ .long 0x10E7C548
+ .long 0x118CC548
+ .long 0x11ADC548
+ .long 0x11CEC548
+ .long 0x11EFC548
+ lvx 24,26,7
+ addi 7,7,0x20
+
+ .long 0x10E7CD48
+ .long 0x118CCD48
+ .long 0x11ADCD48
+ .long 0x11CECD48
+ .long 0x11EFCD48
+ lvx 25,3,7
+ bdnz _aesp8_xts_dec5x
+
+ subi 0,31,1
+ .long 0x10E7C548
+ .long 0x118CC548
+ .long 0x11ADC548
+ .long 0x11CEC548
+ .long 0x11EFC548
+
+ andi. 0,0,16
+ cmpwi 31,0
+ .long 0x10E7CD48
+ .long 0x118CCD48
+ .long 0x11ADCD48
+ .long 0x11CECD48
+ .long 0x11EFCD48
+ vxor 17,17,31
+
+ sub 10,10,0
+ .long 0x10E7D548
+ .long 0x118CD548
+ .long 0x11ADD548
+ .long 0x11CED548
+ .long 0x11EFD548
+ vxor 1,18,31
+
+ .long 0x10E7DD48
+ .long 0x7C005699
+ .long 0x118CDD48
+ .long 0x11ADDD48
+ .long 0x11CEDD48
+ .long 0x11EFDD48
+ vxor 2,19,31
+
+ addi 7,1,79
+ .long 0x10E7E548
+ .long 0x118CE548
+ .long 0x11ADE548
+ .long 0x11CEE548
+ .long 0x11EFE548
+ lvx 24,0,7
+ vxor 3,20,31
+
+ .long 0x10E7ED48
+ vperm 0,0,0,6
+ .long 0x118CED48
+ .long 0x11ADED48
+ .long 0x11CEED48
+ .long 0x11EFED48
+ lvx 25,3,7
+ vxor 4,21,31
+
+ .long 0x10E7F548
+ .long 0x118CF548
+ .long 0x11ADF548
+ .long 0x11CEF548
+ .long 0x11EFF548
+
+ .long 0x10E78D49
+ .long 0x118C0D49
+ .long 0x11AD1549
+ .long 0x11CE1D49
+ .long 0x11EF2549
+ mtctr 9
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,0,0
+#endif // !OPENSSL_NO_ASM && __powerpc64__
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S
@@ -1,0 +1,587 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__)
+.machine "any"
+
+.abiversion 2
+.text
+
+.globl gcm_init_p8
+.type gcm_init_p8,@function
+.align 5
+gcm_init_p8:
+.localentry gcm_init_p8,0
+
+ li 0,-4096
+ li 8,0x10
+ li 12,-1
+ li 9,0x20
+ or 0,0,0
+ li 10,0x30
+ .long 0x7D202699
+
+ vspltisb 8,-16
+ vspltisb 5,1
+ vaddubm 8,8,8
+ vxor 4,4,4
+ vor 8,8,5
+ vsldoi 8,8,4,15
+ vsldoi 6,4,5,1
+ vaddubm 8,8,8
+ vspltisb 7,7
+ vor 8,8,6
+ vspltb 6,9,0
+ vsl 9,9,5
+ vsrab 6,6,7
+ vand 6,6,8
+ vxor 3,9,6
+
+ vsldoi 9,3,3,8
+ vsldoi 8,4,8,8
+ vsldoi 11,4,9,8
+ vsldoi 10,9,4,8
+
+ .long 0x7D001F99
+ .long 0x7D681F99
+ li 8,0x40
+ .long 0x7D291F99
+ li 9,0x50
+ .long 0x7D4A1F99
+ li 10,0x60
+
+ .long 0x10035CC8
+ .long 0x10234CC8
+ .long 0x104354C8
+
+ .long 0x10E044C8
+
+ vsldoi 5,1,4,8
+ vsldoi 6,4,1,8
+ vxor 0,0,5
+ vxor 2,2,6
+
+ vsldoi 0,0,0,8
+ vxor 0,0,7
+
+ vsldoi 6,0,0,8
+ .long 0x100044C8
+ vxor 6,6,2
+ vxor 16,0,6
+
+ vsldoi 17,16,16,8
+ vsldoi 19,4,17,8
+ vsldoi 18,17,4,8
+
+ .long 0x7E681F99
+ li 8,0x70
+ .long 0x7E291F99
+ li 9,0x80
+ .long 0x7E4A1F99
+ li 10,0x90
+ .long 0x10039CC8
+ .long 0x11B09CC8
+ .long 0x10238CC8
+ .long 0x11D08CC8
+ .long 0x104394C8
+ .long 0x11F094C8
+
+ .long 0x10E044C8
+ .long 0x114D44C8
+
+ vsldoi 5,1,4,8
+ vsldoi 6,4,1,8
+ vsldoi 11,14,4,8
+ vsldoi 9,4,14,8
+ vxor 0,0,5
+ vxor 2,2,6
+ vxor 13,13,11
+ vxor 15,15,9
+
+ vsldoi 0,0,0,8
+ vsldoi 13,13,13,8
+ vxor 0,0,7
+ vxor 13,13,10
+
+ vsldoi 6,0,0,8
+ vsldoi 9,13,13,8
+ .long 0x100044C8
+ .long 0x11AD44C8
+ vxor 6,6,2
+ vxor 9,9,15
+ vxor 0,0,6
+ vxor 13,13,9
+
+ vsldoi 9,0,0,8
+ vsldoi 17,13,13,8
+ vsldoi 11,4,9,8
+ vsldoi 10,9,4,8
+ vsldoi 19,4,17,8
+ vsldoi 18,17,4,8
+
+ .long 0x7D681F99
+ li 8,0xa0
+ .long 0x7D291F99
+ li 9,0xb0
+ .long 0x7D4A1F99
+ li 10,0xc0
+ .long 0x7E681F99
+ .long 0x7E291F99
+ .long 0x7E4A1F99
+
+ or 12,12,12
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,2,0
+.long 0
+.size gcm_init_p8,.-gcm_init_p8
+.globl gcm_gmult_p8
+.type gcm_gmult_p8,@function
+.align 5
+gcm_gmult_p8:
+.localentry gcm_gmult_p8,0
+
+ lis 0,0xfff8
+ li 8,0x10
+ li 12,-1
+ li 9,0x20
+ or 0,0,0
+ li 10,0x30
+ .long 0x7C601E99
+
+ .long 0x7D682699
+ lvsl 12,0,0
+ .long 0x7D292699
+ vspltisb 5,0x07
+ .long 0x7D4A2699
+ vxor 12,12,5
+ .long 0x7D002699
+ vperm 3,3,3,12
+ vxor 4,4,4
+
+ .long 0x10035CC8
+ .long 0x10234CC8
+ .long 0x104354C8
+
+ .long 0x10E044C8
+
+ vsldoi 5,1,4,8
+ vsldoi 6,4,1,8
+ vxor 0,0,5
+ vxor 2,2,6
+
+ vsldoi 0,0,0,8
+ vxor 0,0,7
+
+ vsldoi 6,0,0,8
+ .long 0x100044C8
+ vxor 6,6,2
+ vxor 0,0,6
+
+ vperm 0,0,0,12
+ .long 0x7C001F99
+
+ or 12,12,12
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,2,0
+.long 0
+.size gcm_gmult_p8,.-gcm_gmult_p8
+
+.globl gcm_ghash_p8
+.type gcm_ghash_p8,@function
+.align 5
+gcm_ghash_p8:
+.localentry gcm_ghash_p8,0
+
+ li 0,-4096
+ li 8,0x10
+ li 12,-1
+ li 9,0x20
+ or 0,0,0
+ li 10,0x30
+ .long 0x7C001E99
+
+ .long 0x7D682699
+ li 8,0x40
+ lvsl 12,0,0
+ .long 0x7D292699
+ li 9,0x50
+ vspltisb 5,0x07
+ .long 0x7D4A2699
+ li 10,0x60
+ vxor 12,12,5
+ .long 0x7D002699
+ vperm 0,0,0,12
+ vxor 4,4,4
+
+ cmpldi 6,64
+ bge .Lgcm_ghash_p8_4x
+
+ .long 0x7C602E99
+ addi 5,5,16
+ subic. 6,6,16
+ vperm 3,3,3,12
+ vxor 3,3,0
+ beq .Lshort
+
+ .long 0x7E682699
+ li 8,16
+ .long 0x7E292699
+ add 9,5,6
+ .long 0x7E4A2699
+
+
+.align 5
+.Loop_2x:
+ .long 0x7E002E99
+ vperm 16,16,16,12
+
+ subic 6,6,32
+ .long 0x10039CC8
+ .long 0x11B05CC8
+ subfe 0,0,0
+ .long 0x10238CC8
+ .long 0x11D04CC8
+ and 0,0,6
+ .long 0x104394C8
+ .long 0x11F054C8
+ add 5,5,0
+
+ vxor 0,0,13
+ vxor 1,1,14
+
+ .long 0x10E044C8
+
+ vsldoi 5,1,4,8
+ vsldoi 6,4,1,8
+ vxor 2,2,15
+ vxor 0,0,5
+ vxor 2,2,6
+
+ vsldoi 0,0,0,8
+ vxor 0,0,7
+ .long 0x7C682E99
+ addi 5,5,32
+
+ vsldoi 6,0,0,8
+ .long 0x100044C8
+ vperm 3,3,3,12
+ vxor 6,6,2
+ vxor 3,3,6
+ vxor 3,3,0
+ cmpld 9,5
+ bgt .Loop_2x
+
+ cmplwi 6,0
+ bne .Leven
+
+.Lshort:
+ .long 0x10035CC8
+ .long 0x10234CC8
+ .long 0x104354C8
+
+ .long 0x10E044C8
+
+ vsldoi 5,1,4,8
+ vsldoi 6,4,1,8
+ vxor 0,0,5
+ vxor 2,2,6
+
+ vsldoi 0,0,0,8
+ vxor 0,0,7
+
+ vsldoi 6,0,0,8
+ .long 0x100044C8
+ vxor 6,6,2
+
+.Leven:
+ vxor 0,0,6
+ vperm 0,0,0,12
+ .long 0x7C001F99
+
+ or 12,12,12
+ blr
+.long 0
+.byte 0,12,0x14,0,0,0,4,0
+.long 0
+.align 5
+.gcm_ghash_p8_4x:
+.Lgcm_ghash_p8_4x:
+ stdu 1,-256(1)
+ li 10,63
+ li 11,79
+ stvx 20,10,1
+ addi 10,10,32
+ stvx 21,11,1
+ addi 11,11,32
+ stvx 22,10,1
+ addi 10,10,32
+ stvx 23,11,1
+ addi 11,11,32
+ stvx 24,10,1
+ addi 10,10,32
+ stvx 25,11,1
+ addi 11,11,32
+ stvx 26,10,1
+ addi 10,10,32
+ stvx 27,11,1
+ addi 11,11,32
+ stvx 28,10,1
+ addi 10,10,32
+ stvx 29,11,1
+ addi 11,11,32
+ stvx 30,10,1
+ li 10,0x60
+ stvx 31,11,1
+ li 0,-1
+ stw 12,252(1)
+ or 0,0,0
+
+ lvsl 5,0,8
+
+ li 8,0x70
+ .long 0x7E292699
+ li 9,0x80
+ vspltisb 6,8
+
+ li 10,0x90
+ .long 0x7EE82699
+ li 8,0xa0
+ .long 0x7F092699
+ li 9,0xb0
+ .long 0x7F2A2699
+ li 10,0xc0
+ .long 0x7FA82699
+ li 8,0x10
+ .long 0x7FC92699
+ li 9,0x20
+ .long 0x7FEA2699
+ li 10,0x30
+
+ vsldoi 7,4,6,8
+ vaddubm 18,5,7
+ vaddubm 19,6,18
+
+ srdi 6,6,4
+
+ .long 0x7C602E99
+ .long 0x7E082E99
+ subic. 6,6,8
+ .long 0x7EC92E99
+ .long 0x7F8A2E99
+ addi 5,5,0x40
+ vperm 3,3,3,12
+ vperm 16,16,16,12
+ vperm 22,22,22,12
+ vperm 28,28,28,12
+
+ vxor 2,3,0
+
+ .long 0x11B0BCC8
+ .long 0x11D0C4C8
+ .long 0x11F0CCC8
+
+ vperm 11,17,9,18
+ vperm 5,22,28,19
+ vperm 10,17,9,19
+ vperm 6,22,28,18
+ .long 0x12B68CC8
+ .long 0x12855CC8
+ .long 0x137C4CC8
+ .long 0x134654C8
+
+ vxor 21,21,14
+ vxor 20,20,13
+ vxor 27,27,21
+ vxor 26,26,15
+
+ blt .Ltail_4x
+
+.Loop_4x:
+ .long 0x7C602E99
+ .long 0x7E082E99
+ subic. 6,6,4
+ .long 0x7EC92E99
+ .long 0x7F8A2E99
+ addi 5,5,0x40
+ vperm 16,16,16,12
+ vperm 22,22,22,12
+ vperm 28,28,28,12
+ vperm 3,3,3,12
+
+ .long 0x1002ECC8
+ .long 0x1022F4C8
+ .long 0x1042FCC8
+ .long 0x11B0BCC8
+ .long 0x11D0C4C8
+ .long 0x11F0CCC8
+
+ vxor 0,0,20
+ vxor 1,1,27
+ vxor 2,2,26
+ vperm 5,22,28,19
+ vperm 6,22,28,18
+
+ .long 0x10E044C8
+ .long 0x12855CC8
+ .long 0x134654C8
+
+ vsldoi 5,1,4,8
+ vsldoi 6,4,1,8
+ vxor 0,0,5
+ vxor 2,2,6
+
+ vsldoi 0,0,0,8
+ vxor 0,0,7
+
+ vsldoi 6,0,0,8
+ .long 0x12B68CC8
+ .long 0x137C4CC8
+ .long 0x100044C8
+
+ vxor 20,20,13
+ vxor 26,26,15
+ vxor 2,2,3
+ vxor 21,21,14
+ vxor 2,2,6
+ vxor 27,27,21
+ vxor 2,2,0
+ bge .Loop_4x
+
+.Ltail_4x:
+ .long 0x1002ECC8
+ .long 0x1022F4C8
+ .long 0x1042FCC8
+
+ vxor 0,0,20
+ vxor 1,1,27
+
+ .long 0x10E044C8
+
+ vsldoi 5,1,4,8
+ vsldoi 6,4,1,8
+ vxor 2,2,26
+ vxor 0,0,5
+ vxor 2,2,6
+
+ vsldoi 0,0,0,8
+ vxor 0,0,7
+
+ vsldoi 6,0,0,8
+ .long 0x100044C8
+ vxor 6,6,2
+ vxor 0,0,6
+
+ addic. 6,6,4
+ beq .Ldone_4x
+
+ .long 0x7C602E99
+ cmpldi 6,2
+ li 6,-4
+ blt .Lone
+ .long 0x7E082E99
+ beq .Ltwo
+
+.Lthree:
+ .long 0x7EC92E99
+ vperm 3,3,3,12
+ vperm 16,16,16,12
+ vperm 22,22,22,12
+
+ vxor 2,3,0
+ vor 29,23,23
+ vor 30, 24, 24
+ vor 31,25,25
+
+ vperm 5,16,22,19
+ vperm 6,16,22,18
+ .long 0x12B08CC8
+ .long 0x13764CC8
+ .long 0x12855CC8
+ .long 0x134654C8
+
+ vxor 27,27,21
+ b .Ltail_4x
+
+.align 4
+.Ltwo:
+ vperm 3,3,3,12
+ vperm 16,16,16,12
+
+ vxor 2,3,0
+ vperm 5,4,16,19
+ vperm 6,4,16,18
+
+ vsldoi 29,4,17,8
+ vor 30, 17, 17
+ vsldoi 31,17,4,8
+
+ .long 0x12855CC8
+ .long 0x13704CC8
+ .long 0x134654C8
+
+ b .Ltail_4x
+
+.align 4
+.Lone:
+ vperm 3,3,3,12
+
+ vsldoi 29,4,9,8
+ vor 30, 9, 9
+ vsldoi 31,9,4,8
+
+ vxor 2,3,0
+ vxor 20,20,20
+ vxor 27,27,27
+ vxor 26,26,26
+
+ b .Ltail_4x
+
+.Ldone_4x:
+ vperm 0,0,0,12
+ .long 0x7C001F99
+
+ li 10,63
+ li 11,79
+ or 12,12,12
+ lvx 20,10,1
+ addi 10,10,32
+ lvx 21,11,1
+ addi 11,11,32
+ lvx 22,10,1
+ addi 10,10,32
+ lvx 23,11,1
+ addi 11,11,32
+ lvx 24,10,1
+ addi 10,10,32
+ lvx 25,11,1
+ addi 11,11,32
+ lvx 26,10,1
+ addi 10,10,32
+ lvx 27,11,1
+ addi 11,11,32
+ lvx 28,10,1
+ addi 10,10,32
+ lvx 29,11,1
+ addi 11,11,32
+ lvx 30,10,1
+ lvx 31,11,1
+ addi 1,1,256
+ blr
+.long 0
+.byte 0,12,0x04,0,0x80,0,4,0
+.long 0
+.size gcm_ghash_p8,.-gcm_ghash_p8
+
+.byte 71,72,65,83,72,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 2
+.align 2
+#endif // !OPENSSL_NO_ASM && __powerpc64__
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-ppc64le/crypto/test/trampoline-ppc.S
@@ -1,0 +1,1410 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__)
+.machine "any"
+.abiversion 2
+.text
+
+
+
+
+
+
+
+.globl abi_test_trampoline
+.type abi_test_trampoline,@function
+.align 5
+abi_test_trampoline:
+.localentry abi_test_trampoline,0
+
+
+ mflr 0
+ std 0, 16(1)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ stdu 1, -528(1)
+
+ mfcr 0
+ std 0, 8(1)
+ std 2, 24(1)
+ std 4, 32(1)
+ li 11, 48
+ stvx 20, 11, 1
+ li 11, 64
+ stvx 21, 11, 1
+ li 11, 80
+ stvx 22, 11, 1
+ li 11, 96
+ stvx 23, 11, 1
+ li 11, 112
+ stvx 24, 11, 1
+ li 11, 128
+ stvx 25, 11, 1
+ li 11, 144
+ stvx 26, 11, 1
+ li 11, 160
+ stvx 27, 11, 1
+ li 11, 176
+ stvx 28, 11, 1
+ li 11, 192
+ stvx 29, 11, 1
+ li 11, 208
+ stvx 30, 11, 1
+ li 11, 224
+ stvx 31, 11, 1
+ std 14, 240(1)
+ std 15, 248(1)
+ std 16, 256(1)
+ std 17, 264(1)
+ std 18, 272(1)
+ std 19, 280(1)
+ std 20, 288(1)
+ std 21, 296(1)
+ std 22, 304(1)
+ std 23, 312(1)
+ std 24, 320(1)
+ std 25, 328(1)
+ std 26, 336(1)
+ std 27, 344(1)
+ std 28, 352(1)
+ std 29, 360(1)
+ std 30, 368(1)
+ std 31, 376(1)
+ stfd 14, 384(1)
+ stfd 15, 392(1)
+ stfd 16, 400(1)
+ stfd 17, 408(1)
+ stfd 18, 416(1)
+ stfd 19, 424(1)
+ stfd 20, 432(1)
+ stfd 21, 440(1)
+ stfd 22, 448(1)
+ stfd 23, 456(1)
+ stfd 24, 464(1)
+ stfd 25, 472(1)
+ stfd 26, 480(1)
+ stfd 27, 488(1)
+ stfd 28, 496(1)
+ stfd 29, 504(1)
+ stfd 30, 512(1)
+ stfd 31, 520(1)
+ li 11, 0
+ lvx 20, 11, 4
+ li 11, 16
+ lvx 21, 11, 4
+ li 11, 32
+ lvx 22, 11, 4
+ li 11, 48
+ lvx 23, 11, 4
+ li 11, 64
+ lvx 24, 11, 4
+ li 11, 80
+ lvx 25, 11, 4
+ li 11, 96
+ lvx 26, 11, 4
+ li 11, 112
+ lvx 27, 11, 4
+ li 11, 128
+ lvx 28, 11, 4
+ li 11, 144
+ lvx 29, 11, 4
+ li 11, 160
+ lvx 30, 11, 4
+ li 11, 176
+ lvx 31, 11, 4
+ ld 14, 192(4)
+ ld 15, 200(4)
+ ld 16, 208(4)
+ ld 17, 216(4)
+ ld 18, 224(4)
+ ld 19, 232(4)
+ ld 20, 240(4)
+ ld 21, 248(4)
+ ld 22, 256(4)
+ ld 23, 264(4)
+ ld 24, 272(4)
+ ld 25, 280(4)
+ ld 26, 288(4)
+ ld 27, 296(4)
+ ld 28, 304(4)
+ ld 29, 312(4)
+ ld 30, 320(4)
+ ld 31, 328(4)
+ lfd 14, 336(4)
+ lfd 15, 344(4)
+ lfd 16, 352(4)
+ lfd 17, 360(4)
+ lfd 18, 368(4)
+ lfd 19, 376(4)
+ lfd 20, 384(4)
+ lfd 21, 392(4)
+ lfd 22, 400(4)
+ lfd 23, 408(4)
+ lfd 24, 416(4)
+ lfd 25, 424(4)
+ lfd 26, 432(4)
+ lfd 27, 440(4)
+ lfd 28, 448(4)
+ lfd 29, 456(4)
+ lfd 30, 464(4)
+ lfd 31, 472(4)
+
+ ld 0, 480(4)
+ mtcr 0
+
+
+ addi 11, 5, -8
+ mr 12, 3
+
+
+ cmpdi 6, 0
+ beq .Largs_done
+ mtctr 6
+ ldu 3, 8(11)
+ bdz .Largs_done
+ ldu 4, 8(11)
+ bdz .Largs_done
+ ldu 5, 8(11)
+ bdz .Largs_done
+ ldu 6, 8(11)
+ bdz .Largs_done
+ ldu 7, 8(11)
+ bdz .Largs_done
+ ldu 8, 8(11)
+ bdz .Largs_done
+ ldu 9, 8(11)
+ bdz .Largs_done
+ ldu 10, 8(11)
+
+.Largs_done:
+ li 2, 0
+ mtctr 12
+ bctrl
+ ld 2, 24(1)
+
+ ld 4, 32(1)
+ li 11, 0
+ stvx 20, 11, 4
+ li 11, 16
+ stvx 21, 11, 4
+ li 11, 32
+ stvx 22, 11, 4
+ li 11, 48
+ stvx 23, 11, 4
+ li 11, 64
+ stvx 24, 11, 4
+ li 11, 80
+ stvx 25, 11, 4
+ li 11, 96
+ stvx 26, 11, 4
+ li 11, 112
+ stvx 27, 11, 4
+ li 11, 128
+ stvx 28, 11, 4
+ li 11, 144
+ stvx 29, 11, 4
+ li 11, 160
+ stvx 30, 11, 4
+ li 11, 176
+ stvx 31, 11, 4
+ std 14, 192(4)
+ std 15, 200(4)
+ std 16, 208(4)
+ std 17, 216(4)
+ std 18, 224(4)
+ std 19, 232(4)
+ std 20, 240(4)
+ std 21, 248(4)
+ std 22, 256(4)
+ std 23, 264(4)
+ std 24, 272(4)
+ std 25, 280(4)
+ std 26, 288(4)
+ std 27, 296(4)
+ std 28, 304(4)
+ std 29, 312(4)
+ std 30, 320(4)
+ std 31, 328(4)
+ stfd 14, 336(4)
+ stfd 15, 344(4)
+ stfd 16, 352(4)
+ stfd 17, 360(4)
+ stfd 18, 368(4)
+ stfd 19, 376(4)
+ stfd 20, 384(4)
+ stfd 21, 392(4)
+ stfd 22, 400(4)
+ stfd 23, 408(4)
+ stfd 24, 416(4)
+ stfd 25, 424(4)
+ stfd 26, 432(4)
+ stfd 27, 440(4)
+ stfd 28, 448(4)
+ stfd 29, 456(4)
+ stfd 30, 464(4)
+ stfd 31, 472(4)
+ li 11, 48
+ lvx 20, 11, 1
+ li 11, 64
+ lvx 21, 11, 1
+ li 11, 80
+ lvx 22, 11, 1
+ li 11, 96
+ lvx 23, 11, 1
+ li 11, 112
+ lvx 24, 11, 1
+ li 11, 128
+ lvx 25, 11, 1
+ li 11, 144
+ lvx 26, 11, 1
+ li 11, 160
+ lvx 27, 11, 1
+ li 11, 176
+ lvx 28, 11, 1
+ li 11, 192
+ lvx 29, 11, 1
+ li 11, 208
+ lvx 30, 11, 1
+ li 11, 224
+ lvx 31, 11, 1
+ ld 14, 240(1)
+ ld 15, 248(1)
+ ld 16, 256(1)
+ ld 17, 264(1)
+ ld 18, 272(1)
+ ld 19, 280(1)
+ ld 20, 288(1)
+ ld 21, 296(1)
+ ld 22, 304(1)
+ ld 23, 312(1)
+ ld 24, 320(1)
+ ld 25, 328(1)
+ ld 26, 336(1)
+ ld 27, 344(1)
+ ld 28, 352(1)
+ ld 29, 360(1)
+ ld 30, 368(1)
+ ld 31, 376(1)
+ lfd 14, 384(1)
+ lfd 15, 392(1)
+ lfd 16, 400(1)
+ lfd 17, 408(1)
+ lfd 18, 416(1)
+ lfd 19, 424(1)
+ lfd 20, 432(1)
+ lfd 21, 440(1)
+ lfd 22, 448(1)
+ lfd 23, 456(1)
+ lfd 24, 464(1)
+ lfd 25, 472(1)
+ lfd 26, 480(1)
+ lfd 27, 488(1)
+ lfd 28, 496(1)
+ lfd 29, 504(1)
+ lfd 30, 512(1)
+ lfd 31, 520(1)
+ mfcr 0
+ std 0, 480(4)
+ ld 0, 8(1)
+ mtcrf 0b00111000, 0
+ addi 1, 1, 528
+ ld 0, 16(1)
+ mtlr 0
+ blr
+.size abi_test_trampoline,.-abi_test_trampoline
+.globl abi_test_clobber_r0
+.type abi_test_clobber_r0,@function
+.align 5
+abi_test_clobber_r0:
+.localentry abi_test_clobber_r0,0
+
+ li 0, 0
+ blr
+.size abi_test_clobber_r0,.-abi_test_clobber_r0
+.globl abi_test_clobber_r2
+.type abi_test_clobber_r2,@function
+.align 5
+abi_test_clobber_r2:
+.localentry abi_test_clobber_r2,0
+
+ li 2, 0
+ blr
+.size abi_test_clobber_r2,.-abi_test_clobber_r2
+.globl abi_test_clobber_r3
+.type abi_test_clobber_r3,@function
+.align 5
+abi_test_clobber_r3:
+.localentry abi_test_clobber_r3,0
+
+ li 3, 0
+ blr
+.size abi_test_clobber_r3,.-abi_test_clobber_r3
+.globl abi_test_clobber_r4
+.type abi_test_clobber_r4,@function
+.align 5
+abi_test_clobber_r4:
+.localentry abi_test_clobber_r4,0
+
+ li 4, 0
+ blr
+.size abi_test_clobber_r4,.-abi_test_clobber_r4
+.globl abi_test_clobber_r5
+.type abi_test_clobber_r5,@function
+.align 5
+abi_test_clobber_r5:
+.localentry abi_test_clobber_r5,0
+
+ li 5, 0
+ blr
+.size abi_test_clobber_r5,.-abi_test_clobber_r5
+.globl abi_test_clobber_r6
+.type abi_test_clobber_r6,@function
+.align 5
+abi_test_clobber_r6:
+.localentry abi_test_clobber_r6,0
+
+ li 6, 0
+ blr
+.size abi_test_clobber_r6,.-abi_test_clobber_r6
+.globl abi_test_clobber_r7
+.type abi_test_clobber_r7,@function
+.align 5
+abi_test_clobber_r7:
+.localentry abi_test_clobber_r7,0
+
+ li 7, 0
+ blr
+.size abi_test_clobber_r7,.-abi_test_clobber_r7
+.globl abi_test_clobber_r8
+.type abi_test_clobber_r8,@function
+.align 5
+abi_test_clobber_r8:
+.localentry abi_test_clobber_r8,0
+
+ li 8, 0
+ blr
+.size abi_test_clobber_r8,.-abi_test_clobber_r8
+.globl abi_test_clobber_r9
+.type abi_test_clobber_r9,@function
+.align 5
+abi_test_clobber_r9:
+.localentry abi_test_clobber_r9,0
+
+ li 9, 0
+ blr
+.size abi_test_clobber_r9,.-abi_test_clobber_r9
+.globl abi_test_clobber_r10
+.type abi_test_clobber_r10,@function
+.align 5
+abi_test_clobber_r10:
+.localentry abi_test_clobber_r10,0
+
+ li 10, 0
+ blr
+.size abi_test_clobber_r10,.-abi_test_clobber_r10
+.globl abi_test_clobber_r11
+.type abi_test_clobber_r11,@function
+.align 5
+abi_test_clobber_r11:
+.localentry abi_test_clobber_r11,0
+
+ li 11, 0
+ blr
+.size abi_test_clobber_r11,.-abi_test_clobber_r11
+.globl abi_test_clobber_r12
+.type abi_test_clobber_r12,@function
+.align 5
+abi_test_clobber_r12:
+.localentry abi_test_clobber_r12,0
+
+ li 12, 0
+ blr
+.size abi_test_clobber_r12,.-abi_test_clobber_r12
+.globl abi_test_clobber_r14
+.type abi_test_clobber_r14,@function
+.align 5
+abi_test_clobber_r14:
+.localentry abi_test_clobber_r14,0
+
+ li 14, 0
+ blr
+.size abi_test_clobber_r14,.-abi_test_clobber_r14
+.globl abi_test_clobber_r15
+.type abi_test_clobber_r15,@function
+.align 5
+abi_test_clobber_r15:
+.localentry abi_test_clobber_r15,0
+
+ li 15, 0
+ blr
+.size abi_test_clobber_r15,.-abi_test_clobber_r15
+.globl abi_test_clobber_r16
+.type abi_test_clobber_r16,@function
+.align 5
+abi_test_clobber_r16:
+.localentry abi_test_clobber_r16,0
+
+ li 16, 0
+ blr
+.size abi_test_clobber_r16,.-abi_test_clobber_r16
+.globl abi_test_clobber_r17
+.type abi_test_clobber_r17,@function
+.align 5
+abi_test_clobber_r17:
+.localentry abi_test_clobber_r17,0
+
+ li 17, 0
+ blr
+.size abi_test_clobber_r17,.-abi_test_clobber_r17
+.globl abi_test_clobber_r18
+.type abi_test_clobber_r18,@function
+.align 5
+abi_test_clobber_r18:
+.localentry abi_test_clobber_r18,0
+
+ li 18, 0
+ blr
+.size abi_test_clobber_r18,.-abi_test_clobber_r18
+.globl abi_test_clobber_r19
+.type abi_test_clobber_r19,@function
+.align 5
+abi_test_clobber_r19:
+.localentry abi_test_clobber_r19,0
+
+ li 19, 0
+ blr
+.size abi_test_clobber_r19,.-abi_test_clobber_r19
+.globl abi_test_clobber_r20
+.type abi_test_clobber_r20,@function
+.align 5
+abi_test_clobber_r20:
+.localentry abi_test_clobber_r20,0
+
+ li 20, 0
+ blr
+.size abi_test_clobber_r20,.-abi_test_clobber_r20
+.globl abi_test_clobber_r21
+.type abi_test_clobber_r21,@function
+.align 5
+abi_test_clobber_r21:
+.localentry abi_test_clobber_r21,0
+
+ li 21, 0
+ blr
+.size abi_test_clobber_r21,.-abi_test_clobber_r21
+.globl abi_test_clobber_r22
+.type abi_test_clobber_r22,@function
+.align 5
+abi_test_clobber_r22:
+.localentry abi_test_clobber_r22,0
+
+ li 22, 0
+ blr
+.size abi_test_clobber_r22,.-abi_test_clobber_r22
+.globl abi_test_clobber_r23
+.type abi_test_clobber_r23,@function
+.align 5
+abi_test_clobber_r23:
+.localentry abi_test_clobber_r23,0
+
+ li 23, 0
+ blr
+.size abi_test_clobber_r23,.-abi_test_clobber_r23
+.globl abi_test_clobber_r24
+.type abi_test_clobber_r24,@function
+.align 5
+abi_test_clobber_r24:
+.localentry abi_test_clobber_r24,0
+
+ li 24, 0
+ blr
+.size abi_test_clobber_r24,.-abi_test_clobber_r24
+.globl abi_test_clobber_r25
+.type abi_test_clobber_r25,@function
+.align 5
+abi_test_clobber_r25:
+.localentry abi_test_clobber_r25,0
+
+ li 25, 0
+ blr
+.size abi_test_clobber_r25,.-abi_test_clobber_r25
+.globl abi_test_clobber_r26
+.type abi_test_clobber_r26,@function
+.align 5
+abi_test_clobber_r26:
+.localentry abi_test_clobber_r26,0
+
+ li 26, 0
+ blr
+.size abi_test_clobber_r26,.-abi_test_clobber_r26
+.globl abi_test_clobber_r27
+.type abi_test_clobber_r27,@function
+.align 5
+abi_test_clobber_r27:
+.localentry abi_test_clobber_r27,0
+
+ li 27, 0
+ blr
+.size abi_test_clobber_r27,.-abi_test_clobber_r27
+.globl abi_test_clobber_r28
+.type abi_test_clobber_r28,@function
+.align 5
+abi_test_clobber_r28:
+.localentry abi_test_clobber_r28,0
+
+ li 28, 0
+ blr
+.size abi_test_clobber_r28,.-abi_test_clobber_r28
+.globl abi_test_clobber_r29
+.type abi_test_clobber_r29,@function
+.align 5
+abi_test_clobber_r29:
+.localentry abi_test_clobber_r29,0
+
+ li 29, 0
+ blr
+.size abi_test_clobber_r29,.-abi_test_clobber_r29
+.globl abi_test_clobber_r30
+.type abi_test_clobber_r30,@function
+.align 5
+abi_test_clobber_r30:
+.localentry abi_test_clobber_r30,0
+
+ li 30, 0
+ blr
+.size abi_test_clobber_r30,.-abi_test_clobber_r30
+.globl abi_test_clobber_r31
+.type abi_test_clobber_r31,@function
+.align 5
+abi_test_clobber_r31:
+.localentry abi_test_clobber_r31,0
+
+ li 31, 0
+ blr
+.size abi_test_clobber_r31,.-abi_test_clobber_r31
+.globl abi_test_clobber_f0
+.type abi_test_clobber_f0,@function
+.align 4
+abi_test_clobber_f0:
+.localentry abi_test_clobber_f0,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 0, -8(1)
+ blr
+.size abi_test_clobber_f0,.-abi_test_clobber_f0
+.globl abi_test_clobber_f1
+.type abi_test_clobber_f1,@function
+.align 4
+abi_test_clobber_f1:
+.localentry abi_test_clobber_f1,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 1, -8(1)
+ blr
+.size abi_test_clobber_f1,.-abi_test_clobber_f1
+.globl abi_test_clobber_f2
+.type abi_test_clobber_f2,@function
+.align 4
+abi_test_clobber_f2:
+.localentry abi_test_clobber_f2,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 2, -8(1)
+ blr
+.size abi_test_clobber_f2,.-abi_test_clobber_f2
+.globl abi_test_clobber_f3
+.type abi_test_clobber_f3,@function
+.align 4
+abi_test_clobber_f3:
+.localentry abi_test_clobber_f3,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 3, -8(1)
+ blr
+.size abi_test_clobber_f3,.-abi_test_clobber_f3
+.globl abi_test_clobber_f4
+.type abi_test_clobber_f4,@function
+.align 4
+abi_test_clobber_f4:
+.localentry abi_test_clobber_f4,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 4, -8(1)
+ blr
+.size abi_test_clobber_f4,.-abi_test_clobber_f4
+.globl abi_test_clobber_f5
+.type abi_test_clobber_f5,@function
+.align 4
+abi_test_clobber_f5:
+.localentry abi_test_clobber_f5,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 5, -8(1)
+ blr
+.size abi_test_clobber_f5,.-abi_test_clobber_f5
+.globl abi_test_clobber_f6
+.type abi_test_clobber_f6,@function
+.align 4
+abi_test_clobber_f6:
+.localentry abi_test_clobber_f6,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 6, -8(1)
+ blr
+.size abi_test_clobber_f6,.-abi_test_clobber_f6
+.globl abi_test_clobber_f7
+.type abi_test_clobber_f7,@function
+.align 4
+abi_test_clobber_f7:
+.localentry abi_test_clobber_f7,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 7, -8(1)
+ blr
+.size abi_test_clobber_f7,.-abi_test_clobber_f7
+.globl abi_test_clobber_f8
+.type abi_test_clobber_f8,@function
+.align 4
+abi_test_clobber_f8:
+.localentry abi_test_clobber_f8,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 8, -8(1)
+ blr
+.size abi_test_clobber_f8,.-abi_test_clobber_f8
+.globl abi_test_clobber_f9
+.type abi_test_clobber_f9,@function
+.align 4
+abi_test_clobber_f9:
+.localentry abi_test_clobber_f9,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 9, -8(1)
+ blr
+.size abi_test_clobber_f9,.-abi_test_clobber_f9
+.globl abi_test_clobber_f10
+.type abi_test_clobber_f10,@function
+.align 4
+abi_test_clobber_f10:
+.localentry abi_test_clobber_f10,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 10, -8(1)
+ blr
+.size abi_test_clobber_f10,.-abi_test_clobber_f10
+.globl abi_test_clobber_f11
+.type abi_test_clobber_f11,@function
+.align 4
+abi_test_clobber_f11:
+.localentry abi_test_clobber_f11,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 11, -8(1)
+ blr
+.size abi_test_clobber_f11,.-abi_test_clobber_f11
+.globl abi_test_clobber_f12
+.type abi_test_clobber_f12,@function
+.align 4
+abi_test_clobber_f12:
+.localentry abi_test_clobber_f12,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 12, -8(1)
+ blr
+.size abi_test_clobber_f12,.-abi_test_clobber_f12
+.globl abi_test_clobber_f13
+.type abi_test_clobber_f13,@function
+.align 4
+abi_test_clobber_f13:
+.localentry abi_test_clobber_f13,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 13, -8(1)
+ blr
+.size abi_test_clobber_f13,.-abi_test_clobber_f13
+.globl abi_test_clobber_f14
+.type abi_test_clobber_f14,@function
+.align 4
+abi_test_clobber_f14:
+.localentry abi_test_clobber_f14,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 14, -8(1)
+ blr
+.size abi_test_clobber_f14,.-abi_test_clobber_f14
+.globl abi_test_clobber_f15
+.type abi_test_clobber_f15,@function
+.align 4
+abi_test_clobber_f15:
+.localentry abi_test_clobber_f15,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 15, -8(1)
+ blr
+.size abi_test_clobber_f15,.-abi_test_clobber_f15
+.globl abi_test_clobber_f16
+.type abi_test_clobber_f16,@function
+.align 4
+abi_test_clobber_f16:
+.localentry abi_test_clobber_f16,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 16, -8(1)
+ blr
+.size abi_test_clobber_f16,.-abi_test_clobber_f16
+.globl abi_test_clobber_f17
+.type abi_test_clobber_f17,@function
+.align 4
+abi_test_clobber_f17:
+.localentry abi_test_clobber_f17,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 17, -8(1)
+ blr
+.size abi_test_clobber_f17,.-abi_test_clobber_f17
+.globl abi_test_clobber_f18
+.type abi_test_clobber_f18,@function
+.align 4
+abi_test_clobber_f18:
+.localentry abi_test_clobber_f18,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 18, -8(1)
+ blr
+.size abi_test_clobber_f18,.-abi_test_clobber_f18
+.globl abi_test_clobber_f19
+.type abi_test_clobber_f19,@function
+.align 4
+abi_test_clobber_f19:
+.localentry abi_test_clobber_f19,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 19, -8(1)
+ blr
+.size abi_test_clobber_f19,.-abi_test_clobber_f19
+.globl abi_test_clobber_f20
+.type abi_test_clobber_f20,@function
+.align 4
+abi_test_clobber_f20:
+.localentry abi_test_clobber_f20,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 20, -8(1)
+ blr
+.size abi_test_clobber_f20,.-abi_test_clobber_f20
+.globl abi_test_clobber_f21
+.type abi_test_clobber_f21,@function
+.align 4
+abi_test_clobber_f21:
+.localentry abi_test_clobber_f21,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 21, -8(1)
+ blr
+.size abi_test_clobber_f21,.-abi_test_clobber_f21
+.globl abi_test_clobber_f22
+.type abi_test_clobber_f22,@function
+.align 4
+abi_test_clobber_f22:
+.localentry abi_test_clobber_f22,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 22, -8(1)
+ blr
+.size abi_test_clobber_f22,.-abi_test_clobber_f22
+.globl abi_test_clobber_f23
+.type abi_test_clobber_f23,@function
+.align 4
+abi_test_clobber_f23:
+.localentry abi_test_clobber_f23,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 23, -8(1)
+ blr
+.size abi_test_clobber_f23,.-abi_test_clobber_f23
+.globl abi_test_clobber_f24
+.type abi_test_clobber_f24,@function
+.align 4
+abi_test_clobber_f24:
+.localentry abi_test_clobber_f24,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 24, -8(1)
+ blr
+.size abi_test_clobber_f24,.-abi_test_clobber_f24
+.globl abi_test_clobber_f25
+.type abi_test_clobber_f25,@function
+.align 4
+abi_test_clobber_f25:
+.localentry abi_test_clobber_f25,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 25, -8(1)
+ blr
+.size abi_test_clobber_f25,.-abi_test_clobber_f25
+.globl abi_test_clobber_f26
+.type abi_test_clobber_f26,@function
+.align 4
+abi_test_clobber_f26:
+.localentry abi_test_clobber_f26,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 26, -8(1)
+ blr
+.size abi_test_clobber_f26,.-abi_test_clobber_f26
+.globl abi_test_clobber_f27
+.type abi_test_clobber_f27,@function
+.align 4
+abi_test_clobber_f27:
+.localentry abi_test_clobber_f27,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 27, -8(1)
+ blr
+.size abi_test_clobber_f27,.-abi_test_clobber_f27
+.globl abi_test_clobber_f28
+.type abi_test_clobber_f28,@function
+.align 4
+abi_test_clobber_f28:
+.localentry abi_test_clobber_f28,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 28, -8(1)
+ blr
+.size abi_test_clobber_f28,.-abi_test_clobber_f28
+.globl abi_test_clobber_f29
+.type abi_test_clobber_f29,@function
+.align 4
+abi_test_clobber_f29:
+.localentry abi_test_clobber_f29,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 29, -8(1)
+ blr
+.size abi_test_clobber_f29,.-abi_test_clobber_f29
+.globl abi_test_clobber_f30
+.type abi_test_clobber_f30,@function
+.align 4
+abi_test_clobber_f30:
+.localentry abi_test_clobber_f30,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 30, -8(1)
+ blr
+.size abi_test_clobber_f30,.-abi_test_clobber_f30
+.globl abi_test_clobber_f31
+.type abi_test_clobber_f31,@function
+.align 4
+abi_test_clobber_f31:
+.localentry abi_test_clobber_f31,0
+
+ li 0, 0
+
+ std 0, -8(1)
+ lfd 31, -8(1)
+ blr
+.size abi_test_clobber_f31,.-abi_test_clobber_f31
+.globl abi_test_clobber_v0
+.type abi_test_clobber_v0,@function
+.align 4
+abi_test_clobber_v0:
+.localentry abi_test_clobber_v0,0
+
+ vxor 0, 0, 0
+ blr
+.size abi_test_clobber_v0,.-abi_test_clobber_v0
+.globl abi_test_clobber_v1
+.type abi_test_clobber_v1,@function
+.align 4
+abi_test_clobber_v1:
+.localentry abi_test_clobber_v1,0
+
+ vxor 1, 1, 1
+ blr
+.size abi_test_clobber_v1,.-abi_test_clobber_v1
+.globl abi_test_clobber_v2
+.type abi_test_clobber_v2,@function
+.align 4
+abi_test_clobber_v2:
+.localentry abi_test_clobber_v2,0
+
+ vxor 2, 2, 2
+ blr
+.size abi_test_clobber_v2,.-abi_test_clobber_v2
+.globl abi_test_clobber_v3
+.type abi_test_clobber_v3,@function
+.align 4
+abi_test_clobber_v3:
+.localentry abi_test_clobber_v3,0
+
+ vxor 3, 3, 3
+ blr
+.size abi_test_clobber_v3,.-abi_test_clobber_v3
+.globl abi_test_clobber_v4
+.type abi_test_clobber_v4,@function
+.align 4
+abi_test_clobber_v4:
+.localentry abi_test_clobber_v4,0
+
+ vxor 4, 4, 4
+ blr
+.size abi_test_clobber_v4,.-abi_test_clobber_v4
+.globl abi_test_clobber_v5
+.type abi_test_clobber_v5,@function
+.align 4
+abi_test_clobber_v5:
+.localentry abi_test_clobber_v5,0
+
+ vxor 5, 5, 5
+ blr
+.size abi_test_clobber_v5,.-abi_test_clobber_v5
+.globl abi_test_clobber_v6
+.type abi_test_clobber_v6,@function
+.align 4
+abi_test_clobber_v6:
+.localentry abi_test_clobber_v6,0
+
+ vxor 6, 6, 6
+ blr
+.size abi_test_clobber_v6,.-abi_test_clobber_v6
+.globl abi_test_clobber_v7
+.type abi_test_clobber_v7,@function
+.align 4
+abi_test_clobber_v7:
+.localentry abi_test_clobber_v7,0
+
+ vxor 7, 7, 7
+ blr
+.size abi_test_clobber_v7,.-abi_test_clobber_v7
+.globl abi_test_clobber_v8
+.type abi_test_clobber_v8,@function
+.align 4
+abi_test_clobber_v8:
+.localentry abi_test_clobber_v8,0
+
+ vxor 8, 8, 8
+ blr
+.size abi_test_clobber_v8,.-abi_test_clobber_v8
+.globl abi_test_clobber_v9
+.type abi_test_clobber_v9,@function
+.align 4
+abi_test_clobber_v9:
+.localentry abi_test_clobber_v9,0
+
+ vxor 9, 9, 9
+ blr
+.size abi_test_clobber_v9,.-abi_test_clobber_v9
+.globl abi_test_clobber_v10
+.type abi_test_clobber_v10,@function
+.align 4
+abi_test_clobber_v10:
+.localentry abi_test_clobber_v10,0
+
+ vxor 10, 10, 10
+ blr
+.size abi_test_clobber_v10,.-abi_test_clobber_v10
+.globl abi_test_clobber_v11
+.type abi_test_clobber_v11,@function
+.align 4
+abi_test_clobber_v11:
+.localentry abi_test_clobber_v11,0
+
+ vxor 11, 11, 11
+ blr
+.size abi_test_clobber_v11,.-abi_test_clobber_v11
+.globl abi_test_clobber_v12
+.type abi_test_clobber_v12,@function
+.align 4
+abi_test_clobber_v12:
+.localentry abi_test_clobber_v12,0
+
+ vxor 12, 12, 12
+ blr
+.size abi_test_clobber_v12,.-abi_test_clobber_v12
+.globl abi_test_clobber_v13
+.type abi_test_clobber_v13,@function
+.align 4
+abi_test_clobber_v13:
+.localentry abi_test_clobber_v13,0
+
+ vxor 13, 13, 13
+ blr
+.size abi_test_clobber_v13,.-abi_test_clobber_v13
+.globl abi_test_clobber_v14
+.type abi_test_clobber_v14,@function
+.align 4
+abi_test_clobber_v14:
+.localentry abi_test_clobber_v14,0
+
+ vxor 14, 14, 14
+ blr
+.size abi_test_clobber_v14,.-abi_test_clobber_v14
+.globl abi_test_clobber_v15
+.type abi_test_clobber_v15,@function
+.align 4
+abi_test_clobber_v15:
+.localentry abi_test_clobber_v15,0
+
+ vxor 15, 15, 15
+ blr
+.size abi_test_clobber_v15,.-abi_test_clobber_v15
+.globl abi_test_clobber_v16
+.type abi_test_clobber_v16,@function
+.align 4
+abi_test_clobber_v16:
+.localentry abi_test_clobber_v16,0
+
+ vxor 16, 16, 16
+ blr
+.size abi_test_clobber_v16,.-abi_test_clobber_v16
+.globl abi_test_clobber_v17
+.type abi_test_clobber_v17,@function
+.align 4
+abi_test_clobber_v17:
+.localentry abi_test_clobber_v17,0
+
+ vxor 17, 17, 17
+ blr
+.size abi_test_clobber_v17,.-abi_test_clobber_v17
+.globl abi_test_clobber_v18
+.type abi_test_clobber_v18,@function
+.align 4
+abi_test_clobber_v18:
+.localentry abi_test_clobber_v18,0
+
+ vxor 18, 18, 18
+ blr
+.size abi_test_clobber_v18,.-abi_test_clobber_v18
+.globl abi_test_clobber_v19
+.type abi_test_clobber_v19,@function
+.align 4
+abi_test_clobber_v19:
+.localentry abi_test_clobber_v19,0
+
+ vxor 19, 19, 19
+ blr
+.size abi_test_clobber_v19,.-abi_test_clobber_v19
+.globl abi_test_clobber_v20
+.type abi_test_clobber_v20,@function
+.align 4
+abi_test_clobber_v20:
+.localentry abi_test_clobber_v20,0
+
+ vxor 20, 20, 20
+ blr
+.size abi_test_clobber_v20,.-abi_test_clobber_v20
+.globl abi_test_clobber_v21
+.type abi_test_clobber_v21,@function
+.align 4
+abi_test_clobber_v21:
+.localentry abi_test_clobber_v21,0
+
+ vxor 21, 21, 21
+ blr
+.size abi_test_clobber_v21,.-abi_test_clobber_v21
+.globl abi_test_clobber_v22
+.type abi_test_clobber_v22,@function
+.align 4
+abi_test_clobber_v22:
+.localentry abi_test_clobber_v22,0
+
+ vxor 22, 22, 22
+ blr
+.size abi_test_clobber_v22,.-abi_test_clobber_v22
+.globl abi_test_clobber_v23
+.type abi_test_clobber_v23,@function
+.align 4
+abi_test_clobber_v23:
+.localentry abi_test_clobber_v23,0
+
+ vxor 23, 23, 23
+ blr
+.size abi_test_clobber_v23,.-abi_test_clobber_v23
+.globl abi_test_clobber_v24
+.type abi_test_clobber_v24,@function
+.align 4
+abi_test_clobber_v24:
+.localentry abi_test_clobber_v24,0
+
+ vxor 24, 24, 24
+ blr
+.size abi_test_clobber_v24,.-abi_test_clobber_v24
+.globl abi_test_clobber_v25
+.type abi_test_clobber_v25,@function
+.align 4
+abi_test_clobber_v25:
+.localentry abi_test_clobber_v25,0
+
+ vxor 25, 25, 25
+ blr
+.size abi_test_clobber_v25,.-abi_test_clobber_v25
+.globl abi_test_clobber_v26
+.type abi_test_clobber_v26,@function
+.align 4
+abi_test_clobber_v26:
+.localentry abi_test_clobber_v26,0
+
+ vxor 26, 26, 26
+ blr
+.size abi_test_clobber_v26,.-abi_test_clobber_v26
+.globl abi_test_clobber_v27
+.type abi_test_clobber_v27,@function
+.align 4
+abi_test_clobber_v27:
+.localentry abi_test_clobber_v27,0
+
+ vxor 27, 27, 27
+ blr
+.size abi_test_clobber_v27,.-abi_test_clobber_v27
+.globl abi_test_clobber_v28
+.type abi_test_clobber_v28,@function
+.align 4
+abi_test_clobber_v28:
+.localentry abi_test_clobber_v28,0
+
+ vxor 28, 28, 28
+ blr
+.size abi_test_clobber_v28,.-abi_test_clobber_v28
+.globl abi_test_clobber_v29
+.type abi_test_clobber_v29,@function
+.align 4
+abi_test_clobber_v29:
+.localentry abi_test_clobber_v29,0
+
+ vxor 29, 29, 29
+ blr
+.size abi_test_clobber_v29,.-abi_test_clobber_v29
+.globl abi_test_clobber_v30
+.type abi_test_clobber_v30,@function
+.align 4
+abi_test_clobber_v30:
+.localentry abi_test_clobber_v30,0
+
+ vxor 30, 30, 30
+ blr
+.size abi_test_clobber_v30,.-abi_test_clobber_v30
+.globl abi_test_clobber_v31
+.type abi_test_clobber_v31,@function
+.align 4
+abi_test_clobber_v31:
+.localentry abi_test_clobber_v31,0
+
+ vxor 31, 31, 31
+ blr
+.size abi_test_clobber_v31,.-abi_test_clobber_v31
+.globl abi_test_clobber_cr0
+.type abi_test_clobber_cr0,@function
+.align 4
+abi_test_clobber_cr0:
+.localentry abi_test_clobber_cr0,0
+
+
+
+ mfcr 0
+ not 0, 0
+ mtcrf 128, 0
+ blr
+.size abi_test_clobber_cr0,.-abi_test_clobber_cr0
+.globl abi_test_clobber_cr1
+.type abi_test_clobber_cr1,@function
+.align 4
+abi_test_clobber_cr1:
+.localentry abi_test_clobber_cr1,0
+
+
+
+ mfcr 0
+ not 0, 0
+ mtcrf 64, 0
+ blr
+.size abi_test_clobber_cr1,.-abi_test_clobber_cr1
+.globl abi_test_clobber_cr2
+.type abi_test_clobber_cr2,@function
+.align 4
+abi_test_clobber_cr2:
+.localentry abi_test_clobber_cr2,0
+
+
+
+ mfcr 0
+ not 0, 0
+ mtcrf 32, 0
+ blr
+.size abi_test_clobber_cr2,.-abi_test_clobber_cr2
+.globl abi_test_clobber_cr3
+.type abi_test_clobber_cr3,@function
+.align 4
+abi_test_clobber_cr3:
+.localentry abi_test_clobber_cr3,0
+
+
+
+ mfcr 0
+ not 0, 0
+ mtcrf 16, 0
+ blr
+.size abi_test_clobber_cr3,.-abi_test_clobber_cr3
+.globl abi_test_clobber_cr4
+.type abi_test_clobber_cr4,@function
+.align 4
+abi_test_clobber_cr4:
+.localentry abi_test_clobber_cr4,0
+
+
+
+ mfcr 0
+ not 0, 0
+ mtcrf 8, 0
+ blr
+.size abi_test_clobber_cr4,.-abi_test_clobber_cr4
+.globl abi_test_clobber_cr5
+.type abi_test_clobber_cr5,@function
+.align 4
+abi_test_clobber_cr5:
+.localentry abi_test_clobber_cr5,0
+
+
+
+ mfcr 0
+ not 0, 0
+ mtcrf 4, 0
+ blr
+.size abi_test_clobber_cr5,.-abi_test_clobber_cr5
+.globl abi_test_clobber_cr6
+.type abi_test_clobber_cr6,@function
+.align 4
+abi_test_clobber_cr6:
+.localentry abi_test_clobber_cr6,0
+
+
+
+ mfcr 0
+ not 0, 0
+ mtcrf 2, 0
+ blr
+.size abi_test_clobber_cr6,.-abi_test_clobber_cr6
+.globl abi_test_clobber_cr7
+.type abi_test_clobber_cr7,@function
+.align 4
+abi_test_clobber_cr7:
+.localentry abi_test_clobber_cr7,0
+
+
+
+ mfcr 0
+ not 0, 0
+ mtcrf 1, 0
+ blr
+.size abi_test_clobber_cr7,.-abi_test_clobber_cr7
+.globl abi_test_clobber_ctr
+.type abi_test_clobber_ctr,@function
+.align 4
+abi_test_clobber_ctr:
+.localentry abi_test_clobber_ctr,0
+
+ li 0, 0
+ mtctr 0
+ blr
+.size abi_test_clobber_ctr,.-abi_test_clobber_ctr
+
+.globl abi_test_clobber_lr
+.type abi_test_clobber_lr,@function
+.align 4
+abi_test_clobber_lr:
+.localentry abi_test_clobber_lr,0
+
+ mflr 0
+ mtctr 0
+ li 0, 0
+ mtlr 0
+ bctr
+.size abi_test_clobber_lr,.-abi_test_clobber_lr
+
+#endif // !OPENSSL_NO_ASM && __powerpc64__
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S
@@ -1,0 +1,975 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl ChaCha20_ctr32
+.hidden ChaCha20_ctr32
+.type ChaCha20_ctr32,@function
+.align 16
+ChaCha20_ctr32:
+.L_ChaCha20_ctr32_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ xorl %eax,%eax
+ cmpl 28(%esp),%eax
+ je .L000no_data
+ call .Lpic_point
+.Lpic_point:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.Lpic_point(%eax),%ebp
+ testl $16777216,(%ebp)
+ jz .L001x86
+ testl $512,4(%ebp)
+ jz .L001x86
+ jmp .Lssse3_shortcut
+.L001x86:
+ movl 32(%esp),%esi
+ movl 36(%esp),%edi
+ subl $132,%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ movl %eax,80(%esp)
+ movl %ebx,84(%esp)
+ movl %ecx,88(%esp)
+ movl %edx,92(%esp)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ movl %eax,96(%esp)
+ movl %ebx,100(%esp)
+ movl %ecx,104(%esp)
+ movl %edx,108(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ subl $1,%eax
+ movl %eax,112(%esp)
+ movl %ebx,116(%esp)
+ movl %ecx,120(%esp)
+ movl %edx,124(%esp)
+ jmp .L002entry
+.align 16
+.L003outer_loop:
+ movl %ebx,156(%esp)
+ movl %eax,152(%esp)
+ movl %ecx,160(%esp)
+.L002entry:
+ movl $1634760805,%eax
+ movl $857760878,4(%esp)
+ movl $2036477234,8(%esp)
+ movl $1797285236,12(%esp)
+ movl 84(%esp),%ebx
+ movl 88(%esp),%ebp
+ movl 104(%esp),%ecx
+ movl 108(%esp),%esi
+ movl 116(%esp),%edx
+ movl 120(%esp),%edi
+ movl %ebx,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ecx,40(%esp)
+ movl %esi,44(%esp)
+ movl %edx,52(%esp)
+ movl %edi,56(%esp)
+ movl 92(%esp),%ebx
+ movl 124(%esp),%edi
+ movl 112(%esp),%edx
+ movl 80(%esp),%ebp
+ movl 96(%esp),%ecx
+ movl 100(%esp),%esi
+ addl $1,%edx
+ movl %ebx,28(%esp)
+ movl %edi,60(%esp)
+ movl %edx,112(%esp)
+ movl $10,%ebx
+ jmp .L004loop
+.align 16
+.L004loop:
+ addl %ebp,%eax
+ movl %ebx,128(%esp)
+ movl %ebp,%ebx
+ xorl %eax,%edx
+ roll $16,%edx
+ addl %edx,%ecx
+ xorl %ecx,%ebx
+ movl 52(%esp),%edi
+ roll $12,%ebx
+ movl 20(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,(%esp)
+ roll $8,%edx
+ movl 4(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,48(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ movl %ecx,32(%esp)
+ roll $16,%edi
+ movl %ebx,16(%esp)
+ addl %edi,%esi
+ movl 40(%esp),%ecx
+ xorl %esi,%ebp
+ movl 56(%esp),%edx
+ roll $12,%ebp
+ movl 24(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,4(%esp)
+ roll $8,%edi
+ movl 8(%esp),%eax
+ addl %edi,%esi
+ movl %edi,52(%esp)
+ xorl %esi,%ebp
+ addl %ebx,%eax
+ roll $7,%ebp
+ xorl %eax,%edx
+ movl %esi,36(%esp)
+ roll $16,%edx
+ movl %ebp,20(%esp)
+ addl %edx,%ecx
+ movl 44(%esp),%esi
+ xorl %ecx,%ebx
+ movl 60(%esp),%edi
+ roll $12,%ebx
+ movl 28(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,8(%esp)
+ roll $8,%edx
+ movl 12(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,56(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ roll $16,%edi
+ movl %ebx,24(%esp)
+ addl %edi,%esi
+ xorl %esi,%ebp
+ roll $12,%ebp
+ movl 20(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,12(%esp)
+ roll $8,%edi
+ movl (%esp),%eax
+ addl %edi,%esi
+ movl %edi,%edx
+ xorl %esi,%ebp
+ addl %ebx,%eax
+ roll $7,%ebp
+ xorl %eax,%edx
+ roll $16,%edx
+ movl %ebp,28(%esp)
+ addl %edx,%ecx
+ xorl %ecx,%ebx
+ movl 48(%esp),%edi
+ roll $12,%ebx
+ movl 24(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,(%esp)
+ roll $8,%edx
+ movl 4(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,60(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ movl %ecx,40(%esp)
+ roll $16,%edi
+ movl %ebx,20(%esp)
+ addl %edi,%esi
+ movl 32(%esp),%ecx
+ xorl %esi,%ebp
+ movl 52(%esp),%edx
+ roll $12,%ebp
+ movl 28(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,4(%esp)
+ roll $8,%edi
+ movl 8(%esp),%eax
+ addl %edi,%esi
+ movl %edi,48(%esp)
+ xorl %esi,%ebp
+ addl %ebx,%eax
+ roll $7,%ebp
+ xorl %eax,%edx
+ movl %esi,44(%esp)
+ roll $16,%edx
+ movl %ebp,24(%esp)
+ addl %edx,%ecx
+ movl 36(%esp),%esi
+ xorl %ecx,%ebx
+ movl 56(%esp),%edi
+ roll $12,%ebx
+ movl 16(%esp),%ebp
+ addl %ebx,%eax
+ xorl %eax,%edx
+ movl %eax,8(%esp)
+ roll $8,%edx
+ movl 12(%esp),%eax
+ addl %edx,%ecx
+ movl %edx,52(%esp)
+ xorl %ecx,%ebx
+ addl %ebp,%eax
+ roll $7,%ebx
+ xorl %eax,%edi
+ roll $16,%edi
+ movl %ebx,28(%esp)
+ addl %edi,%esi
+ xorl %esi,%ebp
+ movl 48(%esp),%edx
+ roll $12,%ebp
+ movl 128(%esp),%ebx
+ addl %ebp,%eax
+ xorl %eax,%edi
+ movl %eax,12(%esp)
+ roll $8,%edi
+ movl (%esp),%eax
+ addl %edi,%esi
+ movl %edi,56(%esp)
+ xorl %esi,%ebp
+ roll $7,%ebp
+ decl %ebx
+ jnz .L004loop
+ movl 160(%esp),%ebx
+ addl $1634760805,%eax
+ addl 80(%esp),%ebp
+ addl 96(%esp),%ecx
+ addl 100(%esp),%esi
+ cmpl $64,%ebx
+ jb .L005tail
+ movl 156(%esp),%ebx
+ addl 112(%esp),%edx
+ addl 120(%esp),%edi
+ xorl (%ebx),%eax
+ xorl 16(%ebx),%ebp
+ movl %eax,(%esp)
+ movl 152(%esp),%eax
+ xorl 32(%ebx),%ecx
+ xorl 36(%ebx),%esi
+ xorl 48(%ebx),%edx
+ xorl 56(%ebx),%edi
+ movl %ebp,16(%eax)
+ movl %ecx,32(%eax)
+ movl %esi,36(%eax)
+ movl %edx,48(%eax)
+ movl %edi,56(%eax)
+ movl 4(%esp),%ebp
+ movl 8(%esp),%ecx
+ movl 12(%esp),%esi
+ movl 20(%esp),%edx
+ movl 24(%esp),%edi
+ addl $857760878,%ebp
+ addl $2036477234,%ecx
+ addl $1797285236,%esi
+ addl 84(%esp),%edx
+ addl 88(%esp),%edi
+ xorl 4(%ebx),%ebp
+ xorl 8(%ebx),%ecx
+ xorl 12(%ebx),%esi
+ xorl 20(%ebx),%edx
+ xorl 24(%ebx),%edi
+ movl %ebp,4(%eax)
+ movl %ecx,8(%eax)
+ movl %esi,12(%eax)
+ movl %edx,20(%eax)
+ movl %edi,24(%eax)
+ movl 28(%esp),%ebp
+ movl 40(%esp),%ecx
+ movl 44(%esp),%esi
+ movl 52(%esp),%edx
+ movl 60(%esp),%edi
+ addl 92(%esp),%ebp
+ addl 104(%esp),%ecx
+ addl 108(%esp),%esi
+ addl 116(%esp),%edx
+ addl 124(%esp),%edi
+ xorl 28(%ebx),%ebp
+ xorl 40(%ebx),%ecx
+ xorl 44(%ebx),%esi
+ xorl 52(%ebx),%edx
+ xorl 60(%ebx),%edi
+ leal 64(%ebx),%ebx
+ movl %ebp,28(%eax)
+ movl (%esp),%ebp
+ movl %ecx,40(%eax)
+ movl 160(%esp),%ecx
+ movl %esi,44(%eax)
+ movl %edx,52(%eax)
+ movl %edi,60(%eax)
+ movl %ebp,(%eax)
+ leal 64(%eax),%eax
+ subl $64,%ecx
+ jnz .L003outer_loop
+ jmp .L006done
+.L005tail:
+ addl 112(%esp),%edx
+ addl 120(%esp),%edi
+ movl %eax,(%esp)
+ movl %ebp,16(%esp)
+ movl %ecx,32(%esp)
+ movl %esi,36(%esp)
+ movl %edx,48(%esp)
+ movl %edi,56(%esp)
+ movl 4(%esp),%ebp
+ movl 8(%esp),%ecx
+ movl 12(%esp),%esi
+ movl 20(%esp),%edx
+ movl 24(%esp),%edi
+ addl $857760878,%ebp
+ addl $2036477234,%ecx
+ addl $1797285236,%esi
+ addl 84(%esp),%edx
+ addl 88(%esp),%edi
+ movl %ebp,4(%esp)
+ movl %ecx,8(%esp)
+ movl %esi,12(%esp)
+ movl %edx,20(%esp)
+ movl %edi,24(%esp)
+ movl 28(%esp),%ebp
+ movl 40(%esp),%ecx
+ movl 44(%esp),%esi
+ movl 52(%esp),%edx
+ movl 60(%esp),%edi
+ addl 92(%esp),%ebp
+ addl 104(%esp),%ecx
+ addl 108(%esp),%esi
+ addl 116(%esp),%edx
+ addl 124(%esp),%edi
+ movl %ebp,28(%esp)
+ movl 156(%esp),%ebp
+ movl %ecx,40(%esp)
+ movl 152(%esp),%ecx
+ movl %esi,44(%esp)
+ xorl %esi,%esi
+ movl %edx,52(%esp)
+ movl %edi,60(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+.L007tail_loop:
+ movb (%esi,%ebp,1),%al
+ movb (%esp,%esi,1),%dl
+ leal 1(%esi),%esi
+ xorb %dl,%al
+ movb %al,-1(%ecx,%esi,1)
+ decl %ebx
+ jnz .L007tail_loop
+.L006done:
+ addl $132,%esp
+.L000no_data:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin
+.globl ChaCha20_ssse3
+.hidden ChaCha20_ssse3
+.type ChaCha20_ssse3,@function
+.align 16
+ChaCha20_ssse3:
+.L_ChaCha20_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+.Lssse3_shortcut:
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%ecx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl %esp,%ebp
+ subl $524,%esp
+ andl $-64,%esp
+ movl %ebp,512(%esp)
+ leal .Lssse3_data-.Lpic_point(%eax),%eax
+ movdqu (%ebx),%xmm3
+ cmpl $256,%ecx
+ jb .L0081x
+ movl %edx,516(%esp)
+ movl %ebx,520(%esp)
+ subl $256,%ecx
+ leal 384(%esp),%ebp
+ movdqu (%edx),%xmm7
+ pshufd $0,%xmm3,%xmm0
+ pshufd $85,%xmm3,%xmm1
+ pshufd $170,%xmm3,%xmm2
+ pshufd $255,%xmm3,%xmm3
+ paddd 48(%eax),%xmm0
+ pshufd $0,%xmm7,%xmm4
+ pshufd $85,%xmm7,%xmm5
+ psubd 64(%eax),%xmm0
+ pshufd $170,%xmm7,%xmm6
+ pshufd $255,%xmm7,%xmm7
+ movdqa %xmm0,64(%ebp)
+ movdqa %xmm1,80(%ebp)
+ movdqa %xmm2,96(%ebp)
+ movdqa %xmm3,112(%ebp)
+ movdqu 16(%edx),%xmm3
+ movdqa %xmm4,-64(%ebp)
+ movdqa %xmm5,-48(%ebp)
+ movdqa %xmm6,-32(%ebp)
+ movdqa %xmm7,-16(%ebp)
+ movdqa 32(%eax),%xmm7
+ leal 128(%esp),%ebx
+ pshufd $0,%xmm3,%xmm0
+ pshufd $85,%xmm3,%xmm1
+ pshufd $170,%xmm3,%xmm2
+ pshufd $255,%xmm3,%xmm3
+ pshufd $0,%xmm7,%xmm4
+ pshufd $85,%xmm7,%xmm5
+ pshufd $170,%xmm7,%xmm6
+ pshufd $255,%xmm7,%xmm7
+ movdqa %xmm0,(%ebp)
+ movdqa %xmm1,16(%ebp)
+ movdqa %xmm2,32(%ebp)
+ movdqa %xmm3,48(%ebp)
+ movdqa %xmm4,-128(%ebp)
+ movdqa %xmm5,-112(%ebp)
+ movdqa %xmm6,-96(%ebp)
+ movdqa %xmm7,-80(%ebp)
+ leal 128(%esi),%esi
+ leal 128(%edi),%edi
+ jmp .L009outer_loop
+.align 16
+.L009outer_loop:
+ movdqa -112(%ebp),%xmm1
+ movdqa -96(%ebp),%xmm2
+ movdqa -80(%ebp),%xmm3
+ movdqa -48(%ebp),%xmm5
+ movdqa -32(%ebp),%xmm6
+ movdqa -16(%ebp),%xmm7
+ movdqa %xmm1,-112(%ebx)
+ movdqa %xmm2,-96(%ebx)
+ movdqa %xmm3,-80(%ebx)
+ movdqa %xmm5,-48(%ebx)
+ movdqa %xmm6,-32(%ebx)
+ movdqa %xmm7,-16(%ebx)
+ movdqa 32(%ebp),%xmm2
+ movdqa 48(%ebp),%xmm3
+ movdqa 64(%ebp),%xmm4
+ movdqa 80(%ebp),%xmm5
+ movdqa 96(%ebp),%xmm6
+ movdqa 112(%ebp),%xmm7
+ paddd 64(%eax),%xmm4
+ movdqa %xmm2,32(%ebx)
+ movdqa %xmm3,48(%ebx)
+ movdqa %xmm4,64(%ebx)
+ movdqa %xmm5,80(%ebx)
+ movdqa %xmm6,96(%ebx)
+ movdqa %xmm7,112(%ebx)
+ movdqa %xmm4,64(%ebp)
+ movdqa -128(%ebp),%xmm0
+ movdqa %xmm4,%xmm6
+ movdqa -64(%ebp),%xmm3
+ movdqa (%ebp),%xmm4
+ movdqa 16(%ebp),%xmm5
+ movl $10,%edx
+ nop
+.align 16
+.L010loop:
+ paddd %xmm3,%xmm0
+ movdqa %xmm3,%xmm2
+ pxor %xmm0,%xmm6
+ pshufb (%eax),%xmm6
+ paddd %xmm6,%xmm4
+ pxor %xmm4,%xmm2
+ movdqa -48(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -112(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 80(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-128(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,64(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ movdqa %xmm4,(%ebx)
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-64(%ebx)
+ paddd %xmm7,%xmm5
+ movdqa 32(%ebx),%xmm4
+ pxor %xmm5,%xmm3
+ movdqa -32(%ebx),%xmm2
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -96(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ movdqa 96(%ebx),%xmm6
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-112(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,80(%ebx)
+ pxor %xmm5,%xmm3
+ paddd %xmm2,%xmm0
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ pxor %xmm0,%xmm6
+ por %xmm1,%xmm3
+ movdqa %xmm5,16(%ebx)
+ pshufb (%eax),%xmm6
+ movdqa %xmm3,-48(%ebx)
+ paddd %xmm6,%xmm4
+ movdqa 48(%ebx),%xmm5
+ pxor %xmm4,%xmm2
+ movdqa -16(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -80(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 112(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-96(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,96(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-32(%ebx)
+ paddd %xmm7,%xmm5
+ pxor %xmm5,%xmm3
+ movdqa -48(%ebx),%xmm2
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -128(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-80(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,%xmm6
+ pxor %xmm5,%xmm3
+ paddd %xmm2,%xmm0
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ pxor %xmm0,%xmm6
+ por %xmm1,%xmm3
+ pshufb (%eax),%xmm6
+ movdqa %xmm3,-16(%ebx)
+ paddd %xmm6,%xmm4
+ pxor %xmm4,%xmm2
+ movdqa -32(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -112(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 64(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-128(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,112(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ movdqa %xmm4,32(%ebx)
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-48(%ebx)
+ paddd %xmm7,%xmm5
+ movdqa (%ebx),%xmm4
+ pxor %xmm5,%xmm3
+ movdqa -16(%ebx),%xmm2
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -96(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ movdqa 80(%ebx),%xmm6
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-112(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,64(%ebx)
+ pxor %xmm5,%xmm3
+ paddd %xmm2,%xmm0
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ pxor %xmm0,%xmm6
+ por %xmm1,%xmm3
+ movdqa %xmm5,48(%ebx)
+ pshufb (%eax),%xmm6
+ movdqa %xmm3,-32(%ebx)
+ paddd %xmm6,%xmm4
+ movdqa 16(%ebx),%xmm5
+ pxor %xmm4,%xmm2
+ movdqa -64(%ebx),%xmm3
+ movdqa %xmm2,%xmm1
+ pslld $12,%xmm2
+ psrld $20,%xmm1
+ por %xmm1,%xmm2
+ movdqa -80(%ebx),%xmm1
+ paddd %xmm2,%xmm0
+ movdqa 96(%ebx),%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm0,-96(%ebx)
+ pshufb 16(%eax),%xmm6
+ paddd %xmm6,%xmm4
+ movdqa %xmm6,80(%ebx)
+ pxor %xmm4,%xmm2
+ paddd %xmm3,%xmm1
+ movdqa %xmm2,%xmm0
+ pslld $7,%xmm2
+ psrld $25,%xmm0
+ pxor %xmm1,%xmm7
+ por %xmm0,%xmm2
+ pshufb (%eax),%xmm7
+ movdqa %xmm2,-16(%ebx)
+ paddd %xmm7,%xmm5
+ pxor %xmm5,%xmm3
+ movdqa %xmm3,%xmm0
+ pslld $12,%xmm3
+ psrld $20,%xmm0
+ por %xmm0,%xmm3
+ movdqa -128(%ebx),%xmm0
+ paddd %xmm3,%xmm1
+ movdqa 64(%ebx),%xmm6
+ pxor %xmm1,%xmm7
+ movdqa %xmm1,-80(%ebx)
+ pshufb 16(%eax),%xmm7
+ paddd %xmm7,%xmm5
+ movdqa %xmm7,96(%ebx)
+ pxor %xmm5,%xmm3
+ movdqa %xmm3,%xmm1
+ pslld $7,%xmm3
+ psrld $25,%xmm1
+ por %xmm1,%xmm3
+ decl %edx
+ jnz .L010loop
+ movdqa %xmm3,-64(%ebx)
+ movdqa %xmm4,(%ebx)
+ movdqa %xmm5,16(%ebx)
+ movdqa %xmm6,64(%ebx)
+ movdqa %xmm7,96(%ebx)
+ movdqa -112(%ebx),%xmm1
+ movdqa -96(%ebx),%xmm2
+ movdqa -80(%ebx),%xmm3
+ paddd -128(%ebp),%xmm0
+ paddd -112(%ebp),%xmm1
+ paddd -96(%ebp),%xmm2
+ paddd -80(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 16(%esi),%esi
+ pxor %xmm0,%xmm4
+ movdqa -64(%ebx),%xmm0
+ pxor %xmm1,%xmm5
+ movdqa -48(%ebx),%xmm1
+ pxor %xmm2,%xmm6
+ movdqa -32(%ebx),%xmm2
+ pxor %xmm3,%xmm7
+ movdqa -16(%ebx),%xmm3
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ paddd -64(%ebp),%xmm0
+ paddd -48(%ebp),%xmm1
+ paddd -32(%ebp),%xmm2
+ paddd -16(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 16(%esi),%esi
+ pxor %xmm0,%xmm4
+ movdqa (%ebx),%xmm0
+ pxor %xmm1,%xmm5
+ movdqa 16(%ebx),%xmm1
+ pxor %xmm2,%xmm6
+ movdqa 32(%ebx),%xmm2
+ pxor %xmm3,%xmm7
+ movdqa 48(%ebx),%xmm3
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ paddd (%ebp),%xmm0
+ paddd 16(%ebp),%xmm1
+ paddd 32(%ebp),%xmm2
+ paddd 48(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 16(%esi),%esi
+ pxor %xmm0,%xmm4
+ movdqa 64(%ebx),%xmm0
+ pxor %xmm1,%xmm5
+ movdqa 80(%ebx),%xmm1
+ pxor %xmm2,%xmm6
+ movdqa 96(%ebx),%xmm2
+ pxor %xmm3,%xmm7
+ movdqa 112(%ebx),%xmm3
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 16(%edi),%edi
+ paddd 64(%ebp),%xmm0
+ paddd 80(%ebp),%xmm1
+ paddd 96(%ebp),%xmm2
+ paddd 112(%ebp),%xmm3
+ movdqa %xmm0,%xmm6
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm6
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm6,%xmm3
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ movdqu -128(%esi),%xmm4
+ movdqu -64(%esi),%xmm5
+ movdqu (%esi),%xmm2
+ movdqu 64(%esi),%xmm7
+ leal 208(%esi),%esi
+ pxor %xmm0,%xmm4
+ pxor %xmm1,%xmm5
+ pxor %xmm2,%xmm6
+ pxor %xmm3,%xmm7
+ movdqu %xmm4,-128(%edi)
+ movdqu %xmm5,-64(%edi)
+ movdqu %xmm6,(%edi)
+ movdqu %xmm7,64(%edi)
+ leal 208(%edi),%edi
+ subl $256,%ecx
+ jnc .L009outer_loop
+ addl $256,%ecx
+ jz .L011done
+ movl 520(%esp),%ebx
+ leal -128(%esi),%esi
+ movl 516(%esp),%edx
+ leal -128(%edi),%edi
+ movd 64(%ebp),%xmm2
+ movdqu (%ebx),%xmm3
+ paddd 96(%eax),%xmm2
+ pand 112(%eax),%xmm3
+ por %xmm2,%xmm3
+.L0081x:
+ movdqa 32(%eax),%xmm0
+ movdqu (%edx),%xmm1
+ movdqu 16(%edx),%xmm2
+ movdqa (%eax),%xmm6
+ movdqa 16(%eax),%xmm7
+ movl %ebp,48(%esp)
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ movl $10,%edx
+ jmp .L012loop1x
+.align 16
+.L013outer1x:
+ movdqa 80(%eax),%xmm3
+ movdqa (%esp),%xmm0
+ movdqa 16(%esp),%xmm1
+ movdqa 32(%esp),%xmm2
+ paddd 48(%esp),%xmm3
+ movl $10,%edx
+ movdqa %xmm3,48(%esp)
+ jmp .L012loop1x
+.align 16
+.L012loop1x:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decl %edx
+ jnz .L012loop1x
+ paddd (%esp),%xmm0
+ paddd 16(%esp),%xmm1
+ paddd 32(%esp),%xmm2
+ paddd 48(%esp),%xmm3
+ cmpl $64,%ecx
+ jb .L014tail
+ movdqu (%esi),%xmm4
+ movdqu 16(%esi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%esi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%esi),%xmm5
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+ leal 64(%esi),%esi
+ movdqu %xmm0,(%edi)
+ movdqu %xmm1,16(%edi)
+ movdqu %xmm2,32(%edi)
+ movdqu %xmm3,48(%edi)
+ leal 64(%edi),%edi
+ subl $64,%ecx
+ jnz .L013outer1x
+ jmp .L011done
+.L014tail:
+ movdqa %xmm0,(%esp)
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm2,32(%esp)
+ movdqa %xmm3,48(%esp)
+ xorl %eax,%eax
+ xorl %edx,%edx
+ xorl %ebp,%ebp
+.L015tail_loop:
+ movb (%esp,%ebp,1),%al
+ movb (%esi,%ebp,1),%dl
+ leal 1(%ebp),%ebp
+ xorb %dl,%al
+ movb %al,-1(%edi,%ebp,1)
+ decl %ecx
+ jnz .L015tail_loop
+.L011done:
+ movl 512(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin
+.align 64
+.Lssse3_data:
+.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+.long 1634760805,857760878,2036477234,1797285236
+.long 0,1,2,3
+.long 4,4,4,4
+.long 1,0,0,0
+.long 4,0,0,0
+.long 0,-1,-1,-1
+.align 64
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
+.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
+.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
+.byte 114,103,62,0
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S
@@ -1,0 +1,2513 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+#ifdef BORINGSSL_DISPATCH_TEST
+#endif
+.globl aes_hw_encrypt
+.hidden aes_hw_encrypt
+.type aes_hw_encrypt,@function
+.align 16
+aes_hw_encrypt:
+.L_aes_hw_encrypt_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call .L000pic
+.L000pic:
+ popl %ebx
+ leal BORINGSSL_function_hit+1-.L000pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L001enc1_loop_1:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L001enc1_loop_1
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
+ ret
+.size aes_hw_encrypt,.-.L_aes_hw_encrypt_begin
+.globl aes_hw_decrypt
+.hidden aes_hw_decrypt
+.type aes_hw_decrypt,@function
+.align 16
+aes_hw_decrypt:
+.L_aes_hw_decrypt_begin:
+ movl 4(%esp),%eax
+ movl 12(%esp),%edx
+ movups (%eax),%xmm2
+ movl 240(%edx),%ecx
+ movl 8(%esp),%eax
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L002dec1_loop_2:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L002dec1_loop_2
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%eax)
+ pxor %xmm2,%xmm2
+ ret
+.size aes_hw_decrypt,.-.L_aes_hw_decrypt_begin
+.hidden _aesni_encrypt2
+.type _aesni_encrypt2,@function
+.align 16
+_aesni_encrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L003enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L003enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ ret
+.size _aesni_encrypt2,.-_aesni_encrypt2
+.hidden _aesni_decrypt2
+.type _aesni_decrypt2,@function
+.align 16
+_aesni_decrypt2:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L004dec2_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L004dec2_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ ret
+.size _aesni_decrypt2,.-_aesni_decrypt2
+.hidden _aesni_encrypt3
+.type _aesni_encrypt3,@function
+.align 16
+_aesni_encrypt3:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L005enc3_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L005enc3_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ ret
+.size _aesni_encrypt3,.-_aesni_encrypt3
+.hidden _aesni_decrypt3
+.type _aesni_decrypt3,@function
+.align 16
+_aesni_decrypt3:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+ addl $16,%ecx
+.L006dec3_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L006dec3_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ ret
+.size _aesni_decrypt3,.-_aesni_decrypt3
+.hidden _aesni_encrypt4
+.type _aesni_encrypt4,@function
+.align 16
+_aesni_encrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shll $4,%ecx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L007enc4_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L007enc4_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ ret
+.size _aesni_encrypt4,.-_aesni_encrypt4
+.hidden _aesni_decrypt4
+.type _aesni_decrypt4,@function
+.align 16
+_aesni_decrypt4:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ shll $4,%ecx
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ movups 32(%edx),%xmm0
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 15,31,64,0
+ addl $16,%ecx
+.L008dec4_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L008dec4_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ ret
+.size _aesni_decrypt4,.-_aesni_decrypt4
+.hidden _aesni_encrypt6
+.type _aesni_encrypt6,@function
+.align 16
+_aesni_encrypt6:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp .L009_aesni_encrypt6_inner
+.align 16
+.L010enc6_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.L009_aesni_encrypt6_inner:
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.L_aesni_encrypt6_enter:
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L010enc6_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ ret
+.size _aesni_encrypt6,.-_aesni_encrypt6
+.hidden _aesni_decrypt6
+.type _aesni_decrypt6,@function
+.align 16
+_aesni_decrypt6:
+ movups (%edx),%xmm0
+ shll $4,%ecx
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,217
+ leal 32(%edx,%ecx,1),%edx
+ negl %ecx
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm7
+ movups (%edx,%ecx,1),%xmm0
+ addl $16,%ecx
+ jmp .L011_aesni_decrypt6_inner
+.align 16
+.L012dec6_loop:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.L011_aesni_decrypt6_inner:
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.L_aesni_decrypt6_enter:
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L012dec6_loop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ ret
+.size _aesni_decrypt6,.-_aesni_decrypt6
+.globl aes_hw_ecb_encrypt
+.hidden aes_hw_ecb_encrypt
+.type aes_hw_ecb_encrypt,@function
+.align 16
+aes_hw_ecb_encrypt:
+.L_aes_hw_ecb_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ andl $-16,%eax
+ jz .L013ecb_ret
+ movl 240(%edx),%ecx
+ testl %ebx,%ebx
+ jz .L014ecb_decrypt
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb .L015ecb_enc_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp .L016ecb_enc_loop6_enter
+.align 16
+.L017ecb_enc_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+.L016ecb_enc_loop6_enter:
+ call _aesni_encrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc .L017ecb_enc_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz .L013ecb_ret
+.L015ecb_enc_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb .L018ecb_enc_one
+ movups 16(%esi),%xmm3
+ je .L019ecb_enc_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb .L020ecb_enc_three
+ movups 48(%esi),%xmm5
+ je .L021ecb_enc_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L018ecb_enc_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L022enc1_loop_3:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L022enc1_loop_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L019ecb_enc_two:
+ call _aesni_encrypt2
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L020ecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L021ecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L014ecb_decrypt:
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ cmpl $96,%eax
+ jb .L023ecb_dec_tail
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+ subl $96,%eax
+ jmp .L024ecb_dec_loop6_enter
+.align 16
+.L025ecb_dec_loop6:
+ movups %xmm2,(%edi)
+ movdqu (%esi),%xmm2
+ movups %xmm3,16(%edi)
+ movdqu 16(%esi),%xmm3
+ movups %xmm4,32(%edi)
+ movdqu 32(%esi),%xmm4
+ movups %xmm5,48(%edi)
+ movdqu 48(%esi),%xmm5
+ movups %xmm6,64(%edi)
+ movdqu 64(%esi),%xmm6
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqu 80(%esi),%xmm7
+ leal 96(%esi),%esi
+.L024ecb_dec_loop6_enter:
+ call _aesni_decrypt6
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ subl $96,%eax
+ jnc .L025ecb_dec_loop6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ addl $96,%eax
+ jz .L013ecb_ret
+.L023ecb_dec_tail:
+ movups (%esi),%xmm2
+ cmpl $32,%eax
+ jb .L026ecb_dec_one
+ movups 16(%esi),%xmm3
+ je .L027ecb_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $64,%eax
+ jb .L028ecb_dec_three
+ movups 48(%esi),%xmm5
+ je .L029ecb_dec_four
+ movups 64(%esi),%xmm6
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L026ecb_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L030dec1_loop_4:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L030dec1_loop_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L027ecb_dec_two:
+ call _aesni_decrypt2
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L028ecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L013ecb_ret
+.align 16
+.L029ecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+.L013ecb_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_ecb_encrypt,.-.L_aes_hw_ecb_encrypt_begin
+.globl aes_hw_ccm64_encrypt_blocks
+.hidden aes_hw_ccm64_encrypt_blocks
+.type aes_hw_ccm64_encrypt_blocks,@function
+.align 16
+aes_hw_ccm64_encrypt_blocks:
+.L_aes_hw_ccm64_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ shll $4,%ecx
+ movl $16,%ebx
+ leal (%edx),%ebp
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ leal 32(%edx,%ecx,1),%edx
+ subl %ecx,%ebx
+.byte 102,15,56,0,253
+.L031ccm64_enc_outer:
+ movups (%ebp),%xmm0
+ movl %ebx,%ecx
+ movups (%esi),%xmm6
+ xorps %xmm0,%xmm2
+ movups 16(%ebp),%xmm1
+ xorps %xmm6,%xmm0
+ xorps %xmm0,%xmm3
+ movups 32(%ebp),%xmm0
+.L032ccm64_enc2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L032ccm64_enc2_loop
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ paddq 16(%esp),%xmm7
+ decl %eax
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ leal 16(%esi),%esi
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movups %xmm6,(%edi)
+.byte 102,15,56,0,213
+ leal 16(%edi),%edi
+ jnz .L031ccm64_enc_outer
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_ccm64_encrypt_blocks,.-.L_aes_hw_ccm64_encrypt_blocks_begin
+.globl aes_hw_ccm64_decrypt_blocks
+.hidden aes_hw_ccm64_decrypt_blocks
+.type aes_hw_ccm64_decrypt_blocks,@function
+.align 16
+aes_hw_ccm64_decrypt_blocks:
+.L_aes_hw_ccm64_decrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl 40(%esp),%ecx
+ movl %esp,%ebp
+ subl $60,%esp
+ andl $-16,%esp
+ movl %ebp,48(%esp)
+ movdqu (%ebx),%xmm7
+ movdqu (%ecx),%xmm3
+ movl 240(%edx),%ecx
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $1,%ebx
+ xorl %ebp,%ebp
+ movl %ebx,16(%esp)
+ movl %ebp,20(%esp)
+ movl %ebp,24(%esp)
+ movl %ebp,28(%esp)
+ movdqa (%esp),%xmm5
+ movdqa %xmm7,%xmm2
+ movl %edx,%ebp
+ movl %ecx,%ebx
+.byte 102,15,56,0,253
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L033enc1_loop_5:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L033enc1_loop_5
+.byte 102,15,56,221,209
+ shll $4,%ebx
+ movl $16,%ecx
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+ leal 16(%esi),%esi
+ subl %ebx,%ecx
+ leal 32(%ebp,%ebx,1),%edx
+ movl %ecx,%ebx
+ jmp .L034ccm64_dec_outer
+.align 16
+.L034ccm64_dec_outer:
+ xorps %xmm2,%xmm6
+ movdqa %xmm7,%xmm2
+ movups %xmm6,(%edi)
+ leal 16(%edi),%edi
+.byte 102,15,56,0,213
+ subl $1,%eax
+ jz .L035ccm64_dec_break
+ movups (%ebp),%xmm0
+ movl %ebx,%ecx
+ movups 16(%ebp),%xmm1
+ xorps %xmm0,%xmm6
+ xorps %xmm0,%xmm2
+ xorps %xmm6,%xmm3
+ movups 32(%ebp),%xmm0
+.L036ccm64_dec2_loop:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%edx,%ecx,1),%xmm1
+ addl $32,%ecx
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%edx,%ecx,1),%xmm0
+ jnz .L036ccm64_dec2_loop
+ movups (%esi),%xmm6
+ paddq 16(%esp),%xmm7
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ leal 16(%esi),%esi
+ jmp .L034ccm64_dec_outer
+.align 16
+.L035ccm64_dec_break:
+ movl 240(%ebp),%ecx
+ movl %ebp,%edx
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm6
+ leal 32(%edx),%edx
+ xorps %xmm6,%xmm3
+.L037enc1_loop_6:
+.byte 102,15,56,220,217
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L037enc1_loop_6
+.byte 102,15,56,221,217
+ movl 48(%esp),%esp
+ movl 40(%esp),%edi
+ movups %xmm3,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_ccm64_decrypt_blocks,.-.L_aes_hw_ccm64_decrypt_blocks_begin
+.globl aes_hw_ctr32_encrypt_blocks
+.hidden aes_hw_ctr32_encrypt_blocks
+.type aes_hw_ctr32_encrypt_blocks,@function
+.align 16
+aes_hw_ctr32_encrypt_blocks:
+.L_aes_hw_ctr32_encrypt_blocks_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call .L038pic
+.L038pic:
+ popl %ebx
+ leal BORINGSSL_function_hit+0-.L038pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebx
+ movl %esp,%ebp
+ subl $88,%esp
+ andl $-16,%esp
+ movl %ebp,80(%esp)
+ cmpl $1,%eax
+ je .L039ctr32_one_shortcut
+ movdqu (%ebx),%xmm7
+ movl $202182159,(%esp)
+ movl $134810123,4(%esp)
+ movl $67438087,8(%esp)
+ movl $66051,12(%esp)
+ movl $6,%ecx
+ xorl %ebp,%ebp
+ movl %ecx,16(%esp)
+ movl %ecx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %ebp,28(%esp)
+.byte 102,15,58,22,251,3
+.byte 102,15,58,34,253,3
+ movl 240(%edx),%ecx
+ bswap %ebx
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqa (%esp),%xmm2
+.byte 102,15,58,34,195,0
+ leal 3(%ebx),%ebp
+.byte 102,15,58,34,205,0
+ incl %ebx
+.byte 102,15,58,34,195,1
+ incl %ebp
+.byte 102,15,58,34,205,1
+ incl %ebx
+.byte 102,15,58,34,195,2
+ incl %ebp
+.byte 102,15,58,34,205,2
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
+ movdqu (%edx),%xmm6
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ pshufd $192,%xmm0,%xmm2
+ pshufd $128,%xmm0,%xmm3
+ cmpl $6,%eax
+ jb .L040ctr32_tail
+ pxor %xmm6,%xmm7
+ shll $4,%ecx
+ movl $16,%ebx
+ movdqa %xmm7,32(%esp)
+ movl %edx,%ebp
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ subl $6,%eax
+ jmp .L041ctr32_loop6
+.align 16
+.L041ctr32_loop6:
+ pshufd $64,%xmm0,%xmm4
+ movdqa 32(%esp),%xmm0
+ pshufd $192,%xmm1,%xmm5
+ pxor %xmm0,%xmm2
+ pshufd $128,%xmm1,%xmm6
+ pxor %xmm0,%xmm3
+ pshufd $64,%xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+.byte 102,15,56,220,217
+ movups 32(%ebp),%xmm0
+ movl %ebx,%ecx
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ call .L_aesni_encrypt6_enter
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups %xmm2,(%edi)
+ movdqa 16(%esp),%xmm0
+ xorps %xmm1,%xmm4
+ movdqa 64(%esp),%xmm1
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ paddd %xmm0,%xmm1
+ paddd 48(%esp),%xmm0
+ movdqa (%esp),%xmm2
+ movups 48(%esi),%xmm3
+ movups 64(%esi),%xmm4
+ xorps %xmm3,%xmm5
+ movups 80(%esi),%xmm3
+ leal 96(%esi),%esi
+ movdqa %xmm0,48(%esp)
+.byte 102,15,56,0,194
+ xorps %xmm4,%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm3,%xmm7
+ movdqa %xmm1,64(%esp)
+.byte 102,15,56,0,202
+ movups %xmm6,64(%edi)
+ pshufd $192,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ pshufd $128,%xmm0,%xmm3
+ subl $6,%eax
+ jnc .L041ctr32_loop6
+ addl $6,%eax
+ jz .L042ctr32_ret
+ movdqu (%ebp),%xmm7
+ movl %ebp,%edx
+ pxor 32(%esp),%xmm7
+ movl 240(%ebp),%ecx
+.L040ctr32_tail:
+ por %xmm7,%xmm2
+ cmpl $2,%eax
+ jb .L043ctr32_one
+ pshufd $64,%xmm0,%xmm4
+ por %xmm7,%xmm3
+ je .L044ctr32_two
+ pshufd $192,%xmm1,%xmm5
+ por %xmm7,%xmm4
+ cmpl $4,%eax
+ jb .L045ctr32_three
+ pshufd $128,%xmm1,%xmm6
+ por %xmm7,%xmm5
+ je .L046ctr32_four
+ por %xmm7,%xmm6
+ call _aesni_encrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps %xmm1,%xmm2
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm3
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm4
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm5
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ jmp .L042ctr32_ret
+.align 16
+.L039ctr32_one_shortcut:
+ movups (%ebx),%xmm2
+ movl 240(%edx),%ecx
+.L043ctr32_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L047enc1_loop_7:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L047enc1_loop_7
+.byte 102,15,56,221,209
+ movups (%esi),%xmm6
+ xorps %xmm2,%xmm6
+ movups %xmm6,(%edi)
+ jmp .L042ctr32_ret
+.align 16
+.L044ctr32_two:
+ call _aesni_encrypt2
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ jmp .L042ctr32_ret
+.align 16
+.L045ctr32_three:
+ call _aesni_encrypt3
+ movups (%esi),%xmm5
+ movups 16(%esi),%xmm6
+ xorps %xmm5,%xmm2
+ movups 32(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm7,%xmm4
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ jmp .L042ctr32_ret
+.align 16
+.L046ctr32_four:
+ call _aesni_encrypt4
+ movups (%esi),%xmm6
+ movups 16(%esi),%xmm7
+ movups 32(%esi),%xmm1
+ xorps %xmm6,%xmm2
+ movups 48(%esi),%xmm0
+ xorps %xmm7,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+.L042ctr32_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movl 80(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_ctr32_encrypt_blocks,.-.L_aes_hw_ctr32_encrypt_blocks_begin
+.globl aes_hw_xts_encrypt
+.hidden aes_hw_xts_encrypt
+.type aes_hw_xts_encrypt,@function
+.align 16
+aes_hw_xts_encrypt:
+.L_aes_hw_xts_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L048enc1_loop_8:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L048enc1_loop_8
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ movl 240(%edx),%ecx
+ andl $-16,%esp
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ subl $96,%eax
+ jc .L049xts_enc_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L050xts_enc_loop6
+.align 16
+.L050xts_enc_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movl %ebx,%ecx
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ pxor 16(%esp),%xmm3
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,220,209
+ pxor 48(%esp),%xmm5
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ call .L_aesni_encrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc .L050xts_enc_loop6
+ movl 240(%ebp),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+.L049xts_enc_short:
+ addl $96,%eax
+ jz .L051xts_enc_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb .L052xts_enc_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je .L053xts_enc_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb .L054xts_enc_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je .L055xts_enc_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call _aesni_encrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp .L056xts_enc_done
+.align 16
+.L052xts_enc_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L057enc1_loop_9:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L057enc1_loop_9
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp .L056xts_enc_done
+.align 16
+.L053xts_enc_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ call _aesni_encrypt2
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L056xts_enc_done
+.align 16
+.L054xts_enc_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call _aesni_encrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp .L056xts_enc_done
+.align 16
+.L055xts_enc_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call _aesni_encrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L056xts_enc_done
+.align 16
+.L051xts_enc_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz .L058xts_enc_ret
+ movdqa %xmm1,%xmm5
+ movl %eax,112(%esp)
+ jmp .L059xts_enc_steal
+.align 16
+.L056xts_enc_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz .L058xts_enc_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm5
+ paddq %xmm1,%xmm1
+ pand 96(%esp),%xmm5
+ pxor %xmm1,%xmm5
+.L059xts_enc_steal:
+ movzbl (%esi),%ecx
+ movzbl -16(%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,-16(%edi)
+ movb %dl,(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz .L059xts_enc_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups -16(%edi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L060enc1_loop_10:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L060enc1_loop_10
+.byte 102,15,56,221,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,-16(%edi)
+.L058xts_enc_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_xts_encrypt,.-.L_aes_hw_xts_encrypt_begin
+.globl aes_hw_xts_decrypt
+.hidden aes_hw_xts_decrypt
+.type aes_hw_xts_decrypt,@function
+.align 16
+aes_hw_xts_decrypt:
+.L_aes_hw_xts_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 36(%esp),%edx
+ movl 40(%esp),%esi
+ movl 240(%edx),%ecx
+ movups (%esi),%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L061enc1_loop_11:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L061enc1_loop_11
+.byte 102,15,56,221,209
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ movl %esp,%ebp
+ subl $120,%esp
+ andl $-16,%esp
+ xorl %ebx,%ebx
+ testl $15,%eax
+ setnz %bl
+ shll $4,%ebx
+ subl %ebx,%eax
+ movl $135,96(%esp)
+ movl $0,100(%esp)
+ movl $1,104(%esp)
+ movl $0,108(%esp)
+ movl %eax,112(%esp)
+ movl %ebp,116(%esp)
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ecx,%ebx
+ movdqa %xmm2,%xmm1
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ pcmpgtd %xmm1,%xmm0
+ andl $-16,%eax
+ subl $96,%eax
+ jc .L062xts_dec_short
+ shll $4,%ecx
+ movl $16,%ebx
+ subl %ecx,%ebx
+ leal 32(%edx,%ecx,1),%edx
+ jmp .L063xts_dec_loop6
+.align 16
+.L063xts_dec_loop6:
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,16(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,32(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,64(%esp)
+ paddq %xmm1,%xmm1
+ movups (%ebp),%xmm0
+ pand %xmm3,%xmm7
+ movups (%esi),%xmm2
+ pxor %xmm1,%xmm7
+ movl %ebx,%ecx
+ movdqu 16(%esi),%xmm3
+ xorps %xmm0,%xmm2
+ movdqu 32(%esi),%xmm4
+ pxor %xmm0,%xmm3
+ movdqu 48(%esi),%xmm5
+ pxor %xmm0,%xmm4
+ movdqu 64(%esi),%xmm6
+ pxor %xmm0,%xmm5
+ movdqu 80(%esi),%xmm1
+ pxor %xmm0,%xmm6
+ leal 96(%esi),%esi
+ pxor (%esp),%xmm2
+ movdqa %xmm7,80(%esp)
+ pxor %xmm1,%xmm7
+ movups 16(%ebp),%xmm1
+ pxor 16(%esp),%xmm3
+ pxor 32(%esp),%xmm4
+.byte 102,15,56,222,209
+ pxor 48(%esp),%xmm5
+ pxor 64(%esp),%xmm6
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm7
+ movups 32(%ebp),%xmm0
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ call .L_aesni_decrypt6_enter
+ movdqa 80(%esp),%xmm1
+ pxor %xmm0,%xmm0
+ xorps (%esp),%xmm2
+ pcmpgtd %xmm1,%xmm0
+ xorps 16(%esp),%xmm3
+ movups %xmm2,(%edi)
+ xorps 32(%esp),%xmm4
+ movups %xmm3,16(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm4,32(%edi)
+ xorps 64(%esp),%xmm6
+ movups %xmm5,48(%edi)
+ xorps %xmm1,%xmm7
+ movups %xmm6,64(%edi)
+ pshufd $19,%xmm0,%xmm2
+ movups %xmm7,80(%edi)
+ leal 96(%edi),%edi
+ movdqa 96(%esp),%xmm3
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ subl $96,%eax
+ jnc .L063xts_dec_loop6
+ movl 240(%ebp),%ecx
+ movl %ebp,%edx
+ movl %ecx,%ebx
+.L062xts_dec_short:
+ addl $96,%eax
+ jz .L064xts_dec_done6x
+ movdqa %xmm1,%xmm5
+ cmpl $32,%eax
+ jb .L065xts_dec_one
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ je .L066xts_dec_two
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ cmpl $64,%eax
+ jb .L067xts_dec_three
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa %xmm1,%xmm7
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,(%esp)
+ movdqa %xmm6,16(%esp)
+ je .L068xts_dec_four
+ movdqa %xmm7,32(%esp)
+ pshufd $19,%xmm0,%xmm7
+ movdqa %xmm1,48(%esp)
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm7
+ pxor %xmm1,%xmm7
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ pxor (%esp),%xmm2
+ movdqu 48(%esi),%xmm5
+ pxor 16(%esp),%xmm3
+ movdqu 64(%esi),%xmm6
+ pxor 32(%esp),%xmm4
+ leal 80(%esi),%esi
+ pxor 48(%esp),%xmm5
+ movdqa %xmm7,64(%esp)
+ pxor %xmm7,%xmm6
+ call _aesni_decrypt6
+ movaps 64(%esp),%xmm1
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps 32(%esp),%xmm4
+ movups %xmm2,(%edi)
+ xorps 48(%esp),%xmm5
+ movups %xmm3,16(%edi)
+ xorps %xmm1,%xmm6
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ jmp .L069xts_dec_done
+.align 16
+.L065xts_dec_one:
+ movups (%esi),%xmm2
+ leal 16(%esi),%esi
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L070dec1_loop_12:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L070dec1_loop_12
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ movdqa %xmm5,%xmm1
+ jmp .L069xts_dec_done
+.align 16
+.L066xts_dec_two:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ leal 32(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ call _aesni_decrypt2
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 32(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L069xts_dec_done
+.align 16
+.L067xts_dec_three:
+ movaps %xmm1,%xmm7
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ leal 48(%esi),%esi
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ call _aesni_decrypt3
+ xorps %xmm5,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ leal 48(%edi),%edi
+ movdqa %xmm7,%xmm1
+ jmp .L069xts_dec_done
+.align 16
+.L068xts_dec_four:
+ movaps %xmm1,%xmm6
+ movups (%esi),%xmm2
+ movups 16(%esi),%xmm3
+ movups 32(%esi),%xmm4
+ xorps (%esp),%xmm2
+ movups 48(%esi),%xmm5
+ leal 64(%esi),%esi
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ xorps %xmm6,%xmm5
+ call _aesni_decrypt4
+ xorps (%esp),%xmm2
+ xorps 16(%esp),%xmm3
+ xorps %xmm7,%xmm4
+ movups %xmm2,(%edi)
+ xorps %xmm6,%xmm5
+ movups %xmm3,16(%edi)
+ movups %xmm4,32(%edi)
+ movups %xmm5,48(%edi)
+ leal 64(%edi),%edi
+ movdqa %xmm6,%xmm1
+ jmp .L069xts_dec_done
+.align 16
+.L064xts_dec_done6x:
+ movl 112(%esp),%eax
+ andl $15,%eax
+ jz .L071xts_dec_ret
+ movl %eax,112(%esp)
+ jmp .L072xts_dec_only_one_more
+.align 16
+.L069xts_dec_done:
+ movl 112(%esp),%eax
+ pxor %xmm0,%xmm0
+ andl $15,%eax
+ jz .L071xts_dec_ret
+ pcmpgtd %xmm1,%xmm0
+ movl %eax,112(%esp)
+ pshufd $19,%xmm0,%xmm2
+ pxor %xmm0,%xmm0
+ movdqa 96(%esp),%xmm3
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm2
+ pcmpgtd %xmm1,%xmm0
+ pxor %xmm2,%xmm1
+.L072xts_dec_only_one_more:
+ pshufd $19,%xmm0,%xmm5
+ movdqa %xmm1,%xmm6
+ paddq %xmm1,%xmm1
+ pand %xmm3,%xmm5
+ pxor %xmm1,%xmm5
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%esi),%xmm2
+ xorps %xmm5,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L073dec1_loop_13:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L073dec1_loop_13
+.byte 102,15,56,223,209
+ xorps %xmm5,%xmm2
+ movups %xmm2,(%edi)
+.L074xts_dec_steal:
+ movzbl 16(%esi),%ecx
+ movzbl (%edi),%edx
+ leal 1(%esi),%esi
+ movb %cl,(%edi)
+ movb %dl,16(%edi)
+ leal 1(%edi),%edi
+ subl $1,%eax
+ jnz .L074xts_dec_steal
+ subl 112(%esp),%edi
+ movl %ebp,%edx
+ movl %ebx,%ecx
+ movups (%edi),%xmm2
+ xorps %xmm6,%xmm2
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L075dec1_loop_14:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L075dec1_loop_14
+.byte 102,15,56,223,209
+ xorps %xmm6,%xmm2
+ movups %xmm2,(%edi)
+.L071xts_dec_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ movdqa %xmm0,(%esp)
+ pxor %xmm3,%xmm3
+ movdqa %xmm0,16(%esp)
+ pxor %xmm4,%xmm4
+ movdqa %xmm0,32(%esp)
+ pxor %xmm5,%xmm5
+ movdqa %xmm0,48(%esp)
+ pxor %xmm6,%xmm6
+ movdqa %xmm0,64(%esp)
+ pxor %xmm7,%xmm7
+ movdqa %xmm0,80(%esp)
+ movl 116(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_xts_decrypt,.-.L_aes_hw_xts_decrypt_begin
+.globl aes_hw_cbc_encrypt
+.hidden aes_hw_cbc_encrypt
+.type aes_hw_cbc_encrypt,@function
+.align 16
+aes_hw_cbc_encrypt:
+.L_aes_hw_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl %esp,%ebx
+ movl 24(%esp),%edi
+ subl $24,%ebx
+ movl 28(%esp),%eax
+ andl $-16,%ebx
+ movl 32(%esp),%edx
+ movl 36(%esp),%ebp
+ testl %eax,%eax
+ jz .L076cbc_abort
+ cmpl $0,40(%esp)
+ xchgl %esp,%ebx
+ movups (%ebp),%xmm7
+ movl 240(%edx),%ecx
+ movl %edx,%ebp
+ movl %ebx,16(%esp)
+ movl %ecx,%ebx
+ je .L077cbc_decrypt
+ movaps %xmm7,%xmm2
+ cmpl $16,%eax
+ jb .L078cbc_enc_tail
+ subl $16,%eax
+ jmp .L079cbc_enc_loop
+.align 16
+.L079cbc_enc_loop:
+ movups (%esi),%xmm7
+ leal 16(%esi),%esi
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ xorps %xmm0,%xmm7
+ leal 32(%edx),%edx
+ xorps %xmm7,%xmm2
+.L080enc1_loop_15:
+.byte 102,15,56,220,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L080enc1_loop_15
+.byte 102,15,56,221,209
+ movl %ebx,%ecx
+ movl %ebp,%edx
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+ subl $16,%eax
+ jnc .L079cbc_enc_loop
+ addl $16,%eax
+ jnz .L078cbc_enc_tail
+ movaps %xmm2,%xmm7
+ pxor %xmm2,%xmm2
+ jmp .L081cbc_ret
+.L078cbc_enc_tail:
+ movl %eax,%ecx
+.long 2767451785
+ movl $16,%ecx
+ subl %eax,%ecx
+ xorl %eax,%eax
+.long 2868115081
+ leal -16(%edi),%edi
+ movl %ebx,%ecx
+ movl %edi,%esi
+ movl %ebp,%edx
+ jmp .L079cbc_enc_loop
+.align 16
+.L077cbc_decrypt:
+ cmpl $80,%eax
+ jbe .L082cbc_dec_tail
+ movaps %xmm7,(%esp)
+ subl $80,%eax
+ jmp .L083cbc_dec_loop6_enter
+.align 16
+.L084cbc_dec_loop6:
+ movaps %xmm0,(%esp)
+ movups %xmm7,(%edi)
+ leal 16(%edi),%edi
+.L083cbc_dec_loop6_enter:
+ movdqu (%esi),%xmm2
+ movdqu 16(%esi),%xmm3
+ movdqu 32(%esi),%xmm4
+ movdqu 48(%esi),%xmm5
+ movdqu 64(%esi),%xmm6
+ movdqu 80(%esi),%xmm7
+ call _aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm1
+ xorps %xmm0,%xmm6
+ movups 80(%esi),%xmm0
+ xorps %xmm1,%xmm7
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ leal 96(%esi),%esi
+ movups %xmm4,32(%edi)
+ movl %ebx,%ecx
+ movups %xmm5,48(%edi)
+ movl %ebp,%edx
+ movups %xmm6,64(%edi)
+ leal 80(%edi),%edi
+ subl $96,%eax
+ ja .L084cbc_dec_loop6
+ movaps %xmm7,%xmm2
+ movaps %xmm0,%xmm7
+ addl $80,%eax
+ jle .L085cbc_dec_clear_tail_collected
+ movups %xmm2,(%edi)
+ leal 16(%edi),%edi
+.L082cbc_dec_tail:
+ movups (%esi),%xmm2
+ movaps %xmm2,%xmm6
+ cmpl $16,%eax
+ jbe .L086cbc_dec_one
+ movups 16(%esi),%xmm3
+ movaps %xmm3,%xmm5
+ cmpl $32,%eax
+ jbe .L087cbc_dec_two
+ movups 32(%esi),%xmm4
+ cmpl $48,%eax
+ jbe .L088cbc_dec_three
+ movups 48(%esi),%xmm5
+ cmpl $64,%eax
+ jbe .L089cbc_dec_four
+ movups 64(%esi),%xmm6
+ movaps %xmm7,(%esp)
+ movups (%esi),%xmm2
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups (%esi),%xmm1
+ movups 16(%esi),%xmm0
+ xorps (%esp),%xmm2
+ xorps %xmm1,%xmm3
+ movups 32(%esi),%xmm1
+ xorps %xmm0,%xmm4
+ movups 48(%esi),%xmm0
+ xorps %xmm1,%xmm5
+ movups 64(%esi),%xmm7
+ xorps %xmm0,%xmm6
+ movups %xmm2,(%edi)
+ movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%edi)
+ pxor %xmm5,%xmm5
+ leal 64(%edi),%edi
+ movaps %xmm6,%xmm2
+ pxor %xmm6,%xmm6
+ subl $80,%eax
+ jmp .L090cbc_dec_tail_collected
+.align 16
+.L086cbc_dec_one:
+ movups (%edx),%xmm0
+ movups 16(%edx),%xmm1
+ leal 32(%edx),%edx
+ xorps %xmm0,%xmm2
+.L091dec1_loop_16:
+.byte 102,15,56,222,209
+ decl %ecx
+ movups (%edx),%xmm1
+ leal 16(%edx),%edx
+ jnz .L091dec1_loop_16
+.byte 102,15,56,223,209
+ xorps %xmm7,%xmm2
+ movaps %xmm6,%xmm7
+ subl $16,%eax
+ jmp .L090cbc_dec_tail_collected
+.align 16
+.L087cbc_dec_two:
+ call _aesni_decrypt2
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ movaps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ leal 16(%edi),%edi
+ movaps %xmm5,%xmm7
+ subl $32,%eax
+ jmp .L090cbc_dec_tail_collected
+.align 16
+.L088cbc_dec_three:
+ call _aesni_decrypt3
+ xorps %xmm7,%xmm2
+ xorps %xmm6,%xmm3
+ xorps %xmm5,%xmm4
+ movups %xmm2,(%edi)
+ movaps %xmm4,%xmm2
+ pxor %xmm4,%xmm4
+ movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
+ leal 32(%edi),%edi
+ movups 32(%esi),%xmm7
+ subl $48,%eax
+ jmp .L090cbc_dec_tail_collected
+.align 16
+.L089cbc_dec_four:
+ call _aesni_decrypt4
+ movups 16(%esi),%xmm1
+ movups 32(%esi),%xmm0
+ xorps %xmm7,%xmm2
+ movups 48(%esi),%xmm7
+ xorps %xmm6,%xmm3
+ movups %xmm2,(%edi)
+ xorps %xmm1,%xmm4
+ movups %xmm3,16(%edi)
+ pxor %xmm3,%xmm3
+ xorps %xmm0,%xmm5
+ movups %xmm4,32(%edi)
+ pxor %xmm4,%xmm4
+ leal 48(%edi),%edi
+ movaps %xmm5,%xmm2
+ pxor %xmm5,%xmm5
+ subl $64,%eax
+ jmp .L090cbc_dec_tail_collected
+.align 16
+.L085cbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+.L090cbc_dec_tail_collected:
+ andl $15,%eax
+ jnz .L092cbc_dec_tail_partial
+ movups %xmm2,(%edi)
+ pxor %xmm0,%xmm0
+ jmp .L081cbc_ret
+.align 16
+.L092cbc_dec_tail_partial:
+ movaps %xmm2,(%esp)
+ pxor %xmm0,%xmm0
+ movl $16,%ecx
+ movl %esp,%esi
+ subl %eax,%ecx
+.long 2767451785
+ movdqa %xmm2,(%esp)
+.L081cbc_ret:
+ movl 16(%esp),%esp
+ movl 36(%esp),%ebp
+ pxor %xmm2,%xmm2
+ pxor %xmm1,%xmm1
+ movups %xmm7,(%ebp)
+ pxor %xmm7,%xmm7
+.L076cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size aes_hw_cbc_encrypt,.-.L_aes_hw_cbc_encrypt_begin
+.hidden _aesni_set_encrypt_key
+.type _aesni_set_encrypt_key,@function
+.align 16
+_aesni_set_encrypt_key:
+ pushl %ebp
+ pushl %ebx
+ testl %eax,%eax
+ jz .L093bad_pointer
+ testl %edx,%edx
+ jz .L093bad_pointer
+ call .L094pic
+.L094pic:
+ popl %ebx
+ leal .Lkey_const-.L094pic(%ebx),%ebx
+ leal OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp
+ movups (%eax),%xmm0
+ xorps %xmm4,%xmm4
+ movl 4(%ebp),%ebp
+ leal 16(%edx),%edx
+ andl $268437504,%ebp
+ cmpl $256,%ecx
+ je .L09514rounds
+ cmpl $192,%ecx
+ je .L09612rounds
+ cmpl $128,%ecx
+ jne .L097bad_keybits
+.align 16
+.L09810rounds:
+ cmpl $268435456,%ebp
+ je .L09910rounds_alt
+ movl $9,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,200,1
+ call .L100key_128_cold
+.byte 102,15,58,223,200,2
+ call .L101key_128
+.byte 102,15,58,223,200,4
+ call .L101key_128
+.byte 102,15,58,223,200,8
+ call .L101key_128
+.byte 102,15,58,223,200,16
+ call .L101key_128
+.byte 102,15,58,223,200,32
+ call .L101key_128
+.byte 102,15,58,223,200,64
+ call .L101key_128
+.byte 102,15,58,223,200,128
+ call .L101key_128
+.byte 102,15,58,223,200,27
+ call .L101key_128
+.byte 102,15,58,223,200,54
+ call .L101key_128
+ movups %xmm0,(%edx)
+ movl %ecx,80(%edx)
+ jmp .L102good_key
+.align 16
+.L101key_128:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.L100key_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 16
+.L09910rounds_alt:
+ movdqa (%ebx),%xmm5
+ movl $8,%ecx
+ movdqa 32(%ebx),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,-16(%edx)
+.L103loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leal 16(%edx),%edx
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%edx)
+ movdqa %xmm0,%xmm2
+ decl %ecx
+ jnz .L103loop_key128
+ movdqa 48(%ebx),%xmm4
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%edx)
+ movl $9,%ecx
+ movl %ecx,96(%edx)
+ jmp .L102good_key
+.align 16
+.L09612rounds:
+ movq 16(%eax),%xmm2
+ cmpl $268435456,%ebp
+ je .L10412rounds_alt
+ movl $11,%ecx
+ movups %xmm0,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L105key_192a_cold
+.byte 102,15,58,223,202,2
+ call .L106key_192b
+.byte 102,15,58,223,202,4
+ call .L107key_192a
+.byte 102,15,58,223,202,8
+ call .L106key_192b
+.byte 102,15,58,223,202,16
+ call .L107key_192a
+.byte 102,15,58,223,202,32
+ call .L106key_192b
+.byte 102,15,58,223,202,64
+ call .L107key_192a
+.byte 102,15,58,223,202,128
+ call .L106key_192b
+ movups %xmm0,(%edx)
+ movl %ecx,48(%edx)
+ jmp .L102good_key
+.align 16
+.L107key_192a:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+.align 16
+.L105key_192a_cold:
+ movaps %xmm2,%xmm5
+.L108key_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+.align 16
+.L106key_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%edx)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%edx)
+ leal 32(%edx),%edx
+ jmp .L108key_192b_warm
+.align 16
+.L10412rounds_alt:
+ movdqa 16(%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $8,%ecx
+ movdqu %xmm0,-16(%edx)
+.L109loop_key192:
+ movq %xmm2,(%edx)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leal 24(%edx),%edx
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%edx)
+ decl %ecx
+ jnz .L109loop_key192
+ movl $11,%ecx
+ movl %ecx,32(%edx)
+ jmp .L102good_key
+.align 16
+.L09514rounds:
+ movups 16(%eax),%xmm2
+ leal 16(%edx),%edx
+ cmpl $268435456,%ebp
+ je .L11014rounds_alt
+ movl $13,%ecx
+ movups %xmm0,-32(%edx)
+ movups %xmm2,-16(%edx)
+.byte 102,15,58,223,202,1
+ call .L111key_256a_cold
+.byte 102,15,58,223,200,1
+ call .L112key_256b
+.byte 102,15,58,223,202,2
+ call .L113key_256a
+.byte 102,15,58,223,200,2
+ call .L112key_256b
+.byte 102,15,58,223,202,4
+ call .L113key_256a
+.byte 102,15,58,223,200,4
+ call .L112key_256b
+.byte 102,15,58,223,202,8
+ call .L113key_256a
+.byte 102,15,58,223,200,8
+ call .L112key_256b
+.byte 102,15,58,223,202,16
+ call .L113key_256a
+.byte 102,15,58,223,200,16
+ call .L112key_256b
+.byte 102,15,58,223,202,32
+ call .L113key_256a
+.byte 102,15,58,223,200,32
+ call .L112key_256b
+.byte 102,15,58,223,202,64
+ call .L113key_256a
+ movups %xmm0,(%edx)
+ movl %ecx,16(%edx)
+ xorl %eax,%eax
+ jmp .L102good_key
+.align 16
+.L113key_256a:
+ movups %xmm2,(%edx)
+ leal 16(%edx),%edx
+.L111key_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ ret
+.align 16
+.L112key_256b:
+ movups %xmm0,(%edx)
+ leal 16(%edx),%edx
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ ret
+.align 16
+.L11014rounds_alt:
+ movdqa (%ebx),%xmm5
+ movdqa 32(%ebx),%xmm4
+ movl $7,%ecx
+ movdqu %xmm0,-32(%edx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,-16(%edx)
+.L114loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%edx)
+ decl %ecx
+ jz .L115done_key256
+ pshufd $255,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%edx)
+ leal 32(%edx),%edx
+ movdqa %xmm2,%xmm1
+ jmp .L114loop_key256
+.L115done_key256:
+ movl $13,%ecx
+ movl %ecx,16(%edx)
+.L102good_key:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ xorl %eax,%eax
+ popl %ebx
+ popl %ebp
+ ret
+.align 4
+.L093bad_pointer:
+ movl $-1,%eax
+ popl %ebx
+ popl %ebp
+ ret
+.align 4
+.L097bad_keybits:
+ pxor %xmm0,%xmm0
+ movl $-2,%eax
+ popl %ebx
+ popl %ebp
+ ret
+.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
+.globl aes_hw_set_encrypt_key
+.hidden aes_hw_set_encrypt_key
+.type aes_hw_set_encrypt_key,@function
+.align 16
+aes_hw_set_encrypt_key:
+.L_aes_hw_set_encrypt_key_begin:
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call .L116pic
+.L116pic:
+ popl %ebx
+ leal BORINGSSL_function_hit+3-.L116pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call _aesni_set_encrypt_key
+ ret
+.size aes_hw_set_encrypt_key,.-.L_aes_hw_set_encrypt_key_begin
+.globl aes_hw_set_decrypt_key
+.hidden aes_hw_set_decrypt_key
+.type aes_hw_set_decrypt_key,@function
+.align 16
+aes_hw_set_decrypt_key:
+.L_aes_hw_set_decrypt_key_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ call _aesni_set_encrypt_key
+ movl 12(%esp),%edx
+ shll $4,%ecx
+ testl %eax,%eax
+ jnz .L117dec_key_ret
+ leal 16(%edx,%ecx,1),%eax
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+ movups %xmm0,(%eax)
+ movups %xmm1,(%edx)
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+.L118dec_key_inverse:
+ movups (%edx),%xmm0
+ movups (%eax),%xmm1
+.byte 102,15,56,219,192
+.byte 102,15,56,219,201
+ leal 16(%edx),%edx
+ leal -16(%eax),%eax
+ movups %xmm0,16(%eax)
+ movups %xmm1,-16(%edx)
+ cmpl %edx,%eax
+ ja .L118dec_key_inverse
+ movups (%edx),%xmm0
+.byte 102,15,56,219,192
+ movups %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorl %eax,%eax
+.L117dec_key_ret:
+ ret
+.size aes_hw_set_decrypt_key,.-.L_aes_hw_set_decrypt_key_begin
+.align 64
+.Lkey_const:
+.long 202313229,202313229,202313229,202313229
+.long 67569157,67569157,67569157,67569157
+.long 1,1,1,1
+.long 27,27,27,27
+.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69
+.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83
+.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115
+.byte 115,108,46,111,114,103,62,0
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S
@@ -1,0 +1,997 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl bn_mul_add_words
+.hidden bn_mul_add_words
+.type bn_mul_add_words,@function
+.align 16
+bn_mul_add_words:
+.L_bn_mul_add_words_begin:
+ call .L000PIC_me_up
+.L000PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L000PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L001maw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+ jmp .L002maw_sse2_entry
+.align 16
+.L003maw_sse2_unrolled:
+ movd (%eax),%mm3
+ paddq %mm3,%mm1
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ movd 4(%edx),%mm4
+ pmuludq %mm0,%mm4
+ movd 8(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd 12(%edx),%mm7
+ pmuludq %mm0,%mm7
+ paddq %mm2,%mm1
+ movd 4(%eax),%mm3
+ paddq %mm4,%mm3
+ movd 8(%eax),%mm5
+ paddq %mm6,%mm5
+ movd 12(%eax),%mm4
+ paddq %mm4,%mm7
+ movd %mm1,(%eax)
+ movd 16(%edx),%mm2
+ pmuludq %mm0,%mm2
+ psrlq $32,%mm1
+ movd 20(%edx),%mm4
+ pmuludq %mm0,%mm4
+ paddq %mm3,%mm1
+ movd 24(%edx),%mm6
+ pmuludq %mm0,%mm6
+ movd %mm1,4(%eax)
+ psrlq $32,%mm1
+ movd 28(%edx),%mm3
+ addl $32,%edx
+ pmuludq %mm0,%mm3
+ paddq %mm5,%mm1
+ movd 16(%eax),%mm5
+ paddq %mm5,%mm2
+ movd %mm1,8(%eax)
+ psrlq $32,%mm1
+ paddq %mm7,%mm1
+ movd 20(%eax),%mm5
+ paddq %mm5,%mm4
+ movd %mm1,12(%eax)
+ psrlq $32,%mm1
+ paddq %mm2,%mm1
+ movd 24(%eax),%mm5
+ paddq %mm5,%mm6
+ movd %mm1,16(%eax)
+ psrlq $32,%mm1
+ paddq %mm4,%mm1
+ movd 28(%eax),%mm5
+ paddq %mm5,%mm3
+ movd %mm1,20(%eax)
+ psrlq $32,%mm1
+ paddq %mm6,%mm1
+ movd %mm1,24(%eax)
+ psrlq $32,%mm1
+ paddq %mm3,%mm1
+ movd %mm1,28(%eax)
+ leal 32(%eax),%eax
+ psrlq $32,%mm1
+ subl $8,%ecx
+ jz .L004maw_sse2_exit
+.L002maw_sse2_entry:
+ testl $4294967288,%ecx
+ jnz .L003maw_sse2_unrolled
+.align 4
+.L005maw_sse2_loop:
+ movd (%edx),%mm2
+ movd (%eax),%mm3
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm3,%mm1
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz .L005maw_sse2_loop
+.L004maw_sse2_exit:
+ movd %mm1,%eax
+ emms
+ ret
+.align 16
+.L001maw_non_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 28(%esp),%ecx
+ movl 24(%esp),%ebx
+ andl $4294967288,%ecx
+ movl 32(%esp),%ebp
+ pushl %ecx
+ jz .L006maw_finish
+.align 16
+.L007maw_loop:
+
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+
+ movl 28(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 28(%edi),%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ subl $8,%ecx
+ leal 32(%ebx),%ebx
+ leal 32(%edi),%edi
+ jnz .L007maw_loop
+.L006maw_finish:
+ movl 32(%esp),%ecx
+ andl $7,%ecx
+ jnz .L008maw_finish2
+ jmp .L009maw_end
+.L008maw_finish2:
+
+ movl (%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl (%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ jz .L009maw_end
+
+ movl 4(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 4(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ jz .L009maw_end
+
+ movl 8(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 8(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ jz .L009maw_end
+
+ movl 12(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 12(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ jz .L009maw_end
+
+ movl 16(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 16(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ jz .L009maw_end
+
+ movl 20(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 20(%edi),%eax
+ adcl $0,%edx
+ decl %ecx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ jz .L009maw_end
+
+ movl 24(%ebx),%eax
+ mull %ebp
+ addl %esi,%eax
+ adcl $0,%edx
+ addl 24(%edi),%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+.L009maw_end:
+ movl %esi,%eax
+ popl %ecx
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_add_words,.-.L_bn_mul_add_words_begin
+.globl bn_mul_words
+.hidden bn_mul_words
+.type bn_mul_words,@function
+.align 16
+bn_mul_words:
+.L_bn_mul_words_begin:
+ call .L010PIC_me_up
+.L010PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L010PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L011mw_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+ movd 16(%esp),%mm0
+ pxor %mm1,%mm1
+.align 16
+.L012mw_sse2_loop:
+ movd (%edx),%mm2
+ pmuludq %mm0,%mm2
+ leal 4(%edx),%edx
+ paddq %mm2,%mm1
+ movd %mm1,(%eax)
+ subl $1,%ecx
+ psrlq $32,%mm1
+ leal 4(%eax),%eax
+ jnz .L012mw_sse2_loop
+ movd %mm1,%eax
+ emms
+ ret
+.align 16
+.L011mw_non_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ xorl %esi,%esi
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ebp
+ movl 32(%esp),%ecx
+ andl $4294967288,%ebp
+ jz .L013mw_finish
+.L014mw_loop:
+
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+
+ movl 28(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,28(%edi)
+ movl %edx,%esi
+
+ addl $32,%ebx
+ addl $32,%edi
+ subl $8,%ebp
+ jz .L013mw_finish
+ jmp .L014mw_loop
+.L013mw_finish:
+ movl 28(%esp),%ebp
+ andl $7,%ebp
+ jnz .L015mw_finish2
+ jmp .L016mw_end
+.L015mw_finish2:
+
+ movl (%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L016mw_end
+
+ movl 4(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,4(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L016mw_end
+
+ movl 8(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,8(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L016mw_end
+
+ movl 12(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,12(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L016mw_end
+
+ movl 16(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,16(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L016mw_end
+
+ movl 20(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,20(%edi)
+ movl %edx,%esi
+ decl %ebp
+ jz .L016mw_end
+
+ movl 24(%ebx),%eax
+ mull %ecx
+ addl %esi,%eax
+ adcl $0,%edx
+ movl %eax,24(%edi)
+ movl %edx,%esi
+.L016mw_end:
+ movl %esi,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_words,.-.L_bn_mul_words_begin
+.globl bn_sqr_words
+.hidden bn_sqr_words
+.type bn_sqr_words,@function
+.align 16
+bn_sqr_words:
+.L_bn_sqr_words_begin:
+ call .L017PIC_me_up
+.L017PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L017PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L018sqr_non_sse2
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ movl 12(%esp),%ecx
+.align 16
+.L019sqr_sse2_loop:
+ movd (%edx),%mm0
+ pmuludq %mm0,%mm0
+ leal 4(%edx),%edx
+ movq %mm0,(%eax)
+ subl $1,%ecx
+ leal 8(%eax),%eax
+ jnz .L019sqr_sse2_loop
+ emms
+ ret
+.align 16
+.L018sqr_non_sse2:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%ebx
+ andl $4294967288,%ebx
+ jz .L020sw_finish
+.L021sw_loop:
+
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ movl %edx,4(%esi)
+
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ movl %edx,12(%esi)
+
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ movl %edx,20(%esi)
+
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ movl %edx,28(%esi)
+
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ movl %edx,36(%esi)
+
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ movl %edx,44(%esi)
+
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+
+ movl 28(%edi),%eax
+ mull %eax
+ movl %eax,56(%esi)
+ movl %edx,60(%esi)
+
+ addl $32,%edi
+ addl $64,%esi
+ subl $8,%ebx
+ jnz .L021sw_loop
+.L020sw_finish:
+ movl 28(%esp),%ebx
+ andl $7,%ebx
+ jz .L022sw_end
+
+ movl (%edi),%eax
+ mull %eax
+ movl %eax,(%esi)
+ decl %ebx
+ movl %edx,4(%esi)
+ jz .L022sw_end
+
+ movl 4(%edi),%eax
+ mull %eax
+ movl %eax,8(%esi)
+ decl %ebx
+ movl %edx,12(%esi)
+ jz .L022sw_end
+
+ movl 8(%edi),%eax
+ mull %eax
+ movl %eax,16(%esi)
+ decl %ebx
+ movl %edx,20(%esi)
+ jz .L022sw_end
+
+ movl 12(%edi),%eax
+ mull %eax
+ movl %eax,24(%esi)
+ decl %ebx
+ movl %edx,28(%esi)
+ jz .L022sw_end
+
+ movl 16(%edi),%eax
+ mull %eax
+ movl %eax,32(%esi)
+ decl %ebx
+ movl %edx,36(%esi)
+ jz .L022sw_end
+
+ movl 20(%edi),%eax
+ mull %eax
+ movl %eax,40(%esi)
+ decl %ebx
+ movl %edx,44(%esi)
+ jz .L022sw_end
+
+ movl 24(%edi),%eax
+ mull %eax
+ movl %eax,48(%esi)
+ movl %edx,52(%esi)
+.L022sw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sqr_words,.-.L_bn_sqr_words_begin
+.globl bn_div_words
+.hidden bn_div_words
+.type bn_div_words,@function
+.align 16
+bn_div_words:
+.L_bn_div_words_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%eax
+ movl 12(%esp),%ecx
+ divl %ecx
+ ret
+.size bn_div_words,.-.L_bn_div_words_begin
+.globl bn_add_words
+.hidden bn_add_words
+.type bn_add_words,@function
+.align 16
+bn_add_words:
+.L_bn_add_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L023aw_finish
+.L024aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L024aw_loop
+.L023aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L025aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L025aw_end
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L025aw_end
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L025aw_end
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L025aw_end
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L025aw_end
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L025aw_end
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ addl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ addl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+.L025aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_add_words,.-.L_bn_add_words_begin
+.globl bn_sub_words
+.hidden bn_sub_words
+.type bn_sub_words,@function
+.align 16
+bn_sub_words:
+.L_bn_sub_words_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+
+ movl 20(%esp),%ebx
+ movl 24(%esp),%esi
+ movl 28(%esp),%edi
+ movl 32(%esp),%ebp
+ xorl %eax,%eax
+ andl $4294967288,%ebp
+ jz .L026aw_finish
+.L027aw_loop:
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,(%ebx)
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,4(%ebx)
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,8(%ebx)
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,12(%ebx)
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,16(%ebx)
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,20(%ebx)
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+
+ movl 28(%esi),%ecx
+ movl 28(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,28(%ebx)
+
+ addl $32,%esi
+ addl $32,%edi
+ addl $32,%ebx
+ subl $8,%ebp
+ jnz .L027aw_loop
+.L026aw_finish:
+ movl 32(%esp),%ebp
+ andl $7,%ebp
+ jz .L028aw_end
+
+ movl (%esi),%ecx
+ movl (%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,(%ebx)
+ jz .L028aw_end
+
+ movl 4(%esi),%ecx
+ movl 4(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,4(%ebx)
+ jz .L028aw_end
+
+ movl 8(%esi),%ecx
+ movl 8(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,8(%ebx)
+ jz .L028aw_end
+
+ movl 12(%esi),%ecx
+ movl 12(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,12(%ebx)
+ jz .L028aw_end
+
+ movl 16(%esi),%ecx
+ movl 16(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,16(%ebx)
+ jz .L028aw_end
+
+ movl 20(%esi),%ecx
+ movl 20(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ decl %ebp
+ movl %ecx,20(%ebx)
+ jz .L028aw_end
+
+ movl 24(%esi),%ecx
+ movl 24(%edi),%edx
+ subl %eax,%ecx
+ movl $0,%eax
+ adcl %eax,%eax
+ subl %edx,%ecx
+ adcl $0,%eax
+ movl %ecx,24(%ebx)
+.L028aw_end:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_sub_words,.-.L_bn_sub_words_begin
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S
@@ -1,0 +1,1266 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl bn_mul_comba8
+.hidden bn_mul_comba8
+.type bn_mul_comba8,@function
+.align 16
+bn_mul_comba8:
+.L_bn_mul_comba8_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 16(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 20(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 24(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,28(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 16(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 12(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 16(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 16(%esi),%eax
+ adcl %edx,%ecx
+ movl 20(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 12(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,36(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esi),%eax
+ adcl %edx,%ebp
+ movl 20(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 16(%esi),%eax
+ adcl %edx,%ebp
+ movl 24(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 12(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 16(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 24(%esi),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esi),%eax
+ adcl %edx,%ebx
+ movl 24(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 16(%esi),%eax
+ adcl %edx,%ebx
+ movl 28(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 20(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,44(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 24(%esi),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esi),%eax
+ adcl %edx,%ecx
+ movl 28(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 24(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 24(%esi),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 28(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,52(%eax)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%eax)
+
+
+ movl %ebx,60(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_mul_comba8,.-.L_bn_mul_comba8_begin
+.globl bn_mul_comba4
+.hidden bn_mul_comba4
+.type bn_mul_comba4,@function
+.align 16
+bn_mul_comba4:
+.L_bn_mul_comba4_begin:
+ pushl %esi
+ movl 12(%esp),%esi
+ pushl %edi
+ movl 20(%esp),%edi
+ pushl %ebp
+ pushl %ebx
+ xorl %ebx,%ebx
+ movl (%esi),%eax
+ xorl %ecx,%ecx
+ movl (%edi),%edx
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl (%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%eax)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl (%esi),%eax
+ adcl %edx,%ebp
+ movl 4(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl (%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,4(%eax)
+ movl 8(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 4(%esi),%eax
+ adcl %edx,%ebx
+ movl 4(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl (%esi),%eax
+ adcl %edx,%ebx
+ movl 8(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl (%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 8(%esi),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 4(%esi),%eax
+ adcl %edx,%ecx
+ movl 8(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl (%esi),%eax
+ adcl %edx,%ecx
+ movl 12(%edi),%edx
+ adcl $0,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ movl 4(%edi),%edx
+ adcl $0,%ebp
+ movl %ebx,12(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 8(%esi),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 4(%esi),%eax
+ adcl %edx,%ebp
+ movl 12(%edi),%edx
+ adcl $0,%ebx
+
+ mull %edx
+ addl %eax,%ecx
+ movl 20(%esp),%eax
+ adcl %edx,%ebp
+ movl 8(%edi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 8(%esi),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+
+ mull %edx
+ addl %eax,%ebp
+ movl 20(%esp),%eax
+ adcl %edx,%ebx
+ movl 12(%edi),%edx
+ adcl $0,%ecx
+ movl %ebp,20(%eax)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%ebx
+ movl 20(%esp),%eax
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%eax)
+
+
+ movl %ecx,28(%eax)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_mul_comba4,.-.L_bn_mul_comba4_begin
+.globl bn_sqr_comba8
+.hidden bn_sqr_comba8
+.type bn_sqr_comba8,@function
+.align 16
+bn_sqr_comba8:
+.L_bn_sqr_comba8_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 12(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl (%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 20(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 16(%esi),%eax
+ adcl $0,%ebp
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%eax
+ adcl $0,%ebx
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,28(%edi)
+ movl 4(%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 8(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 20(%esi),%eax
+ adcl $0,%ecx
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 16(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 8(%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,32(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+ movl 12(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 20(%esi),%eax
+ adcl $0,%ebp
+ movl 16(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 28(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,36(%edi)
+ movl 12(%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 24(%esi),%eax
+ adcl $0,%ebx
+ movl 16(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 20(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 16(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,40(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 24(%esi),%eax
+ adcl $0,%ecx
+ movl 20(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 28(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,44(%edi)
+ movl 20(%esi),%edx
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%eax
+ adcl $0,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 24(%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,48(%edi)
+ movl 28(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 28(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,52(%edi)
+
+
+ xorl %ecx,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ adcl $0,%ecx
+ movl %ebp,56(%edi)
+
+ movl %ebx,60(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_sqr_comba8,.-.L_bn_sqr_comba8_begin
+.globl bn_sqr_comba4
+.hidden bn_sqr_comba4
+.type bn_sqr_comba4,@function
+.align 16
+bn_sqr_comba4:
+.L_bn_sqr_comba4_begin:
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+ pushl %ebx
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ xorl %ebx,%ebx
+ xorl %ecx,%ecx
+ movl (%esi),%eax
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl (%esi),%edx
+ adcl $0,%ebp
+ movl %ebx,(%edi)
+ movl 4(%esi),%eax
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+ movl %ecx,4(%edi)
+ movl (%esi),%edx
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 4(%esi),%eax
+ adcl $0,%ecx
+
+ mull %eax
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl (%esi),%edx
+ adcl $0,%ecx
+ movl %ebp,8(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ebp,%ebp
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 8(%esi),%eax
+ adcl $0,%ebp
+ movl 4(%esi),%edx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebp
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ movl 12(%esi),%eax
+ adcl $0,%ebp
+ movl %ebx,12(%edi)
+ movl 4(%esi),%edx
+
+
+ xorl %ebx,%ebx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ebx
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%eax
+ adcl $0,%ebx
+
+ mull %eax
+ addl %eax,%ecx
+ adcl %edx,%ebp
+ movl 8(%esi),%edx
+ adcl $0,%ebx
+ movl %ecx,16(%edi)
+ movl 12(%esi),%eax
+
+
+ xorl %ecx,%ecx
+
+ mull %edx
+ addl %eax,%eax
+ adcl %edx,%edx
+ adcl $0,%ecx
+ addl %eax,%ebp
+ adcl %edx,%ebx
+ movl 12(%esi),%eax
+ adcl $0,%ecx
+ movl %ebp,20(%edi)
+
+
+ xorl %ebp,%ebp
+
+ mull %eax
+ addl %eax,%ebx
+ adcl %edx,%ecx
+ adcl $0,%ebp
+ movl %ebx,24(%edi)
+
+ movl %ecx,28(%edi)
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S
@@ -1,0 +1,294 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl gcm_gmult_ssse3
+.hidden gcm_gmult_ssse3
+.type gcm_gmult_ssse3,@function
+.align 16
+gcm_gmult_ssse3:
+.L_gcm_gmult_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movdqu (%edi),%xmm0
+ call .L000pic_point
+.L000pic_point:
+ popl %eax
+ movdqa .Lreverse_bytes-.L000pic_point(%eax),%xmm7
+ movdqa .Llow4_mask-.L000pic_point(%eax),%xmm2
+.byte 102,15,56,0,199
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ movl $5,%eax
+.L001loop_row_1:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz .L001loop_row_1
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movl $5,%eax
+.L002loop_row_2:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz .L002loop_row_2
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movl $6,%eax
+.L003loop_row_3:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz .L003loop_row_3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,0,215
+ movdqu %xmm2,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_gmult_ssse3,.-.L_gcm_gmult_ssse3_begin
+.globl gcm_ghash_ssse3
+.hidden gcm_ghash_ssse3
+.type gcm_ghash_ssse3,@function
+.align 16
+gcm_ghash_ssse3:
+.L_gcm_ghash_ssse3_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%edi
+ movl 24(%esp),%esi
+ movl 28(%esp),%edx
+ movl 32(%esp),%ecx
+ movdqu (%edi),%xmm0
+ call .L004pic_point
+.L004pic_point:
+ popl %ebx
+ movdqa .Lreverse_bytes-.L004pic_point(%ebx),%xmm7
+ andl $-16,%ecx
+.byte 102,15,56,0,199
+ pxor %xmm3,%xmm3
+.L005loop_ghash:
+ movdqa .Llow4_mask-.L004pic_point(%ebx),%xmm2
+ movdqu (%edx),%xmm1
+.byte 102,15,56,0,207
+ pxor %xmm1,%xmm0
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ pxor %xmm2,%xmm2
+ movl $5,%eax
+.L006loop_row_4:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz .L006loop_row_4
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movl $5,%eax
+.L007loop_row_5:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz .L007loop_row_5
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movl $6,%eax
+.L008loop_row_6:
+ movdqa (%esi),%xmm4
+ leal 16(%esi),%esi
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+ pxor %xmm5,%xmm2
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+ subl $1,%eax
+ jnz .L008loop_row_6
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movdqa %xmm2,%xmm0
+ leal -256(%esi),%esi
+ leal 16(%edx),%edx
+ subl $16,%ecx
+ jnz .L005loop_ghash
+.byte 102,15,56,0,199
+ movdqu %xmm0,(%edi)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_ghash_ssse3,.-.L_gcm_ghash_ssse3_begin
+.align 16
+.Lreverse_bytes:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.align 16
+.Llow4_mask:
+.long 252645135,252645135,252645135,252645135
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S
@@ -1,0 +1,330 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl gcm_init_clmul
+.hidden gcm_init_clmul
+.type gcm_init_clmul,@function
+.align 16
+gcm_init_clmul:
+.L_gcm_init_clmul_begin:
+ movl 4(%esp),%edx
+ movl 8(%esp),%eax
+ call .L000pic
+.L000pic:
+ popl %ecx
+ leal .Lbswap-.L000pic(%ecx),%ecx
+ movdqu (%eax),%xmm2
+ pshufd $78,%xmm2,%xmm2
+ pshufd $255,%xmm2,%xmm4
+ movdqa %xmm2,%xmm3
+ psllq $1,%xmm2
+ pxor %xmm5,%xmm5
+ psrlq $63,%xmm3
+ pcmpgtd %xmm4,%xmm5
+ pslldq $8,%xmm3
+ por %xmm3,%xmm2
+ pand 16(%ecx),%xmm5
+ pxor %xmm5,%xmm2
+ movdqa %xmm2,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,(%edx)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%edx)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%edx)
+ ret
+.size gcm_init_clmul,.-.L_gcm_init_clmul_begin
+.globl gcm_gmult_clmul
+.hidden gcm_gmult_clmul
+.type gcm_gmult_clmul,@function
+.align 16
+gcm_gmult_clmul:
+.L_gcm_gmult_clmul_begin:
+ movl 4(%esp),%eax
+ movl 8(%esp),%edx
+ call .L001pic
+.L001pic:
+ popl %ecx
+ leal .Lbswap-.L001pic(%ecx),%ecx
+ movdqu (%eax),%xmm0
+ movdqa (%ecx),%xmm5
+ movups (%edx),%xmm2
+.byte 102,15,56,0,197
+ movups 32(%edx),%xmm4
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%eax)
+ ret
+.size gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin
+.globl gcm_ghash_clmul
+.hidden gcm_ghash_clmul
+.type gcm_ghash_clmul,@function
+.align 16
+gcm_ghash_clmul:
+.L_gcm_ghash_clmul_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%eax
+ movl 24(%esp),%edx
+ movl 28(%esp),%esi
+ movl 32(%esp),%ebx
+ call .L002pic
+.L002pic:
+ popl %ecx
+ leal .Lbswap-.L002pic(%ecx),%ecx
+ movdqu (%eax),%xmm0
+ movdqa (%ecx),%xmm5
+ movdqu (%edx),%xmm2
+.byte 102,15,56,0,197
+ subl $16,%ebx
+ jz .L003odd_tail
+ movdqu (%esi),%xmm3
+ movdqu 16(%esi),%xmm6
+.byte 102,15,56,0,221
+.byte 102,15,56,0,245
+ movdqu 32(%edx),%xmm5
+ pxor %xmm3,%xmm0
+ pshufd $78,%xmm6,%xmm3
+ movdqa %xmm6,%xmm7
+ pxor %xmm6,%xmm3
+ leal 32(%esi),%esi
+.byte 102,15,58,68,242,0
+.byte 102,15,58,68,250,17
+.byte 102,15,58,68,221,0
+ movups 16(%edx),%xmm2
+ nop
+ subl $32,%ebx
+ jbe .L004even_tail
+ jmp .L005mod_loop
+.align 32
+.L005mod_loop:
+ pshufd $78,%xmm0,%xmm4
+ movdqa %xmm0,%xmm1
+ pxor %xmm0,%xmm4
+ nop
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,229,16
+ movups (%edx),%xmm2
+ xorps %xmm6,%xmm0
+ movdqa (%ecx),%xmm5
+ xorps %xmm7,%xmm1
+ movdqu (%esi),%xmm7
+ pxor %xmm0,%xmm3
+ movdqu 16(%esi),%xmm6
+ pxor %xmm1,%xmm3
+.byte 102,15,56,0,253
+ pxor %xmm3,%xmm4
+ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm4
+ pslldq $8,%xmm3
+ pxor %xmm4,%xmm1
+ pxor %xmm3,%xmm0
+.byte 102,15,56,0,245
+ pxor %xmm7,%xmm1
+ movdqa %xmm6,%xmm7
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+.byte 102,15,58,68,242,0
+ movups 32(%edx),%xmm5
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ pshufd $78,%xmm7,%xmm3
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm7,%xmm3
+ pxor %xmm4,%xmm1
+.byte 102,15,58,68,250,17
+ movups 16(%edx),%xmm2
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,58,68,221,0
+ leal 32(%esi),%esi
+ subl $32,%ebx
+ ja .L005mod_loop
+.L004even_tail:
+ pshufd $78,%xmm0,%xmm4
+ movdqa %xmm0,%xmm1
+ pxor %xmm0,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,229,16
+ movdqa (%ecx),%xmm5
+ xorps %xmm6,%xmm0
+ xorps %xmm7,%xmm1
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pxor %xmm3,%xmm4
+ movdqa %xmm4,%xmm3
+ psrldq $8,%xmm4
+ pslldq $8,%xmm3
+ pxor %xmm4,%xmm1
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ testl %ebx,%ebx
+ jnz .L006done
+ movups (%edx),%xmm2
+.L003odd_tail:
+ movdqu (%esi),%xmm3
+.byte 102,15,56,0,221
+ pxor %xmm3,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pshufd $78,%xmm2,%xmm4
+ pxor %xmm0,%xmm3
+ pxor %xmm2,%xmm4
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ xorps %xmm0,%xmm3
+ xorps %xmm1,%xmm3
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.L006done:
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%eax)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin
+.align 64
+.Lbswap:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67
+.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112
+.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62
+.byte 0
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S
@@ -1,0 +1,688 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl md5_block_asm_data_order
+.hidden md5_block_asm_data_order
+.type md5_block_asm_data_order,@function
+.align 16
+md5_block_asm_data_order:
+.L_md5_block_asm_data_order_begin:
+ pushl %esi
+ pushl %edi
+ movl 12(%esp),%edi
+ movl 16(%esp),%esi
+ movl 20(%esp),%ecx
+ pushl %ebp
+ shll $6,%ecx
+ pushl %ebx
+ addl %esi,%ecx
+ subl $64,%ecx
+ movl (%edi),%eax
+ pushl %ecx
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+.L000start:
+
+
+ movl %ecx,%edi
+ movl (%esi),%ebp
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 3614090360(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 4(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 3905402710(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 8(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 606105819(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 12(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 3250441966(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 16(%esi),%ebp
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 4118548399(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 20(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 1200080426(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 24(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 2821735955(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 28(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 4249261313(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 32(%esi),%ebp
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 1770035416(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 36(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 2336552879(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 40(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 4294925233(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 44(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 2304563134(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 48(%esi),%ebp
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ andl %ebx,%edi
+ leal 1804603682(%eax,%ebp,1),%eax
+ xorl %edx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $7,%eax
+ movl 52(%esi),%ebp
+ addl %ebx,%eax
+
+ xorl %ecx,%edi
+ andl %eax,%edi
+ leal 4254626195(%edx,%ebp,1),%edx
+ xorl %ecx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $12,%edx
+ movl 56(%esi),%ebp
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ andl %edx,%edi
+ leal 2792965006(%ecx,%ebp,1),%ecx
+ xorl %ebx,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $17,%ecx
+ movl 60(%esi),%ebp
+ addl %edx,%ecx
+
+ xorl %eax,%edi
+ andl %ecx,%edi
+ leal 1236535329(%ebx,%ebp,1),%ebx
+ xorl %eax,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $22,%ebx
+ movl 4(%esi),%ebp
+ addl %ecx,%ebx
+
+
+
+ leal 4129170786(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 24(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 3225465664(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 44(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 643717713(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl (%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 3921069994(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 20(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+ leal 3593408605(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 40(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 38016083(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 60(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 3634488961(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 16(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 3889429448(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 36(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+ leal 568446438(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 56(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 3275163606(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 12(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 4107603335(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 32(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 1163531501(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 52(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+ leal 2850285829(%eax,%ebp,1),%eax
+ xorl %ebx,%edi
+ andl %edx,%edi
+ movl 8(%esi),%ebp
+ xorl %ecx,%edi
+ addl %edi,%eax
+ movl %ebx,%edi
+ roll $5,%eax
+ addl %ebx,%eax
+
+ leal 4243563512(%edx,%ebp,1),%edx
+ xorl %eax,%edi
+ andl %ecx,%edi
+ movl 28(%esi),%ebp
+ xorl %ebx,%edi
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $9,%edx
+ addl %eax,%edx
+
+ leal 1735328473(%ecx,%ebp,1),%ecx
+ xorl %edx,%edi
+ andl %ebx,%edi
+ movl 48(%esi),%ebp
+ xorl %eax,%edi
+ addl %edi,%ecx
+ movl %edx,%edi
+ roll $14,%ecx
+ addl %edx,%ecx
+
+ leal 2368359562(%ebx,%ebp,1),%ebx
+ xorl %ecx,%edi
+ andl %eax,%edi
+ movl 20(%esi),%ebp
+ xorl %edx,%edi
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $20,%ebx
+ addl %ecx,%ebx
+
+
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 4294588738(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 32(%esi),%ebp
+ movl %ebx,%edi
+
+ leal 2272392833(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 44(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 1839030562(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 56(%esi),%ebp
+ movl %edx,%edi
+
+ leal 4259657740(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 4(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 2763975236(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 16(%esi),%ebp
+ movl %ebx,%edi
+
+ leal 1272893353(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 28(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 4139469664(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 40(%esi),%ebp
+ movl %edx,%edi
+
+ leal 3200236656(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 52(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 681279174(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl (%esi),%ebp
+ movl %ebx,%edi
+
+ leal 3936430074(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 12(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 3572445317(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 24(%esi),%ebp
+ movl %edx,%edi
+
+ leal 76029189(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl 36(%esi),%ebp
+ addl %edi,%ebx
+ movl %ecx,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+ xorl %edx,%edi
+ xorl %ebx,%edi
+ leal 3654602809(%eax,%ebp,1),%eax
+ addl %edi,%eax
+ roll $4,%eax
+ movl 48(%esi),%ebp
+ movl %ebx,%edi
+
+ leal 3873151461(%edx,%ebp,1),%edx
+ addl %ebx,%eax
+ xorl %ecx,%edi
+ xorl %eax,%edi
+ movl 60(%esi),%ebp
+ addl %edi,%edx
+ movl %eax,%edi
+ roll $11,%edx
+ addl %eax,%edx
+
+ xorl %ebx,%edi
+ xorl %edx,%edi
+ leal 530742520(%ecx,%ebp,1),%ecx
+ addl %edi,%ecx
+ roll $16,%ecx
+ movl 8(%esi),%ebp
+ movl %edx,%edi
+
+ leal 3299628645(%ebx,%ebp,1),%ebx
+ addl %edx,%ecx
+ xorl %eax,%edi
+ xorl %ecx,%edi
+ movl (%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $23,%ebx
+ addl %ecx,%ebx
+
+
+
+ xorl %edx,%edi
+ orl %ebx,%edi
+ leal 4096336452(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 28(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 1126891415(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 56(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 2878612391(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 20(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 4237533241(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 48(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+
+ orl %ebx,%edi
+ leal 1700485571(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 12(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 2399980690(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 40(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 4293915773(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 4(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 2240044497(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 32(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+
+ orl %ebx,%edi
+ leal 1873313359(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 60(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 4264355552(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 24(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 2734768916(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 52(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 1309151649(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 16(%esi),%ebp
+ addl %edi,%ebx
+ movl $-1,%edi
+ roll $21,%ebx
+ xorl %edx,%edi
+ addl %ecx,%ebx
+
+ orl %ebx,%edi
+ leal 4149444226(%eax,%ebp,1),%eax
+ xorl %ecx,%edi
+ movl 44(%esi),%ebp
+ addl %edi,%eax
+ movl $-1,%edi
+ roll $6,%eax
+ xorl %ecx,%edi
+ addl %ebx,%eax
+
+ orl %eax,%edi
+ leal 3174756917(%edx,%ebp,1),%edx
+ xorl %ebx,%edi
+ movl 8(%esi),%ebp
+ addl %edi,%edx
+ movl $-1,%edi
+ roll $10,%edx
+ xorl %ebx,%edi
+ addl %eax,%edx
+
+ orl %edx,%edi
+ leal 718787259(%ecx,%ebp,1),%ecx
+ xorl %eax,%edi
+ movl 36(%esi),%ebp
+ addl %edi,%ecx
+ movl $-1,%edi
+ roll $15,%ecx
+ xorl %eax,%edi
+ addl %edx,%ecx
+
+ orl %ecx,%edi
+ leal 3951481745(%ebx,%ebp,1),%ebx
+ xorl %edx,%edi
+ movl 24(%esp),%ebp
+ addl %edi,%ebx
+ addl $64,%esi
+ roll $21,%ebx
+ movl (%ebp),%edi
+ addl %ecx,%ebx
+ addl %edi,%eax
+ movl 4(%ebp),%edi
+ addl %edi,%ebx
+ movl 8(%ebp),%edi
+ addl %edi,%ecx
+ movl 12(%ebp),%edi
+ addl %edi,%edx
+ movl %eax,(%ebp)
+ movl %ebx,4(%ebp)
+ movl (%esp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ cmpl %esi,%edi
+ jae .L000start
+ popl %eax
+ popl %ebx
+ popl %ebp
+ popl %edi
+ popl %esi
+ ret
+.size md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S
@@ -1,0 +1,3808 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl sha1_block_data_order
+.hidden sha1_block_data_order
+.type sha1_block_data_order,@function
+.align 16
+sha1_block_data_order:
+.L_sha1_block_data_order_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal OPENSSL_ia32cap_P-.L000pic_point(%ebp),%esi
+ leal .LK_XX_XX-.L000pic_point(%ebp),%ebp
+ movl (%esi),%eax
+ movl 4(%esi),%edx
+ testl $512,%edx
+ jz .L001x86
+ movl 8(%esi),%ecx
+ testl $16777216,%eax
+ jz .L001x86
+ andl $268435456,%edx
+ andl $1073741824,%eax
+ orl %edx,%eax
+ cmpl $1342177280,%eax
+ je .Lavx_shortcut
+ jmp .Lssse3_shortcut
+.align 16
+.L001x86:
+ movl 20(%esp),%ebp
+ movl 24(%esp),%esi
+ movl 28(%esp),%eax
+ subl $76,%esp
+ shll $6,%eax
+ addl %esi,%eax
+ movl %eax,104(%esp)
+ movl 16(%ebp),%edi
+ jmp .L002loop
+.align 16
+.L002loop:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movl %ecx,8(%esp)
+ movl %edx,12(%esp)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,16(%esp)
+ movl %ebx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %edx,28(%esp)
+ movl 32(%esi),%eax
+ movl 36(%esi),%ebx
+ movl 40(%esi),%ecx
+ movl 44(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,40(%esp)
+ movl %edx,44(%esp)
+ movl 48(%esi),%eax
+ movl 52(%esi),%ebx
+ movl 56(%esi),%ecx
+ movl 60(%esi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ movl %eax,48(%esp)
+ movl %ebx,52(%esp)
+ movl %ecx,56(%esp)
+ movl %edx,60(%esp)
+ movl %esi,100(%esp)
+ movl (%ebp),%eax
+ movl 4(%ebp),%ebx
+ movl 8(%ebp),%ecx
+ movl 12(%ebp),%edx
+
+ movl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl (%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 4(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 8(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 12(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ addl %ecx,%ebp
+
+ movl %edi,%ebx
+ movl %ebp,%ecx
+ roll $5,%ebp
+ xorl %esi,%ebx
+ addl %eax,%ebp
+ movl 16(%esp),%eax
+ andl %edx,%ebx
+ rorl $2,%edx
+ xorl %esi,%ebx
+ leal 1518500249(%ebp,%eax,1),%ebp
+ addl %ebx,%ebp
+
+ movl %edx,%eax
+ movl %ebp,%ebx
+ roll $5,%ebp
+ xorl %edi,%eax
+ addl %esi,%ebp
+ movl 20(%esp),%esi
+ andl %ecx,%eax
+ rorl $2,%ecx
+ xorl %edi,%eax
+ leal 1518500249(%ebp,%esi,1),%ebp
+ addl %eax,%ebp
+
+ movl %ecx,%esi
+ movl %ebp,%eax
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl 24(%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 28(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 32(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 36(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ addl %ecx,%ebp
+
+ movl %edi,%ebx
+ movl %ebp,%ecx
+ roll $5,%ebp
+ xorl %esi,%ebx
+ addl %eax,%ebp
+ movl 40(%esp),%eax
+ andl %edx,%ebx
+ rorl $2,%edx
+ xorl %esi,%ebx
+ leal 1518500249(%ebp,%eax,1),%ebp
+ addl %ebx,%ebp
+
+ movl %edx,%eax
+ movl %ebp,%ebx
+ roll $5,%ebp
+ xorl %edi,%eax
+ addl %esi,%ebp
+ movl 44(%esp),%esi
+ andl %ecx,%eax
+ rorl $2,%ecx
+ xorl %edi,%eax
+ leal 1518500249(%ebp,%esi,1),%ebp
+ addl %eax,%ebp
+
+ movl %ecx,%esi
+ movl %ebp,%eax
+ roll $5,%ebp
+ xorl %edx,%esi
+ addl %edi,%ebp
+ movl 48(%esp),%edi
+ andl %ebx,%esi
+ rorl $2,%ebx
+ xorl %edx,%esi
+ leal 1518500249(%ebp,%edi,1),%ebp
+ addl %esi,%ebp
+
+ movl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ xorl %ecx,%edi
+ addl %edx,%ebp
+ movl 52(%esp),%edx
+ andl %eax,%edi
+ rorl $2,%eax
+ xorl %ecx,%edi
+ leal 1518500249(%ebp,%edx,1),%ebp
+ addl %edi,%ebp
+
+ movl %eax,%edx
+ movl %ebp,%edi
+ roll $5,%ebp
+ xorl %ebx,%edx
+ addl %ecx,%ebp
+ movl 56(%esp),%ecx
+ andl %esi,%edx
+ rorl $2,%esi
+ xorl %ebx,%edx
+ leal 1518500249(%ebp,%ecx,1),%ebp
+ addl %edx,%ebp
+
+ movl %esi,%ecx
+ movl %ebp,%edx
+ roll $5,%ebp
+ xorl %eax,%ecx
+ addl %ebx,%ebp
+ movl 60(%esp),%ebx
+ andl %edi,%ecx
+ rorl $2,%edi
+ xorl %eax,%ecx
+ leal 1518500249(%ebp,%ebx,1),%ebp
+ movl (%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 8(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 32(%esp),%ebx
+ andl %edx,%ebp
+ xorl 52(%esp),%ebx
+ roll $1,%ebx
+ xorl %esi,%ebp
+ addl %ebp,%eax
+ movl %ecx,%ebp
+ rorl $2,%edx
+ movl %ebx,(%esp)
+ roll $5,%ebp
+ leal 1518500249(%ebx,%eax,1),%ebx
+ movl 4(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 12(%esp),%eax
+ xorl %edi,%ebp
+ xorl 36(%esp),%eax
+ andl %ecx,%ebp
+ xorl 56(%esp),%eax
+ roll $1,%eax
+ xorl %edi,%ebp
+ addl %ebp,%esi
+ movl %ebx,%ebp
+ rorl $2,%ecx
+ movl %eax,4(%esp)
+ roll $5,%ebp
+ leal 1518500249(%eax,%esi,1),%eax
+ movl 8(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 16(%esp),%esi
+ xorl %edx,%ebp
+ xorl 40(%esp),%esi
+ andl %ebx,%ebp
+ xorl 60(%esp),%esi
+ roll $1,%esi
+ xorl %edx,%ebp
+ addl %ebp,%edi
+ movl %eax,%ebp
+ rorl $2,%ebx
+ movl %esi,8(%esp)
+ roll $5,%ebp
+ leal 1518500249(%esi,%edi,1),%esi
+ movl 12(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 20(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 44(%esp),%edi
+ andl %eax,%ebp
+ xorl (%esp),%edi
+ roll $1,%edi
+ xorl %ecx,%ebp
+ addl %ebp,%edx
+ movl %esi,%ebp
+ rorl $2,%eax
+ movl %edi,12(%esp)
+ roll $5,%ebp
+ leal 1518500249(%edi,%edx,1),%edi
+ movl 16(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 24(%esp),%edx
+ xorl %eax,%ebp
+ xorl 48(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,16(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 20(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 8(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,20(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 24(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 32(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 56(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,24(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 28(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edi,%ebp
+ xorl 16(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,28(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 32(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 40(%esp),%esi
+ xorl %ecx,%ebp
+ xorl (%esp),%esi
+ xorl %edx,%ebp
+ xorl 20(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,32(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 36(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 44(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,36(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl 40(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 48(%esp),%edx
+ xorl %eax,%ebp
+ xorl 8(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,40(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 44(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 32(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,44(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 48(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 56(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 16(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,48(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 52(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edi,%ebp
+ xorl 40(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,52(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 56(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl (%esp),%esi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%esi
+ xorl %edx,%ebp
+ xorl 44(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,56(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 60(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 4(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,60(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl (%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 8(%esp),%edx
+ xorl %eax,%ebp
+ xorl 32(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 52(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 4(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 56(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,4(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 8(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 16(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 40(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 60(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,8(%esp)
+ leal 1859775393(%ebx,%eax,1),%ebx
+ movl 12(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edx,%ebp
+ xorl 44(%esp),%eax
+ xorl %edi,%ebp
+ xorl (%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,12(%esp)
+ leal 1859775393(%eax,%esi,1),%eax
+ movl 16(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 24(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%esi
+ xorl %edx,%ebp
+ xorl 4(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,16(%esp)
+ leal 1859775393(%esi,%edi,1),%esi
+ movl 20(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 28(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 52(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 8(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,20(%esp)
+ leal 1859775393(%edi,%edx,1),%edi
+ movl 24(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 32(%esp),%edx
+ xorl %eax,%ebp
+ xorl 56(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 12(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,24(%esp)
+ leal 1859775393(%edx,%ecx,1),%edx
+ movl 28(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 36(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 60(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 16(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,28(%esp)
+ leal 1859775393(%ecx,%ebx,1),%ecx
+ movl 32(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 40(%esp),%ebx
+ xorl %esi,%ebp
+ xorl (%esp),%ebx
+ andl %edx,%ebp
+ xorl 20(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,32(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 36(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 44(%esp),%eax
+ xorl %edi,%ebp
+ xorl 4(%esp),%eax
+ andl %ecx,%ebp
+ xorl 24(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,36(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 40(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 48(%esp),%esi
+ xorl %edx,%ebp
+ xorl 8(%esp),%esi
+ andl %ebx,%ebp
+ xorl 28(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,40(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 44(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 52(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 12(%esp),%edi
+ andl %eax,%ebp
+ xorl 32(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,44(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 48(%esp),%edx
+ addl %ebp,%edi
+
+ movl %eax,%ebp
+ xorl 56(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 16(%esp),%edx
+ andl %esi,%ebp
+ xorl 36(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,48(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 52(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %esi,%ebp
+ xorl 60(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 20(%esp),%ecx
+ andl %edi,%ebp
+ xorl 40(%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,52(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 56(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl (%esp),%ebx
+ xorl %esi,%ebp
+ xorl 24(%esp),%ebx
+ andl %edx,%ebp
+ xorl 44(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,56(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 60(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 4(%esp),%eax
+ xorl %edi,%ebp
+ xorl 28(%esp),%eax
+ andl %ecx,%ebp
+ xorl 48(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,60(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl (%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 8(%esp),%esi
+ xorl %edx,%ebp
+ xorl 32(%esp),%esi
+ andl %ebx,%ebp
+ xorl 52(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 4(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 12(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 36(%esp),%edi
+ andl %eax,%ebp
+ xorl 56(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,4(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 8(%esp),%edx
+ addl %ebp,%edi
+
+ movl %eax,%ebp
+ xorl 16(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 40(%esp),%edx
+ andl %esi,%ebp
+ xorl 60(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,8(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 12(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %esi,%ebp
+ xorl 20(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 44(%esp),%ecx
+ andl %edi,%ebp
+ xorl (%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,12(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 16(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 24(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 48(%esp),%ebx
+ andl %edx,%ebp
+ xorl 4(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,16(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 20(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 28(%esp),%eax
+ xorl %edi,%ebp
+ xorl 52(%esp),%eax
+ andl %ecx,%ebp
+ xorl 8(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,20(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 24(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ecx,%ebp
+ xorl 32(%esp),%esi
+ xorl %edx,%ebp
+ xorl 56(%esp),%esi
+ andl %ebx,%ebp
+ xorl 12(%esp),%esi
+ roll $1,%esi
+ addl %edi,%ebp
+ rorl $2,%ebx
+ movl %eax,%edi
+ roll $5,%edi
+ movl %esi,24(%esp)
+ leal 2400959708(%esi,%ebp,1),%esi
+ movl %ecx,%ebp
+ addl %edi,%esi
+ andl %edx,%ebp
+ movl 28(%esp),%edi
+ addl %ebp,%esi
+
+ movl %ebx,%ebp
+ xorl 36(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 60(%esp),%edi
+ andl %eax,%ebp
+ xorl 16(%esp),%edi
+ roll $1,%edi
+ addl %edx,%ebp
+ rorl $2,%eax
+ movl %esi,%edx
+ roll $5,%edx
+ movl %edi,28(%esp)
+ leal 2400959708(%edi,%ebp,1),%edi
+ movl %ebx,%ebp
+ addl %edx,%edi
+ andl %ecx,%ebp
+ movl 32(%esp),%edx
+ addl %ebp,%edi
+
+ movl %eax,%ebp
+ xorl 40(%esp),%edx
+ xorl %ebx,%ebp
+ xorl (%esp),%edx
+ andl %esi,%ebp
+ xorl 20(%esp),%edx
+ roll $1,%edx
+ addl %ecx,%ebp
+ rorl $2,%esi
+ movl %edi,%ecx
+ roll $5,%ecx
+ movl %edx,32(%esp)
+ leal 2400959708(%edx,%ebp,1),%edx
+ movl %eax,%ebp
+ addl %ecx,%edx
+ andl %ebx,%ebp
+ movl 36(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %esi,%ebp
+ xorl 44(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 4(%esp),%ecx
+ andl %edi,%ebp
+ xorl 24(%esp),%ecx
+ roll $1,%ecx
+ addl %ebx,%ebp
+ rorl $2,%edi
+ movl %edx,%ebx
+ roll $5,%ebx
+ movl %ecx,36(%esp)
+ leal 2400959708(%ecx,%ebp,1),%ecx
+ movl %esi,%ebp
+ addl %ebx,%ecx
+ andl %eax,%ebp
+ movl 40(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edi,%ebp
+ xorl 48(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 8(%esp),%ebx
+ andl %edx,%ebp
+ xorl 28(%esp),%ebx
+ roll $1,%ebx
+ addl %eax,%ebp
+ rorl $2,%edx
+ movl %ecx,%eax
+ roll $5,%eax
+ movl %ebx,40(%esp)
+ leal 2400959708(%ebx,%ebp,1),%ebx
+ movl %edi,%ebp
+ addl %eax,%ebx
+ andl %esi,%ebp
+ movl 44(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %edx,%ebp
+ xorl 52(%esp),%eax
+ xorl %edi,%ebp
+ xorl 12(%esp),%eax
+ andl %ecx,%ebp
+ xorl 32(%esp),%eax
+ roll $1,%eax
+ addl %esi,%ebp
+ rorl $2,%ecx
+ movl %ebx,%esi
+ roll $5,%esi
+ movl %eax,44(%esp)
+ leal 2400959708(%eax,%ebp,1),%eax
+ movl %edx,%ebp
+ addl %esi,%eax
+ andl %edi,%ebp
+ movl 48(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 56(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 16(%esp),%esi
+ xorl %edx,%ebp
+ xorl 36(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,48(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 52(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 60(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 20(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 40(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,52(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 56(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl (%esp),%edx
+ xorl %eax,%ebp
+ xorl 24(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 44(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,56(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 60(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 4(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 48(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,60(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl (%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 8(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 32(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 4(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 12(%esp),%eax
+ xorl %edx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edi,%ebp
+ xorl 56(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,4(%esp)
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 8(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 16(%esp),%esi
+ xorl %ecx,%ebp
+ xorl 40(%esp),%esi
+ xorl %edx,%ebp
+ xorl 60(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,8(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 12(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 20(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 44(%esp),%edi
+ xorl %ecx,%ebp
+ xorl (%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,12(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 16(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 24(%esp),%edx
+ xorl %eax,%ebp
+ xorl 48(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,16(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 20(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 28(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 8(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,20(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl 24(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 32(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 56(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,24(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 28(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 36(%esp),%eax
+ xorl %edx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edi,%ebp
+ xorl 16(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ movl %eax,28(%esp)
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 32(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl 40(%esp),%esi
+ xorl %ecx,%ebp
+ xorl (%esp),%esi
+ xorl %edx,%ebp
+ xorl 20(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ movl %esi,32(%esp)
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 36(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 44(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 4(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ movl %edi,36(%esp)
+ leal 3395469782(%edi,%edx,1),%edi
+ movl 40(%esp),%edx
+ addl %ebp,%edi
+
+ movl %esi,%ebp
+ xorl 48(%esp),%edx
+ xorl %eax,%ebp
+ xorl 8(%esp),%edx
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edx
+ roll $1,%edx
+ addl %ebp,%ecx
+ rorl $2,%esi
+ movl %edi,%ebp
+ roll $5,%ebp
+ movl %edx,40(%esp)
+ leal 3395469782(%edx,%ecx,1),%edx
+ movl 44(%esp),%ecx
+ addl %ebp,%edx
+
+ movl %edi,%ebp
+ xorl 52(%esp),%ecx
+ xorl %esi,%ebp
+ xorl 12(%esp),%ecx
+ xorl %eax,%ebp
+ xorl 32(%esp),%ecx
+ roll $1,%ecx
+ addl %ebp,%ebx
+ rorl $2,%edi
+ movl %edx,%ebp
+ roll $5,%ebp
+ movl %ecx,44(%esp)
+ leal 3395469782(%ecx,%ebx,1),%ecx
+ movl 48(%esp),%ebx
+ addl %ebp,%ecx
+
+ movl %edx,%ebp
+ xorl 56(%esp),%ebx
+ xorl %edi,%ebp
+ xorl 16(%esp),%ebx
+ xorl %esi,%ebp
+ xorl 36(%esp),%ebx
+ roll $1,%ebx
+ addl %ebp,%eax
+ rorl $2,%edx
+ movl %ecx,%ebp
+ roll $5,%ebp
+ movl %ebx,48(%esp)
+ leal 3395469782(%ebx,%eax,1),%ebx
+ movl 52(%esp),%eax
+ addl %ebp,%ebx
+
+ movl %ecx,%ebp
+ xorl 60(%esp),%eax
+ xorl %edx,%ebp
+ xorl 20(%esp),%eax
+ xorl %edi,%ebp
+ xorl 40(%esp),%eax
+ roll $1,%eax
+ addl %ebp,%esi
+ rorl $2,%ecx
+ movl %ebx,%ebp
+ roll $5,%ebp
+ leal 3395469782(%eax,%esi,1),%eax
+ movl 56(%esp),%esi
+ addl %ebp,%eax
+
+ movl %ebx,%ebp
+ xorl (%esp),%esi
+ xorl %ecx,%ebp
+ xorl 24(%esp),%esi
+ xorl %edx,%ebp
+ xorl 44(%esp),%esi
+ roll $1,%esi
+ addl %ebp,%edi
+ rorl $2,%ebx
+ movl %eax,%ebp
+ roll $5,%ebp
+ leal 3395469782(%esi,%edi,1),%esi
+ movl 60(%esp),%edi
+ addl %ebp,%esi
+
+ movl %eax,%ebp
+ xorl 4(%esp),%edi
+ xorl %ebx,%ebp
+ xorl 28(%esp),%edi
+ xorl %ecx,%ebp
+ xorl 48(%esp),%edi
+ roll $1,%edi
+ addl %ebp,%edx
+ rorl $2,%eax
+ movl %esi,%ebp
+ roll $5,%ebp
+ leal 3395469782(%edi,%edx,1),%edi
+ addl %ebp,%edi
+ movl 96(%esp),%ebp
+ movl 100(%esp),%edx
+ addl (%ebp),%edi
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%eax
+ addl 12(%ebp),%ebx
+ addl 16(%ebp),%ecx
+ movl %edi,(%ebp)
+ addl $64,%edx
+ movl %esi,4(%ebp)
+ cmpl 104(%esp),%edx
+ movl %eax,8(%ebp)
+ movl %ecx,%edi
+ movl %ebx,12(%ebp)
+ movl %edx,%esi
+ movl %ecx,16(%ebp)
+ jb .L002loop
+ addl $76,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size sha1_block_data_order,.-.L_sha1_block_data_order_begin
+.hidden _sha1_block_data_order_ssse3
+.type _sha1_block_data_order_ssse3,@function
+.align 16
+_sha1_block_data_order_ssse3:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L003pic_point
+.L003pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L003pic_point(%ebp),%ebp
+.Lssse3_shortcut:
+ movdqa (%ebp),%xmm7
+ movdqa 16(%ebp),%xmm0
+ movdqa 32(%ebp),%xmm1
+ movdqa 48(%ebp),%xmm2
+ movdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ movdqa %xmm0,112(%esp)
+ movdqa %xmm1,128(%esp)
+ movdqa %xmm2,144(%esp)
+ shll $6,%edx
+ movdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ movdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ movdqu -64(%ebp),%xmm0
+ movdqu -48(%ebp),%xmm1
+ movdqu -32(%ebp),%xmm2
+ movdqu -16(%ebp),%xmm3
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ movdqa %xmm7,96(%esp)
+.byte 102,15,56,0,222
+ paddd %xmm7,%xmm0
+ paddd %xmm7,%xmm1
+ paddd %xmm7,%xmm2
+ movdqa %xmm0,(%esp)
+ psubd %xmm7,%xmm0
+ movdqa %xmm1,16(%esp)
+ psubd %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ xorl %edx,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebp,%esi
+ jmp .L004loop
+.align 16
+.L004loop:
+ rorl $2,%ebx
+ xorl %edx,%esi
+ movl %eax,%ebp
+ punpcklqdq %xmm1,%xmm4
+ movdqa %xmm3,%xmm6
+ addl (%esp),%edi
+ xorl %ecx,%ebx
+ paddd %xmm3,%xmm7
+ movdqa %xmm0,64(%esp)
+ roll $5,%eax
+ addl %esi,%edi
+ psrldq $4,%xmm6
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ pxor %xmm0,%xmm4
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm2,%xmm6
+ xorl %ecx,%ebp
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ pxor %xmm6,%xmm4
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%edx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ movdqa %xmm4,%xmm6
+ xorl %ebx,%esi
+ pslldq $12,%xmm0
+ paddd %xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ psrld $31,%xmm6
+ xorl %eax,%edi
+ roll $5,%edx
+ movdqa %xmm0,%xmm7
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ psrld $30,%xmm0
+ addl %edx,%ecx
+ rorl $7,%edx
+ por %xmm6,%xmm4
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ pslld $2,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ pxor %xmm0,%xmm4
+ movdqa 96(%esp),%xmm0
+ addl %ebp,%ebx
+ andl %edx,%esi
+ pxor %xmm7,%xmm4
+ pshufd $238,%xmm1,%xmm5
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ movl %ebx,%ebp
+ punpcklqdq %xmm2,%xmm5
+ movdqa %xmm4,%xmm7
+ addl 16(%esp),%eax
+ xorl %edx,%ecx
+ paddd %xmm4,%xmm0
+ movdqa %xmm1,80(%esp)
+ roll $5,%ebx
+ addl %esi,%eax
+ psrldq $4,%xmm7
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ pxor %xmm1,%xmm5
+ addl %ebx,%eax
+ rorl $7,%ebx
+ pxor %xmm3,%xmm7
+ xorl %edx,%ebp
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ pxor %xmm7,%xmm5
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm0,(%esp)
+ addl %ebp,%edi
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ movdqa %xmm5,%xmm7
+ xorl %ecx,%esi
+ pslldq $12,%xmm1
+ paddd %xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ psrld $31,%xmm7
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm1,%xmm0
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ psrld $30,%xmm1
+ addl %edi,%edx
+ rorl $7,%edi
+ por %xmm7,%xmm5
+ xorl %ebx,%ebp
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ pslld $2,%xmm0
+ xorl %eax,%edi
+ roll $5,%edx
+ pxor %xmm1,%xmm5
+ movdqa 112(%esp),%xmm1
+ addl %ebp,%ecx
+ andl %edi,%esi
+ pxor %xmm0,%xmm5
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ punpcklqdq %xmm3,%xmm6
+ movdqa %xmm5,%xmm0
+ addl 32(%esp),%ebx
+ xorl %edi,%edx
+ paddd %xmm5,%xmm1
+ movdqa %xmm2,96(%esp)
+ roll $5,%ecx
+ addl %esi,%ebx
+ psrldq $4,%xmm0
+ andl %edx,%ebp
+ xorl %edi,%edx
+ pxor %xmm2,%xmm6
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pxor %xmm4,%xmm0
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ pxor %xmm0,%xmm6
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm1,16(%esp)
+ addl %ebp,%eax
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm0
+ xorl %edx,%esi
+ pslldq $12,%xmm2
+ paddd %xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ psrld $31,%xmm0
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm2,%xmm1
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ psrld $30,%xmm2
+ addl %eax,%edi
+ rorl $7,%eax
+ por %xmm0,%xmm6
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm0
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ pslld $2,%xmm1
+ xorl %ebx,%eax
+ roll $5,%edi
+ pxor %xmm2,%xmm6
+ movdqa 112(%esp),%xmm2
+ addl %ebp,%edx
+ andl %eax,%esi
+ pxor %xmm1,%xmm6
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%esi
+ movl %edx,%ebp
+ punpcklqdq %xmm4,%xmm7
+ movdqa %xmm6,%xmm1
+ addl 48(%esp),%ecx
+ xorl %eax,%edi
+ paddd %xmm6,%xmm2
+ movdqa %xmm3,64(%esp)
+ roll $5,%edx
+ addl %esi,%ecx
+ psrldq $4,%xmm1
+ andl %edi,%ebp
+ xorl %eax,%edi
+ pxor %xmm3,%xmm7
+ addl %edx,%ecx
+ rorl $7,%edx
+ pxor %xmm5,%xmm1
+ xorl %eax,%ebp
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ pxor %xmm1,%xmm7
+ xorl %edi,%edx
+ roll $5,%ecx
+ movdqa %xmm2,32(%esp)
+ addl %ebp,%ebx
+ andl %edx,%esi
+ movdqa %xmm7,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm1
+ xorl %edi,%esi
+ pslldq $12,%xmm3
+ paddd %xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ psrld $31,%xmm1
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm3,%xmm2
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ psrld $30,%xmm3
+ addl %ebx,%eax
+ rorl $7,%ebx
+ por %xmm1,%xmm7
+ xorl %edx,%ebp
+ movdqa 80(%esp),%xmm1
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ pslld $2,%xmm2
+ xorl %ecx,%ebx
+ roll $5,%eax
+ pxor %xmm3,%xmm7
+ movdqa 112(%esp),%xmm3
+ addl %ebp,%edi
+ andl %ebx,%esi
+ pxor %xmm2,%xmm7
+ pshufd $238,%xmm6,%xmm2
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ roll $5,%edi
+ movdqa %xmm3,%xmm4
+ addl %esi,%edx
+ paddd %xmm7,%xmm3
+ andl %eax,%ebp
+ pxor %xmm2,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ rorl $7,%edi
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ roll $5,%edx
+ pslld $2,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ psrld $30,%xmm2
+ xorl %eax,%edi
+ addl %edx,%ecx
+ rorl $7,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ xorl %edi,%edx
+ roll $5,%ecx
+ por %xmm2,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ movdqa 96(%esp),%xmm2
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ pshufd $238,%xmm7,%xmm3
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 16(%esp),%edi
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm4,%xmm5
+ rorl $7,%ebx
+ paddd %xmm0,%xmm4
+ addl %eax,%edi
+ pxor %xmm3,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm3
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm3,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ movdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ pshufd $238,%xmm0,%xmm4
+ addl %ecx,%ebx
+ addl 32(%esp),%eax
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ movdqa 128(%esp),%xmm6
+ rorl $7,%ecx
+ paddd %xmm1,%xmm5
+ addl %ebx,%eax
+ pxor %xmm4,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ pslld $2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ psrld $30,%xmm4
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ por %xmm4,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ movdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ pshufd $238,%xmm1,%xmm5
+ addl %edx,%ecx
+ addl 48(%esp),%ebx
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%edx
+ paddd %xmm2,%xmm6
+ addl %ecx,%ebx
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pslld $2,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ psrld $30,%xmm5
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ por %xmm5,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ movdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ pshufd $238,%xmm2,%xmm6
+ addl %edi,%edx
+ addl (%esp),%ecx
+ pxor %xmm0,%xmm4
+ punpcklqdq %xmm3,%xmm6
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ movdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ movdqa %xmm7,%xmm0
+ rorl $7,%edi
+ paddd %xmm3,%xmm7
+ addl %edx,%ecx
+ pxor %xmm6,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm6
+ movdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pslld $2,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ psrld $30,%xmm6
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ por %xmm6,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ movdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ pshufd $238,%xmm3,%xmm7
+ addl %eax,%edi
+ addl 16(%esp),%edx
+ pxor %xmm1,%xmm5
+ punpcklqdq %xmm4,%xmm7
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ pxor %xmm6,%xmm5
+ movdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ movdqa %xmm0,%xmm1
+ rorl $7,%eax
+ paddd %xmm4,%xmm0
+ addl %edi,%edx
+ pxor %xmm7,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm7
+ movdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ pslld $2,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ psrld $30,%xmm7
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ por %xmm7,%xmm5
+ addl 28(%esp),%eax
+ movdqa 80(%esp),%xmm7
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pshufd $238,%xmm4,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 32(%esp),%edi
+ pxor %xmm2,%xmm6
+ punpcklqdq %xmm5,%xmm0
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ pxor %xmm7,%xmm6
+ movdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ roll $5,%eax
+ movdqa %xmm1,%xmm2
+ addl %esi,%edi
+ paddd %xmm5,%xmm1
+ xorl %ebx,%ebp
+ pxor %xmm0,%xmm6
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ andl %ebx,%ebp
+ movdqa %xmm6,%xmm0
+ movdqa %xmm1,16(%esp)
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ pslld $2,%xmm6
+ addl %ebp,%edx
+ xorl %eax,%esi
+ psrld $30,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ por %xmm0,%xmm6
+ movl %edx,%ebp
+ xorl %eax,%esi
+ movdqa 96(%esp),%xmm0
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ pshufd $238,%xmm5,%xmm1
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 48(%esp),%eax
+ pxor %xmm3,%xmm7
+ punpcklqdq %xmm6,%xmm1
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ pxor %xmm0,%xmm7
+ movdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ roll $5,%ebx
+ movdqa 144(%esp),%xmm3
+ addl %esi,%eax
+ paddd %xmm6,%xmm2
+ xorl %ecx,%ebp
+ pxor %xmm1,%xmm7
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ andl %ecx,%ebp
+ movdqa %xmm7,%xmm1
+ movdqa %xmm2,32(%esp)
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ pslld $2,%xmm7
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ psrld $30,%xmm1
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ por %xmm1,%xmm7
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ movdqa 64(%esp),%xmm1
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ pshufd $238,%xmm6,%xmm2
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl (%esp),%ebx
+ pxor %xmm4,%xmm0
+ punpcklqdq %xmm7,%xmm2
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ pxor %xmm1,%xmm0
+ movdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ roll $5,%ecx
+ movdqa %xmm3,%xmm4
+ addl %esi,%ebx
+ paddd %xmm7,%xmm3
+ xorl %edx,%ebp
+ pxor %xmm2,%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ andl %edx,%ebp
+ movdqa %xmm0,%xmm2
+ movdqa %xmm3,48(%esp)
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ pslld $2,%xmm0
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ psrld $30,%xmm2
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ por %xmm2,%xmm0
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ movdqa 80(%esp),%xmm2
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ pshufd $238,%xmm7,%xmm3
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 16(%esp),%ecx
+ pxor %xmm5,%xmm1
+ punpcklqdq %xmm0,%xmm3
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%edi
+ pxor %xmm2,%xmm1
+ movdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ roll $5,%edx
+ movdqa %xmm4,%xmm5
+ addl %esi,%ecx
+ paddd %xmm0,%xmm4
+ xorl %edi,%ebp
+ pxor %xmm3,%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ andl %edi,%ebp
+ movdqa %xmm1,%xmm3
+ movdqa %xmm4,(%esp)
+ xorl %eax,%edi
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ roll $5,%ecx
+ pslld $2,%xmm1
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ psrld $30,%xmm3
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ rorl $7,%ecx
+ por %xmm3,%xmm1
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ movdqa 96(%esp),%xmm3
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ pshufd $238,%xmm0,%xmm4
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 32(%esp),%edx
+ pxor %xmm6,%xmm2
+ punpcklqdq %xmm1,%xmm4
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ pxor %xmm3,%xmm2
+ movdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ roll $5,%edi
+ movdqa %xmm5,%xmm6
+ addl %esi,%edx
+ paddd %xmm1,%xmm5
+ xorl %eax,%ebp
+ pxor %xmm4,%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ andl %eax,%ebp
+ movdqa %xmm2,%xmm4
+ movdqa %xmm5,16(%esp)
+ xorl %ebx,%eax
+ rorl $7,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ roll $5,%edx
+ pslld $2,%xmm2
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ psrld $30,%xmm4
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ rorl $7,%edx
+ por %xmm4,%xmm2
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ movdqa 64(%esp),%xmm4
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ pshufd $238,%xmm1,%xmm5
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ addl 48(%esp),%edi
+ pxor %xmm7,%xmm3
+ punpcklqdq %xmm2,%xmm5
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ movdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ movdqa %xmm6,%xmm7
+ rorl $7,%ebx
+ paddd %xmm2,%xmm6
+ addl %eax,%edi
+ pxor %xmm5,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ movdqa %xmm3,%xmm5
+ movdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ pslld $2,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ psrld $30,%xmm5
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ por %xmm5,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ paddd %xmm3,%xmm7
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ movdqa %xmm7,48(%esp)
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je .L005done
+ movdqa 160(%esp),%xmm7
+ movdqa 176(%esp),%xmm6
+ movdqu (%ebp),%xmm0
+ movdqu 16(%ebp),%xmm1
+ movdqu 32(%ebp),%xmm2
+ movdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+.byte 102,15,56,0,198
+ movl %ebp,196(%esp)
+ movdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+.byte 102,15,56,0,206
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ paddd %xmm7,%xmm0
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ movdqa %xmm0,(%esp)
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ psubd %xmm7,%xmm0
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+.byte 102,15,56,0,214
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ paddd %xmm7,%xmm1
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ movdqa %xmm1,16(%esp)
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ psubd %xmm7,%xmm1
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+.byte 102,15,56,0,222
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ paddd %xmm7,%xmm2
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ movdqa %xmm2,32(%esp)
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ psubd %xmm7,%xmm2
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %ecx,%ebx
+ movl %edx,12(%ebp)
+ xorl %edx,%ebx
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ pshufd $238,%xmm0,%xmm4
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp .L004loop
+.align 16
+.L005done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ roll $5,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ roll $5,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ roll $5,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ roll $5,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ rorl $7,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ roll $5,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ rorl $7,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %ebp,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3
+.hidden _sha1_block_data_order_avx
+.type _sha1_block_data_order_avx,@function
+.align 16
+_sha1_block_data_order_avx:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ call .L006pic_point
+.L006pic_point:
+ popl %ebp
+ leal .LK_XX_XX-.L006pic_point(%ebp),%ebp
+.Lavx_shortcut:
+ vzeroall
+ vmovdqa (%ebp),%xmm7
+ vmovdqa 16(%ebp),%xmm0
+ vmovdqa 32(%ebp),%xmm1
+ vmovdqa 48(%ebp),%xmm2
+ vmovdqa 64(%ebp),%xmm6
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebp
+ movl 28(%esp),%edx
+ movl %esp,%esi
+ subl $208,%esp
+ andl $-64,%esp
+ vmovdqa %xmm0,112(%esp)
+ vmovdqa %xmm1,128(%esp)
+ vmovdqa %xmm2,144(%esp)
+ shll $6,%edx
+ vmovdqa %xmm7,160(%esp)
+ addl %ebp,%edx
+ vmovdqa %xmm6,176(%esp)
+ addl $64,%ebp
+ movl %edi,192(%esp)
+ movl %ebp,196(%esp)
+ movl %edx,200(%esp)
+ movl %esi,204(%esp)
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ movl 16(%edi),%edi
+ movl %ebx,%esi
+ vmovdqu -64(%ebp),%xmm0
+ vmovdqu -48(%ebp),%xmm1
+ vmovdqu -32(%ebp),%xmm2
+ vmovdqu -16(%ebp),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm7,96(%esp)
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm7,%xmm0,%xmm4
+ vpaddd %xmm7,%xmm1,%xmm5
+ vpaddd %xmm7,%xmm2,%xmm6
+ vmovdqa %xmm4,(%esp)
+ movl %ecx,%ebp
+ vmovdqa %xmm5,16(%esp)
+ xorl %edx,%ebp
+ vmovdqa %xmm6,32(%esp)
+ andl %ebp,%esi
+ jmp .L007loop
+.align 16
+.L007loop:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%ebp
+ addl (%esp),%edi
+ vpaddd %xmm3,%xmm7,%xmm7
+ vmovdqa %xmm0,64(%esp)
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vmovdqa %xmm7,48(%esp)
+ movl %edi,%esi
+ addl 4(%esp),%edx
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm6
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm0
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%ebp
+ addl 8(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm0,%xmm7
+ vpor %xmm6,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm0,%xmm0
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vpxor %xmm7,%xmm4,%xmm4
+ movl %ecx,%esi
+ addl 12(%esp),%ebx
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpxor %xmm0,%xmm4,%xmm4
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vmovdqa 96(%esp),%xmm0
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%ebp
+ addl 16(%esp),%eax
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqa %xmm1,80(%esp)
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vmovdqa %xmm0,(%esp)
+ movl %eax,%esi
+ addl 20(%esp),%edi
+ vpxor %xmm7,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm1
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %edi,%ebp
+ addl 24(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm0
+ vpor %xmm7,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpxor %xmm0,%xmm5,%xmm5
+ movl %edx,%esi
+ addl 28(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm1,%xmm5,%xmm5
+ addl %ebp,%ecx
+ andl %edi,%esi
+ vmovdqa 112(%esp),%xmm1
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%ebp
+ addl 32(%esp),%ebx
+ vpaddd %xmm5,%xmm1,%xmm1
+ vmovdqa %xmm2,96(%esp)
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm0
+ addl %esi,%ebx
+ andl %edx,%ebp
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%ebp
+ vmovdqa %xmm1,16(%esp)
+ movl %ebx,%esi
+ addl 36(%esp),%eax
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm2
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%ebp
+ addl 40(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm1
+ vpor %xmm0,%xmm6,%xmm6
+ addl %esi,%edi
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ vmovdqa 64(%esp),%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%ebp
+ vpxor %xmm1,%xmm6,%xmm6
+ movl %edi,%esi
+ addl 44(%esp),%edx
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ addl %ebp,%edx
+ andl %eax,%esi
+ vmovdqa 112(%esp),%xmm2
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%ebp
+ addl 48(%esp),%ecx
+ vpaddd %xmm6,%xmm2,%xmm2
+ vmovdqa %xmm3,64(%esp)
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm1
+ addl %esi,%ecx
+ andl %edi,%ebp
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm1,%xmm1
+ shrdl $7,%edx,%edx
+ xorl %eax,%ebp
+ vmovdqa %xmm2,32(%esp)
+ movl %ecx,%esi
+ addl 52(%esp),%ebx
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm1
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %edi,%esi
+ vpslldq $12,%xmm7,%xmm3
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%ebp
+ addl 56(%esp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm2
+ vpor %xmm1,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ vmovdqa 80(%esp),%xmm1
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%ebp
+ vpxor %xmm2,%xmm7,%xmm7
+ movl %eax,%esi
+ addl 60(%esp),%edi
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpxor %xmm3,%xmm7,%xmm7
+ addl %ebp,%edi
+ andl %ebx,%esi
+ vmovdqa 112(%esp),%xmm3
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %edi,%ebp
+ addl (%esp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,80(%esp)
+ xorl %ebx,%eax
+ shldl $5,%edi,%edi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ addl %esi,%edx
+ andl %eax,%ebp
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %edi,%edx
+ shrdl $7,%edi,%edi
+ xorl %ebx,%ebp
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ movl %edx,%esi
+ addl 4(%esp),%ecx
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %ebp,%ecx
+ andl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%ebp
+ addl 8(%esp),%ebx
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edi,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm2
+ addl %esi,%ebx
+ andl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 12(%esp),%eax
+ xorl %edi,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,96(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm3,%xmm1,%xmm1
+ addl 20(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm3,%xmm1,%xmm1
+ addl 28(%esp),%ebx
+ xorl %edi,%ebp
+ vmovdqa 64(%esp),%xmm3
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,64(%esp)
+ addl %esi,%eax
+ xorl %edx,%ebp
+ vmovdqa 128(%esp),%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm4,%xmm2,%xmm2
+ addl 36(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpor %xmm4,%xmm2,%xmm2
+ addl 44(%esp),%ecx
+ xorl %eax,%ebp
+ vmovdqa 80(%esp),%xmm4
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,80(%esp)
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%edx
+ xorl %ebx,%ebp
+ vmovdqa 96(%esp),%xmm5
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ vpxor %xmm0,%xmm4,%xmm4
+ addl (%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ vmovdqa %xmm0,96(%esp)
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ vmovdqa %xmm7,%xmm0
+ vpaddd %xmm3,%xmm7,%xmm7
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpxor %xmm6,%xmm4,%xmm4
+ addl 4(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm6
+ vmovdqa %xmm7,48(%esp)
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm6,%xmm4,%xmm4
+ addl 12(%esp),%edi
+ xorl %ecx,%ebp
+ vmovdqa 64(%esp),%xmm6
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpxor %xmm6,%xmm5,%xmm5
+ vmovdqa %xmm1,64(%esp)
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ vmovdqa %xmm0,%xmm1
+ vpaddd %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpxor %xmm7,%xmm5,%xmm5
+ addl 20(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm7
+ vmovdqa %xmm0,(%esp)
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm7,%xmm5,%xmm5
+ addl 28(%esp),%eax
+ vmovdqa 80(%esp),%xmm7
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%esp),%edi
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ vmovdqa %xmm2,80(%esp)
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ vmovdqa %xmm1,%xmm2
+ vpaddd %xmm5,%xmm1,%xmm1
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ vpxor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 36(%esp),%edx
+ vpsrld $30,%xmm6,%xmm0
+ vmovdqa %xmm1,16(%esp)
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 40(%esp),%ecx
+ andl %eax,%esi
+ vpor %xmm0,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vmovdqa 96(%esp),%xmm0
+ movl %edx,%ebp
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 44(%esp),%ebx
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm1
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%esp),%eax
+ andl %edx,%esi
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ vmovdqa %xmm3,96(%esp)
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ vmovdqa 144(%esp),%xmm3
+ vpaddd %xmm6,%xmm2,%xmm2
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%esp),%edi
+ vpsrld $30,%xmm7,%xmm1
+ vmovdqa %xmm2,32(%esp)
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 56(%esp),%edx
+ andl %ebx,%esi
+ vpor %xmm1,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vmovdqa 64(%esp),%xmm1
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 60(%esp),%ecx
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm2
+ vpxor %xmm4,%xmm0,%xmm0
+ addl (%esp),%ebx
+ andl %edi,%esi
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ vmovdqa %xmm4,64(%esp)
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ vmovdqa %xmm3,%xmm4
+ vpaddd %xmm7,%xmm3,%xmm3
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 4(%esp),%eax
+ vpsrld $30,%xmm0,%xmm2
+ vmovdqa %xmm3,48(%esp)
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%esp),%edi
+ andl %ecx,%esi
+ vpor %xmm2,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vmovdqa 80(%esp),%xmm2
+ movl %eax,%ebp
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ebx,%ebp
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ addl 12(%esp),%edx
+ andl %ebx,%ebp
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %edi,%esi
+ xorl %ebx,%ebp
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %edi,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm3
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%esp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm5,80(%esp)
+ movl %edx,%ebp
+ xorl %eax,%esi
+ vmovdqa %xmm4,%xmm5
+ vpaddd %xmm0,%xmm4,%xmm4
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm3,%xmm1,%xmm1
+ xorl %edi,%ebp
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 20(%esp),%ebx
+ vpsrld $30,%xmm1,%xmm3
+ vmovdqa %xmm4,(%esp)
+ andl %edi,%ebp
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %edi,%ebp
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edx,%esi
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 24(%esp),%eax
+ andl %edx,%esi
+ vpor %xmm3,%xmm1,%xmm1
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ vmovdqa 96(%esp),%xmm3
+ movl %ebx,%ebp
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%ebp
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%esp),%edi
+ andl %ecx,%ebp
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%ebp
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%edi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%esp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ vmovdqa %xmm6,96(%esp)
+ movl %edi,%ebp
+ xorl %ebx,%esi
+ vmovdqa %xmm5,%xmm6
+ vpaddd %xmm1,%xmm5,%xmm5
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ vpxor %xmm4,%xmm2,%xmm2
+ xorl %eax,%ebp
+ xorl %ebx,%eax
+ addl %edi,%edx
+ addl 36(%esp),%ecx
+ vpsrld $30,%xmm2,%xmm4
+ vmovdqa %xmm5,16(%esp)
+ andl %eax,%ebp
+ xorl %ebx,%eax
+ shrdl $7,%edi,%edi
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %edi,%esi
+ xorl %eax,%edi
+ addl %edx,%ecx
+ addl 40(%esp),%ebx
+ andl %edi,%esi
+ vpor %xmm4,%xmm2,%xmm2
+ xorl %eax,%edi
+ shrdl $7,%edx,%edx
+ vmovdqa 64(%esp),%xmm4
+ movl %ecx,%ebp
+ xorl %edi,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%ebp
+ xorl %edi,%edx
+ addl %ecx,%ebx
+ addl 44(%esp),%eax
+ andl %edx,%ebp
+ xorl %edi,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ vmovdqa %xmm7,64(%esp)
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ vmovdqa %xmm6,%xmm7
+ vpaddd %xmm2,%xmm6,%xmm6
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ vpxor %xmm5,%xmm3,%xmm3
+ addl 52(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ vpsrld $30,%xmm3,%xmm5
+ vmovdqa %xmm6,32(%esp)
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vpor %xmm5,%xmm3,%xmm3
+ addl 60(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl (%esp),%eax
+ vpaddd %xmm3,%xmm7,%xmm7
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm7,48(%esp)
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 8(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 12(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ movl 196(%esp),%ebp
+ cmpl 200(%esp),%ebp
+ je .L008done
+ vmovdqa 160(%esp),%xmm7
+ vmovdqa 176(%esp),%xmm6
+ vmovdqu (%ebp),%xmm0
+ vmovdqu 16(%ebp),%xmm1
+ vmovdqu 32(%ebp),%xmm2
+ vmovdqu 48(%ebp),%xmm3
+ addl $64,%ebp
+ vpshufb %xmm6,%xmm0,%xmm0
+ movl %ebp,196(%esp)
+ vmovdqa %xmm7,96(%esp)
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm7,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,(%esp)
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ vpaddd %xmm7,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%esp)
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ vpaddd %xmm7,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ vmovdqa %xmm6,32(%esp)
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,%ebx
+ movl %ecx,8(%ebp)
+ xorl %edx,%ebx
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ movl %esi,%ebp
+ andl %ebx,%esi
+ movl %ebp,%ebx
+ jmp .L007loop
+.align 16
+.L008done:
+ addl 16(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%esp),%edi
+ xorl %ecx,%esi
+ movl %eax,%ebp
+ shldl $5,%eax,%eax
+ addl %esi,%edi
+ xorl %ecx,%ebp
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 28(%esp),%edx
+ xorl %ebx,%ebp
+ movl %edi,%esi
+ shldl $5,%edi,%edi
+ addl %ebp,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 32(%esp),%ecx
+ xorl %eax,%esi
+ movl %edx,%ebp
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%ebp
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 36(%esp),%ebx
+ xorl %edi,%ebp
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %ebp,%ebx
+ xorl %edi,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%esp),%eax
+ xorl %edx,%esi
+ movl %ebx,%ebp
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%ebp
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%esp),%edi
+ xorl %ecx,%ebp
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %ebp,%edi
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%edi
+ addl 48(%esp),%edx
+ xorl %ebx,%esi
+ movl %edi,%ebp
+ shldl $5,%edi,%edi
+ addl %esi,%edx
+ xorl %ebx,%ebp
+ shrdl $7,%eax,%eax
+ addl %edi,%edx
+ addl 52(%esp),%ecx
+ xorl %eax,%ebp
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %ebp,%ecx
+ xorl %eax,%esi
+ shrdl $7,%edi,%edi
+ addl %edx,%ecx
+ addl 56(%esp),%ebx
+ xorl %edi,%esi
+ movl %ecx,%ebp
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edi,%ebp
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%esp),%eax
+ xorl %edx,%ebp
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %ebp,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroall
+ movl 192(%esp),%ebp
+ addl (%ebp),%eax
+ movl 204(%esp),%esp
+ addl 4(%ebp),%esi
+ addl 8(%ebp),%ecx
+ movl %eax,(%ebp)
+ addl 12(%ebp),%edx
+ movl %esi,4(%ebp)
+ addl 16(%ebp),%edi
+ movl %ecx,8(%ebp)
+ movl %edx,12(%ebp)
+ movl %edi,16(%ebp)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx
+.align 64
+.LK_XX_XX:
+.long 1518500249,1518500249,1518500249,1518500249
+.long 1859775393,1859775393,1859775393,1859775393
+.long 2400959708,2400959708,2400959708,2400959708
+.long 3395469782,3395469782,3395469782,3395469782
+.long 66051,67438087,134810123,202182159
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115
+.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82
+.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112
+.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S
@@ -1,0 +1,5567 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl sha256_block_data_order
+.hidden sha256_block_data_order
+.type sha256_block_data_order,@function
+.align 16
+sha256_block_data_order:
+.L_sha256_block_data_order_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal .L001K256-.L000pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $6,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ leal OPENSSL_ia32cap_P-.L001K256(%ebp),%edx
+ movl (%edx),%ecx
+ movl 4(%edx),%ebx
+ testl $1048576,%ecx
+ jnz .L002loop
+ movl 8(%edx),%edx
+ testl $16777216,%ecx
+ jz .L003no_xmm
+ andl $1073741824,%ecx
+ andl $268435968,%ebx
+ orl %ebx,%ecx
+ andl $1342177280,%ecx
+ cmpl $1342177280,%ecx
+ je .L004AVX
+ testl $512,%ebx
+ jnz .L005SSSE3
+.L003no_xmm:
+ subl %edi,%eax
+ cmpl $256,%eax
+ jae .L006unrolled
+ jmp .L002loop
+.align 16
+.L002loop:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ bswap %eax
+ movl 12(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 16(%edi),%eax
+ movl 20(%edi),%ebx
+ movl 24(%edi),%ecx
+ bswap %eax
+ movl 28(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 32(%edi),%eax
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ bswap %eax
+ movl 44(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ movl 48(%edi),%eax
+ movl 52(%edi),%ebx
+ movl 56(%edi),%ecx
+ bswap %eax
+ movl 60(%edi),%edx
+ bswap %ebx
+ pushl %eax
+ bswap %ecx
+ pushl %ebx
+ bswap %edx
+ pushl %ecx
+ pushl %edx
+ addl $64,%edi
+ leal -36(%esp),%esp
+ movl %edi,104(%esp)
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,8(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,12(%esp)
+ movl %edi,16(%esp)
+ movl %ebx,(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edi
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ movl %edi,32(%esp)
+.align 16
+.L00700_15:
+ movl %edx,%ecx
+ movl 24(%esp),%esi
+ rorl $14,%ecx
+ movl 28(%esp),%edi
+ xorl %edx,%ecx
+ xorl %edi,%esi
+ movl 96(%esp),%ebx
+ rorl $5,%ecx
+ andl %edx,%esi
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
+ xorl %edi,%esi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %esi,%ebx
+ rorl $9,%ecx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
+ rorl $11,%ecx
+ movl (%ebp),%esi
+ xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
+ addl %ebx,%edx
+ andl 4(%esp),%eax
+ addl %ecx,%ebx
+ xorl %edi,%eax
+ addl $4,%ebp
+ addl %ebx,%eax
+ cmpl $3248222580,%esi
+ jne .L00700_15
+ movl 156(%esp),%ecx
+ jmp .L00816_63
+.align 16
+.L00816_63:
+ movl %ecx,%ebx
+ movl 104(%esp),%esi
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 160(%esp),%ebx
+ shrl $10,%edi
+ addl 124(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 24(%esp),%esi
+ rorl $14,%ecx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %edx,%ecx
+ xorl %edi,%esi
+ movl %ebx,96(%esp)
+ rorl $5,%ecx
+ andl %edx,%esi
+ movl %edx,20(%esp)
+ xorl %ecx,%edx
+ addl 32(%esp),%ebx
+ xorl %edi,%esi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %esi,%ebx
+ rorl $9,%ecx
+ addl %edx,%ebx
+ movl 8(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,4(%esp)
+ leal -4(%esp),%esp
+ rorl $11,%ecx
+ movl (%ebp),%esi
+ xorl %eax,%ecx
+ movl 20(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %esi,%ebx
+ movl %eax,(%esp)
+ addl %ebx,%edx
+ andl 4(%esp),%eax
+ addl %ecx,%ebx
+ xorl %edi,%eax
+ movl 156(%esp),%ecx
+ addl $4,%ebp
+ addl %ebx,%eax
+ cmpl $3329325298,%esi
+ jne .L00816_63
+ movl 356(%esp),%esi
+ movl 8(%esp),%ebx
+ movl 16(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl 24(%esp),%eax
+ movl 28(%esp),%ebx
+ movl 32(%esp),%ecx
+ movl 360(%esp),%edi
+ addl 16(%esi),%edx
+ addl 20(%esi),%eax
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %eax,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ leal 356(%esp),%esp
+ subl $256,%ebp
+ cmpl 8(%esp),%edi
+ jb .L002loop
+ movl 12(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L001K256:
+.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298
+.long 66051,67438087,134810123,202182159
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97
+.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
+.align 16
+.L006unrolled:
+ leal -96(%esp),%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebp
+ movl 8(%esi),%ecx
+ movl 12(%esi),%ebx
+ movl %ebp,4(%esp)
+ xorl %ecx,%ebp
+ movl %ecx,8(%esp)
+ movl %ebx,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %ebx,20(%esp)
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ jmp .L009grand_loop
+.align 16
+.L009grand_loop:
+ movl (%edi),%ebx
+ movl 4(%edi),%ecx
+ bswap %ebx
+ movl 8(%edi),%esi
+ bswap %ecx
+ movl %ebx,32(%esp)
+ bswap %esi
+ movl %ecx,36(%esp)
+ movl %esi,40(%esp)
+ movl 12(%edi),%ebx
+ movl 16(%edi),%ecx
+ bswap %ebx
+ movl 20(%edi),%esi
+ bswap %ecx
+ movl %ebx,44(%esp)
+ bswap %esi
+ movl %ecx,48(%esp)
+ movl %esi,52(%esp)
+ movl 24(%edi),%ebx
+ movl 28(%edi),%ecx
+ bswap %ebx
+ movl 32(%edi),%esi
+ bswap %ecx
+ movl %ebx,56(%esp)
+ bswap %esi
+ movl %ecx,60(%esp)
+ movl %esi,64(%esp)
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ bswap %ebx
+ movl 44(%edi),%esi
+ bswap %ecx
+ movl %ebx,68(%esp)
+ bswap %esi
+ movl %ecx,72(%esp)
+ movl %esi,76(%esp)
+ movl 48(%edi),%ebx
+ movl 52(%edi),%ecx
+ bswap %ebx
+ movl 56(%edi),%esi
+ bswap %ecx
+ movl %ebx,80(%esp)
+ bswap %esi
+ movl %ecx,84(%esp)
+ movl %esi,88(%esp)
+ movl 60(%edi),%ebx
+ addl $64,%edi
+ bswap %ebx
+ movl %edi,100(%esp)
+ movl %ebx,92(%esp)
+ movl %edx,%ecx
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl 32(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1116352408(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl 36(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1899447441(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl 40(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3049323471(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl 44(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3921009573(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl 48(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 961987163(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl 52(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1508970993(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl 56(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2453635748(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl 60(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2870763221(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl 64(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3624381080(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl 68(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 310598401(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl 72(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 607225278(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl 76(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1426881987(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl 80(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1925078388(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl 84(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2162078206(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl %edx,%ecx
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl 88(%esp),%ebx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2614888103(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl %edx,%esi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl 92(%esp),%ebx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3248222580(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3835390401(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 4022224774(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 264347078(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 604807628(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 770255983(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1249150122(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1555081692(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1996064986(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2554220882(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2821834349(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2952996808(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3210313671(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3336571891(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3584528711(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,88(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 113926993(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,92(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 338241895(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 666307205(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 773529912(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1294757372(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1396182291(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1695183700(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1986661051(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2177026350(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2456956037(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2730485921(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2820302411(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3259730800(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3345764771(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3516065817(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3600352804(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,88(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 4094571909(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,92(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 275423344(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 36(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 88(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 32(%esp),%ebx
+ shrl $10,%edi
+ addl 68(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,32(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 430227734(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 40(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 92(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 36(%esp),%ebx
+ shrl $10,%edi
+ addl 72(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,36(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 506948616(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 44(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 32(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 40(%esp),%ebx
+ shrl $10,%edi
+ addl 76(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,40(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 659060556(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 48(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 36(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 44(%esp),%ebx
+ shrl $10,%edi
+ addl 80(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,44(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 883997877(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 52(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 40(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 48(%esp),%ebx
+ shrl $10,%edi
+ addl 84(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,48(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 958139571(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 56(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 44(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 52(%esp),%ebx
+ shrl $10,%edi
+ addl 88(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,52(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1322822218(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 60(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 48(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 56(%esp),%ebx
+ shrl $10,%edi
+ addl 92(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,56(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1537002063(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 64(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 52(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 60(%esp),%ebx
+ shrl $10,%edi
+ addl 32(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,60(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 1747873779(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 68(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 56(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 64(%esp),%ebx
+ shrl $10,%edi
+ addl 36(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 20(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 24(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,64(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ addl 28(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 4(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 1955562222(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 72(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 12(%esp),%edx
+ addl %ecx,%ebp
+ movl 60(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 68(%esp),%ebx
+ shrl $10,%edi
+ addl 40(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 16(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 20(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,68(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,12(%esp)
+ xorl %esi,%edx
+ addl 24(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl (%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,28(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2024104815(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 76(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %esi,%eax
+ movl 64(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 72(%esp),%ebx
+ shrl $10,%edi
+ addl 44(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 12(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 16(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,72(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ addl 20(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 28(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,24(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2227730452(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 80(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 4(%esp),%edx
+ addl %ecx,%ebp
+ movl 68(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 76(%esp),%ebx
+ shrl $10,%edi
+ addl 48(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 8(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 12(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,76(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,4(%esp)
+ xorl %esi,%edx
+ addl 16(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 24(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,20(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2361852424(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 84(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %esi,%eax
+ movl 72(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 80(%esp),%ebx
+ shrl $10,%edi
+ addl 52(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 4(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 8(%esp),%edi
+ xorl %ecx,%edx
+ movl %ebx,80(%esp)
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ addl 12(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 20(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,16(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 2428436474(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 88(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 28(%esp),%edx
+ addl %ecx,%ebp
+ movl 76(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 84(%esp),%ebx
+ shrl $10,%edi
+ addl 56(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl (%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 4(%esp),%edi
+ xorl %esi,%edx
+ movl %ebx,84(%esp)
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,28(%esp)
+ xorl %esi,%edx
+ addl 8(%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 16(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,12(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 2756734187(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ movl 92(%esp),%ecx
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %esi,%eax
+ movl 80(%esp),%esi
+ movl %ecx,%ebx
+ rorl $11,%ecx
+ movl %esi,%edi
+ rorl $2,%esi
+ xorl %ebx,%ecx
+ shrl $3,%ebx
+ rorl $7,%ecx
+ xorl %edi,%esi
+ xorl %ecx,%ebx
+ rorl $17,%esi
+ addl 88(%esp),%ebx
+ shrl $10,%edi
+ addl 60(%esp),%ebx
+ movl %edx,%ecx
+ xorl %esi,%edi
+ movl 28(%esp),%esi
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl (%esp),%edi
+ xorl %ecx,%edx
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ addl 4(%esp),%ebx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%ebx
+ rorl $9,%ecx
+ movl %eax,%esi
+ movl 12(%esp),%edi
+ xorl %eax,%ecx
+ movl %eax,8(%esp)
+ xorl %edi,%eax
+ rorl $11,%ecx
+ andl %eax,%ebp
+ leal 3204031479(%ebx,%edx,1),%edx
+ xorl %esi,%ecx
+ xorl %edi,%ebp
+ movl 32(%esp),%esi
+ rorl $2,%ecx
+ addl %edx,%ebp
+ addl 20(%esp),%edx
+ addl %ecx,%ebp
+ movl 84(%esp),%ecx
+ movl %esi,%ebx
+ rorl $11,%esi
+ movl %ecx,%edi
+ rorl $2,%ecx
+ xorl %ebx,%esi
+ shrl $3,%ebx
+ rorl $7,%esi
+ xorl %edi,%ecx
+ xorl %esi,%ebx
+ rorl $17,%ecx
+ addl 92(%esp),%ebx
+ shrl $10,%edi
+ addl 64(%esp),%ebx
+ movl %edx,%esi
+ xorl %ecx,%edi
+ movl 24(%esp),%ecx
+ rorl $14,%edx
+ addl %edi,%ebx
+ movl 28(%esp),%edi
+ xorl %esi,%edx
+ xorl %edi,%ecx
+ rorl $5,%edx
+ andl %esi,%ecx
+ movl %esi,20(%esp)
+ xorl %esi,%edx
+ addl (%esp),%ebx
+ xorl %ecx,%edi
+ rorl $6,%edx
+ movl %ebp,%esi
+ addl %edi,%ebx
+ rorl $9,%esi
+ movl %ebp,%ecx
+ movl 8(%esp),%edi
+ xorl %ebp,%esi
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ rorl $11,%esi
+ andl %ebp,%eax
+ leal 3329325298(%ebx,%edx,1),%edx
+ xorl %ecx,%esi
+ xorl %edi,%eax
+ rorl $2,%esi
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %esi,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebp
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebp
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebp,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebp,4(%esp)
+ xorl %edi,%ebp
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ebx
+ movl 28(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ebx
+ addl 28(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %ebx,24(%esi)
+ movl %ecx,28(%esi)
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ebx,24(%esp)
+ movl %ecx,28(%esp)
+ cmpl 104(%esp),%edi
+ jb .L009grand_loop
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L005SSSE3:
+ leal -96(%esp),%esp
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ movdqa 256(%ebp),%xmm7
+ jmp .L010grand_ssse3
+.align 16
+.L010grand_ssse3:
+ movdqu (%edi),%xmm0
+ movdqu 16(%edi),%xmm1
+ movdqu 32(%edi),%xmm2
+ movdqu 48(%edi),%xmm3
+ addl $64,%edi
+.byte 102,15,56,0,199
+ movl %edi,100(%esp)
+.byte 102,15,56,0,207
+ movdqa (%ebp),%xmm4
+.byte 102,15,56,0,215
+ movdqa 16(%ebp),%xmm5
+ paddd %xmm0,%xmm4
+.byte 102,15,56,0,223
+ movdqa 32(%ebp),%xmm6
+ paddd %xmm1,%xmm5
+ movdqa 48(%ebp),%xmm7
+ movdqa %xmm4,32(%esp)
+ paddd %xmm2,%xmm6
+ movdqa %xmm5,48(%esp)
+ paddd %xmm3,%xmm7
+ movdqa %xmm6,64(%esp)
+ movdqa %xmm7,80(%esp)
+ jmp .L011ssse3_00_47
+.align 16
+.L011ssse3_00_47:
+ addl $64,%ebp
+ movl %edx,%ecx
+ movdqa %xmm1,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,224,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,250,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm3,%xmm7
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm0
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm0
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm0,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa (%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm0
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm0,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,32(%esp)
+ movl %edx,%ecx
+ movdqa %xmm2,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+.byte 102,15,58,15,225,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,251,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm0,%xmm7
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm1
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm1
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm1,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 16(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm1
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm1,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,48(%esp)
+ movl %edx,%ecx
+ movdqa %xmm3,%xmm4
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ movdqa %xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+.byte 102,15,58,15,226,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,248,4
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm1,%xmm7
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm2
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm2
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ pshufd $80,%xmm2,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 32(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm2
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ paddd %xmm2,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,64(%esp)
+ movl %edx,%ecx
+ movdqa %xmm0,%xmm4
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ movdqa %xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+.byte 102,15,58,15,227,4
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+.byte 102,15,58,15,249,4
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ movdqa %xmm4,%xmm5
+ rorl $6,%edx
+ movl %eax,%ecx
+ movdqa %xmm4,%xmm6
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ psrld $3,%xmm4
+ movl %eax,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ psrld $7,%xmm6
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ pshufd $250,%xmm2,%xmm7
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ pslld $14,%xmm5
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ psrld $11,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm5,%xmm4
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ pslld $11,%xmm5
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ pxor %xmm6,%xmm4
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ movdqa %xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ pxor %xmm5,%xmm4
+ movl %ebx,%ecx
+ addl %edi,%edx
+ psrld $10,%xmm7
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm4,%xmm3
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ psrlq $17,%xmm6
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ pxor %xmm6,%xmm7
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ psrlq $2,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ pshufd $128,%xmm7,%xmm7
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ psrldq $8,%xmm7
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ paddd %xmm7,%xmm3
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ pshufd $80,%xmm3,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ movdqa %xmm7,%xmm6
+ rorl $11,%ecx
+ psrld $10,%xmm7
+ andl %eax,%ebx
+ psrlq $17,%xmm6
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ pxor %xmm6,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ psrlq $2,%xmm6
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ pxor %xmm6,%xmm7
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ pshufd $8,%xmm7,%xmm7
+ xorl %edi,%esi
+ rorl $5,%edx
+ movdqa 48(%ebp),%xmm6
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ pslldq $8,%xmm7
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ paddd %xmm7,%xmm3
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ paddd %xmm3,%xmm6
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L011ssse3_00_47
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ rorl $9,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ rorl $11,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ rorl $2,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ rorl $14,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ rorl $5,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ rorl $6,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ rorl $9,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ rorl $11,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ rorl $2,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ movdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L010grand_ssse3
+ movl 108(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L004AVX:
+ leal -96(%esp),%esp
+ vzeroall
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edi
+ movl %ebx,4(%esp)
+ xorl %ecx,%ebx
+ movl %ecx,8(%esp)
+ movl %edi,12(%esp)
+ movl 16(%esi),%edx
+ movl 20(%esi),%edi
+ movl 24(%esi),%ecx
+ movl 28(%esi),%esi
+ movl %edi,20(%esp)
+ movl 100(%esp),%edi
+ movl %ecx,24(%esp)
+ movl %esi,28(%esp)
+ vmovdqa 256(%ebp),%xmm7
+ jmp .L012grand_avx
+.align 32
+.L012grand_avx:
+ vmovdqu (%edi),%xmm0
+ vmovdqu 16(%edi),%xmm1
+ vmovdqu 32(%edi),%xmm2
+ vmovdqu 48(%edi),%xmm3
+ addl $64,%edi
+ vpshufb %xmm7,%xmm0,%xmm0
+ movl %edi,100(%esp)
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd (%ebp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 16(%ebp),%xmm1,%xmm5
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ vpaddd 48(%ebp),%xmm3,%xmm7
+ vmovdqa %xmm4,32(%esp)
+ vmovdqa %xmm5,48(%esp)
+ vmovdqa %xmm6,64(%esp)
+ vmovdqa %xmm7,80(%esp)
+ jmp .L013avx_00_47
+.align 16
+.L013avx_00_47:
+ addl $64,%ebp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm0,%xmm0
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm0,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm0,%xmm0
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd (%ebp),%xmm0,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,32(%esp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm1,%xmm1
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm1,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm1,%xmm1
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 16(%ebp),%xmm1,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,48(%esp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ vpaddd %xmm4,%xmm2,%xmm2
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm2,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm2,%xmm2
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ vpaddd 32(%ebp),%xmm2,%xmm6
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,64(%esp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrld $3,%xmm4,%xmm7
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ vpslld $14,%xmm4,%xmm5
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpsrld $11,%xmm6,%xmm6
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ vpslld $11,%xmm5,%xmm5
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ vpsrld $10,%xmm7,%xmm6
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ vpxor %xmm5,%xmm4,%xmm4
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ vpaddd %xmm4,%xmm3,%xmm3
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ vpxor %xmm5,%xmm6,%xmm6
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ vpsrlq $19,%xmm7,%xmm7
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ vpshufd $132,%xmm6,%xmm7
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ vpsrldq $8,%xmm7,%xmm7
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ vpshufd $80,%xmm3,%xmm7
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ vpsrld $10,%xmm7,%xmm6
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ vpsrlq $17,%xmm7,%xmm5
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ vpxor %xmm5,%xmm6,%xmm6
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ vpsrlq $19,%xmm7,%xmm7
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ vpshufd $232,%xmm6,%xmm7
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ vpslldq $8,%xmm7,%xmm7
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ vpaddd %xmm7,%xmm3,%xmm3
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ vpaddd 48(%ebp),%xmm3,%xmm6
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ vmovdqa %xmm6,80(%esp)
+ cmpl $66051,64(%ebp)
+ jne .L013avx_00_47
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 32(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 36(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 40(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 44(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 48(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 52(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 56(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 60(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 20(%esp),%esi
+ xorl %ecx,%edx
+ movl 24(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,16(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 4(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 28(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 64(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 12(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 16(%esp),%esi
+ xorl %ecx,%edx
+ movl 20(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl (%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,28(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 24(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 68(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 8(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 12(%esp),%esi
+ xorl %ecx,%edx
+ movl 16(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,8(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 28(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,24(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 20(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 72(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 4(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 8(%esp),%esi
+ xorl %ecx,%edx
+ movl 12(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,4(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 24(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,20(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 16(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 76(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl (%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 4(%esp),%esi
+ xorl %ecx,%edx
+ movl 8(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 20(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,16(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 12(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 80(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 28(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl (%esp),%esi
+ xorl %ecx,%edx
+ movl 4(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,28(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 16(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,12(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl 8(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 84(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 24(%esp),%edx
+ addl %ecx,%eax
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 28(%esp),%esi
+ xorl %ecx,%edx
+ movl (%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,24(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %eax,%ecx
+ addl %edi,%edx
+ movl 12(%esp),%edi
+ movl %eax,%esi
+ shrdl $9,%ecx,%ecx
+ movl %eax,8(%esp)
+ xorl %eax,%ecx
+ xorl %edi,%eax
+ addl 4(%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %eax,%ebx
+ xorl %esi,%ecx
+ addl 88(%esp),%edx
+ xorl %edi,%ebx
+ shrdl $2,%ecx,%ecx
+ addl %edx,%ebx
+ addl 20(%esp),%edx
+ addl %ecx,%ebx
+ movl %edx,%ecx
+ shrdl $14,%edx,%edx
+ movl 24(%esp),%esi
+ xorl %ecx,%edx
+ movl 28(%esp),%edi
+ xorl %edi,%esi
+ shrdl $5,%edx,%edx
+ andl %ecx,%esi
+ movl %ecx,20(%esp)
+ xorl %ecx,%edx
+ xorl %esi,%edi
+ shrdl $6,%edx,%edx
+ movl %ebx,%ecx
+ addl %edi,%edx
+ movl 8(%esp),%edi
+ movl %ebx,%esi
+ shrdl $9,%ecx,%ecx
+ movl %ebx,4(%esp)
+ xorl %ebx,%ecx
+ xorl %edi,%ebx
+ addl (%esp),%edx
+ shrdl $11,%ecx,%ecx
+ andl %ebx,%eax
+ xorl %esi,%ecx
+ addl 92(%esp),%edx
+ xorl %edi,%eax
+ shrdl $2,%ecx,%ecx
+ addl %edx,%eax
+ addl 16(%esp),%edx
+ addl %ecx,%eax
+ movl 96(%esp),%esi
+ xorl %edi,%ebx
+ movl 12(%esp),%ecx
+ addl (%esi),%eax
+ addl 4(%esi),%ebx
+ addl 8(%esi),%edi
+ addl 12(%esi),%ecx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ movl %edi,8(%esi)
+ movl %ecx,12(%esi)
+ movl %ebx,4(%esp)
+ xorl %edi,%ebx
+ movl %edi,8(%esp)
+ movl %ecx,12(%esp)
+ movl 20(%esp),%edi
+ movl 24(%esp),%ecx
+ addl 16(%esi),%edx
+ addl 20(%esi),%edi
+ addl 24(%esi),%ecx
+ movl %edx,16(%esi)
+ movl %edi,20(%esi)
+ movl %edi,20(%esp)
+ movl 28(%esp),%edi
+ movl %ecx,24(%esi)
+ addl 28(%esi),%edi
+ movl %ecx,24(%esp)
+ movl %edi,28(%esi)
+ movl %edi,28(%esp)
+ movl 100(%esp),%edi
+ vmovdqa 64(%ebp),%xmm7
+ subl $192,%ebp
+ cmpl 104(%esp),%edi
+ jb .L012grand_avx
+ movl 108(%esp),%esp
+ vzeroall
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size sha256_block_data_order,.-.L_sha256_block_data_order_begin
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S
@@ -1,0 +1,2837 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl sha512_block_data_order
+.hidden sha512_block_data_order
+.type sha512_block_data_order,@function
+.align 16
+sha512_block_data_order:
+.L_sha512_block_data_order_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl %esp,%ebx
+ call .L000pic_point
+.L000pic_point:
+ popl %ebp
+ leal .L001K512-.L000pic_point(%ebp),%ebp
+ subl $16,%esp
+ andl $-64,%esp
+ shll $7,%eax
+ addl %edi,%eax
+ movl %esi,(%esp)
+ movl %edi,4(%esp)
+ movl %eax,8(%esp)
+ movl %ebx,12(%esp)
+ leal OPENSSL_ia32cap_P-.L001K512(%ebp),%edx
+ movl (%edx),%ecx
+ testl $67108864,%ecx
+ jz .L002loop_x86
+ movl 4(%edx),%edx
+ movq (%esi),%mm0
+ andl $16777216,%ecx
+ movq 8(%esi),%mm1
+ andl $512,%edx
+ movq 16(%esi),%mm2
+ orl %edx,%ecx
+ movq 24(%esi),%mm3
+ movq 32(%esi),%mm4
+ movq 40(%esi),%mm5
+ movq 48(%esi),%mm6
+ movq 56(%esi),%mm7
+ cmpl $16777728,%ecx
+ je .L003SSSE3
+ subl $80,%esp
+ jmp .L004loop_sse2
+.align 16
+.L004loop_sse2:
+ movq %mm1,8(%esp)
+ movq %mm2,16(%esp)
+ movq %mm3,24(%esp)
+ movq %mm5,40(%esp)
+ movq %mm6,48(%esp)
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ movq %mm0,%mm3
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ movl $15,%edx
+ bswap %eax
+ bswap %ebx
+ jmp .L00500_14_sse2
+.align 16
+.L00500_14_sse2:
+ movd %eax,%mm1
+ movl (%edi),%eax
+ movd %ebx,%mm7
+ movl 4(%edi),%ebx
+ addl $8,%edi
+ bswap %eax
+ bswap %ebx
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ movq 48(%esp),%mm6
+ decl %edx
+ jnz .L00500_14_sse2
+ movd %eax,%mm1
+ movd %ebx,%mm7
+ punpckldq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm3,%mm0
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm2,%mm3
+ movq %mm0,%mm2
+ addl $8,%ebp
+ paddq %mm6,%mm3
+ pxor %mm0,%mm0
+ movl $32,%edx
+ jmp .L00616_79_sse2
+.align 16
+.L00616_79_sse2:
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm0
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm2
+ addl $8,%ebp
+ movq 88(%esp),%mm5
+ movq %mm7,%mm1
+ psrlq $1,%mm7
+ movq %mm5,%mm6
+ psrlq $6,%mm5
+ psllq $56,%mm1
+ paddq %mm3,%mm2
+ movq %mm7,%mm3
+ psrlq $6,%mm7
+ pxor %mm1,%mm3
+ psllq $7,%mm1
+ pxor %mm7,%mm3
+ psrlq $1,%mm7
+ pxor %mm1,%mm3
+ movq %mm5,%mm1
+ psrlq $13,%mm5
+ pxor %mm3,%mm7
+ psllq $3,%mm6
+ pxor %mm5,%mm1
+ paddq 200(%esp),%mm7
+ pxor %mm6,%mm1
+ psrlq $42,%mm5
+ paddq 128(%esp),%mm7
+ pxor %mm5,%mm1
+ psllq $42,%mm6
+ movq 40(%esp),%mm5
+ pxor %mm6,%mm1
+ movq 48(%esp),%mm6
+ paddq %mm1,%mm7
+ movq %mm4,%mm1
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ movq %mm7,72(%esp)
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ paddq (%ebp),%mm7
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ subl $8,%esp
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 192(%esp),%mm7
+ paddq %mm6,%mm0
+ addl $8,%ebp
+ decl %edx
+ jnz .L00616_79_sse2
+ paddq %mm3,%mm0
+ movq 8(%esp),%mm1
+ movq 24(%esp),%mm3
+ movq 40(%esp),%mm5
+ movq 48(%esp),%mm6
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movl $640,%eax
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ leal (%esp,%eax,1),%esp
+ subl %eax,%ebp
+ cmpl 88(%esp),%edi
+ jb .L004loop_sse2
+ movl 92(%esp),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 32
+.L003SSSE3:
+ leal -64(%esp),%edx
+ subl $256,%esp
+ movdqa 640(%ebp),%xmm1
+ movdqu (%edi),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%edi),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%edi),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%edi),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%edi),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%edi),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%edi),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%edi),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movdqa %xmm2,-16(%edx)
+ nop
+.align 32
+.L007loop_ssse3:
+ movdqa 16(%edx),%xmm2
+ movdqa %xmm3,48(%edx)
+ leal 128(%ebp),%ebp
+ movq %mm1,8(%esp)
+ movl %edi,%ebx
+ movq %mm2,16(%esp)
+ leal 128(%edi),%edi
+ movq %mm3,24(%esp)
+ cmpl %eax,%edi
+ movq %mm5,40(%esp)
+ cmovbl %edi,%ebx
+ movq %mm6,48(%esp)
+ movl $4,%ecx
+ pxor %mm1,%mm2
+ movq %mm7,56(%esp)
+ pxor %mm3,%mm3
+ jmp .L00800_47_ssse3
+.align 32
+.L00800_47_ssse3:
+ movdqa %xmm5,%xmm3
+ movdqa %xmm2,%xmm1
+.byte 102,15,58,15,208,8
+ movdqa %xmm4,(%edx)
+.byte 102,15,58,15,220,8
+ movdqa %xmm2,%xmm4
+ psrlq $7,%xmm2
+ paddq %xmm3,%xmm0
+ movdqa %xmm4,%xmm3
+ psrlq $1,%xmm4
+ psllq $56,%xmm3
+ pxor %xmm4,%xmm2
+ psrlq $7,%xmm4
+ pxor %xmm3,%xmm2
+ psllq $7,%xmm3
+ pxor %xmm4,%xmm2
+ movdqa %xmm7,%xmm4
+ pxor %xmm3,%xmm2
+ movdqa %xmm7,%xmm3
+ psrlq $6,%xmm4
+ paddq %xmm2,%xmm0
+ movdqa %xmm7,%xmm2
+ psrlq $19,%xmm3
+ psllq $3,%xmm2
+ pxor %xmm3,%xmm4
+ psrlq $42,%xmm3
+ pxor %xmm2,%xmm4
+ psllq $42,%xmm2
+ pxor %xmm3,%xmm4
+ movdqa 32(%edx),%xmm3
+ pxor %xmm2,%xmm4
+ movdqa (%ebp),%xmm2
+ movq %mm4,%mm1
+ paddq %xmm4,%xmm0
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm0,%xmm2
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm2,-128(%edx)
+ movdqa %xmm6,%xmm4
+ movdqa %xmm3,%xmm2
+.byte 102,15,58,15,217,8
+ movdqa %xmm5,16(%edx)
+.byte 102,15,58,15,229,8
+ movdqa %xmm3,%xmm5
+ psrlq $7,%xmm3
+ paddq %xmm4,%xmm1
+ movdqa %xmm5,%xmm4
+ psrlq $1,%xmm5
+ psllq $56,%xmm4
+ pxor %xmm5,%xmm3
+ psrlq $7,%xmm5
+ pxor %xmm4,%xmm3
+ psllq $7,%xmm4
+ pxor %xmm5,%xmm3
+ movdqa %xmm0,%xmm5
+ pxor %xmm4,%xmm3
+ movdqa %xmm0,%xmm4
+ psrlq $6,%xmm5
+ paddq %xmm3,%xmm1
+ movdqa %xmm0,%xmm3
+ psrlq $19,%xmm4
+ psllq $3,%xmm3
+ pxor %xmm4,%xmm5
+ psrlq $42,%xmm4
+ pxor %xmm3,%xmm5
+ psllq $42,%xmm3
+ pxor %xmm4,%xmm5
+ movdqa 48(%edx),%xmm4
+ pxor %xmm3,%xmm5
+ movdqa 16(%ebp),%xmm3
+ movq %mm4,%mm1
+ paddq %xmm5,%xmm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm1,%xmm3
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm3,-112(%edx)
+ movdqa %xmm7,%xmm5
+ movdqa %xmm4,%xmm3
+.byte 102,15,58,15,226,8
+ movdqa %xmm6,32(%edx)
+.byte 102,15,58,15,238,8
+ movdqa %xmm4,%xmm6
+ psrlq $7,%xmm4
+ paddq %xmm5,%xmm2
+ movdqa %xmm6,%xmm5
+ psrlq $1,%xmm6
+ psllq $56,%xmm5
+ pxor %xmm6,%xmm4
+ psrlq $7,%xmm6
+ pxor %xmm5,%xmm4
+ psllq $7,%xmm5
+ pxor %xmm6,%xmm4
+ movdqa %xmm1,%xmm6
+ pxor %xmm5,%xmm4
+ movdqa %xmm1,%xmm5
+ psrlq $6,%xmm6
+ paddq %xmm4,%xmm2
+ movdqa %xmm1,%xmm4
+ psrlq $19,%xmm5
+ psllq $3,%xmm4
+ pxor %xmm5,%xmm6
+ psrlq $42,%xmm5
+ pxor %xmm4,%xmm6
+ psllq $42,%xmm4
+ pxor %xmm5,%xmm6
+ movdqa (%edx),%xmm5
+ pxor %xmm4,%xmm6
+ movdqa 32(%ebp),%xmm4
+ movq %mm4,%mm1
+ paddq %xmm6,%xmm2
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm2,%xmm4
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm4,-96(%edx)
+ movdqa %xmm0,%xmm6
+ movdqa %xmm5,%xmm4
+.byte 102,15,58,15,235,8
+ movdqa %xmm7,48(%edx)
+.byte 102,15,58,15,247,8
+ movdqa %xmm5,%xmm7
+ psrlq $7,%xmm5
+ paddq %xmm6,%xmm3
+ movdqa %xmm7,%xmm6
+ psrlq $1,%xmm7
+ psllq $56,%xmm6
+ pxor %xmm7,%xmm5
+ psrlq $7,%xmm7
+ pxor %xmm6,%xmm5
+ psllq $7,%xmm6
+ pxor %xmm7,%xmm5
+ movdqa %xmm2,%xmm7
+ pxor %xmm6,%xmm5
+ movdqa %xmm2,%xmm6
+ psrlq $6,%xmm7
+ paddq %xmm5,%xmm3
+ movdqa %xmm2,%xmm5
+ psrlq $19,%xmm6
+ psllq $3,%xmm5
+ pxor %xmm6,%xmm7
+ psrlq $42,%xmm6
+ pxor %xmm5,%xmm7
+ psllq $42,%xmm5
+ pxor %xmm6,%xmm7
+ movdqa 16(%edx),%xmm6
+ pxor %xmm5,%xmm7
+ movdqa 48(%ebp),%xmm5
+ movq %mm4,%mm1
+ paddq %xmm7,%xmm3
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm3,%xmm5
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm5,-80(%edx)
+ movdqa %xmm1,%xmm7
+ movdqa %xmm6,%xmm5
+.byte 102,15,58,15,244,8
+ movdqa %xmm0,(%edx)
+.byte 102,15,58,15,248,8
+ movdqa %xmm6,%xmm0
+ psrlq $7,%xmm6
+ paddq %xmm7,%xmm4
+ movdqa %xmm0,%xmm7
+ psrlq $1,%xmm0
+ psllq $56,%xmm7
+ pxor %xmm0,%xmm6
+ psrlq $7,%xmm0
+ pxor %xmm7,%xmm6
+ psllq $7,%xmm7
+ pxor %xmm0,%xmm6
+ movdqa %xmm3,%xmm0
+ pxor %xmm7,%xmm6
+ movdqa %xmm3,%xmm7
+ psrlq $6,%xmm0
+ paddq %xmm6,%xmm4
+ movdqa %xmm3,%xmm6
+ psrlq $19,%xmm7
+ psllq $3,%xmm6
+ pxor %xmm7,%xmm0
+ psrlq $42,%xmm7
+ pxor %xmm6,%xmm0
+ psllq $42,%xmm6
+ pxor %xmm7,%xmm0
+ movdqa 32(%edx),%xmm7
+ pxor %xmm6,%xmm0
+ movdqa 64(%ebp),%xmm6
+ movq %mm4,%mm1
+ paddq %xmm0,%xmm4
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ paddq %xmm4,%xmm6
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm6,-64(%edx)
+ movdqa %xmm2,%xmm0
+ movdqa %xmm7,%xmm6
+.byte 102,15,58,15,253,8
+ movdqa %xmm1,16(%edx)
+.byte 102,15,58,15,193,8
+ movdqa %xmm7,%xmm1
+ psrlq $7,%xmm7
+ paddq %xmm0,%xmm5
+ movdqa %xmm1,%xmm0
+ psrlq $1,%xmm1
+ psllq $56,%xmm0
+ pxor %xmm1,%xmm7
+ psrlq $7,%xmm1
+ pxor %xmm0,%xmm7
+ psllq $7,%xmm0
+ pxor %xmm1,%xmm7
+ movdqa %xmm4,%xmm1
+ pxor %xmm0,%xmm7
+ movdqa %xmm4,%xmm0
+ psrlq $6,%xmm1
+ paddq %xmm7,%xmm5
+ movdqa %xmm4,%xmm7
+ psrlq $19,%xmm0
+ psllq $3,%xmm7
+ pxor %xmm0,%xmm1
+ psrlq $42,%xmm0
+ pxor %xmm7,%xmm1
+ psllq $42,%xmm7
+ pxor %xmm0,%xmm1
+ movdqa 48(%edx),%xmm0
+ pxor %xmm7,%xmm1
+ movdqa 80(%ebp),%xmm7
+ movq %mm4,%mm1
+ paddq %xmm1,%xmm5
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ paddq %xmm5,%xmm7
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm7,-48(%edx)
+ movdqa %xmm3,%xmm1
+ movdqa %xmm0,%xmm7
+.byte 102,15,58,15,198,8
+ movdqa %xmm2,32(%edx)
+.byte 102,15,58,15,202,8
+ movdqa %xmm0,%xmm2
+ psrlq $7,%xmm0
+ paddq %xmm1,%xmm6
+ movdqa %xmm2,%xmm1
+ psrlq $1,%xmm2
+ psllq $56,%xmm1
+ pxor %xmm2,%xmm0
+ psrlq $7,%xmm2
+ pxor %xmm1,%xmm0
+ psllq $7,%xmm1
+ pxor %xmm2,%xmm0
+ movdqa %xmm5,%xmm2
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm1
+ psrlq $6,%xmm2
+ paddq %xmm0,%xmm6
+ movdqa %xmm5,%xmm0
+ psrlq $19,%xmm1
+ psllq $3,%xmm0
+ pxor %xmm1,%xmm2
+ psrlq $42,%xmm1
+ pxor %xmm0,%xmm2
+ psllq $42,%xmm0
+ pxor %xmm1,%xmm2
+ movdqa (%edx),%xmm1
+ pxor %xmm0,%xmm2
+ movdqa 96(%ebp),%xmm0
+ movq %mm4,%mm1
+ paddq %xmm2,%xmm6
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ paddq %xmm6,%xmm0
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm0,-32(%edx)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm1,%xmm0
+.byte 102,15,58,15,207,8
+ movdqa %xmm3,48(%edx)
+.byte 102,15,58,15,211,8
+ movdqa %xmm1,%xmm3
+ psrlq $7,%xmm1
+ paddq %xmm2,%xmm7
+ movdqa %xmm3,%xmm2
+ psrlq $1,%xmm3
+ psllq $56,%xmm2
+ pxor %xmm3,%xmm1
+ psrlq $7,%xmm3
+ pxor %xmm2,%xmm1
+ psllq $7,%xmm2
+ pxor %xmm3,%xmm1
+ movdqa %xmm6,%xmm3
+ pxor %xmm2,%xmm1
+ movdqa %xmm6,%xmm2
+ psrlq $6,%xmm3
+ paddq %xmm1,%xmm7
+ movdqa %xmm6,%xmm1
+ psrlq $19,%xmm2
+ psllq $3,%xmm1
+ pxor %xmm2,%xmm3
+ psrlq $42,%xmm2
+ pxor %xmm1,%xmm3
+ psllq $42,%xmm1
+ pxor %xmm2,%xmm3
+ movdqa 16(%edx),%xmm2
+ pxor %xmm1,%xmm3
+ movdqa 112(%ebp),%xmm1
+ movq %mm4,%mm1
+ paddq %xmm3,%xmm7
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ paddq %xmm7,%xmm1
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm1,-16(%edx)
+ leal 128(%ebp),%ebp
+ decl %ecx
+ jnz .L00800_47_ssse3
+ movdqa (%ebp),%xmm1
+ leal -640(%ebp),%ebp
+ movdqu (%ebx),%xmm0
+.byte 102,15,56,0,193
+ movdqa (%ebp),%xmm3
+ movdqa %xmm1,%xmm2
+ movdqu 16(%ebx),%xmm1
+ paddq %xmm0,%xmm3
+.byte 102,15,56,0,202
+ movq %mm4,%mm1
+ movq -128(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -120(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm3,-128(%edx)
+ movdqa 16(%ebp),%xmm4
+ movdqa %xmm2,%xmm3
+ movdqu 32(%ebx),%xmm2
+ paddq %xmm1,%xmm4
+.byte 102,15,56,0,211
+ movq %mm4,%mm1
+ movq -112(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -104(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm4,-112(%edx)
+ movdqa 32(%ebp),%xmm5
+ movdqa %xmm3,%xmm4
+ movdqu 48(%ebx),%xmm3
+ paddq %xmm2,%xmm5
+.byte 102,15,56,0,220
+ movq %mm4,%mm1
+ movq -96(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -88(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm5,-96(%edx)
+ movdqa 48(%ebp),%xmm6
+ movdqa %xmm4,%xmm5
+ movdqu 64(%ebx),%xmm4
+ paddq %xmm3,%xmm6
+.byte 102,15,56,0,229
+ movq %mm4,%mm1
+ movq -80(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -72(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm6,-80(%edx)
+ movdqa 64(%ebp),%xmm7
+ movdqa %xmm5,%xmm6
+ movdqu 80(%ebx),%xmm5
+ paddq %xmm4,%xmm7
+.byte 102,15,56,0,238
+ movq %mm4,%mm1
+ movq -64(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,32(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 56(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 24(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 8(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 32(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 40(%esp),%mm6
+ movq %mm4,%mm1
+ movq -56(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,24(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,56(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 48(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 16(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq (%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 24(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 32(%esp),%mm6
+ movdqa %xmm7,-64(%edx)
+ movdqa %xmm0,(%edx)
+ movdqa 80(%ebp),%xmm0
+ movdqa %xmm6,%xmm7
+ movdqu 96(%ebx),%xmm6
+ paddq %xmm5,%xmm0
+.byte 102,15,56,0,247
+ movq %mm4,%mm1
+ movq -48(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,16(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,48(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 40(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 8(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 56(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 16(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 24(%esp),%mm6
+ movq %mm4,%mm1
+ movq -40(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,8(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,40(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 32(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq (%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 48(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 8(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 16(%esp),%mm6
+ movdqa %xmm0,-48(%edx)
+ movdqa %xmm1,16(%edx)
+ movdqa 96(%ebp),%xmm1
+ movdqa %xmm7,%xmm0
+ movdqu 112(%ebx),%xmm7
+ paddq %xmm6,%xmm1
+.byte 102,15,56,0,248
+ movq %mm4,%mm1
+ movq -32(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,32(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 24(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 56(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 40(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq (%esp),%mm5
+ paddq %mm6,%mm2
+ movq 8(%esp),%mm6
+ movq %mm4,%mm1
+ movq -24(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,56(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,24(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 16(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 48(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 32(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 56(%esp),%mm5
+ paddq %mm6,%mm0
+ movq (%esp),%mm6
+ movdqa %xmm1,-32(%edx)
+ movdqa %xmm2,32(%edx)
+ movdqa 112(%ebp),%xmm2
+ movdqa (%edx),%xmm0
+ paddq %xmm7,%xmm2
+ movq %mm4,%mm1
+ movq -16(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,48(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm0
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm0,16(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq 8(%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 40(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm0,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm0,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 24(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm0,%mm2
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ pxor %mm7,%mm6
+ movq 48(%esp),%mm5
+ paddq %mm6,%mm2
+ movq 56(%esp),%mm6
+ movq %mm4,%mm1
+ movq -8(%edx),%mm7
+ pxor %mm6,%mm5
+ psrlq $14,%mm1
+ movq %mm4,40(%esp)
+ pand %mm4,%mm5
+ psllq $23,%mm4
+ paddq %mm3,%mm2
+ movq %mm1,%mm3
+ psrlq $4,%mm1
+ pxor %mm6,%mm5
+ pxor %mm4,%mm3
+ psllq $23,%mm4
+ pxor %mm1,%mm3
+ movq %mm2,8(%esp)
+ paddq %mm5,%mm7
+ pxor %mm4,%mm3
+ psrlq $23,%mm1
+ paddq (%esp),%mm7
+ pxor %mm1,%mm3
+ psllq $4,%mm4
+ pxor %mm4,%mm3
+ movq 32(%esp),%mm4
+ paddq %mm7,%mm3
+ movq %mm2,%mm5
+ psrlq $28,%mm5
+ paddq %mm3,%mm4
+ movq %mm2,%mm6
+ movq %mm5,%mm7
+ psllq $25,%mm6
+ movq 16(%esp),%mm1
+ psrlq $6,%mm5
+ pxor %mm6,%mm7
+ psllq $5,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm2
+ psrlq $5,%mm5
+ pxor %mm6,%mm7
+ pand %mm2,%mm0
+ psllq $6,%mm6
+ pxor %mm5,%mm7
+ pxor %mm1,%mm0
+ pxor %mm7,%mm6
+ movq 40(%esp),%mm5
+ paddq %mm6,%mm0
+ movq 48(%esp),%mm6
+ movdqa %xmm2,-16(%edx)
+ movq 8(%esp),%mm1
+ paddq %mm3,%mm0
+ movq 24(%esp),%mm3
+ movq 56(%esp),%mm7
+ pxor %mm1,%mm2
+ paddq (%esi),%mm0
+ paddq 8(%esi),%mm1
+ paddq 16(%esi),%mm2
+ paddq 24(%esi),%mm3
+ paddq 32(%esi),%mm4
+ paddq 40(%esi),%mm5
+ paddq 48(%esi),%mm6
+ paddq 56(%esi),%mm7
+ movq %mm0,(%esi)
+ movq %mm1,8(%esi)
+ movq %mm2,16(%esi)
+ movq %mm3,24(%esi)
+ movq %mm4,32(%esi)
+ movq %mm5,40(%esi)
+ movq %mm6,48(%esi)
+ movq %mm7,56(%esi)
+ cmpl %eax,%edi
+ jb .L007loop_ssse3
+ movl 76(%edx),%esp
+ emms
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 16
+.L002loop_x86:
+ movl (%edi),%eax
+ movl 4(%edi),%ebx
+ movl 8(%edi),%ecx
+ movl 12(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 16(%edi),%eax
+ movl 20(%edi),%ebx
+ movl 24(%edi),%ecx
+ movl 28(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 32(%edi),%eax
+ movl 36(%edi),%ebx
+ movl 40(%edi),%ecx
+ movl 44(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 48(%edi),%eax
+ movl 52(%edi),%ebx
+ movl 56(%edi),%ecx
+ movl 60(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 64(%edi),%eax
+ movl 68(%edi),%ebx
+ movl 72(%edi),%ecx
+ movl 76(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 80(%edi),%eax
+ movl 84(%edi),%ebx
+ movl 88(%edi),%ecx
+ movl 92(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 96(%edi),%eax
+ movl 100(%edi),%ebx
+ movl 104(%edi),%ecx
+ movl 108(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ movl 112(%edi),%eax
+ movl 116(%edi),%ebx
+ movl 120(%edi),%ecx
+ movl 124(%edi),%edx
+ bswap %eax
+ bswap %ebx
+ bswap %ecx
+ bswap %edx
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+ addl $128,%edi
+ subl $72,%esp
+ movl %edi,204(%esp)
+ leal 8(%esp),%edi
+ movl $16,%ecx
+.long 2784229001
+.align 16
+.L00900_15_x86:
+ movl 40(%esp),%ecx
+ movl 44(%esp),%edx
+ movl %ecx,%esi
+ shrl $9,%ecx
+ movl %edx,%edi
+ shrl $9,%edx
+ movl %ecx,%ebx
+ shll $14,%esi
+ movl %edx,%eax
+ shll $14,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%eax
+ shll $4,%esi
+ xorl %edx,%ebx
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $4,%ecx
+ xorl %edi,%eax
+ shrl $4,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 48(%esp),%ecx
+ movl 52(%esp),%edx
+ movl 56(%esp),%esi
+ movl 60(%esp),%edi
+ addl 64(%esp),%eax
+ adcl 68(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ andl 40(%esp),%ecx
+ andl 44(%esp),%edx
+ addl 192(%esp),%eax
+ adcl 196(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ movl (%ebp),%esi
+ movl 4(%ebp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 32(%esp),%ecx
+ movl 36(%esp),%edx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,%esi
+ shrl $2,%ecx
+ movl %edx,%edi
+ shrl $2,%edx
+ movl %ecx,%ebx
+ shll $4,%esi
+ movl %edx,%eax
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%ebx
+ shll $21,%esi
+ xorl %edx,%eax
+ shll $21,%edi
+ xorl %esi,%eax
+ shrl $21,%ecx
+ xorl %edi,%ebx
+ shrl $21,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ orl %esi,%ecx
+ orl %edi,%edx
+ andl 24(%esp),%ecx
+ andl 28(%esp),%edx
+ andl 8(%esp),%esi
+ andl 12(%esp),%edi
+ orl %esi,%ecx
+ orl %edi,%edx
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movb (%ebp),%dl
+ subl $8,%esp
+ leal 8(%ebp),%ebp
+ cmpb $148,%dl
+ jne .L00900_15_x86
+.align 16
+.L01016_79_x86:
+ movl 312(%esp),%ecx
+ movl 316(%esp),%edx
+ movl %ecx,%esi
+ shrl $1,%ecx
+ movl %edx,%edi
+ shrl $1,%edx
+ movl %ecx,%eax
+ shll $24,%esi
+ movl %edx,%ebx
+ shll $24,%edi
+ xorl %esi,%ebx
+ shrl $6,%ecx
+ xorl %edi,%eax
+ shrl $6,%edx
+ xorl %ecx,%eax
+ shll $7,%esi
+ xorl %edx,%ebx
+ shll $1,%edi
+ xorl %esi,%ebx
+ shrl $1,%ecx
+ xorl %edi,%eax
+ shrl $1,%edx
+ xorl %ecx,%eax
+ shll $6,%edi
+ xorl %edx,%ebx
+ xorl %edi,%eax
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movl 208(%esp),%ecx
+ movl 212(%esp),%edx
+ movl %ecx,%esi
+ shrl $6,%ecx
+ movl %edx,%edi
+ shrl $6,%edx
+ movl %ecx,%eax
+ shll $3,%esi
+ movl %edx,%ebx
+ shll $3,%edi
+ xorl %esi,%eax
+ shrl $13,%ecx
+ xorl %edi,%ebx
+ shrl $13,%edx
+ xorl %ecx,%eax
+ shll $10,%esi
+ xorl %edx,%ebx
+ shll $10,%edi
+ xorl %esi,%ebx
+ shrl $10,%ecx
+ xorl %edi,%eax
+ shrl $10,%edx
+ xorl %ecx,%ebx
+ shll $13,%edi
+ xorl %edx,%eax
+ xorl %edi,%eax
+ movl 320(%esp),%ecx
+ movl 324(%esp),%edx
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ movl 248(%esp),%esi
+ movl 252(%esp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,192(%esp)
+ movl %ebx,196(%esp)
+ movl 40(%esp),%ecx
+ movl 44(%esp),%edx
+ movl %ecx,%esi
+ shrl $9,%ecx
+ movl %edx,%edi
+ shrl $9,%edx
+ movl %ecx,%ebx
+ shll $14,%esi
+ movl %edx,%eax
+ shll $14,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%eax
+ shll $4,%esi
+ xorl %edx,%ebx
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $4,%ecx
+ xorl %edi,%eax
+ shrl $4,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 48(%esp),%ecx
+ movl 52(%esp),%edx
+ movl 56(%esp),%esi
+ movl 60(%esp),%edi
+ addl 64(%esp),%eax
+ adcl 68(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ andl 40(%esp),%ecx
+ andl 44(%esp),%edx
+ addl 192(%esp),%eax
+ adcl 196(%esp),%ebx
+ xorl %esi,%ecx
+ xorl %edi,%edx
+ movl (%ebp),%esi
+ movl 4(%ebp),%edi
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 32(%esp),%ecx
+ movl 36(%esp),%edx
+ addl %esi,%eax
+ adcl %edi,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl %eax,32(%esp)
+ movl %ebx,36(%esp)
+ movl %ecx,%esi
+ shrl $2,%ecx
+ movl %edx,%edi
+ shrl $2,%edx
+ movl %ecx,%ebx
+ shll $4,%esi
+ movl %edx,%eax
+ shll $4,%edi
+ xorl %esi,%ebx
+ shrl $5,%ecx
+ xorl %edi,%eax
+ shrl $5,%edx
+ xorl %ecx,%ebx
+ shll $21,%esi
+ xorl %edx,%eax
+ shll $21,%edi
+ xorl %esi,%eax
+ shrl $21,%ecx
+ xorl %edi,%ebx
+ shrl $21,%edx
+ xorl %ecx,%eax
+ shll $5,%esi
+ xorl %edx,%ebx
+ shll $5,%edi
+ xorl %esi,%eax
+ xorl %edi,%ebx
+ movl 8(%esp),%ecx
+ movl 12(%esp),%edx
+ movl 16(%esp),%esi
+ movl 20(%esp),%edi
+ addl (%esp),%eax
+ adcl 4(%esp),%ebx
+ orl %esi,%ecx
+ orl %edi,%edx
+ andl 24(%esp),%ecx
+ andl 28(%esp),%edx
+ andl 8(%esp),%esi
+ andl 12(%esp),%edi
+ orl %esi,%ecx
+ orl %edi,%edx
+ addl %ecx,%eax
+ adcl %edx,%ebx
+ movl %eax,(%esp)
+ movl %ebx,4(%esp)
+ movb (%ebp),%dl
+ subl $8,%esp
+ leal 8(%ebp),%ebp
+ cmpb $23,%dl
+ jne .L01016_79_x86
+ movl 840(%esp),%esi
+ movl 844(%esp),%edi
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%edx
+ addl 8(%esp),%eax
+ adcl 12(%esp),%ebx
+ movl %eax,(%esi)
+ movl %ebx,4(%esi)
+ addl 16(%esp),%ecx
+ adcl 20(%esp),%edx
+ movl %ecx,8(%esi)
+ movl %edx,12(%esi)
+ movl 16(%esi),%eax
+ movl 20(%esi),%ebx
+ movl 24(%esi),%ecx
+ movl 28(%esi),%edx
+ addl 24(%esp),%eax
+ adcl 28(%esp),%ebx
+ movl %eax,16(%esi)
+ movl %ebx,20(%esi)
+ addl 32(%esp),%ecx
+ adcl 36(%esp),%edx
+ movl %ecx,24(%esi)
+ movl %edx,28(%esi)
+ movl 32(%esi),%eax
+ movl 36(%esi),%ebx
+ movl 40(%esi),%ecx
+ movl 44(%esi),%edx
+ addl 40(%esp),%eax
+ adcl 44(%esp),%ebx
+ movl %eax,32(%esi)
+ movl %ebx,36(%esi)
+ addl 48(%esp),%ecx
+ adcl 52(%esp),%edx
+ movl %ecx,40(%esi)
+ movl %edx,44(%esi)
+ movl 48(%esi),%eax
+ movl 52(%esi),%ebx
+ movl 56(%esi),%ecx
+ movl 60(%esi),%edx
+ addl 56(%esp),%eax
+ adcl 60(%esp),%ebx
+ movl %eax,48(%esi)
+ movl %ebx,52(%esi)
+ addl 64(%esp),%ecx
+ adcl 68(%esp),%edx
+ movl %ecx,56(%esi)
+ movl %edx,60(%esi)
+ addl $840,%esp
+ subl $640,%ebp
+ cmpl 8(%esp),%edi
+ jb .L002loop_x86
+ movl 12(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.align 64
+.L001K512:
+.long 3609767458,1116352408
+.long 602891725,1899447441
+.long 3964484399,3049323471
+.long 2173295548,3921009573
+.long 4081628472,961987163
+.long 3053834265,1508970993
+.long 2937671579,2453635748
+.long 3664609560,2870763221
+.long 2734883394,3624381080
+.long 1164996542,310598401
+.long 1323610764,607225278
+.long 3590304994,1426881987
+.long 4068182383,1925078388
+.long 991336113,2162078206
+.long 633803317,2614888103
+.long 3479774868,3248222580
+.long 2666613458,3835390401
+.long 944711139,4022224774
+.long 2341262773,264347078
+.long 2007800933,604807628
+.long 1495990901,770255983
+.long 1856431235,1249150122
+.long 3175218132,1555081692
+.long 2198950837,1996064986
+.long 3999719339,2554220882
+.long 766784016,2821834349
+.long 2566594879,2952996808
+.long 3203337956,3210313671
+.long 1034457026,3336571891
+.long 2466948901,3584528711
+.long 3758326383,113926993
+.long 168717936,338241895
+.long 1188179964,666307205
+.long 1546045734,773529912
+.long 1522805485,1294757372
+.long 2643833823,1396182291
+.long 2343527390,1695183700
+.long 1014477480,1986661051
+.long 1206759142,2177026350
+.long 344077627,2456956037
+.long 1290863460,2730485921
+.long 3158454273,2820302411
+.long 3505952657,3259730800
+.long 106217008,3345764771
+.long 3606008344,3516065817
+.long 1432725776,3600352804
+.long 1467031594,4094571909
+.long 851169720,275423344
+.long 3100823752,430227734
+.long 1363258195,506948616
+.long 3750685593,659060556
+.long 3785050280,883997877
+.long 3318307427,958139571
+.long 3812723403,1322822218
+.long 2003034995,1537002063
+.long 3602036899,1747873779
+.long 1575990012,1955562222
+.long 1125592928,2024104815
+.long 2716904306,2227730452
+.long 442776044,2361852424
+.long 593698344,2428436474
+.long 3733110249,2756734187
+.long 2999351573,3204031479
+.long 3815920427,3329325298
+.long 3928383900,3391569614
+.long 566280711,3515267271
+.long 3454069534,3940187606
+.long 4000239992,4118630271
+.long 1914138554,116418474
+.long 2731055270,174292421
+.long 3203993006,289380356
+.long 320620315,460393269
+.long 587496836,685471733
+.long 1086792851,852142971
+.long 365543100,1017036298
+.long 2618297676,1126000580
+.long 3409855158,1288033470
+.long 4234509866,1501505948
+.long 987167468,1607167915
+.long 1246189591,1816402316
+.long 67438087,66051
+.long 202182159,134810123
+.size sha512_block_data_order,.-.L_sha512_block_data_order_begin
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97
+.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32
+.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97
+.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103
+.byte 62,0
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S
@@ -1,0 +1,708 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+#ifdef BORINGSSL_DISPATCH_TEST
+#endif
+.align 64
+.L_vpaes_consts:
+.long 218628480,235210255,168496130,67568393
+.long 252381056,17041926,33884169,51187212
+.long 252645135,252645135,252645135,252645135
+.long 1512730624,3266504856,1377990664,3401244816
+.long 830229760,1275146365,2969422977,3447763452
+.long 3411033600,2979783055,338359620,2782886510
+.long 4209124096,907596821,221174255,1006095553
+.long 191964160,3799684038,3164090317,1589111125
+.long 182528256,1777043520,2877432650,3265356744
+.long 1874708224,3503451415,3305285752,363511674
+.long 1606117888,3487855781,1093350906,2384367825
+.long 197121,67569157,134941193,202313229
+.long 67569157,134941193,202313229,197121
+.long 134941193,202313229,197121,67569157
+.long 202313229,197121,67569157,134941193
+.long 33619971,100992007,168364043,235736079
+.long 235736079,33619971,100992007,168364043
+.long 168364043,235736079,33619971,100992007
+.long 100992007,168364043,235736079,33619971
+.long 50462976,117835012,185207048,252579084
+.long 252314880,51251460,117574920,184942860
+.long 184682752,252054788,50987272,118359308
+.long 118099200,185467140,251790600,50727180
+.long 2946363062,528716217,1300004225,1881839624
+.long 1532713819,1532713819,1532713819,1532713819
+.long 3602276352,4288629033,3737020424,4153884961
+.long 1354558464,32357713,2958822624,3775749553
+.long 1201988352,132424512,1572796698,503232858
+.long 2213177600,1597421020,4103937655,675398315
+.long 2749646592,4273543773,1511898873,121693092
+.long 3040248576,1103263732,2871565598,1608280554
+.long 2236667136,2588920351,482954393,64377734
+.long 3069987328,291237287,2117370568,3650299247
+.long 533321216,3573750986,2572112006,1401264716
+.long 1339849704,2721158661,548607111,3445553514
+.long 2128193280,3054596040,2183486460,1257083700
+.long 655635200,1165381986,3923443150,2344132524
+.long 190078720,256924420,290342170,357187870
+.long 1610966272,2263057382,4103205268,309794674
+.long 2592527872,2233205587,1335446729,3402964816
+.long 3973531904,3225098121,3002836325,1918774430
+.long 3870401024,2102906079,2284471353,4117666579
+.long 617007872,1021508343,366931923,691083277
+.long 2528395776,3491914898,2968704004,1613121270
+.long 3445188352,3247741094,844474987,4093578302
+.long 651481088,1190302358,1689581232,574775300
+.long 4289380608,206939853,2555985458,2489840491
+.long 2130264064,327674451,3566485037,3349835193
+.long 2470714624,316102159,3636825756,3393945945
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105
+.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83
+.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117
+.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105
+.byte 118,101,114,115,105,116,121,41,0
+.align 64
+.hidden _vpaes_preheat
+.type _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+ addl (%esp),%ebp
+ movdqa -48(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm6
+ ret
+.size _vpaes_preheat,.-_vpaes_preheat
+.hidden _vpaes_encrypt_core
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+ movl $16,%ecx
+ movl 240(%edx),%eax
+ movdqa %xmm6,%xmm1
+ movdqa (%ebp),%xmm2
+ pandn %xmm0,%xmm1
+ pand %xmm6,%xmm0
+ movdqu (%edx),%xmm5
+.byte 102,15,56,0,208
+ movdqa 16(%ebp),%xmm0
+ pxor %xmm5,%xmm2
+ psrld $4,%xmm1
+ addl $16,%edx
+.byte 102,15,56,0,193
+ leal 192(%ebp),%ebx
+ pxor %xmm2,%xmm0
+ jmp .L000enc_entry
+.align 16
+.L001enc_loop:
+ movdqa 32(%ebp),%xmm4
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,226
+.byte 102,15,56,0,195
+ pxor %xmm5,%xmm4
+ movdqa 64(%ebp),%xmm5
+ pxor %xmm4,%xmm0
+ movdqa -64(%ebx,%ecx,1),%xmm1
+.byte 102,15,56,0,234
+ movdqa 80(%ebp),%xmm2
+ movdqa (%ebx,%ecx,1),%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
+.byte 102,15,56,0,193
+ addl $16,%edx
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addl $16,%ecx
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ subl $1,%eax
+ pxor %xmm3,%xmm0
+.L000enc_entry:
+ movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm5
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,232
+ movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm7,%xmm4
+ pxor %xmm5,%xmm3
+.byte 102,15,56,0,224
+ movdqa %xmm7,%xmm2
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm7,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%edx),%xmm5
+ pxor %xmm1,%xmm3
+ jnz .L001enc_loop
+ movdqa 96(%ebp),%xmm4
+ movdqa 112(%ebp),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%ebx,%ecx,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ ret
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+.hidden _vpaes_decrypt_core
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+ leal 608(%ebp),%ebx
+ movl 240(%edx),%eax
+ movdqa %xmm6,%xmm1
+ movdqa -64(%ebx),%xmm2
+ pandn %xmm0,%xmm1
+ movl %eax,%ecx
+ psrld $4,%xmm1
+ movdqu (%edx),%xmm5
+ shll $4,%ecx
+ pand %xmm6,%xmm0
+.byte 102,15,56,0,208
+ movdqa -48(%ebx),%xmm0
+ xorl $48,%ecx
+.byte 102,15,56,0,193
+ andl $48,%ecx
+ pxor %xmm5,%xmm2
+ movdqa 176(%ebp),%xmm5
+ pxor %xmm2,%xmm0
+ addl $16,%edx
+ leal -352(%ebx,%ecx,1),%ecx
+ jmp .L002dec_entry
+.align 16
+.L003dec_loop:
+ movdqa -32(%ebx),%xmm4
+ movdqa -16(%ebx),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa (%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 16(%ebx),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 32(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%ebx),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 64(%ebx),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%ebx),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ addl $16,%edx
+.byte 102,15,58,15,237,12
+ pxor %xmm1,%xmm0
+ subl $1,%eax
+.L002dec_entry:
+ movdqa %xmm6,%xmm1
+ movdqa -32(%ebp),%xmm2
+ pandn %xmm0,%xmm1
+ pand %xmm6,%xmm0
+ psrld $4,%xmm1
+.byte 102,15,56,0,208
+ movdqa %xmm7,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm7,%xmm4
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm7,%xmm2
+.byte 102,15,56,0,211
+ movdqa %xmm7,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%edx),%xmm0
+ pxor %xmm1,%xmm3
+ jnz .L003dec_loop
+ movdqa 96(%ebx),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%ebx),%xmm0
+ movdqa (%ecx),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ ret
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+.hidden _vpaes_schedule_core
+.type _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+ addl (%esp),%ebp
+ movdqu (%esi),%xmm0
+ movdqa 320(%ebp),%xmm2
+ movdqa %xmm0,%xmm3
+ leal (%ebp),%ebx
+ movdqa %xmm2,4(%esp)
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+ testl %edi,%edi
+ jnz .L004schedule_am_decrypting
+ movdqu %xmm0,(%edx)
+ jmp .L005schedule_go
+.L004schedule_am_decrypting:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%edx)
+ xorl $48,%ecx
+.L005schedule_go:
+ cmpl $192,%eax
+ ja .L006schedule_256
+ je .L007schedule_192
+.L008schedule_128:
+ movl $10,%eax
+.L009loop_schedule_128:
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp .L009loop_schedule_128
+.align 16
+.L007schedule_192:
+ movdqu 8(%esi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%eax
+.L011loop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp .L011loop_schedule_192
+.align 16
+.L006schedule_256:
+ movdqu 16(%esi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%eax
+.L012loop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+ call _vpaes_schedule_round
+ decl %eax
+ jz .L010schedule_mangle_last
+ call _vpaes_schedule_mangle
+ pshufd $255,%xmm0,%xmm0
+ movdqa %xmm7,20(%esp)
+ movdqa %xmm6,%xmm7
+ call .L_vpaes_schedule_low_round
+ movdqa 20(%esp),%xmm7
+ jmp .L012loop_schedule_256
+.align 16
+.L010schedule_mangle_last:
+ leal 384(%ebp),%ebx
+ testl %edi,%edi
+ jnz .L013schedule_mangle_last_dec
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,193
+ leal 352(%ebp),%ebx
+ addl $32,%edx
+.L013schedule_mangle_last_dec:
+ addl $-16,%edx
+ pxor 336(%ebp),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%edx)
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ ret
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+.hidden _vpaes_schedule_192_smear
+.type _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+ pshufd $128,%xmm6,%xmm1
+ pshufd $254,%xmm7,%xmm0
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ movhlps %xmm1,%xmm6
+ ret
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+.hidden _vpaes_schedule_round
+.type _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+ movdqa 8(%esp),%xmm2
+ pxor %xmm1,%xmm1
+.byte 102,15,58,15,202,15
+.byte 102,15,58,15,210,15
+ pxor %xmm1,%xmm7
+ pshufd $255,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+ movdqa %xmm2,8(%esp)
+.L_vpaes_schedule_low_round:
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor 336(%ebp),%xmm7
+ movdqa -16(%ebp),%xmm4
+ movdqa -48(%ebp),%xmm5
+ movdqa %xmm4,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm4,%xmm0
+ movdqa -32(%ebp),%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm5,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm5,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm5,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa 32(%ebp),%xmm4
+.byte 102,15,56,0,226
+ movdqa 48(%ebp),%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ ret
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+.hidden _vpaes_schedule_transform
+.type _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+ movdqa -16(%ebp),%xmm2
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+ movdqa (%ebx),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%ebx),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ ret
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+.hidden _vpaes_schedule_mangle
+.type _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+ movdqa %xmm0,%xmm4
+ movdqa 128(%ebp),%xmm5
+ testl %edi,%edi
+ jnz .L014schedule_mangle_dec
+ addl $16,%edx
+ pxor 336(%ebp),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+ jmp .L015schedule_mangle_both
+.align 16
+.L014schedule_mangle_dec:
+ movdqa -16(%ebp),%xmm2
+ leal 416(%ebp),%esi
+ movdqa %xmm2,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm4
+ movdqa (%esi),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 32(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 64(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+ movdqa 96(%esi),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%esi),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ addl $-16,%edx
+.L015schedule_mangle_both:
+ movdqa 256(%ebp,%ecx,1),%xmm1
+.byte 102,15,56,0,217
+ addl $-16,%ecx
+ andl $48,%ecx
+ movdqu %xmm3,(%edx)
+ ret
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+.globl vpaes_set_encrypt_key
+.hidden vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+.L_vpaes_set_encrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call .L016pic
+.L016pic:
+ popl %ebx
+ leal BORINGSSL_function_hit+5-.L016pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ movl $48,%ecx
+ movl $0,%edi
+ leal .L_vpaes_consts+0x30-.L017pic_point,%ebp
+ call _vpaes_schedule_core
+.L017pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin
+.globl vpaes_set_decrypt_key
+.hidden vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+.L_vpaes_set_decrypt_key_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%eax
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movl %eax,%ebx
+ shrl $5,%ebx
+ addl $5,%ebx
+ movl %ebx,240(%edx)
+ shll $4,%ebx
+ leal 16(%edx,%ebx,1),%edx
+ movl $1,%edi
+ movl %eax,%ecx
+ shrl $1,%ecx
+ andl $32,%ecx
+ xorl $32,%ecx
+ leal .L_vpaes_consts+0x30-.L018pic_point,%ebp
+ call _vpaes_schedule_core
+.L018pic_point:
+ movl 48(%esp),%esp
+ xorl %eax,%eax
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_set_decrypt_key,.-.L_vpaes_set_decrypt_key_begin
+.globl vpaes_encrypt
+.hidden vpaes_encrypt
+.type vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+.L_vpaes_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+#ifdef BORINGSSL_DISPATCH_TEST
+ pushl %ebx
+ pushl %edx
+ call .L019pic
+.L019pic:
+ popl %ebx
+ leal BORINGSSL_function_hit+4-.L019pic(%ebx),%ebx
+ movl $1,%edx
+ movb %dl,(%ebx)
+ popl %edx
+ popl %ebx
+#endif
+ leal .L_vpaes_consts+0x30-.L020pic_point,%ebp
+ call _vpaes_preheat
+.L020pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_encrypt,.-.L_vpaes_encrypt_begin
+.globl vpaes_decrypt
+.hidden vpaes_decrypt
+.type vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+.L_vpaes_decrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ leal .L_vpaes_consts+0x30-.L021pic_point,%ebp
+ call _vpaes_preheat
+.L021pic_point:
+ movl 20(%esp),%esi
+ leal -56(%esp),%ebx
+ movl 24(%esp),%edi
+ andl $-16,%ebx
+ movl 28(%esp),%edx
+ xchgl %esp,%ebx
+ movl %ebx,48(%esp)
+ movdqu (%esi),%xmm0
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%edi)
+ movl 48(%esp),%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_decrypt,.-.L_vpaes_decrypt_begin
+.globl vpaes_cbc_encrypt
+.hidden vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+.L_vpaes_cbc_encrypt_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 20(%esp),%esi
+ movl 24(%esp),%edi
+ movl 28(%esp),%eax
+ movl 32(%esp),%edx
+ subl $16,%eax
+ jc .L022cbc_abort
+ leal -56(%esp),%ebx
+ movl 36(%esp),%ebp
+ andl $-16,%ebx
+ movl 40(%esp),%ecx
+ xchgl %esp,%ebx
+ movdqu (%ebp),%xmm1
+ subl %esi,%edi
+ movl %ebx,48(%esp)
+ movl %edi,(%esp)
+ movl %edx,4(%esp)
+ movl %ebp,8(%esp)
+ movl %eax,%edi
+ leal .L_vpaes_consts+0x30-.L023pic_point,%ebp
+ call _vpaes_preheat
+.L023pic_point:
+ cmpl $0,%ecx
+ je .L024cbc_dec_loop
+ jmp .L025cbc_enc_loop
+.align 16
+.L025cbc_enc_loop:
+ movdqu (%esi),%xmm0
+ pxor %xmm1,%xmm0
+ call _vpaes_encrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ movdqa %xmm0,%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L025cbc_enc_loop
+ jmp .L026cbc_done
+.align 16
+.L024cbc_dec_loop:
+ movdqu (%esi),%xmm0
+ movdqa %xmm1,16(%esp)
+ movdqa %xmm0,32(%esp)
+ call _vpaes_decrypt_core
+ movl (%esp),%ebx
+ movl 4(%esp),%edx
+ pxor 16(%esp),%xmm0
+ movdqa 32(%esp),%xmm1
+ movdqu %xmm0,(%ebx,%esi,1)
+ leal 16(%esi),%esi
+ subl $16,%edi
+ jnc .L024cbc_dec_loop
+.L026cbc_done:
+ movl 8(%esp),%ebx
+ movl 48(%esp),%esp
+ movdqu %xmm1,(%ebx)
+.L022cbc_abort:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S
@@ -1,0 +1,484 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl bn_mul_mont
+.hidden bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+.L_bn_mul_mont_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ xorl %eax,%eax
+ movl 40(%esp),%edi
+ cmpl $4,%edi
+ jl .L000just_leave
+ leal 20(%esp),%esi
+ leal 24(%esp),%edx
+ addl $2,%edi
+ negl %edi
+ leal -32(%esp,%edi,4),%ebp
+ negl %edi
+ movl %ebp,%eax
+ subl %edx,%eax
+ andl $2047,%eax
+ subl %eax,%ebp
+ xorl %ebp,%edx
+ andl $2048,%edx
+ xorl $2048,%edx
+ subl %edx,%ebp
+ andl $-64,%ebp
+ movl %esp,%eax
+ subl %ebp,%eax
+ andl $-4096,%eax
+ movl %esp,%edx
+ leal (%ebp,%eax,1),%esp
+ movl (%esp),%eax
+ cmpl %ebp,%esp
+ ja .L001page_walk
+ jmp .L002page_walk_done
+.align 16
+.L001page_walk:
+ leal -4096(%esp),%esp
+ movl (%esp),%eax
+ cmpl %ebp,%esp
+ ja .L001page_walk
+.L002page_walk_done:
+ movl (%esi),%eax
+ movl 4(%esi),%ebx
+ movl 8(%esi),%ecx
+ movl 12(%esi),%ebp
+ movl 16(%esi),%esi
+ movl (%esi),%esi
+ movl %eax,4(%esp)
+ movl %ebx,8(%esp)
+ movl %ecx,12(%esp)
+ movl %ebp,16(%esp)
+ movl %esi,20(%esp)
+ leal -3(%edi),%ebx
+ movl %edx,24(%esp)
+ call .L003PIC_me_up
+.L003PIC_me_up:
+ popl %eax
+ leal OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax
+ btl $26,(%eax)
+ jnc .L004non_sse2
+ movl $-1,%eax
+ movd %eax,%mm7
+ movl 8(%esp),%esi
+ movl 12(%esp),%edi
+ movl 16(%esp),%ebp
+ xorl %edx,%edx
+ xorl %ecx,%ecx
+ movd (%edi),%mm4
+ movd (%esi),%mm5
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ movq %mm5,%mm2
+ movq %mm5,%mm0
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ incl %ecx
+.align 16
+.L0051st:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ leal 1(%ecx),%ecx
+ cmpl %ebx,%ecx
+ jl .L0051st
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm2,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ incl %edx
+.L006outer:
+ xorl %ecx,%ecx
+ movd (%edi,%edx,4),%mm4
+ movd (%esi),%mm5
+ movd 32(%esp),%mm6
+ movd (%ebp),%mm3
+ pmuludq %mm4,%mm5
+ paddq %mm6,%mm5
+ movq %mm5,%mm0
+ movq %mm5,%mm2
+ pand %mm7,%mm0
+ pmuludq 20(%esp),%mm5
+ pmuludq %mm5,%mm3
+ paddq %mm0,%mm3
+ movd 36(%esp),%mm6
+ movd 4(%ebp),%mm1
+ movd 4(%esi),%mm0
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ incl %ecx
+ decl %ebx
+.L007inner:
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ movd 36(%esp,%ecx,4),%mm6
+ pand %mm7,%mm0
+ movd 4(%ebp,%ecx,4),%mm1
+ paddq %mm0,%mm3
+ movd 4(%esi,%ecx,4),%mm0
+ psrlq $32,%mm2
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm3
+ paddq %mm6,%mm2
+ decl %ebx
+ leal 1(%ecx),%ecx
+ jnz .L007inner
+ movl %ecx,%ebx
+ pmuludq %mm4,%mm0
+ pmuludq %mm5,%mm1
+ paddq %mm0,%mm2
+ paddq %mm1,%mm3
+ movq %mm2,%mm0
+ pand %mm7,%mm0
+ paddq %mm0,%mm3
+ movd %mm3,28(%esp,%ecx,4)
+ psrlq $32,%mm2
+ psrlq $32,%mm3
+ movd 36(%esp,%ebx,4),%mm6
+ paddq %mm2,%mm3
+ paddq %mm6,%mm3
+ movq %mm3,32(%esp,%ebx,4)
+ leal 1(%edx),%edx
+ cmpl %ebx,%edx
+ jle .L006outer
+ emms
+ jmp .L008common_tail
+.align 16
+.L004non_sse2:
+ movl 8(%esp),%esi
+ leal 1(%ebx),%ebp
+ movl 12(%esp),%edi
+ xorl %ecx,%ecx
+ movl %esi,%edx
+ andl $1,%ebp
+ subl %edi,%edx
+ leal 4(%edi,%ebx,4),%eax
+ orl %edx,%ebp
+ movl (%edi),%edi
+ jz .L009bn_sqr_mont
+ movl %eax,28(%esp)
+ movl (%esi),%eax
+ xorl %edx,%edx
+.align 16
+.L010mull:
+ movl %edx,%ebp
+ mull %edi
+ addl %eax,%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ movl (%esi,%ecx,4),%eax
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L010mull
+ movl %edx,%ebp
+ mull %edi
+ movl 20(%esp),%edi
+ addl %ebp,%eax
+ movl 16(%esp),%esi
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ movl %eax,32(%esp,%ebx,4)
+ xorl %ecx,%ecx
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ movl (%esi),%eax
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ incl %ecx
+ jmp .L0112ndmadd
+.align 16
+.L0121stmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L0121stmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ addl %eax,%ebp
+ adcl $0,%edx
+ imull 32(%esp),%edi
+ xorl %ecx,%ecx
+ addl 36(%esp,%ebx,4),%edx
+ movl %ebp,32(%esp,%ebx,4)
+ adcl $0,%ecx
+ movl (%esi),%eax
+ movl %edx,36(%esp,%ebx,4)
+ movl %ecx,40(%esp,%ebx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl 4(%esi),%eax
+ adcl $0,%edx
+ movl $1,%ecx
+.align 16
+.L0112ndmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl .L0112ndmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ xorl %eax,%eax
+ movl 12(%esp),%ecx
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ leal 4(%ecx),%ecx
+ movl %edx,32(%esp,%ebx,4)
+ cmpl 28(%esp),%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je .L008common_tail
+ movl (%ecx),%edi
+ movl 8(%esp),%esi
+ movl %ecx,12(%esp)
+ xorl %ecx,%ecx
+ xorl %edx,%edx
+ movl (%esi),%eax
+ jmp .L0121stmadd
+.align 16
+.L009bn_sqr_mont:
+ movl %ebx,(%esp)
+ movl %ecx,12(%esp)
+ movl %edi,%eax
+ mull %edi
+ movl %eax,32(%esp)
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+ incl %ecx
+.align 16
+.L013sqr:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal 1(%ecx),%ecx
+ adcl $0,%edx
+ leal (%ebx,%eax,2),%ebp
+ shrl $31,%eax
+ cmpl (%esp),%ecx
+ movl %eax,%ebx
+ movl %ebp,28(%esp,%ecx,4)
+ jl .L013sqr
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ movl 20(%esp),%edi
+ adcl $0,%edx
+ movl 16(%esp),%esi
+ leal (%ebx,%eax,2),%ebp
+ imull 32(%esp),%edi
+ shrl $31,%eax
+ movl %ebp,32(%esp,%ecx,4)
+ leal (%eax,%edx,2),%ebp
+ movl (%esi),%eax
+ shrl $31,%edx
+ movl %ebp,36(%esp,%ecx,4)
+ movl %edx,40(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ movl %ecx,%ebx
+ adcl $0,%edx
+ movl 4(%esi),%eax
+ movl $1,%ecx
+.align 16
+.L0143rdmadd:
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ecx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl 4(%esi,%ecx,4),%eax
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %edx,%ebp
+ mull %edi
+ addl 36(%esp,%ecx,4),%ebp
+ leal 2(%ecx),%ecx
+ adcl $0,%edx
+ addl %eax,%ebp
+ movl (%esi,%ecx,4),%eax
+ adcl $0,%edx
+ cmpl %ebx,%ecx
+ movl %ebp,24(%esp,%ecx,4)
+ jl .L0143rdmadd
+ movl %edx,%ebp
+ mull %edi
+ addl 32(%esp,%ebx,4),%ebp
+ adcl $0,%edx
+ addl %eax,%ebp
+ adcl $0,%edx
+ movl %ebp,28(%esp,%ebx,4)
+ movl 12(%esp),%ecx
+ xorl %eax,%eax
+ movl 8(%esp),%esi
+ addl 36(%esp,%ebx,4),%edx
+ adcl 40(%esp,%ebx,4),%eax
+ movl %edx,32(%esp,%ebx,4)
+ cmpl %ebx,%ecx
+ movl %eax,36(%esp,%ebx,4)
+ je .L008common_tail
+ movl 4(%esi,%ecx,4),%edi
+ leal 1(%ecx),%ecx
+ movl %edi,%eax
+ movl %ecx,12(%esp)
+ mull %edi
+ addl 32(%esp,%ecx,4),%eax
+ adcl $0,%edx
+ movl %eax,32(%esp,%ecx,4)
+ xorl %ebp,%ebp
+ cmpl %ebx,%ecx
+ leal 1(%ecx),%ecx
+ je .L015sqrlast
+ movl %edx,%ebx
+ shrl $1,%edx
+ andl $1,%ebx
+.align 16
+.L016sqradd:
+ movl (%esi,%ecx,4),%eax
+ movl %edx,%ebp
+ mull %edi
+ addl %ebp,%eax
+ leal (%eax,%eax,1),%ebp
+ adcl $0,%edx
+ shrl $31,%eax
+ addl 32(%esp,%ecx,4),%ebp
+ leal 1(%ecx),%ecx
+ adcl $0,%eax
+ addl %ebx,%ebp
+ adcl $0,%eax
+ cmpl (%esp),%ecx
+ movl %ebp,28(%esp,%ecx,4)
+ movl %eax,%ebx
+ jle .L016sqradd
+ movl %edx,%ebp
+ addl %edx,%edx
+ shrl $31,%ebp
+ addl %ebx,%edx
+ adcl $0,%ebp
+.L015sqrlast:
+ movl 20(%esp),%edi
+ movl 16(%esp),%esi
+ imull 32(%esp),%edi
+ addl 32(%esp,%ecx,4),%edx
+ movl (%esi),%eax
+ adcl $0,%ebp
+ movl %edx,32(%esp,%ecx,4)
+ movl %ebp,36(%esp,%ecx,4)
+ mull %edi
+ addl 32(%esp),%eax
+ leal -1(%ecx),%ebx
+ adcl $0,%edx
+ movl $1,%ecx
+ movl 4(%esi),%eax
+ jmp .L0143rdmadd
+.align 16
+.L008common_tail:
+ movl 16(%esp),%ebp
+ movl 4(%esp),%edi
+ leal 32(%esp),%esi
+ movl (%esi),%eax
+ movl %ebx,%ecx
+ xorl %edx,%edx
+.align 16
+.L017sub:
+ sbbl (%ebp,%edx,4),%eax
+ movl %eax,(%edi,%edx,4)
+ decl %ecx
+ movl 4(%esi,%edx,4),%eax
+ leal 1(%edx),%edx
+ jge .L017sub
+ sbbl $0,%eax
+ movl $-1,%edx
+ xorl %eax,%edx
+ jmp .L018copy
+.align 16
+.L018copy:
+ movl 32(%esp,%ebx,4),%esi
+ movl (%edi,%ebx,4),%ebp
+ movl %ecx,32(%esp,%ebx,4)
+ andl %eax,%esi
+ andl %edx,%ebp
+ orl %esi,%ebp
+ movl %ebp,(%edi,%ebx,4)
+ decl %ebx
+ jge .L018copy
+ movl 24(%esp),%esp
+ movl $1,%eax
+.L000just_leave:
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size bn_mul_mont,.-.L_bn_mul_mont_begin
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105
+.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56
+.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121
+.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46
+.byte 111,114,103,62,0
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86/crypto/test/trampoline-x86.S
@@ -1,0 +1,206 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__i386__)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.globl abi_test_trampoline
+.hidden abi_test_trampoline
+.type abi_test_trampoline,@function
+.align 16
+abi_test_trampoline:
+.L_abi_test_trampoline_begin:
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ movl 24(%esp),%ecx
+ movl (%ecx),%esi
+ movl 4(%ecx),%edi
+ movl 8(%ecx),%ebx
+ movl 12(%ecx),%ebp
+ subl $44,%esp
+ movl 72(%esp),%eax
+ xorl %ecx,%ecx
+.L000loop:
+ cmpl 76(%esp),%ecx
+ jae .L001loop_done
+ movl (%eax,%ecx,4),%edx
+ movl %edx,(%esp,%ecx,4)
+ addl $1,%ecx
+ jmp .L000loop
+.L001loop_done:
+ call *64(%esp)
+ addl $44,%esp
+ movl 24(%esp),%ecx
+ movl %esi,(%ecx)
+ movl %edi,4(%ecx)
+ movl %ebx,8(%ecx)
+ movl %ebp,12(%ecx)
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+.size abi_test_trampoline,.-.L_abi_test_trampoline_begin
+.globl abi_test_get_and_clear_direction_flag
+.hidden abi_test_get_and_clear_direction_flag
+.type abi_test_get_and_clear_direction_flag,@function
+.align 16
+abi_test_get_and_clear_direction_flag:
+.L_abi_test_get_and_clear_direction_flag_begin:
+ pushfl
+ popl %eax
+ andl $1024,%eax
+ shrl $10,%eax
+ cld
+ ret
+.size abi_test_get_and_clear_direction_flag,.-.L_abi_test_get_and_clear_direction_flag_begin
+.globl abi_test_set_direction_flag
+.hidden abi_test_set_direction_flag
+.type abi_test_set_direction_flag,@function
+.align 16
+abi_test_set_direction_flag:
+.L_abi_test_set_direction_flag_begin:
+ std
+ ret
+.size abi_test_set_direction_flag,.-.L_abi_test_set_direction_flag_begin
+.globl abi_test_clobber_eax
+.hidden abi_test_clobber_eax
+.type abi_test_clobber_eax,@function
+.align 16
+abi_test_clobber_eax:
+.L_abi_test_clobber_eax_begin:
+ xorl %eax,%eax
+ ret
+.size abi_test_clobber_eax,.-.L_abi_test_clobber_eax_begin
+.globl abi_test_clobber_ebx
+.hidden abi_test_clobber_ebx
+.type abi_test_clobber_ebx,@function
+.align 16
+abi_test_clobber_ebx:
+.L_abi_test_clobber_ebx_begin:
+ xorl %ebx,%ebx
+ ret
+.size abi_test_clobber_ebx,.-.L_abi_test_clobber_ebx_begin
+.globl abi_test_clobber_ecx
+.hidden abi_test_clobber_ecx
+.type abi_test_clobber_ecx,@function
+.align 16
+abi_test_clobber_ecx:
+.L_abi_test_clobber_ecx_begin:
+ xorl %ecx,%ecx
+ ret
+.size abi_test_clobber_ecx,.-.L_abi_test_clobber_ecx_begin
+.globl abi_test_clobber_edx
+.hidden abi_test_clobber_edx
+.type abi_test_clobber_edx,@function
+.align 16
+abi_test_clobber_edx:
+.L_abi_test_clobber_edx_begin:
+ xorl %edx,%edx
+ ret
+.size abi_test_clobber_edx,.-.L_abi_test_clobber_edx_begin
+.globl abi_test_clobber_edi
+.hidden abi_test_clobber_edi
+.type abi_test_clobber_edi,@function
+.align 16
+abi_test_clobber_edi:
+.L_abi_test_clobber_edi_begin:
+ xorl %edi,%edi
+ ret
+.size abi_test_clobber_edi,.-.L_abi_test_clobber_edi_begin
+.globl abi_test_clobber_esi
+.hidden abi_test_clobber_esi
+.type abi_test_clobber_esi,@function
+.align 16
+abi_test_clobber_esi:
+.L_abi_test_clobber_esi_begin:
+ xorl %esi,%esi
+ ret
+.size abi_test_clobber_esi,.-.L_abi_test_clobber_esi_begin
+.globl abi_test_clobber_ebp
+.hidden abi_test_clobber_ebp
+.type abi_test_clobber_ebp,@function
+.align 16
+abi_test_clobber_ebp:
+.L_abi_test_clobber_ebp_begin:
+ xorl %ebp,%ebp
+ ret
+.size abi_test_clobber_ebp,.-.L_abi_test_clobber_ebp_begin
+.globl abi_test_clobber_xmm0
+.hidden abi_test_clobber_xmm0
+.type abi_test_clobber_xmm0,@function
+.align 16
+abi_test_clobber_xmm0:
+.L_abi_test_clobber_xmm0_begin:
+ pxor %xmm0,%xmm0
+ ret
+.size abi_test_clobber_xmm0,.-.L_abi_test_clobber_xmm0_begin
+.globl abi_test_clobber_xmm1
+.hidden abi_test_clobber_xmm1
+.type abi_test_clobber_xmm1,@function
+.align 16
+abi_test_clobber_xmm1:
+.L_abi_test_clobber_xmm1_begin:
+ pxor %xmm1,%xmm1
+ ret
+.size abi_test_clobber_xmm1,.-.L_abi_test_clobber_xmm1_begin
+.globl abi_test_clobber_xmm2
+.hidden abi_test_clobber_xmm2
+.type abi_test_clobber_xmm2,@function
+.align 16
+abi_test_clobber_xmm2:
+.L_abi_test_clobber_xmm2_begin:
+ pxor %xmm2,%xmm2
+ ret
+.size abi_test_clobber_xmm2,.-.L_abi_test_clobber_xmm2_begin
+.globl abi_test_clobber_xmm3
+.hidden abi_test_clobber_xmm3
+.type abi_test_clobber_xmm3,@function
+.align 16
+abi_test_clobber_xmm3:
+.L_abi_test_clobber_xmm3_begin:
+ pxor %xmm3,%xmm3
+ ret
+.size abi_test_clobber_xmm3,.-.L_abi_test_clobber_xmm3_begin
+.globl abi_test_clobber_xmm4
+.hidden abi_test_clobber_xmm4
+.type abi_test_clobber_xmm4,@function
+.align 16
+abi_test_clobber_xmm4:
+.L_abi_test_clobber_xmm4_begin:
+ pxor %xmm4,%xmm4
+ ret
+.size abi_test_clobber_xmm4,.-.L_abi_test_clobber_xmm4_begin
+.globl abi_test_clobber_xmm5
+.hidden abi_test_clobber_xmm5
+.type abi_test_clobber_xmm5,@function
+.align 16
+abi_test_clobber_xmm5:
+.L_abi_test_clobber_xmm5_begin:
+ pxor %xmm5,%xmm5
+ ret
+.size abi_test_clobber_xmm5,.-.L_abi_test_clobber_xmm5_begin
+.globl abi_test_clobber_xmm6
+.hidden abi_test_clobber_xmm6
+.type abi_test_clobber_xmm6,@function
+.align 16
+abi_test_clobber_xmm6:
+.L_abi_test_clobber_xmm6_begin:
+ pxor %xmm6,%xmm6
+ ret
+.size abi_test_clobber_xmm6,.-.L_abi_test_clobber_xmm6_begin
+.globl abi_test_clobber_xmm7
+.hidden abi_test_clobber_xmm7
+.type abi_test_clobber_xmm7,@function
+.align 16
+abi_test_clobber_xmm7:
+.L_abi_test_clobber_xmm7_begin:
+ pxor %xmm7,%xmm7
+ ret
+.size abi_test_clobber_xmm7,.-.L_abi_test_clobber_xmm7_begin
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S
@@ -1,0 +1,1633 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+.align 64
+.Lzero:
+.long 0,0,0,0
+.Lone:
+.long 1,0,0,0
+.Linc:
+.long 0,1,2,3
+.Lfour:
+.long 4,4,4,4
+.Lincy:
+.long 0,2,4,6,1,3,5,7
+.Leight:
+.long 8,8,8,8,8,8,8,8
+.Lrot16:
+.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
+.Lrot24:
+.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
+.Lsigma:
+.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
+.align 64
+.Lzeroz:
+.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
+.Lfourz:
+.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
+.Lincz:
+.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+.Lsixteen:
+.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
+.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.globl ChaCha20_ctr32
+.hidden ChaCha20_ctr32
+.type ChaCha20_ctr32,@function
+.align 64
+ChaCha20_ctr32:
+.cfi_startproc
+ cmpq $0,%rdx
+ je .Lno_data
+ movq OPENSSL_ia32cap_P+4(%rip),%r10
+ testl $512,%r10d
+ jnz .LChaCha20_ssse3
+
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbx,-16
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbp,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset r15,-56
+ subq $64+24,%rsp
+.cfi_adjust_cfa_offset 88
+.Lctr32_body:
+
+
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa .Lone(%rip),%xmm4
+
+
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movq %rdx,%rbp
+ jmp .Loop_outer
+
+.align 32
+.Loop_outer:
+ movl $0x61707865,%eax
+ movl $0x3320646e,%ebx
+ movl $0x79622d32,%ecx
+ movl $0x6b206574,%edx
+ movl 16(%rsp),%r8d
+ movl 20(%rsp),%r9d
+ movl 24(%rsp),%r10d
+ movl 28(%rsp),%r11d
+ movd %xmm3,%r12d
+ movl 52(%rsp),%r13d
+ movl 56(%rsp),%r14d
+ movl 60(%rsp),%r15d
+
+ movq %rbp,64+0(%rsp)
+ movl $10,%ebp
+ movq %rsi,64+8(%rsp)
+.byte 102,72,15,126,214
+ movq %rdi,64+16(%rsp)
+ movq %rsi,%rdi
+ shrq $32,%rdi
+ jmp .Loop
+
+.align 32
+.Loop:
+ addl %r8d,%eax
+ xorl %eax,%r12d
+ roll $16,%r12d
+ addl %r9d,%ebx
+ xorl %ebx,%r13d
+ roll $16,%r13d
+ addl %r12d,%esi
+ xorl %esi,%r8d
+ roll $12,%r8d
+ addl %r13d,%edi
+ xorl %edi,%r9d
+ roll $12,%r9d
+ addl %r8d,%eax
+ xorl %eax,%r12d
+ roll $8,%r12d
+ addl %r9d,%ebx
+ xorl %ebx,%r13d
+ roll $8,%r13d
+ addl %r12d,%esi
+ xorl %esi,%r8d
+ roll $7,%r8d
+ addl %r13d,%edi
+ xorl %edi,%r9d
+ roll $7,%r9d
+ movl %esi,32(%rsp)
+ movl %edi,36(%rsp)
+ movl 40(%rsp),%esi
+ movl 44(%rsp),%edi
+ addl %r10d,%ecx
+ xorl %ecx,%r14d
+ roll $16,%r14d
+ addl %r11d,%edx
+ xorl %edx,%r15d
+ roll $16,%r15d
+ addl %r14d,%esi
+ xorl %esi,%r10d
+ roll $12,%r10d
+ addl %r15d,%edi
+ xorl %edi,%r11d
+ roll $12,%r11d
+ addl %r10d,%ecx
+ xorl %ecx,%r14d
+ roll $8,%r14d
+ addl %r11d,%edx
+ xorl %edx,%r15d
+ roll $8,%r15d
+ addl %r14d,%esi
+ xorl %esi,%r10d
+ roll $7,%r10d
+ addl %r15d,%edi
+ xorl %edi,%r11d
+ roll $7,%r11d
+ addl %r9d,%eax
+ xorl %eax,%r15d
+ roll $16,%r15d
+ addl %r10d,%ebx
+ xorl %ebx,%r12d
+ roll $16,%r12d
+ addl %r15d,%esi
+ xorl %esi,%r9d
+ roll $12,%r9d
+ addl %r12d,%edi
+ xorl %edi,%r10d
+ roll $12,%r10d
+ addl %r9d,%eax
+ xorl %eax,%r15d
+ roll $8,%r15d
+ addl %r10d,%ebx
+ xorl %ebx,%r12d
+ roll $8,%r12d
+ addl %r15d,%esi
+ xorl %esi,%r9d
+ roll $7,%r9d
+ addl %r12d,%edi
+ xorl %edi,%r10d
+ roll $7,%r10d
+ movl %esi,40(%rsp)
+ movl %edi,44(%rsp)
+ movl 32(%rsp),%esi
+ movl 36(%rsp),%edi
+ addl %r11d,%ecx
+ xorl %ecx,%r13d
+ roll $16,%r13d
+ addl %r8d,%edx
+ xorl %edx,%r14d
+ roll $16,%r14d
+ addl %r13d,%esi
+ xorl %esi,%r11d
+ roll $12,%r11d
+ addl %r14d,%edi
+ xorl %edi,%r8d
+ roll $12,%r8d
+ addl %r11d,%ecx
+ xorl %ecx,%r13d
+ roll $8,%r13d
+ addl %r8d,%edx
+ xorl %edx,%r14d
+ roll $8,%r14d
+ addl %r13d,%esi
+ xorl %esi,%r11d
+ roll $7,%r11d
+ addl %r14d,%edi
+ xorl %edi,%r8d
+ roll $7,%r8d
+ decl %ebp
+ jnz .Loop
+ movl %edi,36(%rsp)
+ movl %esi,32(%rsp)
+ movq 64(%rsp),%rbp
+ movdqa %xmm2,%xmm1
+ movq 64+8(%rsp),%rsi
+ paddd %xmm4,%xmm3
+ movq 64+16(%rsp),%rdi
+
+ addl $0x61707865,%eax
+ addl $0x3320646e,%ebx
+ addl $0x79622d32,%ecx
+ addl $0x6b206574,%edx
+ addl 16(%rsp),%r8d
+ addl 20(%rsp),%r9d
+ addl 24(%rsp),%r10d
+ addl 28(%rsp),%r11d
+ addl 48(%rsp),%r12d
+ addl 52(%rsp),%r13d
+ addl 56(%rsp),%r14d
+ addl 60(%rsp),%r15d
+ paddd 32(%rsp),%xmm1
+
+ cmpq $64,%rbp
+ jb .Ltail
+
+ xorl 0(%rsi),%eax
+ xorl 4(%rsi),%ebx
+ xorl 8(%rsi),%ecx
+ xorl 12(%rsi),%edx
+ xorl 16(%rsi),%r8d
+ xorl 20(%rsi),%r9d
+ xorl 24(%rsi),%r10d
+ xorl 28(%rsi),%r11d
+ movdqu 32(%rsi),%xmm0
+ xorl 48(%rsi),%r12d
+ xorl 52(%rsi),%r13d
+ xorl 56(%rsi),%r14d
+ xorl 60(%rsi),%r15d
+ leaq 64(%rsi),%rsi
+ pxor %xmm1,%xmm0
+
+ movdqa %xmm2,32(%rsp)
+ movd %xmm3,48(%rsp)
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ movdqu %xmm0,32(%rdi)
+ movl %r12d,48(%rdi)
+ movl %r13d,52(%rdi)
+ movl %r14d,56(%rdi)
+ movl %r15d,60(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rbp
+ jnz .Loop_outer
+
+ jmp .Ldone
+
+.align 16
+.Ltail:
+ movl %eax,0(%rsp)
+ movl %ebx,4(%rsp)
+ xorq %rbx,%rbx
+ movl %ecx,8(%rsp)
+ movl %edx,12(%rsp)
+ movl %r8d,16(%rsp)
+ movl %r9d,20(%rsp)
+ movl %r10d,24(%rsp)
+ movl %r11d,28(%rsp)
+ movdqa %xmm1,32(%rsp)
+ movl %r12d,48(%rsp)
+ movl %r13d,52(%rsp)
+ movl %r14d,56(%rsp)
+ movl %r15d,60(%rsp)
+
+.Loop_tail:
+ movzbl (%rsi,%rbx,1),%eax
+ movzbl (%rsp,%rbx,1),%edx
+ leaq 1(%rbx),%rbx
+ xorl %edx,%eax
+ movb %al,-1(%rdi,%rbx,1)
+ decq %rbp
+ jnz .Loop_tail
+
+.Ldone:
+ leaq 64+24+48(%rsp),%rsi
+ movq -48(%rsi),%r15
+.cfi_restore r15
+ movq -40(%rsi),%r14
+.cfi_restore r14
+ movq -32(%rsi),%r13
+.cfi_restore r13
+ movq -24(%rsi),%r12
+.cfi_restore r12
+ movq -16(%rsi),%rbp
+.cfi_restore rbp
+ movq -8(%rsi),%rbx
+.cfi_restore rbx
+ leaq (%rsi),%rsp
+.cfi_adjust_cfa_offset -136
+.Lno_data:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_ctr32,.-ChaCha20_ctr32
+.type ChaCha20_ssse3,@function
+.align 32
+ChaCha20_ssse3:
+.LChaCha20_ssse3:
+.cfi_startproc
+ movq %rsp,%r9
+.cfi_def_cfa_register r9
+ cmpq $128,%rdx
+ ja .LChaCha20_4x
+
+.Ldo_sse3_after_all:
+ subq $64+8,%rsp
+ movdqa .Lsigma(%rip),%xmm0
+ movdqu (%rcx),%xmm1
+ movdqu 16(%rcx),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa .Lrot16(%rip),%xmm6
+ movdqa .Lrot24(%rip),%xmm7
+
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ movq $10,%r8
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_outer_ssse3:
+ movdqa .Lone(%rip),%xmm3
+ movdqa 0(%rsp),%xmm0
+ movdqa 16(%rsp),%xmm1
+ movdqa 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+ movq $10,%r8
+ movdqa %xmm3,48(%rsp)
+ jmp .Loop_ssse3
+
+.align 32
+.Loop_ssse3:
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $57,%xmm1,%xmm1
+ pshufd $147,%xmm3,%xmm3
+ nop
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,222
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $20,%xmm1
+ pslld $12,%xmm4
+ por %xmm4,%xmm1
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,223
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm4
+ psrld $25,%xmm1
+ pslld $7,%xmm4
+ por %xmm4,%xmm1
+ pshufd $78,%xmm2,%xmm2
+ pshufd $147,%xmm1,%xmm1
+ pshufd $57,%xmm3,%xmm3
+ decq %r8
+ jnz .Loop_ssse3
+ paddd 0(%rsp),%xmm0
+ paddd 16(%rsp),%xmm1
+ paddd 32(%rsp),%xmm2
+ paddd 48(%rsp),%xmm3
+
+ cmpq $64,%rdx
+ jb .Ltail_ssse3
+
+ movdqu 0(%rsi),%xmm4
+ movdqu 16(%rsi),%xmm5
+ pxor %xmm4,%xmm0
+ movdqu 32(%rsi),%xmm4
+ pxor %xmm5,%xmm1
+ movdqu 48(%rsi),%xmm5
+ leaq 64(%rsi),%rsi
+ pxor %xmm4,%xmm2
+ pxor %xmm5,%xmm3
+
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm1,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+ leaq 64(%rdi),%rdi
+
+ subq $64,%rdx
+ jnz .Loop_outer_ssse3
+
+ jmp .Ldone_ssse3
+
+.align 16
+.Ltail_ssse3:
+ movdqa %xmm0,0(%rsp)
+ movdqa %xmm1,16(%rsp)
+ movdqa %xmm2,32(%rsp)
+ movdqa %xmm3,48(%rsp)
+ xorq %r8,%r8
+
+.Loop_tail_ssse3:
+ movzbl (%rsi,%r8,1),%eax
+ movzbl (%rsp,%r8,1),%ecx
+ leaq 1(%r8),%r8
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r8,1)
+ decq %rdx
+ jnz .Loop_tail_ssse3
+
+.Ldone_ssse3:
+ leaq (%r9),%rsp
+.cfi_def_cfa_register rsp
+.Lssse3_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_ssse3,.-ChaCha20_ssse3
+.type ChaCha20_4x,@function
+.align 32
+ChaCha20_4x:
+.LChaCha20_4x:
+.cfi_startproc
+ movq %rsp,%r9
+.cfi_def_cfa_register r9
+ movq %r10,%r11
+ shrq $32,%r10
+ testq $32,%r10
+ jnz .LChaCha20_8x
+ cmpq $192,%rdx
+ ja .Lproceed4x
+
+ andq $71303168,%r11
+ cmpq $4194304,%r11
+ je .Ldo_sse3_after_all
+
+.Lproceed4x:
+ subq $0x140+8,%rsp
+ movdqa .Lsigma(%rip),%xmm11
+ movdqu (%rcx),%xmm15
+ movdqu 16(%rcx),%xmm7
+ movdqu (%r8),%xmm3
+ leaq 256(%rsp),%rcx
+ leaq .Lrot16(%rip),%r10
+ leaq .Lrot24(%rip),%r11
+
+ pshufd $0x00,%xmm11,%xmm8
+ pshufd $0x55,%xmm11,%xmm9
+ movdqa %xmm8,64(%rsp)
+ pshufd $0xaa,%xmm11,%xmm10
+ movdqa %xmm9,80(%rsp)
+ pshufd $0xff,%xmm11,%xmm11
+ movdqa %xmm10,96(%rsp)
+ movdqa %xmm11,112(%rsp)
+
+ pshufd $0x00,%xmm15,%xmm12
+ pshufd $0x55,%xmm15,%xmm13
+ movdqa %xmm12,128-256(%rcx)
+ pshufd $0xaa,%xmm15,%xmm14
+ movdqa %xmm13,144-256(%rcx)
+ pshufd $0xff,%xmm15,%xmm15
+ movdqa %xmm14,160-256(%rcx)
+ movdqa %xmm15,176-256(%rcx)
+
+ pshufd $0x00,%xmm7,%xmm4
+ pshufd $0x55,%xmm7,%xmm5
+ movdqa %xmm4,192-256(%rcx)
+ pshufd $0xaa,%xmm7,%xmm6
+ movdqa %xmm5,208-256(%rcx)
+ pshufd $0xff,%xmm7,%xmm7
+ movdqa %xmm6,224-256(%rcx)
+ movdqa %xmm7,240-256(%rcx)
+
+ pshufd $0x00,%xmm3,%xmm0
+ pshufd $0x55,%xmm3,%xmm1
+ paddd .Linc(%rip),%xmm0
+ pshufd $0xaa,%xmm3,%xmm2
+ movdqa %xmm1,272-256(%rcx)
+ pshufd $0xff,%xmm3,%xmm3
+ movdqa %xmm2,288-256(%rcx)
+ movdqa %xmm3,304-256(%rcx)
+
+ jmp .Loop_enter4x
+
+.align 32
+.Loop_outer4x:
+ movdqa 64(%rsp),%xmm8
+ movdqa 80(%rsp),%xmm9
+ movdqa 96(%rsp),%xmm10
+ movdqa 112(%rsp),%xmm11
+ movdqa 128-256(%rcx),%xmm12
+ movdqa 144-256(%rcx),%xmm13
+ movdqa 160-256(%rcx),%xmm14
+ movdqa 176-256(%rcx),%xmm15
+ movdqa 192-256(%rcx),%xmm4
+ movdqa 208-256(%rcx),%xmm5
+ movdqa 224-256(%rcx),%xmm6
+ movdqa 240-256(%rcx),%xmm7
+ movdqa 256-256(%rcx),%xmm0
+ movdqa 272-256(%rcx),%xmm1
+ movdqa 288-256(%rcx),%xmm2
+ movdqa 304-256(%rcx),%xmm3
+ paddd .Lfour(%rip),%xmm0
+
+.Loop_enter4x:
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm7,48(%rsp)
+ movdqa (%r10),%xmm7
+ movl $10,%eax
+ movdqa %xmm0,256-256(%rcx)
+ jmp .Loop4x
+
+.align 32
+.Loop4x:
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+.byte 102,15,56,0,199
+.byte 102,15,56,0,207
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm6
+ pslld $12,%xmm12
+ psrld $20,%xmm6
+ movdqa %xmm13,%xmm7
+ pslld $12,%xmm13
+ por %xmm6,%xmm12
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm13
+ paddd %xmm12,%xmm8
+ paddd %xmm13,%xmm9
+ pxor %xmm8,%xmm0
+ pxor %xmm9,%xmm1
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+ paddd %xmm0,%xmm4
+ paddd %xmm1,%xmm5
+ pxor %xmm4,%xmm12
+ pxor %xmm5,%xmm13
+ movdqa %xmm12,%xmm7
+ pslld $7,%xmm12
+ psrld $25,%xmm7
+ movdqa %xmm13,%xmm6
+ pslld $7,%xmm13
+ por %xmm7,%xmm12
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm13
+ movdqa %xmm4,0(%rsp)
+ movdqa %xmm5,16(%rsp)
+ movdqa 32(%rsp),%xmm4
+ movdqa 48(%rsp),%xmm5
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+.byte 102,15,56,0,215
+.byte 102,15,56,0,223
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm6
+ pslld $12,%xmm14
+ psrld $20,%xmm6
+ movdqa %xmm15,%xmm7
+ pslld $12,%xmm15
+ por %xmm6,%xmm14
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm15
+ paddd %xmm14,%xmm10
+ paddd %xmm15,%xmm11
+ pxor %xmm10,%xmm2
+ pxor %xmm11,%xmm3
+.byte 102,15,56,0,214
+.byte 102,15,56,0,222
+ paddd %xmm2,%xmm4
+ paddd %xmm3,%xmm5
+ pxor %xmm4,%xmm14
+ pxor %xmm5,%xmm15
+ movdqa %xmm14,%xmm7
+ pslld $7,%xmm14
+ psrld $25,%xmm7
+ movdqa %xmm15,%xmm6
+ pslld $7,%xmm15
+ por %xmm7,%xmm14
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm15
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+.byte 102,15,56,0,223
+.byte 102,15,56,0,199
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm6
+ pslld $12,%xmm13
+ psrld $20,%xmm6
+ movdqa %xmm14,%xmm7
+ pslld $12,%xmm14
+ por %xmm6,%xmm13
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm14
+ paddd %xmm13,%xmm8
+ paddd %xmm14,%xmm9
+ pxor %xmm8,%xmm3
+ pxor %xmm9,%xmm0
+.byte 102,15,56,0,222
+.byte 102,15,56,0,198
+ paddd %xmm3,%xmm4
+ paddd %xmm0,%xmm5
+ pxor %xmm4,%xmm13
+ pxor %xmm5,%xmm14
+ movdqa %xmm13,%xmm7
+ pslld $7,%xmm13
+ psrld $25,%xmm7
+ movdqa %xmm14,%xmm6
+ pslld $7,%xmm14
+ por %xmm7,%xmm13
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm14
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm5,48(%rsp)
+ movdqa 0(%rsp),%xmm4
+ movdqa 16(%rsp),%xmm5
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+.byte 102,15,56,0,207
+.byte 102,15,56,0,215
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm6
+ pslld $12,%xmm15
+ psrld $20,%xmm6
+ movdqa %xmm12,%xmm7
+ pslld $12,%xmm12
+ por %xmm6,%xmm15
+ psrld $20,%xmm7
+ movdqa (%r11),%xmm6
+ por %xmm7,%xmm12
+ paddd %xmm15,%xmm10
+ paddd %xmm12,%xmm11
+ pxor %xmm10,%xmm1
+ pxor %xmm11,%xmm2
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ paddd %xmm1,%xmm4
+ paddd %xmm2,%xmm5
+ pxor %xmm4,%xmm15
+ pxor %xmm5,%xmm12
+ movdqa %xmm15,%xmm7
+ pslld $7,%xmm15
+ psrld $25,%xmm7
+ movdqa %xmm12,%xmm6
+ pslld $7,%xmm12
+ por %xmm7,%xmm15
+ psrld $25,%xmm6
+ movdqa (%r10),%xmm7
+ por %xmm6,%xmm12
+ decl %eax
+ jnz .Loop4x
+
+ paddd 64(%rsp),%xmm8
+ paddd 80(%rsp),%xmm9
+ paddd 96(%rsp),%xmm10
+ paddd 112(%rsp),%xmm11
+
+ movdqa %xmm8,%xmm6
+ punpckldq %xmm9,%xmm8
+ movdqa %xmm10,%xmm7
+ punpckldq %xmm11,%xmm10
+ punpckhdq %xmm9,%xmm6
+ punpckhdq %xmm11,%xmm7
+ movdqa %xmm8,%xmm9
+ punpcklqdq %xmm10,%xmm8
+ movdqa %xmm6,%xmm11
+ punpcklqdq %xmm7,%xmm6
+ punpckhqdq %xmm10,%xmm9
+ punpckhqdq %xmm7,%xmm11
+ paddd 128-256(%rcx),%xmm12
+ paddd 144-256(%rcx),%xmm13
+ paddd 160-256(%rcx),%xmm14
+ paddd 176-256(%rcx),%xmm15
+
+ movdqa %xmm8,0(%rsp)
+ movdqa %xmm9,16(%rsp)
+ movdqa 32(%rsp),%xmm8
+ movdqa 48(%rsp),%xmm9
+
+ movdqa %xmm12,%xmm10
+ punpckldq %xmm13,%xmm12
+ movdqa %xmm14,%xmm7
+ punpckldq %xmm15,%xmm14
+ punpckhdq %xmm13,%xmm10
+ punpckhdq %xmm15,%xmm7
+ movdqa %xmm12,%xmm13
+ punpcklqdq %xmm14,%xmm12
+ movdqa %xmm10,%xmm15
+ punpcklqdq %xmm7,%xmm10
+ punpckhqdq %xmm14,%xmm13
+ punpckhqdq %xmm7,%xmm15
+ paddd 192-256(%rcx),%xmm4
+ paddd 208-256(%rcx),%xmm5
+ paddd 224-256(%rcx),%xmm8
+ paddd 240-256(%rcx),%xmm9
+
+ movdqa %xmm6,32(%rsp)
+ movdqa %xmm11,48(%rsp)
+
+ movdqa %xmm4,%xmm14
+ punpckldq %xmm5,%xmm4
+ movdqa %xmm8,%xmm7
+ punpckldq %xmm9,%xmm8
+ punpckhdq %xmm5,%xmm14
+ punpckhdq %xmm9,%xmm7
+ movdqa %xmm4,%xmm5
+ punpcklqdq %xmm8,%xmm4
+ movdqa %xmm14,%xmm9
+ punpcklqdq %xmm7,%xmm14
+ punpckhqdq %xmm8,%xmm5
+ punpckhqdq %xmm7,%xmm9
+ paddd 256-256(%rcx),%xmm0
+ paddd 272-256(%rcx),%xmm1
+ paddd 288-256(%rcx),%xmm2
+ paddd 304-256(%rcx),%xmm3
+
+ movdqa %xmm0,%xmm8
+ punpckldq %xmm1,%xmm0
+ movdqa %xmm2,%xmm7
+ punpckldq %xmm3,%xmm2
+ punpckhdq %xmm1,%xmm8
+ punpckhdq %xmm3,%xmm7
+ movdqa %xmm0,%xmm1
+ punpcklqdq %xmm2,%xmm0
+ movdqa %xmm8,%xmm3
+ punpcklqdq %xmm7,%xmm8
+ punpckhqdq %xmm2,%xmm1
+ punpckhqdq %xmm7,%xmm3
+ cmpq $256,%rdx
+ jb .Ltail4x
+
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 48(%rsp),%xmm6
+ pxor %xmm15,%xmm11
+ pxor %xmm9,%xmm2
+ pxor %xmm3,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $256,%rdx
+ jnz .Loop_outer4x
+
+ jmp .Ldone4x
+
+.Ltail4x:
+ cmpq $192,%rdx
+ jae .L192_or_more4x
+ cmpq $128,%rdx
+ jae .L128_or_more4x
+ cmpq $64,%rdx
+ jae .L64_or_more4x
+
+
+ xorq %r10,%r10
+
+ movdqa %xmm12,16(%rsp)
+ movdqa %xmm4,32(%rsp)
+ movdqa %xmm0,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L64_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 16(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm13,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm5,32(%rsp)
+ subq $64,%rdx
+ movdqa %xmm1,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L128_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm11,80(%rdi)
+ movdqu %xmm2,96(%rdi)
+ movdqu %xmm7,112(%rdi)
+ je .Ldone4x
+
+ movdqa 32(%rsp),%xmm6
+ leaq 128(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm10,16(%rsp)
+ leaq 128(%rdi),%rdi
+ movdqa %xmm14,32(%rsp)
+ subq $128,%rdx
+ movdqa %xmm8,48(%rsp)
+ jmp .Loop_tail4x
+
+.align 32
+.L192_or_more4x:
+ movdqu 0(%rsi),%xmm6
+ movdqu 16(%rsi),%xmm11
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm7
+ pxor 0(%rsp),%xmm6
+ pxor %xmm12,%xmm11
+ pxor %xmm4,%xmm2
+ pxor %xmm0,%xmm7
+
+ movdqu %xmm6,0(%rdi)
+ movdqu 64(%rsi),%xmm6
+ movdqu %xmm11,16(%rdi)
+ movdqu 80(%rsi),%xmm11
+ movdqu %xmm2,32(%rdi)
+ movdqu 96(%rsi),%xmm2
+ movdqu %xmm7,48(%rdi)
+ movdqu 112(%rsi),%xmm7
+ leaq 128(%rsi),%rsi
+ pxor 16(%rsp),%xmm6
+ pxor %xmm13,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm1,%xmm7
+
+ movdqu %xmm6,64(%rdi)
+ movdqu 0(%rsi),%xmm6
+ movdqu %xmm11,80(%rdi)
+ movdqu 16(%rsi),%xmm11
+ movdqu %xmm2,96(%rdi)
+ movdqu 32(%rsi),%xmm2
+ movdqu %xmm7,112(%rdi)
+ leaq 128(%rdi),%rdi
+ movdqu 48(%rsi),%xmm7
+ pxor 32(%rsp),%xmm6
+ pxor %xmm10,%xmm11
+ pxor %xmm14,%xmm2
+ pxor %xmm8,%xmm7
+ movdqu %xmm6,0(%rdi)
+ movdqu %xmm11,16(%rdi)
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm7,48(%rdi)
+ je .Ldone4x
+
+ movdqa 48(%rsp),%xmm6
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ movdqa %xmm6,0(%rsp)
+ movdqa %xmm15,16(%rsp)
+ leaq 64(%rdi),%rdi
+ movdqa %xmm9,32(%rsp)
+ subq $192,%rdx
+ movdqa %xmm3,48(%rsp)
+
+.Loop_tail4x:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz .Loop_tail4x
+
+.Ldone4x:
+ leaq (%r9),%rsp
+.cfi_def_cfa_register rsp
+.L4x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_4x,.-ChaCha20_4x
+.type ChaCha20_8x,@function
+.align 32
+ChaCha20_8x:
+.LChaCha20_8x:
+.cfi_startproc
+ movq %rsp,%r9
+.cfi_def_cfa_register r9
+ subq $0x280+8,%rsp
+ andq $-32,%rsp
+ vzeroupper
+
+
+
+
+
+
+
+
+
+
+ vbroadcasti128 .Lsigma(%rip),%ymm11
+ vbroadcasti128 (%rcx),%ymm3
+ vbroadcasti128 16(%rcx),%ymm15
+ vbroadcasti128 (%r8),%ymm7
+ leaq 256(%rsp),%rcx
+ leaq 512(%rsp),%rax
+ leaq .Lrot16(%rip),%r10
+ leaq .Lrot24(%rip),%r11
+
+ vpshufd $0x00,%ymm11,%ymm8
+ vpshufd $0x55,%ymm11,%ymm9
+ vmovdqa %ymm8,128-256(%rcx)
+ vpshufd $0xaa,%ymm11,%ymm10
+ vmovdqa %ymm9,160-256(%rcx)
+ vpshufd $0xff,%ymm11,%ymm11
+ vmovdqa %ymm10,192-256(%rcx)
+ vmovdqa %ymm11,224-256(%rcx)
+
+ vpshufd $0x00,%ymm3,%ymm0
+ vpshufd $0x55,%ymm3,%ymm1
+ vmovdqa %ymm0,256-256(%rcx)
+ vpshufd $0xaa,%ymm3,%ymm2
+ vmovdqa %ymm1,288-256(%rcx)
+ vpshufd $0xff,%ymm3,%ymm3
+ vmovdqa %ymm2,320-256(%rcx)
+ vmovdqa %ymm3,352-256(%rcx)
+
+ vpshufd $0x00,%ymm15,%ymm12
+ vpshufd $0x55,%ymm15,%ymm13
+ vmovdqa %ymm12,384-512(%rax)
+ vpshufd $0xaa,%ymm15,%ymm14
+ vmovdqa %ymm13,416-512(%rax)
+ vpshufd $0xff,%ymm15,%ymm15
+ vmovdqa %ymm14,448-512(%rax)
+ vmovdqa %ymm15,480-512(%rax)
+
+ vpshufd $0x00,%ymm7,%ymm4
+ vpshufd $0x55,%ymm7,%ymm5
+ vpaddd .Lincy(%rip),%ymm4,%ymm4
+ vpshufd $0xaa,%ymm7,%ymm6
+ vmovdqa %ymm5,544-512(%rax)
+ vpshufd $0xff,%ymm7,%ymm7
+ vmovdqa %ymm6,576-512(%rax)
+ vmovdqa %ymm7,608-512(%rax)
+
+ jmp .Loop_enter8x
+
+.align 32
+.Loop_outer8x:
+ vmovdqa 128-256(%rcx),%ymm8
+ vmovdqa 160-256(%rcx),%ymm9
+ vmovdqa 192-256(%rcx),%ymm10
+ vmovdqa 224-256(%rcx),%ymm11
+ vmovdqa 256-256(%rcx),%ymm0
+ vmovdqa 288-256(%rcx),%ymm1
+ vmovdqa 320-256(%rcx),%ymm2
+ vmovdqa 352-256(%rcx),%ymm3
+ vmovdqa 384-512(%rax),%ymm12
+ vmovdqa 416-512(%rax),%ymm13
+ vmovdqa 448-512(%rax),%ymm14
+ vmovdqa 480-512(%rax),%ymm15
+ vmovdqa 512-512(%rax),%ymm4
+ vmovdqa 544-512(%rax),%ymm5
+ vmovdqa 576-512(%rax),%ymm6
+ vmovdqa 608-512(%rax),%ymm7
+ vpaddd .Leight(%rip),%ymm4,%ymm4
+
+.Loop_enter8x:
+ vmovdqa %ymm14,64(%rsp)
+ vmovdqa %ymm15,96(%rsp)
+ vbroadcasti128 (%r10),%ymm15
+ vmovdqa %ymm4,512-512(%rax)
+ movl $10,%eax
+ jmp .Loop8x
+
+.align 32
+.Loop8x:
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $12,%ymm0,%ymm14
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $12,%ymm1,%ymm15
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpxor %ymm4,%ymm8,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm1,%ymm9,%ymm9
+ vpxor %ymm5,%ymm9,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm12,%ymm12
+ vpxor %ymm0,%ymm12,%ymm0
+ vpslld $7,%ymm0,%ymm15
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm5,%ymm13,%ymm13
+ vpxor %ymm1,%ymm13,%ymm1
+ vpslld $7,%ymm1,%ymm14
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vmovdqa %ymm12,0(%rsp)
+ vmovdqa %ymm13,32(%rsp)
+ vmovdqa 64(%rsp),%ymm12
+ vmovdqa 96(%rsp),%ymm13
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $12,%ymm2,%ymm14
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $12,%ymm3,%ymm15
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vpaddd %ymm2,%ymm10,%ymm10
+ vpxor %ymm6,%ymm10,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm3,%ymm11,%ymm11
+ vpxor %ymm7,%ymm11,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm6,%ymm12,%ymm12
+ vpxor %ymm2,%ymm12,%ymm2
+ vpslld $7,%ymm2,%ymm15
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm7,%ymm13,%ymm13
+ vpxor %ymm3,%ymm13,%ymm3
+ vpslld $7,%ymm3,%ymm14
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm15,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm15,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $12,%ymm1,%ymm14
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm1,%ymm14,%ymm1
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $12,%ymm2,%ymm15
+ vpsrld $20,%ymm2,%ymm2
+ vpor %ymm2,%ymm15,%ymm2
+ vpaddd %ymm1,%ymm8,%ymm8
+ vpxor %ymm7,%ymm8,%ymm7
+ vpshufb %ymm14,%ymm7,%ymm7
+ vpaddd %ymm2,%ymm9,%ymm9
+ vpxor %ymm4,%ymm9,%ymm4
+ vpshufb %ymm14,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm12,%ymm12
+ vpxor %ymm1,%ymm12,%ymm1
+ vpslld $7,%ymm1,%ymm15
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm1,%ymm15,%ymm1
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm4,%ymm13,%ymm13
+ vpxor %ymm2,%ymm13,%ymm2
+ vpslld $7,%ymm2,%ymm14
+ vpsrld $25,%ymm2,%ymm2
+ vpor %ymm2,%ymm14,%ymm2
+ vmovdqa %ymm12,64(%rsp)
+ vmovdqa %ymm13,96(%rsp)
+ vmovdqa 0(%rsp),%ymm12
+ vmovdqa 32(%rsp),%ymm13
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm15,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm15,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $12,%ymm3,%ymm14
+ vpsrld $20,%ymm3,%ymm3
+ vpor %ymm3,%ymm14,%ymm3
+ vbroadcasti128 (%r11),%ymm14
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $12,%ymm0,%ymm15
+ vpsrld $20,%ymm0,%ymm0
+ vpor %ymm0,%ymm15,%ymm0
+ vpaddd %ymm3,%ymm10,%ymm10
+ vpxor %ymm5,%ymm10,%ymm5
+ vpshufb %ymm14,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpxor %ymm6,%ymm11,%ymm6
+ vpshufb %ymm14,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm12,%ymm12
+ vpxor %ymm3,%ymm12,%ymm3
+ vpslld $7,%ymm3,%ymm15
+ vpsrld $25,%ymm3,%ymm3
+ vpor %ymm3,%ymm15,%ymm3
+ vbroadcasti128 (%r10),%ymm15
+ vpaddd %ymm6,%ymm13,%ymm13
+ vpxor %ymm0,%ymm13,%ymm0
+ vpslld $7,%ymm0,%ymm14
+ vpsrld $25,%ymm0,%ymm0
+ vpor %ymm0,%ymm14,%ymm0
+ decl %eax
+ jnz .Loop8x
+
+ leaq 512(%rsp),%rax
+ vpaddd 128-256(%rcx),%ymm8,%ymm8
+ vpaddd 160-256(%rcx),%ymm9,%ymm9
+ vpaddd 192-256(%rcx),%ymm10,%ymm10
+ vpaddd 224-256(%rcx),%ymm11,%ymm11
+
+ vpunpckldq %ymm9,%ymm8,%ymm14
+ vpunpckldq %ymm11,%ymm10,%ymm15
+ vpunpckhdq %ymm9,%ymm8,%ymm8
+ vpunpckhdq %ymm11,%ymm10,%ymm10
+ vpunpcklqdq %ymm15,%ymm14,%ymm9
+ vpunpckhqdq %ymm15,%ymm14,%ymm14
+ vpunpcklqdq %ymm10,%ymm8,%ymm11
+ vpunpckhqdq %ymm10,%ymm8,%ymm8
+ vpaddd 256-256(%rcx),%ymm0,%ymm0
+ vpaddd 288-256(%rcx),%ymm1,%ymm1
+ vpaddd 320-256(%rcx),%ymm2,%ymm2
+ vpaddd 352-256(%rcx),%ymm3,%ymm3
+
+ vpunpckldq %ymm1,%ymm0,%ymm10
+ vpunpckldq %ymm3,%ymm2,%ymm15
+ vpunpckhdq %ymm1,%ymm0,%ymm0
+ vpunpckhdq %ymm3,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm10,%ymm1
+ vpunpckhqdq %ymm15,%ymm10,%ymm10
+ vpunpcklqdq %ymm2,%ymm0,%ymm3
+ vpunpckhqdq %ymm2,%ymm0,%ymm0
+ vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
+ vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
+ vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
+ vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
+ vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
+ vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
+ vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
+ vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
+ vmovdqa %ymm15,0(%rsp)
+ vmovdqa %ymm9,32(%rsp)
+ vmovdqa 64(%rsp),%ymm15
+ vmovdqa 96(%rsp),%ymm9
+
+ vpaddd 384-512(%rax),%ymm12,%ymm12
+ vpaddd 416-512(%rax),%ymm13,%ymm13
+ vpaddd 448-512(%rax),%ymm15,%ymm15
+ vpaddd 480-512(%rax),%ymm9,%ymm9
+
+ vpunpckldq %ymm13,%ymm12,%ymm2
+ vpunpckldq %ymm9,%ymm15,%ymm8
+ vpunpckhdq %ymm13,%ymm12,%ymm12
+ vpunpckhdq %ymm9,%ymm15,%ymm15
+ vpunpcklqdq %ymm8,%ymm2,%ymm13
+ vpunpckhqdq %ymm8,%ymm2,%ymm2
+ vpunpcklqdq %ymm15,%ymm12,%ymm9
+ vpunpckhqdq %ymm15,%ymm12,%ymm12
+ vpaddd 512-512(%rax),%ymm4,%ymm4
+ vpaddd 544-512(%rax),%ymm5,%ymm5
+ vpaddd 576-512(%rax),%ymm6,%ymm6
+ vpaddd 608-512(%rax),%ymm7,%ymm7
+
+ vpunpckldq %ymm5,%ymm4,%ymm15
+ vpunpckldq %ymm7,%ymm6,%ymm8
+ vpunpckhdq %ymm5,%ymm4,%ymm4
+ vpunpckhdq %ymm7,%ymm6,%ymm6
+ vpunpcklqdq %ymm8,%ymm15,%ymm5
+ vpunpckhqdq %ymm8,%ymm15,%ymm15
+ vpunpcklqdq %ymm6,%ymm4,%ymm7
+ vpunpckhqdq %ymm6,%ymm4,%ymm4
+ vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
+ vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
+ vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
+ vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
+ vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
+ vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
+ vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
+ vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
+ vmovdqa 0(%rsp),%ymm6
+ vmovdqa 32(%rsp),%ymm12
+
+ cmpq $512,%rdx
+ jb .Ltail8x
+
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm12,%ymm12
+ vpxor 32(%rsi),%ymm13,%ymm13
+ vpxor 64(%rsi),%ymm10,%ymm10
+ vpxor 96(%rsi),%ymm15,%ymm15
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm12,0(%rdi)
+ vmovdqu %ymm13,32(%rdi)
+ vmovdqu %ymm10,64(%rdi)
+ vmovdqu %ymm15,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm14,%ymm14
+ vpxor 32(%rsi),%ymm2,%ymm2
+ vpxor 64(%rsi),%ymm3,%ymm3
+ vpxor 96(%rsi),%ymm7,%ymm7
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm14,0(%rdi)
+ vmovdqu %ymm2,32(%rdi)
+ vmovdqu %ymm3,64(%rdi)
+ vmovdqu %ymm7,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ vpxor 0(%rsi),%ymm11,%ymm11
+ vpxor 32(%rsi),%ymm9,%ymm9
+ vpxor 64(%rsi),%ymm0,%ymm0
+ vpxor 96(%rsi),%ymm4,%ymm4
+ leaq 128(%rsi),%rsi
+ vmovdqu %ymm11,0(%rdi)
+ vmovdqu %ymm9,32(%rdi)
+ vmovdqu %ymm0,64(%rdi)
+ vmovdqu %ymm4,96(%rdi)
+ leaq 128(%rdi),%rdi
+
+ subq $512,%rdx
+ jnz .Loop_outer8x
+
+ jmp .Ldone8x
+
+.Ltail8x:
+ cmpq $448,%rdx
+ jae .L448_or_more8x
+ cmpq $384,%rdx
+ jae .L384_or_more8x
+ cmpq $320,%rdx
+ jae .L320_or_more8x
+ cmpq $256,%rdx
+ jae .L256_or_more8x
+ cmpq $192,%rdx
+ jae .L192_or_more8x
+ cmpq $128,%rdx
+ jae .L128_or_more8x
+ cmpq $64,%rdx
+ jae .L64_or_more8x
+
+ xorq %r10,%r10
+ vmovdqa %ymm6,0(%rsp)
+ vmovdqa %ymm8,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L64_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ je .Ldone8x
+
+ leaq 64(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm1,0(%rsp)
+ leaq 64(%rdi),%rdi
+ subq $64,%rdx
+ vmovdqa %ymm5,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L128_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ je .Ldone8x
+
+ leaq 128(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm12,0(%rsp)
+ leaq 128(%rdi),%rdi
+ subq $128,%rdx
+ vmovdqa %ymm13,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L192_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ je .Ldone8x
+
+ leaq 192(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm10,0(%rsp)
+ leaq 192(%rdi),%rdi
+ subq $192,%rdx
+ vmovdqa %ymm15,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L256_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ je .Ldone8x
+
+ leaq 256(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm14,0(%rsp)
+ leaq 256(%rdi),%rdi
+ subq $256,%rdx
+ vmovdqa %ymm2,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L320_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ je .Ldone8x
+
+ leaq 320(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm3,0(%rsp)
+ leaq 320(%rdi),%rdi
+ subq $320,%rdx
+ vmovdqa %ymm7,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L384_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ je .Ldone8x
+
+ leaq 384(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm11,0(%rsp)
+ leaq 384(%rdi),%rdi
+ subq $384,%rdx
+ vmovdqa %ymm9,32(%rsp)
+ jmp .Loop_tail8x
+
+.align 32
+.L448_or_more8x:
+ vpxor 0(%rsi),%ymm6,%ymm6
+ vpxor 32(%rsi),%ymm8,%ymm8
+ vpxor 64(%rsi),%ymm1,%ymm1
+ vpxor 96(%rsi),%ymm5,%ymm5
+ vpxor 128(%rsi),%ymm12,%ymm12
+ vpxor 160(%rsi),%ymm13,%ymm13
+ vpxor 192(%rsi),%ymm10,%ymm10
+ vpxor 224(%rsi),%ymm15,%ymm15
+ vpxor 256(%rsi),%ymm14,%ymm14
+ vpxor 288(%rsi),%ymm2,%ymm2
+ vpxor 320(%rsi),%ymm3,%ymm3
+ vpxor 352(%rsi),%ymm7,%ymm7
+ vpxor 384(%rsi),%ymm11,%ymm11
+ vpxor 416(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm6,0(%rdi)
+ vmovdqu %ymm8,32(%rdi)
+ vmovdqu %ymm1,64(%rdi)
+ vmovdqu %ymm5,96(%rdi)
+ vmovdqu %ymm12,128(%rdi)
+ vmovdqu %ymm13,160(%rdi)
+ vmovdqu %ymm10,192(%rdi)
+ vmovdqu %ymm15,224(%rdi)
+ vmovdqu %ymm14,256(%rdi)
+ vmovdqu %ymm2,288(%rdi)
+ vmovdqu %ymm3,320(%rdi)
+ vmovdqu %ymm7,352(%rdi)
+ vmovdqu %ymm11,384(%rdi)
+ vmovdqu %ymm9,416(%rdi)
+ je .Ldone8x
+
+ leaq 448(%rsi),%rsi
+ xorq %r10,%r10
+ vmovdqa %ymm0,0(%rsp)
+ leaq 448(%rdi),%rdi
+ subq $448,%rdx
+ vmovdqa %ymm4,32(%rsp)
+
+.Loop_tail8x:
+ movzbl (%rsi,%r10,1),%eax
+ movzbl (%rsp,%r10,1),%ecx
+ leaq 1(%r10),%r10
+ xorl %ecx,%eax
+ movb %al,-1(%rdi,%r10,1)
+ decq %rdx
+ jnz .Loop_tail8x
+
+.Ldone8x:
+ vzeroall
+ leaq (%r9),%rsp
+.cfi_def_cfa_register rsp
+.L8x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ChaCha20_8x,.-ChaCha20_8x
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S
@@ -1,0 +1,3079 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.data
+
+.align 16
+one:
+.quad 1,0
+two:
+.quad 2,0
+three:
+.quad 3,0
+four:
+.quad 4,0
+five:
+.quad 5,0
+six:
+.quad 6,0
+seven:
+.quad 7,0
+eight:
+.quad 8,0
+
+OR_MASK:
+.long 0x00000000,0x00000000,0x00000000,0x80000000
+poly:
+.quad 0x1, 0xc200000000000000
+mask:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+con1:
+.long 1,1,1,1
+con2:
+.long 0x1b,0x1b,0x1b,0x1b
+con3:
+.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
+and_mask:
+.long 0,0xffffffff, 0xffffffff, 0xffffffff
+.text
+.type GFMUL,@function
+.align 16
+GFMUL:
+.cfi_startproc
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm5
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm2,%xmm2
+ vpxor %xmm3,%xmm5,%xmm5
+
+ vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3
+ vpshufd $78,%xmm2,%xmm4
+ vpxor %xmm4,%xmm3,%xmm2
+
+ vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3
+ vpshufd $78,%xmm2,%xmm4
+ vpxor %xmm4,%xmm3,%xmm2
+
+ vpxor %xmm5,%xmm2,%xmm0
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size GFMUL, .-GFMUL
+.globl aesgcmsiv_htable_init
+.hidden aesgcmsiv_htable_init
+.type aesgcmsiv_htable_init,@function
+.align 16
+aesgcmsiv_htable_init:
+.cfi_startproc
+ vmovdqa (%rsi),%xmm0
+ vmovdqa %xmm0,%xmm1
+ vmovdqa %xmm0,(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,16(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,32(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,48(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,64(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,80(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,96(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,112(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
+.globl aesgcmsiv_htable6_init
+.hidden aesgcmsiv_htable6_init
+.type aesgcmsiv_htable6_init,@function
+.align 16
+aesgcmsiv_htable6_init:
+.cfi_startproc
+ vmovdqa (%rsi),%xmm0
+ vmovdqa %xmm0,%xmm1
+ vmovdqa %xmm0,(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,16(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,32(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,48(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,64(%rdi)
+ call GFMUL
+ vmovdqa %xmm0,80(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
+.globl aesgcmsiv_htable_polyval
+.hidden aesgcmsiv_htable_polyval
+.type aesgcmsiv_htable_polyval,@function
+.align 16
+aesgcmsiv_htable_polyval:
+.cfi_startproc
+ testq %rdx,%rdx
+ jnz .Lhtable_polyval_start
+ .byte 0xf3,0xc3
+
+.Lhtable_polyval_start:
+ vzeroall
+
+
+
+ movq %rdx,%r11
+ andq $127,%r11
+
+ jz .Lhtable_polyval_no_prefix
+
+ vpxor %xmm9,%xmm9,%xmm9
+ vmovdqa (%rcx),%xmm1
+ subq %r11,%rdx
+
+ subq $16,%r11
+
+
+ vmovdqu (%rsi),%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+
+ vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm5
+ vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm3
+ vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm4
+ vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+ leaq 16(%rsi),%rsi
+ testq %r11,%r11
+ jnz .Lhtable_polyval_prefix_loop
+ jmp .Lhtable_polyval_prefix_complete
+
+
+.align 64
+.Lhtable_polyval_prefix_loop:
+ subq $16,%r11
+
+ vmovdqu (%rsi),%xmm0
+
+ vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+ testq %r11,%r11
+
+ leaq 16(%rsi),%rsi
+
+ jnz .Lhtable_polyval_prefix_loop
+
+.Lhtable_polyval_prefix_complete:
+ vpsrldq $8,%xmm5,%xmm6
+ vpslldq $8,%xmm5,%xmm5
+
+ vpxor %xmm6,%xmm4,%xmm9
+ vpxor %xmm5,%xmm3,%xmm1
+
+ jmp .Lhtable_polyval_main_loop
+
+.Lhtable_polyval_no_prefix:
+
+
+
+
+ vpxor %xmm1,%xmm1,%xmm1
+ vmovdqa (%rcx),%xmm9
+
+.align 64
+.Lhtable_polyval_main_loop:
+ subq $0x80,%rdx
+ jb .Lhtable_polyval_out
+
+ vmovdqu 112(%rsi),%xmm0
+
+ vpclmulqdq $0x01,(%rdi),%xmm0,%xmm5
+ vpclmulqdq $0x00,(%rdi),%xmm0,%xmm3
+ vpclmulqdq $0x11,(%rdi),%xmm0,%xmm4
+ vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vmovdqu 96(%rsi),%xmm0
+ vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,16(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,16(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,16(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+
+ vmovdqu 80(%rsi),%xmm0
+
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7
+ vpalignr $8,%xmm1,%xmm1,%xmm1
+
+ vpclmulqdq $0x01,32(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,32(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,32(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,32(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vpxor %xmm7,%xmm1,%xmm1
+
+ vmovdqu 64(%rsi),%xmm0
+
+ vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,48(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,48(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vmovdqu 48(%rsi),%xmm0
+
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7
+ vpalignr $8,%xmm1,%xmm1,%xmm1
+
+ vpclmulqdq $0x01,64(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,64(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,64(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,64(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vpxor %xmm7,%xmm1,%xmm1
+
+ vmovdqu 32(%rsi),%xmm0
+
+ vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,80(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,80(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,80(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vmovdqu 16(%rsi),%xmm0
+
+ vpclmulqdq $0x01,96(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,96(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,96(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vmovdqu 0(%rsi),%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+
+ vpclmulqdq $0x01,112(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x00,112(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm3,%xmm3
+ vpclmulqdq $0x11,112(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm4,%xmm4
+ vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6
+ vpxor %xmm6,%xmm5,%xmm5
+
+
+ vpsrldq $8,%xmm5,%xmm6
+ vpslldq $8,%xmm5,%xmm5
+
+ vpxor %xmm6,%xmm4,%xmm9
+ vpxor %xmm5,%xmm3,%xmm1
+
+ leaq 128(%rsi),%rsi
+ jmp .Lhtable_polyval_main_loop
+
+
+
+.Lhtable_polyval_out:
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6
+ vpalignr $8,%xmm1,%xmm1,%xmm1
+ vpxor %xmm6,%xmm1,%xmm1
+
+ vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6
+ vpalignr $8,%xmm1,%xmm1,%xmm1
+ vpxor %xmm6,%xmm1,%xmm1
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vmovdqu %xmm1,(%rcx)
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
+.globl aesgcmsiv_polyval_horner
+.hidden aesgcmsiv_polyval_horner
+.type aesgcmsiv_polyval_horner,@function
+.align 16
+aesgcmsiv_polyval_horner:
+.cfi_startproc
+ testq %rcx,%rcx
+ jnz .Lpolyval_horner_start
+ .byte 0xf3,0xc3
+
+.Lpolyval_horner_start:
+
+
+
+ xorq %r10,%r10
+ shlq $4,%rcx
+
+ vmovdqa (%rsi),%xmm1
+ vmovdqa (%rdi),%xmm0
+
+.Lpolyval_horner_loop:
+ vpxor (%rdx,%r10,1),%xmm0,%xmm0
+ call GFMUL
+
+ addq $16,%r10
+ cmpq %r10,%rcx
+ jne .Lpolyval_horner_loop
+
+
+ vmovdqa %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
+.globl aes128gcmsiv_aes_ks
+.hidden aes128gcmsiv_aes_ks
+.type aes128gcmsiv_aes_ks,@function
+.align 16
+aes128gcmsiv_aes_ks:
+.cfi_startproc
+ vmovdqu (%rdi),%xmm1
+ vmovdqa %xmm1,(%rsi)
+
+ vmovdqa con1(%rip),%xmm0
+ vmovdqa mask(%rip),%xmm15
+
+ movq $8,%rax
+
+.Lks128_loop:
+ addq $16,%rsi
+ subq $1,%rax
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,(%rsi)
+ jne .Lks128_loop
+
+ vmovdqa con2(%rip),%xmm0
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,16(%rsi)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslldq $4,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpslldq $4,%xmm3,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,32(%rsi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
+.globl aes256gcmsiv_aes_ks
+.hidden aes256gcmsiv_aes_ks
+.type aes256gcmsiv_aes_ks,@function
+.align 16
+aes256gcmsiv_aes_ks:
+.cfi_startproc
+ vmovdqu (%rdi),%xmm1
+ vmovdqu 16(%rdi),%xmm3
+ vmovdqa %xmm1,(%rsi)
+ vmovdqa %xmm3,16(%rsi)
+ vmovdqa con1(%rip),%xmm0
+ vmovdqa mask(%rip),%xmm15
+ vpxor %xmm14,%xmm14,%xmm14
+ movq $6,%rax
+
+.Lks256_loop:
+ addq $32,%rsi
+ subq $1,%rax
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,(%rsi)
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpsllq $32,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpshufb con3(%rip),%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vmovdqa %xmm3,16(%rsi)
+ jne .Lks256_loop
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpsllq $32,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vmovdqa %xmm1,32(%rsi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.globl aes128gcmsiv_aes_ks_enc_x1
+.hidden aes128gcmsiv_aes_ks_enc_x1
+.type aes128gcmsiv_aes_ks_enc_x1,@function
+.align 16
+aes128gcmsiv_aes_ks_enc_x1:
+.cfi_startproc
+ vmovdqa (%rcx),%xmm1
+ vmovdqa 0(%rdi),%xmm4
+
+ vmovdqa %xmm1,(%rdx)
+ vpxor %xmm1,%xmm4,%xmm4
+
+ vmovdqa con1(%rip),%xmm0
+ vmovdqa mask(%rip),%xmm15
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,16(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,32(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,48(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,64(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,80(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,96(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,112(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,128(%rdx)
+
+
+ vmovdqa con2(%rip),%xmm0
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenc %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,144(%rdx)
+
+ vpshufb %xmm15,%xmm1,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpsllq $32,%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpshufb con3(%rip),%xmm1,%xmm3
+ vpxor %xmm3,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+
+ vaesenclast %xmm1,%xmm4,%xmm4
+ vmovdqa %xmm1,160(%rdx)
+
+
+ vmovdqa %xmm4,0(%rsi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
+.globl aes128gcmsiv_kdf
+.hidden aes128gcmsiv_kdf
+.type aes128gcmsiv_kdf,@function
+.align 16
+aes128gcmsiv_kdf:
+.cfi_startproc
+
+
+
+
+ vmovdqa (%rdx),%xmm1
+ vmovdqa 0(%rdi),%xmm9
+ vmovdqa and_mask(%rip),%xmm12
+ vmovdqa one(%rip),%xmm13
+ vpshufd $0x90,%xmm9,%xmm9
+ vpand %xmm12,%xmm9,%xmm9
+ vpaddd %xmm13,%xmm9,%xmm10
+ vpaddd %xmm13,%xmm10,%xmm11
+ vpaddd %xmm13,%xmm11,%xmm12
+
+ vpxor %xmm1,%xmm9,%xmm9
+ vpxor %xmm1,%xmm10,%xmm10
+ vpxor %xmm1,%xmm11,%xmm11
+ vpxor %xmm1,%xmm12,%xmm12
+
+ vmovdqa 16(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 32(%rdx),%xmm2
+ vaesenc %xmm2,%xmm9,%xmm9
+ vaesenc %xmm2,%xmm10,%xmm10
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+
+ vmovdqa 48(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 64(%rdx),%xmm2
+ vaesenc %xmm2,%xmm9,%xmm9
+ vaesenc %xmm2,%xmm10,%xmm10
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+
+ vmovdqa 80(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 96(%rdx),%xmm2
+ vaesenc %xmm2,%xmm9,%xmm9
+ vaesenc %xmm2,%xmm10,%xmm10
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+
+ vmovdqa 112(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 128(%rdx),%xmm2
+ vaesenc %xmm2,%xmm9,%xmm9
+ vaesenc %xmm2,%xmm10,%xmm10
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+
+ vmovdqa 144(%rdx),%xmm1
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+
+ vmovdqa 160(%rdx),%xmm2
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vaesenclast %xmm2,%xmm10,%xmm10
+ vaesenclast %xmm2,%xmm11,%xmm11
+ vaesenclast %xmm2,%xmm12,%xmm12
+
+
+ vmovdqa %xmm9,0(%rsi)
+ vmovdqa %xmm10,16(%rsi)
+ vmovdqa %xmm11,32(%rsi)
+ vmovdqa %xmm12,48(%rsi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
+.globl aes128gcmsiv_enc_msg_x4
+.hidden aes128gcmsiv_enc_msg_x4
+.type aes128gcmsiv_enc_msg_x4,@function
+.align 16
+aes128gcmsiv_enc_msg_x4:
+.cfi_startproc
+ testq %r8,%r8
+ jnz .L128_enc_msg_x4_start
+ .byte 0xf3,0xc3
+
+.L128_enc_msg_x4_start:
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-24
+
+ shrq $4,%r8
+ movq %r8,%r10
+ shlq $62,%r10
+ shrq $62,%r10
+
+
+ vmovdqa (%rdx),%xmm15
+ vpor OR_MASK(%rip),%xmm15,%xmm15
+
+ vmovdqu four(%rip),%xmm4
+ vmovdqa %xmm15,%xmm0
+ vpaddd one(%rip),%xmm15,%xmm1
+ vpaddd two(%rip),%xmm15,%xmm2
+ vpaddd three(%rip),%xmm15,%xmm3
+
+ shrq $2,%r8
+ je .L128_enc_msg_x4_check_remainder
+
+ subq $64,%rsi
+ subq $64,%rdi
+
+.L128_enc_msg_x4_loop1:
+ addq $64,%rsi
+ addq $64,%rdi
+
+ vmovdqa %xmm0,%xmm5
+ vmovdqa %xmm1,%xmm6
+ vmovdqa %xmm2,%xmm7
+ vmovdqa %xmm3,%xmm8
+
+ vpxor (%rcx),%xmm5,%xmm5
+ vpxor (%rcx),%xmm6,%xmm6
+ vpxor (%rcx),%xmm7,%xmm7
+ vpxor (%rcx),%xmm8,%xmm8
+
+ vmovdqu 16(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqu 32(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm1,%xmm1
+ vmovdqu 48(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm2,%xmm2
+ vmovdqu 64(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm3,%xmm3
+
+ vmovdqu 80(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 96(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 112(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 128(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 144(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 160(%rcx),%xmm12
+ vaesenclast %xmm12,%xmm5,%xmm5
+ vaesenclast %xmm12,%xmm6,%xmm6
+ vaesenclast %xmm12,%xmm7,%xmm7
+ vaesenclast %xmm12,%xmm8,%xmm8
+
+
+
+ vpxor 0(%rdi),%xmm5,%xmm5
+ vpxor 16(%rdi),%xmm6,%xmm6
+ vpxor 32(%rdi),%xmm7,%xmm7
+ vpxor 48(%rdi),%xmm8,%xmm8
+
+ subq $1,%r8
+
+ vmovdqu %xmm5,0(%rsi)
+ vmovdqu %xmm6,16(%rsi)
+ vmovdqu %xmm7,32(%rsi)
+ vmovdqu %xmm8,48(%rsi)
+
+ jne .L128_enc_msg_x4_loop1
+
+ addq $64,%rsi
+ addq $64,%rdi
+
+.L128_enc_msg_x4_check_remainder:
+ cmpq $0,%r10
+ je .L128_enc_msg_x4_out
+
+.L128_enc_msg_x4_loop2:
+
+
+ vmovdqa %xmm0,%xmm5
+ vpaddd one(%rip),%xmm0,%xmm0
+
+ vpxor (%rcx),%xmm5,%xmm5
+ vaesenc 16(%rcx),%xmm5,%xmm5
+ vaesenc 32(%rcx),%xmm5,%xmm5
+ vaesenc 48(%rcx),%xmm5,%xmm5
+ vaesenc 64(%rcx),%xmm5,%xmm5
+ vaesenc 80(%rcx),%xmm5,%xmm5
+ vaesenc 96(%rcx),%xmm5,%xmm5
+ vaesenc 112(%rcx),%xmm5,%xmm5
+ vaesenc 128(%rcx),%xmm5,%xmm5
+ vaesenc 144(%rcx),%xmm5,%xmm5
+ vaesenclast 160(%rcx),%xmm5,%xmm5
+
+
+ vpxor (%rdi),%xmm5,%xmm5
+ vmovdqu %xmm5,(%rsi)
+
+ addq $16,%rdi
+ addq $16,%rsi
+
+ subq $1,%r10
+ jne .L128_enc_msg_x4_loop2
+
+.L128_enc_msg_x4_out:
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
+.globl aes128gcmsiv_enc_msg_x8
+.hidden aes128gcmsiv_enc_msg_x8
+.type aes128gcmsiv_enc_msg_x8,@function
+.align 16
+aes128gcmsiv_enc_msg_x8:
+.cfi_startproc
+ testq %r8,%r8
+ jnz .L128_enc_msg_x8_start
+ .byte 0xf3,0xc3
+
+.L128_enc_msg_x8_start:
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-24
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-32
+ movq %rsp,%rbp
+.cfi_def_cfa_register rbp
+
+
+ subq $128,%rsp
+ andq $-64,%rsp
+
+ shrq $4,%r8
+ movq %r8,%r10
+ shlq $61,%r10
+ shrq $61,%r10
+
+
+ vmovdqu (%rdx),%xmm1
+ vpor OR_MASK(%rip),%xmm1,%xmm1
+
+
+ vpaddd seven(%rip),%xmm1,%xmm0
+ vmovdqu %xmm0,(%rsp)
+ vpaddd one(%rip),%xmm1,%xmm9
+ vpaddd two(%rip),%xmm1,%xmm10
+ vpaddd three(%rip),%xmm1,%xmm11
+ vpaddd four(%rip),%xmm1,%xmm12
+ vpaddd five(%rip),%xmm1,%xmm13
+ vpaddd six(%rip),%xmm1,%xmm14
+ vmovdqa %xmm1,%xmm0
+
+ shrq $3,%r8
+ je .L128_enc_msg_x8_check_remainder
+
+ subq $128,%rsi
+ subq $128,%rdi
+
+.L128_enc_msg_x8_loop1:
+ addq $128,%rsi
+ addq $128,%rdi
+
+ vmovdqa %xmm0,%xmm1
+ vmovdqa %xmm9,%xmm2
+ vmovdqa %xmm10,%xmm3
+ vmovdqa %xmm11,%xmm4
+ vmovdqa %xmm12,%xmm5
+ vmovdqa %xmm13,%xmm6
+ vmovdqa %xmm14,%xmm7
+
+ vmovdqu (%rsp),%xmm8
+
+ vpxor (%rcx),%xmm1,%xmm1
+ vpxor (%rcx),%xmm2,%xmm2
+ vpxor (%rcx),%xmm3,%xmm3
+ vpxor (%rcx),%xmm4,%xmm4
+ vpxor (%rcx),%xmm5,%xmm5
+ vpxor (%rcx),%xmm6,%xmm6
+ vpxor (%rcx),%xmm7,%xmm7
+ vpxor (%rcx),%xmm8,%xmm8
+
+ vmovdqu 16(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu (%rsp),%xmm14
+ vpaddd eight(%rip),%xmm14,%xmm14
+ vmovdqu %xmm14,(%rsp)
+ vmovdqu 32(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpsubd one(%rip),%xmm14,%xmm14
+ vmovdqu 48(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm0,%xmm0
+ vmovdqu 64(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm9,%xmm9
+ vmovdqu 80(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm10,%xmm10
+ vmovdqu 96(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm11,%xmm11
+ vmovdqu 112(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm12,%xmm12
+ vmovdqu 128(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm13,%xmm13
+ vmovdqu 144(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 160(%rcx),%xmm15
+ vaesenclast %xmm15,%xmm1,%xmm1
+ vaesenclast %xmm15,%xmm2,%xmm2
+ vaesenclast %xmm15,%xmm3,%xmm3
+ vaesenclast %xmm15,%xmm4,%xmm4
+ vaesenclast %xmm15,%xmm5,%xmm5
+ vaesenclast %xmm15,%xmm6,%xmm6
+ vaesenclast %xmm15,%xmm7,%xmm7
+ vaesenclast %xmm15,%xmm8,%xmm8
+
+
+
+ vpxor 0(%rdi),%xmm1,%xmm1
+ vpxor 16(%rdi),%xmm2,%xmm2
+ vpxor 32(%rdi),%xmm3,%xmm3
+ vpxor 48(%rdi),%xmm4,%xmm4
+ vpxor 64(%rdi),%xmm5,%xmm5
+ vpxor 80(%rdi),%xmm6,%xmm6
+ vpxor 96(%rdi),%xmm7,%xmm7
+ vpxor 112(%rdi),%xmm8,%xmm8
+
+ decq %r8
+
+ vmovdqu %xmm1,0(%rsi)
+ vmovdqu %xmm2,16(%rsi)
+ vmovdqu %xmm3,32(%rsi)
+ vmovdqu %xmm4,48(%rsi)
+ vmovdqu %xmm5,64(%rsi)
+ vmovdqu %xmm6,80(%rsi)
+ vmovdqu %xmm7,96(%rsi)
+ vmovdqu %xmm8,112(%rsi)
+
+ jne .L128_enc_msg_x8_loop1
+
+ addq $128,%rsi
+ addq $128,%rdi
+
+.L128_enc_msg_x8_check_remainder:
+ cmpq $0,%r10
+ je .L128_enc_msg_x8_out
+
+.L128_enc_msg_x8_loop2:
+
+
+ vmovdqa %xmm0,%xmm1
+ vpaddd one(%rip),%xmm0,%xmm0
+
+ vpxor (%rcx),%xmm1,%xmm1
+ vaesenc 16(%rcx),%xmm1,%xmm1
+ vaesenc 32(%rcx),%xmm1,%xmm1
+ vaesenc 48(%rcx),%xmm1,%xmm1
+ vaesenc 64(%rcx),%xmm1,%xmm1
+ vaesenc 80(%rcx),%xmm1,%xmm1
+ vaesenc 96(%rcx),%xmm1,%xmm1
+ vaesenc 112(%rcx),%xmm1,%xmm1
+ vaesenc 128(%rcx),%xmm1,%xmm1
+ vaesenc 144(%rcx),%xmm1,%xmm1
+ vaesenclast 160(%rcx),%xmm1,%xmm1
+
+
+ vpxor (%rdi),%xmm1,%xmm1
+
+ vmovdqu %xmm1,(%rsi)
+
+ addq $16,%rdi
+ addq $16,%rsi
+
+ decq %r10
+ jne .L128_enc_msg_x8_loop2
+
+.L128_enc_msg_x8_out:
+ movq %rbp,%rsp
+.cfi_def_cfa_register %rsp
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
+.globl aes128gcmsiv_dec
+.hidden aes128gcmsiv_dec
+.type aes128gcmsiv_dec,@function
+.align 16
+aes128gcmsiv_dec:
+.cfi_startproc
+ testq $~15,%r9
+ jnz .L128_dec_start
+ .byte 0xf3,0xc3
+
+.L128_dec_start:
+ vzeroupper
+ vmovdqa (%rdx),%xmm0
+ movq %rdx,%rax
+
+ leaq 32(%rax),%rax
+ leaq 32(%rcx),%rcx
+
+
+ vmovdqu (%rdi,%r9,1),%xmm15
+ vpor OR_MASK(%rip),%xmm15,%xmm15
+ andq $~15,%r9
+
+
+ cmpq $96,%r9
+ jb .L128_dec_loop2
+
+
+ subq $96,%r9
+ vmovdqa %xmm15,%xmm7
+ vpaddd one(%rip),%xmm7,%xmm8
+ vpaddd two(%rip),%xmm7,%xmm9
+ vpaddd one(%rip),%xmm9,%xmm10
+ vpaddd two(%rip),%xmm9,%xmm11
+ vpaddd one(%rip),%xmm11,%xmm12
+ vpaddd two(%rip),%xmm11,%xmm15
+
+ vpxor (%r8),%xmm7,%xmm7
+ vpxor (%r8),%xmm8,%xmm8
+ vpxor (%r8),%xmm9,%xmm9
+ vpxor (%r8),%xmm10,%xmm10
+ vpxor (%r8),%xmm11,%xmm11
+ vpxor (%r8),%xmm12,%xmm12
+
+ vmovdqu 16(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 32(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 48(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 64(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 80(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 96(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 112(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 128(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 144(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 160(%r8),%xmm4
+ vaesenclast %xmm4,%xmm7,%xmm7
+ vaesenclast %xmm4,%xmm8,%xmm8
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm4,%xmm10,%xmm10
+ vaesenclast %xmm4,%xmm11,%xmm11
+ vaesenclast %xmm4,%xmm12,%xmm12
+
+
+ vpxor 0(%rdi),%xmm7,%xmm7
+ vpxor 16(%rdi),%xmm8,%xmm8
+ vpxor 32(%rdi),%xmm9,%xmm9
+ vpxor 48(%rdi),%xmm10,%xmm10
+ vpxor 64(%rdi),%xmm11,%xmm11
+ vpxor 80(%rdi),%xmm12,%xmm12
+
+ vmovdqu %xmm7,0(%rsi)
+ vmovdqu %xmm8,16(%rsi)
+ vmovdqu %xmm9,32(%rsi)
+ vmovdqu %xmm10,48(%rsi)
+ vmovdqu %xmm11,64(%rsi)
+ vmovdqu %xmm12,80(%rsi)
+
+ addq $96,%rdi
+ addq $96,%rsi
+ jmp .L128_dec_loop1
+
+
+.align 64
+.L128_dec_loop1:
+ cmpq $96,%r9
+ jb .L128_dec_finish_96
+ subq $96,%r9
+
+ vmovdqa %xmm12,%xmm6
+ vmovdqa %xmm11,16-32(%rax)
+ vmovdqa %xmm10,32-32(%rax)
+ vmovdqa %xmm9,48-32(%rax)
+ vmovdqa %xmm8,64-32(%rax)
+ vmovdqa %xmm7,80-32(%rax)
+
+ vmovdqa %xmm15,%xmm7
+ vpaddd one(%rip),%xmm7,%xmm8
+ vpaddd two(%rip),%xmm7,%xmm9
+ vpaddd one(%rip),%xmm9,%xmm10
+ vpaddd two(%rip),%xmm9,%xmm11
+ vpaddd one(%rip),%xmm11,%xmm12
+ vpaddd two(%rip),%xmm11,%xmm15
+
+ vmovdqa (%r8),%xmm4
+ vpxor %xmm4,%xmm7,%xmm7
+ vpxor %xmm4,%xmm8,%xmm8
+ vpxor %xmm4,%xmm9,%xmm9
+ vpxor %xmm4,%xmm10,%xmm10
+ vpxor %xmm4,%xmm11,%xmm11
+ vpxor %xmm4,%xmm12,%xmm12
+
+ vmovdqu 0-32(%rcx),%xmm4
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 16(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu -16(%rax),%xmm6
+ vmovdqu -16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 32(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 0(%rax),%xmm6
+ vmovdqu 0(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 48(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 16(%rax),%xmm6
+ vmovdqu 16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 64(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 32(%rax),%xmm6
+ vmovdqu 32(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 80(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 96(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 112(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+
+ vmovdqa 80-32(%rax),%xmm6
+ vpxor %xmm0,%xmm6,%xmm6
+ vmovdqu 80-32(%rcx),%xmm5
+
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 128(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+
+ vpsrldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm2,%xmm5
+ vpslldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm0
+
+ vmovdqa poly(%rip),%xmm3
+
+ vmovdqu 144(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 160(%r8),%xmm6
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpxor 0(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm7,%xmm7
+ vpxor 16(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm8,%xmm8
+ vpxor 32(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vpxor 48(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm10,%xmm10
+ vpxor 64(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm11,%xmm11
+ vpxor 80(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm12,%xmm12
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vmovdqu %xmm7,0(%rsi)
+ vmovdqu %xmm8,16(%rsi)
+ vmovdqu %xmm9,32(%rsi)
+ vmovdqu %xmm10,48(%rsi)
+ vmovdqu %xmm11,64(%rsi)
+ vmovdqu %xmm12,80(%rsi)
+
+ vpxor %xmm5,%xmm0,%xmm0
+
+ leaq 96(%rdi),%rdi
+ leaq 96(%rsi),%rsi
+ jmp .L128_dec_loop1
+
+.L128_dec_finish_96:
+ vmovdqa %xmm12,%xmm6
+ vmovdqa %xmm11,16-32(%rax)
+ vmovdqa %xmm10,32-32(%rax)
+ vmovdqa %xmm9,48-32(%rax)
+ vmovdqa %xmm8,64-32(%rax)
+ vmovdqa %xmm7,80-32(%rax)
+
+ vmovdqu 0-32(%rcx),%xmm4
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu -16(%rax),%xmm6
+ vmovdqu -16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 0(%rax),%xmm6
+ vmovdqu 0(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 16(%rax),%xmm6
+ vmovdqu 16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 32(%rax),%xmm6
+ vmovdqu 32(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 80-32(%rax),%xmm6
+ vpxor %xmm0,%xmm6,%xmm6
+ vmovdqu 80-32(%rcx),%xmm5
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm2,%xmm5
+ vpslldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm0
+
+ vmovdqa poly(%rip),%xmm3
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpxor %xmm5,%xmm0,%xmm0
+
+.L128_dec_loop2:
+
+
+
+ cmpq $16,%r9
+ jb .L128_dec_out
+ subq $16,%r9
+
+ vmovdqa %xmm15,%xmm2
+ vpaddd one(%rip),%xmm15,%xmm15
+
+ vpxor 0(%r8),%xmm2,%xmm2
+ vaesenc 16(%r8),%xmm2,%xmm2
+ vaesenc 32(%r8),%xmm2,%xmm2
+ vaesenc 48(%r8),%xmm2,%xmm2
+ vaesenc 64(%r8),%xmm2,%xmm2
+ vaesenc 80(%r8),%xmm2,%xmm2
+ vaesenc 96(%r8),%xmm2,%xmm2
+ vaesenc 112(%r8),%xmm2,%xmm2
+ vaesenc 128(%r8),%xmm2,%xmm2
+ vaesenc 144(%r8),%xmm2,%xmm2
+ vaesenclast 160(%r8),%xmm2,%xmm2
+ vpxor (%rdi),%xmm2,%xmm2
+ vmovdqu %xmm2,(%rsi)
+ addq $16,%rdi
+ addq $16,%rsi
+
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa -32(%rcx),%xmm1
+ call GFMUL
+
+ jmp .L128_dec_loop2
+
+.L128_dec_out:
+ vmovdqu %xmm0,(%rdx)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes128gcmsiv_dec, .-aes128gcmsiv_dec
+.globl aes128gcmsiv_ecb_enc_block
+.hidden aes128gcmsiv_ecb_enc_block
+.type aes128gcmsiv_ecb_enc_block,@function
+.align 16
+aes128gcmsiv_ecb_enc_block:
+.cfi_startproc
+ vmovdqa (%rdi),%xmm1
+
+ vpxor (%rdx),%xmm1,%xmm1
+ vaesenc 16(%rdx),%xmm1,%xmm1
+ vaesenc 32(%rdx),%xmm1,%xmm1
+ vaesenc 48(%rdx),%xmm1,%xmm1
+ vaesenc 64(%rdx),%xmm1,%xmm1
+ vaesenc 80(%rdx),%xmm1,%xmm1
+ vaesenc 96(%rdx),%xmm1,%xmm1
+ vaesenc 112(%rdx),%xmm1,%xmm1
+ vaesenc 128(%rdx),%xmm1,%xmm1
+ vaesenc 144(%rdx),%xmm1,%xmm1
+ vaesenclast 160(%rdx),%xmm1,%xmm1
+
+ vmovdqa %xmm1,(%rsi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
+.globl aes256gcmsiv_aes_ks_enc_x1
+.hidden aes256gcmsiv_aes_ks_enc_x1
+.type aes256gcmsiv_aes_ks_enc_x1,@function
+.align 16
+aes256gcmsiv_aes_ks_enc_x1:
+.cfi_startproc
+ vmovdqa con1(%rip),%xmm0
+ vmovdqa mask(%rip),%xmm15
+ vmovdqa (%rdi),%xmm8
+ vmovdqa (%rcx),%xmm1
+ vmovdqa 16(%rcx),%xmm3
+ vpxor %xmm1,%xmm8,%xmm8
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm1,(%rdx)
+ vmovdqu %xmm3,16(%rdx)
+ vpxor %xmm14,%xmm14,%xmm14
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,32(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,48(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,64(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,80(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,96(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,112(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,128(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,144(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,160(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,176(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslld $1,%xmm0,%xmm0
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenc %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,192(%rdx)
+
+ vpshufd $0xff,%xmm1,%xmm2
+ vaesenclast %xmm14,%xmm2,%xmm2
+ vpslldq $4,%xmm3,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpxor %xmm2,%xmm3,%xmm3
+ vaesenc %xmm3,%xmm8,%xmm8
+ vmovdqu %xmm3,208(%rdx)
+
+ vpshufb %xmm15,%xmm3,%xmm2
+ vaesenclast %xmm0,%xmm2,%xmm2
+ vpslldq $4,%xmm1,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpslldq $4,%xmm4,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpxor %xmm2,%xmm1,%xmm1
+ vaesenclast %xmm1,%xmm8,%xmm8
+ vmovdqu %xmm1,224(%rdx)
+
+ vmovdqa %xmm8,(%rsi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
+.globl aes256gcmsiv_ecb_enc_block
+.hidden aes256gcmsiv_ecb_enc_block
+.type aes256gcmsiv_ecb_enc_block,@function
+.align 16
+aes256gcmsiv_ecb_enc_block:
+.cfi_startproc
+ vmovdqa (%rdi),%xmm1
+ vpxor (%rdx),%xmm1,%xmm1
+ vaesenc 16(%rdx),%xmm1,%xmm1
+ vaesenc 32(%rdx),%xmm1,%xmm1
+ vaesenc 48(%rdx),%xmm1,%xmm1
+ vaesenc 64(%rdx),%xmm1,%xmm1
+ vaesenc 80(%rdx),%xmm1,%xmm1
+ vaesenc 96(%rdx),%xmm1,%xmm1
+ vaesenc 112(%rdx),%xmm1,%xmm1
+ vaesenc 128(%rdx),%xmm1,%xmm1
+ vaesenc 144(%rdx),%xmm1,%xmm1
+ vaesenc 160(%rdx),%xmm1,%xmm1
+ vaesenc 176(%rdx),%xmm1,%xmm1
+ vaesenc 192(%rdx),%xmm1,%xmm1
+ vaesenc 208(%rdx),%xmm1,%xmm1
+ vaesenclast 224(%rdx),%xmm1,%xmm1
+ vmovdqa %xmm1,(%rsi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
+.globl aes256gcmsiv_enc_msg_x4
+.hidden aes256gcmsiv_enc_msg_x4
+.type aes256gcmsiv_enc_msg_x4,@function
+.align 16
+aes256gcmsiv_enc_msg_x4:
+.cfi_startproc
+ testq %r8,%r8
+ jnz .L256_enc_msg_x4_start
+ .byte 0xf3,0xc3
+
+.L256_enc_msg_x4_start:
+ movq %r8,%r10
+ shrq $4,%r8
+ shlq $60,%r10
+ jz .L256_enc_msg_x4_start2
+ addq $1,%r8
+
+.L256_enc_msg_x4_start2:
+ movq %r8,%r10
+ shlq $62,%r10
+ shrq $62,%r10
+
+
+ vmovdqa (%rdx),%xmm15
+ vpor OR_MASK(%rip),%xmm15,%xmm15
+
+ vmovdqa four(%rip),%xmm4
+ vmovdqa %xmm15,%xmm0
+ vpaddd one(%rip),%xmm15,%xmm1
+ vpaddd two(%rip),%xmm15,%xmm2
+ vpaddd three(%rip),%xmm15,%xmm3
+
+ shrq $2,%r8
+ je .L256_enc_msg_x4_check_remainder
+
+ subq $64,%rsi
+ subq $64,%rdi
+
+.L256_enc_msg_x4_loop1:
+ addq $64,%rsi
+ addq $64,%rdi
+
+ vmovdqa %xmm0,%xmm5
+ vmovdqa %xmm1,%xmm6
+ vmovdqa %xmm2,%xmm7
+ vmovdqa %xmm3,%xmm8
+
+ vpxor (%rcx),%xmm5,%xmm5
+ vpxor (%rcx),%xmm6,%xmm6
+ vpxor (%rcx),%xmm7,%xmm7
+ vpxor (%rcx),%xmm8,%xmm8
+
+ vmovdqu 16(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm0,%xmm0
+ vmovdqu 32(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm1,%xmm1
+ vmovdqu 48(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm2,%xmm2
+ vmovdqu 64(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vpaddd %xmm4,%xmm3,%xmm3
+
+ vmovdqu 80(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 96(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 112(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 128(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 144(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 160(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 176(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 192(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 208(%rcx),%xmm12
+ vaesenc %xmm12,%xmm5,%xmm5
+ vaesenc %xmm12,%xmm6,%xmm6
+ vaesenc %xmm12,%xmm7,%xmm7
+ vaesenc %xmm12,%xmm8,%xmm8
+
+ vmovdqu 224(%rcx),%xmm12
+ vaesenclast %xmm12,%xmm5,%xmm5
+ vaesenclast %xmm12,%xmm6,%xmm6
+ vaesenclast %xmm12,%xmm7,%xmm7
+ vaesenclast %xmm12,%xmm8,%xmm8
+
+
+
+ vpxor 0(%rdi),%xmm5,%xmm5
+ vpxor 16(%rdi),%xmm6,%xmm6
+ vpxor 32(%rdi),%xmm7,%xmm7
+ vpxor 48(%rdi),%xmm8,%xmm8
+
+ subq $1,%r8
+
+ vmovdqu %xmm5,0(%rsi)
+ vmovdqu %xmm6,16(%rsi)
+ vmovdqu %xmm7,32(%rsi)
+ vmovdqu %xmm8,48(%rsi)
+
+ jne .L256_enc_msg_x4_loop1
+
+ addq $64,%rsi
+ addq $64,%rdi
+
+.L256_enc_msg_x4_check_remainder:
+ cmpq $0,%r10
+ je .L256_enc_msg_x4_out
+
+.L256_enc_msg_x4_loop2:
+
+
+
+ vmovdqa %xmm0,%xmm5
+ vpaddd one(%rip),%xmm0,%xmm0
+ vpxor (%rcx),%xmm5,%xmm5
+ vaesenc 16(%rcx),%xmm5,%xmm5
+ vaesenc 32(%rcx),%xmm5,%xmm5
+ vaesenc 48(%rcx),%xmm5,%xmm5
+ vaesenc 64(%rcx),%xmm5,%xmm5
+ vaesenc 80(%rcx),%xmm5,%xmm5
+ vaesenc 96(%rcx),%xmm5,%xmm5
+ vaesenc 112(%rcx),%xmm5,%xmm5
+ vaesenc 128(%rcx),%xmm5,%xmm5
+ vaesenc 144(%rcx),%xmm5,%xmm5
+ vaesenc 160(%rcx),%xmm5,%xmm5
+ vaesenc 176(%rcx),%xmm5,%xmm5
+ vaesenc 192(%rcx),%xmm5,%xmm5
+ vaesenc 208(%rcx),%xmm5,%xmm5
+ vaesenclast 224(%rcx),%xmm5,%xmm5
+
+
+ vpxor (%rdi),%xmm5,%xmm5
+
+ vmovdqu %xmm5,(%rsi)
+
+ addq $16,%rdi
+ addq $16,%rsi
+
+ subq $1,%r10
+ jne .L256_enc_msg_x4_loop2
+
+.L256_enc_msg_x4_out:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
+.globl aes256gcmsiv_enc_msg_x8
+.hidden aes256gcmsiv_enc_msg_x8
+.type aes256gcmsiv_enc_msg_x8,@function
+.align 16
+aes256gcmsiv_enc_msg_x8:
+.cfi_startproc
+ testq %r8,%r8
+ jnz .L256_enc_msg_x8_start
+ .byte 0xf3,0xc3
+
+.L256_enc_msg_x8_start:
+
+ movq %rsp,%r11
+ subq $16,%r11
+ andq $-64,%r11
+
+ movq %r8,%r10
+ shrq $4,%r8
+ shlq $60,%r10
+ jz .L256_enc_msg_x8_start2
+ addq $1,%r8
+
+.L256_enc_msg_x8_start2:
+ movq %r8,%r10
+ shlq $61,%r10
+ shrq $61,%r10
+
+
+ vmovdqa (%rdx),%xmm1
+ vpor OR_MASK(%rip),%xmm1,%xmm1
+
+
+ vpaddd seven(%rip),%xmm1,%xmm0
+ vmovdqa %xmm0,(%r11)
+ vpaddd one(%rip),%xmm1,%xmm9
+ vpaddd two(%rip),%xmm1,%xmm10
+ vpaddd three(%rip),%xmm1,%xmm11
+ vpaddd four(%rip),%xmm1,%xmm12
+ vpaddd five(%rip),%xmm1,%xmm13
+ vpaddd six(%rip),%xmm1,%xmm14
+ vmovdqa %xmm1,%xmm0
+
+ shrq $3,%r8
+ jz .L256_enc_msg_x8_check_remainder
+
+ subq $128,%rsi
+ subq $128,%rdi
+
+.L256_enc_msg_x8_loop1:
+ addq $128,%rsi
+ addq $128,%rdi
+
+ vmovdqa %xmm0,%xmm1
+ vmovdqa %xmm9,%xmm2
+ vmovdqa %xmm10,%xmm3
+ vmovdqa %xmm11,%xmm4
+ vmovdqa %xmm12,%xmm5
+ vmovdqa %xmm13,%xmm6
+ vmovdqa %xmm14,%xmm7
+
+ vmovdqa (%r11),%xmm8
+
+ vpxor (%rcx),%xmm1,%xmm1
+ vpxor (%rcx),%xmm2,%xmm2
+ vpxor (%rcx),%xmm3,%xmm3
+ vpxor (%rcx),%xmm4,%xmm4
+ vpxor (%rcx),%xmm5,%xmm5
+ vpxor (%rcx),%xmm6,%xmm6
+ vpxor (%rcx),%xmm7,%xmm7
+ vpxor (%rcx),%xmm8,%xmm8
+
+ vmovdqu 16(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqa (%r11),%xmm14
+ vpaddd eight(%rip),%xmm14,%xmm14
+ vmovdqa %xmm14,(%r11)
+ vmovdqu 32(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpsubd one(%rip),%xmm14,%xmm14
+ vmovdqu 48(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm0,%xmm0
+ vmovdqu 64(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm9,%xmm9
+ vmovdqu 80(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm10,%xmm10
+ vmovdqu 96(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm11,%xmm11
+ vmovdqu 112(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm12,%xmm12
+ vmovdqu 128(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vpaddd eight(%rip),%xmm13,%xmm13
+ vmovdqu 144(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 160(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 176(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 192(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 208(%rcx),%xmm15
+ vaesenc %xmm15,%xmm1,%xmm1
+ vaesenc %xmm15,%xmm2,%xmm2
+ vaesenc %xmm15,%xmm3,%xmm3
+ vaesenc %xmm15,%xmm4,%xmm4
+ vaesenc %xmm15,%xmm5,%xmm5
+ vaesenc %xmm15,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm8,%xmm8
+
+ vmovdqu 224(%rcx),%xmm15
+ vaesenclast %xmm15,%xmm1,%xmm1
+ vaesenclast %xmm15,%xmm2,%xmm2
+ vaesenclast %xmm15,%xmm3,%xmm3
+ vaesenclast %xmm15,%xmm4,%xmm4
+ vaesenclast %xmm15,%xmm5,%xmm5
+ vaesenclast %xmm15,%xmm6,%xmm6
+ vaesenclast %xmm15,%xmm7,%xmm7
+ vaesenclast %xmm15,%xmm8,%xmm8
+
+
+
+ vpxor 0(%rdi),%xmm1,%xmm1
+ vpxor 16(%rdi),%xmm2,%xmm2
+ vpxor 32(%rdi),%xmm3,%xmm3
+ vpxor 48(%rdi),%xmm4,%xmm4
+ vpxor 64(%rdi),%xmm5,%xmm5
+ vpxor 80(%rdi),%xmm6,%xmm6
+ vpxor 96(%rdi),%xmm7,%xmm7
+ vpxor 112(%rdi),%xmm8,%xmm8
+
+ subq $1,%r8
+
+ vmovdqu %xmm1,0(%rsi)
+ vmovdqu %xmm2,16(%rsi)
+ vmovdqu %xmm3,32(%rsi)
+ vmovdqu %xmm4,48(%rsi)
+ vmovdqu %xmm5,64(%rsi)
+ vmovdqu %xmm6,80(%rsi)
+ vmovdqu %xmm7,96(%rsi)
+ vmovdqu %xmm8,112(%rsi)
+
+ jne .L256_enc_msg_x8_loop1
+
+ addq $128,%rsi
+ addq $128,%rdi
+
+.L256_enc_msg_x8_check_remainder:
+ cmpq $0,%r10
+ je .L256_enc_msg_x8_out
+
+.L256_enc_msg_x8_loop2:
+
+
+ vmovdqa %xmm0,%xmm1
+ vpaddd one(%rip),%xmm0,%xmm0
+
+ vpxor (%rcx),%xmm1,%xmm1
+ vaesenc 16(%rcx),%xmm1,%xmm1
+ vaesenc 32(%rcx),%xmm1,%xmm1
+ vaesenc 48(%rcx),%xmm1,%xmm1
+ vaesenc 64(%rcx),%xmm1,%xmm1
+ vaesenc 80(%rcx),%xmm1,%xmm1
+ vaesenc 96(%rcx),%xmm1,%xmm1
+ vaesenc 112(%rcx),%xmm1,%xmm1
+ vaesenc 128(%rcx),%xmm1,%xmm1
+ vaesenc 144(%rcx),%xmm1,%xmm1
+ vaesenc 160(%rcx),%xmm1,%xmm1
+ vaesenc 176(%rcx),%xmm1,%xmm1
+ vaesenc 192(%rcx),%xmm1,%xmm1
+ vaesenc 208(%rcx),%xmm1,%xmm1
+ vaesenclast 224(%rcx),%xmm1,%xmm1
+
+
+ vpxor (%rdi),%xmm1,%xmm1
+
+ vmovdqu %xmm1,(%rsi)
+
+ addq $16,%rdi
+ addq $16,%rsi
+ subq $1,%r10
+ jnz .L256_enc_msg_x8_loop2
+
+.L256_enc_msg_x8_out:
+ .byte 0xf3,0xc3
+
+.cfi_endproc
+.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
+.globl aes256gcmsiv_dec
+.hidden aes256gcmsiv_dec
+.type aes256gcmsiv_dec,@function
+.align 16
+aes256gcmsiv_dec:
+.cfi_startproc
+ testq $~15,%r9
+ jnz .L256_dec_start
+ .byte 0xf3,0xc3
+
+.L256_dec_start:
+ vzeroupper
+ vmovdqa (%rdx),%xmm0
+ movq %rdx,%rax
+
+ leaq 32(%rax),%rax
+ leaq 32(%rcx),%rcx
+
+
+ vmovdqu (%rdi,%r9,1),%xmm15
+ vpor OR_MASK(%rip),%xmm15,%xmm15
+ andq $~15,%r9
+
+
+ cmpq $96,%r9
+ jb .L256_dec_loop2
+
+
+ subq $96,%r9
+ vmovdqa %xmm15,%xmm7
+ vpaddd one(%rip),%xmm7,%xmm8
+ vpaddd two(%rip),%xmm7,%xmm9
+ vpaddd one(%rip),%xmm9,%xmm10
+ vpaddd two(%rip),%xmm9,%xmm11
+ vpaddd one(%rip),%xmm11,%xmm12
+ vpaddd two(%rip),%xmm11,%xmm15
+
+ vpxor (%r8),%xmm7,%xmm7
+ vpxor (%r8),%xmm8,%xmm8
+ vpxor (%r8),%xmm9,%xmm9
+ vpxor (%r8),%xmm10,%xmm10
+ vpxor (%r8),%xmm11,%xmm11
+ vpxor (%r8),%xmm12,%xmm12
+
+ vmovdqu 16(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 32(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 48(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 64(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 80(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 96(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 112(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 128(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 144(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 160(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 176(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 192(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 208(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 224(%r8),%xmm4
+ vaesenclast %xmm4,%xmm7,%xmm7
+ vaesenclast %xmm4,%xmm8,%xmm8
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm4,%xmm10,%xmm10
+ vaesenclast %xmm4,%xmm11,%xmm11
+ vaesenclast %xmm4,%xmm12,%xmm12
+
+
+ vpxor 0(%rdi),%xmm7,%xmm7
+ vpxor 16(%rdi),%xmm8,%xmm8
+ vpxor 32(%rdi),%xmm9,%xmm9
+ vpxor 48(%rdi),%xmm10,%xmm10
+ vpxor 64(%rdi),%xmm11,%xmm11
+ vpxor 80(%rdi),%xmm12,%xmm12
+
+ vmovdqu %xmm7,0(%rsi)
+ vmovdqu %xmm8,16(%rsi)
+ vmovdqu %xmm9,32(%rsi)
+ vmovdqu %xmm10,48(%rsi)
+ vmovdqu %xmm11,64(%rsi)
+ vmovdqu %xmm12,80(%rsi)
+
+ addq $96,%rdi
+ addq $96,%rsi
+ jmp .L256_dec_loop1
+
+
+.align 64
+.L256_dec_loop1:
+ cmpq $96,%r9
+ jb .L256_dec_finish_96
+ subq $96,%r9
+
+ vmovdqa %xmm12,%xmm6
+ vmovdqa %xmm11,16-32(%rax)
+ vmovdqa %xmm10,32-32(%rax)
+ vmovdqa %xmm9,48-32(%rax)
+ vmovdqa %xmm8,64-32(%rax)
+ vmovdqa %xmm7,80-32(%rax)
+
+ vmovdqa %xmm15,%xmm7
+ vpaddd one(%rip),%xmm7,%xmm8
+ vpaddd two(%rip),%xmm7,%xmm9
+ vpaddd one(%rip),%xmm9,%xmm10
+ vpaddd two(%rip),%xmm9,%xmm11
+ vpaddd one(%rip),%xmm11,%xmm12
+ vpaddd two(%rip),%xmm11,%xmm15
+
+ vmovdqa (%r8),%xmm4
+ vpxor %xmm4,%xmm7,%xmm7
+ vpxor %xmm4,%xmm8,%xmm8
+ vpxor %xmm4,%xmm9,%xmm9
+ vpxor %xmm4,%xmm10,%xmm10
+ vpxor %xmm4,%xmm11,%xmm11
+ vpxor %xmm4,%xmm12,%xmm12
+
+ vmovdqu 0-32(%rcx),%xmm4
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 16(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu -16(%rax),%xmm6
+ vmovdqu -16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 32(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 0(%rax),%xmm6
+ vmovdqu 0(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 48(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 16(%rax),%xmm6
+ vmovdqu 16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 64(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 32(%rax),%xmm6
+ vmovdqu 32(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 80(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 96(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 112(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+
+ vmovdqa 80-32(%rax),%xmm6
+ vpxor %xmm0,%xmm6,%xmm6
+ vmovdqu 80-32(%rcx),%xmm5
+
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 128(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+
+ vpsrldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm2,%xmm5
+ vpslldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm0
+
+ vmovdqa poly(%rip),%xmm3
+
+ vmovdqu 144(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 160(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 176(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 192(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 208(%r8),%xmm4
+ vaesenc %xmm4,%xmm7,%xmm7
+ vaesenc %xmm4,%xmm8,%xmm8
+ vaesenc %xmm4,%xmm9,%xmm9
+ vaesenc %xmm4,%xmm10,%xmm10
+ vaesenc %xmm4,%xmm11,%xmm11
+ vaesenc %xmm4,%xmm12,%xmm12
+
+ vmovdqu 224(%r8),%xmm6
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpxor 0(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm7,%xmm7
+ vpxor 16(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm8,%xmm8
+ vpxor 32(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vpxor 48(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm10,%xmm10
+ vpxor 64(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm11,%xmm11
+ vpxor 80(%rdi),%xmm6,%xmm4
+ vaesenclast %xmm4,%xmm12,%xmm12
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vmovdqu %xmm7,0(%rsi)
+ vmovdqu %xmm8,16(%rsi)
+ vmovdqu %xmm9,32(%rsi)
+ vmovdqu %xmm10,48(%rsi)
+ vmovdqu %xmm11,64(%rsi)
+ vmovdqu %xmm12,80(%rsi)
+
+ vpxor %xmm5,%xmm0,%xmm0
+
+ leaq 96(%rdi),%rdi
+ leaq 96(%rsi),%rsi
+ jmp .L256_dec_loop1
+
+.L256_dec_finish_96:
+ vmovdqa %xmm12,%xmm6
+ vmovdqa %xmm11,16-32(%rax)
+ vmovdqa %xmm10,32-32(%rax)
+ vmovdqa %xmm9,48-32(%rax)
+ vmovdqa %xmm8,64-32(%rax)
+ vmovdqa %xmm7,80-32(%rax)
+
+ vmovdqu 0-32(%rcx),%xmm4
+ vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1
+ vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3
+ vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu -16(%rax),%xmm6
+ vmovdqu -16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 0(%rax),%xmm6
+ vmovdqu 0(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 16(%rax),%xmm6
+ vmovdqu 16(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vmovdqu 32(%rax),%xmm6
+ vmovdqu 32(%rcx),%xmm13
+
+ vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+
+ vmovdqu 80-32(%rax),%xmm6
+ vpxor %xmm0,%xmm6,%xmm6
+ vmovdqu 80-32(%rcx),%xmm5
+ vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+ vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm2,%xmm5
+ vpslldq $8,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm0
+
+ vmovdqa poly(%rip),%xmm3
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpalignr $8,%xmm0,%xmm0,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0
+ vpxor %xmm0,%xmm2,%xmm0
+
+ vpxor %xmm5,%xmm0,%xmm0
+
+.L256_dec_loop2:
+
+
+
+ cmpq $16,%r9
+ jb .L256_dec_out
+ subq $16,%r9
+
+ vmovdqa %xmm15,%xmm2
+ vpaddd one(%rip),%xmm15,%xmm15
+
+ vpxor 0(%r8),%xmm2,%xmm2
+ vaesenc 16(%r8),%xmm2,%xmm2
+ vaesenc 32(%r8),%xmm2,%xmm2
+ vaesenc 48(%r8),%xmm2,%xmm2
+ vaesenc 64(%r8),%xmm2,%xmm2
+ vaesenc 80(%r8),%xmm2,%xmm2
+ vaesenc 96(%r8),%xmm2,%xmm2
+ vaesenc 112(%r8),%xmm2,%xmm2
+ vaesenc 128(%r8),%xmm2,%xmm2
+ vaesenc 144(%r8),%xmm2,%xmm2
+ vaesenc 160(%r8),%xmm2,%xmm2
+ vaesenc 176(%r8),%xmm2,%xmm2
+ vaesenc 192(%r8),%xmm2,%xmm2
+ vaesenc 208(%r8),%xmm2,%xmm2
+ vaesenclast 224(%r8),%xmm2,%xmm2
+ vpxor (%rdi),%xmm2,%xmm2
+ vmovdqu %xmm2,(%rsi)
+ addq $16,%rdi
+ addq $16,%rsi
+
+ vpxor %xmm2,%xmm0,%xmm0
+ vmovdqa -32(%rcx),%xmm1
+ call GFMUL
+
+ jmp .L256_dec_loop2
+
+.L256_dec_out:
+ vmovdqu %xmm0,(%rdx)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes256gcmsiv_dec, .-aes256gcmsiv_dec
+.globl aes256gcmsiv_kdf
+.hidden aes256gcmsiv_kdf
+.type aes256gcmsiv_kdf,@function
+.align 16
+aes256gcmsiv_kdf:
+.cfi_startproc
+
+
+
+
+ vmovdqa (%rdx),%xmm1
+ vmovdqa 0(%rdi),%xmm4
+ vmovdqa and_mask(%rip),%xmm11
+ vmovdqa one(%rip),%xmm8
+ vpshufd $0x90,%xmm4,%xmm4
+ vpand %xmm11,%xmm4,%xmm4
+ vpaddd %xmm8,%xmm4,%xmm6
+ vpaddd %xmm8,%xmm6,%xmm7
+ vpaddd %xmm8,%xmm7,%xmm11
+ vpaddd %xmm8,%xmm11,%xmm12
+ vpaddd %xmm8,%xmm12,%xmm13
+
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm1,%xmm6,%xmm6
+ vpxor %xmm1,%xmm7,%xmm7
+ vpxor %xmm1,%xmm11,%xmm11
+ vpxor %xmm1,%xmm12,%xmm12
+ vpxor %xmm1,%xmm13,%xmm13
+
+ vmovdqa 16(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 32(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 48(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 64(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 80(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 96(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 112(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 128(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 144(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 160(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 176(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 192(%rdx),%xmm2
+ vaesenc %xmm2,%xmm4,%xmm4
+ vaesenc %xmm2,%xmm6,%xmm6
+ vaesenc %xmm2,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vaesenc %xmm2,%xmm12,%xmm12
+ vaesenc %xmm2,%xmm13,%xmm13
+
+ vmovdqa 208(%rdx),%xmm1
+ vaesenc %xmm1,%xmm4,%xmm4
+ vaesenc %xmm1,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+
+ vmovdqa 224(%rdx),%xmm2
+ vaesenclast %xmm2,%xmm4,%xmm4
+ vaesenclast %xmm2,%xmm6,%xmm6
+ vaesenclast %xmm2,%xmm7,%xmm7
+ vaesenclast %xmm2,%xmm11,%xmm11
+ vaesenclast %xmm2,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+
+
+ vmovdqa %xmm4,0(%rsi)
+ vmovdqa %xmm6,16(%rsi)
+ vmovdqa %xmm7,32(%rsi)
+ vmovdqa %xmm11,48(%rsi)
+ vmovdqa %xmm12,64(%rsi)
+ vmovdqa %xmm13,80(%rsi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S
@@ -1,0 +1,8922 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+chacha20_poly1305_constants:
+
+.align 64
+.Lchacha20_consts:
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
+.Lrol8:
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
+.Lrol16:
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
+.Lavx2_init:
+.long 0,0,0,0
+.Lsse_inc:
+.long 1,0,0,0
+.Lavx2_inc:
+.long 2,0,0,0,2,0,0,0
+.Lclamp:
+.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
+.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
+.align 16
+.Land_masks:
+.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
+.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+.type poly_hash_ad_internal,@function
+.align 64
+poly_hash_ad_internal:
+.cfi_startproc
+.cfi_def_cfa rsp, 8
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+ cmpq $13,%r8
+ jne .Lhash_ad_loop
+.Lpoly_fast_tls_ad:
+
+ movq (%rcx),%r10
+ movq 5(%rcx),%r11
+ shrq $24,%r11
+ movq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ .byte 0xf3,0xc3
+.Lhash_ad_loop:
+
+ cmpq $16,%r8
+ jb .Lhash_ad_tail
+ addq 0+0(%rcx),%r10
+ adcq 8+0(%rcx),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rcx),%rcx
+ subq $16,%r8
+ jmp .Lhash_ad_loop
+.Lhash_ad_tail:
+ cmpq $0,%r8
+ je .Lhash_ad_done
+
+ xorq %r13,%r13
+ xorq %r14,%r14
+ xorq %r15,%r15
+ addq %r8,%rcx
+.Lhash_ad_tail_loop:
+ shldq $8,%r13,%r14
+ shlq $8,%r13
+ movzbq -1(%rcx),%r15
+ xorq %r15,%r13
+ decq %rcx
+ decq %r8
+ jne .Lhash_ad_tail_loop
+
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+.Lhash_ad_done:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size poly_hash_ad_internal, .-poly_hash_ad_internal
+
+.globl chacha20_poly1305_open
+.hidden chacha20_poly1305_open
+.type chacha20_poly1305_open,@function
+.align 64
+chacha20_poly1305_open:
+.cfi_startproc
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+
+
+ pushq %r9
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r9,-64
+ subq $288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset 288 + 32
+
+ leaq 32(%rsp),%rbp
+ andq $-32,%rbp
+
+ movq %rdx,%rbx
+ movq %r8,0+0+32(%rbp)
+ movq %rbx,8+0+32(%rbp)
+
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+ andl $288,%eax
+ xorl $288,%eax
+ jz chacha20_poly1305_open_avx2
+
+ cmpq $128,%rbx
+ jbe .Lopen_sse_128
+
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqu 0(%r9),%xmm4
+ movdqu 16(%r9),%xmm8
+ movdqu 32(%r9),%xmm12
+
+ movdqa %xmm12,%xmm7
+
+ movdqa %xmm4,0+48(%rbp)
+ movdqa %xmm8,0+64(%rbp)
+ movdqa %xmm12,0+96(%rbp)
+ movq $10,%r10
+.Lopen_sse_init_rounds:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ decq %r10
+ jne .Lopen_sse_init_rounds
+
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+
+ pand .Lclamp(%rip),%xmm0
+ movdqa %xmm0,0+0(%rbp)
+ movdqa %xmm4,0+16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+.Lopen_sse_main_loop:
+ cmpq $256,%rbx
+ jb .Lopen_sse_tail
+
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa 0+96(%rbp),%xmm15
+ paddd .Lsse_inc(%rip),%xmm15
+ movdqa %xmm15,%xmm14
+ paddd .Lsse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+ movdqa %xmm15,0+144(%rbp)
+
+
+
+ movq $4,%rcx
+ movq %rsi,%r8
+.Lopen_sse_main_loop_rounds:
+ movdqa %xmm8,0+80(%rbp)
+ movdqa .Lrol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+
+ leaq 16(%r8),%r8
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movdqa .Lrol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ movdqa %xmm8,0+80(%rbp)
+ movdqa .Lrol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa .Lrol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ decq %rcx
+ jge .Lopen_sse_main_loop_rounds
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ cmpq $-6,%rcx
+ jg .Lopen_sse_main_loop_rounds
+ paddd .Lchacha20_consts(%rip),%xmm3
+ paddd 0+48(%rbp),%xmm7
+ paddd 0+64(%rbp),%xmm11
+ paddd 0+144(%rbp),%xmm15
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqa %xmm12,0+80(%rbp)
+ movdqu 0 + 0(%rsi),%xmm12
+ pxor %xmm3,%xmm12
+ movdqu %xmm12,0 + 0(%rdi)
+ movdqu 16 + 0(%rsi),%xmm12
+ pxor %xmm7,%xmm12
+ movdqu %xmm12,16 + 0(%rdi)
+ movdqu 32 + 0(%rsi),%xmm12
+ pxor %xmm11,%xmm12
+ movdqu %xmm12,32 + 0(%rdi)
+ movdqu 48 + 0(%rsi),%xmm12
+ pxor %xmm15,%xmm12
+ movdqu %xmm12,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 64(%rdi)
+ movdqu %xmm6,16 + 64(%rdi)
+ movdqu %xmm10,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 128(%rdi)
+ movdqu %xmm5,16 + 128(%rdi)
+ movdqu %xmm9,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+ movdqu 0 + 192(%rsi),%xmm3
+ movdqu 16 + 192(%rsi),%xmm7
+ movdqu 32 + 192(%rsi),%xmm11
+ movdqu 48 + 192(%rsi),%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm7,%xmm4
+ pxor %xmm11,%xmm8
+ pxor 0+80(%rbp),%xmm15
+ movdqu %xmm0,0 + 192(%rdi)
+ movdqu %xmm4,16 + 192(%rdi)
+ movdqu %xmm8,32 + 192(%rdi)
+ movdqu %xmm15,48 + 192(%rdi)
+
+ leaq 256(%rsi),%rsi
+ leaq 256(%rdi),%rdi
+ subq $256,%rbx
+ jmp .Lopen_sse_main_loop
+.Lopen_sse_tail:
+
+ testq %rbx,%rbx
+ jz .Lopen_sse_finalize
+ cmpq $192,%rbx
+ ja .Lopen_sse_tail_256
+ cmpq $128,%rbx
+ ja .Lopen_sse_tail_192
+ cmpq $64,%rbx
+ ja .Lopen_sse_tail_128
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa 0+96(%rbp),%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+
+ xorq %r8,%r8
+ movq %rbx,%rcx
+ cmpq $16,%rcx
+ jb .Lopen_sse_tail_64_rounds
+.Lopen_sse_tail_64_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ subq $16,%rcx
+.Lopen_sse_tail_64_rounds:
+ addq $16,%r8
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ cmpq $16,%rcx
+ jae .Lopen_sse_tail_64_rounds_and_x1hash
+ cmpq $160,%r8
+ jne .Lopen_sse_tail_64_rounds
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+
+ jmp .Lopen_sse_tail_64_dec_loop
+
+.Lopen_sse_tail_128:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa 0+96(%rbp),%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+
+ movq %rbx,%rcx
+ andq $-16,%rcx
+ xorq %r8,%r8
+.Lopen_sse_tail_128_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+.Lopen_sse_tail_128_rounds:
+ addq $16,%r8
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+
+ cmpq %rcx,%r8
+ jb .Lopen_sse_tail_128_rounds_and_x1hash
+ cmpq $160,%r8
+ jne .Lopen_sse_tail_128_rounds
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 0(%rdi)
+ movdqu %xmm5,16 + 0(%rdi)
+ movdqu %xmm9,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+
+ subq $64,%rbx
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ jmp .Lopen_sse_tail_64_dec_loop
+
+.Lopen_sse_tail_192:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa 0+96(%rbp),%xmm14
+ paddd .Lsse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+
+ movq %rbx,%rcx
+ movq $160,%r8
+ cmpq $160,%rcx
+ cmovgq %r8,%rcx
+ andq $-16,%rcx
+ xorq %r8,%r8
+.Lopen_sse_tail_192_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+.Lopen_sse_tail_192_rounds:
+ addq $16,%r8
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ cmpq %rcx,%r8
+ jb .Lopen_sse_tail_192_rounds_and_x1hash
+ cmpq $160,%r8
+ jne .Lopen_sse_tail_192_rounds
+ cmpq $176,%rbx
+ jb .Lopen_sse_tail_192_finish
+ addq 0+160(%rsi),%r10
+ adcq 8+160(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ cmpq $192,%rbx
+ jb .Lopen_sse_tail_192_finish
+ addq 0+176(%rsi),%r10
+ adcq 8+176(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+.Lopen_sse_tail_192_finish:
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 0(%rdi)
+ movdqu %xmm6,16 + 0(%rdi)
+ movdqu %xmm10,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 64(%rdi)
+ movdqu %xmm5,16 + 64(%rdi)
+ movdqu %xmm9,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+
+ subq $128,%rbx
+ leaq 128(%rsi),%rsi
+ leaq 128(%rdi),%rdi
+ jmp .Lopen_sse_tail_64_dec_loop
+
+.Lopen_sse_tail_256:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa 0+96(%rbp),%xmm15
+ paddd .Lsse_inc(%rip),%xmm15
+ movdqa %xmm15,%xmm14
+ paddd .Lsse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+ movdqa %xmm15,0+144(%rbp)
+
+ xorq %r8,%r8
+.Lopen_sse_tail_256_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movdqa %xmm11,0+80(%rbp)
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm4
+ pxor %xmm11,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm4
+ pxor %xmm11,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm5
+ pxor %xmm11,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm5
+ pxor %xmm11,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm6
+ pxor %xmm11,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm6
+ pxor %xmm11,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ movdqa 0+80(%rbp),%xmm11
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movdqa %xmm9,0+80(%rbp)
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb .Lrol16(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $12,%xmm9
+ psrld $20,%xmm7
+ pxor %xmm9,%xmm7
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb .Lrol8(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $7,%xmm9
+ psrld $25,%xmm7
+ pxor %xmm9,%xmm7
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+ movdqa 0+80(%rbp),%xmm9
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ movdqa %xmm11,0+80(%rbp)
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm4
+ pxor %xmm11,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm4
+ pxor %xmm11,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm5
+ pxor %xmm11,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm5
+ pxor %xmm11,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $12,%xmm11
+ psrld $20,%xmm6
+ pxor %xmm11,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm11
+ pslld $7,%xmm11
+ psrld $25,%xmm6
+ pxor %xmm11,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+ movdqa 0+80(%rbp),%xmm11
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ movdqa %xmm9,0+80(%rbp)
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb .Lrol16(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $12,%xmm9
+ psrld $20,%xmm7
+ pxor %xmm9,%xmm7
+ paddd %xmm7,%xmm3
+ pxor %xmm3,%xmm15
+ pshufb .Lrol8(%rip),%xmm15
+ paddd %xmm15,%xmm11
+ pxor %xmm11,%xmm7
+ movdqa %xmm7,%xmm9
+ pslld $7,%xmm9
+ psrld $25,%xmm7
+ pxor %xmm9,%xmm7
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+ movdqa 0+80(%rbp),%xmm9
+
+ addq $16,%r8
+ cmpq $160,%r8
+ jb .Lopen_sse_tail_256_rounds_and_x1hash
+
+ movq %rbx,%rcx
+ andq $-16,%rcx
+.Lopen_sse_tail_256_hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ addq $16,%r8
+ cmpq %rcx,%r8
+ jb .Lopen_sse_tail_256_hash
+ paddd .Lchacha20_consts(%rip),%xmm3
+ paddd 0+48(%rbp),%xmm7
+ paddd 0+64(%rbp),%xmm11
+ paddd 0+144(%rbp),%xmm15
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqa %xmm12,0+80(%rbp)
+ movdqu 0 + 0(%rsi),%xmm12
+ pxor %xmm3,%xmm12
+ movdqu %xmm12,0 + 0(%rdi)
+ movdqu 16 + 0(%rsi),%xmm12
+ pxor %xmm7,%xmm12
+ movdqu %xmm12,16 + 0(%rdi)
+ movdqu 32 + 0(%rsi),%xmm12
+ pxor %xmm11,%xmm12
+ movdqu %xmm12,32 + 0(%rdi)
+ movdqu 48 + 0(%rsi),%xmm12
+ pxor %xmm15,%xmm12
+ movdqu %xmm12,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 64(%rdi)
+ movdqu %xmm6,16 + 64(%rdi)
+ movdqu %xmm10,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 128(%rdi)
+ movdqu %xmm5,16 + 128(%rdi)
+ movdqu %xmm9,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+
+ movdqa 0+80(%rbp),%xmm12
+ subq $192,%rbx
+ leaq 192(%rsi),%rsi
+ leaq 192(%rdi),%rdi
+
+
+.Lopen_sse_tail_64_dec_loop:
+ cmpq $16,%rbx
+ jb .Lopen_sse_tail_16_init
+ subq $16,%rbx
+ movdqu (%rsi),%xmm3
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ movdqa %xmm4,%xmm0
+ movdqa %xmm8,%xmm4
+ movdqa %xmm12,%xmm8
+ jmp .Lopen_sse_tail_64_dec_loop
+.Lopen_sse_tail_16_init:
+ movdqa %xmm0,%xmm1
+
+
+.Lopen_sse_tail_16:
+ testq %rbx,%rbx
+ jz .Lopen_sse_finalize
+
+
+
+ pxor %xmm3,%xmm3
+ leaq -1(%rsi,%rbx,1),%rsi
+ movq %rbx,%r8
+.Lopen_sse_tail_16_compose:
+ pslldq $1,%xmm3
+ pinsrb $0,(%rsi),%xmm3
+ subq $1,%rsi
+ subq $1,%r8
+ jnz .Lopen_sse_tail_16_compose
+
+.byte 102,73,15,126,221
+ pextrq $1,%xmm3,%r14
+
+ pxor %xmm1,%xmm3
+
+
+.Lopen_sse_tail_16_extract:
+ pextrb $0,%xmm3,(%rdi)
+ psrldq $1,%xmm3
+ addq $1,%rdi
+ subq $1,%rbx
+ jne .Lopen_sse_tail_16_extract
+
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+.Lopen_sse_finalize:
+ addq 0+0+32(%rbp),%r10
+ adcq 8+0+32(%rbp),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movq %r10,%r13
+ movq %r11,%r14
+ movq %r12,%r15
+ subq $-5,%r10
+ sbbq $-1,%r11
+ sbbq $3,%r12
+ cmovcq %r13,%r10
+ cmovcq %r14,%r11
+ cmovcq %r15,%r12
+
+ addq 0+0+16(%rbp),%r10
+ adcq 8+0+16(%rbp),%r11
+
+.cfi_remember_state
+ addq $288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset -(288 + 32)
+
+ popq %r9
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r9
+ movq %r10,(%r9)
+ movq %r11,8(%r9)
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbx
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+ .byte 0xf3,0xc3
+
+.Lopen_sse_128:
+.cfi_restore_state
+ movdqu .Lchacha20_consts(%rip),%xmm0
+ movdqa %xmm0,%xmm1
+ movdqa %xmm0,%xmm2
+ movdqu 0(%r9),%xmm4
+ movdqa %xmm4,%xmm5
+ movdqa %xmm4,%xmm6
+ movdqu 16(%r9),%xmm8
+ movdqa %xmm8,%xmm9
+ movdqa %xmm8,%xmm10
+ movdqu 32(%r9),%xmm12
+ movdqa %xmm12,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm14
+ paddd .Lsse_inc(%rip),%xmm14
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa %xmm13,%xmm15
+ movq $10,%r10
+
+.Lopen_sse_128_rounds:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ decq %r10
+ jnz .Lopen_sse_128_rounds
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd %xmm7,%xmm4
+ paddd %xmm7,%xmm5
+ paddd %xmm7,%xmm6
+ paddd %xmm11,%xmm9
+ paddd %xmm11,%xmm10
+ paddd %xmm15,%xmm13
+ paddd .Lsse_inc(%rip),%xmm15
+ paddd %xmm15,%xmm14
+
+ pand .Lclamp(%rip),%xmm0
+ movdqa %xmm0,0+0(%rbp)
+ movdqa %xmm4,0+16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+.Lopen_sse_128_xor_hash:
+ cmpq $16,%rbx
+ jb .Lopen_sse_tail_16
+ subq $16,%rbx
+ addq 0+0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+
+
+ movdqu 0(%rsi),%xmm3
+ pxor %xmm3,%xmm1
+ movdqu %xmm1,0(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movdqa %xmm5,%xmm1
+ movdqa %xmm9,%xmm5
+ movdqa %xmm13,%xmm9
+ movdqa %xmm2,%xmm13
+ movdqa %xmm6,%xmm2
+ movdqa %xmm10,%xmm6
+ movdqa %xmm14,%xmm10
+ jmp .Lopen_sse_128_xor_hash
+.size chacha20_poly1305_open, .-chacha20_poly1305_open
+.cfi_endproc
+
+
+
+
+
+
+
+.globl chacha20_poly1305_seal
+.hidden chacha20_poly1305_seal
+.type chacha20_poly1305_seal,@function
+.align 64
+chacha20_poly1305_seal:
+.cfi_startproc
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+
+
+ pushq %r9
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r9,-64
+ subq $288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset 288 + 32
+ leaq 32(%rsp),%rbp
+ andq $-32,%rbp
+
+ movq 56(%r9),%rbx
+ addq %rdx,%rbx
+ movq %r8,0+0+32(%rbp)
+ movq %rbx,8+0+32(%rbp)
+ movq %rdx,%rbx
+
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+ andl $288,%eax
+ xorl $288,%eax
+ jz chacha20_poly1305_seal_avx2
+
+ cmpq $128,%rbx
+ jbe .Lseal_sse_128
+
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqu 0(%r9),%xmm4
+ movdqu 16(%r9),%xmm8
+ movdqu 32(%r9),%xmm12
+
+ movdqa %xmm0,%xmm1
+ movdqa %xmm0,%xmm2
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm5
+ movdqa %xmm4,%xmm6
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm9
+ movdqa %xmm8,%xmm10
+ movdqa %xmm8,%xmm11
+ movdqa %xmm12,%xmm15
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,%xmm14
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,%xmm13
+ paddd .Lsse_inc(%rip),%xmm12
+
+ movdqa %xmm4,0+48(%rbp)
+ movdqa %xmm8,0+64(%rbp)
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+ movdqa %xmm15,0+144(%rbp)
+ movq $10,%r10
+.Lseal_sse_init_rounds:
+ movdqa %xmm8,0+80(%rbp)
+ movdqa .Lrol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa .Lrol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ movdqa %xmm8,0+80(%rbp)
+ movdqa .Lrol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa .Lrol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ decq %r10
+ jnz .Lseal_sse_init_rounds
+ paddd .Lchacha20_consts(%rip),%xmm3
+ paddd 0+48(%rbp),%xmm7
+ paddd 0+64(%rbp),%xmm11
+ paddd 0+144(%rbp),%xmm15
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+
+
+ pand .Lclamp(%rip),%xmm3
+ movdqa %xmm3,0+0(%rbp)
+ movdqa %xmm7,0+16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 0(%rdi)
+ movdqu %xmm6,16 + 0(%rdi)
+ movdqu %xmm10,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 64(%rdi)
+ movdqu %xmm5,16 + 64(%rdi)
+ movdqu %xmm9,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+
+ cmpq $192,%rbx
+ ja .Lseal_sse_main_init
+ movq $128,%rcx
+ subq $128,%rbx
+ leaq 128(%rsi),%rsi
+ jmp .Lseal_sse_128_tail_hash
+.Lseal_sse_main_init:
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm7,%xmm4
+ pxor %xmm11,%xmm8
+ pxor %xmm12,%xmm15
+ movdqu %xmm0,0 + 128(%rdi)
+ movdqu %xmm4,16 + 128(%rdi)
+ movdqu %xmm8,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+
+ movq $192,%rcx
+ subq $192,%rbx
+ leaq 192(%rsi),%rsi
+ movq $2,%rcx
+ movq $8,%r8
+ cmpq $64,%rbx
+ jbe .Lseal_sse_tail_64
+ cmpq $128,%rbx
+ jbe .Lseal_sse_tail_128
+ cmpq $192,%rbx
+ jbe .Lseal_sse_tail_192
+
+.Lseal_sse_main_loop:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa %xmm0,%xmm3
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa 0+96(%rbp),%xmm15
+ paddd .Lsse_inc(%rip),%xmm15
+ movdqa %xmm15,%xmm14
+ paddd .Lsse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+ movdqa %xmm15,0+144(%rbp)
+
+.align 32
+.Lseal_sse_main_rounds:
+ movdqa %xmm8,0+80(%rbp)
+ movdqa .Lrol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movdqa .Lrol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+.byte 102,15,58,15,255,4
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,12
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ movdqa %xmm8,0+80(%rbp)
+ movdqa .Lrol16(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $20,%xmm8
+ pslld $32-20,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa .Lrol8(%rip),%xmm8
+ paddd %xmm7,%xmm3
+ paddd %xmm6,%xmm2
+ paddd %xmm5,%xmm1
+ paddd %xmm4,%xmm0
+ pxor %xmm3,%xmm15
+ pxor %xmm2,%xmm14
+ pxor %xmm1,%xmm13
+ pxor %xmm0,%xmm12
+.byte 102,69,15,56,0,248
+.byte 102,69,15,56,0,240
+.byte 102,69,15,56,0,232
+.byte 102,69,15,56,0,224
+ movdqa 0+80(%rbp),%xmm8
+ paddd %xmm15,%xmm11
+ paddd %xmm14,%xmm10
+ paddd %xmm13,%xmm9
+ paddd %xmm12,%xmm8
+ pxor %xmm11,%xmm7
+ pxor %xmm10,%xmm6
+ pxor %xmm9,%xmm5
+ pxor %xmm8,%xmm4
+ movdqa %xmm8,0+80(%rbp)
+ movdqa %xmm7,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm7
+ pxor %xmm8,%xmm7
+ movdqa %xmm6,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm6
+ pxor %xmm8,%xmm6
+ movdqa %xmm5,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm5
+ pxor %xmm8,%xmm5
+ movdqa %xmm4,%xmm8
+ psrld $25,%xmm8
+ pslld $32-25,%xmm4
+ pxor %xmm8,%xmm4
+ movdqa 0+80(%rbp),%xmm8
+.byte 102,15,58,15,255,12
+.byte 102,69,15,58,15,219,8
+.byte 102,69,15,58,15,255,4
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+
+ leaq 16(%rdi),%rdi
+ decq %r8
+ jge .Lseal_sse_main_rounds
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_sse_main_rounds
+ paddd .Lchacha20_consts(%rip),%xmm3
+ paddd 0+48(%rbp),%xmm7
+ paddd 0+64(%rbp),%xmm11
+ paddd 0+144(%rbp),%xmm15
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+
+ movdqa %xmm14,0+80(%rbp)
+ movdqa %xmm14,0+80(%rbp)
+ movdqu 0 + 0(%rsi),%xmm14
+ pxor %xmm3,%xmm14
+ movdqu %xmm14,0 + 0(%rdi)
+ movdqu 16 + 0(%rsi),%xmm14
+ pxor %xmm7,%xmm14
+ movdqu %xmm14,16 + 0(%rdi)
+ movdqu 32 + 0(%rsi),%xmm14
+ pxor %xmm11,%xmm14
+ movdqu %xmm14,32 + 0(%rdi)
+ movdqu 48 + 0(%rsi),%xmm14
+ pxor %xmm15,%xmm14
+ movdqu %xmm14,48 + 0(%rdi)
+
+ movdqa 0+80(%rbp),%xmm14
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 64(%rdi)
+ movdqu %xmm6,16 + 64(%rdi)
+ movdqu %xmm10,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+ movdqu 0 + 128(%rsi),%xmm3
+ movdqu 16 + 128(%rsi),%xmm7
+ movdqu 32 + 128(%rsi),%xmm11
+ movdqu 48 + 128(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 128(%rdi)
+ movdqu %xmm5,16 + 128(%rdi)
+ movdqu %xmm9,32 + 128(%rdi)
+ movdqu %xmm15,48 + 128(%rdi)
+
+ cmpq $256,%rbx
+ ja .Lseal_sse_main_loop_xor
+
+ movq $192,%rcx
+ subq $192,%rbx
+ leaq 192(%rsi),%rsi
+ jmp .Lseal_sse_128_tail_hash
+.Lseal_sse_main_loop_xor:
+ movdqu 0 + 192(%rsi),%xmm3
+ movdqu 16 + 192(%rsi),%xmm7
+ movdqu 32 + 192(%rsi),%xmm11
+ movdqu 48 + 192(%rsi),%xmm15
+ pxor %xmm3,%xmm0
+ pxor %xmm7,%xmm4
+ pxor %xmm11,%xmm8
+ pxor %xmm12,%xmm15
+ movdqu %xmm0,0 + 192(%rdi)
+ movdqu %xmm4,16 + 192(%rdi)
+ movdqu %xmm8,32 + 192(%rdi)
+ movdqu %xmm15,48 + 192(%rdi)
+
+ leaq 256(%rsi),%rsi
+ subq $256,%rbx
+ movq $6,%rcx
+ movq $4,%r8
+ cmpq $192,%rbx
+ jg .Lseal_sse_main_loop
+ movq %rbx,%rcx
+ testq %rbx,%rbx
+ je .Lseal_sse_128_tail_hash
+ movq $6,%rcx
+ cmpq $128,%rbx
+ ja .Lseal_sse_tail_192
+ cmpq $64,%rbx
+ ja .Lseal_sse_tail_128
+
+.Lseal_sse_tail_64:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa 0+96(%rbp),%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+
+.Lseal_sse_tail_64_rounds_and_x2hash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_sse_tail_64_rounds_and_x1hash:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_sse_tail_64_rounds_and_x2hash
+ decq %r8
+ jge .Lseal_sse_tail_64_rounds_and_x1hash
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+
+ jmp .Lseal_sse_128_tail_xor
+
+.Lseal_sse_tail_128:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa 0+96(%rbp),%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+
+.Lseal_sse_tail_128_rounds_and_x2hash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_sse_tail_128_rounds_and_x1hash:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_sse_tail_128_rounds_and_x2hash
+ decq %r8
+ jge .Lseal_sse_tail_128_rounds_and_x1hash
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 0(%rdi)
+ movdqu %xmm5,16 + 0(%rdi)
+ movdqu %xmm9,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+
+ movq $64,%rcx
+ subq $64,%rbx
+ leaq 64(%rsi),%rsi
+ jmp .Lseal_sse_128_tail_hash
+
+.Lseal_sse_tail_192:
+ movdqa .Lchacha20_consts(%rip),%xmm0
+ movdqa 0+48(%rbp),%xmm4
+ movdqa 0+64(%rbp),%xmm8
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm5
+ movdqa %xmm8,%xmm9
+ movdqa %xmm0,%xmm2
+ movdqa %xmm4,%xmm6
+ movdqa %xmm8,%xmm10
+ movdqa 0+96(%rbp),%xmm14
+ paddd .Lsse_inc(%rip),%xmm14
+ movdqa %xmm14,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm13,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,0+96(%rbp)
+ movdqa %xmm13,0+112(%rbp)
+ movdqa %xmm14,0+128(%rbp)
+
+.Lseal_sse_tail_192_rounds_and_x2hash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_sse_tail_192_rounds_and_x1hash:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ leaq 16(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_sse_tail_192_rounds_and_x2hash
+ decq %r8
+ jge .Lseal_sse_tail_192_rounds_and_x1hash
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd 0+48(%rbp),%xmm6
+ paddd 0+64(%rbp),%xmm10
+ paddd 0+128(%rbp),%xmm14
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd 0+48(%rbp),%xmm5
+ paddd 0+64(%rbp),%xmm9
+ paddd 0+112(%rbp),%xmm13
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd 0+48(%rbp),%xmm4
+ paddd 0+64(%rbp),%xmm8
+ paddd 0+96(%rbp),%xmm12
+ movdqu 0 + 0(%rsi),%xmm3
+ movdqu 16 + 0(%rsi),%xmm7
+ movdqu 32 + 0(%rsi),%xmm11
+ movdqu 48 + 0(%rsi),%xmm15
+ pxor %xmm3,%xmm2
+ pxor %xmm7,%xmm6
+ pxor %xmm11,%xmm10
+ pxor %xmm14,%xmm15
+ movdqu %xmm2,0 + 0(%rdi)
+ movdqu %xmm6,16 + 0(%rdi)
+ movdqu %xmm10,32 + 0(%rdi)
+ movdqu %xmm15,48 + 0(%rdi)
+ movdqu 0 + 64(%rsi),%xmm3
+ movdqu 16 + 64(%rsi),%xmm7
+ movdqu 32 + 64(%rsi),%xmm11
+ movdqu 48 + 64(%rsi),%xmm15
+ pxor %xmm3,%xmm1
+ pxor %xmm7,%xmm5
+ pxor %xmm11,%xmm9
+ pxor %xmm13,%xmm15
+ movdqu %xmm1,0 + 64(%rdi)
+ movdqu %xmm5,16 + 64(%rdi)
+ movdqu %xmm9,32 + 64(%rdi)
+ movdqu %xmm15,48 + 64(%rdi)
+
+ movq $128,%rcx
+ subq $128,%rbx
+ leaq 128(%rsi),%rsi
+
+.Lseal_sse_128_tail_hash:
+ cmpq $16,%rcx
+ jb .Lseal_sse_128_tail_xor
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ subq $16,%rcx
+ leaq 16(%rdi),%rdi
+ jmp .Lseal_sse_128_tail_hash
+
+.Lseal_sse_128_tail_xor:
+ cmpq $16,%rbx
+ jb .Lseal_sse_tail_16
+ subq $16,%rbx
+
+ movdqu 0(%rsi),%xmm3
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,0(%rdi)
+
+ addq 0(%rdi),%r10
+ adcq 8(%rdi),%r11
+ adcq $1,%r12
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm8,%xmm4
+ movdqa %xmm12,%xmm8
+ movdqa %xmm1,%xmm12
+ movdqa %xmm5,%xmm1
+ movdqa %xmm9,%xmm5
+ movdqa %xmm13,%xmm9
+ jmp .Lseal_sse_128_tail_xor
+
+.Lseal_sse_tail_16:
+ testq %rbx,%rbx
+ jz .Lprocess_blocks_of_extra_in
+
+ movq %rbx,%r8
+ movq %rbx,%rcx
+ leaq -1(%rsi,%rbx,1),%rsi
+ pxor %xmm15,%xmm15
+.Lseal_sse_tail_16_compose:
+ pslldq $1,%xmm15
+ pinsrb $0,(%rsi),%xmm15
+ leaq -1(%rsi),%rsi
+ decq %rcx
+ jne .Lseal_sse_tail_16_compose
+
+
+ pxor %xmm0,%xmm15
+
+
+ movq %rbx,%rcx
+ movdqu %xmm15,%xmm0
+.Lseal_sse_tail_16_extract:
+ pextrb $0,%xmm0,(%rdi)
+ psrldq $1,%xmm0
+ addq $1,%rdi
+ subq $1,%rcx
+ jnz .Lseal_sse_tail_16_extract
+
+
+
+
+
+
+
+
+ movq 288 + 0 + 32(%rsp),%r9
+ movq 56(%r9),%r14
+ movq 48(%r9),%r13
+ testq %r14,%r14
+ jz .Lprocess_partial_block
+
+ movq $16,%r15
+ subq %rbx,%r15
+ cmpq %r15,%r14
+
+ jge .Lload_extra_in
+ movq %r14,%r15
+
+.Lload_extra_in:
+
+
+ leaq -1(%r13,%r15,1),%rsi
+
+
+ addq %r15,%r13
+ subq %r15,%r14
+ movq %r13,48(%r9)
+ movq %r14,56(%r9)
+
+
+
+ addq %r15,%r8
+
+
+ pxor %xmm11,%xmm11
+.Lload_extra_load_loop:
+ pslldq $1,%xmm11
+ pinsrb $0,(%rsi),%xmm11
+ leaq -1(%rsi),%rsi
+ subq $1,%r15
+ jnz .Lload_extra_load_loop
+
+
+
+
+ movq %rbx,%r15
+
+.Lload_extra_shift_loop:
+ pslldq $1,%xmm11
+ subq $1,%r15
+ jnz .Lload_extra_shift_loop
+
+
+
+
+ leaq .Land_masks(%rip),%r15
+ shlq $4,%rbx
+ pand -16(%r15,%rbx,1),%xmm15
+
+
+ por %xmm11,%xmm15
+
+
+
+.byte 102,77,15,126,253
+ pextrq $1,%xmm15,%r14
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+.Lprocess_blocks_of_extra_in:
+
+ movq 288+32+0 (%rsp),%r9
+ movq 48(%r9),%rsi
+ movq 56(%r9),%r8
+ movq %r8,%rcx
+ shrq $4,%r8
+
+.Lprocess_extra_hash_loop:
+ jz process_extra_in_trailer
+ addq 0+0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rsi),%rsi
+ subq $1,%r8
+ jmp .Lprocess_extra_hash_loop
+process_extra_in_trailer:
+ andq $15,%rcx
+ movq %rcx,%rbx
+ jz .Ldo_length_block
+ leaq -1(%rsi,%rcx,1),%rsi
+
+.Lprocess_extra_in_trailer_load:
+ pslldq $1,%xmm15
+ pinsrb $0,(%rsi),%xmm15
+ leaq -1(%rsi),%rsi
+ subq $1,%rcx
+ jnz .Lprocess_extra_in_trailer_load
+
+.Lprocess_partial_block:
+
+ leaq .Land_masks(%rip),%r15
+ shlq $4,%rbx
+ pand -16(%r15,%rbx,1),%xmm15
+.byte 102,77,15,126,253
+ pextrq $1,%xmm15,%r14
+ addq %r13,%r10
+ adcq %r14,%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+.Ldo_length_block:
+ addq 0+0+32(%rbp),%r10
+ adcq 8+0+32(%rbp),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ movq %r10,%r13
+ movq %r11,%r14
+ movq %r12,%r15
+ subq $-5,%r10
+ sbbq $-1,%r11
+ sbbq $3,%r12
+ cmovcq %r13,%r10
+ cmovcq %r14,%r11
+ cmovcq %r15,%r12
+
+ addq 0+0+16(%rbp),%r10
+ adcq 8+0+16(%rbp),%r11
+
+.cfi_remember_state
+ addq $288 + 0 + 32,%rsp
+.cfi_adjust_cfa_offset -(288 + 32)
+
+ popq %r9
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r9
+ movq %r10,(%r9)
+ movq %r11,8(%r9)
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbx
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore %rbp
+ .byte 0xf3,0xc3
+
+.Lseal_sse_128:
+.cfi_restore_state
+ movdqu .Lchacha20_consts(%rip),%xmm0
+ movdqa %xmm0,%xmm1
+ movdqa %xmm0,%xmm2
+ movdqu 0(%r9),%xmm4
+ movdqa %xmm4,%xmm5
+ movdqa %xmm4,%xmm6
+ movdqu 16(%r9),%xmm8
+ movdqa %xmm8,%xmm9
+ movdqa %xmm8,%xmm10
+ movdqu 32(%r9),%xmm14
+ movdqa %xmm14,%xmm12
+ paddd .Lsse_inc(%rip),%xmm12
+ movdqa %xmm12,%xmm13
+ paddd .Lsse_inc(%rip),%xmm13
+ movdqa %xmm4,%xmm7
+ movdqa %xmm8,%xmm11
+ movdqa %xmm12,%xmm15
+ movq $10,%r10
+
+.Lseal_sse_128_rounds:
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,4
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,12
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,4
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,12
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,4
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,12
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol16(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm4
+ pxor %xmm3,%xmm4
+ paddd %xmm4,%xmm0
+ pxor %xmm0,%xmm12
+ pshufb .Lrol8(%rip),%xmm12
+ paddd %xmm12,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,15,228,12
+.byte 102,69,15,58,15,192,8
+.byte 102,69,15,58,15,228,4
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol16(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm5
+ pxor %xmm3,%xmm5
+ paddd %xmm5,%xmm1
+ pxor %xmm1,%xmm13
+ pshufb .Lrol8(%rip),%xmm13
+ paddd %xmm13,%xmm9
+ pxor %xmm9,%xmm5
+ movdqa %xmm5,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm5
+ pxor %xmm3,%xmm5
+.byte 102,15,58,15,237,12
+.byte 102,69,15,58,15,201,8
+.byte 102,69,15,58,15,237,4
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol16(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $12,%xmm3
+ psrld $20,%xmm6
+ pxor %xmm3,%xmm6
+ paddd %xmm6,%xmm2
+ pxor %xmm2,%xmm14
+ pshufb .Lrol8(%rip),%xmm14
+ paddd %xmm14,%xmm10
+ pxor %xmm10,%xmm6
+ movdqa %xmm6,%xmm3
+ pslld $7,%xmm3
+ psrld $25,%xmm6
+ pxor %xmm3,%xmm6
+.byte 102,15,58,15,246,12
+.byte 102,69,15,58,15,210,8
+.byte 102,69,15,58,15,246,4
+
+ decq %r10
+ jnz .Lseal_sse_128_rounds
+ paddd .Lchacha20_consts(%rip),%xmm0
+ paddd .Lchacha20_consts(%rip),%xmm1
+ paddd .Lchacha20_consts(%rip),%xmm2
+ paddd %xmm7,%xmm4
+ paddd %xmm7,%xmm5
+ paddd %xmm7,%xmm6
+ paddd %xmm11,%xmm8
+ paddd %xmm11,%xmm9
+ paddd %xmm15,%xmm12
+ paddd .Lsse_inc(%rip),%xmm15
+ paddd %xmm15,%xmm13
+
+ pand .Lclamp(%rip),%xmm2
+ movdqa %xmm2,0+0(%rbp)
+ movdqa %xmm6,0+16(%rbp)
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ jmp .Lseal_sse_128_tail_xor
+.size chacha20_poly1305_seal, .-chacha20_poly1305_seal
+.cfi_endproc
+
+
+.type chacha20_poly1305_open_avx2,@function
+.align 64
+chacha20_poly1305_open_avx2:
+.cfi_startproc
+
+
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r9,-64
+.cfi_adjust_cfa_offset 288 + 32
+
+ vzeroupper
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vbroadcasti128 0(%r9),%ymm4
+ vbroadcasti128 16(%r9),%ymm8
+ vbroadcasti128 32(%r9),%ymm12
+ vpaddd .Lavx2_init(%rip),%ymm12,%ymm12
+ cmpq $192,%rbx
+ jbe .Lopen_avx2_192
+ cmpq $320,%rbx
+ jbe .Lopen_avx2_320
+
+ vmovdqa %ymm4,0+64(%rbp)
+ vmovdqa %ymm8,0+96(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+ movq $10,%r10
+.Lopen_avx2_init_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+
+ decq %r10
+ jne .Lopen_avx2_init_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .Lclamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+
+ movq %r8,%r8
+ call poly_hash_ad_internal
+
+ xorq %rcx,%rcx
+.Lopen_avx2_init_hash:
+ addq 0+0(%rsi,%rcx,1),%r10
+ adcq 8+0(%rsi,%rcx,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ addq $16,%rcx
+ cmpq $64,%rcx
+ jne .Lopen_avx2_init_hash
+
+ vpxor 0(%rsi),%ymm0,%ymm0
+ vpxor 32(%rsi),%ymm4,%ymm4
+
+ vmovdqu %ymm0,0(%rdi)
+ vmovdqu %ymm4,32(%rdi)
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ subq $64,%rbx
+.Lopen_avx2_main_loop:
+
+ cmpq $512,%rbx
+ jb .Lopen_avx2_main_loop_done
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+
+ xorq %rcx,%rcx
+.Lopen_avx2_main_loop_rounds:
+ addq 0+0(%rsi,%rcx,1),%r10
+ adcq 8+0(%rsi,%rcx,1),%r11
+ adcq $1,%r12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ addq 0+16(%rsi,%rcx,1),%r10
+ adcq 8+16(%rsi,%rcx,1),%r11
+ adcq $1,%r12
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ addq 0+32(%rsi,%rcx,1),%r10
+ adcq 8+32(%rsi,%rcx,1),%r11
+ adcq $1,%r12
+
+ leaq 48(%rcx),%rcx
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ cmpq $60*8,%rcx
+ jne .Lopen_avx2_main_loop_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,0+128(%rbp)
+ addq 0+60*8(%rsi),%r10
+ adcq 8+60*8(%rsi),%r11
+ adcq $1,%r12
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 0+128(%rbp),%ymm0
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ addq 0+60*8+16(%rsi),%r10
+ adcq 8+60*8+16(%rsi),%r11
+ adcq $1,%r12
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
+ vpxor 0+384(%rsi),%ymm3,%ymm3
+ vpxor 32+384(%rsi),%ymm0,%ymm0
+ vpxor 64+384(%rsi),%ymm4,%ymm4
+ vpxor 96+384(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm3,0+384(%rdi)
+ vmovdqu %ymm0,32+384(%rdi)
+ vmovdqu %ymm4,64+384(%rdi)
+ vmovdqu %ymm8,96+384(%rdi)
+
+ leaq 512(%rsi),%rsi
+ leaq 512(%rdi),%rdi
+ subq $512,%rbx
+ jmp .Lopen_avx2_main_loop
+.Lopen_avx2_main_loop_done:
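+ /* Fewer than 512 bytes left: pick a tail path sized to the remainder in %rbx. */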
+ testq %rbx,%rbx
+ vzeroupper
+ je .Lopen_sse_finalize
+
+ cmpq $384,%rbx
+ ja .Lopen_avx2_tail_512
+ cmpq $256,%rbx
+ ja .Lopen_avx2_tail_384
+ cmpq $128,%rbx
+ ja .Lopen_avx2_tail_256
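+ /* At most 128 bytes remain: one fresh ChaCha20 state is enough. %rcx is */
+ /* the remaining length rounded down to 16, i.e. how much ciphertext still */
+ /* needs to be hashed while the rounds run. */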
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+
+ xorq %r8,%r8
+ movq %rbx,%rcx
+ andq $-16,%rcx
+ testq %rcx,%rcx
+ je .Lopen_avx2_tail_128_rounds
+.Lopen_avx2_tail_128_rounds_and_x1hash:
+ addq 0+0(%rsi,%r8,1),%r10
+ adcq 8+0(%rsi,%r8,1),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+.Lopen_avx2_tail_128_rounds:
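+ /* One double round (column + diagonal) per pass; %r8 advances by 16 each */
+ /* time and the loop ends at 160, i.e. after ten double rounds. */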
+ addq $16,%r8
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+
+ cmpq %rcx,%r8
+ jb .Lopen_avx2_tail_128_rounds_and_x1hash
+ cmpq $160,%r8
+ jne .Lopen_avx2_tail_128_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ jmp .Lopen_avx2_tail_128_xor
+
+.Lopen_avx2_tail_256:
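+ /* Up to 256 bytes remain: two ChaCha20 states (ymm0/4/8/12 and ymm1/5/9/13). */
+ /* The round loop hashes one 16-byte ciphertext block per pass, capped by */
+ /* %rcx; anything left over is hashed at .Lopen_avx2_tail_256_hash. */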
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+
+ movq %rbx,0+128(%rbp)
+ movq %rbx,%rcx
+ subq $128,%rcx
+ shrq $4,%rcx
+ movq $10,%r8
+ cmpq $10,%rcx
+ cmovgq %r8,%rcx
+ movq %rsi,%rbx
+ xorq %r8,%r8
+.Lopen_avx2_tail_256_rounds_and_x1hash:
+ addq 0+0(%rbx),%r10
+ adcq 8+0(%rbx),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rbx),%rbx
+.Lopen_avx2_tail_256_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+
+ incq %r8
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ cmpq %rcx,%r8
+ jb .Lopen_avx2_tail_256_rounds_and_x1hash
+ cmpq $10,%r8
+ jne .Lopen_avx2_tail_256_rounds
+ movq %rbx,%r8
+ subq %rsi,%rbx
+ movq %rbx,%rcx
+ movq 0+128(%rbp),%rbx
+.Lopen_avx2_tail_256_hash:
+ addq $16,%rcx
+ cmpq %rbx,%rcx
+ jg .Lopen_avx2_tail_256_done
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ jmp .Lopen_avx2_tail_256_hash
+.Lopen_avx2_tail_256_done:
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm1,%ymm1
+ vpxor 64+0(%rsi),%ymm5,%ymm5
+ vpxor 96+0(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm1,32+0(%rdi)
+ vmovdqu %ymm5,64+0(%rdi)
+ vmovdqu %ymm9,96+0(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ leaq 128(%rsi),%rsi
+ leaq 128(%rdi),%rdi
+ subq $128,%rbx
+ jmp .Lopen_avx2_tail_128_xor
+
+.Lopen_avx2_tail_384:
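+ /* Up to 384 bytes remain: three ChaCha20 states, with two 16-byte */
+ /* ciphertext blocks hashed per pass of the round loop; the remainder is */
+ /* hashed at .Lopen_avx2_384_tail_hash. */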
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+
+ movq %rbx,0+128(%rbp)
+ movq %rbx,%rcx
+ subq $256,%rcx
+ shrq $4,%rcx
+ addq $6,%rcx
+ movq $10,%r8
+ cmpq $10,%rcx
+ cmovgq %r8,%rcx
+ movq %rsi,%rbx
+ xorq %r8,%r8
+.Lopen_avx2_tail_384_rounds_and_x2hash:
+ addq 0+0(%rbx),%r10
+ adcq 8+0(%rbx),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rbx),%rbx
+.Lopen_avx2_tail_384_rounds_and_x1hash:
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ addq 0+0(%rbx),%r10
+ adcq 8+0(%rbx),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rbx),%rbx
+ incq %r8
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+
+ cmpq %rcx,%r8
+ jb .Lopen_avx2_tail_384_rounds_and_x2hash
+ cmpq $10,%r8
+ jne .Lopen_avx2_tail_384_rounds_and_x1hash
+ movq %rbx,%r8
+ subq %rsi,%rbx
+ movq %rbx,%rcx
+ movq 0+128(%rbp),%rbx
+.Lopen_avx2_384_tail_hash:
+ addq $16,%rcx
+ cmpq %rbx,%rcx
+ jg .Lopen_avx2_384_tail_done
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ jmp .Lopen_avx2_384_tail_hash
+.Lopen_avx2_384_tail_done:
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm2,%ymm2
+ vpxor 64+0(%rsi),%ymm6,%ymm6
+ vpxor 96+0(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm2,32+0(%rdi)
+ vmovdqu %ymm6,64+0(%rdi)
+ vmovdqu %ymm10,96+0(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm1,%ymm1
+ vpxor 64+128(%rsi),%ymm5,%ymm5
+ vpxor 96+128(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm1,32+128(%rdi)
+ vmovdqu %ymm5,64+128(%rdi)
+ vmovdqu %ymm9,96+128(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ leaq 256(%rsi),%rsi
+ leaq 256(%rdi),%rdi
+ subq $256,%rbx
+ jmp .Lopen_avx2_tail_128_xor
+
+.Lopen_avx2_tail_512:
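+ /* Up to 512 bytes remain: a full four-state tail. The first 384 bytes of */
+ /* the remaining ciphertext are hashed inside the round loop; whatever is */
+ /* left is hashed afterwards at .Lopen_avx2_tail_512_hash. */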
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+
+ xorq %rcx,%rcx
+ movq %rsi,%r8
+.Lopen_avx2_tail_512_rounds_and_x2hash:
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+.Lopen_avx2_tail_512_rounds_and_x1hash:
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ addq 0+16(%r8),%r10
+ adcq 8+16(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%r8),%r8
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ incq %rcx
+ cmpq $4,%rcx
+ jl .Lopen_avx2_tail_512_rounds_and_x2hash
+ cmpq $10,%rcx
+ jne .Lopen_avx2_tail_512_rounds_and_x1hash
+ movq %rbx,%rcx
+ subq $384,%rcx
+ andq $-16,%rcx
+.Lopen_avx2_tail_512_hash:
+ testq %rcx,%rcx
+ je .Lopen_avx2_tail_512_done
+ addq 0+0(%r8),%r10
+ adcq 8+0(%r8),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%r8),%r8
+ subq $16,%rcx
+ jmp .Lopen_avx2_tail_512_hash
+.Lopen_avx2_tail_512_done:
+ vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,0+128(%rbp)
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 0+128(%rbp),%ymm0
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ leaq 384(%rsi),%rsi
+ leaq 384(%rdi),%rdi
+ subq $384,%rbx
+.Lopen_avx2_tail_128_xor:
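+ /* XOR the remaining keystream 32 bytes at a time, rotating ymm4/ymm8/ymm12 */
+ /* into ymm0; a final 16-byte lane is handled below and anything shorter */
+ /* falls through to the SSE 16-byte tail. */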
+ cmpq $32,%rbx
+ jb .Lopen_avx2_tail_32_xor
+ subq $32,%rbx
+ vpxor (%rsi),%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ leaq 32(%rsi),%rsi
+ leaq 32(%rdi),%rdi
+ vmovdqa %ymm4,%ymm0
+ vmovdqa %ymm8,%ymm4
+ vmovdqa %ymm12,%ymm8
+ jmp .Lopen_avx2_tail_128_xor
+.Lopen_avx2_tail_32_xor:
+ cmpq $16,%rbx
+ vmovdqa %xmm0,%xmm1
+ jb .Lopen_avx2_exit
+ subq $16,%rbx
+
+ vpxor (%rsi),%xmm0,%xmm1
+ vmovdqu %xmm1,(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ vperm2i128 $0x11,%ymm0,%ymm0,%ymm0
+ vmovdqa %xmm0,%xmm1
+.Lopen_avx2_exit:
+ vzeroupper
+ jmp .Lopen_sse_tail_16
+
+.Lopen_avx2_192:
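+ /* Short-input path: two interleaved ChaCha20 streams. After the rounds, */
+ /* the first 32 bytes of keystream are clamped and stored at 0(%rbp) as the */
+ /* Poly1305 key; the rest stays in registers for .Lopen_avx2_short. */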
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13
+ vmovdqa %ymm12,%ymm11
+ vmovdqa %ymm13,%ymm15
+ movq $10,%r10
+.Lopen_avx2_192_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+
+ decq %r10
+ jne .Lopen_avx2_192_rounds
+ vpaddd %ymm2,%ymm0,%ymm0
+ vpaddd %ymm2,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpaddd %ymm6,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm8,%ymm8
+ vpaddd %ymm10,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm13,%ymm13
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .Lclamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+.Lopen_avx2_short:
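+ /* Shared short open loop: hash the AD, then per iteration hash two 16-byte */
+ /* ciphertext blocks and decrypt 32 bytes from the pre-generated keystream, */
+ /* rotating the keystream registers down as they are consumed. */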
+ movq %r8,%r8
+ call poly_hash_ad_internal
+.Lopen_avx2_short_hash_and_xor_loop:
+ cmpq $32,%rbx
+ jb .Lopen_avx2_short_tail_32
+ subq $32,%rbx
+ addq 0+0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 0+16(%rsi),%r10
+ adcq 8+16(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+
+ vpxor (%rsi),%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ leaq 32(%rsi),%rsi
+ leaq 32(%rdi),%rdi
+
+ vmovdqa %ymm4,%ymm0
+ vmovdqa %ymm8,%ymm4
+ vmovdqa %ymm12,%ymm8
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm5,%ymm1
+ vmovdqa %ymm9,%ymm5
+ vmovdqa %ymm13,%ymm9
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm6,%ymm2
+ jmp .Lopen_avx2_short_hash_and_xor_loop
+.Lopen_avx2_short_tail_32:
+ cmpq $16,%rbx
+ vmovdqa %xmm0,%xmm1
+ jb .Lopen_avx2_short_tail_32_exit
+ subq $16,%rbx
+ addq 0+0(%rsi),%r10
+ adcq 8+0(%rsi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ vpxor (%rsi),%xmm0,%xmm3
+ vmovdqu %xmm3,(%rdi)
+ leaq 16(%rsi),%rsi
+ leaq 16(%rdi),%rdi
+ vextracti128 $1,%ymm0,%xmm1
+.Lopen_avx2_short_tail_32_exit:
+ vzeroupper
+ jmp .Lopen_sse_tail_16
+
+.Lopen_avx2_320:
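+ /* Same idea as .Lopen_avx2_192 but with three interleaved streams, for */
+ /* slightly longer short inputs; it joins .Lopen_avx2_short at the end. */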
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13
+ vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ movq $10,%r10
+.Lopen_avx2_320_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ decq %r10
+ jne .Lopen_avx2_320_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm8,%ymm8
+ vpaddd %ymm11,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm10,%ymm10
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .Lclamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
+ jmp .Lopen_avx2_short
+.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
+.cfi_endproc
+
+
+.type chacha20_poly1305_seal_avx2,@function
+.align 64
+chacha20_poly1305_seal_avx2:
+.cfi_startproc
+
+
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r9,-64
+.cfi_adjust_cfa_offset 288 + 32
+
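+ /* Seal: %r9 points at the key/nonce block. Broadcast it into ymm4/8/12, */
+ /* add .Lavx2_init to the counter lane, then pick a path by plaintext */
+ /* length in %rbx: dedicated code for at most 192 or 320 bytes, otherwise */
+ /* generate 512 bytes of keystream up front. */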
+ vzeroupper
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vbroadcasti128 0(%r9),%ymm4
+ vbroadcasti128 16(%r9),%ymm8
+ vbroadcasti128 32(%r9),%ymm12
+ vpaddd .Lavx2_init(%rip),%ymm12,%ymm12
+ cmpq $192,%rbx
+ jbe .Lseal_avx2_192
+ cmpq $320,%rbx
+ jbe .Lseal_avx2_320
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm4,0+64(%rbp)
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm8,%ymm11
+ vmovdqa %ymm8,0+96(%rbp)
+ vmovdqa %ymm12,%ymm15
+ vpaddd .Lavx2_inc(%rip),%ymm15,%ymm14
+ vpaddd .Lavx2_inc(%rip),%ymm14,%ymm13
+ vpaddd .Lavx2_inc(%rip),%ymm13,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm15,0+256(%rbp)
+ movq $10,%r10
+.Lseal_avx2_init_rounds:
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ decq %r10
+ jnz .Lseal_avx2_init_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm15
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm3
+ vpand .Lclamp(%rip),%ymm15,%ymm15
+ vmovdqa %ymm15,0+0(%rbp)
+ movq %r8,%r8
+ call poly_hash_ad_internal
+
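+ /* AD absorbed. The clamped 32 bytes stored at 0(%rbp) are the Poly1305 */
+ /* key; the remaining keystream in registers encrypts the first 320 bytes */
+ /* of plaintext before choosing between the short-remainder path and the */
+ /* main seal loop. */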
+ vpxor 0(%rsi),%ymm3,%ymm3
+ vpxor 32(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm3,0(%rdi)
+ vmovdqu %ymm11,32(%rdi)
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm15
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+64(%rsi),%ymm15,%ymm15
+ vpxor 32+64(%rsi),%ymm2,%ymm2
+ vpxor 64+64(%rsi),%ymm6,%ymm6
+ vpxor 96+64(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm15,0+64(%rdi)
+ vmovdqu %ymm2,32+64(%rdi)
+ vmovdqu %ymm6,64+64(%rdi)
+ vmovdqu %ymm10,96+64(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm15
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+192(%rsi),%ymm15,%ymm15
+ vpxor 32+192(%rsi),%ymm1,%ymm1
+ vpxor 64+192(%rsi),%ymm5,%ymm5
+ vpxor 96+192(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm15,0+192(%rdi)
+ vmovdqu %ymm1,32+192(%rdi)
+ vmovdqu %ymm5,64+192(%rdi)
+ vmovdqu %ymm9,96+192(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm15
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm15,%ymm8
+
+ leaq 320(%rsi),%rsi
+ subq $320,%rbx
+ movq $320,%rcx
+ cmpq $128,%rbx
+ jbe .Lseal_avx2_short_hash_remainder
+ vpxor 0(%rsi),%ymm0,%ymm0
+ vpxor 32(%rsi),%ymm4,%ymm4
+ vpxor 64(%rsi),%ymm8,%ymm8
+ vpxor 96(%rsi),%ymm12,%ymm12
+ vmovdqu %ymm0,320(%rdi)
+ vmovdqu %ymm4,352(%rdi)
+ vmovdqu %ymm8,384(%rdi)
+ vmovdqu %ymm12,416(%rdi)
+ leaq 128(%rsi),%rsi
+ subq $128,%rbx
+ movq $8,%rcx
+ movq $2,%r8
+ cmpq $128,%rbx
+ jbe .Lseal_avx2_tail_128
+ cmpq $256,%rbx
+ jbe .Lseal_avx2_tail_256
+ cmpq $384,%rbx
+ jbe .Lseal_avx2_tail_384
+ cmpq $512,%rbx
+ jbe .Lseal_avx2_tail_512
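+ /* More than 512 bytes to seal: build four fresh states and run the first */
+ /* double round inline, entering the main loop partway through with %rcx */
+ /* preset to 9 and %rdi backed up 16 bytes; in steady state Poly1305 hashes */
+ /* ciphertext already written at (%rdi) while the next keystream is computed. */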
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+
+ subq $16,%rdi
+ movq $9,%rcx
+ jmp .Lseal_avx2_main_loop_rounds_entry
+.align 32
+.Lseal_avx2_main_loop:
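+ /* Steady-state seal loop: rebuild the four-block state, then ten */
+ /* interleaved double rounds, each hashing 48 bytes of previously written */
+ /* ciphertext. */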
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+
+ movq $10,%rcx
+.align 32
+.Lseal_avx2_main_loop_rounds:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+.Lseal_avx2_main_loop_rounds_entry:
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ addq 0+32(%rdi),%r10
+ adcq 8+32(%rdi),%r11
+ adcq $1,%r12
+
+ leaq 48(%rdi),%rdi
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+ decq %rcx
+ jne .Lseal_avx2_main_loop_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,0+128(%rbp)
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 0+128(%rbp),%ymm0
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm4
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm8
+ vpxor 0+384(%rsi),%ymm3,%ymm3
+ vpxor 32+384(%rsi),%ymm0,%ymm0
+ vpxor 64+384(%rsi),%ymm4,%ymm4
+ vpxor 96+384(%rsi),%ymm8,%ymm8
+ vmovdqu %ymm3,0+384(%rdi)
+ vmovdqu %ymm0,32+384(%rdi)
+ vmovdqu %ymm4,64+384(%rdi)
+ vmovdqu %ymm8,96+384(%rdi)
+
+ leaq 512(%rsi),%rsi
+ subq $512,%rbx
+ cmpq $512,%rbx
+ jg .Lseal_avx2_main_loop
+
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ movq $10,%rcx
+ xorq %r8,%r8
+
+ cmpq $384,%rbx
+ ja .Lseal_avx2_tail_512
+ cmpq $256,%rbx
+ ja .Lseal_avx2_tail_384
+ cmpq $128,%rbx
+ ja .Lseal_avx2_tail_256
+
+.Lseal_avx2_tail_128:
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+
+.Lseal_avx2_tail_128_rounds_and_3xhash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_avx2_tail_128_rounds_and_2xhash:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_avx2_tail_128_rounds_and_3xhash
+ decq %r8
+ jge .Lseal_avx2_tail_128_rounds_and_2xhash
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ jmp .Lseal_avx2_short_loop
+
+.Lseal_avx2_tail_256:
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+
+.Lseal_avx2_tail_256_rounds_and_3xhash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_avx2_tail_256_rounds_and_2xhash:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_avx2_tail_256_rounds_and_3xhash
+ decq %r8
+ jge .Lseal_avx2_tail_256_rounds_and_2xhash
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm1,%ymm1
+ vpxor 64+0(%rsi),%ymm5,%ymm5
+ vpxor 96+0(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm1,32+0(%rdi)
+ vmovdqu %ymm5,64+0(%rdi)
+ vmovdqu %ymm9,96+0(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ movq $128,%rcx
+ leaq 128(%rsi),%rsi
+ subq $128,%rbx
+ jmp .Lseal_avx2_short_hash_remainder
+
+.Lseal_avx2_tail_384:
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+
+.Lseal_avx2_tail_384_rounds_and_3xhash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_avx2_tail_384_rounds_and_2xhash:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_avx2_tail_384_rounds_and_3xhash
+ decq %r8
+ jge .Lseal_avx2_tail_384_rounds_and_2xhash
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+0(%rsi),%ymm3,%ymm3
+ vpxor 32+0(%rsi),%ymm2,%ymm2
+ vpxor 64+0(%rsi),%ymm6,%ymm6
+ vpxor 96+0(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+0(%rdi)
+ vmovdqu %ymm2,32+0(%rdi)
+ vmovdqu %ymm6,64+0(%rdi)
+ vmovdqu %ymm10,96+0(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm1,%ymm1
+ vpxor 64+128(%rsi),%ymm5,%ymm5
+ vpxor 96+128(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm1,32+128(%rdi)
+ vmovdqu %ymm5,64+128(%rdi)
+ vmovdqu %ymm9,96+128(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ movq $256,%rcx
+ leaq 256(%rsi),%rsi
+ subq $256,%rbx
+ jmp .Lseal_avx2_short_hash_remainder
+
+.Lseal_avx2_tail_512:
+ vmovdqa .Lchacha20_consts(%rip),%ymm0
+ vmovdqa 0+64(%rbp),%ymm4
+ vmovdqa 0+96(%rbp),%ymm8
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm10
+ vmovdqa %ymm0,%ymm3
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa .Lavx2_inc(%rip),%ymm12
+ vpaddd 0+160(%rbp),%ymm12,%ymm15
+ vpaddd %ymm15,%ymm12,%ymm14
+ vpaddd %ymm14,%ymm12,%ymm13
+ vpaddd %ymm13,%ymm12,%ymm12
+ vmovdqa %ymm15,0+256(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm12,0+160(%rbp)
+
+.Lseal_avx2_tail_512_rounds_and_3xhash:
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ addq %rax,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+.Lseal_avx2_tail_512_rounds_and_2xhash:
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $4,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $12,%ymm15,%ymm15,%ymm15
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ addq %rax,%r15
+ adcq %rdx,%r9
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vmovdqa %ymm8,0+128(%rbp)
+ vmovdqa .Lrol16(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $20,%ymm7,%ymm8
+ vpslld $32-20,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $20,%ymm6,%ymm8
+ vpslld $32-20,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $20,%ymm5,%ymm8
+ vpslld $32-20,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $20,%ymm4,%ymm8
+ vpslld $32-20,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa .Lrol8(%rip),%ymm8
+ vpaddd %ymm7,%ymm3,%ymm3
+ vpaddd %ymm6,%ymm2,%ymm2
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm3,%ymm15,%ymm15
+ vpxor %ymm2,%ymm14,%ymm14
+ vpxor %ymm1,%ymm13,%ymm13
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm8,%ymm15,%ymm15
+ vpshufb %ymm8,%ymm14,%ymm14
+ vpshufb %ymm8,%ymm13,%ymm13
+ vpshufb %ymm8,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpaddd 0+128(%rbp),%ymm12,%ymm8
+ vpxor %ymm11,%ymm7,%ymm7
+ vpxor %ymm10,%ymm6,%ymm6
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa %ymm8,0+128(%rbp)
+ vpsrld $25,%ymm7,%ymm8
+ movq 0+0+0(%rbp),%rdx
+ movq %rdx,%r15
+ mulxq %r10,%r13,%r14
+ mulxq %r11,%rax,%rdx
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ vpslld $32-25,%ymm7,%ymm7
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $25,%ymm6,%ymm8
+ vpslld $32-25,%ymm6,%ymm6
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $25,%ymm5,%ymm8
+ vpslld $32-25,%ymm5,%ymm5
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $25,%ymm4,%ymm8
+ vpslld $32-25,%ymm4,%ymm4
+ vpxor %ymm8,%ymm4,%ymm4
+ vmovdqa 0+128(%rbp),%ymm8
+ vpalignr $12,%ymm7,%ymm7,%ymm7
+ vpalignr $8,%ymm11,%ymm11,%ymm11
+ vpalignr $4,%ymm15,%ymm15,%ymm15
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ movq 8+0+0(%rbp),%rdx
+ mulxq %r10,%r10,%rax
+ addq %r10,%r14
+ mulxq %r11,%r11,%r9
+ adcq %r11,%r15
+ adcq $0,%r9
+ imulq %r12,%rdx
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ addq %rax,%r15
+ adcq %rdx,%r9
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+ decq %rcx
+ jg .Lseal_avx2_tail_512_rounds_and_3xhash
+ decq %r8
+ jge .Lseal_avx2_tail_512_rounds_and_2xhash
+ vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3
+ vpaddd 0+64(%rbp),%ymm7,%ymm7
+ vpaddd 0+96(%rbp),%ymm11,%ymm11
+ vpaddd 0+256(%rbp),%ymm15,%ymm15
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd 0+64(%rbp),%ymm6,%ymm6
+ vpaddd 0+96(%rbp),%ymm10,%ymm10
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd 0+64(%rbp),%ymm5,%ymm5
+ vpaddd 0+96(%rbp),%ymm9,%ymm9
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd 0+64(%rbp),%ymm4,%ymm4
+ vpaddd 0+96(%rbp),%ymm8,%ymm8
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+
+ vmovdqa %ymm0,0+128(%rbp)
+ vperm2i128 $0x02,%ymm3,%ymm7,%ymm0
+ vperm2i128 $0x13,%ymm3,%ymm7,%ymm7
+ vperm2i128 $0x02,%ymm11,%ymm15,%ymm3
+ vperm2i128 $0x13,%ymm11,%ymm15,%ymm11
+ vpxor 0+0(%rsi),%ymm0,%ymm0
+ vpxor 32+0(%rsi),%ymm3,%ymm3
+ vpxor 64+0(%rsi),%ymm7,%ymm7
+ vpxor 96+0(%rsi),%ymm11,%ymm11
+ vmovdqu %ymm0,0+0(%rdi)
+ vmovdqu %ymm3,32+0(%rdi)
+ vmovdqu %ymm7,64+0(%rdi)
+ vmovdqu %ymm11,96+0(%rdi)
+
+ vmovdqa 0+128(%rbp),%ymm0
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm3
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm6
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm10
+ vpxor 0+128(%rsi),%ymm3,%ymm3
+ vpxor 32+128(%rsi),%ymm2,%ymm2
+ vpxor 64+128(%rsi),%ymm6,%ymm6
+ vpxor 96+128(%rsi),%ymm10,%ymm10
+ vmovdqu %ymm3,0+128(%rdi)
+ vmovdqu %ymm2,32+128(%rdi)
+ vmovdqu %ymm6,64+128(%rdi)
+ vmovdqu %ymm10,96+128(%rdi)
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm3
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm5
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm9
+ vpxor 0+256(%rsi),%ymm3,%ymm3
+ vpxor 32+256(%rsi),%ymm1,%ymm1
+ vpxor 64+256(%rsi),%ymm5,%ymm5
+ vpxor 96+256(%rsi),%ymm9,%ymm9
+ vmovdqu %ymm3,0+256(%rdi)
+ vmovdqu %ymm1,32+256(%rdi)
+ vmovdqu %ymm5,64+256(%rdi)
+ vmovdqu %ymm9,96+256(%rdi)
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm3
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x02,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm12
+ vmovdqa %ymm3,%ymm8
+
+ movq $384,%rcx
+ leaq 384(%rsi),%rsi
+ subq $384,%rbx
+ jmp .Lseal_avx2_short_hash_remainder
+
+.Lseal_avx2_320:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13
+ vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14
+ vmovdqa %ymm4,%ymm7
+ vmovdqa %ymm8,%ymm11
+ vmovdqa %ymm12,0+160(%rbp)
+ vmovdqa %ymm13,0+192(%rbp)
+ vmovdqa %ymm14,0+224(%rbp)
+ movq $10,%r10
+.Lseal_avx2_320_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $12,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $4,%ymm6,%ymm6,%ymm6
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol16(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpsrld $20,%ymm6,%ymm3
+ vpslld $12,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpaddd %ymm6,%ymm2,%ymm2
+ vpxor %ymm2,%ymm14,%ymm14
+ vpshufb .Lrol8(%rip),%ymm14,%ymm14
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm3
+ vpsrld $25,%ymm6,%ymm6
+ vpxor %ymm3,%ymm6,%ymm6
+ vpalignr $4,%ymm14,%ymm14,%ymm14
+ vpalignr $8,%ymm10,%ymm10,%ymm10
+ vpalignr $12,%ymm6,%ymm6,%ymm6
+
+ decq %r10
+ jne .Lseal_avx2_320_rounds
+ vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0
+ vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1
+ vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2
+ vpaddd %ymm7,%ymm4,%ymm4
+ vpaddd %ymm7,%ymm5,%ymm5
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm8,%ymm8
+ vpaddd %ymm11,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm10,%ymm10
+ vpaddd 0+160(%rbp),%ymm12,%ymm12
+ vpaddd 0+192(%rbp),%ymm13,%ymm13
+ vpaddd 0+224(%rbp),%ymm14,%ymm14
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .Lclamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+ vperm2i128 $0x02,%ymm2,%ymm6,%ymm9
+ vperm2i128 $0x02,%ymm10,%ymm14,%ymm13
+ vperm2i128 $0x13,%ymm2,%ymm6,%ymm2
+ vperm2i128 $0x13,%ymm10,%ymm14,%ymm6
+ jmp .Lseal_avx2_short
+
+.Lseal_avx2_192:
+ vmovdqa %ymm0,%ymm1
+ vmovdqa %ymm0,%ymm2
+ vmovdqa %ymm4,%ymm5
+ vmovdqa %ymm4,%ymm6
+ vmovdqa %ymm8,%ymm9
+ vmovdqa %ymm8,%ymm10
+ vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13
+ vmovdqa %ymm12,%ymm11
+ vmovdqa %ymm13,%ymm15
+ movq $10,%r10
+.Lseal_avx2_192_rounds:
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $12,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $4,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $12,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $4,%ymm5,%ymm5,%ymm5
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol16(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $20,%ymm4,%ymm3
+ vpslld $12,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpaddd %ymm4,%ymm0,%ymm0
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb .Lrol8(%rip),%ymm12,%ymm12
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm3
+ vpsrld $25,%ymm4,%ymm4
+ vpxor %ymm3,%ymm4,%ymm4
+ vpalignr $4,%ymm12,%ymm12,%ymm12
+ vpalignr $8,%ymm8,%ymm8,%ymm8
+ vpalignr $12,%ymm4,%ymm4,%ymm4
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol16(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpsrld $20,%ymm5,%ymm3
+ vpslld $12,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpaddd %ymm5,%ymm1,%ymm1
+ vpxor %ymm1,%ymm13,%ymm13
+ vpshufb .Lrol8(%rip),%ymm13,%ymm13
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm3
+ vpsrld $25,%ymm5,%ymm5
+ vpxor %ymm3,%ymm5,%ymm5
+ vpalignr $4,%ymm13,%ymm13,%ymm13
+ vpalignr $8,%ymm9,%ymm9,%ymm9
+ vpalignr $12,%ymm5,%ymm5,%ymm5
+
+ decq %r10
+ jne .Lseal_avx2_192_rounds
+ vpaddd %ymm2,%ymm0,%ymm0
+ vpaddd %ymm2,%ymm1,%ymm1
+ vpaddd %ymm6,%ymm4,%ymm4
+ vpaddd %ymm6,%ymm5,%ymm5
+ vpaddd %ymm10,%ymm8,%ymm8
+ vpaddd %ymm10,%ymm9,%ymm9
+ vpaddd %ymm11,%ymm12,%ymm12
+ vpaddd %ymm15,%ymm13,%ymm13
+ vperm2i128 $0x02,%ymm0,%ymm4,%ymm3
+
+ vpand .Lclamp(%rip),%ymm3,%ymm3
+ vmovdqa %ymm3,0+0(%rbp)
+
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm0
+ vperm2i128 $0x13,%ymm8,%ymm12,%ymm4
+ vperm2i128 $0x02,%ymm1,%ymm5,%ymm8
+ vperm2i128 $0x02,%ymm9,%ymm13,%ymm12
+ vperm2i128 $0x13,%ymm1,%ymm5,%ymm1
+ vperm2i128 $0x13,%ymm9,%ymm13,%ymm5
+.Lseal_avx2_short:
+ movq %r8,%r8
+ call poly_hash_ad_internal
+ xorq %rcx,%rcx
+.Lseal_avx2_short_hash_remainder:
+ cmpq $16,%rcx
+ jb .Lseal_avx2_short_loop
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ subq $16,%rcx
+ addq $16,%rdi
+ jmp .Lseal_avx2_short_hash_remainder
+.Lseal_avx2_short_loop:
+ cmpq $32,%rbx
+ jb .Lseal_avx2_short_tail
+ subq $32,%rbx
+
+ vpxor (%rsi),%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ leaq 32(%rsi),%rsi
+
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+ addq 0+16(%rdi),%r10
+ adcq 8+16(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 32(%rdi),%rdi
+
+ vmovdqa %ymm4,%ymm0
+ vmovdqa %ymm8,%ymm4
+ vmovdqa %ymm12,%ymm8
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm5,%ymm1
+ vmovdqa %ymm9,%ymm5
+ vmovdqa %ymm13,%ymm9
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm6,%ymm2
+ jmp .Lseal_avx2_short_loop
+.Lseal_avx2_short_tail:
+ cmpq $16,%rbx
+ jb .Lseal_avx2_exit
+ subq $16,%rbx
+ vpxor (%rsi),%xmm0,%xmm3
+ vmovdqu %xmm3,(%rdi)
+ leaq 16(%rsi),%rsi
+ addq 0+0(%rdi),%r10
+ adcq 8+0(%rdi),%r11
+ adcq $1,%r12
+ movq 0+0+0(%rbp),%rax
+ movq %rax,%r15
+ mulq %r10
+ movq %rax,%r13
+ movq %rdx,%r14
+ movq 0+0+0(%rbp),%rax
+ mulq %r11
+ imulq %r12,%r15
+ addq %rax,%r14
+ adcq %rdx,%r15
+ movq 8+0+0(%rbp),%rax
+ movq %rax,%r9
+ mulq %r10
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 8+0+0(%rbp),%rax
+ mulq %r11
+ addq %rax,%r15
+ adcq $0,%rdx
+ imulq %r12,%r9
+ addq %r10,%r15
+ adcq %rdx,%r9
+ movq %r13,%r10
+ movq %r14,%r11
+ movq %r15,%r12
+ andq $3,%r12
+ movq %r15,%r13
+ andq $-4,%r13
+ movq %r9,%r14
+ shrdq $2,%r9,%r15
+ shrq $2,%r9
+ addq %r13,%r15
+ adcq %r14,%r9
+ addq %r15,%r10
+ adcq %r9,%r11
+ adcq $0,%r12
+
+ leaq 16(%rdi),%rdi
+ vextracti128 $1,%ymm0,%xmm0
+.Lseal_avx2_exit:
+ vzeroupper
+ jmp .Lseal_sse_tail_16
+.cfi_endproc
+.size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S
@@ -1,0 +1,852 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+.type _aesni_ctr32_ghash_6x,@function
+.align 32
+_aesni_ctr32_ghash_6x:
+.cfi_startproc
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp .Loop6x
+
+.align 32
+.Loop6x:
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movbeq 24(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movbeq 8(%r14),%r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $11,%ebp
+ jb .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ je .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp .Lenc_tail
+
+.align 32
+.Lhandle_ctr32:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32
+
+.align 32
+.Lenc_tail:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10
+ subq $0x6,%rdx
+ jc .L6x_done
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+.globl aesni_gcm_decrypt
+.hidden aesni_gcm_decrypt
+.type aesni_gcm_decrypt,@function
+.align 32
+aesni_gcm_decrypt:
+.cfi_startproc
+ xorq %r10,%r10
+
+
+
+ cmpq $0x60,%rdx
+ jb .Lgcm_dec_abort
+
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r9),%xmm8
+ andq $-128,%rsp
+ vmovdqu (%r11),%xmm0
+ leaq 128(%rcx),%rcx
+ leaq 32+32(%r9),%r9
+ movl 240-128(%rcx),%ebp
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Ldec_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Ldec_no_key_aliasing
+ subq %r15,%rsp
+.Ldec_no_key_aliasing:
+
+ vmovdqu 80(%rdi),%xmm7
+ leaq (%rdi),%r14
+ vmovdqu 64(%rdi),%xmm4
+
+
+
+
+
+
+
+ leaq -192(%rdi,%rdx,1),%r15
+
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx
+ xorq %r10,%r10
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp)
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+ call _aesni_ctr32_ghash_6x
+
+ vmovups %xmm9,-96(%rsi)
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lgcm_dec_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type _aesni_ctr32_6x,@function
+.align 32
+_aesni_ctr32_6x:
+.cfi_startproc
+ vmovdqu 0-128(%rcx),%xmm4
+ vmovdqu 32(%r11),%xmm2
+ leaq -1(%rbp),%r13
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12
+ vpxor %xmm4,%xmm1,%xmm9
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32_2
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+
+.align 16
+.Loop_ctr32:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi)
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ .byte 0xf3,0xc3
+.align 32
+.Lhandle_ctr32_2:
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+.cfi_endproc
+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl aesni_gcm_encrypt
+.hidden aesni_gcm_encrypt
+.type aesni_gcm_encrypt,@function
+.align 32
+aesni_gcm_encrypt:
+.cfi_startproc
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+ movb $1,BORINGSSL_function_hit+2(%rip)
+#endif
+ xorq %r10,%r10
+
+
+
+
+ cmpq $288,%rdx
+ jb .Lgcm_enc_abort
+
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx
+ vmovdqu (%r11),%xmm0
+ andq $-128,%rsp
+ movl 240-128(%rcx),%ebp
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Lenc_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Lenc_no_key_aliasing
+ subq %r15,%rsp
+.Lenc_no_key_aliasing:
+
+ leaq (%rsi),%r14
+
+
+
+
+
+
+
+
+ leaq -192(%rsi,%rdx,1),%r15
+
+ shrq $4,%rdx
+
+ call _aesni_ctr32_6x
+ vpshufb %xmm0,%xmm9,%xmm8
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu (%r9),%xmm8
+ leaq 32+32(%r9),%r9
+ subq $12,%rdx
+ movq $192,%r10
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ call _aesni_ctr32_ghash_6x
+ vmovdqu 32(%rsp),%xmm7
+ vmovdqu (%r11),%xmm0
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi)
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ vpshufb (%r11),%xmm8,%xmm8
+ vmovdqu %xmm8,-64(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lgcm_enc_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S
@@ -1,0 +1,2506 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+.globl aes_hw_encrypt
+.hidden aes_hw_encrypt
+.type aes_hw_encrypt,@function
+.align 16
+aes_hw_encrypt:
+.cfi_startproc
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+ movb $1,BORINGSSL_function_hit+1(%rip)
+#endif
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+.Loop_enc1_1:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz .Loop_enc1_1
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes_hw_encrypt,.-aes_hw_encrypt
+
+.globl aes_hw_decrypt
+.hidden aes_hw_decrypt
+.type aes_hw_decrypt,@function
+.align 16
+aes_hw_decrypt:
+.cfi_startproc
+ movups (%rdi),%xmm2
+ movl 240(%rdx),%eax
+ movups (%rdx),%xmm0
+ movups 16(%rdx),%xmm1
+ leaq 32(%rdx),%rdx
+ xorps %xmm0,%xmm2
+.Loop_dec1_2:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rdx),%xmm1
+ leaq 16(%rdx),%rdx
+ jnz .Loop_dec1_2
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes_hw_decrypt, .-aes_hw_decrypt
+.type _aesni_encrypt2,@function
+.align 16
+_aesni_encrypt2:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Lenc_loop2:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop2
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_encrypt2,.-_aesni_encrypt2
+.type _aesni_decrypt2,@function
+.align 16
+_aesni_decrypt2:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Ldec_loop2:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop2
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_decrypt2,.-_aesni_decrypt2
+.type _aesni_encrypt3,@function
+.align 16
+_aesni_encrypt3:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Lenc_loop3:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop3
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_encrypt3,.-_aesni_encrypt3
+.type _aesni_decrypt3,@function
+.align 16
+_aesni_decrypt3:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+ addq $16,%rax
+
+.Ldec_loop3:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop3
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_decrypt3,.-_aesni_decrypt3
+.type _aesni_encrypt4,@function
+.align 16
+_aesni_encrypt4:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
+
+.Lenc_loop4:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop4
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_encrypt4,.-_aesni_encrypt4
+.type _aesni_decrypt4,@function
+.align 16
+_aesni_decrypt4:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ xorps %xmm0,%xmm4
+ xorps %xmm0,%xmm5
+ movups 32(%rcx),%xmm0
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 0x0f,0x1f,0x00
+ addq $16,%rax
+
+.Ldec_loop4:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop4
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_decrypt4,.-_aesni_decrypt4
+.type _aesni_encrypt6,@function
+.align 16
+_aesni_encrypt6:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,220,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,220,225
+ pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Lenc_loop6_enter
+.align 16
+.Lenc_loop6:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.Lenc_loop6_enter:
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop6
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_encrypt6,.-_aesni_encrypt6
+.type _aesni_decrypt6,@function
+.align 16
+_aesni_decrypt6:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ pxor %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+.byte 102,15,56,222,209
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+.byte 102,15,56,222,225
+ pxor %xmm0,%xmm7
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Ldec_loop6_enter
+.align 16
+.Ldec_loop6:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.Ldec_loop6_enter:
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop6
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_decrypt6,.-_aesni_decrypt6
+.type _aesni_encrypt8,@function
+.align 16
+_aesni_encrypt8:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,209
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
+.byte 102,15,56,220,217
+ pxor %xmm0,%xmm9
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Lenc_loop8_inner
+.align 16
+.Lenc_loop8:
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.Lenc_loop8_inner:
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+.Lenc_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Lenc_loop8
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+.byte 102,15,56,221,208
+.byte 102,15,56,221,216
+.byte 102,15,56,221,224
+.byte 102,15,56,221,232
+.byte 102,15,56,221,240
+.byte 102,15,56,221,248
+.byte 102,68,15,56,221,192
+.byte 102,68,15,56,221,200
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_encrypt8,.-_aesni_encrypt8
+.type _aesni_decrypt8,@function
+.align 16
+_aesni_decrypt8:
+.cfi_startproc
+ movups (%rcx),%xmm0
+ shll $4,%eax
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm2
+ xorps %xmm0,%xmm3
+ pxor %xmm0,%xmm4
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ leaq 32(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm9
+ movups (%rcx,%rax,1),%xmm0
+ addq $16,%rax
+ jmp .Ldec_loop8_inner
+.align 16
+.Ldec_loop8:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.Ldec_loop8_inner:
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+.Ldec_loop8_enter:
+ movups (%rcx,%rax,1),%xmm1
+ addq $32,%rax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups -16(%rcx,%rax,1),%xmm0
+ jnz .Ldec_loop8
+
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+.byte 102,15,56,223,208
+.byte 102,15,56,223,216
+.byte 102,15,56,223,224
+.byte 102,15,56,223,232
+.byte 102,15,56,223,240
+.byte 102,15,56,223,248
+.byte 102,68,15,56,223,192
+.byte 102,68,15,56,223,200
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_decrypt8,.-_aesni_decrypt8
+.globl aes_hw_ecb_encrypt
+.hidden aes_hw_ecb_encrypt
+.type aes_hw_ecb_encrypt,@function
+.align 16
+aes_hw_ecb_encrypt:
+.cfi_startproc
+ andq $-16,%rdx
+ jz .Lecb_ret
+
+ movl 240(%rcx),%eax
+ movups (%rcx),%xmm0
+ movq %rcx,%r11
+ movl %eax,%r10d
+ testl %r8d,%r8d
+ jz .Lecb_decrypt
+
+ cmpq $0x80,%rdx
+ jb .Lecb_enc_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $0x80,%rdx
+ jmp .Lecb_enc_loop8_enter
+.align 16
+.Lecb_enc_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+.Lecb_enc_loop8_enter:
+
+ call _aesni_encrypt8
+
+ subq $0x80,%rdx
+ jnc .Lecb_enc_loop8
+
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ addq $0x80,%rdx
+ jz .Lecb_ret
+
+.Lecb_enc_tail:
+ movups (%rdi),%xmm2
+ cmpq $0x20,%rdx
+ jb .Lecb_enc_one
+ movups 16(%rdi),%xmm3
+ je .Lecb_enc_two
+ movups 32(%rdi),%xmm4
+ cmpq $0x40,%rdx
+ jb .Lecb_enc_three
+ movups 48(%rdi),%xmm5
+ je .Lecb_enc_four
+ movups 64(%rdi),%xmm6
+ cmpq $0x60,%rdx
+ jb .Lecb_enc_five
+ movups 80(%rdi),%xmm7
+ je .Lecb_enc_six
+ movdqu 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_encrypt8
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ movups %xmm8,96(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_3:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_3
+.byte 102,15,56,221,209
+ movups %xmm2,(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_two:
+ call _aesni_encrypt2
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_three:
+ call _aesni_encrypt3
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_four:
+ call _aesni_encrypt4
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_five:
+ xorps %xmm7,%xmm7
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ jmp .Lecb_ret
+.align 16
+.Lecb_enc_six:
+ call _aesni_encrypt6
+ movups %xmm2,(%rsi)
+ movups %xmm3,16(%rsi)
+ movups %xmm4,32(%rsi)
+ movups %xmm5,48(%rsi)
+ movups %xmm6,64(%rsi)
+ movups %xmm7,80(%rsi)
+ jmp .Lecb_ret
+
+.align 16
+.Lecb_decrypt:
+ cmpq $0x80,%rdx
+ jb .Lecb_dec_tail
+
+ movdqu (%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqu 32(%rdi),%xmm4
+ movdqu 48(%rdi),%xmm5
+ movdqu 64(%rdi),%xmm6
+ movdqu 80(%rdi),%xmm7
+ movdqu 96(%rdi),%xmm8
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+ subq $0x80,%rdx
+ jmp .Lecb_dec_loop8_enter
+.align 16
+.Lecb_dec_loop8:
+ movups %xmm2,(%rsi)
+ movq %r11,%rcx
+ movdqu (%rdi),%xmm2
+ movl %r10d,%eax
+ movups %xmm3,16(%rsi)
+ movdqu 16(%rdi),%xmm3
+ movups %xmm4,32(%rsi)
+ movdqu 32(%rdi),%xmm4
+ movups %xmm5,48(%rsi)
+ movdqu 48(%rdi),%xmm5
+ movups %xmm6,64(%rsi)
+ movdqu 64(%rdi),%xmm6
+ movups %xmm7,80(%rsi)
+ movdqu 80(%rdi),%xmm7
+ movups %xmm8,96(%rsi)
+ movdqu 96(%rdi),%xmm8
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+ movdqu 112(%rdi),%xmm9
+ leaq 128(%rdi),%rdi
+.Lecb_dec_loop8_enter:
+
+ call _aesni_decrypt8
+
+ movups (%r11),%xmm0
+ subq $0x80,%rdx
+ jnc .Lecb_dec_loop8
+
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movq %r11,%rcx
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movl %r10d,%eax
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
+ movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ movups %xmm9,112(%rsi)
+ pxor %xmm9,%xmm9
+ leaq 128(%rsi),%rsi
+ addq $0x80,%rdx
+ jz .Lecb_ret
+
+.Lecb_dec_tail:
+ movups (%rdi),%xmm2
+ cmpq $0x20,%rdx
+ jb .Lecb_dec_one
+ movups 16(%rdi),%xmm3
+ je .Lecb_dec_two
+ movups 32(%rdi),%xmm4
+ cmpq $0x40,%rdx
+ jb .Lecb_dec_three
+ movups 48(%rdi),%xmm5
+ je .Lecb_dec_four
+ movups 64(%rdi),%xmm6
+ cmpq $0x60,%rdx
+ jb .Lecb_dec_five
+ movups 80(%rdi),%xmm7
+ je .Lecb_dec_six
+ movups 96(%rdi),%xmm8
+ movups (%rcx),%xmm0
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
+ movups %xmm8,96(%rsi)
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_one:
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_4:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_4
+.byte 102,15,56,223,209
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_two:
+ call _aesni_decrypt2
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_three:
+ call _aesni_decrypt3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_four:
+ call _aesni_decrypt4
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_five:
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ jmp .Lecb_ret
+.align 16
+.Lecb_dec_six:
+ call _aesni_decrypt6
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ movups %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movups %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movups %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ movups %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ movups %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
+
+.Lecb_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes_hw_ecb_encrypt,.-aes_hw_ecb_encrypt
+.globl aes_hw_ctr32_encrypt_blocks
+.hidden aes_hw_ctr32_encrypt_blocks
+.type aes_hw_ctr32_encrypt_blocks,@function
+.align 16
+aes_hw_ctr32_encrypt_blocks:
+.cfi_startproc
+#ifdef BORINGSSL_DISPATCH_TEST
+ movb $1,BORINGSSL_function_hit(%rip)
+#endif
+ cmpq $1,%rdx
+ jne .Lctr32_bulk
+
+
+
+ movups (%r8),%xmm2
+ movups (%rdi),%xmm3
+ movl 240(%rcx),%edx
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_enc1_5:
+.byte 102,15,56,220,209
+ decl %edx
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_5
+.byte 102,15,56,221,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ xorps %xmm2,%xmm2
+ jmp .Lctr32_epilogue
+
+.align 16
+.Lctr32_bulk:
+ leaq (%rsp),%r11
+.cfi_def_cfa_register %r11
+ pushq %rbp
+.cfi_offset %rbp,-16
+ subq $128,%rsp
+ andq $-16,%rsp
+
+
+
+
+ movdqu (%r8),%xmm2
+ movdqu (%rcx),%xmm0
+ movl 12(%r8),%r8d
+ pxor %xmm0,%xmm2
+ movl 12(%rcx),%ebp
+ movdqa %xmm2,0(%rsp)
+ bswapl %r8d
+ movdqa %xmm2,%xmm3
+ movdqa %xmm2,%xmm4
+ movdqa %xmm2,%xmm5
+ movdqa %xmm2,64(%rsp)
+ movdqa %xmm2,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movq %rdx,%r10
+ movdqa %xmm2,112(%rsp)
+
+ leaq 1(%r8),%rax
+ leaq 2(%r8),%rdx
+ bswapl %eax
+ bswapl %edx
+ xorl %ebp,%eax
+ xorl %ebp,%edx
+.byte 102,15,58,34,216,3
+ leaq 3(%r8),%rax
+ movdqa %xmm3,16(%rsp)
+.byte 102,15,58,34,226,3
+ bswapl %eax
+ movq %r10,%rdx
+ leaq 4(%r8),%r10
+ movdqa %xmm4,32(%rsp)
+ xorl %ebp,%eax
+ bswapl %r10d
+.byte 102,15,58,34,232,3
+ xorl %ebp,%r10d
+ movdqa %xmm5,48(%rsp)
+ leaq 5(%r8),%r9
+ movl %r10d,64+12(%rsp)
+ bswapl %r9d
+ leaq 6(%r8),%r10
+ movl 240(%rcx),%eax
+ xorl %ebp,%r9d
+ bswapl %r10d
+ movl %r9d,80+12(%rsp)
+ xorl %ebp,%r10d
+ leaq 7(%r8),%r9
+ movl %r10d,96+12(%rsp)
+ bswapl %r9d
+ leaq OPENSSL_ia32cap_P(%rip),%r10
+ movl 4(%r10),%r10d
+ xorl %ebp,%r9d
+ andl $71303168,%r10d
+ movl %r9d,112+12(%rsp)
+
+ movups 16(%rcx),%xmm1
+
+ movdqa 64(%rsp),%xmm6
+ movdqa 80(%rsp),%xmm7
+
+ cmpq $8,%rdx
+ jb .Lctr32_tail
+
+ subq $6,%rdx
+ cmpl $4194304,%r10d
+ je .Lctr32_6x
+
+ leaq 128(%rcx),%rcx
+ subq $2,%rdx
+ jmp .Lctr32_loop8
+
+.align 16
+.Lctr32_6x:
+ shll $4,%eax
+ movl $48,%r10d
+ bswapl %ebp
+ leaq 32(%rcx,%rax,1),%rcx
+ subq %rax,%r10
+ jmp .Lctr32_loop6
+
+.align 16
+.Lctr32_loop6:
+ addl $6,%r8d
+ movups -48(%rcx,%r10,1),%xmm0
+.byte 102,15,56,220,209
+ movl %r8d,%eax
+ xorl %ebp,%eax
+.byte 102,15,56,220,217
+.byte 0x0f,0x38,0xf1,0x44,0x24,12
+ leal 1(%r8),%eax
+.byte 102,15,56,220,225
+ xorl %ebp,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,28
+.byte 102,15,56,220,233
+ leal 2(%r8),%eax
+ xorl %ebp,%eax
+.byte 102,15,56,220,241
+.byte 0x0f,0x38,0xf1,0x44,0x24,44
+ leal 3(%r8),%eax
+.byte 102,15,56,220,249
+ movups -32(%rcx,%r10,1),%xmm1
+ xorl %ebp,%eax
+
+.byte 102,15,56,220,208
+.byte 0x0f,0x38,0xf1,0x44,0x24,60
+ leal 4(%r8),%eax
+.byte 102,15,56,220,216
+ xorl %ebp,%eax
+.byte 0x0f,0x38,0xf1,0x44,0x24,76
+.byte 102,15,56,220,224
+ leal 5(%r8),%eax
+ xorl %ebp,%eax
+.byte 102,15,56,220,232
+.byte 0x0f,0x38,0xf1,0x44,0x24,92
+ movq %r10,%rax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+ movups -16(%rcx,%r10,1),%xmm0
+
+ call .Lenc_loop6
+
+ movdqu (%rdi),%xmm8
+ movdqu 16(%rdi),%xmm9
+ movdqu 32(%rdi),%xmm10
+ movdqu 48(%rdi),%xmm11
+ movdqu 64(%rdi),%xmm12
+ movdqu 80(%rdi),%xmm13
+ leaq 96(%rdi),%rdi
+ movups -64(%rcx,%r10,1),%xmm1
+ pxor %xmm2,%xmm8
+ movaps 0(%rsp),%xmm2
+ pxor %xmm3,%xmm9
+ movaps 16(%rsp),%xmm3
+ pxor %xmm4,%xmm10
+ movaps 32(%rsp),%xmm4
+ pxor %xmm5,%xmm11
+ movaps 48(%rsp),%xmm5
+ pxor %xmm6,%xmm12
+ movaps 64(%rsp),%xmm6
+ pxor %xmm7,%xmm13
+ movaps 80(%rsp),%xmm7
+ movdqu %xmm8,(%rsi)
+ movdqu %xmm9,16(%rsi)
+ movdqu %xmm10,32(%rsi)
+ movdqu %xmm11,48(%rsi)
+ movdqu %xmm12,64(%rsi)
+ movdqu %xmm13,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ subq $6,%rdx
+ jnc .Lctr32_loop6
+
+ addq $6,%rdx
+ jz .Lctr32_done
+
+ leal -48(%r10),%eax
+ leaq -80(%rcx,%r10,1),%rcx
+ negl %eax
+ shrl $4,%eax
+ jmp .Lctr32_tail
+
+.align 32
+.Lctr32_loop8:
+ addl $8,%r8d
+ movdqa 96(%rsp),%xmm8
+.byte 102,15,56,220,209
+ movl %r8d,%r9d
+ movdqa 112(%rsp),%xmm9
+.byte 102,15,56,220,217
+ bswapl %r9d
+ movups 32-128(%rcx),%xmm0
+.byte 102,15,56,220,225
+ xorl %ebp,%r9d
+ nop
+.byte 102,15,56,220,233
+ movl %r9d,0+12(%rsp)
+ leaq 1(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 48-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,16+12(%rsp)
+ leaq 2(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 64-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,32+12(%rsp)
+ leaq 3(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 80-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,48+12(%rsp)
+ leaq 4(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 96-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,64+12(%rsp)
+ leaq 5(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 112-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+ movl %r9d,80+12(%rsp)
+ leaq 6(%r8),%r9
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 128-128(%rcx),%xmm0
+ bswapl %r9d
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ xorl %ebp,%r9d
+.byte 0x66,0x90
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movl %r9d,96+12(%rsp)
+ leaq 7(%r8),%r9
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 144-128(%rcx),%xmm1
+ bswapl %r9d
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+ xorl %ebp,%r9d
+ movdqu 0(%rdi),%xmm10
+.byte 102,15,56,220,232
+ movl %r9d,112+12(%rsp)
+ cmpl $11,%eax
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 160-128(%rcx),%xmm0
+
+ jb .Lctr32_enc_done
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 176-128(%rcx),%xmm1
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 192-128(%rcx),%xmm0
+ je .Lctr32_enc_done
+
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movups 208-128(%rcx),%xmm1
+
+.byte 102,15,56,220,208
+.byte 102,15,56,220,216
+.byte 102,15,56,220,224
+.byte 102,15,56,220,232
+.byte 102,15,56,220,240
+.byte 102,15,56,220,248
+.byte 102,68,15,56,220,192
+.byte 102,68,15,56,220,200
+ movups 224-128(%rcx),%xmm0
+ jmp .Lctr32_enc_done
+
+.align 16
+.Lctr32_enc_done:
+ movdqu 16(%rdi),%xmm11
+ pxor %xmm0,%xmm10
+ movdqu 32(%rdi),%xmm12
+ pxor %xmm0,%xmm11
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm0,%xmm12
+ movdqu 64(%rdi),%xmm14
+ pxor %xmm0,%xmm13
+ movdqu 80(%rdi),%xmm15
+ pxor %xmm0,%xmm14
+ pxor %xmm0,%xmm15
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+.byte 102,68,15,56,220,201
+ movdqu 96(%rdi),%xmm1
+ leaq 128(%rdi),%rdi
+
+.byte 102,65,15,56,221,210
+ pxor %xmm0,%xmm1
+ movdqu 112-128(%rdi),%xmm10
+.byte 102,65,15,56,221,219
+ pxor %xmm0,%xmm10
+ movdqa 0(%rsp),%xmm11
+.byte 102,65,15,56,221,228
+.byte 102,65,15,56,221,237
+ movdqa 16(%rsp),%xmm12
+ movdqa 32(%rsp),%xmm13
+.byte 102,65,15,56,221,246
+.byte 102,65,15,56,221,255
+ movdqa 48(%rsp),%xmm14
+ movdqa 64(%rsp),%xmm15
+.byte 102,68,15,56,221,193
+ movdqa 80(%rsp),%xmm0
+ movups 16-128(%rcx),%xmm1
+.byte 102,69,15,56,221,202
+
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm0,%xmm7
+ movups %xmm8,96(%rsi)
+ movups %xmm9,112(%rsi)
+ leaq 128(%rsi),%rsi
+
+ subq $8,%rdx
+ jnc .Lctr32_loop8
+
+ addq $8,%rdx
+ jz .Lctr32_done
+ leaq -128(%rcx),%rcx
+
+.Lctr32_tail:
+
+
+ leaq 16(%rcx),%rcx
+ cmpq $4,%rdx
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
+
+
+ shll $4,%eax
+ movdqa 96(%rsp),%xmm8
+ pxor %xmm9,%xmm9
+
+ movups 16(%rcx),%xmm0
+.byte 102,15,56,220,209
+.byte 102,15,56,220,217
+ leaq 32-16(%rcx,%rax,1),%rcx
+ negq %rax
+.byte 102,15,56,220,225
+ addq $16,%rax
+ movups (%rdi),%xmm10
+.byte 102,15,56,220,233
+.byte 102,15,56,220,241
+ movups 16(%rdi),%xmm11
+ movups 32(%rdi),%xmm12
+.byte 102,15,56,220,249
+.byte 102,68,15,56,220,193
+
+ call .Lenc_loop8_enter
+
+ movdqu 48(%rdi),%xmm13
+ pxor %xmm10,%xmm2
+ movdqu 64(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm10,%xmm6
+ movdqu %xmm5,48(%rsi)
+ movdqu %xmm6,64(%rsi)
+ cmpq $6,%rdx
+ jb .Lctr32_done
+
+ movups 80(%rdi),%xmm11
+ xorps %xmm11,%xmm7
+ movups %xmm7,80(%rsi)
+ je .Lctr32_done
+
+ movups 96(%rdi),%xmm12
+ xorps %xmm12,%xmm8
+ movups %xmm8,96(%rsi)
+ jmp .Lctr32_done
+
+.align 32
+.Lctr32_loop4:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+.byte 102,15,56,220,233
+ movups (%rcx),%xmm1
+ jnz .Lctr32_loop4
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+ movups (%rdi),%xmm10
+ movups 16(%rdi),%xmm11
+.byte 102,15,56,221,225
+.byte 102,15,56,221,233
+ movups 32(%rdi),%xmm12
+ movups 48(%rdi),%xmm13
+
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm5,48(%rsi)
+ jmp .Lctr32_done
+
+.align 32
+.Lctr32_loop3:
+.byte 102,15,56,220,209
+ leaq 16(%rcx),%rcx
+ decl %eax
+.byte 102,15,56,220,217
+.byte 102,15,56,220,225
+ movups (%rcx),%xmm1
+ jnz .Lctr32_loop3
+.byte 102,15,56,221,209
+.byte 102,15,56,221,217
+.byte 102,15,56,221,225
+
+ movups (%rdi),%xmm10
+ xorps %xmm10,%xmm2
+ movups %xmm2,(%rsi)
+ cmpq $2,%rdx
+ jb .Lctr32_done
+
+ movups 16(%rdi),%xmm11
+ xorps %xmm11,%xmm3
+ movups %xmm3,16(%rsi)
+ je .Lctr32_done
+
+ movups 32(%rdi),%xmm12
+ xorps %xmm12,%xmm4
+ movups %xmm4,32(%rsi)
+
+.Lctr32_done:
+ xorps %xmm0,%xmm0
+ xorl %ebp,%ebp
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0(%rsp)
+ pxor %xmm8,%xmm8
+ movaps %xmm0,16(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,32(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,48(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,64(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,80(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,96(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,112(%rsp)
+ pxor %xmm15,%xmm15
+ movq -8(%r11),%rbp
+.cfi_restore %rbp
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lctr32_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
+.globl aes_hw_cbc_encrypt
+.hidden aes_hw_cbc_encrypt
+.type aes_hw_cbc_encrypt,@function
+.align 16
+aes_hw_cbc_encrypt:
+.cfi_startproc
+ testq %rdx,%rdx
+ jz .Lcbc_ret
+
+ movl 240(%rcx),%r10d
+ movq %rcx,%r11
+ testl %r9d,%r9d
+ jz .Lcbc_decrypt
+
+ movups (%r8),%xmm2
+ movl %r10d,%eax
+ cmpq $16,%rdx
+ jb .Lcbc_enc_tail
+ subq $16,%rdx
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movups (%rdi),%xmm3
+ leaq 16(%rdi),%rdi
+
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ xorps %xmm0,%xmm3
+ leaq 32(%rcx),%rcx
+ xorps %xmm3,%xmm2
+.Loop_enc1_6:
+.byte 102,15,56,220,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_enc1_6
+.byte 102,15,56,221,209
+ movl %r10d,%eax
+ movq %r11,%rcx
+ movups %xmm2,0(%rsi)
+ leaq 16(%rsi),%rsi
+ subq $16,%rdx
+ jnc .Lcbc_enc_loop
+ addq $16,%rdx
+ jnz .Lcbc_enc_tail
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movups %xmm2,(%r8)
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ jmp .Lcbc_ret
+
+.Lcbc_enc_tail:
+ movq %rdx,%rcx
+ xchgq %rdi,%rsi
+.long 0x9066A4F3
+ movl $16,%ecx
+ subq %rdx,%rcx
+ xorl %eax,%eax
+.long 0x9066AAF3
+ leaq -16(%rdi),%rdi
+ movl %r10d,%eax
+ movq %rdi,%rsi
+ movq %r11,%rcx
+ xorq %rdx,%rdx
+ jmp .Lcbc_enc_loop
+
+.align 16
+.Lcbc_decrypt:
+ cmpq $16,%rdx
+ jne .Lcbc_decrypt_bulk
+
+
+
+ movdqu (%rdi),%xmm2
+ movdqu (%r8),%xmm3
+ movdqa %xmm2,%xmm4
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_7:
+.byte 102,15,56,222,209
+ decl %r10d
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_7
+.byte 102,15,56,223,209
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movdqu %xmm4,(%r8)
+ xorps %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp .Lcbc_ret
+.align 16
+.Lcbc_decrypt_bulk:
+ leaq (%rsp),%r11
+.cfi_def_cfa_register %r11
+ pushq %rbp
+.cfi_offset %rbp,-16
+ subq $16,%rsp
+ andq $-16,%rsp
+ movq %rcx,%rbp
+ movups (%r8),%xmm10
+ movl %r10d,%eax
+ cmpq $0x50,%rdx
+ jbe .Lcbc_dec_tail
+
+ movups (%rcx),%xmm0
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+ leaq OPENSSL_ia32cap_P(%rip),%r9
+ movl 4(%r9),%r9d
+ cmpq $0x70,%rdx
+ jbe .Lcbc_dec_six_or_seven
+
+ andl $71303168,%r9d
+ subq $0x50,%rdx
+ cmpl $4194304,%r9d
+ je .Lcbc_dec_loop6_enter
+ subq $0x20,%rdx
+ leaq 112(%rcx),%rcx
+ jmp .Lcbc_dec_loop8_enter
+.align 16
+.Lcbc_dec_loop8:
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+.Lcbc_dec_loop8_enter:
+ movdqu 96(%rdi),%xmm8
+ pxor %xmm0,%xmm2
+ movdqu 112(%rdi),%xmm9
+ pxor %xmm0,%xmm3
+ movups 16-112(%rcx),%xmm1
+ pxor %xmm0,%xmm4
+ movq $-1,%rbp
+ cmpq $0x70,%rdx
+ pxor %xmm0,%xmm5
+ pxor %xmm0,%xmm6
+ pxor %xmm0,%xmm7
+ pxor %xmm0,%xmm8
+
+.byte 102,15,56,222,209
+ pxor %xmm0,%xmm9
+ movups 32-112(%rcx),%xmm0
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+ adcq $0,%rbp
+ andq $128,%rbp
+.byte 102,68,15,56,222,201
+ addq %rdi,%rbp
+ movups 48-112(%rcx),%xmm1
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 64-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 80-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 96-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 112-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 128-112(%rcx),%xmm0
+ nop
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 144-112(%rcx),%xmm1
+ cmpl $11,%eax
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 160-112(%rcx),%xmm0
+ jb .Lcbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 176-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 192-112(%rcx),%xmm0
+ je .Lcbc_dec_done
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movups 208-112(%rcx),%xmm1
+ nop
+.byte 102,15,56,222,208
+.byte 102,15,56,222,216
+.byte 102,15,56,222,224
+.byte 102,15,56,222,232
+.byte 102,15,56,222,240
+.byte 102,15,56,222,248
+.byte 102,68,15,56,222,192
+.byte 102,68,15,56,222,200
+ movups 224-112(%rcx),%xmm0
+ jmp .Lcbc_dec_done
+.align 16
+.Lcbc_dec_done:
+.byte 102,15,56,222,209
+.byte 102,15,56,222,217
+ pxor %xmm0,%xmm10
+ pxor %xmm0,%xmm11
+.byte 102,15,56,222,225
+.byte 102,15,56,222,233
+ pxor %xmm0,%xmm12
+ pxor %xmm0,%xmm13
+.byte 102,15,56,222,241
+.byte 102,15,56,222,249
+ pxor %xmm0,%xmm14
+ pxor %xmm0,%xmm15
+.byte 102,68,15,56,222,193
+.byte 102,68,15,56,222,201
+ movdqu 80(%rdi),%xmm1
+
+.byte 102,65,15,56,223,210
+ movdqu 96(%rdi),%xmm10
+ pxor %xmm0,%xmm1
+.byte 102,65,15,56,223,219
+ pxor %xmm0,%xmm10
+ movdqu 112(%rdi),%xmm0
+.byte 102,65,15,56,223,228
+ leaq 128(%rdi),%rdi
+ movdqu 0(%rbp),%xmm11
+.byte 102,65,15,56,223,237
+.byte 102,65,15,56,223,246
+ movdqu 16(%rbp),%xmm12
+ movdqu 32(%rbp),%xmm13
+.byte 102,65,15,56,223,255
+.byte 102,68,15,56,223,193
+ movdqu 48(%rbp),%xmm14
+ movdqu 64(%rbp),%xmm15
+.byte 102,69,15,56,223,202
+ movdqa %xmm0,%xmm10
+ movdqu 80(%rbp),%xmm1
+ movups -112(%rcx),%xmm0
+
+ movups %xmm2,(%rsi)
+ movdqa %xmm11,%xmm2
+ movups %xmm3,16(%rsi)
+ movdqa %xmm12,%xmm3
+ movups %xmm4,32(%rsi)
+ movdqa %xmm13,%xmm4
+ movups %xmm5,48(%rsi)
+ movdqa %xmm14,%xmm5
+ movups %xmm6,64(%rsi)
+ movdqa %xmm15,%xmm6
+ movups %xmm7,80(%rsi)
+ movdqa %xmm1,%xmm7
+ movups %xmm8,96(%rsi)
+ leaq 112(%rsi),%rsi
+
+ subq $0x80,%rdx
+ ja .Lcbc_dec_loop8
+
+ movaps %xmm9,%xmm2
+ leaq -112(%rcx),%rcx
+ addq $0x70,%rdx
+ jle .Lcbc_dec_clear_tail_collected
+ movups %xmm9,(%rsi)
+ leaq 16(%rsi),%rsi
+ cmpq $0x50,%rdx
+ jbe .Lcbc_dec_tail
+
+ movaps %xmm11,%xmm2
+.Lcbc_dec_six_or_seven:
+ cmpq $0x60,%rdx
+ ja .Lcbc_dec_seven
+
+ movaps %xmm7,%xmm8
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ leaq 80(%rsi),%rsi
+ movdqa %xmm7,%xmm2
+ pxor %xmm7,%xmm7
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_seven:
+ movups 96(%rdi),%xmm8
+ xorps %xmm9,%xmm9
+ call _aesni_decrypt8
+ movups 80(%rdi),%xmm9
+ pxor %xmm10,%xmm2
+ movups 96(%rdi),%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ pxor %xmm15,%xmm7
+ movdqu %xmm6,64(%rsi)
+ pxor %xmm6,%xmm6
+ pxor %xmm9,%xmm8
+ movdqu %xmm7,80(%rsi)
+ pxor %xmm7,%xmm7
+ leaq 96(%rsi),%rsi
+ movdqa %xmm8,%xmm2
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_loop6:
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+ movdqu 0(%rdi),%xmm2
+ movdqu 16(%rdi),%xmm3
+ movdqa %xmm2,%xmm11
+ movdqu 32(%rdi),%xmm4
+ movdqa %xmm3,%xmm12
+ movdqu 48(%rdi),%xmm5
+ movdqa %xmm4,%xmm13
+ movdqu 64(%rdi),%xmm6
+ movdqa %xmm5,%xmm14
+ movdqu 80(%rdi),%xmm7
+ movdqa %xmm6,%xmm15
+.Lcbc_dec_loop6_enter:
+ leaq 96(%rdi),%rdi
+ movdqa %xmm7,%xmm8
+
+ call _aesni_decrypt6
+
+ pxor %xmm10,%xmm2
+ movdqa %xmm8,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm14,%xmm6
+ movq %rbp,%rcx
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm15,%xmm7
+ movl %r10d,%eax
+ movdqu %xmm6,64(%rsi)
+ leaq 80(%rsi),%rsi
+ subq $0x60,%rdx
+ ja .Lcbc_dec_loop6
+
+ movdqa %xmm7,%xmm2
+ addq $0x50,%rdx
+ jle .Lcbc_dec_clear_tail_collected
+ movups %xmm7,(%rsi)
+ leaq 16(%rsi),%rsi
+
+.Lcbc_dec_tail:
+ movups (%rdi),%xmm2
+ subq $0x10,%rdx
+ jbe .Lcbc_dec_one
+
+ movups 16(%rdi),%xmm3
+ movaps %xmm2,%xmm11
+ subq $0x10,%rdx
+ jbe .Lcbc_dec_two
+
+ movups 32(%rdi),%xmm4
+ movaps %xmm3,%xmm12
+ subq $0x10,%rdx
+ jbe .Lcbc_dec_three
+
+ movups 48(%rdi),%xmm5
+ movaps %xmm4,%xmm13
+ subq $0x10,%rdx
+ jbe .Lcbc_dec_four
+
+ movups 64(%rdi),%xmm6
+ movaps %xmm5,%xmm14
+ movaps %xmm6,%xmm15
+ xorps %xmm7,%xmm7
+ call _aesni_decrypt6
+ pxor %xmm10,%xmm2
+ movaps %xmm15,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ pxor %xmm14,%xmm6
+ movdqu %xmm5,48(%rsi)
+ pxor %xmm5,%xmm5
+ leaq 64(%rsi),%rsi
+ movdqa %xmm6,%xmm2
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ subq $0x10,%rdx
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_one:
+ movaps %xmm2,%xmm11
+ movups (%rcx),%xmm0
+ movups 16(%rcx),%xmm1
+ leaq 32(%rcx),%rcx
+ xorps %xmm0,%xmm2
+.Loop_dec1_8:
+.byte 102,15,56,222,209
+ decl %eax
+ movups (%rcx),%xmm1
+ leaq 16(%rcx),%rcx
+ jnz .Loop_dec1_8
+.byte 102,15,56,223,209
+ xorps %xmm10,%xmm2
+ movaps %xmm11,%xmm10
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_two:
+ movaps %xmm3,%xmm12
+ call _aesni_decrypt2
+ pxor %xmm10,%xmm2
+ movaps %xmm12,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ movdqa %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ leaq 16(%rsi),%rsi
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_three:
+ movaps %xmm4,%xmm13
+ call _aesni_decrypt3
+ pxor %xmm10,%xmm2
+ movaps %xmm13,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ movdqa %xmm4,%xmm2
+ pxor %xmm4,%xmm4
+ leaq 32(%rsi),%rsi
+ jmp .Lcbc_dec_tail_collected
+.align 16
+.Lcbc_dec_four:
+ movaps %xmm5,%xmm14
+ call _aesni_decrypt4
+ pxor %xmm10,%xmm2
+ movaps %xmm14,%xmm10
+ pxor %xmm11,%xmm3
+ movdqu %xmm2,(%rsi)
+ pxor %xmm12,%xmm4
+ movdqu %xmm3,16(%rsi)
+ pxor %xmm3,%xmm3
+ pxor %xmm13,%xmm5
+ movdqu %xmm4,32(%rsi)
+ pxor %xmm4,%xmm4
+ movdqa %xmm5,%xmm2
+ pxor %xmm5,%xmm5
+ leaq 48(%rsi),%rsi
+ jmp .Lcbc_dec_tail_collected
+
+.align 16
+.Lcbc_dec_clear_tail_collected:
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ pxor %xmm8,%xmm8
+ pxor %xmm9,%xmm9
+.Lcbc_dec_tail_collected:
+ movups %xmm10,(%r8)
+ andq $15,%rdx
+ jnz .Lcbc_dec_tail_partial
+ movups %xmm2,(%rsi)
+ pxor %xmm2,%xmm2
+ jmp .Lcbc_dec_ret
+.align 16
+.Lcbc_dec_tail_partial:
+ movaps %xmm2,(%rsp)
+ pxor %xmm2,%xmm2
+ movq $16,%rcx
+ movq %rsi,%rdi
+ subq %rdx,%rcx
+ leaq (%rsp),%rsi
+.long 0x9066A4F3
+ movdqa %xmm2,(%rsp)
+
+.Lcbc_dec_ret:
+ xorps %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ movq -8(%r11),%rbp
+.cfi_restore %rbp
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lcbc_ret:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
+.globl aes_hw_set_decrypt_key
+.hidden aes_hw_set_decrypt_key
+.type aes_hw_set_decrypt_key,@function
+.align 16
+aes_hw_set_decrypt_key:
+.cfi_startproc
+.byte 0x48,0x83,0xEC,0x08
+.cfi_adjust_cfa_offset 8
+ call __aesni_set_encrypt_key
+ shll $4,%esi
+ testl %eax,%eax
+ jnz .Ldec_key_ret
+ leaq 16(%rdx,%rsi,1),%rdi
+
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+ movups %xmm0,(%rdi)
+ movups %xmm1,(%rdx)
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+
+.Ldec_key_inverse:
+ movups (%rdx),%xmm0
+ movups (%rdi),%xmm1
+.byte 102,15,56,219,192
+.byte 102,15,56,219,201
+ leaq 16(%rdx),%rdx
+ leaq -16(%rdi),%rdi
+ movups %xmm0,16(%rdi)
+ movups %xmm1,-16(%rdx)
+ cmpq %rdx,%rdi
+ ja .Ldec_key_inverse
+
+ movups (%rdx),%xmm0
+.byte 102,15,56,219,192
+ pxor %xmm1,%xmm1
+ movups %xmm0,(%rdi)
+ pxor %xmm0,%xmm0
+.Ldec_key_ret:
+ addq $8,%rsp
+.cfi_adjust_cfa_offset -8
+ .byte 0xf3,0xc3
+.cfi_endproc
+.LSEH_end_set_decrypt_key:
+.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
+.globl aes_hw_set_encrypt_key
+.hidden aes_hw_set_encrypt_key
+.type aes_hw_set_encrypt_key,@function
+.align 16
+aes_hw_set_encrypt_key:
+__aesni_set_encrypt_key:
+.cfi_startproc
+#ifdef BORINGSSL_DISPATCH_TEST
+ movb $1,BORINGSSL_function_hit+3(%rip)
+#endif
+.byte 0x48,0x83,0xEC,0x08
+.cfi_adjust_cfa_offset 8
+ movq $-1,%rax
+ testq %rdi,%rdi
+ jz .Lenc_key_ret
+ testq %rdx,%rdx
+ jz .Lenc_key_ret
+
+ movups (%rdi),%xmm0
+ xorps %xmm4,%xmm4
+ leaq OPENSSL_ia32cap_P(%rip),%r10
+ movl 4(%r10),%r10d
+ andl $268437504,%r10d
+ leaq 16(%rdx),%rax
+ cmpl $256,%esi
+ je .L14rounds
+ cmpl $192,%esi
+ je .L12rounds
+ cmpl $128,%esi
+ jne .Lbad_keybits
+
+.L10rounds:
+ movl $9,%esi
+ cmpl $268435456,%r10d
+ je .L10rounds_alt
+
+ movups %xmm0,(%rdx)
+.byte 102,15,58,223,200,1
+ call .Lkey_expansion_128_cold
+.byte 102,15,58,223,200,2
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,4
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,8
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,16
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,32
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,64
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,128
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,27
+ call .Lkey_expansion_128
+.byte 102,15,58,223,200,54
+ call .Lkey_expansion_128
+ movups %xmm0,(%rax)
+ movl %esi,80(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L10rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movl $8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key128
+
+.align 16
+.Loop_key128:
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+ leaq 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ decl %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+ pslld $1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+.byte 102,15,56,0,197
+.byte 102,15,56,221,196
+
+ movdqa %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq $4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ movl %esi,96(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds:
+ movq 16(%rdi),%xmm2
+ movl $11,%esi
+ cmpl $268435456,%r10d
+ je .L12rounds_alt
+
+ movups %xmm0,(%rdx)
+.byte 102,15,58,223,202,1
+ call .Lkey_expansion_192a_cold
+.byte 102,15,58,223,202,2
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,4
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,8
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,16
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,32
+ call .Lkey_expansion_192b
+.byte 102,15,58,223,202,64
+ call .Lkey_expansion_192a
+.byte 102,15,58,223,202,128
+ call .Lkey_expansion_192b
+ movups %xmm0,(%rax)
+ movl %esi,48(%rax)
+ xorq %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $8,%r10d
+ movdqu %xmm0,(%rdx)
+ jmp .Loop_key192
+
+.align 16
+.Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+ pslld $1,%xmm4
+ leaq 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd $0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ decl %r10d
+ jnz .Loop_key192
+
+ movl %esi,32(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds:
+ movups 16(%rdi),%xmm2
+ movl $13,%esi
+ leaq 16(%rax),%rax
+ cmpl $268435456,%r10d
+ je .L14rounds_alt
+
+ movups %xmm0,(%rdx)
+ movups %xmm2,16(%rdx)
+.byte 102,15,58,223,202,1
+ call .Lkey_expansion_256a_cold
+.byte 102,15,58,223,200,1
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,2
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,2
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,4
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,4
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,8
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,8
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,16
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,16
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,32
+ call .Lkey_expansion_256a
+.byte 102,15,58,223,200,32
+ call .Lkey_expansion_256b
+.byte 102,15,58,223,202,64
+ call .Lkey_expansion_256a
+ movups %xmm0,(%rax)
+ movl %esi,16(%rax)
+ xorq %rax,%rax
+ jmp .Lenc_key_ret
+
+.align 16
+.L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movl $7,%r10d
+ movdqu %xmm0,0(%rdx)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16(%rdx)
+ jmp .Loop_key256
+
+.align 16
+.Loop_key256:
+.byte 102,15,56,0,213
+.byte 102,15,56,221,212
+
+ movdqa %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq $4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld $1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ decl %r10d
+ jz .Ldone_key256
+
+ pshufd $0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+.byte 102,15,56,221,211
+
+ movdqa %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq $4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ leaq 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+.Ldone_key256:
+ movl %esi,16(%rax)
+ xorl %eax,%eax
+ jmp .Lenc_key_ret
+
+.align 16
+.Lbad_keybits:
+ movq $-2,%rax
+.Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ addq $8,%rsp
+.cfi_adjust_cfa_offset -8
+ .byte 0xf3,0xc3
+.cfi_endproc
+.LSEH_end_set_encrypt_key:
+
+.align 16
+.Lkey_expansion_128:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_128_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_192a:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_192a_cold:
+ movaps %xmm2,%xmm5
+.Lkey_expansion_192b_warm:
+ shufps $16,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ pslldq $4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd $85,%xmm1,%xmm1
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd $255,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps $68,%xmm0,%xmm5
+ movups %xmm5,(%rax)
+ shufps $78,%xmm2,%xmm3
+ movups %xmm3,16(%rax)
+ leaq 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+.align 16
+.Lkey_expansion_256a:
+ movups %xmm2,(%rax)
+ leaq 16(%rax),%rax
+.Lkey_expansion_256a_cold:
+ shufps $16,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $140,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps $255,%xmm1,%xmm1
+ xorps %xmm1,%xmm0
+ .byte 0xf3,0xc3
+
+.align 16
+.Lkey_expansion_256b:
+ movups %xmm0,(%rax)
+ leaq 16(%rax),%rax
+
+ shufps $16,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $140,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps $170,%xmm1,%xmm1
+ xorps %xmm1,%xmm2
+ .byte 0xf3,0xc3
+.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
+.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+.long 6,6,6,0
+.Lincrement64:
+.long 1,0,0,0
+.Lxts_magic:
+.long 0x87,0,1,0
+.Lincrement1:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Lkey_rotate:
+.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+.Lkey_rotate192:
+.long 0x04070605,0x04070605,0x04070605,0x04070605
+.Lkey_rcon1:
+.long 1,1,1,1
+.Lkey_rcon1b:
+.long 0x1b,0x1b,0x1b,0x1b
+
+.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S
@@ -1,0 +1,427 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+
+
+
+
+.type gcm_gmult_ssse3, @function
+.globl gcm_gmult_ssse3
+.hidden gcm_gmult_ssse3
+.align 16
+gcm_gmult_ssse3:
+.cfi_startproc
+.Lgmult_seh_begin:
+ movdqu (%rdi),%xmm0
+ movdqa .Lreverse_bytes(%rip),%xmm10
+ movdqa .Llow4_mask(%rip),%xmm2
+
+
+.byte 102,65,15,56,0,194
+
+
+ movdqa %xmm2,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm2,%xmm0
+
+
+
+
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ movq $5,%rax
+.Loop_row_1:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz .Loop_row_1
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movq $5,%rax
+.Loop_row_2:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz .Loop_row_2
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movq $6,%rax
+.Loop_row_3:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz .Loop_row_3
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+
+.byte 102,65,15,56,0,210
+ movdqu %xmm2,(%rdi)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ .byte 0xf3,0xc3
+.Lgmult_seh_end:
+.cfi_endproc
+.size gcm_gmult_ssse3,.-gcm_gmult_ssse3
+
+
+
+
+
+.type gcm_ghash_ssse3, @function
+.globl gcm_ghash_ssse3
+.hidden gcm_ghash_ssse3
+.align 16
+gcm_ghash_ssse3:
+.Lghash_seh_begin:
+.cfi_startproc
+ movdqu (%rdi),%xmm0
+ movdqa .Lreverse_bytes(%rip),%xmm10
+ movdqa .Llow4_mask(%rip),%xmm11
+
+
+ andq $-16,%rcx
+
+
+
+.byte 102,65,15,56,0,194
+
+
+ pxor %xmm3,%xmm3
+.Loop_ghash:
+
+ movdqu (%rdx),%xmm1
+.byte 102,65,15,56,0,202
+ pxor %xmm1,%xmm0
+
+
+ movdqa %xmm11,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm11,%xmm0
+
+
+
+
+ pxor %xmm2,%xmm2
+
+ movq $5,%rax
+.Loop_row_4:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz .Loop_row_4
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movq $5,%rax
+.Loop_row_5:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz .Loop_row_5
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movq $6,%rax
+.Loop_row_6:
+ movdqa (%rsi),%xmm4
+ leaq 16(%rsi),%rsi
+
+
+ movdqa %xmm2,%xmm6
+.byte 102,15,58,15,243,1
+ movdqa %xmm6,%xmm3
+ psrldq $1,%xmm2
+
+
+
+
+ movdqa %xmm4,%xmm5
+.byte 102,15,56,0,224
+.byte 102,15,56,0,233
+
+
+ pxor %xmm5,%xmm2
+
+
+
+ movdqa %xmm4,%xmm5
+ psllq $60,%xmm5
+ movdqa %xmm5,%xmm6
+ pslldq $8,%xmm6
+ pxor %xmm6,%xmm3
+
+
+ psrldq $8,%xmm5
+ pxor %xmm5,%xmm2
+ psrlq $4,%xmm4
+ pxor %xmm4,%xmm2
+
+ subq $1,%rax
+ jnz .Loop_row_6
+
+
+
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $1,%xmm3
+ pxor %xmm3,%xmm2
+ psrlq $5,%xmm3
+ pxor %xmm3,%xmm2
+ pxor %xmm3,%xmm3
+ movdqa %xmm2,%xmm0
+
+
+ leaq -256(%rsi),%rsi
+
+
+ leaq 16(%rdx),%rdx
+ subq $16,%rcx
+ jnz .Loop_ghash
+
+
+.byte 102,65,15,56,0,194
+ movdqu %xmm0,(%rdi)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ .byte 0xf3,0xc3
+.Lghash_seh_end:
+.cfi_endproc
+.size gcm_ghash_ssse3,.-gcm_ghash_ssse3
+
+.align 16
+
+
+.Lreverse_bytes:
+.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.Llow4_mask:
+.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S
@@ -1,0 +1,1127 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+.globl gcm_init_clmul
+.hidden gcm_init_clmul
+.type gcm_init_clmul,@function
+.align 16
+gcm_init_clmul:
+.cfi_startproc
+.L_init_clmul:
+ movdqu (%rsi),%xmm2
+ pshufd $78,%xmm2,%xmm2
+
+
+ pshufd $255,%xmm2,%xmm4
+ movdqa %xmm2,%xmm3
+ psllq $1,%xmm2
+ pxor %xmm5,%xmm5
+ psrlq $63,%xmm3
+ pcmpgtd %xmm4,%xmm5
+ pslldq $8,%xmm3
+ por %xmm3,%xmm2
+
+
+ pand .L0x1c2_polynomial(%rip),%xmm5
+ pxor %xmm5,%xmm2
+
+
+ pshufd $78,%xmm2,%xmm6
+ movdqa %xmm2,%xmm0
+ pxor %xmm2,%xmm6
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm2,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm2,%xmm3
+ movdqu %xmm2,0(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,16(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,32(%rdi)
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm5
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,222,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ pshufd $78,%xmm5,%xmm3
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm5,%xmm3
+ movdqu %xmm5,48(%rdi)
+ pxor %xmm0,%xmm4
+ movdqu %xmm0,64(%rdi)
+.byte 102,15,58,15,227,8
+ movdqu %xmm4,80(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size gcm_init_clmul,.-gcm_init_clmul
+.globl gcm_gmult_clmul
+.hidden gcm_gmult_clmul
+.type gcm_gmult_clmul,@function
+.align 16
+gcm_gmult_clmul:
+.cfi_startproc
+.L_gmult_clmul:
+ movdqu (%rdi),%xmm0
+ movdqa .Lbswap_mask(%rip),%xmm5
+ movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm4
+.byte 102,15,56,0,197
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size gcm_gmult_clmul,.-gcm_gmult_clmul
+.globl gcm_ghash_clmul
+.hidden gcm_ghash_clmul
+.type gcm_ghash_clmul,@function
+.align 32
+gcm_ghash_clmul:
+.cfi_startproc
+.L_ghash_clmul:
+ movdqa .Lbswap_mask(%rip),%xmm10
+
+ movdqu (%rdi),%xmm0
+ movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm7
+.byte 102,65,15,56,0,194
+
+ subq $0x10,%rcx
+ jz .Lodd_tail
+
+ movdqu 16(%rsi),%xmm6
+ leaq OPENSSL_ia32cap_P(%rip),%rax
+ movl 4(%rax),%eax
+ cmpq $0x30,%rcx
+ jb .Lskip4x
+
+ andl $71303168,%eax
+ cmpl $4194304,%eax
+ je .Lskip4x
+
+ subq $0x30,%rcx
+ movq $0xA040608020C0E000,%rax
+ movdqu 48(%rsi),%xmm14
+ movdqu 64(%rsi),%xmm15
+
+
+
+
+ movdqu 48(%rdx),%xmm3
+ movdqu 32(%rdx),%xmm11
+.byte 102,65,15,56,0,218
+.byte 102,69,15,56,0,218
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm11,%xmm12
+.byte 102,68,15,58,68,222,0
+.byte 102,68,15,58,68,238,17
+.byte 102,68,15,58,68,231,16
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+ xorps %xmm12,%xmm4
+
+ movdqu 16(%rdx),%xmm11
+ movdqu 0(%rdx),%xmm8
+.byte 102,69,15,56,0,218
+.byte 102,69,15,56,0,194
+ movdqa %xmm11,%xmm13
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm8,%xmm0
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+.byte 102,69,15,58,68,238,17
+.byte 102,68,15,58,68,231,0
+ xorps %xmm11,%xmm3
+ xorps %xmm13,%xmm5
+
+ leaq 64(%rdx),%rdx
+ subq $0x40,%rcx
+ jc .Ltail4x
+
+ jmp .Lmod4_loop
+.align 32
+.Lmod4_loop:
+.byte 102,65,15,58,68,199,0
+ xorps %xmm12,%xmm4
+ movdqu 48(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,65,15,58,68,207,17
+ xorps %xmm3,%xmm0
+ movdqu 32(%rdx),%xmm3
+ movdqa %xmm11,%xmm13
+.byte 102,68,15,58,68,199,16
+ pshufd $78,%xmm11,%xmm12
+ xorps %xmm5,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,65,15,56,0,218
+ movups 32(%rsi),%xmm7
+ xorps %xmm4,%xmm8
+.byte 102,68,15,58,68,218,0
+ pshufd $78,%xmm3,%xmm4
+
+ pxor %xmm0,%xmm8
+ movdqa %xmm3,%xmm5
+ pxor %xmm1,%xmm8
+ pxor %xmm3,%xmm4
+ movdqa %xmm8,%xmm9
+.byte 102,68,15,58,68,234,17
+ pslldq $8,%xmm8
+ psrldq $8,%xmm9
+ pxor %xmm8,%xmm0
+ movdqa .L7_mask(%rip),%xmm8
+ pxor %xmm9,%xmm1
+.byte 102,76,15,110,200
+
+ pand %xmm0,%xmm8
+.byte 102,69,15,56,0,200
+ pxor %xmm0,%xmm9
+.byte 102,68,15,58,68,231,0
+ psllq $57,%xmm9
+ movdqa %xmm9,%xmm8
+ pslldq $8,%xmm9
+.byte 102,15,58,68,222,0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ movdqu 0(%rdx),%xmm8
+
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+.byte 102,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ movdqu 16(%rdx),%xmm11
+.byte 102,69,15,56,0,218
+.byte 102,15,58,68,231,16
+ xorps %xmm13,%xmm5
+ movups 80(%rsi),%xmm7
+.byte 102,69,15,56,0,194
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+
+ movdqa %xmm11,%xmm13
+ pxor %xmm12,%xmm4
+ pshufd $78,%xmm11,%xmm12
+ pxor %xmm9,%xmm0
+ pxor %xmm8,%xmm1
+ pxor %xmm11,%xmm12
+.byte 102,69,15,58,68,222,0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ movdqa %xmm0,%xmm1
+.byte 102,69,15,58,68,238,17
+ xorps %xmm11,%xmm3
+ pshufd $78,%xmm0,%xmm8
+ pxor %xmm0,%xmm8
+
+.byte 102,68,15,58,68,231,0
+ xorps %xmm13,%xmm5
+
+ leaq 64(%rdx),%rdx
+ subq $0x40,%rcx
+ jnc .Lmod4_loop
+
+.Ltail4x:
+.byte 102,65,15,58,68,199,0
+.byte 102,65,15,58,68,207,17
+.byte 102,68,15,58,68,199,16
+ xorps %xmm12,%xmm4
+ xorps %xmm3,%xmm0
+ xorps %xmm5,%xmm1
+ pxor %xmm0,%xmm1
+ pxor %xmm4,%xmm8
+
+ pxor %xmm1,%xmm8
+ pxor %xmm0,%xmm1
+
+ movdqa %xmm8,%xmm9
+ psrldq $8,%xmm8
+ pslldq $8,%xmm9
+ pxor %xmm8,%xmm1
+ pxor %xmm9,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ addq $0x40,%rcx
+ jz .Ldone
+ movdqu 32(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Lodd_tail
+.Lskip4x:
+
+
+
+
+
+ movdqu (%rdx),%xmm8
+ movdqu 16(%rdx),%xmm3
+.byte 102,69,15,56,0,194
+.byte 102,65,15,56,0,218
+ pxor %xmm8,%xmm0
+
+ movdqa %xmm3,%xmm5
+ pshufd $78,%xmm3,%xmm4
+ pxor %xmm3,%xmm4
+.byte 102,15,58,68,218,0
+.byte 102,15,58,68,234,17
+.byte 102,15,58,68,231,0
+
+ leaq 32(%rdx),%rdx
+ nop
+ subq $0x20,%rcx
+ jbe .Leven_tail
+ nop
+ jmp .Lmod_loop
+
+.align 32
+.Lmod_loop:
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ movdqu (%rdx),%xmm9
+ pxor %xmm0,%xmm8
+.byte 102,69,15,56,0,202
+ movdqu 16(%rdx),%xmm3
+
+ pxor %xmm1,%xmm8
+ pxor %xmm9,%xmm1
+ pxor %xmm8,%xmm4
+.byte 102,65,15,56,0,218
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
+ pslldq $8,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm3,%xmm5
+
+ movdqa %xmm0,%xmm9
+ movdqa %xmm0,%xmm8
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm8
+.byte 102,15,58,68,218,0
+ psllq $1,%xmm0
+ pxor %xmm8,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm8
+ pslldq $8,%xmm0
+ psrldq $8,%xmm8
+ pxor %xmm9,%xmm0
+ pshufd $78,%xmm5,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm5,%xmm4
+
+ movdqa %xmm0,%xmm9
+ psrlq $1,%xmm0
+.byte 102,15,58,68,234,17
+ pxor %xmm9,%xmm1
+ pxor %xmm0,%xmm9
+ psrlq $5,%xmm0
+ pxor %xmm9,%xmm0
+ leaq 32(%rdx),%rdx
+ psrlq $1,%xmm0
+.byte 102,15,58,68,231,0
+ pxor %xmm1,%xmm0
+
+ subq $0x20,%rcx
+ ja .Lmod_loop
+
+.Leven_tail:
+ movdqa %xmm0,%xmm1
+ movdqa %xmm4,%xmm8
+ pshufd $78,%xmm0,%xmm4
+ pxor %xmm0,%xmm4
+
+.byte 102,15,58,68,198,0
+.byte 102,15,58,68,206,17
+.byte 102,15,58,68,231,16
+
+ pxor %xmm3,%xmm0
+ pxor %xmm5,%xmm1
+ pxor %xmm0,%xmm8
+ pxor %xmm1,%xmm8
+ pxor %xmm8,%xmm4
+ movdqa %xmm4,%xmm8
+ psrldq $8,%xmm8
+ pslldq $8,%xmm4
+ pxor %xmm8,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+ testq %rcx,%rcx
+ jnz .Ldone
+
+.Lodd_tail:
+ movdqu (%rdx),%xmm8
+.byte 102,69,15,56,0,194
+ pxor %xmm8,%xmm0
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,223,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.Ldone:
+.byte 102,65,15,56,0,194
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size gcm_ghash_clmul,.-gcm_ghash_clmul
+.globl gcm_init_avx
+.hidden gcm_init_avx
+.type gcm_init_avx,@function
+.align 32
+gcm_init_avx:
+.cfi_startproc
+ vzeroupper
+
+ vmovdqu (%rsi),%xmm2
+ vpshufd $78,%xmm2,%xmm2
+
+
+ vpshufd $255,%xmm2,%xmm4
+ vpsrlq $63,%xmm2,%xmm3
+ vpsllq $1,%xmm2,%xmm2
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5
+ vpslldq $8,%xmm3,%xmm3
+ vpor %xmm3,%xmm2,%xmm2
+
+
+ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6
+ vmovdqa %xmm2,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10
+ jmp .Linit_start_avx
+.align 32
+.Linit_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+ vmovdqa %xmm0,%xmm5
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqu %xmm5,0(%rdi)
+ vpxor %xmm0,%xmm4,%xmm4
+ vmovdqu %xmm0,16(%rdi)
+ leaq 48(%rdi),%rdi
+ subq $1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size gcm_init_avx,.-gcm_init_avx
+.globl gcm_gmult_avx
+.hidden gcm_gmult_avx
+.type gcm_gmult_avx,@function
+.align 32
+gcm_gmult_avx:
+.cfi_startproc
+ jmp .L_gmult_clmul
+.cfi_endproc
+.size gcm_gmult_avx,.-gcm_gmult_avx
+.globl gcm_ghash_avx
+.hidden gcm_ghash_avx
+.type gcm_ghash_avx,@function
+.align 32
+gcm_ghash_avx:
+.cfi_startproc
+ vzeroupper
+
+ vmovdqu (%rdi),%xmm10
+ leaq .L0x1c2_polynomial(%rip),%r10
+ leaq 64(%rsi),%rsi
+ vmovdqu .Lbswap_mask(%rip),%xmm13
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%rcx
+ jb .Lshort_avx
+ subq $0x80,%rcx
+
+ vmovdqu 112(%rdx),%xmm14
+ vmovdqu 0-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rsi),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%rdx),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rsi),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%rdx),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%rdx),%rdx
+ cmpq $0x80,%rcx
+ jb .Ltail_avx
+
+ vpxor %xmm10,%xmm15,%xmm15
+ subq $0x80,%rcx
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%rdx),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%rdx),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%rdx),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%rdx),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%rdx),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%rdx),%rdx
+ subq $0x80,%rcx
+ jnc .Loop8x_avx
+
+ addq $0x80,%rcx
+ jmp .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+ vmovdqu -16(%rdx,%rcx,1),%xmm14
+ leaq (%rdx,%rcx,1),%rdx
+ vmovdqu 0-64(%rsi),%xmm6
+ vmovdqu 32-64(%rsi),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor %xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12
+
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%rcx
+ jne .Lshort_avx
+
+ vpshufb %xmm13,%xmm10,%xmm10
+ vmovdqu %xmm10,(%rdi)
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size gcm_ghash_avx,.-gcm_ghash_avx
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long 7,0,7,0
+.align 64
+
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S
@@ -1,0 +1,702 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.align 16
+
+.globl md5_block_asm_data_order
+.hidden md5_block_asm_data_order
+.type md5_block_asm_data_order,@function
+md5_block_asm_data_order:
+.cfi_startproc
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset r12,-32
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset r14,-40
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset r15,-48
+.Lprologue:
+
+
+
+
+ movq %rdi,%rbp
+ shlq $6,%rdx
+ leaq (%rsi,%rdx,1),%rdi
+ movl 0(%rbp),%eax
+ movl 4(%rbp),%ebx
+ movl 8(%rbp),%ecx
+ movl 12(%rbp),%edx
+
+
+
+
+
+
+
+ cmpq %rdi,%rsi
+ je .Lend
+
+
+.Lloop:
+ movl %eax,%r8d
+ movl %ebx,%r9d
+ movl %ecx,%r14d
+ movl %edx,%r15d
+ movl 0(%rsi),%r10d
+ movl %edx,%r11d
+ xorl %ecx,%r11d
+ leal -680876936(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 4(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -389564586(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 8(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal 606105819(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 12(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -1044525330(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 16(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal -176418897(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 20(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal 1200080426(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 24(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -1473231341(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 28(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -45705983(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 32(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal 1770035416(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 36(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -1958414417(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 40(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -42063(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 44(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal -1990404162(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 48(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ xorl %ecx,%r11d
+ leal 1804603682(%rax,%r10,1),%eax
+ andl %ebx,%r11d
+ xorl %edx,%r11d
+ movl 52(%rsi),%r10d
+ addl %r11d,%eax
+ roll $7,%eax
+ movl %ecx,%r11d
+ addl %ebx,%eax
+ xorl %ebx,%r11d
+ leal -40341101(%rdx,%r10,1),%edx
+ andl %eax,%r11d
+ xorl %ecx,%r11d
+ movl 56(%rsi),%r10d
+ addl %r11d,%edx
+ roll $12,%edx
+ movl %ebx,%r11d
+ addl %eax,%edx
+ xorl %eax,%r11d
+ leal -1502002290(%rcx,%r10,1),%ecx
+ andl %edx,%r11d
+ xorl %ebx,%r11d
+ movl 60(%rsi),%r10d
+ addl %r11d,%ecx
+ roll $17,%ecx
+ movl %eax,%r11d
+ addl %edx,%ecx
+ xorl %edx,%r11d
+ leal 1236535329(%rbx,%r10,1),%ebx
+ andl %ecx,%r11d
+ xorl %eax,%r11d
+ movl 0(%rsi),%r10d
+ addl %r11d,%ebx
+ roll $22,%ebx
+ movl %edx,%r11d
+ addl %ecx,%ebx
+ movl 4(%rsi),%r10d
+ movl %edx,%r11d
+ movl %edx,%r12d
+ notl %r11d
+ leal -165796510(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 24(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -1069501632(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 44(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal 643717713(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 0(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -373897302(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 20(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal -701558691(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 40(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal 38016083(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 60(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal -660478335(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 16(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -405537848(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 36(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal 568446438(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 56(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -1019803690(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 12(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal -187363961(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 32(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal 1163531501(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 52(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ notl %r11d
+ leal -1444681467(%rax,%r10,1),%eax
+ andl %ebx,%r12d
+ andl %ecx,%r11d
+ movl 8(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ecx,%r11d
+ addl %r12d,%eax
+ movl %ecx,%r12d
+ roll $5,%eax
+ addl %ebx,%eax
+ notl %r11d
+ leal -51403784(%rdx,%r10,1),%edx
+ andl %eax,%r12d
+ andl %ebx,%r11d
+ movl 28(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %ebx,%r11d
+ addl %r12d,%edx
+ movl %ebx,%r12d
+ roll $9,%edx
+ addl %eax,%edx
+ notl %r11d
+ leal 1735328473(%rcx,%r10,1),%ecx
+ andl %edx,%r12d
+ andl %eax,%r11d
+ movl 48(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %eax,%r11d
+ addl %r12d,%ecx
+ movl %eax,%r12d
+ roll $14,%ecx
+ addl %edx,%ecx
+ notl %r11d
+ leal -1926607734(%rbx,%r10,1),%ebx
+ andl %ecx,%r12d
+ andl %edx,%r11d
+ movl 0(%rsi),%r10d
+ orl %r11d,%r12d
+ movl %edx,%r11d
+ addl %r12d,%ebx
+ movl %edx,%r12d
+ roll $20,%ebx
+ addl %ecx,%ebx
+ movl 20(%rsi),%r10d
+ movl %ecx,%r11d
+ leal -378558(%rax,%r10,1),%eax
+ movl 32(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -2022574463(%rdx,%r10,1),%edx
+ movl 44(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal 1839030562(%rcx,%r10,1),%ecx
+ movl 56(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -35309556(%rbx,%r10,1),%ebx
+ movl 4(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal -1530992060(%rax,%r10,1),%eax
+ movl 16(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal 1272893353(%rdx,%r10,1),%edx
+ movl 28(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal -155497632(%rcx,%r10,1),%ecx
+ movl 40(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -1094730640(%rbx,%r10,1),%ebx
+ movl 52(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal 681279174(%rax,%r10,1),%eax
+ movl 0(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -358537222(%rdx,%r10,1),%edx
+ movl 12(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal -722521979(%rcx,%r10,1),%ecx
+ movl 24(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal 76029189(%rbx,%r10,1),%ebx
+ movl 36(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ leal -640364487(%rax,%r10,1),%eax
+ movl 48(%rsi),%r10d
+ xorl %edx,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%eax
+ roll $4,%eax
+ movl %ebx,%r11d
+ addl %ebx,%eax
+ leal -421815835(%rdx,%r10,1),%edx
+ movl 60(%rsi),%r10d
+ xorl %ecx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%edx
+ roll $11,%edx
+ movl %eax,%r11d
+ addl %eax,%edx
+ leal 530742520(%rcx,%r10,1),%ecx
+ movl 8(%rsi),%r10d
+ xorl %ebx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ecx
+ roll $16,%ecx
+ movl %edx,%r11d
+ addl %edx,%ecx
+ leal -995338651(%rbx,%r10,1),%ebx
+ movl 0(%rsi),%r10d
+ xorl %eax,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%ebx
+ roll $23,%ebx
+ movl %ecx,%r11d
+ addl %ecx,%ebx
+ movl 0(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ xorl %edx,%r11d
+ leal -198630844(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 28(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal 1126891415(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 56(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1416354905(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 20(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -57434055(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 48(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal 1700485571(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 12(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -1894986606(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 40(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1051523(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 4(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -2054922799(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 32(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal 1873313359(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 60(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -30611744(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 24(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal -1560198380(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 52(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal 1309151649(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 16(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+ leal -145523070(%rax,%r10,1),%eax
+ orl %ebx,%r11d
+ xorl %ecx,%r11d
+ addl %r11d,%eax
+ movl 44(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $6,%eax
+ xorl %ecx,%r11d
+ addl %ebx,%eax
+ leal -1120210379(%rdx,%r10,1),%edx
+ orl %eax,%r11d
+ xorl %ebx,%r11d
+ addl %r11d,%edx
+ movl 8(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $10,%edx
+ xorl %ebx,%r11d
+ addl %eax,%edx
+ leal 718787259(%rcx,%r10,1),%ecx
+ orl %edx,%r11d
+ xorl %eax,%r11d
+ addl %r11d,%ecx
+ movl 36(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $15,%ecx
+ xorl %eax,%r11d
+ addl %edx,%ecx
+ leal -343485551(%rbx,%r10,1),%ebx
+ orl %ecx,%r11d
+ xorl %edx,%r11d
+ addl %r11d,%ebx
+ movl 0(%rsi),%r10d
+ movl $0xffffffff,%r11d
+ roll $21,%ebx
+ xorl %edx,%r11d
+ addl %ecx,%ebx
+
+ addl %r8d,%eax
+ addl %r9d,%ebx
+ addl %r14d,%ecx
+ addl %r15d,%edx
+
+
+ addq $64,%rsi
+ cmpq %rdi,%rsi
+ jb .Lloop
+
+
+.Lend:
+ movl %eax,0(%rbp)
+ movl %ebx,4(%rbp)
+ movl %ecx,8(%rbp)
+ movl %edx,12(%rbp)
+
+ movq (%rsp),%r15
+.cfi_restore r15
+ movq 8(%rsp),%r14
+.cfi_restore r14
+ movq 16(%rsp),%r12
+.cfi_restore r12
+ movq 24(%rsp),%rbx
+.cfi_restore rbx
+ movq 32(%rsp),%rbp
+.cfi_restore rbp
+ addq $40,%rsp
+.cfi_adjust_cfa_offset -40
+.Lepilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size md5_block_asm_data_order,.-md5_block_asm_data_order
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S
@@ -1,0 +1,4543 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+
+.align 64
+.Lpoly:
+.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
+
+.LOne:
+.long 1,1,1,1,1,1,1,1
+.LTwo:
+.long 2,2,2,2,2,2,2,2
+.LThree:
+.long 3,3,3,3,3,3,3,3
+.LONE_mont:
+.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+
+.Lord:
+.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
+
+
+
+.globl ecp_nistz256_neg
+.hidden ecp_nistz256_neg
+.type ecp_nistz256_neg,@function
+.align 32
+ecp_nistz256_neg:
+.cfi_startproc
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-24
+.Lneg_body:
+
+ xorq %r8,%r8
+ xorq %r9,%r9
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r13,%r13
+
+ subq 0(%rsi),%r8
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r8,%rax
+ sbbq 24(%rsi),%r11
+ leaq .Lpoly(%rip),%rsi
+ movq %r9,%rdx
+ sbbq $0,%r13
+
+ addq 0(%rsi),%r8
+ movq %r10,%rcx
+ adcq 8(%rsi),%r9
+ adcq 16(%rsi),%r10
+ movq %r11,%r12
+ adcq 24(%rsi),%r11
+ testq %r13,%r13
+
+ cmovzq %rax,%r8
+ cmovzq %rdx,%r9
+ movq %r8,0(%rdi)
+ cmovzq %rcx,%r10
+ movq %r9,8(%rdi)
+ cmovzq %r12,%r11
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+
+ movq 0(%rsp),%r13
+.cfi_restore %r13
+ movq 8(%rsp),%r12
+.cfi_restore %r12
+ leaq 16(%rsp),%rsp
+.cfi_adjust_cfa_offset -16
+.Lneg_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_neg,.-ecp_nistz256_neg
+
+
+
+
+
+
+.globl ecp_nistz256_ord_mul_mont
+.hidden ecp_nistz256_ord_mul_mont
+.type ecp_nistz256_ord_mul_mont,@function
+.align 32
+ecp_nistz256_ord_mul_mont:
+.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lecp_nistz256_ord_mul_montx
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_mul_body:
+
+ movq 0(%rdx),%rax
+ movq %rdx,%rbx
+ leaq .Lord(%rip),%r14
+ movq .LordK(%rip),%r15
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ movq %rax,%r8
+ movq %rcx,%rax
+ movq %rdx,%r9
+
+ mulq 8(%rsi)
+ addq %rax,%r9
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq 16(%rsi)
+ addq %rax,%r10
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r8,%r13
+ imulq %r15,%r8
+
+ movq %rdx,%r11
+ mulq 24(%rsi)
+ addq %rax,%r11
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%r12
+
+
+ mulq 0(%r14)
+ movq %r8,%rbp
+ addq %rax,%r13
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ subq %r8,%r10
+ sbbq $0,%r8
+
+ mulq 8(%r14)
+ addq %rcx,%r9
+ adcq $0,%rdx
+ addq %rax,%r9
+ movq %rbp,%rax
+ adcq %rdx,%r10
+ movq %rbp,%rdx
+ adcq $0,%r8
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r11
+ movq 8(%rbx),%rax
+ sbbq %rdx,%rbp
+
+ addq %r8,%r11
+ adcq %rbp,%r12
+ adcq $0,%r13
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ addq %rax,%r9
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 8(%rsi)
+ addq %rbp,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 16(%rsi)
+ addq %rbp,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r9,%rcx
+ imulq %r15,%r9
+
+ movq %rdx,%rbp
+ mulq 24(%rsi)
+ addq %rbp,%r12
+ adcq $0,%rdx
+ xorq %r8,%r8
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ adcq $0,%r8
+
+
+ mulq 0(%r14)
+ movq %r9,%rbp
+ addq %rax,%rcx
+ movq %r9,%rax
+ adcq %rdx,%rcx
+
+ subq %r9,%r11
+ sbbq $0,%r9
+
+ mulq 8(%r14)
+ addq %rcx,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %rbp,%rax
+ adcq %rdx,%r11
+ movq %rbp,%rdx
+ adcq $0,%r9
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r12
+ movq 16(%rbx),%rax
+ sbbq %rdx,%rbp
+
+ addq %r9,%r12
+ adcq %rbp,%r13
+ adcq $0,%r8
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ addq %rax,%r10
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 8(%rsi)
+ addq %rbp,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 16(%rsi)
+ addq %rbp,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r10,%rcx
+ imulq %r15,%r10
+
+ movq %rdx,%rbp
+ mulq 24(%rsi)
+ addq %rbp,%r13
+ adcq $0,%rdx
+ xorq %r9,%r9
+ addq %rax,%r13
+ movq %r10,%rax
+ adcq %rdx,%r8
+ adcq $0,%r9
+
+
+ mulq 0(%r14)
+ movq %r10,%rbp
+ addq %rax,%rcx
+ movq %r10,%rax
+ adcq %rdx,%rcx
+
+ subq %r10,%r12
+ sbbq $0,%r10
+
+ mulq 8(%r14)
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rbp,%rax
+ adcq %rdx,%r12
+ movq %rbp,%rdx
+ adcq $0,%r10
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r13
+ movq 24(%rbx),%rax
+ sbbq %rdx,%rbp
+
+ addq %r10,%r13
+ adcq %rbp,%r8
+ adcq $0,%r9
+
+
+ movq %rax,%rcx
+ mulq 0(%rsi)
+ addq %rax,%r11
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 8(%rsi)
+ addq %rbp,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rcx,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq 16(%rsi)
+ addq %rbp,%r13
+ adcq $0,%rdx
+ addq %rax,%r13
+ movq %rcx,%rax
+ adcq $0,%rdx
+
+ movq %r11,%rcx
+ imulq %r15,%r11
+
+ movq %rdx,%rbp
+ mulq 24(%rsi)
+ addq %rbp,%r8
+ adcq $0,%rdx
+ xorq %r10,%r10
+ addq %rax,%r8
+ movq %r11,%rax
+ adcq %rdx,%r9
+ adcq $0,%r10
+
+
+ mulq 0(%r14)
+ movq %r11,%rbp
+ addq %rax,%rcx
+ movq %r11,%rax
+ adcq %rdx,%rcx
+
+ subq %r11,%r13
+ sbbq $0,%r11
+
+ mulq 8(%r14)
+ addq %rcx,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rbp,%rax
+ adcq %rdx,%r13
+ movq %rbp,%rdx
+ adcq $0,%r11
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r8
+ sbbq %rdx,%rbp
+
+ addq %r11,%r8
+ adcq %rbp,%r9
+ adcq $0,%r10
+
+
+ movq %r12,%rsi
+ subq 0(%r14),%r12
+ movq %r13,%r11
+ sbbq 8(%r14),%r13
+ movq %r8,%rcx
+ sbbq 16(%r14),%r8
+ movq %r9,%rbp
+ sbbq 24(%r14),%r9
+ sbbq $0,%r10
+
+ cmovcq %rsi,%r12
+ cmovcq %r11,%r13
+ cmovcq %rcx,%r8
+ cmovcq %rbp,%r9
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_mul_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+
+
+
+
+
+
+
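+# ecp_nistz256_ord_sqr_mont(res, a, rep): rep successive Montgomery squarings
+# modulo the P-256 group order n; the repeat count arrives in %rdx.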
+.globl ecp_nistz256_ord_sqr_mont
+.hidden ecp_nistz256_ord_sqr_mont
+.type ecp_nistz256_ord_sqr_mont,@function
+.align 32
+ecp_nistz256_ord_sqr_mont:
+.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lecp_nistz256_ord_sqr_montx
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_sqr_body:
+
+ movq 0(%rsi),%r8
+ movq 8(%rsi),%rax
+ movq 16(%rsi),%r14
+ movq 24(%rsi),%r15
+ leaq .Lord(%rip),%rsi
+ movq %rdx,%rbx
+ jmp .Loop_ord_sqr
+
+.align 32
+.Loop_ord_sqr:
+
+ movq %rax,%rbp
+ mulq %r8
+ movq %rax,%r9
+.byte 102,72,15,110,205
+ movq %r14,%rax
+ movq %rdx,%r10
+
+ mulq %r8
+ addq %rax,%r10
+ movq %r15,%rax
+.byte 102,73,15,110,214
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %r8
+ addq %rax,%r11
+ movq %r15,%rax
+.byte 102,73,15,110,223
+ adcq $0,%rdx
+ movq %rdx,%r12
+
+
+ mulq %r14
+ movq %rax,%r13
+ movq %r14,%rax
+ movq %rdx,%r14
+
+
+ mulq %rbp
+ addq %rax,%r11
+ movq %r15,%rax
+ adcq $0,%rdx
+ movq %rdx,%r15
+
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+
+ addq %r15,%r12
+ adcq %rdx,%r13
+ adcq $0,%r14
+
+
+ xorq %r15,%r15
+ movq %r8,%rax
+ addq %r9,%r9
+ adcq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq $0,%r15
+
+
+ mulq %rax
+ movq %rax,%r8
+.byte 102,72,15,126,200
+ movq %rdx,%rbp
+
+ mulq %rax
+ addq %rbp,%r9
+ adcq %rax,%r10
+.byte 102,72,15,126,208
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq %rax
+ addq %rbp,%r11
+ adcq %rax,%r12
+.byte 102,72,15,126,216
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ movq %r8,%rcx
+ imulq 32(%rsi),%r8
+
+ mulq %rax
+ addq %rbp,%r13
+ adcq %rax,%r14
+ movq 0(%rsi),%rax
+ adcq %rdx,%r15
+
+
+ mulq %r8
+ movq %r8,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r8,%r10
+ sbbq $0,%rbp
+
+ mulq %r8
+ addq %rcx,%r9
+ adcq $0,%rdx
+ addq %rax,%r9
+ movq %r8,%rax
+ adcq %rdx,%r10
+ movq %r8,%rdx
+ adcq $0,%rbp
+
+ movq %r9,%rcx
+ imulq 32(%rsi),%r9
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r11
+ movq 0(%rsi),%rax
+ sbbq %rdx,%r8
+
+ addq %rbp,%r11
+ adcq $0,%r8
+
+
+ mulq %r9
+ movq %r9,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r9,%r11
+ sbbq $0,%rbp
+
+ mulq %r9
+ addq %rcx,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %r9,%rax
+ adcq %rdx,%r11
+ movq %r9,%rdx
+ adcq $0,%rbp
+
+ movq %r10,%rcx
+ imulq 32(%rsi),%r10
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r8
+ movq 0(%rsi),%rax
+ sbbq %rdx,%r9
+
+ addq %rbp,%r8
+ adcq $0,%r9
+
+
+ mulq %r10
+ movq %r10,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r10,%r8
+ sbbq $0,%rbp
+
+ mulq %r10
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %r10,%rax
+ adcq %rdx,%r8
+ movq %r10,%rdx
+ adcq $0,%rbp
+
+ movq %r11,%rcx
+ imulq 32(%rsi),%r11
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r9
+ movq 0(%rsi),%rax
+ sbbq %rdx,%r10
+
+ addq %rbp,%r9
+ adcq $0,%r10
+
+
+ mulq %r11
+ movq %r11,%rbp
+ addq %rax,%rcx
+ movq 8(%rsi),%rax
+ adcq %rdx,%rcx
+
+ subq %r11,%r9
+ sbbq $0,%rbp
+
+ mulq %r11
+ addq %rcx,%r8
+ adcq $0,%rdx
+ addq %rax,%r8
+ movq %r11,%rax
+ adcq %rdx,%r9
+ movq %r11,%rdx
+ adcq $0,%rbp
+
+ shlq $32,%rax
+ shrq $32,%rdx
+ subq %rax,%r10
+ sbbq %rdx,%r11
+
+ addq %rbp,%r10
+ adcq $0,%r11
+
+
+ xorq %rdx,%rdx
+ addq %r12,%r8
+ adcq %r13,%r9
+ movq %r8,%r12
+ adcq %r14,%r10
+ adcq %r15,%r11
+ movq %r9,%rax
+ adcq $0,%rdx
+
+
+ subq 0(%rsi),%r8
+ movq %r10,%r14
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r11,%r15
+ sbbq 24(%rsi),%r11
+ sbbq $0,%rdx
+
+ cmovcq %r12,%r8
+ cmovncq %r9,%rax
+ cmovncq %r10,%r14
+ cmovncq %r11,%r15
+
+ decq %rbx
+ jnz .Loop_ord_sqr
+
+ movq %r8,0(%rdi)
+ movq %rax,8(%rdi)
+ pxor %xmm1,%xmm1
+ movq %r14,16(%rdi)
+ pxor %xmm2,%xmm2
+ movq %r15,24(%rdi)
+ pxor %xmm3,%xmm3
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_sqr_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+
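+# mulx/adcx/adox (BMI2+ADX) variant of ecp_nistz256_ord_mul_mont, reached via
+# the OPENSSL_ia32cap_P dispatch at the top of that function.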
+.type ecp_nistz256_ord_mul_montx,@function
+.align 32
+ecp_nistz256_ord_mul_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_mul_montx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_mulx_body:
+
+ movq %rdx,%rbx
+ movq 0(%rdx),%rdx
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+ leaq -128(%rsi),%rsi
+ leaq .Lord-128(%rip),%r14
+ movq .LordK(%rip),%r15
+
+
+ mulxq %r9,%r8,%r9
+ mulxq %r10,%rcx,%r10
+ mulxq %r11,%rbp,%r11
+ addq %rcx,%r9
+ mulxq %r12,%rcx,%r12
+ movq %r8,%rdx
+ mulxq %r15,%rdx,%rax
+ adcq %rbp,%r10
+ adcq %rcx,%r11
+ adcq $0,%r12
+
+
+ xorq %r13,%r13
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 8(%rbx),%rdx
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+ adcxq %r8,%r12
+ adoxq %r8,%r13
+ adcq $0,%r13
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r9,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ adcxq %r8,%r13
+ adoxq %r8,%r8
+ adcq $0,%r8
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 16(%rbx),%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcxq %r9,%r13
+ adoxq %r9,%r8
+ adcq $0,%r8
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r10,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ adcxq %r9,%r8
+ adoxq %r9,%r9
+ adcq $0,%r9
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ movq 24(%rbx),%rdx
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+ adcxq %r10,%r8
+ adoxq %r10,%r9
+ adcq $0,%r9
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r11,%rdx
+ mulxq %r15,%rdx,%rax
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+
+ adcxq %r10,%r9
+ adoxq %r10,%r10
+ adcq $0,%r10
+
+
+ mulxq 0+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%r14),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%r14),%rcx,%rbp
+ leaq 128(%r14),%r14
+ movq %r12,%rbx
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ movq %r13,%rdx
+ adcxq %r11,%r9
+ adoxq %r11,%r10
+ adcq $0,%r10
+
+
+
+ movq %r8,%rcx
+ subq 0(%r14),%r12
+ sbbq 8(%r14),%r13
+ sbbq 16(%r14),%r8
+ movq %r9,%rbp
+ sbbq 24(%r14),%r9
+ sbbq $0,%r10
+
+ cmovcq %rbx,%r12
+ cmovcq %rdx,%r13
+ cmovcq %rcx,%r8
+ cmovcq %rbp,%r9
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_mulx_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+
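+# mulx/adcx/adox variant of ecp_nistz256_ord_sqr_mont.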
+.type ecp_nistz256_ord_sqr_montx,@function
+.align 32
+ecp_nistz256_ord_sqr_montx:
+.cfi_startproc
+.Lecp_nistz256_ord_sqr_montx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lord_sqrx_body:
+
+ movq %rdx,%rbx
+ movq 0(%rsi),%rdx
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+ leaq .Lord(%rip),%rsi
+ jmp .Loop_ord_sqrx
+
+.align 32
+.Loop_ord_sqrx:
+ mulxq %r14,%r9,%r10
+ mulxq %r15,%rcx,%r11
+ movq %rdx,%rax
+.byte 102,73,15,110,206
+ mulxq %r8,%rbp,%r12
+ movq %r14,%rdx
+ addq %rcx,%r10
+.byte 102,73,15,110,215
+ adcq %rbp,%r11
+ adcq $0,%r12
+ xorq %r13,%r13
+
+ mulxq %r15,%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq %r8,%rcx,%rbp
+ movq %r15,%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcq $0,%r13
+
+ mulxq %r8,%rcx,%r14
+ movq %rax,%rdx
+.byte 102,73,15,110,216
+ xorq %r15,%r15
+ adcxq %r9,%r9
+ adoxq %rcx,%r13
+ adcxq %r10,%r10
+ adoxq %r15,%r14
+
+
+ mulxq %rdx,%r8,%rbp
+.byte 102,72,15,126,202
+ adcxq %r11,%r11
+ adoxq %rbp,%r9
+ adcxq %r12,%r12
+ mulxq %rdx,%rcx,%rax
+.byte 102,72,15,126,210
+ adcxq %r13,%r13
+ adoxq %rcx,%r10
+ adcxq %r14,%r14
+ mulxq %rdx,%rcx,%rbp
+.byte 0x67
+.byte 102,72,15,126,218
+ adoxq %rax,%r11
+ adcxq %r15,%r15
+ adoxq %rcx,%r12
+ adoxq %rbp,%r13
+ mulxq %rdx,%rcx,%rax
+ adoxq %rcx,%r14
+ adoxq %rax,%r15
+
+
+ movq %r8,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ xorq %rax,%rax
+ mulxq 0(%rsi),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ adcxq %rax,%r8
+
+
+ movq %r9,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ adoxq %rax,%r9
+
+
+ movq %r10,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+ mulxq 8(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r8
+ mulxq 16(%rsi),%rcx,%rbp
+ adcxq %rcx,%r8
+ adoxq %rbp,%r9
+ mulxq 24(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+ adcxq %rax,%r10
+
+
+ movq %r11,%rdx
+ mulxq 32(%rsi),%rdx,%rcx
+
+ mulxq 0(%rsi),%rcx,%rbp
+ adoxq %rcx,%r11
+ adcxq %rbp,%r8
+ mulxq 8(%rsi),%rcx,%rbp
+ adoxq %rcx,%r8
+ adcxq %rbp,%r9
+ mulxq 16(%rsi),%rcx,%rbp
+ adoxq %rcx,%r9
+ adcxq %rbp,%r10
+ mulxq 24(%rsi),%rcx,%rbp
+ adoxq %rcx,%r10
+ adcxq %rbp,%r11
+ adoxq %rax,%r11
+
+
+ addq %r8,%r12
+ adcq %r13,%r9
+ movq %r12,%rdx
+ adcq %r14,%r10
+ adcq %r15,%r11
+ movq %r9,%r14
+ adcq $0,%rax
+
+
+ subq 0(%rsi),%r12
+ movq %r10,%r15
+ sbbq 8(%rsi),%r9
+ sbbq 16(%rsi),%r10
+ movq %r11,%r8
+ sbbq 24(%rsi),%r11
+ sbbq $0,%rax
+
+ cmovncq %r12,%rdx
+ cmovncq %r9,%r14
+ cmovncq %r10,%r15
+ cmovncq %r11,%r8
+
+ decq %rbx
+ jnz .Loop_ord_sqrx
+
+ movq %rdx,0(%rdi)
+ movq %r14,8(%rdi)
+ pxor %xmm1,%xmm1
+ movq %r15,16(%rdi)
+ pxor %xmm2,%xmm2
+ movq %r8,24(%rdi)
+ pxor %xmm3,%xmm3
+
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lord_sqrx_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+
+
+
+
+
+
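+# ecp_nistz256_mul_mont(res, a, b): Montgomery multiplication modulo the P-256
+# field prime p, res = a*b*2^-256 mod p; branches to the mulx worker when the
+# OPENSSL_ia32cap_P bits (BMI2+ADX) allow it.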
+.globl ecp_nistz256_mul_mont
+.hidden ecp_nistz256_mul_mont
+.type ecp_nistz256_mul_mont,@function
+.align 32
+ecp_nistz256_mul_mont:
+.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+.Lmul_mont:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lmul_body:
+ cmpl $0x80100,%ecx
+ je .Lmul_montx
+ movq %rdx,%rbx
+ movq 0(%rdx),%rax
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+
+ call __ecp_nistz256_mul_montq
+ jmp .Lmul_mont_done
+
+.align 32
+.Lmul_montx:
+ movq %rdx,%rbx
+ movq 0(%rdx),%rdx
+ movq 0(%rsi),%r9
+ movq 8(%rsi),%r10
+ movq 16(%rsi),%r11
+ movq 24(%rsi),%r12
+ leaq -128(%rsi),%rsi
+
+ call __ecp_nistz256_mul_montx
+.Lmul_mont_done:
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
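+# Internal worker: mulq-based Montgomery multiplication mod p. Callers preload
+# b[0] in %rax, a[0..3] in %r9..%r12 and the pointer to b in %rbx.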
+.type __ecp_nistz256_mul_montq,@function
+.align 32
+__ecp_nistz256_mul_montq:
+.cfi_startproc
+
+
+ movq %rax,%rbp
+ mulq %r9
+ movq .Lpoly+8(%rip),%r14
+ movq %rax,%r8
+ movq %rbp,%rax
+ movq %rdx,%r9
+
+ mulq %r10
+ movq .Lpoly+24(%rip),%r15
+ addq %rax,%r9
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %r11
+ addq %rax,%r10
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %r12
+ addq %rax,%r11
+ movq %r8,%rax
+ adcq $0,%rdx
+ xorq %r13,%r13
+ movq %rdx,%r12
+
+
+
+
+
+
+
+
+
+
+ movq %r8,%rbp
+ shlq $32,%r8
+ mulq %r15
+ shrq $32,%rbp
+ addq %r8,%r9
+ adcq %rbp,%r10
+ adcq %rax,%r11
+ movq 8(%rbx),%rax
+ adcq %rdx,%r12
+ adcq $0,%r13
+ xorq %r8,%r8
+
+
+
+ movq %rax,%rbp
+ mulq 0(%rsi)
+ addq %rax,%r9
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 8(%rsi)
+ addq %rcx,%r10
+ adcq $0,%rdx
+ addq %rax,%r10
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 16(%rsi)
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 24(%rsi)
+ addq %rcx,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %r9,%rax
+ adcq %rdx,%r13
+ adcq $0,%r8
+
+
+
+ movq %r9,%rbp
+ shlq $32,%r9
+ mulq %r15
+ shrq $32,%rbp
+ addq %r9,%r10
+ adcq %rbp,%r11
+ adcq %rax,%r12
+ movq 16(%rbx),%rax
+ adcq %rdx,%r13
+ adcq $0,%r8
+ xorq %r9,%r9
+
+
+
+ movq %rax,%rbp
+ mulq 0(%rsi)
+ addq %rax,%r10
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 8(%rsi)
+ addq %rcx,%r11
+ adcq $0,%rdx
+ addq %rax,%r11
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 16(%rsi)
+ addq %rcx,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 24(%rsi)
+ addq %rcx,%r13
+ adcq $0,%rdx
+ addq %rax,%r13
+ movq %r10,%rax
+ adcq %rdx,%r8
+ adcq $0,%r9
+
+
+
+ movq %r10,%rbp
+ shlq $32,%r10
+ mulq %r15
+ shrq $32,%rbp
+ addq %r10,%r11
+ adcq %rbp,%r12
+ adcq %rax,%r13
+ movq 24(%rbx),%rax
+ adcq %rdx,%r8
+ adcq $0,%r9
+ xorq %r10,%r10
+
+
+
+ movq %rax,%rbp
+ mulq 0(%rsi)
+ addq %rax,%r11
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 8(%rsi)
+ addq %rcx,%r12
+ adcq $0,%rdx
+ addq %rax,%r12
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 16(%rsi)
+ addq %rcx,%r13
+ adcq $0,%rdx
+ addq %rax,%r13
+ movq %rbp,%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq 24(%rsi)
+ addq %rcx,%r8
+ adcq $0,%rdx
+ addq %rax,%r8
+ movq %r11,%rax
+ adcq %rdx,%r9
+ adcq $0,%r10
+
+
+
+ movq %r11,%rbp
+ shlq $32,%r11
+ mulq %r15
+ shrq $32,%rbp
+ addq %r11,%r12
+ adcq %rbp,%r13
+ movq %r12,%rcx
+ adcq %rax,%r8
+ adcq %rdx,%r9
+ movq %r13,%rbp
+ adcq $0,%r10
+
+
+
+ subq $-1,%r12
+ movq %r8,%rbx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%rdx
+ sbbq %r15,%r9
+ sbbq $0,%r10
+
+ cmovcq %rcx,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rbx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %rdx,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
+
+
+
+
+
+
+
+
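+# ecp_nistz256_sqr_mont(res, a): Montgomery squaring modulo the P-256 field
+# prime p, res = a*a*2^-256 mod p, with the same ia32cap-based mulx dispatch.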
+.globl ecp_nistz256_sqr_mont
+.hidden ecp_nistz256_sqr_mont
+.type ecp_nistz256_sqr_mont,@function
+.align 32
+ecp_nistz256_sqr_mont:
+.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+.Lsqr_body:
+ cmpl $0x80100,%ecx
+ je .Lsqr_montx
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+
+ call __ecp_nistz256_sqr_montq
+ jmp .Lsqr_mont_done
+
+.align 32
+.Lsqr_montx:
+ movq 0(%rsi),%rdx
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r15
+ movq 24(%rsi),%r8
+ leaq -128(%rsi),%rsi
+
+ call __ecp_nistz256_sqr_montx
+.Lsqr_mont_done:
+ movq 0(%rsp),%r15
+.cfi_restore %r15
+ movq 8(%rsp),%r14
+.cfi_restore %r14
+ movq 16(%rsp),%r13
+.cfi_restore %r13
+ movq 24(%rsp),%r12
+.cfi_restore %r12
+ movq 32(%rsp),%rbx
+.cfi_restore %rbx
+ movq 40(%rsp),%rbp
+.cfi_restore %rbp
+ leaq 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lsqr_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
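+# Internal worker: mulq-based Montgomery squaring mod p; a[0..3] arrive in
+# %rax, %r14, %r15, %r8.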
+.type __ecp_nistz256_sqr_montq,@function
+.align 32
+__ecp_nistz256_sqr_montq:
+.cfi_startproc
+ movq %rax,%r13
+ mulq %r14
+ movq %rax,%r9
+ movq %r15,%rax
+ movq %rdx,%r10
+
+ mulq %r13
+ addq %rax,%r10
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %r13
+ addq %rax,%r11
+ movq %r15,%rax
+ adcq $0,%rdx
+ movq %rdx,%r12
+
+
+ mulq %r14
+ addq %rax,%r11
+ movq %r8,%rax
+ adcq $0,%rdx
+ movq %rdx,%rbp
+
+ mulq %r14
+ addq %rax,%r12
+ movq %r8,%rax
+ adcq $0,%rdx
+ addq %rbp,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+
+ mulq %r15
+ xorq %r15,%r15
+ addq %rax,%r13
+ movq 0(%rsi),%rax
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ addq %r9,%r9
+ adcq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq $0,%r15
+
+ mulq %rax
+ movq %rax,%r8
+ movq 8(%rsi),%rax
+ movq %rdx,%rcx
+
+ mulq %rax
+ addq %rcx,%r9
+ adcq %rax,%r10
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq %rax
+ addq %rcx,%r11
+ adcq %rax,%r12
+ movq 24(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rcx
+
+ mulq %rax
+ addq %rcx,%r13
+ adcq %rax,%r14
+ movq %r8,%rax
+ adcq %rdx,%r15
+
+ movq .Lpoly+8(%rip),%rsi
+ movq .Lpoly+24(%rip),%rbp
+
+
+
+
+ movq %r8,%rcx
+ shlq $32,%r8
+ mulq %rbp
+ shrq $32,%rcx
+ addq %r8,%r9
+ adcq %rcx,%r10
+ adcq %rax,%r11
+ movq %r9,%rax
+ adcq $0,%rdx
+
+
+
+ movq %r9,%rcx
+ shlq $32,%r9
+ movq %rdx,%r8
+ mulq %rbp
+ shrq $32,%rcx
+ addq %r9,%r10
+ adcq %rcx,%r11
+ adcq %rax,%r8
+ movq %r10,%rax
+ adcq $0,%rdx
+
+
+
+ movq %r10,%rcx
+ shlq $32,%r10
+ movq %rdx,%r9
+ mulq %rbp
+ shrq $32,%rcx
+ addq %r10,%r11
+ adcq %rcx,%r8
+ adcq %rax,%r9
+ movq %r11,%rax
+ adcq $0,%rdx
+
+
+
+ movq %r11,%rcx
+ shlq $32,%r11
+ movq %rdx,%r10
+ mulq %rbp
+ shrq $32,%rcx
+ addq %r11,%r8
+ adcq %rcx,%r9
+ adcq %rax,%r10
+ adcq $0,%rdx
+ xorq %r11,%r11
+
+
+
+ addq %r8,%r12
+ adcq %r9,%r13
+ movq %r12,%r8
+ adcq %r10,%r14
+ adcq %rdx,%r15
+ movq %r13,%r9
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r14,%r10
+ sbbq %rsi,%r13
+ sbbq $0,%r14
+ movq %r15,%rcx
+ sbbq %rbp,%r15
+ sbbq $0,%r11
+
+ cmovcq %r8,%r12
+ cmovcq %r9,%r13
+ movq %r12,0(%rdi)
+ cmovcq %r10,%r14
+ movq %r13,8(%rdi)
+ cmovcq %rcx,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
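+
+# Internal worker: mulx/adcx/adox Montgomery multiplication mod p. Callers pass
+# %rsi biased by -128 to match the 128(%rsi) displacements used below.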
+.type __ecp_nistz256_mul_montx,@function
+.align 32
+__ecp_nistz256_mul_montx:
+.cfi_startproc
+
+
+ mulxq %r9,%r8,%r9
+ mulxq %r10,%rcx,%r10
+ movq $32,%r14
+ xorq %r13,%r13
+ mulxq %r11,%rbp,%r11
+ movq .Lpoly+24(%rip),%r15
+ adcq %rcx,%r9
+ mulxq %r12,%rcx,%r12
+ movq %r8,%rdx
+ adcq %rbp,%r10
+ shlxq %r14,%r8,%rbp
+ adcq %rcx,%r11
+ shrxq %r14,%r8,%rcx
+ adcq $0,%r12
+
+
+
+ addq %rbp,%r9
+ adcq %rcx,%r10
+
+ mulxq %r15,%rcx,%rbp
+ movq 8(%rbx),%rdx
+ adcq %rcx,%r11
+ adcq %rbp,%r12
+ adcq $0,%r13
+ xorq %r8,%r8
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r9
+ adoxq %rbp,%r10
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r9,%rdx
+ adcxq %rcx,%r12
+ shlxq %r14,%r9,%rcx
+ adoxq %rbp,%r13
+ shrxq %r14,%r9,%rbp
+
+ adcxq %r8,%r13
+ adoxq %r8,%r8
+ adcq $0,%r8
+
+
+
+ addq %rcx,%r10
+ adcq %rbp,%r11
+
+ mulxq %r15,%rcx,%rbp
+ movq 16(%rbx),%rdx
+ adcq %rcx,%r12
+ adcq %rbp,%r13
+ adcq $0,%r8
+ xorq %r9,%r9
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r10
+ adoxq %rbp,%r11
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r10,%rdx
+ adcxq %rcx,%r13
+ shlxq %r14,%r10,%rcx
+ adoxq %rbp,%r8
+ shrxq %r14,%r10,%rbp
+
+ adcxq %r9,%r8
+ adoxq %r9,%r9
+ adcq $0,%r9
+
+
+
+ addq %rcx,%r11
+ adcq %rbp,%r12
+
+ mulxq %r15,%rcx,%rbp
+ movq 24(%rbx),%rdx
+ adcq %rcx,%r13
+ adcq %rbp,%r8
+ adcq $0,%r9
+ xorq %r10,%r10
+
+
+
+ mulxq 0+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq 8+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+
+ mulxq 16+128(%rsi),%rcx,%rbp
+ adcxq %rcx,%r13
+ adoxq %rbp,%r8
+
+ mulxq 24+128(%rsi),%rcx,%rbp
+ movq %r11,%rdx
+ adcxq %rcx,%r8
+ shlxq %r14,%r11,%rcx
+ adoxq %rbp,%r9
+ shrxq %r14,%r11,%rbp
+
+ adcxq %r10,%r9
+ adoxq %r10,%r10
+ adcq $0,%r10
+
+
+
+ addq %rcx,%r12
+ adcq %rbp,%r13
+
+ mulxq %r15,%rcx,%rbp
+ movq %r12,%rbx
+ movq .Lpoly+8(%rip),%r14
+ adcq %rcx,%r8
+ movq %r13,%rdx
+ adcq %rbp,%r9
+ adcq $0,%r10
+
+
+
+ xorl %eax,%eax
+ movq %r8,%rcx
+ sbbq $-1,%r12
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%rbp
+ sbbq %r15,%r9
+ sbbq $0,%r10
+
+ cmovcq %rbx,%r12
+ cmovcq %rdx,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %rbp,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
+
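+# Internal worker: mulx/adcx/adox Montgomery squaring mod p (same -128 bias on
+# %rsi as the multiplication worker).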
+.type __ecp_nistz256_sqr_montx,@function
+.align 32
+__ecp_nistz256_sqr_montx:
+.cfi_startproc
+ mulxq %r14,%r9,%r10
+ mulxq %r15,%rcx,%r11
+ xorl %eax,%eax
+ adcq %rcx,%r10
+ mulxq %r8,%rbp,%r12
+ movq %r14,%rdx
+ adcq %rbp,%r11
+ adcq $0,%r12
+ xorq %r13,%r13
+
+
+ mulxq %r15,%rcx,%rbp
+ adcxq %rcx,%r11
+ adoxq %rbp,%r12
+
+ mulxq %r8,%rcx,%rbp
+ movq %r15,%rdx
+ adcxq %rcx,%r12
+ adoxq %rbp,%r13
+ adcq $0,%r13
+
+
+ mulxq %r8,%rcx,%r14
+ movq 0+128(%rsi),%rdx
+ xorq %r15,%r15
+ adcxq %r9,%r9
+ adoxq %rcx,%r13
+ adcxq %r10,%r10
+ adoxq %r15,%r14
+
+ mulxq %rdx,%r8,%rbp
+ movq 8+128(%rsi),%rdx
+ adcxq %r11,%r11
+ adoxq %rbp,%r9
+ adcxq %r12,%r12
+ mulxq %rdx,%rcx,%rax
+ movq 16+128(%rsi),%rdx
+ adcxq %r13,%r13
+ adoxq %rcx,%r10
+ adcxq %r14,%r14
+.byte 0x67
+ mulxq %rdx,%rcx,%rbp
+ movq 24+128(%rsi),%rdx
+ adoxq %rax,%r11
+ adcxq %r15,%r15
+ adoxq %rcx,%r12
+ movq $32,%rsi
+ adoxq %rbp,%r13
+.byte 0x67,0x67
+ mulxq %rdx,%rcx,%rax
+ movq .Lpoly+24(%rip),%rdx
+ adoxq %rcx,%r14
+ shlxq %rsi,%r8,%rcx
+ adoxq %rax,%r15
+ shrxq %rsi,%r8,%rax
+ movq %rdx,%rbp
+
+
+ addq %rcx,%r9
+ adcq %rax,%r10
+
+ mulxq %r8,%rcx,%r8
+ adcq %rcx,%r11
+ shlxq %rsi,%r9,%rcx
+ adcq $0,%r8
+ shrxq %rsi,%r9,%rax
+
+
+ addq %rcx,%r10
+ adcq %rax,%r11
+
+ mulxq %r9,%rcx,%r9
+ adcq %rcx,%r8
+ shlxq %rsi,%r10,%rcx
+ adcq $0,%r9
+ shrxq %rsi,%r10,%rax
+
+
+ addq %rcx,%r11
+ adcq %rax,%r8
+
+ mulxq %r10,%rcx,%r10
+ adcq %rcx,%r9
+ shlxq %rsi,%r11,%rcx
+ adcq $0,%r10
+ shrxq %rsi,%r11,%rax
+
+
+ addq %rcx,%r8
+ adcq %rax,%r9
+
+ mulxq %r11,%rcx,%r11
+ adcq %rcx,%r10
+ adcq $0,%r11
+
+ xorq %rdx,%rdx
+ addq %r8,%r12
+ movq .Lpoly+8(%rip),%rsi
+ adcq %r9,%r13
+ movq %r12,%r8
+ adcq %r10,%r14
+ adcq %r11,%r15
+ movq %r13,%r9
+ adcq $0,%rdx
+
+ subq $-1,%r12
+ movq %r14,%r10
+ sbbq %rsi,%r13
+ sbbq $0,%r14
+ movq %r15,%r11
+ sbbq %rbp,%r15
+ sbbq $0,%rdx
+
+ cmovcq %r8,%r12
+ cmovcq %r9,%r13
+ movq %r12,0(%rdi)
+ cmovcq %r10,%r14
+ movq %r13,8(%rdi)
+ cmovcq %r11,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
+
+
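+# ecp_nistz256_select_w5(val, in_t, index): constant-time copy of one 96-byte
+# Jacobian point out of a 16-entry table; every entry is read and masked so the
+# memory access pattern is independent of index (index 0 yields all zeros).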
+.globl ecp_nistz256_select_w5
+.hidden ecp_nistz256_select_w5
+.type ecp_nistz256_select_w5,@function
+.align 32
+ecp_nistz256_select_w5:
+.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rax
+ movq 8(%rax),%rax
+ testl $32,%eax
+ jnz .Lavx2_select_w5
+ movdqa .LOne(%rip),%xmm0
+ movd %edx,%xmm1
+
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+
+ movdqa %xmm0,%xmm8
+ pshufd $0,%xmm1,%xmm1
+
+ movq $16,%rax
+.Lselect_loop_sse_w5:
+
+ movdqa %xmm8,%xmm15
+ paddd %xmm0,%xmm8
+ pcmpeqd %xmm1,%xmm15
+
+ movdqa 0(%rsi),%xmm9
+ movdqa 16(%rsi),%xmm10
+ movdqa 32(%rsi),%xmm11
+ movdqa 48(%rsi),%xmm12
+ movdqa 64(%rsi),%xmm13
+ movdqa 80(%rsi),%xmm14
+ leaq 96(%rsi),%rsi
+
+ pand %xmm15,%xmm9
+ pand %xmm15,%xmm10
+ por %xmm9,%xmm2
+ pand %xmm15,%xmm11
+ por %xmm10,%xmm3
+ pand %xmm15,%xmm12
+ por %xmm11,%xmm4
+ pand %xmm15,%xmm13
+ por %xmm12,%xmm5
+ pand %xmm15,%xmm14
+ por %xmm13,%xmm6
+ por %xmm14,%xmm7
+
+ decq %rax
+ jnz .Lselect_loop_sse_w5
+
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+ movdqu %xmm4,32(%rdi)
+ movdqu %xmm5,48(%rdi)
+ movdqu %xmm6,64(%rdi)
+ movdqu %xmm7,80(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.LSEH_end_ecp_nistz256_select_w5:
+.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+
+
+
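+# ecp_nistz256_select_w7(val, in_t, index): constant-time copy of one 64-byte
+# affine point out of a 64-entry table, using the same masking scheme.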
+.globl ecp_nistz256_select_w7
+.hidden ecp_nistz256_select_w7
+.type ecp_nistz256_select_w7,@function
+.align 32
+ecp_nistz256_select_w7:
+.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rax
+ movq 8(%rax),%rax
+ testl $32,%eax
+ jnz .Lavx2_select_w7
+ movdqa .LOne(%rip),%xmm8
+ movd %edx,%xmm1
+
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+
+ movdqa %xmm8,%xmm0
+ pshufd $0,%xmm1,%xmm1
+ movq $64,%rax
+
+.Lselect_loop_sse_w7:
+ movdqa %xmm8,%xmm15
+ paddd %xmm0,%xmm8
+ movdqa 0(%rsi),%xmm9
+ movdqa 16(%rsi),%xmm10
+ pcmpeqd %xmm1,%xmm15
+ movdqa 32(%rsi),%xmm11
+ movdqa 48(%rsi),%xmm12
+ leaq 64(%rsi),%rsi
+
+ pand %xmm15,%xmm9
+ pand %xmm15,%xmm10
+ por %xmm9,%xmm2
+ pand %xmm15,%xmm11
+ por %xmm10,%xmm3
+ pand %xmm15,%xmm12
+ por %xmm11,%xmm4
+ prefetcht0 255(%rsi)
+ por %xmm12,%xmm5
+
+ decq %rax
+ jnz .Lselect_loop_sse_w7
+
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+ movdqu %xmm4,32(%rdi)
+ movdqu %xmm5,48(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.LSEH_end_ecp_nistz256_select_w7:
+.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+
+
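+# AVX2 version of the w5 lookup, entered from ecp_nistz256_select_w5 when the
+# AVX2 capability bit is set; processes two table entries per iteration.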
+.type ecp_nistz256_avx2_select_w5,@function
+.align 32
+ecp_nistz256_avx2_select_w5:
+.cfi_startproc
+.Lavx2_select_w5:
+ vzeroupper
+ vmovdqa .LTwo(%rip),%ymm0
+
+ vpxor %ymm2,%ymm2,%ymm2
+ vpxor %ymm3,%ymm3,%ymm3
+ vpxor %ymm4,%ymm4,%ymm4
+
+ vmovdqa .LOne(%rip),%ymm5
+ vmovdqa .LTwo(%rip),%ymm10
+
+ vmovd %edx,%xmm1
+ vpermd %ymm1,%ymm2,%ymm1
+
+ movq $8,%rax
+.Lselect_loop_avx2_w5:
+
+ vmovdqa 0(%rsi),%ymm6
+ vmovdqa 32(%rsi),%ymm7
+ vmovdqa 64(%rsi),%ymm8
+
+ vmovdqa 96(%rsi),%ymm11
+ vmovdqa 128(%rsi),%ymm12
+ vmovdqa 160(%rsi),%ymm13
+
+ vpcmpeqd %ymm1,%ymm5,%ymm9
+ vpcmpeqd %ymm1,%ymm10,%ymm14
+
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpaddd %ymm0,%ymm10,%ymm10
+ leaq 192(%rsi),%rsi
+
+ vpand %ymm9,%ymm6,%ymm6
+ vpand %ymm9,%ymm7,%ymm7
+ vpand %ymm9,%ymm8,%ymm8
+ vpand %ymm14,%ymm11,%ymm11
+ vpand %ymm14,%ymm12,%ymm12
+ vpand %ymm14,%ymm13,%ymm13
+
+ vpxor %ymm6,%ymm2,%ymm2
+ vpxor %ymm7,%ymm3,%ymm3
+ vpxor %ymm8,%ymm4,%ymm4
+ vpxor %ymm11,%ymm2,%ymm2
+ vpxor %ymm12,%ymm3,%ymm3
+ vpxor %ymm13,%ymm4,%ymm4
+
+ decq %rax
+ jnz .Lselect_loop_avx2_w5
+
+ vmovdqu %ymm2,0(%rdi)
+ vmovdqu %ymm3,32(%rdi)
+ vmovdqu %ymm4,64(%rdi)
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.LSEH_end_ecp_nistz256_avx2_select_w5:
+.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
+
+
+
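+# AVX2 version of the w7 lookup; processes three table entries per iteration,
+# with the 64th entry handled separately after the loop.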
+.globl ecp_nistz256_avx2_select_w7
+.hidden ecp_nistz256_avx2_select_w7
+.type ecp_nistz256_avx2_select_w7,@function
+.align 32
+ecp_nistz256_avx2_select_w7:
+.cfi_startproc
+.Lavx2_select_w7:
+ vzeroupper
+ vmovdqa .LThree(%rip),%ymm0
+
+ vpxor %ymm2,%ymm2,%ymm2
+ vpxor %ymm3,%ymm3,%ymm3
+
+ vmovdqa .LOne(%rip),%ymm4
+ vmovdqa .LTwo(%rip),%ymm8
+ vmovdqa .LThree(%rip),%ymm12
+
+ vmovd %edx,%xmm1
+ vpermd %ymm1,%ymm2,%ymm1
+
+
+ movq $21,%rax
+.Lselect_loop_avx2_w7:
+
+ vmovdqa 0(%rsi),%ymm5
+ vmovdqa 32(%rsi),%ymm6
+
+ vmovdqa 64(%rsi),%ymm9
+ vmovdqa 96(%rsi),%ymm10
+
+ vmovdqa 128(%rsi),%ymm13
+ vmovdqa 160(%rsi),%ymm14
+
+ vpcmpeqd %ymm1,%ymm4,%ymm7
+ vpcmpeqd %ymm1,%ymm8,%ymm11
+ vpcmpeqd %ymm1,%ymm12,%ymm15
+
+ vpaddd %ymm0,%ymm4,%ymm4
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpaddd %ymm0,%ymm12,%ymm12
+ leaq 192(%rsi),%rsi
+
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+ vpand %ymm11,%ymm9,%ymm9
+ vpand %ymm11,%ymm10,%ymm10
+ vpand %ymm15,%ymm13,%ymm13
+ vpand %ymm15,%ymm14,%ymm14
+
+ vpxor %ymm5,%ymm2,%ymm2
+ vpxor %ymm6,%ymm3,%ymm3
+ vpxor %ymm9,%ymm2,%ymm2
+ vpxor %ymm10,%ymm3,%ymm3
+ vpxor %ymm13,%ymm2,%ymm2
+ vpxor %ymm14,%ymm3,%ymm3
+
+ decq %rax
+ jnz .Lselect_loop_avx2_w7
+
+
+ vmovdqa 0(%rsi),%ymm5
+ vmovdqa 32(%rsi),%ymm6
+
+ vpcmpeqd %ymm1,%ymm4,%ymm7
+
+ vpand %ymm7,%ymm5,%ymm5
+ vpand %ymm7,%ymm6,%ymm6
+
+ vpxor %ymm5,%ymm2,%ymm2
+ vpxor %ymm6,%ymm3,%ymm3
+
+ vmovdqu %ymm2,0(%rdi)
+ vmovdqu %ymm3,32(%rdi)
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.LSEH_end_ecp_nistz256_avx2_select_w7:
+.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
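+
+# The __ecp_nistz256_*q helpers below work on a 256-bit value in %r12,%r13,%r8,%r9
+# and reduce modulo the P-256 prime, whose second and top limbs the callers keep
+# cached in %r14 and %r15.
+# __ecp_nistz256_add_toq: add the four limbs at (%rbx), store the result to (%rdi).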
+.type __ecp_nistz256_add_toq,@function
+.align 32
+__ecp_nistz256_add_toq:
+.cfi_startproc
+ xorq %r11,%r11
+ addq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ movq %r12,%rax
+ adcq 16(%rbx),%r8
+ adcq 24(%rbx),%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
+
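+# __ecp_nistz256_sub_fromq: subtract the four limbs at (%rbx), reduce mod p and
+# store the result to (%rdi).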
+.type __ecp_nistz256_sub_fromq,@function
+.align 32
+__ecp_nistz256_sub_fromq:
+.cfi_startproc
+ subq 0(%rbx),%r12
+ sbbq 8(%rbx),%r13
+ movq %r12,%rax
+ sbbq 16(%rbx),%r8
+ sbbq 24(%rbx),%r9
+ movq %r13,%rbp
+ sbbq %r11,%r11
+
+ addq $-1,%r12
+ movq %r8,%rcx
+ adcq %r14,%r13
+ adcq $0,%r8
+ movq %r9,%r10
+ adcq %r15,%r9
+ testq %r11,%r11
+
+ cmovzq %rax,%r12
+ cmovzq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovzq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovzq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
+
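+# __ecp_nistz256_subq: compute (%rax,%rbp,%rcx,%r10) - (%r12,%r13,%r8,%r9) mod p,
+# leaving the result in %r12,%r13,%r8,%r9 without storing it.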
+.type __ecp_nistz256_subq,@function
+.align 32
+__ecp_nistz256_subq:
+.cfi_startproc
+ subq %r12,%rax
+ sbbq %r13,%rbp
+ movq %rax,%r12
+ sbbq %r8,%rcx
+ sbbq %r9,%r10
+ movq %rbp,%r13
+ sbbq %r11,%r11
+
+ addq $-1,%rax
+ movq %rcx,%r8
+ adcq %r14,%rbp
+ adcq $0,%rcx
+ movq %r10,%r9
+ adcq %r15,%r10
+ testq %r11,%r11
+
+ cmovnzq %rax,%r12
+ cmovnzq %rbp,%r13
+ cmovnzq %rcx,%r8
+ cmovnzq %r10,%r9
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_subq,.-__ecp_nistz256_subq
+
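+# __ecp_nistz256_mul_by_2q: double the value mod p and store the result to (%rdi).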
+.type __ecp_nistz256_mul_by_2q,@function
+.align 32
+__ecp_nistz256_mul_by_2q:
+.cfi_startproc
+ xorq %r11,%r11
+ addq %r12,%r12
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
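+
+# ecp_nistz256_point_double(r, a): P-256 point doubling in Jacobian coordinates,
+# r = 2*a; jumps to the mulx-based .Lpoint_doublex path when the ia32cap bits
+# (BMI2+ADX) are present.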
+.globl ecp_nistz256_point_double
+.hidden ecp_nistz256_point_double
+.type ecp_nistz256_point_double,@function
+.align 32
+ecp_nistz256_point_double:
+.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_doublex
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $160+8,%rsp
+.cfi_adjust_cfa_offset 32*5+8
+.Lpoint_doubleq_body:
+
+.Lpoint_double_shortcutq:
+ movdqu 0(%rsi),%xmm0
+ movq %rsi,%rbx
+ movdqu 16(%rsi),%xmm1
+ movq 32+0(%rsi),%r12
+ movq 32+8(%rsi),%r13
+ movq 32+16(%rsi),%r8
+ movq 32+24(%rsi),%r9
+ movq .Lpoly+8(%rip),%r14
+ movq .Lpoly+24(%rip),%r15
+ movdqa %xmm0,96(%rsp)
+ movdqa %xmm1,96+16(%rsp)
+ leaq 32(%rdi),%r10
+ leaq 64(%rdi),%r11
+.byte 102,72,15,110,199
+.byte 102,73,15,110,202
+.byte 102,73,15,110,211
+
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2q
+
+ movq 64+0(%rsi),%rax
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ leaq 64-0(%rsi),%rsi
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 0+0(%rsp),%rax
+ movq 8+0(%rsp),%r14
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 32(%rbx),%rax
+ movq 64+0(%rbx),%r9
+ movq 64+8(%rbx),%r10
+ movq 64+16(%rbx),%r11
+ movq 64+24(%rbx),%r12
+ leaq 64-0(%rbx),%rsi
+ leaq 32(%rbx),%rbx
+.byte 102,72,15,126,215
+ call __ecp_nistz256_mul_montq
+ call __ecp_nistz256_mul_by_2q
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_toq
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 0+0(%rsp),%rax
+ movq 8+0(%rsp),%r14
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sqr_montq
+ xorq %r9,%r9
+ movq %r12,%rax
+ addq $-1,%r12
+ movq %r13,%r10
+ adcq %rsi,%r13
+ movq %r14,%rcx
+ adcq $0,%r14
+ movq %r15,%r8
+ adcq %rbp,%r15
+ adcq $0,%r9
+ xorq %rsi,%rsi
+ testq $1,%rax
+
+ cmovzq %rax,%r12
+ cmovzq %r10,%r13
+ cmovzq %rcx,%r14
+ cmovzq %r8,%r15
+ cmovzq %rsi,%r9
+
+ movq %r13,%rax
+ shrq $1,%r12
+ shlq $63,%rax
+ movq %r14,%r10
+ shrq $1,%r13
+ orq %rax,%r12
+ shlq $63,%r10
+ movq %r15,%rcx
+ shrq $1,%r14
+ orq %r10,%r13
+ shlq $63,%rcx
+ movq %r12,0(%rdi)
+ shrq $1,%r15
+ movq %r13,8(%rdi)
+ shlq $63,%r9
+ orq %rcx,%r14
+ orq %r9,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ movq 64(%rsp),%rax
+ leaq 64(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2q
+
+ leaq 32(%rsp),%rbx
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_toq
+
+ movq 96(%rsp),%rax
+ leaq 96(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2q
+
+ movq 0+32(%rsp),%rax
+ movq 8+32(%rsp),%r14
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r15
+ movq 24+32(%rsp),%r8
+.byte 102,72,15,126,199
+ call __ecp_nistz256_sqr_montq
+
+ leaq 128(%rsp),%rbx
+ movq %r14,%r8
+ movq %r15,%r9
+ movq %rsi,%r14
+ movq %rbp,%r15
+ call __ecp_nistz256_sub_fromq
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_subq
+
+ movq 32(%rsp),%rax
+ leaq 32(%rsp),%rbx
+ movq %r12,%r14
+ xorl %ecx,%ecx
+ movq %r12,0+0(%rsp)
+ movq %r13,%r10
+ movq %r13,0+8(%rsp)
+ cmovzq %r8,%r11
+ movq %r8,0+16(%rsp)
+ leaq 0-0(%rsp),%rsi
+ cmovzq %r9,%r12
+ movq %r9,0+24(%rsp)
+ movq %r14,%r9
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+.byte 102,72,15,126,203
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sub_fromq
+
+ leaq 160+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_doubleq_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
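+
+# ecp_nistz256_point_add(r, a, b): full Jacobian point addition, r = a + b.
+# Equal inputs divert to the doubling shortcut, and the final masked stores
+# select the other operand when either input is the point at infinity.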
+.globl ecp_nistz256_point_add
+.hidden ecp_nistz256_point_add
+.type ecp_nistz256_point_add,@function
+.align 32
+ecp_nistz256_point_add:
+.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_addx
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $576+8,%rsp
+.cfi_adjust_cfa_offset 32*18+8
+.Lpoint_addq_body:
+
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq %rsi,%rbx
+ movq %rdx,%rsi
+ movdqa %xmm0,384(%rsp)
+ movdqa %xmm1,384+16(%rsp)
+ movdqa %xmm2,416(%rsp)
+ movdqa %xmm3,416+16(%rsp)
+ movdqa %xmm4,448(%rsp)
+ movdqa %xmm5,448+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rsi),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rsi),%xmm3
+ movq 64+0(%rsi),%rax
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,480(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,480+16(%rsp)
+ movdqu 64(%rsi),%xmm0
+ movdqu 80(%rsi),%xmm1
+ movdqa %xmm2,512(%rsp)
+ movdqa %xmm3,512+16(%rsp)
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+
+ leaq 64-0(%rsi),%rsi
+ movq %rax,544+0(%rsp)
+ movq %r14,544+8(%rsp)
+ movq %r15,544+16(%rsp)
+ movq %r8,544+24(%rsp)
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm1,%xmm4
+ por %xmm1,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+ movq 64+0(%rbx),%rax
+ movq 64+8(%rbx),%r14
+ movq 64+16(%rbx),%r15
+ movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
+
+ leaq 64-0(%rbx),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 544(%rsp),%rax
+ leaq 544(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq 0+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 448(%rsp),%rax
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 416(%rsp),%rax
+ leaq 416(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq 0+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 512(%rsp),%rax
+ leaq 512(%rsp),%rbx
+ movq 0+256(%rsp),%r9
+ movq 8+256(%rsp),%r10
+ leaq 0+256(%rsp),%rsi
+ movq 16+256(%rsp),%r11
+ movq 24+256(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 224(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ orq %r13,%r12
+ movdqa %xmm4,%xmm2
+ orq %r8,%r12
+ orq %r9,%r12
+ por %xmm5,%xmm2
+.byte 102,73,15,110,220
+
+ movq 384(%rsp),%rax
+ leaq 384(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq 0+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 480(%rsp),%rax
+ leaq 480(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 160(%rsp),%rbx
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ orq %r13,%r12
+ orq %r8,%r12
+ orq %r9,%r12
+
+.byte 102,73,15,126,208
+.byte 102,73,15,126,217
+ orq %r8,%r12
+.byte 0x3e
+ jnz .Ladd_proceedq
+
+
+
+ testq %r9,%r9
+ jz .Ladd_doubleq
+
+
+
+
+
+
+.byte 102,72,15,126,199
+ pxor %xmm0,%xmm0
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm0,16(%rdi)
+ movdqu %xmm0,32(%rdi)
+ movdqu %xmm0,48(%rdi)
+ movdqu %xmm0,64(%rdi)
+ movdqu %xmm0,80(%rdi)
+ jmp .Ladd_doneq
+
+.align 32
+.Ladd_doubleq:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+.cfi_adjust_cfa_offset -416
+ jmp .Lpoint_double_shortcutq
+.cfi_adjust_cfa_offset 416
+
+.align 32
+.Ladd_proceedq:
+ movq 0+64(%rsp),%rax
+ movq 8+64(%rsp),%r14
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 448(%rsp),%rax
+ leaq 448(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 0+0(%rsp),%rax
+ movq 8+0(%rsp),%r14
+ leaq 0+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 544(%rsp),%rax
+ leaq 544(%rsp),%rbx
+ movq 0+352(%rsp),%r9
+ movq 8+352(%rsp),%r10
+ leaq 0+352(%rsp),%rsi
+ movq 16+352(%rsp),%r11
+ movq 24+352(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 0(%rsp),%rax
+ leaq 0(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 160(%rsp),%rax
+ leaq 160(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 96(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subq
+
+ leaq 128(%rsp),%rbx
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 192+0(%rsp),%rax
+ movq 192+8(%rsp),%rbp
+ movq 192+16(%rsp),%rcx
+ movq 192+24(%rsp),%r10
+ leaq 320(%rsp),%rdi
+
+ call __ecp_nistz256_subq
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 128(%rsp),%rax
+ leaq 128(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq 0+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 320(%rsp),%rax
+ leaq 320(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 256(%rsp),%rbx
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 352(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 352+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 544(%rsp),%xmm2
+ pand 544+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 480(%rsp),%xmm2
+ pand 480+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 320(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 320+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 512(%rsp),%xmm2
+ pand 512+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+.Ladd_doneq:
+ leaq 576+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_addq_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
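+
+# ecp_nistz256_point_add_affine(r, a, b): mixed addition of a Jacobian point a
+# and an affine point b.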
+.globl ecp_nistz256_point_add_affine
+.hidden ecp_nistz256_point_add_affine
+.type ecp_nistz256_point_add_affine,@function
+.align 32
+ecp_nistz256_point_add_affine:
+.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%rcx
+ movq 8(%rcx),%rcx
+ andl $0x80100,%ecx
+ cmpl $0x80100,%ecx
+ je .Lpoint_add_affinex
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $480+8,%rsp
+.cfi_adjust_cfa_offset 32*15+8
+.Ladd_affineq_body:
+
+ movdqu 0(%rsi),%xmm0
+ movq %rdx,%rbx
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq 64+0(%rsi),%rax
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,320(%rsp)
+ movdqa %xmm1,320+16(%rsp)
+ movdqa %xmm2,352(%rsp)
+ movdqa %xmm3,352+16(%rsp)
+ movdqa %xmm4,384(%rsp)
+ movdqa %xmm5,384+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rbx),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rbx),%xmm1
+ movdqu 32(%rbx),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rbx),%xmm3
+ movdqa %xmm0,416(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,416+16(%rsp)
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+ movdqa %xmm2,448(%rsp)
+ movdqa %xmm3,448+16(%rsp)
+ por %xmm2,%xmm3
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm1,%xmm3
+
+ leaq 64-0(%rsi),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm3,%xmm4
+ movq 0(%rbx),%rax
+
+ movq %r12,%r9
+ por %xmm3,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ movq %r13,%r10
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ movq %r14,%r11
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+
+ leaq 32-0(%rsp),%rsi
+ movq %r15,%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 320(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 384(%rsp),%rax
+ leaq 384(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 384(%rsp),%rax
+ leaq 384(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 448(%rsp),%rax
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq 0+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 352(%rsp),%rbx
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 0+64(%rsp),%rax
+ movq 8+64(%rsp),%r14
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 0+96(%rsp),%rax
+ movq 8+96(%rsp),%r14
+ leaq 0+96(%rsp),%rsi
+ movq 16+96(%rsp),%r15
+ movq 24+96(%rsp),%r8
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_sqr_montq
+
+ movq 128(%rsp),%rax
+ leaq 128(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 320(%rsp),%rax
+ leaq 320(%rsp),%rbx
+ movq 0+128(%rsp),%r9
+ movq 8+128(%rsp),%r10
+ leaq 0+128(%rsp),%rsi
+ movq 16+128(%rsp),%r11
+ movq 24+128(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 192(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subq
+
+ leaq 160(%rsp),%rbx
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 64(%rsp),%rdi
+
+ call __ecp_nistz256_subq
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 352(%rsp),%rax
+ leaq 352(%rsp),%rbx
+ movq 0+160(%rsp),%r9
+ movq 8+160(%rsp),%r10
+ leaq 0+160(%rsp),%rsi
+ movq 16+160(%rsp),%r11
+ movq 24+160(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ movq 96(%rsp),%rax
+ leaq 96(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq 0+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_mul_montq
+
+ leaq 32(%rsp),%rbx
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_sub_fromq
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand .LONE_mont(%rip),%xmm2
+ pand .LONE_mont+16(%rip),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 224(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 224+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 320(%rsp),%xmm2
+ pand 320+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 256(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 256+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 352(%rsp),%xmm2
+ pand 352+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+ leaq 480+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Ladd_affineq_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
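+
+# mulx/adcx counterparts of the *_toq, *_fromq, subq and mul_by_2q helpers above,
+# used by the BMI2+ADX point routines that follow.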
+.type __ecp_nistz256_add_tox,@function
+.align 32
+__ecp_nistz256_add_tox:
+.cfi_startproc
+ xorq %r11,%r11
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ movq %r12,%rax
+ adcq 16(%rbx),%r8
+ adcq 24(%rbx),%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ xorq %r10,%r10
+ sbbq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
+
+.type __ecp_nistz256_sub_fromx,@function
+.align 32
+__ecp_nistz256_sub_fromx:
+.cfi_startproc
+ xorq %r11,%r11
+ sbbq 0(%rbx),%r12
+ sbbq 8(%rbx),%r13
+ movq %r12,%rax
+ sbbq 16(%rbx),%r8
+ sbbq 24(%rbx),%r9
+ movq %r13,%rbp
+ sbbq $0,%r11
+
+ xorq %r10,%r10
+ adcq $-1,%r12
+ movq %r8,%rcx
+ adcq %r14,%r13
+ adcq $0,%r8
+ movq %r9,%r10
+ adcq %r15,%r9
+
+ btq $0,%r11
+ cmovncq %rax,%r12
+ cmovncq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovncq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovncq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
+
+.type __ecp_nistz256_subx,@function
+.align 32
+__ecp_nistz256_subx:
+.cfi_startproc
+ xorq %r11,%r11
+ sbbq %r12,%rax
+ sbbq %r13,%rbp
+ movq %rax,%r12
+ sbbq %r8,%rcx
+ sbbq %r9,%r10
+ movq %rbp,%r13
+ sbbq $0,%r11
+
+ xorq %r9,%r9
+ adcq $-1,%rax
+ movq %rcx,%r8
+ adcq %r14,%rbp
+ adcq $0,%rcx
+ movq %r10,%r9
+ adcq %r15,%r10
+
+ btq $0,%r11
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ cmovcq %rcx,%r8
+ cmovcq %r10,%r9
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_subx,.-__ecp_nistz256_subx
+
+.type __ecp_nistz256_mul_by_2x,@function
+.align 32
+__ecp_nistz256_mul_by_2x:
+.cfi_startproc
+ xorq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ xorq %r10,%r10
+ sbbq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ cmovcq %rbp,%r13
+ movq %r12,0(%rdi)
+ cmovcq %rcx,%r8
+ movq %r13,8(%rdi)
+ cmovcq %r10,%r9
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
+.type ecp_nistz256_point_doublex,@function
+.align 32
+ecp_nistz256_point_doublex:
+.cfi_startproc
+.Lpoint_doublex:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $160+8,%rsp
+.cfi_adjust_cfa_offset 32*5+8
+.Lpoint_doublex_body:
+
+.Lpoint_double_shortcutx:
+ movdqu 0(%rsi),%xmm0
+ movq %rsi,%rbx
+ movdqu 16(%rsi),%xmm1
+ movq 32+0(%rsi),%r12
+ movq 32+8(%rsi),%r13
+ movq 32+16(%rsi),%r8
+ movq 32+24(%rsi),%r9
+ movq .Lpoly+8(%rip),%r14
+ movq .Lpoly+24(%rip),%r15
+ movdqa %xmm0,96(%rsp)
+ movdqa %xmm1,96+16(%rsp)
+ leaq 32(%rdi),%r10
+ leaq 64(%rdi),%r11
+.byte 102,72,15,110,199
+.byte 102,73,15,110,202
+.byte 102,73,15,110,211
+
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ leaq 64-128(%rsi),%rsi
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 32(%rbx),%rdx
+ movq 64+0(%rbx),%r9
+ movq 64+8(%rbx),%r10
+ movq 64+16(%rbx),%r11
+ movq 64+24(%rbx),%r12
+ leaq 64-128(%rbx),%rsi
+ leaq 32(%rbx),%rbx
+.byte 102,72,15,126,215
+ call __ecp_nistz256_mul_montx
+ call __ecp_nistz256_mul_by_2x
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_tox
+
+ movq 96+0(%rsp),%r12
+ movq 96+8(%rsp),%r13
+ leaq 64(%rsp),%rbx
+ movq 96+16(%rsp),%r8
+ movq 96+24(%rsp),%r9
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sqr_montx
+ xorq %r9,%r9
+ movq %r12,%rax
+ addq $-1,%r12
+ movq %r13,%r10
+ adcq %rsi,%r13
+ movq %r14,%rcx
+ adcq $0,%r14
+ movq %r15,%r8
+ adcq %rbp,%r15
+ adcq $0,%r9
+ xorq %rsi,%rsi
+ testq $1,%rax
+
+ cmovzq %rax,%r12
+ cmovzq %r10,%r13
+ cmovzq %rcx,%r14
+ cmovzq %r8,%r15
+ cmovzq %rsi,%r9
+
+ movq %r13,%rax
+ shrq $1,%r12
+ shlq $63,%rax
+ movq %r14,%r10
+ shrq $1,%r13
+ orq %rax,%r12
+ shlq $63,%r10
+ movq %r15,%rcx
+ shrq $1,%r14
+ orq %r10,%r13
+ shlq $63,%rcx
+ movq %r12,0(%rdi)
+ shrq $1,%r15
+ movq %r13,8(%rdi)
+ shlq $63,%r9
+ orq %rcx,%r14
+ orq %r9,%r15
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ movq 64(%rsp),%rdx
+ leaq 64(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ leaq 32(%rsp),%rbx
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_add_tox
+
+ movq 96(%rsp),%rdx
+ leaq 96(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_by_2x
+
+ movq 0+32(%rsp),%rdx
+ movq 8+32(%rsp),%r14
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r15
+ movq 24+32(%rsp),%r8
+.byte 102,72,15,126,199
+ call __ecp_nistz256_sqr_montx
+
+ leaq 128(%rsp),%rbx
+ movq %r14,%r8
+ movq %r15,%r9
+ movq %rsi,%r14
+ movq %rbp,%r15
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_subx
+
+ movq 32(%rsp),%rdx
+ leaq 32(%rsp),%rbx
+ movq %r12,%r14
+ xorl %ecx,%ecx
+ movq %r12,0+0(%rsp)
+ movq %r13,%r10
+ movq %r13,0+8(%rsp)
+ cmovzq %r8,%r11
+ movq %r8,0+16(%rsp)
+ leaq 0-128(%rsp),%rsi
+ cmovzq %r9,%r12
+ movq %r9,0+24(%rsp)
+ movq %r14,%r9
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+.byte 102,72,15,126,203
+.byte 102,72,15,126,207
+ call __ecp_nistz256_sub_fromx
+
+ leaq 160+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_doublex_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex
+.type ecp_nistz256_point_addx,@function
+.align 32
+ecp_nistz256_point_addx:
+.cfi_startproc
+.Lpoint_addx:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $576+8,%rsp
+.cfi_adjust_cfa_offset 32*18+8
+.Lpoint_addx_body:
+
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq %rsi,%rbx
+ movq %rdx,%rsi
+ movdqa %xmm0,384(%rsp)
+ movdqa %xmm1,384+16(%rsp)
+ movdqa %xmm2,416(%rsp)
+ movdqa %xmm3,416+16(%rsp)
+ movdqa %xmm4,448(%rsp)
+ movdqa %xmm5,448+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rsi),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rsi),%xmm3
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,480(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,480+16(%rsp)
+ movdqu 64(%rsi),%xmm0
+ movdqu 80(%rsi),%xmm1
+ movdqa %xmm2,512(%rsp)
+ movdqa %xmm3,512+16(%rsp)
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+
+ leaq 64-128(%rsi),%rsi
+ movq %rdx,544+0(%rsp)
+ movq %r14,544+8(%rsp)
+ movq %r15,544+16(%rsp)
+ movq %r8,544+24(%rsp)
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm1,%xmm4
+ por %xmm1,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+ movq 64+0(%rbx),%rdx
+ movq 64+8(%rbx),%r14
+ movq 64+16(%rbx),%r15
+ movq 64+24(%rbx),%r8
+.byte 102,72,15,110,203
+
+ leaq 64-128(%rbx),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 544(%rsp),%rdx
+ leaq 544(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 416(%rsp),%rdx
+ leaq 416(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq -128+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 512(%rsp),%rdx
+ leaq 512(%rsp),%rbx
+ movq 0+256(%rsp),%r9
+ movq 8+256(%rsp),%r10
+ leaq -128+256(%rsp),%rsi
+ movq 16+256(%rsp),%r11
+ movq 24+256(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 224(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ orq %r13,%r12
+ movdqa %xmm4,%xmm2
+ orq %r8,%r12
+ orq %r9,%r12
+ por %xmm5,%xmm2
+.byte 102,73,15,110,220
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+96(%rsp),%r9
+ movq 8+96(%rsp),%r10
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r11
+ movq 24+96(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 480(%rsp),%rdx
+ leaq 480(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 160(%rsp),%rbx
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ orq %r13,%r12
+ orq %r8,%r12
+ orq %r9,%r12
+
+.byte 102,73,15,126,208
+.byte 102,73,15,126,217
+ orq %r8,%r12
+.byte 0x3e
+ jnz .Ladd_proceedx
+
+
+
+ testq %r9,%r9
+ jz .Ladd_doublex
+
+
+
+
+
+
+.byte 102,72,15,126,199
+ pxor %xmm0,%xmm0
+ movdqu %xmm0,0(%rdi)
+ movdqu %xmm0,16(%rdi)
+ movdqu %xmm0,32(%rdi)
+ movdqu %xmm0,48(%rdi)
+ movdqu %xmm0,64(%rdi)
+ movdqu %xmm0,80(%rdi)
+ jmp .Ladd_donex
+
+.align 32
+.Ladd_doublex:
+.byte 102,72,15,126,206
+.byte 102,72,15,126,199
+ addq $416,%rsp
+.cfi_adjust_cfa_offset -416
+ jmp .Lpoint_double_shortcutx
+.cfi_adjust_cfa_offset 416
+
+.align 32
+.Ladd_proceedx:
+ movq 0+64(%rsp),%rdx
+ movq 8+64(%rsp),%r14
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+0(%rsp),%r9
+ movq 8+0(%rsp),%r10
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r11
+ movq 24+0(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 0+0(%rsp),%rdx
+ movq 8+0(%rsp),%r14
+ leaq -128+0(%rsp),%rsi
+ movq 16+0(%rsp),%r15
+ movq 24+0(%rsp),%r8
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 544(%rsp),%rdx
+ leaq 544(%rsp),%rbx
+ movq 0+352(%rsp),%r9
+ movq 8+352(%rsp),%r10
+ leaq -128+352(%rsp),%rsi
+ movq 16+352(%rsp),%r11
+ movq 24+352(%rsp),%r12
+ leaq 352(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 0(%rsp),%rdx
+ leaq 0(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 160(%rsp),%rdx
+ leaq 160(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 96(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subx
+
+ leaq 128(%rsp),%rbx
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 192+0(%rsp),%rax
+ movq 192+8(%rsp),%rbp
+ movq 192+16(%rsp),%rcx
+ movq 192+24(%rsp),%r10
+ leaq 320(%rsp),%rdi
+
+ call __ecp_nistz256_subx
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 128(%rsp),%rdx
+ leaq 128(%rsp),%rbx
+ movq 0+224(%rsp),%r9
+ movq 8+224(%rsp),%r10
+ leaq -128+224(%rsp),%rsi
+ movq 16+224(%rsp),%r11
+ movq 24+224(%rsp),%r12
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 320(%rsp),%rdx
+ leaq 320(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 256(%rsp),%rbx
+ leaq 320(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 352(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 352+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 544(%rsp),%xmm2
+ pand 544+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 480(%rsp),%xmm2
+ pand 480+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 320(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 320+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 512(%rsp),%xmm2
+ pand 512+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+.Ladd_donex:
+ leaq 576+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpoint_addx_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx
+.type ecp_nistz256_point_add_affinex,@function
+.align 32
+ecp_nistz256_point_add_affinex:
+.cfi_startproc
+.Lpoint_add_affinex:
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbp,-16
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset %rbx,-24
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r15,-56
+ subq $480+8,%rsp
+.cfi_adjust_cfa_offset 32*15+8
+.Ladd_affinex_body:
+
+ movdqu 0(%rsi),%xmm0
+ movq %rdx,%rbx
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqu 64(%rsi),%xmm4
+ movdqu 80(%rsi),%xmm5
+ movq 64+0(%rsi),%rdx
+ movq 64+8(%rsi),%r14
+ movq 64+16(%rsi),%r15
+ movq 64+24(%rsi),%r8
+ movdqa %xmm0,320(%rsp)
+ movdqa %xmm1,320+16(%rsp)
+ movdqa %xmm2,352(%rsp)
+ movdqa %xmm3,352+16(%rsp)
+ movdqa %xmm4,384(%rsp)
+ movdqa %xmm5,384+16(%rsp)
+ por %xmm4,%xmm5
+
+ movdqu 0(%rbx),%xmm0
+ pshufd $0xb1,%xmm5,%xmm3
+ movdqu 16(%rbx),%xmm1
+ movdqu 32(%rbx),%xmm2
+ por %xmm3,%xmm5
+ movdqu 48(%rbx),%xmm3
+ movdqa %xmm0,416(%rsp)
+ pshufd $0x1e,%xmm5,%xmm4
+ movdqa %xmm1,416+16(%rsp)
+ por %xmm0,%xmm1
+.byte 102,72,15,110,199
+ movdqa %xmm2,448(%rsp)
+ movdqa %xmm3,448+16(%rsp)
+ por %xmm2,%xmm3
+ por %xmm4,%xmm5
+ pxor %xmm4,%xmm4
+ por %xmm1,%xmm3
+
+ leaq 64-128(%rsi),%rsi
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ pcmpeqd %xmm4,%xmm5
+ pshufd $0xb1,%xmm3,%xmm4
+ movq 0(%rbx),%rdx
+
+ movq %r12,%r9
+ por %xmm3,%xmm4
+ pshufd $0,%xmm5,%xmm5
+ pshufd $0x1e,%xmm4,%xmm3
+ movq %r13,%r10
+ por %xmm3,%xmm4
+ pxor %xmm3,%xmm3
+ movq %r14,%r11
+ pcmpeqd %xmm3,%xmm4
+ pshufd $0,%xmm4,%xmm4
+
+ leaq 32-128(%rsp),%rsi
+ movq %r15,%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 320(%rsp),%rbx
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 384(%rsp),%rdx
+ leaq 384(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 288(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 448(%rsp),%rdx
+ leaq 448(%rsp),%rbx
+ movq 0+32(%rsp),%r9
+ movq 8+32(%rsp),%r10
+ leaq -128+32(%rsp),%rsi
+ movq 16+32(%rsp),%r11
+ movq 24+32(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 352(%rsp),%rbx
+ leaq 96(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+64(%rsp),%rdx
+ movq 8+64(%rsp),%r14
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r15
+ movq 24+64(%rsp),%r8
+ leaq 128(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 0+96(%rsp),%rdx
+ movq 8+96(%rsp),%r14
+ leaq -128+96(%rsp),%rsi
+ movq 16+96(%rsp),%r15
+ movq 24+96(%rsp),%r8
+ leaq 192(%rsp),%rdi
+ call __ecp_nistz256_sqr_montx
+
+ movq 128(%rsp),%rdx
+ leaq 128(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 160(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 320(%rsp),%rdx
+ leaq 320(%rsp),%rbx
+ movq 0+128(%rsp),%r9
+ movq 8+128(%rsp),%r10
+ leaq -128+128(%rsp),%rsi
+ movq 16+128(%rsp),%r11
+ movq 24+128(%rsp),%r12
+ leaq 0(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+
+
+
+ xorq %r11,%r11
+ addq %r12,%r12
+ leaq 192(%rsp),%rsi
+ adcq %r13,%r13
+ movq %r12,%rax
+ adcq %r8,%r8
+ adcq %r9,%r9
+ movq %r13,%rbp
+ adcq $0,%r11
+
+ subq $-1,%r12
+ movq %r8,%rcx
+ sbbq %r14,%r13
+ sbbq $0,%r8
+ movq %r9,%r10
+ sbbq %r15,%r9
+ sbbq $0,%r11
+
+ cmovcq %rax,%r12
+ movq 0(%rsi),%rax
+ cmovcq %rbp,%r13
+ movq 8(%rsi),%rbp
+ cmovcq %rcx,%r8
+ movq 16(%rsi),%rcx
+ cmovcq %r10,%r9
+ movq 24(%rsi),%r10
+
+ call __ecp_nistz256_subx
+
+ leaq 160(%rsp),%rbx
+ leaq 224(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+ movq 0+0(%rsp),%rax
+ movq 0+8(%rsp),%rbp
+ movq 0+16(%rsp),%rcx
+ movq 0+24(%rsp),%r10
+ leaq 64(%rsp),%rdi
+
+ call __ecp_nistz256_subx
+
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r8,16(%rdi)
+ movq %r9,24(%rdi)
+ movq 352(%rsp),%rdx
+ leaq 352(%rsp),%rbx
+ movq 0+160(%rsp),%r9
+ movq 8+160(%rsp),%r10
+ leaq -128+160(%rsp),%rsi
+ movq 16+160(%rsp),%r11
+ movq 24+160(%rsp),%r12
+ leaq 32(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ movq 96(%rsp),%rdx
+ leaq 96(%rsp),%rbx
+ movq 0+64(%rsp),%r9
+ movq 8+64(%rsp),%r10
+ leaq -128+64(%rsp),%rsi
+ movq 16+64(%rsp),%r11
+ movq 24+64(%rsp),%r12
+ leaq 64(%rsp),%rdi
+ call __ecp_nistz256_mul_montx
+
+ leaq 32(%rsp),%rbx
+ leaq 256(%rsp),%rdi
+ call __ecp_nistz256_sub_fromx
+
+.byte 102,72,15,126,199
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 288(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 288+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand .LONE_mont(%rip),%xmm2
+ pand .LONE_mont+16(%rip),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 384(%rsp),%xmm2
+ pand 384+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,64(%rdi)
+ movdqu %xmm3,80(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 224(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 224+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 416(%rsp),%xmm2
+ pand 416+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 320(%rsp),%xmm2
+ pand 320+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,0(%rdi)
+ movdqu %xmm3,16(%rdi)
+
+ movdqa %xmm5,%xmm0
+ movdqa %xmm5,%xmm1
+ pandn 256(%rsp),%xmm0
+ movdqa %xmm5,%xmm2
+ pandn 256+16(%rsp),%xmm1
+ movdqa %xmm5,%xmm3
+ pand 448(%rsp),%xmm2
+ pand 448+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+
+ movdqa %xmm4,%xmm0
+ movdqa %xmm4,%xmm1
+ pandn %xmm2,%xmm0
+ movdqa %xmm4,%xmm2
+ pandn %xmm3,%xmm1
+ movdqa %xmm4,%xmm3
+ pand 352(%rsp),%xmm2
+ pand 352+16(%rsp),%xmm3
+ por %xmm0,%xmm2
+ por %xmm1,%xmm3
+ movdqu %xmm2,32(%rdi)
+ movdqu %xmm3,48(%rdi)
+
+ leaq 480+56(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbx
+.cfi_restore %rbx
+ movq -8(%rsi),%rbp
+.cfi_restore %rbp
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Ladd_affinex_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S
@@ -1,0 +1,343 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+.type beeu_mod_inverse_vartime,@function
+.hidden beeu_mod_inverse_vartime
+.globl beeu_mod_inverse_vartime
+.hidden beeu_mod_inverse_vartime
+.align 32
+beeu_mod_inverse_vartime:
+.cfi_startproc
+ pushq %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbp,-16
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset r12,-24
+ pushq %r13
+.cfi_adjust_cfa_offset 8
+.cfi_offset r13,-32
+ pushq %r14
+.cfi_adjust_cfa_offset 8
+.cfi_offset r14,-40
+ pushq %r15
+.cfi_adjust_cfa_offset 8
+.cfi_offset r15,-48
+ pushq %rbx
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbx,-56
+ pushq %rsi
+.cfi_adjust_cfa_offset 8
+.cfi_offset rsi,-64
+
+ subq $80,%rsp
+.cfi_adjust_cfa_offset 80
+ movq %rdi,0(%rsp)
+
+
+ movq $1,%r8
+ xorq %r9,%r9
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %rdi,%rdi
+
+ xorq %r12,%r12
+ xorq %r13,%r13
+ xorq %r14,%r14
+ xorq %r15,%r15
+ xorq %rbp,%rbp
+
+
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu %xmm0,48(%rsp)
+ vmovdqu %xmm1,64(%rsp)
+
+ vmovdqu 0(%rdx),%xmm0
+ vmovdqu 16(%rdx),%xmm1
+ vmovdqu %xmm0,16(%rsp)
+ vmovdqu %xmm1,32(%rsp)
+
+.Lbeeu_loop:
+ xorq %rbx,%rbx
+ orq 48(%rsp),%rbx
+ orq 56(%rsp),%rbx
+ orq 64(%rsp),%rbx
+ orq 72(%rsp),%rbx
+ jz .Lbeeu_loop_end
+
+
+
+
+
+
+
+
+
+
+ movq $1,%rcx
+
+
+.Lbeeu_shift_loop_XB:
+ movq %rcx,%rbx
+ andq 48(%rsp),%rbx
+ jnz .Lbeeu_shift_loop_end_XB
+
+
+ movq $1,%rbx
+ andq %r8,%rbx
+ jz .Lshift1_0
+ addq 0(%rdx),%r8
+ adcq 8(%rdx),%r9
+ adcq 16(%rdx),%r10
+ adcq 24(%rdx),%r11
+ adcq $0,%rdi
+
+.Lshift1_0:
+ shrdq $1,%r9,%r8
+ shrdq $1,%r10,%r9
+ shrdq $1,%r11,%r10
+ shrdq $1,%rdi,%r11
+ shrq $1,%rdi
+
+ shlq $1,%rcx
+
+
+
+
+
+ cmpq $0x8000000,%rcx
+ jne .Lbeeu_shift_loop_XB
+
+.Lbeeu_shift_loop_end_XB:
+ bsfq %rcx,%rcx
+ testq %rcx,%rcx
+ jz .Lbeeu_no_shift_XB
+
+
+
+ movq 8+48(%rsp),%rax
+ movq 16+48(%rsp),%rbx
+ movq 24+48(%rsp),%rsi
+
+ shrdq %cl,%rax,0+48(%rsp)
+ shrdq %cl,%rbx,8+48(%rsp)
+ shrdq %cl,%rsi,16+48(%rsp)
+
+ shrq %cl,%rsi
+ movq %rsi,24+48(%rsp)
+
+
+.Lbeeu_no_shift_XB:
+
+ movq $1,%rcx
+
+
+.Lbeeu_shift_loop_YA:
+ movq %rcx,%rbx
+ andq 16(%rsp),%rbx
+ jnz .Lbeeu_shift_loop_end_YA
+
+
+ movq $1,%rbx
+ andq %r12,%rbx
+ jz .Lshift1_1
+ addq 0(%rdx),%r12
+ adcq 8(%rdx),%r13
+ adcq 16(%rdx),%r14
+ adcq 24(%rdx),%r15
+ adcq $0,%rbp
+
+.Lshift1_1:
+ shrdq $1,%r13,%r12
+ shrdq $1,%r14,%r13
+ shrdq $1,%r15,%r14
+ shrdq $1,%rbp,%r15
+ shrq $1,%rbp
+
+ shlq $1,%rcx
+
+
+
+
+
+ cmpq $0x8000000,%rcx
+ jne .Lbeeu_shift_loop_YA
+
+.Lbeeu_shift_loop_end_YA:
+ bsfq %rcx,%rcx
+ testq %rcx,%rcx
+ jz .Lbeeu_no_shift_YA
+
+
+
+ movq 8+16(%rsp),%rax
+ movq 16+16(%rsp),%rbx
+ movq 24+16(%rsp),%rsi
+
+ shrdq %cl,%rax,0+16(%rsp)
+ shrdq %cl,%rbx,8+16(%rsp)
+ shrdq %cl,%rsi,16+16(%rsp)
+
+ shrq %cl,%rsi
+ movq %rsi,24+16(%rsp)
+
+
+.Lbeeu_no_shift_YA:
+
+ movq 48(%rsp),%rax
+ movq 56(%rsp),%rbx
+ movq 64(%rsp),%rsi
+ movq 72(%rsp),%rcx
+ subq 16(%rsp),%rax
+ sbbq 24(%rsp),%rbx
+ sbbq 32(%rsp),%rsi
+ sbbq 40(%rsp),%rcx
+ jnc .Lbeeu_B_bigger_than_A
+
+
+ movq 16(%rsp),%rax
+ movq 24(%rsp),%rbx
+ movq 32(%rsp),%rsi
+ movq 40(%rsp),%rcx
+ subq 48(%rsp),%rax
+ sbbq 56(%rsp),%rbx
+ sbbq 64(%rsp),%rsi
+ sbbq 72(%rsp),%rcx
+ movq %rax,16(%rsp)
+ movq %rbx,24(%rsp)
+ movq %rsi,32(%rsp)
+ movq %rcx,40(%rsp)
+
+
+ addq %r8,%r12
+ adcq %r9,%r13
+ adcq %r10,%r14
+ adcq %r11,%r15
+ adcq %rdi,%rbp
+ jmp .Lbeeu_loop
+
+.Lbeeu_B_bigger_than_A:
+
+ movq %rax,48(%rsp)
+ movq %rbx,56(%rsp)
+ movq %rsi,64(%rsp)
+ movq %rcx,72(%rsp)
+
+
+ addq %r12,%r8
+ adcq %r13,%r9
+ adcq %r14,%r10
+ adcq %r15,%r11
+ adcq %rbp,%rdi
+
+ jmp .Lbeeu_loop
+
+.Lbeeu_loop_end:
+
+
+
+
+ movq 16(%rsp),%rbx
+ subq $1,%rbx
+ orq 24(%rsp),%rbx
+ orq 32(%rsp),%rbx
+ orq 40(%rsp),%rbx
+
+ jnz .Lbeeu_err
+
+
+
+
+ movq 0(%rdx),%r8
+ movq 8(%rdx),%r9
+ movq 16(%rdx),%r10
+ movq 24(%rdx),%r11
+ xorq %rdi,%rdi
+
+.Lbeeu_reduction_loop:
+ movq %r12,16(%rsp)
+ movq %r13,24(%rsp)
+ movq %r14,32(%rsp)
+ movq %r15,40(%rsp)
+ movq %rbp,48(%rsp)
+
+
+ subq %r8,%r12
+ sbbq %r9,%r13
+ sbbq %r10,%r14
+ sbbq %r11,%r15
+ sbbq $0,%rbp
+
+
+ cmovcq 16(%rsp),%r12
+ cmovcq 24(%rsp),%r13
+ cmovcq 32(%rsp),%r14
+ cmovcq 40(%rsp),%r15
+ jnc .Lbeeu_reduction_loop
+
+
+ subq %r12,%r8
+ sbbq %r13,%r9
+ sbbq %r14,%r10
+ sbbq %r15,%r11
+
+.Lbeeu_save:
+
+ movq 0(%rsp),%rdi
+
+ movq %r8,0(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+
+
+ movq $1,%rax
+ jmp .Lbeeu_finish
+
+.Lbeeu_err:
+
+ xorq %rax,%rax
+
+.Lbeeu_finish:
+ addq $80,%rsp
+.cfi_adjust_cfa_offset -80
+ popq %rsi
+.cfi_adjust_cfa_offset -8
+.cfi_restore rsi
+ popq %rbx
+.cfi_adjust_cfa_offset -8
+.cfi_restore rbx
+ popq %r15
+.cfi_adjust_cfa_offset -8
+.cfi_restore r15
+ popq %r14
+.cfi_adjust_cfa_offset -8
+.cfi_restore r14
+ popq %r13
+.cfi_adjust_cfa_offset -8
+.cfi_restore r13
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore r12
+ popq %rbp
+.cfi_adjust_cfa_offset -8
+.cfi_restore rbp
+ .byte 0xf3,0xc3
+.cfi_endproc
+
+.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S
@@ -1,0 +1,63 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+
+
+
+.globl CRYPTO_rdrand
+.hidden CRYPTO_rdrand
+.type CRYPTO_rdrand,@function
+.align 16
+CRYPTO_rdrand:
+.cfi_startproc
+ xorq %rax,%rax
+.byte 72,15,199,242
+
+ adcq %rax,%rax
+ movq %rdx,0(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size CRYPTO_rdrand,.-CRYPTO_rdrand
+
+
+
+
+
+.globl CRYPTO_rdrand_multiple8_buf
+.hidden CRYPTO_rdrand_multiple8_buf
+.type CRYPTO_rdrand_multiple8_buf,@function
+.align 16
+CRYPTO_rdrand_multiple8_buf:
+.cfi_startproc
+ testq %rsi,%rsi
+ jz .Lout
+ movq $8,%rdx
+.Lloop:
+.byte 72,15,199,241
+ jnc .Lerr
+ movq %rcx,0(%rdi)
+ addq %rdx,%rdi
+ subq %rdx,%rsi
+ jnz .Lloop
+.Lout:
+ movq $1,%rax
+ .byte 0xf3,0xc3
+.Lerr:
+ xorq %rax,%rax
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size CRYPTO_rdrand_multiple8_buf,.-CRYPTO_rdrand_multiple8_buf
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S
@@ -1,0 +1,1749 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+.globl rsaz_1024_sqr_avx2
+.hidden rsaz_1024_sqr_avx2
+.type rsaz_1024_sqr_avx2,@function
+.align 64
+rsaz_1024_sqr_avx2:
+.cfi_startproc
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ vzeroupper
+ movq %rax,%rbp
+.cfi_def_cfa_register %rbp
+ movq %rdx,%r13
+ subq $832,%rsp
+ movq %r13,%r15
+ subq $-128,%rdi
+ subq $-128,%rsi
+ subq $-128,%r13
+
+ andq $4095,%r15
+ addq $320,%r15
+ shrq $12,%r15
+ vpxor %ymm9,%ymm9,%ymm9
+ jz .Lsqr_1024_no_n_copy
+
+
+
+
+
+ subq $320,%rsp
+ vmovdqu 0-128(%r13),%ymm0
+ andq $-2048,%rsp
+ vmovdqu 32-128(%r13),%ymm1
+ vmovdqu 64-128(%r13),%ymm2
+ vmovdqu 96-128(%r13),%ymm3
+ vmovdqu 128-128(%r13),%ymm4
+ vmovdqu 160-128(%r13),%ymm5
+ vmovdqu 192-128(%r13),%ymm6
+ vmovdqu 224-128(%r13),%ymm7
+ vmovdqu 256-128(%r13),%ymm8
+ leaq 832+128(%rsp),%r13
+ vmovdqu %ymm0,0-128(%r13)
+ vmovdqu %ymm1,32-128(%r13)
+ vmovdqu %ymm2,64-128(%r13)
+ vmovdqu %ymm3,96-128(%r13)
+ vmovdqu %ymm4,128-128(%r13)
+ vmovdqu %ymm5,160-128(%r13)
+ vmovdqu %ymm6,192-128(%r13)
+ vmovdqu %ymm7,224-128(%r13)
+ vmovdqu %ymm8,256-128(%r13)
+ vmovdqu %ymm9,288-128(%r13)
+
+.Lsqr_1024_no_n_copy:
+ andq $-1024,%rsp
+
+ vmovdqu 32-128(%rsi),%ymm1
+ vmovdqu 64-128(%rsi),%ymm2
+ vmovdqu 96-128(%rsi),%ymm3
+ vmovdqu 128-128(%rsi),%ymm4
+ vmovdqu 160-128(%rsi),%ymm5
+ vmovdqu 192-128(%rsi),%ymm6
+ vmovdqu 224-128(%rsi),%ymm7
+ vmovdqu 256-128(%rsi),%ymm8
+
+ leaq 192(%rsp),%rbx
+ vmovdqu .Land_mask(%rip),%ymm15
+ jmp .LOOP_GRANDE_SQR_1024
+
+.align 32
+.LOOP_GRANDE_SQR_1024:
+ leaq 576+128(%rsp),%r9
+ leaq 448(%rsp),%r12
+
+
+
+
+ vpaddq %ymm1,%ymm1,%ymm1
+ vpbroadcastq 0-128(%rsi),%ymm10
+ vpaddq %ymm2,%ymm2,%ymm2
+ vmovdqa %ymm1,0-128(%r9)
+ vpaddq %ymm3,%ymm3,%ymm3
+ vmovdqa %ymm2,32-128(%r9)
+ vpaddq %ymm4,%ymm4,%ymm4
+ vmovdqa %ymm3,64-128(%r9)
+ vpaddq %ymm5,%ymm5,%ymm5
+ vmovdqa %ymm4,96-128(%r9)
+ vpaddq %ymm6,%ymm6,%ymm6
+ vmovdqa %ymm5,128-128(%r9)
+ vpaddq %ymm7,%ymm7,%ymm7
+ vmovdqa %ymm6,160-128(%r9)
+ vpaddq %ymm8,%ymm8,%ymm8
+ vmovdqa %ymm7,192-128(%r9)
+ vpxor %ymm9,%ymm9,%ymm9
+ vmovdqa %ymm8,224-128(%r9)
+
+ vpmuludq 0-128(%rsi),%ymm10,%ymm0
+ vpbroadcastq 32-128(%rsi),%ymm11
+ vmovdqu %ymm9,288-192(%rbx)
+ vpmuludq %ymm10,%ymm1,%ymm1
+ vmovdqu %ymm9,320-448(%r12)
+ vpmuludq %ymm10,%ymm2,%ymm2
+ vmovdqu %ymm9,352-448(%r12)
+ vpmuludq %ymm10,%ymm3,%ymm3
+ vmovdqu %ymm9,384-448(%r12)
+ vpmuludq %ymm10,%ymm4,%ymm4
+ vmovdqu %ymm9,416-448(%r12)
+ vpmuludq %ymm10,%ymm5,%ymm5
+ vmovdqu %ymm9,448-448(%r12)
+ vpmuludq %ymm10,%ymm6,%ymm6
+ vmovdqu %ymm9,480-448(%r12)
+ vpmuludq %ymm10,%ymm7,%ymm7
+ vmovdqu %ymm9,512-448(%r12)
+ vpmuludq %ymm10,%ymm8,%ymm8
+ vpbroadcastq 64-128(%rsi),%ymm10
+ vmovdqu %ymm9,544-448(%r12)
+
+ movq %rsi,%r15
+ movl $4,%r14d
+ jmp .Lsqr_entry_1024
+.align 32
+.LOOP_SQR_1024:
+ vpbroadcastq 32-128(%r15),%ymm11
+ vpmuludq 0-128(%rsi),%ymm10,%ymm0
+ vpaddq 0-192(%rbx),%ymm0,%ymm0
+ vpmuludq 0-128(%r9),%ymm10,%ymm1
+ vpaddq 32-192(%rbx),%ymm1,%ymm1
+ vpmuludq 32-128(%r9),%ymm10,%ymm2
+ vpaddq 64-192(%rbx),%ymm2,%ymm2
+ vpmuludq 64-128(%r9),%ymm10,%ymm3
+ vpaddq 96-192(%rbx),%ymm3,%ymm3
+ vpmuludq 96-128(%r9),%ymm10,%ymm4
+ vpaddq 128-192(%rbx),%ymm4,%ymm4
+ vpmuludq 128-128(%r9),%ymm10,%ymm5
+ vpaddq 160-192(%rbx),%ymm5,%ymm5
+ vpmuludq 160-128(%r9),%ymm10,%ymm6
+ vpaddq 192-192(%rbx),%ymm6,%ymm6
+ vpmuludq 192-128(%r9),%ymm10,%ymm7
+ vpaddq 224-192(%rbx),%ymm7,%ymm7
+ vpmuludq 224-128(%r9),%ymm10,%ymm8
+ vpbroadcastq 64-128(%r15),%ymm10
+ vpaddq 256-192(%rbx),%ymm8,%ymm8
+.Lsqr_entry_1024:
+ vmovdqu %ymm0,0-192(%rbx)
+ vmovdqu %ymm1,32-192(%rbx)
+
+ vpmuludq 32-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 32-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq 64-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 96-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 128-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq 160-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 192-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 224-128(%r9),%ymm11,%ymm0
+ vpbroadcastq 96-128(%r15),%ymm11
+ vpaddq 288-192(%rbx),%ymm0,%ymm0
+
+ vmovdqu %ymm2,64-192(%rbx)
+ vmovdqu %ymm3,96-192(%rbx)
+
+ vpmuludq 64-128(%rsi),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 64-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 96-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq 128-128(%r9),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 160-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 192-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm0,%ymm0
+ vpmuludq 224-128(%r9),%ymm10,%ymm1
+ vpbroadcastq 128-128(%r15),%ymm10
+ vpaddq 320-448(%r12),%ymm1,%ymm1
+
+ vmovdqu %ymm4,128-192(%rbx)
+ vmovdqu %ymm5,160-192(%rbx)
+
+ vpmuludq 96-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm6,%ymm6
+ vpmuludq 96-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm7,%ymm7
+ vpmuludq 128-128(%r9),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm8,%ymm8
+ vpmuludq 160-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm0,%ymm0
+ vpmuludq 192-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpmuludq 224-128(%r9),%ymm11,%ymm2
+ vpbroadcastq 160-128(%r15),%ymm11
+ vpaddq 352-448(%r12),%ymm2,%ymm2
+
+ vmovdqu %ymm6,192-192(%rbx)
+ vmovdqu %ymm7,224-192(%rbx)
+
+ vpmuludq 128-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq 128-128(%r9),%ymm10,%ymm14
+ vpaddq %ymm14,%ymm0,%ymm0
+ vpmuludq 160-128(%r9),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 192-128(%r9),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 224-128(%r9),%ymm10,%ymm3
+ vpbroadcastq 192-128(%r15),%ymm10
+ vpaddq 384-448(%r12),%ymm3,%ymm3
+
+ vmovdqu %ymm8,256-192(%rbx)
+ vmovdqu %ymm0,288-192(%rbx)
+ leaq 8(%rbx),%rbx
+
+ vpmuludq 160-128(%rsi),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 160-128(%r9),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 192-128(%r9),%ymm11,%ymm14
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq 224-128(%r9),%ymm11,%ymm4
+ vpbroadcastq 224-128(%r15),%ymm11
+ vpaddq 416-448(%r12),%ymm4,%ymm4
+
+ vmovdqu %ymm1,320-448(%r12)
+ vmovdqu %ymm2,352-448(%r12)
+
+ vpmuludq 192-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpmuludq 192-128(%r9),%ymm10,%ymm14
+ vpbroadcastq 256-128(%r15),%ymm0
+ vpaddq %ymm14,%ymm4,%ymm4
+ vpmuludq 224-128(%r9),%ymm10,%ymm5
+ vpbroadcastq 0+8-128(%r15),%ymm10
+ vpaddq 448-448(%r12),%ymm5,%ymm5
+
+ vmovdqu %ymm3,384-448(%r12)
+ vmovdqu %ymm4,416-448(%r12)
+ leaq 8(%r15),%r15
+
+ vpmuludq 224-128(%rsi),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 224-128(%r9),%ymm11,%ymm6
+ vpaddq 480-448(%r12),%ymm6,%ymm6
+
+ vpmuludq 256-128(%rsi),%ymm0,%ymm7
+ vmovdqu %ymm5,448-448(%r12)
+ vpaddq 512-448(%r12),%ymm7,%ymm7
+ vmovdqu %ymm6,480-448(%r12)
+ vmovdqu %ymm7,512-448(%r12)
+ leaq 8(%r12),%r12
+
+ decl %r14d
+ jnz .LOOP_SQR_1024
+
+ vmovdqu 256(%rsp),%ymm8
+ vmovdqu 288(%rsp),%ymm1
+ vmovdqu 320(%rsp),%ymm2
+ leaq 192(%rsp),%rbx
+
+ vpsrlq $29,%ymm8,%ymm14
+ vpand %ymm15,%ymm8,%ymm8
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+
+ vpermq $0x93,%ymm14,%ymm14
+ vpxor %ymm9,%ymm9,%ymm9
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm8,%ymm8
+ vpblendd $3,%ymm11,%ymm9,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpaddq %ymm11,%ymm2,%ymm2
+ vmovdqu %ymm1,288-192(%rbx)
+ vmovdqu %ymm2,320-192(%rbx)
+
+ movq (%rsp),%rax
+ movq 8(%rsp),%r10
+ movq 16(%rsp),%r11
+ movq 24(%rsp),%r12
+ vmovdqu 32(%rsp),%ymm1
+ vmovdqu 64-192(%rbx),%ymm2
+ vmovdqu 96-192(%rbx),%ymm3
+ vmovdqu 128-192(%rbx),%ymm4
+ vmovdqu 160-192(%rbx),%ymm5
+ vmovdqu 192-192(%rbx),%ymm6
+ vmovdqu 224-192(%rbx),%ymm7
+
+ movq %rax,%r9
+ imull %ecx,%eax
+ andl $0x1fffffff,%eax
+ vmovd %eax,%xmm12
+
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpbroadcastq %xmm12,%ymm12
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ shrq $29,%r9
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ addq %r9,%r10
+ addq %rax,%r11
+ imulq 24-128(%r13),%rdx
+ addq %rdx,%r12
+
+ movq %r10,%rax
+ imull %ecx,%eax
+ andl $0x1fffffff,%eax
+
+ movl $9,%r14d
+ jmp .LOOP_REDUCE_1024
+
+.align 32
+.LOOP_REDUCE_1024:
+ vmovd %eax,%xmm13
+ vpbroadcastq %xmm13,%ymm13
+
+ vpmuludq 32-128(%r13),%ymm12,%ymm10
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpaddq %ymm10,%ymm1,%ymm1
+ addq %rax,%r10
+ vpmuludq 64-128(%r13),%ymm12,%ymm14
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ vpaddq %ymm14,%ymm2,%ymm2
+ vpmuludq 96-128(%r13),%ymm12,%ymm11
+.byte 0x67
+ addq %rax,%r11
+.byte 0x67
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ shrq $29,%r10
+ vpaddq %ymm11,%ymm3,%ymm3
+ vpmuludq 128-128(%r13),%ymm12,%ymm10
+ addq %rax,%r12
+ addq %r10,%r11
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpmuludq 160-128(%r13),%ymm12,%ymm14
+ movq %r11,%rax
+ imull %ecx,%eax
+ vpaddq %ymm14,%ymm5,%ymm5
+ vpmuludq 192-128(%r13),%ymm12,%ymm11
+ andl $0x1fffffff,%eax
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpmuludq 224-128(%r13),%ymm12,%ymm10
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpmuludq 256-128(%r13),%ymm12,%ymm14
+ vmovd %eax,%xmm12
+
+ vpaddq %ymm14,%ymm8,%ymm8
+
+ vpbroadcastq %xmm12,%ymm12
+
+ vpmuludq 32-8-128(%r13),%ymm13,%ymm11
+ vmovdqu 96-8-128(%r13),%ymm14
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ vpaddq %ymm11,%ymm1,%ymm1
+ vpmuludq 64-8-128(%r13),%ymm13,%ymm10
+ vmovdqu 128-8-128(%r13),%ymm11
+ addq %rax,%r11
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+ vpaddq %ymm10,%ymm2,%ymm2
+ addq %r12,%rax
+ shrq $29,%r11
+ vpmuludq %ymm13,%ymm14,%ymm14
+ vmovdqu 160-8-128(%r13),%ymm10
+ addq %r11,%rax
+ vpaddq %ymm14,%ymm3,%ymm3
+ vpmuludq %ymm13,%ymm11,%ymm11
+ vmovdqu 192-8-128(%r13),%ymm14
+.byte 0x67
+ movq %rax,%r12
+ imull %ecx,%eax
+ vpaddq %ymm11,%ymm4,%ymm4
+ vpmuludq %ymm13,%ymm10,%ymm10
+.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00
+ andl $0x1fffffff,%eax
+ vpaddq %ymm10,%ymm5,%ymm5
+ vpmuludq %ymm13,%ymm14,%ymm14
+ vmovdqu 256-8-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm6,%ymm6
+ vpmuludq %ymm13,%ymm11,%ymm11
+ vmovdqu 288-8-128(%r13),%ymm9
+ vmovd %eax,%xmm0
+ imulq -128(%r13),%rax
+ vpaddq %ymm11,%ymm7,%ymm7
+ vpmuludq %ymm13,%ymm10,%ymm10
+ vmovdqu 32-16-128(%r13),%ymm14
+ vpbroadcastq %xmm0,%ymm0
+ vpaddq %ymm10,%ymm8,%ymm8
+ vpmuludq %ymm13,%ymm9,%ymm9
+ vmovdqu 64-16-128(%r13),%ymm11
+ addq %rax,%r12
+
+ vmovdqu 32-24-128(%r13),%ymm13
+ vpmuludq %ymm12,%ymm14,%ymm14
+ vmovdqu 96-16-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpmuludq %ymm0,%ymm13,%ymm13
+ vpmuludq %ymm12,%ymm11,%ymm11
+.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff
+ vpaddq %ymm1,%ymm13,%ymm13
+ vpaddq %ymm11,%ymm2,%ymm2
+ vpmuludq %ymm12,%ymm10,%ymm10
+ vmovdqu 160-16-128(%r13),%ymm11
+.byte 0x67
+ vmovq %xmm13,%rax
+ vmovdqu %ymm13,(%rsp)
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpmuludq %ymm12,%ymm14,%ymm14
+ vmovdqu 192-16-128(%r13),%ymm10
+ vpaddq %ymm14,%ymm4,%ymm4
+ vpmuludq %ymm12,%ymm11,%ymm11
+ vmovdqu 224-16-128(%r13),%ymm14
+ vpaddq %ymm11,%ymm5,%ymm5
+ vpmuludq %ymm12,%ymm10,%ymm10
+ vmovdqu 256-16-128(%r13),%ymm11
+ vpaddq %ymm10,%ymm6,%ymm6
+ vpmuludq %ymm12,%ymm14,%ymm14
+ shrq $29,%r12
+ vmovdqu 288-16-128(%r13),%ymm10
+ addq %r12,%rax
+ vpaddq %ymm14,%ymm7,%ymm7
+ vpmuludq %ymm12,%ymm11,%ymm11
+
+ movq %rax,%r9
+ imull %ecx,%eax
+ vpaddq %ymm11,%ymm8,%ymm8
+ vpmuludq %ymm12,%ymm10,%ymm10
+ andl $0x1fffffff,%eax
+ vmovd %eax,%xmm12
+ vmovdqu 96-24-128(%r13),%ymm11
+.byte 0x67
+ vpaddq %ymm10,%ymm9,%ymm9
+ vpbroadcastq %xmm12,%ymm12
+
+ vpmuludq 64-24-128(%r13),%ymm0,%ymm14
+ vmovdqu 128-24-128(%r13),%ymm10
+ movq %rax,%rdx
+ imulq -128(%r13),%rax
+ movq 8(%rsp),%r10
+ vpaddq %ymm14,%ymm2,%ymm1
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vmovdqu 160-24-128(%r13),%ymm14
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%r13),%rax
+.byte 0x67
+ shrq $29,%r9
+ movq 16(%rsp),%r11
+ vpaddq %ymm11,%ymm3,%ymm2
+ vpmuludq %ymm0,%ymm10,%ymm10
+ vmovdqu 192-24-128(%r13),%ymm11
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%r13),%rax
+ vpaddq %ymm10,%ymm4,%ymm3
+ vpmuludq %ymm0,%ymm14,%ymm14
+ vmovdqu 224-24-128(%r13),%ymm10
+ imulq 24-128(%r13),%rdx
+ addq %rax,%r11
+ leaq (%r9,%r10,1),%rax
+ vpaddq %ymm14,%ymm5,%ymm4
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vmovdqu 256-24-128(%r13),%ymm14
+ movq %rax,%r10
+ imull %ecx,%eax
+ vpmuludq %ymm0,%ymm10,%ymm10
+ vpaddq %ymm11,%ymm6,%ymm5
+ vmovdqu 288-24-128(%r13),%ymm11
+ andl $0x1fffffff,%eax
+ vpaddq %ymm10,%ymm7,%ymm6
+ vpmuludq %ymm0,%ymm14,%ymm14
+ addq 24(%rsp),%rdx
+ vpaddq %ymm14,%ymm8,%ymm7
+ vpmuludq %ymm0,%ymm11,%ymm11
+ vpaddq %ymm11,%ymm9,%ymm8
+ vmovq %r12,%xmm9
+ movq %rdx,%r12
+
+ decl %r14d
+ jnz .LOOP_REDUCE_1024
+ leaq 448(%rsp),%r12
+ vpaddq %ymm9,%ymm13,%ymm0
+ vpxor %ymm9,%ymm9,%ymm9
+
+ vpaddq 288-192(%rbx),%ymm0,%ymm0
+ vpaddq 320-448(%r12),%ymm1,%ymm1
+ vpaddq 352-448(%r12),%ymm2,%ymm2
+ vpaddq 384-448(%r12),%ymm3,%ymm3
+ vpaddq 416-448(%r12),%ymm4,%ymm4
+ vpaddq 448-448(%r12),%ymm5,%ymm5
+ vpaddq 480-448(%r12),%ymm6,%ymm6
+ vpaddq 512-448(%r12),%ymm7,%ymm7
+ vpaddq 544-448(%r12),%ymm8,%ymm8
+
+ vpsrlq $29,%ymm0,%ymm14
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm12,%ymm12
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm13,%ymm13
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm0,%ymm0
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm2,%ymm2
+ vpblendd $3,%ymm13,%ymm9,%ymm13
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpaddq %ymm13,%ymm4,%ymm4
+
+ vpsrlq $29,%ymm0,%ymm14
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm11
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm12,%ymm12
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm13,%ymm13
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm0,%ymm0
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm1,%ymm1
+ vmovdqu %ymm0,0-128(%rdi)
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm2,%ymm2
+ vmovdqu %ymm1,32-128(%rdi)
+ vpblendd $3,%ymm13,%ymm9,%ymm13
+ vpaddq %ymm12,%ymm3,%ymm3
+ vmovdqu %ymm2,64-128(%rdi)
+ vpaddq %ymm13,%ymm4,%ymm4
+ vmovdqu %ymm3,96-128(%rdi)
+ vpsrlq $29,%ymm4,%ymm14
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm11
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm13,%ymm13
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm5,%ymm5
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm6,%ymm6
+ vpblendd $3,%ymm13,%ymm0,%ymm13
+ vpaddq %ymm12,%ymm7,%ymm7
+ vpaddq %ymm13,%ymm8,%ymm8
+
+ vpsrlq $29,%ymm4,%ymm14
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm11
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm12
+ vpermq $0x93,%ymm14,%ymm14
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm13
+ vpermq $0x93,%ymm11,%ymm11
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm13,%ymm13
+
+ vpblendd $3,%ymm9,%ymm14,%ymm10
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm14,%ymm11,%ymm14
+ vpaddq %ymm10,%ymm4,%ymm4
+ vpblendd $3,%ymm11,%ymm12,%ymm11
+ vpaddq %ymm14,%ymm5,%ymm5
+ vmovdqu %ymm4,128-128(%rdi)
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm11,%ymm6,%ymm6
+ vmovdqu %ymm5,160-128(%rdi)
+ vpblendd $3,%ymm13,%ymm0,%ymm13
+ vpaddq %ymm12,%ymm7,%ymm7
+ vmovdqu %ymm6,192-128(%rdi)
+ vpaddq %ymm13,%ymm8,%ymm8
+ vmovdqu %ymm7,224-128(%rdi)
+ vmovdqu %ymm8,256-128(%rdi)
+
+ movq %rdi,%rsi
+ decl %r8d
+ jne .LOOP_GRANDE_SQR_1024
+
+ vzeroall
+ movq %rbp,%rax
+.cfi_def_cfa_register %rax
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lsqr_1024_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+.globl rsaz_1024_mul_avx2
+.hidden rsaz_1024_mul_avx2
+.type rsaz_1024_mul_avx2,@function
+.align 64
+rsaz_1024_mul_avx2:
+.cfi_startproc
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ movq %rax,%rbp
+.cfi_def_cfa_register %rbp
+ vzeroall
+ movq %rdx,%r13
+ subq $64,%rsp
+
+
+
+
+
+
+.byte 0x67,0x67
+ movq %rsi,%r15
+ andq $4095,%r15
+ addq $320,%r15
+ shrq $12,%r15
+ movq %rsi,%r15
+ cmovnzq %r13,%rsi
+ cmovnzq %r15,%r13
+
+ movq %rcx,%r15
+ subq $-128,%rsi
+ subq $-128,%rcx
+ subq $-128,%rdi
+
+ andq $4095,%r15
+ addq $320,%r15
+.byte 0x67,0x67
+ shrq $12,%r15
+ jz .Lmul_1024_no_n_copy
+
+
+
+
+
+ subq $320,%rsp
+ vmovdqu 0-128(%rcx),%ymm0
+ andq $-512,%rsp
+ vmovdqu 32-128(%rcx),%ymm1
+ vmovdqu 64-128(%rcx),%ymm2
+ vmovdqu 96-128(%rcx),%ymm3
+ vmovdqu 128-128(%rcx),%ymm4
+ vmovdqu 160-128(%rcx),%ymm5
+ vmovdqu 192-128(%rcx),%ymm6
+ vmovdqu 224-128(%rcx),%ymm7
+ vmovdqu 256-128(%rcx),%ymm8
+ leaq 64+128(%rsp),%rcx
+ vmovdqu %ymm0,0-128(%rcx)
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqu %ymm1,32-128(%rcx)
+ vpxor %ymm1,%ymm1,%ymm1
+ vmovdqu %ymm2,64-128(%rcx)
+ vpxor %ymm2,%ymm2,%ymm2
+ vmovdqu %ymm3,96-128(%rcx)
+ vpxor %ymm3,%ymm3,%ymm3
+ vmovdqu %ymm4,128-128(%rcx)
+ vpxor %ymm4,%ymm4,%ymm4
+ vmovdqu %ymm5,160-128(%rcx)
+ vpxor %ymm5,%ymm5,%ymm5
+ vmovdqu %ymm6,192-128(%rcx)
+ vpxor %ymm6,%ymm6,%ymm6
+ vmovdqu %ymm7,224-128(%rcx)
+ vpxor %ymm7,%ymm7,%ymm7
+ vmovdqu %ymm8,256-128(%rcx)
+ vmovdqa %ymm0,%ymm8
+ vmovdqu %ymm9,288-128(%rcx)
+.Lmul_1024_no_n_copy:
+ andq $-64,%rsp
+
+ movq (%r13),%rbx
+ vpbroadcastq (%r13),%ymm10
+ vmovdqu %ymm0,(%rsp)
+ xorq %r9,%r9
+.byte 0x67
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+
+ vmovdqu .Land_mask(%rip),%ymm15
+ movl $9,%r14d
+ vmovdqu %ymm9,288-128(%rdi)
+ jmp .Loop_mul_1024
+
+.align 32
+.Loop_mul_1024:
+ vpsrlq $29,%ymm3,%ymm9
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %r9,%rax
+ movq %rbx,%r10
+ imulq 8-128(%rsi),%r10
+ addq 8(%rsp),%r10
+
+ movq %rax,%r9
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ movq %rbx,%r11
+ imulq 16-128(%rsi),%r11
+ addq 16(%rsp),%r11
+
+ movq %rbx,%r12
+ imulq 24-128(%rsi),%r12
+ addq 24(%rsp),%r12
+ vpmuludq 32-128(%rsi),%ymm10,%ymm0
+ vmovd %eax,%xmm11
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq 64-128(%rsi),%ymm10,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq 96-128(%rsi),%ymm10,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq 128-128(%rsi),%ymm10,%ymm0
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq 160-128(%rsi),%ymm10,%ymm12
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq 192-128(%rsi),%ymm10,%ymm13
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq 224-128(%rsi),%ymm10,%ymm0
+ vpermq $0x93,%ymm9,%ymm9
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq 256-128(%rsi),%ymm10,%ymm12
+ vpbroadcastq 8(%r13),%ymm10
+ vpaddq %ymm12,%ymm8,%ymm8
+
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r9
+ movq %rdx,%rax
+ imulq 8-128(%rcx),%rax
+ addq %rax,%r10
+ movq %rdx,%rax
+ imulq 16-128(%rcx),%rax
+ addq %rax,%r11
+ shrq $29,%r9
+ imulq 24-128(%rcx),%rdx
+ addq %rdx,%r12
+ addq %r9,%r10
+
+ vpmuludq 32-128(%rcx),%ymm11,%ymm13
+ vmovq %xmm10,%rbx
+ vpaddq %ymm13,%ymm1,%ymm1
+ vpmuludq 64-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm0,%ymm2,%ymm2
+ vpmuludq 96-128(%rcx),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpmuludq 128-128(%rcx),%ymm11,%ymm13
+ vpaddq %ymm13,%ymm4,%ymm4
+ vpmuludq 160-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm0,%ymm5,%ymm5
+ vpmuludq 192-128(%rcx),%ymm11,%ymm12
+ vpaddq %ymm12,%ymm6,%ymm6
+ vpmuludq 224-128(%rcx),%ymm11,%ymm13
+ vpblendd $3,%ymm14,%ymm9,%ymm12
+ vpaddq %ymm13,%ymm7,%ymm7
+ vpmuludq 256-128(%rcx),%ymm11,%ymm0
+ vpaddq %ymm12,%ymm3,%ymm3
+ vpaddq %ymm0,%ymm8,%ymm8
+
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %rax,%r10
+ vmovdqu -8+32-128(%rsi),%ymm12
+ movq %rbx,%rax
+ imulq 8-128(%rsi),%rax
+ addq %rax,%r11
+ vmovdqu -8+64-128(%rsi),%ymm13
+
+ movq %r10,%rax
+ vpblendd $0xfc,%ymm14,%ymm9,%ymm9
+ imull %r8d,%eax
+ vpaddq %ymm9,%ymm4,%ymm4
+ andl $0x1fffffff,%eax
+
+ imulq 16-128(%rsi),%rbx
+ addq %rbx,%r12
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovd %eax,%xmm11
+ vmovdqu -8+96-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -8+128-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -8+160-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -8+192-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -8+224-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -8+256-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -8+288-128(%rsi),%ymm9
+ vpaddq %ymm12,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpaddq %ymm13,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm9,%ymm9
+ vpbroadcastq 16(%r13),%ymm10
+
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r10
+ vmovdqu -8+32-128(%rcx),%ymm0
+ movq %rdx,%rax
+ imulq 8-128(%rcx),%rax
+ addq %rax,%r11
+ vmovdqu -8+64-128(%rcx),%ymm12
+ shrq $29,%r10
+ imulq 16-128(%rcx),%rdx
+ addq %rdx,%r12
+ addq %r10,%r11
+
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -8+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -8+128-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -8+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -8+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -8+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -8+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -8+288-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ vmovdqu -16+32-128(%rsi),%ymm0
+ movq %rbx,%rax
+ imulq -128(%rsi),%rax
+ addq %r11,%rax
+
+ vmovdqu -16+64-128(%rsi),%ymm12
+ movq %rax,%r11
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ imulq 8-128(%rsi),%rbx
+ addq %rbx,%r12
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovd %eax,%xmm11
+ vmovdqu -16+96-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -16+128-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -16+160-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -16+192-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -16+224-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -16+256-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -16+288-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq 24(%r13),%ymm10
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ vmovdqu -16+32-128(%rcx),%ymm0
+ movq %rax,%rdx
+ imulq -128(%rcx),%rax
+ addq %rax,%r11
+ vmovdqu -16+64-128(%rcx),%ymm12
+ imulq 8-128(%rcx),%rdx
+ addq %rdx,%r12
+ shrq $29,%r11
+
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -16+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -16+128-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -16+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -16+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -16+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -16+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -16+288-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -24+32-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+64-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm9,%ymm9
+
+ addq %r11,%r12
+ imulq -128(%rsi),%rbx
+ addq %rbx,%r12
+
+ movq %r12,%rax
+ imull %r8d,%eax
+ andl $0x1fffffff,%eax
+
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovd %eax,%xmm11
+ vmovdqu -24+96-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm1
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpbroadcastq %xmm11,%ymm11
+ vmovdqu -24+128-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm2,%ymm2
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -24+160-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm3
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -24+192-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm4
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vmovdqu -24+224-128(%rsi),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vmovdqu -24+256-128(%rsi),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpmuludq %ymm10,%ymm0,%ymm0
+ vmovdqu -24+288-128(%rsi),%ymm13
+ vpaddq %ymm0,%ymm7,%ymm7
+ vpmuludq %ymm10,%ymm12,%ymm12
+ vpaddq %ymm12,%ymm8,%ymm8
+ vpmuludq %ymm10,%ymm13,%ymm13
+ vpbroadcastq 32(%r13),%ymm10
+ vpaddq %ymm13,%ymm9,%ymm9
+ addq $32,%r13
+
+ vmovdqu -24+32-128(%rcx),%ymm0
+ imulq -128(%rcx),%rax
+ addq %rax,%r12
+ shrq $29,%r12
+
+ vmovdqu -24+64-128(%rcx),%ymm12
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovq %xmm10,%rbx
+ vmovdqu -24+96-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm1,%ymm0
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu %ymm0,(%rsp)
+ vpaddq %ymm12,%ymm2,%ymm1
+ vmovdqu -24+128-128(%rcx),%ymm0
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+160-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm3,%ymm2
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -24+192-128(%rcx),%ymm13
+ vpaddq %ymm0,%ymm4,%ymm3
+ vpmuludq %ymm11,%ymm12,%ymm12
+ vmovdqu -24+224-128(%rcx),%ymm0
+ vpaddq %ymm12,%ymm5,%ymm4
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovdqu -24+256-128(%rcx),%ymm12
+ vpaddq %ymm13,%ymm6,%ymm5
+ vpmuludq %ymm11,%ymm0,%ymm0
+ vmovdqu -24+288-128(%rcx),%ymm13
+ movq %r12,%r9
+ vpaddq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm11,%ymm12,%ymm12
+ addq (%rsp),%r9
+ vpaddq %ymm12,%ymm8,%ymm7
+ vpmuludq %ymm11,%ymm13,%ymm13
+ vmovq %r12,%xmm12
+ vpaddq %ymm13,%ymm9,%ymm8
+
+ decl %r14d
+ jnz .Loop_mul_1024
+ vpaddq (%rsp),%ymm12,%ymm0
+
+ vpsrlq $29,%ymm0,%ymm12
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm13
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm10,%ymm10
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpermq $0x93,%ymm11,%ymm11
+ vpaddq %ymm9,%ymm0,%ymm0
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpblendd $3,%ymm11,%ymm14,%ymm11
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm11,%ymm4,%ymm4
+
+ vpsrlq $29,%ymm0,%ymm12
+ vpand %ymm15,%ymm0,%ymm0
+ vpsrlq $29,%ymm1,%ymm13
+ vpand %ymm15,%ymm1,%ymm1
+ vpsrlq $29,%ymm2,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm2,%ymm2
+ vpsrlq $29,%ymm3,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm3,%ymm3
+ vpermq $0x93,%ymm10,%ymm10
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm11,%ymm11
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm0,%ymm0
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm1,%ymm1
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm2,%ymm2
+ vpblendd $3,%ymm11,%ymm14,%ymm11
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm11,%ymm4,%ymm4
+
+ vmovdqu %ymm0,0-128(%rdi)
+ vmovdqu %ymm1,32-128(%rdi)
+ vmovdqu %ymm2,64-128(%rdi)
+ vmovdqu %ymm3,96-128(%rdi)
+ vpsrlq $29,%ymm4,%ymm12
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm13
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm10,%ymm10
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm4,%ymm4
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpblendd $3,%ymm11,%ymm0,%ymm11
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpaddq %ymm11,%ymm8,%ymm8
+
+ vpsrlq $29,%ymm4,%ymm12
+ vpand %ymm15,%ymm4,%ymm4
+ vpsrlq $29,%ymm5,%ymm13
+ vpand %ymm15,%ymm5,%ymm5
+ vpsrlq $29,%ymm6,%ymm10
+ vpermq $0x93,%ymm12,%ymm12
+ vpand %ymm15,%ymm6,%ymm6
+ vpsrlq $29,%ymm7,%ymm11
+ vpermq $0x93,%ymm13,%ymm13
+ vpand %ymm15,%ymm7,%ymm7
+ vpsrlq $29,%ymm8,%ymm0
+ vpermq $0x93,%ymm10,%ymm10
+ vpand %ymm15,%ymm8,%ymm8
+ vpermq $0x93,%ymm11,%ymm11
+
+ vpblendd $3,%ymm14,%ymm12,%ymm9
+ vpermq $0x93,%ymm0,%ymm0
+ vpblendd $3,%ymm12,%ymm13,%ymm12
+ vpaddq %ymm9,%ymm4,%ymm4
+ vpblendd $3,%ymm13,%ymm10,%ymm13
+ vpaddq %ymm12,%ymm5,%ymm5
+ vpblendd $3,%ymm10,%ymm11,%ymm10
+ vpaddq %ymm13,%ymm6,%ymm6
+ vpblendd $3,%ymm11,%ymm0,%ymm11
+ vpaddq %ymm10,%ymm7,%ymm7
+ vpaddq %ymm11,%ymm8,%ymm8
+
+ vmovdqu %ymm4,128-128(%rdi)
+ vmovdqu %ymm5,160-128(%rdi)
+ vmovdqu %ymm6,192-128(%rdi)
+ vmovdqu %ymm7,224-128(%rdi)
+ vmovdqu %ymm8,256-128(%rdi)
+ vzeroupper
+
+ movq %rbp,%rax
+.cfi_def_cfa_register %rax
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lmul_1024_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
+.globl rsaz_1024_red2norm_avx2
+.hidden rsaz_1024_red2norm_avx2
+.type rsaz_1024_red2norm_avx2,@function
+.align 32
+rsaz_1024_red2norm_avx2:
+.cfi_startproc
+ subq $-128,%rsi
+ xorq %rax,%rax
+ movq -128(%rsi),%r8
+ movq -120(%rsi),%r9
+ movq -112(%rsi),%r10
+ shlq $0,%r8
+ shlq $29,%r9
+ movq %r10,%r11
+ shlq $58,%r10
+ shrq $6,%r11
+ addq %r8,%rax
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,0(%rdi)
+ movq %r11,%rax
+ movq -104(%rsi),%r8
+ movq -96(%rsi),%r9
+ shlq $23,%r8
+ movq %r9,%r10
+ shlq $52,%r9
+ shrq $12,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,8(%rdi)
+ movq %r10,%rax
+ movq -88(%rsi),%r11
+ movq -80(%rsi),%r8
+ shlq $17,%r11
+ movq %r8,%r9
+ shlq $46,%r8
+ shrq $18,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,16(%rdi)
+ movq %r9,%rax
+ movq -72(%rsi),%r10
+ movq -64(%rsi),%r11
+ shlq $11,%r10
+ movq %r11,%r8
+ shlq $40,%r11
+ shrq $24,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,24(%rdi)
+ movq %r8,%rax
+ movq -56(%rsi),%r9
+ movq -48(%rsi),%r10
+ movq -40(%rsi),%r11
+ shlq $5,%r9
+ shlq $34,%r10
+ movq %r11,%r8
+ shlq $63,%r11
+ shrq $1,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,32(%rdi)
+ movq %r8,%rax
+ movq -32(%rsi),%r9
+ movq -24(%rsi),%r10
+ shlq $28,%r9
+ movq %r10,%r11
+ shlq $57,%r10
+ shrq $7,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,40(%rdi)
+ movq %r11,%rax
+ movq -16(%rsi),%r8
+ movq -8(%rsi),%r9
+ shlq $22,%r8
+ movq %r9,%r10
+ shlq $51,%r9
+ shrq $13,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,48(%rdi)
+ movq %r10,%rax
+ movq 0(%rsi),%r11
+ movq 8(%rsi),%r8
+ shlq $16,%r11
+ movq %r8,%r9
+ shlq $45,%r8
+ shrq $19,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,56(%rdi)
+ movq %r9,%rax
+ movq 16(%rsi),%r10
+ movq 24(%rsi),%r11
+ shlq $10,%r10
+ movq %r11,%r8
+ shlq $39,%r11
+ shrq $25,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,64(%rdi)
+ movq %r8,%rax
+ movq 32(%rsi),%r9
+ movq 40(%rsi),%r10
+ movq 48(%rsi),%r11
+ shlq $4,%r9
+ shlq $33,%r10
+ movq %r11,%r8
+ shlq $62,%r11
+ shrq $2,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,72(%rdi)
+ movq %r8,%rax
+ movq 56(%rsi),%r9
+ movq 64(%rsi),%r10
+ shlq $27,%r9
+ movq %r10,%r11
+ shlq $56,%r10
+ shrq $8,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,80(%rdi)
+ movq %r11,%rax
+ movq 72(%rsi),%r8
+ movq 80(%rsi),%r9
+ shlq $21,%r8
+ movq %r9,%r10
+ shlq $50,%r9
+ shrq $14,%r10
+ addq %r8,%rax
+ addq %r9,%rax
+ adcq $0,%r10
+ movq %rax,88(%rdi)
+ movq %r10,%rax
+ movq 88(%rsi),%r11
+ movq 96(%rsi),%r8
+ shlq $15,%r11
+ movq %r8,%r9
+ shlq $44,%r8
+ shrq $20,%r9
+ addq %r11,%rax
+ addq %r8,%rax
+ adcq $0,%r9
+ movq %rax,96(%rdi)
+ movq %r9,%rax
+ movq 104(%rsi),%r10
+ movq 112(%rsi),%r11
+ shlq $9,%r10
+ movq %r11,%r8
+ shlq $38,%r11
+ shrq $26,%r8
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,104(%rdi)
+ movq %r8,%rax
+ movq 120(%rsi),%r9
+ movq 128(%rsi),%r10
+ movq 136(%rsi),%r11
+ shlq $3,%r9
+ shlq $32,%r10
+ movq %r11,%r8
+ shlq $61,%r11
+ shrq $3,%r8
+ addq %r9,%rax
+ addq %r10,%rax
+ addq %r11,%rax
+ adcq $0,%r8
+ movq %rax,112(%rdi)
+ movq %r8,%rax
+ movq 144(%rsi),%r9
+ movq 152(%rsi),%r10
+ shlq $26,%r9
+ movq %r10,%r11
+ shlq $55,%r10
+ shrq $9,%r11
+ addq %r9,%rax
+ addq %r10,%rax
+ adcq $0,%r11
+ movq %rax,120(%rdi)
+ movq %r11,%rax
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
+
+.globl rsaz_1024_norm2red_avx2
+.hidden rsaz_1024_norm2red_avx2
+.type rsaz_1024_norm2red_avx2,@function
+.align 32
+rsaz_1024_norm2red_avx2:
+.cfi_startproc
+ subq $-128,%rdi
+ movq (%rsi),%r8
+ movl $0x1fffffff,%eax
+ movq 8(%rsi),%r9
+ movq %r8,%r11
+ shrq $0,%r11
+ andq %rax,%r11
+ movq %r11,-128(%rdi)
+ movq %r8,%r10
+ shrq $29,%r10
+ andq %rax,%r10
+ movq %r10,-120(%rdi)
+ shrdq $58,%r9,%r8
+ andq %rax,%r8
+ movq %r8,-112(%rdi)
+ movq 16(%rsi),%r10
+ movq %r9,%r8
+ shrq $23,%r8
+ andq %rax,%r8
+ movq %r8,-104(%rdi)
+ shrdq $52,%r10,%r9
+ andq %rax,%r9
+ movq %r9,-96(%rdi)
+ movq 24(%rsi),%r11
+ movq %r10,%r9
+ shrq $17,%r9
+ andq %rax,%r9
+ movq %r9,-88(%rdi)
+ shrdq $46,%r11,%r10
+ andq %rax,%r10
+ movq %r10,-80(%rdi)
+ movq 32(%rsi),%r8
+ movq %r11,%r10
+ shrq $11,%r10
+ andq %rax,%r10
+ movq %r10,-72(%rdi)
+ shrdq $40,%r8,%r11
+ andq %rax,%r11
+ movq %r11,-64(%rdi)
+ movq 40(%rsi),%r9
+ movq %r8,%r11
+ shrq $5,%r11
+ andq %rax,%r11
+ movq %r11,-56(%rdi)
+ movq %r8,%r10
+ shrq $34,%r10
+ andq %rax,%r10
+ movq %r10,-48(%rdi)
+ shrdq $63,%r9,%r8
+ andq %rax,%r8
+ movq %r8,-40(%rdi)
+ movq 48(%rsi),%r10
+ movq %r9,%r8
+ shrq $28,%r8
+ andq %rax,%r8
+ movq %r8,-32(%rdi)
+ shrdq $57,%r10,%r9
+ andq %rax,%r9
+ movq %r9,-24(%rdi)
+ movq 56(%rsi),%r11
+ movq %r10,%r9
+ shrq $22,%r9
+ andq %rax,%r9
+ movq %r9,-16(%rdi)
+ shrdq $51,%r11,%r10
+ andq %rax,%r10
+ movq %r10,-8(%rdi)
+ movq 64(%rsi),%r8
+ movq %r11,%r10
+ shrq $16,%r10
+ andq %rax,%r10
+ movq %r10,0(%rdi)
+ shrdq $45,%r8,%r11
+ andq %rax,%r11
+ movq %r11,8(%rdi)
+ movq 72(%rsi),%r9
+ movq %r8,%r11
+ shrq $10,%r11
+ andq %rax,%r11
+ movq %r11,16(%rdi)
+ shrdq $39,%r9,%r8
+ andq %rax,%r8
+ movq %r8,24(%rdi)
+ movq 80(%rsi),%r10
+ movq %r9,%r8
+ shrq $4,%r8
+ andq %rax,%r8
+ movq %r8,32(%rdi)
+ movq %r9,%r11
+ shrq $33,%r11
+ andq %rax,%r11
+ movq %r11,40(%rdi)
+ shrdq $62,%r10,%r9
+ andq %rax,%r9
+ movq %r9,48(%rdi)
+ movq 88(%rsi),%r11
+ movq %r10,%r9
+ shrq $27,%r9
+ andq %rax,%r9
+ movq %r9,56(%rdi)
+ shrdq $56,%r11,%r10
+ andq %rax,%r10
+ movq %r10,64(%rdi)
+ movq 96(%rsi),%r8
+ movq %r11,%r10
+ shrq $21,%r10
+ andq %rax,%r10
+ movq %r10,72(%rdi)
+ shrdq $50,%r8,%r11
+ andq %rax,%r11
+ movq %r11,80(%rdi)
+ movq 104(%rsi),%r9
+ movq %r8,%r11
+ shrq $15,%r11
+ andq %rax,%r11
+ movq %r11,88(%rdi)
+ shrdq $44,%r9,%r8
+ andq %rax,%r8
+ movq %r8,96(%rdi)
+ movq 112(%rsi),%r10
+ movq %r9,%r8
+ shrq $9,%r8
+ andq %rax,%r8
+ movq %r8,104(%rdi)
+ shrdq $38,%r10,%r9
+ andq %rax,%r9
+ movq %r9,112(%rdi)
+ movq 120(%rsi),%r11
+ movq %r10,%r9
+ shrq $3,%r9
+ andq %rax,%r9
+ movq %r9,120(%rdi)
+ movq %r10,%r8
+ shrq $32,%r8
+ andq %rax,%r8
+ movq %r8,128(%rdi)
+ shrdq $61,%r11,%r10
+ andq %rax,%r10
+ movq %r10,136(%rdi)
+ xorq %r8,%r8
+ movq %r11,%r10
+ shrq $26,%r10
+ andq %rax,%r10
+ movq %r10,144(%rdi)
+ shrdq $55,%r8,%r11
+ andq %rax,%r11
+ movq %r11,152(%rdi)
+ movq %r8,160(%rdi)
+ movq %r8,168(%rdi)
+ movq %r8,176(%rdi)
+ movq %r8,184(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
+.globl rsaz_1024_scatter5_avx2
+.hidden rsaz_1024_scatter5_avx2
+.type rsaz_1024_scatter5_avx2,@function
+.align 32
+rsaz_1024_scatter5_avx2:
+.cfi_startproc
+ vzeroupper
+ vmovdqu .Lscatter_permd(%rip),%ymm5
+ shll $4,%edx
+ leaq (%rdi,%rdx,1),%rdi
+ movl $9,%eax
+ jmp .Loop_scatter_1024
+
+.align 32
+.Loop_scatter_1024:
+ vmovdqu (%rsi),%ymm0
+ leaq 32(%rsi),%rsi
+ vpermd %ymm0,%ymm5,%ymm0
+ vmovdqu %xmm0,(%rdi)
+ leaq 512(%rdi),%rdi
+ decl %eax
+ jnz .Loop_scatter_1024
+
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
+
+.globl rsaz_1024_gather5_avx2
+.hidden rsaz_1024_gather5_avx2
+.type rsaz_1024_gather5_avx2,@function
+.align 32
+rsaz_1024_gather5_avx2:
+.cfi_startproc
+ vzeroupper
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ leaq -256(%rsp),%rsp
+ andq $-32,%rsp
+ leaq .Linc(%rip),%r10
+ leaq -128(%rsp),%rax
+
+ vmovd %edx,%xmm4
+ vmovdqa (%r10),%ymm0
+ vmovdqa 32(%r10),%ymm1
+ vmovdqa 64(%r10),%ymm5
+ vpbroadcastd %xmm4,%ymm4
+
+ vpaddd %ymm5,%ymm0,%ymm2
+ vpcmpeqd %ymm4,%ymm0,%ymm0
+ vpaddd %ymm5,%ymm1,%ymm3
+ vpcmpeqd %ymm4,%ymm1,%ymm1
+ vmovdqa %ymm0,0+128(%rax)
+ vpaddd %ymm5,%ymm2,%ymm0
+ vpcmpeqd %ymm4,%ymm2,%ymm2
+ vmovdqa %ymm1,32+128(%rax)
+ vpaddd %ymm5,%ymm3,%ymm1
+ vpcmpeqd %ymm4,%ymm3,%ymm3
+ vmovdqa %ymm2,64+128(%rax)
+ vpaddd %ymm5,%ymm0,%ymm2
+ vpcmpeqd %ymm4,%ymm0,%ymm0
+ vmovdqa %ymm3,96+128(%rax)
+ vpaddd %ymm5,%ymm1,%ymm3
+ vpcmpeqd %ymm4,%ymm1,%ymm1
+ vmovdqa %ymm0,128+128(%rax)
+ vpaddd %ymm5,%ymm2,%ymm8
+ vpcmpeqd %ymm4,%ymm2,%ymm2
+ vmovdqa %ymm1,160+128(%rax)
+ vpaddd %ymm5,%ymm3,%ymm9
+ vpcmpeqd %ymm4,%ymm3,%ymm3
+ vmovdqa %ymm2,192+128(%rax)
+ vpaddd %ymm5,%ymm8,%ymm10
+ vpcmpeqd %ymm4,%ymm8,%ymm8
+ vmovdqa %ymm3,224+128(%rax)
+ vpaddd %ymm5,%ymm9,%ymm11
+ vpcmpeqd %ymm4,%ymm9,%ymm9
+ vpaddd %ymm5,%ymm10,%ymm12
+ vpcmpeqd %ymm4,%ymm10,%ymm10
+ vpaddd %ymm5,%ymm11,%ymm13
+ vpcmpeqd %ymm4,%ymm11,%ymm11
+ vpaddd %ymm5,%ymm12,%ymm14
+ vpcmpeqd %ymm4,%ymm12,%ymm12
+ vpaddd %ymm5,%ymm13,%ymm15
+ vpcmpeqd %ymm4,%ymm13,%ymm13
+ vpcmpeqd %ymm4,%ymm14,%ymm14
+ vpcmpeqd %ymm4,%ymm15,%ymm15
+
+ vmovdqa -32(%r10),%ymm7
+ leaq 128(%rsi),%rsi
+ movl $9,%edx
+
+.Loop_gather_1024:
+ vmovdqa 0-128(%rsi),%ymm0
+ vmovdqa 32-128(%rsi),%ymm1
+ vmovdqa 64-128(%rsi),%ymm2
+ vmovdqa 96-128(%rsi),%ymm3
+ vpand 0+128(%rax),%ymm0,%ymm0
+ vpand 32+128(%rax),%ymm1,%ymm1
+ vpand 64+128(%rax),%ymm2,%ymm2
+ vpor %ymm0,%ymm1,%ymm4
+ vpand 96+128(%rax),%ymm3,%ymm3
+ vmovdqa 128-128(%rsi),%ymm0
+ vmovdqa 160-128(%rsi),%ymm1
+ vpor %ymm2,%ymm3,%ymm5
+ vmovdqa 192-128(%rsi),%ymm2
+ vmovdqa 224-128(%rsi),%ymm3
+ vpand 128+128(%rax),%ymm0,%ymm0
+ vpand 160+128(%rax),%ymm1,%ymm1
+ vpand 192+128(%rax),%ymm2,%ymm2
+ vpor %ymm0,%ymm4,%ymm4
+ vpand 224+128(%rax),%ymm3,%ymm3
+ vpand 256-128(%rsi),%ymm8,%ymm0
+ vpor %ymm1,%ymm5,%ymm5
+ vpand 288-128(%rsi),%ymm9,%ymm1
+ vpor %ymm2,%ymm4,%ymm4
+ vpand 320-128(%rsi),%ymm10,%ymm2
+ vpor %ymm3,%ymm5,%ymm5
+ vpand 352-128(%rsi),%ymm11,%ymm3
+ vpor %ymm0,%ymm4,%ymm4
+ vpand 384-128(%rsi),%ymm12,%ymm0
+ vpor %ymm1,%ymm5,%ymm5
+ vpand 416-128(%rsi),%ymm13,%ymm1
+ vpor %ymm2,%ymm4,%ymm4
+ vpand 448-128(%rsi),%ymm14,%ymm2
+ vpor %ymm3,%ymm5,%ymm5
+ vpand 480-128(%rsi),%ymm15,%ymm3
+ leaq 512(%rsi),%rsi
+ vpor %ymm0,%ymm4,%ymm4
+ vpor %ymm1,%ymm5,%ymm5
+ vpor %ymm2,%ymm4,%ymm4
+ vpor %ymm3,%ymm5,%ymm5
+
+ vpor %ymm5,%ymm4,%ymm4
+ vextracti128 $1,%ymm4,%xmm5
+ vpor %xmm4,%xmm5,%xmm5
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqu %ymm5,(%rdi)
+ leaq 32(%rdi),%rdi
+ decl %edx
+ jnz .Loop_gather_1024
+
+ vpxor %ymm0,%ymm0,%ymm0
+ vmovdqu %ymm0,(%rdi)
+ vzeroupper
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+ .byte 0xf3,0xc3
+.cfi_endproc
+.LSEH_end_rsaz_1024_gather5:
+.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
+.align 64
+.Land_mask:
+.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
+.Lscatter_permd:
+.long 0,2,4,6,7,7,7,7
+.Lgather_permd:
+.long 0,7,1,7,2,7,3,7
+.Linc:
+.long 0,0,0,0, 1,1,1,1
+.long 2,2,2,2, 3,3,3,3
+.long 4,4,4,4, 4,4,4,4
+.align 64
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S
@@ -1,0 +1,5468 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+.globl sha1_block_data_order
+.hidden sha1_block_data_order
+.type sha1_block_data_order,@function
+.align 16
+sha1_block_data_order:
+.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%r10
+ movl 0(%r10),%r9d
+ movl 4(%r10),%r8d
+ movl 8(%r10),%r10d
+ testl $512,%r8d
+ jz .Lialu
+ testl $536870912,%r10d
+ jnz _shaext_shortcut
+ andl $296,%r10d
+ cmpl $296,%r10d
+ je _avx2_shortcut
+ andl $268435456,%r8d
+ andl $1073741824,%r9d
+ orl %r9d,%r8d
+ cmpl $1342177280,%r8d
+ je _avx_shortcut
+ jmp _ssse3_shortcut
+
+.align 16
+.Lialu:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ movq %rdi,%r8
+ subq $72,%rsp
+ movq %rsi,%r9
+ andq $-64,%rsp
+ movq %rdx,%r10
+ movq %rax,64(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08
+.Lprologue:
+
+ movl 0(%r8),%esi
+ movl 4(%r8),%edi
+ movl 8(%r8),%r11d
+ movl 12(%r8),%r12d
+ movl 16(%r8),%r13d
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ movl 0(%r9),%edx
+ bswapl %edx
+ movl 4(%r9),%ebp
+ movl %r12d,%eax
+ movl %edx,0(%rsp)
+ movl %esi,%ecx
+ bswapl %ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%rdx,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 8(%r9),%r14d
+ movl %r11d,%eax
+ movl %ebp,4(%rsp)
+ movl %r13d,%ecx
+ bswapl %r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%rbp,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 12(%r9),%edx
+ movl %edi,%eax
+ movl %r14d,8(%rsp)
+ movl %r12d,%ecx
+ bswapl %edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%r14,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 16(%r9),%ebp
+ movl %esi,%eax
+ movl %edx,12(%rsp)
+ movl %r11d,%ecx
+ bswapl %ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%rdx,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 20(%r9),%r14d
+ movl %r13d,%eax
+ movl %ebp,16(%rsp)
+ movl %edi,%ecx
+ bswapl %r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%rbp,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl 24(%r9),%edx
+ movl %r12d,%eax
+ movl %r14d,20(%rsp)
+ movl %esi,%ecx
+ bswapl %edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%r14,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 28(%r9),%ebp
+ movl %r11d,%eax
+ movl %edx,24(%rsp)
+ movl %r13d,%ecx
+ bswapl %ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%rdx,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 32(%r9),%r14d
+ movl %edi,%eax
+ movl %ebp,28(%rsp)
+ movl %r12d,%ecx
+ bswapl %r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%rbp,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 36(%r9),%edx
+ movl %esi,%eax
+ movl %r14d,32(%rsp)
+ movl %r11d,%ecx
+ bswapl %edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%r14,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 40(%r9),%ebp
+ movl %r13d,%eax
+ movl %edx,36(%rsp)
+ movl %edi,%ecx
+ bswapl %ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%rdx,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ movl 44(%r9),%r14d
+ movl %r12d,%eax
+ movl %ebp,40(%rsp)
+ movl %esi,%ecx
+ bswapl %r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ andl %edi,%eax
+ leal 1518500249(%rbp,%r13,1),%r13d
+ addl %ecx,%r13d
+ xorl %r12d,%eax
+ roll $30,%edi
+ addl %eax,%r13d
+ movl 48(%r9),%edx
+ movl %r11d,%eax
+ movl %r14d,44(%rsp)
+ movl %r13d,%ecx
+ bswapl %edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ andl %esi,%eax
+ leal 1518500249(%r14,%r12,1),%r12d
+ addl %ecx,%r12d
+ xorl %r11d,%eax
+ roll $30,%esi
+ addl %eax,%r12d
+ movl 52(%r9),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
+ movl %r12d,%ecx
+ bswapl %ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ andl %r13d,%eax
+ leal 1518500249(%rdx,%r11,1),%r11d
+ addl %ecx,%r11d
+ xorl %edi,%eax
+ roll $30,%r13d
+ addl %eax,%r11d
+ movl 56(%r9),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
+ movl %r11d,%ecx
+ bswapl %r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ andl %r12d,%eax
+ leal 1518500249(%rbp,%rdi,1),%edi
+ addl %ecx,%edi
+ xorl %esi,%eax
+ roll $30,%r12d
+ addl %eax,%edi
+ movl 60(%r9),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
+ movl %edi,%ecx
+ bswapl %edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ andl %r11d,%eax
+ leal 1518500249(%r14,%rsi,1),%esi
+ addl %ecx,%esi
+ xorl %r13d,%eax
+ roll $30,%r11d
+ addl %eax,%esi
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
+ movl %esi,%ecx
+ xorl 8(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ andl %edi,%eax
+ leal 1518500249(%rdx,%r13,1),%r13d
+ roll $30,%edi
+ xorl %r12d,%eax
+ addl %ecx,%r13d
+ roll $1,%ebp
+ addl %eax,%r13d
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
+ movl %r13d,%ecx
+ xorl 12(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%r14d
+ andl %esi,%eax
+ leal 1518500249(%rbp,%r12,1),%r12d
+ roll $30,%esi
+ xorl %r11d,%eax
+ addl %ecx,%r12d
+ roll $1,%r14d
+ addl %eax,%r12d
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
+ movl %r12d,%ecx
+ xorl 16(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%edx
+ andl %r13d,%eax
+ leal 1518500249(%r14,%r11,1),%r11d
+ roll $30,%r13d
+ xorl %edi,%eax
+ addl %ecx,%r11d
+ roll $1,%edx
+ addl %eax,%r11d
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
+ movl %r11d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%ebp
+ andl %r12d,%eax
+ leal 1518500249(%rdx,%rdi,1),%edi
+ roll $30,%r12d
+ xorl %esi,%eax
+ addl %ecx,%edi
+ roll $1,%ebp
+ addl %eax,%edi
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
+ movl %edi,%ecx
+ xorl 24(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%r14d
+ andl %r11d,%eax
+ leal 1518500249(%rbp,%rsi,1),%esi
+ roll $30,%r11d
+ xorl %r13d,%eax
+ addl %ecx,%esi
+ roll $1,%r14d
+ addl %eax,%esi
+ xorl 20(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,16(%rsp)
+ movl %esi,%ecx
+ xorl 28(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,20(%rsp)
+ movl %r13d,%ecx
+ xorl 32(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,24(%rsp)
+ movl %r12d,%ecx
+ xorl 36(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,28(%rsp)
+ movl %r11d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,32(%rsp)
+ movl %edi,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 4(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,36(%rsp)
+ movl %esi,%ecx
+ xorl 48(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 8(%rsp),%r14d
+ leal 1859775393(%rbp,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,40(%rsp)
+ movl %r13d,%ecx
+ xorl 52(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal 1859775393(%r14,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,44(%rsp)
+ movl %r12d,%ecx
+ xorl 56(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal 1859775393(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,48(%rsp)
+ movl %r11d,%ecx
+ xorl 60(%rsp),%r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal 1859775393(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,52(%rsp)
+ movl %edi,%ecx
+ xorl 0(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%edx
+ leal 1859775393(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,56(%rsp)
+ movl %esi,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%ebp
+ leal 1859775393(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 0(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,60(%rsp)
+ movl %r13d,%ecx
+ xorl 8(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%r14d
+ leal 1859775393(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 4(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,0(%rsp)
+ movl %r12d,%ecx
+ xorl 12(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%edx
+ leal 1859775393(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 8(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,4(%rsp)
+ movl %r11d,%ecx
+ xorl 16(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%ebp
+ leal 1859775393(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ xorl 12(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,8(%rsp)
+ movl %edi,%ecx
+ xorl 20(%rsp),%r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%r14d
+ leal 1859775393(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%r14d
+ xorl 16(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,12(%rsp)
+ movl %esi,%ecx
+ xorl 24(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%edx
+ leal 1859775393(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 20(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,16(%rsp)
+ movl %r13d,%ecx
+ xorl 28(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%ebp
+ leal 1859775393(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 24(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,20(%rsp)
+ movl %r12d,%ecx
+ xorl 32(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%r14d
+ leal 1859775393(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 28(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,24(%rsp)
+ movl %r11d,%ecx
+ xorl 36(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%edx
+ leal 1859775393(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 32(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,28(%rsp)
+ movl %edi,%ecx
+ xorl 40(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%ebp
+ leal 1859775393(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 36(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,32(%rsp)
+ movl %r12d,%ebx
+ xorl 44(%rsp),%r14d
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 4(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%r14d
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 40(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,36(%rsp)
+ movl %r11d,%ebx
+ xorl 48(%rsp),%edx
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 8(%rsp),%edx
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%edx
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 44(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,40(%rsp)
+ movl %edi,%ebx
+ xorl 52(%rsp),%ebp
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 12(%rsp),%ebp
+ leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%ebp
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 48(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,44(%rsp)
+ movl %esi,%ebx
+ xorl 56(%rsp),%r14d
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 16(%rsp),%r14d
+ leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%r14d
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 52(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,48(%rsp)
+ movl %r13d,%ebx
+ xorl 60(%rsp),%edx
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 20(%rsp),%edx
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%edx
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 56(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,52(%rsp)
+ movl %r12d,%ebx
+ xorl 0(%rsp),%ebp
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 24(%rsp),%ebp
+ leal -1894007588(%rdx,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%ebp
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 60(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,56(%rsp)
+ movl %r11d,%ebx
+ xorl 4(%rsp),%r14d
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 28(%rsp),%r14d
+ leal -1894007588(%rbp,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%r14d
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 0(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,60(%rsp)
+ movl %edi,%ebx
+ xorl 8(%rsp),%edx
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 32(%rsp),%edx
+ leal -1894007588(%r14,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%edx
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 4(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,0(%rsp)
+ movl %esi,%ebx
+ xorl 12(%rsp),%ebp
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 36(%rsp),%ebp
+ leal -1894007588(%rdx,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%ebp
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 8(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,4(%rsp)
+ movl %r13d,%ebx
+ xorl 16(%rsp),%r14d
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 40(%rsp),%r14d
+ leal -1894007588(%rbp,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%r14d
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 12(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,8(%rsp)
+ movl %r12d,%ebx
+ xorl 20(%rsp),%edx
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 44(%rsp),%edx
+ leal -1894007588(%r14,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%edx
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 16(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,12(%rsp)
+ movl %r11d,%ebx
+ xorl 24(%rsp),%ebp
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 48(%rsp),%ebp
+ leal -1894007588(%rdx,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%ebp
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 20(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,16(%rsp)
+ movl %edi,%ebx
+ xorl 28(%rsp),%r14d
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 52(%rsp),%r14d
+ leal -1894007588(%rbp,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%r14d
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 24(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,20(%rsp)
+ movl %esi,%ebx
+ xorl 32(%rsp),%edx
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 56(%rsp),%edx
+ leal -1894007588(%r14,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%edx
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 28(%rsp),%ebp
+ movl %r13d,%eax
+ movl %edx,24(%rsp)
+ movl %r13d,%ebx
+ xorl 36(%rsp),%ebp
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 60(%rsp),%ebp
+ leal -1894007588(%rdx,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%ebp
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 32(%rsp),%r14d
+ movl %r12d,%eax
+ movl %ebp,28(%rsp)
+ movl %r12d,%ebx
+ xorl 40(%rsp),%r14d
+ andl %r11d,%eax
+ movl %esi,%ecx
+ xorl 0(%rsp),%r14d
+ leal -1894007588(%rbp,%r13,1),%r13d
+ xorl %r11d,%ebx
+ roll $5,%ecx
+ addl %eax,%r13d
+ roll $1,%r14d
+ andl %edi,%ebx
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %ebx,%r13d
+ xorl 36(%rsp),%edx
+ movl %r11d,%eax
+ movl %r14d,32(%rsp)
+ movl %r11d,%ebx
+ xorl 44(%rsp),%edx
+ andl %edi,%eax
+ movl %r13d,%ecx
+ xorl 4(%rsp),%edx
+ leal -1894007588(%r14,%r12,1),%r12d
+ xorl %edi,%ebx
+ roll $5,%ecx
+ addl %eax,%r12d
+ roll $1,%edx
+ andl %esi,%ebx
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %ebx,%r12d
+ xorl 40(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,36(%rsp)
+ movl %edi,%ebx
+ xorl 48(%rsp),%ebp
+ andl %esi,%eax
+ movl %r12d,%ecx
+ xorl 8(%rsp),%ebp
+ leal -1894007588(%rdx,%r11,1),%r11d
+ xorl %esi,%ebx
+ roll $5,%ecx
+ addl %eax,%r11d
+ roll $1,%ebp
+ andl %r13d,%ebx
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %ebx,%r11d
+ xorl 44(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,40(%rsp)
+ movl %esi,%ebx
+ xorl 52(%rsp),%r14d
+ andl %r13d,%eax
+ movl %r11d,%ecx
+ xorl 12(%rsp),%r14d
+ leal -1894007588(%rbp,%rdi,1),%edi
+ xorl %r13d,%ebx
+ roll $5,%ecx
+ addl %eax,%edi
+ roll $1,%r14d
+ andl %r12d,%ebx
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %ebx,%edi
+ xorl 48(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,44(%rsp)
+ movl %r13d,%ebx
+ xorl 56(%rsp),%edx
+ andl %r12d,%eax
+ movl %edi,%ecx
+ xorl 16(%rsp),%edx
+ leal -1894007588(%r14,%rsi,1),%esi
+ xorl %r12d,%ebx
+ roll $5,%ecx
+ addl %eax,%esi
+ roll $1,%edx
+ andl %r11d,%ebx
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %ebx,%esi
+ xorl 52(%rsp),%ebp
+ movl %edi,%eax
+ movl %edx,48(%rsp)
+ movl %esi,%ecx
+ xorl 60(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%ebp
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 56(%rsp),%r14d
+ movl %esi,%eax
+ movl %ebp,52(%rsp)
+ movl %r13d,%ecx
+ xorl 0(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%r14d
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 60(%rsp),%edx
+ movl %r13d,%eax
+ movl %r14d,56(%rsp)
+ movl %r12d,%ecx
+ xorl 4(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%edx
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 0(%rsp),%ebp
+ movl %r12d,%eax
+ movl %edx,60(%rsp)
+ movl %r11d,%ecx
+ xorl 8(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 32(%rsp),%ebp
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ xorl 4(%rsp),%r14d
+ movl %r11d,%eax
+ movl %ebp,0(%rsp)
+ movl %edi,%ecx
+ xorl 12(%rsp),%r14d
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 36(%rsp),%r14d
+ leal -899497514(%rbp,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%r14d
+ xorl 8(%rsp),%edx
+ movl %edi,%eax
+ movl %r14d,4(%rsp)
+ movl %esi,%ecx
+ xorl 16(%rsp),%edx
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 40(%rsp),%edx
+ leal -899497514(%r14,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%edx
+ xorl 12(%rsp),%ebp
+ movl %esi,%eax
+ movl %edx,8(%rsp)
+ movl %r13d,%ecx
+ xorl 20(%rsp),%ebp
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 44(%rsp),%ebp
+ leal -899497514(%rdx,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%ebp
+ xorl 16(%rsp),%r14d
+ movl %r13d,%eax
+ movl %ebp,12(%rsp)
+ movl %r12d,%ecx
+ xorl 24(%rsp),%r14d
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 48(%rsp),%r14d
+ leal -899497514(%rbp,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%r14d
+ xorl 20(%rsp),%edx
+ movl %r12d,%eax
+ movl %r14d,16(%rsp)
+ movl %r11d,%ecx
+ xorl 28(%rsp),%edx
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 52(%rsp),%edx
+ leal -899497514(%r14,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%edx
+ xorl 24(%rsp),%ebp
+ movl %r11d,%eax
+ movl %edx,20(%rsp)
+ movl %edi,%ecx
+ xorl 32(%rsp),%ebp
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 56(%rsp),%ebp
+ leal -899497514(%rdx,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%ebp
+ xorl 28(%rsp),%r14d
+ movl %edi,%eax
+ movl %ebp,24(%rsp)
+ movl %esi,%ecx
+ xorl 36(%rsp),%r14d
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 60(%rsp),%r14d
+ leal -899497514(%rbp,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%r14d
+ xorl 32(%rsp),%edx
+ movl %esi,%eax
+ movl %r14d,28(%rsp)
+ movl %r13d,%ecx
+ xorl 40(%rsp),%edx
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 0(%rsp),%edx
+ leal -899497514(%r14,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%edx
+ xorl 36(%rsp),%ebp
+ movl %r13d,%eax
+
+ movl %r12d,%ecx
+ xorl 44(%rsp),%ebp
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 4(%rsp),%ebp
+ leal -899497514(%rdx,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%ebp
+ xorl 40(%rsp),%r14d
+ movl %r12d,%eax
+
+ movl %r11d,%ecx
+ xorl 48(%rsp),%r14d
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 8(%rsp),%r14d
+ leal -899497514(%rbp,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%r14d
+ xorl 44(%rsp),%edx
+ movl %r11d,%eax
+
+ movl %edi,%ecx
+ xorl 52(%rsp),%edx
+ xorl %r13d,%eax
+ roll $5,%ecx
+ xorl 12(%rsp),%edx
+ leal -899497514(%r14,%rsi,1),%esi
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ roll $1,%edx
+ xorl 48(%rsp),%ebp
+ movl %edi,%eax
+
+ movl %esi,%ecx
+ xorl 56(%rsp),%ebp
+ xorl %r12d,%eax
+ roll $5,%ecx
+ xorl 16(%rsp),%ebp
+ leal -899497514(%rdx,%r13,1),%r13d
+ xorl %r11d,%eax
+ addl %ecx,%r13d
+ roll $30,%edi
+ addl %eax,%r13d
+ roll $1,%ebp
+ xorl 52(%rsp),%r14d
+ movl %esi,%eax
+
+ movl %r13d,%ecx
+ xorl 60(%rsp),%r14d
+ xorl %r11d,%eax
+ roll $5,%ecx
+ xorl 20(%rsp),%r14d
+ leal -899497514(%rbp,%r12,1),%r12d
+ xorl %edi,%eax
+ addl %ecx,%r12d
+ roll $30,%esi
+ addl %eax,%r12d
+ roll $1,%r14d
+ xorl 56(%rsp),%edx
+ movl %r13d,%eax
+
+ movl %r12d,%ecx
+ xorl 0(%rsp),%edx
+ xorl %edi,%eax
+ roll $5,%ecx
+ xorl 24(%rsp),%edx
+ leal -899497514(%r14,%r11,1),%r11d
+ xorl %esi,%eax
+ addl %ecx,%r11d
+ roll $30,%r13d
+ addl %eax,%r11d
+ roll $1,%edx
+ xorl 60(%rsp),%ebp
+ movl %r12d,%eax
+
+ movl %r11d,%ecx
+ xorl 4(%rsp),%ebp
+ xorl %esi,%eax
+ roll $5,%ecx
+ xorl 28(%rsp),%ebp
+ leal -899497514(%rdx,%rdi,1),%edi
+ xorl %r13d,%eax
+ addl %ecx,%edi
+ roll $30,%r12d
+ addl %eax,%edi
+ roll $1,%ebp
+ movl %r11d,%eax
+ movl %edi,%ecx
+ xorl %r13d,%eax
+ leal -899497514(%rbp,%rsi,1),%esi
+ roll $5,%ecx
+ xorl %r12d,%eax
+ addl %ecx,%esi
+ roll $30,%r11d
+ addl %eax,%esi
+ addl 0(%r8),%esi
+ addl 4(%r8),%edi
+ addl 8(%r8),%r11d
+ addl 12(%r8),%r12d
+ addl 16(%r8),%r13d
+ movl %esi,0(%r8)
+ movl %edi,4(%r8)
+ movl %r11d,8(%r8)
+ movl %r12d,12(%r8)
+ movl %r13d,16(%r8)
+
+ subq $1,%r10
+ leaq 64(%r9),%r9
+ jnz .Lloop
+
+ movq 64(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_block_data_order,.-sha1_block_data_order
+.type sha1_block_data_order_shaext,@function
+.align 32
+sha1_block_data_order_shaext:
+_shaext_shortcut:
+.cfi_startproc
+ movdqu (%rdi),%xmm0
+ movd 16(%rdi),%xmm1
+ movdqa K_XX_XX+160(%rip),%xmm3
+
+ movdqu (%rsi),%xmm4
+ pshufd $27,%xmm0,%xmm0
+ movdqu 16(%rsi),%xmm5
+ pshufd $27,%xmm1,%xmm1
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,227
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,235
+.byte 102,15,56,0,243
+ movdqa %xmm1,%xmm9
+.byte 102,15,56,0,251
+ jmp .Loop_shaext
+
+.align 16
+.Loop_shaext:
+ decq %rdx
+ leaq 64(%rsi),%r8
+ paddd %xmm4,%xmm1
+ cmovneq %r8,%rsi
+ movdqa %xmm0,%xmm8
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,0
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,0
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,1
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,1
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+.byte 15,56,201,229
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,213
+ pxor %xmm6,%xmm4
+.byte 15,56,201,238
+.byte 15,56,202,231
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,2
+.byte 15,56,200,206
+ pxor %xmm7,%xmm5
+.byte 15,56,202,236
+.byte 15,56,201,247
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,2
+.byte 15,56,200,215
+ pxor %xmm4,%xmm6
+.byte 15,56,201,252
+.byte 15,56,202,245
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,204
+ pxor %xmm5,%xmm7
+.byte 15,56,202,254
+ movdqu (%rsi),%xmm4
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,213
+ movdqu 16(%rsi),%xmm5
+.byte 102,15,56,0,227
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 15,56,200,206
+ movdqu 32(%rsi),%xmm6
+.byte 102,15,56,0,235
+
+ movdqa %xmm0,%xmm2
+.byte 15,58,204,193,3
+.byte 15,56,200,215
+ movdqu 48(%rsi),%xmm7
+.byte 102,15,56,0,243
+
+ movdqa %xmm0,%xmm1
+.byte 15,58,204,194,3
+.byte 65,15,56,200,201
+.byte 102,15,56,0,251
+
+ paddd %xmm8,%xmm0
+ movdqa %xmm1,%xmm9
+
+ jnz .Loop_shaext
+
+ pshufd $27,%xmm0,%xmm0
+ pshufd $27,%xmm1,%xmm1
+ movdqu %xmm0,(%rdi)
+ movd %xmm1,16(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
+.type sha1_block_data_order_ssse3,@function
+.align 16
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+.cfi_startproc
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ leaq -64(%rsp),%rsp
+ andq $-64,%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+.byte 102,15,56,0,206
+.byte 102,15,56,0,214
+ addq $64,%r9
+ paddd %xmm9,%xmm0
+.byte 102,15,56,0,222
+ paddd %xmm9,%xmm1
+ paddd %xmm9,%xmm2
+ movdqa %xmm0,0(%rsp)
+ psubd %xmm9,%xmm0
+ movdqa %xmm1,16(%rsp)
+ psubd %xmm9,%xmm1
+ movdqa %xmm2,32(%rsp)
+ psubd %xmm9,%xmm2
+ jmp .Loop_ssse3
+.align 16
+.Loop_ssse3:
+ rorl $2,%ebx
+ pshufd $238,%xmm0,%xmm4
+ xorl %edx,%esi
+ movdqa %xmm3,%xmm8
+ paddd %xmm3,%xmm9
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ punpcklqdq %xmm1,%xmm4
+ xorl %ecx,%ebx
+ roll $5,%eax
+ addl %esi,%ebp
+ psrldq $4,%xmm8
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ pxor %xmm0,%xmm4
+ addl %eax,%ebp
+ rorl $7,%eax
+ pxor %xmm2,%xmm8
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ pxor %xmm8,%xmm4
+ xorl %ebx,%eax
+ roll $5,%ebp
+ movdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ andl %eax,%esi
+ movdqa %xmm4,%xmm10
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ movdqa %xmm4,%xmm8
+ xorl %ebx,%esi
+ pslldq $12,%xmm10
+ paddd %xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ psrld $31,%xmm8
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ movdqa %xmm10,%xmm9
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ psrld $30,%xmm10
+ addl %edx,%ecx
+ rorl $7,%edx
+ por %xmm8,%xmm4
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm4
+ xorl %ebp,%edx
+ movdqa -64(%r14),%xmm10
+ roll $5,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ pxor %xmm9,%xmm4
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pshufd $238,%xmm1,%xmm5
+ xorl %ebp,%esi
+ movdqa %xmm4,%xmm9
+ paddd %xmm4,%xmm10
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ punpcklqdq %xmm2,%xmm5
+ xorl %edx,%ecx
+ roll $5,%ebx
+ addl %esi,%eax
+ psrldq $4,%xmm9
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ pxor %xmm1,%xmm5
+ addl %ebx,%eax
+ rorl $7,%ebx
+ pxor %xmm3,%xmm9
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ pxor %xmm9,%xmm5
+ xorl %ecx,%ebx
+ roll $5,%eax
+ movdqa %xmm10,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ movdqa %xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ movdqa %xmm5,%xmm9
+ xorl %ecx,%esi
+ pslldq $12,%xmm8
+ paddd %xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ psrld $31,%xmm9
+ xorl %ebx,%eax
+ roll $5,%ebp
+ addl %esi,%edx
+ movdqa %xmm8,%xmm10
+ andl %eax,%edi
+ xorl %ebx,%eax
+ psrld $30,%xmm8
+ addl %ebp,%edx
+ rorl $7,%ebp
+ por %xmm9,%xmm5
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ pslld $2,%xmm10
+ pxor %xmm8,%xmm5
+ xorl %eax,%ebp
+ movdqa -32(%r14),%xmm8
+ roll $5,%edx
+ addl %edi,%ecx
+ andl %ebp,%esi
+ pxor %xmm10,%xmm5
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ rorl $7,%edx
+ pshufd $238,%xmm2,%xmm6
+ xorl %eax,%esi
+ movdqa %xmm5,%xmm10
+ paddd %xmm5,%xmm8
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ punpcklqdq %xmm3,%xmm6
+ xorl %ebp,%edx
+ roll $5,%ecx
+ addl %esi,%ebx
+ psrldq $4,%xmm10
+ andl %edx,%edi
+ xorl %ebp,%edx
+ pxor %xmm2,%xmm6
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ pxor %xmm4,%xmm10
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ pxor %xmm10,%xmm6
+ xorl %edx,%ecx
+ roll $5,%ebx
+ movdqa %xmm8,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ movdqa %xmm6,%xmm9
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ rorl $7,%ebx
+ movdqa %xmm6,%xmm10
+ xorl %edx,%esi
+ pslldq $12,%xmm9
+ paddd %xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ psrld $31,%xmm10
+ xorl %ecx,%ebx
+ roll $5,%eax
+ addl %esi,%ebp
+ movdqa %xmm9,%xmm8
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ psrld $30,%xmm9
+ addl %eax,%ebp
+ rorl $7,%eax
+ por %xmm10,%xmm6
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ pslld $2,%xmm8
+ pxor %xmm9,%xmm6
+ xorl %ebx,%eax
+ movdqa -32(%r14),%xmm9
+ roll $5,%ebp
+ addl %edi,%edx
+ andl %eax,%esi
+ pxor %xmm8,%xmm6
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ rorl $7,%ebp
+ pshufd $238,%xmm3,%xmm7
+ xorl %ebx,%esi
+ movdqa %xmm6,%xmm8
+ paddd %xmm6,%xmm9
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ punpcklqdq %xmm4,%xmm7
+ xorl %eax,%ebp
+ roll $5,%edx
+ addl %esi,%ecx
+ psrldq $4,%xmm8
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ pxor %xmm3,%xmm7
+ addl %edx,%ecx
+ rorl $7,%edx
+ pxor %xmm5,%xmm8
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ pxor %xmm8,%xmm7
+ xorl %ebp,%edx
+ roll $5,%ecx
+ movdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ movdqa %xmm7,%xmm10
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ rorl $7,%ecx
+ movdqa %xmm7,%xmm8
+ xorl %ebp,%esi
+ pslldq $12,%xmm10
+ paddd %xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ psrld $31,%xmm8
+ xorl %edx,%ecx
+ roll $5,%ebx
+ addl %esi,%eax
+ movdqa %xmm10,%xmm9
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ psrld $30,%xmm10
+ addl %ebx,%eax
+ rorl $7,%ebx
+ por %xmm8,%xmm7
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ pslld $2,%xmm9
+ pxor %xmm10,%xmm7
+ xorl %ecx,%ebx
+ movdqa -32(%r14),%xmm10
+ roll $5,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ pxor %xmm9,%xmm7
+ pshufd $238,%xmm6,%xmm9
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ rorl $7,%eax
+ pxor %xmm4,%xmm0
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ punpcklqdq %xmm7,%xmm9
+ xorl %ebx,%eax
+ roll $5,%ebp
+ pxor %xmm1,%xmm0
+ addl %esi,%edx
+ andl %eax,%edi
+ movdqa %xmm10,%xmm8
+ xorl %ebx,%eax
+ paddd %xmm7,%xmm10
+ addl %ebp,%edx
+ pxor %xmm9,%xmm0
+ rorl $7,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ movdqa %xmm0,%xmm9
+ xorl %eax,%ebp
+ roll $5,%edx
+ movdqa %xmm10,48(%rsp)
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ pslld $2,%xmm0
+ addl %edx,%ecx
+ rorl $7,%edx
+ psrld $30,%xmm9
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ por %xmm9,%xmm0
+ xorl %ebp,%edx
+ roll $5,%ecx
+ pshufd $238,%xmm7,%xmm10
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ pxor %xmm5,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm0,%xmm10
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm2,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ movdqa %xmm8,%xmm9
+ rorl $7,%ebx
+ paddd %xmm0,%xmm8
+ addl %eax,%ebp
+ pxor %xmm10,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm1,%xmm10
+ addl %edi,%edx
+ xorl %ebx,%esi
+ movdqa %xmm8,0(%rsp)
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 24(%rsp),%ecx
+ pslld $2,%xmm1
+ xorl %eax,%esi
+ movl %edx,%edi
+ psrld $30,%xmm10
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ por %xmm10,%xmm1
+ addl %edx,%ecx
+ addl 28(%rsp),%ebx
+ pshufd $238,%xmm0,%xmm8
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ pxor %xmm6,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ punpcklqdq %xmm1,%xmm8
+ movl %ebx,%edi
+ roll $5,%ebx
+ pxor %xmm3,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ movdqa 0(%r14),%xmm10
+ rorl $7,%ecx
+ paddd %xmm1,%xmm9
+ addl %ebx,%eax
+ pxor %xmm8,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ movdqa %xmm2,%xmm8
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ movdqa %xmm9,16(%rsp)
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 40(%rsp),%edx
+ pslld $2,%xmm2
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ psrld $30,%xmm8
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ por %xmm8,%xmm2
+ addl %ebp,%edx
+ addl 44(%rsp),%ecx
+ pshufd $238,%xmm1,%xmm9
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ pxor %xmm7,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ punpcklqdq %xmm2,%xmm9
+ movl %ecx,%edi
+ roll $5,%ecx
+ pxor %xmm4,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ movdqa %xmm10,%xmm8
+ rorl $7,%edx
+ paddd %xmm2,%xmm10
+ addl %ecx,%ebx
+ pxor %xmm9,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ movdqa %xmm3,%xmm9
+ addl %edi,%eax
+ xorl %edx,%esi
+ movdqa %xmm10,32(%rsp)
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 56(%rsp),%ebp
+ pslld $2,%xmm3
+ xorl %ecx,%esi
+ movl %eax,%edi
+ psrld $30,%xmm9
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ por %xmm9,%xmm3
+ addl %eax,%ebp
+ addl 60(%rsp),%edx
+ pshufd $238,%xmm2,%xmm10
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ pxor %xmm0,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ punpcklqdq %xmm3,%xmm10
+ movl %edx,%edi
+ roll $5,%edx
+ pxor %xmm5,%xmm4
+ addl %esi,%ecx
+ xorl %eax,%edi
+ movdqa %xmm8,%xmm9
+ rorl $7,%ebp
+ paddd %xmm3,%xmm8
+ addl %edx,%ecx
+ pxor %xmm10,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ movdqa %xmm4,%xmm10
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ movdqa %xmm8,48(%rsp)
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 8(%rsp),%eax
+ pslld $2,%xmm4
+ xorl %edx,%esi
+ movl %ebx,%edi
+ psrld $30,%xmm10
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ por %xmm10,%xmm4
+ addl %ebx,%eax
+ addl 12(%rsp),%ebp
+ pshufd $238,%xmm3,%xmm8
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ pxor %xmm1,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ punpcklqdq %xmm4,%xmm8
+ movl %ebp,%edi
+ roll $5,%ebp
+ pxor %xmm6,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ movdqa %xmm9,%xmm10
+ rorl $7,%eax
+ paddd %xmm4,%xmm9
+ addl %ebp,%edx
+ pxor %xmm8,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ movdqa %xmm5,%xmm8
+ addl %edi,%ecx
+ xorl %eax,%esi
+ movdqa %xmm9,0(%rsp)
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 24(%rsp),%ebx
+ pslld $2,%xmm5
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ psrld $30,%xmm8
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ por %xmm8,%xmm5
+ addl %ecx,%ebx
+ addl 28(%rsp),%eax
+ pshufd $238,%xmm4,%xmm9
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ pxor %xmm2,%xmm6
+ addl 32(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ punpcklqdq %xmm5,%xmm9
+ movl %eax,%edi
+ xorl %ecx,%esi
+ pxor %xmm7,%xmm6
+ roll $5,%eax
+ addl %esi,%ebp
+ movdqa %xmm10,%xmm8
+ xorl %ebx,%edi
+ paddd %xmm5,%xmm10
+ xorl %ecx,%ebx
+ pxor %xmm9,%xmm6
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movdqa %xmm6,%xmm9
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ movdqa %xmm10,16(%rsp)
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ pslld $2,%xmm6
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ psrld $30,%xmm9
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ por %xmm9,%xmm6
+ rorl $7,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ roll $5,%edx
+ pshufd $238,%xmm5,%xmm10
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ pxor %xmm3,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ punpcklqdq %xmm6,%xmm10
+ movl %ebx,%edi
+ xorl %edx,%esi
+ pxor %xmm0,%xmm7
+ roll $5,%ebx
+ addl %esi,%eax
+ movdqa 32(%r14),%xmm9
+ xorl %ecx,%edi
+ paddd %xmm6,%xmm8
+ xorl %edx,%ecx
+ pxor %xmm10,%xmm7
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movdqa %xmm7,%xmm10
+ movl %eax,%esi
+ xorl %ecx,%edi
+ movdqa %xmm8,32(%rsp)
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ pslld $2,%xmm7
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ psrld $30,%xmm10
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ por %xmm10,%xmm7
+ rorl $7,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ roll $5,%ebp
+ pshufd $238,%xmm6,%xmm8
+ addl %esi,%edx
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ pxor %xmm4,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ punpcklqdq %xmm7,%xmm8
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ pxor %xmm1,%xmm0
+ roll $5,%ecx
+ addl %esi,%ebx
+ movdqa %xmm9,%xmm10
+ xorl %edx,%edi
+ paddd %xmm7,%xmm9
+ xorl %ebp,%edx
+ pxor %xmm8,%xmm0
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ movdqa %xmm0,%xmm8
+ movl %ebx,%esi
+ xorl %edx,%edi
+ movdqa %xmm9,48(%rsp)
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ pslld $2,%xmm0
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ psrld $30,%xmm8
+ addl 8(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ por %xmm8,%xmm0
+ rorl $7,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ roll $5,%eax
+ pshufd $238,%xmm7,%xmm9
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ pxor %xmm5,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ punpcklqdq %xmm0,%xmm9
+ movl %edx,%edi
+ xorl %eax,%esi
+ pxor %xmm2,%xmm1
+ roll $5,%edx
+ addl %esi,%ecx
+ movdqa %xmm10,%xmm8
+ xorl %ebp,%edi
+ paddd %xmm0,%xmm10
+ xorl %eax,%ebp
+ pxor %xmm9,%xmm1
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ rorl $7,%edx
+ movdqa %xmm1,%xmm9
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ movdqa %xmm10,0(%rsp)
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ pslld $2,%xmm1
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ psrld $30,%xmm9
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ por %xmm9,%xmm1
+ rorl $7,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ roll $5,%ebx
+ pshufd $238,%xmm0,%xmm10
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ rorl $7,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ pxor %xmm6,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ rorl $7,%eax
+ punpcklqdq %xmm1,%xmm10
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ pxor %xmm3,%xmm2
+ roll $5,%ebp
+ addl %esi,%edx
+ movdqa %xmm8,%xmm9
+ xorl %eax,%edi
+ paddd %xmm1,%xmm8
+ xorl %ebx,%eax
+ pxor %xmm10,%xmm2
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ rorl $7,%ebp
+ movdqa %xmm2,%xmm10
+ movl %edx,%esi
+ xorl %eax,%edi
+ movdqa %xmm8,16(%rsp)
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ pslld $2,%xmm2
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ psrld $30,%xmm10
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ por %xmm10,%xmm2
+ rorl $7,%edx
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ roll $5,%ecx
+ pshufd $238,%xmm1,%xmm8
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ rorl $7,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ pxor %xmm7,%xmm3
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ punpcklqdq %xmm2,%xmm8
+ movl %eax,%edi
+ roll $5,%eax
+ pxor %xmm4,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ movdqa %xmm9,%xmm10
+ rorl $7,%ebx
+ paddd %xmm2,%xmm9
+ addl %eax,%ebp
+ pxor %xmm8,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ movdqa %xmm3,%xmm8
+ addl %edi,%edx
+ xorl %ebx,%esi
+ movdqa %xmm9,32(%rsp)
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 56(%rsp),%ecx
+ pslld $2,%xmm3
+ xorl %eax,%esi
+ movl %edx,%edi
+ psrld $30,%xmm8
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ por %xmm8,%xmm3
+ addl %edx,%ecx
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ paddd %xmm3,%xmm10
+ addl %esi,%eax
+ xorl %edx,%edi
+ movdqa %xmm10,48(%rsp)
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ cmpq %r10,%r9
+ je .Ldone_ssse3
+ movdqa 64(%r14),%xmm6
+ movdqa -64(%r14),%xmm9
+ movdqu 0(%r9),%xmm0
+ movdqu 16(%r9),%xmm1
+ movdqu 32(%r9),%xmm2
+ movdqu 48(%r9),%xmm3
+.byte 102,15,56,0,198
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+.byte 102,15,56,0,206
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ paddd %xmm9,%xmm0
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ movdqa %xmm0,0(%rsp)
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ psubd %xmm9,%xmm0
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+.byte 102,15,56,0,214
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ paddd %xmm9,%xmm1
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ movdqa %xmm1,16(%rsp)
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ psubd %xmm9,%xmm1
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+.byte 102,15,56,0,222
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ paddd %xmm9,%xmm2
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ movdqa %xmm2,32(%rsp)
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ psubd %xmm9,%xmm2
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %ecx,%edi
+ movl %edx,12(%r8)
+ xorl %edx,%edi
+ movl %ebp,16(%r8)
+ andl %edi,%esi
+ jmp .Loop_ssse3
+
+.align 16
+.Ldone_ssse3:
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ roll $5,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ roll $5,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ roll $5,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ roll $5,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ roll $5,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ roll $5,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ rorl $7,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ roll $5,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ rorl $7,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ roll $5,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ rorl $7,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ roll $5,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ rorl $7,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ roll $5,%ebx
+ addl %edi,%eax
+ rorl $7,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ movq -40(%r11),%r14
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_ssse3:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+.type sha1_block_data_order_avx,@function
+.align 16
+sha1_block_data_order_avx:
+_avx_shortcut:
+.cfi_startproc
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ leaq -64(%rsp),%rsp
+ vzeroupper
+ andq $-64,%rsp
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ shlq $6,%r10
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ movl 4(%r8),%ebx
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl %ebx,%esi
+ movl 16(%r8),%ebp
+ movl %ecx,%edi
+ xorl %edx,%edi
+ andl %edi,%esi
+
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ vpshufb %xmm6,%xmm1,%xmm1
+ vpshufb %xmm6,%xmm2,%xmm2
+ vpshufb %xmm6,%xmm3,%xmm3
+ vpaddd %xmm11,%xmm0,%xmm4
+ vpaddd %xmm11,%xmm1,%xmm5
+ vpaddd %xmm11,%xmm2,%xmm6
+ vmovdqa %xmm4,0(%rsp)
+ vmovdqa %xmm5,16(%rsp)
+ vmovdqa %xmm6,32(%rsp)
+ jmp .Loop_avx
+.align 16
+.Loop_avx:
+ shrdl $2,%ebx,%ebx
+ xorl %edx,%esi
+ vpalignr $8,%xmm0,%xmm1,%xmm4
+ movl %eax,%edi
+ addl 0(%rsp),%ebp
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrldq $4,%xmm3,%xmm8
+ addl %esi,%ebp
+ andl %ebx,%edi
+ vpxor %xmm0,%xmm4,%xmm4
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm2,%xmm8,%xmm8
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 4(%rsp),%edx
+ vpxor %xmm8,%xmm4,%xmm4
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%edx
+ andl %eax,%esi
+ vpsrld $31,%xmm4,%xmm8
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpslldq $12,%xmm4,%xmm10
+ vpaddd %xmm4,%xmm4,%xmm4
+ movl %edx,%edi
+ addl 8(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm4,%xmm4
+ addl %esi,%ecx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm4,%xmm4
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 12(%rsp),%ebx
+ vpxor %xmm10,%xmm4,%xmm4
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ andl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpalignr $8,%xmm1,%xmm2,%xmm5
+ movl %ebx,%edi
+ addl 16(%rsp),%eax
+ vpaddd %xmm4,%xmm11,%xmm9
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrldq $4,%xmm4,%xmm8
+ addl %esi,%eax
+ andl %ecx,%edi
+ vpxor %xmm1,%xmm5,%xmm5
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm3,%xmm8,%xmm8
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 20(%rsp),%ebp
+ vpxor %xmm8,%xmm5,%xmm5
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ebp
+ andl %ebx,%esi
+ vpsrld $31,%xmm5,%xmm8
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ vpslldq $12,%xmm5,%xmm10
+ vpaddd %xmm5,%xmm5,%xmm5
+ movl %ebp,%edi
+ addl 24(%rsp),%edx
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm5,%xmm5
+ addl %esi,%edx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm5,%xmm5
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ movl %edx,%esi
+ addl 28(%rsp),%ecx
+ vpxor %xmm10,%xmm5,%xmm5
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vmovdqa -32(%r14),%xmm11
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ vpalignr $8,%xmm2,%xmm3,%xmm6
+ movl %ecx,%edi
+ addl 32(%rsp),%ebx
+ vpaddd %xmm5,%xmm11,%xmm9
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vpsrldq $4,%xmm5,%xmm8
+ addl %esi,%ebx
+ andl %edx,%edi
+ vpxor %xmm2,%xmm6,%xmm6
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpxor %xmm4,%xmm8,%xmm8
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ addl 36(%rsp),%eax
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%eax
+ andl %ecx,%esi
+ vpsrld $31,%xmm6,%xmm8
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%esi
+ vpslldq $12,%xmm6,%xmm10
+ vpaddd %xmm6,%xmm6,%xmm6
+ movl %eax,%edi
+ addl 40(%rsp),%ebp
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm6,%xmm6
+ addl %esi,%ebp
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm6,%xmm6
+ shrdl $7,%eax,%eax
+ xorl %ecx,%edi
+ movl %ebp,%esi
+ addl 44(%rsp),%edx
+ vpxor %xmm10,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%esi
+ vpalignr $8,%xmm3,%xmm4,%xmm7
+ movl %edx,%edi
+ addl 48(%rsp),%ecx
+ vpaddd %xmm6,%xmm11,%xmm9
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpsrldq $4,%xmm6,%xmm8
+ addl %esi,%ecx
+ andl %ebp,%edi
+ vpxor %xmm3,%xmm7,%xmm7
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpxor %xmm5,%xmm8,%xmm8
+ shrdl $7,%edx,%edx
+ xorl %eax,%edi
+ movl %ecx,%esi
+ addl 52(%rsp),%ebx
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%ebx
+ andl %edx,%esi
+ vpsrld $31,%xmm7,%xmm8
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ shrdl $7,%ecx,%ecx
+ xorl %ebp,%esi
+ vpslldq $12,%xmm7,%xmm10
+ vpaddd %xmm7,%xmm7,%xmm7
+ movl %ebx,%edi
+ addl 56(%rsp),%eax
+ xorl %edx,%ecx
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm10,%xmm9
+ vpor %xmm8,%xmm7,%xmm7
+ addl %esi,%eax
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm10,%xmm10
+ vpxor %xmm9,%xmm7,%xmm7
+ shrdl $7,%ebx,%ebx
+ xorl %edx,%edi
+ movl %eax,%esi
+ addl 60(%rsp),%ebp
+ vpxor %xmm10,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ shrdl $7,%eax,%eax
+ xorl %ecx,%esi
+ movl %ebp,%edi
+ addl 0(%rsp),%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ xorl %ebx,%eax
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm7,%xmm11,%xmm9
+ addl %esi,%edx
+ andl %eax,%edi
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ shrdl $7,%ebp,%ebp
+ xorl %ebx,%edi
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ movl %edx,%esi
+ addl 4(%rsp),%ecx
+ xorl %eax,%ebp
+ shldl $5,%edx,%edx
+ vpslld $2,%xmm0,%xmm0
+ addl %edi,%ecx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ shrdl $7,%edx,%edx
+ xorl %eax,%esi
+ movl %ecx,%edi
+ addl 8(%rsp),%ebx
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %ebp,%edx
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ andl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 12(%rsp),%eax
+ xorl %ebp,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm2,%xmm1,%xmm1
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm1,%xmm1
+ addl 20(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm1,%xmm1
+ addl 24(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm1,%xmm1
+ addl 28(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ vpxor %xmm3,%xmm2,%xmm2
+ addl %esi,%eax
+ xorl %edx,%edi
+ vpaddd %xmm1,%xmm11,%xmm9
+ vmovdqa 0(%r14),%xmm11
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpxor %xmm8,%xmm2,%xmm2
+ addl 36(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpslld $2,%xmm2,%xmm2
+ addl 40(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpor %xmm8,%xmm2,%xmm2
+ addl 44(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ vpxor %xmm0,%xmm4,%xmm4
+ addl 0(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpxor %xmm5,%xmm4,%xmm4
+ addl %esi,%ecx
+ xorl %eax,%edi
+ vpaddd %xmm3,%xmm11,%xmm9
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpxor %xmm8,%xmm4,%xmm4
+ addl 4(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ vpsrld $30,%xmm4,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpslld $2,%xmm4,%xmm4
+ addl 8(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vpor %xmm8,%xmm4,%xmm4
+ addl 12(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ vpxor %xmm1,%xmm5,%xmm5
+ addl 16(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpxor %xmm6,%xmm5,%xmm5
+ addl %esi,%edx
+ xorl %ebx,%edi
+ vpaddd %xmm4,%xmm11,%xmm9
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpxor %xmm8,%xmm5,%xmm5
+ addl 20(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ vpsrld $30,%xmm5,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpslld $2,%xmm5,%xmm5
+ addl 24(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vpor %xmm8,%xmm5,%xmm5
+ addl 28(%rsp),%eax
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ vpxor %xmm2,%xmm6,%xmm6
+ addl 32(%rsp),%ebp
+ andl %ecx,%esi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %eax,%edi
+ xorl %ecx,%esi
+ vpaddd %xmm5,%xmm11,%xmm9
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ vpxor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 36(%rsp),%edx
+ vpsrld $30,%xmm6,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ vpslld $2,%xmm6,%xmm6
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 40(%rsp),%ecx
+ andl %eax,%esi
+ vpor %xmm8,%xmm6,%xmm6
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%edi
+ xorl %eax,%esi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 44(%rsp),%ebx
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ vpxor %xmm3,%xmm7,%xmm7
+ addl 48(%rsp),%eax
+ andl %edx,%esi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ vpxor %xmm0,%xmm7,%xmm7
+ movl %ebx,%edi
+ xorl %edx,%esi
+ vpaddd %xmm6,%xmm11,%xmm9
+ vmovdqa 32(%r14),%xmm11
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vpxor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 52(%rsp),%ebp
+ vpsrld $30,%xmm7,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ vpslld $2,%xmm7,%xmm7
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 56(%rsp),%edx
+ andl %ebx,%esi
+ vpor %xmm8,%xmm7,%xmm7
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 60(%rsp),%ecx
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ vpxor %xmm4,%xmm0,%xmm0
+ addl 0(%rsp),%ebx
+ andl %ebp,%esi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ vpxor %xmm1,%xmm0,%xmm0
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ vpaddd %xmm7,%xmm11,%xmm9
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ vpxor %xmm8,%xmm0,%xmm0
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 4(%rsp),%eax
+ vpsrld $30,%xmm0,%xmm8
+ vmovdqa %xmm9,48(%rsp)
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ vpslld $2,%xmm0,%xmm0
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %ecx,%esi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 8(%rsp),%ebp
+ andl %ecx,%esi
+ vpor %xmm8,%xmm0,%xmm0
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%edi
+ xorl %ecx,%esi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ebx,%edi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ addl 12(%rsp),%edx
+ andl %ebx,%edi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ movl %ebp,%esi
+ xorl %ebx,%edi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %eax,%esi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ vpxor %xmm5,%xmm1,%xmm1
+ addl 16(%rsp),%ecx
+ andl %eax,%esi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ vpxor %xmm2,%xmm1,%xmm1
+ movl %edx,%edi
+ xorl %eax,%esi
+ vpaddd %xmm0,%xmm11,%xmm9
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ vpxor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 20(%rsp),%ebx
+ vpsrld $30,%xmm1,%xmm8
+ vmovdqa %xmm9,0(%rsp)
+ andl %ebp,%edi
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%esi
+ vpslld $2,%xmm1,%xmm1
+ xorl %ebp,%edi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %edx,%esi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 24(%rsp),%eax
+ andl %edx,%esi
+ vpor %xmm8,%xmm1,%xmm1
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%edi
+ xorl %edx,%esi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %ecx,%edi
+ xorl %edx,%ecx
+ addl %ebx,%eax
+ addl 28(%rsp),%ebp
+ andl %ecx,%edi
+ xorl %edx,%ecx
+ shrdl $7,%ebx,%ebx
+ movl %eax,%esi
+ xorl %ecx,%edi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ebx,%esi
+ xorl %ecx,%ebx
+ addl %eax,%ebp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ vpxor %xmm6,%xmm2,%xmm2
+ addl 32(%rsp),%edx
+ andl %ebx,%esi
+ xorl %ecx,%ebx
+ shrdl $7,%eax,%eax
+ vpxor %xmm3,%xmm2,%xmm2
+ movl %ebp,%edi
+ xorl %ebx,%esi
+ vpaddd %xmm1,%xmm11,%xmm9
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ vpxor %xmm8,%xmm2,%xmm2
+ xorl %eax,%edi
+ xorl %ebx,%eax
+ addl %ebp,%edx
+ addl 36(%rsp),%ecx
+ vpsrld $30,%xmm2,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ andl %eax,%edi
+ xorl %ebx,%eax
+ shrdl $7,%ebp,%ebp
+ movl %edx,%esi
+ vpslld $2,%xmm2,%xmm2
+ xorl %eax,%edi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %ebp,%esi
+ xorl %eax,%ebp
+ addl %edx,%ecx
+ addl 40(%rsp),%ebx
+ andl %ebp,%esi
+ vpor %xmm8,%xmm2,%xmm2
+ xorl %eax,%ebp
+ shrdl $7,%edx,%edx
+ movl %ecx,%edi
+ xorl %ebp,%esi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %edx,%edi
+ xorl %ebp,%edx
+ addl %ecx,%ebx
+ addl 44(%rsp),%eax
+ andl %edx,%edi
+ xorl %ebp,%edx
+ shrdl $7,%ecx,%ecx
+ movl %ebx,%esi
+ xorl %edx,%edi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ addl %ebx,%eax
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ vpxor %xmm7,%xmm3,%xmm3
+ addl 48(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ vpxor %xmm4,%xmm3,%xmm3
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ vpaddd %xmm2,%xmm11,%xmm9
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ vpxor %xmm8,%xmm3,%xmm3
+ addl 52(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ vpsrld $30,%xmm3,%xmm8
+ vmovdqa %xmm9,32(%rsp)
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vpslld $2,%xmm3,%xmm3
+ addl 56(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vpor %xmm8,%xmm3,%xmm3
+ addl 60(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 0(%rsp),%eax
+ vpaddd %xmm3,%xmm11,%xmm9
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ vmovdqa %xmm9,48(%rsp)
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 4(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 8(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 12(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ cmpq %r10,%r9
+ je .Ldone_avx
+ vmovdqa 64(%r14),%xmm6
+ vmovdqa -64(%r14),%xmm11
+ vmovdqu 0(%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ vpshufb %xmm6,%xmm0,%xmm0
+ addq $64,%r9
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ vpshufb %xmm6,%xmm1,%xmm1
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ vpaddd %xmm11,%xmm0,%xmm4
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ vmovdqa %xmm4,0(%rsp)
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ vpshufb %xmm6,%xmm2,%xmm2
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ vpaddd %xmm11,%xmm1,%xmm5
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ vmovdqa %xmm5,16(%rsp)
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ vpshufb %xmm6,%xmm3,%xmm3
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ vpaddd %xmm11,%xmm2,%xmm6
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ vmovdqa %xmm6,32(%rsp)
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ addl 12(%r8),%edx
+ movl %eax,0(%r8)
+ addl 16(%r8),%ebp
+ movl %esi,4(%r8)
+ movl %esi,%ebx
+ movl %ecx,8(%r8)
+ movl %ecx,%edi
+ movl %edx,12(%r8)
+ xorl %edx,%edi
+ movl %ebp,16(%r8)
+ andl %edi,%esi
+ jmp .Loop_avx
+
+.align 16
+.Ldone_avx:
+ addl 16(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 20(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ xorl %edx,%esi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 24(%rsp),%ebp
+ xorl %ecx,%esi
+ movl %eax,%edi
+ shldl $5,%eax,%eax
+ addl %esi,%ebp
+ xorl %ecx,%edi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 28(%rsp),%edx
+ xorl %ebx,%edi
+ movl %ebp,%esi
+ shldl $5,%ebp,%ebp
+ addl %edi,%edx
+ xorl %ebx,%esi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 32(%rsp),%ecx
+ xorl %eax,%esi
+ movl %edx,%edi
+ shldl $5,%edx,%edx
+ addl %esi,%ecx
+ xorl %eax,%edi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 36(%rsp),%ebx
+ xorl %ebp,%edi
+ movl %ecx,%esi
+ shldl $5,%ecx,%ecx
+ addl %edi,%ebx
+ xorl %ebp,%esi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 40(%rsp),%eax
+ xorl %edx,%esi
+ movl %ebx,%edi
+ shldl $5,%ebx,%ebx
+ addl %esi,%eax
+ xorl %edx,%edi
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ addl 44(%rsp),%ebp
+ xorl %ecx,%edi
+ movl %eax,%esi
+ shldl $5,%eax,%eax
+ addl %edi,%ebp
+ xorl %ecx,%esi
+ shrdl $7,%ebx,%ebx
+ addl %eax,%ebp
+ addl 48(%rsp),%edx
+ xorl %ebx,%esi
+ movl %ebp,%edi
+ shldl $5,%ebp,%ebp
+ addl %esi,%edx
+ xorl %ebx,%edi
+ shrdl $7,%eax,%eax
+ addl %ebp,%edx
+ addl 52(%rsp),%ecx
+ xorl %eax,%edi
+ movl %edx,%esi
+ shldl $5,%edx,%edx
+ addl %edi,%ecx
+ xorl %eax,%esi
+ shrdl $7,%ebp,%ebp
+ addl %edx,%ecx
+ addl 56(%rsp),%ebx
+ xorl %ebp,%esi
+ movl %ecx,%edi
+ shldl $5,%ecx,%ecx
+ addl %esi,%ebx
+ xorl %ebp,%edi
+ shrdl $7,%edx,%edx
+ addl %ecx,%ebx
+ addl 60(%rsp),%eax
+ xorl %edx,%edi
+ movl %ebx,%esi
+ shldl $5,%ebx,%ebx
+ addl %edi,%eax
+ shrdl $7,%ecx,%ecx
+ addl %ebx,%eax
+ vzeroupper
+
+ addl 0(%r8),%eax
+ addl 4(%r8),%esi
+ addl 8(%r8),%ecx
+ movl %eax,0(%r8)
+ addl 12(%r8),%edx
+ movl %esi,4(%r8)
+ addl 16(%r8),%ebp
+ movl %ecx,8(%r8)
+ movl %edx,12(%r8)
+ movl %ebp,16(%r8)
+ movq -40(%r11),%r14
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
+.type sha1_block_data_order_avx2,@function
+.align 16
+sha1_block_data_order_avx2:
+_avx2_shortcut:
+.cfi_startproc
+ movq %rsp,%r11
+.cfi_def_cfa_register %r11
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ vzeroupper
+ movq %rdi,%r8
+ movq %rsi,%r9
+ movq %rdx,%r10
+
+ leaq -640(%rsp),%rsp
+ shlq $6,%r10
+ leaq 64(%r9),%r13
+ andq $-128,%rsp
+ addq %r9,%r10
+ leaq K_XX_XX+64(%rip),%r14
+
+ movl 0(%r8),%eax
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+ movl 4(%r8),%ebp
+ movl 8(%r8),%ecx
+ movl 12(%r8),%edx
+ movl 16(%r8),%esi
+ vmovdqu 64(%r14),%ymm6
+
+ vmovdqu (%r9),%xmm0
+ vmovdqu 16(%r9),%xmm1
+ vmovdqu 32(%r9),%xmm2
+ vmovdqu 48(%r9),%xmm3
+ leaq 64(%r9),%r9
+ vinserti128 $1,(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vpshufb %ymm6,%ymm0,%ymm0
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vpshufb %ymm6,%ymm1,%ymm1
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ vpshufb %ymm6,%ymm2,%ymm2
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm3,%ymm3
+
+ vpaddd %ymm11,%ymm0,%ymm4
+ vpaddd %ymm11,%ymm1,%ymm5
+ vmovdqu %ymm4,0(%rsp)
+ vpaddd %ymm11,%ymm2,%ymm6
+ vmovdqu %ymm5,32(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ vmovdqu %ymm6,64(%rsp)
+ vmovdqu %ymm7,96(%rsp)
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ vpsrldq $4,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpsrld $31,%ymm4,%ymm8
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ vpxor %ymm10,%ymm4,%ymm4
+ vpaddd %ymm11,%ymm4,%ymm9
+ vmovdqu %ymm9,128(%rsp)
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ vpsrldq $4,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ vpxor %ymm8,%ymm5,%ymm5
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ vpxor %ymm10,%ymm5,%ymm5
+ vpaddd %ymm11,%ymm5,%ymm9
+ vmovdqu %ymm9,160(%rsp)
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ vpsrldq $4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ vpxor %ymm8,%ymm6,%ymm6
+ vpsrld $31,%ymm6,%ymm8
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ vpxor %ymm10,%ymm6,%ymm6
+ vpaddd %ymm11,%ymm6,%ymm9
+ vmovdqu %ymm9,192(%rsp)
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ vpsrldq $4,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpsrld $31,%ymm7,%ymm8
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ vpxor %ymm10,%ymm7,%ymm7
+ vpaddd %ymm11,%ymm7,%ymm9
+ vmovdqu %ymm9,224(%rsp)
+ leaq 128(%rsp),%r13
+ jmp .Loop_avx2
+.align 32
+.Loop_avx2:
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ jmp .Lalign32_1
+.align 32
+.Lalign32_1:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpxor %ymm1,%ymm0,%ymm0
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vpor %ymm8,%ymm0,%ymm0
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ vmovdqu %ymm9,256(%rsp)
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpxor %ymm2,%ymm1,%ymm1
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vpor %ymm8,%ymm1,%ymm1
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ vmovdqu %ymm9,288(%rsp)
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ vpxor %ymm3,%ymm2,%ymm2
+ vmovdqu 0(%r14),%ymm11
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vpor %ymm8,%ymm2,%ymm2
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ vmovdqu %ymm9,320(%rsp)
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ vpxor %ymm4,%ymm3,%ymm3
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ vpor %ymm8,%ymm3,%ymm3
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ vmovdqu %ymm9,352(%rsp)
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpalignr $8,%ymm2,%ymm3,%ymm8
+ vpxor %ymm0,%ymm4,%ymm4
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm5,%ymm4,%ymm4
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ vpsrld $30,%ymm4,%ymm8
+ vpslld $2,%ymm4,%ymm4
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpor %ymm8,%ymm4,%ymm4
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpaddd %ymm11,%ymm4,%ymm9
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ vmovdqu %ymm9,384(%rsp)
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpalignr $8,%ymm3,%ymm4,%ymm8
+ vpxor %ymm1,%ymm5,%ymm5
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm6,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpxor %ymm8,%ymm5,%ymm5
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ vpsrld $30,%ymm5,%ymm8
+ vpslld $2,%ymm5,%ymm5
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vpor %ymm8,%ymm5,%ymm5
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ vmovdqu %ymm9,416(%rsp)
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm4,%ymm5,%ymm8
+ vpxor %ymm2,%ymm6,%ymm6
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm7,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ vpxor %ymm8,%ymm6,%ymm6
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ vpsrld $30,%ymm6,%ymm8
+ vpslld $2,%ymm6,%ymm6
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vpor %ymm8,%ymm6,%ymm6
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ vmovdqu %ymm9,448(%rsp)
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm5,%ymm6,%ymm8
+ vpxor %ymm3,%ymm7,%ymm7
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm0,%ymm7,%ymm7
+ vmovdqu 32(%r14),%ymm11
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpxor %ymm8,%ymm7,%ymm7
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ vpsrld $30,%ymm7,%ymm8
+ vpslld $2,%ymm7,%ymm7
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpor %ymm8,%ymm7,%ymm7
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ vmovdqu %ymm9,480(%rsp)
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ jmp .Lalign32_2
+.align 32
+.Lalign32_2:
+ vpalignr $8,%ymm6,%ymm7,%ymm8
+ vpxor %ymm4,%ymm0,%ymm0
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ vpxor %ymm1,%ymm0,%ymm0
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ vpxor %ymm8,%ymm0,%ymm0
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpsrld $30,%ymm0,%ymm8
+ vpslld $2,%ymm0,%ymm0
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ vpor %ymm8,%ymm0,%ymm0
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ vpaddd %ymm11,%ymm0,%ymm9
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ vmovdqu %ymm9,512(%rsp)
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpalignr $8,%ymm7,%ymm0,%ymm8
+ vpxor %ymm5,%ymm1,%ymm1
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm2,%ymm1,%ymm1
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpxor %ymm8,%ymm1,%ymm1
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpsrld $30,%ymm1,%ymm8
+ vpslld $2,%ymm1,%ymm1
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ vpor %ymm8,%ymm1,%ymm1
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ vmovdqu %ymm9,544(%rsp)
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vpalignr $8,%ymm0,%ymm1,%ymm8
+ vpxor %ymm6,%ymm2,%ymm2
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ vpxor %ymm3,%ymm2,%ymm2
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ vpxor %ymm8,%ymm2,%ymm2
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm2,%ymm8
+ vpslld $2,%ymm2,%ymm2
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ vpor %ymm8,%ymm2,%ymm2
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vpaddd %ymm11,%ymm2,%ymm9
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ vmovdqu %ymm9,576(%rsp)
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm1,%ymm2,%ymm8
+ vpxor %ymm7,%ymm3,%ymm3
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ vpxor %ymm4,%ymm3,%ymm3
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm3,%ymm3
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ vpsrld $30,%ymm3,%ymm8
+ vpslld $2,%ymm3,%ymm3
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ vpor %ymm8,%ymm3,%ymm3
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpaddd %ymm11,%ymm3,%ymm9
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vmovdqu %ymm9,608(%rsp)
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%r9),%r13
+ leaq 128(%r9),%rdi
+ cmpq %r10,%r13
+ cmovaeq %r9,%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ je .Ldone_avx2
+ vmovdqu 64(%r14),%ymm6
+ cmpq %r10,%rdi
+ ja .Last_avx2
+
+ vmovdqu -64(%rdi),%xmm0
+ vmovdqu -48(%rdi),%xmm1
+ vmovdqu -32(%rdi),%xmm2
+ vmovdqu -16(%rdi),%xmm3
+ vinserti128 $1,0(%r13),%ymm0,%ymm0
+ vinserti128 $1,16(%r13),%ymm1,%ymm1
+ vinserti128 $1,32(%r13),%ymm2,%ymm2
+ vinserti128 $1,48(%r13),%ymm3,%ymm3
+ jmp .Last_avx2
+
+.align 32
+.Last_avx2:
+ leaq 128+16(%rsp),%r13
+ rorxl $2,%ebp,%ebx
+ andnl %edx,%ebp,%edi
+ andl %ecx,%ebp
+ xorl %edi,%ebp
+ subq $-128,%r9
+ addl -128(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -124(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -120(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -116(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -96(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -92(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -88(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -84(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -64(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -60(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl -56(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl -52(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl -32(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl -28(%r13),%edx
+ andnl %ebx,%esi,%edi
+ addl %eax,%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ andl %ebp,%esi
+ addl %r12d,%edx
+ xorl %edi,%esi
+ addl -24(%r13),%ecx
+ andnl %ebp,%edx,%edi
+ addl %esi,%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ andl %eax,%edx
+ addl %r12d,%ecx
+ xorl %edi,%edx
+ addl -20(%r13),%ebx
+ andnl %eax,%ecx,%edi
+ addl %edx,%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ andl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %edi,%ecx
+ addl 0(%r13),%ebp
+ andnl %esi,%ebx,%edi
+ addl %ecx,%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ andl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %edi,%ebx
+ addl 4(%r13),%eax
+ andnl %edx,%ebp,%edi
+ addl %ebx,%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ andl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edi,%ebp
+ addl 8(%r13),%esi
+ andnl %ecx,%eax,%edi
+ addl %ebp,%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ andl %ebx,%eax
+ addl %r12d,%esi
+ xorl %edi,%eax
+ addl 12(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 32(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 36(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 40(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 44(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl 64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vmovdqu -64(%r14),%ymm11
+ vpshufb %ymm6,%ymm0,%ymm0
+ addl 68(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl 72(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl 76(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl 96(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl 100(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpshufb %ymm6,%ymm1,%ymm1
+ vpaddd %ymm11,%ymm0,%ymm8
+ addl 104(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl 108(%r13),%edx
+ leaq 256(%r13),%r13
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -128(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -124(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -120(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vmovdqu %ymm8,0(%rsp)
+ vpshufb %ymm6,%ymm2,%ymm2
+ vpaddd %ymm11,%ymm1,%ymm9
+ addl -116(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -92(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ addl -88(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -84(%r13),%ebx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ vmovdqu %ymm9,32(%rsp)
+ vpshufb %ymm6,%ymm3,%ymm3
+ vpaddd %ymm11,%ymm2,%ymm6
+ addl -64(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -60(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl -56(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl -52(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ addl -32(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ jmp .Lalign32_3
+.align 32
+.Lalign32_3:
+ vmovdqu %ymm6,64(%rsp)
+ vpaddd %ymm11,%ymm3,%ymm7
+ addl -28(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl -24(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl -20(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 0(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ addl 4(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ andl %edi,%esi
+ vmovdqu %ymm7,96(%rsp)
+ addl 8(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ andl %edi,%edx
+ addl 12(%r13),%ebx
+ xorl %eax,%edx
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 32(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 36(%r13),%eax
+ xorl %edx,%ebx
+ movl %ecx,%edi
+ xorl %edx,%edi
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ andl %edi,%ebp
+ addl 40(%r13),%esi
+ xorl %ecx,%ebp
+ movl %ebx,%edi
+ xorl %ecx,%edi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ andl %edi,%eax
+ vpalignr $8,%ymm0,%ymm1,%ymm4
+ addl 44(%r13),%edx
+ xorl %ebx,%eax
+ movl %ebp,%edi
+ xorl %ebx,%edi
+ vpsrldq $4,%ymm3,%ymm8
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpxor %ymm0,%ymm4,%ymm4
+ vpxor %ymm2,%ymm8,%ymm8
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpxor %ymm8,%ymm4,%ymm4
+ andl %edi,%esi
+ addl 64(%r13),%ecx
+ xorl %ebp,%esi
+ movl %eax,%edi
+ vpsrld $31,%ymm4,%ymm8
+ xorl %ebp,%edi
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ vpslldq $12,%ymm4,%ymm10
+ vpaddd %ymm4,%ymm4,%ymm4
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm4,%ymm4
+ addl %r12d,%ecx
+ andl %edi,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm4,%ymm4
+ addl 68(%r13),%ebx
+ xorl %eax,%edx
+ vpxor %ymm10,%ymm4,%ymm4
+ movl %esi,%edi
+ xorl %eax,%edi
+ leal (%rbx,%rdx,1),%ebx
+ vpaddd %ymm11,%ymm4,%ymm9
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ vmovdqu %ymm9,128(%rsp)
+ addl %r12d,%ebx
+ andl %edi,%ecx
+ addl 72(%r13),%ebp
+ xorl %esi,%ecx
+ movl %edx,%edi
+ xorl %esi,%edi
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ andl %edi,%ebx
+ addl 76(%r13),%eax
+ xorl %edx,%ebx
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpalignr $8,%ymm1,%ymm2,%ymm5
+ addl 96(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrldq $4,%ymm4,%ymm8
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ vpxor %ymm1,%ymm5,%ymm5
+ vpxor %ymm3,%ymm8,%ymm8
+ addl 100(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpxor %ymm8,%ymm5,%ymm5
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpsrld $31,%ymm5,%ymm8
+ vmovdqu -32(%r14),%ymm11
+ xorl %ebx,%esi
+ addl 104(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ vpslldq $12,%ymm5,%ymm10
+ vpaddd %ymm5,%ymm5,%ymm5
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm5,%ymm5
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm5,%ymm5
+ xorl %ebp,%edx
+ addl 108(%r13),%ebx
+ leaq 256(%r13),%r13
+ vpxor %ymm10,%ymm5,%ymm5
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ vpaddd %ymm11,%ymm5,%ymm9
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vmovdqu %ymm9,160(%rsp)
+ addl -128(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpalignr $8,%ymm2,%ymm3,%ymm6
+ addl -124(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ vpsrldq $4,%ymm5,%ymm8
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ vpxor %ymm2,%ymm6,%ymm6
+ vpxor %ymm4,%ymm8,%ymm8
+ addl -120(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpxor %ymm8,%ymm6,%ymm6
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpsrld $31,%ymm6,%ymm8
+ xorl %ecx,%eax
+ addl -116(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ vpslldq $12,%ymm6,%ymm10
+ vpaddd %ymm6,%ymm6,%ymm6
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm6,%ymm6
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm6,%ymm6
+ xorl %ebx,%esi
+ addl -96(%r13),%ecx
+ vpxor %ymm10,%ymm6,%ymm6
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ vpaddd %ymm11,%ymm6,%ymm9
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ vmovdqu %ymm9,192(%rsp)
+ addl -92(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ vpalignr $8,%ymm3,%ymm4,%ymm7
+ addl -88(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ vpsrldq $4,%ymm6,%ymm8
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ vpxor %ymm3,%ymm7,%ymm7
+ vpxor %ymm5,%ymm8,%ymm8
+ addl -84(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ vpxor %ymm8,%ymm7,%ymm7
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ vpsrld $31,%ymm7,%ymm8
+ xorl %edx,%ebp
+ addl -64(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ vpslldq $12,%ymm7,%ymm10
+ vpaddd %ymm7,%ymm7,%ymm7
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ vpsrld $30,%ymm10,%ymm9
+ vpor %ymm8,%ymm7,%ymm7
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ vpslld $2,%ymm10,%ymm10
+ vpxor %ymm9,%ymm7,%ymm7
+ xorl %ecx,%eax
+ addl -60(%r13),%edx
+ vpxor %ymm10,%ymm7,%ymm7
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ rorxl $2,%esi,%eax
+ vpaddd %ymm11,%ymm7,%ymm9
+ xorl %ebp,%esi
+ addl %r12d,%edx
+ xorl %ebx,%esi
+ vmovdqu %ymm9,224(%rsp)
+ addl -56(%r13),%ecx
+ leal (%rcx,%rsi,1),%ecx
+ rorxl $27,%edx,%r12d
+ rorxl $2,%edx,%esi
+ xorl %eax,%edx
+ addl %r12d,%ecx
+ xorl %ebp,%edx
+ addl -52(%r13),%ebx
+ leal (%rbx,%rdx,1),%ebx
+ rorxl $27,%ecx,%r12d
+ rorxl $2,%ecx,%edx
+ xorl %esi,%ecx
+ addl %r12d,%ebx
+ xorl %eax,%ecx
+ addl -32(%r13),%ebp
+ leal (%rcx,%rbp,1),%ebp
+ rorxl $27,%ebx,%r12d
+ rorxl $2,%ebx,%ecx
+ xorl %edx,%ebx
+ addl %r12d,%ebp
+ xorl %esi,%ebx
+ addl -28(%r13),%eax
+ leal (%rax,%rbx,1),%eax
+ rorxl $27,%ebp,%r12d
+ rorxl $2,%ebp,%ebx
+ xorl %ecx,%ebp
+ addl %r12d,%eax
+ xorl %edx,%ebp
+ addl -24(%r13),%esi
+ leal (%rsi,%rbp,1),%esi
+ rorxl $27,%eax,%r12d
+ rorxl $2,%eax,%ebp
+ xorl %ebx,%eax
+ addl %r12d,%esi
+ xorl %ecx,%eax
+ addl -20(%r13),%edx
+ leal (%rdx,%rax,1),%edx
+ rorxl $27,%esi,%r12d
+ addl %r12d,%edx
+ leaq 128(%rsp),%r13
+
+
+ addl 0(%r8),%edx
+ addl 4(%r8),%esi
+ addl 8(%r8),%ebp
+ movl %edx,0(%r8)
+ addl 12(%r8),%ebx
+ movl %esi,4(%r8)
+ movl %edx,%eax
+ addl 16(%r8),%ecx
+ movl %ebp,%r12d
+ movl %ebp,8(%r8)
+ movl %ebx,%edx
+
+ movl %ebx,12(%r8)
+ movl %esi,%ebp
+ movl %ecx,16(%r8)
+
+ movl %ecx,%esi
+ movl %r12d,%ecx
+
+
+ cmpq %r10,%r9
+ jbe .Loop_avx2
+
+.Ldone_avx2:
+ vzeroupper
+ movq -40(%r11),%r14
+.cfi_restore %r14
+ movq -32(%r11),%r13
+.cfi_restore %r13
+ movq -24(%r11),%r12
+.cfi_restore %r12
+ movq -16(%r11),%rbp
+.cfi_restore %rbp
+ movq -8(%r11),%rbx
+.cfi_restore %rbx
+ leaq (%r11),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
+.align 64
+K_XX_XX:
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
+.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S
@@ -1,0 +1,3973 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+.globl sha256_block_data_order
+.hidden sha256_block_data_order
+.type sha256_block_data_order,@function
+.align 16
+sha256_block_data_order:
+.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 0(%r11),%r9d
+ movl 4(%r11),%r10d
+ movl 8(%r11),%r11d
+ andl $1073741824,%r9d
+ andl $268435968,%r10d
+ orl %r9d,%r10d
+ cmpl $1342177792,%r10d
+ je .Lavx_shortcut
+ testl $512,%r10d
+ jnz .Lssse3_shortcut
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $64+32,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ movl %ebx,%edi
+ leaq K256(%rip),%rbp
+ xorl %ecx,%edi
+ movl 0(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,0(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
+ movl 4(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,4(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
+ movl 8(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,8(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
+ movl 12(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,12(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
+ movl 16(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,16(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
+ movl 20(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,20(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
+ movl 24(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,24(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
+ movl 28(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,28(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%eax
+ movl 32(%rsi),%r12d
+ movl %r8d,%r13d
+ movl %eax,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,32(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r11d
+ movl 36(%rsi),%r12d
+ movl %edx,%r13d
+ movl %r11d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,36(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r10d
+ movl 40(%rsi),%r12d
+ movl %ecx,%r13d
+ movl %r10d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,40(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%r9d
+ movl 44(%rsi),%r12d
+ movl %ebx,%r13d
+ movl %r9d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,44(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ addl %r14d,%r8d
+ movl 48(%rsi),%r12d
+ movl %eax,%r13d
+ movl %r8d,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,48(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%edx
+ movl 52(%rsi),%r12d
+ movl %r11d,%r13d
+ movl %edx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,52(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ecx
+ movl 56(%rsi),%r12d
+ movl %r10d,%r13d
+ movl %ecx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,56(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ addl %r14d,%ebx
+ movl 60(%rsi),%r12d
+ movl %r9d,%r13d
+ movl %ebx,%r14d
+ bswapl %r12d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,60(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ movl 4(%rsp),%r13d
+ movl 56(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 36(%rsp),%r12d
+
+ addl 0(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,0(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ movl 8(%rsp),%r13d
+ movl 60(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 40(%rsp),%r12d
+
+ addl 4(%rsp),%r12d
+ movl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,4(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ movl 12(%rsp),%r13d
+ movl 0(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 44(%rsp),%r12d
+
+ addl 8(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,8(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ movl 16(%rsp),%r13d
+ movl 4(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 48(%rsp),%r12d
+
+ addl 12(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,12(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ movl 20(%rsp),%r13d
+ movl 8(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 52(%rsp),%r12d
+
+ addl 16(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,16(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ movl 24(%rsp),%r13d
+ movl 12(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 56(%rsp),%r12d
+
+ addl 20(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,20(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ movl 28(%rsp),%r13d
+ movl 16(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 60(%rsp),%r12d
+
+ addl 24(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,24(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ movl 32(%rsp),%r13d
+ movl 20(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 0(%rsp),%r12d
+
+ addl 28(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,28(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ movl 36(%rsp),%r13d
+ movl 24(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%eax
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 4(%rsp),%r12d
+
+ addl 32(%rsp),%r12d
+ movl %r8d,%r13d
+ addl %r15d,%r12d
+ movl %eax,%r14d
+ rorl $14,%r13d
+ movl %r9d,%r15d
+
+ xorl %r8d,%r13d
+ rorl $9,%r14d
+ xorl %r10d,%r15d
+
+ movl %r12d,32(%rsp)
+ xorl %eax,%r14d
+ andl %r8d,%r15d
+
+ rorl $5,%r13d
+ addl %r11d,%r12d
+ xorl %r10d,%r15d
+
+ rorl $11,%r14d
+ xorl %r8d,%r13d
+ addl %r15d,%r12d
+
+ movl %eax,%r15d
+ addl (%rbp),%r12d
+ xorl %eax,%r14d
+
+ xorl %ebx,%r15d
+ rorl $6,%r13d
+ movl %ebx,%r11d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r11d
+ addl %r12d,%edx
+ addl %r12d,%r11d
+
+ leaq 4(%rbp),%rbp
+ movl 40(%rsp),%r13d
+ movl 28(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r11d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 8(%rsp),%r12d
+
+ addl 36(%rsp),%r12d
+ movl %edx,%r13d
+ addl %edi,%r12d
+ movl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r8d,%edi
+
+ xorl %edx,%r13d
+ rorl $9,%r14d
+ xorl %r9d,%edi
+
+ movl %r12d,36(%rsp)
+ xorl %r11d,%r14d
+ andl %edx,%edi
+
+ rorl $5,%r13d
+ addl %r10d,%r12d
+ xorl %r9d,%edi
+
+ rorl $11,%r14d
+ xorl %edx,%r13d
+ addl %edi,%r12d
+
+ movl %r11d,%edi
+ addl (%rbp),%r12d
+ xorl %r11d,%r14d
+
+ xorl %eax,%edi
+ rorl $6,%r13d
+ movl %eax,%r10d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r10d
+ addl %r12d,%ecx
+ addl %r12d,%r10d
+
+ leaq 4(%rbp),%rbp
+ movl 44(%rsp),%r13d
+ movl 32(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r10d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 12(%rsp),%r12d
+
+ addl 40(%rsp),%r12d
+ movl %ecx,%r13d
+ addl %r15d,%r12d
+ movl %r10d,%r14d
+ rorl $14,%r13d
+ movl %edx,%r15d
+
+ xorl %ecx,%r13d
+ rorl $9,%r14d
+ xorl %r8d,%r15d
+
+ movl %r12d,40(%rsp)
+ xorl %r10d,%r14d
+ andl %ecx,%r15d
+
+ rorl $5,%r13d
+ addl %r9d,%r12d
+ xorl %r8d,%r15d
+
+ rorl $11,%r14d
+ xorl %ecx,%r13d
+ addl %r15d,%r12d
+
+ movl %r10d,%r15d
+ addl (%rbp),%r12d
+ xorl %r10d,%r14d
+
+ xorl %r11d,%r15d
+ rorl $6,%r13d
+ movl %r11d,%r9d
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%r9d
+ addl %r12d,%ebx
+ addl %r12d,%r9d
+
+ leaq 4(%rbp),%rbp
+ movl 48(%rsp),%r13d
+ movl 36(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r9d
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 16(%rsp),%r12d
+
+ addl 44(%rsp),%r12d
+ movl %ebx,%r13d
+ addl %edi,%r12d
+ movl %r9d,%r14d
+ rorl $14,%r13d
+ movl %ecx,%edi
+
+ xorl %ebx,%r13d
+ rorl $9,%r14d
+ xorl %edx,%edi
+
+ movl %r12d,44(%rsp)
+ xorl %r9d,%r14d
+ andl %ebx,%edi
+
+ rorl $5,%r13d
+ addl %r8d,%r12d
+ xorl %edx,%edi
+
+ rorl $11,%r14d
+ xorl %ebx,%r13d
+ addl %edi,%r12d
+
+ movl %r9d,%edi
+ addl (%rbp),%r12d
+ xorl %r9d,%r14d
+
+ xorl %r10d,%edi
+ rorl $6,%r13d
+ movl %r10d,%r8d
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%r8d
+ addl %r12d,%eax
+ addl %r12d,%r8d
+
+ leaq 20(%rbp),%rbp
+ movl 52(%rsp),%r13d
+ movl 40(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%r8d
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 20(%rsp),%r12d
+
+ addl 48(%rsp),%r12d
+ movl %eax,%r13d
+ addl %r15d,%r12d
+ movl %r8d,%r14d
+ rorl $14,%r13d
+ movl %ebx,%r15d
+
+ xorl %eax,%r13d
+ rorl $9,%r14d
+ xorl %ecx,%r15d
+
+ movl %r12d,48(%rsp)
+ xorl %r8d,%r14d
+ andl %eax,%r15d
+
+ rorl $5,%r13d
+ addl %edx,%r12d
+ xorl %ecx,%r15d
+
+ rorl $11,%r14d
+ xorl %eax,%r13d
+ addl %r15d,%r12d
+
+ movl %r8d,%r15d
+ addl (%rbp),%r12d
+ xorl %r8d,%r14d
+
+ xorl %r9d,%r15d
+ rorl $6,%r13d
+ movl %r9d,%edx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%edx
+ addl %r12d,%r11d
+ addl %r12d,%edx
+
+ leaq 4(%rbp),%rbp
+ movl 56(%rsp),%r13d
+ movl 44(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%edx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 24(%rsp),%r12d
+
+ addl 52(%rsp),%r12d
+ movl %r11d,%r13d
+ addl %edi,%r12d
+ movl %edx,%r14d
+ rorl $14,%r13d
+ movl %eax,%edi
+
+ xorl %r11d,%r13d
+ rorl $9,%r14d
+ xorl %ebx,%edi
+
+ movl %r12d,52(%rsp)
+ xorl %edx,%r14d
+ andl %r11d,%edi
+
+ rorl $5,%r13d
+ addl %ecx,%r12d
+ xorl %ebx,%edi
+
+ rorl $11,%r14d
+ xorl %r11d,%r13d
+ addl %edi,%r12d
+
+ movl %edx,%edi
+ addl (%rbp),%r12d
+ xorl %edx,%r14d
+
+ xorl %r8d,%edi
+ rorl $6,%r13d
+ movl %r8d,%ecx
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%ecx
+ addl %r12d,%r10d
+ addl %r12d,%ecx
+
+ leaq 4(%rbp),%rbp
+ movl 60(%rsp),%r13d
+ movl 48(%rsp),%r15d
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ecx
+ movl %r15d,%r14d
+ rorl $2,%r15d
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%r15d
+ shrl $10,%r14d
+
+ rorl $17,%r15d
+ xorl %r13d,%r12d
+ xorl %r14d,%r15d
+ addl 28(%rsp),%r12d
+
+ addl 56(%rsp),%r12d
+ movl %r10d,%r13d
+ addl %r15d,%r12d
+ movl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r11d,%r15d
+
+ xorl %r10d,%r13d
+ rorl $9,%r14d
+ xorl %eax,%r15d
+
+ movl %r12d,56(%rsp)
+ xorl %ecx,%r14d
+ andl %r10d,%r15d
+
+ rorl $5,%r13d
+ addl %ebx,%r12d
+ xorl %eax,%r15d
+
+ rorl $11,%r14d
+ xorl %r10d,%r13d
+ addl %r15d,%r12d
+
+ movl %ecx,%r15d
+ addl (%rbp),%r12d
+ xorl %ecx,%r14d
+
+ xorl %edx,%r15d
+ rorl $6,%r13d
+ movl %edx,%ebx
+
+ andl %r15d,%edi
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %edi,%ebx
+ addl %r12d,%r9d
+ addl %r12d,%ebx
+
+ leaq 4(%rbp),%rbp
+ movl 0(%rsp),%r13d
+ movl 52(%rsp),%edi
+
+ movl %r13d,%r12d
+ rorl $11,%r13d
+ addl %r14d,%ebx
+ movl %edi,%r14d
+ rorl $2,%edi
+
+ xorl %r12d,%r13d
+ shrl $3,%r12d
+ rorl $7,%r13d
+ xorl %r14d,%edi
+ shrl $10,%r14d
+
+ rorl $17,%edi
+ xorl %r13d,%r12d
+ xorl %r14d,%edi
+ addl 32(%rsp),%r12d
+
+ addl 60(%rsp),%r12d
+ movl %r9d,%r13d
+ addl %edi,%r12d
+ movl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r10d,%edi
+
+ xorl %r9d,%r13d
+ rorl $9,%r14d
+ xorl %r11d,%edi
+
+ movl %r12d,60(%rsp)
+ xorl %ebx,%r14d
+ andl %r9d,%edi
+
+ rorl $5,%r13d
+ addl %eax,%r12d
+ xorl %r11d,%edi
+
+ rorl $11,%r14d
+ xorl %r9d,%r13d
+ addl %edi,%r12d
+
+ movl %ebx,%edi
+ addl (%rbp),%r12d
+ xorl %ebx,%r14d
+
+ xorl %ecx,%edi
+ rorl $6,%r13d
+ movl %ecx,%eax
+
+ andl %edi,%r15d
+ rorl $2,%r14d
+ addl %r13d,%r12d
+
+ xorl %r15d,%eax
+ addl %r12d,%r8d
+ addl %r12d,%eax
+
+ leaq 20(%rbp),%rbp
+ cmpb $0,3(%rbp)
+ jnz .Lrounds_16_xx
+
+ movq 64+0(%rsp),%rdi
+ addl %r14d,%eax
+ leaq 64(%rsi),%rsi
+
+ addl 0(%rdi),%eax
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop
+
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_block_data_order,.-sha256_block_data_order
+.align 64
+.type K256,@object
+K256:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.type sha256_block_data_order_ssse3,@function
+.align 64
+sha256_block_data_order_ssse3:
+.cfi_startproc
+.Lssse3_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_ssse3:
+
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+
+
+ jmp .Lloop_ssse3
+.align 16
+.Lloop_ssse3:
+ movdqa K256+512(%rip),%xmm7
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+.byte 102,15,56,0,199
+ movdqu 48(%rsi),%xmm3
+ leaq K256(%rip),%rbp
+.byte 102,15,56,0,207
+ movdqa 0(%rbp),%xmm4
+ movdqa 32(%rbp),%xmm5
+.byte 102,15,56,0,215
+ paddd %xmm0,%xmm4
+ movdqa 64(%rbp),%xmm6
+.byte 102,15,56,0,223
+ movdqa 96(%rbp),%xmm7
+ paddd %xmm1,%xmm5
+ paddd %xmm2,%xmm6
+ paddd %xmm3,%xmm7
+ movdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ movdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ movdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ movdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lssse3_00_47
+
+.align 16
+.Lssse3_00_47:
+ subq $-128,%rbp
+ rorl $14,%r13d
+ movdqa %xmm1,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm3,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,224,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,250,4
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm3,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm0
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm0
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm0,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 0(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm0
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm0,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,0(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm2,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm0,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,225,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,251,4
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm0,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm1
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm1
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm1,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 32(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm1
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm1,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,16(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm3,%xmm4
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ movdqa %xmm1,%xmm7
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+.byte 102,15,58,15,226,4
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+.byte 102,15,58,15,248,4
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r11d,%edx
+ psrld $7,%xmm6
+ addl %edi,%r11d
+ movl %edx,%r13d
+ pshufd $250,%xmm1,%xmm7
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %r11d,%r14d
+ pxor %xmm5,%xmm4
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ pslld $11,%xmm5
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ pxor %xmm6,%xmm4
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ paddd %xmm4,%xmm2
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ psrlq $17,%xmm6
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ psrldq $8,%xmm7
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ paddd %xmm7,%xmm2
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ pshufd $80,%xmm2,%xmm7
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ movdqa %xmm7,%xmm6
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ psrld $10,%xmm7
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ psrlq $2,%xmm6
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ pxor %xmm6,%xmm7
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ movdqa 64(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ paddd %xmm7,%xmm2
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ paddd %xmm2,%xmm6
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ movdqa %xmm6,32(%rsp)
+ rorl $14,%r13d
+ movdqa %xmm0,%xmm4
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ movdqa %xmm2,%xmm7
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+.byte 102,15,58,15,227,4
+ andl %eax,%r12d
+ xorl %eax,%r13d
+.byte 102,15,58,15,249,4
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm4,%xmm5
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ movdqa %xmm4,%xmm6
+ rorl $6,%r13d
+ andl %r15d,%edi
+ psrld $3,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %edx,%r11d
+ psrld $7,%xmm6
+ addl %edi,%edx
+ movl %r11d,%r13d
+ pshufd $250,%xmm2,%xmm7
+ addl %edx,%r14d
+ rorl $14,%r13d
+ pslld $14,%xmm5
+ movl %r14d,%edx
+ movl %eax,%r12d
+ pxor %xmm6,%xmm4
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ psrld $11,%xmm6
+ xorl %edx,%r14d
+ pxor %xmm5,%xmm4
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ pslld $11,%xmm5
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ pxor %xmm6,%xmm4
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ movdqa %xmm7,%xmm6
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ pxor %xmm5,%xmm4
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ psrld $10,%xmm7
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ paddd %xmm4,%xmm3
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ psrlq $17,%xmm6
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ pxor %xmm6,%xmm7
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ psrlq $2,%xmm6
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ pxor %xmm6,%xmm7
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ pshufd $128,%xmm7,%xmm7
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ psrldq $8,%xmm7
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ paddd %xmm7,%xmm3
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ pshufd $80,%xmm3,%xmm7
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ movdqa %xmm7,%xmm6
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ psrld $10,%xmm7
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ psrlq $17,%xmm6
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ pxor %xmm6,%xmm7
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ psrlq $2,%xmm6
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ pxor %xmm6,%xmm7
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ pshufd $8,%xmm7,%xmm7
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ movdqa 96(%rbp),%xmm6
+ rorl $6,%r13d
+ andl %edi,%r15d
+ pslldq $8,%xmm7
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ paddd %xmm7,%xmm3
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ paddd %xmm3,%xmm6
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne .Lssse3_00_47
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ rorl $14,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ rorl $9,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ rorl $5,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ rorl $11,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ rorl $2,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ rorl $9,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ rorl $5,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ rorl $11,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ rorl $2,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ rorl $9,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ rorl $5,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ rorl $11,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ rorl $2,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ rorl $9,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ rorl $5,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ rorl $11,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ rorl $2,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ rorl $14,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ rorl $9,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ rorl $5,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ rorl $11,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ rorl $2,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ rorl $9,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ rorl $5,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ rorl $11,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ rorl $2,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ rorl $9,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ rorl $5,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ rorl $11,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ rorl $6,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ rorl $2,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ rorl $14,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ rorl $9,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ rorl $5,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ rorl $11,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ rorl $6,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ rorl $2,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop_ssse3
+
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_ssse3:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3
+.type sha256_block_data_order_avx,@function
+.align 64
+sha256_block_data_order_avx:
+.cfi_startproc
+.Lavx_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $96,%rsp
+ leaq (%rsi,%rdx,4),%rdx
+ andq $-64,%rsp
+ movq %rdi,64+0(%rsp)
+ movq %rsi,64+8(%rsp)
+ movq %rdx,64+16(%rsp)
+ movq %rax,88(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_avx:
+
+ vzeroupper
+ movl 0(%rdi),%eax
+ movl 4(%rdi),%ebx
+ movl 8(%rdi),%ecx
+ movl 12(%rdi),%edx
+ movl 16(%rdi),%r8d
+ movl 20(%rdi),%r9d
+ movl 24(%rdi),%r10d
+ movl 28(%rdi),%r11d
+ vmovdqa K256+512+32(%rip),%xmm8
+ vmovdqa K256+512+64(%rip),%xmm9
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa K256+512(%rip),%xmm7
+ vmovdqu 0(%rsi),%xmm0
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm7,%xmm0,%xmm0
+ leaq K256(%rip),%rbp
+ vpshufb %xmm7,%xmm1,%xmm1
+ vpshufb %xmm7,%xmm2,%xmm2
+ vpaddd 0(%rbp),%xmm0,%xmm4
+ vpshufb %xmm7,%xmm3,%xmm3
+ vpaddd 32(%rbp),%xmm1,%xmm5
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ vpaddd 96(%rbp),%xmm3,%xmm7
+ vmovdqa %xmm4,0(%rsp)
+ movl %eax,%r14d
+ vmovdqa %xmm5,16(%rsp)
+ movl %ebx,%edi
+ vmovdqa %xmm6,32(%rsp)
+ xorl %ecx,%edi
+ vmovdqa %xmm7,48(%rsp)
+ movl %r8d,%r13d
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ subq $-128,%rbp
+ vpalignr $4,%xmm0,%xmm1,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm2,%xmm3,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm0,%xmm0
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm3,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm0,%xmm0
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm0,%xmm0
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ vpshufd $80,%xmm0,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm0,%xmm0
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 0(%rbp),%xmm0,%xmm6
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,0(%rsp)
+ vpalignr $4,%xmm1,%xmm2,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm3,%xmm0,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm1,%xmm1
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm0,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm1,%xmm1
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm1,%xmm1
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ vpshufd $80,%xmm1,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm1,%xmm1
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 32(%rbp),%xmm1,%xmm6
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,16(%rsp)
+ vpalignr $4,%xmm2,%xmm3,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ vpalignr $4,%xmm0,%xmm1,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ vpaddd %xmm7,%xmm2,%xmm2
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ vpshufd $250,%xmm1,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ vpsrld $11,%xmm6,%xmm6
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ vpaddd %xmm4,%xmm2,%xmm2
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ vpaddd %xmm6,%xmm2,%xmm2
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ vpshufd $80,%xmm2,%xmm7
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ vpaddd %xmm6,%xmm2,%xmm2
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ vpaddd 64(%rbp),%xmm2,%xmm6
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ vmovdqa %xmm6,32(%rsp)
+ vpalignr $4,%xmm3,%xmm0,%xmm4
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ vpalignr $4,%xmm1,%xmm2,%xmm7
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ vpsrld $7,%xmm4,%xmm6
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ vpaddd %xmm7,%xmm3,%xmm3
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ vpsrld $3,%xmm4,%xmm7
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ vpslld $14,%xmm4,%xmm5
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ vpxor %xmm6,%xmm7,%xmm4
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ vpshufd $250,%xmm2,%xmm7
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ vpsrld $11,%xmm6,%xmm6
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ vpxor %xmm5,%xmm4,%xmm4
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ vpslld $11,%xmm5,%xmm5
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ vpxor %xmm6,%xmm4,%xmm4
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ vpsrld $10,%xmm7,%xmm6
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ vpxor %xmm5,%xmm4,%xmm4
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ vpsrlq $17,%xmm7,%xmm7
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ vpaddd %xmm4,%xmm3,%xmm3
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ vpxor %xmm7,%xmm6,%xmm6
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ vpshufb %xmm8,%xmm6,%xmm6
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ vpaddd %xmm6,%xmm3,%xmm3
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ vpshufd $80,%xmm3,%xmm7
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ vpsrld $10,%xmm7,%xmm6
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ vpsrlq $17,%xmm7,%xmm7
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ vpxor %xmm7,%xmm6,%xmm6
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ vpsrlq $2,%xmm7,%xmm7
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ vpxor %xmm7,%xmm6,%xmm6
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ vpshufb %xmm9,%xmm6,%xmm6
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ vpaddd %xmm6,%xmm3,%xmm3
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ vpaddd 96(%rbp),%xmm3,%xmm6
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ vmovdqa %xmm6,48(%rsp)
+ cmpb $0,131(%rbp)
+ jne .Lavx_00_47
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 0(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 4(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 8(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 12(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 16(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 20(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 24(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 28(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%eax
+ movl %r9d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r8d,%r13d
+ xorl %r10d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %eax,%r14d
+ andl %r8d,%r12d
+ xorl %r8d,%r13d
+ addl 32(%rsp),%r11d
+ movl %eax,%r15d
+ xorl %r10d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ebx,%r15d
+ addl %r12d,%r11d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %eax,%r14d
+ addl %r13d,%r11d
+ xorl %ebx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r11d,%edx
+ addl %edi,%r11d
+ movl %edx,%r13d
+ addl %r11d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r11d
+ movl %r8d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %edx,%r13d
+ xorl %r9d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r11d,%r14d
+ andl %edx,%r12d
+ xorl %edx,%r13d
+ addl 36(%rsp),%r10d
+ movl %r11d,%edi
+ xorl %r9d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %eax,%edi
+ addl %r12d,%r10d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r11d,%r14d
+ addl %r13d,%r10d
+ xorl %eax,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r10d,%ecx
+ addl %r15d,%r10d
+ movl %ecx,%r13d
+ addl %r10d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r10d
+ movl %edx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ecx,%r13d
+ xorl %r8d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r10d,%r14d
+ andl %ecx,%r12d
+ xorl %ecx,%r13d
+ addl 40(%rsp),%r9d
+ movl %r10d,%r15d
+ xorl %r8d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r11d,%r15d
+ addl %r12d,%r9d
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r10d,%r14d
+ addl %r13d,%r9d
+ xorl %r11d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %r9d,%ebx
+ addl %edi,%r9d
+ movl %ebx,%r13d
+ addl %r9d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r9d
+ movl %ecx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %ebx,%r13d
+ xorl %edx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r9d,%r14d
+ andl %ebx,%r12d
+ xorl %ebx,%r13d
+ addl 44(%rsp),%r8d
+ movl %r9d,%edi
+ xorl %edx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r10d,%edi
+ addl %r12d,%r8d
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %r9d,%r14d
+ addl %r13d,%r8d
+ xorl %r10d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %r8d,%eax
+ addl %r15d,%r8d
+ movl %eax,%r13d
+ addl %r8d,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%r8d
+ movl %ebx,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %eax,%r13d
+ xorl %ecx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %r8d,%r14d
+ andl %eax,%r12d
+ xorl %eax,%r13d
+ addl 48(%rsp),%edx
+ movl %r8d,%r15d
+ xorl %ecx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r9d,%r15d
+ addl %r12d,%edx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %r8d,%r14d
+ addl %r13d,%edx
+ xorl %r9d,%edi
+ shrdl $2,%r14d,%r14d
+ addl %edx,%r11d
+ addl %edi,%edx
+ movl %r11d,%r13d
+ addl %edx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%edx
+ movl %eax,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r11d,%r13d
+ xorl %ebx,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %edx,%r14d
+ andl %r11d,%r12d
+ xorl %r11d,%r13d
+ addl 52(%rsp),%ecx
+ movl %edx,%edi
+ xorl %ebx,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %r8d,%edi
+ addl %r12d,%ecx
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %edx,%r14d
+ addl %r13d,%ecx
+ xorl %r8d,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %ecx,%r10d
+ addl %r15d,%ecx
+ movl %r10d,%r13d
+ addl %ecx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ecx
+ movl %r11d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r10d,%r13d
+ xorl %eax,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ecx,%r14d
+ andl %r10d,%r12d
+ xorl %r10d,%r13d
+ addl 56(%rsp),%ebx
+ movl %ecx,%r15d
+ xorl %eax,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %edx,%r15d
+ addl %r12d,%ebx
+ shrdl $6,%r13d,%r13d
+ andl %r15d,%edi
+ xorl %ecx,%r14d
+ addl %r13d,%ebx
+ xorl %edx,%edi
+ shrdl $2,%r14d,%r14d
+ addl %ebx,%r9d
+ addl %edi,%ebx
+ movl %r9d,%r13d
+ addl %ebx,%r14d
+ shrdl $14,%r13d,%r13d
+ movl %r14d,%ebx
+ movl %r10d,%r12d
+ shrdl $9,%r14d,%r14d
+ xorl %r9d,%r13d
+ xorl %r11d,%r12d
+ shrdl $5,%r13d,%r13d
+ xorl %ebx,%r14d
+ andl %r9d,%r12d
+ xorl %r9d,%r13d
+ addl 60(%rsp),%eax
+ movl %ebx,%edi
+ xorl %r11d,%r12d
+ shrdl $11,%r14d,%r14d
+ xorl %ecx,%edi
+ addl %r12d,%eax
+ shrdl $6,%r13d,%r13d
+ andl %edi,%r15d
+ xorl %ebx,%r14d
+ addl %r13d,%eax
+ xorl %ecx,%r15d
+ shrdl $2,%r14d,%r14d
+ addl %eax,%r8d
+ addl %r15d,%eax
+ movl %r8d,%r13d
+ addl %eax,%r14d
+ movq 64+0(%rsp),%rdi
+ movl %r14d,%eax
+
+ addl 0(%rdi),%eax
+ leaq 64(%rsi),%rsi
+ addl 4(%rdi),%ebx
+ addl 8(%rdi),%ecx
+ addl 12(%rdi),%edx
+ addl 16(%rdi),%r8d
+ addl 20(%rdi),%r9d
+ addl 24(%rdi),%r10d
+ addl 28(%rdi),%r11d
+
+ cmpq 64+16(%rsp),%rsi
+
+ movl %eax,0(%rdi)
+ movl %ebx,4(%rdi)
+ movl %ecx,8(%rdi)
+ movl %edx,12(%rdi)
+ movl %r8d,16(%rdi)
+ movl %r9d,20(%rdi)
+ movl %r10d,24(%rdi)
+ movl %r11d,28(%rdi)
+ jb .Lloop_avx
+
+ movq 88(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha256_block_data_order_avx,.-sha256_block_data_order_avx
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S
@@ -1,0 +1,2992 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+.globl sha512_block_data_order
+.hidden sha512_block_data_order
+.type sha512_block_data_order,@function
+.align 16
+sha512_block_data_order:
+.cfi_startproc
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 0(%r11),%r9d
+ movl 4(%r11),%r10d
+ movl 8(%r11),%r11d
+ andl $1073741824,%r9d
+ andl $268435968,%r10d
+ orl %r9d,%r10d
+ cmpl $1342177792,%r10d
+ je .Lavx_shortcut
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $128+32,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue:
+
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ movq %rbx,%rdi
+ leaq K512(%rip),%rbp
+ xorq %rcx,%rdi
+ movq 0(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,0(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r11
+ movq 8(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,8(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r10
+ movq 16(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,16(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r9
+ movq 24(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,24(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r8
+ movq 32(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,32(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rdx
+ movq 40(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,40(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rcx
+ movq 48(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,48(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rbx
+ movq 56(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,56(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rax
+ movq 64(%rsi),%r12
+ movq %r8,%r13
+ movq %rax,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,64(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r11
+ movq 72(%rsi),%r12
+ movq %rdx,%r13
+ movq %r11,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,72(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r10
+ movq 80(%rsi),%r12
+ movq %rcx,%r13
+ movq %r10,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,80(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%r9
+ movq 88(%rsi),%r12
+ movq %rbx,%r13
+ movq %r9,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,88(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%r8
+ movq 96(%rsi),%r12
+ movq %rax,%r13
+ movq %r8,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,96(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rdx
+ movq 104(%rsi),%r12
+ movq %r11,%r13
+ movq %rdx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,104(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ addq %r14,%rcx
+ movq 112(%rsi),%r12
+ movq %r10,%r13
+ movq %rcx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,112(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ addq %r14,%rbx
+ movq 120(%rsi),%r12
+ movq %r9,%r13
+ movq %rbx,%r14
+ bswapq %r12
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,120(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ movq 8(%rsp),%r13
+ movq 112(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rax
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 72(%rsp),%r12
+
+ addq 0(%rsp),%r12
+ movq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,0(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ movq 16(%rsp),%r13
+ movq 120(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r11
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 80(%rsp),%r12
+
+ addq 8(%rsp),%r12
+ movq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,8(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ movq 24(%rsp),%r13
+ movq 0(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r10
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 88(%rsp),%r12
+
+ addq 16(%rsp),%r12
+ movq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,16(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ movq 32(%rsp),%r13
+ movq 8(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r9
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 96(%rsp),%r12
+
+ addq 24(%rsp),%r12
+ movq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,24(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ movq 40(%rsp),%r13
+ movq 16(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r8
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 104(%rsp),%r12
+
+ addq 32(%rsp),%r12
+ movq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,32(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ movq 48(%rsp),%r13
+ movq 24(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rdx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 112(%rsp),%r12
+
+ addq 40(%rsp),%r12
+ movq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,40(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ movq 56(%rsp),%r13
+ movq 32(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rcx
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 120(%rsp),%r12
+
+ addq 48(%rsp),%r12
+ movq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,48(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ movq 64(%rsp),%r13
+ movq 40(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rbx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 0(%rsp),%r12
+
+ addq 56(%rsp),%r12
+ movq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,56(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ movq 72(%rsp),%r13
+ movq 48(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rax
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 8(%rsp),%r12
+
+ addq 64(%rsp),%r12
+ movq %r8,%r13
+ addq %r15,%r12
+ movq %rax,%r14
+ rorq $23,%r13
+ movq %r9,%r15
+
+ xorq %r8,%r13
+ rorq $5,%r14
+ xorq %r10,%r15
+
+ movq %r12,64(%rsp)
+ xorq %rax,%r14
+ andq %r8,%r15
+
+ rorq $4,%r13
+ addq %r11,%r12
+ xorq %r10,%r15
+
+ rorq $6,%r14
+ xorq %r8,%r13
+ addq %r15,%r12
+
+ movq %rax,%r15
+ addq (%rbp),%r12
+ xorq %rax,%r14
+
+ xorq %rbx,%r15
+ rorq $14,%r13
+ movq %rbx,%r11
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r11
+ addq %r12,%rdx
+ addq %r12,%r11
+
+ leaq 8(%rbp),%rbp
+ movq 80(%rsp),%r13
+ movq 56(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r11
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 16(%rsp),%r12
+
+ addq 72(%rsp),%r12
+ movq %rdx,%r13
+ addq %rdi,%r12
+ movq %r11,%r14
+ rorq $23,%r13
+ movq %r8,%rdi
+
+ xorq %rdx,%r13
+ rorq $5,%r14
+ xorq %r9,%rdi
+
+ movq %r12,72(%rsp)
+ xorq %r11,%r14
+ andq %rdx,%rdi
+
+ rorq $4,%r13
+ addq %r10,%r12
+ xorq %r9,%rdi
+
+ rorq $6,%r14
+ xorq %rdx,%r13
+ addq %rdi,%r12
+
+ movq %r11,%rdi
+ addq (%rbp),%r12
+ xorq %r11,%r14
+
+ xorq %rax,%rdi
+ rorq $14,%r13
+ movq %rax,%r10
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r10
+ addq %r12,%rcx
+ addq %r12,%r10
+
+ leaq 24(%rbp),%rbp
+ movq 88(%rsp),%r13
+ movq 64(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r10
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 24(%rsp),%r12
+
+ addq 80(%rsp),%r12
+ movq %rcx,%r13
+ addq %r15,%r12
+ movq %r10,%r14
+ rorq $23,%r13
+ movq %rdx,%r15
+
+ xorq %rcx,%r13
+ rorq $5,%r14
+ xorq %r8,%r15
+
+ movq %r12,80(%rsp)
+ xorq %r10,%r14
+ andq %rcx,%r15
+
+ rorq $4,%r13
+ addq %r9,%r12
+ xorq %r8,%r15
+
+ rorq $6,%r14
+ xorq %rcx,%r13
+ addq %r15,%r12
+
+ movq %r10,%r15
+ addq (%rbp),%r12
+ xorq %r10,%r14
+
+ xorq %r11,%r15
+ rorq $14,%r13
+ movq %r11,%r9
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%r9
+ addq %r12,%rbx
+ addq %r12,%r9
+
+ leaq 8(%rbp),%rbp
+ movq 96(%rsp),%r13
+ movq 72(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r9
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 32(%rsp),%r12
+
+ addq 88(%rsp),%r12
+ movq %rbx,%r13
+ addq %rdi,%r12
+ movq %r9,%r14
+ rorq $23,%r13
+ movq %rcx,%rdi
+
+ xorq %rbx,%r13
+ rorq $5,%r14
+ xorq %rdx,%rdi
+
+ movq %r12,88(%rsp)
+ xorq %r9,%r14
+ andq %rbx,%rdi
+
+ rorq $4,%r13
+ addq %r8,%r12
+ xorq %rdx,%rdi
+
+ rorq $6,%r14
+ xorq %rbx,%r13
+ addq %rdi,%r12
+
+ movq %r9,%rdi
+ addq (%rbp),%r12
+ xorq %r9,%r14
+
+ xorq %r10,%rdi
+ rorq $14,%r13
+ movq %r10,%r8
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%r8
+ addq %r12,%rax
+ addq %r12,%r8
+
+ leaq 24(%rbp),%rbp
+ movq 104(%rsp),%r13
+ movq 80(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%r8
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 40(%rsp),%r12
+
+ addq 96(%rsp),%r12
+ movq %rax,%r13
+ addq %r15,%r12
+ movq %r8,%r14
+ rorq $23,%r13
+ movq %rbx,%r15
+
+ xorq %rax,%r13
+ rorq $5,%r14
+ xorq %rcx,%r15
+
+ movq %r12,96(%rsp)
+ xorq %r8,%r14
+ andq %rax,%r15
+
+ rorq $4,%r13
+ addq %rdx,%r12
+ xorq %rcx,%r15
+
+ rorq $6,%r14
+ xorq %rax,%r13
+ addq %r15,%r12
+
+ movq %r8,%r15
+ addq (%rbp),%r12
+ xorq %r8,%r14
+
+ xorq %r9,%r15
+ rorq $14,%r13
+ movq %r9,%rdx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rdx
+ addq %r12,%r11
+ addq %r12,%rdx
+
+ leaq 8(%rbp),%rbp
+ movq 112(%rsp),%r13
+ movq 88(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rdx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 48(%rsp),%r12
+
+ addq 104(%rsp),%r12
+ movq %r11,%r13
+ addq %rdi,%r12
+ movq %rdx,%r14
+ rorq $23,%r13
+ movq %rax,%rdi
+
+ xorq %r11,%r13
+ rorq $5,%r14
+ xorq %rbx,%rdi
+
+ movq %r12,104(%rsp)
+ xorq %rdx,%r14
+ andq %r11,%rdi
+
+ rorq $4,%r13
+ addq %rcx,%r12
+ xorq %rbx,%rdi
+
+ rorq $6,%r14
+ xorq %r11,%r13
+ addq %rdi,%r12
+
+ movq %rdx,%rdi
+ addq (%rbp),%r12
+ xorq %rdx,%r14
+
+ xorq %r8,%rdi
+ rorq $14,%r13
+ movq %r8,%rcx
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rcx
+ addq %r12,%r10
+ addq %r12,%rcx
+
+ leaq 24(%rbp),%rbp
+ movq 120(%rsp),%r13
+ movq 96(%rsp),%r15
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rcx
+ movq %r15,%r14
+ rorq $42,%r15
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%r15
+ shrq $6,%r14
+
+ rorq $19,%r15
+ xorq %r13,%r12
+ xorq %r14,%r15
+ addq 56(%rsp),%r12
+
+ addq 112(%rsp),%r12
+ movq %r10,%r13
+ addq %r15,%r12
+ movq %rcx,%r14
+ rorq $23,%r13
+ movq %r11,%r15
+
+ xorq %r10,%r13
+ rorq $5,%r14
+ xorq %rax,%r15
+
+ movq %r12,112(%rsp)
+ xorq %rcx,%r14
+ andq %r10,%r15
+
+ rorq $4,%r13
+ addq %rbx,%r12
+ xorq %rax,%r15
+
+ rorq $6,%r14
+ xorq %r10,%r13
+ addq %r15,%r12
+
+ movq %rcx,%r15
+ addq (%rbp),%r12
+ xorq %rcx,%r14
+
+ xorq %rdx,%r15
+ rorq $14,%r13
+ movq %rdx,%rbx
+
+ andq %r15,%rdi
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %rdi,%rbx
+ addq %r12,%r9
+ addq %r12,%rbx
+
+ leaq 8(%rbp),%rbp
+ movq 0(%rsp),%r13
+ movq 104(%rsp),%rdi
+
+ movq %r13,%r12
+ rorq $7,%r13
+ addq %r14,%rbx
+ movq %rdi,%r14
+ rorq $42,%rdi
+
+ xorq %r12,%r13
+ shrq $7,%r12
+ rorq $1,%r13
+ xorq %r14,%rdi
+ shrq $6,%r14
+
+ rorq $19,%rdi
+ xorq %r13,%r12
+ xorq %r14,%rdi
+ addq 64(%rsp),%r12
+
+ addq 120(%rsp),%r12
+ movq %r9,%r13
+ addq %rdi,%r12
+ movq %rbx,%r14
+ rorq $23,%r13
+ movq %r10,%rdi
+
+ xorq %r9,%r13
+ rorq $5,%r14
+ xorq %r11,%rdi
+
+ movq %r12,120(%rsp)
+ xorq %rbx,%r14
+ andq %r9,%rdi
+
+ rorq $4,%r13
+ addq %rax,%r12
+ xorq %r11,%rdi
+
+ rorq $6,%r14
+ xorq %r9,%r13
+ addq %rdi,%r12
+
+ movq %rbx,%rdi
+ addq (%rbp),%r12
+ xorq %rbx,%r14
+
+ xorq %rcx,%rdi
+ rorq $14,%r13
+ movq %rcx,%rax
+
+ andq %rdi,%r15
+ rorq $28,%r14
+ addq %r13,%r12
+
+ xorq %r15,%rax
+ addq %r12,%r8
+ addq %r12,%rax
+
+ leaq 24(%rbp),%rbp
+ cmpb $0,7(%rbp)
+ jnz .Lrounds_16_xx
+
+ movq 128+0(%rsp),%rdi
+ addq %r14,%rax
+ leaq 128(%rsi),%rsi
+
+ addq 0(%rdi),%rax
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop
+
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha512_block_data_order,.-sha512_block_data_order
+.align 64
+.type K512,@object
+K512:
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0x428a2f98d728ae22,0x7137449123ef65cd
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x3956c25bf348b538,0x59f111f1b605d019
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0xd807aa98a3030242,0x12835b0145706fbe
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0x9bdc06a725c71235,0xc19bf174cf692694
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0x983e5152ee66dfab,0xa831c66d2db43210
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x06ca6351e003826f,0x142929670a0e6e70
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0x81c2c92e47edaee6,0x92722c851482353b
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xd192e819d6ef5218,0xd69906245565a910
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0xf40e35855771202a,0x106aa07032bbd1b8
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0x90befffa23631e28,0xa4506cebde82bde9
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xca273eceea26619c,0xd186b8c721c0c207
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x113f9804bef90dae,0x1b710b35131c471b
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x28db77f523047d84,0x32caab7b40c72493
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
+.quad 0x0001020304050607,0x08090a0b0c0d0e0f
+.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.type sha512_block_data_order_avx,@function
+.align 64
+sha512_block_data_order_avx:
+.cfi_startproc
+.Lavx_shortcut:
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ shlq $4,%rdx
+ subq $160,%rsp
+ leaq (%rsi,%rdx,8),%rdx
+ andq $-64,%rsp
+ movq %rdi,128+0(%rsp)
+ movq %rsi,128+8(%rsp)
+ movq %rdx,128+16(%rsp)
+ movq %rax,152(%rsp)
+.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_avx:
+
+ vzeroupper
+ movq 0(%rdi),%rax
+ movq 8(%rdi),%rbx
+ movq 16(%rdi),%rcx
+ movq 24(%rdi),%rdx
+ movq 32(%rdi),%r8
+ movq 40(%rdi),%r9
+ movq 48(%rdi),%r10
+ movq 56(%rdi),%r11
+ jmp .Lloop_avx
+.align 16
+.Lloop_avx:
+ vmovdqa K512+1280(%rip),%xmm11
+ vmovdqu 0(%rsi),%xmm0
+ leaq K512+128(%rip),%rbp
+ vmovdqu 16(%rsi),%xmm1
+ vmovdqu 32(%rsi),%xmm2
+ vpshufb %xmm11,%xmm0,%xmm0
+ vmovdqu 48(%rsi),%xmm3
+ vpshufb %xmm11,%xmm1,%xmm1
+ vmovdqu 64(%rsi),%xmm4
+ vpshufb %xmm11,%xmm2,%xmm2
+ vmovdqu 80(%rsi),%xmm5
+ vpshufb %xmm11,%xmm3,%xmm3
+ vmovdqu 96(%rsi),%xmm6
+ vpshufb %xmm11,%xmm4,%xmm4
+ vmovdqu 112(%rsi),%xmm7
+ vpshufb %xmm11,%xmm5,%xmm5
+ vpaddq -128(%rbp),%xmm0,%xmm8
+ vpshufb %xmm11,%xmm6,%xmm6
+ vpaddq -96(%rbp),%xmm1,%xmm9
+ vpshufb %xmm11,%xmm7,%xmm7
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ vpaddq -32(%rbp),%xmm3,%xmm11
+ vmovdqa %xmm8,0(%rsp)
+ vpaddq 0(%rbp),%xmm4,%xmm8
+ vmovdqa %xmm9,16(%rsp)
+ vpaddq 32(%rbp),%xmm5,%xmm9
+ vmovdqa %xmm10,32(%rsp)
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ vmovdqa %xmm11,48(%rsp)
+ vpaddq 96(%rbp),%xmm7,%xmm11
+ vmovdqa %xmm8,64(%rsp)
+ movq %rax,%r14
+ vmovdqa %xmm9,80(%rsp)
+ movq %rbx,%rdi
+ vmovdqa %xmm10,96(%rsp)
+ xorq %rcx,%rdi
+ vmovdqa %xmm11,112(%rsp)
+ movq %r8,%r13
+ jmp .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+ addq $256,%rbp
+ vpalignr $8,%xmm0,%xmm1,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm4,%xmm5,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm0,%xmm0
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm7,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm7,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm0,%xmm0
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm7,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm0,%xmm0
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq -128(%rbp),%xmm0,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,0(%rsp)
+ vpalignr $8,%xmm1,%xmm2,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm5,%xmm6,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm1,%xmm1
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm0,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm0,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm1,%xmm1
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm0,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm1,%xmm1
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq -96(%rbp),%xmm1,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,16(%rsp)
+ vpalignr $8,%xmm2,%xmm3,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm6,%xmm7,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm2,%xmm2
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm1,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm1,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm2,%xmm2
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm1,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm2,%xmm2
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq -64(%rbp),%xmm2,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,32(%rsp)
+ vpalignr $8,%xmm3,%xmm4,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm7,%xmm0,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm3,%xmm3
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm2,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm2,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm3,%xmm3
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm2,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm3,%xmm3
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq -32(%rbp),%xmm3,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,48(%rsp)
+ vpalignr $8,%xmm4,%xmm5,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ vpalignr $8,%xmm0,%xmm1,%xmm11
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r8,%r13
+ xorq %r10,%r12
+ vpaddq %xmm11,%xmm4,%xmm4
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r8,%r12
+ xorq %r8,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rbx,%r15
+ addq %r12,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rax,%r14
+ addq %r13,%r11
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm3,%xmm11
+ addq %r11,%rdx
+ addq %rdi,%r11
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rdx,%r13
+ addq %r11,%r14
+ vpsllq $3,%xmm3,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ vpaddq %xmm8,%xmm4,%xmm4
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm3,%xmm9
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rax,%rdi
+ addq %r12,%r10
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm4,%xmm4
+ xorq %r11,%r14
+ addq %r13,%r10
+ vpaddq 0(%rbp),%xmm4,%xmm10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ vmovdqa %xmm10,64(%rsp)
+ vpalignr $8,%xmm5,%xmm6,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ vpalignr $8,%xmm1,%xmm2,%xmm11
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ vpaddq %xmm11,%xmm5,%xmm5
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r11,%r15
+ addq %r12,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r10,%r14
+ addq %r13,%r9
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm4,%xmm11
+ addq %r9,%rbx
+ addq %rdi,%r9
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %rbx,%r13
+ addq %r9,%r14
+ vpsllq $3,%xmm4,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ vpaddq %xmm8,%xmm5,%xmm5
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm4,%xmm9
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r10,%rdi
+ addq %r12,%r8
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm5,%xmm5
+ xorq %r9,%r14
+ addq %r13,%r8
+ vpaddq 32(%rbp),%xmm5,%xmm10
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ vmovdqa %xmm10,80(%rsp)
+ vpalignr $8,%xmm6,%xmm7,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ vpalignr $8,%xmm2,%xmm3,%xmm11
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ vpaddq %xmm11,%xmm6,%xmm6
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %rax,%r12
+ xorq %rax,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %r9,%r15
+ addq %r12,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %r8,%r14
+ addq %r13,%rdx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm5,%xmm11
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r11,%r13
+ addq %rdx,%r14
+ vpsllq $3,%xmm5,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ vpaddq %xmm8,%xmm6,%xmm6
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm5,%xmm9
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r11,%r12
+ xorq %r11,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm6,%xmm6
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ vpaddq 64(%rbp),%xmm6,%xmm10
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ vmovdqa %xmm10,96(%rsp)
+ vpalignr $8,%xmm7,%xmm0,%xmm8
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ vpalignr $8,%xmm3,%xmm4,%xmm11
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $1,%xmm8,%xmm10
+ xorq %r10,%r13
+ xorq %rax,%r12
+ vpaddq %xmm11,%xmm7,%xmm7
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ vpsrlq $7,%xmm8,%xmm11
+ andq %r10,%r12
+ xorq %r10,%r13
+ vpsllq $56,%xmm8,%xmm9
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ vpxor %xmm10,%xmm11,%xmm8
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ vpsrlq $7,%xmm10,%xmm10
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ vpsllq $7,%xmm9,%xmm9
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ vpxor %xmm10,%xmm8,%xmm8
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ vpsrlq $6,%xmm6,%xmm11
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ vpxor %xmm9,%xmm8,%xmm8
+ movq %r9,%r13
+ addq %rbx,%r14
+ vpsllq $3,%xmm6,%xmm10
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ vpaddq %xmm8,%xmm7,%xmm7
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ vpsrlq $19,%xmm6,%xmm9
+ xorq %r9,%r13
+ xorq %r11,%r12
+ vpxor %xmm10,%xmm11,%xmm11
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ vpsllq $42,%xmm10,%xmm10
+ andq %r9,%r12
+ xorq %r9,%r13
+ vpxor %xmm9,%xmm11,%xmm11
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ vpsrlq $42,%xmm9,%xmm9
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ vpxor %xmm10,%xmm11,%xmm11
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ vpxor %xmm9,%xmm11,%xmm11
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ vpaddq %xmm11,%xmm7,%xmm7
+ xorq %rbx,%r14
+ addq %r13,%rax
+ vpaddq 96(%rbp),%xmm7,%xmm10
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ vmovdqa %xmm10,112(%rsp)
+ cmpb $0,135(%rbp)
+ jne .Lavx_00_47
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 0(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 8(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 16(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 24(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 32(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 40(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 48(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 56(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rax
+ movq %r9,%r12
+ shrdq $5,%r14,%r14
+ xorq %r8,%r13
+ xorq %r10,%r12
+ shrdq $4,%r13,%r13
+ xorq %rax,%r14
+ andq %r8,%r12
+ xorq %r8,%r13
+ addq 64(%rsp),%r11
+ movq %rax,%r15
+ xorq %r10,%r12
+ shrdq $6,%r14,%r14
+ xorq %rbx,%r15
+ addq %r12,%r11
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rax,%r14
+ addq %r13,%r11
+ xorq %rbx,%rdi
+ shrdq $28,%r14,%r14
+ addq %r11,%rdx
+ addq %rdi,%r11
+ movq %rdx,%r13
+ addq %r11,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r11
+ movq %r8,%r12
+ shrdq $5,%r14,%r14
+ xorq %rdx,%r13
+ xorq %r9,%r12
+ shrdq $4,%r13,%r13
+ xorq %r11,%r14
+ andq %rdx,%r12
+ xorq %rdx,%r13
+ addq 72(%rsp),%r10
+ movq %r11,%rdi
+ xorq %r9,%r12
+ shrdq $6,%r14,%r14
+ xorq %rax,%rdi
+ addq %r12,%r10
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r11,%r14
+ addq %r13,%r10
+ xorq %rax,%r15
+ shrdq $28,%r14,%r14
+ addq %r10,%rcx
+ addq %r15,%r10
+ movq %rcx,%r13
+ addq %r10,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r10
+ movq %rdx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rcx,%r13
+ xorq %r8,%r12
+ shrdq $4,%r13,%r13
+ xorq %r10,%r14
+ andq %rcx,%r12
+ xorq %rcx,%r13
+ addq 80(%rsp),%r9
+ movq %r10,%r15
+ xorq %r8,%r12
+ shrdq $6,%r14,%r14
+ xorq %r11,%r15
+ addq %r12,%r9
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r10,%r14
+ addq %r13,%r9
+ xorq %r11,%rdi
+ shrdq $28,%r14,%r14
+ addq %r9,%rbx
+ addq %rdi,%r9
+ movq %rbx,%r13
+ addq %r9,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r9
+ movq %rcx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rbx,%r13
+ xorq %rdx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r9,%r14
+ andq %rbx,%r12
+ xorq %rbx,%r13
+ addq 88(%rsp),%r8
+ movq %r9,%rdi
+ xorq %rdx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r10,%rdi
+ addq %r12,%r8
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %r9,%r14
+ addq %r13,%r8
+ xorq %r10,%r15
+ shrdq $28,%r14,%r14
+ addq %r8,%rax
+ addq %r15,%r8
+ movq %rax,%r13
+ addq %r8,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%r8
+ movq %rbx,%r12
+ shrdq $5,%r14,%r14
+ xorq %rax,%r13
+ xorq %rcx,%r12
+ shrdq $4,%r13,%r13
+ xorq %r8,%r14
+ andq %rax,%r12
+ xorq %rax,%r13
+ addq 96(%rsp),%rdx
+ movq %r8,%r15
+ xorq %rcx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r9,%r15
+ addq %r12,%rdx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %r8,%r14
+ addq %r13,%rdx
+ xorq %r9,%rdi
+ shrdq $28,%r14,%r14
+ addq %rdx,%r11
+ addq %rdi,%rdx
+ movq %r11,%r13
+ addq %rdx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rdx
+ movq %rax,%r12
+ shrdq $5,%r14,%r14
+ xorq %r11,%r13
+ xorq %rbx,%r12
+ shrdq $4,%r13,%r13
+ xorq %rdx,%r14
+ andq %r11,%r12
+ xorq %r11,%r13
+ addq 104(%rsp),%rcx
+ movq %rdx,%rdi
+ xorq %rbx,%r12
+ shrdq $6,%r14,%r14
+ xorq %r8,%rdi
+ addq %r12,%rcx
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rdx,%r14
+ addq %r13,%rcx
+ xorq %r8,%r15
+ shrdq $28,%r14,%r14
+ addq %rcx,%r10
+ addq %r15,%rcx
+ movq %r10,%r13
+ addq %rcx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rcx
+ movq %r11,%r12
+ shrdq $5,%r14,%r14
+ xorq %r10,%r13
+ xorq %rax,%r12
+ shrdq $4,%r13,%r13
+ xorq %rcx,%r14
+ andq %r10,%r12
+ xorq %r10,%r13
+ addq 112(%rsp),%rbx
+ movq %rcx,%r15
+ xorq %rax,%r12
+ shrdq $6,%r14,%r14
+ xorq %rdx,%r15
+ addq %r12,%rbx
+ shrdq $14,%r13,%r13
+ andq %r15,%rdi
+ xorq %rcx,%r14
+ addq %r13,%rbx
+ xorq %rdx,%rdi
+ shrdq $28,%r14,%r14
+ addq %rbx,%r9
+ addq %rdi,%rbx
+ movq %r9,%r13
+ addq %rbx,%r14
+ shrdq $23,%r13,%r13
+ movq %r14,%rbx
+ movq %r10,%r12
+ shrdq $5,%r14,%r14
+ xorq %r9,%r13
+ xorq %r11,%r12
+ shrdq $4,%r13,%r13
+ xorq %rbx,%r14
+ andq %r9,%r12
+ xorq %r9,%r13
+ addq 120(%rsp),%rax
+ movq %rbx,%rdi
+ xorq %r11,%r12
+ shrdq $6,%r14,%r14
+ xorq %rcx,%rdi
+ addq %r12,%rax
+ shrdq $14,%r13,%r13
+ andq %rdi,%r15
+ xorq %rbx,%r14
+ addq %r13,%rax
+ xorq %rcx,%r15
+ shrdq $28,%r14,%r14
+ addq %rax,%r8
+ addq %r15,%rax
+ movq %r8,%r13
+ addq %rax,%r14
+ movq 128+0(%rsp),%rdi
+ movq %r14,%rax
+
+ addq 0(%rdi),%rax
+ leaq 128(%rsi),%rsi
+ addq 8(%rdi),%rbx
+ addq 16(%rdi),%rcx
+ addq 24(%rdi),%rdx
+ addq 32(%rdi),%r8
+ addq 40(%rdi),%r9
+ addq 48(%rdi),%r10
+ addq 56(%rdi),%r11
+
+ cmpq 128+16(%rsp),%rsi
+
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %rcx,16(%rdi)
+ movq %rdx,24(%rdi)
+ movq %r8,32(%rdi)
+ movq %r9,40(%rdi)
+ movq %r10,48(%rdi)
+ movq %r11,56(%rdi)
+ jb .Lloop_avx
+
+ movq 152(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ vzeroupper
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size sha512_block_data_order_avx,.-sha512_block_data_order_avx
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S
@@ -1,0 +1,1133 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_encrypt_core,@function
+.align 16
+_vpaes_encrypt_core:
+.cfi_startproc
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_ipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movdqu (%r9),%xmm5
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_ipt+16(%rip),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm5,%xmm2
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+ leaq .Lk_mc_backward(%rip),%r10
+ jmp .Lenc_entry
+
+.align 16
+.Lenc_loop:
+
+ movdqa %xmm13,%xmm4
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,226
+.byte 102,15,56,0,195
+ pxor %xmm5,%xmm4
+ movdqa %xmm15,%xmm5
+ pxor %xmm4,%xmm0
+ movdqa -64(%r11,%r10,1),%xmm1
+.byte 102,15,56,0,234
+ movdqa (%r11,%r10,1),%xmm4
+ movdqa %xmm14,%xmm2
+.byte 102,15,56,0,211
+ movdqa %xmm0,%xmm3
+ pxor %xmm5,%xmm2
+.byte 102,15,56,0,193
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+.byte 102,15,56,0,220
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+.byte 102,15,56,0,193
+ andq $0x30,%r11
+ subq $1,%rax
+ pxor %xmm3,%xmm0
+
+.Lenc_entry:
+
+ movdqa %xmm9,%xmm1
+ movdqa %xmm11,%xmm5
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,232
+ movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm10,%xmm4
+ pxor %xmm5,%xmm3
+.byte 102,15,56,0,224
+ movdqa %xmm10,%xmm2
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,211
+ movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%r9),%xmm5
+ pxor %xmm1,%xmm3
+ jnz .Lenc_loop
+
+
+ movdqa -96(%r10),%xmm4
+ movdqa -80(%r10),%xmm0
+.byte 102,15,56,0,226
+ pxor %xmm5,%xmm4
+.byte 102,15,56,0,195
+ movdqa 64(%r11,%r10,1),%xmm1
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,193
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_encrypt_core_2x,@function
+.align 16
+_vpaes_encrypt_core_2x:
+.cfi_startproc
+ movq %rdx,%r9
+ movq $16,%r11
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa %xmm9,%xmm7
+ movdqa .Lk_ipt(%rip),%xmm2
+ movdqa %xmm2,%xmm8
+ pandn %xmm0,%xmm1
+ pandn %xmm6,%xmm7
+ movdqu (%r9),%xmm5
+
+ psrld $4,%xmm1
+ psrld $4,%xmm7
+ pand %xmm9,%xmm0
+ pand %xmm9,%xmm6
+.byte 102,15,56,0,208
+.byte 102,68,15,56,0,198
+ movdqa .Lk_ipt+16(%rip),%xmm0
+ movdqa %xmm0,%xmm6
+.byte 102,15,56,0,193
+.byte 102,15,56,0,247
+ pxor %xmm5,%xmm2
+ pxor %xmm5,%xmm8
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+ pxor %xmm8,%xmm6
+ leaq .Lk_mc_backward(%rip),%r10
+ jmp .Lenc2x_entry
+
+.align 16
+.Lenc2x_loop:
+
+ movdqa .Lk_sb1(%rip),%xmm4
+ movdqa .Lk_sb1+16(%rip),%xmm0
+ movdqa %xmm4,%xmm12
+ movdqa %xmm0,%xmm6
+.byte 102,15,56,0,226
+.byte 102,69,15,56,0,224
+.byte 102,15,56,0,195
+.byte 102,65,15,56,0,243
+ pxor %xmm5,%xmm4
+ pxor %xmm5,%xmm12
+ movdqa .Lk_sb2(%rip),%xmm5
+ movdqa %xmm5,%xmm13
+ pxor %xmm4,%xmm0
+ pxor %xmm12,%xmm6
+ movdqa -64(%r11,%r10,1),%xmm1
+
+.byte 102,15,56,0,234
+.byte 102,69,15,56,0,232
+ movdqa (%r11,%r10,1),%xmm4
+
+ movdqa .Lk_sb2+16(%rip),%xmm2
+ movdqa %xmm2,%xmm8
+.byte 102,15,56,0,211
+.byte 102,69,15,56,0,195
+ movdqa %xmm0,%xmm3
+ movdqa %xmm6,%xmm11
+ pxor %xmm5,%xmm2
+ pxor %xmm13,%xmm8
+.byte 102,15,56,0,193
+.byte 102,15,56,0,241
+ addq $16,%r9
+ pxor %xmm2,%xmm0
+ pxor %xmm8,%xmm6
+.byte 102,15,56,0,220
+.byte 102,68,15,56,0,220
+ addq $16,%r11
+ pxor %xmm0,%xmm3
+ pxor %xmm6,%xmm11
+.byte 102,15,56,0,193
+.byte 102,15,56,0,241
+ andq $0x30,%r11
+ subq $1,%rax
+ pxor %xmm3,%xmm0
+ pxor %xmm11,%xmm6
+
+.Lenc2x_entry:
+
+ movdqa %xmm9,%xmm1
+ movdqa %xmm9,%xmm7
+ movdqa .Lk_inv+16(%rip),%xmm5
+ movdqa %xmm5,%xmm13
+ pandn %xmm0,%xmm1
+ pandn %xmm6,%xmm7
+ psrld $4,%xmm1
+ psrld $4,%xmm7
+ pand %xmm9,%xmm0
+ pand %xmm9,%xmm6
+.byte 102,15,56,0,232
+.byte 102,68,15,56,0,238
+ movdqa %xmm10,%xmm3
+ movdqa %xmm10,%xmm11
+ pxor %xmm1,%xmm0
+ pxor %xmm7,%xmm6
+.byte 102,15,56,0,217
+.byte 102,68,15,56,0,223
+ movdqa %xmm10,%xmm4
+ movdqa %xmm10,%xmm12
+ pxor %xmm5,%xmm3
+ pxor %xmm13,%xmm11
+.byte 102,15,56,0,224
+.byte 102,68,15,56,0,230
+ movdqa %xmm10,%xmm2
+ movdqa %xmm10,%xmm8
+ pxor %xmm5,%xmm4
+ pxor %xmm13,%xmm12
+.byte 102,15,56,0,211
+.byte 102,69,15,56,0,195
+ movdqa %xmm10,%xmm3
+ movdqa %xmm10,%xmm11
+ pxor %xmm0,%xmm2
+ pxor %xmm6,%xmm8
+.byte 102,15,56,0,220
+.byte 102,69,15,56,0,220
+ movdqu (%r9),%xmm5
+
+ pxor %xmm1,%xmm3
+ pxor %xmm7,%xmm11
+ jnz .Lenc2x_loop
+
+
+ movdqa -96(%r10),%xmm4
+ movdqa -80(%r10),%xmm0
+ movdqa %xmm4,%xmm12
+ movdqa %xmm0,%xmm6
+.byte 102,15,56,0,226
+.byte 102,69,15,56,0,224
+ pxor %xmm5,%xmm4
+ pxor %xmm5,%xmm12
+.byte 102,15,56,0,195
+.byte 102,65,15,56,0,243
+ movdqa 64(%r11,%r10,1),%xmm1
+
+ pxor %xmm4,%xmm0
+ pxor %xmm12,%xmm6
+.byte 102,15,56,0,193
+.byte 102,15,56,0,241
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x
+
+
+
+
+
+
+.type _vpaes_decrypt_core,@function
+.align 16
+_vpaes_decrypt_core:
+.cfi_startproc
+ movq %rdx,%r9
+ movl 240(%rdx),%eax
+ movdqa %xmm9,%xmm1
+ movdqa .Lk_dipt(%rip),%xmm2
+ pandn %xmm0,%xmm1
+ movq %rax,%r11
+ psrld $4,%xmm1
+ movdqu (%r9),%xmm5
+ shlq $4,%r11
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa .Lk_dipt+16(%rip),%xmm0
+ xorq $0x30,%r11
+ leaq .Lk_dsbd(%rip),%r10
+.byte 102,15,56,0,193
+ andq $0x30,%r11
+ pxor %xmm5,%xmm2
+ movdqa .Lk_mc_forward+48(%rip),%xmm5
+ pxor %xmm2,%xmm0
+ addq $16,%r9
+ addq %r10,%r11
+ jmp .Ldec_entry
+
+.align 16
+.Ldec_loop:
+
+
+
+ movdqa -32(%r10),%xmm4
+ movdqa -16(%r10),%xmm1
+.byte 102,15,56,0,226
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 0(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 16(%r10),%xmm1
+
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 32(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 48(%r10),%xmm1
+
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ movdqa 64(%r10),%xmm4
+ pxor %xmm1,%xmm0
+ movdqa 80(%r10),%xmm1
+
+.byte 102,15,56,0,226
+.byte 102,15,56,0,197
+.byte 102,15,56,0,203
+ pxor %xmm4,%xmm0
+ addq $16,%r9
+.byte 102,15,58,15,237,12
+ pxor %xmm1,%xmm0
+ subq $1,%rax
+
+.Ldec_entry:
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ movdqa %xmm11,%xmm2
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+.byte 102,15,56,0,208
+ movdqa %xmm10,%xmm3
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,217
+ movdqa %xmm10,%xmm4
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ movdqa %xmm10,%xmm3
+ pxor %xmm0,%xmm2
+.byte 102,15,56,0,220
+ movdqu (%r9),%xmm0
+ pxor %xmm1,%xmm3
+ jnz .Ldec_loop
+
+
+ movdqa 96(%r10),%xmm4
+.byte 102,15,56,0,226
+ pxor %xmm0,%xmm4
+ movdqa 112(%r10),%xmm0
+ movdqa -352(%r11),%xmm2
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+.byte 102,15,56,0,194
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+
+
+
+
+
+.type _vpaes_schedule_core,@function
+.align 16
+_vpaes_schedule_core:
+.cfi_startproc
+
+
+
+
+
+ call _vpaes_preheat
+ movdqa .Lk_rcon(%rip),%xmm8
+ movdqu (%rdi),%xmm0
+
+
+ movdqa %xmm0,%xmm3
+ leaq .Lk_ipt(%rip),%r11
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm7
+
+ leaq .Lk_sr(%rip),%r10
+ testq %rcx,%rcx
+ jnz .Lschedule_am_decrypting
+
+
+ movdqu %xmm0,(%rdx)
+ jmp .Lschedule_go
+
+.Lschedule_am_decrypting:
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ movdqu %xmm3,(%rdx)
+ xorq $0x30,%r8
+
+.Lschedule_go:
+ cmpl $192,%esi
+ ja .Lschedule_256
+ je .Lschedule_192
+
+
+
+
+
+
+
+
+
+
+.Lschedule_128:
+ movl $10,%esi
+
+.Loop_schedule_128:
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ jmp .Loop_schedule_128
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_192:
+ movdqu 8(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movdqa %xmm0,%xmm6
+ pxor %xmm4,%xmm4
+ movhlps %xmm4,%xmm6
+ movl $4,%esi
+
+.Loop_schedule_192:
+ call _vpaes_schedule_round
+.byte 102,15,58,15,198,8
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+ call _vpaes_schedule_192_smear
+ jmp .Loop_schedule_192
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_256:
+ movdqu 16(%rdi),%xmm0
+ call _vpaes_schedule_transform
+ movl $7,%esi
+
+.Loop_schedule_256:
+ call _vpaes_schedule_mangle
+ movdqa %xmm0,%xmm6
+
+
+ call _vpaes_schedule_round
+ decq %rsi
+ jz .Lschedule_mangle_last
+ call _vpaes_schedule_mangle
+
+
+ pshufd $0xFF,%xmm0,%xmm0
+ movdqa %xmm7,%xmm5
+ movdqa %xmm6,%xmm7
+ call _vpaes_schedule_low_round
+ movdqa %xmm5,%xmm7
+
+ jmp .Loop_schedule_256
+
+
+
+
+
+
+
+
+
+
+
+
+.align 16
+.Lschedule_mangle_last:
+
+ leaq .Lk_deskew(%rip),%r11
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_last_dec
+
+
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,193
+ leaq .Lk_opt(%rip),%r11
+ addq $32,%rdx
+
+.Lschedule_mangle_last_dec:
+ addq $-16,%rdx
+ pxor .Lk_s63(%rip),%xmm0
+ call _vpaes_schedule_transform
+ movdqu %xmm0,(%rdx)
+
+
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_192_smear,@function
+.align 16
+_vpaes_schedule_192_smear:
+.cfi_startproc
+ pshufd $0x80,%xmm6,%xmm1
+ pshufd $0xFE,%xmm7,%xmm0
+ pxor %xmm1,%xmm6
+ pxor %xmm1,%xmm1
+ pxor %xmm0,%xmm6
+ movdqa %xmm6,%xmm0
+ movhlps %xmm1,%xmm6
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_round,@function
+.align 16
+_vpaes_schedule_round:
+.cfi_startproc
+
+ pxor %xmm1,%xmm1
+.byte 102,65,15,58,15,200,15
+.byte 102,69,15,58,15,192,15
+ pxor %xmm1,%xmm7
+
+
+ pshufd $0xFF,%xmm0,%xmm0
+.byte 102,15,58,15,192,1
+
+
+
+
+_vpaes_schedule_low_round:
+
+ movdqa %xmm7,%xmm1
+ pslldq $4,%xmm7
+ pxor %xmm1,%xmm7
+ movdqa %xmm7,%xmm1
+ pslldq $8,%xmm7
+ pxor %xmm1,%xmm7
+ pxor .Lk_s63(%rip),%xmm7
+
+
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa %xmm11,%xmm2
+.byte 102,15,56,0,208
+ pxor %xmm1,%xmm0
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+ movdqa %xmm10,%xmm4
+.byte 102,15,56,0,224
+ pxor %xmm2,%xmm4
+ movdqa %xmm10,%xmm2
+.byte 102,15,56,0,211
+ pxor %xmm0,%xmm2
+ movdqa %xmm10,%xmm3
+.byte 102,15,56,0,220
+ pxor %xmm1,%xmm3
+ movdqa %xmm13,%xmm4
+.byte 102,15,56,0,226
+ movdqa %xmm12,%xmm0
+.byte 102,15,56,0,195
+ pxor %xmm4,%xmm0
+
+
+ pxor %xmm7,%xmm0
+ movdqa %xmm0,%xmm7
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _vpaes_schedule_round,.-_vpaes_schedule_round
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_transform,@function
+.align 16
+_vpaes_schedule_transform:
+.cfi_startproc
+ movdqa %xmm9,%xmm1
+ pandn %xmm0,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm0
+ movdqa (%r11),%xmm2
+.byte 102,15,56,0,208
+ movdqa 16(%r11),%xmm0
+.byte 102,15,56,0,193
+ pxor %xmm2,%xmm0
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.type _vpaes_schedule_mangle,@function
+.align 16
+_vpaes_schedule_mangle:
+.cfi_startproc
+ movdqa %xmm0,%xmm4
+ movdqa .Lk_mc_forward(%rip),%xmm5
+ testq %rcx,%rcx
+ jnz .Lschedule_mangle_dec
+
+
+ addq $16,%rdx
+ pxor .Lk_s63(%rip),%xmm4
+.byte 102,15,56,0,229
+ movdqa %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+.byte 102,15,56,0,229
+ pxor %xmm4,%xmm3
+
+ jmp .Lschedule_mangle_both
+.align 16
+.Lschedule_mangle_dec:
+
+ leaq .Lk_dksd(%rip),%r11
+ movdqa %xmm9,%xmm1
+ pandn %xmm4,%xmm1
+ psrld $4,%xmm1
+ pand %xmm9,%xmm4
+
+ movdqa 0(%r11),%xmm2
+.byte 102,15,56,0,212
+ movdqa 16(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 32(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 48(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 64(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 80(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+.byte 102,15,56,0,221
+
+ movdqa 96(%r11),%xmm2
+.byte 102,15,56,0,212
+ pxor %xmm3,%xmm2
+ movdqa 112(%r11),%xmm3
+.byte 102,15,56,0,217
+ pxor %xmm2,%xmm3
+
+ addq $-16,%rdx
+
+.Lschedule_mangle_both:
+ movdqa (%r8,%r10,1),%xmm1
+.byte 102,15,56,0,217
+ addq $-16,%r8
+ andq $0x30,%r8
+ movdqu %xmm3,(%rdx)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+
+
+
+.globl vpaes_set_encrypt_key
+.hidden vpaes_set_encrypt_key
+.type vpaes_set_encrypt_key,@function
+.align 16
+vpaes_set_encrypt_key:
+.cfi_startproc
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+ movb $1,BORINGSSL_function_hit+5(%rip)
+#endif
+
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+
+ movl $0,%ecx
+ movl $0x30,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+.globl vpaes_set_decrypt_key
+.hidden vpaes_set_decrypt_key
+.type vpaes_set_decrypt_key,@function
+.align 16
+vpaes_set_decrypt_key:
+.cfi_startproc
+ movl %esi,%eax
+ shrl $5,%eax
+ addl $5,%eax
+ movl %eax,240(%rdx)
+ shll $4,%eax
+ leaq 16(%rdx,%rax,1),%rdx
+
+ movl $1,%ecx
+ movl %esi,%r8d
+ shrl $1,%r8d
+ andl $32,%r8d
+ xorl $32,%r8d
+ call _vpaes_schedule_core
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+
+.globl vpaes_encrypt
+.hidden vpaes_encrypt
+.type vpaes_encrypt,@function
+.align 16
+vpaes_encrypt:
+.cfi_startproc
+#ifdef BORINGSSL_DISPATCH_TEST
+.extern BORINGSSL_function_hit
+.hidden BORINGSSL_function_hit
+ movb $1,BORINGSSL_function_hit+4(%rip)
+#endif
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_encrypt_core
+ movdqu %xmm0,(%rsi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size vpaes_encrypt,.-vpaes_encrypt
+
+.globl vpaes_decrypt
+.hidden vpaes_decrypt
+.type vpaes_decrypt,@function
+.align 16
+vpaes_decrypt:
+.cfi_startproc
+ movdqu (%rdi),%xmm0
+ call _vpaes_preheat
+ call _vpaes_decrypt_core
+ movdqu %xmm0,(%rsi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size vpaes_decrypt,.-vpaes_decrypt
+.globl vpaes_cbc_encrypt
+.hidden vpaes_cbc_encrypt
+.type vpaes_cbc_encrypt,@function
+.align 16
+vpaes_cbc_encrypt:
+.cfi_startproc
+ xchgq %rcx,%rdx
+ subq $16,%rcx
+ jc .Lcbc_abort
+ movdqu (%r8),%xmm6
+ subq %rdi,%rsi
+ call _vpaes_preheat
+ cmpl $0,%r9d
+ je .Lcbc_dec_loop
+ jmp .Lcbc_enc_loop
+.align 16
+.Lcbc_enc_loop:
+ movdqu (%rdi),%xmm0
+ pxor %xmm6,%xmm0
+ call _vpaes_encrypt_core
+ movdqa %xmm0,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_enc_loop
+ jmp .Lcbc_done
+.align 16
+.Lcbc_dec_loop:
+ movdqu (%rdi),%xmm0
+ movdqa %xmm0,%xmm7
+ call _vpaes_decrypt_core
+ pxor %xmm6,%xmm0
+ movdqa %xmm7,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ leaq 16(%rdi),%rdi
+ subq $16,%rcx
+ jnc .Lcbc_dec_loop
+.Lcbc_done:
+ movdqu %xmm6,(%r8)
+.Lcbc_abort:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+.globl vpaes_ctr32_encrypt_blocks
+.hidden vpaes_ctr32_encrypt_blocks
+.type vpaes_ctr32_encrypt_blocks,@function
+.align 16
+vpaes_ctr32_encrypt_blocks:
+.cfi_startproc
+
+ xchgq %rcx,%rdx
+ testq %rcx,%rcx
+ jz .Lctr32_abort
+ movdqu (%r8),%xmm0
+ movdqa .Lctr_add_one(%rip),%xmm8
+ subq %rdi,%rsi
+ call _vpaes_preheat
+ movdqa %xmm0,%xmm6
+ pshufb .Lrev_ctr(%rip),%xmm6
+
+ testq $1,%rcx
+ jz .Lctr32_prep_loop
+
+
+
+ movdqu (%rdi),%xmm7
+ call _vpaes_encrypt_core
+ pxor %xmm7,%xmm0
+ paddd %xmm8,%xmm6
+ movdqu %xmm0,(%rsi,%rdi,1)
+ subq $1,%rcx
+ leaq 16(%rdi),%rdi
+ jz .Lctr32_done
+
+.Lctr32_prep_loop:
+
+
+ movdqa %xmm6,%xmm14
+ movdqa %xmm6,%xmm15
+ paddd %xmm8,%xmm15
+
+.Lctr32_loop:
+ movdqa .Lrev_ctr(%rip),%xmm1
+ movdqa %xmm14,%xmm0
+ movdqa %xmm15,%xmm6
+.byte 102,15,56,0,193
+.byte 102,15,56,0,241
+ call _vpaes_encrypt_core_2x
+ movdqu (%rdi),%xmm1
+ movdqu 16(%rdi),%xmm2
+ movdqa .Lctr_add_two(%rip),%xmm3
+ pxor %xmm1,%xmm0
+ pxor %xmm2,%xmm6
+ paddd %xmm3,%xmm14
+ paddd %xmm3,%xmm15
+ movdqu %xmm0,(%rsi,%rdi,1)
+ movdqu %xmm6,16(%rsi,%rdi,1)
+ subq $2,%rcx
+ leaq 32(%rdi),%rdi
+ jnz .Lctr32_loop
+
+.Lctr32_done:
+.Lctr32_abort:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks
+
+
+
+
+
+
+.type _vpaes_preheat,@function
+.align 16
+_vpaes_preheat:
+.cfi_startproc
+ leaq .Lk_s0F(%rip),%r10
+ movdqa -32(%r10),%xmm10
+ movdqa -16(%r10),%xmm11
+ movdqa 0(%r10),%xmm9
+ movdqa 48(%r10),%xmm13
+ movdqa 64(%r10),%xmm12
+ movdqa 80(%r10),%xmm15
+ movdqa 96(%r10),%xmm14
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _vpaes_preheat,.-_vpaes_preheat
+
+
+
+
+
+.type _vpaes_consts,@object
+.align 64
+_vpaes_consts:
+.Lk_inv:
+.quad 0x0E05060F0D080180, 0x040703090A0B0C02
+.quad 0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:
+.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:
+.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
+.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:
+.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:
+.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
+.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:
+.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
+.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:
+.quad 0x0407060500030201, 0x0C0F0E0D080B0A09
+.quad 0x080B0A0904070605, 0x000302010C0F0E0D
+.quad 0x0C0F0E0D080B0A09, 0x0407060500030201
+.quad 0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:
+.quad 0x0605040702010003, 0x0E0D0C0F0A09080B
+.quad 0x020100030E0D0C0F, 0x0A09080B06050407
+.quad 0x0E0D0C0F0A09080B, 0x0605040702010003
+.quad 0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:
+.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
+.quad 0x030E09040F0A0500, 0x0B06010C07020D08
+.quad 0x0F060D040B020900, 0x070E050C030A0108
+.quad 0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:
+.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:
+.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:
+.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:
+.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+
+
+
+
+.Lk_dksd:
+.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:
+.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:
+.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:
+.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+
+
+
+
+.Lk_dipt:
+.quad 0x0F505B040B545F00, 0x154A411E114E451A
+.quad 0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:
+.quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:
+.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:
+.quad 0xD022649296B44200, 0x602646F6B0F2D404
+.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:
+.quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:
+.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+
+
+.Lrev_ctr:
+.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
+
+
+.Lctr_add_one:
+.quad 0x0000000000000000, 0x0000000100000000
+.Lctr_add_two:
+.quad 0x0000000000000000, 0x0000000200000000
+
+.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
+.align 64
+.size _vpaes_consts,.-_vpaes_consts
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S
@@ -1,0 +1,1260 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+.globl bn_mul_mont
+.hidden bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+.cfi_startproc
+ movl %r9d,%r9d
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 8(%r11),%r11d
+ cmpq %rsi,%rdx
+ jne .Lmul4x_enter
+ testl $7,%r9d
+ jz .Lsqr8x_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+
+ negq %r9
+ movq %rsp,%r11
+ leaq -16(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+
+
+
+
+
+
+
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
+.align 16
+.Lmul_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+
+ movq %rax,8(%rsp,%r9,8)
+.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
+.Lmul_body:
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jb .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ movq %r9,%r15
+
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsp,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ movq $-1,%rbx
+ xorq %rax,%rbx
+ xorq %r14,%r14
+ movq %r9,%r15
+
+.Lcopy:
+ movq (%rdi,%r14,8),%rcx
+ movq (%rsp,%r14,8),%rdx
+ andq %rbx,%rcx
+ andq %rax,%rdx
+ movq %r9,(%rsp,%r14,8)
+ orq %rcx,%rdx
+ movq %rdx,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_mul_mont,.-bn_mul_mont
+.type bn_mul4x_mont,@function
+.align 16
+bn_mul4x_mont:
+.cfi_startproc
+ movl %r9d,%r9d
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lmul4x_enter:
+ andl $0x80100,%r11d
+ cmpl $0x80100,%r11d
+ je .Lmulx4x_enter
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+
+ negq %r9
+ movq %rsp,%r11
+ leaq -32(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
+ movq %rax,8(%rsp,%r9,8)
+.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jb .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jb .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jb .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ leaq -4(%r9),%r15
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdx
+ shrq $2,%r15
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rbp,24(%rdi,%r14,8)
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,224
+ pcmpeqd %xmm5,%xmm5
+ pshufd $0,%xmm4,%xmm4
+ movq %r9,%r15
+ pxor %xmm4,%xmm5
+ shrq $2,%r15
+ xorl %eax,%eax
+
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqa (%rsp,%rax,1),%xmm1
+ movdqu (%rdi,%rax,1),%xmm2
+ pand %xmm4,%xmm1
+ pand %xmm5,%xmm2
+ movdqa 16(%rsp,%rax,1),%xmm3
+ movdqa %xmm0,(%rsp,%rax,1)
+ por %xmm2,%xmm1
+ movdqu 16(%rdi,%rax,1),%xmm2
+ movdqu %xmm1,(%rdi,%rax,1)
+ pand %xmm4,%xmm3
+ pand %xmm5,%xmm2
+ movdqa %xmm0,16(%rsp,%rax,1)
+ por %xmm2,%xmm3
+ movdqu %xmm3,16(%rdi,%rax,1)
+ leaq 32(%rax),%rax
+ decq %r15
+ jnz .Lcopy4x
+ movq 8(%rsp,%r9,8),%rsi
+.cfi_def_cfa %rsi, 8
+ movq $1,%rax
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmul4x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_mul4x_mont,.-bn_mul4x_mont
+.extern bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.extern bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+
+.type bn_sqr8x_mont,@function
+.align 32
+bn_sqr8x_mont:
+.cfi_startproc
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lsqr8x_enter:
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lsqr8x_prologue:
+
+ movl %r9d,%r10d
+ shll $3,%r9d
+ shlq $3+2,%r10
+ negq %r9
+
+
+
+
+
+
+ leaq -64(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ movq (%r8),%r8
+ subq %rsi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lsqr8x_sp_alt
+ subq %r11,%rbp
+ leaq -64(%rbp,%r9,2),%rbp
+ jmp .Lsqr8x_sp_done
+
+.align 32
+.Lsqr8x_sp_alt:
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+.Lsqr8x_sp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lsqr8x_page_walk
+ jmp .Lsqr8x_page_walk_done
+
+.align 16
+.Lsqr8x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lsqr8x_page_walk
+.Lsqr8x_page_walk_done:
+
+ movq %r9,%r10
+ negq %r9
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lsqr8x_body:
+
+.byte 102,72,15,110,209
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,207
+.byte 102,73,15,110,218
+ leaq OPENSSL_ia32cap_P(%rip),%rax
+ movl 8(%rax),%eax
+ andl $0x80100,%eax
+ cmpl $0x80100,%eax
+ jne .Lsqr8x_nox
+
+ call bn_sqrx8x_internal
+
+
+
+
+ leaq (%r8,%rcx,1),%rbx
+ movq %rcx,%r9
+ movq %rcx,%rdx
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp .Lsqr8x_sub
+
+.align 32
+.Lsqr8x_nox:
+ call bn_sqr8x_internal
+
+
+
+
+ leaq (%rdi,%r9,1),%rbx
+ movq %r9,%rcx
+ movq %r9,%rdx
+.byte 102,72,15,126,207
+ sarq $3+2,%rcx
+ jmp .Lsqr8x_sub
+
+.align 32
+.Lsqr8x_sub:
+ movq 0(%rbx),%r12
+ movq 8(%rbx),%r13
+ movq 16(%rbx),%r14
+ movq 24(%rbx),%r15
+ leaq 32(%rbx),%rbx
+ sbbq 0(%rbp),%r12
+ sbbq 8(%rbp),%r13
+ sbbq 16(%rbp),%r14
+ sbbq 24(%rbp),%r15
+ leaq 32(%rbp),%rbp
+ movq %r12,0(%rdi)
+ movq %r13,8(%rdi)
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+ incq %rcx
+ jnz .Lsqr8x_sub
+
+ sbbq $0,%rax
+ leaq (%rbx,%r9,1),%rbx
+ leaq (%rdi,%r9,1),%rdi
+
+.byte 102,72,15,110,200
+ pxor %xmm0,%xmm0
+ pshufd $0,%xmm1,%xmm1
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ jmp .Lsqr8x_cond_copy
+
+.align 32
+.Lsqr8x_cond_copy:
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
+ leaq 32(%rbx),%rbx
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
+ leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx)
+ movdqa %xmm0,-16(%rbx)
+ movdqa %xmm0,-32(%rbx,%rdx,1)
+ movdqa %xmm0,-16(%rbx,%rdx,1)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ addq $32,%r9
+ jnz .Lsqr8x_cond_copy
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lsqr8x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_sqr8x_mont,.-bn_sqr8x_mont
+.type bn_mulx4x_mont,@function
+.align 32
+bn_mulx4x_mont:
+.cfi_startproc
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lmulx4x_enter:
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lmulx4x_prologue:
+
+ shll $3,%r9d
+ xorq %r10,%r10
+ subq %r9,%r10
+ movq (%r8),%r8
+ leaq -72(%rsp,%r10,1),%rbp
+ andq $-128,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
+.align 16
+.Lmulx4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
+ leaq (%rdx,%r9,1),%r10
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %r9,0(%rsp)
+ shrq $5,%r9
+ movq %r10,16(%rsp)
+ subq $1,%r9
+ movq %r8,24(%rsp)
+ movq %rdi,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+ movq %r9,48(%rsp)
+ jmp .Lmulx4x_body
+
+.align 32
+.Lmulx4x_body:
+ leaq 8(%rdx),%rdi
+ movq (%rdx),%rdx
+ leaq 64+32(%rsp),%rbx
+ movq %rdx,%r9
+
+ mulxq 0(%rsi),%r8,%rax
+ mulxq 8(%rsi),%r11,%r14
+ addq %rax,%r11
+ movq %rdi,8(%rsp)
+ mulxq 16(%rsi),%r12,%r13
+ adcq %r14,%r12
+ adcq $0,%r13
+
+ movq %r8,%rdi
+ imulq 24(%rsp),%r8
+ xorq %rbp,%rbp
+
+ mulxq 24(%rsi),%rax,%r14
+ movq %r8,%rdx
+ leaq 32(%rsi),%rsi
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%rdi
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
+ movq 48(%rsp),%rdi
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r12,-16(%rbx)
+
+ jmp .Lmulx4x_1st
+
+.align 32
+.Lmulx4x_1st:
+ adcxq %rbp,%r15
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+.byte 0x67,0x67
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-32(%rbx)
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_1st
+
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdi
+ adcq %rbp,%r15
+ addq %r15,%r14
+ sbbq %r15,%r15
+ movq %r14,-8(%rbx)
+ jmp .Lmulx4x_outer
+
+.align 32
+.Lmulx4x_outer:
+ movq (%rdi),%rdx
+ leaq 8(%rdi),%rdi
+ subq %rax,%rsi
+ movq %r15,(%rbx)
+ leaq 64+32(%rsp),%rbx
+ subq %rax,%rcx
+
+ mulxq 0(%rsi),%r8,%r11
+ xorl %ebp,%ebp
+ movq %rdx,%r9
+ mulxq 8(%rsi),%r14,%r12
+ adoxq -32(%rbx),%r8
+ adcxq %r14,%r11
+ mulxq 16(%rsi),%r15,%r13
+ adoxq -24(%rbx),%r11
+ adcxq %r15,%r12
+ adoxq -16(%rbx),%r12
+ adcxq %rbp,%r13
+ adoxq %rbp,%r13
+
+ movq %rdi,8(%rsp)
+ movq %r8,%r15
+ imulq 24(%rsp),%r8
+ xorl %ebp,%ebp
+
+ mulxq 24(%rsi),%rax,%r14
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adoxq -8(%rbx),%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ adoxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ leaq 32(%rcx),%rcx
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ movq 48(%rsp),%rdi
+ movq %r12,-16(%rbx)
+
+ jmp .Lmulx4x_inner
+
+.align 32
+.Lmulx4x_inner:
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %rbp,%r15
+ adoxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq 0(%rbx),%r10
+ adoxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq 8(%rbx),%r11
+ adoxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+ movq %r8,%rdx
+ adcxq 16(%rbx),%r12
+ adoxq %rax,%r13
+ adcxq 24(%rbx),%r13
+ adoxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+ adcxq %rbp,%r14
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-32(%rbx)
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_inner
+
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdi
+ adcq %rbp,%r15
+ subq 0(%rbx),%rbp
+ adcq %r15,%r14
+ sbbq %r15,%r15
+ movq %r14,-8(%rbx)
+
+ cmpq 16(%rsp),%rdi
+ jne .Lmulx4x_outer
+
+ leaq 64(%rsp),%rbx
+ subq %rax,%rcx
+ negq %r15
+ movq %rax,%rdx
+ shrq $3+2,%rax
+ movq 32(%rsp),%rdi
+ jmp .Lmulx4x_sub
+
+.align 32
+.Lmulx4x_sub:
+ movq 0(%rbx),%r11
+ movq 8(%rbx),%r12
+ movq 16(%rbx),%r13
+ movq 24(%rbx),%r14
+ leaq 32(%rbx),%rbx
+ sbbq 0(%rcx),%r11
+ sbbq 8(%rcx),%r12
+ sbbq 16(%rcx),%r13
+ sbbq 24(%rcx),%r14
+ leaq 32(%rcx),%rcx
+ movq %r11,0(%rdi)
+ movq %r12,8(%rdi)
+ movq %r13,16(%rdi)
+ movq %r14,24(%rdi)
+ leaq 32(%rdi),%rdi
+ decq %rax
+ jnz .Lmulx4x_sub
+
+ sbbq $0,%r15
+ leaq 64(%rsp),%rbx
+ subq %rdx,%rdi
+
+.byte 102,73,15,110,207
+ pxor %xmm0,%xmm0
+ pshufd $0,%xmm1,%xmm1
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ jmp .Lmulx4x_cond_copy
+
+.align 32
+.Lmulx4x_cond_copy:
+ movdqa 0(%rbx),%xmm2
+ movdqa 16(%rbx),%xmm3
+ leaq 32(%rbx),%rbx
+ movdqu 0(%rdi),%xmm4
+ movdqu 16(%rdi),%xmm5
+ leaq 32(%rdi),%rdi
+ movdqa %xmm0,-32(%rbx)
+ movdqa %xmm0,-16(%rbx)
+ pcmpeqd %xmm1,%xmm0
+ pand %xmm1,%xmm2
+ pand %xmm1,%xmm3
+ pand %xmm0,%xmm4
+ pand %xmm0,%xmm5
+ pxor %xmm0,%xmm0
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqu %xmm4,-32(%rdi)
+ movdqu %xmm5,-16(%rdi)
+ subq $32,%rdx
+ jnz .Lmulx4x_cond_copy
+
+ movq %rdx,(%rbx)
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmulx4x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_mulx4x_mont,.-bn_mulx4x_mont
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S
@@ -1,0 +1,3790 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+.extern OPENSSL_ia32cap_P
+.hidden OPENSSL_ia32cap_P
+
+.globl bn_mul_mont_gather5
+.hidden bn_mul_mont_gather5
+.type bn_mul_mont_gather5,@function
+.align 64
+bn_mul_mont_gather5:
+.cfi_startproc
+ movl %r9d,%r9d
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ testl $7,%r9d
+ jnz .Lmul_enter
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 8(%r11),%r11d
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ movd 8(%rsp),%xmm5
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+
+ negq %r9
+ movq %rsp,%r11
+ leaq -280(%rsp,%r9,8),%r10
+ negq %r9
+ andq $-1024,%r10
+
+
+
+
+
+
+
+
+
+ subq %r10,%r11
+ andq $-4096,%r11
+ leaq (%r10,%r11,1),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+ jmp .Lmul_page_walk_done
+
+.Lmul_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r11
+ cmpq %r10,%rsp
+ ja .Lmul_page_walk
+.Lmul_page_walk_done:
+
+ leaq .Linc(%rip),%r10
+ movq %rax,8(%rsp,%r9,8)
+.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
+.Lmul_body:
+
+ leaq 128(%rdx),%r12
+ movdqa 0(%r10),%xmm0
+ movdqa 16(%r10),%xmm1
+ leaq 24-112(%rsp,%r9,8),%r10
+ andq $-16,%r10
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+.byte 0x67
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+.byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r9,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ leaq 24+128(%rsp,%r9,8),%rdx
+ andq $-16,%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+
+ movq (%rsi),%rax
+.byte 102,72,15,126,195
+
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r9,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r9,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jb .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ movq $-1,%rbx
+ xorq %rax,%rbx
+ xorq %r14,%r14
+ movq %r9,%r15
+
+.Lcopy:
+ movq (%rdi,%r14,8),%rcx
+ movq (%rsp,%r14,8),%rdx
+ andq %rbx,%rcx
+ andq %rax,%rdx
+ movq %r14,(%rsp,%r14,8)
+ orq %rcx,%rdx
+ movq %rdx,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+.type bn_mul4x_mont_gather5,@function
+.align 32
+bn_mul4x_mont_gather5:
+.cfi_startproc
+.byte 0x67
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lmul4x_enter:
+ andl $0x80108,%r11d
+ cmpl $0x80108,%r11d
+ je .Lmulx4x_enter
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lmul4x_prologue:
+
+.byte 0x67
+ shll $3,%r9d
+ leaq (%r9,%r9,2),%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lmul4xsp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp .Lmul4xsp_done
+
+.align 32
+.Lmul4xsp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+.Lmul4xsp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmul4x_page_walk
+ jmp .Lmul4x_page_walk_done
+
+.Lmul4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmul4x_page_walk
+.Lmul4x_page_walk_done:
+
+ negq %r9
+
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lmul4x_body:
+
+ call mul4x_internal
+
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmul4x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+
+.type mul4x_internal,@function
+.align 32
+mul4x_internal:
+.cfi_startproc
+ shlq $5,%r9
+ movd 8(%rax),%xmm5
+ leaq .Linc(%rip),%rax
+ leaq 128(%rdx,%r9,1),%r13
+ shrq $5,%r9
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r9,1),%r10
+ leaq 128(%rdx),%r12
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+.byte 0x67,0x67
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+.byte 0x67
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+.byte 0x67
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+ pand 64(%r12),%xmm0
+
+ pand 80(%r12),%xmm1
+ pand 96(%r12),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%r12),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%r12),%xmm4
+ movdqa -112(%r12),%xmm5
+ movdqa -96(%r12),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%r12),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%r12),%xmm4
+ movdqa -48(%r12),%xmm5
+ movdqa -32(%r12),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%r12),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%r12),%xmm4
+ movdqa 16(%r12),%xmm5
+ movdqa 32(%r12),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%r12),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ por %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
+ movq %r13,16+8(%rsp)
+ movq %rdi,56+8(%rsp)
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+ leaq (%rsi,%r9,1),%rsi
+ negq %r9
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ leaq 64+8(%rsp),%r14
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%r9),%r15
+ leaq 32(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdi,(%r14)
+ movq %rdx,%r13
+ jmp .L1st4x
+
+.align 32
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx),%rax
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 0(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdi,(%r14)
+ movq %rdx,%r13
+
+ addq $32,%r15
+ jnz .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx),%rax
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%r13
+
+ leaq (%rcx,%r9,1),%rcx
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%r14)
+
+ jmp .Louter4x
+
+.align 32
+.Louter4x:
+ leaq 16+128(%r14),%rdx
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r12),%xmm0
+ movdqa -112(%r12),%xmm1
+ movdqa -96(%r12),%xmm2
+ movdqa -80(%r12),%xmm3
+ pand -128(%rdx),%xmm0
+ pand -112(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r12),%xmm0
+ movdqa -48(%r12),%xmm1
+ movdqa -32(%r12),%xmm2
+ movdqa -16(%r12),%xmm3
+ pand -64(%rdx),%xmm0
+ pand -48(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r12),%xmm0
+ movdqa 16(%r12),%xmm1
+ movdqa 32(%r12),%xmm2
+ movdqa 48(%r12),%xmm3
+ pand 0(%rdx),%xmm0
+ pand 16(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r12),%xmm0
+ movdqa 80(%r12),%xmm1
+ movdqa 96(%r12),%xmm2
+ movdqa 112(%r12),%xmm3
+ pand 64(%rdx),%xmm0
+ pand 80(%rdx),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rdx),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rdx),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%r12),%r12
+.byte 102,72,15,126,195
+
+ movq (%r14,%r9,1),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+ movq %rdi,(%r14)
+
+ leaq (%r14,%r9,1),%r14
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%r9),%r15
+ leaq 32(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdx,%r13
+ jmp .Linner4x
+
+.align 32
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx),%rax
+ adcq $0,%rdx
+ addq 16(%r14),%r10
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx),%rax
+ adcq $0,%rdx
+ addq -8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 0(%rcx),%rax
+ adcq $0,%rdx
+ addq (%r14),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%rcx),%rcx
+ adcq $0,%rdx
+ movq %r13,-8(%r14)
+ movq %rdx,%r13
+
+ addq $32,%r15
+ jnz .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx),%rax
+ adcq $0,%rdx
+ addq 16(%r14),%r10
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq %rbp,%rax
+ movq -8(%rcx),%rbp
+ adcq $0,%rdx
+ addq -8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%r13
+
+ movq %rdi,-16(%r14)
+ leaq (%rcx,%r9,1),%rcx
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%r14),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%r14)
+
+ cmpq 16+8(%rsp),%r12
+ jb .Louter4x
+ xorq %rax,%rax
+ subq %r13,%rbp
+ adcq %r15,%r15
+ orq %r15,%rdi
+ subq %rdi,%rax
+ leaq (%r14,%r9,1),%rbx
+ movq (%rcx),%r12
+ leaq (%rcx),%rbp
+ movq %r9,%rcx
+ sarq $3+2,%rcx
+ movq 56+8(%rsp),%rdi
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
+.cfi_endproc
+.size mul4x_internal,.-mul4x_internal
+.globl bn_power5
+.hidden bn_power5
+.type bn_power5,@function
+.align 32
+bn_power5:
+.cfi_startproc
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 8(%r11),%r11d
+ andl $0x80108,%r11d
+ cmpl $0x80108,%r11d
+ je .Lpowerx5_enter
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lpower5_prologue:
+
+ shll $3,%r9d
+ leal (%r9,%r9,2),%r10d
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lpwr_sp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp .Lpwr_sp_done
+
+.align 32
+.Lpwr_sp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+.Lpwr_sp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lpwr_page_walk
+ jmp .Lpwr_page_walk_done
+
+.Lpwr_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lpwr_page_walk
+.Lpwr_page_walk_done:
+
+ movq %r9,%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lpower5_body:
+.byte 102,72,15,110,207
+.byte 102,72,15,110,209
+.byte 102,73,15,110,218
+.byte 102,72,15,110,226
+
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+ call __bn_sqr8x_internal
+ call __bn_post4x_internal
+
+.byte 102,72,15,126,209
+.byte 102,72,15,126,226
+ movq %rsi,%rdi
+ movq 40(%rsp),%rax
+ leaq 32(%rsp),%r8
+
+ call mul4x_internal
+
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpower5_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_power5,.-bn_power5
+
+.globl bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+.type bn_sqr8x_internal,@function
+.align 32
+bn_sqr8x_internal:
+__bn_sqr8x_internal:
+.cfi_startproc
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ leaq 32(%r10),%rbp
+ leaq (%rsi,%r9,1),%rsi
+
+ movq %r9,%rcx
+
+
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq $0,%rdx
+ movq %r11,-16(%rdi,%rbp,1)
+ movq %rdx,%r10
+
+
+ movq -8(%rsi,%rbp,1),%rbx
+ mulq %r15
+ movq %rax,%r12
+ movq %rbx,%rax
+ movq %rdx,%r13
+
+ leaq (%rbp),%rcx
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+ jmp .Lsqr4x_1st
+
+.align 32
+.Lsqr4x_1st:
+ movq (%rsi,%rcx,1),%rbx
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 8(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,(%rdi,%rcx,1)
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq 16(%rsi,%rcx,1),%rbx
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %r10,8(%rdi,%rcx,1)
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 24(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,16(%rdi,%rcx,1)
+ movq %rdx,%r13
+ adcq $0,%r13
+ leaq 32(%rcx),%rcx
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_1st
+
+ mulq %r15
+ addq %rax,%r13
+ leaq 16(%rbp),%rbp
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+ jmp .Lsqr4x_outer
+
+.align 32
+.Lsqr4x_outer:
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq -24(%rdi,%rbp,1),%r10
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ movq %r10,-24(%rdi,%rbp,1)
+ movq %rdx,%r11
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq -16(%rdi,%rbp,1),%r11
+ movq %rdx,%r10
+ adcq $0,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ xorq %r12,%r12
+
+ movq -8(%rsi,%rbp,1),%rbx
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq -8(%rdi,%rbp,1),%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq %r12,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rbp,1)
+
+ leaq (%rbp),%rcx
+ jmp .Lsqr4x_inner
+
+.align 32
+.Lsqr4x_inner:
+ movq (%rsi,%rcx,1),%rbx
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+ addq (%rdi,%rcx,1),%r13
+ adcq $0,%r12
+
+.byte 0x67
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 8(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+ mulq %r15
+ addq %rax,%r12
+ movq %r11,(%rdi,%rcx,1)
+ movq %rbx,%rax
+ movq %rdx,%r13
+ adcq $0,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ leaq 16(%rcx),%rcx
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq %r12,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_inner
+
+.byte 0x67
+ mulq %r15
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+
+ addq $16,%rbp
+ jnz .Lsqr4x_outer
+
+
+ movq -32(%rsi),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq %r10,-24(%rdi)
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ movq -8(%rsi),%rbx
+ adcq $0,%r10
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,-16(%rdi)
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi)
+
+ mulq %r15
+ addq %rax,%r13
+ movq -16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+
+ mulq %rbx
+ addq $16,%rbp
+ xorq %r14,%r14
+ subq %r9,%rbp
+ xorq %r15,%r15
+
+ addq %r12,%rax
+ adcq $0,%rdx
+ movq %rax,8(%rdi)
+ movq %rdx,16(%rdi)
+ movq %r15,24(%rdi)
+
+ movq -16(%rsi,%rbp,1),%rax
+ leaq 48+8(%rsp),%rdi
+ xorq %r10,%r10
+ movq 8(%rdi),%r11
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi)
+ adcq %rdx,%r8
+ leaq 16(%rbp),%rbp
+ movq %r8,24(%rdi)
+ sbbq %r15,%r15
+ leaq 64(%rdi),%rdi
+ jmp .Lsqr4x_shift_n_add
+
+.align 32
+.Lsqr4x_shift_n_add:
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi)
+ adcq %rdx,%r8
+
+ leaq (%r14,%r10,2),%r12
+ movq %r8,-8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi),%r11
+ adcq %rax,%r12
+ movq 8(%rsi,%rbp,1),%rax
+ movq %r12,0(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi),%r11
+ adcq %rax,%rbx
+ movq 16(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi)
+ adcq %rdx,%r8
+ movq %r8,24(%rdi)
+ sbbq %r15,%r15
+ leaq 64(%rdi),%rdi
+ addq $32,%rbp
+ jnz .Lsqr4x_shift_n_add
+
+ leaq (%r14,%r10,2),%r12
+.byte 0x67
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ mulq %rax
+ negq %r15
+ adcq %rax,%rbx
+ adcq %rdx,%r8
+ movq %rbx,-16(%rdi)
+ movq %r8,-8(%rdi)
+.byte 102,72,15,126,213
+__bn_sqr8x_reduction:
+ xorq %rax,%rax
+ leaq (%r9,%rbp,1),%rcx
+ leaq 48+8(%rsp,%r9,2),%rdx
+ movq %rcx,0+8(%rsp)
+ leaq 48+8(%rsp,%r9,1),%rdi
+ movq %rdx,8+8(%rsp)
+ negq %r9
+ jmp .L8x_reduction_loop
+
+.align 32
+.L8x_reduction_loop:
+ leaq (%rdi,%r9,1),%rdi
+.byte 0x66
+ movq 0(%rdi),%rbx
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r12
+ movq 40(%rdi),%r13
+ movq 48(%rdi),%r14
+ movq 56(%rdi),%r15
+ movq %rax,(%rdx)
+ leaq 64(%rdi),%rdi
+
+.byte 0x67
+ movq %rbx,%r8
+ imulq 32+8(%rsp),%rbx
+ movq 0(%rbp),%rax
+ movl $8,%ecx
+ jmp .L8x_reduce
+
+.align 32
+.L8x_reduce:
+ mulq %rbx
+ movq 8(%rbp),%rax
+ negq %r8
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 16(%rbp),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ movq %rbx,48-8+8(%rsp,%rcx,8)
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 24(%rbp),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq 32+8(%rsp),%rsi
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 32(%rbp),%rax
+ adcq $0,%rdx
+ imulq %r8,%rsi
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 40(%rbp),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 48(%rbp),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 56(%rbp),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ movq %rsi,%rbx
+ addq %rax,%r15
+ movq 0(%rbp),%rax
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ decl %ecx
+ jnz .L8x_reduce
+
+ leaq 64(%rbp),%rbp
+ xorq %rax,%rax
+ movq 8+8(%rsp),%rdx
+ cmpq 0+8(%rsp),%rbp
+ jae .L8x_no_tail
+
+.byte 0x66
+ addq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ sbbq %rsi,%rsi
+
+ movq 48+56+8(%rsp),%rbx
+ movl $8,%ecx
+ movq 0(%rbp),%rax
+ jmp .L8x_tail
+
+.align 32
+.L8x_tail:
+ mulq %rbx
+ addq %rax,%r8
+ movq 8(%rbp),%rax
+ movq %r8,(%rdi)
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 16(%rbp),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ leaq 8(%rdi),%rdi
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 24(%rbp),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 32(%rbp),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 40(%rbp),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 48(%rbp),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 56(%rbp),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ movq 48-16+8(%rsp,%rcx,8),%rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq 0(%rbp),%rax
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ decl %ecx
+ jnz .L8x_tail
+
+ leaq 64(%rbp),%rbp
+ movq 8+8(%rsp),%rdx
+ cmpq 0+8(%rsp),%rbp
+ jae .L8x_tail_done
+
+ movq 48+56+8(%rsp),%rbx
+ negq %rsi
+ movq 0(%rbp),%rax
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ sbbq %rsi,%rsi
+
+ movl $8,%ecx
+ jmp .L8x_tail
+
+.align 32
+.L8x_tail_done:
+ xorq %rax,%rax
+ addq (%rdx),%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+ adcq $0,%rax
+
+ negq %rsi
+.L8x_no_tail:
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ adcq $0,%rax
+ movq -8(%rbp),%rcx
+ xorq %rsi,%rsi
+
+.byte 102,72,15,126,213
+
+ movq %r8,0(%rdi)
+ movq %r9,8(%rdi)
+.byte 102,73,15,126,217
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+ leaq 64(%rdi),%rdi
+
+ cmpq %rdx,%rdi
+ jb .L8x_reduction_loop
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_sqr8x_internal,.-bn_sqr8x_internal
+.type __bn_post4x_internal,@function
+.align 32
+__bn_post4x_internal:
+.cfi_startproc
+ movq 0(%rbp),%r12
+ leaq (%rdi,%r9,1),%rbx
+ movq %r9,%rcx
+.byte 102,72,15,126,207
+ negq %rax
+.byte 102,72,15,126,206
+ sarq $3+2,%rcx
+ decq %r12
+ xorq %r10,%r10
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqr4x_sub_entry
+
+.align 16
+.Lsqr4x_sub:
+ movq 0(%rbp),%r12
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+.Lsqr4x_sub_entry:
+ leaq 32(%rbp),%rbp
+ notq %r12
+ notq %r13
+ notq %r14
+ notq %r15
+ andq %rax,%r12
+ andq %rax,%r13
+ andq %rax,%r14
+ andq %rax,%r15
+
+ negq %r10
+ adcq 0(%rbx),%r12
+ adcq 8(%rbx),%r13
+ adcq 16(%rbx),%r14
+ adcq 24(%rbx),%r15
+ movq %r12,0(%rdi)
+ leaq 32(%rbx),%rbx
+ movq %r13,8(%rdi)
+ sbbq %r10,%r10
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+
+ incq %rcx
+ jnz .Lsqr4x_sub
+
+ movq %r9,%r10
+ negq %r9
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __bn_post4x_internal,.-__bn_post4x_internal
+.globl bn_from_montgomery
+.hidden bn_from_montgomery
+.type bn_from_montgomery,@function
+.align 32
+bn_from_montgomery:
+.cfi_startproc
+ testl $7,%r9d
+ jz bn_from_mont8x
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_from_montgomery,.-bn_from_montgomery
+
+.type bn_from_mont8x,@function
+.align 32
+bn_from_mont8x:
+.cfi_startproc
+.byte 0x67
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lfrom_prologue:
+
+ shll $3,%r9d
+ leaq (%r9,%r9,2),%r10
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lfrom_sp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp .Lfrom_sp_done
+
+.align 32
+.Lfrom_sp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+.Lfrom_sp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lfrom_page_walk
+ jmp .Lfrom_page_walk_done
+
+.Lfrom_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lfrom_page_walk
+.Lfrom_page_walk_done:
+
+ movq %r9,%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lfrom_body:
+ movq %r9,%r11
+ leaq 48(%rsp),%rax
+ pxor %xmm0,%xmm0
+ jmp .Lmul_by_1
+
+.align 32
+.Lmul_by_1:
+ movdqu (%rsi),%xmm1
+ movdqu 16(%rsi),%xmm2
+ movdqu 32(%rsi),%xmm3
+ movdqa %xmm0,(%rax,%r9,1)
+ movdqu 48(%rsi),%xmm4
+ movdqa %xmm0,16(%rax,%r9,1)
+.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
+ movdqa %xmm1,(%rax)
+ movdqa %xmm0,32(%rax,%r9,1)
+ movdqa %xmm2,16(%rax)
+ movdqa %xmm0,48(%rax,%r9,1)
+ movdqa %xmm3,32(%rax)
+ movdqa %xmm4,48(%rax)
+ leaq 64(%rax),%rax
+ subq $64,%r11
+ jnz .Lmul_by_1
+
+.byte 102,72,15,110,207
+.byte 102,72,15,110,209
+.byte 0x67
+ movq %rcx,%rbp
+.byte 102,73,15,110,218
+ leaq OPENSSL_ia32cap_P(%rip),%r11
+ movl 8(%r11),%r11d
+ andl $0x80108,%r11d
+ cmpl $0x80108,%r11d
+ jne .Lfrom_mont_nox
+
+ leaq (%rax,%r9,1),%rdi
+ call __bn_sqrx8x_reduction
+ call __bn_postx4x_internal
+
+ pxor %xmm0,%xmm0
+ leaq 48(%rsp),%rax
+ jmp .Lfrom_mont_zero
+
+.align 32
+.Lfrom_mont_nox:
+ call __bn_sqr8x_reduction
+ call __bn_post4x_internal
+
+ pxor %xmm0,%xmm0
+ leaq 48(%rsp),%rax
+ jmp .Lfrom_mont_zero
+
+.align 32
+.Lfrom_mont_zero:
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ movdqa %xmm0,32(%rax)
+ movdqa %xmm0,48(%rax)
+ leaq 64(%rax),%rax
+ subq $32,%r9
+ jnz .Lfrom_mont_zero
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lfrom_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_from_mont8x,.-bn_from_mont8x
+.type bn_mulx4x_mont_gather5,@function
+.align 32
+bn_mulx4x_mont_gather5:
+.cfi_startproc
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lmulx4x_enter:
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lmulx4x_prologue:
+
+ shll $3,%r9d
+ leaq (%r9,%r9,2),%r10
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lmulx4xsp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp .Lmulx4xsp_done
+
+.Lmulx4xsp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+.Lmulx4xsp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+ jmp .Lmulx4x_page_walk_done
+
+.Lmulx4x_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lmulx4x_page_walk
+.Lmulx4x_page_walk_done:
+
+
+
+
+
+
+
+
+
+
+
+
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lmulx4x_body:
+ call mulx4x_internal
+
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lmulx4x_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
+
+.type mulx4x_internal,@function
+.align 32
+mulx4x_internal:
+.cfi_startproc
+ movq %r9,8(%rsp)
+ movq %r9,%r10
+ negq %r9
+ shlq $5,%r9
+ negq %r10
+ leaq 128(%rdx,%r9,1),%r13
+ shrq $5+5,%r9
+ movd 8(%rax),%xmm5
+ subq $1,%r9
+ leaq .Linc(%rip),%rax
+ movq %r13,16+8(%rsp)
+ movq %r9,24+8(%rsp)
+ movq %rdi,56+8(%rsp)
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 88-112(%rsp,%r10,1),%r10
+ leaq 128(%rdx),%rdi
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+.byte 0x67
+ movdqa %xmm1,%xmm2
+.byte 0x67
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,112(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,128(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,144(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,160(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,176(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,192(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,208(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,224(%r10)
+ movdqa %xmm4,%xmm3
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,240(%r10)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,256(%r10)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,272(%r10)
+ movdqa %xmm4,%xmm2
+
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,288(%r10)
+ movdqa %xmm4,%xmm3
+.byte 0x67
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,304(%r10)
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,320(%r10)
+
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,336(%r10)
+
+ pand 64(%rdi),%xmm0
+ pand 80(%rdi),%xmm1
+ pand 96(%rdi),%xmm2
+ movdqa %xmm3,352(%r10)
+ pand 112(%rdi),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -128(%rdi),%xmm4
+ movdqa -112(%rdi),%xmm5
+ movdqa -96(%rdi),%xmm2
+ pand 112(%r10),%xmm4
+ movdqa -80(%rdi),%xmm3
+ pand 128(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 144(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 160(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa -64(%rdi),%xmm4
+ movdqa -48(%rdi),%xmm5
+ movdqa -32(%rdi),%xmm2
+ pand 176(%r10),%xmm4
+ movdqa -16(%rdi),%xmm3
+ pand 192(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 208(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 224(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ movdqa 0(%rdi),%xmm4
+ movdqa 16(%rdi),%xmm5
+ movdqa 32(%rdi),%xmm2
+ pand 240(%r10),%xmm4
+ movdqa 48(%rdi),%xmm3
+ pand 256(%r10),%xmm5
+ por %xmm4,%xmm0
+ pand 272(%r10),%xmm2
+ por %xmm5,%xmm1
+ pand 288(%r10),%xmm3
+ por %xmm2,%xmm0
+ por %xmm3,%xmm1
+ pxor %xmm1,%xmm0
+ pshufd $0x4e,%xmm0,%xmm1
+ por %xmm1,%xmm0
+ leaq 256(%rdi),%rdi
+.byte 102,72,15,126,194
+ leaq 64+32+8(%rsp),%rbx
+
+ movq %rdx,%r9
+ mulxq 0(%rsi),%r8,%rax
+ mulxq 8(%rsi),%r11,%r12
+ addq %rax,%r11
+ mulxq 16(%rsi),%rax,%r13
+ adcq %rax,%r12
+ adcq $0,%r13
+ mulxq 24(%rsi),%rax,%r14
+
+ movq %r8,%r15
+ imulq 32+8(%rsp),%r8
+ xorq %rbp,%rbp
+ movq %r8,%rdx
+
+ movq %rdi,8+8(%rsp)
+
+ leaq 32(%rsi),%rsi
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ movq 24+8(%rsp),%rdi
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r11,-24(%rbx)
+ adcxq %rax,%r12
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r12,-16(%rbx)
+ jmp .Lmulx4x_1st
+
+.align 32
+.Lmulx4x_1st:
+ adcxq %rbp,%r15
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+.byte 0x67,0x67
+ movq %r8,%rdx
+ adcxq %rax,%r13
+ adcxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-32(%rbx)
+ adoxq %r15,%r13
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ leaq 32(%rcx),%rcx
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_1st
+
+ movq 8(%rsp),%rax
+ adcq %rbp,%r15
+ leaq (%rsi,%rax,1),%rsi
+ addq %r15,%r14
+ movq 8+8(%rsp),%rdi
+ adcq %rbp,%rbp
+ movq %r14,-8(%rbx)
+ jmp .Lmulx4x_outer
+
+.align 32
+.Lmulx4x_outer:
+ leaq 16-256(%rbx),%r10
+ pxor %xmm4,%xmm4
+.byte 0x67,0x67
+ pxor %xmm5,%xmm5
+ movdqa -128(%rdi),%xmm0
+ movdqa -112(%rdi),%xmm1
+ movdqa -96(%rdi),%xmm2
+ pand 256(%r10),%xmm0
+ movdqa -80(%rdi),%xmm3
+ pand 272(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 288(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 304(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%rdi),%xmm0
+ movdqa -48(%rdi),%xmm1
+ movdqa -32(%rdi),%xmm2
+ pand 320(%r10),%xmm0
+ movdqa -16(%rdi),%xmm3
+ pand 336(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 352(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 368(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%rdi),%xmm0
+ movdqa 16(%rdi),%xmm1
+ movdqa 32(%rdi),%xmm2
+ pand 384(%r10),%xmm0
+ movdqa 48(%rdi),%xmm3
+ pand 400(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 416(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 432(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%rdi),%xmm0
+ movdqa 80(%rdi),%xmm1
+ movdqa 96(%rdi),%xmm2
+ pand 448(%r10),%xmm0
+ movdqa 112(%rdi),%xmm3
+ pand 464(%r10),%xmm1
+ por %xmm0,%xmm4
+ pand 480(%r10),%xmm2
+ por %xmm1,%xmm5
+ pand 496(%r10),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ leaq 256(%rdi),%rdi
+.byte 102,72,15,126,194
+
+ movq %rbp,(%rbx)
+ leaq 32(%rbx,%rax,1),%rbx
+ mulxq 0(%rsi),%r8,%r11
+ xorq %rbp,%rbp
+ movq %rdx,%r9
+ mulxq 8(%rsi),%r14,%r12
+ adoxq -32(%rbx),%r8
+ adcxq %r14,%r11
+ mulxq 16(%rsi),%r15,%r13
+ adoxq -24(%rbx),%r11
+ adcxq %r15,%r12
+ mulxq 24(%rsi),%rdx,%r14
+ adoxq -16(%rbx),%r12
+ adcxq %rdx,%r13
+ leaq (%rcx,%rax,1),%rcx
+ leaq 32(%rsi),%rsi
+ adoxq -8(%rbx),%r13
+ adcxq %rbp,%r14
+ adoxq %rbp,%r14
+
+ movq %r8,%r15
+ imulq 32+8(%rsp),%r8
+
+ movq %r8,%rdx
+ xorq %rbp,%rbp
+ movq %rdi,8+8(%rsp)
+
+ mulxq 0(%rcx),%rax,%r10
+ adcxq %rax,%r15
+ adoxq %r11,%r10
+ mulxq 8(%rcx),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+ mulxq 16(%rcx),%rax,%r12
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ movq 24+8(%rsp),%rdi
+ movq %r10,-32(%rbx)
+ adcxq %rax,%r12
+ movq %r11,-24(%rbx)
+ adoxq %rbp,%r15
+ movq %r12,-16(%rbx)
+ leaq 32(%rcx),%rcx
+ jmp .Lmulx4x_inner
+
+.align 32
+.Lmulx4x_inner:
+ mulxq 0(%rsi),%r10,%rax
+ adcxq %rbp,%r15
+ adoxq %r14,%r10
+ mulxq 8(%rsi),%r11,%r14
+ adcxq 0(%rbx),%r10
+ adoxq %rax,%r11
+ mulxq 16(%rsi),%r12,%rax
+ adcxq 8(%rbx),%r11
+ adoxq %r14,%r12
+ mulxq 24(%rsi),%r13,%r14
+ movq %r8,%rdx
+ adcxq 16(%rbx),%r12
+ adoxq %rax,%r13
+ adcxq 24(%rbx),%r13
+ adoxq %rbp,%r14
+ leaq 32(%rsi),%rsi
+ leaq 32(%rbx),%rbx
+ adcxq %rbp,%r14
+
+ adoxq %r15,%r10
+ mulxq 0(%rcx),%rax,%r15
+ adcxq %rax,%r10
+ adoxq %r15,%r11
+ mulxq 8(%rcx),%rax,%r15
+ adcxq %rax,%r11
+ adoxq %r15,%r12
+ mulxq 16(%rcx),%rax,%r15
+ movq %r10,-40(%rbx)
+ adcxq %rax,%r12
+ adoxq %r15,%r13
+ movq %r11,-32(%rbx)
+ mulxq 24(%rcx),%rax,%r15
+ movq %r9,%rdx
+ leaq 32(%rcx),%rcx
+ movq %r12,-24(%rbx)
+ adcxq %rax,%r13
+ adoxq %rbp,%r15
+ movq %r13,-16(%rbx)
+
+ decq %rdi
+ jnz .Lmulx4x_inner
+
+ movq 0+8(%rsp),%rax
+ adcq %rbp,%r15
+ subq 0(%rbx),%rdi
+ movq 8+8(%rsp),%rdi
+ movq 16+8(%rsp),%r10
+ adcq %r15,%r14
+ leaq (%rsi,%rax,1),%rsi
+ adcq %rbp,%rbp
+ movq %r14,-8(%rbx)
+
+ cmpq %r10,%rdi
+ jb .Lmulx4x_outer
+
+ movq -8(%rcx),%r10
+ movq %rbp,%r8
+ movq (%rcx,%rax,1),%r12
+ leaq (%rcx,%rax,1),%rbp
+ movq %rax,%rcx
+ leaq (%rbx,%rax,1),%rdi
+ xorl %eax,%eax
+ xorq %r15,%r15
+ subq %r14,%r10
+ adcq %r15,%r15
+ orq %r15,%r8
+ sarq $3+2,%rcx
+ subq %r8,%rax
+ movq 56+8(%rsp),%rdx
+ decq %r12
+ movq 8(%rbp),%r13
+ xorq %r8,%r8
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqrx4x_sub_entry
+.cfi_endproc
+.size mulx4x_internal,.-mulx4x_internal
+.type bn_powerx5,@function
+.align 32
+bn_powerx5:
+.cfi_startproc
+ movq %rsp,%rax
+.cfi_def_cfa_register %rax
+.Lpowerx5_enter:
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+.Lpowerx5_prologue:
+
+ shll $3,%r9d
+ leaq (%r9,%r9,2),%r10
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+
+ leaq -320(%rsp,%r9,2),%r11
+ movq %rsp,%rbp
+ subq %rdi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lpwrx_sp_alt
+ subq %r11,%rbp
+ leaq -320(%rbp,%r9,2),%rbp
+ jmp .Lpwrx_sp_done
+
+.align 32
+.Lpwrx_sp_alt:
+ leaq 4096-320(,%r9,2),%r10
+ leaq -320(%rbp,%r9,2),%rbp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rbp
+.Lpwrx_sp_done:
+ andq $-64,%rbp
+ movq %rsp,%r11
+ subq %rbp,%r11
+ andq $-4096,%r11
+ leaq (%r11,%rbp,1),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lpwrx_page_walk
+ jmp .Lpwrx_page_walk_done
+
+.Lpwrx_page_walk:
+ leaq -4096(%rsp),%rsp
+ movq (%rsp),%r10
+ cmpq %rbp,%rsp
+ ja .Lpwrx_page_walk
+.Lpwrx_page_walk_done:
+
+ movq %r9,%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+
+
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,207
+.byte 102,72,15,110,209
+.byte 102,73,15,110,218
+.byte 102,72,15,110,226
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08
+.Lpowerx5_body:
+
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+ call __bn_sqrx8x_internal
+ call __bn_postx4x_internal
+
+ movq %r10,%r9
+ movq %rsi,%rdi
+.byte 102,72,15,126,209
+.byte 102,72,15,126,226
+ movq 40(%rsp),%rax
+
+ call mulx4x_internal
+
+ movq 40(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ movq $1,%rax
+
+ movq -48(%rsi),%r15
+.cfi_restore %r15
+ movq -40(%rsi),%r14
+.cfi_restore %r14
+ movq -32(%rsi),%r13
+.cfi_restore %r13
+ movq -24(%rsi),%r12
+.cfi_restore %r12
+ movq -16(%rsi),%rbp
+.cfi_restore %rbp
+ movq -8(%rsi),%rbx
+.cfi_restore %rbx
+ leaq (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lpowerx5_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_powerx5,.-bn_powerx5
+
+.globl bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.hidden bn_sqrx8x_internal
+.type bn_sqrx8x_internal,@function
+.align 32
+bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+.cfi_startproc
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ leaq 48+8(%rsp),%rdi
+ leaq (%rsi,%r9,1),%rbp
+ movq %r9,0+8(%rsp)
+ movq %rbp,8+8(%rsp)
+ jmp .Lsqr8x_zero_start
+
+.align 32
+.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+.Lsqrx8x_zero:
+.byte 0x3e
+ movdqa %xmm0,0(%rdi)
+ movdqa %xmm0,16(%rdi)
+ movdqa %xmm0,32(%rdi)
+ movdqa %xmm0,48(%rdi)
+.Lsqr8x_zero_start:
+ movdqa %xmm0,64(%rdi)
+ movdqa %xmm0,80(%rdi)
+ movdqa %xmm0,96(%rdi)
+ movdqa %xmm0,112(%rdi)
+ leaq 128(%rdi),%rdi
+ subq $64,%r9
+ jnz .Lsqrx8x_zero
+
+ movq 0(%rsi),%rdx
+
+ xorq %r10,%r10
+ xorq %r11,%r11
+ xorq %r12,%r12
+ xorq %r13,%r13
+ xorq %r14,%r14
+ xorq %r15,%r15
+ leaq 48+8(%rsp),%rdi
+ xorq %rbp,%rbp
+ jmp .Lsqrx8x_outer_loop
+
+.align 32
+.Lsqrx8x_outer_loop:
+ mulxq 8(%rsi),%r8,%rax
+ adcxq %r9,%r8
+ adoxq %rax,%r10
+ mulxq 16(%rsi),%r9,%rax
+ adcxq %r10,%r9
+ adoxq %rax,%r11
+.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
+ adcxq %r11,%r10
+ adoxq %rax,%r12
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
+ adcxq %r12,%r11
+ adoxq %rax,%r13
+ mulxq 40(%rsi),%r12,%rax
+ adcxq %r13,%r12
+ adoxq %rax,%r14
+ mulxq 48(%rsi),%r13,%rax
+ adcxq %r14,%r13
+ adoxq %r15,%rax
+ mulxq 56(%rsi),%r14,%r15
+ movq 8(%rsi),%rdx
+ adcxq %rax,%r14
+ adoxq %rbp,%r15
+ adcq 64(%rdi),%r15
+ movq %r8,8(%rdi)
+ movq %r9,16(%rdi)
+ sbbq %rcx,%rcx
+ xorq %rbp,%rbp
+
+
+ mulxq 16(%rsi),%r8,%rbx
+ mulxq 24(%rsi),%r9,%rax
+ adcxq %r10,%r8
+ adoxq %rbx,%r9
+ mulxq 32(%rsi),%r10,%rbx
+ adcxq %r11,%r9
+ adoxq %rax,%r10
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
+ adcxq %r12,%r10
+ adoxq %rbx,%r11
+.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
+ adcxq %r13,%r11
+ adoxq %r14,%r12
+.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
+ movq 16(%rsi),%rdx
+ adcxq %rax,%r12
+ adoxq %rbx,%r13
+ adcxq %r15,%r13
+ adoxq %rbp,%r14
+ adcxq %rbp,%r14
+
+ movq %r8,24(%rdi)
+ movq %r9,32(%rdi)
+
+ mulxq 24(%rsi),%r8,%rbx
+ mulxq 32(%rsi),%r9,%rax
+ adcxq %r10,%r8
+ adoxq %rbx,%r9
+ mulxq 40(%rsi),%r10,%rbx
+ adcxq %r11,%r9
+ adoxq %rax,%r10
+.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
+ adcxq %r12,%r10
+ adoxq %r13,%r11
+.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
+.byte 0x3e
+ movq 24(%rsi),%rdx
+ adcxq %rbx,%r11
+ adoxq %rax,%r12
+ adcxq %r14,%r12
+ movq %r8,40(%rdi)
+ movq %r9,48(%rdi)
+ mulxq 32(%rsi),%r8,%rax
+ adoxq %rbp,%r13
+ adcxq %rbp,%r13
+
+ mulxq 40(%rsi),%r9,%rbx
+ adcxq %r10,%r8
+ adoxq %rax,%r9
+ mulxq 48(%rsi),%r10,%rax
+ adcxq %r11,%r9
+ adoxq %r12,%r10
+ mulxq 56(%rsi),%r11,%r12
+ movq 32(%rsi),%rdx
+ movq 40(%rsi),%r14
+ adcxq %rbx,%r10
+ adoxq %rax,%r11
+ movq 48(%rsi),%r15
+ adcxq %r13,%r11
+ adoxq %rbp,%r12
+ adcxq %rbp,%r12
+
+ movq %r8,56(%rdi)
+ movq %r9,64(%rdi)
+
+ mulxq %r14,%r9,%rax
+ movq 56(%rsi),%r8
+ adcxq %r10,%r9
+ mulxq %r15,%r10,%rbx
+ adoxq %rax,%r10
+ adcxq %r11,%r10
+ mulxq %r8,%r11,%rax
+ movq %r14,%rdx
+ adoxq %rbx,%r11
+ adcxq %r12,%r11
+
+ adcxq %rbp,%rax
+
+ mulxq %r15,%r14,%rbx
+ mulxq %r8,%r12,%r13
+ movq %r15,%rdx
+ leaq 64(%rsi),%rsi
+ adcxq %r14,%r11
+ adoxq %rbx,%r12
+ adcxq %rax,%r12
+ adoxq %rbp,%r13
+
+.byte 0x67,0x67
+ mulxq %r8,%r8,%r14
+ adcxq %r8,%r13
+ adcxq %rbp,%r14
+
+ cmpq 8+8(%rsp),%rsi
+ je .Lsqrx8x_outer_break
+
+ negq %rcx
+ movq $-8,%rcx
+ movq %rbp,%r15
+ movq 64(%rdi),%r8
+ adcxq 72(%rdi),%r9
+ adcxq 80(%rdi),%r10
+ adcxq 88(%rdi),%r11
+ adcq 96(%rdi),%r12
+ adcq 104(%rdi),%r13
+ adcq 112(%rdi),%r14
+ adcq 120(%rdi),%r15
+ leaq (%rsi),%rbp
+ leaq 128(%rdi),%rdi
+ sbbq %rax,%rax
+
+ movq -64(%rsi),%rdx
+ movq %rax,16+8(%rsp)
+ movq %rdi,24+8(%rsp)
+
+
+ xorl %eax,%eax
+ jmp .Lsqrx8x_loop
+
+.align 32
+.Lsqrx8x_loop:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ movq %rbx,(%rdi,%rcx,8)
+ movl $0,%ebx
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
+ movq 8(%rsi,%rcx,8),%rdx
+ adcxq %rax,%r14
+ adoxq %rbx,%r15
+ adcxq %rbx,%r15
+
+.byte 0x67
+ incq %rcx
+ jnz .Lsqrx8x_loop
+
+ leaq 64(%rbp),%rbp
+ movq $-8,%rcx
+ cmpq 8+8(%rsp),%rbp
+ je .Lsqrx8x_break
+
+ subq 16+8(%rsp),%rbx
+.byte 0x66
+ movq -64(%rsi),%rdx
+ adcxq 0(%rdi),%r8
+ adcxq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+.byte 0x67
+ sbbq %rax,%rax
+ xorl %ebx,%ebx
+ movq %rax,16+8(%rsp)
+ jmp .Lsqrx8x_loop
+
+.align 32
+.Lsqrx8x_break:
+ xorq %rbp,%rbp
+ subq 16+8(%rsp),%rbx
+ adcxq %rbp,%r8
+ movq 24+8(%rsp),%rcx
+ adcxq %rbp,%r9
+ movq 0(%rsi),%rdx
+ adcq $0,%r10
+ movq %r8,0(%rdi)
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+ cmpq %rcx,%rdi
+ je .Lsqrx8x_outer_loop
+
+ movq %r9,8(%rdi)
+ movq 8(%rcx),%r9
+ movq %r10,16(%rdi)
+ movq 16(%rcx),%r10
+ movq %r11,24(%rdi)
+ movq 24(%rcx),%r11
+ movq %r12,32(%rdi)
+ movq 32(%rcx),%r12
+ movq %r13,40(%rdi)
+ movq 40(%rcx),%r13
+ movq %r14,48(%rdi)
+ movq 48(%rcx),%r14
+ movq %r15,56(%rdi)
+ movq 56(%rcx),%r15
+ movq %rcx,%rdi
+ jmp .Lsqrx8x_outer_loop
+
+.align 32
+.Lsqrx8x_outer_break:
+ movq %r9,72(%rdi)
+.byte 102,72,15,126,217
+ movq %r10,80(%rdi)
+ movq %r11,88(%rdi)
+ movq %r12,96(%rdi)
+ movq %r13,104(%rdi)
+ movq %r14,112(%rdi)
+ leaq 48+8(%rsp),%rdi
+ movq (%rsi,%rcx,1),%rdx
+
+ movq 8(%rdi),%r11
+ xorq %r10,%r10
+ movq 0+8(%rsp),%r9
+ adoxq %r11,%r11
+ movq 16(%rdi),%r12
+ movq 24(%rdi),%r13
+
+
+.align 32
+.Lsqrx4x_shift_n_add:
+ mulxq %rdx,%rax,%rbx
+ adoxq %r12,%r12
+ adcxq %r10,%rax
+.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
+.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
+ adoxq %r13,%r13
+ adcxq %r11,%rbx
+ movq 40(%rdi),%r11
+ movq %rax,0(%rdi)
+ movq %rbx,8(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r10,%r10
+ adcxq %r12,%rax
+ movq 16(%rsi,%rcx,1),%rdx
+ movq 48(%rdi),%r12
+ adoxq %r11,%r11
+ adcxq %r13,%rbx
+ movq 56(%rdi),%r13
+ movq %rax,16(%rdi)
+ movq %rbx,24(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r12,%r12
+ adcxq %r10,%rax
+ movq 24(%rsi,%rcx,1),%rdx
+ leaq 32(%rcx),%rcx
+ movq 64(%rdi),%r10
+ adoxq %r13,%r13
+ adcxq %r11,%rbx
+ movq 72(%rdi),%r11
+ movq %rax,32(%rdi)
+ movq %rbx,40(%rdi)
+
+ mulxq %rdx,%rax,%rbx
+ adoxq %r10,%r10
+ adcxq %r12,%rax
+ jrcxz .Lsqrx4x_shift_n_add_break
+.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
+ adoxq %r11,%r11
+ adcxq %r13,%rbx
+ movq 80(%rdi),%r12
+ movq 88(%rdi),%r13
+ movq %rax,48(%rdi)
+ movq %rbx,56(%rdi)
+ leaq 64(%rdi),%rdi
+ nop
+ jmp .Lsqrx4x_shift_n_add
+
+.align 32
+.Lsqrx4x_shift_n_add_break:
+ adcxq %r13,%rbx
+ movq %rax,48(%rdi)
+ movq %rbx,56(%rdi)
+ leaq 64(%rdi),%rdi
+.byte 102,72,15,126,213
+__bn_sqrx8x_reduction:
+ xorl %eax,%eax
+ movq 32+8(%rsp),%rbx
+ movq 48+8(%rsp),%rdx
+ leaq -64(%rbp,%r9,1),%rcx
+
+ movq %rcx,0+8(%rsp)
+ movq %rdi,8+8(%rsp)
+
+ leaq 48+8(%rsp),%rdi
+ jmp .Lsqrx8x_reduction_loop
+
+.align 32
+.Lsqrx8x_reduction_loop:
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r12
+ movq %rdx,%r8
+ imulq %rbx,%rdx
+ movq 40(%rdi),%r13
+ movq 48(%rdi),%r14
+ movq 56(%rdi),%r15
+ movq %rax,24+8(%rsp)
+
+ leaq 64(%rdi),%rdi
+ xorq %rsi,%rsi
+ movq $-8,%rcx
+ jmp .Lsqrx8x_reduce
+
+.align 32
+.Lsqrx8x_reduce:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rbx,%rax
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rbx,%r9
+ adcxq %rbx,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rbx,%r10
+ adcxq %rbx,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rbx,%r11
+ adcxq %rbx,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
+ movq %rdx,%rax
+ movq %r8,%rdx
+ adcxq %rbx,%r11
+ adoxq %r13,%r12
+
+ mulxq 32+8(%rsp),%rbx,%rdx
+ movq %rax,%rdx
+ movq %rax,64+48+8(%rsp,%rcx,8)
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rbp),%rax,%r15
+ movq %rbx,%rdx
+ adcxq %rax,%r14
+ adoxq %rsi,%r15
+ adcxq %rsi,%r15
+
+.byte 0x67,0x67,0x67
+ incq %rcx
+ jnz .Lsqrx8x_reduce
+
+ movq %rsi,%rax
+ cmpq 0+8(%rsp),%rbp
+ jae .Lsqrx8x_no_tail
+
+ movq 48+8(%rsp),%rdx
+ addq 0(%rdi),%r8
+ leaq 64(%rbp),%rbp
+ movq $-8,%rcx
+ adcxq 8(%rdi),%r9
+ adcxq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+ sbbq %rax,%rax
+
+ xorq %rsi,%rsi
+ movq %rax,16+8(%rsp)
+ jmp .Lsqrx8x_tail
+
+.align 32
+.Lsqrx8x_tail:
+ movq %r8,%rbx
+ mulxq 0(%rbp),%rax,%r8
+ adcxq %rax,%rbx
+ adoxq %r9,%r8
+
+ mulxq 8(%rbp),%rax,%r9
+ adcxq %rax,%r8
+ adoxq %r10,%r9
+
+ mulxq 16(%rbp),%rax,%r10
+ adcxq %rax,%r9
+ adoxq %r11,%r10
+
+ mulxq 24(%rbp),%rax,%r11
+ adcxq %rax,%r10
+ adoxq %r12,%r11
+
+.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
+ adcxq %rax,%r11
+ adoxq %r13,%r12
+
+ mulxq 40(%rbp),%rax,%r13
+ adcxq %rax,%r12
+ adoxq %r14,%r13
+
+ mulxq 48(%rbp),%rax,%r14
+ adcxq %rax,%r13
+ adoxq %r15,%r14
+
+ mulxq 56(%rbp),%rax,%r15
+ movq 72+48+8(%rsp,%rcx,8),%rdx
+ adcxq %rax,%r14
+ adoxq %rsi,%r15
+ movq %rbx,(%rdi,%rcx,8)
+ movq %r8,%rbx
+ adcxq %rsi,%r15
+
+ incq %rcx
+ jnz .Lsqrx8x_tail
+
+ cmpq 0+8(%rsp),%rbp
+ jae .Lsqrx8x_tail_done
+
+ subq 16+8(%rsp),%rsi
+ movq 48+8(%rsp),%rdx
+ leaq 64(%rbp),%rbp
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ leaq 64(%rdi),%rdi
+ sbbq %rax,%rax
+ subq $8,%rcx
+
+ xorq %rsi,%rsi
+ movq %rax,16+8(%rsp)
+ jmp .Lsqrx8x_tail
+
+.align 32
+.Lsqrx8x_tail_done:
+ xorq %rax,%rax
+ addq 24+8(%rsp),%r8
+ adcq $0,%r9
+ adcq $0,%r10
+ adcq $0,%r11
+ adcq $0,%r12
+ adcq $0,%r13
+ adcq $0,%r14
+ adcq $0,%r15
+ adcq $0,%rax
+
+ subq 16+8(%rsp),%rsi
+.Lsqrx8x_no_tail:
+ adcq 0(%rdi),%r8
+.byte 102,72,15,126,217
+ adcq 8(%rdi),%r9
+ movq 56(%rbp),%rsi
+.byte 102,72,15,126,213
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ adcq $0,%rax
+
+ movq 32+8(%rsp),%rbx
+ movq 64(%rdi,%rcx,1),%rdx
+
+ movq %r8,0(%rdi)
+ leaq 64(%rdi),%r8
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+ leaq 64(%rdi,%rcx,1),%rdi
+ cmpq 8+8(%rsp),%r8
+ jb .Lsqrx8x_reduction_loop
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
+.align 32
+.type __bn_postx4x_internal,@function
+__bn_postx4x_internal:
+.cfi_startproc
+ movq 0(%rbp),%r12
+ movq %rcx,%r10
+ movq %rcx,%r9
+ negq %rax
+ sarq $3+2,%rcx
+
+.byte 102,72,15,126,202
+.byte 102,72,15,126,206
+ decq %r12
+ movq 8(%rbp),%r13
+ xorq %r8,%r8
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+ jmp .Lsqrx4x_sub_entry
+
+.align 16
+.Lsqrx4x_sub:
+ movq 0(%rbp),%r12
+ movq 8(%rbp),%r13
+ movq 16(%rbp),%r14
+ movq 24(%rbp),%r15
+.Lsqrx4x_sub_entry:
+ andnq %rax,%r12,%r12
+ leaq 32(%rbp),%rbp
+ andnq %rax,%r13,%r13
+ andnq %rax,%r14,%r14
+ andnq %rax,%r15,%r15
+
+ negq %r8
+ adcq 0(%rdi),%r12
+ adcq 8(%rdi),%r13
+ adcq 16(%rdi),%r14
+ adcq 24(%rdi),%r15
+ movq %r12,0(%rdx)
+ leaq 32(%rdi),%rdi
+ movq %r13,8(%rdx)
+ sbbq %r8,%r8
+ movq %r14,16(%rdx)
+ movq %r15,24(%rdx)
+ leaq 32(%rdx),%rdx
+
+ incq %rcx
+ jnz .Lsqrx4x_sub
+
+ negq %r9
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size __bn_postx4x_internal,.-__bn_postx4x_internal
+.globl bn_scatter5
+.hidden bn_scatter5
+.type bn_scatter5,@function
+.align 16
+bn_scatter5:
+.cfi_startproc
+ cmpl $0,%esi
+ jz .Lscatter_epilogue
+ leaq (%rdx,%rcx,8),%rdx
+.Lscatter:
+ movq (%rdi),%rax
+ leaq 8(%rdi),%rdi
+ movq %rax,(%rdx)
+ leaq 256(%rdx),%rdx
+ subl $1,%esi
+ jnz .Lscatter
+.Lscatter_epilogue:
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.hidden bn_gather5
+.type bn_gather5,@function
+.align 32
+bn_gather5:
+.cfi_startproc
+.LSEH_begin_bn_gather5:
+
+.byte 0x4c,0x8d,0x14,0x24
+.cfi_def_cfa_register %r10
+.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00
+ leaq .Linc(%rip),%rax
+ andq $-16,%rsp
+
+ movd %ecx,%xmm5
+ movdqa 0(%rax),%xmm0
+ movdqa 16(%rax),%xmm1
+ leaq 128(%rdx),%r11
+ leaq 128(%rsp),%rax
+
+ pshufd $0,%xmm5,%xmm5
+ movdqa %xmm1,%xmm4
+ movdqa %xmm1,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-128(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-112(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-96(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-80(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,-64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,-48(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,-32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,-16(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,16(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,32(%rax)
+ movdqa %xmm4,%xmm2
+ paddd %xmm0,%xmm1
+ pcmpeqd %xmm5,%xmm0
+ movdqa %xmm3,48(%rax)
+ movdqa %xmm4,%xmm3
+
+ paddd %xmm1,%xmm2
+ pcmpeqd %xmm5,%xmm1
+ movdqa %xmm0,64(%rax)
+ movdqa %xmm4,%xmm0
+
+ paddd %xmm2,%xmm3
+ pcmpeqd %xmm5,%xmm2
+ movdqa %xmm1,80(%rax)
+ movdqa %xmm4,%xmm1
+
+ paddd %xmm3,%xmm0
+ pcmpeqd %xmm5,%xmm3
+ movdqa %xmm2,96(%rax)
+ movdqa %xmm4,%xmm2
+ movdqa %xmm3,112(%rax)
+ jmp .Lgather
+
+.align 32
+.Lgather:
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ movdqa -128(%r11),%xmm0
+ movdqa -112(%r11),%xmm1
+ movdqa -96(%r11),%xmm2
+ pand -128(%rax),%xmm0
+ movdqa -80(%r11),%xmm3
+ pand -112(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -80(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa -64(%r11),%xmm0
+ movdqa -48(%r11),%xmm1
+ movdqa -32(%r11),%xmm2
+ pand -64(%rax),%xmm0
+ movdqa -16(%r11),%xmm3
+ pand -48(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand -32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand -16(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 0(%r11),%xmm0
+ movdqa 16(%r11),%xmm1
+ movdqa 32(%r11),%xmm2
+ pand 0(%rax),%xmm0
+ movdqa 48(%r11),%xmm3
+ pand 16(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 32(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 48(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ movdqa 64(%r11),%xmm0
+ movdqa 80(%r11),%xmm1
+ movdqa 96(%r11),%xmm2
+ pand 64(%rax),%xmm0
+ movdqa 112(%r11),%xmm3
+ pand 80(%rax),%xmm1
+ por %xmm0,%xmm4
+ pand 96(%rax),%xmm2
+ por %xmm1,%xmm5
+ pand 112(%rax),%xmm3
+ por %xmm2,%xmm4
+ por %xmm3,%xmm5
+ por %xmm5,%xmm4
+ leaq 256(%r11),%r11
+ pshufd $0x4e,%xmm4,%xmm0
+ por %xmm4,%xmm0
+ movq %xmm0,(%rdi)
+ leaq 8(%rdi),%rdi
+ subl $1,%esi
+ jnz .Lgather
+
+ leaq (%r10),%rsp
+.cfi_def_cfa_register %rsp
+ .byte 0xf3,0xc3
+.LSEH_end_bn_gather5:
+.cfi_endproc
+.size bn_gather5,.-bn_gather5
+.align 64
+.Linc:
+.long 0,0, 1,1
+.long 2,2, 2,2
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+#endif
+.section .note.GNU-stack,"",@progbits
--- /dev/null
+++ b/third_party/boringssl/linux-x86_64/crypto/test/trampoline-x86_64.S
@@ -1,0 +1,518 @@
+// This file is generated from a similarly-named Perl script in the BoringSSL
+// source tree. Do not edit by hand.
+
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
+#define OPENSSL_NO_ASM
+#endif
+#endif
+
+#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
+#if defined(BORINGSSL_PREFIX)
+#include <boringssl_prefix_symbols_asm.h>
+#endif
+.text
+
+
+
+
+
+
+
+
+.type abi_test_trampoline, @function
+.globl abi_test_trampoline
+.hidden abi_test_trampoline
+.align 16
+abi_test_trampoline:
+.Labi_test_trampoline_seh_begin:
+.cfi_startproc
+
+
+
+
+
+
+
+
+
+ subq $120,%rsp
+.cfi_adjust_cfa_offset 120
+.Labi_test_trampoline_seh_prolog_alloc:
+ movq %r8,48(%rsp)
+ movq %rbx,64(%rsp)
+.cfi_offset rbx, -64
+.Labi_test_trampoline_seh_prolog_rbx:
+ movq %rbp,72(%rsp)
+.cfi_offset rbp, -56
+.Labi_test_trampoline_seh_prolog_rbp:
+ movq %r12,80(%rsp)
+.cfi_offset r12, -48
+.Labi_test_trampoline_seh_prolog_r12:
+ movq %r13,88(%rsp)
+.cfi_offset r13, -40
+.Labi_test_trampoline_seh_prolog_r13:
+ movq %r14,96(%rsp)
+.cfi_offset r14, -32
+.Labi_test_trampoline_seh_prolog_r14:
+ movq %r15,104(%rsp)
+.cfi_offset r15, -24
+.Labi_test_trampoline_seh_prolog_r15:
+.Labi_test_trampoline_seh_prolog_end:
+ movq 0(%rsi),%rbx
+ movq 8(%rsi),%rbp
+ movq 16(%rsi),%r12
+ movq 24(%rsi),%r13
+ movq 32(%rsi),%r14
+ movq 40(%rsi),%r15
+
+ movq %rdi,32(%rsp)
+ movq %rsi,40(%rsp)
+
+
+
+
+ movq %rdx,%r10
+ movq %rcx,%r11
+ decq %r11
+ js .Largs_done
+ movq (%r10),%rdi
+ addq $8,%r10
+ decq %r11
+ js .Largs_done
+ movq (%r10),%rsi
+ addq $8,%r10
+ decq %r11
+ js .Largs_done
+ movq (%r10),%rdx
+ addq $8,%r10
+ decq %r11
+ js .Largs_done
+ movq (%r10),%rcx
+ addq $8,%r10
+ decq %r11
+ js .Largs_done
+ movq (%r10),%r8
+ addq $8,%r10
+ decq %r11
+ js .Largs_done
+ movq (%r10),%r9
+ addq $8,%r10
+ leaq 0(%rsp),%rax
+.Largs_loop:
+ decq %r11
+ js .Largs_done
+
+
+
+
+
+
+ movq %r11,56(%rsp)
+ movq (%r10),%r11
+ movq %r11,(%rax)
+ movq 56(%rsp),%r11
+
+ addq $8,%r10
+ addq $8,%rax
+ jmp .Largs_loop
+
+.Largs_done:
+ movq 32(%rsp),%rax
+ movq 48(%rsp),%r10
+ testq %r10,%r10
+ jz .Lno_unwind
+
+
+ pushfq
+ orq $0x100,0(%rsp)
+ popfq
+
+
+
+ nop
+.globl abi_test_unwind_start
+.hidden abi_test_unwind_start
+abi_test_unwind_start:
+
+ call *%rax
+.globl abi_test_unwind_return
+.hidden abi_test_unwind_return
+abi_test_unwind_return:
+
+
+
+
+ pushfq
+ andq $-0x101,0(%rsp)
+ popfq
+.globl abi_test_unwind_stop
+.hidden abi_test_unwind_stop
+abi_test_unwind_stop:
+
+ jmp .Lcall_done
+
+.Lno_unwind:
+ call *%rax
+
+.Lcall_done:
+
+ movq 40(%rsp),%rsi
+ movq %rbx,0(%rsi)
+ movq %rbp,8(%rsi)
+ movq %r12,16(%rsi)
+ movq %r13,24(%rsi)
+ movq %r14,32(%rsi)
+ movq %r15,40(%rsi)
+ movq 64(%rsp),%rbx
+.cfi_restore rbx
+ movq 72(%rsp),%rbp
+.cfi_restore rbp
+ movq 80(%rsp),%r12
+.cfi_restore r12
+ movq 88(%rsp),%r13
+.cfi_restore r13
+ movq 96(%rsp),%r14
+.cfi_restore r14
+ movq 104(%rsp),%r15
+.cfi_restore r15
+ addq $120,%rsp
+.cfi_adjust_cfa_offset -120
+
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.Labi_test_trampoline_seh_end:
+.size abi_test_trampoline,.-abi_test_trampoline
+.type abi_test_clobber_rax, @function
+.globl abi_test_clobber_rax
+.hidden abi_test_clobber_rax
+.align 16
+abi_test_clobber_rax:
+ xorq %rax,%rax
+ .byte 0xf3,0xc3
+.size abi_test_clobber_rax,.-abi_test_clobber_rax
+.type abi_test_clobber_rbx, @function
+.globl abi_test_clobber_rbx
+.hidden abi_test_clobber_rbx
+.align 16
+abi_test_clobber_rbx:
+ xorq %rbx,%rbx
+ .byte 0xf3,0xc3
+.size abi_test_clobber_rbx,.-abi_test_clobber_rbx
+.type abi_test_clobber_rcx, @function
+.globl abi_test_clobber_rcx
+.hidden abi_test_clobber_rcx
+.align 16
+abi_test_clobber_rcx:
+ xorq %rcx,%rcx
+ .byte 0xf3,0xc3
+.size abi_test_clobber_rcx,.-abi_test_clobber_rcx
+.type abi_test_clobber_rdx, @function
+.globl abi_test_clobber_rdx
+.hidden abi_test_clobber_rdx
+.align 16
+abi_test_clobber_rdx:
+ xorq %rdx,%rdx
+ .byte 0xf3,0xc3
+.size abi_test_clobber_rdx,.-abi_test_clobber_rdx
+.type abi_test_clobber_rdi, @function
+.globl abi_test_clobber_rdi
+.hidden abi_test_clobber_rdi
+.align 16
+abi_test_clobber_rdi:
+ xorq %rdi,%rdi
+ .byte 0xf3,0xc3
+.size abi_test_clobber_rdi,.-abi_test_clobber_rdi
+.type abi_test_clobber_rsi, @function
+.globl abi_test_clobber_rsi
+.hidden abi_test_clobber_rsi
+.align 16
+abi_test_clobber_rsi:
+ xorq %rsi,%rsi
+ .byte 0xf3,0xc3
+.size abi_test_clobber_rsi,.-abi_test_clobber_rsi
+.type abi_test_clobber_rbp, @function
+.globl abi_test_clobber_rbp
+.hidden abi_test_clobber_rbp
+.align 16
+abi_test_clobber_rbp:
+ xorq %rbp,%rbp
+ .byte 0xf3,0xc3
+.size abi_test_clobber_rbp,.-abi_test_clobber_rbp
+.type abi_test_clobber_r8, @function
+.globl abi_test_clobber_r8
+.hidden abi_test_clobber_r8
+.align 16
+abi_test_clobber_r8:
+ xorq %r8,%r8
+ .byte 0xf3,0xc3
+.size abi_test_clobber_r8,.-abi_test_clobber_r8
+.type abi_test_clobber_r9, @function
+.globl abi_test_clobber_r9
+.hidden abi_test_clobber_r9
+.align 16
+abi_test_clobber_r9:
+ xorq %r9,%r9
+ .byte 0xf3,0xc3
+.size abi_test_clobber_r9,.-abi_test_clobber_r9
+.type abi_test_clobber_r10, @function
+.globl abi_test_clobber_r10
+.hidden abi_test_clobber_r10
+.align 16
+abi_test_clobber_r10:
+ xorq %r10,%r10
+ .byte 0xf3,0xc3
+.size abi_test_clobber_r10,.-abi_test_clobber_r10
+.type abi_test_clobber_r11, @function
+.globl abi_test_clobber_r11
+.hidden abi_test_clobber_r11
+.align 16
+abi_test_clobber_r11:
+ xorq %r11,%r11
+ .byte 0xf3,0xc3
+.size abi_test_clobber_r11,.-abi_test_clobber_r11
+.type abi_test_clobber_r12, @function
+.globl abi_test_clobber_r12
+.hidden abi_test_clobber_r12
+.align 16
+abi_test_clobber_r12:
+ xorq %r12,%r12
+ .byte 0xf3,0xc3
+.size abi_test_clobber_r12,.-abi_test_clobber_r12
+.type abi_test_clobber_r13, @function
+.globl abi_test_clobber_r13
+.hidden abi_test_clobber_r13
+.align 16
+abi_test_clobber_r13:
+ xorq %r13,%r13
+ .byte 0xf3,0xc3
+.size abi_test_clobber_r13,.-abi_test_clobber_r13
+.type abi_test_clobber_r14, @function
+.globl abi_test_clobber_r14
+.hidden abi_test_clobber_r14
+.align 16
+abi_test_clobber_r14:
+ xorq %r14,%r14
+ .byte 0xf3,0xc3
+.size abi_test_clobber_r14,.-abi_test_clobber_r14
+.type abi_test_clobber_r15, @function
+.globl abi_test_clobber_r15
+.hidden abi_test_clobber_r15
+.align 16
+abi_test_clobber_r15:
+ xorq %r15,%r15
+ .byte 0xf3,0xc3
+.size abi_test_clobber_r15,.-abi_test_clobber_r15
+.type abi_test_clobber_xmm0, @function
+.globl abi_test_clobber_xmm0
+.hidden abi_test_clobber_xmm0
+.align 16
+abi_test_clobber_xmm0:
+ pxor %xmm0,%xmm0
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm0,.-abi_test_clobber_xmm0
+.type abi_test_clobber_xmm1, @function
+.globl abi_test_clobber_xmm1
+.hidden abi_test_clobber_xmm1
+.align 16
+abi_test_clobber_xmm1:
+ pxor %xmm1,%xmm1
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm1,.-abi_test_clobber_xmm1
+.type abi_test_clobber_xmm2, @function
+.globl abi_test_clobber_xmm2
+.hidden abi_test_clobber_xmm2
+.align 16
+abi_test_clobber_xmm2:
+ pxor %xmm2,%xmm2
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm2,.-abi_test_clobber_xmm2
+.type abi_test_clobber_xmm3, @function
+.globl abi_test_clobber_xmm3
+.hidden abi_test_clobber_xmm3
+.align 16
+abi_test_clobber_xmm3:
+ pxor %xmm3,%xmm3
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm3,.-abi_test_clobber_xmm3
+.type abi_test_clobber_xmm4, @function
+.globl abi_test_clobber_xmm4
+.hidden abi_test_clobber_xmm4
+.align 16
+abi_test_clobber_xmm4:
+ pxor %xmm4,%xmm4
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm4,.-abi_test_clobber_xmm4
+.type abi_test_clobber_xmm5, @function
+.globl abi_test_clobber_xmm5
+.hidden abi_test_clobber_xmm5
+.align 16
+abi_test_clobber_xmm5:
+ pxor %xmm5,%xmm5
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm5,.-abi_test_clobber_xmm5
+.type abi_test_clobber_xmm6, @function
+.globl abi_test_clobber_xmm6
+.hidden abi_test_clobber_xmm6
+.align 16
+abi_test_clobber_xmm6:
+ pxor %xmm6,%xmm6
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm6,.-abi_test_clobber_xmm6
+.type abi_test_clobber_xmm7, @function
+.globl abi_test_clobber_xmm7
+.hidden abi_test_clobber_xmm7
+.align 16
+abi_test_clobber_xmm7:
+ pxor %xmm7,%xmm7
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm7,.-abi_test_clobber_xmm7
+.type abi_test_clobber_xmm8, @function
+.globl abi_test_clobber_xmm8
+.hidden abi_test_clobber_xmm8
+.align 16
+abi_test_clobber_xmm8:
+ pxor %xmm8,%xmm8
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm8,.-abi_test_clobber_xmm8
+.type abi_test_clobber_xmm9, @function
+.globl abi_test_clobber_xmm9
+.hidden abi_test_clobber_xmm9
+.align 16
+abi_test_clobber_xmm9:
+ pxor %xmm9,%xmm9
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm9,.-abi_test_clobber_xmm9
+.type abi_test_clobber_xmm10, @function
+.globl abi_test_clobber_xmm10
+.hidden abi_test_clobber_xmm10
+.align 16
+abi_test_clobber_xmm10:
+ pxor %xmm10,%xmm10
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm10,.-abi_test_clobber_xmm10
+.type abi_test_clobber_xmm11, @function
+.globl abi_test_clobber_xmm11
+.hidden abi_test_clobber_xmm11
+.align 16
+abi_test_clobber_xmm11:
+ pxor %xmm11,%xmm11
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm11,.-abi_test_clobber_xmm11
+.type abi_test_clobber_xmm12, @function
+.globl abi_test_clobber_xmm12
+.hidden abi_test_clobber_xmm12
+.align 16
+abi_test_clobber_xmm12:
+ pxor %xmm12,%xmm12
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm12,.-abi_test_clobber_xmm12
+.type abi_test_clobber_xmm13, @function
+.globl abi_test_clobber_xmm13
+.hidden abi_test_clobber_xmm13
+.align 16
+abi_test_clobber_xmm13:
+ pxor %xmm13,%xmm13
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm13,.-abi_test_clobber_xmm13
+.type abi_test_clobber_xmm14, @function
+.globl abi_test_clobber_xmm14
+.hidden abi_test_clobber_xmm14
+.align 16
+abi_test_clobber_xmm14:
+ pxor %xmm14,%xmm14
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm14,.-abi_test_clobber_xmm14
+.type abi_test_clobber_xmm15, @function
+.globl abi_test_clobber_xmm15
+.hidden abi_test_clobber_xmm15
+.align 16
+abi_test_clobber_xmm15:
+ pxor %xmm15,%xmm15
+ .byte 0xf3,0xc3
+.size abi_test_clobber_xmm15,.-abi_test_clobber_xmm15
+
+
+
+.type abi_test_bad_unwind_wrong_register, @function
+.globl abi_test_bad_unwind_wrong_register
+.hidden abi_test_bad_unwind_wrong_register
+.align 16
+abi_test_bad_unwind_wrong_register:
+.cfi_startproc
+.Labi_test_bad_unwind_wrong_register_seh_begin:
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r13,-16
+.Labi_test_bad_unwind_wrong_register_seh_push_r13:
+
+
+
+ nop
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ .byte 0xf3,0xc3
+.Labi_test_bad_unwind_wrong_register_seh_end:
+.cfi_endproc
+.size abi_test_bad_unwind_wrong_register,.-abi_test_bad_unwind_wrong_register
+
+
+
+
+.type abi_test_bad_unwind_temporary, @function
+.globl abi_test_bad_unwind_temporary
+.hidden abi_test_bad_unwind_temporary
+.align 16
+abi_test_bad_unwind_temporary:
+.cfi_startproc
+.Labi_test_bad_unwind_temporary_seh_begin:
+ pushq %r12
+.cfi_adjust_cfa_offset 8
+.cfi_offset %r12,-16
+.Labi_test_bad_unwind_temporary_seh_push_r12:
+
+ movq %r12,%rax
+ incq %rax
+ movq %rax,(%rsp)
+
+
+
+ movq %r12,(%rsp)
+
+
+ popq %r12
+.cfi_adjust_cfa_offset -8
+.cfi_restore %r12
+ .byte 0xf3,0xc3
+.Labi_test_bad_unwind_temporary_seh_end:
+.cfi_endproc
+.size abi_test_bad_unwind_temporary,.-abi_test_bad_unwind_temporary
+
+
+
+
+.type abi_test_set_direction_flag, @function
+.globl abi_test_get_and_clear_direction_flag
+.hidden abi_test_get_and_clear_direction_flag
+abi_test_get_and_clear_direction_flag:
+ pushfq
+ popq %rax
+ andq $0x400,%rax
+ shrq $10,%rax
+ cld
+ .byte 0xf3,0xc3
+.size abi_test_get_and_clear_direction_flag,.-abi_test_get_and_clear_direction_flag
+
+
+
+.type abi_test_set_direction_flag, @function
+.globl abi_test_set_direction_flag
+.hidden abi_test_set_direction_flag
+abi_test_set_direction_flag:
+ std
+ .byte 0xf3,0xc3
+.size abi_test_set_direction_flag,.-abi_test_set_direction_flag
+#endif
+.section .note.GNU-stack,"",@progbits