whiterose

linux unikernel
git clone https://git.ne02ptzero.me/git/whiterose

commit b71acb0e372160167bf6d5500b88b30b52ccef6e
parent e0c38a4d1f196a4b17d2eba36afff8f656a4f1de
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Thu, 27 Dec 2018 13:53:32 -0800

Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6

Pull crypto updates from Herbert Xu:
 "API:
   - Add 1472-byte test to tcrypt for IPsec
   - Reintroduce crypto stats interface with numerous changes
   - Support incremental algorithm dumps

  Algorithms:
   - Add xchacha12/20
   - Add nhpoly1305
   - Add adiantum
   - Add streebog hash
   - Mark cts(cbc(aes)) as FIPS allowed

  Drivers:
   - Improve performance of arm64/chacha20
   - Improve performance of x86/chacha20
   - Add NEON-accelerated nhpoly1305
   - Add SSE2 accelerated nhpoly1305
   - Add AVX2 accelerated nhpoly1305
   - Add support for 192/256-bit keys in gcmaes AVX
   - Add SG support in gcmaes AVX
   - ESN for inline IPsec tx in chcr
   - Add support for CryptoCell 703 in ccree
   - Add support for CryptoCell 713 in ccree
   - Add SM4 support in ccree
   - Add SM3 support in ccree
   - Add support for chacha20 in caam/qi2
   - Add support for chacha20 + poly1305 in caam/jr
   - Add support for chacha20 + poly1305 in caam/qi2
   - Add AEAD cipher support in cavium/nitrox"

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (130 commits)
  crypto: skcipher - remove remnants of internal IV generators
  crypto: cavium/nitrox - Fix build with !CONFIG_DEBUG_FS
  crypto: salsa20-generic - don't unnecessarily use atomic walk
  crypto: skcipher - add might_sleep() to skcipher_walk_virt()
  crypto: x86/chacha - avoid sleeping under kernel_fpu_begin()
  crypto: cavium/nitrox - Added AEAD cipher support
  crypto: mxc-scc - fix build warnings on ARM64
  crypto: api - document missing stats member
  crypto: user - remove unused dump functions
  crypto: chelsio - Fix wrong error counter increments
  crypto: chelsio - Reset counters on cxgb4 Detach
  crypto: chelsio - Handle PCI shutdown event
  crypto: chelsio - cleanup:send addr as value in function argument
  crypto: chelsio - Use same value for both channel in single WR
  crypto: chelsio - Swap location of AAD and IV sent in WR
  crypto: chelsio - remove set but not used variable 'kctx_len'
  crypto: ux500 - Use proper enum in hash_set_dma_transfer
  crypto: ux500 - Use proper enum in cryp_set_dma_transfer
  crypto: aesni - Add scatter/gather avx stubs, and use them in C
  crypto: aesni - Introduce partial block macro
  ..

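As a rough illustration of how the new algorithms listed above (Adiantum, built from XChaCha12, AES and NHPoly1305) are reached through the kernel's skcipher API, a minimal sketch follows. It is not part of this commit: the function name adiantum_demo, the zeroed key/IV and the in-place buffer are assumptions for the example only. The name "adiantum(xchacha12,aes)" is the template instantiation registered by the new crypto/adiantum.c; the same pattern applies to the new xchacha12/xchacha20 ciphers by substituting the algorithm name.

/*
 * Illustrative sketch only (not part of this commit): encrypt a buffer
 * in place with the Adiantum mode added by this merge.
 */
#include <crypto/skcipher.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

static int adiantum_demo(u8 *buf, unsigned int len)
{
	struct crypto_skcipher *tfm;
	struct skcipher_request *req;
	DECLARE_CRYPTO_WAIT(wait);
	struct scatterlist sg;
	u8 key[32] = { 0 };	/* Adiantum takes a 256-bit key; all-zero here for illustration */
	u8 iv[32] = { 0 };	/* Adiantum uses a 32-byte tweak/IV */
	int err;

	tfm = crypto_alloc_skcipher("adiantum(xchacha12,aes)", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_skcipher_setkey(tfm, key, sizeof(key));
	if (err)
		goto out_free_tfm;

	req = skcipher_request_alloc(tfm, GFP_KERNEL);
	if (!req) {
		err = -ENOMEM;
		goto out_free_tfm;
	}

	/* buf must be a linearly mapped (e.g. kmalloc'ed) buffer for sg_init_one(). */
	sg_init_one(&sg, buf, len);
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
				      CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);
	skcipher_request_set_crypt(req, &sg, &sg, len, iv);

	/* Encrypt in place; crypto_wait_req() waits for async completion. */
	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

	skcipher_request_free(req);
out_free_tfm:
	crypto_free_skcipher(tfm);
	return err;
}
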
Diffstat:
MDocumentation/crypto/api.rst | 9---------
MDocumentation/crypto/architecture.rst | 31+++++--------------------------
MDocumentation/devicetree/bindings/crypto/arm-cryptocell.txt | 8++++++--
MDocumentation/devicetree/bindings/crypto/fsl-dcp.txt | 2++
MMAINTAINERS | 9+++++++--
March/arm/crypto/Kconfig | 16+++++++++++++++-
March/arm/crypto/Makefile | 6++++--
March/arm/crypto/aes-ce-glue.c | 1-
March/arm/crypto/aes-cipher-core.S | 62++++++++++++++++++++++++++++++++++++++++++++++++++++----------
Aarch/arm/crypto/chacha-neon-core.S | 560+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/arm/crypto/chacha-neon-glue.c | 201+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Darch/arm/crypto/chacha20-neon-core.S | 530-------------------------------------------------------------------------------
Darch/arm/crypto/chacha20-neon-glue.c | 127-------------------------------------------------------------------------------
Aarch/arm/crypto/nh-neon-core.S | 116+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/arm/crypto/nhpoly1305-neon-glue.c | 77+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/arm64/crypto/Kconfig | 7++++++-
March/arm64/crypto/Makefile | 7+++++--
Aarch/arm64/crypto/chacha-neon-core.S | 844+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/arm64/crypto/chacha-neon-glue.c | 198+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Darch/arm64/crypto/chacha20-neon-core.S | 450-------------------------------------------------------------------------------
Darch/arm64/crypto/chacha20-neon-glue.c | 133-------------------------------------------------------------------------------
Aarch/arm64/crypto/nh-neon-core.S | 103+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/arm64/crypto/nhpoly1305-neon-glue.c | 77+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/s390/crypto/aes_s390.c | 2+-
March/sparc/crypto/aes_glue.c | 5-----
March/sparc/crypto/camellia_glue.c | 5-----
March/sparc/crypto/des_glue.c | 5-----
March/x86/crypto/Makefile | 18+++++++++++++++---
March/x86/crypto/aesni-intel_avx-x86_64.S | 3665++++++++++++++++++++++++++++++++++++++-----------------------------------------
March/x86/crypto/aesni-intel_glue.c | 353++++++++++++++++++++++++-------------------------------------------------------
Aarch/x86/crypto/chacha-avx2-x86_64.S | 1025+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/crypto/chacha-avx512vl-x86_64.S | 836+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/crypto/chacha-ssse3-x86_64.S | 795+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Darch/x86/crypto/chacha20-avx2-x86_64.S | 448-------------------------------------------------------------------------------
Darch/x86/crypto/chacha20-ssse3-x86_64.S | 630-------------------------------------------------------------------------------
Darch/x86/crypto/chacha20_glue.c | 146-------------------------------------------------------------------------------
Aarch/x86/crypto/chacha_glue.c | 304+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/crypto/nh-avx2-x86_64.S | 157+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/crypto/nh-sse2-x86_64.S | 123+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/crypto/nhpoly1305-avx2-glue.c | 77+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aarch/x86/crypto/nhpoly1305-sse2-glue.c | 76++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
March/x86/crypto/poly1305_glue.c | 20+++++++++++---------
Mcrypto/Kconfig | 99++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
Mcrypto/Makefile | 8++++++--
Mcrypto/ablkcipher.c | 94++++++-------------------------------------------------------------------------
Mcrypto/acompress.c | 10+++-------
Acrypto/adiantum.c | 664+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mcrypto/aead.c | 14+++++---------
Mcrypto/aes_generic.c | 9+++++----
Mcrypto/aes_ti.c | 18++++++++++++++++++
Mcrypto/ahash.c | 29++++++++++++++++++-----------
Mcrypto/akcipher.c | 11++++-------
Mcrypto/algapi.c | 247++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mcrypto/blkcipher.c | 20+++++++-------------
Mcrypto/cfb.c | 2+-
Dcrypto/chacha20_generic.c | 137-------------------------------------------------------------------------------
Mcrypto/chacha20poly1305.c | 12+++++-------
Acrypto/chacha_generic.c | 217+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mcrypto/cryptd.c | 4+---
Mcrypto/crypto_user_base.c | 136++++++++++++++++++++++---------------------------------------------------------
Mcrypto/crypto_user_stat.c | 301+++++++++++++++++++++----------------------------------------------------------
Mcrypto/ctr.c | 2--
Mcrypto/ecc.c | 58++++++++++++++++++++++++++++++++++++++--------------------
Mcrypto/hash_info.c | 4++++
Mcrypto/kpp.c | 10+++-------
Mcrypto/lz4.c | 1-
Mcrypto/lz4hc.c | 1-
Acrypto/nhpoly1305.c | 254+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mcrypto/pcrypt.c | 2+-
Mcrypto/poly1305_generic.c | 174+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Mcrypto/rng.c | 16+++++++---------
Mcrypto/salsa20_generic.c | 2+-
Mcrypto/scompress.c | 11++++-------
Mcrypto/shash.c | 12++++--------
Mcrypto/skcipher.c | 23++++++++++-------------
Acrypto/streebog_generic.c | 1140+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mcrypto/tcrypt.c | 59+++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
Mcrypto/testmgr.c | 62++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mcrypto/testmgr.h | 3220++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Mdrivers/block/drbd/drbd_receiver.c | 2+-
Mdrivers/char/hw_random/bcm2835-rng.c | 7++-----
Mdrivers/char/random.c | 51+++++++++++++++++++++++++--------------------------
Mdrivers/crypto/Kconfig | 4+++-
Mdrivers/crypto/amcc/crypto4xx_alg.c | 3+--
Mdrivers/crypto/bcm/cipher.c | 9+++++----
Mdrivers/crypto/caam/caamalg.c | 266++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
Mdrivers/crypto/caam/caamalg_desc.c | 139+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
Mdrivers/crypto/caam/caamalg_desc.h | 5+++++
Mdrivers/crypto/caam/caamalg_qi.c | 37++++++++++++++++++++++++++++---------
Mdrivers/crypto/caam/caamalg_qi2.c | 156+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mdrivers/crypto/caam/caamhash.c | 20+++++++++++++++-----
Mdrivers/crypto/caam/caampkc.c | 10+++++++---
Mdrivers/crypto/caam/caamrng.c | 10+++++++++-
Mdrivers/crypto/caam/compat.h | 2++
Mdrivers/crypto/caam/ctrl.c | 28+++++++++++++++++++++-------
Mdrivers/crypto/caam/desc.h | 28++++++++++++++++++++++++++++
Mdrivers/crypto/caam/desc_constr.h | 7++++++-
Mdrivers/crypto/caam/regs.h | 74++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
Mdrivers/crypto/cavium/nitrox/Makefile | 5++++-
Adrivers/crypto/cavium/nitrox/nitrox_aead.c | 364+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdrivers/crypto/cavium/nitrox/nitrox_algs.c | 456++-----------------------------------------------------------------------------
Mdrivers/crypto/cavium/nitrox/nitrox_common.h | 6+++++-
Mdrivers/crypto/cavium/nitrox/nitrox_csr.h | 12+++++++++++-
Mdrivers/crypto/cavium/nitrox/nitrox_debugfs.c | 48+++++++++---------------------------------------
Adrivers/crypto/cavium/nitrox/nitrox_debugfs.h | 21+++++++++++++++++++++
Mdrivers/crypto/cavium/nitrox/nitrox_dev.h | 74+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------
Mdrivers/crypto/cavium/nitrox/nitrox_hal.c | 114+++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
Mdrivers/crypto/cavium/nitrox/nitrox_hal.h | 2++
Mdrivers/crypto/cavium/nitrox/nitrox_isr.c | 92++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
Mdrivers/crypto/cavium/nitrox/nitrox_isr.h | 2++
Mdrivers/crypto/cavium/nitrox/nitrox_lib.c | 22+++++++++++++++++-----
Mdrivers/crypto/cavium/nitrox/nitrox_main.c | 3+--
Adrivers/crypto/cavium/nitrox/nitrox_mbx.c | 204+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Adrivers/crypto/cavium/nitrox/nitrox_mbx.h | 9+++++++++
Mdrivers/crypto/cavium/nitrox/nitrox_req.h | 326+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Mdrivers/crypto/cavium/nitrox/nitrox_reqmgr.c | 302+++++++++++++++++++++----------------------------------------------------------
Adrivers/crypto/cavium/nitrox/nitrox_skcipher.c | 498+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdrivers/crypto/cavium/nitrox/nitrox_sriov.c | 94+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
Mdrivers/crypto/ccp/ccp-crypto-aes-cmac.c | 4+---
Mdrivers/crypto/ccree/cc_aead.c | 35+++++++++++++++++++++++++++++------
Mdrivers/crypto/ccree/cc_cipher.c | 104++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Mdrivers/crypto/ccree/cc_crypto_ctx.h | 4+++-
Mdrivers/crypto/ccree/cc_driver.c | 50+++++++++++++++++++++++++++++++++++++-------------
Mdrivers/crypto/ccree/cc_driver.h | 15++++++++++++---
Mdrivers/crypto/ccree/cc_hash.c | 189+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
Mdrivers/crypto/ccree/cc_hw_queue_defs.h | 30++++++++++++++++++++++++++++++
Mdrivers/crypto/chelsio/chcr_algo.c | 418++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
Mdrivers/crypto/chelsio/chcr_algo.h | 2+-
Mdrivers/crypto/chelsio/chcr_core.c | 195++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------
Mdrivers/crypto/chelsio/chcr_core.h | 44+++++++++++++++++++++++++++++++++++++-------
Mdrivers/crypto/chelsio/chcr_crypto.h | 10+++-------
Mdrivers/crypto/chelsio/chcr_ipsec.c | 183++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
Mdrivers/crypto/geode-aes.c | 2+-
Mdrivers/crypto/inside-secure/safexcel_cipher.c | 8++++----
Mdrivers/crypto/ixp4xx_crypto.c | 5-----
Mdrivers/crypto/mxc-scc.c | 12++++++------
Mdrivers/crypto/mxs-dcp.c | 28+++++++++++++++++++++++++---
Mdrivers/crypto/nx/nx-aes-ctr.c | 1-
Mdrivers/crypto/omap-aes.c | 3---
Mdrivers/crypto/omap-des.c | 1-
Mdrivers/crypto/picoxcell_crypto.c | 3+--
Mdrivers/crypto/qce/ablkcipher.c | 1-
Mdrivers/crypto/qce/sha.c | 1-
Mdrivers/crypto/sahara.c | 1-
Mdrivers/crypto/talitos.c | 1-
Mdrivers/crypto/ux500/cryp/cryp_core.c | 4++--
Mdrivers/crypto/ux500/hash/hash_core.c | 2+-
Mdrivers/md/dm-crypt.c | 2+-
Mdrivers/md/dm-integrity.c | 2+-
Mdrivers/net/wireless/cisco/airo.c | 2+-
Mdrivers/net/wireless/intersil/orinoco/mic.c | 6++----
Mdrivers/staging/rtl8192e/rtllib_crypt_ccmp.c | 2+-
Mdrivers/staging/rtl8192u/ieee80211/ieee80211_crypt_ccmp.c | 2+-
Mdrivers/usb/wusbcore/crypto.c | 2+-
Mfs/ubifs/auth.c | 5++---
Minclude/crypto/acompress.h | 38++++++++------------------------------
Minclude/crypto/aead.h | 41++++++++---------------------------------
Minclude/crypto/akcipher.h | 74++++++++++++++------------------------------------------------------------
Ainclude/crypto/chacha.h | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dinclude/crypto/chacha20.h | 27---------------------------
Minclude/crypto/hash.h | 32+++++---------------------------
Minclude/crypto/hash_info.h | 1+
Minclude/crypto/internal/cryptouser.h | 9+++++++--
Minclude/crypto/internal/skcipher.h | 2--
Minclude/crypto/kpp.h | 48+++++++++---------------------------------------
Ainclude/crypto/nhpoly1305.h | 74++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/crypto/poly1305.h | 28++++++++++++++++++++++++++--
Minclude/crypto/rng.h | 27+++------------------------
Minclude/crypto/skcipher.h | 49++++++++-----------------------------------------
Ainclude/crypto/streebog.h | 34++++++++++++++++++++++++++++++++++
Minclude/linux/crypto.h | 331+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Minclude/uapi/linux/cryptouser.h | 102+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Minclude/uapi/linux/hash_info.h | 2++
Mkernel/padata.c | 2+-
Mlib/Makefile | 2+-
Alib/chacha.c | 117+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dlib/chacha20.c | 74--------------------------------------------------------------------------
Mnet/bluetooth/smp.c | 8++++----
Mnet/mac80211/wep.c | 4++--
Mnet/wireless/lib80211_crypt_ccmp.c | 2+-
Mnet/wireless/lib80211_crypt_tkip.c | 4++--
Mnet/wireless/lib80211_crypt_wep.c | 4++--
Msecurity/apparmor/crypto.c | 2+-
Msecurity/integrity/evm/evm_crypto.c | 3+--
Msecurity/keys/encrypted-keys/encrypted.c | 4++--
Msecurity/keys/trusted.c | 4++--
Mtools/crypto/getstat.c | 72++++++++++++++++++++++++++++++++++++------------------------------------
187 files changed, 18131 insertions(+), 7382 deletions(-)

diff --git a/Documentation/crypto/api.rst b/Documentation/crypto/api.rst @@ -1,15 +1,6 @@ Programming Interface ===================== -Please note that the kernel crypto API contains the AEAD givcrypt API -(crypto_aead_giv\* and aead_givcrypt\* function calls in -include/crypto/aead.h). This API is obsolete and will be removed in the -future. To obtain the functionality of an AEAD cipher with internal IV -generation, use the IV generator as a regular cipher. For example, -rfc4106(gcm(aes)) is the AEAD cipher with external IV generation and -seqniv(rfc4106(gcm(aes))) implies that the kernel crypto API generates -the IV. Different IV generators are available. - .. class:: toc-title Table of contents diff --git a/Documentation/crypto/architecture.rst b/Documentation/crypto/architecture.rst @@ -157,10 +157,6 @@ applicable to a cipher, it is not displayed: - rng for random number generator - - givcipher for cipher with associated IV generator (see the geniv - entry below for the specification of the IV generator type used by - the cipher implementation) - - kpp for a Key-agreement Protocol Primitive (KPP) cipher such as an ECDH or DH implementation @@ -174,16 +170,7 @@ applicable to a cipher, it is not displayed: - digestsize: output size of the message digest -- geniv: IV generation type: - - - eseqiv for encrypted sequence number based IV generation - - - seqiv for sequence number based IV generation - - - chainiv for chain iv generation - - - <builtin> is a marker that the cipher implements IV generation and - handling as it is specific to the given cipher +- geniv: IV generator (obsolete) Key Sizes --------- @@ -218,10 +205,6 @@ the aforementioned cipher types: - CRYPTO_ALG_TYPE_ABLKCIPHER Asynchronous multi-block cipher -- CRYPTO_ALG_TYPE_GIVCIPHER Asynchronous multi-block cipher packed - together with an IV generator (see geniv field in the /proc/crypto - listing for the known IV generators) - - CRYPTO_ALG_TYPE_KPP Key-agreement Protocol Primitive (KPP) such as an ECDH or DH implementation @@ -338,18 +321,14 @@ uses the API applicable to the cipher type specified for the block. The following call sequence is applicable when the IPSEC layer triggers an encryption operation with the esp_output function. During -configuration, the administrator set up the use of rfc4106(gcm(aes)) as -the cipher for ESP. The following call sequence is now depicted in the -ASCII art above: +configuration, the administrator set up the use of seqiv(rfc4106(gcm(aes))) +as the cipher for ESP. The following call sequence is now depicted in +the ASCII art above: 1. esp_output() invokes crypto_aead_encrypt() to trigger an encryption operation of the AEAD cipher with IV generator. - In case of GCM, the SEQIV implementation is registered as GIVCIPHER - in crypto_rfc4106_alloc(). - - The SEQIV performs its operation to generate an IV where the core - function is seqiv_geniv(). + The SEQIV generates the IV. 2. Now, SEQIV uses the AEAD API function calls to invoke the associated AEAD cipher. In our case, during the instantiation of SEQIV, the diff --git a/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt b/Documentation/devicetree/bindings/crypto/arm-cryptocell.txt @@ -1,8 +1,12 @@ Arm TrustZone CryptoCell cryptographic engine Required properties: -- compatible: Should be one of: "arm,cryptocell-712-ree", - "arm,cryptocell-710-ree" or "arm,cryptocell-630p-ree". 
+- compatible: Should be one of - + "arm,cryptocell-713-ree" + "arm,cryptocell-703-ree" + "arm,cryptocell-712-ree" + "arm,cryptocell-710-ree" + "arm,cryptocell-630p-ree" - reg: Base physical address of the engine and length of memory mapped region. - interrupts: Interrupt number for the device. diff --git a/Documentation/devicetree/bindings/crypto/fsl-dcp.txt b/Documentation/devicetree/bindings/crypto/fsl-dcp.txt @@ -6,6 +6,8 @@ Required properties: - interrupts : Should contain MXS DCP interrupt numbers, VMI IRQ and DCP IRQ must be supplied, optionally Secure IRQ can be present, but is currently not implemented and not used. +- clocks : Clock reference (only required on some SOCs: 6ull and 6sll). +- clock-names : Must be "dcp". Example: diff --git a/MAINTAINERS b/MAINTAINERS @@ -3484,6 +3484,7 @@ F: include/linux/spi/cc2520.h F: Documentation/devicetree/bindings/net/ieee802154/cc2520.txt CCREE ARM TRUSTZONE CRYPTOCELL REE DRIVER +M: Yael Chemla <yael.chemla@foss.arm.com> M: Gilad Ben-Yossef <gilad@benyossef.com> L: linux-crypto@vger.kernel.org S: Supported @@ -7147,7 +7148,9 @@ F: crypto/842.c F: lib/842/ IBM Power in-Nest Crypto Acceleration -M: Paulo Flabiano Smorigo <pfsmorigo@linux.ibm.com> +M: Breno Leitão <leitao@debian.org> +M: Nayna Jain <nayna@linux.ibm.com> +M: Paulo Flabiano Smorigo <pfsmorigo@gmail.com> L: linux-crypto@vger.kernel.org S: Supported F: drivers/crypto/nx/Makefile @@ -7211,7 +7214,9 @@ S: Supported F: drivers/scsi/ibmvscsi_tgt/ IBM Power VMX Cryptographic instructions -M: Paulo Flabiano Smorigo <pfsmorigo@linux.ibm.com> +M: Breno Leitão <leitao@debian.org> +M: Nayna Jain <nayna@linux.ibm.com> +M: Paulo Flabiano Smorigo <pfsmorigo@gmail.com> L: linux-crypto@vger.kernel.org S: Supported F: drivers/crypto/vmx/Makefile diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig @@ -69,6 +69,15 @@ config CRYPTO_AES_ARM help Use optimized AES assembler routines for ARM platforms. + On ARM processors without the Crypto Extensions, this is the + fastest AES implementation for single blocks. For multiple + blocks, the NEON bit-sliced implementation is usually faster. + + This implementation may be vulnerable to cache timing attacks, + since it uses lookup tables. However, as countermeasures it + disables IRQs and preloads the tables; it is hoped this makes + such attacks very difficult. 
+ config CRYPTO_AES_ARM_BS tristate "Bit sliced AES using NEON instructions" depends on KERNEL_MODE_NEON @@ -117,9 +126,14 @@ config CRYPTO_CRC32_ARM_CE select CRYPTO_HASH config CRYPTO_CHACHA20_NEON - tristate "NEON accelerated ChaCha20 symmetric cipher" + tristate "NEON accelerated ChaCha stream cipher algorithms" depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_CHACHA20 +config CRYPTO_NHPOLY1305_NEON + tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)" + depends on KERNEL_MODE_NEON + select CRYPTO_NHPOLY1305 + endif diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile @@ -9,7 +9,8 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o @@ -52,7 +53,8 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o -chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o +chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o +nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o ifdef REGENERATE_ARM_CRYPTO quiet_cmd_perl = PERL $@ diff --git a/arch/arm/crypto/aes-ce-glue.c b/arch/arm/crypto/aes-ce-glue.c @@ -10,7 +10,6 @@ #include <asm/hwcap.h> #include <asm/neon.h> -#include <asm/hwcap.h> #include <crypto/aes.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> diff --git a/arch/arm/crypto/aes-cipher-core.S b/arch/arm/crypto/aes-cipher-core.S @@ -10,6 +10,7 @@ */ #include <linux/linkage.h> +#include <asm/assembler.h> #include <asm/cache.h> .text @@ -41,7 +42,7 @@ .endif .endm - .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op + .macro __hround, out0, out1, in0, in1, in2, in3, t3, t4, enc, sz, op, oldcpsr __select \out0, \in0, 0 __select t0, \in1, 1 __load \out0, \out0, 0, \sz, \op @@ -73,6 +74,14 @@ __load t0, t0, 3, \sz, \op __load \t4, \t4, 3, \sz, \op + .ifnb \oldcpsr + /* + * This is the final round and we're done with all data-dependent table + * lookups, so we can safely re-enable interrupts. 
+ */ + restore_irqs \oldcpsr + .endif + eor \out1, \out1, t1, ror #24 eor \out0, \out0, t2, ror #16 ldm rk!, {t1, t2} @@ -83,14 +92,14 @@ eor \out1, \out1, t2 .endm - .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + .macro fround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr __hround \out0, \out1, \in0, \in1, \in2, \in3, \out2, \out3, 1, \sz, \op - __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op + __hround \out2, \out3, \in2, \in3, \in0, \in1, \in1, \in2, 1, \sz, \op, \oldcpsr .endm - .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op + .macro iround, out0, out1, out2, out3, in0, in1, in2, in3, sz=2, op, oldcpsr __hround \out0, \out1, \in0, \in3, \in2, \in1, \out2, \out3, 0, \sz, \op - __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op + __hround \out2, \out3, \in2, \in1, \in0, \in3, \in1, \in0, 0, \sz, \op, \oldcpsr .endm .macro __rev, out, in @@ -118,13 +127,14 @@ .macro do_crypt, round, ttab, ltab, bsz push {r3-r11, lr} + // Load keys first, to reduce latency in case they're not cached yet. + ldm rk!, {r8-r11} + ldr r4, [in] ldr r5, [in, #4] ldr r6, [in, #8] ldr r7, [in, #12] - ldm rk!, {r8-r11} - #ifdef CONFIG_CPU_BIG_ENDIAN __rev r4, r4 __rev r5, r5 @@ -138,6 +148,25 @@ eor r7, r7, r11 __adrl ttab, \ttab + /* + * Disable interrupts and prefetch the 1024-byte 'ft' or 'it' table into + * L1 cache, assuming cacheline size >= 32. This is a hardening measure + * intended to make cache-timing attacks more difficult. They may not + * be fully prevented, however; see the paper + * https://cr.yp.to/antiforgery/cachetiming-20050414.pdf + * ("Cache-timing attacks on AES") for a discussion of the many + * difficulties involved in writing truly constant-time AES software. + */ + save_and_disable_irqs t0 + .set i, 0 + .rept 1024 / 128 + ldr r8, [ttab, #i + 0] + ldr r9, [ttab, #i + 32] + ldr r10, [ttab, #i + 64] + ldr r11, [ttab, #i + 96] + .set i, i + 128 + .endr + push {t0} // oldcpsr tst rounds, #2 bne 1f @@ -151,8 +180,21 @@ \round r4, r5, r6, r7, r8, r9, r10, r11 b 0b -2: __adrl ttab, \ltab - \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b +2: .ifb \ltab + add ttab, ttab, #1 + .else + __adrl ttab, \ltab + // Prefetch inverse S-box for final round; see explanation above + .set i, 0 + .rept 256 / 64 + ldr t0, [ttab, #i + 0] + ldr t1, [ttab, #i + 32] + .set i, i + 64 + .endr + .endif + + pop {rounds} // oldcpsr + \round r4, r5, r6, r7, r8, r9, r10, r11, \bsz, b, rounds #ifdef CONFIG_CPU_BIG_ENDIAN __rev r4, r4 @@ -175,7 +217,7 @@ .endm ENTRY(__aes_arm_encrypt) - do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2 + do_crypt fround, crypto_ft_tab,, 2 ENDPROC(__aes_arm_encrypt) .align 5 diff --git a/arch/arm/crypto/chacha-neon-core.S b/arch/arm/crypto/chacha-neon-core.S @@ -0,0 +1,560 @@ +/* + * ChaCha/XChaCha NEON helper functions + * + * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + + /* + * NEON doesn't have a rotate instruction. The alternatives are, more or less: + * + * (a) vshl.u32 + vsri.u32 (needs temporary register) + * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register) + * (c) vrev32.16 (16-bit rotations only) + * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only, + * needs index vector) + * + * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations, + * the only choices are (a) and (b). We use (a) since it takes two-thirds the + * cycles of (b) on both Cortex-A7 and Cortex-A53. + * + * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest + * and doesn't need a temporary register. + * + * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence + * is twice as fast as (a), even when doing (a) on multiple registers + * simultaneously to eliminate the stall between vshl and vsri. Also, it + * parallelizes better when temporary registers are scarce. + * + * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as + * (a), so the need to load the rotation table actually makes the vtbl method + * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it + * seems to be a good compromise to get a more significant speed boost on some + * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7. + */ + +#include <linux/linkage.h> + + .text + .fpu neon + .align 5 + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers q0-q3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * The round count is given in r3. + * + * Clobbers: r3, ip, q4-q5 + */ +chacha_permute: + + adr ip, .Lrol8_table + vld1.8 {d10}, [ip, :64] + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vrev32.16 q3, q3 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #12 + vsri.u32 q1, q4, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #7 + vsri.u32 q1, q4, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vext.8 q1, q1, q1, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vext.8 q2, q2, q2, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vext.8 q3, q3, q3, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vrev32.16 q3, q3 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #12 + vsri.u32 q1, q4, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vadd.i32 q0, q0, q1 + veor q3, q3, q0 + vtbl.8 d6, {d6}, d10 + vtbl.8 d7, {d7}, d10 + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vadd.i32 q2, q2, q3 + veor q4, q1, q2 + vshl.u32 q1, q4, #7 + vsri.u32 q1, q4, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vext.8 q1, q1, q1, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vext.8 q2, q2, q2, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vext.8 q3, q3, q3, #4 + + subs r3, r3, #2 + bne .Ldoubleround + + bx lr +ENDPROC(chacha_permute) + +ENTRY(chacha_block_xor_neon) + // r0: Input state matrix, s + // r1: 1 data block output, o + // r2: 1 data block input, i + // r3: nrounds + push {lr} + + // x0..3 = s0..3 + add ip, r0, #0x20 + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + + vmov q8, q0 + vmov q9, 
q1 + vmov q10, q2 + vmov q11, q3 + + bl chacha_permute + + add ip, r2, #0x20 + vld1.8 {q4-q5}, [r2] + vld1.8 {q6-q7}, [ip] + + // o0 = i0 ^ (x0 + s0) + vadd.i32 q0, q0, q8 + veor q0, q0, q4 + + // o1 = i1 ^ (x1 + s1) + vadd.i32 q1, q1, q9 + veor q1, q1, q5 + + // o2 = i2 ^ (x2 + s2) + vadd.i32 q2, q2, q10 + veor q2, q2, q6 + + // o3 = i3 ^ (x3 + s3) + vadd.i32 q3, q3, q11 + veor q3, q3, q7 + + add ip, r1, #0x20 + vst1.8 {q0-q1}, [r1] + vst1.8 {q2-q3}, [ip] + + pop {pc} +ENDPROC(chacha_block_xor_neon) + +ENTRY(hchacha_block_neon) + // r0: Input state matrix, s + // r1: output (8 32-bit words) + // r2: nrounds + push {lr} + + vld1.32 {q0-q1}, [r0]! + vld1.32 {q2-q3}, [r0] + + mov r3, r2 + bl chacha_permute + + vst1.32 {q0}, [r1]! + vst1.32 {q3}, [r1] + + pop {pc} +ENDPROC(hchacha_block_neon) + + .align 4 +.Lctrinc: .word 0, 1, 2, 3 +.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 + + .align 5 +ENTRY(chacha_4block_xor_neon) + push {r4-r5} + mov r4, sp // preserve the stack pointer + sub ip, sp, #0x20 // allocate a 32 byte buffer + bic ip, ip, #0x1f // aligned to 32 bytes + mov sp, ip + + // r0: Input state matrix, s + // r1: 4 data blocks output, o + // r2: 4 data blocks input, i + // r3: nrounds + + // + // This function encrypts four consecutive ChaCha blocks by loading + // the state matrix in NEON registers four times. The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. The words are re-interleaved before the + // final addition of the original state and the XORing step. + // + + // x0..15[0-3] = s0..15[0-3] + add ip, r0, #0x20 + vld1.32 {q0-q1}, [r0] + vld1.32 {q2-q3}, [ip] + + adr r5, .Lctrinc + vdup.32 q15, d7[1] + vdup.32 q14, d7[0] + vld1.32 {q4}, [r5, :128] + vdup.32 q13, d6[1] + vdup.32 q12, d6[0] + vdup.32 q11, d5[1] + vdup.32 q10, d5[0] + vadd.u32 q12, q12, q4 // x12 += counter values 0-3 + vdup.32 q9, d4[1] + vdup.32 q8, d4[0] + vdup.32 q7, d3[1] + vdup.32 q6, d3[0] + vdup.32 q5, d2[1] + vdup.32 q4, d2[0] + vdup.32 q3, d1[1] + vdup.32 q2, d1[0] + vdup.32 q1, d0[1] + vdup.32 q0, d0[0] + + adr ip, .Lrol8_table + b 1f + +.Ldoubleround4: + vld1.32 {q8-q9}, [sp, :256] +1: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vadd.i32 q0, q0, q4 + vadd.i32 q1, q1, q5 + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + + veor q12, q12, q0 + veor q13, q13, q1 + veor q14, q14, q2 + veor q15, q15, q3 + + vrev32.16 q12, q12 + vrev32.16 q13, q13 + vrev32.16 q14, q14 + vrev32.16 q15, q15 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i32 q10, q10, q14 + vadd.i32 q11, q11, q15 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q4, q8 + veor q9, q5, q9 + vshl.u32 q4, q8, #12 + vshl.u32 q5, q9, #12 + vsri.u32 q4, q8, #20 + vsri.u32 q5, q9, #20 + + veor q8, q6, q10 + veor q9, q7, q11 + vshl.u32 q6, q8, #12 + vshl.u32 q7, q9, #12 + vsri.u32 q6, q8, #20 + vsri.u32 q7, q9, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vld1.8 {d16}, [ip, :64] + vadd.i32 q0, q0, q4 + vadd.i32 q1, q1, q5 + vadd.i32 q2, q2, q6 + vadd.i32 q3, q3, q7 + + veor q12, q12, q0 + veor q13, q13, q1 + veor q14, q14, q2 + veor q15, q15, q3 + + vtbl.8 d24, 
{d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 + + vld1.32 {q8-q9}, [sp, :256] + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vadd.i32 q8, q8, q12 + vadd.i32 q9, q9, q13 + vadd.i32 q10, q10, q14 + vadd.i32 q11, q11, q15 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q4, q8 + veor q9, q5, q9 + vshl.u32 q4, q8, #7 + vshl.u32 q5, q9, #7 + vsri.u32 q4, q8, #25 + vsri.u32 q5, q9, #25 + + veor q8, q6, q10 + veor q9, q7, q11 + vshl.u32 q6, q8, #7 + vshl.u32 q7, q9, #7 + vsri.u32 q6, q8, #25 + vsri.u32 q7, q9, #25 + + vld1.32 {q8-q9}, [sp, :256] + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vadd.i32 q0, q0, q5 + vadd.i32 q1, q1, q6 + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q4 + + veor q15, q15, q0 + veor q12, q12, q1 + veor q13, q13, q2 + veor q14, q14, q3 + + vrev32.16 q15, q15 + vrev32.16 q12, q12 + vrev32.16 q13, q13 + vrev32.16 q14, q14 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vadd.i32 q10, q10, q15 + vadd.i32 q11, q11, q12 + vadd.i32 q8, q8, q13 + vadd.i32 q9, q9, q14 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q7, q8 + veor q9, q4, q9 + vshl.u32 q7, q8, #12 + vshl.u32 q4, q9, #12 + vsri.u32 q7, q8, #20 + vsri.u32 q4, q9, #20 + + veor q8, q5, q10 + veor q9, q6, q11 + vshl.u32 q5, q8, #12 + vshl.u32 q6, q9, #12 + vsri.u32 q5, q8, #20 + vsri.u32 q6, q9, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vld1.8 {d16}, [ip, :64] + vadd.i32 q0, q0, q5 + vadd.i32 q1, q1, q6 + vadd.i32 q2, q2, q7 + vadd.i32 q3, q3, q4 + + veor q15, q15, q0 + veor q12, q12, q1 + veor q13, q13, q2 + veor q14, q14, q3 + + vtbl.8 d30, {d30}, d16 + vtbl.8 d31, {d31}, d16 + vtbl.8 d24, {d24}, d16 + vtbl.8 d25, {d25}, d16 + vtbl.8 d26, {d26}, d16 + vtbl.8 d27, {d27}, d16 + vtbl.8 d28, {d28}, d16 + vtbl.8 d29, {d29}, d16 + + vld1.32 {q8-q9}, [sp, :256] + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vadd.i32 q10, q10, q15 + vadd.i32 q11, q11, q12 + vadd.i32 q8, q8, q13 + vadd.i32 q9, q9, q14 + + vst1.32 {q8-q9}, [sp, :256] + + veor q8, q7, q8 + veor q9, q4, q9 + vshl.u32 q7, q8, #7 + vshl.u32 q4, q9, #7 + vsri.u32 q7, q8, #25 + vsri.u32 q4, q9, #25 + + veor q8, q5, q10 + veor q9, q6, q11 + vshl.u32 q5, q8, #7 + vshl.u32 q6, q9, #7 + vsri.u32 q5, q8, #25 + vsri.u32 q6, q9, #25 + + subs r3, r3, #2 + bne .Ldoubleround4 + + // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15. + // x8..9[0-3] are on the stack. + + // Re-interleave the words in the first two rows of each block (x0..7). + // Also add the counter values 0-3 to x12[0-3]. + vld1.32 {q8}, [r5, :128] // load counter values 0-3 + vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) + vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) + vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) + vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7) + vadd.u32 q12, q8 // x12 += counter values 0-3 + vswp d1, d4 + vswp d3, d6 + vld1.32 {q8-q9}, [r0]! 
// load s0..7 + vswp d9, d12 + vswp d11, d14 + + // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1) + // after XORing the first 32 bytes. + vswp q1, q4 + + // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7) + + // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block) + vadd.u32 q0, q0, q8 + vadd.u32 q2, q2, q8 + vadd.u32 q4, q4, q8 + vadd.u32 q3, q3, q8 + + // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block) + vadd.u32 q1, q1, q9 + vadd.u32 q6, q6, q9 + vadd.u32 q5, q5, q9 + vadd.u32 q7, q7, q9 + + // XOR first 32 bytes using keystream from first two rows of first block + vld1.8 {q8-q9}, [r2]! + veor q8, q8, q0 + veor q9, q9, q1 + vst1.8 {q8-q9}, [r1]! + + // Re-interleave the words in the last two rows of each block (x8..15). + vld1.32 {q8-q9}, [sp, :256] + vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) + vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) + vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) + vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11) + vld1.32 {q0-q1}, [r0] // load s8..15 + vswp d25, d28 + vswp d27, d30 + vswp d17, d20 + vswp d19, d22 + + // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15) + + // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block) + vadd.u32 q8, q8, q0 + vadd.u32 q10, q10, q0 + vadd.u32 q9, q9, q0 + vadd.u32 q11, q11, q0 + + // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block) + vadd.u32 q12, q12, q1 + vadd.u32 q14, q14, q1 + vadd.u32 q13, q13, q1 + vadd.u32 q15, q15, q1 + + // XOR the rest of the data with the keystream + + vld1.8 {q0-q1}, [r2]! + veor q0, q0, q8 + veor q1, q1, q12 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + veor q0, q0, q2 + veor q1, q1, q6 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + veor q0, q0, q10 + veor q1, q1, q14 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + veor q0, q0, q4 + veor q1, q1, q5 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + veor q0, q0, q9 + veor q1, q1, q13 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2]! + veor q0, q0, q3 + veor q1, q1, q7 + vst1.8 {q0-q1}, [r1]! + + vld1.8 {q0-q1}, [r2] + mov sp, r4 // restore original stack pointer + veor q0, q0, q11 + veor q1, q1, q15 + vst1.8 {q0-q1}, [r1] + + pop {r4-r5} + bx lr +ENDPROC(chacha_4block_xor_neon) diff --git a/arch/arm/crypto/chacha-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c @@ -0,0 +1,201 @@ +/* + * ARM NEON accelerated ChaCha and XChaCha stream ciphers, + * including ChaCha20 (RFC7539) + * + * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include <crypto/algapi.h> +#include <crypto/chacha.h> +#include <crypto/internal/skcipher.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src, + int nrounds); +asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src, + int nrounds); +asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); + +static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, + unsigned int bytes, int nrounds) +{ + u8 buf[CHACHA_BLOCK_SIZE]; + + while (bytes >= CHACHA_BLOCK_SIZE * 4) { + chacha_4block_xor_neon(state, dst, src, nrounds); + bytes -= CHACHA_BLOCK_SIZE * 4; + src += CHACHA_BLOCK_SIZE * 4; + dst += CHACHA_BLOCK_SIZE * 4; + state[12] += 4; + } + while (bytes >= CHACHA_BLOCK_SIZE) { + chacha_block_xor_neon(state, dst, src, nrounds); + bytes -= CHACHA_BLOCK_SIZE; + src += CHACHA_BLOCK_SIZE; + dst += CHACHA_BLOCK_SIZE; + state[12]++; + } + if (bytes) { + memcpy(buf, src, bytes); + chacha_block_xor_neon(state, buf, buf, nrounds); + memcpy(dst, buf, bytes); + } +} + +static int chacha_neon_stream_xor(struct skcipher_request *req, + struct chacha_ctx *ctx, u8 *iv) +{ + struct skcipher_walk walk; + u32 state[16]; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + crypto_chacha_init(state, ctx, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); + + kernel_neon_begin(); + chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes, ctx->nrounds); + kernel_neon_end(); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +static int chacha_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_chacha_crypt(req); + + return chacha_neon_stream_xor(req, ctx, req->iv); +} + +static int xchacha_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 state[16]; + u8 real_iv[16]; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_xchacha_crypt(req); + + crypto_chacha_init(state, ctx, req->iv); + + kernel_neon_begin(); + hchacha_block_neon(state, subctx.key, ctx->nrounds); + kernel_neon_end(); + subctx.nrounds = ctx->nrounds; + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + return chacha_neon_stream_xor(req, &subctx, real_iv); +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha_neon, + .decrypt = chacha_neon, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = 
CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = xchacha_neon, + .decrypt = xchacha_neon, + }, { + .base.cra_name = "xchacha12", + .base.cra_driver_name = "xchacha12-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 4 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha12_setkey, + .encrypt = xchacha_neon, + .decrypt = xchacha_neon, + } +}; + +static int __init chacha_simd_mod_init(void) +{ + if (!(elf_hwcap & HWCAP_NEON)) + return -ENODEV; + + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); +} + +static void __exit chacha_simd_mod_fini(void) +{ + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); +} + +module_init(chacha_simd_mod_init); +module_exit(chacha_simd_mod_fini); + +MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha12"); +MODULE_ALIAS_CRYPTO("xchacha12-neon"); diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S @@ -1,530 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions - * - * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - - /* - * NEON doesn't have a rotate instruction. The alternatives are, more or less: - * - * (a) vshl.u32 + vsri.u32 (needs temporary register) - * (b) vshl.u32 + vshr.u32 + vorr (needs temporary register) - * (c) vrev32.16 (16-bit rotations only) - * (d) vtbl.8 + vtbl.8 (multiple of 8 bits rotations only, - * needs index vector) - * - * ChaCha20 has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit - * rotations, the only choices are (a) and (b). We use (a) since it takes - * two-thirds the cycles of (b) on both Cortex-A7 and Cortex-A53. - * - * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest - * and doesn't need a temporary register. - * - * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence - * is twice as fast as (a), even when doing (a) on multiple registers - * simultaneously to eliminate the stall between vshl and vsri. Also, it - * parallelizes better when temporary registers are scarce. - * - * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as - * (a), so the need to load the rotation table actually makes the vtbl method - * slightly slower overall on that CPU (~1.3% slower ChaCha20). 
Still, it - * seems to be a good compromise to get a more significant speed boost on some - * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7. - */ - -#include <linux/linkage.h> - - .text - .fpu neon - .align 5 - -ENTRY(chacha20_block_xor_neon) - // r0: Input state matrix, s - // r1: 1 data block output, o - // r2: 1 data block input, i - - // - // This function encrypts one ChaCha20 block by loading the state matrix - // in four NEON registers. It performs matrix operation on four words in - // parallel, but requireds shuffling to rearrange the words after each - // round. - // - - // x0..3 = s0..3 - add ip, r0, #0x20 - vld1.32 {q0-q1}, [r0] - vld1.32 {q2-q3}, [ip] - - vmov q8, q0 - vmov q9, q1 - vmov q10, q2 - vmov q11, q3 - - adr ip, .Lrol8_table - mov r3, #10 - vld1.8 {d10}, [ip, :64] - -.Ldoubleround: - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vrev32.16 q3, q3 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #12 - vsri.u32 q1, q4, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vtbl.8 d6, {d6}, d10 - vtbl.8 d7, {d7}, d10 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #7 - vsri.u32 q1, q4, #25 - - // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - vext.8 q1, q1, q1, #4 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vext.8 q2, q2, q2, #8 - // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - vext.8 q3, q3, q3, #12 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vrev32.16 q3, q3 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #12 - vsri.u32 q1, q4, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - vadd.i32 q0, q0, q1 - veor q3, q3, q0 - vtbl.8 d6, {d6}, d10 - vtbl.8 d7, {d7}, d10 - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - vadd.i32 q2, q2, q3 - veor q4, q1, q2 - vshl.u32 q1, q4, #7 - vsri.u32 q1, q4, #25 - - // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - vext.8 q1, q1, q1, #12 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - vext.8 q2, q2, q2, #8 - // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - vext.8 q3, q3, q3, #4 - - subs r3, r3, #1 - bne .Ldoubleround - - add ip, r2, #0x20 - vld1.8 {q4-q5}, [r2] - vld1.8 {q6-q7}, [ip] - - // o0 = i0 ^ (x0 + s0) - vadd.i32 q0, q0, q8 - veor q0, q0, q4 - - // o1 = i1 ^ (x1 + s1) - vadd.i32 q1, q1, q9 - veor q1, q1, q5 - - // o2 = i2 ^ (x2 + s2) - vadd.i32 q2, q2, q10 - veor q2, q2, q6 - - // o3 = i3 ^ (x3 + s3) - vadd.i32 q3, q3, q11 - veor q3, q3, q7 - - add ip, r1, #0x20 - vst1.8 {q0-q1}, [r1] - vst1.8 {q2-q3}, [ip] - - bx lr -ENDPROC(chacha20_block_xor_neon) - - .align 4 -.Lctrinc: .word 0, 1, 2, 3 -.Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6 - - .align 5 -ENTRY(chacha20_4block_xor_neon) - push {r4-r5} - mov r4, sp // preserve the stack pointer - sub ip, sp, #0x20 // allocate a 32 byte buffer - bic ip, ip, #0x1f // aligned to 32 bytes - mov sp, ip - - // r0: Input state matrix, s - // r1: 4 data blocks output, o - // r2: 4 data blocks input, i - - // - // This function encrypts four consecutive ChaCha20 blocks by loading - // the state matrix in NEON registers four times. The algorithm performs - // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. The words are re-interleaved before the - // final addition of the original state and the XORing step. 
- // - - // x0..15[0-3] = s0..15[0-3] - add ip, r0, #0x20 - vld1.32 {q0-q1}, [r0] - vld1.32 {q2-q3}, [ip] - - adr r5, .Lctrinc - vdup.32 q15, d7[1] - vdup.32 q14, d7[0] - vld1.32 {q4}, [r5, :128] - vdup.32 q13, d6[1] - vdup.32 q12, d6[0] - vdup.32 q11, d5[1] - vdup.32 q10, d5[0] - vadd.u32 q12, q12, q4 // x12 += counter values 0-3 - vdup.32 q9, d4[1] - vdup.32 q8, d4[0] - vdup.32 q7, d3[1] - vdup.32 q6, d3[0] - vdup.32 q5, d2[1] - vdup.32 q4, d2[0] - vdup.32 q3, d1[1] - vdup.32 q2, d1[0] - vdup.32 q1, d0[1] - vdup.32 q0, d0[0] - - adr ip, .Lrol8_table - mov r3, #10 - b 1f - -.Ldoubleround4: - vld1.32 {q8-q9}, [sp, :256] -1: - // x0 += x4, x12 = rotl32(x12 ^ x0, 16) - // x1 += x5, x13 = rotl32(x13 ^ x1, 16) - // x2 += x6, x14 = rotl32(x14 ^ x2, 16) - // x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vadd.i32 q0, q0, q4 - vadd.i32 q1, q1, q5 - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - - veor q12, q12, q0 - veor q13, q13, q1 - veor q14, q14, q2 - veor q15, q15, q3 - - vrev32.16 q12, q12 - vrev32.16 q13, q13 - vrev32.16 q14, q14 - vrev32.16 q15, q15 - - // x8 += x12, x4 = rotl32(x4 ^ x8, 12) - // x9 += x13, x5 = rotl32(x5 ^ x9, 12) - // x10 += x14, x6 = rotl32(x6 ^ x10, 12) - // x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vadd.i32 q8, q8, q12 - vadd.i32 q9, q9, q13 - vadd.i32 q10, q10, q14 - vadd.i32 q11, q11, q15 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q4, q8 - veor q9, q5, q9 - vshl.u32 q4, q8, #12 - vshl.u32 q5, q9, #12 - vsri.u32 q4, q8, #20 - vsri.u32 q5, q9, #20 - - veor q8, q6, q10 - veor q9, q7, q11 - vshl.u32 q6, q8, #12 - vshl.u32 q7, q9, #12 - vsri.u32 q6, q8, #20 - vsri.u32 q7, q9, #20 - - // x0 += x4, x12 = rotl32(x12 ^ x0, 8) - // x1 += x5, x13 = rotl32(x13 ^ x1, 8) - // x2 += x6, x14 = rotl32(x14 ^ x2, 8) - // x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vld1.8 {d16}, [ip, :64] - vadd.i32 q0, q0, q4 - vadd.i32 q1, q1, q5 - vadd.i32 q2, q2, q6 - vadd.i32 q3, q3, q7 - - veor q12, q12, q0 - veor q13, q13, q1 - veor q14, q14, q2 - veor q15, q15, q3 - - vtbl.8 d24, {d24}, d16 - vtbl.8 d25, {d25}, d16 - vtbl.8 d26, {d26}, d16 - vtbl.8 d27, {d27}, d16 - vtbl.8 d28, {d28}, d16 - vtbl.8 d29, {d29}, d16 - vtbl.8 d30, {d30}, d16 - vtbl.8 d31, {d31}, d16 - - vld1.32 {q8-q9}, [sp, :256] - - // x8 += x12, x4 = rotl32(x4 ^ x8, 7) - // x9 += x13, x5 = rotl32(x5 ^ x9, 7) - // x10 += x14, x6 = rotl32(x6 ^ x10, 7) - // x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vadd.i32 q8, q8, q12 - vadd.i32 q9, q9, q13 - vadd.i32 q10, q10, q14 - vadd.i32 q11, q11, q15 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q4, q8 - veor q9, q5, q9 - vshl.u32 q4, q8, #7 - vshl.u32 q5, q9, #7 - vsri.u32 q4, q8, #25 - vsri.u32 q5, q9, #25 - - veor q8, q6, q10 - veor q9, q7, q11 - vshl.u32 q6, q8, #7 - vshl.u32 q7, q9, #7 - vsri.u32 q6, q8, #25 - vsri.u32 q7, q9, #25 - - vld1.32 {q8-q9}, [sp, :256] - - // x0 += x5, x15 = rotl32(x15 ^ x0, 16) - // x1 += x6, x12 = rotl32(x12 ^ x1, 16) - // x2 += x7, x13 = rotl32(x13 ^ x2, 16) - // x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vadd.i32 q0, q0, q5 - vadd.i32 q1, q1, q6 - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q4 - - veor q15, q15, q0 - veor q12, q12, q1 - veor q13, q13, q2 - veor q14, q14, q3 - - vrev32.16 q15, q15 - vrev32.16 q12, q12 - vrev32.16 q13, q13 - vrev32.16 q14, q14 - - // x10 += x15, x5 = rotl32(x5 ^ x10, 12) - // x11 += x12, x6 = rotl32(x6 ^ x11, 12) - // x8 += x13, x7 = rotl32(x7 ^ x8, 12) - // x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vadd.i32 q10, q10, q15 - vadd.i32 q11, q11, q12 - vadd.i32 q8, q8, q13 - vadd.i32 q9, q9, q14 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q7, q8 - veor q9, q4, 
q9 - vshl.u32 q7, q8, #12 - vshl.u32 q4, q9, #12 - vsri.u32 q7, q8, #20 - vsri.u32 q4, q9, #20 - - veor q8, q5, q10 - veor q9, q6, q11 - vshl.u32 q5, q8, #12 - vshl.u32 q6, q9, #12 - vsri.u32 q5, q8, #20 - vsri.u32 q6, q9, #20 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 8) - // x1 += x6, x12 = rotl32(x12 ^ x1, 8) - // x2 += x7, x13 = rotl32(x13 ^ x2, 8) - // x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vld1.8 {d16}, [ip, :64] - vadd.i32 q0, q0, q5 - vadd.i32 q1, q1, q6 - vadd.i32 q2, q2, q7 - vadd.i32 q3, q3, q4 - - veor q15, q15, q0 - veor q12, q12, q1 - veor q13, q13, q2 - veor q14, q14, q3 - - vtbl.8 d30, {d30}, d16 - vtbl.8 d31, {d31}, d16 - vtbl.8 d24, {d24}, d16 - vtbl.8 d25, {d25}, d16 - vtbl.8 d26, {d26}, d16 - vtbl.8 d27, {d27}, d16 - vtbl.8 d28, {d28}, d16 - vtbl.8 d29, {d29}, d16 - - vld1.32 {q8-q9}, [sp, :256] - - // x10 += x15, x5 = rotl32(x5 ^ x10, 7) - // x11 += x12, x6 = rotl32(x6 ^ x11, 7) - // x8 += x13, x7 = rotl32(x7 ^ x8, 7) - // x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vadd.i32 q10, q10, q15 - vadd.i32 q11, q11, q12 - vadd.i32 q8, q8, q13 - vadd.i32 q9, q9, q14 - - vst1.32 {q8-q9}, [sp, :256] - - veor q8, q7, q8 - veor q9, q4, q9 - vshl.u32 q7, q8, #7 - vshl.u32 q4, q9, #7 - vsri.u32 q7, q8, #25 - vsri.u32 q4, q9, #25 - - veor q8, q5, q10 - veor q9, q6, q11 - vshl.u32 q5, q8, #7 - vshl.u32 q6, q9, #7 - vsri.u32 q5, q8, #25 - vsri.u32 q6, q9, #25 - - subs r3, r3, #1 - bne .Ldoubleround4 - - // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15. - // x8..9[0-3] are on the stack. - - // Re-interleave the words in the first two rows of each block (x0..7). - // Also add the counter values 0-3 to x12[0-3]. - vld1.32 {q8}, [r5, :128] // load counter values 0-3 - vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1) - vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3) - vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5) - vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7) - vadd.u32 q12, q8 // x12 += counter values 0-3 - vswp d1, d4 - vswp d3, d6 - vld1.32 {q8-q9}, [r0]! // load s0..7 - vswp d9, d12 - vswp d11, d14 - - // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1) - // after XORing the first 32 bytes. - vswp q1, q4 - - // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7) - - // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block) - vadd.u32 q0, q0, q8 - vadd.u32 q2, q2, q8 - vadd.u32 q4, q4, q8 - vadd.u32 q3, q3, q8 - - // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block) - vadd.u32 q1, q1, q9 - vadd.u32 q6, q6, q9 - vadd.u32 q5, q5, q9 - vadd.u32 q7, q7, q9 - - // XOR first 32 bytes using keystream from first two rows of first block - vld1.8 {q8-q9}, [r2]! - veor q8, q8, q0 - veor q9, q9, q1 - vst1.8 {q8-q9}, [r1]! - - // Re-interleave the words in the last two rows of each block (x8..15). 
- vld1.32 {q8-q9}, [sp, :256] - vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13) - vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15) - vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9) - vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11) - vld1.32 {q0-q1}, [r0] // load s8..15 - vswp d25, d28 - vswp d27, d30 - vswp d17, d20 - vswp d19, d22 - - // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15) - - // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block) - vadd.u32 q8, q8, q0 - vadd.u32 q10, q10, q0 - vadd.u32 q9, q9, q0 - vadd.u32 q11, q11, q0 - - // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block) - vadd.u32 q12, q12, q1 - vadd.u32 q14, q14, q1 - vadd.u32 q13, q13, q1 - vadd.u32 q15, q15, q1 - - // XOR the rest of the data with the keystream - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q8 - veor q1, q1, q12 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q2 - veor q1, q1, q6 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q10 - veor q1, q1, q14 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q4 - veor q1, q1, q5 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q9 - veor q1, q1, q13 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2]! - veor q0, q0, q3 - veor q1, q1, q7 - vst1.8 {q0-q1}, [r1]! - - vld1.8 {q0-q1}, [r2] - mov sp, r4 // restore original stack pointer - veor q0, q0, q11 - veor q1, q1, q15 - vst1.8 {q0-q1}, [r1] - - pop {r4-r5} - bx lr -ENDPROC(chacha20_4block_xor_neon) diff --git a/arch/arm/crypto/chacha20-neon-glue.c b/arch/arm/crypto/chacha20-neon-glue.c @@ -1,127 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions - * - * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
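At block level, both the deleted and the new code reduce to the usual ChaCha pattern: permute a copy of the state, add the original state back in word by word, XOR the resulting keystream with the input, and bump the 32-bit block counter in state word 12 (nrounds is 20 for ChaCha20). A self-contained scalar sketch of one 64-byte block, assuming a little-endian host; illustrative only, not part of the patch:

#include <stdint.h>
#include <string.h>

#define QR(a, b, c, d) do {						\
	a += b; d ^= a; d = (d << 16) | (d >> 16);			\
	c += d; b ^= c; b = (b << 12) | (b >> 20);			\
	a += b; d ^= a; d = (d <<  8) | (d >> 24);			\
	c += d; b ^= c; b = (b <<  7) | (b >> 25);			\
} while (0)

static void chacha_block_xor(uint32_t state[16], uint8_t *dst,
			     const uint8_t *src, int nrounds)
{
	uint32_t x[16];

	memcpy(x, state, sizeof(x));
	for (int i = 0; i < nrounds; i += 2) {
		QR(x[0], x[4], x[ 8], x[12]);	/* column rounds */
		QR(x[1], x[5], x[ 9], x[13]);
		QR(x[2], x[6], x[10], x[14]);
		QR(x[3], x[7], x[11], x[15]);
		QR(x[0], x[5], x[10], x[15]);	/* diagonal rounds */
		QR(x[1], x[6], x[11], x[12]);
		QR(x[2], x[7], x[ 8], x[13]);
		QR(x[3], x[4], x[ 9], x[14]);
	}
	for (int i = 0; i < 16; i++)
		x[i] += state[i];		/* add back the original state */
	for (int i = 0; i < 64; i++)
		dst[i] = src[i] ^ ((const uint8_t *)x)[i];
	state[12]++;				/* advance the block counter */
}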
- */ - -#include <crypto/algapi.h> -#include <crypto/chacha20.h> -#include <crypto/internal/skcipher.h> -#include <linux/kernel.h> -#include <linux/module.h> - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> - -asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); -asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); - -static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes) -{ - u8 buf[CHACHA20_BLOCK_SIZE]; - - while (bytes >= CHACHA20_BLOCK_SIZE * 4) { - chacha20_4block_xor_neon(state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE * 4; - src += CHACHA20_BLOCK_SIZE * 4; - dst += CHACHA20_BLOCK_SIZE * 4; - state[12] += 4; - } - while (bytes >= CHACHA20_BLOCK_SIZE) { - chacha20_block_xor_neon(state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE; - src += CHACHA20_BLOCK_SIZE; - dst += CHACHA20_BLOCK_SIZE; - state[12]++; - } - if (bytes) { - memcpy(buf, src, bytes); - chacha20_block_xor_neon(state, buf, buf); - memcpy(dst, buf, bytes); - } -} - -static int chacha20_neon(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - u32 state[16]; - int err; - - if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd()) - return crypto_chacha20_crypt(req); - - err = skcipher_walk_virt(&walk, req, true); - - crypto_chacha20_init(state, ctx, walk.iv); - - kernel_neon_begin(); - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, - nbytes); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - kernel_neon_end(); - - return err; -} - -static struct skcipher_alg alg = { - .base.cra_name = "chacha20", - .base.cra_driver_name = "chacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha20_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, - .chunksize = CHACHA20_BLOCK_SIZE, - .walksize = 4 * CHACHA20_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_neon, - .decrypt = chacha20_neon, -}; - -static int __init chacha20_simd_mod_init(void) -{ - if (!(elf_hwcap & HWCAP_NEON)) - return -ENODEV; - - return crypto_register_skcipher(&alg); -} - -static void __exit chacha20_simd_mod_fini(void) -{ - crypto_unregister_skcipher(&alg); -} - -module_init(chacha20_simd_mod_init); -module_exit(chacha20_simd_mod_fini); - -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("chacha20"); diff --git a/arch/arm/crypto/nh-neon-core.S b/arch/arm/crypto/nh-neon-core.S @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NH - ε-almost-universal hash function, NEON accelerated version + * + * Copyright 2018 Google LLC + * + * Author: Eric Biggers <ebiggers@google.com> + */ + +#include <linux/linkage.h> + + .text + .fpu neon + + KEY .req r0 + MESSAGE .req r1 + MESSAGE_LEN .req r2 + HASH .req r3 + + PASS0_SUMS .req q0 + PASS0_SUM_A .req d0 + PASS0_SUM_B .req d1 + PASS1_SUMS .req q1 + PASS1_SUM_A .req d2 + PASS1_SUM_B .req d3 + PASS2_SUMS .req q2 + PASS2_SUM_A .req d4 + PASS2_SUM_B .req d5 + PASS3_SUMS .req q3 + PASS3_SUM_A .req d6 + PASS3_SUM_B .req d7 + K0 .req q4 + K1 .req q5 + K2 .req q6 + K3 .req q7 + T0 .req q8 + T0_L .req d16 
+ T0_H .req d17 + T1 .req q9 + T1_L .req d18 + T1_H .req d19 + T2 .req q10 + T2_L .req d20 + T2_H .req d21 + T3 .req q11 + T3_L .req d22 + T3_H .req d23 + +.macro _nh_stride k0, k1, k2, k3 + + // Load next message stride + vld1.8 {T3}, [MESSAGE]! + + // Load next key stride + vld1.32 {\k3}, [KEY]! + + // Add message words to key words + vadd.u32 T0, T3, \k0 + vadd.u32 T1, T3, \k1 + vadd.u32 T2, T3, \k2 + vadd.u32 T3, T3, \k3 + + // Multiply 32x32 => 64 and accumulate + vmlal.u32 PASS0_SUMS, T0_L, T0_H + vmlal.u32 PASS1_SUMS, T1_L, T1_H + vmlal.u32 PASS2_SUMS, T2_L, T2_H + vmlal.u32 PASS3_SUMS, T3_L, T3_H +.endm + +/* + * void nh_neon(const u32 *key, const u8 *message, size_t message_len, + * u8 hash[NH_HASH_BYTES]) + * + * It's guaranteed that message_len % 16 == 0. + */ +ENTRY(nh_neon) + + vld1.32 {K0,K1}, [KEY]! + vmov.u64 PASS0_SUMS, #0 + vmov.u64 PASS1_SUMS, #0 + vld1.32 {K2}, [KEY]! + vmov.u64 PASS2_SUMS, #0 + vmov.u64 PASS3_SUMS, #0 + + subs MESSAGE_LEN, MESSAGE_LEN, #64 + blt .Lloop4_done +.Lloop4: + _nh_stride K0, K1, K2, K3 + _nh_stride K1, K2, K3, K0 + _nh_stride K2, K3, K0, K1 + _nh_stride K3, K0, K1, K2 + subs MESSAGE_LEN, MESSAGE_LEN, #64 + bge .Lloop4 + +.Lloop4_done: + ands MESSAGE_LEN, MESSAGE_LEN, #63 + beq .Ldone + _nh_stride K0, K1, K2, K3 + + subs MESSAGE_LEN, MESSAGE_LEN, #16 + beq .Ldone + _nh_stride K1, K2, K3, K0 + + subs MESSAGE_LEN, MESSAGE_LEN, #16 + beq .Ldone + _nh_stride K2, K3, K0, K1 + +.Ldone: + // Sum the accumulators for each pass, then store the sums to 'hash' + vadd.u64 T0_L, PASS0_SUM_A, PASS0_SUM_B + vadd.u64 T0_H, PASS1_SUM_A, PASS1_SUM_B + vadd.u64 T1_L, PASS2_SUM_A, PASS2_SUM_B + vadd.u64 T1_H, PASS3_SUM_A, PASS3_SUM_B + vst1.8 {T0-T1}, [HASH] + bx lr +ENDPROC(nh_neon) diff --git a/arch/arm/crypto/nhpoly1305-neon-glue.c b/arch/arm/crypto/nhpoly1305-neon-glue.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum + * (NEON accelerated version) + * + * Copyright 2018 Google LLC + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/hash.h> +#include <crypto/nhpoly1305.h> +#include <linux/module.h> + +asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len, + u8 hash[NH_HASH_BYTES]); + +/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ +static void _nh_neon(const u32 *key, const u8 *message, size_t message_len, + __le64 hash[NH_NUM_PASSES]) +{ + nh_neon(key, message, message_len, (u8 *)hash); +} + +static int nhpoly1305_neon_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + if (srclen < 64 || !may_use_simd()) + return crypto_nhpoly1305_update(desc, src, srclen); + + do { + unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + + kernel_neon_begin(); + crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon); + kernel_neon_end(); + src += n; + srclen -= n; + } while (srclen); + return 0; +} + +static struct shash_alg nhpoly1305_alg = { + .base.cra_name = "nhpoly1305", + .base.cra_driver_name = "nhpoly1305-neon", + .base.cra_priority = 200, + .base.cra_ctxsize = sizeof(struct nhpoly1305_key), + .base.cra_module = THIS_MODULE, + .digestsize = POLY1305_DIGEST_SIZE, + .init = crypto_nhpoly1305_init, + .update = nhpoly1305_neon_update, + .final = crypto_nhpoly1305_final, + .setkey = crypto_nhpoly1305_setkey, + .descsize = sizeof(struct nhpoly1305_state), +}; + +static int __init nhpoly1305_mod_init(void) +{ + if (!(elf_hwcap & HWCAP_NEON)) + return -ENODEV; + + return 
crypto_register_shash(&nhpoly1305_alg); +} + +static void __exit nhpoly1305_mod_exit(void) +{ + crypto_unregister_shash(&nhpoly1305_alg); +} + +module_init(nhpoly1305_mod_init); +module_exit(nhpoly1305_mod_exit); + +MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>"); +MODULE_ALIAS_CRYPTO("nhpoly1305"); +MODULE_ALIAS_CRYPTO("nhpoly1305-neon"); diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig @@ -101,11 +101,16 @@ config CRYPTO_AES_ARM64_NEON_BLK select CRYPTO_SIMD config CRYPTO_CHACHA20_NEON - tristate "NEON accelerated ChaCha20 symmetric cipher" + tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions" depends on KERNEL_MODE_NEON select CRYPTO_BLKCIPHER select CRYPTO_CHACHA20 +config CRYPTO_NHPOLY1305_NEON + tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)" + depends on KERNEL_MODE_NEON + select CRYPTO_NHPOLY1305 + config CRYPTO_AES_ARM64_BS tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm" depends on KERNEL_MODE_NEON diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile @@ -50,8 +50,11 @@ sha256-arm64-y := sha256-glue.o sha256-core.o obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o sha512-arm64-y := sha512-glue.o sha512-core.o -obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o -chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o +obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o +chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o + +obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o +nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S @@ -0,0 +1,844 @@ +/* + * ChaCha/XChaCha NEON helper functions + * + * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Originally based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> +#include <asm/assembler.h> +#include <asm/cache.h> + + .text + .align 6 + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is stored in the four NEON + * registers v0-v3. It performs matrix operations on four words in parallel, + * but requires shuffling to rearrange the words after each round. + * + * The round count is given in w3. 
+ * + * Clobbers: w3, x10, v4, v12 + */ +chacha_permute: + + adr_l x10, ROT8 + ld1 {v12.4s}, [x10] + +.Ldoubleround: + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + ext v1.16b, v1.16b, v1.16b, #4 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + ext v3.16b, v3.16b, v3.16b, #12 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 16) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + rev32 v3.8h, v3.8h + + // x2 += x3, x1 = rotl32(x1 ^ x2, 12) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #12 + sri v1.4s, v4.4s, #20 + + // x0 += x1, x3 = rotl32(x3 ^ x0, 8) + add v0.4s, v0.4s, v1.4s + eor v3.16b, v3.16b, v0.16b + tbl v3.16b, {v3.16b}, v12.16b + + // x2 += x3, x1 = rotl32(x1 ^ x2, 7) + add v2.4s, v2.4s, v3.4s + eor v4.16b, v1.16b, v2.16b + shl v1.4s, v4.4s, #7 + sri v1.4s, v4.4s, #25 + + // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + ext v1.16b, v1.16b, v1.16b, #12 + // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + ext v2.16b, v2.16b, v2.16b, #8 + // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + ext v3.16b, v3.16b, v3.16b, #4 + + subs w3, w3, #2 + b.ne .Ldoubleround + + ret +ENDPROC(chacha_permute) + +ENTRY(chacha_block_xor_neon) + // x0: Input state matrix, s + // x1: 1 data block output, o + // x2: 1 data block input, i + // w3: nrounds + + stp x29, x30, [sp, #-16]! + mov x29, sp + + // x0..3 = s0..3 + ld1 {v0.4s-v3.4s}, [x0] + ld1 {v8.4s-v11.4s}, [x0] + + bl chacha_permute + + ld1 {v4.16b-v7.16b}, [x2] + + // o0 = i0 ^ (x0 + s0) + add v0.4s, v0.4s, v8.4s + eor v0.16b, v0.16b, v4.16b + + // o1 = i1 ^ (x1 + s1) + add v1.4s, v1.4s, v9.4s + eor v1.16b, v1.16b, v5.16b + + // o2 = i2 ^ (x2 + s2) + add v2.4s, v2.4s, v10.4s + eor v2.16b, v2.16b, v6.16b + + // o3 = i3 ^ (x3 + s3) + add v3.4s, v3.4s, v11.4s + eor v3.16b, v3.16b, v7.16b + + st1 {v0.16b-v3.16b}, [x1] + + ldp x29, x30, [sp], #16 + ret +ENDPROC(chacha_block_xor_neon) + +ENTRY(hchacha_block_neon) + // x0: Input state matrix, s + // x1: output (8 32-bit words) + // w2: nrounds + + stp x29, x30, [sp, #-16]! + mov x29, sp + + ld1 {v0.4s-v3.4s}, [x0] + + mov w3, w2 + bl chacha_permute + + st1 {v0.16b}, [x1], #16 + st1 {v3.16b}, [x1] + + ldp x29, x30, [sp], #16 + ret +ENDPROC(hchacha_block_neon) + + a0 .req w12 + a1 .req w13 + a2 .req w14 + a3 .req w15 + a4 .req w16 + a5 .req w17 + a6 .req w19 + a7 .req w20 + a8 .req w21 + a9 .req w22 + a10 .req w23 + a11 .req w24 + a12 .req w25 + a13 .req w26 + a14 .req w27 + a15 .req w28 + + .align 6 +ENTRY(chacha_4block_xor_neon) + frame_push 10 + + // x0: Input state matrix, s + // x1: 4 data blocks output, o + // x2: 4 data blocks input, i + // w3: nrounds + // x4: byte count + + adr_l x10, .Lpermute + and x5, x4, #63 + add x10, x10, x5 + add x11, x10, #64 + + // + // This function encrypts four consecutive ChaCha blocks by loading + // the state matrix in NEON registers four times. The algorithm performs + // each operation on the corresponding word of each state matrix, hence + // requires no word shuffling. 
For final XORing step we transpose the + // matrix by interleaving 32- and then 64-bit words, which allows us to + // do XOR in NEON registers. + // + // At the same time, a fifth block is encrypted in parallel using + // scalar registers + // + adr_l x9, CTRINC // ... and ROT8 + ld1 {v30.4s-v31.4s}, [x9] + + // x0..15[0-3] = s0..3[0..3] + add x8, x0, #16 + ld4r { v0.4s- v3.4s}, [x0] + ld4r { v4.4s- v7.4s}, [x8], #16 + ld4r { v8.4s-v11.4s}, [x8], #16 + ld4r {v12.4s-v15.4s}, [x8] + + mov a0, v0.s[0] + mov a1, v1.s[0] + mov a2, v2.s[0] + mov a3, v3.s[0] + mov a4, v4.s[0] + mov a5, v5.s[0] + mov a6, v6.s[0] + mov a7, v7.s[0] + mov a8, v8.s[0] + mov a9, v9.s[0] + mov a10, v10.s[0] + mov a11, v11.s[0] + mov a12, v12.s[0] + mov a13, v13.s[0] + mov a14, v14.s[0] + mov a15, v15.s[0] + + // x12 += counter values 1-4 + add v12.4s, v12.4s, v30.4s + +.Ldoubleround4: + // x0 += x4, x12 = rotl32(x12 ^ x0, 16) + // x1 += x5, x13 = rotl32(x13 ^ x1, 16) + // x2 += x6, x14 = rotl32(x14 ^ x2, 16) + // x3 += x7, x15 = rotl32(x15 ^ x3, 16) + add v0.4s, v0.4s, v4.4s + add a0, a0, a4 + add v1.4s, v1.4s, v5.4s + add a1, a1, a5 + add v2.4s, v2.4s, v6.4s + add a2, a2, a6 + add v3.4s, v3.4s, v7.4s + add a3, a3, a7 + + eor v12.16b, v12.16b, v0.16b + eor a12, a12, a0 + eor v13.16b, v13.16b, v1.16b + eor a13, a13, a1 + eor v14.16b, v14.16b, v2.16b + eor a14, a14, a2 + eor v15.16b, v15.16b, v3.16b + eor a15, a15, a3 + + rev32 v12.8h, v12.8h + ror a12, a12, #16 + rev32 v13.8h, v13.8h + ror a13, a13, #16 + rev32 v14.8h, v14.8h + ror a14, a14, #16 + rev32 v15.8h, v15.8h + ror a15, a15, #16 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 12) + // x9 += x13, x5 = rotl32(x5 ^ x9, 12) + // x10 += x14, x6 = rotl32(x6 ^ x10, 12) + // x11 += x15, x7 = rotl32(x7 ^ x11, 12) + add v8.4s, v8.4s, v12.4s + add a8, a8, a12 + add v9.4s, v9.4s, v13.4s + add a9, a9, a13 + add v10.4s, v10.4s, v14.4s + add a10, a10, a14 + add v11.4s, v11.4s, v15.4s + add a11, a11, a15 + + eor v16.16b, v4.16b, v8.16b + eor a4, a4, a8 + eor v17.16b, v5.16b, v9.16b + eor a5, a5, a9 + eor v18.16b, v6.16b, v10.16b + eor a6, a6, a10 + eor v19.16b, v7.16b, v11.16b + eor a7, a7, a11 + + shl v4.4s, v16.4s, #12 + shl v5.4s, v17.4s, #12 + shl v6.4s, v18.4s, #12 + shl v7.4s, v19.4s, #12 + + sri v4.4s, v16.4s, #20 + ror a4, a4, #20 + sri v5.4s, v17.4s, #20 + ror a5, a5, #20 + sri v6.4s, v18.4s, #20 + ror a6, a6, #20 + sri v7.4s, v19.4s, #20 + ror a7, a7, #20 + + // x0 += x4, x12 = rotl32(x12 ^ x0, 8) + // x1 += x5, x13 = rotl32(x13 ^ x1, 8) + // x2 += x6, x14 = rotl32(x14 ^ x2, 8) + // x3 += x7, x15 = rotl32(x15 ^ x3, 8) + add v0.4s, v0.4s, v4.4s + add a0, a0, a4 + add v1.4s, v1.4s, v5.4s + add a1, a1, a5 + add v2.4s, v2.4s, v6.4s + add a2, a2, a6 + add v3.4s, v3.4s, v7.4s + add a3, a3, a7 + + eor v12.16b, v12.16b, v0.16b + eor a12, a12, a0 + eor v13.16b, v13.16b, v1.16b + eor a13, a13, a1 + eor v14.16b, v14.16b, v2.16b + eor a14, a14, a2 + eor v15.16b, v15.16b, v3.16b + eor a15, a15, a3 + + tbl v12.16b, {v12.16b}, v31.16b + ror a12, a12, #24 + tbl v13.16b, {v13.16b}, v31.16b + ror a13, a13, #24 + tbl v14.16b, {v14.16b}, v31.16b + ror a14, a14, #24 + tbl v15.16b, {v15.16b}, v31.16b + ror a15, a15, #24 + + // x8 += x12, x4 = rotl32(x4 ^ x8, 7) + // x9 += x13, x5 = rotl32(x5 ^ x9, 7) + // x10 += x14, x6 = rotl32(x6 ^ x10, 7) + // x11 += x15, x7 = rotl32(x7 ^ x11, 7) + add v8.4s, v8.4s, v12.4s + add a8, a8, a12 + add v9.4s, v9.4s, v13.4s + add a9, a9, a13 + add v10.4s, v10.4s, v14.4s + add a10, a10, a14 + add v11.4s, v11.4s, v15.4s + add a11, a11, a15 + + eor v16.16b, v4.16b, v8.16b 
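As the comment above says, keeping the four blocks in "structure of arrays" form lets every quarter-round step operate on the same word index of all four blocks, so no shuffling is needed until the final transpose. A scalar sketch of that layout, illustrative only, where x[w][i] is word w of block i and one row of the array corresponds to one NEON register above:

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* Quarter-round applied to the same word index of four blocks at once. */
static void quarter_round_x4(uint32_t x[16][4], int a, int b, int c, int d)
{
	for (int i = 0; i < 4; i++) {
		x[a][i] += x[b][i]; x[d][i] = rotl32(x[d][i] ^ x[a][i], 16);
		x[c][i] += x[d][i]; x[b][i] = rotl32(x[b][i] ^ x[c][i], 12);
		x[a][i] += x[b][i]; x[d][i] = rotl32(x[d][i] ^ x[a][i],  8);
		x[c][i] += x[d][i]; x[b][i] = rotl32(x[b][i] ^ x[c][i],  7);
	}
}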
+ eor a4, a4, a8 + eor v17.16b, v5.16b, v9.16b + eor a5, a5, a9 + eor v18.16b, v6.16b, v10.16b + eor a6, a6, a10 + eor v19.16b, v7.16b, v11.16b + eor a7, a7, a11 + + shl v4.4s, v16.4s, #7 + shl v5.4s, v17.4s, #7 + shl v6.4s, v18.4s, #7 + shl v7.4s, v19.4s, #7 + + sri v4.4s, v16.4s, #25 + ror a4, a4, #25 + sri v5.4s, v17.4s, #25 + ror a5, a5, #25 + sri v6.4s, v18.4s, #25 + ror a6, a6, #25 + sri v7.4s, v19.4s, #25 + ror a7, a7, #25 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 16) + // x1 += x6, x12 = rotl32(x12 ^ x1, 16) + // x2 += x7, x13 = rotl32(x13 ^ x2, 16) + // x3 += x4, x14 = rotl32(x14 ^ x3, 16) + add v0.4s, v0.4s, v5.4s + add a0, a0, a5 + add v1.4s, v1.4s, v6.4s + add a1, a1, a6 + add v2.4s, v2.4s, v7.4s + add a2, a2, a7 + add v3.4s, v3.4s, v4.4s + add a3, a3, a4 + + eor v15.16b, v15.16b, v0.16b + eor a15, a15, a0 + eor v12.16b, v12.16b, v1.16b + eor a12, a12, a1 + eor v13.16b, v13.16b, v2.16b + eor a13, a13, a2 + eor v14.16b, v14.16b, v3.16b + eor a14, a14, a3 + + rev32 v15.8h, v15.8h + ror a15, a15, #16 + rev32 v12.8h, v12.8h + ror a12, a12, #16 + rev32 v13.8h, v13.8h + ror a13, a13, #16 + rev32 v14.8h, v14.8h + ror a14, a14, #16 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 12) + // x11 += x12, x6 = rotl32(x6 ^ x11, 12) + // x8 += x13, x7 = rotl32(x7 ^ x8, 12) + // x9 += x14, x4 = rotl32(x4 ^ x9, 12) + add v10.4s, v10.4s, v15.4s + add a10, a10, a15 + add v11.4s, v11.4s, v12.4s + add a11, a11, a12 + add v8.4s, v8.4s, v13.4s + add a8, a8, a13 + add v9.4s, v9.4s, v14.4s + add a9, a9, a14 + + eor v16.16b, v5.16b, v10.16b + eor a5, a5, a10 + eor v17.16b, v6.16b, v11.16b + eor a6, a6, a11 + eor v18.16b, v7.16b, v8.16b + eor a7, a7, a8 + eor v19.16b, v4.16b, v9.16b + eor a4, a4, a9 + + shl v5.4s, v16.4s, #12 + shl v6.4s, v17.4s, #12 + shl v7.4s, v18.4s, #12 + shl v4.4s, v19.4s, #12 + + sri v5.4s, v16.4s, #20 + ror a5, a5, #20 + sri v6.4s, v17.4s, #20 + ror a6, a6, #20 + sri v7.4s, v18.4s, #20 + ror a7, a7, #20 + sri v4.4s, v19.4s, #20 + ror a4, a4, #20 + + // x0 += x5, x15 = rotl32(x15 ^ x0, 8) + // x1 += x6, x12 = rotl32(x12 ^ x1, 8) + // x2 += x7, x13 = rotl32(x13 ^ x2, 8) + // x3 += x4, x14 = rotl32(x14 ^ x3, 8) + add v0.4s, v0.4s, v5.4s + add a0, a0, a5 + add v1.4s, v1.4s, v6.4s + add a1, a1, a6 + add v2.4s, v2.4s, v7.4s + add a2, a2, a7 + add v3.4s, v3.4s, v4.4s + add a3, a3, a4 + + eor v15.16b, v15.16b, v0.16b + eor a15, a15, a0 + eor v12.16b, v12.16b, v1.16b + eor a12, a12, a1 + eor v13.16b, v13.16b, v2.16b + eor a13, a13, a2 + eor v14.16b, v14.16b, v3.16b + eor a14, a14, a3 + + tbl v15.16b, {v15.16b}, v31.16b + ror a15, a15, #24 + tbl v12.16b, {v12.16b}, v31.16b + ror a12, a12, #24 + tbl v13.16b, {v13.16b}, v31.16b + ror a13, a13, #24 + tbl v14.16b, {v14.16b}, v31.16b + ror a14, a14, #24 + + // x10 += x15, x5 = rotl32(x5 ^ x10, 7) + // x11 += x12, x6 = rotl32(x6 ^ x11, 7) + // x8 += x13, x7 = rotl32(x7 ^ x8, 7) + // x9 += x14, x4 = rotl32(x4 ^ x9, 7) + add v10.4s, v10.4s, v15.4s + add a10, a10, a15 + add v11.4s, v11.4s, v12.4s + add a11, a11, a12 + add v8.4s, v8.4s, v13.4s + add a8, a8, a13 + add v9.4s, v9.4s, v14.4s + add a9, a9, a14 + + eor v16.16b, v5.16b, v10.16b + eor a5, a5, a10 + eor v17.16b, v6.16b, v11.16b + eor a6, a6, a11 + eor v18.16b, v7.16b, v8.16b + eor a7, a7, a8 + eor v19.16b, v4.16b, v9.16b + eor a4, a4, a9 + + shl v5.4s, v16.4s, #7 + shl v6.4s, v17.4s, #7 + shl v7.4s, v18.4s, #7 + shl v4.4s, v19.4s, #7 + + sri v5.4s, v16.4s, #25 + ror a5, a5, #25 + sri v6.4s, v17.4s, #25 + ror a6, a6, #25 + sri v7.4s, v18.4s, #25 + ror a7, a7, #25 + sri v4.4s, v19.4s, #25 + ror a4, a4, 
#25 + + subs w3, w3, #2 + b.ne .Ldoubleround4 + + ld4r {v16.4s-v19.4s}, [x0], #16 + ld4r {v20.4s-v23.4s}, [x0], #16 + + // x12 += counter values 0-3 + add v12.4s, v12.4s, v30.4s + + // x0[0-3] += s0[0] + // x1[0-3] += s0[1] + // x2[0-3] += s0[2] + // x3[0-3] += s0[3] + add v0.4s, v0.4s, v16.4s + mov w6, v16.s[0] + mov w7, v17.s[0] + add v1.4s, v1.4s, v17.4s + mov w8, v18.s[0] + mov w9, v19.s[0] + add v2.4s, v2.4s, v18.4s + add a0, a0, w6 + add a1, a1, w7 + add v3.4s, v3.4s, v19.4s + add a2, a2, w8 + add a3, a3, w9 + + ld4r {v24.4s-v27.4s}, [x0], #16 + ld4r {v28.4s-v31.4s}, [x0] + + // x4[0-3] += s1[0] + // x5[0-3] += s1[1] + // x6[0-3] += s1[2] + // x7[0-3] += s1[3] + add v4.4s, v4.4s, v20.4s + mov w6, v20.s[0] + mov w7, v21.s[0] + add v5.4s, v5.4s, v21.4s + mov w8, v22.s[0] + mov w9, v23.s[0] + add v6.4s, v6.4s, v22.4s + add a4, a4, w6 + add a5, a5, w7 + add v7.4s, v7.4s, v23.4s + add a6, a6, w8 + add a7, a7, w9 + + // x8[0-3] += s2[0] + // x9[0-3] += s2[1] + // x10[0-3] += s2[2] + // x11[0-3] += s2[3] + add v8.4s, v8.4s, v24.4s + mov w6, v24.s[0] + mov w7, v25.s[0] + add v9.4s, v9.4s, v25.4s + mov w8, v26.s[0] + mov w9, v27.s[0] + add v10.4s, v10.4s, v26.4s + add a8, a8, w6 + add a9, a9, w7 + add v11.4s, v11.4s, v27.4s + add a10, a10, w8 + add a11, a11, w9 + + // x12[0-3] += s3[0] + // x13[0-3] += s3[1] + // x14[0-3] += s3[2] + // x15[0-3] += s3[3] + add v12.4s, v12.4s, v28.4s + mov w6, v28.s[0] + mov w7, v29.s[0] + add v13.4s, v13.4s, v29.4s + mov w8, v30.s[0] + mov w9, v31.s[0] + add v14.4s, v14.4s, v30.4s + add a12, a12, w6 + add a13, a13, w7 + add v15.4s, v15.4s, v31.4s + add a14, a14, w8 + add a15, a15, w9 + + // interleave 32-bit words in state n, n+1 + ldp w6, w7, [x2], #64 + zip1 v16.4s, v0.4s, v1.4s + ldp w8, w9, [x2, #-56] + eor a0, a0, w6 + zip2 v17.4s, v0.4s, v1.4s + eor a1, a1, w7 + zip1 v18.4s, v2.4s, v3.4s + eor a2, a2, w8 + zip2 v19.4s, v2.4s, v3.4s + eor a3, a3, w9 + ldp w6, w7, [x2, #-48] + zip1 v20.4s, v4.4s, v5.4s + ldp w8, w9, [x2, #-40] + eor a4, a4, w6 + zip2 v21.4s, v4.4s, v5.4s + eor a5, a5, w7 + zip1 v22.4s, v6.4s, v7.4s + eor a6, a6, w8 + zip2 v23.4s, v6.4s, v7.4s + eor a7, a7, w9 + ldp w6, w7, [x2, #-32] + zip1 v24.4s, v8.4s, v9.4s + ldp w8, w9, [x2, #-24] + eor a8, a8, w6 + zip2 v25.4s, v8.4s, v9.4s + eor a9, a9, w7 + zip1 v26.4s, v10.4s, v11.4s + eor a10, a10, w8 + zip2 v27.4s, v10.4s, v11.4s + eor a11, a11, w9 + ldp w6, w7, [x2, #-16] + zip1 v28.4s, v12.4s, v13.4s + ldp w8, w9, [x2, #-8] + eor a12, a12, w6 + zip2 v29.4s, v12.4s, v13.4s + eor a13, a13, w7 + zip1 v30.4s, v14.4s, v15.4s + eor a14, a14, w8 + zip2 v31.4s, v14.4s, v15.4s + eor a15, a15, w9 + + mov x3, #64 + subs x5, x4, #128 + add x6, x5, x2 + csel x3, x3, xzr, ge + csel x2, x2, x6, ge + + // interleave 64-bit words in state n, n+2 + zip1 v0.2d, v16.2d, v18.2d + zip2 v4.2d, v16.2d, v18.2d + stp a0, a1, [x1], #64 + zip1 v8.2d, v17.2d, v19.2d + zip2 v12.2d, v17.2d, v19.2d + stp a2, a3, [x1, #-56] + ld1 {v16.16b-v19.16b}, [x2], x3 + + subs x6, x4, #192 + ccmp x3, xzr, #4, lt + add x7, x6, x2 + csel x3, x3, xzr, eq + csel x2, x2, x7, eq + + zip1 v1.2d, v20.2d, v22.2d + zip2 v5.2d, v20.2d, v22.2d + stp a4, a5, [x1, #-48] + zip1 v9.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + stp a6, a7, [x1, #-40] + ld1 {v20.16b-v23.16b}, [x2], x3 + + subs x7, x4, #256 + ccmp x3, xzr, #4, lt + add x8, x7, x2 + csel x3, x3, xzr, eq + csel x2, x2, x8, eq + + zip1 v2.2d, v24.2d, v26.2d + zip2 v6.2d, v24.2d, v26.2d + stp a8, a9, [x1, #-32] + zip1 v10.2d, v25.2d, v27.2d + zip2 v14.2d, v25.2d, v27.2d + stp a10, a11, 
[x1, #-24] + ld1 {v24.16b-v27.16b}, [x2], x3 + + subs x8, x4, #320 + ccmp x3, xzr, #4, lt + add x9, x8, x2 + csel x2, x2, x9, eq + + zip1 v3.2d, v28.2d, v30.2d + zip2 v7.2d, v28.2d, v30.2d + stp a12, a13, [x1, #-16] + zip1 v11.2d, v29.2d, v31.2d + zip2 v15.2d, v29.2d, v31.2d + stp a14, a15, [x1, #-8] + ld1 {v28.16b-v31.16b}, [x2] + + // xor with corresponding input, write to output + tbnz x5, #63, 0f + eor v16.16b, v16.16b, v0.16b + eor v17.16b, v17.16b, v1.16b + eor v18.16b, v18.16b, v2.16b + eor v19.16b, v19.16b, v3.16b + st1 {v16.16b-v19.16b}, [x1], #64 + cbz x5, .Lout + + tbnz x6, #63, 1f + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v6.16b + eor v23.16b, v23.16b, v7.16b + st1 {v20.16b-v23.16b}, [x1], #64 + cbz x6, .Lout + + tbnz x7, #63, 2f + eor v24.16b, v24.16b, v8.16b + eor v25.16b, v25.16b, v9.16b + eor v26.16b, v26.16b, v10.16b + eor v27.16b, v27.16b, v11.16b + st1 {v24.16b-v27.16b}, [x1], #64 + cbz x7, .Lout + + tbnz x8, #63, 3f + eor v28.16b, v28.16b, v12.16b + eor v29.16b, v29.16b, v13.16b + eor v30.16b, v30.16b, v14.16b + eor v31.16b, v31.16b, v15.16b + st1 {v28.16b-v31.16b}, [x1] + +.Lout: frame_pop + ret + + // fewer than 128 bytes of in/output +0: ld1 {v8.16b}, [x10] + ld1 {v9.16b}, [x11] + movi v10.16b, #16 + sub x2, x1, #64 + add x1, x1, x5 + ld1 {v16.16b-v19.16b}, [x2] + tbl v4.16b, {v0.16b-v3.16b}, v8.16b + tbx v20.16b, {v16.16b-v19.16b}, v9.16b + add v8.16b, v8.16b, v10.16b + add v9.16b, v9.16b, v10.16b + tbl v5.16b, {v0.16b-v3.16b}, v8.16b + tbx v21.16b, {v16.16b-v19.16b}, v9.16b + add v8.16b, v8.16b, v10.16b + add v9.16b, v9.16b, v10.16b + tbl v6.16b, {v0.16b-v3.16b}, v8.16b + tbx v22.16b, {v16.16b-v19.16b}, v9.16b + add v8.16b, v8.16b, v10.16b + add v9.16b, v9.16b, v10.16b + tbl v7.16b, {v0.16b-v3.16b}, v8.16b + tbx v23.16b, {v16.16b-v19.16b}, v9.16b + + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v6.16b + eor v23.16b, v23.16b, v7.16b + st1 {v20.16b-v23.16b}, [x1] + b .Lout + + // fewer than 192 bytes of in/output +1: ld1 {v8.16b}, [x10] + ld1 {v9.16b}, [x11] + movi v10.16b, #16 + add x1, x1, x6 + tbl v0.16b, {v4.16b-v7.16b}, v8.16b + tbx v20.16b, {v16.16b-v19.16b}, v9.16b + add v8.16b, v8.16b, v10.16b + add v9.16b, v9.16b, v10.16b + tbl v1.16b, {v4.16b-v7.16b}, v8.16b + tbx v21.16b, {v16.16b-v19.16b}, v9.16b + add v8.16b, v8.16b, v10.16b + add v9.16b, v9.16b, v10.16b + tbl v2.16b, {v4.16b-v7.16b}, v8.16b + tbx v22.16b, {v16.16b-v19.16b}, v9.16b + add v8.16b, v8.16b, v10.16b + add v9.16b, v9.16b, v10.16b + tbl v3.16b, {v4.16b-v7.16b}, v8.16b + tbx v23.16b, {v16.16b-v19.16b}, v9.16b + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v1.16b + eor v22.16b, v22.16b, v2.16b + eor v23.16b, v23.16b, v3.16b + st1 {v20.16b-v23.16b}, [x1] + b .Lout + + // fewer than 256 bytes of in/output +2: ld1 {v4.16b}, [x10] + ld1 {v5.16b}, [x11] + movi v6.16b, #16 + add x1, x1, x7 + tbl v0.16b, {v8.16b-v11.16b}, v4.16b + tbx v24.16b, {v20.16b-v23.16b}, v5.16b + add v4.16b, v4.16b, v6.16b + add v5.16b, v5.16b, v6.16b + tbl v1.16b, {v8.16b-v11.16b}, v4.16b + tbx v25.16b, {v20.16b-v23.16b}, v5.16b + add v4.16b, v4.16b, v6.16b + add v5.16b, v5.16b, v6.16b + tbl v2.16b, {v8.16b-v11.16b}, v4.16b + tbx v26.16b, {v20.16b-v23.16b}, v5.16b + add v4.16b, v4.16b, v6.16b + add v5.16b, v5.16b, v6.16b + tbl v3.16b, {v8.16b-v11.16b}, v4.16b + tbx v27.16b, {v20.16b-v23.16b}, v5.16b + + eor v24.16b, v24.16b, v0.16b + eor v25.16b, v25.16b, v1.16b + eor v26.16b, v26.16b, v2.16b + eor v27.16b, v27.16b, v3.16b + st1 
{v24.16b-v27.16b}, [x1] + b .Lout + + // fewer than 320 bytes of in/output +3: ld1 {v4.16b}, [x10] + ld1 {v5.16b}, [x11] + movi v6.16b, #16 + add x1, x1, x8 + tbl v0.16b, {v12.16b-v15.16b}, v4.16b + tbx v28.16b, {v24.16b-v27.16b}, v5.16b + add v4.16b, v4.16b, v6.16b + add v5.16b, v5.16b, v6.16b + tbl v1.16b, {v12.16b-v15.16b}, v4.16b + tbx v29.16b, {v24.16b-v27.16b}, v5.16b + add v4.16b, v4.16b, v6.16b + add v5.16b, v5.16b, v6.16b + tbl v2.16b, {v12.16b-v15.16b}, v4.16b + tbx v30.16b, {v24.16b-v27.16b}, v5.16b + add v4.16b, v4.16b, v6.16b + add v5.16b, v5.16b, v6.16b + tbl v3.16b, {v12.16b-v15.16b}, v4.16b + tbx v31.16b, {v24.16b-v27.16b}, v5.16b + + eor v28.16b, v28.16b, v0.16b + eor v29.16b, v29.16b, v1.16b + eor v30.16b, v30.16b, v2.16b + eor v31.16b, v31.16b, v3.16b + st1 {v28.16b-v31.16b}, [x1] + b .Lout +ENDPROC(chacha_4block_xor_neon) + + .section ".rodata", "a", %progbits + .align L1_CACHE_SHIFT +.Lpermute: + .set .Li, 0 + .rept 192 + .byte (.Li - 64) + .set .Li, .Li + 1 + .endr + +CTRINC: .word 1, 2, 3, 4 +ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c @@ -0,0 +1,198 @@ +/* + * ARM NEON accelerated ChaCha and XChaCha stream ciphers, + * including ChaCha20 (RFC7539) + * + * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on: + * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
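The ROT8 table defined above (.word 0x02010003, ...) is the tbl index vector that rotates each little-endian 32-bit lane left by 8 bits: input byte 3 lands in output byte 0, and bytes 0-2 move up by one. A small C demonstration of one lane, illustrative only:

#include <stdint.h>
#include <string.h>

static uint32_t rot8_via_table(uint32_t v)
{
	static const uint8_t idx[4] = { 3, 0, 1, 2 };	/* one lane of ROT8 */
	uint8_t in[4], out[4];
	uint32_t r;

	memcpy(in, &v, sizeof(in));		/* little-endian lane bytes */
	for (int i = 0; i < 4; i++)
		out[i] = in[idx[i]];		/* what tbl does per byte */
	memcpy(&r, out, sizeof(r));
	return r;				/* equals (v << 8) | (v >> 24) */
}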
+ */ + +#include <crypto/algapi.h> +#include <crypto/chacha.h> +#include <crypto/internal/skcipher.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/hwcap.h> +#include <asm/neon.h> +#include <asm/simd.h> + +asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src, + int nrounds); +asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src, + int nrounds, int bytes); +asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds); + +static void chacha_doneon(u32 *state, u8 *dst, const u8 *src, + int bytes, int nrounds) +{ + while (bytes > 0) { + int l = min(bytes, CHACHA_BLOCK_SIZE * 5); + + if (l <= CHACHA_BLOCK_SIZE) { + u8 buf[CHACHA_BLOCK_SIZE]; + + memcpy(buf, src, l); + chacha_block_xor_neon(state, buf, buf, nrounds); + memcpy(dst, buf, l); + state[12] += 1; + break; + } + chacha_4block_xor_neon(state, dst, src, nrounds, l); + bytes -= CHACHA_BLOCK_SIZE * 5; + src += CHACHA_BLOCK_SIZE * 5; + dst += CHACHA_BLOCK_SIZE * 5; + state[12] += 5; + } +} + +static int chacha_neon_stream_xor(struct skcipher_request *req, + struct chacha_ctx *ctx, u8 *iv) +{ + struct skcipher_walk walk; + u32 state[16]; + int err; + + err = skcipher_walk_virt(&walk, req, false); + + crypto_chacha_init(state, ctx, iv); + + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; + + if (nbytes < walk.total) + nbytes = rounddown(nbytes, walk.stride); + + kernel_neon_begin(); + chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, + nbytes, ctx->nrounds); + kernel_neon_end(); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); + } + + return err; +} + +static int chacha_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_chacha_crypt(req); + + return chacha_neon_stream_xor(req, ctx, req->iv); +} + +static int xchacha_neon(struct skcipher_request *req) +{ + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); + struct chacha_ctx subctx; + u32 state[16]; + u8 real_iv[16]; + + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + return crypto_xchacha_crypt(req); + + crypto_chacha_init(state, ctx, req->iv); + + kernel_neon_begin(); + hchacha_block_neon(state, subctx.key, ctx->nrounds); + kernel_neon_end(); + subctx.nrounds = ctx->nrounds; + + memcpy(&real_iv[0], req->iv + 24, 8); + memcpy(&real_iv[8], req->iv + 16, 8); + return chacha_neon_stream_xor(req, &subctx, real_iv); +} + +static struct skcipher_alg algs[] = { + { + .base.cra_name = "chacha20", + .base.cra_driver_name = "chacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = CHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 5 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha20_setkey, + .encrypt = chacha_neon, + .decrypt = chacha_neon, + }, { + .base.cra_name = "xchacha20", + .base.cra_driver_name = "xchacha20-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 5 * CHACHA_BLOCK_SIZE, + .setkey = 
crypto_chacha20_setkey, + .encrypt = xchacha_neon, + .decrypt = xchacha_neon, + }, { + .base.cra_name = "xchacha12", + .base.cra_driver_name = "xchacha12-neon", + .base.cra_priority = 300, + .base.cra_blocksize = 1, + .base.cra_ctxsize = sizeof(struct chacha_ctx), + .base.cra_module = THIS_MODULE, + + .min_keysize = CHACHA_KEY_SIZE, + .max_keysize = CHACHA_KEY_SIZE, + .ivsize = XCHACHA_IV_SIZE, + .chunksize = CHACHA_BLOCK_SIZE, + .walksize = 5 * CHACHA_BLOCK_SIZE, + .setkey = crypto_chacha12_setkey, + .encrypt = xchacha_neon, + .decrypt = xchacha_neon, + } +}; + +static int __init chacha_simd_mod_init(void) +{ + if (!(elf_hwcap & HWCAP_ASIMD)) + return -ENODEV; + + return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); +} + +static void __exit chacha_simd_mod_fini(void) +{ + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); +} + +module_init(chacha_simd_mod_init); +module_exit(chacha_simd_mod_fini); + +MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)"); +MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_CRYPTO("chacha20"); +MODULE_ALIAS_CRYPTO("chacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha20"); +MODULE_ALIAS_CRYPTO("xchacha20-neon"); +MODULE_ALIAS_CRYPTO("xchacha12"); +MODULE_ALIAS_CRYPTO("xchacha12-neon"); diff --git a/arch/arm64/crypto/chacha20-neon-core.S b/arch/arm64/crypto/chacha20-neon-core.S @@ -1,450 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions - * - * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/linkage.h> - - .text - .align 6 - -ENTRY(chacha20_block_xor_neon) - // x0: Input state matrix, s - // x1: 1 data block output, o - // x2: 1 data block input, i - - // - // This function encrypts one ChaCha20 block by loading the state matrix - // in four NEON registers. It performs matrix operation on four words in - // parallel, but requires shuffling to rearrange the words after each - // round. 
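In the xchacha_neon() glue above, hchacha_block_neon() derives the sub-key by permuting the state built from the key and the first 16 IV bytes and returning words 0-3 and 12-15, with no feed-forward addition; the remaining IV bytes then form the nonce for a regular ChaCha pass. A scalar sketch of that step, where chacha_permute_scalar() is a hypothetical stand-in for the NEON chacha_permute routine:

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: applies the plain ChaCha double rounds in place. */
void chacha_permute_scalar(uint32_t x[16], int nrounds);

static void hchacha_block_sketch(const uint32_t state[16], uint32_t out[8],
				 int nrounds)
{
	uint32_t x[16];

	memcpy(x, state, sizeof(x));
	chacha_permute_scalar(x, nrounds);
	memcpy(&out[0], &x[0],  4 * sizeof(uint32_t));	/* words 0..3   */
	memcpy(&out[4], &x[12], 4 * sizeof(uint32_t));	/* words 12..15 */
}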
- // - - // x0..3 = s0..3 - adr x3, ROT8 - ld1 {v0.4s-v3.4s}, [x0] - ld1 {v8.4s-v11.4s}, [x0] - ld1 {v12.4s}, [x3] - - mov x3, #10 - -.Ldoubleround: - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - tbl v3.16b, {v3.16b}, v12.16b - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - ext v1.16b, v1.16b, v1.16b, #4 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - ext v3.16b, v3.16b, v3.16b, #12 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 16) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - rev32 v3.8h, v3.8h - - // x2 += x3, x1 = rotl32(x1 ^ x2, 12) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #12 - sri v1.4s, v4.4s, #20 - - // x0 += x1, x3 = rotl32(x3 ^ x0, 8) - add v0.4s, v0.4s, v1.4s - eor v3.16b, v3.16b, v0.16b - tbl v3.16b, {v3.16b}, v12.16b - - // x2 += x3, x1 = rotl32(x1 ^ x2, 7) - add v2.4s, v2.4s, v3.4s - eor v4.16b, v1.16b, v2.16b - shl v1.4s, v4.4s, #7 - sri v1.4s, v4.4s, #25 - - // x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - ext v1.16b, v1.16b, v1.16b, #12 - // x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - ext v2.16b, v2.16b, v2.16b, #8 - // x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - ext v3.16b, v3.16b, v3.16b, #4 - - subs x3, x3, #1 - b.ne .Ldoubleround - - ld1 {v4.16b-v7.16b}, [x2] - - // o0 = i0 ^ (x0 + s0) - add v0.4s, v0.4s, v8.4s - eor v0.16b, v0.16b, v4.16b - - // o1 = i1 ^ (x1 + s1) - add v1.4s, v1.4s, v9.4s - eor v1.16b, v1.16b, v5.16b - - // o2 = i2 ^ (x2 + s2) - add v2.4s, v2.4s, v10.4s - eor v2.16b, v2.16b, v6.16b - - // o3 = i3 ^ (x3 + s3) - add v3.4s, v3.4s, v11.4s - eor v3.16b, v3.16b, v7.16b - - st1 {v0.16b-v3.16b}, [x1] - - ret -ENDPROC(chacha20_block_xor_neon) - - .align 6 -ENTRY(chacha20_4block_xor_neon) - // x0: Input state matrix, s - // x1: 4 data blocks output, o - // x2: 4 data blocks input, i - - // - // This function encrypts four consecutive ChaCha20 blocks by loading - // the state matrix in NEON registers four times. The algorithm performs - // each operation on the corresponding word of each state matrix, hence - // requires no word shuffling. For final XORing step we transpose the - // matrix by interleaving 32- and then 64-bit words, which allows us to - // do XOR in NEON registers. - // - adr x3, CTRINC // ... 
and ROT8 - ld1 {v30.4s-v31.4s}, [x3] - - // x0..15[0-3] = s0..3[0..3] - mov x4, x0 - ld4r { v0.4s- v3.4s}, [x4], #16 - ld4r { v4.4s- v7.4s}, [x4], #16 - ld4r { v8.4s-v11.4s}, [x4], #16 - ld4r {v12.4s-v15.4s}, [x4] - - // x12 += counter values 0-3 - add v12.4s, v12.4s, v30.4s - - mov x3, #10 - -.Ldoubleround4: - // x0 += x4, x12 = rotl32(x12 ^ x0, 16) - // x1 += x5, x13 = rotl32(x13 ^ x1, 16) - // x2 += x6, x14 = rotl32(x14 ^ x2, 16) - // x3 += x7, x15 = rotl32(x15 ^ x3, 16) - add v0.4s, v0.4s, v4.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - - eor v12.16b, v12.16b, v0.16b - eor v13.16b, v13.16b, v1.16b - eor v14.16b, v14.16b, v2.16b - eor v15.16b, v15.16b, v3.16b - - rev32 v12.8h, v12.8h - rev32 v13.8h, v13.8h - rev32 v14.8h, v14.8h - rev32 v15.8h, v15.8h - - // x8 += x12, x4 = rotl32(x4 ^ x8, 12) - // x9 += x13, x5 = rotl32(x5 ^ x9, 12) - // x10 += x14, x6 = rotl32(x6 ^ x10, 12) - // x11 += x15, x7 = rotl32(x7 ^ x11, 12) - add v8.4s, v8.4s, v12.4s - add v9.4s, v9.4s, v13.4s - add v10.4s, v10.4s, v14.4s - add v11.4s, v11.4s, v15.4s - - eor v16.16b, v4.16b, v8.16b - eor v17.16b, v5.16b, v9.16b - eor v18.16b, v6.16b, v10.16b - eor v19.16b, v7.16b, v11.16b - - shl v4.4s, v16.4s, #12 - shl v5.4s, v17.4s, #12 - shl v6.4s, v18.4s, #12 - shl v7.4s, v19.4s, #12 - - sri v4.4s, v16.4s, #20 - sri v5.4s, v17.4s, #20 - sri v6.4s, v18.4s, #20 - sri v7.4s, v19.4s, #20 - - // x0 += x4, x12 = rotl32(x12 ^ x0, 8) - // x1 += x5, x13 = rotl32(x13 ^ x1, 8) - // x2 += x6, x14 = rotl32(x14 ^ x2, 8) - // x3 += x7, x15 = rotl32(x15 ^ x3, 8) - add v0.4s, v0.4s, v4.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - - eor v12.16b, v12.16b, v0.16b - eor v13.16b, v13.16b, v1.16b - eor v14.16b, v14.16b, v2.16b - eor v15.16b, v15.16b, v3.16b - - tbl v12.16b, {v12.16b}, v31.16b - tbl v13.16b, {v13.16b}, v31.16b - tbl v14.16b, {v14.16b}, v31.16b - tbl v15.16b, {v15.16b}, v31.16b - - // x8 += x12, x4 = rotl32(x4 ^ x8, 7) - // x9 += x13, x5 = rotl32(x5 ^ x9, 7) - // x10 += x14, x6 = rotl32(x6 ^ x10, 7) - // x11 += x15, x7 = rotl32(x7 ^ x11, 7) - add v8.4s, v8.4s, v12.4s - add v9.4s, v9.4s, v13.4s - add v10.4s, v10.4s, v14.4s - add v11.4s, v11.4s, v15.4s - - eor v16.16b, v4.16b, v8.16b - eor v17.16b, v5.16b, v9.16b - eor v18.16b, v6.16b, v10.16b - eor v19.16b, v7.16b, v11.16b - - shl v4.4s, v16.4s, #7 - shl v5.4s, v17.4s, #7 - shl v6.4s, v18.4s, #7 - shl v7.4s, v19.4s, #7 - - sri v4.4s, v16.4s, #25 - sri v5.4s, v17.4s, #25 - sri v6.4s, v18.4s, #25 - sri v7.4s, v19.4s, #25 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 16) - // x1 += x6, x12 = rotl32(x12 ^ x1, 16) - // x2 += x7, x13 = rotl32(x13 ^ x2, 16) - // x3 += x4, x14 = rotl32(x14 ^ x3, 16) - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v4.4s - - eor v15.16b, v15.16b, v0.16b - eor v12.16b, v12.16b, v1.16b - eor v13.16b, v13.16b, v2.16b - eor v14.16b, v14.16b, v3.16b - - rev32 v15.8h, v15.8h - rev32 v12.8h, v12.8h - rev32 v13.8h, v13.8h - rev32 v14.8h, v14.8h - - // x10 += x15, x5 = rotl32(x5 ^ x10, 12) - // x11 += x12, x6 = rotl32(x6 ^ x11, 12) - // x8 += x13, x7 = rotl32(x7 ^ x8, 12) - // x9 += x14, x4 = rotl32(x4 ^ x9, 12) - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v12.4s - add v8.4s, v8.4s, v13.4s - add v9.4s, v9.4s, v14.4s - - eor v16.16b, v5.16b, v10.16b - eor v17.16b, v6.16b, v11.16b - eor v18.16b, v7.16b, v8.16b - eor v19.16b, v4.16b, v9.16b - - shl v5.4s, v16.4s, #12 - shl v6.4s, v17.4s, #12 - shl v7.4s, v18.4s, #12 - shl v4.4s, v19.4s, #12 - - 
sri v5.4s, v16.4s, #20 - sri v6.4s, v17.4s, #20 - sri v7.4s, v18.4s, #20 - sri v4.4s, v19.4s, #20 - - // x0 += x5, x15 = rotl32(x15 ^ x0, 8) - // x1 += x6, x12 = rotl32(x12 ^ x1, 8) - // x2 += x7, x13 = rotl32(x13 ^ x2, 8) - // x3 += x4, x14 = rotl32(x14 ^ x3, 8) - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v4.4s - - eor v15.16b, v15.16b, v0.16b - eor v12.16b, v12.16b, v1.16b - eor v13.16b, v13.16b, v2.16b - eor v14.16b, v14.16b, v3.16b - - tbl v15.16b, {v15.16b}, v31.16b - tbl v12.16b, {v12.16b}, v31.16b - tbl v13.16b, {v13.16b}, v31.16b - tbl v14.16b, {v14.16b}, v31.16b - - // x10 += x15, x5 = rotl32(x5 ^ x10, 7) - // x11 += x12, x6 = rotl32(x6 ^ x11, 7) - // x8 += x13, x7 = rotl32(x7 ^ x8, 7) - // x9 += x14, x4 = rotl32(x4 ^ x9, 7) - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v12.4s - add v8.4s, v8.4s, v13.4s - add v9.4s, v9.4s, v14.4s - - eor v16.16b, v5.16b, v10.16b - eor v17.16b, v6.16b, v11.16b - eor v18.16b, v7.16b, v8.16b - eor v19.16b, v4.16b, v9.16b - - shl v5.4s, v16.4s, #7 - shl v6.4s, v17.4s, #7 - shl v7.4s, v18.4s, #7 - shl v4.4s, v19.4s, #7 - - sri v5.4s, v16.4s, #25 - sri v6.4s, v17.4s, #25 - sri v7.4s, v18.4s, #25 - sri v4.4s, v19.4s, #25 - - subs x3, x3, #1 - b.ne .Ldoubleround4 - - ld4r {v16.4s-v19.4s}, [x0], #16 - ld4r {v20.4s-v23.4s}, [x0], #16 - - // x12 += counter values 0-3 - add v12.4s, v12.4s, v30.4s - - // x0[0-3] += s0[0] - // x1[0-3] += s0[1] - // x2[0-3] += s0[2] - // x3[0-3] += s0[3] - add v0.4s, v0.4s, v16.4s - add v1.4s, v1.4s, v17.4s - add v2.4s, v2.4s, v18.4s - add v3.4s, v3.4s, v19.4s - - ld4r {v24.4s-v27.4s}, [x0], #16 - ld4r {v28.4s-v31.4s}, [x0] - - // x4[0-3] += s1[0] - // x5[0-3] += s1[1] - // x6[0-3] += s1[2] - // x7[0-3] += s1[3] - add v4.4s, v4.4s, v20.4s - add v5.4s, v5.4s, v21.4s - add v6.4s, v6.4s, v22.4s - add v7.4s, v7.4s, v23.4s - - // x8[0-3] += s2[0] - // x9[0-3] += s2[1] - // x10[0-3] += s2[2] - // x11[0-3] += s2[3] - add v8.4s, v8.4s, v24.4s - add v9.4s, v9.4s, v25.4s - add v10.4s, v10.4s, v26.4s - add v11.4s, v11.4s, v27.4s - - // x12[0-3] += s3[0] - // x13[0-3] += s3[1] - // x14[0-3] += s3[2] - // x15[0-3] += s3[3] - add v12.4s, v12.4s, v28.4s - add v13.4s, v13.4s, v29.4s - add v14.4s, v14.4s, v30.4s - add v15.4s, v15.4s, v31.4s - - // interleave 32-bit words in state n, n+1 - zip1 v16.4s, v0.4s, v1.4s - zip2 v17.4s, v0.4s, v1.4s - zip1 v18.4s, v2.4s, v3.4s - zip2 v19.4s, v2.4s, v3.4s - zip1 v20.4s, v4.4s, v5.4s - zip2 v21.4s, v4.4s, v5.4s - zip1 v22.4s, v6.4s, v7.4s - zip2 v23.4s, v6.4s, v7.4s - zip1 v24.4s, v8.4s, v9.4s - zip2 v25.4s, v8.4s, v9.4s - zip1 v26.4s, v10.4s, v11.4s - zip2 v27.4s, v10.4s, v11.4s - zip1 v28.4s, v12.4s, v13.4s - zip2 v29.4s, v12.4s, v13.4s - zip1 v30.4s, v14.4s, v15.4s - zip2 v31.4s, v14.4s, v15.4s - - // interleave 64-bit words in state n, n+2 - zip1 v0.2d, v16.2d, v18.2d - zip2 v4.2d, v16.2d, v18.2d - zip1 v8.2d, v17.2d, v19.2d - zip2 v12.2d, v17.2d, v19.2d - ld1 {v16.16b-v19.16b}, [x2], #64 - - zip1 v1.2d, v20.2d, v22.2d - zip2 v5.2d, v20.2d, v22.2d - zip1 v9.2d, v21.2d, v23.2d - zip2 v13.2d, v21.2d, v23.2d - ld1 {v20.16b-v23.16b}, [x2], #64 - - zip1 v2.2d, v24.2d, v26.2d - zip2 v6.2d, v24.2d, v26.2d - zip1 v10.2d, v25.2d, v27.2d - zip2 v14.2d, v25.2d, v27.2d - ld1 {v24.16b-v27.16b}, [x2], #64 - - zip1 v3.2d, v28.2d, v30.2d - zip2 v7.2d, v28.2d, v30.2d - zip1 v11.2d, v29.2d, v31.2d - zip2 v15.2d, v29.2d, v31.2d - ld1 {v28.16b-v31.16b}, [x2] - - // xor with corresponding input, write to output - eor v16.16b, v16.16b, v0.16b - eor v17.16b, 
v17.16b, v1.16b - eor v18.16b, v18.16b, v2.16b - eor v19.16b, v19.16b, v3.16b - eor v20.16b, v20.16b, v4.16b - eor v21.16b, v21.16b, v5.16b - st1 {v16.16b-v19.16b}, [x1], #64 - eor v22.16b, v22.16b, v6.16b - eor v23.16b, v23.16b, v7.16b - eor v24.16b, v24.16b, v8.16b - eor v25.16b, v25.16b, v9.16b - st1 {v20.16b-v23.16b}, [x1], #64 - eor v26.16b, v26.16b, v10.16b - eor v27.16b, v27.16b, v11.16b - eor v28.16b, v28.16b, v12.16b - st1 {v24.16b-v27.16b}, [x1], #64 - eor v29.16b, v29.16b, v13.16b - eor v30.16b, v30.16b, v14.16b - eor v31.16b, v31.16b, v15.16b - st1 {v28.16b-v31.16b}, [x1] - - ret -ENDPROC(chacha20_4block_xor_neon) - -CTRINC: .word 0, 1, 2, 3 -ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c @@ -1,133 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions - * - * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Based on: - * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <crypto/algapi.h> -#include <crypto/chacha20.h> -#include <crypto/internal/skcipher.h> -#include <linux/kernel.h> -#include <linux/module.h> - -#include <asm/hwcap.h> -#include <asm/neon.h> -#include <asm/simd.h> - -asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src); -asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src); - -static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src, - unsigned int bytes) -{ - u8 buf[CHACHA20_BLOCK_SIZE]; - - while (bytes >= CHACHA20_BLOCK_SIZE * 4) { - kernel_neon_begin(); - chacha20_4block_xor_neon(state, dst, src); - kernel_neon_end(); - bytes -= CHACHA20_BLOCK_SIZE * 4; - src += CHACHA20_BLOCK_SIZE * 4; - dst += CHACHA20_BLOCK_SIZE * 4; - state[12] += 4; - } - - if (!bytes) - return; - - kernel_neon_begin(); - while (bytes >= CHACHA20_BLOCK_SIZE) { - chacha20_block_xor_neon(state, dst, src); - bytes -= CHACHA20_BLOCK_SIZE; - src += CHACHA20_BLOCK_SIZE; - dst += CHACHA20_BLOCK_SIZE; - state[12]++; - } - if (bytes) { - memcpy(buf, src, bytes); - chacha20_block_xor_neon(state, buf, buf); - memcpy(dst, buf, bytes); - } - kernel_neon_end(); -} - -static int chacha20_neon(struct skcipher_request *req) -{ - struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); - struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - u32 state[16]; - int err; - - if (!may_use_simd() || req->cryptlen <= CHACHA20_BLOCK_SIZE) - return crypto_chacha20_crypt(req); - - err = skcipher_walk_virt(&walk, req, false); - - crypto_chacha20_init(state, ctx, walk.iv); - - while (walk.nbytes > 0) { - unsigned int nbytes = walk.nbytes; - - if (nbytes < walk.total) - nbytes = round_down(nbytes, walk.stride); - - chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr, - nbytes); - err = skcipher_walk_done(&walk, walk.nbytes - nbytes); - } - - return err; -} - -static struct skcipher_alg alg = { - .base.cra_name = "chacha20", - .base.cra_driver_name = 
"chacha20-neon", - .base.cra_priority = 300, - .base.cra_blocksize = 1, - .base.cra_ctxsize = sizeof(struct chacha20_ctx), - .base.cra_module = THIS_MODULE, - - .min_keysize = CHACHA20_KEY_SIZE, - .max_keysize = CHACHA20_KEY_SIZE, - .ivsize = CHACHA20_IV_SIZE, - .chunksize = CHACHA20_BLOCK_SIZE, - .walksize = 4 * CHACHA20_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, - .encrypt = chacha20_neon, - .decrypt = chacha20_neon, -}; - -static int __init chacha20_simd_mod_init(void) -{ - if (!(elf_hwcap & HWCAP_ASIMD)) - return -ENODEV; - - return crypto_register_skcipher(&alg); -} - -static void __exit chacha20_simd_mod_fini(void) -{ - crypto_unregister_skcipher(&alg); -} - -module_init(chacha20_simd_mod_init); -module_exit(chacha20_simd_mod_fini); - -MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>"); -MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_CRYPTO("chacha20"); diff --git a/arch/arm64/crypto/nh-neon-core.S b/arch/arm64/crypto/nh-neon-core.S @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * NH - ε-almost-universal hash function, ARM64 NEON accelerated version + * + * Copyright 2018 Google LLC + * + * Author: Eric Biggers <ebiggers@google.com> + */ + +#include <linux/linkage.h> + + KEY .req x0 + MESSAGE .req x1 + MESSAGE_LEN .req x2 + HASH .req x3 + + PASS0_SUMS .req v0 + PASS1_SUMS .req v1 + PASS2_SUMS .req v2 + PASS3_SUMS .req v3 + K0 .req v4 + K1 .req v5 + K2 .req v6 + K3 .req v7 + T0 .req v8 + T1 .req v9 + T2 .req v10 + T3 .req v11 + T4 .req v12 + T5 .req v13 + T6 .req v14 + T7 .req v15 + +.macro _nh_stride k0, k1, k2, k3 + + // Load next message stride + ld1 {T3.16b}, [MESSAGE], #16 + + // Load next key stride + ld1 {\k3\().4s}, [KEY], #16 + + // Add message words to key words + add T0.4s, T3.4s, \k0\().4s + add T1.4s, T3.4s, \k1\().4s + add T2.4s, T3.4s, \k2\().4s + add T3.4s, T3.4s, \k3\().4s + + // Multiply 32x32 => 64 and accumulate + mov T4.d[0], T0.d[1] + mov T5.d[0], T1.d[1] + mov T6.d[0], T2.d[1] + mov T7.d[0], T3.d[1] + umlal PASS0_SUMS.2d, T0.2s, T4.2s + umlal PASS1_SUMS.2d, T1.2s, T5.2s + umlal PASS2_SUMS.2d, T2.2s, T6.2s + umlal PASS3_SUMS.2d, T3.2s, T7.2s +.endm + +/* + * void nh_neon(const u32 *key, const u8 *message, size_t message_len, + * u8 hash[NH_HASH_BYTES]) + * + * It's guaranteed that message_len % 16 == 0. 
+ */ +ENTRY(nh_neon) + + ld1 {K0.4s,K1.4s}, [KEY], #32 + movi PASS0_SUMS.2d, #0 + movi PASS1_SUMS.2d, #0 + ld1 {K2.4s}, [KEY], #16 + movi PASS2_SUMS.2d, #0 + movi PASS3_SUMS.2d, #0 + + subs MESSAGE_LEN, MESSAGE_LEN, #64 + blt .Lloop4_done +.Lloop4: + _nh_stride K0, K1, K2, K3 + _nh_stride K1, K2, K3, K0 + _nh_stride K2, K3, K0, K1 + _nh_stride K3, K0, K1, K2 + subs MESSAGE_LEN, MESSAGE_LEN, #64 + bge .Lloop4 + +.Lloop4_done: + ands MESSAGE_LEN, MESSAGE_LEN, #63 + beq .Ldone + _nh_stride K0, K1, K2, K3 + + subs MESSAGE_LEN, MESSAGE_LEN, #16 + beq .Ldone + _nh_stride K1, K2, K3, K0 + + subs MESSAGE_LEN, MESSAGE_LEN, #16 + beq .Ldone + _nh_stride K2, K3, K0, K1 + +.Ldone: + // Sum the accumulators for each pass, then store the sums to 'hash' + addp T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d + addp T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d + st1 {T0.16b,T1.16b}, [HASH] + ret +ENDPROC(nh_neon) diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum + * (ARM64 NEON accelerated version) + * + * Copyright 2018 Google LLC + */ + +#include <asm/neon.h> +#include <asm/simd.h> +#include <crypto/internal/hash.h> +#include <crypto/nhpoly1305.h> +#include <linux/module.h> + +asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len, + u8 hash[NH_HASH_BYTES]); + +/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */ +static void _nh_neon(const u32 *key, const u8 *message, size_t message_len, + __le64 hash[NH_NUM_PASSES]) +{ + nh_neon(key, message, message_len, (u8 *)hash); +} + +static int nhpoly1305_neon_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + if (srclen < 64 || !may_use_simd()) + return crypto_nhpoly1305_update(desc, src, srclen); + + do { + unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE); + + kernel_neon_begin(); + crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon); + kernel_neon_end(); + src += n; + srclen -= n; + } while (srclen); + return 0; +} + +static struct shash_alg nhpoly1305_alg = { + .base.cra_name = "nhpoly1305", + .base.cra_driver_name = "nhpoly1305-neon", + .base.cra_priority = 200, + .base.cra_ctxsize = sizeof(struct nhpoly1305_key), + .base.cra_module = THIS_MODULE, + .digestsize = POLY1305_DIGEST_SIZE, + .init = crypto_nhpoly1305_init, + .update = nhpoly1305_neon_update, + .final = crypto_nhpoly1305_final, + .setkey = crypto_nhpoly1305_setkey, + .descsize = sizeof(struct nhpoly1305_state), +}; + +static int __init nhpoly1305_mod_init(void) +{ + if (!(elf_hwcap & HWCAP_ASIMD)) + return -ENODEV; + + return crypto_register_shash(&nhpoly1305_alg); +} + +static void __exit nhpoly1305_mod_exit(void) +{ + crypto_unregister_shash(&nhpoly1305_alg); +} + +module_init(nhpoly1305_mod_init); +module_exit(nhpoly1305_mod_exit); + +MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>"); +MODULE_ALIAS_CRYPTO("nhpoly1305"); +MODULE_ALIAS_CRYPTO("nhpoly1305-neon"); diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c @@ -137,7 +137,7 @@ static int fallback_init_cip(struct crypto_tfm *tfm) struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); sctx->fallback.cip = crypto_alloc_cipher(name, 0, - CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK); + CRYPTO_ALG_NEED_FALLBACK); if (IS_ERR(sctx->fallback.cip)) { pr_err("Allocating AES fallback algorithm 
%s failed\n", diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c @@ -476,11 +476,6 @@ static bool __init sparc64_has_aes_opcode(void) static int __init aes_sparc64_mod_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(algs); i++) - INIT_LIST_HEAD(&algs[i].cra_list); - if (sparc64_has_aes_opcode()) { pr_info("Using sparc64 aes opcodes optimized AES implementation\n"); return crypto_register_algs(algs, ARRAY_SIZE(algs)); diff --git a/arch/sparc/crypto/camellia_glue.c b/arch/sparc/crypto/camellia_glue.c @@ -299,11 +299,6 @@ static bool __init sparc64_has_camellia_opcode(void) static int __init camellia_sparc64_mod_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(algs); i++) - INIT_LIST_HEAD(&algs[i].cra_list); - if (sparc64_has_camellia_opcode()) { pr_info("Using sparc64 camellia opcodes optimized CAMELLIA implementation\n"); return crypto_register_algs(algs, ARRAY_SIZE(algs)); diff --git a/arch/sparc/crypto/des_glue.c b/arch/sparc/crypto/des_glue.c @@ -510,11 +510,6 @@ static bool __init sparc64_has_des_opcode(void) static int __init des_sparc64_mod_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(algs); i++) - INIT_LIST_HEAD(&algs[i].cra_list); - if (sparc64_has_des_opcode()) { pr_info("Using sparc64 des opcodes optimized DES implementation\n"); return crypto_register_algs(algs, ARRAY_SIZE(algs)); diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile @@ -8,6 +8,7 @@ OBJECT_FILES_NON_STANDARD := y avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no) avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\ $(comma)4)$(comma)%ymm2,yes,no) +avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no) sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no) sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no) @@ -23,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o -obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o +obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o @@ -46,6 +47,9 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o +obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o +obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o + # These modules require assembler to support AVX. 
ifeq ($(avx_supported),yes) obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \ @@ -74,7 +78,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o -chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o +chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o @@ -84,6 +88,8 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o +nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o + ifeq ($(avx_supported),yes) camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ camellia_aesni_avx_glue.o @@ -97,10 +103,16 @@ endif ifeq ($(avx2_supported),yes) camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o - chacha20-x86_64-y += chacha20-avx2-x86_64.o + chacha-x86_64-y += chacha-avx2-x86_64.o serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o + + nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o +endif + +ifeq ($(avx512_supported),yes) + chacha-x86_64-y += chacha-avx512vl-x86_64.o endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -182,43 +182,30 @@ aad_shift_arr: .text -##define the fields of the gcm aes context -#{ -# u8 expanded_keys[16*11] store expanded keys -# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here -# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here -# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here -# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here -# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here -# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here -# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here -# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here -# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes) -# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes) -#} gcm_ctx# - -HashKey = 16*11 # store HashKey <<1 mod poly here -HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here -HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here -HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here -HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here -HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here -HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here -HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here -HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) -HashKey_2_k = 16*20 # store XOR of 
HashKey^2 <<1 mod poly here (for Karatsuba purposes) -HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) -HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) -HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) -HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) -HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) -HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) +#define AadHash 16*0 +#define AadLen 16*1 +#define InLen (16*1)+8 +#define PBlockEncKey 16*2 +#define OrigIV 16*3 +#define CurCount 16*4 +#define PBlockLen 16*5 + +HashKey = 16*6 # store HashKey <<1 mod poly here +HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here +HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here +HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here +HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here +HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here +HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here +HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here +HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) +HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) #define arg1 %rdi #define arg2 %rsi @@ -229,6 +216,8 @@ HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsu #define arg7 STACK_OFFSET+8*1(%r14) #define arg8 STACK_OFFSET+8*2(%r14) #define arg9 STACK_OFFSET+8*3(%r14) +#define arg10 STACK_OFFSET+8*4(%r14) +#define keysize 2*15*16(arg1) i = 0 j = 0 @@ -267,1356 +256,1619 @@ VARIABLE_OFFSET = 16*8 # Utility Macros ################################ +.macro FUNC_SAVE + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes +.endm + +.macro FUNC_RESTORE + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 +.endm + # Encryption of a single block -.macro ENCRYPT_SINGLE_BLOCK XMM0 +.macro ENCRYPT_SINGLE_BLOCK REP XMM0 vpxor (arg1), \XMM0, \XMM0 - i = 1 - setreg -.rep 9 + i = 1 + setreg +.rep \REP vaesenc 16*i(arg1), \XMM0, \XMM0 - i = (i+1) - setreg + i = (i+1) + setreg .endr - vaesenclast 16*10(arg1), \XMM0, \XMM0 + vaesenclast 16*i(arg1), \XMM0, \XMM0 .endm -#ifdef CONFIG_AS_AVX -############################################################################### -# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) -# Input: A and B (128-bits each, bit-reflected) -# Output: C = A*B*x mod poly, (i.e. >>1 ) -# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input -# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
-############################################################################### -.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 +# combined for GCM encrypt and decrypt functions +# clobbering all xmm registers +# clobbering r10, r11, r12, r13, r14, r15 +.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP + vmovdqu AadHash(arg2), %xmm8 + vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey + add arg5, InLen(arg2) - vpshufd $0b01001110, \GH, \T2 - vpshufd $0b01001110, \HK, \T3 - vpxor \GH , \T2, \T2 # T2 = (a1+a0) - vpxor \HK , \T3, \T3 # T3 = (b1+b0) + # initialize the data pointer offset as zero + xor %r11d, %r11d - vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 - vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 - vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) - vpxor \GH, \T2,\T2 - vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 + PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC + sub %r11, arg5 - vpslldq $8, \T2,\T3 # shift-L T3 2 DWs - vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs - vpxor \T3, \GH, \GH - vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK + mov arg5, %r13 # save the number of bytes of plaintext/ciphertext + and $-16, %r13 # r13 = r13 - (r13 mod 16) - #first phase of the reduction - vpslld $31, \GH, \T2 # packed right shifting << 31 - vpslld $30, \GH, \T3 # packed right shifting shift << 30 - vpslld $25, \GH, \T4 # packed right shifting shift << 25 + mov %r13, %r12 + shr $4, %r12 + and $7, %r12 + jz _initial_num_blocks_is_0\@ - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 + cmp $7, %r12 + je _initial_num_blocks_is_7\@ + cmp $6, %r12 + je _initial_num_blocks_is_6\@ + cmp $5, %r12 + je _initial_num_blocks_is_5\@ + cmp $4, %r12 + je _initial_num_blocks_is_4\@ + cmp $3, %r12 + je _initial_num_blocks_is_3\@ + cmp $2, %r12 + je _initial_num_blocks_is_2\@ - vpsrldq $4, \T2, \T5 # shift-R T5 1 DW + jmp _initial_num_blocks_is_1\@ - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \GH, \GH # first phase of the reduction complete +_initial_num_blocks_is_7\@: + \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*7, %r13 + jmp _initial_blocks_encrypted\@ - #second phase of the reduction +_initial_num_blocks_is_6\@: + \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*6, %r13 + jmp _initial_blocks_encrypted\@ - vpsrld $1,\GH, \T2 # packed left shifting >> 1 - vpsrld $2,\GH, \T3 # packed left shifting >> 2 - vpsrld $7,\GH, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 +_initial_num_blocks_is_5\@: + \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*5, %r13 + jmp _initial_blocks_encrypted\@ - vpxor \T5, \T2, \T2 - vpxor \T2, \GH, \GH - vpxor \T1, \GH, \GH # the result is in GH +_initial_num_blocks_is_4\@: + \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*4, %r13 + jmp _initial_blocks_encrypted\@ +_initial_num_blocks_is_3\@: + \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*3, %r13 + jmp _initial_blocks_encrypted\@ -.endm 
+_initial_num_blocks_is_2\@: + \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*2, %r13 + jmp _initial_blocks_encrypted\@ -.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 +_initial_num_blocks_is_1\@: + \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*1, %r13 + jmp _initial_blocks_encrypted\@ - # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vmovdqa \HK, \T5 +_initial_num_blocks_is_0\@: + \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_k(arg1) - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_2_k(arg1) +_initial_blocks_encrypted\@: + cmp $0, %r13 + je _zero_cipher_left\@ - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqa \T5, HashKey_3(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_3_k(arg1) + sub $128, %r13 + je _eight_cipher_left\@ - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqa \T5, HashKey_4(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_4_k(arg1) - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqa \T5, HashKey_5(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_5_k(arg1) - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqa \T5, HashKey_6(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_6_k(arg1) - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqa \T5, HashKey_7(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_7_k(arg1) + vmovd %xmm9, %r15d + and $255, %r15d + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqa \T5, HashKey_8(arg1) - vpshufd $0b01001110, \T5, \T1 - vpxor \T5, \T1, \T1 - vmovdqa \T1, HashKey_8_k(arg1) -.endm +_encrypt_by_8_new\@: + cmp $(255-8), %r15d + jg _encrypt_by_8\@ -## if a = number of total plaintext bytes -## b = floor(a/16) -## num_initial_blocks = b mod 4# -## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext -## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, r14 are used as a pointer only, not modified -.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC - i = (8-\num_initial_blocks) - j = 0 - setreg - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen + add $8, %r15b + \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + jmp _eight_cipher_left\@ - mov %r12, %r11 +_encrypt_by_8\@: + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $8, %r15b + \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, 
%xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ - vpxor reg_j, reg_j, reg_j - vpxor reg_i, reg_i, reg_i - cmp $16, %r11 - jl _get_AAD_rest8\@ -_get_AAD_blocks\@: - vmovdqu (%r10), reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 - add $16, %r10 - sub $16, %r12 - sub $16, %r11 - cmp $16, %r11 - jge _get_AAD_blocks\@ - vmovdqu reg_j, reg_i - cmp $0, %r11 - je _get_AAD_done\@ + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - vpxor reg_i, reg_i, reg_i - /* read the last <16B of AAD. since we have at least 4B of - data right after the AAD (the ICV, and maybe some CT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -_get_AAD_rest8\@: - cmp $4, %r11 - jle _get_AAD_rest4\@ - movq (%r10), \T1 - add $8, %r10 - sub $8, %r11 - vpslldq $8, \T1, \T1 - vpsrldq $8, reg_i, reg_i - vpxor \T1, reg_i, reg_i - jmp _get_AAD_rest8\@ -_get_AAD_rest4\@: - cmp $0, %r11 - jle _get_AAD_rest0\@ - mov (%r10), %eax - movq %rax, \T1 - add $4, %r10 - sub $4, %r11 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i -_get_AAD_rest0\@: - /* finalize: shift out the extra bytes we read, and align - left. since pslldq can only shift by an immediate, we use - vpshufb and an array of shuffle masks */ - movq %r12, %r11 - salq $4, %r11 - movdqu aad_shift_arr(%r11), \T1 - vpshufb \T1, reg_i, reg_i -_get_AAD_rest_final\@: - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_j, reg_i, reg_i - GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 -_get_AAD_done\@: - # initialize the data pointer offset as zero - xor %r11d, %r11d - # start AES for num_initial_blocks blocks - mov arg5, %rax # rax = *Y0 - vmovdqu (%rax), \CTR # CTR = Y0 - vpshufb SHUF_MASK(%rip), \CTR, \CTR +_eight_cipher_left\@: + \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap - i = (i+1) - setreg -.endr +_zero_cipher_left\@: + vmovdqu %xmm14, AadHash(arg2) + vmovdqu %xmm9, CurCount(arg2) - vmovdqa (arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vpxor \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr + # check for 0 length + mov arg5, %r13 + and $15, %r13 # r13 = (arg5 mod 16) - j = 1 - setreg -.rep 9 - vmovdqa 16*j(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenc \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr + je _multiple_of_16_bytes\@ - j = (j+1) - setreg -.endr + # handle the last <16 Byte block separately + mov %r13, PBlockLen(arg2) - vmovdqa 16*10(arg1), \T_key - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vaesenclast \T_key, reg_i, reg_i - i = (i+1) - setreg -.endr + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vmovdqu %xmm9, CurCount(arg2) + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - i = (9-\num_initial_blocks) - setreg -.rep \num_initial_blocks - vmovdqu (arg3, %r11), \T1 - vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks - add $16, %r11 -.if \ENC_DEC == DEC - vmovdqa \T1, reg_i -.endif - vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations 
- i = (i+1) - setreg -.endr + ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) + vmovdqu %xmm9, PBlockEncKey(arg2) + cmp $16, arg5 + jge _large_enough_update\@ - i = (8-\num_initial_blocks) - j = (9-\num_initial_blocks) - setreg + lea (arg4,%r11,1), %r10 + mov %r13, %r12 -.rep \num_initial_blocks - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks - i = (i+1) - j = (j+1) - setreg -.endr - # XMM8 has the combined result here + READ_PARTIAL_BLOCK %r10 %r12 %xmm1 - vmovdqa \XMM8, TMP1(%rsp) - vmovdqa \XMM8, \T3 + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) - cmp $128, %r13 - jl _initial_blocks_done\@ # no need for precomputed constants + jmp _final_ghash_mul\@ -############################################################################### -# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM1 - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap +_large_enough_update\@: + sub $16, %r11 + add %r13, %r11 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM2 - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + # receive the last <16 Byte block + vmovdqu (arg4, %r11, 1), %xmm1 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM3 - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + sub %r13, %r11 + add $16, %r11 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM4 - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + lea SHIFT_MASK+16(%rip), %r12 + # adjust the shuffle mask pointer to be able to shift 16-r13 bytes + # (r13 is the number of bytes in plaintext mod 16) + sub %r13, %r12 + # get the appropriate shuffle mask + vmovdqu (%r12), %xmm2 + # shift right 16-r13 bytes + vpshufb %xmm2, %xmm1, %xmm1 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM5 - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap +_final_ghash_mul\@: + .if \ENC_DEC == DEC + vmovdqa %xmm1, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to + # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm2, %xmm2 + vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm14, %xmm14 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM6 - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vmovdqu %xmm14, AadHash(arg2) + .else + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to + # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM7 - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vmovdqu %xmm14, AadHash(arg2) + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext + .endif - vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 - vmovdqa \CTR, \XMM8 - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - vmovdqa (arg1), \T_key - vpxor \T_key, \XMM1, \XMM1 - vpxor \T_key, \XMM2, \XMM2 - vpxor \T_key, \XMM3, \XMM3 - vpxor \T_key, \XMM4, \XMM4 - vpxor \T_key, \XMM5, \XMM5 - vpxor \T_key, \XMM6, \XMM6 - vpxor \T_key, \XMM7, 
\XMM7 - vpxor \T_key, \XMM8, \XMM8 + ############################# + # output r13 Bytes + vmovq %xmm9, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left\@ - i = 1 - setreg -.rep 9 # do 9 rounds - vmovdqa 16*i(arg1), \T_key - vaesenc \T_key, \XMM1, \XMM1 - vaesenc \T_key, \XMM2, \XMM2 - vaesenc \T_key, \XMM3, \XMM3 - vaesenc \T_key, \XMM4, \XMM4 - vaesenc \T_key, \XMM5, \XMM5 - vaesenc \T_key, \XMM6, \XMM6 - vaesenc \T_key, \XMM7, \XMM7 - vaesenc \T_key, \XMM8, \XMM8 - i = (i+1) - setreg -.endr + mov %rax, (arg3 , %r11) + add $8, %r11 + vpsrldq $8, %xmm9, %xmm9 + vmovq %xmm9, %rax + sub $8, %r13 +_less_than_8_bytes_left\@: + movb %al, (arg3 , %r11) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left\@ + ############################# - vmovdqa 16*i(arg1), \T_key - vaesenclast \T_key, \XMM1, \XMM1 - vaesenclast \T_key, \XMM2, \XMM2 - vaesenclast \T_key, \XMM3, \XMM3 - vaesenclast \T_key, \XMM4, \XMM4 - vaesenclast \T_key, \XMM5, \XMM5 - vaesenclast \T_key, \XMM6, \XMM6 - vaesenclast \T_key, \XMM7, \XMM7 - vaesenclast \T_key, \XMM8, \XMM8 +_multiple_of_16_bytes\@: +.endm - vmovdqu (arg3, %r11), \T1 - vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM1 - .endif - vmovdqu 16*1(arg3, %r11), \T1 - vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM2 - .endif +# GCM_COMPLETE Finishes update of tag of last partial block +# Output: Authorization Tag (AUTH_TAG) +# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 +.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN + vmovdqu AadHash(arg2), %xmm14 + vmovdqu HashKey(arg2), %xmm13 - vmovdqu 16*2(arg3, %r11), \T1 - vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM3 - .endif + mov PBlockLen(arg2), %r12 + cmp $0, %r12 + je _partial_done\@ - vmovdqu 16*3(arg3, %r11), \T1 - vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM4 - .endif + #GHASH computation for the last <16 Byte block + \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - vmovdqu 16*4(arg3, %r11), \T1 - vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM5 - .endif +_partial_done\@: + mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) + shl $3, %r12 # convert into number of bits + vmovd %r12d, %xmm15 # len(A) in xmm15 - vmovdqu 16*5(arg3, %r11), \T1 - vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM6 - .endif + mov InLen(arg2), %r12 + shl $3, %r12 # len(C) in bits (*128) + vmovq %r12, %xmm1 + vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 + vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) - vmovdqu 16*6(arg3, %r11), \T1 - vpxor \T1, \XMM7, \XMM7 - vmovdqu \XMM7, 16*6(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM7 - .endif + vpxor %xmm15, %xmm14, %xmm14 + \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation + vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap - vmovdqu 16*7(arg3, %r11), \T1 - vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg2 , %r11) - .if \ENC_DEC == DEC - vmovdqa \T1, \XMM8 - .endif + vmovdqu OrigIV(arg2), %xmm9 - add $128, %r11 + ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 
# perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + vpxor %xmm14, %xmm9, %xmm9 -############################################################################### -_initial_blocks_done\@: -.endm +_return_T\@: + mov \AUTH_TAG, %r10 # r10 = authTag + mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len -# encrypt 8 blocks at a time -# ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3 are used as pointers only, not modified -# r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC + cmp $16, %r11 + je _T_16\@ - vmovdqa \XMM1, \T2 - vmovdqa \XMM2, TMP2(%rsp) - vmovdqa \XMM3, TMP3(%rsp) - vmovdqa \XMM4, TMP4(%rsp) - vmovdqa \XMM5, TMP5(%rsp) - vmovdqa \XMM6, TMP6(%rsp) - vmovdqa \XMM7, TMP7(%rsp) - vmovdqa \XMM8, TMP8(%rsp) + cmp $8, %r11 + jl _T_4\@ -.if \loop_idx == in_order - vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONE(%rip), \XMM1, \XMM2 - vpaddd ONE(%rip), \XMM2, \XMM3 - vpaddd ONE(%rip), \XMM3, \XMM4 - vpaddd ONE(%rip), \XMM4, \XMM5 - vpaddd ONE(%rip), \XMM5, \XMM6 - vpaddd ONE(%rip), \XMM6, \XMM7 - vpaddd ONE(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR +_T_8\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + add $8, %r10 + sub $8, %r11 + vpsrldq $8, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_4\@: + vmovd %xmm9, %eax + mov %eax, (%r10) + add $4, %r10 + sub $4, %r11 + vpsrldq $4, %xmm9, %xmm9 + cmp $0, %r11 + je _return_T_done\@ +_T_123\@: + vmovd %xmm9, %eax + cmp $2, %r11 + jl _T_1\@ + mov %ax, (%r10) + cmp $2, %r11 + je _return_T_done\@ + add $2, %r10 + sar $16, %eax +_T_1\@: + mov %al, (%r10) + jmp _return_T_done\@ - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap -.else - vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT - vpaddd ONEf(%rip), \XMM1, \XMM2 - vpaddd ONEf(%rip), \XMM2, \XMM3 - vpaddd ONEf(%rip), \XMM3, \XMM4 - vpaddd ONEf(%rip), \XMM4, \XMM5 - vpaddd ONEf(%rip), \XMM5, \XMM6 - vpaddd ONEf(%rip), \XMM6, \XMM7 - vpaddd ONEf(%rip), \XMM7, \XMM8 - vmovdqa \XMM8, \CTR -.endif +_T_16\@: + vmovdqu %xmm9, (%r10) +_return_T_done\@: +.endm - ####################################################################### - - vmovdqu (arg1), \T1 - vpxor \T1, \XMM1, \XMM1 - vpxor \T1, \XMM2, \XMM2 - vpxor \T1, \XMM3, \XMM3 - vpxor \T1, \XMM4, \XMM4 - vpxor \T1, \XMM5, \XMM5 - vpxor \T1, \XMM6, \XMM6 - vpxor \T1, \XMM7, \XMM7 - vpxor \T1, \XMM8, \XMM8 - - ####################################################################### - +.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 + mov \AAD, %r10 # r10 = AAD + mov \AADLEN, %r12 # r12 = aadLen + mov %r12, %r11 - vmovdqu 16*1(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - 
vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + vpxor \T8, \T8, \T8 + vpxor \T7, \T7, \T7 + cmp $16, %r11 + jl _get_AAD_rest8\@ +_get_AAD_blocks\@: + vmovdqu (%r10), \T7 + vpshufb SHUF_MASK(%rip), \T7, \T7 + vpxor \T7, \T8, \T8 + \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 + add $16, %r10 + sub $16, %r12 + sub $16, %r11 + cmp $16, %r11 + jge _get_AAD_blocks\@ + vmovdqu \T8, \T7 + cmp $0, %r11 + je _get_AAD_done\@ - vmovdqu 16*2(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + vpxor \T7, \T7, \T7 + /* read the last <16B of AAD. since we have at least 4B of + data right after the AAD (the ICV, and maybe some CT), we can + read 4B/8B blocks safely, and then get rid of the extra stuff */ +_get_AAD_rest8\@: + cmp $4, %r11 + jle _get_AAD_rest4\@ + movq (%r10), \T1 + add $8, %r10 + sub $8, %r11 + vpslldq $8, \T1, \T1 + vpsrldq $8, \T7, \T7 + vpxor \T1, \T7, \T7 + jmp _get_AAD_rest8\@ +_get_AAD_rest4\@: + cmp $0, %r11 + jle _get_AAD_rest0\@ + mov (%r10), %eax + movq %rax, \T1 + add $4, %r10 + sub $4, %r11 + vpslldq $12, \T1, \T1 + vpsrldq $4, \T7, \T7 + vpxor \T1, \T7, \T7 +_get_AAD_rest0\@: + /* finalize: shift out the extra bytes we read, and align + left. since pslldq can only shift by an immediate, we use + vpshufb and an array of shuffle masks */ + movq %r12, %r11 + salq $4, %r11 + vmovdqu aad_shift_arr(%r11), \T1 + vpshufb \T1, \T7, \T7 +_get_AAD_rest_final\@: + vpshufb SHUF_MASK(%rip), \T7, \T7 + vpxor \T8, \T7, \T7 + \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 - ####################################################################### +_get_AAD_done\@: + vmovdqu \T7, AadHash(arg2) +.endm - vmovdqa HashKey_8(arg1), \T5 - vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 +.macro INIT GHASH_MUL PRECOMPUTE + mov arg6, %r11 + mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length + xor %r11d, %r11d + mov %r11, InLen(arg2) # ctx_data.in_length = 0 - vpshufd $0b01001110, \T2, \T6 - vpxor \T2, \T6, \T6 + mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 + mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 + mov arg3, %rax + movdqu (%rax), %xmm0 + movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv - vmovdqa HashKey_8_k(arg1), \T5 - vpclmulqdq $0x00, \T5, \T6, \T6 + vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 + movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv - vmovdqu 16*3(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP2(%rsp), \T1 - vmovdqa HashKey_7(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_7_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*4(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 + vmovdqu (arg4), %xmm6 # xmm6 = HashKey + vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 + 
############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey + vmovdqa %xmm6, %xmm2 + vpsllq $1, %xmm6, %xmm6 + vpsrlq $63, %xmm2, %xmm2 + vmovdqa %xmm2, %xmm1 + vpslldq $8, %xmm2, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + #reduction + vpshufd $0b00100100, %xmm1, %xmm2 + vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 + vpand POLY(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly ####################################################################### + vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly - vmovdqa TMP3(%rsp), \T1 - vmovdqa HashKey_6(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_6_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*5(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP4(%rsp), \T1 - vmovdqa HashKey_5(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_5_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*6(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - vmovdqa TMP5(%rsp), \T1 - vmovdqa HashKey_4(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_4_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 + CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 - vmovdqu 16*7(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP6(%rsp), \T1 - vmovdqa HashKey_3(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_3_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 + \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 +.endm - vmovdqu 16*8(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 +# Reads DLEN bytes starting at DPTR and stores in XMMDst +# where 0 < DLEN < 16 +# Clobbers %rax, DLEN +.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst + vpxor \XMMDst, \XMMDst, \XMMDst + + cmp $8, \DLEN + jl _read_lt8_\@ + mov (\DPTR), %rax + vpinsrq $0, %rax, \XMMDst, \XMMDst + sub $8, \DLEN + jz _done_read_partial_block_\@ + xor %eax, %eax +_read_next_byte_\@: + shl $8, %rax + mov 7(\DPTR, \DLEN, 1), %al + dec \DLEN + jnz _read_next_byte_\@ + vpinsrq $1, %rax, \XMMDst, \XMMDst + jmp _done_read_partial_block_\@ +_read_lt8_\@: + xor %eax, %eax 
+_read_next_byte_lt8_\@: + shl $8, %rax + mov -1(\DPTR, \DLEN, 1), %al + dec \DLEN + jnz _read_next_byte_lt8_\@ + vpinsrq $0, %rax, \XMMDst, \XMMDst +_done_read_partial_block_\@: +.endm - vmovdqa TMP7(%rsp), \T1 - vmovdqa HashKey_2(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 +# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks +# between update calls. +# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK +# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context +# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 +.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ + AAD_HASH ENC_DEC + mov PBlockLen(arg2), %r13 + cmp $0, %r13 + je _partial_block_done_\@ # Leave Macro if no partial blocks + # Read in input data without over reading + cmp $16, \PLAIN_CYPH_LEN + jl _fewer_than_16_bytes_\@ + vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm + jmp _data_read_\@ + +_fewer_than_16_bytes_\@: + lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 + mov \PLAIN_CYPH_LEN, %r12 + READ_PARTIAL_BLOCK %r10 %r12 %xmm1 + + mov PBlockLen(arg2), %r13 + +_data_read_\@: # Finished reading in data + + vmovdqu PBlockEncKey(arg2), %xmm9 + vmovdqu HashKey(arg2), %xmm13 + + lea SHIFT_MASK(%rip), %r12 + + # adjust the shuffle mask pointer to be able to shift r13 bytes + # r16-r13 is the number of bytes in plaintext mod 16) + add %r13, %r12 + vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask + vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes + +.if \ENC_DEC == DEC + vmovdqa %xmm1, %xmm3 + pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn) + + mov \PLAIN_CYPH_LEN, %r10 + add %r13, %r10 + # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling + sub $16, %r10 + # Determine if if partial block is not being filled and + # shift mask accordingly + jge _no_extra_mask_1_\@ + sub %r10, %r12 +_no_extra_mask_1_\@: + + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + # get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 + + vpand %xmm1, %xmm3, %xmm3 + vmovdqa SHUF_MASK(%rip), %xmm10 + vpshufb %xmm10, %xmm3, %xmm3 + vpshufb %xmm2, %xmm3, %xmm3 + vpxor %xmm3, \AAD_HASH, \AAD_HASH + + cmp $0, %r10 + jl _partial_incomplete_1_\@ + + # GHASH computation for the last <16 Byte block + \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + xor %eax,%eax + + mov %rax, PBlockLen(arg2) + jmp _dec_done_\@ +_partial_incomplete_1_\@: + add \PLAIN_CYPH_LEN, PBlockLen(arg2) +_dec_done_\@: + vmovdqu \AAD_HASH, AadHash(arg2) +.else + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + + mov \PLAIN_CYPH_LEN, %r10 + add %r13, %r10 + # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling + sub $16, %r10 + # Determine if if partial block is not being filled and + # shift mask accordingly + jge _no_extra_mask_2_\@ + sub %r10, %r12 +_no_extra_mask_2_\@: + + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + # get the appropriate mask to mask out bottom r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 + + vmovdqa SHUF_MASK(%rip), %xmm1 + vpshufb %xmm1, %xmm9, %xmm9 + vpshufb %xmm2, %xmm9, %xmm9 + vpxor %xmm9, \AAD_HASH, \AAD_HASH + + cmp $0, %r10 + jl _partial_incomplete_2_\@ + + # GHASH computation for the last <16 Byte block + \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + xor %eax,%eax + + mov %rax, PBlockLen(arg2) + jmp _encode_done_\@ 
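READ_PARTIAL_BLOCK above exists so that a 1..15-byte tail can be loaded into an XMM register without reading past the end of the input: when DLEN >= 8 it uses one 8-byte load for the low half and picks up the remaining bytes one at a time, otherwise everything is read byte by byte. A plain C sketch of the same contract, with zero padding of the unused bytes; the names are illustrative only:

	#include <assert.h>
	#include <stdint.h>
	#include <string.h>

	/* Load 0 < dlen < 16 bytes into a zero-padded 16-byte block without
	 * touching any byte beyond src + dlen.
	 */
	static void read_partial_block(uint8_t dst[16], const uint8_t *src, size_t dlen)
	{
		assert(dlen > 0 && dlen < 16);
		memset(dst, 0, 16);
		memcpy(dst, src, dlen);  /* reads exactly dlen bytes */
	}

	int main(void)
	{
		const uint8_t msg[5] = { 1, 2, 3, 4, 5 };
		uint8_t block[16];

		read_partial_block(block, msg, sizeof(msg));
		return (block[0] == 1 && block[15] == 0) ? 0 : 1;
	}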
+_partial_incomplete_2_\@: + add \PLAIN_CYPH_LEN, PBlockLen(arg2) +_encode_done_\@: + vmovdqu \AAD_HASH, AadHash(arg2) + + vmovdqa SHUF_MASK(%rip), %xmm10 + # shuffle xmm9 back to output as ciphertext + vpshufb %xmm10, %xmm9, %xmm9 + vpshufb %xmm2, %xmm9, %xmm9 +.endif + # output encrypted Bytes + cmp $0, %r10 + jl _partial_fill_\@ + mov %r13, %r12 + mov $16, %r13 + # Set r13 to be the number of bytes to write out + sub %r12, %r13 + jmp _count_set_\@ +_partial_fill_\@: + mov \PLAIN_CYPH_LEN, %r13 +_count_set_\@: + vmovdqa %xmm9, %xmm0 + vmovq %xmm0, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left_\@ + + mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) + add $8, \DATA_OFFSET + psrldq $8, %xmm0 + vmovq %xmm0, %rax + sub $8, %r13 +_less_than_8_bytes_left_\@: + movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) + add $1, \DATA_OFFSET + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left_\@ +_partial_block_done_\@: +.endm # PARTIAL_BLOCK - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_2_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 +#ifdef CONFIG_AS_AVX +############################################################################### +# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +# Input: A and B (128-bits each, bit-reflected) +# Output: C = A*B*x mod poly, (i.e. >>1 ) +# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +############################################################################### +.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 - ####################################################################### + vpshufd $0b01001110, \GH, \T2 + vpshufd $0b01001110, \HK, \T3 + vpxor \GH , \T2, \T2 # T2 = (a1+a0) + vpxor \HK , \T3, \T3 # T3 = (b1+b0) - vmovdqu 16*9(arg1), \T5 - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 + vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 + vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 + vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) + vpxor \GH, \T2,\T2 + vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 - vmovdqa TMP8(%rsp), \T1 - vmovdqa HashKey(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 + vpslldq $8, \T2,\T3 # shift-L T3 2 DWs + vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs + vpxor \T3, \GH, \GH + vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK - vpshufd $0b01001110, \T1, \T3 - vpxor \T1, \T3, \T3 - vmovdqa HashKey_k(arg1), \T5 - vpclmulqdq $0x10, \T5, \T3, \T3 - vpxor \T3, \T6, \T6 + #first phase of the reduction + vpslld $31, \GH, \T2 # packed right shifting << 31 + vpslld $30, \GH, \T3 # packed right shifting shift << 30 + vpslld $25, \GH, \T4 # packed right shifting shift << 25 - vpxor \T4, \T6, \T6 - vpxor \T7, \T6, \T6 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 - vmovdqu 16*10(arg1), \T5 + vpsrldq $4, \T2, \T5 # shift-R T5 1 DW - i = 0 - j = 1 - setreg -.rep 8 - vpxor 16*i(arg3, %r11), \T5, \T2 - .if \ENC_DEC == ENC - vaesenclast \T2, reg_j, reg_j - .else - vaesenclast \T2, reg_j, \T3 - vmovdqu 16*i(arg3, %r11), reg_j - vmovdqu \T3, 16*i(arg2, %r11) - .endif - i = (i+1) - j = (j+1) - setreg -.endr - ####################################################################### + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \GH, 
\GH # first phase of the reduction complete + #second phase of the reduction - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs - vpxor \T3, \T7, \T7 - vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 + vpsrld $1,\GH, \T2 # packed left shifting >> 1 + vpsrld $2,\GH, \T3 # packed left shifting >> 2 + vpsrld $7,\GH, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + vpxor \T5, \T2, \T2 + vpxor \T2, \GH, \GH + vpxor \T1, \GH, \GH # the result is in GH - ####################################################################### - #first phase of the reduction - ####################################################################### - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 +.endm - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 +.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW + # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vmovdqa \HK, \T5 - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer - .endif + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqu \T1, HashKey_k(arg2) - ####################################################################### - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - vpsrld $2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly + vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqu \T1, HashKey_2_k(arg2) - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 - ####################################################################### + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly + vmovdqu \T5, HashKey_3(arg2) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqu \T1, HashKey_3_k(arg2) - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly + vmovdqu \T5, HashKey_4(arg2) + vpshufd $0b01001110, \T5, \T1 + 
vpxor \T5, \T1, \T1 + vmovdqu \T1, HashKey_4_k(arg2) + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly + vmovdqu \T5, HashKey_5(arg2) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqu \T1, HashKey_5_k(arg2) - vpxor \T6, \XMM1, \XMM1 + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly + vmovdqu \T5, HashKey_6(arg2) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqu \T1, HashKey_6_k(arg2) + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly + vmovdqu \T5, HashKey_7(arg2) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqu \T1, HashKey_7_k(arg2) + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly + vmovdqu \T5, HashKey_8(arg2) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqu \T1, HashKey_8_k(arg2) .endm +## if a = number of total plaintext bytes +## b = floor(a/16) +## num_initial_blocks = b mod 4# +## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext +## r10, r11, r12, rax are clobbered +## arg1, arg3, arg4, r14 are used as a pointer only, not modified -# GHASH the last 4 ciphertext blocks. -.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 +.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC + i = (8-\num_initial_blocks) + setreg + vmovdqu AadHash(arg2), reg_i - ## Karatsuba Method + # start AES for num_initial_blocks blocks + vmovdqu CurCount(arg2), \CTR + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap + i = (i+1) + setreg +.endr - vpshufd $0b01001110, \XMM1, \T2 - vpxor \XMM1, \T2, \T2 - vmovdqa HashKey_8(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM1, \T6 - vpclmulqdq $0x00, \T5, \XMM1, \T7 + vmovdqa (arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpxor \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr - vmovdqa HashKey_8_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \XMM1 + j = 1 + setreg +.rep \REP + vmovdqa 16*j(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenc \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr - ###################### + j = (j+1) + setreg +.endr - vpshufd $0b01001110, \XMM2, \T2 - vpxor \XMM2, \T2, \T2 - vmovdqa HashKey_7(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 + vmovdqa 16*j(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenclast \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vmovdqu (arg4, %r11), \T1 + vpxor \T1, reg_i, reg_i + vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks + add $16, %r11 +.if \ENC_DEC == DEC + vmovdqa \T1, reg_i +.endif + vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations + i = (i+1) + setreg +.endr - vmovdqa HashKey_7_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 - ###################### + i = (8-\num_initial_blocks) + j = (9-\num_initial_blocks) + setreg - vpshufd $0b01001110, \XMM3, \T2 - vpxor \XMM3, \T2, \T2 - vmovdqa HashKey_6(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 +.rep \num_initial_blocks + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX 
reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks + i = (i+1) + j = (j+1) + setreg +.endr + # XMM8 has the combined result here - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 + vmovdqa \XMM8, TMP1(%rsp) + vmovdqa \XMM8, \T3 - vmovdqa HashKey_6_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 + cmp $128, %r13 + jl _initial_blocks_done\@ # no need for precomputed constants - ###################### +############################################################################### +# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM1 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufd $0b01001110, \XMM4, \T2 - vpxor \XMM4, \T2, \T2 - vmovdqa HashKey_5(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM2 + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM3 + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vmovdqa HashKey_5_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM4 + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - ###################### + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM5 + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufd $0b01001110, \XMM5, \T2 - vpxor \XMM5, \T2, \T2 - vmovdqa HashKey_4(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM6 + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM7 + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vmovdqa HashKey_4_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM8 + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - ###################### + vmovdqa (arg1), \T_key + vpxor \T_key, \XMM1, \XMM1 + vpxor \T_key, \XMM2, \XMM2 + vpxor \T_key, \XMM3, \XMM3 + vpxor \T_key, \XMM4, \XMM4 + vpxor \T_key, \XMM5, \XMM5 + vpxor \T_key, \XMM6, \XMM6 + vpxor \T_key, \XMM7, \XMM7 + vpxor \T_key, \XMM8, \XMM8 - vpshufd $0b01001110, \XMM6, \T2 - vpxor \XMM6, \T2, \T2 - vmovdqa HashKey_3(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 + i = 1 + setreg +.rep \REP # do REP rounds + vmovdqa 16*i(arg1), \T_key + vaesenc \T_key, \XMM1, \XMM1 + vaesenc \T_key, \XMM2, \XMM2 + vaesenc \T_key, \XMM3, \XMM3 + vaesenc \T_key, \XMM4, \XMM4 + vaesenc \T_key, \XMM5, \XMM5 + vaesenc \T_key, \XMM6, \XMM6 + vaesenc \T_key, \XMM7, \XMM7 + vaesenc \T_key, \XMM8, \XMM8 + i = (i+1) + setreg +.endr - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, \T7, \T7 + vmovdqa 16*i(arg1), \T_key + vaesenclast \T_key, \XMM1, \XMM1 + vaesenclast \T_key, \XMM2, \XMM2 + vaesenclast \T_key, \XMM3, \XMM3 + vaesenclast \T_key, \XMM4, \XMM4 + vaesenclast \T_key, \XMM5, \XMM5 + vaesenclast \T_key, \XMM6, \XMM6 + vaesenclast \T_key, \XMM7, \XMM7 + vaesenclast \T_key, \XMM8, \XMM8 - vmovdqa HashKey_3_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 + vmovdqu (arg4, %r11), \T1 + vpxor \T1, \XMM1, \XMM1 + 
vmovdqu \XMM1, (arg3 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM1 + .endif - ###################### + vmovdqu 16*1(arg4, %r11), \T1 + vpxor \T1, \XMM2, \XMM2 + vmovdqu \XMM2, 16*1(arg3 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM2 + .endif - vpshufd $0b01001110, \XMM7, \T2 - vpxor \XMM7, \T2, \T2 - vmovdqa HashKey_2(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 + vmovdqu 16*2(arg4, %r11), \T1 + vpxor \T1, \XMM3, \XMM3 + vmovdqu \XMM3, 16*2(arg3 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM3 + .endif - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 + vmovdqu 16*3(arg4, %r11), \T1 + vpxor \T1, \XMM4, \XMM4 + vmovdqu \XMM4, 16*3(arg3 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM4 + .endif - vmovdqa HashKey_2_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor \T2, \XMM1, \XMM1 + vmovdqu 16*4(arg4, %r11), \T1 + vpxor \T1, \XMM5, \XMM5 + vmovdqu \XMM5, 16*4(arg3 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM5 + .endif - ###################### + vmovdqu 16*5(arg4, %r11), \T1 + vpxor \T1, \XMM6, \XMM6 + vmovdqu \XMM6, 16*5(arg3 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM6 + .endif - vpshufd $0b01001110, \XMM8, \T2 - vpxor \XMM8, \T2, \T2 - vmovdqa HashKey(arg1), \T5 - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 + vmovdqu 16*6(arg4, %r11), \T1 + vpxor \T1, \XMM7, \XMM7 + vmovdqu \XMM7, 16*6(arg3 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM7 + .endif + + vmovdqu 16*7(arg4, %r11), \T1 + vpxor \T1, \XMM8, \XMM8 + vmovdqu \XMM8, 16*7(arg3 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM8 + .endif + + add $128, %r11 + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 +############################################################################### - vmovdqa HashKey_k(arg1), \T3 - vpclmulqdq $0x00, \T3, \T2, \T2 +_initial_blocks_done\@: - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 +.endm +# encrypt 8 blocks at a time +# ghash the 8 previously encrypted ciphertext blocks +# arg1, arg3, arg4 are used as pointers only, not modified +# r11 is the data offset value +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC + vmovdqa \XMM1, \T2 + vmovdqa \XMM2, TMP2(%rsp) + vmovdqa \XMM3, TMP3(%rsp) + vmovdqa \XMM4, TMP4(%rsp) + vmovdqa \XMM5, TMP5(%rsp) + vmovdqa \XMM6, TMP6(%rsp) + vmovdqa \XMM7, TMP7(%rsp) + vmovdqa \XMM8, TMP8(%rsp) +.if \loop_idx == in_order + vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONE(%rip), \XMM1, \XMM2 + vpaddd ONE(%rip), \XMM2, \XMM3 + vpaddd ONE(%rip), \XMM3, \XMM4 + vpaddd ONE(%rip), \XMM4, \XMM5 + vpaddd ONE(%rip), \XMM5, \XMM6 + vpaddd ONE(%rip), \XMM6, \XMM7 + vpaddd ONE(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb 
SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +.else + vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONEf(%rip), \XMM1, \XMM2 + vpaddd ONEf(%rip), \XMM2, \XMM3 + vpaddd ONEf(%rip), \XMM3, \XMM4 + vpaddd ONEf(%rip), \XMM4, \XMM5 + vpaddd ONEf(%rip), \XMM5, \XMM6 + vpaddd ONEf(%rip), \XMM6, \XMM7 + vpaddd ONEf(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR +.endif - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # <T6:T7> holds the result of - # the accumulated carry-less multiplications ####################################################################### - #first phase of the reduction - vpslld $31, \T7, \T2 # packed right shifting << 31 - vpslld $30, \T7, \T3 # packed right shifting shift << 30 - vpslld $25, \T7, \T4 # packed right shifting shift << 25 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - - vpsrldq $4, \T2, \T1 # shift-R T1 1 DW + vmovdqu (arg1), \T1 + vpxor \T1, \XMM1, \XMM1 + vpxor \T1, \XMM2, \XMM2 + vpxor \T1, \XMM3, \XMM3 + vpxor \T1, \XMM4, \XMM4 + vpxor \T1, \XMM5, \XMM5 + vpxor \T1, \XMM6, \XMM6 + vpxor \T1, \XMM7, \XMM7 + vpxor \T1, \XMM8, \XMM8 - vpslldq $12, \T2, \T2 # shift-L T2 3 DWs - vpxor \T2, \T7, \T7 # first phase of the reduction complete ####################################################################### - #second phase of the reduction - vpsrld $1, \T7, \T2 # packed left shifting >> 1 - vpsrld $2, \T7, \T3 # packed left shifting >> 2 - vpsrld $7, \T7, \T4 # packed left shifting >> 7 - vpxor \T3, \T2, \T2 # xor the shifted versions - vpxor \T4, \T2, \T2 - vpxor \T1, \T2, \T2 - vpxor \T2, \T7, \T7 - vpxor \T7, \T6, \T6 # the result is in T6 -.endm + vmovdqu 16*1(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 -# combined for GCM encrypt and decrypt functions -# clobbering all xmm registers -# clobbering r10, r11, r12, r13, r14, r15 -.macro GCM_ENC_DEC_AVX ENC_DEC + vmovdqu 16*2(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 - mov %rsp, %r14 + ####################################################################### + vmovdqu HashKey_8(arg2), \T5 + vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 + vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 + vpshufd $0b01001110, \T2, \T6 + vpxor \T2, \T6, \T6 + vmovdqu HashKey_8_k(arg2), \T5 + vpclmulqdq $0x00, \T5, \T6, \T6 - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes + vmovdqu 16*3(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + vmovdqa TMP2(%rsp), \T1 + vmovdqu HashKey_7(arg2), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 - vmovdqu 
HashKey(arg1), %xmm13 # xmm13 = HashKey + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqu HashKey_7_k(arg2), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - mov arg4, %r13 # save the number of bytes of plaintext/ciphertext - and $-16, %r13 # r13 = r13 - (r13 mod 16) + vmovdqu 16*4(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - mov %r13, %r12 - shr $4, %r12 - and $7, %r12 - jz _initial_num_blocks_is_0\@ + ####################################################################### - cmp $7, %r12 - je _initial_num_blocks_is_7\@ - cmp $6, %r12 - je _initial_num_blocks_is_6\@ - cmp $5, %r12 - je _initial_num_blocks_is_5\@ - cmp $4, %r12 - je _initial_num_blocks_is_4\@ - cmp $3, %r12 - je _initial_num_blocks_is_3\@ - cmp $2, %r12 - je _initial_num_blocks_is_2\@ + vmovdqa TMP3(%rsp), \T1 + vmovdqu HashKey_6(arg2), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 - jmp _initial_num_blocks_is_1\@ + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqu HashKey_6_k(arg2), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 -_initial_num_blocks_is_7\@: - INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*7, %r13 - jmp _initial_blocks_encrypted\@ + vmovdqu 16*5(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 -_initial_num_blocks_is_6\@: - INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*6, %r13 - jmp _initial_blocks_encrypted\@ + vmovdqa TMP4(%rsp), \T1 + vmovdqu HashKey_5(arg2), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 -_initial_num_blocks_is_5\@: - INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*5, %r13 - jmp _initial_blocks_encrypted\@ + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqu HashKey_5_k(arg2), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 -_initial_num_blocks_is_4\@: - INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*4, %r13 - jmp _initial_blocks_encrypted\@ + vmovdqu 16*6(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 -_initial_num_blocks_is_3\@: - INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*3, %r13 - jmp _initial_blocks_encrypted\@ -_initial_num_blocks_is_2\@: - INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*2, %r13 - jmp _initial_blocks_encrypted\@ + vmovdqa TMP5(%rsp), \T1 
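
The _initial_num_blocks_is_* dispatch above peels off (len/16) mod 8 blocks up front so that the main encrypt+GHASH loop always consumes exact multiples of eight 16-byte blocks, leaving any sub-16-byte remainder for the masked partial-block path. A minimal C sketch of that bookkeeping; gcm_schedule() is a hypothetical helper for illustration, not anything in the kernel sources:

#include <stddef.h>
#include <stdio.h>

/* Hypothetical helper (not in the kernel sources): splits a request of
 * 'len' bytes the same way the dispatch above does. */
static void gcm_schedule(size_t len, size_t *initial_blocks,
                         size_t *eight_block_iters, size_t *partial_bytes)
{
    size_t full_blocks = len / 16;            /* r13 = len & ~15, in blocks  */

    *initial_blocks = full_blocks % 8;        /* r12, selects the jump label */
    *eight_block_iters = (full_blocks - *initial_blocks) / 8;
    *partial_bytes = len % 16;                /* handled as a masked block   */
}

int main(void)
{
    size_t ini, eight, tail;

    gcm_schedule(1000, &ini, &eight, &tail);
    printf("%zu initial blocks, %zu x8 iterations, %zu tail bytes\n",
           ini, eight, tail);
    return 0;
}

For a 1000-byte buffer this gives 6 initial blocks, 7 eight-block iterations and an 8-byte tail, which is exactly the split implemented by the jump table above and the 128-byte-per-iteration main loop.
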
+ vmovdqu HashKey_4(arg2), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 -_initial_num_blocks_is_1\@: - INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*1, %r13 - jmp _initial_blocks_encrypted\@ + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqu HashKey_4_k(arg2), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 -_initial_num_blocks_is_0\@: - INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + vmovdqu 16*7(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + vmovdqa TMP6(%rsp), \T1 + vmovdqu HashKey_3(arg2), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 -_initial_blocks_encrypted\@: - cmp $0, %r13 - je _zero_cipher_left\@ + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqu HashKey_3_k(arg2), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - sub $128, %r13 - je _eight_cipher_left\@ + vmovdqu 16*8(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + vmovdqa TMP7(%rsp), \T1 + vmovdqu HashKey_2(arg2), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqu HashKey_2_k(arg2), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 - vmovd %xmm9, %r15d - and $255, %r15d - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ####################################################################### + vmovdqu 16*9(arg1), \T5 + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 -_encrypt_by_8_new\@: - cmp $(255-8), %r15d - jg _encrypt_by_8\@ + vmovdqa TMP8(%rsp), \T1 + vmovdqu HashKey(arg2), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqu HashKey_k(arg2), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + vpxor \T4, \T6, \T6 + vpxor \T7, \T6, \T6 - add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC - add $128, %r11 - sub $128, %r13 - jne _encrypt_by_8_new\@ + vmovdqu 16*10(arg1), \T5 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - jmp _eight_cipher_left\@ + i = 11 + setreg +.rep (\REP-9) + + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 + + vmovdqu 16*i(arg1), \T5 + i = i + 1 + setreg +.endr -_encrypt_by_8\@: - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, 
%xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $128, %r11 - sub $128, %r13 - jne _encrypt_by_8_new\@ + i = 0 + j = 1 + setreg +.rep 8 + vpxor 16*i(arg4, %r11), \T5, \T2 + .if \ENC_DEC == ENC + vaesenclast \T2, reg_j, reg_j + .else + vaesenclast \T2, reg_j, \T3 + vmovdqu 16*i(arg4, %r11), reg_j + vmovdqu \T3, 16*i(arg3, %r11) + .endif + i = (i+1) + j = (j+1) + setreg +.endr + ####################################################################### - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpslldq $8, \T6, \T3 # shift-L T3 2 DWs + vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs + vpxor \T3, \T7, \T7 + vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 -_eight_cipher_left\@: - GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + ####################################################################### + #first phase of the reduction + ####################################################################### + vpslld $31, \T7, \T2 # packed right shifting << 31 + vpslld $30, \T7, \T3 # packed right shifting shift << 30 + vpslld $25, \T7, \T4 # packed right shifting shift << 25 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 -_zero_cipher_left\@: - cmp $16, arg4 - jl _only_less_than_16\@ + vpsrldq $4, \T2, \T1 # shift-R T1 1 DW - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + .if \ENC_DEC == ENC + vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer + .endif - je _multiple_of_16_bytes\@ + ####################################################################### + #second phase of the reduction + vpsrld $1, \T7, \T2 # packed left shifting >> 1 + vpsrld $2, \T7, \T3 # packed left shifting >> 2 + vpsrld $7, \T7, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 - # handle the last <16 Byte block seperately + vpxor \T1, \T2, \T2 + vpxor \T2, \T7, \T7 + vpxor \T7, \T6, \T6 # the result is in T6 + ####################################################################### + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) - sub $16, %r11 - add %r13, %r11 - vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + vpxor \T6, 
\XMM1, \XMM1 - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask - vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes - jmp _final_ghash_mul\@ -_only_less_than_16\@: - # check for 0 length - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) - je _multiple_of_16_bytes\@ +.endm - # handle the last <16 Byte block seperately +# GHASH the last 4 ciphertext blocks. +.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + ## Karatsuba Method - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) + vpshufd $0b01001110, \XMM1, \T2 + vpxor \XMM1, \T2, \T2 + vmovdqu HashKey_8(arg2), \T5 + vpclmulqdq $0x11, \T5, \XMM1, \T6 + vpclmulqdq $0x00, \T5, \XMM1, \T7 -_get_last_16_byte_loop\@: - movb (arg3, %r11), %al - movb %al, TMP1 (%rsp , %r11) - add $1, %r11 - cmp %r13, %r11 - jne _get_last_16_byte_loop\@ + vmovdqu HashKey_8_k(arg2), \T3 + vpclmulqdq $0x00, \T3, \T2, \XMM1 - vmovdqu TMP1(%rsp), %xmm1 + ###################### - sub $16, %r11 + vpshufd $0b01001110, \XMM2, \T2 + vpxor \XMM2, \T2, \T2 + vmovdqu HashKey_7(arg2), \T5 + vpclmulqdq $0x11, \T5, \XMM2, \T4 + vpxor \T4, \T6, \T6 -_final_ghash_mul\@: - .if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm2 - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm2, %xmm2 - vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - sub %r13, %r11 - add $16, %r11 - .else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to - # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - vpxor %xmm9, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - sub %r13, %r11 - add $16, %r11 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext - .endif + vpclmulqdq $0x00, \T5, \XMM2, \T4 + vpxor \T4, \T7, \T7 + vmovdqu HashKey_7_k(arg2), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - ############################# - # output r13 Bytes - vmovq %xmm9, %rax - cmp $8, %r13 - jle _less_than_8_bytes_left\@ + ###################### - mov %rax, (arg2 , %r11) - add $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - vmovq %xmm9, %rax - sub $8, %r13 + vpshufd $0b01001110, \XMM3, \T2 + vpxor \XMM3, \T2, \T2 + vmovdqu HashKey_6(arg2), \T5 + vpclmulqdq $0x11, \T5, \XMM3, \T4 + vpxor \T4, \T6, \T6 -_less_than_8_bytes_left\@: - movb %al, (arg2 , %r11) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne _less_than_8_bytes_left\@ - ############################# + vpclmulqdq $0x00, \T5, \XMM3, \T4 + vpxor \T4, \T7, \T7 -_multiple_of_16_bytes\@: - mov arg7, %r12 # r12 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - vmovd %r12d, %xmm15 # 
len(A) in xmm15 + vmovdqu HashKey_6_k(arg2), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - shl $3, arg4 # len(C) in bits (*128) - vmovq arg4, %xmm1 - vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 - vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) + ###################### - vpxor %xmm15, %xmm14, %xmm14 - GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation - vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap + vpshufd $0b01001110, \XMM4, \T2 + vpxor \XMM4, \T2, \T2 + vmovdqu HashKey_5(arg2), \T5 + vpclmulqdq $0x11, \T5, \XMM4, \T4 + vpxor \T4, \T6, \T6 - mov arg5, %rax # rax = *Y0 - vmovdqu (%rax), %xmm9 # xmm9 = Y0 + vpclmulqdq $0x00, \T5, \XMM4, \T4 + vpxor \T4, \T7, \T7 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) + vmovdqu HashKey_5_k(arg2), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - vpxor %xmm14, %xmm9, %xmm9 + ###################### + vpshufd $0b01001110, \XMM5, \T2 + vpxor \XMM5, \T2, \T2 + vmovdqu HashKey_4(arg2), \T5 + vpclmulqdq $0x11, \T5, \XMM5, \T4 + vpxor \T4, \T6, \T6 + vpclmulqdq $0x00, \T5, \XMM5, \T4 + vpxor \T4, \T7, \T7 -_return_T\@: - mov arg8, %r10 # r10 = authTag - mov arg9, %r11 # r11 = auth_tag_len + vmovdqu HashKey_4_k(arg2), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - cmp $16, %r11 - je _T_16\@ + ###################### - cmp $8, %r11 - jl _T_4\@ + vpshufd $0b01001110, \XMM6, \T2 + vpxor \XMM6, \T2, \T2 + vmovdqu HashKey_3(arg2), \T5 + vpclmulqdq $0x11, \T5, \XMM6, \T4 + vpxor \T4, \T6, \T6 -_T_8\@: - vmovq %xmm9, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - cmp $0, %r11 - je _return_T_done\@ -_T_4\@: - vmovd %xmm9, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - vpsrldq $4, %xmm9, %xmm9 - cmp $0, %r11 - je _return_T_done\@ -_T_123\@: - vmovd %xmm9, %eax - cmp $2, %r11 - jl _T_1\@ - mov %ax, (%r10) - cmp $2, %r11 - je _return_T_done\@ - add $2, %r10 - sar $16, %eax -_T_1\@: - mov %al, (%r10) - jmp _return_T_done\@ + vpclmulqdq $0x00, \T5, \XMM6, \T4 + vpxor \T4, \T7, \T7 -_T_16\@: - vmovdqu %xmm9, (%r10) + vmovdqu HashKey_3_k(arg2), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 -_return_T_done\@: - mov %r14, %rsp + ###################### - pop %r15 - pop %r14 - pop %r13 - pop %r12 -.endm + vpshufd $0b01001110, \XMM7, \T2 + vpxor \XMM7, \T2, \T2 + vmovdqu HashKey_2(arg2), \T5 + vpclmulqdq $0x11, \T5, \XMM7, \T4 + vpxor \T4, \T6, \T6 + vpclmulqdq $0x00, \T5, \XMM7, \T4 + vpxor \T4, \T7, \T7 -############################################################# -#void aesni_gcm_precomp_avx_gen2 -# (gcm_data *my_ctx_data, -# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. 
*/ -############################################################# -ENTRY(aesni_gcm_precomp_avx_gen2) - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 + vmovdqu HashKey_2_k(arg2), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 - mov %rsp, %r14 + ###################### + + vpshufd $0b01001110, \XMM8, \T2 + vpxor \XMM8, \T2, \T2 + vmovdqu HashKey(arg2), \T5 + vpclmulqdq $0x11, \T5, \XMM8, \T4 + vpxor \T4, \T6, \T6 + vpclmulqdq $0x00, \T5, \XMM8, \T4 + vpxor \T4, \T7, \T7 + vmovdqu HashKey_k(arg2), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes + vpxor \T2, \XMM1, \XMM1 + vpxor \T6, \XMM1, \XMM1 + vpxor \T7, \XMM1, \T2 - vmovdqu (arg2), %xmm6 # xmm6 = HashKey - vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 - ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey - vmovdqa %xmm6, %xmm2 - vpsllq $1, %xmm6, %xmm6 - vpsrlq $63, %xmm2, %xmm2 - vmovdqa %xmm2, %xmm1 - vpslldq $8, %xmm2, %xmm2 - vpsrldq $8, %xmm1, %xmm1 - vpor %xmm2, %xmm6, %xmm6 - #reduction - vpshufd $0b00100100, %xmm1, %xmm2 - vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 - vpand POLY(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly + + + vpslldq $8, \T2, \T4 + vpsrldq $8, \T2, \T2 + + vpxor \T4, \T7, \T7 + vpxor \T2, \T6, \T6 # <T6:T7> holds the result of + # the accumulated carry-less multiplications + ####################################################################### - vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly + #first phase of the reduction + vpslld $31, \T7, \T2 # packed right shifting << 31 + vpslld $30, \T7, \T3 # packed right shifting shift << 30 + vpslld $25, \T7, \T4 # packed right shifting shift << 25 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 - PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 + vpsrldq $4, \T2, \T1 # shift-R T1 1 DW - mov %r14, %rsp + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### - pop %r15 - pop %r14 - pop %r13 - pop %r12 - ret -ENDPROC(aesni_gcm_precomp_avx_gen2) -############################################################################### -#void aesni_gcm_enc_avx_gen2( -# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ -# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ -# const u8 *in, /* Plaintext input */ -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ + #second phase of the reduction + vpsrld $1, \T7, \T2 # packed left shifting >> 1 + vpsrld $2, \T7, \T3 # packed left shifting >> 2 + vpsrld $7, \T7, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpxor \T1, \T2, \T2 + vpxor \T2, \T7, \T7 + vpxor \T7, \T6, \T6 # the result is in T6 + +.endm + +############################################################# +#void aesni_gcm_precomp_avx_gen2 +# (gcm_data *my_ctx_data, +# gcm_context_data *data, +# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ # u8 *iv, /* Pre-counter block j0: 4 byte salt # (from Security Association) concatenated with 8 byte # Initialisation Vector (from IPSec ESP Payload) # concatenated with 0x00000001. 16-byte aligned pointer. */ # const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len, /* Length of AAD in bytes. 
With RFC4106 this is going to be 8 or 12 Bytes */ -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ +# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +############################################################# +ENTRY(aesni_gcm_init_avx_gen2) + FUNC_SAVE + INIT GHASH_MUL_AVX, PRECOMPUTE_AVX + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_init_avx_gen2) + ############################################################################### -ENTRY(aesni_gcm_enc_avx_gen2) - GCM_ENC_DEC_AVX ENC - ret -ENDPROC(aesni_gcm_enc_avx_gen2) +#void aesni_gcm_enc_update_avx_gen2( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, +# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ +# const u8 *in, /* Plaintext input */ +# u64 plaintext_len) /* Length of data in Bytes for encryption. */ +############################################################################### +ENTRY(aesni_gcm_enc_update_avx_gen2) + FUNC_SAVE + mov keysize, %eax + cmp $32, %eax + je key_256_enc_update + cmp $16, %eax + je key_128_enc_update + # must be 192 + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 + FUNC_RESTORE + ret +key_128_enc_update: + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 + FUNC_RESTORE + ret +key_256_enc_update: + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_enc_update_avx_gen2) ############################################################################### -#void aesni_gcm_dec_avx_gen2( +#void aesni_gcm_dec_update_avx_gen2( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ # const u8 *in, /* Ciphertext input */ -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u64 plaintext_len) /* Length of data in Bytes for encryption. */ +############################################################################### +ENTRY(aesni_gcm_dec_update_avx_gen2) + FUNC_SAVE + mov keysize,%eax + cmp $32, %eax + je key_256_dec_update + cmp $16, %eax + je key_128_dec_update + # must be 192 + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 + FUNC_RESTORE + ret +key_128_dec_update: + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 + FUNC_RESTORE + ret +key_256_dec_update: + GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_dec_update_avx_gen2) + +############################################################################### +#void aesni_gcm_finalize_avx_gen2( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, # u8 *auth_tag, /* Authenticated Tag output. */ # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. 
# Valid values are 16 (most likely), 12 or 8. */ ############################################################################### -ENTRY(aesni_gcm_dec_avx_gen2) - GCM_ENC_DEC_AVX DEC - ret -ENDPROC(aesni_gcm_dec_avx_gen2) +ENTRY(aesni_gcm_finalize_avx_gen2) + FUNC_SAVE + mov keysize,%eax + cmp $32, %eax + je key_256_finalize + cmp $16, %eax + je key_128_finalize + # must be 192 + GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 + FUNC_RESTORE + ret +key_128_finalize: + GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 + FUNC_RESTORE + ret +key_256_finalize: + GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_finalize_avx_gen2) + #endif /* CONFIG_AS_AVX */ #ifdef CONFIG_AS_AVX2 @@ -1670,113 +1922,42 @@ ENDPROC(aesni_gcm_dec_avx_gen2) # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i vmovdqa \HK, \T5 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly - vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly + vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly - vmovdqa \T5, HashKey_3(arg1) + vmovdqu \T5, HashKey_3(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly - vmovdqa \T5, HashKey_4(arg1) + vmovdqu \T5, HashKey_4(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly - vmovdqa \T5, HashKey_5(arg1) + vmovdqu \T5, HashKey_5(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly - vmovdqa \T5, HashKey_6(arg1) + vmovdqu \T5, HashKey_6(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly - vmovdqa \T5, HashKey_7(arg1) + vmovdqu \T5, HashKey_7(arg2) GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly - vmovdqa \T5, HashKey_8(arg1) + vmovdqu \T5, HashKey_8(arg2) .endm - ## if a = number of total plaintext bytes ## b = floor(a/16) ## num_initial_blocks = b mod 4# ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext ## r10, r11, r12, rax are clobbered -## arg1, arg2, arg3, r14 are used as a pointer only, not modified +## arg1, arg3, arg4, r14 are used as a pointer only, not modified -.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER +.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER i = (8-\num_initial_blocks) - j = 0 setreg - - mov arg6, %r10 # r10 = AAD - mov arg7, %r12 # r12 = aadLen - - - mov %r12, %r11 - - vpxor reg_j, reg_j, reg_j - vpxor reg_i, reg_i, reg_i - - cmp $16, %r11 - jl _get_AAD_rest8\@ -_get_AAD_blocks\@: - vmovdqu (%r10), reg_i - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_i, reg_j, reg_j - GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 - add $16, %r10 - sub $16, %r12 - sub $16, %r11 - cmp $16, %r11 - jge _get_AAD_blocks\@ - vmovdqu reg_j, reg_i - cmp $0, %r11 - je _get_AAD_done\@ - - vpxor reg_i, reg_i, reg_i - - /* read the last <16B of AAD. 
since we have at least 4B of - data right after the AAD (the ICV, and maybe some CT), we can - read 4B/8B blocks safely, and then get rid of the extra stuff */ -_get_AAD_rest8\@: - cmp $4, %r11 - jle _get_AAD_rest4\@ - movq (%r10), \T1 - add $8, %r10 - sub $8, %r11 - vpslldq $8, \T1, \T1 - vpsrldq $8, reg_i, reg_i - vpxor \T1, reg_i, reg_i - jmp _get_AAD_rest8\@ -_get_AAD_rest4\@: - cmp $0, %r11 - jle _get_AAD_rest0\@ - mov (%r10), %eax - movq %rax, \T1 - add $4, %r10 - sub $4, %r11 - vpslldq $12, \T1, \T1 - vpsrldq $4, reg_i, reg_i - vpxor \T1, reg_i, reg_i -_get_AAD_rest0\@: - /* finalize: shift out the extra bytes we read, and align - left. since pslldq can only shift by an immediate, we use - vpshufb and an array of shuffle masks */ - movq %r12, %r11 - salq $4, %r11 - movdqu aad_shift_arr(%r11), \T1 - vpshufb \T1, reg_i, reg_i -_get_AAD_rest_final\@: - vpshufb SHUF_MASK(%rip), reg_i, reg_i - vpxor reg_j, reg_i, reg_i - GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 - -_get_AAD_done\@: - # initialize the data pointer offset as zero - xor %r11d, %r11d + vmovdqu AadHash(arg2), reg_i # start AES for num_initial_blocks blocks - mov arg5, %rax # rax = *Y0 - vmovdqu (%rax), \CTR # CTR = Y0 - vpshufb SHUF_MASK(%rip), \CTR, \CTR - + vmovdqu CurCount(arg2), \CTR i = (9-\num_initial_blocks) setreg @@ -1799,7 +1980,7 @@ _get_AAD_done\@: j = 1 setreg -.rep 9 +.rep \REP vmovdqa 16*j(arg1), \T_key i = (9-\num_initial_blocks) setreg @@ -1814,7 +1995,7 @@ _get_AAD_done\@: .endr - vmovdqa 16*10(arg1), \T_key + vmovdqa 16*j(arg1), \T_key i = (9-\num_initial_blocks) setreg .rep \num_initial_blocks @@ -1826,9 +2007,9 @@ _get_AAD_done\@: i = (9-\num_initial_blocks) setreg .rep \num_initial_blocks - vmovdqu (arg3, %r11), \T1 + vmovdqu (arg4, %r11), \T1 vpxor \T1, reg_i, reg_i - vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for + vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for # num_initial_blocks blocks add $16, %r11 .if \ENC_DEC == DEC @@ -1905,7 +2086,7 @@ _get_AAD_done\@: i = 1 setreg -.rep 9 # do 9 rounds +.rep \REP # do REP rounds vmovdqa 16*i(arg1), \T_key vaesenc \T_key, \XMM1, \XMM1 vaesenc \T_key, \XMM2, \XMM2 @@ -1930,58 +2111,58 @@ _get_AAD_done\@: vaesenclast \T_key, \XMM7, \XMM7 vaesenclast \T_key, \XMM8, \XMM8 - vmovdqu (arg3, %r11), \T1 + vmovdqu (arg4, %r11), \T1 vpxor \T1, \XMM1, \XMM1 - vmovdqu \XMM1, (arg2 , %r11) + vmovdqu \XMM1, (arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM1 .endif - vmovdqu 16*1(arg3, %r11), \T1 + vmovdqu 16*1(arg4, %r11), \T1 vpxor \T1, \XMM2, \XMM2 - vmovdqu \XMM2, 16*1(arg2 , %r11) + vmovdqu \XMM2, 16*1(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM2 .endif - vmovdqu 16*2(arg3, %r11), \T1 + vmovdqu 16*2(arg4, %r11), \T1 vpxor \T1, \XMM3, \XMM3 - vmovdqu \XMM3, 16*2(arg2 , %r11) + vmovdqu \XMM3, 16*2(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM3 .endif - vmovdqu 16*3(arg3, %r11), \T1 + vmovdqu 16*3(arg4, %r11), \T1 vpxor \T1, \XMM4, \XMM4 - vmovdqu \XMM4, 16*3(arg2 , %r11) + vmovdqu \XMM4, 16*3(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM4 .endif - vmovdqu 16*4(arg3, %r11), \T1 + vmovdqu 16*4(arg4, %r11), \T1 vpxor \T1, \XMM5, \XMM5 - vmovdqu \XMM5, 16*4(arg2 , %r11) + vmovdqu \XMM5, 16*4(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM5 .endif - vmovdqu 16*5(arg3, %r11), \T1 + vmovdqu 16*5(arg4, %r11), \T1 vpxor \T1, \XMM6, \XMM6 - vmovdqu \XMM6, 16*5(arg2 , %r11) + vmovdqu \XMM6, 16*5(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM6 .endif - vmovdqu 16*6(arg3, %r11), \T1 + vmovdqu 16*6(arg4, %r11), \T1 vpxor \T1, \XMM7, 
\XMM7 - vmovdqu \XMM7, 16*6(arg2 , %r11) + vmovdqu \XMM7, 16*6(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM7 .endif - vmovdqu 16*7(arg3, %r11), \T1 + vmovdqu 16*7(arg4, %r11), \T1 vpxor \T1, \XMM8, \XMM8 - vmovdqu \XMM8, 16*7(arg2 , %r11) + vmovdqu \XMM8, 16*7(arg3 , %r11) .if \ENC_DEC == DEC vmovdqa \T1, \XMM8 .endif @@ -2010,9 +2191,9 @@ _initial_blocks_done\@: # encrypt 8 blocks at a time # ghash the 8 previously encrypted ciphertext blocks -# arg1, arg2, arg3 are used as pointers only, not modified +# arg1, arg3, arg4 are used as pointers only, not modified # r11 is the data offset value -.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC vmovdqa \XMM1, \T2 vmovdqa \XMM2, TMP2(%rsp) @@ -2073,87 +2254,7 @@ _initial_blocks_done\@: - vmovdqu 16*1(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqu 16*2(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - - ####################################################################### - - vmovdqa HashKey_8(arg1), \T5 - vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 - vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 - vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 - vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 - vpxor \T5, \T6, \T6 - - vmovdqu 16*3(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - vmovdqa TMP2(%rsp), \T1 - vmovdqa HashKey_7(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*4(arg1), \T1 - vaesenc \T1, \XMM1, \XMM1 - vaesenc \T1, \XMM2, \XMM2 - vaesenc \T1, \XMM3, \XMM3 - vaesenc \T1, \XMM4, \XMM4 - vaesenc \T1, \XMM5, \XMM5 - vaesenc \T1, \XMM6, \XMM6 - vaesenc \T1, \XMM7, \XMM7 - vaesenc \T1, \XMM8, \XMM8 - - ####################################################################### - - vmovdqa TMP3(%rsp), \T1 - vmovdqa HashKey_6(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*5(arg1), \T1 + vmovdqu 16*1(arg1), \T1 vaesenc \T1, \XMM1, \XMM1 vaesenc \T1, \XMM2, \XMM2 vaesenc \T1, \XMM3, \XMM3 @@ -2163,21 +2264,7 @@ _initial_blocks_done\@: vaesenc \T1, \XMM7, \XMM7 vaesenc \T1, \XMM8, \XMM8 - vmovdqa TMP4(%rsp), \T1 - vmovdqa HashKey_5(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vmovdqu 16*6(arg1), \T1 + vmovdqu 16*2(arg1), \T1 vaesenc \T1, \XMM1, \XMM1 vaesenc \T1, \XMM2, \XMM2 
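
The .if \ENC_DEC == DEC fix-ups above exist because GHASH authenticates ciphertext in both directions: on decryption (which may run in place) the input block has to be copied aside before the XOR so it can still be fed to GHASH, while on encryption it is the freshly written output that gets hashed. A small self-contained C sketch of just that data movement, assuming the keystream block E(K, Y_i) is produced elsewhere; gcm_xor_block() is an illustrative helper, not a kernel function:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative helper, not a kernel function: one 16-byte CTR block.
 * 'keystream' stands for E(K, Y_i), assumed computed elsewhere. */
static void gcm_xor_block(int decrypt, const uint8_t keystream[16],
                          const uint8_t in[16], uint8_t out[16],
                          uint8_t ghash_input[16])
{
    int i;

    if (decrypt)
        memcpy(ghash_input, in, 16);   /* save the ciphertext before the XOR */
    for (i = 0; i < 16; i++)
        out[i] = in[i] ^ keystream[i];
    if (!decrypt)
        memcpy(ghash_input, out, 16);  /* hash the ciphertext just produced  */
}

int main(void)
{
    const uint8_t ks[16] = { 0xaa, 0x55, 0x12, 0x34 };   /* dummy keystream  */
    const uint8_t pt[16] = "fifteen bytes..";
    uint8_t ct[16], back[16], g_enc[16], g_dec[16];

    gcm_xor_block(0, ks, pt, ct, g_enc);    /* encrypt: hashes ct            */
    gcm_xor_block(1, ks, ct, back, g_dec);  /* decrypt: hashes the same ct   */
    printf("round trip %s, GHASH inputs %s\n",
           memcmp(pt, back, 16) ? "broken" : "ok",
           memcmp(g_enc, g_dec, 16) ? "differ" : "match");
    return 0;
}
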
vaesenc \T1, \XMM3, \XMM3 @@ -2188,21 +2275,16 @@ _initial_blocks_done\@: vaesenc \T1, \XMM8, \XMM8 - vmovdqa TMP5(%rsp), \T1 - vmovdqa HashKey_4(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 + ####################################################################### - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 + vmovdqu HashKey_8(arg2), \T5 + vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 + vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 + vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 + vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 + vpxor \T5, \T6, \T6 - vmovdqu 16*7(arg1), \T1 + vmovdqu 16*3(arg1), \T1 vaesenc \T1, \XMM1, \XMM1 vaesenc \T1, \XMM2, \XMM2 vaesenc \T1, \XMM3, \XMM3 @@ -2212,8 +2294,8 @@ _initial_blocks_done\@: vaesenc \T1, \XMM7, \XMM7 vaesenc \T1, \XMM8, \XMM8 - vmovdqa TMP6(%rsp), \T1 - vmovdqa HashKey_3(arg1), \T5 + vmovdqa TMP2(%rsp), \T1 + vmovdqu HashKey_7(arg2), \T5 vpclmulqdq $0x11, \T5, \T1, \T3 vpxor \T3, \T4, \T4 @@ -2226,7 +2308,7 @@ _initial_blocks_done\@: vpclmulqdq $0x10, \T5, \T1, \T3 vpxor \T3, \T6, \T6 - vmovdqu 16*8(arg1), \T1 + vmovdqu 16*4(arg1), \T1 vaesenc \T1, \XMM1, \XMM1 vaesenc \T1, \XMM2, \XMM2 vaesenc \T1, \XMM3, \XMM3 @@ -2236,35 +2318,12 @@ _initial_blocks_done\@: vaesenc \T1, \XMM7, \XMM7 vaesenc \T1, \XMM8, \XMM8 - vmovdqa TMP7(%rsp), \T1 - vmovdqa HashKey_2(arg1), \T5 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T4 - - vpclmulqdq $0x00, \T5, \T1, \T3 - vpxor \T3, \T7, \T7 - - vpclmulqdq $0x01, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - vpclmulqdq $0x10, \T5, \T1, \T3 - vpxor \T3, \T6, \T6 - - ####################################################################### - vmovdqu 16*9(arg1), \T5 - vaesenc \T5, \XMM1, \XMM1 - vaesenc \T5, \XMM2, \XMM2 - vaesenc \T5, \XMM3, \XMM3 - vaesenc \T5, \XMM4, \XMM4 - vaesenc \T5, \XMM5, \XMM5 - vaesenc \T5, \XMM6, \XMM6 - vaesenc \T5, \XMM7, \XMM7 - vaesenc \T5, \XMM8, \XMM8 - - vmovdqa TMP8(%rsp), \T1 - vmovdqa HashKey(arg1), \T5 + vmovdqa TMP3(%rsp), \T1 + vmovdqu HashKey_6(arg2), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 vpclmulqdq $0x00, \T5, \T1, \T3 vpxor \T3, \T7, \T7 @@ -2275,672 +2334,510 @@ _initial_blocks_done\@: vpclmulqdq $0x10, \T5, \T1, \T3 vpxor \T3, \T6, \T6 - vpclmulqdq $0x11, \T5, \T1, \T3 - vpxor \T3, \T4, \T1 - - - vmovdqu 16*10(arg1), \T5 - - i = 0 - j = 1 - setreg -.rep 8 - vpxor 16*i(arg3, %r11), \T5, \T2 - .if \ENC_DEC == ENC - vaesenclast \T2, reg_j, reg_j - .else - vaesenclast \T2, reg_j, \T3 - vmovdqu 16*i(arg3, %r11), reg_j - vmovdqu \T3, 16*i(arg2, %r11) - .endif - i = (i+1) - j = (j+1) - setreg -.endr - ####################################################################### - - - vpslldq $8, \T6, \T3 # shift-L T3 2 DWs - vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs - vpxor \T3, \T7, \T7 - vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 - - - - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 - - vpclmulqdq $0x01, \T7, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs - - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### - .if \ENC_DEC == ENC - vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu 
\XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer - vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer - .endif - - ####################################################################### - #second phase of the reduction - vpclmulqdq $0x00, \T7, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - - vpclmulqdq $0x10, \T7, \T3, \T4 - vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) - - vpxor \T2, \T4, \T4 # second phase of the reduction complete - ####################################################################### - vpxor \T4, \T1, \T1 # the result is in T1 - - vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap - vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap - - - vpxor \T1, \XMM1, \XMM1 - - - -.endm - - -# GHASH the last 4 ciphertext blocks. -.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 - - ## Karatsuba Method - - vmovdqa HashKey_8(arg1), \T5 - - vpshufd $0b01001110, \XMM1, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM1, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM1, \T6 - vpclmulqdq $0x00, \T5, \XMM1, \T7 - - vpclmulqdq $0x00, \T3, \T2, \XMM1 - - ###################### - - vmovdqa HashKey_7(arg1), \T5 - vpshufd $0b01001110, \XMM2, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM2, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM2, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM2, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_6(arg1), \T5 - vpshufd $0b01001110, \XMM3, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM3, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM3, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM3, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_5(arg1), \T5 - vpshufd $0b01001110, \XMM4, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM4, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM4, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM4, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_4(arg1), \T5 - vpshufd $0b01001110, \XMM5, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM5, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM5, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM5, \T4 - vpxor \T4, \T7, \T7 - - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_3(arg1), \T5 - vpshufd $0b01001110, \XMM6, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM6, \T2, \T2 - vpxor \T5, \T3, \T3 - - vpclmulqdq $0x11, \T5, \XMM6, \T4 - vpxor \T4, \T6, \T6 - - vpclmulqdq $0x00, \T5, \XMM6, \T4 - vpxor \T4, \T7, \T7 
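
GHASH_LAST_8_AVX2, like its AVX counterpart, uses the Karatsuba trick for each 128x128 carry-less multiply: three vpclmulqdq operations instead of four, with the XORed halves of the hash key either precomputed (the HashKey_i_k values) or formed on the fly with vpshufd. A self-contained C sketch of the same decomposition; clmul64() and clmul128_karatsuba() are illustrative helpers, not kernel code:

#include <stdint.h>
#include <stdio.h>

/* 64x64 -> 128-bit carry-less multiply, bit by bit (what vpclmulqdq does). */
static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
    uint64_t h = 0, l = 0;
    int i;

    for (i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
            l ^= a << i;
            if (i)
                h ^= a >> (64 - i);
        }
    }
    *hi = h;
    *lo = l;
}

/* 128x128 -> 256-bit product with the Karatsuba trick: three 64x64
 * multiplies, the middle term recovered as (a1^a0)(b1^b0) ^ a1b1 ^ a0b0. */
static void clmul128_karatsuba(const uint64_t a[2], const uint64_t b[2],
                               uint64_t r[4])
{
    uint64_t hh_hi, hh_lo, ll_hi, ll_lo, mm_hi, mm_lo;

    clmul64(a[1], b[1], &hh_hi, &hh_lo);                 /* a1*b1 */
    clmul64(a[0], b[0], &ll_hi, &ll_lo);                 /* a0*b0 */
    clmul64(a[1] ^ a[0], b[1] ^ b[0], &mm_hi, &mm_lo);
    mm_hi ^= hh_hi ^ ll_hi;                              /* a1*b0 ^ a0*b1 */
    mm_lo ^= hh_lo ^ ll_lo;

    r[0] = ll_lo;                 /* the middle term straddles both halves, */
    r[1] = ll_hi ^ mm_lo;         /* cf. the vpslldq/vpsrldq $8 split that  */
    r[2] = hh_lo ^ mm_hi;         /* merges the result into <T6:T7> above   */
    r[3] = hh_hi;
}

/* Plain four-multiply schoolbook version, used only as a cross-check. */
static void clmul128_schoolbook(const uint64_t a[2], const uint64_t b[2],
                                uint64_t r[4])
{
    uint64_t hi, lo;
    int i, j;

    r[0] = r[1] = r[2] = r[3] = 0;
    for (i = 0; i < 2; i++)
        for (j = 0; j < 2; j++) {
            clmul64(a[i], b[j], &hi, &lo);
            r[i + j] ^= lo;
            r[i + j + 1] ^= hi;
        }
}

int main(void)
{
    const uint64_t a[2] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
    const uint64_t b[2] = { 0xdeadbeefcafebabeULL, 0x0f1e2d3c4b5a6978ULL };
    uint64_t k[4], s[4];
    int i;

    clmul128_karatsuba(a, b, k);
    clmul128_schoolbook(a, b, s);
    for (i = 0; i < 4; i++)
        printf("word %d: %016llx (%s)\n", i, (unsigned long long)k[i],
               k[i] == s[i] ? "matches schoolbook" : "MISMATCH");
    return 0;
}
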
- - vpclmulqdq $0x00, \T3, \T2, \T2 - - vpxor \T2, \XMM1, \XMM1 - - ###################### - - vmovdqa HashKey_2(arg1), \T5 - vpshufd $0b01001110, \XMM7, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM7, \T2, \T2 - vpxor \T5, \T3, \T3 + vmovdqu 16*5(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - vpclmulqdq $0x11, \T5, \XMM7, \T4 - vpxor \T4, \T6, \T6 + vmovdqa TMP4(%rsp), \T1 + vmovdqu HashKey_5(arg2), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \XMM7, \T4 - vpxor \T4, \T7, \T7 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 - vpclmulqdq $0x00, \T3, \T2, \T2 + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 - vpxor \T2, \XMM1, \XMM1 + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 - ###################### + vmovdqu 16*6(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - vmovdqa HashKey(arg1), \T5 - vpshufd $0b01001110, \XMM8, \T2 - vpshufd $0b01001110, \T5, \T3 - vpxor \XMM8, \T2, \T2 - vpxor \T5, \T3, \T3 - vpclmulqdq $0x11, \T5, \XMM8, \T4 - vpxor \T4, \T6, \T6 + vmovdqa TMP5(%rsp), \T1 + vmovdqu HashKey_4(arg2), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 - vpclmulqdq $0x00, \T5, \XMM8, \T4 - vpxor \T4, \T7, \T7 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 - vpclmulqdq $0x00, \T3, \T2, \T2 + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 - vpxor \T2, \XMM1, \XMM1 - vpxor \T6, \XMM1, \XMM1 - vpxor \T7, \XMM1, \T2 + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + vmovdqu 16*7(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + vmovdqa TMP6(%rsp), \T1 + vmovdqu HashKey_3(arg2), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 - vpslldq $8, \T2, \T4 - vpsrldq $8, \T2, \T2 + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 - vpxor \T4, \T7, \T7 - vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the - # accumulated carry-less multiplications + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 - ####################################################################### - #first phase of the reduction - vmovdqa POLY2(%rip), \T3 + vmovdqu 16*8(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 - vpclmulqdq $0x01, \T7, \T3, \T2 - vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs + vmovdqa TMP7(%rsp), \T1 + vmovdqu HashKey_2(arg2), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 - vpxor \T2, \T7, \T7 # first phase of the reduction complete - ####################################################################### + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 - #second phase of the reduction - vpclmulqdq $0x00, \T7, \T3, \T2 - vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + vpclmulqdq 
$0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 - vpclmulqdq $0x10, \T7, \T3, \T4 - vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) - vpxor \T2, \T4, \T4 # second phase of the reduction complete ####################################################################### - vpxor \T4, \T6, \T6 # the result is in T6 -.endm - - -# combined for GCM encrypt and decrypt functions -# clobbering all xmm registers -# clobbering r10, r11, r12, r13, r14, r15 -.macro GCM_ENC_DEC_AVX2 ENC_DEC + vmovdqu 16*9(arg1), \T5 + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 + vmovdqa TMP8(%rsp), \T1 + vmovdqu HashKey(arg2), \T5 - mov %rsp, %r14 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T1 - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes + vmovdqu 16*10(arg1), \T5 - vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey + i = 11 + setreg +.rep (\REP-9) + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 + + vmovdqu 16*i(arg1), \T5 + i = i + 1 + setreg +.endr - mov arg4, %r13 # save the number of bytes of plaintext/ciphertext - and $-16, %r13 # r13 = r13 - (r13 mod 16) + i = 0 + j = 1 + setreg +.rep 8 + vpxor 16*i(arg4, %r11), \T5, \T2 + .if \ENC_DEC == ENC + vaesenclast \T2, reg_j, reg_j + .else + vaesenclast \T2, reg_j, \T3 + vmovdqu 16*i(arg4, %r11), reg_j + vmovdqu \T3, 16*i(arg3, %r11) + .endif + i = (i+1) + j = (j+1) + setreg +.endr + ####################################################################### - mov %r13, %r12 - shr $4, %r12 - and $7, %r12 - jz _initial_num_blocks_is_0\@ - cmp $7, %r12 - je _initial_num_blocks_is_7\@ - cmp $6, %r12 - je _initial_num_blocks_is_6\@ - cmp $5, %r12 - je _initial_num_blocks_is_5\@ - cmp $4, %r12 - je _initial_num_blocks_is_4\@ - cmp $3, %r12 - je _initial_num_blocks_is_3\@ - cmp $2, %r12 - je _initial_num_blocks_is_2\@ + vpslldq $8, \T6, \T3 # shift-L T3 2 DWs + vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs + vpxor \T3, \T7, \T7 + vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 - jmp _initial_num_blocks_is_1\@ -_initial_num_blocks_is_7\@: - INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*7, %r13 - jmp _initial_blocks_encrypted\@ -_initial_num_blocks_is_6\@: - INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*6, %r13 - jmp _initial_blocks_encrypted\@ + ####################################################################### + #first phase of the reduction + vmovdqa POLY2(%rip), \T3 -_initial_num_blocks_is_5\@: - INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*5, %r13 - jmp _initial_blocks_encrypted\@ + vpclmulqdq $0x01, \T7, \T3, \T2 + vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs 
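
The "first phase"/"second phase" blocks above fold the accumulated 256-bit carry-less product back into GF(2^128). The constants differ between gen1 (shift-and-XOR by 31/30/25, then 1/2/7) and gen2 (a single POLY2 vpclmulqdq fold) because the assembly keeps its operands in a byte-swapped, shifted representation (note the HashKey<<1 mod poly precomputation earlier in the file), but the underlying algebra is plain reduction modulo x^128 + x^7 + x^2 + x + 1. A C sketch in the non-reflected convention; ghash_reduce() is illustrative, not the kernel's routine:

#include <stdint.h>
#include <stdio.h>

/* Illustrative code, not the kernel's routine. Bit i of each word is the
 * coefficient of x^i (plain convention, no bit reflection). */
typedef struct { uint64_t lo, hi; } poly128;

/* t <<= n for 1 <= n <= 63; returns the n bits shifted out of the top. */
static uint64_t shl128(poly128 *t, unsigned n)
{
    uint64_t carry = t->hi >> (64 - n);

    t->hi = (t->hi << n) | (t->lo >> (64 - n));
    t->lo <<= n;
    return carry;
}

/* Reduce a 256-bit carry-less product p3:p2:p1:p0 modulo
 * x^128 + x^7 + x^2 + x + 1 in two phases, as in the code above. */
static poly128 ghash_reduce(uint64_t p0, uint64_t p1, uint64_t p2, uint64_t p3)
{
    poly128 lo = { p0, p1 };    /* low 128 bits of the product            */
    poly128 hi = { p2, p3 };    /* high 128 bits = coefficient of x^128   */
    poly128 t;
    uint64_t ov = 0;

    /* phase 1: x^128 == x^7 + x^2 + x + 1, so fold in hi, hi<<1, hi<<2,
     * hi<<7; the shifted copies spill a few bits past bit 127, and that
     * spill is collected in ov */
    lo.lo ^= hi.lo;  lo.hi ^= hi.hi;
    t = hi; ov ^= shl128(&t, 1); lo.lo ^= t.lo; lo.hi ^= t.hi;
    t = hi; ov ^= shl128(&t, 2); lo.lo ^= t.lo; lo.hi ^= t.hi;
    t = hi; ov ^= shl128(&t, 7); lo.lo ^= t.lo; lo.hi ^= t.hi;

    /* phase 2: the spilled bits are again a multiple of x^128; fold them
     * the same way (they are small enough that nothing spills twice) */
    lo.lo ^= ov ^ (ov << 1) ^ (ov << 2) ^ (ov << 7);
    return lo;
}

int main(void)
{
    /* x^128 must reduce to x^7 + x^2 + x + 1, i.e. 0x87 */
    poly128 r = ghash_reduce(0, 0, 1, 0);

    printf("%016llx%016llx\n", (unsigned long long)r.hi,
           (unsigned long long)r.lo);
    return 0;
}
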
-_initial_num_blocks_is_4\@: - INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*4, %r13 - jmp _initial_blocks_encrypted\@ + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + .if \ENC_DEC == ENC + vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer + .endif -_initial_num_blocks_is_3\@: - INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*3, %r13 - jmp _initial_blocks_encrypted\@ + ####################################################################### + #second phase of the reduction + vpclmulqdq $0x00, \T7, \T3, \T2 + vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) -_initial_num_blocks_is_2\@: - INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*2, %r13 - jmp _initial_blocks_encrypted\@ + vpclmulqdq $0x10, \T7, \T3, \T4 + vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) -_initial_num_blocks_is_1\@: - INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC - sub $16*1, %r13 - jmp _initial_blocks_encrypted\@ + vpxor \T2, \T4, \T4 # second phase of the reduction complete + ####################################################################### + vpxor \T4, \T1, \T1 # the result is in T1 -_initial_num_blocks_is_0\@: - INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap -_initial_blocks_encrypted\@: - cmp $0, %r13 - je _zero_cipher_left\@ + vpxor \T1, \XMM1, \XMM1 - sub $128, %r13 - je _eight_cipher_left\@ +.endm - vmovd %xmm9, %r15d - and $255, %r15d - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 +# GHASH the last 4 ciphertext blocks. 
+.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 + ## Karatsuba Method -_encrypt_by_8_new\@: - cmp $(255-8), %r15d - jg _encrypt_by_8\@ + vmovdqu HashKey_8(arg2), \T5 + vpshufd $0b01001110, \XMM1, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM1, \T2, \T2 + vpxor \T5, \T3, \T3 + vpclmulqdq $0x11, \T5, \XMM1, \T6 + vpclmulqdq $0x00, \T5, \XMM1, \T7 - add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC - add $128, %r11 - sub $128, %r13 - jne _encrypt_by_8_new\@ + vpclmulqdq $0x00, \T3, \T2, \XMM1 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - jmp _eight_cipher_left\@ + ###################### -_encrypt_by_8\@: - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $8, %r15b - GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - add $128, %r11 - sub $128, %r13 - jne _encrypt_by_8_new\@ + vmovdqu HashKey_7(arg2), \T5 + vpshufd $0b01001110, \XMM2, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM2, \T2, \T2 + vpxor \T5, \T3, \T3 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpclmulqdq $0x11, \T5, \XMM2, \T4 + vpxor \T4, \T6, \T6 + vpclmulqdq $0x00, \T5, \XMM2, \T4 + vpxor \T4, \T7, \T7 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 -_eight_cipher_left\@: - GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + ###################### + vmovdqu HashKey_6(arg2), \T5 + vpshufd $0b01001110, \XMM3, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM3, \T2, \T2 + vpxor \T5, \T3, \T3 -_zero_cipher_left\@: - cmp $16, arg4 - jl _only_less_than_16\@ + vpclmulqdq $0x11, \T5, \XMM3, \T4 + vpxor \T4, \T6, \T6 - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + vpclmulqdq $0x00, \T5, \XMM3, \T4 + vpxor \T4, \T7, \T7 - je _multiple_of_16_bytes\@ + vpclmulqdq $0x00, \T3, \T2, \T2 - # handle the last <16 Byte block seperately + vpxor \T2, \XMM1, \XMM1 + ###################### - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + vmovdqu HashKey_5(arg2), \T5 + vpshufd $0b01001110, \XMM4, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM4, \T2, \T2 + vpxor \T5, \T3, \T3 - sub $16, %r11 - add %r13, %r11 - vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + vpclmulqdq $0x11, \T5, \XMM4, \T4 + vpxor \T4, \T6, \T6 - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer - # to be able to shift 16-r13 bytes - # (r13 is the number of bytes in plaintext mod 16) - vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask - vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes - jmp _final_ghash_mul\@ - -_only_less_than_16\@: - # check for 0 length - mov arg4, %r13 - and $15, %r13 # r13 = (arg4 mod 16) + vpclmulqdq $0x00, \T5, \XMM4, \T4 + vpxor \T4, \T7, \T7 - je _multiple_of_16_bytes\@ + vpclmulqdq $0x00, \T3, \T2, \T2 - # handle the last <16 Byte block seperately + vpxor \T2, \XMM1, \XMM1 + ###################### - vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + vmovdqu HashKey_4(arg2), \T5 + vpshufd $0b01001110, \XMM5, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM5, \T2, \T2 + vpxor \T5, \T3, \T3 + vpclmulqdq $0x11, \T5, 
\XMM5, \T4 + vpxor \T4, \T6, \T6 - lea SHIFT_MASK+16(%rip), %r12 - sub %r13, %r12 # adjust the shuffle mask pointer to be - # able to shift 16-r13 bytes (r13 is the - # number of bytes in plaintext mod 16) + vpclmulqdq $0x00, \T5, \XMM5, \T4 + vpxor \T4, \T7, \T7 -_get_last_16_byte_loop\@: - movb (arg3, %r11), %al - movb %al, TMP1 (%rsp , %r11) - add $1, %r11 - cmp %r13, %r11 - jne _get_last_16_byte_loop\@ + vpclmulqdq $0x00, \T3, \T2, \T2 - vmovdqu TMP1(%rsp), %xmm1 + vpxor \T2, \XMM1, \XMM1 - sub $16, %r11 + ###################### -_final_ghash_mul\@: - .if \ENC_DEC == DEC - vmovdqa %xmm1, %xmm2 - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm2, %xmm2 - vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - sub %r13, %r11 - add $16, %r11 - .else - vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) - vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 - vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 - vpxor %xmm9, %xmm14, %xmm14 - #GHASH computation for the last <16 Byte block - GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 - sub %r13, %r11 - add $16, %r11 - vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext - .endif + vmovdqu HashKey_3(arg2), \T5 + vpshufd $0b01001110, \XMM6, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM6, \T2, \T2 + vpxor \T5, \T3, \T3 + vpclmulqdq $0x11, \T5, \XMM6, \T4 + vpxor \T4, \T6, \T6 - ############################# - # output r13 Bytes - vmovq %xmm9, %rax - cmp $8, %r13 - jle _less_than_8_bytes_left\@ + vpclmulqdq $0x00, \T5, \XMM6, \T4 + vpxor \T4, \T7, \T7 - mov %rax, (arg2 , %r11) - add $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - vmovq %xmm9, %rax - sub $8, %r13 + vpclmulqdq $0x00, \T3, \T2, \T2 -_less_than_8_bytes_left\@: - movb %al, (arg2 , %r11) - add $1, %r11 - shr $8, %rax - sub $1, %r13 - jne _less_than_8_bytes_left\@ - ############################# + vpxor \T2, \XMM1, \XMM1 -_multiple_of_16_bytes\@: - mov arg7, %r12 # r12 = aadLen (number of bytes) - shl $3, %r12 # convert into number of bits - vmovd %r12d, %xmm15 # len(A) in xmm15 + ###################### - shl $3, arg4 # len(C) in bits (*128) - vmovq arg4, %xmm1 - vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 - vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) + vmovdqu HashKey_2(arg2), \T5 + vpshufd $0b01001110, \XMM7, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM7, \T2, \T2 + vpxor \T5, \T3, \T3 - vpxor %xmm15, %xmm14, %xmm14 - GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation - vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap + vpclmulqdq $0x11, \T5, \XMM7, \T4 + vpxor \T4, \T6, \T6 - mov arg5, %rax # rax = *Y0 - vmovdqu (%rax), %xmm9 # xmm9 = Y0 + vpclmulqdq $0x00, \T5, \XMM7, \T4 + vpxor \T4, \T7, \T7 - ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) + vpclmulqdq $0x00, \T3, \T2, \T2 - vpxor %xmm14, %xmm9, %xmm9 + vpxor \T2, \XMM1, \XMM1 + ###################### + vmovdqu HashKey(arg2), \T5 + vpshufd $0b01001110, \XMM8, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM8, \T2, \T2 + vpxor \T5, \T3, \T3 -_return_T\@: - mov arg8, %r10 # r10 = authTag - mov 
arg9, %r11 # r11 = auth_tag_len + vpclmulqdq $0x11, \T5, \XMM8, \T4 + vpxor \T4, \T6, \T6 - cmp $16, %r11 - je _T_16\@ + vpclmulqdq $0x00, \T5, \XMM8, \T4 + vpxor \T4, \T7, \T7 - cmp $8, %r11 - jl _T_4\@ + vpclmulqdq $0x00, \T3, \T2, \T2 -_T_8\@: - vmovq %xmm9, %rax - mov %rax, (%r10) - add $8, %r10 - sub $8, %r11 - vpsrldq $8, %xmm9, %xmm9 - cmp $0, %r11 - je _return_T_done\@ -_T_4\@: - vmovd %xmm9, %eax - mov %eax, (%r10) - add $4, %r10 - sub $4, %r11 - vpsrldq $4, %xmm9, %xmm9 - cmp $0, %r11 - je _return_T_done\@ -_T_123\@: - vmovd %xmm9, %eax - cmp $2, %r11 - jl _T_1\@ - mov %ax, (%r10) - cmp $2, %r11 - je _return_T_done\@ - add $2, %r10 - sar $16, %eax -_T_1\@: - mov %al, (%r10) - jmp _return_T_done\@ + vpxor \T2, \XMM1, \XMM1 + vpxor \T6, \XMM1, \XMM1 + vpxor \T7, \XMM1, \T2 -_T_16\@: - vmovdqu %xmm9, (%r10) -_return_T_done\@: - mov %r14, %rsp - pop %r15 - pop %r14 - pop %r13 - pop %r12 -.endm + vpslldq $8, \T2, \T4 + vpsrldq $8, \T2, \T2 -############################################################# -#void aesni_gcm_precomp_avx_gen4 -# (gcm_data *my_ctx_data, -# u8 *hash_subkey)# /* H, the Hash sub key input. -# Data starts on a 16-byte boundary. */ -############################################################# -ENTRY(aesni_gcm_precomp_avx_gen4) - #the number of pushes must equal STACK_OFFSET - push %r12 - push %r13 - push %r14 - push %r15 + vpxor \T4, \T7, \T7 + vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the + # accumulated carry-less multiplications - mov %rsp, %r14 + ####################################################################### + #first phase of the reduction + vmovdqa POLY2(%rip), \T3 + vpclmulqdq $0x01, \T7, \T3, \T2 + vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs + + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### - sub $VARIABLE_OFFSET, %rsp - and $~63, %rsp # align rsp to 64 bytes + #second phase of the reduction + vpclmulqdq $0x00, \T7, \T3, \T2 + vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) - vmovdqu (arg2), %xmm6 # xmm6 = HashKey + vpclmulqdq $0x10, \T7, \T3, \T4 + vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) - vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 - ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey - vmovdqa %xmm6, %xmm2 - vpsllq $1, %xmm6, %xmm6 - vpsrlq $63, %xmm2, %xmm2 - vmovdqa %xmm2, %xmm1 - vpslldq $8, %xmm2, %xmm2 - vpsrldq $8, %xmm1, %xmm1 - vpor %xmm2, %xmm6, %xmm6 - #reduction - vpshufd $0b00100100, %xmm1, %xmm2 - vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 - vpand POLY(%rip), %xmm2, %xmm2 - vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly + vpxor \T2, \T4, \T4 # second phase of the reduction complete ####################################################################### - vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly - + vpxor \T4, \T6, \T6 # the result is in T6 +.endm - PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 - mov %r14, %rsp - pop %r15 - pop %r14 - pop %r13 - pop %r12 +############################################################# +#void aesni_gcm_init_avx_gen4 +# (gcm_data *my_ctx_data, +# gcm_context_data *data, +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. 
*/ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +############################################################# +ENTRY(aesni_gcm_init_avx_gen4) + FUNC_SAVE + INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 + FUNC_RESTORE ret -ENDPROC(aesni_gcm_precomp_avx_gen4) - +ENDPROC(aesni_gcm_init_avx_gen4) ############################################################################### #void aesni_gcm_enc_avx_gen4( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ # const u8 *in, /* Plaintext input */ -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ -# u8 *auth_tag, /* Authenticated Tag output. */ -# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ +# u64 plaintext_len) /* Length of data in Bytes for encryption. */ ############################################################################### -ENTRY(aesni_gcm_enc_avx_gen4) - GCM_ENC_DEC_AVX2 ENC +ENTRY(aesni_gcm_enc_update_avx_gen4) + FUNC_SAVE + mov keysize,%eax + cmp $32, %eax + je key_256_enc_update4 + cmp $16, %eax + je key_128_enc_update4 + # must be 192 + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 + FUNC_RESTORE + ret +key_128_enc_update4: + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 + FUNC_RESTORE ret -ENDPROC(aesni_gcm_enc_avx_gen4) +key_256_enc_update4: + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_enc_update_avx_gen4) ############################################################################### -#void aesni_gcm_dec_avx_gen4( +#void aesni_gcm_dec_update_avx_gen4( # gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ # const u8 *in, /* Ciphertext input */ -# u64 plaintext_len, /* Length of data in Bytes for encryption. */ -# u8 *iv, /* Pre-counter block j0: 4 byte salt -# (from Security Association) concatenated with 8 byte -# Initialisation Vector (from IPSec ESP Payload) -# concatenated with 0x00000001. 16-byte aligned pointer. */ -# const u8 *aad, /* Additional Authentication Data (AAD)*/ -# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ +############################################################################### +ENTRY(aesni_gcm_dec_update_avx_gen4) + FUNC_SAVE + mov keysize,%eax + cmp $32, %eax + je key_256_dec_update4 + cmp $16, %eax + je key_128_dec_update4 + # must be 192 + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 + FUNC_RESTORE + ret +key_128_dec_update4: + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 + FUNC_RESTORE + ret +key_256_dec_update4: + GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_dec_update_avx_gen4) + +############################################################################### +#void aesni_gcm_finalize_avx_gen4( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# gcm_context_data *data, # u8 *auth_tag, /* Authenticated Tag output. */ # u64 auth_tag_len)# /* Authenticated Tag Length in bytes. -# Valid values are 16 (most likely), 12 or 8. */ +# Valid values are 16 (most likely), 12 or 8. */ ############################################################################### -ENTRY(aesni_gcm_dec_avx_gen4) - GCM_ENC_DEC_AVX2 DEC - ret -ENDPROC(aesni_gcm_dec_avx_gen4) +ENTRY(aesni_gcm_finalize_avx_gen4) + FUNC_SAVE + mov keysize,%eax + cmp $32, %eax + je key_256_finalize4 + cmp $16, %eax + je key_128_finalize4 + # must be 192 + GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 + FUNC_RESTORE + ret +key_128_finalize4: + GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 + FUNC_RESTORE + ret +key_256_finalize4: + GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 + FUNC_RESTORE + ret +ENDPROC(aesni_gcm_finalize_avx_gen4) #endif /* CONFIG_AS_AVX2 */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c @@ -84,7 +84,7 @@ struct gcm_context_data { u8 current_counter[GCM_BLOCK_LEN]; u64 partial_block_len; u64 unused; - u8 hash_keys[GCM_BLOCK_LEN * 8]; + u8 hash_keys[GCM_BLOCK_LEN * 16]; }; asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, @@ -175,6 +175,32 @@ asmlinkage void aesni_gcm_finalize(void *ctx, struct gcm_context_data *gdata, u8 *auth_tag, unsigned long auth_tag_len); +static struct aesni_gcm_tfm_s { +void (*init)(void *ctx, + struct gcm_context_data *gdata, + u8 *iv, + u8 *hash_subkey, const u8 *aad, + unsigned long aad_len); +void (*enc_update)(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, + unsigned long plaintext_len); +void (*dec_update)(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, + unsigned long ciphertext_len); +void (*finalize)(void *ctx, + struct gcm_context_data *gdata, + u8 *auth_tag, unsigned long auth_tag_len); +} *aesni_gcm_tfm; + +struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = { + .init = &aesni_gcm_init, + .enc_update = &aesni_gcm_enc_update, + .dec_update = &aesni_gcm_dec_update, + .finalize = &aesni_gcm_finalize, +}; + #ifdef CONFIG_AS_AVX asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); @@ -183,136 +209,94 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv, void *keys, u8 *out, unsigned int num_bytes); /* - * asmlinkage void aesni_gcm_precomp_avx_gen2() + * asmlinkage void aesni_gcm_init_avx_gen2() * gcm_data *my_ctx_data, context data * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. 
*/ -asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey); +asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data, + struct gcm_context_data *gdata, + u8 *iv, + u8 *hash_subkey, + const u8 *aad, + unsigned long aad_len); + +asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, unsigned long plaintext_len); +asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, + unsigned long ciphertext_len); +asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx, + struct gcm_context_data *gdata, + u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out, +asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long plaintext_len, u8 *iv, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out, +asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long ciphertext_len, u8 *iv, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -static void aesni_gcm_enc_avx(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len) -{ - struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; - if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){ - aesni_gcm_enc(ctx, data, out, in, - plaintext_len, iv, hash_subkey, aad, - aad_len, auth_tag, auth_tag_len); - } else { - aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); - aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); - } -} +struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = { + .init = &aesni_gcm_init_avx_gen2, + .enc_update = &aesni_gcm_enc_update_avx_gen2, + .dec_update = &aesni_gcm_dec_update_avx_gen2, + .finalize = &aesni_gcm_finalize_avx_gen2, +}; -static void aesni_gcm_dec_avx(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len) -{ - struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; - if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { - aesni_gcm_dec(ctx, data, out, in, - ciphertext_len, iv, hash_subkey, aad, - aad_len, auth_tag, auth_tag_len); - } else { - aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); - aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); - } -} #endif #ifdef CONFIG_AS_AVX2 /* - * asmlinkage void aesni_gcm_precomp_avx_gen4() + * asmlinkage void aesni_gcm_init_avx_gen4() * gcm_data *my_ctx_data, context data * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. 
*/ -asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey); +asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data, + struct gcm_context_data *gdata, + u8 *iv, + u8 *hash_subkey, + const u8 *aad, + unsigned long aad_len); + +asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, unsigned long plaintext_len); +asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx, + struct gcm_context_data *gdata, u8 *out, + const u8 *in, + unsigned long ciphertext_len); +asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx, + struct gcm_context_data *gdata, + u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out, +asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long plaintext_len, u8 *iv, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out, +asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, + struct gcm_context_data *gdata, u8 *out, const u8 *in, unsigned long ciphertext_len, u8 *iv, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); -static void aesni_gcm_enc_avx2(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long plaintext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len) -{ - struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; - if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { - aesni_gcm_enc(ctx, data, out, in, - plaintext_len, iv, hash_subkey, aad, - aad_len, auth_tag, auth_tag_len); - } else if (plaintext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); - aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); - } else { - aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); - aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); - } -} +struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = { + .init = &aesni_gcm_init_avx_gen4, + .enc_update = &aesni_gcm_enc_update_avx_gen4, + .dec_update = &aesni_gcm_dec_update_avx_gen4, + .finalize = &aesni_gcm_finalize_avx_gen4, +}; -static void aesni_gcm_dec_avx2(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long ciphertext_len, u8 *iv, - u8 *hash_subkey, const u8 *aad, unsigned long aad_len, - u8 *auth_tag, unsigned long auth_tag_len) -{ - struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx; - if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) { - aesni_gcm_dec(ctx, data, out, in, - ciphertext_len, iv, hash_subkey, - aad, aad_len, auth_tag, auth_tag_len); - } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { - aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); - aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); - } else { - aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); - aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad, - aad_len, auth_tag, auth_tag_len); - } -} #endif -static void (*aesni_gcm_enc_tfm)(void *ctx, - struct gcm_context_data *data, u8 *out, - const u8 *in, unsigned long plaintext_len, - u8 *iv, u8 *hash_subkey, const u8 *aad, - unsigned long aad_len, u8 *auth_tag, - unsigned long auth_tag_len); - -static void (*aesni_gcm_dec_tfm)(void *ctx, - struct gcm_context_data 
*data, u8 *out, - const u8 *in, unsigned long ciphertext_len, - u8 *iv, u8 *hash_subkey, const u8 *aad, - unsigned long aad_len, u8 *auth_tag, - unsigned long auth_tag_len); - static inline struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) { @@ -794,6 +778,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, { struct crypto_aead *tfm = crypto_aead_reqtfm(req); unsigned long auth_tag_len = crypto_aead_authsize(tfm); + struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm; struct gcm_context_data data AESNI_ALIGN_ATTR; struct scatter_walk dst_sg_walk = {}; unsigned long left = req->cryptlen; @@ -811,6 +796,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, if (!enc) left -= auth_tag_len; +#ifdef CONFIG_AS_AVX2 + if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4) + gcm_tfm = &aesni_gcm_tfm_avx_gen2; +#endif +#ifdef CONFIG_AS_AVX + if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2) + gcm_tfm = &aesni_gcm_tfm_sse; +#endif + /* Linearize assoc, if not already linear */ if (req->src->length >= assoclen && req->src->length && (!PageHighMem(sg_page(req->src)) || @@ -835,7 +829,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, } kernel_fpu_begin(); - aesni_gcm_init(aes_ctx, &data, iv, + gcm_tfm->init(aes_ctx, &data, iv, hash_subkey, assoc, assoclen); if (req->src != req->dst) { while (left) { @@ -846,10 +840,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, len = min(srclen, dstlen); if (len) { if (enc) - aesni_gcm_enc_update(aes_ctx, &data, + gcm_tfm->enc_update(aes_ctx, &data, dst, src, len); else - aesni_gcm_dec_update(aes_ctx, &data, + gcm_tfm->dec_update(aes_ctx, &data, dst, src, len); } left -= len; @@ -867,10 +861,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, len = scatterwalk_clamp(&src_sg_walk, left); if (len) { if (enc) - aesni_gcm_enc_update(aes_ctx, &data, + gcm_tfm->enc_update(aes_ctx, &data, src, src, len); else - aesni_gcm_dec_update(aes_ctx, &data, + gcm_tfm->dec_update(aes_ctx, &data, src, src, len); } left -= len; @@ -879,7 +873,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, scatterwalk_done(&src_sg_walk, 1, left); } } - aesni_gcm_finalize(aes_ctx, &data, authTag, auth_tag_len); + gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len); kernel_fpu_end(); if (!assocmem) @@ -912,147 +906,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, u8 *iv, void *aes_ctx) { - u8 one_entry_in_sg = 0; - u8 *src, *dst, *assoc; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - struct scatter_walk src_sg_walk; - struct scatter_walk dst_sg_walk = {}; - struct gcm_context_data data AESNI_ALIGN_ATTR; - - if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 || - aesni_gcm_enc_tfm == aesni_gcm_enc || - req->cryptlen < AVX_GEN2_OPTSIZE) { - return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, - aes_ctx); - } - if (sg_is_last(req->src) && - (!PageHighMem(sg_page(req->src)) || - req->src->offset + req->src->length <= PAGE_SIZE) && - sg_is_last(req->dst) && - (!PageHighMem(sg_page(req->dst)) || - req->dst->offset + req->dst->length <= PAGE_SIZE)) { - one_entry_in_sg = 1; - scatterwalk_start(&src_sg_walk, req->src); - assoc = scatterwalk_map(&src_sg_walk); - src = assoc + req->assoclen; - dst = src; - if (unlikely(req->src 
!= req->dst)) { - scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; - } - } else { - /* Allocate memory for src, dst, assoc */ - assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen, - GFP_ATOMIC); - if (unlikely(!assoc)) - return -ENOMEM; - scatterwalk_map_and_copy(assoc, req->src, 0, - req->assoclen + req->cryptlen, 0); - src = assoc + req->assoclen; - dst = src; - } - - kernel_fpu_begin(); - aesni_gcm_enc_tfm(aes_ctx, &data, dst, src, req->cryptlen, iv, - hash_subkey, assoc, assoclen, - dst + req->cryptlen, auth_tag_len); - kernel_fpu_end(); - - /* The authTag (aka the Integrity Check Value) needs to be written - * back to the packet. */ - if (one_entry_in_sg) { - if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst - req->assoclen); - scatterwalk_advance(&dst_sg_walk, req->dst->length); - scatterwalk_done(&dst_sg_walk, 1, 0); - } - scatterwalk_unmap(assoc); - scatterwalk_advance(&src_sg_walk, req->src->length); - scatterwalk_done(&src_sg_walk, req->src == req->dst, 0); - } else { - scatterwalk_map_and_copy(dst, req->dst, req->assoclen, - req->cryptlen + auth_tag_len, 1); - kfree(assoc); - } - return 0; + return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, + aes_ctx); } static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, u8 *hash_subkey, u8 *iv, void *aes_ctx) { - u8 one_entry_in_sg = 0; - u8 *src, *dst, *assoc; - unsigned long tempCipherLen = 0; - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - unsigned long auth_tag_len = crypto_aead_authsize(tfm); - u8 authTag[16]; - struct scatter_walk src_sg_walk; - struct scatter_walk dst_sg_walk = {}; - struct gcm_context_data data AESNI_ALIGN_ATTR; - int retval = 0; - - if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 || - aesni_gcm_enc_tfm == aesni_gcm_enc || - req->cryptlen < AVX_GEN2_OPTSIZE) { - return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, - aes_ctx); - } - tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); - - if (sg_is_last(req->src) && - (!PageHighMem(sg_page(req->src)) || - req->src->offset + req->src->length <= PAGE_SIZE) && - sg_is_last(req->dst) && req->dst->length && - (!PageHighMem(sg_page(req->dst)) || - req->dst->offset + req->dst->length <= PAGE_SIZE)) { - one_entry_in_sg = 1; - scatterwalk_start(&src_sg_walk, req->src); - assoc = scatterwalk_map(&src_sg_walk); - src = assoc + req->assoclen; - dst = src; - if (unlikely(req->src != req->dst)) { - scatterwalk_start(&dst_sg_walk, req->dst); - dst = scatterwalk_map(&dst_sg_walk) + req->assoclen; - } - } else { - /* Allocate memory for src, dst, assoc */ - assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); - if (!assoc) - return -ENOMEM; - scatterwalk_map_and_copy(assoc, req->src, 0, - req->assoclen + req->cryptlen, 0); - src = assoc + req->assoclen; - dst = src; - } - - - kernel_fpu_begin(); - aesni_gcm_dec_tfm(aes_ctx, &data, dst, src, tempCipherLen, iv, - hash_subkey, assoc, assoclen, - authTag, auth_tag_len); - kernel_fpu_end(); - - /* Compare generated tag with passed in tag. */ - retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ? 
- -EBADMSG : 0; - - if (one_entry_in_sg) { - if (unlikely(req->src != req->dst)) { - scatterwalk_unmap(dst - req->assoclen); - scatterwalk_advance(&dst_sg_walk, req->dst->length); - scatterwalk_done(&dst_sg_walk, 1, 0); - } - scatterwalk_unmap(assoc); - scatterwalk_advance(&src_sg_walk, req->src->length); - scatterwalk_done(&src_sg_walk, req->src == req->dst, 0); - } else { - scatterwalk_map_and_copy(dst, req->dst, req->assoclen, - tempCipherLen, 1); - kfree(assoc); - } - return retval; - + return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, + aes_ctx); } static int helper_rfc4106_encrypt(struct aead_request *req) @@ -1420,21 +1282,18 @@ static int __init aesni_init(void) #ifdef CONFIG_AS_AVX2 if (boot_cpu_has(X86_FEATURE_AVX2)) { pr_info("AVX2 version of gcm_enc/dec engaged.\n"); - aesni_gcm_enc_tfm = aesni_gcm_enc_avx2; - aesni_gcm_dec_tfm = aesni_gcm_dec_avx2; + aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4; } else #endif #ifdef CONFIG_AS_AVX if (boot_cpu_has(X86_FEATURE_AVX)) { pr_info("AVX version of gcm_enc/dec engaged.\n"); - aesni_gcm_enc_tfm = aesni_gcm_enc_avx; - aesni_gcm_dec_tfm = aesni_gcm_dec_avx; + aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2; } else #endif { pr_info("SSE version of gcm_enc/dec engaged.\n"); - aesni_gcm_enc_tfm = aesni_gcm_enc; - aesni_gcm_dec_tfm = aesni_gcm_dec; + aesni_gcm_tfm = &aesni_gcm_tfm_sse; } aesni_ctr_enc_tfm = aesni_ctr_enc; #ifdef CONFIG_AS_AVX diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S @@ -0,0 +1,1025 @@ +/* + * ChaCha 256-bit cipher algorithm, x64 AVX2 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include <linux/linkage.h> + +.section .rodata.cst32.ROT8, "aM", @progbits, 32 +.align 32 +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 + .octa 0x0e0d0c0f0a09080b0605040702010003 + +.section .rodata.cst32.ROT16, "aM", @progbits, 32 +.align 32 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 + .octa 0x0d0c0f0e09080b0a0504070601000302 + +.section .rodata.cst32.CTRINC, "aM", @progbits, 32 +.align 32 +CTRINC: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + +.text + +ENTRY(chacha_2block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: up to 2 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts two ChaCha blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. 
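For reference, the double round that chacha_2block_xor_avx2 (and the wider 4- and 8-block variants below) vectorizes can be written in scalar C as follows. This is an illustrative sketch built by analogy with the per-row comments in the assembly ("x0 += x1, x3 = rotl32(x3 ^ x0, 16)" and so on); it is not code from this patch, and the ROTL32/QR/chacha_doubleround names are invented for the example.

        #include <stdint.h>

        #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

        /* One ChaCha quarter round: add, xor, rotate by 16/12/8/7. */
        #define QR(a, b, c, d) do {                        \
                a += b; d ^= a; d = ROTL32(d, 16);         \
                c += d; b ^= c; b = ROTL32(b, 12);         \
                a += b; d ^= a; d = ROTL32(d, 8);          \
                c += d; b ^= c; b = ROTL32(b, 7);          \
        } while (0)

        /* One double round over the 4x4 32-bit state: a column round
         * followed by a diagonal round.  The vpshufd shuffles in the
         * assembly perform the same diagonalisation on whole 128-bit
         * rows instead of on individual words. */
        static void chacha_doubleround(uint32_t x[16])
        {
                QR(x[0], x[4], x[8],  x[12]);
                QR(x[1], x[5], x[9],  x[13]);
                QR(x[2], x[6], x[10], x[14]);
                QR(x[3], x[7], x[11], x[15]);

                QR(x[0], x[5], x[10], x[15]);
                QR(x[1], x[6], x[11], x[12]);
                QR(x[2], x[7], x[8],  x[13]);
                QR(x[3], x[4], x[9],  x[14]);
        }

Because one double round covers two ChaCha rounds, the .Ldoubleround loops in these functions subtract 2 from the nrounds argument in %r8d on every iteration.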
+ + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + + vmovdqa ROT8(%rip),%ymm4 + vmovdqa ROT16(%rip),%ymm5 + + mov %rcx,%rax + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm5,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm6 + vpslld $12,%ymm6,%ymm6 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm6,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm4,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm7 + vpslld $7,%ymm7,%ymm7 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + sub $2,%r8d + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rax + jl .Lxorpart2 + vpxor 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rax + jl .Lxorpart2 + vpxor 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rax + jl .Lxorpart2 + vpxor 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rax + jl .Lxorpart2 + vpxor 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rax + jl .Lxorpart2 + vpxor 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rax + jl .Lxorpart2 + vpxor 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rax + jl .Lxorpart2 + vpxor 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rax + jl .Lxorpart2 + vpxor 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + ret + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone2 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov 
%rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm7,%xmm7 + vmovdqa %xmm7,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone2 + +ENDPROC(chacha_2block_xor_avx2) + +ENTRY(chacha_4block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four ChaCha blocks by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + + vmovdqa ROT8(%rip),%ymm8 + vmovdqa ROT16(%rip),%ymm9 + + mov %rcx,%rax + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm9,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm9,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $12,%ymm10,%ymm10 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpshufb %ymm8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + vpshufb %ymm8,%ymm7,%ymm7 + 
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vmovdqa %ymm1,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm10,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm6,%ymm5,%ymm5 + vmovdqa %ymm5,%ymm10 + vpslld $7,%ymm10,%ymm10 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm10,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + sub $2,%r8d + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rax + jl .Lxorpart4 + vpxor 0x00(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rax + jl .Lxorpart4 + vpxor 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rax + jl .Lxorpart4 + vpxor 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rax + jl .Lxorpart4 + vpxor 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rax + jl .Lxorpart4 + vpxor 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rax + jl .Lxorpart4 + vpxor 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rax + jl .Lxorpart4 + vpxor 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rax + jl .Lxorpart4 + vpxor 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rax + jl .Lxorpart4 + vpxor 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rax + jl .Lxorpart4 + vpxor 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rax + jl .Lxorpart4 + vpxor 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rax + jl .Lxorpart4 + vpxor 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rax + jl .Lxorpart4 + vpxor 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rax + jl .Lxorpart4 + vpxor 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rax + jl .Lxorpart4 + vpxor 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rax + jl .Lxorpart4 + vpxor 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%xmm10,%xmm10 + vmovdqa 
%xmm10,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone4 + +ENDPROC(chacha_4block_xor_avx2) + +ENTRY(chacha_8block_xor_avx2) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts eight consecutive ChaCha blocks by loading + # the state matrix in AVX registers eight times. As we need some + # scratch registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32-, 64- and then 128-bit + # words, which allows us to do XOR in AVX registers. 8/16-bit word + # rotation is done with the slightly better performing byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + vzeroupper + # 4 * 32 byte stack, 32-byte aligned + lea 8(%rsp),%r10 + and $~31, %rsp + sub $0x80, %rsp + mov %rcx,%rax + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + # x0..3 on stack + vmovdqa %ymm0,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm3,0x60(%rsp) + + vmovdqa CTRINC(%rip),%ymm1 + vmovdqa ROT8(%rip),%ymm2 + vmovdqa ROT16(%rip),%ymm3 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + 
vpshufb %ymm2,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxor %ymm11,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm3,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm3,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm3,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm3,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $12,%ymm5,%ymm0 + vpsrld $20,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $12,%ymm6,%ymm0 + vpsrld $20,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $12,%ymm7,%ymm0 + vpsrld $20,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $12,%ymm4,%ymm0 + vpsrld $20,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd 0x00(%rsp),%ymm5,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpxor %ymm0,%ymm15,%ymm15 + vpshufb %ymm2,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd 0x20(%rsp),%ymm6,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm2,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd 0x40(%rsp),%ymm7,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpxor %ymm0,%ymm13,%ymm13 + vpshufb %ymm2,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd 0x60(%rsp),%ymm4,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpxor %ymm0,%ymm14,%ymm14 + vpshufb %ymm2,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxor %ymm10,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm0 + vpsrld $25,%ymm5,%ymm5 + vpor %ymm0,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxor %ymm11,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm0 + vpsrld $25,%ymm6,%ymm6 + vpor %ymm0,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpslld $7,%ymm7,%ymm0 + vpsrld $25,%ymm7,%ymm7 + vpor %ymm0,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd 
%ymm14,%ymm9,%ymm9 + vpxor %ymm9,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm0 + vpsrld $25,%ymm4,%ymm4 + vpor %ymm0,%ymm4,%ymm4 + + sub $2,%r8d + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpaddd 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + vpbroadcastd 0x04(%rdi),%ymm0 + vpaddd 0x20(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x20(%rsp) + vpbroadcastd 0x08(%rdi),%ymm0 + vpaddd 0x40(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x40(%rsp) + vpbroadcastd 0x0c(%rdi),%ymm0 + vpaddd 0x60(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x60(%rsp) + vpbroadcastd 0x10(%rdi),%ymm0 + vpaddd %ymm0,%ymm4,%ymm4 + vpbroadcastd 0x14(%rdi),%ymm0 + vpaddd %ymm0,%ymm5,%ymm5 + vpbroadcastd 0x18(%rdi),%ymm0 + vpaddd %ymm0,%ymm6,%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm0 + vpaddd %ymm0,%ymm7,%ymm7 + vpbroadcastd 0x20(%rdi),%ymm0 + vpaddd %ymm0,%ymm8,%ymm8 + vpbroadcastd 0x24(%rdi),%ymm0 + vpaddd %ymm0,%ymm9,%ymm9 + vpbroadcastd 0x28(%rdi),%ymm0 + vpaddd %ymm0,%ymm10,%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm0 + vpaddd %ymm0,%ymm11,%ymm11 + vpbroadcastd 0x30(%rdi),%ymm0 + vpaddd %ymm0,%ymm12,%ymm12 + vpbroadcastd 0x34(%rdi),%ymm0 + vpaddd %ymm0,%ymm13,%ymm13 + vpbroadcastd 0x38(%rdi),%ymm0 + vpaddd %ymm0,%ymm14,%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm0 + vpaddd %ymm0,%ymm15,%ymm15 + + # x12 += counter values 0-3 + vpaddd %ymm1,%ymm12,%ymm12 + + # interleave 32-bit words in state n, n+1 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x20(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x00(%rsp) + vmovdqa %ymm1,0x20(%rsp) + vmovdqa 0x40(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm1 + vpunpckldq %ymm1,%ymm0,%ymm2 + vpunpckhdq %ymm1,%ymm0,%ymm1 + vmovdqa %ymm2,0x40(%rsp) + vmovdqa %ymm1,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpckldq %ymm5,%ymm0,%ymm4 + vpunpckhdq %ymm5,%ymm0,%ymm5 + vmovdqa %ymm6,%ymm0 + vpunpckldq %ymm7,%ymm0,%ymm6 + vpunpckhdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpckldq %ymm9,%ymm0,%ymm8 + vpunpckhdq %ymm9,%ymm0,%ymm9 + vmovdqa %ymm10,%ymm0 + vpunpckldq %ymm11,%ymm0,%ymm10 + vpunpckhdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpckldq %ymm13,%ymm0,%ymm12 + vpunpckhdq %ymm13,%ymm0,%ymm13 + vmovdqa %ymm14,%ymm0 + vpunpckldq %ymm15,%ymm0,%ymm14 + vpunpckhdq %ymm15,%ymm0,%ymm15 + + # interleave 64-bit words in state n, n+2 + vmovdqa 0x00(%rsp),%ymm0 + vmovdqa 0x40(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x00(%rsp) + vmovdqa %ymm2,0x40(%rsp) + vmovdqa 0x20(%rsp),%ymm0 + vmovdqa 0x60(%rsp),%ymm2 + vpunpcklqdq %ymm2,%ymm0,%ymm1 + vpunpckhqdq %ymm2,%ymm0,%ymm2 + vmovdqa %ymm1,0x20(%rsp) + vmovdqa %ymm2,0x60(%rsp) + vmovdqa %ymm4,%ymm0 + vpunpcklqdq %ymm6,%ymm0,%ymm4 + vpunpckhqdq %ymm6,%ymm0,%ymm6 + vmovdqa %ymm5,%ymm0 + vpunpcklqdq %ymm7,%ymm0,%ymm5 + vpunpckhqdq %ymm7,%ymm0,%ymm7 + vmovdqa %ymm8,%ymm0 + vpunpcklqdq %ymm10,%ymm0,%ymm8 + vpunpckhqdq %ymm10,%ymm0,%ymm10 + vmovdqa %ymm9,%ymm0 + vpunpcklqdq %ymm11,%ymm0,%ymm9 + vpunpckhqdq %ymm11,%ymm0,%ymm11 + vmovdqa %ymm12,%ymm0 + vpunpcklqdq %ymm14,%ymm0,%ymm12 + vpunpckhqdq %ymm14,%ymm0,%ymm14 + vmovdqa %ymm13,%ymm0 + vpunpcklqdq %ymm15,%ymm0,%ymm13 + vpunpckhqdq %ymm15,%ymm0,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa 0x00(%rsp),%ymm1 + vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 + cmp $0x0020,%rax + jl .Lxorpart8 + vpxor 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0000(%rsi) + vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rax + jl .Lxorpart8 + vpxor 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu 
%ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vmovdqa 0x40(%rsp),%ymm1 + vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 + cmp $0x0060,%rax + jl .Lxorpart8 + vpxor 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rax + jl .Lxorpart8 + vpxor 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vmovdqa 0x20(%rsp),%ymm1 + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rax + jl .Lxorpart8 + vpxor 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rax + jl .Lxorpart8 + vpxor 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vmovdqa 0x60(%rsp),%ymm1 + vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 + cmp $0x00e0,%rax + jl .Lxorpart8 + vpxor 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rax + jl .Lxorpart8 + vpxor 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa %ymm4,%ymm0 + cmp $0x0120,%rax + jl .Lxorpart8 + vpxor 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0100(%rsi) + + vmovdqa %ymm12,%ymm0 + cmp $0x0140,%rax + jl .Lxorpart8 + vpxor 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0120(%rsi) + + vmovdqa %ymm6,%ymm0 + cmp $0x0160,%rax + jl .Lxorpart8 + vpxor 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0140(%rsi) + + vmovdqa %ymm14,%ymm0 + cmp $0x0180,%rax + jl .Lxorpart8 + vpxor 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0160(%rsi) + + vmovdqa %ymm5,%ymm0 + cmp $0x01a0,%rax + jl .Lxorpart8 + vpxor 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x0180(%rsi) + + vmovdqa %ymm13,%ymm0 + cmp $0x01c0,%rax + jl .Lxorpart8 + vpxor 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01a0(%rsi) + + vmovdqa %ymm7,%ymm0 + cmp $0x01e0,%rax + jl .Lxorpart8 + vpxor 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01c0(%rsi) + + vmovdqa %ymm15,%ymm0 + cmp $0x0200,%rax + jl .Lxorpart8 + vpxor 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + lea -8(%r10),%rsp + ret + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x1f,%r9 + jz .Ldone8 + and $~0x1f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + vpxor 0x00(%rsp),%ymm0,%ymm0 + vmovdqa %ymm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone8 + +ENDPROC(chacha_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S @@ -0,0 +1,836 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions + * + * Copyright (C) 2018 Martin Willi + */ + +#include <linux/linkage.h> + +.section .rodata.cst32.CTR2BL, "aM", @progbits, 32 +.align 32 +CTR2BL: .octa 0x00000000000000000000000000000000 + .octa 0x00000000000000000000000000000001 + +.section .rodata.cst32.CTR4BL, "aM", @progbits, 32 +.align 32 +CTR4BL: .octa 0x00000000000000000000000000000002 + .octa 0x00000000000000000000000000000003 + +.section .rodata.cst32.CTR8BL, "aM", @progbits, 32 +.align 32 +CTR8BL: .octa 0x00000003000000020000000100000000 + .octa 0x00000007000000060000000500000004 + +.text + +ENTRY(chacha_2block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 2 data blocks output, o + # %rdx: 
up to 2 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts two ChaCha blocks by loading the state + # matrix twice across four AVX registers. It performs matrix operations + # on four words in each matrix in parallel, but requires shuffling to + # rearrange the words after each round. + + vzeroupper + + # x0..3[0-2] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + + vmovdqa %ymm0,%ymm8 + vmovdqa %ymm1,%ymm9 + vmovdqa %ymm2,%ymm10 + vmovdqa %ymm3,%ymm11 + +.Ldoubleround: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + + sub $2,%r8d + jnz .Ldoubleround + + # o0 = i0 ^ (x0 + s0) + vpaddd %ymm8,%ymm0,%ymm7 + cmp $0x10,%rcx + jl .Lxorpart2 + vpxord 0x00(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x00(%rsi) + vextracti128 $1,%ymm7,%xmm0 + # o1 = i1 ^ (x1 + s1) + vpaddd %ymm9,%ymm1,%ymm7 + cmp $0x20,%rcx + jl .Lxorpart2 + vpxord 0x10(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x10(%rsi) + vextracti128 $1,%ymm7,%xmm1 + # o2 = i2 ^ (x2 + s2) + vpaddd %ymm10,%ymm2,%ymm7 + cmp $0x30,%rcx + jl .Lxorpart2 + vpxord 0x20(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x20(%rsi) + vextracti128 $1,%ymm7,%xmm2 + # o3 = i3 ^ (x3 + s3) + vpaddd %ymm11,%ymm3,%ymm7 + cmp $0x40,%rcx + jl .Lxorpart2 + vpxord 0x30(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x30(%rsi) + vextracti128 $1,%ymm7,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm7 + cmp $0x50,%rcx + jl .Lxorpart2 + vpxord 0x40(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x40(%rsi) + + vmovdqa %xmm1,%xmm7 + cmp $0x60,%rcx + jl .Lxorpart2 + vpxord 0x50(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x50(%rsi) + + vmovdqa %xmm2,%xmm7 + cmp $0x70,%rcx + jl .Lxorpart2 + vpxord 0x60(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x60(%rsi) + + vmovdqa %xmm3,%xmm7 + cmp $0x80,%rcx + jl .Lxorpart2 + vpxord 0x70(%rdx),%xmm7,%xmm6 + vmovdqu %xmm6,0x70(%rsi) + +.Ldone2: + vzeroupper + ret + +.Lxorpart2: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + 
vpxord %xmm7,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone2 + +ENDPROC(chacha_2block_xor_avx512vl) + +ENTRY(chacha_4block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four ChaCha blocks by loading the state + # matrix four times across eight AVX registers. It performs matrix + # operations on four words in two matrices in parallel, sequentially + # to the operations on the four words of the other two matrices. The + # required word shuffling has a rather high latency, we can do the + # arithmetic on two matrix-pairs without much slowdown. + + vzeroupper + + # x0..3[0-4] = s0..3 + vbroadcasti128 0x00(%rdi),%ymm0 + vbroadcasti128 0x10(%rdi),%ymm1 + vbroadcasti128 0x20(%rdi),%ymm2 + vbroadcasti128 0x30(%rdi),%ymm3 + + vmovdqa %ymm0,%ymm4 + vmovdqa %ymm1,%ymm5 + vmovdqa %ymm2,%ymm6 + vmovdqa %ymm3,%ymm7 + + vpaddd CTR2BL(%rip),%ymm3,%ymm3 + vpaddd CTR4BL(%rip),%ymm7,%ymm7 + + vmovdqa %ymm0,%ymm11 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm3,%ymm14 + vmovdqa %ymm7,%ymm15 + +.Ldoubleround4: + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm1,%ymm1 + vpshufd $0x39,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm3,%ymm3 + vpshufd $0x93,%ymm7,%ymm7 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $16,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $16,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $12,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + vpaddd %ymm1,%ymm0,%ymm0 + vpxord %ymm0,%ymm3,%ymm3 + vprold $8,%ymm3,%ymm3 + + vpaddd %ymm5,%ymm4,%ymm4 + vpxord %ymm4,%ymm7,%ymm7 + vprold $8,%ymm7,%ymm7 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + vpaddd %ymm3,%ymm2,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vprold $7,%ymm1,%ymm1 + + vpaddd %ymm7,%ymm6,%ymm6 + vpxord %ymm6,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + vpshufd $0x93,%ymm1,%ymm1 + vpshufd $0x93,%ymm5,%ymm5 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + vpshufd $0x4e,%ymm2,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + vpshufd $0x39,%ymm3,%ymm3 + vpshufd $0x39,%ymm7,%ymm7 + + sub $2,%r8d + jnz .Ldoubleround4 + + # o0 = i0 ^ (x0 + s0), first block + vpaddd %ymm11,%ymm0,%ymm10 + cmp $0x10,%rcx + jl .Lxorpart4 + vpxord 0x00(%rdx),%xmm10,%xmm9 + 
vmovdqu %xmm9,0x00(%rsi) + vextracti128 $1,%ymm10,%xmm0 + # o1 = i1 ^ (x1 + s1), first block + vpaddd %ymm12,%ymm1,%ymm10 + cmp $0x20,%rcx + jl .Lxorpart4 + vpxord 0x10(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x10(%rsi) + vextracti128 $1,%ymm10,%xmm1 + # o2 = i2 ^ (x2 + s2), first block + vpaddd %ymm13,%ymm2,%ymm10 + cmp $0x30,%rcx + jl .Lxorpart4 + vpxord 0x20(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x20(%rsi) + vextracti128 $1,%ymm10,%xmm2 + # o3 = i3 ^ (x3 + s3), first block + vpaddd %ymm14,%ymm3,%ymm10 + cmp $0x40,%rcx + jl .Lxorpart4 + vpxord 0x30(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x30(%rsi) + vextracti128 $1,%ymm10,%xmm3 + + # xor and write second block + vmovdqa %xmm0,%xmm10 + cmp $0x50,%rcx + jl .Lxorpart4 + vpxord 0x40(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x40(%rsi) + + vmovdqa %xmm1,%xmm10 + cmp $0x60,%rcx + jl .Lxorpart4 + vpxord 0x50(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x50(%rsi) + + vmovdqa %xmm2,%xmm10 + cmp $0x70,%rcx + jl .Lxorpart4 + vpxord 0x60(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x60(%rsi) + + vmovdqa %xmm3,%xmm10 + cmp $0x80,%rcx + jl .Lxorpart4 + vpxord 0x70(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x70(%rsi) + + # o0 = i0 ^ (x0 + s0), third block + vpaddd %ymm11,%ymm4,%ymm10 + cmp $0x90,%rcx + jl .Lxorpart4 + vpxord 0x80(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x80(%rsi) + vextracti128 $1,%ymm10,%xmm4 + # o1 = i1 ^ (x1 + s1), third block + vpaddd %ymm12,%ymm5,%ymm10 + cmp $0xa0,%rcx + jl .Lxorpart4 + vpxord 0x90(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0x90(%rsi) + vextracti128 $1,%ymm10,%xmm5 + # o2 = i2 ^ (x2 + s2), third block + vpaddd %ymm13,%ymm6,%ymm10 + cmp $0xb0,%rcx + jl .Lxorpart4 + vpxord 0xa0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xa0(%rsi) + vextracti128 $1,%ymm10,%xmm6 + # o3 = i3 ^ (x3 + s3), third block + vpaddd %ymm15,%ymm7,%ymm10 + cmp $0xc0,%rcx + jl .Lxorpart4 + vpxord 0xb0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xb0(%rsi) + vextracti128 $1,%ymm10,%xmm7 + + # xor and write fourth block + vmovdqa %xmm4,%xmm10 + cmp $0xd0,%rcx + jl .Lxorpart4 + vpxord 0xc0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xc0(%rsi) + + vmovdqa %xmm5,%xmm10 + cmp $0xe0,%rcx + jl .Lxorpart4 + vpxord 0xd0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xd0(%rsi) + + vmovdqa %xmm6,%xmm10 + cmp $0xf0,%rcx + jl .Lxorpart4 + vpxord 0xe0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xe0(%rsi) + + vmovdqa %xmm7,%xmm10 + cmp $0x100,%rcx + jl .Lxorpart4 + vpxord 0xf0(%rdx),%xmm10,%xmm9 + vmovdqu %xmm9,0xf0(%rsi) + +.Ldone4: + vzeroupper + ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0xf,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0xf,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z} + vpxord %xmm10,%xmm1,%xmm1 + vmovdqu8 %xmm1,(%rsi,%r9){%k1} + + jmp .Ldone4 + +ENDPROC(chacha_4block_xor_avx512vl) + +ENTRY(chacha_8block_xor_avx512vl) + # %rdi: Input state matrix, s + # %rsi: up to 8 data blocks output, o + # %rdx: up to 8 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts eight consecutive ChaCha blocks by loading + # the state matrix in AVX registers eight times. Compared to AVX2, this + # mostly benefits from the new rotate instructions in VL and the + # additional registers. 
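
For reference, the double round that the vectorized code below repeats nrounds/2 times can be written as a short scalar C sketch (the names here are illustrative, not taken from the kernel sources). The AVX-512VL function applies exactly this add/xor/rotate pattern with vpaddd/vpxord/vprold, keeping one block in each 32-bit lane of a register so that eight blocks advance in lockstep:

    #include <stdint.h>

    /* Plain 32-bit left rotate; the assembly uses vprold instead. */
    static inline uint32_t rotl32(uint32_t v, int n)
    {
            return (v << n) | (v >> (32 - n));
    }

    /* One ChaCha quarter round on state words a, b, c, d. */
    static void chacha_quarterround(uint32_t x[16], int a, int b, int c, int d)
    {
            x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
            x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
            x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
            x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
    }

    /* One double round: four column quarter rounds, then four diagonal
     * ones; this is the pattern the .Ldoubleround8 loop below runs
     * nrounds/2 times. */
    static void chacha_doubleround(uint32_t x[16])
    {
            chacha_quarterround(x, 0, 4,  8, 12);
            chacha_quarterround(x, 1, 5,  9, 13);
            chacha_quarterround(x, 2, 6, 10, 14);
            chacha_quarterround(x, 3, 7, 11, 15);
            chacha_quarterround(x, 0, 5, 10, 15);
            chacha_quarterround(x, 1, 6, 11, 12);
            chacha_quarterround(x, 2, 7,  8, 13);
            chacha_quarterround(x, 3, 4,  9, 14);
    }
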
+ + vzeroupper + + # x0..15[0-7] = s[0..15] + vpbroadcastd 0x00(%rdi),%ymm0 + vpbroadcastd 0x04(%rdi),%ymm1 + vpbroadcastd 0x08(%rdi),%ymm2 + vpbroadcastd 0x0c(%rdi),%ymm3 + vpbroadcastd 0x10(%rdi),%ymm4 + vpbroadcastd 0x14(%rdi),%ymm5 + vpbroadcastd 0x18(%rdi),%ymm6 + vpbroadcastd 0x1c(%rdi),%ymm7 + vpbroadcastd 0x20(%rdi),%ymm8 + vpbroadcastd 0x24(%rdi),%ymm9 + vpbroadcastd 0x28(%rdi),%ymm10 + vpbroadcastd 0x2c(%rdi),%ymm11 + vpbroadcastd 0x30(%rdi),%ymm12 + vpbroadcastd 0x34(%rdi),%ymm13 + vpbroadcastd 0x38(%rdi),%ymm14 + vpbroadcastd 0x3c(%rdi),%ymm15 + + # x12 += counter values 0-3 + vpaddd CTR8BL(%rip),%ymm12,%ymm12 + + vmovdqa64 %ymm0,%ymm16 + vmovdqa64 %ymm1,%ymm17 + vmovdqa64 %ymm2,%ymm18 + vmovdqa64 %ymm3,%ymm19 + vmovdqa64 %ymm4,%ymm20 + vmovdqa64 %ymm5,%ymm21 + vmovdqa64 %ymm6,%ymm22 + vmovdqa64 %ymm7,%ymm23 + vmovdqa64 %ymm8,%ymm24 + vmovdqa64 %ymm9,%ymm25 + vmovdqa64 %ymm10,%ymm26 + vmovdqa64 %ymm11,%ymm27 + vmovdqa64 %ymm12,%ymm28 + vmovdqa64 %ymm13,%ymm29 + vmovdqa64 %ymm14,%ymm30 + vmovdqa64 %ymm15,%ymm31 + +.Ldoubleround8: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 12) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + vpaddd %ymm0,%ymm4,%ymm0 + vpxord %ymm0,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + vpaddd %ymm1,%ymm5,%ymm1 + vpxord %ymm1,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + vpaddd %ymm2,%ymm6,%ymm2 + vpxord %ymm2,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + vpaddd %ymm3,%ymm7,%ymm3 + vpxord %ymm3,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + vpaddd %ymm12,%ymm8,%ymm8 + vpxord %ymm8,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + vpaddd %ymm13,%ymm9,%ymm9 + vpxord %ymm9,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + vpaddd %ymm14,%ymm10,%ymm10 + vpxord %ymm10,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + vpaddd %ymm15,%ymm11,%ymm11 + vpxord %ymm11,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $16,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $16,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $16,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $16,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ 
x10, 12) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $12,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $12,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $12,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $12,%ymm4,%ymm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + vpaddd %ymm0,%ymm5,%ymm0 + vpxord %ymm0,%ymm15,%ymm15 + vprold $8,%ymm15,%ymm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + vpaddd %ymm1,%ymm6,%ymm1 + vpxord %ymm1,%ymm12,%ymm12 + vprold $8,%ymm12,%ymm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + vpaddd %ymm2,%ymm7,%ymm2 + vpxord %ymm2,%ymm13,%ymm13 + vprold $8,%ymm13,%ymm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + vpaddd %ymm3,%ymm4,%ymm3 + vpxord %ymm3,%ymm14,%ymm14 + vprold $8,%ymm14,%ymm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + vpaddd %ymm15,%ymm10,%ymm10 + vpxord %ymm10,%ymm5,%ymm5 + vprold $7,%ymm5,%ymm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + vpaddd %ymm12,%ymm11,%ymm11 + vpxord %ymm11,%ymm6,%ymm6 + vprold $7,%ymm6,%ymm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + vpaddd %ymm13,%ymm8,%ymm8 + vpxord %ymm8,%ymm7,%ymm7 + vprold $7,%ymm7,%ymm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + vpaddd %ymm14,%ymm9,%ymm9 + vpxord %ymm9,%ymm4,%ymm4 + vprold $7,%ymm4,%ymm4 + + sub $2,%r8d + jnz .Ldoubleround8 + + # x0..15[0-3] += s[0..15] + vpaddd %ymm16,%ymm0,%ymm0 + vpaddd %ymm17,%ymm1,%ymm1 + vpaddd %ymm18,%ymm2,%ymm2 + vpaddd %ymm19,%ymm3,%ymm3 + vpaddd %ymm20,%ymm4,%ymm4 + vpaddd %ymm21,%ymm5,%ymm5 + vpaddd %ymm22,%ymm6,%ymm6 + vpaddd %ymm23,%ymm7,%ymm7 + vpaddd %ymm24,%ymm8,%ymm8 + vpaddd %ymm25,%ymm9,%ymm9 + vpaddd %ymm26,%ymm10,%ymm10 + vpaddd %ymm27,%ymm11,%ymm11 + vpaddd %ymm28,%ymm12,%ymm12 + vpaddd %ymm29,%ymm13,%ymm13 + vpaddd %ymm30,%ymm14,%ymm14 + vpaddd %ymm31,%ymm15,%ymm15 + + # interleave 32-bit words in state n, n+1 + vpunpckldq %ymm1,%ymm0,%ymm16 + vpunpckhdq %ymm1,%ymm0,%ymm17 + vpunpckldq %ymm3,%ymm2,%ymm18 + vpunpckhdq %ymm3,%ymm2,%ymm19 + vpunpckldq %ymm5,%ymm4,%ymm20 + vpunpckhdq %ymm5,%ymm4,%ymm21 + vpunpckldq %ymm7,%ymm6,%ymm22 + vpunpckhdq %ymm7,%ymm6,%ymm23 + vpunpckldq %ymm9,%ymm8,%ymm24 + vpunpckhdq %ymm9,%ymm8,%ymm25 + vpunpckldq %ymm11,%ymm10,%ymm26 + vpunpckhdq %ymm11,%ymm10,%ymm27 + vpunpckldq %ymm13,%ymm12,%ymm28 + vpunpckhdq %ymm13,%ymm12,%ymm29 + vpunpckldq %ymm15,%ymm14,%ymm30 + vpunpckhdq %ymm15,%ymm14,%ymm31 + + # interleave 64-bit words in state n, n+2 + vpunpcklqdq %ymm18,%ymm16,%ymm0 + vpunpcklqdq %ymm19,%ymm17,%ymm1 + vpunpckhqdq %ymm18,%ymm16,%ymm2 + vpunpckhqdq %ymm19,%ymm17,%ymm3 + vpunpcklqdq %ymm22,%ymm20,%ymm4 + vpunpcklqdq %ymm23,%ymm21,%ymm5 + vpunpckhqdq %ymm22,%ymm20,%ymm6 + vpunpckhqdq %ymm23,%ymm21,%ymm7 + vpunpcklqdq %ymm26,%ymm24,%ymm8 + vpunpcklqdq %ymm27,%ymm25,%ymm9 + vpunpckhqdq %ymm26,%ymm24,%ymm10 + vpunpckhqdq %ymm27,%ymm25,%ymm11 + vpunpcklqdq %ymm30,%ymm28,%ymm12 + vpunpcklqdq %ymm31,%ymm29,%ymm13 + vpunpckhqdq %ymm30,%ymm28,%ymm14 + vpunpckhqdq %ymm31,%ymm29,%ymm15 + + # interleave 128-bit words in state n, n+4 + # xor/write first four blocks + vmovdqa64 %ymm0,%ymm16 + vperm2i128 $0x20,%ymm4,%ymm0,%ymm0 + cmp $0x0020,%rcx + jl .Lxorpart8 + vpxord 0x0000(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0000(%rsi) + vmovdqa64 %ymm16,%ymm0 + vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 + + vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 + cmp $0x0040,%rcx + jl .Lxorpart8 + vpxord 0x0020(%rdx),%ymm0,%ymm0 + vmovdqu64 
%ymm0,0x0020(%rsi) + vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 + + vperm2i128 $0x20,%ymm6,%ymm2,%ymm0 + cmp $0x0060,%rcx + jl .Lxorpart8 + vpxord 0x0040(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0040(%rsi) + vperm2i128 $0x31,%ymm6,%ymm2,%ymm6 + + vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 + cmp $0x0080,%rcx + jl .Lxorpart8 + vpxord 0x0060(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0060(%rsi) + vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 + + vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 + cmp $0x00a0,%rcx + jl .Lxorpart8 + vpxord 0x0080(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0080(%rsi) + vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 + + vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 + cmp $0x00c0,%rcx + jl .Lxorpart8 + vpxord 0x00a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00a0(%rsi) + vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 + + vperm2i128 $0x20,%ymm7,%ymm3,%ymm0 + cmp $0x00e0,%rcx + jl .Lxorpart8 + vpxord 0x00c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00c0(%rsi) + vperm2i128 $0x31,%ymm7,%ymm3,%ymm7 + + vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 + cmp $0x0100,%rcx + jl .Lxorpart8 + vpxord 0x00e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x00e0(%rsi) + vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 + + # xor remaining blocks, write to output + vmovdqa64 %ymm4,%ymm0 + cmp $0x0120,%rcx + jl .Lxorpart8 + vpxord 0x0100(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0100(%rsi) + + vmovdqa64 %ymm12,%ymm0 + cmp $0x0140,%rcx + jl .Lxorpart8 + vpxord 0x0120(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0120(%rsi) + + vmovdqa64 %ymm6,%ymm0 + cmp $0x0160,%rcx + jl .Lxorpart8 + vpxord 0x0140(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0140(%rsi) + + vmovdqa64 %ymm14,%ymm0 + cmp $0x0180,%rcx + jl .Lxorpart8 + vpxord 0x0160(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0160(%rsi) + + vmovdqa64 %ymm5,%ymm0 + cmp $0x01a0,%rcx + jl .Lxorpart8 + vpxord 0x0180(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x0180(%rsi) + + vmovdqa64 %ymm13,%ymm0 + cmp $0x01c0,%rcx + jl .Lxorpart8 + vpxord 0x01a0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01a0(%rsi) + + vmovdqa64 %ymm7,%ymm0 + cmp $0x01e0,%rcx + jl .Lxorpart8 + vpxord 0x01c0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01c0(%rsi) + + vmovdqa64 %ymm15,%ymm0 + cmp $0x0200,%rcx + jl .Lxorpart8 + vpxord 0x01e0(%rdx),%ymm0,%ymm0 + vmovdqu64 %ymm0,0x01e0(%rsi) + +.Ldone8: + vzeroupper + ret + +.Lxorpart8: + # xor remaining bytes from partial register into output + mov %rcx,%rax + and $0x1f,%rcx + jz .Ldone8 + mov %rax,%r9 + and $~0x1f,%r9 + + mov $1,%rax + shld %cl,%rax,%rax + sub $1,%rax + kmovq %rax,%k1 + + vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z} + vpxord %ymm0,%ymm1,%ymm1 + vmovdqu8 %ymm1,(%rsi,%r9){%k1} + + jmp .Ldone8 + +ENDPROC(chacha_8block_xor_avx512vl) diff --git a/arch/x86/crypto/chacha-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S @@ -0,0 +1,795 @@ +/* + * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions + * + * Copyright (C) 2015 Martin Willi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include <linux/linkage.h> +#include <asm/frame.h> + +.section .rodata.cst16.ROT8, "aM", @progbits, 16 +.align 16 +ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 +ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 +.section .rodata.cst16.CTRINC, "aM", @progbits, 16 +.align 16 +CTRINC: .octa 0x00000003000000020000000100000000 + +.text + +/* + * chacha_permute - permute one block + * + * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This + * function performs matrix operations on four words in parallel, but requires + * shuffling to rearrange the words after each round. 8/16-bit word rotation is + * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word + * rotation uses traditional shift+OR. + * + * The round count is given in %r8d. + * + * Clobbers: %r8d, %xmm4-%xmm7 + */ +chacha_permute: + + movdqa ROT8(%rip),%xmm4 + movdqa ROT16(%rip),%xmm5 + +.Ldoubleround: + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm3,%xmm3 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 16) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm5,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 12) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm6 + pslld $12,%xmm6 + psrld $20,%xmm1 + por %xmm6,%xmm1 + + # x0 += x1, x3 = rotl32(x3 ^ x0, 8) + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm4,%xmm3 + + # x2 += x3, x1 = rotl32(x1 ^ x2, 7) + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm7 + pslld $7,%xmm7 + psrld $25,%xmm1 + por %xmm7,%xmm1 + + # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) + pshufd $0x93,%xmm1,%xmm1 + # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) + pshufd $0x4e,%xmm2,%xmm2 + # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) + pshufd $0x39,%xmm3,%xmm3 + + sub $2,%r8d + jnz .Ldoubleround + + ret +ENDPROC(chacha_permute) + +ENTRY(chacha_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 1 data block output, o + # %rdx: up to 1 data block input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + FRAME_BEGIN + + # x0..3 = s0..3 + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + mov %rcx,%rax + call chacha_permute + + # o0 = i0 ^ (x0 + s0) + paddd %xmm8,%xmm0 + cmp $0x10,%rax + jl .Lxorpart + movdqu 0x00(%rdx),%xmm4 + pxor %xmm4,%xmm0 + movdqu %xmm0,0x00(%rsi) + # o1 = i1 ^ (x1 + s1) + paddd %xmm9,%xmm1 + movdqa %xmm1,%xmm0 + cmp $0x20,%rax + jl .Lxorpart + movdqu 0x10(%rdx),%xmm0 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) + # o2 = i2 ^ (x2 + s2) + paddd %xmm10,%xmm2 + movdqa %xmm2,%xmm0 + cmp $0x30,%rax + jl .Lxorpart + movdqu 0x20(%rdx),%xmm0 + pxor %xmm2,%xmm0 + movdqu %xmm0,0x20(%rsi) + # o3 = i3 ^ (x3 + s3) + paddd %xmm11,%xmm3 + movdqa %xmm3,%xmm0 + cmp $0x40,%rax + jl .Lxorpart + movdqu 0x30(%rdx),%xmm0 
+ pxor %xmm3,%xmm0 + movdqu %xmm0,0x30(%rsi) + +.Ldone: + FRAME_END + ret + +.Lxorpart: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone + and $~0x0f,%rax + + mov %rsi,%r11 + + lea 8(%rsp),%r10 + sub $0x10,%rsp + and $~31,%rsp + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + lea -8(%r10),%rsp + jmp .Ldone + +ENDPROC(chacha_block_xor_ssse3) + +ENTRY(hchacha_block_ssse3) + # %rdi: Input state matrix, s + # %rsi: output (8 32-bit words) + # %edx: nrounds + FRAME_BEGIN + + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + + mov %edx,%r8d + call chacha_permute + + movdqu %xmm0,0x00(%rsi) + movdqu %xmm3,0x10(%rsi) + + FRAME_END + ret +ENDPROC(hchacha_block_ssse3) + +ENTRY(chacha_4block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 4 data blocks output, o + # %rdx: up to 4 data blocks input, i + # %rcx: input/output length in bytes + # %r8d: nrounds + + # This function encrypts four consecutive ChaCha blocks by loading the + # the state matrix in SSE registers four times. As we need some scratch + # registers, we save the first four registers on the stack. The + # algorithm performs each operation on the corresponding word of each + # state matrix, hence requires no word shuffling. For final XORing step + # we transpose the matrix by interleaving 32- and then 64-bit words, + # which allows us to do XOR in SSE registers. 8/16-bit word rotation is + # done with the slightly better performing SSSE3 byte shuffling, + # 7/12-bit word rotation uses traditional shift+OR. + + lea 8(%rsp),%r10 + sub $0x80,%rsp + and $~63,%rsp + mov %rcx,%rax + + # x0..15[0-3] = s0..3[0..3] + movq 0x00(%rdi),%xmm1 + pshufd $0x00,%xmm1,%xmm0 + pshufd $0x55,%xmm1,%xmm1 + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + movq 0x10(%rdi),%xmm5 + pshufd $0x00,%xmm5,%xmm4 + pshufd $0x55,%xmm5,%xmm5 + movq 0x18(%rdi),%xmm7 + pshufd $0x00,%xmm7,%xmm6 + pshufd $0x55,%xmm7,%xmm7 + movq 0x20(%rdi),%xmm9 + pshufd $0x00,%xmm9,%xmm8 + pshufd $0x55,%xmm9,%xmm9 + movq 0x28(%rdi),%xmm11 + pshufd $0x00,%xmm11,%xmm10 + pshufd $0x55,%xmm11,%xmm11 + movq 0x30(%rdi),%xmm13 + pshufd $0x00,%xmm13,%xmm12 + pshufd $0x55,%xmm13,%xmm13 + movq 0x38(%rdi),%xmm15 + pshufd $0x00,%xmm15,%xmm14 + pshufd $0x55,%xmm15,%xmm15 + # x0..3 on stack + movdqa %xmm0,0x00(%rsp) + movdqa %xmm1,0x10(%rsp) + movdqa %xmm2,0x20(%rsp) + movdqa %xmm3,0x30(%rsp) + + movdqa CTRINC(%rip),%xmm1 + movdqa ROT8(%rip),%xmm2 + movdqa ROT16(%rip),%xmm3 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + +.Ldoubleround4: + # x0 += x4, x12 = rotl32(x12 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 12) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, 
x5 = rotl32(x5 ^ x9, 12) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 12) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 12) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + + # x0 += x4, x12 = rotl32(x12 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x1 += x5, x13 = rotl32(x13 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x2 += x6, x14 = rotl32(x14 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm2,%xmm14 + # x3 += x7, x15 = rotl32(x15 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + + # x8 += x12, x4 = rotl32(x4 ^ x8, 7) + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + # x9 += x13, x5 = rotl32(x5 ^ x9, 7) + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x10 += x14, x6 = rotl32(x6 ^ x10, 7) + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x11 += x15, x7 = rotl32(x7 ^ x11, 7) + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 16) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm3,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 16) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm3,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 16) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm3,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 16) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb %xmm3,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 12) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 12) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 12) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 12) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $12,%xmm0 + psrld $20,%xmm4 + por %xmm0,%xmm4 + + # x0 += x5, x15 = rotl32(x15 ^ x0, 8) + movdqa 0x00(%rsp),%xmm0 + paddd %xmm5,%xmm0 + movdqa %xmm0,0x00(%rsp) + pxor %xmm0,%xmm15 + pshufb %xmm2,%xmm15 + # x1 += x6, x12 = rotl32(x12 ^ x1, 8) + movdqa 0x10(%rsp),%xmm0 + paddd %xmm6,%xmm0 + movdqa %xmm0,0x10(%rsp) + pxor %xmm0,%xmm12 + pshufb %xmm2,%xmm12 + # x2 += x7, x13 = rotl32(x13 ^ x2, 8) + movdqa 0x20(%rsp),%xmm0 + paddd %xmm7,%xmm0 + movdqa %xmm0,0x20(%rsp) + pxor %xmm0,%xmm13 + pshufb %xmm2,%xmm13 + # x3 += x4, x14 = rotl32(x14 ^ x3, 8) + movdqa 0x30(%rsp),%xmm0 + paddd %xmm4,%xmm0 + movdqa %xmm0,0x30(%rsp) + pxor %xmm0,%xmm14 + pshufb 
%xmm2,%xmm14 + + # x10 += x15, x5 = rotl32(x5 ^ x10, 7) + paddd %xmm15,%xmm10 + pxor %xmm10,%xmm5 + movdqa %xmm5,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm5 + por %xmm0,%xmm5 + # x11 += x12, x6 = rotl32(x6 ^ x11, 7) + paddd %xmm12,%xmm11 + pxor %xmm11,%xmm6 + movdqa %xmm6,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm6 + por %xmm0,%xmm6 + # x8 += x13, x7 = rotl32(x7 ^ x8, 7) + paddd %xmm13,%xmm8 + pxor %xmm8,%xmm7 + movdqa %xmm7,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm7 + por %xmm0,%xmm7 + # x9 += x14, x4 = rotl32(x4 ^ x9, 7) + paddd %xmm14,%xmm9 + pxor %xmm9,%xmm4 + movdqa %xmm4,%xmm0 + pslld $7,%xmm0 + psrld $25,%xmm4 + por %xmm0,%xmm4 + + sub $2,%r8d + jnz .Ldoubleround4 + + # x0[0-3] += s0[0] + # x1[0-3] += s0[1] + movq 0x00(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x00(%rsp),%xmm2 + movdqa %xmm2,0x00(%rsp) + paddd 0x10(%rsp),%xmm3 + movdqa %xmm3,0x10(%rsp) + # x2[0-3] += s0[2] + # x3[0-3] += s0[3] + movq 0x08(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd 0x20(%rsp),%xmm2 + movdqa %xmm2,0x20(%rsp) + paddd 0x30(%rsp),%xmm3 + movdqa %xmm3,0x30(%rsp) + + # x4[0-3] += s1[0] + # x5[0-3] += s1[1] + movq 0x10(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + # x6[0-3] += s1[2] + # x7[0-3] += s1[3] + movq 0x18(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + + # x8[0-3] += s2[0] + # x9[0-3] += s2[1] + movq 0x20(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm8 + paddd %xmm3,%xmm9 + # x10[0-3] += s2[2] + # x11[0-3] += s2[3] + movq 0x28(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm10 + paddd %xmm3,%xmm11 + + # x12[0-3] += s3[0] + # x13[0-3] += s3[1] + movq 0x30(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm12 + paddd %xmm3,%xmm13 + # x14[0-3] += s3[2] + # x15[0-3] += s3[3] + movq 0x38(%rdi),%xmm3 + pshufd $0x00,%xmm3,%xmm2 + pshufd $0x55,%xmm3,%xmm3 + paddd %xmm2,%xmm14 + paddd %xmm3,%xmm15 + + # x12 += counter values 0-3 + paddd %xmm1,%xmm12 + + # interleave 32-bit words in state n, n+1 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x10(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x10(%rsp) + movdqa 0x20(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpckldq %xmm1,%xmm2 + punpckhdq %xmm1,%xmm0 + movdqa %xmm2,0x20(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpckldq %xmm5,%xmm4 + punpckhdq %xmm5,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm6,%xmm0 + punpckldq %xmm7,%xmm6 + punpckhdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm9,%xmm0 + movdqa %xmm0,%xmm9 + movdqa %xmm10,%xmm0 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpckldq %xmm13,%xmm12 + punpckhdq %xmm13,%xmm0 + movdqa %xmm0,%xmm13 + movdqa %xmm14,%xmm0 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # interleave 64-bit words in state n, n+2 + movdqa 0x00(%rsp),%xmm0 + movdqa 0x20(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x00(%rsp) + movdqa %xmm0,0x20(%rsp) + movdqa 0x10(%rsp),%xmm0 + movdqa 0x30(%rsp),%xmm1 + movdqa %xmm0,%xmm2 + punpcklqdq %xmm1,%xmm2 + punpckhqdq %xmm1,%xmm0 + movdqa %xmm2,0x10(%rsp) + movdqa %xmm0,0x30(%rsp) + movdqa %xmm4,%xmm0 + punpcklqdq %xmm6,%xmm4 + punpckhqdq 
%xmm6,%xmm0 + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + punpcklqdq %xmm7,%xmm5 + punpckhqdq %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + movdqa %xmm8,%xmm0 + punpcklqdq %xmm10,%xmm8 + punpckhqdq %xmm10,%xmm0 + movdqa %xmm0,%xmm10 + movdqa %xmm9,%xmm0 + punpcklqdq %xmm11,%xmm9 + punpckhqdq %xmm11,%xmm0 + movdqa %xmm0,%xmm11 + movdqa %xmm12,%xmm0 + punpcklqdq %xmm14,%xmm12 + punpckhqdq %xmm14,%xmm0 + movdqa %xmm0,%xmm14 + movdqa %xmm13,%xmm0 + punpcklqdq %xmm15,%xmm13 + punpckhqdq %xmm15,%xmm0 + movdqa %xmm0,%xmm15 + + # xor with corresponding input, write to output + movdqa 0x00(%rsp),%xmm0 + cmp $0x10,%rax + jl .Lxorpart4 + movdqu 0x00(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x00(%rsi) + + movdqu %xmm4,%xmm0 + cmp $0x20,%rax + jl .Lxorpart4 + movdqu 0x10(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x10(%rsi) + + movdqu %xmm8,%xmm0 + cmp $0x30,%rax + jl .Lxorpart4 + movdqu 0x20(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x20(%rsi) + + movdqu %xmm12,%xmm0 + cmp $0x40,%rax + jl .Lxorpart4 + movdqu 0x30(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x30(%rsi) + + movdqa 0x20(%rsp),%xmm0 + cmp $0x50,%rax + jl .Lxorpart4 + movdqu 0x40(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x40(%rsi) + + movdqu %xmm6,%xmm0 + cmp $0x60,%rax + jl .Lxorpart4 + movdqu 0x50(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x50(%rsi) + + movdqu %xmm10,%xmm0 + cmp $0x70,%rax + jl .Lxorpart4 + movdqu 0x60(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x60(%rsi) + + movdqu %xmm14,%xmm0 + cmp $0x80,%rax + jl .Lxorpart4 + movdqu 0x70(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x70(%rsi) + + movdqa 0x10(%rsp),%xmm0 + cmp $0x90,%rax + jl .Lxorpart4 + movdqu 0x80(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x80(%rsi) + + movdqu %xmm5,%xmm0 + cmp $0xa0,%rax + jl .Lxorpart4 + movdqu 0x90(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0x90(%rsi) + + movdqu %xmm9,%xmm0 + cmp $0xb0,%rax + jl .Lxorpart4 + movdqu 0xa0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xa0(%rsi) + + movdqu %xmm13,%xmm0 + cmp $0xc0,%rax + jl .Lxorpart4 + movdqu 0xb0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xb0(%rsi) + + movdqa 0x30(%rsp),%xmm0 + cmp $0xd0,%rax + jl .Lxorpart4 + movdqu 0xc0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xc0(%rsi) + + movdqu %xmm7,%xmm0 + cmp $0xe0,%rax + jl .Lxorpart4 + movdqu 0xd0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xd0(%rsi) + + movdqu %xmm11,%xmm0 + cmp $0xf0,%rax + jl .Lxorpart4 + movdqu 0xe0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xe0(%rsi) + + movdqu %xmm15,%xmm0 + cmp $0x100,%rax + jl .Lxorpart4 + movdqu 0xf0(%rdx),%xmm1 + pxor %xmm1,%xmm0 + movdqu %xmm0,0xf0(%rsi) + +.Ldone4: + lea -8(%r10),%rsp + ret + +.Lxorpart4: + # xor remaining bytes from partial register into output + mov %rax,%r9 + and $0x0f,%r9 + jz .Ldone4 + and $~0x0f,%rax + + mov %rsi,%r11 + + lea (%rdx,%rax),%rsi + mov %rsp,%rdi + mov %r9,%rcx + rep movsb + + pxor 0x00(%rsp),%xmm0 + movdqa %xmm0,0x00(%rsp) + + mov %rsp,%rsi + lea (%r11,%rax),%rdi + mov %r9,%rcx + rep movsb + + jmp .Ldone4 + +ENDPROC(chacha_4block_xor_ssse3) diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S @@ -1,448 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ - -#include <linux/linkage.h> - -.section .rodata.cst32.ROT8, "aM", @progbits, 32 -.align 32 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 - .octa 0x0e0d0c0f0a09080b0605040702010003 - -.section .rodata.cst32.ROT16, "aM", @progbits, 32 -.align 32 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 - .octa 0x0d0c0f0e09080b0a0504070601000302 - -.section .rodata.cst32.CTRINC, "aM", @progbits, 32 -.align 32 -CTRINC: .octa 0x00000003000000020000000100000000 - .octa 0x00000007000000060000000500000004 - -.text - -ENTRY(chacha20_8block_xor_avx2) - # %rdi: Input state matrix, s - # %rsi: 8 data blocks output, o - # %rdx: 8 data blocks input, i - - # This function encrypts eight consecutive ChaCha20 blocks by loading - # the state matrix in AVX registers eight times. As we need some - # scratch registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32-, 64- and then 128-bit - # words, which allows us to do XOR in AVX registers. 8/16-bit word - # rotation is done with the slightly better performing byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. - - vzeroupper - # 4 * 32 byte stack, 32-byte aligned - lea 8(%rsp),%r10 - and $~31, %rsp - sub $0x80, %rsp - - # x0..15[0-7] = s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpbroadcastd 0x04(%rdi),%ymm1 - vpbroadcastd 0x08(%rdi),%ymm2 - vpbroadcastd 0x0c(%rdi),%ymm3 - vpbroadcastd 0x10(%rdi),%ymm4 - vpbroadcastd 0x14(%rdi),%ymm5 - vpbroadcastd 0x18(%rdi),%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm7 - vpbroadcastd 0x20(%rdi),%ymm8 - vpbroadcastd 0x24(%rdi),%ymm9 - vpbroadcastd 0x28(%rdi),%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm11 - vpbroadcastd 0x30(%rdi),%ymm12 - vpbroadcastd 0x34(%rdi),%ymm13 - vpbroadcastd 0x38(%rdi),%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm15 - # x0..3 on stack - vmovdqa %ymm0,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm3,0x60(%rsp) - - vmovdqa CTRINC(%rip),%ymm1 - vmovdqa ROT8(%rip),%ymm2 - vmovdqa ROT16(%rip),%ymm3 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - - mov $10,%ecx - -.Ldoubleround8: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor 
%ymm0,%ymm7,%ymm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm11,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm3,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0 - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm3,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm3,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm3,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $12,%ymm5,%ymm0 - vpsrld $20,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor %ymm11,%ymm6,%ymm6 - vpslld $12,%ymm6,%ymm0 - vpsrld $20,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - vpaddd %ymm13,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpslld $12,%ymm7,%ymm0 - vpsrld $20,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $12,%ymm4,%ymm0 - vpsrld $20,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - vpaddd 0x00(%rsp),%ymm5,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpxor %ymm0,%ymm15,%ymm15 - vpshufb %ymm2,%ymm15,%ymm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - vpaddd 0x20(%rsp),%ymm6,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm2,%ymm12,%ymm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - vpaddd 0x40(%rsp),%ymm7,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpxor %ymm0,%ymm13,%ymm13 - vpshufb %ymm2,%ymm13,%ymm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - vpaddd 0x60(%rsp),%ymm4,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpxor %ymm0,%ymm14,%ymm14 - vpshufb %ymm2,%ymm14,%ymm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - vpaddd %ymm15,%ymm10,%ymm10 - vpxor %ymm10,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm0 - vpsrld $25,%ymm5,%ymm5 - vpor %ymm0,%ymm5,%ymm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - vpaddd %ymm12,%ymm11,%ymm11 - vpxor 
%ymm11,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm0 - vpsrld $25,%ymm6,%ymm6 - vpor %ymm0,%ymm6,%ymm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - vpaddd %ymm13,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpslld $7,%ymm7,%ymm0 - vpsrld $25,%ymm7,%ymm7 - vpor %ymm0,%ymm7,%ymm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm9,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm0 - vpsrld $25,%ymm4,%ymm4 - vpor %ymm0,%ymm4,%ymm4 - - dec %ecx - jnz .Ldoubleround8 - - # x0..15[0-3] += s[0..15] - vpbroadcastd 0x00(%rdi),%ymm0 - vpaddd 0x00(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x00(%rsp) - vpbroadcastd 0x04(%rdi),%ymm0 - vpaddd 0x20(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x20(%rsp) - vpbroadcastd 0x08(%rdi),%ymm0 - vpaddd 0x40(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x40(%rsp) - vpbroadcastd 0x0c(%rdi),%ymm0 - vpaddd 0x60(%rsp),%ymm0,%ymm0 - vmovdqa %ymm0,0x60(%rsp) - vpbroadcastd 0x10(%rdi),%ymm0 - vpaddd %ymm0,%ymm4,%ymm4 - vpbroadcastd 0x14(%rdi),%ymm0 - vpaddd %ymm0,%ymm5,%ymm5 - vpbroadcastd 0x18(%rdi),%ymm0 - vpaddd %ymm0,%ymm6,%ymm6 - vpbroadcastd 0x1c(%rdi),%ymm0 - vpaddd %ymm0,%ymm7,%ymm7 - vpbroadcastd 0x20(%rdi),%ymm0 - vpaddd %ymm0,%ymm8,%ymm8 - vpbroadcastd 0x24(%rdi),%ymm0 - vpaddd %ymm0,%ymm9,%ymm9 - vpbroadcastd 0x28(%rdi),%ymm0 - vpaddd %ymm0,%ymm10,%ymm10 - vpbroadcastd 0x2c(%rdi),%ymm0 - vpaddd %ymm0,%ymm11,%ymm11 - vpbroadcastd 0x30(%rdi),%ymm0 - vpaddd %ymm0,%ymm12,%ymm12 - vpbroadcastd 0x34(%rdi),%ymm0 - vpaddd %ymm0,%ymm13,%ymm13 - vpbroadcastd 0x38(%rdi),%ymm0 - vpaddd %ymm0,%ymm14,%ymm14 - vpbroadcastd 0x3c(%rdi),%ymm0 - vpaddd %ymm0,%ymm15,%ymm15 - - # x12 += counter values 0-3 - vpaddd %ymm1,%ymm12,%ymm12 - - # interleave 32-bit words in state n, n+1 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x20(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x00(%rsp) - vmovdqa %ymm1,0x20(%rsp) - vmovdqa 0x40(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm1 - vpunpckldq %ymm1,%ymm0,%ymm2 - vpunpckhdq %ymm1,%ymm0,%ymm1 - vmovdqa %ymm2,0x40(%rsp) - vmovdqa %ymm1,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpckldq %ymm5,%ymm0,%ymm4 - vpunpckhdq %ymm5,%ymm0,%ymm5 - vmovdqa %ymm6,%ymm0 - vpunpckldq %ymm7,%ymm0,%ymm6 - vpunpckhdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpckldq %ymm9,%ymm0,%ymm8 - vpunpckhdq %ymm9,%ymm0,%ymm9 - vmovdqa %ymm10,%ymm0 - vpunpckldq %ymm11,%ymm0,%ymm10 - vpunpckhdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpckldq %ymm13,%ymm0,%ymm12 - vpunpckhdq %ymm13,%ymm0,%ymm13 - vmovdqa %ymm14,%ymm0 - vpunpckldq %ymm15,%ymm0,%ymm14 - vpunpckhdq %ymm15,%ymm0,%ymm15 - - # interleave 64-bit words in state n, n+2 - vmovdqa 0x00(%rsp),%ymm0 - vmovdqa 0x40(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x00(%rsp) - vmovdqa %ymm2,0x40(%rsp) - vmovdqa 0x20(%rsp),%ymm0 - vmovdqa 0x60(%rsp),%ymm2 - vpunpcklqdq %ymm2,%ymm0,%ymm1 - vpunpckhqdq %ymm2,%ymm0,%ymm2 - vmovdqa %ymm1,0x20(%rsp) - vmovdqa %ymm2,0x60(%rsp) - vmovdqa %ymm4,%ymm0 - vpunpcklqdq %ymm6,%ymm0,%ymm4 - vpunpckhqdq %ymm6,%ymm0,%ymm6 - vmovdqa %ymm5,%ymm0 - vpunpcklqdq %ymm7,%ymm0,%ymm5 - vpunpckhqdq %ymm7,%ymm0,%ymm7 - vmovdqa %ymm8,%ymm0 - vpunpcklqdq %ymm10,%ymm0,%ymm8 - vpunpckhqdq %ymm10,%ymm0,%ymm10 - vmovdqa %ymm9,%ymm0 - vpunpcklqdq %ymm11,%ymm0,%ymm9 - vpunpckhqdq %ymm11,%ymm0,%ymm11 - vmovdqa %ymm12,%ymm0 - vpunpcklqdq %ymm14,%ymm0,%ymm12 - vpunpckhqdq %ymm14,%ymm0,%ymm14 - vmovdqa %ymm13,%ymm0 - vpunpcklqdq %ymm15,%ymm0,%ymm13 - vpunpckhqdq %ymm15,%ymm0,%ymm15 - - # interleave 128-bit words in state n, n+4 - vmovdqa 0x00(%rsp),%ymm0 - vperm2i128 
$0x20,%ymm4,%ymm0,%ymm1 - vperm2i128 $0x31,%ymm4,%ymm0,%ymm4 - vmovdqa %ymm1,0x00(%rsp) - vmovdqa 0x20(%rsp),%ymm0 - vperm2i128 $0x20,%ymm5,%ymm0,%ymm1 - vperm2i128 $0x31,%ymm5,%ymm0,%ymm5 - vmovdqa %ymm1,0x20(%rsp) - vmovdqa 0x40(%rsp),%ymm0 - vperm2i128 $0x20,%ymm6,%ymm0,%ymm1 - vperm2i128 $0x31,%ymm6,%ymm0,%ymm6 - vmovdqa %ymm1,0x40(%rsp) - vmovdqa 0x60(%rsp),%ymm0 - vperm2i128 $0x20,%ymm7,%ymm0,%ymm1 - vperm2i128 $0x31,%ymm7,%ymm0,%ymm7 - vmovdqa %ymm1,0x60(%rsp) - vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - vmovdqa %ymm0,%ymm8 - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - vmovdqa %ymm0,%ymm9 - vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 - vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 - vmovdqa %ymm0,%ymm10 - vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - vmovdqa %ymm0,%ymm11 - - # xor with corresponding input, write to output - vmovdqa 0x00(%rsp),%ymm0 - vpxor 0x0000(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0000(%rsi) - vmovdqa 0x20(%rsp),%ymm0 - vpxor 0x0080(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0080(%rsi) - vmovdqa 0x40(%rsp),%ymm0 - vpxor 0x0040(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x0040(%rsi) - vmovdqa 0x60(%rsp),%ymm0 - vpxor 0x00c0(%rdx),%ymm0,%ymm0 - vmovdqu %ymm0,0x00c0(%rsi) - vpxor 0x0100(%rdx),%ymm4,%ymm4 - vmovdqu %ymm4,0x0100(%rsi) - vpxor 0x0180(%rdx),%ymm5,%ymm5 - vmovdqu %ymm5,0x00180(%rsi) - vpxor 0x0140(%rdx),%ymm6,%ymm6 - vmovdqu %ymm6,0x0140(%rsi) - vpxor 0x01c0(%rdx),%ymm7,%ymm7 - vmovdqu %ymm7,0x01c0(%rsi) - vpxor 0x0020(%rdx),%ymm8,%ymm8 - vmovdqu %ymm8,0x0020(%rsi) - vpxor 0x00a0(%rdx),%ymm9,%ymm9 - vmovdqu %ymm9,0x00a0(%rsi) - vpxor 0x0060(%rdx),%ymm10,%ymm10 - vmovdqu %ymm10,0x0060(%rsi) - vpxor 0x00e0(%rdx),%ymm11,%ymm11 - vmovdqu %ymm11,0x00e0(%rsi) - vpxor 0x0120(%rdx),%ymm12,%ymm12 - vmovdqu %ymm12,0x0120(%rsi) - vpxor 0x01a0(%rdx),%ymm13,%ymm13 - vmovdqu %ymm13,0x01a0(%rsi) - vpxor 0x0160(%rdx),%ymm14,%ymm14 - vmovdqu %ymm14,0x0160(%rsi) - vpxor 0x01e0(%rdx),%ymm15,%ymm15 - vmovdqu %ymm15,0x01e0(%rsi) - - vzeroupper - lea -8(%r10),%rsp - ret -ENDPROC(chacha20_8block_xor_avx2) diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -1,630 +0,0 @@ -/* - * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions - * - * Copyright (C) 2015 Martin Willi - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#include <linux/linkage.h> - -.section .rodata.cst16.ROT8, "aM", @progbits, 16 -.align 16 -ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003 -.section .rodata.cst16.ROT16, "aM", @progbits, 16 -.align 16 -ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302 -.section .rodata.cst16.CTRINC, "aM", @progbits, 16 -.align 16 -CTRINC: .octa 0x00000003000000020000000100000000 - -.text - -ENTRY(chacha20_block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: 1 data block output, o - # %rdx: 1 data block input, i - - # This function encrypts one ChaCha20 block by loading the state matrix - # in four SSE registers. It performs matrix operation on four words in - # parallel, but requireds shuffling to rearrange the words after each - # round. 8/16-bit word rotation is done with the slightly better - # performing SSSE3 byte shuffling, 7/12-bit word rotation uses - # traditional shift+OR. 
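
The byte-shuffle rotation mentioned in this header comment (and kept in the new chacha_permute above) works because rotating a 32-bit word by 8 or 16 bits only moves whole bytes, so a single pshufb with the ROT8/ROT16 masks replaces the pslld/psrld/por triple used for the 7- and 12-bit rotates. A small intrinsics sketch of the 16-bit case, assuming GCC or Clang with -mssse3; the test value and names are illustrative only:

    #include <stdint.h>
    #include <stdio.h>
    #include <immintrin.h>

    int main(void)
    {
            /* Byte order matching the ROT16 .octa constant: each dword
             * (b0,b1,b2,b3) becomes (b2,b3,b0,b1), i.e. rotl32(v, 16). */
            const __m128i rot16 = _mm_set_epi8(13, 12, 15, 14,  9,  8, 11, 10,
                                                5,  4,  7,  6,  1,  0,  3,  2);
            __m128i v = _mm_set1_epi32(0x01234567);          /* sample value */

            __m128i by_shuffle = _mm_shuffle_epi8(v, rot16);  /* pshufb */
            __m128i by_shifts  = _mm_or_si128(_mm_slli_epi32(v, 16),
                                              _mm_srli_epi32(v, 16));
                                                              /* pslld/psrld/por */

            uint32_t a = (uint32_t)_mm_cvtsi128_si32(by_shuffle);
            uint32_t b = (uint32_t)_mm_cvtsi128_si32(by_shifts);
            printf("%08x %08x\n", a, b);   /* both print 45670123 */
            return 0;
    }
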
- - # x0..3 = s0..3 - movdqa 0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - movdqa %xmm2,%xmm10 - movdqa %xmm3,%xmm11 - - movdqa ROT8(%rip),%xmm4 - movdqa ROT16(%rip),%xmm5 - - mov $10,%ecx - -.Ldoubleround: - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm3,%xmm3 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm5,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 12) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm6 - pslld $12,%xmm6 - psrld $20,%xmm1 - por %xmm6,%xmm1 - - # x0 += x1, x3 = rotl32(x3 ^ x0, 8) - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm4,%xmm3 - - # x2 += x3, x1 = rotl32(x1 ^ x2, 7) - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm7 - pslld $7,%xmm7 - psrld $25,%xmm1 - por %xmm7,%xmm1 - - # x1 = shuffle32(x1, MASK(2, 1, 0, 3)) - pshufd $0x93,%xmm1,%xmm1 - # x2 = shuffle32(x2, MASK(1, 0, 3, 2)) - pshufd $0x4e,%xmm2,%xmm2 - # x3 = shuffle32(x3, MASK(0, 3, 2, 1)) - pshufd $0x39,%xmm3,%xmm3 - - dec %ecx - jnz .Ldoubleround - - # o0 = i0 ^ (x0 + s0) - movdqu 0x00(%rdx),%xmm4 - paddd %xmm8,%xmm0 - pxor %xmm4,%xmm0 - movdqu %xmm0,0x00(%rsi) - # o1 = i1 ^ (x1 + s1) - movdqu 0x10(%rdx),%xmm5 - paddd %xmm9,%xmm1 - pxor %xmm5,%xmm1 - movdqu %xmm1,0x10(%rsi) - # o2 = i2 ^ (x2 + s2) - movdqu 0x20(%rdx),%xmm6 - paddd %xmm10,%xmm2 - pxor %xmm6,%xmm2 - movdqu %xmm2,0x20(%rsi) - # o3 = i3 ^ (x3 + s3) - movdqu 0x30(%rdx),%xmm7 - paddd %xmm11,%xmm3 - pxor %xmm7,%xmm3 - movdqu %xmm3,0x30(%rsi) - - ret -ENDPROC(chacha20_block_xor_ssse3) - -ENTRY(chacha20_4block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: 4 data blocks output, o - # %rdx: 4 data blocks input, i - - # This function encrypts four consecutive ChaCha20 blocks by loading the - # the state matrix in SSE registers four times. As we need some scratch - # registers, we save the first four registers on the stack. The - # algorithm performs each operation on the corresponding word of each - # state matrix, hence requires no word shuffling. For final XORing step - # we transpose the matrix by interleaving 32- and then 64-bit words, - # which allows us to do XOR in SSE registers. 8/16-bit word rotation is - # done with the slightly better performing SSSE3 byte shuffling, - # 7/12-bit word rotation uses traditional shift+OR. 
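
The "interleave 32- and then 64-bit words" step described in the comment above is a transpose of 32-bit lanes: during the rounds each register holds one state word across four blocks, and the transpose regroups the data so each register holds 16 contiguous keystream bytes of a single block, ready to be XORed against the input. A minimal sketch of that interleave pattern for a 4x4 tile, using the intrinsic equivalents of punpck{l,h}dq and punpck{l,h}qdq (illustrative names, not kernel code; compile with SSE2):

    #include <immintrin.h>

    /* On entry, row[r] holds word r of blocks 0..3 (one block per lane).
     * On return, row[b] holds words 0..3 of block b. */
    static void transpose4x4_epi32(__m128i row[4])
    {
            __m128i t0 = _mm_unpacklo_epi32(row[0], row[1]);  /* punpckldq  */
            __m128i t1 = _mm_unpackhi_epi32(row[0], row[1]);  /* punpckhdq  */
            __m128i t2 = _mm_unpacklo_epi32(row[2], row[3]);
            __m128i t3 = _mm_unpackhi_epi32(row[2], row[3]);

            row[0] = _mm_unpacklo_epi64(t0, t2);              /* punpcklqdq */
            row[1] = _mm_unpackhi_epi64(t0, t2);              /* punpckhqdq */
            row[2] = _mm_unpacklo_epi64(t1, t3);
            row[3] = _mm_unpackhi_epi64(t1, t3);
    }

The assembly applies this pattern to all sixteen state registers (spilling four to the stack for scratch space), which is why the final XOR/store sequence walks the blocks in the interleaved order seen above.
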
- - lea 8(%rsp),%r10 - sub $0x80,%rsp - and $~63,%rsp - - # x0..15[0-3] = s0..3[0..3] - movq 0x00(%rdi),%xmm1 - pshufd $0x00,%xmm1,%xmm0 - pshufd $0x55,%xmm1,%xmm1 - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - movq 0x10(%rdi),%xmm5 - pshufd $0x00,%xmm5,%xmm4 - pshufd $0x55,%xmm5,%xmm5 - movq 0x18(%rdi),%xmm7 - pshufd $0x00,%xmm7,%xmm6 - pshufd $0x55,%xmm7,%xmm7 - movq 0x20(%rdi),%xmm9 - pshufd $0x00,%xmm9,%xmm8 - pshufd $0x55,%xmm9,%xmm9 - movq 0x28(%rdi),%xmm11 - pshufd $0x00,%xmm11,%xmm10 - pshufd $0x55,%xmm11,%xmm11 - movq 0x30(%rdi),%xmm13 - pshufd $0x00,%xmm13,%xmm12 - pshufd $0x55,%xmm13,%xmm13 - movq 0x38(%rdi),%xmm15 - pshufd $0x00,%xmm15,%xmm14 - pshufd $0x55,%xmm15,%xmm15 - # x0..3 on stack - movdqa %xmm0,0x00(%rsp) - movdqa %xmm1,0x10(%rsp) - movdqa %xmm2,0x20(%rsp) - movdqa %xmm3,0x30(%rsp) - - movdqa CTRINC(%rip),%xmm1 - movdqa ROT8(%rip),%xmm2 - movdqa ROT16(%rip),%xmm3 - - # x12 += counter values 0-3 - paddd %xmm1,%xmm12 - - mov $10,%ecx - -.Ldoubleround4: - # x0 += x4, x12 = rotl32(x12 ^ x0, 16) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 12) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 12) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 12) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 12) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - - # x0 += x4, x12 = rotl32(x12 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x1 += x5, x13 = rotl32(x13 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x2 += x6, x14 = rotl32(x14 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - # x3 += x7, x15 = rotl32(x15 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - - # x8 += x12, x4 = rotl32(x4 ^ x8, 7) - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - # x9 += x13, x5 = rotl32(x5 ^ x9, 7) - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x10 += x14, x6 = rotl32(x6 ^ x10, 7) - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x11 += x15, x7 = rotl32(x7 ^ x11, 7) - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 16) - 
movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm3,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 16) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm3,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 16) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm3,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 16) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm3,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 12) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 12) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 12) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 12) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $12,%xmm0 - psrld $20,%xmm4 - por %xmm0,%xmm4 - - # x0 += x5, x15 = rotl32(x15 ^ x0, 8) - movdqa 0x00(%rsp),%xmm0 - paddd %xmm5,%xmm0 - movdqa %xmm0,0x00(%rsp) - pxor %xmm0,%xmm15 - pshufb %xmm2,%xmm15 - # x1 += x6, x12 = rotl32(x12 ^ x1, 8) - movdqa 0x10(%rsp),%xmm0 - paddd %xmm6,%xmm0 - movdqa %xmm0,0x10(%rsp) - pxor %xmm0,%xmm12 - pshufb %xmm2,%xmm12 - # x2 += x7, x13 = rotl32(x13 ^ x2, 8) - movdqa 0x20(%rsp),%xmm0 - paddd %xmm7,%xmm0 - movdqa %xmm0,0x20(%rsp) - pxor %xmm0,%xmm13 - pshufb %xmm2,%xmm13 - # x3 += x4, x14 = rotl32(x14 ^ x3, 8) - movdqa 0x30(%rsp),%xmm0 - paddd %xmm4,%xmm0 - movdqa %xmm0,0x30(%rsp) - pxor %xmm0,%xmm14 - pshufb %xmm2,%xmm14 - - # x10 += x15, x5 = rotl32(x5 ^ x10, 7) - paddd %xmm15,%xmm10 - pxor %xmm10,%xmm5 - movdqa %xmm5,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm5 - por %xmm0,%xmm5 - # x11 += x12, x6 = rotl32(x6 ^ x11, 7) - paddd %xmm12,%xmm11 - pxor %xmm11,%xmm6 - movdqa %xmm6,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm6 - por %xmm0,%xmm6 - # x8 += x13, x7 = rotl32(x7 ^ x8, 7) - paddd %xmm13,%xmm8 - pxor %xmm8,%xmm7 - movdqa %xmm7,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm7 - por %xmm0,%xmm7 - # x9 += x14, x4 = rotl32(x4 ^ x9, 7) - paddd %xmm14,%xmm9 - pxor %xmm9,%xmm4 - movdqa %xmm4,%xmm0 - pslld $7,%xmm0 - psrld $25,%xmm4 - por %xmm0,%xmm4 - - dec %ecx - jnz .Ldoubleround4 - - # x0[0-3] += s0[0] - # x1[0-3] += s0[1] - movq 0x00(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x00(%rsp),%xmm2 - movdqa %xmm2,0x00(%rsp) - paddd 0x10(%rsp),%xmm3 - movdqa %xmm3,0x10(%rsp) - # x2[0-3] += s0[2] - # x3[0-3] += s0[3] - movq 0x08(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd 0x20(%rsp),%xmm2 - movdqa %xmm2,0x20(%rsp) - paddd 0x30(%rsp),%xmm3 - movdqa %xmm3,0x30(%rsp) - - # x4[0-3] += s1[0] - # x5[0-3] += s1[1] - movq 0x10(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - # x6[0-3] += s1[2] - # x7[0-3] += s1[3] - movq 0x18(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - - # x8[0-3] += s2[0] - # x9[0-3] += s2[1] - movq 0x20(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm8 - paddd %xmm3,%xmm9 - # x10[0-3] += s2[2] - # x11[0-3] += s2[3] - movq 0x28(%rdi),%xmm3 - pshufd $0x00,%xmm3,%xmm2 - pshufd $0x55,%xmm3,%xmm3 - paddd %xmm2,%xmm10 
-	paddd	%xmm3,%xmm11
-
-	# x12[0-3] += s3[0]
-	# x13[0-3] += s3[1]
-	movq	0x30(%rdi),%xmm3
-	pshufd	$0x00,%xmm3,%xmm2
-	pshufd	$0x55,%xmm3,%xmm3
-	paddd	%xmm2,%xmm12
-	paddd	%xmm3,%xmm13
-	# x14[0-3] += s3[2]
-	# x15[0-3] += s3[3]
-	movq	0x38(%rdi),%xmm3
-	pshufd	$0x00,%xmm3,%xmm2
-	pshufd	$0x55,%xmm3,%xmm3
-	paddd	%xmm2,%xmm14
-	paddd	%xmm3,%xmm15
-
-	# x12 += counter values 0-3
-	paddd	%xmm1,%xmm12
-
-	# interleave 32-bit words in state n, n+1
-	movdqa	0x00(%rsp),%xmm0
-	movdqa	0x10(%rsp),%xmm1
-	movdqa	%xmm0,%xmm2
-	punpckldq	%xmm1,%xmm2
-	punpckhdq	%xmm1,%xmm0
-	movdqa	%xmm2,0x00(%rsp)
-	movdqa	%xmm0,0x10(%rsp)
-	movdqa	0x20(%rsp),%xmm0
-	movdqa	0x30(%rsp),%xmm1
-	movdqa	%xmm0,%xmm2
-	punpckldq	%xmm1,%xmm2
-	punpckhdq	%xmm1,%xmm0
-	movdqa	%xmm2,0x20(%rsp)
-	movdqa	%xmm0,0x30(%rsp)
-	movdqa	%xmm4,%xmm0
-	punpckldq	%xmm5,%xmm4
-	punpckhdq	%xmm5,%xmm0
-	movdqa	%xmm0,%xmm5
-	movdqa	%xmm6,%xmm0
-	punpckldq	%xmm7,%xmm6
-	punpckhdq	%xmm7,%xmm0
-	movdqa	%xmm0,%xmm7
-	movdqa	%xmm8,%xmm0
-	punpckldq	%xmm9,%xmm8
-	punpckhdq	%xmm9,%xmm0
-	movdqa	%xmm0,%xmm9
-	movdqa	%xmm10,%xmm0
-	punpckldq	%xmm11,%xmm10
-	punpckhdq	%xmm11,%xmm0
-	movdqa	%xmm0,%xmm11
-	movdqa	%xmm12,%xmm0
-	punpckldq	%xmm13,%xmm12
-	punpckhdq	%xmm13,%xmm0
-	movdqa	%xmm0,%xmm13
-	movdqa	%xmm14,%xmm0
-	punpckldq	%xmm15,%xmm14
-	punpckhdq	%xmm15,%xmm0
-	movdqa	%xmm0,%xmm15
-
-	# interleave 64-bit words in state n, n+2
-	movdqa	0x00(%rsp),%xmm0
-	movdqa	0x20(%rsp),%xmm1
-	movdqa	%xmm0,%xmm2
-	punpcklqdq	%xmm1,%xmm2
-	punpckhqdq	%xmm1,%xmm0
-	movdqa	%xmm2,0x00(%rsp)
-	movdqa	%xmm0,0x20(%rsp)
-	movdqa	0x10(%rsp),%xmm0
-	movdqa	0x30(%rsp),%xmm1
-	movdqa	%xmm0,%xmm2
-	punpcklqdq	%xmm1,%xmm2
-	punpckhqdq	%xmm1,%xmm0
-	movdqa	%xmm2,0x10(%rsp)
-	movdqa	%xmm0,0x30(%rsp)
-	movdqa	%xmm4,%xmm0
-	punpcklqdq	%xmm6,%xmm4
-	punpckhqdq	%xmm6,%xmm0
-	movdqa	%xmm0,%xmm6
-	movdqa	%xmm5,%xmm0
-	punpcklqdq	%xmm7,%xmm5
-	punpckhqdq	%xmm7,%xmm0
-	movdqa	%xmm0,%xmm7
-	movdqa	%xmm8,%xmm0
-	punpcklqdq	%xmm10,%xmm8
-	punpckhqdq	%xmm10,%xmm0
-	movdqa	%xmm0,%xmm10
-	movdqa	%xmm9,%xmm0
-	punpcklqdq	%xmm11,%xmm9
-	punpckhqdq	%xmm11,%xmm0
-	movdqa	%xmm0,%xmm11
-	movdqa	%xmm12,%xmm0
-	punpcklqdq	%xmm14,%xmm12
-	punpckhqdq	%xmm14,%xmm0
-	movdqa	%xmm0,%xmm14
-	movdqa	%xmm13,%xmm0
-	punpcklqdq	%xmm15,%xmm13
-	punpckhqdq	%xmm15,%xmm0
-	movdqa	%xmm0,%xmm15
-
-	# xor with corresponding input, write to output
-	movdqa	0x00(%rsp),%xmm0
-	movdqu	0x00(%rdx),%xmm1
-	pxor	%xmm1,%xmm0
-	movdqu	%xmm0,0x00(%rsi)
-	movdqa	0x10(%rsp),%xmm0
-	movdqu	0x80(%rdx),%xmm1
-	pxor	%xmm1,%xmm0
-	movdqu	%xmm0,0x80(%rsi)
-	movdqa	0x20(%rsp),%xmm0
-	movdqu	0x40(%rdx),%xmm1
-	pxor	%xmm1,%xmm0
-	movdqu	%xmm0,0x40(%rsi)
-	movdqa	0x30(%rsp),%xmm0
-	movdqu	0xc0(%rdx),%xmm1
-	pxor	%xmm1,%xmm0
-	movdqu	%xmm0,0xc0(%rsi)
-	movdqu	0x10(%rdx),%xmm1
-	pxor	%xmm1,%xmm4
-	movdqu	%xmm4,0x10(%rsi)
-	movdqu	0x90(%rdx),%xmm1
-	pxor	%xmm1,%xmm5
-	movdqu	%xmm5,0x90(%rsi)
-	movdqu	0x50(%rdx),%xmm1
-	pxor	%xmm1,%xmm6
-	movdqu	%xmm6,0x50(%rsi)
-	movdqu	0xd0(%rdx),%xmm1
-	pxor	%xmm1,%xmm7
-	movdqu	%xmm7,0xd0(%rsi)
-	movdqu	0x20(%rdx),%xmm1
-	pxor	%xmm1,%xmm8
-	movdqu	%xmm8,0x20(%rsi)
-	movdqu	0xa0(%rdx),%xmm1
-	pxor	%xmm1,%xmm9
-	movdqu	%xmm9,0xa0(%rsi)
-	movdqu	0x60(%rdx),%xmm1
-	pxor	%xmm1,%xmm10
-	movdqu	%xmm10,0x60(%rsi)
-	movdqu	0xe0(%rdx),%xmm1
-	pxor	%xmm1,%xmm11
-	movdqu	%xmm11,0xe0(%rsi)
-	movdqu	0x30(%rdx),%xmm1
-	pxor	%xmm1,%xmm12
-	movdqu	%xmm12,0x30(%rsi)
-	movdqu	0xb0(%rdx),%xmm1
-	pxor	%xmm1,%xmm13
-	movdqu	%xmm13,0xb0(%rsi)
-	movdqu	0x70(%rdx),%xmm1
-	pxor	%xmm1,%xmm14
-	movdqu	%xmm14,0x70(%rsi)
-	movdqu	0xf0(%rdx),%xmm1
-	pxor	%xmm1,%xmm15
-	movdqu	%xmm15,0xf0(%rsi)
-
-	lea	-8(%r10),%rsp
-	ret
-ENDPROC(chacha20_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
@@ -1,146 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/fpu/api.h>
-#include <asm/simd.h>
-
-#define CHACHA20_STATE_ALIGN 16
-
-asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
-#ifdef CONFIG_AS_AVX2
-asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
-static bool chacha20_use_avx2;
-#endif
-
-static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
-			    unsigned int bytes)
-{
-	u8 buf[CHACHA20_BLOCK_SIZE];
-
-#ifdef CONFIG_AS_AVX2
-	if (chacha20_use_avx2) {
-		while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
-			chacha20_8block_xor_avx2(state, dst, src);
-			bytes -= CHACHA20_BLOCK_SIZE * 8;
-			src += CHACHA20_BLOCK_SIZE * 8;
-			dst += CHACHA20_BLOCK_SIZE * 8;
-			state[12] += 8;
-		}
-	}
-#endif
-	while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
-		chacha20_4block_xor_ssse3(state, dst, src);
-		bytes -= CHACHA20_BLOCK_SIZE * 4;
-		src += CHACHA20_BLOCK_SIZE * 4;
-		dst += CHACHA20_BLOCK_SIZE * 4;
-		state[12] += 4;
-	}
-	while (bytes >= CHACHA20_BLOCK_SIZE) {
-		chacha20_block_xor_ssse3(state, dst, src);
-		bytes -= CHACHA20_BLOCK_SIZE;
-		src += CHACHA20_BLOCK_SIZE;
-		dst += CHACHA20_BLOCK_SIZE;
-		state[12]++;
-	}
-	if (bytes) {
-		memcpy(buf, src, bytes);
-		chacha20_block_xor_ssse3(state, buf, buf);
-		memcpy(dst, buf, bytes);
-	}
-}
-
-static int chacha20_simd(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
-	u32 *state, state_buf[16 + 2] __aligned(8);
-	struct skcipher_walk walk;
-	int err;
-
-	BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
-	state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
-
-	if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
-		return crypto_chacha20_crypt(req);
-
-	err = skcipher_walk_virt(&walk, req, true);
-
-	crypto_chacha20_init(state, ctx, walk.iv);
-
-	kernel_fpu_begin();
-
-	while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
-		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
-				rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
-		err = skcipher_walk_done(&walk,
-					 walk.nbytes % CHACHA20_BLOCK_SIZE);
-	}
-
-	if (walk.nbytes) {
-		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
-				walk.nbytes);
-		err = skcipher_walk_done(&walk, 0);
-	}
-
-	kernel_fpu_end();
-
-	return err;
-}
-
-static struct skcipher_alg alg = {
-	.base.cra_name		= "chacha20",
-	.base.cra_driver_name	= "chacha20-simd",
-	.base.cra_priority	= 300,
-	.base.cra_blocksize	= 1,
-	.base.cra_ctxsize	= sizeof(struct chacha20_ctx),
-	.base.cra_module	= THIS_MODULE,
-
-	.min_keysize		= CHACHA20_KEY_SIZE,
-	.max_keysize		= CHACHA20_KEY_SIZE,
-	.ivsize			= CHACHA20_IV_SIZE,
-	.chunksize		= CHACHA20_BLOCK_SIZE,
-	.setkey			= crypto_chacha20_setkey,
-	.encrypt		= chacha20_simd,
-	.decrypt		= chacha20_simd,
-};
-
-static int __init chacha20_simd_mod_init(void)
-{
-	if (!boot_cpu_has(X86_FEATURE_SSSE3))
-		return -ENODEV;
-
-#ifdef CONFIG_AS_AVX2
-	chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
-			    boot_cpu_has(X86_FEATURE_AVX2) &&
-			    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#endif
-	return crypto_register_skcipher(&alg);
-}
-
-static void __exit chacha20_simd_mod_fini(void)
-{
-	crypto_unregister_skcipher(&alg);
-}
-
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-simd");
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
@@ -0,0 +1,304 @@
+/*
+ * x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+#define CHACHA_STATE_ALIGN 16
+
+asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+					unsigned int len, int nrounds);
+asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+				       unsigned int len, int nrounds);
+static bool chacha_use_avx2;
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+					   unsigned int len, int nrounds);
+static bool chacha_use_avx512vl;
+#endif
+#endif
+
+static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
+{
+	len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
+	return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
+}
+
+static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
+			  unsigned int bytes, int nrounds)
+{
+#ifdef CONFIG_AS_AVX2
+#ifdef CONFIG_AS_AVX512
+	if (chacha_use_avx512vl) {
+		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+			chacha_8block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			bytes -= CHACHA_BLOCK_SIZE * 8;
+			src += CHACHA_BLOCK_SIZE * 8;
+			dst += CHACHA_BLOCK_SIZE * 8;
+			state[12] += 8;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 4) {
+			chacha_8block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state[12] += chacha_advance(bytes, 8);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 2) {
+			chacha_4block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state[12] += chacha_advance(bytes, 4);
+			return;
+		}
+		if (bytes) {
+			chacha_2block_xor_avx512vl(state, dst, src, bytes,
+						   nrounds);
+			state[12] += chacha_advance(bytes, 2);
+			return;
+		}
+	}
+#endif
+	if (chacha_use_avx2) {
+		while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+			bytes -= CHACHA_BLOCK_SIZE * 8;
+			src += CHACHA_BLOCK_SIZE * 8;
+			dst += CHACHA_BLOCK_SIZE * 8;
+			state[12] += 8;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 4) {
+			chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+			state[12] += chacha_advance(bytes, 8);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE * 2) {
+			chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
+			state[12] += chacha_advance(bytes, 4);
+			return;
+		}
+		if (bytes > CHACHA_BLOCK_SIZE) {
+			chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
+			state[12] += chacha_advance(bytes, 2);
+			return;
+		}
+	}
+#endif
+	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+		bytes -= CHACHA_BLOCK_SIZE * 4;
+		src += CHACHA_BLOCK_SIZE * 4;
+		dst += CHACHA_BLOCK_SIZE * 4;
+		state[12] += 4;
+	}
+	if (bytes > CHACHA_BLOCK_SIZE) {
+		chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+		state[12] += chacha_advance(bytes, 4);
+		return;
+	}
+	if (bytes) {
+		chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
+		state[12]++;
+	}
+}
+
+static int chacha_simd_stream_xor(struct skcipher_walk *walk,
+				  struct chacha_ctx *ctx, u8 *iv)
+{
+	u32 *state, state_buf[16 + 2] __aligned(8);
+	int next_yield = 4096; /* bytes until next FPU yield */
+	int err = 0;
+
+	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+
+	crypto_chacha_init(state, ctx, iv);
+
+	while (walk->nbytes > 0) {
+		unsigned int nbytes = walk->nbytes;
+
+		if (nbytes < walk->total) {
+			nbytes = round_down(nbytes, walk->stride);
+			next_yield -= nbytes;
+		}
+
+		chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
+			      nbytes, ctx->nrounds);
+
+		if (next_yield <= 0) {
+			/* temporarily allow preemption */
+			kernel_fpu_end();
+			kernel_fpu_begin();
+			next_yield = 4096;
+		}
+
+		err = skcipher_walk_done(walk, walk->nbytes - nbytes);
+	}
+
+	return err;
+}
+
+static int chacha_simd(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	int err;
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
+		return crypto_chacha_crypt(req);
+
+	err = skcipher_walk_virt(&walk, req, true);
+	if (err)
+		return err;
+
+	kernel_fpu_begin();
+	err = chacha_simd_stream_xor(&walk, ctx, req->iv);
+	kernel_fpu_end();
+	return err;
+}
+
+static int xchacha_simd(struct skcipher_request *req)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+	struct skcipher_walk walk;
+	struct chacha_ctx subctx;
+	u32 *state, state_buf[16 + 2] __aligned(8);
+	u8 real_iv[16];
+	int err;
+
+	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
+		return crypto_xchacha_crypt(req);
+
+	err = skcipher_walk_virt(&walk, req, true);
+	if (err)
+		return err;
+
+	BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+	state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+	crypto_chacha_init(state, ctx, req->iv);
+
+	kernel_fpu_begin();
+
+	hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
+	subctx.nrounds = ctx->nrounds;
+
+	memcpy(&real_iv[0], req->iv + 24, 8);
+	memcpy(&real_iv[8], req->iv + 16, 8);
+	err = chacha_simd_stream_xor(&walk, &subctx, real_iv);
+
+	kernel_fpu_end();
+
+	return err;
+}
+
+static struct skcipher_alg algs[] = {
+	{
+		.base.cra_name		= "chacha20",
+		.base.cra_driver_name	= "chacha20-simd",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= CHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= chacha_simd,
+		.decrypt		= chacha_simd,
+	}, {
+		.base.cra_name		= "xchacha20",
+		.base.cra_driver_name	= "xchacha20-simd",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,

+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha20_setkey,
+		.encrypt		= xchacha_simd,
+		.decrypt		= xchacha_simd,
+	}, {
+		.base.cra_name		= "xchacha12",
+		.base.cra_driver_name	= "xchacha12-simd",
+		.base.cra_priority	= 300,
+		.base.cra_blocksize	= 1,
+		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
+		.base.cra_module	= THIS_MODULE,

+		.min_keysize		= CHACHA_KEY_SIZE,
+		.max_keysize		= CHACHA_KEY_SIZE,
+		.ivsize			= XCHACHA_IV_SIZE,
+		.chunksize		= CHACHA_BLOCK_SIZE,
+		.setkey			= crypto_chacha12_setkey,
+		.encrypt		= xchacha_simd,
+		.decrypt		= xchacha_simd,
+	},
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_SSSE3))
+		return -ENODEV;
+
+#ifdef CONFIG_AS_AVX2
+	chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
+			  boot_cpu_has(X86_FEATURE_AVX2) &&
+			  cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifdef CONFIG_AS_AVX512
+	chacha_use_avx512vl = chacha_use_avx2 &&
+			      boot_cpu_has(X86_FEATURE_AVX512VL) &&
+			      boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
+#endif
+#endif
+	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-simd");
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#define	PASS0_SUMS	%ymm0
+#define	PASS1_SUMS	%ymm1
+#define	PASS2_SUMS	%ymm2
+#define	PASS3_SUMS	%ymm3
+#define	K0		%ymm4
+#define	K0_XMM		%xmm4
+#define	K1		%ymm5
+#define	K1_XMM		%xmm5
+#define	K2		%ymm6
+#define	K2_XMM		%xmm6
+#define	K3		%ymm7
+#define	K3_XMM		%xmm7
+#define	T0		%ymm8
+#define	T1		%ymm9
+#define	T2		%ymm10
+#define	T2_XMM		%xmm10
+#define	T3		%ymm11
+#define	T3_XMM		%xmm11
+#define	T4		%ymm12
+#define	T5		%ymm13
+#define	T6		%ymm14
+#define	T7		%ymm15
+#define	KEY		%rdi
+#define	MESSAGE		%rsi
+#define	MESSAGE_LEN	%rdx
+#define	HASH		%rcx
+
+.macro _nh_2xstride	k0, k1, k2, k3
+
+	// Add message words to key words
+	vpaddd		\k0, T3, T0
+	vpaddd		\k1, T3, T1
+	vpaddd		\k2, T3, T2
+	vpaddd		\k3, T3, T3
+
+	// Multiply 32x32 => 64 and accumulate
+	vpshufd		$0x10, T0, T4
+	vpshufd		$0x32, T0, T0
+	vpshufd		$0x10, T1, T5
+	vpshufd		$0x32, T1, T1
+	vpshufd		$0x10, T2, T6
+	vpshufd		$0x32, T2, T2
+	vpshufd		$0x10, T3, T7
+	vpshufd		$0x32, T3, T3
+	vpmuludq	T4, T0, T0
+	vpmuludq	T5, T1, T1
+	vpmuludq	T6, T2, T2
+	vpmuludq	T7, T3, T3
+	vpaddq		T0, PASS0_SUMS, PASS0_SUMS
+	vpaddq		T1, PASS1_SUMS, PASS1_SUMS
+	vpaddq		T2, PASS2_SUMS, PASS2_SUMS
+	vpaddq		T3, PASS3_SUMS, PASS3_SUMS
+.endm
+
+/*
+ * void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ *		u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_avx2)
+
+	vmovdqu		0x00(KEY), K0
+	vmovdqu		0x10(KEY), K1
+	add		$0x20, KEY
+	vpxor		PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
+	vpxor		PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
+	vpxor		PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
+	vpxor		PASS3_SUMS, PASS3_SUMS, PASS3_SUMS
+
+	sub		$0x40, MESSAGE_LEN