Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 5 Apr 2018 00:11:08 +0000 (17:11 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 5 Apr 2018 00:11:08 +0000 (17:11 -0700)
Pull crypto updates from Herbert Xu:
 "API:

   - add AEAD support to crypto engine

   - allow batch registration in simd

  Algorithms:

   - add CFB mode

   - add speck block cipher

   - add sm4 block cipher

   - new test case for crct10dif

   - improve scheduling latency on ARM

   - scatter/gather support to gcm in aesni

   - convert x86 crypto algorithms to skcipher

  Drivers:

   - hmac(sha224/sha256) support in inside-secure

   - aes gcm/ccm support in stm32

   - stm32mp1 support in stm32

   - ccree driver from staging tree

   - gcm support over QI in caam

   - add ks-sa hwrng driver"

* 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (212 commits)
  crypto: ccree - remove unused enums
  crypto: ahash - Fix early termination in hash walk
  crypto: brcm - explicitly cast cipher to hash type
  crypto: talitos - don't leak pointers to authenc keys
  crypto: qat - don't leak pointers to authenc keys
  crypto: picoxcell - don't leak pointers to authenc keys
  crypto: ixp4xx - don't leak pointers to authenc keys
  crypto: chelsio - don't leak pointers to authenc keys
  crypto: caam/qi - don't leak pointers to authenc keys
  crypto: caam - don't leak pointers to authenc keys
  crypto: lrw - Free rctx->ext with kzfree
  crypto: talitos - fix IPsec cipher in length
  crypto: Deduplicate le32_to_cpu_array() and cpu_to_le32_array()
  crypto: doc - clarify hash callbacks state machine
  crypto: api - Keep failed instances alive
  crypto: api - Make crypto_alg_lookup static
  crypto: api - Remove unused crypto_type lookup function
  crypto: chelsio - Remove declaration of static function from header
  crypto: inside-secure - hmac(sha224) support
  crypto: inside-secure - hmac(sha256) support
  ..

184 files changed:
Documentation/crypto/crypto_engine.rst [new file with mode: 0644]
Documentation/crypto/devel-algos.rst
Documentation/devicetree/bindings/crypto/arm-cryptocell.txt
Documentation/devicetree/bindings/crypto/inside-secure-safexcel.txt
Documentation/devicetree/bindings/rng/imx-rng.txt [new file with mode: 0644]
Documentation/devicetree/bindings/rng/imx-rngc.txt [deleted file]
Documentation/devicetree/bindings/rng/ks-sa-rng.txt [new file with mode: 0644]
Documentation/devicetree/bindings/rng/omap_rng.txt
Documentation/devicetree/bindings/rng/st,stm32-rng.txt
MAINTAINERS
arch/arm/crypto/Kconfig
arch/arm/crypto/Makefile
arch/arm/crypto/aes-cipher-core.S
arch/arm/crypto/speck-neon-core.S [new file with mode: 0644]
arch/arm/crypto/speck-neon-glue.c [new file with mode: 0644]
arch/arm64/crypto/Kconfig
arch/arm64/crypto/Makefile
arch/arm64/crypto/aes-ce-ccm-glue.c
arch/arm64/crypto/aes-glue.c
arch/arm64/crypto/aes-modes.S
arch/arm64/crypto/aes-neonbs-glue.c
arch/arm64/crypto/chacha20-neon-glue.c
arch/arm64/crypto/sha256-glue.c
arch/arm64/crypto/speck-neon-core.S [new file with mode: 0644]
arch/arm64/crypto/speck-neon-glue.c [new file with mode: 0644]
arch/x86/crypto/aesni-intel_asm.S
arch/x86/crypto/aesni-intel_glue.c
arch/x86/crypto/blowfish_glue.c
arch/x86/crypto/camellia_aesni_avx2_glue.c
arch/x86/crypto/camellia_aesni_avx_glue.c
arch/x86/crypto/camellia_glue.c
arch/x86/crypto/cast5_avx_glue.c
arch/x86/crypto/cast6_avx_glue.c
arch/x86/crypto/des3_ede_glue.c
arch/x86/crypto/glue_helper.c
arch/x86/crypto/serpent_avx2_glue.c
arch/x86/crypto/serpent_avx_glue.c
arch/x86/crypto/serpent_sse2_glue.c
arch/x86/crypto/sha1-mb/sha1_mb.c
arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
arch/x86/crypto/sha256-mb/sha256_mb.c
arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
arch/x86/crypto/sha512-mb/sha512_mb.c
arch/x86/crypto/sha512-mb/sha512_mb_ctx.h
arch/x86/crypto/twofish_avx_glue.c
arch/x86/crypto/twofish_glue_3way.c
arch/x86/include/asm/crypto/camellia.h
arch/x86/include/asm/crypto/glue_helper.h
arch/x86/include/asm/crypto/serpent-avx.h
arch/x86/include/asm/crypto/twofish.h
crypto/Kconfig
crypto/Makefile
crypto/ablk_helper.c [deleted file]
crypto/ahash.c
crypto/algapi.c
crypto/api.c
crypto/cfb.c [new file with mode: 0644]
crypto/crypto_engine.c
crypto/crypto_user.c
crypto/ecc.c
crypto/ecdh.c
crypto/internal.h
crypto/lrw.c
crypto/mcryptd.c
crypto/md4.c
crypto/md5.c
crypto/rsa-pkcs1pad.c
crypto/simd.c
crypto/sm4_generic.c [new file with mode: 0644]
crypto/speck.c [new file with mode: 0644]
crypto/tcrypt.c
crypto/testmgr.c
crypto/testmgr.h
crypto/xts.c
drivers/char/hw_random/Kconfig
drivers/char/hw_random/Makefile
drivers/char/hw_random/bcm2835-rng.c
drivers/char/hw_random/cavium-rng-vf.c
drivers/char/hw_random/cavium-rng.c
drivers/char/hw_random/imx-rngc.c
drivers/char/hw_random/ks-sa-rng.c [new file with mode: 0644]
drivers/char/hw_random/mxc-rnga.c
drivers/char/hw_random/omap-rng.c
drivers/char/hw_random/stm32-rng.c
drivers/crypto/Kconfig
drivers/crypto/Makefile
drivers/crypto/atmel-aes.c
drivers/crypto/atmel-sha.c
drivers/crypto/atmel-tdes.c
drivers/crypto/bcm/cipher.c
drivers/crypto/bcm/util.c
drivers/crypto/bfin_crc.c [deleted file]
drivers/crypto/bfin_crc.h [deleted file]
drivers/crypto/caam/caamalg.c
drivers/crypto/caam/caamalg_desc.c
drivers/crypto/caam/caamalg_desc.h
drivers/crypto/caam/caamalg_qi.c
drivers/crypto/caam/ctrl.c
drivers/crypto/caam/qi.c
drivers/crypto/cavium/cpt/cptpf_main.c
drivers/crypto/ccp/ccp-crypto-aes-cmac.c
drivers/crypto/ccp/ccp-crypto-rsa.c
drivers/crypto/ccp/ccp-crypto-sha.c
drivers/crypto/ccp/ccp-debugfs.c
drivers/crypto/ccp/ccp-dmaengine.c
drivers/crypto/ccp/ccp-ops.c
drivers/crypto/ccp/psp-dev.c
drivers/crypto/ccp/sp-dev.c
drivers/crypto/ccree/Makefile [new file with mode: 0644]
drivers/crypto/ccree/cc_aead.c [new file with mode: 0644]
drivers/crypto/ccree/cc_aead.h [new file with mode: 0644]
drivers/crypto/ccree/cc_buffer_mgr.c [new file with mode: 0644]
drivers/crypto/ccree/cc_buffer_mgr.h [new file with mode: 0644]
drivers/crypto/ccree/cc_cipher.c [new file with mode: 0644]
drivers/crypto/ccree/cc_cipher.h [new file with mode: 0644]
drivers/crypto/ccree/cc_crypto_ctx.h [new file with mode: 0644]
drivers/crypto/ccree/cc_debugfs.c [new file with mode: 0644]
drivers/crypto/ccree/cc_debugfs.h [new file with mode: 0644]
drivers/crypto/ccree/cc_driver.c [new file with mode: 0644]
drivers/crypto/ccree/cc_driver.h [new file with mode: 0644]
drivers/crypto/ccree/cc_fips.c [new file with mode: 0644]
drivers/crypto/ccree/cc_fips.h [new file with mode: 0644]
drivers/crypto/ccree/cc_hash.c [new file with mode: 0644]
drivers/crypto/ccree/cc_hash.h [new file with mode: 0644]
drivers/crypto/ccree/cc_host_regs.h [new file with mode: 0644]
drivers/crypto/ccree/cc_hw_queue_defs.h [new file with mode: 0644]
drivers/crypto/ccree/cc_ivgen.c [new file with mode: 0644]
drivers/crypto/ccree/cc_ivgen.h [new file with mode: 0644]
drivers/crypto/ccree/cc_kernel_regs.h [new file with mode: 0644]
drivers/crypto/ccree/cc_lli_defs.h [new file with mode: 0644]
drivers/crypto/ccree/cc_pm.c [new file with mode: 0644]
drivers/crypto/ccree/cc_pm.h [new file with mode: 0644]
drivers/crypto/ccree/cc_request_mgr.c [new file with mode: 0644]
drivers/crypto/ccree/cc_request_mgr.h [new file with mode: 0644]
drivers/crypto/ccree/cc_sram_mgr.c [new file with mode: 0644]
drivers/crypto/ccree/cc_sram_mgr.h [new file with mode: 0644]
drivers/crypto/chelsio/chcr_algo.c
drivers/crypto/chelsio/chcr_algo.h
drivers/crypto/chelsio/chcr_core.h
drivers/crypto/chelsio/chcr_crypto.h
drivers/crypto/chelsio/chcr_ipsec.c
drivers/crypto/inside-secure/safexcel.c
drivers/crypto/inside-secure/safexcel.h
drivers/crypto/inside-secure/safexcel_cipher.c
drivers/crypto/inside-secure/safexcel_hash.c
drivers/crypto/ixp4xx_crypto.c
drivers/crypto/marvell/cesa.c
drivers/crypto/mxs-dcp.c
drivers/crypto/n2_core.c
drivers/crypto/nx/nx-842-pseries.c
drivers/crypto/omap-aes.c
drivers/crypto/omap-aes.h
drivers/crypto/omap-crypto.c
drivers/crypto/omap-des.c
drivers/crypto/omap-sham.c
drivers/crypto/picoxcell_crypto.c
drivers/crypto/qat/qat_common/qat_algs.c
drivers/crypto/qat/qat_common/qat_asym_algs.c
drivers/crypto/s5p-sss.c
drivers/crypto/sahara.c
drivers/crypto/stm32/stm32-cryp.c
drivers/crypto/stm32/stm32-hash.c
drivers/crypto/sunxi-ss/sun4i-ss-core.c
drivers/crypto/talitos.c
drivers/crypto/ux500/cryp/cryp_core.c
drivers/crypto/ux500/hash/hash_core.c
drivers/crypto/virtio/Kconfig
drivers/crypto/virtio/virtio_crypto_algs.c
drivers/crypto/virtio/virtio_crypto_common.h
drivers/crypto/virtio/virtio_crypto_core.c
drivers/staging/ccree/Kconfig
drivers/staging/ccree/Makefile
include/crypto/ablk_helper.h [deleted file]
include/crypto/algapi.h
include/crypto/engine.h
include/crypto/hash.h
include/crypto/internal/hash.h
include/crypto/internal/simd.h
include/crypto/lrw.h [deleted file]
include/crypto/sm4.h [new file with mode: 0644]
include/crypto/speck.h [new file with mode: 0644]
include/crypto/xts.h
include/linux/byteorder/generic.h
include/linux/crypto.h

diff --git a/Documentation/crypto/crypto_engine.rst b/Documentation/crypto/crypto_engine.rst
new file mode 100644 (file)
index 0000000..8272ac9
--- /dev/null
@@ -0,0 +1,48 @@
+=============
+CRYPTO ENGINE
+=============
+
+Overview
+--------
+The crypto engine API (CE) is a crypto queue manager.
+
+Requirement
+-----------
+You have to put the struct crypto_engine_ctx at the start of your tfm_ctx:
+struct your_tfm_ctx {
+        struct crypto_engine_ctx enginectx;
+        ...
+};
+Why: since the crypto engine manages only crypto_async_request, it cannot know
+the underlying request type and therefore only has access to the TFM, so
+using container_of for accessing __ctx is impossible.
+Furthermore, the crypto engine cannot know the "struct your_tfm_ctx",
+so it must assume that crypto_engine_ctx is at the start of it.
+
+Order of operations
+-------------------
+You have to obtain a struct crypto_engine via crypto_engine_alloc_init()
+and start it via crypto_engine_start().
+
+Before transferring any request, you have to fill in the enginectx:
+- prepare_request: (function pointer) optional processing to do before handling the request
+- unprepare_request: (function pointer) undo whatever was done in prepare_request
+- do_one_request: (function pointer) perform the crypto operation on the current request
+
+Note that these three functions get the crypto_async_request associated with the received request,
+so you need to get the original request via container_of(areq, struct yourrequesttype_request, base);
+
+When your driver receives a crypto_request, you have to transfer it to
+the crypto engine via one of:
+- crypto_transfer_ablkcipher_request_to_engine()
+- crypto_transfer_aead_request_to_engine()
+- crypto_transfer_akcipher_request_to_engine()
+- crypto_transfer_hash_request_to_engine()
+- crypto_transfer_skcipher_request_to_engine()
+
+At the end of the request processing, a call to one of the following functions is needed:
+- crypto_finalize_ablkcipher_request
+- crypto_finalize_aead_request
+- crypto_finalize_akcipher_request
+- crypto_finalize_hash_request
+- crypto_finalize_skcipher_request
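As a rough illustration of the documentation above, a driver might wire an skcipher
algorithm to the crypto engine roughly as follows. This is a minimal sketch, not the
patch's code: the my_* names are hypothetical, the enginectx.op callback layout is
assumed from include/crypto/engine.h, and a real driver would normally finalize the
request from its completion interrupt rather than synchronously.

#include <crypto/engine.h>
#include <crypto/internal/skcipher.h>

static struct crypto_engine *my_engine;	/* obtained via crypto_engine_alloc_init()
					   and crypto_engine_start() at probe time */

struct my_tfm_ctx {
	struct crypto_engine_ctx enginectx;	/* must be first, as explained above */
	/* ... key/device state ... */
};

/* Called by the engine for each queued request, one at a time */
static int my_do_one_request(struct crypto_engine *engine, void *areq)
{
	struct skcipher_request *req =
		container_of(areq, struct skcipher_request, base);
	int err = 0;

	/* program the hardware here (hypothetical) ... */

	/* report completion so the engine can dequeue the next request */
	crypto_finalize_skcipher_request(engine, req, err);
	return 0;
}

static int my_init_tfm(struct crypto_skcipher *tfm)
{
	struct my_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);

	ctx->enginectx.op.do_one_request = my_do_one_request;
	ctx->enginectx.op.prepare_request = NULL;	/* optional */
	ctx->enginectx.op.unprepare_request = NULL;	/* optional */
	return 0;
}

/* .encrypt entry point: just hand the request over to the engine queue */
static int my_encrypt(struct skcipher_request *req)
{
	return crypto_transfer_skcipher_request_to_engine(my_engine, req);
}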
index 66f50d32dcec9713b39c523e7c243550d2c87159..c45c6f400dbd562de7c6c558a6b01b06a1ec952d 100644 (file)
@@ -236,6 +236,14 @@ when used from another part of the kernel.
                                |
                                '---------------> HASH2
 
+Note that it is perfectly legal to "abandon" a request object:
+- call .init() and then (as many times as needed) .update()
+- _not_ call any of .final(), .finup() or .export() at any point in the future
+
+In other words, implementations should mind the resource allocation and clean-up.
+No resources related to request objects should remain allocated after a call
+to .init() or .update(), since there might be no chance to free them.
+
 
 Specifics Of Asynchronous HASH Transformation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
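One common way to satisfy the rule above is to keep all per-request state in the
ahash request context, sized up front with crypto_ahash_set_reqsize(), so there is
nothing left to free if the request is abandoned. A minimal sketch under that
assumption; my_req_ctx and the my_* callbacks are hypothetical names:

#include <crypto/internal/hash.h>
#include <linux/string.h>

struct my_req_ctx {
	u8 buffer[64];		/* partial-block data lives inside the request */
	unsigned int buflen;
};

static int my_init(struct ahash_request *req)
{
	struct my_req_ctx *rctx = ahash_request_ctx(req);

	memset(rctx, 0, sizeof(*rctx));	/* no allocation, so nothing can leak */
	return 0;
}

static int my_cra_init(struct crypto_tfm *tfm)
{
	/* the API allocates my_req_ctx together with every ahash_request */
	crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
				 sizeof(struct my_req_ctx));
	return 0;
}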
index cec8d5d74e26f872c016f2ae1bc9e10dd7373ed7..c2598ab27f2ea8abc0cf6343323af5453ba0dd07 100644 (file)
@@ -1,7 +1,8 @@
 Arm TrustZone CryptoCell cryptographic engine
 
 Required properties:
-- compatible: Should be "arm,cryptocell-712-ree".
+- compatible: Should be one of: "arm,cryptocell-712-ree",
+  "arm,cryptocell-710-ree" or "arm,cryptocell-630p-ree".
 - reg: Base physical address of the engine and length of memory mapped region.
 - interrupts: Interrupt number for the device.
 
index 30c3ce6b502e926401cdbae6ee680c5d30cdc703..5dba55cdfa634ca4947efe2b56a43883d87d1868 100644 (file)
@@ -8,7 +8,11 @@ Required properties:
 - interrupt-names: Should be "ring0", "ring1", "ring2", "ring3", "eip", "mem".
 
 Optional properties:
-- clocks: Reference to the crypto engine clock.
+- clocks: Reference to the crypto engine clocks; the second clock is
+          needed for the Armada 7K/8K SoCs.
+- clock-names: mandatory if there is a second clock; in this case the
+               name must be "core" for the first clock and "reg" for
+               the second one.
 
 Example:
 
diff --git a/Documentation/devicetree/bindings/rng/imx-rng.txt b/Documentation/devicetree/bindings/rng/imx-rng.txt
new file mode 100644 (file)
index 0000000..405c2b0
--- /dev/null
@@ -0,0 +1,20 @@
+Freescale RNGA/RNGB/RNGC (Random Number Generator Versions A, B and C)
+
+Required properties:
+- compatible : should be one of
+               "fsl,imx21-rnga"
+               "fsl,imx31-rnga" (backward compatible with "fsl,imx21-rnga")
+               "fsl,imx25-rngb"
+               "fsl,imx35-rngc"
+- reg : offset and length of the register set of this block
+- interrupts : the interrupt number for the RNG block
+- clocks : the RNG clk source
+
+Example:
+
+rng@53fb0000 {
+       compatible = "fsl,imx25-rngb";
+       reg = <0x53fb0000 0x4000>;
+       interrupts = <22>;
+       clocks = <&trng_clk>;
+};
diff --git a/Documentation/devicetree/bindings/rng/imx-rngc.txt b/Documentation/devicetree/bindings/rng/imx-rngc.txt
deleted file mode 100644 (file)
index 93c7174..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-Freescale RNGC (Random Number Generator Version C)
-
-The driver also supports version B, which is mostly compatible
-to version C.
-
-Required properties:
-- compatible : should be one of
-               "fsl,imx25-rngb"
-               "fsl,imx35-rngc"
-- reg : offset and length of the register set of this block
-- interrupts : the interrupt number for the RNGC block
-- clocks : the RNGC clk source
-
-Example:
-
-rng@53fb0000 {
-       compatible = "fsl,imx25-rngb";
-       reg = <0x53fb0000 0x4000>;
-       interrupts = <22>;
-       clocks = <&trng_clk>;
-};
diff --git a/Documentation/devicetree/bindings/rng/ks-sa-rng.txt b/Documentation/devicetree/bindings/rng/ks-sa-rng.txt
new file mode 100644 (file)
index 0000000..b7a65b4
--- /dev/null
@@ -0,0 +1,21 @@
+Keystone SoC Hardware Random Number Generator (HWRNG) Module
+
+On Keystone SoCs the HWRNG module is a submodule of the Security Accelerator.
+
+- compatible: should be "ti,keystone-rng"
+- ti,syscon-sa-cfg: phandle to syscon node of the SA configuration registers.
+                   These registers are shared between the hwrng and crypto drivers.
+- clocks: phandle to the reference clocks for the subsystem
+- clock-names: functional clock name. Should be set to "fck"
+- reg: HWRNG module register space
+
+Example:
+/* K2HK */
+
+rng@24000 {
+       compatible = "ti,keystone-rng";
+       ti,syscon-sa-cfg = <&sa_config>;
+       clocks = <&clksa>;
+       clock-names = "fck";
+       reg = <0x24000 0x1000>;
+};
index 9cf7876ab43444f604a501075ccbf6088e50f880..ea434ce50f36fad854429933566ae82a50d42737 100644 (file)
@@ -13,7 +13,12 @@ Required properties:
 - interrupts : the interrupt number for the RNG module.
                Used for "ti,omap4-rng" and "inside-secure,safexcel-eip76"
 - clocks: the trng clock source. Only mandatory for the
-  "inside-secure,safexcel-eip76" compatible.
+  "inside-secure,safexcel-eip76" compatible, the second clock is
+  needed for the Armada 7K/8K SoCs
+- clock-names: mandatory if there is a second clock, in this case the
+  name must be "core" for the first clock and "reg" for the second
+  one
+
 
 Example:
 /* AM335x */
index 47f04176f93bd81e9792fe30e6acdd36ccf7ab11..1dfa7d51e006d755ad82af526b8fe67a82491e96 100644 (file)
@@ -11,6 +11,10 @@ Required properties:
 - interrupts : The designated IRQ line for the RNG
 - clocks : The clock needed to enable the RNG
 
+Optional properties:
+- resets : The reset needed to properly start the RNG
+- clock-error-detect : Enable clock error detection
+
 Example:
 
        rng: rng@50060800 {
index 2328eed6aea905b49ae438e8bdb9bfed02e1e06b..9d42bb8bb1203264a216bfe65eecaca73d47c812 100644 (file)
@@ -3252,12 +3252,11 @@ F:      drivers/net/ieee802154/cc2520.c
 F:     include/linux/spi/cc2520.h
 F:     Documentation/devicetree/bindings/net/ieee802154/cc2520.txt
 
-CCREE ARM TRUSTZONE CRYPTOCELL 700 REE DRIVER
+CCREE ARM TRUSTZONE CRYPTOCELL REE DRIVER
 M:     Gilad Ben-Yossef <gilad@benyossef.com>
 L:     linux-crypto@vger.kernel.org
-L:     driverdev-devel@linuxdriverproject.org
 S:     Supported
-F:     drivers/staging/ccree/
+F:     drivers/crypto/ccree/
 W:     https://developer.arm.com/products/system-ip/trustzone-cryptocell/cryptocell-700-family
 
 CEC FRAMEWORK
@@ -6962,7 +6961,7 @@ F:        drivers/input/input-mt.c
 K:     \b(ABS|SYN)_MT_
 
 INSIDE SECURE CRYPTO DRIVER
-M:     Antoine Tenart <antoine.tenart@free-electrons.com>
+M:     Antoine Tenart <antoine.tenart@bootlin.com>
 F:     drivers/crypto/inside-secure/
 S:     Maintained
 L:     linux-crypto@vger.kernel.org
@@ -7200,6 +7199,14 @@ L:       linux-rdma@vger.kernel.org
 S:     Supported
 F:     drivers/infiniband/hw/i40iw/
 
+INTEL SHA MULTIBUFFER DRIVER
+M:     Megha Dey <megha.dey@linux.intel.com>
+R:     Tim Chen <tim.c.chen@linux.intel.com>
+L:     linux-crypto@vger.kernel.org
+S:     Supported
+F:     arch/x86/crypto/sha*-mb
+F:     crypto/mcryptd.c
+
 INTEL TELEMETRY DRIVER
 M:     Souvik Kumar Chakravarty <souvik.k.chakravarty@intel.com>
 L:     platform-driver-x86@vger.kernel.org
index b8e69fe282b8db8338abd8c4405d8031022aa448..925d1364727a5dd0e35888091eb5dc73ee2413d9 100644 (file)
@@ -121,4 +121,10 @@ config CRYPTO_CHACHA20_NEON
        select CRYPTO_BLKCIPHER
        select CRYPTO_CHACHA20
 
+config CRYPTO_SPECK_NEON
+       tristate "NEON accelerated Speck cipher algorithms"
+       depends on KERNEL_MODE_NEON
+       select CRYPTO_BLKCIPHER
+       select CRYPTO_SPECK
+
 endif
index 30ef8e291271dc264e96254b11a90e9cadb750b0..3304e671918d66f6308580eb51bc2cc07647ad94 100644 (file)
@@ -10,6 +10,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
+obj-$(CONFIG_CRYPTO_SPECK_NEON) += speck-neon.o
 
 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -53,7 +54,9 @@ ghash-arm-ce-y        := ghash-ce-core.o ghash-ce-glue.o
 crct10dif-arm-ce-y     := crct10dif-ce-core.o crct10dif-ce-glue.o
 crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
 chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
+speck-neon-y := speck-neon-core.o speck-neon-glue.o
 
+ifdef REGENERATE_ARM_CRYPTO
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)
 
@@ -62,5 +65,6 @@ $(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
 
 $(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl
        $(call cmd,perl)
+endif
 
 .PRECIOUS: $(obj)/sha256-core.S $(obj)/sha512-core.S
index 54b384084637b7e574a1d15a52d16ffc572b815e..184d6c2d15d5e7b4a01803f6ba2b99ee7b8fbfc2 100644 (file)
        .ltorg
        .endm
 
+ENTRY(__aes_arm_encrypt)
+       do_crypt        fround, crypto_ft_tab, crypto_ft_tab + 1, 2
+ENDPROC(__aes_arm_encrypt)
+
+       .align          5
+ENTRY(__aes_arm_decrypt)
+       do_crypt        iround, crypto_it_tab, __aes_arm_inverse_sbox, 0
+ENDPROC(__aes_arm_decrypt)
+
+       .section        ".rodata", "a"
        .align          L1_CACHE_SHIFT
        .type           __aes_arm_inverse_sbox, %object
 __aes_arm_inverse_sbox:
@@ -210,12 +220,3 @@ __aes_arm_inverse_sbox:
        .byte           0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
        .byte           0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
        .size           __aes_arm_inverse_sbox, . - __aes_arm_inverse_sbox
-
-ENTRY(__aes_arm_encrypt)
-       do_crypt        fround, crypto_ft_tab, crypto_ft_tab + 1, 2
-ENDPROC(__aes_arm_encrypt)
-
-       .align          5
-ENTRY(__aes_arm_decrypt)
-       do_crypt        iround, crypto_it_tab, __aes_arm_inverse_sbox, 0
-ENDPROC(__aes_arm_decrypt)
diff --git a/arch/arm/crypto/speck-neon-core.S b/arch/arm/crypto/speck-neon-core.S
new file mode 100644 (file)
index 0000000..3c1e203
--- /dev/null
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
+ *
+ * Copyright (c) 2018 Google, Inc
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+       .text
+       .fpu            neon
+
+       // arguments
+       ROUND_KEYS      .req    r0      // const {u64,u32} *round_keys
+       NROUNDS         .req    r1      // int nrounds
+       DST             .req    r2      // void *dst
+       SRC             .req    r3      // const void *src
+       NBYTES          .req    r4      // unsigned int nbytes
+       TWEAK           .req    r5      // void *tweak
+
+       // registers which hold the data being encrypted/decrypted
+       X0              .req    q0
+       X0_L            .req    d0
+       X0_H            .req    d1
+       Y0              .req    q1
+       Y0_H            .req    d3
+       X1              .req    q2
+       X1_L            .req    d4
+       X1_H            .req    d5
+       Y1              .req    q3
+       Y1_H            .req    d7
+       X2              .req    q4
+       X2_L            .req    d8
+       X2_H            .req    d9
+       Y2              .req    q5
+       Y2_H            .req    d11
+       X3              .req    q6
+       X3_L            .req    d12
+       X3_H            .req    d13
+       Y3              .req    q7
+       Y3_H            .req    d15
+
+       // the round key, duplicated in all lanes
+       ROUND_KEY       .req    q8
+       ROUND_KEY_L     .req    d16
+       ROUND_KEY_H     .req    d17
+
+       // index vector for vtbl-based 8-bit rotates
+       ROTATE_TABLE    .req    d18
+
+       // multiplication table for updating XTS tweaks
+       GF128MUL_TABLE  .req    d19
+       GF64MUL_TABLE   .req    d19
+
+       // current XTS tweak value(s)
+       TWEAKV          .req    q10
+       TWEAKV_L        .req    d20
+       TWEAKV_H        .req    d21
+
+       TMP0            .req    q12
+       TMP0_L          .req    d24
+       TMP0_H          .req    d25
+       TMP1            .req    q13
+       TMP2            .req    q14
+       TMP3            .req    q15
+
+       .align          4
+.Lror64_8_table:
+       .byte           1, 2, 3, 4, 5, 6, 7, 0
+.Lror32_8_table:
+       .byte           1, 2, 3, 0, 5, 6, 7, 4
+.Lrol64_8_table:
+       .byte           7, 0, 1, 2, 3, 4, 5, 6
+.Lrol32_8_table:
+       .byte           3, 0, 1, 2, 7, 4, 5, 6
+.Lgf128mul_table:
+       .byte           0, 0x87
+       .fill           14
+.Lgf64mul_table:
+       .byte           0, 0x1b, (0x1b << 1), (0x1b << 1) ^ 0x1b
+       .fill           12
+
+/*
+ * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
+ *
+ * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
+ * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
+ * of ROUND_KEY.  'n' is the lane size: 64 for Speck128, or 32 for Speck64.
+ *
+ * The 8-bit rotates are implemented using vtbl instead of vshr + vsli because
+ * the vtbl approach is faster on some processors and the same speed on others.
+ */
+.macro _speck_round_128bytes   n
+
+       // x = ror(x, 8)
+       vtbl.8          X0_L, {X0_L}, ROTATE_TABLE
+       vtbl.8          X0_H, {X0_H}, ROTATE_TABLE
+       vtbl.8          X1_L, {X1_L}, ROTATE_TABLE
+       vtbl.8          X1_H, {X1_H}, ROTATE_TABLE
+       vtbl.8          X2_L, {X2_L}, ROTATE_TABLE
+       vtbl.8          X2_H, {X2_H}, ROTATE_TABLE
+       vtbl.8          X3_L, {X3_L}, ROTATE_TABLE
+       vtbl.8          X3_H, {X3_H}, ROTATE_TABLE
+
+       // x += y
+       vadd.u\n        X0, Y0
+       vadd.u\n        X1, Y1
+       vadd.u\n        X2, Y2
+       vadd.u\n        X3, Y3
+
+       // x ^= k
+       veor            X0, ROUND_KEY
+       veor            X1, ROUND_KEY
+       veor            X2, ROUND_KEY
+       veor            X3, ROUND_KEY
+
+       // y = rol(y, 3)
+       vshl.u\n        TMP0, Y0, #3
+       vshl.u\n        TMP1, Y1, #3
+       vshl.u\n        TMP2, Y2, #3
+       vshl.u\n        TMP3, Y3, #3
+       vsri.u\n        TMP0, Y0, #(\n - 3)
+       vsri.u\n        TMP1, Y1, #(\n - 3)
+       vsri.u\n        TMP2, Y2, #(\n - 3)
+       vsri.u\n        TMP3, Y3, #(\n - 3)
+
+       // y ^= x
+       veor            Y0, TMP0, X0
+       veor            Y1, TMP1, X1
+       veor            Y2, TMP2, X2
+       veor            Y3, TMP3, X3
+.endm
+
+/*
+ * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
+ *
+ * This is the inverse of _speck_round_128bytes().
+ */
+.macro _speck_unround_128bytes n
+
+       // y ^= x
+       veor            TMP0, Y0, X0
+       veor            TMP1, Y1, X1
+       veor            TMP2, Y2, X2
+       veor            TMP3, Y3, X3
+
+       // y = ror(y, 3)
+       vshr.u\n        Y0, TMP0, #3
+       vshr.u\n        Y1, TMP1, #3
+       vshr.u\n        Y2, TMP2, #3
+       vshr.u\n        Y3, TMP3, #3
+       vsli.u\n        Y0, TMP0, #(\n - 3)
+       vsli.u\n        Y1, TMP1, #(\n - 3)
+       vsli.u\n        Y2, TMP2, #(\n - 3)
+       vsli.u\n        Y3, TMP3, #(\n - 3)
+
+       // x ^= k
+       veor            X0, ROUND_KEY
+       veor            X1, ROUND_KEY
+       veor            X2, ROUND_KEY
+       veor            X3, ROUND_KEY
+
+       // x -= y
+       vsub.u\n        X0, Y0
+       vsub.u\n        X1, Y1
+       vsub.u\n        X2, Y2
+       vsub.u\n        X3, Y3
+
+       // x = rol(x, 8);
+       vtbl.8          X0_L, {X0_L}, ROTATE_TABLE
+       vtbl.8          X0_H, {X0_H}, ROTATE_TABLE
+       vtbl.8          X1_L, {X1_L}, ROTATE_TABLE
+       vtbl.8          X1_H, {X1_H}, ROTATE_TABLE
+       vtbl.8          X2_L, {X2_L}, ROTATE_TABLE
+       vtbl.8          X2_H, {X2_H}, ROTATE_TABLE
+       vtbl.8          X3_L, {X3_L}, ROTATE_TABLE
+       vtbl.8          X3_H, {X3_H}, ROTATE_TABLE
+.endm
+
+.macro _xts128_precrypt_one    dst_reg, tweak_buf, tmp
+
+       // Load the next source block
+       vld1.8          {\dst_reg}, [SRC]!
+
+       // Save the current tweak in the tweak buffer
+       vst1.8          {TWEAKV}, [\tweak_buf:128]!
+
+       // XOR the next source block with the current tweak
+       veor            \dst_reg, TWEAKV
+
+       /*
+        * Calculate the next tweak by multiplying the current one by x,
+        * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
+        */
+       vshr.u64        \tmp, TWEAKV, #63
+       vshl.u64        TWEAKV, #1
+       veor            TWEAKV_H, \tmp\()_L
+       vtbl.8          \tmp\()_H, {GF128MUL_TABLE}, \tmp\()_H
+       veor            TWEAKV_L, \tmp\()_H
+.endm
+
+.macro _xts64_precrypt_two     dst_reg, tweak_buf, tmp
+
+       // Load the next two source blocks
+       vld1.8          {\dst_reg}, [SRC]!
+
+       // Save the current two tweaks in the tweak buffer
+       vst1.8          {TWEAKV}, [\tweak_buf:128]!
+
+       // XOR the next two source blocks with the current two tweaks
+       veor            \dst_reg, TWEAKV
+
+       /*
+        * Calculate the next two tweaks by multiplying the current ones by x^2,
+        * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
+        */
+       vshr.u64        \tmp, TWEAKV, #62
+       vshl.u64        TWEAKV, #2
+       vtbl.8          \tmp\()_L, {GF64MUL_TABLE}, \tmp\()_L
+       vtbl.8          \tmp\()_H, {GF64MUL_TABLE}, \tmp\()_H
+       veor            TWEAKV, \tmp
+.endm
+
+/*
+ * _speck_xts_crypt() - Speck-XTS encryption/decryption
+ *
+ * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
+ * using Speck-XTS, specifically the variant with a block size of '2n' and round
+ * count given by NROUNDS.  The expanded round keys are given in ROUND_KEYS, and
+ * the current XTS tweak value is given in TWEAK.  It's assumed that NBYTES is a
+ * nonzero multiple of 128.
+ */
+.macro _speck_xts_crypt        n, decrypting
+       push            {r4-r7}
+       mov             r7, sp
+
+       /*
+        * The first four parameters were passed in registers r0-r3.  Load the
+        * additional parameters, which were passed on the stack.
+        */
+       ldr             NBYTES, [sp, #16]
+       ldr             TWEAK, [sp, #20]
+
+       /*
+        * If decrypting, modify the ROUND_KEYS parameter to point to the last
+        * round key rather than the first, since for decryption the round keys
+        * are used in reverse order.
+        */
+.if \decrypting
+.if \n == 64
+       add             ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #3
+       sub             ROUND_KEYS, #8
+.else
+       add             ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #2
+       sub             ROUND_KEYS, #4
+.endif
+.endif
+
+       // Load the index vector for vtbl-based 8-bit rotates
+.if \decrypting
+       ldr             r12, =.Lrol\n\()_8_table
+.else
+       ldr             r12, =.Lror\n\()_8_table
+.endif
+       vld1.8          {ROTATE_TABLE}, [r12:64]
+
+       // One-time XTS preparation
+
+       /*
+        * Allocate stack space to store 128 bytes worth of tweaks.  For
+        * performance, this space is aligned to a 16-byte boundary so that we
+        * can use the load/store instructions that declare 16-byte alignment.
+        */
+       sub             sp, #128
+       bic             sp, #0xf
+
+.if \n == 64
+       // Load first tweak
+       vld1.8          {TWEAKV}, [TWEAK]
+
+       // Load GF(2^128) multiplication table
+       ldr             r12, =.Lgf128mul_table
+       vld1.8          {GF128MUL_TABLE}, [r12:64]
+.else
+       // Load first tweak
+       vld1.8          {TWEAKV_L}, [TWEAK]
+
+       // Load GF(2^64) multiplication table
+       ldr             r12, =.Lgf64mul_table
+       vld1.8          {GF64MUL_TABLE}, [r12:64]
+
+       // Calculate second tweak, packing it together with the first
+       vshr.u64        TMP0_L, TWEAKV_L, #63
+       vtbl.u8         TMP0_L, {GF64MUL_TABLE}, TMP0_L
+       vshl.u64        TWEAKV_H, TWEAKV_L, #1
+       veor            TWEAKV_H, TMP0_L
+.endif
+
+.Lnext_128bytes_\@:
+
+       /*
+        * Load the source blocks into {X,Y}[0-3], XOR them with their XTS tweak
+        * values, and save the tweaks on the stack for later.  Then
+        * de-interleave the 'x' and 'y' elements of each block, i.e. make it so
+        * that the X[0-3] registers contain only the second halves of blocks,
+        * and the Y[0-3] registers contain only the first halves of blocks.
+        * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
+        */
+       mov             r12, sp
+.if \n == 64
+       _xts128_precrypt_one    X0, r12, TMP0
+       _xts128_precrypt_one    Y0, r12, TMP0
+       _xts128_precrypt_one    X1, r12, TMP0
+       _xts128_precrypt_one    Y1, r12, TMP0
+       _xts128_precrypt_one    X2, r12, TMP0
+       _xts128_precrypt_one    Y2, r12, TMP0
+       _xts128_precrypt_one    X3, r12, TMP0
+       _xts128_precrypt_one    Y3, r12, TMP0
+       vswp            X0_L, Y0_H
+       vswp            X1_L, Y1_H
+       vswp            X2_L, Y2_H
+       vswp            X3_L, Y3_H
+.else
+       _xts64_precrypt_two     X0, r12, TMP0
+       _xts64_precrypt_two     Y0, r12, TMP0
+       _xts64_precrypt_two     X1, r12, TMP0
+       _xts64_precrypt_two     Y1, r12, TMP0
+       _xts64_precrypt_two     X2, r12, TMP0
+       _xts64_precrypt_two     Y2, r12, TMP0
+       _xts64_precrypt_two     X3, r12, TMP0
+       _xts64_precrypt_two     Y3, r12, TMP0
+       vuzp.32         Y0, X0
+       vuzp.32         Y1, X1
+       vuzp.32         Y2, X2
+       vuzp.32         Y3, X3
+.endif
+
+       // Do the cipher rounds
+
+       mov             r12, ROUND_KEYS
+       mov             r6, NROUNDS
+
+.Lnext_round_\@:
+.if \decrypting
+.if \n == 64
+       vld1.64         ROUND_KEY_L, [r12]
+       sub             r12, #8
+       vmov            ROUND_KEY_H, ROUND_KEY_L
+.else
+       vld1.32         {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]
+       sub             r12, #4
+.endif
+       _speck_unround_128bytes \n
+.else
+.if \n == 64
+       vld1.64         ROUND_KEY_L, [r12]!
+       vmov            ROUND_KEY_H, ROUND_KEY_L
+.else
+       vld1.32         {ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]!
+.endif
+       _speck_round_128bytes   \n
+.endif
+       subs            r6, r6, #1
+       bne             .Lnext_round_\@
+
+       // Re-interleave the 'x' and 'y' elements of each block
+.if \n == 64
+       vswp            X0_L, Y0_H
+       vswp            X1_L, Y1_H
+       vswp            X2_L, Y2_H
+       vswp            X3_L, Y3_H
+.else
+       vzip.32         Y0, X0
+       vzip.32         Y1, X1
+       vzip.32         Y2, X2
+       vzip.32         Y3, X3
+.endif
+
+       // XOR the encrypted/decrypted blocks with the tweaks we saved earlier
+       mov             r12, sp
+       vld1.8          {TMP0, TMP1}, [r12:128]!
+       vld1.8          {TMP2, TMP3}, [r12:128]!
+       veor            X0, TMP0
+       veor            Y0, TMP1
+       veor            X1, TMP2
+       veor            Y1, TMP3
+       vld1.8          {TMP0, TMP1}, [r12:128]!
+       vld1.8          {TMP2, TMP3}, [r12:128]!
+       veor            X2, TMP0
+       veor            Y2, TMP1
+       veor            X3, TMP2
+       veor            Y3, TMP3
+
+       // Store the ciphertext in the destination buffer
+       vst1.8          {X0, Y0}, [DST]!
+       vst1.8          {X1, Y1}, [DST]!
+       vst1.8          {X2, Y2}, [DST]!
+       vst1.8          {X3, Y3}, [DST]!
+
+       // Continue if there are more 128-byte chunks remaining, else return
+       subs            NBYTES, #128
+       bne             .Lnext_128bytes_\@
+
+       // Store the next tweak
+.if \n == 64
+       vst1.8          {TWEAKV}, [TWEAK]
+.else
+       vst1.8          {TWEAKV_L}, [TWEAK]
+.endif
+
+       mov             sp, r7
+       pop             {r4-r7}
+       bx              lr
+.endm
+
+ENTRY(speck128_xts_encrypt_neon)
+       _speck_xts_crypt        n=64, decrypting=0
+ENDPROC(speck128_xts_encrypt_neon)
+
+ENTRY(speck128_xts_decrypt_neon)
+       _speck_xts_crypt        n=64, decrypting=1
+ENDPROC(speck128_xts_decrypt_neon)
+
+ENTRY(speck64_xts_encrypt_neon)
+       _speck_xts_crypt        n=32, decrypting=0
+ENDPROC(speck64_xts_encrypt_neon)
+
+ENTRY(speck64_xts_decrypt_neon)
+       _speck_xts_crypt        n=32, decrypting=1
+ENDPROC(speck64_xts_decrypt_neon)
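For reference, the _speck_round_128bytes/_speck_unround_128bytes macros above
implement the standard Speck round function; in scalar C it is roughly the
following (a sketch only, shown for Speck128 with 64-bit words and rotation
amounts 8 and 3; the helper names are hypothetical):

#include <linux/bitops.h>	/* rol64()/ror64() */
#include <linux/types.h>

/* one encryption round: mirrors _speck_round_128bytes */
static void speck_round(u64 *x, u64 *y, u64 k)
{
	*x = ror64(*x, 8);
	*x += *y;
	*x ^= k;
	*y = rol64(*y, 3);
	*y ^= *x;
}

/* one decryption round: mirrors _speck_unround_128bytes */
static void speck_unround(u64 *x, u64 *y, u64 k)
{
	*y ^= *x;
	*y = ror64(*y, 3);
	*x ^= k;
	*x -= *y;
	*x = rol64(*x, 8);
}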
diff --git a/arch/arm/crypto/speck-neon-glue.c b/arch/arm/crypto/speck-neon-glue.c
new file mode 100644 (file)
index 0000000..f012c3e
--- /dev/null
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
+ *
+ * Copyright (c) 2018 Google, Inc
+ *
+ * Note: the NIST recommendation for XTS only specifies a 128-bit block size,
+ * but a 64-bit version (needed for Speck64) is fairly straightforward; the math
+ * is just done in GF(2^64) instead of GF(2^128), with the reducing polynomial
+ * x^64 + x^4 + x^3 + x + 1 from the original XEX paper (Rogaway, 2004:
+ * "Efficient Instantiations of Tweakable Blockciphers and Refinements to Modes
+ * OCB and PMAC"), represented as 0x1B.
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/algapi.h>
+#include <crypto/gf128mul.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/speck.h>
+#include <crypto/xts.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+/* The assembly functions only handle multiples of 128 bytes */
+#define SPECK_NEON_CHUNK_SIZE  128
+
+/* Speck128 */
+
+struct speck128_xts_tfm_ctx {
+       struct speck128_tfm_ctx main_key;
+       struct speck128_tfm_ctx tweak_key;
+};
+
+asmlinkage void speck128_xts_encrypt_neon(const u64 *round_keys, int nrounds,
+                                         void *dst, const void *src,
+                                         unsigned int nbytes, void *tweak);
+
+asmlinkage void speck128_xts_decrypt_neon(const u64 *round_keys, int nrounds,
+                                         void *dst, const void *src,
+                                         unsigned int nbytes, void *tweak);
+
+typedef void (*speck128_crypt_one_t)(const struct speck128_tfm_ctx *,
+                                    u8 *, const u8 *);
+typedef void (*speck128_xts_crypt_many_t)(const u64 *, int, void *,
+                                         const void *, unsigned int, void *);
+
+static __always_inline int
+__speck128_xts_crypt(struct skcipher_request *req,
+                    speck128_crypt_one_t crypt_one,
+                    speck128_xts_crypt_many_t crypt_many)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       const struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       le128 tweak;
+       int err;
+
+       err = skcipher_walk_virt(&walk, req, true);
+
+       crypto_speck128_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
+
+       while (walk.nbytes > 0) {
+               unsigned int nbytes = walk.nbytes;
+               u8 *dst = walk.dst.virt.addr;
+               const u8 *src = walk.src.virt.addr;
+
+               if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
+                       unsigned int count;
+
+                       count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
+                       kernel_neon_begin();
+                       (*crypt_many)(ctx->main_key.round_keys,
+                                     ctx->main_key.nrounds,
+                                     dst, src, count, &tweak);
+                       kernel_neon_end();
+                       dst += count;
+                       src += count;
+                       nbytes -= count;
+               }
+
+               /* Handle any remainder with generic code */
+               while (nbytes >= sizeof(tweak)) {
+                       le128_xor((le128 *)dst, (const le128 *)src, &tweak);
+                       (*crypt_one)(&ctx->main_key, dst, dst);
+                       le128_xor((le128 *)dst, (const le128 *)dst, &tweak);
+                       gf128mul_x_ble(&tweak, &tweak);
+
+                       dst += sizeof(tweak);
+                       src += sizeof(tweak);
+                       nbytes -= sizeof(tweak);
+               }
+               err = skcipher_walk_done(&walk, nbytes);
+       }
+
+       return err;
+}
+
+static int speck128_xts_encrypt(struct skcipher_request *req)
+{
+       return __speck128_xts_crypt(req, crypto_speck128_encrypt,
+                                   speck128_xts_encrypt_neon);
+}
+
+static int speck128_xts_decrypt(struct skcipher_request *req)
+{
+       return __speck128_xts_crypt(req, crypto_speck128_decrypt,
+                                   speck128_xts_decrypt_neon);
+}
+
+static int speck128_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
+                              unsigned int keylen)
+{
+       struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
+       int err;
+
+       err = xts_verify_key(tfm, key, keylen);
+       if (err)
+               return err;
+
+       keylen /= 2;
+
+       err = crypto_speck128_setkey(&ctx->main_key, key, keylen);
+       if (err)
+               return err;
+
+       return crypto_speck128_setkey(&ctx->tweak_key, key + keylen, keylen);
+}
+
+/* Speck64 */
+
+struct speck64_xts_tfm_ctx {
+       struct speck64_tfm_ctx main_key;
+       struct speck64_tfm_ctx tweak_key;
+};
+
+asmlinkage void speck64_xts_encrypt_neon(const u32 *round_keys, int nrounds,
+                                        void *dst, const void *src,
+                                        unsigned int nbytes, void *tweak);
+
+asmlinkage void speck64_xts_decrypt_neon(const u32 *round_keys, int nrounds,
+                                        void *dst, const void *src,
+                                        unsigned int nbytes, void *tweak);
+
+typedef void (*speck64_crypt_one_t)(const struct speck64_tfm_ctx *,
+                                   u8 *, const u8 *);
+typedef void (*speck64_xts_crypt_many_t)(const u32 *, int, void *,
+                                        const void *, unsigned int, void *);
+
+static __always_inline int
+__speck64_xts_crypt(struct skcipher_request *req, speck64_crypt_one_t crypt_one,
+                   speck64_xts_crypt_many_t crypt_many)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       const struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       __le64 tweak;
+       int err;
+
+       err = skcipher_walk_virt(&walk, req, true);
+
+       crypto_speck64_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
+
+       while (walk.nbytes > 0) {
+               unsigned int nbytes = walk.nbytes;
+               u8 *dst = walk.dst.virt.addr;
+               const u8 *src = walk.src.virt.addr;
+
+               if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
+                       unsigned int count;
+
+                       count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
+                       kernel_neon_begin();
+                       (*crypt_many)(ctx->main_key.round_keys,
+                                     ctx->main_key.nrounds,
+                                     dst, src, count, &tweak);
+                       kernel_neon_end();
+                       dst += count;
+                       src += count;
+                       nbytes -= count;
+               }
+
+               /* Handle any remainder with generic code */
+               while (nbytes >= sizeof(tweak)) {
+                       *(__le64 *)dst = *(__le64 *)src ^ tweak;
+                       (*crypt_one)(&ctx->main_key, dst, dst);
+                       *(__le64 *)dst ^= tweak;
+                       tweak = cpu_to_le64((le64_to_cpu(tweak) << 1) ^
+                                           ((tweak & cpu_to_le64(1ULL << 63)) ?
+                                            0x1B : 0));
+                       dst += sizeof(tweak);
+                       src += sizeof(tweak);
+                       nbytes -= sizeof(tweak);
+               }
+               err = skcipher_walk_done(&walk, nbytes);
+       }
+
+       return err;
+}
+
+static int speck64_xts_encrypt(struct skcipher_request *req)
+{
+       return __speck64_xts_crypt(req, crypto_speck64_encrypt,
+                                  speck64_xts_encrypt_neon);
+}
+
+static int speck64_xts_decrypt(struct skcipher_request *req)
+{
+       return __speck64_xts_crypt(req, crypto_speck64_decrypt,
+                                  speck64_xts_decrypt_neon);
+}
+
+static int speck64_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
+                             unsigned int keylen)
+{
+       struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
+       int err;
+
+       err = xts_verify_key(tfm, key, keylen);
+       if (err)
+               return err;
+
+       keylen /= 2;
+
+       err = crypto_speck64_setkey(&ctx->main_key, key, keylen);
+       if (err)
+               return err;
+
+       return crypto_speck64_setkey(&ctx->tweak_key, key + keylen, keylen);
+}
+
+static struct skcipher_alg speck_algs[] = {
+       {
+               .base.cra_name          = "xts(speck128)",
+               .base.cra_driver_name   = "xts-speck128-neon",
+               .base.cra_priority      = 300,
+               .base.cra_blocksize     = SPECK128_BLOCK_SIZE,
+               .base.cra_ctxsize       = sizeof(struct speck128_xts_tfm_ctx),
+               .base.cra_alignmask     = 7,
+               .base.cra_module        = THIS_MODULE,
+               .min_keysize            = 2 * SPECK128_128_KEY_SIZE,
+               .max_keysize            = 2 * SPECK128_256_KEY_SIZE,
+               .ivsize                 = SPECK128_BLOCK_SIZE,
+               .walksize               = SPECK_NEON_CHUNK_SIZE,
+               .setkey                 = speck128_xts_setkey,
+               .encrypt                = speck128_xts_encrypt,
+               .decrypt                = speck128_xts_decrypt,
+       }, {
+               .base.cra_name          = "xts(speck64)",
+               .base.cra_driver_name   = "xts-speck64-neon",
+               .base.cra_priority      = 300,
+               .base.cra_blocksize     = SPECK64_BLOCK_SIZE,
+               .base.cra_ctxsize       = sizeof(struct speck64_xts_tfm_ctx),
+               .base.cra_alignmask     = 7,
+               .base.cra_module        = THIS_MODULE,
+               .min_keysize            = 2 * SPECK64_96_KEY_SIZE,
+               .max_keysize            = 2 * SPECK64_128_KEY_SIZE,
+               .ivsize                 = SPECK64_BLOCK_SIZE,
+               .walksize               = SPECK_NEON_CHUNK_SIZE,
+               .setkey                 = speck64_xts_setkey,
+               .encrypt                = speck64_xts_encrypt,
+               .decrypt                = speck64_xts_decrypt,
+       }
+};
+
+static int __init speck_neon_module_init(void)
+{
+       if (!(elf_hwcap & HWCAP_NEON))
+               return -ENODEV;
+       return crypto_register_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
+}
+
+static void __exit speck_neon_module_exit(void)
+{
+       crypto_unregister_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
+}
+
+module_init(speck_neon_module_init);
+module_exit(speck_neon_module_exit);
+
+MODULE_DESCRIPTION("Speck block cipher (NEON-accelerated)");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("xts(speck128)");
+MODULE_ALIAS_CRYPTO("xts-speck128-neon");
+MODULE_ALIAS_CRYPTO("xts(speck64)");
+MODULE_ALIAS_CRYPTO("xts-speck64-neon");
index 285c36c7b408b267123039dca8ffa3c4f39e3ef6..cb5a243110c47ab1f92bca5ed7045a058847cc89 100644 (file)
@@ -113,4 +113,10 @@ config CRYPTO_AES_ARM64_BS
        select CRYPTO_AES_ARM64
        select CRYPTO_SIMD
 
+config CRYPTO_SPECK_NEON
+       tristate "NEON accelerated Speck cipher algorithms"
+       depends on KERNEL_MODE_NEON
+       select CRYPTO_BLKCIPHER
+       select CRYPTO_SPECK
+
 endif
index cee9b8d9830bfc72c58dee038736b7652acd1e82..8df9f326f4495bf4c5f98f762073e7faf4a574b8 100644 (file)
@@ -53,20 +53,21 @@ sha512-arm64-y := sha512-glue.o sha512-core.o
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
 chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
 
+obj-$(CONFIG_CRYPTO_SPECK_NEON) += speck-neon.o
+speck-neon-y := speck-neon-core.o speck-neon-glue.o
+
 obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
 aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
 
 obj-$(CONFIG_CRYPTO_AES_ARM64_BS) += aes-neon-bs.o
 aes-neon-bs-y := aes-neonbs-core.o aes-neonbs-glue.o
 
-AFLAGS_aes-ce.o                := -DINTERLEAVE=4
-AFLAGS_aes-neon.o      := -DINTERLEAVE=4
-
 CFLAGS_aes-glue-ce.o   := -DUSE_V8_CRYPTO_EXTENSIONS
 
 $(obj)/aes-glue-%.o: $(src)/aes-glue.c FORCE
        $(call if_changed_rule,cc_o_c)
 
+ifdef REGENERATE_ARM64_CRYPTO
 quiet_cmd_perlasm = PERLASM $@
       cmd_perlasm = $(PERL) $(<) void $(@)
 
@@ -75,5 +76,6 @@ $(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
 
 $(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
        $(call cmd,perlasm)
+endif
 
 .PRECIOUS: $(obj)/sha256-core.S $(obj)/sha512-core.S
index a1254036f2b1e0df0e3083dae606a543f914f7d3..68b11aa690e476d1011c50480da444b8bde91479 100644 (file)
@@ -107,11 +107,13 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
 }
 
 static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
-                          u32 abytes, u32 *macp, bool use_neon)
+                          u32 abytes, u32 *macp)
 {
-       if (likely(use_neon)) {
+       if (may_use_simd()) {
+               kernel_neon_begin();
                ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc,
                                     num_rounds(key));
+               kernel_neon_end();
        } else {
                if (*macp > 0 && *macp < AES_BLOCK_SIZE) {
                        int added = min(abytes, AES_BLOCK_SIZE - *macp);
@@ -143,8 +145,7 @@ static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
        }
 }
 
-static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[],
-                                  bool use_neon)
+static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
 {
        struct crypto_aead *aead = crypto_aead_reqtfm(req);
        struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
@@ -163,7 +164,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[],
                ltag.len = 6;
        }
 
-       ccm_update_mac(ctx, mac, (u8 *)&ltag, ltag.len, &macp, use_neon);
+       ccm_update_mac(ctx, mac, (u8 *)&ltag, ltag.len, &macp);
        scatterwalk_start(&walk, req->src);
 
        do {
@@ -175,7 +176,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[],
                        n = scatterwalk_clamp(&walk, len);
                }
                p = scatterwalk_map(&walk);
-               ccm_update_mac(ctx, mac, p, n, &macp, use_neon);
+               ccm_update_mac(ctx, mac, p, n, &macp);
                len -= n;
 
                scatterwalk_unmap(p);
@@ -242,43 +243,42 @@ static int ccm_encrypt(struct aead_request *req)
        u8 __aligned(8) mac[AES_BLOCK_SIZE];
        u8 buf[AES_BLOCK_SIZE];
        u32 len = req->cryptlen;
-       bool use_neon = may_use_simd();
        int err;
 
        err = ccm_init_mac(req, mac, len);
        if (err)
                return err;
 
-       if (likely(use_neon))
-               kernel_neon_begin();
-
        if (req->assoclen)
-               ccm_calculate_auth_mac(req, mac, use_neon);
+               ccm_calculate_auth_mac(req, mac);
 
        /* preserve the original iv for the final round */
        memcpy(buf, req->iv, AES_BLOCK_SIZE);
 
        err = skcipher_walk_aead_encrypt(&walk, req, true);
 
-       if (likely(use_neon)) {
+       if (may_use_simd()) {
                while (walk.nbytes) {
                        u32 tail = walk.nbytes % AES_BLOCK_SIZE;
 
                        if (walk.nbytes == walk.total)
                                tail = 0;
 
+                       kernel_neon_begin();
                        ce_aes_ccm_encrypt(walk.dst.virt.addr,
                                           walk.src.virt.addr,
                                           walk.nbytes - tail, ctx->key_enc,
                                           num_rounds(ctx), mac, walk.iv);
+                       kernel_neon_end();
 
                        err = skcipher_walk_done(&walk, tail);
                }
-               if (!err)
+               if (!err) {
+                       kernel_neon_begin();
                        ce_aes_ccm_final(mac, buf, ctx->key_enc,
                                         num_rounds(ctx));
-
-               kernel_neon_end();
+                       kernel_neon_end();
+               }
        } else {
                err = ccm_crypt_fallback(&walk, mac, buf, ctx, true);
        }
@@ -301,43 +301,42 @@ static int ccm_decrypt(struct aead_request *req)
        u8 __aligned(8) mac[AES_BLOCK_SIZE];
        u8 buf[AES_BLOCK_SIZE];
        u32 len = req->cryptlen - authsize;
-       bool use_neon = may_use_simd();
        int err;
 
        err = ccm_init_mac(req, mac, len);
        if (err)
                return err;
 
-       if (likely(use_neon))
-               kernel_neon_begin();
-
        if (req->assoclen)
-               ccm_calculate_auth_mac(req, mac, use_neon);
+               ccm_calculate_auth_mac(req, mac);
 
        /* preserve the original iv for the final round */
        memcpy(buf, req->iv, AES_BLOCK_SIZE);
 
        err = skcipher_walk_aead_decrypt(&walk, req, true);
 
-       if (likely(use_neon)) {
+       if (may_use_simd()) {
                while (walk.nbytes) {
                        u32 tail = walk.nbytes % AES_BLOCK_SIZE;
 
                        if (walk.nbytes == walk.total)
                                tail = 0;
 
+                       kernel_neon_begin();
                        ce_aes_ccm_decrypt(walk.dst.virt.addr,
                                           walk.src.virt.addr,
                                           walk.nbytes - tail, ctx->key_enc,
                                           num_rounds(ctx), mac, walk.iv);
+                       kernel_neon_end();
 
                        err = skcipher_walk_done(&walk, tail);
                }
-               if (!err)
+               if (!err) {
+                       kernel_neon_begin();
                        ce_aes_ccm_final(mac, buf, ctx->key_enc,
                                         num_rounds(ctx));
-
-               kernel_neon_end();
+                       kernel_neon_end();
+               }
        } else {
                err = ccm_crypt_fallback(&walk, mac, buf, ctx, false);
        }
index 2fa850e86aa808d1f8ce3f2b25e91f59a14e49ee..253188fb8cb0cea0e35d0f4ed77b5e2c6332d507 100644 (file)
@@ -64,17 +64,17 @@ MODULE_LICENSE("GPL v2");
 
 /* defined in aes-modes.S */
 asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
-                               int rounds, int blocks, int first);
+                               int rounds, int blocks);
 asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
-                               int rounds, int blocks, int first);
+                               int rounds, int blocks);
 
 asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
-                               int rounds, int blocks, u8 iv[], int first);
+                               int rounds, int blocks, u8 iv[]);
 asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
-                               int rounds, int blocks, u8 iv[], int first);
+                               int rounds, int blocks, u8 iv[]);
 
 asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
-                               int rounds, int blocks, u8 ctr[], int first);
+                               int rounds, int blocks, u8 ctr[]);
 
 asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
                                int rounds, int blocks, u8 const rk2[], u8 iv[],
@@ -133,19 +133,19 @@ static int ecb_encrypt(struct skcipher_request *req)
 {
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
-       int err, first, rounds = 6 + ctx->key_length / 4;
+       int err, rounds = 6 + ctx->key_length / 4;
        struct skcipher_walk walk;
        unsigned int blocks;
 
-       err = skcipher_walk_virt(&walk, req, true);
+       err = skcipher_walk_virt(&walk, req, false);
 
-       kernel_neon_begin();
-       for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+       while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+               kernel_neon_begin();
                aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
-                               (u8 *)ctx->key_enc, rounds, blocks, first);
+                               (u8 *)ctx->key_enc, rounds, blocks);
+               kernel_neon_end();
                err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
        }
-       kernel_neon_end();
        return err;
 }
 
@@ -153,19 +153,19 @@ static int ecb_decrypt(struct skcipher_request *req)
 {
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
-       int err, first, rounds = 6 + ctx->key_length / 4;
+       int err, rounds = 6 + ctx->key_length / 4;
        struct skcipher_walk walk;
        unsigned int blocks;
 
-       err = skcipher_walk_virt(&walk, req, true);
+       err = skcipher_walk_virt(&walk, req, false);
 
-       kernel_neon_begin();
-       for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+       while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+               kernel_neon_begin();
                aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
-                               (u8 *)ctx->key_dec, rounds, blocks, first);
+                               (u8 *)ctx->key_dec, rounds, blocks);
+               kernel_neon_end();
                err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
        }
-       kernel_neon_end();
        return err;
 }
 
@@ -173,20 +173,19 @@ static int cbc_encrypt(struct skcipher_request *req)
 {
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
-       int err, first, rounds = 6 + ctx->key_length / 4;
+       int err, rounds = 6 + ctx->key_length / 4;
        struct skcipher_walk walk;
        unsigned int blocks;
 
-       err = skcipher_walk_virt(&walk, req, true);
+       err = skcipher_walk_virt(&walk, req, false);
 
-       kernel_neon_begin();
-       for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+       while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+               kernel_neon_begin();
                aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
-                               (u8 *)ctx->key_enc, rounds, blocks, walk.iv,
-                               first);
+                               (u8 *)ctx->key_enc, rounds, blocks, walk.iv);
+               kernel_neon_end();
                err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
        }
-       kernel_neon_end();
        return err;
 }
 
@@ -194,20 +193,19 @@ static int cbc_decrypt(struct skcipher_request *req)
 {
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
-       int err, first, rounds = 6 + ctx->key_length / 4;
+       int err, rounds = 6 + ctx->key_length / 4;
        struct skcipher_walk walk;
        unsigned int blocks;
 
-       err = skcipher_walk_virt(&walk, req, true);
+       err = skcipher_walk_virt(&walk, req, false);
 
-       kernel_neon_begin();
-       for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+       while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+               kernel_neon_begin();
                aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
-                               (u8 *)ctx->key_dec, rounds, blocks, walk.iv,
-                               first);
+                               (u8 *)ctx->key_dec, rounds, blocks, walk.iv);
+               kernel_neon_end();
                err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
        }
-       kernel_neon_end();
        return err;
 }
 
@@ -215,20 +213,18 @@ static int ctr_encrypt(struct skcipher_request *req)
 {
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
-       int err, first, rounds = 6 + ctx->key_length / 4;
+       int err, rounds = 6 + ctx->key_length / 4;
        struct skcipher_walk walk;
        int blocks;
 
-       err = skcipher_walk_virt(&walk, req, true);
+       err = skcipher_walk_virt(&walk, req, false);
 
-       first = 1;
-       kernel_neon_begin();
        while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
+               kernel_neon_begin();
                aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
-                               (u8 *)ctx->key_enc, rounds, blocks, walk.iv,
-                               first);
+                               (u8 *)ctx->key_enc, rounds, blocks, walk.iv);
                err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
-               first = 0;
+               kernel_neon_end();
        }
        if (walk.nbytes) {
                u8 __aligned(8) tail[AES_BLOCK_SIZE];
@@ -241,12 +237,13 @@ static int ctr_encrypt(struct skcipher_request *req)
                 */
                blocks = -1;
 
+               kernel_neon_begin();
                aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds,
-                               blocks, walk.iv, first);
+                               blocks, walk.iv);
+               kernel_neon_end();
                crypto_xor_cpy(tdst, tsrc, tail, nbytes);
                err = skcipher_walk_done(&walk, 0);
        }
-       kernel_neon_end();
 
        return err;
 }
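
For a request that is not a whole number of blocks, the tail path above generates one block of keystream into tail[] (the negative block count steers the asm routine into its .Lctrtailblock path, visible further down in aes-modes.S) and then XORs only the remaining bytes into the destination via crypto_xor_cpy(). A minimal illustration of that final step, not kernel code:

/* what crypto_xor_cpy(tdst, tsrc, tail, nbytes) amounts to above */
static void xor_cpy(unsigned char *dst, const unsigned char *src,
		    const unsigned char *keystream, unsigned int nbytes)
{
	unsigned int i;

	for (i = 0; i < nbytes; i++)	/* only the trailing partial block */
		dst[i] = src[i] ^ keystream[i];
}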
@@ -270,16 +267,16 @@ static int xts_encrypt(struct skcipher_request *req)
        struct skcipher_walk walk;
        unsigned int blocks;
 
-       err = skcipher_walk_virt(&walk, req, true);
+       err = skcipher_walk_virt(&walk, req, false);
 
-       kernel_neon_begin();
        for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+               kernel_neon_begin();
                aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
                                (u8 *)ctx->key1.key_enc, rounds, blocks,
                                (u8 *)ctx->key2.key_enc, walk.iv, first);
+               kernel_neon_end();
                err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
        }
-       kernel_neon_end();
 
        return err;
 }
@@ -292,16 +289,16 @@ static int xts_decrypt(struct skcipher_request *req)
        struct skcipher_walk walk;
        unsigned int blocks;
 
-       err = skcipher_walk_virt(&walk, req, true);
+       err = skcipher_walk_virt(&walk, req, false);
 
-       kernel_neon_begin();
        for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
+               kernel_neon_begin();
                aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
                                (u8 *)ctx->key1.key_dec, rounds, blocks,
                                (u8 *)ctx->key2.key_enc, walk.iv, first);
+               kernel_neon_end();
                err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
        }
-       kernel_neon_end();
 
        return err;
 }
@@ -425,7 +422,7 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
 
        /* encrypt the zero vector */
        kernel_neon_begin();
-       aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, rk, rounds, 1, 1);
+       aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, rk, rounds, 1);
        kernel_neon_end();
 
        cmac_gf128_mul_by_x(consts, consts);
@@ -454,8 +451,8 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
                return err;
 
        kernel_neon_begin();
-       aes_ecb_encrypt(key, ks[0], rk, rounds, 1, 1);
-       aes_ecb_encrypt(ctx->consts, ks[1], rk, rounds, 2, 0);
+       aes_ecb_encrypt(key, ks[0], rk, rounds, 1);
+       aes_ecb_encrypt(ctx->consts, ks[1], rk, rounds, 2);
        kernel_neon_end();
 
        return cbcmac_setkey(tfm, key, sizeof(key));
diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index 2674d43d1384b87614074a6e02e5c1305545c8bd..a68412e1e3a47d60c6c527708ff6b2019fb5cfc5 100644 (file)
        .text
        .align          4
 
-/*
- * There are several ways to instantiate this code:
- * - no interleave, all inline
- * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
- * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
- * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
- * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
- *
- * Macros imported by this code:
- * - enc_prepare       - setup NEON registers for encryption
- * - dec_prepare       - setup NEON registers for decryption
- * - enc_switch_key    - change to new key after having prepared for encryption
- * - encrypt_block     - encrypt a single block
- * - decrypt block     - decrypt a single block
- * - encrypt_block2x   - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
- * - decrypt_block2x   - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
- * - encrypt_block4x   - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
- * - decrypt_block4x   - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
- */
-
-#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
-#define FRAME_PUSH     stp x29, x30, [sp,#-16]! ; mov x29, sp
-#define FRAME_POP      ldp x29, x30, [sp],#16
-
-#if INTERLEAVE == 2
-
-aes_encrypt_block2x:
-       encrypt_block2x v0, v1, w3, x2, x6, w7
-       ret
-ENDPROC(aes_encrypt_block2x)
-
-aes_decrypt_block2x:
-       decrypt_block2x v0, v1, w3, x2, x6, w7
-       ret
-ENDPROC(aes_decrypt_block2x)
-
-#elif INTERLEAVE == 4
-
 aes_encrypt_block4x:
-       encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+       encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
 ENDPROC(aes_encrypt_block4x)
 
 aes_decrypt_block4x:
-       decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
+       decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
        ret
 ENDPROC(aes_decrypt_block4x)
 
-#else
-#error INTERLEAVE should equal 2 or 4
-#endif
-
-       .macro          do_encrypt_block2x
-       bl              aes_encrypt_block2x
-       .endm
-
-       .macro          do_decrypt_block2x
-       bl              aes_decrypt_block2x
-       .endm
-
-       .macro          do_encrypt_block4x
-       bl              aes_encrypt_block4x
-       .endm
-
-       .macro          do_decrypt_block4x
-       bl              aes_decrypt_block4x
-       .endm
-
-#else
-#define FRAME_PUSH
-#define FRAME_POP
-
-       .macro          do_encrypt_block2x
-       encrypt_block2x v0, v1, w3, x2, x6, w7
-       .endm
-
-       .macro          do_decrypt_block2x
-       decrypt_block2x v0, v1, w3, x2, x6, w7
-       .endm
-
-       .macro          do_encrypt_block4x
-       encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
-       .endm
-
-       .macro          do_decrypt_block4x
-       decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
-       .endm
-
-#endif
-
        /*
         * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
-        *                 int blocks, int first)
+        *                 int blocks)
         * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
-        *                 int blocks, int first)
+        *                 int blocks)
         */
 
 AES_ENTRY(aes_ecb_encrypt)
-       FRAME_PUSH
-       cbz             w5, .LecbencloopNx
+       stp             x29, x30, [sp, #-16]!
+       mov             x29, sp
 
        enc_prepare     w3, x2, x5
 
 .LecbencloopNx:
-#if INTERLEAVE >= 2
-       subs            w4, w4, #INTERLEAVE
+       subs            w4, w4, #4
        bmi             .Lecbenc1x
-#if INTERLEAVE == 2
-       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
-       do_encrypt_block2x
-       st1             {v0.16b-v1.16b}, [x0], #32
-#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
-       do_encrypt_block4x
+       bl              aes_encrypt_block4x
        st1             {v0.16b-v3.16b}, [x0], #64
-#endif
        b               .LecbencloopNx
 .Lecbenc1x:
-       adds            w4, w4, #INTERLEAVE
+       adds            w4, w4, #4
        beq             .Lecbencout
-#endif
 .Lecbencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        encrypt_block   v0, w3, x2, x5, w6
@@ -141,35 +53,27 @@ AES_ENTRY(aes_ecb_encrypt)
        subs            w4, w4, #1
        bne             .Lecbencloop
 .Lecbencout:
-       FRAME_POP
+       ldp             x29, x30, [sp], #16
        ret
 AES_ENDPROC(aes_ecb_encrypt)
 
 
 AES_ENTRY(aes_ecb_decrypt)
-       FRAME_PUSH
-       cbz             w5, .LecbdecloopNx
+       stp             x29, x30, [sp, #-16]!
+       mov             x29, sp
 
        dec_prepare     w3, x2, x5
 
 .LecbdecloopNx:
-#if INTERLEAVE >= 2
-       subs            w4, w4, #INTERLEAVE
+       subs            w4, w4, #4
        bmi             .Lecbdec1x
-#if INTERLEAVE == 2
-       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
-       do_decrypt_block2x
-       st1             {v0.16b-v1.16b}, [x0], #32
-#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
-       do_decrypt_block4x
+       bl              aes_decrypt_block4x
        st1             {v0.16b-v3.16b}, [x0], #64
-#endif
        b               .LecbdecloopNx
 .Lecbdec1x:
-       adds            w4, w4, #INTERLEAVE
+       adds            w4, w4, #4
        beq             .Lecbdecout
-#endif
 .Lecbdecloop:
        ld1             {v0.16b}, [x1], #16             /* get next ct block */
        decrypt_block   v0, w3, x2, x5, w6
@@ -177,62 +81,68 @@ AES_ENTRY(aes_ecb_decrypt)
        subs            w4, w4, #1
        bne             .Lecbdecloop
 .Lecbdecout:
-       FRAME_POP
+       ldp             x29, x30, [sp], #16
        ret
 AES_ENDPROC(aes_ecb_decrypt)
 
 
        /*
         * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
-        *                 int blocks, u8 iv[], int first)
+        *                 int blocks, u8 iv[])
         * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
-        *                 int blocks, u8 iv[], int first)
+        *                 int blocks, u8 iv[])
         */
 
 AES_ENTRY(aes_cbc_encrypt)
-       cbz             w6, .Lcbcencloop
-
-       ld1             {v0.16b}, [x5]                  /* get iv */
+       ld1             {v4.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6
 
-.Lcbcencloop:
-       ld1             {v1.16b}, [x1], #16             /* get next pt block */
-       eor             v0.16b, v0.16b, v1.16b          /* ..and xor with iv */
+.Lcbcencloop4x:
+       subs            w4, w4, #4
+       bmi             .Lcbcenc1x
+       ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
+       eor             v0.16b, v0.16b, v4.16b          /* ..and xor with iv */
        encrypt_block   v0, w3, x2, x6, w7
-       st1             {v0.16b}, [x0], #16
+       eor             v1.16b, v1.16b, v0.16b
+       encrypt_block   v1, w3, x2, x6, w7
+       eor             v2.16b, v2.16b, v1.16b
+       encrypt_block   v2, w3, x2, x6, w7
+       eor             v3.16b, v3.16b, v2.16b
+       encrypt_block   v3, w3, x2, x6, w7
+       st1             {v0.16b-v3.16b}, [x0], #64
+       mov             v4.16b, v3.16b
+       b               .Lcbcencloop4x
+.Lcbcenc1x:
+       adds            w4, w4, #4
+       beq             .Lcbcencout
+.Lcbcencloop:
+       ld1             {v0.16b}, [x1], #16             /* get next pt block */
+       eor             v4.16b, v4.16b, v0.16b          /* ..and xor with iv */
+       encrypt_block   v4, w3, x2, x6, w7
+       st1             {v4.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcencloop
-       st1             {v0.16b}, [x5]                  /* return iv */
+.Lcbcencout:
+       st1             {v4.16b}, [x5]                  /* return iv */
        ret
 AES_ENDPROC(aes_cbc_encrypt)
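
CBC encryption is inherently serial — each plaintext block is XORed with the previous ciphertext before being encrypted — so the 4-way loop above still issues its four encrypt_block calls back to back and only batches the ld1/st1 traffic, carrying the chaining value in v4. A scalar sketch of that dependency, not part of the diff; aes_encrypt_one() is a hypothetical single-block primitive standing in for encrypt_block:

#include <string.h>

/* hypothetical one-block AES encrypt, for illustration only */
void aes_encrypt_one(unsigned char *dst, const unsigned char *src, const void *key);

static void cbc_encrypt_sketch(unsigned char *out, const unsigned char *in,
			       unsigned char iv[16], unsigned int blocks,
			       const void *key)
{
	unsigned int i, j;

	for (i = 0; i < blocks; i++) {
		for (j = 0; j < 16; j++)
			iv[j] ^= in[i * 16 + j];	/* xor with previous ciphertext (or the IV) */
		aes_encrypt_one(iv, iv, key);		/* block i cannot start before block i-1 ends */
		memcpy(out + i * 16, iv, 16);		/* ciphertext becomes the next chaining value */
	}
}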
 
 
 AES_ENTRY(aes_cbc_decrypt)
-       FRAME_PUSH
-       cbz             w6, .LcbcdecloopNx
+       stp             x29, x30, [sp, #-16]!
+       mov             x29, sp
 
        ld1             {v7.16b}, [x5]                  /* get iv */
        dec_prepare     w3, x2, x6
 
 .LcbcdecloopNx:
-#if INTERLEAVE >= 2
-       subs            w4, w4, #INTERLEAVE
+       subs            w4, w4, #4
        bmi             .Lcbcdec1x
-#if INTERLEAVE == 2
-       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
-       mov             v2.16b, v0.16b
-       mov             v3.16b, v1.16b
-       do_decrypt_block2x
-       eor             v0.16b, v0.16b, v7.16b
-       eor             v1.16b, v1.16b, v2.16b
-       mov             v7.16b, v3.16b
-       st1             {v0.16b-v1.16b}, [x0], #32
-#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        mov             v4.16b, v0.16b
        mov             v5.16b, v1.16b
        mov             v6.16b, v2.16b
-       do_decrypt_block4x
+       bl              aes_decrypt_block4x
        sub             x1, x1, #16
        eor             v0.16b, v0.16b, v7.16b
        eor             v1.16b, v1.16b, v4.16b
@@ -240,12 +150,10 @@ AES_ENTRY(aes_cbc_decrypt)
        eor             v2.16b, v2.16b, v5.16b
        eor             v3.16b, v3.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
-#endif
        b               .LcbcdecloopNx
 .Lcbcdec1x:
-       adds            w4, w4, #INTERLEAVE
+       adds            w4, w4, #4
        beq             .Lcbcdecout
-#endif
 .Lcbcdecloop:
        ld1             {v1.16b}, [x1], #16             /* get next ct block */
        mov             v0.16b, v1.16b                  /* ...and copy to v0 */
@@ -256,49 +164,33 @@ AES_ENTRY(aes_cbc_decrypt)
        subs            w4, w4, #1
        bne             .Lcbcdecloop
 .Lcbcdecout:
-       FRAME_POP
        st1             {v7.16b}, [x5]                  /* return iv */
+       ldp             x29, x30, [sp], #16
        ret
 AES_ENDPROC(aes_cbc_decrypt)
 
 
        /*
         * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
-        *                 int blocks, u8 ctr[], int first)
+        *                 int blocks, u8 ctr[])
         */
 
 AES_ENTRY(aes_ctr_encrypt)
-       FRAME_PUSH
-       cbz             w6, .Lctrnotfirst       /* 1st time around? */
+       stp             x29, x30, [sp, #-16]!
+       mov             x29, sp
+
        enc_prepare     w3, x2, x6
        ld1             {v4.16b}, [x5]
 
-.Lctrnotfirst:
-       umov            x8, v4.d[1]             /* keep swabbed ctr in reg */
-       rev             x8, x8
-#if INTERLEAVE >= 2
-       cmn             w8, w4                  /* 32 bit overflow? */
+       umov            x6, v4.d[1]             /* keep swabbed ctr in reg */
+       rev             x6, x6
+       cmn             w6, w4                  /* 32 bit overflow? */
        bcs             .Lctrloop
 .LctrloopNx:
-       subs            w4, w4, #INTERLEAVE
+       subs            w4, w4, #4
        bmi             .Lctr1x
-#if INTERLEAVE == 2
-       mov             v0.8b, v4.8b
-       mov             v1.8b, v4.8b
-       rev             x7, x8
-       add             x8, x8, #1
-       ins             v0.d[1], x7
-       rev             x7, x8
-       add             x8, x8, #1
-       ins             v1.d[1], x7
-       ld1             {v2.16b-v3.16b}, [x1], #32      /* get 2 input blocks */
-       do_encrypt_block2x
-       eor             v0.16b, v0.16b, v2.16b
-       eor             v1.16b, v1.16b, v3.16b
-       st1             {v0.16b-v1.16b}, [x0], #32
-#else
        ldr             q8, =0x30000000200000001        /* addends 1,2,3[,0] */
-       dup             v7.4s, w8
+       dup             v7.4s, w6
        mov             v0.16b, v4.16b
        add             v7.4s, v7.4s, v8.4s
        mov             v1.16b, v4.16b
@@ -309,29 +201,27 @@ AES_ENTRY(aes_ctr_encrypt)
        mov             v2.s[3], v8.s[1]
        mov             v3.s[3], v8.s[2]
        ld1             {v5.16b-v7.16b}, [x1], #48      /* get 3 input blocks */
-       do_encrypt_block4x
+       bl              aes_encrypt_block4x
        eor             v0.16b, v5.16b, v0.16b
        ld1             {v5.16b}, [x1], #16             /* get 1 input block  */
        eor             v1.16b, v6.16b, v1.16b
        eor             v2.16b, v7.16b, v2.16b
        eor             v3.16b, v5.16b, v3.16b
        st1             {v0.16b-v3.16b}, [x0], #64
-       add             x8, x8, #INTERLEAVE
-#endif
-       rev             x7, x8
+       add             x6, x6, #4
+       rev             x7, x6
        ins             v4.d[1], x7
        cbz             w4, .Lctrout
        b               .LctrloopNx
 .Lctr1x:
-       adds            w4, w4, #INTERLEAVE
+       adds            w4, w4, #4
        beq             .Lctrout
-#endif
 .Lctrloop:
        mov             v0.16b, v4.16b
-       encrypt_block   v0, w3, x2, x6, w7
+       encrypt_block   v0, w3, x2, x8, w7
 
-       adds            x8, x8, #1              /* increment BE ctr */
-       rev             x7, x8
+       adds            x6, x6, #1              /* increment BE ctr */
+       rev             x7, x6
        ins             v4.d[1], x7
        bcs             .Lctrcarry              /* overflow? */
 
@@ -345,12 +235,12 @@ AES_ENTRY(aes_ctr_encrypt)
 
 .Lctrout:
        st1             {v4.16b}, [x5]          /* return next CTR value */
-       FRAME_POP
+       ldp             x29, x30, [sp], #16
        ret
 
 .Lctrtailblock:
        st1             {v0.16b}, [x0]
-       FRAME_POP
+       ldp             x29, x30, [sp], #16
        ret
 
 .Lctrcarry:
@@ -384,39 +274,26 @@ CPU_LE(   .quad           1, 0x87         )
 CPU_BE(        .quad           0x87, 1         )
 
 AES_ENTRY(aes_xts_encrypt)
-       FRAME_PUSH
-       cbz             w7, .LxtsencloopNx
+       stp             x29, x30, [sp, #-16]!
+       mov             x29, sp
 
        ld1             {v4.16b}, [x6]
-       enc_prepare     w3, x5, x6
-       encrypt_block   v4, w3, x5, x6, w7              /* first tweak */
-       enc_switch_key  w3, x2, x6
+       cbz             w7, .Lxtsencnotfirst
+
+       enc_prepare     w3, x5, x8
+       encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
+       enc_switch_key  w3, x2, x8
        ldr             q7, .Lxts_mul_x
        b               .LxtsencNx
 
+.Lxtsencnotfirst:
+       enc_prepare     w3, x2, x8
 .LxtsencloopNx:
        ldr             q7, .Lxts_mul_x
        next_tweak      v4, v4, v7, v8
 .LxtsencNx:
-#if INTERLEAVE >= 2
-       subs            w4, w4, #INTERLEAVE
+       subs            w4, w4, #4
        bmi             .Lxtsenc1x
-#if INTERLEAVE == 2
-       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
-       next_tweak      v5, v4, v7, v8
-       eor             v0.16b, v0.16b, v4.16b
-       eor             v1.16b, v1.16b, v5.16b
-       do_encrypt_block2x
-       eor             v0.16b, v0.16b, v4.16b
-       eor             v1.16b, v1.16b, v5.16b
-       st1             {v0.16b-v1.16b}, [x0], #32
-       cbz             w4, .LxtsencoutNx
-       next_tweak      v4, v5, v7, v8
-       b               .LxtsencNx
-.LxtsencoutNx:
-       mov             v4.16b, v5.16b
-       b               .Lxtsencout
-#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
@@ -425,7 +302,7 @@ AES_ENTRY(aes_xts_encrypt)
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v7, v8
        eor             v3.16b, v3.16b, v7.16b
-       do_encrypt_block4x
+       bl              aes_encrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
@@ -434,15 +311,13 @@ AES_ENTRY(aes_xts_encrypt)
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsencout
        b               .LxtsencloopNx
-#endif
 .Lxtsenc1x:
-       adds            w4, w4, #INTERLEAVE
+       adds            w4, w4, #4
        beq             .Lxtsencout
-#endif
 .Lxtsencloop:
        ld1             {v1.16b}, [x1], #16
        eor             v0.16b, v1.16b, v4.16b
-       encrypt_block   v0, w3, x2, x6, w7
+       encrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
@@ -450,45 +325,33 @@ AES_ENTRY(aes_xts_encrypt)
        next_tweak      v4, v4, v7, v8
        b               .Lxtsencloop
 .Lxtsencout:
-       FRAME_POP
+       st1             {v4.16b}, [x6]
+       ldp             x29, x30, [sp], #16
        ret
 AES_ENDPROC(aes_xts_encrypt)
 
 
 AES_ENTRY(aes_xts_decrypt)
-       FRAME_PUSH
-       cbz             w7, .LxtsdecloopNx
+       stp             x29, x30, [sp, #-16]!
+       mov             x29, sp
 
        ld1             {v4.16b}, [x6]
-       enc_prepare     w3, x5, x6
-       encrypt_block   v4, w3, x5, x6, w7              /* first tweak */
-       dec_prepare     w3, x2, x6
+       cbz             w7, .Lxtsdecnotfirst
+
+       enc_prepare     w3, x5, x8
+       encrypt_block   v4, w3, x5, x8, w7              /* first tweak */
+       dec_prepare     w3, x2, x8
        ldr             q7, .Lxts_mul_x
        b               .LxtsdecNx
 
+.Lxtsdecnotfirst:
+       dec_prepare     w3, x2, x8
 .LxtsdecloopNx:
        ldr             q7, .Lxts_mul_x
        next_tweak      v4, v4, v7, v8
 .LxtsdecNx:
-#if INTERLEAVE >= 2
-       subs            w4, w4, #INTERLEAVE
+       subs            w4, w4, #4
        bmi             .Lxtsdec1x
-#if INTERLEAVE == 2
-       ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
-       next_tweak      v5, v4, v7, v8
-       eor             v0.16b, v0.16b, v4.16b
-       eor             v1.16b, v1.16b, v5.16b
-       do_decrypt_block2x
-       eor             v0.16b, v0.16b, v4.16b
-       eor             v1.16b, v1.16b, v5.16b
-       st1             {v0.16b-v1.16b}, [x0], #32
-       cbz             w4, .LxtsdecoutNx
-       next_tweak      v4, v5, v7, v8
-       b               .LxtsdecNx
-.LxtsdecoutNx:
-       mov             v4.16b, v5.16b
-       b               .Lxtsdecout
-#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
@@ -497,7 +360,7 @@ AES_ENTRY(aes_xts_decrypt)
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v7, v8
        eor             v3.16b, v3.16b, v7.16b
-       do_decrypt_block4x
+       bl              aes_decrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
@@ -506,15 +369,13 @@ AES_ENTRY(aes_xts_decrypt)
        mov             v4.16b, v7.16b
        cbz             w4, .Lxtsdecout
        b               .LxtsdecloopNx
-#endif
 .Lxtsdec1x:
-       adds            w4, w4, #INTERLEAVE
+       adds            w4, w4, #4
        beq             .Lxtsdecout
-#endif
 .Lxtsdecloop:
        ld1             {v1.16b}, [x1], #16
        eor             v0.16b, v1.16b, v4.16b
-       decrypt_block   v0, w3, x2, x6, w7
+       decrypt_block   v0, w3, x2, x8, w7
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
@@ -522,7 +383,8 @@ AES_ENTRY(aes_xts_decrypt)
        next_tweak      v4, v4, v7, v8
        b               .Lxtsdecloop
 .Lxtsdecout:
-       FRAME_POP
+       st1             {v4.16b}, [x6]
+       ldp             x29, x30, [sp], #16
        ret
 AES_ENDPROC(aes_xts_decrypt)
 
@@ -533,8 +395,28 @@ AES_ENDPROC(aes_xts_decrypt)
 AES_ENTRY(aes_mac_update)
        ld1             {v0.16b}, [x4]                  /* get dg */
        enc_prepare     w2, x1, x7
-       cbnz            w5, .Lmacenc
+       cbz             w5, .Lmacloop4x
+
+       encrypt_block   v0, w2, x1, x7, w8
 
+.Lmacloop4x:
+       subs            w3, w3, #4
+       bmi             .Lmac1x
+       ld1             {v1.16b-v4.16b}, [x0], #64      /* get next pt block */
+       eor             v0.16b, v0.16b, v1.16b          /* ..and xor with dg */
+       encrypt_block   v0, w2, x1, x7, w8
+       eor             v0.16b, v0.16b, v2.16b
+       encrypt_block   v0, w2, x1, x7, w8
+       eor             v0.16b, v0.16b, v3.16b
+       encrypt_block   v0, w2, x1, x7, w8
+       eor             v0.16b, v0.16b, v4.16b
+       cmp             w3, wzr
+       csinv           x5, x6, xzr, eq
+       cbz             w5, .Lmacout
+       encrypt_block   v0, w2, x1, x7, w8
+       b               .Lmacloop4x
+.Lmac1x:
+       add             w3, w3, #4
 .Lmacloop:
        cbz             w3, .Lmacout
        ld1             {v1.16b}, [x0], #16             /* get next pt block */
@@ -544,7 +426,6 @@ AES_ENTRY(aes_mac_update)
        csinv           x5, x6, xzr, eq
        cbz             w5, .Lmacout
 
-.Lmacenc:
        encrypt_block   v0, w2, x1, x7, w8
        b               .Lmacloop
 
diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c
index c55d68ccb89f804a7db201ae23abe49ace0a1188..e7a95a566462f259c227d924ae61eaa5697a23d4 100644 (file)
@@ -46,10 +46,9 @@ asmlinkage void aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[],
 
 /* borrowed from aes-neon-blk.ko */
 asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
-                                    int rounds, int blocks, int first);
+                                    int rounds, int blocks);
 asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
-                                    int rounds, int blocks, u8 iv[],
-                                    int first);
+                                    int rounds, int blocks, u8 iv[]);
 
 struct aesbs_ctx {
        u8      rk[13 * (8 * AES_BLOCK_SIZE) + 32];
@@ -100,9 +99,8 @@ static int __ecb_crypt(struct skcipher_request *req,
        struct skcipher_walk walk;
        int err;
 
-       err = skcipher_walk_virt(&walk, req, true);
+       err = skcipher_walk_virt(&walk, req, false);
 
-       kernel_neon_begin();
        while (walk.nbytes >= AES_BLOCK_SIZE) {
                unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
 
@@ -110,12 +108,13 @@ static int __ecb_crypt(struct skcipher_request *req,
                        blocks = round_down(blocks,
                                            walk.stride / AES_BLOCK_SIZE);
 
+               kernel_neon_begin();
                fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk,
                   ctx->rounds, blocks);
+               kernel_neon_end();
                err = skcipher_walk_done(&walk,
                                         walk.nbytes - blocks * AES_BLOCK_SIZE);
        }
-       kernel_neon_end();
 
        return err;
 }
@@ -157,22 +156,21 @@ static int cbc_encrypt(struct skcipher_request *req)
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct aesbs_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
        struct skcipher_walk walk;
-       int err, first = 1;
+       int err;
 
-       err = skcipher_walk_virt(&walk, req, true);
+       err = skcipher_walk_virt(&walk, req, false);
 
-       kernel_neon_begin();
        while (walk.nbytes >= AES_BLOCK_SIZE) {
                unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
 
                /* fall back to the non-bitsliced NEON implementation */
+               kernel_neon_begin();
                neon_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
-                                    ctx->enc, ctx->key.rounds, blocks, walk.iv,
-                                    first);
+                                    ctx->enc, ctx->key.rounds, blocks,
+                                    walk.iv);
+               kernel_neon_end();
                err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
-               first = 0;
        }
-       kernel_neon_end();
        return err;
 }
 
@@ -183,9 +181,8 @@ static int cbc_decrypt(struct skcipher_request *req)
        struct skcipher_walk walk;
        int err;
 
-       err = skcipher_walk_virt(&walk, req, true);
+       err = skcipher_walk_virt(&walk, req, false);
 
-       kernel_neon_begin();
        while (walk.nbytes >= AES_BLOCK_SIZE) {
                unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
 
@@ -193,13 +190,14 @@ static int cbc_decrypt(struct skcipher_request *req)
                        blocks = round_down(blocks,
                                            walk.stride / AES_BLOCK_SIZE);
 
+               kernel_neon_begin();
                aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
                                  ctx->key.rk, ctx->key.rounds, blocks,
                                  walk.iv);
+               kernel_neon_end();
                err = skcipher_walk_done(&walk,
                                         walk.nbytes - blocks * AES_BLOCK_SIZE);
        }
-       kernel_neon_end();
 
        return err;
 }
@@ -231,9 +229,8 @@ static int ctr_encrypt(struct skcipher_request *req)
        u8 buf[AES_BLOCK_SIZE];
        int err;
 
-       err = skcipher_walk_virt(&walk, req, true);
+       err = skcipher_walk_virt(&walk, req, false);
 
-       kernel_neon_begin();
        while (walk.nbytes > 0) {
                unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
                u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
@@ -244,8 +241,10 @@ static int ctr_encrypt(struct skcipher_request *req)
                        final = NULL;
                }
 
+               kernel_neon_begin();
                aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
                                  ctx->rk, ctx->rounds, blocks, walk.iv, final);
+               kernel_neon_end();
 
                if (final) {
                        u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
@@ -260,8 +259,6 @@ static int ctr_encrypt(struct skcipher_request *req)
                err = skcipher_walk_done(&walk,
                                         walk.nbytes - blocks * AES_BLOCK_SIZE);
        }
-       kernel_neon_end();
-
        return err;
 }
 
@@ -306,12 +303,11 @@ static int __xts_crypt(struct skcipher_request *req,
        struct skcipher_walk walk;
        int err;
 
-       err = skcipher_walk_virt(&walk, req, true);
+       err = skcipher_walk_virt(&walk, req, false);
 
        kernel_neon_begin();
-
-       neon_aes_ecb_encrypt(walk.iv, walk.iv, ctx->twkey,
-                            ctx->key.rounds, 1, 1);
+       neon_aes_ecb_encrypt(walk.iv, walk.iv, ctx->twkey, ctx->key.rounds, 1);
+       kernel_neon_end();
 
        while (walk.nbytes >= AES_BLOCK_SIZE) {
                unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
@@ -320,13 +316,13 @@ static int __xts_crypt(struct skcipher_request *req,
                        blocks = round_down(blocks,
                                            walk.stride / AES_BLOCK_SIZE);
 
+               kernel_neon_begin();
                fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk,
                   ctx->key.rounds, blocks, walk.iv);
+               kernel_neon_end();
                err = skcipher_walk_done(&walk,
                                         walk.nbytes - blocks * AES_BLOCK_SIZE);
        }
-       kernel_neon_end();
-
        return err;
 }
 
diff --git a/arch/arm64/crypto/chacha20-neon-glue.c b/arch/arm64/crypto/chacha20-neon-glue.c
index cbdb75d15cd031596c84bdb3959f17910fea08f2..727579c93dedbdce1584818a8021b66a4a5cb4f8 100644 (file)
@@ -37,12 +37,19 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
        u8 buf[CHACHA20_BLOCK_SIZE];
 
        while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
+               kernel_neon_begin();
                chacha20_4block_xor_neon(state, dst, src);
+               kernel_neon_end();
                bytes -= CHACHA20_BLOCK_SIZE * 4;
                src += CHACHA20_BLOCK_SIZE * 4;
                dst += CHACHA20_BLOCK_SIZE * 4;
                state[12] += 4;
        }
+
+       if (!bytes)
+               return;
+
+       kernel_neon_begin();
        while (bytes >= CHACHA20_BLOCK_SIZE) {
                chacha20_block_xor_neon(state, dst, src);
                bytes -= CHACHA20_BLOCK_SIZE;
@@ -55,6 +62,7 @@ static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
                chacha20_block_xor_neon(state, buf, buf);
                memcpy(dst, buf, bytes);
        }
+       kernel_neon_end();
 }
 
 static int chacha20_neon(struct skcipher_request *req)
@@ -68,11 +76,10 @@ static int chacha20_neon(struct skcipher_request *req)
        if (!may_use_simd() || req->cryptlen <= CHACHA20_BLOCK_SIZE)
                return crypto_chacha20_crypt(req);
 
-       err = skcipher_walk_virt(&walk, req, true);
+       err = skcipher_walk_virt(&walk, req, false);
 
        crypto_chacha20_init(state, ctx, walk.iv);
 
-       kernel_neon_begin();
        while (walk.nbytes > 0) {
                unsigned int nbytes = walk.nbytes;
 
@@ -83,7 +90,6 @@ static int chacha20_neon(struct skcipher_request *req)
                                nbytes);
                err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
        }
-       kernel_neon_end();
 
        return err;
 }
diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c
index b064d925fe2a7552222c568754e03d474c11a5aa..e8880ccdc71f6a28f7aca4e500f6a70e6779edf0 100644 (file)
@@ -89,21 +89,32 @@ static struct shash_alg algs[] = { {
 static int sha256_update_neon(struct shash_desc *desc, const u8 *data,
                              unsigned int len)
 {
-       /*
-        * Stacking and unstacking a substantial slice of the NEON register
-        * file may significantly affect performance for small updates when
-        * executing in interrupt context, so fall back to the scalar code
-        * in that case.
-        */
+       struct sha256_state *sctx = shash_desc_ctx(desc);
+
        if (!may_use_simd())
                return sha256_base_do_update(desc, data, len,
                                (sha256_block_fn *)sha256_block_data_order);
 
-       kernel_neon_begin();
-       sha256_base_do_update(desc, data, len,
-                               (sha256_block_fn *)sha256_block_neon);
-       kernel_neon_end();
+       while (len > 0) {
+               unsigned int chunk = len;
+
+               /*
+                * Don't hog the CPU for the entire time it takes to process all
+                * input when running on a preemptible kernel, but process the
+                * data block by block instead.
+                */
+               if (IS_ENABLED(CONFIG_PREEMPT) &&
+                   chunk + sctx->count % SHA256_BLOCK_SIZE > SHA256_BLOCK_SIZE)
+                       chunk = SHA256_BLOCK_SIZE -
+                               sctx->count % SHA256_BLOCK_SIZE;
 
+               kernel_neon_begin();
+               sha256_base_do_update(desc, data, chunk,
+                                     (sha256_block_fn *)sha256_block_neon);
+               kernel_neon_end();
+               data += chunk;
+               len -= chunk;
+       }
        return 0;
 }
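
The clamp above only takes effect on preemptible kernels (the CONFIG_PREEMPT branch); there it keeps each kernel_neon_begin()/end() section short by first topping up the partially filled block in the context buffer and then feeding the NEON code one block per section. A small userspace sketch of that arithmetic, not part of the diff:

#include <stdio.h>

#define SHA256_BLOCK_SIZE 64

/* bytes allowed into one NEON section, given bytes hashed so far (count) */
static unsigned int neon_chunk(unsigned int len, unsigned int count)
{
	unsigned int chunk = len;

	if (chunk + count % SHA256_BLOCK_SIZE > SHA256_BLOCK_SIZE)
		chunk = SHA256_BLOCK_SIZE - count % SHA256_BLOCK_SIZE;
	return chunk;
}

int main(void)
{
	printf("%u\n", neon_chunk(1000, 40)); /* 24: finish the partially filled block */
	printf("%u\n", neon_chunk(976, 64));  /* 64: then exactly one block per section */
	return 0;
}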
 
@@ -117,10 +128,9 @@ static int sha256_finup_neon(struct shash_desc *desc, const u8 *data,
                sha256_base_do_finalize(desc,
                                (sha256_block_fn *)sha256_block_data_order);
        } else {
-               kernel_neon_begin();
                if (len)
-                       sha256_base_do_update(desc, data, len,
-                               (sha256_block_fn *)sha256_block_neon);
+                       sha256_update_neon(desc, data, len);
+               kernel_neon_begin();
                sha256_base_do_finalize(desc,
                                (sha256_block_fn *)sha256_block_neon);
                kernel_neon_end();
diff --git a/arch/arm64/crypto/speck-neon-core.S b/arch/arm64/crypto/speck-neon-core.S
new file mode 100644 (file)
index 0000000..b144634
--- /dev/null
+++ b/arch/arm64/crypto/speck-neon-core.S
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ARM64 NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
+ *
+ * Copyright (c) 2018 Google, Inc
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+       .text
+
+       // arguments
+       ROUND_KEYS      .req    x0      // const {u64,u32} *round_keys
+       NROUNDS         .req    w1      // int nrounds
+       NROUNDS_X       .req    x1
+       DST             .req    x2      // void *dst
+       SRC             .req    x3      // const void *src
+       NBYTES          .req    w4      // unsigned int nbytes
+       TWEAK           .req    x5      // void *tweak
+
+       // registers which hold the data being encrypted/decrypted
+       // (underscores avoid a naming collision with ARM64 registers x0-x3)
+       X_0             .req    v0
+       Y_0             .req    v1
+       X_1             .req    v2
+       Y_1             .req    v3
+       X_2             .req    v4
+       Y_2             .req    v5
+       X_3             .req    v6
+       Y_3             .req    v7
+
+       // the round key, duplicated in all lanes
+       ROUND_KEY       .req    v8
+
+       // index vector for tbl-based 8-bit rotates
+       ROTATE_TABLE    .req    v9
+       ROTATE_TABLE_Q  .req    q9
+
+       // temporary registers
+       TMP0            .req    v10
+       TMP1            .req    v11
+       TMP2            .req    v12
+       TMP3            .req    v13
+
+       // multiplication table for updating XTS tweaks
+       GFMUL_TABLE     .req    v14
+       GFMUL_TABLE_Q   .req    q14
+
+       // next XTS tweak value(s)
+       TWEAKV_NEXT     .req    v15
+
+       // XTS tweaks for the blocks currently being encrypted/decrypted
+       TWEAKV0         .req    v16
+       TWEAKV1         .req    v17
+       TWEAKV2         .req    v18
+       TWEAKV3         .req    v19
+       TWEAKV4         .req    v20
+       TWEAKV5         .req    v21
+       TWEAKV6         .req    v22
+       TWEAKV7         .req    v23
+
+       .align          4
+.Lror64_8_table:
+       .octa           0x080f0e0d0c0b0a090007060504030201
+.Lror32_8_table:
+       .octa           0x0c0f0e0d080b0a090407060500030201
+.Lrol64_8_table:
+       .octa           0x0e0d0c0b0a09080f0605040302010007
+.Lrol32_8_table:
+       .octa           0x0e0d0c0f0a09080b0605040702010003
+.Lgf128mul_table:
+       .octa           0x00000000000000870000000000000001
+.Lgf64mul_table:
+       .octa           0x0000000000000000000000002d361b00
+
+/*
+ * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
+ *
+ * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
+ * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
+ * of ROUND_KEY.  'n' is the lane size: 64 for Speck128, or 32 for Speck64.
+ * 'lanes' is the lane specifier: "2d" for Speck128 or "4s" for Speck64.
+ */
+.macro _speck_round_128bytes   n, lanes
+
+       // x = ror(x, 8)
+       tbl             X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
+       tbl             X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
+       tbl             X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
+       tbl             X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
+
+       // x += y
+       add             X_0.\lanes, X_0.\lanes, Y_0.\lanes
+       add             X_1.\lanes, X_1.\lanes, Y_1.\lanes
+       add             X_2.\lanes, X_2.\lanes, Y_2.\lanes
+       add             X_3.\lanes, X_3.\lanes, Y_3.\lanes
+
+       // x ^= k
+       eor             X_0.16b, X_0.16b, ROUND_KEY.16b
+       eor             X_1.16b, X_1.16b, ROUND_KEY.16b
+       eor             X_2.16b, X_2.16b, ROUND_KEY.16b
+       eor             X_3.16b, X_3.16b, ROUND_KEY.16b
+
+       // y = rol(y, 3)
+       shl             TMP0.\lanes, Y_0.\lanes, #3
+       shl             TMP1.\lanes, Y_1.\lanes, #3
+       shl             TMP2.\lanes, Y_2.\lanes, #3
+       shl             TMP3.\lanes, Y_3.\lanes, #3
+       sri             TMP0.\lanes, Y_0.\lanes, #(\n - 3)
+       sri             TMP1.\lanes, Y_1.\lanes, #(\n - 3)
+       sri             TMP2.\lanes, Y_2.\lanes, #(\n - 3)
+       sri             TMP3.\lanes, Y_3.\lanes, #(\n - 3)
+
+       // y ^= x
+       eor             Y_0.16b, TMP0.16b, X_0.16b
+       eor             Y_1.16b, TMP1.16b, X_1.16b
+       eor             Y_2.16b, TMP2.16b, X_2.16b
+       eor             Y_3.16b, TMP3.16b, X_3.16b
+.endm
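+
+// As the comments above spell out, a Speck round is just rotate, add, xor-key,
+// rotate, xor: the tbl instruction performs the byte-granular ror-by-8 through
+// the index vector loaded from .Lror64_8_table/.Lror32_8_table, and the
+// shl/sri pair performs the rol-by-3. A scalar C reference for one Speck128
+// round (lane size n = 64) follows; it is an editorial sketch, not part of the
+// diff:
+//
+//	#include <stdint.h>
+//
+//	static inline uint64_t ror64(uint64_t v, unsigned int s)
+//	{
+//		return (v >> s) | (v << (64 - s));
+//	}
+//
+//	static inline uint64_t rol64(uint64_t v, unsigned int s)
+//	{
+//		return (v << s) | (v >> (64 - s));
+//	}
+//
+//	/* one Speck128 encryption round on a single (x, y) block half pair */
+//	static void speck128_round(uint64_t *x, uint64_t *y, uint64_t k)
+//	{
+//		*x = ror64(*x, 8);	/* x = ror(x, 8)  -> the tbl rotate       */
+//		*x += *y;		/* x += y         -> the add              */
+//		*x ^= k;		/* x ^= k         -> the eor with the key */
+//		*y = rol64(*y, 3);	/* y = rol(y, 3)  -> the shl/sri pair     */
+//		*y ^= *x;		/* y ^= x         -> the final eor        */
+//	}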
+
+/*
+ * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
+ *
+ * This is the inverse of _speck_round_128bytes().
+ */
+.macro _speck_unround_128bytes n, lanes
+
+       // y ^= x
+       eor             TMP0.16b, Y_0.16b, X_0.16b
+       eor             TMP1.16b, Y_1.16b, X_1.16b
+       eor             TMP2.16b, Y_2.16b, X_2.16b
+       eor             TMP3.16b, Y_3.16b, X_3.16b
+
+       // y = ror(y, 3)
+       ushr            Y_0.\lanes, TMP0.\lanes, #3
+       ushr            Y_1.\lanes, TMP1.\lanes, #3
+       ushr            Y_2.\lanes, TMP2.\lanes, #3
+       ushr            Y_3.\lanes, TMP3.\lanes, #3
+       sli             Y_0.\lanes, TMP0.\lanes, #(\n - 3)
+       sli             Y_1.\lanes, TMP1.\lanes, #(\n - 3)
+       sli             Y_2.\lanes, TMP2.\lanes, #(\n - 3)
+       sli             Y_3.\lanes, TMP3.\lanes, #(\n - 3)
+
+       // x ^= k
+       eor             X_0.16b, X_0.16b, ROUND_KEY.16b
+       eor             X_1.16b, X_1.16b, ROUND_KEY.16b
+       eor             X_2.16b, X_2.16b, ROUND_KEY.16b
+       eor             X_3.16b, X_3.16b, ROUND_KEY.16b
+
+       // x -= y
+       sub             X_0.\lanes, X_0.\lanes, Y_0.\lanes
+       sub             X_1.\lanes, X_1.\lanes, Y_1.\lanes
+       sub             X_2.\lanes, X_2.\lanes, Y_2.\lanes
+       sub             X_3.\lanes, X_3.\lanes, Y_3.\lanes
+
+       // x = rol(x, 8)
+       tbl             X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
+       tbl             X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
+       tbl             X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
+       tbl             X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
+.endm
+
+.macro _next_xts_tweak next, cur, tmp, n
+.if \n == 64
+       /*
+        * Calculate the next tweak by multiplying the current one by x,
+        * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
+        */
+       sshr            \tmp\().2d, \cur\().2d, #63
+       and             \tmp\().16b, \tmp\().16b, GFMUL_TABLE.16b
+       shl             \next\().2d, \cur\().2d, #1
+       ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
+       eor             \next\().16b, \next\().16b, \tmp\().16b
+.else
+       /*
+        * Calculate the next two tweaks by multiplying the current ones by x^2,
+        * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
+        */
+       ushr            \tmp\().2d, \cur\().2d, #62
+       shl             \next\().2d, \cur\().2d, #2
+       tbl             \tmp\().16b, {GFMUL_TABLE.16b}, \tmp\().16b
+       eor             \next\().16b, \next\().16b, \tmp\().16b
+.endif
+.endm
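+
+// The n == 64 branch above is the usual XTS tweak update for a 128-bit block
+// cipher: multiply the tweak by x in GF(2^128), i.e. shift the 128-bit value
+// left by one bit and XOR in 0x87 when a bit falls off the top, which is also
+// what the glue code's gf128mul_x_ble() does on the non-NEON path. A scalar C
+// sketch (editorial, not part of the diff), with the tweak held as two
+// little-endian 64-bit halves:
+//
+//	#include <stdint.h>
+//
+//	/* multiply the tweak (t[0] = low half, t[1] = high half) by x */
+//	static void xts_next_tweak(uint64_t t[2])
+//	{
+//		uint64_t carry = t[1] >> 63;		/* bit 127 about to fall off */
+//
+//		t[1] = (t[1] << 1) | (t[0] >> 63);	/* shift the 128-bit value left by one */
+//		t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);/* reduce by x^128 + x^7 + x^2 + x + 1 */
+//	}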
+
+/*
+ * _speck_xts_crypt() - Speck-XTS encryption/decryption
+ *
+ * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
+ * using Speck-XTS, specifically the variant with a block size of '2n' and round
+ * count given by NROUNDS.  The expanded round keys are given in ROUND_KEYS, and
+ * the current XTS tweak value is given in TWEAK.  It's assumed that NBYTES is a
+ * nonzero multiple of 128.
+ */
+.macro _speck_xts_crypt        n, lanes, decrypting
+
+       /*
+        * If decrypting, modify the ROUND_KEYS parameter to point to the last
+        * round key rather than the first, since for decryption the round keys
+        * are used in reverse order.
+        */
+.if \decrypting
+       mov             NROUNDS, NROUNDS        /* zero the high 32 bits */
+.if \n == 64
+       add             ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #3
+       sub             ROUND_KEYS, ROUND_KEYS, #8
+.else
+       add             ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #2
+       sub             ROUND_KEYS, ROUND_KEYS, #4
+.endif
+.endif
+
+       // Load the index vector for tbl-based 8-bit rotates
+.if \decrypting
+       ldr             ROTATE_TABLE_Q, .Lrol\n\()_8_table
+.else
+       ldr             ROTATE_TABLE_Q, .Lror\n\()_8_table
+.endif
+
+       // One-time XTS preparation
+.if \n == 64
+       // Load first tweak
+       ld1             {TWEAKV0.16b}, [TWEAK]
+
+       // Load GF(2^128) multiplication table
+       ldr             GFMUL_TABLE_Q, .Lgf128mul_table
+.else
+       // Load first tweak
+       ld1             {TWEAKV0.8b}, [TWEAK]
+
+       // Load GF(2^64) multiplication table
+       ldr             GFMUL_TABLE_Q, .Lgf64mul_table
+
+       // Calculate second tweak, packing it together with the first
+       ushr            TMP0.2d, TWEAKV0.2d, #63
+       shl             TMP1.2d, TWEAKV0.2d, #1
+       tbl             TMP0.8b, {GFMUL_TABLE.16b}, TMP0.8b
+       eor             TMP0.8b, TMP0.8b, TMP1.8b
+       mov             TWEAKV0.d[1], TMP0.d[0]
+.endif
+
+.Lnext_128bytes_\@:
+
+       // Calculate XTS tweaks for next 128 bytes
+       _next_xts_tweak TWEAKV1, TWEAKV0, TMP0, \n
+       _next_xts_tweak TWEAKV2, TWEAKV1, TMP0, \n
+       _next_xts_tweak TWEAKV3, TWEAKV2, TMP0, \n
+       _next_xts_tweak TWEAKV4, TWEAKV3, TMP0, \n
+       _next_xts_tweak TWEAKV5, TWEAKV4, TMP0, \n
+       _next_xts_tweak TWEAKV6, TWEAKV5, TMP0, \n
+       _next_xts_tweak TWEAKV7, TWEAKV6, TMP0, \n
+       _next_xts_tweak TWEAKV_NEXT, TWEAKV7, TMP0, \n
+
+       // Load the next source blocks into {X,Y}[0-3]
+       ld1             {X_0.16b-Y_1.16b}, [SRC], #64
+       ld1             {X_2.16b-Y_3.16b}, [SRC], #64
+
+       // XOR the source blocks with their XTS tweaks
+       eor             TMP0.16b, X_0.16b, TWEAKV0.16b
+       eor             Y_0.16b,  Y_0.16b, TWEAKV1.16b
+       eor             TMP1.16b, X_1.16b, TWEAKV2.16b
+       eor             Y_1.16b,  Y_1.16b, TWEAKV3.16b
+       eor             TMP2.16b, X_2.16b, TWEAKV4.16b
+       eor             Y_2.16b,  Y_2.16b, TWEAKV5.16b
+       eor             TMP3.16b, X_3.16b, TWEAKV6.16b
+       eor             Y_3.16b,  Y_3.16b, TWEAKV7.16b
+
+       /*
+        * De-interleave the 'x' and 'y' elements of each block, i.e. make it so
+        * that the X[0-3] registers contain only the second halves of blocks,
+        * and the Y[0-3] registers contain only the first halves of blocks.
+        * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
+        */
+       uzp2            X_0.\lanes, TMP0.\lanes, Y_0.\lanes
+       uzp1            Y_0.\lanes, TMP0.\lanes, Y_0.\lanes
+       uzp2            X_1.\lanes, TMP1.\lanes, Y_1.\lanes
+       uzp1            Y_1.\lanes, TMP1.\lanes, Y_1.\lanes
+       uzp2            X_2.\lanes, TMP2.\lanes, Y_2.\lanes
+       uzp1            Y_2.\lanes, TMP2.\lanes, Y_2.\lanes
+       uzp2            X_3.\lanes, TMP3.\lanes, Y_3.\lanes
+       uzp1            Y_3.\lanes, TMP3.\lanes, Y_3.\lanes
+
+       // Do the cipher rounds
+       mov             x6, ROUND_KEYS
+       mov             w7, NROUNDS
+.Lnext_round_\@:
+.if \decrypting
+       ld1r            {ROUND_KEY.\lanes}, [x6]
+       sub             x6, x6, #( \n / 8 )
+       _speck_unround_128bytes \n, \lanes
+.else
+       ld1r            {ROUND_KEY.\lanes}, [x6], #( \n / 8 )
+       _speck_round_128bytes   \n, \lanes
+.endif
+       subs            w7, w7, #1
+       bne             .Lnext_round_\@
+
+       // Re-interleave the 'x' and 'y' elements of each block
+       zip1            TMP0.\lanes, Y_0.\lanes, X_0.\lanes
+       zip2            Y_0.\lanes,  Y_0.\lanes, X_0.\lanes
+       zip1            TMP1.\lanes, Y_1.\lanes, X_1.\lanes
+       zip2            Y_1.\lanes,  Y_1.\lanes, X_1.\lanes
+       zip1            TMP2.\lanes, Y_2.\lanes, X_2.\lanes
+       zip2            Y_2.\lanes,  Y_2.\lanes, X_2.\lanes
+       zip1            TMP3.\lanes, Y_3.\lanes, X_3.\lanes
+       zip2            Y_3.\lanes,  Y_3.\lanes, X_3.\lanes
+
+       // XOR the encrypted/decrypted blocks with the tweaks calculated earlier
+       eor             X_0.16b, TMP0.16b, TWEAKV0.16b
+       eor             Y_0.16b, Y_0.16b,  TWEAKV1.16b
+       eor             X_1.16b, TMP1.16b, TWEAKV2.16b
+       eor             Y_1.16b, Y_1.16b,  TWEAKV3.16b
+       eor             X_2.16b, TMP2.16b, TWEAKV4.16b
+       eor             Y_2.16b, Y_2.16b,  TWEAKV5.16b
+       eor             X_3.16b, TMP3.16b, TWEAKV6.16b
+       eor             Y_3.16b, Y_3.16b,  TWEAKV7.16b
+       mov             TWEAKV0.16b, TWEAKV_NEXT.16b
+
+       // Store the ciphertext in the destination buffer
+       st1             {X_0.16b-Y_1.16b}, [DST], #64
+       st1             {X_2.16b-Y_3.16b}, [DST], #64
+
+       // Continue if there are more 128-byte chunks remaining
+       subs            NBYTES, NBYTES, #128
+       bne             .Lnext_128bytes_\@
+
+       // Store the next tweak and return
+.if \n == 64
+       st1             {TWEAKV_NEXT.16b}, [TWEAK]
+.else
+       st1             {TWEAKV_NEXT.8b}, [TWEAK]
+.endif
+       ret
+.endm
+
+ENTRY(speck128_xts_encrypt_neon)
+       _speck_xts_crypt        n=64, lanes=2d, decrypting=0
+ENDPROC(speck128_xts_encrypt_neon)
+
+ENTRY(speck128_xts_decrypt_neon)
+       _speck_xts_crypt        n=64, lanes=2d, decrypting=1
+ENDPROC(speck128_xts_decrypt_neon)
+
+ENTRY(speck64_xts_encrypt_neon)
+       _speck_xts_crypt        n=32, lanes=4s, decrypting=0
+ENDPROC(speck64_xts_encrypt_neon)
+
+ENTRY(speck64_xts_decrypt_neon)
+       _speck_xts_crypt        n=32, lanes=4s, decrypting=1
+ENDPROC(speck64_xts_decrypt_neon)
diff --git a/arch/arm64/crypto/speck-neon-glue.c b/arch/arm64/crypto/speck-neon-glue.c
new file mode 100644 (file)
index 0000000..6e233ae
--- /dev/null
+++ b/arch/arm64/crypto/speck-neon-glue.c
@@ -0,0 +1,282 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
+ * (64-bit version; based on the 32-bit version)
+ *
+ * Copyright (c) 2018 Google, Inc
+ */
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <crypto/algapi.h>
+#include <crypto/gf128mul.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/speck.h>
+#include <crypto/xts.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+/* The assembly functions only handle multiples of 128 bytes */
+#define SPECK_NEON_CHUNK_SIZE  128
+
+/* Speck128 */
+
+struct speck128_xts_tfm_ctx {
+       struct speck128_tfm_ctx main_key;
+       struct speck128_tfm_ctx tweak_key;
+};
+
+asmlinkage void speck128_xts_encrypt_neon(const u64 *round_keys, int nrounds,
+                                         void *dst, const void *src,
+                                         unsigned int nbytes, void *tweak);
+
+asmlinkage void speck128_xts_decrypt_neon(const u64 *round_keys, int nrounds,
+                                         void *dst, const void *src,
+                                         unsigned int nbytes, void *tweak);
+
+typedef void (*speck128_crypt_one_t)(const struct speck128_tfm_ctx *,
+                                    u8 *, const u8 *);
+typedef void (*speck128_xts_crypt_many_t)(const u64 *, int, void *,
+                                         const void *, unsigned int, void *);
+
+static __always_inline int
+__speck128_xts_crypt(struct skcipher_request *req,
+                    speck128_crypt_one_t crypt_one,
+                    speck128_xts_crypt_many_t crypt_many)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       const struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       le128 tweak;
+       int err;
+
+       err = skcipher_walk_virt(&walk, req, true);
+
+       crypto_speck128_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
+
+       while (walk.nbytes > 0) {
+               unsigned int nbytes = walk.nbytes;
+               u8 *dst = walk.dst.virt.addr;
+               const u8 *src = walk.src.virt.addr;
+
+               if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
+                       unsigned int count;
+
+                       count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
+                       kernel_neon_begin();
+                       (*crypt_many)(ctx->main_key.round_keys,
+                                     ctx->main_key.nrounds,
+                                     dst, src, count, &tweak);
+                       kernel_neon_end();
+                       dst += count;
+                       src += count;
+                       nbytes -= count;
+               }
+
+               /* Handle any remainder with generic code */
+               while (nbytes >= sizeof(tweak)) {
+                       le128_xor((le128 *)dst, (const le128 *)src, &tweak);
+                       (*crypt_one)(&ctx->main_key, dst, dst);
+                       le128_xor((le128 *)dst, (const le128 *)dst, &tweak);
+                       gf128mul_x_ble(&tweak, &tweak);
+
+                       dst += sizeof(tweak);
+                       src += sizeof(tweak);
+                       nbytes -= sizeof(tweak);
+               }
+               err = skcipher_walk_done(&walk, nbytes);
+       }
+
+       return err;
+}
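+
+/*
+ * Since the asm routines only accept multiples of SPECK_NEON_CHUNK_SIZE, each
+ * walked chunk above is split into a NEON bulk portion and a per-block
+ * remainder handled by the generic cipher. A small userspace example of that
+ * split (editorial sketch, not part of the diff):
+ *
+ *	#include <stdio.h>
+ *
+ *	#define SPECK_NEON_CHUNK_SIZE	128
+ *	#define SPECK128_BLOCK_SIZE	16
+ *
+ *	int main(void)
+ *	{
+ *		unsigned int nbytes = 208;	// 13 Speck128 blocks in this walk chunk
+ *		unsigned int bulk = nbytes - nbytes % SPECK_NEON_CHUNK_SIZE;
+ *
+ *		// prints "NEON bulk: 128 bytes, generic remainder: 5 blocks"
+ *		printf("NEON bulk: %u bytes, generic remainder: %u blocks\n",
+ *		       bulk, (nbytes - bulk) / SPECK128_BLOCK_SIZE);
+ *		return 0;
+ *	}
+ */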
+
+static int speck128_xts_encrypt(struct skcipher_request *req)
+{
+       return __speck128_xts_crypt(req, crypto_speck128_encrypt,
+                                   speck128_xts_encrypt_neon);
+}
+
+static int speck128_xts_decrypt(struct skcipher_request *req)
+{
+       return __speck128_xts_crypt(req, crypto_speck128_decrypt,
+                                   speck128_xts_decrypt_neon);
+}
+
+static int speck128_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
+                              unsigned int keylen)
+{
+       struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
+       int err;
+
+       err = xts_verify_key(tfm, key, keylen);
+       if (err)
+               return err;
+
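+       /* An XTS key is two equal-length keys: data key first, then tweak key */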
+       keylen /= 2;
+
+       err = crypto_speck128_setkey(&ctx->main_key, key, keylen);
+       if (err)
+               return err;
+
+       return crypto_speck128_setkey(&ctx->tweak_key, key + keylen, keylen);
+}
+
+/* Speck64 */
+
+struct speck64_xts_tfm_ctx {
+       struct speck64_tfm_ctx main_key;
+       struct speck64_tfm_ctx tweak_key;
+};
+
+asmlinkage void speck64_xts_encrypt_neon(const u32 *round_keys, int nrounds,
+                                        void *dst, const void *src,
+                                        unsigned int nbytes, void *tweak);
+
+asmlinkage void speck64_xts_decrypt_neon(const u32 *round_keys, int nrounds,
+                                        void *dst, const void *src,
+                                        unsigned int nbytes, void *tweak);
+
+typedef void (*speck64_crypt_one_t)(const struct speck64_tfm_ctx *,
+                                   u8 *, const u8 *);
+typedef void (*speck64_xts_crypt_many_t)(const u32 *, int, void *,
+                                        const void *, unsigned int, void *);
+
+static __always_inline int
+__speck64_xts_crypt(struct skcipher_request *req, speck64_crypt_one_t crypt_one,
+                   speck64_xts_crypt_many_t crypt_many)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       const struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       __le64 tweak;
+       int err;
+
+       err = skcipher_walk_virt(&walk, req, true);
+
+       crypto_speck64_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
+
+       while (walk.nbytes > 0) {
+               unsigned int nbytes = walk.nbytes;
+               u8 *dst = walk.dst.virt.addr;
+               const u8 *src = walk.src.virt.addr;
+
+               if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
+                       unsigned int count;
+
+                       count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
+                       kernel_neon_begin();
+                       (*crypt_many)(ctx->main_key.round_keys,
+                                     ctx->main_key.nrounds,
+                                     dst, src, count, &tweak);
+                       kernel_neon_end();
+                       dst += count;
+                       src += count;
+                       nbytes -= count;
+               }
+
+               /* Handle any remainder with generic code */
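+               /*
+                * Same XEX pattern as Speck128-XTS, but with a 64-bit tweak:
+                * multiplying by x in GF(2^64) reduces modulo
+                * x^64 + x^4 + x^3 + x + 1, hence the conditional XOR with
+                * 0x1B below.
+                */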
+               while (nbytes >= sizeof(tweak)) {
+                       *(__le64 *)dst = *(__le64 *)src ^ tweak;
+                       (*crypt_one)(&ctx->main_key, dst, dst);
+                       *(__le64 *)dst ^= tweak;
+                       tweak = cpu_to_le64((le64_to_cpu(tweak) << 1) ^
+                                           ((tweak & cpu_to_le64(1ULL << 63)) ?
+                                            0x1B : 0));
+                       dst += sizeof(tweak);
+                       src += sizeof(tweak);
+                       nbytes -= sizeof(tweak);
+               }
+               err = skcipher_walk_done(&walk, nbytes);
+       }
+
+       return err;
+}
+
+static int speck64_xts_encrypt(struct skcipher_request *req)
+{
+       return __speck64_xts_crypt(req, crypto_speck64_encrypt,
+                                  speck64_xts_encrypt_neon);
+}
+
+static int speck64_xts_decrypt(struct skcipher_request *req)
+{
+       return __speck64_xts_crypt(req, crypto_speck64_decrypt,
+                                  speck64_xts_decrypt_neon);
+}
+
+static int speck64_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
+                             unsigned int keylen)
+{
+       struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
+       int err;
+
+       err = xts_verify_key(tfm, key, keylen);
+       if (err)
+               return err;
+
+       keylen /= 2;
+
+       err = crypto_speck64_setkey(&ctx->main_key, key, keylen);
+       if (err)
+               return err;
+
+       return crypto_speck64_setkey(&ctx->tweak_key, key + keylen, keylen);
+}
+
+static struct skcipher_alg speck_algs[] = {
+       {
+               .base.cra_name          = "xts(speck128)",
+               .base.cra_driver_name   = "xts-speck128-neon",
+               .base.cra_priority      = 300,
+               .base.cra_blocksize     = SPECK128_BLOCK_SIZE,
+               .base.cra_ctxsize       = sizeof(struct speck128_xts_tfm_ctx),
+               .base.cra_alignmask     = 7,
+               .base.cra_module        = THIS_MODULE,
+               .min_keysize            = 2 * SPECK128_128_KEY_SIZE,
+               .max_keysize            = 2 * SPECK128_256_KEY_SIZE,
+               .ivsize                 = SPECK128_BLOCK_SIZE,
+               .walksize               = SPECK_NEON_CHUNK_SIZE,
+               .setkey                 = speck128_xts_setkey,
+               .encrypt                = speck128_xts_encrypt,
+               .decrypt                = speck128_xts_decrypt,
+       }, {
+               .base.cra_name          = "xts(speck64)",
+               .base.cra_driver_name   = "xts-speck64-neon",
+               .base.cra_priority      = 300,
+               .base.cra_blocksize     = SPECK64_BLOCK_SIZE,
+               .base.cra_ctxsize       = sizeof(struct speck64_xts_tfm_ctx),
+               .base.cra_alignmask     = 7,
+               .base.cra_module        = THIS_MODULE,
+               .min_keysize            = 2 * SPECK64_96_KEY_SIZE,
+               .max_keysize            = 2 * SPECK64_128_KEY_SIZE,
+               .ivsize                 = SPECK64_BLOCK_SIZE,
+               .walksize               = SPECK_NEON_CHUNK_SIZE,
+               .setkey                 = speck64_xts_setkey,
+               .encrypt                = speck64_xts_encrypt,
+               .decrypt                = speck64_xts_decrypt,
+       }
+};
+
+static int __init speck_neon_module_init(void)
+{
+       if (!(elf_hwcap & HWCAP_ASIMD))
+               return -ENODEV;
+       return crypto_register_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
+}
+
+static void __exit speck_neon_module_exit(void)
+{
+       crypto_unregister_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
+}
+
+module_init(speck_neon_module_init);
+module_exit(speck_neon_module_exit);
+
+MODULE_DESCRIPTION("Speck block cipher (NEON-accelerated)");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("xts(speck128)");
+MODULE_ALIAS_CRYPTO("xts-speck128-neon");
+MODULE_ALIAS_CRYPTO("xts(speck64)");
+MODULE_ALIAS_CRYPTO("xts-speck64-neon");
index 12e8484a8ee79b42d3604abc71a6e53a0f5b4cee..e762ef417562ff96ba769a555a36f546ff5d7d92 100644 (file)
@@ -94,23 +94,30 @@ ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
 
 
 #define        STACK_OFFSET    8*3
-#define        HashKey         16*0    // store HashKey <<1 mod poly here
-#define        HashKey_2       16*1    // store HashKey^2 <<1 mod poly here
-#define        HashKey_3       16*2    // store HashKey^3 <<1 mod poly here
-#define        HashKey_4       16*3    // store HashKey^4 <<1 mod poly here
-#define        HashKey_k       16*4    // store XOR of High 64 bits and Low 64
+
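+# Byte offsets into the gcm_context_data structure passed in %arg2 (see
+# GCM_INIT below): the running AAD hash, AAD length, total input length,
+# partial-block state, original IV, current counter, and the precomputed
+# HashKey powers.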
+#define AadHash 16*0
+#define AadLen 16*1
+#define InLen (16*1)+8
+#define PBlockEncKey 16*2
+#define OrigIV 16*3
+#define CurCount 16*4
+#define PBlockLen 16*5
+#define        HashKey         16*6    // store HashKey <<1 mod poly here
+#define        HashKey_2       16*7    // store HashKey^2 <<1 mod poly here
+#define        HashKey_3       16*8    // store HashKey^3 <<1 mod poly here
+#define        HashKey_4       16*9    // store HashKey^4 <<1 mod poly here
+#define        HashKey_k       16*10   // store XOR of High 64 bits and Low 64
                                // bits of  HashKey <<1 mod poly here
                                //(for Karatsuba purposes)
-#define        HashKey_2_k     16*5    // store XOR of High 64 bits and Low 64
+#define        HashKey_2_k     16*11   // store XOR of High 64 bits and Low 64
                                // bits of  HashKey^2 <<1 mod poly here
                                // (for Karatsuba purposes)
-#define        HashKey_3_k     16*6    // store XOR of High 64 bits and Low 64
+#define        HashKey_3_k     16*12   // store XOR of High 64 bits and Low 64
                                // bits of  HashKey^3 <<1 mod poly here
                                // (for Karatsuba purposes)
-#define        HashKey_4_k     16*7    // store XOR of High 64 bits and Low 64
+#define        HashKey_4_k     16*13   // store XOR of High 64 bits and Low 64
                                // bits of  HashKey^4 <<1 mod poly here
                                // (for Karatsuba purposes)
-#define        VARIABLE_OFFSET 16*8
 
 #define arg1 rdi
 #define arg2 rsi
@@ -118,10 +125,11 @@ ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
 #define arg4 rcx
 #define arg5 r8
 #define arg6 r9
-#define arg7 STACK_OFFSET+8(%r14)
-#define arg8 STACK_OFFSET+16(%r14)
-#define arg9 STACK_OFFSET+24(%r14)
-#define arg10 STACK_OFFSET+32(%r14)
+#define arg7 STACK_OFFSET+8(%rsp)
+#define arg8 STACK_OFFSET+16(%rsp)
+#define arg9 STACK_OFFSET+24(%rsp)
+#define arg10 STACK_OFFSET+32(%rsp)
+#define arg11 STACK_OFFSET+40(%rsp)
 #define keysize 2*15*16(%arg1)
 #endif
 
@@ -171,6 +179,332 @@ ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
 #define TKEYP  T1
 #endif
 
+.macro FUNC_SAVE
+       push    %r12
+       push    %r13
+       push    %r14
+#
+# the states of %xmm registers %xmm6:%xmm15 are not saved;
+# all %xmm registers are clobbered
+#
+.endm
+
+
+.macro FUNC_RESTORE
+       pop     %r14
+       pop     %r13
+       pop     %r12
+.endm
+
+# Precompute hashkeys.
+# Input: Hash subkey.
+# Output: HashKeys stored in gcm_context_data.  Only needs to be called
+# once per key.
+# clobbers %r12 and the temporary xmm registers.
+.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
+       mov     \SUBKEY, %r12
+       movdqu  (%r12), \TMP3
+       movdqa  SHUF_MASK(%rip), \TMP2
+       PSHUFB_XMM \TMP2, \TMP3
+
+       # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
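+       # shift HashKey left by one bit as a 128-bit value; the bit shifted
+       # out is folded back in by the reduction below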
+       movdqa  \TMP3, \TMP2
+       psllq   $1, \TMP3
+       psrlq   $63, \TMP2
+       movdqa  \TMP2, \TMP1
+       pslldq  $8, \TMP2
+       psrldq  $8, \TMP1
+       por     \TMP2, \TMP3
+
+       # reduce HashKey<<1
+
+       pshufd  $0x24, \TMP1, \TMP2
+       pcmpeqd TWOONE(%rip), \TMP2
+       pand    POLY(%rip), \TMP2
+       pxor    \TMP2, \TMP3
+       movdqa  \TMP3, HashKey(%arg2)
+
+       movdqa     \TMP3, \TMP5
+       pshufd     $78, \TMP3, \TMP1
+       pxor       \TMP3, \TMP1
+       movdqa     \TMP1, HashKey_k(%arg2)
+
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+       movdqa     \TMP5, HashKey_2(%arg2)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_2_k(%arg2)
+
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+       movdqa     \TMP5, HashKey_3(%arg2)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_3_k(%arg2)
+
+       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^4<<1 (mod poly)
+       movdqa     \TMP5, HashKey_4(%arg2)
+       pshufd     $78, \TMP5, \TMP1
+       pxor       \TMP5, \TMP1
+       movdqa     \TMP1, HashKey_4_k(%arg2)
+.endm
+
+# GCM_INIT initializes a gcm_context_data struct to prepare for encryption/decryption.
+# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
+.macro GCM_INIT Iv SUBKEY AAD AADLEN
+       mov \AADLEN, %r11
+       mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
+       xor %r11, %r11
+       mov %r11, InLen(%arg2) # ctx_data.in_length = 0
+       mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
+       mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
+       mov \Iv, %rax
+       movdqu (%rax), %xmm0
+       movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
+
+       movdqa  SHUF_MASK(%rip), %xmm2
+       PSHUFB_XMM %xmm2, %xmm0
+       movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
+
+       PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
+       movdqa HashKey(%arg2), %xmm13
+
+       CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
+       %xmm4, %xmm5, %xmm6
+.endm
+
+# GCM_ENC_DEC encrypts/decrypts the given data. Assumes that the passed
+# gcm_context_data struct has been initialized by GCM_INIT.
+# Requires the input data to be at least 1 byte long because of READ_PARTIAL_BLOCK
+# Clobbers rax, r10-r13, and xmm0-xmm15
+.macro GCM_ENC_DEC operation
+       movdqu AadHash(%arg2), %xmm8
+       movdqu HashKey(%arg2), %xmm13
+       add %arg5, InLen(%arg2)
+
+       xor %r11, %r11 # initialise the data pointer offset as zero
+       PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
+
+       sub %r11, %arg5         # sub partial block data used
+       mov %arg5, %r13         # save the number of bytes
+
+       and $-16, %r13          # %r13 = %r13 - (%r13 mod 16)
+       mov %r13, %r12
+       # Encrypt/Decrypt first few blocks
+
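+       # %r12 = (number of full 16-byte blocks) mod 4, scaled by 16;
+       # encrypt/decrypt that many blocks first so the main loop below
+       # always runs on a multiple of 4 blocks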
+       and     $(3<<4), %r12
+       jz      _initial_num_blocks_is_0_\@
+       cmp     $(2<<4), %r12
+       jb      _initial_num_blocks_is_1_\@
+       je      _initial_num_blocks_is_2_\@
+_initial_num_blocks_is_3_\@:
+       INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
+       sub     $48, %r13
+       jmp     _initial_blocks_\@
+_initial_num_blocks_is_2_\@:
+       INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
+       sub     $32, %r13
+       jmp     _initial_blocks_\@
+_initial_num_blocks_is_1_\@:
+       INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
+       sub     $16, %r13
+       jmp     _initial_blocks_\@
+_initial_num_blocks_is_0_\@:
+       INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
+_initial_blocks_\@:
+
+       # Main loop - Encrypt/Decrypt remaining blocks
+
+       cmp     $0, %r13
+       je      _zero_cipher_left_\@
+       sub     $64, %r13
+       je      _four_cipher_left_\@
+_crypt_by_4_\@:
+       GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
+       %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
+       %xmm7, %xmm8, enc
+       add     $64, %r11
+       sub     $64, %r13
+       jne     _crypt_by_4_\@
+_four_cipher_left_\@:
+       GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_\@:
+       movdqu %xmm8, AadHash(%arg2)
+       movdqu %xmm0, CurCount(%arg2)
+
+       mov     %arg5, %r13
+       and     $15, %r13                       # %r13 = arg5 (mod 16)
+       je      _multiple_of_16_bytes_\@
+
+       mov %r13, PBlockLen(%arg2)
+
+       # Handle the last <16 Byte block separately
+       paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
+       movdqu %xmm0, CurCount(%arg2)
+       movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10, %xmm0
+
+       ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
+       movdqu %xmm0, PBlockEncKey(%arg2)
+
+       cmp     $16, %arg5
+       jge _large_enough_update_\@
+
+       lea (%arg4,%r11,1), %r10
+       mov %r13, %r12
+       READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
+       jmp _data_read_\@
+
+_large_enough_update_\@:
+       sub     $16, %r11
+       add     %r13, %r11
+
+       # receive the last <16 Byte block
+       movdqu  (%arg4, %r11, 1), %xmm1
+
+       sub     %r13, %r11
+       add     $16, %r11
+
+       lea     SHIFT_MASK+16(%rip), %r12
+       # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+       # (r13 is the number of bytes in plaintext mod 16)
+       sub     %r13, %r12
+       # get the appropriate shuffle mask
+       movdqu  (%r12), %xmm2
+       # shift right 16-r13 bytes
+       PSHUFB_XMM  %xmm2, %xmm1
+
+_data_read_\@:
+       lea ALL_F+16(%rip), %r12
+       sub %r13, %r12
+
+.ifc \operation, dec
+       movdqa  %xmm1, %xmm2
+.endif
+       pxor    %xmm1, %xmm0            # XOR Encrypt(K, Yn)
+       movdqu  (%r12), %xmm1
+       # get the appropriate mask to mask out top 16-r13 bytes of xmm0
+       pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
+.ifc \operation, dec
+       pand    %xmm1, %xmm2
+       movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10 ,%xmm2
+
+       pxor %xmm2, %xmm8
+.else
+       movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10,%xmm0
+
+       pxor    %xmm0, %xmm8
+.endif
+
+       movdqu %xmm8, AadHash(%arg2)
+.ifc \operation, enc
+       # GHASH computation for the last <16 byte block
+       movdqa SHUF_MASK(%rip), %xmm10
+       # shuffle xmm0 back to output as ciphertext
+       PSHUFB_XMM %xmm10, %xmm0
+.endif
+
+       # Output %r13 bytes
+       MOVQ_R64_XMM %xmm0, %rax
+       cmp $8, %r13
+       jle _less_than_8_bytes_left_\@
+       mov %rax, (%arg3 , %r11, 1)
+       add $8, %r11
+       psrldq $8, %xmm0
+       MOVQ_R64_XMM %xmm0, %rax
+       sub $8, %r13
+_less_than_8_bytes_left_\@:
+       mov %al,  (%arg3, %r11, 1)
+       add $1, %r11
+       shr $8, %rax
+       sub $1, %r13
+       jne _less_than_8_bytes_left_\@
+_multiple_of_16_bytes_\@:
+.endm
+
+# GCM_COMPLETE finishes the GHASH update for any remaining partial block and
+# computes the final tag.
+# Output: Authentication Tag (AUTH_TAG)
+# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
+.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
+       movdqu AadHash(%arg2), %xmm8
+       movdqu HashKey(%arg2), %xmm13
+
+       mov PBlockLen(%arg2), %r12
+
+       cmp $0, %r12
+       je _partial_done\@
+
+       GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+
+_partial_done\@:
+       mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
+       shl     $3, %r12                  # convert into number of bits
+       movd    %r12d, %xmm15             # len(A) in %xmm15
+       mov InLen(%arg2), %r12
+       shl     $3, %r12                  # len(C) in bits
+       MOVQ_R64_XMM    %r12, %xmm1
+
+       pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
+       pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
+       pxor    %xmm15, %xmm8
+       GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+       # final GHASH computation
+       movdqa SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM %xmm10, %xmm8
+
+       movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
+       ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
+       pxor    %xmm8, %xmm0
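+       # %xmm0 now holds the full 16-byte tag; the code below writes out
+       # auth_tag_len bytes of it (up to 16) in 8/4/2/1-byte pieces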
+_return_T_\@:
+       mov     \AUTHTAG, %r10                     # %r10 = authTag
+       mov     \AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
+       cmp     $16, %r11
+       je      _T_16_\@
+       cmp     $8, %r11
+       jl      _T_4_\@
+_T_8_\@:
+       MOVQ_R64_XMM    %xmm0, %rax
+       mov     %rax, (%r10)
+       add     $8, %r10
+       sub     $8, %r11
+       psrldq  $8, %xmm0
+       cmp     $0, %r11
+       je      _return_T_done_\@
+_T_4_\@:
+       movd    %xmm0, %eax
+       mov     %eax, (%r10)
+       add     $4, %r10
+       sub     $4, %r11
+       psrldq  $4, %xmm0
+       cmp     $0, %r11
+       je      _return_T_done_\@
+_T_123_\@:
+       movd    %xmm0, %eax
+       cmp     $2, %r11
+       jl      _T_1_\@
+       mov     %ax, (%r10)
+       cmp     $2, %r11
+       je      _return_T_done_\@
+       add     $2, %r10
+       sar     $16, %eax
+_T_1_\@:
+       mov     %al, (%r10)
+       jmp     _return_T_done_\@
+_T_16_\@:
+       movdqu  %xmm0, (%r10)
+_return_T_done_\@:
+.endm
 
 #ifdef __x86_64__
 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
@@ -264,232 +598,188 @@ _read_next_byte_lt8_\@:
 _done_read_partial_block_\@:
 .endm
 
-/*
-* if a = number of total plaintext bytes
-* b = floor(a/16)
-* num_initial_blocks = b mod 4
-* encrypt the initial num_initial_blocks blocks and apply ghash on
-* the ciphertext
-* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
-* are clobbered
-* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
-*/
-
-
-.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
-XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
-        MOVADQ     SHUF_MASK(%rip), %xmm14
-       mov        arg7, %r10           # %r10 = AAD
-       mov        arg8, %r11           # %r11 = aadLen
-       pxor       %xmm\i, %xmm\i
-       pxor       \XMM2, \XMM2
+# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+# clobbers r10-11, xmm14
+.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
+       TMP6 TMP7
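+       # fold full 16-byte AAD blocks into the running GHASH; any tail
+       # shorter than 16 bytes is read with READ_PARTIAL_BLOCK and folded in
+       # last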
+       MOVADQ     SHUF_MASK(%rip), %xmm14
+       mov        \AAD, %r10           # %r10 = AAD
+       mov        \AADLEN, %r11                # %r11 = aadLen
+       pxor       \TMP7, \TMP7
+       pxor       \TMP6, \TMP6
 
        cmp        $16, %r11
-       jl         _get_AAD_rest\num_initial_blocks\operation
-_get_AAD_blocks\num_initial_blocks\operation:
-       movdqu     (%r10), %xmm\i
-       PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
-       pxor       %xmm\i, \XMM2
-       GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+       jl         _get_AAD_rest\@
+_get_AAD_blocks\@:
+       movdqu     (%r10), \TMP7
+       PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
+       pxor       \TMP7, \TMP6
+       GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
        add        $16, %r10
        sub        $16, %r11
-       cmp        $16, %r11
-       jge        _get_AAD_blocks\num_initial_blocks\operation
-
-       movdqu     \XMM2, %xmm\i
-
-       /* read the last <16B of AAD */
-_get_AAD_rest\num_initial_blocks\operation:
-       cmp        $0, %r11
-       je         _get_AAD_done\num_initial_blocks\operation
-
-       READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
-       PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
-       pxor       \XMM2, %xmm\i
-       GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-
-_get_AAD_done\num_initial_blocks\operation:
-       xor        %r11, %r11 # initialise the data pointer offset as zero
-       # start AES for num_initial_blocks blocks
-
-       mov        %arg5, %rax                      # %rax = *Y0
-       movdqu     (%rax), \XMM0                    # XMM0 = Y0
-       PSHUFB_XMM   %xmm14, \XMM0
-
-.if (\i == 5) || (\i == 6) || (\i == 7)
-       MOVADQ          ONE(%RIP),\TMP1
-       MOVADQ          (%arg1),\TMP2
-.irpc index, \i_seq
-       paddd      \TMP1, \XMM0                 # INCR Y0
-       movdqa     \XMM0, %xmm\index
-       PSHUFB_XMM   %xmm14, %xmm\index      # perform a 16 byte swap
-       pxor       \TMP2, %xmm\index
-.endr
-       lea     0x10(%arg1),%r10
-       mov     keysize,%eax
-       shr     $2,%eax                         # 128->4, 192->6, 256->8
-       add     $5,%eax                       # 128->9, 192->11, 256->13
-
-aes_loop_initial_dec\num_initial_blocks:
-       MOVADQ  (%r10),\TMP1
-.irpc  index, \i_seq
-       AESENC  \TMP1, %xmm\index
-.endr
-       add     $16,%r10
-       sub     $1,%eax
-       jnz     aes_loop_initial_dec\num_initial_blocks
-
-       MOVADQ  (%r10), \TMP1
-.irpc index, \i_seq
-       AESENCLAST \TMP1, %xmm\index         # Last Round
-.endr
-.irpc index, \i_seq
-       movdqu     (%arg3 , %r11, 1), \TMP1
-       pxor       \TMP1, %xmm\index
-       movdqu     %xmm\index, (%arg2 , %r11, 1)
-       # write back plaintext/ciphertext for num_initial_blocks
-       add        $16, %r11
-
-       movdqa     \TMP1, %xmm\index
-       PSHUFB_XMM         %xmm14, %xmm\index
-                # prepare plaintext/ciphertext for GHASH computation
-.endr
-.endif
-
-        # apply GHASH on num_initial_blocks blocks
-
-.if \i == 5
-        pxor       %xmm5, %xmm6
-       GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-        pxor       %xmm6, %xmm7
-       GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-        pxor       %xmm7, %xmm8
-       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 6
-        pxor       %xmm6, %xmm7
-       GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-        pxor       %xmm7, %xmm8
-       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.elseif \i == 7
-        pxor       %xmm7, %xmm8
-       GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-.endif
-       cmp        $64, %r13
-       jl      _initial_blocks_done\num_initial_blocks\operation
-       # no need for precomputed values
-/*
-*
-* Precomputations for HashKey parallel with encryption of first 4 blocks.
-* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
-*/
-       MOVADQ     ONE(%rip), \TMP1
-       paddd      \TMP1, \XMM0              # INCR Y0
-       MOVADQ     \XMM0, \XMM1
-       PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
-
-       paddd      \TMP1, \XMM0              # INCR Y0
-       MOVADQ     \XMM0, \XMM2
-       PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
-
-       paddd      \TMP1, \XMM0              # INCR Y0
-       MOVADQ     \XMM0, \XMM3
-       PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
-
-       paddd      \TMP1, \XMM0              # INCR Y0
-       MOVADQ     \XMM0, \XMM4
-       PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
-
-       MOVADQ     0(%arg1),\TMP1
-       pxor       \TMP1, \XMM1
-       pxor       \TMP1, \XMM2
-       pxor       \TMP1, \XMM3
-       pxor       \TMP1, \XMM4
-       movdqa     \TMP3, \TMP5
-       pshufd     $78, \TMP3, \TMP1
-       pxor       \TMP3, \TMP1
-       movdqa     \TMP1, HashKey_k(%rsp)
-       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^2<<1 (mod poly)
-       movdqa     \TMP5, HashKey_2(%rsp)
-# HashKey_2 = HashKey^2<<1 (mod poly)
-       pshufd     $78, \TMP5, \TMP1
-       pxor       \TMP5, \TMP1
-       movdqa     \TMP1, HashKey_2_k(%rsp)
-.irpc index, 1234 # do 4 rounds
-       movaps 0x10*\index(%arg1), \TMP1
-       AESENC     \TMP1, \XMM1
-       AESENC     \TMP1, \XMM2
-       AESENC     \TMP1, \XMM3
-       AESENC     \TMP1, \XMM4
-.endr
-       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^3<<1 (mod poly)
-       movdqa     \TMP5, HashKey_3(%rsp)
-       pshufd     $78, \TMP5, \TMP1
-       pxor       \TMP5, \TMP1
-       movdqa     \TMP1, HashKey_3_k(%rsp)
-.irpc index, 56789 # do next 5 rounds
-       movaps 0x10*\index(%arg1), \TMP1
-       AESENC     \TMP1, \XMM1
-       AESENC     \TMP1, \XMM2
-       AESENC     \TMP1, \XMM3
-       AESENC     \TMP1, \XMM4
-.endr
-       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^3<<1 (mod poly)
-       movdqa     \TMP5, HashKey_4(%rsp)
-       pshufd     $78, \TMP5, \TMP1
-       pxor       \TMP5, \TMP1
-       movdqa     \TMP1, HashKey_4_k(%rsp)
-       lea        0xa0(%arg1),%r10
-       mov        keysize,%eax
-       shr        $2,%eax                      # 128->4, 192->6, 256->8
-       sub        $4,%eax                      # 128->0, 192->2, 256->4
-       jz         aes_loop_pre_dec_done\num_initial_blocks
-
-aes_loop_pre_dec\num_initial_blocks:
-       MOVADQ     (%r10),\TMP2
-.irpc  index, 1234
-       AESENC     \TMP2, %xmm\index
-.endr
-       add        $16,%r10
-       sub        $1,%eax
-       jnz        aes_loop_pre_dec\num_initial_blocks
+       cmp        $16, %r11
+       jge        _get_AAD_blocks\@
 
-aes_loop_pre_dec_done\num_initial_blocks:
-       MOVADQ     (%r10), \TMP2
-       AESENCLAST \TMP2, \XMM1
-       AESENCLAST \TMP2, \XMM2
-       AESENCLAST \TMP2, \XMM3
-       AESENCLAST \TMP2, \XMM4
-       movdqu     16*0(%arg3 , %r11 , 1), \TMP1
-       pxor       \TMP1, \XMM1
-       movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
-       movdqa     \TMP1, \XMM1
-       movdqu     16*1(%arg3 , %r11 , 1), \TMP1
-       pxor       \TMP1, \XMM2
-       movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
-       movdqa     \TMP1, \XMM2
-       movdqu     16*2(%arg3 , %r11 , 1), \TMP1
-       pxor       \TMP1, \XMM3
-       movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
-       movdqa     \TMP1, \XMM3
-       movdqu     16*3(%arg3 , %r11 , 1), \TMP1
-       pxor       \TMP1, \XMM4
-       movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
-       movdqa     \TMP1, \XMM4
-       add        $64, %r11
-       PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
-       pxor       \XMMDst, \XMM1
-# combine GHASHed value with the corresponding ciphertext
-       PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
-       PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
-       PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+       movdqu     \TMP6, \TMP7
+
+       /* read the last <16B of AAD */
+_get_AAD_rest\@:
+       cmp        $0, %r11
+       je         _get_AAD_done\@
 
-_initial_blocks_done\num_initial_blocks\operation:
+       READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
+       PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
+       pxor       \TMP6, \TMP7
+       GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
+       movdqu \TMP7, \TMP6
 
+_get_AAD_done\@:
+       movdqu \TMP6, AadHash(%arg2)
 .endm
 
+# PARTIAL_BLOCK: Handles the encryption/decryption and GHASH update of any
+# partial block carried over between update calls.
+# Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK
+# Outputs encrypted bytes and updates the hash and partial-block info in
+# gcm_context_data.
+# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
+.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
+       AAD_HASH operation
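+       # If the previous update call left a partial block (PBlockLen != 0),
+       # complete it with the leading bytes of this call's input first.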
+       mov     PBlockLen(%arg2), %r13
+       cmp     $0, %r13
+       je      _partial_block_done_\@  # Leave Macro if no partial blocks
+       # Read in the input data without over-reading
+       cmp     $16, \PLAIN_CYPH_LEN
+       jl      _fewer_than_16_bytes_\@
+       movups  (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
+       jmp     _data_read_\@
+
+_fewer_than_16_bytes_\@:
+       lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
+       mov     \PLAIN_CYPH_LEN, %r12
+       READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
+
+       mov PBlockLen(%arg2), %r13
+
+_data_read_\@:                         # Finished reading in data
+
+       movdqu  PBlockEncKey(%arg2), %xmm9
+       movdqu  HashKey(%arg2), %xmm13
+
+       lea     SHIFT_MASK(%rip), %r12
+
+       # adjust the shuffle mask pointer to be able to shift r13 bytes
+       # (r13 is the number of bytes already processed in the partial block)
+       add     %r13, %r12
+       movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
+       PSHUFB_XMM %xmm2, %xmm9         # shift right r13 bytes
+
+.ifc \operation, dec
+       movdqa  %xmm1, %xmm3
+       pxor    %xmm1, %xmm9            # Cyphertext XOR E(K, Yn)
+
+       mov     \PLAIN_CYPH_LEN, %r10
+       add     %r13, %r10
+       # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+       sub     $16, %r10
+       # Determine if the partial block is not being completely filled and
+       # adjust the shift mask accordingly
+       jge     _no_extra_mask_1_\@
+       sub     %r10, %r12
+_no_extra_mask_1_\@:
+
+       movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
+       # get the appropriate mask to mask out bottom r13 bytes of xmm9
+       pand    %xmm1, %xmm9            # mask out bottom r13 bytes of xmm9
+
+       pand    %xmm1, %xmm3
+       movdqa  SHUF_MASK(%rip), %xmm10
+       PSHUFB_XMM      %xmm10, %xmm3
+       PSHUFB_XMM      %xmm2, %xmm3
+       pxor    %xmm3, \AAD_HASH
+
+       cmp     $0, %r10
+       jl      _partial_incomplete_1_\@
+
+       # GHASH computation for the last <16 Byte block
+       GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+       xor     %rax,%rax
+
+       mov     %rax, PBlockLen(%arg2)
+       jmp     _dec_done_\@
+_partial_incomplete_1_\@:
+       add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
+_dec_done_\@:
+       movdqu  \AAD_HASH, AadHash(%arg2)
+.else
+       pxor    %xmm1, %xmm9                    # Plaintext XOR E(K, Yn)
+
+       mov     \PLAIN_CYPH_LEN, %r10
+       add     %r13, %r10
+       # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+       sub     $16, %r10
+       # Determine if the partial block is not being completely filled and
+       # adjust the shift mask accordingly
+       jge     _no_extra_mask_2_\@
+       sub     %r10, %r12
+_no_extra_mask_2_\@:
+
+       movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
+       # get the appropriate mask to mask out bottom r13 bytes of xmm9
+       pand    %xmm1, %xmm9
+
+       movdqa  SHUF_MASK(%rip), %xmm1
+       PSHUFB_XMM %xmm1, %xmm9
+       PSHUFB_XMM %xmm2, %xmm9
+       pxor    %xmm9, \AAD_HASH
+
+       cmp     $0, %r10
+       jl      _partial_incomplete_2_\@
+
+       # GHASH computation for the last <16 Byte block
+       GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+       xor     %rax,%rax
+
+       mov     %rax, PBlockLen(%arg2)
+       jmp     _encode_done_\@
+_partial_incomplete_2_\@:
+       add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)
+_encode_done_\@:
+       movdqu  \AAD_HASH, AadHash(%arg2)
+
+       movdqa  SHUF_MASK(%rip), %xmm10
+       # shuffle xmm9 back to output as ciphertext
+       PSHUFB_XMM      %xmm10, %xmm9
+       PSHUFB_XMM      %xmm2, %xmm9
+.endif
+       # output encrypted Bytes
+       cmp     $0, %r10
+       jl      _partial_fill_\@
+       mov     %r13, %r12
+       mov     $16, %r13
+       # Set r13 to be the number of bytes to write out
+       sub     %r12, %r13
+       jmp     _count_set_\@
+_partial_fill_\@:
+       mov     \PLAIN_CYPH_LEN, %r13
+_count_set_\@:
+       movdqa  %xmm9, %xmm0
+       MOVQ_R64_XMM    %xmm0, %rax
+       cmp     $8, %r13
+       jle     _less_than_8_bytes_left_\@
+
+       mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+       add     $8, \DATA_OFFSET
+       psrldq  $8, %xmm0
+       MOVQ_R64_XMM    %xmm0, %rax
+       sub     $8, %r13
+_less_than_8_bytes_left_\@:
+       movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+       add     $1, \DATA_OFFSET
+       shr     $8, %rax
+       sub     $1, %r13
+       jne     _less_than_8_bytes_left_\@
+_partial_block_done_\@:
+.endm # PARTIAL_BLOCK
 
 /*
 * if a = number of total plaintext bytes
@@ -499,49 +789,19 @@ _initial_blocks_done\num_initial_blocks\operation:
 * the ciphertext
 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 * are clobbered
-* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+* arg1, %arg2, %arg3 are used as pointers only, not modified
 */
 
 
-.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
-XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
-        MOVADQ     SHUF_MASK(%rip), %xmm14
-       mov        arg7, %r10           # %r10 = AAD
-       mov        arg8, %r11           # %r11 = aadLen
-       pxor       %xmm\i, %xmm\i
-       pxor       \XMM2, \XMM2
-
-       cmp        $16, %r11
-       jl         _get_AAD_rest\num_initial_blocks\operation
-_get_AAD_blocks\num_initial_blocks\operation:
-       movdqu     (%r10), %xmm\i
-       PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
-       pxor       %xmm\i, \XMM2
-       GHASH_MUL  \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
-       add        $16, %r10
-       sub        $16, %r11
-       cmp        $16, %r11
-       jge        _get_AAD_blocks\num_initial_blocks\operation
-
-       movdqu     \XMM2, %xmm\i
-
-       /* read the last <16B of AAD */
-_get_AAD_rest\num_initial_blocks\operation:
-       cmp        $0, %r11
-       je         _get_AAD_done\num_initial_blocks\operation
+.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+       XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+       MOVADQ          SHUF_MASK(%rip), %xmm14
 
-       READ_PARTIAL_BLOCK %r10, %r11, \TMP1, %xmm\i
-       PSHUFB_XMM   %xmm14, %xmm\i # byte-reflect the AAD data
-       pxor       \XMM2, %xmm\i
-       GHASH_MUL  %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+       movdqu AadHash(%arg2), %xmm\i               # load the running AAD hash
 
-_get_AAD_done\num_initial_blocks\operation:
-       xor        %r11, %r11 # initialise the data pointer offset as zero
        # start AES for num_initial_blocks blocks
 
-       mov        %arg5, %rax                      # %rax = *Y0
-       movdqu     (%rax), \XMM0                    # XMM0 = Y0
-       PSHUFB_XMM   %xmm14, \XMM0
+       movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
 
 .if (\i == 5) || (\i == 6) || (\i == 7)
 
@@ -549,7 +809,11 @@ _get_AAD_done\num_initial_blocks\operation:
        MOVADQ          0(%arg1),\TMP2
 .irpc index, \i_seq
        paddd           \TMP1, \XMM0                 # INCR Y0
+.ifc \operation, dec
+        movdqa     \XMM0, %xmm\index
+.else
        MOVADQ          \XMM0, %xmm\index
+.endif
        PSHUFB_XMM      %xmm14, %xmm\index      # perform a 16 byte swap
        pxor            \TMP2, %xmm\index
 .endr
@@ -558,25 +822,29 @@ _get_AAD_done\num_initial_blocks\operation:
        shr     $2,%eax                         # 128->4, 192->6, 256->8
        add     $5,%eax                       # 128->9, 192->11, 256->13
 
-aes_loop_initial_enc\num_initial_blocks:
+aes_loop_initial_\@:
        MOVADQ  (%r10),\TMP1
 .irpc  index, \i_seq
        AESENC  \TMP1, %xmm\index
 .endr
        add     $16,%r10
        sub     $1,%eax
-       jnz     aes_loop_initial_enc\num_initial_blocks
+       jnz     aes_loop_initial_\@
 
        MOVADQ  (%r10), \TMP1
 .irpc index, \i_seq
        AESENCLAST \TMP1, %xmm\index         # Last Round
 .endr
 .irpc index, \i_seq
-       movdqu     (%arg3 , %r11, 1), \TMP1
+       movdqu     (%arg4 , %r11, 1), \TMP1
        pxor       \TMP1, %xmm\index
-       movdqu     %xmm\index, (%arg2 , %r11, 1)
+       movdqu     %xmm\index, (%arg3 , %r11, 1)
        # write back plaintext/ciphertext for num_initial_blocks
        add        $16, %r11
+
+.ifc \operation, dec
+       movdqa     \TMP1, %xmm\index
+.endif
        PSHUFB_XMM         %xmm14, %xmm\index
 
                # prepare plaintext/ciphertext for GHASH computation
@@ -602,7 +870,7 @@ aes_loop_initial_enc\num_initial_blocks:
        GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 .endif
        cmp        $64, %r13
-       jl      _initial_blocks_done\num_initial_blocks\operation
+       jl      _initial_blocks_done\@
        # no need for precomputed values
 /*
 *
@@ -631,17 +899,6 @@ aes_loop_initial_enc\num_initial_blocks:
        pxor       \TMP1, \XMM2
        pxor       \TMP1, \XMM3
        pxor       \TMP1, \XMM4
-       movdqa     \TMP3, \TMP5
-       pshufd     $78, \TMP3, \TMP1
-       pxor       \TMP3, \TMP1
-       movdqa     \TMP1, HashKey_k(%rsp)
-       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^2<<1 (mod poly)
-       movdqa     \TMP5, HashKey_2(%rsp)
-# HashKey_2 = HashKey^2<<1 (mod poly)
-       pshufd     $78, \TMP5, \TMP1
-       pxor       \TMP5, \TMP1
-       movdqa     \TMP1, HashKey_2_k(%rsp)
 .irpc index, 1234 # do 4 rounds
        movaps 0x10*\index(%arg1), \TMP1
        AESENC     \TMP1, \XMM1
@@ -649,12 +906,6 @@ aes_loop_initial_enc\num_initial_blocks:
        AESENC     \TMP1, \XMM3
        AESENC     \TMP1, \XMM4
 .endr
-       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^3<<1 (mod poly)
-       movdqa     \TMP5, HashKey_3(%rsp)
-       pshufd     $78, \TMP5, \TMP1
-       pxor       \TMP5, \TMP1
-       movdqa     \TMP1, HashKey_3_k(%rsp)
 .irpc index, 56789 # do next 5 rounds
        movaps 0x10*\index(%arg1), \TMP1
        AESENC     \TMP1, \XMM1
@@ -662,45 +913,56 @@ aes_loop_initial_enc\num_initial_blocks:
        AESENC     \TMP1, \XMM3
        AESENC     \TMP1, \XMM4
 .endr
-       GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
-# TMP5 = HashKey^3<<1 (mod poly)
-       movdqa     \TMP5, HashKey_4(%rsp)
-       pshufd     $78, \TMP5, \TMP1
-       pxor       \TMP5, \TMP1
-       movdqa     \TMP1, HashKey_4_k(%rsp)
        lea        0xa0(%arg1),%r10
        mov        keysize,%eax
        shr        $2,%eax                      # 128->4, 192->6, 256->8
        sub        $4,%eax                      # 128->0, 192->2, 256->4
-       jz         aes_loop_pre_enc_done\num_initial_blocks
+       jz         aes_loop_pre_done\@
 
-aes_loop_pre_enc\num_initial_blocks:
+aes_loop_pre_\@:
        MOVADQ     (%r10),\TMP2
 .irpc  index, 1234
        AESENC     \TMP2, %xmm\index
 .endr
        add        $16,%r10
        sub        $1,%eax
-       jnz        aes_loop_pre_enc\num_initial_blocks
+       jnz        aes_loop_pre_\@
 
-aes_loop_pre_enc_done\num_initial_blocks:
+aes_loop_pre_done\@:
        MOVADQ     (%r10), \TMP2
        AESENCLAST \TMP2, \XMM1
        AESENCLAST \TMP2, \XMM2
        AESENCLAST \TMP2, \XMM3
        AESENCLAST \TMP2, \XMM4
-       movdqu     16*0(%arg3 , %r11 , 1), \TMP1
+       movdqu     16*0(%arg4 , %r11 , 1), \TMP1
        pxor       \TMP1, \XMM1
-       movdqu     16*1(%arg3 , %r11 , 1), \TMP1
+.ifc \operation, dec
+       movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
+       movdqa     \TMP1, \XMM1
+.endif
+       movdqu     16*1(%arg4 , %r11 , 1), \TMP1
        pxor       \TMP1, \XMM2
-       movdqu     16*2(%arg3 , %r11 , 1), \TMP1
+.ifc \operation, dec
+       movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
+       movdqa     \TMP1, \XMM2
+.endif
+       movdqu     16*2(%arg4 , %r11 , 1), \TMP1
        pxor       \TMP1, \XMM3
-       movdqu     16*3(%arg3 , %r11 , 1), \TMP1
+.ifc \operation, dec
+       movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
+       movdqa     \TMP1, \XMM3
+.endif
+       movdqu     16*3(%arg4 , %r11 , 1), \TMP1
        pxor       \TMP1, \XMM4
-       movdqu     \XMM1, 16*0(%arg2 , %r11 , 1)
-       movdqu     \XMM2, 16*1(%arg2 , %r11 , 1)
-       movdqu     \XMM3, 16*2(%arg2 , %r11 , 1)
-       movdqu     \XMM4, 16*3(%arg2 , %r11 , 1)
+.ifc \operation, dec
+       movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
+       movdqa     \TMP1, \XMM4
+.else
+       movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
+       movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
+       movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
+       movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
+.endif
 
        add        $64, %r11
        PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
@@ -710,14 +972,14 @@ aes_loop_pre_enc_done\num_initial_blocks:
        PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
        PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 
-_initial_blocks_done\num_initial_blocks\operation:
+_initial_blocks_done\@:
 
 .endm
 
 /*
 * encrypt 4 blocks at a time
 * ghash the 4 previously encrypted ciphertext blocks
-* arg1, %arg2, %arg3 are used as pointers only, not modified
+* arg1, %arg3, %arg4 are used as pointers only, not modified
 * %r11 is the data offset value
 */
 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
@@ -735,7 +997,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        pshufd    $78, \XMM5, \TMP6
        pxor      \XMM5, \TMP6
        paddd     ONE(%rip), \XMM0              # INCR CNT
-       movdqa    HashKey_4(%rsp), \TMP5
+       movdqa    HashKey_4(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
        movdqa    \XMM0, \XMM1
        paddd     ONE(%rip), \XMM0              # INCR CNT
@@ -754,7 +1016,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        pxor      (%arg1), \XMM2
        pxor      (%arg1), \XMM3
        pxor      (%arg1), \XMM4
-       movdqa    HashKey_4_k(%rsp), \TMP5
+       movdqa    HashKey_4_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
        movaps 0x10(%arg1), \TMP1
        AESENC    \TMP1, \XMM1              # Round 1
@@ -769,7 +1031,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        movdqa    \XMM6, \TMP1
        pshufd    $78, \XMM6, \TMP2
        pxor      \XMM6, \TMP2
-       movdqa    HashKey_3(%rsp), \TMP5
+       movdqa    HashKey_3(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
        movaps 0x30(%arg1), \TMP3
        AESENC    \TMP3, \XMM1              # Round 3
@@ -782,7 +1044,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
-       movdqa    HashKey_3_k(%rsp), \TMP5
+       movdqa    HashKey_3_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x50(%arg1), \TMP3
        AESENC    \TMP3, \XMM1              # Round 5
@@ -796,7 +1058,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        movdqa    \XMM7, \TMP1
        pshufd    $78, \XMM7, \TMP2
        pxor      \XMM7, \TMP2
-       movdqa    HashKey_2(%rsp ), \TMP5
+       movdqa    HashKey_2(%arg2), \TMP5
 
         # Multiply TMP5 * HashKey using karatsuba
 
@@ -812,7 +1074,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
-       movdqa    HashKey_2_k(%rsp), \TMP5
+       movdqa    HashKey_2_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x80(%arg1), \TMP3
        AESENC    \TMP3, \XMM1             # Round 8
@@ -830,7 +1092,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        movdqa    \XMM8, \TMP1
        pshufd    $78, \XMM8, \TMP2
        pxor      \XMM8, \TMP2
-       movdqa    HashKey(%rsp), \TMP5
+       movdqa    HashKey(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
        movaps 0x90(%arg1), \TMP3
        AESENC    \TMP3, \XMM1            # Round 9
@@ -842,37 +1104,37 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        mov       keysize,%eax
        shr       $2,%eax                       # 128->4, 192->6, 256->8
        sub       $4,%eax                       # 128->0, 192->2, 256->4
-       jz        aes_loop_par_enc_done
+       jz        aes_loop_par_enc_done\@
 
-aes_loop_par_enc:
+aes_loop_par_enc\@:
        MOVADQ    (%r10),\TMP3
 .irpc  index, 1234
        AESENC    \TMP3, %xmm\index
 .endr
        add       $16,%r10
        sub       $1,%eax
-       jnz       aes_loop_par_enc
+       jnz       aes_loop_par_enc\@
 
-aes_loop_par_enc_done:
+aes_loop_par_enc_done\@:
        MOVADQ    (%r10), \TMP3
        AESENCLAST \TMP3, \XMM1           # Round 10
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
-       movdqa    HashKey_k(%rsp), \TMP5
+       movdqa    HashKey_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
-       movdqu    (%arg3,%r11,1), \TMP3
+       movdqu    (%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
-       movdqu    16(%arg3,%r11,1), \TMP3
+       movdqu    16(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
-       movdqu    32(%arg3,%r11,1), \TMP3
+       movdqu    32(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
-       movdqu    48(%arg3,%r11,1), \TMP3
+       movdqu    48(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
-        movdqu    \XMM1, (%arg2,%r11,1)        # Write to the ciphertext buffer
-        movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to the ciphertext buffer
-        movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to the ciphertext buffer
-        movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
+        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
+        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
@@ -925,7 +1187,7 @@ aes_loop_par_enc_done:
 /*
 * decrypt 4 blocks at a time
 * ghash the 4 previously decrypted ciphertext blocks
-* arg1, %arg2, %arg3 are used as pointers only, not modified
+* arg1, %arg3, %arg4 are used as pointers only, not modified
 * %r11 is the data offset value
 */
 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
@@ -943,7 +1205,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        pshufd    $78, \XMM5, \TMP6
        pxor      \XMM5, \TMP6
        paddd     ONE(%rip), \XMM0              # INCR CNT
-       movdqa    HashKey_4(%rsp), \TMP5
+       movdqa    HashKey_4(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
        movdqa    \XMM0, \XMM1
        paddd     ONE(%rip), \XMM0              # INCR CNT
@@ -962,7 +1224,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        pxor      (%arg1), \XMM2
        pxor      (%arg1), \XMM3
        pxor      (%arg1), \XMM4
-       movdqa    HashKey_4_k(%rsp), \TMP5
+       movdqa    HashKey_4_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
        movaps 0x10(%arg1), \TMP1
        AESENC    \TMP1, \XMM1              # Round 1
@@ -977,7 +1239,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        movdqa    \XMM6, \TMP1
        pshufd    $78, \XMM6, \TMP2
        pxor      \XMM6, \TMP2
-       movdqa    HashKey_3(%rsp), \TMP5
+       movdqa    HashKey_3(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
        movaps 0x30(%arg1), \TMP3
        AESENC    \TMP3, \XMM1              # Round 3
@@ -990,7 +1252,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
-       movdqa    HashKey_3_k(%rsp), \TMP5
+       movdqa    HashKey_3_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x50(%arg1), \TMP3
        AESENC    \TMP3, \XMM1              # Round 5
@@ -1004,7 +1266,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        movdqa    \XMM7, \TMP1
        pshufd    $78, \XMM7, \TMP2
        pxor      \XMM7, \TMP2
-       movdqa    HashKey_2(%rsp ), \TMP5
+       movdqa    HashKey_2(%arg2), \TMP5
 
         # Multiply TMP5 * HashKey using karatsuba
 
@@ -1020,7 +1282,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        AESENC    \TMP3, \XMM2
        AESENC    \TMP3, \XMM3
        AESENC    \TMP3, \XMM4
-       movdqa    HashKey_2_k(%rsp), \TMP5
+       movdqa    HashKey_2_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
        movaps 0x80(%arg1), \TMP3
        AESENC    \TMP3, \XMM1             # Round 8
@@ -1038,7 +1300,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        movdqa    \XMM8, \TMP1
        pshufd    $78, \XMM8, \TMP2
        pxor      \XMM8, \TMP2
-       movdqa    HashKey(%rsp), \TMP5
+       movdqa    HashKey(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
        movaps 0x90(%arg1), \TMP3
        AESENC    \TMP3, \XMM1            # Round 9
@@ -1050,40 +1312,40 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
        mov       keysize,%eax
        shr       $2,%eax                       # 128->4, 192->6, 256->8
        sub       $4,%eax                       # 128->0, 192->2, 256->4
-       jz        aes_loop_par_dec_done
+       jz        aes_loop_par_dec_done\@
 
-aes_loop_par_dec:
+aes_loop_par_dec\@:
        MOVADQ    (%r10),\TMP3
 .irpc  index, 1234
        AESENC    \TMP3, %xmm\index
 .endr
        add       $16,%r10
        sub       $1,%eax
-       jnz       aes_loop_par_dec
+       jnz       aes_loop_par_dec\@
 
-aes_loop_par_dec_done:
+aes_loop_par_dec_done\@:
        MOVADQ    (%r10), \TMP3
        AESENCLAST \TMP3, \XMM1           # last round
        AESENCLAST \TMP3, \XMM2
        AESENCLAST \TMP3, \XMM3
        AESENCLAST \TMP3, \XMM4
-       movdqa    HashKey_k(%rsp), \TMP5
+       movdqa    HashKey_k(%arg2), \TMP5
        PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
-       movdqu    (%arg3,%r11,1), \TMP3
+       movdqu    (%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
-       movdqu    \XMM1, (%arg2,%r11,1)        # Write to plaintext buffer
+       movdqu    \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
        movdqa    \TMP3, \XMM1
-       movdqu    16(%arg3,%r11,1), \TMP3
+       movdqu    16(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
-       movdqu    \XMM2, 16(%arg2,%r11,1)      # Write to plaintext buffer
+       movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
        movdqa    \TMP3, \XMM2
-       movdqu    32(%arg3,%r11,1), \TMP3
+       movdqu    32(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
-       movdqu    \XMM3, 32(%arg2,%r11,1)      # Write to plaintext buffer
+       movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
        movdqa    \TMP3, \XMM3
-       movdqu    48(%arg3,%r11,1), \TMP3
+       movdqu    48(%arg4,%r11,1), \TMP3
        pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
-       movdqu    \XMM4, 48(%arg2,%r11,1)      # Write to plaintext buffer
+       movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
        movdqa    \TMP3, \XMM4
        PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
        PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
@@ -1143,10 +1405,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
        movdqa    \XMM1, \TMP6
        pshufd    $78, \XMM1, \TMP2
        pxor      \XMM1, \TMP2
-       movdqa    HashKey_4(%rsp), \TMP5
+       movdqa    HashKey_4(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
-       movdqa    HashKey_4_k(%rsp), \TMP4
+       movdqa    HashKey_4_k(%arg2), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
        movdqa    \XMM1, \XMMDst
        movdqa    \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
@@ -1156,10 +1418,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
        movdqa    \XMM2, \TMP1
        pshufd    $78, \XMM2, \TMP2
        pxor      \XMM2, \TMP2
-       movdqa    HashKey_3(%rsp), \TMP5
+       movdqa    HashKey_3(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
-       movdqa    HashKey_3_k(%rsp), \TMP4
+       movdqa    HashKey_3_k(%arg2), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM2, \XMMDst
@@ -1171,10 +1433,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
        movdqa    \XMM3, \TMP1
        pshufd    $78, \XMM3, \TMP2
        pxor      \XMM3, \TMP2
-       movdqa    HashKey_2(%rsp), \TMP5
+       movdqa    HashKey_2(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
-       movdqa    HashKey_2_k(%rsp), \TMP4
+       movdqa    HashKey_2_k(%arg2), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM3, \XMMDst
@@ -1184,10 +1446,10 @@ TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
        movdqa    \XMM4, \TMP1
        pshufd    $78, \XMM4, \TMP2
        pxor      \XMM4, \TMP2
-       movdqa    HashKey(%rsp), \TMP5
+       movdqa    HashKey(%arg2), \TMP5
        PCLMULQDQ 0x11, \TMP5, \TMP1        # TMP1 = a1*b1
        PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
-       movdqa    HashKey_k(%rsp), \TMP4
+       movdqa    HashKey_k(%arg2), \TMP4
        PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
        pxor      \TMP1, \TMP6
        pxor      \XMM4, \XMMDst
@@ -1256,6 +1518,8 @@ _esb_loop_\@:
 .endm
 /*****************************************************************************
 * void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
+*                   struct gcm_context_data *data,
+*                                      // Context data
 *                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
 *                   const u8 *in,      // Ciphertext input
 *                   u64 plaintext_len, // Length of data in bytes for decryption.
@@ -1333,195 +1597,20 @@ _esb_loop_\@:
 *
 *****************************************************************************/
 ENTRY(aesni_gcm_dec)
-       push    %r12
-       push    %r13
-       push    %r14
-       mov     %rsp, %r14
-/*
-* states of %xmm registers %xmm6:%xmm15 not saved
-* all %xmm registers are clobbered
-*/
-       sub     $VARIABLE_OFFSET, %rsp
-       and     $~63, %rsp                        # align rsp to 64 bytes
-       mov     %arg6, %r12
-       movdqu  (%r12), %xmm13                    # %xmm13 = HashKey
-        movdqa  SHUF_MASK(%rip), %xmm2
-       PSHUFB_XMM %xmm2, %xmm13
-
-
-# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
-
-       movdqa  %xmm13, %xmm2
-       psllq   $1, %xmm13
-       psrlq   $63, %xmm2
-       movdqa  %xmm2, %xmm1
-       pslldq  $8, %xmm2
-       psrldq  $8, %xmm1
-       por     %xmm2, %xmm13
-
-        # Reduction
-
-       pshufd  $0x24, %xmm1, %xmm2
-       pcmpeqd TWOONE(%rip), %xmm2
-       pand    POLY(%rip), %xmm2
-       pxor    %xmm2, %xmm13     # %xmm13 holds the HashKey<<1 (mod poly)
-
-
-        # Decrypt first few blocks
-
-       movdqa %xmm13, HashKey(%rsp)           # store HashKey<<1 (mod poly)
-       mov %arg4, %r13    # save the number of bytes of plaintext/ciphertext
-       and $-16, %r13                      # %r13 = %r13 - (%r13 mod 16)
-       mov %r13, %r12
-       and $(3<<4), %r12
-       jz _initial_num_blocks_is_0_decrypt
-       cmp $(2<<4), %r12
-       jb _initial_num_blocks_is_1_decrypt
-       je _initial_num_blocks_is_2_decrypt
-_initial_num_blocks_is_3_decrypt:
-       INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
-       sub     $48, %r13
-       jmp     _initial_blocks_decrypted
-_initial_num_blocks_is_2_decrypt:
-       INITIAL_BLOCKS_DEC      2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
-       sub     $32, %r13
-       jmp     _initial_blocks_decrypted
-_initial_num_blocks_is_1_decrypt:
-       INITIAL_BLOCKS_DEC      1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
-       sub     $16, %r13
-       jmp     _initial_blocks_decrypted
-_initial_num_blocks_is_0_decrypt:
-       INITIAL_BLOCKS_DEC      0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
-_initial_blocks_decrypted:
-       cmp     $0, %r13
-       je      _zero_cipher_left_decrypt
-       sub     $64, %r13
-       je      _four_cipher_left_decrypt
-_decrypt_by_4:
-       GHASH_4_ENCRYPT_4_PARALLEL_DEC  %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
-%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
-       add     $64, %r11
-       sub     $64, %r13
-       jne     _decrypt_by_4
-_four_cipher_left_decrypt:
-       GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
-%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
-_zero_cipher_left_decrypt:
-       mov     %arg4, %r13
-       and     $15, %r13                               # %r13 = arg4 (mod 16)
-       je      _multiple_of_16_bytes_decrypt
-
-        # Handle the last <16 byte block separately
+       FUNC_SAVE
 
-       paddd ONE(%rip), %xmm0         # increment CNT to get Yn
-        movdqa SHUF_MASK(%rip), %xmm10
-       PSHUFB_XMM %xmm10, %xmm0
-
-       ENCRYPT_SINGLE_BLOCK  %xmm0, %xmm1    # E(K, Yn)
-
-       lea (%arg3,%r11,1), %r10
-       mov %r13, %r12
-       READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
-
-       lea ALL_F+16(%rip), %r12
-       sub %r13, %r12
-       movdqa  %xmm1, %xmm2
-       pxor %xmm1, %xmm0            # Ciphertext XOR E(K, Yn)
-       movdqu (%r12), %xmm1
-       # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
-       pand %xmm1, %xmm0            # mask out top 16-%r13 bytes of %xmm0
-       pand    %xmm1, %xmm2
-        movdqa SHUF_MASK(%rip), %xmm10
-       PSHUFB_XMM %xmm10 ,%xmm2
-
-       pxor %xmm2, %xmm8
-       GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-
-        # output %r13 bytes
-       MOVQ_R64_XMM    %xmm0, %rax
-       cmp     $8, %r13
-       jle     _less_than_8_bytes_left_decrypt
-       mov     %rax, (%arg2 , %r11, 1)
-       add     $8, %r11
-       psrldq  $8, %xmm0
-       MOVQ_R64_XMM    %xmm0, %rax
-       sub     $8, %r13
-_less_than_8_bytes_left_decrypt:
-       mov     %al,  (%arg2, %r11, 1)
-       add     $1, %r11
-       shr     $8, %rax
-       sub     $1, %r13
-       jne     _less_than_8_bytes_left_decrypt
-_multiple_of_16_bytes_decrypt:
-       mov     arg8, %r12                # %r13 = aadLen (number of bytes)
-       shl     $3, %r12                  # convert into number of bits
-       movd    %r12d, %xmm15             # len(A) in %xmm15
-       shl     $3, %arg4                 # len(C) in bits (*128)
-       MOVQ_R64_XMM    %arg4, %xmm1
-       pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
-       pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
-       pxor    %xmm15, %xmm8
-       GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-                # final GHASH computation
-        movdqa SHUF_MASK(%rip), %xmm10
-       PSHUFB_XMM %xmm10, %xmm8
-
-       mov     %arg5, %rax               # %rax = *Y0
-       movdqu  (%rax), %xmm0             # %xmm0 = Y0
-       ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
-       pxor    %xmm8, %xmm0
-_return_T_decrypt:
-       mov     arg9, %r10                # %r10 = authTag
-       mov     arg10, %r11               # %r11 = auth_tag_len
-       cmp     $16, %r11
-       je      _T_16_decrypt
-       cmp     $8, %r11
-       jl      _T_4_decrypt
-_T_8_decrypt:
-       MOVQ_R64_XMM    %xmm0, %rax
-       mov     %rax, (%r10)
-       add     $8, %r10
-       sub     $8, %r11
-       psrldq  $8, %xmm0
-       cmp     $0, %r11
-       je      _return_T_done_decrypt
-_T_4_decrypt:
-       movd    %xmm0, %eax
-       mov     %eax, (%r10)
-       add     $4, %r10
-       sub     $4, %r11
-       psrldq  $4, %xmm0
-       cmp     $0, %r11
-       je      _return_T_done_decrypt
-_T_123_decrypt:
-       movd    %xmm0, %eax
-       cmp     $2, %r11
-       jl      _T_1_decrypt
-       mov     %ax, (%r10)
-       cmp     $2, %r11
-       je      _return_T_done_decrypt
-       add     $2, %r10
-       sar     $16, %eax
-_T_1_decrypt:
-       mov     %al, (%r10)
-       jmp     _return_T_done_decrypt
-_T_16_decrypt:
-       movdqu  %xmm0, (%r10)
-_return_T_done_decrypt:
-       mov     %r14, %rsp
-       pop     %r14
-       pop     %r13
-       pop     %r12
+       GCM_INIT %arg6, arg7, arg8, arg9
+       GCM_ENC_DEC dec
+       GCM_COMPLETE arg10, arg11
+       FUNC_RESTORE
        ret
 ENDPROC(aesni_gcm_dec)
 
 
 /*****************************************************************************
 * void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
+*                    struct gcm_context_data *data,
+*                                        // Context data
 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
 *                    const u8 *in,       // Plaintext input
 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
@@ -1596,195 +1685,78 @@ ENDPROC(aesni_gcm_dec)
 * poly = x^128 + x^127 + x^126 + x^121 + 1
 ***************************************************************************/
 ENTRY(aesni_gcm_enc)
-       push    %r12
-       push    %r13
-       push    %r14
-       mov     %rsp, %r14
-#
-# states of %xmm registers %xmm6:%xmm15 not saved
-# all %xmm registers are clobbered
-#
-       sub     $VARIABLE_OFFSET, %rsp
-       and     $~63, %rsp
-       mov     %arg6, %r12
-       movdqu  (%r12), %xmm13
-        movdqa  SHUF_MASK(%rip), %xmm2
-       PSHUFB_XMM %xmm2, %xmm13
-
-
-# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
-
-       movdqa  %xmm13, %xmm2
-       psllq   $1, %xmm13
-       psrlq   $63, %xmm2
-       movdqa  %xmm2, %xmm1
-       pslldq  $8, %xmm2
-       psrldq  $8, %xmm1
-       por     %xmm2, %xmm13
-
-        # reduce HashKey<<1
-
-       pshufd  $0x24, %xmm1, %xmm2
-       pcmpeqd TWOONE(%rip), %xmm2
-       pand    POLY(%rip), %xmm2
-       pxor    %xmm2, %xmm13
-       movdqa  %xmm13, HashKey(%rsp)
-       mov     %arg4, %r13            # %xmm13 holds HashKey<<1 (mod poly)
-       and     $-16, %r13
-       mov     %r13, %r12
-
-        # Encrypt first few blocks
-
-       and     $(3<<4), %r12
-       jz      _initial_num_blocks_is_0_encrypt
-       cmp     $(2<<4), %r12
-       jb      _initial_num_blocks_is_1_encrypt
-       je      _initial_num_blocks_is_2_encrypt
-_initial_num_blocks_is_3_encrypt:
-       INITIAL_BLOCKS_ENC      3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
-       sub     $48, %r13
-       jmp     _initial_blocks_encrypted
-_initial_num_blocks_is_2_encrypt:
-       INITIAL_BLOCKS_ENC      2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
-       sub     $32, %r13
-       jmp     _initial_blocks_encrypted
-_initial_num_blocks_is_1_encrypt:
-       INITIAL_BLOCKS_ENC      1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
-       sub     $16, %r13
-       jmp     _initial_blocks_encrypted
-_initial_num_blocks_is_0_encrypt:
-       INITIAL_BLOCKS_ENC      0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
-_initial_blocks_encrypted:
-
-        # Main loop - Encrypt remaining blocks
-
-       cmp     $0, %r13
-       je      _zero_cipher_left_encrypt
-       sub     $64, %r13
-       je      _four_cipher_left_encrypt
-_encrypt_by_4_encrypt:
-       GHASH_4_ENCRYPT_4_PARALLEL_ENC  %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
-%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
-       add     $64, %r11
-       sub     $64, %r13
-       jne     _encrypt_by_4_encrypt
-_four_cipher_left_encrypt:
-       GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
-%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
-_zero_cipher_left_encrypt:
-       mov     %arg4, %r13
-       and     $15, %r13                       # %r13 = arg4 (mod 16)
-       je      _multiple_of_16_bytes_encrypt
+       FUNC_SAVE
 
-         # Handle the last <16 Byte block separately
-       paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
-        movdqa SHUF_MASK(%rip), %xmm10
-       PSHUFB_XMM %xmm10, %xmm0
-
-       ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
+       GCM_INIT %arg6, arg7, arg8, arg9
+       GCM_ENC_DEC enc
 
-       lea (%arg3,%r11,1), %r10
-       mov %r13, %r12
-       READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
-
-       lea ALL_F+16(%rip), %r12
-       sub %r13, %r12
-       pxor    %xmm1, %xmm0            # Plaintext XOR Encrypt(K, Yn)
-       movdqu  (%r12), %xmm1
-       # get the appropriate mask to mask out top 16-r13 bytes of xmm0
-       pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
-        movdqa SHUF_MASK(%rip), %xmm10
-       PSHUFB_XMM %xmm10,%xmm0
+       GCM_COMPLETE arg10, arg11
+       FUNC_RESTORE
+       ret
+ENDPROC(aesni_gcm_enc)
 
-       pxor    %xmm0, %xmm8
-       GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-       # GHASH computation for the last <16 byte block
-       movdqa SHUF_MASK(%rip), %xmm10
-       PSHUFB_XMM %xmm10, %xmm0
+/*****************************************************************************
+* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
+*                     struct gcm_context_data *data,
+*                                         // context data
+*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
+*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
+*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
+*                     const u8 *aad,      // Additional Authentication Data (AAD)
+*                     u64 aad_len)        // Length of AAD in bytes.
+*/
+ENTRY(aesni_gcm_init)
+       FUNC_SAVE
+       GCM_INIT %arg3, %arg4,%arg5, %arg6
+       FUNC_RESTORE
+       ret
+ENDPROC(aesni_gcm_init)
 
-       # shuffle xmm0 back to output as ciphertext
+/*****************************************************************************
+* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
+*                    struct gcm_context_data *data,
+*                                        // context data
+*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
+*                    const u8 *in,       // Plaintext input
+*                    u64 plaintext_len,  // Length of data in bytes for encryption.
+*/
+ENTRY(aesni_gcm_enc_update)
+       FUNC_SAVE
+       GCM_ENC_DEC enc
+       FUNC_RESTORE
+       ret
+ENDPROC(aesni_gcm_enc_update)
 
-        # Output %r13 bytes
-       MOVQ_R64_XMM %xmm0, %rax
-       cmp $8, %r13
-       jle _less_than_8_bytes_left_encrypt
-       mov %rax, (%arg2 , %r11, 1)
-       add $8, %r11
-       psrldq $8, %xmm0
-       MOVQ_R64_XMM %xmm0, %rax
-       sub $8, %r13
-_less_than_8_bytes_left_encrypt:
-       mov %al,  (%arg2, %r11, 1)
-       add $1, %r11
-       shr $8, %rax
-       sub $1, %r13
-       jne _less_than_8_bytes_left_encrypt
-_multiple_of_16_bytes_encrypt:
-       mov     arg8, %r12    # %r12 = addLen (number of bytes)
-       shl     $3, %r12
-       movd    %r12d, %xmm15       # len(A) in %xmm15
-       shl     $3, %arg4               # len(C) in bits (*128)
-       MOVQ_R64_XMM    %arg4, %xmm1
-       pslldq  $8, %xmm15          # %xmm15 = len(A)||0x0000000000000000
-       pxor    %xmm1, %xmm15       # %xmm15 = len(A)||len(C)
-       pxor    %xmm15, %xmm8
-       GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
-       # final GHASH computation
-        movdqa SHUF_MASK(%rip), %xmm10
-       PSHUFB_XMM %xmm10, %xmm8         # perform a 16 byte swap
+/*****************************************************************************
+* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
+*                    struct gcm_context_data *data,
+*                                        // context data
+*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
+*                    const u8 *in,       // Ciphertext input
+*                    u64 ciphertext_len, // Length of data in bytes for decryption.
+*/
+ENTRY(aesni_gcm_dec_update)
+       FUNC_SAVE
+       GCM_ENC_DEC dec
+       FUNC_RESTORE
+       ret
+ENDPROC(aesni_gcm_dec_update)
 
-       mov     %arg5, %rax                    # %rax  = *Y0
-       movdqu  (%rax), %xmm0                  # %xmm0 = Y0
-       ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm15         # Encrypt(K, Y0)
-       pxor    %xmm8, %xmm0
-_return_T_encrypt:
-       mov     arg9, %r10                     # %r10 = authTag
-       mov     arg10, %r11                    # %r11 = auth_tag_len
-       cmp     $16, %r11
-       je      _T_16_encrypt
-       cmp     $8, %r11
-       jl      _T_4_encrypt
-_T_8_encrypt:
-       MOVQ_R64_XMM    %xmm0, %rax
-       mov     %rax, (%r10)
-       add     $8, %r10
-       sub     $8, %r11
-       psrldq  $8, %xmm0
-       cmp     $0, %r11
-       je      _return_T_done_encrypt
-_T_4_encrypt:
-       movd    %xmm0, %eax
-       mov     %eax, (%r10)
-       add     $4, %r10
-       sub     $4, %r11
-       psrldq  $4, %xmm0
-       cmp     $0, %r11
-       je      _return_T_done_encrypt
-_T_123_encrypt:
-       movd    %xmm0, %eax
-       cmp     $2, %r11
-       jl      _T_1_encrypt
-       mov     %ax, (%r10)
-       cmp     $2, %r11
-       je      _return_T_done_encrypt
-       add     $2, %r10
-       sar     $16, %eax
-_T_1_encrypt:
-       mov     %al, (%r10)
-       jmp     _return_T_done_encrypt
-_T_16_encrypt:
-       movdqu  %xmm0, (%r10)
-_return_T_done_encrypt:
-       mov     %r14, %rsp
-       pop     %r14
-       pop     %r13
-       pop     %r12
+/*****************************************************************************
+* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
+*                    struct gcm_context_data *data,
+*                                        // context data
+*                    u8 *auth_tag,       // Authenticated Tag output.
+*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
+*                                        // 12 or 8.
+*/
+ENTRY(aesni_gcm_finalize)
+       FUNC_SAVE
+       GCM_COMPLETE %arg3 %arg4
+       FUNC_RESTORE
        ret
-ENDPROC(aesni_gcm_enc)
+ENDPROC(aesni_gcm_finalize)
 
 #endif
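
Editor's note: the four entry points above replace the one-shot aesni_gcm_enc()/aesni_gcm_dec()
flow with an init/update/finalize sequence, which is what lets the glue code process a
scatterlist in arbitrary chunks. A minimal sketch of the intended call order for a single
linear buffer, modelled on the gcmaes_crypt_by_sg() caller added in the aesni-intel_glue.c
hunks below (the local names aes_ctx, iv, hash_subkey, assoc, assoclen, src, dst and len are
illustrative, not part of the patch):

	/* Sketch only: key schedule, IV, hash subkey and buffers set up as in the glue code. */
	struct gcm_context_data data AESNI_ALIGN_ATTR;	/* keep 16-byte aligned, see glue */
	unsigned long auth_tag_len = 16;		/* 16, 12 or 8 */
	u8 auth_tag[16];

	kernel_fpu_begin();				/* the asm clobbers XMM state */
	aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, assoc, assoclen);
	aesni_gcm_enc_update(aes_ctx, &data, dst, src, len);	/* may be repeated per chunk */
	aesni_gcm_finalize(aes_ctx, &data, auth_tag, auth_tag_len);
	kernel_fpu_end();
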
 
index 34cf1c1f8c983367a0443d1343e718e9ded32c05..acbe7e8336d8556c272f915c7543d02132a852d3 100644
@@ -72,6 +72,21 @@ struct aesni_xts_ctx {
        u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR;
 };
 
+#define GCM_BLOCK_LEN 16
+
+struct gcm_context_data {
+       /* init, update and finalize context data */
+       u8 aad_hash[GCM_BLOCK_LEN];
+       u64 aad_length;
+       u64 in_length;
+       u8 partial_block_enc_key[GCM_BLOCK_LEN];
+       u8 orig_IV[GCM_BLOCK_LEN];
+       u8 current_counter[GCM_BLOCK_LEN];
+       u64 partial_block_len;
+       u64 unused;
+       u8 hash_keys[GCM_BLOCK_LEN * 8];
+};
+
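Editor's note: the asm hunks above now address the precomputed hash keys through this
structure (movdqa HashKey_*(%arg2), where %arg2 is the context pointer), and movdqa requires
16-byte aligned memory, which is presumably why the glue code declares the context with
AESNI_ALIGN_ATTR rather than as a plain local:

	/* as declared by gcmaes_crypt_by_sg() and gcmaes_encrypt()/gcmaes_decrypt() below */
	struct gcm_context_data data AESNI_ALIGN_ATTR;
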
 asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
                             unsigned int key_len);
 asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out,
@@ -105,6 +120,7 @@ asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
 
 /* asmlinkage void aesni_gcm_enc()
  * void *ctx,  AES Key schedule. Starts on a 16 byte boundary.
+ * struct gcm_context_data *data,  Context data.  May be uninitialized.
  * u8 *out, Ciphertext output. Encrypt in-place is allowed.
  * const u8 *in, Plaintext input
  * unsigned long plaintext_len, Length of data in bytes for encryption.
@@ -117,13 +133,15 @@ asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
  * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
  *          Valid values are 16 (most likely), 12 or 8.
  */
-asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_enc(void *ctx,
+                       struct gcm_context_data *gdata, u8 *out,
                        const u8 *in, unsigned long plaintext_len, u8 *iv,
                        u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
                        u8 *auth_tag, unsigned long auth_tag_len);
 
 /* asmlinkage void aesni_gcm_dec()
  * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * struct gcm_context_data *data,  Context data.  May be uninitialized.
  * u8 *out, Plaintext output. Decrypt in-place is allowed.
  * const u8 *in, Ciphertext input
  * unsigned long ciphertext_len, Length of data in bytes for decryption.
@@ -137,11 +155,28 @@ asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
  * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
  * Valid values are 16 (most likely), 12 or 8.
  */
-asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_dec(void *ctx,
+                       struct gcm_context_data *gdata, u8 *out,
                        const u8 *in, unsigned long ciphertext_len, u8 *iv,
                        u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
                        u8 *auth_tag, unsigned long auth_tag_len);
 
+/* Scatter / Gather routines, with args similar to above */
+asmlinkage void aesni_gcm_init(void *ctx,
+                              struct gcm_context_data *gdata,
+                              u8 *iv,
+                              u8 *hash_subkey, const u8 *aad,
+                              unsigned long aad_len);
+asmlinkage void aesni_gcm_enc_update(void *ctx,
+                                    struct gcm_context_data *gdata, u8 *out,
+                                    const u8 *in, unsigned long plaintext_len);
+asmlinkage void aesni_gcm_dec_update(void *ctx,
+                                    struct gcm_context_data *gdata, u8 *out,
+                                    const u8 *in,
+                                    unsigned long ciphertext_len);
+asmlinkage void aesni_gcm_finalize(void *ctx,
+                                  struct gcm_context_data *gdata,
+                                  u8 *auth_tag, unsigned long auth_tag_len);
 
 #ifdef CONFIG_AS_AVX
 asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
@@ -167,15 +202,17 @@ asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
                        const u8 *aad, unsigned long aad_len,
                        u8 *auth_tag, unsigned long auth_tag_len);
 
-static void aesni_gcm_enc_avx(void *ctx, u8 *out,
+static void aesni_gcm_enc_avx(void *ctx,
+                       struct gcm_context_data *data, u8 *out,
                        const u8 *in, unsigned long plaintext_len, u8 *iv,
                        u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
                        u8 *auth_tag, unsigned long auth_tag_len)
 {
         struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
        if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
-               aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
-                               aad_len, auth_tag, auth_tag_len);
+               aesni_gcm_enc(ctx, data, out, in,
+                       plaintext_len, iv, hash_subkey, aad,
+                       aad_len, auth_tag, auth_tag_len);
        } else {
                aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
                aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
@@ -183,15 +220,17 @@ static void aesni_gcm_enc_avx(void *ctx, u8 *out,
        }
 }
 
-static void aesni_gcm_dec_avx(void *ctx, u8 *out,
+static void aesni_gcm_dec_avx(void *ctx,
+                       struct gcm_context_data *data, u8 *out,
                        const u8 *in, unsigned long ciphertext_len, u8 *iv,
                        u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
                        u8 *auth_tag, unsigned long auth_tag_len)
 {
         struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
        if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
-               aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad,
-                               aad_len, auth_tag, auth_tag_len);
+               aesni_gcm_dec(ctx, data, out, in,
+                       ciphertext_len, iv, hash_subkey, aad,
+                       aad_len, auth_tag, auth_tag_len);
        } else {
                aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
                aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
@@ -218,15 +257,17 @@ asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
                        const u8 *aad, unsigned long aad_len,
                        u8 *auth_tag, unsigned long auth_tag_len);
 
-static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
+static void aesni_gcm_enc_avx2(void *ctx,
+                       struct gcm_context_data *data, u8 *out,
                        const u8 *in, unsigned long plaintext_len, u8 *iv,
                        u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
                        u8 *auth_tag, unsigned long auth_tag_len)
 {
        struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
        if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
-               aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad,
-                               aad_len, auth_tag, auth_tag_len);
+               aesni_gcm_enc(ctx, data, out, in,
+                             plaintext_len, iv, hash_subkey, aad,
+                             aad_len, auth_tag, auth_tag_len);
        } else if (plaintext_len < AVX_GEN4_OPTSIZE) {
                aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
                aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
@@ -238,15 +279,17 @@ static void aesni_gcm_enc_avx2(void *ctx, u8 *out,
        }
 }
 
-static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
+static void aesni_gcm_dec_avx2(void *ctx,
+       struct gcm_context_data *data, u8 *out,
                        const u8 *in, unsigned long ciphertext_len, u8 *iv,
                        u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
                        u8 *auth_tag, unsigned long auth_tag_len)
 {
        struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
        if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
-               aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey,
-                               aad, aad_len, auth_tag, auth_tag_len);
+               aesni_gcm_dec(ctx, data, out, in,
+                             ciphertext_len, iv, hash_subkey,
+                             aad, aad_len, auth_tag, auth_tag_len);
        } else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
                aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
                aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
@@ -259,15 +302,19 @@ static void aesni_gcm_dec_avx2(void *ctx, u8 *out,
 }
 #endif
 
-static void (*aesni_gcm_enc_tfm)(void *ctx, u8 *out,
-                       const u8 *in, unsigned long plaintext_len, u8 *iv,
-                       u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
-                       u8 *auth_tag, unsigned long auth_tag_len);
+static void (*aesni_gcm_enc_tfm)(void *ctx,
+                                struct gcm_context_data *data, u8 *out,
+                                const u8 *in, unsigned long plaintext_len,
+                                u8 *iv, u8 *hash_subkey, const u8 *aad,
+                                unsigned long aad_len, u8 *auth_tag,
+                                unsigned long auth_tag_len);
 
-static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out,
-                       const u8 *in, unsigned long ciphertext_len, u8 *iv,
-                       u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
-                       u8 *auth_tag, unsigned long auth_tag_len);
+static void (*aesni_gcm_dec_tfm)(void *ctx,
+                                struct gcm_context_data *data, u8 *out,
+                                const u8 *in, unsigned long ciphertext_len,
+                                u8 *iv, u8 *hash_subkey, const u8 *aad,
+                                unsigned long aad_len, u8 *auth_tag,
+                                unsigned long auth_tag_len);
 
 static inline struct
 aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
@@ -744,6 +791,127 @@ static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
        return 0;
 }
 
+static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
+                             unsigned int assoclen, u8 *hash_subkey,
+                             u8 *iv, void *aes_ctx)
+{
+       struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+       unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+       struct gcm_context_data data AESNI_ALIGN_ATTR;
+       struct scatter_walk dst_sg_walk = {};
+       unsigned long left = req->cryptlen;
+       unsigned long len, srclen, dstlen;
+       struct scatter_walk assoc_sg_walk;
+       struct scatter_walk src_sg_walk;
+       struct scatterlist src_start[2];
+       struct scatterlist dst_start[2];
+       struct scatterlist *src_sg;
+       struct scatterlist *dst_sg;
+       u8 *src, *dst, *assoc;
+       u8 *assocmem = NULL;
+       u8 authTag[16];
+
+       if (!enc)
+               left -= auth_tag_len;
+
+       /* Linearize assoc, if not already linear */
+       if (req->src->length >= assoclen && req->src->length &&
+               (!PageHighMem(sg_page(req->src)) ||
+                       req->src->offset + req->src->length < PAGE_SIZE)) {
+               scatterwalk_start(&assoc_sg_walk, req->src);
+               assoc = scatterwalk_map(&assoc_sg_walk);
+       } else {
+               /* assoc can be any length, so must be on heap */
+               assocmem = kmalloc(assoclen, GFP_ATOMIC);
+               if (unlikely(!assocmem))
+                       return -ENOMEM;
+               assoc = assocmem;
+
+               scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
+       }
+
+       src_sg = scatterwalk_ffwd(src_start, req->src, req->assoclen);
+       scatterwalk_start(&src_sg_walk, src_sg);
+       if (req->src != req->dst) {
+               dst_sg = scatterwalk_ffwd(dst_start, req->dst, req->assoclen);
+               scatterwalk_start(&dst_sg_walk, dst_sg);
+       }
+
+       kernel_fpu_begin();
+       aesni_gcm_init(aes_ctx, &data, iv,
+               hash_subkey, assoc, assoclen);
+       if (req->src != req->dst) {
+               while (left) {
+                       src = scatterwalk_map(&src_sg_walk);
+                       dst = scatterwalk_map(&dst_sg_walk);
+                       srclen = scatterwalk_clamp(&src_sg_walk, left);
+                       dstlen = scatterwalk_clamp(&dst_sg_walk, left);
+                       len = min(srclen, dstlen);
+                       if (len) {
+                               if (enc)
+                                       aesni_gcm_enc_update(aes_ctx, &data,
+                                                            dst, src, len);
+                               else
+                                       aesni_gcm_dec_update(aes_ctx, &data,
+                                                            dst, src, len);
+                       }
+                       left -= len;
+
+                       scatterwalk_unmap(src);
+                       scatterwalk_unmap(dst);
+                       scatterwalk_advance(&src_sg_walk, len);
+                       scatterwalk_advance(&dst_sg_walk, len);
+                       scatterwalk_done(&src_sg_walk, 0, left);
+                       scatterwalk_done(&dst_sg_walk, 1, left);
+               }
+       } else {
+               while (left) {
+                       dst = src = scatterwalk_map(&src_sg_walk);
+                       len = scatterwalk_clamp(&src_sg_walk, left);
+                       if (len) {
+                               if (enc)
+                                       aesni_gcm_enc_update(aes_ctx, &data,
+                                                            src, src, len);
+                               else
+                                       aesni_gcm_dec_update(aes_ctx, &data,
+                                                            src, src, len);
+                       }
+                       left -= len;
+                       scatterwalk_unmap(src);
+                       scatterwalk_advance(&src_sg_walk, len);
+                       scatterwalk_done(&src_sg_walk, 1, left);
+               }
+       }
+       aesni_gcm_finalize(aes_ctx, &data, authTag, auth_tag_len);
+       kernel_fpu_end();
+
+       if (!assocmem)
+               scatterwalk_unmap(assoc);
+       else
+               kfree(assocmem);
+
+       if (!enc) {
+               u8 authTagMsg[16];
+
+               /* Copy out original authTag */
+               scatterwalk_map_and_copy(authTagMsg, req->src,
+                                        req->assoclen + req->cryptlen -
+                                        auth_tag_len,
+                                        auth_tag_len, 0);
+
+               /* Compare generated tag with passed in tag. */
+               return crypto_memneq(authTagMsg, authTag, auth_tag_len) ?
+                       -EBADMSG : 0;
+       }
+
+       /* Copy in the authTag */
+       scatterwalk_map_and_copy(authTag, req->dst,
+                                req->assoclen + req->cryptlen,
+                                auth_tag_len, 1);
+
+       return 0;
+}
+
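Editor's note: the offsets in the two scatterwalk_map_and_copy() calls above follow from the
usual AEAD request layout; nothing below is new to the patch, it only spells out the
arithmetic the function relies on:

	/*
	 * req->src / req->dst layout assumed by the tag handling above:
	 *
	 *   [ assoclen bytes of AAD | cryptlen bytes of text | tag ]
	 *
	 * encrypt: req->cryptlen is the plaintext length, so the computed tag is
	 *          written to req->dst at req->assoclen + req->cryptlen.
	 * decrypt: req->cryptlen includes the auth_tag_len tag, hence the
	 *          "left -= auth_tag_len" above; the expected tag is read from
	 *          req->src at req->assoclen + req->cryptlen - auth_tag_len and
	 *          checked with crypto_memneq(), which compares in constant time.
	 */
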
 static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
                          u8 *hash_subkey, u8 *iv, void *aes_ctx)
 {
@@ -753,7 +921,14 @@ static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
        unsigned long auth_tag_len = crypto_aead_authsize(tfm);
        struct scatter_walk src_sg_walk;
        struct scatter_walk dst_sg_walk = {};
+       struct gcm_context_data data AESNI_ALIGN_ATTR;
 
+       if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
+               aesni_gcm_enc_tfm == aesni_gcm_enc ||
+               req->cryptlen < AVX_GEN2_OPTSIZE) {
+               return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
+                                         aes_ctx);
+       }
        if (sg_is_last(req->src) &&
            (!PageHighMem(sg_page(req->src)) ||
            req->src->offset + req->src->length <= PAGE_SIZE) &&
@@ -782,7 +957,7 @@ static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
        }
 
        kernel_fpu_begin();
-       aesni_gcm_enc_tfm(aes_ctx, dst, src, req->cryptlen, iv,
+       aesni_gcm_enc_tfm(aes_ctx, &data, dst, src, req->cryptlen, iv,
                          hash_subkey, assoc, assoclen,
                          dst + req->cryptlen, auth_tag_len);
        kernel_fpu_end();
@@ -817,8 +992,15 @@ static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
        u8 authTag[16];
        struct scatter_walk src_sg_walk;
        struct scatter_walk dst_sg_walk = {};
+       struct gcm_context_data data AESNI_ALIGN_ATTR;
        int retval = 0;
 
+       if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
+               aesni_gcm_enc_tfm == aesni_gcm_enc ||
+               req->cryptlen < AVX_GEN2_OPTSIZE) {
+               return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
+                                         aes_ctx);
+       }
        tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
 
        if (sg_is_last(req->src) &&
@@ -849,7 +1031,7 @@ static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
 
 
        kernel_fpu_begin();
-       aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv,
+       aesni_gcm_dec_tfm(aes_ctx, &data, dst, src, tempCipherLen, iv,
                          hash_subkey, assoc, assoclen,
                          authTag, auth_tag_len);
        kernel_fpu_end();
index f9eca34301e20fccec4b8daa5a0cc98710fa3094..3e0c07cc9124f0889db2f8942428f5fe2c75b6ed 100644
  *
  */
 
-#include <asm/processor.h>
+#include <crypto/algapi.h>
 #include <crypto/blowfish.h>
+#include <crypto/internal/skcipher.h>
 #include <linux/crypto.h>
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/types.h>
-#include <crypto/algapi.h>
 
 /* regular block cipher functions */
 asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src,
@@ -77,20 +77,28 @@ static void blowfish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
        blowfish_dec_blk(crypto_tfm_ctx(tfm), dst, src);
 }
 
-static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
+static int blowfish_setkey_skcipher(struct crypto_skcipher *tfm,
+                                   const u8 *key, unsigned int keylen)
+{
+       return blowfish_setkey(&tfm->base, key, keylen);
+}
+
+static int ecb_crypt(struct skcipher_request *req,
                     void (*fn)(struct bf_ctx *, u8 *, const u8 *),
                     void (*fn_4way)(struct bf_ctx *, u8 *, const u8 *))
 {
-       struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
        unsigned int bsize = BF_BLOCK_SIZE;
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
        unsigned int nbytes;
        int err;
 
-       err = blkcipher_walk_virt(desc, walk);
+       err = skcipher_walk_virt(&walk, req, false);
 
-       while ((nbytes = walk->nbytes)) {
-               u8 *wsrc = walk->src.virt.addr;
-               u8 *wdst = walk->dst.virt.addr;
+       while ((nbytes = walk.nbytes)) {
+               u8 *wsrc = walk.src.virt.addr;
+               u8 *wdst = walk.dst.virt.addr;
 
                /* Process four block batch */
                if (nbytes >= bsize * 4) {
@@ -116,34 +124,25 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
                } while (nbytes >= bsize);
 
 done:
-               err = blkcipher_walk_done(desc, walk, nbytes);
+               err = skcipher_walk_done(&walk, nbytes);
        }
 
        return err;
 }
 
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
+static int ecb_encrypt(struct skcipher_request *req)
 {
-       struct blkcipher_walk walk;
-
-       blkcipher_walk_init(&walk, dst, src, nbytes);
-       return ecb_crypt(desc, &walk, blowfish_enc_blk, blowfish_enc_blk_4way);
+       return ecb_crypt(req, blowfish_enc_blk, blowfish_enc_blk_4way);
 }
 
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
+static int ecb_decrypt(struct skcipher_request *req)
 {
-       struct blkcipher_walk walk;
-
-       blkcipher_walk_init(&walk, dst, src, nbytes);
-       return ecb_crypt(desc, &walk, blowfish_dec_blk, blowfish_dec_blk_4way);
+       return ecb_crypt(req, blowfish_dec_blk, blowfish_dec_blk_4way);
 }
 
-static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
-                                 struct blkcipher_walk *walk)
+static unsigned int __cbc_encrypt(struct bf_ctx *ctx,
+                                 struct skcipher_walk *walk)
 {
-       struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
        unsigned int bsize = BF_BLOCK_SIZE;
        unsigned int nbytes = walk->nbytes;
        u64 *src = (u64 *)walk->src.virt.addr;
@@ -164,27 +163,27 @@ static unsigned int __cbc_encrypt(struct blkcipher_desc *desc,
        return nbytes;
 }
 
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
+static int cbc_encrypt(struct skcipher_request *req)
 {
-       struct blkcipher_walk walk;
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       unsigned int nbytes;
        int err;
 
-       blkcipher_walk_init(&walk, dst, src, nbytes);
-       err = blkcipher_walk_virt(desc, &walk);
+       err = skcipher_walk_virt(&walk, req, false);
 
        while ((nbytes = walk.nbytes)) {
-               nbytes = __cbc_encrypt(desc, &walk);
-               err = blkcipher_walk_done(desc, &walk, nbytes);
+               nbytes = __cbc_encrypt(ctx, &walk);
+               err = skcipher_walk_done(&walk, nbytes);
        }
 
        return err;
 }
 
-static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
-                                 struct blkcipher_walk *walk)
+static unsigned int __cbc_decrypt(struct bf_ctx *ctx,
+                                 struct skcipher_walk *walk)
 {
-       struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
        unsigned int bsize = BF_BLOCK_SIZE;
        unsigned int nbytes = walk->nbytes;
        u64 *src = (u64 *)walk->src.virt.addr;
@@ -245,24 +244,25 @@ done:
        return nbytes;
 }
 
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
+static int cbc_decrypt(struct skcipher_request *req)
 {
-       struct blkcipher_walk walk;
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       unsigned int nbytes;
        int err;
 
-       blkcipher_walk_init(&walk, dst, src, nbytes);
-       err = blkcipher_walk_virt(desc, &walk);
+       err = skcipher_walk_virt(&walk, req, false);
 
        while ((nbytes = walk.nbytes)) {
-               nbytes = __cbc_decrypt(desc, &walk);
-               err = blkcipher_walk_done(desc, &walk, nbytes);
+               nbytes = __cbc_decrypt(ctx, &walk);
+               err = skcipher_walk_done(&walk, nbytes);
        }
 
        return err;
 }
 
-static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk)
+static void ctr_crypt_final(struct bf_ctx *ctx, struct skcipher_walk *walk)
 {
        u8 *ctrblk = walk->iv;
        u8 keystream[BF_BLOCK_SIZE];
@@ -276,10 +276,8 @@ static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk)
        crypto_inc(ctrblk, BF_BLOCK_SIZE);
 }
 
-static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
-                               struct blkcipher_walk *walk)
+static unsigned int __ctr_crypt(struct bf_ctx *ctx, struct skcipher_walk *walk)
 {
-       struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
        unsigned int bsize = BF_BLOCK_SIZE;
        unsigned int nbytes = walk->nbytes;
        u64 *src = (u64 *)walk->src.virt.addr;
@@ -332,29 +330,30 @@ done:
        return nbytes;
 }
 
-static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                    struct scatterlist *src, unsigned int nbytes)
+static int ctr_crypt(struct skcipher_request *req)
 {
-       struct blkcipher_walk walk;
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct bf_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       unsigned int nbytes;
        int err;
 
-       blkcipher_walk_init(&walk, dst, src, nbytes);
-       err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE);
+       err = skcipher_walk_virt(&walk, req, false);
 
        while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) {
-               nbytes = __ctr_crypt(desc, &walk);
-               err = blkcipher_walk_done(desc, &walk, nbytes);
+               nbytes = __ctr_crypt(ctx, &walk);
+               err = skcipher_walk_done(&walk, nbytes);
        }
 
-       if (walk.nbytes) {
-               ctr_crypt_final(crypto_blkcipher_ctx(desc->tfm), &walk);
-               err = blkcipher_walk_done(desc, &walk, 0);
+       if (nbytes) {
+               ctr_crypt_final(ctx, &walk);
+               err = skcipher_walk_done(&walk, 0);
        }
 
        return err;
 }
 
-static struct crypto_alg bf_algs[4] = { {
+static struct crypto_alg bf_cipher_alg = {
        .cra_name               = "blowfish",
        .cra_driver_name        = "blowfish-asm",
        .cra_priority           = 200,
@@ -372,66 +371,50 @@ static struct crypto_alg bf_algs[4] = { {
                        .cia_decrypt            = blowfish_decrypt,
                }
        }
-}, {
-       .cra_name               = "ecb(blowfish)",
-       .cra_driver_name        = "ecb-blowfish-asm",
-       .cra_priority           = 300,
-       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER,
-       .cra_blocksize          = BF_BLOCK_SIZE,
-       .cra_ctxsize            = sizeof(struct bf_ctx),
-       .cra_alignmask          = 0,
-       .cra_type               = &crypto_blkcipher_type,
-       .cra_module             = THIS_MODULE,
-       .cra_u = {
-               .blkcipher = {
-                       .min_keysize    = BF_MIN_KEY_SIZE,
-                       .max_keysize    = BF_MAX_KEY_SIZE,
-                       .setkey         = blowfish_setkey,
-                       .encrypt        = ecb_encrypt,
-                       .decrypt        = ecb_decrypt,
-               },
+};
+
+static struct skcipher_alg bf_skcipher_algs[] = {
+       {
+               .base.cra_name          = "ecb(blowfish)",
+               .base.cra_driver_name   = "ecb-blowfish-asm",
+               .base.cra_priority      = 300,
+               .base.cra_blocksize     = BF_BLOCK_SIZE,
+               .base.cra_ctxsize       = sizeof(struct bf_ctx),
+               .base.cra_module        = THIS_MODULE,
+               .min_keysize            = BF_MIN_KEY_SIZE,
+               .max_keysize            = BF_MAX_KEY_SIZE,
+               .setkey                 = blowfish_setkey_skcipher,
+               .encrypt                = ecb_encrypt,
+               .decrypt                = ecb_decrypt,
+       }, {
+               .base.cra_name          = "cbc(blowfish)",
+               .base.cra_driver_name   = "cbc-blowfish-asm",
+               .base.cra_priority      = 300,
+               .base.cra_blocksize     = BF_BLOCK_SIZE,
+               .base.cra_ctxsize       = sizeof(struct bf_ctx),
+               .base.cra_module        = THIS_MODULE,
+               .min_keysize            = BF_MIN_KEY_SIZE,
+               .max_keysize            = BF_MAX_KEY_SIZE,
+               .ivsize                 = BF_BLOCK_SIZE,
+               .setkey                 = blowfish_setkey_skcipher,
+               .encrypt                = cbc_encrypt,
+               .decrypt                = cbc_decrypt,
+       }, {
+               .base.cra_name          = "ctr(blowfish)",
+               .base.cra_driver_name   = "ctr-blowfish-asm",
+               .base.cra_priority      = 300,
+               .base.cra_blocksize     = 1,
+               .base.cra_ctxsize       = sizeof(struct bf_ctx),
+               .base.cra_module        = THIS_MODULE,
+               .min_keysize            = BF_MIN_KEY_SIZE,
+               .max_keysize            = BF_MAX_KEY_SIZE,
+               .ivsize                 = BF_BLOCK_SIZE,
+               .chunksize              = BF_BLOCK_SIZE,
+               .setkey                 = blowfish_setkey_skcipher,
+               .encrypt                = ctr_crypt,
+               .decrypt                = ctr_crypt,
        },
-}, {
-       .cra_name               = "cbc(blowfish)",
-       .cra_driver_name        = "cbc-blowfish-asm",
-       .cra_priority           = 300,
-       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER,
-       .cra_blocksize          = BF_BLOCK_SIZE,
-       .cra_ctxsize            = sizeof(struct bf_ctx),
-       .cra_alignmask          = 0,
-       .cra_type               = &crypto_blkcipher_type,
-       .cra_module             = THIS_MODULE,
-       .cra_u = {
-               .blkcipher = {
-                       .min_keysize    = BF_MIN_KEY_SIZE,
-                       .max_keysize    = BF_MAX_KEY_SIZE,
-                       .ivsize         = BF_BLOCK_SIZE,
-                       .setkey         = blowfish_setkey,
-                       .encrypt        = cbc_encrypt,
-                       .decrypt        = cbc_decrypt,
-               },
-       },
-}, {
-       .cra_name               = "ctr(blowfish)",
-       .cra_driver_name        = "ctr-blowfish-asm",
-       .cra_priority           = 300,
-       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER,
-       .cra_blocksize          = 1,
-       .cra_ctxsize            = sizeof(struct bf_ctx),
-       .cra_alignmask          = 0,
-       .cra_type               = &crypto_blkcipher_type,
-       .cra_module             = THIS_MODULE,
-       .cra_u = {
-               .blkcipher = {
-                       .min_keysize    = BF_MIN_KEY_SIZE,
-                       .max_keysize    = BF_MAX_KEY_SIZE,
-                       .ivsize         = BF_BLOCK_SIZE,
-                       .setkey         = blowfish_setkey,
-                       .encrypt        = ctr_crypt,
-                       .decrypt        = ctr_crypt,
-               },
-       },
-} };
+};
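
Editor's note: with the blkcipher entries replaced by the skcipher_alg array above, callers
reach these implementations through the regular skcipher request API. A minimal sketch of a
consumer of the converted cbc(blowfish) (not part of this patch; most error handling omitted,
buffer and key names are illustrative; needs <crypto/skcipher.h>, <linux/scatterlist.h>,
<linux/random.h>):

	/* Sketch: encrypt buf in place with cbc(blowfish) via the skcipher API. */
	static int bf_cbc_demo(const u8 *key, unsigned int keylen,
			       u8 *buf, unsigned int buflen)
	{
		struct crypto_skcipher *tfm;
		struct skcipher_request *req;
		struct scatterlist sg;
		u8 iv[BF_BLOCK_SIZE];		/* 8-byte blowfish IV */
		int err;

		tfm = crypto_alloc_skcipher("cbc(blowfish)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);
		req = skcipher_request_alloc(tfm, GFP_KERNEL);

		get_random_bytes(iv, sizeof(iv));
		err = crypto_skcipher_setkey(tfm, key, keylen);
		sg_init_one(&sg, buf, buflen);	/* buflen must be a multiple of BF_BLOCK_SIZE */
		skcipher_request_set_callback(req, 0, NULL, NULL);
		skcipher_request_set_crypt(req, &sg, &sg, buflen, iv);	/* in-place */
		if (!err)
			err = crypto_skcipher_encrypt(req);

		skcipher_request_free(req);
		crypto_free_skcipher(tfm);
		return err;
	}
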
 
 static bool is_blacklisted_cpu(void)
 {
@@ -456,6 +439,8 @@ MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist");
 
 static int __init init(void)
 {
+       int err;
+
        if (!force && is_blacklisted_cpu()) {
                printk(KERN_INFO
                        "blowfish-x86_64: performance on this CPU "
@@ -464,12 +449,23 @@ static int __init init(void)
                return -ENODEV;
        }
 
-       return crypto_register_algs(bf_algs, ARRAY_SIZE(bf_algs));
+       err = crypto_register_alg(&bf_cipher_alg);
+       if (err)
+               return err;
+
+       err = crypto_register_skciphers(bf_skcipher_algs,
+                                       ARRAY_SIZE(bf_skcipher_algs));
+       if (err)
+               crypto_unregister_alg(&bf_cipher_alg);
+
+       return err;
 }
 
 static void __exit fini(void)
 {
-       crypto_unregister_algs(bf_algs, ARRAY_SIZE(bf_algs));
+       crypto_unregister_alg(&bf_cipher_alg);
+       crypto_unregister_skciphers(bf_skcipher_algs,
+                                   ARRAY_SIZE(bf_skcipher_algs));
 }
 
 module_init(init);
index 60907c139c4e2a7842f412ebcbfd744085cb6020..d4992e458f929e8883ba7418d790d2e4b3c7a94a 100644
  *
  */
 
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/crypto.h>
-#include <linux/err.h>
-#include <crypto/ablk_helper.h>
-#include <crypto/algapi.h>
-#include <crypto/ctr.h>
-#include <crypto/lrw.h>
-#include <crypto/xts.h>
-#include <asm/fpu/api.h>
 #include <asm/crypto/camellia.h>
 #include <asm/crypto/glue_helper.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/xts.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
 
 #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
 #define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32
@@ -150,413 +147,120 @@ static const struct common_glue_ctx camellia_dec_xts = {
        } }
 };
 
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
+static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
+                          unsigned int keylen)
 {
-       return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
+       return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen,
+                                &tfm->base.crt_flags);
 }
 
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
+static int ecb_encrypt(struct skcipher_request *req)
 {
-       return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
+       return glue_ecb_req_128bit(&camellia_enc, req);
 }
 
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
-{
-       return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
-                                      dst, src, nbytes);
-}
-
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
-{
-       return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
-                                      nbytes);
-}
-
-static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                    struct scatterlist *src, unsigned int nbytes)
-{
-       return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
-}
-
-static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
-{
-       return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
-                             CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
-                             nbytes);
-}
-
-static inline void camellia_fpu_end(bool fpu_enabled)
-{
-       glue_fpu_end(fpu_enabled);
-}
-
-static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
-                          unsigned int key_len)
-{
-       return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
-                                &tfm->crt_flags);
-}
-
-struct crypt_priv {
-       struct camellia_ctx *ctx;
-       bool fpu_enabled;
-};
-
-static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+static int ecb_decrypt(struct skcipher_request *req)
 {
-       const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
-       struct crypt_priv *ctx = priv;
-       int i;
-
-       ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
-
-       if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) {
-               camellia_ecb_enc_32way(ctx->ctx, srcdst, srcdst);
-               srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
-               nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
-       }
-
-       if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
-               camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
-               srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
-               nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
-       }
-
-       while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
-               camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
-               srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
-               nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
-       }
-
-       for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
-               camellia_enc_blk(ctx->ctx, srcdst, srcdst);
+       return glue_ecb_req_128bit(&camellia_dec, req);
 }
 
-static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+static int cbc_encrypt(struct skcipher_request *req)
 {
-       const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
-       struct crypt_priv *ctx = priv;
-       int i;
-
-       ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
-
-       if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) {
-               camellia_ecb_dec_32way(ctx->ctx, srcdst, srcdst);
-               srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
-               nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS;
-       }
-
-       if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
-               camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
-               srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
-               nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
-       }
-
-       while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
-               camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
-               srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
-               nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
-       }
-
-       for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
-               camellia_dec_blk(ctx->ctx, srcdst, srcdst);
+       return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk),
+                                          req);
 }
 
-static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
+static int cbc_decrypt(struct skcipher_request *req)
 {
-       struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-       be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS];
-       struct crypt_priv crypt_ctx = {
-               .ctx = &ctx->camellia_ctx,
-               .fpu_enabled = false,
-       };
-       struct lrw_crypt_req req = {
-               .tbuf = buf,
-               .tbuflen = sizeof(buf),
-
-               .table_ctx = &ctx->lrw_table,
-               .crypt_ctx = &crypt_ctx,
-               .crypt_fn = encrypt_callback,
-       };
-       int ret;
-
-       desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-       ret = lrw_crypt(desc, dst, src, nbytes, &req);
-       camellia_fpu_end(crypt_ctx.fpu_enabled);
-
-       return ret;
+       return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
 }
 
-static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
+static int ctr_crypt(struct skcipher_request *req)
 {
-       struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-       be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS];
-       struct crypt_priv crypt_ctx = {
-               .ctx = &ctx->camellia_ctx,
-               .fpu_enabled = false,
-       };
-       struct lrw_crypt_req req = {
-               .tbuf = buf,
-               .tbuflen = sizeof(buf),
-
-               .table_ctx = &ctx->lrw_table,
-               .crypt_ctx = &crypt_ctx,
-               .crypt_fn = decrypt_callback,
-       };
-       int ret;
-
-       desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-       ret = lrw_crypt(desc, dst, src, nbytes, &req);
-       camellia_fpu_end(crypt_ctx.fpu_enabled);
-
-       return ret;
+       return glue_ctr_req_128bit(&camellia_ctr, req);
 }
 
-static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
+static int xts_encrypt(struct skcipher_request *req)
 {
-       struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-       return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes,
-                                    XTS_TWEAK_CAST(camellia_enc_blk),
-                                    &ctx->tweak_ctx, &ctx->crypt_ctx);
+       return glue_xts_req_128bit(&camellia_enc_xts, req,
+                                  XTS_TWEAK_CAST(camellia_enc_blk),
+                                  &ctx->tweak_ctx, &ctx->crypt_ctx);
 }
 
-static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
+static int xts_decrypt(struct skcipher_request *req)
 {
-       struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-       return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes,
-                                    XTS_TWEAK_CAST(camellia_enc_blk),
-                                    &ctx->tweak_ctx, &ctx->crypt_ctx);
+       return glue_xts_req_128bit(&camellia_dec_xts, req,
+                                  XTS_TWEAK_CAST(camellia_enc_blk),
+                                  &ctx->tweak_ctx, &ctx->crypt_ctx);
 }
 
-static struct crypto_alg cmll_algs[10] = { {
-       .cra_name               = "__ecb-camellia-aesni-avx2",
-       .cra_driver_name        = "__driver-ecb-camellia-aesni-avx2",
-       .cra_priority           = 0,
-       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER |
-                                 CRYPTO_ALG_INTERNAL,
-       .cra_blocksize          = CAMELLIA_BLOCK_SIZE,
-       .cra_ctxsize            = sizeof(struct camellia_ctx),
-       .cra_alignmask          = 0,
-       .cra_type               = &crypto_blkcipher_type,
-       .cra_module             = THIS_MODULE,
-       .cra_u = {
-               .blkcipher = {
-                       .min_keysize    = CAMELLIA_MIN_KEY_SIZE,
-                       .max_keysize    = CAMELLIA_MAX_KEY_SIZE,
-                       .setkey         = camellia_setkey,
-                       .encrypt        = ecb_encrypt,
-                       .decrypt        = ecb_decrypt,
-               },
-       },
-}, {
-       .cra_name               = "__cbc-camellia-aesni-avx2",
-       .cra_driver_name        = "__driver-cbc-camellia-aesni-avx2",
-       .cra_priority           = 0,
-       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER |
-                                 CRYPTO_ALG_INTERNAL,
-       .cra_blocksize          = CAMELLIA_BLOCK_SIZE,
-       .cra_ctxsize            = sizeof(struct camellia_ctx),
-       .cra_alignmask          = 0,
-       .cra_type               = &crypto_blkcipher_type,
-       .cra_module             = THIS_MODULE,
-       .cra_u = {
-               .blkcipher = {
-                       .min_keysize    = CAMELLIA_MIN_KEY_SIZE,
-                       .max_keysize    = CAMELLIA_MAX_KEY_SIZE,
-                       .setkey         = camellia_setkey,
-                       .encrypt        = cbc_encrypt,
-                       .decrypt        = cbc_decrypt,
-               },
-       },
-}, {
-       .cra_name               = "__ctr-camellia-aesni-avx2",
-       .cra_driver_name        = "__driver-ctr-camellia-aesni-avx2",
-       .cra_priority           = 0,
-       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER |
-                                 CRYPTO_ALG_INTERNAL,
-       .cra_blocksize          = 1,
-       .cra_ctxsize            = sizeof(struct camellia_ctx),
-       .cra_alignmask          = 0,
-       .cra_type               = &crypto_blkcipher_type,
-       .cra_module             = THIS_MODULE,
-       .cra_u = {
-               .blkcipher = {
-                       .min_keysize    = CAMELLIA_MIN_KEY_SIZE,
-                       .max_keysize    = CAMELLIA_MAX_KEY_SIZE,
-                       .ivsize         = CAMELLIA_BLOCK_SIZE,
-                       .setkey         = camellia_setkey,
-                       .encrypt        = ctr_crypt,
-                       .decrypt        = ctr_crypt,
-               },
-       },
-}, {
-       .cra_name               = "__lrw-camellia-aesni-avx2",
-       .cra_driver_name        = "__driver-lrw-camellia-aesni-avx2",
-       .cra_priority           = 0,
-       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER |
-                                 CRYPTO_ALG_INTERNAL,
-       .cra_blocksize          = CAMELLIA_BLOCK_SIZE,
-       .cra_ctxsize            = sizeof(struct camellia_lrw_ctx),
-       .cra_alignmask          = 0,
-       .cra_type               = &crypto_blkcipher_type,
-       .cra_module             = THIS_MODULE,
-       .cra_exit               = lrw_camellia_exit_tfm,
-       .cra_u = {
-               .blkcipher = {
-                       .min_keysize    = CAMELLIA_MIN_KEY_SIZE +
-                                         CAMELLIA_BLOCK_SIZE,
-                       .max_keysize    = CAMELLIA_MAX_KEY_SIZE +
-                                         CAMELLIA_BLOCK_SIZE,
-                       .ivsize         = CAMELLIA_BLOCK_SIZE,
-                       .setkey         = lrw_camellia_setkey,
-                       .encrypt        = lrw_encrypt,
-                       .decrypt        = lrw_decrypt,
-               },
-       },
-}, {
-       .cra_name               = "__xts-camellia-aesni-avx2",
-       .cra_driver_name        = "__driver-xts-camellia-aesni-avx2",
-       .cra_priority           = 0,
-       .cra_flags              = CRYPTO_ALG_TYPE_BLKCIPHER |
-                                 CRYPTO_ALG_INTERNAL,
-       .cra_blocksize          = CAMELLIA_BLOCK_SIZE,
-       .cra_ctxsize            = sizeof(struct camellia_xts_ctx),
-       .cra_alignmask          = 0,
-       .cra_type               = &crypto_blkcipher_type,
-       .cra_module             = THIS_MODULE,
-       .cra_u = {
-               .blkcipher = {
-                       .min_keysize    = CAMELLIA_MIN_KEY_SIZE * 2,
-                       .max_keysize    = CAMELLIA_MAX_KEY_SIZE * 2,
-                       .ivsize         = CAMELLIA_BLOCK_SIZE,
-                       .setkey         = xts_camellia_setkey,
-                       .encrypt        = xts_encrypt,
-                       .decrypt        = xts_decrypt,
-               },
-       },
-}, {
-       .cra_name               = "ecb(camellia)",
-       .cra_driver_name        = "ecb-camellia-aesni-avx2",
-       .cra_priority           = 500,
-       .cra_flags              = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-       .cra_blocksize          = CAMELLIA_BLOCK_SIZE,
-       .cra_ctxsize            = sizeof(struct async_helper_ctx),
-       .cra_alignmask          = 0,
-       .cra_type               = &crypto_ablkcipher_type,
-       .cra_module             = THIS_MODULE,
-       .cra_init               = ablk_init,
-       .cra_exit               = ablk_exit,
-       .cra_u = {
-               .ablkcipher = {
-                       .min_keysize    = CAMELLIA_MIN_KEY_SIZE,
-                       .max_keysize    = CAMELLIA_MAX_KEY_SIZE,
-                       .setkey         = ablk_set_key,
-                       .encrypt        = ablk_encrypt,
-                       .decrypt        = ablk_decrypt,
-               },
-       },
-}, {
-       .cra_name               = "cbc(camellia)",
-       .cra_driver_name        = "cbc-camellia-aesni-avx2",
-       .cra_priority           = 500,
-       .cra_flags              = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-       .cra_blocksize          = CAMELLIA_BLOCK_SIZE,
-       .cra_ctxsize            = sizeof(struct async_helper_ctx),
-       .cra_alignmask          = 0,
-       .cra_type               = &crypto_ablkcipher_type,
-       .cra_module             = THIS_MODULE,
-       .cra_init               = ablk_init,
-       .cra_exit               = ablk_exit,
-       .cra_u = {
-               .ablkcipher = {
-                       .min_keysize    = CAMELLIA_MIN_KEY_SIZE,
-                       .max_keysize    = CAMELLIA_MAX_KEY_SIZE,
-                       .ivsize         = CAMELLIA_BLOCK_SIZE,
-                       .setkey         = ablk_set_key,
-                       .encrypt        = __ablk_encrypt,
-                       .decrypt        = ablk_decrypt,
-               },
-       },
-}, {
-       .cra_name               = "ctr(camellia)",
-       .cra_driver_name        = "ctr-camellia-aesni-avx2",
-       .cra_priority           = 500,
-       .cra_flags              = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-       .cra_blocksize          = 1,
-       .cra_ctxsize            = sizeof(struct async_helper_ctx),
-       .cra_alignmask          = 0,
-       .cra_type               = &crypto_ablkcipher_type,
-       .cra_module             = THIS_MODULE,
-       .cra_init               = ablk_init,
-       .cra_exit               = ablk_exit,
-       .cra_u = {
-               .ablkcipher = {
-                       .min_keysize    = CAMELLIA_MIN_KEY_SIZE,
-                       .max_keysize    = CAMELLIA_MAX_KEY_SIZE,
-                       .ivsize         = CAMELLIA_BLOCK_SIZE,
-                       .setkey         = ablk_set_key,
-                       .encrypt        = ablk_encrypt,
-                       .decrypt        = ablk_encrypt,
-                       .geniv          = "chainiv",
-               },
-       },
-}, {
-       .cra_name               = "lrw(camellia)",
-       .cra_driver_name        = "lrw-camellia-aesni-avx2",
-       .cra_priority           = 500,
-       .cra_flags              = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-       .cra_blocksize          = CAMELLIA_BLOCK_SIZE,
-       .cra_ctxsize            = sizeof(struct async_helper_ctx),
-       .cra_alignmask          = 0,
-       .cra_type               = &crypto_ablkcipher_type,
-       .cra_module             = THIS_MODULE,
-       .cra_init               = ablk_init,
-       .cra_exit               = ablk_exit,
-       .cra_u = {
-               .ablkcipher = {
-                       .min_keysize    = CAMELLIA_MIN_KEY_SIZE +
-                                         CAMELLIA_BLOCK_SIZE,
-                       .max_keysize    = CAMELLIA_MAX_KEY_SIZE +
-                                         CAMELLIA_BLOCK_SIZE,
-                       .ivsize         = CAMELLIA_BLOCK_SIZE,
-                       .setkey         = ablk_set_key,
-                       .encrypt        = ablk_encrypt,
-                       .decrypt        = ablk_decrypt,
-               },
-       },
-}, {
-       .cra_name               = "xts(camellia)",
-       .cra_driver_name        = "xts-camellia-aesni-avx2",
-       .cra_priority           = 500,
-       .cra_flags              = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
-       .cra_blocksize          = CAMELLIA_BLOCK_SIZE,
-       .cra_ctxsize            = sizeof(struct async_helper_ctx),
-       .cra_alignmask          = 0,
-       .cra_type               = &crypto_ablkcipher_type,
-       .cra_module             = THIS_MODULE,
-       .cra_init               = ablk_init,
-       .cra_exit               = ablk_exit,
-       .cra_u = {
-               .ablkcipher = {
-                       .min_keysize    = CAMELLIA_MIN_KEY_SIZE * 2,
-                       .max_keysize    = CAMELLIA_MAX_KEY_SIZE * 2,
-                       .ivsize         = CAMELLIA_BLOCK_SIZE,
-                       .setkey         = ablk_set_key,
-                       .encrypt        = ablk_encrypt,
-                       .decrypt        = ablk_decrypt,
-               },
+static struct skcipher_alg camellia_algs[] = {
+       {
+               .base.cra_name          = "__ecb(camellia)",
+               .base.cra_driver_name   = "__ecb-camellia-aesni-avx2",
+               .base.cra_priority      = 500,
+               .base.cra_flags         = CRYPTO_ALG_INTERNAL,
+               .base.cra_blocksize     = CAMELLIA_BLOCK_SIZE,
+               .base.cra_ctxsize       = sizeof(struct camellia_ctx),
+               .base.cra_module        = THIS_MODULE,
+               .min_keysize            = CAMELLIA_MIN_KEY_SIZE,
+               .max_keysize            = CAMELLIA_MAX_KEY_SIZE,
+               .setkey                 = camellia_setkey,
+               .encrypt                = ecb_encrypt,
+               .decrypt                = ecb_decrypt,
+       }, {
+               .base.cra_name          = "__cbc(camellia)",
+               .base.cra_driver_name   = "__cbc-camellia-aesni-avx2",
+               .base.cra_priority      = 500,
+               .base.cra_flags         = CRYPTO_ALG_INTERNAL,
+               .base.cra_blocksize     = CAMELLIA_BLOCK_SIZE,
+               .base.cra_ctxsize       = sizeof(struct camellia_ctx),
+               .base.cra_module        = THIS_MODULE,
+               .min_keysize            = CAMELLIA_MIN_KEY_SIZE,
+               .max_keysize            = CAMELLIA_MAX_KEY_SIZE,
+               .ivsize                 = CAMELLIA_BLOCK_SIZE,
+               .setkey                 = camellia_setkey,
+               .encrypt                = cbc_encrypt,
+               .decrypt                = cbc_decrypt,
+       }, {
+               .base.cra_name          = "__ctr(camellia)",
+               .base.cra_driver_name   = "__ctr-camellia-aesni-avx2",
+               .base.cra_priority      = 500,
+               .base.cra_flags         = CRYPTO_ALG_INTERNAL,
+               .base.cra_blocksize     = 1,
+               .base.cra_ctxsize       = sizeof(struct camellia_ctx),
+               .base.cra_module        = THIS_MODULE,
+               .min_keysize            = CAMELLIA_MIN_KEY_SIZE,
+               .max_keysize            = CAMELLIA_MAX_KEY_SIZE,
+               .ivsize                 = CAMELLIA_BLOCK_SIZE,
+               .chunksize              = CAMELLIA_BLOCK_SIZE,
+               .setkey                 = camellia_setkey,
+               .encrypt                = ctr_crypt,
+               .decrypt                = ctr_crypt,
+       }, {
+               .base.cra_name          = "__xts(camellia)",
+               .base.cra_driver_name   = "__xts-camellia-aesni-avx2",
+               .base.cra_priority      = 500,
+               .base.cra_flags         = CRYPTO_ALG_INTERNAL,
+               .base.cra_blocksize     = CAMELLIA_BLOCK_SIZE,
+               .base.cra_ctxsize       = sizeof(struct camellia_xts_ctx),
+               .base.cra_module        = THIS_MODULE,
+               .min_keysize            = 2 * CAMELLIA_MIN_KEY_SIZE,
+               .max_keysize            = 2 * CAMELLIA_MAX_KEY_SIZE,
+               .ivsize                 = CAMELLIA_BLOCK_SIZE,
+               .setkey                 = xts_camellia_setkey,
+               .encrypt                = xts_encrypt,
+               .decrypt                = xts_decrypt,
        },
-} };
+};
+
+static struct simd_skcipher_alg *camellia_simd_algs[ARRAY_SIZE(camellia_algs)];
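(Aside, not part of the patch: the "__"-prefixed entries above are marked CRYPTO_ALG_INTERNAL, so an ordinary algorithm lookup never returns them; only a caller that explicitly requests internal algorithms, as the simd wrapper machinery does, can instantiate one. A minimal sketch of such a lookup, assuming a 4.16-era kernel and using the driver name from this file purely for illustration:)

	#include <crypto/skcipher.h>
	#include <linux/err.h>

	/*
	 * Illustrative only: plain users should allocate "ecb(camellia)" and get
	 * the simd wrapper; asking for the internal alg requires setting
	 * CRYPTO_ALG_INTERNAL in both type and mask.
	 */
	static struct crypto_skcipher *demo_get_internal_ecb(void)
	{
		return crypto_alloc_skcipher("__ecb-camellia-aesni-avx2",
					     CRYPTO_ALG_INTERNAL,
					     CRYPTO_ALG_INTERNAL);
	}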
 
 static int __init camellia_aesni_init(void)
 {
@@ -576,12 +280,15 @@ static int __init camellia_aesni_init(void)
                return -ENODEV;
        }
 
-       return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
+       return simd_register_skciphers_compat(camellia_algs,
+                                             ARRAY_SIZE(camellia_algs),
+                                             camellia_simd_algs);
 }
 
 static void __exit camellia_aesni_fini(void)
 {
-       crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
+       simd_unregister_skciphers(camellia_algs, ARRAY_SIZE(camellia_algs),
+                                 camellia_simd_algs);
 }
 
 module_init(camellia_aesni_init);
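(Aside, not part of the patch: the conversion above replaces the blkcipher_desc/scatterlist entry points with skcipher request handlers, so callers now drive ecb/cbc/ctr/xts through the skcipher request API and the simd wrapper handles FPU availability. The sketch below shows one way such a caller could look on a 4.16-era kernel; the function name, in-place single-block buffer, and synchronous wait are arbitrary choices for the example.)

	#include <crypto/skcipher.h>
	#include <linux/scatterlist.h>
	#include <linux/slab.h>
	#include <linux/err.h>

	/* Illustrative caller: encrypt one 16-byte (CAMELLIA_BLOCK_SIZE) block in place. */
	static int camellia_ecb_demo(u8 *buf, const u8 *key, unsigned int keylen)
	{
		struct crypto_skcipher *tfm;
		struct skcipher_request *req;
		struct scatterlist sg;
		DECLARE_CRYPTO_WAIT(wait);
		int err;

		/* Picks the highest-priority "ecb(camellia)", e.g. the simd wrapper. */
		tfm = crypto_alloc_skcipher("ecb(camellia)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_skcipher_setkey(tfm, key, keylen);
		if (err)
			goto out_free_tfm;

		req = skcipher_request_alloc(tfm, GFP_KERNEL);
		if (!req) {
			err = -ENOMEM;
			goto out_free_tfm;
		}

		sg_init_one(&sg, buf, 16);
		skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
						   CRYPTO_TFM_REQ_MAY_BACKLOG,
					      crypto_req_done, &wait);
		skcipher_request_set_crypt(req, &sg, &sg, 16, NULL);

		/* The wrapper may complete asynchronously; wait for it here. */
		err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);

		skcipher_request_free(req);
	out_free_tfm:
		crypto_free_skcipher(tfm);
		return err;
	}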
index d96429da88eb8bf274620b79de372b2ac72adec4..d09f6521466aa30e5bb7983a806e5b3c2e47695e 100644
  *
  */
 
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/crypto.h>
-#include <linux/err.h>
-#include <crypto/ablk_helper.h>
-#include <crypto/algapi.h>
-#include <crypto/ctr.h>
-#include <crypto/lrw.h>
-#include <crypto/xts.h>
-#include <asm/fpu/api.h>
 #include <asm/crypto/camellia.h>
 #include <asm/crypto/glue_helper.h>
+#include <crypto/algapi.h>
+#include <crypto/internal/simd.h>
+#include <crypto/xts.h>
+#include <linux/crypto.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/types.h>
 
 #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
 
@@ -154,401 +151,142 @@ static const struct common_glue_ctx camellia_dec_xts = {
        } }
 };
 
-static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
-{
-       return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
-}
-
-static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
+static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
+                          unsigned int keylen)
 {
-       return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
+       return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen,
+                                &tfm->base.crt_flags);
 }
 
-static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
+static int ecb_encrypt(struct skcipher_request *req)
 {
-       return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
-                                      dst, src, nbytes);
+       return glue_ecb_req_128bit(&camellia_enc, req);
 }
 
-static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                      struct scatterlist *src, unsigned int nbytes)
+static int ecb_decrypt(struct skcipher_request *req)
 {
-       return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
-                                      nbytes);
+       return glue_ecb_req_128bit(&camellia_dec, req);
 }
 
-static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
-                    struct scatterlist *src, unsigned int nbytes)
+static int cbc_encrypt(struct skcipher_request *req)
 {
-       return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
+       return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk),
+                                          req);
 }
 
-static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
+static int cbc_decrypt(struct skcipher_request *req)
 {
-       return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
-                             CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
-                             nbytes);
+       return glue_cbc_decrypt_req_128bit(&camellia_dec_cbc, req);
 }
 
-static inline void camellia_fpu_end(bool fpu_enabled)
+static int ctr_crypt(struct skcipher_request *req)
 {
-       glue_fpu_end(fpu_enabled);
+       return glue_ctr_req_128bit(&camellia_ctr, req);
 }
 
-static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
-                          unsigned int key_len)
+int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
+                       unsigned int keylen)
 {
-       return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
-                                &tfm->crt_flags);
+       struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+       u32 *flags = &tfm->base.crt_flags;
+       int err;
+
+       err = xts_verify_key(tfm, key, keylen);
+       if (err)
+               return err;
+
+       /* first half of xts-key is for crypt */
+       err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+       if (err)
+               return err;
+
+       /* second half of xts-key is for tweak */
+       return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
+                               flags);
 }
+EXPORT_SYMBOL_GPL(xts_camellia_setkey);
 
-struct crypt_priv {
-       struct camellia_ctx *ctx;
-       bool fpu_enabled;
-};
-
-static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+static int xts_encrypt(struct skcipher_request *req)
 {
-       const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
-       struct crypt_priv *ctx = priv;
-       int i;
-
-       ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
-
-       if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
-               camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
-               srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
-               nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
-       }
-
-       while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
-               camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
-               srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
-               nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
-       }
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-       for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
-               camellia_enc_blk(ctx->ctx, srcdst, srcdst);
+       return glue_xts_req_128bit(&camellia_enc_xts, req,
+                                  XTS_TWEAK_CAST(camellia_enc_blk),
+                                  &ctx->tweak_ctx, &ctx->crypt_ctx);
 }
 
-static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
+static int xts_decrypt(struct skcipher_request *req)
 {
-       const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
-       struct crypt_priv *ctx = priv;
-       int i;
-
-       ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
-
-       if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
-               camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
-               srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
-               nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
-       }
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
 
-       while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
-               camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
-               srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
-               nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
-       }
-
-       for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
-               camellia_dec_blk(ctx->ctx, srcdst, srcdst);
+       return glue_xts_req_128