-rw-r--r--  cipher/Makefile.am | 9
-rw-r--r--  cipher/asm-common-aarch64.h | 15
-rw-r--r--  cipher/blowfish.c | 53
-rw-r--r--  cipher/bulkhelp.h | 425
-rw-r--r--  cipher/camellia-aarch64.S | 4
-rw-r--r--  cipher/camellia-aesni-avx2-amd64.h | 502
-rw-r--r--  cipher/camellia-gfni-avx2-amd64.S | 34
-rw-r--r--  cipher/camellia-gfni-avx512-amd64.S | 1566
-rw-r--r--  cipher/camellia-glue.c | 882
-rw-r--r--  cipher/cast5.c | 53
-rw-r--r--  cipher/chacha20-aarch64.S | 8
-rw-r--r--  cipher/chacha20-amd64-avx512.S | 2
-rw-r--r--  cipher/cipher-gcm-armv8-aarch64-ce.S | 6
-rw-r--r--  cipher/cipher-selftest.c | 512
-rw-r--r--  cipher/cipher-selftest.h | 69
-rw-r--r--  cipher/crc-armv8-aarch64-ce.S | 4
-rw-r--r--  cipher/des.c | 72
-rw-r--r--  cipher/kdf.c | 316
-rw-r--r--  cipher/poly1305-amd64-avx512.S | 4
-rw-r--r--  cipher/pubkey-util.c | 5
-rw-r--r--  cipher/rijndael-aesni.c | 1
-rw-r--r--  cipher/rijndael-armv8-ce.c | 1
-rw-r--r--  cipher/rijndael-padlock.c | 1
-rw-r--r--  cipher/rijndael-ssse3-amd64.c | 1
-rw-r--r--  cipher/rijndael-vaes.c | 1
-rw-r--r--  cipher/rijndael.c | 92
-rw-r--r--  cipher/rsa.c | 41
-rw-r--r--  cipher/serpent.c | 152
-rw-r--r--  cipher/sha1-armv8-aarch64-ce.S | 2
-rw-r--r--  cipher/sha256-armv8-aarch64-ce.S | 2
-rw-r--r--  cipher/sha512-avx512-amd64.S | 2
-rw-r--r--  cipher/sm3-aarch64.S | 2
-rw-r--r--  cipher/sm3-armv8-aarch64-ce.S | 2
-rw-r--r--  cipher/sm4-aarch64.S | 2
-rw-r--r--  cipher/sm4-aesni-avx2-amd64.S | 86
-rw-r--r--  cipher/sm4-gfni-avx2-amd64.S | 1194
-rw-r--r--  cipher/sm4.c | 788
-rw-r--r--  cipher/twofish.c | 86
-rw-r--r--  configure.ac | 50
-rw-r--r--  doc/gcrypt.texi | 1
-rw-r--r--  m4/Makefile.am | 2
-rw-r--r--  mpi/longlong.h | 4
-rw-r--r--  mpi/mpih-const-time.c | 9
-rw-r--r--  random/jitterentropy-base-user.h | 15
-rw-r--r--  random/random-drbg.c | 8
-rw-r--r--  random/rndjent.c | 11
-rw-r--r--  src/g10lib.h | 1
-rw-r--r--  src/gcrypt-int.h | 4
-rw-r--r--  src/gcrypt.h.in | 16
-rw-r--r--  src/hwf-x86.c | 7
-rw-r--r--  src/hwfeatures.c | 1
-rw-r--r--  src/secmem.c | 67
-rw-r--r--  tests/aeswrap.c | 5
-rw-r--r--  tests/basic.c | 846
-rw-r--r--  tests/bench-slope.c | 4
-rw-r--r--  tests/pkcs1v2.c | 14
-rw-r--r--  tests/t-cv25519.c | 4
-rw-r--r--  tests/t-dsa.c | 5
-rw-r--r--  tests/t-ecdsa.c | 6
-rw-r--r--  tests/t-kdf.c | 130
-rw-r--r--  tests/t-mpi-point.c | 12
-rw-r--r--  tests/t-rsa-15.c | 5
-rw-r--r--  tests/t-rsa-pss.c | 5
-rw-r--r--  tests/t-x448.c | 4
-rw-r--r--  tests/testdrv.c | 21
65 files changed, 6312 insertions, 1942 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 07e5ba26..250b229e 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -55,7 +55,6 @@ libcipher_la_SOURCES = \
cipher-eax.c \
cipher-siv.c \
cipher-gcm-siv.c \
- cipher-selftest.c cipher-selftest.h \
pubkey.c pubkey-internal.h pubkey-util.c \
md.c \
mac.c mac-internal.h \
@@ -117,7 +116,7 @@ EXTRA_libcipher_la_SOURCES = \
seed.c \
serpent.c serpent-sse2-amd64.S \
sm4.c sm4-aesni-avx-amd64.S sm4-aesni-avx2-amd64.S sm4-aarch64.S \
- sm4-armv8-aarch64-ce.S \
+ sm4-armv8-aarch64-ce.S sm4-gfni-avx2-amd64.S \
serpent-avx2-amd64.S serpent-armv7-neon.S \
sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
@@ -139,8 +138,10 @@ EXTRA_libcipher_la_SOURCES = \
twofish-avx2-amd64.S \
rfc2268.c \
camellia.c camellia.h camellia-glue.c camellia-aesni-avx-amd64.S \
- camellia-aesni-avx2-amd64.h camellia-vaes-avx2-amd64.S \
- camellia-aesni-avx2-amd64.S camellia-arm.S camellia-aarch64.S \
+ camellia-aesni-avx2-amd64.h \
+ camellia-gfni-avx2-amd64.S camellia-gfni-avx512-amd64.S \
+ camellia-vaes-avx2-amd64.S camellia-aesni-avx2-amd64.S \
+ camellia-arm.S camellia-aarch64.S \
blake2.c \
blake2b-amd64-avx2.S blake2s-amd64-avx.S
diff --git a/cipher/asm-common-aarch64.h b/cipher/asm-common-aarch64.h
index d3f7801c..b38b17a6 100644
--- a/cipher/asm-common-aarch64.h
+++ b/cipher/asm-common-aarch64.h
@@ -29,19 +29,8 @@
# define ELF(...) /*_*/
#endif
-#ifdef __APPLE__
-#define GET_DATA_POINTER(reg, name) \
- adrp reg, name@PAGE ; \
- add reg, reg, name@PAGEOFF ;
-#elif defined(_WIN32)
-#define GET_DATA_POINTER(reg, name) \
- adrp reg, name ; \
- add reg, reg, #:lo12:name ;
-#else
-#define GET_DATA_POINTER(reg, name) \
- adrp reg, :got:name ; \
- ldr reg, [reg, #:got_lo12:name] ;
-#endif
+#define GET_LOCAL_POINTER(reg, label) \
+ adr reg, label;
#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES
/* CFI directives to emit DWARF stack unwinding information. */
diff --git a/cipher/blowfish.c b/cipher/blowfish.c
index 7b001306..1b11d718 100644
--- a/cipher/blowfish.c
+++ b/cipher/blowfish.c
@@ -38,7 +38,6 @@
#include "cipher.h"
#include "bufhelp.h"
#include "cipher-internal.h"
-#include "cipher-selftest.h"
#define BLOWFISH_BLOCKSIZE 8
#define BLOWFISH_KEY_MIN_BITS 8
@@ -856,48 +855,6 @@ _gcry_blowfish_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
}
-/* Run the self-tests for BLOWFISH-CTR, tests IV increment of bulk CTR
- encryption. Returns NULL on success. */
-static const char *
-selftest_ctr (void)
-{
- const int nblocks = 4+1;
- const int blocksize = BLOWFISH_BLOCKSIZE;
- const int context_size = sizeof(BLOWFISH_context);
-
- return _gcry_selftest_helper_ctr("BLOWFISH", &bf_setkey,
- &encrypt_block, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for BLOWFISH-CBC, tests bulk CBC decryption.
- Returns NULL on success. */
-static const char *
-selftest_cbc (void)
-{
- const int nblocks = 4+2;
- const int blocksize = BLOWFISH_BLOCKSIZE;
- const int context_size = sizeof(BLOWFISH_context);
-
- return _gcry_selftest_helper_cbc("BLOWFISH", &bf_setkey,
- &encrypt_block, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for BLOWFISH-CFB, tests bulk CBC decryption.
- Returns NULL on success. */
-static const char *
-selftest_cfb (void)
-{
- const int nblocks = 4+2;
- const int blocksize = BLOWFISH_BLOCKSIZE;
- const int context_size = sizeof(BLOWFISH_context);
-
- return _gcry_selftest_helper_cfb("BLOWFISH", &bf_setkey,
- &encrypt_block, nblocks, blocksize, context_size);
-}
-
-
static const char*
selftest(void)
{
@@ -911,7 +868,6 @@ selftest(void)
{ 0x41, 0x79, 0x6E, 0xA0, 0x52, 0x61, 0x6E, 0xE4 };
static const byte cipher3[] =
{ 0xE1, 0x13, 0xF4, 0x10, 0x2C, 0xFC, 0xCE, 0x43 };
- const char *r;
bf_setkey( (void *) &c,
(const unsigned char*)"abcdefghijklmnopqrstuvwxyz", 26,
@@ -931,15 +887,6 @@ selftest(void)
if( memcmp( buffer, plain3, 8 ) )
return "Blowfish selftest failed (4).";
- if ( (r = selftest_cbc ()) )
- return r;
-
- if ( (r = selftest_cfb ()) )
- return r;
-
- if ( (r = selftest_ctr ()) )
- return r;
-
return NULL;
}
diff --git a/cipher/bulkhelp.h b/cipher/bulkhelp.h
new file mode 100644
index 00000000..8c322ede
--- /dev/null
+++ b/cipher/bulkhelp.h
@@ -0,0 +1,425 @@
+/* bulkhelp.h - Some bulk processing helpers
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef GCRYPT_BULKHELP_H
+#define GCRYPT_BULKHELP_H
+
+
+#include "g10lib.h"
+#include "cipher-internal.h"
+
+
+#ifdef __x86_64__
+/* Use u64 to store pointers for x32 support (assembly function assumes
+ * 64-bit pointers). */
+typedef u64 ocb_L_uintptr_t;
+#else
+typedef uintptr_t ocb_L_uintptr_t;
+#endif
+
+typedef unsigned int (*bulk_crypt_fn_t) (const void *ctx, byte *out,
+ const byte *in,
+ unsigned int num_blks);
+
+
+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk64 (gcry_cipher_hd_t c,
+ ocb_L_uintptr_t Ls[64], u64 blkn)
+{
+ unsigned int n = 64 - (blkn % 64);
+ unsigned int i;
+
+ for (i = 0; i < 64; i += 8)
+ {
+ Ls[(i + 0 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ Ls[(15 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+ Ls[(23 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ Ls[(31 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[5];
+ Ls[(39 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ Ls[(47 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+ Ls[(55 + n) % 64] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ return &Ls[(63 + n) % 64];
+}
+
+
+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk32 (gcry_cipher_hd_t c,
+ ocb_L_uintptr_t Ls[32], u64 blkn)
+{
+ unsigned int n = 32 - (blkn % 32);
+ unsigned int i;
+
+ for (i = 0; i < 32; i += 8)
+ {
+ Ls[(i + 0 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ Ls[(15 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
+ Ls[(23 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ return &Ls[(31 + n) % 32];
+}
+
+
+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk16 (gcry_cipher_hd_t c,
+ ocb_L_uintptr_t Ls[16], u64 blkn)
+{
+ unsigned int n = 16 - (blkn % 16);
+ unsigned int i;
+
+ for (i = 0; i < 16; i += 8)
+ {
+ Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ }
+
+ Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+ return &Ls[(15 + n) % 16];
+}
+
+
+static inline ocb_L_uintptr_t *
+bulk_ocb_prepare_L_pointers_array_blk8 (gcry_cipher_hd_t c,
+ ocb_L_uintptr_t Ls[8], u64 blkn)
+{
+ unsigned int n = 8 - (blkn % 8);
+
+ Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
+ Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
+ Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ Ls[(7 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
+
+ return &Ls[(7 + n) % 8];
+}
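
(The fixed assignment patterns in the helpers above encode OCB's offset update Offset_i = Offset_{i-1} xor L_{ntz(i)}: within an aligned run of eight blocks the needed L indices follow the ruler sequence 0, 1, 0, 2, 0, 1, 0, 3, which is why only slot 7 — and slots 15/23/31 in the larger tables — needs anything above L[2]. A small standalone C check of that correspondence; ntz64 is a naive helper written here purely for illustration and is not part of the patch.)

#include <stdio.h>

/* Naive trailing-zero count of a non-zero 64-bit value; illustration only. */
static unsigned int
ntz64 (unsigned long long x)
{
  unsigned int n = 0;

  while ((x & 1) == 0)
    {
      x >>= 1;
      n++;
    }
  return n;
}

int
main (void)
{
  unsigned long long blkn = 0;  /* blocks processed before this chunk */
  unsigned long long i;

  /* The i-th block of the chunk has global number blkn + i and uses array
   * entry i - 1, which the helper points at L[ntz(blkn + i)].  With
   * blkn = 0 this prints "0 1 0 2 0 1 0 3", matching the explicit
   * assignments in bulk_ocb_prepare_L_pointers_array_blk8 above. */
  for (i = 1; i <= 8; i++)
    printf ("%u ", ntz64 (blkn + i));
  printf ("\n");
  return 0;
}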
+
+
+static inline unsigned int
+bulk_ctr_enc_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+ const byte *inbuf, size_t nblocks, byte *ctr,
+ byte *tmpbuf, size_t tmpbuf_nblocks,
+ unsigned int *num_used_tmpblocks)
+{
+ unsigned int tmp_used = 16;
+ unsigned int burn_depth = 0;
+ unsigned int nburn;
+
+ while (nblocks >= 1)
+ {
+ size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+ size_t i;
+
+ if (curr_blks * 16 > tmp_used)
+ tmp_used = curr_blks * 16;
+
+ cipher_block_cpy (tmpbuf + 0 * 16, ctr, 16);
+ for (i = 1; i < curr_blks; i++)
+ {
+ cipher_block_cpy (&tmpbuf[i * 16], ctr, 16);
+ cipher_block_add (&tmpbuf[i * 16], i, 16);
+ }
+ cipher_block_add (ctr, curr_blks, 16);
+
+ nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks);
+ burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ cipher_block_xor (outbuf, &tmpbuf[i * 16], inbuf, 16);
+ outbuf += 16;
+ inbuf += 16;
+ }
+
+ nblocks -= curr_blks;
+ }
+
+ *num_used_tmpblocks = tmp_used;
+ return burn_depth;
+}
+
+
+static inline unsigned int
+bulk_cbc_dec_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+ const byte *inbuf, size_t nblocks, byte *iv,
+ byte *tmpbuf, size_t tmpbuf_nblocks,
+ unsigned int *num_used_tmpblocks)
+{
+ unsigned int tmp_used = 16;
+ unsigned int burn_depth = 0;
+ unsigned int nburn;
+
+ while (nblocks >= 1)
+ {
+ size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+ size_t i;
+
+ if (curr_blks * 16 > tmp_used)
+ tmp_used = curr_blks * 16;
+
+ nburn = crypt_fn (priv, tmpbuf, inbuf, curr_blks);
+ burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ cipher_block_xor_n_copy_2(outbuf, &tmpbuf[i * 16], iv, inbuf, 16);
+ outbuf += 16;
+ inbuf += 16;
+ }
+
+ nblocks -= curr_blks;
+ }
+
+ *num_used_tmpblocks = tmp_used;
+ return burn_depth;
+}
+
+
+static inline unsigned int
+bulk_cfb_dec_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+ const byte *inbuf, size_t nblocks, byte *iv,
+ byte *tmpbuf, size_t tmpbuf_nblocks,
+ unsigned int *num_used_tmpblocks)
+{
+ unsigned int tmp_used = 16;
+ unsigned int burn_depth = 0;
+ unsigned int nburn;
+
+ while (nblocks >= 1)
+ {
+ size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+ size_t i;
+
+ if (curr_blks * 16 > tmp_used)
+ tmp_used = curr_blks * 16;
+
+ cipher_block_cpy (&tmpbuf[0 * 16], iv, 16);
+ if (curr_blks > 1)
+ memcpy (&tmpbuf[1 * 16], &inbuf[(1 - 1) * 16], 16 * curr_blks - 16);
+ cipher_block_cpy (iv, &inbuf[(curr_blks - 1) * 16], 16);
+
+ nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks);
+ burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ cipher_block_xor (outbuf, inbuf, &tmpbuf[i * 16], 16);
+ outbuf += 16;
+ inbuf += 16;
+ }
+
+ nblocks -= curr_blks;
+ }
+
+ *num_used_tmpblocks = tmp_used;
+ return burn_depth;
+}
+
+
+static inline unsigned int
+bulk_ocb_crypt_128 (gcry_cipher_hd_t c, void *priv, bulk_crypt_fn_t crypt_fn,
+ byte *outbuf, const byte *inbuf, size_t nblocks, u64 *blkn,
+ int encrypt, byte *tmpbuf, size_t tmpbuf_nblocks,
+ unsigned int *num_used_tmpblocks)
+{
+ unsigned int tmp_used = 16;
+ unsigned int burn_depth = 0;
+ unsigned int nburn;
+
+ while (nblocks >= 1)
+ {
+ size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+ size_t i;
+
+ if (curr_blks * 16 > tmp_used)
+ tmp_used = curr_blks * 16;
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ const unsigned char *l = ocb_get_l(c, ++*blkn);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ if (encrypt)
+ cipher_block_xor_1(c->u_ctr.ctr, &inbuf[i * 16], 16);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ cipher_block_xor_2dst (&tmpbuf[i * 16], c->u_iv.iv, l, 16);
+ cipher_block_xor (&outbuf[i * 16], &inbuf[i * 16],
+ c->u_iv.iv, 16);
+ }
+
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ nburn = crypt_fn (priv, outbuf, outbuf, curr_blks);
+ burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ cipher_block_xor_1 (&outbuf[i * 16], &tmpbuf[i * 16], 16);
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ if (!encrypt)
+ cipher_block_xor_1(c->u_ctr.ctr, &outbuf[i * 16], 16);
+ }
+
+ outbuf += curr_blks * 16;
+ inbuf += curr_blks * 16;
+ nblocks -= curr_blks;
+ }
+
+ *num_used_tmpblocks = tmp_used;
+ return burn_depth;
+}
+
+
+static inline unsigned int
+bulk_ocb_auth_128 (gcry_cipher_hd_t c, void *priv, bulk_crypt_fn_t crypt_fn,
+ const byte *abuf, size_t nblocks, u64 *blkn, byte *tmpbuf,
+ size_t tmpbuf_nblocks, unsigned int *num_used_tmpblocks)
+{
+ unsigned int tmp_used = 16;
+ unsigned int burn_depth = 0;
+ unsigned int nburn;
+
+ while (nblocks >= 1)
+ {
+ size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+ size_t i;
+
+ if (curr_blks * 16 > tmp_used)
+ tmp_used = curr_blks * 16;
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ const unsigned char *l = ocb_get_l(c, ++*blkn);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ cipher_block_xor_2dst (&tmpbuf[i * 16],
+ c->u_mode.ocb.aad_offset, l, 16);
+ cipher_block_xor_1 (&tmpbuf[i * 16], &abuf[i * 16], 16);
+ }
+
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ nburn = crypt_fn (priv, tmpbuf, tmpbuf, curr_blks);
+ burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ cipher_block_xor_1 (c->u_mode.ocb.aad_sum, &tmpbuf[i * 16], 16);
+ }
+
+ abuf += curr_blks * 16;
+ nblocks -= curr_blks;
+ }
+
+ *num_used_tmpblocks = tmp_used;
+ return burn_depth;
+}
+
+
+static inline unsigned int
+bulk_xts_crypt_128 (void *priv, bulk_crypt_fn_t crypt_fn, byte *outbuf,
+ const byte *inbuf, size_t nblocks, byte *tweak,
+ byte *tmpbuf, size_t tmpbuf_nblocks,
+ unsigned int *num_used_tmpblocks)
+{
+ u64 tweak_lo, tweak_hi, tweak_next_lo, tweak_next_hi, tmp_lo, tmp_hi, carry;
+ unsigned int tmp_used = 16;
+ unsigned int burn_depth = 0;
+ unsigned int nburn;
+
+ tweak_next_lo = buf_get_le64 (tweak + 0);
+ tweak_next_hi = buf_get_le64 (tweak + 8);
+
+ while (nblocks >= 1)
+ {
+ size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
+ size_t i;
+
+ if (curr_blks * 16 > tmp_used)
+ tmp_used = curr_blks * 16;
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ tweak_lo = tweak_next_lo;
+ tweak_hi = tweak_next_hi;
+
+ /* Generate next tweak. */
+ carry = -(tweak_next_hi >> 63) & 0x87;
+ tweak_next_hi = (tweak_next_hi << 1) + (tweak_next_lo >> 63);
+ tweak_next_lo = (tweak_next_lo << 1) ^ carry;
+
+ /* Xor-Encrypt/Decrypt-Xor block. */
+ tmp_lo = buf_get_le64 (inbuf + i * 16 + 0) ^ tweak_lo;
+ tmp_hi = buf_get_le64 (inbuf + i * 16 + 8) ^ tweak_hi;
+ buf_put_he64 (&tmpbuf[i * 16 + 0], tweak_lo);
+ buf_put_he64 (&tmpbuf[i * 16 + 8], tweak_hi);
+ buf_put_le64 (outbuf + i * 16 + 0, tmp_lo);
+ buf_put_le64 (outbuf + i * 16 + 8, tmp_hi);
+ }
+
+ nburn = crypt_fn (priv, outbuf, outbuf, curr_blks);
+ burn_depth = nburn > burn_depth ? nburn : burn_depth;
+
+ for (i = 0; i < curr_blks; i++)
+ {
+ /* Xor-Encrypt/Decrypt-Xor block. */
+ tweak_lo = buf_get_he64 (&tmpbuf[i * 16 + 0]);
+ tweak_hi = buf_get_he64 (&tmpbuf[i * 16 + 8]);
+ tmp_lo = buf_get_le64 (outbuf + i * 16 + 0) ^ tweak_lo;
+ tmp_hi = buf_get_le64 (outbuf + i * 16 + 8) ^ tweak_hi;
+ buf_put_le64 (outbuf + i * 16 + 0, tmp_lo);
+ buf_put_le64 (outbuf + i * 16 + 8, tmp_hi);
+ }
+
+ inbuf += curr_blks * 16;
+ outbuf += curr_blks * 16;
+ nblocks -= curr_blks;
+ }
+
+ buf_put_le64 (tweak + 0, tweak_next_lo);
+ buf_put_le64 (tweak + 8, tweak_next_hi);
+
+ *num_used_tmpblocks = tmp_used;
+ return burn_depth;
+}
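
(The tweak update above is multiplication by x in GF(2^128) with the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1: the 128-bit tweak is shifted left by one and, whenever the bit shifted out of the high half was set, the constant 0x87 is folded back into the low byte. A standalone C rendition of the same update, using little-endian 64-bit halves as in the code above; illustrative only.)

#include <stdint.h>
#include <stdio.h>

/* Multiply a 128-bit XTS tweak (two little-endian 64-bit halves) by x in
 * GF(2^128), exactly as the tweak-generation step in bulk_xts_crypt_128. */
static void
xts_gfmul_x (uint64_t *lo, uint64_t *hi)
{
  uint64_t carry = -(*hi >> 63) & 0x87;  /* 0x87 <-> x^7 + x^2 + x + 1 */

  *hi = (*hi << 1) + (*lo >> 63);        /* shift high half, pull in low bit 63 */
  *lo = (*lo << 1) ^ carry;              /* shift low half, fold reduction in */
}

int
main (void)
{
  uint64_t lo = 1, hi = 0;               /* tweak = 1 */
  int i;

  /* After 128 doublings the top bit has wrapped once through the reduction
   * polynomial, so the result is x^128 mod p = 0x87. */
  for (i = 0; i < 128; i++)
    xts_gfmul_x (&lo, &hi);
  printf ("x^128 mod p = %016llx%016llx\n",
          (unsigned long long)hi, (unsigned long long)lo);  /* ...0087 */
  return 0;
}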
+
+
+#endif /*GCRYPT_BULKHELP_H*/
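
(For callers, the intended pattern — modelled on how the camellia and SM4 glue code updated by this commit drives these helpers — is: reserve a temporary block buffer on the stack, hand the cipher's multi-block assembly routine to the helper as a bulk_crypt_fn_t, then wipe however much of the buffer was actually used. A hedged sketch; mycipher_ctx_t and mycipher_enc_blk16 are placeholder names, not real symbols.)

/* The cipher's bulk primitive: encrypts 1..16 blocks per call and returns
 * the stack burn depth.  Hypothetical; shape matches bulk_crypt_fn_t. */
extern unsigned int mycipher_enc_blk16 (const void *ctx, byte *out,
                                        const byte *in, unsigned int num_blks);

/* Sketch of a CTR bulk-encryption glue function built on bulk_ctr_enc_128. */
static void
mycipher_ctr_enc (void *context, unsigned char *ctr,
                  void *outbuf_arg, const void *inbuf_arg, size_t nblocks)
{
  void *ctx = context;
  byte *outbuf = outbuf_arg;
  const byte *inbuf = inbuf_arg;
  byte tmpbuf[16 * 16];          /* room for 16 blocks of 16 bytes */
  unsigned int tmp_used = 16;
  unsigned int burn_stack_depth;

  /* bulk_ctr_enc_128 builds counter blocks in tmpbuf, encrypts them with
   * the supplied routine and XORs the keystream onto the input. */
  burn_stack_depth = bulk_ctr_enc_128 (ctx, mycipher_enc_blk16,
                                       outbuf, inbuf, nblocks, ctr,
                                       tmpbuf, sizeof(tmpbuf) / 16,
                                       &tmp_used);

  wipememory (tmpbuf, tmp_used);
  if (burn_stack_depth)
    _gcry_burn_stack (burn_stack_depth);
}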
diff --git a/cipher/camellia-aarch64.S b/cipher/camellia-aarch64.S
index 30b568d3..c019c168 100644
--- a/cipher/camellia-aarch64.S
+++ b/cipher/camellia-aarch64.S
@@ -214,7 +214,7 @@ _gcry_camellia_arm_encrypt_block:
* w3: keybitlen
*/
- adr RTAB1, _gcry_camellia_arm_tables;
+ GET_LOCAL_POINTER(RTAB1, _gcry_camellia_arm_tables);
mov RMASK, #(0xff<<4); /* byte mask */
add RTAB2, RTAB1, #(1 * 4);
add RTAB3, RTAB1, #(2 * 4);
@@ -274,7 +274,7 @@ _gcry_camellia_arm_decrypt_block:
* w3: keybitlen
*/
- adr RTAB1, _gcry_camellia_arm_tables;
+ GET_LOCAL_POINTER(RTAB1, _gcry_camellia_arm_tables);
mov RMASK, #(0xff<<4); /* byte mask */
add RTAB2, RTAB1, #(1 * 4);
add RTAB3, RTAB1, #(2 * 4);
diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h
index e93c40b8..411e790f 100644
--- a/cipher/camellia-aesni-avx2-amd64.h
+++ b/cipher/camellia-aesni-avx2-amd64.h
@@ -1,6 +1,6 @@
-/* camellia-aesni-avx2-amd64.h - AES-NI/VAES/AVX2 implementation of Camellia
+/* camellia-aesni-avx2-amd64.h - AES-NI/VAES/GFNI/AVX2 implementation of Camellia
*
- * Copyright (C) 2013-2015,2020-2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ * Copyright (C) 2013-2015,2020-2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -36,6 +36,8 @@
/**********************************************************************
helper macros
**********************************************************************/
+
+#ifndef CAMELLIA_GFNI_BUILD
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
vpand x, mask4bit, tmp0; \
vpandn x, mask4bit, x; \
@@ -44,6 +46,7 @@
vpshufb tmp0, lo_t, tmp0; \
vpshufb x, hi_t, x; \
vpxor tmp0, x, x;
+#endif
#define ymm0_x xmm0
#define ymm1_x xmm1
@@ -71,10 +74,60 @@
#endif
/**********************************************************************
+ GFNI helper macros and constants
+ **********************************************************************/
+
+#ifdef CAMELLIA_GFNI_BUILD
+
+#define BV8(a0,a1,a2,a3,a4,a5,a6,a7) \
+ ( (((a0) & 1) << 0) | \
+ (((a1) & 1) << 1) | \
+ (((a2) & 1) << 2) | \
+ (((a3) & 1) << 3) | \
+ (((a4) & 1) << 4) | \
+ (((a5) & 1) << 5) | \
+ (((a6) & 1) << 6) | \
+ (((a7) & 1) << 7) )
+
+#define BM8X8(l0,l1,l2,l3,l4,l5,l6,l7) \
+ ( ((l7) << (0 * 8)) | \
+ ((l6) << (1 * 8)) | \
+ ((l5) << (2 * 8)) | \
+ ((l4) << (3 * 8)) | \
+ ((l3) << (4 * 8)) | \
+ ((l2) << (5 * 8)) | \
+ ((l1) << (6 * 8)) | \
+ ((l0) << (7 * 8)) )
+
+/* Pre-filters and post-filters constants for Camellia sboxes s1, s2, s3 and s4.
+ * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Constant from "θ₁(x)" and "θ₄(x)" functions. */
+#define pre_filter_constant_s1234 BV8(1, 0, 1, 0, 0, 0, 1, 0)
+
+/* Constant from "ψ₁(A(x))" function: */
+#define post_filter_constant_s14 BV8(0, 1, 1, 1, 0, 1, 1, 0)
+
+/* Constant from "ψ₂(A(x))" function: */
+#define post_filter_constant_s2 BV8(0, 0, 1, 1, 1, 0, 1, 1)
+
+/* Constant from "ψ₃(A(x))" function: */
+#define post_filter_constant_s3 BV8(1, 1, 1, 0, 1, 1, 0, 0)
+
+#endif /* CAMELLIA_GFNI_BUILD */
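
(BV8 packs eight bit values into a byte, least significant bit first, and BM8X8 packs eight such rows into the 64-bit bit-matrix operand that vgf2p8affineqb/vgf2p8affineinvqb expect, with row 0 in the most significant byte. A tiny compile-time check of the packing, using the s1/s2/s3/s4 pre-filter constant as the example; the macro body is restated from above purely for illustration.)

/* Same packing macro as in the assembler header, restated in C so the
 * constant value can be verified at compile time. */
#define BV8(a0,a1,a2,a3,a4,a5,a6,a7) \
  ( (((a0) & 1) << 0) | (((a1) & 1) << 1) | (((a2) & 1) << 2) | \
    (((a3) & 1) << 3) | (((a4) & 1) << 4) | (((a5) & 1) << 5) | \
    (((a6) & 1) << 6) | (((a7) & 1) << 7) )

/* pre_filter_constant_s1234 = BV8(1, 0, 1, 0, 0, 0, 1, 0)
 *                           = (1 << 0) | (1 << 2) | (1 << 6) = 0x45,
 * i.e. the byte XORed into every lane after the bit-matrix multiply. */
typedef char bv8_check[BV8(1, 0, 1, 0, 0, 0, 1, 0) == 0x45 ? 1 : -1];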
+
+/**********************************************************************
32-way camellia
**********************************************************************/
-/*
+#ifdef CAMELLIA_GFNI_BUILD
+
+/* roundsm32 (GFNI version)
* IN:
* x0..x7: byte-sliced AB state
* mem_cd: register pointer storing CD state
@@ -82,7 +135,119 @@
* OUT:
* x0..x7: new byte-sliced CD state
*/
+#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \
+ t6, t7, mem_cd, key) \
+ /* \
+ * S-function with AES subbytes \
+ */ \
+ vpbroadcastq .Lpre_filter_bitmatrix_s123 rRIP, t5; \
+ vpbroadcastq .Lpre_filter_bitmatrix_s4 rRIP, t2; \
+ vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \
+ vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \
+ vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \
+ vpxor t7##_x, t7##_x, t7##_x; \
+ vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
+ \
+ /* prefilter sboxes */ \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x7, x7; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t2, x3, x3; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t2, x6, x6; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x2, x2; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x5, x5; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x1, x1; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x4, x4; \
+ \
+ /* sbox GF8 inverse + postfilter sboxes 1 and 4 */ \
+ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x0, x0; \
+ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x7, x7; \
+ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x3, x3; \
+ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x6, x6; \
+ \
+ /* sbox GF8 inverse + postfilter sbox 3 */ \
+ vgf2p8affineinvqb $(post_filter_constant_s3), t6, x2, x2; \
+ vgf2p8affineinvqb $(post_filter_constant_s3), t6, x5, x5; \
+ \
+ /* sbox GF8 inverse + postfilter sbox 2 */ \
+ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \
+ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \
+ \
+ vpsrldq $1, t0, t1; \
+ vpsrldq $2, t0, t2; \
+ vpshufb t7, t1, t1; \
+ vpsrldq $3, t0, t3; \
+ \
+ /* P-function */ \
+ vpxor x5, x0, x0; \
+ vpxor x6, x1, x1; \
+ vpxor x7, x2, x2; \
+ vpxor x4, x3, x3; \
+ \
+ vpshufb t7, t2, t2; \
+ vpsrldq $4, t0, t4; \
+ vpshufb t7, t3, t3; \
+ vpsrldq $5, t0, t5; \
+ vpshufb t7, t4, t4; \
+ \
+ vpxor x2, x4, x4; \
+ vpxor x3, x5, x5; \
+ vpxor x0, x6, x6; \
+ vpxor x1, x7, x7; \
+ \
+ vpsrldq $6, t0, t6; \
+ vpshufb t7, t5, t5; \
+ vpshufb t7, t6, t6; \
+ \
+ vpxor x7, x0, x0; \
+ vpxor x4, x1, x1; \
+ vpxor x5, x2, x2; \
+ vpxor x6, x3, x3; \
+ \
+ vpxor x3, x4, x4; \
+ vpxor x0, x5, x5; \
+ vpxor x1, x6, x6; \
+ vpxor x2, x7, x7; /* note: high and low parts swapped */ \
+ \
+ /* Add key material and result to CD (x becomes new CD) */ \
+ \
+ vpxor t6, x1, x1; \
+ vpxor 5 * 32(mem_cd), x1, x1; \
+ \
+ vpsrldq $7, t0, t6; \
+ vpshufb t7, t0, t0; \
+ vpshufb t7, t6, t7; \
+ \
+ vpxor t7, x0, x0; \
+ vpxor 4 * 32(mem_cd), x0, x0; \
+ \
+ vpxor t5, x2, x2; \
+ vpxor 6 * 32(mem_cd), x2, x2; \
+ \
+ vpxor t4, x3, x3; \
+ vpxor 7 * 32(mem_cd), x3, x3; \
+ \
+ vpxor t3, x4, x4; \
+ vpxor 0 * 32(mem_cd), x4, x4; \
+ \
+ vpxor t2, x5, x5; \
+ vpxor 1 * 32(mem_cd), x5, x5; \
+ \
+ vpxor t1, x6, x6; \
+ vpxor 2 * 32(mem_cd), x6, x6; \
+ \
+ vpxor t0, x7, x7; \
+ vpxor 3 * 32(mem_cd), x7, x7;
+#else /* CAMELLIA_GFNI_BUILD */
+
+/* roundsm32 (AES-NI / VAES version)
+ * IN:
+ * x0..x7: byte-sliced AB state
+ * mem_cd: register pointer storing CD state
+ * key: index for key material
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \
t6, t7, mem_cd, key) \
/* \
@@ -181,7 +346,7 @@
/* postfilter sbox 2 */ \
filter_8bit(x1, t4, t5, t7, t2); \
filter_8bit(x4, t4, t5, t7, t2); \
- vpxor t7, t7, t7; \
+ vpxor t7##_x, t7##_x, t7##_x; \
\
vpsrldq $1, t0, t1; \
vpsrldq $2, t0, t2; \
@@ -249,6 +414,8 @@
vpxor t0, x7, x7; \
vpxor 3 * 32(mem_cd), x7, x7;
+#endif /* CAMELLIA_GFNI_BUILD */
+
/*
* IN/OUT:
* x0..x7: byte-sliced AB state preloaded
@@ -623,18 +790,88 @@
#define SHUFB_BYTES(idx) \
0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
-.Lshufb_16x16b:
- .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
- .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+FUNC_NAME(_constants):
+ELF(.type FUNC_NAME(_constants),@object;)
.Lpack_bswap:
.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+#ifdef CAMELLIA_GFNI_BUILD
+
+/* Pre-filters and post-filters bit-matrixes for Camellia sboxes s1, s2, s3
+ * and s4.
+ * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Bit-matrix from "θ₁(x)" function: */
+.Lpre_filter_bitmatrix_s123:
+ .quad BM8X8(BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(0, 0, 1, 1, 0, 0, 1, 0),
+ BV8(1, 1, 0, 1, 0, 0, 0, 0),
+ BV8(1, 0, 1, 1, 0, 0, 1, 1),
+ BV8(0, 0, 0, 0, 1, 1, 0, 0),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 0, 1, 0, 1, 1, 0, 0),
+ BV8(1, 0, 0, 0, 0, 1, 1, 0))
+
+/* Bit-matrix from "θ₄(x)" function: */
+.Lpre_filter_bitmatrix_s4:
+ .quad BM8X8(BV8(1, 1, 0, 1, 1, 0, 1, 1),
+ BV8(0, 1, 1, 0, 0, 1, 0, 0),
+ BV8(1, 0, 1, 0, 0, 0, 0, 1),
+ BV8(0, 1, 1, 0, 0, 1, 1, 1),
+ BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(0, 1, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 1, 0, 1))
+
+/* Bit-matrix from "ψ₁(A(x))" function: */
+.Lpost_filter_bitmatrix_s14:
+ .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+ BV8(0, 1, 1, 0, 0, 1, 1, 0),
+ BV8(1, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 0, 1, 1),
+ BV8(1, 0, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 0, 1, 1, 1, 1, 0),
+ BV8(0, 1, 1, 1, 1, 1, 1, 1),
+ BV8(0, 0, 0, 1, 1, 1, 0, 0))
+
+/* Bit-matrix from "ψ₂(A(x))" function: */
+.Lpost_filter_bitmatrix_s2:
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1),
+ BV8(0, 1, 1, 0, 0, 1, 1, 0),
+ BV8(1, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 0, 1, 1),
+ BV8(1, 0, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 0, 1, 1, 1, 1, 0),
+ BV8(0, 1, 1, 1, 1, 1, 1, 1))
+
+/* Bit-matrix from "ψ₃(A(x))" function: */
+.Lpost_filter_bitmatrix_s3:
+ .quad BM8X8(BV8(0, 1, 1, 0, 0, 1, 1, 0),
+ BV8(1, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 0, 1, 1),
+ BV8(1, 0, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 0, 1, 1, 1, 1, 0),
+ BV8(0, 1, 1, 1, 1, 1, 1, 1),
+ BV8(0, 0, 0, 1, 1, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+#else /* CAMELLIA_GFNI_BUILD */
+
/*
* pre-SubByte transform
*
@@ -756,11 +993,14 @@
.L0f0f0f0f:
.long 0x0f0f0f0f
+#endif /* CAMELLIA_GFNI_BUILD */
+
+ELF(.size FUNC_NAME(_constants),.-FUNC_NAME(_constants);)
.align 8
-ELF(.type __camellia_enc_blk32,@function;)
+ELF(.type FUNC_NAME(enc_blk32),@function;)
-__camellia_enc_blk32:
+FUNC_NAME(enc_blk32):
/* input:
* %rdi: ctx, CTX
* %rax: temporary storage, 512 bytes
@@ -817,19 +1057,19 @@ __camellia_enc_blk32:
ret_spec_stop;
CFI_ENDPROC();
-ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;)
+ELF(.size FUNC_NAME(enc_blk32),.-FUNC_NAME(enc_blk32);)
.align 8
-ELF(.type __camellia_dec_blk32,@function;)
+ELF(.type FUNC_NAME(dec_blk32),@function;)
-__camellia_dec_blk32:
+FUNC_NAME(dec_blk32):
/* input:
* %rdi: ctx, CTX
* %rax: temporary storage, 512 bytes
* %r8d: 24 for 16 byte key, 32 for larger
- * %ymm0..%ymm15: 16 encrypted blocks
+ * %ymm0..%ymm15: 32 encrypted blocks
* output:
- * %ymm0..%ymm15: 16 plaintext blocks, order swapped:
+ * %ymm0..%ymm15: 32 plaintext blocks, order swapped:
* 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
*/
CFI_STARTPROC();
@@ -882,7 +1122,7 @@ __camellia_dec_blk32:
ret_spec_stop;
CFI_ENDPROC();
-ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;)
+ELF(.size FUNC_NAME(dec_blk32),.-FUNC_NAME(dec_blk32);)
#define inc_le128(x, minus_one, tmp) \
vpcmpeqq minus_one, x, tmp; \
@@ -911,8 +1151,6 @@ FUNC_NAME(ctr_enc):
movq 8(%rcx), %r11;
bswapq %r11;
- vzeroupper;
-
cmpl $128, key_bitlength(CTX);
movl $32, %r8d;
movl $24, %eax;
@@ -1036,7 +1274,7 @@ FUNC_NAME(ctr_enc):
.align 4
.Lload_ctr_done:
- /* inpack16_pre: */
+ /* inpack32_pre: */
vpbroadcastq (key_table)(CTX), %ymm15;
vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
vpxor %ymm0, %ymm15, %ymm0;
@@ -1056,7 +1294,7 @@ FUNC_NAME(ctr_enc):
vpxor 14 * 32(%rax), %ymm15, %ymm14;
vpxor 15 * 32(%rax), %ymm15, %ymm15;
- call __camellia_enc_blk32;
+ call FUNC_NAME(enc_blk32);
vpxor 0 * 32(%rdx), %ymm7, %ymm7;
vpxor 1 * 32(%rdx), %ymm6, %ymm6;
@@ -1074,7 +1312,6 @@ FUNC_NAME(ctr_enc):
vpxor 13 * 32(%rdx), %ymm10, %ymm10;
vpxor 14 * 32(%rdx), %ymm9, %ymm9;
vpxor 15 * 32(%rdx), %ymm8, %ymm8;
- leaq 32 * 16(%rdx), %rdx;
write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
%ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
@@ -1106,8 +1343,6 @@ FUNC_NAME(cbc_dec):
movq %rsp, %rbp;
CFI_DEF_CFA_REGISTER(%rbp);
- vzeroupper;
-
movq %rcx, %r9;
cmpl $128, key_bitlength(CTX);
@@ -1123,7 +1358,7 @@ FUNC_NAME(cbc_dec):
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rdx, (key_table)(CTX, %r8, 8));
- call __camellia_dec_blk32;
+ call FUNC_NAME(dec_blk32);
/* XOR output with IV */
vmovdqu %ymm8, (%rax);
@@ -1183,8 +1418,6 @@ FUNC_NAME(cfb_dec):
movq %rsp, %rbp;
CFI_DEF_CFA_REGISTER(%rbp);
- vzeroupper;
-
cmpl $128, key_bitlength(CTX);
movl $32, %r8d;
movl $24, %eax;
@@ -1194,7 +1427,7 @@ FUNC_NAME(cfb_dec):
andq $~63, %rsp;
movq %rsp, %rax;
- /* inpack16_pre: */
+ /* inpack32_pre: */
vpbroadcastq (key_table)(CTX), %ymm0;
vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0;
vmovdqu (%rcx), %xmm15;
@@ -1218,7 +1451,7 @@ FUNC_NAME(cfb_dec):
vpxor (13 * 32 + 16)(%rdx), %ymm0, %ymm1;
vpxor (14 * 32 + 16)(%rdx), %ymm0, %ymm0;
- call __camellia_enc_blk32;
+ call FUNC_NAME(enc_blk32);
vpxor 0 * 32(%rdx), %ymm7, %ymm7;
vpxor 1 * 32(%rdx), %ymm6, %ymm6;
@@ -1269,8 +1502,6 @@ FUNC_NAME(ocb_enc):
movq %rsp, %rbp;
CFI_DEF_CFA_REGISTER(%rbp);
- vzeroupper;
-
subq $(16 * 32 + 4 * 8), %rsp;
andq $~63, %rsp;
movq %rsp, %rax;
@@ -1363,7 +1594,7 @@ FUNC_NAME(ocb_enc):
movl $24, %r10d;
cmovel %r10d, %r8d; /* max */
- /* inpack16_pre: */
+ /* inpack32_pre: */
vpbroadcastq (key_table)(CTX), %ymm15;
vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
vpxor %ymm0, %ymm15, %ymm0;
@@ -1383,7 +1614,7 @@ FUNC_NAME(ocb_enc):
vpxor 14 * 32(%rax), %ymm15, %ymm14;
vpxor 15 * 32(%rax), %ymm15, %ymm15;
- call __camellia_enc_blk32;
+ call FUNC_NAME(enc_blk32);
vpxor 0 * 32(%rsi), %ymm7, %ymm7;
vpxor 1 * 32(%rsi), %ymm6, %ymm6;
@@ -1443,8 +1674,6 @@ FUNC_NAME(ocb_dec):
movq %rsp, %rbp;
CFI_DEF_CFA_REGISTER(%rbp);
- vzeroupper;
-
subq $(16 * 32 + 4 * 8), %rsp;
andq $~63, %rsp;
movq %rsp, %rax;
@@ -1532,7 +1761,7 @@ FUNC_NAME(ocb_dec):
movl $24, %r9d;
cmovel %r9d, %r8d; /* max */
- /* inpack16_pre: */
+ /* inpack32_pre: */
vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
vpxor %ymm0, %ymm15, %ymm0;
@@ -1552,7 +1781,7 @@ FUNC_NAME(ocb_dec):
vpxor 14 * 32(%rax), %ymm15, %ymm14;
vpxor 15 * 32(%rax), %ymm15, %ymm15;
- call __camellia_dec_blk32;
+ call FUNC_NAME(dec_blk32);
vpxor 0 * 32(%rsi), %ymm7, %ymm7;
vpxor 1 * 32(%rsi), %ymm6, %ymm6;
@@ -1639,8 +1868,6 @@ FUNC_NAME(ocb_auth):
movq %rsp, %rbp;
CFI_DEF_CFA_REGISTER(%rbp);
- vzeroupper;
-
subq $(16 * 32 + 4 * 8), %rsp;
andq $~63, %rsp;
movq %rsp, %rax;
@@ -1728,7 +1955,7 @@ FUNC_NAME(ocb_auth):
movq %rcx, %r10;
- /* inpack16_pre: */
+ /* inpack32_pre: */
vpbroadcastq (key_table)(CTX), %ymm15;
vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15;
vpxor %ymm0, %ymm15, %ymm0;
@@ -1748,7 +1975,7 @@ FUNC_NAME(ocb_auth):
vpxor 14 * 32(%rax), %ymm15, %ymm14;
vpxor 15 * 32(%rax), %ymm15, %ymm15;
- call __camellia_enc_blk32;
+ call FUNC_NAME(enc_blk32);
vpxor %ymm7, %ymm6, %ymm6;
vpxor %ymm5, %ymm4, %ymm4;
@@ -1791,4 +2018,201 @@ FUNC_NAME(ocb_auth):
CFI_ENDPROC();
ELF(.size FUNC_NAME(ocb_auth),.-FUNC_NAME(ocb_auth);)
+.align 8
+.globl FUNC_NAME(enc_blk1_32)
+ELF(.type FUNC_NAME(enc_blk1_32),@function;)
+
+FUNC_NAME(enc_blk1_32):
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %ecx: nblocks (1 to 32)
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ movl %ecx, %r9d;
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ subq $(16 * 32), %rsp;
+ andq $~63, %rsp;
+ movq %rsp, %rax;
+
+ cmpl $31, %ecx;
+ vpxor %xmm0, %xmm0, %xmm0;
+ ja 1f;
+ jb 2f;
+ vmovdqu 15 * 32(%rdx), %xmm0;
+ jmp 2f;
+ 1:
+ vmovdqu 15 * 32(%rdx), %ymm0;
+ 2:
+ vmovdqu %ymm0, (%rax);
+
+ vpbroadcastq (key_table)(CTX), %ymm0;
+ vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0;
+
+#define LOAD_INPUT(offset, ymm) \
+ cmpl $(1 + 2 * (offset)), %ecx; \
+ jb 2f; \
+ ja 1f; \
+ vmovdqu (offset) * 32(%rdx), %ymm##_x; \
+ vpxor %ymm0, %ymm, %ymm; \
+ jmp 2f; \
+ 1: \
+ vpxor (offset) * 32(%rdx), %ymm0, %ymm;
+
+ LOAD_INPUT(0, ymm15);
+ LOAD_INPUT(1, ymm14);
+ LOAD_INPUT(2, ymm13);
+ LOAD_INPUT(3, ymm12);
+ LOAD_INPUT(4, ymm11);
+ LOAD_INPUT(5, ymm10);
+ LOAD_INPUT(6, ymm9);
+ LOAD_INPUT(7, ymm8);
+ LOAD_INPUT(8, ymm7);
+ LOAD_INPUT(9, ymm6);
+ LOAD_INPUT(10, ymm5);
+ LOAD_INPUT(11, ymm4);
+ LOAD_INPUT(12, ymm3);
+ LOAD_INPUT(13, ymm2);
+ LOAD_INPUT(14, ymm1);
+ vpxor (%rax), %ymm0, %ymm0;
+
+2:
+ call FUNC_NAME(enc_blk32);
+
+#define STORE_OUTPUT(ymm, offset) \
+ cmpl $(1 + 2 * (offset)), %r9d; \
+ jb 2f; \
+ ja 1f; \
+ vmovdqu %ymm##_x, (offset) * 32(%rsi); \
+ jmp 2f; \
+ 1: \
+ vmovdqu %ymm, (offset) * 32(%rsi);
+
+ STORE_OUTPUT(ymm7, 0);
+ STORE_OUTPUT(ymm6, 1);
+ STORE_OUTPUT(ymm5, 2);
+ STORE_OUTPUT(ymm4, 3);
+ STORE_OUTPUT(ymm3, 4);
+ STORE_OUTPUT(ymm2, 5);
+ STORE_OUTPUT(ymm1, 6);
+ STORE_OUTPUT(ymm0, 7);
+ STORE_OUTPUT(ymm15, 8);
+ STORE_OUTPUT(ymm14, 9);
+ STORE_OUTPUT(ymm13, 10);
+ STORE_OUTPUT(ymm12, 11);
+ STORE_OUTPUT(ymm11, 12);
+ STORE_OUTPUT(ymm10, 13);
+ STORE_OUTPUT(ymm9, 14);
+ STORE_OUTPUT(ymm8, 15);
+
+2:
+ vzeroall;
+
+ leave;
+ CFI_LEAVE();
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size FUNC_NAME(enc_blk1_32),.-FUNC_NAME(enc_blk1_32);)
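
(Each ymm register in enc_blk1_32/dec_blk1_32 carries two 128-bit blocks, so LOAD_INPUT and STORE_OUTPUT compare nblocks against 1 + 2*offset: below it nothing more is loaded and control jumps straight to the cipher call, equal means only the low 128-bit half — one block — is touched via the _x alias, and above it the full 256-bit register is used before the next offset is examined. A rough C restatement of that per-register decision; register_usage is a name invented here for illustration.)

#include <stdio.h>

typedef enum { SKIP_REG, HALF_REG, FULL_REG } reg_use_t;

/* ymm register 'offset' covers blocks 2*offset and 2*offset + 1 of a
 * 1..32 block request. */
static reg_use_t
register_usage (unsigned int nblocks, unsigned int offset)
{
  if (nblocks < 1 + 2 * offset)
    return SKIP_REG;   /* jb 2f: nothing left, jump straight to the call */
  else if (nblocks == 1 + 2 * offset)
    return HALF_REG;   /* fall-through: only the low 128 bits (one block) */
  else
    return FULL_REG;   /* ja 1f: both blocks present, full 256-bit access */
}

int
main (void)
{
  static const char *name[] = { "skip", "half", "full" };
  unsigned int offset;

  /* For a 5-block request, registers 0 and 1 are used fully (blocks 0..3),
   * register 2 only half (block 4), and registers 3..15 are skipped. */
  for (offset = 0; offset < 16; offset++)
    printf ("reg %2u: %s\n", offset, name[register_usage (5, offset)]);
  return 0;
}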
+
+.align 8
+.globl FUNC_NAME(dec_blk1_32)
+ELF(.type FUNC_NAME(dec_blk1_32),@function;)
+
+FUNC_NAME(dec_blk1_32):
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %ecx: nblocks (1 to 32)
+ */
+ CFI_STARTPROC();
+
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+
+ movl %ecx, %r9d;
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ subq $(16 * 32), %rsp;
+ andq $~63, %rsp;
+ movq %rsp, %rax;
+
+ cmpl $31, %ecx;
+ vpxor %xmm0, %xmm0, %xmm0;
+ ja 1f;
+ jb 2f;
+ vmovdqu 15 * 32(%rdx), %xmm0;
+ jmp 2f;
+ 1:
+ vmovdqu 15 * 32(%rdx), %ymm0;
+ 2:
+ vmovdqu %ymm0, (%rax);
+
+ vpbroadcastq (key_table)(CTX, %r8, 8), %ymm0;
+ vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0;
+
+ LOAD_INPUT(0, ymm15);
+ LOAD_INPUT(1, ymm14);
+ LOAD_INPUT(2, ymm13);
+ LOAD_INPUT(3, ymm12);
+ LOAD_INPUT(4, ymm11);
+ LOAD_INPUT(5, ymm10);
+ LOAD_INPUT(6, ymm9);
+ LOAD_INPUT(7, ymm8);
+ LOAD_INPUT(8, ymm7);
+ LOAD_INPUT(9, ymm6);
+ LOAD_INPUT(10, ymm5);
+ LOAD_INPUT(11, ymm4);
+ LOAD_INPUT(12, ymm3);
+ LOAD_INPUT(13, ymm2);
+ LOAD_INPUT(14, ymm1);
+ vpxor (%rax), %ymm0, %ymm0;
+
+2:
+ call FUNC_NAME(dec_blk32);
+
+ STORE_OUTPUT(ymm7, 0);
+ STORE_OUTPUT(ymm6, 1);
+ STORE_OUTPUT(ymm5, 2);
+ STORE_OUTPUT(ymm4, 3);
+ STORE_OUTPUT(ymm3, 4);
+ STORE_OUTPUT(ymm2, 5);
+ STORE_OUTPUT(ymm1, 6);
+ STORE_OUTPUT(ymm0, 7);
+ STORE_OUTPUT(ymm15, 8);
+ STORE_OUTPUT(ymm14, 9);
+ STORE_OUTPUT(ymm13, 10);
+ STORE_OUTPUT(ymm12, 11);
+ STORE_OUTPUT(ymm11, 12);
+ STORE_OUTPUT(ymm10, 13);
+ STORE_OUTPUT(ymm9, 14);
+ STORE_OUTPUT(ymm8, 15);
+
+2:
+ vzeroall;
+
+ leave;
+ CFI_LEAVE();
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size FUNC_NAME(dec_blk1_32),.-FUNC_NAME(dec_blk1_32);)
+
#endif /* GCRY_CAMELLIA_AESNI_AVX2_AMD64_H */
diff --git a/cipher/camellia-gfni-avx2-amd64.S b/cipher/camellia-gfni-avx2-amd64.S
new file mode 100644
index 00000000..20c9a432
--- /dev/null
+++ b/cipher/camellia-gfni-avx2-amd64.S
@@ -0,0 +1,34 @@
+/* camellia-gfni-avx2-amd64.S - GFNI/AVX2 implementation of Camellia cipher
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+
+#define CAMELLIA_GFNI_BUILD 1
+#define FUNC_NAME(func) _gcry_camellia_gfni_avx2_ ## func
+
+#include "camellia-aesni-avx2-amd64.h"
+
+#endif /* defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) */
+#endif /* __x86_64 */
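
(This new file reuses camellia-aesni-avx2-amd64.h wholesale; only CAMELLIA_GFNI_BUILD and the FUNC_NAME prefix differ, so every routine in the shared header is emitted under the _gcry_camellia_gfni_avx2_ prefix. A tiny C program showing the same token pasting the preprocessor performs; illustrative only.)

#include <stdio.h>

#define STR_(x) #x
#define STR(x) STR_(x)

#define FUNC_NAME(func) _gcry_camellia_gfni_avx2_ ## func

int
main (void)
{
  /* Prints "_gcry_camellia_gfni_avx2_enc_blk32": the symbol the assembler
   * sees when the shared header is preprocessed for the GFNI build. */
  puts (STR (FUNC_NAME (enc_blk32)));
  return 0;
}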
diff --git a/cipher/camellia-gfni-avx512-amd64.S b/cipher/camellia-gfni-avx512-amd64.S
new file mode 100644
index 00000000..70e10460
--- /dev/null
+++ b/cipher/camellia-gfni-avx512-amd64.S
@@ -0,0 +1,1566 @@
+/* camellia-gfni-avx512-amd64.S - GFNI/AVX512 implementation of Camellia
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+#define CAMELLIA_TABLE_BYTE_LEN 272
+
+/* struct CAMELLIA_context: */
+#define key_table 0
+#define key_bitlength CAMELLIA_TABLE_BYTE_LEN
+
+/* register macros */
+#define CTX %rdi
+#define RIO %r8
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+#define zmm0_x xmm0
+#define zmm1_x xmm1
+#define zmm2_x xmm2
+#define zmm3_x xmm3
+#define zmm4_x xmm4
+#define zmm5_x xmm5
+#define zmm6_x xmm6
+#define zmm7_x xmm7
+#define zmm8_x xmm8
+#define zmm9_x xmm9
+#define zmm10_x xmm10
+#define zmm11_x xmm11
+#define zmm12_x xmm12
+#define zmm13_x xmm13
+#define zmm14_x xmm14
+#define zmm15_x xmm15
+
+#define zmm0_y ymm0
+#define zmm1_y ymm1
+#define zmm2_y ymm2
+#define zmm3_y ymm3
+#define zmm4_y ymm4
+#define zmm5_y ymm5
+#define zmm6_y ymm6
+#define zmm7_y ymm7
+#define zmm8_y ymm8
+#define zmm9_y ymm9
+#define zmm10_y ymm10
+#define zmm11_y ymm11
+#define zmm12_y ymm12
+#define zmm13_y ymm13
+#define zmm14_y ymm14
+#define zmm15_y ymm15
+
+#define mem_ab_0 %zmm16
+#define mem_ab_1 %zmm17
+#define mem_ab_2 %zmm31
+#define mem_ab_3 %zmm18
+#define mem_ab_4 %zmm19
+#define mem_ab_5 %zmm20
+#define mem_ab_6 %zmm21
+#define mem_ab_7 %zmm22
+#define mem_cd_0 %zmm23
+#define mem_cd_1 %zmm24
+#define mem_cd_2 %zmm30
+#define mem_cd_3 %zmm25
+#define mem_cd_4 %zmm26
+#define mem_cd_5 %zmm27
+#define mem_cd_6 %zmm28
+#define mem_cd_7 %zmm29
+
+#define clear_vec4(v0,v1,v2,v3) \
+ vpxord v0, v0, v0; \
+ vpxord v1, v1, v1; \
+ vpxord v2, v2, v2; \
+ vpxord v3, v3, v3
+
+#define clear_zmm16_zmm31() \
+ clear_vec4(%xmm16, %xmm20, %xmm24, %xmm28); \
+ clear_vec4(%xmm17, %xmm21, %xmm25, %xmm29); \
+ clear_vec4(%xmm18, %xmm22, %xmm26, %xmm30); \
+ clear_vec4(%xmm19, %xmm23, %xmm27, %xmm31)
+
+#define clear_regs() \
+ kxorq %k1, %k1, %k1; \
+ vzeroall; \
+ clear_zmm16_zmm31()
+
+/**********************************************************************
+ GFNI helper macros and constants
+ **********************************************************************/
+
+#define BV8(a0,a1,a2,a3,a4,a5,a6,a7) \
+ ( (((a0) & 1) << 0) | \
+ (((a1) & 1) << 1) | \
+ (((a2) & 1) << 2) | \
+ (((a3) & 1) << 3) | \
+ (((a4) & 1) << 4) | \
+ (((a5) & 1) << 5) | \
+ (((a6) & 1) << 6) | \
+ (((a7) & 1) << 7) )
+
+#define BM8X8(l0,l1,l2,l3,l4,l5,l6,l7) \
+ ( ((l7) << (0 * 8)) | \
+ ((l6) << (1 * 8)) | \
+ ((l5) << (2 * 8)) | \
+ ((l4) << (3 * 8)) | \
+ ((l3) << (4 * 8)) | \
+ ((l2) << (5 * 8)) | \
+ ((l1) << (6 * 8)) | \
+ ((l0) << (7 * 8)) )
+
+/* Pre-filters and post-filters constants for Camellia sboxes s1, s2, s3 and s4.
+ * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
+
+/* Constant from "θ₁(x)" and "θ₄(x)" functions. */
+#define pre_filter_constant_s1234 BV8(1, 0, 1, 0, 0, 0, 1, 0)
+
+/* Constant from "ψ₁(A(x))" function: */
+#define post_filter_constant_s14 BV8(0, 1, 1, 1, 0, 1, 1, 0)
+
+/* Constant from "ψ₂(A(x))" function: */
+#define post_filter_constant_s2 BV8(0, 0, 1, 1, 1, 0, 1, 1)
+
+/* Constant from "ψ₃(A(x))" function: */
+#define post_filter_constant_s3 BV8(1, 1, 1, 0, 1, 1, 0, 0)
+
+/**********************************************************************
+ 64-way parallel camellia
+ **********************************************************************/
+
+/* roundsm64 (GFNI/AVX512 version)
+ * IN:
+ * x0..x7: byte-sliced AB state
+ * mem_cd: register pointer storing CD state
+ * key: index for key material
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, \
+ t6, t7, mem_cd, key) \
+ /* \
+ * S-function with AES subbytes \
+ */ \
+ vpbroadcastq .Lpre_filter_bitmatrix_s123 rRIP, t5; \
+ vpbroadcastq .Lpre_filter_bitmatrix_s4 rRIP, t2; \
+ vpbroadcastq .Lpost_filter_bitmatrix_s14 rRIP, t4; \
+ vpbroadcastq .Lpost_filter_bitmatrix_s2 rRIP, t3; \
+ vpbroadcastq .Lpost_filter_bitmatrix_s3 rRIP, t6; \
+ vpxor t7##_x, t7##_x, t7##_x; \
+ vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
+ \
+ /* prefilter sboxes */ \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x0, x0; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x7, x7; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t2, x3, x3; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t2, x6, x6; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x2, x2; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x5, x5; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x1, x1; \
+ vgf2p8affineqb $(pre_filter_constant_s1234), t5, x4, x4; \
+ \
+ /* sbox GF8 inverse + postfilter sboxes 1 and 4 */ \
+ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x0, x0; \
+ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x7, x7; \
+ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x3, x3; \
+ vgf2p8affineinvqb $(post_filter_constant_s14), t4, x6, x6; \
+ \
+ /* sbox GF8 inverse + postfilter sbox 3 */ \
+ vgf2p8affineinvqb $(post_filter_constant_s3), t6, x2, x2; \
+ vgf2p8affineinvqb $(post_filter_constant_s3), t6, x5, x5; \
+ \
+ /* sbox GF8 inverse + postfilter sbox 2 */ \
+ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x1, x1; \
+ vgf2p8affineinvqb $(post_filter_constant_s2), t3, x4, x4; \
+ \
+ vpsrldq $1, t0, t1; \
+ vpsrldq $2, t0, t2; \
+ vpshufb t7, t1, t1; \
+ vpsrldq $3, t0, t3; \
+ \
+ /* P-function */ \
+ vpxorq x5, x0, x0; \
+ vpxorq x6, x1, x1; \
+ vpxorq x7, x2, x2; \
+ vpxorq x4, x3, x3; \
+ \
+ vpshufb t7, t2, t2; \
+ vpsrldq $4, t0, t4; \
+ vpshufb t7, t3, t3; \
+ vpsrldq $5, t0, t5; \
+ vpshufb t7, t4, t4; \
+ \
+ vpxorq x2, x4, x4; \
+ vpxorq x3, x5, x5; \
+ vpxorq x0, x6, x6; \
+ vpxorq x1, x7, x7; \
+ \
+ vpsrldq $6, t0, t6; \
+ vpshufb t7, t5, t5; \
+ vpshufb t7, t6, t6; \
+ \
+ vpxorq x7, x0, x0; \
+ vpxorq x4, x1, x1; \
+ vpxorq x5, x2, x2; \
+ vpxorq x6, x3, x3; \
+ \
+ vpxorq x3, x4, x4; \
+ vpxorq x0, x5, x5; \
+ vpxorq x1, x6, x6; \
+ vpxorq x2, x7, x7; /* note: high and low parts swapped */ \
+ \
+ /* Add key material and result to CD (x becomes new CD) */ \
+ \
+ vpternlogq $0x96, mem_cd##_5, t6, x1; \
+ \
+ vpsrldq $7, t0, t6; \
+ vpshufb t7, t0, t0; \
+ vpshufb t7, t6, t7; \
+ \
+ vpternlogq $0x96, mem_cd##_4, t7, x0; \
+ vpternlogq $0x96, mem_cd##_6, t5, x2; \
+ vpternlogq $0x96, mem_cd##_7, t4, x3; \
+ vpternlogq $0x96, mem_cd##_0, t3, x4; \
+ vpternlogq $0x96, mem_cd##_1, t2, x5; \
+ vpternlogq $0x96, mem_cd##_2, t1, x6; \
+ vpternlogq $0x96, mem_cd##_3, t0, x7;
+
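(The AVX512 variant folds the AVX2 code's separate "xor key material" and "xor into CD" steps into single vpternlogq instructions: immediate 0x96 selects the three-way XOR truth table, and the 0x1e used later in fls64 selects a ^ (b | c), where a is the destination operand. A quick C check of those lookup tables; illustrative only.)

#include <stdio.h>

/* Evaluate a vpternlogq truth table for one bit triple: the immediate's bit
 * at index (a<<2 | b<<1 | c) is the result, with 'a' taken from the
 * destination operand. */
static unsigned int
ternlog (unsigned int imm8, unsigned int a, unsigned int b, unsigned int c)
{
  return (imm8 >> ((a << 2) | (b << 1) | c)) & 1;
}

int
main (void)
{
  unsigned int i;

  for (i = 0; i < 8; i++)
    {
      unsigned int a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1;

      /* 0x96 reproduces a ^ b ^ c; 0x1e reproduces a ^ (b | c). */
      printf ("a=%u b=%u c=%u  0x96->%u (xor3=%u)  0x1e->%u (a^(b|c)=%u)\n",
              a, b, c,
              ternlog (0x96, a, b, c), a ^ b ^ c,
              ternlog (0x1e, a, b, c), a ^ (b | c));
    }
  return 0;
}
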
+/*
+ * IN/OUT:
+ * x0..x7: byte-sliced AB state preloaded
+ * mem_ab: byte-sliced AB state in memory
+ * mem_cd: byte-sliced CD state in memory
+ */
+#define two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
+ roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \
+ \
+ vmovdqu64 x0, mem_cd##_4; \
+ vmovdqu64 x1, mem_cd##_5; \
+ vmovdqu64 x2, mem_cd##_6; \
+ vmovdqu64 x3, mem_cd##_7; \
+ vmovdqu64 x4, mem_cd##_0; \
+ vmovdqu64 x5, mem_cd##_1; \
+ vmovdqu64 x6, mem_cd##_2; \
+ vmovdqu64 x7, mem_cd##_3; \
+ \
+ roundsm64(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \
+ \
+ store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
+
+#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
+
+#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
+ /* Store new AB state */ \
+ vmovdqu64 x4, mem_ab##_4; \
+ vmovdqu64 x5, mem_ab##_5; \
+ vmovdqu64 x6, mem_ab##_6; \
+ vmovdqu64 x7, mem_ab##_7; \
+ vmovdqu64 x0, mem_ab##_0; \
+ vmovdqu64 x1, mem_ab##_1; \
+ vmovdqu64 x2, mem_ab##_2; \
+ vmovdqu64 x3, mem_ab##_3;
+
+#define enc_rounds64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
+ two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
+ two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
+
+#define dec_rounds64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, i) \
+ two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
+ two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
+ two_roundsm64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
+
+/*
+ * IN:
+ * v0..3: byte-sliced 32-bit integers
+ * OUT:
+ * v0..3: (IN << 1)
+ * t0, t1, t2, zero: (IN >> 7)
+ */
+#define rol32_1_64(v0, v1, v2, v3, t0, t1, t2, zero, one) \
+ vpcmpltb zero, v0, %k1; \
+ vpaddb v0, v0, v0; \
+ vpaddb one, zero, t0{%k1}{z}; \
+ \
+ vpcmpltb zero, v1, %k1; \
+ vpaddb v1, v1, v1; \
+ vpaddb one, zero, t1{%k1}{z}; \
+ \
+ vpcmpltb zero, v2, %k1; \
+ vpaddb v2, v2, v2; \
+ vpaddb one, zero, t2{%k1}{z}; \
+ \
+ vpcmpltb zero, v3, %k1; \
+ vpaddb v3, v3, v3; \
+ vpaddb one, zero, zero{%k1}{z};
+
+/*
+ * IN:
+ * r: byte-sliced AB state in memory
+ * l: byte-sliced CD state in memory
+ * OUT:
+ * x0..x7: new byte-sliced CD state
+ */
+#define fls64(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
+ tt1, tt2, tt3, kll, klr, krl, krr, tmp) \
+ /* \
+ * t0 = kll; \
+ * t0 &= ll; \
+ * lr ^= rol32(t0, 1); \
+ */ \
+ vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
+ vpbroadcastq .Lbyte_ones rRIP, tmp; \
+ vpxor tt3##_x, tt3##_x, tt3##_x; \
+ vpshufb tt3, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt3, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt3, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt3, t0, t0; \
+ \
+ vpandq l0, t0, t0; \
+ vpandq l1, t1, t1; \
+ vpandq l2, t2, t2; \
+ vpandq l3, t3, t3; \
+ \
+ rol32_1_64(t3, t2, t1, t0, tt0, tt1, tt2, tt3, tmp); \
+ \
+ vpternlogq $0x96, tt2, t0, l4; \
+ vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
+ vmovdqu64 l4, l##_4; \
+ vpternlogq $0x96, tt1, t1, l5; \
+ vmovdqu64 l5, l##_5; \
+ vpternlogq $0x96, tt0, t2, l6; \
+ vmovdqu64 l6, l##_6; \
+ vpternlogq $0x96, tt3, t3, l7; \
+ vmovdqu64 l7, l##_7; \
+ vpxor tt3##_x, tt3##_x, tt3##_x; \
+ \
+ /* \
+ * t2 = krr; \
+ * t2 |= rr; \
+ * rl ^= t2; \
+ */ \
+ \
+ vpshufb tt3, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt3, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt3, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt3, t0, t0; \
+ \
+ vpternlogq $0x1e, r##_4, t0, r##_0; \
+ vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
+ vpternlogq $0x1e, r##_5, t1, r##_1; \
+ vpternlogq $0x1e, r##_6, t2, r##_2; \
+ vpternlogq $0x1e, r##_7, t3, r##_3; \
+ \
+ /* \
+ * t2 = krl; \
+ * t2 &= rl; \
+ * rr ^= rol32(t2, 1); \
+ */ \
+ vpshufb tt3, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt3, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt3, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt3, t0, t0; \
+ \
+ vpandq r##_0, t0, t0; \
+ vpandq r##_1, t1, t1; \
+ vpandq r##_2, t2, t2; \
+ vpandq r##_3, t3, t3; \
+ \
+ rol32_1_64(t3, t2, t1, t0, tt0, tt1, tt2, tt3, tmp); \
+ \
+ vpternlogq $0x96, tt2, t0, r##_4; \
+ vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
+ vpternlogq $0x96, tt1, t1, r##_5; \
+ vpternlogq $0x96, tt0, t2, r##_6; \
+ vpternlogq $0x96, tt3, t3, r##_7; \
+ vpxor tt3##_x, tt3##_x, tt3##_x; \
+ \
+ /* \
+ * t0 = klr; \
+ * t0 |= lr; \
+ * ll ^= t0; \
+ */ \
+ \
+ vpshufb tt3, t0, t3; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt3, t0, t2; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt3, t0, t1; \
+ vpsrldq $1, t0, t0; \
+ vpshufb tt3, t0, t0; \
+ \
+ vpternlogq $0x1e, l4, t0, l0; \
+ vmovdqu64 l0, l##_0; \
+ vpternlogq $0x1e, l5, t1, l1; \
+ vmovdqu64 l1, l##_1; \
+ vpternlogq $0x1e, l6, t2, l2; \
+ vmovdqu64 l2, l##_2; \
+ vpternlogq $0x1e, l7, t3, l3; \
+ vmovdqu64 l3, l##_3;
+
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
+ a3, b3, c3, d3, st0, st1) \
+ transpose_4x4(a0, a1, a2, a3, st0, st1); \
+ transpose_4x4(b0, b1, b2, b3, st0, st1); \
+ \
+ transpose_4x4(c0, c1, c2, c3, st0, st1); \
+ transpose_4x4(d0, d1, d2, d3, st0, st1); \
+ \
+ vbroadcasti64x2 .Lshufb_16x16b rRIP, st0; \
+ vpshufb st0, a0, a0; \
+ vpshufb st0, a1, a1; \
+ vpshufb st0, a2, a2; \
+ vpshufb st0, a3, a3; \
+ vpshufb st0, b0, b0; \
+ vpshufb st0, b1, b1; \
+ vpshufb st0, b2, b2; \
+ vpshufb st0, b3, b3; \
+ vpshufb st0, c0, c0; \
+ vpshufb st0, c1, c1; \
+ vpshufb st0, c2, c2; \
+ vpshufb st0, c3, c3; \
+ vpshufb st0, d0, d0; \
+ vpshufb st0, d1, d1; \
+ vpshufb st0, d2, d2; \
+ vpshufb st0, d3, d3; \
+ \
+ transpose_4x4(a0, b0, c0, d0, st0, st1); \
+ transpose_4x4(a1, b1, c1, d1, st0, st1); \
+ \
+ transpose_4x4(a2, b2, c2, d2, st0, st1); \
+ transpose_4x4(a3, b3, c3, d3, st0, st1); \
+ /* does not adjust output bytes inside vectors */
+
+/* load blocks to registers and apply pre-whitening */
+#define inpack64_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio, key) \
+ vpbroadcastq key, x0; \
+ vpshufb .Lpack_bswap rRIP, x0, x0; \
+ \
+ vpxorq 0 * 64(rio), x0, y7; \
+ vpxorq 1 * 64(rio), x0, y6; \
+ vpxorq 2 * 64(rio), x0, y5; \
+ vpxorq 3 * 64(rio), x0, y4; \
+ vpxorq 4 * 64(rio), x0, y3; \
+ vpxorq 5 * 64(rio), x0, y2; \
+ vpxorq 6 * 64(rio), x0, y1; \
+ vpxorq 7 * 64(rio), x0, y0; \
+ vpxorq 8 * 64(rio), x0, x7; \
+ vpxorq 9 * 64(rio), x0, x6; \
+ vpxorq 10 * 64(rio), x0, x5; \
+ vpxorq 11 * 64(rio), x0, x4; \
+ vpxorq 12 * 64(rio), x0, x3; \
+ vpxorq 13 * 64(rio), x0, x2; \
+ vpxorq 14 * 64(rio), x0, x1; \
+ vpxorq 15 * 64(rio), x0, x0;
+
+/* byteslice pre-whitened blocks and store to temporary memory */
+#define inpack64_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, mem_ab, mem_cd, tmp0, tmp1) \
+ byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
+ y4, y5, y6, y7, tmp0, tmp1); \
+ \
+ vmovdqu64 x0, mem_ab##_0; \
+ vmovdqu64 x1, mem_ab##_1; \
+ vmovdqu64 x2, mem_ab##_2; \
+ vmovdqu64 x3, mem_ab##_3; \
+ vmovdqu64 x4, mem_ab##_4; \
+ vmovdqu64 x5, mem_ab##_5; \
+ vmovdqu64 x6, mem_ab##_6; \
+ vmovdqu64 x7, mem_ab##_7; \
+ vmovdqu64 y0, mem_cd##_0; \
+ vmovdqu64 y1, mem_cd##_1; \
+ vmovdqu64 y2, mem_cd##_2; \
+ vmovdqu64 y3, mem_cd##_3; \
+ vmovdqu64 y4, mem_cd##_4; \
+ vmovdqu64 y5, mem_cd##_5; \
+ vmovdqu64 y6, mem_cd##_6; \
+ vmovdqu64 y7, mem_cd##_7;
+
+/* de-byteslice, apply post-whitening and store blocks */
+#define outunpack64(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
+ y5, y6, y7, key, tmp0, tmp1) \
+ byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
+ y3, y7, x3, x7, tmp0, tmp1); \
+ \
+ vpbroadcastq key, tmp0; \
+ vpshufb .Lpack_bswap rRIP, tmp0, tmp0; \
+ \
+ vpxorq tmp0, y7, y7; \
+ vpxorq tmp0, y6, y6; \
+ vpxorq tmp0, y5, y5; \
+ vpxorq tmp0, y4, y4; \
+ vpxorq tmp0, y3, y3; \
+ vpxorq tmp0, y2, y2; \
+ vpxorq tmp0, y1, y1; \
+ vpxorq tmp0, y0, y0; \
+ vpxorq tmp0, x7, x7; \
+ vpxorq tmp0, x6, x6; \
+ vpxorq tmp0, x5, x5; \
+ vpxorq tmp0, x4, x4; \
+ vpxorq tmp0, x3, x3; \
+ vpxorq tmp0, x2, x2; \
+ vpxorq tmp0, x1, x1; \
+ vpxorq tmp0, x0, x0;
+
+#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
+ y6, y7, rio) \
+ vmovdqu64 x0, 0 * 64(rio); \
+ vmovdqu64 x1, 1 * 64(rio); \
+ vmovdqu64 x2, 2 * 64(rio); \
+ vmovdqu64 x3, 3 * 64(rio); \
+ vmovdqu64 x4, 4 * 64(rio); \
+ vmovdqu64 x5, 5 * 64(rio); \
+ vmovdqu64 x6, 6 * 64(rio); \
+ vmovdqu64 x7, 7 * 64(rio); \
+ vmovdqu64 y0, 8 * 64(rio); \
+ vmovdqu64 y1, 9 * 64(rio); \
+ vmovdqu64 y2, 10 * 64(rio); \
+ vmovdqu64 y3, 11 * 64(rio); \
+ vmovdqu64 y4, 12 * 64(rio); \
+ vmovdqu64 y5, 13 * 64(rio); \
+ vmovdqu64 y6, 14 * 64(rio); \
+ vmovdqu64 y7, 15 * 64(rio);
+
+.text
+
+#define SHUFB_BYTES(idx) \
+ 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
+
+_gcry_camellia_gfni_avx512__constants:
+ELF(.type _gcry_camellia_gfni_avx512__constants,@object;)
+
+.align 64
+.Lpack_bswap:
+ .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+ .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+ .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+ .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
+
+.Lcounter0123_lo:
+ .quad 0, 0
+ .quad 1, 0
+ .quad 2, 0
+ .quad 3, 0
+
+.align 16
+.Lcounter4444_lo:
+ .quad 4, 0
+.Lcounter8888_lo:
+ .quad 8, 0
+.Lcounter16161616_lo:
+ .quad 16, 0
+.Lcounter1111_hi:
+ .quad 0, 1
+
+.Lshufb_16x16b:
+ .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+.Lbyte_ones:
+ .byte 1, 1, 1, 1, 1, 1, 1, 1
+
+/* Pre-filter and post-filter bit-matrices for Camellia sboxes s1, s2, s3
+ * and s4.
+ * See http://urn.fi/URN:NBN:fi:oulu-201305311409, pages 43-48.
+ *
+ * Pre-filters are directly from above source, "θ₁"/"θ₄". Post-filters are
+ * a combination of function "A" (AES SubBytes affine transformation) and
+ * "ψ₁"/"ψ₂"/"ψ₃".
+ */
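+
+/* Illustrative only: the bit-matrices below are consumed by the GFNI affine
+ * instructions; a plain-C model of an 8x8 bit-matrix acting on a byte over
+ * GF(2) (result bit i is the parity of row_i AND x) is sketched here.  The
+ * exact bit/byte ordering expected by vgf2p8affineqb is handled by the
+ * BM8X8()/BV8() packing macros and is not reproduced in this sketch.
+ *
+ *   static unsigned char gf2_affine8_model (const unsigned char rows[8],
+ *                                           unsigned char x)
+ *   {
+ *     unsigned char out = 0, t;
+ *     int i;
+ *
+ *     for (i = 0; i < 8; i++)
+ *       {
+ *         t = rows[i] & x;
+ *         t ^= t >> 4; t ^= t >> 2; t ^= t >> 1;   // parity of (row_i AND x)
+ *         out |= (unsigned char)((t & 1) << i);
+ *       }
+ *     return out;
+ *   }
+ */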
+
+/* Bit-matrix from "θ₁(x)" function: */
+.Lpre_filter_bitmatrix_s123:
+ .quad BM8X8(BV8(1, 1, 1, 0, 1, 1, 0, 1),
+ BV8(0, 0, 1, 1, 0, 0, 1, 0),
+ BV8(1, 1, 0, 1, 0, 0, 0, 0),
+ BV8(1, 0, 1, 1, 0, 0, 1, 1),
+ BV8(0, 0, 0, 0, 1, 1, 0, 0),
+ BV8(1, 0, 1, 0, 0, 1, 0, 0),
+ BV8(0, 0, 1, 0, 1, 1, 0, 0),
+ BV8(1, 0, 0, 0, 0, 1, 1, 0))
+
+/* Bit-matrix from "θ₄(x)" function: */
+.Lpre_filter_bitmatrix_s4:
+ .quad BM8X8(BV8(1, 1, 0, 1, 1, 0, 1, 1),
+ BV8(0, 1, 1, 0, 0, 1, 0, 0),
+ BV8(1, 0, 1, 0, 0, 0, 0, 1),
+ BV8(0, 1, 1, 0, 0, 1, 1, 1),
+ BV8(0, 0, 0, 1, 1, 0, 0, 0),
+ BV8(0, 1, 0, 0, 1, 0, 0, 1),
+ BV8(0, 1, 0, 1, 1, 0, 0, 0),
+ BV8(0, 0, 0, 0, 1, 1, 0, 1))
+
+/* Bit-matrix from "ψ₁(A(x))" function: */
+.Lpost_filter_bitmatrix_s14:
+ .quad BM8X8(BV8(0, 0, 0, 0, 0, 0, 0, 1),
+ BV8(0, 1, 1, 0, 0, 1, 1, 0),
+ BV8(1, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 0, 1, 1),
+ BV8(1, 0, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 0, 1, 1, 1, 1, 0),
+ BV8(0, 1, 1, 1, 1, 1, 1, 1),
+ BV8(0, 0, 0, 1, 1, 1, 0, 0))
+
+/* Bit-matrix from "ψ₂(A(x))" function: */
+.Lpost_filter_bitmatrix_s2:
+ .quad BM8X8(BV8(0, 0, 0, 1, 1, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1),
+ BV8(0, 1, 1, 0, 0, 1, 1, 0),
+ BV8(1, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 0, 1, 1),
+ BV8(1, 0, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 0, 1, 1, 1, 1, 0),
+ BV8(0, 1, 1, 1, 1, 1, 1, 1))
+
+/* Bit-matrix from "ψ₃(A(x))" function: */
+.Lpost_filter_bitmatrix_s3:
+ .quad BM8X8(BV8(0, 1, 1, 0, 0, 1, 1, 0),
+ BV8(1, 0, 1, 1, 1, 1, 1, 0),
+ BV8(0, 0, 0, 1, 1, 0, 1, 1),
+ BV8(1, 0, 0, 0, 1, 1, 1, 0),
+ BV8(0, 1, 0, 1, 1, 1, 1, 0),
+ BV8(0, 1, 1, 1, 1, 1, 1, 1),
+ BV8(0, 0, 0, 1, 1, 1, 0, 0),
+ BV8(0, 0, 0, 0, 0, 0, 0, 1))
+
+ELF(.size _gcry_camellia_gfni_avx512__constants,.-_gcry_camellia_gfni_avx512__constants;)
+
+.align 8
+ELF(.type __camellia_gfni_avx512_enc_blk64,@function;)
+
+__camellia_gfni_avx512_enc_blk64:
+ /* input:
+ * %rdi: ctx, CTX
+ * %r8d: 24 for 16 byte key, 32 for larger
+ * %zmm0..%zmm15: 64 plaintext blocks
+ * output:
+ * %zmm0..%zmm15: 64 encrypted blocks, order swapped:
+	 *	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ CFI_STARTPROC();
+
+ leaq (-8 * 8)(CTX, %r8, 8), %r8;
+
+ inpack64_post(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15, mem_ab, mem_cd, %zmm30, %zmm31);
+
+.align 8
+.Lenc_loop:
+ enc_rounds64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15, mem_ab, mem_cd, 0);
+
+ cmpq %r8, CTX;
+ je .Lenc_done;
+ leaq (8 * 8)(CTX), CTX;
+
+ fls64(mem_ab, %zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ mem_cd, %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15,
+ ((key_table) + 0)(CTX),
+ ((key_table) + 4)(CTX),
+ ((key_table) + 8)(CTX),
+ ((key_table) + 12)(CTX),
+ %zmm31);
+ jmp .Lenc_loop;
+
+.align 8
+.Lenc_done:
+ /* load CD for output */
+ vmovdqu64 mem_cd_0, %zmm8;
+ vmovdqu64 mem_cd_1, %zmm9;
+ vmovdqu64 mem_cd_2, %zmm10;
+ vmovdqu64 mem_cd_3, %zmm11;
+ vmovdqu64 mem_cd_4, %zmm12;
+ vmovdqu64 mem_cd_5, %zmm13;
+ vmovdqu64 mem_cd_6, %zmm14;
+ vmovdqu64 mem_cd_7, %zmm15;
+
+ outunpack64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15, ((key_table) + 8 * 8)(%r8), %zmm30, %zmm31);
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size __camellia_gfni_avx512_enc_blk64,.-__camellia_gfni_avx512_enc_blk64;)
+
+.align 8
+ELF(.type __camellia_gfni_avx512_dec_blk64,@function;)
+
+__camellia_gfni_avx512_dec_blk64:
+ /* input:
+ * %rdi: ctx, CTX
+ * %r8d: 24 for 16 byte key, 32 for larger
+ * %zmm0..%zmm15: 64 encrypted blocks
+ * output:
+ * %zmm0..%zmm15: 64 plaintext blocks, order swapped:
+	 *	7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
+ */
+ CFI_STARTPROC();
+
+ movq %r8, %rcx;
+	movq CTX, %r8;
+ leaq (-8 * 8)(CTX, %rcx, 8), CTX;
+
+ inpack64_post(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15, mem_ab, mem_cd, %zmm30, %zmm31);
+
+.align 8
+.Ldec_loop:
+ dec_rounds64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15, mem_ab, mem_cd, 0);
+
+ cmpq %r8, CTX;
+ je .Ldec_done;
+
+ fls64(mem_ab, %zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ mem_cd, %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15,
+ ((key_table) + 8)(CTX),
+ ((key_table) + 12)(CTX),
+ ((key_table) + 0)(CTX),
+ ((key_table) + 4)(CTX),
+ %zmm31);
+
+ leaq (-8 * 8)(CTX), CTX;
+ jmp .Ldec_loop;
+
+.align 8
+.Ldec_done:
+ /* load CD for output */
+ vmovdqu64 mem_cd_0, %zmm8;
+ vmovdqu64 mem_cd_1, %zmm9;
+ vmovdqu64 mem_cd_2, %zmm10;
+ vmovdqu64 mem_cd_3, %zmm11;
+ vmovdqu64 mem_cd_4, %zmm12;
+ vmovdqu64 mem_cd_5, %zmm13;
+ vmovdqu64 mem_cd_6, %zmm14;
+ vmovdqu64 mem_cd_7, %zmm15;
+
+ outunpack64(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15, (key_table)(CTX), %zmm30, %zmm31);
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size __camellia_gfni_avx512_dec_blk64,.-__camellia_gfni_avx512_dec_blk64;)
+
+#define add_le128(out, in, lo_counter, hi_counter1) \
+ vpaddq lo_counter, in, out; \
+ vpcmpuq $1, lo_counter, out, %k1; \
+ kaddb %k1, %k1, %k1; \
+ vpaddq hi_counter1, out, out{%k1};
+
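+/* Illustrative only: a C model of what add_le128 computes for one 128-bit
+ * lane.  lo_counter carries the 64-bit increment in the low qword of each
+ * lane and hi_counter1 carries 1 in the high qword; the vpcmpuq/kaddb pair
+ * detects the carry out of the low half and steers it into the high half.
+ * The names below are placeholders for this sketch only.
+ *
+ *   #include <stdint.h>
+ *
+ *   typedef struct { uint64_t lo, hi; } lane128;
+ *
+ *   static lane128 add_le128_model (lane128 in, uint64_t inc)
+ *   {
+ *     lane128 out;
+ *
+ *     out.lo = in.lo + inc;
+ *     out.hi = in.hi + (out.lo < inc);   // carry from the low qword
+ *     return out;
+ *   }
+ */
+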
+.align 8
+.globl _gcry_camellia_gfni_avx512_ctr_enc
+ELF(.type _gcry_camellia_gfni_avx512_ctr_enc,@function;)
+
+_gcry_camellia_gfni_avx512_ctr_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (64 blocks)
+ * %rdx: src (64 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ CFI_STARTPROC();
+ vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+
+ vbroadcasti64x2 .Lbswap128_mask rRIP, %zmm19;
+ vmovdqa64 .Lcounter0123_lo rRIP, %zmm21;
+ vbroadcasti64x2 .Lcounter4444_lo rRIP, %zmm22;
+ vbroadcasti64x2 .Lcounter8888_lo rRIP, %zmm23;
+ vbroadcasti64x2 .Lcounter16161616_lo rRIP, %zmm24;
+ vbroadcasti64x2 .Lcounter1111_hi rRIP, %zmm25;
+
+ /* load IV and byteswap */
+ movq 8(%rcx), %r11;
+ movq (%rcx), %r10;
+ bswapq %r11;
+ bswapq %r10;
+ vbroadcasti64x2 (%rcx), %zmm0;
+ vpshufb %zmm19, %zmm0, %zmm0;
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 64), %r11;
+ ja .Lload_ctr_carry;
+
+ /* construct IVs */
+ vpaddq %zmm21, %zmm0, %zmm15; /* +0:+1:+2:+3 */
+ vpaddq %zmm22, %zmm15, %zmm14; /* +4:+5:+6:+7 */
+ vpaddq %zmm23, %zmm15, %zmm13; /* +8:+9:+10:+11 */
+ vpaddq %zmm23, %zmm14, %zmm12; /* +12:+13:+14:+15 */
+ vpaddq %zmm24, %zmm15, %zmm11; /* +16... */
+ vpaddq %zmm24, %zmm14, %zmm10; /* +20... */
+ vpaddq %zmm24, %zmm13, %zmm9; /* +24... */
+ vpaddq %zmm24, %zmm12, %zmm8; /* +28... */
+ vpaddq %zmm24, %zmm11, %zmm7; /* +32... */
+ vpaddq %zmm24, %zmm10, %zmm6; /* +36... */
+ vpaddq %zmm24, %zmm9, %zmm5; /* +40... */
+ vpaddq %zmm24, %zmm8, %zmm4; /* +44... */
+ vpaddq %zmm24, %zmm7, %zmm3; /* +48... */
+ vpaddq %zmm24, %zmm6, %zmm2; /* +52... */
+ vpaddq %zmm24, %zmm5, %zmm1; /* +56... */
+ vpaddq %zmm24, %zmm4, %zmm0; /* +60... */
+ jmp .Lload_ctr_done;
+
+.align 4
+.Lload_ctr_carry:
+ /* construct IVs */
+ add_le128(%zmm15, %zmm0, %zmm21, %zmm25); /* +0:+1:+2:+3 */
+ add_le128(%zmm14, %zmm15, %zmm22, %zmm25); /* +4:+5:+6:+7 */
+ add_le128(%zmm13, %zmm15, %zmm23, %zmm25); /* +8:+9:+10:+11 */
+ add_le128(%zmm12, %zmm14, %zmm23, %zmm25); /* +12:+13:+14:+15 */
+ add_le128(%zmm11, %zmm15, %zmm24, %zmm25); /* +16... */
+ add_le128(%zmm10, %zmm14, %zmm24, %zmm25); /* +20... */
+ add_le128(%zmm9, %zmm13, %zmm24, %zmm25); /* +24... */
+ add_le128(%zmm8, %zmm12, %zmm24, %zmm25); /* +28... */
+ add_le128(%zmm7, %zmm11, %zmm24, %zmm25); /* +32... */
+ add_le128(%zmm6, %zmm10, %zmm24, %zmm25); /* +36... */
+ add_le128(%zmm5, %zmm9, %zmm24, %zmm25); /* +40... */
+ add_le128(%zmm4, %zmm8, %zmm24, %zmm25); /* +44... */
+ add_le128(%zmm3, %zmm7, %zmm24, %zmm25); /* +48... */
+ add_le128(%zmm2, %zmm6, %zmm24, %zmm25); /* +52... */
+ add_le128(%zmm1, %zmm5, %zmm24, %zmm25); /* +56... */
+ add_le128(%zmm0, %zmm4, %zmm24, %zmm25); /* +60... */
+
+.align 4
+.Lload_ctr_done:
+ vpbroadcastq (key_table)(CTX), %zmm16;
+ vpshufb .Lpack_bswap rRIP, %zmm16, %zmm16;
+
+ /* Byte-swap IVs and update counter. */
+ addq $64, %r11;
+ adcq $0, %r10;
+ vpshufb %zmm19, %zmm15, %zmm15;
+ vpshufb %zmm19, %zmm14, %zmm14;
+ vpshufb %zmm19, %zmm13, %zmm13;
+ vpshufb %zmm19, %zmm12, %zmm12;
+ vpshufb %zmm19, %zmm11, %zmm11;
+ vpshufb %zmm19, %zmm10, %zmm10;
+ vpshufb %zmm19, %zmm9, %zmm9;
+ vpshufb %zmm19, %zmm8, %zmm8;
+ bswapq %r11;
+ bswapq %r10;
+ vpshufb %zmm19, %zmm7, %zmm7;
+ vpshufb %zmm19, %zmm6, %zmm6;
+ vpshufb %zmm19, %zmm5, %zmm5;
+ vpshufb %zmm19, %zmm4, %zmm4;
+ vpshufb %zmm19, %zmm3, %zmm3;
+ vpshufb %zmm19, %zmm2, %zmm2;
+ vpshufb %zmm19, %zmm1, %zmm1;
+ vpshufb %zmm19, %zmm0, %zmm0;
+ movq %r11, 8(%rcx);
+ movq %r10, (%rcx);
+
+ /* inpack64_pre: */
+ vpxorq %zmm0, %zmm16, %zmm0;
+ vpxorq %zmm1, %zmm16, %zmm1;
+ vpxorq %zmm2, %zmm16, %zmm2;
+ vpxorq %zmm3, %zmm16, %zmm3;
+ vpxorq %zmm4, %zmm16, %zmm4;
+ vpxorq %zmm5, %zmm16, %zmm5;
+ vpxorq %zmm6, %zmm16, %zmm6;
+ vpxorq %zmm7, %zmm16, %zmm7;
+ vpxorq %zmm8, %zmm16, %zmm8;
+ vpxorq %zmm9, %zmm16, %zmm9;
+ vpxorq %zmm10, %zmm16, %zmm10;
+ vpxorq %zmm11, %zmm16, %zmm11;
+ vpxorq %zmm12, %zmm16, %zmm12;
+ vpxorq %zmm13, %zmm16, %zmm13;
+ vpxorq %zmm14, %zmm16, %zmm14;
+ vpxorq %zmm15, %zmm16, %zmm15;
+
+ call __camellia_gfni_avx512_enc_blk64;
+
+ vpxorq 0 * 64(%rdx), %zmm7, %zmm7;
+ vpxorq 1 * 64(%rdx), %zmm6, %zmm6;
+ vpxorq 2 * 64(%rdx), %zmm5, %zmm5;
+ vpxorq 3 * 64(%rdx), %zmm4, %zmm4;
+ vpxorq 4 * 64(%rdx), %zmm3, %zmm3;
+ vpxorq 5 * 64(%rdx), %zmm2, %zmm2;
+ vpxorq 6 * 64(%rdx), %zmm1, %zmm1;
+ vpxorq 7 * 64(%rdx), %zmm0, %zmm0;
+ vpxorq 8 * 64(%rdx), %zmm15, %zmm15;
+ vpxorq 9 * 64(%rdx), %zmm14, %zmm14;
+ vpxorq 10 * 64(%rdx), %zmm13, %zmm13;
+ vpxorq 11 * 64(%rdx), %zmm12, %zmm12;
+ vpxorq 12 * 64(%rdx), %zmm11, %zmm11;
+ vpxorq 13 * 64(%rdx), %zmm10, %zmm10;
+ vpxorq 14 * 64(%rdx), %zmm9, %zmm9;
+ vpxorq 15 * 64(%rdx), %zmm8, %zmm8;
+
+ write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9,
+ %zmm8, %rsi);
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_ctr_enc,.-_gcry_camellia_gfni_avx512_ctr_enc;)
+
+.align 8
+.globl _gcry_camellia_gfni_avx512_cbc_dec
+ELF(.type _gcry_camellia_gfni_avx512_cbc_dec,@function;)
+
+_gcry_camellia_gfni_avx512_cbc_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (64 blocks)
+ * %rdx: src (64 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+ vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+
+ movq %rcx, %r9;
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ inpack64_pre(%zmm0, %zmm1, %zmm2, %zmm3, %zmm4, %zmm5, %zmm6, %zmm7,
+ %zmm8, %zmm9, %zmm10, %zmm11, %zmm12, %zmm13, %zmm14,
+ %zmm15, %rdx, (key_table)(CTX, %r8, 8));
+
+ call __camellia_gfni_avx512_dec_blk64;
+
+ /* XOR output with IV */
+ vmovdqu64 (%r9), %xmm16;
+ vinserti64x2 $1, (0 * 16)(%rdx), %ymm16, %ymm16;
+ vinserti64x4 $1, (1 * 16)(%rdx), %zmm16, %zmm16;
+ vpxorq %zmm16, %zmm7, %zmm7;
+ vpxorq (0 * 64 + 48)(%rdx), %zmm6, %zmm6;
+ vpxorq (1 * 64 + 48)(%rdx), %zmm5, %zmm5;
+ vpxorq (2 * 64 + 48)(%rdx), %zmm4, %zmm4;
+ vpxorq (3 * 64 + 48)(%rdx), %zmm3, %zmm3;
+ vpxorq (4 * 64 + 48)(%rdx), %zmm2, %zmm2;
+ vpxorq (5 * 64 + 48)(%rdx), %zmm1, %zmm1;
+ vpxorq (6 * 64 + 48)(%rdx), %zmm0, %zmm0;
+ vpxorq (7 * 64 + 48)(%rdx), %zmm15, %zmm15;
+ vpxorq (8 * 64 + 48)(%rdx), %zmm14, %zmm14;
+ vpxorq (9 * 64 + 48)(%rdx), %zmm13, %zmm13;
+ vpxorq (10 * 64 + 48)(%rdx), %zmm12, %zmm12;
+ vpxorq (11 * 64 + 48)(%rdx), %zmm11, %zmm11;
+ vpxorq (12 * 64 + 48)(%rdx), %zmm10, %zmm10;
+ vpxorq (13 * 64 + 48)(%rdx), %zmm9, %zmm9;
+ vpxorq (14 * 64 + 48)(%rdx), %zmm8, %zmm8;
+ vmovdqu64 (15 * 64 + 48)(%rdx), %xmm16;
+
+ write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9,
+ %zmm8, %rsi);
+
+ /* store new IV */
+ vmovdqu64 %xmm16, (0)(%r9);
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_cbc_dec,.-_gcry_camellia_gfni_avx512_cbc_dec;)
+
+.align 8
+.globl _gcry_camellia_gfni_avx512_cfb_dec
+ELF(.type _gcry_camellia_gfni_avx512_cfb_dec,@function;)
+
+_gcry_camellia_gfni_avx512_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+	 * %rsi: dst (64 blocks)
+	 * %rdx: src (64 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+ vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ /* inpack64_pre: */
+ vpbroadcastq (key_table)(CTX), %zmm0;
+ vpshufb .Lpack_bswap rRIP, %zmm0, %zmm0;
+ vmovdqu64 (%rcx), %xmm15;
+ vinserti64x2 $1, (%rdx), %ymm15, %ymm15;
+ vinserti64x4 $1, 16(%rdx), %zmm15, %zmm15;
+ vpxorq %zmm15, %zmm0, %zmm15;
+ vpxorq (0 * 64 + 48)(%rdx), %zmm0, %zmm14;
+ vpxorq (1 * 64 + 48)(%rdx), %zmm0, %zmm13;
+ vpxorq (2 * 64 + 48)(%rdx), %zmm0, %zmm12;
+ vpxorq (3 * 64 + 48)(%rdx), %zmm0, %zmm11;
+ vpxorq (4 * 64 + 48)(%rdx), %zmm0, %zmm10;
+ vpxorq (5 * 64 + 48)(%rdx), %zmm0, %zmm9;
+ vpxorq (6 * 64 + 48)(%rdx), %zmm0, %zmm8;
+ vpxorq (7 * 64 + 48)(%rdx), %zmm0, %zmm7;
+ vpxorq (8 * 64 + 48)(%rdx), %zmm0, %zmm6;
+ vpxorq (9 * 64 + 48)(%rdx), %zmm0, %zmm5;
+ vpxorq (10 * 64 + 48)(%rdx), %zmm0, %zmm4;
+ vpxorq (11 * 64 + 48)(%rdx), %zmm0, %zmm3;
+ vpxorq (12 * 64 + 48)(%rdx), %zmm0, %zmm2;
+ vpxorq (13 * 64 + 48)(%rdx), %zmm0, %zmm1;
+ vpxorq (14 * 64 + 48)(%rdx), %zmm0, %zmm0;
+ vmovdqu64 (15 * 64 + 48)(%rdx), %xmm16;
+ vmovdqu64 %xmm16, (%rcx); /* store new IV */
+
+ call __camellia_gfni_avx512_enc_blk64;
+
+ vpxorq 0 * 64(%rdx), %zmm7, %zmm7;
+ vpxorq 1 * 64(%rdx), %zmm6, %zmm6;
+ vpxorq 2 * 64(%rdx), %zmm5, %zmm5;
+ vpxorq 3 * 64(%rdx), %zmm4, %zmm4;
+ vpxorq 4 * 64(%rdx), %zmm3, %zmm3;
+ vpxorq 5 * 64(%rdx), %zmm2, %zmm2;
+ vpxorq 6 * 64(%rdx), %zmm1, %zmm1;
+ vpxorq 7 * 64(%rdx), %zmm0, %zmm0;
+ vpxorq 8 * 64(%rdx), %zmm15, %zmm15;
+ vpxorq 9 * 64(%rdx), %zmm14, %zmm14;
+ vpxorq 10 * 64(%rdx), %zmm13, %zmm13;
+ vpxorq 11 * 64(%rdx), %zmm12, %zmm12;
+ vpxorq 12 * 64(%rdx), %zmm11, %zmm11;
+ vpxorq 13 * 64(%rdx), %zmm10, %zmm10;
+ vpxorq 14 * 64(%rdx), %zmm9, %zmm9;
+ vpxorq 15 * 64(%rdx), %zmm8, %zmm8;
+
+ write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9,
+ %zmm8, %rsi);
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_cfb_dec,.-_gcry_camellia_gfni_avx512_cfb_dec;)
+
+.align 8
+.globl _gcry_camellia_gfni_avx512_ocb_enc
+ELF(.type _gcry_camellia_gfni_avx512_ocb_enc,@function;)
+
+_gcry_camellia_gfni_avx512_ocb_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (64 blocks)
+ * %rdx: src (64 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[64])
+ */
+ CFI_STARTPROC();
+ vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+
+ pushq %r12;
+ CFI_PUSH(%r12);
+ pushq %r13;
+ CFI_PUSH(%r13);
+ pushq %r14;
+ CFI_PUSH(%r14);
+ pushq %r15;
+ CFI_PUSH(%r15);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+
+ vmovdqu64 (%rcx), %xmm30;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
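+/* Illustrative only: per 512-bit register, OCB_INPUT below chains the running
+ * offset through four L values (one per 16-byte block), packs the four
+ * resulting offsets into lanes 0..3, and accumulates the plaintext checksum.
+ * A plain-C model of that bookkeeping; xor128 and the other names here are
+ * placeholders for this sketch only.
+ *
+ *   #include <stdint.h>
+ *
+ *   typedef struct { uint64_t w[2]; } blk128;
+ *
+ *   static void xor128 (blk128 *a, const blk128 *b)
+ *   { a->w[0] ^= b->w[0]; a->w[1] ^= b->w[1]; }
+ *
+ *   static void ocb_input_model (blk128 *offset, blk128 *checksum,
+ *                                const blk128 *L[4], const blk128 plain[4],
+ *                                blk128 lanes[4])
+ *   {
+ *     int i;
+ *
+ *     for (i = 0; i < 4; i++)
+ *       {
+ *         xor128 (offset, L[i]);        // Offset_i = Offset_{i-1} ^ L_{ntz(i)}
+ *         lanes[i] = *offset;           // one 128-bit lane of the zmm register
+ *         xor128 (checksum, &plain[i]); // Checksum_i = Checksum_{i-1} ^ P_i
+ *       }
+ *   }
+ */
+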
+#define OCB_INPUT(n, l0reg, l1reg, l2reg, l3reg, zreg, zplain) \
+ vmovdqu64 (n * 64)(%rdx), zplain; \
+ vpxorq (l0reg), %xmm30, %xmm16; \
+ vpxorq (l1reg), %xmm16, %xmm30; \
+ vinserti64x2 $1, %xmm30, %ymm16, %ymm16; \
+ vpxorq (l2reg), %xmm30, %xmm30; \
+ vinserti64x2 $2, %xmm30, %zmm16, %zmm16; \
+ vpxorq (l3reg), %xmm30, %xmm30; \
+ vinserti64x2 $3, %xmm30, %zmm16, %zmm16; \
+ vpxorq zplain, %zmm31, %zmm31; \
+ vpxorq zplain, %zmm16, zreg; \
+ vmovdqu64 %zmm16, (n * 64)(%rsi);
+
+#define OCB_LOAD_PTRS(n) \
+ movq ((n * 4 * 8) + (0 * 8))(%r9), %r10; \
+ movq ((n * 4 * 8) + (1 * 8))(%r9), %r11; \
+ movq ((n * 4 * 8) + (2 * 8))(%r9), %r12; \
+ movq ((n * 4 * 8) + (3 * 8))(%r9), %r13; \
+ movq ((n * 4 * 8) + (4 * 8))(%r9), %r14; \
+ movq ((n * 4 * 8) + (5 * 8))(%r9), %r15; \
+ movq ((n * 4 * 8) + (6 * 8))(%r9), %rax; \
+ movq ((n * 4 * 8) + (7 * 8))(%r9), %rbx;
+
+ OCB_LOAD_PTRS(0);
+ OCB_INPUT(0, %r10, %r11, %r12, %r13, %zmm15, %zmm20);
+ OCB_INPUT(1, %r14, %r15, %rax, %rbx, %zmm14, %zmm21);
+ OCB_LOAD_PTRS(2);
+ OCB_INPUT(2, %r10, %r11, %r12, %r13, %zmm13, %zmm22);
+ vpternlogq $0x96, %zmm20, %zmm21, %zmm22;
+ OCB_INPUT(3, %r14, %r15, %rax, %rbx, %zmm12, %zmm23);
+ OCB_LOAD_PTRS(4);
+ OCB_INPUT(4, %r10, %r11, %r12, %r13, %zmm11, %zmm24);
+ OCB_INPUT(5, %r14, %r15, %rax, %rbx, %zmm10, %zmm25);
+ vpternlogq $0x96, %zmm23, %zmm24, %zmm25;
+ OCB_LOAD_PTRS(6);
+ OCB_INPUT(6, %r10, %r11, %r12, %r13, %zmm9, %zmm20);
+ OCB_INPUT(7, %r14, %r15, %rax, %rbx, %zmm8, %zmm21);
+ OCB_LOAD_PTRS(8);
+ OCB_INPUT(8, %r10, %r11, %r12, %r13, %zmm7, %zmm26);
+ vpternlogq $0x96, %zmm20, %zmm21, %zmm26;
+ OCB_INPUT(9, %r14, %r15, %rax, %rbx, %zmm6, %zmm23);
+ OCB_LOAD_PTRS(10);
+ OCB_INPUT(10, %r10, %r11, %r12, %r13, %zmm5, %zmm24);
+ OCB_INPUT(11, %r14, %r15, %rax, %rbx, %zmm4, %zmm27);
+ vpternlogq $0x96, %zmm23, %zmm24, %zmm27;
+ OCB_LOAD_PTRS(12);
+ OCB_INPUT(12, %r10, %r11, %r12, %r13, %zmm3, %zmm20);
+ OCB_INPUT(13, %r14, %r15, %rax, %rbx, %zmm2, %zmm21);
+ OCB_LOAD_PTRS(14);
+ OCB_INPUT(14, %r10, %r11, %r12, %r13, %zmm1, %zmm23);
+ vpternlogq $0x96, %zmm20, %zmm21, %zmm23;
+ OCB_INPUT(15, %r14, %r15, %rax, %rbx, %zmm0, %zmm24);
+#undef OCB_LOAD_PTRS
+#undef OCB_INPUT
+
+ vpbroadcastq (key_table)(CTX), %zmm16;
+ vpshufb .Lpack_bswap rRIP, %zmm16, %zmm16;
+
+ vpternlogq $0x96, %zmm24, %zmm22, %zmm25;
+ vpternlogq $0x96, %zmm26, %zmm27, %zmm23;
+ vpxorq %zmm25, %zmm23, %zmm20;
+ vextracti64x4 $1, %zmm20, %ymm21;
+ vpxorq %ymm21, %ymm20, %ymm20;
+ vextracti64x2 $1, %ymm20, %xmm21;
+ vpternlogq $0x96, (%r8), %xmm21, %xmm20;
+ vmovdqu64 %xmm30, (%rcx);
+ vmovdqu64 %xmm20, (%r8);
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ /* inpack64_pre: */
+ vpxorq %zmm0, %zmm16, %zmm0;
+ vpxorq %zmm1, %zmm16, %zmm1;
+ vpxorq %zmm2, %zmm16, %zmm2;
+ vpxorq %zmm3, %zmm16, %zmm3;
+ vpxorq %zmm4, %zmm16, %zmm4;
+ vpxorq %zmm5, %zmm16, %zmm5;
+ vpxorq %zmm6, %zmm16, %zmm6;
+ vpxorq %zmm7, %zmm16, %zmm7;
+ vpxorq %zmm8, %zmm16, %zmm8;
+ vpxorq %zmm9, %zmm16, %zmm9;
+ vpxorq %zmm10, %zmm16, %zmm10;
+ vpxorq %zmm11, %zmm16, %zmm11;
+ vpxorq %zmm12, %zmm16, %zmm12;
+ vpxorq %zmm13, %zmm16, %zmm13;
+ vpxorq %zmm14, %zmm16, %zmm14;
+ vpxorq %zmm15, %zmm16, %zmm15;
+
+ call __camellia_gfni_avx512_enc_blk64;
+
+ vpxorq 0 * 64(%rsi), %zmm7, %zmm7;
+ vpxorq 1 * 64(%rsi), %zmm6, %zmm6;
+ vpxorq 2 * 64(%rsi), %zmm5, %zmm5;
+ vpxorq 3 * 64(%rsi), %zmm4, %zmm4;
+ vpxorq 4 * 64(%rsi), %zmm3, %zmm3;
+ vpxorq 5 * 64(%rsi), %zmm2, %zmm2;
+ vpxorq 6 * 64(%rsi), %zmm1, %zmm1;
+ vpxorq 7 * 64(%rsi), %zmm0, %zmm0;
+ vpxorq 8 * 64(%rsi), %zmm15, %zmm15;
+ vpxorq 9 * 64(%rsi), %zmm14, %zmm14;
+ vpxorq 10 * 64(%rsi), %zmm13, %zmm13;
+ vpxorq 11 * 64(%rsi), %zmm12, %zmm12;
+ vpxorq 12 * 64(%rsi), %zmm11, %zmm11;
+ vpxorq 13 * 64(%rsi), %zmm10, %zmm10;
+ vpxorq 14 * 64(%rsi), %zmm9, %zmm9;
+ vpxorq 15 * 64(%rsi), %zmm8, %zmm8;
+
+ write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9,
+ %zmm8, %rsi);
+
+ popq %rbx;
+ CFI_RESTORE(%rbx);
+ popq %r15;
+ CFI_RESTORE(%r15);
+ popq %r14;
+ CFI_RESTORE(%r14);
+	popq %r13;
+	CFI_RESTORE(%r13);
+	popq %r12;
+	CFI_RESTORE(%r12);
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_ocb_enc,.-_gcry_camellia_gfni_avx512_ocb_enc;)
+
+.align 8
+.globl _gcry_camellia_gfni_avx512_ocb_dec
+ELF(.type _gcry_camellia_gfni_avx512_ocb_dec,@function;)
+
+_gcry_camellia_gfni_avx512_ocb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (64 blocks)
+ * %rdx: src (64 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[64])
+ */
+ CFI_STARTPROC();
+ vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+
+ pushq %r12;
+ CFI_PUSH(%r12);
+ pushq %r13;
+ CFI_PUSH(%r13);
+ pushq %r14;
+ CFI_PUSH(%r14);
+ pushq %r15;
+ CFI_PUSH(%r15);
+ pushq %rbx;
+ CFI_PUSH(%rbx);
+ pushq %r8;
+ CFI_PUSH(%r8);
+
+ vmovdqu64 (%rcx), %xmm30;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* C_i = Offset_i xor DECIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, l2reg, l3reg, zreg) \
+ vpxorq (l0reg), %xmm30, %xmm16; \
+ vpxorq (l1reg), %xmm16, %xmm30; \
+ vinserti64x2 $1, %xmm30, %ymm16, %ymm16; \
+ vpxorq (l2reg), %xmm30, %xmm30; \
+ vinserti64x2 $2, %xmm30, %zmm16, %zmm16; \
+ vpxorq (l3reg), %xmm30, %xmm30; \
+ vinserti64x2 $3, %xmm30, %zmm16, %zmm16; \
+ vpxorq (n * 64)(%rdx), %zmm16, zreg; \
+ vmovdqu64 %zmm16, (n * 64)(%rsi);
+
+#define OCB_LOAD_PTRS(n) \
+ movq ((n * 4 * 8) + (0 * 8))(%r9), %r10; \
+ movq ((n * 4 * 8) + (1 * 8))(%r9), %r11; \
+ movq ((n * 4 * 8) + (2 * 8))(%r9), %r12; \
+ movq ((n * 4 * 8) + (3 * 8))(%r9), %r13; \
+ movq ((n * 4 * 8) + (4 * 8))(%r9), %r14; \
+ movq ((n * 4 * 8) + (5 * 8))(%r9), %r15; \
+ movq ((n * 4 * 8) + (6 * 8))(%r9), %rax; \
+ movq ((n * 4 * 8) + (7 * 8))(%r9), %rbx;
+
+ OCB_LOAD_PTRS(0);
+ OCB_INPUT(0, %r10, %r11, %r12, %r13, %zmm15);
+ OCB_INPUT(1, %r14, %r15, %rax, %rbx, %zmm14);
+ OCB_LOAD_PTRS(2);
+ OCB_INPUT(2, %r10, %r11, %r12, %r13, %zmm13);
+ OCB_INPUT(3, %r14, %r15, %rax, %rbx, %zmm12);
+ OCB_LOAD_PTRS(4);
+ OCB_INPUT(4, %r10, %r11, %r12, %r13, %zmm11);
+ OCB_INPUT(5, %r14, %r15, %rax, %rbx, %zmm10);
+ OCB_LOAD_PTRS(6);
+ OCB_INPUT(6, %r10, %r11, %r12, %r13, %zmm9);
+ OCB_INPUT(7, %r14, %r15, %rax, %rbx, %zmm8);
+ OCB_LOAD_PTRS(8);
+ OCB_INPUT(8, %r10, %r11, %r12, %r13, %zmm7);
+ OCB_INPUT(9, %r14, %r15, %rax, %rbx, %zmm6);
+ OCB_LOAD_PTRS(10);
+ OCB_INPUT(10, %r10, %r11, %r12, %r13, %zmm5);
+ OCB_INPUT(11, %r14, %r15, %rax, %rbx, %zmm4);
+ OCB_LOAD_PTRS(12);
+ OCB_INPUT(12, %r10, %r11, %r12, %r13, %zmm3);
+ OCB_INPUT(13, %r14, %r15, %rax, %rbx, %zmm2);
+ OCB_LOAD_PTRS(14);
+ OCB_INPUT(14, %r10, %r11, %r12, %r13, %zmm1);
+ OCB_INPUT(15, %r14, %r15, %rax, %rbx, %zmm0);
+#undef OCB_LOAD_PTRS
+#undef OCB_INPUT
+
+ vmovdqu64 %xmm30, (%rcx);
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+
+ vpbroadcastq (key_table)(CTX, %r8, 8), %zmm16;
+ vpshufb .Lpack_bswap rRIP, %zmm16, %zmm16;
+
+ /* inpack64_pre: */
+ vpxorq %zmm0, %zmm16, %zmm0;
+ vpxorq %zmm1, %zmm16, %zmm1;
+ vpxorq %zmm2, %zmm16, %zmm2;
+ vpxorq %zmm3, %zmm16, %zmm3;
+ vpxorq %zmm4, %zmm16, %zmm4;
+ vpxorq %zmm5, %zmm16, %zmm5;
+ vpxorq %zmm6, %zmm16, %zmm6;
+ vpxorq %zmm7, %zmm16, %zmm7;
+ vpxorq %zmm8, %zmm16, %zmm8;
+ vpxorq %zmm9, %zmm16, %zmm9;
+ vpxorq %zmm10, %zmm16, %zmm10;
+ vpxorq %zmm11, %zmm16, %zmm11;
+ vpxorq %zmm12, %zmm16, %zmm12;
+ vpxorq %zmm13, %zmm16, %zmm13;
+ vpxorq %zmm14, %zmm16, %zmm14;
+ vpxorq %zmm15, %zmm16, %zmm15;
+
+ call __camellia_gfni_avx512_dec_blk64;
+
+ vpxorq 0 * 64(%rsi), %zmm7, %zmm7;
+ vpxorq 1 * 64(%rsi), %zmm6, %zmm6;
+ vpxorq 2 * 64(%rsi), %zmm5, %zmm5;
+ vpxorq 3 * 64(%rsi), %zmm4, %zmm4;
+ vpxorq 4 * 64(%rsi), %zmm3, %zmm3;
+ vpxorq 5 * 64(%rsi), %zmm2, %zmm2;
+ vpxorq 6 * 64(%rsi), %zmm1, %zmm1;
+ vpxorq 7 * 64(%rsi), %zmm0, %zmm0;
+ vpxorq 8 * 64(%rsi), %zmm15, %zmm15;
+ vpxorq 9 * 64(%rsi), %zmm14, %zmm14;
+ vpxorq 10 * 64(%rsi), %zmm13, %zmm13;
+ vpxorq 11 * 64(%rsi), %zmm12, %zmm12;
+ vpxorq 12 * 64(%rsi), %zmm11, %zmm11;
+ vpxorq 13 * 64(%rsi), %zmm10, %zmm10;
+ vpxorq 14 * 64(%rsi), %zmm9, %zmm9;
+ vpxorq 15 * 64(%rsi), %zmm8, %zmm8;
+
+ write_output(%zmm7, %zmm6, %zmm5, %zmm4, %zmm3, %zmm2, %zmm1, %zmm0,
+ %zmm15, %zmm14, %zmm13, %zmm12, %zmm11, %zmm10, %zmm9,
+ %zmm8, %rsi);
+
+ popq %r8;
+ CFI_RESTORE(%r8);
+
+ /* Checksum_i = Checksum_{i-1} xor C_i */
+ vpternlogq $0x96, %zmm7, %zmm6, %zmm5;
+ vpternlogq $0x96, %zmm4, %zmm3, %zmm2;
+ vpternlogq $0x96, %zmm1, %zmm0, %zmm15;
+ vpternlogq $0x96, %zmm14, %zmm13, %zmm12;
+ vpternlogq $0x96, %zmm11, %zmm10, %zmm9;
+ vpternlogq $0x96, %zmm5, %zmm2, %zmm15;
+ vpternlogq $0x96, %zmm12, %zmm9, %zmm8;
+ vpxorq %zmm15, %zmm8, %zmm8;
+
+ vextracti64x4 $1, %zmm8, %ymm0;
+ vpxor %ymm0, %ymm8, %ymm8;
+ vextracti128 $1, %ymm8, %xmm0;
+ vpternlogq $0x96, (%r8), %xmm0, %xmm8;
+ vmovdqu64 %xmm8, (%r8);
+
+ popq %rbx;
+ CFI_RESTORE(%rbx);
+ popq %r15;
+ CFI_RESTORE(%r15);
+ popq %r14;
+ CFI_RESTORE(%r14);
+	popq %r13;
+	CFI_RESTORE(%r13);
+	popq %r12;
+	CFI_RESTORE(%r12);
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_ocb_dec,.-_gcry_camellia_gfni_avx512_ocb_dec;)
+
+.align 8
+.globl _gcry_camellia_gfni_avx512_enc_blk64
+ELF(.type _gcry_camellia_gfni_avx512_enc_blk64,@function;)
+
+_gcry_camellia_gfni_avx512_enc_blk64:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (64 blocks)
+ * %rdx: src (64 blocks)
+ */
+ CFI_STARTPROC();
+ vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+ xorl %eax, %eax;
+
+ vpbroadcastq (key_table)(CTX), %zmm0;
+ vpshufb .Lpack_bswap rRIP, %zmm0, %zmm0;
+
+ vpxorq (0) * 64(%rdx), %zmm0, %zmm15;
+ vpxorq (1) * 64(%rdx), %zmm0, %zmm14;
+ vpxorq (2) * 64(%rdx), %zmm0, %zmm13;
+ vpxorq (3) * 64(%rdx), %zmm0, %zmm12;
+ vpxorq (4) * 64(%rdx), %zmm0, %zmm11;
+ vpxorq (5) * 64(%rdx), %zmm0, %zmm10;
+ vpxorq (6) * 64(%rdx), %zmm0, %zmm9;
+ vpxorq (7) * 64(%rdx), %zmm0, %zmm8;
+ vpxorq (8) * 64(%rdx), %zmm0, %zmm7;
+ vpxorq (9) * 64(%rdx), %zmm0, %zmm6;
+ vpxorq (10) * 64(%rdx), %zmm0, %zmm5;
+ vpxorq (11) * 64(%rdx), %zmm0, %zmm4;
+ vpxorq (12) * 64(%rdx), %zmm0, %zmm3;
+ vpxorq (13) * 64(%rdx), %zmm0, %zmm2;
+ vpxorq (14) * 64(%rdx), %zmm0, %zmm1;
+ vpxorq (15) * 64(%rdx), %zmm0, %zmm0;
+
+ call __camellia_gfni_avx512_enc_blk64;
+
+ vmovdqu64 %zmm7, (0) * 64(%rsi);
+ vmovdqu64 %zmm6, (1) * 64(%rsi);
+ vmovdqu64 %zmm5, (2) * 64(%rsi);
+ vmovdqu64 %zmm4, (3) * 64(%rsi);
+ vmovdqu64 %zmm3, (4) * 64(%rsi);
+ vmovdqu64 %zmm2, (5) * 64(%rsi);
+ vmovdqu64 %zmm1, (6) * 64(%rsi);
+ vmovdqu64 %zmm0, (7) * 64(%rsi);
+ vmovdqu64 %zmm15, (8) * 64(%rsi);
+ vmovdqu64 %zmm14, (9) * 64(%rsi);
+ vmovdqu64 %zmm13, (10) * 64(%rsi);
+ vmovdqu64 %zmm12, (11) * 64(%rsi);
+ vmovdqu64 %zmm11, (12) * 64(%rsi);
+ vmovdqu64 %zmm10, (13) * 64(%rsi);
+ vmovdqu64 %zmm9, (14) * 64(%rsi);
+ vmovdqu64 %zmm8, (15) * 64(%rsi);
+
+ clear_regs();
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_enc_blk64,.-_gcry_camellia_gfni_avx512_enc_blk64;)
+
+.align 8
+.globl _gcry_camellia_gfni_avx512_dec_blk64
+ELF(.type _gcry_camellia_gfni_avx512_dec_blk64,@function;)
+
+_gcry_camellia_gfni_avx512_dec_blk64:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (64 blocks)
+ * %rdx: src (64 blocks)
+ */
+ CFI_STARTPROC();
+ vpopcntb %zmm16, %zmm16; /* spec stop for old AVX512 CPUs */
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %eax;
+ cmovel %eax, %r8d; /* max */
+ xorl %eax, %eax;
+
+ vpbroadcastq (key_table)(CTX, %r8, 8), %zmm0;
+ vpshufb .Lpack_bswap rRIP, %zmm0, %zmm0;
+
+ vpxorq (0) * 64(%rdx), %zmm0, %zmm15;
+ vpxorq (1) * 64(%rdx), %zmm0, %zmm14;
+ vpxorq (2) * 64(%rdx), %zmm0, %zmm13;
+ vpxorq (3) * 64(%rdx), %zmm0, %zmm12;
+ vpxorq (4) * 64(%rdx), %zmm0, %zmm11;
+ vpxorq (5) * 64(%rdx), %zmm0, %zmm10;
+ vpxorq (6) * 64(%rdx), %zmm0, %zmm9;
+ vpxorq (7) * 64(%rdx), %zmm0, %zmm8;
+ vpxorq (8) * 64(%rdx), %zmm0, %zmm7;
+ vpxorq (9) * 64(%rdx), %zmm0, %zmm6;
+ vpxorq (10) * 64(%rdx), %zmm0, %zmm5;
+ vpxorq (11) * 64(%rdx), %zmm0, %zmm4;
+ vpxorq (12) * 64(%rdx), %zmm0, %zmm3;
+ vpxorq (13) * 64(%rdx), %zmm0, %zmm2;
+ vpxorq (14) * 64(%rdx), %zmm0, %zmm1;
+ vpxorq (15) * 64(%rdx), %zmm0, %zmm0;
+
+ call __camellia_gfni_avx512_dec_blk64;
+
+ vmovdqu64 %zmm7, (0) * 64(%rsi);
+ vmovdqu64 %zmm6, (1) * 64(%rsi);
+ vmovdqu64 %zmm5, (2) * 64(%rsi);
+ vmovdqu64 %zmm4, (3) * 64(%rsi);
+ vmovdqu64 %zmm3, (4) * 64(%rsi);
+ vmovdqu64 %zmm2, (5) * 64(%rsi);
+ vmovdqu64 %zmm1, (6) * 64(%rsi);
+ vmovdqu64 %zmm0, (7) * 64(%rsi);
+ vmovdqu64 %zmm15, (8) * 64(%rsi);
+ vmovdqu64 %zmm14, (9) * 64(%rsi);
+ vmovdqu64 %zmm13, (10) * 64(%rsi);
+ vmovdqu64 %zmm12, (11) * 64(%rsi);
+ vmovdqu64 %zmm11, (12) * 64(%rsi);
+ vmovdqu64 %zmm10, (13) * 64(%rsi);
+ vmovdqu64 %zmm9, (14) * 64(%rsi);
+ vmovdqu64 %zmm8, (15) * 64(%rsi);
+
+ clear_regs();
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_camellia_gfni_avx512_dec_blk64,.-_gcry_camellia_gfni_avx512_dec_blk64;)
+
+#endif /* defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX512_SUPPORT) */
+#endif /* __x86_64 */
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 72c02d77..c938be71 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -64,7 +64,7 @@
#include "camellia.h"
#include "bufhelp.h"
#include "cipher-internal.h"
-#include "cipher-selftest.h"
+#include "bulkhelp.h"
/* Helper macro to force alignment to 16 bytes. */
#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
@@ -97,6 +97,18 @@
# define USE_VAES_AVX2 1
#endif
+/* USE_GFNI_AVX2 indicates whether to compile with Intel GFNI/AVX2 code. */
+#undef USE_GFNI_AVX2
+#if defined(USE_AESNI_AVX2) && defined(ENABLE_GFNI_SUPPORT)
+# define USE_GFNI_AVX2 1
+#endif
+
+/* USE_GFNI_AVX512 indicates whether to compile with Intel GFNI/AVX512 code. */
+#undef USE_GFNI_AVX512
+#if defined(USE_GFNI_AVX2) && defined(ENABLE_AVX512_SUPPORT)
+# define USE_GFNI_AVX512 1
+#endif
+
typedef struct
{
KEY_TABLE_TYPE keytable;
@@ -107,6 +119,8 @@ typedef struct
#ifdef USE_AESNI_AVX2
unsigned int use_aesni_avx2:1;/* AES-NI/AVX2 implementation shall be used. */
unsigned int use_vaes_avx2:1; /* VAES/AVX2 implementation shall be used. */
+ unsigned int use_gfni_avx2:1; /* GFNI/AVX2 implementation shall be used. */
+ unsigned int use_gfni_avx512:1; /* GFNI/AVX512 implementation shall be used. */
#endif /*USE_AESNI_AVX2*/
} CAMELLIA_context;
@@ -126,7 +140,7 @@ typedef struct
#ifdef USE_AESNI_AVX
/* Assembler implementations of Camellia using AES-NI and AVX. Process data
- in 16 block same time.
+ in 16 blocks same time.
*/
extern void _gcry_camellia_aesni_avx_ctr_enc(CAMELLIA_context *ctx,
unsigned char *out,
@@ -166,11 +180,15 @@ extern void _gcry_camellia_aesni_avx_ocb_auth(CAMELLIA_context *ctx,
extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
const unsigned char *key,
unsigned int keylen) ASM_FUNC_ABI;
+
+static const int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
#endif
#ifdef USE_AESNI_AVX2
/* Assembler implementations of Camellia using AES-NI and AVX2. Process data
- in 32 block same time.
+ in 32 blocks same time.
*/
extern void _gcry_camellia_aesni_avx2_ctr_enc(CAMELLIA_context *ctx,
unsigned char *out,
@@ -206,11 +224,27 @@ extern void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx,
unsigned char *offset,
unsigned char *checksum,
const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_enc_blk1_32(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned int nblocks)
+ ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_dec_blk1_32(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned int nblocks)
+ ASM_FUNC_ABI;
+
+static const int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
#endif
#ifdef USE_VAES_AVX2
/* Assembler implementations of Camellia using VAES and AVX2. Process data
- in 32 block same time.
+ in 32 blocks same time.
*/
extern void _gcry_camellia_vaes_avx2_ctr_enc(CAMELLIA_context *ctx,
unsigned char *out,
@@ -246,6 +280,117 @@ extern void _gcry_camellia_vaes_avx2_ocb_auth(CAMELLIA_context *ctx,
unsigned char *offset,
unsigned char *checksum,
const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_vaes_avx2_enc_blk1_32(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned int nblocks)
+ ASM_FUNC_ABI;
+
+extern void _gcry_camellia_vaes_avx2_dec_blk1_32(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned int nblocks)
+ ASM_FUNC_ABI;
+#endif
+
+#ifdef USE_GFNI_AVX2
+/* Assembler implementations of Camellia using GFNI and AVX2. Process data
+ in 32 blocks same time.
+ */
+extern void _gcry_camellia_gfni_avx2_ctr_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_cbc_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_cfb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_ocb_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_ocb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_ocb_auth(CAMELLIA_context *ctx,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_enc_blk1_32(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned int nblocks)
+ ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_dec_blk1_32(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned int nblocks)
+ ASM_FUNC_ABI;
+#endif
+
+#ifdef USE_GFNI_AVX512
+/* Assembler implementations of Camellia using GFNI and AVX512. Process data
+ in 64 blocks same time.
+ */
+extern void _gcry_camellia_gfni_avx512_ctr_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx512_cbc_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx512_cfb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx512_ocb_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx512_ocb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx512_enc_blk64(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in)
+ ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx512_dec_blk64(const CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in)
+ ASM_FUNC_ABI;
+
+/* Stack not used by AVX512 implementation. */
+static const int avx512_burn_stack_depth = 0;
#endif
static const char *selftest(void);
@@ -259,6 +404,9 @@ static void _gcry_camellia_cbc_dec (void *context, unsigned char *iv,
static void _gcry_camellia_cfb_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks);
+static void _gcry_camellia_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt);
static size_t _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks,
int encrypt);
@@ -272,7 +420,8 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
CAMELLIA_context *ctx=c;
static int initialized=0;
static const char *selftest_failed=NULL;
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || defined(USE_VAES_AVX2)
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) \
+ || defined(USE_VAES_AVX2) || defined(USE_GFNI_AVX2)
unsigned int hwf = _gcry_get_hw_features ();
#endif
@@ -296,10 +445,18 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
#ifdef USE_AESNI_AVX2
ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
ctx->use_vaes_avx2 = 0;
+ ctx->use_gfni_avx2 = 0;
+ ctx->use_gfni_avx512 = 0;
#endif
#ifdef USE_VAES_AVX2
ctx->use_vaes_avx2 = (hwf & HWF_INTEL_VAES_VPCLMUL) && (hwf & HWF_INTEL_AVX2);
#endif
+#ifdef USE_GFNI_AVX2
+ ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2);
+#endif
+#ifdef USE_GFNI_AVX512
+ ctx->use_gfni_avx512 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX512);
+#endif
ctx->keybitlength=keylen*8;
@@ -310,6 +467,10 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
bulk_ops->ctr_enc = _gcry_camellia_ctr_enc;
bulk_ops->ocb_crypt = _gcry_camellia_ocb_crypt;
bulk_ops->ocb_auth = _gcry_camellia_ocb_auth;
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2 || ctx->use_vaes_avx2 || ctx->use_gfni_avx2)
+ bulk_ops->xts_crypt = _gcry_camellia_xts_crypt;
+#endif
if (0)
{ }
@@ -328,6 +489,23 @@ camellia_setkey(void *c, const byte *key, unsigned keylen,
);
}
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ {
+ /* Disable AESNI & VAES implementations when GFNI implementation is
+ * enabled. */
+#ifdef USE_AESNI_AVX
+ ctx->use_aesni_avx = 0;
+#endif
+#ifdef USE_AESNI_AVX2
+ ctx->use_aesni_avx2 = 0;
+#endif
+#ifdef USE_VAES_AVX2
+ ctx->use_vaes_avx2 = 0;
+#endif
+ }
+#endif
+
return 0;
}
@@ -422,6 +600,168 @@ camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
#endif /*!USE_ARM_ASM*/
+
+static unsigned int
+camellia_encrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf,
+ unsigned int num_blks)
+{
+ const CAMELLIA_context *ctx = priv;
+ unsigned int stack_burn_size = 0;
+
+ gcry_assert (num_blks <= 32);
+
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2 && num_blks >= 3)
+ {
+ /* 3 or more parallel block GFNI processing is faster than
+ * generic C implementation. */
+ _gcry_camellia_gfni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
+ return avx2_burn_stack_depth;
+ }
+#endif
+#ifdef USE_VAES_AVX2
+ if (ctx->use_vaes_avx2 && num_blks >= 6)
+ {
+ /* 6 or more parallel block VAES processing is faster than
+ * generic C implementation. */
+ _gcry_camellia_vaes_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
+ return avx2_burn_stack_depth;
+ }
+#endif
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2 && num_blks >= 6)
+ {
+ /* 6 or more parallel block AESNI processing is faster than
+ * generic C implementation. */
+ _gcry_camellia_aesni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
+ return avx2_burn_stack_depth;
+ }
+#endif
+
+ while (num_blks)
+ {
+ stack_burn_size = camellia_encrypt((void *)ctx, outbuf, inbuf);
+ outbuf += CAMELLIA_BLOCK_SIZE;
+ inbuf += CAMELLIA_BLOCK_SIZE;
+ num_blks--;
+ }
+
+ return stack_burn_size;
+}
+
+static unsigned int
+camellia_encrypt_blk1_64 (const void *priv, byte *outbuf, const byte *inbuf,
+ unsigned int num_blks)
+{
+ const CAMELLIA_context *ctx = priv;
+ unsigned int stack_burn_size = 0;
+ unsigned int nburn;
+
+ gcry_assert (num_blks <= 64);
+
+#ifdef USE_GFNI_AVX512
+ if (num_blks == 64 && ctx->use_gfni_avx512)
+ {
+ _gcry_camellia_gfni_avx512_enc_blk64 (ctx, outbuf, inbuf);
+ return avx512_burn_stack_depth;
+ }
+#endif
+
+ do
+ {
+ unsigned int curr_blks = num_blks > 32 ? 32 : num_blks;
+ nburn = camellia_encrypt_blk1_32 (ctx, outbuf, inbuf, curr_blks);
+ stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size;
+ outbuf += curr_blks * 16;
+ inbuf += curr_blks * 16;
+ num_blks -= curr_blks;
+ }
+ while (num_blks > 0);
+
+ return stack_burn_size;
+}
+
+static unsigned int
+camellia_decrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf,
+ unsigned int num_blks)
+{
+ const CAMELLIA_context *ctx = priv;
+ unsigned int stack_burn_size = 0;
+
+ gcry_assert (num_blks <= 32);
+
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2 && num_blks >= 3)
+ {
+ /* 3 or more parallel block GFNI processing is faster than
+ * generic C implementation. */
+ _gcry_camellia_gfni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
+ return avx2_burn_stack_depth;
+ }
+#endif
+#ifdef USE_VAES_AVX2
+ if (ctx->use_vaes_avx2 && num_blks >= 6)
+ {
+ /* 6 or more parallel block VAES processing is faster than
+ * generic C implementation. */
+ _gcry_camellia_vaes_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
+ return avx2_burn_stack_depth;
+ }
+#endif
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2 && num_blks >= 6)
+ {
+ /* 6 or more parallel block AESNI processing is faster than
+ * generic C implementation. */
+ _gcry_camellia_aesni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
+ return avx2_burn_stack_depth;
+ }
+#endif
+
+ while (num_blks)
+ {
+ stack_burn_size = camellia_decrypt((void *)ctx, outbuf, inbuf);
+ outbuf += CAMELLIA_BLOCK_SIZE;
+ inbuf += CAMELLIA_BLOCK_SIZE;
+ num_blks--;
+ }
+
+ return stack_burn_size;
+}
+
+static unsigned int
+camellia_decrypt_blk1_64 (const void *priv, byte *outbuf, const byte *inbuf,
+ unsigned int num_blks)
+{
+ const CAMELLIA_context *ctx = priv;
+ unsigned int stack_burn_size = 0;
+ unsigned int nburn;
+
+ gcry_assert (num_blks <= 64);
+
+#ifdef USE_GFNI_AVX512
+ if (num_blks == 64 && ctx->use_gfni_avx512)
+ {
+ _gcry_camellia_gfni_avx512_dec_blk64 (ctx, outbuf, inbuf);
+ return avx512_burn_stack_depth;
+ }
+#endif
+
+ do
+ {
+ unsigned int curr_blks = num_blks > 32 ? 32 : num_blks;
+ nburn = camellia_decrypt_blk1_32 (ctx, outbuf, inbuf, curr_blks);
+ stack_burn_size = nburn > stack_burn_size ? nburn : stack_burn_size;
+ outbuf += curr_blks * 16;
+ inbuf += curr_blks * 16;
+ num_blks -= curr_blks;
+ }
+ while (num_blks > 0);
+
+ return stack_burn_size;
+}
+
+
/* Bulk encryption of complete blocks in CTR mode. This function is only
intended for the bulk encryption feature of cipher.c. CTR is expected to be
of size CAMELLIA_BLOCK_SIZE. */
@@ -433,27 +773,53 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
CAMELLIA_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- unsigned char tmpbuf[CAMELLIA_BLOCK_SIZE];
- int burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
+ int burn_stack_depth = 0;
+
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ int did_use_gfni_avx512 = 0;
+
+ /* Process data in 64 block chunks. */
+ while (nblocks >= 64)
+ {
+ _gcry_camellia_gfni_avx512_ctr_enc (ctx, outbuf, inbuf, ctr);
+ nblocks -= 64;
+ outbuf += 64 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 64 * CAMELLIA_BLOCK_SIZE;
+ did_use_gfni_avx512 = 1;
+ }
+
+ if (did_use_gfni_avx512)
+ {
+ if (burn_stack_depth < avx512_burn_stack_depth)
+ burn_stack_depth = avx512_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
+ typeof (&_gcry_camellia_aesni_avx2_ctr_enc) bulk_ctr_fn =
+ _gcry_camellia_aesni_avx2_ctr_enc;
+
#ifdef USE_VAES_AVX2
- int use_vaes = ctx->use_vaes_avx2;
+ if (ctx->use_vaes_avx2)
+	bulk_ctr_fn = _gcry_camellia_vaes_avx2_ctr_enc;
+#endif
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+	bulk_ctr_fn = _gcry_camellia_gfni_avx2_ctr_enc;
#endif
/* Process data in 32 block chunks. */
while (nblocks >= 32)
{
-#ifdef USE_VAES_AVX2
- if (use_vaes)
- _gcry_camellia_vaes_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
- else
-#endif
- _gcry_camellia_aesni_avx2_ctr_enc(ctx, outbuf, inbuf, ctr);
-
+ bulk_ctr_fn (ctx, outbuf, inbuf, ctr);
nblocks -= 32;
outbuf += 32 * CAMELLIA_BLOCK_SIZE;
inbuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -462,15 +828,11 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
if (did_use_aesni_avx2)
{
- int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx2_burn_stack_depth)
burn_stack_depth = avx2_burn_stack_depth;
}
/* Use generic code to handle smaller chunks... */
- /* TODO: use caching instead? */
}
#endif
@@ -492,32 +854,31 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
if (did_use_aesni_avx)
{
- int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx_burn_stack_depth)
burn_stack_depth = avx_burn_stack_depth;
}
/* Use generic code to handle smaller chunks... */
- /* TODO: use caching instead? */
}
#endif
- for ( ;nblocks; nblocks-- )
+ /* Process remaining blocks. */
+ if (nblocks)
{
- /* Encrypt the counter. */
- Camellia_EncryptBlock(ctx->keybitlength, ctr, ctx->keytable, tmpbuf);
- /* XOR the input with the encrypted counter and store in output. */
- cipher_block_xor(outbuf, tmpbuf, inbuf, CAMELLIA_BLOCK_SIZE);
- outbuf += CAMELLIA_BLOCK_SIZE;
- inbuf += CAMELLIA_BLOCK_SIZE;
- /* Increment the counter. */
- cipher_block_add(ctr, 1, CAMELLIA_BLOCK_SIZE);
+ byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+ unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+ size_t nburn;
+
+ nburn = bulk_ctr_enc_128(ctx, camellia_encrypt_blk1_32, outbuf, inbuf,
+ nblocks, ctr, tmpbuf,
+ sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory(tmpbuf, tmp_used);
}
- wipememory(tmpbuf, sizeof(tmpbuf));
- _gcry_burn_stack(burn_stack_depth);
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
}
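+
+/* Illustrative only: a model of what the remaining-blocks CTR path above
+ * accomplishes.  The real batched helper is bulk_ctr_enc_128() from
+ * cipher/bulkhelp.h (added by this change but not shown in this hunk), so the
+ * body below is a sketch inferred from the call site, not the actual helper.
+ * cipher_block_add() and buf_xor() are existing libgcrypt helpers; blk_fn is
+ * e.g. camellia_encrypt_blk1_32.
+ *
+ *   static unsigned int
+ *   ctr_fallback_model (const void *priv,
+ *                       unsigned int (*blk_fn)(const void *, byte *,
+ *                                              const byte *, unsigned int),
+ *                       byte *out, const byte *in, size_t nblocks,
+ *                       byte *ctr, byte *tmp, size_t tmp_nblocks)
+ *   {
+ *     unsigned int burn = 0;
+ *
+ *     while (nblocks)
+ *       {
+ *         size_t n = nblocks < tmp_nblocks ? nblocks : tmp_nblocks;
+ *         size_t i;
+ *
+ *         for (i = 0; i < n; i++)             // expand counter into tmp[]
+ *           {
+ *             memcpy (tmp + i * 16, ctr, 16);
+ *             cipher_block_add (ctr, 1, 16);
+ *           }
+ *         burn = blk_fn (priv, tmp, tmp, (unsigned int)n);
+ *         buf_xor (out, tmp, in, n * 16);     // keystream XOR input
+ *
+ *         out += n * 16;
+ *         in += n * 16;
+ *         nblocks -= n;
+ *       }
+ *
+ *     return burn;
+ *   }
+ */
+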
/* Bulk decryption of complete blocks in CBC mode. This function is only
@@ -530,27 +891,53 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
CAMELLIA_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- unsigned char savebuf[CAMELLIA_BLOCK_SIZE];
- int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+ int burn_stack_depth = 0;
+
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ int did_use_gfni_avx512 = 0;
+
+ /* Process data in 64 block chunks. */
+ while (nblocks >= 64)
+ {
+ _gcry_camellia_gfni_avx512_cbc_dec (ctx, outbuf, inbuf, iv);
+ nblocks -= 64;
+ outbuf += 64 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 64 * CAMELLIA_BLOCK_SIZE;
+ did_use_gfni_avx512 = 1;
+ }
+
+ if (did_use_gfni_avx512)
+ {
+ if (burn_stack_depth < avx512_burn_stack_depth)
+ burn_stack_depth = avx512_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
+ typeof (&_gcry_camellia_aesni_avx2_cbc_dec) bulk_cbc_fn =
+ _gcry_camellia_aesni_avx2_cbc_dec;
+
#ifdef USE_VAES_AVX2
- int use_vaes = ctx->use_vaes_avx2;
+ if (ctx->use_vaes_avx2)
+	bulk_cbc_fn = _gcry_camellia_vaes_avx2_cbc_dec;
+#endif
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+	bulk_cbc_fn = _gcry_camellia_gfni_avx2_cbc_dec;
#endif
/* Process data in 32 block chunks. */
while (nblocks >= 32)
{
-#ifdef USE_VAES_AVX2
- if (use_vaes)
- _gcry_camellia_vaes_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
- else
-#endif
- _gcry_camellia_aesni_avx2_cbc_dec(ctx, outbuf, inbuf, iv);
-
+ bulk_cbc_fn (ctx, outbuf, inbuf, iv);
nblocks -= 32;
outbuf += 32 * CAMELLIA_BLOCK_SIZE;
inbuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -559,9 +946,6 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
if (did_use_aesni_avx2)
{
- int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;;
-
if (burn_stack_depth < avx2_burn_stack_depth)
burn_stack_depth = avx2_burn_stack_depth;
}
@@ -588,9 +972,6 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
if (did_use_aesni_avx)
{
- int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx_burn_stack_depth)
burn_stack_depth = avx_burn_stack_depth;
}
@@ -599,20 +980,23 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
}
#endif
- for ( ;nblocks; nblocks-- )
+ /* Process remaining blocks. */
+ if (nblocks)
{
- /* INBUF is needed later and it may be identical to OUTBUF, so store
- the intermediate result to SAVEBUF. */
- Camellia_DecryptBlock(ctx->keybitlength, inbuf, ctx->keytable, savebuf);
+ byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+ unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+ size_t nburn;
- cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf,
- CAMELLIA_BLOCK_SIZE);
- inbuf += CAMELLIA_BLOCK_SIZE;
- outbuf += CAMELLIA_BLOCK_SIZE;
+ nburn = bulk_cbc_dec_128(ctx, camellia_decrypt_blk1_32, outbuf, inbuf,
+ nblocks, iv, tmpbuf,
+ sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory(tmpbuf, tmp_used);
}
- wipememory(savebuf, sizeof(savebuf));
- _gcry_burn_stack(burn_stack_depth);
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
}
/* Bulk decryption of complete blocks in CFB mode. This function is only
@@ -625,26 +1009,53 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
CAMELLIA_context *ctx = context;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+ int burn_stack_depth = 0;
+
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ int did_use_gfni_avx512 = 0;
+
+ /* Process data in 64 block chunks. */
+ while (nblocks >= 64)
+ {
+ _gcry_camellia_gfni_avx512_cfb_dec (ctx, outbuf, inbuf, iv);
+ nblocks -= 64;
+ outbuf += 64 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 64 * CAMELLIA_BLOCK_SIZE;
+ did_use_gfni_avx512 = 1;
+ }
+
+ if (did_use_gfni_avx512)
+ {
+ if (burn_stack_depth < avx512_burn_stack_depth)
+ burn_stack_depth = avx512_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
+ typeof (&_gcry_camellia_aesni_avx2_cfb_dec) bulk_cfb_fn =
+ _gcry_camellia_aesni_avx2_cfb_dec;
+
#ifdef USE_VAES_AVX2
- int use_vaes = ctx->use_vaes_avx2;
+ if (ctx->use_vaes_avx2)
+	bulk_cfb_fn = _gcry_camellia_vaes_avx2_cfb_dec;
+#endif
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+	bulk_cfb_fn = _gcry_camellia_gfni_avx2_cfb_dec;
#endif
/* Process data in 32 block chunks. */
while (nblocks >= 32)
{
-#ifdef USE_VAES_AVX2
- if (use_vaes)
- _gcry_camellia_vaes_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
- else
-#endif
- _gcry_camellia_aesni_avx2_cfb_dec(ctx, outbuf, inbuf, iv);
-
+ bulk_cfb_fn (ctx, outbuf, inbuf, iv);
nblocks -= 32;
outbuf += 32 * CAMELLIA_BLOCK_SIZE;
inbuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -653,9 +1064,6 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
if (did_use_aesni_avx2)
{
- int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx2_burn_stack_depth)
burn_stack_depth = avx2_burn_stack_depth;
}
@@ -682,9 +1090,6 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
if (did_use_aesni_avx)
{
- int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx_burn_stack_depth)
burn_stack_depth = avx_burn_stack_depth;
}
@@ -693,15 +1098,55 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
}
#endif
- for ( ;nblocks; nblocks-- )
+ /* Process remaining blocks. */
+ if (nblocks)
{
- Camellia_EncryptBlock(ctx->keybitlength, iv, ctx->keytable, iv);
- cipher_block_xor_n_copy(outbuf, iv, inbuf, CAMELLIA_BLOCK_SIZE);
- outbuf += CAMELLIA_BLOCK_SIZE;
- inbuf += CAMELLIA_BLOCK_SIZE;
+ byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+ unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+ size_t nburn;
+
+ nburn = bulk_cfb_dec_128(ctx, camellia_encrypt_blk1_32, outbuf, inbuf,
+ nblocks, iv, tmpbuf,
+ sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory(tmpbuf, tmp_used);
+ }
+
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+static void
+_gcry_camellia_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt)
+{
+ CAMELLIA_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
+
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ byte tmpbuf[CAMELLIA_BLOCK_SIZE * 64];
+ unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+ size_t nburn;
+
+ nburn = bulk_xts_crypt_128(ctx, encrypt ? camellia_encrypt_blk1_64
+ : camellia_decrypt_blk1_64,
+ outbuf, inbuf, nblocks, tweak, tmpbuf,
+ sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory(tmpbuf, tmp_used);
}
- _gcry_burn_stack(burn_stack_depth);
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
}
/* Bulk encryption/decryption of complete blocks in OCB mode. */
@@ -713,11 +1158,9 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
CAMELLIA_context *ctx = (void *)&c->context.c;
unsigned char *outbuf = outbuf_arg;
const unsigned char *inbuf = inbuf_arg;
- int burn_stack_depth;
+ int burn_stack_depth = 0;
u64 blkn = c->u_mode.ocb.data_nblocks;
- burn_stack_depth = encrypt ? CAMELLIA_encrypt_stack_burn_size :
- CAMELLIA_decrypt_stack_burn_size;
#else
(void)c;
(void)outbuf_arg;
@@ -725,38 +1168,69 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
(void)encrypt;
#endif
+#ifdef USE_GFNI_AVX512
+ if (ctx->use_gfni_avx512)
+ {
+ int did_use_gfni_avx512 = 0;
+ u64 Ls[64];
+ u64 *l;
+
+ if (nblocks >= 64)
+ {
+ typeof (&_gcry_camellia_gfni_avx512_ocb_dec) bulk_ocb_fn =
+ encrypt ? _gcry_camellia_gfni_avx512_ocb_enc
+ : _gcry_camellia_gfni_avx512_ocb_dec;
+ l = bulk_ocb_prepare_L_pointers_array_blk64 (c, Ls, blkn);
+
+ /* Process data in 64 block chunks. */
+ while (nblocks >= 64)
+ {
+ blkn += 64;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 64);
+
+ bulk_ocb_fn (ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls);
+
+ nblocks -= 64;
+ outbuf += 64 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 64 * CAMELLIA_BLOCK_SIZE;
+ did_use_gfni_avx512 = 1;
+ }
+ }
+
+ if (did_use_gfni_avx512)
+ {
+ if (burn_stack_depth < avx2_burn_stack_depth)
+ burn_stack_depth = avx2_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
-#ifdef USE_VAES_AVX2
- int encrypt_use_vaes = encrypt && ctx->use_vaes_avx2;
- int decrypt_use_vaes = !encrypt && ctx->use_vaes_avx2;
-#endif
u64 Ls[32];
- unsigned int n = 32 - (blkn % 32);
u64 *l;
- int i;
if (nblocks >= 32)
{
- for (i = 0; i < 32; i += 8)
- {
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(i + 0 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 1 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 2 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 3 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(i + 4 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 5 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 6 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- }
+ typeof (&_gcry_camellia_aesni_avx2_ocb_dec) bulk_ocb_fn =
+ encrypt ? _gcry_camellia_aesni_avx2_ocb_enc
+ : _gcry_camellia_aesni_avx2_ocb_dec;
- Ls[(7 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- Ls[(15 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
- Ls[(23 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- l = &Ls[(31 + n) % 32];
+#ifdef USE_VAES_AVX2
+ if (ctx->use_vaes_avx2)
+ bulk_ocb_fn = encrypt ? _gcry_camellia_vaes_avx2_ocb_enc
+ : _gcry_camellia_vaes_avx2_ocb_dec;
+#endif
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ bulk_ocb_fn = encrypt ? _gcry_camellia_gfni_avx2_ocb_enc
+ : _gcry_camellia_gfni_avx2_ocb_dec;
+#endif
+ l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn);
/* Process data in 32 block chunks. */
while (nblocks >= 32)
@@ -764,21 +1238,7 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
blkn += 32;
*l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32);
- if (0) {}
-#ifdef USE_VAES_AVX2
- else if (encrypt_use_vaes)
- _gcry_camellia_vaes_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
- else if (decrypt_use_vaes)
- _gcry_camellia_vaes_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
-#endif
- else if (encrypt)
- _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
- else
- _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
+ bulk_ocb_fn (ctx, outbuf, inbuf, c->u_iv.iv, c->u_ctr.ctr, Ls);
nblocks -= 32;
outbuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -789,9 +1249,6 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
if (did_use_aesni_avx2)
{
- int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx2_burn_stack_depth)
burn_stack_depth = avx2_burn_stack_depth;
}
@@ -805,27 +1262,11 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
{
int did_use_aesni_avx = 0;
u64 Ls[16];
- unsigned int n = 16 - (blkn % 16);
u64 *l;
- int i;
if (nblocks >= 16)
{
- for (i = 0; i < 16; i += 8)
- {
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- }
-
- Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- l = &Ls[(15 + n) % 16];
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
/* Process data in 16 block chunks. */
while (nblocks >= 16)
@@ -849,9 +1290,6 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
if (did_use_aesni_avx)
{
- int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx_burn_stack_depth)
burn_stack_depth = avx_burn_stack_depth;
}
@@ -861,6 +1299,24 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
#endif
#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+ unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+ size_t nburn;
+
+ nburn = bulk_ocb_crypt_128 (c, ctx, encrypt ? camellia_encrypt_blk1_32
+ : camellia_decrypt_blk1_32,
+ outbuf, inbuf, nblocks, &blkn, encrypt,
+ tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory(tmpbuf, tmp_used);
+ nblocks = 0;
+ }
+
c->u_mode.ocb.data_nblocks = blkn;
if (burn_stack_depth)
@@ -878,10 +1334,8 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
CAMELLIA_context *ctx = (void *)&c->context.c;
const unsigned char *abuf = abuf_arg;
- int burn_stack_depth;
+ int burn_stack_depth = 0;
u64 blkn = c->u_mode.ocb.aad_nblocks;
-
- burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
#else
(void)c;
(void)abuf_arg;
@@ -891,33 +1345,24 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
if (ctx->use_aesni_avx2)
{
int did_use_aesni_avx2 = 0;
-#ifdef USE_VAES_AVX2
- int use_vaes = ctx->use_vaes_avx2;
-#endif
u64 Ls[32];
- unsigned int n = 32 - (blkn % 32);
u64 *l;
- int i;
if (nblocks >= 32)
{
- for (i = 0; i < 32; i += 8)
- {
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(i + 0 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 1 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 2 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 3 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(i + 4 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 5 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 6 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- }
+ typeof (&_gcry_camellia_aesni_avx2_ocb_auth) bulk_auth_fn =
+ _gcry_camellia_aesni_avx2_ocb_auth;
+
+#ifdef USE_VAES_AVX2
+ if (ctx->use_vaes_avx2)
+ bulk_auth_fn = _gcry_camellia_vaes_avx2_ocb_auth;
+#endif
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ bulk_auth_fn = _gcry_camellia_gfni_avx2_ocb_auth;
+#endif
- Ls[(7 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- Ls[(15 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[4];
- Ls[(23 + n) % 32] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- l = &Ls[(31 + n) % 32];
+ l = bulk_ocb_prepare_L_pointers_array_blk32 (c, Ls, blkn);
/* Process data in 32 block chunks. */
while (nblocks >= 32)
@@ -925,16 +1370,8 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
blkn += 32;
*l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 32);
-#ifdef USE_VAES_AVX2
- if (use_vaes)
- _gcry_camellia_vaes_avx2_ocb_auth(ctx, abuf,
- c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
- else
-#endif
- _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf,
- c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
+ bulk_auth_fn (ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
nblocks -= 32;
abuf += 32 * CAMELLIA_BLOCK_SIZE;
@@ -944,9 +1381,6 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
if (did_use_aesni_avx2)
{
- int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx2_burn_stack_depth)
burn_stack_depth = avx2_burn_stack_depth;
}
@@ -960,27 +1394,11 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
{
int did_use_aesni_avx = 0;
u64 Ls[16];
- unsigned int n = 16 - (blkn % 16);
u64 *l;
- int i;
if (nblocks >= 16)
{
- for (i = 0; i < 16; i += 8)
- {
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- }
-
- Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- l = &Ls[(15 + n) % 16];
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
/* Process data in 16 block chunks. */
while (nblocks >= 16)
@@ -1000,9 +1418,6 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
if (did_use_aesni_avx)
{
- int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
- 2 * sizeof(void *) + ASM_EXTRA_STACK;
-
if (burn_stack_depth < avx_burn_stack_depth)
burn_stack_depth = avx_burn_stack_depth;
}
@@ -1012,6 +1427,23 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
#endif
#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+ unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+ size_t nburn;
+
+ nburn = bulk_ocb_auth_128 (c, ctx, camellia_encrypt_blk1_32,
+ abuf, nblocks, &blkn, tmpbuf,
+ sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+ wipememory(tmpbuf, tmp_used);
+ nblocks = 0;
+ }
+
c->u_mode.ocb.aad_nblocks = blkn;
if (burn_stack_depth)
@@ -1021,44 +1453,6 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
return nblocks;
}
-/* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR
- encryption. Returns NULL on success. */
-static const char*
-selftest_ctr_128 (void)
-{
- const int nblocks = 32+16+1;
- const int blocksize = CAMELLIA_BLOCK_SIZE;
- const int context_size = sizeof(CAMELLIA_context);
-
- return _gcry_selftest_helper_ctr("CAMELLIA", &camellia_setkey,
- &camellia_encrypt, nblocks, blocksize, context_size);
-}
-
-/* Run the self-tests for CAMELLIA-CBC-128, tests bulk CBC decryption.
- Returns NULL on success. */
-static const char*
-selftest_cbc_128 (void)
-{
- const int nblocks = 32+16+2;
- const int blocksize = CAMELLIA_BLOCK_SIZE;
- const int context_size = sizeof(CAMELLIA_context);
-
- return _gcry_selftest_helper_cbc("CAMELLIA", &camellia_setkey,
- &camellia_encrypt, nblocks, blocksize, context_size);
-}
-
-/* Run the self-tests for CAMELLIA-CFB-128, tests bulk CFB decryption.
- Returns NULL on success. */
-static const char*
-selftest_cfb_128 (void)
-{
- const int nblocks = 32+16+2;
- const int blocksize = CAMELLIA_BLOCK_SIZE;
- const int context_size = sizeof(CAMELLIA_context);
-
- return _gcry_selftest_helper_cfb("CAMELLIA", &camellia_setkey,
- &camellia_encrypt, nblocks, blocksize, context_size);
-}
static const char *
selftest(void)
@@ -1066,7 +1460,6 @@ selftest(void)
CAMELLIA_context ctx;
byte scratch[16];
cipher_bulk_ops_t bulk_ops;
- const char *r;
/* These test vectors are from RFC-3713 */
static const byte plaintext[]=
@@ -1130,15 +1523,6 @@ selftest(void)
if(memcmp(scratch,plaintext,sizeof(plaintext))!=0)
return "CAMELLIA-256 test decryption failed.";
- if ( (r = selftest_ctr_128 ()) )
- return r;
-
- if ( (r = selftest_cbc_128 ()) )
- return r;
-
- if ( (r = selftest_cfb_128 ()) )
- return r;
-
return NULL;
}
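For reference, the open-coded Ls[] initialisations deleted above all follow one fixed pattern: slot k of the table holds the OCB offset L[ntz(k+1)] (hence L[0], L[1], L[0], L[2], ...), the table is rotated so it stays in phase with the running block counter blkn, and the final slot is refreshed before every chunk via ocb_get_l(). The new bulk_ocb_prepare_L_pointers_array_blk16/32/64 helpers presumably encapsulate exactly this; the sketch below (hypothetical names, not the bulkhelp.h code) shows the 16-entry case.

#include <stdint.h>

/* Count trailing zero bits of a non-zero value.  */
static unsigned int
ntz (unsigned int k)
{
  unsigned int n = 0;
  while ((k & 1) == 0)
    {
      k >>= 1;
      n++;
    }
  return n;
}

/* Illustrative sketch of what a bulk_ocb_prepare_L_pointers_array_blkN
 * helper has to produce: entry k is L[ntz(k+1)], stored as u64 for x32
 * support, rotated to match BLKN; the last slot is left for the caller
 * to refresh per chunk.  Returns a pointer to that last slot.  */
static uint64_t *
sketch_prepare_L_pointers_blk16 (const void *L[/* >= 4 entries */],
                                 uint64_t Ls[16], uint64_t blkn)
{
  unsigned int n = 16 - (unsigned int)(blkn % 16);
  unsigned int k;

  for (k = 0; k < 15; k++)
    Ls[(k + n) % 16] = (uintptr_t)L[ntz (k + 1)];

  /* The 16th offset depends on the absolute block number and is filled
   * in by the caller before each 16-block chunk.  */
  return &Ls[(15 + n) % 16];
}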
diff --git a/cipher/cast5.c b/cipher/cast5.c
index 837ea0fe..20bf7479 100644
--- a/cipher/cast5.c
+++ b/cipher/cast5.c
@@ -45,7 +45,6 @@
#include "bithelp.h"
#include "bufhelp.h"
#include "cipher-internal.h"
-#include "cipher-selftest.h"
/* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */
#undef USE_AMD64_ASM
@@ -991,48 +990,6 @@ _gcry_cast5_cfb_dec(void *context, unsigned char *iv, void *outbuf_arg,
}
-/* Run the self-tests for CAST5-CTR, tests IV increment of bulk CTR
- encryption. Returns NULL on success. */
-static const char *
-selftest_ctr (void)
-{
- const int nblocks = 4+1;
- const int blocksize = CAST5_BLOCKSIZE;
- const int context_size = sizeof(CAST5_context);
-
- return _gcry_selftest_helper_ctr("CAST5", &cast_setkey,
- &encrypt_block, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for CAST5-CBC, tests bulk CBC decryption.
- Returns NULL on success. */
-static const char *
-selftest_cbc (void)
-{
- const int nblocks = 4+2;
- const int blocksize = CAST5_BLOCKSIZE;
- const int context_size = sizeof(CAST5_context);
-
- return _gcry_selftest_helper_cbc("CAST5", &cast_setkey,
- &encrypt_block, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for CAST5-CFB, tests bulk CBC decryption.
- Returns NULL on success. */
-static const char *
-selftest_cfb (void)
-{
- const int nblocks = 4+2;
- const int blocksize = CAST5_BLOCKSIZE;
- const int context_size = sizeof(CAST5_context);
-
- return _gcry_selftest_helper_cfb("CAST5", &cast_setkey,
- &encrypt_block, nblocks, blocksize, context_size);
-}
-
-
static const char*
selftest(void)
{
@@ -1046,7 +1003,6 @@ selftest(void)
static const byte cipher[8] =
{ 0x23, 0x8B, 0x4F, 0xE5, 0x84, 0x7E, 0x44, 0xB2 };
byte buffer[8];
- const char *r;
cast_setkey( &c, key, 16, &bulk_ops );
encrypt_block( &c, buffer, plain );
@@ -1082,15 +1038,6 @@ selftest(void)
}
#endif
- if ( (r = selftest_cbc ()) )
- return r;
-
- if ( (r = selftest_cfb ()) )
- return r;
-
- if ( (r = selftest_ctr ()) )
- return r;
-
return NULL;
}
diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S
index 2a980b95..540f892b 100644
--- a/cipher/chacha20-aarch64.S
+++ b/cipher/chacha20-aarch64.S
@@ -206,10 +206,10 @@ _gcry_chacha20_aarch64_blocks4:
*/
CFI_STARTPROC()
- GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8);
+ GET_LOCAL_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8);
add INPUT_CTR, INPUT, #(12*4);
ld1 {ROT8.16b}, [CTR];
- GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter);
+ GET_LOCAL_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter);
mov INPUT_POS, INPUT;
ld1 {VCTR.16b}, [CTR];
@@ -383,10 +383,10 @@ _gcry_chacha20_poly1305_aarch64_blocks4:
mov POLY_RSTATE, x4;
mov POLY_RSRC, x5;
- GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8);
+ GET_LOCAL_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_rot8);
add INPUT_CTR, INPUT, #(12*4);
ld1 {ROT8.16b}, [CTR];
- GET_DATA_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter);
+ GET_LOCAL_POINTER(CTR, _gcry_chacha20_aarch64_blocks4_data_inc_counter);
mov INPUT_POS, INPUT;
ld1 {VCTR.16b}, [CTR];
diff --git a/cipher/chacha20-amd64-avx512.S b/cipher/chacha20-amd64-avx512.S
index da24286e..8b4d7499 100644
--- a/cipher/chacha20-amd64-avx512.S
+++ b/cipher/chacha20-amd64-avx512.S
@@ -287,7 +287,7 @@ _gcry_chacha20_amd64_avx512_blocks16:
/* clear the used vector registers */
clear_zmm16_zmm31();
- kmovd %eax, %k2;
+ kxord %k2, %k2, %k2;
vzeroall; /* clears ZMM0-ZMM15 */
/* eax zeroed by round loop. */
diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S
index 687fabe3..78f3ad2d 100644
--- a/cipher/cipher-gcm-armv8-aarch64-ce.S
+++ b/cipher/cipher-gcm-armv8-aarch64-ce.S
@@ -169,7 +169,7 @@ _gcry_ghash_armv8_ce_pmull:
cbz x3, .Ldo_nothing;
- GET_DATA_POINTER(x5, .Lrconst)
+ GET_LOCAL_POINTER(x5, .Lrconst)
eor vZZ.16b, vZZ.16b, vZZ.16b
ld1 {rhash.16b}, [x1]
@@ -368,7 +368,7 @@ _gcry_polyval_armv8_ce_pmull:
cbz x3, .Lpolyval_do_nothing;
- GET_DATA_POINTER(x5, .Lrconst)
+ GET_LOCAL_POINTER(x5, .Lrconst)
eor vZZ.16b, vZZ.16b, vZZ.16b
ld1 {rhash.16b}, [x1]
@@ -589,7 +589,7 @@ _gcry_ghash_setup_armv8_ce_pmull:
*/
CFI_STARTPROC()
- GET_DATA_POINTER(x2, .Lrconst)
+ GET_LOCAL_POINTER(x2, .Lrconst)
eor vZZ.16b, vZZ.16b, vZZ.16b
diff --git a/cipher/cipher-selftest.c b/cipher/cipher-selftest.c
deleted file mode 100644
index d7f38a42..00000000
--- a/cipher/cipher-selftest.c
+++ /dev/null
@@ -1,512 +0,0 @@
-/* cipher-selftest.c - Helper functions for bulk encryption selftests.
- * Copyright (C) 2013,2020 Jussi Kivilinna <[email protected]>
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <config.h>
-#ifdef HAVE_SYSLOG
-# include <syslog.h>
-#endif /*HAVE_SYSLOG*/
-
-#include "types.h"
-#include "g10lib.h"
-#include "cipher.h"
-#include "bufhelp.h"
-#include "cipher-selftest.h"
-#include "cipher-internal.h"
-
-#ifdef HAVE_STDINT_H
-# include <stdint.h> /* uintptr_t */
-#elif defined(HAVE_INTTYPES_H)
-# include <inttypes.h>
-#else
-/* In this case, uintptr_t is provided by config.h. */
-#endif
-
-/* Helper macro to force alignment to 16 bytes. */
-#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
-# define ATTR_ALIGNED_16 __attribute__ ((aligned (16)))
-#else
-# define ATTR_ALIGNED_16
-#endif
-
-
-/* Return an allocated buffers of size CONTEXT_SIZE with an alignment
- of 16. The caller must free that buffer using the address returned
- at R_MEM. Returns NULL and sets ERRNO on failure. */
-void *
-_gcry_cipher_selftest_alloc_ctx (const int context_size, unsigned char **r_mem)
-{
- int offs;
- unsigned int ctx_aligned_size, memsize;
-
- ctx_aligned_size = context_size + 15;
- ctx_aligned_size -= ctx_aligned_size & 0xf;
-
- memsize = ctx_aligned_size + 16;
-
- *r_mem = xtrycalloc (1, memsize);
- if (!*r_mem)
- return NULL;
-
- offs = (16 - ((uintptr_t)*r_mem & 15)) & 15;
- return (void*)(*r_mem + offs);
-}
-
-
-/* Run the self-tests for <block cipher>-CBC-<block size>, tests bulk CBC
- decryption. Returns NULL on success. */
-const char *
-_gcry_selftest_helper_cbc (const char *cipher, gcry_cipher_setkey_t setkey_func,
- gcry_cipher_encrypt_t encrypt_one,
- const int nblocks, const int blocksize,
- const int context_size)
-{
- cipher_bulk_ops_t bulk_ops = { 0, };
- int i, offs;
- unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem;
- unsigned int ctx_aligned_size, memsize;
-
- static const unsigned char key[16] ATTR_ALIGNED_16 = {
- 0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
- 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22
- };
-
- /* Allocate buffers, align first two elements to 16 bytes and latter to
- block size. */
- ctx_aligned_size = context_size + 15;
- ctx_aligned_size -= ctx_aligned_size & 0xf;
-
- memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16;
-
- mem = xtrycalloc (1, memsize);
- if (!mem)
- return "failed to allocate memory";
-
- offs = (16 - ((uintptr_t)mem & 15)) & 15;
- ctx = (void*)(mem + offs);
- iv = ctx + ctx_aligned_size;
- iv2 = iv + blocksize;
- plaintext = iv2 + blocksize;
- plaintext2 = plaintext + nblocks * blocksize;
- ciphertext = plaintext2 + nblocks * blocksize;
-
- /* Initialize ctx */
- if (setkey_func (ctx, key, sizeof(key), &bulk_ops) != GPG_ERR_NO_ERROR)
- {
- xfree(mem);
- return "setkey failed";
- }
-
- /* Test single block code path */
- memset (iv, 0x4e, blocksize);
- memset (iv2, 0x4e, blocksize);
- for (i = 0; i < blocksize; i++)
- plaintext[i] = i;
-
- /* CBC manually. */
- buf_xor (ciphertext, iv, plaintext, blocksize);
- encrypt_one (ctx, ciphertext, ciphertext);
- memcpy (iv, ciphertext, blocksize);
-
- /* CBC decrypt. */
- bulk_ops.cbc_dec (ctx, iv2, plaintext2, ciphertext, 1);
- if (memcmp (plaintext2, plaintext, blocksize))
- {
- xfree (mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CBC-%d test failed (plaintext mismatch)", cipher,
- blocksize * 8);
-#else
- (void)cipher; /* Not used. */
-#endif
- return "selftest for CBC failed - see syslog for details";
- }
-
- if (memcmp (iv2, iv, blocksize))
- {
- xfree (mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CBC-%d test failed (IV mismatch)", cipher, blocksize * 8);
-#endif
- return "selftest for CBC failed - see syslog for details";
- }
-
- /* Test parallelized code paths */
- memset (iv, 0x5f, blocksize);
- memset (iv2, 0x5f, blocksize);
-
- for (i = 0; i < nblocks * blocksize; i++)
- plaintext[i] = i;
-
- /* Create CBC ciphertext manually. */
- for (i = 0; i < nblocks * blocksize; i+=blocksize)
- {
- buf_xor (&ciphertext[i], iv, &plaintext[i], blocksize);
- encrypt_one (ctx, &ciphertext[i], &ciphertext[i]);
- memcpy (iv, &ciphertext[i], blocksize);
- }
-
- /* Decrypt using bulk CBC and compare result. */
- bulk_ops.cbc_dec (ctx, iv2, plaintext2, ciphertext, nblocks);
-
- if (memcmp (plaintext2, plaintext, nblocks * blocksize))
- {
- xfree (mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CBC-%d test failed (plaintext mismatch, parallel path)",
- cipher, blocksize * 8);
-#endif
- return "selftest for CBC failed - see syslog for details";
- }
- if (memcmp (iv2, iv, blocksize))
- {
- xfree (mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CBC-%d test failed (IV mismatch, parallel path)",
- cipher, blocksize * 8);
-#endif
- return "selftest for CBC failed - see syslog for details";
- }
-
- xfree (mem);
- return NULL;
-}
-
-/* Run the self-tests for <block cipher>-CFB-<block size>, tests bulk CFB
- decryption. Returns NULL on success. */
-const char *
-_gcry_selftest_helper_cfb (const char *cipher, gcry_cipher_setkey_t setkey_func,
- gcry_cipher_encrypt_t encrypt_one,
- const int nblocks, const int blocksize,
- const int context_size)
-{
- cipher_bulk_ops_t bulk_ops = { 0, };
- int i, offs;
- unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem;
- unsigned int ctx_aligned_size, memsize;
-
- static const unsigned char key[16] ATTR_ALIGNED_16 = {
- 0x11,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
- 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x33
- };
-
- /* Allocate buffers, align first two elements to 16 bytes and latter to
- block size. */
- ctx_aligned_size = context_size + 15;
- ctx_aligned_size -= ctx_aligned_size & 0xf;
-
- memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16;
-
- mem = xtrycalloc (1, memsize);
- if (!mem)
- return "failed to allocate memory";
-
- offs = (16 - ((uintptr_t)mem & 15)) & 15;
- ctx = (void*)(mem + offs);
- iv = ctx + ctx_aligned_size;
- iv2 = iv + blocksize;
- plaintext = iv2 + blocksize;
- plaintext2 = plaintext + nblocks * blocksize;
- ciphertext = plaintext2 + nblocks * blocksize;
-
- /* Initialize ctx */
- if (setkey_func (ctx, key, sizeof(key), &bulk_ops) != GPG_ERR_NO_ERROR)
- {
- xfree(mem);
- return "setkey failed";
- }
-
- /* Test single block code path */
- memset(iv, 0xd3, blocksize);
- memset(iv2, 0xd3, blocksize);
- for (i = 0; i < blocksize; i++)
- plaintext[i] = i;
-
- /* CFB manually. */
- encrypt_one (ctx, ciphertext, iv);
- buf_xor_2dst (iv, ciphertext, plaintext, blocksize);
-
- /* CFB decrypt. */
- bulk_ops.cfb_dec (ctx, iv2, plaintext2, ciphertext, 1);
- if (memcmp(plaintext2, plaintext, blocksize))
- {
- xfree(mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CFB-%d test failed (plaintext mismatch)", cipher,
- blocksize * 8);
-#else
- (void)cipher; /* Not used. */
-#endif
- return "selftest for CFB failed - see syslog for details";
- }
-
- if (memcmp(iv2, iv, blocksize))
- {
- xfree(mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CFB-%d test failed (IV mismatch)", cipher, blocksize * 8);
-#endif
- return "selftest for CFB failed - see syslog for details";
- }
-
- /* Test parallelized code paths */
- memset(iv, 0xe6, blocksize);
- memset(iv2, 0xe6, blocksize);
-
- for (i = 0; i < nblocks * blocksize; i++)
- plaintext[i] = i;
-
- /* Create CFB ciphertext manually. */
- for (i = 0; i < nblocks * blocksize; i+=blocksize)
- {
- encrypt_one (ctx, &ciphertext[i], iv);
- buf_xor_2dst (iv, &ciphertext[i], &plaintext[i], blocksize);
- }
-
- /* Decrypt using bulk CBC and compare result. */
- bulk_ops.cfb_dec (ctx, iv2, plaintext2, ciphertext, nblocks);
-
- if (memcmp(plaintext2, plaintext, nblocks * blocksize))
- {
- xfree(mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CFB-%d test failed (plaintext mismatch, parallel path)",
- cipher, blocksize * 8);
-#endif
- return "selftest for CFB failed - see syslog for details";
- }
- if (memcmp(iv2, iv, blocksize))
- {
- xfree(mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CFB-%d test failed (IV mismatch, parallel path)", cipher,
- blocksize * 8);
-#endif
- return "selftest for CFB failed - see syslog for details";
- }
-
- xfree(mem);
- return NULL;
-}
-
-/* Run the self-tests for <block cipher>-CTR-<block size>, tests IV increment
- of bulk CTR encryption. Returns NULL on success. */
-const char *
-_gcry_selftest_helper_ctr (const char *cipher, gcry_cipher_setkey_t setkey_func,
- gcry_cipher_encrypt_t encrypt_one,
- const int nblocks, const int blocksize,
- const int context_size)
-{
- cipher_bulk_ops_t bulk_ops = { 0, };
- int i, j, offs, diff;
- unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *ciphertext2,
- *iv, *iv2, *mem;
- unsigned int ctx_aligned_size, memsize;
-
- static const unsigned char key[16] ATTR_ALIGNED_16 = {
- 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
- 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21
- };
-
- /* Allocate buffers, align first two elements to 16 bytes and latter to
- block size. */
- ctx_aligned_size = context_size + 15;
- ctx_aligned_size -= ctx_aligned_size & 0xf;
-
- memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 4) + 16;
-
- mem = xtrycalloc (1, memsize);
- if (!mem)
- return "failed to allocate memory";
-
- offs = (16 - ((uintptr_t)mem & 15)) & 15;
- ctx = (void*)(mem + offs);
- iv = ctx + ctx_aligned_size;
- iv2 = iv + blocksize;
- plaintext = iv2 + blocksize;
- plaintext2 = plaintext + nblocks * blocksize;
- ciphertext = plaintext2 + nblocks * blocksize;
- ciphertext2 = ciphertext + nblocks * blocksize;
-
- /* Initialize ctx */
- if (setkey_func (ctx, key, sizeof(key), &bulk_ops) != GPG_ERR_NO_ERROR)
- {
- xfree(mem);
- return "setkey failed";
- }
-
- /* Test single block code path */
- memset (iv, 0xff, blocksize);
- for (i = 0; i < blocksize; i++)
- plaintext[i] = i;
-
- /* CTR manually. */
- encrypt_one (ctx, ciphertext, iv);
- for (i = 0; i < blocksize; i++)
- ciphertext[i] ^= plaintext[i];
- for (i = blocksize; i > 0; i--)
- {
- iv[i-1]++;
- if (iv[i-1])
- break;
- }
-
- memset (iv2, 0xff, blocksize);
- bulk_ops.ctr_enc (ctx, iv2, plaintext2, ciphertext, 1);
-
- if (memcmp (plaintext2, plaintext, blocksize))
- {
- xfree (mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CTR-%d test failed (plaintext mismatch)", cipher,
- blocksize * 8);
-#else
- (void)cipher; /* Not used. */
-#endif
- return "selftest for CTR failed - see syslog for details";
- }
-
- if (memcmp (iv2, iv, blocksize))
- {
- xfree (mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CTR-%d test failed (IV mismatch)", cipher,
- blocksize * 8);
-#endif
- return "selftest for CTR failed - see syslog for details";
- }
-
- /* Test bulk encryption with typical IV. */
- memset(iv, 0x57, blocksize-4);
- iv[blocksize-1] = 1;
- iv[blocksize-2] = 0;
- iv[blocksize-3] = 0;
- iv[blocksize-4] = 0;
- memset(iv2, 0x57, blocksize-4);
- iv2[blocksize-1] = 1;
- iv2[blocksize-2] = 0;
- iv2[blocksize-3] = 0;
- iv2[blocksize-4] = 0;
-
- for (i = 0; i < blocksize * nblocks; i++)
- plaintext2[i] = plaintext[i] = i;
-
- /* Create CTR ciphertext manually. */
- for (i = 0; i < blocksize * nblocks; i+=blocksize)
- {
- encrypt_one (ctx, &ciphertext[i], iv);
- for (j = 0; j < blocksize; j++)
- ciphertext[i+j] ^= plaintext[i+j];
- for (j = blocksize; j > 0; j--)
- {
- iv[j-1]++;
- if (iv[j-1])
- break;
- }
- }
-
- bulk_ops.ctr_enc (ctx, iv2, ciphertext2, plaintext2, nblocks);
-
- if (memcmp (ciphertext2, ciphertext, blocksize * nblocks))
- {
- xfree (mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CTR-%d test failed (ciphertext mismatch, bulk)", cipher,
- blocksize * 8);
-#endif
- return "selftest for CTR failed - see syslog for details";
- }
- if (memcmp(iv2, iv, blocksize))
- {
- xfree (mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CTR-%d test failed (IV mismatch, bulk)", cipher,
- blocksize * 8);
-#endif
- return "selftest for CTR failed - see syslog for details";
- }
-
- /* Test parallelized code paths (check counter overflow handling) */
- for (diff = 0; diff < nblocks; diff++) {
- memset(iv, 0xff, blocksize);
- iv[blocksize-1] -= diff;
- iv[0] = iv[1] = 0;
- iv[2] = 0x07;
-
- for (i = 0; i < blocksize * nblocks; i++)
- plaintext[i] = i;
-
- /* Create CTR ciphertext manually. */
- for (i = 0; i < blocksize * nblocks; i+=blocksize)
- {
- encrypt_one (ctx, &ciphertext[i], iv);
- for (j = 0; j < blocksize; j++)
- ciphertext[i+j] ^= plaintext[i+j];
- for (j = blocksize; j > 0; j--)
- {
- iv[j-1]++;
- if (iv[j-1])
- break;
- }
- }
-
- /* Decrypt using bulk CTR and compare result. */
- memset(iv2, 0xff, blocksize);
- iv2[blocksize-1] -= diff;
- iv2[0] = iv2[1] = 0;
- iv2[2] = 0x07;
-
- bulk_ops.ctr_enc (ctx, iv2, plaintext2, ciphertext, nblocks);
-
- if (memcmp (plaintext2, plaintext, blocksize * nblocks))
- {
- xfree (mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CTR-%d test failed (plaintext mismatch, diff: %d)", cipher,
- blocksize * 8, diff);
-#endif
- return "selftest for CTR failed - see syslog for details";
- }
- if (memcmp(iv2, iv, blocksize))
- {
- xfree (mem);
-#ifdef HAVE_SYSLOG
- syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
- "%s-CTR-%d test failed (IV mismatch, diff: %d)", cipher,
- blocksize * 8, diff);
-#endif
- return "selftest for CTR failed - see syslog for details";
- }
- }
-
- xfree (mem);
- return NULL;
-}
diff --git a/cipher/cipher-selftest.h b/cipher/cipher-selftest.h
deleted file mode 100644
index c3090ad1..00000000
--- a/cipher/cipher-selftest.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* cipher-selftest.h - Helper functions for bulk encryption selftests.
- * Copyright (C) 2013,2020 Jussi Kivilinna <[email protected]>
- *
- * This file is part of Libgcrypt.
- *
- * Libgcrypt is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser general Public License as
- * published by the Free Software Foundation; either version 2.1 of
- * the License, or (at your option) any later version.
- *
- * Libgcrypt is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef G10_SELFTEST_HELP_H
-#define G10_SELFTEST_HELP_H
-
-#include <config.h>
-#include "types.h"
-#include "g10lib.h"
-#include "cipher.h"
-
-typedef void (*gcry_cipher_bulk_cbc_dec_t)(void *context, unsigned char *iv,
- void *outbuf_arg,
- const void *inbuf_arg,
- size_t nblocks);
-
-typedef void (*gcry_cipher_bulk_cfb_dec_t)(void *context, unsigned char *iv,
- void *outbuf_arg,
- const void *inbuf_arg,
- size_t nblocks);
-
-typedef void (*gcry_cipher_bulk_ctr_enc_t)(void *context, unsigned char *iv,
- void *outbuf_arg,
- const void *inbuf_arg,
- size_t nblocks);
-
-/* Helper function to allocate an aligned context for selftests. */
-void *_gcry_cipher_selftest_alloc_ctx (const int context_size,
- unsigned char **r_mem);
-
-
-/* Helper function for bulk CBC decryption selftest */
-const char *
-_gcry_selftest_helper_cbc (const char *cipher, gcry_cipher_setkey_t setkey,
- gcry_cipher_encrypt_t encrypt_one,
- const int nblocks, const int blocksize,
- const int context_size);
-
-/* Helper function for bulk CFB decryption selftest */
-const char *
-_gcry_selftest_helper_cfb (const char *cipher, gcry_cipher_setkey_t setkey,
- gcry_cipher_encrypt_t encrypt_one,
- const int nblocks, const int blocksize,
- const int context_size);
-
-/* Helper function for bulk CTR encryption selftest */
-const char *
-_gcry_selftest_helper_ctr (const char *cipher, gcry_cipher_setkey_t setkey,
- gcry_cipher_encrypt_t encrypt_one,
- const int nblocks, const int blocksize,
- const int context_size);
-
-#endif /*G10_SELFTEST_HELP_H*/
diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S
index 7ac884af..b6cdbb3d 100644
--- a/cipher/crc-armv8-aarch64-ce.S
+++ b/cipher/crc-armv8-aarch64-ce.S
@@ -71,7 +71,7 @@ _gcry_crc32r_armv8_ce_bulk:
*/
CFI_STARTPROC()
- GET_DATA_POINTER(x7, .Lcrc32_constants)
+ GET_LOCAL_POINTER(x7, .Lcrc32_constants)
add x9, x3, #consts_k(5 - 1)
cmp x2, #128
@@ -280,7 +280,7 @@ _gcry_crc32_armv8_ce_bulk:
*/
CFI_STARTPROC()
- GET_DATA_POINTER(x7, .Lcrc32_constants)
+ GET_LOCAL_POINTER(x7, .Lcrc32_constants)
add x4, x7, #.Lcrc32_bswap_shuf - .Lcrc32_constants
cmp x2, #128
ld1 {v7.16b}, [x4]
diff --git a/cipher/des.c b/cipher/des.c
index 51116fcf..7a81697a 100644
--- a/cipher/des.c
+++ b/cipher/des.c
@@ -120,7 +120,6 @@
#include "cipher.h"
#include "bufhelp.h"
#include "cipher-internal.h"
-#include "cipher-selftest.h"
#define DES_BLOCKSIZE 8
@@ -1047,66 +1046,6 @@ is_weak_key ( const byte *key )
}
-/* Alternative setkey for selftests; need larger key than default. */
-static gcry_err_code_t
-bulk_selftest_setkey (void *context, const byte *__key, unsigned __keylen,
- cipher_bulk_ops_t *bulk_ops)
-{
- static const unsigned char key[24] ATTR_ALIGNED_16 = {
- 0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
- 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22,
- 0x18,0x2A,0x39,0x47,0x5E,0x6F,0x75,0x82
- };
-
- (void)__key;
- (void)__keylen;
-
- return do_tripledes_setkey(context, key, sizeof(key), bulk_ops);
-}
-
-
-/* Run the self-tests for DES-CTR, tests IV increment of bulk CTR
- encryption. Returns NULL on success. */
-static const char *
-selftest_ctr (void)
-{
- const int nblocks = 3+1;
- const int blocksize = DES_BLOCKSIZE;
- const int context_size = sizeof(struct _tripledes_ctx);
-
- return _gcry_selftest_helper_ctr("3DES", &bulk_selftest_setkey,
- &do_tripledes_encrypt, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for DES-CBC, tests bulk CBC decryption.
- Returns NULL on success. */
-static const char *
-selftest_cbc (void)
-{
- const int nblocks = 3+2;
- const int blocksize = DES_BLOCKSIZE;
- const int context_size = sizeof(struct _tripledes_ctx);
-
- return _gcry_selftest_helper_cbc("3DES", &bulk_selftest_setkey,
- &do_tripledes_encrypt, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for DES-CFB, tests bulk CBC decryption.
- Returns NULL on success. */
-static const char *
-selftest_cfb (void)
-{
- const int nblocks = 3+2;
- const int blocksize = DES_BLOCKSIZE;
- const int context_size = sizeof(struct _tripledes_ctx);
-
- return _gcry_selftest_helper_cfb("3DES", &bulk_selftest_setkey,
- &do_tripledes_encrypt, nblocks, blocksize, context_size);
-}
-
-
/*
* Performs a selftest of this DES/Triple-DES implementation.
* Returns an string with the error text on failure.
@@ -1115,8 +1054,6 @@ selftest_cfb (void)
static const char *
selftest (void)
{
- const char *r;
-
/*
* Check if 'u32' is really 32 bits wide. This DES / 3DES implementation
* need this.
@@ -1296,15 +1233,6 @@ selftest (void)
return "DES weak key detection failed";
}
- if ( (r = selftest_cbc ()) )
- return r;
-
- if ( (r = selftest_cfb ()) )
- return r;
-
- if ( (r = selftest_ctr ()) )
- return r;
-
return 0;
}
diff --git a/cipher/kdf.c b/cipher/kdf.c
index 0e196432..dc6aaeb7 100644
--- a/cipher/kdf.c
+++ b/cipher/kdf.c
@@ -1382,8 +1382,266 @@ balloon_close (balloon_ctx_t b)
wipememory (b, n);
xfree (b);
}
+
+typedef struct onestep_kdf_context *onestep_kdf_ctx_t;
+
+/* OneStepKDF context */
+struct onestep_kdf_context {
+ int algo;
+ gcry_md_hd_t md;
+ unsigned int blklen;
+ unsigned int outlen;
+ const void *input;
+ size_t inputlen;
+ const void *fixedinfo;
+ size_t fixedinfolen;
+};
+
+static gpg_err_code_t
+onestep_kdf_open (gcry_kdf_hd_t *hd, int hashalgo,
+ const unsigned long *param, unsigned int paramlen,
+ const void *input, size_t inputlen,
+ const void *fixedinfo, size_t fixedinfolen)
+{
+ gpg_err_code_t ec;
+ unsigned int outlen;
+ onestep_kdf_ctx_t o;
+ size_t n;
+
+ if (paramlen != 1)
+ return GPG_ERR_INV_VALUE;
+ else
+ outlen = (unsigned int)param[0];
+
+ n = sizeof (struct onestep_kdf_context);
+ o = xtrymalloc (n);
+ if (!o)
+ return gpg_err_code_from_errno (errno);
+
+ o->blklen = _gcry_md_get_algo_dlen (hashalgo);
+ if (!o->blklen)
+ {
+ xfree (o);
+ return GPG_ERR_DIGEST_ALGO;
+ }
+ ec = _gcry_md_open (&o->md, hashalgo, 0);
+ if (ec)
+ {
+ xfree (o);
+ return ec;
+ }
+ o->algo = GCRY_KDF_ONESTEP_KDF;
+ o->outlen = outlen;
+ o->input = input;
+ o->inputlen = inputlen;
+ o->fixedinfo = fixedinfo;
+ o->fixedinfolen = fixedinfolen;
+
+ *hd = (void *)o;
+ return 0;
+}
+
+
+static gpg_err_code_t
+onestep_kdf_compute (onestep_kdf_ctx_t o, const struct gcry_kdf_thread_ops *ops)
+{
+ (void)o;
+
+ if (ops != NULL)
+ return GPG_ERR_INV_VALUE;
+
+ return 0;
+}
+
+static gpg_err_code_t
+onestep_kdf_final (onestep_kdf_ctx_t o, size_t resultlen, void *result)
+{
+ u32 counter = 0;
+ unsigned char cnt[4];
+ int i;
+
+ if (resultlen != o->outlen)
+ return GPG_ERR_INV_VALUE;
+
+ for (i = 0; i < o->outlen / o->blklen; i++)
+ {
+ counter++;
+ buf_put_be32 (cnt, counter);
+ _gcry_md_write (o->md, cnt, sizeof (cnt));
+ _gcry_md_write (o->md, o->input, o->inputlen);
+ _gcry_md_write (o->md, o->fixedinfo, o->fixedinfolen);
+ _gcry_md_final (o->md);
+ memcpy ((char *)result + o->blklen * i,
+ _gcry_md_read (o->md, 0), o->blklen);
+ resultlen -= o->blklen;
+ _gcry_md_reset (o->md);
+ }
+
+ if (resultlen)
+ {
+ counter++;
+ buf_put_be32 (cnt, counter);
+ _gcry_md_write (o->md, cnt, sizeof (cnt));
+ _gcry_md_write (o->md, o->input, o->inputlen);
+ _gcry_md_write (o->md, o->fixedinfo, o->fixedinfolen);
+ _gcry_md_final (o->md);
+ memcpy ((char *)result + o->blklen * i,
+ _gcry_md_read (o->md, 0), resultlen);
+ }
+
+ return 0;
+}
+
+static void
+onestep_kdf_close (onestep_kdf_ctx_t o)
+{
+ _gcry_md_close (o->md);
+ xfree (o);
+}
+
+typedef struct onestep_kdf_mac_context *onestep_kdf_mac_ctx_t;
+
+/* OneStep_KDF_MAC context */
+struct onestep_kdf_mac_context {
+ int algo;
+ gcry_mac_hd_t md;
+ unsigned int blklen;
+ unsigned int outlen;
+ const void *input;
+ size_t inputlen;
+ const void *salt;
+ size_t saltlen;
+ const void *fixedinfo;
+ size_t fixedinfolen;
+};
+
+static gpg_err_code_t
+onestep_kdf_mac_open (gcry_kdf_hd_t *hd, int macalgo,
+ const unsigned long *param, unsigned int paramlen,
+ const void *input, size_t inputlen,
+ const void *key, size_t keylen,
+ const void *fixedinfo, size_t fixedinfolen)
+{
+ gpg_err_code_t ec;
+ unsigned int outlen;
+ onestep_kdf_mac_ctx_t o;
+ size_t n;
+
+ if (paramlen != 1)
+ return GPG_ERR_INV_VALUE;
+ else
+ outlen = (unsigned int)param[0];
+
+ n = sizeof (struct onestep_kdf_mac_context);
+ o = xtrymalloc (n);
+ if (!o)
+ return gpg_err_code_from_errno (errno);
+ o->blklen = _gcry_mac_get_algo_maclen (macalgo);
+ if (!o->blklen)
+ {
+ xfree (o);
+ return GPG_ERR_MAC_ALGO;
+ }
+ ec = _gcry_mac_open (&o->md, macalgo, 0, NULL);
+ if (ec)
+ {
+ xfree (o);
+ return ec;
+ }
+ o->algo = GCRY_KDF_ONESTEP_KDF_MAC;
+ o->outlen = outlen;
+ o->input = input;
+ o->inputlen = inputlen;
+ o->salt = key;
+ o->saltlen = keylen;
+ o->fixedinfo = fixedinfo;
+ o->fixedinfolen = fixedinfolen;
+
+ *hd = (void *)o;
+ return 0;
+}
+
+static gpg_err_code_t
+onestep_kdf_mac_compute (onestep_kdf_mac_ctx_t o,
+ const struct gcry_kdf_thread_ops *ops)
+{
+ (void)o;
+
+ if (ops != NULL)
+ return GPG_ERR_INV_VALUE;
+
+ return 0;
+}
+
+static gpg_err_code_t
+onestep_kdf_mac_final (onestep_kdf_mac_ctx_t o, size_t resultlen, void *result)
+{
+ u32 counter = 0;
+ unsigned char cnt[4];
+ int i;
+ gcry_err_code_t ec;
+ size_t len = o->blklen;
+
+ if (resultlen != o->outlen)
+ return GPG_ERR_INV_VALUE;
+
+ ec = _gcry_mac_setkey (o->md, o->salt, o->saltlen);
+ if (ec)
+ return ec;
+
+ for (i = 0; i < o->outlen / o->blklen; i++)
+ {
+ counter++;
+ buf_put_be32 (cnt, counter);
+ ec = _gcry_mac_write (o->md, cnt, sizeof (cnt));
+ if (ec)
+ return ec;
+ ec = _gcry_mac_write (o->md, o->input, o->inputlen);
+ if (ec)
+ return ec;
+ ec = _gcry_mac_write (o->md, o->fixedinfo, o->fixedinfolen);
+ if (ec)
+ return ec;
+ ec = _gcry_mac_read (o->md, (char *)result + o->blklen * i, &len);
+ if (ec)
+ return ec;
+ resultlen -= o->blklen;
+ ec = _gcry_mac_ctl (o->md, GCRYCTL_RESET, NULL, 0);
+ if (ec)
+ return ec;
+ }
+
+ if (resultlen)
+ {
+ counter++;
+ len = resultlen;
+ buf_put_be32 (cnt, counter);
+ ec = _gcry_mac_write (o->md, cnt, sizeof (cnt));
+ if (ec)
+ return ec;
+ ec = _gcry_mac_write (o->md, o->input, o->inputlen);
+ if (ec)
+ return ec;
+      ec = _gcry_mac_write (o->md, o->fixedinfo, o->fixedinfolen);
+ if (ec)
+ return ec;
+ ec = _gcry_mac_read (o->md, (char *)result + o->blklen * i, &len);
+ if (ec)
+ return ec;
+ }
+
+ return 0;
+}
+
+static void
+onestep_kdf_mac_close (onestep_kdf_mac_ctx_t o)
+{
+ _gcry_mac_close (o->md);
+ xfree (o);
+}
+
struct gcry_kdf_handle {
int algo;
/* And algo specific parts come. */
@@ -1392,7 +1650,7 @@ struct gcry_kdf_handle {
gpg_err_code_t
_gcry_kdf_open (gcry_kdf_hd_t *hd, int algo, int subalgo,
const unsigned long *param, unsigned int paramlen,
- const void *passphrase, size_t passphraselen,
+ const void *input, size_t inputlen,
const void *salt, size_t saltlen,
const void *key, size_t keylen,
const void *ad, size_t adlen)
@@ -1402,23 +1660,46 @@ _gcry_kdf_open (gcry_kdf_hd_t *hd, int algo, int subalgo,
switch (algo)
{
case GCRY_KDF_ARGON2:
- if (!passphraselen || !saltlen)
+ if (!inputlen || !saltlen)
ec = GPG_ERR_INV_VALUE;
else
ec = argon2_open (hd, subalgo, param, paramlen,
- passphrase, passphraselen, salt, saltlen,
+ input, inputlen, salt, saltlen,
key, keylen, ad, adlen);
break;
case GCRY_KDF_BALLOON:
- if (!passphraselen || !saltlen || keylen || adlen)
+ if (!inputlen || !saltlen || keylen || adlen)
ec = GPG_ERR_INV_VALUE;
else
{
(void)key;
(void)ad;
ec = balloon_open (hd, subalgo, param, paramlen,
- passphrase, passphraselen, salt, saltlen);
+ input, inputlen, salt, saltlen);
+ }
+ break;
+
+ case GCRY_KDF_ONESTEP_KDF:
+ if (!inputlen || !paramlen || !adlen)
+ ec = GPG_ERR_INV_VALUE;
+ else
+ {
+ (void)salt;
+ (void)key;
+ ec = onestep_kdf_open (hd, subalgo, param, paramlen,
+ input, inputlen, ad, adlen);
+ }
+ break;
+
+ case GCRY_KDF_ONESTEP_KDF_MAC:
+ if (!inputlen || !paramlen || !keylen || !adlen)
+ ec = GPG_ERR_INV_VALUE;
+ else
+ {
+ (void)salt;
+ ec = onestep_kdf_mac_open (hd, subalgo, param, paramlen,
+ input, inputlen, key, keylen, ad, adlen);
}
break;
@@ -1445,6 +1726,14 @@ _gcry_kdf_compute (gcry_kdf_hd_t h, const struct gcry_kdf_thread_ops *ops)
ec = balloon_compute_all ((balloon_ctx_t)(void *)h, ops);
break;
+ case GCRY_KDF_ONESTEP_KDF:
+ ec = onestep_kdf_compute ((onestep_kdf_ctx_t)(void *)h, ops);
+ break;
+
+ case GCRY_KDF_ONESTEP_KDF_MAC:
+ ec = onestep_kdf_mac_compute ((onestep_kdf_mac_ctx_t)(void *)h, ops);
+ break;
+
default:
ec = GPG_ERR_UNKNOWN_ALGORITHM;
break;
@@ -1469,6 +1758,15 @@ _gcry_kdf_final (gcry_kdf_hd_t h, size_t resultlen, void *result)
ec = balloon_final ((balloon_ctx_t)(void *)h, resultlen, result);
break;
+ case GCRY_KDF_ONESTEP_KDF:
+ ec = onestep_kdf_final ((onestep_kdf_ctx_t)(void *)h, resultlen, result);
+ break;
+
+ case GCRY_KDF_ONESTEP_KDF_MAC:
+ ec = onestep_kdf_mac_final ((onestep_kdf_mac_ctx_t)(void *)h,
+ resultlen, result);
+ break;
+
default:
ec = GPG_ERR_UNKNOWN_ALGORITHM;
break;
@@ -1490,6 +1788,14 @@ _gcry_kdf_close (gcry_kdf_hd_t h)
balloon_close ((balloon_ctx_t)(void *)h);
break;
+ case GCRY_KDF_ONESTEP_KDF:
+ onestep_kdf_close ((onestep_kdf_ctx_t)(void *)h);
+ break;
+
+ case GCRY_KDF_ONESTEP_KDF_MAC:
+ onestep_kdf_mac_close ((onestep_kdf_mac_ctx_t)(void *)h);
+ break;
+
default:
break;
}
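The two new KDF algorithms plug into the existing gcry_kdf_open()/compute()/final()/close() interface: param[0] carries the requested output length, the passphrase/input slot carries the shared secret Z, the AD slot carries FixedInfo, and for the MAC variant the key slot carries the MAC key. A minimal caller could look like the sketch below; it assumes the GCRY_KDF_ONESTEP_KDF constant added by this patch and abbreviates error handling.

#include <gcrypt.h>
#include <stdio.h>

/* Minimal sketch of driving the new OneStep KDF through the gcry_kdf_*
 * interface.  Assumes the GCRY_KDF_ONESTEP_KDF identifier introduced by
 * this patch; values are illustrative only.  */
int
main (void)
{
  const unsigned char z[16] = "shared-secret-Z";  /* example input Z    */
  const unsigned char fixedinfo[8] = "context";   /* example FixedInfo  */
  unsigned char out[34];
  unsigned long param[1] = { sizeof out };        /* requested length   */
  gcry_kdf_hd_t hd;
  gcry_error_t err;

  err = gcry_kdf_open (&hd, GCRY_KDF_ONESTEP_KDF, GCRY_MD_SHA256,
                       param, 1,
                       z, sizeof z,               /* input Z            */
                       NULL, 0,                   /* no salt            */
                       NULL, 0,                   /* no key             */
                       fixedinfo, sizeof fixedinfo);
  if (err)
    {
      fprintf (stderr, "kdf open: %s\n", gcry_strerror (err));
      return 1;
    }

  err = gcry_kdf_compute (hd, NULL);
  if (!err)
    err = gcry_kdf_final (hd, sizeof out, out);
  gcry_kdf_close (hd);

  if (err)
    fprintf (stderr, "kdf: %s\n", gcry_strerror (err));
  return !!err;
}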
diff --git a/cipher/poly1305-amd64-avx512.S b/cipher/poly1305-amd64-avx512.S
index 48892777..72303e1e 100644
--- a/cipher/poly1305-amd64-avx512.S
+++ b/cipher/poly1305-amd64-avx512.S
@@ -1614,8 +1614,8 @@ _gcry_poly1305_amd64_avx512_blocks:
FUNC_EXIT()
xor eax, eax
- kmovw k1, eax
- kmovw k2, eax
+ kxorw k1, k1, k1
+ kxorw k2, k2, k2
ret_spec_stop
CFI_ENDPROC()
ELF(.size _gcry_poly1305_amd64_avx512_blocks,
diff --git a/cipher/pubkey-util.c b/cipher/pubkey-util.c
index 68defea6..4953caf3 100644
--- a/cipher/pubkey-util.c
+++ b/cipher/pubkey-util.c
@@ -957,7 +957,10 @@ _gcry_pk_util_data_to_mpi (gcry_sexp_t input, gcry_mpi_t *ret_mpi,
void *random_override = NULL;
size_t random_override_len = 0;
- if ( !(value=sexp_nth_data (lvalue, 1, &valuelen)) || !valuelen )
+      /* RSA PKCS#1.5 encryption is no longer supported in FIPS mode.  */
+ if (fips_mode ())
+ rc = GPG_ERR_INV_FLAG;
+ else if ( !(value=sexp_nth_data (lvalue, 1, &valuelen)) || !valuelen )
rc = GPG_ERR_INV_OBJ;
else
{
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index ff6b0b26..156af015 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -27,7 +27,6 @@
#include "g10lib.h"
#include "cipher.h"
#include "bufhelp.h"
-#include "cipher-selftest.h"
#include "rijndael-internal.h"
#include "./cipher-internal.h"
diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c
index b24ae3e9..e53c940e 100644
--- a/cipher/rijndael-armv8-ce.c
+++ b/cipher/rijndael-armv8-ce.c
@@ -27,7 +27,6 @@
#include "g10lib.h"
#include "cipher.h"
#include "bufhelp.h"
-#include "cipher-selftest.h"
#include "rijndael-internal.h"
#include "./cipher-internal.h"
diff --git a/cipher/rijndael-padlock.c b/cipher/rijndael-padlock.c
index 3af214d7..2583b834 100644
--- a/cipher/rijndael-padlock.c
+++ b/cipher/rijndael-padlock.c
@@ -27,7 +27,6 @@
#include "g10lib.h"
#include "cipher.h"
#include "bufhelp.h"
-#include "cipher-selftest.h"
#include "rijndael-internal.h"
#ifdef USE_PADLOCK
diff --git a/cipher/rijndael-ssse3-amd64.c b/cipher/rijndael-ssse3-amd64.c
index b0723853..0f0abf62 100644
--- a/cipher/rijndael-ssse3-amd64.c
+++ b/cipher/rijndael-ssse3-amd64.c
@@ -43,7 +43,6 @@
#include "g10lib.h"
#include "cipher.h"
#include "bufhelp.h"
-#include "cipher-selftest.h"
#include "rijndael-internal.h"
#include "./cipher-internal.h"
diff --git a/cipher/rijndael-vaes.c b/cipher/rijndael-vaes.c
index 0d7d1367..dbcf9afa 100644
--- a/cipher/rijndael-vaes.c
+++ b/cipher/rijndael-vaes.c
@@ -26,7 +26,6 @@
#include "g10lib.h"
#include "cipher.h"
#include "bufhelp.h"
-#include "cipher-selftest.h"
#include "rijndael-internal.h"
#include "./cipher-internal.h"
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 9b96b616..dddcbc54 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -46,7 +46,6 @@
#include "g10lib.h"
#include "cipher.h"
#include "bufhelp.h"
-#include "cipher-selftest.h"
#include "rijndael-internal.h"
#include "./cipher-internal.h"
@@ -1535,7 +1534,7 @@ static const char*
selftest_basic_128 (void)
{
RIJNDAEL_context *ctx;
- unsigned char *ctxmem;
+ unsigned char ctxmem[sizeof(*ctx) + 16];
unsigned char scratch[16];
cipher_bulk_ops_t bulk_ops;
@@ -1579,21 +1578,15 @@ selftest_basic_128 (void)
};
#endif
- /* Because gcc/ld can only align the CTX struct on 8 bytes on the
- stack, we need to allocate that context on the heap. */
- ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem);
- if (!ctx)
- return "failed to allocate memory";
+ ctx = (void *)(ctxmem + ((16 - ((uintptr_t)ctxmem & 15)) & 15));
rijndael_setkey (ctx, key_128, sizeof (key_128), &bulk_ops);
rijndael_encrypt (ctx, scratch, plaintext_128);
if (memcmp (scratch, ciphertext_128, sizeof (ciphertext_128)))
{
- xfree (ctxmem);
return "AES-128 test encryption failed.";
}
rijndael_decrypt (ctx, scratch, scratch);
- xfree (ctxmem);
if (memcmp (scratch, plaintext_128, sizeof (plaintext_128)))
return "AES-128 test decryption failed.";
@@ -1605,7 +1598,7 @@ static const char*
selftest_basic_192 (void)
{
RIJNDAEL_context *ctx;
- unsigned char *ctxmem;
+ unsigned char ctxmem[sizeof(*ctx) + 16];
unsigned char scratch[16];
cipher_bulk_ops_t bulk_ops;
@@ -1626,18 +1619,15 @@ selftest_basic_192 (void)
0x12,0x13,0x1A,0xC7,0xC5,0x47,0x88,0xAA
};
- ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem);
- if (!ctx)
- return "failed to allocate memory";
+ ctx = (void *)(ctxmem + ((16 - ((uintptr_t)ctxmem & 15)) & 15));
+
rijndael_setkey (ctx, key_192, sizeof(key_192), &bulk_ops);
rijndael_encrypt (ctx, scratch, plaintext_192);
if (memcmp (scratch, ciphertext_192, sizeof (ciphertext_192)))
{
- xfree (ctxmem);
return "AES-192 test encryption failed.";
}
rijndael_decrypt (ctx, scratch, scratch);
- xfree (ctxmem);
if (memcmp (scratch, plaintext_192, sizeof (plaintext_192)))
return "AES-192 test decryption failed.";
@@ -1650,7 +1640,7 @@ static const char*
selftest_basic_256 (void)
{
RIJNDAEL_context *ctx;
- unsigned char *ctxmem;
+ unsigned char ctxmem[sizeof(*ctx) + 16];
unsigned char scratch[16];
cipher_bulk_ops_t bulk_ops;
@@ -1672,18 +1662,15 @@ selftest_basic_256 (void)
0x9A,0xCF,0x72,0x80,0x86,0x04,0x0A,0xE3
};
- ctx = _gcry_cipher_selftest_alloc_ctx (sizeof *ctx, &ctxmem);
- if (!ctx)
- return "failed to allocate memory";
+ ctx = (void *)(ctxmem + ((16 - ((uintptr_t)ctxmem & 15)) & 15));
+
rijndael_setkey (ctx, key_256, sizeof(key_256), &bulk_ops);
rijndael_encrypt (ctx, scratch, plaintext_256);
if (memcmp (scratch, ciphertext_256, sizeof (ciphertext_256)))
{
- xfree (ctxmem);
return "AES-256 test encryption failed.";
}
rijndael_decrypt (ctx, scratch, scratch);
- xfree (ctxmem);
if (memcmp (scratch, plaintext_256, sizeof (plaintext_256)))
return "AES-256 test decryption failed.";
@@ -1691,60 +1678,6 @@ selftest_basic_256 (void)
}
-/* Run the self-tests for AES-CTR-128, tests IV increment of bulk CTR
- encryption. Returns NULL on success. */
-static const char*
-selftest_ctr_128 (void)
-{
-#ifdef USE_VAES
- const int nblocks = 16+1;
-#else
- const int nblocks = 8+1;
-#endif
- const int blocksize = BLOCKSIZE;
- const int context_size = sizeof(RIJNDAEL_context);
-
- return _gcry_selftest_helper_ctr("AES", &rijndael_setkey,
- &rijndael_encrypt, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for AES-CBC-128, tests bulk CBC decryption.
- Returns NULL on success. */
-static const char*
-selftest_cbc_128 (void)
-{
-#ifdef USE_VAES
- const int nblocks = 16+2;
-#else
- const int nblocks = 8+2;
-#endif
- const int blocksize = BLOCKSIZE;
- const int context_size = sizeof(RIJNDAEL_context);
-
- return _gcry_selftest_helper_cbc("AES", &rijndael_setkey,
- &rijndael_encrypt, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for AES-CFB-128, tests bulk CFB decryption.
- Returns NULL on success. */
-static const char*
-selftest_cfb_128 (void)
-{
-#ifdef USE_VAES
- const int nblocks = 16+2;
-#else
- const int nblocks = 8+2;
-#endif
- const int blocksize = BLOCKSIZE;
- const int context_size = sizeof(RIJNDAEL_context);
-
- return _gcry_selftest_helper_cfb("AES", &rijndael_setkey,
- &rijndael_encrypt, nblocks, blocksize, context_size);
-}
-
-
/* Run all the self-tests and return NULL on success. This function
is used for the on-the-fly self-tests. */
static const char *
@@ -1757,15 +1690,6 @@ selftest (void)
|| (r = selftest_basic_256 ()) )
return r;
- if ( (r = selftest_ctr_128 ()) )
- return r;
-
- if ( (r = selftest_cbc_128 ()) )
- return r;
-
- if ( (r = selftest_cfb_128 ()) )
- return r;
-
return r;
}
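The selftest contexts above now live on the stack instead of the heap; over-allocating by 16 bytes and adding "(16 - ((uintptr_t)p & 15)) & 15" rounds the pointer up to the next 16-byte boundary while staying inside the buffer. A standalone illustration of the arithmetic (hypothetical helper name):

#include <stdint.h>
#include <assert.h>

/* Round PTR up to the next 16-byte boundary.  With a buffer that is
 * 16 bytes larger than needed, the aligned pointer always stays inside
 * the buffer (sketch of the trick used in the selftests above).  */
static void *
align16 (void *ptr)
{
  uintptr_t p = (uintptr_t)ptr;
  return (void *)(p + ((16 - (p & 15)) & 15));
}

int
main (void)
{
  unsigned char buf[100 + 16];
  unsigned char *aligned = align16 (buf);

  assert (((uintptr_t)aligned & 15) == 0);
  assert (aligned >= buf && aligned + 100 <= buf + sizeof buf);
  return 0;
}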
diff --git a/cipher/rsa.c b/cipher/rsa.c
index 3f1cd722..9f2b36e8 100644
--- a/cipher/rsa.c
+++ b/cipher/rsa.c
@@ -352,13 +352,35 @@ generate_std (RSA_secret_key *sk, unsigned int nbits, unsigned long use_e,
static gpg_err_code_t
rsa_check_keysize (unsigned int nbits)
{
- if (fips_mode() && nbits < 2048)
+ if (fips_mode () && nbits < 2048)
return GPG_ERR_INV_VALUE;
return GPG_ERR_NO_ERROR;
}
+/* Check that the RSA key length is acceptable for signature verification.
+ *
+ * FIPS allows signature verification with RSA keys of size
+ * 1024, 1280, 1536 and 1792 bits in legacy mode, but it is up to the
+ * calling application to decide whether the signature is legacy and
+ * should be accepted.
+ */
+static gpg_err_code_t
+rsa_check_verify_keysize (unsigned int nbits)
+{
+ if (fips_mode ())
+ {
+ if ((nbits >= 1024 && (nbits % 256) == 0) || nbits >= 2048)
+ return GPG_ERR_NO_ERROR;
+
+ return GPG_ERR_INV_VALUE;
+ }
+
+ return GPG_ERR_NO_ERROR;
+}
+
+
/****************
* Generate a key pair with a key of size NBITS.
* USE_E = 0 let Libcgrypt decide what exponent to use.
@@ -476,12 +498,13 @@ generate_fips (RSA_secret_key *sk, unsigned int nbits, unsigned long use_e,
retry:
/* generate p and q */
- for (i = 0; i < 5 * pbits; i++)
+ for (i = 0; i < 10 * pbits; i++)
{
ploop:
if (!testparms)
{
_gcry_mpi_randomize (p, pbits, random_level);
+ mpi_set_bit (p, 0);
}
if (mpi_cmp (p, minp) < 0)
{
@@ -505,15 +528,16 @@ generate_fips (RSA_secret_key *sk, unsigned int nbits, unsigned long use_e,
else if (testparms)
goto err;
}
- if (i >= 5 * pbits)
+ if (i >= 10 * pbits)
goto err;
- for (i = 0; i < 5 * pbits; i++)
+ for (i = 0; i < 20 * pbits; i++)
{
qloop:
if (!testparms)
{
_gcry_mpi_randomize (q, pbits, random_level);
+ mpi_set_bit (q, 0);
}
if (mpi_cmp (q, minp) < 0)
{
@@ -553,7 +577,7 @@ generate_fips (RSA_secret_key *sk, unsigned int nbits, unsigned long use_e,
else if (testparms)
goto err;
}
- if (i >= 5 * pbits)
+ if (i >= 20 * pbits)
goto err;
if (testparms)
@@ -1389,6 +1413,11 @@ rsa_decrypt (gcry_sexp_t *r_plain, gcry_sexp_t s_data, gcry_sexp_t keyparms)
rc = GPG_ERR_INV_DATA;
goto leave;
}
+ if (fips_mode () && (ctx.encoding == PUBKEY_ENC_PKCS1))
+ {
+ rc = GPG_ERR_INV_FLAG;
+ goto leave;
+ }
/* Extract the key. */
rc = sexp_extract_param (keyparms, NULL, "nedp?q?u?",
@@ -1595,7 +1624,7 @@ rsa_verify (gcry_sexp_t s_sig, gcry_sexp_t s_data, gcry_sexp_t keyparms)
gcry_mpi_t result = NULL;
unsigned int nbits = rsa_get_nbits (keyparms);
- rc = rsa_check_keysize (nbits);
+ rc = rsa_check_verify_keysize (nbits);
if (rc)
return rc;
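The new rsa_check_verify_keysize() accepts, in FIPS mode, any modulus of at least 2048 bits plus the legacy sizes on 256-bit steps from 1024 bits upward (1024, 1280, 1536, 1792); anything else is rejected. A small stand-alone check that mirrors the condition:

#include <stdio.h>

/* Mirror of the rsa_check_verify_keysize() condition above, used only to
 * enumerate which key sizes FIPS mode accepts for verification.  */
static int
verify_keysize_ok (unsigned int nbits)
{
  return (nbits >= 1024 && (nbits % 256) == 0) || nbits >= 2048;
}

int
main (void)
{
  unsigned int sizes[] = { 512, 1024, 1280, 1536, 1792, 2000, 2048, 3072 };
  size_t i;

  /* Prints: 512 and 2000 rejected; all other listed sizes accepted.  */
  for (i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
    printf ("%4u bits: %s\n", sizes[i],
            verify_keysize_ok (sizes[i]) ? "accepted" : "rejected");
  return 0;
}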
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 159d889f..11eeb079 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -30,7 +30,7 @@
#include "bithelp.h"
#include "bufhelp.h"
#include "cipher-internal.h"
-#include "cipher-selftest.h"
+#include "bulkhelp.h"
/* USE_SSE2 indicates whether to compile with AMD64 SSE2 code. */
@@ -1272,27 +1272,11 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
{
int did_use_avx2 = 0;
u64 Ls[16];
- unsigned int n = 16 - (blkn % 16);
u64 *l;
- int i;
if (nblocks >= 16)
{
- for (i = 0; i < 16; i += 8)
- {
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- }
-
- Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- l = &Ls[(15 + n) % 16];
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
/* Process data in 16 block chunks. */
while (nblocks >= 16)
@@ -1329,21 +1313,11 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
{
int did_use_sse2 = 0;
u64 Ls[8];
- unsigned int n = 8 - (blkn % 8);
u64 *l;
if (nblocks >= 8)
{
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- l = &Ls[(7 + n) % 8];
+ l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
/* Process data in 8 block chunks. */
while (nblocks >= 8)
@@ -1380,33 +1354,25 @@ _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
if (ctx->use_neon)
{
int did_use_neon = 0;
- const void *Ls[8];
- unsigned int n = 8 - (blkn % 8);
- const void **l;
+ uintptr_t Ls[8];
+ uintptr_t *l;
if (nblocks >= 8)
{
- Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
- Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
- Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
- Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
- Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
- Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
- Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
- l = &Ls[(7 + n) % 8];
+ l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
/* Process data in 8 block chunks. */
while (nblocks >= 8)
{
blkn += 8;
- *l = ocb_get_l(c, blkn - blkn % 8);
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);
if (encrypt)
_gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
+ c->u_ctr.ctr, (void **)Ls);
else
_gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
- c->u_ctr.ctr, Ls);
+ c->u_ctr.ctr, (void **)Ls);
nblocks -= 8;
outbuf += 8 * sizeof(serpent_block_t);
@@ -1456,27 +1422,11 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
{
int did_use_avx2 = 0;
u64 Ls[16];
- unsigned int n = 16 - (blkn % 16);
u64 *l;
- int i;
if (nblocks >= 16)
{
- for (i = 0; i < 16; i += 8)
- {
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- }
-
- Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- l = &Ls[(15 + n) % 16];
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
/* Process data in 16 block chunks. */
while (nblocks >= 16)
@@ -1508,21 +1458,11 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
{
int did_use_sse2 = 0;
u64 Ls[8];
- unsigned int n = 8 - (blkn % 8);
u64 *l;
if (nblocks >= 8)
{
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- l = &Ls[(7 + n) % 8];
+ l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
/* Process data in 8 block chunks. */
while (nblocks >= 8)
@@ -1554,29 +1494,21 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
if (ctx->use_neon)
{
int did_use_neon = 0;
- const void *Ls[8];
- unsigned int n = 8 - (blkn % 8);
- const void **l;
+ uintptr_t Ls[8];
+ uintptr_t *l;
if (nblocks >= 8)
{
- Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
- Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
- Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
- Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
- Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
- Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
- Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
- l = &Ls[(7 + n) % 8];
+ l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
/* Process data in 8 block chunks. */
while (nblocks >= 8)
{
blkn += 8;
- *l = ocb_get_l(c, blkn - blkn % 8);
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);
_gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
- c->u_mode.ocb.aad_sum, Ls);
+ c->u_mode.ocb.aad_sum, (void **)Ls);
nblocks -= 8;
abuf += 8 * sizeof(serpent_block_t);
@@ -1607,48 +1539,6 @@ _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
-/* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR
- encryption. Returns NULL on success. */
-static const char*
-selftest_ctr_128 (void)
-{
- const int nblocks = 16+8+1;
- const int blocksize = sizeof(serpent_block_t);
- const int context_size = sizeof(serpent_context_t);
-
- return _gcry_selftest_helper_ctr("SERPENT", &serpent_setkey,
- &serpent_encrypt, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption.
- Returns NULL on success. */
-static const char*
-selftest_cbc_128 (void)
-{
- const int nblocks = 16+8+2;
- const int blocksize = sizeof(serpent_block_t);
- const int context_size = sizeof(serpent_context_t);
-
- return _gcry_selftest_helper_cbc("SERPENT", &serpent_setkey,
- &serpent_encrypt, nblocks, blocksize, context_size);
-}
-
-
-/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption.
- Returns NULL on success. */
-static const char*
-selftest_cfb_128 (void)
-{
- const int nblocks = 16+8+2;
- const int blocksize = sizeof(serpent_block_t);
- const int context_size = sizeof(serpent_context_t);
-
- return _gcry_selftest_helper_cfb("SERPENT", &serpent_setkey,
- &serpent_encrypt, nblocks, blocksize, context_size);
-}
-
-
/* Serpent test. */
static const char *
@@ -1657,7 +1547,6 @@ serpent_test (void)
serpent_context_t context;
unsigned char scratch[16];
unsigned int i;
- const char *r;
static struct test
{
@@ -1729,15 +1618,6 @@ serpent_test (void)
}
}
- if ( (r = selftest_ctr_128 ()) )
- return r;
-
- if ( (r = selftest_cbc_128 ()) )
- return r;
-
- if ( (r = selftest_cfb_128 ()) )
- return r;
-
return NULL;
}
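For readers without the new bulkhelp.h at hand: below is a minimal editorial sketch of what the blk8 helper presumably does, reconstructed from the inline code removed above; the exact name, signature and placement in bulkhelp.h are assumptions, not quoted from the patch.

/* Editorial sketch, not part of this patch.  Pointers are stored as u64 so
 * that x32 builds still hand 64-bit values to the assembly code. */
static inline u64 *
sketch_ocb_prepare_L_pointers_array_blk8 (gcry_cipher_hd_t c, u64 Ls[8],
                                          u64 blkn)
{
  unsigned int n = 8 - (unsigned int)(blkn % 8);

  Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
  Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
  Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
  Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
  Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
  Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
  Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
  /* Slot (7 + n) % 8 is filled per 8-block chunk with ocb_get_l() by the
   * caller, as seen in the loops above. */
  return &Ls[(7 + n) % 8];
}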
diff --git a/cipher/sha1-armv8-aarch64-ce.S b/cipher/sha1-armv8-aarch64-ce.S
index ea26564b..f95717ee 100644
--- a/cipher/sha1-armv8-aarch64-ce.S
+++ b/cipher/sha1-armv8-aarch64-ce.S
@@ -109,7 +109,7 @@ _gcry_sha1_transform_armv8_ce:
cbz x2, .Ldo_nothing;
- GET_DATA_POINTER(x4, .LK_VEC);
+ GET_LOCAL_POINTER(x4, .LK_VEC);
ld1 {vH0123.4s}, [x0] /* load h0,h1,h2,h3 */
ld1 {vK1.4s-vK4.4s}, [x4] /* load K1,K2,K3,K4 */
diff --git a/cipher/sha256-armv8-aarch64-ce.S b/cipher/sha256-armv8-aarch64-ce.S
index d0fa6285..5616eada 100644
--- a/cipher/sha256-armv8-aarch64-ce.S
+++ b/cipher/sha256-armv8-aarch64-ce.S
@@ -119,7 +119,7 @@ _gcry_sha256_transform_armv8_ce:
cbz x2, .Ldo_nothing;
- GET_DATA_POINTER(x3, .LK);
+ GET_LOCAL_POINTER(x3, .LK);
mov x4, x3
ld1 {vH0123.4s-vH4567.4s}, [x0] /* load state */
diff --git a/cipher/sha512-avx512-amd64.S b/cipher/sha512-avx512-amd64.S
index c0fdbc33..0e3f44ab 100644
--- a/cipher/sha512-avx512-amd64.S
+++ b/cipher/sha512-avx512-amd64.S
@@ -375,7 +375,7 @@ _gcry_sha512_transform_amd64_avx512:
addm([8*5 + CTX],f)
addm([8*6 + CTX],g)
addm([8*7 + CTX],h)
- kmovd MASK_DC_00, eax
+ kxord MASK_DC_00, MASK_DC_00, MASK_DC_00
vzeroall
vmovdqa [rsp + frame_XFER + 0*32], ymm0 /* burn stack */
diff --git a/cipher/sm3-aarch64.S b/cipher/sm3-aarch64.S
index 3fb89006..0e58254b 100644
--- a/cipher/sm3-aarch64.S
+++ b/cipher/sm3-aarch64.S
@@ -425,7 +425,7 @@ _gcry_sm3_transform_aarch64:
CFI_DEF_CFA_REGISTER(RFRAME);
sub addr0, sp, #STACK_SIZE;
- GET_DATA_POINTER(RKPTR, .LKtable);
+ GET_LOCAL_POINTER(RKPTR, .LKtable);
and sp, addr0, #(~63);
/* Preload first block. */
diff --git a/cipher/sm3-armv8-aarch64-ce.S b/cipher/sm3-armv8-aarch64-ce.S
index 0900b84f..d592d08a 100644
--- a/cipher/sm3-armv8-aarch64-ce.S
+++ b/cipher/sm3-armv8-aarch64-ce.S
@@ -170,7 +170,7 @@ _gcry_sm3_transform_armv8_ce:
ext CTX2.16b, CTX2.16b, CTX2.16b, #8;
.Lloop:
- GET_DATA_POINTER(x3, .Lsm3_Ktable);
+ GET_LOCAL_POINTER(x3, .Lsm3_Ktable);
ld1 {v0.16b-v3.16b}, [x1], #64;
sub x2, x2, #1;
diff --git a/cipher/sm4-aarch64.S b/cipher/sm4-aarch64.S
index 306b425e..8d06991b 100644
--- a/cipher/sm4-aarch64.S
+++ b/cipher/sm4-aarch64.S
@@ -84,7 +84,7 @@ ELF(.size _gcry_sm4_aarch64_consts,.-_gcry_sm4_aarch64_consts)
/* Helper macros. */
#define preload_sbox(ptr) \
- GET_DATA_POINTER(ptr, .Lsm4_sbox); \
+ GET_LOCAL_POINTER(ptr, .Lsm4_sbox); \
ld1 {v16.16b-v19.16b}, [ptr], #64; \
ld1 {v20.16b-v23.16b}, [ptr], #64; \
ld1 {v24.16b-v27.16b}, [ptr], #64; \
diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S
index 7a8b9558..e09fed8f 100644
--- a/cipher/sm4-aesni-avx2-amd64.S
+++ b/cipher/sm4-aesni-avx2-amd64.S
@@ -1,6 +1,6 @@
/* sm4-avx2-amd64.S - AVX2 implementation of SM4 cipher
*
- * Copyright (C) 2020 Jussi Kivilinna <[email protected]>
+ * Copyright (C) 2020, 2022 Jussi Kivilinna <[email protected]>
*
* This file is part of Libgcrypt.
*
@@ -45,11 +45,19 @@
#define RA1 %ymm9
#define RA2 %ymm10
#define RA3 %ymm11
+#define RA0x %xmm8
+#define RA1x %xmm9
+#define RA2x %xmm10
+#define RA3x %xmm11
#define RB0 %ymm12
#define RB1 %ymm13
#define RB2 %ymm14
#define RB3 %ymm15
+#define RB0x %xmm12
+#define RB1x %xmm13
+#define RB2x %xmm14
+#define RB3x %xmm15
#define RNOT %ymm0
#define RBSWAP %ymm1
@@ -252,14 +260,14 @@ __sm4_crypt_blk16:
leaq (32*4)(%rdi), %rax;
.align 16
-.Lroundloop_blk8:
+.Lroundloop_blk16:
ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
leaq (4*4)(%rdi), %rdi;
cmpq %rax, %rdi;
- jne .Lroundloop_blk8;
+ jne .Lroundloop_blk16;
#undef ROUND
@@ -280,6 +288,66 @@ __sm4_crypt_blk16:
CFI_ENDPROC();
ELF(.size __sm4_crypt_blk16,.-__sm4_crypt_blk16;)
+.align 8
+.globl _gcry_sm4_aesni_avx2_crypt_blk1_16
+ELF(.type _gcry_sm4_aesni_avx2_crypt_blk1_16,@function;)
+_gcry_sm4_aesni_avx2_crypt_blk1_16:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..16 blocks)
+ * %rdx: src (1..16 blocks)
+ * %rcx: num blocks (1..16)
+ */
+ CFI_STARTPROC();
+
+#define LOAD_INPUT(offset, yreg) \
+ cmpq $(1 + 2 * (offset)), %rcx; \
+ jb .Lblk16_load_input_done; \
+ ja 1f; \
+ vmovdqu (offset) * 32(%rdx), yreg##x; \
+ jmp .Lblk16_load_input_done; \
+ 1: \
+ vmovdqu (offset) * 32(%rdx), yreg;
+
+ LOAD_INPUT(0, RA0);
+ LOAD_INPUT(1, RA1);
+ LOAD_INPUT(2, RA2);
+ LOAD_INPUT(3, RA3);
+ LOAD_INPUT(4, RB0);
+ LOAD_INPUT(5, RB1);
+ LOAD_INPUT(6, RB2);
+ LOAD_INPUT(7, RB3);
+#undef LOAD_INPUT
+
+.Lblk16_load_input_done:
+ call __sm4_crypt_blk16;
+
+#define STORE_OUTPUT(yreg, offset) \
+ cmpq $(1 + 2 * (offset)), %rcx; \
+ jb .Lblk16_store_output_done; \
+ ja 1f; \
+ vmovdqu yreg##x, (offset) * 32(%rsi); \
+ jmp .Lblk16_store_output_done; \
+ 1: \
+ vmovdqu yreg, (offset) * 32(%rsi);
+
+ STORE_OUTPUT(RA0, 0);
+ STORE_OUTPUT(RA1, 1);
+ STORE_OUTPUT(RA2, 2);
+ STORE_OUTPUT(RA3, 3);
+ STORE_OUTPUT(RB0, 4);
+ STORE_OUTPUT(RB1, 5);
+ STORE_OUTPUT(RB2, 6);
+ STORE_OUTPUT(RB3, 7);
+#undef STORE_OUTPUT
+
+.Lblk16_store_output_done:
+ vzeroall;
+ xorl %eax, %eax;
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_aesni_avx2_crypt_blk1_16,.-_gcry_sm4_aesni_avx2_crypt_blk1_16;)
+
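The cmpq/jb/ja cascade in LOAD_INPUT and STORE_OUTPUT above may be easier to follow in C.  An editorial sketch (not part of the patch) of the per-slot decision, where ymm slot `offset` covers blocks 2*offset and 2*offset+1:

enum slot_action { SLOT_SKIP, SLOT_HALF, SLOT_FULL };

/* Editorial sketch: how much of ymm slot `offset` to load/store when
 * crypting num_blks (1..16) blocks. */
static enum slot_action
slot_action_for (unsigned int num_blks, unsigned int offset)
{
  if (num_blks < 1 + 2 * offset)   /* jb taken: nothing left for this slot */
    return SLOT_SKIP;
  if (num_blks == 1 + 2 * offset)  /* ja not taken: exactly one block left */
    return SLOT_HALF;              /* vmovdqu with the xmm alias, 16 bytes */
  return SLOT_FULL;                /* full ymm load/store, 32 bytes */
}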
#define inc_le128(x, minus_one, tmp) \
vpcmpeqq minus_one, x, tmp; \
vpsubq minus_one, x, x; \
@@ -301,8 +369,6 @@ _gcry_sm4_aesni_avx2_ctr_enc:
movq 8(%rcx), %rax;
bswapq %rax;
- vzeroupper;
-
vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
vpcmpeqd RNOT, RNOT, RNOT;
vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
@@ -410,8 +476,6 @@ _gcry_sm4_aesni_avx2_cbc_dec:
*/
CFI_STARTPROC();
- vzeroupper;
-
vmovdqu (0 * 32)(%rdx), RA0;
vmovdqu (1 * 32)(%rdx), RA1;
vmovdqu (2 * 32)(%rdx), RA2;
@@ -463,8 +527,6 @@ _gcry_sm4_aesni_avx2_cfb_dec:
*/
CFI_STARTPROC();
- vzeroupper;
-
/* Load input */
vmovdqu (%rcx), RNOTx;
vinserti128 $1, (%rdx), RNOT, RA0;
@@ -521,8 +583,6 @@ _gcry_sm4_aesni_avx2_ocb_enc:
*/
CFI_STARTPROC();
- vzeroupper;
-
subq $(4 * 8), %rsp;
CFI_ADJUST_CFA_OFFSET(4 * 8);
@@ -635,8 +695,6 @@ _gcry_sm4_aesni_avx2_ocb_dec:
*/
CFI_STARTPROC();
- vzeroupper;
-
subq $(4 * 8), %rsp;
CFI_ADJUST_CFA_OFFSET(4 * 8);
@@ -758,8 +816,6 @@ _gcry_sm4_aesni_avx2_ocb_auth:
*/
CFI_STARTPROC();
- vzeroupper;
-
subq $(4 * 8), %rsp;
CFI_ADJUST_CFA_OFFSET(4 * 8);
diff --git a/cipher/sm4-gfni-avx2-amd64.S b/cipher/sm4-gfni-avx2-amd64.S
new file mode 100644
index 00000000..4ec0ea39
--- /dev/null
+++ b/cipher/sm4-gfni-avx2-amd64.S
@@ -0,0 +1,1194 @@
+/* sm4-gfni-avx2-amd64.S - GFNI/AVX2 implementation of SM4 cipher
+ *
+ * Copyright (C) 2022 Jussi Kivilinna <[email protected]>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <config.h>
+
+#ifdef __x86_64
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+
+#include "asm-common-amd64.h"
+
+/**********************************************************************
+ helper macros
+ **********************************************************************/
+
+/* Transpose four 32-bit words between 128-bit vectors. */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
+ vpunpckhdq x1, x0, t2; \
+ vpunpckldq x1, x0, x0; \
+ \
+ vpunpckldq x3, x2, t1; \
+ vpunpckhdq x3, x2, x2; \
+ \
+ vpunpckhqdq t1, x0, x1; \
+ vpunpcklqdq t1, x0, x0; \
+ \
+ vpunpckhqdq x2, t2, x3; \
+ vpunpcklqdq x2, t2, x2;
+
+/**********************************************************************
+ 4-way && 8-way SM4 with GFNI and AVX2
+ **********************************************************************/
+
+/* vector registers */
+#define RX0 %ymm0
+#define RX1 %ymm1
+#define RX0x %xmm0
+#define RX1x %xmm1
+
+#define RTMP0 %ymm2
+#define RTMP1 %ymm3
+#define RTMP2 %ymm4
+#define RTMP3 %ymm5
+#define RTMP4 %ymm6
+#define RTMP0x %xmm2
+#define RTMP1x %xmm3
+#define RTMP2x %xmm4
+#define RTMP3x %xmm5
+#define RTMP4x %xmm6
+
+#define RNOT %ymm7
+#define RNOTx %xmm7
+
+#define RA0 %ymm8
+#define RA1 %ymm9
+#define RA2 %ymm10
+#define RA3 %ymm11
+#define RA0x %xmm8
+#define RA1x %xmm9
+#define RA2x %xmm10
+#define RA3x %xmm11
+
+#define RB0 %ymm12
+#define RB1 %ymm13
+#define RB2 %ymm14
+#define RB3 %ymm15
+#define RB0x %xmm12
+#define RB1x %xmm13
+#define RB2x %xmm14
+#define RB3x %xmm15
+
+.text
+.align 32
+
+/* Affine transform, SM4 field to AES field */
+.Lpre_affine_s:
+ .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34
+ .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34
+ .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34
+ .byte 0x52, 0xbc, 0x2d, 0x02, 0x9e, 0x25, 0xac, 0x34
+
+/* Affine transform, AES field to SM4 field */
+.Lpost_affine_s:
+ .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7
+ .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7
+ .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7
+ .byte 0x19, 0x8b, 0x6c, 0x1e, 0x51, 0x8e, 0x2d, 0xd7
+
+/* Rotate left by 8 bits on 32-bit words with vpshufb */
+.Lrol_8:
+ .byte 0x03, 0x00, 0x01, 0x02, 0x07, 0x04, 0x05, 0x06
+ .byte 0x0b, 0x08, 0x09, 0x0a, 0x0f, 0x0c, 0x0d, 0x0e
+ .byte 0x03, 0x00, 0x01, 0x02, 0x07, 0x04, 0x05, 0x06
+ .byte 0x0b, 0x08, 0x09, 0x0a, 0x0f, 0x0c, 0x0d, 0x0e
+
+/* Rotate left by 16 bits on 32-bit words with vpshufb */
+.Lrol_16:
+ .byte 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05
+ .byte 0x0a, 0x0b, 0x08, 0x09, 0x0e, 0x0f, 0x0c, 0x0d
+ .byte 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05
+ .byte 0x0a, 0x0b, 0x08, 0x09, 0x0e, 0x0f, 0x0c, 0x0d
+
+/* Rotate left by 24 bits on 32-bit words with vpshufb */
+.Lrol_24:
+ .byte 0x01, 0x02, 0x03, 0x00, 0x05, 0x06, 0x07, 0x04
+ .byte 0x09, 0x0a, 0x0b, 0x08, 0x0d, 0x0e, 0x0f, 0x0c
+ .byte 0x01, 0x02, 0x03, 0x00, 0x05, 0x06, 0x07, 0x04
+ .byte 0x09, 0x0a, 0x0b, 0x08, 0x0d, 0x0e, 0x0f, 0x0c
+
+/* For CTR-mode IV byteswap */
+.Lbswap128_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/* For input word byte-swap */
+.Lbswap32_mask:
+ .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+
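Editorial note on the constants above (an interpretation, not patch text): the GFNI path computes the SM4 s-box without lookup tables.  vgf2p8affineqb applies the .Lpre_affine_s bit-matrix and XORs the immediate 0x65 to map each byte from the SM4 field into the AES field; vgf2p8affineinvqb then inverts the byte in GF(2^8) with the AES reduction polynomial, applies .Lpost_affine_s and XORs 0xd3 to map back.  Roughly:

  sbox(x) = M_post * inv_AES(M_pre * x ^ 0x65) ^ 0xd3

with the matrix products taken bitwise over GF(2).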
+.align 8
+.globl _gcry_sm4_gfni_avx2_expand_key
+ELF(.type _gcry_sm4_gfni_avx2_expand_key,@function;)
+_gcry_sm4_gfni_avx2_expand_key:
+ /* input:
+ * %rdi: 128-bit key
+ * %rsi: rkey_enc
+ * %rdx: rkey_dec
+ * %rcx: fk array
+ * %r8: ck array
+ */
+ CFI_STARTPROC();
+
+ vmovd 0*4(%rdi), RA0x;
+ vmovd 1*4(%rdi), RA1x;
+ vmovd 2*4(%rdi), RA2x;
+ vmovd 3*4(%rdi), RA3x;
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2x;
+ vpshufb RTMP2x, RA0x, RA0x;
+ vpshufb RTMP2x, RA1x, RA1x;
+ vpshufb RTMP2x, RA2x, RA2x;
+ vpshufb RTMP2x, RA3x, RA3x;
+
+ vmovd 0*4(%rcx), RB0x;
+ vmovd 1*4(%rcx), RB1x;
+ vmovd 2*4(%rcx), RB2x;
+ vmovd 3*4(%rcx), RB3x;
+ vpxor RB0x, RA0x, RA0x;
+ vpxor RB1x, RA1x, RA1x;
+ vpxor RB2x, RA2x, RA2x;
+ vpxor RB3x, RA3x, RA3x;
+
+#define ROUND(round, s0, s1, s2, s3) \
+ vpbroadcastd (4*(round))(%r8), RX0x; \
+ vpxor s1, RX0x, RX0x; \
+ vpxor s2, RX0x, RX0x; \
+ vpxor s3, RX0x, RX0x; /* s1 ^ s2 ^ s3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ vgf2p8affineqb $0x65, .Lpre_affine_s rRIP, RX0x, RX0x; \
+ vgf2p8affineinvqb $0xd3, .Lpost_affine_s rRIP, RX0x, RX0x; \
+ \
+ /* linear part */ \
+ vpxor RX0x, s0, s0; /* s0 ^ x */ \
+ vpslld $13, RX0x, RTMP0x; \
+ vpsrld $19, RX0x, RTMP1x; \
+ vpslld $23, RX0x, RTMP2x; \
+ vpsrld $9, RX0x, RTMP3x; \
+ vpxor RTMP0x, RTMP1x, RTMP1x; \
+ vpxor RTMP2x, RTMP3x, RTMP3x; \
+ vpxor RTMP1x, s0, s0; /* s0 ^ x ^ rol(x,13) */ \
+ vpxor RTMP3x, s0, s0; /* s0 ^ x ^ rol(x,13) ^ rol(x,23) */
+
+ leaq (32*4)(%r8), %rax;
+ leaq (32*4)(%rdx), %rdx;
+.align 16
+.Lroundloop_expand_key:
+ leaq (-4*4)(%rdx), %rdx;
+ ROUND(0, RA0x, RA1x, RA2x, RA3x);
+ ROUND(1, RA1x, RA2x, RA3x, RA0x);
+ ROUND(2, RA2x, RA3x, RA0x, RA1x);
+ ROUND(3, RA3x, RA0x, RA1x, RA2x);
+ leaq (4*4)(%r8), %r8;
+ vmovd RA0x, (0*4)(%rsi);
+ vmovd RA1x, (1*4)(%rsi);
+ vmovd RA2x, (2*4)(%rsi);
+ vmovd RA3x, (3*4)(%rsi);
+ vmovd RA0x, (3*4)(%rdx);
+ vmovd RA1x, (2*4)(%rdx);
+ vmovd RA2x, (1*4)(%rdx);
+ vmovd RA3x, (0*4)(%rdx);
+ leaq (4*4)(%rsi), %rsi;
+ cmpq %rax, %r8;
+ jne .Lroundloop_expand_key;
+
+#undef ROUND
+
+ vzeroall;
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_expand_key,.-_gcry_sm4_gfni_avx2_expand_key;)
+
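In the expand-key loop above, each freshly derived round key is written twice: forward into rkey_enc through %rsi and mirrored into rkey_dec through the decrementing %rdx, so decryption can reuse the same round loop.  An editorial C sketch of just the storing step (assuming libgcrypt's u32 and an already-derived key array; the helper name is hypothetical):

static void
sketch_store_round_keys (u32 rkey_enc[32], u32 rkey_dec[32], const u32 rk[32])
{
  int i;

  for (i = 0; i < 32; i++)
    {
      rkey_enc[i] = rk[i];        /* forward order, used for encryption */
      rkey_dec[31 - i] = rk[i];   /* reversed order, used for decryption */
    }
}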
+.align 8
+ELF(.type sm4_gfni_avx2_crypt_blk1_4,@function;)
+sm4_gfni_avx2_crypt_blk1_4:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..4 blocks)
+ * %rdx: src (1..4 blocks)
+ * %rcx: num blocks (1..4)
+ */
+ CFI_STARTPROC();
+
+ vmovdqu 0*16(%rdx), RA0x;
+ vmovdqa RA0x, RA1x;
+ vmovdqa RA0x, RA2x;
+ vmovdqa RA0x, RA3x;
+ cmpq $2, %rcx;
+ jb .Lblk4_load_input_done;
+ vmovdqu 1*16(%rdx), RA1x;
+ je .Lblk4_load_input_done;
+ vmovdqu 2*16(%rdx), RA2x;
+ cmpq $3, %rcx;
+ je .Lblk4_load_input_done;
+ vmovdqu 3*16(%rdx), RA3x;
+
+.Lblk4_load_input_done:
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2x;
+ vpshufb RTMP2x, RA0x, RA0x;
+ vpshufb RTMP2x, RA1x, RA1x;
+ vpshufb RTMP2x, RA2x, RA2x;
+ vpshufb RTMP2x, RA3x, RA3x;
+
+ vmovdqa .Lrol_8 rRIP, RTMP2x;
+ vmovdqa .Lrol_16 rRIP, RTMP3x;
+ vmovdqa .Lrol_24 rRIP, RB3x;
+ transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x);
+
+#define ROUND(round, s0, s1, s2, s3) \
+ vpbroadcastd (4*(round))(%rdi), RX0x; \
+ vpxor s1, RX0x, RX0x; \
+ vpxor s2, RX0x, RX0x; \
+ vpxor s3, RX0x, RX0x; /* s1 ^ s2 ^ s3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ vgf2p8affineqb $0x65, .Lpre_affine_s rRIP, RX0x, RX0x; \
+ vgf2p8affineinvqb $0xd3, .Lpost_affine_s rRIP, RX0x, RX0x; \
+ \
+ /* linear part */ \
+ vpxor RX0x, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP2x, RX0x, RTMP1x; \
+ vpxor RTMP1x, RX0x, RTMP0x; /* x ^ rol(x,8) */ \
+ vpshufb RTMP3x, RX0x, RTMP1x; \
+ vpxor RTMP1x, RTMP0x, RTMP0x; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RB3x, RX0x, RTMP1x; \
+ vpxor RTMP1x, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0x, RTMP1x; \
+ vpsrld $30, RTMP0x, RTMP0x; \
+ vpxor RTMP0x, s0, s0; \
+ vpxor RTMP1x, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk4:
+ ROUND(0, RA0x, RA1x, RA2x, RA3x);
+ ROUND(1, RA1x, RA2x, RA3x, RA0x);
+ ROUND(2, RA2x, RA3x, RA0x, RA1x);
+ ROUND(3, RA3x, RA0x, RA1x, RA2x);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk4;
+
+#undef ROUND
+
+ vmovdqa .Lbswap128_mask rRIP, RTMP2x;
+
+ transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x);
+ vpshufb RTMP2x, RA0x, RA0x;
+ vpshufb RTMP2x, RA1x, RA1x;
+ vpshufb RTMP2x, RA2x, RA2x;
+ vpshufb RTMP2x, RA3x, RA3x;
+
+ vmovdqu RA0x, 0*16(%rsi);
+ cmpq $2, %rcx;
+ jb .Lblk4_store_output_done;
+ vmovdqu RA1x, 1*16(%rsi);
+ je .Lblk4_store_output_done;
+ vmovdqu RA2x, 2*16(%rsi);
+ cmpq $3, %rcx;
+ je .Lblk4_store_output_done;
+ vmovdqu RA3x, 3*16(%rsi);
+
+.Lblk4_store_output_done:
+ vzeroall;
+ xorl %eax, %eax;
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size sm4_gfni_avx2_crypt_blk1_4,.-sm4_gfni_avx2_crypt_blk1_4;)
+
+.align 8
+ELF(.type __sm4_gfni_crypt_blk8,@function;)
+__sm4_gfni_crypt_blk8:
+ /* input:
+ * %rdi: round key array, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel
+ * ciphertext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext
+ * blocks
+ */
+ CFI_STARTPROC();
+
+ vmovdqa .Lbswap32_mask rRIP, RTMP2x;
+ vpshufb RTMP2x, RA0x, RA0x;
+ vpshufb RTMP2x, RA1x, RA1x;
+ vpshufb RTMP2x, RA2x, RA2x;
+ vpshufb RTMP2x, RA3x, RA3x;
+ vpshufb RTMP2x, RB0x, RB0x;
+ vpshufb RTMP2x, RB1x, RB1x;
+ vpshufb RTMP2x, RB2x, RB2x;
+ vpshufb RTMP2x, RB3x, RB3x;
+
+ transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x);
+ transpose_4x4(RB0x, RB1x, RB2x, RB3x, RTMP0x, RTMP1x);
+
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+ vpbroadcastd (4*(round))(%rdi), RX0x; \
+ vmovdqa .Lpre_affine_s rRIP, RTMP2x; \
+ vmovdqa .Lpost_affine_s rRIP, RTMP3x; \
+ vmovdqa RX0x, RX1x; \
+ vpxor s1, RX0x, RX0x; \
+ vpxor s2, RX0x, RX0x; \
+ vpxor s3, RX0x, RX0x; /* s1 ^ s2 ^ s3 ^ rk */ \
+ vpxor r1, RX1x, RX1x; \
+ vpxor r2, RX1x, RX1x; \
+ vpxor r3, RX1x, RX1x; /* r1 ^ r2 ^ r3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ vmovdqa .Lrol_8 rRIP, RTMP4x; \
+ vgf2p8affineqb $0x65, RTMP2x, RX0x, RX0x; \
+ vgf2p8affineinvqb $0xd3, RTMP3x, RX0x, RX0x; \
+ vgf2p8affineqb $0x65, RTMP2x, RX1x, RX1x; \
+ vgf2p8affineinvqb $0xd3, RTMP3x, RX1x, RX1x; \
+ \
+ /* linear part */ \
+ vpxor RX0x, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP4x, RX0x, RTMP1x; \
+ vpxor RTMP1x, RX0x, RTMP0x; /* x ^ rol(x,8) */ \
+ vpxor RX1x, r0, r0; /* r0 ^ x */ \
+ vpshufb RTMP4x, RX1x, RTMP3x; \
+ vmovdqa .Lrol_16 rRIP, RTMP4x; \
+ vpxor RTMP3x, RX1x, RTMP2x; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4x, RX0x, RTMP1x; \
+ vpxor RTMP1x, RTMP0x, RTMP0x; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4x, RX1x, RTMP3x; \
+ vmovdqa .Lrol_24 rRIP, RTMP4x; \
+ vpxor RTMP3x, RTMP2x, RTMP2x; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4x, RX0x, RTMP1x; \
+ vpxor RTMP1x, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0x, RTMP1x; \
+ vpsrld $30, RTMP0x, RTMP0x; \
+ vpxor RTMP0x, s0, s0; \
+ vpxor RTMP1x, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpshufb RTMP4x, RX1x, RTMP3x; \
+ vpxor RTMP3x, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP2x, RTMP3x; \
+ vpsrld $30, RTMP2x, RTMP2x; \
+ vpxor RTMP2x, r0, r0; \
+ vpxor RTMP3x, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk8:
+ ROUND(0, RA0x, RA1x, RA2x, RA3x, RB0x, RB1x, RB2x, RB3x);
+ ROUND(1, RA1x, RA2x, RA3x, RA0x, RB1x, RB2x, RB3x, RB0x);
+ ROUND(2, RA2x, RA3x, RA0x, RA1x, RB2x, RB3x, RB0x, RB1x);
+ ROUND(3, RA3x, RA0x, RA1x, RA2x, RB3x, RB0x, RB1x, RB2x);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk8;
+
+#undef ROUND
+
+ vmovdqa .Lbswap128_mask rRIP, RTMP2x;
+
+ transpose_4x4(RA0x, RA1x, RA2x, RA3x, RTMP0x, RTMP1x);
+ transpose_4x4(RB0x, RB1x, RB2x, RB3x, RTMP0x, RTMP1x);
+ vpshufb RTMP2x, RA0x, RA0x;
+ vpshufb RTMP2x, RA1x, RA1x;
+ vpshufb RTMP2x, RA2x, RA2x;
+ vpshufb RTMP2x, RA3x, RA3x;
+ vpshufb RTMP2x, RB0x, RB0x;
+ vpshufb RTMP2x, RB1x, RB1x;
+ vpshufb RTMP2x, RB2x, RB2x;
+ vpshufb RTMP2x, RB3x, RB3x;
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size __sm4_gfni_crypt_blk8,.-__sm4_gfni_crypt_blk8;)
+
+.align 8
+ELF(.type _gcry_sm4_gfni_avx2_crypt_blk1_8,@function;)
+_gcry_sm4_gfni_avx2_crypt_blk1_8:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..8 blocks)
+ * %rdx: src (1..8 blocks)
+ * %rcx: num blocks (1..8)
+ */
+ CFI_STARTPROC();
+
+ cmpq $5, %rcx;
+ jb sm4_gfni_avx2_crypt_blk1_4;
+ vmovdqu (0 * 16)(%rdx), RA0x;
+ vmovdqu (1 * 16)(%rdx), RA1x;
+ vmovdqu (2 * 16)(%rdx), RA2x;
+ vmovdqu (3 * 16)(%rdx), RA3x;
+ vmovdqu (4 * 16)(%rdx), RB0x;
+ vmovdqa RB0x, RB1x;
+ vmovdqa RB0x, RB2x;
+ vmovdqa RB0x, RB3x;
+ je .Lblk8_load_input_done;
+ vmovdqu (5 * 16)(%rdx), RB1x;
+ cmpq $7, %rcx;
+ jb .Lblk8_load_input_done;
+ vmovdqu (6 * 16)(%rdx), RB2x;
+ je .Lblk8_load_input_done;
+ vmovdqu (7 * 16)(%rdx), RB3x;
+
+.Lblk8_load_input_done:
+ call __sm4_gfni_crypt_blk8;
+
+ cmpq $6, %rcx;
+ vmovdqu RA0x, (0 * 16)(%rsi);
+ vmovdqu RA1x, (1 * 16)(%rsi);
+ vmovdqu RA2x, (2 * 16)(%rsi);
+ vmovdqu RA3x, (3 * 16)(%rsi);
+ vmovdqu RB0x, (4 * 16)(%rsi);
+ jb .Lblk8_store_output_done;
+ vmovdqu RB1x, (5 * 16)(%rsi);
+ je .Lblk8_store_output_done;
+ vmovdqu RB2x, (6 * 16)(%rsi);
+ cmpq $7, %rcx;
+ je .Lblk8_store_output_done;
+ vmovdqu RB3x, (7 * 16)(%rsi);
+
+.Lblk8_store_output_done:
+ vzeroall;
+ xorl %eax, %eax;
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_crypt_blk1_8,.-_gcry_sm4_gfni_avx2_crypt_blk1_8;)
+
+/**********************************************************************
+ 16-way SM4 with GFNI and AVX2
+ **********************************************************************/
+
+.align 8
+ELF(.type __sm4_gfni_crypt_blk16,@function;)
+__sm4_gfni_crypt_blk16:
+ /* input:
+ * %rdi: ctx, CTX
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * plaintext blocks
+ * output:
+ * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel
+ * ciphertext blocks
+ */
+ CFI_STARTPROC();
+
+ vbroadcasti128 .Lbswap32_mask rRIP, RTMP2;
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+
+#define ROUND(round, s0, s1, s2, s3, r0, r1, r2, r3) \
+ vpbroadcastd (4*(round))(%rdi), RX0; \
+ vbroadcasti128 .Lpre_affine_s rRIP, RTMP2; \
+ vbroadcasti128 .Lpost_affine_s rRIP, RTMP3; \
+ vmovdqa RX0, RX1; \
+ vpxor s1, RX0, RX0; \
+ vpxor s2, RX0, RX0; \
+ vpxor s3, RX0, RX0; /* s1 ^ s2 ^ s3 ^ rk */ \
+ vpxor r1, RX1, RX1; \
+ vpxor r2, RX1, RX1; \
+ vpxor r3, RX1, RX1; /* r1 ^ r2 ^ r3 ^ rk */ \
+ \
+ /* sbox, non-linear part */ \
+ vbroadcasti128 .Lrol_8 rRIP, RTMP4; \
+ vgf2p8affineqb $0x65, RTMP2, RX0, RX0; \
+ vgf2p8affineinvqb $0xd3, RTMP3, RX0, RX0; \
+ vgf2p8affineqb $0x65, RTMP2, RX1, RX1; \
+ vgf2p8affineinvqb $0xd3, RTMP3, RX1, RX1; \
+ \
+ /* linear part */ \
+ vpxor RX0, s0, s0; /* s0 ^ x */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RX0, RTMP0; /* x ^ rol(x,8) */ \
+ vpxor RX1, r0, r0; /* r0 ^ x */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vbroadcasti128 .Lrol_16 rRIP, RTMP4; \
+ vpxor RTMP3, RX1, RTMP2; /* x ^ rol(x,8) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, RTMP0, RTMP0; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vbroadcasti128 .Lrol_24 rRIP, RTMP4; \
+ vpxor RTMP3, RTMP2, RTMP2; /* x ^ rol(x,8) ^ rol(x,16) */ \
+ vpshufb RTMP4, RX0, RTMP1; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP0, RTMP1; \
+ vpsrld $30, RTMP0, RTMP0; \
+ vpxor RTMP0, s0, s0; \
+ vpxor RTMP1, s0, s0; /* s0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */ \
+ vpshufb RTMP4, RX1, RTMP3; \
+ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,24) */ \
+ vpslld $2, RTMP2, RTMP3; \
+ vpsrld $30, RTMP2, RTMP2; \
+ vpxor RTMP2, r0, r0; \
+ vpxor RTMP3, r0, r0; /* r0 ^ x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24) */
+
+ leaq (32*4)(%rdi), %rax;
+.align 16
+.Lroundloop_blk16:
+ ROUND(0, RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3);
+ ROUND(1, RA1, RA2, RA3, RA0, RB1, RB2, RB3, RB0);
+ ROUND(2, RA2, RA3, RA0, RA1, RB2, RB3, RB0, RB1);
+ ROUND(3, RA3, RA0, RA1, RA2, RB3, RB0, RB1, RB2);
+ leaq (4*4)(%rdi), %rdi;
+ cmpq %rax, %rdi;
+ jne .Lroundloop_blk16;
+
+#undef ROUND
+
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP2;
+
+ transpose_4x4(RA0, RA1, RA2, RA3, RTMP0, RTMP1);
+ transpose_4x4(RB0, RB1, RB2, RB3, RTMP0, RTMP1);
+ vpshufb RTMP2, RA0, RA0;
+ vpshufb RTMP2, RA1, RA1;
+ vpshufb RTMP2, RA2, RA2;
+ vpshufb RTMP2, RA3, RA3;
+ vpshufb RTMP2, RB0, RB0;
+ vpshufb RTMP2, RB1, RB1;
+ vpshufb RTMP2, RB2, RB2;
+ vpshufb RTMP2, RB3, RB3;
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size __sm4_gfni_crypt_blk16,.-__sm4_gfni_crypt_blk16;)
+
+.align 8
+.globl _gcry_sm4_gfni_avx2_crypt_blk1_16
+ELF(.type _gcry_sm4_gfni_avx2_crypt_blk1_16,@function;)
+_gcry_sm4_gfni_avx2_crypt_blk1_16:
+ /* input:
+ * %rdi: round key array, CTX
+ * %rsi: dst (1..16 blocks)
+ * %rdx: src (1..16 blocks)
+ * %rcx: num blocks (1..16)
+ */
+ CFI_STARTPROC();
+
+#define LOAD_INPUT(offset, yreg) \
+ cmpq $(1 + 2 * (offset)), %rcx; \
+ jb .Lblk16_load_input_done; \
+ ja 1f; \
+ vmovdqu (offset) * 32(%rdx), yreg##x; \
+ jmp .Lblk16_load_input_done; \
+ 1: \
+ vmovdqu (offset) * 32(%rdx), yreg;
+
+ cmpq $8, %rcx;
+ jbe _gcry_sm4_gfni_avx2_crypt_blk1_8;
+ vmovdqu (0 * 32)(%rdx), RA0;
+ vmovdqu (1 * 32)(%rdx), RA1;
+ vmovdqu (2 * 32)(%rdx), RA2;
+ vmovdqu (3 * 32)(%rdx), RA3;
+ LOAD_INPUT(4, RB0);
+ LOAD_INPUT(5, RB1);
+ LOAD_INPUT(6, RB2);
+ LOAD_INPUT(7, RB3);
+#undef LOAD_INPUT
+
+.Lblk16_load_input_done:
+ call __sm4_gfni_crypt_blk16;
+
+#define STORE_OUTPUT(yreg, offset) \
+ cmpq $(1 + 2 * (offset)), %rcx; \
+ jb .Lblk16_store_output_done; \
+ ja 1f; \
+ vmovdqu yreg##x, (offset) * 32(%rsi); \
+ jmp .Lblk16_store_output_done; \
+ 1: \
+ vmovdqu yreg, (offset) * 32(%rsi);
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ STORE_OUTPUT(RB0, 4);
+ STORE_OUTPUT(RB1, 5);
+ STORE_OUTPUT(RB2, 6);
+ STORE_OUTPUT(RB3, 7);
+#undef STORE_OUTPUT
+
+.Lblk16_store_output_done:
+ vzeroall;
+ xorl %eax, %eax;
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_crypt_blk1_16,.-_gcry_sm4_gfni_avx2_crypt_blk1_16;)
+
+#define inc_le128(x, minus_one, tmp) \
+ vpcmpeqq minus_one, x, tmp; \
+ vpsubq minus_one, x, x; \
+ vpslldq $8, tmp, tmp; \
+ vpsubq tmp, x, x;
+
+.align 8
+.globl _gcry_sm4_gfni_avx2_ctr_enc
+ELF(.type _gcry_sm4_gfni_avx2_ctr_enc,@function;)
+_gcry_sm4_gfni_avx2_ctr_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv (big endian, 128bit)
+ */
+ CFI_STARTPROC();
+
+ movq 8(%rcx), %rax;
+ bswapq %rax;
+
+ vbroadcasti128 .Lbswap128_mask rRIP, RTMP3;
+ vpcmpeqd RNOT, RNOT, RNOT;
+ vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */
+ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */
+
+ /* load IV and byteswap */
+ vmovdqu (%rcx), RTMP4x;
+ vpshufb RTMP3x, RTMP4x, RTMP4x;
+ vmovdqa RTMP4x, RTMP0x;
+ inc_le128(RTMP4x, RNOTx, RTMP1x);
+ vinserti128 $1, RTMP4x, RTMP0, RTMP0;
+ vpshufb RTMP3, RTMP0, RA0; /* +1 ; +0 */
+
+ /* check need for handling 64-bit overflow and carry */
+ cmpq $(0xffffffffffffffff - 16), %rax;
+ ja .Lhandle_ctr_carry;
+
+ /* construct IVs */
+ vpsubq RTMP2, RTMP0, RTMP0; /* +3 ; +2 */
+ vpshufb RTMP3, RTMP0, RA1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +5 ; +4 */
+ vpshufb RTMP3, RTMP0, RA2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +7 ; +6 */
+ vpshufb RTMP3, RTMP0, RA3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +9 ; +8 */
+ vpshufb RTMP3, RTMP0, RB0;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +11 ; +10 */
+ vpshufb RTMP3, RTMP0, RB1;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +13 ; +12 */
+ vpshufb RTMP3, RTMP0, RB2;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +15 ; +14 */
+ vpshufb RTMP3, RTMP0, RB3;
+ vpsubq RTMP2, RTMP0, RTMP0; /* +16 */
+ vpshufb RTMP3x, RTMP0x, RTMP0x;
+
+ jmp .Lctr_carry_done;
+
+.Lhandle_ctr_carry:
+ /* construct IVs */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA1; /* +3 ; +2 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA2; /* +5 ; +4 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RA3; /* +7 ; +6 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB0; /* +9 ; +8 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB1; /* +11 ; +10 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB2; /* +13 ; +12 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vpshufb RTMP3, RTMP0, RB3; /* +15 ; +14 */
+ inc_le128(RTMP0, RNOT, RTMP1);
+ vextracti128 $1, RTMP0, RTMP0x;
+ vpshufb RTMP3x, RTMP0x, RTMP0x; /* +16 */
+
+.align 4
+.Lctr_carry_done:
+ /* store new IV */
+ vmovdqu RTMP0x, (%rcx);
+
+ call __sm4_gfni_crypt_blk16;
+
+ vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (1 * 32)(%rdx), RA1, RA1;
+ vpxor (2 * 32)(%rdx), RA2, RA2;
+ vpxor (3 * 32)(%rdx), RA3, RA3;
+ vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RB2, RB2;
+ vpxor (7 * 32)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_ctr_enc,.-_gcry_sm4_gfni_avx2_ctr_enc;)
+
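The `cmpq $(0xffffffffffffffff - 16), %rax; ja` test above selects between the two IV-construction paths.  An editorial C sketch of that check, assuming %rax holds the byte-swapped low 64 bits of the big-endian counter:

#include <stdint.h>

/* Editorial sketch: the slow carry path is needed only when adding the 16
 * per-block counter increments could overflow the low 64-bit half. */
static int
ctr_needs_carry_path (uint64_t ctr_low64)
{
  return ctr_low64 > UINT64_MAX - 16;
}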
+.align 8
+.globl _gcry_sm4_gfni_avx2_cbc_dec
+ELF(.type _gcry_sm4_gfni_avx2_cbc_dec,@function;)
+_gcry_sm4_gfni_avx2_cbc_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ vmovdqu (0 * 32)(%rdx), RA0;
+ vmovdqu (1 * 32)(%rdx), RA1;
+ vmovdqu (2 * 32)(%rdx), RA2;
+ vmovdqu (3 * 32)(%rdx), RA3;
+ vmovdqu (4 * 32)(%rdx), RB0;
+ vmovdqu (5 * 32)(%rdx), RB1;
+ vmovdqu (6 * 32)(%rdx), RB2;
+ vmovdqu (7 * 32)(%rdx), RB3;
+
+ call __sm4_gfni_crypt_blk16;
+
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RNOT;
+ vpxor RNOT, RA0, RA0;
+ vpxor (0 * 32 + 16)(%rdx), RA1, RA1;
+ vpxor (1 * 32 + 16)(%rdx), RA2, RA2;
+ vpxor (2 * 32 + 16)(%rdx), RA3, RA3;
+ vpxor (3 * 32 + 16)(%rdx), RB0, RB0;
+ vpxor (4 * 32 + 16)(%rdx), RB1, RB1;
+ vpxor (5 * 32 + 16)(%rdx), RB2, RB2;
+ vpxor (6 * 32 + 16)(%rdx), RB3, RB3;
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx); /* store new IV */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_cbc_dec,.-_gcry_sm4_gfni_avx2_cbc_dec;)
+
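A brief editorial note on the XOR pattern in the CBC function above: CBC decryption computes P_i = D_K(C_i) ^ C_{i-1} with C_{-1} = IV.  After the 16-block decryption, the first ymm lane is therefore XORed against [IV, C_0] (old IV loaded from %rcx, C_0 inserted via vinserti128), every following lane against the ciphertext shifted back by one block, which is what the (n * 32 + 16)(%rdx) offsets provide, and the last ciphertext block C_15 is written back to %rcx as the next IV.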
+.align 8
+.globl _gcry_sm4_gfni_avx2_cfb_dec
+ELF(.type _gcry_sm4_gfni_avx2_cfb_dec,@function;)
+_gcry_sm4_gfni_avx2_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+ CFI_STARTPROC();
+
+ /* Load input */
+ vmovdqu (%rcx), RNOTx;
+ vinserti128 $1, (%rdx), RNOT, RA0;
+ vmovdqu (0 * 32 + 16)(%rdx), RA1;
+ vmovdqu (1 * 32 + 16)(%rdx), RA2;
+ vmovdqu (2 * 32 + 16)(%rdx), RA3;
+ vmovdqu (3 * 32 + 16)(%rdx), RB0;
+ vmovdqu (4 * 32 + 16)(%rdx), RB1;
+ vmovdqu (5 * 32 + 16)(%rdx), RB2;
+ vmovdqu (6 * 32 + 16)(%rdx), RB3;
+
+ /* Update IV */
+ vmovdqu (7 * 32 + 16)(%rdx), RNOTx;
+ vmovdqu RNOTx, (%rcx);
+
+ call __sm4_gfni_crypt_blk16;
+
+ vpxor (0 * 32)(%rdx), RA0, RA0;
+ vpxor (1 * 32)(%rdx), RA1, RA1;
+ vpxor (2 * 32)(%rdx), RA2, RA2;
+ vpxor (3 * 32)(%rdx), RA3, RA3;
+ vpxor (4 * 32)(%rdx), RB0, RB0;
+ vpxor (5 * 32)(%rdx), RB1, RB1;
+ vpxor (6 * 32)(%rdx), RB2, RB2;
+ vpxor (7 * 32)(%rdx), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_cfb_dec,.-_gcry_sm4_gfni_avx2_cfb_dec;)
+
+.align 8
+.globl _gcry_sm4_gfni_avx2_ocb_enc
+ELF(.type _gcry_sm4_gfni_avx2_ocb_enc,@function;)
+
+_gcry_sm4_gfni_avx2_ocb_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rcx), RTMP0x;
+ vmovdqu (%r8), RTMP1x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RTMP1, RTMP1; \
+ vpxor yreg, RNOT, yreg; \
+ vmovdqu RNOT, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RA1);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, RA2);
+ OCB_INPUT(3, %r12, %r13, RA3);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, RB0);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, RB2);
+ OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vmovdqu RTMP0x, (%rcx);
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%r8);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __sm4_gfni_crypt_blk16;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vpxor (0 * 32)(%rsi), RA0, RA0;
+ vpxor (1 * 32)(%rsi), RA1, RA1;
+ vpxor (2 * 32)(%rsi), RA2, RA2;
+ vpxor (3 * 32)(%rsi), RA3, RA3;
+ vpxor (4 * 32)(%rsi), RB0, RB0;
+ vpxor (5 * 32)(%rsi), RB1, RB1;
+ vpxor (6 * 32)(%rsi), RB2, RB2;
+ vpxor (7 * 32)(%rsi), RB3, RB3;
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vmovdqu RB3, (7 * 32)(%rsi);
+
+ vzeroall;
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_ocb_enc,.-_gcry_sm4_gfni_avx2_ocb_enc;)
+
+.align 8
+.globl _gcry_sm4_gfni_avx2_ocb_dec
+ELF(.type _gcry_sm4_gfni_avx2_ocb_dec,@function;)
+
+_gcry_sm4_gfni_avx2_ocb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rcx), RTMP0x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RNOT, yreg; \
+ vmovdqu RNOT, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RA1);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, RA2);
+ OCB_INPUT(3, %r12, %r13, RA3);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, RB0);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, RB2);
+ OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0x, (%rcx);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __sm4_gfni_crypt_blk16;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vmovdqu (%r8), RTMP1x;
+
+ vpxor (0 * 32)(%rsi), RA0, RA0;
+ vpxor (1 * 32)(%rsi), RA1, RA1;
+ vpxor (2 * 32)(%rsi), RA2, RA2;
+ vpxor (3 * 32)(%rsi), RA3, RA3;
+ vpxor (4 * 32)(%rsi), RB0, RB0;
+ vpxor (5 * 32)(%rsi), RB1, RB1;
+ vpxor (6 * 32)(%rsi), RB2, RB2;
+ vpxor (7 * 32)(%rsi), RB3, RB3;
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+
+ vmovdqu RA0, (0 * 32)(%rsi);
+ vpxor RA0, RTMP1, RTMP1;
+ vmovdqu RA1, (1 * 32)(%rsi);
+ vpxor RA1, RTMP1, RTMP1;
+ vmovdqu RA2, (2 * 32)(%rsi);
+ vpxor RA2, RTMP1, RTMP1;
+ vmovdqu RA3, (3 * 32)(%rsi);
+ vpxor RA3, RTMP1, RTMP1;
+ vmovdqu RB0, (4 * 32)(%rsi);
+ vpxor RB0, RTMP1, RTMP1;
+ vmovdqu RB1, (5 * 32)(%rsi);
+ vpxor RB1, RTMP1, RTMP1;
+ vmovdqu RB2, (6 * 32)(%rsi);
+ vpxor RB2, RTMP1, RTMP1;
+ vmovdqu RB3, (7 * 32)(%rsi);
+ vpxor RB3, RTMP1, RTMP1;
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%r8);
+
+ vzeroall;
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_ocb_dec,.-_gcry_sm4_gfni_avx2_ocb_dec;)
+
+.align 8
+.globl _gcry_sm4_gfni_avx2_ocb_auth
+ELF(.type _gcry_sm4_gfni_avx2_ocb_auth,@function;)
+
+_gcry_sm4_gfni_avx2_ocb_auth:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: abuf (16 blocks)
+ * %rdx: offset
+ * %rcx: checksum
+ * %r8 : L pointers (void *L[16])
+ */
+ CFI_STARTPROC();
+
+ subq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(4 * 8);
+
+ movq %r10, (0 * 8)(%rsp);
+ movq %r11, (1 * 8)(%rsp);
+ movq %r12, (2 * 8)(%rsp);
+ movq %r13, (3 * 8)(%rsp);
+ CFI_REL_OFFSET(%r10, 0 * 8);
+ CFI_REL_OFFSET(%r11, 1 * 8);
+ CFI_REL_OFFSET(%r12, 2 * 8);
+ CFI_REL_OFFSET(%r13, 3 * 8);
+
+ vmovdqu (%rdx), RTMP0x;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rsi), yreg; \
+ vpxor (l0reg), RTMP0x, RNOTx; \
+ vpxor (l1reg), RNOTx, RTMP0x; \
+ vinserti128 $1, RTMP0x, RNOT, RNOT; \
+ vpxor yreg, RNOT, yreg;
+
+ movq (0 * 8)(%r8), %r10;
+ movq (1 * 8)(%r8), %r11;
+ movq (2 * 8)(%r8), %r12;
+ movq (3 * 8)(%r8), %r13;
+ OCB_INPUT(0, %r10, %r11, RA0);
+ OCB_INPUT(1, %r12, %r13, RA1);
+ movq (4 * 8)(%r8), %r10;
+ movq (5 * 8)(%r8), %r11;
+ movq (6 * 8)(%r8), %r12;
+ movq (7 * 8)(%r8), %r13;
+ OCB_INPUT(2, %r10, %r11, RA2);
+ OCB_INPUT(3, %r12, %r13, RA3);
+ movq (8 * 8)(%r8), %r10;
+ movq (9 * 8)(%r8), %r11;
+ movq (10 * 8)(%r8), %r12;
+ movq (11 * 8)(%r8), %r13;
+ OCB_INPUT(4, %r10, %r11, RB0);
+ OCB_INPUT(5, %r12, %r13, RB1);
+ movq (12 * 8)(%r8), %r10;
+ movq (13 * 8)(%r8), %r11;
+ movq (14 * 8)(%r8), %r12;
+ movq (15 * 8)(%r8), %r13;
+ OCB_INPUT(6, %r10, %r11, RB2);
+ OCB_INPUT(7, %r12, %r13, RB3);
+#undef OCB_INPUT
+
+ vmovdqu RTMP0x, (%rdx);
+
+ movq (0 * 8)(%rsp), %r10;
+ movq (1 * 8)(%rsp), %r11;
+ movq (2 * 8)(%rsp), %r12;
+ movq (3 * 8)(%rsp), %r13;
+ CFI_RESTORE(%r10);
+ CFI_RESTORE(%r11);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+
+ call __sm4_gfni_crypt_blk16;
+
+ addq $(4 * 8), %rsp;
+ CFI_ADJUST_CFA_OFFSET(-4 * 8);
+
+ vpxor RA0, RB0, RA0;
+ vpxor RA1, RB1, RA1;
+ vpxor RA2, RB2, RA2;
+ vpxor RA3, RB3, RA3;
+
+ vpxor RA1, RA0, RA0;
+ vpxor RA3, RA2, RA2;
+
+ vpxor RA2, RA0, RTMP1;
+
+ vextracti128 $1, RTMP1, RNOTx;
+ vpxor (%rcx), RTMP1x, RTMP1x;
+ vpxor RNOTx, RTMP1x, RTMP1x;
+ vmovdqu RTMP1x, (%rcx);
+
+ vzeroall;
+
+ ret_spec_stop;
+ CFI_ENDPROC();
+ELF(.size _gcry_sm4_gfni_avx2_ocb_auth,.-_gcry_sm4_gfni_avx2_ocb_auth;)
+
+#endif /*defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/
+#endif /*__x86_64*/
diff --git a/cipher/sm4.c b/cipher/sm4.c
index 79e6dbf1..5f8bf224 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -1,7 +1,7 @@
/* sm4.c - SM4 Cipher Algorithm
* Copyright (C) 2020 Alibaba Group.
* Copyright (C) 2020 Tianjia Zhang <[email protected]>
- * Copyright (C) 2020 Jussi Kivilinna <[email protected]>
+ * Copyright (C) 2020-2022 Jussi Kivilinna <[email protected]>
*
* This file is part of Libgcrypt.
*
@@ -29,7 +29,7 @@
#include "cipher.h"
#include "bufhelp.h"
#include "cipher-internal.h"
-#include "cipher-selftest.h"
+#include "bulkhelp.h"
/* Helper macro to force alignment to 64 bytes. */
#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
@@ -47,7 +47,7 @@
# endif
#endif
-/* USE_AESNI_AVX inidicates whether to compile with Intel AES-NI/AVX2 code. */
+/* USE_AESNI_AVX2 indicates whether to compile with Intel AES-NI/AVX2 code. */
#undef USE_AESNI_AVX2
#if defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
@@ -56,10 +56,19 @@
# endif
#endif
+/* USE_GFNI_AVX2 indicates whether to compile with Intel GFNI/AVX2 code. */
+#undef USE_GFNI_AVX2
+#if defined(ENABLE_GFNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)
+# if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_GFNI_AVX2 1
+# endif
+#endif
+
/* Assembly implementations use SystemV ABI, ABI conversion and additional
* stack to store XMM6-XMM15 needed on Win64. */
#undef ASM_FUNC_ABI
-#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+#if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2) || defined(USE_GFNI_AVX2)
# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
# define ASM_FUNC_ABI __attribute__((sysv_abi))
# else
@@ -96,6 +105,9 @@ static void _gcry_sm4_cbc_dec (void *context, unsigned char *iv,
static void _gcry_sm4_cfb_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks);
+static void _gcry_sm4_xts_crypt (void *context, unsigned char *tweak,
+ void *outbuf_arg, const void *inbuf_arg,
+ size_t nblocks, int encrypt);
static size_t _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks,
int encrypt);
@@ -112,6 +124,9 @@ typedef struct
#ifdef USE_AESNI_AVX2
unsigned int use_aesni_avx2:1;
#endif
+#ifdef USE_GFNI_AVX2
+ unsigned int use_gfni_avx2:1;
+#endif
#ifdef USE_AARCH64_SIMD
unsigned int use_aarch64_simd:1;
#endif
@@ -120,6 +135,10 @@ typedef struct
#endif
} SM4_context;
+typedef unsigned int (*crypt_blk1_16_fn_t) (const void *ctx, byte *out,
+ const byte *in,
+ unsigned int num_blks);
+
static const u32 fk[4] =
{
0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
@@ -223,9 +242,17 @@ _gcry_sm4_aesni_avx_crypt_blk1_8(const u32 *rk, byte *out, const byte *in,
unsigned int num_blks) ASM_FUNC_ABI;
static inline unsigned int
-sm4_aesni_avx_crypt_blk1_8(const u32 *rk, byte *out, const byte *in,
- unsigned int num_blks)
+sm4_aesni_avx_crypt_blk1_16(const void *rk, byte *out, const byte *in,
+ unsigned int num_blks)
{
+ if (num_blks > 8)
+ {
+ _gcry_sm4_aesni_avx_crypt_blk1_8(rk, out, in, 8);
+ in += 8 * 16;
+ out += 8 * 16;
+ num_blks -= 8;
+ }
+
return _gcry_sm4_aesni_avx_crypt_blk1_8(rk, out, in, num_blks);
}
@@ -263,8 +290,76 @@ extern void _gcry_sm4_aesni_avx2_ocb_auth(const u32 *rk_enc,
unsigned char *offset,
unsigned char *checksum,
const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_aesni_avx2_crypt_blk1_16(const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks) ASM_FUNC_ABI;
+
+static inline unsigned int
+sm4_aesni_avx2_crypt_blk1_16(const void *rk, byte *out, const byte *in,
+ unsigned int num_blks)
+{
+#ifdef USE_AESNI_AVX
+ /* Use 128-bit register implementation for short input. */
+ if (num_blks <= 8)
+ return _gcry_sm4_aesni_avx_crypt_blk1_8(rk, out, in, num_blks);
+#endif
+
+ return _gcry_sm4_aesni_avx2_crypt_blk1_16(rk, out, in, num_blks);
+}
+
#endif /* USE_AESNI_AVX2 */
+#ifdef USE_GFNI_AVX2
+extern void _gcry_sm4_gfni_avx2_expand_key(const byte *key, u32 *rk_enc,
+ u32 *rk_dec, const u32 *fk,
+ const u32 *ck) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_ctr_enc(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *ctr) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_cbc_dec(const u32 *rk_dec, byte *out,
+ const byte *in,
+ byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_cfb_dec(const u32 *rk_enc, byte *out,
+ const byte *in,
+ byte *iv) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_ocb_enc(const u32 *rk_enc,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_ocb_dec(const u32 *rk_dec,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_sm4_gfni_avx2_ocb_auth(const u32 *rk_enc,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const u64 Ls[16]) ASM_FUNC_ABI;
+
+extern unsigned int
+_gcry_sm4_gfni_avx2_crypt_blk1_16(const u32 *rk, byte *out, const byte *in,
+ unsigned int num_blks) ASM_FUNC_ABI;
+
+static inline unsigned int
+sm4_gfni_avx2_crypt_blk1_16(const void *rk, byte *out, const byte *in,
+ unsigned int num_blks)
+{
+ return _gcry_sm4_gfni_avx2_crypt_blk1_16(rk, out, in, num_blks);
+}
+
+#endif /* USE_GFNI_AVX2 */
+
#ifdef USE_AARCH64_SIMD
extern void _gcry_sm4_aarch64_crypt(const u32 *rk, byte *out,
const byte *in,
@@ -290,12 +385,21 @@ extern void _gcry_sm4_aarch64_crypt_blk1_8(const u32 *rk, byte *out,
size_t num_blocks);
static inline unsigned int
-sm4_aarch64_crypt_blk1_8(const u32 *rk, byte *out, const byte *in,
- unsigned int num_blks)
+sm4_aarch64_crypt_blk1_16(const void *rk, byte *out, const byte *in,
+ unsigned int num_blks)
{
- _gcry_sm4_aarch64_crypt_blk1_8(rk, out, in, (size_t)num_blks);
+ if (num_blks > 8)
+ {
+ _gcry_sm4_aarch64_crypt_blk1_8(rk, out, in, 8);
+ in += 8 * 16;
+ out += 8 * 16;
+ num_blks -= 8;
+ }
+
+ _gcry_sm4_aarch64_crypt_blk1_8(rk, out, in, num_blks);
return 0;
}
+
#endif /* USE_AARCH64_SIMD */
#ifdef USE_ARM_CE
@@ -327,12 +431,21 @@ extern void _gcry_sm4_armv8_ce_crypt_blk1_8(const u32 *rk, byte *out,
size_t num_blocks);
static inline unsigned int
-sm4_armv8_ce_crypt_blk1_8(const u32 *rk, byte *out, const byte *in,
- unsigned int num_blks)
+sm4_armv8_ce_crypt_blk1_16(const void *rk, byte *out, const byte *in,
+ unsigned int num_blks)
{
- _gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, (size_t)num_blks);
+ if (num_blks > 8)
+ {
+ _gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, 8);
+ in += 8 * 16;
+ out += 8 * 16;
+ num_blks -= 8;
+ }
+
+ _gcry_sm4_armv8_ce_crypt_blk1_8(rk, out, in, num_blks);
return 0;
}
+
#endif /* USE_ARM_CE */
static inline void prefetch_sbox_table(void)
@@ -403,6 +516,15 @@ sm4_expand_key (SM4_context *ctx, const byte *key)
u32 rk[4];
int i;
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ {
+ _gcry_sm4_gfni_avx2_expand_key (key, ctx->rkey_enc, ctx->rkey_dec,
+ fk, ck);
+ return;
+ }
+#endif
+
#ifdef USE_AESNI_AVX
if (ctx->use_aesni_avx)
{
@@ -475,6 +597,9 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
#ifdef USE_AESNI_AVX2
ctx->use_aesni_avx2 = (hwf & HWF_INTEL_AESNI) && (hwf & HWF_INTEL_AVX2);
#endif
+#ifdef USE_GFNI_AVX2
+ ctx->use_gfni_avx2 = (hwf & HWF_INTEL_GFNI) && (hwf & HWF_INTEL_AVX2);
+#endif
#ifdef USE_AARCH64_SIMD
ctx->use_aarch64_simd = !!(hwf & HWF_ARM_NEON);
#endif
@@ -482,11 +607,25 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
ctx->use_arm_ce = !!(hwf & HWF_ARM_SM4);
#endif
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ {
+ /* Disable AESNI implementations when GFNI implementation is enabled. */
+#ifdef USE_AESNI_AVX
+ ctx->use_aesni_avx = 0;
+#endif
+#ifdef USE_AESNI_AVX2
+ ctx->use_aesni_avx2 = 0;
+#endif
+ }
+#endif
+
/* Setup bulk encryption routines. */
memset (bulk_ops, 0, sizeof(*bulk_ops));
bulk_ops->cbc_dec = _gcry_sm4_cbc_dec;
bulk_ops->cfb_dec = _gcry_sm4_cfb_dec;
bulk_ops->ctr_enc = _gcry_sm4_ctr_enc;
+ bulk_ops->xts_crypt = _gcry_sm4_xts_crypt;
bulk_ops->ocb_crypt = _gcry_sm4_ocb_crypt;
bulk_ops->ocb_auth = _gcry_sm4_ocb_auth;
@@ -526,9 +665,14 @@ sm4_encrypt (void *context, byte *outbuf, const byte *inbuf)
{
SM4_context *ctx = context;
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ return sm4_gfni_avx2_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1);
+#endif
+
#ifdef USE_ARM_CE
if (ctx->use_arm_ce)
- return sm4_armv8_ce_crypt_blk1_8(ctx->rkey_enc, outbuf, inbuf, 1);
+ return sm4_armv8_ce_crypt_blk1_16(ctx->rkey_enc, outbuf, inbuf, 1);
#endif
prefetch_sbox_table ();
@@ -541,9 +685,14 @@ sm4_decrypt (void *context, byte *outbuf, const byte *inbuf)
{
SM4_context *ctx = context;
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ return sm4_gfni_avx2_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1);
+#endif
+
#ifdef USE_ARM_CE
if (ctx->use_arm_ce)
- return sm4_armv8_ce_crypt_blk1_8(ctx->rkey_dec, outbuf, inbuf, 1);
+ return sm4_armv8_ce_crypt_blk1_16(ctx->rkey_dec, outbuf, inbuf, 1);
#endif
prefetch_sbox_table ();
@@ -600,9 +749,10 @@ sm4_do_crypt_blks2 (const u32 *rk, byte *out, const byte *in)
}
static unsigned int
-sm4_crypt_blocks (const u32 *rk, byte *out, const byte *in,
+sm4_crypt_blocks (const void *ctx, byte *out, const byte *in,
unsigned int num_blks)
{
+ const u32 *rk = ctx;
unsigned int burn_depth = 0;
unsigned int nburn;
@@ -629,6 +779,48 @@ sm4_crypt_blocks (const u32 *rk, byte *out, const byte *in,
return burn_depth;
}
+static inline crypt_blk1_16_fn_t
+sm4_get_crypt_blk1_16_fn(SM4_context *ctx)
+{
+ if (0)
+ ;
+#ifdef USE_GFNI_AVX2
+ else if (ctx->use_gfni_avx2)
+ {
+ return &sm4_gfni_avx2_crypt_blk1_16;
+ }
+#endif
+#ifdef USE_AESNI_AVX2
+ else if (ctx->use_aesni_avx2)
+ {
+ return &sm4_aesni_avx2_crypt_blk1_16;
+ }
+#endif
+#ifdef USE_AESNI_AVX
+ else if (ctx->use_aesni_avx)
+ {
+ return &sm4_aesni_avx_crypt_blk1_16;
+ }
+#endif
+#ifdef USE_ARM_CE
+ else if (ctx->use_arm_ce)
+ {
+ return &sm4_armv8_ce_crypt_blk1_16;
+ }
+#endif
+#ifdef USE_AARCH64_SIMD
+ else if (ctx->use_aarch64_simd)
+ {
+ return &sm4_aarch64_crypt_blk1_16;
+ }
+#endif
+ else
+ {
+ prefetch_sbox_table ();
+ return &sm4_crypt_blocks;
+ }
+}
+
/* Bulk encryption of complete blocks in CTR mode. This function is only
intended for the bulk encryption feature of cipher.c. CTR is expected to be
of size 16. */
@@ -642,6 +834,21 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
const byte *inbuf = inbuf_arg;
int burn_stack_depth = 0;
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ {
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_sm4_gfni_avx2_ctr_enc(ctx->rkey_enc, outbuf, inbuf, ctr);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
{
@@ -709,66 +916,15 @@ _gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
/* Process remaining blocks. */
if (nblocks)
{
- unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
- unsigned int num_blks);
- byte tmpbuf[16 * 8];
+ crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ byte tmpbuf[16 * 16];
unsigned int tmp_used = 16;
+ size_t nburn;
- if (0)
- ;
-#ifdef USE_AESNI_AVX
- else if (ctx->use_aesni_avx)
- {
- crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
- }
-#endif
-#ifdef USE_ARM_CE
- else if (ctx->use_arm_ce)
- {
- crypt_blk1_8 = sm4_armv8_ce_crypt_blk1_8;
- }
-#endif
-#ifdef USE_AARCH64_SIMD
- else if (ctx->use_aarch64_simd)
- {
- crypt_blk1_8 = sm4_aarch64_crypt_blk1_8;
- }
-#endif
- else
- {
- prefetch_sbox_table ();
- crypt_blk1_8 = sm4_crypt_blocks;
- }
-
- /* Process remaining blocks. */
- while (nblocks)
- {
- size_t curr_blks = nblocks > 8 ? 8 : nblocks;
- size_t i;
-
- if (curr_blks * 16 > tmp_used)
- tmp_used = curr_blks * 16;
-
- cipher_block_cpy (tmpbuf + 0 * 16, ctr, 16);
- for (i = 1; i < curr_blks; i++)
- {
- cipher_block_cpy (&tmpbuf[i * 16], ctr, 16);
- cipher_block_add (&tmpbuf[i * 16], i, 16);
- }
- cipher_block_add (ctr, curr_blks, 16);
-
- burn_stack_depth = crypt_blk1_8 (ctx->rkey_enc, tmpbuf, tmpbuf,
- curr_blks);
-
- for (i = 0; i < curr_blks; i++)
- {
- cipher_block_xor (outbuf, &tmpbuf[i * 16], inbuf, 16);
- outbuf += 16;
- inbuf += 16;
- }
-
- nblocks -= curr_blks;
- }
+ nburn = bulk_ctr_enc_128(ctx->rkey_enc, crypt_blk1_16, outbuf, inbuf,
+ nblocks, ctr, tmpbuf, sizeof(tmpbuf) / 16,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
wipememory(tmpbuf, tmp_used);
}
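For context, an editorial sketch of what the new bulk_ctr_enc_128() helper from bulkhelp.h presumably does, reconstructed from the inline loop removed above; the real signature and parameter names may differ:

static unsigned int
sketch_bulk_ctr_enc_128 (const void *rk, crypt_blk1_16_fn_t crypt_blk1_16,
                         byte *outbuf, const byte *inbuf, size_t nblocks,
                         byte *ctr, byte *tmpbuf, size_t tmpbuf_nblocks,
                         unsigned int *num_used_tmpbytes)
{
  unsigned int burn_depth = 0;

  while (nblocks)
    {
      size_t curr_blks = nblocks > tmpbuf_nblocks ? tmpbuf_nblocks : nblocks;
      unsigned int nburn;
      size_t i;

      if (curr_blks * 16 > *num_used_tmpbytes)
        *num_used_tmpbytes = curr_blks * 16;

      /* Build the counter blocks for this chunk and step the counter. */
      cipher_block_cpy (&tmpbuf[0 * 16], ctr, 16);
      for (i = 1; i < curr_blks; i++)
        {
          cipher_block_cpy (&tmpbuf[i * 16], ctr, 16);
          cipher_block_add (&tmpbuf[i * 16], i, 16);
        }
      cipher_block_add (ctr, curr_blks, 16);

      /* Encrypt the counters in place, then XOR against the input. */
      nburn = crypt_blk1_16 (rk, tmpbuf, tmpbuf, curr_blks);
      burn_depth = nburn > burn_depth ? nburn : burn_depth;

      for (i = 0; i < curr_blks; i++)
        {
          cipher_block_xor (outbuf, &tmpbuf[i * 16], inbuf, 16);
          outbuf += 16;
          inbuf += 16;
        }

      nblocks -= curr_blks;
    }

  return burn_depth;
}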
@@ -789,6 +945,21 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
const unsigned char *inbuf = inbuf_arg;
int burn_stack_depth = 0;
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ {
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_sm4_gfni_avx2_cbc_dec(ctx->rkey_dec, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
{
@@ -856,61 +1027,17 @@ _gcry_sm4_cbc_dec(void *context, unsigned char *iv,
/* Process remaining blocks. */
if (nblocks)
{
- unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
- unsigned int num_blks);
- unsigned char savebuf[16 * 8];
+ crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
+ size_t nburn;
- if (0)
- ;
-#ifdef USE_AESNI_AVX
- else if (ctx->use_aesni_avx)
- {
- crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
- }
-#endif
-#ifdef USE_ARM_CE
- else if (ctx->use_arm_ce)
- {
- crypt_blk1_8 = sm4_armv8_ce_crypt_blk1_8;
- }
-#endif
-#ifdef USE_AARCH64_SIMD
- else if (ctx->use_aarch64_simd)
- {
- crypt_blk1_8 = sm4_aarch64_crypt_blk1_8;
- }
-#endif
- else
- {
- prefetch_sbox_table ();
- crypt_blk1_8 = sm4_crypt_blocks;
- }
+ nburn = bulk_cbc_dec_128(ctx->rkey_dec, crypt_blk1_16, outbuf, inbuf,
+ nblocks, iv, tmpbuf, sizeof(tmpbuf) / 16,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
- /* Process remaining blocks. */
- while (nblocks)
- {
- size_t curr_blks = nblocks > 8 ? 8 : nblocks;
- size_t i;
-
- if (curr_blks * 16 > tmp_used)
- tmp_used = curr_blks * 16;
-
- burn_stack_depth = crypt_blk1_8 (ctx->rkey_dec, savebuf, inbuf,
- curr_blks);
-
- for (i = 0; i < curr_blks; i++)
- {
- cipher_block_xor_n_copy_2(outbuf, &savebuf[i * 16], iv, inbuf,
- 16);
- outbuf += 16;
- inbuf += 16;
- }
-
- nblocks -= curr_blks;
- }
-
- wipememory(savebuf, tmp_used);
+ wipememory(tmpbuf, tmp_used);
}
if (burn_stack_depth)
@@ -929,6 +1056,21 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
const unsigned char *inbuf = inbuf_arg;
int burn_stack_depth = 0;
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
+ {
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_sm4_gfni_avx2_cfb_dec(ctx->rkey_enc, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
+ }
+ }
+#endif
+
#ifdef USE_AESNI_AVX2
if (ctx->use_aesni_avx2)
{
@@ -996,65 +1138,48 @@ _gcry_sm4_cfb_dec(void *context, unsigned char *iv,
/* Process remaining blocks. */
if (nblocks)
{
- unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
- unsigned int num_blks);
- unsigned char ivbuf[16 * 8];
+ crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
+ size_t nburn;
- if (0)
- ;
-#ifdef USE_AESNI_AVX
- else if (ctx->use_aesni_avx)
- {
- crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
- }
-#endif
-#ifdef USE_ARM_CE
- else if (ctx->use_arm_ce)
- {
- crypt_blk1_8 = sm4_armv8_ce_crypt_blk1_8;
- }
-#endif
-#ifdef USE_AARCH64_SIMD
- else if (ctx->use_aarch64_simd)
- {
- crypt_blk1_8 = sm4_aarch64_crypt_blk1_8;
- }
-#endif
- else
- {
- prefetch_sbox_table ();
- crypt_blk1_8 = sm4_crypt_blocks;
- }
-
- /* Process remaining blocks. */
- while (nblocks)
- {
- size_t curr_blks = nblocks > 8 ? 8 : nblocks;
- size_t i;
+ nburn = bulk_cfb_dec_128(ctx->rkey_enc, crypt_blk1_16, outbuf, inbuf,
+ nblocks, iv, tmpbuf, sizeof(tmpbuf) / 16,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
- if (curr_blks * 16 > tmp_used)
- tmp_used = curr_blks * 16;
+ wipememory(tmpbuf, tmp_used);
+ }
- cipher_block_cpy (&ivbuf[0 * 16], iv, 16);
- for (i = 1; i < curr_blks; i++)
- cipher_block_cpy (&ivbuf[i * 16], &inbuf[(i - 1) * 16], 16);
- cipher_block_cpy (iv, &inbuf[(i - 1) * 16], 16);
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
+}
- burn_stack_depth = crypt_blk1_8 (ctx->rkey_enc, ivbuf, ivbuf,
- curr_blks);
+/* Bulk encryption/decryption of complete blocks in XTS mode. */
+static void
+_gcry_sm4_xts_crypt (void *context, unsigned char *tweak, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+ SM4_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 0;
- for (i = 0; i < curr_blks; i++)
- {
- cipher_block_xor (outbuf, inbuf, &ivbuf[i * 16], 16);
- outbuf += 16;
- inbuf += 16;
- }
+ /* Process remaining blocks. */
+ if (nblocks)
+ {
+ crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
+ unsigned char tmpbuf[16 * 16];
+ unsigned int tmp_used = 16;
+ size_t nburn;
- nblocks -= curr_blks;
- }
+ nburn = bulk_xts_crypt_128(rk, crypt_blk1_16, outbuf, inbuf, nblocks,
+ tweak, tmpbuf, sizeof(tmpbuf) / 16,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
- wipememory(ivbuf, tmp_used);
+ wipememory(tmpbuf, tmp_used);
}
if (burn_stack_depth)
@@ -1072,31 +1197,46 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
u64 blkn = c->u_mode.ocb.data_nblocks;
int burn_stack_depth = 0;
-#ifdef USE_AESNI_AVX2
- if (ctx->use_aesni_avx2)
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
{
u64 Ls[16];
- unsigned int n = 16 - (blkn % 16);
u64 *l;
- int i;
if (nblocks >= 16)
{
- for (i = 0; i < 16; i += 8)
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
{
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ if (encrypt)
+ _gcry_sm4_gfni_avx2_ocb_enc(ctx->rkey_enc, outbuf, inbuf,
+ c->u_iv.iv, c->u_ctr.ctr, Ls);
+ else
+ _gcry_sm4_gfni_avx2_ocb_dec(ctx->rkey_dec, outbuf, inbuf,
+ c->u_iv.iv, c->u_ctr.ctr, Ls);
+
+ nblocks -= 16;
+ outbuf += 16 * 16;
+ inbuf += 16 * 16;
}
+ }
+ }
+#endif
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ u64 Ls[16];
+ u64 *l;
- Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- l = &Ls[(15 + n) % 16];
+ if (nblocks >= 16)
+ {
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
/* Process data in 16 block chunks. */
while (nblocks >= 16)
@@ -1123,22 +1263,11 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
if (ctx->use_aesni_avx)
{
u64 Ls[8];
- unsigned int n = 8 - (blkn % 8);
u64 *l;
if (nblocks >= 8)
{
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(7 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- l = &Ls[(7 + n) % 8];
+ l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
/* Process data in 8 block chunks. */
while (nblocks >= 8)
@@ -1161,78 +1290,19 @@ _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
}
#endif
+ /* Process remaining blocks. */
if (nblocks)
{
- unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
- unsigned int num_blks);
- const u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
- unsigned char tmpbuf[16 * 8];
+ crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
+ unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
+ size_t nburn;
- if (0)
- ;
-#ifdef USE_AESNI_AVX
- else if (ctx->use_aesni_avx)
- {
- crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
- }
-#endif
-#ifdef USE_ARM_CE
- else if (ctx->use_arm_ce)
- {
- crypt_blk1_8 = sm4_armv8_ce_crypt_blk1_8;
- }
-#endif
-#ifdef USE_AARCH64_SIMD
- else if (ctx->use_aarch64_simd)
- {
- crypt_blk1_8 = sm4_aarch64_crypt_blk1_8;
- }
-#endif
- else
- {
- prefetch_sbox_table ();
- crypt_blk1_8 = sm4_crypt_blocks;
- }
-
- while (nblocks)
- {
- size_t curr_blks = nblocks > 8 ? 8 : nblocks;
- size_t i;
-
- if (curr_blks * 16 > tmp_used)
- tmp_used = curr_blks * 16;
-
- for (i = 0; i < curr_blks; i++)
- {
- const unsigned char *l = ocb_get_l(c, ++blkn);
-
- /* Checksum_i = Checksum_{i-1} xor P_i */
- if (encrypt)
- cipher_block_xor_1(c->u_ctr.ctr, &inbuf[i * 16], 16);
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- cipher_block_xor_2dst (&tmpbuf[i * 16], c->u_iv.iv, l, 16);
- cipher_block_xor (&outbuf[i * 16], &inbuf[i * 16],
- c->u_iv.iv, 16);
- }
-
- /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
- crypt_blk1_8 (rk, outbuf, outbuf, curr_blks);
-
- for (i = 0; i < curr_blks; i++)
- {
- cipher_block_xor_1 (&outbuf[i * 16], &tmpbuf[i * 16], 16);
-
- /* Checksum_i = Checksum_{i-1} xor P_i */
- if (!encrypt)
- cipher_block_xor_1(c->u_ctr.ctr, &outbuf[i * 16], 16);
- }
-
- outbuf += curr_blks * 16;
- inbuf += curr_blks * 16;
- nblocks -= curr_blks;
- }
+ nburn = bulk_ocb_crypt_128 (c, rk, crypt_blk1_16, outbuf, inbuf, nblocks,
+ &blkn, encrypt, tmpbuf, sizeof(tmpbuf) / 16,
+ &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
wipememory(tmpbuf, tmp_used);
}
@@ -1252,32 +1322,44 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
SM4_context *ctx = (void *)&c->context.c;
const unsigned char *abuf = abuf_arg;
u64 blkn = c->u_mode.ocb.aad_nblocks;
+ int burn_stack_depth = 0;
-#ifdef USE_AESNI_AVX2
- if (ctx->use_aesni_avx2)
+#ifdef USE_GFNI_AVX2
+ if (ctx->use_gfni_avx2)
{
u64 Ls[16];
- unsigned int n = 16 - (blkn % 16);
u64 *l;
- int i;
if (nblocks >= 16)
{
- for (i = 0; i < 16; i += 8)
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
{
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
+ blkn += 16;
+ *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);
+
+ _gcry_sm4_gfni_avx2_ocb_auth(ctx->rkey_enc, abuf,
+ c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 16;
+ abuf += 16 * 16;
}
+ }
+ }
+#endif
- Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- l = &Ls[(15 + n) % 16];
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ u64 Ls[16];
+ u64 *l;
+
+ if (nblocks >= 16)
+ {
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
/* Process data in 16 block chunks. */
while (nblocks >= 16)
@@ -1300,22 +1382,11 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
if (ctx->use_aesni_avx)
{
u64 Ls[8];
- unsigned int n = 8 - (blkn % 8);
u64 *l;
if (nblocks >= 8)
{
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(7 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- l = &Ls[(7 + n) % 8];
+ l = bulk_ocb_prepare_L_pointers_array_blk8 (c, Ls, blkn);
/* Process data in 8 block chunks. */
while (nblocks >= 8)
@@ -1334,114 +1405,27 @@ _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
}
#endif
+ /* Process remaining blocks. */
if (nblocks)
{
- unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
- unsigned int num_blks);
- unsigned char tmpbuf[16 * 8];
+ crypt_blk1_16_fn_t crypt_blk1_16 = sm4_get_crypt_blk1_16_fn(ctx);
+ unsigned char tmpbuf[16 * 16];
unsigned int tmp_used = 16;
+ size_t nburn;
- if (0)
- ;
-#ifdef USE_AESNI_AVX
- else if (ctx->use_aesni_avx)
- {
- crypt_blk1_8 = sm4_aesni_avx_crypt_blk1_8;
- }
-#endif
-#ifdef USE_ARM_CE
- else if (ctx->use_arm_ce)
- {
- crypt_blk1_8 = sm4_armv8_ce_crypt_blk1_8;
- }
-#endif
-#ifdef USE_AARCH64_SIMD
- else if (ctx->use_aarch64_simd)
- {
- crypt_blk1_8 = sm4_aarch64_crypt_blk1_8;
- }
-#endif
- else
- {
- prefetch_sbox_table ();
- crypt_blk1_8 = sm4_crypt_blocks;
- }
-
- while (nblocks)
- {
- size_t curr_blks = nblocks > 8 ? 8 : nblocks;
- size_t i;
-
- if (curr_blks * 16 > tmp_used)
- tmp_used = curr_blks * 16;
-
- for (i = 0; i < curr_blks; i++)
- {
- const unsigned char *l = ocb_get_l(c, ++blkn);
-
- /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
- cipher_block_xor_2dst (&tmpbuf[i * 16],
- c->u_mode.ocb.aad_offset, l, 16);
- cipher_block_xor_1 (&tmpbuf[i * 16], &abuf[i * 16], 16);
- }
-
- /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
- crypt_blk1_8 (ctx->rkey_enc, tmpbuf, tmpbuf, curr_blks);
-
- for (i = 0; i < curr_blks; i++)
- {
- cipher_block_xor_1 (c->u_mode.ocb.aad_sum, &tmpbuf[i * 16], 16);
- }
-
- abuf += curr_blks * 16;
- nblocks -= curr_blks;
- }
+ nburn = bulk_ocb_auth_128 (c, ctx->rkey_enc, crypt_blk1_16, abuf, nblocks,
+ &blkn, tmpbuf, sizeof(tmpbuf) / 16, &tmp_used);
+ burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
wipememory(tmpbuf, tmp_used);
}
c->u_mode.ocb.aad_nblocks = blkn;
- return 0;
-}
-
-/* Run the self-tests for SM4-CTR, tests IV increment of bulk CTR
- encryption. Returns NULL on success. */
-static const char*
-selftest_ctr_128 (void)
-{
- const int nblocks = 16 - 1;
- const int blocksize = 16;
- const int context_size = sizeof(SM4_context);
-
- return _gcry_selftest_helper_ctr("SM4", &sm4_setkey,
- &sm4_encrypt, nblocks, blocksize, context_size);
-}
-
-/* Run the self-tests for SM4-CBC, tests bulk CBC decryption.
- Returns NULL on success. */
-static const char*
-selftest_cbc_128 (void)
-{
- const int nblocks = 16 - 1;
- const int blocksize = 16;
- const int context_size = sizeof(SM4_context);
-
- return _gcry_selftest_helper_cbc("SM4", &sm4_setkey,
- &sm4_encrypt, nblocks, blocksize, context_size);
-}
-
-/* Run the self-tests for SM4-CFB, tests bulk CFB decryption.
- Returns NULL on success. */
-static const char*
-selftest_cfb_128 (void)
-{
- const int nblocks = 16 - 1;
- const int blocksize = 16;
- const int context_size = sizeof(SM4_context);
+ if (burn_stack_depth)
+ _gcry_burn_stack(burn_stack_depth);
- return _gcry_selftest_helper_cfb("SM4", &sm4_setkey,
- &sm4_encrypt, nblocks, blocksize, context_size);
+ return 0;
}
static const char *
@@ -1449,7 +1433,6 @@ sm4_selftest (void)
{
SM4_context ctx;
byte scratch[16];
- const char *r;
static const byte plaintext[16] = {
0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF,
@@ -1474,15 +1457,6 @@ sm4_selftest (void)
if (memcmp (scratch, plaintext, sizeof (plaintext)))
return "SM4 test decryption failed.";
- if ( (r = selftest_ctr_128 ()) )
- return r;
-
- if ( (r = selftest_cbc_128 ()) )
- return r;
-
- if ( (r = selftest_cfb_128 ()) )
- return r;
-
return NULL;
}
diff --git a/cipher/twofish.c b/cipher/twofish.c
index d19e0790..b300715b 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -46,7 +46,7 @@
#include "cipher.h"
#include "bufhelp.h"
#include "cipher-internal.h"
-#include "cipher-selftest.h"
+#include "bulkhelp.h"
#define TWOFISH_BLOCKSIZE 16
@@ -1358,27 +1358,11 @@ _gcry_twofish_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
{
int did_use_avx2 = 0;
u64 Ls[16];
- unsigned int n = 16 - (blkn % 16);
u64 *l;
- int i;
if (nblocks >= 16)
{
- for (i = 0; i < 16; i += 8)
- {
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- }
-
- Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- l = &Ls[(15 + n) % 16];
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
/* Process data in 16 block chunks. */
while (nblocks >= 16)
@@ -1471,27 +1455,11 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
{
int did_use_avx2 = 0;
u64 Ls[16];
- unsigned int n = 16 - (blkn % 16);
u64 *l;
- int i;
if (nblocks >= 16)
{
- for (i = 0; i < 16; i += 8)
- {
- /* Use u64 to store pointers for x32 support (assembly function
- * assumes 64-bit pointers). */
- Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
- Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
- Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
- }
-
- Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
- l = &Ls[(15 + n) % 16];
+ l = bulk_ocb_prepare_L_pointers_array_blk16 (c, Ls, blkn);
/* Process data in 16 block chunks. */
while (nblocks >= 16)
@@ -1558,46 +1526,6 @@ _gcry_twofish_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
return nblocks;
}
-
-
-/* Run the self-tests for TWOFISH-CTR, tests IV increment of bulk CTR
- encryption. Returns NULL on success. */
-static const char *
-selftest_ctr (void)
-{
- const int nblocks = 16+1;
- const int blocksize = TWOFISH_BLOCKSIZE;
- const int context_size = sizeof(TWOFISH_context);
-
- return _gcry_selftest_helper_ctr("TWOFISH", &twofish_setkey,
- &twofish_encrypt, nblocks, blocksize, context_size);
-}
-
-/* Run the self-tests for TWOFISH-CBC, tests bulk CBC decryption.
- Returns NULL on success. */
-static const char *
-selftest_cbc (void)
-{
- const int nblocks = 16+2;
- const int blocksize = TWOFISH_BLOCKSIZE;
- const int context_size = sizeof(TWOFISH_context);
-
- return _gcry_selftest_helper_cbc("TWOFISH", &twofish_setkey,
- &twofish_encrypt, nblocks, blocksize, context_size);
-}
-
-/* Run the self-tests for TWOFISH-CFB, tests bulk CBC decryption.
- Returns NULL on success. */
-static const char *
-selftest_cfb (void)
-{
- const int nblocks = 16+2;
- const int blocksize = TWOFISH_BLOCKSIZE;
- const int context_size = sizeof(TWOFISH_context);
-
- return _gcry_selftest_helper_cfb("TWOFISH", &twofish_setkey,
- &twofish_encrypt, nblocks, blocksize, context_size);
-}
/* Test a single encryption and decryption with each key size. */
@@ -1608,7 +1536,6 @@ selftest (void)
TWOFISH_context ctx; /* Expanded key. */
byte scratch[16]; /* Encryption/decryption result buffer. */
cipher_bulk_ops_t bulk_ops;
- const char *r;
/* Test vectors for single encryption/decryption. Note that I am using
* the vectors from the Twofish paper's "known answer test", I=3 for
@@ -1658,13 +1585,6 @@ selftest (void)
if (memcmp (scratch, plaintext_256, sizeof (plaintext_256)))
return "Twofish-256 test decryption failed.";
- if ((r = selftest_ctr()) != NULL)
- return r;
- if ((r = selftest_cbc()) != NULL)
- return r;
- if ((r = selftest_cfb()) != NULL)
- return r;
-
return NULL;
}
diff --git a/configure.ac b/configure.ac
index 3e415cea..a7482cf3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -675,6 +675,14 @@ AC_ARG_ENABLE(avx512-support,
avx512support=$enableval,avx512support=yes)
AC_MSG_RESULT($avx512support)
+# Implementation of the --disable-gfni-support switch.
+AC_MSG_CHECKING([whether GFNI support is requested])
+AC_ARG_ENABLE(gfni-support,
+ AS_HELP_STRING([--disable-gfni-support],
+ [Disable support for the Intel GFNI instructions]),
+ gfnisupport=$enableval,gfnisupport=yes)
+AC_MSG_RESULT($gfnisupport)
+
# Implementation of the --disable-neon-support switch.
AC_MSG_CHECKING([whether NEON support is requested])
AC_ARG_ENABLE(neon-support,
@@ -1305,6 +1313,7 @@ if test "$mpi_cpu_arch" != "x86" ; then
avxsupport="n/a"
avx2support="n/a"
avx512support="n/a"
+ gfnisupport="n/a"
padlocksupport="n/a"
drngsupport="n/a"
fi
@@ -1607,6 +1616,30 @@ fi
#
+# Check whether GCC inline assembler supports GFNI instructions
+#
+AC_CACHE_CHECK([whether GCC inline assembler supports GFNI instructions],
+ [gcry_cv_gcc_inline_asm_gfni],
+ [if test "$mpi_cpu_arch" != "x86" ||
+ test "$try_asm_modules" != "yes" ; then
+ gcry_cv_gcc_inline_asm_gfni="n/a"
+ else
+ gcry_cv_gcc_inline_asm_gfni=no
+ AC_LINK_IFELSE([AC_LANG_PROGRAM(
+ [[void a(void) {
+ __asm__("gf2p8affineqb \$123, %%xmm0, %%xmm0;\n\t":::"cc"); /* SSE */
+ __asm__("vgf2p8affineinvqb \$234, %%ymm1, %%ymm1, %%ymm1;\n\t":::"cc"); /* AVX */
+ __asm__("vgf2p8mulb (%%eax), %%zmm2, %%zmm2;\n\t":::"cc"); /* AVX512 */
+ }]], [ a(); ] )],
+ [gcry_cv_gcc_inline_asm_gfni=yes])
+ fi])
+if test "$gcry_cv_gcc_inline_asm_gfni" = "yes" ; then
+ AC_DEFINE(HAVE_GCC_INLINE_ASM_GFNI,1,
+ [Defined if inline assembler supports GFNI instructions])
+fi
+
+
+#
# Check whether GCC inline assembler supports BMI2 instructions
#
AC_CACHE_CHECK([whether GCC inline assembler supports BMI2 instructions],
@@ -2411,6 +2444,11 @@ if test x"$avx512support" = xyes ; then
avx512support="no (unsupported by compiler)"
fi
fi
+if test x"$gfnisupport" = xyes ; then
+ if test "$gcry_cv_gcc_inline_asm_gfni" != "yes" ; then
+ gfnisupport="no (unsupported by compiler)"
+ fi
+fi
if test x"$neonsupport" = xyes ; then
if test "$gcry_cv_gcc_inline_asm_neon" != "yes" ; then
if test "$gcry_cv_gcc_inline_asm_aarch64_neon" != "yes" ; then
@@ -2454,6 +2492,10 @@ if test x"$avx512support" = xyes ; then
AC_DEFINE(ENABLE_AVX512_SUPPORT,1,
[Enable support for Intel AVX512 instructions.])
fi
+if test x"$gfnisupport" = xyes ; then
+ AC_DEFINE(ENABLE_GFNI_SUPPORT,1,
+ [Enable support for Intel GFNI instructions.])
+fi
if test x"$neonsupport" = xyes ; then
AC_DEFINE(ENABLE_NEON_SUPPORT,1,
[Enable support for ARM NEON instructions.])
@@ -2713,6 +2755,12 @@ if test "$found" = "1" ; then
# Build with the VAES/AVX2 implementation
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-vaes-avx2-amd64.lo"
+
+ # Build with the GFNI/AVX2 implementation
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx2-amd64.lo"
+
+ # Build with the GFNI/AVX512 implementation
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS camellia-gfni-avx512-amd64.lo"
fi
fi
fi
@@ -2797,6 +2845,7 @@ if test "$found" = "1" ; then
# Build with the assembly implementation
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx-amd64.lo"
GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-aesni-avx2-amd64.lo"
+ GCRYPT_ASM_CIPHERS="$GCRYPT_ASM_CIPHERS sm4-gfni-avx2-amd64.lo"
;;
aarch64-*-*)
# Build with the assembly implementation
@@ -3318,6 +3367,7 @@ GCRY_MSG_SHOW([Try using DRNG (RDRAND): ],[$drngsupport])
GCRY_MSG_SHOW([Try using Intel AVX: ],[$avxsupport])
GCRY_MSG_SHOW([Try using Intel AVX2: ],[$avx2support])
GCRY_MSG_SHOW([Try using Intel AVX512: ],[$avx512support])
+GCRY_MSG_SHOW([Try using Intel GFNI: ],[$gfnisupport])
GCRY_MSG_SHOW([Try using ARM NEON: ],[$neonsupport])
GCRY_MSG_SHOW([Try using ARMv8 crypto: ],[$armcryptosupport])
GCRY_MSG_SHOW([Try using PPC crypto: ],[$ppccryptosupport])
diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi
index 55515011..b82535e2 100644
--- a/doc/gcrypt.texi
+++ b/doc/gcrypt.texi
@@ -591,6 +591,7 @@ are
@item intel-shaext
@item intel-vaes-vpclmul
@item intel-avx512
@item arm-neon
@item arm-aes
@item arm-sha1
diff --git a/m4/Makefile.am b/m4/Makefile.am
index c33f1009..53800d39 100644
--- a/m4/Makefile.am
+++ b/m4/Makefile.am
@@ -1,2 +1,2 @@
-EXTRA_DIST = libtool.m4 socklen.m4 noexecstack.m4
+EXTRA_DIST = libtool.m4 noexecstack.m4
EXTRA_DIST += gpg-error.m4
diff --git a/mpi/longlong.h b/mpi/longlong.h
index 39cdd0c2..c299534c 100644
--- a/mpi/longlong.h
+++ b/mpi/longlong.h
@@ -430,14 +430,14 @@ extern UDItype __udiv_qrnnd ();
# define UMUL_TIME 40
# define UDIV_TIME 80
# endif
-# ifndef LONGLONG_STANDALONE
+# if !defined(LONGLONG_STANDALONE) && !defined(ASM_DISABLED)
# define udiv_qrnnd(q, r, n1, n0, d) \
do { USItype __r; \
(q) = __udiv_qrnnd (&__r, (n1), (n0), (d)); \
(r) = __r; \
} while (0)
extern USItype __udiv_qrnnd ();
-# endif /* !LONGLONG_STANDALONE */
+# endif /* !LONGLONG_STANDALONE && !ASM_DISABLED */
# define count_leading_zeros(count, x) \
do { \
USItype __tmp; \
diff --git a/mpi/mpih-const-time.c b/mpi/mpih-const-time.c
index b527ad79..9d74d190 100644
--- a/mpi/mpih-const-time.c
+++ b/mpi/mpih-const-time.c
@@ -204,6 +204,13 @@ _gcry_mpih_cmp_ui (mpi_ptr_t up, mpi_size_t usize, unsigned long v)
is_all_zero &= (up[i] == 0);
if (is_all_zero)
- return up[0] - v;
+ {
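+      /* Compare explicitly rather than returning the limb-sized difference,
+         which could be truncated or change sign when converted to int. */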
+ if (up[0] < v)
+ return -1;
+ else if (up[0] > v)
+ return 1;
+ else
+ return 0;
+ }
return 1;
}
diff --git a/random/jitterentropy-base-user.h b/random/jitterentropy-base-user.h
index 389106ff..3b4274af 100644
--- a/random/jitterentropy-base-user.h
+++ b/random/jitterentropy-base-user.h
@@ -141,7 +141,7 @@ static inline void jent_memset_secure(void *s, size_t n)
static inline long jent_ncpu(void)
{
-#ifdef _POSIX_SOURCE
+#if defined(_POSIX_SOURCE)
long ncpu = sysconf(_SC_NPROCESSORS_ONLN);
if (ncpu == -1)
@@ -151,6 +151,19 @@ static inline long jent_ncpu(void)
return -EFAULT;
return ncpu;
+#elif defined(HAVE_W32_SYSTEM)
+ SYSTEM_INFO sysinfo;
+ long ncpu;
+
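+	/* Prefer GetNativeSystemInfo so the real CPU count is reported even
+	 * under WOW64; fall back to GetSystemInfo if it yields nothing. */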
+ GetNativeSystemInfo (&sysinfo);
+ ncpu = sysinfo.dwNumberOfProcessors;
+ if (ncpu <= 0) {
+ GetSystemInfo (&sysinfo);
+ ncpu = sysinfo.dwNumberOfProcessors;
+ }
+ if (ncpu <= 0)
+ ncpu = 1;
+ return ncpu;
#else
return 1;
#endif
diff --git a/random/random-drbg.c b/random/random-drbg.c
index 5a46fd92..f1cfe286 100644
--- a/random/random-drbg.c
+++ b/random/random-drbg.c
@@ -341,6 +341,9 @@ enum drbg_prefixes
* Global variables
***************************************************************/
+/* The instance of the DRBG, referred to by drbg_state. */
+static struct drbg_state_s drbg_instance;
+
/* Global state variable holding the current instance of the DRBG. */
static drbg_state_t drbg_state;
@@ -1783,9 +1786,7 @@ _drbg_init_internal (u32 flags, drbg_string_t *pers)
}
else
{
- drbg_state = xtrycalloc_secure (1, sizeof *drbg_state);
- if (!drbg_state)
- return gpg_err_code_from_syserror ();
+ drbg_state = &drbg_instance;
}
if (flags & DRBG_PREDICTION_RESIST)
pr = 1;
@@ -1879,7 +1880,6 @@ _gcry_rngdrbg_close_fds (void)
if (drbg_state)
{
drbg_uninstantiate (drbg_state);
- xfree (drbg_state);
drbg_state = NULL;
}
drbg_unlock ();
diff --git a/random/rndjent.c b/random/rndjent.c
index 14d23794..0468c7cb 100644
--- a/random/rndjent.c
+++ b/random/rndjent.c
@@ -45,6 +45,17 @@
#endif
#include <unistd.h>
#include <errno.h>
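+/* Provide a fallback definition for systems that lack EOPNOTSUPP. */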
+#ifndef EOPNOTSUPP
+# define EOPNOTSUPP ENOSYS
+#endif
+
+#ifdef HAVE_W32_SYSTEM
+# if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0501
+# undef _WIN32_WINNT
+# define _WIN32_WINNT 0x0501 /* for GetNativeSystemInfo */
+# endif
+# include <windows.h>
+#endif
#include "types.h"
#include "g10lib.h"
diff --git a/src/g10lib.h b/src/g10lib.h
index c07ed788..a5bed002 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -238,6 +238,7 @@ char **_gcry_strtokenize (const char *string, const char *delim);
#define HWF_INTEL_SHAEXT (1 << 16)
#define HWF_INTEL_VAES_VPCLMUL (1 << 17)
#define HWF_INTEL_AVX512 (1 << 18)
+#define HWF_INTEL_GFNI (1 << 19)
#elif defined(HAVE_CPU_ARCH_ARM)
diff --git a/src/gcrypt-int.h b/src/gcrypt-int.h
index 08977d32..04953ffc 100644
--- a/src/gcrypt-int.h
+++ b/src/gcrypt-int.h
@@ -214,8 +214,8 @@ gpg_err_code_t _gcry_kdf_open (gcry_kdf_hd_t *hd, int algo, int subalgo,
const void *salt, size_t saltlen,
const void *key, size_t keylen,
const void *ad, size_t adlen);
-gcry_error_t _gcry_kdf_compute (gcry_kdf_hd_t h,
- const struct gcry_kdf_thread_ops *ops);
+gcry_err_code_t _gcry_kdf_compute (gcry_kdf_hd_t h,
+ const struct gcry_kdf_thread_ops *ops);
gpg_err_code_t _gcry_kdf_final (gcry_kdf_hd_t h, size_t resultlen, void *result);
void _gcry_kdf_close (gcry_kdf_hd_t h);
diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in
index 2fd47292..809848b7 100644
--- a/src/gcrypt.h.in
+++ b/src/gcrypt.h.in
@@ -206,7 +206,7 @@ struct gcry_thread_cbs
Bits 7 - 0 are used for the thread model
Bits 15 - 8 are used for the version number. */
unsigned int option;
-} _GCRY_ATTR_INTERNAL;
+} _GCRY_GCC_ATTR_DEPRECATED;
#define GCRY_THREAD_OPTION_PTH_IMPL \
static struct gcry_thread_cbs gcry_threads_pth = { \
@@ -1592,8 +1592,20 @@ enum gcry_kdf_algos
GCRY_KDF_PBKDF1 = 33,
GCRY_KDF_PBKDF2 = 34,
GCRY_KDF_SCRYPT = 48,
+ /**/
GCRY_KDF_ARGON2 = 64,
- GCRY_KDF_BALLOON = 65
+ GCRY_KDF_BALLOON = 65,
+ /**/
+ /* In the original SP 800-56A, it's called
+ * "Concatenation Key Derivation Function".
+ * Now (as of 2022), it's defined in SP 800-56C rev.2, as
+ * "One-Step Key Derivation".
+ */
+ GCRY_KDF_ONESTEP_KDF = 96, /* One-Step Key Derivation with hash */
+ GCRY_KDF_ONESTEP_KDF_MAC = 97, /* One-Step Key Derivation with MAC */
+ /* Two-Step Key Derivation with HMAC */
+ /* Two-Step Key Derivation with CMAC */
+ /* KDF PRF in SP 800-108r1 */
};
enum gcry_kdf_subalgo_argon2
diff --git a/src/hwf-x86.c b/src/hwf-x86.c
index 33386070..20420798 100644
--- a/src/hwf-x86.c
+++ b/src/hwf-x86.c
@@ -403,7 +403,7 @@ detect_x86_gnuc (void)
#if defined(ENABLE_AVX2_SUPPORT) && defined(ENABLE_AESNI_SUPPORT) && \
defined(ENABLE_PCLMUL_SUPPORT)
- /* Test bit 9 for VAES and bit 10 for VPCLMULDQD */
+ /* Test features2 bit 9 for VAES and features2 bit 10 for VPCLMULQDQ */
if ((features2 & 0x00000200) && (features2 & 0x00000400))
result |= HWF_INTEL_VAES_VPCLMUL;
#endif
@@ -439,6 +439,11 @@ detect_x86_gnuc (void)
&& (features2 & (1 << 14)))
result |= HWF_INTEL_AVX512;
#endif
+
+ /* Test features2 bit 6 for GFNI (Galois field new instructions).
+ * These instructions are available for SSE/AVX/AVX2/AVX512. */
+ if (features2 & (1 << 6))
+ result |= HWF_INTEL_GFNI;
}
return result;
diff --git a/src/hwfeatures.c b/src/hwfeatures.c
index 8e92cbdd..af5daf62 100644
--- a/src/hwfeatures.c
+++ b/src/hwfeatures.c
@@ -63,6 +63,7 @@ static struct
{ HWF_INTEL_SHAEXT, "intel-shaext" },
{ HWF_INTEL_VAES_VPCLMUL, "intel-vaes-vpclmul" },
{ HWF_INTEL_AVX512, "intel-avx512" },
+ { HWF_INTEL_GFNI, "intel-gfni" },
#elif defined(HAVE_CPU_ARCH_ARM)
{ HWF_ARM_NEON, "arm-neon" },
{ HWF_ARM_AES, "arm-aes" },
diff --git a/src/secmem.c b/src/secmem.c
index b36c44f6..010f1cc3 100644
--- a/src/secmem.c
+++ b/src/secmem.c
@@ -289,48 +289,7 @@ print_warn (void)
static void
lock_pool_pages (void *p, size_t n)
{
-#if defined(USE_CAPABILITIES) && defined(HAVE_MLOCK)
- int err;
-
- {
- cap_t cap;
-
- if (!no_priv_drop)
- {
- cap = cap_from_text ("cap_ipc_lock+ep");
- cap_set_proc (cap);
- cap_free (cap);
- }
- err = no_mlock? 0 : mlock (p, n);
- if (err && errno)
- err = errno;
- if (!no_priv_drop)
- {
- cap = cap_from_text ("cap_ipc_lock+p");
- cap_set_proc (cap);
- cap_free(cap);
- }
- }
-
- if (err)
- {
- if (err != EPERM
-#ifdef EAGAIN /* BSD and also Linux may return EAGAIN */
- && err != EAGAIN
-#endif
-#ifdef ENOSYS /* Some SCOs return this (function not implemented) */
- && err != ENOSYS
-#endif
-#ifdef ENOMEM /* Linux might return this. */
- && err != ENOMEM
-#endif
- )
- log_error ("can't lock memory: %s\n", strerror (err));
- show_warning = 1;
- not_locked = 1;
- }
-
-#elif defined(HAVE_MLOCK)
+#if defined(HAVE_MLOCK)
uid_t uid;
int err;
@@ -344,18 +303,14 @@ lock_pool_pages (void *p, size_t n)
if (uid)
{
errno = EPERM;
- err = errno;
+ err = -1;
}
else
{
err = no_mlock? 0 : mlock (p, n);
- if (err && errno)
- err = errno;
}
#else /* !HAVE_BROKEN_MLOCK */
err = no_mlock? 0 : mlock (p, n);
- if (err && errno)
- err = errno;
#endif /* !HAVE_BROKEN_MLOCK */
/* Test whether we are running setuid(0). */
@@ -373,18 +328,18 @@ lock_pool_pages (void *p, size_t n)
if (err)
{
- if (err != EPERM
+ if (errno != EPERM
#ifdef EAGAIN /* BSD and also Linux may return this. */
- && err != EAGAIN
+ && errno != EAGAIN
#endif
#ifdef ENOSYS /* Some SCOs return this (function not implemented). */
- && err != ENOSYS
+ && errno != ENOSYS
#endif
#ifdef ENOMEM /* Linux might return this. */
- && err != ENOMEM
+ && errno != ENOMEM
#endif
)
- log_error ("can't lock memory: %s\n", strerror (err));
+ log_error ("can't lock memory: %s\n", strerror (errno));
show_warning = 1;
not_locked = 1;
}
@@ -401,12 +356,6 @@ lock_pool_pages (void *p, size_t n)
* this whole Windows !@#$% and their user base are inherently insecure. */
(void)p;
(void)n;
-#elif defined (__riscos__)
- /* No virtual memory on RISC OS, so no pages are swapped to disc,
- * besides we don't have mmap, so we don't use it! ;-)
- * But don't complain, as explained above. */
- (void)p;
- (void)n;
#else
(void)p;
(void)n;
@@ -809,7 +758,7 @@ _gcry_secmem_realloc_internal (void *p, size_t newsize, int xhint)
void *a;
mb = (memblock_t *) (void *) ((char *) p
- - ((size_t) &((memblock_t *) 0)->aligned.c));
+ - offsetof (memblock_t, aligned.c));
size = mb->size;
if (newsize < size)
{
diff --git a/tests/aeswrap.c b/tests/aeswrap.c
index ed4453bd..e5ecad75 100644
--- a/tests/aeswrap.c
+++ b/tests/aeswrap.c
@@ -219,6 +219,7 @@ check_one_with_padding (int algo,
if (err)
{
fail ("gcry_cipher_setkey failed: %s\n", gpg_strerror (err));
+ gcry_cipher_close (hd);
return;
}
@@ -235,6 +236,7 @@ check_one_with_padding (int algo,
if (err)
{
fail ("gcry_cipher_encrypt failed: %s\n", gpg_strerror (err));
+ gcry_cipher_close (hd);
return;
}
@@ -275,6 +277,7 @@ check_one_with_padding (int algo,
if (err)
{
fail ("gcry_cipher_decrypt failed: %s\n", gpg_strerror (err));
+ gcry_cipher_close (hd);
return;
}
@@ -318,6 +321,7 @@ check_one_with_padding (int algo,
if (err)
{
fail ("gcry_cipher_decrypt(2) failed: %s\n", gpg_strerror (err));
+ gcry_cipher_close (hd);
return;
}
@@ -347,6 +351,7 @@ check_one_with_padding (int algo,
if (err)
{
fail ("gcry_cipher_decrypt(3) failed: %s\n", gpg_strerror (err));
+ gcry_cipher_close (hd);
return;
}
diff --git a/tests/basic.c b/tests/basic.c
index a0ad33eb..ab00667e 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -27,6 +27,13 @@
#include <string.h>
#include <stdarg.h>
#include <assert.h>
+#ifdef HAVE_STDINT_H
+# include <stdint.h> /* uintptr_t */
+#elif defined(HAVE_INTTYPES_H)
+# include <inttypes.h>
+#else
+/* In this case, uintptr_t is provided by config.h. */
+#endif
#include "../src/gcrypt-int.h"
@@ -55,11 +62,12 @@ typedef struct test_spec_pubkey
}
test_spec_pubkey_t;
-#define FLAG_CRYPT (1 << 0)
-#define FLAG_SIGN (1 << 1)
-#define FLAG_GRIP (1 << 2)
-#define FLAG_NOFIPS (1 << 3)
-#define FLAG_CFB8 (1 << 4)
+#define FLAG_CRYPT (1 << 0)
+#define FLAG_SIGN (1 << 1)
+#define FLAG_GRIP (1 << 2)
+#define FLAG_NOFIPS (1 << 3)
+#define FLAG_CFB8 (1 << 4)
+#define FLAG_SPECIAL (1 << 5)
static int in_fips_mode;
@@ -7331,6 +7339,8 @@ check_ccm_cipher (void)
if (!keylen)
{
fail ("cipher-ccm, gcry_cipher_get_algo_keylen failed\n");
+ gcry_cipher_close (hde);
+ gcry_cipher_close (hdd);
return;
}
@@ -7350,6 +7360,8 @@ check_ccm_cipher (void)
if (!blklen)
{
fail ("cipher-ccm, gcry_cipher_get_algo_blklen failed\n");
+ gcry_cipher_close (hde);
+ gcry_cipher_close (hdd);
return;
}
@@ -8423,7 +8435,7 @@ check_ocb_cipher_checksum (int algo, int keylen)
const size_t buflen = 128 * 16;
unsigned char *inbuf, *outbuf;
gpg_error_t err = 0;
- gcry_cipher_hd_t hde, hde2, hdd;
+ gcry_cipher_hd_t hde = NULL, hde2 = NULL, hdd = NULL;
unsigned char tag[16];
unsigned char tag2[16];
unsigned char tag3[16];
@@ -8436,7 +8448,7 @@ check_ocb_cipher_checksum (int algo, int keylen)
return;
}
outbuf = xmalloc(buflen);
- if (!inbuf)
+ if (!outbuf)
{
fail ("out-of-memory\n");
xfree(inbuf);
@@ -8485,6 +8497,8 @@ check_ocb_cipher_checksum (int algo, int keylen)
err = gcry_cipher_open (&hdd, algo, GCRY_CIPHER_MODE_OCB, 0);
if (err)
{
+ gcry_cipher_close (hde);
+ gcry_cipher_close (hde2);
fail ("cipher-ocb, gcry_cipher_open failed (checksum, algo %d): %s\n",
algo, gpg_strerror (err));
goto out_free;
@@ -9118,6 +9132,7 @@ do_check_xts_cipher (int inplace)
{
fail ("cipher-xts, gcry_cipher_open failed (tv %d): %s\n",
tidx, gpg_strerror (err));
+ gcry_cipher_close (hde);
return;
}
@@ -9342,6 +9357,8 @@ check_gost28147_cipher_basic (enum gcry_cipher_algos algo)
if (err)
{
fail ("gost28147, gcry_cipher_open failed: %s\n", gpg_strerror (err));
+ gcry_cipher_close (hde);
+ gcry_cipher_close (hdd);
return;
}
@@ -10939,7 +10956,7 @@ static int
check_one_cipher_core (int algo, int mode, int flags,
const char *key, size_t nkey,
const unsigned char *plain, size_t nplain,
- int bufshift, int pass)
+ int bufshift, int split_mode, int pass)
{
gcry_cipher_hd_t hd;
unsigned char *in_buffer, *out_buffer;
@@ -11198,7 +11215,9 @@ check_one_cipher_core (int algo, int mode, int flags,
}
pos += piecelen;
- piecelen = piecelen * 2 - ((piecelen != blklen) ? blklen : 0);
+ piecelen = split_mode == 1
+ ? (piecelen + blklen)
+ : (piecelen * 2 - ((piecelen != blklen) ? blklen : 0));
}
if (taglen > 0)
@@ -11245,7 +11264,9 @@ check_one_cipher_core (int algo, int mode, int flags,
}
pos += piecelen;
- piecelen = piecelen * 2 - ((piecelen != blklen) ? blklen : 0);
+ piecelen = split_mode == 1
+ ? (piecelen + blklen)
+ : (piecelen * 2 - ((piecelen != blklen) ? blklen : 0));
}
if (taglen > 0)
@@ -11291,7 +11312,9 @@ check_one_cipher_core (int algo, int mode, int flags,
}
pos += piecelen;
- piecelen = piecelen * 2 - ((piecelen != blklen) ? blklen : 0);
+ piecelen = split_mode == 1
+ ? (piecelen + blklen)
+ : (piecelen * 2 - ((piecelen != blklen) ? blklen : 0));
}
if (memcmp (enc_result, out, nplain))
@@ -11320,7 +11343,9 @@ check_one_cipher_core (int algo, int mode, int flags,
}
pos += piecelen;
- piecelen = piecelen * 2 - ((piecelen != blklen) ? blklen : 0);
+ piecelen = split_mode == 1
+ ? (piecelen + blklen)
+ : (piecelen * 2 - ((piecelen != blklen) ? blklen : 0));
}
if (memcmp (plain, out, nplain))
@@ -11580,28 +11605,28 @@ check_one_cipher (int algo, int mode, int flags)
}
if (check_one_cipher_core (algo, mode, flags, key, 64, plain,
- medium_buffer_size, bufshift,
+ medium_buffer_size, bufshift, 0,
0+10*bufshift))
goto out;
/* Pass 1: Key not aligned. */
memmove (key+1, key, 64);
if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain,
- medium_buffer_size, bufshift,
+ medium_buffer_size, bufshift, 0,
1+10*bufshift))
goto out;
/* Pass 2: Key not aligned and data not aligned. */
memmove (plain+1, plain, medium_buffer_size);
if (check_one_cipher_core (algo, mode, flags, key+1, 64, plain+1,
- medium_buffer_size, bufshift,
+ medium_buffer_size, bufshift, 0,
2+10*bufshift))
goto out;
/* Pass 3: Key aligned and data not aligned. */
memmove (key, key+1, 64);
if (check_one_cipher_core (algo, mode, flags, key, 64, plain+1,
- medium_buffer_size, bufshift,
+ medium_buffer_size, bufshift, 0,
3+10*bufshift))
goto out;
}
@@ -11620,10 +11645,15 @@ check_one_cipher (int algo, int mode, int flags)
}
if (check_one_cipher_core (algo, mode, flags, key, 64, plain,
- large_buffer_size, bufshift,
+ large_buffer_size, bufshift, 0,
50))
goto out;
+ if (check_one_cipher_core (algo, mode, flags, key, 64, plain,
+ large_buffer_size, bufshift, 1,
+ 51))
+ goto out;
+
/* Pass 6: Counter overflow tests for ChaCha20 and CTR mode. */
if (mode == GCRY_CIPHER_MODE_STREAM && algo == GCRY_CIPHER_CHACHA20)
{
@@ -11658,6 +11688,764 @@ out:
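+/* XOR each byte of VSRC1 and VSRC2 into VDST; small helper for building
+   reference ciphertexts in the bulk-mode tests below. */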
+static void buf_xor(void *vdst, const void *vsrc1, const void *vsrc2, size_t len)
+{
+ char *dst = vdst;
+ const char *src1 = vsrc1;
+ const char *src2 = vsrc2;
+
+ while (len)
+ {
+ *(char *)dst = *(char *)src1 ^ *(char *)src2;
+ dst++;
+ src1++;
+ src2++;
+ len--;
+ }
+}
+
+/* Run the tests for <block cipher>-CBC-<block size>, tests bulk CBC
+ decryption. Returns zero on success. */
+static int
+cipher_cbc_bulk_test (int cipher_algo)
+{
+ const int nblocks = 128 - 1;
+ int i, offs;
+ int blocksize;
+ const char *cipher;
+ gcry_cipher_hd_t hd_one;
+ gcry_cipher_hd_t hd_cbc;
+ gcry_error_t err = 0;
+ unsigned char *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem;
+ unsigned int memsize;
+ unsigned int keylen;
+
+ static const unsigned char key[32] = {
+ 0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22,
+ 0x66,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x22
+ };
+
+ if (gcry_cipher_test_algo (cipher_algo))
+ return -1;
+ blocksize = gcry_cipher_get_algo_blklen(cipher_algo);
+ if (blocksize < 8)
+ return -1;
+ cipher = gcry_cipher_algo_name (cipher_algo);
+ keylen = gcry_cipher_get_algo_keylen (cipher_algo);
+ if (keylen > sizeof(key))
+ {
+ fail ("%s-CBC-%d test failed (key too short)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ memsize = (blocksize * 2) + (blocksize * nblocks * 3) + 16;
+
+ mem = xcalloc (1, memsize);
+ if (!mem)
+ return -1;
+
+ offs = (16 - ((uintptr_t)mem & 15)) & 15;
+ iv = (void*)(mem + offs);
+ iv2 = iv + blocksize;
+ plaintext = iv2 + blocksize;
+ plaintext2 = plaintext + nblocks * blocksize;
+ ciphertext = plaintext2 + nblocks * blocksize;
+
+ err = gcry_cipher_open (&hd_one, cipher_algo, GCRY_CIPHER_MODE_ECB, 0);
+ if (err)
+ {
+ xfree(mem);
+ fail ("%s-CBC-%d test failed (cipher open fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ err = gcry_cipher_open (&hd_cbc, cipher_algo, GCRY_CIPHER_MODE_CBC, 0);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ xfree(mem);
+ fail ("%s-CBC-%d test failed (cipher open fail)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ /* Initialize ctx */
+ if (gcry_cipher_setkey (hd_one, key, keylen) ||
+ gcry_cipher_setkey (hd_cbc, key, keylen))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cbc);
+ xfree(mem);
+ fail ("%s-CBC-%d test failed (setkey fail)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ /* Test single block code path */
+ memset (iv, 0x4e, blocksize);
+ memset (iv2, 0x4e, blocksize);
+ for (i = 0; i < blocksize; i++)
+ plaintext[i] = i;
+
+ /* CBC manually. */
+ buf_xor (ciphertext, iv, plaintext, blocksize);
+ err = gcry_cipher_encrypt (hd_one, ciphertext, blocksize,
+ ciphertext, blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cbc);
+ xfree(mem);
+ fail ("%s-CBC-%d test failed (ECB encrypt fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ memcpy (iv, ciphertext, blocksize);
+
+ /* CBC decrypt. */
+ err = gcry_cipher_setiv (hd_cbc, iv2, blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cbc);
+ xfree(mem);
+ fail ("%s-CBC-%d test failed (setiv fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ err = gcry_cipher_decrypt (hd_cbc, plaintext2, blocksize * 1,
+ ciphertext, blocksize * 1);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cbc);
+ xfree(mem);
+ fail ("%s-CBC-%d test failed (CBC decrypt fail)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ if (memcmp (plaintext2, plaintext, blocksize))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cbc);
+ xfree (mem);
+ fail ("%s-CBC-%d test failed (plaintext mismatch)", cipher, blocksize * 8);
+ return -1;
+ }
+
+#if 0 /* missing interface for reading IV */
+ if (memcmp (iv2, iv, blocksize))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cbc);
+ xfree (mem);
+ fail ("%s-CBC-%d test failed (IV mismatch)", cipher, blocksize * 8);
+ return -1;
+ }
+#endif
+
+ /* Test parallelized code paths */
+ memset (iv, 0x5f, blocksize);
+ memset (iv2, 0x5f, blocksize);
+
+ for (i = 0; i < nblocks * blocksize; i++)
+ plaintext[i] = i;
+
+ /* Create CBC ciphertext manually. */
+ for (i = 0; i < nblocks * blocksize; i+=blocksize)
+ {
+ buf_xor (&ciphertext[i], iv, &plaintext[i], blocksize);
+ err = gcry_cipher_encrypt (hd_one, &ciphertext[i], blocksize,
+ &ciphertext[i], blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cbc);
+ xfree(mem);
+ fail ("%s-CBC-%d test failed (ECB encrypt fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ memcpy (iv, &ciphertext[i], blocksize);
+ }
+
+ /* Decrypt using bulk CBC and compare result. */
+ err = gcry_cipher_setiv (hd_cbc, iv2, blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cbc);
+ xfree(mem);
+ fail ("%s-CBC-%d test failed (setiv fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ err = gcry_cipher_decrypt (hd_cbc, plaintext2, blocksize * nblocks,
+ ciphertext, blocksize * nblocks);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cbc);
+ xfree(mem);
+ fail ("%s-CBC-%d test failed (CBC decrypt fail)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ if (memcmp (plaintext2, plaintext, nblocks * blocksize))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cbc);
+ xfree (mem);
+ fail ("%s-CBC-%d test failed (plaintext mismatch, parallel path)",
+ cipher, blocksize * 8);
+ return -1;
+ }
+#if 0 /* missing interface for reading IV */
+ if (memcmp (iv2, iv, blocksize))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cbc);
+ xfree (mem);
+ fail ("%s-CBC-%d test failed (IV mismatch, parallel path)",
+ cipher, blocksize * 8);
+ return -1;
+ }
+#endif
+
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cbc);
+ xfree (mem);
+ return 0;
+}
+
+
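+/* XOR VSRC into VDST2 in place and copy the result to VDST1 as well;
+   this mirrors the feedback step used to build CFB ciphertexts. */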
+static void
+buf_xor_2dst(void *vdst1, void *vdst2, const void *vsrc, size_t len)
+{
+ byte *dst1 = vdst1;
+ byte *dst2 = vdst2;
+ const byte *src = vsrc;
+
+ for (; len; len--)
+ *dst1++ = (*dst2++ ^= *src++);
+}
+
+/* Run the tests for <block cipher>-CFB-<block size>, tests bulk CFB
+ decryption. Returns zero on success. */
+static int
+cipher_cfb_bulk_test (int cipher_algo)
+{
+ const int nblocks = 128 - 1;
+ int blocksize;
+ const char *cipher;
+ gcry_cipher_hd_t hd_one;
+ gcry_cipher_hd_t hd_cfb;
+ gcry_error_t err = 0;
+ int i, offs;
+ unsigned char *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem;
+ unsigned int memsize;
+ unsigned int keylen;
+
+ static const unsigned char key[32] = {
+ 0x11,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x33,
+ 0x11,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x33
+ };
+
+ if (gcry_cipher_test_algo (cipher_algo))
+ return -1;
+ blocksize = gcry_cipher_get_algo_blklen(cipher_algo);
+ if (blocksize < 8)
+ return -1;
+ cipher = gcry_cipher_algo_name (cipher_algo);
+ keylen = gcry_cipher_get_algo_keylen (cipher_algo);
+ if (keylen > sizeof(key))
+ {
+ fail ("%s-CFB-%d test failed (key too short)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ memsize = (blocksize * 2) + (blocksize * nblocks * 3) + 16;
+
+ mem = xcalloc (1, memsize);
+ if (!mem)
+ return -1;
+
+ offs = (16 - ((uintptr_t)mem & 15)) & 15;
+ iv = (void*)(mem + offs);
+ iv2 = iv + blocksize;
+ plaintext = iv2 + blocksize;
+ plaintext2 = plaintext + nblocks * blocksize;
+ ciphertext = plaintext2 + nblocks * blocksize;
+
+ err = gcry_cipher_open (&hd_one, cipher_algo, GCRY_CIPHER_MODE_ECB, 0);
+ if (err)
+ {
+ xfree(mem);
+ fail ("%s-CFB-%d test failed (cipher open fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ err = gcry_cipher_open (&hd_cfb, cipher_algo, GCRY_CIPHER_MODE_CFB, 0);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ xfree(mem);
+ fail ("%s-CFB-%d test failed (cipher open fail)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ /* Initialize ctx */
+ if (gcry_cipher_setkey (hd_one, key, keylen) ||
+ gcry_cipher_setkey (hd_cfb, key, keylen))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cfb);
+ xfree(mem);
+ fail ("%s-CFB-%d test failed (setkey fail)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ /* Test single block code path */
+ memset(iv, 0xd3, blocksize);
+ memset(iv2, 0xd3, blocksize);
+ for (i = 0; i < blocksize; i++)
+ plaintext[i] = i;
+
+ /* CFB manually. */
+ err = gcry_cipher_encrypt (hd_one, ciphertext, blocksize, iv, blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cfb);
+ xfree(mem);
+ fail ("%s-CFB-%d test failed (ECB encrypt fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ buf_xor_2dst (iv, ciphertext, plaintext, blocksize);
+
+ /* CFB decrypt. */
+ err = gcry_cipher_setiv (hd_cfb, iv2, blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cfb);
+ xfree(mem);
+ fail ("%s-CFB-%d test failed (setiv fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ err = gcry_cipher_decrypt (hd_cfb, plaintext2, blocksize * 1,
+ ciphertext, blocksize * 1);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cfb);
+ xfree(mem);
+ fail ("%s-CFB-%d test failed (CFB decrypt fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ if (memcmp(plaintext2, plaintext, blocksize))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cfb);
+ xfree(mem);
+ fail ("%s-CFB-%d test failed (plaintext mismatch)",
+ cipher, blocksize * 8);
+ return -1;
+ }
+
+#if 0
+ if (memcmp(iv2, iv, blocksize))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cfb);
+ xfree(mem);
+ fail ("%s-CFB-%d test failed (IV mismatch)",
+ cipher, blocksize * 8);
+ return -1;
+ }
+#endif
+
+ /* Test parallelized code paths */
+ memset(iv, 0xe6, blocksize);
+ memset(iv2, 0xe6, blocksize);
+
+ for (i = 0; i < nblocks * blocksize; i++)
+ plaintext[i] = i;
+
+ /* Create CFB ciphertext manually. */
+ for (i = 0; i < nblocks * blocksize; i+=blocksize)
+ {
+ err = gcry_cipher_encrypt (hd_one, &ciphertext[i], blocksize,
+ iv, blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cfb);
+ xfree(mem);
+ fail ("%s-CFB-%d test failed (ECB encrypt fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ buf_xor_2dst (iv, &ciphertext[i], &plaintext[i], blocksize);
+ }
+
+ /* Decrypt using bulk CFB and compare result. */
+ err = gcry_cipher_setiv (hd_cfb, iv2, blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cfb);
+ xfree(mem);
+ fail ("%s-CFB-%d test failed (setiv fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ err = gcry_cipher_decrypt (hd_cfb, plaintext2, blocksize * nblocks,
+ ciphertext, blocksize * nblocks);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cfb);
+ xfree(mem);
+ fail ("%s-CFB-%d test failed (CFB decrypt fail)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ if (memcmp(plaintext2, plaintext, nblocks * blocksize))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cfb);
+ xfree(mem);
+ fail ("%s-CFB-%d test failed (plaintext mismatch, parallel path)",
+ cipher, blocksize * 8);
+ return -1;
+ }
+#if 0
+ if (memcmp(iv2, iv, blocksize))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cfb);
+ xfree(mem);
+ fail ("%s-CFB-%d test failed (IV mismatch, parallel path)",
+ cipher, blocksize * 8);
+ return -1;
+ }
+#endif
+
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_cfb);
+ xfree(mem);
+ return 0;
+}
+
+
+/* Run the tests for <block cipher>-CTR-<block size>, tests IV increment
+ of bulk CTR encryption. Returns zero on success. */
+static int
+cipher_ctr_bulk_test (int cipher_algo)
+{
+ const int nblocks = 128 - 1;
+ int blocksize;
+ const char *cipher;
+ gcry_cipher_hd_t hd_one;
+ gcry_cipher_hd_t hd_ctr;
+ gcry_error_t err = 0;
+ int i, j, offs, diff;
+ unsigned char *plaintext, *plaintext2, *ciphertext, *ciphertext2,
+ *iv, *iv2, *mem;
+ unsigned int memsize;
+ unsigned int keylen;
+
+ static const unsigned char key[32] = {
+ 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21,
+ 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21
+ };
+
+ if (gcry_cipher_test_algo (cipher_algo))
+ return -1;
+ blocksize = gcry_cipher_get_algo_blklen(cipher_algo);
+ if (blocksize < 8)
+ return -1;
+ cipher = gcry_cipher_algo_name (cipher_algo);
+ keylen = gcry_cipher_get_algo_keylen (cipher_algo);
+ if (keylen > sizeof(key))
+ {
+ fail ("%s-CTR-%d test failed (key too short)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ memsize = (blocksize * 2) + (blocksize * nblocks * 4) + 16;
+
+ mem = xcalloc (1, memsize);
+ if (!mem)
+ return -1;
+
+ offs = (16 - ((uintptr_t)mem & 15)) & 15;
+ iv = (void*)(mem + offs);
+ iv2 = iv + blocksize;
+ plaintext = iv2 + blocksize;
+ plaintext2 = plaintext + nblocks * blocksize;
+ ciphertext = plaintext2 + nblocks * blocksize;
+ ciphertext2 = ciphertext + nblocks * blocksize;
+
+ err = gcry_cipher_open (&hd_one, cipher_algo, GCRY_CIPHER_MODE_ECB, 0);
+ if (err)
+ {
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (cipher open fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ err = gcry_cipher_open (&hd_ctr, cipher_algo, GCRY_CIPHER_MODE_CTR, 0);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (cipher open fail)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ /* Initialize ctx */
+ if (gcry_cipher_setkey (hd_one, key, keylen) ||
+ gcry_cipher_setkey (hd_ctr, key, keylen))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (setkey fail)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ /* Test single block code path */
+ memset (iv, 0xff, blocksize);
+ for (i = 0; i < blocksize; i++)
+ plaintext[i] = i;
+
+ /* CTR manually. */
+ err = gcry_cipher_encrypt (hd_one, ciphertext, blocksize, iv, blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (ECB encrypt fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ for (i = 0; i < blocksize; i++)
+ ciphertext[i] ^= plaintext[i];
+ for (i = blocksize; i > 0; i--)
+ {
+ iv[i-1]++;
+ if (iv[i-1])
+ break;
+ }
+
+ memset (iv2, 0xff, blocksize);
+ err = gcry_cipher_setctr (hd_ctr, iv2, blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (setiv fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ err = gcry_cipher_encrypt (hd_ctr, plaintext2, blocksize * 1,
+ ciphertext, blocksize * 1);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (CTR encrypt fail)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ if (memcmp (plaintext2, plaintext, blocksize))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (plaintext mismatch)",
+ cipher, blocksize * 8);
+ return -1;
+ }
+
+#if 0
+ if (memcmp (iv2, iv, blocksize))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (IV mismatch)", cipher, blocksize * 8);
+ return -1;
+ }
+#endif
+
+ /* Test bulk encryption with typical IV. */
+ memset(iv, 0x57, blocksize-4);
+ iv[blocksize-1] = 1;
+ iv[blocksize-2] = 0;
+ iv[blocksize-3] = 0;
+ iv[blocksize-4] = 0;
+ memset(iv2, 0x57, blocksize-4);
+ iv2[blocksize-1] = 1;
+ iv2[blocksize-2] = 0;
+ iv2[blocksize-3] = 0;
+ iv2[blocksize-4] = 0;
+
+ for (i = 0; i < blocksize * nblocks; i++)
+ plaintext2[i] = plaintext[i] = i;
+
+ /* Create CTR ciphertext manually. */
+ for (i = 0; i < blocksize * nblocks; i+=blocksize)
+ {
+ err = gcry_cipher_encrypt (hd_one, &ciphertext[i], blocksize,
+ iv, blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (ECB encrypt fail)",
+ cipher, blocksize * 8);
+ return -1;
+ }
+ for (j = 0; j < blocksize; j++)
+ ciphertext[i+j] ^= plaintext[i+j];
+ for (j = blocksize; j > 0; j--)
+ {
+ iv[j-1]++;
+ if (iv[j-1])
+ break;
+ }
+ }
+
+ err = gcry_cipher_setctr (hd_ctr, iv2, blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (setiv fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ err = gcry_cipher_encrypt (hd_ctr, ciphertext2, blocksize * nblocks,
+ plaintext2, blocksize * nblocks);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (CTR encrypt fail)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ if (memcmp (ciphertext2, ciphertext, blocksize * nblocks))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (ciphertext mismatch, bulk)",
+ cipher, blocksize * 8);
+ return -1;
+ }
+#if 0
+ if (memcmp (iv2, iv, blocksize))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (IV mismatch, bulk)", cipher, blocksize * 8);
+ return -1;
+ }
+#endif
+
+ /* Test parallelized code paths (check counter overflow handling) */
+ for (diff = 0; diff < nblocks; diff++) {
+ memset(iv, 0xff, blocksize);
+ iv[blocksize-1] -= diff;
+ iv[0] = iv[1] = 0;
+ iv[2] = 0x07;
+
+ for (i = 0; i < blocksize * nblocks; i++)
+ plaintext[i] = i;
+
+ /* Create CTR ciphertext manually. */
+ for (i = 0; i < blocksize * nblocks; i+=blocksize)
+ {
+ err = gcry_cipher_encrypt (hd_one, &ciphertext[i], blocksize,
+ iv, blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (ECB encrypt fail)",
+ cipher, blocksize * 8);
+ return -1;
+ }
+ for (j = 0; j < blocksize; j++)
+ ciphertext[i+j] ^= plaintext[i+j];
+ for (j = blocksize; j > 0; j--)
+ {
+ iv[j-1]++;
+ if (iv[j-1])
+ break;
+ }
+ }
+
+ /* Decrypt using bulk CTR and compare result. */
+ memset(iv2, 0xff, blocksize);
+ iv2[blocksize-1] -= diff;
+ iv2[0] = iv2[1] = 0;
+ iv2[2] = 0x07;
+
+ err = gcry_cipher_setctr (hd_ctr, iv2, blocksize);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+      fail ("%s-CTR-%d test failed (setctr fail)", cipher, blocksize * 8);
+ return -1;
+ }
+ err = gcry_cipher_decrypt (hd_ctr, plaintext2, blocksize * nblocks,
+ ciphertext, blocksize * nblocks);
+ if (err)
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (CTR decrypt fail)", cipher, blocksize * 8);
+ return -1;
+ }
+
+ if (memcmp (plaintext2, plaintext, blocksize * nblocks))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (plaintext mismatch, diff: %d)",
+ cipher, blocksize * 8, diff);
+ return -1;
+ }
+#if 0
+ if (memcmp(iv2, iv, blocksize))
+ {
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+ fail ("%s-CTR-%d test failed (IV mismatch, diff: %d)",
+ cipher, blocksize * 8, diff);
+ return -1;
+ }
+#endif
+ }
+
+ gcry_cipher_close (hd_one);
+ gcry_cipher_close (hd_ctr);
+ xfree(mem);
+  return 0;
+}
+
+
+
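/* The function above builds a CTR reference stream by hand: each counter
   block is encrypted with a one-block ECB handle, the result is XORed into
   the plaintext, and the counter is incremented as a big-endian integer.
   A minimal standalone sketch of that reference construction (illustrative
   only, assuming AES-128, a 16-byte block and <gcrypt.h>):  */
static gcry_error_t
ctr_reference (const unsigned char key[16], unsigned char ctr[16],
               const unsigned char *in, unsigned char *out, size_t nblocks)
{
  gcry_cipher_hd_t ecb;
  gcry_error_t err;
  size_t n, i;

  err = gcry_cipher_open (&ecb, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_ECB, 0);
  if (err)
    return err;
  err = gcry_cipher_setkey (ecb, key, 16);

  for (n = 0; !err && n < nblocks; n++)
    {
      unsigned char ks[16];

      /* Keystream block = ECB(counter).  */
      err = gcry_cipher_encrypt (ecb, ks, 16, ctr, 16);
      if (err)
        break;
      /* Ciphertext = plaintext XOR keystream.  */
      for (i = 0; i < 16; i++)
        out[n*16 + i] = in[n*16 + i] ^ ks[i];
      /* Increment the counter as a 128-bit big-endian integer.  */
      for (i = 16; i > 0; i--)
        if (++ctr[i-1])
          break;
    }

  gcry_cipher_close (ecb);
  return err;
}
/* The bulk test compares such a reference against gcry_cipher's own CTR
   mode over many block counts and near-overflow counters, so that the
   parallelized bulk code paths are exercised.  */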
static void
check_ciphers (void)
{
@@ -11770,6 +12558,13 @@ check_ciphers (void)
check_one_cipher (algos[i], GCRY_CIPHER_MODE_OCB, 0);
if (gcry_cipher_get_algo_blklen (algos[i]) == GCRY_XTS_BLOCK_LEN)
check_one_cipher (algos[i], GCRY_CIPHER_MODE_XTS, 0);
+
+ if (gcry_cipher_get_algo_blklen (algos[i]) >= 8)
+ {
+ cipher_cbc_bulk_test (algos[i]);
+ cipher_cfb_bulk_test (algos[i]);
+ cipher_ctr_bulk_test (algos[i]);
+ }
}
for (i = 0; algos2[i]; i++)
@@ -15558,7 +16353,7 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo,
int unpadded;
int encrypt_expected_rc;
int decrypt_expected_rc;
- int special;
+ int flags;
} datas[] =
{
{ GCRY_PK_RSA,
@@ -15567,14 +16362,16 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo,
NULL,
0,
0,
- 0 },
+ 0,
+ FLAG_NOFIPS },
{ GCRY_PK_RSA,
"(data\n (flags pkcs1)\n"
" (value #11223344556677889900AA#))\n",
"(flags pkcs1)",
1,
0,
- 0 },
+ 0,
+ FLAG_NOFIPS },
{ GCRY_PK_RSA,
"(data\n (flags oaep)\n"
" (value #11223344556677889900AA#))\n",
@@ -15642,14 +16439,14 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo,
"(flags oaep)",
1,
0,
- GPG_ERR_ENCODING_PROBLEM, 1 },
+ GPG_ERR_ENCODING_PROBLEM, FLAG_SPECIAL },
{ GCRY_PK_RSA,
"(data\n (flags oaep)\n"
" (value #11223344556677889900AA#))\n",
"(flags pkcs1)",
1,
0,
- GPG_ERR_ENCODING_PROBLEM, 1 },
+ GPG_ERR_ENCODING_PROBLEM, FLAG_SPECIAL },
{ 0,
"(data\n (flags pss)\n"
" (value #11223344556677889900AA#))\n",
@@ -15676,7 +16473,8 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo,
die ("converting data failed: %s\n", gpg_strerror (rc));
rc = gcry_pk_encrypt (&ciph, data, pkey);
- if (in_fips_mode && (flags & FLAG_NOFIPS))
+ if (in_fips_mode && ((flags & FLAG_NOFIPS) ||
+ (datas[dataidx].flags & FLAG_NOFIPS)))
{
if (!rc)
fail ("gcry_pk_encrypt did not fail as expected in FIPS mode\n");
@@ -15725,7 +16523,7 @@ check_pubkey_crypt (int n, gcry_sexp_t skey, gcry_sexp_t pkey, int algo,
ciph = list;
}
rc = gcry_pk_decrypt (&plain, ciph, skey);
- if (!rc && datas[dataidx].special == 1)
+ if ((!rc || in_fips_mode) && (datas[dataidx].flags & FLAG_SPECIAL))
{
/* It may happen that OAEP formatted data which is
decrypted as pkcs#1 data returns a valid pkcs#1
diff --git a/tests/bench-slope.c b/tests/bench-slope.c
index 5c49ac25..aaddaa85 100644
--- a/tests/bench-slope.c
+++ b/tests/bench-slope.c
@@ -2629,7 +2629,9 @@ bench_ecc_init (struct bench_obj *obj)
switch (oper->algo)
{
default:
- return -1;
+ gcry_mpi_release (x);
+ free (hd);
+ return -1;
case ECC_ALGO_ED25519:
err = gcry_sexp_build (&hd->key_spec, NULL,
diff --git a/tests/pkcs1v2.c b/tests/pkcs1v2.c
index f26e779b..6c7f3d81 100644
--- a/tests/pkcs1v2.c
+++ b/tests/pkcs1v2.c
@@ -454,7 +454,19 @@ check_v15crypt (void)
gcry_free (seed);
err = gcry_pk_encrypt (&ciph, plain, pub_key);
- if (err)
+ if (in_fips_mode)
+ {
+ if (!err)
+ {
+ fail ("gcry_pk_encrypt should have failed in FIPS mode:\n");
+ }
+ gcry_sexp_release (plain);
+ plain = NULL;
+ gcry_sexp_release (ciph);
+ ciph = NULL;
+ continue;
+ }
+ else if (err)
{
show_sexp ("plain:\n", ciph);
fail ("gcry_pk_encrypt failed: %s\n", gpg_strerror (err));
diff --git a/tests/t-cv25519.c b/tests/t-cv25519.c
index ec1472a9..14a6719b 100644
--- a/tests/t-cv25519.c
+++ b/tests/t-cv25519.c
@@ -348,7 +348,7 @@ test_it (int testno, const char *k_str, int iter, const char *result_str)
gcry_mpi_t mpi_k = NULL;
gcry_mpi_t mpi_x = NULL;
gcry_mpi_point_t P = NULL;
- gcry_mpi_point_t Q;
+ gcry_mpi_point_t Q = NULL;
int i;
gcry_mpi_t mpi_kk = NULL;
@@ -363,7 +363,7 @@ test_it (int testno, const char *k_str, int iter, const char *result_str)
testno);
if (verbose > 1)
info ("not executed in FIPS mode\n");
- return;
+ goto leave;
}
Q = gcry_mpi_point_new (0);
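/* The t-cv25519.c change above follows a common cleanup idiom: every
   resource pointer starts out as NULL and all exit paths jump to a single
   leave label, so the release functions (which accept NULL) can run
   unconditionally.  A minimal sketch of the idiom (illustrative only;
   example_precondition is a hypothetical check):  */
static void
example_goto_cleanup (void)
{
  gcry_mpi_t k = NULL;
  gcry_mpi_point_t Q = NULL;

  if (example_precondition ())
    goto leave;                 /* Safe: nothing has been allocated yet.  */

  k = gcry_mpi_new (0);
  Q = gcry_mpi_point_new (0);
  /* ... use k and Q ... */

 leave:
  gcry_mpi_point_release (Q);   /* Both release functions ignore NULL.  */
  gcry_mpi_release (k);
}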
diff --git a/tests/t-dsa.c b/tests/t-dsa.c
index 965b7cf2..8ed3b65b 100644
--- a/tests/t-dsa.c
+++ b/tests/t-dsa.c
@@ -168,7 +168,10 @@ hex2buffer (const char *string, size_t *r_length)
for (s=string; *s; s +=2 )
{
if (!hexdigitp (s) || !hexdigitp (s+1))
- return NULL; /* Invalid hex digits. */
+ {
+ xfree (buffer);
+ return NULL; /* Invalid hex digits. */
+ }
((unsigned char*)buffer)[length++] = xtoi_2 (s);
}
*r_length = length;
diff --git a/tests/t-ecdsa.c b/tests/t-ecdsa.c
index 9a0773b7..fa0a2ef9 100644
--- a/tests/t-ecdsa.c
+++ b/tests/t-ecdsa.c
@@ -195,7 +195,10 @@ hex2buffer (const char *string, size_t *r_length)
for (; *s; s +=2 )
{
if (!hexdigitp (s) || !hexdigitp (s+1))
- return NULL; /* Invalid hex digits. */
+ {
+ xfree (buffer);
+ return NULL; /* Invalid hex digits. */
+ }
buffer[length++] = xtoi_2 (s);
}
*r_length = length;
@@ -483,6 +486,7 @@ one_test_sexp (const char *curvename, const char *sha_alg,
xfree (out_s);
xfree (sig_r_string);
xfree (sig_s_string);
+ xfree (pkbuffer);
}
diff --git a/tests/t-kdf.c b/tests/t-kdf.c
index 234bbac6..d10a0e34 100644
--- a/tests/t-kdf.c
+++ b/tests/t-kdf.c
@@ -1603,6 +1603,135 @@ check_balloon (void)
}
+static void
+check_onestep_kdf (void)
+{
+ gcry_error_t err;
+ const unsigned long param[4] = { 38, 68, 44, 56 };
+ unsigned char out[68];
+ const unsigned char input[4][16] = {
+ {
+ 0x3f, 0x89, 0x2b, 0xd8, 0xb8, 0x4d, 0xae, 0x64,
+ 0xa7, 0x82, 0xa3, 0x5f, 0x6e, 0xaa, 0x8f, 0x00
+ },
+ {
+ 0xe6, 0x5b, 0x19, 0x05, 0x87, 0x8b, 0x95, 0xf6,
+ 0x8b, 0x55, 0x35, 0xbd, 0x3b, 0x2b, 0x10, 0x13
+ },
+ {
+ 0x02, 0xb4, 0x0d, 0x33, 0xe3, 0xf6, 0x85, 0xae,
+ 0xae, 0x67, 0x7a, 0xc3, 0x44, 0xee, 0xaf, 0x77
+ },
+ {
+ 0x8e, 0x5c, 0xd5, 0xf6, 0xae, 0x55, 0x8f, 0xfa,
+ 0x04, 0xcd, 0xa2, 0xfa, 0xd9, 0x4d, 0xd6, 0x16
+ }
+ };
+ const unsigned char other[4][12] = {
+ {
+ 0xec, 0x3f, 0x1c, 0xd8, 0x73, 0xd2, 0x88, 0x58,
+ 0xa5, 0x8c, 0xc3, 0x9e
+ },
+ {
+ 0x83, 0x02, 0x21, 0xb1, 0x73, 0x0d, 0x91, 0x76,
+ 0xf8, 0x07, 0xd4, 0x07
+ },
+ {
+ 0xc6, 0x7c, 0x38, 0x95, 0x80, 0x12, 0x8f, 0x18,
+ 0xf6, 0xcf, 0x85, 0x92
+ },
+ {
+ 0x4a, 0x43, 0x30, 0x18, 0xe5, 0x1c, 0x09, 0xbb,
+ 0xd6, 0x13, 0x26, 0xbb
+ }
+ };
+ const unsigned char key0[16] = {
+ 0x0a, 0xd5, 0x2c, 0x93, 0x57, 0xc8, 0x5e, 0x47,
+ 0x81, 0x29, 0x6a, 0x36, 0xca, 0x72, 0x03, 0x9c
+ };
+ const unsigned char key1[16] = {
+ 0x6e, 0xd9, 0x3b, 0x6f, 0xe5, 0xb3, 0x50, 0x2b,
+ 0xb4, 0x2b, 0x4c, 0x0f, 0xcb, 0x13, 0x36, 0x62
+ };
+ const unsigned char *key[4] = {
+ NULL, NULL, key0, key1
+ };
+ const unsigned char expected[4][68] = {
+ {
+ 0xa7, 0xc0, 0x66, 0x52, 0x98, 0x25, 0x25, 0x31,
+ 0xe0, 0xdb, 0x37, 0x73, 0x7a, 0x37, 0x46, 0x51,
+ 0xb3, 0x68, 0x27, 0x5f, 0x20, 0x48, 0x28, 0x4d,
+ 0x16, 0xa1, 0x66, 0xc6, 0xd8, 0xa9, 0x0a, 0x91,
+ 0xa4, 0x91, 0xc1, 0x6f, 0x49, 0x64
+ },
+ {
+ 0xb8, 0xc4, 0x4b, 0xdf, 0x0b, 0x85, 0xa6, 0x4b,
+ 0x6a, 0x51, 0xc1, 0x2a, 0x06, 0x71, 0x0e, 0x37,
+ 0x3d, 0x82, 0x9b, 0xb1, 0xfd, 0xa5, 0xb4, 0xe1,
+ 0xa2, 0x07, 0x95, 0xc6, 0x19, 0x95, 0x94, 0xf6,
+ 0xfa, 0x65, 0x19, 0x8a, 0x72, 0x12, 0x57, 0xf7,
+ 0xd5, 0x8c, 0xb2, 0xf6, 0xf6, 0xdb, 0x9b, 0xb5,
+ 0x69, 0x9f, 0x73, 0x86, 0x30, 0x45, 0x90, 0x90,
+ 0x54, 0xb2, 0x38, 0x9e, 0x06, 0xec, 0x00, 0xfe,
+ 0x31, 0x8c, 0xab, 0xd9
+ },
+ {
+ 0xbe, 0x32, 0xe7, 0xd3, 0x06, 0xd8, 0x91, 0x02,
+ 0x8b, 0xe0, 0x88, 0xf2, 0x13, 0xf9, 0xf9, 0x47,
+ 0xc5, 0x04, 0x20, 0xd9, 0xb5, 0xa1, 0x2c, 0xa6,
+ 0x98, 0x18, 0xdd, 0x99, 0x95, 0xde, 0xdd, 0x8e,
+ 0x61, 0x37, 0xc7, 0x10, 0x4d, 0x67, 0xf2, 0xca,
+ 0x90, 0x91, 0x5d, 0xda
+ },
+ {
+ 0x29, 0x5d, 0xfb, 0xeb, 0x54, 0xec, 0x0f, 0xe2,
+ 0x4e, 0xce, 0x32, 0xf5, 0xb8, 0x7c, 0x85, 0x3e,
+ 0x69, 0x9a, 0x62, 0xe3, 0x9d, 0x9c, 0x9e, 0xe6,
+ 0xee, 0x78, 0xf8, 0xb9, 0xa0, 0xee, 0x50, 0xa3,
+ 0x6a, 0x82, 0xe6, 0x06, 0x2c, 0x95, 0xed, 0x53,
+ 0xbc, 0x36, 0x67, 0x00, 0xe2, 0xd0, 0xe0, 0x93,
+ 0xbf, 0x75, 0x2e, 0xea, 0x42, 0x99, 0x47, 0x2e
+ }
+ };
+ int i;
+ int algo[4] = {
+ GCRY_MD_SHA256, GCRY_MD_SHA512,
+ GCRY_MAC_HMAC_SHA256, GCRY_MAC_HMAC_SHA512,
+ };
+ int count = 0;
+
+ again:
+
+ if (verbose)
+ fprintf (stderr, "checking OneStepKDF test vector %d\n", count);
+
+ err = my_kdf_derive (0,
+ count < 2 ? GCRY_KDF_ONESTEP_KDF
+ : GCRY_KDF_ONESTEP_KDF_MAC,
+ algo[count], &param[count], 1,
+ input[count], 16, NULL, 0,
+ key[count],
+ key[count] == NULL? 0 : 16,
+ other[count], 12,
+ param[count], out);
+ if (err)
+ fail ("OneStepKDF test %d failed: %s\n", count, gpg_strerror (err));
+ else if (memcmp (out, expected[count], param[count]))
+ {
+ fail ("OneStepKDF test %d failed: mismatch\n", count*2+0);
+ fputs ("got:", stderr);
+ for (i=0; i < param[count]; i++)
+ fprintf (stderr, " %02x", out[i]);
+ putc ('\n', stderr);
+ }
+
+ /* Next test vector */
+ count++;
+ if (count < 4)
+ goto again;
+}
+
+
int
main (int argc, char **argv)
{
@@ -1681,6 +1810,7 @@ main (int argc, char **argv)
check_scrypt ();
check_argon2 ();
check_balloon ();
+ check_onestep_kdf ();
}
return error_count ? 1 : 0;
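/* check_onestep_kdf above exercises GCRY_KDF_ONESTEP_KDF with NIST-style
   vectors (shared secret Z, FixedInfo, optional key for the MAC variants).
   The hash-based variant of the One-Step KDF from NIST SP 800-56C derives
   output as H(counter_be32 || Z || FixedInfo), with a 32-bit big-endian
   counter starting at 1 and one hash call per output block.  A rough
   reference sketch of that construction -- an assumption about the
   underlying algorithm for illustration, not libgcrypt's implementation,
   and covering only the keyless hash variant:  */
static gcry_error_t
onestep_kdf_hash (int hashalgo,
                  const void *z, size_t zlen,
                  const void *fixedinfo, size_t filen,
                  unsigned char *out, size_t outlen)
{
  size_t dlen = gcry_md_get_algo_dlen (hashalgo);
  unsigned char counter[4];
  gcry_md_hd_t md;
  gcry_error_t err;
  unsigned int i;
  size_t off = 0;

  err = gcry_md_open (&md, hashalgo, 0);
  if (err)
    return err;

  for (i = 1; off < outlen; i++)
    {
      size_t n = (outlen - off < dlen) ? (outlen - off) : dlen;

      /* Counter is encoded as a 4-byte big-endian integer.  */
      counter[0] = i >> 24;
      counter[1] = i >> 16;
      counter[2] = i >> 8;
      counter[3] = i;
      gcry_md_reset (md);
      gcry_md_write (md, counter, 4);
      gcry_md_write (md, z, zlen);
      gcry_md_write (md, fixedinfo, filen);
      memcpy (out + off, gcry_md_read (md, hashalgo), n);
      off += n;
    }

  gcry_md_close (md);
  return 0;
}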
diff --git a/tests/t-mpi-point.c b/tests/t-mpi-point.c
index 72d7fa9b..0310fe11 100644
--- a/tests/t-mpi-point.c
+++ b/tests/t-mpi-point.c
@@ -3504,8 +3504,8 @@ check_ec_mul (void)
};
gpg_error_t err;
gcry_ctx_t ctx;
- gcry_mpi_t k, x, y;
- gcry_mpi_point_t G, Q;
+ gcry_mpi_t k = NULL, x = NULL, y = NULL;
+ gcry_mpi_point_t G = NULL, Q = NULL;
int idx;
for (idx = 0; tv[idx].curve; idx++)
@@ -3544,7 +3544,7 @@ check_ec_mul (void)
{
fail ("tv[%d].'%s': error getting point parameter 'g'\n",
idx, tv[idx].curve);
- return;
+ goto err;
}
if (tv[idx].k_base10)
@@ -3562,7 +3562,7 @@ check_ec_mul (void)
{
fail ("tv[%d].'%s': failed to get affine coordinates\n",
idx, tv[idx].curve);
- return;
+ goto err;
}
if (cmp_mpihex (x, tv[idx].qx) || cmp_mpihex (y, tv[idx].qy))
@@ -3576,6 +3576,7 @@ check_ec_mul (void)
printf ("expected Qy: %s\n", tv[idx].qy);
}
+err:
gcry_mpi_release (k);
gcry_mpi_release (y);
gcry_mpi_release (x);
@@ -4368,7 +4369,7 @@ check_ec_mul_reduction (void)
{
fail ("tv[%d].'%s': failed to get affine coordinates\n",
idx, tv[idx].curve);
- return;
+ goto out;
}
if ((tv[idx].qx != NULL && tv[idx].qy != NULL)
@@ -4383,6 +4384,7 @@ check_ec_mul_reduction (void)
printf ("expected Qy: %s\n", tv[idx].qy);
}
+out:
gcry_mpi_release (uy);
gcry_mpi_release (ux);
gcry_mpi_release (uz);
diff --git a/tests/t-rsa-15.c b/tests/t-rsa-15.c
index 67dbd2cc..65e74961 100644
--- a/tests/t-rsa-15.c
+++ b/tests/t-rsa-15.c
@@ -144,7 +144,10 @@ hex2buffer (const char *string, size_t *r_length)
for (s=string; *s; s +=2 )
{
if (!hexdigitp (s) || !hexdigitp (s+1))
- return NULL; /* Invalid hex digits. */
+ {
+ xfree (buffer);
+ return NULL; /* Invalid hex digits. */
+ }
((unsigned char*)buffer)[length++] = xtoi_2 (s);
}
*r_length = length;
diff --git a/tests/t-rsa-pss.c b/tests/t-rsa-pss.c
index fa8392e9..c5f90116 100644
--- a/tests/t-rsa-pss.c
+++ b/tests/t-rsa-pss.c
@@ -144,7 +144,10 @@ hex2buffer (const char *string, size_t *r_length)
for (s=string; *s; s +=2 )
{
if (!hexdigitp (s) || !hexdigitp (s+1))
- return NULL; /* Invalid hex digits. */
+ {
+ xfree (buffer);
+ return NULL; /* Invalid hex digits. */
+ }
((unsigned char*)buffer)[length++] = xtoi_2 (s);
}
*r_length = length;
diff --git a/tests/t-x448.c b/tests/t-x448.c
index f5f49e5e..bfc22fc0 100644
--- a/tests/t-x448.c
+++ b/tests/t-x448.c
@@ -324,7 +324,7 @@ test_it (int testno, const char *k_str, int iter, const char *result_str)
gcry_mpi_t mpi_k = NULL;
gcry_mpi_t mpi_x = NULL;
gcry_mpi_point_t P = NULL;
- gcry_mpi_point_t Q;
+ gcry_mpi_point_t Q = NULL;
int i;
gcry_mpi_t mpi_kk = NULL;
@@ -339,7 +339,7 @@ test_it (int testno, const char *k_str, int iter, const char *result_str)
testno);
if (verbose > 1)
info ("not executed in FIPS mode\n");
- return;
+ goto leave;
}
Q = gcry_mpi_point_new (0);
diff --git a/tests/testdrv.c b/tests/testdrv.c
index 816eae0a..0ccde326 100644
--- a/tests/testdrv.c
+++ b/tests/testdrv.c
@@ -532,6 +532,7 @@ my_spawn (const char *pgmname, char **argv, char **envp, MYPID_T *pid)
*pid = fork ();
if (*pid == MYINVALID_PID)
{
+ xfree (arg_list);
fail ("error forking process: %s\n", strerror (errno));
return -1;
}
@@ -546,17 +547,29 @@ my_spawn (const char *pgmname, char **argv, char **envp, MYPID_T *pid)
/* Assign /dev/null to stdin. */
fd = open ("/dev/null", O_RDONLY);
if (fd == -1)
- die ("failed to open '%s': %s\n", "/dev/null", strerror (errno));
+ {
+ xfree (arg_list);
+ die ("failed to open '%s': %s\n", "/dev/null", strerror (errno));
+ }
if (fd != 0 && dup2 (fd, 0) == -1)
- die ("dup2(%d,0) failed: %s\n", fd, strerror (errno));
+ {
+ xfree (arg_list);
+ die ("dup2(%d,0) failed: %s\n", fd, strerror (errno));
+ }
/* Assign /dev/null to stdout unless in verbose mode. */
if (!verbose)
{
fd = open ("/dev/null", O_RDONLY);
if (fd == -1)
- die ("failed to open '%s': %s\n", "/dev/null", strerror (errno));
+ {
+ xfree (arg_list);
+ die ("failed to open '%s': %s\n", "/dev/null", strerror (errno));
+ }
if (fd != 1 && dup2 (fd, 1) == -1)
- die ("dup2(%d,1) failed: %s\n", fd, strerror (errno));
+ {
+ xfree (arg_list);
+ die ("dup2(%d,1) failed: %s\n", fd, strerror (errno));
+ }
}
/* Exec the program. */