diff options
author | Jussi Kivilinna <[email protected]> | 2020-12-30 17:46:07 +0200 |
---|---|---|
committer | Jussi Kivilinna <[email protected]> | 2020-12-30 17:46:07 +0200 |
commit | 1f75681cbba895ea2f7ea0637900721f4522e729 (patch) | |
tree | 19eb7a48b5513f9f5811b1e515a3d4c8e637641c | |
parent | 6a0bb9ab7f886087d7edb0725c90485086a1c0b4 (diff) | |
download | libgcrypt-cipher-s390x-optimizations.tar.gz libgcrypt-cipher-s390x-optimizations.tar.bz2 libgcrypt-cipher-s390x-optimizations.zip |
Add s390x/zSeries implementation of Poly1305 (cipher-s390x-optimizations)
* cipher/Makefile.am: Add 'poly1305-s390x.S' and
'asm-poly1305-s390x.h'.
* cipher/asm-poly1305-s390x.h: New.
* cipher/chacha20-s390x.S (_gcry_chacha20_poly1305_s390x_vx_blocks8)
(_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1): New, stitched
chacha20-poly1305 implementation.
* cipher/chacha20.c (USE_S390X_VX_POLY1305): New.
(_gcry_chacha20_poly1305_s390x_vx_blocks8)
(_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1): New prototypes.
(_gcry_chacha20_poly1305_encrypt, _gcry_chacha20_poly1305_decrypt): Add
s390x/VX stitched chacha20-poly1305 code-path.
* cipher/poly1305-s390x.S: New.
* cipher/poly1305.c (USE_S390X_ASM, HAVE_ASM_POLY1305_BLOCKS): New.
[USE_S390X_ASM] (_gcry_poly1305_s390x_blocks1, poly1305_blocks): New.
* configure.ac (gcry_cv_gcc_inline_asm_s390x): Check for 'risbgn' and
'algrk' instructions.
* tests/basic.c (_check_poly1305_cipher): Add large chacha20-poly1305
test vector.
--
Patch adds Poly1305 and stitched ChaCha20-Poly1305 implementation
for zSeries. Stitched implementation interleaves ChaCha20 and Poly1305
processing for higher instruction level parallelism and better
utilization of execution units.
Benchmark on z15 (4504 MHz):
Before:
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
POLY1305 enc | 1.16 ns/B 823.2 MiB/s 5.22 c/B
POLY1305 dec | 1.16 ns/B 823.2 MiB/s 5.22 c/B
POLY1305 auth | 0.736 ns/B 1295 MiB/s 3.32 c/B
After (chacha20-poly1305 ~71% faster, poly1305 ~29% faster):
CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte
POLY1305 enc | 0.677 ns/B 1409 MiB/s 3.05 c/B
POLY1305 dec | 0.655 ns/B 1456 MiB/s 2.95 c/B
POLY1305 auth | 0.569 ns/B 1675 MiB/s 2.56 c/B
GnuPG-bug-id: 5202
Signed-off-by: Jussi Kivilinna <[email protected]>
-rw-r--r-- | cipher/Makefile.am | 2 | ||||
-rw-r--r-- | cipher/asm-poly1305-s390x.h | 140 | ||||
-rw-r--r-- | cipher/chacha20-s390x.S | 673 | ||||
-rw-r--r-- | cipher/chacha20.c | 126 | ||||
-rw-r--r-- | cipher/poly1305-s390x.S | 87 | ||||
-rw-r--r-- | cipher/poly1305.c | 40 | ||||
-rw-r--r-- | configure.ac | 8 | ||||
-rw-r--r-- | tests/basic.c | 138 |
8 files changed, 1213 insertions, 1 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 3234bcb2..6727b8b1 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -60,6 +60,7 @@ libcipher_la_SOURCES = \ mac.c mac-internal.h \ mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \ poly1305.c poly1305-internal.h \ + poly1305-s390x.S \ kdf.c kdf-internal.h \ bithelp.h \ bufhelp.h \ @@ -75,6 +76,7 @@ EXTRA_libcipher_la_SOURCES = \ asm-inline-s390x.h \ asm-poly1305-aarch64.h \ asm-poly1305-amd64.h \ + asm-poly1305-s390x.h \ arcfour.c arcfour-amd64.S \ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ diff --git a/cipher/asm-poly1305-s390x.h b/cipher/asm-poly1305-s390x.h new file mode 100644 index 00000000..113ab949 --- /dev/null +++ b/cipher/asm-poly1305-s390x.h @@ -0,0 +1,140 @@ +/* asm-poly1305-s390x.h - Poly1305 macros for zSeries assembly + * + * Copyright (C) 2020 Jussi Kivilinna <[email protected]> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef GCRY_ASM_POLY1305_S390X_H +#define GCRY_ASM_POLY1305_S390X_H + +#include "asm-common-s390x.h" + +/********************************************************************** + poly1305 for stitched chacha20-poly1305 + **********************************************************************/ + +#define POLY_RSTATE %r1 +#define POLY_RSRC %r14 + +#define POLY_R_H0_TMP_HI %r6 // even- +#define POLY_R_H0 %r7 // odd pair +#define POLY_R_H1_TMP_HI %r8 // even- +#define POLY_R_H1 %r9 // odd pair +#define POLY_R_H2 %r10 +#define POLY_R_R0 %r11 +#define POLY_R_R1 %r12 +#define POLY_R_R1_MUL5 %r13 +#define POLY_R_X0_HI %r2 // even- +#define POLY_R_X0_LO %r3 // odd pair +#define POLY_R_X1_HI %r4 // even- +#define POLY_R_X1_LO %r5 // odd pair + +#define POLY_S_R0 (4 * 4 + 0 * 8)(POLY_RSTATE) +#define POLY_S_R1 (4 * 4 + 1 * 8)(POLY_RSTATE) +#define POLY_S_H0 (4 * 4 + 2 * 8 + 0 * 8)(POLY_RSTATE) +#define POLY_S_H1 (4 * 4 + 2 * 8 + 1 * 8)(POLY_RSTATE) +#define POLY_S_H2d (4 * 4 + 2 * 8 + 2 * 8)(POLY_RSTATE) + +#define INC_POLY1305_SRC(a) \ + aghi POLY_RSRC, (a); + +#define POLY1305_LOAD_STATE() \ + lg POLY_R_H0, POLY_S_H0; \ + lg POLY_R_H1, POLY_S_H1; \ + llgf POLY_R_H2, POLY_S_H2d; \ + rllg POLY_R_H0, POLY_R_H0, 32; \ + rllg POLY_R_H1, POLY_R_H1, 32; \ + lg POLY_R_R0, POLY_S_R0; \ + lg POLY_R_R1, POLY_S_R1; \ + rllg POLY_R_R0, POLY_R_R0, 32; \ + rllg POLY_R_R1, POLY_R_R1, 32; \ + srlg POLY_R_R1_MUL5, POLY_R_R1, 2; \ + algr POLY_R_R1_MUL5, POLY_R_R1; + +#define POLY1305_STORE_STATE() \ + rllg POLY_R_H0, POLY_R_H0, 32; \ + rllg POLY_R_H1, POLY_R_H1, 32; \ + stg POLY_R_H0, POLY_S_H0; \ + stg POLY_R_H1, POLY_S_H1; \ + st POLY_R_H2, POLY_S_H2d; + +/* a = h + m */ +#define POLY1305_BLOCK_PART1_HB(src_offset, high_pad) \ + lrvg POLY_R_X0_HI, ((src_offset) + 1 * 8)(POLY_RSRC); \ + lrvg POLY_R_X0_LO, ((src_offset) + 0 * 8)(POLY_RSRC); \ + lghi POLY_R_H1_TMP_HI, (high_pad); + +#define POLY1305_BLOCK_PART1(src_offset) \ + POLY1305_BLOCK_PART1_HB(src_offset, 1); + +#define 
POLY1305_BLOCK_PART2() \ + algr POLY_R_H0, POLY_R_X0_LO; \ + alcgr POLY_R_H1, POLY_R_X0_HI; \ + alcgr POLY_R_H2, POLY_R_H1_TMP_HI; \ + lgr POLY_R_X1_LO, POLY_R_H0; \ + lgr POLY_R_X0_LO, POLY_R_H0; + +#define POLY1305_BLOCK_PART3() \ + /* h = a * r (partial mod 2^130-5): */ \ + \ + /* h0 * r1 */ \ + mlgr POLY_R_X1_HI, POLY_R_R1; \ + \ + /* h1 * r0 */ \ + lgr POLY_R_H0, POLY_R_H1; \ + mlgr POLY_R_H0_TMP_HI, POLY_R_R0; \ + \ + /* h1 * r1 mod 2^130-5 */ \ + mlgr POLY_R_H1_TMP_HI, POLY_R_R1_MUL5; + +#define POLY1305_BLOCK_PART4() \ + \ + /* h0 * r0 */ \ + mlgr POLY_R_X0_HI, POLY_R_R0; \ + \ + algr POLY_R_X1_LO, POLY_R_H0; \ + alcgr POLY_R_X1_HI, POLY_R_H0_TMP_HI; \ + \ + lgr POLY_R_H0_TMP_HI, POLY_R_H2; \ + msgr POLY_R_H0_TMP_HI, POLY_R_R1_MUL5; /* h2 * r1 mod 2^130-5 */ \ + msgr POLY_R_H2, POLY_R_R0; /* h2 * r0 */ + +#define POLY1305_BLOCK_PART5() \ + \ + algr POLY_R_X0_LO, POLY_R_H1; \ + alcgr POLY_R_X0_HI, POLY_R_H1_TMP_HI; + +#define POLY1305_BLOCK_PART6() \ + \ + algrk POLY_R_H1, POLY_R_H0_TMP_HI, POLY_R_X1_LO; \ + alcgr POLY_R_H2, POLY_R_X1_HI; + +#define POLY1305_BLOCK_PART7() \ + \ + /* carry propagation */ \ + srlg POLY_R_H0, POLY_R_H2, 2; \ + risbgn POLY_R_X1_LO, POLY_R_H2, 0, 0x80 | 61, 0; \ + lghi POLY_R_H1_TMP_HI, 0; \ + agr POLY_R_H0, POLY_R_X1_LO; \ + risbgn POLY_R_H2, POLY_R_H2, 62, 0x80 | 63, 0; + +#define POLY1305_BLOCK_PART8() \ + algr POLY_R_H0, POLY_R_X0_LO; \ + alcgr POLY_R_H1, POLY_R_X0_HI; \ + alcgr POLY_R_H2, POLY_R_H1_TMP_HI; + +#endif /* GCRY_ASM_POLY1305_S390X_H */ diff --git a/cipher/chacha20-s390x.S b/cipher/chacha20-s390x.S index 2cd38330..9b1d59c6 100644 --- a/cipher/chacha20-s390x.S +++ b/cipher/chacha20-s390x.S @@ -23,6 +23,7 @@ #if defined(HAVE_GCC_INLINE_ASM_S390X_VX) #include "asm-common-s390x.h" +#include "asm-poly1305-s390x.h" .machine "z13+vx" .text @@ -574,6 +575,393 @@ ELF(.size _gcry_chacha20_s390x_vx_blocks4_2_1, .-_gcry_chacha20_s390x_vx_blocks4_2_1;) /********************************************************************** + 
4-way && 2-way && 1-way stitched chacha20-poly1305 ("horizontal") + **********************************************************************/ + +.balign 8 +.globl _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1 +ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1,@function;) + +_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1: + /* input: + * %r2: input + * %r3: dst + * %r4: src + * %r5: nblks + * %r6: poly1305 state + * 160(%r15): poly1305 src + */ + CFI_STARTPROC(); + + START_STACK(%r14); + lgr NBLKS, %r5; + + /* Load constants. */ + larl %r8, .Lconsts; + vl TMP0, (.Lwordswap - .Lconsts)(%r8); + vl TMP1, (.Lone - .Lconsts)(%r8); + vl TMP2, (.Lbswap128 - .Lconsts)(%r8); + + /* Load state. */ + vlm S0, S3, 0(INPUT); + vperm S0, S0, S0, TMP0; + vperm S1, S1, S1, TMP0; + vperm S2, S2, S2, TMP0; + vperm S3, S3, S3, TMP0; + + /* Store parameters to stack. */ + stmg %r2, %r6, STACK_INPUT(%r15); + + lgr POLY_RSTATE, %r6; + lgr NBLKS, %r5; + + lg POLY_RSRC, 0(%r15); + lg POLY_RSRC, 160(POLY_RSRC); + stg POLY_RSRC, STACK_POSRC(%r15); + + /* Load poly1305 state */ + POLY1305_LOAD_STATE(); + + clgijl NBLKS, 4, .Lloop2_poly; + +.balign 4 +.Lloop4_poly: + /* Process four chacha20 blocks and 16 poly1305 blocks. */ + vlr TMP3, S3; + lghi ROUND, (20 / 4); + vlr A0, S0; + vlr A1, S1; + vlr A2, S2; + vlr A3, TMP3; + vag TMP3, TMP3, TMP1; + vlr B0, S0; + vlr B1, S1; + vlr B2, S2; + vlr B3, TMP3; + vag TMP3, TMP3, TMP1; + vlr C0, S0; + vlr C1, S1; + vlr C2, S2; + vlr C3, TMP3; + vlr D0, S0; + vlr D1, S1; + vlr D2, S2; + vag D3, TMP3, TMP1; + + slgfi NBLKS, 4; + +.balign 4 +.Lround4_4_poly: + /* Total 15 poly1305 blocks processed by this loop. 
*/ + QUARTERROUND4_4_POLY(3, 2, 1, + POLY1305_BLOCK_PART1(0 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6()); + QUARTERROUND4_4_POLY(1, 2, 3, + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8(), + POLY1305_BLOCK_PART1(1 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()); + QUARTERROUND4_4_POLY(3, 2, 1, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8(), + POLY1305_BLOCK_PART1(2 * 16); + INC_POLY1305_SRC(3 * 16), + POLY1305_BLOCK_PART2()); + QUARTERROUND4_4_POLY(1, 2, 3, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8()); + brctg ROUND, .Lround4_4_poly; + + POLY1305_BLOCK_PART1(0 * 16); + INC_POLY1305_SRC(1 * 16); + stg POLY_RSRC, STACK_POSRC(%r15); + + lg %r14, STACK_SRC(%r15); + vlm IO0, IO7, 0(%r14); + + PLUS(A0, S0); + PLUS(A1, S1); + PLUS(A2, S2); + PLUS(A3, S3); + vag S3, S3, TMP1; /* Update counter. */ + POLY1305_BLOCK_PART2(); + PLUS(B0, S0); + PLUS(B1, S1); + PLUS(B2, S2); + PLUS(B3, S3); + vag S3, S3, TMP1; /* Update counter. */ + POLY1305_BLOCK_PART3(); + vperm A0, A0, A0, TMP2; + vperm A1, A1, A1, TMP2; + vperm A2, A2, A2, TMP2; + vperm A3, A3, A3, TMP2; + vperm B0, B0, B0, TMP2; + vperm B1, B1, B1, TMP2; + vperm B2, B2, B2, TMP2; + vperm B3, B3, B3, TMP2; + POLY1305_BLOCK_PART4(); + PLUS(C0, S0); + PLUS(C1, S1); + PLUS(C2, S2); + PLUS(C3, S3); + vag S3, S3, TMP1; /* Update counter. */ + PLUS(D0, S0); + PLUS(D1, S1); + PLUS(D2, S2); + PLUS(D3, S3); + vag S3, S3, TMP1; /* Update counter. 
*/ + POLY1305_BLOCK_PART5(); + vperm C0, C0, C0, TMP2; + vperm C1, C1, C1, TMP2; + vperm C2, C2, C2, TMP2; + vperm C3, C3, C3, TMP2; + vperm D0, D0, D0, TMP2; + vperm D1, D1, D1, TMP2; + vperm D2, D2, D2, TMP2; + vperm D3, D3, D3, TMP2; + + POLY1305_BLOCK_PART6(); + XOR(IO0, A0); + XOR(IO1, A1); + XOR(IO2, A2); + XOR(IO3, A3); + XOR(IO4, B0); + XOR(IO5, B1); + XOR(IO6, B2); + XOR(IO7, B3); + vlm A0, B3, 128(%r14); + aghi %r14, 256; + stg %r14, STACK_SRC(%r15); + + lg %r14, STACK_DST(%r15); + POLY1305_BLOCK_PART7(); + vstm IO0, IO7, 0(%r14); + XOR(A0, C0); + XOR(A1, C1); + XOR(A2, C2); + XOR(A3, C3); + XOR(B0, D0); + XOR(B1, D1); + XOR(B2, D2); + XOR(B3, D3); + POLY1305_BLOCK_PART8(); + vstm A0, B3, 128(%r14); + aghi %r14, 256; + stg %r14, STACK_DST(%r15); + + lg POLY_RSRC, STACK_POSRC(%r15); + + clgijhe NBLKS, 4, .Lloop4_poly; + + CLEAR(C0); + CLEAR(C1); + CLEAR(C2); + CLEAR(C3); + CLEAR(D0); + CLEAR(D1); + CLEAR(D2); + CLEAR(D3); + +.balign 4 +.Lloop2_poly: + clgijl NBLKS, 2, .Lloop1_poly; + + /* Process two chacha20 and eight poly1305 blocks. */ + lghi ROUND, ((20 - 4) / 2); + vlr A0, S0; + vlr A1, S1; + vlr A2, S2; + vlr A3, S3; + vlr B0, S0; + vlr B1, S1; + vlr B2, S2; + vag B3, S3, TMP1; + + slgfi NBLKS, 2; + +.balign 4 +.Lround4_2_poly: + /* Total eight poly1305 blocks processed by this loop. 
*/ + QUARTERROUND4_2_POLY(3, 2, 1, + POLY1305_BLOCK_PART1(0 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()); + INC_POLY1305_SRC(1 * 16); + QUARTERROUND4_2_POLY(1, 2, 3, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8()); + brctg ROUND, .Lround4_2_poly; + + stg POLY_RSRC, STACK_POSRC(%r15); + lg %r14, STACK_SRC(%r15); + + QUARTERROUND4_2(3, 2, 1); + QUARTERROUND4_2(1, 2, 3); + QUARTERROUND4_2(3, 2, 1); + QUARTERROUND4_2(1, 2, 3); + + vlm IO0, IO7, 0(%r14); + aghi %r14, 128; + stg %r14, STACK_SRC(%r15); + + PLUS(A0, S0); + PLUS(A1, S1); + PLUS(A2, S2); + PLUS(A3, S3); + vag S3, S3, TMP1; /* Update counter. */ + PLUS(B0, S0); + PLUS(B1, S1); + PLUS(B2, S2); + PLUS(B3, S3); + vag S3, S3, TMP1; /* Update counter. */ + vperm A0, A0, A0, TMP2; + vperm A1, A1, A1, TMP2; + vperm A2, A2, A2, TMP2; + vperm A3, A3, A3, TMP2; + vperm B0, B0, B0, TMP2; + vperm B1, B1, B1, TMP2; + vperm B2, B2, B2, TMP2; + vperm B3, B3, B3, TMP2; + + lg %r14, STACK_DST(%r15); + XOR(IO0, A0); + XOR(IO1, A1); + XOR(IO2, A2); + XOR(IO3, A3); + XOR(IO4, B0); + XOR(IO5, B1); + XOR(IO6, B2); + XOR(IO7, B3); + vstm IO0, IO7, 0(%r14); + aghi %r14, 128; + stg %r14, STACK_DST(%r15); + + lg POLY_RSRC, STACK_POSRC(%r15); + + clgijhe NBLKS, 2, .Lloop2_poly; + + CLEAR(B0); + CLEAR(B1); + CLEAR(B2); + CLEAR(B3); + +.balign 4 +.Lloop1_poly: + clgijl NBLKS, 1, .Ldone_poly; + + /* Process one chacha20 block and four poly1305 blocks.*/ + lghi ROUND, ((20 - 4) / 4); + vlr A0, S0; + vlr A1, S1; + vlr A2, S2; + vlr A3, S3; + + slgfi NBLKS, 1; + +.balign 4 +.Lround4_1_poly: + /* Total four poly1305 blocks processed by this loop. 
*/ + QUARTERROUND4_POLY(3, 2, 1, + POLY1305_BLOCK_PART1(0 * 16), + POLY1305_BLOCK_PART2()); + INC_POLY1305_SRC(1 * 16); + QUARTERROUND4_POLY(1, 2, 3, + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()); + QUARTERROUND4_POLY(3, 2, 1, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6()); + QUARTERROUND4_POLY(1, 2, 3, + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8()); + brct ROUND, .Lround4_1_poly; + + stg POLY_RSRC, STACK_POSRC(%r15); + lg %r14, STACK_SRC(%r15); + + QUARTERROUND4(3, 2, 1); + QUARTERROUND4(1, 2, 3); + QUARTERROUND4(3, 2, 1); + QUARTERROUND4(1, 2, 3); + + vlm IO0, IO3, 0(%r14); + aghi %r14, 64; + stg %r14, STACK_SRC(%r15); + + PLUS(A0, S0); + PLUS(A1, S1); + PLUS(A2, S2); + PLUS(A3, S3); + vag S3, S3, TMP1; /* Update counter. */ + + lg %r14, STACK_DST(%r15); + vperm A0, A0, A0, TMP2; + vperm A1, A1, A1, TMP2; + vperm A2, A2, A2, TMP2; + vperm A3, A3, A3, TMP2; + XOR(IO0, A0); + XOR(IO1, A1); + XOR(IO2, A2); + XOR(IO3, A3); + vstm IO0, IO3, 0(%r14); + aghi %r14, 64; + stg %r14, STACK_DST(%r15); + + lg POLY_RSRC, STACK_POSRC(%r15); + + clgijhe NBLKS, 1, .Lloop1_poly; + +.balign 4 +.Ldone_poly: + /* Store poly1305 state */ + lg POLY_RSTATE, STACK_POCTX(%r15); + POLY1305_STORE_STATE(); + + /* Store counter. */ + lg INPUT, STACK_INPUT(%r15); + vperm S3, S3, S3, TMP0; + vst S3, (48)(INPUT); + + /* Clear the used vector registers. 
*/ + CLEAR(A0); + CLEAR(A1); + CLEAR(A2); + CLEAR(A3); + CLEAR(IO0); + CLEAR(IO1); + CLEAR(IO2); + CLEAR(IO3); + CLEAR(IO4); + CLEAR(IO5); + CLEAR(IO6); + CLEAR(IO7); + CLEAR(TMP0); + CLEAR(TMP1); + CLEAR(TMP2); + + END_STACK(%r14); + xgr %r2, %r2; + br %r14; + CFI_ENDPROC(); +ELF(.size _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1, + .-_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1;) + +/********************************************************************** 8-way chacha20 ("vertical") **********************************************************************/ @@ -884,5 +1272,290 @@ _gcry_chacha20_s390x_vx_blocks8: ELF(.size _gcry_chacha20_s390x_vx_blocks8, .-_gcry_chacha20_s390x_vx_blocks8;) +/********************************************************************** + 8-way stitched chacha20-poly1305 ("vertical") + **********************************************************************/ + +.balign 8 +.globl _gcry_chacha20_poly1305_s390x_vx_blocks8 +ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks8,@function;) + +_gcry_chacha20_poly1305_s390x_vx_blocks8: + /* input: + * %r2: input + * %r3: dst + * %r4: src + * %r5: nblks (multiple of 8) + * %r6: poly1305 state + * 160(%r15): poly1305 src + */ + CFI_STARTPROC(); + + START_STACK(%r14); + + /* Store parameters to stack. */ + stmg %r2, %r6, STACK_INPUT(%r15); + + lgr POLY_RSTATE, %r6; + lgr NBLKS, %r5; + + lg POLY_RSRC, 0(%r15); + lg POLY_RSRC, 160(POLY_RSRC); + stg POLY_RSRC, STACK_POSRC(%r15); + + /* Load poly1305 state */ + POLY1305_LOAD_STATE(); + +.balign 4 + /* Process eight chacha20 blocks and 32 poly1305 blocks per loop. */ +.Lloop8_poly: + lg INPUT, STACK_INPUT(%r15); + larl %r8, .Lconsts; + + vlm Y0, Y3, 0(INPUT); + + slgfi NBLKS, 8; + lghi ROUND, (20 / 2); + + /* Construct counter vectors X12/X13 & Y12/Y13. */ + vl X4, (.Ladd_counter_0123 - .Lconsts)(%r8); + vl Y4, (.Ladd_counter_4567 - .Lconsts)(%r8); + lg %r8, (12 * 4)(INPUT); /* Update counter. 
*/ + vrepf Y12, Y3, 0; + vrepf Y13, Y3, 1; + vaccf X5, Y12, X4; + vaccf Y5, Y12, Y4; + vaf X12, Y12, X4; + vaf Y12, Y12, Y4; + vaf X13, Y13, X5; + vaf Y13, Y13, Y5; + rllg %r8, %r8, 32; + + vrepf X0, Y0, 0; + vrepf X1, Y0, 1; + vrepf X2, Y0, 2; + vrepf X3, Y0, 3; + vrepf X4, Y1, 0; + vrepf X5, Y1, 1; + vrepf X6, Y1, 2; + vrepf X7, Y1, 3; + vrepf X8, Y2, 0; + vrepf X9, Y2, 1; + vrepf X10, Y2, 2; + vrepf X11, Y2, 3; + vrepf X14, Y3, 2; + vrepf X15, Y3, 3; + agfi %r8, 8; + + /* Store counters for blocks 0-7. */ + vstm X12, X13, (STACK_CTR + 0 * 16)(%r15); + vstm Y12, Y13, (STACK_CTR + 2 * 16)(%r15); + rllg %r8, %r8, 32; + + vlr Y0, X0; + vlr Y1, X1; + vlr Y2, X2; + vlr Y3, X3; + vlr Y4, X4; + vlr Y5, X5; + vlr Y6, X6; + vlr Y7, X7; + vlr Y8, X8; + vlr Y9, X9; + vlr Y10, X10; + vlr Y11, X11; + vlr Y14, X14; + vlr Y15, X15; + stg %r8, (12 * 4)(INPUT); + +.balign 4 +.Lround2_8_poly: + /* Total 30 poly1305 blocks processed by this loop. */ + QUARTERROUND4_V8_POLY(X0, X4, X8, X12, X1, X5, X9, X13, + X2, X6, X10, X14, X3, X7, X11, X15, + Y0, Y4, Y8, Y12, Y1, Y5, Y9, Y13, + Y2, Y6, Y10, Y14, Y3, Y7, Y11, Y15, + POLY1305_BLOCK_PART1(0 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8(), + POLY1305_BLOCK_PART1(1 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4()); + QUARTERROUND4_V8_POLY(X0, X5, X10, X15, X1, X6, X11, X12, + X2, X7, X8, X13, X3, X4, X9, X14, + Y0, Y5, Y10, Y15, Y1, Y6, Y11, Y12, + Y2, Y7, Y8, Y13, Y3, Y4, Y9, Y14, + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8(), + POLY1305_BLOCK_PART1(2 * 16); + INC_POLY1305_SRC(3 * 16), + POLY1305_BLOCK_PART2(), + POLY1305_BLOCK_PART3(), + POLY1305_BLOCK_PART4(), + POLY1305_BLOCK_PART5(), + POLY1305_BLOCK_PART6(), + POLY1305_BLOCK_PART7(), + POLY1305_BLOCK_PART8()); + brctg ROUND, .Lround2_8_poly; + + 
POLY1305_BLOCK_PART1(0 * 16); + + /* Store blocks 4-7. */ + vstm Y0, Y15, STACK_Y0_Y15(%r15); + + /* Load counters for blocks 0-3. */ + vlm Y0, Y1, (STACK_CTR + 0 * 16)(%r15); + + stg POLY_RSRC, STACK_POSRC(%r15); /* %r14 used for INPUT/SRC/DST pointer. */ + + lghi ROUND, 1; + j .Lfirst_output_4blks_8_poly; + +.balign 4 +.Lsecond_output_4blks_8_poly: + + POLY1305_BLOCK_PART1(1 * 16); + + /* Load blocks 4-7. */ + vlm X0, X15, STACK_Y0_Y15(%r15); + + /* Load counters for blocks 4-7. */ + vlm Y0, Y1, (STACK_CTR + 2 * 16)(%r15); + + INC_POLY1305_SRC(2 * 16); + stg POLY_RSRC, STACK_POSRC(%r15); /* %r14 used for INPUT/SRC/DST pointer. */ + + lghi ROUND, 0; + +.balign 4 + /* Output four chacha20 blocks and one poly1305 block per loop. */ +.Lfirst_output_4blks_8_poly: + lg %r14, STACK_INPUT(%r15); + vlm Y12, Y15, 0(%r14); + POLY1305_BLOCK_PART2(); + PLUS(X12, Y0); + PLUS(X13, Y1); + vrepf Y0, Y12, 0; + vrepf Y1, Y12, 1; + vrepf Y2, Y12, 2; + vrepf Y3, Y12, 3; + vrepf Y4, Y13, 0; + vrepf Y5, Y13, 1; + vrepf Y6, Y13, 2; + vrepf Y7, Y13, 3; + vrepf Y8, Y14, 0; + vrepf Y9, Y14, 1; + vrepf Y10, Y14, 2; + vrepf Y11, Y14, 3; + vrepf Y14, Y15, 2; + vrepf Y15, Y15, 3; + POLY1305_BLOCK_PART3(); + PLUS(X0, Y0); + PLUS(X1, Y1); + PLUS(X2, Y2); + PLUS(X3, Y3); + PLUS(X4, Y4); + PLUS(X5, Y5); + PLUS(X6, Y6); + PLUS(X7, Y7); + PLUS(X8, Y8); + PLUS(X9, Y9); + PLUS(X10, Y10); + PLUS(X11, Y11); + PLUS(X14, Y14); + PLUS(X15, Y15); + POLY1305_BLOCK_PART4(); + + larl %r14, .Lconsts; + vl Y15, (.Lbswap32 - .Lconsts)(%r14); + TRANSPOSE_4X4_2(X0, X1, X2, X3, X4, X5, X6, X7, + Y9, Y10, Y11, Y12, Y13, Y14); + lg %r14, STACK_SRC(%r15); + POLY1305_BLOCK_PART5(); + TRANSPOSE_4X4_2(X8, X9, X10, X11, X12, X13, X14, X15, + Y9, Y10, Y11, Y12, Y13, Y14); + + vlm Y0, Y14, 0(%r14); + POLY1305_BLOCK_PART6(); + vperm X0, X0, X0, Y15; + vperm X1, X1, X1, Y15; + vperm X2, X2, X2, Y15; + vperm X3, X3, X3, Y15; + vperm X4, X4, X4, Y15; + vperm X5, X5, X5, Y15; + vperm X6, X6, X6, Y15; + vperm X7, X7, X7, Y15; + 
vperm X8, X8, X8, Y15; + vperm X9, X9, X9, Y15; + vperm X10, X10, X10, Y15; + vperm X11, X11, X11, Y15; + vperm X12, X12, X12, Y15; + vperm X13, X13, X13, Y15; + vperm X14, X14, X14, Y15; + vperm X15, X15, X15, Y15; + vl Y15, (15 * 16)(%r14); + POLY1305_BLOCK_PART7(); + + aghi %r14, 256; + stg %r14, STACK_SRC(%r15); + lg %r14, STACK_DST(%r15); + + XOR(Y0, X0); + XOR(Y1, X4); + XOR(Y2, X8); + XOR(Y3, X12); + XOR(Y4, X1); + XOR(Y5, X5); + XOR(Y6, X9); + XOR(Y7, X13); + XOR(Y8, X2); + XOR(Y9, X6); + XOR(Y10, X10); + XOR(Y11, X14); + XOR(Y12, X3); + XOR(Y13, X7); + XOR(Y14, X11); + XOR(Y15, X15); + POLY1305_BLOCK_PART8(); + vstm Y0, Y15, 0(%r14); + + aghi %r14, 256; + stg %r14, STACK_DST(%r15); + + lg POLY_RSRC, STACK_POSRC(%r15); + + clgije ROUND, 1, .Lsecond_output_4blks_8_poly; + + clgijhe NBLKS, 8, .Lloop8_poly; + + /* Store poly1305 state */ + lg POLY_RSTATE, STACK_POCTX(%r15); + POLY1305_STORE_STATE(); + + /* Clear the used vector registers */ + DST_8(CLEAR, 0, _); + DST_8(CLEAR, 1, _); + DST_8(CLEAR, 2, _); + DST_8(CLEAR, 3, _); + + /* Clear sensitive data in stack. 
*/ + vstm Y0, Y15, STACK_Y0_Y15(%r15); + vstm Y0, Y3, STACK_CTR(%r15); /* NOTE(review): stores (not loads) so the spill areas get overwritten; assumes the CLEAR pass just before left Y0-Y15 zeroed - confirm */ + + END_STACK(%r14); + xgr %r2, %r2; + br %r14; + CFI_ENDPROC(); +ELF(.size _gcry_chacha20_poly1305_s390x_vx_blocks8, + .-_gcry_chacha20_poly1305_s390x_vx_blocks8;) + #endif /*HAVE_GCC_INLINE_ASM_S390X_VX*/ #endif /*__s390x__*/ diff --git a/cipher/chacha20.c b/cipher/chacha20.c index 7b283080..497594a0 100644 --- a/cipher/chacha20.c +++ b/cipher/chacha20.c @@ -189,6 +189,18 @@ unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *state, byte *dst, unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *state, byte *dst, const byte *src, size_t nblks); +#undef USE_S390X_VX_POLY1305 +#if SIZEOF_UNSIGNED_LONG == 8 +#define USE_S390X_VX_POLY1305 1 +unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks8( + u32 *state, byte *dst, const byte *src, size_t nblks, + POLY1305_STATE *st, const byte *poly1305_src); + +unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1( + u32 *state, byte *dst, const byte *src, size_t nblks, + POLY1305_STATE *st, const byte *poly1305_src); +#endif /* SIZEOF_UNSIGNED_LONG == 8 */ + #endif /* USE_S390X_VX */ #ifdef USE_ARMV7_NEON @@ -759,6 +771,48 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, inbuf += 4 * CHACHA20_BLOCK_SIZE; } #endif +#ifdef USE_S390X_VX_POLY1305 + else if (ctx->use_s390x && length >= 2 * CHACHA20_BLOCK_SIZE * 8) + { + nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, 8); + burn = nburn > burn ? nburn : burn; + + authptr = outbuf; + length -= 8 * CHACHA20_BLOCK_SIZE; + outbuf += 8 * CHACHA20_BLOCK_SIZE; + inbuf += 8 * CHACHA20_BLOCK_SIZE; + } + else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 4) + { + nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 4); + burn = nburn > burn ? 
nburn : burn; + + authptr = outbuf; + length -= 4 * CHACHA20_BLOCK_SIZE; + outbuf += 4 * CHACHA20_BLOCK_SIZE; + inbuf += 4 * CHACHA20_BLOCK_SIZE; + } + else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 2) + { + nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 2); + burn = nburn > burn ? nburn : burn; + + authptr = outbuf; + length -= 2 * CHACHA20_BLOCK_SIZE; + outbuf += 2 * CHACHA20_BLOCK_SIZE; + inbuf += 2 * CHACHA20_BLOCK_SIZE; + } + else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE) + { + nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 1); + burn = nburn > burn ? nburn : burn; + + authptr = outbuf; + length -= 1 * CHACHA20_BLOCK_SIZE; + outbuf += 1 * CHACHA20_BLOCK_SIZE; + inbuf += 1 * CHACHA20_BLOCK_SIZE; + } +#endif if (authptr) { @@ -862,6 +916,44 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf, } #endif +#ifdef USE_S390X_VX_POLY1305 + if (ctx->use_s390x) + { + if (length >= 8 * CHACHA20_BLOCK_SIZE && + authoffset >= 8 * CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 8; + + nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, authptr); /* result goes to nburn so the fold on the next line keeps the running maximum in burn */ + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + authptr += nblocks * CHACHA20_BLOCK_SIZE; + } + + if (length >= CHACHA20_BLOCK_SIZE && + authoffset >= CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + + nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, authptr); + burn = nburn > burn ? 
nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + authptr += nblocks * CHACHA20_BLOCK_SIZE; + } + } +#endif + if (authoffset > 0) { _gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset); @@ -1026,6 +1118,40 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf, } #endif +#ifdef USE_S390X_VX_POLY1305 + if (ctx->use_s390x) + { + if (length >= 8 * CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + nblocks -= nblocks % 8; + + nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, inbuf); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } + + if (length >= CHACHA20_BLOCK_SIZE) + { + size_t nblocks = length / CHACHA20_BLOCK_SIZE; + + nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1( + ctx->input, outbuf, inbuf, nblocks, + &c->u_mode.poly1305.ctx.state, inbuf); + burn = nburn > burn ? nburn : burn; + + length -= nblocks * CHACHA20_BLOCK_SIZE; + outbuf += nblocks * CHACHA20_BLOCK_SIZE; + inbuf += nblocks * CHACHA20_BLOCK_SIZE; + } + } +#endif + while (length) { size_t currlen = length; diff --git a/cipher/poly1305-s390x.S b/cipher/poly1305-s390x.S new file mode 100644 index 00000000..844245f6 --- /dev/null +++ b/cipher/poly1305-s390x.S @@ -0,0 +1,87 @@ +/* poly1305-s390x.S - zSeries implementation of Poly1305 + * + * Copyright (C) 2020 Jussi Kivilinna <[email protected]> + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9 +#include <config.h> +#if defined(HAVE_GCC_INLINE_ASM_S390X) + +#include "asm-poly1305-s390x.h" + +.text + +.balign 8 +.globl _gcry_poly1305_s390x_blocks1 +ELF(.type _gcry_poly1305_s390x_blocks1,@function;) + +_gcry_poly1305_s390x_blocks1: + /* input: + * %r2: poly1305-state + * %r3: src + * %r4: len + * %r5: high_pad + */ + CFI_STARTPROC(); + + stmg %r6, %r14, 6 * 8(%r15); + + lgr POLY_RSTATE, %r2; + lgr POLY_RSRC, %r3; + srlg %r0, %r4, 4; + + cgije %r5, 0, .Lpoly_high0; + + POLY1305_LOAD_STATE(); + +.balign 4 +.Lpoly_loop_high1: + POLY1305_BLOCK_PART1(0 * 16); + INC_POLY1305_SRC(1 * 16); +.Lpoly_block_part2: + POLY1305_BLOCK_PART2(); + POLY1305_BLOCK_PART3(); + POLY1305_BLOCK_PART4(); + POLY1305_BLOCK_PART5(); + POLY1305_BLOCK_PART6(); + POLY1305_BLOCK_PART7(); + POLY1305_BLOCK_PART8(); + + brctg %r0, .Lpoly_loop_high1; + +.balign 4 +.Lpoly_done: + POLY1305_STORE_STATE(); + + lmg %r6, %r14, 6 * 8(%r15); + xgr %r2, %r2; + br %r14; + +.balign 4 +.Lpoly_high0: + lghi %r0, 1; + POLY1305_LOAD_STATE(); + POLY1305_BLOCK_PART1_HB(0 * 16, 0); + j .Lpoly_block_part2; + + CFI_ENDPROC(); +ELF(.size _gcry_poly1305_s390x_blocks1, + .-_gcry_poly1305_s390x_blocks1;) + +#endif /*HAVE_GCC_INLINE_ASM_S390X*/ +#endif /*__s390x__*/ diff --git a/cipher/poly1305.c b/cipher/poly1305.c index adcb6792..6cb4d2b7 100644 --- a/cipher/poly1305.c +++ b/cipher/poly1305.c @@ -35,6 +35,9 @@ static const char *selftest (void); +#undef HAVE_ASM_POLY1305_BLOCKS + + #undef USE_MPI_64BIT #undef USE_MPI_32BIT #if BYTES_PER_MPI_LIMB == 
8 && defined(HAVE_TYPE_U64) @@ -46,6 +49,35 @@ static const char *selftest (void); #endif +/* USE_S390X_ASM indicates whether to enable zSeries code. */ +#undef USE_S390X_ASM +#if BYTES_PER_MPI_LIMB == 8 +# if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9 +# if defined(HAVE_GCC_INLINE_ASM_S390X) +# define USE_S390X_ASM 1 +# endif /* HAVE_GCC_INLINE_ASM_S390X */ +# endif +#endif + + +#ifdef USE_S390X_ASM + +#define HAVE_ASM_POLY1305_BLOCKS 1 + +extern unsigned int _gcry_poly1305_s390x_blocks1(void *state, + const byte *buf, size_t len, + byte high_pad); + +static unsigned int +poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, + byte high_pad) +{ + return _gcry_poly1305_s390x_blocks1(&ctx->state, buf, len, high_pad); +} + +#endif /* USE_S390X_ASM */ + + static void poly1305_init (poly1305_context_t *ctx, const byte key[POLY1305_KEYLEN]) { @@ -146,6 +178,8 @@ static void poly1305_init (poly1305_context_t *ctx, ADD_1305_64(H2, H1, H0, (u64)0, x0_hi, x0_lo); \ } while (0) +#ifndef HAVE_ASM_POLY1305_BLOCKS + static unsigned int poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, byte high_pad) @@ -201,6 +235,8 @@ poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, return 6 * sizeof (void *) + 18 * sizeof (u64); } +#endif /* !HAVE_ASM_POLY1305_BLOCKS */ + static unsigned int poly1305_final (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN]) { @@ -354,6 +390,8 @@ static unsigned int poly1305_final (poly1305_context_t *ctx, ADD_1305_32(H4, H3, H2, H1, H0, 0, x3_lo, x2_lo, x1_lo, x0_lo); \ } while (0) +#ifndef HAVE_ASM_POLY1305_BLOCKS + static unsigned int poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, byte high_pad) @@ -403,6 +441,8 @@ poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len, return 6 * sizeof (void *) + 28 * sizeof (u32); } +#endif /* !HAVE_ASM_POLY1305_BLOCKS */ + static unsigned int poly1305_final (poly1305_context_t *ctx, byte mac[POLY1305_TAGLEN]) { diff --git 
a/configure.ac b/configure.ac index c97d050e..a121093d 100644 --- a/configure.ac +++ b/configure.ac @@ -2042,6 +2042,14 @@ AC_CACHE_CHECK([whether GCC inline assembler supports zSeries instructions], : : "a" (fac) : "memory"); + asm volatile ("risbgn %%r11, %%r11, 0, 129, 0\n\t" + : + : + : "memory", "r11"); + asm volatile ("algrk %%r14, %%r14, %%r14\n\t" + : + : + : "memory", "r14"); return (unsigned int)r1 ^ reg0; } ]])], diff --git a/tests/basic.c b/tests/basic.c index 436c1da8..46e4c0f8 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -4857,10 +4857,146 @@ _check_poly1305_cipher (unsigned int step) "\x3f\xf4\xde\xf0\x8e\x4b\x7a\x9d\xe5\x76\xd2\x65\x86\xce\xc6\x4b" "\x61\x16", "\x1a\xe1\x0b\x59\x4f\x09\xe2\x6a\x7e\x90\x2e\xcb\xd0\x60\x06\x91" }, + /* generated with c implementation */ + { GCRY_CIPHER_CHACHA20, + "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a\xf3\x33\x88\x86\x04\xf6\xb5\xf0" + "\x47\x39\x17\xc1\x40\x2b\x80\x09\x9d\xca\x5c\xbc\x20\x70\x75\xc0", + "\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08", 12, + "\xf3\x33\x88\x86\x00\x00\x00\x00\x00\x00\x4e\x91", 12, + "\xb0\x58\x83\x17\x3a\x8e\x69\xf2\x18\x9d\x71\xe4\x8a\x0b\x7a\xcd" + "\xe2\xd8\xb9\x8b\xdf\x99\xc2\x6d\x05\x4b\x44\x1e\x65\x5d\xda\xd5" + "\x79\xf0\x19\xab\x94\x50\xd0\xc5\x54\xfe\x76\xc8\xd9\xf3\x39\x33" + "\x9c\x0f\x27\x89\x85\x99\xe3\xed\x5c\x31\x04\xa6\x20\xab\xb3\x78" + "\xac\x31\xba\x21\x8c\xac\x70\xd1\xe2\x92\xd6\x50\x58\x69\xab\xd4" + "\x38\xdc\x9c\x71\x81\xf7\xf1\x68\x10\x50\x07\x09\x0e\x51\x49\xd2" + "\x10\x9a\x2e\x78\xfb\xc7\xd3\xc2\x84\xda\xf2\x52\x17\x2c\xa6\xe8" + "\x56\x60\x80\x46\xed\xfb\x9f\xab\xc2\x01\xf0\x06\x6b\x6e\xcc\xf6" + "\x55\x3e\x81\xc7\x71\x9f\x10\xf0\x8e\x5a\x4b\xf6\xae\x90\x75\x03" + "\x4f\xb3\xb4\xff\x66\xfa\xe3\xb6\x1c\xca\x0c\x75\x8a\x08\x3d\xce" + "\x58\x69\x9d\xa9\x19\x29\xda\x2f\xa1\xb2\xae\xa7\x83\xd5\x92\xc2" + "\x15\xdc\xef\x76\xd2\xd1\x9f\xb4\x7f\x3e\xb3\x7a\xa8\x3e\xba\xa3" + "\x9e\x2e\x73\xe3\x4d\xdc\x50\xba\x5b\xb0\x8b\x1a\x87\x21\x03\x93" + 
"\x74\x20\x01\xda\x38\x85\x1c\x3c\x57\x51\x09\x0e\xd8\xfc\x2b\xef" + "\x38\x8e\x11\xa4\x9e\x11\xcc\xc5\x9f\x4c\xc2\x0d\x3e\x5f\x73\x40" + "\x5a\xf4\x5b\x57\x84\x6e\xc7\xd0\x8e\xad\x1c\x1b\xae\x59\xba\xf5" + "\x77\xed\x44\x08\x9c\x9b\xfd\x88\xd9\x27\xe8\x43\xe8\xdd\x86\xfd" + "\x05\x3a\xc2\x11\x88\x98\x87\xcb\xa1\x72\xc2\x52\x5c\xd1\x1a\x40" + "\x80\xe2\x1e\xe8\x9b\x4e\x63\x9b\xfb\x58\x11\x44\x36\x35\x83\x9b" + "\x20\x9b\x4b\x58\xef\x1f\xfa\xe1\xb0\xe0\xb8\x60\x87\x0b\xdb\x83" + "\x6f\xeb\xc0\x80\x63\xa8\xc4\x22\x0f\x1d\xec\x9b\x44\xfa\xd3\x13" + "\x75\xb0\xfe\x74\x3c\xde\x9e\xb4\x91\x72\xc5\xf6\x36\x14\x18\x2d" + "\x15\x2e\x6b\x34\xcf\xed\x86\x4f\x1b\x56\xcf\x09\x8f\x3d\xd1\x8d" + "\x01\x7c\xba\x6a\xf4\x82\xdc\xf6\x9e\xc9\x79\xd4\x9e\x50\xc2\x9a" + "\x4f\x90\x10\x44\xd5\xcf\x6b\x1d\xb3\xce\x7c\xeb\x3f\x8f\xbc\xe6" + "\x76\xad\x78\x97\xee\xaf\x66\x73\xe4\x11\xb9\x6c\xf4\xc1\x1a\x76" + "\xd6\x54\x4c\x6c\x44\x58\xec\xd9\x8f\xf9\xc6\x7f\x71\x95\x04\xfe" + "\x6b\x42\xd6\x4f\xc6\xa8\xc1\xfa\x1e\x2c\xf2\x49\x6a\x5a\xe5\x28" + "\x34\x30\x05\xc1\x21\x3a\x5f\xfd\xaf\x61\x1f\xa0\x91\xd4\x17\xcf" + "\x65\x9d\xf5\xdb\x4b\xc2\x3d\x12\xed\xe1\x4e\xf1\x34\x50\x13\xa7" + "\x3f\xe6\x26\xcb\xc9\xb3\x64\x69\xa9\x82\x21\xec\x64\xa9\x2e\x83" + "\xa9\x9d\xa0\xbe\x20\xef\x5f\x71\x45\xe7\x9f\x75\xa3\x72\x16\xef" + "\x1b\xf7\x9a\x15\xe2\x75\x92\x39\xbb\xb1\x4f\x34\xf4\x88\x0d\xcf" + "\xbf\xd6\xfe\x5d\x61\x14\x45\x83\xf9\x6a\x3e\x81\x0f\x14\x78\xda" + "\x94\xe2\xce\x7d\x1c\x15\xd7\xe0\x95\x1d\xd8\x96\xc2\x11\xb1\x55" + "\xae\xc6\x95\x43\x38\x0a\x01\xc2\x30\xb8\x1b\x12\x39\x98\x58\x20" + "\xbd\x65\x50\x1d\x17\x13\x02\xb9\xe4\x88\x39\x72\xc8\x58\xa0\xa8" + "\x8f\xb9\xc2\x78\x82\x3a\x56\xe8\x0d\xf9\x1b\xbb\xfb\xf0\x5b\xc4" + "\x9a\x2d\xf0\xd5\x57\x6f\xce\x4b\xb6\x3e\x1b\xbf\x54\xb4\x3e\x4e" + "\x52\x5c\x2e\x6b\x5e\x01\xd1\xb3\xb5\x16\x67\xe4\x16\xad\x3c\x4d" + "\x1c\xb2\xc0\x54\xcc\xf9\xba\x11\x85\xdf\x43\x1a\xfb\x55\x9b\x88" + "\x27\x9e\x17\x29\x41\x7d\x2a\xb4\xf6\x61\x93\xa5\x1f\x5b\xb3\x06" 
+ "\xbe\x86\x40\x11\xc6\xfc\x36\x44\xdb\xbf\x4c\x6b\x21\x15\xa9\x10" + "\x01\xdc\x53\x9c\x57\x27\xbe\x55\x19\x86\x17\x96\xfa\xdc\x4d\xf4" + "\xd9\x79\xbe\x6c\x29\x1b\xed\xbd\x09\x72\xb4\xbf\x88\xc7\x52\x39" + "\x5f\x62\x35\xad\x41\x87\xa6\xaa\x99\x20\xbc\x7d\x97\x67\x83\xa5" + "\xc3\x43\xc6\x7f\x31\xb9\x0c\xe1\x82\xa5\x66\x9a\x58\xe3\xaf\x6b" + "\x59\x09\x5b\xad\xed\xc2\x57\x66\x4e\x72\xb0\xaa\x0d\xeb\x9c\x48" + "\x3f\x0b\xaf\xc6\x46\x06\x54\x3a\x2a\x19\xb3\x9d\xde\xd9\xa0\xcf" + "\x71\x69\x33\xe8\x2c\xa8\x56\x8c\x0b\xae\x41\xc7\xb5\xfd\xca\xea" + "\x0f\xd1\xd7\xe0\x3e\xf6\xf5\xd1\xb2\x57\x21\x00\x32\xca\x02\x4d" + "\x18\xbe\x2c\x25\xe9\xbe\x0a\x34\x44\x92\xaa\x43\x09\xf7\xb4\x35" + "\xac\x65\xc3\xc1\x4c\x66\x74\x91\x9f\xae\xe2\x27\x37\x8a\xfe\x13" + "\x57\xf0\x39\x30\xf0\x06\xef\xa0\x5f\x90\xb7\xfa\xd9\x42\x3e\xcb" + "\xdc\x9c\x44\x36\x13\x8e\x66\xbc\x85\xe8\xfa\x2c\x73\xa5\x87\xbd" + "\x63\x98\x42\x56\x1a\xe9\xc4\x80\xa1\x0e\xd5\x9a\x27\xd2\x82\x20" + "\x08\xe5\x98\x60\x00\x6d\xd9\x53\x9b\xae\x67\xfb\x03\xff\x82\xf1" + "\xc6\x9b\x0b\xf1\x2c\x97\x89\x1c\x8e\x84\xd0\xb3\x2a\x44\xa3\xb2" + "\x77\x1d\xf2\x2e\x6a\xf7\x05\x67\x32\x21\xca\x39\x2e\x7f\x1a\x69" + "\x21\xdd\xaa\xfc\x19\xad\xc5\xf8\xfe\x6f\x17\x9e\x32\x64\xf8\xeb" + "\x98\x8a\x5e\x2e\x89\xea\xfb\xed\xd7\x09\x1a\x7f\xa5\xf6\xe3\xd4" + "\x33\x60\xbb\xc2\x2b\x1a\xd6\x4c\x03\xe1\xc3\xc6\x90\x0e\x7a\x89" + "\xe8\x50\x4b\x47\xc2\x91\x5d\x2a\x49\xf5\xb0\x5f\x69\xbb\x88\x51" + "\x0c\xa2\xc0\x88\x99\x91\xcd\x77\x11\x31\x3a\x8f\x99\x03\xd7\x5e", + 1024, + "\x9d\x96\x71\x67\x3d\x66\x16\x72\x55\x29\x61\x42\x77\x99\x4a\x50" + "\xdd\x2a\x80\x56\x8f\xb7\x50\x82\x80\x63\x47\x7b\xc1\x44\x3b\x02" + "\x5b\xe8\x96\x93\x97\x6c\xff\x42\x90\x40\xf9\xe9\x93\xfe\x7e\xa3" + "\x4c\xd9\xe8\xdc\xda\xf7\x8f\xcd\xe7\xa7\x1f\xaa\x7c\x8b\x07\xda" + "\xf0\x70\x4d\x47\x8e\x87\x86\x71\x1e\x7a\x13\x7b\x9c\x42\x5d\x30" + "\x0c\x04\xfb\x7b\xe0\x0e\xa7\xb1\x5c\x89\xf7\xdd\x81\x0a\xe0\xe4" + 
"\xe2\x69\xa2\x36\x60\x45\x1c\xcc\x27\x2f\xaf\x70\x59\x6d\xc5\xb4" + "\x40\x04\x69\x1d\xe8\xf3\xf5\x7e\x49\xd7\x81\x12\x5b\xd3\xc6\x77" + "\x82\x5c\x9e\x91\x6b\x6b\x7d\xd7\x45\xb8\x39\x94\x0a\x1a\xb4\xc4" + "\xff\xba\x05\x7b\x0b\xba\xe1\x81\x90\x29\xdd\xb5\x58\x0b\x1f\x82" + "\x9e\x4d\xdd\x1b\xc1\x62\x14\x1a\x8f\xc1\x8c\xf6\x46\x07\xb2\xcd" + "\x6a\xb5\xa1\x06\x4c\xc3\xa3\x3f\x02\x08\xe2\x29\x3c\x05\xbd\xcb" + "\xf0\xfa\x27\xf1\x7b\x48\x45\x46\x62\x88\x01\xb8\xd3\x0a\x29\xbc" + "\xd6\xbb\x20\xee\x75\x5f\x29\x0c\x47\x9e\x0f\x1d\xdf\x81\x39\x9a" + "\x1c\x48\x69\x09\xeb\x42\xae\x71\x11\x4c\x53\x9c\x69\xa6\x71\x50" + "\x45\x4d\x31\x71\xdd\xdb\xb1\x64\x37\xbf\x03\x76\xb2\x44\xf9\xbb" + "\xa3\x25\x6b\xcf\xb0\x9f\x1d\x78\xdf\x93\xde\x2d\x57\x23\x6f\xff" + "\x02\xf8\xc6\xf5\x5f\x4b\xd5\x8a\x15\xc2\x5f\x9d\x47\x3b\x2f\x8f" + "\x36\x93\x4a\x96\xae\x57\xaa\xd7\x6e\xea\x45\x94\xfb\xa2\xab\x56" + "\xae\x7e\xb3\xc5\x87\xa5\xd4\x2d\xf0\x99\x1e\x0a\x05\xb8\x33\xe4" + "\x89\x6c\x9e\x6d\x8c\xf1\xb4\xaa\x1f\xaa\xfb\x4b\x40\x90\xc0\x50" + "\xf3\x7d\x2a\x67\x68\x25\x0a\x9a\x89\x1f\x90\xfd\xb0\x9d\x7d\xaf" + "\x72\x22\xeb\x22\xb9\x63\x5f\x2c\x54\x49\xa3\x99\xc4\x74\xab\xc0" + "\x2c\x85\x31\x26\x84\x57\xfd\xce\x34\x10\x63\x57\x9f\x0c\x0a\xa3" + "\x02\xb0\x87\x36\xf5\xf8\x1e\x66\x81\x74\x2c\x3e\x90\xc0\x10\xf1" + "\x53\xd4\xc3\x45\x9b\xe2\x58\xcf\x86\x2e\xf4\xb3\x11\xff\xe6\xc8" + "\x5c\x74\x6e\xb4\xd9\x52\x2c\x52\x71\x5e\xb4\xf1\xca\xa7\x1c\x09" + "\x6a\x2d\xc0\x20\x38\xf5\x61\xdc\xd9\x8d\x42\x71\x65\xf8\xce\xa7" + "\xcb\x2c\x44\x09\x87\x5a\x02\xdd\x8c\xe1\xec\xd0\xe1\xeb\x4d\x25" + "\x70\x57\xbd\xc7\x1b\xee\xb5\xc0\x81\xc5\x75\x45\xb8\xb7\xad\xfd" + "\x33\xdc\xbe\x09\x71\xd0\xd4\xee\xf7\x37\x4e\x6f\x80\x5f\xec\x3f" + "\x35\x75\x39\xaa\x41\xe6\x62\x17\xc5\x8f\xa4\xa7\x31\xd6\xd5\xe9" + "\x56\xc2\xc7\x1d\xf1\x58\xf6\xad\x3b\xbc\xbe\x65\x12\xd4\xfb\xe2" + "\x0a\x5a\x64\x9e\xad\x70\x1d\x95\xbd\x24\x1a\xa9\x99\xc0\x70\x74" + "\xb1\x79\x01\x4f\xfd\x5d\x76\xa7\xd9\x53\x3d\x87\x2b\x51\xb4\xf3" 
+ "\x17\xa5\x41\xe9\x8b\xba\xd3\x69\xcd\xe6\x44\x0f\x18\x8f\x59\x0d" + "\xb0\xb8\x2a\x7f\xbb\x16\x51\xf5\xe8\xad\xda\x66\xaa\x3a\xb6\x7d" + "\x10\x13\x8d\xd9\x7d\x15\x09\x80\x7b\x00\x67\x96\x90\x21\x3e\xd4" + "\x1a\xe8\x3b\x1c\x78\x31\x9b\x63\x64\xb9\x1b\x50\x11\x93\x48\x13" + "\x89\xcb\xba\x57\x23\xcd\x95\x95\xd5\xee\x8b\x0d\xb4\xdf\x0c\x8a" + "\xae\xae\x55\x3f\x93\xad\xc1\x3e\xe5\x31\x20\x73\x58\xb0\x0b\xba" + "\xf5\x03\x7b\x50\x39\xa3\x66\xa9\x82\x47\x65\x29\xa8\x49\xd7\x5c" + "\x51\x89\x97\x03\x31\x11\x75\x83\x6e\x4e\x80\x2d\x57\x93\x88\xec" + "\x0e\x22\xa8\xde\x50\x99\x2c\xaa\xaf\x60\x3a\x74\xa0\x31\x16\x37" + "\xcd\x8a\x4d\xda\x40\x1d\x0c\xf1\xc4\x7a\xd0\xaa\xf4\xa7\x55\xe3" + "\xa4\xe3\x9d\x27\x4f\x81\xc6\x07\x74\x13\x8e\x4b\xd9\x6c\x33\xba" + "\x28\x8d\xb7\x79\x36\x29\xfc\x98\x91\x29\x87\xe7\xf6\x92\xb8\x7c" + "\xe4\xca\xb7\x21\x49\x8c\x01\x59\xad\x65\x37\x62\x9b\xba\x40\xc1" + "\x79\x87\xe5\x48\x58\xe3\x0e\x3a\xda\x31\x03\x55\x36\x64\x00\xda" + "\x61\x8a\x0a\x93\xdc\x82\xcc\x63\x40\xb5\x46\xde\xf0\x8c\x3f\x6d" + "\x3e\x32\xf2\xe6\x1d\x37\xf0\xd1\x7e\x33\x52\xb6\x97\xc3\x80\x64" + "\xa4\x0d\x5f\x97\xa5\xd8\xa3\x47\x1a\x83\x1f\xd0\x52\x81\xb9\xd9" + "\x7a\x32\xe6\xf1\x3e\x7d\xdc\x01\x5d\xb8\x44\x12\xc0\x1f\x72\x72" + "\x8b\x0e\xfa\x05\x37\x73\xbd\xc4\x06\x67\x18\xd7\xd4\x80\x2c\x2c" + "\x13\x06\xfe\x82\x5b\x65\x88\xe3\x0b\x06\x3c\xe6\xe4\xd0\x8f\x24" + "\x6a\x6a\x4d\x21\x4c\x2d\x05\x76\x12\xf9\xee\xbf\xb5\x5e\xcd\x03" + "\xf0\x5b\x35\x82\xb7\x1d\x7b\xca\xa6\x14\x40\x68\xd2\xa5\x49\x34" + "\x69\xb7\x05\x48\xf9\xdb\x93\xd4\x0b\x45\x8d\xb3\x1e\xa3\xf9\x5d" + "\x8c\x18\xc5\x40\x14\x67\xc5\x40\xbe\x61\x53\x74\x52\x94\x6c\x5e" + "\xc6\xdf\xd0\xe7\xe5\xbd\x4b\xca\x89\xca\xf6\xf4\xc5\x6f\xf6\x87" + "\x9e\x3a\x11\x5a\xa8\xcd\x83\x70\x19\x63\x8a\xaf\x08\xb1\x33\xa9" + "\x2a\xcc\xde\x7f\xd2\x63\xfb\x85\x40\x77\x40\x8f\x9d\xa0\x7c\xed" + "\x8d\xe5\xe5\x31\x05\x75\xf2\x7e\xab\x22\x54\xbf\xfe\xd3\x1f\x45" + 
"\x95\x0d\x6d\x07\x6a\x90\x06\xd6\x45\x97\xc0\x82\x88\xfc\xd8\xd0", + "\xf1\xef\xf4\x8d\x9c\xfa\x92\x10\xd9\x4f\x22\x3f\x2f\x75\xe1\x8b" }, }; gcry_cipher_hd_t hde, hdd; - unsigned char out[1024]; + unsigned char out[2048]; unsigned char tag[16]; int i, keylen; gcry_error_t err = 0; |