ref: 077e719dfbf9bf2582bed80026251cc0d108c16e
parent: 1eb373945455f1ba03fa1b221529d74ca2a778ad
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Sun Nov 19 19:10:35 EST 2017
libsec: write optimized _chachablock() function for amd64 / sse2 doing 4 quarterrounds in parallel using 128-bit vector registers. for the second round, shuffle the columns and then shuffle back. the code is rather obvious. the only trick here is that for the first quarterround PSHUFLW/PSHUFHW is used to swap the halfwords for the <<<16 rotation.
--- a/sys/src/ape/lib/sec/amd64/mkfile
+++ b/sys/src/ape/lib/sec/amd64/mkfile
@@ -3,6 +3,7 @@
LIB=/$objtype/lib/ape/libsec.a
FILES=\
+ chachablock\
md5block\
sha1block\
aesni\
--- a/sys/src/ape/lib/sec/port/mkfile
+++ b/sys/src/ape/lib/sec/port/mkfile
@@ -11,7 +11,7 @@
sha1pickle.c md5pickle.c\
poly1305.c\
rc4.c\
- chacha.c\
+ chacha.c chachablock.c\
salsa.c\
genrandom.c prng.c fastrand.c nfastrand.c\
probably_prime.c smallprimetest.c genprime.c dsaprimes.c\
--- /dev/null
+++ b/sys/src/libsec/amd64/chachablock.s
@@ -1,0 +1,74 @@
+/*
+ * ROTATE(n, v1, v2): leave in v2 the value of v1 rotated left by n
+ * bits within each 32-bit word.  v1 is copied to v2, v1 is shifted
+ * left by n, v2 is shifted right by 32-n, and the two halves are
+ * OR-ed into v2.  v1 is clobbered (it ends up holding only the
+ * left-shifted half).
+ */
+#define ROTATE(n, v1, v2) \
+ MOVO v1, v2; \
+ PSLLL $(n), v1; \
+ PSRLL $(32-n), v2; \
+ POR v1, v2
+
+/*
+ * void _chachablock(u32int x[16], int rounds)
+ *
+ * Run the ChaCha rounds in place on the 16-word state x, which is
+ * addressed through RARG.  X0..X3 hold the four 4-word rows of the
+ * state and X4 is scratch, so every vector instruction performs the
+ * same quarter round step on four columns at once.  Each pass of
+ * the loop does a column round, rotates rows 1..3 so the diagonals
+ * line up as columns, does a second round, and rotates the rows back.
+ *
+ * NOTE(review): the pass count is rounds>>1 and is only tested after
+ * the decrement, so rounds of 0 or 1 does not skip the loop and an
+ * odd rounds does one pass fewer than the portable C version (which
+ * loops while rounds > 0, stepping by 2).  Confirm that callers only
+ * pass positive even values.
+ */
+TEXT _chachablock(SB), 0, $0
+ MOVOU 0(RARG), X0 /* X0 = x[0..3] */
+ MOVOU 16(RARG), X1 /* X1 = x[4..7] */
+ MOVOU 32(RARG), X2 /* X2 = x[8..11] */
+ MOVOU 48(RARG), X3 /* X3 = x[12..15] */
+
+ MOVL rounds+8(FP), CX
+ SHRL $1, CX /* CX = rounds/2; one loop pass covers two rounds */
+
+_loop:
+ /* first round: a=X0, b=X1, c=X2, d=X3, one column per 32-bit word */
+ PADDL X1, X0 /* a += b */
+ PXOR X0, X3 /* d ^= a */
+ /* ROTATE(16, X3, X3): swap the two halfwords of every word of d */
+ PSHUFLW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
+ PSHUFHW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
+
+ PADDL X3, X2 /* c += d */
+ MOVO X1, X4
+ PXOR X2, X4 /* X4 = b ^ c */
+ ROTATE(12, X4, X1) /* b = (b ^ c) <<< 12 */
+
+ PADDL X1, X0 /* a += b */
+ MOVO X0, X4
+ PXOR X3, X4 /* X4 = d ^ a */
+ ROTATE(8, X4, X3) /* d = (d ^ a) <<< 8 */
+
+ PADDL X3, X2 /* c += d */
+ MOVO X1, X4
+ PXOR X2, X4 /* X4 = b ^ c */
+ ROTATE(7, X4, X1) /* b = (b ^ c) <<< 7 */
+
+ /* rotate the words of rows b, c, d by 1, 2, 3 positions: diagonals become columns */
+ PSHUFL $(1<<0 | 2<<2 | 3<<4 | 0<<6), X1, X1
+ PSHUFL $(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2
+ PSHUFL $(3<<0 | 0<<2 | 1<<4 | 2<<6), X3, X3
+
+ /* second round: same steps as above, now acting on the diagonals */
+ PADDL X1, X0 /* a += b */
+ PXOR X0, X3 /* d ^= a */
+ /* ROTATE(16, X3, X3): swap the two halfwords of every word of d */
+ PSHUFLW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
+ PSHUFHW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3
+
+ PADDL X3, X2 /* c += d */
+ MOVO X1, X4
+ PXOR X2, X4 /* X4 = b ^ c */
+ ROTATE(12, X4, X1) /* b = (b ^ c) <<< 12 */
+
+ PADDL X1, X0 /* a += b */
+ MOVO X0, X4
+ PXOR X3, X4 /* X4 = d ^ a */
+ ROTATE(8, X4, X3) /* d = (d ^ a) <<< 8 */
+
+ PADDL X3, X2 /* c += d */
+ MOVO X1, X4
+ PXOR X2, X4 /* X4 = b ^ c */
+ ROTATE(7, X4, X1) /* b = (b ^ c) <<< 7 */
+
+ /* undo the row rotation (3, 2, 1 positions) to restore column order */
+ PSHUFL $(3<<0 | 0<<2 | 1<<4 | 2<<6), X1, X1
+ PSHUFL $(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2
+ PSHUFL $(1<<0 | 2<<2 | 3<<4 | 0<<6), X3, X3
+
+ DECL CX
+ JNE _loop /* repeat until the pass counter reaches zero */
+
+ /* write the four rows back over the caller's state */
+ MOVOU X0, 0(RARG)
+ MOVOU X1, 16(RARG)
+ MOVOU X2, 32(RARG)
+ MOVOU X3, 48(RARG)
+ RET
--- a/sys/src/libsec/amd64/mkfile
+++ b/sys/src/libsec/amd64/mkfile
@@ -3,6 +3,7 @@
LIB=/$objtype/lib/libsec.a
FILES=\
+ chachablock\
md5block\
sha1block\
aesni\
--- a/sys/src/libsec/port/chacha.c
+++ b/sys/src/libsec/port/chacha.c
@@ -10,26 +10,13 @@
#include "os.h"
#include <libsec.h>
-enum{
- Blockwords= ChachaBsize/sizeof(u32int)
-};
+/* from chachablock.$O */
+extern void _chachablock(u32int x[16], int rounds);
/* little-endian data order */
#define GET4(p) ((p)[0]|((p)[1]<<8)|((p)[2]<<16)|((p)[3]<<24))
#define PUT4(p,v) (p)[0]=(v);(p)[1]=(v)>>8;(p)[2]=(v)>>16;(p)[3]=(v)>>24
-#define ROTATE(v,c) ((u32int)((v) << (c)) | ((v) >> (32 - (c))))
-
-#define QUARTERROUND(ia,ib,ic,id) { \
- u32int a, b, c, d, t; \
- a = x[ia]; b = x[ib]; c = x[ic]; d = x[id]; \
- a += b; t = d^a; d = ROTATE(t,16); \
- c += d; t = b^c; b = ROTATE(t,12); \
- a += b; t = d^a; d = ROTATE(t, 8); \
- c += d; t = b^c; b = ROTATE(t, 7); \
- x[ia] = a; x[ib] = b; x[ic] = c; x[id] = d; \
-}
-
#define ENCRYPT(s, x, y, d) {\
u32int v; \
v = GET4(s); \
@@ -88,22 +75,6 @@
}
static void
-dorounds(u32int x[Blockwords], int rounds)
-{
- for(; rounds > 0; rounds -= 2) {
- QUARTERROUND(0, 4, 8,12)
- QUARTERROUND(1, 5, 9,13)
- QUARTERROUND(2, 6,10,14)
- QUARTERROUND(3, 7,11,15)
-
- QUARTERROUND(0, 5,10,15)
- QUARTERROUND(1, 6,11,12)
- QUARTERROUND(2, 7, 8,13)
- QUARTERROUND(3, 4, 9,14)
- }
-}
-
-static void
hchachablock(uchar h[32], Chachastate *s)
{
u32int x[16];
@@ -125,7 +96,7 @@
x[14] = s->input[14];
x[15] = s->input[15];
- dorounds(x, s->rounds);
+ _chachablock(x, s->rounds);
PUT4(h+0*4, x[0]);
PUT4(h+1*4, x[1]);
@@ -183,7 +154,7 @@
static void
encryptblock(Chachastate *s, uchar *src, uchar *dst)
{
- u32int x[Blockwords];
+ u32int x[16];
int i;
x[0] = s->input[0];
@@ -202,7 +173,7 @@
x[13] = s->input[13];
x[14] = s->input[14];
x[15] = s->input[15];
- dorounds(x, s->rounds);
+ _chachablock(x, s->rounds);
for(i=0; i<nelem(x); i+=4){
ENCRYPT(src, x[i], s->input[i], dst);
--- /dev/null
+++ b/sys/src/libsec/port/chachablock.c
@@ -1,0 +1,29 @@
+#include "os.h"
+
+/* rotate the 32-bit word v left by c bits */
+#define ROTATE(v,c) (((v) >> (32 - (c))) | (u32int)((v) << (c)))
+
+/*
+ * One ChaCha quarter round on the words x[ia], x[ib], x[ic], x[id]
+ * of the array x visible at the point of use: four add/xor/rotate
+ * steps with rotation counts 16, 12, 8 and 7.  The results are
+ * stored back into the same four slots.
+ */
+#define QUARTERROUND(ia,ib,ic,id) { \
+	u32int qa, qb, qc, qd; \
+	qa = x[ia]; \
+	qb = x[ib]; \
+	qc = x[ic]; \
+	qd = x[id]; \
+	qa += qb; qd ^= qa; qd = ROTATE(qd,16); \
+	qc += qd; qb ^= qc; qb = ROTATE(qb,12); \
+	qa += qb; qd ^= qa; qd = ROTATE(qd, 8); \
+	qc += qd; qb ^= qc; qb = ROTATE(qb, 7); \
+	x[ia] = qa; \
+	x[ib] = qb; \
+	x[ic] = qc; \
+	x[id] = qd; \
+}
+
+/*
+ * Portable ChaCha block function: apply the rounds in place to the
+ * 16-word state x.  Every pass of the loop performs two rounds (four
+ * column quarter rounds, then four diagonal quarter rounds) and the
+ * loop runs for as long as the remaining round count is positive.
+ */
+void
+_chachablock(u32int x[16], int rounds)
+{
+	int left;
+
+	left = rounds;
+	while(left > 0){
+		/* columns */
+		QUARTERROUND(0, 4, 8, 12)
+		QUARTERROUND(1, 5, 9, 13)
+		QUARTERROUND(2, 6, 10, 14)
+		QUARTERROUND(3, 7, 11, 15)
+
+		/* diagonals */
+		QUARTERROUND(0, 5, 10, 15)
+		QUARTERROUND(1, 6, 11, 12)
+		QUARTERROUND(2, 7, 8, 13)
+		QUARTERROUND(3, 4, 9, 14)
+
+		left -= 2;
+	}
+}
--- a/sys/src/libsec/port/mkfile
+++ b/sys/src/libsec/port/mkfile
@@ -10,7 +10,7 @@
sha1pickle.c md5pickle.c\
poly1305.c\
rc4.c\
- chacha.c\
+ chacha.c chachablock.c\
salsa.c\
genrandom.c prng.c fastrand.c nfastrand.c\
probably_prime.c smallprimetest.c genprime.c dsaprimes.c\