ref: c09cd2882c7c15600c74a9c12f104b80203f562c
parent: d51d54442e9d28c95b59b0e633b4b4c2755c3240
author: cinap_lenrek <cinap_lenrek@felloff.net>
date: Wed Nov 29 21:16:27 EST 2017
libsec: unroll portable sha2block functions - unroll the loops - rotate the taps on each step, avoiding copies - simplify boolean formulas for Ch() and Maj() this yields arround 40% throughput increase on 32/64bit archs for sha2_256 and sha2_512 on amd64.
--- a/sys/src/libsec/port/sha2block128.c
+++ b/sys/src/libsec/port/sha2block128.c
@@ -1,8 +1,6 @@
/*
- * sha2_512 block cipher
+ * sha2_512 block cipher - unrolled version
*
- * Implementation straight from Federal Information Processing Standards
- * publication 180-2 (+Change Notice to include SHA-224) August 1, 2002
* note: the following upper and lower case macro names are distinct
* and reflect the functions defined in FIPS pub. 180-2.
*/
@@ -14,8 +12,8 @@
#define sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x) >> 6))
#define SIGMA0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
#define SIGMA1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
-#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
-#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x,y,z) (((x) | (y)) & ((z) | ((x) & (y))))
/*
* first 64 bits of the fractional parts of cube roots of
@@ -41,14 +39,13 @@
0xca273eceea26619cLL, 0xd186b8c721c0c207LL, 0xeada7dd6cde0eb1eLL, 0xf57d4f7fee6ed178LL,
0x06f067aa72176fbaLL, 0x0a637dc5a2c898a6LL, 0x113f9804bef90daeLL, 0x1b710b35131c471bLL,
0x28db77f523047d84LL, 0x32caab7b40c72493LL, 0x3c9ebe0a15c9bebcLL, 0x431d67c49c100d4cLL,
- 0x4cc5d4becb3e42b6LL, 0x597f299cfc657e2aLL, 0x5fcb6fab3ad6faecLL, 0x6c44198c4a475817LL };
+ 0x4cc5d4becb3e42b6LL, 0x597f299cfc657e2aLL, 0x5fcb6fab3ad6faecLL, 0x6c44198c4a475817LL
+};
void
_sha2block128(uchar *p, ulong len, u64int *s)
{
- u64int a, b, c, d, e, f, g, h, t1, t2;
- u64int *kp, *wp;
- u64int w[80];
+ u64int w[16], a, b, c, d, e, f, g, h;
uchar *end;
/* at this point, we have a multiple of 64 bytes */
@@ -62,33 +59,111 @@
g = s[6];
h = s[7];
- for(wp = w; wp < &w[16]; wp++, p += 8)
- wp[0] = ((vlong)p[0])<<56 | ((vlong)p[1])<<48 |
- ((vlong)p[2])<<40 | ((vlong)p[3])<<32 |
- p[4] << 24 | p[5] << 16 | p[6] << 8 | p[7];
- for(; wp < &w[80]; wp++) {
- u64int s0, s1;
+#define STEP(a,b,c,d,e,f,g,h,i) \
+ if(i < 16) { \
+ w[i] = (u64int)(p[0]<<24 | p[1]<<16 | p[2]<<8 | p[3])<<32 | \
+ (p[4]<<24 | p[5]<<16 | p[6]<<8 | p[7]); \
+ p += 8; \
+ } else { \
+ u64int s0, s1; \
+ s1 = sigma1(w[i-2&15]); \
+ s0 = sigma0(w[i-15&15]); \
+ w[i&15] += s1 + w[i-7&15] + s0; \
+ } \
+ h += SIGMA1(e) + Ch(e,f,g) + K512[i] + w[i&15]; \
+ d += h; \
+ h += SIGMA0(a) + Maj(a,b,c);
- s0 = sigma0(wp[-15]);
- s1 = sigma1(wp[-2]);
-// wp[0] = sigma1(wp[-2]) + wp[-7] + sigma0(wp[-15]) + wp[-16];
- wp[0] = s1 + wp[-7] + s0 + wp[-16];
- }
+ STEP(a,b,c,d,e,f,g,h,0);
+ STEP(h,a,b,c,d,e,f,g,1);
+ STEP(g,h,a,b,c,d,e,f,2);
+ STEP(f,g,h,a,b,c,d,e,3);
+ STEP(e,f,g,h,a,b,c,d,4);
+ STEP(d,e,f,g,h,a,b,c,5);
+ STEP(c,d,e,f,g,h,a,b,6);
+ STEP(b,c,d,e,f,g,h,a,7);
- for(kp = K512, wp = w; wp < &w[80]; ) {
- t1 = h + SIGMA1(e) + Ch(e,f,g) + *kp++ + *wp++;
- t2 = SIGMA0(a) + Maj(a,b,c);
- h = g;
- g = f;
- f = e;
- e = d + t1;
- d = c;
- c = b;
- b = a;
- a = t1 + t2;
- }
+ STEP(a,b,c,d,e,f,g,h,8);
+ STEP(h,a,b,c,d,e,f,g,9);
+ STEP(g,h,a,b,c,d,e,f,10);
+ STEP(f,g,h,a,b,c,d,e,11);
+ STEP(e,f,g,h,a,b,c,d,12);
+ STEP(d,e,f,g,h,a,b,c,13);
+ STEP(c,d,e,f,g,h,a,b,14);
+ STEP(b,c,d,e,f,g,h,a,15);
- /* save state */
+ STEP(a,b,c,d,e,f,g,h,16);
+ STEP(h,a,b,c,d,e,f,g,17);
+ STEP(g,h,a,b,c,d,e,f,18);
+ STEP(f,g,h,a,b,c,d,e,19);
+ STEP(e,f,g,h,a,b,c,d,20);
+ STEP(d,e,f,g,h,a,b,c,21);
+ STEP(c,d,e,f,g,h,a,b,22);
+ STEP(b,c,d,e,f,g,h,a,23);
+
+ STEP(a,b,c,d,e,f,g,h,24);
+ STEP(h,a,b,c,d,e,f,g,25);
+ STEP(g,h,a,b,c,d,e,f,26);
+ STEP(f,g,h,a,b,c,d,e,27);
+ STEP(e,f,g,h,a,b,c,d,28);
+ STEP(d,e,f,g,h,a,b,c,29);
+ STEP(c,d,e,f,g,h,a,b,30);
+ STEP(b,c,d,e,f,g,h,a,31);
+
+ STEP(a,b,c,d,e,f,g,h,32);
+ STEP(h,a,b,c,d,e,f,g,33);
+ STEP(g,h,a,b,c,d,e,f,34);
+ STEP(f,g,h,a,b,c,d,e,35);
+ STEP(e,f,g,h,a,b,c,d,36);
+ STEP(d,e,f,g,h,a,b,c,37);
+ STEP(c,d,e,f,g,h,a,b,38);
+ STEP(b,c,d,e,f,g,h,a,39);
+
+ STEP(a,b,c,d,e,f,g,h,40);
+ STEP(h,a,b,c,d,e,f,g,41);
+ STEP(g,h,a,b,c,d,e,f,42);
+ STEP(f,g,h,a,b,c,d,e,43);
+ STEP(e,f,g,h,a,b,c,d,44);
+ STEP(d,e,f,g,h,a,b,c,45);
+ STEP(c,d,e,f,g,h,a,b,46);
+ STEP(b,c,d,e,f,g,h,a,47);
+
+ STEP(a,b,c,d,e,f,g,h,48);
+ STEP(h,a,b,c,d,e,f,g,49);
+ STEP(g,h,a,b,c,d,e,f,50);
+ STEP(f,g,h,a,b,c,d,e,51);
+ STEP(e,f,g,h,a,b,c,d,52);
+ STEP(d,e,f,g,h,a,b,c,53);
+ STEP(c,d,e,f,g,h,a,b,54);
+ STEP(b,c,d,e,f,g,h,a,55);
+
+ STEP(a,b,c,d,e,f,g,h,56);
+ STEP(h,a,b,c,d,e,f,g,57);
+ STEP(g,h,a,b,c,d,e,f,58);
+ STEP(f,g,h,a,b,c,d,e,59);
+ STEP(e,f,g,h,a,b,c,d,60);
+ STEP(d,e,f,g,h,a,b,c,61);
+ STEP(c,d,e,f,g,h,a,b,62);
+ STEP(b,c,d,e,f,g,h,a,63);
+
+ STEP(a,b,c,d,e,f,g,h,64);
+ STEP(h,a,b,c,d,e,f,g,65);
+ STEP(g,h,a,b,c,d,e,f,66);
+ STEP(f,g,h,a,b,c,d,e,67);
+ STEP(e,f,g,h,a,b,c,d,68);
+ STEP(d,e,f,g,h,a,b,c,69);
+ STEP(c,d,e,f,g,h,a,b,70);
+ STEP(b,c,d,e,f,g,h,a,71);
+
+ STEP(a,b,c,d,e,f,g,h,72);
+ STEP(h,a,b,c,d,e,f,g,73);
+ STEP(g,h,a,b,c,d,e,f,74);
+ STEP(f,g,h,a,b,c,d,e,75);
+ STEP(e,f,g,h,a,b,c,d,76);
+ STEP(d,e,f,g,h,a,b,c,77);
+ STEP(c,d,e,f,g,h,a,b,78);
+ STEP(b,c,d,e,f,g,h,a,79);
+
s[0] += a;
s[1] += b;
s[2] += c;
--- a/sys/src/libsec/port/sha2block64.c
+++ b/sys/src/libsec/port/sha2block64.c
@@ -1,8 +1,6 @@
/*
- * sha2_256 block cipher
+ * sha2_256 block cipher - unrolled version
*
- * Implementation straight from Federal Information Processing Standards
- * publication 180-2 (+Change Notice to include SHA-224) August 1, 2002
* note: the following upper and lower case macro names are distinct
* and reflect the functions defined in FIPS pub. 180-2.
*/
@@ -15,8 +13,8 @@
#define sigma1(x) (ROTR((x),17) ^ ROTR((x),19) ^ ((x) >> 10))
#define SIGMA0(x) (ROTR((x),2) ^ ROTR((x),13) ^ ROTR((x),22))
#define SIGMA1(x) (ROTR((x),6) ^ ROTR((x),11) ^ ROTR((x),25))
-#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
-#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
+#define Maj(x,y,z) (((x) | (y)) & ((z) | ((x) & (y))))
/*
* first 32 bits of the fractional parts of cube roots of
@@ -44,9 +42,7 @@
void
_sha2block64(uchar *p, ulong len, u32int *s)
{
- u32int a, b, c, d, e, f, g, h, t1, t2;
- u32int *kp, *wp;
- u32int w[64];
+ u32int w[16], a, b, c, d, e, f, g, h;
uchar *end;
/* at this point, we have a multiple of 64 bytes */
@@ -60,26 +56,89 @@
g = s[6];
h = s[7];
- for(wp = w; wp < &w[16]; wp++, p += 4)
- wp[0] = p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
- for(; wp < &w[64]; wp++)
- wp[0] = sigma1(wp[-2]) + wp[-7] +
- sigma0(wp[-15]) + wp[-16];
+#define STEP(a,b,c,d,e,f,g,h,i) \
+ if(i < 16) {\
+ w[i] = p[0]<<24 | p[1]<<16 | p[2]<<8 | p[3]; \
+ p += 4; \
+ } else { \
+ w[i&15] += sigma1(w[i-2&15]) + w[i-7&15] + sigma0(w[i-15&15]); \
+ } \
+ h += SIGMA1(e) + Ch(e,f,g) + K256[i] + w[i&15]; \
+ d += h; \
+ h += SIGMA0(a) + Maj(a,b,c);
- for(kp = K256, wp = w; wp < &w[64]; ) {
- t1 = h + SIGMA1(e) + Ch(e,f,g) + *kp++ + *wp++;
- t2 = SIGMA0(a) + Maj(a,b,c);
- h = g;
- g = f;
- f = e;
- e = d + t1;
- d = c;
- c = b;
- b = a;
- a = t1 + t2;
- }
+ STEP(a,b,c,d,e,f,g,h,0);
+ STEP(h,a,b,c,d,e,f,g,1);
+ STEP(g,h,a,b,c,d,e,f,2);
+ STEP(f,g,h,a,b,c,d,e,3);
+ STEP(e,f,g,h,a,b,c,d,4);
+ STEP(d,e,f,g,h,a,b,c,5);
+ STEP(c,d,e,f,g,h,a,b,6);
+ STEP(b,c,d,e,f,g,h,a,7);
- /* save state */
+ STEP(a,b,c,d,e,f,g,h,8);
+ STEP(h,a,b,c,d,e,f,g,9);
+ STEP(g,h,a,b,c,d,e,f,10);
+ STEP(f,g,h,a,b,c,d,e,11);
+ STEP(e,f,g,h,a,b,c,d,12);
+ STEP(d,e,f,g,h,a,b,c,13);
+ STEP(c,d,e,f,g,h,a,b,14);
+ STEP(b,c,d,e,f,g,h,a,15);
+
+ STEP(a,b,c,d,e,f,g,h,16);
+ STEP(h,a,b,c,d,e,f,g,17);
+ STEP(g,h,a,b,c,d,e,f,18);
+ STEP(f,g,h,a,b,c,d,e,19);
+ STEP(e,f,g,h,a,b,c,d,20);
+ STEP(d,e,f,g,h,a,b,c,21);
+ STEP(c,d,e,f,g,h,a,b,22);
+ STEP(b,c,d,e,f,g,h,a,23);
+
+ STEP(a,b,c,d,e,f,g,h,24);
+ STEP(h,a,b,c,d,e,f,g,25);
+ STEP(g,h,a,b,c,d,e,f,26);
+ STEP(f,g,h,a,b,c,d,e,27);
+ STEP(e,f,g,h,a,b,c,d,28);
+ STEP(d,e,f,g,h,a,b,c,29);
+ STEP(c,d,e,f,g,h,a,b,30);
+ STEP(b,c,d,e,f,g,h,a,31);
+
+ STEP(a,b,c,d,e,f,g,h,32);
+ STEP(h,a,b,c,d,e,f,g,33);
+ STEP(g,h,a,b,c,d,e,f,34);
+ STEP(f,g,h,a,b,c,d,e,35);
+ STEP(e,f,g,h,a,b,c,d,36);
+ STEP(d,e,f,g,h,a,b,c,37);
+ STEP(c,d,e,f,g,h,a,b,38);
+ STEP(b,c,d,e,f,g,h,a,39);
+
+ STEP(a,b,c,d,e,f,g,h,40);
+ STEP(h,a,b,c,d,e,f,g,41);
+ STEP(g,h,a,b,c,d,e,f,42);
+ STEP(f,g,h,a,b,c,d,e,43);
+ STEP(e,f,g,h,a,b,c,d,44);
+ STEP(d,e,f,g,h,a,b,c,45);
+ STEP(c,d,e,f,g,h,a,b,46);
+ STEP(b,c,d,e,f,g,h,a,47);
+
+ STEP(a,b,c,d,e,f,g,h,48);
+ STEP(h,a,b,c,d,e,f,g,49);
+ STEP(g,h,a,b,c,d,e,f,50);
+ STEP(f,g,h,a,b,c,d,e,51);
+ STEP(e,f,g,h,a,b,c,d,52);
+ STEP(d,e,f,g,h,a,b,c,53);
+ STEP(c,d,e,f,g,h,a,b,54);
+ STEP(b,c,d,e,f,g,h,a,55);
+
+ STEP(a,b,c,d,e,f,g,h,56);
+ STEP(h,a,b,c,d,e,f,g,57);
+ STEP(g,h,a,b,c,d,e,f,58);
+ STEP(f,g,h,a,b,c,d,e,59);
+ STEP(e,f,g,h,a,b,c,d,60);
+ STEP(d,e,f,g,h,a,b,c,61);
+ STEP(c,d,e,f,g,h,a,b,62);
+ STEP(b,c,d,e,f,g,h,a,63);
+
s[0] += a;
s[1] += b;
s[2] += c;