shithub: blake2

Download patch

ref: 259e61dedee5383eac1a90db6ef88f9ccdcf6002
parent: e971a0428f61a2d07f253b979eaf24b85e3cda50
author: Samuel Neves <sneves@dei.uc.pt>
date: Sat Jun 11 06:29:09 EDT 2016

api cleanup

--- a/ref/blake2-impl.h
+++ b/ref/blake2-impl.h
@@ -12,9 +12,8 @@
    More information about the BLAKE2 hash function can be found at
    https://blake2.net.
 */
-#pragma once
-#ifndef __BLAKE2_IMPL_H__
-#define __BLAKE2_IMPL_H__
+#ifndef BLAKE2_IMPL_H
+#define BLAKE2_IMPL_H
 
 #include <stdint.h>
 #include <string.h>
--- a/ref/blake2.h
+++ b/ref/blake2.h
@@ -12,9 +12,8 @@
    More information about the BLAKE2 hash function can be found at
    https://blake2.net.
 */
-#pragma once
-#ifndef __BLAKE2_H__
-#define __BLAKE2_H__
+#ifndef BLAKE2_H
+#define BLAKE2_H
 
 #include <stddef.h>
 #include <stdint.h>
@@ -47,44 +46,44 @@
     BLAKE2B_PERSONALBYTES = 16
   };
 
-  typedef struct __blake2s_state
+  typedef struct blake2s_state__
   {
     uint32_t h[8];
     uint32_t t[2];
     uint32_t f[2];
-    uint8_t  buf[2 * BLAKE2S_BLOCKBYTES];
-    size_t   buflen;
+    uint8_t  buf[BLAKE2S_BLOCKBYTES];
+    uint32_t buflen;
     uint8_t  last_node;
   } blake2s_state;
 
-  typedef struct __blake2b_state
+  typedef struct blake2b_state__
   {
     uint64_t h[8];
     uint64_t t[2];
     uint64_t f[2];
-    uint8_t  buf[2 * BLAKE2B_BLOCKBYTES];
-    size_t   buflen;
+    uint8_t  buf[BLAKE2B_BLOCKBYTES];
+    uint32_t buflen;
     uint8_t  last_node;
   } blake2b_state;
 
-  typedef struct __blake2sp_state
+  typedef struct blake2sp_state__
   {
     blake2s_state S[8][1];
     blake2s_state R[1];
-    uint8_t buf[8 * BLAKE2S_BLOCKBYTES];
-    size_t  buflen;
+    uint8_t       buf[8 * BLAKE2S_BLOCKBYTES];
+    uint32_t      buflen;
   } blake2sp_state;
 
-  typedef struct __blake2bp_state
+  typedef struct blake2bp_state__
   {
     blake2b_state S[4][1];
     blake2b_state R[1];
-    uint8_t buf[4 * BLAKE2B_BLOCKBYTES];
-    size_t  buflen;
+    uint8_t       buf[4 * BLAKE2B_BLOCKBYTES];
+    uint32_t      buflen;
   } blake2bp_state;
 
 
-  BLAKE2_PACKED(struct __blake2s_param
+  BLAKE2_PACKED(struct blake2s_param__
   {
     uint8_t  digest_length; /* 1 */
     uint8_t  key_length;    /* 2 */
@@ -99,9 +98,9 @@
     uint8_t  personal[BLAKE2S_PERSONALBYTES];  /* 32 */
   });
 
-  typedef struct __blake2s_param blake2s_param;
+  typedef struct blake2s_param__ blake2s_param;
 
-  BLAKE2_PACKED(struct __blake2b_param
+  BLAKE2_PACKED(struct blake2b_param__
   {
     uint8_t  digest_length; /* 1 */
     uint8_t  key_length;    /* 2 */
@@ -116,7 +115,7 @@
     uint8_t  personal[BLAKE2B_PERSONALBYTES];  /* 64 */
   });
 
-  typedef struct __blake2b_param blake2b_param;
+  typedef struct blake2b_param__ blake2b_param;
 
   /* Padded structs result in a compile-time error */
   enum {
@@ -125,37 +124,37 @@
   };
 
   /* Streaming API */
-  int blake2s_init( blake2s_state *S, const uint8_t outlen );
-  int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2s_init( blake2s_state *S, size_t outlen );
+  int blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen );
   int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
-  int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen );
-  int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen );
+  int blake2s_update( blake2s_state *S, const void *in, size_t inlen );
+  int blake2s_final( blake2s_state *S, void *out, size_t outlen );
 
-  int blake2b_init( blake2b_state *S, const uint8_t outlen );
-  int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
+  int blake2b_init( blake2b_state *S, size_t outlen );
+  int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen );
   int blake2b_init_param( blake2b_state *S, const blake2b_param *P );
-  int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen );
-  int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen );
+  int blake2b_update( blake2b_state *S, const void *in, size_t inlen );
+  int blake2b_final( blake2b_state *S, void *out, size_t outlen );
 
-  int blake2sp_init( blake2sp_state *S, const uint8_t outlen );
-  int blake2sp_init_key( blake2sp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
-  int blake2sp_update( blake2sp_state *S, const uint8_t *in, uint64_t inlen );
-  int blake2sp_final( blake2sp_state *S, uint8_t *out, uint8_t outlen );
+  int blake2sp_init( blake2sp_state *S, size_t outlen );
+  int blake2sp_init_key( blake2sp_state *S, size_t outlen, const void *key, size_t keylen );
+  int blake2sp_update( blake2sp_state *S, const void *in, size_t inlen );
+  int blake2sp_final( blake2sp_state *S, void *out, size_t outlen );
 
-  int blake2bp_init( blake2bp_state *S, const uint8_t outlen );
-  int blake2bp_init_key( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
-  int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
-  int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen );
+  int blake2bp_init( blake2bp_state *S, size_t outlen );
+  int blake2bp_init_key( blake2bp_state *S, size_t outlen, const void *key, size_t keylen );
+  int blake2bp_update( blake2bp_state *S, const void *in, size_t inlen );
+  int blake2bp_final( blake2bp_state *S, void *out, size_t outlen );
 
   /* Simple API */
-  int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
-  int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+  int blake2s( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+  int blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
 
-  int blake2sp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
-  int blake2bp( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+  int blake2sp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
+  int blake2bp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
 
   /* This is simply an alias for blake2b */
-  int blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
+  int blake2( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen );
 
 #if defined(__cplusplus)
 }
--- a/ref/blake2b-ref.c
+++ b/ref/blake2b-ref.c
@@ -170,7 +170,7 @@
 
 
 
-int blake2b_init( blake2b_state *S, const uint8_t outlen )
+int blake2b_init( blake2b_state *S, size_t outlen )
 {
   blake2b_param P[1];
 
@@ -191,7 +191,7 @@
 }
 
 
-int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
+int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen )
 {
   blake2b_param P[1];
 
@@ -199,8 +199,8 @@
 
   if ( !key || !keylen || keylen > BLAKE2B_KEYBYTES ) return -1;
 
-  P->digest_length = outlen;
-  P->key_length    = keylen;
+  P->digest_length = (uint8_t)outlen;
+  P->key_length    = (uint8_t)keylen;
   P->fanout        = 1;
   P->depth         = 1;
   store32( &P->leaf_length, 0 );
@@ -223,48 +223,53 @@
   return 0;
 }
 
-static int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
+#define G(r,i,a,b,c,d)                      \
+  do {                                      \
+    a = a + b + m[blake2b_sigma[r][2*i+0]]; \
+    d = rotr64(d ^ a, 32);                  \
+    c = c + d;                              \
+    b = rotr64(b ^ c, 24);                  \
+    a = a + b + m[blake2b_sigma[r][2*i+1]]; \
+    d = rotr64(d ^ a, 16);                  \
+    c = c + d;                              \
+    b = rotr64(b ^ c, 63);                  \
+  } while(0)
+
+#define ROUND(r)                    \
+  do {                              \
+    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+    G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+    G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+    G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+    G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+  } while(0)
+
+static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
 {
   uint64_t m[16];
   uint64_t v[16];
   int i;
 
-  for( i = 0; i < 16; ++i )
+  for( i = 0; i < 16; ++i ) {
     m[i] = load64( block + i * sizeof( m[i] ) );
+  }
 
-  for( i = 0; i < 8; ++i )
+  for( i = 0; i < 8; ++i ) {
     v[i] = S->h[i];
+  }
 
   v[ 8] = blake2b_IV[0];
   v[ 9] = blake2b_IV[1];
   v[10] = blake2b_IV[2];
   v[11] = blake2b_IV[3];
-  v[12] = S->t[0] ^ blake2b_IV[4];
-  v[13] = S->t[1] ^ blake2b_IV[5];
-  v[14] = S->f[0] ^ blake2b_IV[6];
-  v[15] = S->f[1] ^ blake2b_IV[7];
-#define G(r,i,a,b,c,d) \
-  do { \
-    a = a + b + m[blake2b_sigma[r][2*i+0]]; \
-    d = rotr64(d ^ a, 32); \
-    c = c + d; \
-    b = rotr64(b ^ c, 24); \
-    a = a + b + m[blake2b_sigma[r][2*i+1]]; \
-    d = rotr64(d ^ a, 16); \
-    c = c + d; \
-    b = rotr64(b ^ c, 63); \
-  } while(0)
-#define ROUND(r)  \
-  do { \
-    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
-    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
-    G(r,2,v[ 2],v[ 6],v[10],v[14]); \
-    G(r,3,v[ 3],v[ 7],v[11],v[15]); \
-    G(r,4,v[ 0],v[ 5],v[10],v[15]); \
-    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
-    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
-    G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
-  } while(0)
+  v[12] = blake2b_IV[4] ^ S->t[0];
+  v[13] = blake2b_IV[5] ^ S->t[1];
+  v[14] = blake2b_IV[6] ^ S->f[0];
+  v[15] = blake2b_IV[7] ^ S->f[1];
+
   ROUND( 0 );
   ROUND( 1 );
   ROUND( 2 );
@@ -278,47 +283,42 @@
   ROUND( 10 );
   ROUND( 11 );
 
-  for( i = 0; i < 8; ++i )
+  for( i = 0; i < 8; ++i ) {
     S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
+  }
+}
 
 #undef G
 #undef ROUND
-  return 0;
-}
 
-/* inlen now in bytes */
-int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen )
+int blake2b_update( blake2b_state *S, const void *pin, size_t inlen )
 {
-  while( inlen > 0 )
+  const unsigned char * in = (const unsigned char *)pin;
+  if( inlen > 0 )
   {
     size_t left = S->buflen;
-    size_t fill = 2 * BLAKE2B_BLOCKBYTES - left;
-
+    size_t fill = BLAKE2B_BLOCKBYTES - left;
     if( inlen > fill )
     {
+      S->buflen = 0;
       memcpy( S->buf + left, in, fill ); /* Fill buffer */
-      S->buflen += fill;
       blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
       blake2b_compress( S, S->buf ); /* Compress */
-      memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); /* Shift buffer left */
-      S->buflen -= BLAKE2B_BLOCKBYTES;
-      in += fill;
-      inlen -= fill;
+      in += fill; inlen -= fill;
+      while(inlen > BLAKE2B_BLOCKBYTES) {
+        blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
+        blake2b_compress( S, in );
+        in += BLAKE2B_BLOCKBYTES;
+        inlen -= BLAKE2B_BLOCKBYTES;
+      }
     }
-    else /* inlen <= fill */
-    {
-      memcpy( S->buf + left, in, inlen );
-      S->buflen += inlen; /* Be lazy, do not compress */
-      in += inlen;
-      inlen -= inlen;
-    }
+    memcpy( S->buf + S->buflen, in, inlen );
+    S->buflen += (uint32_t)inlen;
   }
-
   return 0;
 }
 
-/* Is this correct? */
-int blake2b_final( blake2b_state *S, uint8_t *out, uint8_t outlen )
+int blake2b_final( blake2b_state *S, void *out, size_t outlen )
 {
   uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
   int i;
@@ -329,17 +329,9 @@
   if( blake2b_is_lastblock( S ) )
     return -1;
 
-  if( S->buflen > BLAKE2B_BLOCKBYTES )
-  {
-    blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
-    blake2b_compress( S, S->buf );
-    S->buflen -= BLAKE2B_BLOCKBYTES;
-    memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, S->buflen );
-  }
-
   blake2b_increment_counter( S, S->buflen );
   blake2b_set_lastblock( S );
-  memset( S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
+  memset( S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
   blake2b_compress( S, S->buf );
 
   for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
@@ -346,11 +338,12 @@
     store64( buffer + sizeof( S->h[i] ) * i, S->h[i] );
 
   memcpy( out, buffer, outlen );
+  secure_zero_memory(buffer, sizeof(buffer));
   return 0;
 }
 
 /* inlen, at least, should be uint64_t. Others can be size_t. */
-int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
+int blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
 {
   blake2b_state S[1];
 
@@ -379,14 +372,14 @@
   return 0;
 }
 
-int blake2( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen ) {
-  return blake2b(out, in, key, outlen, inlen, keylen);
+int blake2( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen ) {
+  return blake2b(out, outlen, in, inlen, key, keylen);
 }
 
 #if defined(SUPERCOP)
 int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
 {
-  return blake2b( out, in, NULL, BLAKE2B_OUTBYTES, inlen, 0 );
+  return blake2b( out, BLAKE2B_OUTBYTES, in, inlen, NULL, 0 );
 }
 #endif
 
@@ -396,19 +389,19 @@
 int main( void )
 {
   uint8_t key[BLAKE2B_KEYBYTES];
-  uint8_t buf[KAT_LENGTH];
+  uint8_t buf[BLAKE2_KAT_LENGTH];
   size_t i;
 
   for( i = 0; i < BLAKE2B_KEYBYTES; ++i )
     key[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
     buf[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
   {
     uint8_t hash[BLAKE2B_OUTBYTES];
-    blake2b( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
+    blake2b( hash, BLAKE2B_OUTBYTES, buf, i, key, BLAKE2B_KEYBYTES );
 
     if( 0 != memcmp( hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES ) )
     {
--- a/ref/blake2bp-ref.c
+++ b/ref/blake2bp-ref.c
@@ -27,11 +27,11 @@
 
 #define PARALLELISM_DEGREE 4
 
-static int blake2bp_init_leaf( blake2b_state *S, uint8_t outlen, uint8_t keylen, uint64_t offset )
+static int blake2bp_init_leaf( blake2b_state *S, size_t outlen, size_t keylen, uint64_t offset )
 {
   blake2b_param P[1];
-  P->digest_length = outlen;
-  P->key_length = keylen;
+  P->digest_length = (uint8_t)outlen;
+  P->key_length = (uint8_t)keylen;
   P->fanout = PARALLELISM_DEGREE;
   P->depth = 2;
   store32( &P->leaf_length, 0 );
@@ -44,11 +44,11 @@
   return blake2b_init_param( S, P );
 }
 
-static int blake2bp_init_root( blake2b_state *S, uint8_t outlen, uint8_t keylen )
+static int blake2bp_init_root( blake2b_state *S, size_t outlen, size_t keylen )
 {
   blake2b_param P[1];
-  P->digest_length = outlen;
-  P->key_length = keylen;
+  P->digest_length = (uint8_t)outlen;
+  P->key_length = (uint8_t)keylen;
   P->fanout = PARALLELISM_DEGREE;
   P->depth = 2;
   store32( &P->leaf_length, 0 );
@@ -62,7 +62,7 @@
 }
 
 
-int blake2bp_init( blake2bp_state *S, const uint8_t outlen )
+int blake2bp_init( blake2bp_state *S, size_t outlen )
 {
   size_t i;
 
@@ -82,7 +82,7 @@
   return 0;
 }
 
-int blake2bp_init_key( blake2bp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
+int blake2bp_init_key( blake2bp_state *S, size_t outlen, const void *key, size_t keylen )
 {
   size_t i;
 
@@ -115,8 +115,9 @@
 }
 
 
-int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen )
+int blake2bp_update( blake2bp_state *S, const void *pin, size_t inlen )
 {
+  const unsigned char * in = (const unsigned char *)pin;
   size_t left = S->buflen;
   size_t fill = sizeof( S->buf ) - left;
   size_t i;
@@ -143,8 +144,8 @@
 #if defined(_OPENMP)
     size_t      i = omp_get_thread_num();
 #endif
-    uint64_t inlen__ = inlen;
-    const uint8_t *in__ = ( const uint8_t * )in;
+    size_t inlen__ = inlen;
+    const unsigned char *in__ = ( const unsigned char * )in;
     in__ += i * BLAKE2B_BLOCKBYTES;
 
     while( inlen__ >= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES )
@@ -165,7 +166,7 @@
   return 0;
 }
 
-int blake2bp_final( blake2bp_state *S, uint8_t *out, const uint8_t outlen )
+int blake2bp_final( blake2bp_state *S, void *out, size_t outlen )
 {
   uint8_t hash[PARALLELISM_DEGREE][BLAKE2B_OUTBYTES];
   size_t i;
@@ -190,7 +191,7 @@
   return blake2b_final( S->R, out, outlen );
 }
 
-int blake2bp( uint8_t *out, const void *in, const void *key, uint8_t outlen, uint64_t inlen, uint8_t keylen )
+int blake2bp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
 {
   uint8_t hash[PARALLELISM_DEGREE][BLAKE2B_OUTBYTES];
   blake2b_state S[PARALLELISM_DEGREE][1];
@@ -235,8 +236,8 @@
 #if defined(_OPENMP)
     size_t      i = omp_get_thread_num();
 #endif
-    uint64_t inlen__ = inlen;
-    const uint8_t *in__ = ( const uint8_t * )in;
+    size_t inlen__ = inlen;
+    const unsigned char *in__ = ( const unsigned char * )in;
     in__ += i * BLAKE2B_BLOCKBYTES;
 
     while( inlen__ >= PARALLELISM_DEGREE * BLAKE2B_BLOCKBYTES )
@@ -273,19 +274,19 @@
 int main( void )
 {
   uint8_t key[BLAKE2B_KEYBYTES];
-  uint8_t buf[KAT_LENGTH];
+  uint8_t buf[BLAKE2_KAT_LENGTH];
   size_t i;
 
   for( i = 0; i < BLAKE2B_KEYBYTES; ++i )
     key[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
     buf[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
   {
     uint8_t hash[BLAKE2B_OUTBYTES];
-    blake2bp( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
+    blake2bp( hash, BLAKE2B_OUTBYTES, buf, i, key, BLAKE2B_KEYBYTES );
 
     if( 0 != memcmp( hash, blake2bp_keyed_kat[i], BLAKE2B_OUTBYTES ) )
     {
--- a/ref/blake2s-ref.c
+++ b/ref/blake2s-ref.c
@@ -163,7 +163,7 @@
 
 
 /* Sequential blake2s initialization */
-int blake2s_init( blake2s_state *S, const uint8_t outlen )
+int blake2s_init( blake2s_state *S, size_t outlen )
 {
   blake2s_param P[1];
 
@@ -184,7 +184,7 @@
   return blake2s_init_param( S, P );
 }
 
-int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
+int blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen )
 {
   blake2s_param P[1];
 
@@ -192,8 +192,8 @@
 
   if ( !key || !keylen || keylen > BLAKE2S_KEYBYTES ) return -1;
 
-  P->digest_length = outlen;
-  P->key_length    = keylen;
+  P->digest_length = (uint8_t)outlen;
+  P->key_length    = (uint8_t)keylen;
   P->fanout        = 1;
   P->depth         = 1;
   store32( &P->leaf_length, 0 );
@@ -216,17 +216,43 @@
   return 0;
 }
 
-static int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
+#define G(r,i,a,b,c,d)                      \
+  do {                                      \
+    a = a + b + m[blake2s_sigma[r][2*i+0]]; \
+    d = rotr32(d ^ a, 16);                  \
+    c = c + d;                              \
+    b = rotr32(b ^ c, 12);                  \
+    a = a + b + m[blake2s_sigma[r][2*i+1]]; \
+    d = rotr32(d ^ a, 8);                   \
+    c = c + d;                              \
+    b = rotr32(b ^ c, 7);                   \
+  } while(0)
+
+#define ROUND(r)                    \
+  do {                              \
+    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
+    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
+    G(r,2,v[ 2],v[ 6],v[10],v[14]); \
+    G(r,3,v[ 3],v[ 7],v[11],v[15]); \
+    G(r,4,v[ 0],v[ 5],v[10],v[15]); \
+    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
+    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
+    G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
+  } while(0)
+
+static void blake2s_compress( blake2s_state *S, const uint8_t in[BLAKE2S_BLOCKBYTES] )
 {
   uint32_t m[16];
   uint32_t v[16];
   size_t i;
 
-  for( i = 0; i < 16; ++i )
-    m[i] = load32( block + i * sizeof( m[i] ) );
+  for( i = 0; i < 16; ++i ) {
+    m[i] = load32( in + i * sizeof( m[i] ) );
+  }
 
-  for( i = 0; i < 8; ++i )
+  for( i = 0; i < 8; ++i ) {
     v[i] = S->h[i];
+  }
 
   v[ 8] = blake2s_IV[0];
   v[ 9] = blake2s_IV[1];
@@ -236,28 +262,7 @@
   v[13] = S->t[1] ^ blake2s_IV[5];
   v[14] = S->f[0] ^ blake2s_IV[6];
   v[15] = S->f[1] ^ blake2s_IV[7];
-#define G(r,i,a,b,c,d) \
-  do { \
-    a = a + b + m[blake2s_sigma[r][2*i+0]]; \
-    d = rotr32(d ^ a, 16); \
-    c = c + d; \
-    b = rotr32(b ^ c, 12); \
-    a = a + b + m[blake2s_sigma[r][2*i+1]]; \
-    d = rotr32(d ^ a, 8); \
-    c = c + d; \
-    b = rotr32(b ^ c, 7); \
-  } while(0)
-#define ROUND(r)  \
-  do { \
-    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
-    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
-    G(r,2,v[ 2],v[ 6],v[10],v[14]); \
-    G(r,3,v[ 3],v[ 7],v[11],v[15]); \
-    G(r,4,v[ 0],v[ 5],v[10],v[15]); \
-    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
-    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
-    G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
-  } while(0)
+
   ROUND( 0 );
   ROUND( 1 );
   ROUND( 2 );
@@ -269,46 +274,42 @@
   ROUND( 8 );
   ROUND( 9 );
 
-  for( i = 0; i < 8; ++i )
+  for( i = 0; i < 8; ++i ) {
     S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
+  }
+}
 
 #undef G
 #undef ROUND
-  return 0;
-}
 
-
-int blake2s_update( blake2s_state *S, const uint8_t *in, uint64_t inlen )
+int blake2s_update( blake2s_state *S, const void *pin, size_t inlen )
 {
-  while( inlen > 0 )
+  const unsigned char * in = (const unsigned char *)pin;
+  if( inlen > 0 )
   {
     size_t left = S->buflen;
-    size_t fill = 2 * BLAKE2S_BLOCKBYTES - left;
-
+    size_t fill = BLAKE2S_BLOCKBYTES - left;
     if( inlen > fill )
     {
+      S->buflen = 0;
       memcpy( S->buf + left, in, fill ); /* Fill buffer */
-      S->buflen += fill;
       blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
       blake2s_compress( S, S->buf ); /* Compress */
-      memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); /* Shift buffer left */
-      S->buflen -= BLAKE2S_BLOCKBYTES;
-      in += fill;
-      inlen -= fill;
+      in += fill; inlen -= fill;
+      while(inlen > BLAKE2S_BLOCKBYTES) {
+        blake2s_increment_counter(S, BLAKE2S_BLOCKBYTES);
+        blake2s_compress( S, in );
+        in += BLAKE2S_BLOCKBYTES;
+        inlen -= BLAKE2S_BLOCKBYTES;
+      }
     }
-    else /* inlen <= fill */
-    {
-      memcpy( S->buf + left, in, inlen );
-      S->buflen += inlen; /* Be lazy, do not compress */
-      in += inlen;
-      inlen -= inlen;
-    }
+    memcpy( S->buf + S->buflen, in, inlen );
+    S->buflen += (uint32_t)inlen;
   }
-
   return 0;
 }
 
-int blake2s_final( blake2s_state *S, uint8_t *out, uint8_t outlen )
+int blake2s_final( blake2s_state *S, void *out, size_t outlen )
 {
   uint8_t buffer[BLAKE2S_OUTBYTES] = {0};
   int i;
@@ -319,18 +320,9 @@
   if( blake2s_is_lastblock( S ) )
     return -1;
 
-
-  if( S->buflen > BLAKE2S_BLOCKBYTES )
-  {
-    blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
-    blake2s_compress( S, S->buf );
-    S->buflen -= BLAKE2S_BLOCKBYTES;
-    memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, S->buflen );
-  }
-
   blake2s_increment_counter( S, ( uint32_t )S->buflen );
   blake2s_set_lastblock( S );
-  memset( S->buf + S->buflen, 0, 2 * BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */
+  memset( S->buf + S->buflen, 0, BLAKE2S_BLOCKBYTES - S->buflen ); /* Padding */
   blake2s_compress( S, S->buf );
 
   for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
@@ -337,10 +329,11 @@
     store32( buffer + sizeof( S->h[i] ) * i, S->h[i] );
 
   memcpy( out, buffer, outlen );
+  secure_zero_memory(buffer, sizeof(buffer));
   return 0;
 }
 
-int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen )
+int blake2s( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
 {
   blake2s_state S[1];
 
@@ -372,7 +365,7 @@
 #if defined(SUPERCOP)
 int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
 {
-  return blake2s( out, in, NULL, BLAKE2S_OUTBYTES, inlen, 0 );
+  return blake2s( out, BLAKE2S_OUTBYTES, in, inlen, NULL, 0 );
 }
 #endif
 
@@ -382,19 +375,19 @@
 int main( void )
 {
   uint8_t key[BLAKE2S_KEYBYTES];
-  uint8_t buf[KAT_LENGTH];
+  uint8_t buf[BLAKE2_KAT_LENGTH];
   size_t i;
 
   for( i = 0; i < BLAKE2S_KEYBYTES; ++i )
     key[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
     buf[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
   {
     uint8_t hash[BLAKE2S_OUTBYTES];
-    blake2s( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES );
+    blake2s( hash, BLAKE2S_OUTBYTES, buf, i, key, BLAKE2S_KEYBYTES );
 
     if( 0 != memcmp( hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES ) )
     {
--- a/ref/blake2sp-ref.c
+++ b/ref/blake2sp-ref.c
@@ -26,11 +26,11 @@
 
 #define PARALLELISM_DEGREE 8
 
-static int blake2sp_init_leaf( blake2s_state *S, uint8_t outlen, uint8_t keylen, uint64_t offset )
+static int blake2sp_init_leaf( blake2s_state *S, size_t outlen, size_t keylen, uint64_t offset )
 {
   blake2s_param P[1];
-  P->digest_length = outlen;
-  P->key_length = keylen;
+  P->digest_length = (uint8_t)outlen;
+  P->key_length = (uint8_t)keylen;
   P->fanout = PARALLELISM_DEGREE;
   P->depth = 2;
   store32( &P->leaf_length, 0 );
@@ -42,11 +42,11 @@
   return blake2s_init_param( S, P );
 }
 
-static int blake2sp_init_root( blake2s_state *S, uint8_t outlen, uint8_t keylen )
+static int blake2sp_init_root( blake2s_state *S, size_t outlen, size_t keylen )
 {
   blake2s_param P[1];
-  P->digest_length = outlen;
-  P->key_length = keylen;
+  P->digest_length = (uint8_t)outlen;
+  P->key_length = (uint8_t)keylen;
   P->fanout = PARALLELISM_DEGREE;
   P->depth = 2;
   store32( &P->leaf_length, 0 );
@@ -59,7 +59,7 @@
 }
 
 
-int blake2sp_init( blake2sp_state *S, const uint8_t outlen )
+int blake2sp_init( blake2sp_state *S, size_t outlen )
 {
   size_t i;
 
@@ -79,7 +79,7 @@
   return 0;
 }
 
-int blake2sp_init_key( blake2sp_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
+int blake2sp_init_key( blake2sp_state *S, size_t outlen, const void *key, size_t keylen )
 {
   size_t i;
 
@@ -112,8 +112,9 @@
 }
 
 
-int blake2sp_update( blake2sp_state *S, const uint8_t *in, uint64_t inlen )
+int blake2sp_update( blake2sp_state *S, const void *pin, size_t inlen )
 {
+  const unsigned char * in = (const unsigned char *)pin;
   size_t left = S->buflen;
   size_t fill = sizeof( S->buf ) - left;
   size_t i;
@@ -139,8 +140,8 @@
 #if defined(_OPENMP)
     size_t      i = omp_get_thread_num();
 #endif
-    uint64_t inlen__ = inlen;
-    const uint8_t *in__ = ( const uint8_t * )in;
+    size_t inlen__ = inlen;
+    const unsigned char *in__ = ( const unsigned char * )in;
     in__ += i * BLAKE2S_BLOCKBYTES;
 
     while( inlen__ >= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES )
@@ -162,7 +163,7 @@
 }
 
 
-int blake2sp_final( blake2sp_state *S, uint8_t *out, const uint8_t outlen )
+int blake2sp_final( blake2sp_state *S, void *out, size_t outlen )
 {
   uint8_t hash[PARALLELISM_DEGREE][BLAKE2S_OUTBYTES];
   size_t i;
@@ -188,7 +189,7 @@
 }
 
 
-int blake2sp( uint8_t *out, const void *in, const void *key, uint8_t outlen, uint64_t inlen, uint8_t keylen )
+int blake2sp( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
 {
   uint8_t hash[PARALLELISM_DEGREE][BLAKE2S_OUTBYTES];
   blake2s_state S[PARALLELISM_DEGREE][1];
@@ -233,8 +234,8 @@
 #if defined(_OPENMP)
     size_t      i = omp_get_thread_num();
 #endif
-    uint64_t inlen__ = inlen;
-    const uint8_t *in__ = ( const uint8_t * )in;
+    size_t inlen__ = inlen;
+    const unsigned char *in__ = ( const unsigned char * )in;
     in__ += i * BLAKE2S_BLOCKBYTES;
 
     while( inlen__ >= PARALLELISM_DEGREE * BLAKE2S_BLOCKBYTES )
@@ -273,19 +274,19 @@
 int main( void )
 {
   uint8_t key[BLAKE2S_KEYBYTES];
-  uint8_t buf[KAT_LENGTH];
+  uint8_t buf[BLAKE2_KAT_LENGTH];
   size_t i;
 
   for( i = 0; i < BLAKE2S_KEYBYTES; ++i )
     key[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
     buf[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
   {
     uint8_t hash[BLAKE2S_OUTBYTES];
-    blake2sp( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES );
+    blake2sp( hash, BLAKE2S_OUTBYTES, buf, i, key, BLAKE2S_KEYBYTES );
 
     if( 0 != memcmp( hash, blake2sp_keyed_kat[i], BLAKE2S_OUTBYTES ) )
     {
--- a/ref/genkat-c.c
+++ b/ref/genkat-c.c
@@ -28,7 +28,7 @@
 #define MAKE_KAT(name,size_prefix) \
 do  \
 { \
-  printf( "static const uint8_t " #name "_kat[KAT_LENGTH][" #size_prefix "_OUTBYTES] = \n{\n" ); \
+  printf( "static const uint8_t " #name "_kat[BLAKE2_KAT_LENGTH][" #size_prefix "_OUTBYTES] = \n{\n" ); \
    \
   for( size_t i = 0; i < LENGTH; ++i ) \
   { \
@@ -48,7 +48,7 @@
 #define MAKE_KEYED_KAT(name,size_prefix) \
 do  \
 { \
-  printf( "static const uint8_t " #name "_keyed_kat[KAT_LENGTH][" #size_prefix "_OUTBYTES] = \n{\n" ); \
+  printf( "static const uint8_t " #name "_keyed_kat[BLAKE2_KAT_LENGTH][" #size_prefix "_OUTBYTES] = \n{\n" ); \
   \
   for( size_t i = 0; i < LENGTH; ++i ) \
   { \
@@ -78,11 +78,10 @@
   for( size_t i = 0; i < sizeof( key ); ++i )
     key[i] = i;
 
-  puts( "#pragma once\n"
-        "#ifndef __BLAKE2_KAT_H__\n"
-        "#define __BLAKE2_KAT_H__\n\n\n"
+  puts( "#ifndef BLAKE2_KAT_H\n"
+        "#define BLAKE2_KAT_H\n\n\n"
         "#include <stdint.h>\n\n"
-        "#define KAT_LENGTH " STR( LENGTH ) "\n\n\n" );
+        "#define BLAKE2_KAT_LENGTH " STR( LENGTH ) "\n\n\n" );
   MAKE_KAT( blake2s, BLAKE2S );
   MAKE_KEYED_KAT( blake2s, BLAKE2S );
   MAKE_KAT( blake2b, BLAKE2B );
@@ -91,122 +90,6 @@
   MAKE_KEYED_KAT( blake2sp, BLAKE2S );
   MAKE_KAT( blake2bp, BLAKE2B );
   MAKE_KEYED_KAT( blake2bp, BLAKE2B );
-  /*printf( "static const uint8_t blake2s_kat[KAT_LENGTH][BLAKE2S_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2s( hash, in, NULL, BLAKE2S_OUTBYTES, i, 0 );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2S_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2S_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-  printf( "static const uint8_t blake2s_keyed_kat[KAT_LENGTH][BLAKE2S_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2s( hash, in, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2S_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2S_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-  printf( "static const uint8_t blake2b_kat[KAT_LENGTH][BLAKE2B_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2b( hash, in, NULL, BLAKE2B_OUTBYTES, i, 0 );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2B_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2B_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-  printf( "static const uint8_t blake2b_keyed_kat[KAT_LENGTH][BLAKE2B_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2b( hash, in, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2B_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2B_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-
-
-  printf( "static const uint8_t blake2sp_kat[KAT_LENGTH][BLAKE2S_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2sp( hash, in, NULL, BLAKE2S_OUTBYTES, i, 0 );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2S_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2S_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-  printf( "static const uint8_t blake2sp_keyed_kat[KAT_LENGTH][BLAKE2S_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2sp( hash, in, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2S_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2S_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-
-
-  printf( "static const uint8_t blake2bp_kat[KAT_LENGTH][BLAKE2B_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2bp( hash, in, NULL, BLAKE2B_OUTBYTES, i, 0 );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2B_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2B_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-  printf( "static const uint8_t blake2bp_keyed_kat[KAT_LENGTH][BLAKE2B_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2bp( hash, in, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2B_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2B_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );*/
-  puts( "#endif\n\n\n" );
+  puts( "#endif" );
   return 0;
 }
--- a/ref/makefile
+++ b/ref/makefile
@@ -1,7 +1,7 @@
 CC=gcc
 CFLAGS=-O2 -Wall -I../testvectors
 
-all:		blake2s blake2b blake2sp blake2bp
+all:		blake2s blake2b blake2sp blake2bp check
 
 blake2s:	blake2s-ref.c
 		$(CC) blake2s-ref.c -o $@ $(CFLAGS) -DBLAKE2S_SELFTEST
@@ -15,6 +15,12 @@
 blake2bp:	blake2bp-ref.c blake2b-ref.c
 		$(CC) blake2bp-ref.c blake2b-ref.c -o $@ $(CFLAGS) -DBLAKE2BP_SELFTEST
 
+check: blake2s blake2b blake2sp blake2bp
+	./blake2s
+	./blake2b
+	./blake2sp
+	./blake2bp
+
 kat:
 		$(CC) $(CFLAGS) -o genkat-c genkat-c.c blake2b-ref.c blake2s-ref.c blake2sp-ref.c blake2bp-ref.c
 		$(CC) $(CFLAGS) -g -o genkat-json genkat-json.c blake2b-ref.c blake2s-ref.c blake2sp-ref.c blake2bp-ref.c
@@ -21,5 +27,5 @@
 		./genkat-c > blake2-kat.h
 		./genkat-json > blake2-kat.json
 
-clean:		
+clean:
 		rm -rf *.o genkat-c genkat-json blake2s blake2b blake2sp blake2bp
--- a/sse/blake2-config.h
+++ b/sse/blake2-config.h
@@ -12,9 +12,8 @@
    More information about the BLAKE2 hash function can be found at
    https://blake2.net.
 */
-#pragma once
-#ifndef __BLAKE2_CONFIG_H__
-#define __BLAKE2_CONFIG_H__
+#ifndef BLAKE2_CONFIG_H
+#define BLAKE2_CONFIG_H
 
 /* These don't work everywhere */
 #if defined(__SSE2__) || defined(__x86_64__) || defined(__amd64__)
--- a/sse/blake2-impl.h
+++ b/sse/blake2-impl.h
@@ -12,9 +12,8 @@
    More information about the BLAKE2 hash function can be found at
    https://blake2.net.
 */
-#pragma once
-#ifndef __BLAKE2_IMPL_H__
-#define __BLAKE2_IMPL_H__
+#ifndef BLAKE2_IMPL_H
+#define BLAKE2_IMPL_H
 
 #include <stdint.h>
 #include <string.h>
--- a/sse/blake2.h
+++ b/sse/blake2.h
@@ -12,9 +12,8 @@
    More information about the BLAKE2 hash function can be found at
    https://blake2.net.
 */
-#pragma once
-#ifndef __BLAKE2_H__
-#define __BLAKE2_H__
+#ifndef BLAKE2_H
+#define BLAKE2_H
 
 #include <stddef.h>
 #include <stdint.h>
--- a/sse/blake2b-load-sse2.h
+++ b/sse/blake2b-load-sse2.h
@@ -12,9 +12,8 @@
    More information about the BLAKE2 hash function can be found at
    https://blake2.net.
 */
-#pragma once
-#ifndef __BLAKE2B_LOAD_SSE2_H__
-#define __BLAKE2B_LOAD_SSE2_H__
+#ifndef BLAKE2B_LOAD_SSE2_H
+#define BLAKE2B_LOAD_SSE2_H
 
 #define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
 #define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
--- a/sse/blake2b-load-sse41.h
+++ b/sse/blake2b-load-sse41.h
@@ -12,9 +12,8 @@
    More information about the BLAKE2 hash function can be found at
    https://blake2.net.
 */
-#pragma once
-#ifndef __BLAKE2B_LOAD_SSE41_H__
-#define __BLAKE2B_LOAD_SSE41_H__
+#ifndef BLAKE2B_LOAD_SSE41_H
+#define BLAKE2B_LOAD_SSE41_H
 
 #define LOAD_MSG_0_1(b0, b1) \
 do \
--- a/sse/blake2b-round.h
+++ b/sse/blake2b-round.h
@@ -12,9 +12,8 @@
    More information about the BLAKE2 hash function can be found at
    https://blake2.net.
 */
-#pragma once
-#ifndef __BLAKE2B_ROUND_H__
-#define __BLAKE2B_ROUND_H__
+#ifndef BLAKE2B_ROUND_H
+#define BLAKE2B_ROUND_H
 
 #define LOADU(p)  _mm_loadu_si128( (const __m128i *)(p) )
 #define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
--- a/sse/blake2b.c
+++ b/sse/blake2b.c
@@ -429,16 +429,16 @@
 int main( int argc, char **argv )
 {
   uint8_t key[BLAKE2B_KEYBYTES];
-  uint8_t buf[KAT_LENGTH];
+  uint8_t buf[BLAKE2_KAT_LENGTH];
   size_t i;
 
   for( i = 0; i < BLAKE2B_KEYBYTES; ++i )
     key[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
     buf[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
   {
     uint8_t hash[BLAKE2B_OUTBYTES];
     blake2b( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
--- a/sse/blake2bp.c
+++ b/sse/blake2bp.c
@@ -275,16 +275,16 @@
 int main( int argc, char **argv )
 {
   uint8_t key[BLAKE2B_KEYBYTES];
-  uint8_t buf[KAT_LENGTH];
+  uint8_t buf[BLAKE2_KAT_LENGTH];
   size_t i;
 
   for( i = 0; i < BLAKE2B_KEYBYTES; ++i )
     key[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
     buf[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
   {
     uint8_t hash[BLAKE2B_OUTBYTES];
     /*blake2bp( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES ); */
--- a/sse/blake2s-load-sse2.h
+++ b/sse/blake2s-load-sse2.h
@@ -12,9 +12,8 @@
    More information about the BLAKE2 hash function can be found at
    https://blake2.net.
 */
-#pragma once
-#ifndef __BLAKE2S_LOAD_SSE2_H__
-#define __BLAKE2S_LOAD_SSE2_H__
+#ifndef BLAKE2S_LOAD_SSE2_H
+#define BLAKE2S_LOAD_SSE2_H
 
 #define LOAD_MSG_0_1(buf) buf = _mm_set_epi32(m6,m4,m2,m0)
 #define LOAD_MSG_0_2(buf) buf = _mm_set_epi32(m7,m5,m3,m1)
--- a/sse/blake2s-load-sse41.h
+++ b/sse/blake2s-load-sse41.h
@@ -12,9 +12,8 @@
    More information about the BLAKE2 hash function can be found at
    https://blake2.net.
 */
-#pragma once
-#ifndef __BLAKE2S_LOAD_SSE41_H__
-#define __BLAKE2S_LOAD_SSE41_H__
+#ifndef BLAKE2S_LOAD_SSE41_H
+#define BLAKE2S_LOAD_SSE41_H
 
 #define LOAD_MSG_0_1(buf) \
 buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0)));
--- a/sse/blake2s-load-xop.h
+++ b/sse/blake2s-load-xop.h
@@ -12,14 +12,14 @@
    More information about the BLAKE2 hash function can be found at
    https://blake2.net.
 */
-#pragma once
-#ifndef __BLAKE2S_LOAD_XOP_H__
-#define __BLAKE2S_LOAD_XOP_H__
+#ifndef BLAKE2S_LOAD_XOP_H
+#define BLAKE2S_LOAD_XOP_H
 
 #define TOB(x) ((x)*4*0x01010101 + 0x03020100) /* ..or not TOB */
 
+#if 0
 /* Basic VPPERM emulation, for testing purposes */
-/*static __m128i _mm_perm_epi8(const __m128i src1, const __m128i src2, const __m128i sel)
+static __m128i _mm_perm_epi8(const __m128i src1, const __m128i src2, const __m128i sel)
 {
    const __m128i sixteen = _mm_set1_epi8(16);
    const __m128i t0 = _mm_shuffle_epi8(src1, sel);
@@ -27,7 +27,8 @@
    const __m128i mask = _mm_or_si128(_mm_cmpeq_epi8(sel, sixteen),
                                      _mm_cmpgt_epi8(sel, sixteen)); /* (>=16) = 0xff : 00 */
    return _mm_blendv_epi8(t0, s1, mask);
-}*/
+}
+#endif
 
 #define LOAD_MSG_0_1(buf) \
 buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) );
--- a/sse/blake2s-round.h
+++ b/sse/blake2s-round.h
@@ -12,9 +12,8 @@
    More information about the BLAKE2 hash function can be found at
    https://blake2.net.
 */
-#pragma once
-#ifndef __BLAKE2S_ROUND_H__
-#define __BLAKE2S_ROUND_H__
+#ifndef BLAKE2S_ROUND_H
+#define BLAKE2S_ROUND_H
 
 #define LOADU(p)  _mm_loadu_si128( (const __m128i *)(p) )
 #define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
--- a/sse/blake2s.c
+++ b/sse/blake2s.c
@@ -406,16 +406,16 @@
 int main( int argc, char **argv )
 {
   uint8_t key[BLAKE2S_KEYBYTES];
-  uint8_t buf[KAT_LENGTH];
+  uint8_t buf[BLAKE2_KAT_LENGTH];
   size_t i;
 
   for( i = 0; i < BLAKE2S_KEYBYTES; ++i )
     key[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
     buf[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
   {
     uint8_t hash[BLAKE2S_OUTBYTES];
 
--- a/sse/blake2sp.c
+++ b/sse/blake2sp.c
@@ -272,16 +272,16 @@
 int main( int argc, char **argv )
 {
   uint8_t key[BLAKE2S_KEYBYTES];
-  uint8_t buf[KAT_LENGTH];
+  uint8_t buf[BLAKE2_KAT_LENGTH];
   size_t i;
 
   for( i = 0; i < BLAKE2S_KEYBYTES; ++i )
     key[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
     buf[i] = ( uint8_t )i;
 
-  for( i = 0; i < KAT_LENGTH; ++i )
+  for( i = 0; i < BLAKE2_KAT_LENGTH; ++i )
   {
     uint8_t hash[BLAKE2S_OUTBYTES];
     blake2sp( hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES );
--- a/sse/genkat-c.c
+++ b/sse/genkat-c.c
@@ -28,7 +28,7 @@
 #define MAKE_KAT(name,size_prefix) \
 do  \
 { \
-  printf( "static const uint8_t " #name "_kat[KAT_LENGTH][" #size_prefix "_OUTBYTES] = \n{\n" ); \
+  printf( "static const uint8_t " #name "_kat[BLAKE2_KAT_LENGTH][" #size_prefix "_OUTBYTES] = \n{\n" ); \
    \
   for( size_t i = 0; i < LENGTH; ++i ) \
   { \
@@ -48,7 +48,7 @@
 #define MAKE_KEYED_KAT(name,size_prefix) \
 do  \
 { \
-  printf( "static const uint8_t " #name "_keyed_kat[KAT_LENGTH][" #size_prefix "_OUTBYTES] = \n{\n" ); \
+  printf( "static const uint8_t " #name "_keyed_kat[BLAKE2_KAT_LENGTH][" #size_prefix "_OUTBYTES] = \n{\n" ); \
   \
   for( size_t i = 0; i < LENGTH; ++i ) \
   { \
@@ -78,11 +78,10 @@
   for( size_t i = 0; i < sizeof( key ); ++i )
     key[i] = i;
 
-  puts( "#pragma once\n"
-        "#ifndef __BLAKE2_KAT_H__\n"
-        "#define __BLAKE2_KAT_H__\n\n\n"
+  puts( "#ifndef BLAKE2_KAT_H\n"
+        "#define BLAKE2_KAT_H\n\n\n"
         "#include <stdint.h>\n\n"
-        "#define KAT_LENGTH " STR( LENGTH ) "\n\n\n" );
+        "#define BLAKE2_KAT_LENGTH " STR( LENGTH ) "\n\n\n" );
   MAKE_KAT( blake2s, BLAKE2S );
   MAKE_KEYED_KAT( blake2s, BLAKE2S );
   MAKE_KAT( blake2b, BLAKE2B );
@@ -91,122 +90,6 @@
   MAKE_KEYED_KAT( blake2sp, BLAKE2S );
   MAKE_KAT( blake2bp, BLAKE2B );
   MAKE_KEYED_KAT( blake2bp, BLAKE2B );
-  /*printf( "static const uint8_t blake2s_kat[KAT_LENGTH][BLAKE2S_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2s( hash, in, NULL, BLAKE2S_OUTBYTES, i, 0 );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2S_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2S_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-  printf( "static const uint8_t blake2s_keyed_kat[KAT_LENGTH][BLAKE2S_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2s( hash, in, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2S_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2S_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-  printf( "static const uint8_t blake2b_kat[KAT_LENGTH][BLAKE2B_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2b( hash, in, NULL, BLAKE2B_OUTBYTES, i, 0 );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2B_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2B_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-  printf( "static const uint8_t blake2b_keyed_kat[KAT_LENGTH][BLAKE2B_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2b( hash, in, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2B_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2B_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-
-
-  printf( "static const uint8_t blake2sp_kat[KAT_LENGTH][BLAKE2S_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2sp( hash, in, NULL, BLAKE2S_OUTBYTES, i, 0 );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2S_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2S_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-  printf( "static const uint8_t blake2sp_keyed_kat[KAT_LENGTH][BLAKE2S_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2sp( hash, in, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2S_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2S_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-
-
-  printf( "static const uint8_t blake2bp_kat[KAT_LENGTH][BLAKE2B_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2bp( hash, in, NULL, BLAKE2B_OUTBYTES, i, 0 );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2B_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2B_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );
-  printf( "static const uint8_t blake2bp_keyed_kat[KAT_LENGTH][BLAKE2B_OUTBYTES] = \n{\n" );
-
-  for( size_t i = 0; i < LENGTH; ++i )
-  {
-    blake2bp( hash, in, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
-    printf( "\t{\n\t\t" );
-
-    for( int j = 0; j < BLAKE2B_OUTBYTES; ++j )
-      printf( "0x%02X%s", hash[j], ( j + 1 ) == BLAKE2B_OUTBYTES ? "\n" : j && !( ( j + 1 ) % 8 ) ? ",\n\t\t" : ", " );
-
-    printf( "\t},\n" );
-  }
-
-  printf( "};\n\n\n\n\n" );*/
-  puts( "#endif\n\n\n" );
+  puts( "#endif" );
   return 0;
 }
--- a/sse/makefile
+++ b/sse/makefile
@@ -1,7 +1,7 @@
 CC=gcc
 CFLAGS=-Wall -O3 -march=native -I../testvectors
 
-all:		blake2s blake2b blake2sp blake2bp
+all:		blake2s blake2b blake2sp blake2bp check
 
 blake2s:	blake2s.c
 		$(CC) blake2s.c -o $@ $(CFLAGS) -DBLAKE2S_SELFTEST
@@ -14,6 +14,12 @@
 
 blake2bp:	blake2bp.c blake2b.c
 		$(CC) blake2bp.c blake2b.c -o $@ $(CFLAGS) -DBLAKE2BP_SELFTEST
+
+check: blake2s blake2b blake2sp blake2bp
+	./blake2s
+	./blake2b
+	./blake2sp
+	./blake2bp
 
 kat:
 		$(CC) $(CFLAGS) -o genkat-c genkat-c.c blake2b.c blake2s.c blake2sp.c blake2bp.c
--- a/testvectors/blake2-kat.h
+++ b/testvectors/blake2-kat.h
@@ -1,15 +1,14 @@
-#pragma once
-#ifndef __BLAKE2_KAT_H__
-#define __BLAKE2_KAT_H__
+#ifndef BLAKE2_KAT_H
+#define BLAKE2_KAT_H
 
 
 #include <stdint.h>
 
-#define KAT_LENGTH 256
+#define BLAKE2_KAT_LENGTH 256
 
 
 
-static const uint8_t blake2s_kat[KAT_LENGTH][BLAKE2S_OUTBYTES] = 
+static const uint8_t blake2s_kat[BLAKE2_KAT_LENGTH][BLAKE2S_OUTBYTES] = 
 {
 	{
 		0x69, 0x21, 0x7A, 0x30, 0x79, 0x90, 0x80, 0x94,
@@ -1552,7 +1551,7 @@
 
 
 
-static const uint8_t blake2s_keyed_kat[KAT_LENGTH][BLAKE2S_OUTBYTES] = 
+static const uint8_t blake2s_keyed_kat[BLAKE2_KAT_LENGTH][BLAKE2S_OUTBYTES] = 
 {
 	{
 		0x48, 0xA8, 0x99, 0x7D, 0xA4, 0x07, 0x87, 0x6B,
@@ -3095,7 +3094,7 @@
 
 
 
-static const uint8_t blake2b_kat[KAT_LENGTH][BLAKE2B_OUTBYTES] = 
+static const uint8_t blake2b_kat[BLAKE2_KAT_LENGTH][BLAKE2B_OUTBYTES] = 
 {
 	{
 		0x78, 0x6A, 0x02, 0xF7, 0x42, 0x01, 0x59, 0x03,
@@ -5662,7 +5661,7 @@
 
 
 
-static const uint8_t blake2b_keyed_kat[KAT_LENGTH][BLAKE2B_OUTBYTES] = 
+static const uint8_t blake2b_keyed_kat[BLAKE2_KAT_LENGTH][BLAKE2B_OUTBYTES] = 
 {
 	{
 		0x10, 0xEB, 0xB6, 0x77, 0x00, 0xB1, 0x86, 0x8E,
@@ -8229,7 +8228,7 @@
 
 
 
-static const uint8_t blake2sp_kat[KAT_LENGTH][BLAKE2S_OUTBYTES] = 
+static const uint8_t blake2sp_kat[BLAKE2_KAT_LENGTH][BLAKE2S_OUTBYTES] = 
 {
 	{
 		0xDD, 0x0E, 0x89, 0x17, 0x76, 0x93, 0x3F, 0x43,
@@ -9772,7 +9771,7 @@
 
 
 
-static const uint8_t blake2sp_keyed_kat[KAT_LENGTH][BLAKE2S_OUTBYTES] = 
+static const uint8_t blake2sp_keyed_kat[BLAKE2_KAT_LENGTH][BLAKE2S_OUTBYTES] = 
 {
 	{
 		0x71, 0x5C, 0xB1, 0x38, 0x95, 0xAE, 0xB6, 0x78,
@@ -11315,7 +11314,7 @@
 
 
 
-static const uint8_t blake2bp_kat[KAT_LENGTH][BLAKE2B_OUTBYTES] = 
+static const uint8_t blake2bp_kat[BLAKE2_KAT_LENGTH][BLAKE2B_OUTBYTES] = 
 {
 	{
 		0xB5, 0xEF, 0x81, 0x1A, 0x80, 0x38, 0xF7, 0x0B,
@@ -13882,7 +13881,7 @@
 
 
 
-static const uint8_t blake2bp_keyed_kat[KAT_LENGTH][BLAKE2B_OUTBYTES] = 
+static const uint8_t blake2bp_keyed_kat[BLAKE2_KAT_LENGTH][BLAKE2B_OUTBYTES] = 
 {
 	{
 		0x9D, 0x94, 0x61, 0x07, 0x3E, 0x4E, 0xB6, 0x40,
--