shithub: blake2

Download patch

ref: 24d4cc2ea6d2ce5212303ec2548c208a6113fc9b
parent: d51174678721048e1f7a05c2f3d537d42b550c36
parent: 81a1bcf245f8d4ba1d627327b9069559b72d00b1
author: Samuel Neves <sneves@dei.uc.pt>
date: Fri Jun 10 18:27:28 EDT 2016

fix conflicts

--- a/b2sum/makefile
+++ b/b2sum/makefile
@@ -2,7 +2,7 @@
 PREFIX?=/usr/local
 MANDIR?=$(PREFIX)/man
 CC?=gcc
-CFLAGS?=-O3 -march=native -static
+CFLAGS?=-O3 -march=native -static -Werror=declaration-after-statement
 CFLAGS+=-std=c99 -I../sse -fopenmp
 LIBS=
 #FILES=b2sum.c ../ref/blake2b-ref.c ../ref/blake2s-ref.c ../ref/blake2bp-ref.c ../ref/blake2sp-ref.c 
--- a/ref/blake2-impl.h
+++ b/ref/blake2-impl.h
@@ -19,7 +19,7 @@
 #include <stdint.h>
 #include <string.h>
 
-static inline uint32_t load32( const void *src )
+BLAKE2_LOCAL_INLINE(uint32_t) load32( const void *src )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   uint32_t w;
@@ -35,7 +35,7 @@
 #endif
 }
 
-static inline uint64_t load64( const void *src )
+BLAKE2_LOCAL_INLINE(uint64_t) load64( const void *src )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   uint64_t w;
@@ -55,7 +55,7 @@
 #endif
 }
 
-static inline void store32( void *dst, uint32_t w )
+BLAKE2_LOCAL_INLINE(void) store32( void *dst, uint32_t w )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   memcpy(dst, &w, sizeof w);
@@ -68,7 +68,7 @@
 #endif
 }
 
-static inline void store64( void *dst, uint64_t w )
+BLAKE2_LOCAL_INLINE(void) store64( void *dst, uint64_t w )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   memcpy(dst, &w, sizeof w);
@@ -85,7 +85,7 @@
 #endif
 }
 
-static inline uint64_t load48( const void *src )
+BLAKE2_LOCAL_INLINE(uint64_t) load48( const void *src )
 {
   const uint8_t *p = ( const uint8_t * )src;
   uint64_t w = *p++;
@@ -97,7 +97,7 @@
   return w;
 }
 
-static inline void store48( void *dst, uint64_t w )
+BLAKE2_LOCAL_INLINE(void) store48( void *dst, uint64_t w )
 {
   uint8_t *p = ( uint8_t * )dst;
   *p++ = ( uint8_t )w; w >>= 8;
@@ -108,28 +108,28 @@
   *p++ = ( uint8_t )w;
 }
 
-static inline uint32_t rotl32( const uint32_t w, const unsigned c )
+BLAKE2_LOCAL_INLINE(uint32_t) rotl32( const uint32_t w, const unsigned c )
 {
   return ( w << c ) | ( w >> ( 32 - c ) );
 }
 
-static inline uint64_t rotl64( const uint64_t w, const unsigned c )
+BLAKE2_LOCAL_INLINE(uint64_t) rotl64( const uint64_t w, const unsigned c )
 {
   return ( w << c ) | ( w >> ( 64 - c ) );
 }
 
-static inline uint32_t rotr32( const uint32_t w, const unsigned c )
+BLAKE2_LOCAL_INLINE(uint32_t) rotr32( const uint32_t w, const unsigned c )
 {
   return ( w >> c ) | ( w << ( 32 - c ) );
 }
 
-static inline uint64_t rotr64( const uint64_t w, const unsigned c )
+BLAKE2_LOCAL_INLINE(uint64_t) rotr64( const uint64_t w, const unsigned c )
 {
   return ( w >> c ) | ( w << ( 64 - c ) );
 }
 
 /* prevents compiler optimizing out memset() */
-static inline void secure_zero_memory(void *v, size_t n)
+BLAKE2_LOCAL_INLINE(void) secure_zero_memory(void *v, size_t n)
 {
   static void *(*const volatile memset_v)(void *, int, size_t) = &memset;
   memset_v(v, 0, n);
--- a/ref/blake2.h
+++ b/ref/blake2.h
@@ -19,6 +19,14 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#ifdef BLAKE2_NO_INLINE
+#define BLAKE2_LOCAL_INLINE(type) static type
+#endif
+
+#ifndef BLAKE2_LOCAL_INLINE
+#define BLAKE2_LOCAL_INLINE(type) static inline type
+#endif
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
@@ -81,36 +89,36 @@
 #pragma pack(push, 1)
   typedef struct __blake2s_param
   {
-    uint8_t  digest_length; // 1
-    uint8_t  key_length;    // 2
-    uint8_t  fanout;        // 3
-    uint8_t  depth;         // 4
-    uint32_t leaf_length;   // 8
+    uint8_t  digest_length; /* 1 */
+    uint8_t  key_length;    /* 2 */
+    uint8_t  fanout;        /* 3 */
+    uint8_t  depth;         /* 4 */
+    uint32_t leaf_length;   /* 8 */
     uint8_t  node_offset[6];// 14
-    uint8_t  node_depth;    // 15
-    uint8_t  inner_length;  // 16
-    // uint8_t  reserved[0];
-    uint8_t  salt[BLAKE2S_SALTBYTES]; // 24
-    uint8_t  personal[BLAKE2S_PERSONALBYTES];  // 32
+    uint8_t  node_depth;    /* 15 */
+    uint8_t  inner_length;  /* 16 */
+    /* uint8_t  reserved[0]; */
+    uint8_t  salt[BLAKE2S_SALTBYTES]; /* 24 */
+    uint8_t  personal[BLAKE2S_PERSONALBYTES];  /* 32 */
   } blake2s_param;
 
   typedef struct __blake2b_param
   {
-    uint8_t  digest_length; // 1
-    uint8_t  key_length;    // 2
-    uint8_t  fanout;        // 3
-    uint8_t  depth;         // 4
-    uint32_t leaf_length;   // 8
-    uint64_t node_offset;   // 16
-    uint8_t  node_depth;    // 17
-    uint8_t  inner_length;  // 18
-    uint8_t  reserved[14];  // 32
-    uint8_t  salt[BLAKE2B_SALTBYTES]; // 48
-    uint8_t  personal[BLAKE2B_PERSONALBYTES];  // 64
+    uint8_t  digest_length; /* 1 */
+    uint8_t  key_length;    /* 2 */
+    uint8_t  fanout;        /* 3 */
+    uint8_t  depth;         /* 4 */
+    uint32_t leaf_length;   /* 8 */
+    uint64_t node_offset;   /* 16 */
+    uint8_t  node_depth;    /* 17 */
+    uint8_t  inner_length;  /* 18 */
+    uint8_t  reserved[14];  /* 32 */
+    uint8_t  salt[BLAKE2B_SALTBYTES]; /* 48 */
+    uint8_t  personal[BLAKE2B_PERSONALBYTES];  /* 64 */
   } blake2b_param;
 #pragma pack(pop)
 
-  // Streaming API
+  /* Streaming API */
   int blake2s_init( blake2s_state *S, const uint8_t outlen );
   int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
   int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
@@ -133,7 +141,7 @@
   int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
   int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen );
 
-  // Simple API
+  /* Simple API */
   int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
   int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
 
--- a/ref/blake2b-ref.c
+++ b/ref/blake2b-ref.c
@@ -45,13 +45,13 @@
 };
 
 
-static inline int blake2b_set_lastnode( blake2b_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2b_set_lastnode( blake2b_state *S )
 {
   S->f[1] = -1;
   return 0;
 }
 
-static inline int blake2b_clear_lastnode( blake2b_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2b_clear_lastnode( blake2b_state *S )
 {
   S->f[1] = 0;
   return 0;
@@ -58,12 +58,12 @@
 }
 
 /* Some helper functions, not necessarily useful */
-static inline int blake2b_is_lastblock( const blake2b_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2b_is_lastblock( const blake2b_state *S )
 {
   return S->f[0] != 0;
 }
 
-static inline int blake2b_set_lastblock( blake2b_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2b_set_lastblock( blake2b_state *S )
 {
   if( S->last_node ) blake2b_set_lastnode( S );
 
@@ -71,7 +71,7 @@
   return 0;
 }
 
-static inline int blake2b_clear_lastblock( blake2b_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2b_clear_lastblock( blake2b_state *S )
 {
   if( S->last_node ) blake2b_clear_lastnode( S );
 
@@ -79,7 +79,7 @@
   return 0;
 }
 
-static inline int blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
+BLAKE2_LOCAL_INLINE(int) blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
 {
   S->t[0] += inc;
   S->t[1] += ( S->t[0] < inc );
@@ -88,62 +88,62 @@
 
 
 
-// Parameter-related functions
-static inline int blake2b_param_set_digest_length( blake2b_param *P, const uint8_t digest_length )
+/* Parameter-related functions */
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_digest_length( blake2b_param *P, const uint8_t digest_length )
 {
   P->digest_length = digest_length;
   return 0;
 }
 
-static inline int blake2b_param_set_fanout( blake2b_param *P, const uint8_t fanout )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_fanout( blake2b_param *P, const uint8_t fanout )
 {
   P->fanout = fanout;
   return 0;
 }
 
-static inline int blake2b_param_set_max_depth( blake2b_param *P, const uint8_t depth )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_max_depth( blake2b_param *P, const uint8_t depth )
 {
   P->depth = depth;
   return 0;
 }
 
-static inline int blake2b_param_set_leaf_length( blake2b_param *P, const uint32_t leaf_length )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_leaf_length( blake2b_param *P, const uint32_t leaf_length )
 {
   store32( &P->leaf_length, leaf_length );
   return 0;
 }
 
-static inline int blake2b_param_set_node_offset( blake2b_param *P, const uint64_t node_offset )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_node_offset( blake2b_param *P, const uint64_t node_offset )
 {
   store64( &P->node_offset, node_offset );
   return 0;
 }
 
-static inline int blake2b_param_set_node_depth( blake2b_param *P, const uint8_t node_depth )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_node_depth( blake2b_param *P, const uint8_t node_depth )
 {
   P->node_depth = node_depth;
   return 0;
 }
 
-static inline int blake2b_param_set_inner_length( blake2b_param *P, const uint8_t inner_length )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_inner_length( blake2b_param *P, const uint8_t inner_length )
 {
   P->inner_length = inner_length;
   return 0;
 }
 
-static inline int blake2b_param_set_salt( blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES] )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_salt( blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES] )
 {
   memcpy( P->salt, salt, BLAKE2B_SALTBYTES );
   return 0;
 }
 
-static inline int blake2b_param_set_personal( blake2b_param *P, const uint8_t personal[BLAKE2B_PERSONALBYTES] )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_personal( blake2b_param *P, const uint8_t personal[BLAKE2B_PERSONALBYTES] )
 {
   memcpy( P->personal, personal, BLAKE2B_PERSONALBYTES );
   return 0;
 }
 
-static inline int blake2b_init0( blake2b_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2b_init0( blake2b_state *S )
 {
   memset( S, 0, sizeof( blake2b_state ) );
 
@@ -155,9 +155,10 @@
 /* init xors IV with input parameter block */
 int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
 {
-  blake2b_init0( S );
   const uint8_t *p = ( const uint8_t * )( P );
 
+  blake2b_init0( S );
+
   /* IV XOR ParamBlock */
   for( size_t i = 0; i < 8; ++i )
     S->h[i] ^= load64( p + sizeof( S->h[i] ) * i );
@@ -293,19 +294,19 @@
 
     if( inlen > fill )
     {
-      memcpy( S->buf + left, in, fill ); // Fill buffer
+      memcpy( S->buf + left, in, fill ); /* Fill buffer */
       S->buflen += fill;
       blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
-      blake2b_compress( S, S->buf ); // Compress
-      memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left
+      blake2b_compress( S, S->buf ); /* Compress */
+      memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); /* Shift buffer left */
       S->buflen -= BLAKE2B_BLOCKBYTES;
       in += fill;
       inlen -= fill;
     }
-    else // inlen <= fill
+    else /* inlen <= fill */
     {
       memcpy( S->buf + left, in, inlen );
-      S->buflen += inlen; // Be lazy, do not compress
+      S->buflen += inlen; /* Be lazy, do not compress */
       in += inlen;
       inlen -= inlen;
     }
--- a/ref/blake2bp-ref.c
+++ b/ref/blake2bp-ref.c
@@ -27,7 +27,7 @@
 
 #define PARALLELISM_DEGREE 4
 
-static inline int blake2bp_init_leaf( blake2b_state *S, uint8_t outlen, uint8_t keylen, uint64_t offset )
+BLAKE2_LOCAL_INLINE(int) blake2bp_init_leaf( blake2b_state *S, uint8_t outlen, uint8_t keylen, uint64_t offset )
 {
   blake2b_param P[1];
   P->digest_length = outlen;
@@ -44,7 +44,7 @@
   return blake2b_init_param( S, P );
 }
 
-static inline int blake2bp_init_root( blake2b_state *S, uint8_t outlen, uint8_t keylen )
+BLAKE2_LOCAL_INLINE(int) blake2bp_init_root( blake2b_state *S, uint8_t outlen, uint8_t keylen )
 {
   blake2b_param P[1];
   P->digest_length = outlen;
@@ -205,7 +205,7 @@
   for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
     if( blake2bp_init_leaf( S[i], outlen, keylen, i ) < 0 ) return -1;
 
-  S[PARALLELISM_DEGREE - 1]->last_node = 1; // mark last node
+  S[PARALLELISM_DEGREE - 1]->last_node = 1; /* mark last node */
 
   if( keylen > 0 )
   {
@@ -253,7 +253,7 @@
   if( blake2bp_init_root( FS, outlen, keylen ) < 0 )
     return -1;
 
-  FS->last_node = 1; // Mark as last node
+  FS->last_node = 1; /* Mark as last node */
 
   for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
     blake2b_update( FS, hash[i], BLAKE2B_OUTBYTES );
--- a/ref/blake2s-ref.c
+++ b/ref/blake2s-ref.c
@@ -40,13 +40,13 @@
   { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
 };
 
-static inline int blake2s_set_lastnode( blake2s_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2s_set_lastnode( blake2s_state *S )
 {
   S->f[1] = -1;
   return 0;
 }
 
-static inline int blake2s_clear_lastnode( blake2s_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2s_clear_lastnode( blake2s_state *S )
 {
   S->f[1] = 0;
   return 0;
@@ -53,12 +53,12 @@
 }
 
 /* Some helper functions, not necessarily useful */
-static inline int blake2s_is_lastblock( const blake2s_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2s_is_lastblock( const blake2s_state *S )
 {
   return S->f[0] != 0;
 }
 
-static inline int blake2s_set_lastblock( blake2s_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2s_set_lastblock( blake2s_state *S )
 {
   if( S->last_node ) blake2s_set_lastnode( S );
 
@@ -66,7 +66,7 @@
   return 0;
 }
 
-static inline int blake2s_clear_lastblock( blake2s_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2s_clear_lastblock( blake2s_state *S )
 {
   if( S->last_node ) blake2s_clear_lastnode( S );
 
@@ -74,7 +74,7 @@
   return 0;
 }
 
-static inline int blake2s_increment_counter( blake2s_state *S, const uint32_t inc )
+BLAKE2_LOCAL_INLINE(int) blake2s_increment_counter( blake2s_state *S, const uint32_t inc )
 {
   S->t[0] += inc;
   S->t[1] += ( S->t[0] < inc );
@@ -81,62 +81,62 @@
   return 0;
 }
 
-// Parameter-related functions
-static inline int blake2s_param_set_digest_length( blake2s_param *P, const uint8_t digest_length )
+/* Parameter-related functions */
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_digest_length( blake2s_param *P, const uint8_t digest_length )
 {
   P->digest_length = digest_length;
   return 0;
 }
 
-static inline int blake2s_param_set_fanout( blake2s_param *P, const uint8_t fanout )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_fanout( blake2s_param *P, const uint8_t fanout )
 {
   P->fanout = fanout;
   return 0;
 }
 
-static inline int blake2s_param_set_max_depth( blake2s_param *P, const uint8_t depth )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_max_depth( blake2s_param *P, const uint8_t depth )
 {
   P->depth = depth;
   return 0;
 }
 
-static inline int blake2s_param_set_leaf_length( blake2s_param *P, const uint32_t leaf_length )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_leaf_length( blake2s_param *P, const uint32_t leaf_length )
 {
   store32( &P->leaf_length, leaf_length );
   return 0;
 }
 
-static inline int blake2s_param_set_node_offset( blake2s_param *P, const uint64_t node_offset )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_node_offset( blake2s_param *P, const uint64_t node_offset )
 {
   store48( P->node_offset, node_offset );
   return 0;
 }
 
-static inline int blake2s_param_set_node_depth( blake2s_param *P, const uint8_t node_depth )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_node_depth( blake2s_param *P, const uint8_t node_depth )
 {
   P->node_depth = node_depth;
   return 0;
 }
 
-static inline int blake2s_param_set_inner_length( blake2s_param *P, const uint8_t inner_length )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_inner_length( blake2s_param *P, const uint8_t inner_length )
 {
   P->inner_length = inner_length;
   return 0;
 }
 
-static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] )
 {
   memcpy( P->salt, salt, BLAKE2S_SALTBYTES );
   return 0;
 }
 
-static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] )
 {
   memcpy( P->personal, personal, BLAKE2S_PERSONALBYTES );
   return 0;
 }
 
-static inline int blake2s_init0( blake2s_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2s_init0( blake2s_state *S )
 {
   memset( S, 0, sizeof( blake2s_state ) );
 
@@ -148,9 +148,10 @@
 /* init2 xors IV with input parameter block */
 int blake2s_init_param( blake2s_state *S, const blake2s_param *P )
 {
-  blake2s_init0( S );
   const uint32_t *p = ( const uint32_t * )( P );
 
+  blake2s_init0( S );
+
   /* IV XOR ParamBlock */
   for( size_t i = 0; i < 8; ++i )
     S->h[i] ^= load32( &p[i] );
@@ -159,7 +160,7 @@
 }
 
 
-// Sequential blake2s initialization
+/* Sequential blake2s initialization */
 int blake2s_init( blake2s_state *S, const uint8_t outlen )
 {
   blake2s_param P[1];
@@ -175,7 +176,7 @@
   store48( &P->node_offset, 0 );
   P->node_depth    = 0;
   P->inner_length  = 0;
-  // memset(P->reserved, 0, sizeof(P->reserved) );
+  /* memset(P->reserved, 0, sizeof(P->reserved) ); */
   memset( P->salt,     0, sizeof( P->salt ) );
   memset( P->personal, 0, sizeof( P->personal ) );
   return blake2s_init_param( S, P );
@@ -197,7 +198,7 @@
   store48( &P->node_offset, 0 );
   P->node_depth    = 0;
   P->inner_length  = 0;
-  // memset(P->reserved, 0, sizeof(P->reserved) );
+  /* memset(P->reserved, 0, sizeof(P->reserved) ); */
   memset( P->salt,     0, sizeof( P->salt ) );
   memset( P->personal, 0, sizeof( P->personal ) );
 
@@ -283,19 +284,19 @@
 
     if( inlen > fill )
     {
-      memcpy( S->buf + left, in, fill ); // Fill buffer
+      memcpy( S->buf + left, in, fill ); /* Fill buffer */
       S->buflen += fill;
       blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
-      blake2s_compress( S, S->buf ); // Compress
-      memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); // Shift buffer left
+      blake2s_compress( S, S->buf ); /* Compress */
+      memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); /* Shift buffer left */
       S->buflen -= BLAKE2S_BLOCKBYTES;
       in += fill;
       inlen -= fill;
     }
-    else // inlen <= fill
+    else /* inlen <= fill */
     {
       memcpy( S->buf + left, in, inlen );
-      S->buflen += inlen; // Be lazy, do not compress
+      S->buflen += inlen; /* Be lazy, do not compress */
       in += inlen;
       inlen -= inlen;
     }
--- a/ref/blake2sp-ref.c
+++ b/ref/blake2sp-ref.c
@@ -26,7 +26,7 @@
 
 #define PARALLELISM_DEGREE 8
 
-static inline int blake2sp_init_leaf( blake2s_state *S, uint8_t outlen, uint8_t keylen, uint64_t offset )
+BLAKE2_LOCAL_INLINE(int) blake2sp_init_leaf( blake2s_state *S, uint8_t outlen, uint8_t keylen, uint64_t offset )
 {
   blake2s_param P[1];
   P->digest_length = outlen;
@@ -42,7 +42,7 @@
   return blake2s_init_param( S, P );
 }
 
-static inline int blake2sp_init_root( blake2s_state *S, uint8_t outlen, uint8_t keylen )
+BLAKE2_LOCAL_INLINE(int) blake2sp_init_root( blake2s_state *S, uint8_t outlen, uint8_t keylen )
 {
   blake2s_param P[1];
   P->digest_length = outlen;
@@ -203,7 +203,7 @@
   for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
     if( blake2sp_init_leaf( S[i], outlen, keylen, i ) < 0 ) return -1;
 
-  S[PARALLELISM_DEGREE - 1]->last_node = 1; // mark last node
+  S[PARALLELISM_DEGREE - 1]->last_node = 1; /* mark last node */
 
   if( keylen > 0 )
   {
--- a/ref/makefile
+++ b/ref/makefile
@@ -1,5 +1,5 @@
 CC=gcc
-CFLAGS=-std=c99 -Wall -pedantic -I../testvectors
+CFLAGS=-O2 -Wall -I../testvectors
 
 all:		blake2s blake2b blake2sp blake2bp
 
--- a/sse/blake2-config.h
+++ b/sse/blake2-config.h
@@ -16,7 +16,7 @@
 #ifndef __BLAKE2_CONFIG_H__
 #define __BLAKE2_CONFIG_H__
 
-// These don't work everywhere
+/* These don't work everywhere */
 #if defined(__SSE2__) || defined(__x86_64__) || defined(__amd64__)
 #define HAVE_SSE2
 #endif
--- a/sse/blake2-impl.h
+++ b/sse/blake2-impl.h
@@ -19,7 +19,7 @@
 #include <stdint.h>
 #include <string.h>
 
-static inline uint32_t load32( const void *src )
+BLAKE2_LOCAL_INLINE(uint32_t) load32( const void *src )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   uint32_t w;
@@ -35,7 +35,7 @@
 #endif
 }
 
-static inline uint64_t load64( const void *src )
+BLAKE2_LOCAL_INLINE(uint64_t) load64( const void *src )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   uint64_t w;
@@ -55,7 +55,7 @@
 #endif
 }
 
-static inline void store32( void *dst, uint32_t w )
+BLAKE2_LOCAL_INLINE(void) store32( void *dst, uint32_t w )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   memcpy(dst, &w, sizeof w);
@@ -68,7 +68,7 @@
 #endif
 }
 
-static inline void store64( void *dst, uint64_t w )
+BLAKE2_LOCAL_INLINE(void) store64( void *dst, uint64_t w )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   memcpy(dst, &w, sizeof w);
@@ -85,7 +85,7 @@
 #endif
 }
 
-static inline uint64_t load48( const void *src )
+BLAKE2_LOCAL_INLINE(uint64_t) load48( const void *src )
 {
   const uint8_t *p = ( const uint8_t * )src;
   uint64_t w = *p++;
@@ -97,7 +97,7 @@
   return w;
 }
 
-static inline void store48( void *dst, uint64_t w )
+BLAKE2_LOCAL_INLINE(void) store48( void *dst, uint64_t w )
 {
   uint8_t *p = ( uint8_t * )dst;
   *p++ = ( uint8_t )w; w >>= 8;
@@ -108,28 +108,28 @@
   *p++ = ( uint8_t )w;
 }
 
-static inline uint32_t rotl32( const uint32_t w, const unsigned c )
+BLAKE2_LOCAL_INLINE(uint32_t) rotl32( const uint32_t w, const unsigned c )
 {
   return ( w << c ) | ( w >> ( 32 - c ) );
 }
 
-static inline uint64_t rotl64( const uint64_t w, const unsigned c )
+BLAKE2_LOCAL_INLINE(uint64_t) rotl64( const uint64_t w, const unsigned c )
 {
   return ( w << c ) | ( w >> ( 64 - c ) );
 }
 
-static inline uint32_t rotr32( const uint32_t w, const unsigned c )
+BLAKE2_LOCAL_INLINE(uint32_t) rotr32( const uint32_t w, const unsigned c )
 {
   return ( w >> c ) | ( w << ( 32 - c ) );
 }
 
-static inline uint64_t rotr64( const uint64_t w, const unsigned c )
+BLAKE2_LOCAL_INLINE(uint64_t) rotr64( const uint64_t w, const unsigned c )
 {
   return ( w >> c ) | ( w << ( 64 - c ) );
 }
 
 /* prevents compiler optimizing out memset() */
-static inline void secure_zero_memory(void *v, size_t n)
+BLAKE2_LOCAL_INLINE(void) secure_zero_memory(void *v, size_t n)
 {
   static void *(*const volatile memset_v)(void *, int, size_t) = &memset;
   memset_v(v, 0, n);
--- a/sse/blake2.h
+++ b/sse/blake2.h
@@ -1,14 +1,16 @@
 /*
-   BLAKE2 reference source code package - optimized C implementations
-
-   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
-
-   To the extent possible under law, the author(s) have dedicated all copyright
-   and related and neighboring rights to this software to the public domain
-   worldwide. This software is distributed without any warranty.
-
-   You should have received a copy of the CC0 Public Domain Dedication along with
-   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+   BLAKE2 reference source code package - optimized C implementations
+  
+   Copyright 2012, Samuel Neves <sneves@dei.uc.pt>.  You may use this under the
+   terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
+   your option.  The terms of these licenses can be found at:
+  
+   - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
+   - OpenSSL license   : https://www.openssl.org/source/license.html
+   - Apache 2.0        : http://www.apache.org/licenses/LICENSE-2.0
+  
+   More information about the BLAKE2 hash function can be found at
+   https://blake2.net.
 */
 #pragma once
 #ifndef __BLAKE2_H__
@@ -17,6 +19,14 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#ifdef BLAKE2_NO_INLINE
+#define BLAKE2_LOCAL_INLINE(type) static type
+#endif
+
+#ifndef BLAKE2_LOCAL_INLINE
+#define BLAKE2_LOCAL_INLINE(type) static inline type
+#endif
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
@@ -75,39 +85,40 @@
     size_t  buflen;
   } blake2bp_state;
 
+
 #pragma pack(push, 1)
   typedef struct __blake2s_param
   {
-    uint8_t  digest_length; // 1
-    uint8_t  key_length;    // 2
-    uint8_t  fanout;        // 3
-    uint8_t  depth;         // 4
-    uint32_t leaf_length;   // 8
+    uint8_t  digest_length; /* 1 */
+    uint8_t  key_length;    /* 2 */
+    uint8_t  fanout;        /* 3 */
+    uint8_t  depth;         /* 4 */
+    uint32_t leaf_length;   /* 8 */
     uint8_t  node_offset[6];// 14
-    uint8_t  node_depth;    // 15
-    uint8_t  inner_length;  // 16
-    // uint8_t  reserved[0];
-    uint8_t  salt[BLAKE2S_SALTBYTES]; // 24
-    uint8_t  personal[BLAKE2S_PERSONALBYTES];  // 32
+    uint8_t  node_depth;    /* 15 */
+    uint8_t  inner_length;  /* 16 */
+    /* uint8_t  reserved[0]; */
+    uint8_t  salt[BLAKE2S_SALTBYTES]; /* 24 */
+    uint8_t  personal[BLAKE2S_PERSONALBYTES];  /* 32 */
   } blake2s_param;
 
   typedef struct __blake2b_param
   {
-    uint8_t  digest_length; // 1
-    uint8_t  key_length;    // 2
-    uint8_t  fanout;        // 3
-    uint8_t  depth;         // 4
-    uint32_t leaf_length;   // 8
-    uint64_t node_offset;   // 16
-    uint8_t  node_depth;    // 17
-    uint8_t  inner_length;  // 18
-    uint8_t  reserved[14];  // 32
-    uint8_t  salt[BLAKE2B_SALTBYTES]; // 48
-    uint8_t  personal[BLAKE2B_PERSONALBYTES];  // 64
+    uint8_t  digest_length; /* 1 */
+    uint8_t  key_length;    /* 2 */
+    uint8_t  fanout;        /* 3 */
+    uint8_t  depth;         /* 4 */
+    uint32_t leaf_length;   /* 8 */
+    uint64_t node_offset;   /* 16 */
+    uint8_t  node_depth;    /* 17 */
+    uint8_t  inner_length;  /* 18 */
+    uint8_t  reserved[14];  /* 32 */
+    uint8_t  salt[BLAKE2B_SALTBYTES]; /* 48 */
+    uint8_t  personal[BLAKE2B_PERSONALBYTES];  /* 64 */
   } blake2b_param;
 #pragma pack(pop)
 
-  // Streaming API
+  /* Streaming API */
   int blake2s_init( blake2s_state *S, const uint8_t outlen );
   int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen );
   int blake2s_init_param( blake2s_state *S, const blake2s_param *P );
@@ -130,7 +141,7 @@
   int blake2bp_update( blake2bp_state *S, const uint8_t *in, uint64_t inlen );
   int blake2bp_final( blake2bp_state *S, uint8_t *out, uint8_t outlen );
 
-  // Simple API
+  /* Simple API */
   int blake2s( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
   int blake2b( uint8_t *out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen );
 
--- a/sse/blake2b.c
+++ b/sse/blake2b.c
@@ -67,24 +67,24 @@
 
 
 /* Some helper functions, not necessarily useful */
-static inline int blake2b_set_lastnode( blake2b_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2b_set_lastnode( blake2b_state *S )
 {
   S->f[1] = -1;
   return 0;
 }
 
-static inline int blake2b_clear_lastnode( blake2b_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2b_clear_lastnode( blake2b_state *S )
 {
   S->f[1] = 0;
   return 0;
 }
 
-static inline int blake2b_is_lastblock( const blake2b_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2b_is_lastblock( const blake2b_state *S )
 {
   return S->f[0] != 0;
 }
 
-static inline int blake2b_set_lastblock( blake2b_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2b_set_lastblock( blake2b_state *S )
 {
   if( S->last_node ) blake2b_set_lastnode( S );
 
@@ -92,7 +92,7 @@
   return 0;
 }
 
-static inline int blake2b_clear_lastblock( blake2b_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2b_clear_lastblock( blake2b_state *S )
 {
   if( S->last_node ) blake2b_clear_lastnode( S );
 
@@ -101,10 +101,10 @@
 }
 
 
-static inline int blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
+BLAKE2_LOCAL_INLINE(int) blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
 {
 #if __x86_64__
-  // ADD/ADC chain
+  /* ADD/ADC chain */
   __uint128_t t = ( ( __uint128_t )S->t[1] << 64 ) | S->t[0];
   t += inc;
   S->t[0] = ( uint64_t )( t >>  0 );
@@ -117,62 +117,62 @@
 }
 
 
-// Parameter-related functions
-static inline int blake2b_param_set_digest_length( blake2b_param *P, const uint8_t digest_length )
+/* Parameter-related functions */
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_digest_length( blake2b_param *P, const uint8_t digest_length )
 {
   P->digest_length = digest_length;
   return 0;
 }
 
-static inline int blake2b_param_set_fanout( blake2b_param *P, const uint8_t fanout )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_fanout( blake2b_param *P, const uint8_t fanout )
 {
   P->fanout = fanout;
   return 0;
 }
 
-static inline int blake2b_param_set_max_depth( blake2b_param *P, const uint8_t depth )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_max_depth( blake2b_param *P, const uint8_t depth )
 {
   P->depth = depth;
   return 0;
 }
 
-static inline int blake2b_param_set_leaf_length( blake2b_param *P, const uint32_t leaf_length )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_leaf_length( blake2b_param *P, const uint32_t leaf_length )
 {
   P->leaf_length = leaf_length;
   return 0;
 }
 
-static inline int blake2b_param_set_node_offset( blake2b_param *P, const uint64_t node_offset )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_node_offset( blake2b_param *P, const uint64_t node_offset )
 {
   P->node_offset = node_offset;
   return 0;
 }
 
-static inline int blake2b_param_set_node_depth( blake2b_param *P, const uint8_t node_depth )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_node_depth( blake2b_param *P, const uint8_t node_depth )
 {
   P->node_depth = node_depth;
   return 0;
 }
 
-static inline int blake2b_param_set_inner_length( blake2b_param *P, const uint8_t inner_length )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_inner_length( blake2b_param *P, const uint8_t inner_length )
 {
   P->inner_length = inner_length;
   return 0;
 }
 
-static inline int blake2b_param_set_salt( blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES] )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_salt( blake2b_param *P, const uint8_t salt[BLAKE2B_SALTBYTES] )
 {
   memcpy( P->salt, salt, BLAKE2B_SALTBYTES );
   return 0;
 }
 
-static inline int blake2b_param_set_personal( blake2b_param *P, const uint8_t personal[BLAKE2B_PERSONALBYTES] )
+BLAKE2_LOCAL_INLINE(int) blake2b_param_set_personal( blake2b_param *P, const uint8_t personal[BLAKE2B_PERSONALBYTES] )
 {
   memcpy( P->personal, personal, BLAKE2B_PERSONALBYTES );
   return 0;
 }
 
-static inline int blake2b_init0( blake2b_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2b_init0( blake2b_state *S )
 {
   memset( S, 0, sizeof( blake2b_state ) );
 
@@ -184,7 +184,7 @@
 /* init xors IV with input parameter block */
 int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
 {
-  //blake2b_init0( S );
+  /*blake2b_init0( S ); */
   const uint8_t * v = ( const uint8_t * )( blake2b_IV );
   const uint8_t * p = ( const uint8_t * )( P );
   uint8_t * h = ( uint8_t * )( S->h );
@@ -200,8 +200,6 @@
 /* Some sort of default parameter block initialization, for sequential blake2b */
 int blake2b_init( blake2b_state *S, const uint8_t outlen )
 {
-  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
-
   const blake2b_param P =
   {
     outlen,
@@ -216,15 +214,14 @@
     {0},
     {0}
   };
+
+  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
+
   return blake2b_init_param( S, &P );
 }
 
 int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
 {
-  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
-
-  if ( ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1;
-
   const blake2b_param P =
   {
     outlen,
@@ -240,6 +237,10 @@
     {0}
   };
 
+  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
+
+  if ( ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1;
+
   if( blake2b_init_param( S, &P ) < 0 )
     return 0;
 
@@ -253,7 +254,7 @@
   return 0;
 }
 
-static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
+BLAKE2_LOCAL_INLINE(int) blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
 {
   __m128i row1l, row1h;
   __m128i row2l, row2h;
@@ -333,19 +334,19 @@
 
     if( inlen > fill )
     {
-      memcpy( S->buf + left, in, fill ); // Fill buffer
+      memcpy( S->buf + left, in, fill ); /* Fill buffer */
       S->buflen += fill;
       blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
-      blake2b_compress( S, S->buf ); // Compress
-      memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); // Shift buffer left
+      blake2b_compress( S, S->buf ); /* Compress */
+      memcpy( S->buf, S->buf + BLAKE2B_BLOCKBYTES, BLAKE2B_BLOCKBYTES ); /* Shift buffer left */
       S->buflen -= BLAKE2B_BLOCKBYTES;
       in += fill;
       inlen -= fill;
     }
-    else // inlen <= fill
+    else /* inlen <= fill */
     {
       memcpy( S->buf + left, in, inlen );
-      S->buflen += inlen; // Be lazy, do not compress
+      S->buflen += inlen; /* Be lazy, do not compress */
       in += inlen;
       inlen -= inlen;
     }
--- a/sse/blake2bp.c
+++ b/sse/blake2bp.c
@@ -27,7 +27,7 @@
 
 #define PARALLELISM_DEGREE 4
 
-static inline int blake2bp_init_leaf( blake2b_state *S, uint8_t outlen, uint8_t keylen, uint64_t offset )
+BLAKE2_LOCAL_INLINE(int) blake2bp_init_leaf( blake2b_state *S, uint8_t outlen, uint8_t keylen, uint64_t offset )
 {
   blake2b_param P[1];
   P->digest_length = outlen;
@@ -44,7 +44,7 @@
   return blake2b_init_param( S, P );
 }
 
-static inline int blake2bp_init_root( blake2b_state *S, uint8_t outlen, uint8_t keylen )
+BLAKE2_LOCAL_INLINE(int) blake2bp_init_root( blake2b_state *S, uint8_t outlen, uint8_t keylen )
 {
   blake2b_param P[1];
   P->digest_length = outlen;
@@ -206,7 +206,7 @@
   for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
     if( blake2bp_init_leaf( S[i], outlen, keylen, i ) < 0 ) return -1;
 
-  S[PARALLELISM_DEGREE - 1]->last_node = 1; // mark last node
+  S[PARALLELISM_DEGREE - 1]->last_node = 1; /* mark last node */
 
   if( keylen > 0 )
   {
@@ -254,7 +254,7 @@
   if( blake2bp_init_root( FS, outlen, keylen ) < 0 )
     return -1;
 
-  FS->last_node = 1; // Mark as last node
+  FS->last_node = 1; /* Mark as last node */
 
   for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
     blake2b_update( FS, hash[i], BLAKE2B_OUTBYTES );
@@ -280,7 +280,7 @@
   for( size_t i = 0; i < KAT_LENGTH; ++i )
   {
     uint8_t hash[BLAKE2B_OUTBYTES];
-    //blake2bp( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES );
+    /*blake2bp( hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES ); */
     blake2bp_state S[1];
     blake2bp_init_key( S, BLAKE2B_OUTBYTES, key, BLAKE2B_KEYBYTES );
     blake2bp_update( S, buf, i );
--- a/sse/blake2s-load-xop.h
+++ b/sse/blake2s-load-xop.h
@@ -16,7 +16,7 @@
 #ifndef __BLAKE2S_LOAD_XOP_H__
 #define __BLAKE2S_LOAD_XOP_H__
 
-#define TOB(x) ((x)*4*0x01010101 + 0x03020100) // ..or not TOB
+#define TOB(x) ((x)*4*0x01010101 + 0x03020100) /* ..or not TOB */
 
 /* Basic VPPERM emulation, for testing purposes */
 /*static __m128i _mm_perm_epi8(const __m128i src1, const __m128i src2, const __m128i sel)
@@ -25,7 +25,7 @@
    const __m128i t0 = _mm_shuffle_epi8(src1, sel);
    const __m128i s1 = _mm_shuffle_epi8(src2, _mm_sub_epi8(sel, sixteen));
    const __m128i mask = _mm_or_si128(_mm_cmpeq_epi8(sel, sixteen),
-                                     _mm_cmpgt_epi8(sel, sixteen)); // (>=16) = 0xff : 00
+                                     _mm_cmpgt_epi8(sel, sixteen)); /* (>=16) = 0xff : 00 */
    return _mm_blendv_epi8(t0, s1, mask);
 }*/
 
--- a/sse/blake2s.c
+++ b/sse/blake2s.c
@@ -61,24 +61,24 @@
 
 
 /* Some helper functions, not necessarily useful */
-static inline int blake2s_set_lastnode( blake2s_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2s_set_lastnode( blake2s_state *S )
 {
   S->f[1] = -1;
   return 0;
 }
 
-static inline int blake2s_clear_lastnode( blake2s_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2s_clear_lastnode( blake2s_state *S )
 {
   S->f[1] = 0;
   return 0;
 }
 
-static inline int blake2s_is_lastblock( const blake2s_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2s_is_lastblock( const blake2s_state *S )
 {
   return S->f[0] != 0;
 }
 
-static inline int blake2s_set_lastblock( blake2s_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2s_set_lastblock( blake2s_state *S )
 {
   if( S->last_node ) blake2s_set_lastnode( S );
 
@@ -86,7 +86,7 @@
   return 0;
 }
 
-static inline int blake2s_clear_lastblock( blake2s_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2s_clear_lastblock( blake2s_state *S )
 {
   if( S->last_node ) blake2s_clear_lastnode( S );
 
@@ -94,7 +94,7 @@
   return 0;
 }
 
-static inline int blake2s_increment_counter( blake2s_state *S, const uint32_t inc )
+BLAKE2_LOCAL_INLINE(int) blake2s_increment_counter( blake2s_state *S, const uint32_t inc )
 {
   uint64_t t = ( ( uint64_t )S->t[1] << 32 ) | S->t[0];
   t += inc;
@@ -104,62 +104,62 @@
 }
 
 
-// Parameter-related functions
-static inline int blake2s_param_set_digest_length( blake2s_param *P, const uint8_t digest_length )
+/* Parameter-related functions */
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_digest_length( blake2s_param *P, const uint8_t digest_length )
 {
   P->digest_length = digest_length;
   return 0;
 }
 
-static inline int blake2s_param_set_fanout( blake2s_param *P, const uint8_t fanout )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_fanout( blake2s_param *P, const uint8_t fanout )
 {
   P->fanout = fanout;
   return 0;
 }
 
-static inline int blake2s_param_set_max_depth( blake2s_param *P, const uint8_t depth )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_max_depth( blake2s_param *P, const uint8_t depth )
 {
   P->depth = depth;
   return 0;
 }
 
-static inline int blake2s_param_set_leaf_length( blake2s_param *P, const uint32_t leaf_length )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_leaf_length( blake2s_param *P, const uint32_t leaf_length )
 {
   P->leaf_length = leaf_length;
   return 0;
 }
 
-static inline int blake2s_param_set_node_offset( blake2s_param *P, const uint64_t node_offset )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_node_offset( blake2s_param *P, const uint64_t node_offset )
 {
   store48( P->node_offset, node_offset );
   return 0;
 }
 
-static inline int blake2s_param_set_node_depth( blake2s_param *P, const uint8_t node_depth )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_node_depth( blake2s_param *P, const uint8_t node_depth )
 {
   P->node_depth = node_depth;
   return 0;
 }
 
-static inline int blake2s_param_set_inner_length( blake2s_param *P, const uint8_t inner_length )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_inner_length( blake2s_param *P, const uint8_t inner_length )
 {
   P->inner_length = inner_length;
   return 0;
 }
 
-static inline int blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_salt( blake2s_param *P, const uint8_t salt[BLAKE2S_SALTBYTES] )
 {
   memcpy( P->salt, salt, BLAKE2S_SALTBYTES );
   return 0;
 }
 
-static inline int blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] )
+BLAKE2_LOCAL_INLINE(int) blake2s_param_set_personal( blake2s_param *P, const uint8_t personal[BLAKE2S_PERSONALBYTES] )
 {
   memcpy( P->personal, personal, BLAKE2S_PERSONALBYTES );
   return 0;
 }
 
-static inline int blake2s_init0( blake2s_state *S )
+BLAKE2_LOCAL_INLINE(int) blake2s_init0( blake2s_state *S )
 {
   memset( S, 0, sizeof( blake2s_state ) );
 
@@ -171,7 +171,7 @@
 /* init2 xors IV with input parameter block */
 int blake2s_init_param( blake2s_state *S, const blake2s_param *P )
 {
-  //blake2s_init0( S );
+  /*blake2s_init0( S ); */
   const uint8_t * v = ( const uint8_t * )( blake2s_IV );
   const uint8_t * p = ( const uint8_t * )( P );
   uint8_t * h = ( uint8_t * )( S->h );
@@ -187,9 +187,6 @@
 /* Some sort of default parameter block initialization, for sequential blake2s */
 int blake2s_init( blake2s_state *S, const uint8_t outlen )
 {
-  /* Move interval verification here? */
-  if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
-
   const blake2s_param P =
   {
     outlen,
@@ -203,6 +200,8 @@
     {0},
     {0}
   };
+  /* Move interval verification here? */
+  if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
   return blake2s_init_param( S, &P );
 }
 
@@ -209,11 +208,6 @@
 
 int blake2s_init_key( blake2s_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
 {
-  /* Move interval verification here? */
-  if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
-
-  if ( ( !key ) || ( !keylen ) || keylen > BLAKE2S_KEYBYTES ) return -1;
-
   const blake2s_param P =
   {
     outlen,
@@ -228,6 +222,11 @@
     {0}
   };
 
+  /* Move interval verification here? */
+  if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
+
+  if ( ( !key ) || ( !keylen ) || keylen > BLAKE2S_KEYBYTES ) return -1;
+
   if( blake2s_init_param( S, &P ) < 0 )
     return -1;
 
@@ -242,7 +241,7 @@
 }
 
 
-static inline int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
+BLAKE2_LOCAL_INLINE(int) blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
 {
   __m128i row1, row2, row3, row4;
   __m128i buf1, buf2, buf3, buf4;
@@ -309,19 +308,19 @@
 
     if( inlen > fill )
     {
-      memcpy( S->buf + left, in, fill ); // Fill buffer
+      memcpy( S->buf + left, in, fill ); /* Fill buffer */
       S->buflen += fill;
       blake2s_increment_counter( S, BLAKE2S_BLOCKBYTES );
-      blake2s_compress( S, S->buf ); // Compress
-      memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); // Shift buffer left
+      blake2s_compress( S, S->buf ); /* Compress */
+      memcpy( S->buf, S->buf + BLAKE2S_BLOCKBYTES, BLAKE2S_BLOCKBYTES ); /* Shift buffer left */
       S->buflen -= BLAKE2S_BLOCKBYTES;
       in += fill;
       inlen -= fill;
     }
-    else // inlen <= fill
+    else /* inlen <= fill */
     {
       memcpy( S->buf + left, in, inlen );
-      S->buflen += inlen; // Be lazy, do not compress
+      S->buflen += inlen; /* Be lazy, do not compress */
       in += inlen;
       inlen -= inlen;
     }
--- a/sse/blake2sp.c
+++ b/sse/blake2sp.c
@@ -26,7 +26,7 @@
 
 #define PARALLELISM_DEGREE 8
 
-static inline int blake2sp_init_leaf( blake2s_state *S, uint8_t outlen, uint8_t keylen, uint64_t offset )
+BLAKE2_LOCAL_INLINE(int) blake2sp_init_leaf( blake2s_state *S, uint8_t outlen, uint8_t keylen, uint64_t offset )
 {
   blake2s_param P[1];
   P->digest_length = outlen;
@@ -42,7 +42,7 @@
   return blake2s_init_param( S, P );
 }
 
-static inline int blake2sp_init_root( blake2s_state *S, uint8_t outlen, uint8_t keylen )
+BLAKE2_LOCAL_INLINE(int) blake2sp_init_root( blake2s_state *S, uint8_t outlen, uint8_t keylen )
 {
   blake2s_param P[1];
   P->digest_length = outlen;
@@ -203,7 +203,7 @@
   for( size_t i = 0; i < PARALLELISM_DEGREE; ++i )
     if( blake2sp_init_leaf( S[i], outlen, keylen, i ) < 0 ) return -1;
 
-  S[PARALLELISM_DEGREE - 1]->last_node = 1; // mark last node
+  S[PARALLELISM_DEGREE - 1]->last_node = 1; /* mark last node */
 
   if( keylen > 0 )
   {
--- a/sse/makefile
+++ b/sse/makefile
@@ -1,5 +1,5 @@
 CC=gcc
-CFLAGS=-std=c99 -Wall -pedantic -O3 -march=native -I../testvectors
+CFLAGS=-Wall -O3 -march=native -I../testvectors
 
 all:		blake2s blake2b blake2sp blake2bp
 
--