shithub: blake2

Download patch

ref: 88df903283b721e5dfcb488331fd90c45f940d25
parent: ab60beb7a1c9bdce7315f7324338793610df934a
author: Samuel Neves <sneves@dei.uc.pt>
date: Sat Jun 11 13:47:44 EDT 2016

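remove unused code, vars

Also mark the blake2-impl.h helpers BLAKE2_INLINE, build the sse parameter
blocks field by field instead of with aggregate initializers, read message
words through load32/load64 rather than pointer casts, load the BLAKE2s IV
from blake2s_IV, and drop -march=native from sse/makefile.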

--- a/ref/blake2-impl.h
+++ b/ref/blake2-impl.h
@@ -18,7 +18,19 @@
 #include <stdint.h>
 #include <string.h>
 
-static uint32_t load32( const void *src )
+#if !defined(__cplusplus) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L)
+  #if   defined(_MSC_VER)
+    #define BLAKE2_INLINE __inline
+  #elif defined(__GNUC__)
+    #define BLAKE2_INLINE __inline__
+  #else
+    #define BLAKE2_INLINE
+  #endif
+#else
+  #define BLAKE2_INLINE inline
+#endif
+
+static BLAKE2_INLINE uint32_t load32( const void *src )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   uint32_t w;
@@ -33,7 +45,7 @@
 #endif
 }
 
-static uint64_t load64( const void *src )
+static BLAKE2_INLINE uint64_t load64( const void *src )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   uint64_t w;
@@ -52,7 +64,7 @@
 #endif
 }
 
-static void store32( void *dst, uint32_t w )
+static BLAKE2_INLINE void store32( void *dst, uint32_t w )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   memcpy(dst, &w, sizeof w);
@@ -65,7 +77,7 @@
 #endif
 }
 
-static void store64( void *dst, uint64_t w )
+static BLAKE2_INLINE void store64( void *dst, uint64_t w )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   memcpy(dst, &w, sizeof w);
@@ -82,7 +94,7 @@
 #endif
 }
 
-static uint64_t load48( const void *src )
+static BLAKE2_INLINE uint64_t load48( const void *src )
 {
   const uint8_t *p = ( const uint8_t * )src;
   return (( uint64_t )( p[0] ) <<  0) |
@@ -93,7 +105,7 @@
          (( uint64_t )( p[5] ) << 40) ;
 }
 
-static void store48( void *dst, uint64_t w )
+static BLAKE2_INLINE void store48( void *dst, uint64_t w )
 {
   uint8_t *p = ( uint8_t * )dst;
   p[0] = (uint8_t)(w >>  0);
@@ -104,28 +116,18 @@
   p[5] = (uint8_t)(w >> 40);
 }
 
-static uint32_t rotl32( const uint32_t w, const unsigned c )
+static BLAKE2_INLINE uint32_t rotr32( const uint32_t w, const unsigned c )
 {
-  return ( w << c ) | ( w >> ( 32 - c ) );
-}
-
-static uint64_t rotl64( const uint64_t w, const unsigned c )
-{
-  return ( w << c ) | ( w >> ( 64 - c ) );
-}
-
-static uint32_t rotr32( const uint32_t w, const unsigned c )
-{
   return ( w >> c ) | ( w << ( 32 - c ) );
 }
 
-static uint64_t rotr64( const uint64_t w, const unsigned c )
+static BLAKE2_INLINE uint64_t rotr64( const uint64_t w, const unsigned c )
 {
   return ( w >> c ) | ( w << ( 64 - c ) );
 }
 
 /* prevents compiler optimizing out memset() */
-static void secure_zero_memory(void *v, size_t n)
+static BLAKE2_INLINE void secure_zero_memory(void *v, size_t n)
 {
   static void *(*const volatile memset_v)(void *, int, size_t) = &memset;
   memset_v(v, 0, n);
--- a/ref/blake2b-ref.c
+++ b/ref/blake2b-ref.c
@@ -50,11 +50,6 @@
   S->f[1] = (uint64_t)-1;
 }
 
-static void blake2b_clear_lastnode( blake2b_state *S )
-{
-  S->f[1] = 0;
-}
-
 /* Some helper functions, not necessarily useful */
 static int blake2b_is_lastblock( const blake2b_state *S )
 {
@@ -66,13 +61,6 @@
   if( S->last_node ) blake2b_set_lastnode( S );
 
   S->f[0] = (uint64_t)-1;
-}
-
-static void blake2b_clear_lastblock( blake2b_state *S )
-{
-  if( S->last_node ) blake2b_clear_lastnode( S );
-
-  S->f[0] = 0;
 }
 
 static void blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
--- a/ref/blake2s-ref.c
+++ b/ref/blake2s-ref.c
@@ -45,11 +45,6 @@
   S->f[1] = (uint32_t)-1;
 }
 
-static void blake2s_clear_lastnode( blake2s_state *S )
-{
-  S->f[1] = 0;
-}
-
 /* Some helper functions, not necessarily useful */
 static int blake2s_is_lastblock( const blake2s_state *S )
 {
@@ -61,13 +56,6 @@
   if( S->last_node ) blake2s_set_lastnode( S );
 
   S->f[0] = (uint32_t)-1;
-}
-
-static void blake2s_clear_lastblock( blake2s_state *S )
-{
-  if( S->last_node ) blake2s_clear_lastnode( S );
-
-  S->f[0] = 0;
 }
 
 static void blake2s_increment_counter( blake2s_state *S, const uint32_t inc )
--- a/sse/blake2-impl.h
+++ b/sse/blake2-impl.h
@@ -18,7 +18,19 @@
 #include <stdint.h>
 #include <string.h>
 
-static uint32_t load32( const void *src )
+#if !defined(__cplusplus) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L)
+  #if   defined(_MSC_VER)
+    #define BLAKE2_INLINE __inline
+  #elif defined(__GNUC__)
+    #define BLAKE2_INLINE __inline__
+  #else
+    #define BLAKE2_INLINE
+  #endif
+#else
+  #define BLAKE2_INLINE inline
+#endif
+
+static BLAKE2_INLINE uint32_t load32( const void *src )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   uint32_t w;
@@ -33,7 +45,7 @@
 #endif
 }
 
-static uint64_t load64( const void *src )
+static BLAKE2_INLINE uint64_t load64( const void *src )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   uint64_t w;
@@ -52,7 +64,7 @@
 #endif
 }
 
-static void store32( void *dst, uint32_t w )
+static BLAKE2_INLINE void store32( void *dst, uint32_t w )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   memcpy(dst, &w, sizeof w);
@@ -65,7 +77,7 @@
 #endif
 }
 
-static void store64( void *dst, uint64_t w )
+static BLAKE2_INLINE void store64( void *dst, uint64_t w )
 {
 #if defined(NATIVE_LITTLE_ENDIAN)
   memcpy(dst, &w, sizeof w);
@@ -82,7 +94,7 @@
 #endif
 }
 
-static uint64_t load48( const void *src )
+static BLAKE2_INLINE uint64_t load48( const void *src )
 {
   const uint8_t *p = ( const uint8_t * )src;
   return (( uint64_t )( p[0] ) <<  0) |
@@ -93,7 +105,7 @@
          (( uint64_t )( p[5] ) << 40) ;
 }
 
-static void store48( void *dst, uint64_t w )
+static BLAKE2_INLINE void store48( void *dst, uint64_t w )
 {
   uint8_t *p = ( uint8_t * )dst;
   p[0] = (uint8_t)(w >>  0);
@@ -104,28 +116,18 @@
   p[5] = (uint8_t)(w >> 40);
 }
 
-static uint32_t rotl32( const uint32_t w, const unsigned c )
+static BLAKE2_INLINE uint32_t rotr32( const uint32_t w, const unsigned c )
 {
-  return ( w << c ) | ( w >> ( 32 - c ) );
-}
-
-static uint64_t rotl64( const uint64_t w, const unsigned c )
-{
-  return ( w << c ) | ( w >> ( 64 - c ) );
-}
-
-static uint32_t rotr32( const uint32_t w, const unsigned c )
-{
   return ( w >> c ) | ( w << ( 32 - c ) );
 }
 
-static uint64_t rotr64( const uint64_t w, const unsigned c )
+static BLAKE2_INLINE uint64_t rotr64( const uint64_t w, const unsigned c )
 {
   return ( w >> c ) | ( w << ( 64 - c ) );
 }
 
 /* prevents compiler optimizing out memset() */
-static void secure_zero_memory(void *v, size_t n)
+static BLAKE2_INLINE void secure_zero_memory(void *v, size_t n)
 {
   static void *(*const volatile memset_v)(void *, int, size_t) = &memset;
   memset_v(v, 0, n);
--- a/sse/blake2b.c
+++ b/sse/blake2b.c
@@ -49,23 +49,6 @@
   0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };
 
-static const uint8_t blake2b_sigma[12][16] =
-{
-  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
-  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
-  { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
-  {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
-  {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
-  {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
-  { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
-  { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
-  {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
-  { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
-  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
-  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
-};
-
-
 /* Some helper functions */
 static void blake2b_set_lastnode( blake2b_state *S )
 {
@@ -72,11 +55,6 @@
   S->f[1] = (uint64_t)-1;
 }
 
-static void blake2b_clear_lastnode( blake2b_state *S )
-{
-  S->f[1] = 0;
-}
-
 static int blake2b_is_lastblock( const blake2b_state *S )
 {
   return S->f[0] != 0;
@@ -89,14 +67,6 @@
   S->f[0] = (uint64_t)-1;
 }
 
-static void blake2b_clear_lastblock( blake2b_state *S )
-{
-  if( S->last_node ) blake2b_clear_lastnode( S );
-
-  S->f[0] = 0;
-}
-
-
 static void blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
 {
   S->t[0] += inc;
@@ -103,14 +73,6 @@
   S->t[1] += ( S->t[0] < inc );
 }
 
-static void blake2b_init0( blake2b_state *S )
-{
-  size_t i;
-  memset( S, 0, sizeof( blake2b_state ) );
-
-  for( i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i];
-}
-
 /* init xors IV with input parameter block */
 int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
 {
@@ -132,48 +94,46 @@
 /* Some sort of default parameter block initialization, for sequential blake2b */
 int blake2b_init( blake2b_state *S, size_t outlen )
 {
-  const blake2b_param P =
-  {
-    (uint8_t)outlen,
-    0,
-    1,
-    1,
-    0,
-    0,
-    0,
-    0,
-    {0},
-    {0},
-    {0}
-  };
+  blake2b_param P[1];
 
   if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
 
-  return blake2b_init_param( S, &P );
+  P->digest_length = (uint8_t)outlen;
+  P->key_length    = 0;
+  P->fanout        = 1;
+  P->depth         = 1;
+  store32( &P->leaf_length, 0 );
+  store64( &P->node_offset, 0 );
+  P->node_depth    = 0;
+  P->inner_length  = 0;
+  memset( P->reserved, 0, sizeof( P->reserved ) );
+  memset( P->salt,     0, sizeof( P->salt ) );
+  memset( P->personal, 0, sizeof( P->personal ) );
+
+  return blake2b_init_param( S, P );
 }
 
 int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen )
 {
-  const blake2b_param P =
-  {
-    (uint8_t)outlen,
-    (uint8_t)keylen,
-    1,
-    1,
-    0,
-    0,
-    0,
-    0,
-    {0},
-    {0},
-    {0}
-  };
+  blake2b_param P[1];
 
   if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
 
   if ( ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1;
 
-  if( blake2b_init_param( S, &P ) < 0 )
+  P->digest_length = (uint8_t)outlen;
+  P->key_length    = (uint8_t)keylen;
+  P->fanout        = 1;
+  P->depth         = 1;
+  store32( &P->leaf_length, 0 );
+  store64( &P->node_offset, 0 );
+  P->node_depth    = 0;
+  P->inner_length  = 0;
+  memset( P->reserved, 0, sizeof( P->reserved ) );
+  memset( P->salt,     0, sizeof( P->salt ) );
+  memset( P->personal, 0, sizeof( P->personal ) );
+
+  if( blake2b_init_param( S, P ) < 0 )
     return 0;
 
   {
@@ -208,22 +168,22 @@
   const __m128i m6 = LOADU( block + 96 );
   const __m128i m7 = LOADU( block + 112 );
 #else
-  const uint64_t  m0 = ( ( uint64_t * )block )[ 0];
-  const uint64_t  m1 = ( ( uint64_t * )block )[ 1];
-  const uint64_t  m2 = ( ( uint64_t * )block )[ 2];
-  const uint64_t  m3 = ( ( uint64_t * )block )[ 3];
-  const uint64_t  m4 = ( ( uint64_t * )block )[ 4];
-  const uint64_t  m5 = ( ( uint64_t * )block )[ 5];
-  const uint64_t  m6 = ( ( uint64_t * )block )[ 6];
-  const uint64_t  m7 = ( ( uint64_t * )block )[ 7];
-  const uint64_t  m8 = ( ( uint64_t * )block )[ 8];
-  const uint64_t  m9 = ( ( uint64_t * )block )[ 9];
-  const uint64_t m10 = ( ( uint64_t * )block )[10];
-  const uint64_t m11 = ( ( uint64_t * )block )[11];
-  const uint64_t m12 = ( ( uint64_t * )block )[12];
-  const uint64_t m13 = ( ( uint64_t * )block )[13];
-  const uint64_t m14 = ( ( uint64_t * )block )[14];
-  const uint64_t m15 = ( ( uint64_t * )block )[15];
+  const uint64_t  m0 = load64(block +  0 * sizeof(uint64_t));
+  const uint64_t  m1 = load64(block +  1 * sizeof(uint64_t));
+  const uint64_t  m2 = load64(block +  2 * sizeof(uint64_t));
+  const uint64_t  m3 = load64(block +  3 * sizeof(uint64_t));
+  const uint64_t  m4 = load64(block +  4 * sizeof(uint64_t));
+  const uint64_t  m5 = load64(block +  5 * sizeof(uint64_t));
+  const uint64_t  m6 = load64(block +  6 * sizeof(uint64_t));
+  const uint64_t  m7 = load64(block +  7 * sizeof(uint64_t));
+  const uint64_t  m8 = load64(block +  8 * sizeof(uint64_t));
+  const uint64_t  m9 = load64(block +  9 * sizeof(uint64_t));
+  const uint64_t m10 = load64(block + 10 * sizeof(uint64_t));
+  const uint64_t m11 = load64(block + 11 * sizeof(uint64_t));
+  const uint64_t m12 = load64(block + 12 * sizeof(uint64_t));
+  const uint64_t m13 = load64(block + 13 * sizeof(uint64_t));
+  const uint64_t m14 = load64(block + 14 * sizeof(uint64_t));
+  const uint64_t m15 = load64(block + 15 * sizeof(uint64_t));
 #endif
   row1l = LOADU( &S->h[0] );
   row1h = LOADU( &S->h[2] );
--- a/sse/blake2s.c
+++ b/sse/blake2s.c
@@ -45,21 +45,6 @@
   0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
 };
 
-static const uint8_t blake2s_sigma[10][16] =
-{
-  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
-  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
-  { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
-  {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
-  {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
-  {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
-  { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
-  { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
-  {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
-  { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
-};
-
-
 /* Some helper functions */
 static void blake2s_set_lastnode( blake2s_state *S )
 {
@@ -66,11 +51,6 @@
   S->f[1] = (uint32_t)-1;
 }
 
-static void blake2s_clear_lastnode( blake2s_state *S )
-{
-  S->f[1] = 0;
-}
-
 static int blake2s_is_lastblock( const blake2s_state *S )
 {
   return S->f[0] != 0;
@@ -83,13 +63,6 @@
   S->f[0] = (uint32_t)-1;
 }
 
-static void blake2s_clear_lastblock( blake2s_state *S )
-{
-  if( S->last_node ) blake2s_clear_lastnode( S );
-
-  S->f[0] = 0;
-}
-
 static void blake2s_increment_counter( blake2s_state *S, const uint32_t inc )
 {
   uint64_t t = ( ( uint64_t )S->t[1] << 32 ) | S->t[0];
@@ -98,14 +71,6 @@
   S->t[1] = ( uint32_t )( t >> 32 );
 }
 
-static void blake2s_init0( blake2s_state *S )
-{
-  size_t i;
-  memset( S, 0, sizeof( blake2s_state ) );
-
-  for( i = 0; i < 8; ++i ) S->h[i] = blake2s_IV[i];
-}
-
 /* init2 xors IV with input parameter block */
 int blake2s_init_param( blake2s_state *S, const blake2s_param *P )
 {
@@ -127,40 +92,30 @@
 /* Some sort of default parameter block initialization, for sequential blake2s */
 int blake2s_init( blake2s_state *S, size_t outlen )
 {
-  const blake2s_param P =
-  {
-    (uint8_t)outlen,
-    0,
-    1,
-    1,
-    0,
-    {0},
-    0,
-    0,
-    {0},
-    {0}
-  };
+  blake2s_param P[1];
+
   /* Move interval verification here? */
   if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
-  return blake2s_init_param( S, &P );
+
+  P->digest_length = (uint8_t)outlen;
+  P->key_length    = 0;
+  P->fanout        = 1;
+  P->depth         = 1;
+  store32( &P->leaf_length, 0 );
+  store48( &P->node_offset, 0 );
+  P->node_depth    = 0;
+  P->inner_length  = 0;
+  /* memset(P->reserved, 0, sizeof(P->reserved) ); */
+  memset( P->salt,     0, sizeof( P->salt ) );
+  memset( P->personal, 0, sizeof( P->personal ) );
+
+  return blake2s_init_param( S, P );
 }
 
 
 int blake2s_init_key( blake2s_state *S, size_t outlen, const void *key, size_t keylen )
 {
-  const blake2s_param P =
-  {
-    (uint8_t)outlen,
-    (uint8_t)keylen,
-    1,
-    1,
-    0,
-    {0},
-    0,
-    0,
-    {0},
-    {0}
-  };
+  blake2s_param P[1];
 
   /* Move interval verification here? */
   if ( ( !outlen ) || ( outlen > BLAKE2S_OUTBYTES ) ) return -1;
@@ -167,7 +122,19 @@
 
   if ( ( !key ) || ( !keylen ) || keylen > BLAKE2S_KEYBYTES ) return -1;
 
-  if( blake2s_init_param( S, &P ) < 0 )
+  P->digest_length = (uint8_t)outlen;
+  P->key_length    = (uint8_t)keylen;
+  P->fanout        = 1;
+  P->depth         = 1;
+  store32( &P->leaf_length, 0 );
+  store48( &P->node_offset, 0 );
+  P->node_depth    = 0;
+  P->inner_length  = 0;
+  /* memset(P->reserved, 0, sizeof(P->reserved) ); */
+  memset( P->salt,     0, sizeof( P->salt ) );
+  memset( P->personal, 0, sizeof( P->personal ) );
+
+  if( blake2s_init_param( S, P ) < 0 )
     return -1;
 
   {
@@ -202,27 +169,27 @@
   const __m128i m2 = LOADU( block +  32 );
   const __m128i m3 = LOADU( block +  48 );
 #else
-  const uint32_t  m0 = ( ( uint32_t * )block )[ 0];
-  const uint32_t  m1 = ( ( uint32_t * )block )[ 1];
-  const uint32_t  m2 = ( ( uint32_t * )block )[ 2];
-  const uint32_t  m3 = ( ( uint32_t * )block )[ 3];
-  const uint32_t  m4 = ( ( uint32_t * )block )[ 4];
-  const uint32_t  m5 = ( ( uint32_t * )block )[ 5];
-  const uint32_t  m6 = ( ( uint32_t * )block )[ 6];
-  const uint32_t  m7 = ( ( uint32_t * )block )[ 7];
-  const uint32_t  m8 = ( ( uint32_t * )block )[ 8];
-  const uint32_t  m9 = ( ( uint32_t * )block )[ 9];
-  const uint32_t m10 = ( ( uint32_t * )block )[10];
-  const uint32_t m11 = ( ( uint32_t * )block )[11];
-  const uint32_t m12 = ( ( uint32_t * )block )[12];
-  const uint32_t m13 = ( ( uint32_t * )block )[13];
-  const uint32_t m14 = ( ( uint32_t * )block )[14];
-  const uint32_t m15 = ( ( uint32_t * )block )[15];
+  const uint32_t  m0 = load32(block +  0 * sizeof(uint32_t));
+  const uint32_t  m1 = load32(block +  1 * sizeof(uint32_t));
+  const uint32_t  m2 = load32(block +  2 * sizeof(uint32_t));
+  const uint32_t  m3 = load32(block +  3 * sizeof(uint32_t));
+  const uint32_t  m4 = load32(block +  4 * sizeof(uint32_t));
+  const uint32_t  m5 = load32(block +  5 * sizeof(uint32_t));
+  const uint32_t  m6 = load32(block +  6 * sizeof(uint32_t));
+  const uint32_t  m7 = load32(block +  7 * sizeof(uint32_t));
+  const uint32_t  m8 = load32(block +  8 * sizeof(uint32_t));
+  const uint32_t  m9 = load32(block +  9 * sizeof(uint32_t));
+  const uint32_t m10 = load32(block + 10 * sizeof(uint32_t));
+  const uint32_t m11 = load32(block + 11 * sizeof(uint32_t));
+  const uint32_t m12 = load32(block + 12 * sizeof(uint32_t));
+  const uint32_t m13 = load32(block + 13 * sizeof(uint32_t));
+  const uint32_t m14 = load32(block + 14 * sizeof(uint32_t));
+  const uint32_t m15 = load32(block + 15 * sizeof(uint32_t));
 #endif
   row1 = ff0 = LOADU( &S->h[0] );
   row2 = ff1 = LOADU( &S->h[4] );
-  row3 = _mm_setr_epi32( 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A );
-  row4 = _mm_xor_si128( _mm_setr_epi32( 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 ), LOADU( &S->t[0] ) );
+  row3 = _mm_loadu_si128( (__m128i const *)&blake2s_IV[0] );
+  row4 = _mm_xor_si128( _mm_loadu_si128( (__m128i const *)&blake2s_IV[4] ), LOADU( &S->t[0] ) );
   ROUND( 0 );
   ROUND( 1 );
   ROUND( 2 );
--- a/sse/makefile
+++ b/sse/makefile
@@ -1,5 +1,5 @@
 CC=gcc
-CFLAGS=-O3 -march=native -I../testvectors
+CFLAGS=-O3 -I../testvectors
 
 all:		blake2s blake2b blake2sp blake2bp check
 
--