]> granicus.if.org Git - php/commitdiff
Replace slow generic sha3 implementation by https://github.com/gvanas/KeccakCodePackage
author: Christian Schneider <schneider@search.ch>
Wed, 5 Apr 2017 15:30:23 +0000 (17:30 +0200)
committer: Anatol Belski <ab@php.net>
Sun, 28 May 2017 10:50:33 +0000 (12:50 +0200)
Fix hash_copy() tests by using correct size for sha3 context

Sync config.w32 with the new sha3 files

Move dependency on KeccakHash.h to hash_sha3.c so we do not rely on it to install php_hash_sha3.h

Allocate memory for the Keccak_HashInstance in hash_sha3.c so header files do not need to know about implementation details, while keeping the API backward compatible with the original sha3 implementation

Fix memory leak because hash_copy is called after init which already allocates the hashinstance

27 files changed:
ext/hash/config.m4
ext/hash/config.w32
ext/hash/hash_sha3.c
ext/hash/php_hash_sha3.h
ext/hash/sha3/generic32lc/KeccakHash.c [new file with mode: 0644]
ext/hash/sha3/generic32lc/KeccakHash.h [new file with mode: 0644]
ext/hash/sha3/generic32lc/KeccakP-1600-SnP.h [new file with mode: 0644]
ext/hash/sha3/generic32lc/KeccakP-1600-inplace32BI.c [new file with mode: 0644]
ext/hash/sha3/generic32lc/KeccakSponge.c [new file with mode: 0644]
ext/hash/sha3/generic32lc/KeccakSponge.h [new file with mode: 0644]
ext/hash/sha3/generic32lc/KeccakSponge.inc [new file with mode: 0644]
ext/hash/sha3/generic32lc/SnP-Relaned.h [new file with mode: 0644]
ext/hash/sha3/generic32lc/align.h [new file with mode: 0644]
ext/hash/sha3/generic32lc/brg_endian.h [new file with mode: 0644]
ext/hash/sha3/generic64lc/KeccakHash.c [new file with mode: 0644]
ext/hash/sha3/generic64lc/KeccakHash.h [new file with mode: 0644]
ext/hash/sha3/generic64lc/KeccakP-1600-64.macros [new file with mode: 0644]
ext/hash/sha3/generic64lc/KeccakP-1600-SnP.h [new file with mode: 0644]
ext/hash/sha3/generic64lc/KeccakP-1600-opt64-config.h [new file with mode: 0644]
ext/hash/sha3/generic64lc/KeccakP-1600-opt64.c [new file with mode: 0644]
ext/hash/sha3/generic64lc/KeccakP-1600-unrolling.macros [new file with mode: 0644]
ext/hash/sha3/generic64lc/KeccakSponge.c [new file with mode: 0644]
ext/hash/sha3/generic64lc/KeccakSponge.h [new file with mode: 0644]
ext/hash/sha3/generic64lc/KeccakSponge.inc [new file with mode: 0644]
ext/hash/sha3/generic64lc/SnP-Relaned.h [new file with mode: 0644]
ext/hash/sha3/generic64lc/align.h [new file with mode: 0644]
ext/hash/sha3/generic64lc/brg_endian.h [new file with mode: 0644]

index 703cf14a3022a61d57d1dd724028b0c239615b56..5629453d86128b5a142f9be0b88922e3f09fd4bf 100644 (file)
@@ -25,15 +25,24 @@ if test "$PHP_HASH" != "no"; then
   AC_CHECK_SIZEOF(long, 4)
   AC_CHECK_SIZEOF(long long, 8)
   
+  PHP_CHECK_64BIT([
+    SHA3_DIR="sha3/generic32lc"
+    SHA3_OPT_SRC="$SHA3_DIR/KeccakP-1600-inplace32BI.c"
+  ],[
+    SHA3_DIR="sha3/generic64lc"
+    SHA3_OPT_SRC="$SHA3_DIR/KeccakP-1600-opt64.c"
+  ])
+  EXT_HASH_SHA3_SOURCES="$SHA3_OPT_SRC $SHA3_DIR/KeccakHash.c $SHA3_DIR/KeccakSponge.c"
+  PHP_HASH_CFLAGS="-I@ext_srcdir@/$SHA3_DIR -DKeccakP200_excluded -DKeccakP400_excluded -DKeccakP800_excluded"
   EXT_HASH_SOURCES="hash.c hash_md.c hash_sha.c hash_ripemd.c hash_haval.c \
     hash_tiger.c hash_gost.c hash_snefru.c hash_whirlpool.c hash_adler32.c \
-    hash_crc32.c hash_fnv.c hash_joaat.c hash_sha3.c"
+    hash_crc32.c hash_fnv.c hash_joaat.c hash_sha3.c $EXT_HASH_SHA3_SOURCES"
   EXT_HASH_HEADERS="php_hash.h php_hash_md.h php_hash_sha.h php_hash_ripemd.h \
     php_hash_haval.h php_hash_tiger.h php_hash_gost.h php_hash_snefru.h \
     php_hash_whirlpool.h php_hash_adler32.h php_hash_crc32.h \
     php_hash_fnv.h php_hash_joaat.h php_hash_sha3.h"
   
-  PHP_NEW_EXTENSION(hash, $EXT_HASH_SOURCES, $ext_shared)
+  PHP_NEW_EXTENSION(hash, $EXT_HASH_SOURCES, $ext_shared,,$PHP_HASH_CFLAGS)
   ifdef([PHP_INSTALL_HEADERS], [
        PHP_INSTALL_HEADERS(ext/hash, $EXT_HASH_HEADERS)
   ])
index 17711facd8f7a8f489df67c032f4816ff067c9b1..1112142bbbac1e70eeb6ce7849419c6c0e2a901a 100644 (file)
@@ -12,13 +12,24 @@ if (PHP_MHASH != "no") {
 }
 
 if (PHP_HASH != "no") {
-       AC_DEFINE('HAVE_HASH_EXT', 1);
-       EXTENSION("hash", "hash.c hash_md.c hash_sha.c hash_ripemd.c hash_haval.c "
-               + "hash_tiger.c hash_gost.c hash_snefru.c hash_whirlpool.c "
-               + "hash_adler32.c hash_crc32.c hash_joaat.c hash_fnv.c hash_sha3.c");
+       var sha3_arch_dir = "sha3/" + (X64 ? "generic64lc" : "generic32lc");
+       var sha3_dir = "ext/hash/" + sha3_arch_dir;
+       if (CHECK_HEADER_ADD_INCLUDE("KeccakHash.h", "CFLAGS_HASH", PHP_HASH + ";" + sha3_dir)) {
+               AC_DEFINE('HAVE_HASH_EXT', 1);
+               EXTENSION("hash", "hash.c hash_md.c hash_sha.c hash_ripemd.c hash_haval.c "
+                       + "hash_tiger.c hash_gost.c hash_snefru.c hash_whirlpool.c "
+                       + "hash_adler32.c hash_crc32.c hash_joaat.c hash_fnv.c hash_sha3.c");
+
+               ADD_SOURCES(sha3_dir, "KeccakHash.c KeccakSponge.c " + (X64 ? "KeccakP-1600-opt64.c" : "KeccakP-1600-inplace32BI.c"),
+                       "hash");
+               ADD_FLAG("CFLAGS_HASH", "/DKeccakP200_excluded /DKeccakP400_excluded /DKeccakP800_excluded");
+
 
                PHP_INSTALL_HEADERS("ext/hash/", "php_hash.h php_hash_md.h php_hash_sha.h php_hash_ripemd.h " +
                "php_hash_haval.h php_hash_tiger.h php_hash_gost.h php_hash_snefru.h " + 
-               "php_hash_whirlpool.h php_hash_adler32.h php_hash_crc32.h php_hash_sha3.h");
+               "php_hash_whirlpool.h php_hash_adler32.h php_hash_crc32.h php_hash_sha3.h ");
+       } else {
+               WARNING("gd not enabled; libraries and headers not found");
+       }
 }
 
index a3bfda4899838630869326563094bfc6c3d68a81..ee9d010da4b267c43190a4abe02fa5a85d475174 100644 (file)
 #include "php_hash.h"
 #include "php_hash_sha3.h"
 
-#if (defined(__APPLE__) || defined(__APPLE_CC__)) && \
-    (defined(__BIG_ENDIAN__) || defined(__LITTLE_ENDIAN__))
-# if defined(__LITTLE_ENDIAN__)
-#  undef WORDS_BIGENDIAN
-# else
-#  if defined(__BIG_ENDIAN__)
-#   define WORDS_BIGENDIAN
-#  endif
-# endif
-#endif
+#define SUCCESS SHA3_SUCCESS /* Avoid conflict between KeccakHash.h and zend_types.h: with this macro in effect, the SUCCESS enumerator of HashReturn is declared as SHA3_SUCCESS in this file */
+#include "KeccakHash.h"
 
-static inline uint64_t rol64(uint64_t v, unsigned char b) {
-       return (v << b) | (v >> (64 - b));
-}
-static inline unsigned char idx(unsigned char x, unsigned char y) {
-       return x + (5 * y);
-}
-
-#ifdef WORDS_BIGENDIAN
-static inline uint64_t load64(const unsigned char* x) {
-       char i;
-       uint64_t ret = 0;
-       for (i = 7; i >= 0; --i) {
-               ret <<= 8;
-               ret |= x[i];
-       }
-       return ret;
-}
-static inline void store64(unsigned char* x, uint64_t val) {
-       char i;
-       for (i = 0; i < 8; ++i) {
-               x[i] = val & 0xFF;
-               val >>= 8;
-       }
-}
-static inline void xor64(unsigned char* x, uint64_t val) {
-       char i;
-       for (i = 0; i < 8; ++i) {
-               x[i] ^= val & 0xFF;
-               val >>= 8;
-       }
-}
-# define readLane(x, y)     load64(ctx->state+sizeof(uint64_t)*idx(x, y))
-# define writeLane(x, y, v) store64(ctx->state+sizeof(uint64_t)*idx(x, y), v)
-# define XORLane(x, y, v)   xor64(ctx->state+sizeof(uint64_t)*idx(x, y), v)
-#else
-# define readLane(x, y)     (((uint64_t*)ctx->state)[idx(x,y)])
-# define writeLane(x, y, v) (((uint64_t*)ctx->state)[idx(x,y)] = v)
-# define XORLane(x, y, v)   (((uint64_t*)ctx->state)[idx(x,y)] ^= v)
-#endif
-
-static inline char LFSR86540(unsigned char* pLFSR)
-{
-       unsigned char LFSR = *pLFSR;
-       char result = LFSR & 0x01;
-       if (LFSR & 0x80) {
-               // Primitive polynomial over GF(2): x^8+x^6+x^5+x^4+1
-               LFSR = (LFSR << 1) ^ 0x71;
-       } else {
-               LFSR <<= 1;
-       }
-       *pLFSR = LFSR;
-       return result;
-}
-
-static void permute(PHP_SHA3_CTX* ctx) {
-       unsigned char LFSRstate = 0x01;
-       unsigned char round;
-
-       for (round = 0; round < 24; ++round) {
-               { // Theta step (see [Keccak Reference, Section 2.3.2])
-                       uint64_t C[5], D;
-                       unsigned char x, y;
-                       for (x = 0; x < 5; ++x) {
-                               C[x] = readLane(x, 0) ^ readLane(x, 1) ^
-                               readLane(x, 2) ^ readLane(x, 3) ^ readLane(x, 4);
-                       }
-                       for (x = 0; x < 5; ++x) {
-                               D = C[(x+4)%5] ^ rol64(C[(x+1)%5], 1);
-                               for (y = 0; y < 5; ++y) {
-                                       XORLane(x, y, D);
-                               }
-                       }
-               }
-
-               { // p and Pi steps (see [Keccak Reference, Sections 2.3.3 and 2.3.4])
-                       unsigned char x = 1, y = 0, t;
-                       uint64_t current = readLane(x, y);
-                       for (t = 0; t < 24; ++t) {
-                               unsigned char r = ((t + 1) * (t + 2) / 2) % 64;
-                               unsigned char Y = (2*x + 3*y) % 5;
-                               uint64_t temp;
-                               x = y;
-                               y = Y;
-                               temp = readLane(x, y);
-                               writeLane(x, y, rol64(current, r));
-                               current = temp;
-                       }
-               }
-
-               { // X step (see [Keccak Reference, Section 2.3.1])
-                       unsigned char x, y;
-                       for (y = 0; y < 5; ++y) {
-                               uint64_t temp[5];
-                               for (x = 0; x < 5; ++x) {
-                                       temp[x] = readLane(x, y);
-                               }
-                               for (x = 0; x < 5; ++x) {
-                                       writeLane(x, y, temp[x] ^((~temp[(x+1)%5]) & temp[(x+2)%5]));
-                               }
-                       }
-               }
-
-               { // i step (see [Keccak Reference, Section 2.3.5])
-                       unsigned char j;
-                       for (j = 0; j < 7; ++j) {
-                               if (LFSR86540(&LFSRstate)) {
-                                       uint64_t bitPos = (1<<j) - 1;
-                                       XORLane(0, 0, (uint64_t)1 << bitPos);
-                               }
-                       }
-               }
-       }
-}
 
 // ==========================================================================
 
-static void PHP_SHA3_Init(PHP_SHA3_CTX* ctx,
-                          int bits) {
-       memset(ctx, 0, sizeof(PHP_SHA3_CTX));
-}
-
-static void PHP_SHA3_Update(PHP_SHA3_CTX* ctx,
-                            const unsigned char* buf,
-                            unsigned int count,
-                            size_t block_size) {
-       while (count > 0) {
-               unsigned int len = block_size - ctx->pos;
-               if (len > count) len = count;
-               count -= len;
-               while (len-- > 0) {
-                       ctx->state[ctx->pos++] ^= *(buf++);
-               }
-               if (ctx->pos >= block_size) {
-                       permute(ctx);
-                       ctx->pos = 0;
-               }
-       }
-}
-
-static void PHP_SHA3_Final(unsigned char* digest,
-                           PHP_SHA3_CTX* ctx,
-                           int block_size,
-                           int digest_size) {
-       int len = digest_size;
-
-       // Pad state to finalize
-       ctx->state[ctx->pos++] ^= 0x06;
-       ctx->state[block_size-1] ^= 0x80;
-       permute(ctx);
-
-       // Square output for digest
-       for(;;) {
-               int bs = (len < block_size) ? len : block_size;
-               memcpy(digest, ctx->state, bs);
-               digest += bs;
-               len -= bs;
-               if (!len) break;
-               permute(ctx);
-       }
-
-       // Zero out context
-       memset(ctx, 0, sizeof(PHP_SHA3_CTX));
+static int hash_sha3_copy(const void *ops, void *orig_context, void *dest_context)
+{      /* Copy callback installed in the SHA-3 ops tables: duplicates the Keccak
+        * state behind orig_context into the buffer behind dest_context.
+        * `ops` is not used. Always returns SUCCESS.
+        */
+       PHP_SHA3_CTX* orig = (PHP_SHA3_CTX*)orig_context;
+       PHP_SHA3_CTX* dest = (PHP_SHA3_CTX*)dest_context;
+       memcpy(dest->hashinstance, orig->hashinstance, sizeof(Keccak_HashInstance)); /* copies into dest's existing buffer instead of allocating:
+        * per the commit message, init has already run on dest and allocated
+        * its hashinstance, so allocating again here would leak it */
+       return SUCCESS;
 }
 
-// ==========================================================================
-
 #define DECLARE_SHA3_OPS(bits) \
 void PHP_SHA3##bits##Init(PHP_SHA3_##bits##_CTX* ctx) { \
-       PHP_SHA3_Init(ctx, bits); \
+       ctx->hashinstance = emalloc(sizeof(Keccak_HashInstance)); \
+       Keccak_HashInitialize_SHA3_##bits((Keccak_HashInstance *)ctx->hashinstance); \
 } \
 void PHP_SHA3##bits##Update(PHP_SHA3_##bits##_CTX* ctx, \
                             const unsigned char* input, \
                             unsigned int inputLen) { \
-       PHP_SHA3_Update(ctx, input, inputLen, \
-                    (1600 - (2 * bits)) >> 3); \
+       Keccak_HashUpdate((Keccak_HashInstance *)ctx->hashinstance, input, inputLen * 8); \
 } \
 void PHP_SHA3##bits##Final(unsigned char* digest, \
                            PHP_SHA3_##bits##_CTX* ctx) { \
-       PHP_SHA3_Final(digest, ctx, \
-                   (1600 - (2 * bits)) >> 3, \
-                   bits >> 3); \
+       Keccak_HashFinal((Keccak_HashInstance *)ctx->hashinstance, digest); \
+       efree(ctx->hashinstance); \
+       ctx->hashinstance = NULL; \
 } \
 const php_hash_ops php_hash_sha3_##bits##_ops = { \
        (php_hash_init_func_t) PHP_SHA3##bits##Init, \
        (php_hash_update_func_t) PHP_SHA3##bits##Update, \
        (php_hash_final_func_t) PHP_SHA3##bits##Final, \
-       php_hash_copy, \
+       hash_sha3_copy, \
        bits >> 3, \
        (1600 - (2 * bits)) >> 3, \
        sizeof(PHP_SHA3_##bits##_CTX), \
index 8b70ef4a7e0ad7da7497d51d9e602126fec57653..b47d1b102f3708452485482ce63a8b9ffb4b932f 100644 (file)
@@ -22,8 +22,7 @@
 #include "php.h"
 
 typedef struct {
-       unsigned char state[200]; // 5 * 5 * sizeof(uint64)
-       uint32_t pos;
+       void *hashinstance; /* Untyped so this header does not depend on KeccakHash.h; hash_sha3.c emalloc()s a Keccak_HashInstance here in Init and efree()s it in Final */
 } PHP_SHA3_CTX;
 
 typedef PHP_SHA3_CTX PHP_SHA3_224_CTX;
diff --git a/ext/hash/sha3/generic32lc/KeccakHash.c b/ext/hash/sha3/generic32lc/KeccakHash.c
new file mode 100644 (file)
index 0000000..259831b
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include "KeccakHash.h"
+
+/* ---------------------------------------------------------------- */
+/*
+ * Prepare `instance` for hashing: initialize its sponge with the given rate
+ * and capacity, then record the output length (in bits) and the suffix byte.
+ *
+ * Returns FAIL when delimitedSuffix is 0, the sponge initializer's result when
+ * that is not SUCCESS, and SUCCESS otherwise. fixedOutputLength and
+ * delimitedSuffix are only written on the success path.
+ */
+HashReturn Keccak_HashInitialize(Keccak_HashInstance *instance, unsigned int rate, unsigned int capacity, unsigned int hashbitlen, unsigned char delimitedSuffix)
+{
+    HashReturn result;
+
+    if (delimitedSuffix == 0) /* a zero suffix byte is rejected before touching the instance */
+        return FAIL;
+    result = (HashReturn)KeccakWidth1600_SpongeInitialize(&instance->sponge, rate, capacity);
+    if (result != SUCCESS) /* hand the sponge's status back unchanged */
+        return result;
+    instance->fixedOutputLength = hashbitlen;
+    instance->delimitedSuffix = delimitedSuffix;
+    return SUCCESS;
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Absorb `databitlen` bits of `data` into the sponge.
+ *
+ * Whole bytes (databitlen/8 of them) are absorbed directly. When databitlen is
+ * not a multiple of 8, the remaining bits are read from data[databitlen/8] and
+ * merged into instance->delimitedSuffix, so such a call modifies the suffix
+ * that Keccak_HashFinal() will later absorb.
+ *
+ * Returns the status of the last sponge absorb call that was made.
+ */
+HashReturn Keccak_HashUpdate(Keccak_HashInstance *instance, const BitSequence *data, DataLength databitlen)
+{
+    if ((databitlen % 8) == 0)
+        return (HashReturn)KeccakWidth1600_SpongeAbsorb(&instance->sponge, data, databitlen/8);
+    else {
+        HashReturn ret = (HashReturn)KeccakWidth1600_SpongeAbsorb(&instance->sponge, data, databitlen/8);
+        if (ret == SUCCESS) {
+            /* The last partial byte is assumed to be aligned on the least significant bits */
+            unsigned char lastByte = data[databitlen/8];
+            /* Concatenate the last few bits provided here with those of the suffix */
+            unsigned short delimitedLastBytes = (unsigned short)((unsigned short)lastByte | ((unsigned short)instance->delimitedSuffix << (databitlen % 8)));
+            if ((delimitedLastBytes & 0xFF00) == 0x0000) { /* combined bits still fit in one byte: keep them as the new suffix */
+                instance->delimitedSuffix = delimitedLastBytes & 0xFF;
+            }
+            else { /* spilled into a second byte: absorb the full low byte now, carry the high byte as the suffix */
+                unsigned char oneByte[1];
+                oneByte[0] = delimitedLastBytes & 0xFF;
+                ret = (HashReturn)KeccakWidth1600_SpongeAbsorb(&instance->sponge, oneByte, 1);
+                instance->delimitedSuffix = (delimitedLastBytes >> 8) & 0xFF;
+            }
+        }
+        return ret;
+    }
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Finish the input phase and write the digest.
+ *
+ * Absorbs instance->delimitedSuffix through the sponge's "last few bits"
+ * entry point; if that succeeds, squeezes fixedOutputLength/8 bytes into
+ * `hashval` (zero bytes when fixedOutputLength is 0). Returns the status of
+ * whichever sponge call ran last.
+ */
+HashReturn Keccak_HashFinal(Keccak_HashInstance *instance, BitSequence *hashval)
+{
+    HashReturn ret = (HashReturn)KeccakWidth1600_SpongeAbsorbLastFewBits(&instance->sponge, instance->delimitedSuffix);
+    if (ret == SUCCESS)
+        return (HashReturn)KeccakWidth1600_SpongeSqueeze(&instance->sponge, hashval, instance->fixedOutputLength/8); /* fixedOutputLength is in bits */
+    else
+        return ret;
+}
+
+/* ---------------------------------------------------------------- */
+/*
+ * Squeeze `databitlen` bits of output into `data`.
+ *
+ * Only whole bytes are supported: returns FAIL without touching the sponge
+ * when databitlen is not a multiple of 8, otherwise returns the status of the
+ * sponge squeeze of databitlen/8 bytes.
+ */
+HashReturn Keccak_HashSqueeze(Keccak_HashInstance *instance, BitSequence *data, DataLength databitlen)
+{
+    if ((databitlen % 8) != 0)
+        return FAIL;
+    return (HashReturn)KeccakWidth1600_SpongeSqueeze(&instance->sponge, data, databitlen/8);
+}
diff --git a/ext/hash/sha3/generic32lc/KeccakHash.h b/ext/hash/sha3/generic32lc/KeccakHash.h
new file mode 100644 (file)
index 0000000..ec35d3d
--- /dev/null
@@ -0,0 +1,113 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakHashInterface_h_
+#define _KeccakHashInterface_h_
+
+#ifndef KeccakP1600_excluded
+
+#include "KeccakSponge.h"
+#include <string.h>
+
+typedef unsigned char BitSequence; /* byte type of the input and output buffers */
+typedef size_t DataLength; /* length type; the Keccak_Hash* functions take it as a bit count */
+typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2 } HashReturn; /* status code returned by the Keccak_Hash* functions */
+
+typedef struct {
+    KeccakWidth1600_SpongeInstance sponge; /* underlying sponge state */
+    unsigned int fixedOutputLength; /* hashbitlen passed to Keccak_HashInitialize(), in bits */
+    unsigned char delimitedSuffix; /* suffix absorbed by Keccak_HashFinal(); Keccak_HashUpdate() rewrites it when given a partial byte */
+} Keccak_HashInstance;
+
+/**
+  * Function to initialize the Keccak[r, c] sponge function instance used in sequential hashing mode.
+  * @param  hashInstance    Pointer to the hash instance to be initialized.
+  * @param  rate        The value of the rate r.
+  * @param  capacity    The value of the capacity c.
+  * @param  hashbitlen  The desired number of output bits,
+  *                     or 0 for an arbitrarily-long output.
+  * @param  delimitedSuffix Bits that will be automatically appended to the end
+  *                         of the input message, as in domain separation.
+  *                         This is a byte containing from 0 to 7 bits
+  *                         formatted like the @a delimitedData parameter of
+  *                         the Keccak_SpongeAbsorbLastFewBits() function.
+  * @pre    One must have r+c=1600 and the rate a multiple of 8 bits in this implementation.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Keccak_HashInitialize(Keccak_HashInstance *hashInstance, unsigned int rate, unsigned int capacity, unsigned int hashbitlen, unsigned char delimitedSuffix);
+
+/** Macro to initialize a SHAKE128 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHAKE128(hashInstance)        Keccak_HashInitialize(hashInstance, 1344,  256,   0, 0x1F)
+
+/** Macro to initialize a SHAKE256 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHAKE256(hashInstance)        Keccak_HashInitialize(hashInstance, 1088,  512,   0, 0x1F)
+
+/** Macro to initialize a SHA3-224 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHA3_224(hashInstance)        Keccak_HashInitialize(hashInstance, 1152,  448, 224, 0x06)
+
+/** Macro to initialize a SHA3-256 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHA3_256(hashInstance)        Keccak_HashInitialize(hashInstance, 1088,  512, 256, 0x06)
+
+/** Macro to initialize a SHA3-384 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHA3_384(hashInstance)        Keccak_HashInitialize(hashInstance,  832,  768, 384, 0x06)
+
+/** Macro to initialize a SHA3-512 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHA3_512(hashInstance)        Keccak_HashInitialize(hashInstance,  576, 1024, 512, 0x06)
+
+/**
+  * Function to give input data to be absorbed.
+  * @param  hashInstance    Pointer to the hash instance initialized by Keccak_HashInitialize().
+  * @param  data        Pointer to the input data.
+  *                     When @a databitlen is not a multiple of 8, the last bits of data must be
+  *                     in the least significant bits of the last byte (little-endian convention).
+  * @param  databitlen  The number of input bits provided in the input data.
+  * @pre    In the previous call to Keccak_HashUpdate(), databitlen was a multiple of 8.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Keccak_HashUpdate(Keccak_HashInstance *hashInstance, const BitSequence *data, DataLength databitlen);
+
+/**
+  * Function to call after all input blocks have been input and to get
+  * output bits if the length was specified when calling Keccak_HashInitialize().
+  * @param  hashInstance    Pointer to the hash instance initialized by Keccak_HashInitialize().
+  * If @a hashbitlen was not 0 in the call to Keccak_HashInitialize(), the number of
+  *     output bits is equal to @a hashbitlen.
+  * If @a hashbitlen was 0 in the call to Keccak_HashInitialize(), the output bits
+  *     must be extracted using the Keccak_HashSqueeze() function.
+  * @param  hashval     Pointer to the buffer where to store the output data.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Keccak_HashFinal(Keccak_HashInstance *hashInstance, BitSequence *hashval);
+
+ /**
+  * Function to squeeze output data.
+  * @param  hashInstance    Pointer to the hash instance initialized by Keccak_HashInitialize().
+  * @param  data        Pointer to the buffer where to store the output data.
+  * @param  databitlen  The number of output bits desired (must be a multiple of 8).
+  * @pre    Keccak_HashFinal() must have been already called.
+  * @pre    @a databitlen is a multiple of 8.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Keccak_HashSqueeze(Keccak_HashInstance *hashInstance, BitSequence *data, DataLength databitlen);
+
+#endif
+
+#endif
diff --git a/ext/hash/sha3/generic32lc/KeccakP-1600-SnP.h b/ext/hash/sha3/generic32lc/KeccakP-1600-SnP.h
new file mode 100644 (file)
index 0000000..258f411
--- /dev/null
@@ -0,0 +1,38 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakP_1600_SnP_h_
+#define _KeccakP_1600_SnP_h_
+
+/** For the documentation, see SnP-documentation.h.
+ */
+
+#define KeccakP1600_implementation      "in-place 32-bit optimized implementation"
+#define KeccakP1600_stateSizeInBytes    200
+#define KeccakP1600_stateAlignment      8
+
+#define KeccakP1600_StaticInitialize()
+void KeccakP1600_Initialize(void *state);
+void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);
+void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount);
+void KeccakP1600_Permute_Nrounds(void *state, unsigned int nrounds);
+void KeccakP1600_Permute_12rounds(void *state);
+void KeccakP1600_Permute_24rounds(void *state);
+void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
+
+#endif
diff --git a/ext/hash/sha3/generic32lc/KeccakP-1600-inplace32BI.c b/ext/hash/sha3/generic32lc/KeccakP-1600-inplace32BI.c
new file mode 100644 (file)
index 0000000..3595e5f
--- /dev/null
@@ -0,0 +1,1158 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include    <string.h>
+#include "brg_endian.h"
+#include "KeccakP-1600-SnP.h"
+#include "SnP-Relaned.h"
+
+/* Fixed-width aliases used throughout this file. */
+typedef unsigned char UINT8;
+typedef unsigned int UINT32;
+/* WARNING: on 8-bit and 16-bit platforms, this should be replaced by: */
+/* typedef unsigned long       UINT32; */
+
+/*
+ * Rotates the 32-bit value `a` left by `offset` bits.
+ * `a` is parenthesised before the cast so that a compound argument is
+ * converted to UINT32 as a whole rather than only its first operand.
+ * `offset` must be in 1..31: an offset of 0 would shift right by 32, which
+ * is undefined for a 32-bit type. `a` is evaluated twice.
+ */
+#define ROL32(a, offset) ((((UINT32)(a)) << (offset)) ^ (((UINT32)(a)) >> (32-(offset))))
+
+/* Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
+ *
+ * prepareToBitInterleaving: applies the same four masked swap steps to `low`
+ * (result left in temp0) and to `high` (result left in temp1). Afterwards
+ * each result holds the even-indexed bits of its input in its lower 16 bits
+ * and the odd-indexed bits in its upper 16 bits.
+ * Expands to several statements ending in ';' and evaluates `low` and
+ * `high` once each; `temp`, `temp0` and `temp1` must be assignable 32-bit
+ * scratch variables.
+ */
+#define prepareToBitInterleaving(low, high, temp, temp0, temp1) \
+        temp0 = (low); \
+        temp = (temp0 ^ (temp0 >>  1)) & 0x22222222UL;  temp0 = temp0 ^ temp ^ (temp <<  1); \
+        temp = (temp0 ^ (temp0 >>  2)) & 0x0C0C0C0CUL;  temp0 = temp0 ^ temp ^ (temp <<  2); \
+        temp = (temp0 ^ (temp0 >>  4)) & 0x00F000F0UL;  temp0 = temp0 ^ temp ^ (temp <<  4); \
+        temp = (temp0 ^ (temp0 >>  8)) & 0x0000FF00UL;  temp0 = temp0 ^ temp ^ (temp <<  8); \
+        temp1 = (high); \
+        temp = (temp1 ^ (temp1 >>  1)) & 0x22222222UL;  temp1 = temp1 ^ temp ^ (temp <<  1); \
+        temp = (temp1 ^ (temp1 >>  2)) & 0x0C0C0C0CUL;  temp1 = temp1 ^ temp ^ (temp <<  2); \
+        temp = (temp1 ^ (temp1 >>  4)) & 0x00F000F0UL;  temp1 = temp1 ^ temp ^ (temp <<  4); \
+        temp = (temp1 ^ (temp1 >>  8)) & 0x0000FF00UL;  temp1 = temp1 ^ temp ^ (temp <<  8);
+/* Interleaves the 64-bit lane given as (low, high) and XORs it into the
+ * state words: `even` receives the even-indexed bits of the lane, `odd` the
+ * odd-indexed bits. `even` and `odd` are each evaluated once. */
+#define toBitInterleavingAndXOR(low, high, even, odd, temp, temp0, temp1) \
+        prepareToBitInterleaving(low, high, temp, temp0, temp1) \
+        even ^= (temp0 & 0x0000FFFF) | (temp1 << 16); \
+        odd ^= (temp0 >> 16) | (temp1 & 0xFFFF0000);
+/* Same as toBitInterleavingAndXOR, but ANDs the interleaved words into
+ * `even` and `odd` (used to mask bytes out of a lane). */
+#define toBitInterleavingAndAND(low, high, even, odd, temp, temp0, temp1) \
+        prepareToBitInterleaving(low, high, temp, temp0, temp1) \
+        even &= (temp0 & 0x0000FFFF) | (temp1 << 16); \
+        odd &= (temp0 >> 16) | (temp1 & 0xFFFF0000);
+/* Same as toBitInterleavingAndXOR, but overwrites `even` and `odd` with the
+ * interleaved words instead of combining them with the old contents. */
+#define toBitInterleavingAndSet(low, high, even, odd, temp, temp0, temp1) \
+        prepareToBitInterleaving(low, high, temp, temp0, temp1) \
+        even = (temp0 & 0x0000FFFF) | (temp1 << 16); \
+        odd = (temp0 >> 16) | (temp1 & 0xFFFF0000);
+
+/* Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
+ *
+ * prepareFromBitInterleaving: inverse of the interleaving above. It first
+ * regroups the halves of `even` and `odd` so that temp0 collects the bits
+ * belonging to the lower 32 bits of the lane and temp1 those of the upper
+ * 32 bits, then applies the four masked swap steps in the reverse order.
+ * On completion temp0 is the low word and temp1 the high word of the lane.
+ * Expands to several statements ending in ';' and evaluates `even` and
+ * `odd` once each.
+ */
+#define prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \
+        temp0 = (even); \
+        temp1 = (odd); \
+        temp = (temp0 & 0x0000FFFF) | (temp1 << 16); \
+        temp1 = (temp0 >> 16) | (temp1 & 0xFFFF0000); \
+        temp0 = temp; \
+        temp = (temp0 ^ (temp0 >>  8)) & 0x0000FF00UL;  temp0 = temp0 ^ temp ^ (temp <<  8); \
+        temp = (temp0 ^ (temp0 >>  4)) & 0x00F000F0UL;  temp0 = temp0 ^ temp ^ (temp <<  4); \
+        temp = (temp0 ^ (temp0 >>  2)) & 0x0C0C0C0CUL;  temp0 = temp0 ^ temp ^ (temp <<  2); \
+        temp = (temp0 ^ (temp0 >>  1)) & 0x22222222UL;  temp0 = temp0 ^ temp ^ (temp <<  1); \
+        temp = (temp1 ^ (temp1 >>  8)) & 0x0000FF00UL;  temp1 = temp1 ^ temp ^ (temp <<  8); \
+        temp = (temp1 ^ (temp1 >>  4)) & 0x00F000F0UL;  temp1 = temp1 ^ temp ^ (temp <<  4); \
+        temp = (temp1 ^ (temp1 >>  2)) & 0x0C0C0C0CUL;  temp1 = temp1 ^ temp ^ (temp <<  2); \
+        temp = (temp1 ^ (temp1 >>  1)) & 0x22222222UL;  temp1 = temp1 ^ temp ^ (temp <<  1);
+/* De-interleaves (even, odd) and stores the lane's low word in `low` and
+ * its high word in `high`; each output is assigned exactly once. */
+#define fromBitInterleaving(even, odd, low, high, temp, temp0, temp1) \
+        prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \
+        low = temp0; \
+        high = temp1;
+/* De-interleaves (even, odd) and stores lowIn XOR low word in `lowOut` and
+ * highIn XOR high word in `highOut`; every argument is evaluated once. */
+#define fromBitInterleavingAndXOR(even, odd, lowIn, highIn, lowOut, highOut, temp, temp0, temp1) \
+        prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \
+        lowOut = lowIn ^ temp0; \
+        highOut = highIn ^ temp1;
+
+/*
+ * Zeroes `length` bytes, starting at byte `offset`, of the lane at index
+ * `lanePosition`; the other bytes of that lane are left unchanged.
+ *
+ * state        - state buffer, accessed as 32-bit half-lanes (two UINT32
+ *                per 8-byte lane, in bit-interleaved form).
+ * lanePosition - index of the lane to modify.
+ * offset       - first byte within the lane to clear.
+ * length       - number of bytes to clear; the caller must guarantee
+ *                offset + length <= 8 (not checked here).
+ */
+void KeccakP1600_SetBytesInLaneToZero(void *state, unsigned int lanePosition, unsigned int offset, unsigned int length)
+{
+    UINT8 laneAsBytes[8];
+    UINT32 low, high;
+    UINT32 temp, temp0, temp1;
+    UINT32 *stateAsHalfLanes = (UINT32*)state;
+
+    /* Byte mask for the lane: 0x00 over the bytes to clear, 0xFF elsewhere. */
+    memset(laneAsBytes, 0xFF, sizeof(laneAsBytes));
+    memset(laneAsBytes+offset, 0x00, length);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    /* memcpy instead of a UINT32* cast: the byte array has no guaranteed
+       32-bit alignment and must not be read through a UINT32 lvalue. */
+    memcpy(&low, laneAsBytes+0, 4);
+    memcpy(&high, laneAsBytes+4, 4);
+#else
+    low = laneAsBytes[0]
+        | ((UINT32)(laneAsBytes[1]) << 8)
+        | ((UINT32)(laneAsBytes[2]) << 16)
+        | ((UINT32)(laneAsBytes[3]) << 24);
+    high = laneAsBytes[4]
+        | ((UINT32)(laneAsBytes[5]) << 8)
+        | ((UINT32)(laneAsBytes[6]) << 16)
+        | ((UINT32)(laneAsBytes[7]) << 24);
+#endif
+    /* Interleave the mask the same way as the state, then AND it in. */
+    toBitInterleavingAndAND(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Resets the state to all zero bits.
+ *
+ * state - buffer of at least KeccakP1600_stateSizeInBytes bytes.
+ */
+void KeccakP1600_Initialize(void *state)
+{
+    /* Use the size published in KeccakP-1600-SnP.h instead of repeating 200. */
+    memset(state, 0, KeccakP1600_stateSizeInBytes);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * XORs a single byte into the state.
+ *
+ * state  - state buffer, accessed as bit-interleaved 32-bit half-lanes.
+ * byte   - value to XOR in.
+ * offset - byte position within the state; offset/8 selects the lane and
+ *          offset%8 the byte inside that lane.
+ */
+void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset)
+{
+    UINT32 *halfLanes = (UINT32*)state;
+    const unsigned int lane = offset/8;
+    const unsigned int byteInLane = offset%8;
+    /* The byte lands in either the low or the high 32-bit word of the lane. */
+    const UINT32 shifted = (UINT32)byte << ((byteInLane%4)*8);
+    UINT32 low = (byteInLane < 4) ? shifted : 0;
+    UINT32 high = (byteInLane < 4) ? 0 : shifted;
+    UINT32 t, t0, t1;
+
+    toBitInterleavingAndXOR(low, high, halfLanes[lane*2+0], halfLanes[lane*2+1], t, t0, t1);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    UINT8 laneAsBytes[8];
+    UINT32 low, high;
+    UINT32 temp, temp0, temp1;
+    UINT32 *stateAsHalfLanes = (UINT32*)state;
+
+    memset(laneAsBytes, 0, 8);
+    memcpy(laneAsBytes+offset, data, length);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    low = *((UINT32*)(laneAsBytes+0));
+    high = *((UINT32*)(laneAsBytes+4));
+#else
+    low = laneAsBytes[0]
+        | ((UINT32)(laneAsBytes[1]) << 8)
+        | ((UINT32)(laneAsBytes[2]) << 16)
+        | ((UINT32)(laneAsBytes[3]) << 24);
+    high = laneAsBytes[4]
+        | ((UINT32)(laneAsBytes[5]) << 8)
+        | ((UINT32)(laneAsBytes[6]) << 16)
+        | ((UINT32)(laneAsBytes[7]) << 24);
+#endif
+    toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * XORs `laneCount` whole 8-byte lanes from `data` into the first
+ * `laneCount` lanes of the state.
+ *
+ * state     - state buffer, accessed as bit-interleaved 32-bit half-lanes.
+ * data      - laneCount*8 input bytes, each lane in little-endian order.
+ * laneCount - number of lanes to process.
+ */
+void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    const UINT32 * pI = (const UINT32 *)data;
+    UINT32 * pS = (UINT32*)state;
+    UINT32 t, x0, x1;
+    int i;
+    /* Each iteration consumes two input words and updates two state words. */
+    for (i = laneCount-1; i >= 0; --i, pI += 2, pS += 2) {
+        UINT32 low;
+        UINT32 high;
+#ifdef NO_MISALIGNED_ACCESSES
+        memcpy(&low, pI, 4);
+        memcpy(&high, pI+1, 4);
+#else
+        low = pI[0];
+        high = pI[1];
+#endif
+        toBitInterleavingAndXOR(low, high, pS[0], pS[1], t, x0, x1);
+    }
+#else
+    UINT32 *stateAsHalfLanes = (UINT32*)state;
+    UINT32 low, high, temp, temp0, temp1;
+    unsigned int lanePosition;
+    for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
+        const unsigned char *laneAsBytes = data+lanePosition*8;
+        /* Assemble the two words byte by byte so host byte order is irrelevant. */
+        low = laneAsBytes[0]
+            | ((UINT32)(laneAsBytes[1]) << 8)
+            | ((UINT32)(laneAsBytes[2]) << 16)
+            | ((UINT32)(laneAsBytes[3]) << 24);
+        high = laneAsBytes[4]
+            | ((UINT32)(laneAsBytes[5]) << 8)
+            | ((UINT32)(laneAsBytes[6]) << 16)
+            | ((UINT32)(laneAsBytes[7]) << 24);
+        toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+/* XORs `length` bytes from `data` into the state, starting at byte `offset`.
+ * The work is handed to SnP_AddBytes (presumably a helper macro from
+ * SnP-Relaned.h - confirm), together with the whole-lane routine, the
+ * partial-lane routine and the lane size of 8 bytes. */
+void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    SnP_AddBytes(state, data, offset, length, KeccakP1600_AddLanes, KeccakP1600_AddBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+/* Replaces `length` bytes at byte `offset` of lane `lanePosition` with the
+ * bytes from `data`: the target bytes are cleared first, then `data` is
+ * XORed into the now-zero bytes. */
+void KeccakP1600_OverwriteBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    KeccakP1600_SetBytesInLaneToZero(state, lanePosition, offset, length);
+    KeccakP1600_AddBytesInLane(state, lanePosition, data, offset, length);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Replaces the first `laneCount` lanes of the state with `laneCount` whole
+ * 8-byte lanes taken from `data`.
+ *
+ * state     - state buffer, accessed as bit-interleaved 32-bit half-lanes.
+ * data      - laneCount*8 input bytes, each lane in little-endian order.
+ * laneCount - number of lanes to process.
+ */
+void KeccakP1600_OverwriteLanes(void *state, const unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    const UINT32 * pI = (const UINT32 *)data;
+    UINT32 * pS = (UINT32 *)state;
+    UINT32 t, x0, x1;
+    int i;
+    /* Each iteration consumes two input words and sets two state words. */
+    for (i = laneCount-1; i >= 0; --i, pI += 2, pS += 2) {
+        UINT32 low;
+        UINT32 high;
+#ifdef NO_MISALIGNED_ACCESSES
+        memcpy(&low, pI, 4);
+        memcpy(&high, pI+1, 4);
+#else
+        low = pI[0];
+        high = pI[1];
+#endif
+        toBitInterleavingAndSet(low, high, pS[0], pS[1], t, x0, x1);
+    }
+#else
+    UINT32 *stateAsHalfLanes = (UINT32*)state;
+    UINT32 low, high, temp, temp0, temp1;
+    unsigned int lanePosition;
+    for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
+        const unsigned char *laneAsBytes = data+lanePosition*8;
+        /* Assemble the two words byte by byte so host byte order is irrelevant. */
+        low = laneAsBytes[0]
+            | ((UINT32)(laneAsBytes[1]) << 8)
+            | ((UINT32)(laneAsBytes[2]) << 16)
+            | ((UINT32)(laneAsBytes[3]) << 24);
+        high = laneAsBytes[4]
+            | ((UINT32)(laneAsBytes[5]) << 8)
+            | ((UINT32)(laneAsBytes[6]) << 16)
+            | ((UINT32)(laneAsBytes[7]) << 24);
+        toBitInterleavingAndSet(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1);
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+/* Replaces `length` state bytes, starting at byte `offset`, with `data`.
+ * The work is handed to SnP_OverwriteBytes (presumably a helper macro from
+ * SnP-Relaned.h - confirm), together with the whole-lane routine, the
+ * partial-lane routine and the lane size of 8 bytes. */
+void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    SnP_OverwriteBytes(state, data, offset, length, KeccakP1600_OverwriteLanes, KeccakP1600_OverwriteBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Zeroes the first `byteCount` bytes of the state.
+ *
+ * state     - state buffer, stored as bit-interleaved 32-bit half-lanes.
+ * byteCount - number of leading state bytes to clear.
+ */
+void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount)
+{
+    const unsigned int wholeLanes = byteCount/8;
+    const unsigned int tailBytes = byteCount%8;
+
+    /* Complete lanes: both half-lane words of each become zero. */
+    memset(state, 0, wholeLanes*8);
+    /* Remaining bytes sit at the start of the next lane. */
+    if (tailBytes != 0)
+        KeccakP1600_SetBytesInLaneToZero(state, wholeLanes, 0, tailBytes);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length)
+{
+    UINT32 *stateAsHalfLanes = (UINT32*)state;
+    UINT32 low, high, temp, temp0, temp1;
+    UINT8 laneAsBytes[8];
+
+    fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    *((UINT32*)(laneAsBytes+0)) = low;
+    *((UINT32*)(laneAsBytes+4)) = high;
+#else
+    laneAsBytes[0] = low & 0xFF;
+    laneAsBytes[1] = (low >> 8) & 0xFF;
+    laneAsBytes[2] = (low >> 16) & 0xFF;
+    laneAsBytes[3] = (low >> 24) & 0xFF;
+    laneAsBytes[4] = high & 0xFF;
+    laneAsBytes[5] = (high >> 8) & 0xFF;
+    laneAsBytes[6] = (high >> 16) & 0xFF;
+    laneAsBytes[7] = (high >> 24) & 0xFF;
+#endif
+    memcpy(data, laneAsBytes+offset, length);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Writes the first `laneCount` lanes of the state to `data` as
+ * laneCount*8 bytes, each lane in little-endian byte order.
+ *
+ * state     - state buffer, read as bit-interleaved 32-bit half-lanes.
+ * data      - destination for laneCount*8 bytes.
+ * laneCount - number of lanes to extract.
+ */
+void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    UINT32 *dst = (UINT32 *)data;
+    const UINT32 *src = (const UINT32 *)state;
+    UINT32 t, x0, x1;
+    int remaining;
+    /* Two state words in, two output words out per iteration. */
+    for (remaining = laneCount-1; remaining >= 0; --remaining, src += 2, dst += 2) {
+        UINT32 low;
+        UINT32 high;
+        fromBitInterleaving(src[0], src[1], low, high, t, x0, x1);
+#ifdef NO_MISALIGNED_ACCESSES
+        memcpy(dst, &low, 4);
+        memcpy(dst+1, &high, 4);
+#else
+        dst[0] = low;
+        dst[1] = high;
+#endif
+    }
+#else
+    const UINT32 *halfLanes = (const UINT32*)state;
+    unsigned int lane;
+    for(lane=0; lane<laneCount; lane++) {
+        unsigned char *out = data+lane*8;
+        UINT32 low, high, t, x0, x1;
+        fromBitInterleaving(halfLanes[lane*2], halfLanes[lane*2+1], low, high, t, x0, x1);
+        /* Emit both words least-significant byte first. */
+        out[0] = low & 0xFF;
+        out[1] = (low >> 8) & 0xFF;
+        out[2] = (low >> 16) & 0xFF;
+        out[3] = (low >> 24) & 0xFF;
+        out[4] = high & 0xFF;
+        out[5] = (high >> 8) & 0xFF;
+        out[6] = (high >> 16) & 0xFF;
+        out[7] = (high >> 24) & 0xFF;
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+/* Copies `length` state bytes, starting at byte `offset`, into `data`.
+ * The work is handed to SnP_ExtractBytes (presumably a helper macro from
+ * SnP-Relaned.h - confirm), together with the whole-lane routine, the
+ * partial-lane routine and the lane size of 8 bytes. */
+void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length)
+{
+    SnP_ExtractBytes(state, data, offset, length, KeccakP1600_ExtractLanes, KeccakP1600_ExtractBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_ExtractAndAddBytesInLane(const void *state, unsigned int lanePosition, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{
+    UINT32 *stateAsHalfLanes = (UINT32*)state;
+    UINT32 low, high, temp, temp0, temp1;
+    UINT8 laneAsBytes[8];
+    unsigned int i;
+
+    fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    *((UINT32*)(laneAsBytes+0)) = low;
+    *((UINT32*)(laneAsBytes+4)) = high;
+#else
+    laneAsBytes[0] = low & 0xFF;
+    laneAsBytes[1] = (low >> 8) & 0xFF;
+    laneAsBytes[2] = (low >> 16) & 0xFF;
+    laneAsBytes[3] = (low >> 24) & 0xFF;
+    laneAsBytes[4] = high & 0xFF;
+    laneAsBytes[5] = (high >> 8) & 0xFF;
+    laneAsBytes[6] = (high >> 16) & 0xFF;
+    laneAsBytes[7] = (high >> 24) & 0xFF;
+#endif
+    for(i=0; i<length; i++)
+        output[i] = input[i] ^ laneAsBytes[offset+i];
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * For the first `laneCount` lanes of the state, writes
+ * output = input XOR state, lane by lane (8 bytes each, little-endian
+ * byte order within a lane).
+ *
+ * state     - state buffer, read as bit-interleaved 32-bit half-lanes.
+ * input     - laneCount*8 bytes to combine with the state.
+ * output    - destination for laneCount*8 bytes.
+ * laneCount - number of lanes to process.
+ */
+void KeccakP1600_ExtractAndAddLanes(const void *state, const unsigned char *input, unsigned char *output, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    const UINT32 * pI = (const UINT32 *)input;
+    UINT32 * pO = (UINT32 *)output;
+    const UINT32 * pS = (const UINT32 *)state;
+    UINT32 t, x0, x1;
+    int i;
+    for (i = laneCount-1; i >= 0; --i, pS += 2, pI += 2, pO += 2) {
+#ifdef NO_MISALIGNED_ACCESSES
+        UINT32 low;
+        UINT32 high;
+        UINT32 inLow;
+        UINT32 inHigh;
+        fromBitInterleaving(pS[0], pS[1], low, high, t, x0, x1);
+        /* input and output may be unaligned: go through memcpy on both
+           sides instead of dereferencing the UINT32 pointers. */
+        memcpy(&inLow, pI, 4);
+        memcpy(&inHigh, pI+1, 4);
+        low ^= inLow;
+        high ^= inHigh;
+        memcpy(pO, &low, 4);
+        memcpy(pO+1, &high, 4);
+#else
+        fromBitInterleavingAndXOR(pS[0], pS[1], pI[0], pI[1], pO[0], pO[1], t, x0, x1)
+#endif
+    }
+#else
+    const UINT32 *stateAsHalfLanes = (const UINT32*)state;
+    unsigned int lanePosition;
+    for(lanePosition=0; lanePosition<laneCount; lanePosition++) {
+        UINT32 low, high, temp, temp0, temp1;
+        UINT8 laneAsBytes[8];
+        unsigned int i;
+        fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1);
+        laneAsBytes[0] = low & 0xFF;
+        laneAsBytes[1] = (low >> 8) & 0xFF;
+        laneAsBytes[2] = (low >> 16) & 0xFF;
+        laneAsBytes[3] = (low >> 24) & 0xFF;
+        laneAsBytes[4] = high & 0xFF;
+        laneAsBytes[5] = (high >> 8) & 0xFF;
+        laneAsBytes[6] = (high >> 16) & 0xFF;
+        laneAsBytes[7] = (high >> 24) & 0xFF;
+        /* XOR byte by byte: every output byte pairs with the input byte at
+           the same position (the previous word-wise code reused input word 0
+           for the upper half), and no aligned 32-bit access is needed. */
+        for(i=0; i<8; i++)
+            output[lanePosition*8+i] = input[lanePosition*8+i] ^ laneAsBytes[i];
+    }
+#endif
+}
+/* ---------------------------------------------------------------- */
+/* Writes output = input XOR state for `length` bytes starting at state byte
+ * `offset`. The work is handed to SnP_ExtractAndAddBytes (presumably a
+ * helper macro from SnP-Relaned.h - confirm), together with the whole-lane
+ * routine, the partial-lane routine and the lane size of 8 bytes. */
+void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{
+    SnP_ExtractAndAddBytes(state, input, output, offset, length, KeccakP1600_ExtractAndAddLanes, KeccakP1600_ExtractAndAddBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+/* Round-constant table: 24 pairs of 32-bit words followed by one extra
+ * 0xFF word (2*24+1 entries).
+ * NOTE(review): each pair looks like one 64-bit Keccak-f[1600] round
+ * constant split into its even-bit and odd-bit halves, matching the
+ * bit-interleaved state layout; the KeccakRoundN() macros presumably read
+ * it through pRoundConstants, and the trailing 0xFF presumably marks the
+ * end of the table - confirm against the permutation code that walks it. */
+static const UINT32 KeccakF1600RoundConstants_int2[2*24+1] =
+{
+    0x00000001UL,    0x00000000UL,
+    0x00000000UL,    0x00000089UL,
+    0x00000000UL,    0x8000008bUL,
+    0x00000000UL,    0x80008080UL,
+    0x00000001UL,    0x0000008bUL,
+    0x00000001UL,    0x00008000UL,
+    0x00000001UL,    0x80008088UL,
+    0x00000001UL,    0x80000082UL,
+    0x00000000UL,    0x0000000bUL,
+    0x00000000UL,    0x0000000aUL,
+    0x00000001UL,    0x00008082UL,
+    0x00000000UL,    0x00008003UL,
+    0x00000001UL,    0x0000808bUL,
+    0x00000001UL,    0x8000000bUL,
+    0x00000001UL,    0x8000008aUL,
+    0x00000001UL,    0x80000081UL,
+    0x00000000UL,    0x80000081UL,
+    0x00000000UL,    0x80000008UL,
+    0x00000000UL,    0x00000083UL,
+    0x00000000UL,    0x80008003UL,
+    0x00000001UL,    0x80008088UL,
+    0x00000000UL,    0x80000088UL,
+    0x00000001UL,    0x00008000UL,
+    0x00000000UL,    0x80008082UL,
+    0x000000FFUL
+};
+/* KeccakRound0(): one fully unrolled round on bit-interleaved half-lanes.
+ * Not self-contained: the expansion site must declare the 32-bit state
+ * variables Aba0..Asu1, the scratch variables Ba..Bu, Cw..Cz and Da0..Du1,
+ * and the pointer pRoundConstants.
+ * The first group of statements XORs five state words at a time into
+ * parities and derives the correction words Da0..Du1 from them. Each later
+ * group XORs five state words with their correction word, rotates them with
+ * ROL32 where a rotation applies, and stores B ^ (~B' & B'') back into the
+ * same five variables it read. Two words are read from *pRoundConstants
+ * (advancing it by two) and XORed into Aba0 and Aba1.
+ * Unlike KeccakRound1()/KeccakRound2(), the expansion ends without a
+ * semicolon, so the caller has to supply one.
+ */
+#define KeccakRound0() \
+        Cx = Abu0^Agu0^Aku0^Amu0^Asu0; \
+        Du1 = Abe1^Age1^Ake1^Ame1^Ase1; \
+        Da0 = Cx^ROL32(Du1, 1); \
+        Cz = Abu1^Agu1^Aku1^Amu1^Asu1; \
+        Du0 = Abe0^Age0^Ake0^Ame0^Ase0; \
+        Da1 = Cz^Du0; \
+        Cw = Abi0^Agi0^Aki0^Ami0^Asi0; \
+        Do0 = Cw^ROL32(Cz, 1); \
+        Cy = Abi1^Agi1^Aki1^Ami1^Asi1; \
+        Do1 = Cy^Cx; \
+        Cx = Aba0^Aga0^Aka0^Ama0^Asa0; \
+        De0 = Cx^ROL32(Cy, 1); \
+        Cz = Aba1^Aga1^Aka1^Ama1^Asa1; \
+        De1 = Cz^Cw; \
+        Cy = Abo1^Ago1^Ako1^Amo1^Aso1; \
+        Di0 = Du0^ROL32(Cy, 1); \
+        Cw = Abo0^Ago0^Ako0^Amo0^Aso0; \
+        Di1 = Du1^Cw; \
+        Du0 = Cw^ROL32(Cz, 1); \
+        Du1 = Cy^Cx; \
+\
+        Ba = (Aba0^Da0); \
+        Be = ROL32((Age0^De0), 22); \
+        Bi = ROL32((Aki1^Di1), 22); \
+        Bo = ROL32((Amo1^Do1), 11); \
+        Bu = ROL32((Asu0^Du0),  7); \
+        Aba0 =   Ba ^((~Be)&  Bi ); \
+        Aba0 ^= *(pRoundConstants++); \
+        Age0 =   Be ^((~Bi)&  Bo ); \
+        Aki1 =   Bi ^((~Bo)&  Bu ); \
+        Amo1 =   Bo ^((~Bu)&  Ba ); \
+        Asu0 =   Bu ^((~Ba)&  Be ); \
+        Ba = (Aba1^Da1); \
+        Be = ROL32((Age1^De1), 22); \
+        Bi = ROL32((Aki0^Di0), 21); \
+        Bo = ROL32((Amo0^Do0), 10); \
+        Bu = ROL32((Asu1^Du1),  7); \
+        Aba1 =   Ba ^((~Be)&  Bi ); \
+        Aba1 ^= *(pRoundConstants++); \
+        Age1 =   Be ^((~Bi)&  Bo ); \
+        Aki0 =   Bi ^((~Bo)&  Bu ); \
+        Amo0 =   Bo ^((~Bu)&  Ba ); \
+        Asu1 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Aka1^Da1),  2); \
+        Bo = ROL32((Ame1^De1), 23); \
+        Bu = ROL32((Asi1^Di1), 31); \
+        Ba = ROL32((Abo0^Do0), 14); \
+        Be = ROL32((Agu0^Du0), 10); \
+        Aka1 =   Ba ^((~Be)&  Bi ); \
+        Ame1 =   Be ^((~Bi)&  Bo ); \
+        Asi1 =   Bi ^((~Bo)&  Bu ); \
+        Abo0 =   Bo ^((~Bu)&  Ba ); \
+        Agu0 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Aka0^Da0),  1); \
+        Bo = ROL32((Ame0^De0), 22); \
+        Bu = ROL32((Asi0^Di0), 30); \
+        Ba = ROL32((Abo1^Do1), 14); \
+        Be = ROL32((Agu1^Du1), 10); \
+        Aka0 =   Ba ^((~Be)&  Bi ); \
+        Ame0 =   Be ^((~Bi)&  Bo ); \
+        Asi0 =   Bi ^((~Bo)&  Bu ); \
+        Abo1 =   Bo ^((~Bu)&  Ba ); \
+        Agu1 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Asa0^Da0),  9); \
+        Ba = ROL32((Abe1^De1),  1); \
+        Be = ROL32((Agi0^Di0),  3); \
+        Bi = ROL32((Ako1^Do1), 13); \
+        Bo = ROL32((Amu0^Du0),  4); \
+        Asa0 =   Ba ^((~Be)&  Bi ); \
+        Abe1 =   Be ^((~Bi)&  Bo ); \
+        Agi0 =   Bi ^((~Bo)&  Bu ); \
+        Ako1 =   Bo ^((~Bu)&  Ba ); \
+        Amu0 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Asa1^Da1),  9); \
+        Ba = (Abe0^De0); \
+        Be = ROL32((Agi1^Di1),  3); \
+        Bi = ROL32((Ako0^Do0), 12); \
+        Bo = ROL32((Amu1^Du1),  4); \
+        Asa1 =   Ba ^((~Be)&  Bi ); \
+        Abe0 =   Be ^((~Bi)&  Bo ); \
+        Agi1 =   Bi ^((~Bo)&  Bu ); \
+        Ako0 =   Bo ^((~Bu)&  Ba ); \
+        Amu1 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Aga0^Da0), 18); \
+        Bi = ROL32((Ake0^De0),  5); \
+        Bo = ROL32((Ami1^Di1),  8); \
+        Bu = ROL32((Aso0^Do0), 28); \
+        Ba = ROL32((Abu1^Du1), 14); \
+        Aga0 =   Ba ^((~Be)&  Bi ); \
+        Ake0 =   Be ^((~Bi)&  Bo ); \
+        Ami1 =   Bi ^((~Bo)&  Bu ); \
+        Aso0 =   Bo ^((~Bu)&  Ba ); \
+        Abu1 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Aga1^Da1), 18); \
+        Bi = ROL32((Ake1^De1),  5); \
+        Bo = ROL32((Ami0^Di0),  7); \
+        Bu = ROL32((Aso1^Do1), 28); \
+        Ba = ROL32((Abu0^Du0), 13); \
+        Aga1 =   Ba ^((~Be)&  Bi ); \
+        Ake1 =   Be ^((~Bi)&  Bo ); \
+        Ami0 =   Bi ^((~Bo)&  Bu ); \
+        Aso1 =   Bo ^((~Bu)&  Ba ); \
+        Abu0 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Ama1^Da1), 21); \
+        Bu = ROL32((Ase0^De0),  1); \
+        Ba = ROL32((Abi0^Di0), 31); \
+        Be = ROL32((Ago1^Do1), 28); \
+        Bi = ROL32((Aku1^Du1), 20); \
+        Ama1 =   Ba ^((~Be)&  Bi ); \
+        Ase0 =   Be ^((~Bi)&  Bo ); \
+        Abi0 =   Bi ^((~Bo)&  Bu ); \
+        Ago1 =   Bo ^((~Bu)&  Ba ); \
+        Aku1 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Ama0^Da0), 20); \
+        Bu = ROL32((Ase1^De1),  1); \
+        Ba = ROL32((Abi1^Di1), 31); \
+        Be = ROL32((Ago0^Do0), 27); \
+        Bi = ROL32((Aku0^Du0), 19); \
+        Ama0 =   Ba ^((~Be)&  Bi ); \
+        Ase1 =   Be ^((~Bi)&  Bo ); \
+        Abi1 =   Bi ^((~Bo)&  Bu ); \
+        Ago0 =   Bo ^((~Bu)&  Ba ); \
+        Aku0 =   Bu ^((~Ba)&  Be )
+/* KeccakRound1(): same round computation as KeccakRound0(), but the state
+ * words are read from, and written back to, a different assignment of the
+ * Aba0..Asu1 variables (e.g. Ame1 is used where KeccakRound0() uses Age0).
+ * NOTE(review): presumably this reflects where the in-place KeccakRound0()
+ * left each word, so the macros are meant to be expanded in sequence -
+ * confirm at the expansion site.
+ * Requires the same variables and pRoundConstants as KeccakRound0(); the
+ * expansion ends with a semicolon.
+ */
+#define KeccakRound1() \
+        Cx = Asu0^Agu0^Amu0^Abu1^Aku1; \
+        Du1 = Age1^Ame0^Abe0^Ake1^Ase1; \
+        Da0 = Cx^ROL32(Du1, 1); \
+        Cz = Asu1^Agu1^Amu1^Abu0^Aku0; \
+        Du0 = Age0^Ame1^Abe1^Ake0^Ase0; \
+        Da1 = Cz^Du0; \
+        Cw = Aki1^Asi1^Agi0^Ami1^Abi0; \
+        Do0 = Cw^ROL32(Cz, 1); \
+        Cy = Aki0^Asi0^Agi1^Ami0^Abi1; \
+        Do1 = Cy^Cx; \
+        Cx = Aba0^Aka1^Asa0^Aga0^Ama1; \
+        De0 = Cx^ROL32(Cy, 1); \
+        Cz = Aba1^Aka0^Asa1^Aga1^Ama0; \
+        De1 = Cz^Cw; \
+        Cy = Amo0^Abo1^Ako0^Aso1^Ago0; \
+        Di0 = Du0^ROL32(Cy, 1); \
+        Cw = Amo1^Abo0^Ako1^Aso0^Ago1; \
+        Di1 = Du1^Cw; \
+        Du0 = Cw^ROL32(Cz, 1); \
+        Du1 = Cy^Cx; \
+\
+        Ba = (Aba0^Da0); \
+        Be = ROL32((Ame1^De0), 22); \
+        Bi = ROL32((Agi1^Di1), 22); \
+        Bo = ROL32((Aso1^Do1), 11); \
+        Bu = ROL32((Aku1^Du0),  7); \
+        Aba0 =   Ba ^((~Be)&  Bi ); \
+        Aba0 ^= *(pRoundConstants++); \
+        Ame1 =   Be ^((~Bi)&  Bo ); \
+        Agi1 =   Bi ^((~Bo)&  Bu ); \
+        Aso1 =   Bo ^((~Bu)&  Ba ); \
+        Aku1 =   Bu ^((~Ba)&  Be ); \
+        Ba = (Aba1^Da1); \
+        Be = ROL32((Ame0^De1), 22); \
+        Bi = ROL32((Agi0^Di0), 21); \
+        Bo = ROL32((Aso0^Do0), 10); \
+        Bu = ROL32((Aku0^Du1),  7); \
+        Aba1 =   Ba ^((~Be)&  Bi ); \
+        Aba1 ^= *(pRoundConstants++); \
+        Ame0 =   Be ^((~Bi)&  Bo ); \
+        Agi0 =   Bi ^((~Bo)&  Bu ); \
+        Aso0 =   Bo ^((~Bu)&  Ba ); \
+        Aku0 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Asa1^Da1),  2); \
+        Bo = ROL32((Ake1^De1), 23); \
+        Bu = ROL32((Abi1^Di1), 31); \
+        Ba = ROL32((Amo1^Do0), 14); \
+        Be = ROL32((Agu0^Du0), 10); \
+        Asa1 =   Ba ^((~Be)&  Bi ); \
+        Ake1 =   Be ^((~Bi)&  Bo ); \
+        Abi1 =   Bi ^((~Bo)&  Bu ); \
+        Amo1 =   Bo ^((~Bu)&  Ba ); \
+        Agu0 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Asa0^Da0),  1); \
+        Bo = ROL32((Ake0^De0), 22); \
+        Bu = ROL32((Abi0^Di0), 30); \
+        Ba = ROL32((Amo0^Do1), 14); \
+        Be = ROL32((Agu1^Du1), 10); \
+        Asa0 =   Ba ^((~Be)&  Bi ); \
+        Ake0 =   Be ^((~Bi)&  Bo ); \
+        Abi0 =   Bi ^((~Bo)&  Bu ); \
+        Amo0 =   Bo ^((~Bu)&  Ba ); \
+        Agu1 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Ama1^Da0),  9); \
+        Ba = ROL32((Age1^De1),  1); \
+        Be = ROL32((Asi1^Di0),  3); \
+        Bi = ROL32((Ako0^Do1), 13); \
+        Bo = ROL32((Abu1^Du0),  4); \
+        Ama1 =   Ba ^((~Be)&  Bi ); \
+        Age1 =   Be ^((~Bi)&  Bo ); \
+        Asi1 =   Bi ^((~Bo)&  Bu ); \
+        Ako0 =   Bo ^((~Bu)&  Ba ); \
+        Abu1 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Ama0^Da1),  9); \
+        Ba = (Age0^De0); \
+        Be = ROL32((Asi0^Di1),  3); \
+        Bi = ROL32((Ako1^Do0), 12); \
+        Bo = ROL32((Abu0^Du1),  4); \
+        Ama0 =   Ba ^((~Be)&  Bi ); \
+        Age0 =   Be ^((~Bi)&  Bo ); \
+        Asi0 =   Bi ^((~Bo)&  Bu ); \
+        Ako1 =   Bo ^((~Bu)&  Ba ); \
+        Abu0 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Aka1^Da0), 18); \
+        Bi = ROL32((Abe1^De0),  5); \
+        Bo = ROL32((Ami0^Di1),  8); \
+        Bu = ROL32((Ago1^Do0), 28); \
+        Ba = ROL32((Asu1^Du1), 14); \
+        Aka1 =   Ba ^((~Be)&  Bi ); \
+        Abe1 =   Be ^((~Bi)&  Bo ); \
+        Ami0 =   Bi ^((~Bo)&  Bu ); \
+        Ago1 =   Bo ^((~Bu)&  Ba ); \
+        Asu1 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Aka0^Da1), 18); \
+        Bi = ROL32((Abe0^De1),  5); \
+        Bo = ROL32((Ami1^Di0),  7); \
+        Bu = ROL32((Ago0^Do1), 28); \
+        Ba = ROL32((Asu0^Du0), 13); \
+        Aka0 =   Ba ^((~Be)&  Bi ); \
+        Abe0 =   Be ^((~Bi)&  Bo ); \
+        Ami1 =   Bi ^((~Bo)&  Bu ); \
+        Ago0 =   Bo ^((~Bu)&  Ba ); \
+        Asu0 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Aga1^Da1), 21); \
+        Bu = ROL32((Ase0^De0),  1); \
+        Ba = ROL32((Aki1^Di0), 31); \
+        Be = ROL32((Abo1^Do1), 28); \
+        Bi = ROL32((Amu1^Du1), 20); \
+        Aga1 =   Ba ^((~Be)&  Bi ); \
+        Ase0 =   Be ^((~Bi)&  Bo ); \
+        Aki1 =   Bi ^((~Bo)&  Bu ); \
+        Abo1 =   Bo ^((~Bu)&  Ba ); \
+        Amu1 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Aga0^Da0), 20); \
+        Bu = ROL32((Ase1^De1),  1); \
+        Ba = ROL32((Aki0^Di1), 31); \
+        Be = ROL32((Abo0^Do0), 27); \
+        Bi = ROL32((Amu0^Du0), 19); \
+        Aga0 =   Ba ^((~Be)&  Bi ); \
+        Ase1 =   Be ^((~Bi)&  Bo ); \
+        Aki0 =   Bi ^((~Bo)&  Bu ); \
+        Abo0 =   Bo ^((~Bu)&  Ba ); \
+        Amu0 =   Bu ^((~Ba)&  Be );
+/* KeccakRound2(): same round computation as KeccakRound0()/KeccakRound1(),
+ * with yet another assignment of the Aba0..Asu1 variables to the positions
+ * being combined (e.g. Ake1 is used where KeccakRound0() uses Age0).
+ * NOTE(review): presumably this matches the layout left behind by the
+ * in-place KeccakRound1() - confirm at the expansion site.
+ * Requires the same variables and pRoundConstants as KeccakRound0(); the
+ * expansion ends with a semicolon.
+ */
+#define KeccakRound2() \
+        Cx = Aku1^Agu0^Abu1^Asu1^Amu1; \
+        Du1 = Ame0^Ake0^Age0^Abe0^Ase1; \
+        Da0 = Cx^ROL32(Du1, 1); \
+        Cz = Aku0^Agu1^Abu0^Asu0^Amu0; \
+        Du0 = Ame1^Ake1^Age1^Abe1^Ase0; \
+        Da1 = Cz^Du0; \
+        Cw = Agi1^Abi1^Asi1^Ami0^Aki1; \
+        Do0 = Cw^ROL32(Cz, 1); \
+        Cy = Agi0^Abi0^Asi0^Ami1^Aki0; \
+        Do1 = Cy^Cx; \
+        Cx = Aba0^Asa1^Ama1^Aka1^Aga1; \
+        De0 = Cx^ROL32(Cy, 1); \
+        Cz = Aba1^Asa0^Ama0^Aka0^Aga0; \
+        De1 = Cz^Cw; \
+        Cy = Aso0^Amo0^Ako1^Ago0^Abo0; \
+        Di0 = Du0^ROL32(Cy, 1); \
+        Cw = Aso1^Amo1^Ako0^Ago1^Abo1; \
+        Di1 = Du1^Cw; \
+        Du0 = Cw^ROL32(Cz, 1); \
+        Du1 = Cy^Cx; \
+\
+        Ba = (Aba0^Da0); \
+        Be = ROL32((Ake1^De0), 22); \
+        Bi = ROL32((Asi0^Di1), 22); \
+        Bo = ROL32((Ago0^Do1), 11); \
+        Bu = ROL32((Amu1^Du0),  7); \
+        Aba0 =   Ba ^((~Be)&  Bi ); \
+        Aba0 ^= *(pRoundConstants++); \
+        Ake1 =   Be ^((~Bi)&  Bo ); \
+        Asi0 =   Bi ^((~Bo)&  Bu ); \
+        Ago0 =   Bo ^((~Bu)&  Ba ); \
+        Amu1 =   Bu ^((~Ba)&  Be ); \
+        Ba = (Aba1^Da1); \
+        Be = ROL32((Ake0^De1), 22); \
+        Bi = ROL32((Asi1^Di0), 21); \
+        Bo = ROL32((Ago1^Do0), 10); \
+        Bu = ROL32((Amu0^Du1),  7); \
+        Aba1 =   Ba ^((~Be)&  Bi ); \
+        Aba1 ^= *(pRoundConstants++); \
+        Ake0 =   Be ^((~Bi)&  Bo ); \
+        Asi1 =   Bi ^((~Bo)&  Bu ); \
+        Ago1 =   Bo ^((~Bu)&  Ba ); \
+        Amu0 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Ama0^Da1),  2); \
+        Bo = ROL32((Abe0^De1), 23); \
+        Bu = ROL32((Aki0^Di1), 31); \
+        Ba = ROL32((Aso1^Do0), 14); \
+        Be = ROL32((Agu0^Du0), 10); \
+        Ama0 =   Ba ^((~Be)&  Bi ); \
+        Abe0 =   Be ^((~Bi)&  Bo ); \
+        Aki0 =   Bi ^((~Bo)&  Bu ); \
+        Aso1 =   Bo ^((~Bu)&  Ba ); \
+        Agu0 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Ama1^Da0),  1); \
+        Bo = ROL32((Abe1^De0), 22); \
+        Bu = ROL32((Aki1^Di0), 30); \
+        Ba = ROL32((Aso0^Do1), 14); \
+        Be = ROL32((Agu1^Du1), 10); \
+        Ama1 =   Ba ^((~Be)&  Bi ); \
+        Abe1 =   Be ^((~Bi)&  Bo ); \
+        Aki1 =   Bi ^((~Bo)&  Bu ); \
+        Aso0 =   Bo ^((~Bu)&  Ba ); \
+        Agu1 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Aga1^Da0),  9); \
+        Ba = ROL32((Ame0^De1),  1); \
+        Be = ROL32((Abi1^Di0),  3); \
+        Bi = ROL32((Ako1^Do1), 13); \
+        Bo = ROL32((Asu1^Du0),  4); \
+        Aga1 =   Ba ^((~Be)&  Bi ); \
+        Ame0 =   Be ^((~Bi)&  Bo ); \
+        Abi1 =   Bi ^((~Bo)&  Bu ); \
+        Ako1 =   Bo ^((~Bu)&  Ba ); \
+        Asu1 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Aga0^Da1),  9); \
+        Ba = (Ame1^De0); \
+        Be = ROL32((Abi0^Di1),  3); \
+        Bi = ROL32((Ako0^Do0), 12); \
+        Bo = ROL32((Asu0^Du1),  4); \
+        Aga0 =   Ba ^((~Be)&  Bi ); \
+        Ame1 =   Be ^((~Bi)&  Bo ); \
+        Abi0 =   Bi ^((~Bo)&  Bu ); \
+        Ako0 =   Bo ^((~Bu)&  Ba ); \
+        Asu0 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Asa1^Da0), 18); \
+        Bi = ROL32((Age1^De0),  5); \
+        Bo = ROL32((Ami1^Di1),  8); \
+        Bu = ROL32((Abo1^Do0), 28); \
+        Ba = ROL32((Aku0^Du1), 14); \
+        Asa1 =   Ba ^((~Be)&  Bi ); \
+        Age1 =   Be ^((~Bi)&  Bo ); \
+        Ami1 =   Bi ^((~Bo)&  Bu ); \
+        Abo1 =   Bo ^((~Bu)&  Ba ); \
+        Aku0 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Asa0^Da1), 18); \
+        Bi = ROL32((Age0^De1),  5); \
+        Bo = ROL32((Ami0^Di0),  7); \
+        Bu = ROL32((Abo0^Do1), 28); \
+        Ba = ROL32((Aku1^Du0), 13); \
+        Asa0 =   Ba ^((~Be)&  Bi ); \
+        Age0 =   Be ^((~Bi)&  Bo ); \
+        Ami0 =   Bi ^((~Bo)&  Bu ); \
+        Abo0 =   Bo ^((~Bu)&  Ba ); \
+        Aku1 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Aka0^Da1), 21); \
+        Bu = ROL32((Ase0^De0),  1); \
+        Ba = ROL32((Agi1^Di0), 31); \
+        Be = ROL32((Amo0^Do1), 28); \
+        Bi = ROL32((Abu0^Du1), 20); \
+        Aka0 =   Ba ^((~Be)&  Bi ); \
+        Ase0 =   Be ^((~Bi)&  Bo ); \
+        Agi1 =   Bi ^((~Bo)&  Bu ); \
+        Amo0 =   Bo ^((~Bu)&  Ba ); \
+        Abu0 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Aka1^Da0), 20); \
+        Bu = ROL32((Ase1^De1),  1); \
+        Ba = ROL32((Agi0^Di1), 31); \
+        Be = ROL32((Amo1^Do0), 27); \
+        Bi = ROL32((Abu1^Du0), 19); \
+        Aka1 =   Ba ^((~Be)&  Bi ); \
+        Ase1 =   Be ^((~Bi)&  Bo ); \
+        Agi0 =   Bi ^((~Bo)&  Bu ); \
+        Amo1 =   Bo ^((~Bu)&  Ba ); \
+        Abu1 =   Bu ^((~Ba)&  Be );
+
+/*
+ * Last of the unrolled round macros entered by the fall-through switch in
+ * KeccakP1600_Permute_Nrounds(). It expects the A?? half-lane names and the
+ * B?, C?, D?? temporaries and pRoundConstants of the expanding function.
+ *
+ * Steps, in order:
+ *  - XOR groups of five half-lanes into Cx/Cy/Cz/Cw and combine them (one
+ *    operand rotated left by 1 for the "0" halves) into Da0..Du1;
+ *  - for each group of five half-lanes: XOR in the matching D value, rotate
+ *    with ROL32, then recombine the five B temporaries as B ^ ((~B') & B'');
+ *  - XOR two consecutive words read through pRoundConstants into Aba0 and
+ *    Aba1, advancing the pointer by two words.
+ *
+ * In this macro every group writes back to the same five A?? names it read.
+ * NOTE(review): the rotation amounts and D pairings are order- and
+ * value-sensitive; do not reorder statements.
+ */
+#define KeccakRound3() \
+        Cx = Amu1^Agu0^Asu1^Aku0^Abu0; \
+        Du1 = Ake0^Abe1^Ame1^Age0^Ase1; \
+        Da0 = Cx^ROL32(Du1, 1); \
+        Cz = Amu0^Agu1^Asu0^Aku1^Abu1; \
+        Du0 = Ake1^Abe0^Ame0^Age1^Ase0; \
+        Da1 = Cz^Du0; \
+        Cw = Asi0^Aki0^Abi1^Ami1^Agi1; \
+        Do0 = Cw^ROL32(Cz, 1); \
+        Cy = Asi1^Aki1^Abi0^Ami0^Agi0; \
+        Do1 = Cy^Cx; \
+        Cx = Aba0^Ama0^Aga1^Asa1^Aka0; \
+        De0 = Cx^ROL32(Cy, 1); \
+        Cz = Aba1^Ama1^Aga0^Asa0^Aka1; \
+        De1 = Cz^Cw; \
+        Cy = Ago1^Aso0^Ako0^Abo0^Amo1; \
+        Di0 = Du0^ROL32(Cy, 1); \
+        Cw = Ago0^Aso1^Ako1^Abo1^Amo0; \
+        Di1 = Du1^Cw; \
+        Du0 = Cw^ROL32(Cz, 1); \
+        Du1 = Cy^Cx; \
+\
+        Ba = (Aba0^Da0); \
+        Be = ROL32((Abe0^De0), 22); \
+        Bi = ROL32((Abi0^Di1), 22); \
+        Bo = ROL32((Abo0^Do1), 11); \
+        Bu = ROL32((Abu0^Du0),  7); \
+        Aba0 =   Ba ^((~Be)&  Bi ); \
+        Aba0 ^= *(pRoundConstants++); \
+        Abe0 =   Be ^((~Bi)&  Bo ); \
+        Abi0 =   Bi ^((~Bo)&  Bu ); \
+        Abo0 =   Bo ^((~Bu)&  Ba ); \
+        Abu0 =   Bu ^((~Ba)&  Be ); \
+        Ba = (Aba1^Da1); \
+        Be = ROL32((Abe1^De1), 22); \
+        Bi = ROL32((Abi1^Di0), 21); \
+        Bo = ROL32((Abo1^Do0), 10); \
+        Bu = ROL32((Abu1^Du1),  7); \
+        Aba1 =   Ba ^((~Be)&  Bi ); \
+        Aba1 ^= *(pRoundConstants++); \
+        Abe1 =   Be ^((~Bi)&  Bo ); \
+        Abi1 =   Bi ^((~Bo)&  Bu ); \
+        Abo1 =   Bo ^((~Bu)&  Ba ); \
+        Abu1 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Aga0^Da1),  2); \
+        Bo = ROL32((Age0^De1), 23); \
+        Bu = ROL32((Agi0^Di1), 31); \
+        Ba = ROL32((Ago0^Do0), 14); \
+        Be = ROL32((Agu0^Du0), 10); \
+        Aga0 =   Ba ^((~Be)&  Bi ); \
+        Age0 =   Be ^((~Bi)&  Bo ); \
+        Agi0 =   Bi ^((~Bo)&  Bu ); \
+        Ago0 =   Bo ^((~Bu)&  Ba ); \
+        Agu0 =   Bu ^((~Ba)&  Be ); \
+        Bi = ROL32((Aga1^Da0),  1); \
+        Bo = ROL32((Age1^De0), 22); \
+        Bu = ROL32((Agi1^Di0), 30); \
+        Ba = ROL32((Ago1^Do1), 14); \
+        Be = ROL32((Agu1^Du1), 10); \
+        Aga1 =   Ba ^((~Be)&  Bi ); \
+        Age1 =   Be ^((~Bi)&  Bo ); \
+        Agi1 =   Bi ^((~Bo)&  Bu ); \
+        Ago1 =   Bo ^((~Bu)&  Ba ); \
+        Agu1 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Aka0^Da0),  9); \
+        Ba = ROL32((Ake0^De1),  1); \
+        Be = ROL32((Aki0^Di0),  3); \
+        Bi = ROL32((Ako0^Do1), 13); \
+        Bo = ROL32((Aku0^Du0),  4); \
+        Aka0 =   Ba ^((~Be)&  Bi ); \
+        Ake0 =   Be ^((~Bi)&  Bo ); \
+        Aki0 =   Bi ^((~Bo)&  Bu ); \
+        Ako0 =   Bo ^((~Bu)&  Ba ); \
+        Aku0 =   Bu ^((~Ba)&  Be ); \
+        Bu = ROL32((Aka1^Da1),  9); \
+        Ba = (Ake1^De0); \
+        Be = ROL32((Aki1^Di1),  3); \
+        Bi = ROL32((Ako1^Do0), 12); \
+        Bo = ROL32((Aku1^Du1),  4); \
+        Aka1 =   Ba ^((~Be)&  Bi ); \
+        Ake1 =   Be ^((~Bi)&  Bo ); \
+        Aki1 =   Bi ^((~Bo)&  Bu ); \
+        Ako1 =   Bo ^((~Bu)&  Ba ); \
+        Aku1 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Ama0^Da0), 18); \
+        Bi = ROL32((Ame0^De0),  5); \
+        Bo = ROL32((Ami0^Di1),  8); \
+        Bu = ROL32((Amo0^Do0), 28); \
+        Ba = ROL32((Amu0^Du1), 14); \
+        Ama0 =   Ba ^((~Be)&  Bi ); \
+        Ame0 =   Be ^((~Bi)&  Bo ); \
+        Ami0 =   Bi ^((~Bo)&  Bu ); \
+        Amo0 =   Bo ^((~Bu)&  Ba ); \
+        Amu0 =   Bu ^((~Ba)&  Be ); \
+        Be = ROL32((Ama1^Da1), 18); \
+        Bi = ROL32((Ame1^De1),  5); \
+        Bo = ROL32((Ami1^Di0),  7); \
+        Bu = ROL32((Amo1^Do1), 28); \
+        Ba = ROL32((Amu1^Du0), 13); \
+        Ama1 =   Ba ^((~Be)&  Bi ); \
+        Ame1 =   Be ^((~Bi)&  Bo ); \
+        Ami1 =   Bi ^((~Bo)&  Bu ); \
+        Amo1 =   Bo ^((~Bu)&  Ba ); \
+        Amu1 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Asa0^Da1), 21); \
+        Bu = ROL32((Ase0^De0),  1); \
+        Ba = ROL32((Asi0^Di0), 31); \
+        Be = ROL32((Aso0^Do1), 28); \
+        Bi = ROL32((Asu0^Du1), 20); \
+        Asa0 =   Ba ^((~Be)&  Bi ); \
+        Ase0 =   Be ^((~Bi)&  Bo ); \
+        Asi0 =   Bi ^((~Bo)&  Bu ); \
+        Aso0 =   Bo ^((~Bu)&  Ba ); \
+        Asu0 =   Bu ^((~Ba)&  Be ); \
+        Bo = ROL32((Asa1^Da0), 20); \
+        Bu = ROL32((Ase1^De1),  1); \
+        Ba = ROL32((Asi1^Di1), 31); \
+        Be = ROL32((Aso1^Do0), 27); \
+        Bi = ROL32((Asu1^Du0), 19); \
+        Asa1 =   Ba ^((~Be)&  Bi ); \
+        Ase1 =   Be ^((~Bi)&  Bo ); \
+        Asi1 =   Bi ^((~Bo)&  Bu ); \
+        Aso1 =   Bo ^((~Bu)&  Ba ); \
+        Asu1 =   Bu ^((~Ba)&  Be );
+
+/*
+ * Run nRounds rounds of the permutation in place on the state.
+ *
+ * @param state    Pointer to the state; it is accessed as 50 UINT32 half-lanes
+ *                 (indices 0..49) through the A?? names defined below.
+ *                 NOTE(review): presumably must be suitably aligned for UINT32
+ *                 access - confirm against the callers.
+ * @param nRounds  Number of rounds. The round-constant pointer starts
+ *                 (24-nRounds)*2 words into KeccakF1600RoundConstants_int2, so
+ *                 two words are consumed per round and the rounds executed are
+ *                 the last nRounds ones of the table.
+ *                 NOTE(review): a value above 24 makes the unsigned expression
+ *                 (24-nRounds) wrap and the pointer leave the table - callers
+ *                 presumably never pass more than 24; confirm.
+ *
+ * The rounds are unrolled four at a time (KeccakRound0..KeccakRound3). When
+ * nRounds is not a multiple of 4, the first pass enters the unrolled sequence
+ * part-way through, after the first switch has rearranged half-lanes.
+ */
+void KeccakP1600_Permute_Nrounds(void *state, unsigned int nRounds)
+{
+    UINT32 Da0, De0, Di0, Do0, Du0;
+    UINT32 Da1, De1, Di1, Do1, Du1;
+    UINT32 Ba, Be, Bi, Bo, Bu;
+    UINT32 Cx, Cy, Cz, Cw;
+    const UINT32 *pRoundConstants = KeccakF1600RoundConstants_int2+(24-nRounds)*2;
+    UINT32 *stateAsHalfLanes = (UINT32*)state;
+    /* Names for the 50 half-lane words; "0"/"1" suffixes are consecutive words. */
+    #define Aba0 stateAsHalfLanes[ 0]
+    #define Aba1 stateAsHalfLanes[ 1]
+    #define Abe0 stateAsHalfLanes[ 2]
+    #define Abe1 stateAsHalfLanes[ 3]
+    #define Abi0 stateAsHalfLanes[ 4]
+    #define Abi1 stateAsHalfLanes[ 5]
+    #define Abo0 stateAsHalfLanes[ 6]
+    #define Abo1 stateAsHalfLanes[ 7]
+    #define Abu0 stateAsHalfLanes[ 8]
+    #define Abu1 stateAsHalfLanes[ 9]
+    #define Aga0 stateAsHalfLanes[10]
+    #define Aga1 stateAsHalfLanes[11]
+    #define Age0 stateAsHalfLanes[12]
+    #define Age1 stateAsHalfLanes[13]
+    #define Agi0 stateAsHalfLanes[14]
+    #define Agi1 stateAsHalfLanes[15]
+    #define Ago0 stateAsHalfLanes[16]
+    #define Ago1 stateAsHalfLanes[17]
+    #define Agu0 stateAsHalfLanes[18]
+    #define Agu1 stateAsHalfLanes[19]
+    #define Aka0 stateAsHalfLanes[20]
+    #define Aka1 stateAsHalfLanes[21]
+    #define Ake0 stateAsHalfLanes[22]
+    #define Ake1 stateAsHalfLanes[23]
+    #define Aki0 stateAsHalfLanes[24]
+    #define Aki1 stateAsHalfLanes[25]
+    #define Ako0 stateAsHalfLanes[26]
+    #define Ako1 stateAsHalfLanes[27]
+    #define Aku0 stateAsHalfLanes[28]
+    #define Aku1 stateAsHalfLanes[29]
+    #define Ama0 stateAsHalfLanes[30]
+    #define Ama1 stateAsHalfLanes[31]
+    #define Ame0 stateAsHalfLanes[32]
+    #define Ame1 stateAsHalfLanes[33]
+    #define Ami0 stateAsHalfLanes[34]
+    #define Ami1 stateAsHalfLanes[35]
+    #define Amo0 stateAsHalfLanes[36]
+    #define Amo1 stateAsHalfLanes[37]
+    #define Amu0 stateAsHalfLanes[38]
+    #define Amu1 stateAsHalfLanes[39]
+    #define Asa0 stateAsHalfLanes[40]
+    #define Asa1 stateAsHalfLanes[41]
+    #define Ase0 stateAsHalfLanes[42]
+    #define Ase1 stateAsHalfLanes[43]
+    #define Asi0 stateAsHalfLanes[44]
+    #define Asi1 stateAsHalfLanes[45]
+    #define Aso0 stateAsHalfLanes[46]
+    #define Aso1 stateAsHalfLanes[47]
+    #define Asu0 stateAsHalfLanes[48]
+    #define Asu1 stateAsHalfLanes[49]
+
+    /* Only the remainder modulo 4 matters from here on: it selects both the
+     * rearrangement below and the entry point into the unrolled rounds. */
+    nRounds &= 3;
+    /* Rearrange half-lanes before entering the unrolled sequence part-way.
+     * NOTE(review): presumably this puts the state into the arrangement the
+     * entered round macro expects from its predecessor - confirm against
+     * KeccakRound0..2. Nothing is done when nRounds is 0 (no matching case). */
+    switch ( nRounds )
+    {
+        /* The swap helpers reuse Ba/Be/Bi/Bo as scratch registers. */
+        #define I0 Ba
+        #define I1 Be
+        #define T0 Bi
+        #define T1 Bo
+        /* Rotate four two-word lanes: in0 <- in1 <- in2 <- in3 <- old in0.
+         * eoN is 0 to keep the two words in order, 1 to exchange them. */
+        #define SwapPI13( in0,in1,in2,in3,eo0,eo1,eo2,eo3 ) \
+            I0 = (in0)[0]; I1 = (in0)[1];       \
+            T0 = (in1)[0]; T1 = (in1)[1];       \
+            (in0)[eo0] = T0; (in0)[eo0^1] = T1; \
+            T0 = (in2)[0]; T1 = (in2)[1];       \
+            (in1)[eo1] = T0; (in1)[eo1^1] = T1; \
+            T0 = (in3)[0]; T1 = (in3)[1];       \
+            (in2)[eo2] = T0; (in2)[eo2^1] = T1; \
+            (in3)[eo3] = I0; (in3)[eo3^1] = I1
+        /* Exchange in0 with in1 and in2 with in3, also exchanging the two
+         * words inside each lane. */
+        #define SwapPI2( in0,in1,in2,in3 ) \
+            I0 = (in0)[0]; I1 = (in0)[1]; \
+            T0 = (in1)[0]; T1 = (in1)[1]; \
+            (in0)[1] = T0; (in0)[0] = T1; \
+            (in1)[1] = I0; (in1)[0] = I1; \
+            I0 = (in2)[0]; I1 = (in2)[1]; \
+            T0 = (in3)[0]; T1 = (in3)[1]; \
+            (in2)[1] = T0; (in2)[0] = T1; \
+            (in3)[1] = I0; (in3)[0] = I1
+        /* Exchange the two words of a single lane. */
+        #define SwapEO( even,odd ) T0 = even; even = odd; odd = T0
+
+        case 1:
+            SwapPI13( &Aga0, &Aka0, &Asa0, &Ama0, 1, 0, 1, 0 );
+            SwapPI13( &Abe0, &Age0, &Ame0, &Ake0, 0, 1, 0, 1 );
+            SwapPI13( &Abi0, &Aki0, &Agi0, &Asi0, 1, 0, 1, 0 );
+            SwapEO( Ami0, Ami1 );
+            SwapPI13( &Abo0, &Amo0, &Aso0, &Ago0, 1, 0, 1, 0 );
+            SwapEO( Ako0, Ako1 );
+            SwapPI13( &Abu0, &Asu0, &Aku0, &Amu0, 0, 1, 0, 1 );
+            break;        
+
+        case 2:
+            SwapPI2( &Aga0, &Asa0, &Aka0, &Ama0 );
+            SwapPI2( &Abe0, &Ame0, &Age0, &Ake0 );
+            SwapPI2( &Abi0, &Agi0, &Aki0, &Asi0 );
+            SwapPI2( &Abo0, &Aso0, &Ago0, &Amo0 );
+            SwapPI2( &Abu0, &Aku0, &Amu0, &Asu0 );
+            break;        
+
+        case 3:
+            SwapPI13( &Aga0, &Ama0, &Asa0, &Aka0, 0, 1, 0, 1 );
+            SwapPI13( &Abe0, &Ake0, &Ame0, &Age0, 1, 0, 1, 0 );
+            SwapPI13( &Abi0, &Asi0, &Agi0, &Aki0, 0, 1, 0, 1 );
+            SwapEO( Ami0, Ami1 );
+            SwapPI13( &Abo0, &Ago0, &Aso0, &Amo0, 0, 1, 0, 1 );
+            SwapEO( Ako0, Ako1 );
+            SwapPI13( &Abu0, &Amu0, &Aku0, &Asu0, 1, 0, 1, 0 );
+            break;        
+        #undef I0
+        #undef I1
+        #undef T0
+        #undef T1
+        #undef SwapPI13
+        #undef SwapPI2
+        #undef SwapEO
+    }
+
+    do
+    {
+        /* Code for 4 rounds, using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */
+        /* Every case deliberately falls through to the next one, so the first
+         * pass runs 4, 3, 2 or 1 rounds depending on the entry point. */
+        switch ( nRounds )
+        {
+            case 0: KeccakRound0(); /* fall through */
+            case 3: KeccakRound1(); /* fall through */
+            case 2: KeccakRound2(); /* fall through */
+            case 1: KeccakRound3();
+        }
+        /* All later passes run the full four-round sequence. */
+        nRounds = 0;
+    }
+    /* NOTE(review): relies on the constant table being followed by a 0xFF
+     * terminator word - the table is defined outside this view; confirm. */
+    while ( *pRoundConstants != 0xFF );
+
+    #undef Aba0
+    #undef Aba1
+    #undef Abe0
+    #undef Abe1
+    #undef Abi0
+    #undef Abi1
+    #undef Abo0
+    #undef Abo1
+    #undef Abu0
+    #undef Abu1
+    #undef Aga0
+    #undef Aga1
+    #undef Age0
+    #undef Age1
+    #undef Agi0
+    #undef Agi1
+    #undef Ago0
+    #undef Ago1
+    #undef Agu0
+    #undef Agu1
+    #undef Aka0
+    #undef Aka1
+    #undef Ake0
+    #undef Ake1
+    #undef Aki0
+    #undef Aki1
+    #undef Ako0
+    #undef Ako1
+    #undef Aku0
+    #undef Aku1
+    #undef Ama0
+    #undef Ama1
+    #undef Ame0
+    #undef Ame1
+    #undef Ami0
+    #undef Ami1
+    #undef Amo0
+    #undef Amo1
+    #undef Amu0
+    #undef Amu1
+    #undef Asa0
+    #undef Asa1
+    #undef Ase0
+    #undef Ase1
+    #undef Asi0
+    #undef Asi1
+    #undef Aso0
+    #undef Aso1
+    #undef Asu0
+    #undef Asu1
+}
+
+/* ---------------------------------------------------------------- */
+
+/* Fixed-count entry point: runs 12 rounds of the permutation on state by
+ * delegating to KeccakP1600_Permute_Nrounds(). */
+void KeccakP1600_Permute_12rounds(void *state)
+{
+    const unsigned int roundCount = 12;
+
+    KeccakP1600_Permute_Nrounds(state, roundCount);
+}
+
+/* ---------------------------------------------------------------- */
+
+/* Fixed-count entry point: runs 24 rounds of the permutation on state by
+ * delegating to KeccakP1600_Permute_Nrounds(). */
+void KeccakP1600_Permute_24rounds(void *state)
+{
+    const unsigned int roundCount = 24;
+
+    KeccakP1600_Permute_Nrounds(state, roundCount);
+}
diff --git a/ext/hash/sha3/generic32lc/KeccakSponge.c b/ext/hash/sha3/generic32lc/KeccakSponge.c
new file mode 100644 (file)
index 0000000..08d4a19
--- /dev/null
@@ -0,0 +1,110 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include "KeccakSponge.h"
+
+#ifdef KeccakReference
+    #include "displayIntermediateValues.h"
+#endif
+
+/*
+ * Sponge functions for width 200: KeccakSponge.inc is expanded with
+ * prefix=KeccakWidth200 and SnP=KeccakP200, using
+ * KeccakP200_Permute_18rounds as the permutation. The fast absorb loop is
+ * wired in only when KeccakF200_FastLoop_supported is defined. The whole
+ * section is skipped when KeccakP200_excluded is defined.
+ */
+#ifndef KeccakP200_excluded
+    #include "KeccakP-200-SnP.h"
+
+    #define prefix KeccakWidth200
+    #define SnP KeccakP200
+    #define SnP_width 200
+    #define SnP_Permute KeccakP200_Permute_18rounds
+    #if defined(KeccakF200_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakF200_FastLoop_Absorb
+    #endif
+        #include "KeccakSponge.inc"
+    /* Clear the template parameters so the next section can redefine them. */
+    #undef prefix
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
+
+/*
+ * Sponge functions for width 400: KeccakSponge.inc is expanded with
+ * prefix=KeccakWidth400 and SnP=KeccakP400, using
+ * KeccakP400_Permute_20rounds as the permutation. The fast absorb loop is
+ * wired in only when KeccakF400_FastLoop_supported is defined. The whole
+ * section is skipped when KeccakP400_excluded is defined.
+ */
+#ifndef KeccakP400_excluded
+    #include "KeccakP-400-SnP.h"
+
+    #define prefix KeccakWidth400
+    #define SnP KeccakP400
+    #define SnP_width 400
+    #define SnP_Permute KeccakP400_Permute_20rounds
+    #if defined(KeccakF400_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakF400_FastLoop_Absorb
+    #endif
+        #include "KeccakSponge.inc"
+    /* Clear the template parameters so the next section can redefine them. */
+    #undef prefix
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
+
+/*
+ * Sponge functions for width 800: KeccakSponge.inc is expanded with
+ * prefix=KeccakWidth800 and SnP=KeccakP800, using
+ * KeccakP800_Permute_22rounds as the permutation. The fast absorb loop is
+ * wired in only when KeccakF800_FastLoop_supported is defined. The whole
+ * section is skipped when KeccakP800_excluded is defined.
+ */
+#ifndef KeccakP800_excluded
+    #include "KeccakP-800-SnP.h"
+
+    #define prefix KeccakWidth800
+    #define SnP KeccakP800
+    #define SnP_width 800
+    #define SnP_Permute KeccakP800_Permute_22rounds
+    #if defined(KeccakF800_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakF800_FastLoop_Absorb
+    #endif
+        #include "KeccakSponge.inc"
+    /* Clear the template parameters so the next section can redefine them. */
+    #undef prefix
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
+
+/*
+ * Sponge functions for width 1600: KeccakSponge.inc is expanded with
+ * prefix=KeccakWidth1600 and SnP=KeccakP1600, using
+ * KeccakP1600_Permute_24rounds as the permutation. The fast absorb loop is
+ * wired in only when KeccakF1600_FastLoop_supported is defined. The whole
+ * section is skipped when KeccakP1600_excluded is defined.
+ */
+#ifndef KeccakP1600_excluded
+    #include "KeccakP-1600-SnP.h"
+
+    #define prefix KeccakWidth1600
+    #define SnP KeccakP1600
+    #define SnP_width 1600
+    #define SnP_Permute KeccakP1600_Permute_24rounds
+    #if defined(KeccakF1600_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakF1600_FastLoop_Absorb
+    #endif
+        #include "KeccakSponge.inc"
+    /* Clear the template parameters so the next section can redefine them. */
+    #undef prefix
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
+
+/*
+ * Second width-1600 instantiation: same SnP (KeccakP1600) and width as the
+ * section above, but with prefix=KeccakWidth1600_12rounds and
+ * KeccakP1600_Permute_12rounds as the permutation. The fast absorb loop is
+ * wired in only when KeccakP1600_12rounds_FastLoop_supported is defined. The
+ * whole section is skipped when KeccakP1600_excluded is defined.
+ */
+#ifndef KeccakP1600_excluded
+    #include "KeccakP-1600-SnP.h"
+
+    #define prefix KeccakWidth1600_12rounds
+    #define SnP KeccakP1600
+    #define SnP_width 1600
+    #define SnP_Permute KeccakP1600_Permute_12rounds
+    #if defined(KeccakP1600_12rounds_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakP1600_12rounds_FastLoop_Absorb
+    #endif
+        #include "KeccakSponge.inc"
+    /* Clear the template parameters after the last expansion. */
+    #undef prefix
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
diff --git a/ext/hash/sha3/generic32lc/KeccakSponge.h b/ext/hash/sha3/generic32lc/KeccakSponge.h
new file mode 100644 (file)
index 0000000..a8526fe
--- /dev/null
@@ -0,0 +1,178 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakSponge_h_
+#define _KeccakSponge_h_
+
+/** General information
+  *
+  * The following type and functions are not actually implemented. Their
+  * documentation is generic, with the prefix Prefix replaced by
+  * - KeccakWidth200 for a sponge function based on Keccak-f[200]
+  * - KeccakWidth400 for a sponge function based on Keccak-f[400]
+  * - KeccakWidth800 for a sponge function based on Keccak-f[800]
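+  * - KeccakWidth1600 for a sponge function based on Keccak-f[1600]
+  * - KeccakWidth1600_12rounds for a sponge function based on the 12-round
+  *   permutation KeccakP1600_Permute_12rounds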
+  *
+  * In all these functions, the rate and capacity must sum to the width of the
+  * chosen permutation. For instance, to use the sponge function
+  * Keccak[r=1344, c=256], one must use KeccakWidth1600_Sponge() or a combination
+  * of KeccakWidth1600_SpongeInitialize(), KeccakWidth1600_SpongeAbsorb(),
+  * KeccakWidth1600_SpongeAbsorbLastFewBits() and
+  * KeccakWidth1600_SpongeSqueeze().
+  *
+  * The Prefix_SpongeInstance contains the sponge instance attributes for use
+  * with the Prefix_Sponge* functions.
+  * It gathers the state processed by the permutation as well as the rate,
+  * the position of input/output bytes in the state and the phase
+  * (absorbing or squeezing).
+  */
+
+#ifdef DontReallyInclude_DocumentationOnly
+/** Function to evaluate the sponge function Keccak[r, c] in a single call.
+  * @param  rate        The value of the rate r.
+  * @param  capacity    The value of the capacity c.
+  * @param  input           Pointer to the input message (before the suffix).
+  * @param  inputByteLen    The length of the input message in bytes.
+  * @param  suffix          Byte containing from 0 to 7 suffix bits
+  *                         that must be absorbed after @a input.
+  *                         These <i>n</i> bits must be in the least significant bit positions.
+  *                         These bits must be delimited with a bit 1 at position <i>n</i>
+  *                         (counting from 0=LSB to 7=MSB) and followed by bits 0
+  *                         from position <i>n</i>+1 to position 7.
+  *                         Some examples:
+  *                             - If no bits are to be absorbed, then @a suffix must be 0x01.
+  *                             - If the 2-bit sequence 0,0 is to be absorbed, @a suffix must be 0x04.
+  *                             - If the 5-bit sequence 0,1,0,0,1 is to be absorbed, @a suffix must be 0x32.
+  *                             - If the 7-bit sequence 1,1,0,1,0,0,0 is to be absorbed, @a suffix must be 0x8B.
+  *                         .
+  * @param  output          Pointer to the output buffer.
+  * @param  outputByteLen   The desired number of output bytes.
+  * @pre    One must have r+c equal to the supported width of this implementation
+  *         and the rate a multiple of 8 bits (one byte) in this implementation.
+  * @pre    @a suffix ≠ 0x00
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen);
+
+/**
+  * Function to initialize the state of the Keccak[r, c] sponge function.
+  * The phase of the sponge function is set to absorbing.
+  * @param  spongeInstance  Pointer to the sponge instance to be initialized.
+  * @param  rate        The value of the rate r.
+  * @param  capacity    The value of the capacity c.
+  * @pre    One must have r+c equal to the supported width of this implementation
+  *         and the rate a multiple of 8 bits (one byte) in this implementation.
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_SpongeInitialize(Prefix_SpongeInstance *spongeInstance, unsigned int rate, unsigned int capacity);
+
+/**
+  * Function to give input data bytes for the sponge function to absorb.
+  * @param  spongeInstance  Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
+  * @param  data        Pointer to the input data.
+  * @param  dataByteLen  The number of input bytes provided in the input data.
+  * @pre    The sponge function must be in the absorbing phase,
+  *         i.e., Prefix_SpongeSqueeze() or Prefix_SpongeAbsorbLastFewBits()
+  *         must not have been called before.
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_SpongeAbsorb(Prefix_SpongeInstance *spongeInstance, const unsigned char *data, size_t dataByteLen);
+
+/**
+  * Function to give input data bits for the sponge function to absorb
+  * and then to switch to the squeezing phase.
+  * @param  spongeInstance  Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
+  * @param  delimitedData   Byte containing from 0 to 7 trailing bits
+  *                     that must be absorbed.
+  *                     These <i>n</i> bits must be in the least significant bit positions.
+  *                     These bits must be delimited with a bit 1 at position <i>n</i>
+  *                     (counting from 0=LSB to 7=MSB) and followed by bits 0
+  *                     from position <i>n</i>+1 to position 7.
+  *                     Some examples:
+  *                         - If no bits are to be absorbed, then @a delimitedData must be 0x01.
+  *                         - If the 2-bit sequence 0,0 is to be absorbed, @a delimitedData must be 0x04.
+  *                         - If the 5-bit sequence 0,1,0,0,1 is to be absorbed, @a delimitedData must be 0x32.
+  *                         - If the 7-bit sequence 1,1,0,1,0,0,0 is to be absorbed, @a delimitedData must be 0x8B.
+  *                     .
+  * @pre    The sponge function must be in the absorbing phase,
+  *         i.e., Prefix_SpongeSqueeze() or Prefix_SpongeAbsorbLastFewBits()
+  *         must not have been called before.
+  * @pre    @a delimitedData ≠ 0x00
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_SpongeAbsorbLastFewBits(Prefix_SpongeInstance *spongeInstance, unsigned char delimitedData);
+
+/**
+  * Function to squeeze output data from the sponge function.
+  * If the sponge function was in the absorbing phase, this function
+  * switches it to the squeezing phase
+  * as if Prefix_SpongeAbsorbLastFewBits(spongeInstance, 0x01) was called.
+  * @param  spongeInstance  Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
+  * @param  data        Pointer to the buffer where to store the output data.
+  * @param  dataByteLen The number of output bytes desired.
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_SpongeSqueeze(Prefix_SpongeInstance *spongeInstance, unsigned char *data, size_t dataByteLen);
+#endif
+
+#include <string.h>
+#include "align.h"
+
+/*
+ * Declare the sponge instance type prefix##_SpongeInstance.
+ *   prefix     Name prefix of the generated struct and typedef.
+ *   size       Size in bytes of the state array.
+ *   alignment  Value handed to ALIGN() (from align.h) for the type.
+ * Fields, as used by the functions in KeccakSponge.inc:
+ *   state        bytes processed by the permutation;
+ *   rate         rate as given to prefix##_SpongeInitialize(), which requires
+ *                a multiple of 8 (the functions divide it by 8 to get bytes);
+ *   byteIOIndex  byte position of the next input within the current block;
+ *   squeezing    non-zero once absorbing is over; set to 0 on initialization.
+ */
+#define KCP_DeclareSpongeStructure(prefix, size, alignment) \
+    ALIGN(alignment) typedef struct prefix##_SpongeInstanceStruct { \
+        unsigned char state[size]; \
+        unsigned int rate; \
+        unsigned int byteIOIndex; \
+        int squeezing; \
+    } prefix##_SpongeInstance;
+
+/*
+ * Declare the five sponge functions for one prefix: the one-call
+ * prefix##_Sponge() and the incremental prefix##_SpongeInitialize(),
+ * prefix##_SpongeAbsorb(), prefix##_SpongeAbsorbLastFewBits() and
+ * prefix##_SpongeSqueeze(). Requires prefix##_SpongeInstance to be declared
+ * first (see KCP_DeclareSpongeStructure).
+ */
+#define KCP_DeclareSpongeFunctions(prefix) \
+    int prefix##_Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen); \
+    int prefix##_SpongeInitialize(prefix##_SpongeInstance *spongeInstance, unsigned int rate, unsigned int capacity); \
+    int prefix##_SpongeAbsorb(prefix##_SpongeInstance *spongeInstance, const unsigned char *data, size_t dataByteLen); \
+    int prefix##_SpongeAbsorbLastFewBits(prefix##_SpongeInstance *spongeInstance, unsigned char delimitedData); \
+    int prefix##_SpongeSqueeze(prefix##_SpongeInstance *spongeInstance, unsigned char *data, size_t dataByteLen);
+
+/*
+ * One section per prefix: each declares the instance type and the function
+ * prototypes, sized and aligned from the matching SnP header, and is skipped
+ * when the corresponding Keccak*_excluded macro is defined.
+ */
+#ifndef KeccakP200_excluded
+    #include "KeccakP-200-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth200, KeccakP200_stateSizeInBytes, KeccakP200_stateAlignment)
+    KCP_DeclareSpongeFunctions(KeccakWidth200)
+#endif
+
+#ifndef KeccakP400_excluded
+    #include "KeccakP-400-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth400, KeccakP400_stateSizeInBytes, KeccakP400_stateAlignment)
+    KCP_DeclareSpongeFunctions(KeccakWidth400)
+#endif
+
+#ifndef KeccakP800_excluded
+    #include "KeccakP-800-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth800, KeccakP800_stateSizeInBytes, KeccakP800_stateAlignment)
+    KCP_DeclareSpongeFunctions(KeccakWidth800)
+#endif
+
+#ifndef KeccakP1600_excluded
+    #include "KeccakP-1600-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth1600, KeccakP1600_stateSizeInBytes, KeccakP1600_stateAlignment)
+    KCP_DeclareSpongeFunctions(KeccakWidth1600)
+#endif
+
+/* Same state size and alignment as KeccakWidth1600, under a separate prefix. */
+#ifndef KeccakP1600_excluded
+    #include "KeccakP-1600-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth1600_12rounds, KeccakP1600_stateSizeInBytes, KeccakP1600_stateAlignment)
+    KCP_DeclareSpongeFunctions(KeccakWidth1600_12rounds)
+#endif
+
+#endif
diff --git a/ext/hash/sha3/generic32lc/KeccakSponge.inc b/ext/hash/sha3/generic32lc/KeccakSponge.inc
new file mode 100644 (file)
index 0000000..42a15aa
--- /dev/null
@@ -0,0 +1,313 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+/*
+ * Token pasting in two steps: JOIN expands its arguments first, so that the
+ * including file's `prefix` and `SnP` macros are replaced by their values
+ * before JOIN0 glues the tokens together.
+ */
+#define JOIN0(a, b)                     a ## b
+#define JOIN(a, b)                      JOIN0(a, b)
+
+/* Public names generated for this expansion, e.g. <prefix>_SpongeAbsorb. */
+#define Sponge                          JOIN(prefix, _Sponge)
+#define SpongeInstance                  JOIN(prefix, _SpongeInstance)
+#define SpongeInitialize                JOIN(prefix, _SpongeInitialize)
+#define SpongeAbsorb                    JOIN(prefix, _SpongeAbsorb)
+#define SpongeAbsorbLastFewBits         JOIN(prefix, _SpongeAbsorbLastFewBits)
+#define SpongeSqueeze                   JOIN(prefix, _SpongeSqueeze)
+
+/* Names of the underlying state-and-permutation (SnP) primitives, e.g. <SnP>_AddBytes. */
+#define SnP_stateSizeInBytes            JOIN(SnP, _stateSizeInBytes)
+#define SnP_stateAlignment              JOIN(SnP, _stateAlignment)
+#define SnP_StaticInitialize            JOIN(SnP, _StaticInitialize)
+#define SnP_Initialize                  JOIN(SnP, _Initialize)
+#define SnP_AddByte                     JOIN(SnP, _AddByte)
+#define SnP_AddBytes                    JOIN(SnP, _AddBytes)
+#define SnP_ExtractBytes                JOIN(SnP, _ExtractBytes)
+
+/*
+ * One-call sponge: absorb inputByteLen bytes of input followed by the
+ * delimited suffix bits, pad, then squeeze outputByteLen bytes into output.
+ * Works on a local state; no instance is needed.
+ * Returns 0 on success, or 1 when rate+capacity differs from SnP_width, when
+ * rate is zero, larger than SnP_width or not a multiple of 8, or when suffix
+ * is 0.
+ */
+int Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen)
+{
+    ALIGN(SnP_stateAlignment) unsigned char spongeState[SnP_stateSizeInBytes];
+    const unsigned int blockBytes = rate/8;
+    const unsigned char *src = input;
+    unsigned char *dst = output;
+    unsigned int tailLen;
+
+    /* Parameter validation: every failure is reported the same way */
+    if ((rate+capacity != SnP_width)
+        || (rate == 0) || (rate > SnP_width) || ((rate % 8) != 0)
+        || (suffix == 0))
+        return 1;
+
+    /* Start from a freshly initialized state */
+    SnP_StaticInitialize();
+    SnP_Initialize(spongeState);
+
+    /* Absorbing, part 1: all complete blocks */
+#ifdef SnP_FastLoop_Absorb
+    if ((inputByteLen >= blockBytes) && ((blockBytes % (SnP_width/200)) == 0)) {
+        /* the rate is a whole number of lanes: let the fast loop take them */
+        size_t absorbed = SnP_FastLoop_Absorb(spongeState, blockBytes/(SnP_width/200), src, inputByteLen);
+        src += absorbed;
+        inputByteLen -= absorbed;
+    }
+#endif
+    for ( ; inputByteLen >= (size_t)blockBytes; inputByteLen -= blockBytes, src += blockBytes) {
+        #ifdef KeccakReference
+        displayBytes(1, "Block to be absorbed", src, blockBytes);
+        #endif
+        SnP_AddBytes(spongeState, src, 0, blockBytes);
+        SnP_Permute(spongeState);
+    }
+
+    /* Absorbing, part 2: the incomplete trailing block (shorter than a block here) */
+    tailLen = (unsigned int)inputByteLen;
+    #ifdef KeccakReference
+    displayBytes(1, "Block to be absorbed (part)", src, tailLen);
+    #endif
+    SnP_AddBytes(spongeState, src, 0, tailLen);
+
+    /* Absorbing, part 3: suffix bits; their delimiter doubles as the first padding bit */
+    #ifdef KeccakReference
+    displayBytes(1, "Block to be absorbed (last few bits + first bit of padding)", &suffix, 1);
+    #endif
+    SnP_AddByte(spongeState, suffix, tailLen);
+    /* When that first padding bit lands on the very last bit of the block,
+       the second padding bit has to go into a block of its own */
+    if ((tailLen == (blockBytes-1)) && (suffix >= 0x80))
+        SnP_Permute(spongeState);
+    /* Closing padding bit, always in the last byte of the block */
+    SnP_AddByte(spongeState, 0x80, blockBytes-1);
+    #ifdef KeccakReference
+    {
+        unsigned char block[SnP_width/8];
+        memset(block, 0, sizeof(block));
+        block[blockBytes-1] = 0x80;
+        displayBytes(1, "Second bit of padding", block, blockBytes);
+    }
+    #endif
+    SnP_Permute(spongeState);
+    #ifdef KeccakReference
+    displayText(1, "--- Switching to squeezing phase ---");
+    #endif
+
+    /* Squeezing, part 1: full blocks, each followed by a permutation */
+    for ( ; outputByteLen > (size_t)blockBytes; outputByteLen -= blockBytes, dst += blockBytes) {
+        SnP_ExtractBytes(spongeState, dst, 0, blockBytes);
+        SnP_Permute(spongeState);
+        #ifdef KeccakReference
+        displayBytes(1, "Squeezed block", dst, blockBytes);
+        #endif
+    }
+
+    /* Squeezing, part 2: whatever is left (at most one block) */
+    tailLen = (unsigned int)outputByteLen;
+    SnP_ExtractBytes(spongeState, dst, 0, tailLen);
+    #ifdef KeccakReference
+    displayBytes(1, "Squeezed block (part)", dst, tailLen);
+    #endif
+
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+/* ---------------------------------------------------------------- */
+/* ---------------------------------------------------------------- */
+
+/*
+ * Prepare a sponge instance for absorbing: the state is initialized, the rate
+ * is recorded, and the I/O index and squeezing flag are reset to 0.
+ * Returns 0 on success, or 1 (leaving the instance untouched) when
+ * rate+capacity differs from SnP_width or when rate is zero, larger than
+ * SnP_width or not a multiple of 8.
+ */
+int SpongeInitialize(SpongeInstance *instance, unsigned int rate, unsigned int capacity)
+{
+    const int widthMismatch = (rate+capacity != SnP_width);
+    const int badRate = (rate == 0) || (rate > SnP_width) || ((rate % 8) != 0);
+
+    if (widthMismatch || badRate)
+        return 1;
+
+    SnP_StaticInitialize();
+    SnP_Initialize(instance->state);
+    instance->squeezing = 0;
+    instance->byteIOIndex = 0;
+    instance->rate = rate;
+
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Absorb dataByteLen bytes from data into the sponge instance.
+ * Whole blocks are absorbed directly when the instance sits at a block
+ * boundary; otherwise bytes are added at byteIOIndex until the current block
+ * is complete, at which point the permutation is applied.
+ * Returns 0 on success, or 1 if the instance is already squeezing.
+ *
+ * All length comparisons are done in size_t before anything is narrowed to
+ * unsigned int. Narrowing first (as this code originally did) lets a remaining
+ * length of 2^32 bytes or more truncate, which either bypasses the clamp to
+ * the block size and writes past the state, or yields 0 and never terminates.
+ */
+int SpongeAbsorb(SpongeInstance *instance, const unsigned char *data, size_t dataByteLen)
+{
+    size_t i, j;
+    unsigned int partialBlock;
+    const unsigned char *curData;
+    unsigned int rateInBytes = instance->rate/8;
+
+    if (instance->squeezing)
+        return 1; /* Too late for additional input */
+
+    i = 0;
+    curData = data;
+    while(i < dataByteLen) {
+        /* i < dataByteLen here, so dataByteLen-i cannot wrap (i+rateInBytes could) */
+        if ((instance->byteIOIndex == 0) && (dataByteLen-i >= rateInBytes)) {
+#ifdef SnP_FastLoop_Absorb
+            /* processing full blocks first */
+            if ((rateInBytes % (SnP_width/200)) == 0) {
+                /* fast lane: whole lane rate */
+                j = SnP_FastLoop_Absorb(instance->state, rateInBytes/(SnP_width/200), curData, dataByteLen - i);
+                i += j;
+                curData += j;
+            }
+            else {
+#endif
+                for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, rateInBytes);
+                    #endif
+                    SnP_AddBytes(instance->state, curData, 0, rateInBytes);
+                    SnP_Permute(instance->state);
+                    curData+=rateInBytes;
+                }
+                i = dataByteLen - j;
+#ifdef SnP_FastLoop_Absorb
+            }
+#endif
+        }
+        else {
+            /* normal lane: using the message queue */
+            /* byteIOIndex stays below rateInBytes while absorbing (it is reset
+               as soon as it reaches it), so the subtraction cannot wrap */
+            if (dataByteLen-i > rateInBytes-instance->byteIOIndex)
+                partialBlock = rateInBytes-instance->byteIOIndex;
+            else
+                partialBlock = (unsigned int)(dataByteLen - i); /* fits: bounded by the block size */
+            #ifdef KeccakReference
+            displayBytes(1, "Block to be absorbed (part)", curData, partialBlock);
+            #endif
+            i += partialBlock;
+
+            SnP_AddBytes(instance->state, curData, instance->byteIOIndex, partialBlock);
+            curData += partialBlock;
+            instance->byteIOIndex += partialBlock;
+            if (instance->byteIOIndex == rateInBytes) {
+                SnP_Permute(instance->state);
+                instance->byteIOIndex = 0;
+            }
+        }
+    }
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Absorb the final delimited byte, apply the second padding bit and switch
+ * the instance to the squeezing phase.
+ *
+ * delimitedData must be non-zero; its bits are added to the state at
+ * instance->byteIOIndex.
+ *
+ * Returns 1 if delimitedData is 0 or the instance is already squeezing,
+ * 0 otherwise. On success byteIOIndex is reset to 0 and squeezing is set to 1.
+ */
+int SpongeAbsorbLastFewBits(SpongeInstance *instance, unsigned char delimitedData)
+{
+    unsigned int rateInBytes = instance->rate/8;
+
+    if (delimitedData == 0)
+        return 1;
+    if (instance->squeezing)
+        return 1; /* Too late for additional input */
+
+    #ifdef KeccakReference
+    {
+        unsigned char delimitedData1[1];
+        delimitedData1[0] = delimitedData;
+        displayBytes(1, "Block to be absorbed (last few bits + first bit of padding)", delimitedData1, 1);
+    }
+    #endif
+    /* Last few bits, whose delimiter coincides with first bit of padding */
+    SnP_AddByte(instance->state, delimitedData, instance->byteIOIndex);
+    /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */
+    /* (delimitedData >= 0x80 means bit 7 of the byte is set) */
+    if ((delimitedData >= 0x80) && (instance->byteIOIndex == (rateInBytes-1)))
+        SnP_Permute(instance->state);
+    /* Second bit of padding */
+    SnP_AddByte(instance->state, 0x80, rateInBytes-1);
+    #ifdef KeccakReference
+    {
+        unsigned char block[SnP_width/8];
+        memset(block, 0, SnP_width/8);
+        block[rateInBytes-1] = 0x80;
+        displayBytes(1, "Second bit of padding", block, rateInBytes);
+    }
+    #endif
+    SnP_Permute(instance->state);
+    /* Restart the byte index at 0 and mark the instance as squeezing */
+    instance->byteIOIndex = 0;
+    instance->squeezing = 1;
+    #ifdef KeccakReference
+    displayText(1, "--- Switching to squeezing phase ---");
+    #endif
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Squeeze dataByteLen bytes of output from the sponge into data.
+ *
+ * If the instance is still absorbing, the padding is applied first by
+ * calling SpongeAbsorbLastFewBits() with 0x01. Whole rate-sized blocks are
+ * extracted straight after a permutation when the current block has been
+ * fully consumed; otherwise bytes are taken from the current block starting
+ * at instance->byteIOIndex.
+ *
+ * Always returns 0.
+ *
+ * All length comparisons are done in size_t before anything is narrowed to
+ * unsigned int, so requests of 4 GiB or more cannot truncate or wrap the
+ * partial block length.
+ */
+int SpongeSqueeze(SpongeInstance *instance, unsigned char *data, size_t dataByteLen)
+{
+    size_t i, j;
+    unsigned int partialBlock;
+    unsigned int rateInBytes = instance->rate/8;
+    unsigned char *curData;
+
+    if (!instance->squeezing)
+        SpongeAbsorbLastFewBits(instance, 0x01);
+
+    i = 0;
+    curData = data;
+    while(i < dataByteLen) {
+        /* i < dataByteLen holds here, so dataByteLen-i cannot wrap (i+rateInBytes could) */
+        if ((instance->byteIOIndex == rateInBytes) && (dataByteLen-i >= rateInBytes)) {
+            for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) {
+                SnP_Permute(instance->state);
+                SnP_ExtractBytes(instance->state, curData, 0, rateInBytes);
+                #ifdef KeccakReference
+                displayBytes(1, "Squeezed block", curData, rateInBytes);
+                #endif
+                curData+=rateInBytes;
+            }
+            i = dataByteLen - j;
+        }
+        else {
+            /* normal lane: using the message queue */
+            if (instance->byteIOIndex == rateInBytes) {
+                SnP_Permute(instance->state);
+                instance->byteIOIndex = 0;
+            }
+            /* Clamp to what is left of the current block while still in size_t;
+               the cast below is then only applied to a value <= rateInBytes. */
+            if (dataByteLen-i > rateInBytes-instance->byteIOIndex)
+                partialBlock = rateInBytes-instance->byteIOIndex;
+            else
+                partialBlock = (unsigned int)(dataByteLen - i);
+            i += partialBlock;
+
+            SnP_ExtractBytes(instance->state, curData, instance->byteIOIndex, partialBlock);
+            #ifdef KeccakReference
+            displayBytes(1, "Squeezed block (part)", curData, partialBlock);
+            #endif
+            curData += partialBlock;
+            instance->byteIOIndex += partialBlock;
+        }
+    }
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+
+/* Drop the name-mapping macros used by the code above.
+   NOTE(review): presumably so this file can be included again with a
+   different prefix/width - confirm against the including source file. */
+#undef Sponge
+#undef SpongeInstance
+#undef SpongeInitialize
+#undef SpongeAbsorb
+#undef SpongeAbsorbLastFewBits
+#undef SpongeSqueeze
+#undef SnP_stateSizeInBytes
+#undef SnP_stateAlignment
+#undef SnP_StaticInitialize
+#undef SnP_Initialize
+#undef SnP_AddByte
+#undef SnP_AddBytes
+#undef SnP_ExtractBytes
diff --git a/ext/hash/sha3/generic32lc/SnP-Relaned.h b/ext/hash/sha3/generic32lc/SnP-Relaned.h
new file mode 100644 (file)
index 0000000..086e635
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _SnP_Relaned_h_
+#define _SnP_Relaned_h_
+
+/*
+ * SnP_AddBytes: apply `length` bytes of `data` to `state` starting at byte
+ * `offset`, built from two caller-supplied lane primitives.
+ *
+ * - offset == 0: SnP_AddLanes handles the length/SnP_laneLengthInBytes whole
+ *   lanes, then SnP_AddBytesInLane handles the remaining
+ *   length%SnP_laneLengthInBytes bytes in the following lane.
+ * - offset != 0: walks lane by lane from lane offset/SnP_laneLengthInBytes,
+ *   starting at in-lane offset offset%SnP_laneLengthInBytes, calling
+ *   SnP_AddBytesInLane for each piece.
+ *
+ * Expands to a brace-enclosed block (not an expression); the arguments are
+ * evaluated more than once, and `length` is narrowed to unsigned int in the
+ * offset != 0 path.
+ */
+#define SnP_AddBytes(state, data, offset, length, SnP_AddLanes, SnP_AddBytesInLane, SnP_laneLengthInBytes) \
+    { \
+        if ((offset) == 0) { \
+            SnP_AddLanes(state, data, (length)/SnP_laneLengthInBytes); \
+            SnP_AddBytesInLane(state, \
+                (length)/SnP_laneLengthInBytes, \
+                (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                0, \
+                (length)%SnP_laneLengthInBytes); \
+        } \
+        else { \
+            unsigned int _sizeLeft = (length); \
+            unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+            unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+            const unsigned char *_curData = (data); \
+            while(_sizeLeft > 0) { \
+                unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+                if (_bytesInLane > _sizeLeft) \
+                    _bytesInLane = _sizeLeft; \
+                SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
+                _sizeLeft -= _bytesInLane; \
+                _lanePosition++; \
+                _offsetInLane = 0; \
+                _curData += _bytesInLane; \
+            } \
+        } \
+    }
+
+/*
+ * SnP_OverwriteBytes: same lane-splitting scheme as the other macros in this
+ * header, but dispatching to caller-supplied "overwrite" primitives.
+ *
+ * - offset == 0: SnP_OverwriteLanes handles the whole lanes, then
+ *   SnP_OverwriteBytesInLane handles the trailing
+ *   length%SnP_laneLengthInBytes bytes.
+ * - offset != 0: SnP_OverwriteBytesInLane is called lane by lane, the first
+ *   call starting at in-lane offset offset%SnP_laneLengthInBytes.
+ *
+ * Expands to a brace-enclosed block; the arguments are evaluated more than
+ * once, and `length` is narrowed to unsigned int in the offset != 0 path.
+ */
+#define SnP_OverwriteBytes(state, data, offset, length, SnP_OverwriteLanes, SnP_OverwriteBytesInLane, SnP_laneLengthInBytes) \
+    { \
+        if ((offset) == 0) { \
+            SnP_OverwriteLanes(state, data, (length)/SnP_laneLengthInBytes); \
+            SnP_OverwriteBytesInLane(state, \
+                (length)/SnP_laneLengthInBytes, \
+                (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                0, \
+                (length)%SnP_laneLengthInBytes); \
+        } \
+        else { \
+            unsigned int _sizeLeft = (length); \
+            unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+            unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+            const unsigned char *_curData = (data); \
+            while(_sizeLeft > 0) { \
+                unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+                if (_bytesInLane > _sizeLeft) \
+                    _bytesInLane = _sizeLeft; \
+                SnP_OverwriteBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
+                _sizeLeft -= _bytesInLane; \
+                _lanePosition++; \
+                _offsetInLane = 0; \
+                _curData += _bytesInLane; \
+            } \
+        } \
+    }
+
+/*
+ * SnP_ExtractBytes: read `length` bytes of `state` starting at byte `offset`
+ * into the writable buffer `data`, using caller-supplied lane primitives.
+ *
+ * - offset == 0: SnP_ExtractLanes handles the whole lanes, then
+ *   SnP_ExtractBytesInLane handles the trailing
+ *   length%SnP_laneLengthInBytes bytes.
+ * - offset != 0: SnP_ExtractBytesInLane is called lane by lane, the first
+ *   call starting at in-lane offset offset%SnP_laneLengthInBytes.
+ *
+ * Expands to a brace-enclosed block; the arguments are evaluated more than
+ * once, and `length` is narrowed to unsigned int in the offset != 0 path.
+ */
+#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \
+    { \
+        if ((offset) == 0) { \
+            SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \
+            SnP_ExtractBytesInLane(state, \
+                (length)/SnP_laneLengthInBytes, \
+                (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                0, \
+                (length)%SnP_laneLengthInBytes); \
+        } \
+        else { \
+            unsigned int _sizeLeft = (length); \
+            unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+            unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+            unsigned char *_curData = (data); \
+            while(_sizeLeft > 0) { \
+                unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+                if (_bytesInLane > _sizeLeft) \
+                    _bytesInLane = _sizeLeft; \
+                SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
+                _sizeLeft -= _bytesInLane; \
+                _lanePosition++; \
+                _offsetInLane = 0; \
+                _curData += _bytesInLane; \
+            } \
+        } \
+    }
+
+/*
+ * SnP_ExtractAndAddBytes: two-buffer variant of the lane-splitting scheme.
+ * `input` is read-only, `output` is written; both advance in step and are
+ * passed together to the caller-supplied lane primitives.
+ *
+ * - offset == 0: SnP_ExtractAndAddLanes handles the whole lanes, then
+ *   SnP_ExtractAndAddBytesInLane handles the trailing
+ *   length%SnP_laneLengthInBytes bytes.
+ * - offset != 0: SnP_ExtractAndAddBytesInLane is called lane by lane, the
+ *   first call starting at in-lane offset offset%SnP_laneLengthInBytes.
+ *
+ * Expands to a brace-enclosed block; the arguments are evaluated more than
+ * once, and `length` is narrowed to unsigned int in the offset != 0 path.
+ */
+#define SnP_ExtractAndAddBytes(state, input, output, offset, length, SnP_ExtractAndAddLanes, SnP_ExtractAndAddBytesInLane, SnP_laneLengthInBytes) \
+    { \
+        if ((offset) == 0) { \
+            SnP_ExtractAndAddLanes(state, input, output, (length)/SnP_laneLengthInBytes); \
+            SnP_ExtractAndAddBytesInLane(state, \
+                (length)/SnP_laneLengthInBytes, \
+                (input)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                (output)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                0, \
+                (length)%SnP_laneLengthInBytes); \
+        } \
+        else { \
+            unsigned int _sizeLeft = (length); \
+            unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+            unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+            const unsigned char *_curInput = (input); \
+            unsigned char *_curOutput = (output); \
+            while(_sizeLeft > 0) { \
+                unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+                if (_bytesInLane > _sizeLeft) \
+                    _bytesInLane = _sizeLeft; \
+                SnP_ExtractAndAddBytesInLane(state, _lanePosition, _curInput, _curOutput, _offsetInLane, _bytesInLane); \
+                _sizeLeft -= _bytesInLane; \
+                _lanePosition++; \
+                _offsetInLane = 0; \
+                _curInput += _bytesInLane; \
+                _curOutput += _bytesInLane; \
+            } \
+        } \
+    }
+
+#endif
diff --git a/ext/hash/sha3/generic32lc/align.h b/ext/hash/sha3/generic32lc/align.h
new file mode 100644 (file)
index 0000000..e29771e
--- /dev/null
@@ -0,0 +1,34 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _align_h_
+#define _align_h_
+
+/* On Mac OS X and possibly others, ALIGN(x) is already defined in param.h, and
+   -Werror chokes on the redefinition, so drop any existing definition first. */
+#ifdef ALIGN
+#undef ALIGN
+#endif
+
+/* ALIGN(x): compiler-specific alignment specifier requesting x-byte alignment.
+   Expands to nothing on compilers not listed here, in which case no alignment
+   is requested at all. */
+#if defined(__GNUC__)
+#define ALIGN(x) __attribute__ ((aligned(x)))
+#elif defined(_MSC_VER)
+#define ALIGN(x) __declspec(align(x))
+#elif defined(__ARMCC_VERSION)
+#define ALIGN(x) __align(x)
+#else
+#define ALIGN(x)
+#endif
+
+#endif
diff --git a/ext/hash/sha3/generic32lc/brg_endian.h b/ext/hash/sha3/generic32lc/brg_endian.h
new file mode 100644 (file)
index 0000000..7226eb3
--- /dev/null
@@ -0,0 +1,142 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The redistribution and use of this software (with or without changes)
+ is allowed without the payment of fees or royalties provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+ Changes for ARM 9/9/2010
+*/
+
+#ifndef _BRG_ENDIAN_H
+#define _BRG_ENDIAN_H
+
+/* The two values PLATFORM_BYTE_ORDER can be set to by the detection below. */
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+
+/* The system header includes in this section are compiled out by "#if 0", so
+   the detection below only sees endian macros that are already defined at the
+   point where this header is included. */
+#if 0
+/* Include files where endian defines and byteswap functions may reside */
+#if defined( __sun )
+#  include <sys/isa_defs.h>
+#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+#  include <sys/endian.h>
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+#  include <machine/endian.h>
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#  if !defined( __MINGW32__ ) && !defined( _AIX )
+#    include <endian.h>
+#    if !defined( __BEOS__ )
+#      include <byteswap.h>
+#    endif
+#  endif
+#endif
+#endif
+
+/* Now attempt to set the define for platform byte order using any  */
+/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
+/* seem to encompass most endian symbol definitions                 */
+/* In each form: if both the BIG and LITTLE symbols exist, the      */
+/* matching BYTE_ORDER symbol decides; if only one exists, that one */
+/* decides. The four groups are evaluated independently.            */
+
+/* Form 1: BIG_ENDIAN / LITTLE_ENDIAN / BYTE_ORDER */
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN )
+#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/* Form 2: _BIG_ENDIAN / _LITTLE_ENDIAN / _BYTE_ORDER */
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN )
+#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( _BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/* Form 3: __BIG_ENDIAN / __LITTLE_ENDIAN / __BYTE_ORDER */
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN )
+#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/* Form 4: __BIG_ENDIAN__ / __LITTLE_ENDIAN__ / __BYTE_ORDER__ */
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ )
+#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif
+#elif defined( __BIG_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/*  if the platform byte order could not be determined, then try to */
+/*  set this define using common machine defines                    */
+#if !defined(PLATFORM_BYTE_ORDER)
+
+/* Machine/compiler macros treated as little endian */
+#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
+      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
+      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
+      defined( vax )       || defined( vms )     || defined( VMS )        || \
+      defined( __VMS )     || defined( _M_X64 )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+/* Machine/compiler macros treated as big endian */
+#elif defined( AMIGA )   || defined( applec )    || defined( __AS400__ )  || \
+      defined( _CRAY )   || defined( __hppa )    || defined( __hp9000 )   || \
+      defined( ibm370 )  || defined( mc68000 )   || defined( m68k )       || \
+      defined( __MRC__ ) || defined( __MVS__ )   || defined( __MWERKS__ ) || \
+      defined( sparc )   || defined( __sparc)    || defined( SYMANTEC_C ) || \
+      defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM )   || \
+      defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+/* ARM: big endian only when __BIG_ENDIAN is defined, little endian otherwise */
+#elif defined(__arm__)
+# ifdef __BIG_ENDIAN
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# else
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+/* Because of the "#elif 1" below, any platform not recognised above is
+   assumed to be little endian; the big endian and #error branches that
+   follow it can never be selected as written. */
+#elif 1     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0     /* **** EDIT HERE IF NECESSARY **** */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#  error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order
+#endif
+
+#endif
+
+#endif
diff --git a/ext/hash/sha3/generic64lc/KeccakHash.c b/ext/hash/sha3/generic64lc/KeccakHash.c
new file mode 100644 (file)
index 0000000..259831b
--- /dev/null
@@ -0,0 +1,80 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include "KeccakHash.h"
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Set up a hash instance: initialise the underlying sponge with the given
+ * rate and capacity, then record the output length and the delimited suffix.
+ * Returns FAIL for a zero suffix, otherwise the (cast) sponge initialisation
+ * result; the instance fields are only written when that result is SUCCESS.
+ */
+HashReturn Keccak_HashInitialize(Keccak_HashInstance *instance, unsigned int rate, unsigned int capacity, unsigned int hashbitlen, unsigned char delimitedSuffix)
+{
+    HashReturn status;
+
+    if (delimitedSuffix == 0) {
+        return FAIL;
+    }
+
+    status = (HashReturn)KeccakWidth1600_SpongeInitialize(&instance->sponge, rate, capacity);
+    if (status == SUCCESS) {
+        instance->fixedOutputLength = hashbitlen;
+        instance->delimitedSuffix = delimitedSuffix;
+    }
+    return status;
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Feed databitlen bits of input into the hash.
+ *
+ * The databitlen/8 whole bytes are absorbed directly. If there are trailing
+ * bits (databitlen not a multiple of 8), they are merged with the stored
+ * delimited suffix: when the merged value still fits in one byte it simply
+ * replaces the suffix, otherwise its low byte is absorbed and its high byte
+ * becomes the new suffix.
+ *
+ * Returns the (cast) result of the last absorb call performed.
+ */
+HashReturn Keccak_HashUpdate(Keccak_HashInstance *instance, const BitSequence *data, DataLength databitlen)
+{
+    const DataLength wholeBytes = databitlen/8;
+    const unsigned int trailingBits = (unsigned int)(databitlen % 8);
+    unsigned char lastByte;
+    unsigned short merged;
+    unsigned char oneByte[1];
+    HashReturn status;
+
+    status = (HashReturn)KeccakWidth1600_SpongeAbsorb(&instance->sponge, data, wholeBytes);
+    if ((trailingBits == 0) || (status != SUCCESS))
+        return status;
+
+    /* The last partial byte is assumed to be aligned on the least significant bits */
+    lastByte = data[wholeBytes];
+    /* Place the suffix bits right above the trailing input bits */
+    merged = (unsigned short)((unsigned short)lastByte | ((unsigned short)instance->delimitedSuffix << trailingBits));
+
+    if ((merged & 0xFF00) == 0x0000) {
+        /* Everything still fits in the suffix byte: nothing to absorb yet */
+        instance->delimitedSuffix = merged & 0xFF;
+        return status;
+    }
+
+    /* Overflowed one byte: absorb the low byte, keep the high byte as suffix */
+    oneByte[0] = merged & 0xFF;
+    status = (HashReturn)KeccakWidth1600_SpongeAbsorb(&instance->sponge, oneByte, 1);
+    instance->delimitedSuffix = (merged >> 8) & 0xFF;
+    return status;
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Finish the input phase by absorbing the stored delimited suffix, then
+ * squeeze fixedOutputLength/8 bytes into hashval.
+ * Returns the absorb result if it is not SUCCESS, otherwise the (cast)
+ * squeeze result.
+ */
+HashReturn Keccak_HashFinal(Keccak_HashInstance *instance, BitSequence *hashval)
+{
+    HashReturn status;
+
+    status = (HashReturn)KeccakWidth1600_SpongeAbsorbLastFewBits(&instance->sponge, instance->delimitedSuffix);
+    if (status != SUCCESS) {
+        return status;
+    }
+    return (HashReturn)KeccakWidth1600_SpongeSqueeze(&instance->sponge, hashval, instance->fixedOutputLength/8);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Squeeze databitlen bits of output into data.
+ * Only whole bytes are supported: returns FAIL when databitlen is not a
+ * multiple of 8, otherwise the (cast) sponge squeeze result.
+ */
+HashReturn Keccak_HashSqueeze(Keccak_HashInstance *instance, BitSequence *data, DataLength databitlen)
+{
+    if ((databitlen % 8) == 0) {
+        return (HashReturn)KeccakWidth1600_SpongeSqueeze(&instance->sponge, data, databitlen/8);
+    }
+    return FAIL;
+}
diff --git a/ext/hash/sha3/generic64lc/KeccakHash.h b/ext/hash/sha3/generic64lc/KeccakHash.h
new file mode 100644 (file)
index 0000000..ec35d3d
--- /dev/null
@@ -0,0 +1,113 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakHashInterface_h_
+#define _KeccakHashInterface_h_
+
+#ifndef KeccakP1600_excluded
+
+#include "KeccakSponge.h"
+#include <string.h>
+
+/* Byte type used for input and output buffers. */
+typedef unsigned char BitSequence;
+/* Length type; the databitlen parameters below are expressed in bits. */
+typedef size_t DataLength;
+/* Status codes returned by the Keccak_Hash* functions. */
+typedef enum { SUCCESS = 0, FAIL = 1, BAD_HASHLEN = 2 } HashReturn;
+
+/* State of one hashing operation. */
+typedef struct {
+    KeccakWidth1600_SpongeInstance sponge;  /* underlying sponge instance */
+    unsigned int fixedOutputLength;         /* hashbitlen given to Keccak_HashInitialize() */
+    unsigned char delimitedSuffix;          /* delimitedSuffix given to Keccak_HashInitialize() */
+} Keccak_HashInstance;
+
+/**
+  * Function to initialize the Keccak[r, c] sponge function instance used in sequential hashing mode.
+  * @param  hashInstance    Pointer to the hash instance to be initialized.
+  * @param  rate        The value of the rate r.
+  * @param  capacity    The value of the capacity c.
+  * @param  hashbitlen  The desired number of output bits,
+  *                     or 0 for an arbitrarily-long output.
+  * @param  delimitedSuffix Bits that will be automatically appended to the end
+  *                         of the input message, as in domain separation.
+  *                         This is a byte containing from 0 to 7 bits
+  *                         formatted like the @a delimitedData parameter of
+  *                         the Keccak_SpongeAbsorbLastFewBits() function.
+  * @pre    One must have r+c=1600 and the rate a multiple of 8 bits in this implementation.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Keccak_HashInitialize(Keccak_HashInstance *hashInstance, unsigned int rate, unsigned int capacity, unsigned int hashbitlen, unsigned char delimitedSuffix);
+
+/** Macro to initialize a SHAKE128 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHAKE128(hashInstance)        Keccak_HashInitialize(hashInstance, 1344,  256,   0, 0x1F)
+
+/** Macro to initialize a SHAKE256 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHAKE256(hashInstance)        Keccak_HashInitialize(hashInstance, 1088,  512,   0, 0x1F)
+
+/** Macro to initialize a SHA3-224 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHA3_224(hashInstance)        Keccak_HashInitialize(hashInstance, 1152,  448, 224, 0x06)
+
+/** Macro to initialize a SHA3-256 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHA3_256(hashInstance)        Keccak_HashInitialize(hashInstance, 1088,  512, 256, 0x06)
+
+/** Macro to initialize a SHA3-384 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHA3_384(hashInstance)        Keccak_HashInitialize(hashInstance,  832,  768, 384, 0x06)
+
+/** Macro to initialize a SHA3-512 instance as specified in the FIPS 202 standard.
+  */
+#define Keccak_HashInitialize_SHA3_512(hashInstance)        Keccak_HashInitialize(hashInstance,  576, 1024, 512, 0x06)
+
+/**
+  * Function to give input data to be absorbed.
+  * @param  hashInstance    Pointer to the hash instance initialized by Keccak_HashInitialize().
+  * @param  data        Pointer to the input data.
+  *                     When @a databitlen is not a multiple of 8, the last bits of data must be
+  *                     in the least significant bits of the last byte (little-endian convention).
+  * @param  databitlen  The number of input bits provided in the input data.
+  * @pre    In the previous call to Keccak_HashUpdate(), databitlen was a multiple of 8.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Keccak_HashUpdate(Keccak_HashInstance *hashInstance, const BitSequence *data, DataLength databitlen);
+
+/**
+  * Function to call after all input blocks have been input and to get
+  * output bits if the length was specified when calling Keccak_HashInitialize().
+  * @param  hashInstance    Pointer to the hash instance initialized by Keccak_HashInitialize().
+  * If @a hashbitlen was not 0 in the call to Keccak_HashInitialize(), the number of
+  *     output bits is equal to @a hashbitlen.
+  * If @a hashbitlen was 0 in the call to Keccak_HashInitialize(), the output bits
+  *     must be extracted using the Keccak_HashSqueeze() function.
+  * @param  hashval     Pointer to the buffer where to store the output data.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Keccak_HashFinal(Keccak_HashInstance *hashInstance, BitSequence *hashval);
+
+ /**
+  * Function to squeeze output data.
+  * @param  hashInstance    Pointer to the hash instance initialized by Keccak_HashInitialize().
+  * @param  data        Pointer to the buffer where to store the output data.
+  * @param  databitlen  The number of output bits desired (must be a multiple of 8).
+  * @pre    Keccak_HashFinal() must have been already called.
+  * @pre    @a databitlen is a multiple of 8.
+  * @return SUCCESS if successful, FAIL otherwise.
+  */
+HashReturn Keccak_HashSqueeze(Keccak_HashInstance *hashInstance, BitSequence *data, DataLength databitlen);
+
+#endif
+
+#endif
diff --git a/ext/hash/sha3/generic64lc/KeccakP-1600-64.macros b/ext/hash/sha3/generic64lc/KeccakP-1600-64.macros
new file mode 100644 (file)
index 0000000..d81b152
--- /dev/null
@@ -0,0 +1,2197 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+/* Declares the UINT64 locals used by the round macros in this file: 25 lanes
+   each for the A, B and E sets, plus the five-lane C and D sets. */
+#define declareABCDE \
+    UINT64 Aba, Abe, Abi, Abo, Abu; \
+    UINT64 Aga, Age, Agi, Ago, Agu; \
+    UINT64 Aka, Ake, Aki, Ako, Aku; \
+    UINT64 Ama, Ame, Ami, Amo, Amu; \
+    UINT64 Asa, Ase, Asi, Aso, Asu; \
+    UINT64 Bba, Bbe, Bbi, Bbo, Bbu; \
+    UINT64 Bga, Bge, Bgi, Bgo, Bgu; \
+    UINT64 Bka, Bke, Bki, Bko, Bku; \
+    UINT64 Bma, Bme, Bmi, Bmo, Bmu; \
+    UINT64 Bsa, Bse, Bsi, Bso, Bsu; \
+    UINT64 Ca, Ce, Ci, Co, Cu; \
+    UINT64 Da, De, Di, Do, Du; \
+    UINT64 Eba, Ebe, Ebi, Ebo, Ebu; \
+    UINT64 Ega, Ege, Egi, Ego, Egu; \
+    UINT64 Eka, Eke, Eki, Eko, Eku; \
+    UINT64 Ema, Eme, Emi, Emo, Emu; \
+    UINT64 Esa, Ese, Esi, Eso, Esu; \
+
+/* Computes the five C values, each the XOR of the five A lanes that share the
+   same final letter (a, e, i, o, u). The round macros read these C values. */
+#define prepareTheta \
+    Ca = Aba^Aga^Aka^Ama^Asa; \
+    Ce = Abe^Age^Ake^Ame^Ase; \
+    Ci = Abi^Agi^Aki^Ami^Asi; \
+    Co = Abo^Ago^Ako^Amo^Aso; \
+    Cu = Abu^Agu^Aku^Amu^Asu; \
+
+#ifdef UseBebigokimisa
+/* --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa') */
+/* --- 64-bit lanes mapped to 64-bit words */
+/* One unrolled round from lane set A to lane set E, using round constant     */
+/* KeccakF1600RoundConstants[i].                                              */
+/* Inputs: the A lanes and Ca..Cu. The macro first derives Da..Du from the C  */
+/* values, then for each group of five lanes XORs the D values into A (A is   */
+/* modified in place), rotates the result into B with ROL64, and combines the */
+/* B values into E. Some B terms are complemented, per the lane complementing */
+/* pattern named above. While writing E it also rebuilds Ca..Cu as the XOR of */
+/* the new E lanes, so they are ready for the following round.                */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da = Cu^ROL64(Ce, 1); \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    A##ba ^= Da; \
+    Bba = A##ba; \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^(  Bbe |  Bbi ); \
+    E##ba ^= KeccakF1600RoundConstants[i]; \
+    Ca = E##ba; \
+    E##be =   Bbe ^((~Bbi)|  Bbo ); \
+    Ce = E##be; \
+    E##bi =   Bbi ^(  Bbo &  Bbu ); \
+    Ci = E##bi; \
+    E##bo =   Bbo ^(  Bbu |  Bba ); \
+    Co = E##bo; \
+    E##bu =   Bbu ^(  Bba &  Bbe ); \
+    Cu = E##bu; \
+\
+    A##bo ^= Do; \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^(  Bge |  Bgi ); \
+    Ca ^= E##ga; \
+    E##ge =   Bge ^(  Bgi &  Bgo ); \
+    Ce ^= E##ge; \
+    E##gi =   Bgi ^(  Bgo |(~Bgu)); \
+    Ci ^= E##gi; \
+    E##go =   Bgo ^(  Bgu |  Bga ); \
+    Co ^= E##go; \
+    E##gu =   Bgu ^(  Bga &  Bge ); \
+    Cu ^= E##gu; \
+\
+    A##be ^= De; \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^(  Bke |  Bki ); \
+    Ca ^= E##ka; \
+    E##ke =   Bke ^(  Bki &  Bko ); \
+    Ce ^= E##ke; \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    Ci ^= E##ki; \
+    E##ko = (~Bko)^(  Bku |  Bka ); \
+    Co ^= E##ko; \
+    E##ku =   Bku ^(  Bka &  Bke ); \
+    Cu ^= E##ku; \
+\
+    A##bu ^= Du; \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^(  Bme &  Bmi ); \
+    Ca ^= E##ma; \
+    E##me =   Bme ^(  Bmi |  Bmo ); \
+    Ce ^= E##me; \
+    E##mi =   Bmi ^((~Bmo)|  Bmu ); \
+    Ci ^= E##mi; \
+    E##mo = (~Bmo)^(  Bmu &  Bma ); \
+    Co ^= E##mo; \
+    E##mu =   Bmu ^(  Bma |  Bme ); \
+    Cu ^= E##mu; \
+\
+    A##bi ^= Di; \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    Ca ^= E##sa; \
+    E##se = (~Bse)^(  Bsi |  Bso ); \
+    Ce ^= E##se; \
+    E##si =   Bsi ^(  Bso &  Bsu ); \
+    Ci ^= E##si; \
+    E##so =   Bso ^(  Bsu |  Bsa ); \
+    Co ^= E##so; \
+    E##su =   Bsu ^(  Bsa &  Bse ); \
+    Cu ^= E##su; \
+\
+
+/* --- Code for round (lane complementing pattern 'bebigokimisa')
+ *     Variant without prepare-theta: computes the 25 lanes E##ba..E##su of
+ *     round i from A##ba..A##su and leaves Ca..Cu untouched.
+ *     - Da..Du are derived from Ca..Cu, which must already hold valid
+ *       values on entry.
+ *     - Each A lane is XORed with its D word in place (A is modified) and
+ *       rotated with ROL64 into a B temporary; each group of five B words
+ *       is then combined into five E lanes.
+ *     - Some E terms use | or a complemented operand instead of (~x)&y;
+ *       compare with the plain variant in the #else branch.
+ *     Ca..Cu, Da..Du, Bba..Bsu, ROL64 and KeccakF1600RoundConstants are
+ *     not defined here and must be visible at the expansion site. */
+/* --- 64-bit lanes mapped to 64-bit words */
+#define thetaRhoPiChiIota(i, A, E) \
+    Da = Cu^ROL64(Ce, 1); \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    A##ba ^= Da; \
+    Bba = A##ba; /* lane ba is the only one used without a rotation */ \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^(  Bbe |  Bbi ); \
+    E##ba ^= KeccakF1600RoundConstants[i]; /* the round constant is applied to lane ba only */ \
+    E##be =   Bbe ^((~Bbi)|  Bbo ); \
+    E##bi =   Bbi ^(  Bbo &  Bbu ); \
+    E##bo =   Bbo ^(  Bbu |  Bba ); \
+    E##bu =   Bbu ^(  Bba &  Bbe ); \
+\
+    A##bo ^= Do; \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^(  Bge |  Bgi ); \
+    E##ge =   Bge ^(  Bgi &  Bgo ); \
+    E##gi =   Bgi ^(  Bgo |(~Bgu)); \
+    E##go =   Bgo ^(  Bgu |  Bga ); \
+    E##gu =   Bgu ^(  Bga &  Bge ); \
+\
+    A##be ^= De; \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^(  Bke |  Bki ); \
+    E##ke =   Bke ^(  Bki &  Bko ); \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    E##ko = (~Bko)^(  Bku |  Bka ); \
+    E##ku =   Bku ^(  Bka &  Bke ); \
+\
+    A##bu ^= Du; \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^(  Bme &  Bmi ); \
+    E##me =   Bme ^(  Bmi |  Bmo ); \
+    E##mi =   Bmi ^((~Bmo)|  Bmu ); \
+    E##mo = (~Bmo)^(  Bmu &  Bma ); \
+    E##mu =   Bmu ^(  Bma |  Bme ); \
+\
+    A##bi ^= Di; \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    E##se = (~Bse)^(  Bsi |  Bso ); \
+    E##si =   Bsi ^(  Bso &  Bsu ); \
+    E##so =   Bso ^(  Bsu |  Bsa ); \
+    E##su =   Bsu ^(  Bsa &  Bse ); \
+\
+
+#else /* UseBebigokimisa */
+/* --- Code for round, with prepare-theta
+ *     Computes the 25 lanes E##ba..E##su of round i from A##ba..A##su and,
+ *     in the same pass, rebuilds Ca..Cu as the XOR of the five E lanes that
+ *     share the same last letter (Ca = E##ba ^ E##ga ^ E##ka ^ E##ma ^
+ *     E##sa, and so on) - presumably so that the following round can derive
+ *     its Da..Du directly.
+ *     - Da..Du are derived from the Ca..Cu values present on entry.
+ *     - Each A lane is XORed with its D word in place (A is modified) and
+ *       rotated with ROL64 into a B temporary.
+ *     - Every E lane has the form B0 ^ ((~B1) & B2) over neighbouring B
+ *       words of its group.
+ *     Ca..Cu, Da..Du, Bba..Bsu, ROL64 and KeccakF1600RoundConstants are
+ *     not defined here and must be visible at the expansion site. */
+/* --- 64-bit lanes mapped to 64-bit words */
+#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+    Da = Cu^ROL64(Ce, 1); \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    A##ba ^= Da; \
+    Bba = A##ba; /* lane ba is the only one used without a rotation */ \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^((~Bbe)&  Bbi ); \
+    E##ba ^= KeccakF1600RoundConstants[i]; /* the round constant is applied to lane ba only */ \
+    Ca = E##ba; /* first group overwrites Ca..Cu; later groups XOR into them */ \
+    E##be =   Bbe ^((~Bbi)&  Bbo ); \
+    Ce = E##be; \
+    E##bi =   Bbi ^((~Bbo)&  Bbu ); \
+    Ci = E##bi; \
+    E##bo =   Bbo ^((~Bbu)&  Bba ); \
+    Co = E##bo; \
+    E##bu =   Bbu ^((~Bba)&  Bbe ); \
+    Cu = E##bu; \
+\
+    A##bo ^= Do; \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^((~Bge)&  Bgi ); \
+    Ca ^= E##ga; \
+    E##ge =   Bge ^((~Bgi)&  Bgo ); \
+    Ce ^= E##ge; \
+    E##gi =   Bgi ^((~Bgo)&  Bgu ); \
+    Ci ^= E##gi; \
+    E##go =   Bgo ^((~Bgu)&  Bga ); \
+    Co ^= E##go; \
+    E##gu =   Bgu ^((~Bga)&  Bge ); \
+    Cu ^= E##gu; \
+\
+    A##be ^= De; \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^((~Bke)&  Bki ); \
+    Ca ^= E##ka; \
+    E##ke =   Bke ^((~Bki)&  Bko ); \
+    Ce ^= E##ke; \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    Ci ^= E##ki; \
+    E##ko =   Bko ^((~Bku)&  Bka ); \
+    Co ^= E##ko; \
+    E##ku =   Bku ^((~Bka)&  Bke ); \
+    Cu ^= E##ku; \
+\
+    A##bu ^= Du; \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^((~Bme)&  Bmi ); \
+    Ca ^= E##ma; \
+    E##me =   Bme ^((~Bmi)&  Bmo ); \
+    Ce ^= E##me; \
+    E##mi =   Bmi ^((~Bmo)&  Bmu ); \
+    Ci ^= E##mi; \
+    E##mo =   Bmo ^((~Bmu)&  Bma ); \
+    Co ^= E##mo; \
+    E##mu =   Bmu ^((~Bma)&  Bme ); \
+    Cu ^= E##mu; \
+\
+    A##bi ^= Di; \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    Ca ^= E##sa; \
+    E##se =   Bse ^((~Bsi)&  Bso ); \
+    Ce ^= E##se; \
+    E##si =   Bsi ^((~Bso)&  Bsu ); \
+    Ci ^= E##si; \
+    E##so =   Bso ^((~Bsu)&  Bsa ); \
+    Co ^= E##so; \
+    E##su =   Bsu ^((~Bsa)&  Bse ); \
+    Cu ^= E##su; \
+\
+
+/* --- Code for round
+ *     Variant without prepare-theta: computes the 25 lanes E##ba..E##su of
+ *     round i from A##ba..A##su exactly like thetaRhoPiChiIotaPrepareTheta,
+ *     but only reads Ca..Cu and never updates them.
+ *     - Da..Du are derived from the Ca..Cu values present on entry.
+ *     - Each A lane is XORed with its D word in place (A is modified) and
+ *       rotated with ROL64 into a B temporary.
+ *     - Every E lane has the form B0 ^ ((~B1) & B2) over neighbouring B
+ *       words of its group.
+ *     Ca..Cu, Da..Du, Bba..Bsu, ROL64 and KeccakF1600RoundConstants are
+ *     not defined here and must be visible at the expansion site. */
+/* --- 64-bit lanes mapped to 64-bit words */
+#define thetaRhoPiChiIota(i, A, E) \
+    Da = Cu^ROL64(Ce, 1); \
+    De = Ca^ROL64(Ci, 1); \
+    Di = Ce^ROL64(Co, 1); \
+    Do = Ci^ROL64(Cu, 1); \
+    Du = Co^ROL64(Ca, 1); \
+\
+    A##ba ^= Da; \
+    Bba = A##ba; /* lane ba is the only one used without a rotation */ \
+    A##ge ^= De; \
+    Bbe = ROL64(A##ge, 44); \
+    A##ki ^= Di; \
+    Bbi = ROL64(A##ki, 43); \
+    A##mo ^= Do; \
+    Bbo = ROL64(A##mo, 21); \
+    A##su ^= Du; \
+    Bbu = ROL64(A##su, 14); \
+    E##ba =   Bba ^((~Bbe)&  Bbi ); \
+    E##ba ^= KeccakF1600RoundConstants[i]; /* the round constant is applied to lane ba only */ \
+    E##be =   Bbe ^((~Bbi)&  Bbo ); \
+    E##bi =   Bbi ^((~Bbo)&  Bbu ); \
+    E##bo =   Bbo ^((~Bbu)&  Bba ); \
+    E##bu =   Bbu ^((~Bba)&  Bbe ); \
+\
+    A##bo ^= Do; \
+    Bga = ROL64(A##bo, 28); \
+    A##gu ^= Du; \
+    Bge = ROL64(A##gu, 20); \
+    A##ka ^= Da; \
+    Bgi = ROL64(A##ka, 3); \
+    A##me ^= De; \
+    Bgo = ROL64(A##me, 45); \
+    A##si ^= Di; \
+    Bgu = ROL64(A##si, 61); \
+    E##ga =   Bga ^((~Bge)&  Bgi ); \
+    E##ge =   Bge ^((~Bgi)&  Bgo ); \
+    E##gi =   Bgi ^((~Bgo)&  Bgu ); \
+    E##go =   Bgo ^((~Bgu)&  Bga ); \
+    E##gu =   Bgu ^((~Bga)&  Bge ); \
+\
+    A##be ^= De; \
+    Bka = ROL64(A##be, 1); \
+    A##gi ^= Di; \
+    Bke = ROL64(A##gi, 6); \
+    A##ko ^= Do; \
+    Bki = ROL64(A##ko, 25); \
+    A##mu ^= Du; \
+    Bko = ROL64(A##mu, 8); \
+    A##sa ^= Da; \
+    Bku = ROL64(A##sa, 18); \
+    E##ka =   Bka ^((~Bke)&  Bki ); \
+    E##ke =   Bke ^((~Bki)&  Bko ); \
+    E##ki =   Bki ^((~Bko)&  Bku ); \
+    E##ko =   Bko ^((~Bku)&  Bka ); \
+    E##ku =   Bku ^((~Bka)&  Bke ); \
+\
+    A##bu ^= Du; \
+    Bma = ROL64(A##bu, 27); \
+    A##ga ^= Da; \
+    Bme = ROL64(A##ga, 36); \
+    A##ke ^= De; \
+    Bmi = ROL64(A##ke, 10); \
+    A##mi ^= Di; \
+    Bmo = ROL64(A##mi, 15); \
+    A##so ^= Do; \
+    Bmu = ROL64(A##so, 56); \
+    E##ma =   Bma ^((~Bme)&  Bmi ); \
+    E##me =   Bme ^((~Bmi)&  Bmo ); \
+    E##mi =   Bmi ^((~Bmo)&  Bmu ); \
+    E##mo =   Bmo ^((~Bmu)&  Bma ); \
+    E##mu =   Bmu ^((~Bma)&  Bme ); \
+\
+    A##bi ^= Di; \
+    Bsa = ROL64(A##bi, 62); \
+    A##go ^= Do; \
+    Bse = ROL64(A##go, 55); \
+    A##ku ^= Du; \
+    Bsi = ROL64(A##ku, 39); \
+    A##ma ^= Da; \
+    Bso = ROL64(A##ma, 41); \
+    A##se ^= De; \
+    Bsu = ROL64(A##se, 2); \
+    E##sa =   Bsa ^((~Bse)&  Bsi ); \
+    E##se =   Bse ^((~Bsi)&  Bso ); \
+    E##si =   Bsi ^((~Bso)&  Bsu ); \
+    E##so =   Bso ^((~Bsu)&  Bsa ); \
+    E##su =   Bsu ^((~Bsa)&  Bse ); \
+\
+
+#endif /* UseBebigokimisa */
+
+#define copyFromState(X, state) /* Load state[0..24] into the 25 variables X##ba .. X##su, in index order (b,g,k,m,s rows x a,e,i,o,u). Expands to plain statements; the X##.. variables must exist at the expansion site. */ \
+    X##ba = state[ 0]; \
+    X##be = state[ 1]; \
+    X##bi = state[ 2]; \
+    X##bo = state[ 3]; \
+    X##bu = state[ 4]; \
+    X##ga = state[ 5]; \
+    X##ge = state[ 6]; \
+    X##gi = state[ 7]; \
+    X##go = state[ 8]; \
+    X##gu = state[ 9]; \
+    X##ka = state[10]; \
+    X##ke = state[11]; \
+    X##ki = state[12]; \
+    X##ko = state[13]; \
+    X##ku = state[14]; \
+    X##ma = state[15]; \
+    X##me = state[16]; \
+    X##mi = state[17]; \
+    X##mo = state[18]; \
+    X##mu = state[19]; \
+    X##sa = state[20]; \
+    X##se = state[21]; \
+    X##si = state[22]; \
+    X##so = state[23]; \
+    X##su = state[24]; \
+
+#define copyToState(state, X) /* Store the 25 variables X##ba .. X##su into state[0..24]; exact inverse of the index mapping used by copyFromState. Expands to plain statements. */ \
+    state[ 0] = X##ba; \
+    state[ 1] = X##be; \
+    state[ 2] = X##bi; \
+    state[ 3] = X##bo; \
+    state[ 4] = X##bu; \
+    state[ 5] = X##ga; \
+    state[ 6] = X##ge; \
+    state[ 7] = X##gi; \
+    state[ 8] = X##go; \
+    state[ 9] = X##gu; \
+    state[10] = X##ka; \
+    state[11] = X##ke; \
+    state[12] = X##ki; \
+    state[13] = X##ko; \
+    state[14] = X##ku; \
+    state[15] = X##ma; \
+    state[16] = X##me; \
+    state[17] = X##mi; \
+    state[18] = X##mo; \
+    state[19] = X##mu; \
+    state[20] = X##sa; \
+    state[21] = X##se; \
+    state[22] = X##si; \
+    state[23] = X##so; \
+    state[24] = X##su; \
+
+#define copyStateVariables(X, Y) /* Copy the 25 lane variables Y##ba .. Y##su into X##ba .. X##su (destination prefix first, source prefix second). Expands to plain statements. */ \
+    X##ba = Y##ba; \
+    X##be = Y##be; \
+    X##bi = Y##bi; \
+    X##bo = Y##bo; \
+    X##bu = Y##bu; \
+    X##ga = Y##ga; \
+    X##ge = Y##ge; \
+    X##gi = Y##gi; \
+    X##go = Y##go; \
+    X##gu = Y##gu; \
+    X##ka = Y##ka; \
+    X##ke = Y##ke; \
+    X##ki = Y##ki; \
+    X##ko = Y##ko; \
+    X##ku = Y##ku; \
+    X##ma = Y##ma; \
+    X##me = Y##me; \
+    X##mi = Y##mi; \
+    X##mo = Y##mo; \
+    X##mu = Y##mu; \
+    X##sa = Y##sa; \
+    X##se = Y##se; \
+    X##si = Y##si; \
+    X##so = Y##so; \
+    X##su = Y##su; \
+
+#define copyFromStateAndAdd(X, state, input, laneCount) /* For k = 0..24: X lane k = state[k]^input[k] when k < laneCount, else state[k]. input[] is never read at index >= laneCount. Expands to one bare if/else statement (no do-while wrapper); laneCount is evaluated several times and is not parenthesized, so pass a simple side-effect-free expression. */ \
+    if (laneCount < 16) { /* nested range tests narrow laneCount down to its exact value */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    if (laneCount < 1) { \
+                        X##ba = state[ 0]; \
+                    } \
+                    else { \
+                        X##ba = state[ 0]^input[ 0]; \
+                    } \
+                    X##be = state[ 1]; \
+                    X##bi = state[ 2]; \
+                } \
+                else { \
+                    X##ba = state[ 0]^input[ 0]; \
+                    X##be = state[ 1]^input[ 1]; \
+                    if (laneCount < 3) { \
+                        X##bi = state[ 2]; \
+                    } \
+                    else { \
+                        X##bi = state[ 2]^input[ 2]; \
+                    } \
+                } \
+                X##bo = state[ 3]; /* laneCount < 4: lanes 3..6 come from state only */ \
+                X##bu = state[ 4]; \
+                X##ga = state[ 5]; \
+                X##ge = state[ 6]; \
+            } \
+            else { \
+                X##ba = state[ 0]^input[ 0]; \
+                X##be = state[ 1]^input[ 1]; \
+                X##bi = state[ 2]^input[ 2]; \
+                X##bo = state[ 3]^input[ 3]; \
+                if (laneCount < 6) { \
+                    if (laneCount < 5) { \
+                        X##bu = state[ 4]; \
+                    } \
+                    else { \
+                        X##bu = state[ 4]^input[ 4]; \
+                    } \
+                    X##ga = state[ 5]; \
+                    X##ge = state[ 6]; \
+                } \
+                else { \
+                    X##bu = state[ 4]^input[ 4]; \
+                    X##ga = state[ 5]^input[ 5]; \
+                    if (laneCount < 7) { \
+                        X##ge = state[ 6]; \
+                    } \
+                    else { \
+                        X##ge = state[ 6]^input[ 6]; \
+                    } \
+                } \
+            } \
+            X##gi = state[ 7]; /* laneCount < 8: lanes 7..14 come from state only */ \
+            X##go = state[ 8]; \
+            X##gu = state[ 9]; \
+            X##ka = state[10]; \
+            X##ke = state[11]; \
+            X##ki = state[12]; \
+            X##ko = state[13]; \
+            X##ku = state[14]; \
+        } \
+        else { \
+            X##ba = state[ 0]^input[ 0]; \
+            X##be = state[ 1]^input[ 1]; \
+            X##bi = state[ 2]^input[ 2]; \
+            X##bo = state[ 3]^input[ 3]; \
+            X##bu = state[ 4]^input[ 4]; \
+            X##ga = state[ 5]^input[ 5]; \
+            X##ge = state[ 6]^input[ 6]; \
+            X##gi = state[ 7]^input[ 7]; \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    if (laneCount < 9) { \
+                        X##go = state[ 8]; \
+                    } \
+                    else { \
+                        X##go = state[ 8]^input[ 8]; \
+                    } \
+                    X##gu = state[ 9]; \
+                    X##ka = state[10]; \
+                } \
+                else { \
+                    X##go = state[ 8]^input[ 8]; \
+                    X##gu = state[ 9]^input[ 9]; \
+                    if (laneCount < 11) { \
+                        X##ka = state[10]; \
+                    } \
+                    else { \
+                        X##ka = state[10]^input[10]; \
+                    } \
+                } \
+                X##ke = state[11]; \
+                X##ki = state[12]; \
+                X##ko = state[13]; \
+                X##ku = state[14]; \
+            } \
+            else { \
+                X##go = state[ 8]^input[ 8]; \
+                X##gu = state[ 9]^input[ 9]; \
+                X##ka = state[10]^input[10]; \
+                X##ke = state[11]^input[11]; \
+                if (laneCount < 14) { \
+                    if (laneCount < 13) { \
+                        X##ki = state[12]; \
+                    } \
+                    else { \
+                        X##ki = state[12]^input[12]; \
+                    } \
+                    X##ko = state[13]; \
+                    X##ku = state[14]; \
+                } \
+                else { \
+                    X##ki = state[12]^input[12]; \
+                    X##ko = state[13]^input[13]; \
+                    if (laneCount < 15) { \
+                        X##ku = state[14]; \
+                    } \
+                    else { \
+                        X##ku = state[14]^input[14]; \
+                    } \
+                } \
+            } \
+        } \
+        X##ma = state[15]; /* laneCount < 16: lanes 15..24 come from state only */ \
+        X##me = state[16]; \
+        X##mi = state[17]; \
+        X##mo = state[18]; \
+        X##mu = state[19]; \
+        X##sa = state[20]; \
+        X##se = state[21]; \
+        X##si = state[22]; \
+        X##so = state[23]; \
+        X##su = state[24]; \
+    } \
+    else { /* laneCount >= 16: lanes 0..15 always absorb input */ \
+        X##ba = state[ 0]^input[ 0]; \
+        X##be = state[ 1]^input[ 1]; \
+        X##bi = state[ 2]^input[ 2]; \
+        X##bo = state[ 3]^input[ 3]; \
+        X##bu = state[ 4]^input[ 4]; \
+        X##ga = state[ 5]^input[ 5]; \
+        X##ge = state[ 6]^input[ 6]; \
+        X##gi = state[ 7]^input[ 7]; \
+        X##go = state[ 8]^input[ 8]; \
+        X##gu = state[ 9]^input[ 9]; \
+        X##ka = state[10]^input[10]; \
+        X##ke = state[11]^input[11]; \
+        X##ki = state[12]^input[12]; \
+        X##ko = state[13]^input[13]; \
+        X##ku = state[14]^input[14]; \
+        X##ma = state[15]^input[15]; \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    if (laneCount < 17) { \
+                        X##me = state[16]; \
+                    } \
+                    else { \
+                        X##me = state[16]^input[16]; \
+                    } \
+                    X##mi = state[17]; \
+                    X##mo = state[18]; \
+                } \
+                else { \
+                    X##me = state[16]^input[16]; \
+                    X##mi = state[17]^input[17]; \
+                    if (laneCount < 19) { \
+                        X##mo = state[18]; \
+                    } \
+                    else { \
+                        X##mo = state[18]^input[18]; \
+                    } \
+                } \
+                X##mu = state[19]; \
+                X##sa = state[20]; \
+                X##se = state[21]; \
+                X##si = state[22]; \
+            } \
+            else { \
+                X##me = state[16]^input[16]; \
+                X##mi = state[17]^input[17]; \
+                X##mo = state[18]^input[18]; \
+                X##mu = state[19]^input[19]; \
+                if (laneCount < 22) { \
+                    if (laneCount < 21) { \
+                        X##sa = state[20]; \
+                    } \
+                    else { \
+                        X##sa = state[20]^input[20]; \
+                    } \
+                    X##se = state[21]; \
+                    X##si = state[22]; \
+                } \
+                else { \
+                    X##sa = state[20]^input[20]; \
+                    X##se = state[21]^input[21]; \
+                    if (laneCount < 23) { \
+                        X##si = state[22]; \
+                    } \
+                    else { \
+                        X##si = state[22]^input[22]; \
+                    } \
+                } \
+            } \
+            X##so = state[23]; \
+            X##su = state[24]; \
+        } \
+        else { \
+            X##me = state[16]^input[16]; \
+            X##mi = state[17]^input[17]; \
+            X##mo = state[18]^input[18]; \
+            X##mu = state[19]^input[19]; \
+            X##sa = state[20]^input[20]; \
+            X##se = state[21]^input[21]; \
+            X##si = state[22]^input[22]; \
+            X##so = state[23]^input[23]; \
+            if (laneCount < 25) { \
+                X##su = state[24]; \
+            } \
+            else { /* laneCount >= 25: all 25 lanes absorb input */ \
+                X##su = state[24]^input[24]; \
+            } \
+        } \
+    }
+
+#define addInput(X, input, laneCount) /* For k < laneCount (at most 25): XOR input[k] into X lane k; the other X lanes are left unchanged. Expands to one bare if/else-if/else statement (no do-while wrapper); laneCount is evaluated several times and is not parenthesized, so pass a simple side-effect-free expression. */ \
+    if (laneCount == 21) { /* dedicated straight-line path: XOR input[0..20] with no further tests */ \
+        X##ba ^= input[ 0]; \
+        X##be ^= input[ 1]; \
+        X##bi ^= input[ 2]; \
+        X##bo ^= input[ 3]; \
+        X##bu ^= input[ 4]; \
+        X##ga ^= input[ 5]; \
+        X##ge ^= input[ 6]; \
+        X##gi ^= input[ 7]; \
+        X##go ^= input[ 8]; \
+        X##gu ^= input[ 9]; \
+        X##ka ^= input[10]; \
+        X##ke ^= input[11]; \
+        X##ki ^= input[12]; \
+        X##ko ^= input[13]; \
+        X##ku ^= input[14]; \
+        X##ma ^= input[15]; \
+        X##me ^= input[16]; \
+        X##mi ^= input[17]; \
+        X##mo ^= input[18]; \
+        X##mu ^= input[19]; \
+        X##sa ^= input[20]; \
+    } \
+    else if (laneCount < 16) { /* general case: nested range tests narrow laneCount down to its exact value */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    if (laneCount < 1) { /* laneCount < 1: nothing to add */ \
+                    } \
+                    else { \
+                        X##ba ^= input[ 0]; \
+                    } \
+                } \
+                else { \
+                    X##ba ^= input[ 0]; \
+                    X##be ^= input[ 1]; \
+                    if (laneCount < 3) { \
+                    } \
+                    else { \
+                        X##bi ^= input[ 2]; \
+                    } \
+                } \
+            } \
+            else { \
+                X##ba ^= input[ 0]; \
+                X##be ^= input[ 1]; \
+                X##bi ^= input[ 2]; \
+                X##bo ^= input[ 3]; \
+                if (laneCount < 6) { \
+                    if (laneCount < 5) { \
+                    } \
+                    else { \
+                        X##bu ^= input[ 4]; \
+                    } \
+                } \
+                else { \
+                    X##bu ^= input[ 4]; \
+                    X##ga ^= input[ 5]; \
+                    if (laneCount < 7) { \
+                    } \
+                    else { \
+                        X##ge ^= input[ 6]; \
+                    } \
+                } \
+            } \
+        } \
+        else { \
+            X##ba ^= input[ 0]; \
+            X##be ^= input[ 1]; \
+            X##bi ^= input[ 2]; \
+            X##bo ^= input[ 3]; \
+            X##bu ^= input[ 4]; \
+            X##ga ^= input[ 5]; \
+            X##ge ^= input[ 6]; \
+            X##gi ^= input[ 7]; \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    if (laneCount < 9) { \
+                    } \
+                    else { \
+                        X##go ^= input[ 8]; \
+                    } \
+                } \
+                else { \
+                    X##go ^= input[ 8]; \
+                    X##gu ^= input[ 9]; \
+                    if (laneCount < 11) { \
+                    } \
+                    else { \
+                        X##ka ^= input[10]; \
+                    } \
+                } \
+            } \
+            else { \
+                X##go ^= input[ 8]; \
+                X##gu ^= input[ 9]; \
+                X##ka ^= input[10]; \
+                X##ke ^= input[11]; \
+                if (laneCount < 14) { \
+                    if (laneCount < 13) { \
+                    } \
+                    else { \
+                        X##ki ^= input[12]; \
+                    } \
+                } \
+                else { \
+                    X##ki ^= input[12]; \
+                    X##ko ^= input[13]; \
+                    if (laneCount < 15) { \
+                    } \
+                    else { \
+                        X##ku ^= input[14]; \
+                    } \
+                } \
+            } \
+        } \
+    } \
+    else { /* laneCount >= 16 (and != 21): lanes 0..15 always absorb input */ \
+        X##ba ^= input[ 0]; \
+        X##be ^= input[ 1]; \
+        X##bi ^= input[ 2]; \
+        X##bo ^= input[ 3]; \
+        X##bu ^= input[ 4]; \
+        X##ga ^= input[ 5]; \
+        X##ge ^= input[ 6]; \
+        X##gi ^= input[ 7]; \
+        X##go ^= input[ 8]; \
+        X##gu ^= input[ 9]; \
+        X##ka ^= input[10]; \
+        X##ke ^= input[11]; \
+        X##ki ^= input[12]; \
+        X##ko ^= input[13]; \
+        X##ku ^= input[14]; \
+        X##ma ^= input[15]; \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    if (laneCount < 17) { \
+                    } \
+                    else { \
+                        X##me ^= input[16]; \
+                    } \
+                } \
+                else { \
+                    X##me ^= input[16]; \
+                    X##mi ^= input[17]; \
+                    if (laneCount < 19) { \
+                    } \
+                    else { \
+                        X##mo ^= input[18]; \
+                    } \
+                } \
+            } \
+            else { \
+                X##me ^= input[16]; \
+                X##mi ^= input[17]; \
+                X##mo ^= input[18]; \
+                X##mu ^= input[19]; \
+                if (laneCount < 22) { \
+                    if (laneCount < 21) { \
+                    } \
+                    else { /* not reached at run time: laneCount == 21 is handled by the first branch */ \
+                        X##sa ^= input[20]; \
+                    } \
+                } \
+                else { \
+                    X##sa ^= input[20]; \
+                    X##se ^= input[21]; \
+                    if (laneCount < 23) { \
+                    } \
+                    else { \
+                        X##si ^= input[22]; \
+                    } \
+                } \
+            } \
+        } \
+        else { \
+            X##me ^= input[16]; \
+            X##mi ^= input[17]; \
+            X##mo ^= input[18]; \
+            X##mu ^= input[19]; \
+            X##sa ^= input[20]; \
+            X##se ^= input[21]; \
+            X##si ^= input[22]; \
+            X##so ^= input[23]; \
+            if (laneCount < 25) { \
+            } \
+            else { \
+                X##su ^= input[24]; \
+            } \
+        } \
+    }
+
+#ifdef UseBebigokimisa
+
+#define copyToStateAndOutput(X, state, output, laneCount) /* UseBebigokimisa variant. Always stores all 25 X lanes into state[0..24] unmodified; additionally writes output[k] for k < laneCount, bitwise-complemented (~) for lanes be, bi, go, ki, mi and sa (indices 1, 2, 8, 12, 17, 20) and as-is for the others. output[] is never written at index >= laneCount. Expands to one bare if/else statement (no do-while wrapper); laneCount is evaluated several times and is not parenthesized. */ \
+    if (laneCount < 16) { /* nested range tests narrow laneCount down to its exact value */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    state[ 0] = X##ba; \
+                    if (laneCount >= 1) { \
+                        output[ 0] = X##ba; \
+                    } \
+                    state[ 1] = X##be; \
+                    state[ 2] = X##bi; \
+                } \
+                else { \
+                    state[ 0] = X##ba; \
+                    output[ 0] = X##ba; \
+                    state[ 1] = X##be; \
+                    output[ 1] = ~X##be; /* complemented on output only; state keeps X##be as-is */ \
+                    state[ 2] = X##bi; \
+                    if (laneCount >= 3) { \
+                        output[ 2] = ~X##bi; \
+                    } \
+                } \
+                state[ 3] = X##bo; \
+                state[ 4] = X##bu; \
+                state[ 5] = X##ga; \
+                state[ 6] = X##ge; \
+            } \
+            else { \
+                state[ 0] = X##ba; \
+                output[ 0] = X##ba; \
+                state[ 1] = X##be; \
+                output[ 1] = ~X##be; \
+                state[ 2] = X##bi; \
+                output[ 2] = ~X##bi; \
+                state[ 3] = X##bo; \
+                output[ 3] = X##bo; \
+                if (laneCount < 6) { \
+                    state[ 4] = X##bu; \
+                    if (laneCount >= 5) { \
+                        output[ 4] = X##bu; \
+                    } \
+                    state[ 5] = X##ga; \
+                    state[ 6] = X##ge; \
+                } \
+                else { \
+                    state[ 4] = X##bu; \
+                    output[ 4] = X##bu; \
+                    state[ 5] = X##ga; \
+                    output[ 5] = X##ga; \
+                    state[ 6] = X##ge; \
+                    if (laneCount >= 7) { \
+                        output[ 6] = X##ge; \
+                    } \
+                } \
+            } \
+            state[ 7] = X##gi; /* laneCount < 8: lanes 7..14 go to state only */ \
+            state[ 8] = X##go; \
+            state[ 9] = X##gu; \
+            state[10] = X##ka; \
+            state[11] = X##ke; \
+            state[12] = X##ki; \
+            state[13] = X##ko; \
+            state[14] = X##ku; \
+        } \
+        else { \
+            state[ 0] = X##ba; \
+            output[ 0] = X##ba; \
+            state[ 1] = X##be; \
+            output[ 1] = ~X##be; \
+            state[ 2] = X##bi; \
+            output[ 2] = ~X##bi; \
+            state[ 3] = X##bo; \
+            output[ 3] = X##bo; \
+            state[ 4] = X##bu; \
+            output[ 4] = X##bu; \
+            state[ 5] = X##ga; \
+            output[ 5] = X##ga; \
+            state[ 6] = X##ge; \
+            output[ 6] = X##ge; \
+            state[ 7] = X##gi; \
+            output[ 7] = X##gi; \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    state[ 8] = X##go; \
+                    if (laneCount >= 9) { \
+                        output[ 8] = ~X##go; \
+                    } \
+                    state[ 9] = X##gu; \
+                    state[10] = X##ka; \
+                } \
+                else { \
+                    state[ 8] = X##go; \
+                    output[ 8] = ~X##go; \
+                    state[ 9] = X##gu; \
+                    output[ 9] = X##gu; \
+                    state[10] = X##ka; \
+                    if (laneCount >= 11) { \
+                        output[10] = X##ka; \
+                    } \
+                } \
+                state[11] = X##ke; \
+                state[12] = X##ki; \
+                state[13] = X##ko; \
+                state[14] = X##ku; \
+            } \
+            else { \
+                state[ 8] = X##go; \
+                output[ 8] = ~X##go; \
+                state[ 9] = X##gu; \
+                output[ 9] = X##gu; \
+                state[10] = X##ka; \
+                output[10] = X##ka; \
+                state[11] = X##ke; \
+                output[11] = X##ke; \
+                if (laneCount < 14) { \
+                    state[12] = X##ki; \
+                    if (laneCount >= 13) { \
+                        output[12] = ~X##ki; \
+                    } \
+                    state[13] = X##ko; \
+                    state[14] = X##ku; \
+                } \
+                else { \
+                    state[12] = X##ki; \
+                    output[12] = ~X##ki; \
+                    state[13] = X##ko; \
+                    output[13] = X##ko; \
+                    state[14] = X##ku; \
+                    if (laneCount >= 15) { \
+                        output[14] = X##ku; \
+                    } \
+                } \
+            } \
+        } \
+        state[15] = X##ma; /* laneCount < 16: lanes 15..24 go to state only */ \
+        state[16] = X##me; \
+        state[17] = X##mi; \
+        state[18] = X##mo; \
+        state[19] = X##mu; \
+        state[20] = X##sa; \
+        state[21] = X##se; \
+        state[22] = X##si; \
+        state[23] = X##so; \
+        state[24] = X##su; \
+    } \
+    else { /* laneCount >= 16: lanes 0..15 always go to both state and output */ \
+        state[ 0] = X##ba; \
+        output[ 0] = X##ba; \
+        state[ 1] = X##be; \
+        output[ 1] = ~X##be; \
+        state[ 2] = X##bi; \
+        output[ 2] = ~X##bi; \
+        state[ 3] = X##bo; \
+        output[ 3] = X##bo; \
+        state[ 4] = X##bu; \
+        output[ 4] = X##bu; \
+        state[ 5] = X##ga; \
+        output[ 5] = X##ga; \
+        state[ 6] = X##ge; \
+        output[ 6] = X##ge; \
+        state[ 7] = X##gi; \
+        output[ 7] = X##gi; \
+        state[ 8] = X##go; \
+        output[ 8] = ~X##go; \
+        state[ 9] = X##gu; \
+        output[ 9] = X##gu; \
+        state[10] = X##ka; \
+        output[10] = X##ka; \
+        state[11] = X##ke; \
+        output[11] = X##ke; \
+        state[12] = X##ki; \
+        output[12] = ~X##ki; \
+        state[13] = X##ko; \
+        output[13] = X##ko; \
+        state[14] = X##ku; \
+        output[14] = X##ku; \
+        state[15] = X##ma; \
+        output[15] = X##ma; \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    state[16] = X##me; \
+                    if (laneCount >= 17) { \
+                        output[16] = X##me; \
+                    } \
+                    state[17] = X##mi; \
+                    state[18] = X##mo; \
+                } \
+                else { \
+                    state[16] = X##me; \
+                    output[16] = X##me; \
+                    state[17] = X##mi; \
+                    output[17] = ~X##mi; \
+                    state[18] = X##mo; \
+                    if (laneCount >= 19) { \
+                        output[18] = X##mo; \
+                    } \
+                } \
+                state[19] = X##mu; \
+                state[20] = X##sa; \
+                state[21] = X##se; \
+                state[22] = X##si; \
+            } \
+            else { \
+                state[16] = X##me; \
+                output[16] = X##me; \
+                state[17] = X##mi; \
+                output[17] = ~X##mi; \
+                state[18] = X##mo; \
+                output[18] = X##mo; \
+                state[19] = X##mu; \
+                output[19] = X##mu; \
+                if (laneCount < 22) { \
+                    state[20] = X##sa; \
+                    if (laneCount >= 21) { \
+                        output[20] = ~X##sa; \
+                    } \
+                    state[21] = X##se; \
+                    state[22] = X##si; \
+                } \
+                else { \
+                    state[20] = X##sa; \
+                    output[20] = ~X##sa; \
+                    state[21] = X##se; \
+                    output[21] = X##se; \
+                    state[22] = X##si; \
+                    if (laneCount >= 23) { \
+                        output[22] = X##si; \
+                    } \
+                } \
+            } \
+            state[23] = X##so; \
+            state[24] = X##su; \
+        } \
+        else { \
+            state[16] = X##me; \
+            output[16] = X##me; \
+            state[17] = X##mi; \
+            output[17] = ~X##mi; \
+            state[18] = X##mo; \
+            output[18] = X##mo; \
+            state[19] = X##mu; \
+            output[19] = X##mu; \
+            state[20] = X##sa; \
+            output[20] = ~X##sa; \
+            state[21] = X##se; \
+            output[21] = X##se; \
+            state[22] = X##si; \
+            output[22] = X##si; \
+            state[23] = X##so; \
+            output[23] = X##so; \
+            state[24] = X##su; \
+            if (laneCount >= 25) { /* only a full 25-lane request emits the last lane */ \
+                output[24] = X##su; \
+            } \
+        } \
+    }
+
+#define output(X, output, laneCount) /* Variant defined before the #else tagged UseBebigokimisa: copies lanes 0..laneCount-1 (at most 25) of the X-prefixed lane variables into output[], writing lanes 1, 2, 8, 12, 17 and 20 bitwise-complemented. laneCount is evaluated several times. */ \
+    if (laneCount < 16) { /* binary decision tree on laneCount; each branch writes exactly the requested lanes */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    if (laneCount >= 1) { /* laneCount < 1 writes nothing */ \
+                        output[ 0] = X##ba; \
+                    } \
+                } \
+                else { \
+                    output[ 0] = X##ba; \
+                    output[ 1] = ~X##be; /* complemented on output; presumably the lane is kept complemented in this variant - TODO confirm */ \
+                    if (laneCount >= 3) { \
+                        output[ 2] = ~X##bi; \
+                    } \
+                } \
+            } \
+            else { \
+                output[ 0] = X##ba; \
+                output[ 1] = ~X##be; \
+                output[ 2] = ~X##bi; \
+                output[ 3] = X##bo; \
+                if (laneCount < 6) { \
+                    if (laneCount >= 5) { \
+                        output[ 4] = X##bu; \
+                    } \
+                } \
+                else { \
+                    output[ 4] = X##bu; \
+                    output[ 5] = X##ga; \
+                    if (laneCount >= 7) { \
+                        output[ 6] = X##ge; \
+                    } \
+                } \
+            } \
+        } \
+        else { /* 8 <= laneCount < 16: lanes 0..7 are always written */ \
+            output[ 0] = X##ba; \
+            output[ 1] = ~X##be; \
+            output[ 2] = ~X##bi; \
+            output[ 3] = X##bo; \
+            output[ 4] = X##bu; \
+            output[ 5] = X##ga; \
+            output[ 6] = X##ge; \
+            output[ 7] = X##gi; \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    if (laneCount >= 9) { \
+                        output[ 8] = ~X##go; \
+                    } \
+                } \
+                else { \
+                    output[ 8] = ~X##go; \
+                    output[ 9] = X##gu; \
+                    if (laneCount >= 11) { \
+                        output[10] = X##ka; \
+                    } \
+                } \
+            } \
+            else { \
+                output[ 8] = ~X##go; \
+                output[ 9] = X##gu; \
+                output[10] = X##ka; \
+                output[11] = X##ke; \
+                if (laneCount < 14) { \
+                    if (laneCount >= 13) { \
+                        output[12] = ~X##ki; \
+                    } \
+                } \
+                else { \
+                    output[12] = ~X##ki; \
+                    output[13] = X##ko; \
+                    if (laneCount >= 15) { \
+                        output[14] = X##ku; \
+                    } \
+                } \
+            } \
+        } \
+    } \
+    else { /* laneCount >= 16: lanes 0..15 are always written */ \
+        output[ 0] = X##ba; \
+        output[ 1] = ~X##be; \
+        output[ 2] = ~X##bi; \
+        output[ 3] = X##bo; \
+        output[ 4] = X##bu; \
+        output[ 5] = X##ga; \
+        output[ 6] = X##ge; \
+        output[ 7] = X##gi; \
+        output[ 8] = ~X##go; \
+        output[ 9] = X##gu; \
+        output[10] = X##ka; \
+        output[11] = X##ke; \
+        output[12] = ~X##ki; \
+        output[13] = X##ko; \
+        output[14] = X##ku; \
+        output[15] = X##ma; \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    if (laneCount >= 17) { \
+                        output[16] = X##me; \
+                    } \
+                } \
+                else { \
+                    output[16] = X##me; \
+                    output[17] = ~X##mi; \
+                    if (laneCount >= 19) { \
+                        output[18] = X##mo; \
+                    } \
+                } \
+            } \
+            else { \
+                output[16] = X##me; \
+                output[17] = ~X##mi; \
+                output[18] = X##mo; \
+                output[19] = X##mu; \
+                if (laneCount < 22) { \
+                    if (laneCount >= 21) { \
+                        output[20] = ~X##sa; \
+                    } \
+                } \
+                else { \
+                    output[20] = ~X##sa; \
+                    output[21] = X##se; \
+                    if (laneCount >= 23) { \
+                        output[22] = X##si; \
+                    } \
+                } \
+            } \
+        } \
+        else { \
+            output[16] = X##me; \
+            output[17] = ~X##mi; \
+            output[18] = X##mo; \
+            output[19] = X##mu; \
+            output[20] = ~X##sa; \
+            output[21] = X##se; \
+            output[22] = X##si; \
+            output[23] = X##so; \
+            if (laneCount >= 25) { /* lane 24 is written only when all 25 lanes are requested */ \
+                output[24] = X##su; \
+            } \
+        } \
+    }
+
+#define wrapOne(X, input, output, index, name) /* XOR input[index] into lane X##name, then copy the updated lane to output[index] */ \
+    X##name = X##name ^ (input[index]); \
+    output[index] = X##name;
+
+#define wrapOneInvert(X, input, output, index, name) /* like wrapOne, but output[index] receives the bitwise complement of the updated lane */ \
+    X##name = X##name ^ (input[index]); \
+    output[index] = ~X##name;
+
+#define unwrapOne(X, input, output, index, name) /* output[index] = input[index] ^ lane; the lane is then XORed with that output word */ \
+    output[index] = input[index] ^ X##name; \
+    X##name = X##name ^ (output[index]);
+
+#define unwrapOneInvert(X, input, output, index, name) /* complementing counterpart of unwrapOne: output[index] = ~(input[index] ^ lane), then the lane is XORed with that output word; index is evaluated three times */ \
+    output[index] = ~(input[index] ^ X##name); \
+    X##name ^= output[index]; /* final line: no continuation backslash, so the macro ends here whatever follows */
+
+#else /* UseBebigokimisa */
+
+#define copyToStateAndOutput(X, state, output, laneCount) /* Variant after the #else tagged UseBebigokimisa (no lane is complemented): always stores all 25 X-prefixed lanes into state[0..24] and additionally copies the first laneCount lanes (at most 25) into output[]. laneCount is evaluated several times. */ \
+    if (laneCount < 16) { /* binary decision tree on laneCount; every path writes state[0..24] exactly once */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    state[ 0] = X##ba; \
+                    if (laneCount >= 1) { /* laneCount < 1 leaves output[] untouched */ \
+                        output[ 0] = X##ba; \
+                    } \
+                    state[ 1] = X##be; \
+                    state[ 2] = X##bi; \
+                } \
+                else { \
+                    state[ 0] = X##ba; \
+                    output[ 0] = X##ba; \
+                    state[ 1] = X##be; \
+                    output[ 1] = X##be; \
+                    state[ 2] = X##bi; \
+                    if (laneCount >= 3) { \
+                        output[ 2] = X##bi; \
+                    } \
+                } \
+                state[ 3] = X##bo; /* lanes at or beyond laneCount go to state[] only */ \
+                state[ 4] = X##bu; \
+                state[ 5] = X##ga; \
+                state[ 6] = X##ge; \
+            } \
+            else { \
+                state[ 0] = X##ba; \
+                output[ 0] = X##ba; \
+                state[ 1] = X##be; \
+                output[ 1] = X##be; \
+                state[ 2] = X##bi; \
+                output[ 2] = X##bi; \
+                state[ 3] = X##bo; \
+                output[ 3] = X##bo; \
+                if (laneCount < 6) { \
+                    state[ 4] = X##bu; \
+                    if (laneCount >= 5) { \
+                        output[ 4] = X##bu; \
+                    } \
+                    state[ 5] = X##ga; \
+                    state[ 6] = X##ge; \
+                } \
+                else { \
+                    state[ 4] = X##bu; \
+                    output[ 4] = X##bu; \
+                    state[ 5] = X##ga; \
+                    output[ 5] = X##ga; \
+                    state[ 6] = X##ge; \
+                    if (laneCount >= 7) { \
+                        output[ 6] = X##ge; \
+                    } \
+                } \
+            } \
+            state[ 7] = X##gi; /* shared tail for laneCount < 8 */ \
+            state[ 8] = X##go; \
+            state[ 9] = X##gu; \
+            state[10] = X##ka; \
+            state[11] = X##ke; \
+            state[12] = X##ki; \
+            state[13] = X##ko; \
+            state[14] = X##ku; \
+        } \
+        else { \
+            state[ 0] = X##ba; \
+            output[ 0] = X##ba; \
+            state[ 1] = X##be; \
+            output[ 1] = X##be; \
+            state[ 2] = X##bi; \
+            output[ 2] = X##bi; \
+            state[ 3] = X##bo; \
+            output[ 3] = X##bo; \
+            state[ 4] = X##bu; \
+            output[ 4] = X##bu; \
+            state[ 5] = X##ga; \
+            output[ 5] = X##ga; \
+            state[ 6] = X##ge; \
+            output[ 6] = X##ge; \
+            state[ 7] = X##gi; \
+            output[ 7] = X##gi; \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    state[ 8] = X##go; \
+                    if (laneCount >= 9) { \
+                        output[ 8] = X##go; \
+                    } \
+                    state[ 9] = X##gu; \
+                    state[10] = X##ka; \
+                } \
+                else { \
+                    state[ 8] = X##go; \
+                    output[ 8] = X##go; \
+                    state[ 9] = X##gu; \
+                    output[ 9] = X##gu; \
+                    state[10] = X##ka; \
+                    if (laneCount >= 11) { \
+                        output[10] = X##ka; \
+                    } \
+                } \
+                state[11] = X##ke; \
+                state[12] = X##ki; \
+                state[13] = X##ko; \
+                state[14] = X##ku; \
+            } \
+            else { \
+                state[ 8] = X##go; \
+                output[ 8] = X##go; \
+                state[ 9] = X##gu; \
+                output[ 9] = X##gu; \
+                state[10] = X##ka; \
+                output[10] = X##ka; \
+                state[11] = X##ke; \
+                output[11] = X##ke; \
+                if (laneCount < 14) { \
+                    state[12] = X##ki; \
+                    if (laneCount >= 13) { \
+                        output[12]= X##ki; \
+                    } \
+                    state[13] = X##ko; \
+                    state[14] = X##ku; \
+                } \
+                else { \
+                    state[12] = X##ki; \
+                    output[12]= X##ki; \
+                    state[13] = X##ko; \
+                    output[13] = X##ko; \
+                    state[14] = X##ku; \
+                    if (laneCount >= 15) { \
+                        output[14] = X##ku; \
+                    } \
+                } \
+            } \
+        } \
+        state[15] = X##ma; /* shared tail for laneCount < 16: lanes 15..24 go to state[] only */ \
+        state[16] = X##me; \
+        state[17] = X##mi; \
+        state[18] = X##mo; \
+        state[19] = X##mu; \
+        state[20] = X##sa; \
+        state[21] = X##se; \
+        state[22] = X##si; \
+        state[23] = X##so; \
+        state[24] = X##su; \
+    } \
+    else { /* laneCount >= 16: lanes 0..15 go to both state[] and output[] */ \
+        state[ 0] = X##ba; \
+        output[ 0] = X##ba; \
+        state[ 1] = X##be; \
+        output[ 1] = X##be; \
+        state[ 2] = X##bi; \
+        output[ 2] = X##bi; \
+        state[ 3] = X##bo; \
+        output[ 3] = X##bo; \
+        state[ 4] = X##bu; \
+        output[ 4] = X##bu; \
+        state[ 5] = X##ga; \
+        output[ 5] = X##ga; \
+        state[ 6] = X##ge; \
+        output[ 6] = X##ge; \
+        state[ 7] = X##gi; \
+        output[ 7] = X##gi; \
+        state[ 8] = X##go; \
+        output[ 8] = X##go; \
+        state[ 9] = X##gu; \
+        output[ 9] = X##gu; \
+        state[10] = X##ka; \
+        output[10] = X##ka; \
+        state[11] = X##ke; \
+        output[11] = X##ke; \
+        state[12] = X##ki; \
+        output[12]= X##ki; \
+        state[13] = X##ko; \
+        output[13] = X##ko; \
+        state[14] = X##ku; \
+        output[14] = X##ku; \
+        state[15] = X##ma; \
+        output[15] = X##ma; \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    state[16] = X##me; \
+                    if (laneCount >= 17) { \
+                        output[16] = X##me; \
+                    } \
+                    state[17] = X##mi; \
+                    state[18] = X##mo; \
+                } \
+                else { \
+                    state[16] = X##me; \
+                    output[16] = X##me; \
+                    state[17] = X##mi; \
+                    output[17] = X##mi; \
+                    state[18] = X##mo; \
+                    if (laneCount >= 19) { \
+                        output[18] = X##mo; \
+                    } \
+                } \
+                state[19] = X##mu; \
+                state[20] = X##sa; \
+                state[21] = X##se; \
+                state[22] = X##si; \
+            } \
+            else { \
+                state[16] = X##me; \
+                output[16] = X##me; \
+                state[17] = X##mi; \
+                output[17] = X##mi; \
+                state[18] = X##mo; \
+                output[18] = X##mo; \
+                state[19] = X##mu; \
+                output[19] = X##mu; \
+                if (laneCount < 22) { \
+                    state[20] = X##sa; \
+                    if (laneCount >= 21) { \
+                        output[20] = X##sa; \
+                    } \
+                    state[21] = X##se; \
+                    state[22] = X##si; \
+                } \
+                else { \
+                    state[20] = X##sa; \
+                    output[20] = X##sa; \
+                    state[21] = X##se; \
+                    output[21] = X##se; \
+                    state[22] = X##si; \
+                    if (laneCount >= 23) { \
+                        output[22] = X##si; \
+                    } \
+                } \
+            } \
+            state[23] = X##so; /* shared tail for 16 <= laneCount < 24 */ \
+            state[24] = X##su; \
+        } \
+        else { \
+            state[16] = X##me; \
+            output[16] = X##me; \
+            state[17] = X##mi; \
+            output[17] = X##mi; \
+            state[18] = X##mo; \
+            output[18] = X##mo; \
+            state[19] = X##mu; \
+            output[19] = X##mu; \
+            state[20] = X##sa; \
+            output[20] = X##sa; \
+            state[21] = X##se; \
+            output[21] = X##se; \
+            state[22] = X##si; \
+            output[22] = X##si; \
+            state[23] = X##so; \
+            output[23] = X##so; \
+            state[24] = X##su; \
+            if (laneCount >= 25) { /* output[24] is written only when all 25 lanes are requested */ \
+                output[24] = X##su; \
+            } \
+        } \
+    }
+
+#define output(X, output, laneCount) /* Variant after the #else tagged UseBebigokimisa: copies lanes 0..laneCount-1 (at most 25) of the X-prefixed lane variables into output[] unchanged. laneCount is evaluated several times. */ \
+    if (laneCount < 16) { /* binary decision tree on laneCount; each branch writes exactly the requested lanes */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    if (laneCount >= 1) { /* laneCount < 1 writes nothing */ \
+                        output[ 0] = X##ba; \
+                    } \
+                } \
+                else { \
+                    output[ 0] = X##ba; \
+                    output[ 1] = X##be; \
+                    if (laneCount >= 3) { \
+                        output[ 2] = X##bi; \
+                    } \
+                } \
+            } \
+            else { \
+                output[ 0] = X##ba; \
+                output[ 1] = X##be; \
+                output[ 2] = X##bi; \
+                output[ 3] = X##bo; \
+                if (laneCount < 6) { \
+                    if (laneCount >= 5) { \
+                        output[ 4] = X##bu; \
+                    } \
+                } \
+                else { \
+                    output[ 4] = X##bu; \
+                    output[ 5] = X##ga; \
+                    if (laneCount >= 7) { \
+                        output[ 6] = X##ge; \
+                    } \
+                } \
+            } \
+        } \
+        else { /* 8 <= laneCount < 16: lanes 0..7 are always written */ \
+            output[ 0] = X##ba; \
+            output[ 1] = X##be; \
+            output[ 2] = X##bi; \
+            output[ 3] = X##bo; \
+            output[ 4] = X##bu; \
+            output[ 5] = X##ga; \
+            output[ 6] = X##ge; \
+            output[ 7] = X##gi; \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    if (laneCount >= 9) { \
+                        output[ 8] = X##go; \
+                    } \
+                } \
+                else { \
+                    output[ 8] = X##go; \
+                    output[ 9] = X##gu; \
+                    if (laneCount >= 11) { \
+                        output[10] = X##ka; \
+                    } \
+                } \
+            } \
+            else { \
+                output[ 8] = X##go; \
+                output[ 9] = X##gu; \
+                output[10] = X##ka; \
+                output[11] = X##ke; \
+                if (laneCount < 14) { \
+                    if (laneCount >= 13) { \
+                        output[12] = X##ki; \
+                    } \
+                } \
+                else { \
+                    output[12] = X##ki; \
+                    output[13] = X##ko; \
+                    if (laneCount >= 15) { \
+                        output[14] = X##ku; \
+                    } \
+                } \
+            } \
+        } \
+    } \
+    else { /* laneCount >= 16: lanes 0..15 are always written */ \
+        output[ 0] = X##ba; \
+        output[ 1] = X##be; \
+        output[ 2] = X##bi; \
+        output[ 3] = X##bo; \
+        output[ 4] = X##bu; \
+        output[ 5] = X##ga; \
+        output[ 6] = X##ge; \
+        output[ 7] = X##gi; \
+        output[ 8] = X##go; \
+        output[ 9] = X##gu; \
+        output[10] = X##ka; \
+        output[11] = X##ke; \
+        output[12] = X##ki; \
+        output[13] = X##ko; \
+        output[14] = X##ku; \
+        output[15] = X##ma; \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    if (laneCount >= 17) { \
+                        output[16] = X##me; \
+                    } \
+                } \
+                else { \
+                    output[16] = X##me; \
+                    output[17] = X##mi; \
+                    if (laneCount >= 19) { \
+                        output[18] = X##mo; \
+                    } \
+                } \
+            } \
+            else { \
+                output[16] = X##me; \
+                output[17] = X##mi; \
+                output[18] = X##mo; \
+                output[19] = X##mu; \
+                if (laneCount < 22) { \
+                    if (laneCount >= 21) { \
+                        output[20] = X##sa; \
+                    } \
+                } \
+                else { \
+                    output[20] = X##sa; \
+                    output[21] = X##se; \
+                    if (laneCount >= 23) { \
+                        output[22] = X##si; \
+                    } \
+                } \
+            } \
+        } \
+        else { \
+            output[16] = X##me; \
+            output[17] = X##mi; \
+            output[18] = X##mo; \
+            output[19] = X##mu; \
+            output[20] = X##sa; \
+            output[21] = X##se; \
+            output[22] = X##si; \
+            output[23] = X##so; \
+            if (laneCount >= 25) { /* lane 24 is written only when all 25 lanes are requested */ \
+                output[24] = X##su; \
+            } \
+        } \
+    }
+
+#define wrapOne(X, input, output, index, name) /* XOR input[index] into lane X##name, then copy the updated lane to output[index] */ \
+    X##name = X##name ^ (input[index]); \
+    output[index] = X##name;
+
+/* In this branch no lane is complemented, so the "Invert" form is exactly wrapOne. */
+#define wrapOneInvert(X, input, output, index, name) \
+    wrapOne(X, input, output, index, name)
+
+#define unwrapOne(X, input, output, index, name) /* output[index] = input[index] ^ lane; the lane is then XORed with that output word */ \
+    output[index] = input[index] ^ X##name; \
+    X##name = X##name ^ (output[index]);
+
+/* In this branch no lane is complemented, so the "Invert" form is exactly unwrapOne. */
+#define unwrapOneInvert(X, input, output, index, name) \
+    unwrapOne(X, input, output, index, name)
+
+#endif
+
+#define wrap(X, input, output, laneCount, trailingBits) /* For each lane i < laneCount (at most 25) applies wrapOne, or wrapOneInvert for lanes 1, 2, 8, 12, 17 and 20, to XOR input[i] into the X-prefixed lane and emit output[i]; then, when laneCount < 25, XORs trailingBits into lane number laneCount. laneCount is evaluated several times. */ \
+    if (laneCount < 16) { /* binary decision tree on laneCount; every path wraps lanes in ascending order */ \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    if (laneCount < 1) { \
+                        X##ba ^= trailingBits; /* no full lane to wrap: trailingBits go into lane 0 */ \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 0, ba) \
+                        X##be ^= trailingBits; /* trailingBits always go into the first lane that was not wrapped */ \
+                    } \
+                } \
+                else { \
+                    wrapOne(X, input, output, 0, ba) \
+                    wrapOneInvert(X, input, output, 1, be) \
+                    if (laneCount < 3) { \
+                        X##bi ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOneInvert(X, input, output, 2, bi) \
+                        X##bo ^= trailingBits; \
+                    } \
+                } \
+            } \
+            else { \
+                wrapOne(X, input, output, 0, ba) \
+                wrapOneInvert(X, input, output, 1, be) \
+                wrapOneInvert(X, input, output, 2, bi) \
+                wrapOne(X, input, output, 3, bo) \
+                if (laneCount < 6) { \
+                    if (laneCount < 5) { \
+                        X##bu ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 4, bu) \
+                        X##ga ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    wrapOne(X, input, output, 4, bu) \
+                    wrapOne(X, input, output, 5, ga) \
+                    if (laneCount < 7) { \
+                        X##ge ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 6, ge) \
+                        X##gi ^= trailingBits; \
+                    } \
+                } \
+            } \
+        } \
+        else { /* 8 <= laneCount < 16: lanes 0..7 are always wrapped */ \
+            wrapOne(X, input, output, 0, ba) \
+            wrapOneInvert(X, input, output, 1, be) \
+            wrapOneInvert(X, input, output, 2, bi) \
+            wrapOne(X, input, output, 3, bo) \
+            wrapOne(X, input, output, 4, bu) \
+            wrapOne(X, input, output, 5, ga) \
+            wrapOne(X, input, output, 6, ge) \
+            wrapOne(X, input, output, 7, gi) \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    if (laneCount < 9) { \
+                        X##go ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOneInvert(X, input, output, 8, go) \
+                        X##gu ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    wrapOneInvert(X, input, output, 8, go) \
+                    wrapOne(X, input, output, 9, gu) \
+                    if (laneCount < 11) { \
+                        X##ka ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 10, ka) \
+                        X##ke ^= trailingBits; \
+                    } \
+                } \
+            } \
+            else { \
+                wrapOneInvert(X, input, output, 8, go) \
+                wrapOne(X, input, output, 9, gu) \
+                wrapOne(X, input, output, 10, ka) \
+                wrapOne(X, input, output, 11, ke) \
+                if (laneCount < 14) { \
+                    if (laneCount < 13) { \
+                        X##ki ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOneInvert(X, input, output, 12, ki) \
+                        X##ko ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    wrapOneInvert(X, input, output, 12, ki) \
+                    wrapOne(X, input, output, 13, ko) \
+                    if (laneCount < 15) { \
+                        X##ku ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 14, ku) \
+                        X##ma ^= trailingBits; \
+                    } \
+                } \
+            } \
+        } \
+    } \
+    else { /* laneCount >= 16: lanes 0..15 are always wrapped */ \
+        wrapOne(X, input, output, 0, ba) \
+        wrapOneInvert(X, input, output, 1, be) \
+        wrapOneInvert(X, input, output, 2, bi) \
+        wrapOne(X, input, output, 3, bo) \
+        wrapOne(X, input, output, 4, bu) \
+        wrapOne(X, input, output, 5, ga) \
+        wrapOne(X, input, output, 6, ge) \
+        wrapOne(X, input, output, 7, gi) \
+        wrapOneInvert(X, input, output, 8, go) \
+        wrapOne(X, input, output, 9, gu) \
+        wrapOne(X, input, output, 10, ka) \
+        wrapOne(X, input, output, 11, ke) \
+        wrapOneInvert(X, input, output, 12, ki) \
+        wrapOne(X, input, output, 13, ko) \
+        wrapOne(X, input, output, 14, ku) \
+        wrapOne(X, input, output, 15, ma) \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    if (laneCount < 17) { \
+                        X##me ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 16, me) \
+                        X##mi ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    wrapOne(X, input, output, 16, me) \
+                    wrapOneInvert(X, input, output, 17, mi) \
+                    if (laneCount < 19) { \
+                        X##mo ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 18, mo) \
+                        X##mu ^= trailingBits; \
+                    } \
+                } \
+            } \
+            else { \
+                wrapOne(X, input, output, 16, me) \
+                wrapOneInvert(X, input, output, 17, mi) \
+                wrapOne(X, input, output, 18, mo) \
+                wrapOne(X, input, output, 19, mu) \
+                if (laneCount < 22) { \
+                    if (laneCount < 21) { \
+                        X##sa ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOneInvert(X, input, output, 20, sa) \
+                        X##se ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    wrapOneInvert(X, input, output, 20, sa) \
+                    wrapOne(X, input, output, 21, se) \
+                    if (laneCount < 23) { \
+                        X##si ^= trailingBits; \
+                    } \
+                    else { \
+                        wrapOne(X, input, output, 22, si) \
+                        X##so ^= trailingBits; \
+                    } \
+                } \
+            } \
+        } \
+        else { \
+            wrapOne(X, input, output, 16, me) \
+            wrapOneInvert(X, input, output, 17, mi) \
+            wrapOne(X, input, output, 18, mo) \
+            wrapOne(X, input, output, 19, mu) \
+            wrapOneInvert(X, input, output, 20, sa) \
+            wrapOne(X, input, output, 21, se) \
+            wrapOne(X, input, output, 22, si) \
+            wrapOne(X, input, output, 23, so) \
+            if (laneCount < 25) { \
+                X##su ^= trailingBits; \
+            } \
+            else { /* laneCount >= 25: all 25 lanes are wrapped and trailingBits is not used */ \
+                wrapOne(X, input, output, 24, su) \
+            } \
+        } \
+    }
+
+#define unwrap(X, input, output, laneCount, trailingBits) \
+    if (laneCount < 16) { \
+        if (laneCount < 8) { \
+            if (laneCount < 4) { \
+                if (laneCount < 2) { \
+                    if (laneCount < 1) { \
+                        X##ba ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 0, ba) \
+                        X##be ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    unwrapOne(X, input, output, 0, ba) \
+                    unwrapOneInvert(X, input, output, 1, be) \
+                    if (laneCount < 3) { \
+                        X##bi ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOneInvert(X, input, output, 2, bi) \
+                        X##bo ^= trailingBits; \
+                    } \
+                } \
+            } \
+            else { \
+                unwrapOne(X, input, output, 0, ba) \
+                unwrapOneInvert(X, input, output, 1, be) \
+                unwrapOneInvert(X, input, output, 2, bi) \
+                unwrapOne(X, input, output, 3, bo) \
+                if (laneCount < 6) { \
+                    if (laneCount < 5) { \
+                        X##bu ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 4, bu) \
+                        X##ga ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    unwrapOne(X, input, output, 4, bu) \
+                    unwrapOne(X, input, output, 5, ga) \
+                    if (laneCount < 7) { \
+                        X##ge ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 6, ge) \
+                        X##gi ^= trailingBits; \
+                    } \
+                } \
+            } \
+        } \
+        else { \
+            unwrapOne(X, input, output, 0, ba) \
+            unwrapOneInvert(X, input, output, 1, be) \
+            unwrapOneInvert(X, input, output, 2, bi) \
+            unwrapOne(X, input, output, 3, bo) \
+            unwrapOne(X, input, output, 4, bu) \
+            unwrapOne(X, input, output, 5, ga) \
+            unwrapOne(X, input, output, 6, ge) \
+            unwrapOne(X, input, output, 7, gi) \
+            if (laneCount < 12) { \
+                if (laneCount < 10) { \
+                    if (laneCount < 9) { \
+                        X##go ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOneInvert(X, input, output, 8, go) \
+                        X##gu ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    unwrapOneInvert(X, input, output, 8, go) \
+                    unwrapOne(X, input, output, 9, gu) \
+                    if (laneCount < 11) { \
+                        X##ka ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 10, ka) \
+                        X##ke ^= trailingBits; \
+                    } \
+                } \
+            } \
+            else { \
+                unwrapOneInvert(X, input, output, 8, go) \
+                unwrapOne(X, input, output, 9, gu) \
+                unwrapOne(X, input, output, 10, ka) \
+                unwrapOne(X, input, output, 11, ke) \
+                if (laneCount < 14) { \
+                    if (laneCount < 13) { \
+                        X##ki ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOneInvert(X, input, output, 12, ki) \
+                        X##ko ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    unwrapOneInvert(X, input, output, 12, ki) \
+                    unwrapOne(X, input, output, 13, ko) \
+                    if (laneCount < 15) { \
+                        X##ku ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 14, ku) \
+                        X##ma ^= trailingBits; \
+                    } \
+                } \
+            } \
+        } \
+    } \
+    else { \
+        unwrapOne(X, input, output, 0, ba) \
+        unwrapOneInvert(X, input, output, 1, be) \
+        unwrapOneInvert(X, input, output, 2, bi) \
+        unwrapOne(X, input, output, 3, bo) \
+        unwrapOne(X, input, output, 4, bu) \
+        unwrapOne(X, input, output, 5, ga) \
+        unwrapOne(X, input, output, 6, ge) \
+        unwrapOne(X, input, output, 7, gi) \
+        unwrapOneInvert(X, input, output, 8, go) \
+        unwrapOne(X, input, output, 9, gu) \
+        unwrapOne(X, input, output, 10, ka) \
+        unwrapOne(X, input, output, 11, ke) \
+        unwrapOneInvert(X, input, output, 12, ki) \
+        unwrapOne(X, input, output, 13, ko) \
+        unwrapOne(X, input, output, 14, ku) \
+        unwrapOne(X, input, output, 15, ma) \
+        if (laneCount < 24) { \
+            if (laneCount < 20) { \
+                if (laneCount < 18) { \
+                    if (laneCount < 17) { \
+                        X##me ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 16, me) \
+                        X##mi ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    unwrapOne(X, input, output, 16, me) \
+                    unwrapOneInvert(X, input, output, 17, mi) \
+                    if (laneCount < 19) { \
+                        X##mo ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 18, mo) \
+                        X##mu ^= trailingBits; \
+                    } \
+                } \
+            } \
+            else { \
+                unwrapOne(X, input, output, 16, me) \
+                unwrapOneInvert(X, input, output, 17, mi) \
+                unwrapOne(X, input, output, 18, mo) \
+                unwrapOne(X, input, output, 19, mu) \
+                if (laneCount < 22) { \
+                    if (laneCount < 21) { \
+                        X##sa ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOneInvert(X, input, output, 20, sa) \
+                        X##se ^= trailingBits; \
+                    } \
+                } \
+                else { \
+                    unwrapOneInvert(X, input, output, 20, sa) \
+                    unwrapOne(X, input, output, 21, se) \
+                    if (laneCount < 23) { \
+                        X##si ^= trailingBits; \
+                    } \
+                    else { \
+                        unwrapOne(X, input, output, 22, si) \
+                        X##so ^= trailingBits; \
+                    } \
+                } \
+            } \
+        } \
+        else { \
+            unwrapOne(X, input, output, 16, me) \
+            unwrapOneInvert(X, input, output, 17, mi) \
+            unwrapOne(X, input, output, 18, mo) \
+            unwrapOne(X, input, output, 19, mu) \
+            unwrapOneInvert(X, input, output, 20, sa) \
+            unwrapOne(X, input, output, 21, se) \
+            unwrapOne(X, input, output, 22, si) \
+            unwrapOne(X, input, output, 23, so) \
+            if (laneCount < 25) { \
+                X##su ^= trailingBits; \
+            } \
+            else { \
+                unwrapOne(X, input, output, 24, su) \
+            } \
+        } \
+    }
diff --git a/ext/hash/sha3/generic64lc/KeccakP-1600-SnP.h b/ext/hash/sha3/generic64lc/KeccakP-1600-SnP.h
new file mode 100644 (file)
index 0000000..85c67ca
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakP_1600_SnP_h_
+#define _KeccakP_1600_SnP_h_
+
+/** For the documentation, see SnP-documentation.h.
+ */
+
+#include "brg_endian.h"
+#include "KeccakP-1600-opt64-config.h"
+
+#define KeccakP1600_implementation      "generic 64-bit optimized implementation (" KeccakP1600_implementation_config ")"
+#define KeccakP1600_stateSizeInBytes    200
+#define KeccakP1600_stateAlignment      8
+#define KeccakF1600_FastLoop_supported
+
+#include <stddef.h>
+
+/* Expands to nothing in this implementation. */
+#define KeccakP1600_StaticInitialize()
+/* Resets the 200-byte state; defined in KeccakP-1600-opt64.c. */
+void KeccakP1600_Initialize(void *state);
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+/* Little-endian: XOR one byte straight into the state at byte index `offset`. */
+#define KeccakP1600_AddByte(state, byte, offset) \
+    ((unsigned char*)(state))[(offset)] ^= (byte)
+#else
+/* Other byte orders: out-of-line function defined in KeccakP-1600-opt64.c. */
+void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset);
+#endif
+/* Byte-range operations on the state; `offset` and `length` are in bytes.
+ * Each one is implemented in KeccakP-1600-opt64.c on top of a SnP_* helper macro. */
+void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount);
+/* Permutation entry points: a caller-chosen number of rounds, 12 rounds, or 24 rounds. */
+void KeccakP1600_Permute_Nrounds(void *state, unsigned int nrounds);
+void KeccakP1600_Permute_12rounds(void *state);
+void KeccakP1600_Permute_24rounds(void *state);
+/* Read bytes out of the state, either as-is or XORed with `input` into `output`. */
+void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
+void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length);
+/* Absorbs whole blocks of laneCount*8 bytes and returns the number of bytes consumed. */
+size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
+
+#endif
diff --git a/ext/hash/sha3/generic64lc/KeccakP-1600-opt64-config.h b/ext/hash/sha3/generic64lc/KeccakP-1600-opt64-config.h
new file mode 100644 (file)
index 0000000..9501c64
--- /dev/null
@@ -0,0 +1,3 @@
+/* Description string; KeccakP-1600-SnP.h pastes it into KeccakP1600_implementation. */
+#define KeccakP1600_implementation_config "lane complementing, all rounds unrolled"
+/* Makes KeccakP-1600-opt64.c define FullUnrolling before it includes the unrolling macros. */
+#define KeccakP1600_fullUnrolling
+/* Enables the lane-complementing code paths: lanes 1, 2, 8, 12, 17 and 20 are kept inverted in the state. */
+#define KeccakP1600_useLaneComplementing
diff --git a/ext/hash/sha3/generic64lc/KeccakP-1600-opt64.c b/ext/hash/sha3/generic64lc/KeccakP-1600-opt64.c
new file mode 100644 (file)
index 0000000..40853ff
--- /dev/null
@@ -0,0 +1,484 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include "brg_endian.h"
+#include "KeccakP-1600-opt64-config.h"
+
+typedef unsigned char UINT8;
+typedef unsigned long long int UINT64;
+
+#if defined(KeccakP1600_useLaneComplementing)
+#define UseBebigokimisa
+#endif
+
+#if defined(_MSC_VER)
+#define ROL64(a, offset) _rotl64(a, offset)
+#elif defined(KeccakP1600_useSHLD)
+    #define ROL64(x,N) ({ \
+    register UINT64 __out; \
+    register UINT64 __in = x; \
+    __asm__ ("shld %2,%0,%0" : "=r"(__out) : "0"(__in), "i"(N)); \
+    __out; \
+    })
+#else
+#define ROL64(a, offset) ((((UINT64)a) << offset) ^ (((UINT64)a) >> (64-offset)))
+#endif
+
+#include "KeccakP-1600-64.macros"
+#ifdef KeccakP1600_fullUnrolling
+#define FullUnrolling
+#else
+#define Unrolling KeccakP1600_unrolling
+#endif
+#include "KeccakP-1600-unrolling.macros"
+#include "SnP-Relaned.h"
+
+/*
+ * Table of 24 64-bit constants, one entry per round of the 24-round
+ * permutation.  It is not referenced directly in this file; presumably the
+ * round macros from the included .macros files index it by round number.
+ */
+static const UINT64 KeccakF1600RoundConstants[24] = {
+    0x0000000000000001ULL,
+    0x0000000000008082ULL,
+    0x800000000000808aULL,
+    0x8000000080008000ULL,
+    0x000000000000808bULL,
+    0x0000000080000001ULL,
+    0x8000000080008081ULL,
+    0x8000000000008009ULL,
+    0x000000000000008aULL,
+    0x0000000000000088ULL,
+    0x0000000080008009ULL,
+    0x000000008000000aULL,
+    0x000000008000808bULL,
+    0x800000000000008bULL,
+    0x8000000000008089ULL,
+    0x8000000000008003ULL,
+    0x8000000000008002ULL,
+    0x8000000000000080ULL,
+    0x000000000000800aULL,
+    0x800000008000000aULL,
+    0x8000000080008081ULL,
+    0x8000000000008080ULL,
+    0x0000000080000001ULL,
+    0x8000000080008008ULL };
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Resets the 200-byte state to its initial value: all zero bytes, except that
+ * with lane complementing enabled lanes 1, 2, 8, 12, 17 and 20 are set to
+ * all ones (the inverted representation of zero).
+ */
+void KeccakP1600_Initialize(void *state)
+{
+#ifdef KeccakP1600_useLaneComplementing
+    static const unsigned int invertedLanes[6] = { 1, 2, 8, 12, 17, 20 };
+    UINT64 *lanes = (UINT64*)state;
+    unsigned int k;
+#endif
+
+    memset(state, 0, 200);
+#ifdef KeccakP1600_useLaneComplementing
+    for(k=0; k<6; k++)
+        lanes[invertedLanes[k]] = ~(UINT64)0;
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * XORs `length` bytes from `data` into lane `lanePosition` of the state,
+ * starting at byte `offset` inside that lane.  The bytes are first assembled
+ * into a 64-bit word with data[0] in the least significant position, then
+ * shifted into place.  On little-endian builds a zero length returns early.
+ */
+void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    UINT64 *lanes = (UINT64*)state;
+    UINT64 partial = 0;
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    if (length == 0)
+        return;
+    if (length > 1)
+        memcpy(&partial, data, length);
+    else
+        partial = data[0];
+    partial <<= offset*8;
+#else
+    unsigned int k;
+    for(k=0; k<length; k++)
+        partial |= ((UINT64)data[k]) << ((k+offset)*8);
+#endif
+    lanes[lanePosition] ^= partial;
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * XORs the first `laneCount` 64-bit lanes of `data` into the state.
+ *
+ * Little-endian builds XOR whole 64-bit words, manually unrolled by 8, 4, 2
+ * and 1 lanes.  When NO_MISALIGNED_ACCESSES is defined and either pointer is
+ * not 8-byte aligned, the XOR is done byte by byte instead.
+ * Other byte orders assemble each lane from 8 input bytes, least significant
+ * byte first.
+ *
+ * `data` is only read, so it is accessed through const-qualified pointers.
+ */
+void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    unsigned int i = 0;
+#ifdef NO_MISALIGNED_ACCESSES
+    /* If either pointer is misaligned, fall back to byte-wise xor. */
+    if (((((uintptr_t)state) & 7) != 0) || ((((uintptr_t)data) & 7) != 0)) {
+      for (i = 0; i < laneCount * 8; i++) {
+        ((unsigned char*)state)[i] ^= data[i];
+      }
+    }
+    else
+#endif
+    {
+      /* Otherwise work on whole lanes. */
+      UINT64 *lanes = (UINT64*)state;
+      const UINT64 *in = (const UINT64*)data;
+
+      for( ; (i+8)<=laneCount; i+=8) {
+          lanes[i+0] ^= in[i+0];
+          lanes[i+1] ^= in[i+1];
+          lanes[i+2] ^= in[i+2];
+          lanes[i+3] ^= in[i+3];
+          lanes[i+4] ^= in[i+4];
+          lanes[i+5] ^= in[i+5];
+          lanes[i+6] ^= in[i+6];
+          lanes[i+7] ^= in[i+7];
+      }
+      for( ; (i+4)<=laneCount; i+=4) {
+          lanes[i+0] ^= in[i+0];
+          lanes[i+1] ^= in[i+1];
+          lanes[i+2] ^= in[i+2];
+          lanes[i+3] ^= in[i+3];
+      }
+      for( ; (i+2)<=laneCount; i+=2) {
+          lanes[i+0] ^= in[i+0];
+          lanes[i+1] ^= in[i+1];
+      }
+      if (i<laneCount) {
+          lanes[i+0] ^= in[i+0];
+      }
+    }
+#else
+    unsigned int i;
+    const UINT8 *curData = data;
+    for(i=0; i<laneCount; i++, curData+=8) {
+        UINT64 lane = (UINT64)curData[0]
+            | ((UINT64)curData[1] << 8)
+            | ((UINT64)curData[2] << 16)
+            | ((UINT64)curData[3] << 24)
+            | ((UINT64)curData[4] <<32)
+            | ((UINT64)curData[5] << 40)
+            | ((UINT64)curData[6] << 48)
+            | ((UINT64)curData[7] << 56);
+        ((UINT64*)state)[i] ^= lane;
+    }
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+/*
+ * Non-little-endian builds only: XORs a single byte into the state at byte
+ * position `offset`, i.e. into lane offset/8 at byte offset%8 of that lane.
+ */
+void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset)
+{
+    UINT64 *lanes = (UINT64*)state;
+
+    lanes[offset/8] ^= ((UINT64)byte) << ((offset%8)*8);
+}
+#endif
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * XORs `length` bytes of `data` into the state starting at byte `offset`.
+ * The work is delegated to the SnP_AddBytes macro (from SnP-Relaned.h, not
+ * visible here), parameterized with the whole-lane and in-lane helpers of
+ * this file and a lane size of 8 bytes.
+ */
+void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    SnP_AddBytes(state, data, offset, length, KeccakP1600_AddLanes, KeccakP1600_AddBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_OverwriteBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#ifdef KeccakP1600_useLaneComplementing
+    if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20)) {
+        unsigned int i;
+        for(i=0; i<length; i++)
+            ((unsigned char*)state)[lanePosition*8+offset+i] = ~data[i];
+    }
+    else
+#endif
+    {
+        memcpy((unsigned char*)state+lanePosition*8+offset, data, length);
+    }
+#else
+#error "Not yet implemented"
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_OverwriteLanes(void *state, const unsigned char *data, unsigned int laneCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#ifdef KeccakP1600_useLaneComplementing
+    unsigned int lanePosition;
+
+    for(lanePosition=0; lanePosition<laneCount; lanePosition++)
+        if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
+            ((UINT64*)state)[lanePosition] = ~((const UINT64*)data)[lanePosition];
+        else
+            ((UINT64*)state)[lanePosition] = ((const UINT64*)data)[lanePosition];
+#else
+    memcpy(state, data, laneCount*8);
+#endif
+#else
+#error "Not yet implemented"
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Overwrites `length` bytes of the state, starting at byte `offset`, with
+ * `data`.  The work is delegated to the SnP_OverwriteBytes macro (from
+ * SnP-Relaned.h, not visible here), parameterized with the whole-lane and
+ * in-lane helpers of this file and a lane size of 8 bytes.
+ */
+void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
+{
+    SnP_OverwriteBytes(state, data, offset, length, KeccakP1600_OverwriteLanes, KeccakP1600_OverwriteBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount)
+{
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#ifdef KeccakP1600_useLaneComplementing
+    unsigned int lanePosition;
+
+    for(lanePosition=0; lanePosition<byteCount/8; lanePosition++)
+        if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
+            ((UINT64*)state)[lanePosition] = ~0;
+        else
+            ((UINT64*)state)[lanePosition] = 0;
+    if (byteCount%8 != 0) {
+        lanePosition = byteCount/8;
+        if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20))
+            memset((unsigned char*)state+lanePosition*8, 0xFF, byteCount%8);
+        else
+            memset((unsigned char*)state+lanePosition*8, 0, byteCount%8);
+    }
+#else
+    memset(state, 0, byteCount);
+#endif
+#else
+#error "Not yet implemented"
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Applies the last `nr` rounds of the permutation to `state`: the roundsN
+ * macro starts at round index 24 - nr and runs up to round index 23.
+ * The state is presumably loaded into and stored back from local variables by
+ * copyFromState/copyToState; those macros and declareABCDE come from an
+ * included macro file that is not visible here.
+ * NOTE(review): nr is presumably expected to be in 1..24 -- confirm with callers.
+ */
+void KeccakP1600_Permute_Nrounds(void *state, unsigned int nr)
+{
+    declareABCDE
+    unsigned int i; /* round index used by roundsN */
+    UINT64 *stateAsLanes = (UINT64*)state;
+
+    copyFromState(A, stateAsLanes)
+    roundsN(nr)
+    copyToState(stateAsLanes, A)
+
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Applies the rounds24 macro (all 24 rounds) to `state`.
+ * The state is presumably loaded into and stored back from local variables by
+ * copyFromState/copyToState; those macros and declareABCDE come from an
+ * included macro file that is not visible here.
+ */
+void KeccakP1600_Permute_24rounds(void *state)
+{
+    declareABCDE
+    #ifndef KeccakP1600_fullUnrolling
+    unsigned int i; /* loop counter, only needed by the partially unrolled rounds24 variants */
+    #endif
+    UINT64 *stateAsLanes = (UINT64*)state;
+
+    copyFromState(A, stateAsLanes)
+    rounds24
+    copyToState(stateAsLanes, A)
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Applies the rounds12 macro to `state`, i.e. the rounds with indices 12..23.
+ * The state is presumably loaded into and stored back from local variables by
+ * copyFromState/copyToState; those macros and declareABCDE come from an
+ * included macro file that is not visible here.
+ */
+void KeccakP1600_Permute_12rounds(void *state)
+{
+    declareABCDE
+    #ifndef KeccakP1600_fullUnrolling
+    unsigned int i; /* loop counter, only needed by the partially unrolled rounds12 variants */
+    #endif
+    UINT64 *stateAsLanes = (UINT64*)state;
+
+    copyFromState(A, stateAsLanes)
+    rounds12
+    copyToState(stateAsLanes, A)
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Copies `length` bytes of lane `lanePosition`, starting at byte `offset`
+ * inside the lane, into `data`.  With lane complementing enabled, lanes
+ * 1, 2, 8, 12, 17 and 20 are inverted first so the caller sees the true value.
+ * Bytes are produced least significant first.
+ */
+void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length)
+{
+    UINT64 value = ((UINT64*)state)[lanePosition];
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+    unsigned int k;
+#endif
+#ifdef KeccakP1600_useLaneComplementing
+    switch(lanePosition) {
+    case 1: case 2: case 8: case 12: case 17: case 20:
+        value = ~value;
+        break;
+    default:
+        break;
+    }
+#endif
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    /* In-memory byte order already matches the output order. */
+    memcpy(data, (UINT8*)&value+offset, length);
+#else
+    value >>= offset*8;
+    for(k=0; k<length; k++, value >>= 8)
+        data[k] = value & 0xFF;
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+/*
+ * Non-little-endian builds only: writes the 8 bytes of `word` to `bytes`,
+ * least significant byte first.
+ */
+void fromWordToBytes(UINT8 *bytes, const UINT64 word)
+{
+    UINT64 rest = word;
+    UINT8 *out = bytes;
+    UINT8 *const end = bytes + (64/8);
+
+    while(out != end) {
+        *out++ = (UINT8)(rest & 0xFF);
+        rest >>= 8;
+    }
+}
+#endif
+
+/*
+ * Copies the first `laneCount` lanes of the state into `data`, least
+ * significant byte of each lane first.  With lane complementing enabled, the
+ * copies of lanes 1, 2, 8, 12, 17 and 20 that fall below `laneCount` are then
+ * inverted in `data` so the caller sees the true values.
+ */
+void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount)
+{
+#ifdef KeccakP1600_useLaneComplementing
+    /* Ascending order, so the scan can stop at the first index >= laneCount. */
+    static const unsigned int invertedLanes[6] = { 1, 2, 8, 12, 17, 20 };
+    unsigned int k;
+#endif
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+    unsigned int lane;
+#endif
+
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    memcpy(data, state, laneCount*8);
+#else
+    for(lane=0; lane<laneCount; lane++)
+        fromWordToBytes(data+(lane*8), ((const UINT64*)state)[lane]);
+#endif
+#ifdef KeccakP1600_useLaneComplementing
+    for(k=0; (k<6) && (invertedLanes[k] < laneCount); k++)
+        ((UINT64*)data)[invertedLanes[k]] = ~((UINT64*)data)[invertedLanes[k]];
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Copies `length` bytes of the state, starting at byte `offset`, into `data`.
+ * The work is delegated to the SnP_ExtractBytes macro (from SnP-Relaned.h,
+ * not visible here), parameterized with the whole-lane and in-lane helpers of
+ * this file and a lane size of 8 bytes.
+ */
+void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length)
+{
+    SnP_ExtractBytes(state, data, offset, length, KeccakP1600_ExtractLanes, KeccakP1600_ExtractBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Writes to `output` the XOR of `input` with `length` bytes of lane
+ * `lanePosition`, starting at byte `offset` inside the lane.  With lane
+ * complementing enabled, lanes 1, 2, 8, 12, 17 and 20 are inverted first so
+ * the true lane value is used.
+ */
+void KeccakP1600_ExtractAndAddBytesInLane(const void *state, unsigned int lanePosition, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{
+    UINT64 value = ((UINT64*)state)[lanePosition];
+    unsigned int k;
+#ifdef KeccakP1600_useLaneComplementing
+    switch(lanePosition) {
+    case 1: case 2: case 8: case 12: case 17: case 20:
+        value = ~value;
+        break;
+    default:
+        break;
+    }
+#endif
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+    {
+        /* In-memory byte order already matches the output order. */
+        const UINT8 *laneBytes = (const UINT8*)&value+offset;
+        for(k=0; k<length; k++)
+            output[k] = input[k] ^ laneBytes[k];
+    }
+#else
+    value >>= offset*8;
+    for(k=0; k<length; k++, value >>= 8)
+        output[k] = input[k] ^ (value & 0xFF);
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Writes to `output` the XOR of `input` with the first `laneCount` lanes of
+ * the state.  With lane complementing enabled, the output lanes 1, 2, 8, 12,
+ * 17 and 20 that fall below `laneCount` are inverted afterwards, which is
+ * equivalent to XORing with the true (non-inverted) lane values.
+ */
+void KeccakP1600_ExtractAndAddLanes(const void *state, const unsigned char *input, unsigned char *output, unsigned int laneCount)
+{
+#ifdef KeccakP1600_useLaneComplementing
+    /* Ascending order, so the scan can stop at the first index >= laneCount. */
+    static const unsigned int invertedLanes[6] = { 1, 2, 8, 12, 17, 20 };
+    unsigned int k;
+#endif
+    unsigned int lane;
+#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN)
+    unsigned char temp[8];
+    unsigned int j;
+#endif
+
+    for(lane=0; lane<laneCount; lane++) {
+#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+        ((UINT64*)output)[lane] = ((UINT64*)input)[lane] ^ ((const UINT64*)state)[lane];
+#else
+        fromWordToBytes(temp, ((const UINT64*)state)[lane]);
+        for(j=0; j<8; j++)
+            output[lane*8+j] = input[lane*8+j] ^ temp[j];
+#endif
+    }
+#ifdef KeccakP1600_useLaneComplementing
+    for(k=0; (k<6) && (invertedLanes[k] < laneCount); k++)
+        ((UINT64*)output)[invertedLanes[k]] = ~((UINT64*)output)[invertedLanes[k]];
+#endif
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Writes to `output` the XOR of `input` with `length` bytes of the state,
+ * starting at byte `offset`.  The work is delegated to the
+ * SnP_ExtractAndAddBytes macro (from SnP-Relaned.h, not visible here),
+ * parameterized with the whole-lane and in-lane helpers of this file and a
+ * lane size of 8 bytes.
+ */
+void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length)
+{
+    SnP_ExtractAndAddBytes(state, input, output, offset, length, KeccakP1600_ExtractAndAddLanes, KeccakP1600_ExtractAndAddBytesInLane, 8);
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Absorbs as many whole blocks of `laneCount` lanes (laneCount*8 bytes) as
+ * fit in `dataByteLen`.  For every block the addInput macro is applied,
+ * followed by rounds24.  The state is handled through the
+ * copyFromState/copyToState macros once, outside the loop.
+ *
+ * Returns the number of bytes consumed, which is a multiple of laneCount*8;
+ * any remaining bytes are left for the caller.
+ *
+ * NOTE(review): `data` is read through a UINT64 pointer, so this presumably
+ * assumes the platform tolerates the resulting alignment -- confirm.
+ * NOTE(review): a laneCount of 0 would make the loop condition always true;
+ * callers are presumably required to pass a non-zero laneCount -- confirm.
+ */
+size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen)
+{
+    size_t originalDataByteLen = dataByteLen;
+    declareABCDE
+    #ifndef KeccakP1600_fullUnrolling
+    unsigned int i; /* loop counter, only needed by the partially unrolled rounds24 variants */
+    #endif
+    UINT64 *stateAsLanes = (UINT64*)state;
+    UINT64 *inDataAsLanes = (UINT64*)data;
+
+    copyFromState(A, stateAsLanes)
+    while(dataByteLen >= laneCount*8) {
+        addInput(A, inDataAsLanes, laneCount)
+        rounds24
+        inDataAsLanes += laneCount;
+        dataByteLen -= laneCount*8;
+    }
+    copyToState(stateAsLanes, A)
+    return originalDataByteLen - dataByteLen;
+}
diff --git a/ext/hash/sha3/generic64lc/KeccakP-1600-unrolling.macros b/ext/hash/sha3/generic64lc/KeccakP-1600-unrolling.macros
new file mode 100644 (file)
index 0000000..3180bb0
--- /dev/null
@@ -0,0 +1,198 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+/*
+ * rounds24 and rounds12, in one variant per unrolling factor.
+ *
+ * rounds24 covers round indices 0..23 and rounds12 covers round indices
+ * 12..23.  With FullUnrolling every round is written out; otherwise
+ * `Unrolling` (12, 6, 4, 3, 2 or 1) gives the number of rounds per loop
+ * iteration, and the expanding function must declare an `unsigned int i`.
+ *
+ * Successive rounds alternate between the (A, E) and (E, A) argument order.
+ * Variants whose loop body holds an odd number of rounds (Unrolling 3 and 1)
+ * end each iteration with copyStateVariables(A, E).
+ */
+#if (defined(FullUnrolling))
+#define rounds24 \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta( 0, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 1, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 2, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 3, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 4, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 5, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 6, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 7, E, A) \
+    thetaRhoPiChiIotaPrepareTheta( 8, A, E) \
+    thetaRhoPiChiIotaPrepareTheta( 9, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(10, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(11, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) \
+
+#define rounds12 \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) \
+
+#elif (Unrolling == 12)
+#define rounds24 \
+    prepareTheta \
+    for(i=0; i<24; i+=12) { \
+        thetaRhoPiChiIotaPrepareTheta(i   , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 3, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 4, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 5, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 6, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 7, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 8, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+ 9, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+10, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+11, E, A) \
+    } \
+
+#define rounds12 \
+    prepareTheta \
+    thetaRhoPiChiIotaPrepareTheta(12, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(13, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(14, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(15, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(16, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(17, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(18, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(19, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(20, A, E) \
+    thetaRhoPiChiIotaPrepareTheta(21, E, A) \
+    thetaRhoPiChiIotaPrepareTheta(22, A, E) \
+    thetaRhoPiChiIota(23, E, A) \
+
+#elif (Unrolling == 6)
+#define rounds24 \
+    prepareTheta \
+    for(i=0; i<24; i+=6) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
+    } \
+
+#define rounds12 \
+    prepareTheta \
+    for(i=12; i<24; i+=6) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \
+    } \
+
+#elif (Unrolling == 4)
+#define rounds24 \
+    prepareTheta \
+    for(i=0; i<24; i+=4) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+    } \
+
+#define rounds12 \
+    prepareTheta \
+    for(i=12; i<24; i+=4) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \
+    } \
+
+#elif (Unrolling == 3)
+#define rounds24 \
+    prepareTheta \
+    for(i=0; i<24; i+=3) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        copyStateVariables(A, E) \
+    } \
+
+#define rounds12 \
+    prepareTheta \
+    for(i=12; i<24; i+=3) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+        thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \
+        copyStateVariables(A, E) \
+    } \
+
+#elif (Unrolling == 2)
+#define rounds24 \
+    prepareTheta \
+    for(i=0; i<24; i+=2) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+    } \
+
+#define rounds12 \
+    prepareTheta \
+    for(i=12; i<24; i+=2) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+    } \
+
+#elif (Unrolling == 1)
+#define rounds24 \
+    prepareTheta \
+    for(i=0; i<24; i++) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        copyStateVariables(A, E) \
+    } \
+
+#define rounds12 \
+    prepareTheta \
+    for(i=12; i<24; i++) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        copyStateVariables(A, E) \
+    } \
+
+#else
+#error "Unrolling is not correctly specified!"
+#endif
+
+/*
+ * roundsN(__nrounds): runs the rounds with indices 24 - __nrounds .. 23.
+ * The expanding function must declare an `unsigned int i`.
+ * If the starting index is odd, a single round is executed first, followed by
+ * copyStateVariables(A, E); the remaining rounds are then executed in pairs.
+ */
+#define roundsN(__nrounds) \
+    prepareTheta \
+    i = 24 - (__nrounds); \
+    if ((i&1) != 0) { \
+        thetaRhoPiChiIotaPrepareTheta(i, A, E) \
+        copyStateVariables(A, E) \
+        ++i; \
+    } \
+    for( /* empty */; i<24; i+=2) { \
+        thetaRhoPiChiIotaPrepareTheta(i  , A, E) \
+        thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \
+    }
diff --git a/ext/hash/sha3/generic64lc/KeccakSponge.c b/ext/hash/sha3/generic64lc/KeccakSponge.c
new file mode 100644 (file)
index 0000000..08d4a19
--- /dev/null
@@ -0,0 +1,110 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#include "KeccakSponge.h"
+
+#ifdef KeccakReference
+    #include "displayIntermediateValues.h"
+#endif
+
+/*
+ * Instantiates KeccakSponge.inc for width 200 under the prefix KeccakWidth200,
+ * with KeccakP200_Permute_18rounds as the permutation.  Skipped when
+ * KeccakP200_excluded is defined.  The fast absorb loop is wired in only if
+ * KeccakF200_FastLoop_supported is defined.
+ */
+#ifndef KeccakP200_excluded
+    #include "KeccakP-200-SnP.h"
+
+    #define prefix KeccakWidth200
+    #define SnP KeccakP200
+    #define SnP_width 200
+    #define SnP_Permute KeccakP200_Permute_18rounds
+    #if defined(KeccakF200_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakF200_FastLoop_Absorb
+    #endif
+        #include "KeccakSponge.inc"
+    #undef prefix
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
+
+/*
+ * Instantiates KeccakSponge.inc for width 400 under the prefix KeccakWidth400,
+ * with KeccakP400_Permute_20rounds as the permutation.  Skipped when
+ * KeccakP400_excluded is defined.  The fast absorb loop is wired in only if
+ * KeccakF400_FastLoop_supported is defined.
+ */
+#ifndef KeccakP400_excluded
+    #include "KeccakP-400-SnP.h"
+
+    #define prefix KeccakWidth400
+    #define SnP KeccakP400
+    #define SnP_width 400
+    #define SnP_Permute KeccakP400_Permute_20rounds
+    #if defined(KeccakF400_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakF400_FastLoop_Absorb
+    #endif
+        #include "KeccakSponge.inc"
+    #undef prefix
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
+
+/*
+ * Instantiates KeccakSponge.inc for width 800 under the prefix KeccakWidth800,
+ * with KeccakP800_Permute_22rounds as the permutation.  Skipped when
+ * KeccakP800_excluded is defined.  The fast absorb loop is wired in only if
+ * KeccakF800_FastLoop_supported is defined.
+ */
+#ifndef KeccakP800_excluded
+    #include "KeccakP-800-SnP.h"
+
+    #define prefix KeccakWidth800
+    #define SnP KeccakP800
+    #define SnP_width 800
+    #define SnP_Permute KeccakP800_Permute_22rounds
+    #if defined(KeccakF800_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakF800_FastLoop_Absorb
+    #endif
+        #include "KeccakSponge.inc"
+    #undef prefix
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
+
+/*
+ * Instantiates KeccakSponge.inc for width 1600 under the prefix
+ * KeccakWidth1600, with KeccakP1600_Permute_24rounds as the permutation.
+ * Skipped when KeccakP1600_excluded is defined.  The fast absorb loop
+ * (KeccakF1600_FastLoop_Absorb) is wired in only if
+ * KeccakF1600_FastLoop_supported is defined.
+ */
+#ifndef KeccakP1600_excluded
+    #include "KeccakP-1600-SnP.h"
+
+    #define prefix KeccakWidth1600
+    #define SnP KeccakP1600
+    #define SnP_width 1600
+    #define SnP_Permute KeccakP1600_Permute_24rounds
+    #if defined(KeccakF1600_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakF1600_FastLoop_Absorb
+    #endif
+        #include "KeccakSponge.inc"
+    #undef prefix
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
+
+/*
+ * Second width-1600 instantiation of KeccakSponge.inc, under the prefix
+ * KeccakWidth1600_12rounds and with KeccakP1600_Permute_12rounds as the
+ * permutation.  Skipped when KeccakP1600_excluded is defined.  A fast absorb
+ * loop is wired in only if KeccakP1600_12rounds_FastLoop_supported is defined.
+ */
+#ifndef KeccakP1600_excluded
+    #include "KeccakP-1600-SnP.h"
+
+    #define prefix KeccakWidth1600_12rounds
+    #define SnP KeccakP1600
+    #define SnP_width 1600
+    #define SnP_Permute KeccakP1600_Permute_12rounds
+    #if defined(KeccakP1600_12rounds_FastLoop_supported)
+        #define SnP_FastLoop_Absorb KeccakP1600_12rounds_FastLoop_Absorb
+    #endif
+        #include "KeccakSponge.inc"
+    #undef prefix
+    #undef SnP
+    #undef SnP_width
+    #undef SnP_Permute
+    #undef SnP_FastLoop_Absorb
+#endif
diff --git a/ext/hash/sha3/generic64lc/KeccakSponge.h b/ext/hash/sha3/generic64lc/KeccakSponge.h
new file mode 100644 (file)
index 0000000..a8526fe
--- /dev/null
@@ -0,0 +1,178 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _KeccakSponge_h_
+#define _KeccakSponge_h_
+
+/** General information
+  *
+  * The following type and functions are not actually implemented. Their
+  * documentation is generic, with the prefix Prefix replaced by
+  * - KeccakWidth200 for a sponge function based on Keccak-f[200]
+  * - KeccakWidth400 for a sponge function based on Keccak-f[400]
+  * - KeccakWidth800 for a sponge function based on Keccak-f[800]
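+  * - KeccakWidth1600 for a sponge function based on Keccak-f[1600]
+  * - KeccakWidth1600_12rounds for a sponge function based on the 12-round
+  *   KeccakP1600 permutation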
+  *
+  * In all these functions, the rate and capacity must sum to the width of the
+  * chosen permutation. For instance, to use the sponge function
+  * Keccak[r=1344, c=256], one must use KeccakWidth1600_Sponge() or a combination
+  * of KeccakWidth1600_SpongeInitialize(), KeccakWidth1600_SpongeAbsorb(),
+  * KeccakWidth1600_SpongeAbsorbLastFewBits() and
+  * KeccakWidth1600_SpongeSqueeze().
+  *
+  * The Prefix_SpongeInstance contains the sponge instance attributes for use
+  * with the Prefix_Sponge* functions.
+  * It gathers the state processed by the permutation as well as the rate,
+  * the position of input/output bytes in the state and the phase
+  * (absorbing or squeezing).
+  */
+
+#ifdef DontReallyInclude_DocumentationOnly
+/** Function to evaluate the sponge function Keccak[r, c] in a single call.
+  * @param  rate        The value of the rate r.
+  * @param  capacity    The value of the capacity c.
+  * @param  input           Pointer to the input message (before the suffix).
+  * @param  inputByteLen    The length of the input message in bytes.
+  * @param  suffix          Byte containing from 0 to 7 suffix bits
+  *                         that must be absorbed after @a input.
+  *                         These <i>n</i> bits must be in the least significant bit positions.
+  *                         These bits must be delimited with a bit 1 at position <i>n</i>
+  *                         (counting from 0=LSB to 7=MSB) and followed by bits 0
+  *                         from position <i>n</i>+1 to position 7.
+  *                         Some examples:
+  *                             - If no bits are to be absorbed, then @a suffix must be 0x01.
+  *                             - If the 2-bit sequence 0,0 is to be absorbed, @a suffix must be 0x04.
+  *                             - If the 5-bit sequence 0,1,0,0,1 is to be absorbed, @a suffix must be 0x32.
+  *                             - If the 7-bit sequence 1,1,0,1,0,0,0 is to be absorbed, @a suffix must be 0x8B.
+  *                         .
+  * @param  output          Pointer to the output buffer.
+  * @param  outputByteLen   The desired number of output bytes.
+  * @pre    One must have r+c equal to the supported width of this implementation
+  *         and the rate a multiple of 8 bits (one byte) in this implementation.
+  * @pre    @a suffix ≠ 0x00
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen);
+
+/**
+  * Function to initialize the state of the Keccak[r, c] sponge function.
+  * The phase of the sponge function is set to absorbing.
+  * @param  spongeInstance  Pointer to the sponge instance to be initialized.
+  * @param  rate        The value of the rate r.
+  * @param  capacity    The value of the capacity c.
+  * @pre    One must have r+c equal to the supported width of this implementation
+  *         and the rate a multiple of 8 bits (one byte) in this implementation.
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_SpongeInitialize(Prefix_SpongeInstance *spongeInstance, unsigned int rate, unsigned int capacity);
+
+/**
+  * Function to give input data bytes for the sponge function to absorb.
+  * @param  spongeInstance  Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
+  * @param  data        Pointer to the input data.
+  * @param  dataByteLen  The number of input bytes provided in the input data.
+  * @pre    The sponge function must be in the absorbing phase,
+  *         i.e., Prefix_SpongeSqueeze() or Prefix_SpongeAbsorbLastFewBits()
+  *         must not have been called before.
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_SpongeAbsorb(Prefix_SpongeInstance *spongeInstance, const unsigned char *data, size_t dataByteLen);
+
+/**
+  * Function to give input data bits for the sponge function to absorb
+  * and then to switch to the squeezing phase.
+  * @param  spongeInstance  Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
+  * @param  delimitedData   Byte containing from 0 to 7 trailing bits
+  *                     that must be absorbed.
+  *                     These <i>n</i> bits must be in the least significant bit positions.
+  *                     These bits must be delimited with a bit 1 at position <i>n</i>
+  *                     (counting from 0=LSB to 7=MSB) and followed by bits 0
+  *                     from position <i>n</i>+1 to position 7.
+  *                     Some examples:
+  *                         - If no bits are to be absorbed, then @a delimitedData must be 0x01.
+  *                         - If the 2-bit sequence 0,0 is to be absorbed, @a delimitedData must be 0x04.
+  *                         - If the 5-bit sequence 0,1,0,0,1 is to be absorbed, @a delimitedData must be 0x32.
+  *                         - If the 7-bit sequence 1,1,0,1,0,0,0 is to be absorbed, @a delimitedData must be 0x8B.
+  *                     .
+  * @pre    The sponge function must be in the absorbing phase,
+  *         i.e., Prefix_SpongeSqueeze() or Prefix_SpongeAbsorbLastFewBits()
+  *         must not have been called before.
+  * @pre    @a delimitedData ≠ 0x00
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_SpongeAbsorbLastFewBits(Prefix_SpongeInstance *spongeInstance, unsigned char delimitedData);
+
+/**
+  * Function to squeeze output data from the sponge function.
+  * If the sponge function was in the absorbing phase, this function
+  * switches it to the squeezing phase
+  * as if Prefix_SpongeAbsorbLastFewBits(spongeInstance, 0x01) was called.
+  * @param  spongeInstance  Pointer to the sponge instance initialized by Prefix_SpongeInitialize().
+  * @param  data        Pointer to the buffer where to store the output data.
+  * @param  dataByteLen The number of output bytes desired.
+  * @return Zero if successful, 1 otherwise.
+  */
+int Prefix_SpongeSqueeze(Prefix_SpongeInstance *spongeInstance, unsigned char *data, size_t dataByteLen);
+#endif
+
+#include <string.h>
+#include "align.h"
+
+/* Declares struct prefix##_SpongeInstanceStruct and its typedef
+   prefix##_SpongeInstance, with the requested alignment:
+     state       - permutation state buffer of `size` bytes
+     rate        - rate as passed to prefix##_SpongeInitialize(); the sponge
+                   functions divide it by 8 to obtain a byte count
+     byteIOIndex - current byte position inside the rate-sized block
+     squeezing   - 0 while absorbing, set to 1 once the instance has switched
+                   to squeezing */
+#define KCP_DeclareSpongeStructure(prefix, size, alignment) \
+    ALIGN(alignment) typedef struct prefix##_SpongeInstanceStruct { \
+        unsigned char state[size]; \
+        unsigned int rate; \
+        unsigned int byteIOIndex; \
+        int squeezing; \
+    } prefix##_SpongeInstance;
+
+/* Declares the five prototypes of one sponge instantiation: the one-shot
+   prefix##_Sponge() plus the incremental Initialize / Absorb /
+   AbsorbLastFewBits / Squeeze functions operating on prefix##_SpongeInstance.
+   The matching definitions are generated from KeccakSponge.inc. */
+#define KCP_DeclareSpongeFunctions(prefix) \
+    int prefix##_Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen); \
+    int prefix##_SpongeInitialize(prefix##_SpongeInstance *spongeInstance, unsigned int rate, unsigned int capacity); \
+    int prefix##_SpongeAbsorb(prefix##_SpongeInstance *spongeInstance, const unsigned char *data, size_t dataByteLen); \
+    int prefix##_SpongeAbsorbLastFewBits(prefix##_SpongeInstance *spongeInstance, unsigned char delimitedData); \
+    int prefix##_SpongeSqueeze(prefix##_SpongeInstance *spongeInstance, unsigned char *data, size_t dataByteLen);
+
+/* One instance type and one set of prototypes per permutation width. A width is
+   left out entirely when the corresponding KeccakP<width>_excluded macro is
+   defined. */
+#ifndef KeccakP200_excluded
+    #include "KeccakP-200-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth200, KeccakP200_stateSizeInBytes, KeccakP200_stateAlignment)
+    KCP_DeclareSpongeFunctions(KeccakWidth200)
+#endif
+
+#ifndef KeccakP400_excluded
+    #include "KeccakP-400-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth400, KeccakP400_stateSizeInBytes, KeccakP400_stateAlignment)
+    KCP_DeclareSpongeFunctions(KeccakWidth400)
+#endif
+
+#ifndef KeccakP800_excluded
+    #include "KeccakP-800-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth800, KeccakP800_stateSizeInBytes, KeccakP800_stateAlignment)
+    KCP_DeclareSpongeFunctions(KeccakWidth800)
+#endif
+
+#ifndef KeccakP1600_excluded
+    #include "KeccakP-1600-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth1600, KeccakP1600_stateSizeInBytes, KeccakP1600_stateAlignment)
+    KCP_DeclareSpongeFunctions(KeccakWidth1600)
+#endif
+
+/* The 12-round variant reuses the KeccakP1600 state size and alignment; only
+   the type and function names differ. */
+#ifndef KeccakP1600_excluded
+    #include "KeccakP-1600-SnP.h"
+    KCP_DeclareSpongeStructure(KeccakWidth1600_12rounds, KeccakP1600_stateSizeInBytes, KeccakP1600_stateAlignment)
+    KCP_DeclareSpongeFunctions(KeccakWidth1600_12rounds)
+#endif
+
+#endif
diff --git a/ext/hash/sha3/generic64lc/KeccakSponge.inc b/ext/hash/sha3/generic64lc/KeccakSponge.inc
new file mode 100644 (file)
index 0000000..42a15aa
--- /dev/null
@@ -0,0 +1,313 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+/* Two-level token pasting: going through JOIN0 lets `prefix` and `SnP` be
+   macro-expanded before ## glues them to the suffix. */
+#define JOIN0(a, b)                     a ## b
+#define JOIN(a, b)                      JOIN0(a, b)
+
+/* Names of the type and functions generated by this inclusion; `prefix` must be
+   defined by the including file. */
+#define Sponge                          JOIN(prefix, _Sponge)
+#define SpongeInstance                  JOIN(prefix, _SpongeInstance)
+#define SpongeInitialize                JOIN(prefix, _SpongeInitialize)
+#define SpongeAbsorb                    JOIN(prefix, _SpongeAbsorb)
+#define SpongeAbsorbLastFewBits         JOIN(prefix, _SpongeAbsorbLastFewBits)
+#define SpongeSqueeze                   JOIN(prefix, _SpongeSqueeze)
+
+/* Names of the state-and-permutation primitives used below; `SnP` must be
+   defined by the including file (SnP_width, SnP_Permute and, optionally,
+   SnP_FastLoop_Absorb are taken from the including file as they are). */
+#define SnP_stateSizeInBytes            JOIN(SnP, _stateSizeInBytes)
+#define SnP_stateAlignment              JOIN(SnP, _stateAlignment)
+#define SnP_StaticInitialize            JOIN(SnP, _StaticInitialize)
+#define SnP_Initialize                  JOIN(SnP, _Initialize)
+#define SnP_AddByte                     JOIN(SnP, _AddByte)
+#define SnP_AddBytes                    JOIN(SnP, _AddBytes)
+#define SnP_ExtractBytes                JOIN(SnP, _ExtractBytes)
+
+/*
+ * One-shot sponge evaluation on a local state: absorb `input`, absorb the
+ * delimited `suffix` byte plus the final padding bit, then write
+ * `outputByteLen` bytes to `output`.
+ *
+ * rate and capacity are given in bits. Returns 1 without touching the output
+ * if rate+capacity differs from SnP_width, if rate is 0, larger than SnP_width
+ * or not a multiple of 8, or if suffix is 0; returns 0 otherwise.
+ */
+int Sponge(unsigned int rate, unsigned int capacity, const unsigned char *input, size_t inputByteLen, unsigned char suffix, unsigned char *output, size_t outputByteLen)
+{
+    ALIGN(SnP_stateAlignment) unsigned char state[SnP_stateSizeInBytes];
+    unsigned int partialBlock;
+    const unsigned char *curInput = input;
+    unsigned char *curOutput = output;
+    unsigned int rateInBytes = rate/8;
+
+    if (rate+capacity != SnP_width)
+        return 1;
+    /* rate > SnP_width also rejects pairs whose unsigned sum wrapped around to SnP_width */
+    if ((rate <= 0) || (rate > SnP_width) || ((rate % 8) != 0))
+        return 1;
+    if (suffix == 0)
+        return 1;
+
+    /* Initialize the state */
+    SnP_StaticInitialize();
+    SnP_Initialize(state);
+
+    /* First, absorb whole blocks */
+#ifdef SnP_FastLoop_Absorb
+    if (((rateInBytes % (SnP_width/200)) == 0) && (inputByteLen >= rateInBytes)) {
+        /* fast lane: whole lane rate */
+        size_t j;
+        /* j is the number of input bytes the fast loop consumed */
+        j = SnP_FastLoop_Absorb(state, rateInBytes/(SnP_width/200), curInput, inputByteLen);
+        curInput += j;
+        inputByteLen -= j;
+    }
+#endif
+    /* generic path; also picks up any whole blocks the fast loop left behind */
+    while(inputByteLen >= (size_t)rateInBytes) {
+        #ifdef KeccakReference
+        displayBytes(1, "Block to be absorbed", curInput, rateInBytes);
+        #endif
+        SnP_AddBytes(state, curInput, 0, rateInBytes);
+        SnP_Permute(state);
+        curInput += rateInBytes;
+        inputByteLen -= rateInBytes;
+    }
+
+    /* Then, absorb what remains */
+    /* inputByteLen < rateInBytes here, so the narrowing cast cannot lose bits */
+    partialBlock = (unsigned int)inputByteLen;
+    #ifdef KeccakReference
+    displayBytes(1, "Block to be absorbed (part)", curInput, partialBlock);
+    #endif
+    SnP_AddBytes(state, curInput, 0, partialBlock);
+
+    /* Finally, absorb the suffix */
+    #ifdef KeccakReference
+    {
+        unsigned char delimitedData1[1];
+        delimitedData1[0] = suffix;
+        displayBytes(1, "Block to be absorbed (last few bits + first bit of padding)", delimitedData1, 1);
+    }
+    #endif
+    /* Last few bits, whose delimiter coincides with first bit of padding */
+    SnP_AddByte(state, suffix, partialBlock);
+    /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */
+    if ((suffix >= 0x80) && (partialBlock == (rateInBytes-1)))
+        SnP_Permute(state);
+    /* Second bit of padding */
+    SnP_AddByte(state, 0x80, rateInBytes-1);
+    #ifdef KeccakReference
+    {
+        unsigned char block[SnP_width/8];
+        memset(block, 0, SnP_width/8);
+        block[rateInBytes-1] = 0x80;
+        displayBytes(1, "Second bit of padding", block, rateInBytes);
+    }
+    #endif
+    SnP_Permute(state);
+    #ifdef KeccakReference
+    displayText(1, "--- Switching to squeezing phase ---");
+    #endif
+
+    /* First, output whole blocks */
+    /* strict '>' so that the final block, even a full one, is extracted below
+       without a trailing permutation */
+    while(outputByteLen > (size_t)rateInBytes) {
+        SnP_ExtractBytes(state, curOutput, 0, rateInBytes);
+        SnP_Permute(state);
+        #ifdef KeccakReference
+        displayBytes(1, "Squeezed block", curOutput, rateInBytes);
+        #endif
+        curOutput += rateInBytes;
+        outputByteLen -= rateInBytes;
+    }
+
+    /* Finally, output what remains */
+    /* outputByteLen <= rateInBytes here, so the narrowing cast cannot lose bits */
+    partialBlock = (unsigned int)outputByteLen;
+    SnP_ExtractBytes(state, curOutput, 0, partialBlock);
+    #ifdef KeccakReference
+    displayBytes(1, "Squeezed block (part)", curOutput, partialBlock);
+    #endif
+
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+/* ---------------------------------------------------------------- */
+/* ---------------------------------------------------------------- */
+
+/*
+ * Put a sponge instance into its initial absorbing state.
+ *
+ * rate and capacity are given in bits. The parameters are accepted only when
+ * rate+capacity equals SnP_width and rate is a non-zero multiple of 8 that does
+ * not exceed SnP_width. On rejection the instance is left untouched and 1 is
+ * returned; on success the state is initialised, the rate is recorded, the
+ * block position and the squeezing flag are cleared, and 0 is returned.
+ */
+int SpongeInitialize(SpongeInstance *instance, unsigned int rate, unsigned int capacity)
+{
+    /* the rate > SnP_width test also catches a rate+capacity sum that wrapped around */
+    if ((rate+capacity != SnP_width) || (rate == 0) || (rate > SnP_width) || ((rate % 8) != 0))
+        return 1;
+
+    SnP_StaticInitialize();
+    SnP_Initialize(instance->state);
+
+    instance->squeezing = 0;
+    instance->byteIOIndex = 0;
+    instance->rate = rate;
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+
+int SpongeAbsorb(SpongeInstance *instance, const unsigned char *data, size_t dataByteLen)
+{
+    size_t i, j;
+    unsigned int partialBlock;
+    const unsigned char *curData;
+    unsigned int rateInBytes = instance->rate/8;
+
+    if (instance->squeezing)
+        return 1; /* Too late for additional input */
+
+    i = 0;
+    curData = data;
+    while(i < dataByteLen) {
+        if ((instance->byteIOIndex == 0) && (dataByteLen >= (i + rateInBytes))) {
+#ifdef SnP_FastLoop_Absorb
+            /* processing full blocks first */
+            if ((rateInBytes % (SnP_width/200)) == 0) {
+                /* fast lane: whole lane rate */
+                j = SnP_FastLoop_Absorb(instance->state, rateInBytes/(SnP_width/200), curData, dataByteLen - i);
+                i += j;
+                curData += j;
+            }
+            else {
+#endif
+                for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) {
+                    #ifdef KeccakReference
+                    displayBytes(1, "Block to be absorbed", curData, rateInBytes);
+                    #endif
+                    SnP_AddBytes(instance->state, curData, 0, rateInBytes);
+                    SnP_Permute(instance->state);
+                    curData+=rateInBytes;
+                }
+                i = dataByteLen - j;
+#ifdef SnP_FastLoop_Absorb
+            }
+#endif
+        }
+        else {
+            /* normal lane: using the message queue */
+            partialBlock = (unsigned int)(dataByteLen - i);
+            if (partialBlock+instance->byteIOIndex > rateInBytes)
+                partialBlock = rateInBytes-instance->byteIOIndex;
+            #ifdef KeccakReference
+            displayBytes(1, "Block to be absorbed (part)", curData, partialBlock);
+            #endif
+            i += partialBlock;
+
+            SnP_AddBytes(instance->state, curData, instance->byteIOIndex, partialBlock);
+            curData += partialBlock;
+            instance->byteIOIndex += partialBlock;
+            if (instance->byteIOIndex == rateInBytes) {
+                SnP_Permute(instance->state);
+                instance->byteIOIndex = 0;
+            }
+        }
+    }
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+
+/*
+ * Absorb the final delimited byte, apply the closing padding bit, permute and
+ * switch the instance to the squeezing phase (byteIOIndex reset to 0,
+ * squeezing set to 1).
+ *
+ * Returns 1 without modifying the instance if delimitedData is 0 or if the
+ * instance is already squeezing; returns 0 otherwise.
+ */
+int SpongeAbsorbLastFewBits(SpongeInstance *instance, unsigned char delimitedData)
+{
+    unsigned int rateInBytes = instance->rate/8;
+
+    if (delimitedData == 0)
+        return 1;
+    if (instance->squeezing)
+        return 1; /* Too late for additional input */
+
+    #ifdef KeccakReference
+    {
+        unsigned char delimitedData1[1];
+        delimitedData1[0] = delimitedData;
+        displayBytes(1, "Block to be absorbed (last few bits + first bit of padding)", delimitedData1, 1);
+    }
+    #endif
+    /* Last few bits, whose delimiter coincides with first bit of padding */
+    SnP_AddByte(instance->state, delimitedData, instance->byteIOIndex);
+    /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */
+    if ((delimitedData >= 0x80) && (instance->byteIOIndex == (rateInBytes-1)))
+        SnP_Permute(instance->state);
+    /* Second bit of padding */
+    SnP_AddByte(instance->state, 0x80, rateInBytes-1);
+    #ifdef KeccakReference
+    {
+        unsigned char block[SnP_width/8];
+        memset(block, 0, SnP_width/8);
+        block[rateInBytes-1] = 0x80;
+        displayBytes(1, "Second bit of padding", block, rateInBytes);
+    }
+    #endif
+    SnP_Permute(instance->state);
+    /* the first output block is now available starting at position 0 */
+    instance->byteIOIndex = 0;
+    instance->squeezing = 1;
+    #ifdef KeccakReference
+    displayText(1, "--- Switching to squeezing phase ---");
+    #endif
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+
+int SpongeSqueeze(SpongeInstance *instance, unsigned char *data, size_t dataByteLen)
+{
+    size_t i, j;
+    unsigned int partialBlock;
+    unsigned int rateInBytes = instance->rate/8;
+    unsigned char *curData;
+
+    if (!instance->squeezing)
+        SpongeAbsorbLastFewBits(instance, 0x01);
+
+    i = 0;
+    curData = data;
+    while(i < dataByteLen) {
+        if ((instance->byteIOIndex == rateInBytes) && (dataByteLen >= (i + rateInBytes))) {
+            for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) {
+                SnP_Permute(instance->state);
+                SnP_ExtractBytes(instance->state, curData, 0, rateInBytes);
+                #ifdef KeccakReference
+                displayBytes(1, "Squeezed block", curData, rateInBytes);
+                #endif
+                curData+=rateInBytes;
+            }
+            i = dataByteLen - j;
+        }
+        else {
+            /* normal lane: using the message queue */
+            if (instance->byteIOIndex == rateInBytes) {
+                SnP_Permute(instance->state);
+                instance->byteIOIndex = 0;
+            }
+            partialBlock = (unsigned int)(dataByteLen - i);
+            if (partialBlock+instance->byteIOIndex > rateInBytes)
+                partialBlock = rateInBytes-instance->byteIOIndex;
+            i += partialBlock;
+
+            SnP_ExtractBytes(instance->state, curData, instance->byteIOIndex, partialBlock);
+            #ifdef KeccakReference
+            displayBytes(1, "Squeezed block (part)", curData, partialBlock);
+            #endif
+            curData += partialBlock;
+            instance->byteIOIndex += partialBlock;
+        }
+    }
+    return 0;
+}
+
+/* ---------------------------------------------------------------- */
+
+/* Remove the per-inclusion aliases so this file can be included again with a
+   different prefix / SnP. JOIN0 and JOIN are not undefined here. */
+#undef Sponge
+#undef SpongeInstance
+#undef SpongeInitialize
+#undef SpongeAbsorb
+#undef SpongeAbsorbLastFewBits
+#undef SpongeSqueeze
+#undef SnP_stateSizeInBytes
+#undef SnP_stateAlignment
+#undef SnP_StaticInitialize
+#undef SnP_Initialize
+#undef SnP_AddByte
+#undef SnP_AddBytes
+#undef SnP_ExtractBytes
diff --git a/ext/hash/sha3/generic64lc/SnP-Relaned.h b/ext/hash/sha3/generic64lc/SnP-Relaned.h
new file mode 100644 (file)
index 0000000..086e635
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _SnP_Relaned_h_
+#define _SnP_Relaned_h_
+
+/* Byte-range front end built from two caller-supplied primitives.
+   - offset == 0: SnP_AddLanes handles the length/SnP_laneLengthInBytes whole
+     lanes in one call, then SnP_AddBytesInLane handles the trailing
+     length%SnP_laneLengthInBytes bytes (it is called even when that is 0).
+   - offset != 0: the range is walked lane by lane with SnP_AddBytesInLane,
+     the first call starting at offset%SnP_laneLengthInBytes inside its lane.
+   Expands to a bare { } block, and evaluates offset, length and data more than
+   once, so arguments should be free of side effects. */
+#define SnP_AddBytes(state, data, offset, length, SnP_AddLanes, SnP_AddBytesInLane, SnP_laneLengthInBytes) \
+    { \
+        if ((offset) == 0) { \
+            SnP_AddLanes(state, data, (length)/SnP_laneLengthInBytes); \
+            SnP_AddBytesInLane(state, \
+                (length)/SnP_laneLengthInBytes, \
+                (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                0, \
+                (length)%SnP_laneLengthInBytes); \
+        } \
+        else { \
+            unsigned int _sizeLeft = (length); \
+            unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+            unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+            const unsigned char *_curData = (data); \
+            while(_sizeLeft > 0) { \
+                unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+                if (_bytesInLane > _sizeLeft) \
+                    _bytesInLane = _sizeLeft; \
+                SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
+                _sizeLeft -= _bytesInLane; \
+                _lanePosition++; \
+                _offsetInLane = 0; \
+                _curData += _bytesInLane; \
+            } \
+        } \
+    }
+
+/* Byte-range front end built from two caller-supplied primitives.
+   - offset == 0: SnP_OverwriteLanes handles the length/SnP_laneLengthInBytes
+     whole lanes in one call, then SnP_OverwriteBytesInLane handles the trailing
+     length%SnP_laneLengthInBytes bytes (it is called even when that is 0).
+   - offset != 0: the range is walked lane by lane with
+     SnP_OverwriteBytesInLane, the first call starting at
+     offset%SnP_laneLengthInBytes inside its lane.
+   Expands to a bare { } block, and evaluates offset, length and data more than
+   once, so arguments should be free of side effects. */
+#define SnP_OverwriteBytes(state, data, offset, length, SnP_OverwriteLanes, SnP_OverwriteBytesInLane, SnP_laneLengthInBytes) \
+    { \
+        if ((offset) == 0) { \
+            SnP_OverwriteLanes(state, data, (length)/SnP_laneLengthInBytes); \
+            SnP_OverwriteBytesInLane(state, \
+                (length)/SnP_laneLengthInBytes, \
+                (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                0, \
+                (length)%SnP_laneLengthInBytes); \
+        } \
+        else { \
+            unsigned int _sizeLeft = (length); \
+            unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+            unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+            const unsigned char *_curData = (data); \
+            while(_sizeLeft > 0) { \
+                unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+                if (_bytesInLane > _sizeLeft) \
+                    _bytesInLane = _sizeLeft; \
+                SnP_OverwriteBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
+                _sizeLeft -= _bytesInLane; \
+                _lanePosition++; \
+                _offsetInLane = 0; \
+                _curData += _bytesInLane; \
+            } \
+        } \
+    }
+
+/* Byte-range front end built from two caller-supplied primitives; `data` is the
+   writable destination here.
+   - offset == 0: SnP_ExtractLanes handles the length/SnP_laneLengthInBytes
+     whole lanes in one call, then SnP_ExtractBytesInLane handles the trailing
+     length%SnP_laneLengthInBytes bytes (it is called even when that is 0).
+   - offset != 0: the range is walked lane by lane with SnP_ExtractBytesInLane,
+     the first call starting at offset%SnP_laneLengthInBytes inside its lane.
+   Expands to a bare { } block, and evaluates offset, length and data more than
+   once, so arguments should be free of side effects. */
+#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \
+    { \
+        if ((offset) == 0) { \
+            SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \
+            SnP_ExtractBytesInLane(state, \
+                (length)/SnP_laneLengthInBytes, \
+                (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                0, \
+                (length)%SnP_laneLengthInBytes); \
+        } \
+        else { \
+            unsigned int _sizeLeft = (length); \
+            unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+            unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+            unsigned char *_curData = (data); \
+            while(_sizeLeft > 0) { \
+                unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+                if (_bytesInLane > _sizeLeft) \
+                    _bytesInLane = _sizeLeft; \
+                SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \
+                _sizeLeft -= _bytesInLane; \
+                _lanePosition++; \
+                _offsetInLane = 0; \
+                _curData += _bytesInLane; \
+            } \
+        } \
+    }
+
+/* Byte-range front end built from two caller-supplied primitives, with separate
+   read-only `input` and writable `output` buffers that are advanced in step.
+   - offset == 0: SnP_ExtractAndAddLanes handles the
+     length/SnP_laneLengthInBytes whole lanes in one call, then
+     SnP_ExtractAndAddBytesInLane handles the trailing
+     length%SnP_laneLengthInBytes bytes (it is called even when that is 0).
+   - offset != 0: the range is walked lane by lane with
+     SnP_ExtractAndAddBytesInLane, the first call starting at
+     offset%SnP_laneLengthInBytes inside its lane.
+   Expands to a bare { } block, and evaluates offset, length, input and output
+   more than once, so arguments should be free of side effects. */
+#define SnP_ExtractAndAddBytes(state, input, output, offset, length, SnP_ExtractAndAddLanes, SnP_ExtractAndAddBytesInLane, SnP_laneLengthInBytes) \
+    { \
+        if ((offset) == 0) { \
+            SnP_ExtractAndAddLanes(state, input, output, (length)/SnP_laneLengthInBytes); \
+            SnP_ExtractAndAddBytesInLane(state, \
+                (length)/SnP_laneLengthInBytes, \
+                (input)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                (output)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \
+                0, \
+                (length)%SnP_laneLengthInBytes); \
+        } \
+        else { \
+            unsigned int _sizeLeft = (length); \
+            unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \
+            unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \
+            const unsigned char *_curInput = (input); \
+            unsigned char *_curOutput = (output); \
+            while(_sizeLeft > 0) { \
+                unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \
+                if (_bytesInLane > _sizeLeft) \
+                    _bytesInLane = _sizeLeft; \
+                SnP_ExtractAndAddBytesInLane(state, _lanePosition, _curInput, _curOutput, _offsetInLane, _bytesInLane); \
+                _sizeLeft -= _bytesInLane; \
+                _lanePosition++; \
+                _offsetInLane = 0; \
+                _curInput += _bytesInLane; \
+                _curOutput += _bytesInLane; \
+            } \
+        } \
+    }
+
+#endif
diff --git a/ext/hash/sha3/generic64lc/align.h b/ext/hash/sha3/generic64lc/align.h
new file mode 100644 (file)
index 0000000..e29771e
--- /dev/null
@@ -0,0 +1,34 @@
+/*
+Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
+Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
+denoted as "the implementer".
+
+For more information, feedback or questions, please refer to our websites:
+http://keccak.noekeon.org/
+http://keyak.noekeon.org/
+http://ketje.noekeon.org/
+
+To the extent possible under law, the implementer has waived all copyright
+and related or neighboring rights to the source code in this file.
+http://creativecommons.org/publicdomain/zero/1.0/
+*/
+
+#ifndef _align_h_
+#define _align_h_
+
+/* on Mac OS-X and possibly others, ALIGN(x) is defined in param.h, and -Werror chokes on the redef. */
+#ifdef ALIGN
+#undef ALIGN
+#endif
+
+/* ALIGN(x): compiler-specific alignment request. The first matching compiler
+   macro wins; on compilers matching none of them ALIGN(x) expands to nothing,
+   so no particular alignment is requested there. */
+#if defined(__GNUC__)
+#define ALIGN(x) __attribute__ ((aligned(x)))
+#elif defined(_MSC_VER)
+#define ALIGN(x) __declspec(align(x))
+#elif defined(__ARMCC_VERSION)
+#define ALIGN(x) __align(x)
+#else
+#define ALIGN(x)
+#endif
+
+#endif
diff --git a/ext/hash/sha3/generic64lc/brg_endian.h b/ext/hash/sha3/generic64lc/brg_endian.h
new file mode 100644 (file)
index 0000000..7226eb3
--- /dev/null
@@ -0,0 +1,142 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The redistribution and use of this software (with or without changes)
+ is allowed without the payment of fees or royalties provided that:
+
+  1. source code distributions include the above copyright notice, this
+     list of conditions and the following disclaimer;
+
+  2. binary distributions include the above copyright notice, this list
+     of conditions and the following disclaimer in their documentation;
+
+  3. the name of the copyright holder is not used to endorse products
+     built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 20/12/2007
+ Changes for ARM 9/9/2010
+*/
+
+#ifndef _BRG_ENDIAN_H
+#define _BRG_ENDIAN_H
+
+#define IS_BIG_ENDIAN      4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN   1234 /* byte 0 is least significant (i386) */
+/* The two constants above are the only values that PLATFORM_BYTE_ORDER is set to in this header. */
+#if 0 /* disabled: none of the headers in this group are included, so detection relies on macros that are already defined */
+/* Include files where endian defines and byteswap functions may reside */
+#if defined( __sun )
+#  include <sys/isa_defs.h>
+#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ )
+#  include <sys/endian.h>
+#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \
+      defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ )
+#  include <machine/endian.h>
+#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#  if !defined( __MINGW32__ ) && !defined( _AIX )
+#    include <endian.h>
+#    if !defined( __BEOS__ )
+#      include <byteswap.h>
+#    endif
+#  endif
+#endif /* end of the per-platform include selection */
+#endif /* 0 */
+
+/* Now attempt to set the define for platform byte order using any  */
+/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which  */
+/* seem to encompass most endian symbol definitions                 */
+/* Each form is tested by its own independent #if group, so more than one group may define PLATFORM_BYTE_ORDER. */
+#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) /* SYMBOL form: both names exist, so BYTE_ORDER selects between them */
+#  if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif /* nothing is defined here when BYTE_ORDER is undefined or equals neither name */
+#elif defined( BIG_ENDIAN ) /* only one of the two names exists: its presence alone decides */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) /* _SYMBOL form: both names exist, so _BYTE_ORDER selects between them */
+#  if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif /* nothing is defined here when _BYTE_ORDER is undefined or equals neither name */
+#elif defined( _BIG_ENDIAN ) /* only one of the two names exists: its presence alone decides */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( _LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) /* __SYMBOL form: both names exist, so __BYTE_ORDER selects between them */
+#  if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif /* nothing is defined here when __BYTE_ORDER is undefined or equals neither name */
+#elif defined( __BIG_ENDIAN ) /* only one of the two names exists: its presence alone decides */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) /* __SYMBOL__ form: both names exist, so __BYTE_ORDER__ selects between them */
+#  if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#  elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__
+#    define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#  endif /* nothing is defined here when __BYTE_ORDER__ is undefined or equals neither name */
+#elif defined( __BIG_ENDIAN__ ) /* only one of the two names exists: its presence alone decides */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#elif defined( __LITTLE_ENDIAN__ )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#endif
+
+/*  if the platform byte order could not be determined, then try to */
+/*  set this define using common machine defines                    */
+#if !defined(PLATFORM_BYTE_ORDER)
+/* Tested in order: little-endian machine macros, big-endian machine macros, __arm__, then a hard-coded default. */
+#if   defined( __alpha__ ) || defined( __alpha ) || defined( i386 )       || \
+      defined( __i386__ )  || defined( _M_I86 )  || defined( _M_IX86 )    || \
+      defined( __OS2__ )   || defined( sun386 )  || defined( __TURBOC__ ) || \
+      defined( vax )       || defined( vms )     || defined( VMS )        || \
+      defined( __VMS )     || defined( _M_X64 )
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+
+#elif defined( AMIGA )   || defined( applec )    || defined( __AS400__ )  || \
+      defined( _CRAY )   || defined( __hppa )    || defined( __hp9000 )   || \
+      defined( ibm370 )  || defined( mc68000 )   || defined( m68k )       || \
+      defined( __MRC__ ) || defined( __MVS__ )   || defined( __MWERKS__ ) || \
+      defined( sparc )   || defined( __sparc)    || defined( SYMANTEC_C ) || \
+      defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM )   || \
+      defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX )
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+
+#elif defined(__arm__) /* __arm__ alone does not fix the byte order: __BIG_ENDIAN selects big endian, otherwise little endian */
+# ifdef __BIG_ENDIAN
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+# else
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+# endif
+#elif 1     /* **** EDIT HERE IF NECESSARY **** always taken when reached: unrecognized platforms are assumed little endian */
+#  define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0     /* **** EDIT HERE IF NECESSARY **** enable this branch instead of the one above to default to big endian */
+#  define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN
+#else /* not reached while the #elif 1 branch above stays enabled */
+#  error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order
+#endif
+
+#endif /* !defined(PLATFORM_BYTE_ORDER) */
+
+#endif