// SPDX-License-Identifier: LicenseRef-OpenZFS-ThirdParty-PublicDomain
/*
 * Implementation of the Skein block functions.
 * Source code author: Doug Whiting, 2008.
 * This algorithm and source code is released to the public domain.
 * Compile-time switches:
 *  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
 *                    versions use ASM code for block processing
 *                    [default: use C for all block sizes]
 */
/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
123c67d83aSTony Hutter
133c67d83aSTony Hutter #include <sys/skein.h>
143c67d83aSTony Hutter #include "skein_impl.h"
153c67d83aSTony Hutter #include <sys/isa_defs.h> /* for _ILP32 */
163c67d83aSTony Hutter
/*
 * SKEIN_USE_ASM is a bit mask of block sizes (256/512/1024); the C
 * implementation of a block function is compiled only when its bit is clear.
 */
#ifndef	SKEIN_USE_ASM
#define	SKEIN_USE_ASM	(0)	/* default is all C code (no ASM) */
#endif

/*
 * SKEIN_LOOP is read as three decimal digits: the hundreds digit is the
 * unroll count for Skein-256, the tens digit for Skein-512 and the units
 * digit for Skein-1024.  A digit of 0 selects the fully unrolled code for
 * that block size; a non-zero digit N selects the looping code with N groups
 * of 8 rounds per loop iteration.
 *
 * Do not write the value with leading zeros: C treats such a constant as
 * octal, which silently changes the decoded digits for most values.
 */
#ifndef	SKEIN_LOOP
/*
 * The low-level checksum routines use a lot of stack space. On systems where
 * small stack frames are enforced (like 32-bit kernel builds), do not unroll
 * checksum calculations to save stack space.
 *
 * Even with no loops unrolled, we can still exceed the 1k stack frame limit
 * in Skein1024_Process_Block() (it hits 1272 bytes on ARM32). We can
 * safely ignore it though, since the checksum functions will be called
 * from a worker thread that won't be using much stack. That's why we have
 * the #pragma here to ignore the warning.
 */
#if defined(_ILP32) || defined(__powerpc)	/* Assume small stack */
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic ignored "-Wframe-larger-than="
#endif
/*
 * We're running on 32-bit, don't unroll loops to save stack frame space
 *
 * Due to the way the calculations on SKEIN_LOOP are done in
 * Skein_*_Process_Block(), a value of 111 disables unrolling loops
 * in any of those functions.
 */
#define	SKEIN_LOOP	111
#else
/* We're compiling with large stacks */
#define	SKEIN_LOOP	1	/* default: unroll 256 and 512, but not 1024 */
#endif
#endif

/* some useful definitions for code here */
#define	BLK_BITS	(WCNT*64)
#define	KW_TWK_BASE	(0)
#define	KW_KEY_BASE	(3)
/* ks and ts are views into the same local array kw[] of each block function */
#define	ks		(kw + KW_KEY_BASE)
#define	ts		(kw + KW_TWK_BASE)

/* no debugging in Illumos version */
#define	DebugSaveTweak(ctx)
613c67d83aSTony Hutter /* Skein_256 */
623c67d83aSTony Hutter #if !(SKEIN_USE_ASM & 256)
633c67d83aSTony Hutter void
Skein_256_Process_Block(Skein_256_Ctxt_t * ctx,const uint8_t * blkPtr,size_t blkCnt,size_t byteCntAdd)643c67d83aSTony Hutter Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
653c67d83aSTony Hutter size_t blkCnt, size_t byteCntAdd)
664ea3f864SGeorge Melikov {
673c67d83aSTony Hutter enum {
683c67d83aSTony Hutter WCNT = SKEIN_256_STATE_WORDS
693c67d83aSTony Hutter };
703c67d83aSTony Hutter #undef RCNT
713c67d83aSTony Hutter #define RCNT (SKEIN_256_ROUNDS_TOTAL / 8)
723c67d83aSTony Hutter
733c67d83aSTony Hutter #ifdef SKEIN_LOOP /* configure how much to unroll the loop */
743c67d83aSTony Hutter #define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
753c67d83aSTony Hutter #else
763c67d83aSTony Hutter #define SKEIN_UNROLL_256 (0)
773c67d83aSTony Hutter #endif
783c67d83aSTony Hutter
793c67d83aSTony Hutter #if SKEIN_UNROLL_256
803c67d83aSTony Hutter #if (RCNT % SKEIN_UNROLL_256)
813c67d83aSTony Hutter #error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */
823c67d83aSTony Hutter #endif
833c67d83aSTony Hutter size_t r;
843c67d83aSTony Hutter /* key schedule words : chaining vars + tweak + "rotation" */
853c67d83aSTony Hutter uint64_t kw[WCNT + 4 + RCNT * 2];
863c67d83aSTony Hutter #else
873c67d83aSTony Hutter uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
883c67d83aSTony Hutter #endif
893c67d83aSTony Hutter /* local copy of context vars, for speed */
903c67d83aSTony Hutter uint64_t X0, X1, X2, X3;
913c67d83aSTony Hutter uint64_t w[WCNT]; /* local copy of input block */
923c67d83aSTony Hutter #ifdef SKEIN_DEBUG
933c67d83aSTony Hutter /* use for debugging (help compiler put Xn in registers) */
943c67d83aSTony Hutter const uint64_t *Xptr[4];
953c67d83aSTony Hutter Xptr[0] = &X0;
963c67d83aSTony Hutter Xptr[1] = &X1;
973c67d83aSTony Hutter Xptr[2] = &X2;
983c67d83aSTony Hutter Xptr[3] = &X3;
993c67d83aSTony Hutter #endif
1003c67d83aSTony Hutter Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
1013c67d83aSTony Hutter ts[0] = ctx->h.T[0];
1023c67d83aSTony Hutter ts[1] = ctx->h.T[1];
1033c67d83aSTony Hutter do {
1043c67d83aSTony Hutter /*
1053c67d83aSTony Hutter * this implementation only supports 2**64 input bytes
1063c67d83aSTony Hutter * (no carry out here)
1073c67d83aSTony Hutter */
1083c67d83aSTony Hutter ts[0] += byteCntAdd; /* update processed length */
1093c67d83aSTony Hutter
1103c67d83aSTony Hutter /* precompute the key schedule for this block */
1113c67d83aSTony Hutter ks[0] = ctx->X[0];
1123c67d83aSTony Hutter ks[1] = ctx->X[1];
1133c67d83aSTony Hutter ks[2] = ctx->X[2];
1143c67d83aSTony Hutter ks[3] = ctx->X[3];
1153c67d83aSTony Hutter ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
1163c67d83aSTony Hutter
1173c67d83aSTony Hutter ts[2] = ts[0] ^ ts[1];
1183c67d83aSTony Hutter
1193c67d83aSTony Hutter /* get input block in little-endian format */
1203c67d83aSTony Hutter Skein_Get64_LSB_First(w, blkPtr, WCNT);
1213c67d83aSTony Hutter DebugSaveTweak(ctx);
1223c67d83aSTony Hutter Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
1233c67d83aSTony Hutter
1243c67d83aSTony Hutter X0 = w[0] + ks[0]; /* do the first full key injection */
1253c67d83aSTony Hutter X1 = w[1] + ks[1] + ts[0];
1263c67d83aSTony Hutter X2 = w[2] + ks[2] + ts[1];
1273c67d83aSTony Hutter X3 = w[3] + ks[3];
1283c67d83aSTony Hutter
1293c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
1303c67d83aSTony Hutter Xptr); /* show starting state values */
1313c67d83aSTony Hutter
1323c67d83aSTony Hutter blkPtr += SKEIN_256_BLOCK_BYTES;
1333c67d83aSTony Hutter
1343c67d83aSTony Hutter /* run the rounds */
1353c67d83aSTony Hutter
1363c67d83aSTony Hutter #define Round256(p0, p1, p2, p3, ROT, rNum) \
1373c67d83aSTony Hutter X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
1383c67d83aSTony Hutter X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
1393c67d83aSTony Hutter
1403c67d83aSTony Hutter #if SKEIN_UNROLL_256 == 0
1413c67d83aSTony Hutter #define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \
1423c67d83aSTony Hutter Round256(p0, p1, p2, p3, ROT, rNum) \
1433c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
1443c67d83aSTony Hutter
1453c67d83aSTony Hutter #define I256(R) \
1463c67d83aSTony Hutter X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \
1473c67d83aSTony Hutter X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \
1483c67d83aSTony Hutter X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \
1493c67d83aSTony Hutter X3 += ks[((R) + 4) % 5] + (R) + 1; \
1503c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
1513c67d83aSTony Hutter #else /* looping version */
1523c67d83aSTony Hutter #define R256(p0, p1, p2, p3, ROT, rNum) \
1533c67d83aSTony Hutter Round256(p0, p1, p2, p3, ROT, rNum) \
1543c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
1553c67d83aSTony Hutter
1563c67d83aSTony Hutter #define I256(R) \
1573c67d83aSTony Hutter X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \
1583c67d83aSTony Hutter X1 += ks[r + (R) + 1] + ts[r + (R) + 0]; \
1593c67d83aSTony Hutter X2 += ks[r + (R) + 2] + ts[r + (R) + 1]; \
1603c67d83aSTony Hutter X3 += ks[r + (R) + 3] + r + (R); \
1613c67d83aSTony Hutter ks[r + (R) + 4] = ks[r + (R) - 1]; /* rotate key schedule */ \
1623c67d83aSTony Hutter ts[r + (R) + 2] = ts[r + (R) - 1]; \
1633c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
1643c67d83aSTony Hutter
1659d40bdf4SAndrea Gelmini /* loop through it */
1663c67d83aSTony Hutter for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256)
1673c67d83aSTony Hutter #endif
1683c67d83aSTony Hutter {
1693c67d83aSTony Hutter #define R256_8_rounds(R) \
1703c67d83aSTony Hutter R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \
1713c67d83aSTony Hutter R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \
1723c67d83aSTony Hutter R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \
1733c67d83aSTony Hutter R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \
1743c67d83aSTony Hutter I256(2 * (R)); \
1753c67d83aSTony Hutter R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \
1763c67d83aSTony Hutter R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \
1773c67d83aSTony Hutter R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \
1783c67d83aSTony Hutter R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \
1793c67d83aSTony Hutter I256(2 * (R) + 1);
1803c67d83aSTony Hutter
1813c67d83aSTony Hutter R256_8_rounds(0);
1823c67d83aSTony Hutter
1833c67d83aSTony Hutter #define R256_Unroll_R(NN) \
1843c67d83aSTony Hutter ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \
1853c67d83aSTony Hutter (SKEIN_UNROLL_256 > (NN)))
1863c67d83aSTony Hutter
1873c67d83aSTony Hutter #if R256_Unroll_R(1)
1883c67d83aSTony Hutter R256_8_rounds(1);
1893c67d83aSTony Hutter #endif
1903c67d83aSTony Hutter #if R256_Unroll_R(2)
1913c67d83aSTony Hutter R256_8_rounds(2);
1923c67d83aSTony Hutter #endif
1933c67d83aSTony Hutter #if R256_Unroll_R(3)
1943c67d83aSTony Hutter R256_8_rounds(3);
1953c67d83aSTony Hutter #endif
1963c67d83aSTony Hutter #if R256_Unroll_R(4)
1973c67d83aSTony Hutter R256_8_rounds(4);
1983c67d83aSTony Hutter #endif
1993c67d83aSTony Hutter #if R256_Unroll_R(5)
2003c67d83aSTony Hutter R256_8_rounds(5);
2013c67d83aSTony Hutter #endif
2023c67d83aSTony Hutter #if R256_Unroll_R(6)
2033c67d83aSTony Hutter R256_8_rounds(6);
2043c67d83aSTony Hutter #endif
2053c67d83aSTony Hutter #if R256_Unroll_R(7)
2063c67d83aSTony Hutter R256_8_rounds(7);
2073c67d83aSTony Hutter #endif
2083c67d83aSTony Hutter #if R256_Unroll_R(8)
2093c67d83aSTony Hutter R256_8_rounds(8);
2103c67d83aSTony Hutter #endif
2113c67d83aSTony Hutter #if R256_Unroll_R(9)
2123c67d83aSTony Hutter R256_8_rounds(9);
2133c67d83aSTony Hutter #endif
2143c67d83aSTony Hutter #if R256_Unroll_R(10)
2153c67d83aSTony Hutter R256_8_rounds(10);
2163c67d83aSTony Hutter #endif
2173c67d83aSTony Hutter #if R256_Unroll_R(11)
2183c67d83aSTony Hutter R256_8_rounds(11);
2193c67d83aSTony Hutter #endif
2203c67d83aSTony Hutter #if R256_Unroll_R(12)
2213c67d83aSTony Hutter R256_8_rounds(12);
2223c67d83aSTony Hutter #endif
2233c67d83aSTony Hutter #if R256_Unroll_R(13)
2243c67d83aSTony Hutter R256_8_rounds(13);
2253c67d83aSTony Hutter #endif
2263c67d83aSTony Hutter #if R256_Unroll_R(14)
2273c67d83aSTony Hutter R256_8_rounds(14);
2283c67d83aSTony Hutter #endif
2293c67d83aSTony Hutter #if (SKEIN_UNROLL_256 > 14)
2303c67d83aSTony Hutter #error "need more unrolling in Skein_256_Process_Block"
2313c67d83aSTony Hutter #endif
2323c67d83aSTony Hutter }
2333c67d83aSTony Hutter /*
2343c67d83aSTony Hutter * do the final "feedforward" xor, update context chaining vars
2353c67d83aSTony Hutter */
2363c67d83aSTony Hutter ctx->X[0] = X0 ^ w[0];
2373c67d83aSTony Hutter ctx->X[1] = X1 ^ w[1];
2383c67d83aSTony Hutter ctx->X[2] = X2 ^ w[2];
2393c67d83aSTony Hutter ctx->X[3] = X3 ^ w[3];
2403c67d83aSTony Hutter
2413c67d83aSTony Hutter Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
2423c67d83aSTony Hutter
2433c67d83aSTony Hutter ts[1] &= ~SKEIN_T1_FLAG_FIRST;
2444ea3f864SGeorge Melikov } while (--blkCnt);
2453c67d83aSTony Hutter ctx->h.T[0] = ts[0];
2463c67d83aSTony Hutter ctx->h.T[1] = ts[1];
2473c67d83aSTony Hutter }
2483c67d83aSTony Hutter
2493c67d83aSTony Hutter #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
2503c67d83aSTony Hutter size_t
Skein_256_Process_Block_CodeSize(void)2513c67d83aSTony Hutter Skein_256_Process_Block_CodeSize(void)
2523c67d83aSTony Hutter {
2533c67d83aSTony Hutter return ((uint8_t *)Skein_256_Process_Block_CodeSize) -
2543c67d83aSTony Hutter ((uint8_t *)Skein_256_Process_Block);
2553c67d83aSTony Hutter }
2563c67d83aSTony Hutter
2573c67d83aSTony Hutter uint_t
Skein_256_Unroll_Cnt(void)2583c67d83aSTony Hutter Skein_256_Unroll_Cnt(void)
2593c67d83aSTony Hutter {
2603c67d83aSTony Hutter return (SKEIN_UNROLL_256);
2613c67d83aSTony Hutter }
2623c67d83aSTony Hutter #endif
2633c67d83aSTony Hutter #endif
2643c67d83aSTony Hutter
2653c67d83aSTony Hutter /* Skein_512 */
2663c67d83aSTony Hutter #if !(SKEIN_USE_ASM & 512)
2673c67d83aSTony Hutter void
Skein_512_Process_Block(Skein_512_Ctxt_t * ctx,const uint8_t * blkPtr,size_t blkCnt,size_t byteCntAdd)2683c67d83aSTony Hutter Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
2693c67d83aSTony Hutter size_t blkCnt, size_t byteCntAdd)
2704ea3f864SGeorge Melikov {
2713c67d83aSTony Hutter enum {
2723c67d83aSTony Hutter WCNT = SKEIN_512_STATE_WORDS
2733c67d83aSTony Hutter };
2743c67d83aSTony Hutter #undef RCNT
2753c67d83aSTony Hutter #define RCNT (SKEIN_512_ROUNDS_TOTAL / 8)
2763c67d83aSTony Hutter
2773c67d83aSTony Hutter #ifdef SKEIN_LOOP /* configure how much to unroll the loop */
2783c67d83aSTony Hutter #define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
2793c67d83aSTony Hutter #else
2803c67d83aSTony Hutter #define SKEIN_UNROLL_512 (0)
2813c67d83aSTony Hutter #endif
2823c67d83aSTony Hutter
2833c67d83aSTony Hutter #if SKEIN_UNROLL_512
2843c67d83aSTony Hutter #if (RCNT % SKEIN_UNROLL_512)
2853c67d83aSTony Hutter #error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */
2863c67d83aSTony Hutter #endif
2873c67d83aSTony Hutter size_t r;
2883c67d83aSTony Hutter /* key schedule words : chaining vars + tweak + "rotation" */
2893c67d83aSTony Hutter uint64_t kw[WCNT + 4 + RCNT * 2];
2903c67d83aSTony Hutter #else
2913c67d83aSTony Hutter uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
2923c67d83aSTony Hutter #endif
2933c67d83aSTony Hutter /* local copy of vars, for speed */
2943c67d83aSTony Hutter uint64_t X0, X1, X2, X3, X4, X5, X6, X7;
2953c67d83aSTony Hutter uint64_t w[WCNT]; /* local copy of input block */
2963c67d83aSTony Hutter #ifdef SKEIN_DEBUG
2973c67d83aSTony Hutter /* use for debugging (help compiler put Xn in registers) */
2983c67d83aSTony Hutter const uint64_t *Xptr[8];
2993c67d83aSTony Hutter Xptr[0] = &X0;
3003c67d83aSTony Hutter Xptr[1] = &X1;
3013c67d83aSTony Hutter Xptr[2] = &X2;
3023c67d83aSTony Hutter Xptr[3] = &X3;
3033c67d83aSTony Hutter Xptr[4] = &X4;
3043c67d83aSTony Hutter Xptr[5] = &X5;
3053c67d83aSTony Hutter Xptr[6] = &X6;
3063c67d83aSTony Hutter Xptr[7] = &X7;
3073c67d83aSTony Hutter #endif
3083c67d83aSTony Hutter
3093c67d83aSTony Hutter Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
3103c67d83aSTony Hutter ts[0] = ctx->h.T[0];
3113c67d83aSTony Hutter ts[1] = ctx->h.T[1];
3123c67d83aSTony Hutter do {
3133c67d83aSTony Hutter /*
3143c67d83aSTony Hutter * this implementation only supports 2**64 input bytes
3153c67d83aSTony Hutter * (no carry out here)
3163c67d83aSTony Hutter */
3173c67d83aSTony Hutter ts[0] += byteCntAdd; /* update processed length */
3183c67d83aSTony Hutter
3193c67d83aSTony Hutter /* precompute the key schedule for this block */
3203c67d83aSTony Hutter ks[0] = ctx->X[0];
3213c67d83aSTony Hutter ks[1] = ctx->X[1];
3223c67d83aSTony Hutter ks[2] = ctx->X[2];
3233c67d83aSTony Hutter ks[3] = ctx->X[3];
3243c67d83aSTony Hutter ks[4] = ctx->X[4];
3253c67d83aSTony Hutter ks[5] = ctx->X[5];
3263c67d83aSTony Hutter ks[6] = ctx->X[6];
3273c67d83aSTony Hutter ks[7] = ctx->X[7];
3283c67d83aSTony Hutter ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
3293c67d83aSTony Hutter ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
3303c67d83aSTony Hutter
3313c67d83aSTony Hutter ts[2] = ts[0] ^ ts[1];
3323c67d83aSTony Hutter
3333c67d83aSTony Hutter /* get input block in little-endian format */
3343c67d83aSTony Hutter Skein_Get64_LSB_First(w, blkPtr, WCNT);
3353c67d83aSTony Hutter DebugSaveTweak(ctx);
3363c67d83aSTony Hutter Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
3373c67d83aSTony Hutter
3383c67d83aSTony Hutter X0 = w[0] + ks[0]; /* do the first full key injection */
3393c67d83aSTony Hutter X1 = w[1] + ks[1];
3403c67d83aSTony Hutter X2 = w[2] + ks[2];
3413c67d83aSTony Hutter X3 = w[3] + ks[3];
3423c67d83aSTony Hutter X4 = w[4] + ks[4];
3433c67d83aSTony Hutter X5 = w[5] + ks[5] + ts[0];
3443c67d83aSTony Hutter X6 = w[6] + ks[6] + ts[1];
3453c67d83aSTony Hutter X7 = w[7] + ks[7];
3463c67d83aSTony Hutter
3473c67d83aSTony Hutter blkPtr += SKEIN_512_BLOCK_BYTES;
3483c67d83aSTony Hutter
3493c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
3503c67d83aSTony Hutter Xptr);
3513c67d83aSTony Hutter /* run the rounds */
3523c67d83aSTony Hutter #define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
3533c67d83aSTony Hutter X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
3543c67d83aSTony Hutter X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
3553c67d83aSTony Hutter X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
3563c67d83aSTony Hutter X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;
3573c67d83aSTony Hutter
3583c67d83aSTony Hutter #if SKEIN_UNROLL_512 == 0
3593c67d83aSTony Hutter #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \
3603c67d83aSTony Hutter Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
3613c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
3623c67d83aSTony Hutter
3633c67d83aSTony Hutter #define I512(R) \
3643c67d83aSTony Hutter X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */\
3653c67d83aSTony Hutter X1 += ks[((R) + 2) % 9]; \
3663c67d83aSTony Hutter X2 += ks[((R) + 3) % 9]; \
3673c67d83aSTony Hutter X3 += ks[((R) + 4) % 9]; \
3683c67d83aSTony Hutter X4 += ks[((R) + 5) % 9]; \
3693c67d83aSTony Hutter X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \
3703c67d83aSTony Hutter X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \
3713c67d83aSTony Hutter X7 += ks[((R) + 8) % 9] + (R) + 1; \
3723c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
3733c67d83aSTony Hutter #else /* looping version */
3743c67d83aSTony Hutter #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
3753c67d83aSTony Hutter Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
3763c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
3773c67d83aSTony Hutter
3783c67d83aSTony Hutter #define I512(R) \
3793c67d83aSTony Hutter X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \
3803c67d83aSTony Hutter X1 += ks[r + (R) + 1]; \
3813c67d83aSTony Hutter X2 += ks[r + (R) + 2]; \
3823c67d83aSTony Hutter X3 += ks[r + (R) + 3]; \
3833c67d83aSTony Hutter X4 += ks[r + (R) + 4]; \
3843c67d83aSTony Hutter X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \
3853c67d83aSTony Hutter X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \
3863c67d83aSTony Hutter X7 += ks[r + (R) + 7] + r + (R); \
3873c67d83aSTony Hutter ks[r + (R)+8] = ks[r + (R) - 1]; /* rotate key schedule */\
3883c67d83aSTony Hutter ts[r + (R)+2] = ts[r + (R) - 1]; \
3893c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
3903c67d83aSTony Hutter
3919d40bdf4SAndrea Gelmini /* loop through it */
3923c67d83aSTony Hutter for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512)
3933c67d83aSTony Hutter #endif /* end of looped code definitions */
3943c67d83aSTony Hutter {
3953c67d83aSTony Hutter #define R512_8_rounds(R) /* do 8 full rounds */ \
3963c67d83aSTony Hutter R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \
3973c67d83aSTony Hutter R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \
3983c67d83aSTony Hutter R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \
3993c67d83aSTony Hutter R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \
4003c67d83aSTony Hutter I512(2 * (R)); \
4013c67d83aSTony Hutter R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \
4023c67d83aSTony Hutter R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \
4033c67d83aSTony Hutter R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \
4043c67d83aSTony Hutter R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \
4053c67d83aSTony Hutter I512(2*(R) + 1); /* and key injection */
4063c67d83aSTony Hutter
4073c67d83aSTony Hutter R512_8_rounds(0);
4083c67d83aSTony Hutter
4093c67d83aSTony Hutter #define R512_Unroll_R(NN) \
4103c67d83aSTony Hutter ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \
4113c67d83aSTony Hutter (SKEIN_UNROLL_512 > (NN)))
4123c67d83aSTony Hutter
4133c67d83aSTony Hutter #if R512_Unroll_R(1)
4143c67d83aSTony Hutter R512_8_rounds(1);
4153c67d83aSTony Hutter #endif
4163c67d83aSTony Hutter #if R512_Unroll_R(2)
4173c67d83aSTony Hutter R512_8_rounds(2);
4183c67d83aSTony Hutter #endif
4193c67d83aSTony Hutter #if R512_Unroll_R(3)
4203c67d83aSTony Hutter R512_8_rounds(3);
4213c67d83aSTony Hutter #endif
4223c67d83aSTony Hutter #if R512_Unroll_R(4)
4233c67d83aSTony Hutter R512_8_rounds(4);
4243c67d83aSTony Hutter #endif
4253c67d83aSTony Hutter #if R512_Unroll_R(5)
4263c67d83aSTony Hutter R512_8_rounds(5);
4273c67d83aSTony Hutter #endif
4283c67d83aSTony Hutter #if R512_Unroll_R(6)
4293c67d83aSTony Hutter R512_8_rounds(6);
4303c67d83aSTony Hutter #endif
4313c67d83aSTony Hutter #if R512_Unroll_R(7)
4323c67d83aSTony Hutter R512_8_rounds(7);
4333c67d83aSTony Hutter #endif
4343c67d83aSTony Hutter #if R512_Unroll_R(8)
4353c67d83aSTony Hutter R512_8_rounds(8);
4363c67d83aSTony Hutter #endif
4373c67d83aSTony Hutter #if R512_Unroll_R(9)
4383c67d83aSTony Hutter R512_8_rounds(9);
4393c67d83aSTony Hutter #endif
4403c67d83aSTony Hutter #if R512_Unroll_R(10)
4413c67d83aSTony Hutter R512_8_rounds(10);
4423c67d83aSTony Hutter #endif
4433c67d83aSTony Hutter #if R512_Unroll_R(11)
4443c67d83aSTony Hutter R512_8_rounds(11);
4453c67d83aSTony Hutter #endif
4463c67d83aSTony Hutter #if R512_Unroll_R(12)
4473c67d83aSTony Hutter R512_8_rounds(12);
4483c67d83aSTony Hutter #endif
4493c67d83aSTony Hutter #if R512_Unroll_R(13)
4503c67d83aSTony Hutter R512_8_rounds(13);
4513c67d83aSTony Hutter #endif
4523c67d83aSTony Hutter #if R512_Unroll_R(14)
4533c67d83aSTony Hutter R512_8_rounds(14);
4543c67d83aSTony Hutter #endif
4553c67d83aSTony Hutter #if (SKEIN_UNROLL_512 > 14)
4563c67d83aSTony Hutter #error "need more unrolling in Skein_512_Process_Block"
4573c67d83aSTony Hutter #endif
4583c67d83aSTony Hutter }
4593c67d83aSTony Hutter
4603c67d83aSTony Hutter /*
4613c67d83aSTony Hutter * do the final "feedforward" xor, update context chaining vars
4623c67d83aSTony Hutter */
4633c67d83aSTony Hutter ctx->X[0] = X0 ^ w[0];
4643c67d83aSTony Hutter ctx->X[1] = X1 ^ w[1];
4653c67d83aSTony Hutter ctx->X[2] = X2 ^ w[2];
4663c67d83aSTony Hutter ctx->X[3] = X3 ^ w[3];
4673c67d83aSTony Hutter ctx->X[4] = X4 ^ w[4];
4683c67d83aSTony Hutter ctx->X[5] = X5 ^ w[5];
4693c67d83aSTony Hutter ctx->X[6] = X6 ^ w[6];
4703c67d83aSTony Hutter ctx->X[7] = X7 ^ w[7];
4713c67d83aSTony Hutter Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
4723c67d83aSTony Hutter
4733c67d83aSTony Hutter ts[1] &= ~SKEIN_T1_FLAG_FIRST;
4744ea3f864SGeorge Melikov } while (--blkCnt);
4753c67d83aSTony Hutter ctx->h.T[0] = ts[0];
4763c67d83aSTony Hutter ctx->h.T[1] = ts[1];
4773c67d83aSTony Hutter }
4783c67d83aSTony Hutter
4793c67d83aSTony Hutter #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
4803c67d83aSTony Hutter size_t
Skein_512_Process_Block_CodeSize(void)4813c67d83aSTony Hutter Skein_512_Process_Block_CodeSize(void)
4823c67d83aSTony Hutter {
4833c67d83aSTony Hutter return ((uint8_t *)Skein_512_Process_Block_CodeSize) -
4843c67d83aSTony Hutter ((uint8_t *)Skein_512_Process_Block);
4853c67d83aSTony Hutter }
4863c67d83aSTony Hutter
4873c67d83aSTony Hutter uint_t
Skein_512_Unroll_Cnt(void)4883c67d83aSTony Hutter Skein_512_Unroll_Cnt(void)
4893c67d83aSTony Hutter {
4903c67d83aSTony Hutter return (SKEIN_UNROLL_512);
4913c67d83aSTony Hutter }
4923c67d83aSTony Hutter #endif
4933c67d83aSTony Hutter #endif
4943c67d83aSTony Hutter
4953c67d83aSTony Hutter /* Skein1024 */
4963c67d83aSTony Hutter #if !(SKEIN_USE_ASM & 1024)
4973c67d83aSTony Hutter void
Skein1024_Process_Block(Skein1024_Ctxt_t * ctx,const uint8_t * blkPtr,size_t blkCnt,size_t byteCntAdd)4983c67d83aSTony Hutter Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
4993c67d83aSTony Hutter size_t blkCnt, size_t byteCntAdd)
5003c67d83aSTony Hutter {
5013c67d83aSTony Hutter /* do it in C, always looping (unrolled is bigger AND slower!) */
5023c67d83aSTony Hutter enum {
5033c67d83aSTony Hutter WCNT = SKEIN1024_STATE_WORDS
5043c67d83aSTony Hutter };
5053c67d83aSTony Hutter #undef RCNT
5063c67d83aSTony Hutter #define RCNT (SKEIN1024_ROUNDS_TOTAL/8)
5073c67d83aSTony Hutter
5083c67d83aSTony Hutter #ifdef SKEIN_LOOP /* configure how much to unroll the loop */
5093c67d83aSTony Hutter #define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
5103c67d83aSTony Hutter #else
5113c67d83aSTony Hutter #define SKEIN_UNROLL_1024 (0)
5123c67d83aSTony Hutter #endif
5133c67d83aSTony Hutter
5143c67d83aSTony Hutter #if (SKEIN_UNROLL_1024 != 0)
5153c67d83aSTony Hutter #if (RCNT % SKEIN_UNROLL_1024)
5163c67d83aSTony Hutter #error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */
5173c67d83aSTony Hutter #endif
5183c67d83aSTony Hutter size_t r;
5193c67d83aSTony Hutter /* key schedule words : chaining vars + tweak + "rotation" */
5203c67d83aSTony Hutter uint64_t kw[WCNT + 4 + RCNT * 2];
5213c67d83aSTony Hutter #else
5223c67d83aSTony Hutter uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
5233c67d83aSTony Hutter #endif
5243c67d83aSTony Hutter
5253c67d83aSTony Hutter /* local copy of vars, for speed */
5263c67d83aSTony Hutter uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11,
5273c67d83aSTony Hutter X12, X13, X14, X15;
5283c67d83aSTony Hutter uint64_t w[WCNT]; /* local copy of input block */
5293c67d83aSTony Hutter #ifdef SKEIN_DEBUG
5303c67d83aSTony Hutter /* use for debugging (help compiler put Xn in registers) */
5313c67d83aSTony Hutter const uint64_t *Xptr[16];
5323c67d83aSTony Hutter Xptr[0] = &X00;
5333c67d83aSTony Hutter Xptr[1] = &X01;
5343c67d83aSTony Hutter Xptr[2] = &X02;
5353c67d83aSTony Hutter Xptr[3] = &X03;
5363c67d83aSTony Hutter Xptr[4] = &X04;
5373c67d83aSTony Hutter Xptr[5] = &X05;
5383c67d83aSTony Hutter Xptr[6] = &X06;
5393c67d83aSTony Hutter Xptr[7] = &X07;
5403c67d83aSTony Hutter Xptr[8] = &X08;
5413c67d83aSTony Hutter Xptr[9] = &X09;
5423c67d83aSTony Hutter Xptr[10] = &X10;
5433c67d83aSTony Hutter Xptr[11] = &X11;
5443c67d83aSTony Hutter Xptr[12] = &X12;
5453c67d83aSTony Hutter Xptr[13] = &X13;
5463c67d83aSTony Hutter Xptr[14] = &X14;
5473c67d83aSTony Hutter Xptr[15] = &X15;
5483c67d83aSTony Hutter #endif
5493c67d83aSTony Hutter
5503c67d83aSTony Hutter Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
5513c67d83aSTony Hutter ts[0] = ctx->h.T[0];
5523c67d83aSTony Hutter ts[1] = ctx->h.T[1];
5533c67d83aSTony Hutter do {
5543c67d83aSTony Hutter /*
5553c67d83aSTony Hutter * this implementation only supports 2**64 input bytes
5563c67d83aSTony Hutter * (no carry out here)
5573c67d83aSTony Hutter */
5583c67d83aSTony Hutter ts[0] += byteCntAdd; /* update processed length */
5593c67d83aSTony Hutter
5603c67d83aSTony Hutter /* precompute the key schedule for this block */
5613c67d83aSTony Hutter ks[0] = ctx->X[0];
5623c67d83aSTony Hutter ks[1] = ctx->X[1];
5633c67d83aSTony Hutter ks[2] = ctx->X[2];
5643c67d83aSTony Hutter ks[3] = ctx->X[3];
5653c67d83aSTony Hutter ks[4] = ctx->X[4];
5663c67d83aSTony Hutter ks[5] = ctx->X[5];
5673c67d83aSTony Hutter ks[6] = ctx->X[6];
5683c67d83aSTony Hutter ks[7] = ctx->X[7];
5693c67d83aSTony Hutter ks[8] = ctx->X[8];
5703c67d83aSTony Hutter ks[9] = ctx->X[9];
5713c67d83aSTony Hutter ks[10] = ctx->X[10];
5723c67d83aSTony Hutter ks[11] = ctx->X[11];
5733c67d83aSTony Hutter ks[12] = ctx->X[12];
5743c67d83aSTony Hutter ks[13] = ctx->X[13];
5753c67d83aSTony Hutter ks[14] = ctx->X[14];
5763c67d83aSTony Hutter ks[15] = ctx->X[15];
5773c67d83aSTony Hutter ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
5783c67d83aSTony Hutter ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
5793c67d83aSTony Hutter ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
5803c67d83aSTony Hutter ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
5813c67d83aSTony Hutter
5823c67d83aSTony Hutter ts[2] = ts[0] ^ ts[1];
5833c67d83aSTony Hutter
5843c67d83aSTony Hutter /* get input block in little-endian format */
5853c67d83aSTony Hutter Skein_Get64_LSB_First(w, blkPtr, WCNT);
5863c67d83aSTony Hutter DebugSaveTweak(ctx);
5873c67d83aSTony Hutter Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
5883c67d83aSTony Hutter
5893c67d83aSTony Hutter X00 = w[0] + ks[0]; /* do the first full key injection */
5903c67d83aSTony Hutter X01 = w[1] + ks[1];
5913c67d83aSTony Hutter X02 = w[2] + ks[2];
5923c67d83aSTony Hutter X03 = w[3] + ks[3];
5933c67d83aSTony Hutter X04 = w[4] + ks[4];
5943c67d83aSTony Hutter X05 = w[5] + ks[5];
5953c67d83aSTony Hutter X06 = w[6] + ks[6];
5963c67d83aSTony Hutter X07 = w[7] + ks[7];
5973c67d83aSTony Hutter X08 = w[8] + ks[8];
5983c67d83aSTony Hutter X09 = w[9] + ks[9];
5993c67d83aSTony Hutter X10 = w[10] + ks[10];
6003c67d83aSTony Hutter X11 = w[11] + ks[11];
6013c67d83aSTony Hutter X12 = w[12] + ks[12];
6023c67d83aSTony Hutter X13 = w[13] + ks[13] + ts[0];
6033c67d83aSTony Hutter X14 = w[14] + ks[14] + ts[1];
6043c67d83aSTony Hutter X15 = w[15] + ks[15];
6053c67d83aSTony Hutter
6063c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
6073c67d83aSTony Hutter Xptr);
6083c67d83aSTony Hutter
6093c67d83aSTony Hutter #define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
6103c67d83aSTony Hutter pD, pE, pF, ROT, rNum) \
6113c67d83aSTony Hutter X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
6123c67d83aSTony Hutter X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
6133c67d83aSTony Hutter X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
6143c67d83aSTony Hutter X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\
6153c67d83aSTony Hutter X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\
6163c67d83aSTony Hutter X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\
6173c67d83aSTony Hutter X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\
6183c67d83aSTony Hutter X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;
6193c67d83aSTony Hutter
6203c67d83aSTony Hutter #if SKEIN_UNROLL_1024 == 0
6213c67d83aSTony Hutter #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \
6223c67d83aSTony Hutter pE, pF, ROT, rn) \
6233c67d83aSTony Hutter Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
6243c67d83aSTony Hutter pD, pE, pF, ROT, rn) \
6253c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
6263c67d83aSTony Hutter
6273c67d83aSTony Hutter #define I1024(R) \
6283c67d83aSTony Hutter X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */\
6293c67d83aSTony Hutter X01 += ks[((R) + 2) % 17]; \
6303c67d83aSTony Hutter X02 += ks[((R) + 3) % 17]; \
6313c67d83aSTony Hutter X03 += ks[((R) + 4) % 17]; \
6323c67d83aSTony Hutter X04 += ks[((R) + 5) % 17]; \
6333c67d83aSTony Hutter X05 += ks[((R) + 6) % 17]; \
6343c67d83aSTony Hutter X06 += ks[((R) + 7) % 17]; \
6353c67d83aSTony Hutter X07 += ks[((R) + 8) % 17]; \
6363c67d83aSTony Hutter X08 += ks[((R) + 9) % 17]; \
6373c67d83aSTony Hutter X09 += ks[((R) + 10) % 17]; \
6383c67d83aSTony Hutter X10 += ks[((R) + 11) % 17]; \
6393c67d83aSTony Hutter X11 += ks[((R) + 12) % 17]; \
6403c67d83aSTony Hutter X12 += ks[((R) + 13) % 17]; \
6413c67d83aSTony Hutter X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \
6423c67d83aSTony Hutter X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \
6433c67d83aSTony Hutter X15 += ks[((R) + 16) % 17] + (R) +1; \
6443c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
6453c67d83aSTony Hutter #else /* looping version */
6463c67d83aSTony Hutter #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \
6473c67d83aSTony Hutter pE, pF, ROT, rn) \
6483c67d83aSTony Hutter Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
6493c67d83aSTony Hutter pD, pE, pF, ROT, rn) \
6503c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
6513c67d83aSTony Hutter
6523c67d83aSTony Hutter #define I1024(R) \
6533c67d83aSTony Hutter X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \
6543c67d83aSTony Hutter X01 += ks[r + (R) + 1]; \
6553c67d83aSTony Hutter X02 += ks[r + (R) + 2]; \
6563c67d83aSTony Hutter X03 += ks[r + (R) + 3]; \
6573c67d83aSTony Hutter X04 += ks[r + (R) + 4]; \
6583c67d83aSTony Hutter X05 += ks[r + (R) + 5]; \
6593c67d83aSTony Hutter X06 += ks[r + (R) + 6]; \
6603c67d83aSTony Hutter X07 += ks[r + (R) + 7]; \
6613c67d83aSTony Hutter X08 += ks[r + (R) + 8]; \
6623c67d83aSTony Hutter X09 += ks[r + (R) + 9]; \
6633c67d83aSTony Hutter X10 += ks[r + (R) + 10]; \
6643c67d83aSTony Hutter X11 += ks[r + (R) + 11]; \
6653c67d83aSTony Hutter X12 += ks[r + (R) + 12]; \
6663c67d83aSTony Hutter X13 += ks[r + (R) + 13] + ts[r + (R) + 0]; \
6673c67d83aSTony Hutter X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \
6683c67d83aSTony Hutter X15 += ks[r + (R) + 15] + r + (R); \
6693c67d83aSTony Hutter ks[r + (R) + 16] = ks[r + (R) - 1]; /* rotate key schedule */\
6703c67d83aSTony Hutter ts[r + (R) + 2] = ts[r + (R) - 1]; \
6713c67d83aSTony Hutter Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
6723c67d83aSTony Hutter
6739d40bdf4SAndrea Gelmini /* loop through it */
6743c67d83aSTony Hutter for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024)
6753c67d83aSTony Hutter #endif
6763c67d83aSTony Hutter {
6773c67d83aSTony Hutter #define R1024_8_rounds(R) /* do 8 full rounds */ \
6783c67d83aSTony Hutter R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \
6793c67d83aSTony Hutter 14, 15, R1024_0, 8 * (R) + 1); \
6803c67d83aSTony Hutter R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \
6813c67d83aSTony Hutter 08, 01, R1024_1, 8 * (R) + 2); \
6823c67d83aSTony Hutter R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \
6833c67d83aSTony Hutter 10, 09, R1024_2, 8 * (R) + 3); \
6843c67d83aSTony Hutter R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \
6853c67d83aSTony Hutter 12, 07, R1024_3, 8 * (R) + 4); \
6863c67d83aSTony Hutter I1024(2 * (R)); \
6873c67d83aSTony Hutter R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \
6883c67d83aSTony Hutter 14, 15, R1024_4, 8 * (R) + 5); \
6893c67d83aSTony Hutter R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \
6903c67d83aSTony Hutter 08, 01, R1024_5, 8 * (R) + 6); \
6913c67d83aSTony Hutter R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \
6923c67d83aSTony Hutter 10, 09, R1024_6, 8 * (R) + 7); \
6933c67d83aSTony Hutter R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \
6943c67d83aSTony Hutter 12, 07, R1024_7, 8 * (R) + 8); \
6953c67d83aSTony Hutter I1024(2 * (R) + 1);
6963c67d83aSTony Hutter
6973c67d83aSTony Hutter R1024_8_rounds(0);
6983c67d83aSTony Hutter
6993c67d83aSTony Hutter #define R1024_Unroll_R(NN) \
7003c67d83aSTony Hutter ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || \
7013c67d83aSTony Hutter (SKEIN_UNROLL_1024 > (NN)))
7023c67d83aSTony Hutter
7033c67d83aSTony Hutter #if R1024_Unroll_R(1)
7043c67d83aSTony Hutter R1024_8_rounds(1);
7053c67d83aSTony Hutter #endif
7063c67d83aSTony Hutter #if R1024_Unroll_R(2)
7073c67d83aSTony Hutter R1024_8_rounds(2);
7083c67d83aSTony Hutter #endif
7093c67d83aSTony Hutter #if R1024_Unroll_R(3)
7103c67d83aSTony Hutter R1024_8_rounds(3);
7113c67d83aSTony Hutter #endif
7123c67d83aSTony Hutter #if R1024_Unroll_R(4)
7133c67d83aSTony Hutter R1024_8_rounds(4);
7143c67d83aSTony Hutter #endif
7153c67d83aSTony Hutter #if R1024_Unroll_R(5)
7163c67d83aSTony Hutter R1024_8_rounds(5);
7173c67d83aSTony Hutter #endif
7183c67d83aSTony Hutter #if R1024_Unroll_R(6)
7193c67d83aSTony Hutter R1024_8_rounds(6);
7203c67d83aSTony Hutter #endif
7213c67d83aSTony Hutter #if R1024_Unroll_R(7)
7223c67d83aSTony Hutter R1024_8_rounds(7);
7233c67d83aSTony Hutter #endif
7243c67d83aSTony Hutter #if R1024_Unroll_R(8)
7253c67d83aSTony Hutter R1024_8_rounds(8);
7263c67d83aSTony Hutter #endif
7273c67d83aSTony Hutter #if R1024_Unroll_R(9)
7283c67d83aSTony Hutter R1024_8_rounds(9);
7293c67d83aSTony Hutter #endif
7303c67d83aSTony Hutter #if R1024_Unroll_R(10)
7313c67d83aSTony Hutter R1024_8_rounds(10);
7323c67d83aSTony Hutter #endif
7333c67d83aSTony Hutter #if R1024_Unroll_R(11)
7343c67d83aSTony Hutter R1024_8_rounds(11);
7353c67d83aSTony Hutter #endif
7363c67d83aSTony Hutter #if R1024_Unroll_R(12)
7373c67d83aSTony Hutter R1024_8_rounds(12);
7383c67d83aSTony Hutter #endif
7393c67d83aSTony Hutter #if R1024_Unroll_R(13)
7403c67d83aSTony Hutter R1024_8_rounds(13);
7413c67d83aSTony Hutter #endif
7423c67d83aSTony Hutter #if R1024_Unroll_R(14)
7433c67d83aSTony Hutter R1024_8_rounds(14);
7443c67d83aSTony Hutter #endif
7453c67d83aSTony Hutter #if (SKEIN_UNROLL_1024 > 14)
7463c67d83aSTony Hutter #error "need more unrolling in Skein_1024_Process_Block"
7473c67d83aSTony Hutter #endif
7483c67d83aSTony Hutter }
7493c67d83aSTony Hutter /*
7503c67d83aSTony Hutter * do the final "feedforward" xor, update context chaining vars
7513c67d83aSTony Hutter */
7523c67d83aSTony Hutter
7533c67d83aSTony Hutter ctx->X[0] = X00 ^ w[0];
7543c67d83aSTony Hutter ctx->X[1] = X01 ^ w[1];
7553c67d83aSTony Hutter ctx->X[2] = X02 ^ w[2];
7563c67d83aSTony Hutter ctx->X[3] = X03 ^ w[3];
7573c67d83aSTony Hutter ctx->X[4] = X04 ^ w[4];
7583c67d83aSTony Hutter ctx->X[5] = X05 ^ w[5];
7593c67d83aSTony Hutter ctx->X[6] = X06 ^ w[6];
7603c67d83aSTony Hutter ctx->X[7] = X07 ^ w[7];
7613c67d83aSTony Hutter ctx->X[8] = X08 ^ w[8];
7623c67d83aSTony Hutter ctx->X[9] = X09 ^ w[9];
7633c67d83aSTony Hutter ctx->X[10] = X10 ^ w[10];
7643c67d83aSTony Hutter ctx->X[11] = X11 ^ w[11];
7653c67d83aSTony Hutter ctx->X[12] = X12 ^ w[12];
7663c67d83aSTony Hutter ctx->X[13] = X13 ^ w[13];
7673c67d83aSTony Hutter ctx->X[14] = X14 ^ w[14];
7683c67d83aSTony Hutter ctx->X[15] = X15 ^ w[15];
7693c67d83aSTony Hutter
7703c67d83aSTony Hutter Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
7713c67d83aSTony Hutter
7723c67d83aSTony Hutter ts[1] &= ~SKEIN_T1_FLAG_FIRST;
7733c67d83aSTony Hutter blkPtr += SKEIN1024_BLOCK_BYTES;
7743c67d83aSTony Hutter } while (--blkCnt);
7753c67d83aSTony Hutter ctx->h.T[0] = ts[0];
7763c67d83aSTony Hutter ctx->h.T[1] = ts[1];
7773c67d83aSTony Hutter }
7783c67d83aSTony Hutter
7793c67d83aSTony Hutter #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
7803c67d83aSTony Hutter size_t
Skein1024_Process_Block_CodeSize(void)7813c67d83aSTony Hutter Skein1024_Process_Block_CodeSize(void)
7823c67d83aSTony Hutter {
7833c67d83aSTony Hutter return ((uint8_t *)Skein1024_Process_Block_CodeSize) -
7843c67d83aSTony Hutter ((uint8_t *)Skein1024_Process_Block);
7853c67d83aSTony Hutter }
7863c67d83aSTony Hutter
7873c67d83aSTony Hutter uint_t
Skein1024_Unroll_Cnt(void)7883c67d83aSTony Hutter Skein1024_Unroll_Cnt(void)
7893c67d83aSTony Hutter {
7903c67d83aSTony Hutter return (SKEIN_UNROLL_1024);
7913c67d83aSTony Hutter }
7923c67d83aSTony Hutter #endif
7933c67d83aSTony Hutter #endif
794