xref: /src/sys/contrib/openzfs/module/icp/algs/skein/skein_block.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
// SPDX-License-Identifier: LicenseRef-OpenZFS-ThirdParty-PublicDomain
/*
 * Implementation of the Skein block functions.
 * Source code author: Doug Whiting, 2008.
 * This algorithm and source code is released to the public domain.
 * Compile-time switches:
 *  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
 *                    versions use ASM code for block processing
 *                    [default: use C for all block sizes]
 */
/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
123c67d83aSTony Hutter 
133c67d83aSTony Hutter #include <sys/skein.h>
143c67d83aSTony Hutter #include "skein_impl.h"
153c67d83aSTony Hutter #include <sys/isa_defs.h>	/* for _ILP32 */
163c67d83aSTony Hutter 
#ifndef	SKEIN_USE_ASM
#define	SKEIN_USE_ASM	(0)	/* default is all C code (no ASM) */
#endif

#ifndef	SKEIN_LOOP
/*
 * The low-level checksum routines use a lot of stack space. On systems where
 * small stack frames are enforced (like 32-bit kernel builds), do not unroll
 * checksum calculations to save stack space.
 *
 * Even with no loops unrolled, we still can exceed the 1k stack frame limit
 * in Skein1024_Process_Block() (it hits 1272 bytes on ARM32).  We can
 * safely ignore it though, since the checksum functions will be called
 * from a worker thread that won't be using much stack.  That's why we have
 * the #pragma here to ignore the warning.
 *
 * SKEIN_LOOP is decoded one decimal digit per block size: the hundreds
 * digit configures Skein_256, the tens digit Skein_512 and the ones digit
 * Skein1024.  A digit of 0 selects the fully unrolled code for that block
 * size; a non-zero digit selects the looping code.
 */
#if defined(_ILP32) || defined(__powerpc)	/* Assume small stack */
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic ignored "-Wframe-larger-than="
#endif
/*
 * We're running on 32-bit, don't unroll loops to save stack frame space
 *
 * Due to the ways the calculations on SKEIN_LOOP are done in
 * Skein_*_Process_Block(), a value of 111 disables unrolling loops
 * in any of those functions.
 */
#define	SKEIN_LOOP 111
#else
/*
 * We're compiling with large stacks.
 * Note: 001 is an octal literal, but its value is still 1.
 */
#define	SKEIN_LOOP 001		/* default: unroll 256 and 512, but not 1024 */
#endif
#endif

/* some useful definitions for code here */
#define	BLK_BITS	(WCNT*64)
#define	KW_TWK_BASE	(0)
#define	KW_KEY_BASE	(3)
/* ks and ts are two views into the same local kw[] array */
#define	ks		(kw + KW_KEY_BASE)
#define	ts		(kw + KW_TWK_BASE)

/* no debugging in Illumos version */
#define	DebugSaveTweak(ctx)
603c67d83aSTony Hutter 
613c67d83aSTony Hutter /* Skein_256 */
623c67d83aSTony Hutter #if	!(SKEIN_USE_ASM & 256)
633c67d83aSTony Hutter void
Skein_256_Process_Block(Skein_256_Ctxt_t * ctx,const uint8_t * blkPtr,size_t blkCnt,size_t byteCntAdd)643c67d83aSTony Hutter Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
653c67d83aSTony Hutter     size_t blkCnt, size_t byteCntAdd)
664ea3f864SGeorge Melikov {
673c67d83aSTony Hutter 	enum {
683c67d83aSTony Hutter 		WCNT = SKEIN_256_STATE_WORDS
693c67d83aSTony Hutter 	};
703c67d83aSTony Hutter #undef  RCNT
713c67d83aSTony Hutter #define	RCNT  (SKEIN_256_ROUNDS_TOTAL / 8)
723c67d83aSTony Hutter 
733c67d83aSTony Hutter #ifdef	SKEIN_LOOP		/* configure how much to unroll the loop */
743c67d83aSTony Hutter #define	SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
753c67d83aSTony Hutter #else
763c67d83aSTony Hutter #define	SKEIN_UNROLL_256 (0)
773c67d83aSTony Hutter #endif
783c67d83aSTony Hutter 
793c67d83aSTony Hutter #if	SKEIN_UNROLL_256
803c67d83aSTony Hutter #if	(RCNT % SKEIN_UNROLL_256)
813c67d83aSTony Hutter #error "Invalid SKEIN_UNROLL_256"	/* sanity check on unroll count */
823c67d83aSTony Hutter #endif
833c67d83aSTony Hutter 	size_t r;
843c67d83aSTony Hutter 	/* key schedule words : chaining vars + tweak + "rotation" */
853c67d83aSTony Hutter 	uint64_t kw[WCNT + 4 + RCNT * 2];
863c67d83aSTony Hutter #else
873c67d83aSTony Hutter 	uint64_t kw[WCNT + 4];	/* key schedule words : chaining vars + tweak */
883c67d83aSTony Hutter #endif
893c67d83aSTony Hutter 	/* local copy of context vars, for speed */
903c67d83aSTony Hutter 	uint64_t X0, X1, X2, X3;
913c67d83aSTony Hutter 	uint64_t w[WCNT];		/* local copy of input block */
923c67d83aSTony Hutter #ifdef	SKEIN_DEBUG
933c67d83aSTony Hutter 	/* use for debugging (help compiler put Xn in registers) */
943c67d83aSTony Hutter 	const uint64_t *Xptr[4];
953c67d83aSTony Hutter 	Xptr[0] = &X0;
963c67d83aSTony Hutter 	Xptr[1] = &X1;
973c67d83aSTony Hutter 	Xptr[2] = &X2;
983c67d83aSTony Hutter 	Xptr[3] = &X3;
993c67d83aSTony Hutter #endif
1003c67d83aSTony Hutter 	Skein_assert(blkCnt != 0);	/* never call with blkCnt == 0! */
1013c67d83aSTony Hutter 	ts[0] = ctx->h.T[0];
1023c67d83aSTony Hutter 	ts[1] = ctx->h.T[1];
1033c67d83aSTony Hutter 	do {
1043c67d83aSTony Hutter 		/*
1053c67d83aSTony Hutter 		 * this implementation only supports 2**64 input bytes
1063c67d83aSTony Hutter 		 * (no carry out here)
1073c67d83aSTony Hutter 		 */
1083c67d83aSTony Hutter 		ts[0] += byteCntAdd;	/* update processed length */
1093c67d83aSTony Hutter 
1103c67d83aSTony Hutter 		/* precompute the key schedule for this block */
1113c67d83aSTony Hutter 		ks[0] = ctx->X[0];
1123c67d83aSTony Hutter 		ks[1] = ctx->X[1];
1133c67d83aSTony Hutter 		ks[2] = ctx->X[2];
1143c67d83aSTony Hutter 		ks[3] = ctx->X[3];
1153c67d83aSTony Hutter 		ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
1163c67d83aSTony Hutter 
1173c67d83aSTony Hutter 		ts[2] = ts[0] ^ ts[1];
1183c67d83aSTony Hutter 
1193c67d83aSTony Hutter 		/* get input block in little-endian format */
1203c67d83aSTony Hutter 		Skein_Get64_LSB_First(w, blkPtr, WCNT);
1213c67d83aSTony Hutter 		DebugSaveTweak(ctx);
1223c67d83aSTony Hutter 		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
1233c67d83aSTony Hutter 
1243c67d83aSTony Hutter 		X0 = w[0] + ks[0];	/* do the first full key injection */
1253c67d83aSTony Hutter 		X1 = w[1] + ks[1] + ts[0];
1263c67d83aSTony Hutter 		X2 = w[2] + ks[2] + ts[1];
1273c67d83aSTony Hutter 		X3 = w[3] + ks[3];
1283c67d83aSTony Hutter 
1293c67d83aSTony Hutter 		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
1303c67d83aSTony Hutter 		    Xptr);	/* show starting state values */
1313c67d83aSTony Hutter 
1323c67d83aSTony Hutter 		blkPtr += SKEIN_256_BLOCK_BYTES;
1333c67d83aSTony Hutter 
1343c67d83aSTony Hutter 		/* run the rounds */
1353c67d83aSTony Hutter 
1363c67d83aSTony Hutter #define	Round256(p0, p1, p2, p3, ROT, rNum)                          \
1373c67d83aSTony Hutter 	X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
1383c67d83aSTony Hutter 	X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
1393c67d83aSTony Hutter 
1403c67d83aSTony Hutter #if	SKEIN_UNROLL_256 == 0
1413c67d83aSTony Hutter #define	R256(p0, p1, p2, p3, ROT, rNum)		/* fully unrolled */	\
1423c67d83aSTony Hutter 	Round256(p0, p1, p2, p3, ROT, rNum)		\
1433c67d83aSTony Hutter 	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
1443c67d83aSTony Hutter 
1453c67d83aSTony Hutter #define	I256(R)								\
1463c67d83aSTony Hutter 	X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \
1473c67d83aSTony Hutter 	X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3];			\
1483c67d83aSTony Hutter 	X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3];			\
1493c67d83aSTony Hutter 	X3 += ks[((R) + 4) % 5] + (R) + 1;			\
1503c67d83aSTony Hutter 	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
1513c67d83aSTony Hutter #else				/* looping version */
1523c67d83aSTony Hutter #define	R256(p0, p1, p2, p3, ROT, rNum)                             \
1533c67d83aSTony Hutter 	Round256(p0, p1, p2, p3, ROT, rNum)                             \
1543c67d83aSTony Hutter 	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
1553c67d83aSTony Hutter 
1563c67d83aSTony Hutter #define	I256(R)								\
1573c67d83aSTony Hutter 	X0 += ks[r + (R) + 0];	/* inject the key schedule value */	\
1583c67d83aSTony Hutter 	X1 += ks[r + (R) + 1] + ts[r + (R) + 0];			\
1593c67d83aSTony Hutter 	X2 += ks[r + (R) + 2] + ts[r + (R) + 1];			\
1603c67d83aSTony Hutter 	X3 += ks[r + (R) + 3] + r + (R);				\
1613c67d83aSTony Hutter 	ks[r + (R) + 4] = ks[r + (R) - 1];   /* rotate key schedule */	\
1623c67d83aSTony Hutter 	ts[r + (R) + 2] = ts[r + (R) - 1];			\
1633c67d83aSTony Hutter 	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
1643c67d83aSTony Hutter 
1659d40bdf4SAndrea Gelmini 		/* loop through it */
1663c67d83aSTony Hutter 		for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256)
1673c67d83aSTony Hutter #endif
1683c67d83aSTony Hutter 		{
1693c67d83aSTony Hutter #define	R256_8_rounds(R)                         \
1703c67d83aSTony Hutter 	R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1);  \
1713c67d83aSTony Hutter 	R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2);  \
1723c67d83aSTony Hutter 	R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3);  \
1733c67d83aSTony Hutter 	R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4);  \
1743c67d83aSTony Hutter 	I256(2 * (R));                           \
1753c67d83aSTony Hutter 	R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5);  \
1763c67d83aSTony Hutter 	R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6);  \
1773c67d83aSTony Hutter 	R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7);  \
1783c67d83aSTony Hutter 	R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8);  \
1793c67d83aSTony Hutter 	I256(2 * (R) + 1);
1803c67d83aSTony Hutter 
1813c67d83aSTony Hutter 			R256_8_rounds(0);
1823c67d83aSTony Hutter 
1833c67d83aSTony Hutter #define	R256_Unroll_R(NN) \
1843c67d83aSTony Hutter 	((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \
1853c67d83aSTony Hutter 	(SKEIN_UNROLL_256 > (NN)))
1863c67d83aSTony Hutter 
1873c67d83aSTony Hutter #if	R256_Unroll_R(1)
1883c67d83aSTony Hutter 			R256_8_rounds(1);
1893c67d83aSTony Hutter #endif
1903c67d83aSTony Hutter #if	R256_Unroll_R(2)
1913c67d83aSTony Hutter 			R256_8_rounds(2);
1923c67d83aSTony Hutter #endif
1933c67d83aSTony Hutter #if	R256_Unroll_R(3)
1943c67d83aSTony Hutter 			R256_8_rounds(3);
1953c67d83aSTony Hutter #endif
1963c67d83aSTony Hutter #if	R256_Unroll_R(4)
1973c67d83aSTony Hutter 			R256_8_rounds(4);
1983c67d83aSTony Hutter #endif
1993c67d83aSTony Hutter #if	R256_Unroll_R(5)
2003c67d83aSTony Hutter 			R256_8_rounds(5);
2013c67d83aSTony Hutter #endif
2023c67d83aSTony Hutter #if	R256_Unroll_R(6)
2033c67d83aSTony Hutter 			R256_8_rounds(6);
2043c67d83aSTony Hutter #endif
2053c67d83aSTony Hutter #if	R256_Unroll_R(7)
2063c67d83aSTony Hutter 			R256_8_rounds(7);
2073c67d83aSTony Hutter #endif
2083c67d83aSTony Hutter #if	R256_Unroll_R(8)
2093c67d83aSTony Hutter 			R256_8_rounds(8);
2103c67d83aSTony Hutter #endif
2113c67d83aSTony Hutter #if	R256_Unroll_R(9)
2123c67d83aSTony Hutter 			R256_8_rounds(9);
2133c67d83aSTony Hutter #endif
2143c67d83aSTony Hutter #if	R256_Unroll_R(10)
2153c67d83aSTony Hutter 			R256_8_rounds(10);
2163c67d83aSTony Hutter #endif
2173c67d83aSTony Hutter #if	R256_Unroll_R(11)
2183c67d83aSTony Hutter 			R256_8_rounds(11);
2193c67d83aSTony Hutter #endif
2203c67d83aSTony Hutter #if	R256_Unroll_R(12)
2213c67d83aSTony Hutter 			R256_8_rounds(12);
2223c67d83aSTony Hutter #endif
2233c67d83aSTony Hutter #if	R256_Unroll_R(13)
2243c67d83aSTony Hutter 			R256_8_rounds(13);
2253c67d83aSTony Hutter #endif
2263c67d83aSTony Hutter #if	R256_Unroll_R(14)
2273c67d83aSTony Hutter 			R256_8_rounds(14);
2283c67d83aSTony Hutter #endif
2293c67d83aSTony Hutter #if	(SKEIN_UNROLL_256 > 14)
2303c67d83aSTony Hutter #error  "need more unrolling in Skein_256_Process_Block"
2313c67d83aSTony Hutter #endif
2323c67d83aSTony Hutter 		}
2333c67d83aSTony Hutter 		/*
2343c67d83aSTony Hutter 		 * do the final "feedforward" xor, update context chaining vars
2353c67d83aSTony Hutter 		 */
2363c67d83aSTony Hutter 		ctx->X[0] = X0 ^ w[0];
2373c67d83aSTony Hutter 		ctx->X[1] = X1 ^ w[1];
2383c67d83aSTony Hutter 		ctx->X[2] = X2 ^ w[2];
2393c67d83aSTony Hutter 		ctx->X[3] = X3 ^ w[3];
2403c67d83aSTony Hutter 
2413c67d83aSTony Hutter 		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
2423c67d83aSTony Hutter 
2433c67d83aSTony Hutter 		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
2444ea3f864SGeorge Melikov 	} while (--blkCnt);
2453c67d83aSTony Hutter 	ctx->h.T[0] = ts[0];
2463c67d83aSTony Hutter 	ctx->h.T[1] = ts[1];
2473c67d83aSTony Hutter }
2483c67d83aSTony Hutter 
2493c67d83aSTony Hutter #if	defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
2503c67d83aSTony Hutter size_t
Skein_256_Process_Block_CodeSize(void)2513c67d83aSTony Hutter Skein_256_Process_Block_CodeSize(void)
2523c67d83aSTony Hutter {
2533c67d83aSTony Hutter 	return ((uint8_t *)Skein_256_Process_Block_CodeSize) -
2543c67d83aSTony Hutter 	    ((uint8_t *)Skein_256_Process_Block);
2553c67d83aSTony Hutter }
2563c67d83aSTony Hutter 
2573c67d83aSTony Hutter uint_t
Skein_256_Unroll_Cnt(void)2583c67d83aSTony Hutter Skein_256_Unroll_Cnt(void)
2593c67d83aSTony Hutter {
2603c67d83aSTony Hutter 	return (SKEIN_UNROLL_256);
2613c67d83aSTony Hutter }
2623c67d83aSTony Hutter #endif
2633c67d83aSTony Hutter #endif
2643c67d83aSTony Hutter 
2653c67d83aSTony Hutter /* Skein_512 */
2663c67d83aSTony Hutter #if	!(SKEIN_USE_ASM & 512)
2673c67d83aSTony Hutter void
Skein_512_Process_Block(Skein_512_Ctxt_t * ctx,const uint8_t * blkPtr,size_t blkCnt,size_t byteCntAdd)2683c67d83aSTony Hutter Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
2693c67d83aSTony Hutter     size_t blkCnt, size_t byteCntAdd)
2704ea3f864SGeorge Melikov {
2713c67d83aSTony Hutter 	enum {
2723c67d83aSTony Hutter 		WCNT = SKEIN_512_STATE_WORDS
2733c67d83aSTony Hutter 	};
2743c67d83aSTony Hutter #undef  RCNT
2753c67d83aSTony Hutter #define	RCNT  (SKEIN_512_ROUNDS_TOTAL / 8)
2763c67d83aSTony Hutter 
2773c67d83aSTony Hutter #ifdef	SKEIN_LOOP		/* configure how much to unroll the loop */
2783c67d83aSTony Hutter #define	SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
2793c67d83aSTony Hutter #else
2803c67d83aSTony Hutter #define	SKEIN_UNROLL_512 (0)
2813c67d83aSTony Hutter #endif
2823c67d83aSTony Hutter 
2833c67d83aSTony Hutter #if	SKEIN_UNROLL_512
2843c67d83aSTony Hutter #if	(RCNT % SKEIN_UNROLL_512)
2853c67d83aSTony Hutter #error "Invalid SKEIN_UNROLL_512"	/* sanity check on unroll count */
2863c67d83aSTony Hutter #endif
2873c67d83aSTony Hutter 	size_t r;
2883c67d83aSTony Hutter 	/* key schedule words : chaining vars + tweak + "rotation" */
2893c67d83aSTony Hutter 	uint64_t kw[WCNT + 4 + RCNT * 2];
2903c67d83aSTony Hutter #else
2913c67d83aSTony Hutter 	uint64_t kw[WCNT + 4];	/* key schedule words : chaining vars + tweak */
2923c67d83aSTony Hutter #endif
2933c67d83aSTony Hutter 	/* local copy of vars, for speed */
2943c67d83aSTony Hutter 	uint64_t X0, X1, X2, X3, X4, X5, X6, X7;
2953c67d83aSTony Hutter 	uint64_t w[WCNT];		/* local copy of input block */
2963c67d83aSTony Hutter #ifdef	SKEIN_DEBUG
2973c67d83aSTony Hutter 	/* use for debugging (help compiler put Xn in registers) */
2983c67d83aSTony Hutter 	const uint64_t *Xptr[8];
2993c67d83aSTony Hutter 	Xptr[0] = &X0;
3003c67d83aSTony Hutter 	Xptr[1] = &X1;
3013c67d83aSTony Hutter 	Xptr[2] = &X2;
3023c67d83aSTony Hutter 	Xptr[3] = &X3;
3033c67d83aSTony Hutter 	Xptr[4] = &X4;
3043c67d83aSTony Hutter 	Xptr[5] = &X5;
3053c67d83aSTony Hutter 	Xptr[6] = &X6;
3063c67d83aSTony Hutter 	Xptr[7] = &X7;
3073c67d83aSTony Hutter #endif
3083c67d83aSTony Hutter 
3093c67d83aSTony Hutter 	Skein_assert(blkCnt != 0);	/* never call with blkCnt == 0! */
3103c67d83aSTony Hutter 	ts[0] = ctx->h.T[0];
3113c67d83aSTony Hutter 	ts[1] = ctx->h.T[1];
3123c67d83aSTony Hutter 	do {
3133c67d83aSTony Hutter 		/*
3143c67d83aSTony Hutter 		 * this implementation only supports 2**64 input bytes
3153c67d83aSTony Hutter 		 * (no carry out here)
3163c67d83aSTony Hutter 		 */
3173c67d83aSTony Hutter 		ts[0] += byteCntAdd;	/* update processed length */
3183c67d83aSTony Hutter 
3193c67d83aSTony Hutter 		/* precompute the key schedule for this block */
3203c67d83aSTony Hutter 		ks[0] = ctx->X[0];
3213c67d83aSTony Hutter 		ks[1] = ctx->X[1];
3223c67d83aSTony Hutter 		ks[2] = ctx->X[2];
3233c67d83aSTony Hutter 		ks[3] = ctx->X[3];
3243c67d83aSTony Hutter 		ks[4] = ctx->X[4];
3253c67d83aSTony Hutter 		ks[5] = ctx->X[5];
3263c67d83aSTony Hutter 		ks[6] = ctx->X[6];
3273c67d83aSTony Hutter 		ks[7] = ctx->X[7];
3283c67d83aSTony Hutter 		ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
3293c67d83aSTony Hutter 		    ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
3303c67d83aSTony Hutter 
3313c67d83aSTony Hutter 		ts[2] = ts[0] ^ ts[1];
3323c67d83aSTony Hutter 
3333c67d83aSTony Hutter 		/* get input block in little-endian format */
3343c67d83aSTony Hutter 		Skein_Get64_LSB_First(w, blkPtr, WCNT);
3353c67d83aSTony Hutter 		DebugSaveTweak(ctx);
3363c67d83aSTony Hutter 		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
3373c67d83aSTony Hutter 
3383c67d83aSTony Hutter 		X0 = w[0] + ks[0];	/* do the first full key injection */
3393c67d83aSTony Hutter 		X1 = w[1] + ks[1];
3403c67d83aSTony Hutter 		X2 = w[2] + ks[2];
3413c67d83aSTony Hutter 		X3 = w[3] + ks[3];
3423c67d83aSTony Hutter 		X4 = w[4] + ks[4];
3433c67d83aSTony Hutter 		X5 = w[5] + ks[5] + ts[0];
3443c67d83aSTony Hutter 		X6 = w[6] + ks[6] + ts[1];
3453c67d83aSTony Hutter 		X7 = w[7] + ks[7];
3463c67d83aSTony Hutter 
3473c67d83aSTony Hutter 		blkPtr += SKEIN_512_BLOCK_BYTES;
3483c67d83aSTony Hutter 
3493c67d83aSTony Hutter 		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
3503c67d83aSTony Hutter 		    Xptr);
3513c67d83aSTony Hutter 		/* run the rounds */
3523c67d83aSTony Hutter #define	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)		\
3533c67d83aSTony Hutter 	X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
3543c67d83aSTony Hutter 	X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
3553c67d83aSTony Hutter 	X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
3563c67d83aSTony Hutter 	X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;
3573c67d83aSTony Hutter 
3583c67d83aSTony Hutter #if	SKEIN_UNROLL_512 == 0
3593c67d83aSTony Hutter #define	R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)	/* unrolled */	\
3603c67d83aSTony Hutter 	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)		\
3613c67d83aSTony Hutter 	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
3623c67d83aSTony Hutter 
3633c67d83aSTony Hutter #define	I512(R)								\
3643c67d83aSTony Hutter 	X0 += ks[((R) + 1) % 9];	/* inject the key schedule value */\
3653c67d83aSTony Hutter 	X1 += ks[((R) + 2) % 9];					\
3663c67d83aSTony Hutter 	X2 += ks[((R) + 3) % 9];					\
3673c67d83aSTony Hutter 	X3 += ks[((R) + 4) % 9];					\
3683c67d83aSTony Hutter 	X4 += ks[((R) + 5) % 9];					\
3693c67d83aSTony Hutter 	X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3];			\
3703c67d83aSTony Hutter 	X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3];			\
3713c67d83aSTony Hutter 	X7 += ks[((R) + 8) % 9] + (R) + 1;				\
3723c67d83aSTony Hutter 	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
3733c67d83aSTony Hutter #else				/* looping version */
3743c67d83aSTony Hutter #define	R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)			\
3753c67d83aSTony Hutter 	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)		\
3763c67d83aSTony Hutter 	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
3773c67d83aSTony Hutter 
3783c67d83aSTony Hutter #define	I512(R)								\
3793c67d83aSTony Hutter 	X0 += ks[r + (R) + 0];	/* inject the key schedule value */	\
3803c67d83aSTony Hutter 	X1 += ks[r + (R) + 1];						\
3813c67d83aSTony Hutter 	X2 += ks[r + (R) + 2];						\
3823c67d83aSTony Hutter 	X3 += ks[r + (R) + 3];						\
3833c67d83aSTony Hutter 	X4 += ks[r + (R) + 4];						\
3843c67d83aSTony Hutter 	X5 += ks[r + (R) + 5] + ts[r + (R) + 0];			\
3853c67d83aSTony Hutter 	X6 += ks[r + (R) + 6] + ts[r + (R) + 1];			\
3863c67d83aSTony Hutter 	X7 += ks[r + (R) + 7] + r + (R);				\
3873c67d83aSTony Hutter 	ks[r + (R)+8] = ks[r + (R) - 1];	/* rotate key schedule */\
3883c67d83aSTony Hutter 	ts[r + (R)+2] = ts[r + (R) - 1];				\
3893c67d83aSTony Hutter 	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
3903c67d83aSTony Hutter 
3919d40bdf4SAndrea Gelmini 		/* loop through it */
3923c67d83aSTony Hutter 		for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512)
3933c67d83aSTony Hutter #endif				/* end of looped code definitions */
3943c67d83aSTony Hutter 		{
3953c67d83aSTony Hutter #define	R512_8_rounds(R)	/* do 8 full rounds */			\
3963c67d83aSTony Hutter 	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1);		\
3973c67d83aSTony Hutter 	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2);		\
3983c67d83aSTony Hutter 	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3);		\
3993c67d83aSTony Hutter 	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4);		\
4003c67d83aSTony Hutter 	I512(2 * (R));							\
4013c67d83aSTony Hutter 	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5);		\
4023c67d83aSTony Hutter 	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6);		\
4033c67d83aSTony Hutter 	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7);		\
4043c67d83aSTony Hutter 	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8);		\
4053c67d83aSTony Hutter 	I512(2*(R) + 1);		/* and key injection */
4063c67d83aSTony Hutter 
4073c67d83aSTony Hutter 			R512_8_rounds(0);
4083c67d83aSTony Hutter 
4093c67d83aSTony Hutter #define	R512_Unroll_R(NN) \
4103c67d83aSTony Hutter 	((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \
4113c67d83aSTony Hutter 	(SKEIN_UNROLL_512 > (NN)))
4123c67d83aSTony Hutter 
4133c67d83aSTony Hutter #if	R512_Unroll_R(1)
4143c67d83aSTony Hutter 			R512_8_rounds(1);
4153c67d83aSTony Hutter #endif
4163c67d83aSTony Hutter #if	R512_Unroll_R(2)
4173c67d83aSTony Hutter 			R512_8_rounds(2);
4183c67d83aSTony Hutter #endif
4193c67d83aSTony Hutter #if	R512_Unroll_R(3)
4203c67d83aSTony Hutter 			R512_8_rounds(3);
4213c67d83aSTony Hutter #endif
4223c67d83aSTony Hutter #if	R512_Unroll_R(4)
4233c67d83aSTony Hutter 			R512_8_rounds(4);
4243c67d83aSTony Hutter #endif
4253c67d83aSTony Hutter #if	R512_Unroll_R(5)
4263c67d83aSTony Hutter 			R512_8_rounds(5);
4273c67d83aSTony Hutter #endif
4283c67d83aSTony Hutter #if	R512_Unroll_R(6)
4293c67d83aSTony Hutter 			R512_8_rounds(6);
4303c67d83aSTony Hutter #endif
4313c67d83aSTony Hutter #if	R512_Unroll_R(7)
4323c67d83aSTony Hutter 			R512_8_rounds(7);
4333c67d83aSTony Hutter #endif
4343c67d83aSTony Hutter #if	R512_Unroll_R(8)
4353c67d83aSTony Hutter 			R512_8_rounds(8);
4363c67d83aSTony Hutter #endif
4373c67d83aSTony Hutter #if	R512_Unroll_R(9)
4383c67d83aSTony Hutter 			R512_8_rounds(9);
4393c67d83aSTony Hutter #endif
4403c67d83aSTony Hutter #if	R512_Unroll_R(10)
4413c67d83aSTony Hutter 			R512_8_rounds(10);
4423c67d83aSTony Hutter #endif
4433c67d83aSTony Hutter #if	R512_Unroll_R(11)
4443c67d83aSTony Hutter 			R512_8_rounds(11);
4453c67d83aSTony Hutter #endif
4463c67d83aSTony Hutter #if	R512_Unroll_R(12)
4473c67d83aSTony Hutter 			R512_8_rounds(12);
4483c67d83aSTony Hutter #endif
4493c67d83aSTony Hutter #if	R512_Unroll_R(13)
4503c67d83aSTony Hutter 			R512_8_rounds(13);
4513c67d83aSTony Hutter #endif
4523c67d83aSTony Hutter #if	R512_Unroll_R(14)
4533c67d83aSTony Hutter 			R512_8_rounds(14);
4543c67d83aSTony Hutter #endif
4553c67d83aSTony Hutter #if	(SKEIN_UNROLL_512 > 14)
4563c67d83aSTony Hutter #error "need more unrolling in Skein_512_Process_Block"
4573c67d83aSTony Hutter #endif
4583c67d83aSTony Hutter 		}
4593c67d83aSTony Hutter 
4603c67d83aSTony Hutter 		/*
4613c67d83aSTony Hutter 		 * do the final "feedforward" xor, update context chaining vars
4623c67d83aSTony Hutter 		 */
4633c67d83aSTony Hutter 		ctx->X[0] = X0 ^ w[0];
4643c67d83aSTony Hutter 		ctx->X[1] = X1 ^ w[1];
4653c67d83aSTony Hutter 		ctx->X[2] = X2 ^ w[2];
4663c67d83aSTony Hutter 		ctx->X[3] = X3 ^ w[3];
4673c67d83aSTony Hutter 		ctx->X[4] = X4 ^ w[4];
4683c67d83aSTony Hutter 		ctx->X[5] = X5 ^ w[5];
4693c67d83aSTony Hutter 		ctx->X[6] = X6 ^ w[6];
4703c67d83aSTony Hutter 		ctx->X[7] = X7 ^ w[7];
4713c67d83aSTony Hutter 		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
4723c67d83aSTony Hutter 
4733c67d83aSTony Hutter 		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
4744ea3f864SGeorge Melikov 	} while (--blkCnt);
4753c67d83aSTony Hutter 	ctx->h.T[0] = ts[0];
4763c67d83aSTony Hutter 	ctx->h.T[1] = ts[1];
4773c67d83aSTony Hutter }
4783c67d83aSTony Hutter 
4793c67d83aSTony Hutter #if	defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
4803c67d83aSTony Hutter size_t
Skein_512_Process_Block_CodeSize(void)4813c67d83aSTony Hutter Skein_512_Process_Block_CodeSize(void)
4823c67d83aSTony Hutter {
4833c67d83aSTony Hutter 	return ((uint8_t *)Skein_512_Process_Block_CodeSize) -
4843c67d83aSTony Hutter 	    ((uint8_t *)Skein_512_Process_Block);
4853c67d83aSTony Hutter }
4863c67d83aSTony Hutter 
4873c67d83aSTony Hutter uint_t
Skein_512_Unroll_Cnt(void)4883c67d83aSTony Hutter Skein_512_Unroll_Cnt(void)
4893c67d83aSTony Hutter {
4903c67d83aSTony Hutter 	return (SKEIN_UNROLL_512);
4913c67d83aSTony Hutter }
4923c67d83aSTony Hutter #endif
4933c67d83aSTony Hutter #endif
4943c67d83aSTony Hutter 
4953c67d83aSTony Hutter /*  Skein1024 */
4963c67d83aSTony Hutter #if	!(SKEIN_USE_ASM & 1024)
4973c67d83aSTony Hutter void
Skein1024_Process_Block(Skein1024_Ctxt_t * ctx,const uint8_t * blkPtr,size_t blkCnt,size_t byteCntAdd)4983c67d83aSTony Hutter Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
4993c67d83aSTony Hutter     size_t blkCnt, size_t byteCntAdd)
5003c67d83aSTony Hutter {
5013c67d83aSTony Hutter 	/* do it in C, always looping (unrolled is bigger AND slower!) */
5023c67d83aSTony Hutter 	enum {
5033c67d83aSTony Hutter 		WCNT = SKEIN1024_STATE_WORDS
5043c67d83aSTony Hutter 	};
5053c67d83aSTony Hutter #undef  RCNT
5063c67d83aSTony Hutter #define	RCNT  (SKEIN1024_ROUNDS_TOTAL/8)
5073c67d83aSTony Hutter 
5083c67d83aSTony Hutter #ifdef	SKEIN_LOOP		/* configure how much to unroll the loop */
5093c67d83aSTony Hutter #define	SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
5103c67d83aSTony Hutter #else
5113c67d83aSTony Hutter #define	SKEIN_UNROLL_1024 (0)
5123c67d83aSTony Hutter #endif
5133c67d83aSTony Hutter 
5143c67d83aSTony Hutter #if	(SKEIN_UNROLL_1024 != 0)
5153c67d83aSTony Hutter #if	(RCNT % SKEIN_UNROLL_1024)
5163c67d83aSTony Hutter #error "Invalid SKEIN_UNROLL_1024"	/* sanity check on unroll count */
5173c67d83aSTony Hutter #endif
5183c67d83aSTony Hutter 	size_t r;
5193c67d83aSTony Hutter 	/* key schedule words : chaining vars + tweak + "rotation" */
5203c67d83aSTony Hutter 	uint64_t kw[WCNT + 4 + RCNT * 2];
5213c67d83aSTony Hutter #else
5223c67d83aSTony Hutter 	uint64_t kw[WCNT + 4];	/* key schedule words : chaining vars + tweak */
5233c67d83aSTony Hutter #endif
5243c67d83aSTony Hutter 
5253c67d83aSTony Hutter 	/* local copy of vars, for speed */
5263c67d83aSTony Hutter 	uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11,
5273c67d83aSTony Hutter 	    X12, X13, X14, X15;
5283c67d83aSTony Hutter 	uint64_t w[WCNT];		/* local copy of input block */
5293c67d83aSTony Hutter #ifdef	SKEIN_DEBUG
5303c67d83aSTony Hutter 	/* use for debugging (help compiler put Xn in registers) */
5313c67d83aSTony Hutter 	const uint64_t *Xptr[16];
5323c67d83aSTony Hutter 	Xptr[0] = &X00;
5333c67d83aSTony Hutter 	Xptr[1] = &X01;
5343c67d83aSTony Hutter 	Xptr[2] = &X02;
5353c67d83aSTony Hutter 	Xptr[3] = &X03;
5363c67d83aSTony Hutter 	Xptr[4] = &X04;
5373c67d83aSTony Hutter 	Xptr[5] = &X05;
5383c67d83aSTony Hutter 	Xptr[6] = &X06;
5393c67d83aSTony Hutter 	Xptr[7] = &X07;
5403c67d83aSTony Hutter 	Xptr[8] = &X08;
5413c67d83aSTony Hutter 	Xptr[9] = &X09;
5423c67d83aSTony Hutter 	Xptr[10] = &X10;
5433c67d83aSTony Hutter 	Xptr[11] = &X11;
5443c67d83aSTony Hutter 	Xptr[12] = &X12;
5453c67d83aSTony Hutter 	Xptr[13] = &X13;
5463c67d83aSTony Hutter 	Xptr[14] = &X14;
5473c67d83aSTony Hutter 	Xptr[15] = &X15;
5483c67d83aSTony Hutter #endif
5493c67d83aSTony Hutter 
5503c67d83aSTony Hutter 	Skein_assert(blkCnt != 0);	/* never call with blkCnt == 0! */
5513c67d83aSTony Hutter 	ts[0] = ctx->h.T[0];
5523c67d83aSTony Hutter 	ts[1] = ctx->h.T[1];
5533c67d83aSTony Hutter 	do {
5543c67d83aSTony Hutter 		/*
5553c67d83aSTony Hutter 		 * this implementation only supports 2**64 input bytes
5563c67d83aSTony Hutter 		 * (no carry out here)
5573c67d83aSTony Hutter 		 */
5583c67d83aSTony Hutter 		ts[0] += byteCntAdd;	/* update processed length */
5593c67d83aSTony Hutter 
5603c67d83aSTony Hutter 		/* precompute the key schedule for this block */
5613c67d83aSTony Hutter 		ks[0] = ctx->X[0];
5623c67d83aSTony Hutter 		ks[1] = ctx->X[1];
5633c67d83aSTony Hutter 		ks[2] = ctx->X[2];
5643c67d83aSTony Hutter 		ks[3] = ctx->X[3];
5653c67d83aSTony Hutter 		ks[4] = ctx->X[4];
5663c67d83aSTony Hutter 		ks[5] = ctx->X[5];
5673c67d83aSTony Hutter 		ks[6] = ctx->X[6];
5683c67d83aSTony Hutter 		ks[7] = ctx->X[7];
5693c67d83aSTony Hutter 		ks[8] = ctx->X[8];
5703c67d83aSTony Hutter 		ks[9] = ctx->X[9];
5713c67d83aSTony Hutter 		ks[10] = ctx->X[10];
5723c67d83aSTony Hutter 		ks[11] = ctx->X[11];
5733c67d83aSTony Hutter 		ks[12] = ctx->X[12];
5743c67d83aSTony Hutter 		ks[13] = ctx->X[13];
5753c67d83aSTony Hutter 		ks[14] = ctx->X[14];
5763c67d83aSTony Hutter 		ks[15] = ctx->X[15];
5773c67d83aSTony Hutter 		ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
5783c67d83aSTony Hutter 		    ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
5793c67d83aSTony Hutter 		    ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
5803c67d83aSTony Hutter 		    ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
5813c67d83aSTony Hutter 
5823c67d83aSTony Hutter 		ts[2] = ts[0] ^ ts[1];
5833c67d83aSTony Hutter 
5843c67d83aSTony Hutter 		/* get input block in little-endian format */
5853c67d83aSTony Hutter 		Skein_Get64_LSB_First(w, blkPtr, WCNT);
5863c67d83aSTony Hutter 		DebugSaveTweak(ctx);
5873c67d83aSTony Hutter 		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
5883c67d83aSTony Hutter 
5893c67d83aSTony Hutter 		X00 = w[0] + ks[0];	/* do the first full key injection */
5903c67d83aSTony Hutter 		X01 = w[1] + ks[1];
5913c67d83aSTony Hutter 		X02 = w[2] + ks[2];
5923c67d83aSTony Hutter 		X03 = w[3] + ks[3];
5933c67d83aSTony Hutter 		X04 = w[4] + ks[4];
5943c67d83aSTony Hutter 		X05 = w[5] + ks[5];
5953c67d83aSTony Hutter 		X06 = w[6] + ks[6];
5963c67d83aSTony Hutter 		X07 = w[7] + ks[7];
5973c67d83aSTony Hutter 		X08 = w[8] + ks[8];
5983c67d83aSTony Hutter 		X09 = w[9] + ks[9];
5993c67d83aSTony Hutter 		X10 = w[10] + ks[10];
6003c67d83aSTony Hutter 		X11 = w[11] + ks[11];
6013c67d83aSTony Hutter 		X12 = w[12] + ks[12];
6023c67d83aSTony Hutter 		X13 = w[13] + ks[13] + ts[0];
6033c67d83aSTony Hutter 		X14 = w[14] + ks[14] + ts[1];
6043c67d83aSTony Hutter 		X15 = w[15] + ks[15];
6053c67d83aSTony Hutter 
6063c67d83aSTony Hutter 		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
6073c67d83aSTony Hutter 		    Xptr);
6083c67d83aSTony Hutter 
6093c67d83aSTony Hutter #define	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,	\
6103c67d83aSTony Hutter 	pD, pE, pF, ROT, rNum)						\
6113c67d83aSTony Hutter 	X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
6123c67d83aSTony Hutter 	X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
6133c67d83aSTony Hutter 	X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
6143c67d83aSTony Hutter 	X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\
6153c67d83aSTony Hutter 	X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\
6163c67d83aSTony Hutter 	X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\
6173c67d83aSTony Hutter 	X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\
6183c67d83aSTony Hutter 	X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;
6193c67d83aSTony Hutter 
6203c67d83aSTony Hutter #if	SKEIN_UNROLL_1024 == 0
6213c67d83aSTony Hutter #define	R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD,	\
6223c67d83aSTony Hutter 	pE, pF, ROT, rn)						\
6233c67d83aSTony Hutter 	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,	\
6243c67d83aSTony Hutter 	pD, pE, pF, ROT, rn)						\
6253c67d83aSTony Hutter 	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
6263c67d83aSTony Hutter 
6273c67d83aSTony Hutter #define	I1024(R)							\
6283c67d83aSTony Hutter 	X00 += ks[((R) + 1) % 17];	/* inject the key schedule value */\
6293c67d83aSTony Hutter 	X01 += ks[((R) + 2) % 17];					\
6303c67d83aSTony Hutter 	X02 += ks[((R) + 3) % 17];					\
6313c67d83aSTony Hutter 	X03 += ks[((R) + 4) % 17];					\
6323c67d83aSTony Hutter 	X04 += ks[((R) + 5) % 17];					\
6333c67d83aSTony Hutter 	X05 += ks[((R) + 6) % 17];					\
6343c67d83aSTony Hutter 	X06 += ks[((R) + 7) % 17];					\
6353c67d83aSTony Hutter 	X07 += ks[((R) + 8) % 17];					\
6363c67d83aSTony Hutter 	X08 += ks[((R) + 9) % 17];					\
6373c67d83aSTony Hutter 	X09 += ks[((R) + 10) % 17];					\
6383c67d83aSTony Hutter 	X10 += ks[((R) + 11) % 17];					\
6393c67d83aSTony Hutter 	X11 += ks[((R) + 12) % 17];					\
6403c67d83aSTony Hutter 	X12 += ks[((R) + 13) % 17];					\
6413c67d83aSTony Hutter 	X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3];			\
6423c67d83aSTony Hutter 	X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3];			\
6433c67d83aSTony Hutter 	X15 += ks[((R) + 16) % 17] + (R) +1;				\
6443c67d83aSTony Hutter 	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
6453c67d83aSTony Hutter #else				/* looping version */
6463c67d83aSTony Hutter #define	R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD,	\
6473c67d83aSTony Hutter 	pE, pF, ROT, rn)						\
6483c67d83aSTony Hutter 	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,	\
6493c67d83aSTony Hutter 	pD, pE, pF, ROT, rn)						\
6503c67d83aSTony Hutter 	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
6513c67d83aSTony Hutter 
6523c67d83aSTony Hutter #define	I1024(R)							\
6533c67d83aSTony Hutter 	X00 += ks[r + (R) + 0];	/* inject the key schedule value */	\
6543c67d83aSTony Hutter 	X01 += ks[r + (R) + 1];						\
6553c67d83aSTony Hutter 	X02 += ks[r + (R) + 2];						\
6563c67d83aSTony Hutter 	X03 += ks[r + (R) + 3];						\
6573c67d83aSTony Hutter 	X04 += ks[r + (R) + 4];						\
6583c67d83aSTony Hutter 	X05 += ks[r + (R) + 5];						\
6593c67d83aSTony Hutter 	X06 += ks[r + (R) + 6];						\
6603c67d83aSTony Hutter 	X07 += ks[r + (R) + 7];						\
6613c67d83aSTony Hutter 	X08 += ks[r + (R) + 8];						\
6623c67d83aSTony Hutter 	X09 += ks[r + (R) + 9];						\
6633c67d83aSTony Hutter 	X10 += ks[r + (R) + 10];					\
6643c67d83aSTony Hutter 	X11 += ks[r + (R) + 11];					\
6653c67d83aSTony Hutter 	X12 += ks[r + (R) + 12];					\
6663c67d83aSTony Hutter 	X13 += ks[r + (R) + 13] + ts[r + (R) + 0];			\
6673c67d83aSTony Hutter 	X14 += ks[r + (R) + 14] + ts[r + (R) + 1];			\
6683c67d83aSTony Hutter 	X15 += ks[r + (R) + 15] +  r + (R);				\
6693c67d83aSTony Hutter 	ks[r + (R) + 16] = ks[r + (R) - 1];	/* rotate key schedule */\
6703c67d83aSTony Hutter 	ts[r + (R) + 2] = ts[r + (R) - 1];				\
6713c67d83aSTony Hutter 	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
6723c67d83aSTony Hutter 
6739d40bdf4SAndrea Gelmini 		/* loop through it */
6743c67d83aSTony Hutter 		for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024)
6753c67d83aSTony Hutter #endif
6763c67d83aSTony Hutter 		{
6773c67d83aSTony Hutter #define	R1024_8_rounds(R)	/* do 8 full rounds */			\
6783c67d83aSTony Hutter 	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13,	\
6793c67d83aSTony Hutter 	    14, 15, R1024_0, 8 * (R) + 1);				\
6803c67d83aSTony Hutter 	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05,	\
6813c67d83aSTony Hutter 	    08, 01, R1024_1, 8 * (R) + 2);				\
6823c67d83aSTony Hutter 	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11,	\
6833c67d83aSTony Hutter 	    10, 09, R1024_2, 8 * (R) + 3);				\
6843c67d83aSTony Hutter 	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03,	\
6853c67d83aSTony Hutter 	    12, 07, R1024_3, 8 * (R) + 4);				\
6863c67d83aSTony Hutter 	I1024(2 * (R));							\
6873c67d83aSTony Hutter 	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13,	\
6883c67d83aSTony Hutter 	    14, 15, R1024_4, 8 * (R) + 5);				\
6893c67d83aSTony Hutter 	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05,	\
6903c67d83aSTony Hutter 	    08, 01, R1024_5, 8 * (R) + 6);				\
6913c67d83aSTony Hutter 	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11,	\
6923c67d83aSTony Hutter 	    10, 09, R1024_6, 8 * (R) + 7);				\
6933c67d83aSTony Hutter 	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03,	\
6943c67d83aSTony Hutter 	    12, 07, R1024_7, 8 * (R) + 8);				\
6953c67d83aSTony Hutter 	I1024(2 * (R) + 1);
6963c67d83aSTony Hutter 
6973c67d83aSTony Hutter 			R1024_8_rounds(0);
6983c67d83aSTony Hutter 
6993c67d83aSTony Hutter #define	R1024_Unroll_R(NN)						\
7003c67d83aSTony Hutter 	((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) ||	\
7013c67d83aSTony Hutter 	(SKEIN_UNROLL_1024 > (NN)))
7023c67d83aSTony Hutter 
7033c67d83aSTony Hutter #if	R1024_Unroll_R(1)
7043c67d83aSTony Hutter 			R1024_8_rounds(1);
7053c67d83aSTony Hutter #endif
7063c67d83aSTony Hutter #if	R1024_Unroll_R(2)
7073c67d83aSTony Hutter 			R1024_8_rounds(2);
7083c67d83aSTony Hutter #endif
7093c67d83aSTony Hutter #if	R1024_Unroll_R(3)
7103c67d83aSTony Hutter 			R1024_8_rounds(3);
7113c67d83aSTony Hutter #endif
7123c67d83aSTony Hutter #if	R1024_Unroll_R(4)
7133c67d83aSTony Hutter 			R1024_8_rounds(4);
7143c67d83aSTony Hutter #endif
7153c67d83aSTony Hutter #if	R1024_Unroll_R(5)
7163c67d83aSTony Hutter 			R1024_8_rounds(5);
7173c67d83aSTony Hutter #endif
7183c67d83aSTony Hutter #if	R1024_Unroll_R(6)
7193c67d83aSTony Hutter 			R1024_8_rounds(6);
7203c67d83aSTony Hutter #endif
7213c67d83aSTony Hutter #if	R1024_Unroll_R(7)
7223c67d83aSTony Hutter 			R1024_8_rounds(7);
7233c67d83aSTony Hutter #endif
7243c67d83aSTony Hutter #if	R1024_Unroll_R(8)
7253c67d83aSTony Hutter 			R1024_8_rounds(8);
7263c67d83aSTony Hutter #endif
7273c67d83aSTony Hutter #if	R1024_Unroll_R(9)
7283c67d83aSTony Hutter 			R1024_8_rounds(9);
7293c67d83aSTony Hutter #endif
7303c67d83aSTony Hutter #if	R1024_Unroll_R(10)
7313c67d83aSTony Hutter 			R1024_8_rounds(10);
7323c67d83aSTony Hutter #endif
7333c67d83aSTony Hutter #if	R1024_Unroll_R(11)
7343c67d83aSTony Hutter 			R1024_8_rounds(11);
7353c67d83aSTony Hutter #endif
7363c67d83aSTony Hutter #if	R1024_Unroll_R(12)
7373c67d83aSTony Hutter 			R1024_8_rounds(12);
7383c67d83aSTony Hutter #endif
7393c67d83aSTony Hutter #if	R1024_Unroll_R(13)
7403c67d83aSTony Hutter 			R1024_8_rounds(13);
7413c67d83aSTony Hutter #endif
7423c67d83aSTony Hutter #if	R1024_Unroll_R(14)
7433c67d83aSTony Hutter 			R1024_8_rounds(14);
7443c67d83aSTony Hutter #endif
7453c67d83aSTony Hutter #if	(SKEIN_UNROLL_1024 > 14)
7463c67d83aSTony Hutter #error  "need more unrolling in Skein_1024_Process_Block"
7473c67d83aSTony Hutter #endif
7483c67d83aSTony Hutter 		}
7493c67d83aSTony Hutter 		/*
7503c67d83aSTony Hutter 		 * do the final "feedforward" xor, update context chaining vars
7513c67d83aSTony Hutter 		 */
7523c67d83aSTony Hutter 
7533c67d83aSTony Hutter 		ctx->X[0] = X00 ^ w[0];
7543c67d83aSTony Hutter 		ctx->X[1] = X01 ^ w[1];
7553c67d83aSTony Hutter 		ctx->X[2] = X02 ^ w[2];
7563c67d83aSTony Hutter 		ctx->X[3] = X03 ^ w[3];
7573c67d83aSTony Hutter 		ctx->X[4] = X04 ^ w[4];
7583c67d83aSTony Hutter 		ctx->X[5] = X05 ^ w[5];
7593c67d83aSTony Hutter 		ctx->X[6] = X06 ^ w[6];
7603c67d83aSTony Hutter 		ctx->X[7] = X07 ^ w[7];
7613c67d83aSTony Hutter 		ctx->X[8] = X08 ^ w[8];
7623c67d83aSTony Hutter 		ctx->X[9] = X09 ^ w[9];
7633c67d83aSTony Hutter 		ctx->X[10] = X10 ^ w[10];
7643c67d83aSTony Hutter 		ctx->X[11] = X11 ^ w[11];
7653c67d83aSTony Hutter 		ctx->X[12] = X12 ^ w[12];
7663c67d83aSTony Hutter 		ctx->X[13] = X13 ^ w[13];
7673c67d83aSTony Hutter 		ctx->X[14] = X14 ^ w[14];
7683c67d83aSTony Hutter 		ctx->X[15] = X15 ^ w[15];
7693c67d83aSTony Hutter 
7703c67d83aSTony Hutter 		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
7713c67d83aSTony Hutter 
7723c67d83aSTony Hutter 		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
7733c67d83aSTony Hutter 		blkPtr += SKEIN1024_BLOCK_BYTES;
7743c67d83aSTony Hutter 	} while (--blkCnt);
7753c67d83aSTony Hutter 	ctx->h.T[0] = ts[0];
7763c67d83aSTony Hutter 	ctx->h.T[1] = ts[1];
7773c67d83aSTony Hutter }
7783c67d83aSTony Hutter 
7793c67d83aSTony Hutter #if	defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
7803c67d83aSTony Hutter size_t
Skein1024_Process_Block_CodeSize(void)7813c67d83aSTony Hutter Skein1024_Process_Block_CodeSize(void)
7823c67d83aSTony Hutter {
7833c67d83aSTony Hutter 	return ((uint8_t *)Skein1024_Process_Block_CodeSize) -
7843c67d83aSTony Hutter 	    ((uint8_t *)Skein1024_Process_Block);
7853c67d83aSTony Hutter }
7863c67d83aSTony Hutter 
7873c67d83aSTony Hutter uint_t
Skein1024_Unroll_Cnt(void)7883c67d83aSTony Hutter Skein1024_Unroll_Cnt(void)
7893c67d83aSTony Hutter {
7903c67d83aSTony Hutter 	return (SKEIN_UNROLL_1024);
7913c67d83aSTony Hutter }
7923c67d83aSTony Hutter #endif
7933c67d83aSTony Hutter #endif
794