x86/crypto/sha1_avx2_x86_64_asm.S

2  *	Implement fast SHA-1 with AVX2 instructions. (x86_64)
59  * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
62  * Visit http://software.intel.com/en-us/articles/
63  * and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
65  * Updates the 20-byte SHA-1 record at the start of 'state' from 'input', for
66  * an even number ('blocks') of consecutive 64-byte blocks.
69  *	struct sha1_state *state, const u8* input, int blocks );
146  *    - 80 DWORDs per iteration * 2
201 	/* message scheduling pre-compute for rounds 0-15 */
209 		vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
224 	 * message scheduling pre-compute for rounds 16-31
226 	 * pre-calculate K+w[i] values and store to mem
229 	 * "brute force" vectorization for rounds 16-31 only
230 	 * due to w[i]->w[i-3] dependency
237 		/* w[i-14] */
239 		vpsrldq	$4, WY_minus_04, WY_TMP               /* w[i-3] */
266 	 * in SHA-1 specification:
267 	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
269 	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
271 	 * since w[i]=>w[i-3] dependency is broken
359 		rorx	$(32-30), B, B    /* b>>>2 */
388 	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
389 	rorx	$(32-30),A, TB		/* b>>>2 for next round */
391 	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
408 	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
410 		rorx	$(32-30), A, TB	/* b>>>2 for next round */
412 	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
427 	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
434 	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
435 	rorx	$(32-30), A, TB		/* b>>>2 for next round */
459  * macro implements 80 rounds of SHA-1, for multiple blocks with software pipelining
474 	# Precalc WK for first 2 blocks
525 	sub $1, BLOCKS_CTR
583 	sub     $1, BLOCKS_CTR
622  * macro implements SHA-1 function's body for several 64-byte blocks
639 	and	$~(0x20-1), %rsp
640 	sub	$RESERVE_STACK, %rsp