Lines Matching full:w
169 # Extract w[t-7]
170 MY_VPALIGNR YTMP0, Y_3, Y_2, 8 # YTMP0 = W[-7]
171 # Calculate w[t-16] + w[t-7]
172 vpaddq Y_0, YTMP0, YTMP0 # YTMP0 = W[-7] + W[-16]
173 # Extract w[t-15]
174 MY_VPALIGNR YTMP1, Y_1, Y_0, 8 # YTMP1 = W[-15]
178 # Calculate w[t-15] ror 1
181 vpor YTMP2, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1
182 # Calculate w[t-15] shr 7
183 vpsrlq $7, YTMP1, YTMP4 # YTMP4 = W[-15] >> 7
188 add frame_XFER(%rsp),h # h = k + w + h # --
200 add h, d # d = k + w + h + d # --
213 add y1, h # h = k + w + h + S0 # --
215 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
217 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
224 # Calculate w[t-15] ror 8
227 vpor YTMP2, YTMP1, YTMP1 # YTMP1 = W[-15] ror 8
229 vpxor YTMP4, YTMP3, YTMP3 # YTMP3 = W[-15] ror 1 ^ W[-15] >> 7
233 # Add three components, w[t-16], w[t-7] and sigma0
234 vpaddq YTMP1, YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0
235 # Move to appropriate lanes for calculating w[16] and w[17]
236 vperm2f128 $0x0, YTMP0, YTMP0, Y_0 # Y_0 = W[-16] + W[-7] + s0 {BABA}
237 # Move to appropriate lanes for calculating w[18] and w[19]
238 vpand MASK_YMM_LO(%rip), YTMP0, YTMP0 # YTMP0 = W[-16] + W[-7] + s0 {DC00}
240 # Calculate w[16] and w[17] in both 128 bit lanes
242 # Calculate sigma1 for w[16] and w[17] on both 128 bit lanes
243 vperm2f128 $0x11, Y_3, Y_3, YTMP2 # YTMP2 = W[-2] {BABA}
244 vpsrlq $6, YTMP2, YTMP4 # YTMP4 = W[-2] >> 6 {BABA}
250 add 1*8+frame_XFER(%rsp), h # h = k + w + h # --
264 add h, d # d = k + w + h + d # --
278 add y1, h # h = k + w + h + S0 # --
280 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
281 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
289 vpsrlq $19, YTMP2, YTMP3 # YTMP3 = W[-2] >> 19 {BABA}
290 vpsllq $(64-19), YTMP2, YTMP1 # YTMP1 = W[-2] << 19 {BABA}
291 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {BABA}
292 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {BABA}
293 vpsrlq $61, YTMP2, YTMP3 # YTMP3 = W[-2] >> 61 {BABA}
294 vpsllq $(64-61), YTMP2, YTMP1 # YTMP1 = W[-2] << 61 {BABA}
295 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {BABA}
296 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
297 # (W[-2] ror 61) ^ (W[-2] >> 6) {BABA}
299 # Add sigma1 to the other components to get w[16] and w[17]
300 vpaddq YTMP4, Y_0, Y_0 # Y_0 = {W[1], W[0], W[1], W[0]}
302 # Calculate sigma1 for w[18] and w[19] for upper 128 bit lane
303 vpsrlq $6, Y_0, YTMP4 # YTMP4 = W[-2] >> 6 {DC--}
307 add 2*8+frame_XFER(%rsp), h # h = k + w + h # --
319 add h, d # d = k + w + h + d # --
335 add y1, h # h = k + w + h + S0 # --
336 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
337 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
345 vpsrlq $19, Y_0, YTMP3 # YTMP3 = W[-2] >> 19 {DC--}
346 vpsllq $(64-19), Y_0, YTMP1 # YTMP1 = W[-2] << 19 {DC--}
347 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 19 {DC--}
348 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = W[-2] ror 19 ^ W[-2] >> 6 {DC--}
349 vpsrlq $61, Y_0, YTMP3 # YTMP3 = W[-2] >> 61 {DC--}
350 vpsllq $(64-61), Y_0, YTMP1 # YTMP1 = W[-2] << 61 {DC--}
351 vpor YTMP1, YTMP3, YTMP3 # YTMP3 = W[-2] ror 61 {DC--}
352 vpxor YTMP3, YTMP4, YTMP4 # YTMP4 = s1 = (W[-2] ror 19) ^
353 # (W[-2] ror 61) ^ (W[-2] >> 6) {DC--}
355 # Add the sigma0 + w[t-7] + w[t-16] for w[18] and w[19]
356 # to newly calculated sigma1 to get w[18] and w[19]
357 vpaddq YTMP4, YTMP0, YTMP2 # YTMP2 = {W[3], W[2], --, --}
359 # Form w[19], w[18], w[17], w[16]
360 vpblendd $0xF0, YTMP2, Y_0, Y_0 # Y_0 = {W[3], W[2], W[1], W[0]}
365 add 3*8+frame_XFER(%rsp), h # h = k + w + h # --
377 add h, d # d = k + w + h + d # --
387 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
396 add y1, h # h = k + w + h + S0 # --
397 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
426 add frame_XFER(%rsp), h # h = k + w + h # --
435 add h, d # d = k + w + h + d # --
437 add y1, h # h = k + w + h + S0 # --
439 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
445 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
464 add 8*1+frame_XFER(%rsp), h # h = k + w + h # --
473 add h, d # d = k + w + h + d # --
475 add y1, h # h = k + w + h + S0 # --
477 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
483 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
502 add 8*2+frame_XFER(%rsp), h # h = k + w + h # --
511 add h, d # d = k + w + h + d # --
513 add y1, h # h = k + w + h + S0 # --
515 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
521 add y2, old_h # h = k + w + h + S0 + S1 + CH = t1 + S0# --
540 add 8*3+frame_XFER(%rsp), h # h = k + w + h # --
550 add h, d # d = k + w + h + d # --
552 add y1, h # h = k + w + h + S0 # --
554 add y2, d # d = k + w + h + d + S1 + CH = d + t1 # --
556 add y2, h # h = k + w + h + S0 + S1 + CH = t1 + S0# --