xref: /src/sys/contrib/openzfs/module/icp/asm-x86_64/blake3/blake3_sse2.S (revision 5956d97f4b3204318ceb6aa9c77bd0bc6ea87a41)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
24 * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale
25 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
26 */
27
28#if defined(HAVE_SSE2)
29
30#define _ASM
31#include <sys/asm_linkage.h>
32
/*
 * CET support: include <cet.h> only for ELF builds that have __CET__
 * defined, and only when the preprocessor offers __has_include and
 * reports that the header is actually available.
 */
33#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
34#if __has_include(<cet.h>)
35#include <cet.h>
36#endif
37#endif
38
/*
 * Fallback: if _CET_ENDBR is still undefined at this point (presumably
 * <cet.h> supplies it when included -- confirm against the toolchain
 * header), define it as empty so that the _CET_ENDBR placed at function
 * entry expands to nothing.
 */
39#if !defined(_CET_ENDBR)
40#define _CET_ENDBR
41#endif
42
/*
 * Assembler setup.  From here on the code is written in Intel operand
 * order (destination first) with unprefixed register names.
 */
43.intel_syntax noprefix
/* Export the three SSE2 entry-point symbols. */
44.global zfs_blake3_hash_many_sse2
45.global zfs_blake3_compress_in_place_sse2
46.global zfs_blake3_compress_xof_sse2
47
/*
 * Switch to the text section and mark each exported symbol as a
 * function in the symbol table.
 */
48.text
49.type zfs_blake3_hash_many_sse2,@function
50.type zfs_blake3_compress_in_place_sse2,@function
51.type zfs_blake3_compress_xof_sse2,@function
52
53        .p2align  6
54zfs_blake3_hash_many_sse2:
55        _CET_ENDBR
56        push    r15
57        push    r14
58        push    r13
59        push    r12
60        push    rbx
61        push    rbp
62        mov     rbp, rsp
63        sub     rsp, 360
64        and     rsp, 0xFFFFFFFFFFFFFFC0
65        neg     r9d
66        movd    xmm0, r9d
67        pshufd  xmm0, xmm0, 0x00
68        movdqa  xmmword ptr [rsp+0x130], xmm0
69        movdqa  xmm1, xmm0
70        pand    xmm1, xmmword ptr [ADD0+rip]
71        pand    xmm0, xmmword ptr [ADD1+rip]
72        movdqa  xmmword ptr [rsp+0x150], xmm0
73        movd    xmm0, r8d
74        pshufd  xmm0, xmm0, 0x00
75        paddd   xmm0, xmm1
76        movdqa  xmmword ptr [rsp+0x110], xmm0
77        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
78        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
79        pcmpgtd xmm1, xmm0
80        shr     r8, 32
81        movd    xmm2, r8d
82        pshufd  xmm2, xmm2, 0x00
83        psubd   xmm2, xmm1
84        movdqa  xmmword ptr [rsp+0x120], xmm2
85        mov     rbx, qword ptr [rbp+0x50]
86        mov     r15, rdx
87        shl     r15, 6
88        movzx   r13d, byte ptr [rbp+0x38]
89        movzx   r12d, byte ptr [rbp+0x48]
90        cmp     rsi, 4
91        jc      3f
922:
93        movdqu  xmm3, xmmword ptr [rcx]
94        pshufd  xmm0, xmm3, 0x00
95        pshufd  xmm1, xmm3, 0x55
96        pshufd  xmm2, xmm3, 0xAA
97        pshufd  xmm3, xmm3, 0xFF
98        movdqu  xmm7, xmmword ptr [rcx+0x10]
99        pshufd  xmm4, xmm7, 0x00
100        pshufd  xmm5, xmm7, 0x55
101        pshufd  xmm6, xmm7, 0xAA
102        pshufd  xmm7, xmm7, 0xFF
103        mov     r8, qword ptr [rdi]
104        mov     r9, qword ptr [rdi+0x8]
105        mov     r10, qword ptr [rdi+0x10]
106        mov     r11, qword ptr [rdi+0x18]
107        movzx   eax, byte ptr [rbp+0x40]
108        or      eax, r13d
109        xor     edx, edx
1109:
111        mov     r14d, eax
112        or      eax, r12d
113        add     rdx, 64
114        cmp     rdx, r15
115        cmovne  eax, r14d
116        movdqu  xmm8, xmmword ptr [r8+rdx-0x40]
117        movdqu  xmm9, xmmword ptr [r9+rdx-0x40]
118        movdqu  xmm10, xmmword ptr [r10+rdx-0x40]
119        movdqu  xmm11, xmmword ptr [r11+rdx-0x40]
120        movdqa  xmm12, xmm8
121        punpckldq xmm8, xmm9
122        punpckhdq xmm12, xmm9
123        movdqa  xmm14, xmm10
124        punpckldq xmm10, xmm11
125        punpckhdq xmm14, xmm11
126        movdqa  xmm9, xmm8
127        punpcklqdq xmm8, xmm10
128        punpckhqdq xmm9, xmm10
129        movdqa  xmm13, xmm12
130        punpcklqdq xmm12, xmm14
131        punpckhqdq xmm13, xmm14
132        movdqa  xmmword ptr [rsp], xmm8
133        movdqa  xmmword ptr [rsp+0x10], xmm9
134        movdqa  xmmword ptr [rsp+0x20], xmm12
135        movdqa  xmmword ptr [rsp+0x30], xmm13
136        movdqu  xmm8, xmmword ptr [r8+rdx-0x30]
137        movdqu  xmm9, xmmword ptr [r9+rdx-0x30]
138        movdqu  xmm10, xmmword ptr [r10+rdx-0x30]
139        movdqu  xmm11, xmmword ptr [r11+rdx-0x30]
140        movdqa  xmm12, xmm8
141        punpckldq xmm8, xmm9
142        punpckhdq xmm12, xmm9
143        movdqa  xmm14, xmm10
144        punpckldq xmm10, xmm11
145        punpckhdq xmm14, xmm11
146        movdqa  xmm9, xmm8
147        punpcklqdq xmm8, xmm10
148        punpckhqdq xmm9, xmm10
149        movdqa  xmm13, xmm12
150        punpcklqdq xmm12, xmm14
151        punpckhqdq xmm13, xmm14
152        movdqa  xmmword ptr [rsp+0x40], xmm8
153        movdqa  xmmword ptr [rsp+0x50], xmm9
154        movdqa  xmmword ptr [rsp+0x60], xmm12
155        movdqa  xmmword ptr [rsp+0x70], xmm13
156        movdqu  xmm8, xmmword ptr [r8+rdx-0x20]
157        movdqu  xmm9, xmmword ptr [r9+rdx-0x20]
158        movdqu  xmm10, xmmword ptr [r10+rdx-0x20]
159        movdqu  xmm11, xmmword ptr [r11+rdx-0x20]
160        movdqa  xmm12, xmm8
161        punpckldq xmm8, xmm9
162        punpckhdq xmm12, xmm9
163        movdqa  xmm14, xmm10
164        punpckldq xmm10, xmm11
165        punpckhdq xmm14, xmm11
166        movdqa  xmm9, xmm8
167        punpcklqdq xmm8, xmm10
168        punpckhqdq xmm9, xmm10
169        movdqa  xmm13, xmm12
170        punpcklqdq xmm12, xmm14
171        punpckhqdq xmm13, xmm14
172        movdqa  xmmword ptr [rsp+0x80], xmm8
173        movdqa  xmmword ptr [rsp+0x90], xmm9
174        movdqa  xmmword ptr [rsp+0xA0], xmm12
175        movdqa  xmmword ptr [rsp+0xB0], xmm13
176        movdqu  xmm8, xmmword ptr [r8+rdx-0x10]
177        movdqu  xmm9, xmmword ptr [r9+rdx-0x10]
178        movdqu  xmm10, xmmword ptr [r10+rdx-0x10]
179        movdqu  xmm11, xmmword ptr [r11+rdx-0x10]
180        movdqa  xmm12, xmm8
181        punpckldq xmm8, xmm9
182        punpckhdq xmm12, xmm9
183        movdqa  xmm14, xmm10
184        punpckldq xmm10, xmm11
185        punpckhdq xmm14, xmm11
186        movdqa  xmm9, xmm8
187        punpcklqdq xmm8, xmm10
188        punpckhqdq xmm9, xmm10
189        movdqa  xmm13, xmm12
190        punpcklqdq xmm12, xmm14
191        punpckhqdq xmm13, xmm14
192        movdqa  xmmword ptr [rsp+0xC0], xmm8
193        movdqa  xmmword ptr [rsp+0xD0], xmm9
194        movdqa  xmmword ptr [rsp+0xE0], xmm12
195        movdqa  xmmword ptr [rsp+0xF0], xmm13
196        movdqa  xmm9, xmmword ptr [BLAKE3_IV_1+rip]
197        movdqa  xmm10, xmmword ptr [BLAKE3_IV_2+rip]
198        movdqa  xmm11, xmmword ptr [BLAKE3_IV_3+rip]
199        movdqa  xmm12, xmmword ptr [rsp+0x110]
200        movdqa  xmm13, xmmword ptr [rsp+0x120]
201        movdqa  xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
202        movd    xmm15, eax
203        pshufd  xmm15, xmm15, 0x00
204        prefetcht0 [r8+rdx+0x80]
205        prefetcht0 [r9+rdx+0x80]
206        prefetcht0 [r10+rdx+0x80]
207        prefetcht0 [r11+rdx+0x80]
208        paddd   xmm0, xmmword ptr [rsp]
209        paddd   xmm1, xmmword ptr [rsp+0x20]
210        paddd   xmm2, xmmword ptr [rsp+0x40]
211        paddd   xmm3, xmmword ptr [rsp+0x60]
212        paddd   xmm0, xmm4
213        paddd   xmm1, xmm5
214        paddd   xmm2, xmm6
215        paddd   xmm3, xmm7
216        pxor    xmm12, xmm0
217        pxor    xmm13, xmm1
218        pxor    xmm14, xmm2
219        pxor    xmm15, xmm3
220        pshuflw xmm12, xmm12, 0xB1
221        pshufhw xmm12, xmm12, 0xB1
222        pshuflw xmm13, xmm13, 0xB1
223        pshufhw xmm13, xmm13, 0xB1
224        pshuflw xmm14, xmm14, 0xB1
225        pshufhw xmm14, xmm14, 0xB1
226        pshuflw xmm15, xmm15, 0xB1
227        pshufhw xmm15, xmm15, 0xB1
228        movdqa  xmm8, xmmword ptr [BLAKE3_IV_0+rip]
229        paddd   xmm8, xmm12
230        paddd   xmm9, xmm13
231        paddd   xmm10, xmm14
232        paddd   xmm11, xmm15
233        pxor    xmm4, xmm8
234        pxor    xmm5, xmm9
235        pxor    xmm6, xmm10
236        pxor    xmm7, xmm11
237        movdqa  xmmword ptr [rsp+0x100], xmm8
238        movdqa  xmm8, xmm4
239        psrld   xmm8, 12
240        pslld   xmm4, 20
241        por     xmm4, xmm8
242        movdqa  xmm8, xmm5
243        psrld   xmm8, 12
244        pslld   xmm5, 20
245        por     xmm5, xmm8
246        movdqa  xmm8, xmm6
247        psrld   xmm8, 12
248        pslld   xmm6, 20
249        por     xmm6, xmm8
250        movdqa  xmm8, xmm7
251        psrld   xmm8, 12
252        pslld   xmm7, 20
253        por     xmm7, xmm8
254        paddd   xmm0, xmmword ptr [rsp+0x10]
255        paddd   xmm1, xmmword ptr [rsp+0x30]
256        paddd   xmm2, xmmword ptr [rsp+0x50]
257        paddd   xmm3, xmmword ptr [rsp+0x70]
258        paddd   xmm0, xmm4
259        paddd   xmm1, xmm5
260        paddd   xmm2, xmm6
261        paddd   xmm3, xmm7
262        pxor    xmm12, xmm0
263        pxor    xmm13, xmm1
264        pxor    xmm14, xmm2
265        pxor    xmm15, xmm3
266        movdqa  xmm8, xmm12
267        psrld   xmm12, 8
268        pslld   xmm8, 24
269        pxor    xmm12, xmm8
270        movdqa  xmm8, xmm13
271        psrld   xmm13, 8
272        pslld   xmm8, 24
273        pxor    xmm13, xmm8
274        movdqa  xmm8, xmm14
275        psrld   xmm14, 8
276        pslld   xmm8, 24
277        pxor    xmm14, xmm8
278        movdqa  xmm8, xmm15
279        psrld   xmm15, 8
280        pslld   xmm8, 24
281        pxor    xmm15, xmm8
282        movdqa  xmm8, xmmword ptr [rsp+0x100]
283        paddd   xmm8, xmm12
284        paddd   xmm9, xmm13
285        paddd   xmm10, xmm14
286        paddd   xmm11, xmm15
287        pxor    xmm4, xmm8
288        pxor    xmm5, xmm9
289        pxor    xmm6, xmm10
290        pxor    xmm7, xmm11
291        movdqa  xmmword ptr [rsp+0x100], xmm8
292        movdqa  xmm8, xmm4
293        psrld   xmm8, 7
294        pslld   xmm4, 25
295        por     xmm4, xmm8
296        movdqa  xmm8, xmm5
297        psrld   xmm8, 7
298        pslld   xmm5, 25
299        por     xmm5, xmm8
300        movdqa  xmm8, xmm6
301        psrld   xmm8, 7
302        pslld   xmm6, 25
303        por     xmm6, xmm8
304        movdqa  xmm8, xmm7
305        psrld   xmm8, 7
306        pslld   xmm7, 25
307        por     xmm7, xmm8
308        paddd   xmm0, xmmword ptr [rsp+0x80]
309        paddd   xmm1, xmmword ptr [rsp+0xA0]
310        paddd   xmm2, xmmword ptr [rsp+0xC0]
311        paddd   xmm3, xmmword ptr [rsp+0xE0]
312        paddd   xmm0, xmm5
313        paddd   xmm1, xmm6
314        paddd   xmm2, xmm7
315        paddd   xmm3, xmm4
316        pxor    xmm15, xmm0
317        pxor    xmm12, xmm1
318        pxor    xmm13, xmm2
319        pxor    xmm14, xmm3
320        pshuflw xmm15, xmm15, 0xB1
321        pshufhw xmm15, xmm15, 0xB1
322        pshuflw xmm12, xmm12, 0xB1
323        pshufhw xmm12, xmm12, 0xB1
324        pshuflw xmm13, xmm13, 0xB1
325        pshufhw xmm13, xmm13, 0xB1
326        pshuflw xmm14, xmm14, 0xB1
327        pshufhw xmm14, xmm14, 0xB1
328        paddd   xmm10, xmm15
329        paddd   xmm11, xmm12
330        movdqa  xmm8, xmmword ptr [rsp+0x100]
331        paddd   xmm8, xmm13
332        paddd   xmm9, xmm14
333        pxor    xmm5, xmm10
334        pxor    xmm6, xmm11
335        pxor    xmm7, xmm8
336        pxor    xmm4, xmm9
337        movdqa  xmmword ptr [rsp+0x100], xmm8
338        movdqa  xmm8, xmm5
339        psrld   xmm8, 12
340        pslld   xmm5, 20
341        por     xmm5, xmm8
342        movdqa  xmm8, xmm6
343        psrld   xmm8, 12
344        pslld   xmm6, 20
345        por     xmm6, xmm8
346        movdqa  xmm8, xmm7
347        psrld   xmm8, 12
348        pslld   xmm7, 20
349        por     xmm7, xmm8
350        movdqa  xmm8, xmm4
351        psrld   xmm8, 12
352        pslld   xmm4, 20
353        por     xmm4, xmm8
354        paddd   xmm0, xmmword ptr [rsp+0x90]
355        paddd   xmm1, xmmword ptr [rsp+0xB0]
356        paddd   xmm2, xmmword ptr [rsp+0xD0]
357        paddd   xmm3, xmmword ptr [rsp+0xF0]
358        paddd   xmm0, xmm5
359        paddd   xmm1, xmm6
360        paddd   xmm2, xmm7
361        paddd   xmm3, xmm4
362        pxor    xmm15, xmm0
363        pxor    xmm12, xmm1
364        pxor    xmm13, xmm2
365        pxor    xmm14, xmm3
366        movdqa  xmm8, xmm15
367        psrld   xmm15, 8
368        pslld   xmm8, 24
369        pxor    xmm15, xmm8
370        movdqa  xmm8, xmm12
371        psrld   xmm12, 8
372        pslld   xmm8, 24
373        pxor    xmm12, xmm8
374        movdqa  xmm8, xmm13
375        psrld   xmm13, 8
376        pslld   xmm8, 24
377        pxor    xmm13, xmm8
378        movdqa  xmm8, xmm14
379        psrld   xmm14, 8
380        pslld   xmm8, 24
381        pxor    xmm14, xmm8
382        paddd   xmm10, xmm15
383        paddd   xmm11, xmm12
384        movdqa  xmm8, xmmword ptr [rsp+0x100]
385        paddd   xmm8, xmm13
386        paddd   xmm9, xmm14
387        pxor    xmm5, xmm10
388        pxor    xmm6, xmm11
389        pxor    xmm7, xmm8
390        pxor    xmm4, xmm9
391        movdqa  xmmword ptr [rsp+0x100], xmm8
392        movdqa  xmm8, xmm5
393        psrld   xmm8, 7
394        pslld   xmm5, 25
395        por     xmm5, xmm8
396        movdqa  xmm8, xmm6
397        psrld   xmm8, 7
398        pslld   xmm6, 25
399        por     xmm6, xmm8
400        movdqa  xmm8, xmm7
401        psrld   xmm8, 7
402        pslld   xmm7, 25
403        por     xmm7, xmm8
404        movdqa  xmm8, xmm4
405        psrld   xmm8, 7
406        pslld   xmm4, 25
407        por     xmm4, xmm8
408        paddd   xmm0, xmmword ptr [rsp+0x20]
409        paddd   xmm1, xmmword ptr [rsp+0x30]
410        paddd   xmm2, xmmword ptr [rsp+0x70]
411        paddd   xmm3, xmmword ptr [rsp+0x40]
412        paddd   xmm0, xmm4
413        paddd   xmm1, xmm5
414        paddd   xmm2, xmm6
415        paddd   xmm3, xmm7
416        pxor    xmm12, xmm0
417        pxor    xmm13, xmm1
418        pxor    xmm14, xmm2
419        pxor    xmm15, xmm3
420        pshuflw xmm12, xmm12, 0xB1
421        pshufhw xmm12, xmm12, 0xB1
422        pshuflw xmm13, xmm13, 0xB1
423        pshufhw xmm13, xmm13, 0xB1
424        pshuflw xmm14, xmm14, 0xB1
425        pshufhw xmm14, xmm14, 0xB1
426        pshuflw xmm15, xmm15, 0xB1
427        pshufhw xmm15, xmm15, 0xB1
428        movdqa  xmm8, xmmword ptr [rsp+0x100]
429        paddd   xmm8, xmm12
430        paddd   xmm9, xmm13
431        paddd   xmm10, xmm14
432        paddd   xmm11, xmm15
433        pxor    xmm4, xmm8
434        pxor    xmm5, xmm9
435        pxor    xmm6, xmm10
436        pxor    xmm7, xmm11
437        movdqa  xmmword ptr [rsp+0x100], xmm8
438        movdqa  xmm8, xmm4
439        psrld   xmm8, 12
440        pslld   xmm4, 20
441        por     xmm4, xmm8
442        movdqa  xmm8, xmm5
443        psrld   xmm8, 12
444        pslld   xmm5, 20
445        por     xmm5, xmm8
446        movdqa  xmm8, xmm6
447        psrld   xmm8, 12
448        pslld   xmm6, 20
449        por     xmm6, xmm8
450        movdqa  xmm8, xmm7
451        psrld   xmm8, 12
452        pslld   xmm7, 20
453        por     xmm7, xmm8
454        paddd   xmm0, xmmword ptr [rsp+0x60]
455        paddd   xmm1, xmmword ptr [rsp+0xA0]
456        paddd   xmm2, xmmword ptr [rsp]
457        paddd   xmm3, xmmword ptr [rsp+0xD0]
458        paddd   xmm0, xmm4
459        paddd   xmm1, xmm5
460        paddd   xmm2, xmm6
461        paddd   xmm3, xmm7
462        pxor    xmm12, xmm0
463        pxor    xmm13, xmm1
464        pxor    xmm14, xmm2
465        pxor    xmm15, xmm3
466        movdqa  xmm8, xmm12
467        psrld   xmm12, 8
468        pslld   xmm8, 24
469        pxor    xmm12, xmm8
470        movdqa  xmm8, xmm13
471        psrld   xmm13, 8
472        pslld   xmm8, 24
473        pxor    xmm13, xmm8
474        movdqa  xmm8, xmm14
475        psrld   xmm14, 8
476        pslld   xmm8, 24
477        pxor    xmm14, xmm8
478        movdqa  xmm8, xmm15
479        psrld   xmm15, 8
480        pslld   xmm8, 24
481        pxor    xmm15, xmm8
482        movdqa  xmm8, xmmword ptr [rsp+0x100]
483        paddd   xmm8, xmm12
484        paddd   xmm9, xmm13
485        paddd   xmm10, xmm14
486        paddd   xmm11, xmm15
487        pxor    xmm4, xmm8
488        pxor    xmm5, xmm9
489        pxor    xmm6, xmm10
490        pxor    xmm7, xmm11
491        movdqa  xmmword ptr [rsp+0x100], xmm8
492        movdqa  xmm8, xmm4
493        psrld   xmm8, 7
494        pslld   xmm4, 25
495        por     xmm4, xmm8
496        movdqa  xmm8, xmm5
497        psrld   xmm8, 7
498        pslld   xmm5, 25
499        por     xmm5, xmm8
500        movdqa  xmm8, xmm6
501        psrld   xmm8, 7
502        pslld   xmm6, 25
503        por     xmm6, xmm8
504        movdqa  xmm8, xmm7
505        psrld   xmm8, 7
506        pslld   xmm7, 25
507        por     xmm7, xmm8
508        paddd   xmm0, xmmword ptr [rsp+0x10]
509        paddd   xmm1, xmmword ptr [rsp+0xC0]
510        paddd   xmm2, xmmword ptr [rsp+0x90]
511        paddd   xmm3, xmmword ptr [rsp+0xF0]
512        paddd   xmm0, xmm5
513        paddd   xmm1, xmm6
514        paddd   xmm2, xmm7
515        paddd   xmm3, xmm4
516        pxor    xmm15, xmm0
517        pxor    xmm12, xmm1
518        pxor    xmm13, xmm2
519        pxor    xmm14, xmm3
520        pshuflw xmm15, xmm15, 0xB1
521        pshufhw xmm15, xmm15, 0xB1
522        pshuflw xmm12, xmm12, 0xB1
523        pshufhw xmm12, xmm12, 0xB1
524        pshuflw xmm13, xmm13, 0xB1
525        pshufhw xmm13, xmm13, 0xB1
526        pshuflw xmm14, xmm14, 0xB1
527        pshufhw xmm14, xmm14, 0xB1
528        paddd   xmm10, xmm15
529        paddd   xmm11, xmm12
530        movdqa  xmm8, xmmword ptr [rsp+0x100]
531        paddd   xmm8, xmm13
532        paddd   xmm9, xmm14
533        pxor    xmm5, xmm10
534        pxor    xmm6, xmm11
535        pxor    xmm7, xmm8
536        pxor    xmm4, xmm9
537        movdqa  xmmword ptr [rsp+0x100], xmm8
538        movdqa  xmm8, xmm5
539        psrld   xmm8, 12
540        pslld   xmm5, 20
541        por     xmm5, xmm8
542        movdqa  xmm8, xmm6
543        psrld   xmm8, 12
544        pslld   xmm6, 20
545        por     xmm6, xmm8
546        movdqa  xmm8, xmm7
547        psrld   xmm8, 12
548        pslld   xmm7, 20
549        por     xmm7, xmm8
550        movdqa  xmm8, xmm4
551        psrld   xmm8, 12
552        pslld   xmm4, 20
553        por     xmm4, xmm8
554        paddd   xmm0, xmmword ptr [rsp+0xB0]
555        paddd   xmm1, xmmword ptr [rsp+0x50]
556        paddd   xmm2, xmmword ptr [rsp+0xE0]
557        paddd   xmm3, xmmword ptr [rsp+0x80]
558        paddd   xmm0, xmm5
559        paddd   xmm1, xmm6
560        paddd   xmm2, xmm7
561        paddd   xmm3, xmm4
562        pxor    xmm15, xmm0
563        pxor    xmm12, xmm1
564        pxor    xmm13, xmm2
565        pxor    xmm14, xmm3
566        movdqa  xmm8, xmm15
567        psrld   xmm15, 8
568        pslld   xmm8, 24
569        pxor    xmm15, xmm8
570        movdqa  xmm8, xmm12
571        psrld   xmm12, 8
572        pslld   xmm8, 24
573        pxor    xmm12, xmm8
574        movdqa  xmm8, xmm13
575        psrld   xmm13, 8
576        pslld   xmm8, 24
577        pxor    xmm13, xmm8
578        movdqa  xmm8, xmm14
579        psrld   xmm14, 8
580        pslld   xmm8, 24
581        pxor    xmm14, xmm8
582        paddd   xmm10, xmm15
583        paddd   xmm11, xmm12
584        movdqa  xmm8, xmmword ptr [rsp+0x100]
585        paddd   xmm8, xmm13
586        paddd   xmm9, xmm14
587        pxor    xmm5, xmm10
588        pxor    xmm6, xmm11
589        pxor    xmm7, xmm8
590        pxor    xmm4, xmm9
591        movdqa  xmmword ptr [rsp+0x100], xmm8
592        movdqa  xmm8, xmm5
593        psrld   xmm8, 7
594        pslld   xmm5, 25
595        por     xmm5, xmm8
596        movdqa  xmm8, xmm6
597        psrld   xmm8, 7
598        pslld   xmm6, 25
599        por     xmm6, xmm8
600        movdqa  xmm8, xmm7
601        psrld   xmm8, 7
602        pslld   xmm7, 25
603        por     xmm7, xmm8
604        movdqa  xmm8, xmm4
605        psrld   xmm8, 7
606        pslld   xmm4, 25
607        por     xmm4, xmm8
608        paddd   xmm0, xmmword ptr [rsp+0x30]
609        paddd   xmm1, xmmword ptr [rsp+0xA0]
610        paddd   xmm2, xmmword ptr [rsp+0xD0]
611        paddd   xmm3, xmmword ptr [rsp+0x70]
612        paddd   xmm0, xmm4
613        paddd   xmm1, xmm5
614        paddd   xmm2, xmm6
615        paddd   xmm3, xmm7
616        pxor    xmm12, xmm0
617        pxor    xmm13, xmm1
618        pxor    xmm14, xmm2
619        pxor    xmm15, xmm3
620        pshuflw xmm12, xmm12, 0xB1
621        pshufhw xmm12, xmm12, 0xB1
622        pshuflw xmm13, xmm13, 0xB1
623        pshufhw xmm13, xmm13, 0xB1
624        pshuflw xmm14, xmm14, 0xB1
625        pshufhw xmm14, xmm14, 0xB1
626        pshuflw xmm15, xmm15, 0xB1
627        pshufhw xmm15, xmm15, 0xB1
628        movdqa  xmm8, xmmword ptr [rsp+0x100]
629        paddd   xmm8, xmm12
630        paddd   xmm9, xmm13
631        paddd   xmm10, xmm14
632        paddd   xmm11, xmm15
633        pxor    xmm4, xmm8
634        pxor    xmm5, xmm9
635        pxor    xmm6, xmm10
636        pxor    xmm7, xmm11
637        movdqa  xmmword ptr [rsp+0x100], xmm8
638        movdqa  xmm8, xmm4
639        psrld   xmm8, 12
640        pslld   xmm4, 20
641        por     xmm4, xmm8
642        movdqa  xmm8, xmm5
643        psrld   xmm8, 12
644        pslld   xmm5, 20
645        por     xmm5, xmm8
646        movdqa  xmm8, xmm6
647        psrld   xmm8, 12
648        pslld   xmm6, 20
649        por     xmm6, xmm8
650        movdqa  xmm8, xmm7
651        psrld   xmm8, 12
652        pslld   xmm7, 20
653        por     xmm7, xmm8
654        paddd   xmm0, xmmword ptr [rsp+0x40]
655        paddd   xmm1, xmmword ptr [rsp+0xC0]
656        paddd   xmm2, xmmword ptr [rsp+0x20]
657        paddd   xmm3, xmmword ptr [rsp+0xE0]
658        paddd   xmm0, xmm4
659        paddd   xmm1, xmm5
660        paddd   xmm2, xmm6
661        paddd   xmm3, xmm7
662        pxor    xmm12, xmm0
663        pxor    xmm13, xmm1
664        pxor    xmm14, xmm2
665        pxor    xmm15, xmm3
666        movdqa  xmm8, xmm12
667        psrld   xmm12, 8
668        pslld   xmm8, 24
669        pxor    xmm12, xmm8
670        movdqa  xmm8, xmm13
671        psrld   xmm13, 8
672        pslld   xmm8, 24
673        pxor    xmm13, xmm8
674        movdqa  xmm8, xmm14
675        psrld   xmm14, 8
676        pslld   xmm8, 24
677        pxor    xmm14, xmm8
678        movdqa  xmm8, xmm15
679        psrld   xmm15, 8
680        pslld   xmm8, 24
681        pxor    xmm15, xmm8
682        movdqa  xmm8, xmmword ptr [rsp+0x100]
683        paddd   xmm8, xmm12
684        paddd   xmm9, xmm13
685        paddd   xmm10, xmm14
686        paddd   xmm11, xmm15
687        pxor    xmm4, xmm8
688        pxor    xmm5, xmm9
689        pxor    xmm6, xmm10
690        pxor    xmm7, xmm11
691        movdqa  xmmword ptr [rsp+0x100], xmm8
692        movdqa  xmm8, xmm4
693        psrld   xmm8, 7
694        pslld   xmm4, 25
695        por     xmm4, xmm8
696        movdqa  xmm8, xmm5
697        psrld   xmm8, 7
698        pslld   xmm5, 25
699        por     xmm5, xmm8
700        movdqa  xmm8, xmm6
701        psrld   xmm8, 7
702        pslld   xmm6, 25
703        por     xmm6, xmm8
704        movdqa  xmm8, xmm7
705        psrld   xmm8, 7
706        pslld   xmm7, 25
707        por     xmm7, xmm8
708        paddd   xmm0, xmmword ptr [rsp+0x60]
709        paddd   xmm1, xmmword ptr [rsp+0x90]
710        paddd   xmm2, xmmword ptr [rsp+0xB0]
711        paddd   xmm3, xmmword ptr [rsp+0x80]
712        paddd   xmm0, xmm5
713        paddd   xmm1, xmm6
714        paddd   xmm2, xmm7
715        paddd   xmm3, xmm4
716        pxor    xmm15, xmm0
717        pxor    xmm12, xmm1
718        pxor    xmm13, xmm2
719        pxor    xmm14, xmm3
720        pshuflw xmm15, xmm15, 0xB1
721        pshufhw xmm15, xmm15, 0xB1
722        pshuflw xmm12, xmm12, 0xB1
723        pshufhw xmm12, xmm12, 0xB1
724        pshuflw xmm13, xmm13, 0xB1
725        pshufhw xmm13, xmm13, 0xB1
726        pshuflw xmm14, xmm14, 0xB1
727        pshufhw xmm14, xmm14, 0xB1
728        paddd   xmm10, xmm15
729        paddd   xmm11, xmm12
730        movdqa  xmm8, xmmword ptr [rsp+0x100]
731        paddd   xmm8, xmm13
732        paddd   xmm9, xmm14
733        pxor    xmm5, xmm10
734        pxor    xmm6, xmm11
735        pxor    xmm7, xmm8
736        pxor    xmm4, xmm9
737        movdqa  xmmword ptr [rsp+0x100], xmm8
738        movdqa  xmm8, xmm5
739        psrld   xmm8, 12
740        pslld   xmm5, 20
741        por     xmm5, xmm8
742        movdqa  xmm8, xmm6
743        psrld   xmm8, 12
744        pslld   xmm6, 20
745        por     xmm6, xmm8
746        movdqa  xmm8, xmm7
747        psrld   xmm8, 12
748        pslld   xmm7, 20
749        por     xmm7, xmm8
750        movdqa  xmm8, xmm4
751        psrld   xmm8, 12
752        pslld   xmm4, 20
753        por     xmm4, xmm8
754        paddd   xmm0, xmmword ptr [rsp+0x50]
755        paddd   xmm1, xmmword ptr [rsp]
756        paddd   xmm2, xmmword ptr [rsp+0xF0]
757        paddd   xmm3, xmmword ptr [rsp+0x10]
758        paddd   xmm0, xmm5
759        paddd   xmm1, xmm6
760        paddd   xmm2, xmm7
761        paddd   xmm3, xmm4
762        pxor    xmm15, xmm0
763        pxor    xmm12, xmm1
764        pxor    xmm13, xmm2
765        pxor    xmm14, xmm3
766        movdqa  xmm8, xmm15
767        psrld   xmm15, 8
768        pslld   xmm8, 24
769        pxor    xmm15, xmm8
770        movdqa  xmm8, xmm12
771        psrld   xmm12, 8
772        pslld   xmm8, 24
773        pxor    xmm12, xmm8
774        movdqa  xmm8, xmm13
775        psrld   xmm13, 8
776        pslld   xmm8, 24
777        pxor    xmm13, xmm8
778        movdqa  xmm8, xmm14
779        psrld   xmm14, 8
780        pslld   xmm8, 24
781        pxor    xmm14, xmm8
782        paddd   xmm10, xmm15
783        paddd   xmm11, xmm12
784        movdqa  xmm8, xmmword ptr [rsp+0x100]
785        paddd   xmm8, xmm13
786        paddd   xmm9, xmm14
787        pxor    xmm5, xmm10
788        pxor    xmm6, xmm11
789        pxor    xmm7, xmm8
790        pxor    xmm4, xmm9
791        movdqa  xmmword ptr [rsp+0x100], xmm8
792        movdqa  xmm8, xmm5
793        psrld   xmm8, 7
794        pslld   xmm5, 25
795        por     xmm5, xmm8
796        movdqa  xmm8, xmm6
797        psrld   xmm8, 7
798        pslld   xmm6, 25
799        por     xmm6, xmm8
800        movdqa  xmm8, xmm7
801        psrld   xmm8, 7
802        pslld   xmm7, 25
803        por     xmm7, xmm8
804        movdqa  xmm8, xmm4
805        psrld   xmm8, 7
806        pslld   xmm4, 25
807        por     xmm4, xmm8
808        paddd   xmm0, xmmword ptr [rsp+0xA0]
809        paddd   xmm1, xmmword ptr [rsp+0xC0]
810        paddd   xmm2, xmmword ptr [rsp+0xE0]
811        paddd   xmm3, xmmword ptr [rsp+0xD0]
812        paddd   xmm0, xmm4
813        paddd   xmm1, xmm5
814        paddd   xmm2, xmm6
815        paddd   xmm3, xmm7
816        pxor    xmm12, xmm0
817        pxor    xmm13, xmm1
818        pxor    xmm14, xmm2
819        pxor    xmm15, xmm3
820        pshuflw xmm12, xmm12, 0xB1
821        pshufhw xmm12, xmm12, 0xB1
822        pshuflw xmm13, xmm13, 0xB1
823        pshufhw xmm13, xmm13, 0xB1
824        pshuflw xmm14, xmm14, 0xB1
825        pshufhw xmm14, xmm14, 0xB1
826        pshuflw xmm15, xmm15, 0xB1
827        pshufhw xmm15, xmm15, 0xB1
828        movdqa  xmm8, xmmword ptr [rsp+0x100]
829        paddd   xmm8, xmm12
830        paddd   xmm9, xmm13
831        paddd   xmm10, xmm14
832        paddd   xmm11, xmm15
833        pxor    xmm4, xmm8
834        pxor    xmm5, xmm9
835        pxor    xmm6, xmm10
836        pxor    xmm7, xmm11
837        movdqa  xmmword ptr [rsp+0x100], xmm8
838        movdqa  xmm8, xmm4
839        psrld   xmm8, 12
840        pslld   xmm4, 20
841        por     xmm4, xmm8
842        movdqa  xmm8, xmm5
843        psrld   xmm8, 12
844        pslld   xmm5, 20
845        por     xmm5, xmm8
846        movdqa  xmm8, xmm6
847        psrld   xmm8, 12
848        pslld   xmm6, 20
849        por     xmm6, xmm8
850        movdqa  xmm8, xmm7
851        psrld   xmm8, 12
852        pslld   xmm7, 20
853        por     xmm7, xmm8
854        paddd   xmm0, xmmword ptr [rsp+0x70]
855        paddd   xmm1, xmmword ptr [rsp+0x90]
856        paddd   xmm2, xmmword ptr [rsp+0x30]
857        paddd   xmm3, xmmword ptr [rsp+0xF0]
858        paddd   xmm0, xmm4
859        paddd   xmm1, xmm5
860        paddd   xmm2, xmm6
861        paddd   xmm3, xmm7
862        pxor    xmm12, xmm0
863        pxor    xmm13, xmm1
864        pxor    xmm14, xmm2
865        pxor    xmm15, xmm3
866        movdqa  xmm8, xmm12
867        psrld   xmm12, 8
868        pslld   xmm8, 24
869        pxor    xmm12, xmm8
870        movdqa  xmm8, xmm13
871        psrld   xmm13, 8
872        pslld   xmm8, 24
873        pxor    xmm13, xmm8
874        movdqa  xmm8, xmm14
875        psrld   xmm14, 8
876        pslld   xmm8, 24
877        pxor    xmm14, xmm8
878        movdqa  xmm8, xmm15
879        psrld   xmm15, 8
880        pslld   xmm8, 24
881        pxor    xmm15, xmm8
882        movdqa  xmm8, xmmword ptr [rsp+0x100]
883        paddd   xmm8, xmm12
884        paddd   xmm9, xmm13
885        paddd   xmm10, xmm14
886        paddd   xmm11, xmm15
887        pxor    xmm4, xmm8
888        pxor    xmm5, xmm9
889        pxor    xmm6, xmm10
890        pxor    xmm7, xmm11
891        movdqa  xmmword ptr [rsp+0x100], xmm8
892        movdqa  xmm8, xmm4
893        psrld   xmm8, 7
894        pslld   xmm4, 25
895        por     xmm4, xmm8
896        movdqa  xmm8, xmm5
897        psrld   xmm8, 7
898        pslld   xmm5, 25
899        por     xmm5, xmm8
900        movdqa  xmm8, xmm6
901        psrld   xmm8, 7
902        pslld   xmm6, 25
903        por     xmm6, xmm8
904        movdqa  xmm8, xmm7
905        psrld   xmm8, 7
906        pslld   xmm7, 25
907        por     xmm7, xmm8
908        paddd   xmm0, xmmword ptr [rsp+0x40]
909        paddd   xmm1, xmmword ptr [rsp+0xB0]
910        paddd   xmm2, xmmword ptr [rsp+0x50]
911        paddd   xmm3, xmmword ptr [rsp+0x10]
912        paddd   xmm0, xmm5
913        paddd   xmm1, xmm6
914        paddd   xmm2, xmm7
915        paddd   xmm3, xmm4
916        pxor    xmm15, xmm0
917        pxor    xmm12, xmm1
918        pxor    xmm13, xmm2
919        pxor    xmm14, xmm3
920        pshuflw xmm15, xmm15, 0xB1
921        pshufhw xmm15, xmm15, 0xB1
922        pshuflw xmm12, xmm12, 0xB1
923        pshufhw xmm12, xmm12, 0xB1
924        pshuflw xmm13, xmm13, 0xB1
925        pshufhw xmm13, xmm13, 0xB1
926        pshuflw xmm14, xmm14, 0xB1
927        pshufhw xmm14, xmm14, 0xB1
928        paddd   xmm10, xmm15
929        paddd   xmm11, xmm12
930        movdqa  xmm8, xmmword ptr [rsp+0x100]
931        paddd   xmm8, xmm13
932        paddd   xmm9, xmm14
933        pxor    xmm5, xmm10
934        pxor    xmm6, xmm11
935        pxor    xmm7, xmm8
936        pxor    xmm4, xmm9
937        movdqa  xmmword ptr [rsp+0x100], xmm8
938        movdqa  xmm8, xmm5
939        psrld   xmm8, 12
940        pslld   xmm5, 20
941        por     xmm5, xmm8
942        movdqa  xmm8, xmm6
943        psrld   xmm8, 12
944        pslld   xmm6, 20
945        por     xmm6, xmm8
946        movdqa  xmm8, xmm7
947        psrld   xmm8, 12
948        pslld   xmm7, 20
949        por     xmm7, xmm8
950        movdqa  xmm8, xmm4
951        psrld   xmm8, 12
952        pslld   xmm4, 20
953        por     xmm4, xmm8
954        paddd   xmm0, xmmword ptr [rsp]
955        paddd   xmm1, xmmword ptr [rsp+0x20]
956        paddd   xmm2, xmmword ptr [rsp+0x80]
957        paddd   xmm3, xmmword ptr [rsp+0x60]
958        paddd   xmm0, xmm5
959        paddd   xmm1, xmm6
960        paddd   xmm2, xmm7
961        paddd   xmm3, xmm4
962        pxor    xmm15, xmm0
963        pxor    xmm12, xmm1
964        pxor    xmm13, xmm2
965        pxor    xmm14, xmm3
966        movdqa  xmm8, xmm15
967        psrld   xmm15, 8
968        pslld   xmm8, 24
969        pxor    xmm15, xmm8
970        movdqa  xmm8, xmm12
971        psrld   xmm12, 8
972        pslld   xmm8, 24
973        pxor    xmm12, xmm8
974        movdqa  xmm8, xmm13
975        psrld   xmm13, 8
976        pslld   xmm8, 24
977        pxor    xmm13, xmm8
978        movdqa  xmm8, xmm14
979        psrld   xmm14, 8
980        pslld   xmm8, 24
981        pxor    xmm14, xmm8
982        paddd   xmm10, xmm15
983        paddd   xmm11, xmm12
984        movdqa  xmm8, xmmword ptr [rsp+0x100]
985        paddd   xmm8, xmm13
986        paddd   xmm9, xmm14
987        pxor    xmm5, xmm10
988        pxor    xmm6, xmm11
989        pxor    xmm7, xmm8
990        pxor    xmm4, xmm9
991        movdqa  xmmword ptr [rsp+0x100], xmm8
992        movdqa  xmm8, xmm5
993        psrld   xmm8, 7
994        pslld   xmm5, 25
995        por     xmm5, xmm8
996        movdqa  xmm8, xmm6
997        psrld   xmm8, 7
998        pslld   xmm6, 25
999        por     xmm6, xmm8
1000        movdqa  xmm8, xmm7
1001        psrld   xmm8, 7
1002        pslld   xmm7, 25
1003        por     xmm7, xmm8
1004        movdqa  xmm8, xmm4
1005        psrld   xmm8, 7
1006        pslld   xmm4, 25
1007        por     xmm4, xmm8
1008        paddd   xmm0, xmmword ptr [rsp+0xC0]
1009        paddd   xmm1, xmmword ptr [rsp+0x90]
1010        paddd   xmm2, xmmword ptr [rsp+0xF0]
1011        paddd   xmm3, xmmword ptr [rsp+0xE0]
1012        paddd   xmm0, xmm4
1013        paddd   xmm1, xmm5
1014        paddd   xmm2, xmm6
1015        paddd   xmm3, xmm7
1016        pxor    xmm12, xmm0
1017        pxor    xmm13, xmm1
1018        pxor    xmm14, xmm2
1019        pxor    xmm15, xmm3
1020        pshuflw xmm12, xmm12, 0xB1
1021        pshufhw xmm12, xmm12, 0xB1
1022        pshuflw xmm13, xmm13, 0xB1
1023        pshufhw xmm13, xmm13, 0xB1
1024        pshuflw xmm14, xmm14, 0xB1
1025        pshufhw xmm14, xmm14, 0xB1
1026        pshuflw xmm15, xmm15, 0xB1
1027        pshufhw xmm15, xmm15, 0xB1
1028        movdqa  xmm8, xmmword ptr [rsp+0x100]
1029        paddd   xmm8, xmm12
1030        paddd   xmm9, xmm13
1031        paddd   xmm10, xmm14
1032        paddd   xmm11, xmm15
1033        pxor    xmm4, xmm8
1034        pxor    xmm5, xmm9
1035        pxor    xmm6, xmm10
1036        pxor    xmm7, xmm11
1037        movdqa  xmmword ptr [rsp+0x100], xmm8
1038        movdqa  xmm8, xmm4
1039        psrld   xmm8, 12
1040        pslld   xmm4, 20
1041        por     xmm4, xmm8
1042        movdqa  xmm8, xmm5
1043        psrld   xmm8, 12
1044        pslld   xmm5, 20
1045        por     xmm5, xmm8
1046        movdqa  xmm8, xmm6
1047        psrld   xmm8, 12
1048        pslld   xmm6, 20
1049        por     xmm6, xmm8
1050        movdqa  xmm8, xmm7
1051        psrld   xmm8, 12
1052        pslld   xmm7, 20
1053        por     xmm7, xmm8
1054        paddd   xmm0, xmmword ptr [rsp+0xD0]
1055        paddd   xmm1, xmmword ptr [rsp+0xB0]
1056        paddd   xmm2, xmmword ptr [rsp+0xA0]
1057        paddd   xmm3, xmmword ptr [rsp+0x80]
1058        paddd   xmm0, xmm4
1059        paddd   xmm1, xmm5
1060        paddd   xmm2, xmm6
1061        paddd   xmm3, xmm7
1062        pxor    xmm12, xmm0
1063        pxor    xmm13, xmm1
1064        pxor    xmm14, xmm2
1065        pxor    xmm15, xmm3
1066        movdqa  xmm8, xmm12
1067        psrld   xmm12, 8
1068        pslld   xmm8, 24
1069        pxor    xmm12, xmm8
1070        movdqa  xmm8, xmm13
1071        psrld   xmm13, 8
1072        pslld   xmm8, 24
1073        pxor    xmm13, xmm8
1074        movdqa  xmm8, xmm14
1075        psrld   xmm14, 8
1076        pslld   xmm8, 24
1077        pxor    xmm14, xmm8
1078        movdqa  xmm8, xmm15
1079        psrld   xmm15, 8
1080        pslld   xmm8, 24
1081        pxor    xmm15, xmm8
1082        movdqa  xmm8, xmmword ptr [rsp+0x100]
1083        paddd   xmm8, xmm12
1084        paddd   xmm9, xmm13
1085        paddd   xmm10, xmm14
1086        paddd   xmm11, xmm15
1087        pxor    xmm4, xmm8
1088        pxor    xmm5, xmm9
1089        pxor    xmm6, xmm10
1090        pxor    xmm7, xmm11
1091        movdqa  xmmword ptr [rsp+0x100], xmm8
1092        movdqa  xmm8, xmm4
1093        psrld   xmm8, 7
1094        pslld   xmm4, 25
1095        por     xmm4, xmm8
1096        movdqa  xmm8, xmm5
1097        psrld   xmm8, 7
1098        pslld   xmm5, 25
1099        por     xmm5, xmm8
1100        movdqa  xmm8, xmm6
1101        psrld   xmm8, 7
1102        pslld   xmm6, 25
1103        por     xmm6, xmm8
1104        movdqa  xmm8, xmm7
1105        psrld   xmm8, 7
1106        pslld   xmm7, 25
1107        por     xmm7, xmm8
1108        paddd   xmm0, xmmword ptr [rsp+0x70]
1109        paddd   xmm1, xmmword ptr [rsp+0x50]
1110        paddd   xmm2, xmmword ptr [rsp]
1111        paddd   xmm3, xmmword ptr [rsp+0x60]
1112        paddd   xmm0, xmm5
1113        paddd   xmm1, xmm6
1114        paddd   xmm2, xmm7
1115        paddd   xmm3, xmm4
1116        pxor    xmm15, xmm0
1117        pxor    xmm12, xmm1
1118        pxor    xmm13, xmm2
1119        pxor    xmm14, xmm3
1120        pshuflw xmm15, xmm15, 0xB1
1121        pshufhw xmm15, xmm15, 0xB1
1122        pshuflw xmm12, xmm12, 0xB1
1123        pshufhw xmm12, xmm12, 0xB1
1124        pshuflw xmm13, xmm13, 0xB1
1125        pshufhw xmm13, xmm13, 0xB1
1126        pshuflw xmm14, xmm14, 0xB1
1127        pshufhw xmm14, xmm14, 0xB1
1128        paddd   xmm10, xmm15
1129        paddd   xmm11, xmm12
1130        movdqa  xmm8, xmmword ptr [rsp+0x100]
1131        paddd   xmm8, xmm13
1132        paddd   xmm9, xmm14
1133        pxor    xmm5, xmm10
1134        pxor    xmm6, xmm11
1135        pxor    xmm7, xmm8
1136        pxor    xmm4, xmm9
1137        movdqa  xmmword ptr [rsp+0x100], xmm8
1138        movdqa  xmm8, xmm5
1139        psrld   xmm8, 12
1140        pslld   xmm5, 20
1141        por     xmm5, xmm8
1142        movdqa  xmm8, xmm6
1143        psrld   xmm8, 12
1144        pslld   xmm6, 20
1145        por     xmm6, xmm8
1146        movdqa  xmm8, xmm7
1147        psrld   xmm8, 12
1148        pslld   xmm7, 20
1149        por     xmm7, xmm8
1150        movdqa  xmm8, xmm4
1151        psrld   xmm8, 12
1152        pslld   xmm4, 20
1153        por     xmm4, xmm8
1154        paddd   xmm0, xmmword ptr [rsp+0x20]
1155        paddd   xmm1, xmmword ptr [rsp+0x30]
1156        paddd   xmm2, xmmword ptr [rsp+0x10]
1157        paddd   xmm3, xmmword ptr [rsp+0x40]
1158        paddd   xmm0, xmm5
1159        paddd   xmm1, xmm6
1160        paddd   xmm2, xmm7
1161        paddd   xmm3, xmm4
1162        pxor    xmm15, xmm0
1163        pxor    xmm12, xmm1
1164        pxor    xmm13, xmm2
1165        pxor    xmm14, xmm3
1166        movdqa  xmm8, xmm15
1167        psrld   xmm15, 8
1168        pslld   xmm8, 24
1169        pxor    xmm15, xmm8
1170        movdqa  xmm8, xmm12
1171        psrld   xmm12, 8
1172        pslld   xmm8, 24
1173        pxor    xmm12, xmm8
1174        movdqa  xmm8, xmm13
1175        psrld   xmm13, 8
1176        pslld   xmm8, 24
1177        pxor    xmm13, xmm8
1178        movdqa  xmm8, xmm14
1179        psrld   xmm14, 8
1180        pslld   xmm8, 24
1181        pxor    xmm14, xmm8
1182        paddd   xmm10, xmm15
1183        paddd   xmm11, xmm12
1184        movdqa  xmm8, xmmword ptr [rsp+0x100]
1185        paddd   xmm8, xmm13
1186        paddd   xmm9, xmm14
1187        pxor    xmm5, xmm10
1188        pxor    xmm6, xmm11
1189        pxor    xmm7, xmm8
1190        pxor    xmm4, xmm9
1191        movdqa  xmmword ptr [rsp+0x100], xmm8
1192        movdqa  xmm8, xmm5
1193        psrld   xmm8, 7
1194        pslld   xmm5, 25
1195        por     xmm5, xmm8
1196        movdqa  xmm8, xmm6
1197        psrld   xmm8, 7
1198        pslld   xmm6, 25
1199        por     xmm6, xmm8
1200        movdqa  xmm8, xmm7
1201        psrld   xmm8, 7
1202        pslld   xmm7, 25
1203        por     xmm7, xmm8
1204        movdqa  xmm8, xmm4
1205        psrld   xmm8, 7
1206        pslld   xmm4, 25
1207        por     xmm4, xmm8
1208        paddd   xmm0, xmmword ptr [rsp+0x90]
1209        paddd   xmm1, xmmword ptr [rsp+0xB0]
1210        paddd   xmm2, xmmword ptr [rsp+0x80]
1211        paddd   xmm3, xmmword ptr [rsp+0xF0]
1212        paddd   xmm0, xmm4
1213        paddd   xmm1, xmm5
1214        paddd   xmm2, xmm6
1215        paddd   xmm3, xmm7
1216        pxor    xmm12, xmm0
1217        pxor    xmm13, xmm1
1218        pxor    xmm14, xmm2
1219        pxor    xmm15, xmm3
1220        pshuflw xmm12, xmm12, 0xB1
1221        pshufhw xmm12, xmm12, 0xB1
1222        pshuflw xmm13, xmm13, 0xB1
1223        pshufhw xmm13, xmm13, 0xB1
1224        pshuflw xmm14, xmm14, 0xB1
1225        pshufhw xmm14, xmm14, 0xB1
1226        pshuflw xmm15, xmm15, 0xB1
1227        pshufhw xmm15, xmm15, 0xB1
1228        movdqa  xmm8, xmmword ptr [rsp+0x100]
1229        paddd   xmm8, xmm12
1230        paddd   xmm9, xmm13
1231        paddd   xmm10, xmm14
1232        paddd   xmm11, xmm15
1233        pxor    xmm4, xmm8
1234        pxor    xmm5, xmm9
1235        pxor    xmm6, xmm10
1236        pxor    xmm7, xmm11
1237        movdqa  xmmword ptr [rsp+0x100], xmm8
1238        movdqa  xmm8, xmm4
1239        psrld   xmm8, 12
1240        pslld   xmm4, 20
1241        por     xmm4, xmm8
1242        movdqa  xmm8, xmm5
1243        psrld   xmm8, 12
1244        pslld   xmm5, 20
1245        por     xmm5, xmm8
1246        movdqa  xmm8, xmm6
1247        psrld   xmm8, 12
1248        pslld   xmm6, 20
1249        por     xmm6, xmm8
1250        movdqa  xmm8, xmm7
1251        psrld   xmm8, 12
1252        pslld   xmm7, 20
1253        por     xmm7, xmm8
1254        paddd   xmm0, xmmword ptr [rsp+0xE0]
1255        paddd   xmm1, xmmword ptr [rsp+0x50]
1256        paddd   xmm2, xmmword ptr [rsp+0xC0]
1257        paddd   xmm3, xmmword ptr [rsp+0x10]
1258        paddd   xmm0, xmm4
1259        paddd   xmm1, xmm5
1260        paddd   xmm2, xmm6
1261        paddd   xmm3, xmm7
1262        pxor    xmm12, xmm0
1263        pxor    xmm13, xmm1
1264        pxor    xmm14, xmm2
1265        pxor    xmm15, xmm3
1266        movdqa  xmm8, xmm12
1267        psrld   xmm12, 8
1268        pslld   xmm8, 24
1269        pxor    xmm12, xmm8
1270        movdqa  xmm8, xmm13
1271        psrld   xmm13, 8
1272        pslld   xmm8, 24
1273        pxor    xmm13, xmm8
1274        movdqa  xmm8, xmm14
1275        psrld   xmm14, 8
1276        pslld   xmm8, 24
1277        pxor    xmm14, xmm8
1278        movdqa  xmm8, xmm15
1279        psrld   xmm15, 8
1280        pslld   xmm8, 24
1281        pxor    xmm15, xmm8
1282        movdqa  xmm8, xmmword ptr [rsp+0x100]
1283        paddd   xmm8, xmm12
1284        paddd   xmm9, xmm13
1285        paddd   xmm10, xmm14
1286        paddd   xmm11, xmm15
1287        pxor    xmm4, xmm8
1288        pxor    xmm5, xmm9
1289        pxor    xmm6, xmm10
1290        pxor    xmm7, xmm11
1291        movdqa  xmmword ptr [rsp+0x100], xmm8
1292        movdqa  xmm8, xmm4
1293        psrld   xmm8, 7
1294        pslld   xmm4, 25
1295        por     xmm4, xmm8
1296        movdqa  xmm8, xmm5
1297        psrld   xmm8, 7
1298        pslld   xmm5, 25
1299        por     xmm5, xmm8
1300        movdqa  xmm8, xmm6
1301        psrld   xmm8, 7
1302        pslld   xmm6, 25
1303        por     xmm6, xmm8
1304        movdqa  xmm8, xmm7
1305        psrld   xmm8, 7
1306        pslld   xmm7, 25
1307        por     xmm7, xmm8
1308        paddd   xmm0, xmmword ptr [rsp+0xD0]
1309        paddd   xmm1, xmmword ptr [rsp]
1310        paddd   xmm2, xmmword ptr [rsp+0x20]
1311        paddd   xmm3, xmmword ptr [rsp+0x40]
1312        paddd   xmm0, xmm5
1313        paddd   xmm1, xmm6
1314        paddd   xmm2, xmm7
1315        paddd   xmm3, xmm4
1316        pxor    xmm15, xmm0
1317        pxor    xmm12, xmm1
1318        pxor    xmm13, xmm2
1319        pxor    xmm14, xmm3
1320        pshuflw xmm15, xmm15, 0xB1
1321        pshufhw xmm15, xmm15, 0xB1
1322        pshuflw xmm12, xmm12, 0xB1
1323        pshufhw xmm12, xmm12, 0xB1
1324        pshuflw xmm13, xmm13, 0xB1
1325        pshufhw xmm13, xmm13, 0xB1
1326        pshuflw xmm14, xmm14, 0xB1
1327        pshufhw xmm14, xmm14, 0xB1
1328        paddd   xmm10, xmm15
1329        paddd   xmm11, xmm12
1330        movdqa  xmm8, xmmword ptr [rsp+0x100]
1331        paddd   xmm8, xmm13
1332        paddd   xmm9, xmm14
1333        pxor    xmm5, xmm10
1334        pxor    xmm6, xmm11
1335        pxor    xmm7, xmm8
1336        pxor    xmm4, xmm9
1337        movdqa  xmmword ptr [rsp+0x100], xmm8
1338        movdqa  xmm8, xmm5
1339        psrld   xmm8, 12
1340        pslld   xmm5, 20
1341        por     xmm5, xmm8
1342        movdqa  xmm8, xmm6
1343        psrld   xmm8, 12
1344        pslld   xmm6, 20
1345        por     xmm6, xmm8
1346        movdqa  xmm8, xmm7
1347        psrld   xmm8, 12
1348        pslld   xmm7, 20
1349        por     xmm7, xmm8
1350        movdqa  xmm8, xmm4
1351        psrld   xmm8, 12
1352        pslld   xmm4, 20
1353        por     xmm4, xmm8
1354        paddd   xmm0, xmmword ptr [rsp+0x30]
1355        paddd   xmm1, xmmword ptr [rsp+0xA0]
1356        paddd   xmm2, xmmword ptr [rsp+0x60]
1357        paddd   xmm3, xmmword ptr [rsp+0x70]
1358        paddd   xmm0, xmm5
1359        paddd   xmm1, xmm6
1360        paddd   xmm2, xmm7
1361        paddd   xmm3, xmm4
1362        pxor    xmm15, xmm0
1363        pxor    xmm12, xmm1
1364        pxor    xmm13, xmm2
1365        pxor    xmm14, xmm3
1366        movdqa  xmm8, xmm15
1367        psrld   xmm15, 8
1368        pslld   xmm8, 24
1369        pxor    xmm15, xmm8
1370        movdqa  xmm8, xmm12
1371        psrld   xmm12, 8
1372        pslld   xmm8, 24
1373        pxor    xmm12, xmm8
1374        movdqa  xmm8, xmm13
1375        psrld   xmm13, 8
1376        pslld   xmm8, 24
1377        pxor    xmm13, xmm8
1378        movdqa  xmm8, xmm14
1379        psrld   xmm14, 8
1380        pslld   xmm8, 24
1381        pxor    xmm14, xmm8
1382        paddd   xmm10, xmm15
1383        paddd   xmm11, xmm12
1384        movdqa  xmm8, xmmword ptr [rsp+0x100]
1385        paddd   xmm8, xmm13
1386        paddd   xmm9, xmm14
1387        pxor    xmm5, xmm10
1388        pxor    xmm6, xmm11
1389        pxor    xmm7, xmm8
1390        pxor    xmm4, xmm9
1391        movdqa  xmmword ptr [rsp+0x100], xmm8
1392        movdqa  xmm8, xmm5
1393        psrld   xmm8, 7
1394        pslld   xmm5, 25
1395        por     xmm5, xmm8
1396        movdqa  xmm8, xmm6
1397        psrld   xmm8, 7
1398        pslld   xmm6, 25
1399        por     xmm6, xmm8
1400        movdqa  xmm8, xmm7
1401        psrld   xmm8, 7
1402        pslld   xmm7, 25
1403        por     xmm7, xmm8
1404        movdqa  xmm8, xmm4
1405        psrld   xmm8, 7
1406        pslld   xmm4, 25
1407        por     xmm4, xmm8
1408        paddd   xmm0, xmmword ptr [rsp+0xB0]
1409        paddd   xmm1, xmmword ptr [rsp+0x50]
1410        paddd   xmm2, xmmword ptr [rsp+0x10]
1411        paddd   xmm3, xmmword ptr [rsp+0x80]
1412        paddd   xmm0, xmm4
1413        paddd   xmm1, xmm5
1414        paddd   xmm2, xmm6
1415        paddd   xmm3, xmm7
1416        pxor    xmm12, xmm0
1417        pxor    xmm13, xmm1
1418        pxor    xmm14, xmm2
1419        pxor    xmm15, xmm3
1420        pshuflw xmm12, xmm12, 0xB1
1421        pshufhw xmm12, xmm12, 0xB1
1422        pshuflw xmm13, xmm13, 0xB1
1423        pshufhw xmm13, xmm13, 0xB1
1424        pshuflw xmm14, xmm14, 0xB1
1425        pshufhw xmm14, xmm14, 0xB1
1426        pshuflw xmm15, xmm15, 0xB1
1427        pshufhw xmm15, xmm15, 0xB1
1428        movdqa  xmm8, xmmword ptr [rsp+0x100]
1429        paddd   xmm8, xmm12
1430        paddd   xmm9, xmm13
1431        paddd   xmm10, xmm14
1432        paddd   xmm11, xmm15
1433        pxor    xmm4, xmm8
1434        pxor    xmm5, xmm9
1435        pxor    xmm6, xmm10
1436        pxor    xmm7, xmm11
1437        movdqa  xmmword ptr [rsp+0x100], xmm8
1438        movdqa  xmm8, xmm4
1439        psrld   xmm8, 12
1440        pslld   xmm4, 20
1441        por     xmm4, xmm8
1442        movdqa  xmm8, xmm5
1443        psrld   xmm8, 12
1444        pslld   xmm5, 20
1445        por     xmm5, xmm8
1446        movdqa  xmm8, xmm6
1447        psrld   xmm8, 12
1448        pslld   xmm6, 20
1449        por     xmm6, xmm8
1450        movdqa  xmm8, xmm7
1451        psrld   xmm8, 12
1452        pslld   xmm7, 20
1453        por     xmm7, xmm8
1454        paddd   xmm0, xmmword ptr [rsp+0xF0]
1455        paddd   xmm1, xmmword ptr [rsp]
1456        paddd   xmm2, xmmword ptr [rsp+0x90]
1457        paddd   xmm3, xmmword ptr [rsp+0x60]
1458        paddd   xmm0, xmm4
1459        paddd   xmm1, xmm5
1460        paddd   xmm2, xmm6
1461        paddd   xmm3, xmm7
1462        pxor    xmm12, xmm0
1463        pxor    xmm13, xmm1
1464        pxor    xmm14, xmm2
1465        pxor    xmm15, xmm3
1466        movdqa  xmm8, xmm12
1467        psrld   xmm12, 8
1468        pslld   xmm8, 24
1469        pxor    xmm12, xmm8
1470        movdqa  xmm8, xmm13
1471        psrld   xmm13, 8
1472        pslld   xmm8, 24
1473        pxor    xmm13, xmm8
1474        movdqa  xmm8, xmm14
1475        psrld   xmm14, 8
1476        pslld   xmm8, 24
1477        pxor    xmm14, xmm8
1478        movdqa  xmm8, xmm15
1479        psrld   xmm15, 8
1480        pslld   xmm8, 24
1481        pxor    xmm15, xmm8
1482        movdqa  xmm8, xmmword ptr [rsp+0x100]
1483        paddd   xmm8, xmm12
1484        paddd   xmm9, xmm13
1485        paddd   xmm10, xmm14
1486        paddd   xmm11, xmm15
1487        pxor    xmm4, xmm8
1488        pxor    xmm5, xmm9
1489        pxor    xmm6, xmm10
1490        pxor    xmm7, xmm11
1491        movdqa  xmmword ptr [rsp+0x100], xmm8
1492        movdqa  xmm8, xmm4
1493        psrld   xmm8, 7
1494        pslld   xmm4, 25
1495        por     xmm4, xmm8
1496        movdqa  xmm8, xmm5
1497        psrld   xmm8, 7
1498        pslld   xmm5, 25
1499        por     xmm5, xmm8
1500        movdqa  xmm8, xmm6
1501        psrld   xmm8, 7
1502        pslld   xmm6, 25
1503        por     xmm6, xmm8
1504        movdqa  xmm8, xmm7
1505        psrld   xmm8, 7
1506        pslld   xmm7, 25
1507        por     xmm7, xmm8
1508        paddd   xmm0, xmmword ptr [rsp+0xE0]
1509        paddd   xmm1, xmmword ptr [rsp+0x20]
1510        paddd   xmm2, xmmword ptr [rsp+0x30]
1511        paddd   xmm3, xmmword ptr [rsp+0x70]
1512        paddd   xmm0, xmm5
1513        paddd   xmm1, xmm6
1514        paddd   xmm2, xmm7
1515        paddd   xmm3, xmm4
1516        pxor    xmm15, xmm0
1517        pxor    xmm12, xmm1
1518        pxor    xmm13, xmm2
1519        pxor    xmm14, xmm3
1520        pshuflw xmm15, xmm15, 0xB1
1521        pshufhw xmm15, xmm15, 0xB1
1522        pshuflw xmm12, xmm12, 0xB1
1523        pshufhw xmm12, xmm12, 0xB1
1524        pshuflw xmm13, xmm13, 0xB1
1525        pshufhw xmm13, xmm13, 0xB1
1526        pshuflw xmm14, xmm14, 0xB1
1527        pshufhw xmm14, xmm14, 0xB1
1528        paddd   xmm10, xmm15
1529        paddd   xmm11, xmm12
1530        movdqa  xmm8, xmmword ptr [rsp+0x100]
1531        paddd   xmm8, xmm13
1532        paddd   xmm9, xmm14
1533        pxor    xmm5, xmm10
1534        pxor    xmm6, xmm11
1535        pxor    xmm7, xmm8
1536        pxor    xmm4, xmm9
1537        movdqa  xmmword ptr [rsp+0x100], xmm8
1538        movdqa  xmm8, xmm5
1539        psrld   xmm8, 12
1540        pslld   xmm5, 20
1541        por     xmm5, xmm8
1542        movdqa  xmm8, xmm6
1543        psrld   xmm8, 12
1544        pslld   xmm6, 20
1545        por     xmm6, xmm8
1546        movdqa  xmm8, xmm7
1547        psrld   xmm8, 12
1548        pslld   xmm7, 20
1549        por     xmm7, xmm8
1550        movdqa  xmm8, xmm4
1551        psrld   xmm8, 12
1552        pslld   xmm4, 20
1553        por     xmm4, xmm8
1554        paddd   xmm0, xmmword ptr [rsp+0xA0]
1555        paddd   xmm1, xmmword ptr [rsp+0xC0]
1556        paddd   xmm2, xmmword ptr [rsp+0x40]
1557        paddd   xmm3, xmmword ptr [rsp+0xD0]
1558        paddd   xmm0, xmm5
1559        paddd   xmm1, xmm6
1560        paddd   xmm2, xmm7
1561        paddd   xmm3, xmm4
1562        pxor    xmm15, xmm0
1563        pxor    xmm12, xmm1
1564        pxor    xmm13, xmm2
1565        pxor    xmm14, xmm3
1566        movdqa  xmm8, xmm15
1567        psrld   xmm15, 8
1568        pslld   xmm8, 24
1569        pxor    xmm15, xmm8
1570        movdqa  xmm8, xmm12
1571        psrld   xmm12, 8
1572        pslld   xmm8, 24
1573        pxor    xmm12, xmm8
1574        movdqa  xmm8, xmm13
1575        psrld   xmm13, 8
1576        pslld   xmm8, 24
1577        pxor    xmm13, xmm8
1578        movdqa  xmm8, xmm14
1579        psrld   xmm14, 8
1580        pslld   xmm8, 24
1581        pxor    xmm14, xmm8
1582        paddd   xmm10, xmm15
1583        paddd   xmm11, xmm12
1584        movdqa  xmm8, xmmword ptr [rsp+0x100]
1585        paddd   xmm8, xmm13
1586        paddd   xmm9, xmm14
1587        pxor    xmm5, xmm10
1588        pxor    xmm6, xmm11
1589        pxor    xmm7, xmm8
1590        pxor    xmm4, xmm9
1591        pxor    xmm0, xmm8
1592        pxor    xmm1, xmm9
1593        pxor    xmm2, xmm10
1594        pxor    xmm3, xmm11
1595        movdqa  xmm8, xmm5
1596        psrld   xmm8, 7
1597        pslld   xmm5, 25
1598        por     xmm5, xmm8
1599        movdqa  xmm8, xmm6
1600        psrld   xmm8, 7
1601        pslld   xmm6, 25
1602        por     xmm6, xmm8
1603        movdqa  xmm8, xmm7
1604        psrld   xmm8, 7
1605        pslld   xmm7, 25
1606        por     xmm7, xmm8
1607        movdqa  xmm8, xmm4
1608        psrld   xmm8, 7
1609        pslld   xmm4, 25
1610        por     xmm4, xmm8
1611        pxor    xmm4, xmm12
1612        pxor    xmm5, xmm13
1613        pxor    xmm6, xmm14
1614        pxor    xmm7, xmm15
1615        mov     eax, r13d
1616        jne     9b
1617        movdqa  xmm9, xmm0
1618        punpckldq xmm0, xmm1
1619        punpckhdq xmm9, xmm1
1620        movdqa  xmm11, xmm2
1621        punpckldq xmm2, xmm3
1622        punpckhdq xmm11, xmm3
1623        movdqa  xmm1, xmm0
1624        punpcklqdq xmm0, xmm2
1625        punpckhqdq xmm1, xmm2
1626        movdqa  xmm3, xmm9
1627        punpcklqdq xmm9, xmm11
1628        punpckhqdq xmm3, xmm11
1629        movdqu  xmmword ptr [rbx], xmm0
1630        movdqu  xmmword ptr [rbx+0x20], xmm1
1631        movdqu  xmmword ptr [rbx+0x40], xmm9
1632        movdqu  xmmword ptr [rbx+0x60], xmm3
1633        movdqa  xmm9, xmm4
1634        punpckldq xmm4, xmm5
1635        punpckhdq xmm9, xmm5
1636        movdqa  xmm11, xmm6
1637        punpckldq xmm6, xmm7
1638        punpckhdq xmm11, xmm7
1639        movdqa  xmm5, xmm4
1640        punpcklqdq xmm4, xmm6
1641        punpckhqdq xmm5, xmm6
1642        movdqa  xmm7, xmm9
1643        punpcklqdq xmm9, xmm11
1644        punpckhqdq xmm7, xmm11
1645        movdqu  xmmword ptr [rbx+0x10], xmm4
1646        movdqu  xmmword ptr [rbx+0x30], xmm5
1647        movdqu  xmmword ptr [rbx+0x50], xmm9
1648        movdqu  xmmword ptr [rbx+0x70], xmm7
1649        movdqa  xmm1, xmmword ptr [rsp+0x110]
1650        movdqa  xmm0, xmm1
1651        paddd   xmm1, xmmword ptr [rsp+0x150]
1652        movdqa  xmmword ptr [rsp+0x110], xmm1
1653        pxor    xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1654        pxor    xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1655        pcmpgtd xmm0, xmm1
1656        movdqa  xmm1, xmmword ptr [rsp+0x120]
1657        psubd   xmm1, xmm0
1658        movdqa  xmmword ptr [rsp+0x120], xmm1
1659        add     rbx, 128
1660        add     rdi, 32
1661        sub     rsi, 4
1662        cmp     rsi, 4
1663        jnc     2b
1664        test    rsi, rsi
1665        jnz     3f
/*
 * Common exit path (local label 4).
 *
 * Reached by fall-through when rsi is zero after the loop above
 * (test rsi, rsi / jnz 3f) and by "je 4b" from the single-input check
 * further down.  The frame is dropped by reloading rsp from rbp, which
 * was set to the post-push stack pointer at function entry, so no
 * matching "add rsp" is needed.  The callee-saved registers are then
 * popped in the exact reverse of the entry pushes
 * (r15, r14, r13, r12, rbx, rbp).
 *
 * NOTE(review): upper-case RET is not defined in this view; presumably
 * a macro supplied by <sys/asm_linkage.h> -- confirm.
 */
16664:
1667        mov     rsp, rbp
1668        pop     rbp
1669        pop     rbx
1670        pop     r12
1671        pop     r13
1672        pop     r14
1673        pop     r15
1674        RET
/*
 * Two-input path (first local label 3).
 *
 * Entered from the loop above when fewer than four inputs remain and
 * rsi is non-zero.  If bit 1 of esi is clear the whole section is
 * skipped (je 3f lands on the single-input check).  Otherwise two
 * inputs are processed side by side, one 64-byte block per pass of
 * loop 2:
 *
 *   input A state rows: xmm0, xmm1, xmm2,  xmm3
 *   input B state rows: xmm8, xmm9, xmm10, xmm11
 *   input A message   : xmm4..xmm7   (spilled to rsp+0x20 / rsp+0x40)
 *   input B message   : xmm12..xmm15 (spilled to rsp+0x30 / rsp+0x50)
 *
 * Registers set up before this view and only read here: rcx, rdi, rbx,
 * r12d, r13d, r15, plus the stack slots rsp+0x110, rsp+0x120 and
 * rsp+0x130.
 * NOTE(review): presumably rcx = key/chaining value, rdi = array of
 * input pointers, rbx = output cursor, r15 = input length in bytes,
 * r13d = base flags, r12d = flags for the final block -- confirm
 * against the function prologue.
 */
1675.p2align 5
16763:
1677        test    esi, 0x2
1678        je      3f
/*
 * Both inputs start from the same 32 bytes read at rcx.
 */
1679        movups  xmm0, xmmword ptr [rcx]
1680        movups  xmm1, xmmword ptr [rcx+0x10]
1681        movaps  xmm8, xmm0
1682        movaps  xmm9, xmm1
/*
 * Pair dword 0 of rsp+0x110 with dword 0 of rsp+0x120 (input A) and
 * dword 1 with dword 1 (input B); park them at rsp and rsp+0x10.
 * The loop above adds to rsp+0x110 and propagates the carry into
 * rsp+0x120, so these appear to be the low/high halves of per-lane
 * 64-bit counters.
 */
1683        movd    xmm13, dword ptr [rsp+0x110]
1684        movd    xmm14, dword ptr [rsp+0x120]
1685        punpckldq xmm13, xmm14
1686        movaps  xmmword ptr [rsp], xmm13
1687        movd    xmm14, dword ptr [rsp+0x114]
1688        movd    xmm13, dword ptr [rsp+0x124]
1689        punpckldq xmm14, xmm13
1690        movaps  xmmword ptr [rsp+0x10], xmm14
/*
 * r8 / r9 = the two pointers fetched from the array at rdi; they are
 * the base addresses of the message loads below.
 */
1691        mov     r8, qword ptr [rdi]
1692        mov     r9, qword ptr [rdi+0x8]
/*
 * eax = byte at rbp+0x40 OR r13d.  With six pushes plus the return
 * address below rbp, rbp+0x38 is the first stack-passed argument, so
 * this is the second one.
 * NOTE(review): presumably flags_start, applied to the first block
 * only (eax is reset to r13d at the end of each pass) -- confirm.
 */
1693        movzx   eax, byte ptr [rbp+0x40]
1694        or      eax, r13d
/* rdx = byte offset into both inputs, advanced by 64 per block. */
1695        xor     edx, edx
16962:
/*
 * Per-block loop.  r12d is OR-ed into eax only when the advanced
 * offset equals r15; otherwise cmovne restores the saved value.
 */
1697        mov     r14d, eax
1698        or      eax, r12d
1699        add     rdx, 64
1700        cmp     rdx, r15
1701        cmovne  eax, r14d
/* Row 2 of both states is loaded from BLAKE3_IV. */
1702        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1703        movaps  xmm10, xmm2
/*
 * Load the 64-byte block of input A and de-interleave it:
 * shufps 136 (0x88) keeps the even dwords, 221 (0xDD) the odd dwords.
 * The two vectors built from the second 32 bytes are additionally
 * lane-rotated with pshufd 0x93.
 */
1704        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1705        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1706        movaps  xmm3, xmm4
1707        shufps  xmm4, xmm5, 136
1708        shufps  xmm3, xmm5, 221
1709        movaps  xmm5, xmm3
1710        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1711        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1712        movaps  xmm3, xmm6
1713        shufps  xmm6, xmm7, 136
1714        pshufd  xmm6, xmm6, 0x93
1715        shufps  xmm3, xmm7, 221
1716        pshufd  xmm7, xmm3, 0x93
/* Same load and de-interleave for input B, into xmm12..xmm15. */
1717        movups  xmm12, xmmword ptr [r9+rdx-0x40]
1718        movups  xmm13, xmmword ptr [r9+rdx-0x30]
1719        movaps  xmm11, xmm12
1720        shufps  xmm12, xmm13, 136
1721        shufps  xmm11, xmm13, 221
1722        movaps  xmm13, xmm11
1723        movups  xmm14, xmmword ptr [r9+rdx-0x20]
1724        movups  xmm15, xmmword ptr [r9+rdx-0x10]
1725        movaps  xmm11, xmm14
1726        shufps  xmm14, xmm15, 136
1727        pshufd  xmm14, xmm14, 0x93
1728        shufps  xmm11, xmm15, 221
1729        pshufd  xmm15, xmm11, 0x93
/*
 * Build row 3 of each state.  rax becomes (eax << 32) | 64, i.e. the
 * constant 64 in the low dword and the flag bits in the high dword.
 * It goes through rsp+0x20 so punpcklqdq can append it to the two
 * dword pairs saved at rsp / rsp+0x10.
 */
1730        shl     rax, 0x20
1731        or      rax, 0x40
1732        movq    xmm3, rax
1733        movdqa  xmmword ptr [rsp+0x20], xmm3
1734        movaps  xmm3, xmmword ptr [rsp]
1735        movaps  xmm11, xmmword ptr [rsp+0x10]
1736        punpcklqdq xmm3, xmmword ptr [rsp+0x20]
1737        punpcklqdq xmm11, xmmword ptr [rsp+0x20]
/*
 * al = round counter: loop 9 below runs seven times.  Only the low
 * byte of rax is written; eax is reloaded from r13d after the rounds.
 */
1738        mov     al, 7
17399:
/*
 * One round, applied to both inputs in lock-step.
 * First half: add message vector 0 and row 1 into row 0.  xmm4 and
 * xmm12 are saved to the stack first because xmm4 (and later xmm13)
 * double as scratch registers for the rotates.
 */
1740        paddd   xmm0, xmm4
1741        paddd   xmm8, xmm12
1742        movaps  xmmword ptr [rsp+0x20], xmm4
1743        movaps  xmmword ptr [rsp+0x30], xmm12
1744        paddd   xmm0, xmm1
1745        paddd   xmm8, xmm9
1746        pxor    xmm3, xmm0
1747        pxor    xmm11, xmm8
/* pshuflw/pshufhw 0xB1 swap the 16-bit halves of every dword: rotate by 16. */
1748        pshuflw xmm3, xmm3, 0xB1
1749        pshufhw xmm3, xmm3, 0xB1
1750        pshuflw xmm11, xmm11, 0xB1
1751        pshufhw xmm11, xmm11, 0xB1
1752        paddd   xmm2, xmm3
1753        paddd   xmm10, xmm11
1754        pxor    xmm1, xmm2
1755        pxor    xmm9, xmm10
/* Rotate row 1 right by 12 (shift left 20 OR shift right 12). */
1756        movdqa  xmm4, xmm1
1757        pslld   xmm1, 20
1758        psrld   xmm4, 12
1759        por     xmm1, xmm4
1760        movdqa  xmm4, xmm9
1761        pslld   xmm9, 20
1762        psrld   xmm4, 12
1763        por     xmm9, xmm4
/* Add message vector 1; xmm5 / xmm13 are saved before xmm13 is reused. */
1764        paddd   xmm0, xmm5
1765        paddd   xmm8, xmm13
1766        movaps  xmmword ptr [rsp+0x40], xmm5
1767        movaps  xmmword ptr [rsp+0x50], xmm13
1768        paddd   xmm0, xmm1
1769        paddd   xmm8, xmm9
1770        pxor    xmm3, xmm0
1771        pxor    xmm11, xmm8
/* Rotate row 3 right by 8 (shift right 8 combined with shift left 24). */
1772        movdqa  xmm13, xmm3
1773        psrld   xmm3, 8
1774        pslld   xmm13, 24
1775        pxor    xmm3, xmm13
1776        movdqa  xmm13, xmm11
1777        psrld   xmm11, 8
1778        pslld   xmm13, 24
1779        pxor    xmm11, xmm13
1780        paddd   xmm2, xmm3
1781        paddd   xmm10, xmm11
1782        pxor    xmm1, xmm2
1783        pxor    xmm9, xmm10
/* Rotate row 1 right by 7 (shift left 25 OR shift right 7). */
1784        movdqa  xmm4, xmm1
1785        pslld   xmm1, 25
1786        psrld   xmm4, 7
1787        por     xmm1, xmm4
1788        movdqa  xmm4, xmm9
1789        pslld   xmm9, 25
1790        psrld   xmm4, 7
1791        por     xmm9, xmm4
/*
 * Lane-rotate rows 0, 3 and 2 (pshufd 0x93, 0x4E, 0x39) so that the
 * second half of the round mixes a different grouping of words.
 */
1792        pshufd  xmm0, xmm0, 0x93
1793        pshufd  xmm8, xmm8, 0x93
1794        pshufd  xmm3, xmm3, 0x4E
1795        pshufd  xmm11, xmm11, 0x4E
1796        pshufd  xmm2, xmm2, 0x39
1797        pshufd  xmm10, xmm10, 0x39
/*
 * Second half: same add / xor / rotate sequence (16, 12, 8, 7) using
 * message vectors 2 (xmm6 / xmm14) and 3 (xmm7 / xmm15).
 */
1798        paddd   xmm0, xmm6
1799        paddd   xmm8, xmm14
1800        paddd   xmm0, xmm1
1801        paddd   xmm8, xmm9
1802        pxor    xmm3, xmm0
1803        pxor    xmm11, xmm8
1804        pshuflw xmm3, xmm3, 0xB1
1805        pshufhw xmm3, xmm3, 0xB1
1806        pshuflw xmm11, xmm11, 0xB1
1807        pshufhw xmm11, xmm11, 0xB1
1808        paddd   xmm2, xmm3
1809        paddd   xmm10, xmm11
1810        pxor    xmm1, xmm2
1811        pxor    xmm9, xmm10
1812        movdqa  xmm4, xmm1
1813        pslld   xmm1, 20
1814        psrld   xmm4, 12
1815        por     xmm1, xmm4
1816        movdqa  xmm4, xmm9
1817        pslld   xmm9, 20
1818        psrld   xmm4, 12
1819        por     xmm9, xmm4
1820        paddd   xmm0, xmm7
1821        paddd   xmm8, xmm15
1822        paddd   xmm0, xmm1
1823        paddd   xmm8, xmm9
1824        pxor    xmm3, xmm0
1825        pxor    xmm11, xmm8
1826        movdqa  xmm13, xmm3
1827        psrld   xmm3, 8
1828        pslld   xmm13, 24
1829        pxor    xmm3, xmm13
1830        movdqa  xmm13, xmm11
1831        psrld   xmm11, 8
1832        pslld   xmm13, 24
1833        pxor    xmm11, xmm13
1834        paddd   xmm2, xmm3
1835        paddd   xmm10, xmm11
1836        pxor    xmm1, xmm2
1837        pxor    xmm9, xmm10
1838        movdqa  xmm4, xmm1
1839        pslld   xmm1, 25
1840        psrld   xmm4, 7
1841        por     xmm1, xmm4
1842        movdqa  xmm4, xmm9
1843        pslld   xmm9, 25
1844        psrld   xmm4, 7
1845        por     xmm9, xmm4
/* Undo the lane rotation (0x39 / 0x4E / 0x93 are the inverses). */
1846        pshufd  xmm0, xmm0, 0x39
1847        pshufd  xmm8, xmm8, 0x39
1848        pshufd  xmm3, xmm3, 0x4E
1849        pshufd  xmm11, xmm11, 0x4E
1850        pshufd  xmm2, xmm2, 0x93
1851        pshufd  xmm10, xmm10, 0x93
/* Leave the round loop after the seventh pass. */
1852        dec     al
1853        je      9f
/*
 * More rounds to go: rebuild the four message vectors of input A for
 * the next round from the copies spilled at rsp+0x20 / rsp+0x40 and
 * from xmm6 / xmm7.  Word blends are done with pand/por against the
 * PBLENDW_*_MASK constants.
 * NOTE(review): the constant names suggest these emulate pblendw,
 * which plain SSE2 lacks -- confirm against the constant definitions.
 * Results: xmm4 directly, the new xmm5 / xmm6 via rsp+0x20 / rsp+0x40
 * (reloaded just before the jump), and xmm7.
 */
1854        movdqa  xmm12, xmmword ptr [rsp+0x20]
1855        movdqa  xmm5, xmmword ptr [rsp+0x40]
1856        pshufd  xmm13, xmm12, 0x0F
1857        shufps  xmm12, xmm5, 214
1858        pshufd  xmm4, xmm12, 0x39
1859        movdqa  xmm12, xmm6
1860        shufps  xmm12, xmm7, 250
1861        pand    xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
1862        pand    xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1863        por     xmm13, xmm12
1864        movdqa  xmmword ptr [rsp+0x20], xmm13
1865        movdqa  xmm12, xmm7
1866        punpcklqdq xmm12, xmm5
1867        movdqa  xmm13, xmm6
1868        pand    xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1869        pand    xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1870        por     xmm12, xmm13
1871        pshufd  xmm12, xmm12, 0x78
1872        punpckhdq xmm5, xmm7
1873        punpckldq xmm6, xmm5
1874        pshufd  xmm7, xmm6, 0x1E
1875        movdqa  xmmword ptr [rsp+0x40], xmm12
/*
 * Same rebuild for input B from rsp+0x30 / rsp+0x50 and xmm14 / xmm15.
 * xmm5 and xmm6 serve as temporaries here, which is why input A's new
 * xmm5 / xmm6 were parked on the stack above.
 */
1876        movdqa  xmm5, xmmword ptr [rsp+0x30]
1877        movdqa  xmm13, xmmword ptr [rsp+0x50]
1878        pshufd  xmm6, xmm5, 0x0F
1879        shufps  xmm5, xmm13, 214
1880        pshufd  xmm12, xmm5, 0x39
1881        movdqa  xmm5, xmm14
1882        shufps  xmm5, xmm15, 250
1883        pand    xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
1884        pand    xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1885        por     xmm6, xmm5
1886        movdqa  xmm5, xmm15
1887        punpcklqdq xmm5, xmm13
/*
 * No free register is left: state row xmm2 is saved to rsp+0x30 (its
 * previous contents were already consumed), used as scratch for the
 * blend, and restored right afterwards.
 */
1888        movdqa  xmmword ptr [rsp+0x30], xmm2
1889        movdqa  xmm2, xmm14
1890        pand    xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1891        pand    xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1892        por     xmm5, xmm2
1893        movdqa  xmm2, xmmword ptr [rsp+0x30]
1894        pshufd  xmm5, xmm5, 0x78
1895        punpckhdq xmm13, xmm15
1896        punpckldq xmm14, xmm13
1897        pshufd  xmm15, xmm14, 0x1E
/*
 * Move the temporaries into their final registers (input B: xmm13,
 * xmm14; input A: xmm5, xmm6) and start the next round.
 */
1898        movdqa  xmm13, xmm6
1899        movdqa  xmm14, xmm5
1900        movdqa  xmm5, xmmword ptr [rsp+0x20]
1901        movdqa  xmm6, xmmword ptr [rsp+0x40]
1902        jmp     9b
19039:
/*
 * Rounds done: fold the lower state rows into the upper ones
 * (row0 ^= row2, row1 ^= row3) for both inputs.  The result stays in
 * xmm0/xmm1 and xmm8/xmm9 as the starting rows for the next block.
 */
1904        pxor    xmm0, xmm2
1905        pxor    xmm1, xmm3
1906        pxor    xmm8, xmm10
1907        pxor    xmm9, xmm11
/*
 * Reset eax to r13d for the next block, then loop until the offset
 * reaches r15.  The compare is repeated here because "dec al" above
 * overwrote the flags from the top of the loop.
 */
1908        mov     eax, r13d
1909        cmp     rdx, r15
1910        jne     2b
/* Store 32 bytes per input: A at rbx, B at rbx+0x20. */
1911        movups  xmmword ptr [rbx], xmm0
1912        movups  xmmword ptr [rbx+0x10], xmm1
1913        movups  xmmword ptr [rbx+0x20], xmm8
1914        movups  xmmword ptr [rbx+0x30], xmm9
/*
 * Shift the counter lanes down by two for the remaining input.
 * eax = -[rsp+0x130]; writing eax zero-extends into rax, so when the
 * slot holds 0 the index is 0 and lane 0 is rewritten with itself,
 * and when it holds -1 the index is 1 and 8*rax selects dword 2.
 * NOTE(review): rsp+0x130 is filled before this view; presumably a
 * 0 / -1 mask derived from increment_counter -- confirm.
 */
1915        mov     eax, dword ptr [rsp+0x130]
1916        neg     eax
1917        mov    r10d, dword ptr [rsp+0x110+8*rax]
1918        mov    r11d, dword ptr [rsp+0x120+8*rax]
1919        mov dword ptr [rsp+0x110], r10d
1920        mov dword ptr [rsp+0x120], r11d
/*
 * Consume two input pointers (16 bytes) and 64 bytes of output, and
 * decrement the remaining-input count; then fall through to the
 * single-input check.
 */
1921        add     rdi, 16
1922        add     rbx, 64
1923        sub     rsi, 2
19243:
1925        test    esi, 0x1
1926        je      4b
1927        movups  xmm0, xmmword ptr [rcx]
1928        movups  xmm1, xmmword ptr [rcx+0x10]
1929        movd    xmm13, dword ptr [rsp+0x110]
1930        movd    xmm14, dword ptr [rsp+0x120]
1931        punpckldq xmm13, xmm14
1932        mov     r8, qword ptr [rdi]
1933        movzx   eax, byte ptr [rbp+0x40]
1934        or      eax, r13d
1935        xor     edx, edx
19362:
1937        mov     r14d, eax
1938        or      eax, r12d
1939        add     rdx, 64
1940        cmp     rdx, r15
1941        cmovne  eax, r14d
1942        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
1943        shl     rax, 32
1944        or      rax, 64
1945        movq    xmm12, rax
1946        movdqa  xmm3, xmm13
1947        punpcklqdq xmm3, xmm12
1948        movups  xmm4, xmmword ptr [r8+rdx-0x40]
1949        movups  xmm5, xmmword ptr [r8+rdx-0x30]
1950        movaps  xmm8, xmm4
1951        shufps  xmm4, xmm5, 136
1952        shufps  xmm8, xmm5, 221
1953        movaps  xmm5, xmm8
1954        movups  xmm6, xmmword ptr [r8+rdx-0x20]
1955        movups  xmm7, xmmword ptr [r8+rdx-0x10]
1956        movaps  xmm8, xmm6
1957        shufps  xmm6, xmm7, 136
1958        pshufd  xmm6, xmm6, 0x93
1959        shufps  xmm8, xmm7, 221
1960        pshufd  xmm7, xmm8, 0x93
1961        mov     al, 7
19629:
1963        paddd   xmm0, xmm4
1964        paddd   xmm0, xmm1
1965        pxor    xmm3, xmm0
1966        pshuflw xmm3, xmm3, 0xB1
1967        pshufhw xmm3, xmm3, 0xB1
1968        paddd   xmm2, xmm3
1969        pxor    xmm1, xmm2
1970        movdqa  xmm11, xmm1
1971        pslld   xmm1, 20
1972        psrld   xmm11, 12
1973        por     xmm1, xmm11
1974        paddd   xmm0, xmm5
1975        paddd   xmm0, xmm1
1976        pxor    xmm3, xmm0
1977        movdqa  xmm14, xmm3
1978        psrld   xmm3, 8
1979        pslld   xmm14, 24
1980        pxor    xmm3, xmm14
1981        paddd   xmm2, xmm3
1982        pxor    xmm1, xmm2
1983        movdqa  xmm11, xmm1
1984        pslld   xmm1, 25
1985        psrld   xmm11, 7
1986        por     xmm1, xmm11
1987        pshufd  xmm0, xmm0, 0x93
1988        pshufd  xmm3, xmm3, 0x4E
1989        pshufd  xmm2, xmm2, 0x39
1990        paddd   xmm0, xmm6
1991        paddd   xmm0, xmm1
1992        pxor    xmm3, xmm0
1993        pshuflw xmm3, xmm3, 0xB1
1994        pshufhw xmm3, xmm3, 0xB1
1995        paddd   xmm2, xmm3
1996        pxor    xmm1, xmm2
1997        movdqa  xmm11, xmm1
1998        pslld   xmm1, 20
1999        psrld   xmm11, 12
2000        por     xmm1, xmm11
2001        paddd   xmm0, xmm7
2002        paddd   xmm0, xmm1
2003        pxor    xmm3, xmm0
2004        movdqa  xmm14, xmm3
2005        psrld   xmm3, 8
2006        pslld   xmm14, 24
2007        pxor    xmm3, xmm14
2008        paddd   xmm2, xmm3
2009        pxor    xmm1, xmm2
2010        movdqa  xmm11, xmm1
2011        pslld   xmm1, 25
2012        psrld   xmm11, 7
2013        por     xmm1, xmm11
2014        pshufd  xmm0, xmm0, 0x39
2015        pshufd  xmm3, xmm3, 0x4E
2016        pshufd  xmm2, xmm2, 0x93
2017        dec     al
2018        jz      9f
2019        movdqa  xmm8, xmm4
2020        shufps  xmm8, xmm5, 214
2021        pshufd  xmm9, xmm4, 0x0F
2022        pshufd  xmm4, xmm8, 0x39
2023        movdqa  xmm8, xmm6
2024        shufps  xmm8, xmm7, 250
2025        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2026        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2027        por     xmm9, xmm8
2028        movdqa  xmm8, xmm7
2029        punpcklqdq xmm8, xmm5
2030        movdqa  xmm10, xmm6
2031        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2032        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2033        por     xmm8, xmm10
2034        pshufd  xmm8, xmm8, 0x78
2035        punpckhdq xmm5, xmm7
2036        punpckldq xmm6, xmm5
2037        pshufd  xmm7, xmm6, 0x1E
2038        movdqa  xmm5, xmm9
2039        movdqa  xmm6, xmm8
2040        jmp     9b
20419:
2042        pxor    xmm0, xmm2
2043        pxor    xmm1, xmm3
2044        mov     eax, r13d
2045        cmp     rdx, r15
2046        jne     2b
2047        movups  xmmword ptr [rbx], xmm0
2048        movups  xmmword ptr [rbx+0x10], xmm1
2049        jmp     4b
2050
/*
 * zfs_blake3_compress_in_place_sse2
 *
 * Runs the seven-round BLAKE3 compression on one 64-byte message block
 * and overwrites the 32-byte chaining value with the result.
 *
 * Register arguments, as consumed below (System V AMD64 order):
 *   rdi  chaining value, 32 bytes; loaded at entry, stored at exit
 *   rsi  message block, 64 bytes; read with unaligned loads
 *   dl   block length (8-bit)
 *   rcx  64-bit counter
 *   r8b  flags (8-bit)
 *
 * State layout while the rounds run:
 *   xmm0 = cv[0..3]   xmm1 = cv[4..7]   xmm2 = BLAKE3_IV
 *   xmm3 = { counter low, counter high, block length, flags }
 *   xmm4..xmm7 = message words in round order
 *
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14 and the flags register.
 *
 * The block length and flags are zero-extended from their low bytes
 * before they are merged, exactly as zfs_blake3_compress_xof_sse2 does.
 * The psABI leaves the bits above an 8-bit argument unspecified, so
 * shifting/adding the full 64-bit registers could leak stale bits into
 * the state words.
 */
.p2align 6
zfs_blake3_compress_in_place_sse2:
        _CET_ENDBR
        movups  xmm0, xmmword ptr [rdi]                 /* cv[0..3] */
        movups  xmm1, xmmword ptr [rdi+0x10]            /* cv[4..7] */
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        movzx   eax, r8b                                /* rax = flags, upper bits cleared */
        movzx   edx, dl                                 /* rdx = block length, upper bits cleared */
        shl     rax, 32
        add     rdx, rax                                /* rdx = block length | flags << 32 */
        movq    xmm3, rcx                               /* low qword = counter */
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4                           /* xmm3 = counter : length : flags */

        /* Split the block into even-indexed and odd-indexed 32-bit words. */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136                         /* even words of bytes 0..31 */
        shufps  xmm8, xmm5, 221                         /* odd words of bytes 0..31 */
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136                         /* even words of bytes 32..63 */
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221                         /* odd words of bytes 32..63 */
        pshufd  xmm7, xmm8, 0x93
        mov     al, 7                                   /* rounds remaining */
1:
        /* Column step, first half: rotate right by 16, then by 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                        /* swap 16-bit halves of each dword */
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Column step, second half: rotate right by 8, then by 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Rotate rows so the diagonals line up as columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Diagonal step, first half. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Diagonal step, second half. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the row rotation. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      2f                                      /* last round done: skip the permutation */

        /*
         * Permute the message words for the next round.  SSE2 has no
         * pblendw, so each blend is a pand/pand/por against the
         * PBLENDW_*_MASK constants.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     1b
2:
        /* Fold the lower half of the state into the upper half and store. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        RET
2161
/*
 * zfs_blake3_compress_xof_sse2
 *
 * Same seven-round compression as the in-place variant, but the input
 * chaining value is left untouched and the full 64-byte state is
 * written to a separate output buffer.
 *
 * Register arguments, as consumed below (System V AMD64 order):
 *   rdi  chaining value, 32 bytes; read at entry and again at exit
 *   rsi  message block, 64 bytes; read with unaligned loads
 *   dl   block length (zero-extended here)
 *   rcx  64-bit counter
 *   r8b  flags (zero-extended here)
 *   r9   output buffer, 64 bytes; written with unaligned stores
 *
 * Output layout (rows are the four 16-byte state vectors):
 *   out[ 0..15] = row0 ^ row2        out[32..47] = row2 ^ cv[0..3]
 *   out[16..31] = row1 ^ row3        out[48..63] = row3 ^ cv[4..7]
 *
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14 and the flags register.
 */
.p2align 6
zfs_blake3_compress_xof_sse2:
        _CET_ENDBR
        movups  xmm0, xmmword ptr [rdi]                 /* row0 = cv[0..3] */
        movups  xmm1, xmmword ptr [rdi+16]              /* row1 = cv[4..7] */
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]       /* row2 = IV */
        movzx   eax, r8b                                /* flags */
        movzx   edx, dl                                 /* block length */
        shl     rax, 32
        add     rdx, rax                                /* length in low dword, flags in high */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4                           /* row3 = counter : length : flags */

        /* Deinterleave the message into even/odd word vectors. */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+16]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 0x88
        shufps  xmm8, xmm5, 0xDD
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+32]
        movups  xmm7, xmmword ptr [rsi+48]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 0x88
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 0xDD
        pshufd  xmm7, xmm8, 0x93
        mov     al, 7                                   /* round counter */
1:
        /* Columns: add message words in xmm4, rotations 16 and 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Columns: add message words in xmm5, rotations 8 and 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Shift rows 0, 3 and 2 to bring diagonals into columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* Diagonals: add message words in xmm6, rotations 16 and 12. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        /* Diagonals: add message words in xmm7, rotations 8 and 7. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Shift the rows back. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        je      2f                                      /* all rounds done */

        /* Rearrange xmm4..xmm7 into the word order of the next round. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 0xD6
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 0xFA
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8                              /* mask-based blend */
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10                             /* mask-based blend */
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     1b
2:
        /* Reload the input chaining value for the feed-forward of rows 2/3. */
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+16]
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5
        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+16], xmm1
        movups  xmmword ptr [r9+32], xmm2
        movups  xmmword ptr [r9+48], xmm3
        RET
2280
/*
 * ELF symbol sizes.  The three functions are laid out back to back, so
 * each size is measured up to the start of the next function (this
 * includes any alignment padding in between) rather than all the way to
 * the current location, which would make the recorded sizes overlap.
 */
.size zfs_blake3_hash_many_sse2, zfs_blake3_compress_in_place_sse2 - zfs_blake3_hash_many_sse2
.size zfs_blake3_compress_in_place_sse2, zfs_blake3_compress_xof_sse2 - zfs_blake3_compress_in_place_sse2
.size zfs_blake3_compress_xof_sse2, . - zfs_blake3_compress_xof_sse2
2284
/*
 * Read-only constants.
 *
 * The section is aligned to 64 bytes and every table below is exactly
 * 16 bytes, so each label stays 16-byte aligned.  That alignment is
 * needed because the code uses these tables as movaps sources and as
 * direct memory operands of pand.
 */
#if defined(__APPLE__)
.static_data
#else
.section .rodata
#endif
.p2align  6

/* Initialization vector words 0..3, loaded as one vector. */
BLAKE3_IV:
        .long  0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A

/* Per-lane increments; used outside the compression functions shown here. */
ADD0:
        .long  0, 1, 2, 3
ADD1:
        .long  4, 4, 4, 4

/* Each initialization vector word replicated across all four lanes. */
BLAKE3_IV_0:
        .long  0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long  0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long  0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long  0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A

/* Block length (64) replicated across all four lanes. */
BLAKE3_BLOCK_LEN:
        .long  0x40, 0x40, 0x40, 0x40

/* Sign bit of every 32-bit lane. */
CMP_MSB_MASK:
        .long  0x80000000, 0x80000000, 0x80000000, 0x80000000

/*
 * Dword select masks used with pand/por in place of pblendw.  The hex
 * value in each name is the pblendw immediate the mask corresponds to
 * (one immediate bit per 16-bit word, so two bits per dword).
 */
PBLENDW_0x33_MASK:                      /* keep dwords 0 and 2 */
        .long  0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:                      /* keep dwords 1 and 3 */
        .long  0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
PBLENDW_0x3F_MASK:                      /* keep dwords 0, 1 and 2 */
        .long  0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:                      /* keep dword 3 */
        .long  0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
2318
2319#endif	/* HAVE_SSE2 */
2320
/* Emit an empty .note.GNU-stack section so ELF linkers do not request an executable stack. */
#if defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
2324