Lines Matching +full:0 +full:- +full:4

4  * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
11 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
29 * chacha_permute - permute one block
31 * Permute one 64-byte block where the state matrix is stored in the four NEON
32 * registers v0-v3. It performs matrix operations on four words in parallel, but requires shuffling to rearrange the words after each round.
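
For reference, the permutation that chacha_permute vectorizes is the standard ChaCha double round, with each of v0-v3 holding one row of the 4x4 word matrix so that four quarter-rounds run in parallel. A minimal scalar C sketch, with illustrative names (nrounds would be 20 or 12 in the callers):

#include <stdint.h>

#define ROL32(v, n)  (((v) << (n)) | ((v) >> (32 - (n))))

#define QR(a, b, c, d) do {                             \
        a += b;  d ^= a;  d = ROL32(d, 16);             \
        c += d;  b ^= c;  b = ROL32(b, 12);             \
        a += b;  d ^= a;  d = ROL32(d, 8);              \
        c += d;  b ^= c;  b = ROL32(b, 7);              \
} while (0)

static void chacha_permute_ref(uint32_t x[16], int nrounds)
{
        for (int i = 0; i < nrounds; i += 2) {
                /* column round: the four QRs the NEON code does at once */
                QR(x[0], x[4], x[8],  x[12]);
                QR(x[1], x[5], x[9],  x[13]);
                QR(x[2], x[6], x[10], x[14]);
                QR(x[3], x[7], x[11], x[15]);
                /* diagonal round: what the shuffles below set up */
                QR(x[0], x[5], x[10], x[15]);
                QR(x[1], x[6], x[11], x[12]);
                QR(x[2], x[7], x[8],  x[13]);
                QR(x[3], x[4], x[9],  x[14]);
        }
}
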
42 ld1 {v12.4s}, [x10]
46 add v0.4s, v0.4s, v1.4s
51 add v2.4s, v2.4s, v3.4s
53 shl v1.4s, v4.4s, #12
54 sri v1.4s, v4.4s, #20
57 add v0.4s, v0.4s, v1.4s
62 add v2.4s, v2.4s, v3.4s
64 shl v1.4s, v4.4s, #7
65 sri v1.4s, v4.4s, #25
67 // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
68 ext v1.16b, v1.16b, v1.16b, #4
69 // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
71 // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
75 add v0.4s, v0.4s, v1.4s
80 add v2.4s, v2.4s, v3.4s
82 shl v1.4s, v4.4s, #12
83 sri v1.4s, v4.4s, #20
86 add v0.4s, v0.4s, v1.4s
91 add v2.4s, v2.4s, v3.4s
93 shl v1.4s, v4.4s, #7
94 sri v1.4s, v4.4s, #25
96 // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
98 // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
100 // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
101 ext v3.16b, v3.16b, v3.16b, #4
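
Each MASK(...) comment above corresponds to a single ext of the row against itself: #4, #8 and #12 bytes rotate the row by one, two or three 32-bit lanes, bringing the diagonals into column position for the second quarter-round and rotating them back afterwards, while the nearby shl/sri pairs are the 12- and 7-bit rotations written as shift-left plus shift-right-and-insert. In NEON intrinsics the three shuffles look like this (function names are illustrative):

#include <arm_neon.h>

static inline uint32x4_t shuffle_0321(uint32x4_t v)     /* MASK(0, 3, 2, 1) */
{
        return vextq_u32(v, v, 1);    /* ext ..., #4  -> {v[1], v[2], v[3], v[0]} */
}

static inline uint32x4_t shuffle_1032(uint32x4_t v)     /* MASK(1, 0, 3, 2) */
{
        return vextq_u32(v, v, 2);    /* ext ..., #8  -> {v[2], v[3], v[0], v[1]} */
}

static inline uint32x4_t shuffle_2103(uint32x4_t v)     /* MASK(2, 1, 0, 3) */
{
        return vextq_u32(v, v, 3);    /* ext ..., #12 -> {v[3], v[0], v[1], v[2]} */
}
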
115 stp x29, x30, [sp, #-16]!
119 ld1 {v0.4s-v3.4s}, [x0]
120 ld1 {v8.4s-v11.4s}, [x0]
124 ld1 {v4.16b-v7.16b}, [x2]
127 add v0.4s, v0.4s, v8.4s
131 add v1.4s, v1.4s, v9.4s
135 add v2.4s, v2.4s, v10.4s
139 add v3.4s, v3.4s, v11.4s
142 st1 {v0.16b-v3.16b}, [x1]
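
These lines are the single-block path: the state is loaded twice (working copy in v0-v3, saved copy in v8-v11), permuted, added back to the saved copy, and the 64 bytes of input loaded into v4-v7 are XORed with the result before the store (the eor instructions themselves are not among the matched lines). The same flow in scalar C, reusing the chacha_permute_ref sketch above (illustrative):

#include <stdint.h>
#include <string.h>

void chacha_permute_ref(uint32_t x[16], int nrounds);   /* sketched earlier */

static void chacha_block_xor_ref(const uint32_t state[16], uint8_t *dst,
                                 const uint8_t *src, int nrounds)
{
        uint32_t x[16];

        memcpy(x, state, sizeof(x));
        chacha_permute_ref(x, nrounds);
        for (int i = 0; i < 16; i++) {
                uint32_t ks = x[i] + state[i];          /* feed-forward add */
                for (int b = 0; b < 4; b++)             /* little-endian keystream bytes */
                        dst[4 * i + b] = src[4 * i + b] ^ (uint8_t)(ks >> (8 * b));
        }
}
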
150 // x1: output (8 32-bit words)
153 stp x29, x30, [sp, #-16]!
156 ld1 {v0.4s-v3.4s}, [x0]
161 st1 {v0.4s}, [x1], #16
162 st1 {v3.4s}, [x1]
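
The hchacha path stores only v0 and v3, i.e. words 0-3 and 12-15 of the permuted state, and performs no feed-forward add, matching the HChaCha construction used for XChaCha key/nonce derivation. Scalar C sketch (illustrative):

#include <stdint.h>
#include <string.h>

void chacha_permute_ref(uint32_t x[16], int nrounds);   /* sketched earlier */

static void hchacha_block_ref(const uint32_t state[16], uint32_t out[8], int nrounds)
{
        uint32_t x[16];

        memcpy(x, state, sizeof(x));
        chacha_permute_ref(x, nrounds);
        memcpy(&out[0], &x[0],  4 * sizeof(uint32_t));  /* st1 {v0.4s} */
        memcpy(&out[4], &x[12], 4 * sizeof(uint32_t));  /* st1 {v3.4s} */
}
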
190 // x1: 4 data blocks output, o
191 // x2: 4 data blocks input, i
205 // matrix by interleaving 32- and then 64-bit words, which allows us to do the XOR in NEON registers.
212 ld1 {v30.4s-v31.4s}, [x9]
214 // x0..15[0-3] = s0..3[0..3]
216 ld4r { v0.4s- v3.4s}, [x0]
217 ld4r { v4.4s- v7.4s}, [x8], #16
218 ld4r { v8.4s-v11.4s}, [x8], #16
219 ld4r {v12.4s-v15.4s}, [x8]
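
The ld4r loads replicate each of the sixteen state words across all four lanes of its own register, giving the lane-sliced layout the "x0..15[0-3] = s0..3[0..3]" comment describes: vector i carries word i of four blocks, one block per lane. A C sketch of the same layout using plain broadcasts instead of ld4r (illustrative):

#include <arm_neon.h>

static void load_4block_state(uint32x4_t x[16], const uint32_t state[16])
{
        for (int i = 0; i < 16; i++)
                x[i] = vdupq_n_u32(state[i]);   /* word i of s in all 4 lanes */
}
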
221 mov a0, v0.s[0]
222 mov a1, v1.s[0]
223 mov a2, v2.s[0]
224 mov a3, v3.s[0]
225 mov a4, v4.s[0]
226 mov a5, v5.s[0]
227 mov a6, v6.s[0]
228 mov a7, v7.s[0]
229 mov a8, v8.s[0]
230 mov a9, v9.s[0]
231 mov a10, v10.s[0]
232 mov a11, v11.s[0]
233 mov a12, v12.s[0]
234 mov a13, v13.s[0]
235 mov a14, v14.s[0]
236 mov a15, v15.s[0]
238 // x12 += counter values 1-4
239 add v12.4s, v12.4s, v30.4s
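
The mov a0..a15 lines peel lane 0 of the broadcast state into general-purpose registers so that an extra block can be processed with scalar ALU instructions alongside the four NEON lanes; the per-lane counter offsets 1-4 (CTRINC) then leave offset 0 for that scalar block. The vector side of the counter setup, as a sketch (illustrative):

#include <arm_neon.h>

static void add_block_counters(uint32x4_t x[16])
{
        const uint32_t ctrinc[4] = { 1, 2, 3, 4 };      /* CTRINC */

        /* lane j of x[12] now holds the block counter plus (j + 1) */
        x[12] = vaddq_u32(x[12], vld1q_u32(ctrinc));
}
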
246 add v0.4s, v0.4s, v4.4s
248 add v1.4s, v1.4s, v5.4s
250 add v2.4s, v2.4s, v6.4s
252 add v3.4s, v3.4s, v7.4s
277 add v8.4s, v8.4s, v12.4s
279 add v9.4s, v9.4s, v13.4s
281 add v10.4s, v10.4s, v14.4s
283 add v11.4s, v11.4s, v15.4s
295 shl v4.4s, v16.4s, #12
296 shl v5.4s, v17.4s, #12
297 shl v6.4s, v18.4s, #12
298 shl v7.4s, v19.4s, #12
300 sri v4.4s, v16.4s, #20
302 sri v5.4s, v17.4s, #20
304 sri v6.4s, v18.4s, #20
306 sri v7.4s, v19.4s, #20
313 add v0.4s, v0.4s, v4.4s
315 add v1.4s, v1.4s, v5.4s
317 add v2.4s, v2.4s, v6.4s
319 add v3.4s, v3.4s, v7.4s
344 add v8.4s, v8.4s, v12.4s
346 add v9.4s, v9.4s, v13.4s
348 add v10.4s, v10.4s, v14.4s
350 add v11.4s, v11.4s, v15.4s
362 shl v4.4s, v16.4s, #7
363 shl v5.4s, v17.4s, #7
364 shl v6.4s, v18.4s, #7
365 shl v7.4s, v19.4s, #7
367 sri v4.4s, v16.4s, #25
369 sri v5.4s, v17.4s, #25
371 sri v6.4s, v18.4s, #25
373 sri v7.4s, v19.4s, #25
380 add v0.4s, v0.4s, v5.4s
382 add v1.4s, v1.4s, v6.4s
384 add v2.4s, v2.4s, v7.4s
386 add v3.4s, v3.4s, v4.4s
411 add v10.4s, v10.4s, v15.4s
413 add v11.4s, v11.4s, v12.4s
415 add v8.4s, v8.4s, v13.4s
417 add v9.4s, v9.4s, v14.4s
429 shl v5.4s, v16.4s, #12
430 shl v6.4s, v17.4s, #12
431 shl v7.4s, v18.4s, #12
432 shl v4.4s, v19.4s, #12
434 sri v5.4s, v16.4s, #20
436 sri v6.4s, v17.4s, #20
438 sri v7.4s, v18.4s, #20
440 sri v4.4s, v19.4s, #20
447 add v0.4s, v0.4s, v5.4s
449 add v1.4s, v1.4s, v6.4s
451 add v2.4s, v2.4s, v7.4s
453 add v3.4s, v3.4s, v4.4s
478 add v10.4s, v10.4s, v15.4s
480 add v11.4s, v11.4s, v12.4s
482 add v8.4s, v8.4s, v13.4s
484 add v9.4s, v9.4s, v14.4s
496 shl v5.4s, v16.4s, #7
497 shl v6.4s, v17.4s, #7
498 shl v7.4s, v18.4s, #7
499 shl v4.4s, v19.4s, #7
501 sri v5.4s, v16.4s, #25
503 sri v6.4s, v17.4s, #25
505 sri v7.4s, v18.4s, #25
507 sri v4.4s, v19.4s, #25
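
In the lane-sliced form no word shuffling is needed between the two halves of a double round: the diagonal quarter-rounds simply re-pair the vectors, which is what the v0+v5 / v1+v6 / v2+v7 / v3+v4 (and v10+v15 / v11+v12 / v8+v13 / v9+v14) additions above do. Below is a NEON-intrinsics sketch of one lane-sliced double round; the 16- and 8-bit rotations are implemented differently in the file and do not appear in the matched lines, so the shl+sri form is used here for every rotate amount purely for brevity:

#include <arm_neon.h>

/* rotate each 32-bit lane left by a constant, shl + sri style */
#define VROTL(v, n)  vsriq_n_u32(vshlq_n_u32((v), (n)), (v), 32 - (n))

#define VQR(a, b, c, d) do {                                            \
        a = vaddq_u32(a, b);  d = veorq_u32(d, a);  d = VROTL(d, 16);   \
        c = vaddq_u32(c, d);  b = veorq_u32(b, c);  b = VROTL(b, 12);   \
        a = vaddq_u32(a, b);  d = veorq_u32(d, a);  d = VROTL(d, 8);    \
        c = vaddq_u32(c, d);  b = veorq_u32(b, c);  b = VROTL(b, 7);    \
} while (0)

static void double_round_4block(uint32x4_t x[16])
{
        /* column round */
        VQR(x[0], x[4], x[8],  x[12]);
        VQR(x[1], x[5], x[9],  x[13]);
        VQR(x[2], x[6], x[10], x[14]);
        VQR(x[3], x[7], x[11], x[15]);
        /* diagonal round: same operation, different pairing, no shuffles */
        VQR(x[0], x[5], x[10], x[15]);
        VQR(x[1], x[6], x[11], x[12]);
        VQR(x[2], x[7], x[8],  x[13]);
        VQR(x[3], x[4], x[9],  x[14]);
}
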
513 ld4r {v16.4s-v19.4s}, [x0], #16
514 ld4r {v20.4s-v23.4s}, [x0], #16
516 // x12 += counter values 1-4
517 add v12.4s, v12.4s, v30.4s
519 // x0[0-3] += s0[0]
520 // x1[0-3] += s0[1]
521 // x2[0-3] += s0[2]
522 // x3[0-3] += s0[3]
523 add v0.4s, v0.4s, v16.4s
524 mov w6, v16.s[0]
525 mov w7, v17.s[0]
526 add v1.4s, v1.4s, v17.4s
527 mov w8, v18.s[0]
528 mov w9, v19.s[0]
529 add v2.4s, v2.4s, v18.4s
532 add v3.4s, v3.4s, v19.4s
540 ld4r {v24.4s-v27.4s}, [x0], #16
541 ld4r {v28.4s-v31.4s}, [x0]
543 // x4[0-3] += s1[0]
544 // x5[0-3] += s1[1]
545 // x6[0-3] += s1[2]
546 // x7[0-3] += s1[3]
547 add v4.4s, v4.4s, v20.4s
548 mov w6, v20.s[0]
549 mov w7, v21.s[0]
550 add v5.4s, v5.4s, v21.4s
551 mov w8, v22.s[0]
552 mov w9, v23.s[0]
553 add v6.4s, v6.4s, v22.4s
556 add v7.4s, v7.4s, v23.4s
564 // x8[0-3] += s2[0]
565 // x9[0-3] += s2[1]
566 // x10[0-3] += s2[2]
567 // x11[0-3] += s2[3]
568 add v8.4s, v8.4s, v24.4s
569 mov w6, v24.s[0]
570 mov w7, v25.s[0]
571 add v9.4s, v9.4s, v25.4s
572 mov w8, v26.s[0]
573 mov w9, v27.s[0]
574 add v10.4s, v10.4s, v26.4s
577 add v11.4s, v11.4s, v27.4s
585 // x12[0-3] += s3[0]
586 // x13[0-3] += s3[1]
587 // x14[0-3] += s3[2]
588 // x15[0-3] += s3[3]
589 add v12.4s, v12.4s, v28.4s
590 mov w6, v28.s[0]
591 mov w7, v29.s[0]
592 add v13.4s, v13.4s, v29.4s
593 mov w8, v30.s[0]
594 mov w9, v31.s[0]
595 add v14.4s, v14.4s, v30.4s
598 add v15.4s, v15.4s, v31.4s
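
After the rounds, the ld4r loads broadcast the original state words into v16-v31 and the additions apply the ChaCha feed-forward to every lane, while the w6-w9 copies hand the same state words to the scalar block for its own feed-forward. The vector side in C (illustrative; the per-lane counter offsets were already added separately):

#include <arm_neon.h>

static void add_state_4block(uint32x4_t x[16], const uint32_t state[16])
{
        for (int i = 0; i < 16; i++)
                x[i] = vaddq_u32(x[i], vdupq_n_u32(state[i]));
}
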
606 // interleave 32-bit words in state n, n+1
608 zip1 v16.4s, v0.4s, v1.4s
609 ldp w8, w9, [x2, #-56]
611 zip2 v17.4s, v0.4s, v1.4s
613 zip1 v18.4s, v2.4s, v3.4s
615 zip2 v19.4s, v2.4s, v3.4s
617 ldp w6, w7, [x2, #-48]
618 zip1 v20.4s, v4.4s, v5.4s
619 ldp w8, w9, [x2, #-40]
621 zip2 v21.4s, v4.4s, v5.4s
623 zip1 v22.4s, v6.4s, v7.4s
625 zip2 v23.4s, v6.4s, v7.4s
627 ldp w6, w7, [x2, #-32]
628 zip1 v24.4s, v8.4s, v9.4s
629 ldp w8, w9, [x2, #-24]
631 zip2 v25.4s, v8.4s, v9.4s
633 zip1 v26.4s, v10.4s, v11.4s
635 zip2 v27.4s, v10.4s, v11.4s
637 ldp w6, w7, [x2, #-16]
638 zip1 v28.4s, v12.4s, v13.4s
639 ldp w8, w9, [x2, #-8]
641 zip2 v29.4s, v12.4s, v13.4s
643 zip1 v30.4s, v14.4s, v15.4s
645 zip2 v31.4s, v14.4s, v15.4s
654 // interleave 64-bit words in state n, n+2
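
The 32-bit zip1/zip2 pairs and the 64-bit interleaves named by these two comments are the two stages of a 4x4 transpose: each group of four lane-sliced words becomes four block-contiguous vectors that can be XORed directly with the loaded input. In NEON intrinsics (illustrative):

#include <arm_neon.h>

static void transpose_4x4(uint32x4_t r[4], const uint32x4_t x[4])
{
        /* stage 1: interleave 32-bit words in rows n, n+1 */
        uint32x4_t t0 = vzip1q_u32(x[0], x[1]);
        uint32x4_t t1 = vzip2q_u32(x[0], x[1]);
        uint32x4_t t2 = vzip1q_u32(x[2], x[3]);
        uint32x4_t t3 = vzip2q_u32(x[2], x[3]);

        /* stage 2: interleave 64-bit words in rows n, n+2 */
        r[0] = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(t0),
                                                vreinterpretq_u64_u32(t2)));
        r[1] = vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(t0),
                                                vreinterpretq_u64_u32(t2)));
        r[2] = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(t1),
                                                vreinterpretq_u64_u32(t3)));
        r[3] = vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(t1),
                                                vreinterpretq_u64_u32(t3)));
        /* r[j] now holds this group's four consecutive words of block j */
}
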
660 stp a2, a3, [x1, #-56]
661 ld1 {v16.16b-v19.16b}, [x2], x3
664 ccmp x3, xzr, #4, lt
671 stp a4, a5, [x1, #-48]
674 stp a6, a7, [x1, #-40]
675 ld1 {v20.16b-v23.16b}, [x2], x3
678 ccmp x3, xzr, #4, lt
685 stp a8, a9, [x1, #-32]
688 stp a10, a11, [x1, #-24]
689 ld1 {v24.16b-v27.16b}, [x2], x3
692 ccmp x3, xzr, #4, lt
698 stp a12, a13, [x1, #-16]
701 stp a14, a15, [x1, #-8]
702 ld1 {v28.16b-v31.16b}, [x2]
705 tbnz x5, #63, 0f
710 st1 {v16.16b-v19.16b}, [x1], #64
718 st1 {v20.16b-v23.16b}, [x1], #64
726 st1 {v24.16b-v27.16b}, [x1], #64
734 st1 {v28.16b-v31.16b}, [x1]
740 0: ld1 {v8.16b}, [x10]
745 ld1 {v16.16b-v19.16b}, [x2]
746 tbl v4.16b, {v0.16b-v3.16b}, v8.16b
747 tbx v20.16b, {v16.16b-v19.16b}, v9.16b
750 tbl v5.16b, {v0.16b-v3.16b}, v8.16b
751 tbx v21.16b, {v16.16b-v19.16b}, v9.16b
754 tbl v6.16b, {v0.16b-v3.16b}, v8.16b
755 tbx v22.16b, {v16.16b-v19.16b}, v9.16b
758 tbl v7.16b, {v0.16b-v3.16b}, v8.16b
759 tbx v23.16b, {v16.16b-v19.16b}, v9.16b
765 st1 {v20.16b-v23.16b}, [x1]
773 tbl v0.16b, {v4.16b-v7.16b}, v8.16b
774 tbx v20.16b, {v16.16b-v19.16b}, v9.16b
777 tbl v1.16b, {v4.16b-v7.16b}, v8.16b
778 tbx v21.16b, {v16.16b-v19.16b}, v9.16b
781 tbl v2.16b, {v4.16b-v7.16b}, v8.16b
782 tbx v22.16b, {v16.16b-v19.16b}, v9.16b
785 tbl v3.16b, {v4.16b-v7.16b}, v8.16b
786 tbx v23.16b, {v16.16b-v19.16b}, v9.16b
792 st1 {v20.16b-v23.16b}, [x1]
800 tbl v0.16b, {v8.16b-v11.16b}, v4.16b
801 tbx v24.16b, {v20.16b-v23.16b}, v5.16b
804 tbl v1.16b, {v8.16b-v11.16b}, v4.16b
805 tbx v25.16b, {v20.16b-v23.16b}, v5.16b
808 tbl v2.16b, {v8.16b-v11.16b}, v4.16b
809 tbx v26.16b, {v20.16b-v23.16b}, v5.16b
812 tbl v3.16b, {v8.16b-v11.16b}, v4.16b
813 tbx v27.16b, {v20.16b-v23.16b}, v5.16b
819 st1 {v24.16b-v27.16b}, [x1]
827 tbl v0.16b, {v12.16b-v15.16b}, v4.16b
828 tbx v28.16b, {v24.16b-v27.16b}, v5.16b
831 tbl v1.16b, {v12.16b-v15.16b}, v4.16b
832 tbx v29.16b, {v24.16b-v27.16b}, v5.16b
835 tbl v2.16b, {v12.16b-v15.16b}, v4.16b
836 tbx v30.16b, {v24.16b-v27.16b}, v5.16b
839 tbl v3.16b, {v12.16b-v15.16b}, v4.16b
840 tbx v31.16b, {v24.16b-v27.16b}, v5.16b
846 st1 {v28.16b-v31.16b}, [x1]
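
The 0: label is the short-input path reached through the tbnz branch above; it uses TBL/TBX byte-table lookups with index vectors derived from the .Lpermute table (defined below) to slide keystream and data bytes so a final partial group can be handled in registers. The exact index arithmetic is not among the matched lines, but the byte-wise semantics of the two lookup instructions, for a 16-byte destination and a table of up to 64 bytes ({v0.16b-v3.16b} and friends), can be sketched precisely (illustrative C):

#include <stddef.h>
#include <stdint.h>

/* tbl: out-of-range indices produce zero bytes */
static void tbl_ref(uint8_t out[16], const uint8_t *table, size_t table_len,
                    const uint8_t idx[16])
{
        for (int i = 0; i < 16; i++)
                out[i] = (idx[i] < table_len) ? table[idx[i]] : 0;
}

/* tbx: out-of-range indices leave the destination byte unchanged */
static void tbx_ref(uint8_t out[16], const uint8_t *table, size_t table_len,
                    const uint8_t idx[16])
{
        for (int i = 0; i < 16; i++)
                if (idx[i] < table_len)
                        out[i] = table[idx[i]];
}
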
853 .set .Li, 0
855 .byte (.Li - 64)
859 CTRINC: .word 1, 2, 3, 4
860 ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
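
The .Lpermute fragment emits a run of consecutive byte values starting at -64 (the index source for the tbl/tbx path), CTRINC supplies the per-lane counter offsets 1-4, and ROT8 has the shape of a byte-shuffle index that rotates each 32-bit word left by 8 bits (its use site is not among the matched lines). A scalar check of the ROT8 pattern (illustrative C):

#include <stdint.h>

static uint32_t rol8_via_bytes(uint32_t w)
{
        /* little-endian bytes of w; first ROT8 word 0x02010003 = indices {3, 0, 1, 2} */
        uint8_t b[4]   = { (uint8_t)w, (uint8_t)(w >> 8),
                           (uint8_t)(w >> 16), (uint8_t)(w >> 24) };
        uint8_t idx[4] = { 3, 0, 1, 2 };
        uint8_t r[4];

        for (int i = 0; i < 4; i++)
                r[i] = b[idx[i]];

        /* equals (w << 8) | (w >> 24), i.e. rol32(w, 8) */
        return (uint32_t)r[0] | (uint32_t)r[1] << 8 |
               (uint32_t)r[2] << 16 | (uint32_t)r[3] << 24;
}
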