#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc-4.9	NEON
#
# Apple A7	1.86/+5%	0.72
# Cortex-A53	2.69/+58%	1.47
# Cortex-A57	2.70/+7%	1.14
# Denver	1.64/+50%	1.18(*)
# X-Gene	2.13/+68%	2.27
# Mongoose	1.77/+75%	1.12
# Kryo		2.70/+55%	1.13
# ThunderX2	1.17/+95%	1.36
#
# (*)	estimate based on resources availability is less than 1.0,
#	i.e. measured result is worse than expected, presumably binary
#	translator is not almighty;

# Command line: <flavour> <output>.
#
# With a flavour other than "void" the generated text is piped through
# arm-xlate.pl; with "void" (or no flavour) it is written to <output>
# directly, or left on the inherited STDOUT when no <output> is given.
#
# NOTE(review): these stay package globals (no "my") because the rest of
# the file is not visible from here and may refer to them.
$flavour=shift;
$output=shift;

if ($flavour && $flavour ne "void") {
    # Directory part of the script path including the trailing separator,
    # or "" when the script was started by bare name.  Testing the match
    # avoids picking up an unset/stale $1.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # A failed pipe would otherwise go unnoticed and the assembly would
    # be emitted untranslated (or not at all).
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
} elsif (defined($output) && length($output)) {
    # Three-argument form: the file name is never parsed for a mode.
    open STDOUT,">",$output
	or die "can't open $output: $!";
}

# Scalar register allocation.
#   x0..x3   : $ctx, $inp, $len, $padbit  (arguments of poly1305_blocks)
#   x1, x2   : re-used as $mac, $nonce    (arguments of poly1305_emit)
#   x4..x14  : $h0,$h1,$h2 (hash), $r0,$r1 (key), $s1, $t0,$t1 (scratch),
#              $d0,$d1,$d2 (product / conversion temporaries)
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));

# Context layout as used by the routines below:
#   [ctx+0  .. ctx+23]  hash value: three 64-bit words in base 2^64, or
#                       26-bit limbs held in 32-bit words when the
#                       is_base2_26 flag is set
#   [ctx+24]            is_base2_26 flag
#   [ctx+32 .. ctx+47]  clamped key r0,r1
#   [ctx+48]            first 32-bit word of the key-power table; set to
#                       -1 by poly1305_init as "not computed yet"
#
# poly1305_init (x0 = ctx, x1 = key, x2 = function table when
# __KERNEL__ is not defined)
#   Zeroes the hash and the flag.  With a NULL key it returns 0.
#   Otherwise it loads the 16-byte key, clamps it
#   (r0 &= 0x0ffffffc0fffffff, r1 &= 0x0ffffffc0ffffffc), saves it at
#   ctx+32, marks the power table as uninitialised and returns 1.
#   Outside the kernel it also stores the blocks/emit entry points at
#   [x2], choosing the NEON blocks routine when ARMV7_NEON is set in
#   OPENSSL_armcap_P.
#
# poly1305_blocks (x0 = ctx, x1 = inp, x2 = len, x3 = padbit)
#   Rounds len down to a multiple of 16 and returns at once if that is
#   zero.  Converts the stored hash from base 2^26 when the flag is set,
#   then for every 16-byte block adds the block and padbit to the hash
#   and multiplies by the key with partial reduction.  Stores the hash
#   in base 2^64 and clears the flag.
#
# poly1305_emit (x0 = ctx, x1 = mac, x2 = nonce)
#   Loads the hash (converting from base 2^26 when flagged), performs
#   the final comparison against the modulus, adds the 128-bit nonce and
#   writes the 16-byte result to mac.
$code.=<<___;
#ifndef	__KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armcap_P
#endif

.text

// forward "declarations" are required for Apple
.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	$inp,xzr
	stp	xzr,xzr,[$ctx]		// zero hash value
	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif

	ldp	$r0,$r1,[$inp]		// load key
	mov	$s1,#0xfffffffc0fffffff
	movk	$s1,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	$r0,$r0			// flip bytes
	rev	$r1,$r1
#endif
	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
	and	$s1,$s1,#-4
	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
	mov	w#$s1,#-1
	stp	$r0,$r1,[$ctx,#32]	// save key value
	str	w#$s1,[$ctx,#48]	// impossible key power value

#ifndef	__KERNEL__
	tst	w17,#ARMV7_NEON

	adr	$d0,.Lpoly1305_blocks
	adr	$r0,.Lpoly1305_blocks_neon
	adr	$d1,.Lpoly1305_emit

	csel	$d0,$d0,$r0,eq

# ifdef	__ILP32__
	stp	w#$d0,w#$d1,[$len]
# else
	stp	$d0,$d1,[$len]
# endif
#endif
	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	$len,$len,#-16
	b.eq	.Lno_data

	ldp	$h0,$h1,[$ctx]		// load hash value
	ldp	$h2,x17,[$ctx,#16]	// [along with is_base2_26]
	ldp	$r0,$r1,[$ctx,#32]	// load key value

#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	cmp	x17,#0			// is_base2_26?
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq

.Loop:
	ldp	$t0,$t1,[$inp],#16	// load input
	sub	$len,$len,#16
#ifdef	__AARCH64EB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	stp	$h2,xzr,[$ctx,#16]	// [and clear is_base2_26]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
	ldp	$h2,$r0,[$ctx,#16]	// [along with is_base2_26]
	ldp	$t0,$t1,[$nonce]	// load nonce

#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	cmp	$r0,#0			// is_base2_26?
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq

	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__AARCH64EB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__AARCH64EB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
___
# NEON register allocation.
#   v0..v8   (.4s) : $R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4
#   v9..v13  (.2s) : $IN01_0..$IN01_4
#   v14..v18 (.2s) : $IN23_0..$IN23_4
#   v19..v23 (.2d) : $ACC0..$ACC4
#   v24..v28 (.2s) : $H0..$H4
#   v29..v31       : $T0, $T1, $MASK (arrangement is appended at use)
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
my ($T0,$T1,$MASK) = map("v$_",(29..31));

# x17 does double duty: it names both $zeros and $is_base2_26.
my ($in2,$zeros)=("x16","x17");
my $is_base2_26 = $zeros;		# borrow

# Internal helpers used by the NEON code path, followed by the entry of
# poly1305_blocks_neon (whose body continues after this chunk).
#
# poly1305_mult
#   In : $h0,$h1,$h2 = value, $r0,$r1 = key, $s1 = r1 + (r1 >> 2)
#   Out: $h0,$h1,$h2 = value * key, partially reduced ($h2 is rebuilt
#        from the low two bits of the top product word plus carry)
#   Scratch: $d0,$d1,$d2,$t0,$t1.  Contains no calls or memory accesses.
#
# poly1305_splat
#   In : $h0,$h1,$h2 = value in base 2^64, $ctx = destination
#   Splits the value into five 26-bit limbs (the top limb is not masked)
#   and stores 32-bit words at 16-byte strides from $ctx in the order
#   r0, r1, s1, r2, s2, r3, s3, r4, s4, where sN = rN*5.
#   Scratch: x12..x16 (x12..x14 are also $d0..$d2).
#
# poly1305_blocks_neon
#   Exported only when __KERNEL__ is defined.  Loads the is_base2_26
#   flag and hands inputs shorter than 128 bytes to the scalar
#   .Lpoly1305_blocks path before setting up its stack frame.
$code.=<<___;
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	$d0,$h0,$r0		// h0*r0
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	4
poly1305_splat:
	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,$h0,#26,#26
	extr	x14,$h1,$h0,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,$h1,#14,#26
	extr	x16,$h2,$h1,#40

	str	w12,[$ctx,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[$ctx,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[$ctx,#16*2]	// s1
	str	w14,[$ctx,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[$ctx,#16*4]	// s2
	str	w15,[$ctx,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[$ctx,#16*6]	// s3
	str	w16,[$ctx,#16*7]	// r4
	str	w15,[$ctx,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

#ifdef	__KERNEL__
.globl	poly1305_blocks_neon
#endif
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	$is_base2_26,[$ctx,#24]
	cmp	$len,#128
	b.lo	.Lpoly1305_blocks

	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
350f569ca16SArd Biesheuvel add x29,sp,#0 351f569ca16SArd Biesheuvel 352f569ca16SArd Biesheuvel stp d8,d9,[sp,#16] // meet ABI requirements 353f569ca16SArd Biesheuvel stp d10,d11,[sp,#32] 354f569ca16SArd Biesheuvel stp d12,d13,[sp,#48] 355f569ca16SArd Biesheuvel stp d14,d15,[sp,#64] 356f569ca16SArd Biesheuvel 357f569ca16SArd Biesheuvel cbz $is_base2_26,.Lbase2_64_neon 358f569ca16SArd Biesheuvel 359f569ca16SArd Biesheuvel ldp w10,w11,[$ctx] // load hash value base 2^26 360f569ca16SArd Biesheuvel ldp w12,w13,[$ctx,#8] 361f569ca16SArd Biesheuvel ldr w14,[$ctx,#16] 362f569ca16SArd Biesheuvel 363f569ca16SArd Biesheuvel tst $len,#31 364f569ca16SArd Biesheuvel b.eq .Leven_neon 365f569ca16SArd Biesheuvel 366f569ca16SArd Biesheuvel ldp $r0,$r1,[$ctx,#32] // load key value 367f569ca16SArd Biesheuvel 368f569ca16SArd Biesheuvel add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64 369f569ca16SArd Biesheuvel lsr $h1,x12,#12 370f569ca16SArd Biesheuvel adds $h0,$h0,x12,lsl#52 371f569ca16SArd Biesheuvel add $h1,$h1,x13,lsl#14 372f569ca16SArd Biesheuvel adc $h1,$h1,xzr 373f569ca16SArd Biesheuvel lsr $h2,x14,#24 374f569ca16SArd Biesheuvel adds $h1,$h1,x14,lsl#40 375f569ca16SArd Biesheuvel adc $d2,$h2,xzr // can be partially reduced... 
376f569ca16SArd Biesheuvel 377f569ca16SArd Biesheuvel ldp $d0,$d1,[$inp],#16 // load input 378f569ca16SArd Biesheuvel sub $len,$len,#16 379f569ca16SArd Biesheuvel add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) 380f569ca16SArd Biesheuvel 381f569ca16SArd Biesheuvel#ifdef __AARCH64EB__ 382f569ca16SArd Biesheuvel rev $d0,$d0 383f569ca16SArd Biesheuvel rev $d1,$d1 384f569ca16SArd Biesheuvel#endif 385f569ca16SArd Biesheuvel adds $h0,$h0,$d0 // accumulate input 386f569ca16SArd Biesheuvel adcs $h1,$h1,$d1 387f569ca16SArd Biesheuvel adc $h2,$h2,$padbit 388f569ca16SArd Biesheuvel 389f569ca16SArd Biesheuvel bl poly1305_mult 390f569ca16SArd Biesheuvel 391f569ca16SArd Biesheuvel and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 392f569ca16SArd Biesheuvel ubfx x11,$h0,#26,#26 393f569ca16SArd Biesheuvel extr x12,$h1,$h0,#52 394f569ca16SArd Biesheuvel and x12,x12,#0x03ffffff 395f569ca16SArd Biesheuvel ubfx x13,$h1,#14,#26 396f569ca16SArd Biesheuvel extr x14,$h2,$h1,#40 397f569ca16SArd Biesheuvel 398f569ca16SArd Biesheuvel b .Leven_neon 399f569ca16SArd Biesheuvel 400f569ca16SArd Biesheuvel.align 4 401f569ca16SArd Biesheuvel.Lbase2_64_neon: 402f569ca16SArd Biesheuvel ldp $r0,$r1,[$ctx,#32] // load key value 403f569ca16SArd Biesheuvel 404f569ca16SArd Biesheuvel ldp $h0,$h1,[$ctx] // load hash value base 2^64 405f569ca16SArd Biesheuvel ldr $h2,[$ctx,#16] 406f569ca16SArd Biesheuvel 407f569ca16SArd Biesheuvel tst $len,#31 408f569ca16SArd Biesheuvel b.eq .Linit_neon 409f569ca16SArd Biesheuvel 410f569ca16SArd Biesheuvel ldp $d0,$d1,[$inp],#16 // load input 411f569ca16SArd Biesheuvel sub $len,$len,#16 412f569ca16SArd Biesheuvel add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) 413f569ca16SArd Biesheuvel#ifdef __AARCH64EB__ 414f569ca16SArd Biesheuvel rev $d0,$d0 415f569ca16SArd Biesheuvel rev $d1,$d1 416f569ca16SArd Biesheuvel#endif 417f569ca16SArd Biesheuvel adds $h0,$h0,$d0 // accumulate input 418f569ca16SArd Biesheuvel adcs $h1,$h1,$d1 419f569ca16SArd Biesheuvel adc $h2,$h2,$padbit 
420f569ca16SArd Biesheuvel 421f569ca16SArd Biesheuvel bl poly1305_mult 422f569ca16SArd Biesheuvel 423f569ca16SArd Biesheuvel.Linit_neon: 424f569ca16SArd Biesheuvel ldr w17,[$ctx,#48] // first table element 425f569ca16SArd Biesheuvel and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26 426f569ca16SArd Biesheuvel ubfx x11,$h0,#26,#26 427f569ca16SArd Biesheuvel extr x12,$h1,$h0,#52 428f569ca16SArd Biesheuvel and x12,x12,#0x03ffffff 429f569ca16SArd Biesheuvel ubfx x13,$h1,#14,#26 430f569ca16SArd Biesheuvel extr x14,$h2,$h1,#40 431f569ca16SArd Biesheuvel 432f569ca16SArd Biesheuvel cmp w17,#-1 // is value impossible? 433f569ca16SArd Biesheuvel b.ne .Leven_neon 434f569ca16SArd Biesheuvel 435f569ca16SArd Biesheuvel fmov ${H0},x10 436f569ca16SArd Biesheuvel fmov ${H1},x11 437f569ca16SArd Biesheuvel fmov ${H2},x12 438f569ca16SArd Biesheuvel fmov ${H3},x13 439f569ca16SArd Biesheuvel fmov ${H4},x14 440f569ca16SArd Biesheuvel 441f569ca16SArd Biesheuvel ////////////////////////////////// initialize r^n table 442f569ca16SArd Biesheuvel mov $h0,$r0 // r^1 443f569ca16SArd Biesheuvel add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2) 444f569ca16SArd Biesheuvel mov $h1,$r1 445f569ca16SArd Biesheuvel mov $h2,xzr 446f569ca16SArd Biesheuvel add $ctx,$ctx,#48+12 447f569ca16SArd Biesheuvel bl poly1305_splat 448f569ca16SArd Biesheuvel 449f569ca16SArd Biesheuvel bl poly1305_mult // r^2 450f569ca16SArd Biesheuvel sub $ctx,$ctx,#4 451f569ca16SArd Biesheuvel bl poly1305_splat 452f569ca16SArd Biesheuvel 453f569ca16SArd Biesheuvel bl poly1305_mult // r^3 454f569ca16SArd Biesheuvel sub $ctx,$ctx,#4 455f569ca16SArd Biesheuvel bl poly1305_splat 456f569ca16SArd Biesheuvel 457f569ca16SArd Biesheuvel bl poly1305_mult // r^4 458f569ca16SArd Biesheuvel sub $ctx,$ctx,#4 459f569ca16SArd Biesheuvel bl poly1305_splat 460f569ca16SArd Biesheuvel sub $ctx,$ctx,#48 // restore original $ctx 461f569ca16SArd Biesheuvel b .Ldo_neon 462f569ca16SArd Biesheuvel 463f569ca16SArd Biesheuvel.align 4 464f569ca16SArd 
Biesheuvel.Leven_neon: 465f569ca16SArd Biesheuvel fmov ${H0},x10 466f569ca16SArd Biesheuvel fmov ${H1},x11 467f569ca16SArd Biesheuvel fmov ${H2},x12 468f569ca16SArd Biesheuvel fmov ${H3},x13 469f569ca16SArd Biesheuvel fmov ${H4},x14 470f569ca16SArd Biesheuvel 471f569ca16SArd Biesheuvel.Ldo_neon: 472f569ca16SArd Biesheuvel ldp x8,x12,[$inp,#32] // inp[2:3] 473f569ca16SArd Biesheuvel subs $len,$len,#64 474f569ca16SArd Biesheuvel ldp x9,x13,[$inp,#48] 475f569ca16SArd Biesheuvel add $in2,$inp,#96 476*9369693aSJia He adrp $zeros,.Lzeros 477*9369693aSJia He add $zeros,$zeros,#:lo12:.Lzeros 478f569ca16SArd Biesheuvel 479f569ca16SArd Biesheuvel lsl $padbit,$padbit,#24 480f569ca16SArd Biesheuvel add x15,$ctx,#48 481f569ca16SArd Biesheuvel 482f569ca16SArd Biesheuvel#ifdef __AARCH64EB__ 483f569ca16SArd Biesheuvel rev x8,x8 484f569ca16SArd Biesheuvel rev x12,x12 485f569ca16SArd Biesheuvel rev x9,x9 486f569ca16SArd Biesheuvel rev x13,x13 487f569ca16SArd Biesheuvel#endif 488f569ca16SArd Biesheuvel and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 489f569ca16SArd Biesheuvel and x5,x9,#0x03ffffff 490f569ca16SArd Biesheuvel ubfx x6,x8,#26,#26 491f569ca16SArd Biesheuvel ubfx x7,x9,#26,#26 492f569ca16SArd Biesheuvel add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 493f569ca16SArd Biesheuvel extr x8,x12,x8,#52 494f569ca16SArd Biesheuvel extr x9,x13,x9,#52 495f569ca16SArd Biesheuvel add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 496f569ca16SArd Biesheuvel fmov $IN23_0,x4 497f569ca16SArd Biesheuvel and x8,x8,#0x03ffffff 498f569ca16SArd Biesheuvel and x9,x9,#0x03ffffff 499f569ca16SArd Biesheuvel ubfx x10,x12,#14,#26 500f569ca16SArd Biesheuvel ubfx x11,x13,#14,#26 501f569ca16SArd Biesheuvel add x12,$padbit,x12,lsr#40 502f569ca16SArd Biesheuvel add x13,$padbit,x13,lsr#40 503f569ca16SArd Biesheuvel add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 504f569ca16SArd Biesheuvel fmov $IN23_1,x6 505f569ca16SArd Biesheuvel add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 506f569ca16SArd Biesheuvel add x12,x12,x13,lsl#32 // 
bfi x12,x13,#32,#32 507f569ca16SArd Biesheuvel fmov $IN23_2,x8 508f569ca16SArd Biesheuvel fmov $IN23_3,x10 509f569ca16SArd Biesheuvel fmov $IN23_4,x12 510f569ca16SArd Biesheuvel 511f569ca16SArd Biesheuvel ldp x8,x12,[$inp],#16 // inp[0:1] 512f569ca16SArd Biesheuvel ldp x9,x13,[$inp],#48 513f569ca16SArd Biesheuvel 514f569ca16SArd Biesheuvel ld1 {$R0,$R1,$S1,$R2},[x15],#64 515f569ca16SArd Biesheuvel ld1 {$S2,$R3,$S3,$R4},[x15],#64 516f569ca16SArd Biesheuvel ld1 {$S4},[x15] 517f569ca16SArd Biesheuvel 518f569ca16SArd Biesheuvel#ifdef __AARCH64EB__ 519f569ca16SArd Biesheuvel rev x8,x8 520f569ca16SArd Biesheuvel rev x12,x12 521f569ca16SArd Biesheuvel rev x9,x9 522f569ca16SArd Biesheuvel rev x13,x13 523f569ca16SArd Biesheuvel#endif 524f569ca16SArd Biesheuvel and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 525f569ca16SArd Biesheuvel and x5,x9,#0x03ffffff 526f569ca16SArd Biesheuvel ubfx x6,x8,#26,#26 527f569ca16SArd Biesheuvel ubfx x7,x9,#26,#26 528f569ca16SArd Biesheuvel add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 529f569ca16SArd Biesheuvel extr x8,x12,x8,#52 530f569ca16SArd Biesheuvel extr x9,x13,x9,#52 531f569ca16SArd Biesheuvel add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 532f569ca16SArd Biesheuvel fmov $IN01_0,x4 533f569ca16SArd Biesheuvel and x8,x8,#0x03ffffff 534f569ca16SArd Biesheuvel and x9,x9,#0x03ffffff 535f569ca16SArd Biesheuvel ubfx x10,x12,#14,#26 536f569ca16SArd Biesheuvel ubfx x11,x13,#14,#26 537f569ca16SArd Biesheuvel add x12,$padbit,x12,lsr#40 538f569ca16SArd Biesheuvel add x13,$padbit,x13,lsr#40 539f569ca16SArd Biesheuvel add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 540f569ca16SArd Biesheuvel fmov $IN01_1,x6 541f569ca16SArd Biesheuvel add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 542f569ca16SArd Biesheuvel add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 543f569ca16SArd Biesheuvel movi $MASK.2d,#-1 544f569ca16SArd Biesheuvel fmov $IN01_2,x8 545f569ca16SArd Biesheuvel fmov $IN01_3,x10 546f569ca16SArd Biesheuvel fmov $IN01_4,x12 547f569ca16SArd Biesheuvel ushr 
$MASK.2d,$MASK.2d,#38 548f569ca16SArd Biesheuvel 549f569ca16SArd Biesheuvel b.ls .Lskip_loop 550f569ca16SArd Biesheuvel 551f569ca16SArd Biesheuvel.align 4 552f569ca16SArd Biesheuvel.Loop_neon: 553f569ca16SArd Biesheuvel //////////////////////////////////////////////////////////////// 554f569ca16SArd Biesheuvel // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 555f569ca16SArd Biesheuvel // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 556f569ca16SArd Biesheuvel // \___________________/ 557f569ca16SArd Biesheuvel // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 558f569ca16SArd Biesheuvel // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 559f569ca16SArd Biesheuvel // \___________________/ \____________________/ 560f569ca16SArd Biesheuvel // 561f569ca16SArd Biesheuvel // Note that we start with inp[2:3]*r^2. This is because it 562f569ca16SArd Biesheuvel // doesn't depend on reduction in previous iteration. 563f569ca16SArd Biesheuvel //////////////////////////////////////////////////////////////// 564f569ca16SArd Biesheuvel // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0 565f569ca16SArd Biesheuvel // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4 566f569ca16SArd Biesheuvel // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3 567f569ca16SArd Biesheuvel // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2 568f569ca16SArd Biesheuvel // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 569f569ca16SArd Biesheuvel 570f569ca16SArd Biesheuvel subs $len,$len,#64 571f569ca16SArd Biesheuvel umull $ACC4,$IN23_0,${R4}[2] 572f569ca16SArd Biesheuvel csel $in2,$zeros,$in2,lo 573f569ca16SArd Biesheuvel umull $ACC3,$IN23_0,${R3}[2] 574f569ca16SArd Biesheuvel umull $ACC2,$IN23_0,${R2}[2] 575f569ca16SArd Biesheuvel ldp x8,x12,[$in2],#16 // inp[2:3] (or zero) 576f569ca16SArd Biesheuvel umull $ACC1,$IN23_0,${R1}[2] 577f569ca16SArd Biesheuvel ldp x9,x13,[$in2],#48 578f569ca16SArd Biesheuvel umull $ACC0,$IN23_0,${R0}[2] 579f569ca16SArd Biesheuvel#ifdef __AARCH64EB__ 
580f569ca16SArd Biesheuvel rev x8,x8 581f569ca16SArd Biesheuvel rev x12,x12 582f569ca16SArd Biesheuvel rev x9,x9 583f569ca16SArd Biesheuvel rev x13,x13 584f569ca16SArd Biesheuvel#endif 585f569ca16SArd Biesheuvel 586f569ca16SArd Biesheuvel umlal $ACC4,$IN23_1,${R3}[2] 587f569ca16SArd Biesheuvel and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 588f569ca16SArd Biesheuvel umlal $ACC3,$IN23_1,${R2}[2] 589f569ca16SArd Biesheuvel and x5,x9,#0x03ffffff 590f569ca16SArd Biesheuvel umlal $ACC2,$IN23_1,${R1}[2] 591f569ca16SArd Biesheuvel ubfx x6,x8,#26,#26 592f569ca16SArd Biesheuvel umlal $ACC1,$IN23_1,${R0}[2] 593f569ca16SArd Biesheuvel ubfx x7,x9,#26,#26 594f569ca16SArd Biesheuvel umlal $ACC0,$IN23_1,${S4}[2] 595f569ca16SArd Biesheuvel add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 596f569ca16SArd Biesheuvel 597f569ca16SArd Biesheuvel umlal $ACC4,$IN23_2,${R2}[2] 598f569ca16SArd Biesheuvel extr x8,x12,x8,#52 599f569ca16SArd Biesheuvel umlal $ACC3,$IN23_2,${R1}[2] 600f569ca16SArd Biesheuvel extr x9,x13,x9,#52 601f569ca16SArd Biesheuvel umlal $ACC2,$IN23_2,${R0}[2] 602f569ca16SArd Biesheuvel add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 603f569ca16SArd Biesheuvel umlal $ACC1,$IN23_2,${S4}[2] 604f569ca16SArd Biesheuvel fmov $IN23_0,x4 605f569ca16SArd Biesheuvel umlal $ACC0,$IN23_2,${S3}[2] 606f569ca16SArd Biesheuvel and x8,x8,#0x03ffffff 607f569ca16SArd Biesheuvel 608f569ca16SArd Biesheuvel umlal $ACC4,$IN23_3,${R1}[2] 609f569ca16SArd Biesheuvel and x9,x9,#0x03ffffff 610f569ca16SArd Biesheuvel umlal $ACC3,$IN23_3,${R0}[2] 611f569ca16SArd Biesheuvel ubfx x10,x12,#14,#26 612f569ca16SArd Biesheuvel umlal $ACC2,$IN23_3,${S4}[2] 613f569ca16SArd Biesheuvel ubfx x11,x13,#14,#26 614f569ca16SArd Biesheuvel umlal $ACC1,$IN23_3,${S3}[2] 615f569ca16SArd Biesheuvel add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 616f569ca16SArd Biesheuvel umlal $ACC0,$IN23_3,${S2}[2] 617f569ca16SArd Biesheuvel fmov $IN23_1,x6 618f569ca16SArd Biesheuvel 619f569ca16SArd Biesheuvel add $IN01_2,$IN01_2,$H2 620f569ca16SArd 
Biesheuvel add x12,$padbit,x12,lsr#40 621f569ca16SArd Biesheuvel umlal $ACC4,$IN23_4,${R0}[2] 622f569ca16SArd Biesheuvel add x13,$padbit,x13,lsr#40 623f569ca16SArd Biesheuvel umlal $ACC3,$IN23_4,${S4}[2] 624f569ca16SArd Biesheuvel add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 625f569ca16SArd Biesheuvel umlal $ACC2,$IN23_4,${S3}[2] 626f569ca16SArd Biesheuvel add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 627f569ca16SArd Biesheuvel umlal $ACC1,$IN23_4,${S2}[2] 628f569ca16SArd Biesheuvel fmov $IN23_2,x8 629f569ca16SArd Biesheuvel umlal $ACC0,$IN23_4,${S1}[2] 630f569ca16SArd Biesheuvel fmov $IN23_3,x10 631f569ca16SArd Biesheuvel 632f569ca16SArd Biesheuvel //////////////////////////////////////////////////////////////// 633f569ca16SArd Biesheuvel // (hash+inp[0:1])*r^4 and accumulate 634f569ca16SArd Biesheuvel 635f569ca16SArd Biesheuvel add $IN01_0,$IN01_0,$H0 636f569ca16SArd Biesheuvel fmov $IN23_4,x12 637f569ca16SArd Biesheuvel umlal $ACC3,$IN01_2,${R1}[0] 638f569ca16SArd Biesheuvel ldp x8,x12,[$inp],#16 // inp[0:1] 639f569ca16SArd Biesheuvel umlal $ACC0,$IN01_2,${S3}[0] 640f569ca16SArd Biesheuvel ldp x9,x13,[$inp],#48 641f569ca16SArd Biesheuvel umlal $ACC4,$IN01_2,${R2}[0] 642f569ca16SArd Biesheuvel umlal $ACC1,$IN01_2,${S4}[0] 643f569ca16SArd Biesheuvel umlal $ACC2,$IN01_2,${R0}[0] 644f569ca16SArd Biesheuvel#ifdef __AARCH64EB__ 645f569ca16SArd Biesheuvel rev x8,x8 646f569ca16SArd Biesheuvel rev x12,x12 647f569ca16SArd Biesheuvel rev x9,x9 648f569ca16SArd Biesheuvel rev x13,x13 649f569ca16SArd Biesheuvel#endif 650f569ca16SArd Biesheuvel 651f569ca16SArd Biesheuvel add $IN01_1,$IN01_1,$H1 652f569ca16SArd Biesheuvel umlal $ACC3,$IN01_0,${R3}[0] 653f569ca16SArd Biesheuvel umlal $ACC4,$IN01_0,${R4}[0] 654f569ca16SArd Biesheuvel and x4,x8,#0x03ffffff // base 2^64 -> base 2^26 655f569ca16SArd Biesheuvel umlal $ACC2,$IN01_0,${R2}[0] 656f569ca16SArd Biesheuvel and x5,x9,#0x03ffffff 657f569ca16SArd Biesheuvel umlal $ACC0,$IN01_0,${R0}[0] 658f569ca16SArd Biesheuvel ubfx 
x6,x8,#26,#26 659f569ca16SArd Biesheuvel umlal $ACC1,$IN01_0,${R1}[0] 660f569ca16SArd Biesheuvel ubfx x7,x9,#26,#26 661f569ca16SArd Biesheuvel 662f569ca16SArd Biesheuvel add $IN01_3,$IN01_3,$H3 663f569ca16SArd Biesheuvel add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32 664f569ca16SArd Biesheuvel umlal $ACC3,$IN01_1,${R2}[0] 665f569ca16SArd Biesheuvel extr x8,x12,x8,#52 666f569ca16SArd Biesheuvel umlal $ACC4,$IN01_1,${R3}[0] 667f569ca16SArd Biesheuvel extr x9,x13,x9,#52 668f569ca16SArd Biesheuvel umlal $ACC0,$IN01_1,${S4}[0] 669f569ca16SArd Biesheuvel add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32 670f569ca16SArd Biesheuvel umlal $ACC2,$IN01_1,${R1}[0] 671f569ca16SArd Biesheuvel fmov $IN01_0,x4 672f569ca16SArd Biesheuvel umlal $ACC1,$IN01_1,${R0}[0] 673f569ca16SArd Biesheuvel and x8,x8,#0x03ffffff 674f569ca16SArd Biesheuvel 675f569ca16SArd Biesheuvel add $IN01_4,$IN01_4,$H4 676f569ca16SArd Biesheuvel and x9,x9,#0x03ffffff 677f569ca16SArd Biesheuvel umlal $ACC3,$IN01_3,${R0}[0] 678f569ca16SArd Biesheuvel ubfx x10,x12,#14,#26 679f569ca16SArd Biesheuvel umlal $ACC0,$IN01_3,${S2}[0] 680f569ca16SArd Biesheuvel ubfx x11,x13,#14,#26 681f569ca16SArd Biesheuvel umlal $ACC4,$IN01_3,${R1}[0] 682f569ca16SArd Biesheuvel add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32 683f569ca16SArd Biesheuvel umlal $ACC1,$IN01_3,${S3}[0] 684f569ca16SArd Biesheuvel fmov $IN01_1,x6 685f569ca16SArd Biesheuvel umlal $ACC2,$IN01_3,${S4}[0] 686f569ca16SArd Biesheuvel add x12,$padbit,x12,lsr#40 687f569ca16SArd Biesheuvel 688f569ca16SArd Biesheuvel umlal $ACC3,$IN01_4,${S4}[0] 689f569ca16SArd Biesheuvel add x13,$padbit,x13,lsr#40 690f569ca16SArd Biesheuvel umlal $ACC0,$IN01_4,${S1}[0] 691f569ca16SArd Biesheuvel add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32 692f569ca16SArd Biesheuvel umlal $ACC4,$IN01_4,${R0}[0] 693f569ca16SArd Biesheuvel add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32 694f569ca16SArd Biesheuvel umlal $ACC1,$IN01_4,${S2}[0] 695f569ca16SArd Biesheuvel fmov $IN01_2,x8 696f569ca16SArd Biesheuvel umlal 
$ACC2,$IN01_4,${S3}[0] 697f569ca16SArd Biesheuvel fmov $IN01_3,x10 698f569ca16SArd Biesheuvel fmov $IN01_4,x12 699f569ca16SArd Biesheuvel 700f569ca16SArd Biesheuvel ///////////////////////////////////////////////////////////////// 701f569ca16SArd Biesheuvel // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 702f569ca16SArd Biesheuvel // and P. Schwabe 703f569ca16SArd Biesheuvel // 704f569ca16SArd Biesheuvel // [see discussion in poly1305-armv4 module] 705f569ca16SArd Biesheuvel 706f569ca16SArd Biesheuvel ushr $T0.2d,$ACC3,#26 707f569ca16SArd Biesheuvel xtn $H3,$ACC3 708f569ca16SArd Biesheuvel ushr $T1.2d,$ACC0,#26 709f569ca16SArd Biesheuvel and $ACC0,$ACC0,$MASK.2d 710f569ca16SArd Biesheuvel add $ACC4,$ACC4,$T0.2d // h3 -> h4 711f569ca16SArd Biesheuvel bic $H3,#0xfc,lsl#24 // &=0x03ffffff 712f569ca16SArd Biesheuvel add $ACC1,$ACC1,$T1.2d // h0 -> h1 713f569ca16SArd Biesheuvel 714f569ca16SArd Biesheuvel ushr $T0.2d,$ACC4,#26 715f569ca16SArd Biesheuvel xtn $H4,$ACC4 716f569ca16SArd Biesheuvel ushr $T1.2d,$ACC1,#26 717f569ca16SArd Biesheuvel xtn $H1,$ACC1 718f569ca16SArd Biesheuvel bic $H4,#0xfc,lsl#24 719f569ca16SArd Biesheuvel add $ACC2,$ACC2,$T1.2d // h1 -> h2 720f569ca16SArd Biesheuvel 721f569ca16SArd Biesheuvel add $ACC0,$ACC0,$T0.2d 722f569ca16SArd Biesheuvel shl $T0.2d,$T0.2d,#2 723f569ca16SArd Biesheuvel shrn $T1.2s,$ACC2,#26 724f569ca16SArd Biesheuvel xtn $H2,$ACC2 725f569ca16SArd Biesheuvel add $ACC0,$ACC0,$T0.2d // h4 -> h0 726f569ca16SArd Biesheuvel bic $H1,#0xfc,lsl#24 727f569ca16SArd Biesheuvel add $H3,$H3,$T1.2s // h2 -> h3 728f569ca16SArd Biesheuvel bic $H2,#0xfc,lsl#24 729f569ca16SArd Biesheuvel 730f569ca16SArd Biesheuvel shrn $T0.2s,$ACC0,#26 731f569ca16SArd Biesheuvel xtn $H0,$ACC0 732f569ca16SArd Biesheuvel ushr $T1.2s,$H3,#26 733f569ca16SArd Biesheuvel bic $H3,#0xfc,lsl#24 734f569ca16SArd Biesheuvel bic $H0,#0xfc,lsl#24 735f569ca16SArd Biesheuvel add $H1,$H1,$T0.2s // h0 -> h1 736f569ca16SArd Biesheuvel add $H4,$H4,$T1.2s // h3 -> 
h4 737f569ca16SArd Biesheuvel 738f569ca16SArd Biesheuvel b.hi .Loop_neon 739f569ca16SArd Biesheuvel 740f569ca16SArd Biesheuvel.Lskip_loop: 741f569ca16SArd Biesheuvel dup $IN23_2,${IN23_2}[0] 742f569ca16SArd Biesheuvel add $IN01_2,$IN01_2,$H2 743f569ca16SArd Biesheuvel 744f569ca16SArd Biesheuvel //////////////////////////////////////////////////////////////// 745f569ca16SArd Biesheuvel // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 746f569ca16SArd Biesheuvel 747f569ca16SArd Biesheuvel adds $len,$len,#32 748f569ca16SArd Biesheuvel b.ne .Long_tail 749f569ca16SArd Biesheuvel 750f569ca16SArd Biesheuvel dup $IN23_2,${IN01_2}[0] 751f569ca16SArd Biesheuvel add $IN23_0,$IN01_0,$H0 752f569ca16SArd Biesheuvel add $IN23_3,$IN01_3,$H3 753f569ca16SArd Biesheuvel add $IN23_1,$IN01_1,$H1 754f569ca16SArd Biesheuvel add $IN23_4,$IN01_4,$H4 755f569ca16SArd Biesheuvel 756f569ca16SArd Biesheuvel.Long_tail: 757f569ca16SArd Biesheuvel dup $IN23_0,${IN23_0}[0] 758f569ca16SArd Biesheuvel umull2 $ACC0,$IN23_2,${S3} 759f569ca16SArd Biesheuvel umull2 $ACC3,$IN23_2,${R1} 760f569ca16SArd Biesheuvel umull2 $ACC4,$IN23_2,${R2} 761f569ca16SArd Biesheuvel umull2 $ACC2,$IN23_2,${R0} 762f569ca16SArd Biesheuvel umull2 $ACC1,$IN23_2,${S4} 763f569ca16SArd Biesheuvel 764f569ca16SArd Biesheuvel dup $IN23_1,${IN23_1}[0] 765f569ca16SArd Biesheuvel umlal2 $ACC0,$IN23_0,${R0} 766f569ca16SArd Biesheuvel umlal2 $ACC2,$IN23_0,${R2} 767f569ca16SArd Biesheuvel umlal2 $ACC3,$IN23_0,${R3} 768f569ca16SArd Biesheuvel umlal2 $ACC4,$IN23_0,${R4} 769f569ca16SArd Biesheuvel umlal2 $ACC1,$IN23_0,${R1} 770f569ca16SArd Biesheuvel 771f569ca16SArd Biesheuvel dup $IN23_3,${IN23_3}[0] 772f569ca16SArd Biesheuvel umlal2 $ACC0,$IN23_1,${S4} 773f569ca16SArd Biesheuvel umlal2 $ACC3,$IN23_1,${R2} 774f569ca16SArd Biesheuvel umlal2 $ACC2,$IN23_1,${R1} 775f569ca16SArd Biesheuvel umlal2 $ACC4,$IN23_1,${R3} 776f569ca16SArd Biesheuvel umlal2 $ACC1,$IN23_1,${R0} 777f569ca16SArd Biesheuvel 778f569ca16SArd Biesheuvel dup 
$IN23_4,${IN23_4}[0] 779f569ca16SArd Biesheuvel umlal2 $ACC3,$IN23_3,${R0} 780f569ca16SArd Biesheuvel umlal2 $ACC4,$IN23_3,${R1} 781f569ca16SArd Biesheuvel umlal2 $ACC0,$IN23_3,${S2} 782f569ca16SArd Biesheuvel umlal2 $ACC1,$IN23_3,${S3} 783f569ca16SArd Biesheuvel umlal2 $ACC2,$IN23_3,${S4} 784f569ca16SArd Biesheuvel 785f569ca16SArd Biesheuvel umlal2 $ACC3,$IN23_4,${S4} 786f569ca16SArd Biesheuvel umlal2 $ACC0,$IN23_4,${S1} 787f569ca16SArd Biesheuvel umlal2 $ACC4,$IN23_4,${R0} 788f569ca16SArd Biesheuvel umlal2 $ACC1,$IN23_4,${S2} 789f569ca16SArd Biesheuvel umlal2 $ACC2,$IN23_4,${S3} 790f569ca16SArd Biesheuvel 791f569ca16SArd Biesheuvel b.eq .Lshort_tail 792f569ca16SArd Biesheuvel 793f569ca16SArd Biesheuvel //////////////////////////////////////////////////////////////// 794f569ca16SArd Biesheuvel // (hash+inp[0:1])*r^4:r^3 and accumulate 795f569ca16SArd Biesheuvel 796f569ca16SArd Biesheuvel add $IN01_0,$IN01_0,$H0 797f569ca16SArd Biesheuvel umlal $ACC3,$IN01_2,${R1} 798f569ca16SArd Biesheuvel umlal $ACC0,$IN01_2,${S3} 799f569ca16SArd Biesheuvel umlal $ACC4,$IN01_2,${R2} 800f569ca16SArd Biesheuvel umlal $ACC1,$IN01_2,${S4} 801f569ca16SArd Biesheuvel umlal $ACC2,$IN01_2,${R0} 802f569ca16SArd Biesheuvel 803f569ca16SArd Biesheuvel add $IN01_1,$IN01_1,$H1 804f569ca16SArd Biesheuvel umlal $ACC3,$IN01_0,${R3} 805f569ca16SArd Biesheuvel umlal $ACC0,$IN01_0,${R0} 806f569ca16SArd Biesheuvel umlal $ACC4,$IN01_0,${R4} 807f569ca16SArd Biesheuvel umlal $ACC1,$IN01_0,${R1} 808f569ca16SArd Biesheuvel umlal $ACC2,$IN01_0,${R2} 809f569ca16SArd Biesheuvel 810f569ca16SArd Biesheuvel add $IN01_3,$IN01_3,$H3 811f569ca16SArd Biesheuvel umlal $ACC3,$IN01_1,${R2} 812f569ca16SArd Biesheuvel umlal $ACC0,$IN01_1,${S4} 813f569ca16SArd Biesheuvel umlal $ACC4,$IN01_1,${R3} 814f569ca16SArd Biesheuvel umlal $ACC1,$IN01_1,${R0} 815f569ca16SArd Biesheuvel umlal $ACC2,$IN01_1,${R1} 816f569ca16SArd Biesheuvel 817f569ca16SArd Biesheuvel add $IN01_4,$IN01_4,$H4 818f569ca16SArd Biesheuvel umlal 
$ACC3,$IN01_3,${R0} 819f569ca16SArd Biesheuvel umlal $ACC0,$IN01_3,${S2} 820f569ca16SArd Biesheuvel umlal $ACC4,$IN01_3,${R1} 821f569ca16SArd Biesheuvel umlal $ACC1,$IN01_3,${S3} 822f569ca16SArd Biesheuvel umlal $ACC2,$IN01_3,${S4} 823f569ca16SArd Biesheuvel 824f569ca16SArd Biesheuvel umlal $ACC3,$IN01_4,${S4} 825f569ca16SArd Biesheuvel umlal $ACC0,$IN01_4,${S1} 826f569ca16SArd Biesheuvel umlal $ACC4,$IN01_4,${R0} 827f569ca16SArd Biesheuvel umlal $ACC1,$IN01_4,${S2} 828f569ca16SArd Biesheuvel umlal $ACC2,$IN01_4,${S3} 829f569ca16SArd Biesheuvel 830f569ca16SArd Biesheuvel.Lshort_tail: 831f569ca16SArd Biesheuvel //////////////////////////////////////////////////////////////// 832f569ca16SArd Biesheuvel // horizontal add 833f569ca16SArd Biesheuvel 834f569ca16SArd Biesheuvel addp $ACC3,$ACC3,$ACC3 835f569ca16SArd Biesheuvel ldp d8,d9,[sp,#16] // meet ABI requirements 836f569ca16SArd Biesheuvel addp $ACC0,$ACC0,$ACC0 837f569ca16SArd Biesheuvel ldp d10,d11,[sp,#32] 838f569ca16SArd Biesheuvel addp $ACC4,$ACC4,$ACC4 839f569ca16SArd Biesheuvel ldp d12,d13,[sp,#48] 840f569ca16SArd Biesheuvel addp $ACC1,$ACC1,$ACC1 841f569ca16SArd Biesheuvel ldp d14,d15,[sp,#64] 842f569ca16SArd Biesheuvel addp $ACC2,$ACC2,$ACC2 843f569ca16SArd Biesheuvel ldr x30,[sp,#8] 844f569ca16SArd Biesheuvel 845f569ca16SArd Biesheuvel //////////////////////////////////////////////////////////////// 846f569ca16SArd Biesheuvel // lazy reduction, but without narrowing 847f569ca16SArd Biesheuvel 848f569ca16SArd Biesheuvel ushr $T0.2d,$ACC3,#26 849f569ca16SArd Biesheuvel and $ACC3,$ACC3,$MASK.2d 850f569ca16SArd Biesheuvel ushr $T1.2d,$ACC0,#26 851f569ca16SArd Biesheuvel and $ACC0,$ACC0,$MASK.2d 852f569ca16SArd Biesheuvel 853f569ca16SArd Biesheuvel add $ACC4,$ACC4,$T0.2d // h3 -> h4 854f569ca16SArd Biesheuvel add $ACC1,$ACC1,$T1.2d // h0 -> h1 855f569ca16SArd Biesheuvel 856f569ca16SArd Biesheuvel ushr $T0.2d,$ACC4,#26 857f569ca16SArd Biesheuvel and $ACC4,$ACC4,$MASK.2d 858f569ca16SArd Biesheuvel ushr 
$T1.2d,$ACC1,#26 859f569ca16SArd Biesheuvel and $ACC1,$ACC1,$MASK.2d 860f569ca16SArd Biesheuvel add $ACC2,$ACC2,$T1.2d // h1 -> h2 861f569ca16SArd Biesheuvel 862f569ca16SArd Biesheuvel add $ACC0,$ACC0,$T0.2d 863f569ca16SArd Biesheuvel shl $T0.2d,$T0.2d,#2 864f569ca16SArd Biesheuvel ushr $T1.2d,$ACC2,#26 865f569ca16SArd Biesheuvel and $ACC2,$ACC2,$MASK.2d 866f569ca16SArd Biesheuvel add $ACC0,$ACC0,$T0.2d // h4 -> h0 867f569ca16SArd Biesheuvel add $ACC3,$ACC3,$T1.2d // h2 -> h3 868f569ca16SArd Biesheuvel 869f569ca16SArd Biesheuvel ushr $T0.2d,$ACC0,#26 870f569ca16SArd Biesheuvel and $ACC0,$ACC0,$MASK.2d 871f569ca16SArd Biesheuvel ushr $T1.2d,$ACC3,#26 872f569ca16SArd Biesheuvel and $ACC3,$ACC3,$MASK.2d 873f569ca16SArd Biesheuvel add $ACC1,$ACC1,$T0.2d // h0 -> h1 874f569ca16SArd Biesheuvel add $ACC4,$ACC4,$T1.2d // h3 -> h4 875f569ca16SArd Biesheuvel 876f569ca16SArd Biesheuvel //////////////////////////////////////////////////////////////// 877f569ca16SArd Biesheuvel // write the result, can be partially reduced 878f569ca16SArd Biesheuvel 879f569ca16SArd Biesheuvel st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16 880f569ca16SArd Biesheuvel mov x4,#1 881f569ca16SArd Biesheuvel st1 {$ACC4}[0],[$ctx] 882f569ca16SArd Biesheuvel str x4,[$ctx,#8] // set is_base2_26 883f569ca16SArd Biesheuvel 884f569ca16SArd Biesheuvel ldr x29,[sp],#80 885519a0d7eSArd Biesheuvel .inst 0xd50323bf // autiasp 886f569ca16SArd Biesheuvel ret 887f569ca16SArd Biesheuvel.size poly1305_blocks_neon,.-poly1305_blocks_neon 888f569ca16SArd Biesheuvel 889*9369693aSJia He.pushsection .rodata 890f569ca16SArd Biesheuvel.align 5 891f569ca16SArd Biesheuvel.Lzeros: 892f569ca16SArd Biesheuvel.long 0,0,0,0,0,0,0,0 893f569ca16SArd Biesheuvel.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm" 894*9369693aSJia He.popsection 895*9369693aSJia He 896f569ca16SArd Biesheuvel.align 2 897f569ca16SArd Biesheuvel#if !defined(__KERNEL__) && !defined(_WIN64) 898f569ca16SArd Biesheuvel.comm OPENSSL_armcap_P,4,4 899f569ca16SArd 
.hidden OPENSSL_armcap_P
#endif
___

# Post-process the generated assembly one line at a time and print it.
#
# The rules below patch the vector arrangement suffix (".2s", ".4s", ".2d",
# ...) that the interpolated register names carry, so that it fits the
# instruction on that line.  They are chained with "or": the first rule
# that applies ends the chain.  The "(s/.../.../g or 1)" form makes a rule
# count as applied whenever its mnemonic matched, even if the substitution
# itself changed nothing.
foreach (split("\n",$code)) {
	# shrn: a ".2d"/".4d" suffix directly after the first register
	# becomes ".2s".
	s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/			or
	# fmov vN<anything>,xM is rewritten to the scalar form "fmov dN,xM".
	s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/	or
	# dup: every ".2s"/".4s" becomes ".2d".
	(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1))			or
	# eor/and: every arrangement suffix becomes ".16b".
	# NOTE: there is no trailing \b, so this also fires on any word that
	# starts with "and"/"eor", including the word "and" inside comments;
	# that is harmless as long as such lines carry no arrangement suffix.
	(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1))	or
	# umull/umlal (the trailing \b excludes the "2" variants): ".4s"
	# becomes ".2s".
	(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1))		or
	# umull2/umlal2: ".2s" becomes ".4s".
	(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1))		or
	# st1..st4 with a lane index after the register list: ".2d"/".4d"
	# becomes ".s".  The literal braces are escaped, as perlre asks for
	# literal "{"; the pattern matches exactly the same text as before.
	(m/\bst[1-4]\s+\{[^}]+\}\[/ and (s/\.[24]d/.s/g or 1));

	# Applied to every line: drop the lane count in front of the first
	# lane index (".4s[" -> ".s[", ".2d[" -> ".d["), and turn "w#xN"
	# into "wN".
	s/\.[124]([sd])\[/.$1\[/;
	s/w#x([0-9]+)/w$1/g;

	print $_,"\n";
}

# STDOUT is either a pipe into arm-xlate.pl or the output file (see the
# open calls at the top of the file).  close() is where a failed flush, or
# a non-zero exit status of the piped child, is reported; ignoring it would
# let the script exit successfully after writing truncated or no output.
# For a piped handle whose only problem is the child's exit status, $! is 0
# and the wait status is in $?.
close STDOUT
	or die($! ? "error closing STDOUT: $!\n"
		  : "output filter exited with wait status $?\n");