/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON vmull.p8 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.fpu		neon

	/* GHASH state and operands (q registers; dN/dN+1 are the halves) */
	SHASH		.req	q0	@ hash key H
	T1		.req	q1	@ scratch / current input block
	XL		.req	q2	@ low half of product / digest
	XM		.req	q3	@ middle (Karatsuba) partial product
	XH		.req	q4	@ high half of product
	IN1		.req	q4	@ overlaps XH: only live before XH is written

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	/* scratch for the vmull.p8 partial products */
	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9

	/* precomputed byte-rotations of SHASH_L/SHASH_H (see entry point) */
	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	SHASH2_p8	.req	d28	@ SHASH_L ^ SHASH_H, for the middle multiply

	/* constant masks used to trim the shifted partial products */
	k16		.req	d29
	k32		.req	d30
	k48		.req	d31

	T2		.req	q7	@ overlaps t2q: only used in the reduction

	.text

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */

	/*
	 * __pmull_p8 - carry-less multiply: \rq = \ad * \bd (64x64 -> 128 bit)
	 *
	 * Built from eight 8x8 -> 16 bit vmull.p8 multiplies whose results
	 * are masked, shifted and summed per the paper cited above.
	 *
	 * \b1..\b4 may name registers holding \bd rotated left by 1..4
	 * bytes.  When the caller supplies precomputed rotations (the
	 * s1l..s4l / s1h..s4h aliases), the .ifc tests skip the in-line
	 * vext.8 recomputation; with the default arguments the rotations
	 * are produced here in t4l/t3l.
	 *
	 * Clobbers t0q-t4q; reads the k16/k32/k48 masks.
	 */
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48		@ trim carries that wrapped around
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15	@ shift partial products into place
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q		@ accumulate everything into D
	veor		\rq, \rq, t2q
	.endm

	/*
	 * __pmull_reduce_p8 - reduce the 256-bit product held in XH:XM:XL
	 * modulo the GHASH field polynomial.  Folds XM into XL/XH first,
	 * then shifts.  The reduction is completed by the caller, which
	 * must XOR T1 and XH into XL afterwards.  Clobbers T1 and T2.
	 */
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L	@ fold middle product into XL:XH
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

	/*
	 * vrev64_if_be - byte-swap each 64-bit lane of \a on big-endian
	 * builds; expands to nothing on little-endian.
	 */
	.macro		vrev64_if_be	a
#ifdef CONFIG_CPU_BIG_ENDIAN
	vrev64.8	\a, \a
#endif
	.endm

	/*
	 * ghash_update - hash r0 16-byte blocks from r2 (post-incremented)
	 * into the digest state, which is loaded from [r1] into XL on
	 * entry and left in XL on exit (the caller stores it back).
	 *
	 * Each iteration XORs the byte-reversed input block into the
	 * digest and multiplies by SHASH via Karatsuba: a1*b1 (XH),
	 * a0*b0 (XL) and (a1+a0)*(b1+b0) (XM), followed by the modular
	 * reduction.
	 */
	.macro		ghash_update
	vld1.64		{XL}, [r1]
	vrev64_if_be	XL

0:	vld1.8		{T1}, [r2]!
	subs		r0, r0, #1

	/* multiply XL by SHASH in GF(2^128) */
	vrev64.8	T1, T1

	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_p8	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_p8	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_p8	XM, T1_L, SHASH2_p8			@ (a1+a0)(b1+b0)

	veor		T1, XL, XH
	veor		XM, XM, T1		@ Karatsuba: subtract outer products

	__pmull_reduce_p8

	veor		T1, T1, XH		@ finish the reduction
	veor		XL, XL, T1

	bne		0b
	.endm

	/*
	 * void pmull_ghash_update_p8(size_t blocks, struct polyval_elem *dg,
	 *			      const u8 *src,
	 *			      const struct polyval_elem *h)
	 *
	 * r0: number of 16-byte blocks to process
	 * r1: in/out digest (16 bytes)
	 * r2: source data
	 * r3: hash key H
	 */
ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	vrev64_if_be	SHASH
	veor		SHASH2_p8, SHASH_L, SHASH_H	@ b1 + b0 for Karatsuba

	/*
	 * Precompute all four byte-rotations of each key half so the
	 * __pmull_p8 invocations in the loop can skip the vext.8 steps.
	 */
	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	/* masks used by __pmull_p8 to trim the shifted partial products */
	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update
	vrev64_if_be	XL
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p8)