// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
 */

#include <asm/asm.h>
#include <asm/regdef.h>
#include <linux/linkage.h>

.text

/*
 * OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3
 *
 * Emit "op dN, dN, sN" for N = 0..3, i.e. apply the same three-operand
 * instruction to four destination registers, each destination also being
 * the first source operand. The second source may be a register or, for
 * instructions such as rotri.w, an immediate.
 *
 * Note that s0..s3 here are macro parameter names, not the s0..s3
 * registers.
 */
.macro	OP_4REG	op d0 d1 d2 d3 s0 s1 s2 s3
	\op	\d0, \d0, \s0
	\op	\d1, \d1, \s1
	\op	\d2, \d2, \s2
	\op	\d3, \d3, \s3
.endm

/*
 * Very basic LoongArch implementation of ChaCha20. Produces a given positive
 * number of blocks of output with a nonce of 0, taking an input key and
 * 8-byte counter. Importantly does not spill to the stack. Its arguments
 * are:
 *
 *	a0: output bytes
 *	a1: 32-byte key input
 *	a2: 8-byte counter input/output
 *	a3: number of 64-byte blocks to write to output
 *
 * The block count is only tested after a block has been written (the outer
 * loop is do-while shaped), so a3 must be non-zero on entry.
 *
 * The 16 words of working state live entirely in registers. The only stack
 * traffic is saving and restoring the caller's s0-s9.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* We don't need a frame pointer, so fp is used as a tenth saved register */
#define s9		fp

/* Arguments */
#define output		a0
#define key		a1
#define counter		a2
#define nblocks		a3
/* Double-round loop counter */
#define i		a4
/* state[0..9] occupy s0-s9, which are saved in the prologue */
#define state0		s0
#define state1		s1
#define state2		s2
#define state3		s3
#define state4		s4
#define state5		s5
#define state6		s6
#define state7		s7
#define state8		s8
#define state9		s9
/* state[10..15] occupy a5-a7 and t0-t2, which are zeroed before returning */
#define state10		a5
#define state11		a6
#define state12		a7
#define state13		t0
#define state14		t1
#define state15		t2
/* Running 64-bit block counter, kept as two 32-bit halves */
#define cnt_lo		t3
#define cnt_hi		t4
/* The four constant words "expand 32-byte k", reused for every block */
#define copy0		t5
#define copy1		t6
#define copy2		t7
#define copy3		t8

/* Packs to be used with OP_4REG */
#define line0		state0, state1, state2, state3
#define line1		state4, state5, state6, state7
#define line2		state8, state9, state10, state11
#define line3		state12, state13, state14, state15

/*
 * Rows 1-3 rotated left by 1, 2 and 3 positions respectively. Pairing them
 * with line0 makes OP_4REG operate on the diagonals (0,5,10,15),
 * (1,6,11,12), (2,7,8,13) and (3,4,9,14) without moving any data.
 */
#define	line1_perm	state5, state6, state7, state4
#define	line2_perm	state10, state11, state8, state9
#define	line3_perm	state15, state12, state13, state14

#define	copy		copy0, copy1, copy2, copy3

/*
 * Rotate amounts, replicated four times for OP_4REG. They are used with
 * rotri.w (rotate right), so on 32-bit words 16/20/24/25 correspond to
 * left rotations by 16/12/8/7.
 */
#define _16		16, 16, 16, 16
#define _20		20, 20, 20, 20
#define _24		24, 24, 24, 24
#define _25		25, 25, 25, 25

	/*
	 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
	 * This does not violate the stack-less requirement: no sensitive data
	 * is spilled onto the stack.
	 */
	PTR_ADDI	sp, sp, (-SZREG * 10) & STACK_ALIGN
	REG_S		s0, sp, 0
	REG_S		s1, sp, SZREG
	REG_S		s2, sp, SZREG * 2
	REG_S		s3, sp, SZREG * 3
	REG_S		s4, sp, SZREG * 4
	REG_S		s5, sp, SZREG * 5
	REG_S		s6, sp, SZREG * 6
	REG_S		s7, sp, SZREG * 7
	REG_S		s8, sp, SZREG * 8
	REG_S		s9, sp, SZREG * 9

	/* "expa", "nd 3", "2-by", "te k" as little-endian 32-bit words */
	li.w		copy0, 0x61707865
	li.w		copy1, 0x3320646e
	li.w		copy2, 0x79622d32
	li.w		copy3, 0x6b206574

	/* Load the 8-byte counter once; it is written back after the last block */
	ld.w		cnt_lo, counter, 0
	ld.w		cnt_hi, counter, 4

	/* One iteration of this loop produces one 64-byte output block */
.Lblock:
	/* state[0,1,2,3] = "expand 32-byte k" */
	move		state0, copy0
	move		state1, copy1
	move		state2, copy2
	move		state3, copy3

	/* state[4,5,..,11] = key */
	ld.w		state4, key, 0
	ld.w		state5, key, 4
	ld.w		state6, key, 8
	ld.w		state7, key, 12
	ld.w		state8, key, 16
	ld.w		state9, key, 20
	ld.w		state10, key, 24
	ld.w		state11, key, 28

	/* state[12,13] = counter */
	move		state12, cnt_lo
	move		state13, cnt_hi

	/* state[14,15] = 0 */
	move		state14, zero
	move		state15, zero

	/* 10 iterations of (odd round + even round) = 20 rounds */
	li.w		i, 10
.Lpermute:
	/* odd round: four quarter-rounds on the columns */
	/* a += b; d ^= a; d = rotl32(d, 16) */
	OP_4REG	add.w	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	rotri.w	line3, _16

	/* c += d; b ^= c; b = rotl32(b, 12) */
	OP_4REG	add.w	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	rotri.w	line1, _20

	/* a += b; d ^= a; d = rotl32(d, 8) */
	OP_4REG	add.w	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	rotri.w	line3, _24

	/* c += d; b ^= c; b = rotl32(b, 7) */
	OP_4REG	add.w	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	rotri.w	line1, _25

	/* even round: the same four steps on the diagonals */
	OP_4REG	add.w	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	rotri.w	line3_perm, _16

	OP_4REG	add.w	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	rotri.w	line1_perm, _20

	OP_4REG	add.w	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	rotri.w	line3_perm, _24

	OP_4REG	add.w	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	rotri.w	line1_perm, _25

	addi.w		i, i, -1
	bnez		i, .Lpermute

	/*
	 * Feed-forward: add the block's initial state to the permuted state
	 * and store the sums as output.
	 */

	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
	OP_4REG	add.w	line0, copy
	st.w		state0, output, 0
	st.w		state1, output, 4
	st.w		state2, output, 8
	st.w		state3, output, 12

	/* from now on state[0,1,2,3] are scratch registers */

	/*
	 * The key words are re-read from memory into the scratch registers
	 * rather than being kept in dedicated registers across the rounds.
	 */

	/* state[0,1,2,3] = lo32(key) */
	ld.w		state0, key, 0
	ld.w		state1, key, 4
	ld.w		state2, key, 8
	ld.w		state3, key, 12

	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
	OP_4REG	add.w	line1, line0
	st.w		state4, output, 16
	st.w		state5, output, 20
	st.w		state6, output, 24
	st.w		state7, output, 28

	/* state[0,1,2,3] = hi32(key) */
	ld.w		state0, key, 16
	ld.w		state1, key, 20
	ld.w		state2, key, 24
	ld.w		state3, key, 28

	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
	OP_4REG	add.w	line2, line0
	st.w		state8, output, 32
	st.w		state9, output, 36
	st.w		state10, output, 40
	st.w		state11, output, 44

	/*
	 * output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0]
	 *
	 * state[14,15] started as zero, so they are stored without an add.
	 */
	add.w		state12, state12, cnt_lo
	add.w		state13, state13, cnt_hi
	st.w		state12, output, 48
	st.w		state13, output, 52
	st.w		state14, output, 56
	st.w		state15, output, 60

	/*
	 * ++counter
	 *
	 * sltui yields 1 exactly when the incremented cnt_lo is 0, i.e. when
	 * the low half wrapped around; that carry is added into cnt_hi.
	 */
	addi.w		cnt_lo, cnt_lo, 1
	sltui		state0, cnt_lo, 1
	add.w		cnt_hi, cnt_hi, state0

	/* output += 64 */
	PTR_ADDI	output, output, 64
	/* --nblocks */
	PTR_ADDI	nblocks, nblocks, -1
	bnez		nblocks, .Lblock

	/* counter = [cnt_lo, cnt_hi] */
	st.w		cnt_lo, counter, 0
	st.w		cnt_hi, counter, 4

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. As of now copy[0,1,2,3] just contains "expand 32-byte k" and
	 * state[0,...,9] are s0-s9 which we'll restore in the epilogue, so we
	 * only need to zero state[10,...,15].
	 */
	move		state10, zero
	move		state11, zero
	move		state12, zero
	move		state13, zero
	move		state14, zero
	move		state15, zero

	/* Restore the caller's s0-s9 and undo the prologue's sp adjustment */
	REG_L		s0, sp, 0
	REG_L		s1, sp, SZREG
	REG_L		s2, sp, SZREG * 2
	REG_L		s3, sp, SZREG * 3
	REG_L		s4, sp, SZREG * 4
	REG_L		s5, sp, SZREG * 5
	REG_L		s6, sp, SZREG * 6
	REG_L		s7, sp, SZREG * 7
	REG_L		s8, sp, SZREG * 8
	REG_L		s9, sp, SZREG * 9
	PTR_ADDI	sp, sp, -((-SZREG * 10) & STACK_ALIGN)

	jr		ra
SYM_FUNC_END(__arch_chacha20_blocks_nostack)