// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
 */

#include <asm/asm.h>
#include <asm/regdef.h>
#include <linux/linkage.h>

.text

/*
 * Apply the three-operand instruction "op" to four destination/source
 * pairs: d_n = d_n op s_n for n = 0..3.  The source operands may be
 * registers or immediates, depending on what "op" accepts.
 */
.macro	OP_4REG	op d0 d1 d2 d3 s0 s1 s2 s3
	\op	\d0, \d0, \s0
	\op	\d1, \d1, \s1
	\op	\d2, \d2, \s2
	\op	\d3, \d3, \s3
.endm

/*
 * Very basic LoongArch implementation of ChaCha20. Produces a given positive
 * number of blocks of output with a nonce of 0, taking an input key and
 * 8-byte counter. Importantly does not spill to the stack. Its arguments
 * are:
 *
 *	a0: output bytes
 *	a1: 32-byte key input
 *	a2: 8-byte counter input/output
 *	a3: number of 64-byte blocks to write to output
 *
 * The block count is only tested after a block has been produced, so a3
 * must be non-zero.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* We don't need a frame pointer */
#define s9		fp

#define output		a0
#define key		a1
#define counter		a2
#define nblocks		a3
#define i		a4
#define state0		s0
#define state1		s1
#define state2		s2
#define state3		s3
#define state4		s4
#define state5		s5
#define state6		s6
#define state7		s7
#define state8		s8
#define state9		s9
#define state10		a5
#define state11		a6
#define state12		a7
#define state13		t0
#define state14		t1
#define state15		t2
#define cnt_lo		t3
#define cnt_hi		t4
#define copy0		t5
#define copy1		t6
#define copy2		t7
#define copy3		t8

/* Packs to be used with OP_4REG */
#define line0		state0, state1, state2, state3
#define line1		state4, state5, state6, state7
#define line2		state8, state9, state10, state11
#define line3		state12, state13, state14, state15

/* The same rows, rotated so that OP_4REG works on the diagonals */
#define	line1_perm	state5, state6, state7, state4
#define	line2_perm	state10, state11, state8, state9
#define	line3_perm	state15, state12, state13, state14

#define	copy		copy0, copy1, copy2, copy3

/*
 * Rotate amounts for rotri.w, which rotates right: ChaCha's left rotations
 * by 16, 12, 8 and 7 bits become right rotations by 16, 20, 24 and 25.
 */
#define _16		16, 16, 16, 16
#define _20		20, 20, 20, 20
#define _24		24, 24, 24, 24
#define _25		25, 25, 25, 25

	/*
	 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
	 * This does not violate the stack-less requirement: no sensitive data
	 * is spilled onto the stack.
	 */
	PTR_ADDI	sp, sp, (-SZREG * 10) & STACK_ALIGN
	REG_S		s0, sp, 0
	REG_S		s1, sp, SZREG
	REG_S		s2, sp, SZREG * 2
	REG_S		s3, sp, SZREG * 3
	REG_S		s4, sp, SZREG * 4
	REG_S		s5, sp, SZREG * 5
	REG_S		s6, sp, SZREG * 6
	REG_S		s7, sp, SZREG * 7
	REG_S		s8, sp, SZREG * 8
	REG_S		s9, sp, SZREG * 9

	/* copy[0,1,2,3] = "expand 32-byte k", kept for the whole call */
	li.w		copy0, 0x61707865
	li.w		copy1, 0x3320646e
	li.w		copy2, 0x79622d32
	li.w		copy3, 0x6b206574

	ld.w		cnt_lo, counter, 0
	ld.w		cnt_hi, counter, 4

.Lblock:
	/* state[0,1,2,3] = "expand 32-byte k" */
	move		state0, copy0
	move		state1, copy1
	move		state2, copy2
	move		state3, copy3

	/* state[4,5,..,11] = key */
	ld.w		state4, key, 0
	ld.w		state5, key, 4
	ld.w		state6, key, 8
	ld.w		state7, key, 12
	ld.w		state8, key, 16
	ld.w		state9, key, 20
	ld.w		state10, key, 24
	ld.w		state11, key, 28

	/* state[12,13] = counter */
	move		state12, cnt_lo
	move		state13, cnt_hi

	/* state[14,15] = 0 */
	move		state14, zero
	move		state15, zero

	/* 10 iterations of (odd round + even round) = 20 rounds */
	li.w		i, 10
.Lpermute:
	/* odd round: four quarter-rounds on the columns */
	OP_4REG	add.w	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	rotri.w	line3, _16

	OP_4REG	add.w	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	rotri.w	line1, _20

	OP_4REG	add.w	line0, line1
	OP_4REG	xor	line3, line0
	OP_4REG	rotri.w	line3, _24

	OP_4REG	add.w	line2, line3
	OP_4REG	xor	line1, line2
	OP_4REG	rotri.w	line1, _25

	/* even round: four quarter-rounds on the diagonals */
	OP_4REG	add.w	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	rotri.w	line3_perm, _16

	OP_4REG	add.w	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	rotri.w	line1_perm, _20

	OP_4REG	add.w	line0, line1_perm
	OP_4REG	xor	line3_perm, line0
	OP_4REG	rotri.w	line3_perm, _24

	OP_4REG	add.w	line2_perm, line3_perm
	OP_4REG	xor	line1_perm, line2_perm
	OP_4REG	rotri.w	line1_perm, _25

	addi.w		i, i, -1
	bnez		i, .Lpermute

	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
	OP_4REG	add.w	line0, copy
	st.w		state0, output, 0
	st.w		state1, output, 4
	st.w		state2, output, 8
	st.w		state3, output, 12

	/* from now on state[0,1,2,3] are scratch registers */

	/* state[0,1,2,3] = lo32(key) */
	ld.w		state0, key, 0
	ld.w		state1, key, 4
	ld.w		state2, key, 8
	ld.w		state3, key, 12

	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
	OP_4REG	add.w	line1, line0
	st.w		state4, output, 16
	st.w		state5, output, 20
	st.w		state6, output, 24
	st.w		state7, output, 28

	/* state[0,1,2,3] = hi32(key) */
	ld.w		state0, key, 16
	ld.w		state1, key, 20
	ld.w		state2, key, 24
	ld.w		state3, key, 28

	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
	OP_4REG	add.w	line2, line0
	st.w		state8, output, 32
	st.w		state9, output, 36
	st.w		state10, output, 40
	st.w		state11, output, 44

	/* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
	add.w		state12, state12, cnt_lo
	add.w		state13, state13, cnt_hi
	st.w		state12, output, 48
	st.w		state13, output, 52
	st.w		state14, output, 56
	st.w		state15, output, 60

	/*
	 * ++counter: sltui yields 1 only when cnt_lo has wrapped to 0, which
	 * is the carry to add into cnt_hi.
	 */
	addi.w		cnt_lo, cnt_lo, 1
	sltui		state0, cnt_lo, 1
	add.w		cnt_hi, cnt_hi, state0

	/* output += 64 */
	PTR_ADDI	output, output, 64
	/* --nblocks */
	PTR_ADDI	nblocks, nblocks, -1
	bnez		nblocks, .Lblock

	/* counter = [cnt_lo, cnt_hi] */
	st.w		cnt_lo, counter, 0
	st.w		cnt_hi, counter, 4

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. At this point copy[0,1,2,3] just contains "expand 32-byte k"
	 * and state[0,...,9] live in s0-s9, which we restore in the epilogue,
	 * so we only need to zero state[10,...,15].
	 */
	move		state10, zero
	move		state11, zero
	move		state12, zero
	move		state13, zero
	move		state14, zero
	move		state15, zero

	REG_L		s0, sp, 0
	REG_L		s1, sp, SZREG
	REG_L		s2, sp, SZREG * 2
	REG_L		s3, sp, SZREG * 3
	REG_L		s4, sp, SZREG * 4
	REG_L		s5, sp, SZREG * 5
	REG_L		s6, sp, SZREG * 6
	REG_L		s7, sp, SZREG * 7
	REG_L		s8, sp, SZREG * 8
	REG_L		s9, sp, SZREG * 9
	/* Undo the prologue's stack adjustment */
	PTR_ADDI	sp, sp, -((-SZREG * 10) & STACK_ALIGN)

	jr		ra
SYM_FUNC_END(__arch_chacha20_blocks_nostack)