/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
***************************************************************************/

/*
 * Syntax:  GNU as, AT&T operand order (source, destination), run through
 *          the C preprocessor (the round bodies are cpp macros).
 * Target:  x86-64.
 *
 * Register conventions shared by everything in this file:
 *   %r11        base address of the crypto context (copied from %rdi)
 *   %edi/%rdi   scratch: zero-extended byte used as S-box index
 *   %r8d, %r9d  scratch: the two 32-bit values accumulated from the S-box
 *               lookups of one round
 *   %r10        scratch: written only by the *_last_round macros, where it
 *               receives the first 64-bit half of the result block
 *   R0..R3      %rax, %rbx, %rcx, %rdx: the four 32-bit words of the block.
 *               These are the legacy registers that have an addressable
 *               high byte (%ah, %bh, %ch, %dh), which the round macros use.
 */

.file "twofish-x86_64-asm.S"
.text

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/asm-offsets.h>

/* Byte offsets of the four 32-bit words a, b, c, d inside a 16-byte group */
#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/* Structure of the crypto context struct (byte offsets from its start) */

#define s0	0	/* S0 Array 256 Words each */
#define s1	1024	/* S1 Array */
#define s2	2048	/* S2 Array */
#define s3	3072	/* S3 Array */
#define w	4096	/* 8 whitening keys (word) */
#define k	4128	/* key 1-32 ( word ) */

/* define a few register aliases to allow macro substitution */
/* The round macros paste a B, H or D suffix onto R0..R3 to select the */
/* low byte, the high byte or the 32-bit view of the same register.    */

#define R0	%rax
#define R0D	%eax
#define R0B	%al
#define R0H	%ah

#define R1	%rbx
#define R1D	%ebx
#define R1B	%bl
#define R1H	%bh

#define R2	%rcx
#define R2D	%ecx
#define R2B	%cl
#define R2H	%ch

#define R3	%rdx
#define R3D	%edx
#define R3B	%dl
#define R3H	%dh


/*
 * performs input whitening
 * xors the whitening key material at w+offset(context) into src.
 * Both callers in this file pass a 64-bit register as src, so two
 * adjacent whitening words are applied per invocation.
 */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/*
 * performs output whitening
 * same as input_whitening, but uses the whitening key material that
 * starts 16 bytes after w.
 */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;


/*
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 *
 * round is the byte offset of this round's two key words relative to k.
 * Expects %r11 to hold the context address; clobbers %rdi, %r8d, %r9d.
 *
 * Each movzx writes %edi, which also clears the upper half of %rdi, so
 * %rdi can be used directly as the table index.
 *
 * On exit a is no longer rotated (ror $16), b has been rotated right by
 * 16 + 15 = 31 bits, which on a 32-bit register equals rol $1, and c has
 * been rotated left by 15.  The callers swap the (a,b) and (c,d) pairs on
 * every round, so these are the forms the next round expects for its
 * d ("already rol $1") and a ("rotated 16") inputs.
 */
#define encrypt_round(a,b,c,d,round)\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;

/*
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 *
 * Same register contract as encrypt_round, and additionally writes %r10:
 * b is copied into the upper 32 bits of %r10 before it is rotated, and a
 * is xored into the lower 32 bits after its ror $16 has undone the
 * 16-bit rotation.  b is not given the extra ror $15 here and c gets
 * ror $1 instead of rol $15, since no further round follows.
 */
#define encrypt_last_round(a,b,c,d,round)\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	xor	a,		%r10;\
	movzx	b ## H,		%edi;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D

/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 *
 * round is the byte offset of this round's two key words relative to k.
 * Expects %r11 to hold the context address; clobbers %rdi, %r8d, %r9d.
 *
 * On exit a has been rotated right by 16 + 15 = 31 bits (equal to
 * rol $1 on a 32-bit register), b is no longer rotated (ror $16) and d
 * has been rotated left by 15, which prepares them for the next round,
 * where the callers swap the (a,b) and (c,d) pairs.
 */
#define decrypt_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	rol	$15,		d ## D;

/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 *
 * Same register contract as decrypt_round, and additionally writes %r10:
 * once b's ror $16 has undone its 16-bit rotation, b is copied into the
 * upper 32 bits of %r10 and a (not yet rotated) is xored into the lower
 * 32 bits.  d gets ror $1 instead of rol $15, since no further round
 * follows.
 */
#define decrypt_last_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	xor	a,		%r10;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	xor	s3(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	ror	$1,		d ## D;

/*
 * twofish_enc_blk - encrypt one 16-byte block
 *
 * In:    %rdi = ctx address
 *        %rsi = output address (16 bytes are written)
 *        %rdx = input address  (16 bytes are read)
 * Out:   %eax = 1
 * Saves: %rbx (R1) is pushed on entry and popped before returning
 * Clobb: %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
 */
SYM_TYPED_FUNC_START(twofish_enc_blk)
	pushq	R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,		%r11

	/* load the block as two 64-bit halves; R3 is %rdx, the input
	pointer, so it has to be overwritten last */
	movq	(R3),	R1
	movq	8(R3),	R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)
	/* split the halves into four words in the form the first round
	expects: R0 = a rotated 16, R1 = b, R2 = c, R3 = d rol $1 */
	mov	R1D,	R0D
	rol	$16,	R0D
	shr	$32,	R1
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R3D

	/* 16 rounds; the (a,b) and (c,d) register pairs swap every round
	and the last argument steps through the key words 8 bytes at a time */
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);


	/* first output half was assembled in %r10 by the last round */
	output_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* second output half: R1 in the upper 32 bits, R0 in the lower */
	shl	$32,	R1
	xor	R0,	R1

	output_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	RET
SYM_FUNC_END(twofish_enc_blk)

/*
 * twofish_dec_blk - decrypt one 16-byte block
 *
 * In:    %rdi = ctx address
 *        %rsi = output address (16 bytes are written)
 *        %rdx = input address  (16 bytes are read)
 * Out:   %eax = 1
 * Saves: %rbx (R1) is pushed on entry and popped before returning
 * Clobb: %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
 */
SYM_TYPED_FUNC_START(twofish_dec_blk)
	pushq	R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,		%r11

	/* load the block as two 64-bit halves; R3 is %rdx, the input
	pointer, so it has to be overwritten last */
	movq	(R3),	R1
	movq	8(R3),	R3
	/* decryption starts by removing the output whitening */
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)
	/* split the halves into four words in the form the first round
	expects: R0 = a, R1 = b rotated 16, R2 = c rol $1, R3 = d */
	mov	R1D,	R0D
	shr	$32,	R1
	rol	$16,	R1D
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R2D

	/* 16 rounds with the key words taken in the reverse order of
	twofish_enc_blk; the register pairs swap every round */
	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	/* first output half was assembled in %r10 by the last round;
	decryption finishes by removing the input whitening */
	input_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* second output half: R1 in the upper 32 bits, R0 in the lower */
	shl	$32,	R1
	xor	R0,	R1

	input_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	RET
SYM_FUNC_END(twofish_dec_blk)