xref: /linux/drivers/net/ethernet/netronome/nfp/bpf/jit.c (revision f5ad4101009e7f5f5984ffea6923d4fcd470932a)
1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
2 /* Copyright (C) 2016-2018 Netronome Systems, Inc. */
3 
4 #define pr_fmt(fmt)	"NFP net bpf: " fmt
5 
6 #include <linux/bug.h>
7 #include <linux/bpf.h>
8 #include <linux/filter.h>
9 #include <linux/kernel.h>
10 #include <linux/pkt_cls.h>
11 #include <linux/reciprocal_div.h>
12 #include <linux/unistd.h>
13 
14 #include "main.h"
15 #include "../nfp_asm.h"
16 #include "../nfp_net_ctrl.h"
17 
18 /* --- NFP prog --- */
/* Foreach "multiple" entries macros provide pos and next<n> pointers.
 * It's safe to modify the next pointers (but not pos).
 * Iteration stops as soon as any cursor reaches the list head, i.e. when
 * fewer than 2 (resp. 3) entries remain in the walk window.
 */
#define nfp_for_each_insn_walk2(nfp_prog, pos, next)			\
	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
	     next = list_next_entry(pos, l);			\
	     &(nfp_prog)->insns != &pos->l &&			\
	     &(nfp_prog)->insns != &next->l;			\
	     pos = nfp_meta_next(pos),				\
	     next = nfp_meta_next(pos))

#define nfp_for_each_insn_walk3(nfp_prog, pos, next, next2)		\
	for (pos = list_first_entry(&(nfp_prog)->insns, typeof(*pos), l), \
	     next = list_next_entry(pos, l),			\
	     next2 = list_next_entry(next, l);			\
	     &(nfp_prog)->insns != &pos->l &&			\
	     &(nfp_prog)->insns != &next->l &&			\
	     &(nfp_prog)->insns != &next2->l;			\
	     pos = nfp_meta_next(pos),				\
	     next = nfp_meta_next(pos),				\
	     next2 = nfp_meta_next(next))
40 
41 static bool
42 nfp_meta_has_prev(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
43 {
44 	return meta->l.prev != &nfp_prog->insns;
45 }
46 
47 static void nfp_prog_push(struct nfp_prog *nfp_prog, u64 insn)
48 {
49 	if (nfp_prog->__prog_alloc_len / sizeof(u64) == nfp_prog->prog_len) {
50 		pr_warn("instruction limit reached (%u NFP instructions)\n",
51 			nfp_prog->prog_len);
52 		nfp_prog->error = -ENOSPC;
53 		return;
54 	}
55 
56 	nfp_prog->prog[nfp_prog->prog_len] = insn;
57 	nfp_prog->prog_len++;
58 }
59 
/* Current emit position, in NFP instruction words. */
static unsigned int nfp_prog_current_offset(struct nfp_prog *nfp_prog)
{
	return nfp_prog->prog_len;
}
64 
/* Check that the emitter really is at offset @off; warns (once) and
 * returns false on an unexpected mismatch.
 */
static bool
nfp_prog_confirm_current_offset(struct nfp_prog *nfp_prog, unsigned int off)
{
	/* If there is a recorded error we may have dropped instructions;
	 * that doesn't have to be due to translator bug, and the translation
	 * will fail anyway, so just return OK.
	 */
	if (nfp_prog->error)
		return true;
	return !WARN_ON_ONCE(nfp_prog_current_offset(nfp_prog) != off);
}
76 
77 /* --- Emitters --- */
/* Pack and push one CPP bus command instruction. */
static void
__emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
	   u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, enum cmd_ctx_swap ctx,
	   bool indir)
{
	u64 insn;

	insn =	FIELD_PREP(OP_CMD_A_SRC, areg) |
		FIELD_PREP(OP_CMD_CTX, ctx) |
		FIELD_PREP(OP_CMD_B_SRC, breg) |
		FIELD_PREP(OP_CMD_TOKEN, cmd_tgt_act[op].token) |
		FIELD_PREP(OP_CMD_XFER, xfer) |
		FIELD_PREP(OP_CMD_CNT, size) |
		/* Request a completion signal whenever a ctx swap is used */
		FIELD_PREP(OP_CMD_SIG, ctx != CMD_CTX_NO_SWAP) |
		FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) |
		FIELD_PREP(OP_CMD_INDIR, indir) |
		FIELD_PREP(OP_CMD_MODE, mode);

	nfp_prog_push(nfp_prog, insn);
}
98 
/* Encode the swreg operands and emit a command instruction.
 * Command operands are restricted and may neither swap nor use LMextn;
 * any violation is recorded in nfp_prog->error.
 */
static void
emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
	     swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx, bool indir)
{
	struct nfp_insn_re_regs reg;
	int err;

	err = swreg_to_restricted(reg_none(), lreg, rreg, &reg, false);
	if (err) {
		nfp_prog->error = err;
		return;
	}
	if (reg.swap) {
		pr_err("cmd can't swap arguments\n");
		nfp_prog->error = -EFAULT;
		return;
	}
	if (reg.dst_lmextn || reg.src_lmextn) {
		pr_err("cmd can't use LMextn\n");
		nfp_prog->error = -EFAULT;
		return;
	}

	__emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, ctx,
		   indir);
}
125 
/* Emit a direct-reference (non-indirect) command. */
static void
emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
	 swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
{
	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, false);
}
132 
/* Emit an indirect-reference command (parameters come from PREV_ALU). */
static void
emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
	       swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
{
	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, true);
}
139 
/* Pack and push a branch instruction.  @addr is split into the low bits
 * that fit OP_BR_ADDR_LO, with OP_BR_ADDR_HI set iff any higher bit is
 * non-zero.
 */
static void
__emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, enum br_ev_pip ev_pip,
	  enum br_ctx_signal_state css, u16 addr, u8 defer)
{
	u16 addr_lo, addr_hi;
	u64 insn;

	addr_lo = addr & (OP_BR_ADDR_LO >> __bf_shf(OP_BR_ADDR_LO));
	addr_hi = addr != addr_lo;

	insn = OP_BR_BASE |
		FIELD_PREP(OP_BR_MASK, mask) |
		FIELD_PREP(OP_BR_EV_PIP, ev_pip) |
		FIELD_PREP(OP_BR_CSS, css) |
		FIELD_PREP(OP_BR_DEFBR, defer) |
		FIELD_PREP(OP_BR_ADDR_LO, addr_lo) |
		FIELD_PREP(OP_BR_ADDR_HI, addr_hi);

	nfp_prog_push(nfp_prog, insn);
}
160 
161 static void
162 emit_br_relo(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer,
163 	     enum nfp_relo_type relo)
164 {
165 	if (mask == BR_UNC && defer > 2) {
166 		pr_err("BUG: branch defer out of bounds %d\n", defer);
167 		nfp_prog->error = -EFAULT;
168 		return;
169 	}
170 
171 	__emit_br(nfp_prog, mask,
172 		  mask != BR_UNC ? BR_EV_PIP_COND : BR_EV_PIP_UNCOND,
173 		  BR_CSS_NONE, addr, defer);
174 
175 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
176 		FIELD_PREP(OP_RELO_TYPE, relo);
177 }
178 
/* Emit a plain relative branch (RELO_BR_REL relocation). */
static void
emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer)
{
	emit_br_relo(nfp_prog, mask, addr, defer, RELO_BR_REL);
}
184 
/* Pack and push a branch-on-bit instruction; branches when the tested bit
 * equals @set.  Address split mirrors __emit_br().
 */
static void
__emit_br_bit(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 addr, u8 defer,
	      bool set, bool src_lmextn)
{
	u16 addr_lo, addr_hi;
	u64 insn;

	addr_lo = addr & (OP_BR_BIT_ADDR_LO >> __bf_shf(OP_BR_BIT_ADDR_LO));
	addr_hi = addr != addr_lo;

	insn = OP_BR_BIT_BASE |
		FIELD_PREP(OP_BR_BIT_A_SRC, areg) |
		FIELD_PREP(OP_BR_BIT_B_SRC, breg) |
		FIELD_PREP(OP_BR_BIT_BV, set) |
		FIELD_PREP(OP_BR_BIT_DEFBR, defer) |
		FIELD_PREP(OP_BR_BIT_ADDR_LO, addr_lo) |
		FIELD_PREP(OP_BR_BIT_ADDR_HI, addr_hi) |
		FIELD_PREP(OP_BR_BIT_SRC_LMEXTN, src_lmextn);

	nfp_prog_push(nfp_prog, insn);
}
206 
207 static void
208 emit_br_bit_relo(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr,
209 		 u8 defer, bool set, enum nfp_relo_type relo)
210 {
211 	struct nfp_insn_re_regs reg;
212 	int err;
213 
214 	/* NOTE: The bit to test is specified as an rotation amount, such that
215 	 *	 the bit to test will be placed on the MSB of the result when
216 	 *	 doing a rotate right. For bit X, we need right rotate X + 1.
217 	 */
218 	bit += 1;
219 
220 	err = swreg_to_restricted(reg_none(), src, reg_imm(bit), &reg, false);
221 	if (err) {
222 		nfp_prog->error = err;
223 		return;
224 	}
225 
226 	__emit_br_bit(nfp_prog, reg.areg, reg.breg, addr, defer, set,
227 		      reg.src_lmextn);
228 
229 	nfp_prog->prog[nfp_prog->prog_len - 1] |=
230 		FIELD_PREP(OP_RELO_TYPE, relo);
231 }
232 
/* Branch if bit @bit of @src is set (relative branch relocation). */
static void
emit_br_bset(struct nfp_prog *nfp_prog, swreg src, u8 bit, u16 addr, u8 defer)
{
	emit_br_bit_relo(nfp_prog, src, bit, addr, defer, true, RELO_BR_REL);
}
238 
/* Pack and push a branch-to-ALU-result instruction. */
static void
__emit_br_alu(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
	      u8 defer, bool dst_lmextn, bool src_lmextn)
{
	u64 insn;

	insn = OP_BR_ALU_BASE |
		FIELD_PREP(OP_BR_ALU_A_SRC, areg) |
		FIELD_PREP(OP_BR_ALU_B_SRC, breg) |
		FIELD_PREP(OP_BR_ALU_DEFBR, defer) |
		FIELD_PREP(OP_BR_ALU_IMM_HI, imm_hi) |
		FIELD_PREP(OP_BR_ALU_SRC_LMEXTN, src_lmextn) |
		FIELD_PREP(OP_BR_ALU_DST_LMEXTN, dst_lmextn);

	nfp_prog_push(nfp_prog, insn);
}
255 
/* Emit a "return" jump - an indirect branch to the address held in @base,
 * with @defer delay slots.
 */
static void emit_rtn(struct nfp_prog *nfp_prog, swreg base, u8 defer)
{
	struct nfp_insn_ur_regs reg;
	int err;

	err = swreg_to_unrestricted(reg_none(), base, reg_imm(0), &reg);
	if (err) {
		nfp_prog->error = err;
		return;
	}

	__emit_br_alu(nfp_prog, reg.areg, reg.breg, 0, defer, reg.dst_lmextn,
		      reg.src_lmextn);
}
270 
/* Pack and push an immed instruction.  Only the high immediate byte is
 * carried here; the low byte travels through the B operand encoding.
 */
static void
__emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi,
	     enum immed_width width, bool invert,
	     enum immed_shift shift, bool wr_both,
	     bool dst_lmextn, bool src_lmextn)
{
	u64 insn;

	insn = OP_IMMED_BASE |
		FIELD_PREP(OP_IMMED_A_SRC, areg) |
		FIELD_PREP(OP_IMMED_B_SRC, breg) |
		FIELD_PREP(OP_IMMED_IMM, imm_hi) |
		FIELD_PREP(OP_IMMED_WIDTH, width) |
		FIELD_PREP(OP_IMMED_INV, invert) |
		FIELD_PREP(OP_IMMED_SHIFT, shift) |
		FIELD_PREP(OP_IMMED_WR_AB, wr_both) |
		FIELD_PREP(OP_IMMED_SRC_LMEXTN, src_lmextn) |
		FIELD_PREP(OP_IMMED_DST_LMEXTN, dst_lmextn);

	nfp_prog_push(nfp_prog, insn);
}
292 
/* Load the 16-bit constant @imm into @dst.  The low byte is encoded as a
 * register-file immediate operand, the high byte as the instruction's
 * immediate field.  @dst must not itself be an immediate.
 */
static void
emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm,
	   enum immed_width width, bool invert, enum immed_shift shift)
{
	struct nfp_insn_ur_regs reg;
	int err;

	if (swreg_type(dst) == NN_REG_IMM) {
		nfp_prog->error = -EFAULT;
		return;
	}

	err = swreg_to_unrestricted(dst, dst, reg_imm(imm & 0xff), &reg);
	if (err) {
		nfp_prog->error = err;
		return;
	}

	/* Use reg.dst when destination is No-Dest. */
	__emit_immed(nfp_prog,
		     swreg_type(dst) == NN_REG_NONE ? reg.dst : reg.areg,
		     reg.breg, imm >> 8, width, invert, shift,
		     reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
}
317 
/* Pack and push a shift instruction.  Rejects shift amounts that do not
 * fit the OP_SHF_SHIFT field via nfp_prog->error.
 */
static void
__emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
	   enum shf_sc sc, u8 shift,
	   u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both,
	   bool dst_lmextn, bool src_lmextn)
{
	u64 insn;

	if (!FIELD_FIT(OP_SHF_SHIFT, shift)) {
		nfp_prog->error = -EFAULT;
		return;
	}

	/* NFP shift instruction has something special. If shift direction is
	 * left then shift amount of 1 to 31 is specified as 32 minus the amount
	 * to shift.
	 *
	 * But no need to do this for indirect shift which has shift amount be
	 * 0. Even after we do this subtraction, shift amount 0 will be turned
	 * into 32 which will eventually be encoded the same as 0 because only
	 * low 5 bits are encoded, but shift amount be 32 will fail the
	 * FIELD_PREP check done later on shift mask (0x1f), due to 32 is out of
	 * mask range.
	 */
	if (sc == SHF_SC_L_SHF && shift)
		shift = 32 - shift;

	insn = OP_SHF_BASE |
		FIELD_PREP(OP_SHF_A_SRC, areg) |
		FIELD_PREP(OP_SHF_SC, sc) |
		FIELD_PREP(OP_SHF_B_SRC, breg) |
		FIELD_PREP(OP_SHF_I8, i8) |
		FIELD_PREP(OP_SHF_SW, sw) |
		FIELD_PREP(OP_SHF_DST, dst) |
		FIELD_PREP(OP_SHF_SHIFT, shift) |
		FIELD_PREP(OP_SHF_OP, op) |
		FIELD_PREP(OP_SHF_DST_AB, dst_ab) |
		FIELD_PREP(OP_SHF_WR_AB, wr_both) |
		FIELD_PREP(OP_SHF_SRC_LMEXTN, src_lmextn) |
		FIELD_PREP(OP_SHF_DST_LMEXTN, dst_lmextn);

	nfp_prog_push(nfp_prog, insn);
}
361 
362 static void
363 emit_shf(struct nfp_prog *nfp_prog, swreg dst,
364 	 swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc, u8 shift)
365 {
366 	struct nfp_insn_re_regs reg;
367 	int err;
368 
369 	err = swreg_to_restricted(dst, lreg, rreg, &reg, true);
370 	if (err) {
371 		nfp_prog->error = err;
372 		return;
373 	}
374 
375 	__emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift,
376 		   reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both,
377 		   reg.dst_lmextn, reg.src_lmextn);
378 }
379 
/* Emit an indirect shift (amount taken from previous ALU result, encoded
 * as shift amount 0).  Rotation cannot be performed indirectly.
 */
static void
emit_shf_indir(struct nfp_prog *nfp_prog, swreg dst,
	       swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc)
{
	if (sc == SHF_SC_R_ROT) {
		pr_err("indirect shift is not allowed on rotation\n");
		nfp_prog->error = -EFAULT;
		return;
	}

	emit_shf(nfp_prog, dst, lreg, op, rreg, sc, 0);
}
392 
/* Pack and push an ALU instruction. */
static void
__emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab,
	   u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both,
	   bool dst_lmextn, bool src_lmextn)
{
	u64 insn;

	insn = OP_ALU_BASE |
		FIELD_PREP(OP_ALU_A_SRC, areg) |
		FIELD_PREP(OP_ALU_B_SRC, breg) |
		FIELD_PREP(OP_ALU_DST, dst) |
		FIELD_PREP(OP_ALU_SW, swap) |
		FIELD_PREP(OP_ALU_OP, op) |
		FIELD_PREP(OP_ALU_DST_AB, dst_ab) |
		FIELD_PREP(OP_ALU_WR_AB, wr_both) |
		FIELD_PREP(OP_ALU_SRC_LMEXTN, src_lmextn) |
		FIELD_PREP(OP_ALU_DST_LMEXTN, dst_lmextn);

	nfp_prog_push(nfp_prog, insn);
}
413 
414 static void
415 emit_alu(struct nfp_prog *nfp_prog, swreg dst,
416 	 swreg lreg, enum alu_op op, swreg rreg)
417 {
418 	struct nfp_insn_ur_regs reg;
419 	int err;
420 
421 	err = swreg_to_unrestricted(dst, lreg, rreg, &reg);
422 	if (err) {
423 		nfp_prog->error = err;
424 		return;
425 	}
426 
427 	__emit_alu(nfp_prog, reg.dst, reg.dst_ab,
428 		   reg.areg, op, reg.breg, reg.swap, reg.wr_both,
429 		   reg.dst_lmextn, reg.src_lmextn);
430 }
431 
/* Pack and push one step of a multiply instruction. */
static void
__emit_mul(struct nfp_prog *nfp_prog, enum alu_dst_ab dst_ab, u16 areg,
	   enum mul_type type, enum mul_step step, u16 breg, bool swap,
	   bool wr_both, bool dst_lmextn, bool src_lmextn)
{
	u64 insn;

	insn = OP_MUL_BASE |
		FIELD_PREP(OP_MUL_A_SRC, areg) |
		FIELD_PREP(OP_MUL_B_SRC, breg) |
		FIELD_PREP(OP_MUL_STEP, step) |
		FIELD_PREP(OP_MUL_DST_AB, dst_ab) |
		FIELD_PREP(OP_MUL_SW, swap) |
		FIELD_PREP(OP_MUL_TYPE, type) |
		FIELD_PREP(OP_MUL_WR_AB, wr_both) |
		FIELD_PREP(OP_MUL_SRC_LMEXTN, src_lmextn) |
		FIELD_PREP(OP_MUL_DST_LMEXTN, dst_lmextn);

	nfp_prog_push(nfp_prog, insn);
}
452 
453 static void
454 emit_mul(struct nfp_prog *nfp_prog, swreg lreg, enum mul_type type,
455 	 enum mul_step step, swreg rreg)
456 {
457 	struct nfp_insn_ur_regs reg;
458 	u16 areg;
459 	int err;
460 
461 	if (type == MUL_TYPE_START && step != MUL_STEP_NONE) {
462 		nfp_prog->error = -EINVAL;
463 		return;
464 	}
465 
466 	if (step == MUL_LAST || step == MUL_LAST_2) {
467 		/* When type is step and step Number is LAST or LAST2, left
468 		 * source is used as destination.
469 		 */
470 		err = swreg_to_unrestricted(lreg, reg_none(), rreg, &reg);
471 		areg = reg.dst;
472 	} else {
473 		err = swreg_to_unrestricted(reg_none(), lreg, rreg, &reg);
474 		areg = reg.areg;
475 	}
476 
477 	if (err) {
478 		nfp_prog->error = err;
479 		return;
480 	}
481 
482 	__emit_mul(nfp_prog, reg.dst_ab, areg, type, step, reg.breg, reg.swap,
483 		   reg.wr_both, reg.dst_lmextn, reg.src_lmextn);
484 }
485 
/* Pack and push a load-field instruction (byte-masked merge with shift). */
static void
__emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc,
		u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8,
		bool zero, bool swap, bool wr_both,
		bool dst_lmextn, bool src_lmextn)
{
	u64 insn;

	insn = OP_LDF_BASE |
		FIELD_PREP(OP_LDF_A_SRC, areg) |
		FIELD_PREP(OP_LDF_SC, sc) |
		FIELD_PREP(OP_LDF_B_SRC, breg) |
		FIELD_PREP(OP_LDF_I8, imm8) |
		FIELD_PREP(OP_LDF_SW, swap) |
		FIELD_PREP(OP_LDF_ZF, zero) |
		FIELD_PREP(OP_LDF_BMASK, bmask) |
		FIELD_PREP(OP_LDF_SHF, shift) |
		FIELD_PREP(OP_LDF_WR_AB, wr_both) |
		FIELD_PREP(OP_LDF_SRC_LMEXTN, src_lmextn) |
		FIELD_PREP(OP_LDF_DST_LMEXTN, dst_lmextn);

	nfp_prog_push(nfp_prog, insn);
}
509 
/* Emit a load-field copying bytes selected by @bmask from @src into @dst,
 * optionally zeroing the unselected bytes (@zero).
 */
static void
emit_ld_field_any(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
		  enum shf_sc sc, u8 shift, bool zero)
{
	struct nfp_insn_re_regs reg;
	int err;

	/* Note: ld_field is special as it uses one of the src regs as dst */
	err = swreg_to_restricted(dst, dst, src, &reg, true);
	if (err) {
		nfp_prog->error = err;
		return;
	}

	__emit_ld_field(nfp_prog, sc, reg.areg, bmask, reg.breg, shift,
			reg.i8, zero, reg.swap, reg.wr_both,
			reg.dst_lmextn, reg.src_lmextn);
}
528 
/* Load-field variant which preserves the unselected bytes of @dst. */
static void
emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src,
	      enum shf_sc sc, u8 shift)
{
	emit_ld_field_any(nfp_prog, dst, bmask, src, sc, shift, false);
}
535 
/* Pack and push a local CSR access.  @addr is a byte address, encoded in
 * 4-byte units (addr / 4).
 */
static void
__emit_lcsr(struct nfp_prog *nfp_prog, u16 areg, u16 breg, bool wr, u16 addr,
	    bool dst_lmextn, bool src_lmextn)
{
	u64 insn;

	insn = OP_LCSR_BASE |
		FIELD_PREP(OP_LCSR_A_SRC, areg) |
		FIELD_PREP(OP_LCSR_B_SRC, breg) |
		FIELD_PREP(OP_LCSR_WRITE, wr) |
		FIELD_PREP(OP_LCSR_ADDR, addr / 4) |
		FIELD_PREP(OP_LCSR_SRC_LMEXTN, src_lmextn) |
		FIELD_PREP(OP_LCSR_DST_LMEXTN, dst_lmextn);

	nfp_prog_push(nfp_prog, insn);
}
552 
/* Write @src to the local CSR at byte address @addr. */
static void emit_csr_wr(struct nfp_prog *nfp_prog, swreg src, u16 addr)
{
	struct nfp_insn_ur_regs reg;
	int err;

	/* This instruction takes immeds instead of reg_none() for the ignored
	 * operand, but we can't encode 2 immeds in one instr with our normal
	 * swreg infra so if param is an immed, we encode as reg_none() and
	 * copy the immed to both operands.
	 */
	if (swreg_type(src) == NN_REG_IMM) {
		err = swreg_to_unrestricted(reg_none(), src, reg_none(), &reg);
		reg.breg = reg.areg;
	} else {
		err = swreg_to_unrestricted(reg_none(), src, reg_imm(0), &reg);
	}
	if (err) {
		nfp_prog->error = err;
		return;
	}

	__emit_lcsr(nfp_prog, reg.areg, reg.breg, true, addr,
		    false, reg.src_lmextn);
}
577 
/* Start a local CSR read; the CSR value is read in following immed[gpr, 0] */
static void __emit_csr_rd(struct nfp_prog *nfp_prog, u16 addr)
{
	__emit_lcsr(nfp_prog, 0, 0, false, addr, false, false);
}
583 
/* An immed with no destination and all-zero fields serves as a nop. */
static void emit_nop(struct nfp_prog *nfp_prog)
{
	__emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
}
588 
589 /* --- Wrappers --- */
590 static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift)
591 {
592 	if (!(imm & 0xffff0000)) {
593 		*val = imm;
594 		*shift = IMMED_SHIFT_0B;
595 	} else if (!(imm & 0xff0000ff)) {
596 		*val = imm >> 8;
597 		*shift = IMMED_SHIFT_1B;
598 	} else if (!(imm & 0x0000ffff)) {
599 		*val = imm >> 16;
600 		*shift = IMMED_SHIFT_2B;
601 	} else {
602 		return false;
603 	}
604 
605 	return true;
606 }
607 
608 static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm)
609 {
610 	enum immed_shift shift;
611 	u16 val;
612 
613 	if (pack_immed(imm, &val, &shift)) {
614 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, false, shift);
615 	} else if (pack_immed(~imm, &val, &shift)) {
616 		emit_immed(nfp_prog, dst, val, IMMED_WIDTH_ALL, true, shift);
617 	} else {
618 		emit_immed(nfp_prog, dst, imm & 0xffff, IMMED_WIDTH_ALL,
619 			   false, IMMED_SHIFT_0B);
620 		emit_immed(nfp_prog, dst, imm >> 16, IMMED_WIDTH_WORD,
621 			   false, IMMED_SHIFT_2B);
622 	}
623 }
624 
/* Zero the high half of the 64-bit pair @dst when the verifier asked for
 * explicit zero extension on this instruction.
 */
static void
wrp_zext(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst)
{
	if (meta->flags & FLAG_INSN_DO_ZEXT)
		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
}
631 
/* Load a (max 16-bit) immediate and tag the emitted instruction with
 * relocation type @relo for load-time patching.
 */
static void
wrp_immed_relo(struct nfp_prog *nfp_prog, swreg dst, u32 imm,
	       enum nfp_relo_type relo)
{
	if (imm > 0xffff) {
		pr_err("relocation of a large immediate!\n");
		nfp_prog->error = -EFAULT;
		return;
	}
	emit_immed(nfp_prog, dst, imm, IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);

	nfp_prog->prog[nfp_prog->prog_len - 1] |=
		FIELD_PREP(OP_RELO_TYPE, relo);
}
646 
/* ur_load_imm_any() - encode immediate or use tmp register (unrestricted)
 * If the @imm is small enough encode it directly in operand and return
 * otherwise load @imm to a spare register and return its encoding.
 */
static swreg ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
{
	if (FIELD_FIT(UR_REG_IMM_MAX, imm))
		return reg_imm(imm);

	wrp_immed(nfp_prog, tmp_reg, imm);
	return tmp_reg;
}
659 
/* re_load_imm_any() - encode immediate or use tmp register (restricted)
 * If the @imm is small enough encode it directly in operand and return
 * otherwise load @imm to a spare register and return its encoding.
 */
static swreg re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg)
{
	if (FIELD_FIT(RE_REG_IMM_MAX, imm))
		return reg_imm(imm);

	wrp_immed(nfp_prog, tmp_reg, imm);
	return tmp_reg;
}
672 
/* Emit @count consecutive nop instructions. */
static void wrp_nops(struct nfp_prog *nfp_prog, unsigned int count)
{
	for (; count; count--)
		emit_nop(nfp_prog);
}
678 
/* Register move implemented as an ALU no-op pass-through of @src. */
static void wrp_mov(struct nfp_prog *nfp_prog, swreg dst, swreg src)
{
	emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, src);
}
683 
/* Move GPR @src to GPR @dst (written to both banks, read from bank B). */
static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src)
{
	wrp_mov(nfp_prog, reg_both(dst), reg_b(src));
}
688 
/* wrp_reg_subpart() - load @field_len bytes from @offset of @src, write the
 * result to @dst from low end.  Remaining bytes of @dst are zeroed.
 */
static void
wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
		u8 offset)
{
	enum shf_sc sc = offset ? SHF_SC_R_SHF : SHF_SC_NONE;
	u8 mask = (1 << field_len) - 1;

	emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
}
701 
/* wrp_reg_or_subpart() - load @field_len bytes from low end of @src, or the
 * result to @dst from offset, there is no change on the other bits of @dst.
 */
static void
wrp_reg_or_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src,
		   u8 field_len, u8 offset)
{
	enum shf_sc sc = offset ? SHF_SC_L_SHF : SHF_SC_NONE;
	u8 mask = ((1 << field_len) - 1) << offset;

	emit_ld_field(nfp_prog, dst, mask, src, sc, 32 - offset * 8);
}
714 
/* Form the A/B operand pair for a 40-bit address rooted at GPR pair
 * @src_gpr, adding @offset with carry into the high word.  The pair is
 * used as-is when @offset is zero, otherwise the sum lands in the
 * imm_a/imm_b scratch registers.
 */
static void
addr40_offset(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
	      swreg *rega, swreg *regb)
{
	if (offset == reg_imm(0)) {
		*rega = reg_a(src_gpr);
		*regb = reg_b(src_gpr + 1);
		return;
	}

	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(src_gpr), ALU_OP_ADD, offset);
	emit_alu(nfp_prog, imm_b(nfp_prog), reg_b(src_gpr + 1), ALU_OP_ADD_C,
		 reg_imm(0));
	*rega = imm_a(nfp_prog);
	*regb = imm_b(nfp_prog);
}
731 
/* NFP has Command Push Pull bus which supports bulk memory operations.
 *
 * Translate a load/store pair that the optimizer recognized as a memory
 * copy of @meta->ldst_gather_len bytes: read the source region into
 * transfer-in registers, mirror it to transfer-out registers, then pick
 * the cheapest write command for the length (direct write8/write32,
 * indirect variants, or a write32 + trailing write8 combination).
 */
static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	bool descending_seq = meta->ldst_gather_len < 0;
	s16 len = abs(meta->ldst_gather_len);
	swreg src_base, off;
	bool src_40bit_addr;
	unsigned int i;
	u8 xfer_num;

	off = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
	src_40bit_addr = meta->ptr.type == PTR_TO_MAP_VALUE;
	src_base = reg_a(meta->insn.src_reg * 2);
	/* Number of 32-bit transfer registers needed to hold @len bytes */
	xfer_num = round_up(len, 4) / 4;

	if (src_40bit_addr)
		addr40_offset(nfp_prog, meta->insn.src_reg * 2, off, &src_base,
			      &off);

	/* Setup PREV_ALU fields to override memory read length. */
	if (len > 32)
		wrp_immed(nfp_prog, reg_none(),
			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));

	/* Memory read from source addr into transfer-in registers. */
	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP,
		     src_40bit_addr ? CMD_MODE_40b_BA : CMD_MODE_32b, 0,
		     src_base, off, xfer_num - 1, CMD_CTX_SWAP, len > 32);

	/* Move from transfer-in to transfer-out. */
	for (i = 0; i < xfer_num; i++)
		wrp_mov(nfp_prog, reg_xfer(i), reg_xfer(i));

	off = re_load_imm_any(nfp_prog, meta->paired_st->off, imm_b(nfp_prog));

	if (len <= 8) {
		/* Use single direct_ref write8. */
		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
			 reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
			 CMD_CTX_SWAP);
	} else if (len <= 32 && IS_ALIGNED(len, 4)) {
		/* Use single direct_ref write32. */
		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
			 reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
			 CMD_CTX_SWAP);
	} else if (len <= 32) {
		/* Use single indirect_ref write8. */
		wrp_immed(nfp_prog, reg_none(),
			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
			       reg_a(meta->paired_st->dst_reg * 2), off,
			       len - 1, CMD_CTX_SWAP);
	} else if (IS_ALIGNED(len, 4)) {
		/* Use single indirect_ref write32. */
		wrp_immed(nfp_prog, reg_none(),
			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
			       reg_a(meta->paired_st->dst_reg * 2), off,
			       xfer_num - 1, CMD_CTX_SWAP);
	} else if (len <= 40) {
		/* Use one direct_ref write32 to write the first 32-bytes, then
		 * another direct_ref write8 to write the remaining bytes.
		 */
		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
			 reg_a(meta->paired_st->dst_reg * 2), off, 7,
			 CMD_CTX_SWAP);

		off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32,
				      imm_b(nfp_prog));
		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8,
			 reg_a(meta->paired_st->dst_reg * 2), off, len - 33,
			 CMD_CTX_SWAP);
	} else {
		/* Use one indirect_ref write32 to write 4-bytes aligned length,
		 * then another direct_ref write8 to write the remaining bytes.
		 */
		u8 new_off;

		wrp_immed(nfp_prog, reg_none(),
			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 2));
		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
			       reg_a(meta->paired_st->dst_reg * 2), off,
			       xfer_num - 2, CMD_CTX_SWAP);
		new_off = meta->paired_st->off + (xfer_num - 1) * 4;
		off = re_load_imm_any(nfp_prog, new_off, imm_b(nfp_prog));
		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b,
			 xfer_num - 1, reg_a(meta->paired_st->dst_reg * 2), off,
			 (len & 0x3) - 1, CMD_CTX_SWAP);
	}

	/* TODO: The following extra load is to make sure data flow be identical
	 *  before and after we do memory copy optimization.
	 *
	 *  The load destination register is not guaranteed to be dead, so we
	 *  need to make sure it is loaded with the value the same as before
	 *  this transformation.
	 *
	 *  These extra loads could be removed once we have accurate register
	 *  usage information.
	 */
	if (descending_seq)
		xfer_num = 0;
	else if (BPF_SIZE(meta->insn.code) != BPF_DW)
		xfer_num = xfer_num - 1;
	else
		xfer_num = xfer_num - 2;

	switch (BPF_SIZE(meta->insn.code)) {
	case BPF_B:
		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
				reg_xfer(xfer_num), 1,
				IS_ALIGNED(len, 4) ? 3 : (len & 3) - 1);
		break;
	case BPF_H:
		wrp_reg_subpart(nfp_prog, reg_both(meta->insn.dst_reg * 2),
				reg_xfer(xfer_num), 2, (len & 3) ^ 2);
		break;
	case BPF_W:
		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
			reg_xfer(0));
		break;
	case BPF_DW:
		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2),
			reg_xfer(xfer_num));
		wrp_mov(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1),
			reg_xfer(xfer_num + 1));
		break;
	}

	/* Sub-64-bit loads zero the high half of the destination pair */
	if (BPF_SIZE(meta->insn.code) != BPF_DW)
		wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);

	return 0;
}
866 
/* Read @size bytes (1..8) from packet memory at @offset into GPR(s)
 * starting at @dst_gpr, big-endian, zero-extending per @meta flags.
 */
static int
data_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, swreg offset,
	u8 dst_gpr, int size)
{
	unsigned int i;
	u16 shift, sz;

	/* We load the value from the address indicated in @offset and then
	 * shift out the data we don't need.  Note: this is big endian!
	 */
	sz = max(size, 4);
	shift = size < 4 ? 4 - size : 0;

	emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
		 pptr_reg(nfp_prog), offset, sz - 1, CMD_CTX_SWAP);

	i = 0;
	if (shift)
		emit_shf(nfp_prog, reg_both(dst_gpr), reg_none(), SHF_OP_NONE,
			 reg_xfer(0), SHF_SC_R_SHF, shift * 8);
	else
		for (; i * 4 < size; i++)
			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));

	/* Less than 8 bytes moved - high word may need explicit zeroing */
	if (i < 2)
		wrp_zext(nfp_prog, meta, dst_gpr);

	return 0;
}
896 
/* Read @size bytes (1..8) from lreg/rreg-addressed memory into GPR(s)
 * starting at @dst_gpr, in host (little-endian) order.
 */
static int
data_ld_host_order(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
		   u8 dst_gpr, swreg lreg, swreg rreg, int size,
		   enum cmd_mode mode)
{
	unsigned int i;
	u8 mask, sz;

	/* We load the value from the address indicated in rreg + lreg and then
	 * mask out the data we don't need.  Note: this is little endian!
	 */
	sz = max(size, 4);
	mask = size < 4 ? GENMASK(size - 1, 0) : 0;

	emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, mode, 0,
		 lreg, rreg, sz / 4 - 1, CMD_CTX_SWAP);

	i = 0;
	if (mask)
		emit_ld_field_any(nfp_prog, reg_both(dst_gpr), mask,
				  reg_xfer(0), SHF_SC_NONE, 0, true);
	else
		for (; i * 4 < size; i++)
			wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i));

	/* Less than 8 bytes moved - high word may need explicit zeroing */
	if (i < 2)
		wrp_zext(nfp_prog, meta, dst_gpr);

	return 0;
}
927 
/* Host-order load through a 32-bit address in GPR @src_gpr. */
static int
data_ld_host_order_addr32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
			  u8 src_gpr, swreg offset, u8 dst_gpr, u8 size)
{
	return data_ld_host_order(nfp_prog, meta, dst_gpr, reg_a(src_gpr),
				  offset, size, CMD_MODE_32b);
}
935 
/* Host-order load through a 40-bit address in GPR pair @src_gpr. */
static int
data_ld_host_order_addr40(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
			  u8 src_gpr, swreg offset, u8 dst_gpr, u8 size)
{
	swreg rega, regb;

	addr40_offset(nfp_prog, src_gpr, offset, &rega, &regb);

	return data_ld_host_order(nfp_prog, meta, dst_gpr, rega, regb,
				  size, CMD_MODE_40b_BA);
}
947 
/* Packet load with a register-indexed offset (src + imm); branches to the
 * abort handler when the access would run past the packet length.
 */
static int
construct_data_ind_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
		      u16 offset, u16 src, u8 size)
{
	swreg tmp_reg;

	/* Calculate the true offset (src_reg + imm) */
	tmp_reg = ur_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
	emit_alu(nfp_prog, imm_both(nfp_prog), reg_a(src), ALU_OP_ADD, tmp_reg);

	/* Check packet length (size guaranteed to fit b/c it's u8) */
	emit_alu(nfp_prog, imm_a(nfp_prog),
		 imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size));
	emit_alu(nfp_prog, reg_none(),
		 plen_reg(nfp_prog), ALU_OP_SUB, imm_a(nfp_prog));
	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);

	/* Load data */
	return data_ld(nfp_prog, meta, imm_b(nfp_prog), 0, size);
}
968 
/* Packet load at a fixed offset; branches to the abort handler when the
 * access would run past the packet length.
 */
static int
construct_data_ld(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
		  u16 offset, u8 size)
{
	swreg tmp_reg;

	/* Check packet length */
	tmp_reg = ur_load_imm_any(nfp_prog, offset + size, imm_a(nfp_prog));
	emit_alu(nfp_prog, reg_none(), plen_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
	emit_br_relo(nfp_prog, BR_BLO, BR_OFF_RELO, 0, RELO_BR_GO_ABORT);

	/* Load data */
	tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog));
	return data_ld(nfp_prog, meta, tmp_reg, 0, size);
}
984 
/* Store @size bytes from GPR(s) starting at @src_gpr to memory addressed
 * by @dst_gpr + @offset, via the transfer-out registers.
 */
static int
data_stx_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
		    u8 src_gpr, u8 size)
{
	unsigned int i;

	for (i = 0; i * 4 < size; i++)
		wrp_mov(nfp_prog, reg_xfer(i), reg_a(src_gpr + i));

	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);

	return 0;
}
999 
/* Store immediate @imm (up to 8 bytes, @size selects width) to memory
 * at @dst_gpr + @offset via transfer registers and WRITE8_SWAP.
 */
static int
data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
		   u64 imm, u8 size)
{
	wrp_immed(nfp_prog, reg_xfer(0), imm);
	/* 8-byte stores need the upper half in a second xfer register */
	if (size == 8)
		wrp_immed(nfp_prog, reg_xfer(1), imm >> 32);

	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);

	return 0;
}
1013 
/* Callback invoked by mem_op_stack() for each 4-byte-bounded slice of a
 * stack (LMEM) access.  @gpr/@gpr_byte locate the slice in the GPR pair,
 * @off is the LMEM byte offset, @first/@last mark the slice's position,
 * @new_gpr is set when a new GPR is started, @lm3 selects LM index 3 and
 * @needs_inc requests post-increment addressing.
 */
typedef int
(*lmem_step)(struct nfp_prog *nfp_prog, u8 gpr, u8 gpr_byte, s32 off,
	     unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
	     bool needs_inc);
1018 
/* lmem_step implementation for loads: copy one slice (at most 4 bytes,
 * never crossing a 4-byte LMEM boundary) from local memory into byte
 * @dst_byte of GPR @dst, shifting as needed to align source and
 * destination bytes.
 */
static int
wrp_lmem_load(struct nfp_prog *nfp_prog, u8 dst, u8 dst_byte, s32 off,
	      unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
	      bool needs_inc)
{
	bool should_inc = needs_inc && new_gpr && !last;
	u32 idx, src_byte;
	enum shf_sc sc;
	swreg reg;
	int shf;
	u8 mask;

	/* Slices must fit in one GPR and one LMEM word */
	if (WARN_ON_ONCE(dst_byte + size > 4 || off % 4 + size > 4))
		return -EOPNOTSUPP;

	idx = off / 4;

	/* Move the entire word */
	if (size == 4) {
		wrp_mov(nfp_prog, reg_both(dst),
			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx));
		return 0;
	}

	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
		return -EOPNOTSUPP;

	src_byte = off % 4;

	mask = (1 << size) - 1;
	mask <<= dst_byte;

	if (WARN_ON_ONCE(mask > 0xf))
		return -EOPNOTSUPP;

	/* Pick shift direction/amount to move src bytes onto dst bytes */
	shf = abs(src_byte - dst_byte) * 8;
	if (src_byte == dst_byte) {
		sc = SHF_SC_NONE;
	} else if (src_byte < dst_byte) {
		shf = 32 - shf;
		sc = SHF_SC_L_SHF;
	} else {
		sc = SHF_SC_R_SHF;
	}

	/* ld_field can address fewer indexes, if offset too large do RMW.
	 * Because we RMW twice we waste 2 cycles on unaligned 8 byte writes.
	 */
	if (idx <= RE_REG_LM_IDX_MAX) {
		reg = reg_lm(lm3 ? 3 : 0, idx);
	} else {
		reg = imm_a(nfp_prog);
		/* If it's not the first part of the load and we start a new GPR
		 * that means we are loading a second part of the LMEM word into
		 * a new GPR.  IOW we've already read that LMEM word and
		 * therefore it has been loaded into imm_a().
		 */
		if (first || !new_gpr)
			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
	}

	emit_ld_field_any(nfp_prog, reg_both(dst), mask, reg, sc, shf, new_gpr);

	/* Consume the post-increment slot when the access continues */
	if (should_inc)
		wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));

	return 0;
}
1087 
/* lmem_step implementation for stores: copy one slice (at most 4 bytes,
 * never crossing a 4-byte LMEM boundary) from byte @src_byte of GPR
 * @src into local memory, doing read-modify-write when only part of the
 * LMEM word is written.
 */
static int
wrp_lmem_store(struct nfp_prog *nfp_prog, u8 src, u8 src_byte, s32 off,
	       unsigned int size, bool first, bool new_gpr, bool last, bool lm3,
	       bool needs_inc)
{
	bool should_inc = needs_inc && new_gpr && !last;
	u32 idx, dst_byte;
	enum shf_sc sc;
	swreg reg;
	int shf;
	u8 mask;

	/* Slices must fit in one GPR and one LMEM word */
	if (WARN_ON_ONCE(src_byte + size > 4 || off % 4 + size > 4))
		return -EOPNOTSUPP;

	idx = off / 4;

	/* Move the entire word */
	if (size == 4) {
		wrp_mov(nfp_prog,
			should_inc ? reg_lm_inc(3) : reg_lm(lm3 ? 3 : 0, idx),
			reg_b(src));
		return 0;
	}

	if (WARN_ON_ONCE(lm3 && idx > RE_REG_LM_IDX_MAX))
		return -EOPNOTSUPP;

	dst_byte = off % 4;

	mask = (1 << size) - 1;
	mask <<= dst_byte;

	if (WARN_ON_ONCE(mask > 0xf))
		return -EOPNOTSUPP;

	/* Pick shift direction/amount to move src bytes onto dst bytes */
	shf = abs(src_byte - dst_byte) * 8;
	if (src_byte == dst_byte) {
		sc = SHF_SC_NONE;
	} else if (src_byte < dst_byte) {
		shf = 32 - shf;
		sc = SHF_SC_L_SHF;
	} else {
		sc = SHF_SC_R_SHF;
	}

	/* ld_field can address fewer indexes, if offset too large do RMW.
	 * Because we RMW twice we waste 2 cycles on unaligned 8 byte writes.
	 */
	if (idx <= RE_REG_LM_IDX_MAX) {
		reg = reg_lm(lm3 ? 3 : 0, idx);
	} else {
		reg = imm_a(nfp_prog);
		/* Only first and last LMEM locations are going to need RMW,
		 * the middle location will be overwritten fully.
		 */
		if (first || last)
			wrp_mov(nfp_prog, reg, reg_lm(0, idx));
	}

	emit_ld_field(nfp_prog, reg, mask, reg_b(src), sc, shf);

	/* Write back staged word and/or advance the LM pointer once the
	 * current LMEM word is complete.
	 */
	if (new_gpr || last) {
		if (idx > RE_REG_LM_IDX_MAX)
			wrp_mov(nfp_prog, reg_lm(0, idx), reg);
		if (should_inc)
			wrp_mov(nfp_prog, reg_none(), reg_lm_inc(3));
	}

	return 0;
}
1159 
/* Drive a stack (LMEM) access of @size bytes: choose the LM addressing
 * strategy (LMaddr0 window, fixed LMaddr3, or LMaddr3 with
 * post-increment), then feed 4-byte-bounded slices to @step
 * (wrp_lmem_load or wrp_lmem_store).
 */
static int
mem_op_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	     unsigned int size, unsigned int ptr_off, u8 gpr, u8 ptr_gpr,
	     bool clr_gpr, lmem_step step)
{
	s32 off = nfp_prog->stack_frame_depth + meta->insn.off + ptr_off;
	bool first = true, narrow_ld, last;
	bool needs_inc = false;
	swreg stack_off_reg;
	u8 prev_gpr = 255;
	u32 gpr_byte = 0;
	bool lm3 = true;
	int ret;

	if (meta->ptr_not_const ||
	    meta->flags & FLAG_INSN_PTR_CALLER_STACK_FRAME) {
		/* Use of the last encountered ptr_off is OK, they all have
		 * the same alignment.  Depend on low bits of value being
		 * discarded when written to LMaddr register.
		 */
		stack_off_reg = ur_load_imm_any(nfp_prog, meta->insn.off,
						stack_imm(nfp_prog));

		emit_alu(nfp_prog, imm_b(nfp_prog),
			 reg_a(ptr_gpr), ALU_OP_ADD, stack_off_reg);

		needs_inc = true;
	} else if (off + size <= 64) {
		/* We can reach bottom 64B with LMaddr0 */
		lm3 = false;
	} else if (round_down(off, 32) == round_down(off + size - 1, 32)) {
		/* We have to set up a new pointer.  If we know the offset
		 * and the entire access falls into a single 32 byte aligned
		 * window we won't have to increment the LM pointer.
		 * The 32 byte alignment is important because offset is ORed in
		 * not added when doing *l$indexN[off].
		 */
		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 32),
						stack_imm(nfp_prog));
		emit_alu(nfp_prog, imm_b(nfp_prog),
			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);

		off %= 32;
	} else {
		/* Access spans windows - fall back to post-increment mode */
		stack_off_reg = ur_load_imm_any(nfp_prog, round_down(off, 4),
						stack_imm(nfp_prog));

		emit_alu(nfp_prog, imm_b(nfp_prog),
			 stack_reg(nfp_prog), ALU_OP_ADD, stack_off_reg);

		needs_inc = true;
	}

	narrow_ld = clr_gpr && size < 8;

	if (lm3) {
		unsigned int nop_cnt;

		emit_csr_wr(nfp_prog, imm_b(nfp_prog), NFP_CSR_ACT_LM_ADDR3);
		/* For size < 4 one slot will be filled by zeroing of upper,
		 * but be careful, that zeroing could be eliminated by zext
		 * optimization.
		 */
		nop_cnt = narrow_ld && meta->flags & FLAG_INSN_DO_ZEXT ? 2 : 3;
		wrp_nops(nfp_prog, nop_cnt);
	}

	if (narrow_ld)
		wrp_zext(nfp_prog, meta, gpr);

	/* Walk the access in slices bounded by GPR and LMEM word edges */
	while (size) {
		u32 slice_end;
		u8 slice_size;

		slice_size = min(size, 4 - gpr_byte);
		slice_end = min(off + slice_size, round_up(off + 1, 4));
		slice_size = slice_end - off;

		last = slice_size == size;

		if (needs_inc)
			off %= 4;

		ret = step(nfp_prog, gpr, gpr_byte, off, slice_size,
			   first, gpr != prev_gpr, last, lm3, needs_inc);
		if (ret)
			return ret;

		prev_gpr = gpr;
		first = false;

		gpr_byte += slice_size;
		if (gpr_byte >= 4) {
			gpr_byte -= 4;
			gpr++;
		}

		size -= slice_size;
		off += slice_size;
	}

	return 0;
}
1263 
1264 static void
1265 wrp_alu_imm(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u32 imm)
1266 {
1267 	swreg tmp_reg;
1268 
1269 	if (alu_op == ALU_OP_AND) {
1270 		if (!imm)
1271 			wrp_immed(nfp_prog, reg_both(dst), 0);
1272 		if (!imm || !~imm)
1273 			return;
1274 	}
1275 	if (alu_op == ALU_OP_OR) {
1276 		if (!~imm)
1277 			wrp_immed(nfp_prog, reg_both(dst), ~0U);
1278 		if (!imm || !~imm)
1279 			return;
1280 	}
1281 	if (alu_op == ALU_OP_XOR) {
1282 		if (!~imm)
1283 			emit_alu(nfp_prog, reg_both(dst), reg_none(),
1284 				 ALU_OP_NOT, reg_b(dst));
1285 		if (!imm || !~imm)
1286 			return;
1287 	}
1288 
1289 	tmp_reg = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
1290 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, tmp_reg);
1291 }
1292 
/* Emit a 64-bit ALU op with immediate by operating on low and high
 * 32-bit halves separately.  When @skip is set the instruction is a
 * no-op (e.g. OR with 0) and is only marked as skipped.
 */
static int
wrp_alu64_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	      enum alu_op alu_op, bool skip)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 imm = insn->imm; /* sign extend */

	if (skip) {
		meta->flags |= FLAG_INSN_SKIP_NOOP;
		return 0;
	}

	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, imm & ~0U);
	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, alu_op, imm >> 32);

	return 0;
}
1310 
1311 static int
1312 wrp_alu64_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1313 	      enum alu_op alu_op)
1314 {
1315 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1316 
1317 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1318 	emit_alu(nfp_prog, reg_both(dst + 1),
1319 		 reg_a(dst + 1), alu_op, reg_b(src + 1));
1320 
1321 	return 0;
1322 }
1323 
1324 static int
1325 wrp_alu32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1326 	      enum alu_op alu_op)
1327 {
1328 	const struct bpf_insn *insn = &meta->insn;
1329 	u8 dst = insn->dst_reg * 2;
1330 
1331 	wrp_alu_imm(nfp_prog, dst, alu_op, insn->imm);
1332 	wrp_zext(nfp_prog, meta, dst);
1333 
1334 	return 0;
1335 }
1336 
1337 static int
1338 wrp_alu32_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
1339 	      enum alu_op alu_op)
1340 {
1341 	u8 dst = meta->insn.dst_reg * 2, src = meta->insn.src_reg * 2;
1342 
1343 	emit_alu(nfp_prog, reg_both(dst), reg_a(dst), alu_op, reg_b(src));
1344 	wrp_zext(nfp_prog, meta, dst);
1345 
1346 	return 0;
1347 }
1348 
/* Emit one 32-bit test: ALU op for condition codes only (no result
 * written) followed by a conditional branch to @off.
 */
static void
wrp_test_reg_one(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u8 src,
		 enum br_mask br_mask, u16 off)
{
	emit_alu(nfp_prog, reg_none(), reg_a(dst), alu_op, reg_b(src));
	emit_br(nfp_prog, br_mask, off, 0);
}
1356 
/* Emit a test-and-branch on a BPF register pair: low words always,
 * high words too for 64-bit jump instructions.
 */
static int
wrp_test_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	     enum alu_op alu_op, enum br_mask br_mask)
{
	const struct bpf_insn *insn = &meta->insn;

	wrp_test_reg_one(nfp_prog, insn->dst_reg * 2, alu_op,
			 insn->src_reg * 2, br_mask, insn->off);
	if (is_mbpf_jmp64(meta))
		wrp_test_reg_one(nfp_prog, insn->dst_reg * 2 + 1, alu_op,
				 insn->src_reg * 2 + 1, br_mask, insn->off);

	return 0;
}
1371 
/* Map BPF conditional-jump opcodes (indexed by BPF_OP >> 4) to the NFP
 * branch mask to use and whether the comparison operands must be
 * swapped (e.g. JGT is emitted as "branch if lower" with operands
 * reversed).
 */
static const struct jmp_code_map {
	enum br_mask br_mask;	/* NFP branch condition */
	bool swap;		/* swap comparison operands first */
} jmp_code_map[] = {
	[BPF_JGT >> 4]	= { BR_BLO, true },
	[BPF_JGE >> 4]	= { BR_BHS, false },
	[BPF_JLT >> 4]	= { BR_BLO, false },
	[BPF_JLE >> 4]	= { BR_BHS, true },
	[BPF_JSGT >> 4]	= { BR_BLT, true },
	[BPF_JSGE >> 4]	= { BR_BGE, false },
	[BPF_JSLT >> 4]	= { BR_BLT, false },
	[BPF_JSLE >> 4]	= { BR_BGE, true },
};
1385 
/* Look up the jmp_code_map entry for @meta's jump opcode; returns NULL
 * (after a one-time warning) for opcodes not covered by the table.
 */
static const struct jmp_code_map *nfp_jmp_code_get(struct nfp_insn_meta *meta)
{
	unsigned int op;

	op = BPF_OP(meta->insn.code) >> 4;
	/* br_mask of 0 is BR_BEQ which we don't use in jump code table */
	if (WARN_ONCE(op >= ARRAY_SIZE(jmp_code_map) ||
		      !jmp_code_map[op].br_mask,
		      "no code found for jump instruction"))
		return NULL;

	return &jmp_code_map[op];
}
1399 
/* Emit a conditional jump comparing a BPF register against an
 * immediate.  The comparison is done with a subtract (or add when the
 * immediate was negated, see jump_neg_op) discarding the result; for
 * 64-bit jumps the high words are chained through the carry op.
 */
static int cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 imm = insn->imm; /* sign extend */
	const struct jmp_code_map *code;
	enum alu_op alu_op, carry_op;
	u8 reg = insn->dst_reg * 2;
	swreg tmp_reg;

	code = nfp_jmp_code_get(meta);
	if (!code)
		return -EINVAL;

	alu_op = meta->jump_neg_op ? ALU_OP_ADD : ALU_OP_SUB;
	carry_op = meta->jump_neg_op ? ALU_OP_ADD_C : ALU_OP_SUB_C;

	/* Low 32 bits first, so the carry feeds the high-word compare */
	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
	if (!code->swap)
		emit_alu(nfp_prog, reg_none(), reg_a(reg), alu_op, tmp_reg);
	else
		emit_alu(nfp_prog, reg_none(), tmp_reg, alu_op, reg_a(reg));

	if (is_mbpf_jmp64(meta)) {
		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
		if (!code->swap)
			emit_alu(nfp_prog, reg_none(),
				 reg_a(reg + 1), carry_op, tmp_reg);
		else
			emit_alu(nfp_prog, reg_none(),
				 tmp_reg, carry_op, reg_a(reg + 1));
	}

	emit_br(nfp_prog, code->br_mask, insn->off, 0);

	return 0;
}
1436 
1437 static int cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1438 {
1439 	const struct bpf_insn *insn = &meta->insn;
1440 	const struct jmp_code_map *code;
1441 	u8 areg, breg;
1442 
1443 	code = nfp_jmp_code_get(meta);
1444 	if (!code)
1445 		return -EINVAL;
1446 
1447 	areg = insn->dst_reg * 2;
1448 	breg = insn->src_reg * 2;
1449 
1450 	if (code->swap) {
1451 		areg ^= breg;
1452 		breg ^= areg;
1453 		areg ^= breg;
1454 	}
1455 
1456 	emit_alu(nfp_prog, reg_none(), reg_a(areg), ALU_OP_SUB, reg_b(breg));
1457 	if (is_mbpf_jmp64(meta))
1458 		emit_alu(nfp_prog, reg_none(),
1459 			 reg_a(areg + 1), ALU_OP_SUB_C, reg_b(breg + 1));
1460 	emit_br(nfp_prog, code->br_mask, insn->off, 0);
1461 
1462 	return 0;
1463 }
1464 
/* Reverse the byte order of the 32-bit word in @reg_in into @gpr_out
 * using two masked rotates: rotate right by 8 writing all four bytes,
 * then rotate right by 16 rewriting bytes 0 and 2 (mask 0x5).
 */
static void wrp_end32(struct nfp_prog *nfp_prog, swreg reg_in, u8 gpr_out)
{
	emit_ld_field(nfp_prog, reg_both(gpr_out), 0xf, reg_in,
		      SHF_SC_R_ROT, 8);
	emit_ld_field(nfp_prog, reg_both(gpr_out), 0x5, reg_a(gpr_out),
		      SHF_SC_R_ROT, 16);
}
1472 
/* Emit a 32x32 multiply of @lreg * @rreg using the NFP multi-step
 * multiplier.  Low 32 bits of the product land in @dst_lo; the high 32
 * bits go to @dst_hi when @gen_high_half is set, otherwise @dst_hi is
 * zeroed.
 */
static void
wrp_mul_u32(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
	    swreg rreg, bool gen_high_half)
{
	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_1, rreg);
	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_2, rreg);
	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_3, rreg);
	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_32x32, MUL_STEP_4, rreg);
	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_32x32, MUL_LAST, reg_none());
	if (gen_high_half)
		emit_mul(nfp_prog, dst_hi, MUL_TYPE_STEP_32x32, MUL_LAST_2,
			 reg_none());
	else
		wrp_immed(nfp_prog, dst_hi, 0);
}
1489 
/* Emit a 16x16 multiply (cheaper sequence than 32x32, usable when both
 * operands are known to fit in 16 bits).  Product goes to @dst_lo;
 * @dst_hi is untouched here (callers rely on a 16x16 product fitting
 * in 32 bits).
 */
static void
wrp_mul_u16(struct nfp_prog *nfp_prog, swreg dst_hi, swreg dst_lo, swreg lreg,
	    swreg rreg)
{
	emit_mul(nfp_prog, lreg, MUL_TYPE_START, MUL_STEP_NONE, rreg);
	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_1, rreg);
	emit_mul(nfp_prog, lreg, MUL_TYPE_STEP_16x16, MUL_STEP_2, rreg);
	emit_mul(nfp_prog, dst_lo, MUL_TYPE_STEP_16x16, MUL_LAST, reg_none());
}
1499 
/* Emit a BPF multiply.  Picks the cheaper 16x16 multiplier sequence
 * when the verifier-tracked maxima of both operands fit in 16 bits,
 * otherwise the full 32x32 sequence.  @ropnd_from_reg selects register
 * vs immediate right operand; @gen_high_half requests the upper 32
 * bits (64-bit multiply).
 */
static int
wrp_mul(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	bool gen_high_half, bool ropnd_from_reg)
{
	swreg multiplier, multiplicand, dst_hi, dst_lo;
	const struct bpf_insn *insn = &meta->insn;
	u32 lopnd_max, ropnd_max;
	u8 dst_reg;

	dst_reg = insn->dst_reg;
	multiplicand = reg_a(dst_reg * 2);
	dst_hi = reg_both(dst_reg * 2 + 1);
	dst_lo = reg_both(dst_reg * 2);
	lopnd_max = meta->umax_dst;
	if (ropnd_from_reg) {
		multiplier = reg_b(insn->src_reg * 2);
		ropnd_max = meta->umax_src;
	} else {
		u32 imm = insn->imm;

		multiplier = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));
		ropnd_max = imm;
	}
	if (lopnd_max > U16_MAX || ropnd_max > U16_MAX)
		wrp_mul_u32(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier,
			    gen_high_half);
	else
		wrp_mul_u16(nfp_prog, dst_hi, dst_lo, multiplicand, multiplier);

	return 0;
}
1531 
/* Emit an unsigned divide of GPR @dst by constant @imm using
 * multiplication by a precomputed reciprocal (reciprocal_value_adv).
 * Special cases: divisor > U32_MAX yields 0, divisor > 2^31 reduces to
 * an unsigned "dst >= imm" comparison, power-of-two divisors become a
 * plain shift.
 */
static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
{
	/* NOTE(review): dst_b is initialized with reg_a(dst), same as
	 * dst_a - verify whether reg_b(dst) was intended here.
	 */
	swreg dst_both = reg_both(dst), dst_a = reg_a(dst), dst_b = reg_a(dst);
	struct reciprocal_value_adv rvalue;
	u8 pre_shift, exp;
	swreg magic;

	/* 32-bit dst divided by anything larger than U32_MAX is 0 */
	if (imm > U32_MAX) {
		wrp_immed(nfp_prog, dst_both, 0);
		return 0;
	}

	/* NOTE: because we are using "reciprocal_value_adv" which doesn't
	 * support "divisor > (1u << 31)", we need to JIT separate NFP sequence
	 * to handle such case which actually equals to the result of unsigned
	 * comparison "dst >= imm" which could be calculated using the following
	 * NFP sequence:
	 *
	 *  alu[--, dst, -, imm]
	 *  immed[imm, 0]
	 *  alu[dst, imm, +carry, 0]
	 *
	 */
	if (imm > 1U << 31) {
		swreg tmp_b = ur_load_imm_any(nfp_prog, imm, imm_b(nfp_prog));

		emit_alu(nfp_prog, reg_none(), dst_a, ALU_OP_SUB, tmp_b);
		wrp_immed(nfp_prog, imm_a(nfp_prog), 0);
		emit_alu(nfp_prog, dst_both, imm_a(nfp_prog), ALU_OP_ADD_C,
			 reg_imm(0));
		return 0;
	}

	rvalue = reciprocal_value_adv(imm, 32);
	exp = rvalue.exp;
	/* Even divisors can be pre-shifted to avoid the wide-m path */
	if (rvalue.is_wide_m && !(imm & 1)) {
		pre_shift = fls(imm & -imm) - 1;
		rvalue = reciprocal_value_adv(imm >> pre_shift, 32 - pre_shift);
	} else {
		pre_shift = 0;
	}
	magic = ur_load_imm_any(nfp_prog, rvalue.m, imm_b(nfp_prog));
	if (imm == 1U << exp) {
		/* Power of two: divide is a right shift */
		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
			 SHF_SC_R_SHF, exp);
	} else if (rvalue.is_wide_m) {
		/* Wide magic number: (dst - hi(dst*m)) >> 1 + hi, then shift */
		wrp_mul_u32(nfp_prog, imm_both(nfp_prog), reg_none(), dst_a,
			    magic, true);
		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_SUB,
			 imm_b(nfp_prog));
		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
			 SHF_SC_R_SHF, 1);
		emit_alu(nfp_prog, dst_both, dst_a, ALU_OP_ADD,
			 imm_b(nfp_prog));
		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE, dst_b,
			 SHF_SC_R_SHF, rvalue.sh - 1);
	} else {
		/* Narrow magic: optional pre-shift, multiply, post-shift */
		if (pre_shift)
			emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
				 dst_b, SHF_SC_R_SHF, pre_shift);
		wrp_mul_u32(nfp_prog, dst_both, reg_none(), dst_a, magic, true);
		emit_shf(nfp_prog, dst_both, reg_none(), SHF_OP_NONE,
			 dst_b, SHF_SC_R_SHF, rvalue.sh);
	}

	return 0;
}
1599 
/* JIT the bpf_xdp_adjust_head() helper.  When the verifier proved there
 * is exactly one call site with validated bounds, emit a short
 * unchecked sequence; otherwise emit the full range/length-checked
 * version that returns -EINVAL on bad deltas.
 */
static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	swreg tmp = imm_a(nfp_prog), tmp_len = imm_b(nfp_prog);
	struct nfp_bpf_cap_adjust_head *adjust_head;
	u32 ret_einval, end;

	adjust_head = &nfp_prog->bpf->adjust_head;

	/* Optimized version - 5 vs 14 cycles */
	if (nfp_prog->adjust_head_location != UINT_MAX) {
		if (WARN_ON_ONCE(nfp_prog->adjust_head_location != meta->n))
			return -EINVAL;

		/* Move pkt ptr forward and shrink both length fields */
		emit_alu(nfp_prog, pptr_reg(nfp_prog),
			 reg_a(2 * 2), ALU_OP_ADD, pptr_reg(nfp_prog));
		emit_alu(nfp_prog, plen_reg(nfp_prog),
			 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
		emit_alu(nfp_prog, pv_len(nfp_prog),
			 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));

		/* Return value 0 in r0 */
		wrp_immed(nfp_prog, reg_both(0), 0);
		wrp_immed(nfp_prog, reg_both(1), 0);

		/* TODO: when adjust head is guaranteed to succeed we can
		 * also eliminate the following if (r0 == 0) branch.
		 */

		return 0;
	}

	/* Fixed-length sequence: error path at +14, success exit at +16 */
	ret_einval = nfp_prog_current_offset(nfp_prog) + 14;
	end = ret_einval + 2;

	/* We need to use a temp because offset is just a part of the pkt ptr */
	emit_alu(nfp_prog, tmp,
		 reg_a(2 * 2), ALU_OP_ADD_2B, pptr_reg(nfp_prog));

	/* Validate result will fit within FW datapath constraints */
	emit_alu(nfp_prog, reg_none(),
		 tmp, ALU_OP_SUB, reg_imm(adjust_head->off_min));
	emit_br(nfp_prog, BR_BLO, ret_einval, 0);
	emit_alu(nfp_prog, reg_none(),
		 reg_imm(adjust_head->off_max), ALU_OP_SUB, tmp);
	emit_br(nfp_prog, BR_BLO, ret_einval, 0);

	/* Validate the length is at least ETH_HLEN */
	emit_alu(nfp_prog, tmp_len,
		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
	emit_alu(nfp_prog, reg_none(),
		 tmp_len, ALU_OP_SUB, reg_imm(ETH_HLEN));
	emit_br(nfp_prog, BR_BMI, ret_einval, 0);

	/* Load the ret code */
	wrp_immed(nfp_prog, reg_both(0), 0);
	wrp_immed(nfp_prog, reg_both(1), 0);

	/* Modify the packet metadata */
	emit_ld_field(nfp_prog, pptr_reg(nfp_prog), 0x3, tmp, SHF_SC_NONE, 0);

	/* Skip over the -EINVAL ret code (defer 2) */
	emit_br(nfp_prog, BR_UNC, end, 2);

	/* These two run in the branch defer slots */
	emit_alu(nfp_prog, plen_reg(nfp_prog),
		 plen_reg(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));
	emit_alu(nfp_prog, pv_len(nfp_prog),
		 pv_len(nfp_prog), ALU_OP_SUB, reg_a(2 * 2));

	/* return -EINVAL target */
	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
		return -EINVAL;

	/* -22 == -EINVAL, sign-extended into r0's high word */
	wrp_immed(nfp_prog, reg_both(0), -22);
	wrp_immed(nfp_prog, reg_both(1), ~0);

	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
		return -EINVAL;

	return 0;
}
1679 
/* JIT the bpf_xdp_adjust_tail() helper: validate the new length
 * (overflow check and >= ETH_HLEN), then update the length registers,
 * returning 0 on success or -EINVAL.
 */
static int adjust_tail(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	u32 ret_einval, end;
	swreg plen, delta;

	BUILD_BUG_ON(plen_reg(nfp_prog) != reg_b(STATIC_REG_PKT_LEN));

	plen = imm_a(nfp_prog);
	delta = reg_a(2 * 2);

	/* Fixed-length sequence: error path at +9, end at +11 */
	ret_einval = nfp_prog_current_offset(nfp_prog) + 9;
	end = nfp_prog_current_offset(nfp_prog) + 11;

	/* Calculate resulting length */
	emit_alu(nfp_prog, plen, plen_reg(nfp_prog), ALU_OP_ADD, delta);
	/* delta == 0 is not allowed by the kernel, add must overflow to make
	 * length smaller.
	 */
	emit_br(nfp_prog, BR_BCC, ret_einval, 0);

	/* if (new_len < 14) then -EINVAL */
	emit_alu(nfp_prog, reg_none(), plen, ALU_OP_SUB, reg_imm(ETH_HLEN));
	emit_br(nfp_prog, BR_BMI, ret_einval, 0);

	emit_alu(nfp_prog, plen_reg(nfp_prog),
		 plen_reg(nfp_prog), ALU_OP_ADD, delta);
	emit_alu(nfp_prog, pv_len(nfp_prog),
		 pv_len(nfp_prog), ALU_OP_ADD, delta);

	/* Success: r0 = 0, set in the two defer slots */
	emit_br(nfp_prog, BR_UNC, end, 2);
	wrp_immed(nfp_prog, reg_both(0), 0);
	wrp_immed(nfp_prog, reg_both(1), 0);

	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_einval))
		return -EINVAL;

	/* -22 == -EINVAL, sign-extended into r0's high word */
	wrp_immed(nfp_prog, reg_both(0), -22);
	wrp_immed(nfp_prog, reg_both(1), ~0);

	if (!nfp_prog_confirm_current_offset(nfp_prog, end))
		return -EINVAL;

	return 0;
}
1724 
/* JIT a map helper call (lookup/update/delete) whose key (and value)
 * live on the stack.  Points LM0 at the key (and LM2 at the value for
 * update), branches to the firmware helper via relocation, and
 * restores LM0 afterwards if it was moved.
 */
static int
map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	bool load_lm_ptr;
	u32 ret_tgt;
	s64 lm_off;

	/* We only have to reload LM0 if the key is not at start of stack */
	lm_off = nfp_prog->stack_frame_depth;
	lm_off += meta->arg2.reg.var_off.value;
	load_lm_ptr = meta->arg2.var_off || lm_off;

	/* Set LM0 to start of key */
	if (load_lm_ptr)
		emit_csr_wr(nfp_prog, reg_b(2 * 2), NFP_CSR_ACT_LM_ADDR0);
	if (meta->func_id == BPF_FUNC_map_update_elem)
		emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2);

	/* Branch to the helper, two defer slots fill A0 and B0 below */
	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
		     2, RELO_BR_HELPER);
	ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;

	/* Load map ID into A0 */
	wrp_mov(nfp_prog, reg_a(0), reg_a(2));

	/* Load the return address into B0 */
	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);

	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
		return -EINVAL;

	/* Reset the LM0 pointer */
	if (!load_lm_ptr)
		return 0;

	emit_csr_wr(nfp_prog, stack_reg(nfp_prog), NFP_CSR_ACT_LM_ADDR0);
	/* CSR write takes effect after a delay - pad with nops */
	wrp_nops(nfp_prog, 3);

	return 0;
}
1765 
/* JIT bpf_get_prandom_u32(): read the pseudo-random CSR into r0,
 * zeroing the upper half of the 64-bit BPF return register.
 */
static int
nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	__emit_csr_rd(nfp_prog, NFP_CSR_PSEUDO_RND_NUM);
	/* CSR value is read in following immed[gpr, 0] */
	emit_immed(nfp_prog, reg_both(0), 0,
		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
	emit_immed(nfp_prog, reg_both(1), 0,
		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
	return 0;
}
1777 
/* JIT bpf_perf_event_output() as a call into the firmware helper:
 * branch via relocation with the pointer type in A1 and the return
 * address in B0 (both set in the branch defer slots).
 */
static int
nfp_perf_event_output(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	swreg ptr_type;
	u32 ret_tgt;

	ptr_type = ur_load_imm_any(nfp_prog, meta->arg1.type, imm_a(nfp_prog));

	ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;

	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
		     2, RELO_BR_HELPER);

	/* Load ptr type into A1 */
	wrp_mov(nfp_prog, reg_a(1), ptr_type);

	/* Load the return address into B0 */
	wrp_immed_relo(nfp_prog, reg_b(0), ret_tgt, RELO_IMMED_REL);

	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
		return -EINVAL;

	return 0;
}
1802 
/* JIT the queue-select helper: if the queue id from the source register
 * fits in 8 bits, record it in the packet vector with the 'queue
 * selected' bit set; otherwise store NFP_NET_RXR_MAX as an invalid
 * marker.
 */
static int
nfp_queue_select(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	u32 jmp_tgt;

	jmp_tgt = nfp_prog_current_offset(nfp_prog) + 5;

	/* Make sure the queue id fits into FW field */
	emit_alu(nfp_prog, reg_none(), reg_a(meta->insn.src_reg * 2),
		 ALU_OP_AND_NOT_B, reg_imm(0xff));
	emit_br(nfp_prog, BR_BEQ, jmp_tgt, 2);

	/* Set the 'queue selected' bit and the queue value */
	emit_shf(nfp_prog, pv_qsel_set(nfp_prog),
		 pv_qsel_set(nfp_prog), SHF_OP_OR, reg_imm(1),
		 SHF_SC_L_SHF, PKT_VEL_QSEL_SET_BIT);
	emit_ld_field(nfp_prog,
		      pv_qsel_val(nfp_prog), 0x1, reg_b(meta->insn.src_reg * 2),
		      SHF_SC_NONE, 0);
	/* Delay slots end here, we will jump over next instruction if queue
	 * value fits into the field.
	 */
	emit_ld_field(nfp_prog,
		      pv_qsel_val(nfp_prog), 0x1, reg_imm(NFP_NET_RXR_MAX),
		      SHF_SC_NONE, 0);

	if (!nfp_prog_confirm_current_offset(nfp_prog, jmp_tgt))
		return -EINVAL;

	return 0;
}
1834 
1835 /* --- Callbacks --- */
/* BPF_MOV64 register: copy a 64-bit register pair.  Moving from R10
 * (frame pointer) materializes the stack address instead, since the FP
 * is not held in a GPR pair.
 */
static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u8 dst = insn->dst_reg * 2;
	u8 src = insn->src_reg * 2;

	if (insn->src_reg == BPF_REG_10) {
		swreg stack_depth_reg;

		/* dst = stack base + current frame depth, high word 0 */
		stack_depth_reg = ur_load_imm_any(nfp_prog,
						  nfp_prog->stack_frame_depth,
						  stack_imm(nfp_prog));
		emit_alu(nfp_prog, reg_both(dst), stack_reg(nfp_prog),
			 ALU_OP_ADD, stack_depth_reg);
		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
	} else {
		wrp_reg_mov(nfp_prog, dst, src);
		wrp_reg_mov(nfp_prog, dst + 1, src + 1);
	}

	return 0;
}
1858 
1859 static int mov_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1860 {
1861 	u64 imm = meta->insn.imm; /* sign extend */
1862 
1863 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2), imm & ~0U);
1864 	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), imm >> 32);
1865 
1866 	return 0;
1867 }
1868 
/* BPF_XOR64 register */
static int xor_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_XOR);
}
1873 
/* BPF_XOR64 immediate; XOR with 0 is a no-op and is skipped */
static int xor_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_XOR, !meta->insn.imm);
}
1878 
/* BPF_AND64 register */
static int and_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_AND);
}
1883 
/* BPF_AND64 immediate; AND with all-ones is a no-op and is skipped */
static int and_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm);
}
1888 
/* BPF_OR64 register */
static int or_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu64_reg(nfp_prog, meta, ALU_OP_OR);
}
1893 
/* BPF_OR64 immediate; OR with 0 is a no-op and is skipped */
static int or_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu64_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm);
}
1898 
1899 static int add_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1900 {
1901 	const struct bpf_insn *insn = &meta->insn;
1902 
1903 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1904 		 reg_a(insn->dst_reg * 2), ALU_OP_ADD,
1905 		 reg_b(insn->src_reg * 2));
1906 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1907 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_ADD_C,
1908 		 reg_b(insn->src_reg * 2 + 1));
1909 
1910 	return 0;
1911 }
1912 
1913 static int add_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1914 {
1915 	const struct bpf_insn *insn = &meta->insn;
1916 	u64 imm = insn->imm; /* sign extend */
1917 
1918 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_ADD, imm & ~0U);
1919 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_ADD_C, imm >> 32);
1920 
1921 	return 0;
1922 }
1923 
1924 static int sub_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1925 {
1926 	const struct bpf_insn *insn = &meta->insn;
1927 
1928 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2),
1929 		 reg_a(insn->dst_reg * 2), ALU_OP_SUB,
1930 		 reg_b(insn->src_reg * 2));
1931 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1),
1932 		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_SUB_C,
1933 		 reg_b(insn->src_reg * 2 + 1));
1934 
1935 	return 0;
1936 }
1937 
1938 static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1939 {
1940 	const struct bpf_insn *insn = &meta->insn;
1941 	u64 imm = insn->imm; /* sign extend */
1942 
1943 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2, ALU_OP_SUB, imm & ~0U);
1944 	wrp_alu_imm(nfp_prog, insn->dst_reg * 2 + 1, ALU_OP_SUB_C, imm >> 32);
1945 
1946 	return 0;
1947 }
1948 
/* BPF_MUL64 register: full 64-bit result, register right operand */
static int mul_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_mul(nfp_prog, meta, true, true);
}
1953 
/* BPF_MUL64 immediate: full 64-bit result, immediate right operand */
static int mul_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_mul(nfp_prog, meta, true, false);
}
1958 
/* BPF_DIV64 immediate: divide by the constant divisor */
static int div_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;

	return wrp_div_imm(nfp_prog, insn->dst_reg * 2, insn->imm);
}
1965 
/* BPF_DIV64 register: only supported when the verifier proved the
 * divisor constant (umin_src == umax_src), so it can be treated as an
 * immediate.
 */
static int div_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	/* NOTE: verifier hook has rejected cases for which verifier doesn't
	 * know whether the source operand is constant or not.
	 */
	return wrp_div_imm(nfp_prog, meta->insn.dst_reg * 2, meta->umin_src);
}
1973 
1974 static int neg_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
1975 {
1976 	const struct bpf_insn *insn = &meta->insn;
1977 
1978 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2), reg_imm(0),
1979 		 ALU_OP_SUB, reg_b(insn->dst_reg * 2));
1980 	emit_alu(nfp_prog, reg_both(insn->dst_reg * 2 + 1), reg_imm(0),
1981 		 ALU_OP_SUB_C, reg_b(insn->dst_reg * 2 + 1));
1982 
1983 	return 0;
1984 }
1985 
/* Pseudo code:
 *   if shift_amt >= 32
 *     dst_high = dst_low << shift_amt[4:0]
 *     dst_low = 0;
 *   else
 *     dst_high = (dst_high, dst_low) >> (32 - shift_amt)
 *     dst_low = dst_low << shift_amt
 *
 * The indirect shift will use the same logic at runtime.
 */
static int __shl_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
{
	/* Shift by zero is a no-op; emit nothing. */
	if (!shift_amt)
		return 0;

	if (shift_amt < 32) {
		/* High word first: the double-shift reads the original
		 * (high, low) pair before the low word is overwritten.
		 */
		emit_shf(nfp_prog, reg_both(dst + 1), reg_a(dst + 1),
			 SHF_OP_NONE, reg_b(dst), SHF_SC_R_DSHF,
			 32 - shift_amt);
		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
			 reg_b(dst), SHF_SC_L_SHF, shift_amt);
	} else if (shift_amt == 32) {
		/* Shift by exactly one word: high = low, low = 0. */
		wrp_reg_mov(nfp_prog, dst + 1, dst);
		wrp_immed(nfp_prog, reg_both(dst), 0);
	} else if (shift_amt > 32) {
		/* Only bits of the old low word survive, in the high word. */
		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
			 reg_b(dst), SHF_SC_L_SHF, shift_amt - 32);
		wrp_immed(nfp_prog, reg_both(dst), 0);
	}

	return 0;
}
2018 
2019 static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2020 {
2021 	const struct bpf_insn *insn = &meta->insn;
2022 	u8 dst = insn->dst_reg * 2;
2023 
2024 	return __shl_imm64(nfp_prog, dst, insn->imm);
2025 }
2026 
/* Indirect-shift high word for a runtime shift amount < 32.
 * Computes 32 - shift_amt into the imm register, loads it as the
 * indirect shift amount via the PREV_ALU result, then double-shifts
 * the original (high, low) pair into the high word.
 */
static void shl_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, imm_both(nfp_prog), reg_imm(32), ALU_OP_SUB,
		 reg_b(src));
	/* OR with 0 sets up PREV_ALU as the indirect shift amount. */
	emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_a(dst + 1), SHF_OP_NONE,
		       reg_b(dst), SHF_SC_R_DSHF);
}

/* NOTE: for indirect left shift, HIGH part should be calculated first. */
static void shl_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
		       reg_b(dst), SHF_SC_L_SHF);
}

/* Full 64-bit left shift for a runtime amount known to be < 32. */
static void shl_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	shl_reg64_lt32_high(nfp_prog, dst, src);
	shl_reg64_lt32_low(nfp_prog, dst, src);
}

/* 64-bit left shift for a runtime amount known to be >= 32: the low
 * word is shifted (by amount[4:0]) into the high word, low becomes 0.
 */
static void shl_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
		       reg_b(dst), SHF_SC_L_SHF);
	wrp_immed(nfp_prog, reg_both(dst), 0);
}
2057 
/* BPF_ALU64 | BPF_LSH | BPF_X: 64-bit shift left by register.
 * Uses verifier-provided bounds on the shift amount to emit the
 * shortest correct sequence.
 */
static int shl_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 umin, umax;
	u8 dst, src;

	dst = insn->dst_reg * 2;
	umin = meta->umin_src;
	umax = meta->umax_src;
	/* Amount is a known constant: reuse the immediate variant. */
	if (umin == umax)
		return __shl_imm64(nfp_prog, dst, umin);

	src = insn->src_reg * 2;
	if (umax < 32) {
		shl_reg64_lt32(nfp_prog, dst, src);
	} else if (umin >= 32) {
		shl_reg64_ge32(nfp_prog, dst, src);
	} else {
		/* Generate different instruction sequences depending on runtime
		 * value of shift amount.
		 */
		u16 label_ge32, label_end;

		/* Branch on bit 5 of the amount (i.e. amount >= 32).  The
		 * +7/+6 offsets are hand-counted lengths of the sequences
		 * emitted below; nfp_prog_confirm_current_offset() verifies
		 * them at JIT time.
		 */
		label_ge32 = nfp_prog_current_offset(nfp_prog) + 7;
		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);

		shl_reg64_lt32_high(nfp_prog, dst, src);
		label_end = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br(nfp_prog, BR_UNC, label_end, 2);
		/* shl_reg64_lt32_low packed in delay slot. */
		shl_reg64_lt32_low(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
			return -EINVAL;
		shl_reg64_ge32(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
			return -EINVAL;
	}

	return 0;
}
2100 
/* Pseudo code:
 *   if shift_amt >= 32
 *     dst_low = dst_high >> shift_amt[4:0]
 *     dst_high = 0;
 *   else
 *     dst_low = (dst_high, dst_low) >> shift_amt
 *     dst_high = dst_high >> shift_amt
 *
 * The indirect shift will use the same logic at runtime.
 */
static int __shr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
{
	/* Shift by zero is a no-op; emit nothing. */
	if (!shift_amt)
		return 0;

	if (shift_amt < 32) {
		/* Low word first: the double-shift reads the original
		 * (high, low) pair before the high word is overwritten.
		 */
		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
	} else if (shift_amt == 32) {
		/* Shift by exactly one word: low = high, high = 0. */
		wrp_reg_mov(nfp_prog, dst, dst + 1);
		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
	} else if (shift_amt > 32) {
		/* Only bits of the old high word survive, in the low word. */
		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
		wrp_immed(nfp_prog, reg_both(dst + 1), 0);
	}

	return 0;
}
2132 
2133 static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2134 {
2135 	const struct bpf_insn *insn = &meta->insn;
2136 	u8 dst = insn->dst_reg * 2;
2137 
2138 	return __shr_imm64(nfp_prog, dst, insn->imm);
2139 }
2140 
/* NOTE: for indirect right shift, LOW part should be calculated first. */
static void shr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	/* OR with 0 sets up PREV_ALU as the indirect shift amount. */
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_NONE,
		       reg_b(dst + 1), SHF_SC_R_SHF);
}

/* Indirect-shift low word: double-shift the (high, low) pair right. */
static void shr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
		       reg_b(dst), SHF_SC_R_DSHF);
}

/* Full 64-bit logical right shift for a runtime amount < 32. */
static void shr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	shr_reg64_lt32_low(nfp_prog, dst, src);
	shr_reg64_lt32_high(nfp_prog, dst, src);
}

/* 64-bit logical right shift for a runtime amount >= 32: the high word
 * is shifted (by amount[4:0]) into the low word, high becomes 0.
 */
static void shr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
		       reg_b(dst + 1), SHF_SC_R_SHF);
	wrp_immed(nfp_prog, reg_both(dst + 1), 0);
}
2169 
/* BPF_ALU64 | BPF_RSH | BPF_X: 64-bit logical shift right by register.
 * Mirrors shl_reg64: verifier bounds select the shortest sequence.
 */
static int shr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 umin, umax;
	u8 dst, src;

	dst = insn->dst_reg * 2;
	umin = meta->umin_src;
	umax = meta->umax_src;
	/* Amount is a known constant: reuse the immediate variant. */
	if (umin == umax)
		return __shr_imm64(nfp_prog, dst, umin);

	src = insn->src_reg * 2;
	if (umax < 32) {
		shr_reg64_lt32(nfp_prog, dst, src);
	} else if (umin >= 32) {
		shr_reg64_ge32(nfp_prog, dst, src);
	} else {
		/* Generate different instruction sequences depending on runtime
		 * value of shift amount.
		 */
		u16 label_ge32, label_end;

		/* +6 offsets are hand-counted sequence lengths, verified by
		 * nfp_prog_confirm_current_offset() below.
		 */
		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
		shr_reg64_lt32_low(nfp_prog, dst, src);
		label_end = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br(nfp_prog, BR_UNC, label_end, 2);
		/* shr_reg64_lt32_high packed in delay slot. */
		shr_reg64_lt32_high(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
			return -EINVAL;
		shr_reg64_ge32(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
			return -EINVAL;
	}

	return 0;
}
2211 
/* Code logic is the same as __shr_imm64 except ashr requires signedness bit
 * told through PREV_ALU result.
 */
static int __ashr_imm64(struct nfp_prog *nfp_prog, u8 dst, u8 shift_amt)
{
	/* Shift by zero is a no-op; emit nothing. */
	if (!shift_amt)
		return 0;

	if (shift_amt < 32) {
		/* Low word first, reading the original (high, low) pair. */
		emit_shf(nfp_prog, reg_both(dst), reg_a(dst + 1), SHF_OP_NONE,
			 reg_b(dst), SHF_SC_R_DSHF, shift_amt);
		/* Set signedness bit. */
		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
			 reg_imm(0));
		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt);
	} else if (shift_amt == 32) {
		/* NOTE: this also helps setting signedness bit. */
		wrp_reg_mov(nfp_prog, dst, dst + 1);
		/* High word becomes 32 copies of the sign bit. */
		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
	} else if (shift_amt > 32) {
		/* Set signedness bit, then sign-shift high into low and
		 * fill high with the sign bit.
		 */
		emit_alu(nfp_prog, reg_none(), reg_a(dst + 1), ALU_OP_OR,
			 reg_imm(0));
		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
			 reg_b(dst + 1), SHF_SC_R_SHF, shift_amt - 32);
		emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
			 reg_b(dst + 1), SHF_SC_R_SHF, 31);
	}

	return 0;
}
2244 
2245 static int ashr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2246 {
2247 	const struct bpf_insn *insn = &meta->insn;
2248 	u8 dst = insn->dst_reg * 2;
2249 
2250 	return __ashr_imm64(nfp_prog, dst, insn->imm);
2251 }
2252 
/* Indirect arithmetic shift of the high word for a runtime amount < 32. */
static void ashr_reg64_lt32_high(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	/* NOTE: the first insn will set both indirect shift amount (source A)
	 * and signedness bit (MSB of result).
	 */
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
	emit_shf_indir(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
		       reg_b(dst + 1), SHF_SC_R_SHF);
}
2262 
2263 static void ashr_reg64_lt32_low(struct nfp_prog *nfp_prog, u8 dst, u8 src)
2264 {
2265 	/* NOTE: it is the same as logic shift because we don't need to shift in
2266 	 * signedness bit when the shift amount is less than 32.
2267 	 */
2268 	return shr_reg64_lt32_low(nfp_prog, dst, src);
2269 }
2270 
/* Full 64-bit arithmetic right shift for a runtime amount < 32. */
static void ashr_reg64_lt32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	ashr_reg64_lt32_low(nfp_prog, dst, src);
	ashr_reg64_lt32_high(nfp_prog, dst, src);
}

/* 64-bit arithmetic right shift for a runtime amount >= 32: high word
 * is sign-shifted into low, then high is filled with the sign bit.
 * The first insn sets both the indirect amount and the signedness bit.
 */
static void ashr_reg64_ge32(struct nfp_prog *nfp_prog, u8 dst, u8 src)
{
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst + 1));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
		       reg_b(dst + 1), SHF_SC_R_SHF);
	emit_shf(nfp_prog, reg_both(dst + 1), reg_none(), SHF_OP_ASHR,
		 reg_b(dst + 1), SHF_SC_R_SHF, 31);
}
2285 
/* Like ashr_imm64, but need to use indirect shift. */
static int ashr_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 umin, umax;
	u8 dst, src;

	dst = insn->dst_reg * 2;
	umin = meta->umin_src;
	umax = meta->umax_src;
	/* Amount is a known constant: reuse the immediate variant. */
	if (umin == umax)
		return __ashr_imm64(nfp_prog, dst, umin);

	src = insn->src_reg * 2;
	if (umax < 32) {
		ashr_reg64_lt32(nfp_prog, dst, src);
	} else if (umin >= 32) {
		ashr_reg64_ge32(nfp_prog, dst, src);
	} else {
		/* Runtime dispatch on bit 5 of the amount; +6 offsets are
		 * hand-counted sequence lengths, verified below.
		 */
		u16 label_ge32, label_end;

		label_ge32 = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br_bset(nfp_prog, reg_a(src), 5, label_ge32, 0);
		ashr_reg64_lt32_low(nfp_prog, dst, src);
		label_end = nfp_prog_current_offset(nfp_prog) + 6;
		emit_br(nfp_prog, BR_UNC, label_end, 2);
		/* ashr_reg64_lt32_high packed in delay slot. */
		ashr_reg64_lt32_high(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_ge32))
			return -EINVAL;
		ashr_reg64_ge32(nfp_prog, dst, src);

		if (!nfp_prog_confirm_current_offset(nfp_prog, label_end))
			return -EINVAL;
	}

	return 0;
}
2325 
2326 static int mov_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2327 {
2328 	const struct bpf_insn *insn = &meta->insn;
2329 
2330 	wrp_reg_mov(nfp_prog, insn->dst_reg * 2,  insn->src_reg * 2);
2331 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2332 
2333 	return 0;
2334 }
2335 
2336 static int mov_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2337 {
2338 	const struct bpf_insn *insn = &meta->insn;
2339 
2340 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2), insn->imm);
2341 	wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0);
2342 
2343 	return 0;
2344 }
2345 
/* BPF_ALU | BPF_XOR | BPF_X: 32-bit dst ^= src. */
static int xor_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_XOR);
}

/* BPF_ALU | BPF_XOR | BPF_K: 32-bit dst ^= imm. */
static int xor_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR);
}

/* BPF_ALU | BPF_AND | BPF_X: 32-bit dst &= src. */
static int and_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_AND);
}

/* BPF_ALU | BPF_AND | BPF_K: 32-bit dst &= imm. */
static int and_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_AND);
}

/* BPF_ALU | BPF_OR | BPF_X: 32-bit dst |= src. */
static int or_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_OR);
}

/* BPF_ALU | BPF_OR | BPF_K: 32-bit dst |= imm. */
static int or_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_OR);
}

/* BPF_ALU | BPF_ADD | BPF_X: 32-bit dst += src. */
static int add_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_ADD);
}

/* BPF_ALU | BPF_ADD | BPF_K: 32-bit dst += imm. */
static int add_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_ADD);
}

/* BPF_ALU | BPF_SUB | BPF_X: 32-bit dst -= src. */
static int sub_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_reg(nfp_prog, meta, ALU_OP_SUB);
}

/* BPF_ALU | BPF_SUB | BPF_K: 32-bit dst -= imm. */
static int sub_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB);
}

/* BPF_ALU | BPF_MUL | BPF_X: 32-bit multiply by register. */
static int mul_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_mul(nfp_prog, meta, false, true);
}

/* BPF_ALU | BPF_MUL | BPF_K: 32-bit multiply by immediate. */
static int mul_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_mul(nfp_prog, meta, false, false);
}

/* BPF_ALU | BPF_DIV | BPF_X: 32-bit divide shares the 64-bit handler. */
static int div_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return div_reg64(nfp_prog, meta);
}

/* BPF_ALU | BPF_DIV | BPF_K: 32-bit divide shares the 64-bit handler. */
static int div_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return div_imm64(nfp_prog, meta);
}
2415 
/* BPF_ALU | BPF_NEG: 32-bit dst = -dst (computed as 0 - dst), with the
 * upper half handled by wrp_zext().
 */
static int neg_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	u8 dst = meta->insn.dst_reg * 2;

	emit_alu(nfp_prog, reg_both(dst), reg_imm(0), ALU_OP_SUB, reg_b(dst));
	wrp_zext(nfp_prog, meta, dst);

	return 0;
}
2425 
/* 32-bit arithmetic right shift by a constant amount.  A zero amount
 * still zero-extends the destination via wrp_zext().
 */
static int
__ashr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst,
	   u8 shift_amt)
{
	if (shift_amt) {
		/* Set signedness bit (MSB of result). */
		emit_alu(nfp_prog, reg_none(), reg_a(dst), ALU_OP_OR,
			 reg_imm(0));
		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
			 reg_b(dst), SHF_SC_R_SHF, shift_amt);
	}
	wrp_zext(nfp_prog, meta, dst);

	return 0;
}
2441 
/* BPF_ALU | BPF_ARSH | BPF_X: 32-bit arithmetic right shift by register;
 * falls back to the immediate path when the verifier proved the amount
 * constant.
 */
static int ashr_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 umin, umax;
	u8 dst, src;

	dst = insn->dst_reg * 2;
	umin = meta->umin_src;
	umax = meta->umax_src;
	if (umin == umax)
		return __ashr_imm(nfp_prog, meta, dst, umin);

	src = insn->src_reg * 2;
	/* NOTE: the first insn will set both indirect shift amount (source A)
	 * and signedness bit (MSB of result).
	 */
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_b(dst));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_ASHR,
		       reg_b(dst), SHF_SC_R_SHF);
	wrp_zext(nfp_prog, meta, dst);

	return 0;
}
2465 
2466 static int ashr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2467 {
2468 	const struct bpf_insn *insn = &meta->insn;
2469 	u8 dst = insn->dst_reg * 2;
2470 
2471 	return __ashr_imm(nfp_prog, meta, dst, insn->imm);
2472 }
2473 
/* 32-bit logical right shift by a constant amount; zero amount still
 * zero-extends the destination via wrp_zext().
 */
static int
__shr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst,
	  u8 shift_amt)
{
	if (shift_amt)
		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
			 reg_b(dst), SHF_SC_R_SHF, shift_amt);
	wrp_zext(nfp_prog, meta, dst);
	return 0;
}
2484 
2485 static int shr_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2486 {
2487 	const struct bpf_insn *insn = &meta->insn;
2488 	u8 dst = insn->dst_reg * 2;
2489 
2490 	return __shr_imm(nfp_prog, meta, dst, insn->imm);
2491 }
2492 
/* BPF_ALU | BPF_RSH | BPF_X: 32-bit logical right shift by register. */
static int shr_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 umin, umax;
	u8 dst, src;

	dst = insn->dst_reg * 2;
	umin = meta->umin_src;
	umax = meta->umax_src;
	/* Constant amount: use the shorter immediate path. */
	if (umin == umax)
		return __shr_imm(nfp_prog, meta, dst, umin);

	src = insn->src_reg * 2;
	/* OR with 0 sets up PREV_ALU as the indirect shift amount. */
	emit_alu(nfp_prog, reg_none(), reg_a(src), ALU_OP_OR, reg_imm(0));
	emit_shf_indir(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
		       reg_b(dst), SHF_SC_R_SHF);
	wrp_zext(nfp_prog, meta, dst);
	return 0;
}
2512 
/* 32-bit shift left by a constant amount; zero amount still
 * zero-extends the destination via wrp_zext().
 */
static int
__shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, u8 dst,
	  u8 shift_amt)
{
	if (shift_amt)
		emit_shf(nfp_prog, reg_both(dst), reg_none(), SHF_OP_NONE,
			 reg_b(dst), SHF_SC_L_SHF, shift_amt);
	wrp_zext(nfp_prog, meta, dst);
	return 0;
}
2523 
2524 static int shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2525 {
2526 	const struct bpf_insn *insn = &meta->insn;
2527 	u8 dst = insn->dst_reg * 2;
2528 
2529 	return __shl_imm(nfp_prog, meta, dst, insn->imm);
2530 }
2531 
/* BPF_ALU | BPF_LSH | BPF_X: 32-bit shift left by register.  Reuses the
 * 64-bit low-word indirect shift helper, then zero-extends.
 */
static int shl_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 umin, umax;
	u8 dst, src;

	dst = insn->dst_reg * 2;
	umin = meta->umin_src;
	umax = meta->umax_src;
	/* Constant amount: use the shorter immediate path. */
	if (umin == umax)
		return __shl_imm(nfp_prog, meta, dst, umin);

	src = insn->src_reg * 2;
	shl_reg64_lt32_low(nfp_prog, dst, src);
	wrp_zext(nfp_prog, meta, dst);
	return 0;
}
2549 
/* BPF_END (byte swap); insn->imm selects the operand width (16/32/64). */
static int end_reg32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u8 gpr = insn->dst_reg * 2;

	switch (insn->imm) {
	case 16:
		/* Swap the two low bytes via ld_field byte masks, keep the
		 * upper 16 bits of the low word clear, zero the high word.
		 */
		emit_ld_field(nfp_prog, reg_both(gpr), 0x9, reg_b(gpr),
			      SHF_SC_R_ROT, 8);
		emit_ld_field(nfp_prog, reg_both(gpr), 0xe, reg_a(gpr),
			      SHF_SC_R_SHF, 16);

		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
		break;
	case 32:
		wrp_end32(nfp_prog, reg_a(gpr), gpr);
		wrp_immed(nfp_prog, reg_both(gpr + 1), 0);
		break;
	case 64:
		/* Swap each 32-bit half and exchange the halves; the old
		 * high word is staged in the imm register first.
		 */
		wrp_mov(nfp_prog, imm_a(nfp_prog), reg_b(gpr + 1));

		wrp_end32(nfp_prog, reg_a(gpr), gpr + 1);
		wrp_end32(nfp_prog, imm_a(nfp_prog), gpr);
		break;
	}

	return 0;
}
2578 
/* Second half of a BPF_LD | BPF_IMM | BPF_DW: the previous BPF insn
 * carries the low 32 bits, this one the high 32 bits.
 */
static int imm_ld8_part2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	struct nfp_insn_meta *prev = nfp_meta_prev(meta);
	u32 imm_lo, imm_hi;
	u8 dst;

	dst = prev->insn.dst_reg * 2;
	imm_lo = prev->insn.imm;
	imm_hi = meta->insn.imm;

	wrp_immed(nfp_prog, reg_both(dst), imm_lo);

	/* mov is always 1 insn, load imm may be two, so try to use mov */
	if (imm_hi == imm_lo)
		wrp_mov(nfp_prog, reg_both(dst + 1), reg_a(dst));
	else
		wrp_immed(nfp_prog, reg_both(dst + 1), imm_hi);

	return 0;
}

/* First half of a 64-bit immediate load: defer emission to part2 once
 * both halves are available.
 */
static int imm_ld8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	meta->double_cb = imm_ld8_part2;
	return 0;
}
2605 
/* BPF_LD | BPF_ABS: absolute packet loads of 1/2/4 bytes. */
static int data_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return construct_data_ld(nfp_prog, meta, meta->insn.imm, 1);
}

static int data_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return construct_data_ld(nfp_prog, meta, meta->insn.imm, 2);
}

static int data_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return construct_data_ld(nfp_prog, meta, meta->insn.imm, 4);
}

/* BPF_LD | BPF_IND: packet loads indexed by src register plus imm. */
static int data_ind_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm,
				     meta->insn.src_reg * 2, 1);
}

static int data_ind_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm,
				     meta->insn.src_reg * 2, 2);
}

static int data_ind_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return construct_data_ind_ld(nfp_prog, meta, meta->insn.imm,
				     meta->insn.src_reg * 2, 4);
}
2638 
/* Load from the BPF stack (local memory) into dst_reg. */
static int
mem_ldx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	      unsigned int size, unsigned int ptr_off)
{
	return mem_op_stack(nfp_prog, meta, size, ptr_off,
			    meta->insn.dst_reg * 2, meta->insn.src_reg * 2,
			    true, wrp_lmem_load);
}
2647 
/* Load a supported struct __sk_buff context field; only full-width
 * accesses to len/data/data_end are translated, anything else is
 * rejected with -EOPNOTSUPP.
 */
static int mem_ldx_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
		       u8 size)
{
	swreg dst = reg_both(meta->insn.dst_reg * 2);

	switch (meta->insn.off) {
	case offsetof(struct __sk_buff, len):
		if (size != sizeof_field(struct __sk_buff, len))
			return -EOPNOTSUPP;
		wrp_mov(nfp_prog, dst, plen_reg(nfp_prog));
		break;
	case offsetof(struct __sk_buff, data):
		if (size != sizeof_field(struct __sk_buff, data))
			return -EOPNOTSUPP;
		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
		break;
	case offsetof(struct __sk_buff, data_end):
		if (size != sizeof_field(struct __sk_buff, data_end))
			return -EOPNOTSUPP;
		/* data_end = packet pointer + packet length */
		emit_alu(nfp_prog, dst,
			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
		break;
	default:
		return -EOPNOTSUPP;
	}

	/* Context fields are 32-bit values; clear the high word. */
	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);

	return 0;
}
2678 
/* Load a supported struct xdp_md context field; only full-width
 * accesses to data/data_end are translated.
 */
static int mem_ldx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
		       u8 size)
{
	swreg dst = reg_both(meta->insn.dst_reg * 2);

	switch (meta->insn.off) {
	case offsetof(struct xdp_md, data):
		if (size != sizeof_field(struct xdp_md, data))
			return -EOPNOTSUPP;
		wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog));
		break;
	case offsetof(struct xdp_md, data_end):
		if (size != sizeof_field(struct xdp_md, data_end))
			return -EOPNOTSUPP;
		/* data_end = packet pointer + packet length */
		emit_alu(nfp_prog, dst,
			 plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog));
		break;
	default:
		return -EOPNOTSUPP;
	}

	/* Context fields are 32-bit values; clear the high word. */
	wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0);

	return 0;
}
2704 
/* Load from packet data: 32-bit address space, offset from src_reg. */
static int
mem_ldx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	     unsigned int size)
{
	swreg tmp_reg;

	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));

	return data_ld_host_order_addr32(nfp_prog, meta, meta->insn.src_reg * 2,
					 tmp_reg, meta->insn.dst_reg * 2, size);
}

/* Load from external memory (map values): 40-bit address space. */
static int
mem_ldx_emem(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	     unsigned int size)
{
	swreg tmp_reg;

	tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));

	return data_ld_host_order_addr40(nfp_prog, meta, meta->insn.src_reg * 2,
					 tmp_reg, meta->insn.dst_reg * 2, size);
}
2728 
/* Prime the packet cache: read the byte range [range_start, range_end)
 * of packet data into transfer-in registers so that subsequent loads in
 * the same cached range can be served without further memory accesses.
 */
static void
mem_ldx_data_init_pktcache(struct nfp_prog *nfp_prog,
			   struct nfp_insn_meta *meta)
{
	s16 range_start = meta->pkt_cache.range_start;
	s16 range_end = meta->pkt_cache.range_end;
	swreg src_base, off;
	u8 xfer_num, len;
	bool indir;

	off = re_load_imm_any(nfp_prog, range_start, imm_b(nfp_prog));
	src_base = reg_a(meta->insn.src_reg * 2);
	len = range_end - range_start;
	xfer_num = round_up(len, REG_WIDTH) / REG_WIDTH;

	/* More than 8 transfer registers requires indirect-ref mode. */
	indir = len > 8 * REG_WIDTH;
	/* Setup PREV_ALU for indirect mode. */
	if (indir)
		wrp_immed(nfp_prog, reg_none(),
			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));

	/* Cache memory into transfer-in registers. */
	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base,
		     off, xfer_num - 1, CMD_CTX_SWAP, indir);
}
2754 
/* Serve a load from the packet cache when the access offset is not
 * aligned to a transfer register: the value may straddle up to three
 * consecutive transfer registers and is assembled with subpart moves.
 */
static int
mem_ldx_data_from_pktcache_unaligned(struct nfp_prog *nfp_prog,
				     struct nfp_insn_meta *meta,
				     unsigned int size)
{
	s16 range_start = meta->pkt_cache.range_start;
	s16 insn_off = meta->insn.off - range_start;	/* offset into cache */
	swreg dst_lo, dst_hi, src_lo, src_mid;
	u8 dst_gpr = meta->insn.dst_reg * 2;
	u8 len_lo = size, len_mid = 0;
	u8 idx = insn_off / REG_WIDTH;	/* first xfer register index */
	u8 off = insn_off % REG_WIDTH;	/* byte offset within it */

	dst_hi = reg_both(dst_gpr + 1);
	dst_lo = reg_both(dst_gpr);
	src_lo = reg_xfer(idx);

	/* The read length could involve as many as three registers. */
	if (size > REG_WIDTH - off) {
		/* Calculate the part in the second register. */
		len_lo = REG_WIDTH - off;
		len_mid = size - len_lo;

		/* Calculate the part in the third register. */
		if (size > 2 * REG_WIDTH - off)
			len_mid = REG_WIDTH;
	}

	wrp_reg_subpart(nfp_prog, dst_lo, src_lo, len_lo, off);

	/* Entirely inside one xfer register: done after zero-extension. */
	if (!len_mid) {
		wrp_zext(nfp_prog, meta, dst_gpr);
		return 0;
	}

	src_mid = reg_xfer(idx + 1);

	if (size <= REG_WIDTH) {
		/* Two registers feeding a single destination word. */
		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid, len_mid, len_lo);
		wrp_zext(nfp_prog, meta, dst_gpr);
	} else {
		/* Three registers: assemble both destination words. */
		swreg src_hi = reg_xfer(idx + 2);

		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid,
				   REG_WIDTH - len_lo, len_lo);
		wrp_reg_subpart(nfp_prog, dst_hi, src_mid, len_lo,
				REG_WIDTH - len_lo);
		wrp_reg_or_subpart(nfp_prog, dst_hi, src_hi, REG_WIDTH - len_lo,
				   len_lo);
	}

	return 0;
}
2808 
/* Serve a load from the packet cache when the access offset is aligned
 * to a transfer register boundary: a plain subpart/move per word.
 */
static int
mem_ldx_data_from_pktcache_aligned(struct nfp_prog *nfp_prog,
				   struct nfp_insn_meta *meta,
				   unsigned int size)
{
	swreg dst_lo, dst_hi, src_lo;
	u8 dst_gpr, idx;

	idx = (meta->insn.off - meta->pkt_cache.range_start) / REG_WIDTH;
	dst_gpr = meta->insn.dst_reg * 2;
	dst_hi = reg_both(dst_gpr + 1);
	dst_lo = reg_both(dst_gpr);
	src_lo = reg_xfer(idx);

	if (size < REG_WIDTH) {
		/* Sub-word load: extract the low bytes, zero-extend. */
		wrp_reg_subpart(nfp_prog, dst_lo, src_lo, size, 0);
		wrp_zext(nfp_prog, meta, dst_gpr);
	} else if (size == REG_WIDTH) {
		/* Exactly one word. */
		wrp_mov(nfp_prog, dst_lo, src_lo);
		wrp_zext(nfp_prog, meta, dst_gpr);
	} else {
		/* Two words (8-byte load). */
		swreg src_hi = reg_xfer(idx + 1);

		wrp_mov(nfp_prog, dst_lo, src_lo);
		wrp_mov(nfp_prog, dst_hi, src_hi);
	}

	return 0;
}
2838 
/* Dispatch a packet-cache load to the aligned or unaligned variant
 * based on the access offset within the cached range.
 */
static int
mem_ldx_data_from_pktcache(struct nfp_prog *nfp_prog,
			   struct nfp_insn_meta *meta, unsigned int size)
{
	u8 off = meta->insn.off - meta->pkt_cache.range_start;

	if (IS_ALIGNED(off, REG_WIDTH))
		return mem_ldx_data_from_pktcache_aligned(nfp_prog, meta, size);

	return mem_ldx_data_from_pktcache_unaligned(nfp_prog, meta, size);
}
2850 
/* BPF_LDX: dispatch a memory load on the verifier-tracked pointer type
 * (context, packet, stack, or map value).
 */
static int
mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	unsigned int size)
{
	/* Loads that were paired into a memcpy are handled as one unit. */
	if (meta->ldst_gather_len)
		return nfp_cpp_memcpy(nfp_prog, meta);

	if (meta->ptr.type == PTR_TO_CTX) {
		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
			return mem_ldx_xdp(nfp_prog, meta, size);
		else
			return mem_ldx_skb(nfp_prog, meta, size);
	}

	if (meta->ptr.type == PTR_TO_PACKET) {
		/* Prefer the packet cache when this access falls into a
		 * pre-computed cached range.
		 */
		if (meta->pkt_cache.range_end) {
			if (meta->pkt_cache.do_init)
				mem_ldx_data_init_pktcache(nfp_prog, meta);

			return mem_ldx_data_from_pktcache(nfp_prog, meta, size);
		} else {
			return mem_ldx_data(nfp_prog, meta, size);
		}
	}

	if (meta->ptr.type == PTR_TO_STACK)
		return mem_ldx_stack(nfp_prog, meta, size, meta->ptr.var_off.value);

	if (meta->ptr.type == PTR_TO_MAP_VALUE)
		return mem_ldx_emem(nfp_prog, meta, size);

	return -EOPNOTSUPP;
}
2884 
/* BPF_LDX size variants: 1/2/4/8-byte loads. */
static int mem_ldx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_ldx(nfp_prog, meta, 1);
}

static int mem_ldx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_ldx(nfp_prog, meta, 2);
}

static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_ldx(nfp_prog, meta, 4);
}

static int mem_ldx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_ldx(nfp_prog, meta, 8);
}
2904 
/* BPF_ST to packet data: store an immediate value at dst_reg + off. */
static int
mem_st_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	    unsigned int size)
{
	u64 imm = meta->insn.imm; /* sign extend */
	swreg off_reg;

	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));

	return data_st_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
				  imm, size);
}

/* BPF_ST dispatch: only packet pointers are supported. */
static int mem_st(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
		  unsigned int size)
{
	if (meta->ptr.type == PTR_TO_PACKET)
		return mem_st_data(nfp_prog, meta, size);

	return -EOPNOTSUPP;
}
2926 
/* BPF_ST size variants: 1/2/4/8-byte immediate stores. */
static int mem_st1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_st(nfp_prog, meta, 1);
}

static int mem_st2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_st(nfp_prog, meta, 2);
}

static int mem_st4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_st(nfp_prog, meta, 4);
}

static int mem_st8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return mem_st(nfp_prog, meta, 8);
}
2946 
/* BPF_STX to packet data: store src_reg at dst_reg + off. */
static int
mem_stx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	     unsigned int size)
{
	swreg off_reg;

	off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));

	return data_stx_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg,
				   meta->insn.src_reg * 2, size);
}

/* BPF_STX to the BPF stack (local memory). */
static int
mem_stx_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	      unsigned int size, unsigned int ptr_off)
{
	return mem_op_stack(nfp_prog, meta, size, ptr_off,
			    meta->insn.src_reg * 2, meta->insn.dst_reg * 2,
			    false, wrp_lmem_store);
}
2967 
/* BPF_STX to the XDP context: only rx_queue_index (queue select) is
 * writable; anything else should have been rejected by the verifier.
 */
static int mem_stx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	switch (meta->insn.off) {
	case offsetof(struct xdp_md, rx_queue_index):
		return nfp_queue_select(nfp_prog, meta);
	}

	WARN_ON_ONCE(1); /* verifier should have rejected bad accesses */
	return -EOPNOTSUPP;
}
2978 
/* BPF_STX dispatch on pointer type: packet or stack. */
static int
mem_stx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	unsigned int size)
{
	if (meta->ptr.type == PTR_TO_PACKET)
		return mem_stx_data(nfp_prog, meta, size);

	if (meta->ptr.type == PTR_TO_STACK)
		return mem_stx_stack(nfp_prog, meta, size, meta->ptr.var_off.value);

	return -EOPNOTSUPP;
}
2991 
2992 static int mem_stx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2993 {
2994 	return mem_stx(nfp_prog, meta, 1);
2995 }
2996 
2997 static int mem_stx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
2998 {
2999 	return mem_stx(nfp_prog, meta, 2);
3000 }
3001 
3002 static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3003 {
3004 	if (meta->ptr.type == PTR_TO_CTX)
3005 		if (nfp_prog->type == BPF_PROG_TYPE_XDP)
3006 			return mem_stx_xdp(nfp_prog, meta);
3007 	return mem_stx(nfp_prog, meta, 4);
3008 }
3009 
3010 static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3011 {
3012 	return mem_stx(nfp_prog, meta, 8);
3013 }
3014 
static int
mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64)
{
	u8 dst_gpr = meta->insn.dst_reg * 2;
	u8 src_gpr = meta->insn.src_reg * 2;
	unsigned int full_add, out;
	swreg addra, addrb, off;

	off = ur_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));

	/* We can fit 16 bits into command immediate, if we know the immediate
	 * is guaranteed to either always or never fit into 16 bit we only
	 * generate code to handle that particular case, otherwise generate
	 * code for both.
	 *
	 * Pre-compute two landmark offsets:
	 *   full_add - where the full (non-immediate) add sequence starts;
	 *   out      - first instruction past the whole xadd sequence.
	 * Every increment below mirrors the instruction count of the
	 * corresponding emission block further down, and is cross-checked
	 * against reality via nfp_prog_confirm_current_offset().
	 */
	out = nfp_prog_current_offset(nfp_prog);
	full_add = nfp_prog_current_offset(nfp_prog);

	/* Two ALU ops add the insn offset to the address. */
	if (meta->insn.off) {
		out += 2;
		full_add += 2;
	}
	/* Three ops make up the add_imm variant (immed, or_subpart, cmd). */
	if (meta->xadd_maybe_16bit) {
		out += 3;
		full_add += 3;
	}
	/* Full add: cmd plus one xfer mov (two for 64 bit). */
	if (meta->xadd_over_16bit)
		out += 2 + is64;
	/* Both variants: 4-insn selection branch plus the branch that skips
	 * over the full add after add_imm.
	 */
	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
		out += 5;
		full_add += 5;
	}

	/* Generate the branch for choosing add_imm vs add */
	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
		swreg max_imm = imm_a(nfp_prog);

		/* Branch to full_add when the source value does not fit in
		 * 16 bits, i.e. when 0xffff - src borrows (BR_BLO).
		 */
		wrp_immed(nfp_prog, max_imm, 0xffff);
		emit_alu(nfp_prog, reg_none(),
			 max_imm, ALU_OP_SUB, reg_b(src_gpr));
		emit_alu(nfp_prog, reg_none(),
			 reg_imm(0), ALU_OP_SUB_C, reg_b(src_gpr + 1));
		emit_br(nfp_prog, BR_BLO, full_add, meta->insn.off ? 2 : 0);
		/* defer for add */
	}

	/* If insn has an offset add to the address */
	if (!meta->insn.off) {
		addra = reg_a(dst_gpr);
		addrb = reg_b(dst_gpr + 1);
	} else {
		/* 64-bit address + offset with carry propagation. */
		emit_alu(nfp_prog, imma_a(nfp_prog),
			 reg_a(dst_gpr), ALU_OP_ADD, off);
		emit_alu(nfp_prog, imma_b(nfp_prog),
			 reg_a(dst_gpr + 1), ALU_OP_ADD_C, reg_imm(0));
		addra = imma_a(nfp_prog);
		addrb = imma_b(nfp_prog);
	}

	/* Generate the add_imm if 16 bits are possible */
	if (meta->xadd_maybe_16bit) {
		swreg prev_alu = imm_a(nfp_prog);

		/* Build the command override word carrying the 16-bit source
		 * value in the command immediate.
		 */
		wrp_immed(nfp_prog, prev_alu,
			  FIELD_PREP(CMD_OVE_DATA, 2) |
			  CMD_OVE_LEN |
			  FIELD_PREP(CMD_OV_LEN, 0x8 | is64 << 2));
		wrp_reg_or_subpart(nfp_prog, prev_alu, reg_b(src_gpr), 2, 2);
		emit_cmd_indir(nfp_prog, CMD_TGT_ADD_IMM, CMD_MODE_40b_BA, 0,
			       addra, addrb, 0, CMD_CTX_NO_SWAP);

		/* Skip the full add when both variants were emitted. */
		if (meta->xadd_over_16bit)
			emit_br(nfp_prog, BR_UNC, out, 0);
	}

	if (!nfp_prog_confirm_current_offset(nfp_prog, full_add))
		return -EINVAL;

	/* Generate the add if 16 bits are not guaranteed */
	if (meta->xadd_over_16bit) {
		emit_cmd(nfp_prog, CMD_TGT_ADD, CMD_MODE_40b_BA, 0,
			 addra, addrb, is64 << 2,
			 is64 ? CMD_CTX_SWAP_DEFER2 : CMD_CTX_SWAP_DEFER1);

		/* The defer slots move the source value into the transfer
		 * registers consumed by the command.
		 */
		wrp_mov(nfp_prog, reg_xfer(0), reg_a(src_gpr));
		if (is64)
			wrp_mov(nfp_prog, reg_xfer(1), reg_a(src_gpr + 1));
	}

	if (!nfp_prog_confirm_current_offset(nfp_prog, out))
		return -EINVAL;

	return 0;
}
3109 
3110 static int mem_atomic4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3111 {
3112 	if (meta->insn.imm != BPF_ADD)
3113 		return -EOPNOTSUPP;
3114 
3115 	return mem_xadd(nfp_prog, meta, false);
3116 }
3117 
3118 static int mem_atomic8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3119 {
3120 	if (meta->insn.imm != BPF_ADD)
3121 		return -EOPNOTSUPP;
3122 
3123 	return mem_xadd(nfp_prog, meta, true);
3124 }
3125 
/* BPF_JA: unconditional jump as a single NFP branch.  The target here is
 * still a BPF insn offset; it is rewritten to an NFP offset later in
 * nfp_fixup_branches().
 */
static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);

	return 0;
}
3132 
/* BPF_JEQ | BPF_K (64 bit): branch when dst == imm. */
static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 imm = insn->imm; /* sign extend */
	swreg or1, or2, tmp_reg;

	/* Default to ORing the register halves directly; any half with a
	 * non-zero immediate is replaced by dst ^ imm below.
	 */
	or1 = reg_a(insn->dst_reg * 2);
	or2 = reg_b(insn->dst_reg * 2 + 1);

	/* Low 32 bits of the immediate. */
	if (imm & ~0U) {
		tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
		emit_alu(nfp_prog, imm_a(nfp_prog),
			 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
		or1 = imm_a(nfp_prog);
	}

	/* High 32 bits of the immediate. */
	if (imm >> 32) {
		tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
		emit_alu(nfp_prog, imm_b(nfp_prog),
			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
		or2 = imm_b(nfp_prog);
	}

	/* OR both halves - equal iff the combined result is zero. */
	emit_alu(nfp_prog, reg_none(), or1, ALU_OP_OR, or2);
	emit_br(nfp_prog, BR_BEQ, insn->off, 0);

	return 0;
}
3161 
3162 static int jeq32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3163 {
3164 	const struct bpf_insn *insn = &meta->insn;
3165 	swreg tmp_reg;
3166 
3167 	tmp_reg = ur_load_imm_any(nfp_prog, insn->imm, imm_b(nfp_prog));
3168 	emit_alu(nfp_prog, reg_none(),
3169 		 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
3170 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3171 
3172 	return 0;
3173 }
3174 
/* BPF_JSET | BPF_K: branch when (dst & imm) is non-zero. */
static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 imm = insn->imm; /* sign extend */
	u8 dst_gpr = insn->dst_reg * 2;
	swreg tmp_reg;

	/* AND the low word with the mask; the result goes to imm_b() so
	 * the 64-bit case below can OR the high word into it.
	 */
	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
	emit_alu(nfp_prog, imm_b(nfp_prog),
		 reg_a(dst_gpr), ALU_OP_AND, tmp_reg);
	/* Upper word of the mask can only be 0 or ~0 from sign extension,
	 * so either ignore it or OR the whole thing in.
	 */
	if (is_mbpf_jmp64(meta) && imm >> 32) {
		emit_alu(nfp_prog, reg_none(),
			 reg_a(dst_gpr + 1), ALU_OP_OR, imm_b(nfp_prog));
	}
	/* Taken when any tested bit was set (non-zero result). */
	emit_br(nfp_prog, BR_BNE, insn->off, 0);

	return 0;
}
3196 
/* BPF_JNE | BPF_K: branch when dst != imm (32 or 64 bit). */
static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	const struct bpf_insn *insn = &meta->insn;
	u64 imm = insn->imm; /* sign extend */
	bool is_jmp32 = is_mbpf_jmp32(meta);
	swreg tmp_reg;

	/* Comparing against zero needs no immediate load - just test the
	 * register value(s) and branch on non-zero.
	 */
	if (!imm) {
		if (is_jmp32)
			emit_alu(nfp_prog, reg_none(), reg_none(), ALU_OP_NONE,
				 reg_b(insn->dst_reg * 2));
		else
			emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2),
				 ALU_OP_OR, reg_b(insn->dst_reg * 2 + 1));
		emit_br(nfp_prog, BR_BNE, insn->off, 0);
		return 0;
	}

	/* Jump taken when the low word differs. */
	tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog));
	emit_alu(nfp_prog, reg_none(),
		 reg_a(insn->dst_reg * 2), ALU_OP_XOR, tmp_reg);
	emit_br(nfp_prog, BR_BNE, insn->off, 0);

	if (is_jmp32)
		return 0;

	/* 64-bit compare: jump also taken when the high word differs. */
	tmp_reg = ur_load_imm_any(nfp_prog, imm >> 32, imm_b(nfp_prog));
	emit_alu(nfp_prog, reg_none(),
		 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR, tmp_reg);
	emit_br(nfp_prog, BR_BNE, insn->off, 0);

	return 0;
}
3230 
3231 static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
3232 {
3233 	const struct bpf_insn *insn = &meta->insn;
3234 
3235 	emit_alu(nfp_prog, imm_a(nfp_prog), reg_a(insn->dst_reg * 2),
3236 		 ALU_OP_XOR, reg_b(insn->src_reg * 2));
3237 	if (is_mbpf_jmp64(meta)) {
3238 		emit_alu(nfp_prog, imm_b(nfp_prog),
3239 			 reg_a(insn->dst_reg * 2 + 1), ALU_OP_XOR,
3240 			 reg_b(insn->src_reg * 2 + 1));
3241 		emit_alu(nfp_prog, reg_none(), imm_a(nfp_prog), ALU_OP_OR,
3242 			 imm_b(nfp_prog));
3243 	}
3244 	emit_br(nfp_prog, BR_BEQ, insn->off, 0);
3245 
3246 	return 0;
3247 }
3248 
/* BPF_JSET | BPF_X: branch when (dst & src) is non-zero. */
static int jset_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_test_reg(nfp_prog, meta, ALU_OP_AND, BR_BNE);
}
3253 
/* BPF_JNE | BPF_X: branch when dst != src. */
static int jne_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	return wrp_test_reg(nfp_prog, meta, ALU_OP_XOR, BR_BNE);
}
3258 
/* Emit the call sequence for a BPF-to-BPF (pseudo) call: adjust the LM
 * stack pointer to the callee's frame, pass the return address in
 * ret_reg(), and branch - either directly to the callee, or through the
 * register-push subroutine when the callee clobbers R6~R9.
 */
static int
bpf_to_bpf_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	u32 ret_tgt, stack_depth, offset_br;
	swreg tmp_reg;

	stack_depth = round_up(nfp_prog->stack_frame_depth, STACK_FRAME_ALIGN);
	/* Space for saving the return address is accounted for by the callee,
	 * so stack_depth can be zero for the main function.
	 */
	if (stack_depth) {
		/* Advance the LM stack pointer past the caller's frame. */
		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
					  stack_imm(nfp_prog));
		emit_alu(nfp_prog, stack_reg(nfp_prog),
			 stack_reg(nfp_prog), ALU_OP_ADD, tmp_reg);
		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
			    NFP_CSR_ACT_LM_ADDR0);
	}

	/* Two cases for jumping to the callee:
	 *
	 * - If callee uses and needs to save R6~R9 then:
	 *     1. Put the start offset of the callee into imm_b(). This will
	 *        require a fixup step, as we do not necessarily know this
	 *        address yet.
	 *     2. Put the return address from the callee to the caller into
	 *        register ret_reg().
	 *     3. (After defer slots are consumed) Jump to the subroutine that
	 *        pushes the registers to the stack.
	 *   The subroutine acts as a trampoline, and returns to the address in
	 *   imm_b(), i.e. jumps to the callee.
	 *
	 * - If callee does not need to save R6~R9 then just load return
	 *   address to the caller in ret_reg(), and jump to the callee
	 *   directly.
	 *
	 * Using ret_reg() to pass the return address to the callee is set here
	 * as a convention. The callee can then push this address onto its
	 * stack frame in its prologue. The advantages of passing the return
	 * address through ret_reg(), instead of pushing it to the stack right
	 * here, are the following:
	 * - It looks cleaner.
	 * - If the called function is called multiple time, we get a lower
	 *   program size.
	 * - We save two no-op instructions that should be added just before
	 *   the emit_br() when stack depth is not null otherwise.
	 * - If we ever find a register to hold the return address during whole
	 *   execution of the callee, we will not have to push the return
	 *   address to the stack for leaf functions.
	 */
	/* Defensive: the jump destination is recorded before code-gen. */
	if (!meta->jmp_dst) {
		pr_err("BUG: BPF-to-BPF call has no destination recorded\n");
		return -ELOOP;
	}
	if (nfp_prog->subprog[meta->jmp_dst->subprog_idx].needs_reg_push) {
		/* Return lands right past the 3 instructions below. */
		ret_tgt = nfp_prog_current_offset(nfp_prog) + 3;
		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2,
			     RELO_BR_GO_CALL_PUSH_REGS);
		offset_br = nfp_prog_current_offset(nfp_prog);
		/* Zero placeholder, patched with the callee's NFP offset by
		 * nfp_fixup_immed_relo().
		 */
		wrp_immed_relo(nfp_prog, imm_b(nfp_prog), 0, RELO_IMMED_REL);
	} else {
		/* Return lands right past the 2 instructions below. */
		ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
		emit_br(nfp_prog, BR_UNC, meta->insn.imm, 1);
		offset_br = nfp_prog_current_offset(nfp_prog);
	}
	wrp_immed_relo(nfp_prog, ret_reg(nfp_prog), ret_tgt, RELO_IMMED_REL);

	if (!nfp_prog_confirm_current_offset(nfp_prog, ret_tgt))
		return -EINVAL;

	if (stack_depth) {
		/* Restore the caller's LM stack pointer after the call. */
		tmp_reg = ur_load_imm_any(nfp_prog, stack_depth,
					  stack_imm(nfp_prog));
		emit_alu(nfp_prog, stack_reg(nfp_prog),
			 stack_reg(nfp_prog), ALU_OP_SUB, tmp_reg);
		emit_csr_wr(nfp_prog, stack_reg(nfp_prog),
			    NFP_CSR_ACT_LM_ADDR0);
		/* NOTE(review): the 3 nops presumably give the LM address
		 * CSR write time to settle before the next LM access -
		 * confirm against the CSR write latency rules.
		 */
		wrp_nops(nfp_prog, 3);
	}

	/* Record how many insns follow the call branch so the branch fixup
	 * pass can locate it (see nfp_fixup_branches()).
	 */
	meta->num_insns_after_br = nfp_prog_current_offset(nfp_prog);
	meta->num_insns_after_br -= offset_br;

	return 0;
}
3344 
/* Lower a call to a supported BPF helper.  The verifier stage only lets
 * supported helpers through, so an unknown id here is a verifier/JIT
 * disagreement.
 */
static int helper_call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	switch (meta->insn.imm) {
	case BPF_FUNC_xdp_adjust_head:
		return adjust_head(nfp_prog, meta);
	case BPF_FUNC_xdp_adjust_tail:
		return adjust_tail(nfp_prog, meta);
	case BPF_FUNC_map_lookup_elem:
	case BPF_FUNC_map_update_elem:
	case BPF_FUNC_map_delete_elem:
		/* All map ops share one call-with-stack-args sequence. */
		return map_call_stack_common(nfp_prog, meta);
	case BPF_FUNC_get_prandom_u32:
		return nfp_get_prandom_u32(nfp_prog, meta);
	case BPF_FUNC_perf_event_output:
		return nfp_perf_event_output(nfp_prog, meta);
	default:
		WARN_ONCE(1, "verifier allowed unsupported function\n");
		return -EOPNOTSUPP;
	}
}
3365 
/* BPF_CALL dispatch: pseudo calls go to another BPF function, everything
 * else is a helper call.
 */
static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	if (is_mbpf_pseudo_call(meta))
		return bpf_to_bpf_call(nfp_prog, meta);

	return helper_call(nfp_prog, meta);
}
3373 
/* The main function is always subprogram 0. */
static bool nfp_is_main_function(struct nfp_insn_meta *meta)
{
	return meta->subprog_idx == 0;
}
3378 
/* Branch to the program's common exit sequence; the target is filled in
 * when the RELO_BR_GO_OUT relocation is resolved.
 */
static int goto_out(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 0, RELO_BR_GO_OUT);

	return 0;
}
3385 
/* Emit the return sequence for a subprogram.  The pop-registers
 * subroutine returns to the caller itself (it consumes ret_reg()), so an
 * explicit emit_rtn() is only needed on the no-push path.
 */
static int
nfp_subprog_epilogue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	if (nfp_prog->subprog[meta->subprog_idx].needs_reg_push) {
		/* Pop R6~R9 to the stack via related subroutine.
		 * We loaded the return address to the caller into ret_reg().
		 * This means that the subroutine does not come back here, we
		 * make it jump back to the subprogram caller directly!
		 */
		emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 1,
			     RELO_BR_GO_CALL_POP_REGS);
		/* Pop return address from the stack. */
		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
	} else {
		/* Pop return address from the stack. */
		wrp_mov(nfp_prog, ret_reg(nfp_prog), reg_lm(0, 0));
		/* Jump back to caller if no callee-saved registers were used
		 * by the subprogram.
		 */
		emit_rtn(nfp_prog, ret_reg(nfp_prog), 0);
	}

	return 0;
}
3410 
/* BPF_EXIT: leaving the main function exits the program; leaving a
 * subprogram returns to its caller.
 */
static int jmp_exit(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	if (nfp_is_main_function(meta))
		return goto_out(nfp_prog, meta);

	return nfp_subprog_epilogue(nfp_prog, meta);
}
3418 
/* Translator jump table, indexed directly by the BPF insn.code byte.
 * A NULL entry means the opcode is unsupported and nfp_translate() fails
 * with -ENOENT for it.
 */
static const instr_cb_t instr_cb[256] = {
	/* 64-bit ALU */
	[BPF_ALU64 | BPF_MOV | BPF_X] =	mov_reg64,
	[BPF_ALU64 | BPF_MOV | BPF_K] =	mov_imm64,
	[BPF_ALU64 | BPF_XOR | BPF_X] =	xor_reg64,
	[BPF_ALU64 | BPF_XOR | BPF_K] =	xor_imm64,
	[BPF_ALU64 | BPF_AND | BPF_X] =	and_reg64,
	[BPF_ALU64 | BPF_AND | BPF_K] =	and_imm64,
	[BPF_ALU64 | BPF_OR | BPF_X] =	or_reg64,
	[BPF_ALU64 | BPF_OR | BPF_K] =	or_imm64,
	[BPF_ALU64 | BPF_ADD | BPF_X] =	add_reg64,
	[BPF_ALU64 | BPF_ADD | BPF_K] =	add_imm64,
	[BPF_ALU64 | BPF_SUB | BPF_X] =	sub_reg64,
	[BPF_ALU64 | BPF_SUB | BPF_K] =	sub_imm64,
	[BPF_ALU64 | BPF_MUL | BPF_X] =	mul_reg64,
	[BPF_ALU64 | BPF_MUL | BPF_K] =	mul_imm64,
	[BPF_ALU64 | BPF_DIV | BPF_X] =	div_reg64,
	[BPF_ALU64 | BPF_DIV | BPF_K] =	div_imm64,
	[BPF_ALU64 | BPF_NEG] =		neg_reg64,
	[BPF_ALU64 | BPF_LSH | BPF_X] =	shl_reg64,
	[BPF_ALU64 | BPF_LSH | BPF_K] =	shl_imm64,
	[BPF_ALU64 | BPF_RSH | BPF_X] =	shr_reg64,
	[BPF_ALU64 | BPF_RSH | BPF_K] =	shr_imm64,
	[BPF_ALU64 | BPF_ARSH | BPF_X] = ashr_reg64,
	[BPF_ALU64 | BPF_ARSH | BPF_K] = ashr_imm64,
	/* 32-bit ALU */
	[BPF_ALU | BPF_MOV | BPF_X] =	mov_reg,
	[BPF_ALU | BPF_MOV | BPF_K] =	mov_imm,
	[BPF_ALU | BPF_XOR | BPF_X] =	xor_reg,
	[BPF_ALU | BPF_XOR | BPF_K] =	xor_imm,
	[BPF_ALU | BPF_AND | BPF_X] =	and_reg,
	[BPF_ALU | BPF_AND | BPF_K] =	and_imm,
	[BPF_ALU | BPF_OR | BPF_X] =	or_reg,
	[BPF_ALU | BPF_OR | BPF_K] =	or_imm,
	[BPF_ALU | BPF_ADD | BPF_X] =	add_reg,
	[BPF_ALU | BPF_ADD | BPF_K] =	add_imm,
	[BPF_ALU | BPF_SUB | BPF_X] =	sub_reg,
	[BPF_ALU | BPF_SUB | BPF_K] =	sub_imm,
	[BPF_ALU | BPF_MUL | BPF_X] =	mul_reg,
	[BPF_ALU | BPF_MUL | BPF_K] =	mul_imm,
	[BPF_ALU | BPF_DIV | BPF_X] =	div_reg,
	[BPF_ALU | BPF_DIV | BPF_K] =	div_imm,
	[BPF_ALU | BPF_NEG] =		neg_reg,
	[BPF_ALU | BPF_LSH | BPF_X] =	shl_reg,
	[BPF_ALU | BPF_LSH | BPF_K] =	shl_imm,
	[BPF_ALU | BPF_RSH | BPF_X] =	shr_reg,
	[BPF_ALU | BPF_RSH | BPF_K] =	shr_imm,
	[BPF_ALU | BPF_ARSH | BPF_X] =	ashr_reg,
	[BPF_ALU | BPF_ARSH | BPF_K] =	ashr_imm,
	[BPF_ALU | BPF_END | BPF_X] =	end_reg32,
	/* Loads */
	[BPF_LD | BPF_IMM | BPF_DW] =	imm_ld8,
	[BPF_LD | BPF_ABS | BPF_B] =	data_ld1,
	[BPF_LD | BPF_ABS | BPF_H] =	data_ld2,
	[BPF_LD | BPF_ABS | BPF_W] =	data_ld4,
	[BPF_LD | BPF_IND | BPF_B] =	data_ind_ld1,
	[BPF_LD | BPF_IND | BPF_H] =	data_ind_ld2,
	[BPF_LD | BPF_IND | BPF_W] =	data_ind_ld4,
	[BPF_LDX | BPF_MEM | BPF_B] =	mem_ldx1,
	[BPF_LDX | BPF_MEM | BPF_H] =	mem_ldx2,
	[BPF_LDX | BPF_MEM | BPF_W] =	mem_ldx4,
	[BPF_LDX | BPF_MEM | BPF_DW] =	mem_ldx8,
	/* Stores and atomics */
	[BPF_STX | BPF_MEM | BPF_B] =	mem_stx1,
	[BPF_STX | BPF_MEM | BPF_H] =	mem_stx2,
	[BPF_STX | BPF_MEM | BPF_W] =	mem_stx4,
	[BPF_STX | BPF_MEM | BPF_DW] =	mem_stx8,
	[BPF_STX | BPF_ATOMIC | BPF_W] =	mem_atomic4,
	[BPF_STX | BPF_ATOMIC | BPF_DW] =	mem_atomic8,
	[BPF_ST | BPF_MEM | BPF_B] =	mem_st1,
	[BPF_ST | BPF_MEM | BPF_H] =	mem_st2,
	[BPF_ST | BPF_MEM | BPF_W] =	mem_st4,
	[BPF_ST | BPF_MEM | BPF_DW] =	mem_st8,
	/* 64-bit jumps */
	[BPF_JMP | BPF_JA | BPF_K] =	jump,
	[BPF_JMP | BPF_JEQ | BPF_K] =	jeq_imm,
	[BPF_JMP | BPF_JGT | BPF_K] =	cmp_imm,
	[BPF_JMP | BPF_JGE | BPF_K] =	cmp_imm,
	[BPF_JMP | BPF_JLT | BPF_K] =	cmp_imm,
	[BPF_JMP | BPF_JLE | BPF_K] =	cmp_imm,
	[BPF_JMP | BPF_JSGT | BPF_K] =  cmp_imm,
	[BPF_JMP | BPF_JSGE | BPF_K] =  cmp_imm,
	[BPF_JMP | BPF_JSLT | BPF_K] =  cmp_imm,
	[BPF_JMP | BPF_JSLE | BPF_K] =  cmp_imm,
	[BPF_JMP | BPF_JSET | BPF_K] =	jset_imm,
	[BPF_JMP | BPF_JNE | BPF_K] =	jne_imm,
	[BPF_JMP | BPF_JEQ | BPF_X] =	jeq_reg,
	[BPF_JMP | BPF_JGT | BPF_X] =	cmp_reg,
	[BPF_JMP | BPF_JGE | BPF_X] =	cmp_reg,
	[BPF_JMP | BPF_JLT | BPF_X] =	cmp_reg,
	[BPF_JMP | BPF_JLE | BPF_X] =	cmp_reg,
	[BPF_JMP | BPF_JSGT | BPF_X] =  cmp_reg,
	[BPF_JMP | BPF_JSGE | BPF_X] =  cmp_reg,
	[BPF_JMP | BPF_JSLT | BPF_X] =  cmp_reg,
	[BPF_JMP | BPF_JSLE | BPF_X] =  cmp_reg,
	[BPF_JMP | BPF_JSET | BPF_X] =	jset_reg,
	[BPF_JMP | BPF_JNE | BPF_X] =	jne_reg,
	/* 32-bit jumps */
	[BPF_JMP32 | BPF_JEQ | BPF_K] =	jeq32_imm,
	[BPF_JMP32 | BPF_JGT | BPF_K] =	cmp_imm,
	[BPF_JMP32 | BPF_JGE | BPF_K] =	cmp_imm,
	[BPF_JMP32 | BPF_JLT | BPF_K] =	cmp_imm,
	[BPF_JMP32 | BPF_JLE | BPF_K] =	cmp_imm,
	[BPF_JMP32 | BPF_JSGT | BPF_K] =cmp_imm,
	[BPF_JMP32 | BPF_JSGE | BPF_K] =cmp_imm,
	[BPF_JMP32 | BPF_JSLT | BPF_K] =cmp_imm,
	[BPF_JMP32 | BPF_JSLE | BPF_K] =cmp_imm,
	[BPF_JMP32 | BPF_JSET | BPF_K] =jset_imm,
	[BPF_JMP32 | BPF_JNE | BPF_K] =	jne_imm,
	[BPF_JMP32 | BPF_JEQ | BPF_X] =	jeq_reg,
	[BPF_JMP32 | BPF_JGT | BPF_X] =	cmp_reg,
	[BPF_JMP32 | BPF_JGE | BPF_X] =	cmp_reg,
	[BPF_JMP32 | BPF_JLT | BPF_X] =	cmp_reg,
	[BPF_JMP32 | BPF_JLE | BPF_X] =	cmp_reg,
	[BPF_JMP32 | BPF_JSGT | BPF_X] =cmp_reg,
	[BPF_JMP32 | BPF_JSGE | BPF_X] =cmp_reg,
	[BPF_JMP32 | BPF_JSLT | BPF_X] =cmp_reg,
	[BPF_JMP32 | BPF_JSLE | BPF_X] =cmp_reg,
	[BPF_JMP32 | BPF_JSET | BPF_X] =jset_reg,
	[BPF_JMP32 | BPF_JNE | BPF_X] =	jne_reg,
	/* Calls and exit */
	[BPF_JMP | BPF_CALL] =		call,
	[BPF_JMP | BPF_EXIT] =		jmp_exit,
};
3536 
3537 /* --- Assembler logic --- */
/* Patch the immed instruction right after a call branch (at br_idx + 1)
 * with the callee's NFP offset, for calls routed through the
 * register-push subroutine.  The insn must still hold the zero
 * placeholder written by bpf_to_bpf_call().
 */
static int
nfp_fixup_immed_relo(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
		     struct nfp_insn_meta *jmp_dst, u32 br_idx)
{
	if (immed_get_value(nfp_prog->prog[br_idx + 1])) {
		pr_err("BUG: failed to fix up callee register saving\n");
		return -EINVAL;
	}

	immed_set_value(&nfp_prog->prog[br_idx + 1], jmp_dst->off);

	return 0;
}
3551 
/* Resolve branch targets after code generation: for every BPF jump,
 * locate the NFP branch instruction implementing it and rewrite its
 * offset with the NFP offset of the destination insn.
 */
static int nfp_fixup_branches(struct nfp_prog *nfp_prog)
{
	struct nfp_insn_meta *meta, *jmp_dst;
	u32 idx, br_idx;
	int err;

	list_for_each_entry(meta, &nfp_prog->insns, l) {
		if (meta->flags & FLAG_INSN_SKIP_MASK)
			continue;
		if (!is_mbpf_jmp(meta))
			continue;
		/* Subprogram exits return via the epilogue, no branch. */
		if (meta->insn.code == (BPF_JMP | BPF_EXIT) &&
		    !nfp_is_main_function(meta))
			continue;
		if (is_mbpf_helper_call(meta))
			continue;

		/* The branch is the last NFP insn emitted for this BPF insn,
		 * i.e. the one before the next insn's first NFP insn.
		 */
		if (list_is_last(&meta->l, &nfp_prog->insns))
			br_idx = nfp_prog->last_bpf_off;
		else
			br_idx = list_next_entry(meta, l)->off - 1;

		/* For BPF-to-BPF function call, a stack adjustment sequence is
		 * generated after the return instruction. Therefore, we must
		 * withdraw the length of this sequence to have br_idx pointing
		 * to where the "branch" NFP instruction is expected to be.
		 */
		if (is_mbpf_pseudo_call(meta))
			br_idx -= meta->num_insns_after_br;

		if (!nfp_is_br(nfp_prog->prog[br_idx])) {
			pr_err("Fixup found block not ending in branch %d %02x %016llx!!\n",
			       br_idx, meta->insn.code, nfp_prog->prog[br_idx]);
			return -ELOOP;
		}

		/* Main function exits are handled via relocation, done. */
		if (meta->insn.code == (BPF_JMP | BPF_EXIT))
			continue;

		/* Leave special branches for later */
		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
		    RELO_BR_REL && !is_mbpf_pseudo_call(meta))
			continue;

		if (!meta->jmp_dst) {
			pr_err("Non-exit jump doesn't have destination info recorded!!\n");
			return -ELOOP;
		}

		jmp_dst = meta->jmp_dst;

		if (jmp_dst->flags & FLAG_INSN_SKIP_PREC_DEPENDENT) {
			pr_err("Branch landing on removed instruction!!\n");
			return -ELOOP;
		}

		/* Calls via the register-push trampoline carry the callee
		 * address in the immed insn after the branch - patch it.
		 */
		if (is_mbpf_pseudo_call(meta) &&
		    nfp_prog->subprog[jmp_dst->subprog_idx].needs_reg_push) {
			err = nfp_fixup_immed_relo(nfp_prog, meta,
						   jmp_dst, br_idx);
			if (err)
				return err;
		}

		if (FIELD_GET(OP_RELO_TYPE, nfp_prog->prog[br_idx]) !=
		    RELO_BR_REL)
			continue;

		/* Patch every branch emitted for this BPF insn. */
		for (idx = meta->off; idx <= br_idx; idx++) {
			if (!nfp_is_br(nfp_prog->prog[idx]))
				continue;
			br_set_offset(&nfp_prog->prog[idx], jmp_dst->off);
		}
	}

	return 0;
}
3629 
/* Program prologue: initialize the packet length register from the
 * packet vector length field, masked down to 14 bits.
 */
static void nfp_intro(struct nfp_prog *nfp_prog)
{
	wrp_immed(nfp_prog, plen_reg(nfp_prog), GENMASK(13, 0));
	emit_alu(nfp_prog, plen_reg(nfp_prog),
		 plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog));
}
3636 
/* Subprogram prologue: stash the return address, passed in ret_reg() by
 * the caller, into the first local-memory stack slot.
 */
static void
nfp_subprog_prologue(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	/* Save return address into the stack. */
	wrp_mov(nfp_prog, reg_lm(0, 0), ret_reg(nfp_prog));
}
3643 
/* Switch code generation to a new subprogram: set its (4-byte aligned)
 * stack frame depth and emit its prologue.
 */
static void
nfp_start_subprog(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
{
	unsigned int depth = nfp_prog->subprog[meta->subprog_idx].stack_depth;

	nfp_prog->stack_frame_depth = round_up(depth, 4);
	nfp_subprog_prologue(nfp_prog, meta);
}
3652 
/* True when this insn is the first instruction of a BPF subprogram. */
bool nfp_is_subprog_start(struct nfp_insn_meta *meta)
{
	return meta->flags & FLAG_INSN_IS_SUBPROG_START;
}
3657 
static void nfp_outro_tc_da(struct nfp_prog *nfp_prog)
{
	/* TC direct-action mode:
	 *   0,1   ok        NOT SUPPORTED[1]
	 *   2   drop  0x22 -> drop,  count as stat1
	 *   4,5 nuke  0x02 -> drop
	 *   7  redir  0x44 -> redir, count as stat2
	 *   * unspec  0x11 -> pass,  count as stat0
	 *
	 * [1] We can't support OK and RECLASSIFY because we can't tell TC
	 *     the exact decision made.  We are forced to support UNSPEC
	 *     to handle aborts so that's the only one we handle for passing
	 *     packets up the stack.
	 */
	/* Target for aborts */
	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);

	/* Branch with 2 defer slots - the two insns below still execute. */
	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);

	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x11), SHF_SC_L_SHF, 16);

	/* Target for normal exits */
	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);

	/* if R0 > 7 jump to abort */
	emit_alu(nfp_prog, reg_none(), reg_imm(7), ALU_OP_SUB, reg_b(0));
	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);
	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);

	/* Per-return-code action/stat nibble tables (see map above). */
	wrp_immed(nfp_prog, reg_b(2), 0x41221211);
	wrp_immed(nfp_prog, reg_b(3), 0x41001211);

	/* Shift amount R0 * 4 selects a nibble from the tables. */
	emit_shf(nfp_prog, reg_a(1),
		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 2);

	/* The OR with 0 feeds reg_a(1) through the previous-ALU-result
	 * path used as the indirect shift amount by the next shift.
	 */
	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
	emit_shf(nfp_prog, reg_a(2),
		 reg_imm(0xf), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);

	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
	emit_shf(nfp_prog, reg_b(2),
		 reg_imm(0xf), SHF_OP_AND, reg_b(3), SHF_SC_R_SHF, 0);

	/* Branch with 2 defer slots running the flag write-back below. */
	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);

	emit_shf(nfp_prog, reg_b(2),
		 reg_a(2), SHF_OP_OR, reg_b(2), SHF_SC_L_SHF, 4);
	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
}
3708 
static void nfp_outro_xdp(struct nfp_prog *nfp_prog)
{
	/* XDP return codes:
	 *   0 aborted  0x82 -> drop,  count as stat3
	 *   1    drop  0x22 -> drop,  count as stat1
	 *   2    pass  0x11 -> pass,  count as stat0
	 *   3      tx  0x44 -> redir, count as stat2
	 *   * unknown  0x82 -> drop,  count as stat3
	 */
	/* Target for aborts */
	nfp_prog->tgt_abort = nfp_prog_current_offset(nfp_prog);

	/* Branch with 2 defer slots - the two insns below still execute. */
	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);

	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x82), SHF_SC_L_SHF, 16);

	/* Target for normal exits */
	nfp_prog->tgt_out = nfp_prog_current_offset(nfp_prog);

	/* if R0 > 3 jump to abort */
	emit_alu(nfp_prog, reg_none(), reg_imm(3), ALU_OP_SUB, reg_b(0));
	emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0);

	/* Per-return-code action/stat byte table (see map above). */
	wrp_immed(nfp_prog, reg_b(2), 0x44112282);

	/* Shift amount R0 * 8 selects a byte from the table. */
	emit_shf(nfp_prog, reg_a(1),
		 reg_none(), SHF_OP_NONE, reg_b(0), SHF_SC_L_SHF, 3);

	/* The OR with 0 feeds reg_a(1) through the previous-ALU-result
	 * path used as the indirect shift amount by the next shift.
	 */
	emit_alu(nfp_prog, reg_none(), reg_a(1), ALU_OP_OR, reg_imm(0));
	emit_shf(nfp_prog, reg_b(2),
		 reg_imm(0xff), SHF_OP_AND, reg_b(2), SHF_SC_R_SHF, 0);

	/* Branch with 2 defer slots running the flag write-back below. */
	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO, 2, RELO_BR_NEXT_PKT);

	wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS);
	emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16);
}
3747 
3748 static bool nfp_prog_needs_callee_reg_save(struct nfp_prog *nfp_prog)
3749 {
3750 	unsigned int idx;
3751 
3752 	for (idx = 1; idx < nfp_prog->subprog_cnt; idx++)
3753 		if (nfp_prog->subprog[idx].needs_reg_push)
3754 			return true;
3755 
3756 	return false;
3757 }
3758 
/* Emit the callee-register push subroutine, reached via the
 * RELO_BR_GO_CALL_PUSH_REGS relocation from bpf_to_bpf_call().
 */
static void nfp_push_callee_registers(struct nfp_prog *nfp_prog)
{
	u8 reg;

	/* Subroutine: Save all callee saved registers (R6 ~ R9).
	 * imm_b() holds the return address.
	 */
	nfp_prog->tgt_call_push_regs = nfp_prog_current_offset(nfp_prog);
	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
		u8 adj = (reg - BPF_REG_0) * 2;	/* GPR pair of this BPF reg */
		u8 idx = (reg - BPF_REG_6) * 2;	/* LM slot pair index */

		/* The first slot in the stack frame is used to push the return
		 * address in bpf_to_bpf_call(), start just after.
		 */
		wrp_mov(nfp_prog, reg_lm(0, 1 + idx), reg_b(adj));

		if (reg == BPF_REG_8)
			/* Prepare to jump back, last 3 insns use defer slots */
			emit_rtn(nfp_prog, imm_b(nfp_prog), 3);

		wrp_mov(nfp_prog, reg_lm(0, 1 + idx + 1), reg_b(adj + 1));
	}
}
3783 
/* Emit the callee-register pop subroutine, reached via the
 * RELO_BR_GO_CALL_POP_REGS relocation from nfp_subprog_epilogue().
 */
static void nfp_pop_callee_registers(struct nfp_prog *nfp_prog)
{
	u8 reg;

	/* Subroutine: Restore all callee saved registers (R6 ~ R9).
	 * ret_reg() holds the return address.
	 */
	nfp_prog->tgt_call_pop_regs = nfp_prog_current_offset(nfp_prog);
	for (reg = BPF_REG_6; reg <= BPF_REG_9; reg++) {
		u8 adj = (reg - BPF_REG_0) * 2;	/* GPR pair of this BPF reg */
		u8 idx = (reg - BPF_REG_6) * 2;	/* LM slot pair index */

		/* The first slot in the stack frame holds the return address,
		 * start popping just after that.
		 */
		wrp_mov(nfp_prog, reg_both(adj), reg_lm(0, 1 + idx));

		if (reg == BPF_REG_8)
			/* Prepare to jump back, last 3 insns use defer slots */
			emit_rtn(nfp_prog, ret_reg(nfp_prog), 3);

		wrp_mov(nfp_prog, reg_both(adj + 1), reg_lm(0, 1 + idx + 1));
	}
}
3808 
3809 static void nfp_outro(struct nfp_prog *nfp_prog)
3810 {
3811 	switch (nfp_prog->type) {
3812 	case BPF_PROG_TYPE_SCHED_CLS:
3813 		nfp_outro_tc_da(nfp_prog);
3814 		break;
3815 	case BPF_PROG_TYPE_XDP:
3816 		nfp_outro_xdp(nfp_prog);
3817 		break;
3818 	default:
3819 		WARN_ON(1);
3820 	}
3821 
3822 	if (!nfp_prog_needs_callee_reg_save(nfp_prog))
3823 		return;
3824 
3825 	nfp_push_callee_registers(nfp_prog);
3826 	nfp_pop_callee_registers(nfp_prog);
3827 }
3828 
/* Main translation loop: emit the prologue, translate every BPF insn via
 * the instr_cb table, emit the epilogue, then fix up branch targets.
 * Returns 0 or a negative errno.
 */
static int nfp_translate(struct nfp_prog *nfp_prog)
{
	struct nfp_insn_meta *meta;
	unsigned int depth;
	int err;

	/* Main function (subprog 0) frame depth, aligned to 4 bytes. */
	depth = nfp_prog->subprog[0].stack_depth;
	nfp_prog->stack_frame_depth = round_up(depth, 4);

	nfp_intro(nfp_prog);
	if (nfp_prog->error)
		return nfp_prog->error;

	list_for_each_entry(meta, &nfp_prog->insns, l) {
		instr_cb_t cb = instr_cb[meta->insn.code];

		/* Record the NFP offset of this BPF insn for branch fixup. */
		meta->off = nfp_prog_current_offset(nfp_prog);

		if (nfp_is_subprog_start(meta)) {
			nfp_start_subprog(nfp_prog, meta);
			if (nfp_prog->error)
				return nfp_prog->error;
		}

		/* Insns eliminated by earlier passes count as translated
		 * but emit no code.
		 */
		if (meta->flags & FLAG_INSN_SKIP_MASK) {
			nfp_prog->n_translated++;
			continue;
		}

		/* The previous insn may take over translation of this one
		 * via its double_cb (multi-insn patterns).
		 */
		if (nfp_meta_has_prev(nfp_prog, meta) &&
		    nfp_meta_prev(meta)->double_cb)
			cb = nfp_meta_prev(meta)->double_cb;
		if (!cb)
			return -ENOENT;
		err = cb(nfp_prog, meta);
		if (err)
			return err;
		if (nfp_prog->error)
			return nfp_prog->error;

		nfp_prog->n_translated++;
	}

	nfp_prog->last_bpf_off = nfp_prog_current_offset(nfp_prog) - 1;

	nfp_outro(nfp_prog);
	if (nfp_prog->error)
		return nfp_prog->error;

	/* Pad with nops so the ustore prefetch window cannot run off the
	 * end of the program.
	 */
	wrp_nops(nfp_prog, NFP_USTORE_PREFETCH_WINDOW);
	if (nfp_prog->error)
		return nfp_prog->error;

	return nfp_fixup_branches(nfp_prog);
}
3884 
3885 /* --- Optimizations --- */
/* Scan the register-initialization preamble at the start of the program.
 * Self-XORs (cBPF conversion artifacts) are tolerated and scanning
 * continues past them; the initial R6 = R1 context-pointer copy is
 * flagged for removal.  Stops at the first insn that is neither.
 */
static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog)
{
	struct nfp_insn_meta *meta;

	list_for_each_entry(meta, &nfp_prog->insns, l) {
		struct bpf_insn insn = meta->insn;

		/* Programs converted from cBPF start with register xoring */
		if (insn.code == (BPF_ALU64 | BPF_XOR | BPF_X) &&
		    insn.src_reg == insn.dst_reg)
			continue;

		/* Programs start with R6 = R1 but we ignore the skb pointer */
		if (insn.code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
		    insn.src_reg == 1 && insn.dst_reg == 6)
			meta->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;

		/* Return as soon as something doesn't match */
		if (!(meta->flags & FLAG_INSN_SKIP_MASK))
			return;
	}
}
3908 
3909 /* abs(insn.imm) will fit better into unrestricted reg immediate -
3910  * convert add/sub of a negative number into a sub/add of a positive one.
3911  */
3912 static void nfp_bpf_opt_neg_add_sub(struct nfp_prog *nfp_prog)
3913 {
3914 	struct nfp_insn_meta *meta;
3915 
3916 	list_for_each_entry(meta, &nfp_prog->insns, l) {
3917 		struct bpf_insn insn = meta->insn;
3918 
3919 		if (meta->flags & FLAG_INSN_SKIP_MASK)
3920 			continue;
3921 
3922 		if (!is_mbpf_alu(meta) && !is_mbpf_jmp(meta))
3923 			continue;
3924 		if (BPF_SRC(insn.code) != BPF_K)
3925 			continue;
3926 		if (insn.imm >= 0)
3927 			continue;
3928 
3929 		if (is_mbpf_jmp(meta)) {
3930 			switch (BPF_OP(insn.code)) {
3931 			case BPF_JGE:
3932 			case BPF_JSGE:
3933 			case BPF_JLT:
3934 			case BPF_JSLT:
3935 				meta->jump_neg_op = true;
3936 				break;
3937 			default:
3938 				continue;
3939 			}
3940 		} else {
3941 			if (BPF_OP(insn.code) == BPF_ADD)
3942 				insn.code = BPF_CLASS(insn.code) | BPF_SUB;
3943 			else if (BPF_OP(insn.code) == BPF_SUB)
3944 				insn.code = BPF_CLASS(insn.code) | BPF_ADD;
3945 			else
3946 				continue;
3947 
3948 			meta->insn.code = insn.code | BPF_K;
3949 		}
3950 
3951 		meta->insn.imm = -insn.imm;
3952 	}
3953 }
3954 
3955 /* Remove masking after load since our load guarantees this is not needed */
3956 static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog)
3957 {
3958 	struct nfp_insn_meta *meta1, *meta2;
3959 	static const s32 exp_mask[] = {
3960 		[BPF_B] = 0x000000ffU,
3961 		[BPF_H] = 0x0000ffffU,
3962 		[BPF_W] = 0xffffffffU,
3963 	};
3964 
3965 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
3966 		struct bpf_insn insn, next;
3967 
3968 		insn = meta1->insn;
3969 		next = meta2->insn;
3970 
3971 		if (BPF_CLASS(insn.code) != BPF_LD)
3972 			continue;
3973 		if (BPF_MODE(insn.code) != BPF_ABS &&
3974 		    BPF_MODE(insn.code) != BPF_IND)
3975 			continue;
3976 
3977 		if (next.code != (BPF_ALU64 | BPF_AND | BPF_K))
3978 			continue;
3979 
3980 		if (!exp_mask[BPF_SIZE(insn.code)])
3981 			continue;
3982 		if (exp_mask[BPF_SIZE(insn.code)] != next.imm)
3983 			continue;
3984 
3985 		if (next.src_reg || next.dst_reg)
3986 			continue;
3987 
3988 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST)
3989 			continue;
3990 
3991 		meta2->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
3992 	}
3993 }
3994 
3995 static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog)
3996 {
3997 	struct nfp_insn_meta *meta1, *meta2, *meta3;
3998 
3999 	nfp_for_each_insn_walk3(nfp_prog, meta1, meta2, meta3) {
4000 		struct bpf_insn insn, next1, next2;
4001 
4002 		insn = meta1->insn;
4003 		next1 = meta2->insn;
4004 		next2 = meta3->insn;
4005 
4006 		if (BPF_CLASS(insn.code) != BPF_LD)
4007 			continue;
4008 		if (BPF_MODE(insn.code) != BPF_ABS &&
4009 		    BPF_MODE(insn.code) != BPF_IND)
4010 			continue;
4011 		if (BPF_SIZE(insn.code) != BPF_W)
4012 			continue;
4013 
4014 		if (!(next1.code == (BPF_LSH | BPF_K | BPF_ALU64) &&
4015 		      next2.code == (BPF_RSH | BPF_K | BPF_ALU64)) &&
4016 		    !(next1.code == (BPF_RSH | BPF_K | BPF_ALU64) &&
4017 		      next2.code == (BPF_LSH | BPF_K | BPF_ALU64)))
4018 			continue;
4019 
4020 		if (next1.src_reg || next1.dst_reg ||
4021 		    next2.src_reg || next2.dst_reg)
4022 			continue;
4023 
4024 		if (next1.imm != 0x20 || next2.imm != 0x20)
4025 			continue;
4026 
4027 		if (meta2->flags & FLAG_INSN_IS_JUMP_DST ||
4028 		    meta3->flags & FLAG_INSN_IS_JUMP_DST)
4029 			continue;
4030 
4031 		meta2->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
4032 		meta3->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
4033 	}
4034 }
4035 
/* load/store pair that forms memory copy should look like the following:
4037  *
4038  *   ld_width R, [addr_src + offset_src]
4039  *   st_width [addr_dest + offset_dest], R
4040  *
4041  * The destination register of load and source register of store should
4042  * be the same, load and store should also perform at the same width.
4043  * If either of addr_src or addr_dest is stack pointer, we don't do the
4044  * CPP optimization as stack is modelled by registers on NFP.
4045  */
4046 static bool
4047 curr_pair_is_memcpy(struct nfp_insn_meta *ld_meta,
4048 		    struct nfp_insn_meta *st_meta)
4049 {
4050 	struct bpf_insn *ld = &ld_meta->insn;
4051 	struct bpf_insn *st = &st_meta->insn;
4052 
4053 	if (!is_mbpf_load(ld_meta) || !is_mbpf_store(st_meta))
4054 		return false;
4055 
4056 	if (ld_meta->ptr.type != PTR_TO_PACKET &&
4057 	    ld_meta->ptr.type != PTR_TO_MAP_VALUE)
4058 		return false;
4059 
4060 	if (st_meta->ptr.type != PTR_TO_PACKET)
4061 		return false;
4062 
4063 	if (BPF_SIZE(ld->code) != BPF_SIZE(st->code))
4064 		return false;
4065 
4066 	if (ld->dst_reg != st->src_reg)
4067 		return false;
4068 
4069 	/* There is jump to the store insn in this pair. */
4070 	if (st_meta->flags & FLAG_INSN_IS_JUMP_DST)
4071 		return false;
4072 
4073 	return true;
4074 }
4075 
4076 /* Currently, we only support chaining load/store pairs if:
4077  *
4078  *  - Their address base registers are the same.
4079  *  - Their address offsets are in the same order.
4080  *  - They operate at the same memory width.
4081  *  - There is no jump into the middle of them.
4082  */
4083 static bool
4084 curr_pair_chain_with_previous(struct nfp_insn_meta *ld_meta,
4085 			      struct nfp_insn_meta *st_meta,
4086 			      struct bpf_insn *prev_ld,
4087 			      struct bpf_insn *prev_st)
4088 {
4089 	u8 prev_size, curr_size, prev_ld_base, prev_st_base, prev_ld_dst;
4090 	struct bpf_insn *ld = &ld_meta->insn;
4091 	struct bpf_insn *st = &st_meta->insn;
4092 	s16 prev_ld_off, prev_st_off;
4093 
4094 	/* This pair is the start pair. */
4095 	if (!prev_ld)
4096 		return true;
4097 
4098 	prev_size = BPF_LDST_BYTES(prev_ld);
4099 	curr_size = BPF_LDST_BYTES(ld);
4100 	prev_ld_base = prev_ld->src_reg;
4101 	prev_st_base = prev_st->dst_reg;
4102 	prev_ld_dst = prev_ld->dst_reg;
4103 	prev_ld_off = prev_ld->off;
4104 	prev_st_off = prev_st->off;
4105 
4106 	if (ld->dst_reg != prev_ld_dst)
4107 		return false;
4108 
4109 	if (ld->src_reg != prev_ld_base || st->dst_reg != prev_st_base)
4110 		return false;
4111 
4112 	if (curr_size != prev_size)
4113 		return false;
4114 
4115 	/* There is jump to the head of this pair. */
4116 	if (ld_meta->flags & FLAG_INSN_IS_JUMP_DST)
4117 		return false;
4118 
4119 	/* Both in ascending order. */
4120 	if (prev_ld_off + prev_size == ld->off &&
4121 	    prev_st_off + prev_size == st->off)
4122 		return true;
4123 
4124 	/* Both in descending order. */
4125 	if (ld->off + curr_size == prev_ld_off &&
4126 	    st->off + curr_size == prev_st_off)
4127 		return true;
4128 
4129 	return false;
4130 }
4131 
4132 /* Return TRUE if cross memory access happens. Cross memory access means
4133  * store area is overlapping with load area that a later load might load
 * the value from a previous store, in which case we can't treat the sequence
 * as a memory copy.
4136  */
static bool
cross_mem_access(struct bpf_insn *ld, struct nfp_insn_meta *head_ld_meta,
		 struct nfp_insn_meta *head_st_meta)
{
	s16 head_ld_off, head_st_off, ld_off;

	/* Different pointer types do not overlap. */
	if (head_ld_meta->ptr.type != head_st_meta->ptr.type)
		return false;

	/* Same pointer type - different IDs mean a potentially different
	 * variable part of the address, so conservatively report a cross.
	 */
	if (head_ld_meta->ptr.id != head_st_meta->ptr.id)
		return true;

	/* Canonicalize the offsets. Turn all of them against the original
	 * base register.
	 */
	head_ld_off = head_ld_meta->insn.off + head_ld_meta->ptr.var_off.value;
	head_st_off = head_st_meta->insn.off + head_st_meta->ptr.var_off.value;
	ld_off = ld->off + head_ld_meta->ptr.var_off.value;

	/* Ascending order cross - the load range has reached into the
	 * store area.
	 */
	if (ld_off > head_ld_off &&
	    head_ld_off < head_st_off && ld_off >= head_st_off)
		return true;

	/* Descending order cross - mirror image of the above. */
	if (ld_off < head_ld_off &&
	    head_ld_off > head_st_off && ld_off <= head_st_off)
		return true;

	return false;
}
4170 
/* This pass tries to identify the following instruction sequences.
4172  *
4173  *   load R, [regA + offA]
4174  *   store [regB + offB], R
4175  *   load R, [regA + offA + const_imm_A]
4176  *   store [regB + offB + const_imm_A], R
4177  *   load R, [regA + offA + 2 * const_imm_A]
4178  *   store [regB + offB + 2 * const_imm_A], R
4179  *   ...
4180  *
 * The above sequence is typically generated by the compiler when lowering
 * memcpy. NFP prefers using CPP instructions to accelerate it.
4183  */
static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
{
	struct nfp_insn_meta *head_ld_meta = NULL;
	struct nfp_insn_meta *head_st_meta = NULL;
	struct nfp_insn_meta *meta1, *meta2;
	struct bpf_insn *prev_ld = NULL;
	struct bpf_insn *prev_st = NULL;
	u8 count = 0;

	/* State carried across iterations:
	 *   head_ld_meta/head_st_meta - first pair of the current chain,
	 *   prev_ld/prev_st           - insns of the previous pair,
	 *   count                     - pairs accumulated in the chain.
	 */
	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
		struct bpf_insn *ld = &meta1->insn;
		struct bpf_insn *st = &meta2->insn;

		/* Reset record status if any of the following is true:
		 *   - The current insn pair is not load/store.
		 *   - The load/store pair doesn't chain with previous one.
		 *   - The chained load/store pair crossed with previous pair.
		 *   - The chained load/store pair has a total size of memory
		 *     copy beyond 128 bytes which is the maximum length a
		 *     single NFP CPP command can transfer.
		 */
		if (!curr_pair_is_memcpy(meta1, meta2) ||
		    !curr_pair_chain_with_previous(meta1, meta2, prev_ld,
						   prev_st) ||
		    (head_ld_meta && (cross_mem_access(ld, head_ld_meta,
						       head_st_meta) ||
				      head_ld_meta->ldst_gather_len >= 128))) {
			if (!count)
				continue;

			if (count > 1) {
				/* Finalize a multi-pair chain.  For a
				 * descending copy rebase the head pair on the
				 * lowest offsets and record the gather length
				 * as negative.
				 */
				s16 prev_ld_off = prev_ld->off;
				s16 prev_st_off = prev_st->off;
				s16 head_ld_off = head_ld_meta->insn.off;

				if (prev_ld_off < head_ld_off) {
					head_ld_meta->insn.off = prev_ld_off;
					head_st_meta->insn.off = prev_st_off;
					head_ld_meta->ldst_gather_len =
						-head_ld_meta->ldst_gather_len;
				}

				/* The head load now carries the whole copy;
				 * the head store is no longer translated.
				 */
				head_ld_meta->paired_st = &head_st_meta->insn;
				head_st_meta->flags |=
					FLAG_INSN_SKIP_PREC_DEPENDENT;
			} else {
				/* Single pair - translate it normally. */
				head_ld_meta->ldst_gather_len = 0;
			}

			/* If the chain is ended by a load/store pair then this
			 * could serve as the new head of the next chain.
			 */
			if (curr_pair_is_memcpy(meta1, meta2)) {
				head_ld_meta = meta1;
				head_st_meta = meta2;
				head_ld_meta->ldst_gather_len =
					BPF_LDST_BYTES(ld);
				/* Step over the store half of the pair. */
				meta1 = nfp_meta_next(meta1);
				meta2 = nfp_meta_next(meta2);
				prev_ld = ld;
				prev_st = st;
				count = 1;
			} else {
				head_ld_meta = NULL;
				head_st_meta = NULL;
				prev_ld = NULL;
				prev_st = NULL;
				count = 0;
			}

			continue;
		}

		/* Pair extends the current chain (or starts a new one).
		 * Non-head pairs are skipped at translation time.
		 */
		if (!head_ld_meta) {
			head_ld_meta = meta1;
			head_st_meta = meta2;
		} else {
			meta1->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
			meta2->flags |= FLAG_INSN_SKIP_PREC_DEPENDENT;
		}

		head_ld_meta->ldst_gather_len += BPF_LDST_BYTES(ld);
		/* Advance an extra insn so the walk lands on the insn
		 * following this pair's store.
		 */
		meta1 = nfp_meta_next(meta1);
		meta2 = nfp_meta_next(meta2);
		prev_ld = ld;
		prev_st = st;
		count++;
	}
}
4273 
/* Group nearby packet loads so they can be served from one cached read.
 * First pass assigns each load to a cache range anchored at the load
 * which initializes it (pkt_cache.do_init) while growing the covered
 * byte range; second pass propagates the final range to all members.
 */
static void nfp_bpf_opt_pkt_cache(struct nfp_prog *nfp_prog)
{
	struct nfp_insn_meta *meta, *range_node = NULL;
	s16 range_start = 0, range_end = 0;
	bool cache_avail = false;
	struct bpf_insn *insn;
	s32 range_ptr_off = 0;
	u32 range_ptr_id = 0;

	list_for_each_entry(meta, &nfp_prog->insns, l) {
		/* A jump target may be reached with a stale cache. */
		if (meta->flags & FLAG_INSN_IS_JUMP_DST)
			cache_avail = false;

		if (meta->flags & FLAG_INSN_SKIP_MASK)
			continue;

		insn = &meta->insn;

		/* Packet stores, helper calls and classic packet accesses
		 * may invalidate cached packet data.
		 */
		if (is_mbpf_store_pkt(meta) ||
		    insn->code == (BPF_JMP | BPF_CALL) ||
		    is_mbpf_classic_store_pkt(meta) ||
		    is_mbpf_classic_load(meta)) {
			cache_avail = false;
			continue;
		}

		if (!is_mbpf_load(meta))
			continue;

		/* Only plain packet loads are cacheable; CPP-gathered
		 * loads (ldst_gather_len set) are handled separately.
		 */
		if (meta->ptr.type != PTR_TO_PACKET || meta->ldst_gather_len) {
			cache_avail = false;
			continue;
		}

		/* Cache was invalidated - this load must start a range. */
		if (!cache_avail) {
			cache_avail = true;
			if (range_node)
				goto end_current_then_start_new;
			goto start_new;
		}

		/* Check ID to make sure two reads share the same
		 * variable offset against PTR_TO_PACKET, and check OFF
		 * to make sure they also share the same constant
		 * offset.
		 *
		 * OFFs don't really need to be the same, because they
		 * are the constant offsets against PTR_TO_PACKET, so
		 * for different OFFs, we could canonicalize them to
		 * offsets against original packet pointer. We don't
		 * support this.
		 */
		if (meta->ptr.id == range_ptr_id &&
		    meta->ptr.var_off.value == range_ptr_off) {
			s16 new_start = range_start;
			s16 end, off = insn->off;
			s16 new_end = range_end;
			bool changed = false;

			if (off < range_start) {
				new_start = off;
				changed = true;
			}

			end = off + BPF_LDST_BYTES(insn);
			if (end > range_end) {
				new_end = end;
				changed = true;
			}

			if (!changed)
				continue;

			/* A cached read covers at most 64 bytes. */
			if (new_end - new_start <= 64) {
				/* Install new range. */
				range_start = new_start;
				range_end = new_end;
				continue;
			}
		}

end_current_then_start_new:
		/* Close the previous range on its initializing load... */
		range_node->pkt_cache.range_start = range_start;
		range_node->pkt_cache.range_end = range_end;
start_new:
		/* ...and open a new range anchored at this load. */
		range_node = meta;
		range_node->pkt_cache.do_init = true;
		range_ptr_id = range_node->ptr.id;
		range_ptr_off = range_node->ptr.var_off.value;
		range_start = insn->off;
		range_end = insn->off + BPF_LDST_BYTES(insn);
	}

	/* Flush the final open range. */
	if (range_node) {
		range_node->pkt_cache.range_start = range_start;
		range_node->pkt_cache.range_end = range_end;
	}

	/* Second pass: members inherit the range of the most recent
	 * initializing load.
	 */
	list_for_each_entry(meta, &nfp_prog->insns, l) {
		if (meta->flags & FLAG_INSN_SKIP_MASK)
			continue;

		if (is_mbpf_load_pkt(meta) && !meta->ldst_gather_len) {
			if (meta->pkt_cache.do_init) {
				range_start = meta->pkt_cache.range_start;
				range_end = meta->pkt_cache.range_end;
			} else {
				meta->pkt_cache.range_start = range_start;
				meta->pkt_cache.range_end = range_end;
			}
		}
	}
}
4387 
/* Run all optimization passes.  Order is significant: flag init runs
 * first, and the packet cache pass reads ldst_gather_len which the
 * ldst_gather pass sets.  Always returns 0.
 */
static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
{
	nfp_bpf_opt_reg_init(nfp_prog);

	nfp_bpf_opt_neg_add_sub(nfp_prog);
	nfp_bpf_opt_ld_mask(nfp_prog);
	nfp_bpf_opt_ld_shift(nfp_prog);
	nfp_bpf_opt_ldst_gather(nfp_prog);
	nfp_bpf_opt_pkt_cache(nfp_prog);

	return 0;
}
4400 
4401 static int nfp_bpf_replace_map_ptrs(struct nfp_prog *nfp_prog)
4402 {
4403 	struct nfp_insn_meta *meta1, *meta2;
4404 	struct nfp_bpf_map *nfp_map;
4405 	struct bpf_map *map;
4406 	u32 id;
4407 
4408 	nfp_for_each_insn_walk2(nfp_prog, meta1, meta2) {
4409 		if (meta1->flags & FLAG_INSN_SKIP_MASK ||
4410 		    meta2->flags & FLAG_INSN_SKIP_MASK)
4411 			continue;
4412 
4413 		if (meta1->insn.code != (BPF_LD | BPF_IMM | BPF_DW) ||
4414 		    meta1->insn.src_reg != BPF_PSEUDO_MAP_FD)
4415 			continue;
4416 
4417 		map = (void *)(unsigned long)((u32)meta1->insn.imm |
4418 					      (u64)meta2->insn.imm << 32);
4419 		if (bpf_map_offload_neutral(map)) {
4420 			id = map->id;
4421 		} else {
4422 			nfp_map = map_to_offmap(map)->dev_priv;
4423 			id = nfp_map->tid;
4424 		}
4425 
4426 		meta1->insn.imm = id;
4427 		meta2->insn.imm = 0;
4428 	}
4429 
4430 	return 0;
4431 }
4432 
4433 static int nfp_bpf_ustore_calc(u64 *prog, unsigned int len)
4434 {
4435 	__le64 *ustore = (__force __le64 *)prog;
4436 	int i;
4437 
4438 	for (i = 0; i < len; i++) {
4439 		int err;
4440 
4441 		err = nfp_ustore_check_valid_no_ecc(prog[i]);
4442 		if (err)
4443 			return err;
4444 
4445 		ustore[i] = cpu_to_le64(nfp_ustore_calc_ecc_insn(prog[i]));
4446 	}
4447 
4448 	return 0;
4449 }
4450 
4451 static void nfp_bpf_prog_trim(struct nfp_prog *nfp_prog)
4452 {
4453 	void *prog;
4454 
4455 	prog = kvmalloc_array(nfp_prog->prog_len, sizeof(u64), GFP_KERNEL);
4456 	if (!prog)
4457 		return;
4458 
4459 	nfp_prog->__prog_alloc_len = nfp_prog->prog_len * sizeof(u64);
4460 	memcpy(prog, nfp_prog->prog, nfp_prog->__prog_alloc_len);
4461 	kvfree(nfp_prog->prog);
4462 	nfp_prog->prog = prog;
4463 }
4464 
4465 int nfp_bpf_jit(struct nfp_prog *nfp_prog)
4466 {
4467 	int ret;
4468 
4469 	ret = nfp_bpf_replace_map_ptrs(nfp_prog);
4470 	if (ret)
4471 		return ret;
4472 
4473 	ret = nfp_bpf_optimize(nfp_prog);
4474 	if (ret)
4475 		return ret;
4476 
4477 	ret = nfp_translate(nfp_prog);
4478 	if (ret) {
4479 		pr_err("Translation failed with error %d (translated: %u)\n",
4480 		       ret, nfp_prog->n_translated);
4481 		return -EINVAL;
4482 	}
4483 
4484 	nfp_bpf_prog_trim(nfp_prog);
4485 
4486 	return ret;
4487 }
4488 
4489 void nfp_bpf_jit_prepare(struct nfp_prog *nfp_prog)
4490 {
4491 	struct nfp_insn_meta *meta;
4492 
4493 	/* Another pass to record jump information. */
4494 	list_for_each_entry(meta, &nfp_prog->insns, l) {
4495 		struct nfp_insn_meta *dst_meta;
4496 		u64 code = meta->insn.code;
4497 		unsigned int dst_idx;
4498 		bool pseudo_call;
4499 
4500 		if (!is_mbpf_jmp(meta))
4501 			continue;
4502 		if (BPF_OP(code) == BPF_EXIT)
4503 			continue;
4504 		if (is_mbpf_helper_call(meta))
4505 			continue;
4506 
4507 		/* If opcode is BPF_CALL at this point, this can only be a
4508 		 * BPF-to-BPF call (a.k.a pseudo call).
4509 		 */
4510 		pseudo_call = BPF_OP(code) == BPF_CALL;
4511 
4512 		if (pseudo_call)
4513 			dst_idx = meta->n + 1 + meta->insn.imm;
4514 		else
4515 			dst_idx = meta->n + 1 + meta->insn.off;
4516 
4517 		dst_meta = nfp_bpf_goto_meta(nfp_prog, meta, dst_idx);
4518 
4519 		if (pseudo_call)
4520 			dst_meta->flags |= FLAG_INSN_IS_SUBPROG_START;
4521 
4522 		dst_meta->flags |= FLAG_INSN_IS_JUMP_DST;
4523 		meta->jmp_dst = dst_meta;
4524 	}
4525 }
4526 
4527 bool nfp_bpf_supported_opcode(u8 code)
4528 {
4529 	return !!instr_cb[code];
4530 }
4531 
/* Produce a vNIC-specific copy of the program with relocations applied.
 *
 * Each insn word may carry a RELO_* marker in its OP_RELO_TYPE field;
 * branch/immediate targets are adjusted for this vNIC's offsets (@bv),
 * the markers cleared, and ECC bits computed for the ustore.
 *
 * Returns a freshly allocated instruction array owned by the caller,
 * or an ERR_PTR() on failure.
 */
void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
{
	unsigned int i;
	u64 *prog;
	int err;

	prog = kmemdup_array(nfp_prog->prog, nfp_prog->prog_len, sizeof(u64),
			     GFP_KERNEL);
	if (!prog)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < nfp_prog->prog_len; i++) {
		enum nfp_relo_type special;
		u32 val;
		u16 off;

		special = FIELD_GET(OP_RELO_TYPE, prog[i]);
		switch (special) {
		case RELO_NONE:
			/* Nothing to relocate or clear for this insn. */
			continue;
		case RELO_BR_REL:
			/* Branch relative to the program's load address. */
			br_add_offset(&prog[i], bv->start_off);
			break;
		case RELO_BR_GO_OUT:
			br_set_offset(&prog[i],
				      nfp_prog->tgt_out + bv->start_off);
			break;
		case RELO_BR_GO_ABORT:
			br_set_offset(&prog[i],
				      nfp_prog->tgt_abort + bv->start_off);
			break;
		case RELO_BR_GO_CALL_PUSH_REGS:
			/* Target must exist if any subprogram saves regs. */
			if (!nfp_prog->tgt_call_push_regs) {
				pr_err("BUG: failed to detect subprogram registers needs\n");
				err = -EINVAL;
				goto err_free_prog;
			}
			off = nfp_prog->tgt_call_push_regs + bv->start_off;
			br_set_offset(&prog[i], off);
			break;
		case RELO_BR_GO_CALL_POP_REGS:
			if (!nfp_prog->tgt_call_pop_regs) {
				pr_err("BUG: failed to detect subprogram registers needs\n");
				err = -EINVAL;
				goto err_free_prog;
			}
			off = nfp_prog->tgt_call_pop_regs + bv->start_off;
			br_set_offset(&prog[i], off);
			break;
		case RELO_BR_NEXT_PKT:
			br_set_offset(&prog[i], bv->tgt_done);
			break;
		case RELO_BR_HELPER:
			/* Map the BPF helper ID (stored biased by
			 * BR_OFF_RELO) to the firmware entry point.
			 */
			val = br_get_offset(prog[i]);
			val -= BR_OFF_RELO;
			switch (val) {
			case BPF_FUNC_map_lookup_elem:
				val = nfp_prog->bpf->helpers.map_lookup;
				break;
			case BPF_FUNC_map_update_elem:
				val = nfp_prog->bpf->helpers.map_update;
				break;
			case BPF_FUNC_map_delete_elem:
				val = nfp_prog->bpf->helpers.map_delete;
				break;
			case BPF_FUNC_perf_event_output:
				val = nfp_prog->bpf->helpers.perf_event_output;
				break;
			default:
				pr_err("relocation of unknown helper %d\n",
				       val);
				err = -EINVAL;
				goto err_free_prog;
			}
			br_set_offset(&prog[i], val);
			break;
		case RELO_IMMED_REL:
			immed_add_value(&prog[i], bv->start_off);
			break;
		}

		/* Clear the relocation marker bits. */
		prog[i] &= ~OP_RELO_TYPE;
	}

	err = nfp_bpf_ustore_calc(prog, nfp_prog->prog_len);
	if (err)
		goto err_free_prog;

	return prog;

err_free_prog:
	kfree(prog);
	return ERR_PTR(err);
}
4626