1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 /*
3 * Copyright(c) 2016 - 2020 Intel Corporation.
4 */
5
6 #include <linux/hash.h>
7 #include <linux/bitops.h>
8 #include <linux/lockdep.h>
9 #include <linux/vmalloc.h>
10 #include <linux/slab.h>
11 #include <rdma/ib_verbs.h>
12 #include <rdma/ib_hdrs.h>
13 #include <rdma/opa_addr.h>
14 #include <rdma/uverbs_ioctl.h>
15 #include "qp.h"
16 #include "vt.h"
17 #include "trace.h"
18
19 #define RVT_RWQ_COUNT_THRESHOLD 16
20
21 static void rvt_rc_timeout(struct timer_list *t);
22 static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
23 enum ib_qp_type type);
24
/*
 * Convert the AETH RNR timeout code into the number of microseconds.
 *
 * Indexed by the 5-bit RNR NAK timer code carried in the AETH; entry
 * 0 is the special maximum value (655.36 ms).  The comment on each
 * entry gives the encoded delay in milliseconds.
 */
static const u32 ib_rvt_rnr_table[32] = {
	655360, /* 00: 655.36 */
	10,     /* 01: .01 */
	20,     /* 02 .02 */
	30,     /* 03: .03 */
	40,     /* 04: .04 */
	60,     /* 05: .06 */
	80,     /* 06: .08 */
	120,    /* 07: .12 */
	160,    /* 08: .16 */
	240,    /* 09: .24 */
	320,    /* 0A: .32 */
	480,    /* 0B: .48 */
	640,    /* 0C: .64 */
	960,    /* 0D: .96 */
	1280,   /* 0E: 1.28 */
	1920,   /* 0F: 1.92 */
	2560,   /* 10: 2.56 */
	3840,   /* 11: 3.84 */
	5120,   /* 12: 5.12 */
	7680,   /* 13: 7.68 */
	10240,  /* 14: 10.24 */
	15360,  /* 15: 15.36 */
	20480,  /* 16: 20.48 */
	30720,  /* 17: 30.72 */
	40960,  /* 18: 40.96 */
	61440,  /* 19: 61.44 */
	81920,  /* 1A: 81.92 */
	122880, /* 1B: 122.88 */
	163840, /* 1C: 163.84 */
	245760, /* 1D: 245.76 */
	327680, /* 1E: 327.68 */
	491520  /* 1F: 491.52 */
};
62
/*
 * Per-QP-state operation mask, indexed by enum ib_qp_state.  Each
 * entry is an OR of the RVT_*_OK / RVT_FLUSH_* flags describing what
 * post and processing operations are legal in that state.
 *
 * Note that it is OK to post send work requests in the SQE and ERR
 * states; rvt_do_send() will process them and generate error
 * completions as per IB 1.2 C10-96.
 */
const int ib_rvt_state_ops[IB_QPS_ERR + 1] = {
	[IB_QPS_RESET] = 0,
	[IB_QPS_INIT] = RVT_POST_RECV_OK,
	[IB_QPS_RTR] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK,
	[IB_QPS_RTS] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
	    RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK |
	    RVT_PROCESS_NEXT_SEND_OK,
	[IB_QPS_SQD] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
	    RVT_POST_SEND_OK | RVT_PROCESS_SEND_OK,
	[IB_QPS_SQE] = RVT_POST_RECV_OK | RVT_PROCESS_RECV_OK |
	    RVT_POST_SEND_OK | RVT_FLUSH_SEND,
	[IB_QPS_ERR] = RVT_POST_RECV_OK | RVT_FLUSH_RECV |
	    RVT_POST_SEND_OK | RVT_FLUSH_SEND,
};
EXPORT_SYMBOL(ib_rvt_state_ops);
83
/*
 * platform specific: return the last level cache (llc) size, in KiB
 *
 * NOTE(review): reads x86-specific boot_cpu_data, so this compilation
 * unit appears to be x86-only in this configuration.
 */
static int rvt_wss_llc_size(void)
{
	/* assume that the boot CPU value is universal for all CPUs */
	return boot_cpu_data.x86_cache_size;
}
90
/*
 * platform specific: cacheless copy
 *
 * Copy @n bytes from @src to @dst bypassing the CPU cache; used to
 * avoid polluting the cache when the working set is large.
 */
static void cacheless_memcpy(void *dst, void *src, size_t n)
{
	/*
	 * Use the only available X64 cacheless copy.
	 * The extra fault recovery machinery is not invoked.
	 */
	copy_to_nontemporal(dst, src, n);
}
100
rvt_wss_exit(struct rvt_dev_info * rdi)101 void rvt_wss_exit(struct rvt_dev_info *rdi)
102 {
103 struct rvt_wss *wss = rdi->wss;
104
105 if (!wss)
106 return;
107
108 /* coded to handle partially initialized and repeat callers */
109 kfree(wss->entries);
110 wss->entries = NULL;
111 kfree(rdi->wss);
112 rdi->wss = NULL;
113 }
114
/*
 * rvt_wss_init - Init wss data structures
 *
 * Allocates and sizes the working-set-size (WSS) tracking state used
 * by the adaptive SGE copy heuristic.  When the driver did not select
 * RVT_SGE_COPY_ADAPTIVE, rdi->wss is left NULL and no state is kept.
 *
 * Return: 0 on success, -ENOMEM on allocation failure
 */
int rvt_wss_init(struct rvt_dev_info *rdi)
{
	unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;
	unsigned int wss_threshold = rdi->dparms.wss_threshold;
	unsigned int wss_clean_period = rdi->dparms.wss_clean_period;
	long llc_size;
	long llc_bits;
	long table_size;
	long table_bits;
	struct rvt_wss *wss;
	int node = rdi->dparms.node;

	if (sge_copy_mode != RVT_SGE_COPY_ADAPTIVE) {
		rdi->wss = NULL;
		return 0;
	}

	rdi->wss = kzalloc_node(sizeof(*rdi->wss), GFP_KERNEL, node);
	if (!rdi->wss)
		return -ENOMEM;
	wss = rdi->wss;

	/* check for a valid percent range - default to 80 if none or invalid */
	if (wss_threshold < 1 || wss_threshold > 100)
		wss_threshold = 80;

	/* reject a wildly large period */
	if (wss_clean_period > 1000000)
		wss_clean_period = 256;

	/* reject a zero period */
	if (wss_clean_period == 0)
		wss_clean_period = 1;

	/*
	 * Calculate the table size - the next power of 2 larger than the
	 * LLC size.  LLC size is in KiB.
	 */
	llc_size = rvt_wss_llc_size() * 1024;
	table_size = roundup_pow_of_two(llc_size);

	/* one bit per page in rounded up table */
	llc_bits = llc_size / PAGE_SIZE;
	table_bits = table_size / PAGE_SIZE;
	wss->pages_mask = table_bits - 1;
	wss->num_entries = table_bits / BITS_PER_LONG;

	/* threshold is a percentage of the pages that fit in the LLC */
	wss->threshold = (llc_bits * wss_threshold) / 100;
	if (wss->threshold == 0)
		wss->threshold = 1;

	wss->clean_period = wss_clean_period;
	atomic_set(&wss->clean_counter, wss_clean_period);

	wss->entries = kcalloc_node(wss->num_entries, sizeof(*wss->entries),
				    GFP_KERNEL, node);
	if (!wss->entries) {
		/* rvt_wss_exit() handles the partially built state */
		rvt_wss_exit(rdi);
		return -ENOMEM;
	}

	return 0;
}
183
/*
 * Advance the clean counter.  When the clean period has expired,
 * clean an entry.
 *
 * This is implemented in atomics to avoid locking.  Because multiple
 * variables are involved, it can be racy which can lead to slightly
 * inaccurate information.  Since this is only a heuristic, this is
 * OK.  Any innaccuracies will clean themselves out as the counter
 * advances.  That said, it is unlikely the entry clean operation will
 * race - the next possible racer will not start until the next clean
 * period.
 *
 * The clean counter is implemented as a decrement to zero.  When zero
 * is reached an entry is cleaned.
 */
static void wss_advance_clean_counter(struct rvt_wss *wss)
{
	int entry;
	int weight;
	unsigned long bits;

	/* become the cleaner if we decrement the counter to zero */
	if (atomic_dec_and_test(&wss->clean_counter)) {
		/*
		 * Set, not add, the clean period.  This avoids an issue
		 * where the counter could decrement below the clean period.
		 * Doing a set can result in lost decrements, slowing the
		 * clean advance.  Since this a heuristic, this possible
		 * slowdown is OK.
		 *
		 * An alternative is to loop, advancing the counter by a
		 * clean period until the result is > 0.  However, this could
		 * lead to several threads keeping another in the clean loop.
		 * This could be mitigated by limiting the number of times
		 * we stay in the loop.
		 */
		atomic_set(&wss->clean_counter, wss->clean_period);

		/*
		 * Uniquely grab the entry to clean and move to next.
		 * The current entry is always the lower bits of
		 * wss.clean_entry.  The table size, wss.num_entries,
		 * is always a power-of-2.
		 */
		entry = (atomic_inc_return(&wss->clean_entry) - 1)
			& (wss->num_entries - 1);

		/* clear the entry and count the bits */
		bits = xchg(&wss->entries[entry], 0);
		weight = hweight64((u64)bits);
		/* only adjust the contended total count if needed */
		if (weight)
			atomic_sub(weight, &wss->total_count);
	}
}
239
240 /*
241 * Insert the given address into the working set array.
242 */
wss_insert(struct rvt_wss * wss,void * address)243 static void wss_insert(struct rvt_wss *wss, void *address)
244 {
245 u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss->pages_mask;
246 u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
247 u32 nr = page & (BITS_PER_LONG - 1);
248
249 if (!test_and_set_bit(nr, &wss->entries[entry]))
250 atomic_inc(&wss->total_count);
251
252 wss_advance_clean_counter(wss);
253 }
254
255 /*
256 * Is the working set larger than the threshold?
257 */
wss_exceeds_threshold(struct rvt_wss * wss)258 static inline bool wss_exceeds_threshold(struct rvt_wss *wss)
259 {
260 return atomic_read(&wss->total_count) >= wss->threshold;
261 }
262
get_map_page(struct rvt_qpn_table * qpt,struct rvt_qpn_map * map)263 static void get_map_page(struct rvt_qpn_table *qpt,
264 struct rvt_qpn_map *map)
265 {
266 unsigned long page = get_zeroed_page(GFP_KERNEL);
267
268 /*
269 * Free the page if someone raced with us installing it.
270 */
271
272 spin_lock(&qpt->lock);
273 if (map->page)
274 free_page(page);
275 else
276 map->page = (void *)page;
277 spin_unlock(&qpt->lock);
278 }
279
/**
 * init_qpn_table - initialize the QP number table for a device
 * @rdi: rvt dev struct
 * @qpt: the QPN table
 *
 * Sets up the QPN allocator state and pre-marks the driver-reserved
 * QPN range as in-use so verbs allocation never hands those out.
 *
 * Return: 0 on success, -EINVAL for a bad reserved range, -ENOMEM if
 * a bitmap page cannot be allocated (the table may then be partially
 * marked; the caller tears it down).
 */
static int init_qpn_table(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt)
{
	u32 offset, i;
	struct rvt_qpn_map *map;
	int ret = 0;

	/* the reserved range must not be inverted */
	if (!(rdi->dparms.qpn_res_end >= rdi->dparms.qpn_res_start))
		return -EINVAL;

	spin_lock_init(&qpt->lock);

	qpt->last = rdi->dparms.qpn_start;
	/* stride between allocated QPNs, scaled by the QoS shift */
	qpt->incr = rdi->dparms.qpn_inc << rdi->dparms.qos_shift;

	/*
	 * Drivers may want some QPs beyond what we need for verbs let them use
	 * our qpn table.  No need for two.  Lets go ahead and mark the bitmaps
	 * for those.  The reserved range must be *after* the range which verbs
	 * will pick from.
	 */

	/* Figure out number of bit maps needed before reserved range */
	qpt->nmaps = rdi->dparms.qpn_res_start / RVT_BITS_PER_PAGE;

	/* This should always be zero */
	offset = rdi->dparms.qpn_res_start & RVT_BITS_PER_PAGE_MASK;

	/* Starting with the first reserved bit map */
	map = &qpt->map[qpt->nmaps];

	rvt_pr_info(rdi, "Reserving QPNs from 0x%x to 0x%x for non-verbs use\n",
		    rdi->dparms.qpn_res_start, rdi->dparms.qpn_res_end);
	for (i = rdi->dparms.qpn_res_start; i <= rdi->dparms.qpn_res_end; i++) {
		if (!map->page) {
			get_map_page(qpt, map);
			if (!map->page) {
				ret = -ENOMEM;
				break;
			}
		}
		/* mark this QPN as reserved */
		set_bit(offset, map->page);
		offset++;
		if (offset == RVT_BITS_PER_PAGE) {
			/* next page */
			qpt->nmaps++;
			map++;
			offset = 0;
		}
	}
	return ret;
}
336
337 /**
338 * free_qpn_table - free the QP number table for a device
339 * @qpt: the QPN table
340 */
free_qpn_table(struct rvt_qpn_table * qpt)341 static void free_qpn_table(struct rvt_qpn_table *qpt)
342 {
343 int i;
344
345 for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
346 free_page((unsigned long)qpt->map[i].page);
347 }
348
/**
 * rvt_driver_qp_init - Init driver qp resources
 * @rdi: rvt dev strucutre
 *
 * Validates the driver-provided callbacks, then allocates the qp
 * parent object, the RCU-protected qp hash table, and the QPN bitmap
 * table.  On any failure all partially allocated state is unwound.
 *
 * Return: 0 on success, -EINVAL for missing parameters/callbacks,
 * -ENOMEM on allocation failure
 */
int rvt_driver_qp_init(struct rvt_dev_info *rdi)
{
	int i;
	int ret = -ENOMEM;

	if (!rdi->dparms.qp_table_size)
		return -EINVAL;

	/*
	 * If driver is not doing any QP allocation then make sure it is
	 * providing the necessary QP functions.
	 */
	if (!rdi->driver_f.free_all_qps ||
	    !rdi->driver_f.qp_priv_alloc ||
	    !rdi->driver_f.qp_priv_free ||
	    !rdi->driver_f.notify_qp_reset ||
	    !rdi->driver_f.notify_restart_rc)
		return -EINVAL;

	/* allocate parent object */
	rdi->qp_dev = kzalloc_node(sizeof(*rdi->qp_dev), GFP_KERNEL,
				   rdi->dparms.node);
	if (!rdi->qp_dev)
		return -ENOMEM;

	/* allocate hash table */
	rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size;
	rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size);
	rdi->qp_dev->qp_table =
		kmalloc_array_node(rdi->qp_dev->qp_table_size,
				   sizeof(*rdi->qp_dev->qp_table),
				   GFP_KERNEL, rdi->dparms.node);
	if (!rdi->qp_dev->qp_table)
		goto no_qp_table;

	/* kmalloc'ed memory must be made NULL before RCU readers see it */
	for (i = 0; i < rdi->qp_dev->qp_table_size; i++)
		RCU_INIT_POINTER(rdi->qp_dev->qp_table[i], NULL);

	spin_lock_init(&rdi->qp_dev->qpt_lock);

	/* initialize qpn map */
	if (init_qpn_table(rdi, &rdi->qp_dev->qpn_table))
		goto fail_table;

	spin_lock_init(&rdi->n_qps_lock);

	return 0;

fail_table:
	kfree(rdi->qp_dev->qp_table);
	/* init_qpn_table() may have partially marked pages; free them */
	free_qpn_table(&rdi->qp_dev->qpn_table);

no_qp_table:
	kfree(rdi->qp_dev);

	return ret;
}
412
413 /**
414 * rvt_free_qp_cb - callback function to reset a qp
415 * @qp: the qp to reset
416 * @v: a 64-bit value
417 *
418 * This function resets the qp and removes it from the
419 * qp hash table.
420 */
rvt_free_qp_cb(struct rvt_qp * qp,u64 v)421 static void rvt_free_qp_cb(struct rvt_qp *qp, u64 v)
422 {
423 unsigned int *qp_inuse = (unsigned int *)v;
424 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
425
426 /* Reset the qp and remove it from the qp hash list */
427 rvt_reset_qp(rdi, qp, qp->ibqp.qp_type);
428
429 /* Increment the qp_inuse count */
430 (*qp_inuse)++;
431 }
432
433 /**
434 * rvt_free_all_qps - check for QPs still in use
435 * @rdi: rvt device info structure
436 *
437 * There should not be any QPs still in use.
438 * Free memory for table.
439 * Return the number of QPs still in use.
440 */
rvt_free_all_qps(struct rvt_dev_info * rdi)441 static unsigned rvt_free_all_qps(struct rvt_dev_info *rdi)
442 {
443 unsigned int qp_inuse = 0;
444
445 qp_inuse += rvt_mcast_tree_empty(rdi);
446
447 rvt_qp_iter(rdi, (u64)&qp_inuse, rvt_free_qp_cb);
448
449 return qp_inuse;
450 }
451
452 /**
453 * rvt_qp_exit - clean up qps on device exit
454 * @rdi: rvt dev structure
455 *
456 * Check for qp leaks and free resources.
457 */
rvt_qp_exit(struct rvt_dev_info * rdi)458 void rvt_qp_exit(struct rvt_dev_info *rdi)
459 {
460 u32 qps_inuse = rvt_free_all_qps(rdi);
461
462 if (qps_inuse)
463 rvt_pr_err(rdi, "QP memory leak! %u still in use\n",
464 qps_inuse);
465
466 kfree(rdi->qp_dev->qp_table);
467 free_qpn_table(&rdi->qp_dev->qpn_table);
468 kfree(rdi->qp_dev);
469 }
470
mk_qpn(struct rvt_qpn_table * qpt,struct rvt_qpn_map * map,unsigned off)471 static inline unsigned mk_qpn(struct rvt_qpn_table *qpt,
472 struct rvt_qpn_map *map, unsigned off)
473 {
474 return (map - qpt->map) * RVT_BITS_PER_PAGE + off;
475 }
476
/**
 * alloc_qpn - Allocate the next available qpn or zero/one for QP type
 * IB_QPT_SMI/IB_QPT_GSI
 * @rdi: rvt device info structure
 * @qpt: queue pair number table pointer
 * @type: the QP type
 * @port_num: IB port number, 1 based, comes from core
 * @exclude_prefix: prefix of special queue pair number being allocated
 *
 * Return: The queue pair number, or a negative errno (-EINVAL if the
 * per-port SMI/GSI QPN is already taken, -ENOMEM if the bitmap is
 * exhausted or a page cannot be allocated)
 */
static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt,
		     enum ib_qp_type type, u8 port_num, u8 exclude_prefix)
{
	u32 i, offset, max_scan, qpn;
	struct rvt_qpn_map *map;
	int ret;
	/* AIP QPNs live in a smaller sub-range of the QPN space */
	u32 max_qpn = exclude_prefix == RVT_AIP_QP_PREFIX ?
		RVT_AIP_QPN_MAX : RVT_QPN_MAX;

	/* a driver-supplied allocator overrides the generic one */
	if (rdi->driver_f.alloc_qpn)
		return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num);

	if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
		unsigned n;

		/* SMI gets QPN 0, GSI gets QPN 1, tracked per port in flags */
		ret = type == IB_QPT_GSI;
		n = 1 << (ret + 2 * (port_num - 1));
		spin_lock(&qpt->lock);
		if (qpt->flags & n)
			ret = -EINVAL;
		else
			qpt->flags |= n;
		spin_unlock(&qpt->lock);

		return ret;
	}

	/* start scanning one increment past the last allocation */
	qpn = qpt->last + qpt->incr;
	if (qpn >= max_qpn)
		qpn = qpt->incr | ((qpt->last & 1) ^ 1);
	/* offset carries bit 0 */
	offset = qpn & RVT_BITS_PER_PAGE_MASK;
	map = &qpt->map[qpn / RVT_BITS_PER_PAGE];
	max_scan = qpt->nmaps - !offset;
	for (i = 0;;) {
		if (unlikely(!map->page)) {
			get_map_page(qpt, map);
			if (unlikely(!map->page))
				break;
		}
		do {
			if (!test_and_set_bit(offset, map->page)) {
				/* claimed a free QPN */
				qpt->last = qpn;
				ret = qpn;

				return ret;
			}
			offset += qpt->incr;
			/*
			 * This qpn might be bogus if offset >= BITS_PER_PAGE.
			 * That is OK.  It gets re-assigned below
			 */
			qpn = mk_qpn(qpt, map, offset);
		} while (offset < RVT_BITS_PER_PAGE && qpn < RVT_QPN_MAX);
		/*
		 * In order to keep the number of pages allocated to a
		 * minimum, we scan the all existing pages before increasing
		 * the size of the bitmap table.
		 */
		if (++i > max_scan) {
			if (qpt->nmaps == RVT_QPNMAP_ENTRIES)
				break;
			map = &qpt->map[qpt->nmaps++];
			/* start at incr with current bit 0 */
			offset = qpt->incr | (offset & 1);
		} else if (map < &qpt->map[qpt->nmaps]) {
			++map;
			/* start at incr with current bit 0 */
			offset = qpt->incr | (offset & 1);
		} else {
			map = &qpt->map[0];
			/* wrap to first map page, invert bit 0 */
			offset = qpt->incr | ((offset & 1) ^ 1);
		}
		/* there can be no set bits in low-order QoS bits */
		WARN_ON(rdi->dparms.qos_shift > 1 &&
			offset & ((BIT(rdi->dparms.qos_shift - 1) - 1) << 1));
		qpn = mk_qpn(qpt, map, offset);
	}

	return -ENOMEM;
}
570
/**
 * rvt_clear_mr_refs - Drop help mr refs
 * @qp: rvt qp data structure
 * @clr_sends: If shoudl clear send side or not
 *
 * Releases every MR reference held by the qp's receive side, its
 * pending RDMA read SGE, the ack queue, and (when @clr_sends is set)
 * every outstanding send work queue entry.
 */
static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
{
	unsigned n;
	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);

	if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags))
		rvt_put_ss(&qp->s_rdma_read_sge);

	rvt_put_ss(&qp->r_sge);

	if (clr_sends) {
		/* drain the send ring from s_last to s_head */
		while (qp->s_last != qp->s_head) {
			struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);

			rvt_put_qp_swqe(qp, wqe);
			if (++qp->s_last >= qp->s_size)
				qp->s_last = 0;
			smp_wmb(); /* see qp_set_savail */
		}
		if (qp->s_rdma_mr) {
			rvt_put_mr(qp->s_rdma_mr);
			qp->s_rdma_mr = NULL;
		}
	}

	/* s_ack_queue only exists for RC QPs; guard against NULL */
	for (n = 0; qp->s_ack_queue && n < rvt_max_atomic(rdi); n++) {
		struct rvt_ack_entry *e = &qp->s_ack_queue[n];

		if (e->rdma_sge.mr) {
			rvt_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
	}
}
610
611 /**
612 * rvt_swqe_has_lkey - return true if lkey is used by swqe
613 * @wqe: the send wqe
614 * @lkey: the lkey
615 *
616 * Test the swqe for using lkey
617 */
rvt_swqe_has_lkey(struct rvt_swqe * wqe,u32 lkey)618 static bool rvt_swqe_has_lkey(struct rvt_swqe *wqe, u32 lkey)
619 {
620 int i;
621
622 for (i = 0; i < wqe->wr.num_sge; i++) {
623 struct rvt_sge *sge = &wqe->sg_list[i];
624
625 if (rvt_mr_has_lkey(sge->mr, lkey))
626 return true;
627 }
628 return false;
629 }
630
631 /**
632 * rvt_qp_sends_has_lkey - return true is qp sends use lkey
633 * @qp: the rvt_qp
634 * @lkey: the lkey
635 */
rvt_qp_sends_has_lkey(struct rvt_qp * qp,u32 lkey)636 static bool rvt_qp_sends_has_lkey(struct rvt_qp *qp, u32 lkey)
637 {
638 u32 s_last = qp->s_last;
639
640 while (s_last != qp->s_head) {
641 struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, s_last);
642
643 if (rvt_swqe_has_lkey(wqe, lkey))
644 return true;
645
646 if (++s_last >= qp->s_size)
647 s_last = 0;
648 }
649 if (qp->s_rdma_mr)
650 if (rvt_mr_has_lkey(qp->s_rdma_mr, lkey))
651 return true;
652 return false;
653 }
654
655 /**
656 * rvt_qp_acks_has_lkey - return true if acks have lkey
657 * @qp: the qp
658 * @lkey: the lkey
659 */
rvt_qp_acks_has_lkey(struct rvt_qp * qp,u32 lkey)660 static bool rvt_qp_acks_has_lkey(struct rvt_qp *qp, u32 lkey)
661 {
662 int i;
663 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
664
665 for (i = 0; qp->s_ack_queue && i < rvt_max_atomic(rdi); i++) {
666 struct rvt_ack_entry *e = &qp->s_ack_queue[i];
667
668 if (rvt_mr_has_lkey(e->rdma_sge.mr, lkey))
669 return true;
670 }
671 return false;
672 }
673
/**
 * rvt_qp_mr_clean - clean up remote ops for lkey
 * @qp: the qp
 * @lkey: the lkey that is being de-registered
 *
 * This routine checks if the lkey is being used by
 * the qp.
 *
 * If so, the qp is put into an error state to elminate
 * any references from the qp.
 *
 * Lock order here (r_lock irq-disabled, then s_hlock, then s_lock)
 * must match the rest of this file.
 */
void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey)
{
	bool lastwqe = false;

	if (qp->ibqp.qp_type == IB_QPT_SMI ||
	    qp->ibqp.qp_type == IB_QPT_GSI)
		/* avoid special QPs */
		return;
	spin_lock_irq(&qp->r_lock);
	spin_lock(&qp->s_hlock);
	spin_lock(&qp->s_lock);

	/* already in ERR/RESET: nothing references user MRs any more */
	if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
		goto check_lwqe;

	if (rvt_ss_has_lkey(&qp->r_sge, lkey) ||
	    rvt_qp_sends_has_lkey(qp, lkey) ||
	    rvt_qp_acks_has_lkey(qp, lkey))
		lastwqe = rvt_error_qp(qp, IB_WC_LOC_PROT_ERR);
check_lwqe:
	spin_unlock(&qp->s_lock);
	spin_unlock(&qp->s_hlock);
	spin_unlock_irq(&qp->r_lock);
	if (lastwqe) {
		/* deliver the last-WQE-reached event outside the locks */
		struct ib_event ev;

		ev.device = qp->ibqp.device;
		ev.element.qp = &qp->ibqp;
		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
	}
}
717
/**
 * rvt_remove_qp - remove qp form table
 * @rdi: rvt dev struct
 * @qp: qp to remove
 *
 * Remove the QP from the table so it can't be found asynchronously by
 * the receive routine.  The qp may be stored either in the per-port
 * SMI/GSI slots (rvp->qp[0]/qp[1]) or in the RCU hash table; once it
 * is unlinked we wait for all RCU readers before dropping the table's
 * reference.
 */
static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
{
	struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
	u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
	unsigned long flags;
	int removed = 1;

	spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);

	if (rcu_dereference_protected(rvp->qp[0],
			lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
		RCU_INIT_POINTER(rvp->qp[0], NULL);
	} else if (rcu_dereference_protected(rvp->qp[1],
			lockdep_is_held(&rdi->qp_dev->qpt_lock)) == qp) {
		RCU_INIT_POINTER(rvp->qp[1], NULL);
	} else {
		struct rvt_qp *q;
		struct rvt_qp __rcu **qpp;

		removed = 0;
		/* walk the hash bucket's singly linked list looking for qp */
		qpp = &rdi->qp_dev->qp_table[n];
		for (; (q = rcu_dereference_protected(*qpp,
			lockdep_is_held(&rdi->qp_dev->qpt_lock))) != NULL;
			qpp = &q->next) {
			if (q == qp) {
				RCU_INIT_POINTER(*qpp,
				     rcu_dereference_protected(qp->next,
				     lockdep_is_held(&rdi->qp_dev->qpt_lock)));
				removed = 1;
				trace_rvt_qpremove(qp, n);
				break;
			}
		}
	}

	spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
	if (removed) {
		/* wait out RCU readers, then drop the table's reference */
		synchronize_rcu();
		rvt_put_qp(qp);
	}
}
767
768 /**
769 * rvt_alloc_rq - allocate memory for user or kernel buffer
770 * @rq: receive queue data structure
771 * @size: number of request queue entries
772 * @node: The NUMA node
773 * @udata: True if user data is available or not false
774 *
775 * Return: If memory allocation failed, return -ENONEM
776 * This function is used by both shared receive
777 * queues and non-shared receive queues to allocate
778 * memory.
779 */
rvt_alloc_rq(struct rvt_rq * rq,u32 size,int node,struct ib_udata * udata)780 int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
781 struct ib_udata *udata)
782 {
783 if (udata) {
784 rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size);
785 if (!rq->wq)
786 goto bail;
787 /* need kwq with no buffers */
788 rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node);
789 if (!rq->kwq)
790 goto bail;
791 rq->kwq->curr_wq = rq->wq->wq;
792 } else {
793 /* need kwq with buffers */
794 rq->kwq =
795 vzalloc_node(sizeof(struct rvt_krwq) + size, node);
796 if (!rq->kwq)
797 goto bail;
798 rq->kwq->curr_wq = rq->kwq->wq;
799 }
800
801 spin_lock_init(&rq->kwq->p_lock);
802 spin_lock_init(&rq->kwq->c_lock);
803 return 0;
804 bail:
805 rvt_free_rq(rq);
806 return -ENOMEM;
807 }
808
/**
 * rvt_init_qp - initialize the QP state to the reset state
 * @rdi: rvt dev struct
 * @qp: the QP to init or reinit
 * @type: the QP type
 *
 * This function is called from both rvt_create_qp() and
 * rvt_reset_qp().  The difference is that the reset
 * patch the necessary locks to protect against concurent
 * access.
 *
 * Every sequencing field (PSNs, ring indices, ack queue heads) is
 * returned to its post-reset value; only RVT_S_SIGNAL_REQ_WR survives
 * in s_flags.
 */
static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
			enum ib_qp_type type)
{
	qp->remote_qpn = 0;
	qp->qkey = 0;
	qp->qp_access_flags = 0;
	/* keep only the signal-all-WRs policy bit */
	qp->s_flags &= RVT_S_SIGNAL_REQ_WR;
	qp->s_hdrwords = 0;
	qp->s_wqe = NULL;
	qp->s_draining = 0;
	qp->s_next_psn = 0;
	qp->s_last_psn = 0;
	qp->s_sending_psn = 0;
	qp->s_sending_hpsn = 0;
	qp->s_psn = 0;
	qp->r_psn = 0;
	qp->r_msn = 0;
	/* reset opcode state machines to "last packet seen was SEND_LAST" */
	if (type == IB_QPT_RC) {
		qp->s_state = IB_OPCODE_RC_SEND_LAST;
		qp->r_state = IB_OPCODE_RC_SEND_LAST;
	} else {
		qp->s_state = IB_OPCODE_UC_SEND_LAST;
		qp->r_state = IB_OPCODE_UC_SEND_LAST;
	}
	qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
	qp->r_nak_state = 0;
	qp->r_aflags = 0;
	qp->r_flags = 0;
	/* send ring indices all collapse to empty */
	qp->s_head = 0;
	qp->s_tail = 0;
	qp->s_cur = 0;
	qp->s_acked = 0;
	qp->s_last = 0;
	qp->s_ssn = 1;
	qp->s_lsn = 0;
	qp->s_mig_state = IB_MIG_MIGRATED;
	qp->r_head_ack_queue = 0;
	qp->s_tail_ack_queue = 0;
	qp->s_acked_ack_queue = 0;
	qp->s_num_rd_atomic = 0;
	qp->r_sge.num_sge = 0;
	atomic_set(&qp->s_reserved_used, 0);
}
863
/**
 * _rvt_reset_qp - initialize the QP state to the reset state
 * @rdi: rvt dev struct
 * @qp: the QP to reset
 * @type: the QP type
 *
 * r_lock, s_hlock, and s_lock are required to be held by the caller
 *
 * NOTE: the three locks are deliberately dropped mid-function so the
 * driver can quiesce the qp and we can synchronize_rcu() in
 * rvt_remove_qp(); they are reacquired in the same order before
 * returning, so the caller's locking picture is unchanged.
 */
static void _rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
			  enum ib_qp_type type)
	__must_hold(&qp->s_lock)
	__must_hold(&qp->s_hlock)
	__must_hold(&qp->r_lock)
{
	lockdep_assert_held(&qp->r_lock);
	lockdep_assert_held(&qp->s_hlock);
	lockdep_assert_held(&qp->s_lock);
	if (qp->state != IB_QPS_RESET) {
		qp->state = IB_QPS_RESET;

		/* Let drivers flush their waitlist */
		rdi->driver_f.flush_qp_waiters(qp);
		rvt_stop_rc_timers(qp);
		qp->s_flags &= ~(RVT_S_TIMER | RVT_S_ANY_WAIT);
		spin_unlock(&qp->s_lock);
		spin_unlock(&qp->s_hlock);
		spin_unlock_irq(&qp->r_lock);

		/* Stop the send queue and the retry timer */
		rdi->driver_f.stop_send_queue(qp);
		rvt_del_timers_sync(qp);
		/* Wait for things to stop */
		rdi->driver_f.quiesce_qp(qp);

		/* take qp out the hash and wait for it to be unused */
		rvt_remove_qp(rdi, qp);

		/* grab the lock b/c it was locked at call time */
		spin_lock_irq(&qp->r_lock);
		spin_lock(&qp->s_hlock);
		spin_lock(&qp->s_lock);

		rvt_clear_mr_refs(qp, 1);
		/*
		 * Let the driver do any tear down or re-init it needs to for
		 * a qp that has been reset
		 */
		rdi->driver_f.notify_qp_reset(qp);
	}
	rvt_init_qp(rdi, qp, type);
	lockdep_assert_held(&qp->r_lock);
	lockdep_assert_held(&qp->s_hlock);
	lockdep_assert_held(&qp->s_lock);
}
918
/**
 * rvt_reset_qp - initialize the QP state to the reset state
 * @rdi: the device info
 * @qp: the QP to reset
 * @type: the QP type
 *
 * This is the wrapper function to acquire the r_lock, s_hlock, and s_lock
 * before calling _rvt_reset_qp().
 *
 * The acquisition order (r_lock with irqs off, then s_hlock, then
 * s_lock) matches the rest of this file and must not be changed.
 */
static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
			 enum ib_qp_type type)
{
	spin_lock_irq(&qp->r_lock);
	spin_lock(&qp->s_hlock);
	spin_lock(&qp->s_lock);
	_rvt_reset_qp(rdi, qp, type);
	spin_unlock(&qp->s_lock);
	spin_unlock(&qp->s_hlock);
	spin_unlock_irq(&qp->r_lock);
}
939
940 /**
941 * rvt_free_qpn - Free a qpn from the bit map
942 * @qpt: QP table
943 * @qpn: queue pair number to free
944 */
rvt_free_qpn(struct rvt_qpn_table * qpt,u32 qpn)945 static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
946 {
947 struct rvt_qpn_map *map;
948
949 if ((qpn & RVT_AIP_QP_PREFIX_MASK) == RVT_AIP_QP_BASE)
950 qpn &= RVT_AIP_QP_SUFFIX;
951
952 map = qpt->map + (qpn & RVT_QPN_MASK) / RVT_BITS_PER_PAGE;
953 if (map->page)
954 clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page);
955 }
956
957 /**
958 * get_allowed_ops - Given a QP type return the appropriate allowed OP
959 * @type: valid, supported, QP type
960 */
get_allowed_ops(enum ib_qp_type type)961 static u8 get_allowed_ops(enum ib_qp_type type)
962 {
963 return type == IB_QPT_RC ? IB_OPCODE_RC : type == IB_QPT_UC ?
964 IB_OPCODE_UC : IB_OPCODE_UD;
965 }
966
967 /**
968 * free_ud_wq_attr - Clean up AH attribute cache for UD QPs
969 * @qp: Valid QP with allowed_ops set
970 *
971 * The rvt_swqe data structure being used is a union, so this is
972 * only valid for UD QPs.
973 */
free_ud_wq_attr(struct rvt_qp * qp)974 static void free_ud_wq_attr(struct rvt_qp *qp)
975 {
976 struct rvt_swqe *wqe;
977 int i;
978
979 for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
980 wqe = rvt_get_swqe_ptr(qp, i);
981 kfree(wqe->ud_wr.attr);
982 wqe->ud_wr.attr = NULL;
983 }
984 }
985
986 /**
987 * alloc_ud_wq_attr - AH attribute cache for UD QPs
988 * @qp: Valid QP with allowed_ops set
989 * @node: Numa node for allocation
990 *
991 * The rvt_swqe data structure being used is a union, so this is
992 * only valid for UD QPs.
993 */
alloc_ud_wq_attr(struct rvt_qp * qp,int node)994 static int alloc_ud_wq_attr(struct rvt_qp *qp, int node)
995 {
996 struct rvt_swqe *wqe;
997 int i;
998
999 for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
1000 wqe = rvt_get_swqe_ptr(qp, i);
1001 wqe->ud_wr.attr = kzalloc_node(sizeof(*wqe->ud_wr.attr),
1002 GFP_KERNEL, node);
1003 if (!wqe->ud_wr.attr) {
1004 free_ud_wq_attr(qp);
1005 return -ENOMEM;
1006 }
1007 }
1008
1009 return 0;
1010 }
1011
1012 /**
1013 * rvt_create_qp - create a queue pair for a device
1014 * @ibqp: the queue pair
1015 * @init_attr: the attributes of the queue pair
1016 * @udata: user data for libibverbs.so
1017 *
1018 * Queue pair creation is mostly an rvt issue. However, drivers have their own
1019 * unique idea of what queue pair numbers mean. For instance there is a reserved
1020 * range for PSM.
1021 *
1022 * Return: 0 on success, otherwise returns an errno.
1023 *
1024 * Called by the ib_create_qp() core verbs function.
1025 */
int rvt_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
		  struct ib_udata *udata)
{
	struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
	int ret = -ENOMEM;
	struct rvt_swqe *swq = NULL;
	size_t sz;
	size_t sg_list_sz = 0;
	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
	void *priv = NULL;
	size_t sqsize;
	u8 exclude_prefix = 0;

	if (!rdi)
		return -EINVAL;

	if (init_attr->create_flags & ~IB_QP_CREATE_NETDEV_USE)
		return -EOPNOTSUPP;

	/* Send queue caps must fit within the device limits. */
	if (init_attr->cap.max_send_sge > rdi->dparms.props.max_send_sge ||
	    init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr)
		return -EINVAL;

	/* Check receive queue parameters if no SRQ is specified. */
	if (!init_attr->srq) {
		if (init_attr->cap.max_recv_sge >
		    rdi->dparms.props.max_recv_sge ||
		    init_attr->cap.max_recv_wr > rdi->dparms.props.max_qp_wr)
			return -EINVAL;

		/* Reject a completely degenerate QP (all capabilities zero). */
		if (init_attr->cap.max_send_sge +
		    init_attr->cap.max_send_wr +
		    init_attr->cap.max_recv_sge +
		    init_attr->cap.max_recv_wr == 0)
			return -EINVAL;
	}
	/*
	 * One extra slot so head == tail always means "empty", plus the
	 * driver's reserved operation slots.
	 */
	sqsize =
		init_attr->cap.max_send_wr + 1 +
		rdi->dparms.reserved_operations;
	switch (init_attr->qp_type) {
	case IB_QPT_SMI:
	case IB_QPT_GSI:
		if (init_attr->port_num == 0 ||
		    init_attr->port_num > ibqp->device->phys_port_cnt)
			return -EINVAL;
		fallthrough;
	case IB_QPT_UC:
	case IB_QPT_RC:
	case IB_QPT_UD:
		/* Send work queue: sqsize entries, each with its SGE array. */
		sz = struct_size(swq, sg_list, init_attr->cap.max_send_sge);
		swq = vzalloc_node(array_size(sz, sqsize), rdi->dparms.node);
		if (!swq)
			return -ENOMEM;

		/*
		 * r_sg_list holds the receive SGEs beyond the first one;
		 * size it from the SRQ when one is attached, otherwise
		 * from the QP's own receive caps.
		 */
		if (init_attr->srq) {
			struct rvt_srq *srq = ibsrq_to_rvtsrq(init_attr->srq);

			if (srq->rq.max_sge > 1)
				sg_list_sz = sizeof(*qp->r_sg_list) *
					(srq->rq.max_sge - 1);
		} else if (init_attr->cap.max_recv_sge > 1)
			sg_list_sz = sizeof(*qp->r_sg_list) *
				(init_attr->cap.max_recv_sge - 1);
		qp->r_sg_list =
			kzalloc_node(sg_list_sz, GFP_KERNEL, rdi->dparms.node);
		if (!qp->r_sg_list)
			goto bail_qp;
		qp->allowed_ops = get_allowed_ops(init_attr->qp_type);

		RCU_INIT_POINTER(qp->next, NULL);
		/* Only RC QPs need a responder ACK queue for RDMA/atomics. */
		if (init_attr->qp_type == IB_QPT_RC) {
			qp->s_ack_queue =
				kcalloc_node(rvt_max_atomic(rdi),
					     sizeof(*qp->s_ack_queue),
					     GFP_KERNEL,
					     rdi->dparms.node);
			if (!qp->s_ack_queue)
				goto bail_qp;
		}
		/* initialize timers needed for rc qp */
		timer_setup(&qp->s_timer, rvt_rc_timeout, 0);
		hrtimer_setup(&qp->s_rnr_timer, rvt_rc_rnr_retry, CLOCK_MONOTONIC,
			      HRTIMER_MODE_REL);

		/*
		 * Driver needs to set up it's private QP structure and do any
		 * initialization that is needed.
		 */
		priv = rdi->driver_f.qp_priv_alloc(rdi, qp);
		if (IS_ERR(priv)) {
			ret = PTR_ERR(priv);
			goto bail_qp;
		}
		qp->priv = priv;
		/* 4.096us * 2^timeout, converted to jiffies */
		qp->timeout_jiffies =
			usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
					 1000UL);
		if (init_attr->srq) {
			/* SRQ supplies the receive queue; none allocated here. */
			sz = 0;
		} else {
			qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
			qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
			sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
				sizeof(struct rvt_rwqe);
			ret = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz,
					   rdi->dparms.node, udata);
			if (ret)
				goto bail_driver_priv;
		}

		/*
		 * ib_create_qp() will initialize qp->ibqp
		 * except for qp->ibqp.qp_num.
		 */
		spin_lock_init(&qp->r_lock);
		spin_lock_init(&qp->s_hlock);
		spin_lock_init(&qp->s_lock);
		atomic_set(&qp->refcount, 0);
		atomic_set(&qp->local_ops_pending, 0);
		init_waitqueue_head(&qp->wait);
		INIT_LIST_HEAD(&qp->rspwait);
		qp->state = IB_QPS_RESET;
		qp->s_wq = swq;
		qp->s_size = sqsize;
		qp->s_avail = init_attr->cap.max_send_wr;
		qp->s_max_sge = init_attr->cap.max_send_sge;
		if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
			qp->s_flags = RVT_S_SIGNAL_REQ_WR;
		ret = alloc_ud_wq_attr(qp, rdi->dparms.node);
		if (ret)
			goto bail_rq_rvt;

		/* AIP (netdev) QPs are carved out of a reserved QPN prefix. */
		if (init_attr->create_flags & IB_QP_CREATE_NETDEV_USE)
			exclude_prefix = RVT_AIP_QP_PREFIX;

		ret = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
				init_attr->qp_type,
				init_attr->port_num,
				exclude_prefix);
		if (ret < 0)
			goto bail_rq_wq;

		qp->ibqp.qp_num = ret;
		if (init_attr->create_flags & IB_QP_CREATE_NETDEV_USE)
			qp->ibqp.qp_num |= RVT_AIP_QP_BASE;
		qp->port_num = init_attr->port_num;
		rvt_init_qp(rdi, qp, init_attr->qp_type);
		/* Optional driver hook for per-QP private init. */
		if (rdi->driver_f.qp_priv_init) {
			ret = rdi->driver_f.qp_priv_init(rdi, qp, init_attr);
			if (ret)
				goto bail_rq_wq;
		}
		break;

	default:
		/* Don't support raw QPs */
		return -EOPNOTSUPP;
	}

	init_attr->cap.max_inline_data = 0;

	/*
	 * Return the address of the RWQ as the offset to mmap.
	 * See rvt_mmap() for details.
	 */
	if (udata && udata->outlen >= sizeof(__u64)) {
		if (!qp->r_rq.wq) {
			/* No user-mappable RWQ (e.g. SRQ): report offset 0. */
			__u64 offset = 0;

			ret = ib_copy_to_udata(udata, &offset,
					       sizeof(offset));
			if (ret)
				goto bail_qpn;
		} else {
			u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;

			qp->ip = rvt_create_mmap_info(rdi, s, udata,
						      qp->r_rq.wq);
			if (IS_ERR(qp->ip)) {
				ret = PTR_ERR(qp->ip);
				goto bail_qpn;
			}

			ret = ib_copy_to_udata(udata, &qp->ip->offset,
					       sizeof(qp->ip->offset));
			if (ret)
				goto bail_ip;
		}
		qp->pid = current->pid;
	}

	/* Enforce the device-wide QP count limit under n_qps_lock. */
	spin_lock(&rdi->n_qps_lock);
	if (rdi->n_qps_allocated == rdi->dparms.props.max_qp) {
		spin_unlock(&rdi->n_qps_lock);
		ret = -ENOMEM;
		goto bail_ip;
	}

	rdi->n_qps_allocated++;
	/*
	 * Maintain a busy_jiffies variable that will be added to the timeout
	 * period in mod_retry_timer and add_retry_timer. This busy jiffies
	 * is scaled by the number of rc qps created for the device to reduce
	 * the number of timeouts occurring when there is a large number of
	 * qps. busy_jiffies is incremented every rc qp scaling interval.
	 * The scaling interval is selected based on extensive performance
	 * evaluation of targeted workloads.
	 */
	if (init_attr->qp_type == IB_QPT_RC) {
		rdi->n_rc_qps++;
		rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
	}
	spin_unlock(&rdi->n_qps_lock);

	/* Publish the mmap info so rvt_mmap() can find it. */
	if (qp->ip) {
		spin_lock_irq(&rdi->pending_lock);
		list_add(&qp->ip->pending_mmaps, &rdi->pending_mmaps);
		spin_unlock_irq(&rdi->pending_lock);
	}

	return 0;

	/* Unwind in reverse order of acquisition. */
bail_ip:
	if (qp->ip)
		kref_put(&qp->ip->ref, rvt_release_mmap_info);

bail_qpn:
	rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);

bail_rq_wq:
	free_ud_wq_attr(qp);

bail_rq_rvt:
	rvt_free_rq(&qp->r_rq);

bail_driver_priv:
	rdi->driver_f.qp_priv_free(rdi, qp);

bail_qp:
	kfree(qp->s_ack_queue);
	kfree(qp->r_sg_list);
	vfree(swq);
	return ret;
}
1270
1271 /**
1272 * rvt_error_qp - put a QP into the error state
1273 * @qp: the QP to put into the error state
1274 * @err: the receive completion error to signal if a RWQE is active
1275 *
1276 * Flushes both send and receive work queues.
1277 *
1278 * Return: true if last WQE event should be generated.
1279 * The QP r_lock and s_lock should be held and interrupts disabled.
1280 * If we are already in error state, just return.
1281 */
rvt_error_qp(struct rvt_qp * qp,enum ib_wc_status err)1282 int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
1283 {
1284 struct ib_wc wc;
1285 int ret = 0;
1286 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
1287
1288 lockdep_assert_held(&qp->r_lock);
1289 lockdep_assert_held(&qp->s_lock);
1290 if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
1291 goto bail;
1292
1293 qp->state = IB_QPS_ERR;
1294
1295 if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
1296 qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
1297 timer_delete(&qp->s_timer);
1298 }
1299
1300 if (qp->s_flags & RVT_S_ANY_WAIT_SEND)
1301 qp->s_flags &= ~RVT_S_ANY_WAIT_SEND;
1302
1303 rdi->driver_f.notify_error_qp(qp);
1304
1305 /* Schedule the sending tasklet to drain the send work queue. */
1306 if (READ_ONCE(qp->s_last) != qp->s_head)
1307 rdi->driver_f.schedule_send(qp);
1308
1309 rvt_clear_mr_refs(qp, 0);
1310
1311 memset(&wc, 0, sizeof(wc));
1312 wc.qp = &qp->ibqp;
1313 wc.opcode = IB_WC_RECV;
1314
1315 if (test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) {
1316 wc.wr_id = qp->r_wr_id;
1317 wc.status = err;
1318 rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
1319 }
1320 wc.status = IB_WC_WR_FLUSH_ERR;
1321
1322 if (qp->r_rq.kwq) {
1323 u32 head;
1324 u32 tail;
1325 struct rvt_rwq *wq = NULL;
1326 struct rvt_krwq *kwq = NULL;
1327
1328 spin_lock(&qp->r_rq.kwq->c_lock);
1329 /* qp->ip used to validate if there is a user buffer mmaped */
1330 if (qp->ip) {
1331 wq = qp->r_rq.wq;
1332 head = RDMA_READ_UAPI_ATOMIC(wq->head);
1333 tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
1334 } else {
1335 kwq = qp->r_rq.kwq;
1336 head = kwq->head;
1337 tail = kwq->tail;
1338 }
1339 /* sanity check pointers before trusting them */
1340 if (head >= qp->r_rq.size)
1341 head = 0;
1342 if (tail >= qp->r_rq.size)
1343 tail = 0;
1344 while (tail != head) {
1345 wc.wr_id = rvt_get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
1346 if (++tail >= qp->r_rq.size)
1347 tail = 0;
1348 rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
1349 }
1350 if (qp->ip)
1351 RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
1352 else
1353 kwq->tail = tail;
1354 spin_unlock(&qp->r_rq.kwq->c_lock);
1355 } else if (qp->ibqp.event_handler) {
1356 ret = 1;
1357 }
1358
1359 bail:
1360 return ret;
1361 }
1362 EXPORT_SYMBOL(rvt_error_qp);
1363
1364 /*
1365 * Put the QP into the hash table.
1366 * The hash table holds a reference to the QP.
1367 */
rvt_insert_qp(struct rvt_dev_info * rdi,struct rvt_qp * qp)1368 static void rvt_insert_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
1369 {
1370 struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];
1371 unsigned long flags;
1372
1373 rvt_get_qp(qp);
1374 spin_lock_irqsave(&rdi->qp_dev->qpt_lock, flags);
1375
1376 if (qp->ibqp.qp_num <= 1) {
1377 rcu_assign_pointer(rvp->qp[qp->ibqp.qp_num], qp);
1378 } else {
1379 u32 n = hash_32(qp->ibqp.qp_num, rdi->qp_dev->qp_table_bits);
1380
1381 qp->next = rdi->qp_dev->qp_table[n];
1382 rcu_assign_pointer(rdi->qp_dev->qp_table[n], qp);
1383 trace_rvt_qpinsert(qp, n);
1384 }
1385
1386 spin_unlock_irqrestore(&rdi->qp_dev->qpt_lock, flags);
1387 }
1388
1389 /**
1390 * rvt_modify_qp - modify the attributes of a queue pair
1391 * @ibqp: the queue pair who's attributes we're modifying
1392 * @attr: the new attributes
1393 * @attr_mask: the mask of attributes to modify
1394 * @udata: user data for libibverbs.so
1395 *
1396 * Return: 0 on success, otherwise returns an errno.
1397 */
rvt_modify_qp(struct ib_qp * ibqp,struct ib_qp_attr * attr,int attr_mask,struct ib_udata * udata)1398 int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
1399 int attr_mask, struct ib_udata *udata)
1400 {
1401 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1402 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1403 enum ib_qp_state cur_state, new_state;
1404 struct ib_event ev;
1405 int lastwqe = 0;
1406 int mig = 0;
1407 int pmtu = 0; /* for gcc warning only */
1408 int opa_ah;
1409
1410 if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
1411 return -EOPNOTSUPP;
1412
1413 spin_lock_irq(&qp->r_lock);
1414 spin_lock(&qp->s_hlock);
1415 spin_lock(&qp->s_lock);
1416
1417 cur_state = attr_mask & IB_QP_CUR_STATE ?
1418 attr->cur_qp_state : qp->state;
1419 new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
1420 opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num);
1421
1422 if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
1423 attr_mask))
1424 goto inval;
1425
1426 if (rdi->driver_f.check_modify_qp &&
1427 rdi->driver_f.check_modify_qp(qp, attr, attr_mask, udata))
1428 goto inval;
1429
1430 if (attr_mask & IB_QP_AV) {
1431 if (opa_ah) {
1432 if (rdma_ah_get_dlid(&attr->ah_attr) >=
1433 opa_get_mcast_base(OPA_MCAST_NR))
1434 goto inval;
1435 } else {
1436 if (rdma_ah_get_dlid(&attr->ah_attr) >=
1437 be16_to_cpu(IB_MULTICAST_LID_BASE))
1438 goto inval;
1439 }
1440
1441 if (rvt_check_ah(qp->ibqp.device, &attr->ah_attr))
1442 goto inval;
1443 }
1444
1445 if (attr_mask & IB_QP_ALT_PATH) {
1446 if (opa_ah) {
1447 if (rdma_ah_get_dlid(&attr->alt_ah_attr) >=
1448 opa_get_mcast_base(OPA_MCAST_NR))
1449 goto inval;
1450 } else {
1451 if (rdma_ah_get_dlid(&attr->alt_ah_attr) >=
1452 be16_to_cpu(IB_MULTICAST_LID_BASE))
1453 goto inval;
1454 }
1455
1456 if (rvt_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
1457 goto inval;
1458 if (attr->alt_pkey_index >= rvt_get_npkeys(rdi))
1459 goto inval;
1460 }
1461
1462 if (attr_mask & IB_QP_PKEY_INDEX)
1463 if (attr->pkey_index >= rvt_get_npkeys(rdi))
1464 goto inval;
1465
1466 if (attr_mask & IB_QP_MIN_RNR_TIMER)
1467 if (attr->min_rnr_timer > 31)
1468 goto inval;
1469
1470 if (attr_mask & IB_QP_PORT)
1471 if (qp->ibqp.qp_type == IB_QPT_SMI ||
1472 qp->ibqp.qp_type == IB_QPT_GSI ||
1473 attr->port_num == 0 ||
1474 attr->port_num > ibqp->device->phys_port_cnt)
1475 goto inval;
1476
1477 if (attr_mask & IB_QP_DEST_QPN)
1478 if (attr->dest_qp_num > RVT_QPN_MASK)
1479 goto inval;
1480
1481 if (attr_mask & IB_QP_RETRY_CNT)
1482 if (attr->retry_cnt > 7)
1483 goto inval;
1484
1485 if (attr_mask & IB_QP_RNR_RETRY)
1486 if (attr->rnr_retry > 7)
1487 goto inval;
1488
1489 /*
1490 * Don't allow invalid path_mtu values. OK to set greater
1491 * than the active mtu (or even the max_cap, if we have tuned
1492 * that to a small mtu. We'll set qp->path_mtu
1493 * to the lesser of requested attribute mtu and active,
1494 * for packetizing messages.
1495 * Note that the QP port has to be set in INIT and MTU in RTR.
1496 */
1497 if (attr_mask & IB_QP_PATH_MTU) {
1498 pmtu = rdi->driver_f.get_pmtu_from_attr(rdi, qp, attr);
1499 if (pmtu < 0)
1500 goto inval;
1501 }
1502
1503 if (attr_mask & IB_QP_PATH_MIG_STATE) {
1504 if (attr->path_mig_state == IB_MIG_REARM) {
1505 if (qp->s_mig_state == IB_MIG_ARMED)
1506 goto inval;
1507 if (new_state != IB_QPS_RTS)
1508 goto inval;
1509 } else if (attr->path_mig_state == IB_MIG_MIGRATED) {
1510 if (qp->s_mig_state == IB_MIG_REARM)
1511 goto inval;
1512 if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
1513 goto inval;
1514 if (qp->s_mig_state == IB_MIG_ARMED)
1515 mig = 1;
1516 } else {
1517 goto inval;
1518 }
1519 }
1520
1521 if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
1522 if (attr->max_dest_rd_atomic > rdi->dparms.max_rdma_atomic)
1523 goto inval;
1524
1525 switch (new_state) {
1526 case IB_QPS_RESET:
1527 if (qp->state != IB_QPS_RESET)
1528 _rvt_reset_qp(rdi, qp, ibqp->qp_type);
1529 break;
1530
1531 case IB_QPS_RTR:
1532 /* Allow event to re-trigger if QP set to RTR more than once */
1533 qp->r_flags &= ~RVT_R_COMM_EST;
1534 qp->state = new_state;
1535 break;
1536
1537 case IB_QPS_SQD:
1538 qp->s_draining = qp->s_last != qp->s_cur;
1539 qp->state = new_state;
1540 break;
1541
1542 case IB_QPS_SQE:
1543 if (qp->ibqp.qp_type == IB_QPT_RC)
1544 goto inval;
1545 qp->state = new_state;
1546 break;
1547
1548 case IB_QPS_ERR:
1549 lastwqe = rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
1550 break;
1551
1552 default:
1553 qp->state = new_state;
1554 break;
1555 }
1556
1557 if (attr_mask & IB_QP_PKEY_INDEX)
1558 qp->s_pkey_index = attr->pkey_index;
1559
1560 if (attr_mask & IB_QP_PORT)
1561 qp->port_num = attr->port_num;
1562
1563 if (attr_mask & IB_QP_DEST_QPN)
1564 qp->remote_qpn = attr->dest_qp_num;
1565
1566 if (attr_mask & IB_QP_SQ_PSN) {
1567 qp->s_next_psn = attr->sq_psn & rdi->dparms.psn_modify_mask;
1568 qp->s_psn = qp->s_next_psn;
1569 qp->s_sending_psn = qp->s_next_psn;
1570 qp->s_last_psn = qp->s_next_psn - 1;
1571 qp->s_sending_hpsn = qp->s_last_psn;
1572 }
1573
1574 if (attr_mask & IB_QP_RQ_PSN)
1575 qp->r_psn = attr->rq_psn & rdi->dparms.psn_modify_mask;
1576
1577 if (attr_mask & IB_QP_ACCESS_FLAGS)
1578 qp->qp_access_flags = attr->qp_access_flags;
1579
1580 if (attr_mask & IB_QP_AV) {
1581 rdma_replace_ah_attr(&qp->remote_ah_attr, &attr->ah_attr);
1582 qp->s_srate = rdma_ah_get_static_rate(&attr->ah_attr);
1583 qp->srate_mbps = ib_rate_to_mbps(qp->s_srate);
1584 }
1585
1586 if (attr_mask & IB_QP_ALT_PATH) {
1587 rdma_replace_ah_attr(&qp->alt_ah_attr, &attr->alt_ah_attr);
1588 qp->s_alt_pkey_index = attr->alt_pkey_index;
1589 }
1590
1591 if (attr_mask & IB_QP_PATH_MIG_STATE) {
1592 qp->s_mig_state = attr->path_mig_state;
1593 if (mig) {
1594 qp->remote_ah_attr = qp->alt_ah_attr;
1595 qp->port_num = rdma_ah_get_port_num(&qp->alt_ah_attr);
1596 qp->s_pkey_index = qp->s_alt_pkey_index;
1597 }
1598 }
1599
1600 if (attr_mask & IB_QP_PATH_MTU) {
1601 qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu);
1602 qp->log_pmtu = ilog2(qp->pmtu);
1603 }
1604
1605 if (attr_mask & IB_QP_RETRY_CNT) {
1606 qp->s_retry_cnt = attr->retry_cnt;
1607 qp->s_retry = attr->retry_cnt;
1608 }
1609
1610 if (attr_mask & IB_QP_RNR_RETRY) {
1611 qp->s_rnr_retry_cnt = attr->rnr_retry;
1612 qp->s_rnr_retry = attr->rnr_retry;
1613 }
1614
1615 if (attr_mask & IB_QP_MIN_RNR_TIMER)
1616 qp->r_min_rnr_timer = attr->min_rnr_timer;
1617
1618 if (attr_mask & IB_QP_TIMEOUT) {
1619 qp->timeout = attr->timeout;
1620 qp->timeout_jiffies = rvt_timeout_to_jiffies(qp->timeout);
1621 }
1622
1623 if (attr_mask & IB_QP_QKEY)
1624 qp->qkey = attr->qkey;
1625
1626 if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
1627 qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
1628
1629 if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
1630 qp->s_max_rd_atomic = attr->max_rd_atomic;
1631
1632 if (rdi->driver_f.modify_qp)
1633 rdi->driver_f.modify_qp(qp, attr, attr_mask, udata);
1634
1635 spin_unlock(&qp->s_lock);
1636 spin_unlock(&qp->s_hlock);
1637 spin_unlock_irq(&qp->r_lock);
1638
1639 if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
1640 rvt_insert_qp(rdi, qp);
1641
1642 if (lastwqe) {
1643 ev.device = qp->ibqp.device;
1644 ev.element.qp = &qp->ibqp;
1645 ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
1646 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1647 }
1648 if (mig) {
1649 ev.device = qp->ibqp.device;
1650 ev.element.qp = &qp->ibqp;
1651 ev.event = IB_EVENT_PATH_MIG;
1652 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
1653 }
1654 return 0;
1655
1656 inval:
1657 spin_unlock(&qp->s_lock);
1658 spin_unlock(&qp->s_hlock);
1659 spin_unlock_irq(&qp->r_lock);
1660 return -EINVAL;
1661 }
1662
1663 /**
1664 * rvt_destroy_qp - destroy a queue pair
1665 * @ibqp: the queue pair to destroy
1666 * @udata: unused by the driver
1667 *
1668 * Note that this can be called while the QP is actively sending or
1669 * receiving!
1670 *
1671 * Return: 0 on success.
1672 */
rvt_destroy_qp(struct ib_qp * ibqp,struct ib_udata * udata)1673 int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
1674 {
1675 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1676 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1677
1678 rvt_reset_qp(rdi, qp, ibqp->qp_type);
1679
1680 wait_event(qp->wait, !atomic_read(&qp->refcount));
1681 /* qpn is now available for use again */
1682 rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
1683
1684 spin_lock(&rdi->n_qps_lock);
1685 rdi->n_qps_allocated--;
1686 if (qp->ibqp.qp_type == IB_QPT_RC) {
1687 rdi->n_rc_qps--;
1688 rdi->busy_jiffies = rdi->n_rc_qps / RC_QP_SCALING_INTERVAL;
1689 }
1690 spin_unlock(&rdi->n_qps_lock);
1691
1692 if (qp->ip)
1693 kref_put(&qp->ip->ref, rvt_release_mmap_info);
1694 kvfree(qp->r_rq.kwq);
1695 rdi->driver_f.qp_priv_free(rdi, qp);
1696 kfree(qp->s_ack_queue);
1697 kfree(qp->r_sg_list);
1698 rdma_destroy_ah_attr(&qp->remote_ah_attr);
1699 rdma_destroy_ah_attr(&qp->alt_ah_attr);
1700 free_ud_wq_attr(qp);
1701 vfree(qp->s_wq);
1702 return 0;
1703 }
1704
1705 /**
1706 * rvt_query_qp - query an ipbq
1707 * @ibqp: IB qp to query
1708 * @attr: attr struct to fill in
1709 * @attr_mask: attr mask ignored
1710 * @init_attr: struct to fill in
1711 *
1712 * Return: always 0
1713 */
rvt_query_qp(struct ib_qp * ibqp,struct ib_qp_attr * attr,int attr_mask,struct ib_qp_init_attr * init_attr)1714 int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
1715 int attr_mask, struct ib_qp_init_attr *init_attr)
1716 {
1717 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1718 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
1719
1720 attr->qp_state = qp->state;
1721 attr->cur_qp_state = attr->qp_state;
1722 attr->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu);
1723 attr->path_mig_state = qp->s_mig_state;
1724 attr->qkey = qp->qkey;
1725 attr->rq_psn = qp->r_psn & rdi->dparms.psn_mask;
1726 attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask;
1727 attr->dest_qp_num = qp->remote_qpn;
1728 attr->qp_access_flags = qp->qp_access_flags;
1729 attr->cap.max_send_wr = qp->s_size - 1 -
1730 rdi->dparms.reserved_operations;
1731 attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
1732 attr->cap.max_send_sge = qp->s_max_sge;
1733 attr->cap.max_recv_sge = qp->r_rq.max_sge;
1734 attr->cap.max_inline_data = 0;
1735 attr->ah_attr = qp->remote_ah_attr;
1736 attr->alt_ah_attr = qp->alt_ah_attr;
1737 attr->pkey_index = qp->s_pkey_index;
1738 attr->alt_pkey_index = qp->s_alt_pkey_index;
1739 attr->en_sqd_async_notify = 0;
1740 attr->sq_draining = qp->s_draining;
1741 attr->max_rd_atomic = qp->s_max_rd_atomic;
1742 attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
1743 attr->min_rnr_timer = qp->r_min_rnr_timer;
1744 attr->port_num = qp->port_num;
1745 attr->timeout = qp->timeout;
1746 attr->retry_cnt = qp->s_retry_cnt;
1747 attr->rnr_retry = qp->s_rnr_retry_cnt;
1748 attr->alt_port_num =
1749 rdma_ah_get_port_num(&qp->alt_ah_attr);
1750 attr->alt_timeout = qp->alt_timeout;
1751
1752 init_attr->event_handler = qp->ibqp.event_handler;
1753 init_attr->qp_context = qp->ibqp.qp_context;
1754 init_attr->send_cq = qp->ibqp.send_cq;
1755 init_attr->recv_cq = qp->ibqp.recv_cq;
1756 init_attr->srq = qp->ibqp.srq;
1757 init_attr->cap = attr->cap;
1758 if (qp->s_flags & RVT_S_SIGNAL_REQ_WR)
1759 init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
1760 else
1761 init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
1762 init_attr->qp_type = qp->ibqp.qp_type;
1763 init_attr->port_num = qp->port_num;
1764 return 0;
1765 }
1766
1767 /**
1768 * rvt_post_recv - post a receive on a QP
1769 * @ibqp: the QP to post the receive on
1770 * @wr: the WR to post
1771 * @bad_wr: the first bad WR is put here
1772 *
1773 * This may be called from interrupt context.
1774 *
1775 * Return: 0 on success otherwise errno
1776 */
rvt_post_recv(struct ib_qp * ibqp,const struct ib_recv_wr * wr,const struct ib_recv_wr ** bad_wr)1777 int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
1778 const struct ib_recv_wr **bad_wr)
1779 {
1780 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
1781 struct rvt_krwq *wq = qp->r_rq.kwq;
1782 unsigned long flags;
1783 int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
1784 !qp->ibqp.srq;
1785
1786 /* Check that state is OK to post receive. */
1787 if (!(ib_rvt_state_ops[qp->state] & RVT_POST_RECV_OK) || !wq) {
1788 *bad_wr = wr;
1789 return -EINVAL;
1790 }
1791
1792 for (; wr; wr = wr->next) {
1793 struct rvt_rwqe *wqe;
1794 u32 next;
1795 int i;
1796
1797 if ((unsigned)wr->num_sge > qp->r_rq.max_sge) {
1798 *bad_wr = wr;
1799 return -EINVAL;
1800 }
1801
1802 spin_lock_irqsave(&qp->r_rq.kwq->p_lock, flags);
1803 next = wq->head + 1;
1804 if (next >= qp->r_rq.size)
1805 next = 0;
1806 if (next == READ_ONCE(wq->tail)) {
1807 spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
1808 *bad_wr = wr;
1809 return -ENOMEM;
1810 }
1811 if (unlikely(qp_err_flush)) {
1812 struct ib_wc wc;
1813
1814 memset(&wc, 0, sizeof(wc));
1815 wc.qp = &qp->ibqp;
1816 wc.opcode = IB_WC_RECV;
1817 wc.wr_id = wr->wr_id;
1818 wc.status = IB_WC_WR_FLUSH_ERR;
1819 rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
1820 } else {
1821 wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
1822 wqe->wr_id = wr->wr_id;
1823 wqe->num_sge = wr->num_sge;
1824 for (i = 0; i < wr->num_sge; i++) {
1825 wqe->sg_list[i].addr = wr->sg_list[i].addr;
1826 wqe->sg_list[i].length = wr->sg_list[i].length;
1827 wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
1828 }
1829 /*
1830 * Make sure queue entry is written
1831 * before the head index.
1832 */
1833 smp_store_release(&wq->head, next);
1834 }
1835 spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
1836 }
1837 return 0;
1838 }
1839
1840 /**
1841 * rvt_qp_valid_operation - validate post send wr request
1842 * @qp: the qp
1843 * @post_parms: the post send table for the driver
1844 * @wr: the work request
1845 *
1846 * The routine validates the operation based on the
1847 * validation table an returns the length of the operation
1848 * which can extend beyond the ib_send_bw. Operation
1849 * dependent flags key atomic operation validation.
1850 *
1851 * There is an exception for UD qps that validates the pd and
1852 * overrides the length to include the additional UD specific
1853 * length.
1854 *
1855 * Returns a negative error or the length of the work request
1856 * for building the swqe.
1857 */
rvt_qp_valid_operation(struct rvt_qp * qp,const struct rvt_operation_params * post_parms,const struct ib_send_wr * wr)1858 static inline int rvt_qp_valid_operation(
1859 struct rvt_qp *qp,
1860 const struct rvt_operation_params *post_parms,
1861 const struct ib_send_wr *wr)
1862 {
1863 int len;
1864
1865 if (wr->opcode >= RVT_OPERATION_MAX || !post_parms[wr->opcode].length)
1866 return -EINVAL;
1867 if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type)))
1868 return -EINVAL;
1869 if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) &&
1870 ibpd_to_rvtpd(qp->ibqp.pd)->user)
1871 return -EINVAL;
1872 if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE &&
1873 (wr->num_sge == 0 ||
1874 wr->sg_list[0].length < sizeof(u64) ||
1875 wr->sg_list[0].addr & (sizeof(u64) - 1)))
1876 return -EINVAL;
1877 if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC &&
1878 !qp->s_max_rd_atomic)
1879 return -EINVAL;
1880 len = post_parms[wr->opcode].length;
1881 /* UD specific */
1882 if (qp->ibqp.qp_type != IB_QPT_UC &&
1883 qp->ibqp.qp_type != IB_QPT_RC) {
1884 if (qp->ibqp.pd != ud_wr(wr)->ah->pd)
1885 return -EINVAL;
1886 len = sizeof(struct ib_ud_wr);
1887 }
1888 return len;
1889 }
1890
1891 /**
1892 * rvt_qp_is_avail - determine queue capacity
1893 * @qp: the qp
1894 * @rdi: the rdmavt device
1895 * @reserved_op: is reserved operation
1896 *
1897 * This assumes the s_hlock is held but the s_last
1898 * qp variable is uncontrolled.
1899 *
1900 * For non reserved operations, the qp->s_avail
1901 * may be changed.
1902 *
1903 * The return value is zero or a -ENOMEM.
1904 */
rvt_qp_is_avail(struct rvt_qp * qp,struct rvt_dev_info * rdi,bool reserved_op)1905 static inline int rvt_qp_is_avail(
1906 struct rvt_qp *qp,
1907 struct rvt_dev_info *rdi,
1908 bool reserved_op)
1909 {
1910 u32 slast;
1911 u32 avail;
1912 u32 reserved_used;
1913
1914 /* see rvt_qp_wqe_unreserve() */
1915 smp_mb__before_atomic();
1916 if (unlikely(reserved_op)) {
1917 /* see rvt_qp_wqe_unreserve() */
1918 reserved_used = atomic_read(&qp->s_reserved_used);
1919 if (reserved_used >= rdi->dparms.reserved_operations)
1920 return -ENOMEM;
1921 return 0;
1922 }
1923 /* non-reserved operations */
1924 if (likely(qp->s_avail))
1925 return 0;
1926 /* See rvt_qp_complete_swqe() */
1927 slast = smp_load_acquire(&qp->s_last);
1928 if (qp->s_head >= slast)
1929 avail = qp->s_size - (qp->s_head - slast);
1930 else
1931 avail = slast - qp->s_head;
1932
1933 reserved_used = atomic_read(&qp->s_reserved_used);
1934 avail = avail - 1 -
1935 (rdi->dparms.reserved_operations - reserved_used);
1936 /* insure we don't assign a negative s_avail */
1937 if ((s32)avail <= 0)
1938 return -ENOMEM;
1939 qp->s_avail = avail;
1940 if (WARN_ON(qp->s_avail >
1941 (qp->s_size - 1 - rdi->dparms.reserved_operations)))
1942 rvt_pr_err(rdi,
1943 "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u",
1944 qp->ibqp.qp_num, qp->s_size, qp->s_avail,
1945 qp->s_head, qp->s_tail, qp->s_cur,
1946 qp->s_acked, qp->s_last);
1947 return 0;
1948 }
1949
1950 /**
1951 * rvt_post_one_wr - post one RC, UC, or UD send work request
1952 * @qp: the QP to post on
1953 * @wr: the work request to send
1954 * @call_send: kick the send engine into gear
1955 */
rvt_post_one_wr(struct rvt_qp * qp,const struct ib_send_wr * wr,bool * call_send)1956 static int rvt_post_one_wr(struct rvt_qp *qp,
1957 const struct ib_send_wr *wr,
1958 bool *call_send)
1959 {
1960 struct rvt_swqe *wqe;
1961 u32 next;
1962 int i;
1963 int j;
1964 int acc;
1965 struct rvt_lkey_table *rkt;
1966 struct rvt_pd *pd;
1967 struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
1968 u8 log_pmtu;
1969 int ret;
1970 size_t cplen;
1971 bool reserved_op;
1972 int local_ops_delayed = 0;
1973
1974 BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE));
1975
1976 /* IB spec says that num_sge == 0 is OK. */
1977 if (unlikely(wr->num_sge > qp->s_max_sge))
1978 return -EINVAL;
1979
1980 ret = rvt_qp_valid_operation(qp, rdi->post_parms, wr);
1981 if (ret < 0)
1982 return ret;
1983 cplen = ret;
1984
1985 /*
1986 * Local operations include fast register and local invalidate.
1987 * Fast register needs to be processed immediately because the
1988 * registered lkey may be used by following work requests and the
1989 * lkey needs to be valid at the time those requests are posted.
1990 * Local invalidate can be processed immediately if fencing is
1991 * not required and no previous local invalidate ops are pending.
1992 * Signaled local operations that have been processed immediately
1993 * need to have requests with "completion only" flags set posted
1994 * to the send queue in order to generate completions.
1995 */
1996 if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) {
1997 switch (wr->opcode) {
1998 case IB_WR_REG_MR:
1999 ret = rvt_fast_reg_mr(qp,
2000 reg_wr(wr)->mr,
2001 reg_wr(wr)->key,
2002 reg_wr(wr)->access);
2003 if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
2004 return ret;
2005 break;
2006 case IB_WR_LOCAL_INV:
2007 if ((wr->send_flags & IB_SEND_FENCE) ||
2008 atomic_read(&qp->local_ops_pending)) {
2009 local_ops_delayed = 1;
2010 } else {
2011 ret = rvt_invalidate_rkey(
2012 qp, wr->ex.invalidate_rkey);
2013 if (ret || !(wr->send_flags & IB_SEND_SIGNALED))
2014 return ret;
2015 }
2016 break;
2017 default:
2018 return -EINVAL;
2019 }
2020 }
2021
2022 reserved_op = rdi->post_parms[wr->opcode].flags &
2023 RVT_OPERATION_USE_RESERVE;
2024 /* check for avail */
2025 ret = rvt_qp_is_avail(qp, rdi, reserved_op);
2026 if (ret)
2027 return ret;
2028 next = qp->s_head + 1;
2029 if (next >= qp->s_size)
2030 next = 0;
2031
2032 rkt = &rdi->lkey_table;
2033 pd = ibpd_to_rvtpd(qp->ibqp.pd);
2034 wqe = rvt_get_swqe_ptr(qp, qp->s_head);
2035
2036 /* cplen has length from above */
2037 memcpy(&wqe->ud_wr, wr, cplen);
2038
2039 wqe->length = 0;
2040 j = 0;
2041 if (wr->num_sge) {
2042 struct rvt_sge *last_sge = NULL;
2043
2044 acc = wr->opcode >= IB_WR_RDMA_READ ?
2045 IB_ACCESS_LOCAL_WRITE : 0;
2046 for (i = 0; i < wr->num_sge; i++) {
2047 u32 length = wr->sg_list[i].length;
2048
2049 if (length == 0)
2050 continue;
2051 ret = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], last_sge,
2052 &wr->sg_list[i], acc);
2053 if (unlikely(ret < 0))
2054 goto bail_inval_free;
2055 wqe->length += length;
2056 if (ret)
2057 last_sge = &wqe->sg_list[j];
2058 j += ret;
2059 }
2060 wqe->wr.num_sge = j;
2061 }
2062
2063 /*
2064 * Calculate and set SWQE PSN values prior to handing it off
2065 * to the driver's check routine. This give the driver the
2066 * opportunity to adjust PSN values based on internal checks.
2067 */
2068 log_pmtu = qp->log_pmtu;
2069 if (qp->allowed_ops == IB_OPCODE_UD) {
2070 struct rvt_ah *ah = rvt_get_swqe_ah(wqe);
2071
2072 log_pmtu = ah->log_pmtu;
2073 rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr);
2074 }
2075
2076 if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
2077 if (local_ops_delayed)
2078 atomic_inc(&qp->local_ops_pending);
2079 else
2080 wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY;
2081 wqe->ssn = 0;
2082 wqe->psn = 0;
2083 wqe->lpsn = 0;
2084 } else {
2085 wqe->ssn = qp->s_ssn++;
2086 wqe->psn = qp->s_next_psn;
2087 wqe->lpsn = wqe->psn +
2088 (wqe->length ?
2089 ((wqe->length - 1) >> log_pmtu) :
2090 0);
2091 }
2092
2093 /* general part of wqe valid - allow for driver checks */
2094 if (rdi->driver_f.setup_wqe) {
2095 ret = rdi->driver_f.setup_wqe(qp, wqe, call_send);
2096 if (ret < 0)
2097 goto bail_inval_free_ref;
2098 }
2099
2100 if (!(rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL))
2101 qp->s_next_psn = wqe->lpsn + 1;
2102
2103 if (unlikely(reserved_op)) {
2104 wqe->wr.send_flags |= RVT_SEND_RESERVE_USED;
2105 rvt_qp_wqe_reserve(qp, wqe);
2106 } else {
2107 wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED;
2108 qp->s_avail--;
2109 }
2110 trace_rvt_post_one_wr(qp, wqe, wr->num_sge);
2111 smp_wmb(); /* see request builders */
2112 qp->s_head = next;
2113
2114 return 0;
2115
2116 bail_inval_free_ref:
2117 if (qp->allowed_ops == IB_OPCODE_UD)
2118 rdma_destroy_ah_attr(wqe->ud_wr.attr);
2119 bail_inval_free:
2120 /* release mr holds */
2121 while (j) {
2122 struct rvt_sge *sge = &wqe->sg_list[--j];
2123
2124 rvt_put_mr(sge->mr);
2125 }
2126 return ret;
2127 }
2128
2129 /**
2130 * rvt_post_send - post a send on a QP
2131 * @ibqp: the QP to post the send on
2132 * @wr: the list of work requests to post
2133 * @bad_wr: the first bad WR is put here
2134 *
2135 * This may be called from interrupt context.
2136 *
2137 * Return: 0 on success else errno
2138 */
rvt_post_send(struct ib_qp * ibqp,const struct ib_send_wr * wr,const struct ib_send_wr ** bad_wr)2139 int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
2140 const struct ib_send_wr **bad_wr)
2141 {
2142 struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
2143 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
2144 unsigned long flags = 0;
2145 bool call_send;
2146 unsigned nreq = 0;
2147 int err = 0;
2148
2149 spin_lock_irqsave(&qp->s_hlock, flags);
2150
2151 /*
2152 * Ensure QP state is such that we can send. If not bail out early,
2153 * there is no need to do this every time we post a send.
2154 */
2155 if (unlikely(!(ib_rvt_state_ops[qp->state] & RVT_POST_SEND_OK))) {
2156 spin_unlock_irqrestore(&qp->s_hlock, flags);
2157 return -EINVAL;
2158 }
2159
2160 /*
2161 * If the send queue is empty, and we only have a single WR then just go
2162 * ahead and kick the send engine into gear. Otherwise we will always
2163 * just schedule the send to happen later.
2164 */
2165 call_send = qp->s_head == READ_ONCE(qp->s_last) && !wr->next;
2166
2167 for (; wr; wr = wr->next) {
2168 err = rvt_post_one_wr(qp, wr, &call_send);
2169 if (unlikely(err)) {
2170 *bad_wr = wr;
2171 goto bail;
2172 }
2173 nreq++;
2174 }
2175 bail:
2176 spin_unlock_irqrestore(&qp->s_hlock, flags);
2177 if (nreq) {
2178 /*
2179 * Only call do_send if there is exactly one packet, and the
2180 * driver said it was ok.
2181 */
2182 if (nreq == 1 && call_send)
2183 rdi->driver_f.do_send(qp);
2184 else
2185 rdi->driver_f.schedule_send_no_lock(qp);
2186 }
2187 return err;
2188 }
2189
/**
 * rvt_post_srq_recv - post a receive on a shared receive queue
 * @ibsrq: the SRQ to post the receive on
 * @wr: the list of work requests to post
 * @bad_wr: A pointer to the first WR to cause a problem is put here
 *
 * This may be called from interrupt context.
 *
 * Return: 0 on success else errno
 */
int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
		      const struct ib_recv_wr **bad_wr)
{
	struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
	struct rvt_krwq *wq;
	unsigned long flags;

	for (; wr; wr = wr->next) {
		struct rvt_rwqe *wqe;
		u32 next;
		int i;

		/* Reject WRs carrying more SGEs than the SRQ supports. */
		if ((unsigned)wr->num_sge > srq->rq.max_sge) {
			*bad_wr = wr;
			return -EINVAL;
		}

		spin_lock_irqsave(&srq->rq.kwq->p_lock, flags);
		wq = srq->rq.kwq;
		next = wq->head + 1;
		if (next >= srq->rq.size)
			next = 0;
		/* Queue is full when head would catch up with tail. */
		if (next == READ_ONCE(wq->tail)) {
			spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
			*bad_wr = wr;
			return -ENOMEM;
		}

		/* Copy the WR into the slot at the current head index. */
		wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
		wqe->wr_id = wr->wr_id;
		wqe->num_sge = wr->num_sge;
		for (i = 0; i < wr->num_sge; i++) {
			wqe->sg_list[i].addr = wr->sg_list[i].addr;
			wqe->sg_list[i].length = wr->sg_list[i].length;
			wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
		}
		/* Make sure queue entry is written before the head index. */
		smp_store_release(&wq->head, next);
		spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
	}
	return 0;
}
2242
/*
 * rvt used the internal kernel struct as part of its ABI, for now make sure
 * the kernel struct does not change layout. FIXME: rvt should never cast the
 * user struct to a kernel struct.
 */
static struct ib_sge *rvt_cast_sge(struct rvt_wqe_sge *sge)
{
	/* Compile-time proof that the two layouts still line up. */
	BUILD_BUG_ON(offsetof(struct ib_sge, addr) !=
		     offsetof(struct rvt_wqe_sge, addr));
	BUILD_BUG_ON(offsetof(struct ib_sge, length) !=
		     offsetof(struct rvt_wqe_sge, length));
	BUILD_BUG_ON(offsetof(struct ib_sge, lkey) !=
		     offsetof(struct rvt_wqe_sge, lkey));
	return (struct ib_sge *)sge;
}
2258
/*
 * Validate a RWQE and fill in the SGE state.
 * Return 1 if OK, 0 on LKEY failure (a LOC_PROT_ERR completion is
 * generated in that case).
 */
static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
{
	int i, j, ret;
	struct ib_wc wc;
	struct rvt_lkey_table *rkt;
	struct rvt_pd *pd;
	struct rvt_sge_state *ss;
	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);

	rkt = &rdi->lkey_table;
	/* Receives via an SRQ use the SRQ's PD, not the QP's. */
	pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
	ss = &qp->r_sge;
	ss->sg_list = qp->r_sg_list;
	qp->r_len = 0;
	/* i walks the RWQE SGEs, j counts the non-empty ones kept. */
	for (i = j = 0; i < wqe->num_sge; i++) {
		if (wqe->sg_list[i].length == 0)
			continue;
		/* Check LKEY */
		ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
				  NULL, rvt_cast_sge(&wqe->sg_list[i]),
				  IB_ACCESS_LOCAL_WRITE);
		if (unlikely(ret <= 0))
			goto bad_lkey;
		qp->r_len += wqe->sg_list[i].length;
		j++;
	}
	ss->num_sge = j;
	ss->total_len = qp->r_len;
	return 1;

bad_lkey:
	/*
	 * Drop the MR references taken so far: the first validated SGE
	 * lives in ss->sge, the rest in ss->sg_list[].
	 */
	while (j) {
		struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;

		rvt_put_mr(sge->mr);
	}
	ss->num_sge = 0;
	memset(&wc, 0, sizeof(wc));
	wc.wr_id = wqe->wr_id;
	wc.status = IB_WC_LOC_PROT_ERR;
	wc.opcode = IB_WC_RECV;
	wc.qp = &qp->ibqp;
	/* Signal solicited completion event. */
	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
	return 0;
}
2309
2310 /**
2311 * get_rvt_head - get head indices of the circular buffer
2312 * @rq: data structure for request queue entry
2313 * @ip: the QP
2314 *
2315 * Return - head index value
2316 */
get_rvt_head(struct rvt_rq * rq,void * ip)2317 static inline u32 get_rvt_head(struct rvt_rq *rq, void *ip)
2318 {
2319 u32 head;
2320
2321 if (ip)
2322 head = RDMA_READ_UAPI_ATOMIC(rq->wq->head);
2323 else
2324 head = rq->kwq->head;
2325
2326 return head;
2327 }
2328
/**
 * rvt_get_rwqe - copy the next RWQE into the QP's RWQE
 * @qp: the QP
 * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
 *
 * Return -1 if there is a local error, 0 if no RWQE is available,
 * otherwise return 1.
 *
 * Can be called from interrupt level.
 */
int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
{
	unsigned long flags;
	struct rvt_rq *rq;
	struct rvt_krwq *kwq = NULL;
	struct rvt_rwq *wq;
	struct rvt_srq *srq;
	struct rvt_rwqe *wqe;
	void (*handler)(struct ib_event *, void *);
	u32 tail;
	u32 head;
	int ret;
	void *ip = NULL;

	/* Consume from the SRQ when attached, else the QP's own RQ. */
	if (qp->ibqp.srq) {
		srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
		handler = srq->ibsrq.event_handler;
		rq = &srq->rq;
		ip = srq->ip;
	} else {
		srq = NULL;
		handler = NULL;
		rq = &qp->r_rq;
		ip = qp->ip;
	}

	spin_lock_irqsave(&rq->kwq->c_lock, flags);
	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
		ret = 0;
		goto unlock;
	}
	kwq = rq->kwq;
	/* ip != NULL means user-mapped queue: tail lives in shared memory. */
	if (ip) {
		wq = rq->wq;
		tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
	} else {
		tail = kwq->tail;
	}

	/* Validate tail before using it since it is user writable. */
	if (tail >= rq->size)
		tail = 0;

	/* Refresh the cached available count only when it runs low. */
	if (kwq->count < RVT_RWQ_COUNT_THRESHOLD) {
		head = get_rvt_head(rq, ip);
		kwq->count = rvt_get_rq_count(rq, head, tail);
	}
	if (unlikely(kwq->count == 0)) {
		ret = 0;
		goto unlock;
	}
	/* Make sure entry is read after the count is read. */
	smp_rmb();
	wqe = rvt_get_rwqe_ptr(rq, tail);
	/*
	 * Even though we update the tail index in memory, the verbs
	 * consumer is not supposed to post more entries until a
	 * completion is generated.
	 */
	if (++tail >= rq->size)
		tail = 0;
	if (ip)
		RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
	else
		kwq->tail = tail;
	if (!wr_id_only && !init_sge(qp, wqe)) {
		ret = -1;
		goto unlock;
	}
	qp->r_wr_id = wqe->wr_id;

	kwq->count--;
	ret = 1;
	set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
	if (handler) {
		/*
		 * Validate head pointer value and compute
		 * the number of remaining WQEs.
		 */
		if (kwq->count < srq->limit) {
			kwq->count =
				rvt_get_rq_count(rq,
						 get_rvt_head(rq, ip), tail);
			if (kwq->count < srq->limit) {
				struct ib_event ev;

				/* One-shot: limit is disarmed before firing. */
				srq->limit = 0;
				spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
				ev.device = qp->ibqp.device;
				ev.element.srq = qp->ibqp.srq;
				ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
				handler(&ev, srq->ibsrq.srq_context);
				goto bail;
			}
		}
	}
unlock:
	spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
bail:
	return ret;
}
EXPORT_SYMBOL(rvt_get_rwqe);
2441
2442 /**
2443 * rvt_comm_est - handle trap with QP established
2444 * @qp: the QP
2445 */
rvt_comm_est(struct rvt_qp * qp)2446 void rvt_comm_est(struct rvt_qp *qp)
2447 {
2448 qp->r_flags |= RVT_R_COMM_EST;
2449 if (qp->ibqp.event_handler) {
2450 struct ib_event ev;
2451
2452 ev.device = qp->ibqp.device;
2453 ev.element.qp = &qp->ibqp;
2454 ev.event = IB_EVENT_COMM_EST;
2455 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
2456 }
2457 }
2458 EXPORT_SYMBOL(rvt_comm_est);
2459
rvt_rc_error(struct rvt_qp * qp,enum ib_wc_status err)2460 void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
2461 {
2462 unsigned long flags;
2463 int lastwqe;
2464
2465 spin_lock_irqsave(&qp->s_lock, flags);
2466 lastwqe = rvt_error_qp(qp, err);
2467 spin_unlock_irqrestore(&qp->s_lock, flags);
2468
2469 if (lastwqe) {
2470 struct ib_event ev;
2471
2472 ev.device = qp->ibqp.device;
2473 ev.element.qp = &qp->ibqp;
2474 ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
2475 qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
2476 }
2477 }
2478 EXPORT_SYMBOL(rvt_rc_error);
2479
2480 /*
2481 * rvt_rnr_tbl_to_usec - return index into ib_rvt_rnr_table
2482 * @index - the index
2483 * return usec from an index into ib_rvt_rnr_table
2484 */
rvt_rnr_tbl_to_usec(u32 index)2485 unsigned long rvt_rnr_tbl_to_usec(u32 index)
2486 {
2487 return ib_rvt_rnr_table[(index & IB_AETH_CREDIT_MASK)];
2488 }
2489 EXPORT_SYMBOL(rvt_rnr_tbl_to_usec);
2490
/* Extract the RNR timeout code from an AETH and convert to microseconds. */
static inline unsigned long rvt_aeth_to_usec(u32 aeth)
{
	u32 code = (aeth >> IB_AETH_CREDIT_SHIFT) & IB_AETH_CREDIT_MASK;

	return ib_rvt_rnr_table[code];
}
2496
2497 /*
2498 * rvt_add_retry_timer_ext - add/start a retry timer
2499 * @qp - the QP
2500 * @shift - timeout shift to wait for multiple packets
2501 * add a retry timer on the QP
2502 */
rvt_add_retry_timer_ext(struct rvt_qp * qp,u8 shift)2503 void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift)
2504 {
2505 struct ib_qp *ibqp = &qp->ibqp;
2506 struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
2507
2508 lockdep_assert_held(&qp->s_lock);
2509 qp->s_flags |= RVT_S_TIMER;
2510 /* 4.096 usec. * (1 << qp->timeout) */
2511 qp->s_timer.expires = jiffies + rdi->busy_jiffies +
2512 (qp->timeout_jiffies << shift);
2513 add_timer(&qp->s_timer);
2514 }
2515 EXPORT_SYMBOL(rvt_add_retry_timer_ext);
2516
2517 /**
2518 * rvt_add_rnr_timer - add/start an rnr timer on the QP
2519 * @qp: the QP
2520 * @aeth: aeth of RNR timeout, simulated aeth for loopback
2521 */
rvt_add_rnr_timer(struct rvt_qp * qp,u32 aeth)2522 void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth)
2523 {
2524 u32 to;
2525
2526 lockdep_assert_held(&qp->s_lock);
2527 qp->s_flags |= RVT_S_WAIT_RNR;
2528 to = rvt_aeth_to_usec(aeth);
2529 trace_rvt_rnrnak_add(qp, to);
2530 hrtimer_start(&qp->s_rnr_timer,
2531 ns_to_ktime(1000 * to), HRTIMER_MODE_REL_PINNED);
2532 }
2533 EXPORT_SYMBOL(rvt_add_rnr_timer);
2534
2535 /**
2536 * rvt_stop_rc_timers - stop all timers
2537 * @qp: the QP
2538 * stop any pending timers
2539 */
rvt_stop_rc_timers(struct rvt_qp * qp)2540 void rvt_stop_rc_timers(struct rvt_qp *qp)
2541 {
2542 lockdep_assert_held(&qp->s_lock);
2543 /* Remove QP from all timers */
2544 if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
2545 qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
2546 timer_delete(&qp->s_timer);
2547 hrtimer_try_to_cancel(&qp->s_rnr_timer);
2548 }
2549 }
2550 EXPORT_SYMBOL(rvt_stop_rc_timers);
2551
2552 /**
2553 * rvt_stop_rnr_timer - stop an rnr timer
2554 * @qp: the QP
2555 *
2556 * stop an rnr timer and return if the timer
2557 * had been pending.
2558 */
rvt_stop_rnr_timer(struct rvt_qp * qp)2559 static void rvt_stop_rnr_timer(struct rvt_qp *qp)
2560 {
2561 lockdep_assert_held(&qp->s_lock);
2562 /* Remove QP from rnr timer */
2563 if (qp->s_flags & RVT_S_WAIT_RNR) {
2564 qp->s_flags &= ~RVT_S_WAIT_RNR;
2565 trace_rvt_rnrnak_stop(qp, 0);
2566 }
2567 }
2568
2569 /**
2570 * rvt_del_timers_sync - wait for any timeout routines to exit
2571 * @qp: the QP
2572 */
rvt_del_timers_sync(struct rvt_qp * qp)2573 void rvt_del_timers_sync(struct rvt_qp *qp)
2574 {
2575 timer_delete_sync(&qp->s_timer);
2576 hrtimer_cancel(&qp->s_rnr_timer);
2577 }
2578 EXPORT_SYMBOL(rvt_del_timers_sync);
2579
/*
 * This is called from s_timer for missing responses.
 */
static void rvt_rc_timeout(struct timer_list *t)
{
	struct rvt_qp *qp = timer_container_of(qp, t, s_timer);
	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
	unsigned long flags;

	/* Locks are taken in r_lock -> s_lock order. */
	spin_lock_irqsave(&qp->r_lock, flags);
	spin_lock(&qp->s_lock);
	/* RVT_S_TIMER may have been cleared by a racing cancel. */
	if (qp->s_flags & RVT_S_TIMER) {
		struct rvt_ibport *rvp = rdi->ports[qp->port_num - 1];

		qp->s_flags &= ~RVT_S_TIMER;
		rvp->n_rc_timeouts++;
		timer_delete(&qp->s_timer);
		trace_rvt_rc_timeout(qp, qp->s_last_psn + 1);
		/* Let the driver restart the request at the first unacked PSN. */
		if (rdi->driver_f.notify_restart_rc)
			rdi->driver_f.notify_restart_rc(qp,
							qp->s_last_psn + 1,
							1);
		rdi->driver_f.schedule_send(qp);
	}
	spin_unlock(&qp->s_lock);
	spin_unlock_irqrestore(&qp->r_lock, flags);
}
2607
/*
 * This is called from s_timer for RNR timeouts.
 */
enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t)
{
	struct rvt_qp *qp = container_of(t, struct rvt_qp, s_rnr_timer);
	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
	unsigned long flags;

	spin_lock_irqsave(&qp->s_lock, flags);
	/* Clear RVT_S_WAIT_RNR so the send engine can make progress. */
	rvt_stop_rnr_timer(qp);
	trace_rvt_rnrnak_timeout(qp, 0);
	rdi->driver_f.schedule_send(qp);
	spin_unlock_irqrestore(&qp->s_lock, flags);
	/* One-shot timer: re-armed via rvt_add_rnr_timer() when needed. */
	return HRTIMER_NORESTART;
}
EXPORT_SYMBOL(rvt_rc_rnr_retry);
2625
2626 /**
2627 * rvt_qp_iter_init - initial for QP iteration
2628 * @rdi: rvt devinfo
2629 * @v: u64 value
2630 * @cb: user-defined callback
2631 *
2632 * This returns an iterator suitable for iterating QPs
2633 * in the system.
2634 *
2635 * The @cb is a user-defined callback and @v is a 64-bit
2636 * value passed to and relevant for processing in the
2637 * @cb. An example use case would be to alter QP processing
2638 * based on criteria not part of the rvt_qp.
2639 *
2640 * Use cases that require memory allocation to succeed
2641 * must preallocate appropriately.
2642 *
2643 * Return: a pointer to an rvt_qp_iter or NULL
2644 */
rvt_qp_iter_init(struct rvt_dev_info * rdi,u64 v,void (* cb)(struct rvt_qp * qp,u64 v))2645 struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi,
2646 u64 v,
2647 void (*cb)(struct rvt_qp *qp, u64 v))
2648 {
2649 struct rvt_qp_iter *i;
2650
2651 i = kzalloc_obj(*i);
2652 if (!i)
2653 return NULL;
2654
2655 i->rdi = rdi;
2656 /* number of special QPs (SMI/GSI) for device */
2657 i->specials = rdi->ibdev.phys_port_cnt * 2;
2658 i->v = v;
2659 i->cb = cb;
2660
2661 return i;
2662 }
2663 EXPORT_SYMBOL(rvt_qp_iter_init);
2664
/**
 * rvt_qp_iter_next - return the next QP in iter
 * @iter: the iterator
 *
 * Fine grained QP iterator suitable for use
 * with debugfs seq_file mechanisms.
 *
 * Updates iter->qp with the current QP when the return
 * value is 0.
 *
 * Return: 0 - iter->qp is valid 1 - no more QPs
 */
int rvt_qp_iter_next(struct rvt_qp_iter *iter)
	__must_hold(RCU)
{
	int n = iter->n;
	int ret = 1;
	struct rvt_qp *pqp = iter->qp;
	struct rvt_qp *qp;
	struct rvt_dev_info *rdi = iter->rdi;

	/*
	 * The approach is to consider the special qps
	 * as additional table entries before the
	 * real hash table. Since the qp code sets
	 * the qp->next hash link to NULL, this works just fine.
	 *
	 * iter->specials is 2 * # ports
	 *
	 * n = 0..iter->specials is the special qp indices
	 *
	 * n = iter->specials..rdi->qp_dev->qp_table_size+iter->specials are
	 * the potential hash bucket entries
	 *
	 */
	for (; n < rdi->qp_dev->qp_table_size + iter->specials; n++) {
		if (pqp) {
			/* Continue down the current hash chain. */
			qp = rcu_dereference(pqp->next);
		} else {
			if (n < iter->specials) {
				struct rvt_ibport *rvp;
				int pidx;

				/* n & 1 selects one of the port's two special QPs */
				pidx = n % rdi->ibdev.phys_port_cnt;
				rvp = rdi->ports[pidx];
				qp = rcu_dereference(rvp->qp[n & 1]);
			} else {
				/* Start of the next hash bucket. */
				qp = rcu_dereference(
					rdi->qp_dev->qp_table[
						(n - iter->specials)]);
			}
		}
		pqp = qp;
		if (qp) {
			iter->qp = qp;
			iter->n = n;
			return 0;
		}
	}
	return ret;
}
EXPORT_SYMBOL(rvt_qp_iter_next);
2727
/**
 * rvt_qp_iter - iterate all QPs
 * @rdi: rvt devinfo
 * @v: a 64-bit value
 * @cb: a callback
 *
 * This provides a way for iterating all QPs.
 *
 * The @cb is a user-defined callback and @v is a 64-bit
 * value passed to and relevant for processing in the
 * cb. An example use case would be to alter QP processing
 * based on criteria not part of the rvt_qp.
 *
 * The code has an internal iterator to simplify
 * non seq_file use cases.
 */
void rvt_qp_iter(struct rvt_dev_info *rdi,
		 u64 v,
		 void (*cb)(struct rvt_qp *qp, u64 v))
{
	int ret;
	struct rvt_qp_iter i = {
		.rdi = rdi,
		.specials = rdi->ibdev.phys_port_cnt * 2,
		.v = v,
		.cb = cb
	};

	rcu_read_lock();
	do {
		ret = rvt_qp_iter_next(&i);
		if (!ret) {
			/*
			 * Hold a reference so the QP stays valid while the
			 * callback runs outside the RCU read-side section.
			 */
			rvt_get_qp(i.qp);
			rcu_read_unlock();
			i.cb(i.qp, i.v);
			rcu_read_lock();
			rvt_put_qp(i.qp);
		}
	} while (!ret);
	rcu_read_unlock();
}
EXPORT_SYMBOL(rvt_qp_iter);
2770
/*
 * Generate a send completion for @wqe with @status and retire it.
 * This should be called with s_lock and r_lock held.
 */
void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
		       enum ib_wc_status status)
{
	u32 old_last, last;
	struct rvt_dev_info *rdi;

	/* Nothing to do unless sends can be processed or flushed. */
	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
		return;
	rdi = ib_to_rvt(qp->ibqp.device);

	old_last = qp->s_last;
	trace_rvt_qp_send_completion(qp, wqe, old_last);
	last = rvt_qp_complete_swqe(qp, wqe, rdi->wc_opcode[wqe->wr.opcode],
				    status);
	/* Advance any send-queue indices that pointed at the retired WQE. */
	if (qp->s_acked == old_last)
		qp->s_acked = last;
	if (qp->s_cur == old_last)
		qp->s_cur = last;
	if (qp->s_tail == old_last)
		qp->s_tail = last;
	/* An SQD QP is done draining once s_cur catches up. */
	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
		qp->s_draining = 0;
}
EXPORT_SYMBOL(rvt_send_complete);
2798
/**
 * rvt_copy_sge - copy data to SGE memory
 * @qp: associated QP
 * @ss: the SGE state
 * @data: the data to copy
 * @length: the length of the data
 * @release: boolean to release MR
 * @copy_last: do a separate copy of the last 8 bytes
 */
void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss,
		  void *data, u32 length,
		  bool release, bool copy_last)
{
	struct rvt_sge *sge = &ss->sge;
	int i;
	bool in_last = false;
	bool cacheless_copy = false;
	struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
	struct rvt_wss *wss = rdi->wss;
	unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode;

	/* Decide between cached and cacheless copy per the copy mode. */
	if (sge_copy_mode == RVT_SGE_COPY_CACHELESS) {
		cacheless_copy = length >= PAGE_SIZE;
	} else if (sge_copy_mode == RVT_SGE_COPY_ADAPTIVE) {
		if (length >= PAGE_SIZE) {
			/*
			 * NOTE: this *assumes*:
			 * o The first vaddr is the dest.
			 * o If multiple pages, then vaddr is sequential.
			 */
			wss_insert(wss, sge->vaddr);
			if (length >= (2 * PAGE_SIZE))
				wss_insert(wss, (sge->vaddr + PAGE_SIZE));

			cacheless_copy = wss_exceeds_threshold(wss);
		} else {
			wss_advance_clean_counter(wss);
		}
	}

	if (copy_last) {
		if (length > 8) {
			/* Split the final 8 bytes off for the second pass. */
			length -= 8;
		} else {
			/* Short transfer: the whole copy is the last part. */
			copy_last = false;
			in_last = true;
		}
	}

again:
	while (length) {
		u32 len = rvt_get_sge_length(sge, length);

		WARN_ON_ONCE(len == 0);
		if (unlikely(in_last)) {
			/* enforce byte transfer ordering */
			for (i = 0; i < len; i++)
				((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
		} else if (cacheless_copy) {
			cacheless_memcpy(sge->vaddr, data, len);
		} else {
			memcpy(sge->vaddr, data, len);
		}
		rvt_update_sge(ss, len, release);
		data += len;
		length -= len;
	}

	if (copy_last) {
		/* Second pass: byte-ordered copy of the final 8 bytes. */
		copy_last = false;
		in_last = true;
		length = 8;
		goto again;
	}
}
EXPORT_SYMBOL(rvt_copy_sge);
2875
loopback_qp_drop(struct rvt_ibport * rvp,struct rvt_qp * sqp)2876 static enum ib_wc_status loopback_qp_drop(struct rvt_ibport *rvp,
2877 struct rvt_qp *sqp)
2878 {
2879 rvp->n_pkt_drops++;
2880 /*
2881 * For RC, the requester would timeout and retry so
2882 * shortcut the timeouts and just signal too many retries.
2883 */
2884 return sqp->ibqp.qp_type == IB_QPT_RC ?
2885 IB_WC_RETRY_EXC_ERR : IB_WC_SUCCESS;
2886 }
2887
/**
 * rvt_ruc_loopback - handle UC and RC loopback requests
 * @sqp: the sending QP
 *
 * This is called from rvt_do_send() to forward a WQE addressed to the same HFI
 * Note that although we are single threaded due to the send engine, we still
 * have to protect against post_send(). We don't have to worry about
 * receive interrupts since this is a connected protocol and all packets
 * will pass through here.
 */
void rvt_ruc_loopback(struct rvt_qp *sqp)
{
	struct rvt_ibport *rvp = NULL;
	struct rvt_dev_info *rdi = ib_to_rvt(sqp->ibqp.device);
	struct rvt_qp *qp;
	struct rvt_swqe *wqe;
	struct rvt_sge *sge;
	unsigned long flags;
	struct ib_wc wc;
	u64 sdata;
	atomic64_t *maddr;
	enum ib_wc_status send_status;
	bool release;
	int ret;
	bool copy_last = false;
	int local_ops = 0;

	/* RCU protects the QPN lookup of the destination QP. */
	rcu_read_lock();
	rvp = rdi->ports[sqp->port_num - 1];

	/*
	 * Note that we check the responder QP state after
	 * checking the requester's state.
	 */

	qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), rvp,
			    sqp->remote_qpn);

	spin_lock_irqsave(&sqp->s_lock, flags);

	/* Return if we are already busy processing a work request. */
	if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) ||
	    !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
		goto unlock;

	/* RVT_S_BUSY single-threads the remainder of this function. */
	sqp->s_flags |= RVT_S_BUSY;

again:
	/* Send queue empty? */
	if (sqp->s_last == READ_ONCE(sqp->s_head))
		goto clr_busy;
	wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);

	/* Return if it is not OK to start a new work request. */
	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
		if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
			goto clr_busy;
		/* We are in the error state, flush the work request. */
		send_status = IB_WC_WR_FLUSH_ERR;
		goto flush_send;
	}

	/*
	 * We can rely on the entry not changing without the s_lock
	 * being held until we update s_last.
	 * We increment s_cur to indicate s_last is in progress.
	 */
	if (sqp->s_last == sqp->s_cur) {
		if (++sqp->s_cur >= sqp->s_size)
			sqp->s_cur = 0;
	}
	spin_unlock_irqrestore(&sqp->s_lock, flags);

	/* No destination QP found: drop the packet. */
	if (!qp) {
		send_status = loopback_qp_drop(rvp, sqp);
		goto serr_no_r_lock;
	}
	spin_lock_irqsave(&qp->r_lock, flags);
	/* Drop if the responder cannot receive or the QP types mismatch. */
	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
	    qp->ibqp.qp_type != sqp->ibqp.qp_type) {
		send_status = loopback_qp_drop(rvp, sqp);
		goto serr;
	}

	memset(&wc, 0, sizeof(wc));
	send_status = IB_WC_SUCCESS;

	release = true;
	sqp->s_sge.sge = wqe->sg_list[0];
	sqp->s_sge.sg_list = wqe->sg_list + 1;
	sqp->s_sge.num_sge = wqe->wr.num_sge;
	sqp->s_len = wqe->length;
	/* Dispatch on the opcode to set up the data transfer. */
	switch (wqe->wr.opcode) {
	case IB_WR_REG_MR:
		/* Local operation: nothing to transfer. */
		goto send_comp;

	case IB_WR_LOCAL_INV:
		if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
			if (rvt_invalidate_rkey(sqp,
						wqe->wr.ex.invalidate_rkey))
				send_status = IB_WC_LOC_PROT_ERR;
			local_ops = 1;
		}
		goto send_comp;

	case IB_WR_SEND_WITH_INV:
	case IB_WR_SEND_WITH_IMM:
	case IB_WR_SEND:
		/* Consume a receive WQE on the responder side. */
		ret = rvt_get_rwqe(qp, false);
		if (ret < 0)
			goto op_err;
		if (!ret)
			goto rnr_nak;
		if (wqe->length > qp->r_len)
			goto inv_err;
		switch (wqe->wr.opcode) {
		case IB_WR_SEND_WITH_INV:
			if (!rvt_invalidate_rkey(qp,
						 wqe->wr.ex.invalidate_rkey)) {
				wc.wc_flags = IB_WC_WITH_INVALIDATE;
				wc.ex.invalidate_rkey =
					wqe->wr.ex.invalidate_rkey;
			}
			break;
		case IB_WR_SEND_WITH_IMM:
			wc.wc_flags = IB_WC_WITH_IMM;
			wc.ex.imm_data = wqe->wr.ex.imm_data;
			break;
		default:
			break;
		}
		break;

	case IB_WR_RDMA_WRITE_WITH_IMM:
		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
			goto inv_err;
		wc.wc_flags = IB_WC_WITH_IMM;
		wc.ex.imm_data = wqe->wr.ex.imm_data;
		ret = rvt_get_rwqe(qp, true);
		if (ret < 0)
			goto op_err;
		if (!ret)
			goto rnr_nak;
		/* skip copy_last set and qp_access_flags recheck */
		goto do_write;
	case IB_WR_RDMA_WRITE:
		copy_last = rvt_is_user_qp(qp);
		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
			goto inv_err;
do_write:
		if (wqe->length == 0)
			break;
		/* Validate the remote buffer via the rkey. */
		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
					  wqe->rdma_wr.remote_addr,
					  wqe->rdma_wr.rkey,
					  IB_ACCESS_REMOTE_WRITE)))
			goto acc_err;
		qp->r_sge.sg_list = NULL;
		qp->r_sge.num_sge = 1;
		qp->r_sge.total_len = wqe->length;
		break;

	case IB_WR_RDMA_READ:
		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
			goto inv_err;
		if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
					  wqe->rdma_wr.remote_addr,
					  wqe->rdma_wr.rkey,
					  IB_ACCESS_REMOTE_READ)))
			goto acc_err;
		/* Source is remote memory; don't release the MR after copy. */
		release = false;
		sqp->s_sge.sg_list = NULL;
		sqp->s_sge.num_sge = 1;
		qp->r_sge.sge = wqe->sg_list[0];
		qp->r_sge.sg_list = wqe->sg_list + 1;
		qp->r_sge.num_sge = wqe->wr.num_sge;
		qp->r_sge.total_len = wqe->length;
		break;

	case IB_WR_ATOMIC_CMP_AND_SWP:
	case IB_WR_ATOMIC_FETCH_AND_ADD:
		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
			goto inv_err;
		/* Atomics require a naturally aligned 64-bit target. */
		if (unlikely(wqe->atomic_wr.remote_addr & (sizeof(u64) - 1)))
			goto inv_err;
		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
					  wqe->atomic_wr.remote_addr,
					  wqe->atomic_wr.rkey,
					  IB_ACCESS_REMOTE_ATOMIC)))
			goto acc_err;
		/* Perform atomic OP and save result. */
		maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
		sdata = wqe->atomic_wr.compare_add;
		*(u64 *)sqp->s_sge.sge.vaddr =
			(wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
			(u64)atomic64_add_return(sdata, maddr) - sdata :
			(u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
				      sdata, wqe->atomic_wr.swap);
		rvt_put_mr(qp->r_sge.sge.mr);
		qp->r_sge.num_sge = 0;
		goto send_comp;

	default:
		send_status = IB_WC_LOC_QP_OP_ERR;
		goto serr;
	}

	/* Copy the payload from the sender's SGEs to the receiver's. */
	sge = &sqp->s_sge.sge;
	while (sqp->s_len) {
		u32 len = rvt_get_sge_length(sge, sqp->s_len);

		WARN_ON_ONCE(len == 0);
		rvt_copy_sge(qp, &qp->r_sge, sge->vaddr,
			     len, release, copy_last);
		rvt_update_sge(&sqp->s_sge, len, !release);
		sqp->s_len -= len;
	}
	if (release)
		rvt_put_ss(&qp->r_sge);

	/* Only generate a receive completion if a RWQE was consumed. */
	if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
		goto send_comp;

	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
	else
		wc.opcode = IB_WC_RECV;
	wc.wr_id = qp->r_wr_id;
	wc.status = IB_WC_SUCCESS;
	wc.byte_len = wqe->length;
	wc.qp = &qp->ibqp;
	wc.src_qp = qp->remote_qpn;
	wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
	wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
	wc.port_num = 1;
	/* Signal completion event if the solicited bit is set. */
	rvt_recv_cq(qp, &wc, wqe->wr.send_flags & IB_SEND_SOLICITED);

send_comp:
	spin_unlock_irqrestore(&qp->r_lock, flags);
	spin_lock_irqsave(&sqp->s_lock, flags);
	rvp->n_loop_pkts++;
flush_send:
	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
	spin_lock(&sqp->r_lock);
	rvt_send_complete(sqp, wqe, send_status);
	spin_unlock(&sqp->r_lock);
	if (local_ops) {
		atomic_dec(&sqp->local_ops_pending);
		local_ops = 0;
	}
	/* Loop to process the next WQE on the send queue. */
	goto again;

rnr_nak:
	/* Handle RNR NAK */
	if (qp->ibqp.qp_type == IB_QPT_UC)
		goto send_comp;
	rvp->n_rnr_naks++;
	/*
	 * Note: we don't need the s_lock held since the BUSY flag
	 * makes this single threaded.
	 */
	if (sqp->s_rnr_retry == 0) {
		send_status = IB_WC_RNR_RETRY_EXC_ERR;
		goto serr;
	}
	/* s_rnr_retry_cnt == 7 means retry forever; don't decrement. */
	if (sqp->s_rnr_retry_cnt < 7)
		sqp->s_rnr_retry--;
	spin_unlock_irqrestore(&qp->r_lock, flags);
	spin_lock_irqsave(&sqp->s_lock, flags);
	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
		goto clr_busy;
	rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer <<
				IB_AETH_CREDIT_SHIFT);
	goto clr_busy;

op_err:
	send_status = IB_WC_REM_OP_ERR;
	wc.status = IB_WC_LOC_QP_OP_ERR;
	goto err;

inv_err:
	send_status =
		sqp->ibqp.qp_type == IB_QPT_RC ?
			IB_WC_REM_INV_REQ_ERR :
			IB_WC_SUCCESS;
	wc.status = IB_WC_LOC_QP_OP_ERR;
	goto err;

acc_err:
	send_status = IB_WC_REM_ACCESS_ERR;
	wc.status = IB_WC_LOC_PROT_ERR;
err:
	/* responder goes to error state */
	rvt_rc_error(qp, wc.status);

serr:
	spin_unlock_irqrestore(&qp->r_lock, flags);
serr_no_r_lock:
	spin_lock_irqsave(&sqp->s_lock, flags);
	spin_lock(&sqp->r_lock);
	rvt_send_complete(sqp, wqe, send_status);
	spin_unlock(&sqp->r_lock);
	if (sqp->ibqp.qp_type == IB_QPT_RC) {
		int lastwqe;

		spin_lock(&sqp->r_lock);
		lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
		spin_unlock(&sqp->r_lock);

		sqp->s_flags &= ~RVT_S_BUSY;
		spin_unlock_irqrestore(&sqp->s_lock, flags);
		if (lastwqe) {
			struct ib_event ev;

			ev.device = sqp->ibqp.device;
			ev.element.qp = &sqp->ibqp;
			ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
			sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
		}
		goto done;
	}
clr_busy:
	sqp->s_flags &= ~RVT_S_BUSY;
unlock:
	spin_unlock_irqrestore(&sqp->s_lock, flags);
done:
	rcu_read_unlock();
}
EXPORT_SYMBOL(rvt_ruc_loopback);
3217