1 /* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * Copyright (c) 2004 Voltaire, Inc. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the 10 * OpenIB.org BSD license below: 11 * 12 * Redistribution and use in source and binary forms, with or 13 * without modification, are permitted provided that the following 14 * conditions are met: 15 * 16 * - Redistributions of source code must retain the above 17 * copyright notice, this list of conditions and the following 18 * disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials 23 * provided with the distribution. 24 * 25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 32 * SOFTWARE. 33 */ 34 35 #include "ipoib.h" 36 37 #include <linux/module.h> 38 39 #include <linux/init.h> 40 #include <linux/slab.h> 41 #include <linux/kernel.h> 42 #include <linux/vmalloc.h> 43 44 #include <linux/if_arp.h> /* For ARPHRD_xxx */ 45 46 #include <linux/ip.h> 47 #include <linux/in.h> 48 49 #include <linux/jhash.h> 50 #include <net/arp.h> 51 #include <net/addrconf.h> 52 #include <net/netdev_lock.h> 53 #include <net/pkt_sched.h> 54 #include <linux/inetdevice.h> 55 #include <rdma/ib_cache.h> 56 57 MODULE_AUTHOR("Roland Dreier"); 58 MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); 59 MODULE_LICENSE("Dual BSD/GPL"); 60 61 int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; 62 int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; 63 64 module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); 65 MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); 66 module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); 67 MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); 68 69 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 70 int ipoib_debug_level; 71 72 module_param_named(debug_level, ipoib_debug_level, int, 0644); 73 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); 74 #endif 75 76 struct ipoib_path_iter { 77 struct net_device *dev; 78 struct ipoib_path path; 79 }; 80 81 static const u8 ipv4_bcast_addr[] = { 82 0x00, 0xff, 0xff, 0xff, 83 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 84 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff 85 }; 86 87 struct workqueue_struct *ipoib_workqueue; 88 89 struct ib_sa_client ipoib_sa_client; 90 91 static int ipoib_add_one(struct ib_device *device); 92 static void ipoib_remove_one(struct ib_device *device, void *client_data); 93 static void ipoib_neigh_reclaim(struct rcu_head *rp); 94 static struct net_device *ipoib_get_net_dev_by_params( 95 struct ib_device *dev, u32 port, u16 pkey, 96 const union ib_gid *gid, const struct sockaddr *addr, 97 void *client_data); 98 static int ipoib_set_mac(struct net_device *dev, void *addr); 99 static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, 100 int cmd); 101 102 static struct ib_client ipoib_client = { 103 .name = "ipoib", 104 .add = ipoib_add_one, 105 .remove = ipoib_remove_one, 106 .get_net_dev_by_params = ipoib_get_net_dev_by_params, 107 }; 108 109 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 110 static int ipoib_netdev_event(struct notifier_block *this, 111 unsigned long event, void *ptr) 112 { 113 struct netdev_notifier_info *ni = ptr; 114 struct net_device *dev = ni->dev; 115 116 if (dev->netdev_ops->ndo_open != ipoib_open) 117 return NOTIFY_DONE; 118 119 switch (event) { 120 case NETDEV_REGISTER: 121 ipoib_create_debug_files(dev); 122 break; 123 case NETDEV_CHANGENAME: 124 ipoib_delete_debug_files(dev); 125 ipoib_create_debug_files(dev); 126 break; 127 case NETDEV_UNREGISTER: 128 ipoib_delete_debug_files(dev); 129 break; 130 } 131 132 return NOTIFY_DONE; 133 } 134 #endif 135 136 struct ipoib_ifupdown_work { 137 struct work_struct work; 138 struct net_device *dev; 139 netdevice_tracker dev_tracker; 140 bool up; 141 }; 142 143 static void ipoib_ifupdown_task(struct work_struct *work) 144 { 145 struct ipoib_ifupdown_work *pwork = 146 container_of(work, struct ipoib_ifupdown_work, work); 147 struct net_device *dev = pwork->dev; 148 unsigned int flags; 149 150 rtnl_lock(); 151 flags = dev->flags; 152 if (pwork->up) 153 flags |= IFF_UP; 154 else 155 flags &= ~IFF_UP; 156 157 if (dev->flags != flags) 158 dev_change_flags(dev, flags, NULL); 159 rtnl_unlock(); 160 netdev_put(dev, &pwork->dev_tracker); 161 kfree(pwork); 162 } 163 164 static void ipoib_schedule_ifupdown_task(struct net_device *dev, bool up) 165 { 166 struct ipoib_ifupdown_work *work; 167 168 if ((up && (dev->flags & IFF_UP)) || 169 (!up && !(dev->flags & IFF_UP))) 170 return; 171 172 work = kmalloc(sizeof(*work), GFP_KERNEL); 173 if (!work) 174 return; 175 work->dev = dev; 176 netdev_hold(dev, &work->dev_tracker, GFP_KERNEL); 177 work->up = up; 178 INIT_WORK(&work->work, ipoib_ifupdown_task); 179 queue_work(ipoib_workqueue, &work->work); 180 } 181 182 int ipoib_open(struct net_device *dev) 183 { 184 struct ipoib_dev_priv *priv = ipoib_priv(dev); 185 186 ipoib_dbg(priv, "bringing up interface\n"); 187 188 netif_carrier_off(dev); 189 190 set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); 191 192 if (ipoib_ib_dev_open(dev)) { 193 if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) 194 return 0; 195 goto err_disable; 196 } 197 198 ipoib_ib_dev_up(dev); 199 200 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { 201 struct ipoib_dev_priv *cpriv; 202 203 /* Bring up any child interfaces too */ 204 netdev_lock_ops_to_full(dev); 205 list_for_each_entry(cpriv, &priv->child_intfs, list) 206 ipoib_schedule_ifupdown_task(cpriv->dev, true); 207 netdev_unlock_full_to_ops(dev); 208 } else if (priv->parent) { 209 struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); 210 211 if (!test_bit(IPOIB_FLAG_ADMIN_UP, &ppriv->flags)) 212 ipoib_dbg(priv, "parent device %s is not up, so child device may be not functioning.\n", 213 ppriv->dev->name); 214 } 215 netif_start_queue(dev); 216 217 return 0; 218 219 err_disable: 220 clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); 221 222 return -EINVAL; 223 } 224 225 static int ipoib_stop(struct net_device *dev) 226 { 227 struct ipoib_dev_priv *priv = ipoib_priv(dev); 228 229 ipoib_dbg(priv, "stopping interface\n"); 230 231 clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); 232 233 netif_stop_queue(dev); 234 235 ipoib_ib_dev_down(dev); 236 ipoib_ib_dev_stop(dev); 237 238 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { 239 struct ipoib_dev_priv *cpriv; 240 241 /* Bring down any child interfaces too */ 242 netdev_lock_ops_to_full(dev); 243 list_for_each_entry(cpriv, &priv->child_intfs, list) 244 ipoib_schedule_ifupdown_task(cpriv->dev, false); 245 netdev_unlock_full_to_ops(dev); 246 } 247 248 return 0; 249 } 250 251 static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) 252 { 253 struct ipoib_dev_priv *priv = ipoib_priv(dev); 254 255 if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) 256 features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO); 257 258 return features; 259 } 260 261 static int ipoib_change_mtu(struct net_device *dev, int new_mtu) 262 { 263 struct ipoib_dev_priv *priv = ipoib_priv(dev); 264 int ret = 0; 265 266 /* dev->mtu > 2K ==> connected mode */ 267 if (ipoib_cm_admin_enabled(dev)) { 268 if (new_mtu > ipoib_cm_max_mtu(dev)) 269 return -EINVAL; 270 271 if (new_mtu > priv->mcast_mtu) 272 ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", 273 priv->mcast_mtu); 274 275 WRITE_ONCE(dev->mtu, new_mtu); 276 return 0; 277 } 278 279 if (new_mtu < (ETH_MIN_MTU + IPOIB_ENCAP_LEN) || 280 new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) 281 return -EINVAL; 282 283 priv->admin_mtu = new_mtu; 284 285 if (priv->mcast_mtu < priv->admin_mtu) 286 ipoib_dbg(priv, "MTU must be smaller than the underlying " 287 "link layer MTU - 4 (%u)\n", priv->mcast_mtu); 288 289 new_mtu = min(priv->mcast_mtu, priv->admin_mtu); 290 291 if (priv->rn_ops->ndo_change_mtu) { 292 bool carrier_status = netif_carrier_ok(dev); 293 294 netif_carrier_off(dev); 295 296 /* notify lower level on the real mtu */ 297 ret = priv->rn_ops->ndo_change_mtu(dev, new_mtu); 298 299 if (carrier_status) 300 netif_carrier_on(dev); 301 } else { 302 WRITE_ONCE(dev->mtu, new_mtu); 303 } 304 305 return ret; 306 } 307 308 static void ipoib_get_stats(struct net_device *dev, 309 struct rtnl_link_stats64 *stats) 310 { 311 struct ipoib_dev_priv *priv = ipoib_priv(dev); 312 313 if (priv->rn_ops->ndo_get_stats64) 314 priv->rn_ops->ndo_get_stats64(dev, stats); 315 else 316 netdev_stats_to_stats64(stats, &dev->stats); 317 } 318 319 /* Called with an RCU read lock taken */ 320 static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr, 321 struct net_device *dev) 322 { 323 struct net *net = dev_net(dev); 324 struct in_device *in_dev; 325 struct sockaddr_in *addr_in = (struct sockaddr_in *)addr; 326 struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr; 327 __be32 ret_addr; 328 329 switch (addr->sa_family) { 330 case AF_INET: 331 in_dev = in_dev_get(dev); 332 if (!in_dev) 333 return false; 334 335 ret_addr = inet_confirm_addr(net, in_dev, 0, 336 addr_in->sin_addr.s_addr, 337 RT_SCOPE_HOST); 338 in_dev_put(in_dev); 339 if (ret_addr) 340 return true; 341 342 break; 343 case AF_INET6: 344 if (IS_ENABLED(CONFIG_IPV6) && 345 ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1)) 346 return true; 347 348 break; 349 } 350 return false; 351 } 352 353 /* 354 * Find the master net_device on top of the given net_device. 355 * @dev: base IPoIB net_device 356 * 357 * Returns the master net_device with a reference held, or the same net_device 358 * if no master exists. 359 */ 360 static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) 361 { 362 struct net_device *master; 363 364 rcu_read_lock(); 365 master = netdev_master_upper_dev_get_rcu(dev); 366 dev_hold(master); 367 rcu_read_unlock(); 368 369 if (master) 370 return master; 371 372 dev_hold(dev); 373 return dev; 374 } 375 376 struct ipoib_walk_data { 377 const struct sockaddr *addr; 378 struct net_device *result; 379 }; 380 381 static int ipoib_upper_walk(struct net_device *upper, 382 struct netdev_nested_priv *priv) 383 { 384 struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data; 385 int ret = 0; 386 387 if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) { 388 dev_hold(upper); 389 data->result = upper; 390 ret = 1; 391 } 392 393 return ret; 394 } 395 396 /** 397 * ipoib_get_net_dev_match_addr - Find a net_device matching 398 * the given address, which is an upper device of the given net_device. 399 * 400 * @addr: IP address to look for. 401 * @dev: base IPoIB net_device 402 * 403 * If found, returns the net_device with a reference held. Otherwise return 404 * NULL. 405 */ 406 static struct net_device *ipoib_get_net_dev_match_addr( 407 const struct sockaddr *addr, struct net_device *dev) 408 { 409 struct netdev_nested_priv priv; 410 struct ipoib_walk_data data = { 411 .addr = addr, 412 }; 413 414 priv.data = (void *)&data; 415 rcu_read_lock(); 416 if (ipoib_is_dev_match_addr_rcu(addr, dev)) { 417 dev_hold(dev); 418 data.result = dev; 419 goto out; 420 } 421 422 netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv); 423 out: 424 rcu_read_unlock(); 425 return data.result; 426 } 427 428 /* returns the number of IPoIB netdevs on top a given ipoib device matching a 429 * pkey_index and address, if one exists. 430 * 431 * @found_net_dev: contains a matching net_device if the return value >= 1, 432 * with a reference held. */ 433 static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv, 434 const union ib_gid *gid, 435 u16 pkey_index, 436 const struct sockaddr *addr, 437 int nesting, 438 struct net_device **found_net_dev) 439 { 440 struct ipoib_dev_priv *child_priv; 441 struct net_device *net_dev = NULL; 442 int matches = 0; 443 444 if (priv->pkey_index == pkey_index && 445 (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) { 446 if (!addr) { 447 net_dev = ipoib_get_master_net_dev(priv->dev); 448 } else { 449 /* Verify the net_device matches the IP address, as 450 * IPoIB child devices currently share a GID. */ 451 net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev); 452 } 453 if (net_dev) { 454 if (!*found_net_dev) 455 *found_net_dev = net_dev; 456 else 457 dev_put(net_dev); 458 ++matches; 459 } 460 } 461 462 if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) 463 return matches; 464 465 /* Check child interfaces */ 466 netdev_lock(priv->dev); 467 list_for_each_entry(child_priv, &priv->child_intfs, list) { 468 matches += ipoib_match_gid_pkey_addr(child_priv, gid, 469 pkey_index, addr, 470 nesting + 1, 471 found_net_dev); 472 if (matches > 1) 473 break; 474 } 475 netdev_unlock(priv->dev); 476 477 return matches; 478 } 479 480 /* Returns the number of matching net_devs found (between 0 and 2). Also 481 * return the matching net_device in the @net_dev parameter, holding a 482 * reference to the net_device, if the number of matches >= 1 */ 483 static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u32 port, 484 u16 pkey_index, 485 const union ib_gid *gid, 486 const struct sockaddr *addr, 487 struct net_device **net_dev) 488 { 489 struct ipoib_dev_priv *priv; 490 int matches = 0; 491 492 *net_dev = NULL; 493 494 list_for_each_entry(priv, dev_list, list) { 495 if (priv->port != port) 496 continue; 497 498 matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index, 499 addr, 0, net_dev); 500 if (matches > 1) 501 break; 502 } 503 504 return matches; 505 } 506 507 static struct net_device *ipoib_get_net_dev_by_params( 508 struct ib_device *dev, u32 port, u16 pkey, 509 const union ib_gid *gid, const struct sockaddr *addr, 510 void *client_data) 511 { 512 struct net_device *net_dev; 513 struct list_head *dev_list = client_data; 514 u16 pkey_index; 515 int matches; 516 int ret; 517 518 if (!rdma_protocol_ib(dev, port)) 519 return NULL; 520 521 ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index); 522 if (ret) 523 return NULL; 524 525 /* See if we can find a unique device matching the L2 parameters */ 526 matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, 527 gid, NULL, &net_dev); 528 529 switch (matches) { 530 case 0: 531 return NULL; 532 case 1: 533 return net_dev; 534 } 535 536 dev_put(net_dev); 537 538 /* Couldn't find a unique device with L2 parameters only. Use L3 539 * address to uniquely match the net device */ 540 matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, 541 gid, addr, &net_dev); 542 switch (matches) { 543 case 0: 544 return NULL; 545 default: 546 dev_warn_ratelimited(&dev->dev, 547 "duplicate IP address detected\n"); 548 fallthrough; 549 case 1: 550 return net_dev; 551 } 552 } 553 554 int ipoib_set_mode(struct net_device *dev, const char *buf) 555 { 556 struct ipoib_dev_priv *priv = ipoib_priv(dev); 557 558 if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && 559 !strcmp(buf, "connected\n")) || 560 (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && 561 !strcmp(buf, "datagram\n"))) { 562 return 0; 563 } 564 565 /* flush paths if we switch modes so that connections are restarted */ 566 if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { 567 set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); 568 ipoib_warn(priv, "enabling connected mode " 569 "will cause multicast packet drops\n"); 570 netdev_lock_ops(dev); 571 netdev_update_features(dev); 572 netif_set_mtu(dev, ipoib_cm_max_mtu(dev)); 573 netif_set_real_num_tx_queues(dev, 1); 574 netdev_unlock_ops(dev); 575 rtnl_unlock(); 576 priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; 577 578 ipoib_flush_paths(dev); 579 return (!rtnl_trylock()) ? -EBUSY : 0; 580 } 581 582 if (!strcmp(buf, "datagram\n")) { 583 clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); 584 netdev_lock_ops(dev); 585 netdev_update_features(dev); 586 netif_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); 587 netif_set_real_num_tx_queues(dev, dev->num_tx_queues); 588 netdev_unlock_ops(dev); 589 rtnl_unlock(); 590 ipoib_flush_paths(dev); 591 return (!rtnl_trylock()) ? -EBUSY : 0; 592 } 593 594 return -EINVAL; 595 } 596 597 struct ipoib_path *__path_find(struct net_device *dev, void *gid) 598 { 599 struct ipoib_dev_priv *priv = ipoib_priv(dev); 600 struct rb_node *n = priv->path_tree.rb_node; 601 struct ipoib_path *path; 602 int ret; 603 604 while (n) { 605 path = rb_entry(n, struct ipoib_path, rb_node); 606 607 ret = memcmp(gid, path->pathrec.dgid.raw, 608 sizeof (union ib_gid)); 609 610 if (ret < 0) 611 n = n->rb_left; 612 else if (ret > 0) 613 n = n->rb_right; 614 else 615 return path; 616 } 617 618 return NULL; 619 } 620 621 static int __path_add(struct net_device *dev, struct ipoib_path *path) 622 { 623 struct ipoib_dev_priv *priv = ipoib_priv(dev); 624 struct rb_node **n = &priv->path_tree.rb_node; 625 struct rb_node *pn = NULL; 626 struct ipoib_path *tpath; 627 int ret; 628 629 while (*n) { 630 pn = *n; 631 tpath = rb_entry(pn, struct ipoib_path, rb_node); 632 633 ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, 634 sizeof (union ib_gid)); 635 if (ret < 0) 636 n = &pn->rb_left; 637 else if (ret > 0) 638 n = &pn->rb_right; 639 else 640 return -EEXIST; 641 } 642 643 rb_link_node(&path->rb_node, pn, n); 644 rb_insert_color(&path->rb_node, &priv->path_tree); 645 646 list_add_tail(&path->list, &priv->path_list); 647 648 return 0; 649 } 650 651 static void path_free(struct net_device *dev, struct ipoib_path *path) 652 { 653 struct sk_buff *skb; 654 655 while ((skb = __skb_dequeue(&path->queue))) 656 dev_kfree_skb_irq(skb); 657 658 ipoib_dbg(ipoib_priv(dev), "%s\n", __func__); 659 660 /* remove all neigh connected to this path */ 661 ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); 662 663 if (path->ah) 664 ipoib_put_ah(path->ah); 665 666 kfree(path); 667 } 668 669 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 670 671 struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) 672 { 673 struct ipoib_path_iter *iter; 674 675 iter = kmalloc(sizeof(*iter), GFP_KERNEL); 676 if (!iter) 677 return NULL; 678 679 iter->dev = dev; 680 memset(iter->path.pathrec.dgid.raw, 0, 16); 681 682 if (ipoib_path_iter_next(iter)) { 683 kfree(iter); 684 return NULL; 685 } 686 687 return iter; 688 } 689 690 int ipoib_path_iter_next(struct ipoib_path_iter *iter) 691 { 692 struct ipoib_dev_priv *priv = ipoib_priv(iter->dev); 693 struct rb_node *n; 694 struct ipoib_path *path; 695 int ret = 1; 696 697 spin_lock_irq(&priv->lock); 698 699 n = rb_first(&priv->path_tree); 700 701 while (n) { 702 path = rb_entry(n, struct ipoib_path, rb_node); 703 704 if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, 705 sizeof (union ib_gid)) < 0) { 706 iter->path = *path; 707 ret = 0; 708 break; 709 } 710 711 n = rb_next(n); 712 } 713 714 spin_unlock_irq(&priv->lock); 715 716 return ret; 717 } 718 719 void ipoib_path_iter_read(struct ipoib_path_iter *iter, 720 struct ipoib_path *path) 721 { 722 *path = iter->path; 723 } 724 725 #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ 726 727 void ipoib_mark_paths_invalid(struct net_device *dev) 728 { 729 struct ipoib_dev_priv *priv = ipoib_priv(dev); 730 struct ipoib_path *path, *tp; 731 732 spin_lock_irq(&priv->lock); 733 734 list_for_each_entry_safe(path, tp, &priv->path_list, list) { 735 ipoib_dbg(priv, "mark path LID 0x%08x GID %pI6 invalid\n", 736 be32_to_cpu(sa_path_get_dlid(&path->pathrec)), 737 path->pathrec.dgid.raw); 738 if (path->ah) 739 path->ah->valid = 0; 740 } 741 742 spin_unlock_irq(&priv->lock); 743 } 744 745 static void push_pseudo_header(struct sk_buff *skb, const char *daddr) 746 { 747 struct ipoib_pseudo_header *phdr; 748 749 phdr = skb_push(skb, sizeof(*phdr)); 750 memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); 751 } 752 753 void ipoib_flush_paths(struct net_device *dev) 754 { 755 struct ipoib_dev_priv *priv = ipoib_priv(dev); 756 struct ipoib_path *path, *tp; 757 LIST_HEAD(remove_list); 758 unsigned long flags; 759 760 netif_tx_lock_bh(dev); 761 spin_lock_irqsave(&priv->lock, flags); 762 763 list_splice_init(&priv->path_list, &remove_list); 764 765 list_for_each_entry(path, &remove_list, list) 766 rb_erase(&path->rb_node, &priv->path_tree); 767 768 list_for_each_entry_safe(path, tp, &remove_list, list) { 769 if (path->query) 770 ib_sa_cancel_query(path->query_id, path->query); 771 spin_unlock_irqrestore(&priv->lock, flags); 772 netif_tx_unlock_bh(dev); 773 wait_for_completion(&path->done); 774 path_free(dev, path); 775 netif_tx_lock_bh(dev); 776 spin_lock_irqsave(&priv->lock, flags); 777 } 778 779 spin_unlock_irqrestore(&priv->lock, flags); 780 netif_tx_unlock_bh(dev); 781 } 782 783 static void path_rec_completion(int status, 784 struct sa_path_rec *pathrec, 785 unsigned int num_prs, void *path_ptr) 786 { 787 struct ipoib_path *path = path_ptr; 788 struct net_device *dev = path->dev; 789 struct ipoib_dev_priv *priv = ipoib_priv(dev); 790 struct ipoib_ah *ah = NULL; 791 struct ipoib_ah *old_ah = NULL; 792 struct ipoib_neigh *neigh, *tn; 793 struct sk_buff_head skqueue; 794 struct sk_buff *skb; 795 unsigned long flags; 796 797 if (!status) 798 ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n", 799 be32_to_cpu(sa_path_get_dlid(pathrec)), 800 pathrec->dgid.raw); 801 else 802 ipoib_dbg(priv, "PathRec status %d for GID %pI6\n", 803 status, path->pathrec.dgid.raw); 804 805 skb_queue_head_init(&skqueue); 806 807 if (!status) { 808 struct rdma_ah_attr av; 809 810 if (!ib_init_ah_attr_from_path(priv->ca, priv->port, 811 pathrec, &av, NULL)) { 812 ah = ipoib_create_ah(dev, priv->pd, &av); 813 rdma_destroy_ah_attr(&av); 814 } 815 } 816 817 spin_lock_irqsave(&priv->lock, flags); 818 819 if (!IS_ERR_OR_NULL(ah)) { 820 /* 821 * pathrec.dgid is used as the database key from the LLADDR, 822 * it must remain unchanged even if the SA returns a different 823 * GID to use in the AH. 824 */ 825 if (memcmp(pathrec->dgid.raw, path->pathrec.dgid.raw, 826 sizeof(union ib_gid))) { 827 ipoib_dbg( 828 priv, 829 "%s got PathRec for gid %pI6 while asked for %pI6\n", 830 dev->name, pathrec->dgid.raw, 831 path->pathrec.dgid.raw); 832 memcpy(pathrec->dgid.raw, path->pathrec.dgid.raw, 833 sizeof(union ib_gid)); 834 } 835 836 path->pathrec = *pathrec; 837 838 old_ah = path->ah; 839 path->ah = ah; 840 841 ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", 842 ah, be32_to_cpu(sa_path_get_dlid(pathrec)), 843 pathrec->sl); 844 845 while ((skb = __skb_dequeue(&path->queue))) 846 __skb_queue_tail(&skqueue, skb); 847 848 list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { 849 if (neigh->ah) { 850 WARN_ON(neigh->ah != old_ah); 851 /* 852 * Dropping the ah reference inside 853 * priv->lock is safe here, because we 854 * will hold one more reference from 855 * the original value of path->ah (ie 856 * old_ah). 857 */ 858 ipoib_put_ah(neigh->ah); 859 } 860 kref_get(&path->ah->ref); 861 neigh->ah = path->ah; 862 863 if (ipoib_cm_enabled(dev, neigh->daddr)) { 864 if (!ipoib_cm_get(neigh)) 865 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, 866 path, 867 neigh)); 868 if (!ipoib_cm_get(neigh)) { 869 ipoib_neigh_free(neigh); 870 continue; 871 } 872 } 873 874 while ((skb = __skb_dequeue(&neigh->queue))) 875 __skb_queue_tail(&skqueue, skb); 876 } 877 path->ah->valid = 1; 878 } 879 880 path->query = NULL; 881 complete(&path->done); 882 883 spin_unlock_irqrestore(&priv->lock, flags); 884 885 if (IS_ERR_OR_NULL(ah)) 886 ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); 887 888 if (old_ah) 889 ipoib_put_ah(old_ah); 890 891 while ((skb = __skb_dequeue(&skqueue))) { 892 int ret; 893 skb->dev = dev; 894 ret = dev_queue_xmit(skb); 895 if (ret) 896 ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n", 897 __func__, ret); 898 } 899 } 900 901 static void init_path_rec(struct ipoib_dev_priv *priv, struct ipoib_path *path, 902 void *gid) 903 { 904 path->dev = priv->dev; 905 906 if (rdma_cap_opa_ah(priv->ca, priv->port)) 907 path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA; 908 else 909 path->pathrec.rec_type = SA_PATH_REC_TYPE_IB; 910 911 memcpy(path->pathrec.dgid.raw, gid, sizeof(union ib_gid)); 912 path->pathrec.sgid = priv->local_gid; 913 path->pathrec.pkey = cpu_to_be16(priv->pkey); 914 path->pathrec.numb_path = 1; 915 path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; 916 } 917 918 static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) 919 { 920 struct ipoib_dev_priv *priv = ipoib_priv(dev); 921 struct ipoib_path *path; 922 923 if (!priv->broadcast) 924 return NULL; 925 926 path = kzalloc(sizeof(*path), GFP_ATOMIC); 927 if (!path) 928 return NULL; 929 930 skb_queue_head_init(&path->queue); 931 932 INIT_LIST_HEAD(&path->neigh_list); 933 934 init_path_rec(priv, path, gid); 935 936 return path; 937 } 938 939 static int path_rec_start(struct net_device *dev, 940 struct ipoib_path *path) 941 { 942 struct ipoib_dev_priv *priv = ipoib_priv(dev); 943 944 ipoib_dbg(priv, "Start path record lookup for %pI6\n", 945 path->pathrec.dgid.raw); 946 947 init_completion(&path->done); 948 949 path->query_id = 950 ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, 951 &path->pathrec, 952 IB_SA_PATH_REC_DGID | 953 IB_SA_PATH_REC_SGID | 954 IB_SA_PATH_REC_NUMB_PATH | 955 IB_SA_PATH_REC_TRAFFIC_CLASS | 956 IB_SA_PATH_REC_PKEY, 957 1000, GFP_ATOMIC, 958 path_rec_completion, 959 path, &path->query); 960 if (path->query_id < 0) { 961 ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); 962 path->query = NULL; 963 complete(&path->done); 964 return path->query_id; 965 } 966 967 return 0; 968 } 969 970 static void neigh_refresh_path(struct ipoib_neigh *neigh, u8 *daddr, 971 struct net_device *dev) 972 { 973 struct ipoib_dev_priv *priv = ipoib_priv(dev); 974 struct ipoib_path *path; 975 unsigned long flags; 976 977 spin_lock_irqsave(&priv->lock, flags); 978 979 path = __path_find(dev, daddr + 4); 980 if (!path) 981 goto out; 982 if (!path->query) 983 path_rec_start(dev, path); 984 out: 985 spin_unlock_irqrestore(&priv->lock, flags); 986 } 987 988 static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr, 989 struct net_device *dev) 990 { 991 struct ipoib_dev_priv *priv = ipoib_priv(dev); 992 struct rdma_netdev *rn = netdev_priv(dev); 993 struct ipoib_path *path; 994 struct ipoib_neigh *neigh; 995 unsigned long flags; 996 997 spin_lock_irqsave(&priv->lock, flags); 998 neigh = ipoib_neigh_alloc(daddr, dev); 999 if (!neigh) { 1000 spin_unlock_irqrestore(&priv->lock, flags); 1001 ++dev->stats.tx_dropped; 1002 dev_kfree_skb_any(skb); 1003 return NULL; 1004 } 1005 1006 /* To avoid race condition, make sure that the 1007 * neigh will be added only once. 1008 */ 1009 if (unlikely(!list_empty(&neigh->list))) { 1010 spin_unlock_irqrestore(&priv->lock, flags); 1011 return neigh; 1012 } 1013 1014 path = __path_find(dev, daddr + 4); 1015 if (!path) { 1016 path = path_rec_create(dev, daddr + 4); 1017 if (!path) 1018 goto err_path; 1019 1020 __path_add(dev, path); 1021 } 1022 1023 list_add_tail(&neigh->list, &path->neigh_list); 1024 1025 if (path->ah && path->ah->valid) { 1026 kref_get(&path->ah->ref); 1027 neigh->ah = path->ah; 1028 1029 if (ipoib_cm_enabled(dev, neigh->daddr)) { 1030 if (!ipoib_cm_get(neigh)) 1031 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); 1032 if (!ipoib_cm_get(neigh)) { 1033 ipoib_neigh_free(neigh); 1034 goto err_drop; 1035 } 1036 if (skb_queue_len(&neigh->queue) < 1037 IPOIB_MAX_PATH_REC_QUEUE) { 1038 push_pseudo_header(skb, neigh->daddr); 1039 __skb_queue_tail(&neigh->queue, skb); 1040 } else { 1041 ipoib_warn(priv, "queue length limit %d. Packet drop.\n", 1042 skb_queue_len(&neigh->queue)); 1043 goto err_drop; 1044 } 1045 } else { 1046 spin_unlock_irqrestore(&priv->lock, flags); 1047 path->ah->last_send = rn->send(dev, skb, path->ah->ah, 1048 IPOIB_QPN(daddr)); 1049 ipoib_neigh_put(neigh); 1050 return NULL; 1051 } 1052 } else { 1053 neigh->ah = NULL; 1054 1055 if (!path->query && path_rec_start(dev, path)) 1056 goto err_path; 1057 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { 1058 push_pseudo_header(skb, neigh->daddr); 1059 __skb_queue_tail(&neigh->queue, skb); 1060 } else { 1061 goto err_drop; 1062 } 1063 } 1064 1065 spin_unlock_irqrestore(&priv->lock, flags); 1066 ipoib_neigh_put(neigh); 1067 return NULL; 1068 1069 err_path: 1070 ipoib_neigh_free(neigh); 1071 err_drop: 1072 ++dev->stats.tx_dropped; 1073 dev_kfree_skb_any(skb); 1074 1075 spin_unlock_irqrestore(&priv->lock, flags); 1076 ipoib_neigh_put(neigh); 1077 1078 return NULL; 1079 } 1080 1081 static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, 1082 struct ipoib_pseudo_header *phdr) 1083 { 1084 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1085 struct rdma_netdev *rn = netdev_priv(dev); 1086 struct ipoib_path *path; 1087 unsigned long flags; 1088 1089 spin_lock_irqsave(&priv->lock, flags); 1090 1091 /* no broadcast means that all paths are (going to be) not valid */ 1092 if (!priv->broadcast) 1093 goto drop_and_unlock; 1094 1095 path = __path_find(dev, phdr->hwaddr + 4); 1096 if (!path || !path->ah || !path->ah->valid) { 1097 if (!path) { 1098 path = path_rec_create(dev, phdr->hwaddr + 4); 1099 if (!path) 1100 goto drop_and_unlock; 1101 __path_add(dev, path); 1102 } else { 1103 /* 1104 * make sure there are no changes in the existing 1105 * path record 1106 */ 1107 init_path_rec(priv, path, phdr->hwaddr + 4); 1108 } 1109 if (!path->query && path_rec_start(dev, path)) { 1110 goto drop_and_unlock; 1111 } 1112 1113 if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { 1114 push_pseudo_header(skb, phdr->hwaddr); 1115 __skb_queue_tail(&path->queue, skb); 1116 goto unlock; 1117 } else { 1118 goto drop_and_unlock; 1119 } 1120 } 1121 1122 spin_unlock_irqrestore(&priv->lock, flags); 1123 ipoib_dbg(priv, "Send unicast ARP to %08x\n", 1124 be32_to_cpu(sa_path_get_dlid(&path->pathrec))); 1125 path->ah->last_send = rn->send(dev, skb, path->ah->ah, 1126 IPOIB_QPN(phdr->hwaddr)); 1127 return; 1128 1129 drop_and_unlock: 1130 ++dev->stats.tx_dropped; 1131 dev_kfree_skb_any(skb); 1132 unlock: 1133 spin_unlock_irqrestore(&priv->lock, flags); 1134 } 1135 1136 static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) 1137 { 1138 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1139 struct rdma_netdev *rn = netdev_priv(dev); 1140 struct ipoib_neigh *neigh; 1141 struct ipoib_pseudo_header *phdr; 1142 struct ipoib_header *header; 1143 unsigned long flags; 1144 1145 phdr = (struct ipoib_pseudo_header *) skb->data; 1146 skb_pull(skb, sizeof(*phdr)); 1147 header = (struct ipoib_header *) skb->data; 1148 1149 if (unlikely(phdr->hwaddr[4] == 0xff)) { 1150 /* multicast, arrange "if" according to probability */ 1151 if ((header->proto != htons(ETH_P_IP)) && 1152 (header->proto != htons(ETH_P_IPV6)) && 1153 (header->proto != htons(ETH_P_ARP)) && 1154 (header->proto != htons(ETH_P_RARP)) && 1155 (header->proto != htons(ETH_P_TIPC))) { 1156 /* ethertype not supported by IPoIB */ 1157 ++dev->stats.tx_dropped; 1158 dev_kfree_skb_any(skb); 1159 return NETDEV_TX_OK; 1160 } 1161 /* Add in the P_Key for multicast*/ 1162 phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; 1163 phdr->hwaddr[9] = priv->pkey & 0xff; 1164 1165 neigh = ipoib_neigh_get(dev, phdr->hwaddr); 1166 if (likely(neigh)) 1167 goto send_using_neigh; 1168 ipoib_mcast_send(dev, phdr->hwaddr, skb); 1169 return NETDEV_TX_OK; 1170 } 1171 1172 /* unicast, arrange "switch" according to probability */ 1173 switch (header->proto) { 1174 case htons(ETH_P_IP): 1175 case htons(ETH_P_IPV6): 1176 case htons(ETH_P_TIPC): 1177 neigh = ipoib_neigh_get(dev, phdr->hwaddr); 1178 if (unlikely(!neigh)) { 1179 neigh = neigh_add_path(skb, phdr->hwaddr, dev); 1180 if (likely(!neigh)) 1181 return NETDEV_TX_OK; 1182 } 1183 break; 1184 case htons(ETH_P_ARP): 1185 case htons(ETH_P_RARP): 1186 /* for unicast ARP and RARP should always perform path find */ 1187 unicast_arp_send(skb, dev, phdr); 1188 return NETDEV_TX_OK; 1189 default: 1190 /* ethertype not supported by IPoIB */ 1191 ++dev->stats.tx_dropped; 1192 dev_kfree_skb_any(skb); 1193 return NETDEV_TX_OK; 1194 } 1195 1196 send_using_neigh: 1197 /* note we now hold a ref to neigh */ 1198 if (ipoib_cm_get(neigh)) { 1199 if (ipoib_cm_up(neigh)) { 1200 ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); 1201 goto unref; 1202 } 1203 } else if (neigh->ah && neigh->ah->valid) { 1204 neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah, 1205 IPOIB_QPN(phdr->hwaddr)); 1206 goto unref; 1207 } else if (neigh->ah) { 1208 neigh_refresh_path(neigh, phdr->hwaddr, dev); 1209 } 1210 1211 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { 1212 push_pseudo_header(skb, phdr->hwaddr); 1213 spin_lock_irqsave(&priv->lock, flags); 1214 __skb_queue_tail(&neigh->queue, skb); 1215 spin_unlock_irqrestore(&priv->lock, flags); 1216 } else { 1217 ++dev->stats.tx_dropped; 1218 dev_kfree_skb_any(skb); 1219 } 1220 1221 unref: 1222 ipoib_neigh_put(neigh); 1223 1224 return NETDEV_TX_OK; 1225 } 1226 1227 static void ipoib_timeout(struct net_device *dev, unsigned int txqueue) 1228 { 1229 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1230 struct rdma_netdev *rn = netdev_priv(dev); 1231 1232 if (rn->tx_timeout) { 1233 rn->tx_timeout(dev, txqueue); 1234 return; 1235 } 1236 ipoib_warn(priv, "transmit timeout: latency %d msecs\n", 1237 jiffies_to_msecs(jiffies - dev_trans_start(dev))); 1238 ipoib_warn(priv, 1239 "queue stopped %d, tx_head %u, tx_tail %u, global_tx_head %u, global_tx_tail %u\n", 1240 netif_queue_stopped(dev), priv->tx_head, priv->tx_tail, 1241 priv->global_tx_head, priv->global_tx_tail); 1242 1243 1244 schedule_work(&priv->tx_timeout_work); 1245 } 1246 1247 void ipoib_ib_tx_timeout_work(struct work_struct *work) 1248 { 1249 struct ipoib_dev_priv *priv = container_of(work, 1250 struct ipoib_dev_priv, 1251 tx_timeout_work); 1252 int err; 1253 1254 rtnl_lock(); 1255 netdev_lock_ops(priv->dev); 1256 1257 if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) 1258 goto unlock; 1259 1260 ipoib_stop(priv->dev); 1261 err = ipoib_open(priv->dev); 1262 if (err) { 1263 ipoib_warn(priv, "ipoib_open failed recovering from a tx_timeout, err(%d).\n", 1264 err); 1265 goto unlock; 1266 } 1267 1268 netif_tx_wake_all_queues(priv->dev); 1269 unlock: 1270 netdev_unlock_ops(priv->dev); 1271 rtnl_unlock(); 1272 1273 } 1274 1275 static int ipoib_hard_header(struct sk_buff *skb, 1276 struct net_device *dev, 1277 unsigned short type, 1278 const void *daddr, 1279 const void *saddr, 1280 unsigned int len) 1281 { 1282 struct ipoib_header *header; 1283 1284 header = skb_push(skb, sizeof(*header)); 1285 1286 header->proto = htons(type); 1287 header->reserved = 0; 1288 1289 /* 1290 * we don't rely on dst_entry structure, always stuff the 1291 * destination address into skb hard header so we can figure out where 1292 * to send the packet later. 1293 */ 1294 push_pseudo_header(skb, daddr); 1295 1296 return IPOIB_HARD_LEN; 1297 } 1298 1299 static void ipoib_set_mcast_list(struct net_device *dev) 1300 { 1301 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1302 1303 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { 1304 ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); 1305 return; 1306 } 1307 1308 queue_work(priv->wq, &priv->restart_task); 1309 } 1310 1311 static int ipoib_get_iflink(const struct net_device *dev) 1312 { 1313 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1314 1315 /* parent interface */ 1316 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) 1317 return READ_ONCE(dev->ifindex); 1318 1319 /* child/vlan interface */ 1320 return READ_ONCE(priv->parent->ifindex); 1321 } 1322 1323 static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) 1324 { 1325 /* 1326 * Use only the address parts that contributes to spreading 1327 * The subnet prefix is not used as one can not connect to 1328 * same remote port (GUID) using the same remote QPN via two 1329 * different subnets. 1330 */ 1331 /* qpn octets[1:4) & port GUID octets[12:20) */ 1332 u32 *d32 = (u32 *) daddr; 1333 u32 hv; 1334 1335 hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0); 1336 return hv & htbl->mask; 1337 } 1338 1339 struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) 1340 { 1341 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1342 struct ipoib_neigh_table *ntbl = &priv->ntbl; 1343 struct ipoib_neigh_hash *htbl; 1344 struct ipoib_neigh *neigh = NULL; 1345 u32 hash_val; 1346 1347 rcu_read_lock_bh(); 1348 1349 htbl = rcu_dereference_bh(ntbl->htbl); 1350 1351 if (!htbl) 1352 goto out_unlock; 1353 1354 hash_val = ipoib_addr_hash(htbl, daddr); 1355 for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]); 1356 neigh != NULL; 1357 neigh = rcu_dereference_bh(neigh->hnext)) { 1358 if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { 1359 /* found, take one ref on behalf of the caller */ 1360 if (!refcount_inc_not_zero(&neigh->refcnt)) { 1361 /* deleted */ 1362 neigh = NULL; 1363 goto out_unlock; 1364 } 1365 1366 if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)) 1367 neigh->alive = jiffies; 1368 goto out_unlock; 1369 } 1370 } 1371 1372 out_unlock: 1373 rcu_read_unlock_bh(); 1374 return neigh; 1375 } 1376 1377 static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) 1378 { 1379 struct ipoib_neigh_table *ntbl = &priv->ntbl; 1380 struct ipoib_neigh_hash *htbl; 1381 unsigned long neigh_obsolete; 1382 unsigned long dt; 1383 unsigned long flags; 1384 int i; 1385 LIST_HEAD(remove_list); 1386 1387 spin_lock_irqsave(&priv->lock, flags); 1388 1389 htbl = rcu_dereference_protected(ntbl->htbl, 1390 lockdep_is_held(&priv->lock)); 1391 1392 if (!htbl) 1393 goto out_unlock; 1394 1395 /* neigh is obsolete if it was idle for two GC periods */ 1396 dt = 2 * arp_tbl.gc_interval; 1397 neigh_obsolete = jiffies - dt; 1398 1399 for (i = 0; i < htbl->size; i++) { 1400 struct ipoib_neigh *neigh; 1401 struct ipoib_neigh __rcu **np = &htbl->buckets[i]; 1402 1403 while ((neigh = rcu_dereference_protected(*np, 1404 lockdep_is_held(&priv->lock))) != NULL) { 1405 /* was the neigh idle for two GC periods */ 1406 if (time_after(neigh_obsolete, neigh->alive)) { 1407 1408 ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list); 1409 1410 rcu_assign_pointer(*np, 1411 rcu_dereference_protected(neigh->hnext, 1412 lockdep_is_held(&priv->lock))); 1413 /* remove from path/mc list */ 1414 list_del_init(&neigh->list); 1415 call_rcu(&neigh->rcu, ipoib_neigh_reclaim); 1416 } else { 1417 np = &neigh->hnext; 1418 } 1419 1420 } 1421 } 1422 1423 out_unlock: 1424 spin_unlock_irqrestore(&priv->lock, flags); 1425 ipoib_mcast_remove_list(&remove_list); 1426 } 1427 1428 static void ipoib_reap_neigh(struct work_struct *work) 1429 { 1430 struct ipoib_dev_priv *priv = 1431 container_of(work, struct ipoib_dev_priv, neigh_reap_task.work); 1432 1433 __ipoib_reap_neigh(priv); 1434 1435 queue_delayed_work(priv->wq, &priv->neigh_reap_task, 1436 arp_tbl.gc_interval); 1437 } 1438 1439 1440 static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, 1441 struct net_device *dev) 1442 { 1443 struct ipoib_neigh *neigh; 1444 1445 neigh = kzalloc(sizeof(*neigh), GFP_ATOMIC); 1446 if (!neigh) 1447 return NULL; 1448 1449 neigh->dev = dev; 1450 memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr)); 1451 skb_queue_head_init(&neigh->queue); 1452 INIT_LIST_HEAD(&neigh->list); 1453 ipoib_cm_set(neigh, NULL); 1454 /* one ref on behalf of the caller */ 1455 refcount_set(&neigh->refcnt, 1); 1456 1457 return neigh; 1458 } 1459 1460 struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, 1461 struct net_device *dev) 1462 { 1463 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1464 struct ipoib_neigh_table *ntbl = &priv->ntbl; 1465 struct ipoib_neigh_hash *htbl; 1466 struct ipoib_neigh *neigh; 1467 u32 hash_val; 1468 1469 htbl = rcu_dereference_protected(ntbl->htbl, 1470 lockdep_is_held(&priv->lock)); 1471 if (!htbl) { 1472 neigh = NULL; 1473 goto out_unlock; 1474 } 1475 1476 /* need to add a new neigh, but maybe some other thread succeeded? 1477 * recalc hash, maybe hash resize took place so we do a search 1478 */ 1479 hash_val = ipoib_addr_hash(htbl, daddr); 1480 for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], 1481 lockdep_is_held(&priv->lock)); 1482 neigh != NULL; 1483 neigh = rcu_dereference_protected(neigh->hnext, 1484 lockdep_is_held(&priv->lock))) { 1485 if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { 1486 /* found, take one ref on behalf of the caller */ 1487 if (!refcount_inc_not_zero(&neigh->refcnt)) { 1488 /* deleted */ 1489 neigh = NULL; 1490 break; 1491 } 1492 neigh->alive = jiffies; 1493 goto out_unlock; 1494 } 1495 } 1496 1497 neigh = ipoib_neigh_ctor(daddr, dev); 1498 if (!neigh) 1499 goto out_unlock; 1500 1501 /* one ref on behalf of the hash table */ 1502 refcount_inc(&neigh->refcnt); 1503 neigh->alive = jiffies; 1504 /* put in hash */ 1505 rcu_assign_pointer(neigh->hnext, 1506 rcu_dereference_protected(htbl->buckets[hash_val], 1507 lockdep_is_held(&priv->lock))); 1508 rcu_assign_pointer(htbl->buckets[hash_val], neigh); 1509 atomic_inc(&ntbl->entries); 1510 1511 out_unlock: 1512 1513 return neigh; 1514 } 1515 1516 void ipoib_neigh_dtor(struct ipoib_neigh *neigh) 1517 { 1518 /* neigh reference count was dropprd to zero */ 1519 struct net_device *dev = neigh->dev; 1520 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1521 struct sk_buff *skb; 1522 if (neigh->ah) 1523 ipoib_put_ah(neigh->ah); 1524 while ((skb = __skb_dequeue(&neigh->queue))) { 1525 ++dev->stats.tx_dropped; 1526 dev_kfree_skb_any(skb); 1527 } 1528 if (ipoib_cm_get(neigh)) 1529 ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); 1530 ipoib_dbg(ipoib_priv(dev), 1531 "neigh free for %06x %pI6\n", 1532 IPOIB_QPN(neigh->daddr), 1533 neigh->daddr + 4); 1534 kfree(neigh); 1535 if (atomic_dec_and_test(&priv->ntbl.entries)) { 1536 if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags)) 1537 complete(&priv->ntbl.flushed); 1538 } 1539 } 1540 1541 static void ipoib_neigh_reclaim(struct rcu_head *rp) 1542 { 1543 /* Called as a result of removal from hash table */ 1544 struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu); 1545 /* note TX context may hold another ref */ 1546 ipoib_neigh_put(neigh); 1547 } 1548 1549 void ipoib_neigh_free(struct ipoib_neigh *neigh) 1550 { 1551 struct net_device *dev = neigh->dev; 1552 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1553 struct ipoib_neigh_table *ntbl = &priv->ntbl; 1554 struct ipoib_neigh_hash *htbl; 1555 struct ipoib_neigh __rcu **np; 1556 struct ipoib_neigh *n; 1557 u32 hash_val; 1558 1559 htbl = rcu_dereference_protected(ntbl->htbl, 1560 lockdep_is_held(&priv->lock)); 1561 if (!htbl) 1562 return; 1563 1564 hash_val = ipoib_addr_hash(htbl, neigh->daddr); 1565 np = &htbl->buckets[hash_val]; 1566 for (n = rcu_dereference_protected(*np, 1567 lockdep_is_held(&priv->lock)); 1568 n != NULL; 1569 n = rcu_dereference_protected(*np, 1570 lockdep_is_held(&priv->lock))) { 1571 if (n == neigh) { 1572 /* found */ 1573 rcu_assign_pointer(*np, 1574 rcu_dereference_protected(neigh->hnext, 1575 lockdep_is_held(&priv->lock))); 1576 /* remove from parent list */ 1577 list_del_init(&neigh->list); 1578 call_rcu(&neigh->rcu, ipoib_neigh_reclaim); 1579 return; 1580 } else { 1581 np = &n->hnext; 1582 } 1583 } 1584 } 1585 1586 static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) 1587 { 1588 struct ipoib_neigh_table *ntbl = &priv->ntbl; 1589 struct ipoib_neigh_hash *htbl; 1590 struct ipoib_neigh __rcu **buckets; 1591 u32 size; 1592 1593 clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); 1594 ntbl->htbl = NULL; 1595 htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); 1596 if (!htbl) 1597 return -ENOMEM; 1598 size = roundup_pow_of_two(arp_tbl.gc_thresh3); 1599 buckets = kvcalloc(size, sizeof(*buckets), GFP_KERNEL); 1600 if (!buckets) { 1601 kfree(htbl); 1602 return -ENOMEM; 1603 } 1604 htbl->size = size; 1605 htbl->mask = (size - 1); 1606 htbl->buckets = buckets; 1607 RCU_INIT_POINTER(ntbl->htbl, htbl); 1608 htbl->ntbl = ntbl; 1609 atomic_set(&ntbl->entries, 0); 1610 1611 /* start garbage collection */ 1612 queue_delayed_work(priv->wq, &priv->neigh_reap_task, 1613 arp_tbl.gc_interval); 1614 1615 return 0; 1616 } 1617 1618 static void neigh_hash_free_rcu(struct rcu_head *head) 1619 { 1620 struct ipoib_neigh_hash *htbl = container_of(head, 1621 struct ipoib_neigh_hash, 1622 rcu); 1623 struct ipoib_neigh __rcu **buckets = htbl->buckets; 1624 struct ipoib_neigh_table *ntbl = htbl->ntbl; 1625 1626 kvfree(buckets); 1627 kfree(htbl); 1628 complete(&ntbl->deleted); 1629 } 1630 1631 void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) 1632 { 1633 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1634 struct ipoib_neigh_table *ntbl = &priv->ntbl; 1635 struct ipoib_neigh_hash *htbl; 1636 unsigned long flags; 1637 int i; 1638 1639 /* remove all neigh connected to a given path or mcast */ 1640 spin_lock_irqsave(&priv->lock, flags); 1641 1642 htbl = rcu_dereference_protected(ntbl->htbl, 1643 lockdep_is_held(&priv->lock)); 1644 1645 if (!htbl) 1646 goto out_unlock; 1647 1648 for (i = 0; i < htbl->size; i++) { 1649 struct ipoib_neigh *neigh; 1650 struct ipoib_neigh __rcu **np = &htbl->buckets[i]; 1651 1652 while ((neigh = rcu_dereference_protected(*np, 1653 lockdep_is_held(&priv->lock))) != NULL) { 1654 /* delete neighs belong to this parent */ 1655 if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { 1656 rcu_assign_pointer(*np, 1657 rcu_dereference_protected(neigh->hnext, 1658 lockdep_is_held(&priv->lock))); 1659 /* remove from parent list */ 1660 list_del_init(&neigh->list); 1661 call_rcu(&neigh->rcu, ipoib_neigh_reclaim); 1662 } else { 1663 np = &neigh->hnext; 1664 } 1665 1666 } 1667 } 1668 out_unlock: 1669 spin_unlock_irqrestore(&priv->lock, flags); 1670 } 1671 1672 static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) 1673 { 1674 struct ipoib_neigh_table *ntbl = &priv->ntbl; 1675 struct ipoib_neigh_hash *htbl; 1676 unsigned long flags; 1677 int i, wait_flushed = 0; 1678 1679 init_completion(&priv->ntbl.flushed); 1680 set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); 1681 1682 spin_lock_irqsave(&priv->lock, flags); 1683 1684 htbl = rcu_dereference_protected(ntbl->htbl, 1685 lockdep_is_held(&priv->lock)); 1686 if (!htbl) 1687 goto out_unlock; 1688 1689 wait_flushed = atomic_read(&priv->ntbl.entries); 1690 if (!wait_flushed) 1691 goto free_htbl; 1692 1693 for (i = 0; i < htbl->size; i++) { 1694 struct ipoib_neigh *neigh; 1695 struct ipoib_neigh __rcu **np = &htbl->buckets[i]; 1696 1697 while ((neigh = rcu_dereference_protected(*np, 1698 lockdep_is_held(&priv->lock))) != NULL) { 1699 rcu_assign_pointer(*np, 1700 rcu_dereference_protected(neigh->hnext, 1701 lockdep_is_held(&priv->lock))); 1702 /* remove from path/mc list */ 1703 list_del_init(&neigh->list); 1704 call_rcu(&neigh->rcu, ipoib_neigh_reclaim); 1705 } 1706 } 1707 1708 free_htbl: 1709 rcu_assign_pointer(ntbl->htbl, NULL); 1710 call_rcu(&htbl->rcu, neigh_hash_free_rcu); 1711 1712 out_unlock: 1713 spin_unlock_irqrestore(&priv->lock, flags); 1714 if (wait_flushed) 1715 wait_for_completion(&priv->ntbl.flushed); 1716 } 1717 1718 static void ipoib_neigh_hash_uninit(struct net_device *dev) 1719 { 1720 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1721 1722 ipoib_dbg(priv, "%s\n", __func__); 1723 init_completion(&priv->ntbl.deleted); 1724 1725 cancel_delayed_work_sync(&priv->neigh_reap_task); 1726 1727 ipoib_flush_neighs(priv); 1728 1729 wait_for_completion(&priv->ntbl.deleted); 1730 } 1731 1732 static void ipoib_napi_add(struct net_device *dev) 1733 { 1734 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1735 1736 netif_napi_add_weight(dev, &priv->recv_napi, ipoib_rx_poll, 1737 IPOIB_NUM_WC); 1738 netif_napi_add_weight(dev, &priv->send_napi, ipoib_tx_poll, 1739 MAX_SEND_CQE); 1740 } 1741 1742 static void ipoib_napi_del(struct net_device *dev) 1743 { 1744 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1745 1746 netif_napi_del(&priv->recv_napi); 1747 netif_napi_del(&priv->send_napi); 1748 } 1749 1750 static void ipoib_dev_uninit_default(struct net_device *dev) 1751 { 1752 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1753 1754 ipoib_transport_dev_cleanup(dev); 1755 1756 ipoib_napi_del(dev); 1757 1758 ipoib_cm_dev_cleanup(dev); 1759 1760 kfree(priv->rx_ring); 1761 vfree(priv->tx_ring); 1762 1763 priv->rx_ring = NULL; 1764 priv->tx_ring = NULL; 1765 } 1766 1767 static int ipoib_dev_init_default(struct net_device *dev) 1768 { 1769 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1770 u8 addr_mod[3]; 1771 1772 ipoib_napi_add(dev); 1773 1774 /* Allocate RX/TX "rings" to hold queued skbs */ 1775 priv->rx_ring = kcalloc(ipoib_recvq_size, 1776 sizeof(*priv->rx_ring), 1777 GFP_KERNEL); 1778 if (!priv->rx_ring) 1779 goto out; 1780 1781 priv->tx_ring = vzalloc(array_size(ipoib_sendq_size, 1782 sizeof(*priv->tx_ring))); 1783 if (!priv->tx_ring) { 1784 pr_warn("%s: failed to allocate TX ring (%d entries)\n", 1785 priv->ca->name, ipoib_sendq_size); 1786 goto out_rx_ring_cleanup; 1787 } 1788 1789 /* priv->tx_head, tx_tail and global_tx_tail/head are already 0 */ 1790 1791 if (ipoib_transport_dev_init(dev, priv->ca)) { 1792 pr_warn("%s: ipoib_transport_dev_init failed\n", 1793 priv->ca->name); 1794 goto out_tx_ring_cleanup; 1795 } 1796 1797 /* after qp created set dev address */ 1798 addr_mod[0] = (priv->qp->qp_num >> 16) & 0xff; 1799 addr_mod[1] = (priv->qp->qp_num >> 8) & 0xff; 1800 addr_mod[2] = (priv->qp->qp_num) & 0xff; 1801 dev_addr_mod(priv->dev, 1, addr_mod, sizeof(addr_mod)); 1802 1803 return 0; 1804 1805 out_tx_ring_cleanup: 1806 vfree(priv->tx_ring); 1807 1808 out_rx_ring_cleanup: 1809 kfree(priv->rx_ring); 1810 1811 out: 1812 ipoib_napi_del(dev); 1813 return -ENOMEM; 1814 } 1815 1816 static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, 1817 int cmd) 1818 { 1819 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1820 1821 if (!priv->rn_ops->ndo_eth_ioctl) 1822 return -EOPNOTSUPP; 1823 1824 return priv->rn_ops->ndo_eth_ioctl(dev, ifr, cmd); 1825 } 1826 1827 static int ipoib_dev_init(struct net_device *dev) 1828 { 1829 struct ipoib_dev_priv *priv = ipoib_priv(dev); 1830 int ret = -ENOMEM; 1831 1832 priv->qp = NULL; 1833 1834 /* 1835 * the various IPoIB tasks assume they will never race against 1836 * themselves, so always use a single thread workqueue 1837 */ 1838 priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM); 1839 if (!priv->wq) { 1840 pr_warn("%s: failed to allocate device WQ\n", dev->name); 1841 goto out; 1842 } 1843 1844 /* create pd, which used both for control and datapath*/ 1845 priv->pd = ib_alloc_pd(priv->ca, 0); 1846 if (IS_ERR(priv->pd)) { 1847 pr_warn("%s: failed to allocate PD\n", priv->ca->name); 1848 goto clean_wq; 1849 } 1850 1851 ret = priv->rn_ops->ndo_init(dev); 1852 if (ret) { 1853 pr_warn("%s failed to init HW resource\n", dev->name); 1854 goto out_free_pd; 1855 } 1856 1857 ret = ipoib_neigh_hash_init(priv); 1858 if (ret) { 1859 pr_warn("%s failed to init neigh hash\n", dev->name); 1860 goto out_dev_uninit; 1861 } 1862 1863 if (dev->flags & IFF_UP) { 1864 if (ipoib_ib_dev_open(dev)) { 1865 pr_warn("%s failed to open device\n", dev->name); 1866 ret = -ENODEV; 1867 goto out_hash_uninit; 1868 } 1869 } 1870 1871 return 0; 1872 1873 out_hash_uninit: 1874 ipoib_neigh_hash_uninit(dev); 1875 1876 out_dev_uninit: 1877 ipoib_ib_dev_cleanup(dev); 1878 1879 out_free_pd: 1880 if (priv->pd) { 1881 ib_dealloc_pd(priv->pd); 1882 priv->pd = NULL; 1883 } 1884 1885 clean_wq: 1886 if (priv->wq) { 1887 destroy_workqueue(priv->wq); 1888 priv->wq = NULL; 1889 } 1890 1891 out: 1892 return ret; 1893 } 1894 1895 /* 1896 * This must be called before doing an unregister_netdev on a parent device to 1897 * shutdown the IB event handler. 1898 */ 1899 static void ipoib_parent_unregister_pre(struct net_device *ndev) 1900 { 1901 struct ipoib_dev_priv *priv = ipoib_priv(ndev); 1902 1903 /* 1904 * ipoib_set_mac checks netif_running before pushing work, clearing 1905 * running ensures the it will not add more work. 1906 */ 1907 rtnl_lock(); 1908 dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL); 1909 rtnl_unlock(); 1910 1911 /* ipoib_event() cannot be running once this returns */ 1912 ib_unregister_event_handler(&priv->event_handler); 1913 1914 /* 1915 * Work on the queue grabs the rtnl lock, so this cannot be done while 1916 * also holding it. 1917 */ 1918 flush_workqueue(ipoib_workqueue); 1919 } 1920 1921 static void ipoib_set_dev_features(struct ipoib_dev_priv *priv) 1922 { 1923 priv->hca_caps = priv->ca->attrs.device_cap_flags; 1924 priv->kernel_caps = priv->ca->attrs.kernel_cap_flags; 1925 1926 if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { 1927 priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; 1928 1929 if (priv->kernel_caps & IBK_UD_TSO) 1930 priv->dev->hw_features |= NETIF_F_TSO; 1931 1932 priv->dev->features |= priv->dev->hw_features; 1933 } 1934 } 1935 1936 static int ipoib_parent_init(struct net_device *ndev) 1937 { 1938 struct ipoib_dev_priv *priv = ipoib_priv(ndev); 1939 struct ib_port_attr attr; 1940 int result; 1941 1942 result = ib_query_port(priv->ca, priv->port, &attr); 1943 if (result) { 1944 pr_warn("%s: ib_query_port %d failed\n", priv->ca->name, 1945 priv->port); 1946 return result; 1947 } 1948 priv->max_ib_mtu = rdma_mtu_from_attr(priv->ca, priv->port, &attr); 1949 1950 result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); 1951 if (result) { 1952 pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n", 1953 priv->ca->name, priv->port, result); 1954 return result; 1955 } 1956 1957 result = rdma_query_gid(priv->ca, priv->port, 0, &priv->local_gid); 1958 if (result) { 1959 pr_warn("%s: rdma_query_gid port %d failed (ret = %d)\n", 1960 priv->ca->name, priv->port, result); 1961 return result; 1962 } 1963 dev_addr_mod(priv->dev, 4, priv->local_gid.raw, sizeof(union ib_gid)); 1964 1965 SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent); 1966 priv->dev->dev_port = priv->port - 1; 1967 /* Let's set this one too for backwards compatibility. */ 1968 priv->dev->dev_id = priv->port - 1; 1969 1970 return 0; 1971 } 1972 1973 static void ipoib_child_init(struct net_device *ndev) 1974 { 1975 struct ipoib_dev_priv *priv = ipoib_priv(ndev); 1976 struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); 1977 1978 priv->max_ib_mtu = ppriv->max_ib_mtu; 1979 set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); 1980 if (memchr_inv(priv->dev->dev_addr, 0, INFINIBAND_ALEN)) 1981 memcpy(&priv->local_gid, priv->dev->dev_addr + 4, 1982 sizeof(priv->local_gid)); 1983 else { 1984 __dev_addr_set(priv->dev, ppriv->dev->dev_addr, 1985 INFINIBAND_ALEN); 1986 memcpy(&priv->local_gid, &ppriv->local_gid, 1987 sizeof(priv->local_gid)); 1988 } 1989 } 1990 1991 static int ipoib_ndo_init(struct net_device *ndev) 1992 { 1993 struct ipoib_dev_priv *priv = ipoib_priv(ndev); 1994 int rc; 1995 struct rdma_netdev *rn = netdev_priv(ndev); 1996 1997 if (priv->parent) { 1998 ipoib_child_init(ndev); 1999 } else { 2000 rc = ipoib_parent_init(ndev); 2001 if (rc) 2002 return rc; 2003 } 2004 2005 /* MTU will be reset when mcast join happens */ 2006 ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); 2007 priv->mcast_mtu = priv->admin_mtu = ndev->mtu; 2008 rn->mtu = priv->mcast_mtu; 2009 ndev->max_mtu = IPOIB_CM_MTU; 2010 2011 ndev->neigh_priv_len = sizeof(struct ipoib_neigh); 2012 2013 /* 2014 * Set the full membership bit, so that we join the right 2015 * broadcast group, etc. 2016 */ 2017 priv->pkey |= 0x8000; 2018 2019 ndev->broadcast[8] = priv->pkey >> 8; 2020 ndev->broadcast[9] = priv->pkey & 0xff; 2021 set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); 2022 2023 ipoib_set_dev_features(priv); 2024 2025 rc = ipoib_dev_init(ndev); 2026 if (rc) { 2027 pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n", 2028 priv->ca->name, priv->dev->name, priv->port, rc); 2029 return rc; 2030 } 2031 2032 if (priv->parent) { 2033 struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); 2034 2035 dev_hold(priv->parent); 2036 2037 netdev_lock(priv->parent); 2038 list_add_tail(&priv->list, &ppriv->child_intfs); 2039 netdev_unlock(priv->parent); 2040 } 2041 2042 return 0; 2043 } 2044 2045 static void ipoib_ndo_uninit(struct net_device *dev) 2046 { 2047 struct ipoib_dev_priv *priv = ipoib_priv(dev); 2048 2049 /* 2050 * ipoib_remove_one guarantees the children are removed before the 2051 * parent, and that is the only place where a parent can be removed. 2052 */ 2053 WARN_ON(!list_empty(&priv->child_intfs)); 2054 2055 if (priv->parent) { 2056 struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); 2057 2058 netdev_lock(ppriv->dev); 2059 list_del(&priv->list); 2060 netdev_unlock(ppriv->dev); 2061 } 2062 2063 ipoib_neigh_hash_uninit(dev); 2064 2065 ipoib_ib_dev_cleanup(dev); 2066 2067 /* no more works over the priv->wq */ 2068 if (priv->wq) { 2069 /* See ipoib_mcast_carrier_on_task() */ 2070 WARN_ON(test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)); 2071 destroy_workqueue(priv->wq); 2072 priv->wq = NULL; 2073 } 2074 2075 dev_put(priv->parent); 2076 } 2077 2078 static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state) 2079 { 2080 struct ipoib_dev_priv *priv = ipoib_priv(dev); 2081 2082 return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state); 2083 } 2084 2085 static int ipoib_get_vf_config(struct net_device *dev, int vf, 2086 struct ifla_vf_info *ivf) 2087 { 2088 struct ipoib_dev_priv *priv = ipoib_priv(dev); 2089 int err; 2090 2091 err = ib_get_vf_config(priv->ca, vf, priv->port, ivf); 2092 if (err) 2093 return err; 2094 2095 ivf->vf = vf; 2096 memcpy(ivf->mac, dev->dev_addr, dev->addr_len); 2097 2098 return 0; 2099 } 2100 2101 static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type) 2102 { 2103 struct ipoib_dev_priv *priv = ipoib_priv(dev); 2104 2105 if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID) 2106 return -EINVAL; 2107 2108 return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type); 2109 } 2110 2111 static int ipoib_get_vf_guid(struct net_device *dev, int vf, 2112 struct ifla_vf_guid *node_guid, 2113 struct ifla_vf_guid *port_guid) 2114 { 2115 struct ipoib_dev_priv *priv = ipoib_priv(dev); 2116 2117 return ib_get_vf_guid(priv->ca, vf, priv->port, node_guid, port_guid); 2118 } 2119 2120 static int ipoib_get_vf_stats(struct net_device *dev, int vf, 2121 struct ifla_vf_stats *vf_stats) 2122 { 2123 struct ipoib_dev_priv *priv = ipoib_priv(dev); 2124 2125 return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats); 2126 } 2127 2128 static const struct header_ops ipoib_header_ops = { 2129 .create = ipoib_hard_header, 2130 }; 2131 2132 static const struct net_device_ops ipoib_netdev_ops_pf = { 2133 .ndo_init = ipoib_ndo_init, 2134 .ndo_uninit = ipoib_ndo_uninit, 2135 .ndo_open = ipoib_open, 2136 .ndo_stop = ipoib_stop, 2137 .ndo_change_mtu = ipoib_change_mtu, 2138 .ndo_fix_features = ipoib_fix_features, 2139 .ndo_start_xmit = ipoib_start_xmit, 2140 .ndo_tx_timeout = ipoib_timeout, 2141 .ndo_set_rx_mode = ipoib_set_mcast_list, 2142 .ndo_get_iflink = ipoib_get_iflink, 2143 .ndo_set_vf_link_state = ipoib_set_vf_link_state, 2144 .ndo_get_vf_config = ipoib_get_vf_config, 2145 .ndo_get_vf_stats = ipoib_get_vf_stats, 2146 .ndo_get_vf_guid = ipoib_get_vf_guid, 2147 .ndo_set_vf_guid = ipoib_set_vf_guid, 2148 .ndo_set_mac_address = ipoib_set_mac, 2149 .ndo_get_stats64 = ipoib_get_stats, 2150 .ndo_eth_ioctl = ipoib_ioctl, 2151 }; 2152 2153 static const struct net_device_ops ipoib_netdev_ops_vf = { 2154 .ndo_init = ipoib_ndo_init, 2155 .ndo_uninit = ipoib_ndo_uninit, 2156 .ndo_open = ipoib_open, 2157 .ndo_stop = ipoib_stop, 2158 .ndo_change_mtu = ipoib_change_mtu, 2159 .ndo_fix_features = ipoib_fix_features, 2160 .ndo_start_xmit = ipoib_start_xmit, 2161 .ndo_tx_timeout = ipoib_timeout, 2162 .ndo_set_rx_mode = ipoib_set_mcast_list, 2163 .ndo_get_iflink = ipoib_get_iflink, 2164 .ndo_get_stats64 = ipoib_get_stats, 2165 .ndo_eth_ioctl = ipoib_ioctl, 2166 }; 2167 2168 static const struct net_device_ops ipoib_netdev_default_pf = { 2169 .ndo_init = ipoib_dev_init_default, 2170 .ndo_uninit = ipoib_dev_uninit_default, 2171 .ndo_open = ipoib_ib_dev_open_default, 2172 .ndo_stop = ipoib_ib_dev_stop_default, 2173 }; 2174 2175 void ipoib_setup_common(struct net_device *dev) 2176 { 2177 dev->header_ops = &ipoib_header_ops; 2178 dev->netdev_ops = &ipoib_netdev_default_pf; 2179 2180 ipoib_set_ethtool_ops(dev); 2181 2182 dev->watchdog_timeo = 10 * HZ; 2183 2184 dev->flags |= IFF_BROADCAST | IFF_MULTICAST; 2185 2186 dev->hard_header_len = IPOIB_HARD_LEN; 2187 dev->addr_len = INFINIBAND_ALEN; 2188 dev->type = ARPHRD_INFINIBAND; 2189 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; 2190 dev->features = (NETIF_F_VLAN_CHALLENGED | 2191 NETIF_F_HIGHDMA); 2192 netif_keep_dst(dev); 2193 2194 memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); 2195 2196 /* 2197 * unregister_netdev always frees the netdev, we use this mode 2198 * consistently to unify all the various unregister paths, including 2199 * those connected to rtnl_link_ops which require it. 2200 */ 2201 dev->needs_free_netdev = true; 2202 } 2203 2204 static void ipoib_build_priv(struct net_device *dev) 2205 { 2206 struct ipoib_dev_priv *priv = ipoib_priv(dev); 2207 2208 priv->dev = dev; 2209 spin_lock_init(&priv->lock); 2210 mutex_init(&priv->mcast_mutex); 2211 2212 INIT_LIST_HEAD(&priv->path_list); 2213 INIT_LIST_HEAD(&priv->child_intfs); 2214 INIT_LIST_HEAD(&priv->dead_ahs); 2215 INIT_LIST_HEAD(&priv->multicast_list); 2216 2217 INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); 2218 INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); 2219 INIT_WORK(&priv->reschedule_napi_work, ipoib_napi_schedule_work); 2220 INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); 2221 INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); 2222 INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); 2223 INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); 2224 INIT_WORK(&priv->tx_timeout_work, ipoib_ib_tx_timeout_work); 2225 INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); 2226 INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); 2227 } 2228 2229 static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u32 port, 2230 const char *name) 2231 { 2232 struct net_device *dev; 2233 2234 dev = rdma_alloc_netdev(hca, port, RDMA_NETDEV_IPOIB, name, 2235 NET_NAME_UNKNOWN, ipoib_setup_common); 2236 if (!IS_ERR(dev) || PTR_ERR(dev) != -EOPNOTSUPP) 2237 return dev; 2238 2239 dev = alloc_netdev(sizeof(struct rdma_netdev), name, NET_NAME_UNKNOWN, 2240 ipoib_setup_common); 2241 if (!dev) 2242 return ERR_PTR(-ENOMEM); 2243 return dev; 2244 } 2245 2246 int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name, 2247 struct net_device *dev) 2248 { 2249 struct rdma_netdev *rn = netdev_priv(dev); 2250 struct ipoib_dev_priv *priv; 2251 int rc; 2252 2253 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 2254 if (!priv) 2255 return -ENOMEM; 2256 2257 priv->ca = hca; 2258 priv->port = port; 2259 2260 rc = rdma_init_netdev(hca, port, RDMA_NETDEV_IPOIB, name, 2261 NET_NAME_UNKNOWN, ipoib_setup_common, dev); 2262 if (rc) { 2263 if (rc != -EOPNOTSUPP) 2264 goto out; 2265 2266 rn->send = ipoib_send; 2267 rn->attach_mcast = ipoib_mcast_attach; 2268 rn->detach_mcast = ipoib_mcast_detach; 2269 rn->hca = hca; 2270 2271 rc = netif_set_real_num_tx_queues(dev, 1); 2272 if (rc) 2273 goto out; 2274 2275 rc = netif_set_real_num_rx_queues(dev, 1); 2276 if (rc) 2277 goto out; 2278 } 2279 2280 priv->rn_ops = dev->netdev_ops; 2281 2282 if (hca->attrs.kernel_cap_flags & IBK_VIRTUAL_FUNCTION) 2283 dev->netdev_ops = &ipoib_netdev_ops_vf; 2284 else 2285 dev->netdev_ops = &ipoib_netdev_ops_pf; 2286 2287 rn->clnt_priv = priv; 2288 /* 2289 * Only the child register_netdev flows can handle priv_destructor 2290 * being set, so we force it to NULL here and handle manually until it 2291 * is safe to turn on. 2292 */ 2293 priv->next_priv_destructor = dev->priv_destructor; 2294 dev->priv_destructor = NULL; 2295 2296 ipoib_build_priv(dev); 2297 2298 return 0; 2299 2300 out: 2301 kfree(priv); 2302 return rc; 2303 } 2304 2305 struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port, 2306 const char *name) 2307 { 2308 struct net_device *dev; 2309 int rc; 2310 2311 dev = ipoib_alloc_netdev(hca, port, name); 2312 if (IS_ERR(dev)) 2313 return dev; 2314 2315 rc = ipoib_intf_init(hca, port, name, dev); 2316 if (rc) { 2317 free_netdev(dev); 2318 return ERR_PTR(rc); 2319 } 2320 2321 /* 2322 * Upon success the caller must ensure ipoib_intf_free is called or 2323 * register_netdevice succeed'd and priv_destructor is set to 2324 * ipoib_intf_free. 2325 */ 2326 return dev; 2327 } 2328 2329 void ipoib_intf_free(struct net_device *dev) 2330 { 2331 struct ipoib_dev_priv *priv = ipoib_priv(dev); 2332 struct rdma_netdev *rn = netdev_priv(dev); 2333 2334 dev->priv_destructor = priv->next_priv_destructor; 2335 if (dev->priv_destructor) 2336 dev->priv_destructor(dev); 2337 2338 /* 2339 * There are some error flows around register_netdev failing that may 2340 * attempt to call priv_destructor twice, prevent that from happening. 2341 */ 2342 dev->priv_destructor = NULL; 2343 2344 /* unregister/destroy is very complicated. Make bugs more obvious. */ 2345 rn->clnt_priv = NULL; 2346 2347 kfree(priv); 2348 } 2349 2350 static ssize_t pkey_show(struct device *dev, struct device_attribute *attr, 2351 char *buf) 2352 { 2353 struct net_device *ndev = to_net_dev(dev); 2354 struct ipoib_dev_priv *priv = ipoib_priv(ndev); 2355 2356 return sysfs_emit(buf, "0x%04x\n", priv->pkey); 2357 } 2358 static DEVICE_ATTR_RO(pkey); 2359 2360 static ssize_t umcast_show(struct device *dev, struct device_attribute *attr, 2361 char *buf) 2362 { 2363 struct net_device *ndev = to_net_dev(dev); 2364 struct ipoib_dev_priv *priv = ipoib_priv(ndev); 2365 2366 return sysfs_emit(buf, "%d\n", 2367 test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); 2368 } 2369 2370 void ipoib_set_umcast(struct net_device *ndev, int umcast_val) 2371 { 2372 struct ipoib_dev_priv *priv = ipoib_priv(ndev); 2373 2374 if (umcast_val > 0) { 2375 set_bit(IPOIB_FLAG_UMCAST, &priv->flags); 2376 ipoib_warn(priv, "ignoring multicast groups joined directly " 2377 "by userspace\n"); 2378 } else 2379 clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); 2380 } 2381 2382 static ssize_t umcast_store(struct device *dev, struct device_attribute *attr, 2383 const char *buf, size_t count) 2384 { 2385 unsigned long umcast_val = simple_strtoul(buf, NULL, 0); 2386 2387 ipoib_set_umcast(to_net_dev(dev), umcast_val); 2388 2389 return count; 2390 } 2391 static DEVICE_ATTR_RW(umcast); 2392 2393 int ipoib_add_umcast_attr(struct net_device *dev) 2394 { 2395 return device_create_file(&dev->dev, &dev_attr_umcast); 2396 } 2397 2398 static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid) 2399 { 2400 struct ipoib_dev_priv *child_priv; 2401 struct net_device *netdev = priv->dev; 2402 2403 netif_addr_lock_bh(netdev); 2404 2405 memcpy(&priv->local_gid.global.interface_id, 2406 &gid->global.interface_id, 2407 sizeof(gid->global.interface_id)); 2408 dev_addr_mod(netdev, 4, (u8 *)&priv->local_gid, sizeof(priv->local_gid)); 2409 clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); 2410 2411 netif_addr_unlock_bh(netdev); 2412 2413 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { 2414 netdev_lock_ops_to_full(priv->dev); 2415 list_for_each_entry(child_priv, &priv->child_intfs, list) 2416 set_base_guid(child_priv, gid); 2417 netdev_unlock_full_to_ops(priv->dev); 2418 } 2419 } 2420 2421 static int ipoib_check_lladdr(struct net_device *dev, 2422 struct sockaddr_storage *ss) 2423 { 2424 union ib_gid *gid = (union ib_gid *)(ss->__data + 4); 2425 int ret = 0; 2426 2427 netif_addr_lock_bh(dev); 2428 2429 /* Make sure the QPN, reserved and subnet prefix match the current 2430 * lladdr, it also makes sure the lladdr is unicast. 2431 */ 2432 if (memcmp(dev->dev_addr, ss->__data, 2433 4 + sizeof(gid->global.subnet_prefix)) || 2434 gid->global.interface_id == 0) 2435 ret = -EINVAL; 2436 2437 netif_addr_unlock_bh(dev); 2438 2439 return ret; 2440 } 2441 2442 static int ipoib_set_mac(struct net_device *dev, void *addr) 2443 { 2444 struct ipoib_dev_priv *priv = ipoib_priv(dev); 2445 struct sockaddr_storage *ss = addr; 2446 int ret; 2447 2448 if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) 2449 return -EBUSY; 2450 2451 ret = ipoib_check_lladdr(dev, ss); 2452 if (ret) 2453 return ret; 2454 2455 set_base_guid(priv, (union ib_gid *)(ss->__data + 4)); 2456 2457 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { 2458 struct ipoib_dev_priv *cpriv; 2459 2460 netdev_lock_ops_to_full(dev); 2461 list_for_each_entry(cpriv, &priv->child_intfs, list) 2462 queue_work(ipoib_workqueue, &cpriv->flush_light); 2463 netdev_unlock_full_to_ops(dev); 2464 } 2465 queue_work(ipoib_workqueue, &priv->flush_light); 2466 2467 return 0; 2468 } 2469 2470 static ssize_t create_child_store(struct device *dev, 2471 struct device_attribute *attr, 2472 const char *buf, size_t count) 2473 { 2474 int pkey; 2475 int ret; 2476 2477 if (sscanf(buf, "%i", &pkey) != 1) 2478 return -EINVAL; 2479 2480 if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000) 2481 return -EINVAL; 2482 2483 ret = ipoib_vlan_add(to_net_dev(dev), pkey); 2484 2485 return ret ? ret : count; 2486 } 2487 static DEVICE_ATTR_WO(create_child); 2488 2489 static ssize_t delete_child_store(struct device *dev, 2490 struct device_attribute *attr, 2491 const char *buf, size_t count) 2492 { 2493 int pkey; 2494 int ret; 2495 2496 if (sscanf(buf, "%i", &pkey) != 1) 2497 return -EINVAL; 2498 2499 if (pkey < 0 || pkey > 0xffff) 2500 return -EINVAL; 2501 2502 ret = ipoib_vlan_delete(to_net_dev(dev), pkey); 2503 2504 return ret ? ret : count; 2505 2506 } 2507 static DEVICE_ATTR_WO(delete_child); 2508 2509 int ipoib_add_pkey_attr(struct net_device *dev) 2510 { 2511 return device_create_file(&dev->dev, &dev_attr_pkey); 2512 } 2513 2514 /* 2515 * We erroneously exposed the iface's port number in the dev_id 2516 * sysfs field long after dev_port was introduced for that purpose[1], 2517 * and we need to stop everyone from relying on that. 2518 * Let's overload the shower routine for the dev_id file here 2519 * to gently bring the issue up. 2520 * 2521 * [1] https://www.spinics.net/lists/netdev/msg272123.html 2522 */ 2523 static ssize_t dev_id_show(struct device *dev, 2524 struct device_attribute *attr, char *buf) 2525 { 2526 struct net_device *ndev = to_net_dev(dev); 2527 2528 /* 2529 * ndev->dev_port will be equal to 0 in old kernel prior to commit 2530 * 9b8b2a323008 ("IB/ipoib: Use dev_port to expose network interface 2531 * port numbers") Zero was chosen as special case for user space 2532 * applications to fallback and query dev_id to check if it has 2533 * different value or not. 2534 * 2535 * Don't print warning in such scenario. 2536 * 2537 * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L358 2538 */ 2539 if (ndev->dev_port && ndev->dev_id == ndev->dev_port) 2540 netdev_info_once(ndev, 2541 "\"%s\" wants to know my dev_id. Should it look at dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more info.\n", 2542 current->comm); 2543 2544 return sysfs_emit(buf, "%#x\n", ndev->dev_id); 2545 } 2546 static DEVICE_ATTR_RO(dev_id); 2547 2548 static int ipoib_intercept_dev_id_attr(struct net_device *dev) 2549 { 2550 device_remove_file(&dev->dev, &dev_attr_dev_id); 2551 return device_create_file(&dev->dev, &dev_attr_dev_id); 2552 } 2553 2554 static struct net_device *ipoib_add_port(const char *format, 2555 struct ib_device *hca, u32 port) 2556 { 2557 struct rtnl_link_ops *ops = ipoib_get_link_ops(); 2558 struct rdma_netdev_alloc_params params; 2559 struct ipoib_dev_priv *priv; 2560 struct net_device *ndev; 2561 int result; 2562 2563 ndev = ipoib_intf_alloc(hca, port, format); 2564 if (IS_ERR(ndev)) { 2565 pr_warn("%s, %d: ipoib_intf_alloc failed %ld\n", hca->name, port, 2566 PTR_ERR(ndev)); 2567 return ndev; 2568 } 2569 priv = ipoib_priv(ndev); 2570 2571 INIT_IB_EVENT_HANDLER(&priv->event_handler, 2572 priv->ca, ipoib_event); 2573 ib_register_event_handler(&priv->event_handler); 2574 2575 /* call event handler to ensure pkey in sync */ 2576 ipoib_queue_work(priv, IPOIB_FLUSH_HEAVY); 2577 2578 ndev->rtnl_link_ops = ipoib_get_link_ops(); 2579 2580 result = register_netdev(ndev); 2581 if (result) { 2582 pr_warn("%s: couldn't register ipoib port %d; error %d\n", 2583 hca->name, port, result); 2584 2585 ipoib_parent_unregister_pre(ndev); 2586 ipoib_intf_free(ndev); 2587 free_netdev(ndev); 2588 2589 return ERR_PTR(result); 2590 } 2591 2592 if (hca->ops.rdma_netdev_get_params) { 2593 int rc = hca->ops.rdma_netdev_get_params(hca, port, 2594 RDMA_NETDEV_IPOIB, 2595 ¶ms); 2596 2597 if (!rc && ops->priv_size < params.sizeof_priv) 2598 ops->priv_size = params.sizeof_priv; 2599 } 2600 /* 2601 * We cannot set priv_destructor before register_netdev because we 2602 * need priv to be always valid during the error flow to execute 2603 * ipoib_parent_unregister_pre(). Instead handle it manually and only 2604 * enter priv_destructor mode once we are completely registered. 2605 */ 2606 ndev->priv_destructor = ipoib_intf_free; 2607 2608 if (ipoib_intercept_dev_id_attr(ndev)) 2609 goto sysfs_failed; 2610 if (ipoib_cm_add_mode_attr(ndev)) 2611 goto sysfs_failed; 2612 if (ipoib_add_pkey_attr(ndev)) 2613 goto sysfs_failed; 2614 if (ipoib_add_umcast_attr(ndev)) 2615 goto sysfs_failed; 2616 if (device_create_file(&ndev->dev, &dev_attr_create_child)) 2617 goto sysfs_failed; 2618 if (device_create_file(&ndev->dev, &dev_attr_delete_child)) 2619 goto sysfs_failed; 2620 2621 return ndev; 2622 2623 sysfs_failed: 2624 ipoib_parent_unregister_pre(ndev); 2625 unregister_netdev(ndev); 2626 return ERR_PTR(-ENOMEM); 2627 } 2628 2629 static int ipoib_add_one(struct ib_device *device) 2630 { 2631 struct list_head *dev_list; 2632 struct net_device *dev; 2633 struct ipoib_dev_priv *priv; 2634 unsigned int p; 2635 int count = 0; 2636 2637 dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL); 2638 if (!dev_list) 2639 return -ENOMEM; 2640 2641 INIT_LIST_HEAD(dev_list); 2642 2643 rdma_for_each_port (device, p) { 2644 if (!rdma_protocol_ib(device, p)) 2645 continue; 2646 dev = ipoib_add_port("ib%d", device, p); 2647 if (!IS_ERR(dev)) { 2648 priv = ipoib_priv(dev); 2649 list_add_tail(&priv->list, dev_list); 2650 count++; 2651 } 2652 } 2653 2654 if (!count) { 2655 kfree(dev_list); 2656 return -EOPNOTSUPP; 2657 } 2658 2659 ib_set_client_data(device, &ipoib_client, dev_list); 2660 return 0; 2661 } 2662 2663 static void ipoib_remove_one(struct ib_device *device, void *client_data) 2664 { 2665 struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv; 2666 struct list_head *dev_list = client_data; 2667 2668 list_for_each_entry_safe(priv, tmp, dev_list, list) { 2669 LIST_HEAD(head); 2670 ipoib_parent_unregister_pre(priv->dev); 2671 2672 rtnl_lock(); 2673 2674 netdev_lock(priv->dev); 2675 list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, 2676 list) 2677 unregister_netdevice_queue(cpriv->dev, &head); 2678 netdev_unlock(priv->dev); 2679 unregister_netdevice_queue(priv->dev, &head); 2680 unregister_netdevice_many(&head); 2681 2682 rtnl_unlock(); 2683 } 2684 2685 kfree(dev_list); 2686 } 2687 2688 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 2689 static struct notifier_block ipoib_netdev_notifier = { 2690 .notifier_call = ipoib_netdev_event, 2691 }; 2692 #endif 2693 2694 static int __init ipoib_init_module(void) 2695 { 2696 int ret; 2697 2698 ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); 2699 ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); 2700 ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); 2701 2702 ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); 2703 ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); 2704 ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); 2705 #ifdef CONFIG_INFINIBAND_IPOIB_CM 2706 ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); 2707 ipoib_max_conn_qp = max(ipoib_max_conn_qp, 0); 2708 #endif 2709 2710 /* 2711 * When copying small received packets, we only copy from the 2712 * linear data part of the SKB, so we rely on this condition. 2713 */ 2714 BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE); 2715 2716 ipoib_register_debugfs(); 2717 2718 /* 2719 * We create a global workqueue here that is used for all flush 2720 * operations. However, if you attempt to flush a workqueue 2721 * from a task on that same workqueue, it deadlocks the system. 2722 * We want to be able to flush the tasks associated with a 2723 * specific net device, so we also create a workqueue for each 2724 * netdevice. We queue up the tasks for that device only on 2725 * its private workqueue, and we only queue up flush events 2726 * on our global flush workqueue. This avoids the deadlocks. 2727 */ 2728 ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", 0); 2729 if (!ipoib_workqueue) { 2730 ret = -ENOMEM; 2731 goto err_fs; 2732 } 2733 2734 ib_sa_register_client(&ipoib_sa_client); 2735 2736 ret = ib_register_client(&ipoib_client); 2737 if (ret) 2738 goto err_sa; 2739 2740 ret = ipoib_netlink_init(); 2741 if (ret) 2742 goto err_client; 2743 2744 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 2745 register_netdevice_notifier(&ipoib_netdev_notifier); 2746 #endif 2747 return 0; 2748 2749 err_client: 2750 ib_unregister_client(&ipoib_client); 2751 2752 err_sa: 2753 ib_sa_unregister_client(&ipoib_sa_client); 2754 destroy_workqueue(ipoib_workqueue); 2755 2756 err_fs: 2757 ipoib_unregister_debugfs(); 2758 2759 return ret; 2760 } 2761 2762 static void __exit ipoib_cleanup_module(void) 2763 { 2764 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG 2765 unregister_netdevice_notifier(&ipoib_netdev_notifier); 2766 #endif 2767 ipoib_netlink_fini(); 2768 ib_unregister_client(&ipoib_client); 2769 ib_sa_unregister_client(&ipoib_sa_client); 2770 ipoib_unregister_debugfs(); 2771 destroy_workqueue(ipoib_workqueue); 2772 } 2773 2774 module_init(ipoib_init_module); 2775 module_exit(ipoib_cleanup_module); 2776