1 // SPDX-License-Identifier: GPL-2.0-only 2 3 /* PIPAPO: PIle PAcket POlicies: set for arbitrary concatenations of ranges 4 * 5 * Copyright (c) 2019-2020 Red Hat GmbH 6 * 7 * Author: Stefano Brivio <sbrivio@redhat.com> 8 */ 9 10 /** 11 * DOC: Theory of Operation 12 * 13 * 14 * Problem 15 * ------- 16 * 17 * Match packet bytes against entries composed of ranged or non-ranged packet 18 * field specifiers, mapping them to arbitrary references. For example: 19 * 20 * :: 21 * 22 * --- fields ---> 23 * | [net],[port],[net]... => [reference] 24 * entries [net],[port],[net]... => [reference] 25 * | [net],[port],[net]... => [reference] 26 * V ... 27 * 28 * where [net] fields can be IP ranges or netmasks, and [port] fields are port 29 * ranges. Arbitrary packet fields can be matched. 30 * 31 * 32 * Algorithm Overview 33 * ------------------ 34 * 35 * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally 36 * relies on the consideration that every contiguous range in a space of b bits 37 * can be converted into b * 2 netmasks, from Theorem 3 in [Rottenstreich 2010], 38 * as also illustrated in Section 9 of [Kogan 2014]. 39 * 40 * Classification against a number of entries, that require matching given bits 41 * of a packet field, is performed by grouping those bits in sets of arbitrary 42 * size, and classifying packet bits one group at a time. 43 * 44 * Example: 45 * to match the source port (16 bits) of a packet, we can divide those 16 bits 46 * in 4 groups of 4 bits each. Given the entry: 47 * 0000 0001 0101 1001 48 * and a packet with source port: 49 * 0000 0001 1010 1001 50 * first and second groups match, but the third doesn't. We conclude that the 51 * packet doesn't match the given entry. 52 * 53 * Translate the set to a sequence of lookup tables, one per field. Each table 54 * has two dimensions: bit groups to be matched for a single packet field, and 55 * all the possible values of said groups (buckets). Input entries are 56 * represented as one or more rules, depending on the number of composing 57 * netmasks for the given field specifier, and a group match is indicated as a 58 * set bit, with number corresponding to the rule index, in all the buckets 59 * whose value matches the entry for a given group. 60 * 61 * Rules are mapped between fields through an array of x, n pairs, with each 62 * item mapping a matched rule to one or more rules. The position of the pair in 63 * the array indicates the matched rule to be mapped to the next field, x 64 * indicates the first rule index in the next field, and n the amount of 65 * next-field rules the current rule maps to. 66 * 67 * The mapping array for the last field maps to the desired references. 68 * 69 * To match, we perform table lookups using the values of grouped packet bits, 70 * and use a sequence of bitwise operations to progressively evaluate rule 71 * matching. 72 * 73 * A stand-alone, reference implementation, also including notes about possible 74 * future optimisations, is available at: 75 * https://pipapo.lameexcu.se/ 76 * 77 * Insertion 78 * --------- 79 * 80 * - For each packet field: 81 * 82 * - divide the b packet bits we want to classify into groups of size t, 83 * obtaining ceil(b / t) groups 84 * 85 * Example: match on destination IP address, with t = 4: 32 bits, 8 groups 86 * of 4 bits each 87 * 88 * - allocate a lookup table with one column ("bucket") for each possible 89 * value of a group, and with one row for each group 90 * 91 * Example: 8 groups, 2^4 buckets: 92 * 93 * :: 94 * 95 * bucket 96 * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 97 * 0 98 * 1 99 * 2 100 * 3 101 * 4 102 * 5 103 * 6 104 * 7 105 * 106 * - map the bits we want to classify for the current field, for a given 107 * entry, to a single rule for non-ranged and netmask set items, and to one 108 * or multiple rules for ranges. Ranges are expanded to composing netmasks 109 * by pipapo_expand(). 110 * 111 * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048 112 * - rule #0: 10.0.0.5 113 * - rule #1: 192.168.1.0/24 114 * - rule #2: 192.168.2.0/31 115 * 116 * - insert references to the rules in the lookup table, selecting buckets 117 * according to bit values of a rule in the given group. This is done by 118 * pipapo_insert(). 119 * 120 * Example: given: 121 * - rule #0: 10.0.0.5 mapping to buckets 122 * < 0 10 0 0 0 0 0 5 > 123 * - rule #1: 192.168.1.0/24 mapping to buckets 124 * < 12 0 10 8 0 1 < 0..15 > < 0..15 > > 125 * - rule #2: 192.168.2.0/31 mapping to buckets 126 * < 12 0 10 8 0 2 0 < 0..1 > > 127 * 128 * these bits are set in the lookup table: 129 * 130 * :: 131 * 132 * bucket 133 * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 134 * 0 0 1,2 135 * 1 1,2 0 136 * 2 0 1,2 137 * 3 0 1,2 138 * 4 0,1,2 139 * 5 0 1 2 140 * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 141 * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 142 * 143 * - if this is not the last field in the set, fill a mapping array that maps 144 * rules from the lookup table to rules belonging to the same entry in 145 * the next lookup table, done by pipapo_map(). 146 * 147 * Note that as rules map to contiguous ranges of rules, given how netmask 148 * expansion and insertion is performed, &union nft_pipapo_map_bucket stores 149 * this information as pairs of first rule index, rule count. 150 * 151 * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048, 152 * given lookup table #0 for field 0 (see example above): 153 * 154 * :: 155 * 156 * bucket 157 * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 158 * 0 0 1,2 159 * 1 1,2 0 160 * 2 0 1,2 161 * 3 0 1,2 162 * 4 0,1,2 163 * 5 0 1 2 164 * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 165 * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 166 * 167 * and lookup table #1 for field 1 with: 168 * - rule #0: 1024 mapping to buckets 169 * < 0 0 4 0 > 170 * - rule #1: 2048 mapping to buckets 171 * < 0 0 5 0 > 172 * 173 * :: 174 * 175 * bucket 176 * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 177 * 0 0,1 178 * 1 0,1 179 * 2 0 1 180 * 3 0,1 181 * 182 * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024 183 * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1 184 * (rules #1, #2) to 2048 in lookup table #2 (rule #1): 185 * 186 * :: 187 * 188 * rule indices in current field: 0 1 2 189 * map to rules in next field: 0 1 1 190 * 191 * - if this is the last field in the set, fill a mapping array that maps 192 * rules from the last lookup table to element pointers, also done by 193 * pipapo_map(). 194 * 195 * Note that, in this implementation, we have two elements (start, end) for 196 * each entry. The pointer to the end element is stored in this array, and 197 * the pointer to the start element is linked from it. 198 * 199 * Example: entry 10.0.0.5:1024 has a corresponding &struct nft_pipapo_elem 200 * pointer, 0x66, and element for 192.168.1.0-192.168.2.1:2048 is at 0x42. 201 * From the rules of lookup table #1 as mapped above: 202 * 203 * :: 204 * 205 * rule indices in last field: 0 1 206 * map to elements: 0x66 0x42 207 * 208 * 209 * Matching 210 * -------- 211 * 212 * We use a result bitmap, with the size of a single lookup table bucket, to 213 * represent the matching state that applies at every algorithm step. This is 214 * done by pipapo_lookup(). 215 * 216 * - For each packet field: 217 * 218 * - start with an all-ones result bitmap (res_map in pipapo_lookup()) 219 * 220 * - perform a lookup into the table corresponding to the current field, 221 * for each group, and at every group, AND the current result bitmap with 222 * the value from the lookup table bucket 223 * 224 * :: 225 * 226 * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from 227 * insertion examples. 228 * Lookup table buckets are at least 3 bits wide, we'll assume 8 bits for 229 * convenience in this example. Initial result bitmap is 0xff, the steps 230 * below show the value of the result bitmap after each group is processed: 231 * 232 * bucket 233 * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 234 * 0 0 1,2 235 * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6 236 * 237 * 1 1,2 0 238 * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6 239 * 240 * 2 0 1,2 241 * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6 242 * 243 * 3 0 1,2 244 * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6 245 * 246 * 4 0,1,2 247 * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6 248 * 249 * 5 0 1 2 250 * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2 251 * 252 * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 253 * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2 254 * 255 * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 256 * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2 257 * 258 * - at the next field, start with a new, all-zeroes result bitmap. For each 259 * bit set in the previous result bitmap, fill the new result bitmap 260 * (fill_map in pipapo_lookup()) with the rule indices from the 261 * corresponding buckets of the mapping field for this field, done by 262 * pipapo_refill() 263 * 264 * Example: with mapping table from insertion examples, with the current 265 * result bitmap from the previous example, 0x02: 266 * 267 * :: 268 * 269 * rule indices in current field: 0 1 2 270 * map to rules in next field: 0 1 1 271 * 272 * the new result bitmap will be 0x02: rule 1 was set, and rule 1 will be 273 * set. 274 * 275 * We can now extend this example to cover the second iteration of the step 276 * above (lookup and AND bitmap): assuming the port field is 277 * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table 278 * for "port" field from pre-computation example: 279 * 280 * :: 281 * 282 * bucket 283 * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 284 * 0 0,1 285 * 1 0,1 286 * 2 0 1 287 * 3 0,1 288 * 289 * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5] 290 * & 0x3 [bucket 0], resulting bitmap is 0x2. 291 * 292 * - if this is the last field in the set, look up the value from the mapping 293 * array corresponding to the final result bitmap 294 * 295 * Example: 0x2 resulting bitmap from 192.168.1.5:2048, mapping array for 296 * last field from insertion example: 297 * 298 * :: 299 * 300 * rule indices in last field: 0 1 301 * map to elements: 0x66 0x42 302 * 303 * the matching element is at 0x42. 304 * 305 * 306 * References 307 * ---------- 308 * 309 * [Ligatti 2010] 310 * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with 311 * Automatic Time-space Tradeoffs 312 * Jay Ligatti, Josh Kuhn, and Chris Gage. 313 * Proceedings of the IEEE International Conference on Computer 314 * Communication Networks (ICCCN), August 2010. 315 * https://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf 316 * 317 * [Rottenstreich 2010] 318 * Worst-Case TCAM Rule Expansion 319 * Ori Rottenstreich and Isaac Keslassy. 320 * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010. 321 * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf 322 * 323 * [Kogan 2014] 324 * SAX-PAC (Scalable And eXpressive PAcket Classification) 325 * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, 326 * and Patrick Eugster. 327 * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. 328 * https://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf 329 */ 330 331 #include <linux/kernel.h> 332 #include <linux/init.h> 333 #include <linux/module.h> 334 #include <linux/netlink.h> 335 #include <linux/netfilter.h> 336 #include <linux/netfilter/nf_tables.h> 337 #include <net/netfilter/nf_tables_core.h> 338 #include <uapi/linux/netfilter/nf_tables.h> 339 #include <linux/bitmap.h> 340 #include <linux/bitops.h> 341 342 #include "nft_set_pipapo_avx2.h" 343 #include "nft_set_pipapo.h" 344 345 /** 346 * pipapo_refill() - For each set bit, set bits from selected mapping table item 347 * @map: Bitmap to be scanned for set bits 348 * @len: Length of bitmap in longs 349 * @rules: Number of rules in field 350 * @dst: Destination bitmap 351 * @mt: Mapping table containing bit set specifiers 352 * @match_only: Find a single bit and return, don't fill 353 * 354 * Iteration over set bits with __builtin_ctzl(): Daniel Lemire, public domain. 355 * 356 * For each bit set in map, select the bucket from mapping table with index 357 * corresponding to the position of the bit set. Use start bit and amount of 358 * bits specified in bucket to fill region in dst. 359 * 360 * Return: -1 on no match, bit position on 'match_only', 0 otherwise. 361 */ 362 int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, 363 unsigned long *dst, 364 const union nft_pipapo_map_bucket *mt, bool match_only) 365 { 366 unsigned long bitset; 367 unsigned int k; 368 int ret = -1; 369 370 for (k = 0; k < len; k++) { 371 bitset = map[k]; 372 while (bitset) { 373 unsigned long t = bitset & -bitset; 374 int r = __builtin_ctzl(bitset); 375 int i = k * BITS_PER_LONG + r; 376 377 if (unlikely(i >= rules)) { 378 map[k] = 0; 379 return -1; 380 } 381 382 if (match_only) { 383 bitmap_clear(map, i, 1); 384 return i; 385 } 386 387 ret = 0; 388 389 bitmap_set(dst, mt[i].to, mt[i].n); 390 391 bitset ^= t; 392 } 393 map[k] = 0; 394 } 395 396 return ret; 397 } 398 399 /** 400 * nft_pipapo_lookup() - Lookup function 401 * @net: Network namespace 402 * @set: nftables API set representation 403 * @key: nftables API element representation containing key data 404 * @ext: nftables API extension pointer, filled with matching reference 405 * 406 * For more details, see DOC: Theory of Operation. 407 * 408 * Return: true on match, false otherwise. 409 */ 410 bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, 411 const u32 *key, const struct nft_set_ext **ext) 412 { 413 struct nft_pipapo *priv = nft_set_priv(set); 414 struct nft_pipapo_scratch *scratch; 415 unsigned long *res_map, *fill_map; 416 u8 genmask = nft_genmask_cur(net); 417 const struct nft_pipapo_match *m; 418 const struct nft_pipapo_field *f; 419 const u8 *rp = (const u8 *)key; 420 bool map_index; 421 int i; 422 423 local_bh_disable(); 424 425 m = rcu_dereference(priv->match); 426 427 if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) 428 goto out; 429 430 scratch = *raw_cpu_ptr(m->scratch); 431 432 map_index = scratch->map_index; 433 434 res_map = scratch->map + (map_index ? m->bsize_max : 0); 435 fill_map = scratch->map + (map_index ? 0 : m->bsize_max); 436 437 pipapo_resmap_init(m, res_map); 438 439 nft_pipapo_for_each_field(f, i, m) { 440 bool last = i == m->field_count - 1; 441 int b; 442 443 /* For each bit group: select lookup table bucket depending on 444 * packet bytes value, then AND bucket value 445 */ 446 if (likely(f->bb == 8)) 447 pipapo_and_field_buckets_8bit(f, res_map, rp); 448 else 449 pipapo_and_field_buckets_4bit(f, res_map, rp); 450 NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; 451 452 rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); 453 454 /* Now populate the bitmap for the next field, unless this is 455 * the last field, in which case return the matched 'ext' 456 * pointer if any. 457 * 458 * Now res_map contains the matching bitmap, and fill_map is the 459 * bitmap for the next field. 460 */ 461 next_match: 462 b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, 463 last); 464 if (b < 0) { 465 scratch->map_index = map_index; 466 local_bh_enable(); 467 468 return false; 469 } 470 471 if (last) { 472 *ext = &f->mt[b].e->ext; 473 if (unlikely(nft_set_elem_expired(*ext) || 474 !nft_set_elem_active(*ext, genmask))) 475 goto next_match; 476 477 /* Last field: we're just returning the key without 478 * filling the initial bitmap for the next field, so the 479 * current inactive bitmap is clean and can be reused as 480 * *next* bitmap (not initial) for the next packet. 481 */ 482 scratch->map_index = map_index; 483 local_bh_enable(); 484 485 return true; 486 } 487 488 /* Swap bitmap indices: res_map is the initial bitmap for the 489 * next field, and fill_map is guaranteed to be all-zeroes at 490 * this point. 491 */ 492 map_index = !map_index; 493 swap(res_map, fill_map); 494 495 rp += NFT_PIPAPO_GROUPS_PADDING(f); 496 } 497 498 out: 499 local_bh_enable(); 500 return false; 501 } 502 503 /** 504 * pipapo_get() - Get matching element reference given key data 505 * @net: Network namespace 506 * @set: nftables API set representation 507 * @m: storage containing active/existing elements 508 * @data: Key data to be matched against existing elements 509 * @genmask: If set, check that element is active in given genmask 510 * @tstamp: timestamp to check for expired elements 511 * @gfp: the type of memory to allocate (see kmalloc). 512 * 513 * This is essentially the same as the lookup function, except that it matches 514 * key data against the uncommitted copy and doesn't use preallocated maps for 515 * bitmap results. 516 * 517 * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. 518 */ 519 static struct nft_pipapo_elem *pipapo_get(const struct net *net, 520 const struct nft_set *set, 521 const struct nft_pipapo_match *m, 522 const u8 *data, u8 genmask, 523 u64 tstamp, gfp_t gfp) 524 { 525 struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); 526 unsigned long *res_map, *fill_map = NULL; 527 const struct nft_pipapo_field *f; 528 int i; 529 530 if (m->bsize_max == 0) 531 return ret; 532 533 res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), gfp); 534 if (!res_map) { 535 ret = ERR_PTR(-ENOMEM); 536 goto out; 537 } 538 539 fill_map = kcalloc(m->bsize_max, sizeof(*res_map), gfp); 540 if (!fill_map) { 541 ret = ERR_PTR(-ENOMEM); 542 goto out; 543 } 544 545 pipapo_resmap_init(m, res_map); 546 547 nft_pipapo_for_each_field(f, i, m) { 548 bool last = i == m->field_count - 1; 549 int b; 550 551 /* For each bit group: select lookup table bucket depending on 552 * packet bytes value, then AND bucket value 553 */ 554 if (f->bb == 8) 555 pipapo_and_field_buckets_8bit(f, res_map, data); 556 else if (f->bb == 4) 557 pipapo_and_field_buckets_4bit(f, res_map, data); 558 else 559 BUG(); 560 561 data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f); 562 563 /* Now populate the bitmap for the next field, unless this is 564 * the last field, in which case return the matched 'ext' 565 * pointer if any. 566 * 567 * Now res_map contains the matching bitmap, and fill_map is the 568 * bitmap for the next field. 569 */ 570 next_match: 571 b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, 572 last); 573 if (b < 0) 574 goto out; 575 576 if (last) { 577 if (__nft_set_elem_expired(&f->mt[b].e->ext, tstamp)) 578 goto next_match; 579 if ((genmask && 580 !nft_set_elem_active(&f->mt[b].e->ext, genmask))) 581 goto next_match; 582 583 ret = f->mt[b].e; 584 goto out; 585 } 586 587 data += NFT_PIPAPO_GROUPS_PADDING(f); 588 589 /* Swap bitmap indices: fill_map will be the initial bitmap for 590 * the next field (i.e. the new res_map), and res_map is 591 * guaranteed to be all-zeroes at this point, ready to be filled 592 * according to the next mapping table. 593 */ 594 swap(res_map, fill_map); 595 } 596 597 out: 598 kfree(fill_map); 599 kfree(res_map); 600 return ret; 601 } 602 603 /** 604 * nft_pipapo_get() - Get matching element reference given key data 605 * @net: Network namespace 606 * @set: nftables API set representation 607 * @elem: nftables API element representation containing key data 608 * @flags: Unused 609 */ 610 static struct nft_elem_priv * 611 nft_pipapo_get(const struct net *net, const struct nft_set *set, 612 const struct nft_set_elem *elem, unsigned int flags) 613 { 614 struct nft_pipapo *priv = nft_set_priv(set); 615 struct nft_pipapo_match *m = rcu_dereference(priv->match); 616 struct nft_pipapo_elem *e; 617 618 e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data, 619 nft_genmask_cur(net), get_jiffies_64(), 620 GFP_ATOMIC); 621 if (IS_ERR(e)) 622 return ERR_CAST(e); 623 624 return &e->priv; 625 } 626 627 /** 628 * pipapo_realloc_mt() - Reallocate mapping table if needed upon resize 629 * @f: Field containing mapping table 630 * @old_rules: Amount of existing mapped rules 631 * @rules: Amount of new rules to map 632 * 633 * Return: 0 on success, negative error code on failure. 634 */ 635 static int pipapo_realloc_mt(struct nft_pipapo_field *f, 636 unsigned int old_rules, unsigned int rules) 637 { 638 union nft_pipapo_map_bucket *new_mt = NULL, *old_mt = f->mt; 639 const unsigned int extra = PAGE_SIZE / sizeof(*new_mt); 640 unsigned int rules_alloc = rules; 641 642 might_sleep(); 643 644 if (unlikely(rules == 0)) 645 goto out_free; 646 647 /* growing and enough space left, no action needed */ 648 if (rules > old_rules && f->rules_alloc > rules) 649 return 0; 650 651 /* downsize and extra slack has not grown too large */ 652 if (rules < old_rules) { 653 unsigned int remove = f->rules_alloc - rules; 654 655 if (remove < (2u * extra)) 656 return 0; 657 } 658 659 /* If set needs more than one page of memory for rules then 660 * allocate another extra page to avoid frequent reallocation. 661 */ 662 if (rules > extra && 663 check_add_overflow(rules, extra, &rules_alloc)) 664 return -EOVERFLOW; 665 666 if (rules_alloc > (INT_MAX / sizeof(*new_mt))) 667 return -ENOMEM; 668 669 new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT); 670 if (!new_mt) 671 return -ENOMEM; 672 673 if (old_mt) 674 memcpy(new_mt, old_mt, min(old_rules, rules) * sizeof(*new_mt)); 675 676 if (rules > old_rules) { 677 memset(new_mt + old_rules, 0, 678 (rules - old_rules) * sizeof(*new_mt)); 679 } 680 out_free: 681 f->rules_alloc = rules_alloc; 682 f->mt = new_mt; 683 684 kvfree(old_mt); 685 686 return 0; 687 } 688 689 690 /** 691 * lt_calculate_size() - Get storage size for lookup table with overflow check 692 * @groups: Amount of bit groups 693 * @bb: Number of bits grouped together in lookup table buckets 694 * @bsize: Size of each bucket in lookup table, in longs 695 * 696 * Return: allocation size including alignment overhead, negative on overflow 697 */ 698 static ssize_t lt_calculate_size(unsigned int groups, unsigned int bb, 699 unsigned int bsize) 700 { 701 ssize_t ret = groups * NFT_PIPAPO_BUCKETS(bb) * sizeof(long); 702 703 if (check_mul_overflow(ret, bsize, &ret)) 704 return -1; 705 if (check_add_overflow(ret, NFT_PIPAPO_ALIGN_HEADROOM, &ret)) 706 return -1; 707 if (ret > INT_MAX) 708 return -1; 709 710 return ret; 711 } 712 713 /** 714 * pipapo_resize() - Resize lookup or mapping table, or both 715 * @f: Field containing lookup and mapping tables 716 * @old_rules: Previous amount of rules in field 717 * @rules: New amount of rules 718 * 719 * Increase, decrease or maintain tables size depending on new amount of rules, 720 * and copy data over. In case the new size is smaller, throw away data for 721 * highest-numbered rules. 722 * 723 * Return: 0 on success, -ENOMEM on allocation failure. 724 */ 725 static int pipapo_resize(struct nft_pipapo_field *f, 726 unsigned int old_rules, unsigned int rules) 727 { 728 long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; 729 unsigned int new_bucket_size, copy; 730 int group, bucket, err; 731 ssize_t lt_size; 732 733 if (rules >= NFT_PIPAPO_RULE0_MAX) 734 return -ENOSPC; 735 736 new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); 737 #ifdef NFT_PIPAPO_ALIGN 738 new_bucket_size = roundup(new_bucket_size, 739 NFT_PIPAPO_ALIGN / sizeof(*new_lt)); 740 #endif 741 742 if (new_bucket_size == f->bsize) 743 goto mt; 744 745 if (new_bucket_size > f->bsize) 746 copy = f->bsize; 747 else 748 copy = new_bucket_size; 749 750 lt_size = lt_calculate_size(f->groups, f->bb, new_bucket_size); 751 if (lt_size < 0) 752 return -ENOMEM; 753 754 new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); 755 if (!new_lt) 756 return -ENOMEM; 757 758 new_p = NFT_PIPAPO_LT_ALIGN(new_lt); 759 old_p = NFT_PIPAPO_LT_ALIGN(old_lt); 760 761 for (group = 0; group < f->groups; group++) { 762 for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS(f->bb); bucket++) { 763 memcpy(new_p, old_p, copy * sizeof(*new_p)); 764 new_p += copy; 765 old_p += copy; 766 767 if (new_bucket_size > f->bsize) 768 new_p += new_bucket_size - f->bsize; 769 else 770 old_p += f->bsize - new_bucket_size; 771 } 772 } 773 774 mt: 775 err = pipapo_realloc_mt(f, old_rules, rules); 776 if (err) { 777 kvfree(new_lt); 778 return err; 779 } 780 781 if (new_lt) { 782 f->bsize = new_bucket_size; 783 f->lt = new_lt; 784 kvfree(old_lt); 785 } 786 787 return 0; 788 } 789 790 /** 791 * pipapo_bucket_set() - Set rule bit in bucket given group and group value 792 * @f: Field containing lookup table 793 * @rule: Rule index 794 * @group: Group index 795 * @v: Value of bit group 796 */ 797 static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, 798 int v) 799 { 800 unsigned long *pos; 801 802 pos = NFT_PIPAPO_LT_ALIGN(f->lt); 803 pos += f->bsize * NFT_PIPAPO_BUCKETS(f->bb) * group; 804 pos += f->bsize * v; 805 806 __set_bit(rule, pos); 807 } 808 809 /** 810 * pipapo_lt_4b_to_8b() - Switch lookup table group width from 4 bits to 8 bits 811 * @old_groups: Number of current groups 812 * @bsize: Size of one bucket, in longs 813 * @old_lt: Pointer to the current lookup table 814 * @new_lt: Pointer to the new, pre-allocated lookup table 815 * 816 * Each bucket with index b in the new lookup table, belonging to group g, is 817 * filled with the bit intersection between: 818 * - bucket with index given by the upper 4 bits of b, from group g, and 819 * - bucket with index given by the lower 4 bits of b, from group g + 1 820 * 821 * That is, given buckets from the new lookup table N(x, y) and the old lookup 822 * table O(x, y), with x bucket index, and y group index: 823 * 824 * N(b, g) := O(b / 16, g) & O(b % 16, g + 1) 825 * 826 * This ensures equivalence of the matching results on lookup. Two examples in 827 * pictures: 828 * 829 * bucket 830 * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 ... 254 255 831 * 0 ^ 832 * 1 | ^ 833 * ... ( & ) | 834 * / \ | 835 * / \ .-( & )-. 836 * / bucket \ | | 837 * group 0 / 1 2 3 \ 4 5 6 7 8 9 10 11 12 13 |14 15 | 838 * 0 / \ | | 839 * 1 \ | | 840 * 2 | --' 841 * 3 '- 842 * ... 843 */ 844 static void pipapo_lt_4b_to_8b(int old_groups, int bsize, 845 unsigned long *old_lt, unsigned long *new_lt) 846 { 847 int g, b, i; 848 849 for (g = 0; g < old_groups / 2; g++) { 850 int src_g0 = g * 2, src_g1 = g * 2 + 1; 851 852 for (b = 0; b < NFT_PIPAPO_BUCKETS(8); b++) { 853 int src_b0 = b / NFT_PIPAPO_BUCKETS(4); 854 int src_b1 = b % NFT_PIPAPO_BUCKETS(4); 855 int src_i0 = src_g0 * NFT_PIPAPO_BUCKETS(4) + src_b0; 856 int src_i1 = src_g1 * NFT_PIPAPO_BUCKETS(4) + src_b1; 857 858 for (i = 0; i < bsize; i++) { 859 *new_lt = old_lt[src_i0 * bsize + i] & 860 old_lt[src_i1 * bsize + i]; 861 new_lt++; 862 } 863 } 864 } 865 } 866 867 /** 868 * pipapo_lt_8b_to_4b() - Switch lookup table group width from 8 bits to 4 bits 869 * @old_groups: Number of current groups 870 * @bsize: Size of one bucket, in longs 871 * @old_lt: Pointer to the current lookup table 872 * @new_lt: Pointer to the new, pre-allocated lookup table 873 * 874 * Each bucket with index b in the new lookup table, belonging to group g, is 875 * filled with the bit union of: 876 * - all the buckets with index such that the upper four bits of the lower byte 877 * equal b, from group g, with g odd 878 * - all the buckets with index such that the lower four bits equal b, from 879 * group g, with g even 880 * 881 * That is, given buckets from the new lookup table N(x, y) and the old lookup 882 * table O(x, y), with x bucket index, and y group index: 883 * 884 * - with g odd: N(b, g) := U(O(x, g) for each x : x = (b & 0xf0) >> 4) 885 * - with g even: N(b, g) := U(O(x, g) for each x : x = b & 0x0f) 886 * 887 * where U() denotes the arbitrary union operation (binary OR of n terms). This 888 * ensures equivalence of the matching results on lookup. 889 */ 890 static void pipapo_lt_8b_to_4b(int old_groups, int bsize, 891 unsigned long *old_lt, unsigned long *new_lt) 892 { 893 int g, b, bsrc, i; 894 895 memset(new_lt, 0, old_groups * 2 * NFT_PIPAPO_BUCKETS(4) * bsize * 896 sizeof(unsigned long)); 897 898 for (g = 0; g < old_groups * 2; g += 2) { 899 int src_g = g / 2; 900 901 for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { 902 for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; 903 bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); 904 bsrc++) { 905 if (((bsrc & 0xf0) >> 4) != b) 906 continue; 907 908 for (i = 0; i < bsize; i++) 909 new_lt[i] |= old_lt[bsrc * bsize + i]; 910 } 911 912 new_lt += bsize; 913 } 914 915 for (b = 0; b < NFT_PIPAPO_BUCKETS(4); b++) { 916 for (bsrc = NFT_PIPAPO_BUCKETS(8) * src_g; 917 bsrc < NFT_PIPAPO_BUCKETS(8) * (src_g + 1); 918 bsrc++) { 919 if ((bsrc & 0x0f) != b) 920 continue; 921 922 for (i = 0; i < bsize; i++) 923 new_lt[i] |= old_lt[bsrc * bsize + i]; 924 } 925 926 new_lt += bsize; 927 } 928 } 929 } 930 931 /** 932 * pipapo_lt_bits_adjust() - Adjust group size for lookup table if needed 933 * @f: Field containing lookup table 934 */ 935 static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) 936 { 937 unsigned int groups, bb; 938 unsigned long *new_lt; 939 ssize_t lt_size; 940 941 lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize * 942 sizeof(*f->lt); 943 944 if (f->bb == NFT_PIPAPO_GROUP_BITS_SMALL_SET && 945 lt_size > NFT_PIPAPO_LT_SIZE_HIGH) { 946 groups = f->groups * 2; 947 bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET; 948 949 lt_size = lt_calculate_size(groups, bb, f->bsize); 950 if (lt_size < 0) 951 return; 952 } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET && 953 lt_size < NFT_PIPAPO_LT_SIZE_LOW) { 954 groups = f->groups / 2; 955 bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET; 956 957 lt_size = lt_calculate_size(groups, bb, f->bsize); 958 if (lt_size < 0) 959 return; 960 961 /* Don't increase group width if the resulting lookup table size 962 * would exceed the upper size threshold for a "small" set. 963 */ 964 if (lt_size > NFT_PIPAPO_LT_SIZE_HIGH) 965 return; 966 } else { 967 return; 968 } 969 970 new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); 971 if (!new_lt) 972 return; 973 974 NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; 975 if (f->bb == 4 && bb == 8) { 976 pipapo_lt_4b_to_8b(f->groups, f->bsize, 977 NFT_PIPAPO_LT_ALIGN(f->lt), 978 NFT_PIPAPO_LT_ALIGN(new_lt)); 979 } else if (f->bb == 8 && bb == 4) { 980 pipapo_lt_8b_to_4b(f->groups, f->bsize, 981 NFT_PIPAPO_LT_ALIGN(f->lt), 982 NFT_PIPAPO_LT_ALIGN(new_lt)); 983 } else { 984 BUG(); 985 } 986 987 f->groups = groups; 988 f->bb = bb; 989 kvfree(f->lt); 990 f->lt = new_lt; 991 } 992 993 /** 994 * pipapo_insert() - Insert new rule in field given input key and mask length 995 * @f: Field containing lookup table 996 * @k: Input key for classification, without nftables padding 997 * @mask_bits: Length of mask; matches field length for non-ranged entry 998 * 999 * Insert a new rule reference in lookup buckets corresponding to k and 1000 * mask_bits. 1001 * 1002 * Return: 1 on success (one rule inserted), negative error code on failure. 1003 */ 1004 static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, 1005 int mask_bits) 1006 { 1007 unsigned int rule = f->rules, group, ret, bit_offset = 0; 1008 1009 ret = pipapo_resize(f, f->rules, f->rules + 1); 1010 if (ret) 1011 return ret; 1012 1013 f->rules++; 1014 1015 for (group = 0; group < f->groups; group++) { 1016 int i, v; 1017 u8 mask; 1018 1019 v = k[group / (BITS_PER_BYTE / f->bb)]; 1020 v &= GENMASK(BITS_PER_BYTE - bit_offset - 1, 0); 1021 v >>= (BITS_PER_BYTE - bit_offset) - f->bb; 1022 1023 bit_offset += f->bb; 1024 bit_offset %= BITS_PER_BYTE; 1025 1026 if (mask_bits >= (group + 1) * f->bb) { 1027 /* Not masked */ 1028 pipapo_bucket_set(f, rule, group, v); 1029 } else if (mask_bits <= group * f->bb) { 1030 /* Completely masked */ 1031 for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) 1032 pipapo_bucket_set(f, rule, group, i); 1033 } else { 1034 /* The mask limit falls on this group */ 1035 mask = GENMASK(f->bb - 1, 0); 1036 mask >>= mask_bits - group * f->bb; 1037 for (i = 0; i < NFT_PIPAPO_BUCKETS(f->bb); i++) { 1038 if ((i & ~mask) == (v & ~mask)) 1039 pipapo_bucket_set(f, rule, group, i); 1040 } 1041 } 1042 } 1043 1044 pipapo_lt_bits_adjust(f); 1045 1046 return 1; 1047 } 1048 1049 /** 1050 * pipapo_step_diff() - Check if setting @step bit in netmask would change it 1051 * @base: Mask we are expanding 1052 * @step: Step bit for given expansion step 1053 * @len: Total length of mask space (set and unset bits), bytes 1054 * 1055 * Convenience function for mask expansion. 1056 * 1057 * Return: true if step bit changes mask (i.e. isn't set), false otherwise. 1058 */ 1059 static bool pipapo_step_diff(u8 *base, int step, int len) 1060 { 1061 /* Network order, byte-addressed */ 1062 #ifdef __BIG_ENDIAN__ 1063 return !(BIT(step % BITS_PER_BYTE) & base[step / BITS_PER_BYTE]); 1064 #else 1065 return !(BIT(step % BITS_PER_BYTE) & 1066 base[len - 1 - step / BITS_PER_BYTE]); 1067 #endif 1068 } 1069 1070 /** 1071 * pipapo_step_after_end() - Check if mask exceeds range end with given step 1072 * @base: Mask we are expanding 1073 * @end: End of range 1074 * @step: Step bit for given expansion step, highest bit to be set 1075 * @len: Total length of mask space (set and unset bits), bytes 1076 * 1077 * Convenience function for mask expansion. 1078 * 1079 * Return: true if mask exceeds range setting step bits, false otherwise. 1080 */ 1081 static bool pipapo_step_after_end(const u8 *base, const u8 *end, int step, 1082 int len) 1083 { 1084 u8 tmp[NFT_PIPAPO_MAX_BYTES]; 1085 int i; 1086 1087 memcpy(tmp, base, len); 1088 1089 /* Network order, byte-addressed */ 1090 for (i = 0; i <= step; i++) 1091 #ifdef __BIG_ENDIAN__ 1092 tmp[i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); 1093 #else 1094 tmp[len - 1 - i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); 1095 #endif 1096 1097 return memcmp(tmp, end, len) > 0; 1098 } 1099 1100 /** 1101 * pipapo_base_sum() - Sum step bit to given len-sized netmask base with carry 1102 * @base: Netmask base 1103 * @step: Step bit to sum 1104 * @len: Netmask length, bytes 1105 */ 1106 static void pipapo_base_sum(u8 *base, int step, int len) 1107 { 1108 bool carry = false; 1109 int i; 1110 1111 /* Network order, byte-addressed */ 1112 #ifdef __BIG_ENDIAN__ 1113 for (i = step / BITS_PER_BYTE; i < len; i++) { 1114 #else 1115 for (i = len - 1 - step / BITS_PER_BYTE; i >= 0; i--) { 1116 #endif 1117 if (carry) 1118 base[i]++; 1119 else 1120 base[i] += 1 << (step % BITS_PER_BYTE); 1121 1122 if (base[i]) 1123 break; 1124 1125 carry = true; 1126 } 1127 } 1128 1129 /** 1130 * pipapo_expand() - Expand to composing netmasks, insert into lookup table 1131 * @f: Field containing lookup table 1132 * @start: Start of range 1133 * @end: End of range 1134 * @len: Length of value in bits 1135 * 1136 * Expand range to composing netmasks and insert corresponding rule references 1137 * in lookup buckets. 1138 * 1139 * Return: number of inserted rules on success, negative error code on failure. 1140 */ 1141 static int pipapo_expand(struct nft_pipapo_field *f, 1142 const u8 *start, const u8 *end, int len) 1143 { 1144 int step, masks = 0, bytes = DIV_ROUND_UP(len, BITS_PER_BYTE); 1145 u8 base[NFT_PIPAPO_MAX_BYTES]; 1146 1147 memcpy(base, start, bytes); 1148 while (memcmp(base, end, bytes) <= 0) { 1149 int err; 1150 1151 step = 0; 1152 while (pipapo_step_diff(base, step, bytes)) { 1153 if (pipapo_step_after_end(base, end, step, bytes)) 1154 break; 1155 1156 step++; 1157 if (step >= len) { 1158 if (!masks) { 1159 err = pipapo_insert(f, base, 0); 1160 if (err < 0) 1161 return err; 1162 masks = 1; 1163 } 1164 goto out; 1165 } 1166 } 1167 1168 err = pipapo_insert(f, base, len - step); 1169 1170 if (err < 0) 1171 return err; 1172 1173 masks++; 1174 pipapo_base_sum(base, step, bytes); 1175 } 1176 out: 1177 return masks; 1178 } 1179 1180 /** 1181 * pipapo_map() - Insert rules in mapping tables, mapping them between fields 1182 * @m: Matching data, including mapping table 1183 * @map: Table of rule maps: array of first rule and amount of rules 1184 * in next field a given rule maps to, for each field 1185 * @e: For last field, nft_set_ext pointer matching rules map to 1186 */ 1187 static void pipapo_map(struct nft_pipapo_match *m, 1188 union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS], 1189 struct nft_pipapo_elem *e) 1190 { 1191 struct nft_pipapo_field *f; 1192 int i, j; 1193 1194 for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) { 1195 for (j = 0; j < map[i].n; j++) { 1196 f->mt[map[i].to + j].to = map[i + 1].to; 1197 f->mt[map[i].to + j].n = map[i + 1].n; 1198 } 1199 } 1200 1201 /* Last field: map to ext instead of mapping to next field */ 1202 for (j = 0; j < map[i].n; j++) 1203 f->mt[map[i].to + j].e = e; 1204 } 1205 1206 /** 1207 * pipapo_free_scratch() - Free per-CPU map at original (not aligned) address 1208 * @m: Matching data 1209 * @cpu: CPU number 1210 */ 1211 static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu) 1212 { 1213 struct nft_pipapo_scratch *s; 1214 void *mem; 1215 1216 s = *per_cpu_ptr(m->scratch, cpu); 1217 if (!s) 1218 return; 1219 1220 mem = s; 1221 mem -= s->align_off; 1222 kfree(mem); 1223 } 1224 1225 /** 1226 * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results 1227 * @clone: Copy of matching data with pending insertions and deletions 1228 * @bsize_max: Maximum bucket size, scratch maps cover two buckets 1229 * 1230 * Return: 0 on success, -ENOMEM on failure. 1231 */ 1232 static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, 1233 unsigned long bsize_max) 1234 { 1235 int i; 1236 1237 for_each_possible_cpu(i) { 1238 struct nft_pipapo_scratch *scratch; 1239 #ifdef NFT_PIPAPO_ALIGN 1240 void *scratch_aligned; 1241 u32 align_off; 1242 #endif 1243 scratch = kzalloc_node(struct_size(scratch, map, 1244 bsize_max * 2) + 1245 NFT_PIPAPO_ALIGN_HEADROOM, 1246 GFP_KERNEL_ACCOUNT, cpu_to_node(i)); 1247 if (!scratch) { 1248 /* On failure, there's no need to undo previous 1249 * allocations: this means that some scratch maps have 1250 * a bigger allocated size now (this is only called on 1251 * insertion), but the extra space won't be used by any 1252 * CPU as new elements are not inserted and m->bsize_max 1253 * is not updated. 1254 */ 1255 return -ENOMEM; 1256 } 1257 1258 pipapo_free_scratch(clone, i); 1259 1260 #ifdef NFT_PIPAPO_ALIGN 1261 /* Align &scratch->map (not the struct itself): the extra 1262 * %NFT_PIPAPO_ALIGN_HEADROOM bytes passed to kzalloc_node() 1263 * above guarantee we can waste up to those bytes in order 1264 * to align the map field regardless of its offset within 1265 * the struct. 1266 */ 1267 BUILD_BUG_ON(offsetof(struct nft_pipapo_scratch, map) > NFT_PIPAPO_ALIGN_HEADROOM); 1268 1269 scratch_aligned = NFT_PIPAPO_LT_ALIGN(&scratch->map); 1270 scratch_aligned -= offsetof(struct nft_pipapo_scratch, map); 1271 align_off = scratch_aligned - (void *)scratch; 1272 1273 scratch = scratch_aligned; 1274 scratch->align_off = align_off; 1275 #endif 1276 *per_cpu_ptr(clone->scratch, i) = scratch; 1277 } 1278 1279 return 0; 1280 } 1281 1282 static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) 1283 { 1284 #ifdef CONFIG_PROVE_LOCKING 1285 const struct net *net = read_pnet(&set->net); 1286 1287 return lockdep_is_held(&nft_pernet(net)->commit_mutex); 1288 #else 1289 return true; 1290 #endif 1291 } 1292 1293 static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old); 1294 1295 /** 1296 * pipapo_maybe_clone() - Build clone for pending data changes, if not existing 1297 * @set: nftables API set representation 1298 * 1299 * Return: newly created or existing clone, if any. NULL on allocation failure 1300 */ 1301 static struct nft_pipapo_match *pipapo_maybe_clone(const struct nft_set *set) 1302 { 1303 struct nft_pipapo *priv = nft_set_priv(set); 1304 struct nft_pipapo_match *m; 1305 1306 if (priv->clone) 1307 return priv->clone; 1308 1309 m = rcu_dereference_protected(priv->match, 1310 nft_pipapo_transaction_mutex_held(set)); 1311 priv->clone = pipapo_clone(m); 1312 1313 return priv->clone; 1314 } 1315 1316 /** 1317 * nft_pipapo_insert() - Validate and insert ranged elements 1318 * @net: Network namespace 1319 * @set: nftables API set representation 1320 * @elem: nftables API element representation containing key data 1321 * @elem_priv: Filled with pointer to &struct nft_set_ext in inserted element 1322 * 1323 * Return: 0 on success, error pointer on failure. 1324 */ 1325 static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, 1326 const struct nft_set_elem *elem, 1327 struct nft_elem_priv **elem_priv) 1328 { 1329 const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); 1330 union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; 1331 const u8 *start = (const u8 *)elem->key.val.data, *end; 1332 struct nft_pipapo_match *m = pipapo_maybe_clone(set); 1333 u8 genmask = nft_genmask_next(net); 1334 struct nft_pipapo_elem *e, *dup; 1335 u64 tstamp = nft_net_tstamp(net); 1336 struct nft_pipapo_field *f; 1337 const u8 *start_p, *end_p; 1338 int i, bsize_max, err = 0; 1339 1340 if (!m) 1341 return -ENOMEM; 1342 1343 if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) 1344 end = (const u8 *)nft_set_ext_key_end(ext)->data; 1345 else 1346 end = start; 1347 1348 dup = pipapo_get(net, set, m, start, genmask, tstamp, GFP_KERNEL); 1349 if (!IS_ERR(dup)) { 1350 /* Check if we already have the same exact entry */ 1351 const struct nft_data *dup_key, *dup_end; 1352 1353 dup_key = nft_set_ext_key(&dup->ext); 1354 if (nft_set_ext_exists(&dup->ext, NFT_SET_EXT_KEY_END)) 1355 dup_end = nft_set_ext_key_end(&dup->ext); 1356 else 1357 dup_end = dup_key; 1358 1359 if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) && 1360 !memcmp(end, dup_end->data, sizeof(*dup_end->data))) { 1361 *elem_priv = &dup->priv; 1362 return -EEXIST; 1363 } 1364 1365 return -ENOTEMPTY; 1366 } 1367 1368 if (PTR_ERR(dup) == -ENOENT) { 1369 /* Look for partially overlapping entries */ 1370 dup = pipapo_get(net, set, m, end, nft_genmask_next(net), tstamp, 1371 GFP_KERNEL); 1372 } 1373 1374 if (PTR_ERR(dup) != -ENOENT) { 1375 if (IS_ERR(dup)) 1376 return PTR_ERR(dup); 1377 *elem_priv = &dup->priv; 1378 return -ENOTEMPTY; 1379 } 1380 1381 /* Validate */ 1382 start_p = start; 1383 end_p = end; 1384 1385 /* some helpers return -1, or 0 >= for valid rule pos, 1386 * so we cannot support more than INT_MAX rules at this time. 1387 */ 1388 BUILD_BUG_ON(NFT_PIPAPO_RULE0_MAX > INT_MAX); 1389 1390 nft_pipapo_for_each_field(f, i, m) { 1391 if (f->rules >= NFT_PIPAPO_RULE0_MAX) 1392 return -ENOSPC; 1393 1394 if (memcmp(start_p, end_p, 1395 f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) > 0) 1396 return -EINVAL; 1397 1398 start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); 1399 end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); 1400 } 1401 1402 /* Insert */ 1403 bsize_max = m->bsize_max; 1404 1405 nft_pipapo_for_each_field(f, i, m) { 1406 int ret; 1407 1408 rulemap[i].to = f->rules; 1409 1410 ret = memcmp(start, end, 1411 f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); 1412 if (!ret) 1413 ret = pipapo_insert(f, start, f->groups * f->bb); 1414 else 1415 ret = pipapo_expand(f, start, end, f->groups * f->bb); 1416 1417 if (ret < 0) 1418 return ret; 1419 1420 if (f->bsize > bsize_max) 1421 bsize_max = f->bsize; 1422 1423 rulemap[i].n = ret; 1424 1425 start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); 1426 end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); 1427 } 1428 1429 if (!*get_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { 1430 put_cpu_ptr(m->scratch); 1431 1432 err = pipapo_realloc_scratch(m, bsize_max); 1433 if (err) 1434 return err; 1435 1436 m->bsize_max = bsize_max; 1437 } else { 1438 put_cpu_ptr(m->scratch); 1439 } 1440 1441 e = nft_elem_priv_cast(elem->priv); 1442 *elem_priv = &e->priv; 1443 1444 pipapo_map(m, rulemap, e); 1445 1446 return 0; 1447 } 1448 1449 /** 1450 * pipapo_clone() - Clone matching data to create new working copy 1451 * @old: Existing matching data 1452 * 1453 * Return: copy of matching data passed as 'old' or NULL. 1454 */ 1455 static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) 1456 { 1457 struct nft_pipapo_field *dst, *src; 1458 struct nft_pipapo_match *new; 1459 int i; 1460 1461 new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL_ACCOUNT); 1462 if (!new) 1463 return NULL; 1464 1465 new->field_count = old->field_count; 1466 new->bsize_max = old->bsize_max; 1467 1468 new->scratch = alloc_percpu(*new->scratch); 1469 if (!new->scratch) 1470 goto out_scratch; 1471 1472 for_each_possible_cpu(i) 1473 *per_cpu_ptr(new->scratch, i) = NULL; 1474 1475 if (pipapo_realloc_scratch(new, old->bsize_max)) 1476 goto out_scratch_realloc; 1477 1478 rcu_head_init(&new->rcu); 1479 1480 src = old->f; 1481 dst = new->f; 1482 1483 for (i = 0; i < old->field_count; i++) { 1484 unsigned long *new_lt; 1485 ssize_t lt_size; 1486 1487 memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); 1488 1489 lt_size = lt_calculate_size(src->groups, src->bb, src->bsize); 1490 if (lt_size < 0) 1491 goto out_lt; 1492 1493 new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); 1494 if (!new_lt) 1495 goto out_lt; 1496 1497 dst->lt = new_lt; 1498 1499 memcpy(NFT_PIPAPO_LT_ALIGN(new_lt), 1500 NFT_PIPAPO_LT_ALIGN(src->lt), 1501 src->bsize * sizeof(*dst->lt) * 1502 src->groups * NFT_PIPAPO_BUCKETS(src->bb)); 1503 1504 if (src->rules > 0) { 1505 if (src->rules_alloc > (INT_MAX / sizeof(*src->mt))) 1506 goto out_mt; 1507 1508 dst->mt = kvmalloc_array(src->rules_alloc, 1509 sizeof(*src->mt), 1510 GFP_KERNEL_ACCOUNT); 1511 if (!dst->mt) 1512 goto out_mt; 1513 1514 memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); 1515 } else { 1516 dst->mt = NULL; 1517 dst->rules_alloc = 0; 1518 } 1519 1520 src++; 1521 dst++; 1522 } 1523 1524 return new; 1525 1526 out_mt: 1527 kvfree(dst->lt); 1528 out_lt: 1529 for (dst--; i > 0; i--) { 1530 kvfree(dst->mt); 1531 kvfree(dst->lt); 1532 dst--; 1533 } 1534 out_scratch_realloc: 1535 for_each_possible_cpu(i) 1536 pipapo_free_scratch(new, i); 1537 out_scratch: 1538 free_percpu(new->scratch); 1539 kfree(new); 1540 1541 return NULL; 1542 } 1543 1544 /** 1545 * pipapo_rules_same_key() - Get number of rules originated from the same entry 1546 * @f: Field containing mapping table 1547 * @first: Index of first rule in set of rules mapping to same entry 1548 * 1549 * Using the fact that all rules in a field that originated from the same entry 1550 * will map to the same set of rules in the next field, or to the same element 1551 * reference, return the cardinality of the set of rules that originated from 1552 * the same entry as the rule with index @first, @first rule included. 1553 * 1554 * In pictures: 1555 * rules 1556 * field #0 0 1 2 3 4 1557 * map to: 0 1 2-4 2-4 5-9 1558 * . . ....... . ... 1559 * | | | | \ \ 1560 * | | | | \ \ 1561 * | | | | \ \ 1562 * ' ' ' ' ' \ 1563 * in field #1 0 1 2 3 4 5 ... 1564 * 1565 * if this is called for rule 2 on field #0, it will return 3, as also rules 2 1566 * and 3 in field 0 map to the same set of rules (2, 3, 4) in the next field. 1567 * 1568 * For the last field in a set, we can rely on associated entries to map to the 1569 * same element references. 1570 * 1571 * Return: Number of rules that originated from the same entry as @first. 1572 */ 1573 static unsigned int pipapo_rules_same_key(struct nft_pipapo_field *f, unsigned int first) 1574 { 1575 struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ 1576 unsigned int r; 1577 1578 for (r = first; r < f->rules; r++) { 1579 if (r != first && e != f->mt[r].e) 1580 return r - first; 1581 1582 e = f->mt[r].e; 1583 } 1584 1585 if (r != first) 1586 return r - first; 1587 1588 return 0; 1589 } 1590 1591 /** 1592 * pipapo_unmap() - Remove rules from mapping tables, renumber remaining ones 1593 * @mt: Mapping array 1594 * @rules: Original amount of rules in mapping table 1595 * @start: First rule index to be removed 1596 * @n: Amount of rules to be removed 1597 * @to_offset: First rule index, in next field, this group of rules maps to 1598 * @is_last: If this is the last field, delete reference from mapping array 1599 * 1600 * This is used to unmap rules from the mapping table for a single field, 1601 * maintaining consistency and compactness for the existing ones. 1602 * 1603 * In pictures: let's assume that we want to delete rules 2 and 3 from the 1604 * following mapping array: 1605 * 1606 * rules 1607 * 0 1 2 3 4 1608 * map to: 4-10 4-10 11-15 11-15 16-18 1609 * 1610 * the result will be: 1611 * 1612 * rules 1613 * 0 1 2 1614 * map to: 4-10 4-10 11-13 1615 * 1616 * for fields before the last one. In case this is the mapping table for the 1617 * last field in a set, and rules map to pointers to &struct nft_pipapo_elem: 1618 * 1619 * rules 1620 * 0 1 2 3 4 1621 * element pointers: 0x42 0x42 0x33 0x33 0x44 1622 * 1623 * the result will be: 1624 * 1625 * rules 1626 * 0 1 2 1627 * element pointers: 0x42 0x42 0x44 1628 */ 1629 static void pipapo_unmap(union nft_pipapo_map_bucket *mt, unsigned int rules, 1630 unsigned int start, unsigned int n, 1631 unsigned int to_offset, bool is_last) 1632 { 1633 int i; 1634 1635 memmove(mt + start, mt + start + n, (rules - start - n) * sizeof(*mt)); 1636 memset(mt + rules - n, 0, n * sizeof(*mt)); 1637 1638 if (is_last) 1639 return; 1640 1641 for (i = start; i < rules - n; i++) 1642 mt[i].to -= to_offset; 1643 } 1644 1645 /** 1646 * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map 1647 * @m: Matching data 1648 * @rulemap: Table of rule maps, arrays of first rule and amount of rules 1649 * in next field a given entry maps to, for each field 1650 * 1651 * For each rule in lookup table buckets mapping to this set of rules, drop 1652 * all bits set in lookup table mapping. In pictures, assuming we want to drop 1653 * rules 0 and 1 from this lookup table: 1654 * 1655 * bucket 1656 * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 1657 * 0 0 1,2 1658 * 1 1,2 0 1659 * 2 0 1,2 1660 * 3 0 1,2 1661 * 4 0,1,2 1662 * 5 0 1 2 1663 * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1664 * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 1665 * 1666 * rule 2 becomes rule 0, and the result will be: 1667 * 1668 * bucket 1669 * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 1670 * 0 0 1671 * 1 0 1672 * 2 0 1673 * 3 0 1674 * 4 0 1675 * 5 0 1676 * 6 0 1677 * 7 0 0 1678 * 1679 * once this is done, call unmap() to drop all the corresponding rule references 1680 * from mapping tables. 1681 */ 1682 static void pipapo_drop(struct nft_pipapo_match *m, 1683 union nft_pipapo_map_bucket rulemap[]) 1684 { 1685 struct nft_pipapo_field *f; 1686 int i; 1687 1688 nft_pipapo_for_each_field(f, i, m) { 1689 int g; 1690 1691 for (g = 0; g < f->groups; g++) { 1692 unsigned long *pos; 1693 int b; 1694 1695 pos = NFT_PIPAPO_LT_ALIGN(f->lt) + g * 1696 NFT_PIPAPO_BUCKETS(f->bb) * f->bsize; 1697 1698 for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { 1699 bitmap_cut(pos, pos, rulemap[i].to, 1700 rulemap[i].n, 1701 f->bsize * BITS_PER_LONG); 1702 1703 pos += f->bsize; 1704 } 1705 } 1706 1707 pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n, 1708 rulemap[i + 1].n, i == m->field_count - 1); 1709 if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) { 1710 /* We can ignore this, a failure to shrink tables down 1711 * doesn't make tables invalid. 1712 */ 1713 ; 1714 } 1715 f->rules -= rulemap[i].n; 1716 1717 pipapo_lt_bits_adjust(f); 1718 } 1719 } 1720 1721 static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, 1722 struct nft_pipapo_elem *e) 1723 1724 { 1725 nft_setelem_data_deactivate(net, set, &e->priv); 1726 } 1727 1728 /** 1729 * pipapo_gc() - Drop expired entries from set, destroy start and end elements 1730 * @set: nftables API set representation 1731 * @m: Matching data 1732 */ 1733 static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) 1734 { 1735 struct nft_pipapo *priv = nft_set_priv(set); 1736 struct net *net = read_pnet(&set->net); 1737 unsigned int rules_f0, first_rule = 0; 1738 u64 tstamp = nft_net_tstamp(net); 1739 struct nft_pipapo_elem *e; 1740 struct nft_trans_gc *gc; 1741 1742 gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); 1743 if (!gc) 1744 return; 1745 1746 while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { 1747 union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; 1748 const struct nft_pipapo_field *f; 1749 unsigned int i, start, rules_fx; 1750 1751 start = first_rule; 1752 rules_fx = rules_f0; 1753 1754 nft_pipapo_for_each_field(f, i, m) { 1755 rulemap[i].to = start; 1756 rulemap[i].n = rules_fx; 1757 1758 if (i < m->field_count - 1) { 1759 rules_fx = f->mt[start].n; 1760 start = f->mt[start].to; 1761 } 1762 } 1763 1764 /* Pick the last field, and its last index */ 1765 f--; 1766 i--; 1767 e = f->mt[rulemap[i].to].e; 1768 1769 /* synchronous gc never fails, there is no need to set on 1770 * NFT_SET_ELEM_DEAD_BIT. 1771 */ 1772 if (__nft_set_elem_expired(&e->ext, tstamp)) { 1773 gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); 1774 if (!gc) 1775 return; 1776 1777 nft_pipapo_gc_deactivate(net, set, e); 1778 pipapo_drop(m, rulemap); 1779 nft_trans_gc_elem_add(gc, e); 1780 1781 /* And check again current first rule, which is now the 1782 * first we haven't checked. 1783 */ 1784 } else { 1785 first_rule += rules_f0; 1786 } 1787 } 1788 1789 gc = nft_trans_gc_catchall_sync(gc); 1790 if (gc) { 1791 nft_trans_gc_queue_sync_done(gc); 1792 priv->last_gc = jiffies; 1793 } 1794 } 1795 1796 /** 1797 * pipapo_free_fields() - Free per-field tables contained in matching data 1798 * @m: Matching data 1799 */ 1800 static void pipapo_free_fields(struct nft_pipapo_match *m) 1801 { 1802 struct nft_pipapo_field *f; 1803 int i; 1804 1805 nft_pipapo_for_each_field(f, i, m) { 1806 kvfree(f->lt); 1807 kvfree(f->mt); 1808 } 1809 } 1810 1811 static void pipapo_free_match(struct nft_pipapo_match *m) 1812 { 1813 int i; 1814 1815 for_each_possible_cpu(i) 1816 pipapo_free_scratch(m, i); 1817 1818 free_percpu(m->scratch); 1819 pipapo_free_fields(m); 1820 1821 kfree(m); 1822 } 1823 1824 /** 1825 * pipapo_reclaim_match - RCU callback to free fields from old matching data 1826 * @rcu: RCU head 1827 */ 1828 static void pipapo_reclaim_match(struct rcu_head *rcu) 1829 { 1830 struct nft_pipapo_match *m; 1831 1832 m = container_of(rcu, struct nft_pipapo_match, rcu); 1833 pipapo_free_match(m); 1834 } 1835 1836 /** 1837 * nft_pipapo_commit() - Replace lookup data with current working copy 1838 * @set: nftables API set representation 1839 * 1840 * While at it, check if we should perform garbage collection on the working 1841 * copy before committing it for lookup, and don't replace the table if the 1842 * working copy doesn't have pending changes. 1843 * 1844 * We also need to create a new working copy for subsequent insertions and 1845 * deletions. 1846 */ 1847 static void nft_pipapo_commit(struct nft_set *set) 1848 { 1849 struct nft_pipapo *priv = nft_set_priv(set); 1850 struct nft_pipapo_match *old; 1851 1852 if (!priv->clone) 1853 return; 1854 1855 if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) 1856 pipapo_gc(set, priv->clone); 1857 1858 old = rcu_replace_pointer(priv->match, priv->clone, 1859 nft_pipapo_transaction_mutex_held(set)); 1860 priv->clone = NULL; 1861 1862 if (old) 1863 call_rcu(&old->rcu, pipapo_reclaim_match); 1864 } 1865 1866 static void nft_pipapo_abort(const struct nft_set *set) 1867 { 1868 struct nft_pipapo *priv = nft_set_priv(set); 1869 1870 if (!priv->clone) 1871 return; 1872 pipapo_free_match(priv->clone); 1873 priv->clone = NULL; 1874 } 1875 1876 /** 1877 * nft_pipapo_activate() - Mark element reference as active given key, commit 1878 * @net: Network namespace 1879 * @set: nftables API set representation 1880 * @elem_priv: nftables API element representation containing key data 1881 * 1882 * On insertion, elements are added to a copy of the matching data currently 1883 * in use for lookups, and not directly inserted into current lookup data. Both 1884 * nft_pipapo_insert() and nft_pipapo_activate() are called once for each 1885 * element, hence we can't purpose either one as a real commit operation. 1886 */ 1887 static void nft_pipapo_activate(const struct net *net, 1888 const struct nft_set *set, 1889 struct nft_elem_priv *elem_priv) 1890 { 1891 struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); 1892 1893 nft_clear(net, &e->ext); 1894 } 1895 1896 /** 1897 * nft_pipapo_deactivate() - Search for element and make it inactive 1898 * @net: Network namespace 1899 * @set: nftables API set representation 1900 * @elem: nftables API element representation containing key data 1901 * 1902 * Return: deactivated element if found, NULL otherwise. 1903 */ 1904 static struct nft_elem_priv * 1905 nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, 1906 const struct nft_set_elem *elem) 1907 { 1908 struct nft_pipapo_match *m = pipapo_maybe_clone(set); 1909 struct nft_pipapo_elem *e; 1910 1911 /* removal must occur on priv->clone, if we are low on memory 1912 * we have no choice and must fail the removal request. 1913 */ 1914 if (!m) 1915 return NULL; 1916 1917 e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data, 1918 nft_genmask_next(net), nft_net_tstamp(net), GFP_KERNEL); 1919 if (IS_ERR(e)) 1920 return NULL; 1921 1922 nft_set_elem_change_active(net, set, &e->ext); 1923 1924 return &e->priv; 1925 } 1926 1927 /** 1928 * nft_pipapo_flush() - make element inactive 1929 * @net: Network namespace 1930 * @set: nftables API set representation 1931 * @elem_priv: nftables API element representation containing key data 1932 * 1933 * This is functionally the same as nft_pipapo_deactivate(), with a slightly 1934 * different interface, and it's also called once for each element in a set 1935 * being flushed, so we can't implement, strictly speaking, a flush operation, 1936 * which would otherwise be as simple as allocating an empty copy of the 1937 * matching data. 1938 * 1939 * Note that we could in theory do that, mark the set as flushed, and ignore 1940 * subsequent calls, but we would leak all the elements after the first one, 1941 * because they wouldn't then be freed as result of API calls. 1942 * 1943 * Return: true if element was found and deactivated. 1944 */ 1945 static void nft_pipapo_flush(const struct net *net, const struct nft_set *set, 1946 struct nft_elem_priv *elem_priv) 1947 { 1948 struct nft_pipapo_elem *e = nft_elem_priv_cast(elem_priv); 1949 1950 nft_set_elem_change_active(net, set, &e->ext); 1951 } 1952 1953 /** 1954 * pipapo_get_boundaries() - Get byte interval for associated rules 1955 * @f: Field including lookup table 1956 * @first_rule: First rule (lowest index) 1957 * @rule_count: Number of associated rules 1958 * @left: Byte expression for left boundary (start of range) 1959 * @right: Byte expression for right boundary (end of range) 1960 * 1961 * Given the first rule and amount of rules that originated from the same entry, 1962 * build the original range associated with the entry, and calculate the length 1963 * of the originating netmask. 1964 * 1965 * In pictures: 1966 * 1967 * bucket 1968 * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 1969 * 0 1,2 1970 * 1 1,2 1971 * 2 1,2 1972 * 3 1,2 1973 * 4 1,2 1974 * 5 1 2 1975 * 6 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1976 * 7 1,2 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1977 * 1978 * this is the lookup table corresponding to the IPv4 range 1979 * 192.168.1.0-192.168.2.1, which was expanded to the two composing netmasks, 1980 * rule #1: 192.168.1.0/24, and rule #2: 192.168.2.0/31. 1981 * 1982 * This function fills @left and @right with the byte values of the leftmost 1983 * and rightmost bucket indices for the lowest and highest rule indices, 1984 * respectively. If @first_rule is 1 and @rule_count is 2, we obtain, in 1985 * nibbles: 1986 * left: < 12, 0, 10, 8, 0, 1, 0, 0 > 1987 * right: < 12, 0, 10, 8, 0, 2, 2, 1 > 1988 * corresponding to bytes: 1989 * left: < 192, 168, 1, 0 > 1990 * right: < 192, 168, 2, 1 > 1991 * with mask length irrelevant here, unused on return, as the range is already 1992 * defined by its start and end points. The mask length is relevant for a single 1993 * ranged entry instead: if @first_rule is 1 and @rule_count is 1, we ignore 1994 * rule 2 above: @left becomes < 192, 168, 1, 0 >, @right becomes 1995 * < 192, 168, 1, 255 >, and the mask length, calculated from the distances 1996 * between leftmost and rightmost bucket indices for each group, would be 24. 1997 * 1998 * Return: mask length, in bits. 1999 */ 2000 static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule, 2001 int rule_count, u8 *left, u8 *right) 2002 { 2003 int g, mask_len = 0, bit_offset = 0; 2004 u8 *l = left, *r = right; 2005 2006 for (g = 0; g < f->groups; g++) { 2007 int b, x0, x1; 2008 2009 x0 = -1; 2010 x1 = -1; 2011 for (b = 0; b < NFT_PIPAPO_BUCKETS(f->bb); b++) { 2012 unsigned long *pos; 2013 2014 pos = NFT_PIPAPO_LT_ALIGN(f->lt) + 2015 (g * NFT_PIPAPO_BUCKETS(f->bb) + b) * f->bsize; 2016 if (test_bit(first_rule, pos) && x0 == -1) 2017 x0 = b; 2018 if (test_bit(first_rule + rule_count - 1, pos)) 2019 x1 = b; 2020 } 2021 2022 *l |= x0 << (BITS_PER_BYTE - f->bb - bit_offset); 2023 *r |= x1 << (BITS_PER_BYTE - f->bb - bit_offset); 2024 2025 bit_offset += f->bb; 2026 if (bit_offset >= BITS_PER_BYTE) { 2027 bit_offset %= BITS_PER_BYTE; 2028 l++; 2029 r++; 2030 } 2031 2032 if (x1 - x0 == 0) 2033 mask_len += 4; 2034 else if (x1 - x0 == 1) 2035 mask_len += 3; 2036 else if (x1 - x0 == 3) 2037 mask_len += 2; 2038 else if (x1 - x0 == 7) 2039 mask_len += 1; 2040 } 2041 2042 return mask_len; 2043 } 2044 2045 /** 2046 * pipapo_match_field() - Match rules against byte ranges 2047 * @f: Field including the lookup table 2048 * @first_rule: First of associated rules originating from same entry 2049 * @rule_count: Amount of associated rules 2050 * @start: Start of range to be matched 2051 * @end: End of range to be matched 2052 * 2053 * Return: true on match, false otherwise. 2054 */ 2055 static bool pipapo_match_field(struct nft_pipapo_field *f, 2056 int first_rule, int rule_count, 2057 const u8 *start, const u8 *end) 2058 { 2059 u8 right[NFT_PIPAPO_MAX_BYTES] = { 0 }; 2060 u8 left[NFT_PIPAPO_MAX_BYTES] = { 0 }; 2061 2062 pipapo_get_boundaries(f, first_rule, rule_count, left, right); 2063 2064 return !memcmp(start, left, 2065 f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)) && 2066 !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f)); 2067 } 2068 2069 /** 2070 * nft_pipapo_remove() - Remove element given key, commit 2071 * @net: Network namespace 2072 * @set: nftables API set representation 2073 * @elem_priv: nftables API element representation containing key data 2074 * 2075 * Similarly to nft_pipapo_activate(), this is used as commit operation by the 2076 * API, but it's called once per element in the pending transaction, so we can't 2077 * implement this as a single commit operation. Closest we can get is to remove 2078 * the matched element here, if any, and commit the updated matching data. 2079 */ 2080 static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, 2081 struct nft_elem_priv *elem_priv) 2082 { 2083 struct nft_pipapo *priv = nft_set_priv(set); 2084 struct nft_pipapo_match *m = priv->clone; 2085 unsigned int rules_f0, first_rule = 0; 2086 struct nft_pipapo_elem *e; 2087 const u8 *data; 2088 2089 e = nft_elem_priv_cast(elem_priv); 2090 data = (const u8 *)nft_set_ext_key(&e->ext); 2091 2092 while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { 2093 union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; 2094 const u8 *match_start, *match_end; 2095 struct nft_pipapo_field *f; 2096 int i, start, rules_fx; 2097 2098 match_start = data; 2099 2100 if (nft_set_ext_exists(&e->ext, NFT_SET_EXT_KEY_END)) 2101 match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; 2102 else 2103 match_end = data; 2104 2105 start = first_rule; 2106 rules_fx = rules_f0; 2107 2108 nft_pipapo_for_each_field(f, i, m) { 2109 bool last = i == m->field_count - 1; 2110 2111 if (!pipapo_match_field(f, start, rules_fx, 2112 match_start, match_end)) 2113 break; 2114 2115 rulemap[i].to = start; 2116 rulemap[i].n = rules_fx; 2117 2118 rules_fx = f->mt[start].n; 2119 start = f->mt[start].to; 2120 2121 match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); 2122 match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); 2123 2124 if (last && f->mt[rulemap[i].to].e == e) { 2125 pipapo_drop(m, rulemap); 2126 return; 2127 } 2128 } 2129 2130 first_rule += rules_f0; 2131 } 2132 2133 WARN_ON_ONCE(1); /* elem_priv not found */ 2134 } 2135 2136 /** 2137 * nft_pipapo_do_walk() - Walk over elements in m 2138 * @ctx: nftables API context 2139 * @set: nftables API set representation 2140 * @m: matching data pointing to key mapping array 2141 * @iter: Iterator 2142 * 2143 * As elements are referenced in the mapping array for the last field, directly 2144 * scan that array: there's no need to follow rule mappings from the first 2145 * field. @m is protected either by RCU read lock or by transaction mutex. 2146 */ 2147 static void nft_pipapo_do_walk(const struct nft_ctx *ctx, struct nft_set *set, 2148 const struct nft_pipapo_match *m, 2149 struct nft_set_iter *iter) 2150 { 2151 const struct nft_pipapo_field *f; 2152 unsigned int i, r; 2153 2154 for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) 2155 ; 2156 2157 for (r = 0; r < f->rules; r++) { 2158 struct nft_pipapo_elem *e; 2159 2160 if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) 2161 continue; 2162 2163 if (iter->count < iter->skip) 2164 goto cont; 2165 2166 e = f->mt[r].e; 2167 2168 iter->err = iter->fn(ctx, set, iter, &e->priv); 2169 if (iter->err < 0) 2170 return; 2171 2172 cont: 2173 iter->count++; 2174 } 2175 } 2176 2177 /** 2178 * nft_pipapo_walk() - Walk over elements 2179 * @ctx: nftables API context 2180 * @set: nftables API set representation 2181 * @iter: Iterator 2182 * 2183 * Test if destructive action is needed or not, clone active backend if needed 2184 * and call the real function to work on the data. 2185 */ 2186 static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, 2187 struct nft_set_iter *iter) 2188 { 2189 struct nft_pipapo *priv = nft_set_priv(set); 2190 const struct nft_pipapo_match *m; 2191 2192 switch (iter->type) { 2193 case NFT_ITER_UPDATE: 2194 m = pipapo_maybe_clone(set); 2195 if (!m) { 2196 iter->err = -ENOMEM; 2197 return; 2198 } 2199 2200 nft_pipapo_do_walk(ctx, set, m, iter); 2201 break; 2202 case NFT_ITER_READ: 2203 rcu_read_lock(); 2204 m = rcu_dereference(priv->match); 2205 nft_pipapo_do_walk(ctx, set, m, iter); 2206 rcu_read_unlock(); 2207 break; 2208 default: 2209 iter->err = -EINVAL; 2210 WARN_ON_ONCE(1); 2211 break; 2212 } 2213 } 2214 2215 /** 2216 * nft_pipapo_privsize() - Return the size of private data for the set 2217 * @nla: netlink attributes, ignored as size doesn't depend on them 2218 * @desc: Set description, ignored as size doesn't depend on it 2219 * 2220 * Return: size of private data for this set implementation, in bytes 2221 */ 2222 static u64 nft_pipapo_privsize(const struct nlattr * const nla[], 2223 const struct nft_set_desc *desc) 2224 { 2225 return sizeof(struct nft_pipapo); 2226 } 2227 2228 /** 2229 * nft_pipapo_estimate() - Set size, space and lookup complexity 2230 * @desc: Set description, element count and field description used 2231 * @features: Flags: NFT_SET_INTERVAL needs to be there 2232 * @est: Storage for estimation data 2233 * 2234 * Return: true if set description is compatible, false otherwise 2235 */ 2236 static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features, 2237 struct nft_set_estimate *est) 2238 { 2239 if (!(features & NFT_SET_INTERVAL) || 2240 desc->field_count < NFT_PIPAPO_MIN_FIELDS) 2241 return false; 2242 2243 est->size = pipapo_estimate_size(desc); 2244 if (!est->size) 2245 return false; 2246 2247 est->lookup = NFT_SET_CLASS_O_LOG_N; 2248 2249 est->space = NFT_SET_CLASS_O_N; 2250 2251 return true; 2252 } 2253 2254 /** 2255 * nft_pipapo_init() - Initialise data for a set instance 2256 * @set: nftables API set representation 2257 * @desc: Set description 2258 * @nla: netlink attributes 2259 * 2260 * Validate number and size of fields passed as NFTA_SET_DESC_CONCAT netlink 2261 * attributes, initialise internal set parameters, current instance of matching 2262 * data and a copy for subsequent insertions. 2263 * 2264 * Return: 0 on success, negative error code on failure. 2265 */ 2266 static int nft_pipapo_init(const struct nft_set *set, 2267 const struct nft_set_desc *desc, 2268 const struct nlattr * const nla[]) 2269 { 2270 struct nft_pipapo *priv = nft_set_priv(set); 2271 struct nft_pipapo_match *m; 2272 struct nft_pipapo_field *f; 2273 int err, i, field_count; 2274 2275 BUILD_BUG_ON(offsetof(struct nft_pipapo_elem, priv) != 0); 2276 2277 field_count = desc->field_count ? : 1; 2278 2279 BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS > 255); 2280 BUILD_BUG_ON(NFT_PIPAPO_MAX_FIELDS != NFT_REG32_COUNT); 2281 2282 if (field_count > NFT_PIPAPO_MAX_FIELDS) 2283 return -EINVAL; 2284 2285 m = kmalloc(struct_size(m, f, field_count), GFP_KERNEL); 2286 if (!m) 2287 return -ENOMEM; 2288 2289 m->field_count = field_count; 2290 m->bsize_max = 0; 2291 2292 m->scratch = alloc_percpu(struct nft_pipapo_scratch *); 2293 if (!m->scratch) { 2294 err = -ENOMEM; 2295 goto out_scratch; 2296 } 2297 for_each_possible_cpu(i) 2298 *per_cpu_ptr(m->scratch, i) = NULL; 2299 2300 rcu_head_init(&m->rcu); 2301 2302 nft_pipapo_for_each_field(f, i, m) { 2303 unsigned int len = desc->field_len[i] ? : set->klen; 2304 2305 /* f->groups is u8 */ 2306 BUILD_BUG_ON((NFT_PIPAPO_MAX_BYTES * 2307 BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS_LARGE_SET) >= 256); 2308 2309 f->bb = NFT_PIPAPO_GROUP_BITS_INIT; 2310 f->groups = len * NFT_PIPAPO_GROUPS_PER_BYTE(f); 2311 2312 priv->width += round_up(len, sizeof(u32)); 2313 2314 f->bsize = 0; 2315 f->rules = 0; 2316 f->rules_alloc = 0; 2317 f->lt = NULL; 2318 f->mt = NULL; 2319 } 2320 2321 rcu_assign_pointer(priv->match, m); 2322 2323 return 0; 2324 2325 out_scratch: 2326 kfree(m); 2327 2328 return err; 2329 } 2330 2331 /** 2332 * nft_set_pipapo_match_destroy() - Destroy elements from key mapping array 2333 * @ctx: context 2334 * @set: nftables API set representation 2335 * @m: matching data pointing to key mapping array 2336 */ 2337 static void nft_set_pipapo_match_destroy(const struct nft_ctx *ctx, 2338 const struct nft_set *set, 2339 struct nft_pipapo_match *m) 2340 { 2341 struct nft_pipapo_field *f; 2342 unsigned int i, r; 2343 2344 for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) 2345 ; 2346 2347 for (r = 0; r < f->rules; r++) { 2348 struct nft_pipapo_elem *e; 2349 2350 if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) 2351 continue; 2352 2353 e = f->mt[r].e; 2354 2355 nf_tables_set_elem_destroy(ctx, set, &e->priv); 2356 } 2357 } 2358 2359 /** 2360 * nft_pipapo_destroy() - Free private data for set and all committed elements 2361 * @ctx: context 2362 * @set: nftables API set representation 2363 */ 2364 static void nft_pipapo_destroy(const struct nft_ctx *ctx, 2365 const struct nft_set *set) 2366 { 2367 struct nft_pipapo *priv = nft_set_priv(set); 2368 struct nft_pipapo_match *m; 2369 2370 m = rcu_dereference_protected(priv->match, true); 2371 2372 if (priv->clone) { 2373 nft_set_pipapo_match_destroy(ctx, set, priv->clone); 2374 pipapo_free_match(priv->clone); 2375 priv->clone = NULL; 2376 } else { 2377 nft_set_pipapo_match_destroy(ctx, set, m); 2378 } 2379 2380 pipapo_free_match(m); 2381 } 2382 2383 /** 2384 * nft_pipapo_gc_init() - Initialise garbage collection 2385 * @set: nftables API set representation 2386 * 2387 * Instead of actually setting up a periodic work for garbage collection, as 2388 * this operation requires a swap of matching data with the working copy, we'll 2389 * do that opportunistically with other commit operations if the interval is 2390 * elapsed, so we just need to set the current jiffies timestamp here. 2391 */ 2392 static void nft_pipapo_gc_init(const struct nft_set *set) 2393 { 2394 struct nft_pipapo *priv = nft_set_priv(set); 2395 2396 priv->last_gc = jiffies; 2397 } 2398 2399 const struct nft_set_type nft_set_pipapo_type = { 2400 .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | 2401 NFT_SET_TIMEOUT, 2402 .ops = { 2403 .lookup = nft_pipapo_lookup, 2404 .insert = nft_pipapo_insert, 2405 .activate = nft_pipapo_activate, 2406 .deactivate = nft_pipapo_deactivate, 2407 .flush = nft_pipapo_flush, 2408 .remove = nft_pipapo_remove, 2409 .walk = nft_pipapo_walk, 2410 .get = nft_pipapo_get, 2411 .privsize = nft_pipapo_privsize, 2412 .estimate = nft_pipapo_estimate, 2413 .init = nft_pipapo_init, 2414 .destroy = nft_pipapo_destroy, 2415 .gc_init = nft_pipapo_gc_init, 2416 .commit = nft_pipapo_commit, 2417 .abort = nft_pipapo_abort, 2418 .elemsize = offsetof(struct nft_pipapo_elem, ext), 2419 }, 2420 }; 2421 2422 #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) 2423 const struct nft_set_type nft_set_pipapo_avx2_type = { 2424 .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | 2425 NFT_SET_TIMEOUT, 2426 .ops = { 2427 .lookup = nft_pipapo_avx2_lookup, 2428 .insert = nft_pipapo_insert, 2429 .activate = nft_pipapo_activate, 2430 .deactivate = nft_pipapo_deactivate, 2431 .flush = nft_pipapo_flush, 2432 .remove = nft_pipapo_remove, 2433 .walk = nft_pipapo_walk, 2434 .get = nft_pipapo_get, 2435 .privsize = nft_pipapo_privsize, 2436 .estimate = nft_pipapo_avx2_estimate, 2437 .init = nft_pipapo_init, 2438 .destroy = nft_pipapo_destroy, 2439 .gc_init = nft_pipapo_gc_init, 2440 .commit = nft_pipapo_commit, 2441 .abort = nft_pipapo_abort, 2442 .elemsize = offsetof(struct nft_pipapo_elem, ext), 2443 }, 2444 }; 2445 #endif 2446