1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/kernel.h>
3 #include <linux/init.h>
4 #include <linux/module.h>
5 #include <linux/netfilter.h>
6 #include <linux/rhashtable.h>
7 #include <linux/netdevice.h>
8 #include <net/ip.h>
9 #include <net/ip6_route.h>
10 #include <net/netfilter/nf_tables.h>
11 #include <net/netfilter/nf_flow_table.h>
12 #include <net/netfilter/nf_conntrack.h>
13 #include <net/netfilter/nf_conntrack_core.h>
14 #include <net/netfilter/nf_conntrack_l4proto.h>
15 #include <net/netfilter/nf_conntrack_tuple.h>
16
17 static DEFINE_MUTEX(flowtable_lock);
18 static LIST_HEAD(flowtables);
19 static __read_mostly struct kmem_cache *flow_offload_cachep;
20
21 static void
flow_offload_fill_dir(struct flow_offload * flow,enum flow_offload_tuple_dir dir)22 flow_offload_fill_dir(struct flow_offload *flow,
23 enum flow_offload_tuple_dir dir)
24 {
25 struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
26 struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;
27
28 ft->dir = dir;
29
30 switch (ctt->src.l3num) {
31 case NFPROTO_IPV4:
32 ft->src_v4 = ctt->src.u3.in;
33 ft->dst_v4 = ctt->dst.u3.in;
34 break;
35 case NFPROTO_IPV6:
36 ft->src_v6 = ctt->src.u3.in6;
37 ft->dst_v6 = ctt->dst.u3.in6;
38 break;
39 }
40
41 ft->l3proto = ctt->src.l3num;
42 ft->l4proto = ctt->dst.protonum;
43
44 switch (ctt->dst.protonum) {
45 case IPPROTO_TCP:
46 case IPPROTO_UDP:
47 ft->src_port = ctt->src.u.tcp.port;
48 ft->dst_port = ctt->dst.u.tcp.port;
49 break;
50 }
51 }
52
/* Allocate a flow entry for conntrack entry @ct.
 *
 * Takes a reference on @ct that is released in flow_offload_free().
 * Returns NULL if @ct is already dying or on allocation failure.
 */
struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
{
	struct flow_offload *flow;

	/* Do not offload an entry that is already on its way out. */
	if (unlikely(nf_ct_is_dying(ct)))
		return NULL;

	flow = kmem_cache_zalloc(flow_offload_cachep, GFP_ATOMIC);
	if (!flow)
		return NULL;

	/* The flow pins the conntrack entry; dropped via nf_ct_put()
	 * in flow_offload_free().
	 */
	refcount_inc(&ct->ct_general.use);
	flow->ct = ct;

	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);

	/* Mirror the conntrack NAT status into the flow flags. */
	if (ct->status & IPS_SRC_NAT)
		__set_bit(NF_FLOW_SNAT, &flow->flags);
	if (ct->status & IPS_DST_NAT)
		__set_bit(NF_FLOW_DNAT, &flow->flags);

	return flow;
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);
78
flow_offload_dst_cookie(struct flow_offload_tuple * flow_tuple)79 static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple)
80 {
81 if (flow_tuple->l3proto == NFPROTO_IPV6)
82 return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache));
83
84 return 0;
85 }
86
nft_route_dst_fetch(struct nf_flow_route * route,enum flow_offload_tuple_dir dir)87 static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route,
88 enum flow_offload_tuple_dir dir)
89 {
90 struct dst_entry *dst = route->tuple[dir].dst;
91
92 route->tuple[dir].dst = NULL;
93
94 return dst;
95 }
96
/* Copy route information for direction @dir from @route into the flow
 * tuple.  Ownership of the cached dst is taken from @route: for the
 * NEIGH/XFRM xmit types it is stored in the tuple (released later via
 * nft_flow_dst_release()), for DIRECT xmit it is released right away.
 * Always returns 0.
 */
static int flow_offload_fill_route(struct flow_offload *flow,
				   struct nf_flow_route *route,
				   enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
	struct dst_entry *dst = nft_route_dst_fetch(route, dir);
	int i, j = 0;

	/* Record the forwarding MTU for the datapath. */
	switch (flow_tuple->l3proto) {
	case NFPROTO_IPV4:
		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
		break;
	case NFPROTO_IPV6:
		flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true);
		break;
	}

	flow_tuple->iifidx = route->tuple[dir].in.ifindex;
	/* Encap headers are copied in reverse order into the tuple,
	 * carrying over the per-level ingress-vlan marking.
	 */
	for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
		flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
		flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
		if (route->tuple[dir].in.ingress_vlans & BIT(i))
			flow_tuple->in_vlan_ingress |= BIT(j);
		j++;
	}

	flow_tuple->tun = route->tuple[dir].in.tun;
	flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
	flow_tuple->tun_num = route->tuple[dir].in.num_tuns;

	switch (route->tuple[dir].xmit_type) {
	case FLOW_OFFLOAD_XMIT_DIRECT:
		/* Precomputed ethernet header; the dst is not needed. */
		memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
		       ETH_ALEN);
		memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
		       ETH_ALEN);
		flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
		dst_release(dst);
		break;
	case FLOW_OFFLOAD_XMIT_XFRM:
	case FLOW_OFFLOAD_XMIT_NEIGH:
		/* Keep the dst reference for the datapath. */
		flow_tuple->ifidx = route->tuple[dir].out.ifindex;
		flow_tuple->dst_cache = dst;
		flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}
	flow_tuple->xmit_type = route->tuple[dir].xmit_type;

	return 0;
}
150
nft_flow_dst_release(struct flow_offload * flow,enum flow_offload_tuple_dir dir)151 static void nft_flow_dst_release(struct flow_offload *flow,
152 enum flow_offload_tuple_dir dir)
153 {
154 if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
155 flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
156 dst_release(flow->tuplehash[dir].tuple.dst_cache);
157 }
158
flow_offload_route_init(struct flow_offload * flow,struct nf_flow_route * route)159 void flow_offload_route_init(struct flow_offload *flow,
160 struct nf_flow_route *route)
161 {
162 flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
163 flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
164 flow->type = NF_FLOW_OFFLOAD_ROUTE;
165 }
166 EXPORT_SYMBOL_GPL(flow_offload_route_init);
167
nf_flow_has_expired(const struct flow_offload * flow)168 static inline bool nf_flow_has_expired(const struct flow_offload *flow)
169 {
170 return nf_flow_timeout_delta(flow->timeout) <= 0;
171 }
172
/* Resync conntrack's TCP tracking with @tcp_state after offload bypassed
 * it.  All updates are made under ct->lock to serialize against the
 * conntrack datapath.
 */
static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state)
{
	struct ip_ct_tcp *tcp = &ct->proto.tcp;

	spin_lock_bh(&ct->lock);
	if (tcp->state != tcp_state)
		tcp->state = tcp_state;

	/* syn packet triggers the TCP reopen case from conntrack. */
	if (tcp->state == TCP_CONNTRACK_CLOSE)
		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;

	/* Conntrack state is outdated due to offload bypass.
	 * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntracks
	 * TCP reset validation will fail.
	 */
	tcp->seen[0].td_maxwin = 0;
	tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
	tcp->seen[1].td_maxwin = 0;
	tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
	spin_unlock_bh(&ct->lock);
}
195
/* Hand @flow's conntrack entry back to classic conntrack processing:
 * resync TCP state if needed and refresh ct->timeout with a value that
 * matches the current protocol state, discounting the offload timeout
 * when the flow already expired.  Only TCP and UDP are handled; other
 * protocols return without touching the entry.
 */
static void flow_offload_fixup_ct(struct flow_offload *flow)
{
	struct nf_conn *ct = flow->ct;
	struct net *net = nf_ct_net(ct);
	int l4num = nf_ct_protonum(ct);
	bool expired, closing = false;
	u32 offload_timeout = 0;
	s32 timeout;

	if (l4num == IPPROTO_TCP) {
		const struct nf_tcp_net *tn = nf_tcp_pernet(net);
		u8 tcp_state;

		/* Enter CLOSE state if fin/rst packet has been seen, this
		 * allows TCP reopen from conntrack. Otherwise, pick up from
		 * the last seen TCP state.
		 */
		closing = test_bit(NF_FLOW_CLOSING, &flow->flags);
		if (closing) {
			flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE);
			timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]);
			expired = false;
		} else {
			tcp_state = READ_ONCE(ct->proto.tcp.state);
			flow_offload_fixup_tcp(ct, tcp_state);
			timeout = READ_ONCE(tn->timeouts[tcp_state]);
			expired = nf_flow_has_expired(flow);
		}
		offload_timeout = READ_ONCE(tn->offload_timeout);

	} else if (l4num == IPPROTO_UDP) {
		const struct nf_udp_net *tn = nf_udp_pernet(net);
		enum udp_conntrack state =
			test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
			UDP_CT_REPLIED : UDP_CT_UNREPLIED;

		timeout = READ_ONCE(tn->timeouts[state]);
		expired = nf_flow_has_expired(flow);
		offload_timeout = READ_ONCE(tn->offload_timeout);
	} else {
		return;
	}

	/* An expired flow has been idle at least the offload timeout;
	 * subtract it so the conntrack entry does not linger.
	 */
	if (expired)
		timeout -= offload_timeout;

	if (timeout < 0)
		timeout = 0;

	/* Only shrink ct->timeout when closing; otherwise never reduce
	 * a longer remaining lifetime.
	 */
	if (closing ||
	    nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
		nf_ct_refresh(ct, timeout);
}
249
flow_offload_route_release(struct flow_offload * flow)250 static void flow_offload_route_release(struct flow_offload *flow)
251 {
252 nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
253 nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
254 }
255
flow_offload_free(struct flow_offload * flow)256 void flow_offload_free(struct flow_offload *flow)
257 {
258 switch (flow->type) {
259 case NF_FLOW_OFFLOAD_ROUTE:
260 flow_offload_route_release(flow);
261 break;
262 default:
263 break;
264 }
265 nf_ct_put(flow->ct);
266 kfree_rcu(flow, rcu_head);
267 }
268 EXPORT_SYMBOL_GPL(flow_offload_free);
269
static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple *tuple = data;
	/* Hash only the lookup key: everything before the __hash marker. */
	u32 key_len = offsetof(struct flow_offload_tuple, __hash);

	return jhash(tuple, key_len, seed);
}
276
static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple_rhash *tuplehash = data;
	/* Must hash the same key span as flow_offload_hash(). */
	u32 key_len = offsetof(struct flow_offload_tuple, __hash);

	return jhash(&tuplehash->tuple, key_len, seed);
}
283
flow_offload_hash_cmp(struct rhashtable_compare_arg * arg,const void * ptr)284 static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
285 const void *ptr)
286 {
287 const struct flow_offload_tuple *tuple = arg->key;
288 const struct flow_offload_tuple_rhash *x = ptr;
289
290 if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
291 return 1;
292
293 return 0;
294 }
295
/* rhashtable layout for flow lookups: the key is the leading part of
 * struct flow_offload_tuple up to the __hash marker; both directions
 * of a flow are inserted as separate tuplehash nodes.
 */
static const struct rhashtable_params nf_flow_offload_rhash_params = {
	.head_offset = offsetof(struct flow_offload_tuple_rhash, node),
	.hashfn = flow_offload_hash,
	.obj_hashfn = flow_offload_hash_obj,
	.obj_cmpfn = flow_offload_hash_cmp,
	.automatic_shrinking = true,
};
303
flow_offload_get_timeout(struct flow_offload * flow)304 unsigned long flow_offload_get_timeout(struct flow_offload *flow)
305 {
306 unsigned long timeout = NF_FLOW_TIMEOUT;
307 struct net *net = nf_ct_net(flow->ct);
308 int l4num = nf_ct_protonum(flow->ct);
309
310 if (l4num == IPPROTO_TCP) {
311 struct nf_tcp_net *tn = nf_tcp_pernet(net);
312
313 timeout = tn->offload_timeout;
314 } else if (l4num == IPPROTO_UDP) {
315 struct nf_udp_net *tn = nf_udp_pernet(net);
316
317 timeout = tn->offload_timeout;
318 }
319
320 return timeout;
321 }
322
/* Insert @flow into @flow_table.
 *
 * Both tuple directions are added to the rhashtable; if the second
 * insertion fails, the first is rolled back.  On success the conntrack
 * timeout is extended (NF_CT_DAY) so conntrack does not evict the entry
 * while it is offloaded, and hardware offload is requested when the
 * table supports it.  Returns 0 or a negative rhashtable error.
 */
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
	int err;

	flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[0].node,
				     nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[1].node,
				     nf_flow_offload_rhash_params);
	if (err < 0) {
		/* Undo the first insertion to leave the table consistent. */
		rhashtable_remove_fast(&flow_table->rhashtable,
				       &flow->tuplehash[0].node,
				       nf_flow_offload_rhash_params);
		return err;
	}

	nf_ct_refresh(flow->ct, NF_CT_DAY);

	if (nf_flowtable_hw_offload(flow_table)) {
		__set_bit(NF_FLOW_HW, &flow->flags);
		nf_flow_offload_add(flow_table, flow);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);
355
/* Refresh @flow's software timeout and, when hardware offload is in use
 * and the flow is not closing, re-issue the hardware offload request.
 * Timeout writes are rate-limited to once per second unless @force.
 */
void flow_offload_refresh(struct nf_flowtable *flow_table,
			  struct flow_offload *flow, bool force)
{
	u32 fresh = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);

	if (!force && fresh - READ_ONCE(flow->timeout) <= HZ)
		return;

	WRITE_ONCE(flow->timeout, fresh);

	if (unlikely(nf_flowtable_hw_offload(flow_table)) &&
	    !test_bit(NF_FLOW_CLOSING, &flow->flags))
		nf_flow_offload_add(flow_table, flow);
}
EXPORT_SYMBOL_GPL(flow_offload_refresh);
374
flow_offload_del(struct nf_flowtable * flow_table,struct flow_offload * flow)375 static void flow_offload_del(struct nf_flowtable *flow_table,
376 struct flow_offload *flow)
377 {
378 rhashtable_remove_fast(&flow_table->rhashtable,
379 &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
380 nf_flow_offload_rhash_params);
381 rhashtable_remove_fast(&flow_table->rhashtable,
382 &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
383 nf_flow_offload_rhash_params);
384 flow_offload_free(flow);
385 }
386
/* Request removal of @flow: clear the conntrack offload bit and mark the
 * flow for teardown.  The ct fixup runs only on the first caller that
 * sets NF_FLOW_TEARDOWN; the GC worker performs the actual removal.
 */
void flow_offload_teardown(struct flow_offload *flow)
{
	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
	if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags))
		flow_offload_fixup_ct(flow);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);
394
395 struct flow_offload_tuple_rhash *
flow_offload_lookup(struct nf_flowtable * flow_table,struct flow_offload_tuple * tuple)396 flow_offload_lookup(struct nf_flowtable *flow_table,
397 struct flow_offload_tuple *tuple)
398 {
399 struct flow_offload_tuple_rhash *tuplehash;
400 struct flow_offload *flow;
401 int dir;
402
403 tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
404 nf_flow_offload_rhash_params);
405 if (!tuplehash)
406 return NULL;
407
408 dir = tuplehash->tuple.dir;
409 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
410 if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
411 return NULL;
412
413 if (unlikely(nf_ct_is_dying(flow->ct)))
414 return NULL;
415
416 return tuplehash;
417 }
418 EXPORT_SYMBOL_GPL(flow_offload_lookup);
419
/* Walk every flow in @flow_table and call @iter on it once.
 *
 * Only the ORIGINAL-direction tuplehash is visited; the REPLY entry of
 * the same flow is skipped so @iter sees each flow exactly once.
 * -EAGAIN from the rhashtable walker (table resize) is retried; any
 * other walker error aborts the walk and is returned.
 */
static int
nf_flow_table_iterate(struct nf_flowtable *flow_table,
		      void (*iter)(struct nf_flowtable *flowtable,
				   struct flow_offload *flow, void *data),
		      void *data)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct rhashtable_iter hti;
	struct flow_offload *flow;
	int err = 0;

	rhashtable_walk_enter(&flow_table->rhashtable, &hti);
	rhashtable_walk_start(&hti);

	while ((tuplehash = rhashtable_walk_next(&hti))) {
		if (IS_ERR(tuplehash)) {
			if (PTR_ERR(tuplehash) != -EAGAIN) {
				err = PTR_ERR(tuplehash);
				break;
			}
			continue;
		}
		/* Skip the REPLY direction; visit each flow only once. */
		if (tuplehash->tuple.dir)
			continue;

		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);

		iter(flow_table, flow, data);
	}
	rhashtable_walk_stop(&hti);
	rhashtable_walk_exit(&hti);

	return err;
}
454
nf_flow_custom_gc(struct nf_flowtable * flow_table,const struct flow_offload * flow)455 static bool nf_flow_custom_gc(struct nf_flowtable *flow_table,
456 const struct flow_offload *flow)
457 {
458 return flow_table->type->gc && flow_table->type->gc(flow);
459 }
460
461 /**
462 * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry
463 * @ct: Flowtable offloaded tcp ct
464 *
465 * Return: number of seconds when ct entry should expire.
466 */
nf_flow_table_tcp_timeout(const struct nf_conn * ct)467 static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct)
468 {
469 u8 state = READ_ONCE(ct->proto.tcp.state);
470
471 switch (state) {
472 case TCP_CONNTRACK_SYN_SENT:
473 case TCP_CONNTRACK_SYN_RECV:
474 return 0;
475 case TCP_CONNTRACK_ESTABLISHED:
476 return NF_CT_DAY;
477 case TCP_CONNTRACK_FIN_WAIT:
478 case TCP_CONNTRACK_CLOSE_WAIT:
479 case TCP_CONNTRACK_LAST_ACK:
480 case TCP_CONNTRACK_TIME_WAIT:
481 return 5 * 60 * HZ;
482 case TCP_CONNTRACK_CLOSE:
483 return 0;
484 }
485
486 return 0;
487 }
488
489 /**
490 * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry
491 * @ct: Flowtable offloaded ct
492 *
493 * Datapath lookups in the conntrack table will evict nf_conn entries
494 * if they have expired.
495 *
496 * Once nf_conn entries have been offloaded, nf_conntrack might not see any
497 * packets anymore. Thus ct->timeout is no longer refreshed and ct can
498 * be evicted.
499 *
500 * To avoid the need for an additional check on the offload bit for every
501 * packet processed via nf_conntrack_in(), set an arbitrary timeout large
502 * enough not to ever expire, this save us a check for the IPS_OFFLOAD_BIT
503 * from the packet path via nf_ct_is_expired().
504 */
nf_flow_table_extend_ct_timeout(struct nf_conn * ct)505 static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct)
506 {
507 static const u32 min_timeout = 5 * 60 * HZ;
508 u32 expires = nf_ct_expires(ct);
509
510 /* normal case: large enough timeout, nothing to do. */
511 if (likely(expires >= min_timeout))
512 return;
513
514 /* must check offload bit after this, we do not hold any locks.
515 * flowtable and ct entries could have been removed on another CPU.
516 */
517 if (!refcount_inc_not_zero(&ct->ct_general.use))
518 return;
519
520 /* load ct->status after refcount increase */
521 smp_acquire__after_ctrl_dep();
522
523 if (nf_ct_is_confirmed(ct) &&
524 test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
525 u8 l4proto = nf_ct_protonum(ct);
526 u32 new_timeout = true;
527
528 switch (l4proto) {
529 case IPPROTO_UDP:
530 new_timeout = NF_CT_DAY;
531 break;
532 case IPPROTO_TCP:
533 new_timeout = nf_flow_table_tcp_timeout(ct);
534 break;
535 default:
536 WARN_ON_ONCE(1);
537 break;
538 }
539
540 /* Update to ct->timeout from nf_conntrack happens
541 * without holding ct->lock.
542 *
543 * Use cmpxchg to ensure timeout extension doesn't
544 * happen when we race with conntrack datapath.
545 *
546 * The inverse -- datapath updating ->timeout right
547 * after this -- is fine, datapath is authoritative.
548 */
549 if (new_timeout) {
550 new_timeout += nfct_time_stamp;
551 cmpxchg(&ct->timeout, expires, new_timeout);
552 }
553 }
554
555 nf_ct_put(ct);
556 }
557
/* Per-flow GC step: tear down expired/dying flows, keep the conntrack
 * timeout of live flows extended, and drive the hardware offload state
 * machine (DYING -> DEAD -> removal) for flows that are going away.
 */
static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
				    struct flow_offload *flow, void *data)
{
	bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags);

	if (nf_flow_has_expired(flow) ||
	    nf_ct_is_dying(flow->ct) ||
	    nf_flow_custom_gc(flow_table, flow)) {
		flow_offload_teardown(flow);
		teardown = true;
	} else if (!teardown) {
		/* Still live: keep conntrack from evicting the entry. */
		nf_flow_table_extend_ct_timeout(flow->ct);
	}

	if (teardown) {
		if (test_bit(NF_FLOW_HW, &flow->flags)) {
			/* First pass requests hardware removal; the flow is
			 * freed only after the hardware reports it dead.
			 */
			if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
				nf_flow_offload_del(flow_table, flow);
			else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags))
				flow_offload_del(flow_table, flow);
		} else {
			flow_offload_del(flow_table, flow);
		}
	} else if (test_bit(NF_FLOW_CLOSING, &flow->flags) &&
		   test_bit(NF_FLOW_HW, &flow->flags) &&
		   !test_bit(NF_FLOW_HW_DYING, &flow->flags)) {
		/* Closing flow: pull it out of hardware early. */
		nf_flow_offload_del(flow_table, flow);
	} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
		nf_flow_offload_stats(flow_table, flow);
	}
}
589
/* Run one garbage-collection pass over every flow in @flow_table. */
void nf_flow_table_gc_run(struct nf_flowtable *flow_table)
{
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL);
}
594
/* Periodic GC worker: collect flows, then re-arm itself in one second. */
static void nf_flow_offload_work_gc(struct work_struct *work)
{
	struct nf_flowtable *flow_table;

	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
	nf_flow_table_gc_run(flow_table);
	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
603
static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
				 __be16 port, __be16 new_port)
{
	struct tcphdr *tcph = (void *)(skb_network_header(skb) + thoff);

	/* The port itself is rewritten by the caller; fix the checksum. */
	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
}
612
static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
				 __be16 port, __be16 new_port)
{
	struct udphdr *udph = (void *)(skb_network_header(skb) + thoff);

	/* A zero UDP checksum means "none"; leave it untouched unless
	 * checksumming is still pending (CHECKSUM_PARTIAL).
	 */
	if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
		return;

	inet_proto_csum_replace2(&udph->check, skb, port, new_port, false);
	/* 0 is reserved for "no checksum"; re-encode a computed zero. */
	if (!udph->check)
		udph->check = CSUM_MANGLED_0;
}
626
/* Checksum fixup dispatcher for a port rewrite on @protocol. */
static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
			     u8 protocol, __be16 port, __be16 new_port)
{
	if (protocol == IPPROTO_TCP)
		nf_flow_nat_port_tcp(skb, thoff, port, new_port);
	else if (protocol == IPPROTO_UDP)
		nf_flow_nat_port_udp(skb, thoff, port, new_port);
}
639
/* Apply source-NAT port rewrite for @dir using the ports recorded in the
 * opposite flow tuple, then fix up the transport checksum.
 */
void nf_flow_snat_port(const struct flow_offload *flow,
		       struct sk_buff *skb, unsigned int thoff,
		       u8 protocol, enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr = (void *)(skb_network_header(skb) + thoff);
	__be16 port, new_port;

	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) {
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
		hdr->source = new_port;
	} else {
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
		hdr->dest = new_port;
	}

	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);
665
/* Apply destination-NAT port rewrite for @dir using the ports recorded
 * in the opposite flow tuple, then fix up the transport checksum.
 */
void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
		       unsigned int thoff, u8 protocol,
		       enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr = (void *)(skb_network_header(skb) + thoff);
	__be16 port, new_port;

	if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) {
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
		hdr->dest = new_port;
	} else {
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
		hdr->source = new_port;
	}

	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
691
/* Initialize @flowtable: GC work, flow-block infrastructure and the
 * rhashtable, then schedule the first GC pass and link the table into
 * the global flowtables list.  Returns 0 or a negative rhashtable error.
 */
int nf_flow_table_init(struct nf_flowtable *flowtable)
{
	int err;

	INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
	flow_block_init(&flowtable->flow_block);
	init_rwsem(&flowtable->flow_block_lock);

	err = rhashtable_init(&flowtable->rhashtable,
			      &nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	queue_delayed_work(system_power_efficient_wq,
			   &flowtable->gc_work, HZ);

	mutex_lock(&flowtable_lock);
	list_add(&flowtable->list, &flowtables);
	mutex_unlock(&flowtable_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_flow_table_init);
715
nf_flow_table_do_cleanup(struct nf_flowtable * flow_table,struct flow_offload * flow,void * data)716 static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table,
717 struct flow_offload *flow, void *data)
718 {
719 struct net_device *dev = data;
720
721 if (!dev) {
722 flow_offload_teardown(flow);
723 return;
724 }
725
726 if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
727 (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
728 flow->tuplehash[1].tuple.iifidx == dev->ifindex))
729 flow_offload_teardown(flow);
730 }
731
/* Tear down all flows in @flowtable bound to @dev (all flows when @dev
 * is NULL), then wait for the pending GC pass and hardware offload work
 * to finish.
 */
void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable,
			      struct net_device *dev)
{
	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
	flush_delayed_work(&flowtable->gc_work);
	nf_flow_table_offload_flush(flowtable);
}
739
nf_flow_table_cleanup(struct net_device * dev)740 void nf_flow_table_cleanup(struct net_device *dev)
741 {
742 struct nf_flowtable *flowtable;
743
744 mutex_lock(&flowtable_lock);
745 list_for_each_entry(flowtable, &flowtables, list)
746 nf_flow_table_gc_cleanup(flowtable, dev);
747 mutex_unlock(&flowtable_lock);
748 }
749 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
750
/* Release @flow_table: unlink it from the global list, cancel the GC
 * work, flush hardware offload work, tear down and collect every
 * remaining flow, then destroy the rhashtable.
 */
void nf_flow_table_free(struct nf_flowtable *flow_table)
{
	mutex_lock(&flowtable_lock);
	list_del(&flow_table->list);
	mutex_unlock(&flowtable_lock);

	cancel_delayed_work_sync(&flow_table->gc_work);
	nf_flow_table_offload_flush(flow_table);
	/* ... no more pending work after this stage ... */
	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
	nf_flow_table_gc_run(flow_table);
	nf_flow_table_offload_flush_cleanup(flow_table);
	rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
766
nf_flow_table_init_net(struct net * net)767 static int nf_flow_table_init_net(struct net *net)
768 {
769 net->ft.stat = alloc_percpu(struct nf_flow_table_stat);
770 return net->ft.stat ? 0 : -ENOMEM;
771 }
772
/* Free the per-netns flowtable statistics counters. */
static void nf_flow_table_fini_net(struct net *net)
{
	free_percpu(net->ft.stat);
}
777
/* Per-netns setup: statistics counters plus the proc interface. */
static int nf_flow_table_pernet_init(struct net *net)
{
	int ret;

	ret = nf_flow_table_init_net(net);
	if (ret < 0)
		return ret;

	ret = nf_flow_table_init_proc(net);
	if (ret < 0)
		nf_flow_table_fini_net(net);	/* unwind on proc failure */

	return ret;
}
796
/* Batched netns teardown: drop proc entries, then the stats counters. */
static void nf_flow_table_pernet_exit(struct list_head *net_exit_list)
{
	struct net *net;

	list_for_each_entry(net, net_exit_list, exit_list) {
		nf_flow_table_fini_proc(net);
		nf_flow_table_fini_net(net);
	}
}
806
/* Per-netns lifecycle hooks for the flowtable stats/proc state. */
static struct pernet_operations nf_flow_table_net_ops = {
	.init = nf_flow_table_pernet_init,
	.exit_batch = nf_flow_table_pernet_exit,
};
811
/* Module init: flow cache, per-netns subsystem, hardware offload
 * infrastructure and the BPF interface; unwound in reverse on failure.
 */
static int __init nf_flow_table_module_init(void)
{
	int ret;

	flow_offload_cachep = KMEM_CACHE(flow_offload, SLAB_HWCACHE_ALIGN);
	if (!flow_offload_cachep)
		return -ENOMEM;

	ret = register_pernet_subsys(&nf_flow_table_net_ops);
	if (ret < 0)
		goto out_pernet;

	ret = nf_flow_table_offload_init();
	if (ret)
		goto out_offload;

	ret = nf_flow_register_bpf();
	if (ret)
		goto out_bpf;

	return 0;

out_bpf:
	nf_flow_table_offload_exit();
out_offload:
	unregister_pernet_subsys(&nf_flow_table_net_ops);
out_pernet:
	kmem_cache_destroy(flow_offload_cachep);
	return ret;
}
842
/* Module unload: reverse of nf_flow_table_module_init(). */
static void __exit nf_flow_table_module_exit(void)
{
	nf_flow_table_offload_exit();
	unregister_pernet_subsys(&nf_flow_table_net_ops);
	kmem_cache_destroy(flow_offload_cachep);
}
849
850 module_init(nf_flow_table_module_init);
851 module_exit(nf_flow_table_module_exit);
852
853 MODULE_LICENSE("GPL");
854 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
855 MODULE_DESCRIPTION("Netfilter flow table module");
856