// SPDX-License-Identifier: GPL-2.0-only
/* Event cache for netfilter. */

/*
 * (C) 2005 Harald Welte <laforge@gnumonks.org>
 * (C) 2005 Patrick McHardy <kaber@trash.net>
 * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/export.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_extend.h>

static DEFINE_MUTEX(nf_ct_ecache_mutex);

#define DYING_NULLS_VAL		((1 << 30) + 1)
#define ECACHE_MAX_JIFFIES	msecs_to_jiffies(10)
#define ECACHE_RETRY_JIFFIES	msecs_to_jiffies(10)

enum retry_state {
	STATE_CONGESTED,
	STATE_RESTART,
	STATE_DONE,
};

struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	return &cnet->ecache;
}
#if IS_MODULE(CONFIG_NF_CT_NETLINK)
EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache);
#endif

static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet)
{
	unsigned long stop = jiffies + ECACHE_MAX_JIFFIES;
	struct hlist_nulls_head evicted_list;
	enum retry_state ret = STATE_DONE;
	struct nf_conntrack_tuple_hash *h;
	struct hlist_nulls_node *n;
	unsigned int sent;

	INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL);

next:
	sent = 0;
	spin_lock_bh(&cnet->ecache.dying_lock);

	hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) {
		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

		/* The worker owns all entries, ct remains valid until nf_ct_put
		 * in the loop below.
		 */
		if (nf_conntrack_event(IPCT_DESTROY, ct)) {
			ret = STATE_CONGESTED;
			break;
		}

		hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
		hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list);

		if (time_after(stop, jiffies)) {
			ret = STATE_RESTART;
			break;
		}

		if (sent++ > 16) {
			spin_unlock_bh(&cnet->ecache.dying_lock);
			cond_resched();
			goto next;
		}
	}

	spin_unlock_bh(&cnet->ecache.dying_lock);

	hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) {
		struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);

		hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
		nf_ct_put(ct);

		cond_resched();
	}

	return ret;
}

static void ecache_work(struct work_struct *work)
{
	struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work);
	int ret, delay = -1;

	ret = ecache_work_evict_list(cnet);
	switch (ret) {
	case STATE_CONGESTED:
		delay = ECACHE_RETRY_JIFFIES;
		break;
	case STATE_RESTART:
		delay = 0;
		break;
	case STATE_DONE:
		break;
	}

	if (delay >= 0)
		schedule_delayed_work(&cnet->ecache.dwork, delay);
}

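/* Pass the pending events, plus any previously missed bits, to the
 * registered event notifier.  If delivery fails, the undelivered events
 * are accumulated in e->missed so that a later call can retransmit them;
 * if delivery of the missed bits succeeds, they are cleared again.
 * e->missed is updated locklessly via the cmpxchg() loop.
 */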
static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
					   const u32 events,
					   const u32 missed,
					   const struct nf_ct_event *item)
{
	struct net *net = nf_ct_net(item->ct);
	struct nf_ct_event_notifier *notify;
	u32 old, want;
	int ret;

	if (!((events | missed) & e->ctmask))
		return 0;

	rcu_read_lock();

	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
	if (!notify) {
		rcu_read_unlock();
		return 0;
	}

	ret = notify->ct_event(events | missed, item);
	rcu_read_unlock();

	if (likely(ret >= 0 && missed == 0))
		return 0;

	do {
		old = READ_ONCE(e->missed);
		if (ret < 0)
			want = old | events;
		else
			want = old & ~missed;
	} while (cmpxchg(&e->missed, old, want) != old);

	return ret;
}

static void nf_ct_ecache_tstamp_refresh(struct nf_conntrack_ecache *e)
{
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
	if (local64_read(&e->timestamp))
		local64_set(&e->timestamp, ktime_get_real_ns());
#endif
}

int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
				  u32 portid, int report)
{
	struct nf_conntrack_ecache *e;
	struct nf_ct_event item;
	unsigned int missed;
	int ret;

	if (!nf_ct_is_confirmed(ct))
		return 0;

	e = nf_ct_ecache_find(ct);
	if (!e)
		return 0;

	memset(&item, 0, sizeof(item));

	item.ct = ct;
	item.portid = e->portid ? e->portid : portid;
	item.report = report;

	/* Is this a resend of a destroy event? If so, skip the missed bits. */
	missed = e->portid ? 0 : e->missed;

	nf_ct_ecache_tstamp_refresh(e);

	ret = __nf_conntrack_eventmask_report(e, events, missed, &item);
	if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) {
		/* This destroy event was triggered by a process; store the
		 * PORTID so it can be included in the retransmission.
		 */
		if (e->portid == 0 && portid != 0)
			e->portid = portid;
	}

	return ret;
}
EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report);

/* Deliver cached events and clear the cache entry - must be called with
 * locally disabled softirqs.
 */
void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
	struct nf_conntrack_ecache *e;
	struct nf_ct_event item;
	unsigned int events;

	if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct))
		return;

	e = nf_ct_ecache_find(ct);
	if (e == NULL)
		return;

	events = xchg(&e->cache, 0);

	item.ct = ct;
	item.portid = 0;
	item.report = 0;

	/* We make a copy of the missed event cache without taking
	 * the lock, thus we may send missed events twice. However,
	 * this does not harm and it happens very rarely.
	 */
	__nf_conntrack_eventmask_report(e, events, e->missed, &item);
}
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);

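/* Deliver an expectation event to the registered notifier, provided the
 * master conntrack subscribed to it via its ecache expmask.  Without a
 * registered notifier or an ecache extension on the master, the event
 * is silently dropped.
 */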
void nf_ct_expect_event_report(enum ip_conntrack_expect_events event,
			       struct nf_conntrack_expect *exp,
			       u32 portid, int report)
{
	struct net *net = nf_ct_exp_net(exp);
	struct nf_ct_event_notifier *notify;
	struct nf_conntrack_ecache *e;

	rcu_read_lock();
	notify = rcu_dereference(net->ct.nf_conntrack_event_cb);
	if (!notify)
		goto out_unlock;

	e = nf_ct_ecache_find(exp->master);
	if (!e)
		goto out_unlock;

	if (e->expmask & (1 << event)) {
		struct nf_exp_event item = {
			.exp = exp,
			.portid = portid,
			.report = report
		};
		notify->exp_event(1 << event, &item);
	}
out_unlock:
	rcu_read_unlock();
}

void nf_conntrack_register_notifier(struct net *net,
				    const struct nf_ct_event_notifier *new)
{
	struct nf_ct_event_notifier *notify;

	mutex_lock(&nf_ct_ecache_mutex);
	notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb,
					   lockdep_is_held(&nf_ct_ecache_mutex));
	WARN_ON_ONCE(notify);
	rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new);
	mutex_unlock(&nf_ct_ecache_mutex);
}
EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier);

void nf_conntrack_unregister_notifier(struct net *net)
{
	mutex_lock(&nf_ct_ecache_mutex);
	RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL);
	mutex_unlock(&nf_ct_ecache_mutex);
	/* synchronize_rcu() is called after netns pre_exit */
}
EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier);

void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	if (state == NFCT_ECACHE_DESTROY_FAIL &&
	    !delayed_work_pending(&cnet->ecache.dwork)) {
		schedule_delayed_work(&cnet->ecache.dwork, HZ);
		net->ct.ecache_dwork_pending = true;
	} else if (state == NFCT_ECACHE_DESTROY_SENT) {
		if (!hlist_nulls_empty(&cnet->ecache.dying_list))
			mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
		else
			net->ct.ecache_dwork_pending = false;
	}
}

static void nf_ct_ecache_tstamp_new(const struct nf_conn *ct, struct nf_conntrack_ecache *e)
{
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
	u64 ts = 0;

	if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
		ts = ktime_get_real_ns();

	local64_set(&e->timestamp, ts);
#endif
}

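/* Attach an event cache extension to a conntrack entry.  The per-netns
 * events setting (net->ct.sysctl_events) selects the policy:
 *   0 - events disabled; only an explicit mask from a conntrack template
 *       or ruleset still gets an extension
 *   1 - always allocate an extension
 *   2 - autodetect; allocate only while a ctnetlink listener is present
 */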
bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
{
	struct net *net = nf_ct_net(ct);
	struct nf_conntrack_ecache *e;

	switch (net->ct.sysctl_events) {
	case 0:
		/* assignment via template / ruleset? ignore sysctl. */
		if (ctmask || expmask)
			break;
		return true;
	case 2: /* autodetect: no event listener, don't allocate extension. */
		if (!READ_ONCE(nf_ctnetlink_has_listener))
			return true;
		fallthrough;
	case 1:
		/* always allocate an extension. */
		if (!ctmask && !expmask) {
			ctmask = ~0;
			expmask = ~0;
		}
		break;
	default:
		WARN_ON_ONCE(1);
		return true;
	}

	e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
	if (e) {
		nf_ct_ecache_tstamp_new(ct, e);
		e->ctmask = ctmask;
		e->expmask = expmask;
	}

	return e != NULL;
}
EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add);

#define NF_CT_EVENTS_DEFAULT 2
static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;

void nf_conntrack_ecache_pernet_init(struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	net->ct.sysctl_events = nf_ct_events;

	INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work);
	INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL);
	spin_lock_init(&cnet->ecache.dying_lock);

	BUILD_BUG_ON(__IPCT_MAX >= 16);	/* e->ctmask is u16 */
}

void nf_conntrack_ecache_pernet_fini(struct net *net)
{
	struct nf_conntrack_net *cnet = nf_ct_pernet(net);

	cancel_delayed_work_sync(&cnet->ecache.dwork);
}