#ifndef _IP_SET_AHASH_H
#define _IP_SET_AHASH_H

#include <linux/rcupdate.h>
#include <linux/jhash.h>
#include <linux/netfilter/ipset/ip_set_timeout.h>

#define CONCAT(a, b, c)		a##b##c
#define TOKEN(a, b, c)		CONCAT(a, b, c)

#define type_pf_next		TOKEN(TYPE, PF, _elem)

/* Hashing which uses arrays to resolve clashing. The hash table is resized
 * (doubled) when searching becomes too long.
 * Internally jhash is used with the assumption that the size of the
 * stored data is a multiple of sizeof(u32). If storage supports timeout,
 * the timeout field must be the last one in the data structure - that field
 * is ignored when computing the hash key.
 *
 * Readers and resizing
 *
 * Resizing can be triggered by userspace command only, and those
 * are serialized by the nfnl mutex. During resizing the set is
 * read-locked, so the only possible concurrent operations are
 * the kernel side readers. Those must be protected by proper RCU locking.
 */
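
/* Illustrative sketch only, not part of the interface: the reader-side
 * pattern that the lookup/add routines below follow, using the helpers
 * defined later in this file.
 *
 *	rcu_read_lock_bh();
 *	t = rcu_dereference_bh(h->table);
 *	key = HKEY(d, h->initval, t->htable_bits);
 *	n = hbucket(t, key);
 *	for (i = 0; i < n->pos; i++)
 *		if (type_pf_data_equal(ahash_data(n, i), d, &multi))
 *			... found ...
 *	rcu_read_unlock_bh();
 *
 * The resizer publishes the new table with rcu_assign_pointer() and
 * waits with synchronize_rcu_bh() before destroying the old one.
 */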

/* Number of elements to store in an initial array block */
#define AHASH_INIT_SIZE			4
/* Max number of elements to store in an array block */
#define AHASH_MAX_SIZE			(3*AHASH_INIT_SIZE)

/* Max number of elements can be tuned */
#ifdef IP_SET_HASH_WITH_MULTI
#define AHASH_MAX(h)			((h)->ahash_max)

static inline u8
tune_ahash_max(u8 curr, u32 multi)
{
	u32 n;

	if (multi < curr)
		return curr;

	n = curr + AHASH_INIT_SIZE;
	/* Currently, at listing, one hash bucket must fit into a message.
	 * Therefore we have a hard limit here.
	 */
	return n > curr && n <= 64 ? n : curr;
}
#define TUNE_AHASH_MAX(h, multi)	\
	((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
#else
#define AHASH_MAX(h)			AHASH_MAX_SIZE
#define TUNE_AHASH_MAX(h, multi)
#endif
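
/* Illustration of the growth policy above: a bucket array grows in
 * AHASH_INIT_SIZE steps, 4 -> 8 -> 12 entries, and an insertion into a
 * bucket already holding AHASH_MAX entries returns -EAGAIN, which
 * triggers rehashing. With IP_SET_HASH_WITH_MULTI the per-set limit
 * itself may be raised in the same steps, but never above 64.
 */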

/* A hash bucket */
struct hbucket {
	void *value;		/* the array of the values */
	u8 size;		/* size of the array */
	u8 pos;			/* position of the first free entry */
};

/* The hash table: the table size stored here in order to make resizing easy */
struct htable {
	u8 htable_bits;		/* size of hash table == 2^htable_bits */
	struct hbucket bucket[0]; /* hashtable buckets */
};

#define hbucket(h, i)		(&((h)->bucket[i]))

/* Book-keeping of the prefixes added to the set */
struct ip_set_hash_nets {
	u8 cidr;		/* the different cidr values in the set */
	u32 nets;		/* number of elements per cidr */
};

/* The generic ip_set hash structure */
struct ip_set_hash {
	struct htable *table;	/* the hash table */
	u32 maxelem;		/* max elements in the hash */
	u32 elements;		/* current number of elements (vs timeout) */
	u32 initval;		/* random jhash init value */
	u32 timeout;		/* timeout value, if enabled */
	struct timer_list gc;	/* garbage collection when timeout enabled */
	struct type_pf_next next; /* temporary storage for uadd */
#ifdef IP_SET_HASH_WITH_MULTI
	u8 ahash_max;		/* max elements in an array block */
#endif
#ifdef IP_SET_HASH_WITH_NETMASK
	u8 netmask;		/* netmask value for subnets to store */
#endif
#ifdef IP_SET_HASH_WITH_RBTREE
	struct rb_root rbtree;
#endif
#ifdef IP_SET_HASH_WITH_NETS
	struct ip_set_hash_nets nets[0]; /* book-keeping of prefixes */
#endif
};

/* Compute htable_bits from the user input parameter hashsize */
static u8
htable_bits(u32 hashsize)
{
	/* Assume that hashsize == 2^htable_bits */
	u8 bits = fls(hashsize - 1);

	if (jhash_size(bits) != hashsize)
		/* Round up to the first 2^n value */
		bits = fls(hashsize);

	return bits;
}
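
/* Worked example: htable_bits(1024) gives fls(1023) == 10 and
 * jhash_size(10) == 1024, so the result is 10. A non power of two
 * input such as 1000 is rounded up: fls(999) == 10 but
 * jhash_size(10) == 1024 != 1000, so bits = fls(1000) == 10,
 * i.e. 1024 buckets as well.
 */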

#ifdef IP_SET_HASH_WITH_NETS

#define SET_HOST_MASK(family)	(family == AF_INET ? 32 : 128)

/* Network cidr size book-keeping when the hash stores different
 * sized networks */
static void
add_cidr(struct ip_set_hash *h, u8 cidr, u8 host_mask)
{
	u8 i;

	++h->nets[cidr-1].nets;

	pr_debug("add_cidr added %u: %u\n", cidr, h->nets[cidr-1].nets);

	if (h->nets[cidr-1].nets > 1)
		return;

	/* New cidr size */
	for (i = 0; i < host_mask && h->nets[i].cidr; i++) {
		/* Add in increasing prefix order, so larger cidr first */
		if (h->nets[i].cidr < cidr)
			swap(h->nets[i].cidr, cidr);
	}
	if (i < host_mask)
		h->nets[i].cidr = cidr;
}

static void
del_cidr(struct ip_set_hash *h, u8 cidr, u8 host_mask)
{
	u8 i;

	--h->nets[cidr-1].nets;

	pr_debug("del_cidr deleted %u: %u\n", cidr, h->nets[cidr-1].nets);

	if (h->nets[cidr-1].nets != 0)
		return;

	/* All entries with this cidr size deleted, so clean up the
	 * cidr list in h->nets[] */
	for (i = 0; i < host_mask - 1 && h->nets[i].cidr; i++) {
		if (h->nets[i].cidr == cidr)
			h->nets[i].cidr = cidr = h->nets[i+1].cidr;
	}
	h->nets[i - 1].cidr = 0;
}
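
/* Illustration: after adding two /24 entries and one /16 entry to an
 * IPv4 set, nets[23].nets == 2 and nets[15].nets == 1, while the list
 * of cidr values is kept sorted in decreasing order: nets[0].cidr == 24,
 * nets[1].cidr == 16. Lookups can thus try the longest (most specific)
 * prefixes first.
 */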
#endif

/* Destroy the hashtable part of the set */
static void
ahash_destroy(struct htable *t)
{
	struct hbucket *n;
	u32 i;

	for (i = 0; i < jhash_size(t->htable_bits); i++) {
		n = hbucket(t, i);
		if (n->size)
			/* FIXME: use slab cache */
			kfree(n->value);
	}

	ip_set_free(t);
}

/* Calculate the actual memory size of the set data */
static size_t
ahash_memsize(const struct ip_set_hash *h, size_t dsize, u8 host_mask)
{
	u32 i;
	struct htable *t = h->table;
	size_t memsize = sizeof(*h)
			 + sizeof(*t)
#ifdef IP_SET_HASH_WITH_NETS
			 + sizeof(struct ip_set_hash_nets) * host_mask
#endif
			 + jhash_size(t->htable_bits) * sizeof(struct hbucket);

	for (i = 0; i < jhash_size(t->htable_bits); i++)
		memsize += t->bucket[i].size * dsize;

	return memsize;
}

/* Flush a hash type of set: destroy all elements */
static void
ip_set_hash_flush(struct ip_set *set)
{
	struct ip_set_hash *h = set->data;
	struct htable *t = h->table;
	struct hbucket *n;
	u32 i;

	for (i = 0; i < jhash_size(t->htable_bits); i++) {
		n = hbucket(t, i);
		if (n->size) {
			n->size = n->pos = 0;
			/* FIXME: use slab cache */
			kfree(n->value);
		}
	}
#ifdef IP_SET_HASH_WITH_NETS
	memset(h->nets, 0, sizeof(struct ip_set_hash_nets)
			   * SET_HOST_MASK(set->family));
#endif
	h->elements = 0;
}

/* Destroy a hash type of set */
static void
ip_set_hash_destroy(struct ip_set *set)
{
	struct ip_set_hash *h = set->data;

	if (with_timeout(h->timeout))
		del_timer_sync(&h->gc);

	ahash_destroy(h->table);
#ifdef IP_SET_HASH_WITH_RBTREE
	rbtree_destroy(&h->rbtree);
#endif
	kfree(h);

	set->data = NULL;
}

#endif /* _IP_SET_AHASH_H */

#ifndef HKEY_DATALEN
#define HKEY_DATALEN	sizeof(struct type_pf_elem)
#endif

#define HKEY(data, initval, htable_bits)			\
(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval)	\
	& jhash_mask(htable_bits))
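
/* Example: HKEY treats the element as an array of
 * HKEY_DATALEN/sizeof(u32) u32 words, hashes them with jhash2() and
 * masks the result down to a bucket index; with htable_bits == 10 the
 * index falls into [0, 1023].
 */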

#define CONCAT(a, b, c)		a##b##c
#define TOKEN(a, b, c)		CONCAT(a, b, c)

/* Type/family dependent function prototypes */

#define type_pf_data_equal	TOKEN(TYPE, PF, _data_equal)
#define type_pf_data_isnull	TOKEN(TYPE, PF, _data_isnull)
#define type_pf_data_copy	TOKEN(TYPE, PF, _data_copy)
#define type_pf_data_zero_out	TOKEN(TYPE, PF, _data_zero_out)
#define type_pf_data_netmask	TOKEN(TYPE, PF, _data_netmask)
#define type_pf_data_list	TOKEN(TYPE, PF, _data_list)
#define type_pf_data_tlist	TOKEN(TYPE, PF, _data_tlist)
#define type_pf_data_next	TOKEN(TYPE, PF, _data_next)

#define type_pf_elem		TOKEN(TYPE, PF, _elem)
#define type_pf_telem		TOKEN(TYPE, PF, _telem)
#define type_pf_data_timeout	TOKEN(TYPE, PF, _data_timeout)
#define type_pf_data_expired	TOKEN(TYPE, PF, _data_expired)
#define type_pf_data_timeout_set TOKEN(TYPE, PF, _data_timeout_set)

#define type_pf_elem_add	TOKEN(TYPE, PF, _elem_add)
#define type_pf_add		TOKEN(TYPE, PF, _add)
#define type_pf_del		TOKEN(TYPE, PF, _del)
#define type_pf_test_cidrs	TOKEN(TYPE, PF, _test_cidrs)
#define type_pf_test		TOKEN(TYPE, PF, _test)

#define type_pf_elem_tadd	TOKEN(TYPE, PF, _elem_tadd)
#define type_pf_del_telem	TOKEN(TYPE, PF, _ahash_del_telem)
#define type_pf_expire		TOKEN(TYPE, PF, _expire)
#define type_pf_tadd		TOKEN(TYPE, PF, _tadd)
#define type_pf_tdel		TOKEN(TYPE, PF, _tdel)
#define type_pf_ttest_cidrs	TOKEN(TYPE, PF, _ahash_ttest_cidrs)
#define type_pf_ttest		TOKEN(TYPE, PF, _ahash_ttest)

#define type_pf_resize		TOKEN(TYPE, PF, _resize)
#define type_pf_tresize		TOKEN(TYPE, PF, _tresize)
#define type_pf_flush		ip_set_hash_flush
#define type_pf_destroy		ip_set_hash_destroy
#define type_pf_head		TOKEN(TYPE, PF, _head)
#define type_pf_list		TOKEN(TYPE, PF, _list)
#define type_pf_tlist		TOKEN(TYPE, PF, _tlist)
#define type_pf_same_set	TOKEN(TYPE, PF, _same_set)
#define type_pf_kadt		TOKEN(TYPE, PF, _kadt)
#define type_pf_uadt		TOKEN(TYPE, PF, _uadt)
#define type_pf_gc		TOKEN(TYPE, PF, _gc)
#define type_pf_gc_init		TOKEN(TYPE, PF, _gc_init)
#define type_pf_variant		TOKEN(TYPE, PF, _variant)
#define type_pf_tvariant	TOKEN(TYPE, PF, _tvariant)

/* Flavour without timeout */

/* Get the ith element from the array block n */
#define ahash_data(n, i)	\
	((struct type_pf_elem *)((n)->value) + (i))

/* Add an element to the hash table when resizing the set:
 * we spare the maintenance of the internal counters. */
static int
type_pf_elem_add(struct hbucket *n, const struct type_pf_elem *value,
		 u8 ahash_max)
{
	if (n->pos >= n->size) {
		void *tmp;

		if (n->size >= ahash_max)
			/* Trigger rehashing */
			return -EAGAIN;

		tmp = kzalloc((n->size + AHASH_INIT_SIZE)
			      * sizeof(struct type_pf_elem),
			      GFP_ATOMIC);
		if (!tmp)
			return -ENOMEM;
		if (n->size) {
			memcpy(tmp, n->value,
			       sizeof(struct type_pf_elem) * n->size);
			kfree(n->value);
		}
		n->value = tmp;
		n->size += AHASH_INIT_SIZE;
	}
	type_pf_data_copy(ahash_data(n, n->pos++), value);
	return 0;
}

/* Resize a hash: create a new hash table with twice the hashsize and
 * insert the elements into it. Repeat until we succeed or
 * fail due to memory pressure. */
static int
type_pf_resize(struct ip_set *set, bool retried)
{
	struct ip_set_hash *h = set->data;
	struct htable *t, *orig = h->table;
	u8 htable_bits = orig->htable_bits;
	const struct type_pf_elem *data;
	struct hbucket *n, *m;
	u32 i, j;
	int ret;

retry:
	ret = 0;
	htable_bits++;
	pr_debug("attempt to resize set %s from %u to %u, t %p\n",
		 set->name, orig->htable_bits, htable_bits, orig);
	if (!htable_bits)
		/* In case we have plenty of memory :-) */
		return -IPSET_ERR_HASH_FULL;
	t = ip_set_alloc(sizeof(*t)
			 + jhash_size(htable_bits) * sizeof(struct hbucket));
	if (!t)
		return -ENOMEM;
	t->htable_bits = htable_bits;

	read_lock_bh(&set->lock);
	for (i = 0; i < jhash_size(orig->htable_bits); i++) {
		n = hbucket(orig, i);
		for (j = 0; j < n->pos; j++) {
			data = ahash_data(n, j);
			m = hbucket(t, HKEY(data, h->initval, htable_bits));
			ret = type_pf_elem_add(m, data, AHASH_MAX(h));
			if (ret < 0) {
				read_unlock_bh(&set->lock);
				ahash_destroy(t);
				if (ret == -EAGAIN)
					goto retry;
				return ret;
			}
		}
	}

	rcu_assign_pointer(h->table, t);
	read_unlock_bh(&set->lock);

	/* Give time to other readers of the set */
	synchronize_rcu_bh();

	pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
		 orig->htable_bits, orig, t->htable_bits, t);
	ahash_destroy(orig);

	return 0;
}
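
/* Note: -EAGAIN from type_pf_elem_add() above means that a bucket of
 * the new table would grow beyond AHASH_MAX(h) entries; the half-built
 * table is thrown away and the resize is retried with a doubled size.
 */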

static inline void
type_pf_data_next(struct ip_set_hash *h, const struct type_pf_elem *d);

/* Add an element to a hash and update the internal counters on success,
 * otherwise report the proper error code. */
static int
type_pf_add(struct ip_set *set, void *value, u32 timeout, u32 flags)
{
	struct ip_set_hash *h = set->data;
	struct htable *t;
	const struct type_pf_elem *d = value;
	struct hbucket *n;
	int i, ret = 0;
	u32 key, multi = 0;

	if (h->elements >= h->maxelem)
		return -IPSET_ERR_HASH_FULL;

	rcu_read_lock_bh();
	t = rcu_dereference_bh(h->table);
	key = HKEY(value, h->initval, t->htable_bits);
	n = hbucket(t, key);
	for (i = 0; i < n->pos; i++)
		if (type_pf_data_equal(ahash_data(n, i), d, &multi)) {
			ret = -IPSET_ERR_EXIST;
			goto out;
		}
	TUNE_AHASH_MAX(h, multi);
	ret = type_pf_elem_add(n, value, AHASH_MAX(h));
	if (ret != 0) {
		if (ret == -EAGAIN)
			type_pf_data_next(h, d);
		goto out;
	}

#ifdef IP_SET_HASH_WITH_NETS
	add_cidr(h, d->cidr, HOST_MASK);
#endif
	h->elements++;
out:
	rcu_read_unlock_bh();
	return ret;
}

/* Delete an element from the hash: swap it with the last element
 * and free up space if possible.
 */
static int
type_pf_del(struct ip_set *set, void *value, u32 timeout, u32 flags)
{
	struct ip_set_hash *h = set->data;
	struct htable *t = h->table;
	const struct type_pf_elem *d = value;
	struct hbucket *n;
	int i;
	struct type_pf_elem *data;
	u32 key, multi = 0;

	key = HKEY(value, h->initval, t->htable_bits);
	n = hbucket(t, key);
	for (i = 0; i < n->pos; i++) {
		data = ahash_data(n, i);
		if (!type_pf_data_equal(data, d, &multi))
			continue;
		if (i != n->pos - 1)
			/* Not last one */
			type_pf_data_copy(data, ahash_data(n, n->pos - 1));

		n->pos--;
		h->elements--;
#ifdef IP_SET_HASH_WITH_NETS
		del_cidr(h, d->cidr, HOST_MASK);
#endif
		if (n->pos + AHASH_INIT_SIZE < n->size) {
			void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
					    * sizeof(struct type_pf_elem),
					    GFP_ATOMIC);
			if (!tmp)
				return 0;
			n->size -= AHASH_INIT_SIZE;
			memcpy(tmp, n->value,
			       n->size * sizeof(struct type_pf_elem));
			kfree(n->value);
			n->value = tmp;
		}
		return 0;
	}

	return -IPSET_ERR_EXIST;
}

#ifdef IP_SET_HASH_WITH_NETS

/* Special test function which takes into account the different network
 * sizes added to the set */
static int
type_pf_test_cidrs(struct ip_set *set, struct type_pf_elem *d, u32 timeout)
{
	struct ip_set_hash *h = set->data;
	struct htable *t = h->table;
	struct hbucket *n;
	const struct type_pf_elem *data;
	int i, j = 0;
	u32 key, multi = 0;
	u8 host_mask = SET_HOST_MASK(set->family);

	pr_debug("test by nets\n");
	for (; j < host_mask && h->nets[j].cidr && !multi; j++) {
		type_pf_data_netmask(d, h->nets[j].cidr);
		key = HKEY(d, h->initval, t->htable_bits);
		n = hbucket(t, key);
		for (i = 0; i < n->pos; i++) {
			data = ahash_data(n, i);
			if (type_pf_data_equal(data, d, &multi))
				return 1;
		}
	}
	return 0;
}
#endif

/* Test whether the element is present in the set */
static int
type_pf_test(struct ip_set *set, void *value, u32 timeout, u32 flags)
{
	struct ip_set_hash *h = set->data;
	struct htable *t = h->table;
	struct type_pf_elem *d = value;
	struct hbucket *n;
	const struct type_pf_elem *data;
	int i;
	u32 key, multi = 0;

#ifdef IP_SET_HASH_WITH_NETS
	/* If we test an IP address and not a network address,
	 * try all possible network sizes */
	if (d->cidr == SET_HOST_MASK(set->family))
		return type_pf_test_cidrs(set, d, timeout);
#endif

	key = HKEY(d, h->initval, t->htable_bits);
	n = hbucket(t, key);
	for (i = 0; i < n->pos; i++) {
		data = ahash_data(n, i);
		if (type_pf_data_equal(data, d, &multi))
			return 1;
	}
	return 0;
}

/* Reply to a HEADER request: fill out the header part of the set */
static int
type_pf_head(struct ip_set *set, struct sk_buff *skb)
{
	const struct ip_set_hash *h = set->data;
	struct nlattr *nested;
	size_t memsize;

	read_lock_bh(&set->lock);
	memsize = ahash_memsize(h, with_timeout(h->timeout)
					? sizeof(struct type_pf_telem)
					: sizeof(struct type_pf_elem),
				set->family == AF_INET ? 32 : 128);
	read_unlock_bh(&set->lock);

	nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
	if (!nested)
		goto nla_put_failure;
	NLA_PUT_NET32(skb, IPSET_ATTR_HASHSIZE,
		      htonl(jhash_size(h->table->htable_bits)));
	NLA_PUT_NET32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem));
#ifdef IP_SET_HASH_WITH_NETMASK
	if (h->netmask != HOST_MASK)
		NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, h->netmask);
#endif
	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1));
	NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize));
	if (with_timeout(h->timeout))
		NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(h->timeout));
	ipset_nest_end(skb, nested);

	return 0;
nla_put_failure:
	return -EMSGSIZE;
}

/* Reply to a LIST/SAVE request: dump the elements of the specified set */
static int
type_pf_list(const struct ip_set *set,
	     struct sk_buff *skb, struct netlink_callback *cb)
{
	const struct ip_set_hash *h = set->data;
	const struct htable *t = h->table;
	struct nlattr *atd, *nested;
	const struct hbucket *n;
	const struct type_pf_elem *data;
	u32 first = cb->args[2];
	/* We assume that one hash bucket fits into one page */
	void *incomplete;
	int i;

	atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
	if (!atd)
		return -EMSGSIZE;
	pr_debug("list hash set %s\n", set->name);
	for (; cb->args[2] < jhash_size(t->htable_bits); cb->args[2]++) {
		incomplete = skb_tail_pointer(skb);
		n = hbucket(t, cb->args[2]);
		pr_debug("cb->args[2]: %lu, t %p n %p\n", cb->args[2], t, n);
		for (i = 0; i < n->pos; i++) {
			data = ahash_data(n, i);
			pr_debug("list hash %lu hbucket %p i %u, data %p\n",
				 cb->args[2], n, i, data);
			nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
			if (!nested) {
				if (cb->args[2] == first) {
					nla_nest_cancel(skb, atd);
					return -EMSGSIZE;
				}
				goto nla_put_failure;
			}
			if (type_pf_data_list(skb, data))
				goto nla_put_failure;
			ipset_nest_end(skb, nested);
		}
	}
	ipset_nest_end(skb, atd);
	/* Set listing finished */
	cb->args[2] = 0;

	return 0;

nla_put_failure:
	nlmsg_trim(skb, incomplete);
	ipset_nest_end(skb, atd);
	if (unlikely(first == cb->args[2])) {
		pr_warning("Can't list set %s: one bucket does not fit into "
			   "a message. Please report it!\n", set->name);
		cb->args[2] = 0;
		return -EMSGSIZE;
	}
	return 0;
}

static int
type_pf_kadt(struct ip_set *set, const struct sk_buff *skb,
	     const struct xt_action_param *par,
	     enum ipset_adt adt, const struct ip_set_adt_opt *opt);
static int
type_pf_uadt(struct ip_set *set, struct nlattr *tb[],
	     enum ipset_adt adt, u32 *lineno, u32 flags, bool retried);

static const struct ip_set_type_variant type_pf_variant = {
	.kadt	= type_pf_kadt,
	.uadt	= type_pf_uadt,
	.adt	= {
		[IPSET_ADD] = type_pf_add,
		[IPSET_DEL] = type_pf_del,
		[IPSET_TEST] = type_pf_test,
	},
	.destroy = type_pf_destroy,
	.flush	= type_pf_flush,
	.head	= type_pf_head,
	.list	= type_pf_list,
	.resize	= type_pf_resize,
	.same_set = type_pf_same_set,
};

/* Flavour with timeout support */

#define ahash_tdata(n, i) \
	(struct type_pf_elem *)((struct type_pf_telem *)((n)->value) + (i))
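
/* The timeout flavour stores struct type_pf_telem, the plain element
 * extended with a trailing timeout field. HKEY hashes only the first
 * HKEY_DATALEN bytes, so the timeout does not influence which bucket
 * an element maps to.
 */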

static inline u32
type_pf_data_timeout(const struct type_pf_elem *data)
{
	const struct type_pf_telem *tdata =
		(const struct type_pf_telem *) data;

	return tdata->timeout;
}

static inline bool
type_pf_data_expired(const struct type_pf_elem *data)
{
	const struct type_pf_telem *tdata =
		(const struct type_pf_telem *) data;

	return ip_set_timeout_expired(tdata->timeout);
}

static inline void
type_pf_data_timeout_set(struct type_pf_elem *data, u32 timeout)
{
	struct type_pf_telem *tdata = (struct type_pf_telem *) data;

	tdata->timeout = ip_set_timeout_set(timeout);
}

static int
type_pf_elem_tadd(struct hbucket *n, const struct type_pf_elem *value,
		  u8 ahash_max, u32 timeout)
{
	struct type_pf_elem *data;

	if (n->pos >= n->size) {
		void *tmp;

		if (n->size >= ahash_max)
			/* Trigger rehashing */
			return -EAGAIN;

		tmp = kzalloc((n->size + AHASH_INIT_SIZE)
			      * sizeof(struct type_pf_telem),
			      GFP_ATOMIC);
		if (!tmp)
			return -ENOMEM;
		if (n->size) {
			memcpy(tmp, n->value,
			       sizeof(struct type_pf_telem) * n->size);
			kfree(n->value);
		}
		n->value = tmp;
		n->size += AHASH_INIT_SIZE;
	}
	data = ahash_tdata(n, n->pos++);
	type_pf_data_copy(data, value);
	type_pf_data_timeout_set(data, timeout);
	return 0;
}
/* Delete expired elements from the hashtable */
static void
type_pf_expire(struct ip_set_hash *h)
{
	struct htable *t = h->table;
	struct hbucket *n;
	struct type_pf_elem *data;
	u32 i;
	int j;

	for (i = 0; i < jhash_size(t->htable_bits); i++) {
		n = hbucket(t, i);
		for (j = 0; j < n->pos; j++) {
			data = ahash_tdata(n, j);
			if (type_pf_data_expired(data)) {
				pr_debug("expired %u/%u\n", i, j);
#ifdef IP_SET_HASH_WITH_NETS
				del_cidr(h, data->cidr, HOST_MASK);
#endif
				if (j != n->pos - 1)
					/* Not last one */
					type_pf_data_copy(data,
						ahash_tdata(n, n->pos - 1));
				n->pos--;
				h->elements--;
				/* Reexamine the slot: the swapped-in
				 * element may have expired too */
				j--;
			}
		}
		if (n->pos + AHASH_INIT_SIZE < n->size) {
			void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
					    * sizeof(struct type_pf_telem),
					    GFP_ATOMIC);
			if (!tmp)
				/* Still try to delete expired elements */
				continue;
			n->size -= AHASH_INIT_SIZE;
			memcpy(tmp, n->value,
			       n->size * sizeof(struct type_pf_telem));
			kfree(n->value);
			n->value = tmp;
		}
	}
}

static int
type_pf_tresize(struct ip_set *set, bool retried)
{
	struct ip_set_hash *h = set->data;
	struct htable *t, *orig = h->table;
	u8 htable_bits = orig->htable_bits;
	const struct type_pf_elem *data;
	struct hbucket *n, *m;
	u32 i, j;
	int ret;

	/* Try to clean up once */
	if (!retried) {
		i = h->elements;
		write_lock_bh(&set->lock);
		type_pf_expire(set->data);
		write_unlock_bh(&set->lock);
		if (h->elements < i)
			return 0;
	}

retry:
	ret = 0;
	htable_bits++;
	if (!htable_bits)
		/* In case we have plenty of memory :-) */
		return -IPSET_ERR_HASH_FULL;
	t = ip_set_alloc(sizeof(*t)
			 + jhash_size(htable_bits) * sizeof(struct hbucket));
	if (!t)
		return -ENOMEM;
	t->htable_bits = htable_bits;

	read_lock_bh(&set->lock);
	for (i = 0; i < jhash_size(orig->htable_bits); i++) {
		n = hbucket(orig, i);
		for (j = 0; j < n->pos; j++) {
			data = ahash_tdata(n, j);
			m = hbucket(t, HKEY(data, h->initval, htable_bits));
			ret = type_pf_elem_tadd(m, data, AHASH_MAX(h),
						type_pf_data_timeout(data));
			if (ret < 0) {
				read_unlock_bh(&set->lock);
				ahash_destroy(t);
				if (ret == -EAGAIN)
					goto retry;
				return ret;
			}
		}
	}

	rcu_assign_pointer(h->table, t);
	read_unlock_bh(&set->lock);

	/* Give time to other readers of the set */
	synchronize_rcu_bh();

	ahash_destroy(orig);

	return 0;
}

static int
type_pf_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags)
{
	struct ip_set_hash *h = set->data;
	struct htable *t = h->table;
	const struct type_pf_elem *d = value;
	struct hbucket *n;
	struct type_pf_elem *data;
	int ret = 0, i, j = AHASH_MAX(h) + 1;
	bool flag_exist = flags & IPSET_FLAG_EXIST;
	u32 key, multi = 0;

	if (h->elements >= h->maxelem)
		/* FIXME: when set is full, we slow down here */
		type_pf_expire(h);
	if (h->elements >= h->maxelem)
		return -IPSET_ERR_HASH_FULL;

	rcu_read_lock_bh();
	t = rcu_dereference_bh(h->table);
	key = HKEY(d, h->initval, t->htable_bits);
	n = hbucket(t, key);
	for (i = 0; i < n->pos; i++) {
		data = ahash_tdata(n, i);
		if (type_pf_data_equal(data, d, &multi)) {
			if (type_pf_data_expired(data) || flag_exist) {
				j = i;
			} else {
				ret = -IPSET_ERR_EXIST;
				goto out;
			}
		} else if (j == AHASH_MAX(h) + 1 &&
			   type_pf_data_expired(data))
			j = i;
	}
	if (j != AHASH_MAX(h) + 1) {
		data = ahash_tdata(n, j);
#ifdef IP_SET_HASH_WITH_NETS
		del_cidr(h, data->cidr, HOST_MASK);
		add_cidr(h, d->cidr, HOST_MASK);
#endif
		type_pf_data_copy(data, d);
		type_pf_data_timeout_set(data, timeout);
		goto out;
	}
	TUNE_AHASH_MAX(h, multi);
	ret = type_pf_elem_tadd(n, d, AHASH_MAX(h), timeout);
	if (ret != 0) {
		if (ret == -EAGAIN)
			type_pf_data_next(h, d);
		goto out;
	}

#ifdef IP_SET_HASH_WITH_NETS
	add_cidr(h, d->cidr, HOST_MASK);
#endif
	h->elements++;
out:
	rcu_read_unlock_bh();
	return ret;
}

static int
type_pf_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags)
{
	struct ip_set_hash *h = set->data;
	struct htable *t = h->table;
	const struct type_pf_elem *d = value;
	struct hbucket *n;
	int i;
	struct type_pf_elem *data;
	u32 key, multi = 0;

	key = HKEY(value, h->initval, t->htable_bits);
	n = hbucket(t, key);
	for (i = 0; i < n->pos; i++) {
		data = ahash_tdata(n, i);
		if (!type_pf_data_equal(data, d, &multi))
			continue;
		if (type_pf_data_expired(data))
			return -IPSET_ERR_EXIST;
		if (i != n->pos - 1)
			/* Not last one */
			type_pf_data_copy(data, ahash_tdata(n, n->pos - 1));

		n->pos--;
		h->elements--;
#ifdef IP_SET_HASH_WITH_NETS
		del_cidr(h, d->cidr, HOST_MASK);
#endif
		if (n->pos + AHASH_INIT_SIZE < n->size) {
			void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
					    * sizeof(struct type_pf_telem),
					    GFP_ATOMIC);
			if (!tmp)
				return 0;
			n->size -= AHASH_INIT_SIZE;
			memcpy(tmp, n->value,
			       n->size * sizeof(struct type_pf_telem));
			kfree(n->value);
			n->value = tmp;
		}
		return 0;
	}

	return -IPSET_ERR_EXIST;
}

#ifdef IP_SET_HASH_WITH_NETS
static int
type_pf_ttest_cidrs(struct ip_set *set, struct type_pf_elem *d, u32 timeout)
{
	struct ip_set_hash *h = set->data;
	struct htable *t = h->table;
	struct type_pf_elem *data;
	struct hbucket *n;
	int i, j = 0;
	u32 key, multi = 0;
	u8 host_mask = SET_HOST_MASK(set->family);

	for (; j < host_mask && h->nets[j].cidr && !multi; j++) {
		type_pf_data_netmask(d, h->nets[j].cidr);
		key = HKEY(d, h->initval, t->htable_bits);
		n = hbucket(t, key);
		for (i = 0; i < n->pos; i++) {
			data = ahash_tdata(n, i);
			if (type_pf_data_equal(data, d, &multi))
				return !type_pf_data_expired(data);
		}
	}
	return 0;
}
#endif

static int
type_pf_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags)
{
	struct ip_set_hash *h = set->data;
	struct htable *t = h->table;
	struct type_pf_elem *data, *d = value;
	struct hbucket *n;
	int i;
	u32 key, multi = 0;

#ifdef IP_SET_HASH_WITH_NETS
	if (d->cidr == SET_HOST_MASK(set->family))
		return type_pf_ttest_cidrs(set, d, timeout);
#endif
	key = HKEY(d, h->initval, t->htable_bits);
	n = hbucket(t, key);
	for (i = 0; i < n->pos; i++) {
		data = ahash_tdata(n, i);
		if (type_pf_data_equal(data, d, &multi))
			return !type_pf_data_expired(data);
	}
	return 0;
}

static int
type_pf_tlist(const struct ip_set *set,
	      struct sk_buff *skb, struct netlink_callback *cb)
{
	const struct ip_set_hash *h = set->data;
	const struct htable *t = h->table;
	struct nlattr *atd, *nested;
	const struct hbucket *n;
	const struct type_pf_elem *data;
	u32 first = cb->args[2];
	/* We assume that one hash bucket fits into one page */
	void *incomplete;
	int i;

	atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
	if (!atd)
		return -EMSGSIZE;
	for (; cb->args[2] < jhash_size(t->htable_bits); cb->args[2]++) {
		incomplete = skb_tail_pointer(skb);
		n = hbucket(t, cb->args[2]);
		for (i = 0; i < n->pos; i++) {
			data = ahash_tdata(n, i);
			pr_debug("list %p %u\n", n, i);
			if (type_pf_data_expired(data))
				continue;
			pr_debug("do list %p %u\n", n, i);
			nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
			if (!nested) {
				if (cb->args[2] == first) {
					nla_nest_cancel(skb, atd);
					return -EMSGSIZE;
				}
				goto nla_put_failure;
			}
			if (type_pf_data_tlist(skb, data))
				goto nla_put_failure;
			ipset_nest_end(skb, nested);
		}
	}
	ipset_nest_end(skb, atd);
	/* Set listing finished */
	cb->args[2] = 0;

	return 0;

nla_put_failure:
	nlmsg_trim(skb, incomplete);
	ipset_nest_end(skb, atd);
	if (unlikely(first == cb->args[2])) {
		pr_warning("Can't list set %s: one bucket does not fit into "
			   "a message. Please report it!\n", set->name);
		cb->args[2] = 0;
		return -EMSGSIZE;
	}
	return 0;
}

static const struct ip_set_type_variant type_pf_tvariant = {
	.kadt	= type_pf_kadt,
	.uadt	= type_pf_uadt,
	.adt	= {
		[IPSET_ADD] = type_pf_tadd,
		[IPSET_DEL] = type_pf_tdel,
		[IPSET_TEST] = type_pf_ttest,
	},
	.destroy = type_pf_destroy,
	.flush	= type_pf_flush,
	.head	= type_pf_head,
	.list	= type_pf_tlist,
	.resize	= type_pf_tresize,
	.same_set = type_pf_same_set,
};

static void
type_pf_gc(unsigned long ul_set)
{
	struct ip_set *set = (struct ip_set *) ul_set;
	struct ip_set_hash *h = set->data;

	pr_debug("called\n");
	write_lock_bh(&set->lock);
	type_pf_expire(h);
	write_unlock_bh(&set->lock);

	h->gc.expires = jiffies + IPSET_GC_PERIOD(h->timeout) * HZ;
	add_timer(&h->gc);
}

static void
type_pf_gc_init(struct ip_set *set)
{
	struct ip_set_hash *h = set->data;

	init_timer(&h->gc);
	h->gc.data = (unsigned long) set;
	h->gc.function = type_pf_gc;
	h->gc.expires = jiffies + IPSET_GC_PERIOD(h->timeout) * HZ;
	add_timer(&h->gc);
	pr_debug("gc initialized, runs every %u secs\n",
		 IPSET_GC_PERIOD(h->timeout));
}

#undef HKEY_DATALEN
#undef HKEY
#undef type_pf_data_equal
#undef type_pf_data_isnull
#undef type_pf_data_copy
#undef type_pf_data_zero_out
#undef type_pf_data_list
#undef type_pf_data_tlist

#undef type_pf_elem
#undef type_pf_telem
#undef type_pf_data_timeout
#undef type_pf_data_expired
#undef type_pf_data_netmask
#undef type_pf_data_timeout_set

#undef type_pf_elem_add
#undef type_pf_add
#undef type_pf_del
#undef type_pf_test_cidrs
#undef type_pf_test

#undef type_pf_elem_tadd
#undef type_pf_expire
#undef type_pf_tadd
#undef type_pf_tdel
#undef type_pf_ttest_cidrs
#undef type_pf_ttest

#undef type_pf_resize
#undef type_pf_tresize
#undef type_pf_flush
#undef type_pf_destroy
#undef type_pf_head
#undef type_pf_list
#undef type_pf_tlist
#undef type_pf_same_set
#undef type_pf_kadt
#undef type_pf_uadt
#undef type_pf_gc
#undef type_pf_gc_init
#undef type_pf_variant
#undef type_pf_tvariant