/*
 * In-kernel transcendent memory (generic implementation)
 *
 * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
 *
 * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
 * "handles" (triples containing a pool id, an object id, and an index) to
 * pages in a page-accessible memory (PAM).  Tmem references the PAM pages via
 * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
 * set of functions (pamops).  Each pampd contains some representation of
 * PAGE_SIZE bytes worth of data.  Tmem must support potentially millions of
 * pages and must be able to insert, find, and delete these pages at a
 * potential frequency of thousands per second concurrently across many CPUs
 * (and, if used with KVM, across many vcpus across many guests).
 * Tmem is tracked with a hierarchy of data structures, organized by
 * the elements in a handle-tuple: pool_id, object_id, and page index.
 * One or more "clients" (e.g. guests) each provide one or more tmem_pools.
 * Each pool contains a hash table of rb_trees of tmem_objs.  Each
 * tmem_obj contains a radix-tree-like tree of pointers, with intermediate
 * nodes called tmem_objnodes.  Each leaf pointer in this tree points to
 * a pampd, which is accessible only through a small set of callbacks
 * registered by the PAM implementation (see tmem_register_pamops).  Tmem
 * does all memory allocation via a set of callbacks registered by the tmem
 * host implementation (e.g. see tmem_register_hostops).
 */
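
/*
 * Illustration only (not part of the ABI): a client addresses PAGE_SIZE
 * bytes of data in tmem with a handle-tuple, roughly as in the
 * hypothetical sketch below; the oid initializer assumes the multi-word
 * object-id layout declared in tmem.h.
 *
 *	struct tmem_oid oid = { .oid = { 17, 0, 0 } };	(object id 17)
 *	uint32_t index = 7;				(page 7 of the object)
 *	ret = tmem_put(pool, &oid, index, data, PAGE_SIZE, false, true);
 */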

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>

#include "tmem.h"

/* data structure sentinels used for debugging... see tmem.h */
#define POOL_SENTINEL 0x87658765
#define OBJ_SENTINEL 0x12345678
#define OBJNODE_SENTINEL 0xfedcba09

/*
 * A tmem host implementation must use this function to register callbacks
 * for memory allocation.
 */
static struct tmem_hostops tmem_hostops;

static void tmem_objnode_tree_init(void);

void tmem_register_hostops(struct tmem_hostops *m)
{
	tmem_objnode_tree_init();
	tmem_hostops = *m;
}

/*
 * A tmem host implementation must use this function to register
 * callbacks for a page-accessible memory (PAM) implementation.
 */
static struct tmem_pamops tmem_pamops;

void tmem_register_pamops(struct tmem_pamops *m)
{
	tmem_pamops = *m;
}
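
/*
 * A minimal sketch of registration at init time.  The hostops field
 * names below are the ones this file invokes (obj_alloc, obj_free,
 * objnode_alloc, objnode_free); the "my_*" functions and my_pamops
 * are hypothetical:
 *
 *	static struct tmem_hostops my_hostops = {
 *		.obj_alloc = my_obj_alloc,
 *		.obj_free = my_obj_free,
 *		.objnode_alloc = my_objnode_alloc,
 *		.objnode_free = my_objnode_free,
 *	};
 *
 *	tmem_register_hostops(&my_hostops);
 *	tmem_register_pamops(&my_pamops);
 */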

/*
 * Oid's are potentially very sparse and tmem_objs may have an indeterminately
 * short life, being added and deleted at a relatively high frequency.
 * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
 * of the potentially huge number of tmem_objs, each pool manages a hashtable
 * of rb_trees to reduce search, insert, delete, and rebalancing time.
 * Each hashbucket also has a lock to manage concurrent access.
 *
 * The following routines manage tmem_objs.  When any tmem_obj is accessed,
 * the hashbucket lock must be held.
 */
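
/*
 * The bucket-locking pattern used by every entry point in this file
 * (this is the exact sequence used by tmem_put(), tmem_get() and
 * friends below):
 *
 *	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 *	spin_lock(&hb->lock);
 *	obj = tmem_obj_find(hb, oidp);
 *	... operate on obj and its pampds ...
 *	spin_unlock(&hb->lock);
 */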

/* searches for object==oid in pool, returns locked object if found */
static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
					struct tmem_oid *oidp)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;

	rbnode = hb->obj_rb_root.rb_node;
	while (rbnode) {
		BUG_ON(RB_EMPTY_NODE(rbnode));
		obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
		switch (tmem_oid_compare(oidp, &obj->oid)) {
		case 0: /* equal */
			goto out;
		case -1:
			rbnode = rbnode->rb_left;
			break;
		case 1:
			rbnode = rbnode->rb_right;
			break;
		}
	}
	obj = NULL;
out:
	return obj;
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);

/* free an object that has no more pampds in it */
static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
{
	struct tmem_pool *pool;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pampd_count > 0);
	pool = obj->pool;
	BUG_ON(pool == NULL);
	if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
		tmem_pampd_destroy_all_in_obj(obj);
	BUG_ON(obj->objnode_tree_root != NULL);
	BUG_ON((long)obj->objnode_count != 0);
	atomic_dec(&pool->obj_count);
	BUG_ON(atomic_read(&pool->obj_count) < 0);
	INVERT_SENTINEL(obj, OBJ);
	obj->pool = NULL;
	tmem_oid_set_invalid(&obj->oid);
	rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
}

/*
 * initialize, and insert a tmem_object_root (called only if find failed)
 */
static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
					struct tmem_pool *pool,
					struct tmem_oid *oidp)
{
	struct rb_root *root = &hb->obj_rb_root;
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	struct tmem_obj *this;

	BUG_ON(pool == NULL);
	atomic_inc(&pool->obj_count);
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;
	obj->pool = pool;
	obj->oid = *oidp;
	obj->objnode_count = 0;
	obj->pampd_count = 0;
	(*tmem_pamops.new_obj)(obj);
	SET_SENTINEL(obj, OBJ);
	while (*new) {
		BUG_ON(RB_EMPTY_NODE(*new));
		this = rb_entry(*new, struct tmem_obj, rb_tree_node);
		parent = *new;
		switch (tmem_oid_compare(oidp, &this->oid)) {
		case 0:
			BUG(); /* already present; should never happen! */
			break;
		case -1:
			new = &(*new)->rb_left;
			break;
		case 1:
			new = &(*new)->rb_right;
			break;
		}
	}
	rb_link_node(&obj->rb_tree_node, parent, new);
	rb_insert_color(&obj->rb_tree_node, root);
}

/*
 * Tmem is managed as a set of tmem_pools with certain attributes, such as
 * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
 * and all pampds that belong to a tmem_pool.  A tmem_pool is created
 * or deleted relatively rarely (for example, when a filesystem is
 * mounted or unmounted).
 */
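
/*
 * Sketch of a pool's lifecycle as a host might drive it (the host
 * allocates and frees the tmem_pool structure itself; the flag values
 * are the ones tested by tmem_new_pool() below):
 *
 *	tmem_new_pool(pool, TMEM_POOL_PERSIST);	e.g. at mount time
 *	...
 *	tmem_destroy_pool(pool);		e.g. at unmount time
 */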

/* flush all data from a pool and, optionally, free it */
static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	BUG_ON(pool == NULL);
	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		spin_lock(&hb->lock);
		rbnode = rb_first(&hb->obj_rb_root);
		while (rbnode != NULL) {
			obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
			rbnode = rb_next(rbnode);
			tmem_pampd_destroy_all_in_obj(obj);
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
		}
		spin_unlock(&hb->lock);
	}
	if (destroy)
		list_del(&pool->pool_list);
}

/*
 * A tmem_obj contains a radix-tree-like tree in which the intermediate
 * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c implementation
 * is very specialized and tuned for specific uses and is not particularly
 * suited for use from this code, though some code from the core algorithms has
 * been reused, thus the copyright notices below).  Each tmem_objnode contains
 * a set of pointers which point to either a set of intermediate tmem_objnodes
 * or a set of pampds.
 *
 * Portions Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
 */

struct tmem_objnode_tree_path {
	struct tmem_objnode *objnode;
	int offset;
};
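
/*
 * Worked example of the walk arithmetic used below, assuming
 * (hypothetically) OBJNODE_TREE_MAP_SHIFT == 6, i.e. 64 slots per
 * objnode and OBJNODE_TREE_MAP_MASK == 0x3f: in a tree of height 2,
 * index 0x1c7 is decomposed as
 *
 *	root objnode slot: (0x1c7 >> 6) & OBJNODE_TREE_MAP_MASK == 7
 *	leaf objnode slot: (0x1c7 >> 0) & OBJNODE_TREE_MAP_MASK == 7
 *
 * so each OBJNODE_TREE_MAP_SHIFT-bit field of the index selects one
 * slot per level, exactly as in lib/radix-tree.c.
 */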

/* objnode height_to_maxindex translation */
static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];

static void tmem_objnode_tree_init(void)
{
	unsigned int ht, tmp;

	for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
		tmp = ht * OBJNODE_TREE_MAP_SHIFT;
		if (tmp >= OBJNODE_TREE_INDEX_BITS)
			tmem_objnode_tree_h2max[ht] = ~0UL;
		else
			tmem_objnode_tree_h2max[ht] =
			    (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
	}
}
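
/*
 * For instance, assuming (hypothetically) OBJNODE_TREE_MAP_SHIFT == 6
 * and OBJNODE_TREE_INDEX_BITS == 64, the loop above yields
 *
 *	h2max[0] = 0	(height 0: root points directly at one pampd)
 *	h2max[1] = 0x3f	(one objnode, 64 slots)
 *	h2max[2] = 0xfff	(two levels, 64 * 64 slots)
 *
 * i.e. a tree of height h covers indices 0 through 64^h - 1.
 */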

static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
{
	struct tmem_objnode *objnode;

	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
	if (unlikely(objnode == NULL))
		goto out;
	objnode->obj = obj;
	SET_SENTINEL(objnode, OBJNODE);
	memset(&objnode->slots, 0, sizeof(objnode->slots));
	objnode->slots_in_use = 0;
	obj->objnode_count++;
out:
	return objnode;
}

static void tmem_objnode_free(struct tmem_objnode *objnode)
{
	struct tmem_pool *pool;
	int i;

	BUG_ON(objnode == NULL);
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
		BUG_ON(objnode->slots[i] != NULL);
	ASSERT_SENTINEL(objnode, OBJNODE);
	INVERT_SENTINEL(objnode, OBJNODE);
	BUG_ON(objnode->obj == NULL);
	ASSERT_SENTINEL(objnode->obj, OBJ);
	pool = objnode->obj->pool;
	BUG_ON(pool == NULL);
	ASSERT_SENTINEL(pool, POOL);
	objnode->obj->objnode_count--;
	objnode->obj = NULL;
	(*tmem_hostops.objnode_free)(objnode, pool);
}

/*
 * lookup index in object and return associated pampd (or NULL if not found)
 */
static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	unsigned int height, shift;
	struct tmem_objnode **slot = NULL;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);

	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
		goto out;
	if (height == 0 && obj->objnode_tree_root) {
		slot = &obj->objnode_tree_root;
		goto out;
	}
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	slot = &obj->objnode_tree_root;
	while (height > 0) {
		if (*slot == NULL)
			goto out;
		slot = (struct tmem_objnode **)
			((*slot)->slots +
			 ((index >> shift) & OBJNODE_TREE_MAP_MASK));
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
out:
	return slot != NULL ? (void **)slot : NULL;
}

static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode **slot;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	return slot != NULL ? *slot : NULL;
}

static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
					void *new_pampd)
{
	struct tmem_objnode **slot;
	void *ret = NULL;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	if ((slot != NULL) && (*slot != NULL)) {
		void *old_pampd = *(void **)slot;
		*(void **)slot = new_pampd;
		(*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0);
		ret = new_pampd;
	}
	return ret;
}

static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
					void *pampd)
{
	int ret = 0;
	struct tmem_objnode *objnode = NULL, *newnode, *slot;
	unsigned int height, shift;
	int offset = 0;

	/* if necessary, extend the tree to be higher */
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
		height = obj->objnode_tree_height + 1;
		while (index > tmem_objnode_tree_h2max[height])
			height++;
		if (obj->objnode_tree_root == NULL) {
			obj->objnode_tree_height = height;
			goto insert;
		}
		do {
			newnode = tmem_objnode_alloc(obj);
			if (!newnode) {
				ret = -ENOMEM;
				goto out;
			}
			newnode->slots[0] = obj->objnode_tree_root;
			newnode->slots_in_use = 1;
			obj->objnode_tree_root = newnode;
			obj->objnode_tree_height++;
		} while (height > obj->objnode_tree_height);
	}
insert:
	slot = obj->objnode_tree_root;
	height = obj->objnode_tree_height;
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	while (height > 0) {
		if (slot == NULL) {
			/* add a child objnode */
			slot = tmem_objnode_alloc(obj);
			if (!slot) {
				ret = -ENOMEM;
				goto out;
			}
			if (objnode) {
				objnode->slots[offset] = slot;
				objnode->slots_in_use++;
			} else
				obj->objnode_tree_root = slot;
		}
		/* go down a level */
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		objnode = slot;
		slot = objnode->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
	BUG_ON(slot != NULL);
	if (objnode) {
		objnode->slots_in_use++;
		objnode->slots[offset] = pampd;
	} else
		obj->objnode_tree_root = pampd;
	obj->pampd_count++;
out:
	return ret;
}

static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
	struct tmem_objnode_tree_path *pathp = path;
	struct tmem_objnode *slot = NULL;
	unsigned int height, shift;
	int offset;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[height])
		goto out;
	slot = obj->objnode_tree_root;
	if (height == 0 && obj->objnode_tree_root) {
		obj->objnode_tree_root = NULL;
		goto out;
	}
	shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
	pathp->objnode = NULL;
	do {
		if (slot == NULL)
			goto out;
		pathp++;
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		pathp->offset = offset;
		pathp->objnode = slot;
		slot = slot->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	} while (height > 0);
	if (slot == NULL)
		goto out;
	while (pathp->objnode) {
		pathp->objnode->slots[pathp->offset] = NULL;
		pathp->objnode->slots_in_use--;
		if (pathp->objnode->slots_in_use) {
			if (pathp->objnode == obj->objnode_tree_root) {
				while (obj->objnode_tree_height > 0 &&
				  obj->objnode_tree_root->slots_in_use == 1 &&
				  obj->objnode_tree_root->slots[0]) {
					struct tmem_objnode *to_free =
						obj->objnode_tree_root;

					obj->objnode_tree_root =
							to_free->slots[0];
					obj->objnode_tree_height--;
					to_free->slots[0] = NULL;
					to_free->slots_in_use = 0;
					tmem_objnode_free(to_free);
				}
			}
			goto out;
		}
		tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
		pathp--;
	}
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;

out:
	if (slot != NULL)
		obj->pampd_count--;
	BUG_ON(obj->pampd_count < 0);
	return slot;
}

/* recursively walk the objnode_tree destroying pampds and objnodes */
static void tmem_objnode_node_destroy(struct tmem_obj *obj,
					struct tmem_objnode *objnode,
					unsigned int ht)
{
	int i;

	if (ht == 0)
		return;
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
		if (objnode->slots[i]) {
			if (ht == 1) {
				obj->pampd_count--;
				(*tmem_pamops.free)(objnode->slots[i],
						obj->pool, NULL, 0);
				objnode->slots[i] = NULL;
				continue;
			}
			tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
			tmem_objnode_free(objnode->slots[i]);
			objnode->slots[i] = NULL;
		}
	}
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
{
	if (obj->objnode_tree_root == NULL)
		return;
	if (obj->objnode_tree_height == 0) {
		obj->pampd_count--;
		(*tmem_pamops.free)(obj->objnode_tree_root, obj->pool, NULL, 0);
	} else {
		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
					obj->objnode_tree_height);
		tmem_objnode_free(obj->objnode_tree_root);
		obj->objnode_tree_height = 0;
	}
	obj->objnode_tree_root = NULL;
	(*tmem_pamops.free_obj)(obj->pool, obj);
}

/*
 * Tmem is operated on by a set of well-defined actions:
 * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
 * (The tmem ABI allows for subpages and exchanges but these operations
 * are not included in this implementation.)
 *
 * These "tmem core" operations are implemented in the following functions.
 */
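
/*
 * Sketch of how a frontend might drive the core operations (argument
 * values are illustrative; "data" is a PAGE_SIZE buffer, and the pool,
 * oid and index together form the handle):
 *
 *	ret = tmem_put(pool, &oid, index, data, PAGE_SIZE, false, true);
 *	...
 *	ret = tmem_get(pool, &oid, index, data, &size, false, 0);
 *	...
 *	(void)tmem_flush_page(pool, &oid, index);
 */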

/*
 * "Put" a page, e.g. copy a page from the kernel into newly allocated
 * PAM space (if such space is available).  Tmem_put is complicated by
 * a corner case: What if a page with matching handle already exists in
 * tmem?  To guarantee coherency, one of two actions is necessary: Either
 * the data for the page must be overwritten, or the page must be
 * "flushed" so that the data is not accessible to a subsequent "get".
 * Since these "duplicate puts" are relatively rare, this implementation
 * always flushes for simplicity.
 */
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t size, bool raw, bool ephemeral)
{
	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
	void *pampd = NULL, *pampd_del = NULL;
	int ret = -ENOMEM;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = objfound = tmem_obj_find(hb, oidp);
	if (obj != NULL) {
		pampd = tmem_pampd_lookup_in_obj(objfound, index);
		if (pampd != NULL) {
			/* if found, is a dup put, flush the old one */
			pampd_del = tmem_pampd_delete_from_obj(obj, index);
			BUG_ON(pampd_del != pampd);
			(*tmem_pamops.free)(pampd, pool, oidp, index);
			if (obj->pampd_count == 0) {
				objnew = obj;
				objfound = NULL;
			}
			pampd = NULL;
		}
	} else {
		obj = objnew = (*tmem_hostops.obj_alloc)(pool);
		if (unlikely(obj == NULL)) {
			ret = -ENOMEM;
			goto out;
		}
		tmem_obj_init(obj, hb, pool, oidp);
	}
	BUG_ON(obj == NULL);
	BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
	pampd = (*tmem_pamops.create)(data, size, raw, ephemeral,
					obj->pool, &obj->oid, index);
	if (unlikely(pampd == NULL))
		goto free;
	ret = tmem_pampd_add_to_obj(obj, index, pampd);
	if (unlikely(ret == -ENOMEM))
		/* may have partially built objnode tree ("stump") */
		goto delete_and_free;
	goto out;

delete_and_free:
	(void)tmem_pampd_delete_from_obj(obj, index);
free:
	if (pampd)
		(*tmem_pamops.free)(pampd, pool, NULL, 0);
	if (objnew) {
		tmem_obj_free(objnew, hb);
		(*tmem_hostops.obj_free)(objnew, pool);
	}
out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * "Get" a page, e.g. if one can be found, copy the tmem page with the
 * matching handle from PAM space to the kernel.  By tmem definition,
 * when a "get" is successful on an ephemeral page, the page is "flushed",
 * and when a "get" is successful on a persistent page, the page is
 * retained in tmem.  Note that to preserve coherency, "get" can never
 * be skipped if tmem contains the data.  That is, if a get is done with
 * a certain handle and fails, any subsequent "get" must also fail
 * (unless of course there is a "put" done with the same handle).
 */
int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t *size, bool raw, int get_and_free)
{
	struct tmem_obj *obj;
	void *pampd;
	bool ephemeral = is_ephemeral(pool);
	int ret = -1;
	struct tmem_hashbucket *hb;
	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
	bool lock_held = false;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	lock_held = true;
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	if (free)
		pampd = tmem_pampd_delete_from_obj(obj, index);
	else
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	if (pampd == NULL)
		goto out;
	if (free) {
		if (obj->pampd_count == 0) {
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
			obj = NULL;
		}
	}
	if (tmem_pamops.is_remote(pampd)) {
		lock_held = false;
		spin_unlock(&hb->lock);
	}
	if (free)
		ret = (*tmem_pamops.get_data_and_free)(
				data, size, raw, pampd, pool, oidp, index);
	else
		ret = (*tmem_pamops.get_data)(
				data, size, raw, pampd, pool, oidp, index);
	if (ret < 0)
		goto out;
	ret = 0;
out:
	if (lock_held)
		spin_unlock(&hb->lock);
	return ret;
}

/*
 * If a page in tmem matches the handle, "flush" this page from tmem such
 * that any subsequent "get" does not succeed (unless, of course, there
 * was another "put" with the same handle).
 */
int tmem_flush_page(struct tmem_pool *pool,
				struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_obj *obj;
	void *pampd;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	pampd = tmem_pampd_delete_from_obj(obj, index);
	if (pampd == NULL)
		goto out;
	(*tmem_pamops.free)(pampd, pool, oidp, index);
	if (obj->pampd_count == 0) {
		tmem_obj_free(obj, hb);
		(*tmem_hostops.obj_free)(obj, pool);
	}
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * If a page in tmem matches the handle, replace the page so that any
 * subsequent "get" gets the new page.  Returns 0 if there was a page
 * to replace, else returns -1.
 */
int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
			uint32_t index, void *new_pampd)
{
	struct tmem_obj *obj;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd);
	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * "Flush" all pages in tmem matching this oid.
 */
int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
{
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb;
	int ret = -1;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	tmem_pampd_destroy_all_in_obj(obj);
	tmem_obj_free(obj, hb);
	(*tmem_hostops.obj_free)(obj, pool);
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
 * all subsequent access to this tmem_pool.
 */
int tmem_destroy_pool(struct tmem_pool *pool)
{
	int ret = -1;

	if (pool == NULL)
		goto out;
	tmem_pool_flush(pool, 1);
	ret = 0;
out:
	return ret;
}

static LIST_HEAD(tmem_global_pool_list);

/*
 * Create a new tmem_pool with the provided flags; the pool id is
 * assigned by the tmem host implementation.
 */
void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
{
	int persistent = flags & TMEM_POOL_PERSIST;
	int shared = flags & TMEM_POOL_SHARED;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		hb->obj_rb_root = RB_ROOT;
		spin_lock_init(&hb->lock);
	}
	INIT_LIST_HEAD(&pool->pool_list);
	atomic_set(&pool->obj_count, 0);
	SET_SENTINEL(pool, POOL);
	list_add_tail(&pool->pool_list, &tmem_global_pool_list);
	pool->persistent = persistent;
	pool->shared = shared;
}