/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include "dm-bufio.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/version.h>
#include <linux/shrinker.h>
#include <linux/module.h>

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_PERCENT	75
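
/*
 * Illustrative arithmetic only, not used by the code: on a hypothetical
 * machine with 4 GiB of directly mapped memory and 512 MiB of vmalloc
 * space, the defaults above give min(2% of 4 GiB, 25% of 512 MiB) =
 * min(~82 MiB, 128 MiB) = ~82 MiB of cache, shared equally between
 * clients, with background writeback kicking in once 75% of a client's
 * buffers are dirty.
 */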

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	10

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	60

/*
 * The number of bvec entries that are embedded directly in the buffer.
 * If the chunk size is larger, dm-io is used to do the io.
 */
#define DM_BUFIO_INLINE_VECS		16

/*
 * Buffer hash
 */
#define DM_BUFIO_HASH_BITS	20
#define DM_BUFIO_HASH(block) \
	((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
	 ((1 << DM_BUFIO_HASH_BITS) - 1))

/*
 * Don't try to use kmem_cache_alloc for blocks larger than this.
 * For explanation, see alloc_buffer_data below.
 */
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*
 * Linking of buffers:
 *	All buffers are linked to cache_hash with their hash_list field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	dirty_lru too.  They are later added to lru in the process
 *	context.
 */
struct dm_bufio_client {
	struct mutex lock;

	struct list_head lru[LIST_SIZE];
	unsigned long n_buffers[LIST_SIZE];

	struct block_device *bdev;
	unsigned block_size;
	unsigned char sectors_per_block_bits;
	unsigned char pages_per_block_bits;
	unsigned char blocks_per_page_bits;
	unsigned aux_size;
	void (*alloc_callback)(struct dm_buffer *);
	void (*write_callback)(struct dm_buffer *);

	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned need_reserved_buffers;

	struct hlist_head *cache_hash;
	wait_queue_head_t free_buffer_wait;

	int async_write_error;

	struct list_head client_list;
	struct shrinker shrinker;
};

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	struct hlist_node hash_list;
	struct list_head lru_list;
	sector_t block;
	void *data;
	enum data_mode data_mode;
	unsigned char list_mode;		/* LIST_* */
	unsigned hold_count;
	int read_error;
	int write_error;
	unsigned long state;
	unsigned long last_accessed;
	struct dm_bufio_client *c;
	struct bio bio;
	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
};

/*----------------------------------------------------------------*/

static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];

static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
{
	unsigned ret = c->blocks_per_page_bits - 1;

	BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));

	return ret;
}

#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static int dm_bufio_trylock(struct dm_bufio_client *c)
{
	return mutex_trylock(&c->lock);
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	mutex_unlock(&c->lock);
}

/*
 * FIXME Move to sched.h?
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY
#  define dm_bufio_cond_resched()		\
do {						\
	if (unlikely(need_resched()))		\
		_cond_resched();		\
} while (0)
#else
#  define dm_bufio_cond_resched()                do { } while (0)
#endif

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time.  If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(param_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 */
static unsigned long dm_bufio_cache_size_per_client;

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch,
 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

/*----------------------------------------------------------------*/

static void adjust_total_allocated(enum data_mode data_mode, long diff)
{
	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	spin_lock(&param_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	spin_unlock(&param_spinlock);
}

/*
 * Change the number of clients and recalculate per-client limit.
 */
static void __cache_size_refresh(void)
{
	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
	BUG_ON(dm_bufio_client_count < 0);

	dm_bufio_cache_size_latch = dm_bufio_cache_size;

	barrier();

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}

	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
					 (dm_bufio_client_count ? : 1);
}

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 * always fails (i.e. order >= MAX_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       enum data_mode *data_mode)
{
	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
	}

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->pages_per_block_bits);
	}

	*data_mode = DATA_MODE_VMALLOC;
	return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
}

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, enum data_mode data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(DM_BUFIO_CACHE(c), data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data, c->pages_per_block_bits);
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
				      gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kfree(b);
		return NULL;
	}

	adjust_total_allocated(b->data_mode, (long)c->block_size);

	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b->data_mode, -(long)c->block_size);

	free_buffer_data(c, b->data, b->data_mode);
	kfree(b);
}

/*
 * Link buffer to the hash list and clean or dirty queue.
 */
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
	struct dm_bufio_client *c = b->c;

	c->n_buffers[dirty]++;
	b->block = block;
	b->list_mode = dirty;
	list_add(&b->lru_list, &c->lru[dirty]);
	hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
	b->last_accessed = jiffies;
}

/*
 * Unlink buffer from the hash list and dirty or clean queue.
 */
static void __unlink_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	hlist_del(&b->hash_list);
	list_del(&b->lru_list);
}

/*
 * Place the buffer to the head of dirty or clean LRU queue.
 */
static void __relink_lru(struct dm_buffer *b, int dirty)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	c->n_buffers[dirty]++;
	b->list_mode = dirty;
	list_del(&b->lru_list);
	list_add(&b->lru_list, &c->lru[dirty]);
}

/*----------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * Bio interface is faster but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory-consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 *	the I/O driver can reject requests spuriously if it thinks that
 *	the requests are too big for the device or if they cross a
 *	controller-defined memory boundary.
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use dm-io layer to do the I/O.
 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 * shortcomings.
 *--------------------------------------------------------------*/

/*
 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
 * that the request was handled directly with bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
}

static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
		     bio_end_io_t *end_io)
{
	int r;
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = block << b->c->sectors_per_block_bits,
		.count = b->c->block_size >> SECTOR_SHIFT,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = b->data;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = b->data;
	}

	b->bio.bi_end_io = end_io;

	r = dm_io(&io_req, 1, &region, NULL);
	if (r)
		end_io(&b->bio, r);
}

static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
			   bio_end_io_t *end_io)
{
	char *ptr;
	int len;

	bio_init(&b->bio);
	b->bio.bi_io_vec = b->bio_vec;
	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
	b->bio.bi_sector = block << b->c->sectors_per_block_bits;
	b->bio.bi_bdev = b->c->bdev;
	b->bio.bi_end_io = end_io;

	/*
	 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
	 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
	 */
	ptr = b->data;
	len = b->c->block_size;

	if (len >= PAGE_SIZE)
		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
	else
		BUG_ON((unsigned long)ptr & (len - 1));

	do {
		if (!bio_add_page(&b->bio, virt_to_page(ptr),
				  len < PAGE_SIZE ? len : PAGE_SIZE,
				  virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
			BUG_ON(b->c->block_size <= PAGE_SIZE);
			use_dmio(b, rw, block, end_io);
			return;
		}

		len -= PAGE_SIZE;
		ptr += PAGE_SIZE;
	} while (len > 0);

	submit_bio(rw, &b->bio);
}

static void submit_io(struct dm_buffer *b, int rw, sector_t block,
		      bio_end_io_t *end_io)
{
	if (rw == WRITE && b->c->write_callback)
		b->c->write_callback(b);

	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
	    b->data_mode != DATA_MODE_VMALLOC)
		use_inline_bio(b, rw, block, end_io);
	else
		use_dmio(b, rw, block, end_io);
}

/*----------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------*/

/*
 * The endio routine for write.
 *
 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 * it.
 */
static void write_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->write_error = error;
	if (error) {
		struct dm_bufio_client *c = b->c;
		(void)cmpxchg(&c->async_write_error, 0, error);
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * This function is called when wait_on_bit is actually waiting.
 */
static int do_io_schedule(void *word)
{
	io_schedule();

	return 0;
}

/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is a previous write going on, wait for it to finish (we can't
 *   have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it. We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock(&b->state, B_WRITING,
			 do_io_schedule, TASK_UNINTERRUPTIBLE);

	submit_io(b, WRITE, b->block, write_endio);
}

/*
 * Wait until any activity on the buffer finishes.  Possibly write the
 * buffer if it is dirty.  When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(b->hold_count);

	if (!b->state)	/* fast case */
		return;

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b);
	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
		BUG_ON(test_bit(B_WRITING, &b->state));
		BUG_ON(test_bit(B_DIRTY, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	return NULL;
}

/*
 * Wait until some other thread frees a buffer or releases its hold count
 * on one.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_task_state(current, TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	io_schedule();

	set_task_state(current, TASK_RUNNING);
	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

/*
 * Allocate a new buffer. If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOIO: don't recurse into the I/O layer
	 *	__GFP_NORETRY: don't retry; return failure instead
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (!list_empty(&c->reserved_buffers)) {
			b = list_entry(c->reserved_buffers.next,
				       struct dm_buffer, lru_list);
			list_del(&b->lru_list);
			c->need_reserved_buffers++;

			return b;
		}

		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c);

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru_list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	wake_up(&c->free_buffer_wait);
}

static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
{
	struct dm_buffer *b, *tmp;

	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state)) {
			__relink_lru(b, LIST_CLEAN);
			continue;
		}

		if (no_wait && test_bit(B_WRITING, &b->state))
			return;

		__write_dirty_buffer(b);
		dm_bufio_cond_resched();
	}
}

/*
 * Get writeback threshold and buffer limit for a given client.
 */
static void __get_memory_limit(struct dm_bufio_client *c,
			       unsigned long *threshold_buffers,
			       unsigned long *limit_buffers)
{
	unsigned long buffers;

	if (dm_bufio_cache_size != dm_bufio_cache_size_latch) {
		mutex_lock(&dm_bufio_clients_lock);
		__cache_size_refresh();
		mutex_unlock(&dm_bufio_clients_lock);
	}

	buffers = dm_bufio_cache_size_per_client >>
		  (c->sectors_per_block_bits + SECTOR_SHIFT);

	if (buffers < DM_BUFIO_MIN_BUFFERS)
		buffers = DM_BUFIO_MIN_BUFFERS;

	*limit_buffers = buffers;
	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}

/*
 * Check if we're over watermark.
 * If we are over threshold_buffers, start freeing buffers.
 * If we're over "limit_buffers", block until we get under the limit.
 */
static void __check_watermark(struct dm_bufio_client *c)
{
	unsigned long threshold_buffers, limit_buffers;

	__get_memory_limit(c, &threshold_buffers, &limit_buffers);

	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
	       limit_buffers) {

		struct dm_buffer *b = __get_unclaimed_buffer(c);

		if (!b)
			return;

		__free_buffer_wake(b);
		dm_bufio_cond_resched();
	}

	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
		__write_dirty_buffers_async(c, 1);
}

/*
 * Find a buffer in the hash.
 */
static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;
	struct hlist_node *hn;

	hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)],
			     hash_list) {
		dm_bufio_cond_resched();
		if (b->block == block)
			return b;
	}

	return NULL;
}

/*----------------------------------------------------------------
 * Getting a buffer
 *--------------------------------------------------------------*/

enum new_flag {
	NF_FRESH = 0,
	NF_READ = 1,
	NF_GET = 2
};

static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, struct dm_buffer **bp,
				     int *need_submit)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	b = __find(c, block);
	if (b) {
		b->hold_count++;
		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
			     test_bit(B_WRITING, &b->state));
		return b;
	}

	if (nf == NF_GET)
		return NULL;

	new_b = __alloc_buffer_wait(c);

	/*
	 * We've had a period where the mutex was unlocked, so need to
	 * recheck the hash table.
	 */
	b = __find(c, block);
	if (b) {
		__free_buffer_wake(new_b);
		b->hold_count++;
		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
			     test_bit(B_WRITING, &b->state));
		return b;
	}

	__check_watermark(c);

	b = new_b;
	b->hold_count = 1;
	b->read_error = 0;
	b->write_error = 0;
	__link_buffer(b, block, LIST_CLEAN);

	if (nf == NF_FRESH) {
		b->state = 0;
		return b;
	}

	b->state = 1 << B_READING;
	*need_submit = 1;

	return b;
}

/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 */
static void read_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->read_error = error;

	BUG_ON(!test_bit(B_READING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_READING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_READING);
}

/*
 * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit;
	struct dm_buffer *b;

	dm_bufio_lock(c);
	b = __bufio_new(c, block, nf, bp, &need_submit);
	dm_bufio_unlock(c);

	if (!b || IS_ERR(b))
		return b;

	if (need_submit)
		submit_io(b, READ, b->block, read_endio);

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = b->read_error;

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}

void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);

void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);
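
/*
 * Illustrative usage sketch, not part of this file: a caller that already
 * has a client "c" might update one block like this ("new_contents" is a
 * stand-in for the caller's data):
 *
 *	struct dm_buffer *bp;
 *	void *data = dm_bufio_read(c, block, &bp);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	memcpy(data, new_contents, dm_bufio_get_block_size(c));
 *	dm_bufio_mark_buffer_dirty(bp);
 *	dm_bufio_release(bp);
 *	return dm_bufio_write_dirty_buffers(c);
 *
 * dm_bufio_new would replace dm_bufio_read when the caller overwrites the
 * whole block and the old contents don't matter; dm_bufio_get only returns
 * data that is already cached and never issues I/O.
 */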

void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(test_bit(B_READING, &b->state));
	BUG_ON(!b->hold_count);

	b->hold_count--;
	if (!b->hold_count) {
		wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer, and the buffer is not
		 * to be written, free the buffer. There is no point in
		 * caching an invalid buffer.
		 */
		if ((b->read_error || b->write_error) &&
		    !test_bit(B_WRITING, &b->state) &&
		    !test_bit(B_DIRTY, &b->state)) {
			__unlink_buffer(b);
			__free_buffer_wake(b);
		}
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);

void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	if (!test_and_set_bit(B_DIRTY, &b->state))
		__relink_lru(b, LIST_DIRTY);

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);

void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);
	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);

/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush hardware disk cache.
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);

again:
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
		}

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		dm_bufio_cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us.  In
		 * this case, stop, dropping the lock.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);

/*
 * Use dm-io to send an empty barrier and flush the device.
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_rw = REQ_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the hash queue for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but not relink it, because that other user needs to have the buffer
 * at the same place.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b);
	if (b->hold_count == 1) {
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		wait_on_bit_lock(&b->state, B_WRITING,
				 do_io_schedule, TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as a block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done in bufio lock, so that block number
		 * change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);
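
/*
 * Illustrative sketch, not part of this file: a hypothetical caller that
 * wants to relocate a block it holds (e.g. while compacting metadata)
 * might do
 *
 *	struct dm_buffer *bp;
 *	void *data = dm_bufio_read(c, old_block, &bp);
 *
 *	if (!IS_ERR(data))
 *		dm_bufio_release_move(bp, new_block);
 *
 * dm_bufio_release_move redirects the buffer to new_block (writing it out
 * as described above) and drops the caller's reference, so no separate
 * dm_bufio_release call is needed.
 */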

unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);

sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
	return i_size_read(c->bdev->bd_inode) >>
			   (SECTOR_SHIFT + c->sectors_per_block_bits);
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);

struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

static void drop_buffers(struct dm_bufio_client *c)
{
	struct dm_buffer *b;
	int i;

	BUG_ON(dm_bufio_in_request());

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++)
		list_for_each_entry(b, &c->lru[i], lru_list)
			DMERR("leaked buffer %llx, hold count %u, list %d",
			      (unsigned long long)b->block, b->hold_count, i);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(!list_empty(&c->lru[i]));

	dm_bufio_unlock(c);
}

/*
 * Test if the buffer is unused and too old; if so, write it back (if
 * dirty) and free it.  If noio is set, we must not do any I/O because we
 * hold dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 * rerouted to a different bufio client.
 */
static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
				unsigned long max_jiffies)
{
	if (jiffies - b->last_accessed < max_jiffies)
		return 1;

	if (!(gfp & __GFP_IO)) {
		if (test_bit(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return 1;
	}

	if (b->hold_count)
		return 1;

	__make_buffer_clean(b);
	__unlink_buffer(b);
	__free_buffer_wake(b);

	return 0;
}

static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
		   struct shrink_control *sc)
{
	int l;
	struct dm_buffer *b, *tmp;

	for (l = 0; l < LIST_SIZE; l++) {
		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list)
			if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) &&
			    !--nr_to_scan)
				return;
		dm_bufio_cond_resched();
	}
}

static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
{
	struct dm_bufio_client *c =
	    container_of(shrinker, struct dm_bufio_client, shrinker);
	unsigned long r;
	unsigned long nr_to_scan = sc->nr_to_scan;

	if (sc->gfp_mask & __GFP_IO)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return !nr_to_scan ? 0 : -1;

	if (nr_to_scan)
		__scan(c, nr_to_scan, sc);

	r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	if (r > INT_MAX)
		r = INT_MAX;

	dm_bufio_unlock(c);

	return r;
}

/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
					       unsigned reserved_buffers, unsigned aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *))
{
	int r;
	struct dm_bufio_client *c;
	unsigned i;

	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
	       (block_size & (block_size - 1)));

	c = kmalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
	if (!c->cache_hash) {
		r = -ENOMEM;
		goto bad_hash;
	}

	c->bdev = bdev;
	c->block_size = block_size;
	c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
	c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
				  ffs(block_size) - 1 - PAGE_SHIFT : 0;
	c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
				  PAGE_SHIFT - (ffs(block_size) - 1) : 0);

	c->aux_size = aux_size;
	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	for (i = 0; i < LIST_SIZE; i++) {
		INIT_LIST_HEAD(&c->lru[i]);
		c->n_buffers[i] = 0;
	}

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		INIT_HLIST_HEAD(&c->cache_hash[i]);

	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	mutex_lock(&dm_bufio_clients_lock);
	if (c->blocks_per_page_bits) {
		if (!DM_BUFIO_CACHE_NAME(c)) {
			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
			if (!DM_BUFIO_CACHE_NAME(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}

		if (!DM_BUFIO_CACHE(c)) {
			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
							      c->block_size,
							      c->block_size, 0, NULL);
			if (!DM_BUFIO_CACHE(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}
	}
	mutex_unlock(&dm_bufio_clients_lock);

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad_buffer;
		}
		__free_buffer_wake(b);
	}

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	c->shrinker.shrink = shrink;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	register_shrinker(&c->shrinker);

	return c;

bad_buffer:
bad_cache:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	vfree(c->cache_hash);
bad_hash:
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);
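
/*
 * Illustrative sketch, not part of this file: a hypothetical target with
 * an underlying device "dev" and 4 KiB metadata blocks could set the
 * interface up with
 *
 *	struct dm_bufio_client *c;
 *
 *	c = dm_bufio_client_create(dev->bdev, 4096, 1, 0, NULL, NULL);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *
 * reserving one buffer, no per-buffer aux space and no alloc/write
 * callbacks, and tear it down again with dm_bufio_client_destroy(c).
 */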

/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned i;

	drop_buffers(c);

	unregister_shrinker(&c->shrinker);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		BUG_ON(!hlist_empty(&c->cache_hash[i]));

	BUG_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (c->n_buffers[i])
			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(c->n_buffers[i]);

	dm_io_client_destroy(c->dm_io);
	vfree(c->cache_hash);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

static void cleanup_old_buffers(void)
{
	unsigned long max_age = dm_bufio_max_age;
	struct dm_bufio_client *c;

	barrier();

	if (max_age > ULONG_MAX / HZ)
		max_age = ULONG_MAX / HZ;

	mutex_lock(&dm_bufio_clients_lock);
	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
		if (!dm_bufio_trylock(c))
			continue;

		while (!list_empty(&c->lru[LIST_CLEAN])) {
			struct dm_buffer *b;
			b = list_entry(c->lru[LIST_CLEAN].prev,
				       struct dm_buffer, lru_list);
			if (__cleanup_old_buffer(b, 0, max_age * HZ))
				break;
			dm_bufio_cond_resched();
		}

		dm_bufio_unlock(c);
		dm_bufio_cond_resched();
	}
	mutex_unlock(&dm_bufio_clients_lock);
}

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_work;

static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}

/*----------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------*/

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes memory limit.
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);

	mem = (__u64)((totalram_pages - totalhigh_pages) *
		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	/*
	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
	 * in fs/proc/internal.h
	 */
	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;
	int i;

	cancel_delayed_work_sync(&dm_bufio_work);
	destroy_workqueue(dm_bufio_wq);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
		struct kmem_cache *kc = dm_bufio_caches[i];

		if (kc)
			kmem_cache_destroy(kc);
	}

	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
		kfree(dm_bufio_cache_names[i]);

	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
			__func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
			__func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	if (bug)
		BUG();
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");