fs/bcachefs/journal.h

1 /* SPDX-License-Identifier: GPL-2.0 */
9  * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
11  * Without the journal, the b-tree is always internally consistent on
12  * disk - and in fact, in the earliest incarnations bcache didn't have a journal
17  * (for simplicity) - this may change eventually but updates to interior nodes
20  * This means the journal is relatively separate from the b-tree; it consists of
27  * disk), the journal entry will be written out immediately (or as soon as
45  * numbers - jset->seq; other places in the code refer to this sequence number.
48  * into the b-tree). We need a container to indicate which b-tree the key is
49  * for; also, the roots of the various b-trees are stored in jset_entry entries
50  * (one for each b-tree) - this lets us add new b-tree types without changing
51  * the on disk format.
54  * superblock - all the things that are frequently updated. This is for future
60  * JOURNAL LAYOUT ON DISK:
63  * superblock); the individual buckets are not necessarily contiguous on disk
68  * managed by the allocator and garbage collection - garbage collection marks
73  * Open/dirty journal entries are journal entries that contain b-tree updates
74  * that have not yet been written out to the b-tree on disk. We have to track
79  * On disk, this is represented with the "last_seq" field of struct jset;
82  * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
83  * journal_device->seq) of for each journal bucket, the highest sequence number
89  * (where each entry corresponds to a specific sequence number) - when a ref
92  * Journalling of index updates is done at the same time as the b-tree itself is
93  * being modified (see btree_insert_key()); when we add the key to the journal
94  * the pending b-tree write takes a ref on the journal entry the key was added
95  * to. If a pending b-tree write would need to take refs on multiple dirty
108  * allocate space for a journal write again - preferentially flushing btree
120 	wake_up(&j->wait);  in journal_wake()
121 	closure_wake_up(&j->async_wait);  in journal_wake()
128 	return j->pin.front;  in journal_last_seq()
133 	return atomic64_read(&j->seq);  in journal_cur_seq()
138 	return j->seq_ondisk + 1;  in journal_last_unwritten_seq()
145 			~JOURNAL_STATE_BUF_MASK) + j->reservations.idx;  in journal_cur_buf()
147 	return j->buf + idx;  in journal_cur_buf()
164 	if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR)  in journal_state_seq_count()
172 	s->buf0_count += s->idx == 0;  in journal_state_inc()
173 	s->buf1_count += s->idx == 1;  in journal_state_inc()
174 	s->buf2_count += s->idx == 2;  in journal_state_inc()
175 	s->buf3_count += s->idx == 3;  in journal_state_inc()
189 	return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;  in journal_entry_overhead()
195 	struct jset *jset = buf->data;  in bch2_journal_add_entry_noreservation()
196 	struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));  in bch2_journal_add_entry_noreservation()
199 	entry->u64s = cpu_to_le16(u64s);  in bch2_journal_add_entry_noreservation()
201 	le32_add_cpu(&jset->u64s, jset_u64s(u64s));  in bch2_journal_add_entry_noreservation()
209 	return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset);  in journal_res_entry()
216 	entry->u64s	= cpu_to_le16(u64s);  in journal_entry_init()
217 	entry->btree_id = id;  in journal_entry_init()
218 	entry->level	= level;  in journal_entry_init()
219 	entry->type	= type;  in journal_entry_init()
220 	entry->pad[0]	= 0;  in journal_entry_init()
221 	entry->pad[1]	= 0;  in journal_entry_init()
222 	entry->pad[2]	= 0;  in journal_entry_init()
232 	memcpy_u64s_small(entry->_data, data, u64s);  in journal_entry_set()
244 	EBUG_ON(!res->ref);  in bch2_journal_add_entry()
245 	EBUG_ON(actual > res->u64s);  in bch2_journal_add_entry()
247 	res->offset	+= actual;  in bch2_journal_add_entry()
248 	res->u64s	-= actual;  in bch2_journal_add_entry()
254 	if (j->seq != j->last_seq)  in journal_entry_empty()
258 		if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s)  in journal_entry_empty()
275 				    }).v, &j->reservations.counter);  in journal_state_buf_put()
300 		spin_lock(&j->lock);  in bch2_journal_buf_put()
302 		spin_unlock(&j->lock);  in bch2_journal_buf_put()
304 		wake_up(&j->wait);  in bch2_journal_buf_put()
309  * then proceed to add their keys as well.
314 	if (!res->ref)  in bch2_journal_res_put()
317 	lock_release(&j->res_map, _THIS_IP_);  in bch2_journal_res_put()
319 	while (res->u64s)  in bch2_journal_res_put()
324 	bch2_journal_buf_put(j, res->seq);  in bch2_journal_res_put()
326 	res->ref = 0;  in bch2_journal_res_put()
347 	old.v = atomic64_read(&j->reservations.counter);  in journal_res_get_fast()
357 		if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)  in journal_res_get_fast()
362 		if ((flags & BCH_WATERMARK_MASK) < j->watermark)  in journal_res_get_fast()
365 		new.cur_entry_offset += res->u64s;  in journal_res_get_fast()
370 		 * XXX - tracepoint this:  in journal_res_get_fast()
377 	} while (!atomic64_try_cmpxchg(&j->reservations.counter,  in journal_res_get_fast()
380 	res->ref	= true;  in journal_res_get_fast()
381 	res->offset	= old.cur_entry_offset;  in journal_res_get_fast()
382 	res->seq	= journal_cur_seq(j);  in journal_res_get_fast()
383 	res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK;  in journal_res_get_fast()
393 	EBUG_ON(res->ref);  in bch2_journal_res_get()
394 	EBUG_ON(!test_bit(JOURNAL_running, &j->flags));  in bch2_journal_res_get()
396 	res->u64s = u64s;  in bch2_journal_res_get()
406 		lock_acquire_shared(&j->res_map, 0,  in bch2_journal_res_get()
409 		EBUG_ON(!res->ref);  in bch2_journal_res_get()
410 		BUG_ON(!res->seq);  in bch2_journal_res_get()
434 	return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL  in bch2_journal_error()
435 		? -BCH_ERR_journal_shutdown : 0;  in bch2_journal_error()