1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _BCACHEFS_JOURNAL_TYPES_H
3 #define _BCACHEFS_JOURNAL_TYPES_H
4 
5 #include <linux/cache.h>
6 #include <linux/workqueue.h>
7 
8 #include "alloc_types.h"
9 #include "super_types.h"
10 #include "fifo.h"
11 
/*
 * Number of journal buffers, expressed as a power of two: JOURNAL_BUF_NR is
 * 1 << JOURNAL_BUF_BITS (i.e. 4) and JOURNAL_BUF_MASK is JOURNAL_BUF_NR - 1
 * (i.e. 3). JOURNAL_BUF_BITS equals the width of the idx and unwritten_idx
 * bitfields in union journal_res_state, and JOURNAL_BUF_NR sizes
 * journal::buf[].
 */
12 #define JOURNAL_BUF_BITS	2
13 #define JOURNAL_BUF_NR		(1U << JOURNAL_BUF_BITS)
14 #define JOURNAL_BUF_MASK	(JOURNAL_BUF_NR - 1)
15 
16 /*
17  * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
18  * the journal that are being staged or in flight.
19  */
20 struct journal_buf {
21 	struct jset		*data;		/* the entry itself; @buf_size bytes */
22 
23 	__BKEY_PADDED(key, BCH_REPLICAS_MAX);
24 	struct bch_devs_list	devs_written;	/* NOTE(review): presumably devices this write reached - confirm */
25 
26 	struct closure_waitlist	wait;
27 	u64			last_seq;	/* copy of data->last_seq */
28 	long			expires;	/* NOTE(review): presumably a jiffies deadline - confirm */
29 	u64			flush_time;
30 
31 	unsigned		buf_size;	/* size in bytes of @data */
32 	unsigned		sectors;	/* maximum size for current entry */
33 	unsigned		disk_sectors;	/* maximum size entry could have been, if
34 						   buf_size was bigger */
35 	unsigned		u64s_reserved;	/* NOTE(review): presumably snapshot of journal::entry_u64s_reserved - confirm */
36 	bool			noflush;	/* write has already been kicked off, and was noflush */
37 	bool			must_flush;	/* something wants a flush */
38 	bool			separate_flush;
39 	bool			need_flush_to_write_buffer;
40 };
41 
42 /*
43  * Something that makes a journal entry dirty - i.e. a btree node that has to be
44  * flushed:
45  */
46 
/*
 * Kinds of journal pin. Used as the index into
 * journal_entry_pin_list::list[]; JOURNAL_PIN_NR is the number of kinds (and
 * the size of that array), so it has to remain the last enumerator.
 */
47 enum journal_pin_type {
48 	JOURNAL_PIN_btree,
49 	JOURNAL_PIN_key_cache,
50 	JOURNAL_PIN_other,
51 	JOURNAL_PIN_NR,
52 };
53 
/*
 * Element type of the journal::pin FIFO (journal::pin.data points at an array
 * of these); see the comment on journal::pin for the reference counting rules.
 */
54 struct journal_entry_pin_list {
55 	struct list_head		list[JOURNAL_PIN_NR];	/* one list head per enum journal_pin_type */
56 	struct list_head		flushed;	/* NOTE(review): presumably pins whose flush already ran - confirm */
57 	atomic_t			count;		/* the reference count described at journal::pin */
58 	struct bch_devs_list		devs;
59 };
60 
/*
 * Forward declarations, so the callback type below can refer to both structs
 * before they are defined.
 *
 * journal_pin_flush_fn is the type of journal_entry_pin::flush: it receives
 * the journal, the pin and a u64, and returns an int.
 * NOTE(review): the u64 is presumably a journal sequence number, and the
 * return value presumably 0 or a negative error - confirm against callers.
 */
61 struct journal;
62 struct journal_entry_pin;
63 typedef int (*journal_pin_flush_fn)(struct journal *j,
64 				struct journal_entry_pin *, u64);
65 
/*
 * A single journal pin.
 * NOTE(review): presumably embedded in whatever object keeps a journal entry
 * dirty and linked onto one of the lists in struct journal_entry_pin_list -
 * confirm against the pin add/drop code.
 */
66 struct journal_entry_pin {
67 	struct list_head		list;	/* list linkage */
68 	journal_pin_flush_fn		flush;	/* flush callback, see journal_pin_flush_fn */
69 	u64				seq;	/* NOTE(review): presumably seq of the pinned journal entry - confirm */
70 };
71 
/*
 * A journal reservation.
 * NOTE(review): the field meanings below are inferred from names and from
 * union journal_res_state - confirm against the reservation get/put code.
 */
72 struct journal_res {
73 	bool			ref;	/* NOTE(review): presumably set while a buffer ref is held - confirm */
74 	u8			idx;	/* NOTE(review): presumably index into journal::buf[] - confirm */
75 	u16			u64s;	/* NOTE(review): presumably reservation size, in u64s - confirm */
76 	u32			offset;	/* NOTE(review): presumably offset within the entry, in u64s - confirm */
77 	u64			seq;	/* NOTE(review): presumably seq of the entry reserved in - confirm */
78 };
79 
/*
 * Journal reservation state: a single 64-bit word with three overlapping
 * views - an atomic64_t (counter), a plain u64 (v), and the individual
 * bitfields below, which add up to exactly 64 bits (20 + 2 + 2 + 4 * 10).
 *
 * How bitfields are placed within the u64 is decided by the compiler/ABI, so
 * the declaration order and widths must not be changed casually.
 */
80 union journal_res_state {
81 	struct {
82 		atomic64_t	counter;
83 	};
84 
85 	struct {
86 		u64		v;
87 	};
88 
89 	struct {
90 		u64		cur_entry_offset:20,	/* in u64s; also holds the JOURNAL_ENTRY_*_VAL sentinels */
91 				idx:2,			/* NOTE(review): presumably index of the open buffer in journal::buf[] - confirm */
92 				unwritten_idx:2,	/* NOTE(review): presumably index of the oldest unwritten buffer - confirm */
93 				buf0_count:10,		/* bufN_count: one 10 bit count per journal buffer */
94 				buf1_count:10,		/* NOTE(review): presumably reference counts - confirm */
95 				buf2_count:10,
96 				buf3_count:10;
97 	};
98 };
99 
100 /* Minimum and maximum journal entry size, in bytes: */
101 #define JOURNAL_ENTRY_SIZE_MIN		(64U << 10) /* 64k */
102 #define JOURNAL_ENTRY_SIZE_MAX		(4U  << 20) /* 4M */
103 
104 /*
105  * We stash some journal state as sentinel values in cur_entry_offset:
106  * note - cur_entry_offset is in units of u64s
107  */
108 #define JOURNAL_ENTRY_OFFSET_MAX	((1U << 20) - 1)	/* largest value of the 20 bit cur_entry_offset field */
109 
110 #define JOURNAL_ENTRY_CLOSED_VAL	(JOURNAL_ENTRY_OFFSET_MAX - 1)	/* second largest value */
111 #define JOURNAL_ENTRY_ERROR_VAL		(JOURNAL_ENTRY_OFFSET_MAX)	/* largest value */
112 
/*
 * Element type of journal::space[], which is indexed by
 * enum journal_space_from.
 */
113 struct journal_space {
114 	/* Units of 512 bytes sectors: */
115 	unsigned	next_entry; /* How big the next journal entry can be */
116 	unsigned	total;      /* NOTE(review): presumably total space available - confirm */
117 };
118 
/*
 * Index into journal::space[]; journal_space_nr is the number of entries in
 * that array, so it has to remain the last enumerator.
 */
119 enum journal_space_from {
120 	journal_space_discarded,
121 	journal_space_clean_ondisk,
122 	journal_space_clean,
123 	journal_space_total,
124 	journal_space_nr,
125 };
126 
/*
 * NOTE(review): presumably bit numbers (not masks) for journal::flags, which
 * is an unsigned long - confirm against the set_bit()/test_bit() users.
 */
127 enum journal_flags {
128 	JOURNAL_REPLAY_DONE,
129 	JOURNAL_STARTED,
130 	JOURNAL_MAY_SKIP_FLUSH,
131 	JOURNAL_NEED_FLUSH_WRITE,
132 };
133 
134 /*
 * Reasons we may fail to get a journal reservation.
 *
 * This is an x-macro list: each x(n) line becomes one JOURNAL_ERR_n
 * enumerator in enum journal_errors below. "ok" is listed first, so
 * JOURNAL_ERR_ok is 0.
 */
135 #define JOURNAL_ERRORS()		\
136 	x(ok)				\
137 	x(blocked)			\
138 	x(max_in_flight)		\
139 	x(journal_full)			\
140 	x(journal_pin_full)		\
141 	x(journal_stuck)		\
142 	x(insufficient_devices)
143 
144 enum journal_errors {
145 #define x(n)	JOURNAL_ERR_##n,
146 	JOURNAL_ERRORS()
147 #undef x
148 };
149 
/* DARRAY() of u64; the type of journal::early_journal_entries. */
150 typedef DARRAY(u64)		darray_u64;
151 
152 /*
 * Embedded in struct bch_fs.
 *
 * Both the leading anonymous struct (reservation state + watermark) and the
 * struct as a whole are aligned to SMP_CACHE_BYTES.
 */
153 struct journal {
154 	/* Fastpath stuff up front: */
155 	struct {
156 
157 	union journal_res_state reservations;
158 	enum bch_watermark	watermark;
159 
160 	} __aligned(SMP_CACHE_BYTES);
161 
162 	unsigned long		flags;	/* NOTE(review): presumably bits from enum journal_flags - confirm */
163 
164 	/* Max size of current journal entry */
165 	unsigned		cur_entry_u64s;
166 	unsigned		cur_entry_sectors;
167 
168 	/* Reserved space in journal entry to be used just prior to write */
169 	unsigned		entry_u64s_reserved;
170 
171 
172 	/*
173 	 * JOURNAL_ERR_ok, or the reason we may currently fail to get a journal
174 	 * reservation (see JOURNAL_ERRORS()).
	 *
	 * NOTE(review): this comment used to describe errno values (-ENOSPC if
	 * waiting on journal reclaim, -EROFS if insufficient devices), which
	 * does not match the enum type; presumably those cases are now
	 * JOURNAL_ERR_journal_full and JOURNAL_ERR_insufficient_devices -
	 * confirm.
175 	 */
176 	enum journal_errors	cur_entry_error;
177 
178 	unsigned		buf_size_want;
179 	/*
180 	 * We may queue up some things to be journalled (log messages) before
181 	 * the journal has actually started - stash them here:
182 	 */
183 	darray_u64		early_journal_entries;
184 
185 	/*
186 	 * Protects journal_buf->data, when accessing without a journal
187 	 * reservation: for synchronization between the btree write buffer code
188 	 * and the journal write path:
189 	 */
190 	struct mutex		buf_lock;
191 	/*
192 	 * JOURNAL_BUF_NR journal entry buffers, used for writes to the journal
193 	 * that are being staged or in flight (see struct journal_buf).
194 	 */
195 	struct journal_buf	buf[JOURNAL_BUF_NR];
196 
197 	spinlock_t		lock;
198 
199 	/* if nonzero, we may not open a new journal entry: */
200 	unsigned		blocked;
201 
202 	/* Used when waiting because the journal was full */
203 	wait_queue_head_t	wait;
204 	struct closure_waitlist	async_wait;
205 
206 	struct closure		io;
207 	struct delayed_work	write_work;
208 
209 	/* Sequence number of most recent journal entry (last entry in @pin) */
210 	atomic64_t		seq;
211 
212 	/* seq, last_seq from the most recent journal entry successfully written */
213 	u64			seq_ondisk;
214 	u64			flushed_seq_ondisk;
215 	u64			last_seq_ondisk;
216 	u64			err_seq;
217 	u64			last_empty_seq;
218 
219 	/*
220 	 * FIFO of journal entries whose btree updates have not yet been
221 	 * written out.
222 	 *
223 	 * Each entry is a reference count. The position in the FIFO is the
224 	 * entry's sequence number relative to @seq.
225 	 *
226 	 * The journal entry itself holds a reference count, put when the
227 	 * journal entry is written out. Each btree node modified by the journal
228 	 * entry also holds a reference count, put when the btree node is
229 	 * written.
230 	 *
231 	 * When a reference count reaches zero, the journal entry is no longer
232 	 * needed. When all journal entries in the oldest journal bucket are no
233 	 * longer needed, the bucket can be discarded and reused.
234 	 */
235 	struct {
236 		u64 front, back, size, mask;
237 		struct journal_entry_pin_list *data;
238 	}			pin;
239 
240 	struct journal_space	space[journal_space_nr];	/* indexed by enum journal_space_from */
241 
242 	u64			replay_journal_seq;
243 	u64			replay_journal_seq_end;
244 
245 	struct write_point	wp;
246 	spinlock_t		err_lock;
247 
248 	struct mutex		reclaim_lock;
249 	/*
250 	 * Used for waiting until journal reclaim has freed up space in the
251 	 * journal:
252 	 */
253 	wait_queue_head_t	reclaim_wait;
254 	struct task_struct	*reclaim_thread;
255 	bool			reclaim_kicked;
256 	unsigned long		next_reclaim;
257 	u64			nr_direct_reclaim;
258 	u64			nr_background_reclaim;
259 
260 	unsigned long		last_flushed;
261 	struct journal_entry_pin *flush_in_progress;
262 	bool			flush_in_progress_dropped;
263 	wait_queue_head_t	pin_flush_wait;
264 
265 	/* protects advancing ja->discard_idx (see struct journal_device): */
266 	struct mutex		discard_lock;
267 	bool			can_discard;
268 
269 	unsigned long		last_flush_write;
270 
271 	u64			write_start_time;
272 
273 	u64			nr_flush_writes;
274 	u64			nr_noflush_writes;
275 	u64			entry_bytes_written;
276 
277 	u64			low_on_space_start;
278 	u64			low_on_pin_start;
279 	u64			max_in_flight_start;
280 	u64			write_buffer_full_start;
281 
282 	struct bch2_time_stats	*flush_write_time;
283 	struct bch2_time_stats	*noflush_write_time;
284 	struct bch2_time_stats	*flush_seq_time;
285 
286 #ifdef CONFIG_DEBUG_LOCK_ALLOC
287 	struct lockdep_map	res_map;	/* only present when CONFIG_DEBUG_LOCK_ALLOC is set */
288 #endif
289 } __aligned(SMP_CACHE_BYTES);
290 
291 /*
292  * Embedded in struct bch_dev. First three fields refer to the array of journal
293  * buckets, in bch_sb.
 *
 * NOTE(review): with the current field order it is not obvious which "three
 * fields" the sentence above means - confirm, or reword once verified.
294  */
295 struct journal_device {
296 	/*
297 	 * For each journal bucket, contains the max sequence number of the
298 	 * journal writes it contains - so we know when a bucket can be reused.
299 	 */
300 	u64			*bucket_seq;
301 
302 	unsigned		sectors_free;
303 
304 	/*
305 	 * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
306 	 */
307 	unsigned		discard_idx;		/* Next bucket to discard */
308 	unsigned		dirty_idx_ondisk;
309 	unsigned		dirty_idx;
310 	unsigned		cur_idx;		/* Journal bucket we're currently writing to */
311 	unsigned		nr;			/* NOTE(review): presumably number of journal buckets - confirm */
312 
313 	u64			*buckets;
314 
315 	/* Bio for journal reads/writes to this device */
316 	struct bio		*bio;
317 
318 	/* for bch_journal_read_device */
319 	struct closure		read;
320 };
321 
322 /*
323  * journal_entry_res - reserve space in every journal entry:
324  */
325 struct journal_entry_res {
326 	unsigned		u64s;	/* NOTE(review): presumably the amount reserved, in u64s - confirm */
327 };
328 
329 #endif /* _BCACHEFS_JOURNAL_TYPES_H */
330