1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
3 #define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
4 
5 #include "replicas_format.h"
6 
7 /*
8  * Disk accounting - KEY_TYPE_accounting - on disk format:
9  *
 * Here, the key has considerably more structure than a typical key (bpos); an
 * accounting key is 'struct disk_accounting_pos', which is a union with bpos.
12  *
 * More specifically: a key is just a multiword integer (where word endianness
 * matches native byte order), so we're treating bpos as an opaque 20 byte
 * integer and mapping struct disk_accounting_pos onto that.
16  *
17  * This is a type-tagged union of all our various subtypes; a disk accounting
18  * key can be device counters, replicas counters, et cetera - it's extensible.
19  *
20  * The value is a list of u64s or s64s; the number of counters is specific to a
21  * given accounting type.
22  *
23  * Unlike with other key types, updates are _deltas_, and the deltas are not
24  * resolved until the update to the underlying btree, done by btree write buffer
25  * flush or journal replay.
26  *
 * Journal replay in particular requires special handling. The journal tracks a
 * range of entries which may not yet have been applied to the btree - it does
 * not know definitively whether individual entries are dirty and still need to
 * be applied.
31  *
32  * To handle this, we use the version field of struct bkey, and give every
33  * accounting update a unique version number - a total ordering in time; the
34  * version number is derived from the key's position in the journal. Then
35  * journal replay can compare the version number of the key from the journal
36  * with the version number of the key in the btree to determine if a key needs
37  * to be replayed.
38  *
39  * For this to work, we must maintain this strict time ordering of updates as
40  * they are flushed to the btree, both via write buffer flush and via journal
41  * replay. This has complications for the write buffer code while journal replay
42  * is still in progress; the write buffer cannot flush any accounting keys to
43  * the btree until journal replay has finished replaying its accounting keys, or
44  * the (newer) version number of the keys from the write buffer will cause
45  * updates from journal replay to be lost.
46  */
47 
/*
 * Value of a KEY_TYPE_accounting key: the struct bch_val member followed by a
 * variable-length array of 64 bit counters. The number of counters present is
 * specific to the accounting type (third field of each entry in
 * BCH_DISK_ACCOUNTING_TYPES()).
 */
struct bch_accounting {
	struct bch_val		v;
	__u64			d[];	/* counters (flexible array member) */
};
52 
/* Upper bound on the number of counters an accounting type may declare */
#define BCH_ACCOUNTING_MAX_COUNTERS		3
54 
/*
 * x-macro table of data types; each entry is x(name, id).
 *
 * The id column is the numeric value of the corresponding BCH_DATA_<name>
 * enumerator. New entries should be appended with the next unused id so that
 * BCH_DATA_NR (which follows the last entry) stays one past the highest id.
 */
#define BCH_DATA_TYPES()		\
	x(free,		0)		\
	x(sb,		1)		\
	x(journal,	2)		\
	x(btree,	3)		\
	x(user,		4)		\
	x(cached,	5)		\
	x(parity,	6)		\
	x(stripe,	7)		\
	x(need_gc_gens,	8)		\
	x(need_discard,	9)		\
	x(unstriped,	10)

/*
 * Enumerators are pinned to the id given in BCH_DATA_TYPES() rather than to
 * their position in the list, so reordering the table cannot silently
 * renumber them. This matches how enum disk_accounting_type consumes its
 * table.
 *
 * BCH_DATA_NR takes the value following the last table entry.
 */
enum bch_data_type {
#define x(t, n) BCH_DATA_##t = n,
	BCH_DATA_TYPES()
#undef x
	BCH_DATA_NR
};
74 
data_type_is_empty(enum bch_data_type type)75 static inline bool data_type_is_empty(enum bch_data_type type)
76 {
77 	switch (type) {
78 	case BCH_DATA_free:
79 	case BCH_DATA_need_gc_gens:
80 	case BCH_DATA_need_discard:
81 		return true;
82 	default:
83 		return false;
84 	}
85 }
86 
data_type_is_hidden(enum bch_data_type type)87 static inline bool data_type_is_hidden(enum bch_data_type type)
88 {
89 	switch (type) {
90 	case BCH_DATA_sb:
91 	case BCH_DATA_journal:
92 		return true;
93 	default:
94 		return false;
95 	}
96 }
97 
/*
 * x-macro table of disk accounting types; each entry is
 * x(name, id, nr_counters):
 *
 * field 1: name
 * field 2: id
 * field 3: number of counters (max 3, see BCH_ACCOUNTING_MAX_COUNTERS)
 */

#define BCH_DISK_ACCOUNTING_TYPES()		\
	x(nr_inodes,		0,	1)	\
	x(persistent_reserved,	1,	1)	\
	x(replicas,		2,	1)	\
	x(dev_data_type,	3,	3)	\
	x(compression,		4,	3)	\
	x(snapshot,		5,	1)	\
	x(btree,		6,	1)	\
	x(rebalance_work,	7,	1)	\
	x(inum,			8,	3)
114 
115 enum disk_accounting_type {
116 #define x(f, nr, ...)	BCH_DISK_ACCOUNTING_##f	= nr,
117 	BCH_DISK_ACCOUNTING_TYPES()
118 #undef x
119 	BCH_DISK_ACCOUNTING_TYPE_NR,
120 };
121 
/*
 * No subtypes - number of inodes in the entire filesystem
 *
 * The key payload is deliberately empty: with no subtypes there is nothing
 * to store beyond the accounting type itself.
 *
 * XXX: perhaps we could add a per-subvolume counter?
 */
struct bch_acct_nr_inodes {
};
129 
/*
 * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the
 * reservation:
 */
struct bch_acct_persistent_reserved {
	__u8			nr_replicas;	/* replica count this counter is keyed on */
};
137 
/*
 * Per device, per data type accounting.
 *
 * device, data type counter fields:
 * [
 *   nr_buckets
 *   live sectors (in buckets of that data type)
 *   sectors of internal fragmentation
 * ]
 *
 * XXX: live sectors should've been done differently, you can have multiple data
 * types in the same bucket (user, stripe, cached) and this collapses them to
 * the bucket data type, and makes the internal fragmentation counter redundant
 */
struct bch_acct_dev_data_type {
	__u8			dev;		/* device the counters apply to */
	__u8			data_type;	/* presumably an enum bch_data_type value - confirm */
};
154 
/*
 * Compression type fields:
 * [
 *   number of extents
 *   uncompressed size
 *   compressed size
 * ]
 *
 * From these counters one can derive the compression ratio (uncompressed vs.
 * compressed size) and the average extent size (fragmentation).
 */
struct bch_acct_compression {
	__u8			type;	/* compression type the counters apply to */
};
168 
/*
 * On disk usage by snapshot id; counts same values as replicas counter, but
 * aggregated differently
 */
struct bch_acct_snapshot {
	__u32			id;	/* snapshot id the usage is accounted to */
} __packed;
176 
/*
 * Accounting keyed by 'id'; the type table declares a single counter for this
 * type.
 *
 * NOTE(review): 'id' presumably identifies a btree and the counter presumably
 * tracks that btree's on disk usage - neither is visible in this header,
 * confirm against the code that updates it.
 */
struct bch_acct_btree {
	__u32			id;
} __packed;
180 
/*
 * Per inode number accounting.
 *
 * inum counter fields:
 * [
 *   number of extents
 *   sum of extent sizes - bkey size
 *     this field is similar to inode.bi_sectors, except here extents in
 *     different snapshots but the same inode number are all collapsed to the
 *     same counter
 *   sum of on disk size - same values tracked by replicas counters
 * ]
 *
 * This tracks on disk fragmentation.
 */
struct bch_acct_inum {
	__u64			inum;	/* inode number the counters apply to */
} __packed;
197 
/*
 * Simple counter of the amount of data (on disk sectors) rebalance needs to
 * move, extents counted here are also in the rebalance_work btree.
 *
 * The key payload is empty: this struct declares no fields to key on.
 */
struct bch_acct_rebalance_work {
};
204 
/*
 * Position (key) of an accounting entry: a one byte type tag followed by the
 * payload for that type, overlaid on a struct bpos.
 *
 * - 'type' selects which member of the inner union is meaningful; presumably
 *   it holds an enum disk_accounting_type value - confirm against users.
 * - The inner struct and union are __packed so the payload starts directly
 *   after the type byte.
 * - '_pad' is never meant to carry data of its own; it makes the outer union
 *   at least as large as struct bpos.
 */
struct disk_accounting_pos {
	union {
	struct {
		__u8				type;
		union {
		struct bch_acct_nr_inodes	nr_inodes;
		struct bch_acct_persistent_reserved	persistent_reserved;
		struct bch_replicas_entry_v1	replicas;
		struct bch_acct_dev_data_type	dev_data_type;
		struct bch_acct_compression	compression;
		struct bch_acct_snapshot	snapshot;
		struct bch_acct_btree		btree;
		struct bch_acct_rebalance_work	rebalance_work;
		struct bch_acct_inum		inum;
		} __packed;
	} __packed;
		struct bpos			_pad;
	};
};
224 
225 #endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */
226