1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
3 #define _BCACHEFS_DISK_ACCOUNTING_FORMAT_H
4
5 #include "replicas_format.h"
6
7 /*
8 * Disk accounting - KEY_TYPE_accounting - on disk format:
9 *
 * Here, the key has considerably more structure than a typical key (bpos); an
 * accounting key is 'struct disk_accounting_pos', which is a union with bpos.
12 *
 * More specifically: a key is just a multiword integer (where word endianness
 * matches native byte order), so we're treating bpos as an opaque 20 byte
 * integer and mapping the accounting key (struct disk_accounting_pos) to that.
16 *
17 * This is a type-tagged union of all our various subtypes; a disk accounting
18 * key can be device counters, replicas counters, et cetera - it's extensible.
19 *
20 * The value is a list of u64s or s64s; the number of counters is specific to a
21 * given accounting type.
22 *
23 * Unlike with other key types, updates are _deltas_, and the deltas are not
24 * resolved until the update to the underlying btree, done by btree write buffer
25 * flush or journal replay.
26 *
 * Journal replay in particular requires special handling. The journal tracks a
 * range of entries which may not yet have been applied to the btree - it does
 * not know definitively whether individual entries are dirty and still need to
 * be applied.
31 *
32 * To handle this, we use the version field of struct bkey, and give every
33 * accounting update a unique version number - a total ordering in time; the
34 * version number is derived from the key's position in the journal. Then
35 * journal replay can compare the version number of the key from the journal
36 * with the version number of the key in the btree to determine if a key needs
37 * to be replayed.
38 *
39 * For this to work, we must maintain this strict time ordering of updates as
40 * they are flushed to the btree, both via write buffer flush and via journal
41 * replay. This has complications for the write buffer code while journal replay
42 * is still in progress; the write buffer cannot flush any accounting keys to
43 * the btree until journal replay has finished replaying its accounting keys, or
44 * the (newer) version number of the keys from the write buffer will cause
45 * updates from journal replay to be lost.
46 */
47
48 struct bch_accounting {
49 struct bch_val v;
50 __u64 d[];
51 };
52
53 #define BCH_ACCOUNTING_MAX_COUNTERS 3
54
/*
 * Data types, as x(name, id).
 *
 * The id column is the numeric value of the matching BCH_DATA_* enumerator.
 * This header describes on disk formats, so treat existing ids as fixed and
 * give new entries an unused id instead of renumbering.
 */
#define BCH_DATA_TYPES()		\
	x(free,		0)		\
	x(sb,		1)		\
	x(journal,	2)		\
	x(btree,	3)		\
	x(user,		4)		\
	x(cached,	5)		\
	x(parity,	6)		\
	x(stripe,	7)		\
	x(need_gc_gens,	8)		\
	x(need_discard,	9)		\
	x(unstriped,	10)

/*
 * Enumerator values are taken from the id column of BCH_DATA_TYPES() rather
 * than from list position, so reordering the list cannot silently renumber a
 * type. BCH_DATA_NR has no explicit value: it is one more than the id of the
 * final list entry.
 */
enum bch_data_type {
#define x(t, n) BCH_DATA_##t = n,
	BCH_DATA_TYPES()
#undef x
	BCH_DATA_NR
};
74
data_type_is_empty(enum bch_data_type type)75 static inline bool data_type_is_empty(enum bch_data_type type)
76 {
77 switch (type) {
78 case BCH_DATA_free:
79 case BCH_DATA_need_gc_gens:
80 case BCH_DATA_need_discard:
81 return true;
82 default:
83 return false;
84 }
85 }
86
data_type_is_hidden(enum bch_data_type type)87 static inline bool data_type_is_hidden(enum bch_data_type type)
88 {
89 switch (type) {
90 case BCH_DATA_sb:
91 case BCH_DATA_journal:
92 return true;
93 default:
94 return false;
95 }
96 }
97
/*
 * Accounting types, as x(name, id, nr_counters):
 *
 *  name:		suffix of the BCH_DISK_ACCOUNTING_* enumerator
 *  id:			numeric value of that enumerator
 *  nr_counters:	number of counters (max 3)
 */

#define BCH_DISK_ACCOUNTING_TYPES()		\
	x(nr_inodes,		0,	1)	\
	x(persistent_reserved,	1,	1)	\
	x(replicas,		2,	1)	\
	x(dev_data_type,	3,	3)	\
	x(compression,		4,	3)	\
	x(snapshot,		5,	1)	\
	x(btree,		6,	1)	\
	x(rebalance_work,	7,	1)	\
	x(inum,			8,	3)

/* Only the name and id columns are used here; the counter count is ignored */
enum disk_accounting_type {
#define x(name, id, ...) BCH_DISK_ACCOUNTING_##name = id,
	BCH_DISK_ACCOUNTING_TYPES()
#undef x
	BCH_DISK_ACCOUNTING_TYPE_NR,
};
121
/*
 * nr_inodes: number of inodes in the entire filesystem. There are no
 * subtypes, so this struct has no fields.
 *
 * XXX: perhaps we could add a per-subvolume counter?
 */
struct bch_acct_nr_inodes {
};
129
130 /*
131 * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the
132 * reservation:
133 */
134 struct bch_acct_persistent_reserved {
135 __u8 nr_replicas;
136 };
137
138 /*
139 * device, data type counter fields:
140 * [
141 * nr_buckets
142 * live sectors (in buckets of that data type)
143 * sectors of internal fragmentation
144 * ]
145 *
146 * XXX: live sectors should've been done differently, you can have multiple data
147 * types in the same bucket (user, stripe, cached) and this collapses them to
148 * the bucket data type, and makes the internal fragmentation counter redundant
149 */
150 struct bch_acct_dev_data_type {
151 __u8 dev;
152 __u8 data_type;
153 };
154
155 /*
156 * Compression type fields:
157 * [
158 * number of extents
159 * uncompressed size
160 * compressed size
161 * ]
162 *
163 * Compression ratio, average extent size (fragmentation).
164 */
165 struct bch_acct_compression {
166 __u8 type;
167 };
168
169 /*
170 * On disk usage by snapshot id; counts same values as replicas counter, but
171 * aggregated differently
172 */
173 struct bch_acct_snapshot {
174 __u32 id;
175 } __packed;
176
177 struct bch_acct_btree {
178 __u32 id;
179 } __packed;
180
181 /*
182 * inum counter fields:
183 * [
184 * number of extents
185 * sum of extent sizes - bkey size
186 * this field is similar to inode.bi_sectors, except here extents in
187 * different snapshots but the same inode number are all collapsed to the
188 * same counter
189 * sum of on disk size - same values tracked by replicas counters
190 * ]
191 *
192 * This tracks on disk fragmentation.
193 */
194 struct bch_acct_inum {
195 __u64 inum;
196 } __packed;
197
/*
 * rebalance_work: one plain counter of how much data (in on disk sectors)
 * rebalance still has to move. Extents counted here are also present in the
 * rebalance_work btree. This struct has no fields.
 */
struct bch_acct_rebalance_work {
};
204
/*
 * Position of an accounting key: a one byte type tag followed by the fields
 * of whichever accounting type the tag selects, overlaid on a struct bpos.
 *
 * Because _pad is a member of the outer union, the whole struct is at least
 * as large as a struct bpos regardless of which subtype is in use.
 */
struct disk_accounting_pos {
	union {
	struct {
		/* type tag; presumably an enum disk_accounting_type value - confirm */
		__u8				type;
		/* type-specific key fields, one member per accounting type */
		union {
		struct bch_acct_nr_inodes	nr_inodes;
		struct bch_acct_persistent_reserved	persistent_reserved;
		struct bch_replicas_entry_v1	replicas;
		struct bch_acct_dev_data_type	dev_data_type;
		struct bch_acct_compression	compression;
		struct bch_acct_snapshot	snapshot;
		struct bch_acct_btree		btree;
		struct bch_acct_rebalance_work	rebalance_work;
		struct bch_acct_inum		inum;
		} __packed;
	} __packed;
		/* overlay used only to size the union to a struct bpos */
		struct bpos			_pad;
	};
};
224
225 #endif /* _BCACHEFS_DISK_ACCOUNTING_FORMAT_H */
226