/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 */
6
7 #ifndef BTRFS_RAID56_H
8 #define BTRFS_RAID56_H
9
10 #include <linux/types.h>
11 #include <linux/list.h>
12 #include <linux/spinlock.h>
13 #include <linux/bio.h>
14 #include <linux/refcount.h>
15 #include <linux/workqueue.h>
16 #include "volumes.h"
17
18 struct page;
19 struct btrfs_fs_info;
20
/* The high-level operation a btrfs_raid_bio is performing. */
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
};
26
/*
 * Overview of btrfs_raid_bio.
 *
 * One btrfs_raid_bio represents a full stripe of RAID56, including both data
 * and P/Q stripes. For now, each data and P/Q stripe is of a fixed length (64K).
 *
 * One btrfs_raid_bio can have one or more bios from higher layer, covering
 * part or all of the data stripes.
 *
 * [PAGES FROM HIGHER LAYER BIOS]
 * Higher layer bios are in the btrfs_raid_bio::bio_list.
 *
 * Pages from the bio_list are represented like the following:
 *
 * bio_list:	|<- Bio 1 ->|  |<- Bio 2 ->| ...
 * bio_paddrs:	[0] [1] [2]    [3] [4] [5]  ...
 *
 * If there is a bio covering a sector (one btrfs fs block), the corresponding
 * pointer in btrfs_raid_bio::bio_paddrs[] will point to the physical address
 * (with the offset inside the page) of the corresponding bio.
 *
 * If there is no bio covering a sector, then btrfs_raid_bio::bio_paddrs[i] will
 * be INVALID_PADDR.
 *
 * The length of each entry in bio_paddrs[] is a step (aka, min(sectorsize, PAGE_SIZE)).
 *
 * [PAGES FOR INTERNAL USAGES]
 * Pages not covered by any bio or belonging to P/Q stripes are stored in
 * btrfs_raid_bio::stripe_pages[] and stripe_paddrs[], like the following:
 *
 * stripe_pages:	|<- Page 0 ->|<- Page 1 ->| ...
 * stripe_paddrs:	[0] [1]       [2] [3]      ...
 *
 * stripe_pages[] array stores all the pages covering the full stripe, including
 * data and P/Q pages.
 * stripe_pages[0] is the first page of the first data stripe.
 * stripe_pages[BTRFS_STRIPE_LEN / PAGE_SIZE] is the first page of the second
 * data stripe.
 *
 * Some pointers inside stripe_pages[] can be NULL, e.g. for a full stripe write
 * (the bio covers all data stripes) there is no need to allocate pages for
 * data stripes (can grab from bio_paddrs[]).
 *
 * If the corresponding page of stripe_paddrs[i] is not allocated, the value of
 * stripe_paddrs[i] will be INVALID_PADDR.
 *
 * The length of each entry in stripe_paddrs[] is a step.
 *
 * [LOCATING A SECTOR]
 * To locate a sector for IO, we need the following info:
 *
 * - stripe_nr
 *   Starts from 0 (representing the first data stripe), ends at
 *   @nr_data (RAID5, P stripe) or @nr_data + 1 (RAID6, Q stripe).
 *
 * - sector_nr
 *   Starts from 0 (representing the first sector of the stripe), ends
 *   at BTRFS_STRIPE_LEN / sectorsize - 1.
 *
 * - step_nr
 *   A step is min(sectorsize, PAGE_SIZE).
 *
 *   Starts from 0 (representing the first step of the sector), ends
 *   at @sector_nsteps - 1.
 *
 *   For most call sites they do not need to bother this parameter.
 *   It is for bs > ps support and only for vertical stripe related works.
 *   (e.g. RMW/recover)
 *
 * - from which array
 *   Whether grabbing from stripe_paddrs[] (aka, internal pages) or from the
 *   bio_paddrs[] (aka, from the higher layer bios).
 *
 * For IO, a physical address is returned, so that we can extract the page and
 * the offset inside the page for IO.
 * A special value INVALID_PADDR represents when the physical address is invalid,
 * normally meaning there is no page allocated for the specified sector.
 */
105 struct btrfs_raid_bio {
106 struct btrfs_io_context *bioc;
107
108 /*
109 * While we're doing RMW on a stripe we put it into a hash table so we
110 * can lock the stripe and merge more rbios into it.
111 */
112 struct list_head hash_list;
113
114 /* LRU list for the stripe cache */
115 struct list_head stripe_cache;
116
117 /* For scheduling work in the helper threads */
118 struct work_struct work;
119
120 /*
121 * bio_list and bio_list_lock are used to add more bios into the stripe
122 * in hopes of avoiding the full RMW
123 */
124 struct bio_list bio_list;
125 spinlock_t bio_list_lock;
126
127 /*
128 * Also protected by the bio_list_lock, the plug list is used by the
129 * plugging code to collect partial bios while plugged. The stripe
130 * locking code also uses it to hand off the stripe lock to the next
131 * pending IO.
132 */
133 struct list_head plug_list;
134
135 /* Flags that tell us if it is safe to merge with this bio. */
136 unsigned long flags;
137
138 /*
139 * Set if we're doing a parity rebuild for a read from higher up, which
140 * is handled differently from a parity rebuild as part of RMW.
141 */
142 enum btrfs_rbio_ops operation;
143
144 /* How many pages there are for the full stripe including P/Q */
145 u16 nr_pages;
146
147 /* How many sectors there are for the full stripe including P/Q */
148 u16 nr_sectors;
149
150 /* Number of data stripes (no p/q) */
151 u8 nr_data;
152
153 /* Number of all stripes (including P/Q) */
154 u8 real_stripes;
155
156 /* How many pages there are for each stripe */
157 u8 stripe_npages;
158
159 /* How many sectors there are for each stripe */
160 u8 stripe_nsectors;
161
162 /*
163 * How many steps there are for one sector.
164 *
165 * For bs > ps cases, it's sectorsize / PAGE_SIZE.
166 * For bs <= ps cases, it's always 1.
167 */
168 u8 sector_nsteps;
169
170 /* Stripe number that we're scrubbing */
171 u8 scrubp;
172
173 /*
174 * Size of all the bios in the bio_list. This helps us decide if the
175 * rbio maps to a full stripe or not.
176 */
177 int bio_list_bytes;
178
179 refcount_t refs;
180
181 atomic_t stripes_pending;
182
183 wait_queue_head_t io_wait;
184
185 /* Bitmap to record which horizontal stripe has data */
186 unsigned long dbitmap;
187
188 /* Allocated with stripe_nsectors-many bits for finish_*() calls */
189 unsigned long finish_pbitmap;
190
191 /*
192 * These are two arrays of pointers. We allocate the rbio big enough
193 * to hold them both and setup their locations when the rbio is
194 * allocated.
195 */
196
197 /*
198 * Pointers to pages that we allocated for reading/writing stripes
199 * directly from the disk (including P/Q).
200 */
201 struct page **stripe_pages;
202
203 /* Pointers to the sectors in the bio_list, for faster lookup */
204 phys_addr_t *bio_paddrs;
205
206 /* Pointers to the sectors in the stripe_pages[]. */
207 phys_addr_t *stripe_paddrs;
208
209 /* Each set bit means the corresponding sector in stripe_sectors[] is uptodate. */
210 unsigned long *stripe_uptodate_bitmap;
211
212 /* Allocated with real_stripes-many pointers for finish_*() calls */
213 void **finish_pointers;
214
215 /*
216 * The bitmap recording where IO errors happened.
217 * Each bit is corresponding to one sector in either bio_sectors[] or
218 * stripe_sectors[] array.
219 */
220 unsigned long *error_bitmap;
221
222 /*
223 * Checksum buffer if the rbio is for data. The buffer should cover
224 * all data sectors (excluding P/Q sectors).
225 */
226 u8 *csum_buf;
227
228 /*
229 * Each bit represents if the corresponding sector has data csum found.
230 * Should only cover data sectors (excluding P/Q sectors).
231 */
232 unsigned long *csum_bitmap;
233 };
234
/*
 * For trace event usage only. Records useful debug info for each bio submitted
 * by RAID56 to each physical device.
 *
 * No matter signed or not, (-1) is always the one indicating we can not grab
 * the proper stripe number.
 */
242 struct raid56_bio_trace_info {
243 u64 devid;
244
245 /* The offset inside the stripe. (<= STRIPE_LEN) */
246 u32 offset;
247
248 /*
249 * Stripe number.
250 * 0 is the first data stripe, and nr_data for P stripe,
251 * nr_data + 1 for Q stripe.
252 * >= real_stripes for
253 */
254 u8 stripe_nr;
255 };
256
nr_data_stripes(const struct btrfs_chunk_map * map)257 static inline int nr_data_stripes(const struct btrfs_chunk_map *map)
258 {
259 return map->num_stripes - btrfs_nr_parity_stripes(map->type);
260 }
261
nr_bioc_data_stripes(const struct btrfs_io_context * bioc)262 static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc)
263 {
264 return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type);
265 }
266
/* Special logical stripe values identifying the P and Q parity stripes. */
#define RAID5_P_STRIPE		((u64)-2)
#define RAID6_Q_STRIPE		((u64)-1)

/* True if logical stripe value @x refers to a P or Q parity stripe. */
#define is_parity_stripe(x)	(((x) == RAID5_P_STRIPE) ||	\
				 ((x) == RAID6_Q_STRIPE))
272
273 struct btrfs_device;
274
275 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
276 int mirror_num);
277 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);
278
279 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
280 struct btrfs_io_context *bioc,
281 struct btrfs_device *scrub_dev,
282 unsigned long *dbitmap, int stripe_nsectors);
283 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);
284
285 void raid56_parity_cache_data_folios(struct btrfs_raid_bio *rbio,
286 struct folio **data_folios, u64 data_logical);
287
288 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
289 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
290
291 #endif
292