// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2022 Fujitsu. All Rights Reserved.
 */

#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_bit.h"
#include "xfs_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_trans.h"
#include "xfs_ag.h"
#include "xfs_notify_failure.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"

#include <linux/mm.h>
#include <linux/dax.h>
#include <linux/fs.h>

struct xfs_failure_info {
	xfs_agblock_t		startblock;
	xfs_extlen_t		blockcount;
	int			mf_flags;
	bool			want_shutdown;
};

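/*
 * Convert the failed device range into a page offset within the file that
 * this rmap record maps: start from the mapped file offset and, if the
 * failure begins inside the mapped extent, skip ahead by the difference.
 */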
static pgoff_t
xfs_failure_pgoff(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	loff_t				pos = XFS_FSB_TO_B(mp, rec->rm_offset);

	if (notify->startblock > rec->rm_startblock)
		pos += XFS_FSB_TO_B(mp,
				notify->startblock - rec->rm_startblock);
	return pos >> PAGE_SHIFT;
}

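/*
 * Count the pages in the intersection of the failed range and the mapped
 * extent, i.e. [max of the two start blocks, min of the two end blocks).
 */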
static unsigned long
xfs_failure_pgcnt(
	struct xfs_mount		*mp,
	const struct xfs_rmap_irec	*rec,
	const struct xfs_failure_info	*notify)
{
	xfs_agblock_t			end_rec;
	xfs_agblock_t			end_notify;
	xfs_agblock_t			start_cross;
	xfs_agblock_t			end_cross;

	start_cross = max(rec->rm_startblock, notify->startblock);

	end_rec = rec->rm_startblock + rec->rm_blockcount;
	end_notify = notify->startblock + notify->blockcount;
	end_cross = min(end_rec, end_notify);

	return XFS_FSB_TO_B(mp, end_cross - start_cross) >> PAGE_SHIFT;
}

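/*
 * Callback for xfs_rmap_query_range().  For each reverse mapping that
 * overlaps the failed range, kill the processes that have the affected
 * file pages mapped.  Failures in metadata (or in attr fork / bmbt
 * blocks) cannot be handled this way, so flag the filesystem for
 * shutdown instead, unless the device is merely about to be removed.
 */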
static int
xfs_dax_failure_fn(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_failure_info		*notify = data;
	struct address_space		*mapping;
	pgoff_t				pgoff;
	unsigned long			pgcnt;
	int				error = 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
	    (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
		/* Continue the query because this isn't a failure. */
		if (notify->mf_flags & MF_MEM_PRE_REMOVE)
			return 0;
		notify->want_shutdown = true;
		return 0;
	}

	/* Get the files that are incore; filter out those not in use. */
	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
			0, &ip);
	/* Continue the rmap query if the inode isn't incore. */
	if (error == -ENODATA)
		return 0;
	if (error) {
		notify->want_shutdown = true;
		return 0;
	}

	mapping = VFS_I(ip)->i_mapping;
	pgoff = xfs_failure_pgoff(mp, rec, notify);
	pgcnt = xfs_failure_pgcnt(mp, rec, notify);

	/* Continue the rmap query if the inode isn't a dax file. */
	if (dax_mapping(mapping))
		error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
				notify->mf_flags);

	/* Invalidate the cached dax pages before the device goes away. */
	if (notify->mf_flags & MF_MEM_PRE_REMOVE)
		invalidate_inode_pages2_range(mapping, pgoff,
				pgoff + pgcnt - 1);

	xfs_irele(ip);
	return error;
}

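/*
 * Freeze the filesystem on behalf of the kernel so that no new DAX
 * mappings can be established while the failure is being handled.
 */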
static int
xfs_dax_notify_failure_freeze(
	struct xfs_mount	*mp)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	error = freeze_super(sb, FREEZE_HOLDER_KERNEL, NULL);
	if (error)
		xfs_emerg(mp, "already frozen by kernel, err=%d", error);

	return error;
}

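/*
 * Undo the kernel-held freeze taken above, and drop any userspace-held
 * freeze as well since the device is going away.
 */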
static void
xfs_dax_notify_failure_thaw(
	struct xfs_mount	*mp,
	bool			kernel_frozen)
{
	struct super_block	*sb = mp->m_super;
	int			error;

	if (kernel_frozen) {
		error = thaw_super(sb, FREEZE_HOLDER_KERNEL, NULL);
		if (error)
			xfs_emerg(mp, "still frozen after notify failure, err=%d",
				error);
	}

	/*
	 * Also thaw any userspace-held freeze, because the device is about
	 * to be removed immediately.
	 */
	thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
}

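/*
 * Translate an absolute byte range on the DAX device into a disk address
 * and length in 512-byte basic blocks relative to the start of the
 * filesystem area on that device, clamping the range to the filesystem
 * boundaries.
 */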
static int
xfs_dax_translate_range(
	struct xfs_buftarg	*btp,
	u64			offset,
	u64			len,
	xfs_daddr_t		*daddr,
	uint64_t		*bblen)
{
	u64			dev_start = btp->bt_dax_part_off;
	u64			dev_len = bdev_nr_bytes(btp->bt_bdev);
	u64			dev_end = dev_start + dev_len - 1;

	/* Notify failure on the whole device. */
	if (offset == 0 && len == U64_MAX) {
		offset = dev_start;
		len = dev_len;
	}

	/* Ignore ranges entirely outside the filesystem area. */
	if (offset + len - 1 < dev_start)
		return -ENXIO;
	if (offset > dev_end)
		return -ENXIO;

	/* Rebase the range so that it is relative to the filesystem start. */
	if (offset > dev_start)
		offset -= dev_start;
	else {
		len -= dev_start - offset;
		offset = 0;
	}
	/* Clamp the (now relative) range to the end of the filesystem area. */
	if (offset + len > dev_len)
		len = dev_len - offset;

	*daddr = BTOBB(offset);
	*bblen = BTOBB(len);
	return 0;
}

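/*
 * A media failure on the external log device means the on-disk log is no
 * longer trustworthy; unless the device is only about to be removed, shut
 * the filesystem down.
 */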
static int
xfs_dax_notify_logdev_failure(
	struct xfs_mount	*mp,
	u64			offset,
	u64			len,
	int			mf_flags)
{
	xfs_daddr_t		daddr;
	uint64_t		bblen;
	int			error;

	/*
	 * Return ENXIO instead of shutting down the filesystem if the failed
	 * region is beyond the end of the log.
	 */
	error = xfs_dax_translate_range(mp->m_logdev_targp,
			offset, len, &daddr, &bblen);
	if (error)
		return error;

	/*
	 * In the pre-remove case the failure notification is attempting to
	 * trigger a force unmount.  The expectation is that the device is
	 * still present, but its removal is in progress and cannot be
	 * cancelled; proceed with accessing the log device.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		return 0;

	xfs_err(mp, "ondisk log corrupt, shutting down fs!");
	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
	return -EFSCORRUPTED;
}

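/*
 * Handle a media failure on the data or realtime device: walk the rmap
 * btree of every allocation/realtime group that the failed range touches
 * and notify the owners of all affected file pages.
 */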
static int
xfs_dax_notify_dev_failure(
	struct xfs_mount	*mp,
	u64			offset,
	u64			len,
	int			mf_flags,
	enum xfs_group_type	type)
{
	struct xfs_failure_info	notify = { .mf_flags = mf_flags };
	struct xfs_trans	*tp = NULL;
	struct xfs_btree_cur	*cur = NULL;
	int			error = 0;
	bool			kernel_frozen = false;
	uint32_t		start_gno, end_gno;
	xfs_fsblock_t		start_bno, end_bno;
	xfs_daddr_t		daddr;
	uint64_t		bblen;
	struct xfs_group	*xg = NULL;

	if (!xfs_has_rmapbt(mp)) {
		xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
		return -EOPNOTSUPP;
	}

	error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type),
			offset, len, &daddr, &bblen);
	if (error)
		return error;

	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
	} else {
		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
	}

	if (mf_flags & MF_MEM_PRE_REMOVE) {
		xfs_info(mp, "Device is about to be removed!");
		/*
		 * Freeze the fs to prevent new mappings from being created.
		 * - Keep going if someone else already holds a kernel freeze.
		 * - Keep going on other errors too, because this device is
		 *   starting to fail.
		 * - If the kernel freeze is taken successfully here, thaw it
		 *   here as well at the end.
		 */
		kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
	}

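	/* Use an empty transaction for the read-only rmap queries below. */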
	tp = xfs_trans_alloc_empty(mp);
	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
		struct xfs_buf		*agf_bp = NULL;
		struct xfs_rtgroup	*rtg = NULL;
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;

		if (type == XG_TYPE_AG) {
			struct xfs_perag	*pag = to_perag(xg);

			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
			if (error) {
				xfs_perag_put(pag);
				break;
			}

			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
		} else {
			rtg = to_rtg(xg);
			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
		}

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * the [start, end] range where we are looking for the files
		 * or metadata.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		if (xg->xg_gno == start_gno)
			ri_low.rm_startblock =
				xfs_fsb_to_gbno(mp, start_bno, type);
		if (xg->xg_gno == end_gno)
			ri_high.rm_startblock =
				xfs_fsb_to_gbno(mp, end_bno, type);

		notify.startblock = ri_low.rm_startblock;
		notify.blockcount = min(xg->xg_block_count,
					ri_high.rm_startblock + 1) -
					ri_low.rm_startblock;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_dax_failure_fn, &notify);
		xfs_btree_del_cursor(cur, error);
		if (agf_bp)
			xfs_trans_brelse(tp, agf_bp);
		if (rtg)
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
		if (error) {
			xfs_group_put(xg);
			break;
		}
	}

	xfs_trans_cancel(tp);

	/*
	 * In the pre-remove case, shut the fs down via a forced unmount,
	 * which cannot fail, so any error can be ignored.  Otherwise, shut
	 * the filesystem down with the CORRUPT flag if an error occurred or
	 * notify.want_shutdown was set during the rmap query.
	 */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
	else if (error || notify.want_shutdown) {
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
		if (!error)
			error = -EFSCORRUPTED;
	}

	/* Thaw the fs if it has been frozen before. */
	if (mf_flags & MF_MEM_PRE_REMOVE)
		xfs_dax_notify_failure_thaw(mp, kernel_frozen);

	return error;
}

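/*
 * Entry point for the dax holder's ->notify_failure method: route the
 * failed range to whichever of the data, log or realtime device backs it.
 */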
static int
xfs_dax_notify_failure(
	struct dax_device	*dax_dev,
	u64			offset,
	u64			len,
	int			mf_flags)
{
	struct xfs_mount	*mp = dax_holder(dax_dev);

	if (!(mp->m_super->s_flags & SB_BORN)) {
		xfs_warn(mp, "filesystem is not ready for notify_failure()!");
		return -EIO;
	}

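	/* An external log device has no rmap info; handle it separately. */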
	if (mp->m_logdev_targp != mp->m_ddev_targp &&
	    mp->m_logdev_targp->bt_daxdev == dax_dev) {
		return xfs_dax_notify_logdev_failure(mp, offset, len, mf_flags);
	}

	return xfs_dax_notify_dev_failure(mp, offset, len, mf_flags,
			(mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) ?
				XG_TYPE_RTG : XG_TYPE_AG);
}

const struct dax_holder_operations xfs_dax_holder_operations = {
	.notify_failure		= xfs_dax_notify_failure,
};