// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * aops.c - NTFS kernel address space operations and page cache handling.
 *
 * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
 * Copyright (c) 2002 Richard Russon
 */

#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/bit_spinlock.h>
#include <linux/bio.h>

#include "aops.h"
#include "attrib.h"
#include "debug.h"
#include "inode.h"
#include "mft.h"
#include "runlist.h"
#include "types.h"
#include "ntfs.h"

/**
 * ntfs_end_buffer_async_read - async io completion for reading attributes
 * @bh:		buffer head on which io is completed
 * @uptodate:	whether @bh is now uptodate or not
 *
 * Asynchronous I/O completion handler for reading pages belonging to the
 * attribute address space of an inode.  The inodes can either be files or
 * directories or they can be fake inodes describing some attribute.
 *
 * If NInoMstProtected(), perform the post read mst fixups when all IO on the
 * page has been completed and mark the page uptodate or set the error bit on
 * the page.  To determine the size of the records that need fixing up, we
 * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
 * record size, and index_block_size_bits, to the log(base 2) of the ntfs
 * record size.
 */
static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first, *tmp;
	struct page *page;
	struct inode *vi;
	ntfs_inode *ni;
	int page_uptodate = 1;

	page = bh->b_page;
	vi = page->mapping->host;
	ni = NTFS_I(vi);

	if (likely(uptodate)) {
		loff_t i_size;
		s64 file_ofs, init_size;

		set_buffer_uptodate(bh);

		file_ofs = ((s64)page->index << PAGE_SHIFT) +
				bh_offset(bh);
		read_lock_irqsave(&ni->size_lock, flags);
		init_size = ni->initialized_size;
		i_size = i_size_read(vi);
		read_unlock_irqrestore(&ni->size_lock, flags);
		if (unlikely(init_size > i_size)) {
			/* Race with shrinking truncate. */
			init_size = i_size;
		}
		/* Check for the current buffer head overflowing. */
		if (unlikely(file_ofs + bh->b_size > init_size)) {
			int ofs;
			void *kaddr;

			ofs = 0;
			if (file_ofs < init_size)
				ofs = init_size - file_ofs;
			kaddr = kmap_atomic(page);
			memset(kaddr + bh_offset(bh) + ofs, 0,
					bh->b_size - ofs);
			flush_dcache_page(page);
			kunmap_atomic(kaddr);
		}
	} else {
		clear_buffer_uptodate(bh);
		SetPageError(page);
		ntfs_error(ni->vol->sb, "Buffer I/O error, logical block "
				"0x%llx.", (unsigned long long)bh->b_blocknr);
	}
	first = page_buffers(page);
	spin_lock_irqsave(&first->b_uptodate_lock, flags);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	tmp = bh;
	do {
		if (!buffer_uptodate(tmp))
			page_uptodate = 0;
		if (buffer_async_read(tmp)) {
			if (likely(buffer_locked(tmp)))
				goto still_busy;
			/* Async buffers must be locked. */
			BUG();
		}
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
	/*
	 * If none of the buffers had errors then we can set the page uptodate,
	 * but we first have to perform the post read mst fixups, if the
	 * attribute is mst protected, i.e. if NInoMstProtected(ni) is true.
	 * Note we ignore fixup errors as those are detected when
	 * map_mft_record() is called which gives us per record granularity
	 * rather than per page granularity.
	 */
	if (!NInoMstProtected(ni)) {
		if (likely(page_uptodate && !PageError(page)))
			SetPageUptodate(page);
	} else {
		u8 *kaddr;
		unsigned int i, recs;
		u32 rec_size;

		rec_size = ni->itype.index.block_size;
		recs = PAGE_SIZE / rec_size;
		/* Should have been verified before we got here... */
		BUG_ON(!recs);
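		/*
		 * Example (assumed, typical geometry): for an mft record size
		 * of 1024 bytes, index_block_size is 1024 and
		 * index_block_size_bits is 10, so a 4096-byte page carries
		 * recs = 4 records which get their fixups applied below.
		 */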
		kaddr = kmap_atomic(page);
		for (i = 0; i < recs; i++)
			post_read_mst_fixup((NTFS_RECORD*)(kaddr +
					i * rec_size), rec_size);
		kunmap_atomic(kaddr);
		flush_dcache_page(page);
		if (likely(page_uptodate && !PageError(page)))
			SetPageUptodate(page);
	}
	unlock_page(page);
	return;
still_busy:
	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
	return;
}

/**
 * ntfs_read_block - fill a @folio of an address space with data
 * @folio:	page cache folio to fill with data
 *
 * We read each buffer asynchronously and when all buffers are read in, our io
 * completion handler ntfs_end_buffer_async_read(), if required, automatically
 * applies the mst fixups to the folio before finally marking it uptodate and
 * unlocking it.
 *
 * We only enforce the allocated_size limit because i_size is checked for in
 * generic_file_read().
 *
 * Return 0 on success and -errno on error.
 *
 * Contains an adapted version of fs/buffer.c::block_read_full_folio().
 */
static int ntfs_read_block(struct folio *folio)
{
	loff_t i_size;
	VCN vcn;
	LCN lcn;
	s64 init_size;
	struct inode *vi;
	ntfs_inode *ni;
	ntfs_volume *vol;
	runlist_element *rl;
	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
	sector_t iblock, lblock, zblock;
	unsigned long flags;
	unsigned int blocksize, vcn_ofs;
	int i, nr;
	unsigned char blocksize_bits;

	vi = folio->mapping->host;
	ni = NTFS_I(vi);
	vol = ni->vol;

	/* $MFT/$DATA must have its complete runlist in memory at all times. */
	BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));

	blocksize = vol->sb->s_blocksize;
	blocksize_bits = vol->sb->s_blocksize_bits;

	head = folio_buffers(folio);
	if (!head)
		head = create_empty_buffers(folio, blocksize, 0);
	bh = head;

	/*
	 * We may be racing with truncate.  To avoid some of the problems we
	 * now take a snapshot of the various sizes and use those for the whole
	 * of the function.  In case of an extending truncate it just means we
	 * may leave some buffers unmapped which are now allocated.  This is
	 * not a problem since these buffers will just get mapped when a write
	 * occurs.  In case of a shrinking truncate, we will detect this later
	 * on due to the runlist being incomplete and if the folio is being
	 * fully truncated, truncate will throw it away as soon as we unlock
	 * it so no need to worry what we do with it.
	 */
	iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
	read_lock_irqsave(&ni->size_lock, flags);
	lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
	init_size = ni->initialized_size;
	i_size = i_size_read(vi);
	read_unlock_irqrestore(&ni->size_lock, flags);
	if (unlikely(init_size > i_size)) {
		/* Race with shrinking truncate. */
		init_size = i_size;
	}
	zblock = (init_size + blocksize - 1) >> blocksize_bits;

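	/*
	 * Worked example (assumed sizes): with 512-byte blocks and 4 KiB
	 * pages, folio->index 2 gives iblock 16; allocated_size 10000 gives
	 * lblock 20 and initialized_size 9000 gives zblock 18, so buffers 16
	 * and 17 are read from disk and the remaining buffers of the folio
	 * are simply zeroed below.
	 */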
	/* Loop through all the buffers in the folio. */
	rl = NULL;
	nr = i = 0;
	do {
		int err = 0;

		if (unlikely(buffer_uptodate(bh)))
			continue;
		if (unlikely(buffer_mapped(bh))) {
			arr[nr++] = bh;
			continue;
		}
		bh->b_bdev = vol->sb->s_bdev;
		/* Is the block within the allowed limits? */
		if (iblock < lblock) {
			bool is_retry = false;

			/* Convert iblock into corresponding vcn and offset. */
			vcn = (VCN)iblock << blocksize_bits >>
					vol->cluster_size_bits;
			vcn_ofs = ((VCN)iblock << blocksize_bits) &
					vol->cluster_size_mask;
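			/*
			 * Example (assumed geometry): with 512-byte blocks
			 * and 4 KiB clusters, iblock 10 sits at byte offset
			 * 5120, giving vcn 1 and vcn_ofs 1024; if vcn 1 maps
			 * to lcn 100 below, b_blocknr becomes
			 * ((100 << 12) + 1024) >> 9 = 802.
			 */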
			if (!rl) {
lock_retry_remap:
				down_read(&ni->runlist.lock);
				rl = ni->runlist.rl;
			}
			if (likely(rl != NULL)) {
				/* Seek to element containing target vcn. */
				while (rl->length && rl[1].vcn <= vcn)
					rl++;
				lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
			} else
				lcn = LCN_RL_NOT_MAPPED;
			/* Successful remap. */
			if (lcn >= 0) {
				/* Setup buffer head to correct block. */
				bh->b_blocknr = ((lcn << vol->cluster_size_bits)
						+ vcn_ofs) >> blocksize_bits;
				set_buffer_mapped(bh);
				/* Only read initialized data blocks. */
				if (iblock < zblock) {
					arr[nr++] = bh;
					continue;
				}
				/* Fully non-initialized data block, zero it. */
				goto handle_zblock;
			}
			/* It is a hole, need to zero it. */
			if (lcn == LCN_HOLE)
				goto handle_hole;
			/* If first try and runlist unmapped, map and retry. */
			if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
				is_retry = true;
				/*
				 * Attempt to map runlist, dropping lock for
				 * the duration.
				 */
				up_read(&ni->runlist.lock);
				err = ntfs_map_runlist(ni, vcn);
				if (likely(!err))
					goto lock_retry_remap;
				rl = NULL;
			} else if (!rl)
				up_read(&ni->runlist.lock);
			/*
			 * If buffer is outside the runlist, treat it as a
			 * hole.  This can happen due to concurrent truncate
			 * for example.
			 */
			if (err == -ENOENT || lcn == LCN_ENOENT) {
				err = 0;
				goto handle_hole;
			}
			/* Hard error, zero out region. */
			if (!err)
				err = -EIO;
			bh->b_blocknr = -1;
			folio_set_error(folio);
			ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
					"attribute type 0x%x, vcn 0x%llx, "
					"offset 0x%x because its location on "
					"disk could not be determined%s "
					"(error code %i).", ni->mft_no,
					ni->type, (unsigned long long)vcn,
					vcn_ofs, is_retry ? " even after "
					"retrying" : "", err);
		}
		/*
		 * Either iblock was outside lblock limits or
		 * ntfs_rl_vcn_to_lcn() returned error.  Just zero that portion
		 * of the folio and set the buffer uptodate.
		 */
handle_hole:
		bh->b_blocknr = -1UL;
		clear_buffer_mapped(bh);
handle_zblock:
		folio_zero_range(folio, i * blocksize, blocksize);
		if (likely(!err))
			set_buffer_uptodate(bh);
	} while (i++, iblock++, (bh = bh->b_this_page) != head);

	/* Release the lock if we took it. */
	if (rl)
		up_read(&ni->runlist.lock);

	/* Check we have at least one buffer ready for i/o. */
	if (nr) {
		struct buffer_head *tbh;

		/* Lock the buffers. */
		for (i = 0; i < nr; i++) {
			tbh = arr[i];
			lock_buffer(tbh);
			tbh->b_end_io = ntfs_end_buffer_async_read;
			set_buffer_async_read(tbh);
		}
		/* Finally, start i/o on the buffers. */
		for (i = 0; i < nr; i++) {
			tbh = arr[i];
			if (likely(!buffer_uptodate(tbh)))
				submit_bh(REQ_OP_READ, tbh);
			else
				ntfs_end_buffer_async_read(tbh, 1);
		}
		return 0;
	}
	/* No i/o was scheduled on any of the buffers. */
	if (likely(!folio_test_error(folio)))
		folio_mark_uptodate(folio);
	else /* Signal synchronous i/o error. */
		nr = -EIO;
	folio_unlock(folio);
	return nr;
}

/**
 * ntfs_read_folio - fill a @folio of a @file with data from the device
 * @file:	open file to which the folio @folio belongs or NULL
 * @folio:	page cache folio to fill with data
 *
 * For non-resident attributes, ntfs_read_folio() fills the @folio of the open
 * file @file by calling the ntfs version of the generic block_read_full_folio()
 * function, ntfs_read_block(), which in turn creates and reads in the buffers
 * associated with the folio asynchronously.
 *
 * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the
 * data from the mft record (which at this stage is most likely in memory) and
 * fills the remainder with zeroes.  Thus, in this case, I/O is synchronous, as
 * even if the mft record is not cached at this point in time, we need to wait
 * for it to be read in before we can do the copy.
 *
 * Return 0 on success and -errno on error.
 */
static int ntfs_read_folio(struct file *file, struct folio *folio)
{
	struct page *page = &folio->page;
	loff_t i_size;
	struct inode *vi;
	ntfs_inode *ni, *base_ni;
	u8 *addr;
	ntfs_attr_search_ctx *ctx;
	MFT_RECORD *mrec;
	unsigned long flags;
	u32 attr_len;
	int err = 0;

retry_readpage:
	BUG_ON(!PageLocked(page));
	vi = page->mapping->host;
	i_size = i_size_read(vi);
	/* Is the page fully outside i_size? (truncate in progress) */
	if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
			PAGE_SHIFT)) {
		zero_user(page, 0, PAGE_SIZE);
		ntfs_debug("Read outside i_size - truncated?");
		goto done;
	}
	/*
	 * This can potentially happen because we clear PageUptodate() during
	 * ntfs_writepage() of MstProtected() attributes.
	 */
	if (PageUptodate(page)) {
		unlock_page(page);
		return 0;
	}
	ni = NTFS_I(vi);
	/*
	 * Only $DATA attributes can be encrypted and only unnamed $DATA
	 * attributes can be compressed.  Index root can have the flags set but
	 * this means to create compressed/encrypted files, not that the
	 * attribute is compressed/encrypted.  Note we need to check for
	 * AT_INDEX_ALLOCATION since this is the type of both directory and
	 * index inodes.
	 */
	if (ni->type != AT_INDEX_ALLOCATION) {
		/* If attribute is encrypted, deny access, just like NT4. */
		if (NInoEncrypted(ni)) {
			BUG_ON(ni->type != AT_DATA);
			err = -EACCES;
			goto err_out;
		}
		/* Compressed data streams are handled in compress.c. */
		if (NInoNonResident(ni) && NInoCompressed(ni)) {
			BUG_ON(ni->type != AT_DATA);
			BUG_ON(ni->name_len);
			return ntfs_read_compressed_block(page);
		}
	}
	/* NInoNonResident() == NInoIndexAllocPresent() */
	if (NInoNonResident(ni)) {
		/* Normal, non-resident data stream. */
		return ntfs_read_block(folio);
	}
	/*
	 * Attribute is resident, implying it is not compressed or encrypted.
	 * This also means the attribute is smaller than an mft record and
	 * hence smaller than a page, so can simply zero out any pages with
	 * index above 0.  Note the attribute can actually be marked compressed
	 * but if it is resident the actual data is not compressed so we are
	 * ok to ignore the compressed flag here.
	 */
	if (unlikely(page->index > 0)) {
		zero_user(page, 0, PAGE_SIZE);
		goto done;
	}
	if (!NInoAttr(ni))
		base_ni = ni;
	else
		base_ni = ni->ext.base_ntfs_ino;
	/* Map, pin, and lock the mft record. */
	mrec = map_mft_record(base_ni);
	if (IS_ERR(mrec)) {
		err = PTR_ERR(mrec);
		goto err_out;
	}
	/*
	 * If a parallel write made the attribute non-resident, drop the mft
	 * record and retry the read_folio.
	 */
	if (unlikely(NInoNonResident(ni))) {
		unmap_mft_record(base_ni);
		goto retry_readpage;
	}
	ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
	if (unlikely(!ctx)) {
		err = -ENOMEM;
		goto unm_err_out;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (unlikely(err))
		goto put_unm_err_out;
	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
	read_lock_irqsave(&ni->size_lock, flags);
	if (unlikely(attr_len > ni->initialized_size))
		attr_len = ni->initialized_size;
	i_size = i_size_read(vi);
	read_unlock_irqrestore(&ni->size_lock, flags);
	if (unlikely(attr_len > i_size)) {
		/* Race with shrinking truncate. */
		attr_len = i_size;
	}
	addr = kmap_atomic(page);
	/* Copy the data to the page. */
	memcpy(addr, (u8*)ctx->attr +
			le16_to_cpu(ctx->attr->data.resident.value_offset),
			attr_len);
	/* Zero the remainder of the page. */
	memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
	flush_dcache_page(page);
	kunmap_atomic(addr);
put_unm_err_out:
	ntfs_attr_put_search_ctx(ctx);
unm_err_out:
	unmap_mft_record(base_ni);
done:
	SetPageUptodate(page);
err_out:
	unlock_page(page);
	return err;
}

#ifdef NTFS_RW

/**
 * ntfs_write_block - write a @folio to the backing store
 * @folio:	page cache folio to write out
 * @wbc:	writeback control structure
 *
 * This function is for writing folios belonging to non-resident, non-mst
 * protected attributes to their backing store.
 *
 * For a folio with buffers, map and write the dirty buffers asynchronously
 * under folio writeback. For a folio without buffers, create buffers for the
 * folio, then proceed as above.
 *
 * If a folio doesn't have buffers the folio dirty state is definitive. If
 * a folio does have buffers, the folio dirty state is just a hint,
 * and the buffer dirty state is definitive. (A hint which has rules:
 * dirty buffers against a clean folio is illegal. Other combinations are
 * legal and need to be handled. In particular a dirty folio containing
 * clean buffers for example.)
 *
 * Return 0 on success and -errno on error.
 *
 * Based on ntfs_read_block() and __block_write_full_folio().
 */
static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc)
{
	VCN vcn;
	LCN lcn;
	s64 initialized_size;
	loff_t i_size;
	sector_t block, dblock, iblock;
	struct inode *vi;
	ntfs_inode *ni;
	ntfs_volume *vol;
	runlist_element *rl;
	struct buffer_head *bh, *head;
	unsigned long flags;
	unsigned int blocksize, vcn_ofs;
	int err;
	bool need_end_writeback;
	unsigned char blocksize_bits;

	vi = folio->mapping->host;
	ni = NTFS_I(vi);
	vol = ni->vol;

	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
			"0x%lx.", ni->mft_no, ni->type, folio->index);

	BUG_ON(!NInoNonResident(ni));
	BUG_ON(NInoMstProtected(ni));
	blocksize = vol->sb->s_blocksize;
	blocksize_bits = vol->sb->s_blocksize_bits;
	head = folio_buffers(folio);
	if (!head) {
		BUG_ON(!folio_test_uptodate(folio));
		head = create_empty_buffers(folio, blocksize,
				(1 << BH_Uptodate) | (1 << BH_Dirty));
	}
	bh = head;

	/* NOTE: Different naming scheme to ntfs_read_block()! */

	/* The first block in the folio. */
	block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);

	read_lock_irqsave(&ni->size_lock, flags);
	i_size = i_size_read(vi);
	initialized_size = ni->initialized_size;
	read_unlock_irqrestore(&ni->size_lock, flags);

	/* The first out of bounds block for the data size. */
	dblock = (i_size + blocksize - 1) >> blocksize_bits;

	/* The last (fully or partially) initialized block. */
	iblock = initialized_size >> blocksize_bits;

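	/*
	 * Worked example (assumed sizes): with 512-byte blocks, i_size 10000
	 * gives dblock 20 (first block beyond the data) and initialized_size
	 * 8300 gives iblock 16 (the partially initialized block).
	 */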
	/*
	 * Be very careful.  We have no exclusion from block_dirty_folio
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time.  If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the folio stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by block_dirty_folio;
	 * handle that here by just cleaning them.
	 */

	/*
	 * Loop through all the buffers in the folio, mapping all the dirty
	 * buffers to disk addresses and handling any aliases from the
	 * underlying block device's mapping.
	 */
	rl = NULL;
	err = 0;
	do {
		bool is_retry = false;

		if (unlikely(block >= dblock)) {
			/*
			 * Mapped buffers outside i_size will occur, because
			 * this folio can be outside i_size when there is a
			 * truncate in progress. The contents of such buffers
			 * were zeroed by ntfs_writepage().
			 *
			 * FIXME: What about the small race window where
			 * ntfs_writepage() has not done any clearing because
			 * the folio was within i_size but before we get here,
			 * vmtruncate() modifies i_size?
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
			continue;
		}

		/* Clean buffers are not written out, so no need to map them. */
		if (!buffer_dirty(bh))
			continue;

		/* Make sure we have enough initialized size. */
		if (unlikely((block >= iblock) &&
				(initialized_size < i_size))) {
			/*
			 * If this folio is fully outside initialized
			 * size, zero out all folios between the current
			 * initialized size and the current folio. Just
			 * use ntfs_read_folio() to do the zeroing
			 * transparently.
			 */
			if (block > iblock) {
				// TODO:
				// For each folio do:
				// - read_cache_folio()
				// Again for each folio do:
				// - wait_on_folio_locked()
				// - Check (folio_test_uptodate(folio) &&
				//		!folio_test_error(folio))
				// Update initialized size in the attribute and
				// in the inode.
				// Again, for each folio do:
				//	block_dirty_folio();
				// folio_put()
				// We don't need to wait on the writes.
				// Update iblock.
			}
			/*
			 * The current folio straddles initialized size. Zero
			 * all non-uptodate buffers and set them uptodate (and
			 * dirty?). Note, there aren't any non-uptodate buffers
			 * if the folio is uptodate.
			 * FIXME: For an uptodate folio, the buffers may need to
			 * be written out because they were not initialized on
			 * disk before.
			 */
			if (!folio_test_uptodate(folio)) {
				// TODO:
				// Zero any non-uptodate buffers up to i_size.
				// Set them uptodate and dirty.
			}
			// TODO:
			// Update initialized size in the attribute and in the
			// inode (up to i_size).
			// Update iblock.
			// FIXME: This is inefficient. Try to batch the two
			// size changes to happen in one go.
			ntfs_error(vol->sb, "Writing beyond initialized size "
					"is not supported yet. Sorry.");
			err = -EOPNOTSUPP;
			break;
			// Do NOT set_buffer_new() BUT DO clear buffer range
			// outside write request range.
			// set_buffer_uptodate() on complete buffers as well as
			// set_buffer_dirty().
		}

		/* No need to map buffers that are already mapped. */
		if (buffer_mapped(bh))
			continue;

		/* Unmapped, dirty buffer. Need to map it. */
		bh->b_bdev = vol->sb->s_bdev;

		/* Convert block into corresponding vcn and offset. */
		vcn = (VCN)block << blocksize_bits;
		vcn_ofs = vcn & vol->cluster_size_mask;
		vcn >>= vol->cluster_size_bits;
		if (!rl) {
lock_retry_remap:
			down_read(&ni->runlist.lock);
			rl = ni->runlist.rl;
		}
		if (likely(rl != NULL)) {
			/* Seek to element containing target vcn. */
			while (rl->length && rl[1].vcn <= vcn)
				rl++;
			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
		} else
			lcn = LCN_RL_NOT_MAPPED;
		/* Successful remap. */
		if (lcn >= 0) {
			/* Setup buffer head to point to correct block. */
			bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
					vcn_ofs) >> blocksize_bits;
			set_buffer_mapped(bh);
			continue;
		}
		/* It is a hole, need to instantiate it. */
		if (lcn == LCN_HOLE) {
			u8 *kaddr;
			unsigned long *bpos, *bend;

			/* Check if the buffer is zero. */
			kaddr = kmap_local_folio(folio, bh_offset(bh));
			bpos = (unsigned long *)kaddr;
			bend = (unsigned long *)(kaddr + blocksize);
			do {
				if (unlikely(*bpos))
					break;
			} while (likely(++bpos < bend));
			kunmap_local(kaddr);
			if (bpos == bend) {
				/*
				 * Buffer is zero and sparse, no need to write
				 * it.
				 */
				bh->b_blocknr = -1;
				clear_buffer_dirty(bh);
				continue;
			}
			// TODO: Instantiate the hole.
			// clear_buffer_new(bh);
			// clean_bdev_bh_alias(bh);
			ntfs_error(vol->sb, "Writing into sparse regions is "
					"not supported yet. Sorry.");
			err = -EOPNOTSUPP;
			break;
		}
		/* If first try and runlist unmapped, map and retry. */
		if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
			is_retry = true;
			/*
			 * Attempt to map runlist, dropping lock for
			 * the duration.
			 */
			up_read(&ni->runlist.lock);
			err = ntfs_map_runlist(ni, vcn);
			if (likely(!err))
				goto lock_retry_remap;
			rl = NULL;
		} else if (!rl)
			up_read(&ni->runlist.lock);
		/*
		 * If buffer is outside the runlist, truncate has cut it out
		 * of the runlist.  Just clean and clear the buffer and set it
		 * uptodate so it can get discarded by the VM.
		 */
		if (err == -ENOENT || lcn == LCN_ENOENT) {
			bh->b_blocknr = -1;
			clear_buffer_dirty(bh);
			folio_zero_range(folio, bh_offset(bh), blocksize);
			set_buffer_uptodate(bh);
			err = 0;
			continue;
		}
		/* Failed to map the buffer, even after retrying. */
		if (!err)
			err = -EIO;
		bh->b_blocknr = -1;
		ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
				"attribute type 0x%x, vcn 0x%llx, offset 0x%x "
				"because its location on disk could not be "
				"determined%s (error code %i).", ni->mft_no,
				ni->type, (unsigned long long)vcn,
				vcn_ofs, is_retry ? " even after "
				"retrying" : "", err);
		break;
	} while (block++, (bh = bh->b_this_page) != head);

	/* Release the lock if we took it. */
	if (rl)
		up_read(&ni->runlist.lock);

	/* For the error case, need to reset bh to the beginning. */
	bh = head;

	/* Just an optimization, so ->read_folio() is not called later. */
	if (unlikely(!folio_test_uptodate(folio))) {
		int uptodate = 1;
		do {
			if (!buffer_uptodate(bh)) {
				uptodate = 0;
				bh = head;
				break;
			}
		} while ((bh = bh->b_this_page) != head);
		if (uptodate)
			folio_mark_uptodate(folio);
	}

	/* Setup all mapped, dirty buffers for async write i/o. */
	do {
		if (buffer_mapped(bh) && buffer_dirty(bh)) {
			lock_buffer(bh);
			if (test_clear_buffer_dirty(bh)) {
				BUG_ON(!buffer_uptodate(bh));
				mark_buffer_async_write(bh);
			} else
				unlock_buffer(bh);
		} else if (unlikely(err)) {
			/*
			 * For the error case. The buffer may have been set
			 * dirty during attachment to a dirty folio.
			 */
			if (err != -ENOMEM)
				clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	if (unlikely(err)) {
		// TODO: Remove the -EOPNOTSUPP check later on...
		if (unlikely(err == -EOPNOTSUPP))
			err = 0;
		else if (err == -ENOMEM) {
			ntfs_warning(vol->sb, "Error allocating memory. "
					"Redirtying folio so we try again "
					"later.");
			/*
			 * Put the folio back on mapping->dirty_pages, but
			 * leave its buffers' dirty state as-is.
			 */
			folio_redirty_for_writepage(wbc, folio);
			err = 0;
		} else
			folio_set_error(folio);
	}

	BUG_ON(folio_test_writeback(folio));
	folio_start_writeback(folio);	/* Keeps try_to_free_buffers() away. */

	/* Submit the prepared buffers for i/o. */
	need_end_writeback = true;
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh(REQ_OP_WRITE, bh);
			need_end_writeback = false;
		}
		bh = next;
	} while (bh != head);
	folio_unlock(folio);

	/* If no i/o was started, need to end writeback here. */
	if (unlikely(need_end_writeback))
		folio_end_writeback(folio);

	ntfs_debug("Done.");
	return err;
}

/**
 * ntfs_write_mst_block - write a @page to the backing store
 * @page:	page cache page to write out
 * @wbc:	writeback control structure
 *
 * This function is for writing pages belonging to non-resident, mst protected
 * attributes to their backing store.  The only supported attributes are index
 * allocation and $MFT/$DATA.  Both directory inodes and index inodes are
 * supported for the index allocation case.
 *
 * The page must remain locked for the duration of the write because we apply
 * the mst fixups, write, and then undo the fixups, so if we were to unlock the
 * page before undoing the fixups, any other user of the page will see the
 * page contents as corrupt.
 *
 * We clear the page uptodate flag for the duration of the function to ensure
 * exclusion for the $MFT/$DATA case against someone mapping an mft record we
 * are about to apply the mst fixups to.
 *
 * Return 0 on success and -errno on error.
 *
 * Based on ntfs_write_block(), ntfs_mft_writepage(), and
 * write_mft_record_nolock().
 */
static int ntfs_write_mst_block(struct page *page,
		struct writeback_control *wbc)
{
	sector_t block, dblock, rec_block;
	struct inode *vi = page->mapping->host;
	ntfs_inode *ni = NTFS_I(vi);
	ntfs_volume *vol = ni->vol;
	u8 *kaddr;
	unsigned int rec_size = ni->itype.index.block_size;
	ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE];
	struct buffer_head *bh, *head, *tbh, *rec_start_bh;
	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
	runlist_element *rl;
	int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
	unsigned bh_size, rec_size_bits;
	bool sync, is_mft, page_is_dirty, rec_is_dirty;
	unsigned char bh_size_bits;

	if (WARN_ON(rec_size < NTFS_BLOCK_SIZE))
		return -EINVAL;

	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
			"0x%lx.", vi->i_ino, ni->type, page->index);
	BUG_ON(!NInoNonResident(ni));
	BUG_ON(!NInoMstProtected(ni));
	is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
	/*
	 * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
	 * in its page cache were to be marked dirty.  However this should
	 * never happen with the current driver and considering we do not
	 * handle this case here we do want to BUG(), at least for now.
	 */
	BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
			(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
	bh_size = vol->sb->s_blocksize;
	bh_size_bits = vol->sb->s_blocksize_bits;
	max_bhs = PAGE_SIZE / bh_size;
	BUG_ON(!max_bhs);
	BUG_ON(max_bhs > MAX_BUF_PER_PAGE);

	/* Were we called for sync purposes? */
	sync = (wbc->sync_mode == WB_SYNC_ALL);

	/* Make sure we have mapped buffers. */
	bh = head = page_buffers(page);
	BUG_ON(!bh);

	rec_size_bits = ni->itype.index.block_size_bits;
	BUG_ON(!(PAGE_SIZE >> rec_size_bits));
	bhs_per_rec = rec_size >> bh_size_bits;
	BUG_ON(!bhs_per_rec);

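	/*
	 * Example (assumed geometry): with a 4 KiB page, 512-byte blocks and
	 * 1024-byte mft/index records, max_bhs is 8 and bhs_per_rec is 2, so
	 * the page holds four ntfs records of two buffer heads each.
	 */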
	/* The first block in the page. */
	rec_block = block = (sector_t)page->index <<
			(PAGE_SHIFT - bh_size_bits);

	/* The first out of bounds block for the data size. */
	dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;

	rl = NULL;
	err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
	page_is_dirty = rec_is_dirty = false;
	rec_start_bh = NULL;
	do {
		bool is_retry = false;

		if (likely(block < rec_block)) {
			if (unlikely(block >= dblock)) {
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				continue;
			}
			/*
			 * This block is not the first one in the record.  We
			 * ignore the buffer's dirty state because we could
			 * have raced with a parallel mark_ntfs_record_dirty().
			 */
			if (!rec_is_dirty)
				continue;
			if (unlikely(err2)) {
				if (err2 != -ENOMEM)
					clear_buffer_dirty(bh);
				continue;
			}
		} else /* if (block == rec_block) */ {
			BUG_ON(block > rec_block);
			/* This block is the first one in the record. */
			rec_block += bhs_per_rec;
			err2 = 0;
			if (unlikely(block >= dblock)) {
				clear_buffer_dirty(bh);
				continue;
			}
			if (!buffer_dirty(bh)) {
				/* Clean records are not written out. */
				rec_is_dirty = false;
				continue;
			}
			rec_is_dirty = true;
			rec_start_bh = bh;
		}
		/* Need to map the buffer if it is not mapped already. */
		if (unlikely(!buffer_mapped(bh))) {
			VCN vcn;
			LCN lcn;
			unsigned int vcn_ofs;

			bh->b_bdev = vol->sb->s_bdev;
			/* Obtain the vcn and offset of the current block. */
			vcn = (VCN)block << bh_size_bits;
			vcn_ofs = vcn & vol->cluster_size_mask;
			vcn >>= vol->cluster_size_bits;
			if (!rl) {
lock_retry_remap:
				down_read(&ni->runlist.lock);
				rl = ni->runlist.rl;
			}
			if (likely(rl != NULL)) {
				/* Seek to element containing target vcn. */
				while (rl->length && rl[1].vcn <= vcn)
					rl++;
				lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
			} else
				lcn = LCN_RL_NOT_MAPPED;
			/* Successful remap. */
			if (likely(lcn >= 0)) {
				/* Setup buffer head to correct block. */
				bh->b_blocknr = ((lcn <<
						vol->cluster_size_bits) +
						vcn_ofs) >> bh_size_bits;
				set_buffer_mapped(bh);
			} else {
				/*
				 * Remap failed.  Retry to map the runlist once
				 * unless we are working on $MFT which always
				 * has the whole of its runlist in memory.
				 */
				if (!is_mft && !is_retry &&
						lcn == LCN_RL_NOT_MAPPED) {
					is_retry = true;
					/*
					 * Attempt to map runlist, dropping
					 * lock for the duration.
					 */
					up_read(&ni->runlist.lock);
					err2 = ntfs_map_runlist(ni, vcn);
					if (likely(!err2))
						goto lock_retry_remap;
					if (err2 == -ENOMEM)
						page_is_dirty = true;
					lcn = err2;
				} else {
					err2 = -EIO;
					if (!rl)
						up_read(&ni->runlist.lock);
				}
				/* Hard error.  Abort writing this record. */
				if (!err || err == -ENOMEM)
					err = err2;
				bh->b_blocknr = -1;
				ntfs_error(vol->sb, "Cannot write ntfs record "
						"0x%llx (inode 0x%lx, "
						"attribute type 0x%x) because "
						"its location on disk could "
						"not be determined (error "
						"code %lli).",
						(long long)block <<
						bh_size_bits >>
						vol->mft_record_size_bits,
						ni->mft_no, ni->type,
						(long long)lcn);
				/*
				 * If this is not the first buffer, remove the
				 * buffers in this record from the list of
				 * buffers to write and clear their dirty bit
				 * if not error -ENOMEM.
				 */
				if (rec_start_bh != bh) {
					while (bhs[--nr_bhs] != rec_start_bh)
						;
					if (err2 != -ENOMEM) {
						do {
							clear_buffer_dirty(
								rec_start_bh);
						} while ((rec_start_bh =
								rec_start_bh->
								b_this_page) !=
								bh);
					}
				}
				continue;
			}
		}
		BUG_ON(!buffer_uptodate(bh));
		BUG_ON(nr_bhs >= max_bhs);
		bhs[nr_bhs++] = bh;
	} while (block++, (bh = bh->b_this_page) != head);
	if (unlikely(rl))
		up_read(&ni->runlist.lock);
	/* If there were no dirty buffers, we are done. */
	if (!nr_bhs)
		goto done;
	/* Map the page so we can access its contents. */
	kaddr = kmap(page);
	/* Clear the page uptodate flag whilst the mst fixups are applied. */
	BUG_ON(!PageUptodate(page));
	ClearPageUptodate(page);
	for (i = 0; i < nr_bhs; i++) {
		unsigned int ofs;

		/* Skip buffers which are not at the beginning of records. */
		if (i % bhs_per_rec)
			continue;
		tbh = bhs[i];
		ofs = bh_offset(tbh);
		if (is_mft) {
			ntfs_inode *tni;
			unsigned long mft_no;

			/* Get the mft record number. */
			mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
					>> rec_size_bits;
			/* Check whether to write this mft record. */
			tni = NULL;
			if (!ntfs_may_write_mft_record(vol, mft_no,
					(MFT_RECORD*)(kaddr + ofs), &tni)) {
				/*
				 * The record should not be written.  This
				 * means we need to redirty the page before
				 * returning.
				 */
				page_is_dirty = true;
				/*
				 * Remove the buffers in this mft record from
				 * the list of buffers to write.
				 */
				do {
					bhs[i] = NULL;
				} while (++i % bhs_per_rec);
				continue;
			}
			/*
			 * The record should be written.  If a locked ntfs
			 * inode was returned, add it to the array of locked
			 * ntfs inodes.
			 */
			if (tni)
				locked_nis[nr_locked_nis++] = tni;
		}
		/* Apply the mst protection fixups. */
		err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
				rec_size);
		if (unlikely(err2)) {
			if (!err || err == -ENOMEM)
				err = -EIO;
			ntfs_error(vol->sb, "Failed to apply mst fixups "
					"(inode 0x%lx, attribute type 0x%x, "
					"page index 0x%lx, page offset 0x%x)!"
					"  Unmount and run chkdsk.", vi->i_ino,
					ni->type, page->index, ofs);
			/*
			 * Mark all the buffers in this record clean as we do
			 * not want to write corrupt data to disk.
			 */
			do {
				clear_buffer_dirty(bhs[i]);
				bhs[i] = NULL;
			} while (++i % bhs_per_rec);
			continue;
		}
		nr_recs++;
	}
	/* If no records are to be written out, we are done. */
	if (!nr_recs)
		goto unm_done;
	flush_dcache_page(page);
	/* Lock buffers and start synchronous write i/o on them. */
	for (i = 0; i < nr_bhs; i++) {
		tbh = bhs[i];
		if (!tbh)
			continue;
		if (!trylock_buffer(tbh))
			BUG();
		/* The buffer dirty state is now irrelevant, just clean it. */
		clear_buffer_dirty(tbh);
		BUG_ON(!buffer_uptodate(tbh));
		BUG_ON(!buffer_mapped(tbh));
		get_bh(tbh);
		tbh->b_end_io = end_buffer_write_sync;
		submit_bh(REQ_OP_WRITE, tbh);
	}
	/* Synchronize the mft mirror now if not @sync. */
	if (is_mft && !sync)
		goto do_mirror;
do_wait:
	/* Wait on i/o completion of buffers. */
	for (i = 0; i < nr_bhs; i++) {
		tbh = bhs[i];
		if (!tbh)
			continue;
		wait_on_buffer(tbh);
		if (unlikely(!buffer_uptodate(tbh))) {
			ntfs_error(vol->sb, "I/O error while writing ntfs "
					"record buffer (inode 0x%lx, "
					"attribute type 0x%x, page index "
					"0x%lx, page offset 0x%lx)!  Unmount "
					"and run chkdsk.", vi->i_ino, ni->type,
					page->index, bh_offset(tbh));
			if (!err || err == -ENOMEM)
				err = -EIO;
			/*
			 * Set the buffer uptodate so the page and buffer
			 * states do not become out of sync.
			 */
			set_buffer_uptodate(tbh);
		}
	}
	/* If @sync, now synchronize the mft mirror. */
	if (is_mft && sync) {
do_mirror:
		for (i = 0; i < nr_bhs; i++) {
			unsigned long mft_no;
			unsigned int ofs;

			/*
			 * Skip buffers which are not at the beginning of
			 * records.
			 */
			if (i % bhs_per_rec)
				continue;
			tbh = bhs[i];
			/* Skip removed buffers (and hence records). */
			if (!tbh)
				continue;
			ofs = bh_offset(tbh);
			/* Get the mft record number. */
			mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
					>> rec_size_bits;
			if (mft_no < vol->mftmirr_size)
				ntfs_sync_mft_mirror(vol, mft_no,
						(MFT_RECORD*)(kaddr + ofs),
						sync);
		}
		if (!sync)
			goto do_wait;
	}
	/* Remove the mst protection fixups again. */
	for (i = 0; i < nr_bhs; i++) {
		if (!(i % bhs_per_rec)) {
			tbh = bhs[i];
			if (!tbh)
				continue;
			post_write_mst_fixup((NTFS_RECORD*)(kaddr +
					bh_offset(tbh)));
		}
	}
	flush_dcache_page(page);
unm_done:
	/* Unlock any locked inodes. */
	while (nr_locked_nis-- > 0) {
		ntfs_inode *tni, *base_tni;

		tni = locked_nis[nr_locked_nis];
		/* Get the base inode. */
		mutex_lock(&tni->extent_lock);
		if (tni->nr_extents >= 0)
			base_tni = tni;
		else {
			base_tni = tni->ext.base_ntfs_ino;
			BUG_ON(!base_tni);
		}
		mutex_unlock(&tni->extent_lock);
		ntfs_debug("Unlocking %s inode 0x%lx.",
				tni == base_tni ? "base" : "extent",
				tni->mft_no);
		mutex_unlock(&tni->mrec_lock);
		atomic_dec(&tni->count);
		iput(VFS_I(base_tni));
	}
	SetPageUptodate(page);
	kunmap(page);
done:
	if (unlikely(err && err != -ENOMEM)) {
		/*
		 * Set page error if there is only one ntfs record in the page.
		 * Otherwise we would lose per-record granularity.
		 */
		if (ni->itype.index.block_size == PAGE_SIZE)
			SetPageError(page);
		NVolSetErrors(vol);
	}
	if (page_is_dirty) {
		ntfs_debug("Page still contains one or more dirty ntfs "
				"records.  Redirtying the page starting at "
				"record 0x%lx.", page->index <<
				(PAGE_SHIFT - rec_size_bits));
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
	} else {
		/*
		 * Keep the VM happy.  This must be done otherwise the
		 * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
		 * the page is clean.
		 */
		BUG_ON(PageWriteback(page));
		set_page_writeback(page);
		unlock_page(page);
		end_page_writeback(page);
	}
	if (likely(!err))
		ntfs_debug("Done.");
	return err;
}

/**
 * ntfs_writepage - write a @page to the backing store
 * @page:	page cache page to write out
 * @wbc:	writeback control structure
 *
 * This is called from the VM when it wants to have a dirty ntfs page cache
 * page cleaned.  The VM has already locked the page and marked it clean.
 *
 * For non-resident attributes, ntfs_writepage() writes the @page by calling
 * the ntfs version of the generic block_write_full_folio() function,
 * ntfs_write_block(), which in turn if necessary creates and writes the
 * buffers associated with the page asynchronously.
 *
 * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
 * the data to the mft record (which at this stage is most likely in memory).
 * The mft record is then marked dirty and written out asynchronously via the
 * vfs inode dirty code path for the inode the mft record belongs to or via the
 * vm page dirty code path for the page the mft record is in.
 *
 * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio().
 *
 * Return 0 on success and -errno on error.
 */
static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct folio *folio = page_folio(page);
	loff_t i_size;
	struct inode *vi = folio->mapping->host;
	ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
	char *addr;
	ntfs_attr_search_ctx *ctx = NULL;
	MFT_RECORD *m = NULL;
	u32 attr_len;
	int err;

retry_writepage:
	BUG_ON(!folio_test_locked(folio));
	i_size = i_size_read(vi);
	/* Is the folio fully outside i_size? (truncate in progress) */
	if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >>
			PAGE_SHIFT)) {
		/*
		 * The folio may have dirty, unmapped buffers.  Make them
		 * freeable here, so the page does not leak.
		 */
		block_invalidate_folio(folio, 0, folio_size(folio));
		folio_unlock(folio);
		ntfs_debug("Write outside i_size - truncated?");
		return 0;
	}
	/*
	 * Only $DATA attributes can be encrypted and only unnamed $DATA
	 * attributes can be compressed.  Index root can have the flags set but
	 * this means to create compressed/encrypted files, not that the
	 * attribute is compressed/encrypted.  Note we need to check for
	 * AT_INDEX_ALLOCATION since this is the type of both directory and
	 * index inodes.
	 */
	if (ni->type != AT_INDEX_ALLOCATION) {
		/* If file is encrypted, deny access, just like NT4. */
		if (NInoEncrypted(ni)) {
			folio_unlock(folio);
			BUG_ON(ni->type != AT_DATA);
			ntfs_debug("Denying write access to encrypted file.");
			return -EACCES;
		}
		/* Compressed data streams are handled in compress.c. */
		if (NInoNonResident(ni) && NInoCompressed(ni)) {
			BUG_ON(ni->type != AT_DATA);
			BUG_ON(ni->name_len);
			// TODO: Implement and replace this with
			// return ntfs_write_compressed_block(page);
			folio_unlock(folio);
			ntfs_error(vi->i_sb, "Writing to compressed files is "
					"not supported yet.  Sorry.");
			return -EOPNOTSUPP;
		}
		// TODO: Implement and remove this check.
		if (NInoNonResident(ni) && NInoSparse(ni)) {
			folio_unlock(folio);
			ntfs_error(vi->i_sb, "Writing to sparse files is not "
					"supported yet.  Sorry.");
			return -EOPNOTSUPP;
		}
	}
	/* NInoNonResident() == NInoIndexAllocPresent() */
	if (NInoNonResident(ni)) {
		/* We have to zero every time due to mmap-at-end-of-file. */
		if (folio->index >= (i_size >> PAGE_SHIFT)) {
			/* The folio straddles i_size. */
			unsigned int ofs = i_size & (folio_size(folio) - 1);
			folio_zero_segment(folio, ofs, folio_size(folio));
		}
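		/*
		 * Example (assumed 4 KiB folios): with i_size 5000, folio
		 * index 1 straddles i_size, so ofs is 5000 & 4095 = 904 and
		 * bytes 904 to 4095 of the folio are zeroed above.
		 */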
		/* Handle mst protected attributes. */
		if (NInoMstProtected(ni))
			return ntfs_write_mst_block(page, wbc);
		/* Normal, non-resident data stream. */
		return ntfs_write_block(folio, wbc);
	}
	/*
	 * Attribute is resident, implying it is not compressed, encrypted, or
	 * mst protected.  This also means the attribute is smaller than an mft
	 * record and hence smaller than a folio, so can simply return error on
	 * any folios with index above 0.  Note the attribute can actually be
	 * marked compressed but if it is resident the actual data is not
	 * compressed so we are ok to ignore the compressed flag here.
	 */
	BUG_ON(folio_buffers(folio));
	BUG_ON(!folio_test_uptodate(folio));
	if (unlikely(folio->index > 0)) {
		ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0.  "
				"Aborting write.", folio->index);
		BUG_ON(folio_test_writeback(folio));
		folio_start_writeback(folio);
		folio_unlock(folio);
		folio_end_writeback(folio);
		return -EIO;
	}
	if (!NInoAttr(ni))
		base_ni = ni;
	else
		base_ni = ni->ext.base_ntfs_ino;
	/* Map, pin, and lock the mft record. */
	m = map_mft_record(base_ni);
	if (IS_ERR(m)) {
		err = PTR_ERR(m);
		m = NULL;
		ctx = NULL;
		goto err_out;
	}
	/*
	 * If a parallel write made the attribute non-resident, drop the mft
	 * record and retry the writepage.
	 */
	if (unlikely(NInoNonResident(ni))) {
		unmap_mft_record(base_ni);
		goto retry_writepage;
	}
	ctx = ntfs_attr_get_search_ctx(base_ni, m);
	if (unlikely(!ctx)) {
		err = -ENOMEM;
		goto err_out;
	}
	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
			CASE_SENSITIVE, 0, NULL, 0, ctx);
	if (unlikely(err))
		goto err_out;
	/*
	 * Keep the VM happy.  This must be done otherwise
	 * PAGECACHE_TAG_DIRTY remains set even though the folio is clean.
	 */
	BUG_ON(folio_test_writeback(folio));
	folio_start_writeback(folio);
	folio_unlock(folio);
	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
	i_size = i_size_read(vi);
	if (unlikely(attr_len > i_size)) {
		/* Race with shrinking truncate or a failed truncate. */
		attr_len = i_size;
		/*
		 * If the truncate failed, fix it up now.  If a concurrent
		 * truncate, we do its job, so it does not have to do anything.
		 */
		err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr,
				attr_len);
		/* Shrinking cannot fail. */
		BUG_ON(err);
	}
	addr = kmap_local_folio(folio, 0);
	/* Copy the data from the folio to the mft record. */
	memcpy((u8*)ctx->attr +
			le16_to_cpu(ctx->attr->data.resident.value_offset),
			addr, attr_len);
	/* Zero out of bounds area in the page cache folio. */
	memset(addr + attr_len, 0, folio_size(folio) - attr_len);
	kunmap_local(addr);
	flush_dcache_folio(folio);
	flush_dcache_mft_record_page(ctx->ntfs_ino);
	/* We are done with the folio. */
	folio_end_writeback(folio);
	/* Finally, mark the mft record dirty, so it gets written back. */
	mark_mft_record_dirty(ctx->ntfs_ino);
	ntfs_attr_put_search_ctx(ctx);
	unmap_mft_record(base_ni);
	return 0;
err_out:
	if (err == -ENOMEM) {
		ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
				"page so we try again later.");
		/*
		 * Put the folio back on mapping->dirty_pages, but leave its
		 * buffers' dirty state as-is.
		 */
		folio_redirty_for_writepage(wbc, folio);
		err = 0;
	} else {
		ntfs_error(vi->i_sb, "Resident attribute write failed with "
				"error %i.", err);
		folio_set_error(folio);
		NVolSetErrors(ni->vol);
	}
	folio_unlock(folio);
	if (ctx)
		ntfs_attr_put_search_ctx(ctx);
	if (m)
		unmap_mft_record(base_ni);
	return err;
}

#endif	/* NTFS_RW */

/**
 * ntfs_bmap - map logical file block to physical device block
 * @mapping:	address space mapping to which the block to be mapped belongs
 * @block:	logical block to map to its physical device block
 *
 * For regular, non-resident files (i.e. not compressed and not encrypted), map
 * the logical @block belonging to the file described by the address space
 * mapping @mapping to its physical device block.
 *
 * The size of the block is equal to the @s_blocksize field of the super block
 * of the mounted file system which is guaranteed to be smaller than or equal
 * to the cluster size thus the block is guaranteed to fit entirely inside the
 * cluster which means we do not need to care how many contiguous bytes are
 * available after the beginning of the block.
 *
 * Return the physical device block if the mapping succeeded or 0 if the block
 * is sparse or there was an error.
 *
 * Note: This is a problem if someone tries to run bmap() on the $Boot system
 * file as that really is in block zero but there is nothing we can do.  bmap()
 * is just broken in that respect (just like it cannot distinguish sparse from
 * not available or error).
 */
static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
{
	s64 ofs, size;
	loff_t i_size;
	LCN lcn;
	unsigned long blocksize, flags;
	ntfs_inode *ni = NTFS_I(mapping->host);
	ntfs_volume *vol = ni->vol;
	unsigned delta;
	unsigned char blocksize_bits, cluster_size_shift;

	ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.",
			ni->mft_no, (unsigned long long)block);
	if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) {
		ntfs_error(vol->sb, "BMAP does not make sense for %s "
				"attributes, returning 0.",
				(ni->type != AT_DATA) ? "non-data" :
				(!NInoNonResident(ni) ? "resident" :
				"encrypted"));
		return 0;
	}
	/* None of these can happen. */
	BUG_ON(NInoCompressed(ni));
	BUG_ON(NInoMstProtected(ni));
	blocksize = vol->sb->s_blocksize;
	blocksize_bits = vol->sb->s_blocksize_bits;
	ofs = (s64)block << blocksize_bits;
	read_lock_irqsave(&ni->size_lock, flags);
	size = ni->initialized_size;
	i_size = i_size_read(VFS_I(ni));
	read_unlock_irqrestore(&ni->size_lock, flags);
	/*
	 * If the offset is outside the initialized size or the block straddles
	 * the initialized size then pretend it is a hole unless the
	 * initialized size equals the file size.
	 */
	if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size)))
		goto hole;
	cluster_size_shift = vol->cluster_size_bits;
	down_read(&ni->runlist.lock);
	lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false);
	up_read(&ni->runlist.lock);
	if (unlikely(lcn < LCN_HOLE)) {
		/*
		 * Step down to an integer to avoid gcc doing a long long
		 * comparison in the switch when we know @lcn is between
		 * LCN_HOLE and LCN_EIO (i.e. -1 to -5).
		 *
		 * Otherwise older gcc (at least on some architectures) will
		 * try to use __cmpdi2() which is of course not available in
		 * the kernel.
		 */
		switch ((int)lcn) {
		case LCN_ENOENT:
			/*
			 * If the offset is out of bounds then pretend it is a
			 * hole.
			 */
			goto hole;
		case LCN_ENOMEM:
			ntfs_error(vol->sb, "Not enough memory to complete "
					"mapping for inode 0x%lx.  "
					"Returning 0.", ni->mft_no);
			break;
		default:
			ntfs_error(vol->sb, "Failed to complete mapping for "
					"inode 0x%lx.  Run chkdsk.  "
					"Returning 0.", ni->mft_no);
			break;
		}
		return 0;
	}
	if (lcn < 0) {
		/* It is a hole. */
hole:
		ntfs_debug("Done (returning hole).");
		return 0;
	}
	/*
	 * The block is really allocated and fulfils all our criteria.
	 * Convert the cluster to units of block size and return the result.
	 */
	delta = ofs & vol->cluster_size_mask;
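	/*
	 * Example (assumed geometry): with 512-byte blocks and 4 KiB
	 * clusters, block 9 gives ofs 4608, i.e. vcn 1 with delta 512; if
	 * vcn 1 maps to lcn 100, the result is
	 * ((100 << 12) + 512) >> 9 = 801.
	 */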
	if (unlikely(sizeof(block) < sizeof(lcn))) {
		block = lcn = ((lcn << cluster_size_shift) + delta) >>
				blocksize_bits;
		/* If the block number was truncated return 0. */
		if (unlikely(block != lcn)) {
			ntfs_error(vol->sb, "Physical block 0x%llx is too "
					"large to be returned, returning 0.",
					(long long)lcn);
			return 0;
		}
	} else
		block = ((lcn << cluster_size_shift) + delta) >>
				blocksize_bits;
	ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)block);
	return block;
}

/*
 * ntfs_normal_aops - address space operations for normal inodes and attributes
 *
 * Note these are not used for compressed or mst protected inodes and
 * attributes.
 */
const struct address_space_operations ntfs_normal_aops = {
	.read_folio	= ntfs_read_folio,
#ifdef NTFS_RW
	.writepage	= ntfs_writepage,
	.dirty_folio	= block_dirty_folio,
#endif /* NTFS_RW */
	.bmap		= ntfs_bmap,
	.migrate_folio	= buffer_migrate_folio,
	.is_partially_uptodate = block_is_partially_uptodate,
	.error_remove_folio = generic_error_remove_folio,
};

/*
 * ntfs_compressed_aops - address space operations for compressed inodes
 */
const struct address_space_operations ntfs_compressed_aops = {
	.read_folio	= ntfs_read_folio,
#ifdef NTFS_RW
	.writepage	= ntfs_writepage,
	.dirty_folio	= block_dirty_folio,
#endif /* NTFS_RW */
	.migrate_folio	= buffer_migrate_folio,
	.is_partially_uptodate = block_is_partially_uptodate,
	.error_remove_folio = generic_error_remove_folio,
};

/*
 * ntfs_mst_aops - general address space operations for mst protected inodes
 *			  and attributes
 */
const struct address_space_operations ntfs_mst_aops = {
	.read_folio	= ntfs_read_folio,	/* Fill page with data. */
#ifdef NTFS_RW
	.writepage	= ntfs_writepage,	/* Write dirty page to disk. */
	.dirty_folio	= filemap_dirty_folio,
#endif /* NTFS_RW */
	.migrate_folio	= buffer_migrate_folio,
	.is_partially_uptodate	= block_is_partially_uptodate,
	.error_remove_folio = generic_error_remove_folio,
};

#ifdef NTFS_RW

/**
 * mark_ntfs_record_dirty - mark an ntfs record dirty
 * @page:	page containing the ntfs record to mark dirty
 * @ofs:	byte offset within @page at which the ntfs record begins
 *
 * Set the buffers and the page in which the ntfs record is located dirty.
 *
 * The latter also marks the vfs inode the ntfs record belongs to dirty
 * (I_DIRTY_PAGES only).
 *
 * If the page does not have buffers, we create them and set them uptodate.
 * The page may not be locked which is why we need to handle the buffers under
 * the mapping->i_private_lock.  Once the buffers are marked dirty we no longer
 * need the lock since try_to_free_buffers() does not free dirty buffers.
 */
void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
	struct address_space *mapping = page->mapping;
	ntfs_inode *ni = NTFS_I(mapping->host);
	struct buffer_head *bh, *head, *buffers_to_free = NULL;
	unsigned int end, bh_size, bh_ofs;

	BUG_ON(!PageUptodate(page));
	end = ofs + ni->itype.index.block_size;
	bh_size = VFS_I(ni)->i_sb->s_blocksize;
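	/*
	 * Example (assumed sizes): with 512-byte buffers and a 1024-byte
	 * ntfs record at ofs 2048, end is 3072 and the loop below dirties
	 * only the buffers at offsets 2048 and 2560.
	 */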
	spin_lock(&mapping->i_private_lock);
	if (unlikely(!page_has_buffers(page))) {
		spin_unlock(&mapping->i_private_lock);
		bh = head = alloc_page_buffers(page, bh_size, true);
		spin_lock(&mapping->i_private_lock);
		if (likely(!page_has_buffers(page))) {
			struct buffer_head *tail;

			do {
				set_buffer_uptodate(bh);
				tail = bh;
				bh = bh->b_this_page;
			} while (bh);
			tail->b_this_page = head;
			attach_page_private(page, head);
		} else
			buffers_to_free = bh;
	}
	bh = head = page_buffers(page);
	BUG_ON(!bh);
	do {
		bh_ofs = bh_offset(bh);
		if (bh_ofs + bh_size <= ofs)
			continue;
		if (unlikely(bh_ofs >= end))
			break;
		set_buffer_dirty(bh);
	} while ((bh = bh->b_this_page) != head);
	spin_unlock(&mapping->i_private_lock);
	filemap_dirty_folio(mapping, page_folio(page));
	if (unlikely(buffers_to_free)) {
		do {
			bh = buffers_to_free->b_this_page;
			free_buffer_head(buffers_to_free);
			buffers_to_free = bh;
		} while (buffers_to_free);
	}
}

#endif /* NTFS_RW */