1f79e2abbSAndrew Morton /* 2f79e2abbSAndrew Morton * High-level sync()-related operations 3f79e2abbSAndrew Morton */ 4f79e2abbSAndrew Morton 5f79e2abbSAndrew Morton #include <linux/kernel.h> 6f79e2abbSAndrew Morton #include <linux/file.h> 7f79e2abbSAndrew Morton #include <linux/fs.h> 85a0e3ad6STejun Heo #include <linux/slab.h> 9630d9c47SPaul Gortmaker #include <linux/export.h> 10b7ed78f5SSage Weil #include <linux/namei.h> 11914e2637SAl Viro #include <linux/sched.h> 12f79e2abbSAndrew Morton #include <linux/writeback.h> 13f79e2abbSAndrew Morton #include <linux/syscalls.h> 14f79e2abbSAndrew Morton #include <linux/linkage.h> 15f79e2abbSAndrew Morton #include <linux/pagemap.h> 16cf9a2ae8SDavid Howells #include <linux/quotaops.h> 175129a469SJörn Engel #include <linux/backing-dev.h> 185a3e5cb8SJan Kara #include "internal.h" 19f79e2abbSAndrew Morton 20f79e2abbSAndrew Morton #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ 21f79e2abbSAndrew Morton SYNC_FILE_RANGE_WAIT_AFTER) 22f79e2abbSAndrew Morton 23c15c54f5SJan Kara /* 24d8a8559cSJens Axboe * Do the filesystem syncing work. For simple filesystems 25d8a8559cSJens Axboe * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to 26d8a8559cSJens Axboe * submit IO for these buffers via __sync_blockdev(). This also speeds up the 27d8a8559cSJens Axboe * wait == 1 case since in that case write_inode() functions do 28d8a8559cSJens Axboe * sync_dirty_buffer() and thus effectively write one block at a time. 29c15c54f5SJan Kara */ 300dc83bd3SJan Kara static int __sync_filesystem(struct super_block *sb, int wait) 31c15c54f5SJan Kara { 325fb324adSChristoph Hellwig if (wait) 330dc83bd3SJan Kara sync_inodes_sb(sb); 345fb324adSChristoph Hellwig else 350e175a18SCurt Wohlgemuth writeback_inodes_sb(sb, WB_REASON_SYNC); 365fb324adSChristoph Hellwig 37c15c54f5SJan Kara if (sb->s_op->sync_fs) 38c15c54f5SJan Kara sb->s_op->sync_fs(sb, wait); 39c15c54f5SJan Kara return __sync_blockdev(sb->s_bdev, wait); 40c15c54f5SJan Kara } 41c15c54f5SJan Kara 42c15c54f5SJan Kara /* 43c15c54f5SJan Kara * Write out and wait upon all dirty data associated with this 44c15c54f5SJan Kara * superblock. Filesystem data as well as the underlying block 45c15c54f5SJan Kara * device. Takes the superblock lock. 46c15c54f5SJan Kara */ 4760b0680fSJan Kara int sync_filesystem(struct super_block *sb) 48c15c54f5SJan Kara { 49c15c54f5SJan Kara int ret; 50c15c54f5SJan Kara 515af7926fSChristoph Hellwig /* 525af7926fSChristoph Hellwig * We need to be protected against the filesystem going from 535af7926fSChristoph Hellwig * r/o to r/w or vice versa. 545af7926fSChristoph Hellwig */ 555af7926fSChristoph Hellwig WARN_ON(!rwsem_is_locked(&sb->s_umount)); 565af7926fSChristoph Hellwig 575af7926fSChristoph Hellwig /* 585af7926fSChristoph Hellwig * No point in syncing out anything if the filesystem is read-only. 595af7926fSChristoph Hellwig */ 605af7926fSChristoph Hellwig if (sb->s_flags & MS_RDONLY) 615af7926fSChristoph Hellwig return 0; 625af7926fSChristoph Hellwig 630dc83bd3SJan Kara ret = __sync_filesystem(sb, 0); 64c15c54f5SJan Kara if (ret < 0) 65c15c54f5SJan Kara return ret; 660dc83bd3SJan Kara return __sync_filesystem(sb, 1); 67c15c54f5SJan Kara } 6810096fb1SAnton Altaparmakov EXPORT_SYMBOL(sync_filesystem); 69c15c54f5SJan Kara 70b3de6531SJan Kara static void sync_inodes_one_sb(struct super_block *sb, void *arg) 7101a05b33SAl Viro { 7295f28604SJens Axboe if (!(sb->s_flags & MS_RDONLY)) 730dc83bd3SJan Kara sync_inodes_sb(sb); 7401a05b33SAl Viro } 75b3de6531SJan Kara 76b3de6531SJan Kara static void sync_fs_one_sb(struct super_block *sb, void *arg) 77b3de6531SJan Kara { 78b3de6531SJan Kara if (!(sb->s_flags & MS_RDONLY) && sb->s_op->sync_fs) 79b3de6531SJan Kara sb->s_op->sync_fs(sb, *(int *)arg); 80b3de6531SJan Kara } 81b3de6531SJan Kara 82d0e91b13SJan Kara static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) 83b3de6531SJan Kara { 84d0e91b13SJan Kara filemap_fdatawrite(bdev->bd_inode->i_mapping); 85a8c7176bSJan Kara } 86a8c7176bSJan Kara 87d0e91b13SJan Kara static void fdatawait_one_bdev(struct block_device *bdev, void *arg) 88a8c7176bSJan Kara { 89aa750fd7SJunichi Nomura /* 90aa750fd7SJunichi Nomura * We keep the error status of individual mapping so that 91aa750fd7SJunichi Nomura * applications can catch the writeback error using fsync(2). 92aa750fd7SJunichi Nomura * See filemap_fdatawait_keep_errors() for details. 93aa750fd7SJunichi Nomura */ 94aa750fd7SJunichi Nomura filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping); 95c15c54f5SJan Kara } 96c15c54f5SJan Kara 973beab0b4SZhang, Yanmin /* 984ea425b6SJan Kara * Sync everything. We start by waking flusher threads so that most of 994ea425b6SJan Kara * writeback runs on all devices in parallel. Then we sync all inodes reliably 1004ea425b6SJan Kara * which effectively also waits for all flusher threads to finish doing 1014ea425b6SJan Kara * writeback. At this point all data is on disk so metadata should be stable 1024ea425b6SJan Kara * and we tell filesystems to sync their metadata via ->sync_fs() calls. 1034ea425b6SJan Kara * Finally, we writeout all block devices because some filesystems (e.g. ext2) 1044ea425b6SJan Kara * just write metadata (such as inodes or bitmaps) to block device page cache 1054ea425b6SJan Kara * and do not sync it on their own in ->sync_fs(). 1063beab0b4SZhang, Yanmin */ 107a5f8fa9eSHeiko Carstens SYSCALL_DEFINE0(sync) 108cf9a2ae8SDavid Howells { 109b3de6531SJan Kara int nowait = 0, wait = 1; 110b3de6531SJan Kara 1110e175a18SCurt Wohlgemuth wakeup_flusher_threads(0, WB_REASON_SYNC); 1120dc83bd3SJan Kara iterate_supers(sync_inodes_one_sb, NULL); 1134ea425b6SJan Kara iterate_supers(sync_fs_one_sb, &nowait); 114b3de6531SJan Kara iterate_supers(sync_fs_one_sb, &wait); 115d0e91b13SJan Kara iterate_bdevs(fdatawrite_one_bdev, NULL); 116d0e91b13SJan Kara iterate_bdevs(fdatawait_one_bdev, NULL); 1175cee5815SJan Kara if (unlikely(laptop_mode)) 1185cee5815SJan Kara laptop_sync_completion(); 119cf9a2ae8SDavid Howells return 0; 120cf9a2ae8SDavid Howells } 121cf9a2ae8SDavid Howells 122a2a9537aSJens Axboe static void do_sync_work(struct work_struct *work) 123a2a9537aSJens Axboe { 124b3de6531SJan Kara int nowait = 0; 125b3de6531SJan Kara 1265cee5815SJan Kara /* 1275cee5815SJan Kara * Sync twice to reduce the possibility we skipped some inodes / pages 1285cee5815SJan Kara * because they were temporarily locked 1295cee5815SJan Kara */ 130b3de6531SJan Kara iterate_supers(sync_inodes_one_sb, &nowait); 131b3de6531SJan Kara iterate_supers(sync_fs_one_sb, &nowait); 132d0e91b13SJan Kara iterate_bdevs(fdatawrite_one_bdev, NULL); 133b3de6531SJan Kara iterate_supers(sync_inodes_one_sb, &nowait); 134b3de6531SJan Kara iterate_supers(sync_fs_one_sb, &nowait); 135d0e91b13SJan Kara iterate_bdevs(fdatawrite_one_bdev, NULL); 1365cee5815SJan Kara printk("Emergency Sync complete\n"); 137a2a9537aSJens Axboe kfree(work); 138a2a9537aSJens Axboe } 139a2a9537aSJens Axboe 140cf9a2ae8SDavid Howells void emergency_sync(void) 141cf9a2ae8SDavid Howells { 142a2a9537aSJens Axboe struct work_struct *work; 143a2a9537aSJens Axboe 144a2a9537aSJens Axboe work = kmalloc(sizeof(*work), GFP_ATOMIC); 145a2a9537aSJens Axboe if (work) { 146a2a9537aSJens Axboe INIT_WORK(work, do_sync_work); 147a2a9537aSJens Axboe schedule_work(work); 148a2a9537aSJens Axboe } 149cf9a2ae8SDavid Howells } 150cf9a2ae8SDavid Howells 151b7ed78f5SSage Weil /* 152b7ed78f5SSage Weil * sync a single super 153b7ed78f5SSage Weil */ 154b7ed78f5SSage Weil SYSCALL_DEFINE1(syncfs, int, fd) 155b7ed78f5SSage Weil { 1562903ff01SAl Viro struct fd f = fdget(fd); 157b7ed78f5SSage Weil struct super_block *sb; 158b7ed78f5SSage Weil int ret; 159b7ed78f5SSage Weil 1602903ff01SAl Viro if (!f.file) 161b7ed78f5SSage Weil return -EBADF; 162b583043eSAl Viro sb = f.file->f_path.dentry->d_sb; 163b7ed78f5SSage Weil 164b7ed78f5SSage Weil down_read(&sb->s_umount); 165b7ed78f5SSage Weil ret = sync_filesystem(sb); 166b7ed78f5SSage Weil up_read(&sb->s_umount); 167b7ed78f5SSage Weil 1682903ff01SAl Viro fdput(f); 169b7ed78f5SSage Weil return ret; 170b7ed78f5SSage Weil } 171b7ed78f5SSage Weil 1724c728ef5SChristoph Hellwig /** 173148f948bSJan Kara * vfs_fsync_range - helper to sync a range of data & metadata to disk 1744c728ef5SChristoph Hellwig * @file: file to sync 175148f948bSJan Kara * @start: offset in bytes of the beginning of data range to sync 176148f948bSJan Kara * @end: offset in bytes of the end of data range (inclusive) 177148f948bSJan Kara * @datasync: perform only datasync 1784c728ef5SChristoph Hellwig * 179148f948bSJan Kara * Write back data in range @start..@end and metadata for @file to disk. If 180148f948bSJan Kara * @datasync is set only metadata needed to access modified file data is 181148f948bSJan Kara * written. 1824c728ef5SChristoph Hellwig */ 1838018ab05SChristoph Hellwig int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) 184cf9a2ae8SDavid Howells { 1850ae45f63STheodore Ts'o struct inode *inode = file->f_mapping->host; 1860ae45f63STheodore Ts'o 18772c2d531SAl Viro if (!file->f_op->fsync) 18802c24a82SJosef Bacik return -EINVAL; 1890ae45f63STheodore Ts'o if (!datasync && (inode->i_state & I_DIRTY_TIME)) { 1900ae45f63STheodore Ts'o spin_lock(&inode->i_lock); 1910ae45f63STheodore Ts'o inode->i_state &= ~I_DIRTY_TIME; 1920ae45f63STheodore Ts'o spin_unlock(&inode->i_lock); 1930ae45f63STheodore Ts'o mark_inode_dirty_sync(inode); 1940ae45f63STheodore Ts'o } 19502c24a82SJosef Bacik return file->f_op->fsync(file, start, end, datasync); 196cf9a2ae8SDavid Howells } 197148f948bSJan Kara EXPORT_SYMBOL(vfs_fsync_range); 198148f948bSJan Kara 199148f948bSJan Kara /** 200148f948bSJan Kara * vfs_fsync - perform a fsync or fdatasync on a file 201148f948bSJan Kara * @file: file to sync 202148f948bSJan Kara * @datasync: only perform a fdatasync operation 203148f948bSJan Kara * 204148f948bSJan Kara * Write back data and metadata for @file to disk. If @datasync is 205148f948bSJan Kara * set only metadata needed to access modified file data is written. 206148f948bSJan Kara */ 2078018ab05SChristoph Hellwig int vfs_fsync(struct file *file, int datasync) 208148f948bSJan Kara { 2098018ab05SChristoph Hellwig return vfs_fsync_range(file, 0, LLONG_MAX, datasync); 210148f948bSJan Kara } 2114c728ef5SChristoph Hellwig EXPORT_SYMBOL(vfs_fsync); 212cf9a2ae8SDavid Howells 2134c728ef5SChristoph Hellwig static int do_fsync(unsigned int fd, int datasync) 214cf9a2ae8SDavid Howells { 2152903ff01SAl Viro struct fd f = fdget(fd); 216cf9a2ae8SDavid Howells int ret = -EBADF; 217cf9a2ae8SDavid Howells 2182903ff01SAl Viro if (f.file) { 2192903ff01SAl Viro ret = vfs_fsync(f.file, datasync); 2202903ff01SAl Viro fdput(f); 221cf9a2ae8SDavid Howells } 222cf9a2ae8SDavid Howells return ret; 223cf9a2ae8SDavid Howells } 224cf9a2ae8SDavid Howells 225a5f8fa9eSHeiko Carstens SYSCALL_DEFINE1(fsync, unsigned int, fd) 226cf9a2ae8SDavid Howells { 2274c728ef5SChristoph Hellwig return do_fsync(fd, 0); 228cf9a2ae8SDavid Howells } 229cf9a2ae8SDavid Howells 230a5f8fa9eSHeiko Carstens SYSCALL_DEFINE1(fdatasync, unsigned int, fd) 231cf9a2ae8SDavid Howells { 2324c728ef5SChristoph Hellwig return do_fsync(fd, 1); 233cf9a2ae8SDavid Howells } 234cf9a2ae8SDavid Howells 235cf9a2ae8SDavid Howells /* 236f79e2abbSAndrew Morton * sys_sync_file_range() permits finely controlled syncing over a segment of 237f79e2abbSAndrew Morton * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is 238f79e2abbSAndrew Morton * zero then sys_sync_file_range() will operate from offset out to EOF. 239f79e2abbSAndrew Morton * 240f79e2abbSAndrew Morton * The flag bits are: 241f79e2abbSAndrew Morton * 242f79e2abbSAndrew Morton * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range 243f79e2abbSAndrew Morton * before performing the write. 244f79e2abbSAndrew Morton * 245f79e2abbSAndrew Morton * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the 246cce77081SPavel Machek * range which are not presently under writeback. Note that this may block for 247cce77081SPavel Machek * significant periods due to exhaustion of disk request structures. 248f79e2abbSAndrew Morton * 249f79e2abbSAndrew Morton * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range 250f79e2abbSAndrew Morton * after performing the write. 251f79e2abbSAndrew Morton * 252f79e2abbSAndrew Morton * Useful combinations of the flag bits are: 253f79e2abbSAndrew Morton * 254f79e2abbSAndrew Morton * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages 255f79e2abbSAndrew Morton * in the range which were dirty on entry to sys_sync_file_range() are placed 256f79e2abbSAndrew Morton * under writeout. This is a start-write-for-data-integrity operation. 257f79e2abbSAndrew Morton * 258f79e2abbSAndrew Morton * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which 259f79e2abbSAndrew Morton * are not presently under writeout. This is an asynchronous flush-to-disk 260f79e2abbSAndrew Morton * operation. Not suitable for data integrity operations. 261f79e2abbSAndrew Morton * 262f79e2abbSAndrew Morton * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for 263f79e2abbSAndrew Morton * completion of writeout of all pages in the range. This will be used after an 264f79e2abbSAndrew Morton * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait 265f79e2abbSAndrew Morton * for that operation to complete and to return the result. 266f79e2abbSAndrew Morton * 267f79e2abbSAndrew Morton * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: 268f79e2abbSAndrew Morton * a traditional sync() operation. This is a write-for-data-integrity operation 269f79e2abbSAndrew Morton * which will ensure that all pages in the range which were dirty on entry to 270f79e2abbSAndrew Morton * sys_sync_file_range() are committed to disk. 271f79e2abbSAndrew Morton * 272f79e2abbSAndrew Morton * 273f79e2abbSAndrew Morton * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any 274f79e2abbSAndrew Morton * I/O errors or ENOSPC conditions and will return those to the caller, after 275f79e2abbSAndrew Morton * clearing the EIO and ENOSPC flags in the address_space. 276f79e2abbSAndrew Morton * 277f79e2abbSAndrew Morton * It should be noted that none of these operations write out the file's 278f79e2abbSAndrew Morton * metadata. So unless the application is strictly performing overwrites of 279f79e2abbSAndrew Morton * already-instantiated disk blocks, there are no guarantees here that the data 280f79e2abbSAndrew Morton * will be available after a crash. 281f79e2abbSAndrew Morton */ 2824a0fd5bfSAl Viro SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, 2834a0fd5bfSAl Viro unsigned int, flags) 284f79e2abbSAndrew Morton { 285f79e2abbSAndrew Morton int ret; 2862903ff01SAl Viro struct fd f; 2877a0ad10cSChristoph Hellwig struct address_space *mapping; 288f79e2abbSAndrew Morton loff_t endbyte; /* inclusive */ 289f79e2abbSAndrew Morton umode_t i_mode; 290f79e2abbSAndrew Morton 291f79e2abbSAndrew Morton ret = -EINVAL; 292f79e2abbSAndrew Morton if (flags & ~VALID_FLAGS) 293f79e2abbSAndrew Morton goto out; 294f79e2abbSAndrew Morton 295f79e2abbSAndrew Morton endbyte = offset + nbytes; 296f79e2abbSAndrew Morton 297f79e2abbSAndrew Morton if ((s64)offset < 0) 298f79e2abbSAndrew Morton goto out; 299f79e2abbSAndrew Morton if ((s64)endbyte < 0) 300f79e2abbSAndrew Morton goto out; 301f79e2abbSAndrew Morton if (endbyte < offset) 302f79e2abbSAndrew Morton goto out; 303f79e2abbSAndrew Morton 304f79e2abbSAndrew Morton if (sizeof(pgoff_t) == 4) { 305f79e2abbSAndrew Morton if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { 306f79e2abbSAndrew Morton /* 307f79e2abbSAndrew Morton * The range starts outside a 32 bit machine's 308f79e2abbSAndrew Morton * pagecache addressing capabilities. Let it "succeed" 309f79e2abbSAndrew Morton */ 310f79e2abbSAndrew Morton ret = 0; 311f79e2abbSAndrew Morton goto out; 312f79e2abbSAndrew Morton } 313f79e2abbSAndrew Morton if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { 314f79e2abbSAndrew Morton /* 315f79e2abbSAndrew Morton * Out to EOF 316f79e2abbSAndrew Morton */ 317f79e2abbSAndrew Morton nbytes = 0; 318f79e2abbSAndrew Morton } 319f79e2abbSAndrew Morton } 320f79e2abbSAndrew Morton 321f79e2abbSAndrew Morton if (nbytes == 0) 322111ebb6eSOGAWA Hirofumi endbyte = LLONG_MAX; 323f79e2abbSAndrew Morton else 324f79e2abbSAndrew Morton endbyte--; /* inclusive */ 325f79e2abbSAndrew Morton 326f79e2abbSAndrew Morton ret = -EBADF; 3272903ff01SAl Viro f = fdget(fd); 3282903ff01SAl Viro if (!f.file) 329f79e2abbSAndrew Morton goto out; 330f79e2abbSAndrew Morton 331496ad9aaSAl Viro i_mode = file_inode(f.file)->i_mode; 332f79e2abbSAndrew Morton ret = -ESPIPE; 333f79e2abbSAndrew Morton if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && 334f79e2abbSAndrew Morton !S_ISLNK(i_mode)) 335f79e2abbSAndrew Morton goto out_put; 336f79e2abbSAndrew Morton 3372903ff01SAl Viro mapping = f.file->f_mapping; 3387a0ad10cSChristoph Hellwig if (!mapping) { 3397a0ad10cSChristoph Hellwig ret = -EINVAL; 3407a0ad10cSChristoph Hellwig goto out_put; 3417a0ad10cSChristoph Hellwig } 3427a0ad10cSChristoph Hellwig 3437a0ad10cSChristoph Hellwig ret = 0; 3447a0ad10cSChristoph Hellwig if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { 3457a0ad10cSChristoph Hellwig ret = filemap_fdatawait_range(mapping, offset, endbyte); 3467a0ad10cSChristoph Hellwig if (ret < 0) 3477a0ad10cSChristoph Hellwig goto out_put; 3487a0ad10cSChristoph Hellwig } 3497a0ad10cSChristoph Hellwig 3507a0ad10cSChristoph Hellwig if (flags & SYNC_FILE_RANGE_WRITE) { 35123d01270SJan Kara ret = __filemap_fdatawrite_range(mapping, offset, endbyte, 35223d01270SJan Kara WB_SYNC_NONE); 3537a0ad10cSChristoph Hellwig if (ret < 0) 3547a0ad10cSChristoph Hellwig goto out_put; 3557a0ad10cSChristoph Hellwig } 3567a0ad10cSChristoph Hellwig 3577a0ad10cSChristoph Hellwig if (flags & SYNC_FILE_RANGE_WAIT_AFTER) 3587a0ad10cSChristoph Hellwig ret = filemap_fdatawait_range(mapping, offset, endbyte); 3597a0ad10cSChristoph Hellwig 360f79e2abbSAndrew Morton out_put: 3612903ff01SAl Viro fdput(f); 362f79e2abbSAndrew Morton out: 363f79e2abbSAndrew Morton return ret; 364f79e2abbSAndrew Morton } 365f79e2abbSAndrew Morton 366edd5cd4aSDavid Woodhouse /* It would be nice if people remember that not all the world's an i386 367edd5cd4aSDavid Woodhouse when they introduce new system calls */ 3684a0fd5bfSAl Viro SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags, 3694a0fd5bfSAl Viro loff_t, offset, loff_t, nbytes) 370edd5cd4aSDavid Woodhouse { 371edd5cd4aSDavid Woodhouse return sys_sync_file_range(fd, offset, nbytes, flags); 372edd5cd4aSDavid Woodhouse } 373