// SPDX-License-Identifier: GPL-2.0-or-later
/* vnode and volume validity verification.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include "internal.h"

/*
 * Data validation is managed through a number of mechanisms from the server:
 *
 * (1) On first contact with a server (such as if it has just been rebooted),
 *     the server sends us a CB.InitCallBackState* request.
 *
 * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
 *     calls, the server maintains a time-limited per-vnode promise that it
 *     will send us a CB.CallBack request if a third party alters the vnodes
 *     accessed.
 *
 *     Note that vnode-level callbacks may also be sent for other reasons,
 *     such as filelock release.
 *
 * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
 *     calls, each server maintains a time-limited per-volume promise that it
 *     will send us a CB.CallBack request if the RO volume is updated to a
 *     snapshot of the RW volume ("vos release").  This is an atomic event
 *     that cuts over all instances of the RO volume across multiple servers
 *     simultaneously.
 *
 *     Note that volume-level callbacks may also be sent for other reasons,
 *     such as the volumeserver taking over control of the volume from the
 *     fileserver.
 *
 *     Note also that each server maintains an independent time limit on an
 *     independent callback.
 *
 * (4) Certain RPC calls include a volume information record "VolSync" in
 *     their reply.  This contains a creation date for the volume that should
 *     remain unchanged for a RW volume (but will be changed if the volume is
 *     restored from backup) or will be bumped to the time of snapshotting
 *     when a RO volume is released.
 *
 * In order to track these events, the following are provided:
 *
 *	->cb_v_break.  A counter of events that might mean that the contents
 *	of a volume have been altered since we last checked a vnode.
 *
 *	->cb_v_check.  A counter of the number of events that we've sent a
 *	query to the server for.  Everything's up to date if this equals
 *	cb_v_break.
 *
 *	->cb_scrub.  A counter of the number of regression events for which we
 *	have to completely wipe the cache.
 *
 *	->cb_ro_snapshot.  A counter of the number of times that we've
 *	recognised that a RO volume has been updated.
 *
 *	->cb_break.  A counter of events that might mean that the contents of
 *	a vnode have been altered.
 *
 *	->cb_expires_at.  The time at which the callback promise expires, or
 *	AFS_NO_CB_PROMISE if we have no promise.
 *
 * The way we manage things is:
 *
 * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
 *     the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
 *     volume and the volume's server record.
 *
 * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
 *     callback break on all the volumes that have been using that server
 *     (ie. increment ->cb_v_break and reset ->cb_expires_at).
 *
 * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
 *     vnode and reset its ->cb_expires_at.  If the vnode is mmapped, we also
 *     dispatch a work item to unmap all PTEs to the vnode's pagecache to
 *     force reentry to the filesystem for revalidation.
 *
 * (4) When entering the filesystem, we call afs_validate() to check the
 *     validity of a vnode.  This first checks to see if ->cb_v_check and
 *     ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
 *     exclusively and perform an FS.FetchStatus on the vnode.
 *
 *     After checking the volume, we check the vnode.  If there's a mismatch
 *     between the volume counters and the vnode's mirrors of those counters,
 *     we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
 *
 * (5) When the reply from FS.FetchStatus arrives, the VolSync record is
 *     parsed:
 *
 *     (A) If the Creation timestamp has changed on a RW volume or regressed
 *	   on a RO volume, we try to increment ->cb_scrub; if it advances on a
 *	   RO volume, we assume "vos release" happened and try to increment
 *	   ->cb_ro_snapshot.
 *
 *     (B) If the Update timestamp has regressed, we try to increment
 *	   ->cb_scrub.
 *
 *     Note that in both of these cases, we only do the increment if we can
 *     cmpxchg the value of the timestamp from the value we noted before the
 *     op.  This tries to prevent parallel ops from fighting one another.
 *
 *     volume->cb_v_check is then set to ->cb_v_break.
 *
 * (6) The AFSCallBack record included in the FS.FetchStatus reply is also
 *     parsed and used to set the promise in ->cb_expires_at for the vnode,
 *     the volume and the volume's server record.
 *
 * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
 *     the vnode.
 */
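
/*
 * As a concrete sketch of step (1) above (simplified; the real break handlers
 * live elsewhere in fs/afs/), a volume-level callback break amounts to:
 *
 *	atomic_inc(&volume->cb_v_break);
 *	volume->cb_expires_at = AFS_NO_CB_PROMISE;
 *
 * plus the same reset on the volume's server record.  afs_check_validity()
 * below then sees cb_v_check != cb_v_break and declares the vnode invalid,
 * forcing a status fetch on the next entry to the filesystem.
 */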

/*
 * Check the validity of a vnode/inode and its parent volume.
 */
bool afs_check_validity(const struct afs_vnode *vnode)
{
	const struct afs_volume *volume = vnode->volume;
	enum afs_vnode_invalid_trace trace = afs_vnode_valid_trace;
	time64_t cb_expires_at = atomic64_read(&vnode->cb_expires_at);
	time64_t deadline = ktime_get_real_seconds() + 10;

	/* A deleted vnode has nothing left to revalidate; the caller checks
	 * the flag and returns -ESTALE instead.
	 */
	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
		return true;

	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break))
		trace = afs_vnode_invalid_trace_cb_v_break;
	else if (cb_expires_at == AFS_NO_CB_PROMISE)
		trace = afs_vnode_invalid_trace_no_cb_promise;
	else if (cb_expires_at <= deadline)
		trace = afs_vnode_invalid_trace_expired;
	else if (volume->cb_expires_at <= deadline)
		trace = afs_vnode_invalid_trace_vol_expired;
	else if (vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot))
		trace = afs_vnode_invalid_trace_cb_ro_snapshot;
	else if (vnode->cb_scrub != atomic_read(&volume->cb_scrub))
		trace = afs_vnode_invalid_trace_cb_scrub;
	else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
		trace = afs_vnode_invalid_trace_zap_data;
	else
		return true;
	trace_afs_vnode_invalid(vnode, trace);
	return false;
}
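
/*
 * A minimal sketch of how this is consumed (hypothetical caller;
 * afs_validate() below is the real consumer):
 *
 *	if (!afs_check_validity(vnode))
 *		ret = afs_validate(vnode, key);
 *
 * ie. the cheap, lockless check gates the lock-taking, server-visiting slow
 * path.
 */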

/*
 * See if the server we've just talked to is currently excluded.
 */
static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	const struct afs_server_entry *se;
	const struct afs_server_list *slist;
	bool is_excluded = true;
	int i;

	rcu_read_lock();

	slist = rcu_dereference(volume->servers);
	for (i = 0; i < slist->nr_servers; i++) {
		se = &slist->servers[i];
		if (op->server == se->server) {
			is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
			break;
		}
	}

	rcu_read_unlock();
	return is_excluded;
}

/*
 * Update the volume's server list when the creation time changes and see if
 * the server we've just talked to is currently excluded.
 */
static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	int ret;

	if (__afs_is_server_excluded(op, volume))
		return 1;

	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
	ret = afs_check_volume_status(op->volume, op);
	if (ret < 0)
		return ret;

	return __afs_is_server_excluded(op, volume);
}

/*
 * Handle a change to the volume creation time in the VolSync record.
 */
static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
{
	unsigned int snap;
	time64_t cur = volume->creation_time;
	time64_t old = op->pre_volsync.creation;
	time64_t new = op->volsync.creation;
	int ret;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->creation_time = new;
		return 0;
	}

	if (new == cur)
		return 0;

	/* Try to advance the creation timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur != old)
		return 0;

	/* If the creation time changes in an unexpected way, we need to scrub
	 * our caches.  For a RW vol, this will only change if the volume is
	 * restored from a backup; for a RO/Backup vol, this will advance when
	 * the volume is updated to a new snapshot (eg. "vos release").
	 */
	if (volume->type == AFSVL_RWVOL)
		goto regressed;
	if (volume->type == AFSVL_BACKVOL) {
		if (new < old)
			goto regressed;
		goto advance;
	}

	/* We have an RO volume; we need to query the VL server and look at the
	 * server flags to see if RW->RO replication is in progress.
	 */
	ret = afs_is_server_excluded(op, volume);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		snap = atomic_read(&volume->cb_ro_snapshot);
		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
		return ret;
	}

advance:
	snap = atomic_inc_return(&volume->cb_ro_snapshot);
	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
	volume->creation_time = new;
	return 0;

regressed:
	atomic_inc(&volume->cb_scrub);
	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
	volume->creation_time = new;
	return 0;
}
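
/*
 * The guard above reduces to the following sketch: of several racing
 * operations, only the one whose pre-op snapshot still matches the recorded
 * creation time acts on the change (locking elided; volsync_lock is in fact
 * held by the caller):
 *
 *	if (volume->creation_time == op->pre_volsync.creation)
 *		volume->creation_time = op->volsync.creation;
 *
 * Every other op finds the times already reconciled and returns early.
 */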

/*
 * Handle a change to the volume update time in the VolSync record.
 */
static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
{
	enum afs_cb_break_reason reason = afs_cb_break_no_break;
	time64_t cur = volume->update_time;
	time64_t old = op->pre_volsync.update;
	time64_t new = op->volsync.update;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->update_time = new;
		return;
	}

	if (new == cur)
		return;

	/* If the volume update time changes in an unexpected way, we need to
	 * scrub our caches.  For a RW vol, this will advance on every
	 * modification op; for a RO/Backup vol, this will advance when the
	 * volume is updated to a new snapshot (eg. "vos release").
	 */
	if (new < old)
		reason = afs_cb_break_for_update_regress;

	/* Try to advance the update timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur == old) {
		if (reason == afs_cb_break_for_update_regress) {
			atomic_inc(&volume->cb_scrub);
			trace_afs_cb_v_break(volume->vid, 0, reason);
		}
		volume->update_time = new;
	}
}

static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
{
	int ret = 0;

	if (likely(op->volsync.creation == volume->creation_time &&
		   op->volsync.update == volume->update_time))
		return 0;

	mutex_lock(&volume->volsync_lock);
	if (op->volsync.creation != volume->creation_time) {
		ret = afs_update_volume_creation_time(op, volume);
		if (ret < 0)
			goto out;
	}
	if (op->volsync.update != volume->update_time)
		afs_update_volume_update_time(op, volume);
out:
	mutex_unlock(&volume->volsync_lock);
	return ret;
}
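
/*
 * Note the shape of afs_update_volume_times() above: an unlocked peek skips
 * the mutex entirely on the common no-change path, then both comparisons are
 * repeated under volsync_lock because a racing operation may have reconciled
 * the times in the meantime.  A minimal sketch of the pattern, with
 * no_change()/apply_updates() as hypothetical stand-ins:
 *
 *	if (likely(no_change(op, volume)))
 *		return 0;
 *	mutex_lock(&volume->volsync_lock);
 *	if (!no_change(op, volume))
 *		apply_updates(op, volume);
 *	mutex_unlock(&volume->volsync_lock);
 */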

/*
 * Update the state of a volume, including recording the expiration time of
 * the callback promise.  Returns 1 to redo the operation from the start.
 */
int afs_update_volume_state(struct afs_operation *op)
{
	struct afs_server_list *slist = op->server_list;
	struct afs_server_entry *se = &slist->servers[op->server_index];
	struct afs_callback *cb = &op->file[0].scb.callback;
	struct afs_volume *volume = op->volume;
	unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
	unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
	int ret;

	_enter("%llx", op->volume->vid);

	if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
		ret = afs_update_volume_times(op, volume);
		if (ret != 0) {
			_leave(" = %d", ret);
			return ret;
		}
	}

	if (op->cb_v_break == cb_v_break &&
	    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
		time64_t expires_at = cb->expires_at;

		if (!op->file[0].scb.have_cb)
			expires_at = op->file[1].scb.callback.expires_at;

		se->cb_expires_at = expires_at;
		volume->cb_expires_at = expires_at;
	}
	if (cb_v_check < op->cb_v_break)
		atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
	return 0;
}
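
/*
 * An illustrative restatement of the have_cb handling above: an operation may
 * carry status records for two vnodes (file[0] and file[1], eg. for a
 * two-vnode op such as rename), and the server may have issued a callback
 * promise on either, so the expiry is taken from whichever record has one,
 * preferring file[0]:
 *
 *	time64_t expires_at = op->file[0].scb.have_cb ?
 *		op->file[0].scb.callback.expires_at :
 *		op->file[1].scb.callback.expires_at;
 *
 * (This assumes at least one have_cb is set, as the enclosing test
 * guarantees.)
 */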

/*
 * Mark the data attached to an inode as obsolete due to a write on the server
 * - might also want to ditch all the outstanding writes and dirty pages.
 */
static void afs_zap_data(struct afs_vnode *vnode)
{
	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);

	afs_invalidate_cache(vnode, 0);

	/* Nuke all the non-dirty pages that aren't locked, mapped or being
	 * written back in a regular file and completely discard the pages in
	 * a directory or symlink.
	 */
	if (S_ISREG(vnode->netfs.inode.i_mode))
		filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX);
	else
		filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX);
}

/*
 * Validate a vnode/inode.  There are several things we need to check:
 *  - parent dir data changed (rm, rmdir, rename, mkdir, create, link,
 *    symlink)
 *  - parent dir metadata changed (security changes)
 *  - dentry data changed (write, truncate)
 *  - dentry metadata changed (security changes)
 */
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
	struct afs_volume *volume = vnode->volume;
	unsigned int cb_ro_snapshot, cb_scrub;
	time64_t deadline = ktime_get_real_seconds() + 10;
	bool zap = false, locked_vol = false;
	int ret;

	_enter("{v={%llx:%llu} fl=%lx},%x",
	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
	       key_serial(key));

	if (afs_check_validity(vnode))
		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;

	ret = down_write_killable(&vnode->validate_lock);
	if (ret < 0)
		goto error;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
		ret = -ESTALE;
		goto error_unlock;
	}

	/* Validate a volume after the v_break has changed or the volume
	 * callback expired.  We only want to do this once per volume per
	 * v_break change.  The actual work will be done when parsing the
	 * status fetch reply.
	 */
	if (volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
		ret = mutex_lock_interruptible(&volume->cb_check_lock);
		if (ret < 0)
			goto error_unlock;
		locked_vol = true;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub)
		unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);

	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub != cb_scrub ||
	    volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline) {
		ret = afs_fetch_status(vnode, key, false, NULL);
		if (ret < 0) {
			if (ret == -ENOENT) {
				set_bit(AFS_VNODE_DELETED, &vnode->flags);
				ret = -ESTALE;
			}
			goto error_unlock;
		}

		_debug("new promise [fl=%lx]", vnode->flags);
	}

	/* We can drop the volume check lock now. */
	if (locked_vol) {
		mutex_unlock(&volume->cb_check_lock);
		locked_vol = false;
	}

	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	_debug("vnode inval %x==%x %x==%x",
	       vnode->cb_ro_snapshot, cb_ro_snapshot,
	       vnode->cb_scrub, cb_scrub);
	if (vnode->cb_scrub != cb_scrub)
		zap = true;
	vnode->cb_ro_snapshot = cb_ro_snapshot;
	vnode->cb_scrub = cb_scrub;

	/* If the vnode's data version number changed then its contents are
	 * different.
	 */
	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
	if (zap)
		afs_zap_data(vnode);
	up_write(&vnode->validate_lock);
	_leave(" = 0");
	return 0;

error_unlock:
	if (locked_vol)
		mutex_unlock(&volume->cb_check_lock);
	up_write(&vnode->validate_lock);
error:
	_leave(" = %d", ret);
	return ret;
}
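
/*
 * A minimal usage sketch for afs_validate() (hypothetical caller; the real
 * callers are the pathwalk, permission and open hooks elsewhere in fs/afs/):
 *
 *	ret = afs_validate(vnode, key);
 *	if (ret < 0)
 *		return ret;	(-ESTALE if the vnode was deleted)
 *
 * On success the vnode's counters mirror the volume's and any stale pagecache
 * has been zapped, so the caller may trust locally cached data until the next
 * callback break or promise expiry.
 */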