// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
#include <linux/fsnotify_backend.h>
#include <linux/init.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/memcontrol.h>
#include <linux/statfs.h>
#include <linux/exportfs.h>

#include <asm/ioctls.h>

#include "../fsnotify.h"
#include "../fdinfo.h"
#include "fanotify.h"

#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
#define FANOTIFY_DEFAULT_MAX_GROUPS	128
#define FANOTIFY_DEFAULT_FEE_POOL_SIZE	32

/*
 * The legacy fanotify marks limit (8192) is per group and we introduced a
 * tunable limit of marks per user, similar to inotify. Effectively, the
 * legacy limit of fanotify marks per user is
 * <max marks per group> * <max groups per user>.
 * This default limit (1M) also happens to match the increased limit of
 * inotify max_user_watches since v5.10.
 */
#define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)

/*
 * Most of the memory cost of adding an inode mark is pinning the marked inode.
 * The size of the filesystem inode struct is not uniform across filesystems,
 * so double the size of a VFS inode is used as a conservative approximation.
 */
#define INODE_MARK_COST	(2 * sizeof(struct inode))

/* configurable via /proc/sys/fs/fanotify/ */
static int fanotify_max_queued_events __read_mostly;

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long ft_zero = 0;
static long ft_int_max = INT_MAX;

static const struct ctl_table fanotify_table[] = {
	{
		.procname	= "max_user_groups",
		.data		= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		.procname	= "max_user_marks",
		.data		= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		.procname	= "max_queued_events",
		.data		= &fanotify_max_queued_events,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO
	},
};

static void __init fanotify_sysctls_init(void)
{
	register_sysctl("fs/fanotify", fanotify_table);
}
#else
#define fanotify_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
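
/*
 * Illustrative userspace sketch (not part of the kernel build): with
 * CONFIG_SYSCTL, the knobs above surface as
 * /proc/sys/fs/fanotify/max_user_groups, max_user_marks and
 * max_queued_events. A minimal sketch of raising the per-user mark
 * limit, assuming sufficient privileges:
 *
 *	#include <stdio.h>
 *
 *	int set_max_user_marks(long n)
 *	{
 *		FILE *f = fopen("/proc/sys/fs/fanotify/max_user_marks", "w");
 *
 *		if (!f)
 *			return -1;
 *		fprintf(f, "%ld\n", n);
 *		return fclose(f);
 *	}
 */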

/*
 * All flags that may be specified in parameter event_f_flags of fanotify_init.
 *
 * Internal and external open flags are stored together in field f_flags of
 * struct file. Only external open flags shall be allowed in event_f_flags.
 * Internal flags like FMODE_EXEC shall be excluded.
 */
#define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \
		O_ACCMODE | O_APPEND | O_NONBLOCK | \
		__O_SYNC | O_DSYNC | O_CLOEXEC | \
		O_LARGEFILE | O_NOATIME )

extern const struct fsnotify_ops fanotify_fsnotify_ops;

struct kmem_cache *fanotify_mark_cache __ro_after_init;
struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init;

#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
#define FANOTIFY_PIDFD_INFO_LEN \
	sizeof(struct fanotify_event_info_pidfd)
#define FANOTIFY_ERROR_INFO_LEN \
	(sizeof(struct fanotify_event_info_error))
#define FANOTIFY_RANGE_INFO_LEN \
	(sizeof(struct fanotify_event_info_range))
#define FANOTIFY_MNT_INFO_LEN \
	(sizeof(struct fanotify_event_info_mnt))

static int fanotify_fid_info_len(int fh_len, int name_len)
{
	int info_len = fh_len;

	if (name_len)
		info_len += name_len + 1;

	return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
		       FANOTIFY_EVENT_ALIGN);
}

/* FAN_RENAME may have one or two dir+name info records */
static int fanotify_dir_name_info_len(struct fanotify_event *event)
{
	struct fanotify_info *info = fanotify_event_info(event);
	int dir_fh_len = fanotify_event_dir_fh_len(event);
	int dir2_fh_len = fanotify_event_dir2_fh_len(event);
	int info_len = 0;

	if (dir_fh_len)
		info_len += fanotify_fid_info_len(dir_fh_len,
						  info->name_len);
	if (dir2_fh_len)
		info_len += fanotify_fid_info_len(dir2_fh_len,
						  info->name2_len);

	return info_len;
}

static size_t fanotify_event_len(unsigned int info_mode,
				 struct fanotify_event *event)
{
	size_t event_len = FAN_EVENT_METADATA_LEN;
	int fh_len;
	int dot_len = 0;

	if (fanotify_is_error_event(event->mask))
		event_len += FANOTIFY_ERROR_INFO_LEN;

	if (fanotify_event_has_any_dir_fh(event)) {
		event_len += fanotify_dir_name_info_len(event);
	} else if ((info_mode & FAN_REPORT_NAME) &&
		   (event->mask & FAN_ONDIR)) {
		/*
		 * With group flag FAN_REPORT_NAME, if name was not recorded in
		 * event on a directory, we will report the name ".".
		 */
		dot_len = 1;
	}

	if (fanotify_event_has_object_fh(event)) {
		fh_len = fanotify_event_object_fh_len(event);
		event_len += fanotify_fid_info_len(fh_len, dot_len);
	}
	if (fanotify_is_mnt_event(event->mask))
		event_len += FANOTIFY_MNT_INFO_LEN;

	if (info_mode & FAN_REPORT_PIDFD)
		event_len += FANOTIFY_PIDFD_INFO_LEN;

	if (fanotify_event_has_access_range(event))
		event_len += FANOTIFY_RANGE_INFO_LEN;

	return event_len;
}

/*
 * Remove a hashed event from merge hash table.
 */
static void fanotify_unhash_event(struct fsnotify_group *group,
				  struct fanotify_event *event)
{
	assert_spin_locked(&group->notification_lock);

	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
		 group, event, fanotify_event_hash_bucket(group, event));

	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
		return;

	hlist_del_init(&event->merge_list);
}

/*
 * Get a fanotify notification event if one exists and is small
 * enough to fit in "count". Return an error pointer if the count
 * is not large enough. When permission event is dequeued, its state is
 * updated accordingly.
 */
static struct fanotify_event *get_one_event(struct fsnotify_group *group,
					    size_t count)
{
	size_t event_size;
	struct fanotify_event *event = NULL;
	struct fsnotify_event *fsn_event;
	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);

	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);

	spin_lock(&group->notification_lock);
	fsn_event = fsnotify_peek_first_event(group);
	if (!fsn_event)
		goto out;

	event = FANOTIFY_E(fsn_event);
	event_size = fanotify_event_len(info_mode, event);

	if (event_size > count) {
		event = ERR_PTR(-EINVAL);
		goto out;
	}

	/*
	 * Held the notification_lock the whole time, so this is the
	 * same event we peeked above.
	 */
	fsnotify_remove_first_event(group);
	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
	if (fanotify_is_hashed_event(event->mask))
		fanotify_unhash_event(group, event);
out:
	spin_unlock(&group->notification_lock);
	return event;
}

static int create_fd(struct fsnotify_group *group, const struct path *path,
		     struct file **file)
{
	int client_fd;
	struct file *new_file;

	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
	if (client_fd < 0)
		return client_fd;

	/*
	 * We provide an fd for the userspace program, so it could access the
	 * file without generating fanotify events itself.
	 */
	new_file = dentry_open_nonotify(path, group->fanotify_data.f_flags,
					current_cred());
	if (IS_ERR(new_file)) {
		put_unused_fd(client_fd);
		client_fd = PTR_ERR(new_file);
	} else {
		*file = new_file;
	}

	return client_fd;
}

static int process_access_response_info(const char __user *info,
					size_t info_len,
					struct fanotify_response_info_audit_rule *friar)
{
	if (info_len != sizeof(*friar))
		return -EINVAL;

	if (copy_from_user(friar, info, sizeof(*friar)))
		return -EFAULT;

	if (friar->hdr.type != FAN_RESPONSE_INFO_AUDIT_RULE)
		return -EINVAL;
	if (friar->hdr.pad != 0)
		return -EINVAL;
	if (friar->hdr.len != sizeof(*friar))
		return -EINVAL;

	return info_len;
}

/*
 * Finish processing of permission event by setting it to ANSWERED state and
 * dropping group->notification_lock.
 */
static void finish_permission_event(struct fsnotify_group *group,
				    struct fanotify_perm_event *event, u32 response,
				    struct fanotify_response_info_audit_rule *friar)
	__releases(&group->notification_lock)
{
	bool destroy = false;

	assert_spin_locked(&group->notification_lock);
	event->response = response & ~FAN_INFO;
	if (response & FAN_INFO)
		memcpy(&event->audit_rule, friar, sizeof(*friar));

	if (event->state == FAN_EVENT_CANCELED)
		destroy = true;
	else
		event->state = FAN_EVENT_ANSWERED;
	spin_unlock(&group->notification_lock);
	if (destroy)
		fsnotify_destroy_event(group, &event->fae.fse);
}

static int process_access_response(struct fsnotify_group *group,
				   struct fanotify_response *response_struct,
				   const char __user *info,
				   size_t info_len)
{
	struct fanotify_perm_event *event;
	int fd = response_struct->fd;
	u32 response = response_struct->response;
	int errno = fanotify_get_response_errno(response);
	int ret = info_len;
	struct fanotify_response_info_audit_rule friar;

	pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n",
		 __func__, group, fd, response, errno, info, info_len);
	/*
	 * make sure the response is valid, if invalid we do nothing and either
	 * userspace can send a valid response or we will clean it up after the
	 * timeout
	 */
	if (response & ~FANOTIFY_RESPONSE_VALID_MASK)
		return -EINVAL;

	switch (response & FANOTIFY_RESPONSE_ACCESS) {
	case FAN_ALLOW:
		if (errno)
			return -EINVAL;
		break;
	case FAN_DENY:
		/* Custom errno is supported only for pre-content groups */
		if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT)
			return -EINVAL;

		/*
		 * Limit errno to values expected on open(2)/read(2)/write(2)
		 * of regular files.
		 */
		switch (errno) {
		case 0:
		case EIO:
		case EPERM:
		case EBUSY:
		case ETXTBSY:
		case EAGAIN:
		case ENOSPC:
		case EDQUOT:
			break;
		default:
			return -EINVAL;
		}
		break;
	default:
		return -EINVAL;
	}

	if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
		return -EINVAL;

	if (response & FAN_INFO) {
		ret = process_access_response_info(info, info_len, &friar);
		if (ret < 0)
			return ret;
		if (fd == FAN_NOFD)
			return ret;
	} else {
		ret = 0;
	}

	if (fd < 0)
		return -EINVAL;

	spin_lock(&group->notification_lock);
	list_for_each_entry(event, &group->fanotify_data.access_list,
			    fae.fse.list) {
		if (event->fd != fd)
			continue;

		list_del_init(&event->fae.fse.list);
		finish_permission_event(group, event, response, &friar);
		wake_up(&group->fanotify_data.access_waitq);
		return ret;
	}
	spin_unlock(&group->notification_lock);

	return -ENOENT;
}
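
/*
 * Illustrative userspace sketch (not part of the kernel build): a
 * permission event is answered by writing a struct fanotify_response back
 * to the fanotify fd, which ends up in process_access_response() above.
 * A minimal sketch, assuming "metadata" points to a permission event that
 * was just read from "fanotify_fd":
 *
 *	struct fanotify_response response;
 *
 *	response.fd = metadata->fd;
 *	response.response = FAN_ALLOW;	(FAN_DENY to refuse access)
 *	write(fanotify_fd, &response, sizeof(response));
 */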

static size_t copy_mnt_info_to_user(struct fanotify_event *event,
				    char __user *buf, int count)
{
	struct fanotify_event_info_mnt info = { };

	info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT;
	info.hdr.len = FANOTIFY_MNT_INFO_LEN;

	if (WARN_ON(count < info.hdr.len))
		return -EFAULT;

	info.mnt_id = FANOTIFY_ME(event)->mnt_id;

	if (copy_to_user(buf, &info, sizeof(info)))
		return -EFAULT;

	return info.hdr.len;
}

static size_t copy_error_info_to_user(struct fanotify_event *event,
				      char __user *buf, int count)
{
	struct fanotify_event_info_error info = { };
	struct fanotify_error_event *fee = FANOTIFY_EE(event);

	info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
	info.hdr.len = FANOTIFY_ERROR_INFO_LEN;

	if (WARN_ON(count < info.hdr.len))
		return -EFAULT;

	info.error = fee->error;
	info.error_count = fee->err_count;

	if (copy_to_user(buf, &info, sizeof(info)))
		return -EFAULT;

	return info.hdr.len;
}

static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
				 int info_type, const char *name,
				 size_t name_len,
				 char __user *buf, size_t count)
{
	struct fanotify_event_info_fid info = { };
	struct file_handle handle = { };
	unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
	size_t fh_len = fh ? fh->len : 0;
	size_t info_len = fanotify_fid_info_len(fh_len, name_len);
	size_t len = info_len;

	pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
		 __func__, fh_len, name_len, info_len, count);

	if (WARN_ON_ONCE(len < sizeof(info) || len > count))
		return -EFAULT;

	/*
	 * Copy event info fid header followed by variable sized file handle
	 * and optionally followed by variable sized filename.
	 */
	switch (info_type) {
	case FAN_EVENT_INFO_TYPE_FID:
	case FAN_EVENT_INFO_TYPE_DFID:
		if (WARN_ON_ONCE(name_len))
			return -EFAULT;
		break;
	case FAN_EVENT_INFO_TYPE_DFID_NAME:
	case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME:
	case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME:
		if (WARN_ON_ONCE(!name || !name_len))
			return -EFAULT;
		break;
	default:
		return -EFAULT;
	}

	info.hdr.info_type = info_type;
	info.hdr.len = len;
	info.fsid = *fsid;
	if (copy_to_user(buf, &info, sizeof(info)))
		return -EFAULT;

	buf += sizeof(info);
	len -= sizeof(info);
	if (WARN_ON_ONCE(len < sizeof(handle)))
		return -EFAULT;

	handle.handle_type = fh->type;
	handle.handle_bytes = fh_len;

	/* Mangle handle_type for bad file_handle */
	if (!fh_len)
		handle.handle_type = FILEID_INVALID;

	if (copy_to_user(buf, &handle, sizeof(handle)))
		return -EFAULT;

	buf += sizeof(handle);
	len -= sizeof(handle);
	if (WARN_ON_ONCE(len < fh_len))
		return -EFAULT;

	/*
	 * For an inline fh and inline file name, copy through stack to exclude
	 * the copy from usercopy hardening protections.
	 */
	fh_buf = fanotify_fh_buf(fh);
	if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
		memcpy(bounce, fh_buf, fh_len);
		fh_buf = bounce;
	}
	if (copy_to_user(buf, fh_buf, fh_len))
		return -EFAULT;

	buf += fh_len;
	len -= fh_len;

	if (name_len) {
		/* Copy the filename with terminating null */
		name_len++;
		if (WARN_ON_ONCE(len < name_len))
			return -EFAULT;

		if (copy_to_user(buf, name, name_len))
			return -EFAULT;

		buf += name_len;
		len -= name_len;
	}

	/* Pad with 0's */
	WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
	if (len > 0 && clear_user(buf, len))
		return -EFAULT;

	return info_len;
}

static int copy_pidfd_info_to_user(int pidfd,
				   char __user *buf,
				   size_t count)
{
	struct fanotify_event_info_pidfd info = { };
	size_t info_len = FANOTIFY_PIDFD_INFO_LEN;

	if (WARN_ON_ONCE(info_len > count))
		return -EFAULT;

	info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
	info.hdr.len = info_len;
	info.pidfd = pidfd;

	if (copy_to_user(buf, &info, info_len))
		return -EFAULT;

	return info_len;
}

static size_t copy_range_info_to_user(struct fanotify_event *event,
				      char __user *buf, int count)
{
	struct fanotify_perm_event *pevent = FANOTIFY_PERM(event);
	struct fanotify_event_info_range info = { };
	size_t info_len = FANOTIFY_RANGE_INFO_LEN;

	if (WARN_ON_ONCE(info_len > count))
		return -EFAULT;

	if (WARN_ON_ONCE(!pevent->ppos))
		return -EINVAL;

	info.hdr.info_type = FAN_EVENT_INFO_TYPE_RANGE;
	info.hdr.len = info_len;
	info.offset = *(pevent->ppos);
	info.count = pevent->count;

	if (copy_to_user(buf, &info, info_len))
		return -EFAULT;

	return info_len;
}

static int copy_info_records_to_user(struct fanotify_event *event,
				     struct fanotify_info *info,
				     unsigned int info_mode, int pidfd,
				     char __user *buf, size_t count)
{
	int ret, total_bytes = 0, info_type = 0;
	unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;

	/*
	 * Event info records order is as follows:
	 * 1. dir fid + name
	 * 2. (optional) new dir fid + new name
	 * 3. (optional) child fid
	 */
	if (fanotify_event_has_dir_fh(event)) {
		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
					     FAN_EVENT_INFO_TYPE_DFID;

		/* FAN_RENAME uses special info types */
		if (event->mask & FAN_RENAME)
			info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME;

		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_info_dir_fh(info),
					    info_type,
					    fanotify_info_name(info),
					    info->name_len, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	/* New dir fid+name may be reported in addition to old dir fid+name */
	if (fanotify_event_has_dir2_fh(event)) {
		info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME;
		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_info_dir2_fh(info),
					    info_type,
					    fanotify_info_name2(info),
					    info->name2_len, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_event_has_object_fh(event)) {
		const char *dot = NULL;
		int dot_len = 0;

		if (fid_mode == FAN_REPORT_FID || info_type) {
			/*
			 * With only group flag FAN_REPORT_FID only type FID is
			 * reported. Second info record type is always FID.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		} else if ((fid_mode & FAN_REPORT_NAME) &&
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_NAME, if name was not
			 * recorded in an event on a directory, report the name
			 * "." with info type DFID_NAME.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
			dot = ".";
			dot_len = 1;
		} else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_DIR_FID, a single info
			 * record has type DFID for directory entry modification
			 * event and for event on a directory.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID;
		} else {
			/*
			 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
			 * a single info record has type FID for event on a
			 * non-directory, when there is no directory to report.
			 * For example, on FAN_DELETE_SELF event.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		}

		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_event_object_fh(event),
					    info_type, dot, dot_len,
					    buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (pidfd_mode) {
		ret = copy_pidfd_info_to_user(pidfd, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_is_error_event(event->mask)) {
		ret = copy_error_info_to_user(event, buf, count);
		if (ret < 0)
			return ret;
		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_event_has_access_range(event)) {
		ret = copy_range_info_to_user(event, buf, count);
		if (ret < 0)
			return ret;
		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_is_mnt_event(event->mask)) {
		ret = copy_mnt_info_to_user(event, buf, count);
		if (ret < 0)
			return ret;
		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	return total_bytes;
}
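
/*
 * Illustrative userspace sketch (not part of the kernel build): the info
 * records emitted above follow struct fanotify_event_metadata within one
 * event. A minimal parsing sketch, assuming "metadata" points to one
 * event of metadata->event_len bytes:
 *
 *	struct fanotify_event_info_header *hdr;
 *	char *p = (char *)metadata + metadata->metadata_len;
 *	char *end = (char *)metadata + metadata->event_len;
 *
 *	for (; p < end; p += hdr->len) {
 *		hdr = (struct fanotify_event_info_header *)p;
 *		if (hdr->info_type == FAN_EVENT_INFO_TYPE_FID) {
 *			struct fanotify_event_info_fid *fid = (void *)hdr;
 *			struct file_handle *fh = (void *)fid->handle;
 *
 *			handle_fid(&fid->fsid, fh);	(hypothetical helper)
 *		}
 *	}
 *
 * The file handle may then be resolved with open_by_handle_at(2) by a
 * sufficiently privileged listener.
 */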

static ssize_t copy_event_to_user(struct fsnotify_group *group,
				  struct fanotify_event *event,
				  char __user *buf, size_t count)
{
	struct fanotify_event_metadata metadata;
	const struct path *path = fanotify_event_path(event);
	struct fanotify_info *info = fanotify_event_info(event);
	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
	struct file *f = NULL, *pidfd_file = NULL;
	int ret, pidfd = -ESRCH, fd = -EBADF;

	pr_debug("%s: group=%p event=%p\n", __func__, group, event);

	metadata.event_len = fanotify_event_len(info_mode, event);
	metadata.metadata_len = FAN_EVENT_METADATA_LEN;
	metadata.vers = FANOTIFY_METADATA_VERSION;
	metadata.reserved = 0;
	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
	metadata.pid = pid_vnr(event->pid);
	/*
	 * For an unprivileged listener, event->pid can be used to identify the
	 * events generated by the listener process itself, without disclosing
	 * the pids of other processes.
	 */
	if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
	    task_tgid(current) != event->pid)
		metadata.pid = 0;

	/*
	 * For now, fid mode is required for an unprivileged listener and
	 * fid mode does not report fd in events. Keep this check anyway
	 * for safety in case fid mode requirement is relaxed in the future
	 * to allow unprivileged listener to get events with no fd and no fid.
	 */
	if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
	    path && path->mnt && path->dentry) {
		fd = create_fd(group, path, &f);
		/*
		 * Opening an fd from dentry can fail for several reasons.
		 * For example, when tasks are gone and we try to open their
		 * /proc files or we try to open a WRONLY file like in sysfs
		 * or when trying to open a file that was deleted on the
		 * remote network server.
		 *
		 * For a group with FAN_REPORT_FD_ERROR, we will send the
		 * event with the error instead of the open fd, otherwise
		 * userspace may not get the error at all.
		 * In any case, userspace will not know which file failed to
		 * open, so add a debug print for further investigation.
		 */
		if (fd < 0) {
			pr_debug("fanotify: create_fd(%pd2) failed err=%d\n",
				 path->dentry, fd);
			if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) {
				/*
				 * Historically, we've handled EOPENSTALE in a
				 * special way and silently dropped such
				 * events. Now we have to keep it to maintain
				 * backward compatibility...
				 */
				if (fd == -EOPENSTALE)
					fd = 0;
				return fd;
			}
		}
	}
	if (FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR))
		metadata.fd = fd;
	else
		metadata.fd = fd >= 0 ? fd : FAN_NOFD;

	if (pidfd_mode) {
		/*
		 * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
		 * exclusion is ever lifted. At the time of incorporating pidfd
		 * support within fanotify, the pidfd API only supported the
		 * creation of pidfds for thread-group leaders.
		 */
		WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));

		/*
		 * The PIDTYPE_TGID check for an event->pid is performed
		 * preemptively in an attempt to catch out cases where the event
		 * listener reads events after the event generating process has
		 * already terminated. Depending on flag FAN_REPORT_FD_ERROR,
		 * report either -ESRCH or FAN_NOPIDFD to the event listener in
		 * those cases with all other pidfd creation errors reported as
		 * the error code itself or as FAN_EPIDFD.
		 */
		if (metadata.pid && pid_has_task(event->pid, PIDTYPE_TGID))
			pidfd = pidfd_prepare(event->pid, 0, &pidfd_file);

		if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR) && pidfd < 0)
			pidfd = pidfd == -ESRCH ? FAN_NOPIDFD : FAN_EPIDFD;
	}

	ret = -EFAULT;
	/*
	 * Sanity check copy size in case get_one_event() and
	 * event_len sizes ever get out of sync.
	 */
	if (WARN_ON_ONCE(metadata.event_len > count))
		goto out_close_fd;

	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
		goto out_close_fd;

	buf += FAN_EVENT_METADATA_LEN;
	count -= FAN_EVENT_METADATA_LEN;

	ret = copy_info_records_to_user(event, info, info_mode, pidfd,
					buf, count);
	if (ret < 0)
		goto out_close_fd;

	if (f)
		fd_install(fd, f);

	if (pidfd_file)
		fd_install(pidfd, pidfd_file);

	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PERM(event)->fd = fd;

	return metadata.event_len;

out_close_fd:
	if (f) {
		put_unused_fd(fd);
		fput(f);
	}

	if (pidfd_file) {
		put_unused_fd(pidfd);
		fput(pidfd_file);
	}

	return ret;
}

/* fanotify userspace file descriptor functions */
static __poll_t fanotify_poll(struct file *file, poll_table *wait)
{
	struct fsnotify_group *group = file->private_data;
	__poll_t ret = 0;

	poll_wait(file, &group->notification_waitq, wait);
	spin_lock(&group->notification_lock);
	if (!fsnotify_notify_queue_is_empty(group))
		ret = EPOLLIN | EPOLLRDNORM;
	spin_unlock(&group->notification_lock);

	return ret;
}
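
/*
 * Illustrative userspace sketch (not part of the kernel build): the
 * fanotify fd can be multiplexed with poll(2)/epoll(7); per fanotify_poll()
 * above it becomes readable when the notification queue is non-empty.
 * A minimal sketch:
 *
 *	struct pollfd pfd = { .fd = fanotify_fd, .events = POLLIN };
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *		handle_events(fanotify_fd);	(hypothetical helper)
 */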

static ssize_t fanotify_read(struct file *file, char __user *buf,
			     size_t count, loff_t *pos)
{
	struct fsnotify_group *group;
	struct fanotify_event *event;
	char __user *start;
	int ret;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	start = buf;
	group = file->private_data;

	pr_debug("%s: group=%p\n", __func__, group);

	add_wait_queue(&group->notification_waitq, &wait);
	while (1) {
		/*
		 * User can supply arbitrarily large buffer. Avoid softlockups
		 * in case there are lots of available events.
		 */
		cond_resched();
		event = get_one_event(group, count);
		if (IS_ERR(event)) {
			ret = PTR_ERR(event);
			break;
		}

		if (!event) {
			ret = -EAGAIN;
			if (file->f_flags & O_NONBLOCK)
				break;

			ret = -ERESTARTSYS;
			if (signal_pending(current))
				break;

			if (start != buf)
				break;

			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
			continue;
		}

		ret = copy_event_to_user(group, event, buf, count);

		/*
		 * Permission events get queued to wait for response. Other
		 * events can be destroyed now.
		 */
		if (!fanotify_is_perm_event(event->mask)) {
			fsnotify_destroy_event(group, &event->fse);
		} else {
			if (ret <= 0 || FANOTIFY_PERM(event)->fd < 0) {
				spin_lock(&group->notification_lock);
				finish_permission_event(group,
					FANOTIFY_PERM(event), FAN_DENY, NULL);
				wake_up(&group->fanotify_data.access_waitq);
			} else {
				spin_lock(&group->notification_lock);
				list_add_tail(&event->fse.list,
					      &group->fanotify_data.access_list);
				spin_unlock(&group->notification_lock);
			}
		}
		if (ret < 0)
			break;
		buf += ret;
		count -= ret;
	}
	remove_wait_queue(&group->notification_waitq, &wait);

	if (start != buf && ret != -EFAULT)
		ret = buf - start;
	return ret;
}
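
/*
 * Illustrative userspace sketch (not part of the kernel build): events are
 * consumed with read(2) in a buffer large enough for at least one event;
 * the uapi helpers FAN_EVENT_OK()/FAN_EVENT_NEXT() walk the variable-sized
 * events that fanotify_read() packed into the buffer. A minimal sketch:
 *
 *	char buf[4096];
 *	struct fanotify_event_metadata *metadata;
 *	ssize_t len = read(fanotify_fd, buf, sizeof(buf));
 *
 *	for (metadata = (struct fanotify_event_metadata *)buf;
 *	     FAN_EVENT_OK(metadata, len);
 *	     metadata = FAN_EVENT_NEXT(metadata, len)) {
 *		if (metadata->vers != FANOTIFY_METADATA_VERSION)
 *			break;	(kernel/userspace ABI mismatch)
 *		...
 *		if (metadata->fd >= 0)
 *			close(metadata->fd);
 *	}
 */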

static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	struct fanotify_response response;
	struct fsnotify_group *group;
	int ret;
	const char __user *info_buf = buf + sizeof(struct fanotify_response);
	size_t info_len;

	if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
		return -EINVAL;

	group = file->private_data;

	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);

	if (count < sizeof(response))
		return -EINVAL;

	if (copy_from_user(&response, buf, sizeof(response)))
		return -EFAULT;

	info_len = count - sizeof(response);

	ret = process_access_response(group, &response, info_buf, info_len);
	if (ret < 0)
		count = ret;
	else
		count = sizeof(response) + ret;

	return count;
}

static int fanotify_release(struct inode *ignored, struct file *file)
{
	struct fsnotify_group *group = file->private_data;
	struct fsnotify_event *fsn_event;

	/*
	 * Stop new events from arriving in the notification queue. Since
	 * userspace cannot use fanotify fd anymore, no event can enter or
	 * leave access_list by now either.
	 */
	fsnotify_group_stop_queueing(group);

	/*
	 * Process all permission events on access_list and notification queue
	 * and simulate reply from userspace.
	 */
	spin_lock(&group->notification_lock);
	while (!list_empty(&group->fanotify_data.access_list)) {
		struct fanotify_perm_event *event;

		event = list_first_entry(&group->fanotify_data.access_list,
				struct fanotify_perm_event, fae.fse.list);
		list_del_init(&event->fae.fse.list);
		finish_permission_event(group, event, FAN_ALLOW, NULL);
		spin_lock(&group->notification_lock);
	}

	/*
	 * Destroy all non-permission events. For permission events just
	 * dequeue them and set the response. They will be freed once the
	 * response is consumed and fanotify_get_response() returns.
	 */
	while ((fsn_event = fsnotify_remove_first_event(group))) {
		struct fanotify_event *event = FANOTIFY_E(fsn_event);

		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
			spin_unlock(&group->notification_lock);
			fsnotify_destroy_event(group, fsn_event);
		} else {
			finish_permission_event(group, FANOTIFY_PERM(event),
						FAN_ALLOW, NULL);
		}
		spin_lock(&group->notification_lock);
	}
	spin_unlock(&group->notification_lock);

	/* Response for all permission events is set, wake up waiters */
	wake_up(&group->fanotify_data.access_waitq);

	/* matches the fanotify_init->fsnotify_alloc_group */
	fsnotify_destroy_group(group);

	return 0;
}

static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct fsnotify_group *group;
	struct fsnotify_event *fsn_event;
	void __user *p;
	int ret = -ENOTTY;
	size_t send_len = 0;

	group = file->private_data;

	p = (void __user *) arg;

	switch (cmd) {
	case FIONREAD:
		spin_lock(&group->notification_lock);
		list_for_each_entry(fsn_event, &group->notification_list, list)
			send_len += FAN_EVENT_METADATA_LEN;
		spin_unlock(&group->notification_lock);
		ret = put_user(send_len, (int __user *) p);
		break;
	}

	return ret;
}
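
/*
 * Illustrative userspace sketch (not part of the kernel build): as
 * implemented above, FIONREAD reports FAN_EVENT_METADATA_LEN bytes per
 * queued event, not the full event_len with info records. A sketch of
 * checking the backlog:
 *
 *	int queued_bytes;
 *
 *	if (ioctl(fanotify_fd, FIONREAD, &queued_bytes) == 0)
 *		printf("~%d events queued\n",
 *		       queued_bytes / (int)FAN_EVENT_METADATA_LEN);
 */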

static const struct file_operations fanotify_fops = {
	.show_fdinfo	= fanotify_show_fdinfo,
	.poll		= fanotify_poll,
	.read		= fanotify_read,
	.write		= fanotify_write,
	.fasync		= NULL,
	.release	= fanotify_release,
	.unlocked_ioctl	= fanotify_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
	.llseek		= noop_llseek,
};

static int fanotify_find_path(int dfd, const char __user *filename,
			      struct path *path, unsigned int flags, __u64 mask,
			      unsigned int obj_type)
{
	int ret;

	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
		 dfd, filename, flags);

	if (filename == NULL) {
		CLASS(fd, f)(dfd);

		if (fd_empty(f))
			return -EBADF;

		if ((flags & FAN_MARK_ONLYDIR) &&
		    !(S_ISDIR(file_inode(fd_file(f))->i_mode)))
			return -ENOTDIR;

		*path = fd_file(f)->f_path;
		path_get(path);
	} else {
		unsigned int lookup_flags = 0;

		if (!(flags & FAN_MARK_DONT_FOLLOW))
			lookup_flags |= LOOKUP_FOLLOW;
		if (flags & FAN_MARK_ONLYDIR)
			lookup_flags |= LOOKUP_DIRECTORY;

		ret = user_path_at(dfd, filename, lookup_flags, path);
		if (ret)
			goto out;
	}

	/* you can only watch an inode if you have read permissions on it */
	ret = path_permission(path, MAY_READ);
	if (ret) {
		path_put(path);
		goto out;
	}

	ret = security_path_notify(path, mask, obj_type);
	if (ret)
		path_put(path);

out:
	return ret;
}

static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
					    __u32 mask, unsigned int flags,
					    __u32 umask, int *destroy)
{
	__u32 oldmask, newmask;

	/* umask bits cannot be removed by user */
	mask &= ~umask;
	spin_lock(&fsn_mark->lock);
	oldmask = fsnotify_calc_mask(fsn_mark);
	if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) {
		fsn_mark->mask &= ~mask;
	} else {
		fsn_mark->ignore_mask &= ~mask;
	}
	newmask = fsnotify_calc_mask(fsn_mark);
	/*
	 * We need to keep the mark around even if remaining mask cannot
	 * result in any events (e.g. mask == FAN_ONDIR) to support incremental
	 * changes to the mask.
	 * Destroy mark when only umask bits remain.
	 */
	*destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask);
	spin_unlock(&fsn_mark->lock);

	return oldmask & ~newmask;
}

static int fanotify_remove_mark(struct fsnotify_group *group,
				void *obj, unsigned int obj_type, __u32 mask,
				unsigned int flags, __u32 umask)
{
	struct fsnotify_mark *fsn_mark = NULL;
	__u32 removed;
	int destroy_mark;

	fsnotify_group_lock(group);
	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
	if (!fsn_mark) {
		fsnotify_group_unlock(group);
		return -ENOENT;
	}

	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
						 umask, &destroy_mark);
	if (removed & fsnotify_conn_mask(fsn_mark->connector))
		fsnotify_recalc_mask(fsn_mark->connector);
	if (destroy_mark)
		fsnotify_detach_mark(fsn_mark);
	fsnotify_group_unlock(group);
	if (destroy_mark)
		fsnotify_free_mark(fsn_mark);

	/* matches the fsnotify_find_mark() */
	fsnotify_put_mark(fsn_mark);
	return 0;
}

static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
				       unsigned int fan_flags)
{
	bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE);
	unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS;
	bool recalc = false;

	/*
	 * When using FAN_MARK_IGNORE for the first time, mark starts using
	 * independent event flags in ignore mask. After that, trying to
	 * update the ignore mask with the old FAN_MARK_IGNORED_MASK API
	 * will result in EEXIST error.
	 */
	if (ignore == FAN_MARK_IGNORE)
		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS;

	/*
	 * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
	 * the removal of the FS_MODIFY bit in calculated mask if it was set
	 * because of an ignore mask that is now going to survive FS_MODIFY.
	 */
	if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
		if (!(fsn_mark->mask & FS_MODIFY))
			recalc = true;
	}

	if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE ||
	    want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
		return recalc;

	/*
	 * NO_IREF may be removed from a mark, but not added.
	 * When removed, fsnotify_recalc_mask() will take the inode ref.
	 */
	WARN_ON_ONCE(!want_iref);
	fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;

	return true;
}

static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
				      __u32 mask, unsigned int fan_flags)
{
	bool recalc;

	spin_lock(&fsn_mark->lock);
	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS))
		fsn_mark->mask |= mask;
	else
		fsn_mark->ignore_mask |= mask;

	recalc = fsnotify_calc_mask(fsn_mark) &
		~fsnotify_conn_mask(fsn_mark->connector);

	recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags);
	spin_unlock(&fsn_mark->lock);

	return recalc;
}

struct fan_fsid {
	struct super_block *sb;
	__kernel_fsid_t id;
	bool weak;
};

static int fanotify_set_mark_fsid(struct fsnotify_group *group,
				  struct fsnotify_mark *mark,
				  struct fan_fsid *fsid)
{
	struct fsnotify_mark_connector *conn;
	struct fsnotify_mark *old;
	struct super_block *old_sb = NULL;

	FANOTIFY_MARK(mark)->fsid = fsid->id;
	mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID;
	if (fsid->weak)
		mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID;

	/* First mark added will determine if group is single or multi fsid */
	if (list_empty(&group->marks_list))
		return 0;

	/* Find sb of an existing mark */
	list_for_each_entry(old, &group->marks_list, g_list) {
		conn = READ_ONCE(old->connector);
		if (!conn)
			continue;
		old_sb = fsnotify_connector_sb(conn);
		if (old_sb)
			break;
	}

	/* Only detached marks left? */
	if (!old_sb)
		return 0;

	/* Do not allow mixing of marks with weak and strong fsid */
	if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID)
		return -EXDEV;

	/* Allow mixing of marks with strong fsid from different fs */
	if (!fsid->weak)
		return 0;

	/* Do not allow mixing marks with weak fsid from different fs */
	if (old_sb != fsid->sb)
		return -EXDEV;

	/* Do not allow mixing marks from different btrfs sub-volumes */
	if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid,
				 &FANOTIFY_MARK(mark)->fsid))
		return -EXDEV;

	return 0;
}

static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
						   void *obj,
						   unsigned int obj_type,
						   unsigned int fan_flags,
						   struct fan_fsid *fsid)
{
	struct ucounts *ucounts = group->fanotify_data.ucounts;
	struct fanotify_mark *fan_mark;
	struct fsnotify_mark *mark;
	int ret;

	/*
	 * Enforce per-user mark limits in all containing user ns.
	 * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
	 * in the limited groups account.
	 */
	BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_MARKS));
	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
		return ERR_PTR(-ENOSPC);

	fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
	if (!fan_mark) {
		ret = -ENOMEM;
		goto out_dec_ucounts;
	}

	mark = &fan_mark->fsn_mark;
	fsnotify_init_mark(mark, group);
	if (fan_flags & FAN_MARK_EVICTABLE)
		mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;

	/* Cache fsid of filesystem containing the marked object */
	if (fsid) {
		ret = fanotify_set_mark_fsid(group, mark, fsid);
		if (ret)
			goto out_put_mark;
	} else {
		fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
	}

	ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0);
	if (ret)
		goto out_put_mark;

	return mark;

out_put_mark:
	fsnotify_put_mark(mark);
out_dec_ucounts:
	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
		dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
	return ERR_PTR(ret);
}

static int fanotify_group_init_error_pool(struct fsnotify_group *group)
{
	if (mempool_initialized(&group->fanotify_data.error_events_pool))
		return 0;

	return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
					 FANOTIFY_DEFAULT_FEE_POOL_SIZE,
					 sizeof(struct fanotify_error_event));
}

static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
					     __u32 mask, unsigned int fan_flags)
{
	/*
	 * Non evictable mark cannot be downgraded to evictable mark.
	 */
	if (fan_flags & FAN_MARK_EVICTABLE &&
	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
		return -EEXIST;

	/*
	 * New ignore mask semantics cannot be downgraded to old semantics.
	 */
	if (fan_flags & FAN_MARK_IGNORED_MASK &&
	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
		return -EEXIST;

	/*
	 * An ignore mask that survives modify could never be downgraded to not
	 * survive modify. With new FAN_MARK_IGNORE semantics we make that rule
	 * explicit and return an error when trying to update the ignore mask
	 * without the original FAN_MARK_IGNORED_SURV_MODIFY value.
	 */
	if (fan_flags & FAN_MARK_IGNORE &&
	    !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
		return -EEXIST;

	/* For now pre-content events are not generated for directories */
	mask |= fsn_mark->mask;
	if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR)
		return -EEXIST;

	return 0;
}

static int fanotify_add_mark(struct fsnotify_group *group,
			     void *obj, unsigned int obj_type,
			     __u32 mask, unsigned int fan_flags,
			     struct fan_fsid *fsid)
{
	struct fsnotify_mark *fsn_mark;
	bool recalc;
	int ret = 0;

	fsnotify_group_lock(group);
	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
	if (!fsn_mark) {
		fsn_mark = fanotify_add_new_mark(group, obj, obj_type,
						 fan_flags, fsid);
		if (IS_ERR(fsn_mark)) {
			fsnotify_group_unlock(group);
			return PTR_ERR(fsn_mark);
		}
	}

	/*
	 * Check if requested mark flags conflict with the flags of an
	 * existing mark.
	 */
	ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags);
	if (ret)
		goto out;

	/*
	 * Error events are pre-allocated per group, only if strictly
	 * needed (i.e. FAN_FS_ERROR was requested).
	 */
	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) &&
	    (mask & FAN_FS_ERROR)) {
		ret = fanotify_group_init_error_pool(group);
		if (ret)
			goto out;
	}

	recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags);
	if (recalc)
		fsnotify_recalc_mask(fsn_mark->connector);

out:
	fsnotify_group_unlock(group);

	fsnotify_put_mark(fsn_mark);
	return ret;
}

static struct fsnotify_event *fanotify_alloc_overflow_event(void)
{
	struct fanotify_event *oevent;

	oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
	if (!oevent)
		return NULL;

	fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
	oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;

	return &oevent->fse;
}

static struct hlist_head *fanotify_alloc_merge_hash(void)
{
	struct hlist_head *hash;

	hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
		       GFP_KERNEL_ACCOUNT);
	if (!hash)
		return NULL;

	__hash_init(hash, FANOTIFY_HTABLE_SIZE);

	return hash;
}

/* fanotify syscalls */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
	struct user_namespace *user_ns = current_user_ns();
	struct fsnotify_group *group;
	int f_flags, fd;
	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
	unsigned int class = flags & FANOTIFY_CLASS_BITS;
	unsigned int internal_flags = 0;
	struct file *file;

	pr_debug("%s: flags=%x event_f_flags=%x\n",
		 __func__, flags, event_f_flags);

	if (!capable(CAP_SYS_ADMIN)) {
		/*
		 * An unprivileged user can set up a fanotify group with
		 * limited functionality - an unprivileged group is limited to
		 * notification events with file handles or mount ids and it
		 * cannot use unlimited queue/marks.
		 */
		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) ||
		    !(flags & (FANOTIFY_FID_BITS | FAN_REPORT_MNT)))
			return -EPERM;

		/*
		 * Setting the internal flag FANOTIFY_UNPRIV on the group
		 * prevents setting mount/filesystem marks on this group and
		 * prevents reporting pid and open fd in events.
		 */
		internal_flags |= FANOTIFY_UNPRIV;
	}

#ifdef CONFIG_AUDITSYSCALL
	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
#else
	if (flags & ~FANOTIFY_INIT_FLAGS)
#endif
		return -EINVAL;

	/*
	 * A pidfd can only be returned for a thread-group leader; thus
	 * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
	 * exclusive.
	 */
	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
		return -EINVAL;

	/* Don't allow mixing mnt events with inode events for now */
	if (flags & FAN_REPORT_MNT) {
		if (class != FAN_CLASS_NOTIF)
			return -EINVAL;
		if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR))
			return -EINVAL;
	}

	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
		return -EINVAL;

	switch (event_f_flags & O_ACCMODE) {
	case O_RDONLY:
	case O_RDWR:
	case O_WRONLY:
		break;
	default:
		return -EINVAL;
	}

	if (fid_mode && class != FAN_CLASS_NOTIF)
		return -EINVAL;

	/*
	 * Child name is reported with parent fid so requires dir fid.
	 * We can report both child fid and dir fid with or without name.
	 */
	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
		return -EINVAL;

	/*
	 * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID
	 * and is used as an indication to report both dir and child fid on all
	 * dirent events.
	 */
	if ((fid_mode & FAN_REPORT_TARGET_FID) &&
	    (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
		return -EINVAL;

	f_flags = O_RDWR;
	if (flags & FAN_CLOEXEC)
		f_flags |= O_CLOEXEC;
	if (flags & FAN_NONBLOCK)
		f_flags |= O_NONBLOCK;

	/* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
	group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
				     FSNOTIFY_GROUP_USER);
	if (IS_ERR(group)) {
		return PTR_ERR(group);
	}

	/* Enforce group limits per user in all containing user ns */
	group->fanotify_data.ucounts = inc_ucount(user_ns, current_euid(),
						  UCOUNT_FANOTIFY_GROUPS);
	if (!group->fanotify_data.ucounts) {
		fd = -EMFILE;
		goto out_destroy_group;
	}

	group->fanotify_data.flags = flags | internal_flags;
	group->memcg = get_mem_cgroup_from_mm(current->mm);
	group->user_ns = get_user_ns(user_ns);

	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
	if (!group->fanotify_data.merge_hash) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	group->overflow_event = fanotify_alloc_overflow_event();
	if (unlikely(!group->overflow_event)) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	if (force_o_largefile())
		event_f_flags |= O_LARGEFILE;
	group->fanotify_data.f_flags = event_f_flags;
	init_waitqueue_head(&group->fanotify_data.access_waitq);
	INIT_LIST_HEAD(&group->fanotify_data.access_list);
	switch (class) {
	case FAN_CLASS_NOTIF:
		group->priority = FSNOTIFY_PRIO_NORMAL;
		break;
	case FAN_CLASS_CONTENT:
		group->priority = FSNOTIFY_PRIO_CONTENT;
		break;
	case FAN_CLASS_PRE_CONTENT:
		group->priority = FSNOTIFY_PRIO_PRE_CONTENT;
		break;
	default:
		fd = -EINVAL;
		goto out_destroy_group;
	}

	BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_QUEUE));
	if (flags & FAN_UNLIMITED_QUEUE) {
		group->max_events = UINT_MAX;
	} else {
		group->max_events = fanotify_max_queued_events;
	}

	if (flags & FAN_ENABLE_AUDIT) {
		fd = -EPERM;
		if (!capable(CAP_AUDIT_WRITE))
			goto out_destroy_group;
	}

	fd = get_unused_fd_flags(f_flags);
	if (fd < 0)
		goto out_destroy_group;

	file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group,
					f_flags, FMODE_NONOTIFY);
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		fd = PTR_ERR(file);
		goto out_destroy_group;
	}
	fd_install(fd, file);
	return fd;

out_destroy_group:
	fsnotify_destroy_group(group);
	return fd;
}
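
/*
 * Illustrative userspace sketch (not part of the kernel build): a typical
 * privileged initialization for a content-class (permission event capable)
 * group:
 *
 *	int fanotify_fd = fanotify_init(FAN_CLOEXEC | FAN_CLASS_CONTENT,
 *					O_RDONLY | O_LARGEFILE);
 *	if (fanotify_fd < 0)
 *		perror("fanotify_init");
 *
 * Per the capability checks above, an unprivileged listener must instead
 * request fid (or mount id) reporting, e.g. FAN_CLASS_NOTIF | FAN_REPORT_FID.
 */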

static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags,
			      struct fan_fsid *fsid)
{
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	__kernel_fsid_t root_fsid;
	int err;

	/*
	 * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
	 */
	err = vfs_get_fsid(dentry, &fsid->id);
	if (err)
		return err;

	fsid->sb = dentry->d_sb;
	if (!fsid->id.val[0] && !fsid->id.val[1]) {
		err = -ENODEV;
		goto weak;
	}

	/*
	 * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
	 * which uses a different fsid than sb root.
	 */
	err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
	if (err)
		return err;

	if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) {
		err = -EXDEV;
		goto weak;
	}

	fsid->weak = false;
	return 0;

weak:
	/* Allow weak fsid when marking inodes */
	fsid->weak = true;
	return (mark_type == FAN_MARK_INODE) ? 0 : err;
}

/* Check if filesystem can encode a unique fid */
static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
{
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	const struct export_operations *nop = dentry->d_sb->s_export_op;

	/*
	 * We need to make sure that the filesystem supports encoding of
	 * file handles so user can use name_to_handle_at() to compare fids
	 * reported with events to the file handle of watched objects.
	 */
	if (!exportfs_can_encode_fid(nop))
		return -EOPNOTSUPP;

	/*
	 * For sb/mount mark, we also need to make sure that the filesystem
	 * supports decoding file handles, so user has a way to map back the
	 * reported fids to filesystem objects.
	 */
	if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop))
		return -EOPNOTSUPP;

	return 0;
}
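
/*
 * Illustrative userspace sketch (not part of the kernel build): as the
 * comment above says, a listener using FAN_REPORT_FID can obtain the file
 * handle of a watched object with name_to_handle_at() and compare it with
 * the fids reported in events:
 *
 *	struct file_handle *fh;
 *	int mount_id;
 *
 *	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
 *	fh->handle_bytes = MAX_HANDLE_SZ;
 *	if (name_to_handle_at(AT_FDCWD, "/watched/dir", fh, &mount_id, 0) == 0)
 *		remember_handle(fh);	(hypothetical helper)
 */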

static int fanotify_events_supported(struct fsnotify_group *group,
				     const struct path *path, __u64 mask,
				     unsigned int flags)
{
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	bool is_dir = d_is_dir(path->dentry);
	/* Strict validation of events in non-dir inode mask with v5.17+ APIs */
	bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
				 (mask & FAN_RENAME) ||
				 (flags & FAN_MARK_IGNORE);

	/*
	 * Filesystems need to opt-into pre-content events (a.k.a HSM)
	 * and they are only supported on regular files and directories.
	 */
	if (mask & FANOTIFY_PRE_CONTENT_EVENTS) {
		if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM))
			return -EOPNOTSUPP;
		if (!is_dir && !d_is_reg(path->dentry))
			return -EINVAL;
	}

	/*
	 * Some filesystems such as 'proc' acquire unusual locks when opening
	 * files. For them fanotify permission events have high chances of
	 * deadlocking the system - open done when reporting fanotify event
	 * blocks on this "unusual" lock while another process holding the lock
	 * waits for fanotify permission event to be answered. Just disallow
	 * permission events for such filesystems.
	 */
	if (mask & FANOTIFY_PERM_EVENTS &&
	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
		return -EINVAL;

	/*
	 * mount and sb marks are not allowed on kernel internal pseudo fs,
	 * like pipe_mnt, because that would subscribe to events on all the
	 * anonymous pipes in the system.
	 *
	 * SB_NOUSER covers all of the internal pseudo fs whose objects are not
	 * exposed to user's mount namespace, but there are other SB_KERNMOUNT
	 * fs, like nsfs, debugfs, for which the value of allowing sb and mount
	 * mark is questionable. For now we leave them alone.
	 */
	if (mark_type != FAN_MARK_INODE &&
	    path->mnt->mnt_sb->s_flags & SB_NOUSER)
		return -EINVAL;

	/*
	 * We shouldn't have allowed setting dirent events and the directory
	 * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode,
	 * but because we always allowed it, error only when using new APIs.
	 */
	if (strict_dir_events && mark_type == FAN_MARK_INODE &&
	    !is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
		return -ENOTDIR;

	return 0;
}

static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
			    int dfd, const char __user *pathname)
{
	struct inode *inode = NULL;
	struct fsnotify_group *group;
	struct path path;
	struct fan_fsid __fsid, *fsid = NULL;
	struct user_namespace *user_ns = NULL;
	struct mnt_namespace *mntns;
	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
	unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
	unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
	unsigned int obj_type, fid_mode;
	void *obj = NULL;
	u32 umask = 0;
	int ret;

	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
		 __func__, fanotify_fd, flags, dfd, pathname, mask);

	/* we only use the lower 32 bits as of right now. */
	if (upper_32_bits(mask))
		return -EINVAL;

	if (flags & ~FANOTIFY_MARK_FLAGS)
		return -EINVAL;

	switch (mark_type) {
	case FAN_MARK_INODE:
		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
		break;
	case FAN_MARK_MOUNT:
		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
		break;
	case FAN_MARK_FILESYSTEM:
		obj_type = FSNOTIFY_OBJ_TYPE_SB;
		break;
	case FAN_MARK_MNTNS:
		obj_type = FSNOTIFY_OBJ_TYPE_MNTNS;
		break;
	default:
		return -EINVAL;
	}

	switch (mark_cmd) {
	case FAN_MARK_ADD:
	case FAN_MARK_REMOVE:
		if (!mask)
			return -EINVAL;
		break;
	case FAN_MARK_FLUSH:
		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
		valid_mask |= FANOTIFY_PERM_EVENTS;

	if (mask & ~valid_mask)
		return -EINVAL;

	/* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */
	if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK))
		return -EINVAL;

	/*
	 * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with
	 * FAN_MARK_IGNORED_MASK.
	 */
	if (ignore == FAN_MARK_IGNORED_MASK) {
		mask &= ~FANOTIFY_EVENT_FLAGS;
		umask = FANOTIFY_EVENT_FLAGS;
	}

	CLASS(fd, f)(fanotify_fd);
	if (fd_empty(f))
		return -EBADF;

	/* verify that this is indeed a fanotify instance */
	if (unlikely(fd_file(f)->f_op != &fanotify_fops))
		return -EINVAL;
	group = fd_file(f)->private_data;

	/* Only report mount events on mnt namespace */
	if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) {
		if (mask & ~FANOTIFY_MOUNT_EVENTS)
			return -EINVAL;
		if (mark_type != FAN_MARK_MNTNS)
			return -EINVAL;
	} else {
		if (mask & FANOTIFY_MOUNT_EVENTS)
			return -EINVAL;
		if (mark_type == FAN_MARK_MNTNS)
			return -EINVAL;
	}

	/*
	 * A user is allowed to setup sb/mount/mntns marks only if it is
	 * capable in the user ns where the group was created.
	 */
	if (!ns_capable(group->user_ns, CAP_SYS_ADMIN) &&
	    mark_type != FAN_MARK_INODE)
		return -EPERM;

	/*
	 * Permission events are not allowed for FAN_CLASS_NOTIF.
	 * Pre-content permission events are not allowed for FAN_CLASS_CONTENT.
	 */
	if (mask & FANOTIFY_PERM_EVENTS &&
	    group->priority == FSNOTIFY_PRIO_NORMAL)
		return -EINVAL;
	else if (mask & FANOTIFY_PRE_CONTENT_EVENTS &&
		 group->priority == FSNOTIFY_PRIO_CONTENT)
		return -EINVAL;

	if (mask & FAN_FS_ERROR &&
	    mark_type != FAN_MARK_FILESYSTEM)
		return -EINVAL;

	/*
	 * Evictable is only relevant for inode marks, because only inode object
	 * can be evicted on memory pressure.
	 */
	if (flags & FAN_MARK_EVICTABLE &&
	    mark_type != FAN_MARK_INODE)
		return -EINVAL;

	/*
	 * Events that do not carry enough information to report
	 * event->fd require a group that supports reporting fid. Those
	 * events are not supported on a mount mark, because they do not
	 * carry enough information (i.e. path) to be filtered by mount
	 * point.
	 */
	fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
	if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) &&
	    (!fid_mode || mark_type == FAN_MARK_MOUNT))
		return -EINVAL;

	/*
	 * FAN_RENAME uses special info type records to report the old and
	 * new parent+name. Reporting only old and new parent id is less
	 * useful and was not implemented.
	 */
	if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
		return -EINVAL;

	/* Pre-content events are not currently generated for directories. */
	if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR)
		return -EINVAL;

	if (mark_cmd == FAN_MARK_FLUSH) {
		fsnotify_clear_marks_by_group(group, obj_type);
		return 0;
	}

	ret = fanotify_find_path(dfd, pathname, &path, flags,
				 (mask & ALL_FSNOTIFY_EVENTS), obj_type);
	if (ret)
		return ret;

	if (mark_cmd == FAN_MARK_ADD) {
		ret = fanotify_events_supported(group, &path, mask, flags);
		if (ret)
			goto path_put_and_out;
	}

	if (fid_mode) {
		ret = fanotify_test_fsid(path.dentry, flags, &__fsid);
		if (ret)
			goto path_put_and_out;

		ret = fanotify_test_fid(path.dentry, flags);
		if (ret)
			goto path_put_and_out;

		fsid = &__fsid;
	}

	/*
	 * In addition to being capable in the user ns where group was created,
	 * the user also needs to be capable in the user ns associated with
	 * the filesystem or in the user ns associated with the mntns
	 * (when marking mntns).
	 */
	if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
		inode = path.dentry->d_inode;
		obj = inode;
	} else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
		user_ns = path.mnt->mnt_sb->s_user_ns;
		obj = path.mnt;
	} else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) {
		user_ns = path.mnt->mnt_sb->s_user_ns;
		obj = path.mnt->mnt_sb;
	} else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
		mntns = mnt_ns_from_dentry(path.dentry);
		user_ns = mntns->user_ns;
		obj = mntns;
	}

	ret = -EPERM;
	if (user_ns && !ns_capable(user_ns, CAP_SYS_ADMIN))
		goto path_put_and_out;

	ret = -EINVAL;
	if (!obj)
		goto path_put_and_out;

	/*
	 * If some other task has this inode open for write we should not add
	 * an ignore mask, unless that ignore mask is supposed to survive
	 * modification changes anyway.
	 */
	if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) &&
	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) {
		ret = !inode ? -EINVAL : -EISDIR;
		/* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
		if (ignore == FAN_MARK_IGNORE &&
		    (!inode || S_ISDIR(inode->i_mode)))
			goto path_put_and_out;

		ret = 0;
		if (inode && inode_is_open_for_write(inode))
			goto path_put_and_out;
	}

	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
	if (!inode || !S_ISDIR(inode->i_mode)) {
		mask &= ~FAN_EVENT_ON_CHILD;
		umask = FAN_EVENT_ON_CHILD;
		/*
		 * If group needs to report parent fid, register for getting
		 * events with parent/name info for non-directory.
		 */
		if ((fid_mode & FAN_REPORT_DIR_FID) &&
		    (flags & FAN_MARK_ADD) && !ignore)
			mask |= FAN_EVENT_ON_CHILD;
	}

	/* create/update an inode mark */
	switch (mark_cmd) {
	case FAN_MARK_ADD:
		ret = fanotify_add_mark(group, obj, obj_type, mask, flags,
					fsid);
		break;
	case FAN_MARK_REMOVE:
		ret = fanotify_remove_mark(group, obj, obj_type, mask, flags,
					   umask);
		break;
	default:
		ret = -EINVAL;
	}

path_put_and_out:
	path_put(&path);
	return ret;
}

#ifndef CONFIG_ARCH_SPLIT_ARG64
SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
		__u64, mask, int, dfd,
		const char __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
}
#endif

#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
SYSCALL32_DEFINE6(fanotify_mark,
		  int, fanotify_fd, unsigned int, flags,
		  SC_ARG64(mask), int, dfd,
		  const char __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask),
				dfd, pathname);
}
#endif
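
/*
 * Illustrative userspace sketch (not part of the kernel build): marking a
 * whole mount for open permission and close-writable events, in the style
 * of the fanotify_mark(2) example:
 *
 *	if (fanotify_mark(fanotify_fd, FAN_MARK_ADD | FAN_MARK_MOUNT,
 *			  FAN_OPEN_PERM | FAN_CLOSE_WRITE, AT_FDCWD,
 *			  "/home") == -1)
 *		perror("fanotify_mark");
 */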

/*
 * fanotify_user_setup - Our initialization function. Note that we cannot return
 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
 * must result in panic().
 */
static int __init fanotify_user_setup(void)
{
	struct sysinfo si;
	int max_marks;

	si_meminfo(&si);
	/*
	 * Allow up to 1% of addressable memory to be accounted for per user
	 * marks limited to the range [8192, 1048576]. mount and sb marks are
	 * a lot cheaper than inode marks, but there is no reason for a user
	 * to have many of those, so calculate by the cost of inode marks.
	 */
	max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
		    INODE_MARK_COST;
	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
			  FANOTIFY_DEFAULT_MAX_USER_MARKS);

	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14);
	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);

	fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
					 SLAB_PANIC|SLAB_ACCOUNT);
	fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
					       SLAB_PANIC);
	fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
						SLAB_PANIC);
	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
		fanotify_perm_event_cachep =
			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
	}
	fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC);

	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
		FANOTIFY_DEFAULT_MAX_GROUPS;
	init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
	fanotify_sysctls_init();

	return 0;
}
device_initcall(fanotify_user_setup);