1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/fanotify.h>
3 #include <linux/fcntl.h>
4 #include <linux/file.h>
5 #include <linux/fs.h>
6 #include <linux/anon_inodes.h>
7 #include <linux/fsnotify_backend.h>
8 #include <linux/init.h>
9 #include <linux/mount.h>
10 #include <linux/namei.h>
11 #include <linux/poll.h>
12 #include <linux/security.h>
13 #include <linux/syscalls.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/uaccess.h>
17 #include <linux/compat.h>
18 #include <linux/sched/signal.h>
19 #include <linux/memcontrol.h>
20 #include <linux/statfs.h>
21 #include <linux/exportfs.h>
22 
23 #include <asm/ioctls.h>
24 
25 #include "../fsnotify.h"
26 #include "../fdinfo.h"
27 #include "fanotify.h"
28 
/*
 * Built-in default limits.  Where each default is applied is not visible in
 * this part of the file.
 */
#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
#define FANOTIFY_DEFAULT_MAX_GROUPS	128
#define FANOTIFY_DEFAULT_FEE_POOL_SIZE	32

/*
 * The legacy fanotify marks limit (8192) is per group.  A tunable limit of
 * marks per user was introduced later, similar to inotify.  Effectively, the
 * legacy limit of fanotify marks per user is
 * <max marks per group> * <max groups per user>.
 * This default limit (1M) also happens to match the increased limit of inotify
 * max_user_watches since v5.10.
 */
#define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)

/*
 * Most of the memory cost of adding an inode mark is pinning the marked inode.
 * The size of the filesystem inode struct is not uniform across filesystems,
 * so double the size of a VFS inode is used as a conservative approximation.
 */
#define INODE_MARK_COST	(2 * sizeof(struct inode))

/* configurable via /proc/sys/fs/fanotify/ */
static int fanotify_max_queued_events __read_mostly;
53 
#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/* Bounds referenced by the .extra1/.extra2 fields of the unsigned long entries. */
static long ft_zero = 0;
static long ft_int_max = INT_MAX;

/*
 * Sysctl entries registered under "fs/fanotify" by fanotify_sysctls_init().
 *
 * max_user_groups and max_user_marks point into init_user_ns.ucount_max[] and
 * are handled by proc_doulongvec_minmax with ft_zero/ft_int_max as bounds.
 * max_queued_events points at fanotify_max_queued_events and only sets
 * .extra1 (SYSCTL_ZERO); no .extra2 is given for it.
 */
static const struct ctl_table fanotify_table[] = {
	{
		.procname	= "max_user_groups",
		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		.procname	= "max_user_marks",
		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
		.maxlen		= sizeof(long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &ft_zero,
		.extra2		= &ft_int_max,
	},
	{
		.procname	= "max_queued_events",
		.data		= &fanotify_max_queued_events,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO
	},
};

/* Register fanotify_table under the "fs/fanotify" sysctl path. */
static void __init fanotify_sysctls_init(void)
{
	register_sysctl("fs/fanotify", fanotify_table);
}
#else
/* Without CONFIG_SYSCTL there is no table to register; expand to a no-op. */
#define fanotify_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
97 
/*
 * All flags that may be specified in parameter event_f_flags of fanotify_init.
 *
 * Internal and external open flags are stored together in field f_flags of
 * struct file. Only external open flags shall be allowed in event_f_flags.
 * Internal flags like FMODE_EXEC shall be excluded.
 */
#define	FANOTIFY_INIT_ALL_EVENT_F_BITS				( \
		O_ACCMODE	| O_APPEND	| O_NONBLOCK	| \
		__O_SYNC	| O_DSYNC	| O_CLOEXEC     | \
		O_LARGEFILE	| O_NOATIME	)

/* Declared here only; the definition is outside this file. */
extern const struct fsnotify_ops fanotify_fsnotify_ops;

/*
 * Slab caches, one for marks and one per event flavour named in the
 * identifier.  They are written during init only (__ro_after_init); their
 * creation is not visible in this part of the file.
 */
struct kmem_cache *fanotify_mark_cache __ro_after_init;
struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init;
117 
/* Alignment, in bytes, that variable-length FID info records are rounded to. */
#define FANOTIFY_EVENT_ALIGN 4
/* Fixed part of a FID info record: the info header followed by a file_handle. */
#define FANOTIFY_FID_INFO_HDR_LEN \
	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
/*
 * Sizes of the fixed-length info records.  Every expansion is parenthesized
 * so the macros can be used safely inside larger expressions.
 */
#define FANOTIFY_PIDFD_INFO_LEN \
	(sizeof(struct fanotify_event_info_pidfd))
#define FANOTIFY_ERROR_INFO_LEN \
	(sizeof(struct fanotify_event_info_error))
#define FANOTIFY_RANGE_INFO_LEN \
	(sizeof(struct fanotify_event_info_range))
#define FANOTIFY_MNT_INFO_LEN \
	(sizeof(struct fanotify_event_info_mnt))
129 
130 static int fanotify_fid_info_len(int fh_len, int name_len)
131 {
132 	int info_len = fh_len;
133 
134 	if (name_len)
135 		info_len += name_len + 1;
136 
137 	return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
138 		       FANOTIFY_EVENT_ALIGN);
139 }
140 
141 /* FAN_RENAME may have one or two dir+name info records */
142 static int fanotify_dir_name_info_len(struct fanotify_event *event)
143 {
144 	struct fanotify_info *info = fanotify_event_info(event);
145 	int dir_fh_len = fanotify_event_dir_fh_len(event);
146 	int dir2_fh_len = fanotify_event_dir2_fh_len(event);
147 	int info_len = 0;
148 
149 	if (dir_fh_len)
150 		info_len += fanotify_fid_info_len(dir_fh_len,
151 						  info->name_len);
152 	if (dir2_fh_len)
153 		info_len += fanotify_fid_info_len(dir2_fh_len,
154 						  info->name2_len);
155 
156 	return info_len;
157 }
158 
159 static size_t fanotify_event_len(unsigned int info_mode,
160 				 struct fanotify_event *event)
161 {
162 	size_t event_len = FAN_EVENT_METADATA_LEN;
163 	int fh_len;
164 	int dot_len = 0;
165 
166 	if (fanotify_is_error_event(event->mask))
167 		event_len += FANOTIFY_ERROR_INFO_LEN;
168 
169 	if (fanotify_event_has_any_dir_fh(event)) {
170 		event_len += fanotify_dir_name_info_len(event);
171 	} else if ((info_mode & FAN_REPORT_NAME) &&
172 		   (event->mask & FAN_ONDIR)) {
173 		/*
174 		 * With group flag FAN_REPORT_NAME, if name was not recorded in
175 		 * event on a directory, we will report the name ".".
176 		 */
177 		dot_len = 1;
178 	}
179 
180 	if (fanotify_event_has_object_fh(event)) {
181 		fh_len = fanotify_event_object_fh_len(event);
182 		event_len += fanotify_fid_info_len(fh_len, dot_len);
183 	}
184 	if (fanotify_is_mnt_event(event->mask))
185 		event_len += FANOTIFY_MNT_INFO_LEN;
186 
187 	if (info_mode & FAN_REPORT_PIDFD)
188 		event_len += FANOTIFY_PIDFD_INFO_LEN;
189 
190 	if (fanotify_event_has_access_range(event))
191 		event_len += FANOTIFY_RANGE_INFO_LEN;
192 
193 	return event_len;
194 }
195 
196 /*
197  * Remove an hashed event from merge hash table.
198  */
199 static void fanotify_unhash_event(struct fsnotify_group *group,
200 				  struct fanotify_event *event)
201 {
202 	assert_spin_locked(&group->notification_lock);
203 
204 	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
205 		 group, event, fanotify_event_hash_bucket(group, event));
206 
207 	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
208 		return;
209 
210 	hlist_del_init(&event->merge_list);
211 }
212 
213 /*
214  * Get an fanotify notification event if one exists and is small
215  * enough to fit in "count". Return an error pointer if the count
216  * is not large enough. When permission event is dequeued, its state is
217  * updated accordingly.
218  */
219 static struct fanotify_event *get_one_event(struct fsnotify_group *group,
220 					    size_t count)
221 {
222 	size_t event_size;
223 	struct fanotify_event *event = NULL;
224 	struct fsnotify_event *fsn_event;
225 	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
226 
227 	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
228 
229 	spin_lock(&group->notification_lock);
230 	fsn_event = fsnotify_peek_first_event(group);
231 	if (!fsn_event)
232 		goto out;
233 
234 	event = FANOTIFY_E(fsn_event);
235 	event_size = fanotify_event_len(info_mode, event);
236 
237 	if (event_size > count) {
238 		event = ERR_PTR(-EINVAL);
239 		goto out;
240 	}
241 
242 	/*
243 	 * Held the notification_lock the whole time, so this is the
244 	 * same event we peeked above.
245 	 */
246 	fsnotify_remove_first_event(group);
247 	if (fanotify_is_perm_event(event->mask))
248 		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
249 	if (fanotify_is_hashed_event(event->mask))
250 		fanotify_unhash_event(group, event);
251 out:
252 	spin_unlock(&group->notification_lock);
253 	return event;
254 }
255 
256 static int create_fd(struct fsnotify_group *group, const struct path *path,
257 		     struct file **file)
258 {
259 	int client_fd;
260 	struct file *new_file;
261 
262 	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
263 	if (client_fd < 0)
264 		return client_fd;
265 
266 	/*
267 	 * We provide an fd for the userspace program, so it could access the
268 	 * file without generating fanotify events itself.
269 	 */
270 	new_file = dentry_open_nonotify(path, group->fanotify_data.f_flags,
271 					current_cred());
272 	if (IS_ERR(new_file)) {
273 		put_unused_fd(client_fd);
274 		client_fd = PTR_ERR(new_file);
275 	} else {
276 		*file = new_file;
277 	}
278 
279 	return client_fd;
280 }
281 
282 static int process_access_response_info(const char __user *info,
283 					size_t info_len,
284 				struct fanotify_response_info_audit_rule *friar)
285 {
286 	if (info_len != sizeof(*friar))
287 		return -EINVAL;
288 
289 	if (copy_from_user(friar, info, sizeof(*friar)))
290 		return -EFAULT;
291 
292 	if (friar->hdr.type != FAN_RESPONSE_INFO_AUDIT_RULE)
293 		return -EINVAL;
294 	if (friar->hdr.pad != 0)
295 		return -EINVAL;
296 	if (friar->hdr.len != sizeof(*friar))
297 		return -EINVAL;
298 
299 	return info_len;
300 }
301 
302 /*
303  * Finish processing of permission event by setting it to ANSWERED state and
304  * drop group->notification_lock.
305  */
306 static void finish_permission_event(struct fsnotify_group *group,
307 				    struct fanotify_perm_event *event, u32 response,
308 				    struct fanotify_response_info_audit_rule *friar)
309 				    __releases(&group->notification_lock)
310 {
311 	bool destroy = false;
312 
313 	assert_spin_locked(&group->notification_lock);
314 	event->response = response & ~FAN_INFO;
315 	if (response & FAN_INFO)
316 		memcpy(&event->audit_rule, friar, sizeof(*friar));
317 
318 	if (event->state == FAN_EVENT_CANCELED)
319 		destroy = true;
320 	else
321 		event->state = FAN_EVENT_ANSWERED;
322 	spin_unlock(&group->notification_lock);
323 	if (destroy)
324 		fsnotify_destroy_event(group, &event->fae.fse);
325 }
326 
/*
 * Validate a permission-event reply written by userspace and apply it to the
 * event on the group's access_list whose fd matches.
 *
 * @group:           group the reply was written to
 * @response_struct: fixed-size reply header (event fd and response bits)
 * @info:            user pointer to the data following the header
 * @info_len:        number of bytes available at @info
 *
 * Returns the number of info bytes consumed (0 when FAN_INFO is not set),
 * -EINVAL for an invalid reply, -ENOENT when no event on access_list has a
 * matching fd, or the error from process_access_response_info().
 */
static int process_access_response(struct fsnotify_group *group,
				   struct fanotify_response *response_struct,
				   const char __user *info,
				   size_t info_len)
{
	struct fanotify_perm_event *event;
	int fd = response_struct->fd;
	u32 response = response_struct->response;
	int errno = fanotify_get_response_errno(response);
	int ret = info_len;
	struct fanotify_response_info_audit_rule friar;

	pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n",
		 __func__, group, fd, response, errno, info, info_len);
	/*
	 * make sure the response is valid, if invalid we do nothing and either
	 * userspace can send a valid response or we will clean it up after the
	 * timeout
	 */
	if (response & ~FANOTIFY_RESPONSE_VALID_MASK)
		return -EINVAL;

	switch (response & FANOTIFY_RESPONSE_ACCESS) {
	case FAN_ALLOW:
		/* An errno makes no sense together with FAN_ALLOW */
		if (errno)
			return -EINVAL;
		break;
	case FAN_DENY:
		/* Custom errno is supported only for pre-content groups */
		if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT)
			return -EINVAL;

		/*
		 * Limit errno to values expected on open(2)/read(2)/write(2)
		 * of regular files.
		 */
		switch (errno) {
		case 0:
		case EIO:
		case EPERM:
		case EBUSY:
		case ETXTBSY:
		case EAGAIN:
		case ENOSPC:
		case EDQUOT:
			break;
		default:
			return -EINVAL;
		}
		break;
	default:
		return -EINVAL;
	}

	/* FAN_AUDIT is only accepted from groups created with FAN_ENABLE_AUDIT */
	if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
		return -EINVAL;

	if (response & FAN_INFO) {
		ret = process_access_response_info(info, info_len, &friar);
		if (ret < 0)
			return ret;
		/*
		 * With FAN_NOFD the info record is only validated; no event
		 * is looked up.
		 */
		if (fd == FAN_NOFD)
			return ret;
	} else {
		ret = 0;
	}

	if (fd < 0)
		return -EINVAL;

	spin_lock(&group->notification_lock);
	list_for_each_entry(event, &group->fanotify_data.access_list,
			    fae.fse.list) {
		if (event->fd != fd)
			continue;

		list_del_init(&event->fae.fse.list);
		/*
		 * finish_permission_event() drops notification_lock, so return
		 * straight from the loop without unlocking again.  friar is
		 * only read there when FAN_INFO is set, in which case it was
		 * filled in above.
		 */
		finish_permission_event(group, event, response, &friar);
		wake_up(&group->fanotify_data.access_waitq);
		return ret;
	}
	spin_unlock(&group->notification_lock);

	return -ENOENT;
}
412 
413 static size_t copy_mnt_info_to_user(struct fanotify_event *event,
414 				    char __user *buf, int count)
415 {
416 	struct fanotify_event_info_mnt info = { };
417 
418 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT;
419 	info.hdr.len = FANOTIFY_MNT_INFO_LEN;
420 
421 	if (WARN_ON(count < info.hdr.len))
422 		return -EFAULT;
423 
424 	info.mnt_id = FANOTIFY_ME(event)->mnt_id;
425 
426 	if (copy_to_user(buf, &info, sizeof(info)))
427 		return -EFAULT;
428 
429 	return info.hdr.len;
430 }
431 
432 static size_t copy_error_info_to_user(struct fanotify_event *event,
433 				      char __user *buf, int count)
434 {
435 	struct fanotify_event_info_error info = { };
436 	struct fanotify_error_event *fee = FANOTIFY_EE(event);
437 
438 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
439 	info.hdr.len = FANOTIFY_ERROR_INFO_LEN;
440 
441 	if (WARN_ON(count < info.hdr.len))
442 		return -EFAULT;
443 
444 	info.error = fee->error;
445 	info.error_count = fee->err_count;
446 
447 	if (copy_to_user(buf, &info, sizeof(info)))
448 		return -EFAULT;
449 
450 	return info.hdr.len;
451 }
452 
453 static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
454 				 int info_type, const char *name,
455 				 size_t name_len,
456 				 char __user *buf, size_t count)
457 {
458 	struct fanotify_event_info_fid info = { };
459 	struct file_handle handle = { };
460 	unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
461 	size_t fh_len = fh ? fh->len : 0;
462 	size_t info_len = fanotify_fid_info_len(fh_len, name_len);
463 	size_t len = info_len;
464 
465 	pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
466 		 __func__, fh_len, name_len, info_len, count);
467 
468 	if (WARN_ON_ONCE(len < sizeof(info) || len > count))
469 		return -EFAULT;
470 
471 	/*
472 	 * Copy event info fid header followed by variable sized file handle
473 	 * and optionally followed by variable sized filename.
474 	 */
475 	switch (info_type) {
476 	case FAN_EVENT_INFO_TYPE_FID:
477 	case FAN_EVENT_INFO_TYPE_DFID:
478 		if (WARN_ON_ONCE(name_len))
479 			return -EFAULT;
480 		break;
481 	case FAN_EVENT_INFO_TYPE_DFID_NAME:
482 	case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME:
483 	case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME:
484 		if (WARN_ON_ONCE(!name || !name_len))
485 			return -EFAULT;
486 		break;
487 	default:
488 		return -EFAULT;
489 	}
490 
491 	info.hdr.info_type = info_type;
492 	info.hdr.len = len;
493 	info.fsid = *fsid;
494 	if (copy_to_user(buf, &info, sizeof(info)))
495 		return -EFAULT;
496 
497 	buf += sizeof(info);
498 	len -= sizeof(info);
499 	if (WARN_ON_ONCE(len < sizeof(handle)))
500 		return -EFAULT;
501 
502 	handle.handle_type = fh->type;
503 	handle.handle_bytes = fh_len;
504 
505 	/* Mangle handle_type for bad file_handle */
506 	if (!fh_len)
507 		handle.handle_type = FILEID_INVALID;
508 
509 	if (copy_to_user(buf, &handle, sizeof(handle)))
510 		return -EFAULT;
511 
512 	buf += sizeof(handle);
513 	len -= sizeof(handle);
514 	if (WARN_ON_ONCE(len < fh_len))
515 		return -EFAULT;
516 
517 	/*
518 	 * For an inline fh and inline file name, copy through stack to exclude
519 	 * the copy from usercopy hardening protections.
520 	 */
521 	fh_buf = fanotify_fh_buf(fh);
522 	if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
523 		memcpy(bounce, fh_buf, fh_len);
524 		fh_buf = bounce;
525 	}
526 	if (copy_to_user(buf, fh_buf, fh_len))
527 		return -EFAULT;
528 
529 	buf += fh_len;
530 	len -= fh_len;
531 
532 	if (name_len) {
533 		/* Copy the filename with terminating null */
534 		name_len++;
535 		if (WARN_ON_ONCE(len < name_len))
536 			return -EFAULT;
537 
538 		if (copy_to_user(buf, name, name_len))
539 			return -EFAULT;
540 
541 		buf += name_len;
542 		len -= name_len;
543 	}
544 
545 	/* Pad with 0's */
546 	WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
547 	if (len > 0 && clear_user(buf, len))
548 		return -EFAULT;
549 
550 	return info_len;
551 }
552 
553 static int copy_pidfd_info_to_user(int pidfd,
554 				   char __user *buf,
555 				   size_t count)
556 {
557 	struct fanotify_event_info_pidfd info = { };
558 	size_t info_len = FANOTIFY_PIDFD_INFO_LEN;
559 
560 	if (WARN_ON_ONCE(info_len > count))
561 		return -EFAULT;
562 
563 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
564 	info.hdr.len = info_len;
565 	info.pidfd = pidfd;
566 
567 	if (copy_to_user(buf, &info, info_len))
568 		return -EFAULT;
569 
570 	return info_len;
571 }
572 
573 static size_t copy_range_info_to_user(struct fanotify_event *event,
574 				      char __user *buf, int count)
575 {
576 	struct fanotify_perm_event *pevent = FANOTIFY_PERM(event);
577 	struct fanotify_event_info_range info = { };
578 	size_t info_len = FANOTIFY_RANGE_INFO_LEN;
579 
580 	if (WARN_ON_ONCE(info_len > count))
581 		return -EFAULT;
582 
583 	if (WARN_ON_ONCE(!pevent->ppos))
584 		return -EINVAL;
585 
586 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_RANGE;
587 	info.hdr.len = info_len;
588 	info.offset = *(pevent->ppos);
589 	info.count = pevent->count;
590 
591 	if (copy_to_user(buf, &info, info_len))
592 		return -EFAULT;
593 
594 	return info_len;
595 }
596 
/*
 * Copy every info record that applies to @event to userspace, one after the
 * other, advancing @buf and shrinking @count after each record.
 *
 * @event:     event being reported
 * @info:      fanotify_info of the event (dir handles and names)
 * @info_mode: info-mode flags of the group
 * @pidfd:     value to place in the pidfd record when FAN_REPORT_PIDFD is set
 * @buf:       user buffer positioned right after the event metadata
 * @count:     bytes remaining in @buf
 *
 * Returns the total number of bytes written, or the first negative error
 * returned by one of the per-record helpers.
 */
static int copy_info_records_to_user(struct fanotify_event *event,
				     struct fanotify_info *info,
				     unsigned int info_mode, int pidfd,
				     char __user *buf, size_t count)
{
	int ret, total_bytes = 0, info_type = 0;
	unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;

	/*
	 * Event info records order is as follows:
	 * 1. dir fid + name
	 * 2. (optional) new dir fid + new name
	 * 3. (optional) child fid
	 * 4. (optional) pidfd
	 * 5. (optional) error
	 * 6. (optional) access range
	 * 7. (optional) mount id
	 */
	if (fanotify_event_has_dir_fh(event)) {
		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
					     FAN_EVENT_INFO_TYPE_DFID;

		/* FAN_RENAME uses special info types */
		if (event->mask & FAN_RENAME)
			info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME;

		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_info_dir_fh(info),
					    info_type,
					    fanotify_info_name(info),
					    info->name_len, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	/* New dir fid+name may be reported in addition to old dir fid+name */
	if (fanotify_event_has_dir2_fh(event)) {
		info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME;
		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_info_dir2_fh(info),
					    info_type,
					    fanotify_info_name2(info),
					    info->name2_len, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_event_has_object_fh(event)) {
		const char *dot = NULL;
		int dot_len = 0;

		/*
		 * info_type is still 0 here only if no dir record was written
		 * above, so a non-zero value means this is not the first
		 * record.
		 */
		if (fid_mode == FAN_REPORT_FID || info_type) {
			/*
			 * With only group flag FAN_REPORT_FID only type FID is
			 * reported. Second info record type is always FID.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		} else if ((fid_mode & FAN_REPORT_NAME) &&
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_NAME, if name was not
			 * recorded in an event on a directory, report the name
			 * "." with info type DFID_NAME.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
			dot = ".";
			dot_len = 1;
		} else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
			   (event->mask & FAN_ONDIR)) {
			/*
			 * With group flag FAN_REPORT_DIR_FID, a single info
			 * record has type DFID for directory entry modification
			 * event and for event on a directory.
			 */
			info_type = FAN_EVENT_INFO_TYPE_DFID;
		} else {
			/*
			 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
			 * a single info record has type FID for event on a
			 * non-directory, when there is no directory to report.
			 * For example, on FAN_DELETE_SELF event.
			 */
			info_type = FAN_EVENT_INFO_TYPE_FID;
		}

		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
					    fanotify_event_object_fh(event),
					    info_type, dot, dot_len,
					    buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (pidfd_mode) {
		ret = copy_pidfd_info_to_user(pidfd, buf, count);
		if (ret < 0)
			return ret;

		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_is_error_event(event->mask)) {
		ret = copy_error_info_to_user(event, buf, count);
		if (ret < 0)
			return ret;
		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_event_has_access_range(event)) {
		ret = copy_range_info_to_user(event, buf, count);
		if (ret < 0)
			return ret;
		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	if (fanotify_is_mnt_event(event->mask)) {
		ret = copy_mnt_info_to_user(event, buf, count);
		if (ret < 0)
			return ret;
		buf += ret;
		count -= ret;
		total_bytes += ret;
	}

	return total_bytes;
}
738 
/*
 * Report one dequeued event to userspace: fill in and copy the event
 * metadata, then the info records.  An fd for the event's path and a pidfd
 * for its pid are reserved up front when applicable and are installed only
 * after all copies to userspace succeeded.
 *
 * Returns the full event length on success.  Returns 0 when opening the
 * event's file failed with -EOPENSTALE in a group without
 * FAN_REPORT_FD_ERROR, and a negative error otherwise; on those paths no fd
 * is left reserved.
 */
static ssize_t copy_event_to_user(struct fsnotify_group *group,
				  struct fanotify_event *event,
				  char __user *buf, size_t count)
{
	struct fanotify_event_metadata metadata;
	const struct path *path = fanotify_event_path(event);
	struct fanotify_info *info = fanotify_event_info(event);
	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
	struct file *f = NULL, *pidfd_file = NULL;
	int ret, pidfd = -ESRCH, fd = -EBADF;

	pr_debug("%s: group=%p event=%p\n", __func__, group, event);

	metadata.event_len = fanotify_event_len(info_mode, event);
	metadata.metadata_len = FAN_EVENT_METADATA_LEN;
	metadata.vers = FANOTIFY_METADATA_VERSION;
	metadata.reserved = 0;
	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
	metadata.pid = pid_vnr(event->pid);
	/*
	 * For an unprivileged listener, event->pid can be used to identify the
	 * events generated by the listener process itself, without disclosing
	 * the pids of other processes.
	 */
	if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
	    task_tgid(current) != event->pid)
		metadata.pid = 0;

	/*
	 * For now, fid mode is required for an unprivileged listener and
	 * fid mode does not report fd in events.  Keep this check anyway
	 * for safety in case fid mode requirement is relaxed in the future
	 * to allow unprivileged listener to get events with no fd and no fid.
	 */
	if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
	    path && path->mnt && path->dentry) {
		fd = create_fd(group, path, &f);
		/*
		 * Opening an fd from dentry can fail for several reasons.
		 * For example, when tasks are gone and we try to open their
		 * /proc files or we try to open a WRONLY file like in sysfs
		 * or when trying to open a file that was deleted on the
		 * remote network server.
		 *
		 * For a group with FAN_REPORT_FD_ERROR, we will send the
		 * event with the error instead of the open fd, otherwise
		 * Userspace may not get the error at all.
		 * In any case, userspace will not know which file failed to
		 * open, so add a debug print for further investigation.
		 */
		if (fd < 0) {
			pr_debug("fanotify: create_fd(%pd2) failed err=%d\n",
				 path->dentry, fd);
			if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) {
				/*
				 * Historically, we've handled EOPENSTALE in a
				 * special way and silently dropped such
				 * events. Now we have to keep it to maintain
				 * backward compatibility...
				 */
				if (fd == -EOPENSTALE)
					fd = 0;
				return fd;
			}
		}
	}
	/* Without FAN_REPORT_FD_ERROR any negative fd is reported as FAN_NOFD */
	if (FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR))
		metadata.fd = fd;
	else
		metadata.fd = fd >= 0 ? fd : FAN_NOFD;

	if (pidfd_mode) {
		/*
		 * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
		 * exclusion is ever lifted. At the time of incorporating pidfd
		 * support within fanotify, the pidfd API only supported the
		 * creation of pidfds for thread-group leaders.
		 */
		WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));

		/*
		 * The PIDTYPE_TGID check for an event->pid is performed
		 * preemptively in an attempt to catch out cases where the event
		 * listener reads events after the event generating process has
		 * already terminated.  Depending on flag FAN_REPORT_FD_ERROR,
		 * report either -ESRCH or FAN_NOPIDFD to the event listener in
		 * those cases with all other pidfd creation errors reported as
		 * the error code itself or as FAN_EPIDFD.
		 */
		if (metadata.pid && pid_has_task(event->pid, PIDTYPE_TGID))
			pidfd = pidfd_prepare(event->pid, 0, &pidfd_file);

		if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR) && pidfd < 0)
			pidfd = pidfd == -ESRCH ? FAN_NOPIDFD : FAN_EPIDFD;
	}

	ret = -EFAULT;
	/*
	 * Sanity check copy size in case get_one_event() and
	 * event_len sizes ever get out of sync.
	 */
	if (WARN_ON_ONCE(metadata.event_len > count))
		goto out_close_fd;

	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
		goto out_close_fd;

	buf += FAN_EVENT_METADATA_LEN;
	count -= FAN_EVENT_METADATA_LEN;

	ret = copy_info_records_to_user(event, info, info_mode, pidfd,
					buf, count);
	if (ret < 0)
		goto out_close_fd;

	/* Everything was copied; now the reserved fds can be installed. */
	if (f)
		fd_install(fd, f);

	if (pidfd_file)
		fd_install(pidfd, pidfd_file);

	/* Remember the fd so that a later response can be matched to the event */
	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PERM(event)->fd = fd;

	return metadata.event_len;

out_close_fd:
	/* Undo the fd reservations; f/pidfd_file are only set on success */
	if (f) {
		put_unused_fd(fd);
		fput(f);
	}

	if (pidfd_file) {
		put_unused_fd(pidfd);
		fput(pidfd_file);
	}

	return ret;
}
879 
/* fanotify userspace file descriptor functions */
881 static __poll_t fanotify_poll(struct file *file, poll_table *wait)
882 {
883 	struct fsnotify_group *group = file->private_data;
884 	__poll_t ret = 0;
885 
886 	poll_wait(file, &group->notification_waitq, wait);
887 	spin_lock(&group->notification_lock);
888 	if (!fsnotify_notify_queue_is_empty(group))
889 		ret = EPOLLIN | EPOLLRDNORM;
890 	spin_unlock(&group->notification_lock);
891 
892 	return ret;
893 }
894 
/*
 * Read handler: copy as many queued events as fit into the user buffer.
 *
 * Blocks until at least one event is available unless the file is
 * O_NONBLOCK or a signal is pending.  Once something has been copied, the
 * number of bytes written is returned instead of a later error, except that
 * -EFAULT is always returned as is.
 */
static ssize_t fanotify_read(struct file *file, char __user *buf,
			     size_t count, loff_t *pos)
{
	struct fsnotify_group *group;
	struct fanotify_event *event;
	char __user *start;
	int ret;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	start = buf;
	group = file->private_data;

	pr_debug("%s: group=%p\n", __func__, group);

	add_wait_queue(&group->notification_waitq, &wait);
	while (1) {
		/*
		 * User can supply arbitrarily large buffer. Avoid softlockups
		 * in case there are lots of available events.
		 */
		cond_resched();
		event = get_one_event(group, count);
		if (IS_ERR(event)) {
			ret = PTR_ERR(event);
			break;
		}

		if (!event) {
			ret = -EAGAIN;
			if (file->f_flags & O_NONBLOCK)
				break;

			ret = -ERESTARTSYS;
			if (signal_pending(current))
				break;

			/* Something was already copied: return it, don't sleep */
			if (start != buf)
				break;

			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
			continue;
		}

		ret = copy_event_to_user(group, event, buf, count);

		/*
		 * Permission events get queued to wait for response.  Other
		 * events can be destroyed now.
		 */
		if (!fanotify_is_perm_event(event->mask)) {
			fsnotify_destroy_event(group, &event->fse);
		} else {
			if (ret <= 0 || FANOTIFY_PERM(event)->fd < 0) {
				/*
				 * The event was not delivered with a usable
				 * fd, so it is answered here with FAN_DENY.
				 * finish_permission_event() drops the lock
				 * taken below.
				 */
				spin_lock(&group->notification_lock);
				finish_permission_event(group,
					FANOTIFY_PERM(event), FAN_DENY, NULL);
				wake_up(&group->fanotify_data.access_waitq);
			} else {
				spin_lock(&group->notification_lock);
				list_add_tail(&event->fse.list,
					&group->fanotify_data.access_list);
				spin_unlock(&group->notification_lock);
			}
		}
		if (ret < 0)
			break;
		buf += ret;
		count -= ret;
	}
	remove_wait_queue(&group->notification_waitq, &wait);

	/* Partial success wins over a later error, except for -EFAULT */
	if (start != buf && ret != -EFAULT)
		ret = buf - start;
	return ret;
}
970 
971 static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
972 {
973 	struct fanotify_response response;
974 	struct fsnotify_group *group;
975 	int ret;
976 	const char __user *info_buf = buf + sizeof(struct fanotify_response);
977 	size_t info_len;
978 
979 	if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
980 		return -EINVAL;
981 
982 	group = file->private_data;
983 
984 	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
985 
986 	if (count < sizeof(response))
987 		return -EINVAL;
988 
989 	if (copy_from_user(&response, buf, sizeof(response)))
990 		return -EFAULT;
991 
992 	info_len = count - sizeof(response);
993 
994 	ret = process_access_response(group, &response, info_buf, info_len);
995 	if (ret < 0)
996 		count = ret;
997 	else
998 		count = sizeof(response) + ret;
999 
1000 	return count;
1001 }
1002 
/*
 * Release handler for the fanotify fd: stop queueing, answer every pending
 * permission event with FAN_ALLOW, destroy all other queued events and
 * finally destroy the group.  Always returns 0.
 */
static int fanotify_release(struct inode *ignored, struct file *file)
{
	struct fsnotify_group *group = file->private_data;
	struct fsnotify_event *fsn_event;

	/*
	 * Stop new events from arriving in the notification queue. since
	 * userspace cannot use fanotify fd anymore, no event can enter or
	 * leave access_list by now either.
	 */
	fsnotify_group_stop_queueing(group);

	/*
	 * Process all permission events on access_list and notification queue
	 * and simulate reply from userspace.
	 */
	spin_lock(&group->notification_lock);
	while (!list_empty(&group->fanotify_data.access_list)) {
		struct fanotify_perm_event *event;

		event = list_first_entry(&group->fanotify_data.access_list,
				struct fanotify_perm_event, fae.fse.list);
		list_del_init(&event->fae.fse.list);
		/* finish_permission_event() drops the lock; retake it */
		finish_permission_event(group, event, FAN_ALLOW, NULL);
		spin_lock(&group->notification_lock);
	}

	/*
	 * Destroy all non-permission events. For permission events just
	 * dequeue them and set the response. They will be freed once the
	 * response is consumed and fanotify_get_response() returns.
	 */
	while ((fsn_event = fsnotify_remove_first_event(group))) {
		struct fanotify_event *event = FANOTIFY_E(fsn_event);

		/* Both branches leave the lock released; it is retaken below */
		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
			spin_unlock(&group->notification_lock);
			fsnotify_destroy_event(group, fsn_event);
		} else {
			finish_permission_event(group, FANOTIFY_PERM(event),
						FAN_ALLOW, NULL);
		}
		spin_lock(&group->notification_lock);
	}
	spin_unlock(&group->notification_lock);

	/* Response for all permission events is set, wakeup waiters */
	wake_up(&group->fanotify_data.access_waitq);

	/* matches the fanotify_init->fsnotify_alloc_group */
	fsnotify_destroy_group(group);

	return 0;
}
1057 
1058 static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1059 {
1060 	struct fsnotify_group *group;
1061 	struct fsnotify_event *fsn_event;
1062 	void __user *p;
1063 	int ret = -ENOTTY;
1064 	size_t send_len = 0;
1065 
1066 	group = file->private_data;
1067 
1068 	p = (void __user *) arg;
1069 
1070 	switch (cmd) {
1071 	case FIONREAD:
1072 		spin_lock(&group->notification_lock);
1073 		list_for_each_entry(fsn_event, &group->notification_list, list)
1074 			send_len += FAN_EVENT_METADATA_LEN;
1075 		spin_unlock(&group->notification_lock);
1076 		ret = put_user(send_len, (int __user *) p);
1077 		break;
1078 	}
1079 
1080 	return ret;
1081 }
1082 
1083 static const struct file_operations fanotify_fops = {
1084 	.show_fdinfo	= fanotify_show_fdinfo,
1085 	.poll		= fanotify_poll,
1086 	.read		= fanotify_read,
1087 	.write		= fanotify_write,
1088 	.fasync		= NULL,
1089 	.release	= fanotify_release,
1090 	.unlocked_ioctl	= fanotify_ioctl,
1091 	.compat_ioctl	= compat_ptr_ioctl,
1092 	.llseek		= noop_llseek,
1093 };
1094 
1095 static int fanotify_find_path(int dfd, const char __user *filename,
1096 			      struct path *path, unsigned int flags, __u64 mask,
1097 			      unsigned int obj_type)
1098 {
1099 	int ret;
1100 
1101 	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
1102 		 dfd, filename, flags);
1103 
1104 	if (filename == NULL) {
1105 		CLASS(fd, f)(dfd);
1106 
1107 		if (fd_empty(f))
1108 			return -EBADF;
1109 
1110 		if ((flags & FAN_MARK_ONLYDIR) &&
1111 		    !(S_ISDIR(file_inode(fd_file(f))->i_mode)))
1112 			return -ENOTDIR;
1113 
1114 		*path = fd_file(f)->f_path;
1115 		path_get(path);
1116 	} else {
1117 		unsigned int lookup_flags = 0;
1118 
1119 		if (!(flags & FAN_MARK_DONT_FOLLOW))
1120 			lookup_flags |= LOOKUP_FOLLOW;
1121 		if (flags & FAN_MARK_ONLYDIR)
1122 			lookup_flags |= LOOKUP_DIRECTORY;
1123 
1124 		ret = user_path_at(dfd, filename, lookup_flags, path);
1125 		if (ret)
1126 			goto out;
1127 	}
1128 
1129 	/* you can only watch an inode if you have read permissions on it */
1130 	ret = path_permission(path, MAY_READ);
1131 	if (ret) {
1132 		path_put(path);
1133 		goto out;
1134 	}
1135 
1136 	ret = security_path_notify(path, mask, obj_type);
1137 	if (ret)
1138 		path_put(path);
1139 
1140 out:
1141 	return ret;
1142 }
1143 
1144 static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
1145 					    __u32 mask, unsigned int flags,
1146 					    __u32 umask, int *destroy)
1147 {
1148 	__u32 oldmask, newmask;
1149 
1150 	/* umask bits cannot be removed by user */
1151 	mask &= ~umask;
1152 	spin_lock(&fsn_mark->lock);
1153 	oldmask = fsnotify_calc_mask(fsn_mark);
1154 	if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) {
1155 		fsn_mark->mask &= ~mask;
1156 	} else {
1157 		fsn_mark->ignore_mask &= ~mask;
1158 	}
1159 	newmask = fsnotify_calc_mask(fsn_mark);
1160 	/*
1161 	 * We need to keep the mark around even if remaining mask cannot
1162 	 * result in any events (e.g. mask == FAN_ONDIR) to support incremenal
1163 	 * changes to the mask.
1164 	 * Destroy mark when only umask bits remain.
1165 	 */
1166 	*destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask);
1167 	spin_unlock(&fsn_mark->lock);
1168 
1169 	return oldmask & ~newmask;
1170 }
1171 
/*
 * fanotify_remove_mark - remove @mask bits from the group's mark on @obj.
 *
 * Looks up the mark of @group attached to @obj, clears the requested bits
 * via fanotify_mark_remove_from_mask() and recalculates the connector mask
 * if bits it contained were removed.  When only @umask bits remain, the mark
 * is detached under the group lock and freed after the lock is dropped.
 *
 * Returns 0 on success or -ENOENT if @group has no mark on @obj.
 */
static int fanotify_remove_mark(struct fsnotify_group *group,
				void *obj, unsigned int obj_type, __u32 mask,
				unsigned int flags, __u32 umask)
{
	struct fsnotify_mark *fsn_mark = NULL;
	__u32 removed;
	int destroy_mark;

	fsnotify_group_lock(group);
	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
	if (!fsn_mark) {
		fsnotify_group_unlock(group);
		return -ENOENT;
	}

	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
						 umask, &destroy_mark);
	/* Only recalculate when a bit the connector advertises went away */
	if (removed & fsnotify_conn_mask(fsn_mark->connector))
		fsnotify_recalc_mask(fsn_mark->connector);
	if (destroy_mark)
		fsnotify_detach_mark(fsn_mark);
	fsnotify_group_unlock(group);
	/* Freeing is done outside the group lock */
	if (destroy_mark)
		fsnotify_free_mark(fsn_mark);

	/* matches the fsnotify_find_mark() */
	fsnotify_put_mark(fsn_mark);
	return 0;
}
1201 
1202 static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
1203 				       unsigned int fan_flags)
1204 {
1205 	bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE);
1206 	unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS;
1207 	bool recalc = false;
1208 
1209 	/*
1210 	 * When using FAN_MARK_IGNORE for the first time, mark starts using
1211 	 * independent event flags in ignore mask.  After that, trying to
1212 	 * update the ignore mask with the old FAN_MARK_IGNORED_MASK API
1213 	 * will result in EEXIST error.
1214 	 */
1215 	if (ignore == FAN_MARK_IGNORE)
1216 		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS;
1217 
1218 	/*
1219 	 * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
1220 	 * the removal of the FS_MODIFY bit in calculated mask if it was set
1221 	 * because of an ignore mask that is now going to survive FS_MODIFY.
1222 	 */
1223 	if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
1224 	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
1225 		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
1226 		if (!(fsn_mark->mask & FS_MODIFY))
1227 			recalc = true;
1228 	}
1229 
1230 	if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE ||
1231 	    want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
1232 		return recalc;
1233 
1234 	/*
1235 	 * NO_IREF may be removed from a mark, but not added.
1236 	 * When removed, fsnotify_recalc_mask() will take the inode ref.
1237 	 */
1238 	WARN_ON_ONCE(!want_iref);
1239 	fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;
1240 
1241 	return true;
1242 }
1243 
1244 static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
1245 				      __u32 mask, unsigned int fan_flags)
1246 {
1247 	bool recalc;
1248 
1249 	spin_lock(&fsn_mark->lock);
1250 	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS))
1251 		fsn_mark->mask |= mask;
1252 	else
1253 		fsn_mark->ignore_mask |= mask;
1254 
1255 	recalc = fsnotify_calc_mask(fsn_mark) &
1256 		~fsnotify_conn_mask(fsn_mark->connector);
1257 
1258 	recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags);
1259 	spin_unlock(&fsn_mark->lock);
1260 
1261 	return recalc;
1262 }
1263 
/*
 * Filesystem identity of a marked object, as filled in by
 * fanotify_test_fsid() and consumed by fanotify_set_mark_fsid().
 */
struct fan_fsid {
	/* super block of the marked dentry */
	struct super_block *sb;
	/* fsid obtained from vfs_get_fsid() for the marked dentry */
	__kernel_fsid_t id;
	/* true if the fsid is zero or differs from the fsid of the sb root */
	bool weak;
};
1269 
1270 static int fanotify_set_mark_fsid(struct fsnotify_group *group,
1271 				  struct fsnotify_mark *mark,
1272 				  struct fan_fsid *fsid)
1273 {
1274 	struct fsnotify_mark_connector *conn;
1275 	struct fsnotify_mark *old;
1276 	struct super_block *old_sb = NULL;
1277 
1278 	FANOTIFY_MARK(mark)->fsid = fsid->id;
1279 	mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID;
1280 	if (fsid->weak)
1281 		mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID;
1282 
1283 	/* First mark added will determine if group is single or multi fsid */
1284 	if (list_empty(&group->marks_list))
1285 		return 0;
1286 
1287 	/* Find sb of an existing mark */
1288 	list_for_each_entry(old, &group->marks_list, g_list) {
1289 		conn = READ_ONCE(old->connector);
1290 		if (!conn)
1291 			continue;
1292 		old_sb = fsnotify_connector_sb(conn);
1293 		if (old_sb)
1294 			break;
1295 	}
1296 
1297 	/* Only detached marks left? */
1298 	if (!old_sb)
1299 		return 0;
1300 
1301 	/* Do not allow mixing of marks with weak and strong fsid */
1302 	if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID)
1303 		return -EXDEV;
1304 
1305 	/* Allow mixing of marks with strong fsid from different fs */
1306 	if (!fsid->weak)
1307 		return 0;
1308 
1309 	/* Do not allow mixing marks with weak fsid from different fs */
1310 	if (old_sb != fsid->sb)
1311 		return -EXDEV;
1312 
1313 	/* Do not allow mixing marks from different btrfs sub-volumes */
1314 	if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid,
1315 				 &FANOTIFY_MARK(mark)->fsid))
1316 		return -EXDEV;
1317 
1318 	return 0;
1319 }
1320 
1321 static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
1322 						   void *obj,
1323 						   unsigned int obj_type,
1324 						   unsigned int fan_flags,
1325 						   struct fan_fsid *fsid)
1326 {
1327 	struct ucounts *ucounts = group->fanotify_data.ucounts;
1328 	struct fanotify_mark *fan_mark;
1329 	struct fsnotify_mark *mark;
1330 	int ret;
1331 
1332 	/*
1333 	 * Enforce per user marks limits per user in all containing user ns.
1334 	 * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
1335 	 * in the limited groups account.
1336 	 */
1337 	BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_MARKS));
1338 	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
1339 	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
1340 		return ERR_PTR(-ENOSPC);
1341 
1342 	fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
1343 	if (!fan_mark) {
1344 		ret = -ENOMEM;
1345 		goto out_dec_ucounts;
1346 	}
1347 
1348 	mark = &fan_mark->fsn_mark;
1349 	fsnotify_init_mark(mark, group);
1350 	if (fan_flags & FAN_MARK_EVICTABLE)
1351 		mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
1352 
1353 	/* Cache fsid of filesystem containing the marked object */
1354 	if (fsid) {
1355 		ret = fanotify_set_mark_fsid(group, mark, fsid);
1356 		if (ret)
1357 			goto out_put_mark;
1358 	} else {
1359 		fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
1360 	}
1361 
1362 	ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0);
1363 	if (ret)
1364 		goto out_put_mark;
1365 
1366 	return mark;
1367 
1368 out_put_mark:
1369 	fsnotify_put_mark(mark);
1370 out_dec_ucounts:
1371 	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
1372 		dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
1373 	return ERR_PTR(ret);
1374 }
1375 
1376 static int fanotify_group_init_error_pool(struct fsnotify_group *group)
1377 {
1378 	if (mempool_initialized(&group->fanotify_data.error_events_pool))
1379 		return 0;
1380 
1381 	return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
1382 					 FANOTIFY_DEFAULT_FEE_POOL_SIZE,
1383 					 sizeof(struct fanotify_error_event));
1384 }
1385 
1386 static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
1387 					     __u32 mask, unsigned int fan_flags)
1388 {
1389 	/*
1390 	 * Non evictable mark cannot be downgraded to evictable mark.
1391 	 */
1392 	if (fan_flags & FAN_MARK_EVICTABLE &&
1393 	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
1394 		return -EEXIST;
1395 
1396 	/*
1397 	 * New ignore mask semantics cannot be downgraded to old semantics.
1398 	 */
1399 	if (fan_flags & FAN_MARK_IGNORED_MASK &&
1400 	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
1401 		return -EEXIST;
1402 
1403 	/*
1404 	 * An ignore mask that survives modify could never be downgraded to not
1405 	 * survive modify.  With new FAN_MARK_IGNORE semantics we make that rule
1406 	 * explicit and return an error when trying to update the ignore mask
1407 	 * without the original FAN_MARK_IGNORED_SURV_MODIFY value.
1408 	 */
1409 	if (fan_flags & FAN_MARK_IGNORE &&
1410 	    !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
1411 	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
1412 		return -EEXIST;
1413 
1414 	/* For now pre-content events are not generated for directories */
1415 	mask |= fsn_mark->mask;
1416 	if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR)
1417 		return -EEXIST;
1418 
1419 	return 0;
1420 }
1421 
1422 static int fanotify_add_mark(struct fsnotify_group *group,
1423 			     void *obj, unsigned int obj_type,
1424 			     __u32 mask, unsigned int fan_flags,
1425 			     struct fan_fsid *fsid)
1426 {
1427 	struct fsnotify_mark *fsn_mark;
1428 	bool recalc;
1429 	int ret = 0;
1430 
1431 	fsnotify_group_lock(group);
1432 	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
1433 	if (!fsn_mark) {
1434 		fsn_mark = fanotify_add_new_mark(group, obj, obj_type,
1435 						 fan_flags, fsid);
1436 		if (IS_ERR(fsn_mark)) {
1437 			fsnotify_group_unlock(group);
1438 			return PTR_ERR(fsn_mark);
1439 		}
1440 	}
1441 
1442 	/*
1443 	 * Check if requested mark flags conflict with an existing mark flags.
1444 	 */
1445 	ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags);
1446 	if (ret)
1447 		goto out;
1448 
1449 	/*
1450 	 * Error events are pre-allocated per group, only if strictly
1451 	 * needed (i.e. FAN_FS_ERROR was requested).
1452 	 */
1453 	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) &&
1454 	    (mask & FAN_FS_ERROR)) {
1455 		ret = fanotify_group_init_error_pool(group);
1456 		if (ret)
1457 			goto out;
1458 	}
1459 
1460 	recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags);
1461 	if (recalc)
1462 		fsnotify_recalc_mask(fsn_mark->connector);
1463 
1464 out:
1465 	fsnotify_group_unlock(group);
1466 
1467 	fsnotify_put_mark(fsn_mark);
1468 	return ret;
1469 }
1470 
1471 static struct fsnotify_event *fanotify_alloc_overflow_event(void)
1472 {
1473 	struct fanotify_event *oevent;
1474 
1475 	oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
1476 	if (!oevent)
1477 		return NULL;
1478 
1479 	fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
1480 	oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;
1481 
1482 	return &oevent->fse;
1483 }
1484 
1485 static struct hlist_head *fanotify_alloc_merge_hash(void)
1486 {
1487 	struct hlist_head *hash;
1488 
1489 	hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
1490 		       GFP_KERNEL_ACCOUNT);
1491 	if (!hash)
1492 		return NULL;
1493 
1494 	__hash_init(hash, FANOTIFY_HTABLE_SIZE);
1495 
1496 	return hash;
1497 }
1498 
1499 /* fanotify syscalls */
/*
 * fanotify_init() - create an fanotify group and return an fd for it.
 *
 * @flags:         FAN_* init flags: notification class, FAN_REPORT_* mode,
 *                 FAN_CLOEXEC, FAN_NONBLOCK, FAN_UNLIMITED_*, ...
 * @event_f_flags: open flags, validated here and stored in the group as
 *                 fanotify_data.f_flags
 *
 * Returns the new file descriptor on success.  On failure returns -EPERM,
 * -EINVAL, -EMFILE, -ENOMEM or the error reported by fsnotify_alloc_group(),
 * get_unused_fd_flags() or anon_inode_getfile_fmode().
 */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
	struct user_namespace *user_ns = current_user_ns();
	struct fsnotify_group *group;
	int f_flags, fd;
	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
	unsigned int class = flags & FANOTIFY_CLASS_BITS;
	unsigned int internal_flags = 0;
	struct file *file;

	pr_debug("%s: flags=%x event_f_flags=%x\n",
		 __func__, flags, event_f_flags);

	if (!capable(CAP_SYS_ADMIN)) {
		/*
		 * An unprivileged user can setup an fanotify group with
		 * limited functionality - an unprivileged group is limited to
		 * notification events with file handles or mount ids and it
		 * cannot use unlimited queue/marks.
		 */
		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) ||
		    !(flags & (FANOTIFY_FID_BITS | FAN_REPORT_MNT)))
			return -EPERM;

		/*
		 * Setting the internal flag FANOTIFY_UNPRIV on the group
		 * prevents setting mount/filesystem marks on this group and
		 * prevents reporting pid and open fd in events.
		 */
		internal_flags |= FANOTIFY_UNPRIV;
	}

	/* FAN_ENABLE_AUDIT is only a valid flag with CONFIG_AUDITSYSCALL */
#ifdef CONFIG_AUDITSYSCALL
	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
#else
	if (flags & ~FANOTIFY_INIT_FLAGS)
#endif
		return -EINVAL;

	/*
	 * A pidfd can only be returned for a thread-group leader; thus
	 * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
	 * exclusive.
	 */
	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
		return -EINVAL;

	/* Don't allow mixing mnt events with inode events for now */
	if (flags & FAN_REPORT_MNT) {
		if (class != FAN_CLASS_NOTIF)
			return -EINVAL;
		if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR))
			return -EINVAL;
	}

	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
		return -EINVAL;

	/* The access mode must be one of the three regular open modes */
	switch (event_f_flags & O_ACCMODE) {
	case O_RDONLY:
	case O_RDWR:
	case O_WRONLY:
		break;
	default:
		return -EINVAL;
	}

	/* Reporting fids is only supported for the notification class */
	if (fid_mode && class != FAN_CLASS_NOTIF)
		return -EINVAL;

	/*
	 * Child name is reported with parent fid so requires dir fid.
	 * We can report both child fid and dir fid with or without name.
	 */
	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
		return -EINVAL;

	/*
	 * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID
	 * and is used as an indication to report both dir and child fid on all
	 * dirent events.
	 */
	if ((fid_mode & FAN_REPORT_TARGET_FID) &&
	    (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
		return -EINVAL;

	/* Open flags of the group fd itself (not of the event fds) */
	f_flags = O_RDWR;
	if (flags & FAN_CLOEXEC)
		f_flags |= O_CLOEXEC;
	if (flags & FAN_NONBLOCK)
		f_flags |= O_NONBLOCK;

	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
	group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
				     FSNOTIFY_GROUP_USER);
	if (IS_ERR(group)) {
		return PTR_ERR(group);
	}

	/* Enforce groups limits per user in all containing user ns */
	group->fanotify_data.ucounts = inc_ucount(user_ns, current_euid(),
						  UCOUNT_FANOTIFY_GROUPS);
	if (!group->fanotify_data.ucounts) {
		fd = -EMFILE;
		goto out_destroy_group;
	}

	group->fanotify_data.flags = flags | internal_flags;
	group->memcg = get_mem_cgroup_from_mm(current->mm);
	group->user_ns = get_user_ns(user_ns);

	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
	if (!group->fanotify_data.merge_hash) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	group->overflow_event = fanotify_alloc_overflow_event();
	if (unlikely(!group->overflow_event)) {
		fd = -ENOMEM;
		goto out_destroy_group;
	}

	if (force_o_largefile())
		event_f_flags |= O_LARGEFILE;
	group->fanotify_data.f_flags = event_f_flags;
	init_waitqueue_head(&group->fanotify_data.access_waitq);
	INIT_LIST_HEAD(&group->fanotify_data.access_list);
	/* Map the notification class to the group priority */
	switch (class) {
	case FAN_CLASS_NOTIF:
		group->priority = FSNOTIFY_PRIO_NORMAL;
		break;
	case FAN_CLASS_CONTENT:
		group->priority = FSNOTIFY_PRIO_CONTENT;
		break;
	case FAN_CLASS_PRE_CONTENT:
		group->priority = FSNOTIFY_PRIO_PRE_CONTENT;
		break;
	default:
		fd = -EINVAL;
		goto out_destroy_group;
	}

	BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_QUEUE));
	if (flags & FAN_UNLIMITED_QUEUE) {
		group->max_events = UINT_MAX;
	} else {
		group->max_events = fanotify_max_queued_events;
	}

	/* Auditing additionally requires CAP_AUDIT_WRITE */
	if (flags & FAN_ENABLE_AUDIT) {
		fd = -EPERM;
		if (!capable(CAP_AUDIT_WRITE))
			goto out_destroy_group;
	}

	fd = get_unused_fd_flags(f_flags);
	if (fd < 0)
		goto out_destroy_group;

	file = anon_inode_getfile_fmode("[fanotify]", &fanotify_fops, group,
					f_flags, FMODE_NONOTIFY);
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		fd = PTR_ERR(file);
		goto out_destroy_group;
	}
	fd_install(fd, file);
	return fd;

out_destroy_group:
	/* fd holds the negative errno on every path that jumps here */
	fsnotify_destroy_group(group);
	return fd;
}
1674 
1675 static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags,
1676 			      struct fan_fsid *fsid)
1677 {
1678 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1679 	__kernel_fsid_t root_fsid;
1680 	int err;
1681 
1682 	/*
1683 	 * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
1684 	 */
1685 	err = vfs_get_fsid(dentry, &fsid->id);
1686 	if (err)
1687 		return err;
1688 
1689 	fsid->sb = dentry->d_sb;
1690 	if (!fsid->id.val[0] && !fsid->id.val[1]) {
1691 		err = -ENODEV;
1692 		goto weak;
1693 	}
1694 
1695 	/*
1696 	 * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
1697 	 * which uses a different fsid than sb root.
1698 	 */
1699 	err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
1700 	if (err)
1701 		return err;
1702 
1703 	if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) {
1704 		err = -EXDEV;
1705 		goto weak;
1706 	}
1707 
1708 	fsid->weak = false;
1709 	return 0;
1710 
1711 weak:
1712 	/* Allow weak fsid when marking inodes */
1713 	fsid->weak = true;
1714 	return (mark_type == FAN_MARK_INODE) ? 0 : err;
1715 }
1716 
1717 /* Check if filesystem can encode a unique fid */
1718 static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
1719 {
1720 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1721 	const struct export_operations *nop = dentry->d_sb->s_export_op;
1722 
1723 	/*
1724 	 * We need to make sure that the filesystem supports encoding of
1725 	 * file handles so user can use name_to_handle_at() to compare fids
1726 	 * reported with events to the file handle of watched objects.
1727 	 */
1728 	if (!exportfs_can_encode_fid(nop))
1729 		return -EOPNOTSUPP;
1730 
1731 	/*
1732 	 * For sb/mount mark, we also need to make sure that the filesystem
1733 	 * supports decoding file handles, so user has a way to map back the
1734 	 * reported fids to filesystem objects.
1735 	 */
1736 	if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop))
1737 		return -EOPNOTSUPP;
1738 
1739 	return 0;
1740 }
1741 
1742 static int fanotify_events_supported(struct fsnotify_group *group,
1743 				     const struct path *path, __u64 mask,
1744 				     unsigned int flags)
1745 {
1746 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1747 	bool is_dir = d_is_dir(path->dentry);
1748 	/* Strict validation of events in non-dir inode mask with v5.17+ APIs */
1749 	bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
1750 				 (mask & FAN_RENAME) ||
1751 				 (flags & FAN_MARK_IGNORE);
1752 
1753 	/*
1754 	 * Filesystems need to opt-into pre-content evnets (a.k.a HSM)
1755 	 * and they are only supported on regular files and directories.
1756 	 */
1757 	if (mask & FANOTIFY_PRE_CONTENT_EVENTS) {
1758 		if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM))
1759 			return -EOPNOTSUPP;
1760 		if (!is_dir && !d_is_reg(path->dentry))
1761 			return -EINVAL;
1762 	}
1763 
1764 	/*
1765 	 * Some filesystems such as 'proc' acquire unusual locks when opening
1766 	 * files. For them fanotify permission events have high chances of
1767 	 * deadlocking the system - open done when reporting fanotify event
1768 	 * blocks on this "unusual" lock while another process holding the lock
1769 	 * waits for fanotify permission event to be answered. Just disallow
1770 	 * permission events for such filesystems.
1771 	 */
1772 	if (mask & FANOTIFY_PERM_EVENTS &&
1773 	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
1774 		return -EINVAL;
1775 
1776 	/*
1777 	 * mount and sb marks are not allowed on kernel internal pseudo fs,
1778 	 * like pipe_mnt, because that would subscribe to events on all the
1779 	 * anonynous pipes in the system.
1780 	 *
1781 	 * SB_NOUSER covers all of the internal pseudo fs whose objects are not
1782 	 * exposed to user's mount namespace, but there are other SB_KERNMOUNT
1783 	 * fs, like nsfs, debugfs, for which the value of allowing sb and mount
1784 	 * mark is questionable. For now we leave them alone.
1785 	 */
1786 	if (mark_type != FAN_MARK_INODE &&
1787 	    path->mnt->mnt_sb->s_flags & SB_NOUSER)
1788 		return -EINVAL;
1789 
1790 	/*
1791 	 * We shouldn't have allowed setting dirent events and the directory
1792 	 * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode,
1793 	 * but because we always allowed it, error only when using new APIs.
1794 	 */
1795 	if (strict_dir_events && mark_type == FAN_MARK_INODE &&
1796 	    !is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
1797 		return -ENOTDIR;
1798 
1799 	return 0;
1800 }
1801 
1802 static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
1803 			    int dfd, const char  __user *pathname)
1804 {
1805 	struct inode *inode = NULL;
1806 	struct fsnotify_group *group;
1807 	struct path path;
1808 	struct fan_fsid __fsid, *fsid = NULL;
1809 	struct user_namespace *user_ns = NULL;
1810 	struct mnt_namespace *mntns;
1811 	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
1812 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1813 	unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
1814 	unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
1815 	unsigned int obj_type, fid_mode;
1816 	void *obj = NULL;
1817 	u32 umask = 0;
1818 	int ret;
1819 
1820 	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
1821 		 __func__, fanotify_fd, flags, dfd, pathname, mask);
1822 
1823 	/* we only use the lower 32 bits as of right now. */
1824 	if (upper_32_bits(mask))
1825 		return -EINVAL;
1826 
1827 	if (flags & ~FANOTIFY_MARK_FLAGS)
1828 		return -EINVAL;
1829 
1830 	switch (mark_type) {
1831 	case FAN_MARK_INODE:
1832 		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
1833 		break;
1834 	case FAN_MARK_MOUNT:
1835 		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
1836 		break;
1837 	case FAN_MARK_FILESYSTEM:
1838 		obj_type = FSNOTIFY_OBJ_TYPE_SB;
1839 		break;
1840 	case FAN_MARK_MNTNS:
1841 		obj_type = FSNOTIFY_OBJ_TYPE_MNTNS;
1842 		break;
1843 	default:
1844 		return -EINVAL;
1845 	}
1846 
1847 	switch (mark_cmd) {
1848 	case FAN_MARK_ADD:
1849 	case FAN_MARK_REMOVE:
1850 		if (!mask)
1851 			return -EINVAL;
1852 		break;
1853 	case FAN_MARK_FLUSH:
1854 		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
1855 			return -EINVAL;
1856 		break;
1857 	default:
1858 		return -EINVAL;
1859 	}
1860 
1861 	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
1862 		valid_mask |= FANOTIFY_PERM_EVENTS;
1863 
1864 	if (mask & ~valid_mask)
1865 		return -EINVAL;
1866 
1867 
1868 	/* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */
1869 	if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK))
1870 		return -EINVAL;
1871 
1872 	/*
1873 	 * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with
1874 	 * FAN_MARK_IGNORED_MASK.
1875 	 */
1876 	if (ignore == FAN_MARK_IGNORED_MASK) {
1877 		mask &= ~FANOTIFY_EVENT_FLAGS;
1878 		umask = FANOTIFY_EVENT_FLAGS;
1879 	}
1880 
1881 	CLASS(fd, f)(fanotify_fd);
1882 	if (fd_empty(f))
1883 		return -EBADF;
1884 
1885 	/* verify that this is indeed an fanotify instance */
1886 	if (unlikely(fd_file(f)->f_op != &fanotify_fops))
1887 		return -EINVAL;
1888 	group = fd_file(f)->private_data;
1889 
1890 	/* Only report mount events on mnt namespace */
1891 	if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) {
1892 		if (mask & ~FANOTIFY_MOUNT_EVENTS)
1893 			return -EINVAL;
1894 		if (mark_type != FAN_MARK_MNTNS)
1895 			return -EINVAL;
1896 	} else {
1897 		if (mask & FANOTIFY_MOUNT_EVENTS)
1898 			return -EINVAL;
1899 		if (mark_type == FAN_MARK_MNTNS)
1900 			return -EINVAL;
1901 	}
1902 
1903 	/*
1904 	 * A user is allowed to setup sb/mount/mntns marks only if it is
1905 	 * capable in the user ns where the group was created.
1906 	 */
1907 	if (!ns_capable(group->user_ns, CAP_SYS_ADMIN) &&
1908 	    mark_type != FAN_MARK_INODE)
1909 		return -EPERM;
1910 
1911 	/*
1912 	 * Permission events are not allowed for FAN_CLASS_NOTIF.
1913 	 * Pre-content permission events are not allowed for FAN_CLASS_CONTENT.
1914 	 */
1915 	if (mask & FANOTIFY_PERM_EVENTS &&
1916 	    group->priority == FSNOTIFY_PRIO_NORMAL)
1917 		return -EINVAL;
1918 	else if (mask & FANOTIFY_PRE_CONTENT_EVENTS &&
1919 		 group->priority == FSNOTIFY_PRIO_CONTENT)
1920 		return -EINVAL;
1921 
1922 	if (mask & FAN_FS_ERROR &&
1923 	    mark_type != FAN_MARK_FILESYSTEM)
1924 		return -EINVAL;
1925 
1926 	/*
1927 	 * Evictable is only relevant for inode marks, because only inode object
1928 	 * can be evicted on memory pressure.
1929 	 */
1930 	if (flags & FAN_MARK_EVICTABLE &&
1931 	     mark_type != FAN_MARK_INODE)
1932 		return -EINVAL;
1933 
1934 	/*
1935 	 * Events that do not carry enough information to report
1936 	 * event->fd require a group that supports reporting fid.  Those
1937 	 * events are not supported on a mount mark, because they do not
1938 	 * carry enough information (i.e. path) to be filtered by mount
1939 	 * point.
1940 	 */
1941 	fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
1942 	if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) &&
1943 	    (!fid_mode || mark_type == FAN_MARK_MOUNT))
1944 		return -EINVAL;
1945 
1946 	/*
1947 	 * FAN_RENAME uses special info type records to report the old and
1948 	 * new parent+name.  Reporting only old and new parent id is less
1949 	 * useful and was not implemented.
1950 	 */
1951 	if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
1952 		return -EINVAL;
1953 
1954 	/* Pre-content events are not currently generated for directories. */
1955 	if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR)
1956 		return -EINVAL;
1957 
1958 	if (mark_cmd == FAN_MARK_FLUSH) {
1959 		fsnotify_clear_marks_by_group(group, obj_type);
1960 		return 0;
1961 	}
1962 
1963 	ret = fanotify_find_path(dfd, pathname, &path, flags,
1964 			(mask & ALL_FSNOTIFY_EVENTS), obj_type);
1965 	if (ret)
1966 		return ret;
1967 
1968 	if (mark_cmd == FAN_MARK_ADD) {
1969 		ret = fanotify_events_supported(group, &path, mask, flags);
1970 		if (ret)
1971 			goto path_put_and_out;
1972 	}
1973 
1974 	if (fid_mode) {
1975 		ret = fanotify_test_fsid(path.dentry, flags, &__fsid);
1976 		if (ret)
1977 			goto path_put_and_out;
1978 
1979 		ret = fanotify_test_fid(path.dentry, flags);
1980 		if (ret)
1981 			goto path_put_and_out;
1982 
1983 		fsid = &__fsid;
1984 	}
1985 
1986 	/*
1987 	 * In addition to being capable in the user ns where group was created,
1988 	 * the user also needs to be capable in the user ns associated with
1989 	 * the filesystem or in the user ns associated with the mntns
1990 	 * (when marking mntns).
1991 	 */
1992 	if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
1993 		inode = path.dentry->d_inode;
1994 		obj = inode;
1995 	} else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
1996 		user_ns = path.mnt->mnt_sb->s_user_ns;
1997 		obj = path.mnt;
1998 	} else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) {
1999 		user_ns = path.mnt->mnt_sb->s_user_ns;
2000 		obj = path.mnt->mnt_sb;
2001 	} else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
2002 		mntns = mnt_ns_from_dentry(path.dentry);
2003 		user_ns = mntns->user_ns;
2004 		obj = mntns;
2005 	}
2006 
2007 	ret = -EPERM;
2008 	if (user_ns && !ns_capable(user_ns, CAP_SYS_ADMIN))
2009 		goto path_put_and_out;
2010 
2011 	ret = -EINVAL;
2012 	if (!obj)
2013 		goto path_put_and_out;
2014 
2015 	/*
2016 	 * If some other task has this inode open for write we should not add
2017 	 * an ignore mask, unless that ignore mask is supposed to survive
2018 	 * modification changes anyway.
2019 	 */
2020 	if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) &&
2021 	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) {
2022 		ret = !inode ? -EINVAL : -EISDIR;
2023 		/* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
2024 		if (ignore == FAN_MARK_IGNORE &&
2025 		    (!inode || S_ISDIR(inode->i_mode)))
2026 			goto path_put_and_out;
2027 
2028 		ret = 0;
2029 		if (inode && inode_is_open_for_write(inode))
2030 			goto path_put_and_out;
2031 	}
2032 
2033 	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
2034 	if (!inode || !S_ISDIR(inode->i_mode)) {
2035 		mask &= ~FAN_EVENT_ON_CHILD;
2036 		umask = FAN_EVENT_ON_CHILD;
2037 		/*
2038 		 * If group needs to report parent fid, register for getting
2039 		 * events with parent/name info for non-directory.
2040 		 */
2041 		if ((fid_mode & FAN_REPORT_DIR_FID) &&
2042 		    (flags & FAN_MARK_ADD) && !ignore)
2043 			mask |= FAN_EVENT_ON_CHILD;
2044 	}
2045 
2046 	/* create/update an inode mark */
2047 	switch (mark_cmd) {
2048 	case FAN_MARK_ADD:
2049 		ret = fanotify_add_mark(group, obj, obj_type, mask, flags,
2050 					fsid);
2051 		break;
2052 	case FAN_MARK_REMOVE:
2053 		ret = fanotify_remove_mark(group, obj, obj_type, mask, flags,
2054 					   umask);
2055 		break;
2056 	default:
2057 		ret = -EINVAL;
2058 	}
2059 
2060 path_put_and_out:
2061 	path_put(&path);
2062 	return ret;
2063 }
2064 
#ifndef CONFIG_ARCH_SPLIT_ARG64
/*
 * fanotify_mark() syscall entry point taking the 64-bit mask as a single
 * __u64 argument.  Only built when CONFIG_ARCH_SPLIT_ARG64 is not set.
 * All five arguments are forwarded unchanged to do_fanotify_mark(), whose
 * return value becomes the syscall result.
 */
SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
			      __u64, mask, int, dfd,
			      const char  __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
}
#endif
2073 
#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
/*
 * Six-argument fanotify_mark() entry point, built when either
 * CONFIG_ARCH_SPLIT_ARG64 or CONFIG_COMPAT is defined.  The mask is declared
 * through SC_ARG64() and turned back into a single __u64 with SC_VAL64()
 * (presumably it arrives as two 32-bit halves - see the macro definitions)
 * before everything is handed to do_fanotify_mark().
 */
SYSCALL32_DEFINE6(fanotify_mark,
				int, fanotify_fd, unsigned int, flags,
				SC_ARG64(mask), int, dfd,
				const char  __user *, pathname)
{
	return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask),
				dfd, pathname);
}
#endif
2084 
2085 /*
2086  * fanotify_user_setup - Our initialization function.  Note that we cannot return
2087  * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
2088  * must result in panic().
2089  */
2090 static int __init fanotify_user_setup(void)
2091 {
2092 	struct sysinfo si;
2093 	int max_marks;
2094 
2095 	si_meminfo(&si);
2096 	/*
2097 	 * Allow up to 1% of addressable memory to be accounted for per user
2098 	 * marks limited to the range [8192, 1048576]. mount and sb marks are
2099 	 * a lot cheaper than inode marks, but there is no reason for a user
2100 	 * to have many of those, so calculate by the cost of inode marks.
2101 	 */
2102 	max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
2103 		    INODE_MARK_COST;
2104 	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
2105 				     FANOTIFY_DEFAULT_MAX_USER_MARKS);
2106 
2107 	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
2108 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14);
2109 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
2110 
2111 	fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
2112 					 SLAB_PANIC|SLAB_ACCOUNT);
2113 	fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
2114 					       SLAB_PANIC);
2115 	fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
2116 						SLAB_PANIC);
2117 	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
2118 		fanotify_perm_event_cachep =
2119 			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
2120 	}
2121 	fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC);
2122 
2123 	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
2124 	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
2125 					FANOTIFY_DEFAULT_MAX_GROUPS;
2126 	init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
2127 	fanotify_sysctls_init();
2128 
2129 	return 0;
2130 }
2131 device_initcall(fanotify_user_setup);
2132