1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * User interface for Resource Allocation in Resource Director Technology (RDT)
4  *
5  * Copyright (C) 2016 Intel Corporation
6  *
7  * Author: Fenghua Yu <fenghua.yu@intel.com>
8  *
9  * More information about RDT can be found in the Intel (R) x86 Architecture
10  * Software Developer Manual.
11  */
12 
13 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
14 
15 #include <linux/cpu.h>
16 #include <linux/debugfs.h>
17 #include <linux/fs.h>
18 #include <linux/fs_parser.h>
19 #include <linux/sysfs.h>
20 #include <linux/kernfs.h>
21 #include <linux/resctrl.h>
22 #include <linux/seq_buf.h>
23 #include <linux/seq_file.h>
24 #include <linux/sched/task.h>
25 #include <linux/slab.h>
26 #include <linux/user_namespace.h>
27 
28 #include <uapi/linux/magic.h>
29 
30 #include "internal.h"
31 
32 /* Mutex to protect rdtgroup access. */
33 DEFINE_MUTEX(rdtgroup_mutex);
34 
35 static struct kernfs_root *rdt_root;
36 
37 struct rdtgroup rdtgroup_default;
38 
39 LIST_HEAD(rdt_all_groups);
40 
41 /* list of entries for the schemata file */
42 LIST_HEAD(resctrl_schema_all);
43 
44 /*
45  * List of struct mon_data containing private data of event files for use by
46  * rdtgroup_mondata_show(). Protected by rdtgroup_mutex.
47  */
48 static LIST_HEAD(mon_data_kn_priv_list);
49 
50 /* The filesystem can only be mounted once. */
51 bool resctrl_mounted;
52 
53 /* Kernel fs node for "info" directory under root */
54 static struct kernfs_node *kn_info;
55 
56 /* Kernel fs node for "mon_groups" directory under root */
57 static struct kernfs_node *kn_mongrp;
58 
59 /* Kernel fs node for "mon_data" directory under root */
60 static struct kernfs_node *kn_mondata;
61 
62 /*
63  * Used to store the max resource name width to display the schemata names in
64  * a tabular format.
65  */
66 int max_name_width;
67 
68 static struct seq_buf last_cmd_status;
69 
70 static char last_cmd_status_buf[512];
71 
72 static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
73 
74 static void rdtgroup_destroy_root(void);
75 
76 struct dentry *debugfs_resctrl;
77 
78 /*
79  * Memory bandwidth monitoring event to use for the default CTRL_MON group
80  * and each new CTRL_MON group created by the user.  Only relevant when
81  * the filesystem is mounted with the "mba_MBps" option so it does not
82  * matter that it remains uninitialized on systems that do not support
83  * the "mba_MBps" option.
84  */
85 enum resctrl_event_id mba_mbps_default_event;
86 
87 static bool resctrl_debug;
88 
89 void rdt_last_cmd_clear(void)
90 {
91 	lockdep_assert_held(&rdtgroup_mutex);
92 	seq_buf_clear(&last_cmd_status);
93 }
94 
95 void rdt_last_cmd_puts(const char *s)
96 {
97 	lockdep_assert_held(&rdtgroup_mutex);
98 	seq_buf_puts(&last_cmd_status, s);
99 }
100 
101 void rdt_last_cmd_printf(const char *fmt, ...)
102 {
103 	va_list ap;
104 
105 	va_start(ap, fmt);
106 	lockdep_assert_held(&rdtgroup_mutex);
107 	seq_buf_vprintf(&last_cmd_status, fmt, ap);
108 	va_end(ap);
109 }
110 
111 void rdt_staged_configs_clear(void)
112 {
113 	struct rdt_ctrl_domain *dom;
114 	struct rdt_resource *r;
115 
116 	lockdep_assert_held(&rdtgroup_mutex);
117 
118 	for_each_alloc_capable_rdt_resource(r) {
119 		list_for_each_entry(dom, &r->ctrl_domains, hdr.list)
120 			memset(dom->staged_config, 0, sizeof(dom->staged_config));
121 	}
122 }
123 
124 static bool resctrl_is_mbm_enabled(void)
125 {
126 	return (resctrl_arch_is_mbm_total_enabled() ||
127 		resctrl_arch_is_mbm_local_enabled());
128 }
129 
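/* True if @e is one of the L3 MBM event IDs (total or local bandwidth). */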
130 static bool resctrl_is_mbm_event(int e)
131 {
132 	return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
133 		e <= QOS_L3_MBM_LOCAL_EVENT_ID);
134 }
135 
136 /*
137  * Trivial allocator for CLOSIDs. Use BITMAP APIs to manipulate a bitmap
138  * of free CLOSIDs.
139  *
140  * Using a global CLOSID across all resources has some advantages and
141  * some drawbacks:
142  * + We can simply set current's closid to assign a task to a resource
143  *   group.
144  * + Context switch code can avoid extra memory references deciding which
145  *   CLOSID to load into the PQR_ASSOC MSR
146  * - We give up some options in configuring resource groups across multi-socket
147  *   systems.
148  * - Our choices on how to configure each resource become progressively more
149  *   limited as the number of resources grows.
150  */
151 static unsigned long *closid_free_map;
152 
153 static int closid_free_map_len;
154 
155 int closids_supported(void)
156 {
157 	return closid_free_map_len;
158 }
159 
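/*
 * closid_init() - build the bitmap of free CLOSIDs.
 *
 * A CLOSID must be usable with every alloc capable resource, so the number
 * of available CLOSIDs is the minimum "num_closid" across all schemata.
 * RESCTRL_RESERVED_CLOSID is set aside for the default resource group.
 */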
160 static int closid_init(void)
161 {
162 	struct resctrl_schema *s;
163 	u32 rdt_min_closid = ~0;
164 
165 	/* Monitor-only platforms still call closid_init() */
166 	if (list_empty(&resctrl_schema_all))
167 		return 0;
168 
169 	/* Compute rdt_min_closid across all resources */
170 	list_for_each_entry(s, &resctrl_schema_all, list)
171 		rdt_min_closid = min(rdt_min_closid, s->num_closid);
172 
173 	closid_free_map = bitmap_alloc(rdt_min_closid, GFP_KERNEL);
174 	if (!closid_free_map)
175 		return -ENOMEM;
176 	bitmap_fill(closid_free_map, rdt_min_closid);
177 
178 	/* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
179 	__clear_bit(RESCTRL_RESERVED_CLOSID, closid_free_map);
180 	closid_free_map_len = rdt_min_closid;
181 
182 	return 0;
183 }
184 
185 static void closid_exit(void)
186 {
187 	bitmap_free(closid_free_map);
188 	closid_free_map = NULL;
189 }
190 
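/*
 * closid_alloc() - pick a free CLOSID.
 *
 * When RMIDs depend on the CLOSID and LLC occupancy monitoring is enabled,
 * ask the monitoring code for the "cleanest" free CLOSID (the one with the
 * least lingering cache occupancy), otherwise take the first free bit.
 * Returns the CLOSID on success or a negative error code (-ENOSPC when
 * none are free).
 */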
191 static int closid_alloc(void)
192 {
193 	int cleanest_closid;
194 	u32 closid;
195 
196 	lockdep_assert_held(&rdtgroup_mutex);
197 
198 	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
199 	    resctrl_arch_is_llc_occupancy_enabled()) {
200 		cleanest_closid = resctrl_find_cleanest_closid();
201 		if (cleanest_closid < 0)
202 			return cleanest_closid;
203 		closid = cleanest_closid;
204 	} else {
205 		closid = find_first_bit(closid_free_map, closid_free_map_len);
206 		if (closid == closid_free_map_len)
207 			return -ENOSPC;
208 	}
209 	__clear_bit(closid, closid_free_map);
210 
211 	return closid;
212 }
213 
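/* closid_free() - return @closid to the pool of free CLOSIDs. */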
214 void closid_free(int closid)
215 {
216 	lockdep_assert_held(&rdtgroup_mutex);
217 
218 	__set_bit(closid, closid_free_map);
219 }
220 
221 /**
222  * closid_allocated - test if provided closid is in use
223  * @closid: closid to be tested
224  *
225  * Return: true if @closid is currently associated with a resource group,
226  * false if @closid is free
227  */
228 bool closid_allocated(unsigned int closid)
229 {
230 	lockdep_assert_held(&rdtgroup_mutex);
231 
232 	return !test_bit(closid, closid_free_map);
233 }
234 
235 /**
236  * rdtgroup_mode_by_closid - Return mode of resource group with closid
237  * @closid: closid of the resource group
238  *
239  * Each resource group is associated with a @closid. Here the mode
240  * of a resource group can be queried by searching for it using its closid.
241  *
242  * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
243  */
244 enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
245 {
246 	struct rdtgroup *rdtgrp;
247 
248 	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
249 		if (rdtgrp->closid == closid)
250 			return rdtgrp->mode;
251 	}
252 
253 	return RDT_NUM_MODES;
254 }
255 
256 static const char * const rdt_mode_str[] = {
257 	[RDT_MODE_SHAREABLE]		= "shareable",
258 	[RDT_MODE_EXCLUSIVE]		= "exclusive",
259 	[RDT_MODE_PSEUDO_LOCKSETUP]	= "pseudo-locksetup",
260 	[RDT_MODE_PSEUDO_LOCKED]	= "pseudo-locked",
261 };
262 
263 /**
264  * rdtgroup_mode_str - Return the string representation of mode
265  * @mode: the resource group mode as &enum rdtgroup_mode
266  *
267  * Return: string representation of valid mode, "unknown" otherwise
268  */
269 static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
270 {
271 	if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
272 		return "unknown";
273 
274 	return rdt_mode_str[mode];
275 }
276 
277 /* set uid and gid of rdtgroup dirs and files to that of the creator */
278 static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
279 {
280 	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
281 				.ia_uid = current_fsuid(),
282 				.ia_gid = current_fsgid(), };
283 
284 	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
285 	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
286 		return 0;
287 
288 	return kernfs_setattr(kn, &iattr);
289 }
290 
291 static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
292 {
293 	struct kernfs_node *kn;
294 	int ret;
295 
296 	kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
297 				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
298 				  0, rft->kf_ops, rft, NULL, NULL);
299 	if (IS_ERR(kn))
300 		return PTR_ERR(kn);
301 
302 	ret = rdtgroup_kn_set_ugid(kn);
303 	if (ret) {
304 		kernfs_remove(kn);
305 		return ret;
306 	}
307 
308 	return 0;
309 }
310 
311 static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
312 {
313 	struct kernfs_open_file *of = m->private;
314 	struct rftype *rft = of->kn->priv;
315 
316 	if (rft->seq_show)
317 		return rft->seq_show(of, m, arg);
318 	return 0;
319 }
320 
321 static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
322 				   size_t nbytes, loff_t off)
323 {
324 	struct rftype *rft = of->kn->priv;
325 
326 	if (rft->write)
327 		return rft->write(of, buf, nbytes, off);
328 
329 	return -EINVAL;
330 }
331 
332 static const struct kernfs_ops rdtgroup_kf_single_ops = {
333 	.atomic_write_len	= PAGE_SIZE,
334 	.write			= rdtgroup_file_write,
335 	.seq_show		= rdtgroup_seqfile_show,
336 };
337 
338 static const struct kernfs_ops kf_mondata_ops = {
339 	.atomic_write_len	= PAGE_SIZE,
340 	.seq_show		= rdtgroup_mondata_show,
341 };
342 
343 static bool is_cpu_list(struct kernfs_open_file *of)
344 {
345 	struct rftype *rft = of->kn->priv;
346 
347 	return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
348 }
349 
350 static int rdtgroup_cpus_show(struct kernfs_open_file *of,
351 			      struct seq_file *s, void *v)
352 {
353 	struct rdtgroup *rdtgrp;
354 	struct cpumask *mask;
355 	int ret = 0;
356 
357 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
358 
359 	if (rdtgrp) {
360 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
361 			if (!rdtgrp->plr->d) {
362 				rdt_last_cmd_clear();
363 				rdt_last_cmd_puts("Cache domain offline\n");
364 				ret = -ENODEV;
365 			} else {
366 				mask = &rdtgrp->plr->d->hdr.cpu_mask;
367 				seq_printf(s, is_cpu_list(of) ?
368 					   "%*pbl\n" : "%*pb\n",
369 					   cpumask_pr_args(mask));
370 			}
371 		} else {
372 			seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
373 				   cpumask_pr_args(&rdtgrp->cpu_mask));
374 		}
375 	} else {
376 		ret = -ENOENT;
377 	}
378 	rdtgroup_kn_unlock(of->kn);
379 
380 	return ret;
381 }
382 
383 /*
384  * Update the PQR_ASSOC MSR on all cpus in @cpu_mask.
385  *
386  * Per task closids/rmids must have been set up before calling this function.
387  * @r may be NULL.
388  */
389 static void
390 update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
391 {
392 	struct resctrl_cpu_defaults defaults, *p = NULL;
393 
394 	if (r) {
395 		defaults.closid = r->closid;
396 		defaults.rmid = r->mon.rmid;
397 		p = &defaults;
398 	}
399 
400 	on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1);
401 }
402 
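/*
 * cpus_mon_write() - apply a new CPU mask to a MON group.
 *
 * The new mask must be a subset of the parent CTRL_MON group's mask. CPUs
 * dropped from this group are handed back to the parent; CPUs added are
 * taken away from sibling MON groups. The per-CPU closid/rmid defaults are
 * updated for every CPU that changes hands.
 */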
403 static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
404 			  cpumask_var_t tmpmask)
405 {
406 	struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
407 	struct list_head *head;
408 
409 	/* Check whether cpus belong to parent ctrl group */
410 	cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
411 	if (!cpumask_empty(tmpmask)) {
412 		rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
413 		return -EINVAL;
414 	}
415 
416 	/* Check whether cpus are dropped from this group */
417 	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
418 	if (!cpumask_empty(tmpmask)) {
419 		/* Give any dropped cpus to parent rdtgroup */
420 		cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
421 		update_closid_rmid(tmpmask, prgrp);
422 	}
423 
424 	/*
425 	 * If we added cpus, remove them from the previous group that owned them
426 	 * and update the per-cpu rmid.
427 	 */
428 	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
429 	if (!cpumask_empty(tmpmask)) {
430 		head = &prgrp->mon.crdtgrp_list;
431 		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
432 			if (crgrp == rdtgrp)
433 				continue;
434 			cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
435 				       tmpmask);
436 		}
437 		update_closid_rmid(tmpmask, rdtgrp);
438 	}
439 
440 	/* Done pushing/pulling - update this group with new mask */
441 	cpumask_copy(&rdtgrp->cpu_mask, newmask);
442 
443 	return 0;
444 }
445 
446 static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
447 {
448 	struct rdtgroup *crgrp;
449 
450 	cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
451 	/* Update the child mon group masks as well */
452 	list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
453 		cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
454 }
455 
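/*
 * cpus_ctrl_write() - apply a new CPU mask to a CTRL_MON group.
 *
 * CPUs dropped from this group (not permitted for the default group) are
 * given back to the default group. CPUs added are removed from whichever
 * group, including its child MON groups, previously owned them. This
 * group's own child MON group masks are cleared since the parent mask
 * changed.
 */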
456 static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
457 			   cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
458 {
459 	struct rdtgroup *r, *crgrp;
460 	struct list_head *head;
461 
462 	/* Check whether cpus are dropped from this group */
463 	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
464 	if (!cpumask_empty(tmpmask)) {
465 		/* Can't drop from default group */
466 		if (rdtgrp == &rdtgroup_default) {
467 			rdt_last_cmd_puts("Can't drop CPUs from default group\n");
468 			return -EINVAL;
469 		}
470 
471 		/* Give any dropped cpus to rdtgroup_default */
472 		cpumask_or(&rdtgroup_default.cpu_mask,
473 			   &rdtgroup_default.cpu_mask, tmpmask);
474 		update_closid_rmid(tmpmask, &rdtgroup_default);
475 	}
476 
477 	/*
478 	 * If we added cpus, remove them from previous group and
479 	 * the prev group's child groups that owned them
480 	 * and update per-cpu closid/rmid.
481 	 */
482 	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
483 	if (!cpumask_empty(tmpmask)) {
484 		list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
485 			if (r == rdtgrp)
486 				continue;
487 			cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
488 			if (!cpumask_empty(tmpmask1))
489 				cpumask_rdtgrp_clear(r, tmpmask1);
490 		}
491 		update_closid_rmid(tmpmask, rdtgrp);
492 	}
493 
494 	/* Done pushing/pulling - update this group with new mask */
495 	cpumask_copy(&rdtgrp->cpu_mask, newmask);
496 
497 	/*
498 	 * Clear child mon group masks since there is a new parent mask
499 	 * now and update the rmid for the cpus the child lost.
500 	 */
501 	head = &rdtgrp->mon.crdtgrp_list;
502 	list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
503 		cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
504 		update_closid_rmid(tmpmask, rdtgrp);
505 		cpumask_clear(&crgrp->cpu_mask);
506 	}
507 
508 	return 0;
509 }
510 
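/*
 * rdtgroup_cpus_write() - handler for writes to the "cpus"/"cpus_list" files.
 *
 * For example (illustrative group name and mount point):
 *   # echo 2-5 > /sys/fs/resctrl/grp1/cpus_list
 * assigns CPUs 2-5 to the group, pulling them from whichever group owned
 * them before. Only online CPUs may be assigned, and groups that are
 * pseudo-locked or being pseudo-locked cannot be changed.
 */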
511 static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
512 				   char *buf, size_t nbytes, loff_t off)
513 {
514 	cpumask_var_t tmpmask, newmask, tmpmask1;
515 	struct rdtgroup *rdtgrp;
516 	int ret;
517 
518 	if (!buf)
519 		return -EINVAL;
520 
521 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
522 		return -ENOMEM;
523 	if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
524 		free_cpumask_var(tmpmask);
525 		return -ENOMEM;
526 	}
527 	if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
528 		free_cpumask_var(tmpmask);
529 		free_cpumask_var(newmask);
530 		return -ENOMEM;
531 	}
532 
533 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
534 	if (!rdtgrp) {
535 		ret = -ENOENT;
536 		goto unlock;
537 	}
538 
539 	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
540 	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
541 		ret = -EINVAL;
542 		rdt_last_cmd_puts("Pseudo-locking in progress\n");
543 		goto unlock;
544 	}
545 
546 	if (is_cpu_list(of))
547 		ret = cpulist_parse(buf, newmask);
548 	else
549 		ret = cpumask_parse(buf, newmask);
550 
551 	if (ret) {
552 		rdt_last_cmd_puts("Bad CPU list/mask\n");
553 		goto unlock;
554 	}
555 
556 	/* check that user didn't specify any offline cpus */
557 	cpumask_andnot(tmpmask, newmask, cpu_online_mask);
558 	if (!cpumask_empty(tmpmask)) {
559 		ret = -EINVAL;
560 		rdt_last_cmd_puts("Can only assign online CPUs\n");
561 		goto unlock;
562 	}
563 
564 	if (rdtgrp->type == RDTCTRL_GROUP)
565 		ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
566 	else if (rdtgrp->type == RDTMON_GROUP)
567 		ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
568 	else
569 		ret = -EINVAL;
570 
571 unlock:
572 	rdtgroup_kn_unlock(of->kn);
573 	free_cpumask_var(tmpmask);
574 	free_cpumask_var(newmask);
575 	free_cpumask_var(tmpmask1);
576 
577 	return ret ?: nbytes;
578 }
579 
580 /**
581  * rdtgroup_remove - the helper to remove resource group safely
582  * @rdtgrp: resource group to remove
583  *
584  * On resource group creation via a mkdir, an extra kernfs_node reference is
585  * taken to ensure that the rdtgroup structure remains accessible for the
586  * rdtgroup_kn_unlock() calls where it is removed.
587  *
588  * Drop the extra reference here, then free the rdtgroup structure.
589  *
590  * Return: void
591  */
592 static void rdtgroup_remove(struct rdtgroup *rdtgrp)
593 {
594 	kernfs_put(rdtgrp->kn);
595 	kfree(rdtgrp);
596 }
597 
598 static void _update_task_closid_rmid(void *task)
599 {
600 	/*
601 	 * If the task is still current on this CPU, update PQR_ASSOC MSR.
602 	 * Otherwise, the MSR is updated when the task is scheduled in.
603 	 */
604 	if (task == current)
605 		resctrl_arch_sched_in(task);
606 }
607 
608 static void update_task_closid_rmid(struct task_struct *t)
609 {
610 	if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
611 		smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
612 	else
613 		_update_task_closid_rmid(t);
614 }
615 
616 static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
617 {
618 	u32 closid, rmid = rdtgrp->mon.rmid;
619 
620 	if (rdtgrp->type == RDTCTRL_GROUP)
621 		closid = rdtgrp->closid;
622 	else if (rdtgrp->type == RDTMON_GROUP)
623 		closid = rdtgrp->mon.parent->closid;
624 	else
625 		return false;
626 
627 	return resctrl_arch_match_closid(tsk, closid) &&
628 	       resctrl_arch_match_rmid(tsk, closid, rmid);
629 }
630 
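/*
 * __rdtgroup_move_task() - move @tsk into @rdtgrp.
 *
 * Writes the task's new closid/rmid and, if the task is currently running,
 * interrupts its CPU so the change takes effect immediately rather than at
 * the next context switch.
 */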
631 static int __rdtgroup_move_task(struct task_struct *tsk,
632 				struct rdtgroup *rdtgrp)
633 {
634 	/* If the task is already in rdtgrp, no need to move the task. */
635 	if (task_in_rdtgroup(tsk, rdtgrp))
636 		return 0;
637 
638 	/*
639 	 * Set the task's closid/rmid before the PQR_ASSOC MSR can be
640 	 * updated by them.
641 	 *
642 	 * For ctrl_mon groups, move both closid and rmid.
643 	 * For monitor groups, tasks can only be moved from
644 	 * their parent CTRL group.
645 	 */
646 	if (rdtgrp->type == RDTMON_GROUP &&
647 	    !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
648 		rdt_last_cmd_puts("Can't move task to different control group\n");
649 		return -EINVAL;
650 	}
651 
652 	if (rdtgrp->type == RDTMON_GROUP)
653 		resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
654 					     rdtgrp->mon.rmid);
655 	else
656 		resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
657 					     rdtgrp->mon.rmid);
658 
659 	/*
660 	 * Ensure the task's closid and rmid are written before determining if
661 	 * the task is current, which decides whether it will be interrupted.
662 	 * This pairs with the full barrier between the rq->curr update and
663 	 * resctrl_arch_sched_in() during context switch.
664 	 */
665 	smp_mb();
666 
667 	/*
668 	 * By now, the task's closid and rmid are set. If the task is current
669 	 * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
670 	 * group go into effect. If the task is not current, the MSR will be
671 	 * updated when the task is scheduled in.
672 	 */
673 	update_task_closid_rmid(tsk);
674 
675 	return 0;
676 }
677 
678 static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
679 {
680 	return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
681 		resctrl_arch_match_closid(t, r->closid));
682 }
683 
684 static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
685 {
686 	return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
687 		resctrl_arch_match_rmid(t, r->mon.parent->closid,
688 					r->mon.rmid));
689 }
690 
691 /**
692  * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
693  * @r: Resource group
694  *
695  * Return: 1 if tasks have been assigned to @r, 0 otherwise
696  */
697 int rdtgroup_tasks_assigned(struct rdtgroup *r)
698 {
699 	struct task_struct *p, *t;
700 	int ret = 0;
701 
702 	lockdep_assert_held(&rdtgroup_mutex);
703 
704 	rcu_read_lock();
705 	for_each_process_thread(p, t) {
706 		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
707 			ret = 1;
708 			break;
709 		}
710 	}
711 	rcu_read_unlock();
712 
713 	return ret;
714 }
715 
716 static int rdtgroup_task_write_permission(struct task_struct *task,
717 					  struct kernfs_open_file *of)
718 {
719 	const struct cred *tcred = get_task_cred(task);
720 	const struct cred *cred = current_cred();
721 	int ret = 0;
722 
723 	/*
724 	 * Even if we're attaching all tasks in the thread group, we only
725 	 * need to check permissions on one of them.
726 	 */
727 	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
728 	    !uid_eq(cred->euid, tcred->uid) &&
729 	    !uid_eq(cred->euid, tcred->suid)) {
730 		rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
731 		ret = -EPERM;
732 	}
733 
734 	put_cred(tcred);
735 	return ret;
736 }
737 
738 static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
739 			      struct kernfs_open_file *of)
740 {
741 	struct task_struct *tsk;
742 	int ret;
743 
744 	rcu_read_lock();
745 	if (pid) {
746 		tsk = find_task_by_vpid(pid);
747 		if (!tsk) {
748 			rcu_read_unlock();
749 			rdt_last_cmd_printf("No task %d\n", pid);
750 			return -ESRCH;
751 		}
752 	} else {
753 		tsk = current;
754 	}
755 
756 	get_task_struct(tsk);
757 	rcu_read_unlock();
758 
759 	ret = rdtgroup_task_write_permission(tsk, of);
760 	if (!ret)
761 		ret = __rdtgroup_move_task(tsk, rdtgrp);
762 
763 	put_task_struct(tsk);
764 	return ret;
765 }
766 
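/*
 * rdtgroup_tasks_write() - handler for writes to the "tasks" file.
 *
 * Accepts a comma separated list of PIDs, e.g. (illustrative):
 *   # echo 1234,5678 > /sys/fs/resctrl/grp1/tasks
 * Each listed task is moved into this group. Processing stops at the first
 * PID that cannot be parsed or moved.
 */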
767 static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
768 				    char *buf, size_t nbytes, loff_t off)
769 {
770 	struct rdtgroup *rdtgrp;
771 	char *pid_str;
772 	int ret = 0;
773 	pid_t pid;
774 
775 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
776 	if (!rdtgrp) {
777 		rdtgroup_kn_unlock(of->kn);
778 		return -ENOENT;
779 	}
780 	rdt_last_cmd_clear();
781 
782 	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
783 	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
784 		ret = -EINVAL;
785 		rdt_last_cmd_puts("Pseudo-locking in progress\n");
786 		goto unlock;
787 	}
788 
789 	while (buf && buf[0] != '\0' && buf[0] != '\n') {
790 		pid_str = strim(strsep(&buf, ","));
791 
792 		if (kstrtoint(pid_str, 0, &pid)) {
793 			rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
794 			ret = -EINVAL;
795 			break;
796 		}
797 
798 		if (pid < 0) {
799 			rdt_last_cmd_printf("Invalid pid %d\n", pid);
800 			ret = -EINVAL;
801 			break;
802 		}
803 
804 		ret = rdtgroup_move_task(pid, rdtgrp, of);
805 		if (ret) {
806 			rdt_last_cmd_printf("Error while processing task %d\n", pid);
807 			break;
808 		}
809 	}
810 
811 unlock:
812 	rdtgroup_kn_unlock(of->kn);
813 
814 	return ret ?: nbytes;
815 }
816 
817 static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
818 {
819 	struct task_struct *p, *t;
820 	pid_t pid;
821 
822 	rcu_read_lock();
823 	for_each_process_thread(p, t) {
824 		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
825 			pid = task_pid_vnr(t);
826 			if (pid)
827 				seq_printf(s, "%d\n", pid);
828 		}
829 	}
830 	rcu_read_unlock();
831 }
832 
833 static int rdtgroup_tasks_show(struct kernfs_open_file *of,
834 			       struct seq_file *s, void *v)
835 {
836 	struct rdtgroup *rdtgrp;
837 	int ret = 0;
838 
839 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
840 	if (rdtgrp)
841 		show_rdt_tasks(rdtgrp, s);
842 	else
843 		ret = -ENOENT;
844 	rdtgroup_kn_unlock(of->kn);
845 
846 	return ret;
847 }
848 
849 static int rdtgroup_closid_show(struct kernfs_open_file *of,
850 				struct seq_file *s, void *v)
851 {
852 	struct rdtgroup *rdtgrp;
853 	int ret = 0;
854 
855 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
856 	if (rdtgrp)
857 		seq_printf(s, "%u\n", rdtgrp->closid);
858 	else
859 		ret = -ENOENT;
860 	rdtgroup_kn_unlock(of->kn);
861 
862 	return ret;
863 }
864 
865 static int rdtgroup_rmid_show(struct kernfs_open_file *of,
866 			      struct seq_file *s, void *v)
867 {
868 	struct rdtgroup *rdtgrp;
869 	int ret = 0;
870 
871 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
872 	if (rdtgrp)
873 		seq_printf(s, "%u\n", rdtgrp->mon.rmid);
874 	else
875 		ret = -ENOENT;
876 	rdtgroup_kn_unlock(of->kn);
877 
878 	return ret;
879 }
880 
881 #ifdef CONFIG_PROC_CPU_RESCTRL
882 /*
883  * A task can only be part of one resctrl control group and of one monitor
884  * group which is associated with that control group.
885  *
886  * 1)   res:
887  *      mon:
888  *
889  *    resctrl is not available.
890  *
891  * 2)   res:/
892  *      mon:
893  *
894  *    Task is part of the root resctrl control group, and it is not associated
895  *    to any monitor group.
896  *
897  * 3)  res:/
898  *     mon:mon0
899  *
900  *    Task is part of the root resctrl control group and monitor group mon0.
901  *
902  * 4)  res:group0
903  *     mon:
904  *
905  *    Task is part of resctrl control group group0, and it is not associated
906  *    to any monitor group.
907  *
908  * 5) res:group0
909  *    mon:mon1
910  *
911  *    Task is part of resctrl control group group0 and monitor group mon1.
912  */
913 int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
914 		      struct pid *pid, struct task_struct *tsk)
915 {
916 	struct rdtgroup *rdtg;
917 	int ret = 0;
918 
919 	mutex_lock(&rdtgroup_mutex);
920 
921 	/* Return empty if resctrl has not been mounted. */
922 	if (!resctrl_mounted) {
923 		seq_puts(s, "res:\nmon:\n");
924 		goto unlock;
925 	}
926 
927 	list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
928 		struct rdtgroup *crg;
929 
930 		/*
931 		 * Task information is only relevant for shareable
932 		 * and exclusive groups.
933 		 */
934 		if (rdtg->mode != RDT_MODE_SHAREABLE &&
935 		    rdtg->mode != RDT_MODE_EXCLUSIVE)
936 			continue;
937 
938 		if (!resctrl_arch_match_closid(tsk, rdtg->closid))
939 			continue;
940 
941 		seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
942 			   rdt_kn_name(rdtg->kn));
943 		seq_puts(s, "mon:");
944 		list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
945 				    mon.crdtgrp_list) {
946 			if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
947 						     crg->mon.rmid))
948 				continue;
949 			seq_printf(s, "%s", rdt_kn_name(crg->kn));
950 			break;
951 		}
952 		seq_putc(s, '\n');
953 		goto unlock;
954 	}
955 	/*
956 	 * The above search should succeed. Otherwise return
957 	 * with an error.
958 	 */
959 	ret = -ENOENT;
960 unlock:
961 	mutex_unlock(&rdtgroup_mutex);
962 
963 	return ret;
964 }
965 #endif
966 
967 static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
968 				    struct seq_file *seq, void *v)
969 {
970 	int len;
971 
972 	mutex_lock(&rdtgroup_mutex);
973 	len = seq_buf_used(&last_cmd_status);
974 	if (len)
975 		seq_printf(seq, "%.*s", len, last_cmd_status_buf);
976 	else
977 		seq_puts(seq, "ok\n");
978 	mutex_unlock(&rdtgroup_mutex);
979 	return 0;
980 }
981 
982 static void *rdt_kn_parent_priv(struct kernfs_node *kn)
983 {
984 	/*
985 	 * The parent pointer is only valid within RCU section since it can be
986 	 * replaced.
987 	 */
988 	guard(rcu)();
989 	return rcu_dereference(kn->__parent)->priv;
990 }
991 
992 static int rdt_num_closids_show(struct kernfs_open_file *of,
993 				struct seq_file *seq, void *v)
994 {
995 	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
996 
997 	seq_printf(seq, "%u\n", s->num_closid);
998 	return 0;
999 }
1000 
1001 static int rdt_default_ctrl_show(struct kernfs_open_file *of,
1002 				 struct seq_file *seq, void *v)
1003 {
1004 	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1005 	struct rdt_resource *r = s->res;
1006 
1007 	seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r));
1008 	return 0;
1009 }
1010 
1011 static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
1012 				 struct seq_file *seq, void *v)
1013 {
1014 	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1015 	struct rdt_resource *r = s->res;
1016 
1017 	seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
1018 	return 0;
1019 }
1020 
1021 static int rdt_shareable_bits_show(struct kernfs_open_file *of,
1022 				   struct seq_file *seq, void *v)
1023 {
1024 	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1025 	struct rdt_resource *r = s->res;
1026 
1027 	seq_printf(seq, "%x\n", r->cache.shareable_bits);
1028 	return 0;
1029 }
1030 
1031 /*
1032  * rdt_bit_usage_show - Display current usage of resources
1033  *
1034  * A domain is a shared resource that can now be allocated differently. Here
1035  * we display the current regions of the domain as an annotated bitmask.
1036  * For each domain of this resource its allocation bitmask
1037  * is annotated as below to indicate the current usage of the corresponding bit:
1038  *   0 - currently unused
1039  *   X - currently available for sharing and used by software and hardware
1040  *   H - currently used by hardware only but available for software use
1041  *   S - currently used and shareable by software only
1042  *   E - currently used exclusively by one resource group
1043  *   P - currently pseudo-locked by one resource group
1044  */
1045 static int rdt_bit_usage_show(struct kernfs_open_file *of,
1046 			      struct seq_file *seq, void *v)
1047 {
1048 	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1049 	/*
1050 	 * Use unsigned long even though only 32 bits are used to ensure
1051 	 * test_bit() is used safely.
1052 	 */
1053 	unsigned long sw_shareable = 0, hw_shareable = 0;
1054 	unsigned long exclusive = 0, pseudo_locked = 0;
1055 	struct rdt_resource *r = s->res;
1056 	struct rdt_ctrl_domain *dom;
1057 	int i, hwb, swb, excl, psl;
1058 	enum rdtgrp_mode mode;
1059 	bool sep = false;
1060 	u32 ctrl_val;
1061 
1062 	cpus_read_lock();
1063 	mutex_lock(&rdtgroup_mutex);
1064 	hw_shareable = r->cache.shareable_bits;
1065 	list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
1066 		if (sep)
1067 			seq_putc(seq, ';');
1068 		sw_shareable = 0;
1069 		exclusive = 0;
1070 		seq_printf(seq, "%d=", dom->hdr.id);
1071 		for (i = 0; i < closids_supported(); i++) {
1072 			if (!closid_allocated(i))
1073 				continue;
1074 			ctrl_val = resctrl_arch_get_config(r, dom, i,
1075 							   s->conf_type);
1076 			mode = rdtgroup_mode_by_closid(i);
1077 			switch (mode) {
1078 			case RDT_MODE_SHAREABLE:
1079 				sw_shareable |= ctrl_val;
1080 				break;
1081 			case RDT_MODE_EXCLUSIVE:
1082 				exclusive |= ctrl_val;
1083 				break;
1084 			case RDT_MODE_PSEUDO_LOCKSETUP:
1085 			/*
1086 			 * RDT_MODE_PSEUDO_LOCKSETUP is possible
1087 			 * here but not included since the CBM
1088 			 * associated with this CLOSID in this mode
1089 			 * is not initialized and no task or cpu can be
1090 			 * assigned this CLOSID.
1091 			 */
1092 				break;
1093 			case RDT_MODE_PSEUDO_LOCKED:
1094 			case RDT_NUM_MODES:
1095 				WARN(1,
1096 				     "invalid mode for closid %d\n", i);
1097 				break;
1098 			}
1099 		}
1100 		for (i = r->cache.cbm_len - 1; i >= 0; i--) {
1101 			pseudo_locked = dom->plr ? dom->plr->cbm : 0;
1102 			hwb = test_bit(i, &hw_shareable);
1103 			swb = test_bit(i, &sw_shareable);
1104 			excl = test_bit(i, &exclusive);
1105 			psl = test_bit(i, &pseudo_locked);
1106 			if (hwb && swb)
1107 				seq_putc(seq, 'X');
1108 			else if (hwb && !swb)
1109 				seq_putc(seq, 'H');
1110 			else if (!hwb && swb)
1111 				seq_putc(seq, 'S');
1112 			else if (excl)
1113 				seq_putc(seq, 'E');
1114 			else if (psl)
1115 				seq_putc(seq, 'P');
1116 			else /* Unused bits remain */
1117 				seq_putc(seq, '0');
1118 		}
1119 		sep = true;
1120 	}
1121 	seq_putc(seq, '\n');
1122 	mutex_unlock(&rdtgroup_mutex);
1123 	cpus_read_unlock();
1124 	return 0;
1125 }
1126 
1127 static int rdt_min_bw_show(struct kernfs_open_file *of,
1128 			   struct seq_file *seq, void *v)
1129 {
1130 	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1131 	struct rdt_resource *r = s->res;
1132 
1133 	seq_printf(seq, "%u\n", r->membw.min_bw);
1134 	return 0;
1135 }
1136 
1137 static int rdt_num_rmids_show(struct kernfs_open_file *of,
1138 			      struct seq_file *seq, void *v)
1139 {
1140 	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1141 
1142 	seq_printf(seq, "%d\n", r->num_rmid);
1143 
1144 	return 0;
1145 }
1146 
1147 static int rdt_mon_features_show(struct kernfs_open_file *of,
1148 				 struct seq_file *seq, void *v)
1149 {
1150 	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1151 	struct mon_evt *mevt;
1152 
1153 	list_for_each_entry(mevt, &r->evt_list, list) {
1154 		seq_printf(seq, "%s\n", mevt->name);
1155 		if (mevt->configurable)
1156 			seq_printf(seq, "%s_config\n", mevt->name);
1157 	}
1158 
1159 	return 0;
1160 }
1161 
1162 static int rdt_bw_gran_show(struct kernfs_open_file *of,
1163 			    struct seq_file *seq, void *v)
1164 {
1165 	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1166 	struct rdt_resource *r = s->res;
1167 
1168 	seq_printf(seq, "%u\n", r->membw.bw_gran);
1169 	return 0;
1170 }
1171 
1172 static int rdt_delay_linear_show(struct kernfs_open_file *of,
1173 				 struct seq_file *seq, void *v)
1174 {
1175 	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1176 	struct rdt_resource *r = s->res;
1177 
1178 	seq_printf(seq, "%u\n", r->membw.delay_linear);
1179 	return 0;
1180 }
1181 
1182 static int max_threshold_occ_show(struct kernfs_open_file *of,
1183 				  struct seq_file *seq, void *v)
1184 {
1185 	seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
1186 
1187 	return 0;
1188 }
1189 
1190 static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
1191 					 struct seq_file *seq, void *v)
1192 {
1193 	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1194 	struct rdt_resource *r = s->res;
1195 
1196 	switch (r->membw.throttle_mode) {
1197 	case THREAD_THROTTLE_PER_THREAD:
1198 		seq_puts(seq, "per-thread\n");
1199 		return 0;
1200 	case THREAD_THROTTLE_MAX:
1201 		seq_puts(seq, "max\n");
1202 		return 0;
1203 	case THREAD_THROTTLE_UNDEFINED:
1204 		seq_puts(seq, "undefined\n");
1205 		return 0;
1206 	}
1207 
1208 	WARN_ON_ONCE(1);
1209 
1210 	return 0;
1211 }
1212 
1213 static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
1214 				       char *buf, size_t nbytes, loff_t off)
1215 {
1216 	unsigned int bytes;
1217 	int ret;
1218 
1219 	ret = kstrtouint(buf, 0, &bytes);
1220 	if (ret)
1221 		return ret;
1222 
1223 	if (bytes > resctrl_rmid_realloc_limit)
1224 		return -EINVAL;
1225 
1226 	resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
1227 
1228 	return nbytes;
1229 }
1230 
1231 /*
1232  * rdtgroup_mode_show - Display mode of this resource group
1233  */
1234 static int rdtgroup_mode_show(struct kernfs_open_file *of,
1235 			      struct seq_file *s, void *v)
1236 {
1237 	struct rdtgroup *rdtgrp;
1238 
1239 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
1240 	if (!rdtgrp) {
1241 		rdtgroup_kn_unlock(of->kn);
1242 		return -ENOENT;
1243 	}
1244 
1245 	seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
1246 
1247 	rdtgroup_kn_unlock(of->kn);
1248 	return 0;
1249 }
1250 
1251 static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
1252 {
1253 	switch (my_type) {
1254 	case CDP_CODE:
1255 		return CDP_DATA;
1256 	case CDP_DATA:
1257 		return CDP_CODE;
1258 	default:
1259 	case CDP_NONE:
1260 		return CDP_NONE;
1261 	}
1262 }
1263 
1264 static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
1265 					struct seq_file *seq, void *v)
1266 {
1267 	struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
1268 	struct rdt_resource *r = s->res;
1269 
1270 	seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
1271 
1272 	return 0;
1273 }
1274 
1275 /**
1276  * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
1277  * @r: Resource to which domain instance @d belongs.
1278  * @d: The domain instance for which @closid is being tested.
1279  * @cbm: Capacity bitmask being tested.
1280  * @closid: Intended closid for @cbm.
1281  * @type: CDP type of @r.
1282  * @exclusive: Only check if overlaps with exclusive resource groups
1283  *
1284  * Checks if provided @cbm intended to be used for @closid on domain
1285  * @d overlaps with any other closids or other hardware usage associated
1286  * with this domain. If @exclusive is true then only overlaps with
1287  * resource groups in exclusive mode will be considered. If @exclusive
1288  * is false then overlaps with any resource group or hardware entities
1289  * will be considered.
1290  *
1291  * @cbm is unsigned long, even if only 32 bits are used, to make the
1292  * bitmap functions work correctly.
1293  *
1294  * Return: false if CBM does not overlap, true if it does.
1295  */
1296 static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d,
1297 				    unsigned long cbm, int closid,
1298 				    enum resctrl_conf_type type, bool exclusive)
1299 {
1300 	enum rdtgrp_mode mode;
1301 	unsigned long ctrl_b;
1302 	int i;
1303 
1304 	/* Check for any overlap with regions used by hardware directly */
1305 	if (!exclusive) {
1306 		ctrl_b = r->cache.shareable_bits;
1307 		if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
1308 			return true;
1309 	}
1310 
1311 	/* Check for overlap with other resource groups */
1312 	for (i = 0; i < closids_supported(); i++) {
1313 		ctrl_b = resctrl_arch_get_config(r, d, i, type);
1314 		mode = rdtgroup_mode_by_closid(i);
1315 		if (closid_allocated(i) && i != closid &&
1316 		    mode != RDT_MODE_PSEUDO_LOCKSETUP) {
1317 			if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
1318 				if (exclusive) {
1319 					if (mode == RDT_MODE_EXCLUSIVE)
1320 						return true;
1321 					continue;
1322 				}
1323 				return true;
1324 			}
1325 		}
1326 	}
1327 
1328 	return false;
1329 }
1330 
1331 /**
1332  * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
1333  * @s: Schema for the resource to which domain instance @d belongs.
1334  * @d: The domain instance for which @closid is being tested.
1335  * @cbm: Capacity bitmask being tested.
1336  * @closid: Intended closid for @cbm.
1337  * @exclusive: Only check if overlaps with exclusive resource groups
1338  *
1339  * Resources that can be allocated using a CBM can use the CBM to control
1340  * the overlap of these allocations. rdtgroup_cbm_overlaps() is the test
1341  * for overlap. Overlap test is not limited to the specific resource for
1342  * which the CBM is intended though - when dealing with CDP resources that
1343  * share the underlying hardware the overlap check should be performed on
1344  * the CDP resource sharing the hardware also.
1345  *
1346  * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
1347  * overlap test.
1348  *
1349  * Return: true if CBM overlap detected, false if there is no overlap
1350  */
1351 bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
1352 			   unsigned long cbm, int closid, bool exclusive)
1353 {
1354 	enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
1355 	struct rdt_resource *r = s->res;
1356 
1357 	if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type,
1358 				    exclusive))
1359 		return true;
1360 
1361 	if (!resctrl_arch_get_cdp_enabled(r->rid))
1362 		return false;
1363 	return  __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive);
1364 }
1365 
1366 /**
1367  * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
1368  * @rdtgrp: Resource group identified through its closid.
1369  *
1370  * An exclusive resource group implies that there should be no sharing of
1371  * its allocated resources. At the time this group is considered to be
1372  * exclusive this test can determine if its current schemata supports this
1373  * setting by testing for overlap with all other resource groups.
1374  *
1375  * Return: true if resource group can be exclusive, false if there is overlap
1376  * with allocations of other resource groups and thus this resource group
1377  * cannot be exclusive.
1378  */
1379 static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
1380 {
1381 	int closid = rdtgrp->closid;
1382 	struct rdt_ctrl_domain *d;
1383 	struct resctrl_schema *s;
1384 	struct rdt_resource *r;
1385 	bool has_cache = false;
1386 	u32 ctrl;
1387 
1388 	/* Walking r->domains, ensure it can't race with cpuhp */
1389 	lockdep_assert_cpus_held();
1390 
1391 	list_for_each_entry(s, &resctrl_schema_all, list) {
1392 		r = s->res;
1393 		if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
1394 			continue;
1395 		has_cache = true;
1396 		list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
1397 			ctrl = resctrl_arch_get_config(r, d, closid,
1398 						       s->conf_type);
1399 			if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
1400 				rdt_last_cmd_puts("Schemata overlaps\n");
1401 				return false;
1402 			}
1403 		}
1404 	}
1405 
1406 	if (!has_cache) {
1407 		rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
1408 		return false;
1409 	}
1410 
1411 	return true;
1412 }
1413 
1414 /*
1415  * rdtgroup_mode_write - Modify the resource group's mode
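 *
 * Accepted values are "shareable", "exclusive" and, when pseudo-locking is
 * supported, "pseudo-locksetup", each followed by a newline. A group that
 * is already pseudo-locked cannot be changed.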
1416  */
1417 static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
1418 				   char *buf, size_t nbytes, loff_t off)
1419 {
1420 	struct rdtgroup *rdtgrp;
1421 	enum rdtgrp_mode mode;
1422 	int ret = 0;
1423 
1424 	/* Valid input requires a trailing newline */
1425 	if (nbytes == 0 || buf[nbytes - 1] != '\n')
1426 		return -EINVAL;
1427 	buf[nbytes - 1] = '\0';
1428 
1429 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
1430 	if (!rdtgrp) {
1431 		rdtgroup_kn_unlock(of->kn);
1432 		return -ENOENT;
1433 	}
1434 
1435 	rdt_last_cmd_clear();
1436 
1437 	mode = rdtgrp->mode;
1438 
1439 	if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
1440 	    (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
1441 	    (!strcmp(buf, "pseudo-locksetup") &&
1442 	     mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
1443 	    (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
1444 		goto out;
1445 
1446 	if (mode == RDT_MODE_PSEUDO_LOCKED) {
1447 		rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
1448 		ret = -EINVAL;
1449 		goto out;
1450 	}
1451 
1452 	if (!strcmp(buf, "shareable")) {
1453 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1454 			ret = rdtgroup_locksetup_exit(rdtgrp);
1455 			if (ret)
1456 				goto out;
1457 		}
1458 		rdtgrp->mode = RDT_MODE_SHAREABLE;
1459 	} else if (!strcmp(buf, "exclusive")) {
1460 		if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
1461 			ret = -EINVAL;
1462 			goto out;
1463 		}
1464 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1465 			ret = rdtgroup_locksetup_exit(rdtgrp);
1466 			if (ret)
1467 				goto out;
1468 		}
1469 		rdtgrp->mode = RDT_MODE_EXCLUSIVE;
1470 	} else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) &&
1471 		   !strcmp(buf, "pseudo-locksetup")) {
1472 		ret = rdtgroup_locksetup_enter(rdtgrp);
1473 		if (ret)
1474 			goto out;
1475 		rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
1476 	} else {
1477 		rdt_last_cmd_puts("Unknown or unsupported mode\n");
1478 		ret = -EINVAL;
1479 	}
1480 
1481 out:
1482 	rdtgroup_kn_unlock(of->kn);
1483 	return ret ?: nbytes;
1484 }
1485 
1486 /**
1487  * rdtgroup_cbm_to_size - Translate CBM to size in bytes
1488  * @r: RDT resource to which @d belongs.
1489  * @d: RDT domain instance.
1490  * @cbm: bitmask for which the size should be computed.
1491  *
1492  * The bitmask provided associated with the RDT domain instance @d will be
1493  * translated into how many bytes it represents. The size in bytes is
1494  * computed by first dividing the total cache size by the CBM length to
1495  * determine how many bytes each bit in the bitmask represents. The result
1496  * is multiplied with the number of bits set in the bitmask.
1497  *
1498  * @cbm is unsigned long, even if only 32 bits are used, to make the
1499  * bitmap functions work correctly.
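 *
 * For example (illustrative numbers): a 32 MB cache with a 16 bit CBM gives
 * 2 MB per bit, so a CBM with 8 bits set corresponds to 16 MB.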
1500  */
1501 unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
1502 				  struct rdt_ctrl_domain *d, unsigned long cbm)
1503 {
1504 	unsigned int size = 0;
1505 	struct cacheinfo *ci;
1506 	int num_b;
1507 
1508 	if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE))
1509 		return size;
1510 
1511 	num_b = bitmap_weight(&cbm, r->cache.cbm_len);
1512 	ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope);
1513 	if (ci)
1514 		size = ci->size / r->cache.cbm_len * num_b;
1515 
1516 	return size;
1517 }
1518 
1519 bool is_mba_sc(struct rdt_resource *r)
1520 {
1521 	if (!r)
1522 		r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
1523 
1524 	/*
1525 	 * The software controller support is only applicable to MBA resource.
1526 	 * Make sure to check for resource type.
1527 	 */
1528 	if (r->rid != RDT_RESOURCE_MBA)
1529 		return false;
1530 
1531 	return r->membw.mba_sc;
1532 }
1533 
1534 /*
1535  * rdtgroup_size_show - Display size in bytes of allocated regions
1536  *
1537  * The "size" file mirrors the layout of the "schemata" file, printing the
1538  * size in bytes of each region instead of the capacity bitmask.
1539  */
1540 static int rdtgroup_size_show(struct kernfs_open_file *of,
1541 			      struct seq_file *s, void *v)
1542 {
1543 	struct resctrl_schema *schema;
1544 	enum resctrl_conf_type type;
1545 	struct rdt_ctrl_domain *d;
1546 	struct rdtgroup *rdtgrp;
1547 	struct rdt_resource *r;
1548 	unsigned int size;
1549 	int ret = 0;
1550 	u32 closid;
1551 	bool sep;
1552 	u32 ctrl;
1553 
1554 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
1555 	if (!rdtgrp) {
1556 		rdtgroup_kn_unlock(of->kn);
1557 		return -ENOENT;
1558 	}
1559 
1560 	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
1561 		if (!rdtgrp->plr->d) {
1562 			rdt_last_cmd_clear();
1563 			rdt_last_cmd_puts("Cache domain offline\n");
1564 			ret = -ENODEV;
1565 		} else {
1566 			seq_printf(s, "%*s:", max_name_width,
1567 				   rdtgrp->plr->s->name);
1568 			size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
1569 						    rdtgrp->plr->d,
1570 						    rdtgrp->plr->cbm);
1571 			seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size);
1572 		}
1573 		goto out;
1574 	}
1575 
1576 	closid = rdtgrp->closid;
1577 
1578 	list_for_each_entry(schema, &resctrl_schema_all, list) {
1579 		r = schema->res;
1580 		type = schema->conf_type;
1581 		sep = false;
1582 		seq_printf(s, "%*s:", max_name_width, schema->name);
1583 		list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
1584 			if (sep)
1585 				seq_putc(s, ';');
1586 			if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
1587 				size = 0;
1588 			} else {
1589 				if (is_mba_sc(r))
1590 					ctrl = d->mbps_val[closid];
1591 				else
1592 					ctrl = resctrl_arch_get_config(r, d,
1593 								       closid,
1594 								       type);
1595 				if (r->rid == RDT_RESOURCE_MBA ||
1596 				    r->rid == RDT_RESOURCE_SMBA)
1597 					size = ctrl;
1598 				else
1599 					size = rdtgroup_cbm_to_size(r, d, ctrl);
1600 			}
1601 			seq_printf(s, "%d=%u", d->hdr.id, size);
1602 			sep = true;
1603 		}
1604 		seq_putc(s, '\n');
1605 	}
1606 
1607 out:
1608 	rdtgroup_kn_unlock(of->kn);
1609 
1610 	return ret;
1611 }
1612 
1613 static void mondata_config_read(struct resctrl_mon_config_info *mon_info)
1614 {
1615 	smp_call_function_any(&mon_info->d->hdr.cpu_mask,
1616 			      resctrl_arch_mon_event_config_read, mon_info, 1);
1617 }
1618 
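/*
 * mbm_config_show() - print the current configuration of event @evtid for
 * each monitoring domain of @r as "<domain id>=<hex value>" pairs separated
 * by ';', e.g. (illustrative) "0=0x7f;1=0x7f".
 */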
1619 static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
1620 {
1621 	struct resctrl_mon_config_info mon_info;
1622 	struct rdt_mon_domain *dom;
1623 	bool sep = false;
1624 
1625 	cpus_read_lock();
1626 	mutex_lock(&rdtgroup_mutex);
1627 
1628 	list_for_each_entry(dom, &r->mon_domains, hdr.list) {
1629 		if (sep)
1630 			seq_puts(s, ";");
1631 
1632 		memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info));
1633 		mon_info.r = r;
1634 		mon_info.d = dom;
1635 		mon_info.evtid = evtid;
1636 		mondata_config_read(&mon_info);
1637 
1638 		seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
1639 		sep = true;
1640 	}
1641 	seq_puts(s, "\n");
1642 
1643 	mutex_unlock(&rdtgroup_mutex);
1644 	cpus_read_unlock();
1645 
1646 	return 0;
1647 }
1648 
1649 static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
1650 				       struct seq_file *seq, void *v)
1651 {
1652 	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1653 
1654 	mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
1655 
1656 	return 0;
1657 }
1658 
1659 static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
1660 				       struct seq_file *seq, void *v)
1661 {
1662 	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1663 
1664 	mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
1665 
1666 	return 0;
1667 }
1668 
1669 static void mbm_config_write_domain(struct rdt_resource *r,
1670 				    struct rdt_mon_domain *d, u32 evtid, u32 val)
1671 {
1672 	struct resctrl_mon_config_info mon_info = {0};
1673 
1674 	/*
1675 	 * Read the current config value first. If it already matches the new
1676 	 * value then there is no need to write it again.
1677 	 */
1678 	mon_info.r = r;
1679 	mon_info.d = d;
1680 	mon_info.evtid = evtid;
1681 	mondata_config_read(&mon_info);
1682 	if (mon_info.mon_config == val)
1683 		return;
1684 
1685 	mon_info.mon_config = val;
1686 
1687 	/*
1688 	 * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
1689 	 * domain. The MSRs offset from MSR_IA32_EVT_CFG_BASE
1690 	 * are scoped at the domain level. Writing any of these MSRs
1691 	 * on one CPU is observed by all the CPUs in the domain.
1692 	 */
1693 	smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write,
1694 			      &mon_info, 1);
1695 
1696 	/*
1697 	 * When an Event Configuration is changed, the bandwidth counters
1698 	 * for all RMIDs and Events will be cleared by the hardware. The
1699 	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
1700 	 * every RMID on the next read to any event for every RMID.
1701 	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
1702 	 * cleared while it is tracked by the hardware. Clear the
1703 	 * mbm_local and mbm_total counts for all the RMIDs.
1704 	 */
1705 	resctrl_arch_reset_rmid_all(r, d);
1706 }
1707 
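/*
 * mon_config_write() - parse and apply a new event configuration.
 *
 * The input has the same shape as the *_config files' output, i.e.
 * "<domain id>=<hex value>" pairs separated by ';', for example
 * (illustrative) "0=0x33;1=0x33". The value must not enable bits outside
 * r->mbm_cfg_mask and every referenced domain id must exist.
 */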
1708 static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
1709 {
1710 	char *dom_str = NULL, *id_str;
1711 	unsigned long dom_id, val;
1712 	struct rdt_mon_domain *d;
1713 
1714 	/* Walking r->domains, ensure it can't race with cpuhp */
1715 	lockdep_assert_cpus_held();
1716 
1717 next:
1718 	if (!tok || tok[0] == '\0')
1719 		return 0;
1720 
1721 	/* Start processing the strings for each domain */
1722 	dom_str = strim(strsep(&tok, ";"));
1723 	id_str = strsep(&dom_str, "=");
1724 
1725 	if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
1726 		rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
1727 		return -EINVAL;
1728 	}
1729 
1730 	if (!dom_str || kstrtoul(dom_str, 16, &val)) {
1731 		rdt_last_cmd_puts("Non-numeric event configuration value\n");
1732 		return -EINVAL;
1733 	}
1734 
1735 	/* Value from user cannot be more than the supported set of events */
1736 	if ((val & r->mbm_cfg_mask) != val) {
1737 		rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
1738 				    r->mbm_cfg_mask);
1739 		return -EINVAL;
1740 	}
1741 
1742 	list_for_each_entry(d, &r->mon_domains, hdr.list) {
1743 		if (d->hdr.id == dom_id) {
1744 			mbm_config_write_domain(r, d, evtid, val);
1745 			goto next;
1746 		}
1747 	}
1748 
1749 	return -EINVAL;
1750 }
1751 
1752 static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
1753 					    char *buf, size_t nbytes,
1754 					    loff_t off)
1755 {
1756 	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1757 	int ret;
1758 
1759 	/* Valid input requires a trailing newline */
1760 	if (nbytes == 0 || buf[nbytes - 1] != '\n')
1761 		return -EINVAL;
1762 
1763 	cpus_read_lock();
1764 	mutex_lock(&rdtgroup_mutex);
1765 
1766 	rdt_last_cmd_clear();
1767 
1768 	buf[nbytes - 1] = '\0';
1769 
1770 	ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
1771 
1772 	mutex_unlock(&rdtgroup_mutex);
1773 	cpus_read_unlock();
1774 
1775 	return ret ?: nbytes;
1776 }
1777 
1778 static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
1779 					    char *buf, size_t nbytes,
1780 					    loff_t off)
1781 {
1782 	struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
1783 	int ret;
1784 
1785 	/* Valid input requires a trailing newline */
1786 	if (nbytes == 0 || buf[nbytes - 1] != '\n')
1787 		return -EINVAL;
1788 
1789 	cpus_read_lock();
1790 	mutex_lock(&rdtgroup_mutex);
1791 
1792 	rdt_last_cmd_clear();
1793 
1794 	buf[nbytes - 1] = '\0';
1795 
1796 	ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
1797 
1798 	mutex_unlock(&rdtgroup_mutex);
1799 	cpus_read_unlock();
1800 
1801 	return ret ?: nbytes;
1802 }
1803 
1804 /* rdtgroup information files for one cache resource. */
1805 static struct rftype res_common_files[] = {
1806 	{
1807 		.name		= "last_cmd_status",
1808 		.mode		= 0444,
1809 		.kf_ops		= &rdtgroup_kf_single_ops,
1810 		.seq_show	= rdt_last_cmd_status_show,
1811 		.fflags		= RFTYPE_TOP_INFO,
1812 	},
1813 	{
1814 		.name		= "num_closids",
1815 		.mode		= 0444,
1816 		.kf_ops		= &rdtgroup_kf_single_ops,
1817 		.seq_show	= rdt_num_closids_show,
1818 		.fflags		= RFTYPE_CTRL_INFO,
1819 	},
1820 	{
1821 		.name		= "mon_features",
1822 		.mode		= 0444,
1823 		.kf_ops		= &rdtgroup_kf_single_ops,
1824 		.seq_show	= rdt_mon_features_show,
1825 		.fflags		= RFTYPE_MON_INFO,
1826 	},
1827 	{
1828 		.name		= "num_rmids",
1829 		.mode		= 0444,
1830 		.kf_ops		= &rdtgroup_kf_single_ops,
1831 		.seq_show	= rdt_num_rmids_show,
1832 		.fflags		= RFTYPE_MON_INFO,
1833 	},
1834 	{
1835 		.name		= "cbm_mask",
1836 		.mode		= 0444,
1837 		.kf_ops		= &rdtgroup_kf_single_ops,
1838 		.seq_show	= rdt_default_ctrl_show,
1839 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1840 	},
1841 	{
1842 		.name		= "min_cbm_bits",
1843 		.mode		= 0444,
1844 		.kf_ops		= &rdtgroup_kf_single_ops,
1845 		.seq_show	= rdt_min_cbm_bits_show,
1846 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1847 	},
1848 	{
1849 		.name		= "shareable_bits",
1850 		.mode		= 0444,
1851 		.kf_ops		= &rdtgroup_kf_single_ops,
1852 		.seq_show	= rdt_shareable_bits_show,
1853 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1854 	},
1855 	{
1856 		.name		= "bit_usage",
1857 		.mode		= 0444,
1858 		.kf_ops		= &rdtgroup_kf_single_ops,
1859 		.seq_show	= rdt_bit_usage_show,
1860 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1861 	},
1862 	{
1863 		.name		= "min_bandwidth",
1864 		.mode		= 0444,
1865 		.kf_ops		= &rdtgroup_kf_single_ops,
1866 		.seq_show	= rdt_min_bw_show,
1867 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
1868 	},
1869 	{
1870 		.name		= "bandwidth_gran",
1871 		.mode		= 0444,
1872 		.kf_ops		= &rdtgroup_kf_single_ops,
1873 		.seq_show	= rdt_bw_gran_show,
1874 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
1875 	},
1876 	{
1877 		.name		= "delay_linear",
1878 		.mode		= 0444,
1879 		.kf_ops		= &rdtgroup_kf_single_ops,
1880 		.seq_show	= rdt_delay_linear_show,
1881 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
1882 	},
1883 	/*
1884 	 * It is platform specific which (if any) capabilities are provided by
1885 	 * thread_throttle_mode. Defer "fflags" initialization to platform
1886 	 * discovery.
1887 	 */
1888 	{
1889 		.name		= "thread_throttle_mode",
1890 		.mode		= 0444,
1891 		.kf_ops		= &rdtgroup_kf_single_ops,
1892 		.seq_show	= rdt_thread_throttle_mode_show,
1893 	},
1894 	{
1895 		.name		= "max_threshold_occupancy",
1896 		.mode		= 0644,
1897 		.kf_ops		= &rdtgroup_kf_single_ops,
1898 		.write		= max_threshold_occ_write,
1899 		.seq_show	= max_threshold_occ_show,
1900 		.fflags		= RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
1901 	},
1902 	{
1903 		.name		= "mbm_total_bytes_config",
1904 		.mode		= 0644,
1905 		.kf_ops		= &rdtgroup_kf_single_ops,
1906 		.seq_show	= mbm_total_bytes_config_show,
1907 		.write		= mbm_total_bytes_config_write,
1908 	},
1909 	{
1910 		.name		= "mbm_local_bytes_config",
1911 		.mode		= 0644,
1912 		.kf_ops		= &rdtgroup_kf_single_ops,
1913 		.seq_show	= mbm_local_bytes_config_show,
1914 		.write		= mbm_local_bytes_config_write,
1915 	},
1916 	{
1917 		.name		= "cpus",
1918 		.mode		= 0644,
1919 		.kf_ops		= &rdtgroup_kf_single_ops,
1920 		.write		= rdtgroup_cpus_write,
1921 		.seq_show	= rdtgroup_cpus_show,
1922 		.fflags		= RFTYPE_BASE,
1923 	},
1924 	{
1925 		.name		= "cpus_list",
1926 		.mode		= 0644,
1927 		.kf_ops		= &rdtgroup_kf_single_ops,
1928 		.write		= rdtgroup_cpus_write,
1929 		.seq_show	= rdtgroup_cpus_show,
1930 		.flags		= RFTYPE_FLAGS_CPUS_LIST,
1931 		.fflags		= RFTYPE_BASE,
1932 	},
1933 	{
1934 		.name		= "tasks",
1935 		.mode		= 0644,
1936 		.kf_ops		= &rdtgroup_kf_single_ops,
1937 		.write		= rdtgroup_tasks_write,
1938 		.seq_show	= rdtgroup_tasks_show,
1939 		.fflags		= RFTYPE_BASE,
1940 	},
1941 	{
1942 		.name		= "mon_hw_id",
1943 		.mode		= 0444,
1944 		.kf_ops		= &rdtgroup_kf_single_ops,
1945 		.seq_show	= rdtgroup_rmid_show,
1946 		.fflags		= RFTYPE_MON_BASE | RFTYPE_DEBUG,
1947 	},
1948 	{
1949 		.name		= "schemata",
1950 		.mode		= 0644,
1951 		.kf_ops		= &rdtgroup_kf_single_ops,
1952 		.write		= rdtgroup_schemata_write,
1953 		.seq_show	= rdtgroup_schemata_show,
1954 		.fflags		= RFTYPE_CTRL_BASE,
1955 	},
1956 	{
1957 		.name		= "mba_MBps_event",
1958 		.mode		= 0644,
1959 		.kf_ops		= &rdtgroup_kf_single_ops,
1960 		.write		= rdtgroup_mba_mbps_event_write,
1961 		.seq_show	= rdtgroup_mba_mbps_event_show,
1962 	},
1963 	{
1964 		.name		= "mode",
1965 		.mode		= 0644,
1966 		.kf_ops		= &rdtgroup_kf_single_ops,
1967 		.write		= rdtgroup_mode_write,
1968 		.seq_show	= rdtgroup_mode_show,
1969 		.fflags		= RFTYPE_CTRL_BASE,
1970 	},
1971 	{
1972 		.name		= "size",
1973 		.mode		= 0444,
1974 		.kf_ops		= &rdtgroup_kf_single_ops,
1975 		.seq_show	= rdtgroup_size_show,
1976 		.fflags		= RFTYPE_CTRL_BASE,
1977 	},
1978 	{
1979 		.name		= "sparse_masks",
1980 		.mode		= 0444,
1981 		.kf_ops		= &rdtgroup_kf_single_ops,
1982 		.seq_show	= rdt_has_sparse_bitmasks_show,
1983 		.fflags		= RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
1984 	},
1985 	{
1986 		.name		= "ctrl_hw_id",
1987 		.mode		= 0444,
1988 		.kf_ops		= &rdtgroup_kf_single_ops,
1989 		.seq_show	= rdtgroup_closid_show,
1990 		.fflags		= RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
1991 	},
1992 };
1993 
1994 static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
1995 {
1996 	struct rftype *rfts, *rft;
1997 	int ret, len;
1998 
1999 	rfts = res_common_files;
2000 	len = ARRAY_SIZE(res_common_files);
2001 
2002 	lockdep_assert_held(&rdtgroup_mutex);
2003 
2004 	if (resctrl_debug)
2005 		fflags |= RFTYPE_DEBUG;
2006 
2007 	for (rft = rfts; rft < rfts + len; rft++) {
2008 		if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
2009 			ret = rdtgroup_add_file(kn, rft);
2010 			if (ret)
2011 				goto error;
2012 		}
2013 	}
2014 
2015 	return 0;
2016 error:
2017 	pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
2018 	while (--rft >= rfts) {
2019 		if ((fflags & rft->fflags) == rft->fflags)
2020 			kernfs_remove_by_name(kn, rft->name);
2021 	}
2022 	return ret;
2023 }
2024 
2025 static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
2026 {
2027 	struct rftype *rfts, *rft;
2028 	int len;
2029 
2030 	rfts = res_common_files;
2031 	len = ARRAY_SIZE(res_common_files);
2032 
2033 	for (rft = rfts; rft < rfts + len; rft++) {
2034 		if (!strcmp(rft->name, name))
2035 			return rft;
2036 	}
2037 
2038 	return NULL;
2039 }
2040 
2041 static void thread_throttle_mode_init(void)
2042 {
2043 	enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED;
2044 	struct rdt_resource *r_mba, *r_smba;
2045 
2046 	r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
2047 	if (r_mba->alloc_capable &&
2048 	    r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
2049 		throttle_mode = r_mba->membw.throttle_mode;
2050 
2051 	r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA);
2052 	if (r_smba->alloc_capable &&
2053 	    r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
2054 		throttle_mode = r_smba->membw.throttle_mode;
2055 
2056 	if (throttle_mode == THREAD_THROTTLE_UNDEFINED)
2057 		return;
2058 
2059 	resctrl_file_fflags_init("thread_throttle_mode",
2060 				 RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
2061 }
2062 
2063 void resctrl_file_fflags_init(const char *config, unsigned long fflags)
2064 {
2065 	struct rftype *rft;
2066 
2067 	rft = rdtgroup_get_rftype_by_name(config);
2068 	if (rft)
2069 		rft->fflags = fflags;
2070 }
2071 
2072 /**
2073  * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
2074  * @r: The resource group with which the file is associated.
2075  * @name: Name of the file
2076  *
 * The permissions of the named resctrl file, directory, or link are modified
2078  * to not allow read, write, or execute by any user.
2079  *
2080  * WARNING: This function is intended to communicate to the user that the
2081  * resctrl file has been locked down - that it is not relevant to the
2082  * particular state the system finds itself in. It should not be relied
2083  * on to protect from user access because after the file's permissions
2084  * are restricted the user can still change the permissions using chmod
2085  * from the command line.
2086  *
2087  * Return: 0 on success, <0 on failure.
2088  */
2089 int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
2090 {
2091 	struct iattr iattr = {.ia_valid = ATTR_MODE,};
2092 	struct kernfs_node *kn;
2093 	int ret = 0;
2094 
2095 	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
2096 	if (!kn)
2097 		return -ENOENT;
2098 
2099 	switch (kernfs_type(kn)) {
2100 	case KERNFS_DIR:
2101 		iattr.ia_mode = S_IFDIR;
2102 		break;
2103 	case KERNFS_FILE:
2104 		iattr.ia_mode = S_IFREG;
2105 		break;
2106 	case KERNFS_LINK:
2107 		iattr.ia_mode = S_IFLNK;
2108 		break;
2109 	}
2110 
2111 	ret = kernfs_setattr(kn, &iattr);
2112 	kernfs_put(kn);
2113 	return ret;
2114 }
2115 
2116 /**
2117  * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
2118  * @r: The resource group with which the file is associated.
2119  * @name: Name of the file
2120  * @mask: Mask of permissions that should be restored
2121  *
2122  * Restore the permissions of the named file. If @name is a directory the
2123  * permissions of its parent will be used.
2124  *
2125  * Return: 0 on success, <0 on failure.
2126  */
2127 int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
2128 			     umode_t mask)
2129 {
2130 	struct iattr iattr = {.ia_valid = ATTR_MODE,};
2131 	struct kernfs_node *kn, *parent;
2132 	struct rftype *rfts, *rft;
2133 	int ret, len;
2134 
2135 	rfts = res_common_files;
2136 	len = ARRAY_SIZE(res_common_files);
2137 
2138 	for (rft = rfts; rft < rfts + len; rft++) {
2139 		if (!strcmp(rft->name, name))
2140 			iattr.ia_mode = rft->mode & mask;
2141 	}
2142 
2143 	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
2144 	if (!kn)
2145 		return -ENOENT;
2146 
2147 	switch (kernfs_type(kn)) {
2148 	case KERNFS_DIR:
2149 		parent = kernfs_get_parent(kn);
2150 		if (parent) {
2151 			iattr.ia_mode |= parent->mode;
2152 			kernfs_put(parent);
2153 		}
2154 		iattr.ia_mode |= S_IFDIR;
2155 		break;
2156 	case KERNFS_FILE:
2157 		iattr.ia_mode |= S_IFREG;
2158 		break;
2159 	case KERNFS_LINK:
2160 		iattr.ia_mode |= S_IFLNK;
2161 		break;
2162 	}
2163 
2164 	ret = kernfs_setattr(kn, &iattr);
2165 	kernfs_put(kn);
2166 	return ret;
2167 }
2168 
2169 static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
2170 				      unsigned long fflags)
2171 {
2172 	struct kernfs_node *kn_subdir;
2173 	int ret;
2174 
2175 	kn_subdir = kernfs_create_dir(kn_info, name,
2176 				      kn_info->mode, priv);
2177 	if (IS_ERR(kn_subdir))
2178 		return PTR_ERR(kn_subdir);
2179 
2180 	ret = rdtgroup_kn_set_ugid(kn_subdir);
2181 	if (ret)
2182 		return ret;
2183 
2184 	ret = rdtgroup_add_files(kn_subdir, fflags);
2185 	if (!ret)
2186 		kernfs_activate(kn_subdir);
2187 
2188 	return ret;
2189 }
2190 
2191 static unsigned long fflags_from_resource(struct rdt_resource *r)
2192 {
2193 	switch (r->rid) {
2194 	case RDT_RESOURCE_L3:
2195 	case RDT_RESOURCE_L2:
2196 		return RFTYPE_RES_CACHE;
2197 	case RDT_RESOURCE_MBA:
2198 	case RDT_RESOURCE_SMBA:
2199 		return RFTYPE_RES_MB;
2200 	}
2201 
2202 	return WARN_ON_ONCE(1);
2203 }
2204 
2205 static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
2206 {
2207 	struct resctrl_schema *s;
2208 	struct rdt_resource *r;
2209 	unsigned long fflags;
2210 	char name[32];
2211 	int ret;
2212 
2213 	/* create the directory */
2214 	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
2215 	if (IS_ERR(kn_info))
2216 		return PTR_ERR(kn_info);
2217 
2218 	ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
2219 	if (ret)
2220 		goto out_destroy;
2221 
2222 	/* loop over enabled controls, these are all alloc_capable */
2223 	list_for_each_entry(s, &resctrl_schema_all, list) {
2224 		r = s->res;
2225 		fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO;
2226 		ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
2227 		if (ret)
2228 			goto out_destroy;
2229 	}
2230 
2231 	for_each_mon_capable_rdt_resource(r) {
2232 		fflags = fflags_from_resource(r) | RFTYPE_MON_INFO;
2233 		sprintf(name, "%s_MON", r->name);
2234 		ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
2235 		if (ret)
2236 			goto out_destroy;
2237 	}
2238 
2239 	ret = rdtgroup_kn_set_ugid(kn_info);
2240 	if (ret)
2241 		goto out_destroy;
2242 
2243 	kernfs_activate(kn_info);
2244 
2245 	return 0;
2246 
2247 out_destroy:
2248 	kernfs_remove(kn_info);
2249 	return ret;
2250 }
2251 
2252 static int
2253 mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
2254 		    char *name, struct kernfs_node **dest_kn)
2255 {
2256 	struct kernfs_node *kn;
2257 	int ret;
2258 
2259 	/* create the directory */
2260 	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
2261 	if (IS_ERR(kn))
2262 		return PTR_ERR(kn);
2263 
2264 	if (dest_kn)
2265 		*dest_kn = kn;
2266 
2267 	ret = rdtgroup_kn_set_ugid(kn);
2268 	if (ret)
2269 		goto out_destroy;
2270 
2271 	kernfs_activate(kn);
2272 
2273 	return 0;
2274 
2275 out_destroy:
2276 	kernfs_remove(kn);
2277 	return ret;
2278 }
2279 
2280 static inline bool is_mba_linear(void)
2281 {
2282 	return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear;
2283 }
2284 
2285 static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d)
2286 {
2287 	u32 num_closid = resctrl_arch_get_num_closid(r);
2288 	int cpu = cpumask_any(&d->hdr.cpu_mask);
2289 	int i;
2290 
2291 	d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
2292 				   GFP_KERNEL, cpu_to_node(cpu));
2293 	if (!d->mbps_val)
2294 		return -ENOMEM;
2295 
2296 	for (i = 0; i < num_closid; i++)
2297 		d->mbps_val[i] = MBA_MAX_MBPS;
2298 
2299 	return 0;
2300 }
2301 
2302 static void mba_sc_domain_destroy(struct rdt_resource *r,
2303 				  struct rdt_ctrl_domain *d)
2304 {
2305 	kfree(d->mbps_val);
2306 	d->mbps_val = NULL;
2307 }
2308 
2309 /*
 * The MBA software controller is supported only if MBM is supported,
 * MBA is in linear scale, and the MBM monitor scope is the same as
 * the MBA control scope.
2314  */
2315 static bool supports_mba_mbps(void)
2316 {
2317 	struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3);
2318 	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
2319 
2320 	return (resctrl_is_mbm_enabled() &&
2321 		r->alloc_capable && is_mba_linear() &&
2322 		r->ctrl_scope == rmbm->mon_scope);
2323 }
2324 
2325 /*
 * Enable or disable the MBA software controller, which lets the user
 * specify bandwidth in MBps instead of a percentage.
2328  */
2329 static int set_mba_sc(bool mba_sc)
2330 {
2331 	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
2332 	u32 num_closid = resctrl_arch_get_num_closid(r);
2333 	struct rdt_ctrl_domain *d;
2334 	unsigned long fflags;
2335 	int i;
2336 
2337 	if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
2338 		return -EINVAL;
2339 
2340 	r->membw.mba_sc = mba_sc;
2341 
2342 	rdtgroup_default.mba_mbps_event = mba_mbps_default_event;
2343 
2344 	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
2345 		for (i = 0; i < num_closid; i++)
2346 			d->mbps_val[i] = MBA_MAX_MBPS;
2347 	}
2348 
2349 	fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0;
2350 	resctrl_file_fflags_init("mba_MBps_event", fflags);
2351 
2352 	return 0;
2353 }
2354 
2355 /*
2356  * We don't allow rdtgroup directories to be created anywhere
2357  * except the root directory. Thus when looking for the rdtgroup
2358  * structure for a kernfs node we are either looking at a directory,
2359  * in which case the rdtgroup structure is pointed at by the "priv"
 * field, or at a file, in which case we need only look to the parent
2361  * to find the rdtgroup.
2362  */
2363 static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
2364 {
2365 	if (kernfs_type(kn) == KERNFS_DIR) {
2366 		/*
2367 		 * All the resource directories use "kn->priv"
2368 		 * to point to the "struct rdtgroup" for the
2369 		 * resource. "info" and its subdirectories don't
2370 		 * have rdtgroup structures, so return NULL here.
2371 		 */
2372 		if (kn == kn_info ||
2373 		    rcu_access_pointer(kn->__parent) == kn_info)
2374 			return NULL;
2375 		else
2376 			return kn->priv;
2377 	} else {
2378 		return rdt_kn_parent_priv(kn);
2379 	}
2380 }
2381 
2382 static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
2383 {
2384 	atomic_inc(&rdtgrp->waitcount);
2385 	kernfs_break_active_protection(kn);
2386 }
2387 
2388 static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
2389 {
2390 	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
2391 	    (rdtgrp->flags & RDT_DELETED)) {
2392 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2393 		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2394 			rdtgroup_pseudo_lock_remove(rdtgrp);
2395 		kernfs_unbreak_active_protection(kn);
2396 		rdtgroup_remove(rdtgrp);
2397 	} else {
2398 		kernfs_unbreak_active_protection(kn);
2399 	}
2400 }
2401 
2402 struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
2403 {
2404 	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
2405 
2406 	if (!rdtgrp)
2407 		return NULL;
2408 
2409 	rdtgroup_kn_get(rdtgrp, kn);
2410 
2411 	cpus_read_lock();
2412 	mutex_lock(&rdtgroup_mutex);
2413 
2414 	/* Was this group deleted while we waited? */
2415 	if (rdtgrp->flags & RDT_DELETED)
2416 		return NULL;
2417 
2418 	return rdtgrp;
2419 }
2420 
2421 void rdtgroup_kn_unlock(struct kernfs_node *kn)
2422 {
2423 	struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
2424 
2425 	if (!rdtgrp)
2426 		return;
2427 
2428 	mutex_unlock(&rdtgroup_mutex);
2429 	cpus_read_unlock();
2430 
2431 	rdtgroup_kn_put(rdtgrp, kn);
2432 }
2433 
2434 static int mkdir_mondata_all(struct kernfs_node *parent_kn,
2435 			     struct rdtgroup *prgrp,
2436 			     struct kernfs_node **mon_data_kn);
2437 
2438 static void rdt_disable_ctx(void)
2439 {
2440 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
2441 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
2442 	set_mba_sc(false);
2443 
2444 	resctrl_debug = false;
2445 }
2446 
2447 static int rdt_enable_ctx(struct rdt_fs_context *ctx)
2448 {
2449 	int ret = 0;
2450 
2451 	if (ctx->enable_cdpl2) {
2452 		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
2453 		if (ret)
2454 			goto out_done;
2455 	}
2456 
2457 	if (ctx->enable_cdpl3) {
2458 		ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
2459 		if (ret)
2460 			goto out_cdpl2;
2461 	}
2462 
2463 	if (ctx->enable_mba_mbps) {
2464 		ret = set_mba_sc(true);
2465 		if (ret)
2466 			goto out_cdpl3;
2467 	}
2468 
2469 	if (ctx->enable_debug)
2470 		resctrl_debug = true;
2471 
2472 	return 0;
2473 
2474 out_cdpl3:
2475 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
2476 out_cdpl2:
2477 	resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
2478 out_done:
2479 	return ret;
2480 }
2481 
2482 static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
2483 {
2484 	struct resctrl_schema *s;
2485 	const char *suffix = "";
2486 	int ret, cl;
2487 
2488 	s = kzalloc(sizeof(*s), GFP_KERNEL);
2489 	if (!s)
2490 		return -ENOMEM;
2491 
2492 	s->res = r;
2493 	s->num_closid = resctrl_arch_get_num_closid(r);
2494 	if (resctrl_arch_get_cdp_enabled(r->rid))
2495 		s->num_closid /= 2;
2496 
2497 	s->conf_type = type;
2498 	switch (type) {
2499 	case CDP_CODE:
2500 		suffix = "CODE";
2501 		break;
2502 	case CDP_DATA:
2503 		suffix = "DATA";
2504 		break;
2505 	case CDP_NONE:
2506 		suffix = "";
2507 		break;
2508 	}
2509 
2510 	ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix);
2511 	if (ret >= sizeof(s->name)) {
2512 		kfree(s);
2513 		return -EINVAL;
2514 	}
2515 
2516 	cl = strlen(s->name);
2517 
2518 	/*
2519 	 * If CDP is supported by this resource, but not enabled,
2520 	 * include the suffix. This ensures the tabular format of the
2521 	 * schemata file does not change between mounts of the filesystem.
2522 	 */
2523 	if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
2524 		cl += 4;
2525 
2526 	if (cl > max_name_width)
2527 		max_name_width = cl;
2528 
2529 	switch (r->schema_fmt) {
2530 	case RESCTRL_SCHEMA_BITMAP:
2531 		s->fmt_str = "%d=%x";
2532 		break;
2533 	case RESCTRL_SCHEMA_RANGE:
2534 		s->fmt_str = "%d=%u";
2535 		break;
2536 	}
2537 
2538 	if (WARN_ON_ONCE(!s->fmt_str)) {
2539 		kfree(s);
2540 		return -EINVAL;
2541 	}
2542 
2543 	INIT_LIST_HEAD(&s->list);
2544 	list_add(&s->list, &resctrl_schema_all);
2545 
2546 	return 0;
2547 }
2548 
2549 static int schemata_list_create(void)
2550 {
2551 	struct rdt_resource *r;
2552 	int ret = 0;
2553 
2554 	for_each_alloc_capable_rdt_resource(r) {
2555 		if (resctrl_arch_get_cdp_enabled(r->rid)) {
2556 			ret = schemata_list_add(r, CDP_CODE);
2557 			if (ret)
2558 				break;
2559 
2560 			ret = schemata_list_add(r, CDP_DATA);
2561 		} else {
2562 			ret = schemata_list_add(r, CDP_NONE);
2563 		}
2564 
2565 		if (ret)
2566 			break;
2567 	}
2568 
2569 	return ret;
2570 }
2571 
2572 static void schemata_list_destroy(void)
2573 {
2574 	struct resctrl_schema *s, *tmp;
2575 
2576 	list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) {
2577 		list_del(&s->list);
2578 		kfree(s);
2579 	}
2580 }
2581 
2582 static int rdt_get_tree(struct fs_context *fc)
2583 {
2584 	struct rdt_fs_context *ctx = rdt_fc2context(fc);
2585 	unsigned long flags = RFTYPE_CTRL_BASE;
2586 	struct rdt_mon_domain *dom;
2587 	struct rdt_resource *r;
2588 	int ret;
2589 
2590 	cpus_read_lock();
2591 	mutex_lock(&rdtgroup_mutex);
2592 	/*
2593 	 * resctrl file system can only be mounted once.
2594 	 */
2595 	if (resctrl_mounted) {
2596 		ret = -EBUSY;
2597 		goto out;
2598 	}
2599 
2600 	ret = rdtgroup_setup_root(ctx);
2601 	if (ret)
2602 		goto out;
2603 
2604 	ret = rdt_enable_ctx(ctx);
2605 	if (ret)
2606 		goto out_root;
2607 
2608 	ret = schemata_list_create();
2609 	if (ret) {
2610 		schemata_list_destroy();
2611 		goto out_ctx;
2612 	}
2613 
2614 	ret = closid_init();
2615 	if (ret)
2616 		goto out_schemata_free;
2617 
2618 	if (resctrl_arch_mon_capable())
2619 		flags |= RFTYPE_MON;
2620 
2621 	ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
2622 	if (ret)
2623 		goto out_closid_exit;
2624 
2625 	kernfs_activate(rdtgroup_default.kn);
2626 
2627 	ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
2628 	if (ret < 0)
2629 		goto out_closid_exit;
2630 
2631 	if (resctrl_arch_mon_capable()) {
2632 		ret = mongroup_create_dir(rdtgroup_default.kn,
2633 					  &rdtgroup_default, "mon_groups",
2634 					  &kn_mongrp);
2635 		if (ret < 0)
2636 			goto out_info;
2637 
2638 		ret = mkdir_mondata_all(rdtgroup_default.kn,
2639 					&rdtgroup_default, &kn_mondata);
2640 		if (ret < 0)
2641 			goto out_mongrp;
2642 		rdtgroup_default.mon.mon_data_kn = kn_mondata;
2643 	}
2644 
2645 	ret = rdt_pseudo_lock_init();
2646 	if (ret)
2647 		goto out_mondata;
2648 
2649 	ret = kernfs_get_tree(fc);
2650 	if (ret < 0)
2651 		goto out_psl;
2652 
2653 	if (resctrl_arch_alloc_capable())
2654 		resctrl_arch_enable_alloc();
2655 	if (resctrl_arch_mon_capable())
2656 		resctrl_arch_enable_mon();
2657 
2658 	if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
2659 		resctrl_mounted = true;
2660 
2661 	if (resctrl_is_mbm_enabled()) {
2662 		r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
2663 		list_for_each_entry(dom, &r->mon_domains, hdr.list)
2664 			mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
2665 						   RESCTRL_PICK_ANY_CPU);
2666 	}
2667 
2668 	goto out;
2669 
2670 out_psl:
2671 	rdt_pseudo_lock_release();
2672 out_mondata:
2673 	if (resctrl_arch_mon_capable())
2674 		kernfs_remove(kn_mondata);
2675 out_mongrp:
2676 	if (resctrl_arch_mon_capable())
2677 		kernfs_remove(kn_mongrp);
2678 out_info:
2679 	kernfs_remove(kn_info);
2680 out_closid_exit:
2681 	closid_exit();
2682 out_schemata_free:
2683 	schemata_list_destroy();
2684 out_ctx:
2685 	rdt_disable_ctx();
2686 out_root:
2687 	rdtgroup_destroy_root();
2688 out:
2689 	rdt_last_cmd_clear();
2690 	mutex_unlock(&rdtgroup_mutex);
2691 	cpus_read_unlock();
2692 	return ret;
2693 }
2694 
2695 enum rdt_param {
2696 	Opt_cdp,
2697 	Opt_cdpl2,
2698 	Opt_mba_mbps,
2699 	Opt_debug,
2700 	nr__rdt_params
2701 };
2702 
2703 static const struct fs_parameter_spec rdt_fs_parameters[] = {
2704 	fsparam_flag("cdp",		Opt_cdp),
2705 	fsparam_flag("cdpl2",		Opt_cdpl2),
2706 	fsparam_flag("mba_MBps",	Opt_mba_mbps),
2707 	fsparam_flag("debug",		Opt_debug),
2708 	{}
2709 };
2710 
2711 static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
2712 {
2713 	struct rdt_fs_context *ctx = rdt_fc2context(fc);
2714 	struct fs_parse_result result;
2715 	const char *msg;
2716 	int opt;
2717 
2718 	opt = fs_parse(fc, rdt_fs_parameters, param, &result);
2719 	if (opt < 0)
2720 		return opt;
2721 
2722 	switch (opt) {
2723 	case Opt_cdp:
2724 		ctx->enable_cdpl3 = true;
2725 		return 0;
2726 	case Opt_cdpl2:
2727 		ctx->enable_cdpl2 = true;
2728 		return 0;
2729 	case Opt_mba_mbps:
2730 		msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
2731 		if (!supports_mba_mbps())
2732 			return invalfc(fc, msg);
2733 		ctx->enable_mba_mbps = true;
2734 		return 0;
2735 	case Opt_debug:
2736 		ctx->enable_debug = true;
2737 		return 0;
2738 	}
2739 
2740 	return -EINVAL;
2741 }
2742 
2743 static void rdt_fs_context_free(struct fs_context *fc)
2744 {
2745 	struct rdt_fs_context *ctx = rdt_fc2context(fc);
2746 
2747 	kernfs_free_fs_context(fc);
2748 	kfree(ctx);
2749 }
2750 
2751 static const struct fs_context_operations rdt_fs_context_ops = {
2752 	.free		= rdt_fs_context_free,
2753 	.parse_param	= rdt_parse_param,
2754 	.get_tree	= rdt_get_tree,
2755 };
2756 
2757 static int rdt_init_fs_context(struct fs_context *fc)
2758 {
2759 	struct rdt_fs_context *ctx;
2760 
2761 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
2762 	if (!ctx)
2763 		return -ENOMEM;
2764 
2765 	ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
2766 	fc->fs_private = &ctx->kfc;
2767 	fc->ops = &rdt_fs_context_ops;
2768 	put_user_ns(fc->user_ns);
2769 	fc->user_ns = get_user_ns(&init_user_ns);
2770 	fc->global = true;
2771 	return 0;
2772 }
2773 
2774 /*
2775  * Move tasks from one to the other group. If @from is NULL, then all tasks
 * in the system are moved unconditionally (used for teardown).
2777  *
2778  * If @mask is not NULL the cpus on which moved tasks are running are set
2779  * in that mask so the update smp function call is restricted to affected
2780  * cpus.
2781  */
2782 static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
2783 				 struct cpumask *mask)
2784 {
2785 	struct task_struct *p, *t;
2786 
2787 	read_lock(&tasklist_lock);
2788 	for_each_process_thread(p, t) {
2789 		if (!from || is_closid_match(t, from) ||
2790 		    is_rmid_match(t, from)) {
2791 			resctrl_arch_set_closid_rmid(t, to->closid,
2792 						     to->mon.rmid);
2793 
2794 			/*
2795 			 * Order the closid/rmid stores above before the loads
2796 			 * in task_curr(). This pairs with the full barrier
2797 			 * between the rq->curr update and
2798 			 * resctrl_arch_sched_in() during context switch.
2799 			 */
2800 			smp_mb();
2801 
2802 			/*
2803 			 * If the task is on a CPU, set the CPU in the mask.
2804 			 * The detection is inaccurate as tasks might move or
2805 			 * schedule before the smp function call takes place.
2806 			 * In such a case the function call is pointless, but
2807 			 * there is no other side effect.
2808 			 */
2809 			if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
2810 				cpumask_set_cpu(task_cpu(t), mask);
2811 		}
2812 	}
2813 	read_unlock(&tasklist_lock);
2814 }
2815 
2816 static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
2817 {
2818 	struct rdtgroup *sentry, *stmp;
2819 	struct list_head *head;
2820 
2821 	head = &rdtgrp->mon.crdtgrp_list;
2822 	list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
2823 		free_rmid(sentry->closid, sentry->mon.rmid);
2824 		list_del(&sentry->mon.crdtgrp_list);
2825 
2826 		if (atomic_read(&sentry->waitcount) != 0)
2827 			sentry->flags = RDT_DELETED;
2828 		else
2829 			rdtgroup_remove(sentry);
2830 	}
2831 }
2832 
2833 /*
2834  * Forcibly remove all of subdirectories under root.
2835  */
2836 static void rmdir_all_sub(void)
2837 {
2838 	struct rdtgroup *rdtgrp, *tmp;
2839 
2840 	/* Move all tasks to the default resource group */
2841 	rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
2842 
2843 	list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
2844 		/* Free any child rmids */
2845 		free_all_child_rdtgrp(rdtgrp);
2846 
2847 		/* Remove each rdtgroup other than root */
2848 		if (rdtgrp == &rdtgroup_default)
2849 			continue;
2850 
2851 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
2852 		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
2853 			rdtgroup_pseudo_lock_remove(rdtgrp);
2854 
2855 		/*
2856 		 * Give any CPUs back to the default group. We cannot copy
2857 		 * cpu_online_mask because a CPU might have executed the
2858 		 * offline callback already, but is still marked online.
2859 		 */
2860 		cpumask_or(&rdtgroup_default.cpu_mask,
2861 			   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
2862 
2863 		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
2864 
2865 		kernfs_remove(rdtgrp->kn);
2866 		list_del(&rdtgrp->rdtgroup_list);
2867 
2868 		if (atomic_read(&rdtgrp->waitcount) != 0)
2869 			rdtgrp->flags = RDT_DELETED;
2870 		else
2871 			rdtgroup_remove(rdtgrp);
2872 	}
2873 	/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
2874 	update_closid_rmid(cpu_online_mask, &rdtgroup_default);
2875 
2876 	kernfs_remove(kn_info);
2877 	kernfs_remove(kn_mongrp);
2878 	kernfs_remove(kn_mondata);
2879 }
2880 
2881 /**
2882  * mon_get_kn_priv() - Get the mon_data priv data for this event.
2883  *
2884  * The same values are used across the mon_data directories of all control and
2885  * monitor groups for the same event in the same domain. Keep a list of
2886  * allocated structures and re-use an existing one with the same values for
2887  * @rid, @domid, etc.
2888  *
2889  * @rid:    The resource id for the event file being created.
2890  * @domid:  The domain id for the event file being created.
2891  * @mevt:   The type of event file being created.
2892  * @do_sum: Whether SNC summing monitors are being created.
2893  */
2894 static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid,
2895 					struct mon_evt *mevt,
2896 					bool do_sum)
2897 {
2898 	struct mon_data *priv;
2899 
2900 	lockdep_assert_held(&rdtgroup_mutex);
2901 
2902 	list_for_each_entry(priv, &mon_data_kn_priv_list, list) {
2903 		if (priv->rid == rid && priv->domid == domid &&
2904 		    priv->sum == do_sum && priv->evtid == mevt->evtid)
2905 			return priv;
2906 	}
2907 
2908 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
2909 	if (!priv)
2910 		return NULL;
2911 
2912 	priv->rid = rid;
2913 	priv->domid = domid;
2914 	priv->sum = do_sum;
2915 	priv->evtid = mevt->evtid;
2916 	list_add_tail(&priv->list, &mon_data_kn_priv_list);
2917 
2918 	return priv;
2919 }
2920 
2921 /**
2922  * mon_put_kn_priv() - Free all allocated mon_data structures.
2923  *
2924  * Called when resctrl file system is unmounted.
2925  */
2926 static void mon_put_kn_priv(void)
2927 {
2928 	struct mon_data *priv, *tmp;
2929 
2930 	lockdep_assert_held(&rdtgroup_mutex);
2931 
2932 	list_for_each_entry_safe(priv, tmp, &mon_data_kn_priv_list, list) {
2933 		list_del(&priv->list);
2934 		kfree(priv);
2935 	}
2936 }
2937 
2938 static void resctrl_fs_teardown(void)
2939 {
2940 	lockdep_assert_held(&rdtgroup_mutex);
2941 
2942 	/* Cleared by rdtgroup_destroy_root() */
2943 	if (!rdtgroup_default.kn)
2944 		return;
2945 
2946 	rmdir_all_sub();
2947 	mon_put_kn_priv();
2948 	rdt_pseudo_lock_release();
2949 	rdtgroup_default.mode = RDT_MODE_SHAREABLE;
2950 	closid_exit();
2951 	schemata_list_destroy();
2952 	rdtgroup_destroy_root();
2953 }
2954 
2955 static void rdt_kill_sb(struct super_block *sb)
2956 {
2957 	struct rdt_resource *r;
2958 
2959 	cpus_read_lock();
2960 	mutex_lock(&rdtgroup_mutex);
2961 
2962 	rdt_disable_ctx();
2963 
2964 	/* Put everything back to default values. */
2965 	for_each_alloc_capable_rdt_resource(r)
2966 		resctrl_arch_reset_all_ctrls(r);
2967 
2968 	resctrl_fs_teardown();
2969 	if (resctrl_arch_alloc_capable())
2970 		resctrl_arch_disable_alloc();
2971 	if (resctrl_arch_mon_capable())
2972 		resctrl_arch_disable_mon();
2973 	resctrl_mounted = false;
2974 	kernfs_kill_sb(sb);
2975 	mutex_unlock(&rdtgroup_mutex);
2976 	cpus_read_unlock();
2977 }
2978 
2979 static struct file_system_type rdt_fs_type = {
2980 	.name			= "resctrl",
2981 	.init_fs_context	= rdt_init_fs_context,
2982 	.parameters		= rdt_fs_parameters,
2983 	.kill_sb		= rdt_kill_sb,
2984 };
2985 
2986 static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
2987 		       void *priv)
2988 {
2989 	struct kernfs_node *kn;
2990 	int ret = 0;
2991 
2992 	kn = __kernfs_create_file(parent_kn, name, 0444,
2993 				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
2994 				  &kf_mondata_ops, priv, NULL, NULL);
2995 	if (IS_ERR(kn))
2996 		return PTR_ERR(kn);
2997 
2998 	ret = rdtgroup_kn_set_ugid(kn);
2999 	if (ret) {
3000 		kernfs_remove(kn);
3001 		return ret;
3002 	}
3003 
3004 	return ret;
3005 }
3006 
3007 static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname)
3008 {
3009 	struct kernfs_node *kn;
3010 
3011 	kn = kernfs_find_and_get(pkn, name);
3012 	if (!kn)
3013 		return;
3014 	kernfs_put(kn);
3015 
3016 	if (kn->dir.subdirs <= 1)
3017 		kernfs_remove(kn);
3018 	else
3019 		kernfs_remove_by_name(kn, subname);
3020 }
3021 
3022 /*
3023  * Remove all subdirectories of mon_data of ctrl_mon groups
3024  * and monitor groups for the given domain.
 * Remove the files and directories containing the "sum" of domain data
 * when the last domain being summed is removed.
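 *
 * e.g. in SNC mode the per-L3-cache directory "mon_L3_00" contains one
 * "mon_sub_L3_NN" directory per SNC node; the parent directory itself is
 * only removed when its last "mon_sub" child is being removed.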
3027  */
3028 static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
3029 					   struct rdt_mon_domain *d)
3030 {
3031 	struct rdtgroup *prgrp, *crgrp;
3032 	char subname[32];
3033 	bool snc_mode;
3034 	char name[32];
3035 
3036 	snc_mode = r->mon_scope == RESCTRL_L3_NODE;
3037 	sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
3038 	if (snc_mode)
3039 		sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
3040 
3041 	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
3042 		mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname);
3043 
3044 		list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
3045 			mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname);
3046 	}
3047 }
3048 
3049 static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
3050 			     struct rdt_resource *r, struct rdtgroup *prgrp,
3051 			     bool do_sum)
3052 {
3053 	struct rmid_read rr = {0};
3054 	struct mon_data *priv;
3055 	struct mon_evt *mevt;
3056 	int ret, domid;
3057 
3058 	if (WARN_ON(list_empty(&r->evt_list)))
3059 		return -EPERM;
3060 
3061 	list_for_each_entry(mevt, &r->evt_list, list) {
3062 		domid = do_sum ? d->ci->id : d->hdr.id;
3063 		priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum);
3064 		if (WARN_ON_ONCE(!priv))
3065 			return -EINVAL;
3066 
3067 		ret = mon_addfile(kn, mevt->name, priv);
3068 		if (ret)
3069 			return ret;
3070 
3071 		if (!do_sum && resctrl_is_mbm_event(mevt->evtid))
3072 			mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
3073 	}
3074 
3075 	return 0;
3076 }
3077 
3078 static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
3079 				struct rdt_mon_domain *d,
3080 				struct rdt_resource *r, struct rdtgroup *prgrp)
3081 {
3082 	struct kernfs_node *kn, *ckn;
3083 	char name[32];
3084 	bool snc_mode;
3085 	int ret = 0;
3086 
3087 	lockdep_assert_held(&rdtgroup_mutex);
3088 
3089 	snc_mode = r->mon_scope == RESCTRL_L3_NODE;
3090 	sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id);
3091 	kn = kernfs_find_and_get(parent_kn, name);
3092 	if (kn) {
3093 		/*
3094 		 * rdtgroup_mutex will prevent this directory from being
3095 		 * removed. No need to keep this hold.
3096 		 */
3097 		kernfs_put(kn);
3098 	} else {
3099 		kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
3100 		if (IS_ERR(kn))
3101 			return PTR_ERR(kn);
3102 
3103 		ret = rdtgroup_kn_set_ugid(kn);
3104 		if (ret)
3105 			goto out_destroy;
3106 		ret = mon_add_all_files(kn, d, r, prgrp, snc_mode);
3107 		if (ret)
3108 			goto out_destroy;
3109 	}
3110 
3111 	if (snc_mode) {
3112 		sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id);
3113 		ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp);
3114 		if (IS_ERR(ckn)) {
3115 			ret = -EINVAL;
3116 			goto out_destroy;
3117 		}
3118 
3119 		ret = rdtgroup_kn_set_ugid(ckn);
3120 		if (ret)
3121 			goto out_destroy;
3122 
3123 		ret = mon_add_all_files(ckn, d, r, prgrp, false);
3124 		if (ret)
3125 			goto out_destroy;
3126 	}
3127 
3128 	kernfs_activate(kn);
3129 	return 0;
3130 
3131 out_destroy:
3132 	kernfs_remove(kn);
3133 	return ret;
3134 }
3135 
3136 /*
3137  * Add all subdirectories of mon_data for "ctrl_mon" groups
3138  * and "monitor" groups with given domain id.
3139  */
3140 static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
3141 					   struct rdt_mon_domain *d)
3142 {
3143 	struct kernfs_node *parent_kn;
3144 	struct rdtgroup *prgrp, *crgrp;
3145 	struct list_head *head;
3146 
3147 	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
3148 		parent_kn = prgrp->mon.mon_data_kn;
3149 		mkdir_mondata_subdir(parent_kn, d, r, prgrp);
3150 
3151 		head = &prgrp->mon.crdtgrp_list;
3152 		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
3153 			parent_kn = crgrp->mon.mon_data_kn;
3154 			mkdir_mondata_subdir(parent_kn, d, r, crgrp);
3155 		}
3156 	}
3157 }
3158 
3159 static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
3160 				       struct rdt_resource *r,
3161 				       struct rdtgroup *prgrp)
3162 {
3163 	struct rdt_mon_domain *dom;
3164 	int ret;
3165 
3166 	/* Walking r->domains, ensure it can't race with cpuhp */
3167 	lockdep_assert_cpus_held();
3168 
3169 	list_for_each_entry(dom, &r->mon_domains, hdr.list) {
3170 		ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
3171 		if (ret)
3172 			return ret;
3173 	}
3174 
3175 	return 0;
3176 }
3177 
3178 /*
3179  * This creates a directory mon_data which contains the monitored data.
3180  *
 * mon_data has one directory for each domain, named in the format
 * mon_<domain_name>_<domain_id>. For example, mon_data for an L3
 * resource looks as below:
3184  * ./mon_data:
3185  * mon_L3_00
3186  * mon_L3_01
3187  * mon_L3_02
3188  * ...
3189  *
3190  * Each domain directory has one file per event:
3191  * ./mon_L3_00/:
3192  * llc_occupancy
3193  *
3194  */
3195 static int mkdir_mondata_all(struct kernfs_node *parent_kn,
3196 			     struct rdtgroup *prgrp,
3197 			     struct kernfs_node **dest_kn)
3198 {
3199 	struct rdt_resource *r;
3200 	struct kernfs_node *kn;
3201 	int ret;
3202 
3203 	/*
3204 	 * Create the mon_data directory first.
3205 	 */
3206 	ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
3207 	if (ret)
3208 		return ret;
3209 
3210 	if (dest_kn)
3211 		*dest_kn = kn;
3212 
3213 	/*
3214 	 * Create the subdirectories for each domain. Note that all events
	 * in a domain like L3 are grouped into a resource whose domain is L3.
3216 	 */
3217 	for_each_mon_capable_rdt_resource(r) {
3218 		ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
3219 		if (ret)
3220 			goto out_destroy;
3221 	}
3222 
3223 	return 0;
3224 
3225 out_destroy:
3226 	kernfs_remove(kn);
3227 	return ret;
3228 }
3229 
3230 /**
3231  * cbm_ensure_valid - Enforce validity on provided CBM
3232  * @_val:	Candidate CBM
3233  * @r:		RDT resource to which the CBM belongs
3234  *
3235  * The provided CBM represents all cache portions available for use. This
3236  * may be represented by a bitmap that does not consist of contiguous ones
3237  * and thus be an invalid CBM.
3238  * Here the provided CBM is forced to be a valid CBM by only considering
 * the first set of contiguous bits as valid and clearing all other bits.
3240  * The intention here is to provide a valid default CBM with which a new
3241  * resource group is initialized. The user can follow this with a
3242  * modification to the CBM if the default does not satisfy the
3243  * requirements.
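 *
 * For example, with a 12-bit CBM a candidate value of 0xf0f is reduced to
 * 0x00f: only the lowest run of contiguous set bits is kept.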
3244  */
3245 static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
3246 {
3247 	unsigned int cbm_len = r->cache.cbm_len;
3248 	unsigned long first_bit, zero_bit;
3249 	unsigned long val = _val;
3250 
3251 	if (!val)
3252 		return 0;
3253 
3254 	first_bit = find_first_bit(&val, cbm_len);
3255 	zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
3256 
3257 	/* Clear any remaining bits to ensure contiguous region */
3258 	bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
3259 	return (u32)val;
3260 }
3261 
3262 /*
3263  * Initialize cache resources per RDT domain
3264  *
3265  * Set the RDT domain up to start off with all usable allocations. That is,
3266  * all shareable and unused bits. All-zero CBM is invalid.
3267  */
3268 static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s,
3269 				 u32 closid)
3270 {
3271 	enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
3272 	enum resctrl_conf_type t = s->conf_type;
3273 	struct resctrl_staged_config *cfg;
3274 	struct rdt_resource *r = s->res;
3275 	u32 used_b = 0, unused_b = 0;
3276 	unsigned long tmp_cbm;
3277 	enum rdtgrp_mode mode;
3278 	u32 peer_ctl, ctrl_val;
3279 	int i;
3280 
3281 	cfg = &d->staged_config[t];
3282 	cfg->have_new_ctrl = false;
3283 	cfg->new_ctrl = r->cache.shareable_bits;
3284 	used_b = r->cache.shareable_bits;
3285 	for (i = 0; i < closids_supported(); i++) {
3286 		if (closid_allocated(i) && i != closid) {
3287 			mode = rdtgroup_mode_by_closid(i);
3288 			if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
3289 				/*
3290 				 * ctrl values for locksetup aren't relevant
3291 				 * until the schemata is written, and the mode
3292 				 * becomes RDT_MODE_PSEUDO_LOCKED.
3293 				 */
3294 				continue;
3295 			/*
3296 			 * If CDP is active include peer domain's
3297 			 * usage to ensure there is no overlap
3298 			 * with an exclusive group.
3299 			 */
3300 			if (resctrl_arch_get_cdp_enabled(r->rid))
3301 				peer_ctl = resctrl_arch_get_config(r, d, i,
3302 								   peer_type);
3303 			else
3304 				peer_ctl = 0;
3305 			ctrl_val = resctrl_arch_get_config(r, d, i,
3306 							   s->conf_type);
3307 			used_b |= ctrl_val | peer_ctl;
3308 			if (mode == RDT_MODE_SHAREABLE)
3309 				cfg->new_ctrl |= ctrl_val | peer_ctl;
3310 		}
3311 	}
3312 	if (d->plr && d->plr->cbm > 0)
3313 		used_b |= d->plr->cbm;
3314 	unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
3315 	unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
3316 	cfg->new_ctrl |= unused_b;
3317 	/*
	 * Force the initial CBM to be valid; the user can
	 * modify the CBM based on system availability.
3320 	 */
3321 	cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
3322 	/*
3323 	 * Assign the u32 CBM to an unsigned long to ensure that
3324 	 * bitmap_weight() does not access out-of-bound memory.
3325 	 */
3326 	tmp_cbm = cfg->new_ctrl;
3327 	if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
3328 		rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id);
3329 		return -ENOSPC;
3330 	}
3331 	cfg->have_new_ctrl = true;
3332 
3333 	return 0;
3334 }
3335 
3336 /*
3337  * Initialize cache resources with default values.
3338  *
3339  * A new RDT group is being created on an allocation capable (CAT)
3340  * supporting system. Set this group up to start off with all usable
3341  * allocations.
3342  *
3343  * If there are no more shareable bits available on any domain then
3344  * the entire allocation will fail.
3345  */
3346 static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
3347 {
3348 	struct rdt_ctrl_domain *d;
3349 	int ret;
3350 
3351 	list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) {
3352 		ret = __init_one_rdt_domain(d, s, closid);
3353 		if (ret < 0)
3354 			return ret;
3355 	}
3356 
3357 	return 0;
3358 }
3359 
3360 /* Initialize MBA resource with default values. */
3361 static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
3362 {
3363 	struct resctrl_staged_config *cfg;
3364 	struct rdt_ctrl_domain *d;
3365 
3366 	list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
3367 		if (is_mba_sc(r)) {
3368 			d->mbps_val[closid] = MBA_MAX_MBPS;
3369 			continue;
3370 		}
3371 
3372 		cfg = &d->staged_config[CDP_NONE];
3373 		cfg->new_ctrl = resctrl_get_default_ctrl(r);
3374 		cfg->have_new_ctrl = true;
3375 	}
3376 }
3377 
3378 /* Initialize the RDT group's allocations. */
3379 static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
3380 {
3381 	struct resctrl_schema *s;
3382 	struct rdt_resource *r;
3383 	int ret = 0;
3384 
3385 	rdt_staged_configs_clear();
3386 
3387 	list_for_each_entry(s, &resctrl_schema_all, list) {
3388 		r = s->res;
3389 		if (r->rid == RDT_RESOURCE_MBA ||
3390 		    r->rid == RDT_RESOURCE_SMBA) {
3391 			rdtgroup_init_mba(r, rdtgrp->closid);
3392 			if (is_mba_sc(r))
3393 				continue;
3394 		} else {
3395 			ret = rdtgroup_init_cat(s, rdtgrp->closid);
3396 			if (ret < 0)
3397 				goto out;
3398 		}
3399 
3400 		ret = resctrl_arch_update_domains(r, rdtgrp->closid);
3401 		if (ret < 0) {
3402 			rdt_last_cmd_puts("Failed to initialize allocations\n");
3403 			goto out;
3404 		}
3405 	}
3406 
3407 	rdtgrp->mode = RDT_MODE_SHAREABLE;
3408 
3409 out:
3410 	rdt_staged_configs_clear();
3411 	return ret;
3412 }
3413 
3414 static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
3415 {
3416 	int ret;
3417 
3418 	if (!resctrl_arch_mon_capable())
3419 		return 0;
3420 
3421 	ret = alloc_rmid(rdtgrp->closid);
3422 	if (ret < 0) {
3423 		rdt_last_cmd_puts("Out of RMIDs\n");
3424 		return ret;
3425 	}
3426 	rdtgrp->mon.rmid = ret;
3427 
3428 	ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
3429 	if (ret) {
3430 		rdt_last_cmd_puts("kernfs subdir error\n");
3431 		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3432 		return ret;
3433 	}
3434 
3435 	return 0;
3436 }
3437 
3438 static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
3439 {
3440 	if (resctrl_arch_mon_capable())
3441 		free_rmid(rgrp->closid, rgrp->mon.rmid);
3442 }
3443 
3444 /*
 * We allow creating mon groups only within a directory called "mon_groups",
3446  * which is present in every ctrl_mon group. Check if this is a valid
3447  * "mon_groups" directory.
3448  *
3449  * 1. The directory should be named "mon_groups".
3450  * 2. The mon group itself should "not" be named "mon_groups".
3451  *   This makes sure "mon_groups" directory always has a ctrl_mon group
3452  *   as parent.
3453  */
3454 static bool is_mon_groups(struct kernfs_node *kn, const char *name)
3455 {
3456 	return (!strcmp(rdt_kn_name(kn), "mon_groups") &&
3457 		strcmp(name, "mon_groups"));
3458 }
3459 
3460 static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
3461 			     const char *name, umode_t mode,
3462 			     enum rdt_group_type rtype, struct rdtgroup **r)
3463 {
3464 	struct rdtgroup *prdtgrp, *rdtgrp;
3465 	unsigned long files = 0;
3466 	struct kernfs_node *kn;
3467 	int ret;
3468 
3469 	prdtgrp = rdtgroup_kn_lock_live(parent_kn);
3470 	if (!prdtgrp) {
3471 		ret = -ENODEV;
3472 		goto out_unlock;
3473 	}
3474 
3475 	/*
3476 	 * Check that the parent directory for a monitor group is a "mon_groups"
3477 	 * directory.
3478 	 */
3479 	if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) {
3480 		ret = -EPERM;
3481 		goto out_unlock;
3482 	}
3483 
3484 	if (rtype == RDTMON_GROUP &&
3485 	    (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
3486 	     prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
3487 		ret = -EINVAL;
3488 		rdt_last_cmd_puts("Pseudo-locking in progress\n");
3489 		goto out_unlock;
3490 	}
3491 
3492 	/* allocate the rdtgroup. */
3493 	rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
3494 	if (!rdtgrp) {
3495 		ret = -ENOSPC;
3496 		rdt_last_cmd_puts("Kernel out of memory\n");
3497 		goto out_unlock;
3498 	}
3499 	*r = rdtgrp;
3500 	rdtgrp->mon.parent = prdtgrp;
3501 	rdtgrp->type = rtype;
3502 	INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
3503 
3504 	/* kernfs creates the directory for rdtgrp */
3505 	kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
3506 	if (IS_ERR(kn)) {
3507 		ret = PTR_ERR(kn);
3508 		rdt_last_cmd_puts("kernfs create error\n");
3509 		goto out_free_rgrp;
3510 	}
3511 	rdtgrp->kn = kn;
3512 
3513 	/*
3514 	 * kernfs_remove() will drop the reference count on "kn" which
3515 	 * will free it. But we still need it to stick around for the
3516 	 * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
3517 	 * which will be dropped by kernfs_put() in rdtgroup_remove().
3518 	 */
3519 	kernfs_get(kn);
3520 
3521 	ret = rdtgroup_kn_set_ugid(kn);
3522 	if (ret) {
3523 		rdt_last_cmd_puts("kernfs perm error\n");
3524 		goto out_destroy;
3525 	}
3526 
3527 	if (rtype == RDTCTRL_GROUP) {
3528 		files = RFTYPE_BASE | RFTYPE_CTRL;
3529 		if (resctrl_arch_mon_capable())
3530 			files |= RFTYPE_MON;
3531 	} else {
3532 		files = RFTYPE_BASE | RFTYPE_MON;
3533 	}
3534 
3535 	ret = rdtgroup_add_files(kn, files);
3536 	if (ret) {
3537 		rdt_last_cmd_puts("kernfs fill error\n");
3538 		goto out_destroy;
3539 	}
3540 
3541 	/*
3542 	 * The caller unlocks the parent_kn upon success.
3543 	 */
3544 	return 0;
3545 
3546 out_destroy:
3547 	kernfs_put(rdtgrp->kn);
3548 	kernfs_remove(rdtgrp->kn);
3549 out_free_rgrp:
3550 	kfree(rdtgrp);
3551 out_unlock:
3552 	rdtgroup_kn_unlock(parent_kn);
3553 	return ret;
3554 }
3555 
3556 static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
3557 {
3558 	kernfs_remove(rgrp->kn);
3559 	rdtgroup_remove(rgrp);
3560 }
3561 
3562 /*
 * Create a monitor group under the "mon_groups" directory of a control
 * and monitor group (ctrl_mon). This is a resource group used
 * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
3566  */
3567 static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
3568 			      const char *name, umode_t mode)
3569 {
3570 	struct rdtgroup *rdtgrp, *prgrp;
3571 	int ret;
3572 
3573 	ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
3574 	if (ret)
3575 		return ret;
3576 
3577 	prgrp = rdtgrp->mon.parent;
3578 	rdtgrp->closid = prgrp->closid;
3579 
3580 	ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
3581 	if (ret) {
3582 		mkdir_rdt_prepare_clean(rdtgrp);
3583 		goto out_unlock;
3584 	}
3585 
3586 	kernfs_activate(rdtgrp->kn);
3587 
3588 	/*
3589 	 * Add the rdtgrp to the list of rdtgrps the parent
3590 	 * ctrl_mon group has to track.
3591 	 */
3592 	list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
3593 
3594 out_unlock:
3595 	rdtgroup_kn_unlock(parent_kn);
3596 	return ret;
3597 }
3598 
3599 /*
 * These are rdtgroups created under the root directory. They can be used
3601  * to allocate and monitor resources.
3602  */
3603 static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
3604 				   const char *name, umode_t mode)
3605 {
3606 	struct rdtgroup *rdtgrp;
3607 	struct kernfs_node *kn;
3608 	u32 closid;
3609 	int ret;
3610 
3611 	ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
3612 	if (ret)
3613 		return ret;
3614 
3615 	kn = rdtgrp->kn;
3616 	ret = closid_alloc();
3617 	if (ret < 0) {
3618 		rdt_last_cmd_puts("Out of CLOSIDs\n");
3619 		goto out_common_fail;
3620 	}
3621 	closid = ret;
3622 	ret = 0;
3623 
3624 	rdtgrp->closid = closid;
3625 
3626 	ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
3627 	if (ret)
3628 		goto out_closid_free;
3629 
3630 	kernfs_activate(rdtgrp->kn);
3631 
3632 	ret = rdtgroup_init_alloc(rdtgrp);
3633 	if (ret < 0)
3634 		goto out_rmid_free;
3635 
3636 	list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
3637 
3638 	if (resctrl_arch_mon_capable()) {
3639 		/*
3640 		 * Create an empty mon_groups directory to hold the subset
3641 		 * of tasks and cpus to monitor.
3642 		 */
3643 		ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
3644 		if (ret) {
3645 			rdt_last_cmd_puts("kernfs subdir error\n");
3646 			goto out_del_list;
3647 		}
3648 		if (is_mba_sc(NULL))
3649 			rdtgrp->mba_mbps_event = mba_mbps_default_event;
3650 	}
3651 
3652 	goto out_unlock;
3653 
3654 out_del_list:
3655 	list_del(&rdtgrp->rdtgroup_list);
3656 out_rmid_free:
3657 	mkdir_rdt_prepare_rmid_free(rdtgrp);
3658 out_closid_free:
3659 	closid_free(closid);
3660 out_common_fail:
3661 	mkdir_rdt_prepare_clean(rdtgrp);
3662 out_unlock:
3663 	rdtgroup_kn_unlock(parent_kn);
3664 	return ret;
3665 }
3666 
3667 static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3668 			  umode_t mode)
3669 {
	/* Do not accept '\n' to avoid an unparsable situation. */
3671 	if (strchr(name, '\n'))
3672 		return -EINVAL;
3673 
3674 	/*
3675 	 * If the parent directory is the root directory and RDT
3676 	 * allocation is supported, add a control and monitoring
3677 	 * subdirectory
3678 	 */
3679 	if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
3680 		return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
3681 
3682 	/* Else, attempt to add a monitoring subdirectory. */
3683 	if (resctrl_arch_mon_capable())
3684 		return rdtgroup_mkdir_mon(parent_kn, name, mode);
3685 
3686 	return -EPERM;
3687 }
3688 
3689 static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
3690 {
3691 	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
3692 	u32 closid, rmid;
3693 	int cpu;
3694 
3695 	/* Give any tasks back to the parent group */
3696 	rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
3697 
3698 	/*
3699 	 * Update per cpu closid/rmid of the moved CPUs first.
3700 	 * Note: the closid will not change, but the arch code still needs it.
3701 	 */
3702 	closid = prdtgrp->closid;
3703 	rmid = prdtgrp->mon.rmid;
3704 	for_each_cpu(cpu, &rdtgrp->cpu_mask)
3705 		resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
3706 
3707 	/*
3708 	 * Update the MSR on moved CPUs and CPUs which have moved
3709 	 * task running on them.
3710 	 */
3711 	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
3712 	update_closid_rmid(tmpmask, NULL);
3713 
3714 	rdtgrp->flags = RDT_DELETED;
3715 	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3716 
3717 	/*
3718 	 * Remove the rdtgrp from the parent ctrl_mon group's list
3719 	 */
3720 	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
3721 	list_del(&rdtgrp->mon.crdtgrp_list);
3722 
3723 	kernfs_remove(rdtgrp->kn);
3724 
3725 	return 0;
3726 }
3727 
3728 static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
3729 {
3730 	rdtgrp->flags = RDT_DELETED;
3731 	list_del(&rdtgrp->rdtgroup_list);
3732 
3733 	kernfs_remove(rdtgrp->kn);
3734 	return 0;
3735 }
3736 
3737 static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
3738 {
3739 	u32 closid, rmid;
3740 	int cpu;
3741 
3742 	/* Give any tasks back to the default group */
3743 	rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
3744 
3745 	/* Give any CPUs back to the default group */
3746 	cpumask_or(&rdtgroup_default.cpu_mask,
3747 		   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
3748 
3749 	/* Update per cpu closid and rmid of the moved CPUs first */
3750 	closid = rdtgroup_default.closid;
3751 	rmid = rdtgroup_default.mon.rmid;
3752 	for_each_cpu(cpu, &rdtgrp->cpu_mask)
3753 		resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
3754 
3755 	/*
3756 	 * Update the MSR on moved CPUs and CPUs which have moved
3757 	 * task running on them.
3758 	 */
3759 	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
3760 	update_closid_rmid(tmpmask, NULL);
3761 
3762 	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
3763 	closid_free(rdtgrp->closid);
3764 
3765 	rdtgroup_ctrl_remove(rdtgrp);
3766 
3767 	/*
3768 	 * Free all the child monitor group rmids.
3769 	 */
3770 	free_all_child_rdtgrp(rdtgrp);
3771 
3772 	return 0;
3773 }
3774 
3775 static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn)
3776 {
3777 	/*
3778 	 * Valid within the RCU section it was obtained or while rdtgroup_mutex
3779 	 * is held.
3780 	 */
3781 	return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex));
3782 }
3783 
3784 static int rdtgroup_rmdir(struct kernfs_node *kn)
3785 {
3786 	struct kernfs_node *parent_kn;
3787 	struct rdtgroup *rdtgrp;
3788 	cpumask_var_t tmpmask;
3789 	int ret = 0;
3790 
3791 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
3792 		return -ENOMEM;
3793 
3794 	rdtgrp = rdtgroup_kn_lock_live(kn);
3795 	if (!rdtgrp) {
3796 		ret = -EPERM;
3797 		goto out;
3798 	}
3799 	parent_kn = rdt_kn_parent(kn);
3800 
3801 	/*
3802 	 * If the rdtgroup is a ctrl_mon group and parent directory
3803 	 * is the root directory, remove the ctrl_mon group.
3804 	 *
3805 	 * If the rdtgroup is a mon group and parent directory
3806 	 * is a valid "mon_groups" directory, remove the mon group.
3807 	 */
3808 	if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
3809 	    rdtgrp != &rdtgroup_default) {
3810 		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
3811 		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
3812 			ret = rdtgroup_ctrl_remove(rdtgrp);
3813 		} else {
3814 			ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
3815 		}
3816 	} else if (rdtgrp->type == RDTMON_GROUP &&
3817 		 is_mon_groups(parent_kn, rdt_kn_name(kn))) {
3818 		ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
3819 	} else {
3820 		ret = -EPERM;
3821 	}
3822 
3823 out:
3824 	rdtgroup_kn_unlock(kn);
3825 	free_cpumask_var(tmpmask);
3826 	return ret;
3827 }
3828 
3829 /**
3830  * mongrp_reparent() - replace parent CTRL_MON group of a MON group
3831  * @rdtgrp:		the MON group whose parent should be replaced
3832  * @new_prdtgrp:	replacement parent CTRL_MON group for @rdtgrp
3833  * @cpus:		cpumask provided by the caller for use during this call
3834  *
3835  * Replaces the parent CTRL_MON group for a MON group, resulting in all member
3836  * tasks' CLOSID immediately changing to that of the new parent group.
3837  * Monitoring data for the group is unaffected by this operation.
3838  */
3839 static void mongrp_reparent(struct rdtgroup *rdtgrp,
3840 			    struct rdtgroup *new_prdtgrp,
3841 			    cpumask_var_t cpus)
3842 {
3843 	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
3844 
3845 	WARN_ON(rdtgrp->type != RDTMON_GROUP);
3846 	WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
3847 
3848 	/* Nothing to do when simply renaming a MON group. */
3849 	if (prdtgrp == new_prdtgrp)
3850 		return;
3851 
3852 	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
3853 	list_move_tail(&rdtgrp->mon.crdtgrp_list,
3854 		       &new_prdtgrp->mon.crdtgrp_list);
3855 
3856 	rdtgrp->mon.parent = new_prdtgrp;
3857 	rdtgrp->closid = new_prdtgrp->closid;
3858 
3859 	/* Propagate updated closid to all tasks in this group. */
3860 	rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
3861 
3862 	update_closid_rmid(cpus, NULL);
3863 }
3864 
3865 static int rdtgroup_rename(struct kernfs_node *kn,
3866 			   struct kernfs_node *new_parent, const char *new_name)
3867 {
3868 	struct kernfs_node *kn_parent;
3869 	struct rdtgroup *new_prdtgrp;
3870 	struct rdtgroup *rdtgrp;
3871 	cpumask_var_t tmpmask;
3872 	int ret;
3873 
3874 	rdtgrp = kernfs_to_rdtgroup(kn);
3875 	new_prdtgrp = kernfs_to_rdtgroup(new_parent);
3876 	if (!rdtgrp || !new_prdtgrp)
3877 		return -ENOENT;
3878 
3879 	/* Release both kernfs active_refs before obtaining rdtgroup mutex. */
3880 	rdtgroup_kn_get(rdtgrp, kn);
3881 	rdtgroup_kn_get(new_prdtgrp, new_parent);
3882 
3883 	mutex_lock(&rdtgroup_mutex);
3884 
3885 	rdt_last_cmd_clear();
3886 
3887 	/*
3888 	 * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
3889 	 * either kernfs_node is a file.
3890 	 */
3891 	if (kernfs_type(kn) != KERNFS_DIR ||
3892 	    kernfs_type(new_parent) != KERNFS_DIR) {
		rdt_last_cmd_puts("Source and destination must be directories\n");
3894 		ret = -EPERM;
3895 		goto out;
3896 	}
3897 
3898 	if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
3899 		ret = -ENOENT;
3900 		goto out;
3901 	}
3902 
3903 	kn_parent = rdt_kn_parent(kn);
3904 	if (rdtgrp->type != RDTMON_GROUP || !kn_parent ||
3905 	    !is_mon_groups(kn_parent, rdt_kn_name(kn))) {
3906 		rdt_last_cmd_puts("Source must be a MON group\n");
3907 		ret = -EPERM;
3908 		goto out;
3909 	}
3910 
3911 	if (!is_mon_groups(new_parent, new_name)) {
3912 		rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
3913 		ret = -EPERM;
3914 		goto out;
3915 	}
3916 
3917 	/*
3918 	 * If the MON group is monitoring CPUs, the CPUs must be assigned to the
3919 	 * current parent CTRL_MON group and therefore cannot be assigned to
3920 	 * the new parent, making the move illegal.
3921 	 */
3922 	if (!cpumask_empty(&rdtgrp->cpu_mask) &&
3923 	    rdtgrp->mon.parent != new_prdtgrp) {
3924 		rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
3925 		ret = -EPERM;
3926 		goto out;
3927 	}
3928 
3929 	/*
3930 	 * Allocate the cpumask for use in mongrp_reparent() to avoid the
3931 	 * possibility of failing to allocate it after kernfs_rename() has
3932 	 * succeeded.
3933 	 */
3934 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
3935 		ret = -ENOMEM;
3936 		goto out;
3937 	}
3938 
	/*
	 * All input validation and allocations needed for mongrp_reparent()
	 * to succeed were done above, before calling kernfs_rename().
	 * Otherwise kernfs_rename() would have to be reverted if
	 * mongrp_reparent() failed.
	 */
3945 	ret = kernfs_rename(kn, new_parent, new_name);
3946 	if (!ret)
3947 		mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
3948 
3949 	free_cpumask_var(tmpmask);
3950 
3951 out:
3952 	mutex_unlock(&rdtgroup_mutex);
3953 	rdtgroup_kn_put(rdtgrp, kn);
3954 	rdtgroup_kn_put(new_prdtgrp, new_parent);
3955 	return ret;
3956 }
3957 
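/* Emit the active mount options, e.g. for /proc/mounts. */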
3958 static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
3959 {
3960 	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
3961 		seq_puts(seq, ",cdp");
3962 
3963 	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
3964 		seq_puts(seq, ",cdpl2");
3965 
3966 	if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA)))
3967 		seq_puts(seq, ",mba_MBps");
3968 
3969 	if (resctrl_debug)
3970 		seq_puts(seq, ",debug");
3971 
3972 	return 0;
3973 }
3974 
3975 static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
3976 	.mkdir		= rdtgroup_mkdir,
3977 	.rmdir		= rdtgroup_rmdir,
3978 	.rename		= rdtgroup_rename,
3979 	.show_options	= rdtgroup_show_options,
3980 };
3981 
3982 static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
3983 {
3984 	rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
3985 				      KERNFS_ROOT_CREATE_DEACTIVATED |
3986 				      KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
3987 				      &rdtgroup_default);
3988 	if (IS_ERR(rdt_root))
3989 		return PTR_ERR(rdt_root);
3990 
3991 	ctx->kfc.root = rdt_root;
3992 	rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
3993 
3994 	return 0;
3995 }
3996 
3997 static void rdtgroup_destroy_root(void)
3998 {
3999 	lockdep_assert_held(&rdtgroup_mutex);
4000 
4001 	kernfs_destroy_root(rdt_root);
4002 	rdtgroup_default.kn = NULL;
4003 }
4004 
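/*
 * Initialise the default resource group, which owns the reserved CLOSID and
 * RMID and to which all tasks and CPUs belong until they are moved elsewhere.
 */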
4005 static void rdtgroup_setup_default(void)
4006 {
4007 	mutex_lock(&rdtgroup_mutex);
4008 
4009 	rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID;
4010 	rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID;
4011 	rdtgroup_default.type = RDTCTRL_GROUP;
4012 	INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
4013 
4014 	list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
4015 
4016 	mutex_unlock(&rdtgroup_mutex);
4017 }
4018 
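/* Free the per-domain monitoring state allocated by domain_setup_mon_state(). */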
4019 static void domain_destroy_mon_state(struct rdt_mon_domain *d)
4020 {
4021 	bitmap_free(d->rmid_busy_llc);
4022 	kfree(d->mbm_total);
4023 	kfree(d->mbm_local);
4024 }
4025 
4026 void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
4027 {
4028 	mutex_lock(&rdtgroup_mutex);
4029 
4030 	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
4031 		mba_sc_domain_destroy(r, d);
4032 
4033 	mutex_unlock(&rdtgroup_mutex);
4034 }
4035 
4036 void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
4037 {
4038 	mutex_lock(&rdtgroup_mutex);
4039 
4040 	/*
4041 	 * If resctrl is mounted, remove all the
4042 	 * per domain monitor data directories.
4043 	 */
4044 	if (resctrl_mounted && resctrl_arch_mon_capable())
4045 		rmdir_mondata_subdir_allrdtgrp(r, d);
4046 
4047 	if (resctrl_is_mbm_enabled())
4048 		cancel_delayed_work(&d->mbm_over);
4049 	if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
		/*
		 * When a package is going down, forcefully take any
		 * busy RMIDs in this domain out of limbo. There is no
		 * way to know whether the L3 was flushed, so this may
		 * lead to incorrect counts in rare scenarios, but
		 * leaving the RMIDs marked busy would leak them if the
		 * package never comes back.
		 */
4058 		__check_limbo(d, true);
4059 		cancel_delayed_work(&d->cqm_limbo);
4060 	}
4061 
4062 	domain_destroy_mon_state(d);
4063 
4064 	mutex_unlock(&rdtgroup_mutex);
4065 }
4066 
/**
 * domain_setup_mon_state() - Initialise domain monitoring structures.
 * @r:	The resource for the newly online domain.
 * @d:	The newly online domain.
 *
 * Allocate monitor resources that belong to this domain.
 * Called when the first CPU of a domain comes online, regardless of whether
 * the filesystem is mounted.
 * During boot this may be called before global allocations have been made by
 * resctrl_mon_resource_init().
 *
 * Return: 0 on success, or -ENOMEM.
 */
4080 static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
4081 {
4082 	u32 idx_limit = resctrl_arch_system_num_rmid_idx();
4083 	size_t tsize;
4084 
4085 	if (resctrl_arch_is_llc_occupancy_enabled()) {
4086 		d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
4087 		if (!d->rmid_busy_llc)
4088 			return -ENOMEM;
4089 	}
4090 	if (resctrl_arch_is_mbm_total_enabled()) {
4091 		tsize = sizeof(*d->mbm_total);
4092 		d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
4093 		if (!d->mbm_total) {
4094 			bitmap_free(d->rmid_busy_llc);
4095 			return -ENOMEM;
4096 		}
4097 	}
4098 	if (resctrl_arch_is_mbm_local_enabled()) {
4099 		tsize = sizeof(*d->mbm_local);
4100 		d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
4101 		if (!d->mbm_local) {
4102 			bitmap_free(d->rmid_busy_llc);
4103 			kfree(d->mbm_total);
4104 			return -ENOMEM;
4105 		}
4106 	}
4107 
4108 	return 0;
4109 }
4110 
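/*
 * Called by the architecture code when a control domain comes online. Only
 * the MBA software controller ("mba_MBps") needs per-domain state here.
 */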
4111 int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
4112 {
4113 	int err = 0;
4114 
4115 	mutex_lock(&rdtgroup_mutex);
4116 
4117 	if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
4118 		/* RDT_RESOURCE_MBA is never mon_capable */
4119 		err = mba_sc_domain_allocate(r, d);
4120 	}
4121 
4122 	mutex_unlock(&rdtgroup_mutex);
4123 
4124 	return err;
4125 }
4126 
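/*
 * Called by the architecture code when a monitor domain comes online. Set up
 * the domain's monitoring state and overflow/limbo workers and, if resctrl
 * is mounted, create the per-domain mon_data directories.
 */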
4127 int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
4128 {
4129 	int err;
4130 
4131 	mutex_lock(&rdtgroup_mutex);
4132 
4133 	err = domain_setup_mon_state(r, d);
4134 	if (err)
4135 		goto out_unlock;
4136 
4137 	if (resctrl_is_mbm_enabled()) {
4138 		INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
4139 		mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
4140 					   RESCTRL_PICK_ANY_CPU);
4141 	}
4142 
4143 	if (resctrl_arch_is_llc_occupancy_enabled())
4144 		INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
4145 
4146 	/*
4147 	 * If the filesystem is not mounted then only the default resource group
4148 	 * exists. Creation of its directories is deferred until mount time
4149 	 * by rdt_get_tree() calling mkdir_mondata_all().
4150 	 * If resctrl is mounted, add per domain monitor data directories.
4151 	 */
4152 	if (resctrl_mounted && resctrl_arch_mon_capable())
4153 		mkdir_mondata_subdir_allrdtgrp(r, d);
4154 
4155 out_unlock:
4156 	mutex_unlock(&rdtgroup_mutex);
4157 
4158 	return err;
4159 }
4160 
4161 void resctrl_online_cpu(unsigned int cpu)
4162 {
4163 	mutex_lock(&rdtgroup_mutex);
	/* A newly onlined CPU is placed in the default rdtgroup. */
4165 	cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
4166 	mutex_unlock(&rdtgroup_mutex);
4167 }
4168 
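/* Remove @cpu from whichever child MON group of @r currently has it assigned. */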
4169 static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
4170 {
4171 	struct rdtgroup *cr;
4172 
4173 	list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
4174 		if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
4175 			break;
4176 	}
4177 }
4178 
4179 static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu,
4180 						      struct rdt_resource *r)
4181 {
4182 	struct rdt_mon_domain *d;
4183 
4184 	lockdep_assert_cpus_held();
4185 
4186 	list_for_each_entry(d, &r->mon_domains, hdr.list) {
4187 		/* Find the domain that contains this CPU */
4188 		if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
4189 			return d;
4190 	}
4191 
4192 	return NULL;
4193 }
4194 
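/*
 * Remove @cpu from the resource group it is assigned to and, if it is the CPU
 * running a domain's MBM overflow or limbo worker, move that work to another
 * online CPU in the domain.
 */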
4195 void resctrl_offline_cpu(unsigned int cpu)
4196 {
4197 	struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
4198 	struct rdt_mon_domain *d;
4199 	struct rdtgroup *rdtgrp;
4200 
4201 	mutex_lock(&rdtgroup_mutex);
4202 	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
4203 		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
4204 			clear_childcpus(rdtgrp, cpu);
4205 			break;
4206 		}
4207 	}
4208 
4209 	if (!l3->mon_capable)
4210 		goto out_unlock;
4211 
4212 	d = get_mon_domain_from_cpu(cpu, l3);
4213 	if (d) {
4214 		if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) {
4215 			cancel_delayed_work(&d->mbm_over);
4216 			mbm_setup_overflow_handler(d, 0, cpu);
4217 		}
4218 		if (resctrl_arch_is_llc_occupancy_enabled() &&
4219 		    cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
4220 			cancel_delayed_work(&d->cqm_limbo);
4221 			cqm_setup_limbo_handler(d, 0, cpu);
4222 		}
4223 	}
4224 
4225 out_unlock:
4226 	mutex_unlock(&rdtgroup_mutex);
4227 }
4228 
/*
 * resctrl_init - resctrl filesystem initialization
 *
 * Set up the resctrl filesystem: initialize the default resource group,
 * create the sysfs mount point, register the resctrl filesystem type and
 * create the resctrl debugfs directory.
 *
 * Return: 0 on success or -errno
 */
4237 int resctrl_init(void)
4238 {
4239 	int ret = 0;
4240 
4241 	seq_buf_init(&last_cmd_status, last_cmd_status_buf,
4242 		     sizeof(last_cmd_status_buf));
4243 
4244 	rdtgroup_setup_default();
4245 
4246 	thread_throttle_mode_init();
4247 
4248 	ret = resctrl_mon_resource_init();
4249 	if (ret)
4250 		return ret;
4251 
4252 	ret = sysfs_create_mount_point(fs_kobj, "resctrl");
4253 	if (ret) {
4254 		resctrl_mon_resource_exit();
4255 		return ret;
4256 	}
4257 
4258 	ret = register_filesystem(&rdt_fs_type);
4259 	if (ret)
4260 		goto cleanup_mountpoint;
4261 
	/*
	 * Adding the resctrl debugfs directory here may not be ideal since
	 * it makes the directory appear in debugfs before the resctrl
	 * filesystem is mounted. It may also be a benefit, since it enables
	 * debugging of RDT before resctrl is mounted.
	 *
	 * The debugfs directory is created here rather than in
	 * rdt_get_tree() because rdt_get_tree() holds rdtgroup_mutex and
	 * creating the debugfs directory would additionally take
	 * &sb->s_type->i_mutex_key (the lockdep class of inode->i_rwsem).
	 * Other filesystem interactions (e.g. SyS_getdents) have the lock
	 * ordering:
	 *   &sb->s_type->i_mutex_key --> &mm->mmap_lock
	 * and mmap(), called with &mm->mmap_lock held, takes rdtgroup_mutex,
	 * creating:
	 *   &mm->mmap_lock --> rdtgroup_mutex
	 * Combined with the ordering rdt_get_tree() would introduce, these
	 * dependencies could deadlock. Creating the debugfs directory here
	 * avoids that (even though file operations cannot occur until the
	 * filesystem is mounted, there is no obvious way to tell lockdep
	 * that).
	 */
4283 	debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
4284 
4285 	return 0;
4286 
4287 cleanup_mountpoint:
4288 	sysfs_remove_mount_point(fs_kobj, "resctrl");
4289 	resctrl_mon_resource_exit();
4290 
4291 	return ret;
4292 }
4293 
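/* Return true if any control or monitor domain is still online. */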
4294 static bool resctrl_online_domains_exist(void)
4295 {
4296 	struct rdt_resource *r;
4297 
4298 	/*
4299 	 * Only walk capable resources to allow resctrl_arch_get_resource()
4300 	 * to return dummy 'not capable' resources.
4301 	 */
4302 	for_each_alloc_capable_rdt_resource(r) {
4303 		if (!list_empty(&r->ctrl_domains))
4304 			return true;
4305 	}
4306 
4307 	for_each_mon_capable_rdt_resource(r) {
4308 		if (!list_empty(&r->mon_domains))
4309 			return true;
4310 	}
4311 
4312 	return false;
4313 }
4314 
4315 /**
4316  * resctrl_exit() - Remove the resctrl filesystem and free resources.
4317  *
4318  * Called by the architecture code in response to a fatal error.
4319  * Removes resctrl files and structures from kernfs to prevent further
4320  * configuration.
4321  *
4322  * When called by the architecture code, all CPUs and resctrl domains must be
4323  * offline. This ensures the limbo and overflow handlers are not scheduled to
4324  * run, meaning the data structures they access can be freed by
4325  * resctrl_mon_resource_exit().
4326  *
 * After resctrl_exit() returns, the architecture code should return an
 * error from all resctrl_arch_ functions that are able to do so.
 * resctrl_arch_get_resource() must continue to return struct rdt_resource
 * instances with the correct rid field so that the filesystem can be
 * unmounted.
4331  */
4332 void resctrl_exit(void)
4333 {
4334 	cpus_read_lock();
4335 	WARN_ON_ONCE(resctrl_online_domains_exist());
4336 
4337 	mutex_lock(&rdtgroup_mutex);
4338 	resctrl_fs_teardown();
4339 	mutex_unlock(&rdtgroup_mutex);
4340 
4341 	cpus_read_unlock();
4342 
4343 	debugfs_remove_recursive(debugfs_resctrl);
4344 	debugfs_resctrl = NULL;
4345 	unregister_filesystem(&rdt_fs_type);
4346 
4347 	/*
4348 	 * Do not remove the sysfs mount point added by resctrl_init() so that
4349 	 * it can be used to umount resctrl.
4350 	 */
4351 
4352 	resctrl_mon_resource_exit();
4353 }
4354