1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Resource Director Technology(RDT)
4 * - Intel Application Energy Telemetry
5 *
6 * Copyright (C) 2025 Intel Corporation
7 *
8 * Author:
9 * Tony Luck <tony.luck@intel.com>
10 */
11
12 #define pr_fmt(fmt) "resctrl: " fmt
13
14 #include <linux/bits.h>
15 #include <linux/compiler_types.h>
16 #include <linux/container_of.h>
17 #include <linux/cpumask.h>
18 #include <linux/err.h>
19 #include <linux/errno.h>
20 #include <linux/gfp_types.h>
21 #include <linux/init.h>
22 #include <linux/intel_pmt_features.h>
23 #include <linux/intel_vsec.h>
24 #include <linux/io.h>
25 #include <linux/minmax.h>
26 #include <linux/printk.h>
27 #include <linux/rculist.h>
28 #include <linux/rcupdate.h>
29 #include <linux/resctrl.h>
30 #include <linux/resctrl_types.h>
31 #include <linux/slab.h>
32 #include <linux/stddef.h>
33 #include <linux/topology.h>
34 #include <linux/types.h>
35
36 #include "internal.h"
37
/**
 * struct pmt_event - Telemetry event.
 * @id:		Resctrl event id.
 * @idx:	Counter index within each per-RMID block of counters.
 * @bin_bits:	Zero for integer valued events, else number bits in fraction
 *		part of fixed-point.
 */
struct pmt_event {
	enum resctrl_event_id	id;
	unsigned int		idx;
	unsigned int		bin_bits;
};

/* Shorthand initializer for struct pmt_event entries in event tables below. */
#define EVT(_id, _idx, _bits) { .id = _id, .idx = _idx, .bin_bits = _bits }
52
/**
 * struct event_group - Events with the same feature type ("energy" or "perf") and GUID.
 * @pfname:		PMT feature name ("energy" or "perf") of this event group.
 *			Used by boot rdt= option.
 * @pfg:		Points to the aggregated telemetry space information
 *			returned by the intel_pmt_get_regions_by_feature()
 *			call to the INTEL_PMT_TELEMETRY driver that contains
 *			data for all telemetry regions of type @pfname.
 *			Valid if the system supports the event group,
 *			NULL otherwise.
 * @force_off:		True when "rdt" command line or architecture code disables
 *			this event group due to insufficient RMIDs.
 * @force_on:		True when "rdt" command line overrides disable of this
 *			event group.
 * @guid:		Unique number per XML description file.
 * @num_rmid:		Number of RMIDs supported by this group. May be
 *			adjusted downwards if enumeration from
 *			intel_pmt_get_regions_by_feature() indicates fewer
 *			RMIDs can be tracked simultaneously.
 * @mmio_size:		Number of bytes of MMIO registers for this group.
 * @num_events:		Number of events in this group.
 * @evts:		Array of event descriptors.
 */
struct event_group {
	/* Data fields for additional structures to manage this group. */
	const char		*pfname;
	struct pmt_feature_group *pfg;
	bool			force_off, force_on;

	/* Remaining fields initialized from XML file. */
	u32			guid;
	u32			num_rmid;
	size_t			mmio_size;
	unsigned int		num_events;
	struct pmt_event	evts[] __counted_by(num_events);
};

/*
 * Expected MMIO space size: one u64 counter per (RMID, event) pair plus
 * @num_extra_status trailing u64 status registers.
 */
#define XML_MMIO_SIZE(num_rmids, num_events, num_extra_status)	\
	(((num_rmids) * (num_events) + (num_extra_status)) * sizeof(u64))
92
/*
 * Per-RMID energy telemetry: two fixed-point events (18 fractional bits).
 * Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml
 */
static struct event_group energy_0x26696143 = {
	.pfname		= "energy",
	.guid		= 0x26696143,
	.num_rmid	= 576,
	.mmio_size	= XML_MMIO_SIZE(576, 2, 3),
	.num_events	= 2,
	.evts		= {
		EVT(PMT_EVENT_ENERGY, 0, 18),
		EVT(PMT_EVENT_ACTIVITY, 1, 18),
	}
};
107
/*
 * Per-RMID performance telemetry: seven integer-valued events.
 * Link: https://github.com/intel/Intel-PMT/blob/main/xml/CWF/OOBMSM/RMID-PERF/cwf_aggregator.xml
 */
static struct event_group perf_0x26557651 = {
	.pfname		= "perf",
	.guid		= 0x26557651,
	.num_rmid	= 576,
	.mmio_size	= XML_MMIO_SIZE(576, 7, 3),
	.num_events	= 7,
	.evts		= {
		EVT(PMT_EVENT_STALLS_LLC_HIT, 0, 0),
		EVT(PMT_EVENT_C1_RES, 1, 0),
		EVT(PMT_EVENT_UNHALTED_CORE_CYCLES, 2, 0),
		EVT(PMT_EVENT_STALLS_LLC_MISS, 3, 0),
		EVT(PMT_EVENT_AUTO_C6_RES, 4, 0),
		EVT(PMT_EVENT_UNHALTED_REF_CYCLES, 5, 0),
		EVT(PMT_EVENT_UOPS_RETIRED, 6, 0),
	}
};
127
/* All event groups this driver knows about, whether or not the platform has them. */
static struct event_group *known_event_groups[] = {
	&energy_0x26696143,
	&perf_0x26557651,
};

/* Iterate @_peg over every entry of known_event_groups[]. */
#define for_each_event_group(_peg)					\
	for (_peg = known_event_groups;					\
	     _peg < &known_event_groups[ARRAY_SIZE(known_event_groups)]; \
	     _peg++)
137
/*
 * Handle one "rdt=" boot option token of the form "<feature>" or
 * "<feature>:<guid>" (guid in hex). Mark every matching event group as
 * force_off (when @force_off) or force_on otherwise.
 *
 * Return: true if at least one event group matched the token.
 */
bool intel_handle_aet_option(bool force_off, char *tok)
{
	struct event_group **peg;
	bool matched = false;
	char *feature;
	u32 guid = 0;

	if (!tok)
		return false;

	feature = strsep(&tok, ":");
	/* Anything after ':' must parse as a hex GUID. */
	if (tok && kstrtou32(tok, 16, &guid))
		return false;

	for_each_event_group(peg) {
		struct event_group *eg = *peg;

		if (strcmp(feature, eg->pfname) != 0)
			continue;
		/* GUID of zero means "all groups of this feature type". */
		if (guid && guid != eg->guid)
			continue;

		if (force_off)
			eg->force_off = true;
		else
			eg->force_on = true;
		matched = true;
	}

	return matched;
}
166
skip_telem_region(struct telemetry_region * tr,struct event_group * e)167 static bool skip_telem_region(struct telemetry_region *tr, struct event_group *e)
168 {
169 if (tr->guid != e->guid)
170 return true;
171 if (tr->plat_info.package_id >= topology_max_packages()) {
172 pr_warn("Bad package %u in guid 0x%x\n", tr->plat_info.package_id,
173 tr->guid);
174 return true;
175 }
176 if (tr->size != e->mmio_size) {
177 pr_warn("MMIO space wrong size (%zu bytes) for guid 0x%x. Expected %zu bytes.\n",
178 tr->size, e->guid, e->mmio_size);
179 return true;
180 }
181
182 return false;
183 }
184
group_has_usable_regions(struct event_group * e,struct pmt_feature_group * p)185 static bool group_has_usable_regions(struct event_group *e, struct pmt_feature_group *p)
186 {
187 bool usable_regions = false;
188
189 for (int i = 0; i < p->count; i++) {
190 if (skip_telem_region(&p->regions[i], e)) {
191 /*
192 * Clear the address field of regions that did not pass the checks in
193 * skip_telem_region() so they will not be used by intel_aet_read_event().
194 * This is safe to do because intel_pmt_get_regions_by_feature() allocates
195 * a new pmt_feature_group structure to return to each caller and only makes
196 * use of the pmt_feature_group::kref field when intel_pmt_put_feature_group()
197 * returns the structure.
198 */
199 p->regions[i].addr = NULL;
200
201 continue;
202 }
203 usable_regions = true;
204 }
205
206 return usable_regions;
207 }
208
all_regions_have_sufficient_rmid(struct event_group * e,struct pmt_feature_group * p)209 static bool all_regions_have_sufficient_rmid(struct event_group *e, struct pmt_feature_group *p)
210 {
211 struct telemetry_region *tr;
212
213 for (int i = 0; i < p->count; i++) {
214 if (!p->regions[i].addr)
215 continue;
216 tr = &p->regions[i];
217 if (tr->num_rmids < e->num_rmid) {
218 e->force_off = true;
219 return false;
220 }
221 }
222
223 return true;
224 }
225
/*
 * Try to enable all events of group @e using the telemetry regions in @p.
 * On success the resource's RMID count is clamped to what this group
 * supports. Returns true if at least one event was enabled.
 */
static bool enable_events(struct event_group *e, struct pmt_feature_group *p)
{
	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_PERF_PKG].r_resctrl;
	int skipped_events = 0;

	/* Disabled by "rdt=" option or by the insufficient-RMID check. */
	if (e->force_off)
		return false;

	/* Also clears the addr of rejected regions as a side effect. */
	if (!group_has_usable_regions(e, p))
		return false;

	/*
	 * Only enable event group with insufficient RMIDs if the user requested
	 * it from the kernel command line.
	 */
	if (!all_regions_have_sufficient_rmid(e, p) && !e->force_on) {
		pr_info("%s %s:0x%x monitoring not enabled due to insufficient RMIDs\n",
			r->name, e->pfname, e->guid);
		return false;
	}

	for (int i = 0; i < p->count; i++) {
		if (!p->regions[i].addr)
			continue;
		/*
		 * e->num_rmid only adjusted lower if user (via rdt= kernel
		 * parameter) forces an event group with insufficient RMID
		 * to be enabled.
		 */
		e->num_rmid = min(e->num_rmid, p->regions[i].num_rmids);
	}

	/* Register each event with resctrl; count those that were refused. */
	for (int j = 0; j < e->num_events; j++) {
		if (!resctrl_enable_mon_event(e->evts[j].id, true,
					      e->evts[j].bin_bits, &e->evts[j]))
			skipped_events++;
	}
	if (e->num_events == skipped_events) {
		pr_info("No events enabled in %s %s:0x%x\n", r->name, e->pfname, e->guid);
		return false;
	}

	/* Resource-wide RMID count is the minimum across all enabled groups. */
	if (r->mon.num_rmid)
		r->mon.num_rmid = min(r->mon.num_rmid, e->num_rmid);
	else
		r->mon.num_rmid = e->num_rmid;

	if (skipped_events)
		pr_info("%s %s:0x%x monitoring detected (skipped %d events)\n", r->name,
			e->pfname, e->guid, skipped_events);
	else
		pr_info("%s %s:0x%x monitoring detected\n", r->name, e->pfname, e->guid);

	return true;
}
281
lookup_pfid(const char * pfname)282 static enum pmt_feature_id lookup_pfid(const char *pfname)
283 {
284 if (!strcmp(pfname, "energy"))
285 return FEATURE_PER_RMID_ENERGY_TELEM;
286 else if (!strcmp(pfname, "perf"))
287 return FEATURE_PER_RMID_PERF_TELEM;
288
289 pr_warn("Unknown PMT feature name '%s'\n", pfname);
290
291 return FEATURE_INVALID;
292 }
293
294 /*
295 * Request a copy of struct pmt_feature_group for each event group. If there is
296 * one, the returned structure has an array of telemetry_region structures,
297 * each element of the array describes one telemetry aggregator. The
298 * telemetry aggregators may have different GUIDs so obtain duplicate struct
299 * pmt_feature_group for event groups with same feature type but different
300 * GUID. Post-processing ensures an event group can only use the telemetry
301 * aggregators that match its GUID. An event group keeps a pointer to its
302 * struct pmt_feature_group to indicate that its events are successfully
303 * enabled.
304 */
intel_aet_get_events(void)305 bool intel_aet_get_events(void)
306 {
307 struct pmt_feature_group *p;
308 enum pmt_feature_id pfid;
309 struct event_group **peg;
310 bool ret = false;
311
312 for_each_event_group(peg) {
313 pfid = lookup_pfid((*peg)->pfname);
314 p = intel_pmt_get_regions_by_feature(pfid);
315 if (IS_ERR_OR_NULL(p))
316 continue;
317 if (enable_events(*peg, p)) {
318 (*peg)->pfg = p;
319 ret = true;
320 } else {
321 intel_pmt_put_feature_group(p);
322 }
323 }
324
325 return ret;
326 }
327
intel_aet_exit(void)328 void __exit intel_aet_exit(void)
329 {
330 struct event_group **peg;
331
332 for_each_event_group(peg) {
333 if ((*peg)->pfg) {
334 intel_pmt_put_feature_group((*peg)->pfg);
335 (*peg)->pfg = NULL;
336 }
337 }
338 }
339
/* Bit 63 of each MMIO counter flags whether the low 63 data bits are valid. */
#define DATA_VALID	BIT_ULL(63)
#define DATA_BITS	GENMASK_ULL(62, 0)

/*
 * Read counter for an event on a domain (summing all aggregators on the
 * domain). If an aggregator hasn't received any data for a specific RMID,
 * the MMIO read indicates that data is not valid. Return success if at
 * least one aggregator has valid data.
 */
int intel_aet_read_event(int domid, u32 rmid, void *arch_priv, u64 *val)
{
	struct pmt_event *pevt = arch_priv;
	struct event_group *e;
	bool valid = false;
	u64 total = 0;
	u64 evtcount;
	void *pevt0;
	u32 idx;

	/*
	 * @arch_priv points at one element of event_group::evts[] (set up
	 * by enable_events()). Step back to the start of that array and
	 * use container_of() to recover the enclosing event group.
	 */
	pevt0 = pevt - pevt->idx;
	e = container_of(pevt0, struct event_group, evts);
	/* Counters are laid out as one block of num_events u64s per RMID. */
	idx = rmid * e->num_events;
	idx += pevt->idx;

	/* Guard against an RMID that would index past the MMIO space. */
	if (idx * sizeof(u64) + sizeof(u64) > e->mmio_size) {
		pr_warn_once("MMIO index %u out of range\n", idx);
		return -EIO;
	}

	for (int i = 0; i < e->pfg->count; i++) {
		/* Regions that failed enumeration checks have addr cleared. */
		if (!e->pfg->regions[i].addr)
			continue;
		if (e->pfg->regions[i].plat_info.package_id != domid)
			continue;
		evtcount = readq(e->pfg->regions[i].addr + idx * sizeof(u64));
		if (!(evtcount & DATA_VALID))
			continue;
		total += evtcount & DATA_BITS;
		valid = true;
	}

	if (valid)
		*val = total;

	return valid ? 0 : -EINVAL;
}
386
/*
 * Allocate and register a PERF_PKG monitoring domain with id @id for @cpu,
 * inserting it at @add_pos in resource @r's domain list. Allocation failure
 * is silent: the domain simply goes unmonitored.
 */
void intel_aet_mon_domain_setup(int cpu, int id, struct rdt_resource *r,
				struct list_head *add_pos)
{
	struct rdt_perf_pkg_mon_domain *d;
	int err;

	/* Allocate on the node of the CPU joining the domain. */
	d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu));
	if (!d)
		return;

	d->hdr.id = id;
	d->hdr.type = RESCTRL_MON_DOMAIN;
	d->hdr.rid = RDT_RESOURCE_PERF_PKG;
	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
	/* Publish on the RCU-protected domain list before going online. */
	list_add_tail_rcu(&d->hdr.list, add_pos);

	err = resctrl_online_mon_domain(r, &d->hdr);
	if (err) {
		/* Roll back: unlink, wait out RCU readers, then free. */
		list_del_rcu(&d->hdr.list);
		synchronize_rcu();
		kfree(d);
	}
}
410