1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Common corrected MCE threshold handler code:
4 */
5 #include <linux/interrupt.h>
6 #include <linux/kernel.h>
7
8 #include <asm/irq_vectors.h>
9 #include <asm/traps.h>
10 #include <asm/apic.h>
11 #include <asm/mce.h>
12 #include <asm/trace/irq_vectors.h>
13
14 #include "internal.h"
15
16 static u32 mce_apei_thr_limit;
17
mce_save_apei_thr_limit(u32 thr_limit)18 void mce_save_apei_thr_limit(u32 thr_limit)
19 {
20 mce_apei_thr_limit = thr_limit;
21 pr_info("HEST corrected error threshold limit: %u\n", thr_limit);
22 }
23
mce_get_apei_thr_limit(void)24 u32 mce_get_apei_thr_limit(void)
25 {
26 return mce_apei_thr_limit;
27 }
28
default_threshold_interrupt(void)29 static void default_threshold_interrupt(void)
30 {
31 pr_err("Unexpected threshold interrupt at vector %x\n",
32 THRESHOLD_APIC_VECTOR);
33 }
34
35 void (*mce_threshold_vector)(void) = default_threshold_interrupt;
36
DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)37 DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
38 {
39 trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
40 inc_irq_stat(irq_threshold_count);
41 mce_threshold_vector();
42 trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
43 apic_eoi();
44 }
45
46 DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);
47
mce_inherit_storm(unsigned int bank)48 void mce_inherit_storm(unsigned int bank)
49 {
50 struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
51
52 /*
53 * Previous CPU owning this bank had put it into storm mode,
54 * but the precise history of that storm is unknown. Assume
55 * the worst (all recent polls of the bank found a valid error
56 * logged). This will avoid the new owner prematurely declaring
57 * the storm has ended.
58 */
59 storm->banks[bank].history = ~0ull;
60 storm->banks[bank].timestamp = jiffies;
61 }
62
mce_get_storm_mode(void)63 bool mce_get_storm_mode(void)
64 {
65 return __this_cpu_read(storm_desc.poll_mode);
66 }
67
/* Set the poll_mode flag of this CPU's storm_desc to @storm. */
void mce_set_storm_mode(bool storm)
{
	__this_cpu_write(storm_desc.poll_mode, storm);
}
72
/*
 * Forward a storm state change for @bank to the vendor specific handler,
 * selected by the boot CPU's vendor. @on is passed through unchanged.
 * Nothing happens for vendors other than Intel and AMD.
 */
static void mce_handle_storm(unsigned int bank, bool on)
{
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
		mce_intel_handle_storm(bank, on);
	else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		mce_amd_handle_storm(bank, on);
}
84
cmci_storm_begin(unsigned int bank)85 void cmci_storm_begin(unsigned int bank)
86 {
87 struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
88
89 __set_bit(bank, this_cpu_ptr(mce_poll_banks));
90 storm->banks[bank].in_storm_mode = true;
91
92 /*
93 * If this is the first bank on this CPU to enter storm mode
94 * start polling.
95 */
96 if (++storm->stormy_bank_count == 1)
97 mce_timer_kick(true);
98 }
99
cmci_storm_end(unsigned int bank)100 void cmci_storm_end(unsigned int bank)
101 {
102 struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
103
104 if (!mce_flags.amd_threshold)
105 __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
106 storm->banks[bank].history = 0;
107 storm->banks[bank].in_storm_mode = false;
108
109 /* If no banks left in storm mode, stop polling. */
110 if (!--storm->stormy_bank_count)
111 mce_timer_kick(false);
112 }
113
mce_track_storm(struct mce * mce)114 void mce_track_storm(struct mce *mce)
115 {
116 struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
117 unsigned long now = jiffies, delta;
118 unsigned int shift = 1;
119 u64 history = 0;
120
121 /* No tracking needed for banks that do not support CMCI */
122 if (storm->banks[mce->bank].poll_only)
123 return;
124
125 /*
126 * When a bank is in storm mode it is polled once per second and
127 * the history mask will record about the last minute of poll results.
128 * If it is not in storm mode, then the bank is only checked when
129 * there is a CMCI interrupt. Check how long it has been since
130 * this bank was last checked, and adjust the amount of "shift"
131 * to apply to history.
132 */
133 if (!storm->banks[mce->bank].in_storm_mode) {
134 delta = now - storm->banks[mce->bank].timestamp;
135 shift = (delta + HZ) / HZ;
136 }
137
138 /* If it has been a long time since the last poll, clear history. */
139 if (shift < NUM_HISTORY_BITS)
140 history = storm->banks[mce->bank].history << shift;
141
142 storm->banks[mce->bank].timestamp = now;
143
144 /* History keeps track of corrected errors. VAL=1 && UC=0 */
145 if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
146 history |= 1;
147
148 storm->banks[mce->bank].history = history;
149
150 if (storm->banks[mce->bank].in_storm_mode) {
151 if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
152 return;
153 printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank);
154 mce_handle_storm(mce->bank, false);
155 cmci_storm_end(mce->bank);
156 } else {
157 if (hweight64(history) < STORM_BEGIN_THRESHOLD)
158 return;
159 printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank);
160 mce_handle_storm(mce->bank, true);
161 cmci_storm_begin(mce->bank);
162 }
163 }
164