1 /*
2  * Machine check handler.
3  *
4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5  * Rest from unknown author(s).
6  * 2004 Andi Kleen. Rewrote most of it.
7  * Copyright 2008 Intel Corporation
8  * Author: Andi Kleen
9  */
10 #include <linux/thread_info.h>
11 #include <linux/capability.h>
12 #include <linux/miscdevice.h>
13 #include <linux/ratelimit.h>
14 #include <linux/kallsyms.h>
15 #include <linux/rcupdate.h>
16 #include <linux/kobject.h>
17 #include <linux/uaccess.h>
18 #include <linux/kdebug.h>
19 #include <linux/kernel.h>
20 #include <linux/percpu.h>
21 #include <linux/string.h>
22 #include <linux/device.h>
23 #include <linux/syscore_ops.h>
24 #include <linux/delay.h>
25 #include <linux/ctype.h>
26 #include <linux/sched.h>
27 #include <linux/sysfs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/init.h>
31 #include <linux/kmod.h>
32 #include <linux/poll.h>
33 #include <linux/nmi.h>
34 #include <linux/cpu.h>
35 #include <linux/smp.h>
36 #include <linux/fs.h>
37 #include <linux/mm.h>
38 #include <linux/debugfs.h>
39 #include <linux/irq_work.h>
40 #include <linux/export.h>
41 
42 #include <asm/processor.h>
43 #include <asm/mce.h>
44 #include <asm/msr.h>
45 
46 #include "mce-internal.h"
47 
48 static DEFINE_MUTEX(mce_chrdev_read_mutex);
49 
50 #define rcu_dereference_check_mce(p) \
51 	rcu_dereference_index_check((p), \
52 			      rcu_read_lock_sched_held() || \
53 			      lockdep_is_held(&mce_chrdev_read_mutex))
54 
55 #define CREATE_TRACE_POINTS
56 #include <trace/events/mce.h>
57 
58 int mce_disabled __read_mostly;
59 
60 #define MISC_MCELOG_MINOR	227
61 
62 #define SPINUNIT 100	/* 100ns */
63 
64 atomic_t mce_entry;
65 
66 DEFINE_PER_CPU(unsigned, mce_exception_count);
67 
68 /*
69  * Tolerant levels:
70  *   0: always panic on uncorrected errors, log corrected errors
71  *   1: panic or SIGBUS on uncorrected errors, log corrected errors
72  *   2: SIGBUS or log uncorrected errors (if possible), log corrected errors
73  *   3: never panic or SIGBUS, log all errors (for testing only)
74  */
75 static int			tolerant		__read_mostly = 1;
76 static int			banks			__read_mostly;
77 static int			rip_msr			__read_mostly;
78 static int			mce_bootlog		__read_mostly = -1;
79 static int			monarch_timeout		__read_mostly = -1;
80 static int			mce_panic_timeout	__read_mostly;
81 static int			mce_dont_log_ce		__read_mostly;
82 int				mce_cmci_disabled	__read_mostly;
83 int				mce_ignore_ce		__read_mostly;
84 int				mce_ser			__read_mostly;
85 
86 struct mce_bank                *mce_banks		__read_mostly;
87 
88 /* User mode helper program triggered by machine check event */
89 static unsigned long		mce_need_notify;
90 static char			mce_helper[128];
91 static char			*mce_helper_argv[2] = { mce_helper, NULL };
92 
93 static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
94 
95 static DEFINE_PER_CPU(struct mce, mces_seen);
96 static int			cpu_missing;
97 
98 /* MCA banks polled by the period polling timer for corrected events */
99 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
100 	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
101 };
102 
103 static DEFINE_PER_CPU(struct work_struct, mce_work);
104 
105 /*
106  * CPU/chipset specific EDAC code can register a notifier call here to print
107  * MCE errors in a human-readable form.
108  */
109 ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
110 
111 /* Do initial initialization of a struct mce */
mce_setup(struct mce * m)112 void mce_setup(struct mce *m)
113 {
114 	memset(m, 0, sizeof(struct mce));
115 	m->cpu = m->extcpu = smp_processor_id();
116 	rdtscll(m->tsc);
117 	/* We hope get_seconds stays lockless */
118 	m->time = get_seconds();
119 	m->cpuvendor = boot_cpu_data.x86_vendor;
120 	m->cpuid = cpuid_eax(1);
121 	m->socketid = cpu_data(m->extcpu).phys_proc_id;
122 	m->apicid = cpu_data(m->extcpu).initial_apicid;
123 	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
124 }
125 
126 DEFINE_PER_CPU(struct mce, injectm);
127 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
128 
129 /*
130  * Lockless MCE logging infrastructure.
131  * This avoids deadlocks on printk locks without having to break locks. Also
132  * separate MCEs from kernel messages to avoid bogus bug reports.
133  */
134 
135 static struct mce_log mcelog = {
136 	.signature	= MCE_LOG_SIGNATURE,
137 	.len		= MCE_LOG_LEN,
138 	.recordlen	= sizeof(struct mce),
139 };
140 
mce_log(struct mce * mce)141 void mce_log(struct mce *mce)
142 {
143 	unsigned next, entry;
144 	int ret = 0;
145 
146 	/* Emit the trace record: */
147 	trace_mce_record(mce);
148 
149 	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
150 	if (ret == NOTIFY_STOP)
151 		return;
152 
153 	mce->finished = 0;
154 	wmb();
155 	for (;;) {
156 		entry = rcu_dereference_check_mce(mcelog.next);
157 		for (;;) {
158 
159 			/*
160 			 * When the buffer fills up discard new entries.
161 			 * Assume that the earlier errors are the more
162 			 * interesting ones:
163 			 */
164 			if (entry >= MCE_LOG_LEN) {
165 				set_bit(MCE_OVERFLOW,
166 					(unsigned long *)&mcelog.flags);
167 				return;
168 			}
169 			/* Old left over entry. Skip: */
170 			if (mcelog.entry[entry].finished) {
171 				entry++;
172 				continue;
173 			}
174 			break;
175 		}
176 		smp_rmb();
177 		next = entry + 1;
178 		if (cmpxchg(&mcelog.next, entry, next) == entry)
179 			break;
180 	}
181 	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
182 	wmb();
183 	mcelog.entry[entry].finished = 1;
184 	wmb();
185 
186 	mce->finished = 1;
187 	set_bit(0, &mce_need_notify);
188 }
189 
drain_mcelog_buffer(void)190 static void drain_mcelog_buffer(void)
191 {
192 	unsigned int next, i, prev = 0;
193 
194 	next = rcu_dereference_check_mce(mcelog.next);
195 
196 	do {
197 		struct mce *m;
198 
199 		/* drain what was logged during boot */
200 		for (i = prev; i < next; i++) {
201 			unsigned long start = jiffies;
202 			unsigned retries = 1;
203 
204 			m = &mcelog.entry[i];
205 
206 			while (!m->finished) {
207 				if (time_after_eq(jiffies, start + 2*retries))
208 					retries++;
209 
210 				cpu_relax();
211 
212 				if (!m->finished && retries >= 4) {
213 					pr_err("MCE: skipping error being logged currently!\n");
214 					break;
215 				}
216 			}
217 			smp_rmb();
218 			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
219 		}
220 
221 		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
222 		prev = next;
223 		next = cmpxchg(&mcelog.next, prev, 0);
224 	} while (next != prev);
225 }
226 
227 
mce_register_decode_chain(struct notifier_block * nb)228 void mce_register_decode_chain(struct notifier_block *nb)
229 {
230 	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
231 	drain_mcelog_buffer();
232 }
233 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
234 
mce_unregister_decode_chain(struct notifier_block * nb)235 void mce_unregister_decode_chain(struct notifier_block *nb)
236 {
237 	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
238 }
239 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
240 
print_mce(struct mce * m)241 static void print_mce(struct mce *m)
242 {
243 	int ret = 0;
244 
245 	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
246 	       m->extcpu, m->mcgstatus, m->bank, m->status);
247 
248 	if (m->ip) {
249 		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
250 			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
251 				m->cs, m->ip);
252 
253 		if (m->cs == __KERNEL_CS)
254 			print_symbol("{%s}", m->ip);
255 		pr_cont("\n");
256 	}
257 
258 	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
259 	if (m->addr)
260 		pr_cont("ADDR %llx ", m->addr);
261 	if (m->misc)
262 		pr_cont("MISC %llx ", m->misc);
263 
264 	pr_cont("\n");
265 	/*
266 	 * Note this output is parsed by external tools and old fields
267 	 * should not be changed.
268 	 */
269 	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
270 		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
271 		cpu_data(m->extcpu).microcode);
272 
273 	/*
274 	 * Print out human-readable details about the MCE error,
275 	 * (if the CPU has an implementation for that)
276 	 */
277 	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
278 	if (ret == NOTIFY_STOP)
279 		return;
280 
281 	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
282 }
283 
284 #define PANIC_TIMEOUT 5 /* 5 seconds */
285 
286 static atomic_t mce_paniced;
287 
288 static int fake_panic;
289 static atomic_t mce_fake_paniced;
290 
291 /* Panic in progress. Enable interrupts and wait for final IPI */
wait_for_panic(void)292 static void wait_for_panic(void)
293 {
294 	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
295 
296 	preempt_disable();
297 	local_irq_enable();
298 	while (timeout-- > 0)
299 		udelay(1);
300 	if (panic_timeout == 0)
301 		panic_timeout = mce_panic_timeout;
302 	panic("Panicing machine check CPU died");
303 }
304 
mce_panic(char * msg,struct mce * final,char * exp)305 static void mce_panic(char *msg, struct mce *final, char *exp)
306 {
307 	int i, apei_err = 0;
308 
309 	if (!fake_panic) {
310 		/*
311 		 * Make sure only one CPU runs in machine check panic
312 		 */
313 		if (atomic_inc_return(&mce_paniced) > 1)
314 			wait_for_panic();
315 		barrier();
316 
317 		bust_spinlocks(1);
318 		console_verbose();
319 	} else {
320 		/* Don't log too much for fake panic */
321 		if (atomic_inc_return(&mce_fake_paniced) > 1)
322 			return;
323 	}
324 	/* First print corrected ones that are still unlogged */
325 	for (i = 0; i < MCE_LOG_LEN; i++) {
326 		struct mce *m = &mcelog.entry[i];
327 		if (!(m->status & MCI_STATUS_VAL))
328 			continue;
329 		if (!(m->status & MCI_STATUS_UC)) {
330 			print_mce(m);
331 			if (!apei_err)
332 				apei_err = apei_write_mce(m);
333 		}
334 	}
335 	/* Now print uncorrected but with the final one last */
336 	for (i = 0; i < MCE_LOG_LEN; i++) {
337 		struct mce *m = &mcelog.entry[i];
338 		if (!(m->status & MCI_STATUS_VAL))
339 			continue;
340 		if (!(m->status & MCI_STATUS_UC))
341 			continue;
342 		if (!final || memcmp(m, final, sizeof(struct mce))) {
343 			print_mce(m);
344 			if (!apei_err)
345 				apei_err = apei_write_mce(m);
346 		}
347 	}
348 	if (final) {
349 		print_mce(final);
350 		if (!apei_err)
351 			apei_err = apei_write_mce(final);
352 	}
353 	if (cpu_missing)
354 		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
355 	if (exp)
356 		pr_emerg(HW_ERR "Machine check: %s\n", exp);
357 	if (!fake_panic) {
358 		if (panic_timeout == 0)
359 			panic_timeout = mce_panic_timeout;
360 		panic(msg);
361 	} else
362 		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
363 }
364 
365 /* Support code for software error injection */
366 
msr_to_offset(u32 msr)367 static int msr_to_offset(u32 msr)
368 {
369 	unsigned bank = __this_cpu_read(injectm.bank);
370 
371 	if (msr == rip_msr)
372 		return offsetof(struct mce, ip);
373 	if (msr == MSR_IA32_MCx_STATUS(bank))
374 		return offsetof(struct mce, status);
375 	if (msr == MSR_IA32_MCx_ADDR(bank))
376 		return offsetof(struct mce, addr);
377 	if (msr == MSR_IA32_MCx_MISC(bank))
378 		return offsetof(struct mce, misc);
379 	if (msr == MSR_IA32_MCG_STATUS)
380 		return offsetof(struct mce, mcgstatus);
381 	return -1;
382 }
383 
384 /* MSR access wrappers used for error injection */
mce_rdmsrl(u32 msr)385 static u64 mce_rdmsrl(u32 msr)
386 {
387 	u64 v;
388 
389 	if (__this_cpu_read(injectm.finished)) {
390 		int offset = msr_to_offset(msr);
391 
392 		if (offset < 0)
393 			return 0;
394 		return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
395 	}
396 
397 	if (rdmsrl_safe(msr, &v)) {
398 		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
399 		/*
400 		 * Return zero in case the access faulted. This should
401 		 * not happen normally but can happen if the CPU does
402 		 * something weird, or if the code is buggy.
403 		 */
404 		v = 0;
405 	}
406 
407 	return v;
408 }
409 
mce_wrmsrl(u32 msr,u64 v)410 static void mce_wrmsrl(u32 msr, u64 v)
411 {
412 	if (__this_cpu_read(injectm.finished)) {
413 		int offset = msr_to_offset(msr);
414 
415 		if (offset >= 0)
416 			*(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
417 		return;
418 	}
419 	wrmsrl(msr, v);
420 }
421 
422 /*
423  * Collect all global (w.r.t. this processor) status about this machine
424  * check into our "mce" struct so that we can use it later to assess
425  * the severity of the problem as we read per-bank specific details.
426  */
mce_gather_info(struct mce * m,struct pt_regs * regs)427 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
428 {
429 	mce_setup(m);
430 
431 	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
432 	if (regs) {
433 		/*
434 		 * Get the address of the instruction at the time of
435 		 * the machine check error.
436 		 */
437 		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
438 			m->ip = regs->ip;
439 			m->cs = regs->cs;
440 		}
441 		/* Use accurate RIP reporting if available. */
442 		if (rip_msr)
443 			m->ip = mce_rdmsrl(rip_msr);
444 	}
445 }
446 
447 /*
448  * Simple lockless ring to communicate PFNs from the exception handler with the
449  * process context work function. This is vastly simplified because there's
450  * only a single reader and a single writer.
451  */
452 #define MCE_RING_SIZE 16	/* we use one entry less */
453 
454 struct mce_ring {
455 	unsigned short start;
456 	unsigned short end;
457 	unsigned long ring[MCE_RING_SIZE];
458 };
459 static DEFINE_PER_CPU(struct mce_ring, mce_ring);
460 
461 /* Runs with CPU affinity in workqueue */
mce_ring_empty(void)462 static int mce_ring_empty(void)
463 {
464 	struct mce_ring *r = &__get_cpu_var(mce_ring);
465 
466 	return r->start == r->end;
467 }
468 
mce_ring_get(unsigned long * pfn)469 static int mce_ring_get(unsigned long *pfn)
470 {
471 	struct mce_ring *r;
472 	int ret = 0;
473 
474 	*pfn = 0;
475 	get_cpu();
476 	r = &__get_cpu_var(mce_ring);
477 	if (r->start == r->end)
478 		goto out;
479 	*pfn = r->ring[r->start];
480 	r->start = (r->start + 1) % MCE_RING_SIZE;
481 	ret = 1;
482 out:
483 	put_cpu();
484 	return ret;
485 }
486 
487 /* Always runs in MCE context with preempt off */
mce_ring_add(unsigned long pfn)488 static int mce_ring_add(unsigned long pfn)
489 {
490 	struct mce_ring *r = &__get_cpu_var(mce_ring);
491 	unsigned next;
492 
493 	next = (r->end + 1) % MCE_RING_SIZE;
494 	if (next == r->start)
495 		return -1;
496 	r->ring[r->end] = pfn;
497 	wmb();
498 	r->end = next;
499 	return 0;
500 }
501 
mce_available(struct cpuinfo_x86 * c)502 int mce_available(struct cpuinfo_x86 *c)
503 {
504 	if (mce_disabled)
505 		return 0;
506 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
507 }
508 
mce_schedule_work(void)509 static void mce_schedule_work(void)
510 {
511 	if (!mce_ring_empty()) {
512 		struct work_struct *work = &__get_cpu_var(mce_work);
513 		if (!work_pending(work))
514 			schedule_work(work);
515 	}
516 }
517 
518 DEFINE_PER_CPU(struct irq_work, mce_irq_work);
519 
mce_irq_work_cb(struct irq_work * entry)520 static void mce_irq_work_cb(struct irq_work *entry)
521 {
522 	mce_notify_irq();
523 	mce_schedule_work();
524 }
525 
mce_report_event(struct pt_regs * regs)526 static void mce_report_event(struct pt_regs *regs)
527 {
528 	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
529 		mce_notify_irq();
530 		/*
531 		 * Triggering the work queue here is just an insurance
532 		 * policy in case the syscall exit notify handler
533 		 * doesn't run soon enough or ends up running on the
534 		 * wrong CPU (can happen when audit sleeps)
535 		 */
536 		mce_schedule_work();
537 		return;
538 	}
539 
540 	irq_work_queue(&__get_cpu_var(mce_irq_work));
541 }
542 
543 DEFINE_PER_CPU(unsigned, mce_poll_count);
544 
545 /*
546  * Poll for corrected events or events that happened before reset.
547  * Those are just logged through /dev/mcelog.
548  *
549  * This is executed in standard interrupt context.
550  *
551  * Note: spec recommends to panic for fatal unsignalled
552  * errors here. However this would be quite problematic --
553  * we would need to reimplement the Monarch handling and
554  * it would mess up the exclusion between exception handler
555  * and poll hander -- * so we skip this for now.
556  * These cases should not happen anyways, or only when the CPU
557  * is already totally * confused. In this case it's likely it will
558  * not fully execute the machine check handler either.
559  */
machine_check_poll(enum mcp_flags flags,mce_banks_t * b)560 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
561 {
562 	struct mce m;
563 	int i;
564 
565 	percpu_inc(mce_poll_count);
566 
567 	mce_gather_info(&m, NULL);
568 
569 	for (i = 0; i < banks; i++) {
570 		if (!mce_banks[i].ctl || !test_bit(i, *b))
571 			continue;
572 
573 		m.misc = 0;
574 		m.addr = 0;
575 		m.bank = i;
576 		m.tsc = 0;
577 
578 		barrier();
579 		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
580 		if (!(m.status & MCI_STATUS_VAL))
581 			continue;
582 
583 		/*
584 		 * Uncorrected or signalled events are handled by the exception
585 		 * handler when it is enabled, so don't process those here.
586 		 *
587 		 * TBD do the same check for MCI_STATUS_EN here?
588 		 */
589 		if (!(flags & MCP_UC) &&
590 		    (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
591 			continue;
592 
593 		if (m.status & MCI_STATUS_MISCV)
594 			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
595 		if (m.status & MCI_STATUS_ADDRV)
596 			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
597 
598 		if (!(flags & MCP_TIMESTAMP))
599 			m.tsc = 0;
600 		/*
601 		 * Don't get the IP here because it's unlikely to
602 		 * have anything to do with the actual error location.
603 		 */
604 		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce)
605 			mce_log(&m);
606 
607 		/*
608 		 * Clear state for this bank.
609 		 */
610 		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
611 	}
612 
613 	/*
614 	 * Don't clear MCG_STATUS here because it's only defined for
615 	 * exceptions.
616 	 */
617 
618 	sync_core();
619 }
620 EXPORT_SYMBOL_GPL(machine_check_poll);
621 
622 /*
623  * Do a quick check if any of the events requires a panic.
624  * This decides if we keep the events around or clear them.
625  */
mce_no_way_out(struct mce * m,char ** msg)626 static int mce_no_way_out(struct mce *m, char **msg)
627 {
628 	int i;
629 
630 	for (i = 0; i < banks; i++) {
631 		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
632 		if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
633 			return 1;
634 	}
635 	return 0;
636 }
637 
638 /*
639  * Variable to establish order between CPUs while scanning.
640  * Each CPU spins initially until executing is equal its number.
641  */
642 static atomic_t mce_executing;
643 
644 /*
645  * Defines order of CPUs on entry. First CPU becomes Monarch.
646  */
647 static atomic_t mce_callin;
648 
649 /*
650  * Check if a timeout waiting for other CPUs happened.
651  */
mce_timed_out(u64 * t)652 static int mce_timed_out(u64 *t)
653 {
654 	/*
655 	 * The others already did panic for some reason.
656 	 * Bail out like in a timeout.
657 	 * rmb() to tell the compiler that system_state
658 	 * might have been modified by someone else.
659 	 */
660 	rmb();
661 	if (atomic_read(&mce_paniced))
662 		wait_for_panic();
663 	if (!monarch_timeout)
664 		goto out;
665 	if ((s64)*t < SPINUNIT) {
666 		/* CHECKME: Make panic default for 1 too? */
667 		if (tolerant < 1)
668 			mce_panic("Timeout synchronizing machine check over CPUs",
669 				  NULL, NULL);
670 		cpu_missing = 1;
671 		return 1;
672 	}
673 	*t -= SPINUNIT;
674 out:
675 	touch_nmi_watchdog();
676 	return 0;
677 }
678 
679 /*
680  * The Monarch's reign.  The Monarch is the CPU who entered
681  * the machine check handler first. It waits for the others to
682  * raise the exception too and then grades them. When any
683  * error is fatal panic. Only then let the others continue.
684  *
685  * The other CPUs entering the MCE handler will be controlled by the
686  * Monarch. They are called Subjects.
687  *
688  * This way we prevent any potential data corruption in a unrecoverable case
689  * and also makes sure always all CPU's errors are examined.
690  *
691  * Also this detects the case of a machine check event coming from outer
692  * space (not detected by any CPUs) In this case some external agent wants
693  * us to shut down, so panic too.
694  *
695  * The other CPUs might still decide to panic if the handler happens
696  * in a unrecoverable place, but in this case the system is in a semi-stable
697  * state and won't corrupt anything by itself. It's ok to let the others
698  * continue for a bit first.
699  *
700  * All the spin loops have timeouts; when a timeout happens a CPU
701  * typically elects itself to be Monarch.
702  */
mce_reign(void)703 static void mce_reign(void)
704 {
705 	int cpu;
706 	struct mce *m = NULL;
707 	int global_worst = 0;
708 	char *msg = NULL;
709 	char *nmsg = NULL;
710 
711 	/*
712 	 * This CPU is the Monarch and the other CPUs have run
713 	 * through their handlers.
714 	 * Grade the severity of the errors of all the CPUs.
715 	 */
716 	for_each_possible_cpu(cpu) {
717 		int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
718 					    &nmsg);
719 		if (severity > global_worst) {
720 			msg = nmsg;
721 			global_worst = severity;
722 			m = &per_cpu(mces_seen, cpu);
723 		}
724 	}
725 
726 	/*
727 	 * Cannot recover? Panic here then.
728 	 * This dumps all the mces in the log buffer and stops the
729 	 * other CPUs.
730 	 */
731 	if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
732 		mce_panic("Fatal Machine check", m, msg);
733 
734 	/*
735 	 * For UC somewhere we let the CPU who detects it handle it.
736 	 * Also must let continue the others, otherwise the handling
737 	 * CPU could deadlock on a lock.
738 	 */
739 
740 	/*
741 	 * No machine check event found. Must be some external
742 	 * source or one CPU is hung. Panic.
743 	 */
744 	if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
745 		mce_panic("Machine check from unknown source", NULL, NULL);
746 
747 	/*
748 	 * Now clear all the mces_seen so that they don't reappear on
749 	 * the next mce.
750 	 */
751 	for_each_possible_cpu(cpu)
752 		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
753 }
754 
755 static atomic_t global_nwo;
756 
757 /*
758  * Start of Monarch synchronization. This waits until all CPUs have
759  * entered the exception handler and then determines if any of them
760  * saw a fatal event that requires panic. Then it executes them
761  * in the entry order.
762  * TBD double check parallel CPU hotunplug
763  */
mce_start(int * no_way_out)764 static int mce_start(int *no_way_out)
765 {
766 	int order;
767 	int cpus = num_online_cpus();
768 	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
769 
770 	if (!timeout)
771 		return -1;
772 
773 	atomic_add(*no_way_out, &global_nwo);
774 	/*
775 	 * global_nwo should be updated before mce_callin
776 	 */
777 	smp_wmb();
778 	order = atomic_inc_return(&mce_callin);
779 
780 	/*
781 	 * Wait for everyone.
782 	 */
783 	while (atomic_read(&mce_callin) != cpus) {
784 		if (mce_timed_out(&timeout)) {
785 			atomic_set(&global_nwo, 0);
786 			return -1;
787 		}
788 		ndelay(SPINUNIT);
789 	}
790 
791 	/*
792 	 * mce_callin should be read before global_nwo
793 	 */
794 	smp_rmb();
795 
796 	if (order == 1) {
797 		/*
798 		 * Monarch: Starts executing now, the others wait.
799 		 */
800 		atomic_set(&mce_executing, 1);
801 	} else {
802 		/*
803 		 * Subject: Now start the scanning loop one by one in
804 		 * the original callin order.
805 		 * This way when there are any shared banks it will be
806 		 * only seen by one CPU before cleared, avoiding duplicates.
807 		 */
808 		while (atomic_read(&mce_executing) < order) {
809 			if (mce_timed_out(&timeout)) {
810 				atomic_set(&global_nwo, 0);
811 				return -1;
812 			}
813 			ndelay(SPINUNIT);
814 		}
815 	}
816 
817 	/*
818 	 * Cache the global no_way_out state.
819 	 */
820 	*no_way_out = atomic_read(&global_nwo);
821 
822 	return order;
823 }
824 
825 /*
826  * Synchronize between CPUs after main scanning loop.
827  * This invokes the bulk of the Monarch processing.
828  */
mce_end(int order)829 static int mce_end(int order)
830 {
831 	int ret = -1;
832 	u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
833 
834 	if (!timeout)
835 		goto reset;
836 	if (order < 0)
837 		goto reset;
838 
839 	/*
840 	 * Allow others to run.
841 	 */
842 	atomic_inc(&mce_executing);
843 
844 	if (order == 1) {
845 		/* CHECKME: Can this race with a parallel hotplug? */
846 		int cpus = num_online_cpus();
847 
848 		/*
849 		 * Monarch: Wait for everyone to go through their scanning
850 		 * loops.
851 		 */
852 		while (atomic_read(&mce_executing) <= cpus) {
853 			if (mce_timed_out(&timeout))
854 				goto reset;
855 			ndelay(SPINUNIT);
856 		}
857 
858 		mce_reign();
859 		barrier();
860 		ret = 0;
861 	} else {
862 		/*
863 		 * Subject: Wait for Monarch to finish.
864 		 */
865 		while (atomic_read(&mce_executing) != 0) {
866 			if (mce_timed_out(&timeout))
867 				goto reset;
868 			ndelay(SPINUNIT);
869 		}
870 
871 		/*
872 		 * Don't reset anything. That's done by the Monarch.
873 		 */
874 		return 0;
875 	}
876 
877 	/*
878 	 * Reset all global state.
879 	 */
880 reset:
881 	atomic_set(&global_nwo, 0);
882 	atomic_set(&mce_callin, 0);
883 	barrier();
884 
885 	/*
886 	 * Let others run again.
887 	 */
888 	atomic_set(&mce_executing, 0);
889 	return ret;
890 }
891 
892 /*
893  * Check if the address reported by the CPU is in a format we can parse.
894  * It would be possible to add code for most other cases, but all would
895  * be somewhat complicated (e.g. segment offset would require an instruction
896  * parser). So only support physical addresses up to page granuality for now.
897  */
mce_usable_address(struct mce * m)898 static int mce_usable_address(struct mce *m)
899 {
900 	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
901 		return 0;
902 	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
903 		return 0;
904 	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
905 		return 0;
906 	return 1;
907 }
908 
mce_clear_state(unsigned long * toclear)909 static void mce_clear_state(unsigned long *toclear)
910 {
911 	int i;
912 
913 	for (i = 0; i < banks; i++) {
914 		if (test_bit(i, toclear))
915 			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
916 	}
917 }
918 
919 /*
920  * The actual machine check handler. This only handles real
921  * exceptions when something got corrupted coming in through int 18.
922  *
923  * This is executed in NMI context not subject to normal locking rules. This
924  * implies that most kernel services cannot be safely used. Don't even
925  * think about putting a printk in there!
926  *
927  * On Intel systems this is entered on all CPUs in parallel through
928  * MCE broadcast. However some CPUs might be broken beyond repair,
929  * so be always careful when synchronizing with others.
930  */
do_machine_check(struct pt_regs * regs,long error_code)931 void do_machine_check(struct pt_regs *regs, long error_code)
932 {
933 	struct mce m, *final;
934 	int i;
935 	int worst = 0;
936 	int severity;
937 	/*
938 	 * Establish sequential order between the CPUs entering the machine
939 	 * check handler.
940 	 */
941 	int order;
942 	/*
943 	 * If no_way_out gets set, there is no safe way to recover from this
944 	 * MCE.  If tolerant is cranked up, we'll try anyway.
945 	 */
946 	int no_way_out = 0;
947 	/*
948 	 * If kill_it gets set, there might be a way to recover from this
949 	 * error.
950 	 */
951 	int kill_it = 0;
952 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
953 	char *msg = "Unknown";
954 
955 	atomic_inc(&mce_entry);
956 
957 	percpu_inc(mce_exception_count);
958 
959 	if (!banks)
960 		goto out;
961 
962 	mce_gather_info(&m, regs);
963 
964 	final = &__get_cpu_var(mces_seen);
965 	*final = m;
966 
967 	no_way_out = mce_no_way_out(&m, &msg);
968 
969 	barrier();
970 
971 	/*
972 	 * When no restart IP must always kill or panic.
973 	 */
974 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
975 		kill_it = 1;
976 
977 	/*
978 	 * Go through all the banks in exclusion of the other CPUs.
979 	 * This way we don't report duplicated events on shared banks
980 	 * because the first one to see it will clear it.
981 	 */
982 	order = mce_start(&no_way_out);
983 	for (i = 0; i < banks; i++) {
984 		__clear_bit(i, toclear);
985 		if (!mce_banks[i].ctl)
986 			continue;
987 
988 		m.misc = 0;
989 		m.addr = 0;
990 		m.bank = i;
991 
992 		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
993 		if ((m.status & MCI_STATUS_VAL) == 0)
994 			continue;
995 
996 		/*
997 		 * Non uncorrected or non signaled errors are handled by
998 		 * machine_check_poll. Leave them alone, unless this panics.
999 		 */
1000 		if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1001 			!no_way_out)
1002 			continue;
1003 
1004 		/*
1005 		 * Set taint even when machine check was not enabled.
1006 		 */
1007 		add_taint(TAINT_MACHINE_CHECK);
1008 
1009 		severity = mce_severity(&m, tolerant, NULL);
1010 
1011 		/*
1012 		 * When machine check was for corrected handler don't touch,
1013 		 * unless we're panicing.
1014 		 */
1015 		if (severity == MCE_KEEP_SEVERITY && !no_way_out)
1016 			continue;
1017 		__set_bit(i, toclear);
1018 		if (severity == MCE_NO_SEVERITY) {
1019 			/*
1020 			 * Machine check event was not enabled. Clear, but
1021 			 * ignore.
1022 			 */
1023 			continue;
1024 		}
1025 
1026 		/*
1027 		 * Kill on action required.
1028 		 */
1029 		if (severity == MCE_AR_SEVERITY)
1030 			kill_it = 1;
1031 
1032 		if (m.status & MCI_STATUS_MISCV)
1033 			m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
1034 		if (m.status & MCI_STATUS_ADDRV)
1035 			m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
1036 
1037 		/*
1038 		 * Action optional error. Queue address for later processing.
1039 		 * When the ring overflows we just ignore the AO error.
1040 		 * RED-PEN add some logging mechanism when
1041 		 * usable_address or mce_add_ring fails.
1042 		 * RED-PEN don't ignore overflow for tolerant == 0
1043 		 */
1044 		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1045 			mce_ring_add(m.addr >> PAGE_SHIFT);
1046 
1047 		mce_log(&m);
1048 
1049 		if (severity > worst) {
1050 			*final = m;
1051 			worst = severity;
1052 		}
1053 	}
1054 
1055 	if (!no_way_out)
1056 		mce_clear_state(toclear);
1057 
1058 	/*
1059 	 * Do most of the synchronization with other CPUs.
1060 	 * When there's any problem use only local no_way_out state.
1061 	 */
1062 	if (mce_end(order) < 0)
1063 		no_way_out = worst >= MCE_PANIC_SEVERITY;
1064 
1065 	/*
1066 	 * If we have decided that we just CAN'T continue, and the user
1067 	 * has not set tolerant to an insane level, give up and die.
1068 	 *
1069 	 * This is mainly used in the case when the system doesn't
1070 	 * support MCE broadcasting or it has been disabled.
1071 	 */
1072 	if (no_way_out && tolerant < 3)
1073 		mce_panic("Fatal machine check on current CPU", final, msg);
1074 
1075 	/*
1076 	 * If the error seems to be unrecoverable, something should be
1077 	 * done.  Try to kill as little as possible.  If we can kill just
1078 	 * one task, do that.  If the user has set the tolerance very
1079 	 * high, don't try to do anything at all.
1080 	 */
1081 
1082 	if (kill_it && tolerant < 3)
1083 		force_sig(SIGBUS, current);
1084 
1085 	/* notify userspace ASAP */
1086 	set_thread_flag(TIF_MCE_NOTIFY);
1087 
1088 	if (worst > 0)
1089 		mce_report_event(regs);
1090 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1091 out:
1092 	atomic_dec(&mce_entry);
1093 	sync_core();
1094 }
1095 EXPORT_SYMBOL_GPL(do_machine_check);
1096 
1097 /* dummy to break dependency. actual code is in mm/memory-failure.c */
memory_failure(unsigned long pfn,int vector)1098 void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1099 {
1100 	printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1101 }
1102 
1103 /*
1104  * Called after mce notification in process context. This code
1105  * is allowed to sleep. Call the high level VM handler to process
1106  * any corrupted pages.
1107  * Assume that the work queue code only calls this one at a time
1108  * per CPU.
1109  * Note we don't disable preemption, so this code might run on the wrong
1110  * CPU. In this case the event is picked up by the scheduled work queue.
1111  * This is merely a fast path to expedite processing in some common
1112  * cases.
1113  */
mce_notify_process(void)1114 void mce_notify_process(void)
1115 {
1116 	unsigned long pfn;
1117 	mce_notify_irq();
1118 	while (mce_ring_get(&pfn))
1119 		memory_failure(pfn, MCE_VECTOR);
1120 }
1121 
mce_process_work(struct work_struct * dummy)1122 static void mce_process_work(struct work_struct *dummy)
1123 {
1124 	mce_notify_process();
1125 }
1126 
1127 #ifdef CONFIG_X86_MCE_INTEL
1128 /***
1129  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1130  * @cpu: The CPU on which the event occurred.
1131  * @status: Event status information
1132  *
1133  * This function should be called by the thermal interrupt after the
1134  * event has been processed and the decision was made to log the event
1135  * further.
1136  *
1137  * The status parameter will be saved to the 'status' field of 'struct mce'
1138  * and historically has been the register value of the
1139  * MSR_IA32_THERMAL_STATUS (Intel) msr.
1140  */
mce_log_therm_throt_event(__u64 status)1141 void mce_log_therm_throt_event(__u64 status)
1142 {
1143 	struct mce m;
1144 
1145 	mce_setup(&m);
1146 	m.bank = MCE_THERMAL_BANK;
1147 	m.status = status;
1148 	mce_log(&m);
1149 }
1150 #endif /* CONFIG_X86_MCE_INTEL */
1151 
1152 /*
1153  * Periodic polling timer for "silent" machine check errors.  If the
1154  * poller finds an MCE, poll 2x faster.  When the poller finds no more
1155  * errors, poll 2x slower (up to check_interval seconds).
1156  */
1157 static int check_interval = 5 * 60; /* 5 minutes */
1158 
1159 static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1160 static DEFINE_PER_CPU(struct timer_list, mce_timer);
1161 
mce_start_timer(unsigned long data)1162 static void mce_start_timer(unsigned long data)
1163 {
1164 	struct timer_list *t = &per_cpu(mce_timer, data);
1165 	int *n;
1166 
1167 	WARN_ON(smp_processor_id() != data);
1168 
1169 	if (mce_available(__this_cpu_ptr(&cpu_info))) {
1170 		machine_check_poll(MCP_TIMESTAMP,
1171 				&__get_cpu_var(mce_poll_banks));
1172 	}
1173 
1174 	/*
1175 	 * Alert userspace if needed.  If we logged an MCE, reduce the
1176 	 * polling interval, otherwise increase the polling interval.
1177 	 */
1178 	n = &__get_cpu_var(mce_next_interval);
1179 	if (mce_notify_irq())
1180 		*n = max(*n/2, HZ/100);
1181 	else
1182 		*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1183 
1184 	t->expires = jiffies + *n;
1185 	add_timer_on(t, smp_processor_id());
1186 }
1187 
1188 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
mce_timer_delete_all(void)1189 static void mce_timer_delete_all(void)
1190 {
1191 	int cpu;
1192 
1193 	for_each_online_cpu(cpu)
1194 		del_timer_sync(&per_cpu(mce_timer, cpu));
1195 }
1196 
mce_do_trigger(struct work_struct * work)1197 static void mce_do_trigger(struct work_struct *work)
1198 {
1199 	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1200 }
1201 
1202 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1203 
1204 /*
1205  * Notify the user(s) about new machine check events.
1206  * Can be called from interrupt context, but not from machine check/NMI
1207  * context.
1208  */
mce_notify_irq(void)1209 int mce_notify_irq(void)
1210 {
1211 	/* Not more than two messages every minute */
1212 	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1213 
1214 	clear_thread_flag(TIF_MCE_NOTIFY);
1215 
1216 	if (test_and_clear_bit(0, &mce_need_notify)) {
1217 		/* wake processes polling /dev/mcelog */
1218 		wake_up_interruptible(&mce_chrdev_wait);
1219 
1220 		/*
1221 		 * There is no risk of missing notifications because
1222 		 * work_pending is always cleared before the function is
1223 		 * executed.
1224 		 */
1225 		if (mce_helper[0] && !work_pending(&mce_trigger_work))
1226 			schedule_work(&mce_trigger_work);
1227 
1228 		if (__ratelimit(&ratelimit))
1229 			pr_info(HW_ERR "Machine check events logged\n");
1230 
1231 		return 1;
1232 	}
1233 	return 0;
1234 }
1235 EXPORT_SYMBOL_GPL(mce_notify_irq);
1236 
__mcheck_cpu_mce_banks_init(void)1237 static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1238 {
1239 	int i;
1240 
1241 	mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1242 	if (!mce_banks)
1243 		return -ENOMEM;
1244 	for (i = 0; i < banks; i++) {
1245 		struct mce_bank *b = &mce_banks[i];
1246 
1247 		b->ctl = -1ULL;
1248 		b->init = 1;
1249 	}
1250 	return 0;
1251 }
1252 
1253 /*
1254  * Initialize Machine Checks for a CPU.
1255  */
__mcheck_cpu_cap_init(void)1256 static int __cpuinit __mcheck_cpu_cap_init(void)
1257 {
1258 	unsigned b;
1259 	u64 cap;
1260 
1261 	rdmsrl(MSR_IA32_MCG_CAP, cap);
1262 
1263 	b = cap & MCG_BANKCNT_MASK;
1264 	if (!banks)
1265 		printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1266 
1267 	if (b > MAX_NR_BANKS) {
1268 		printk(KERN_WARNING
1269 		       "MCE: Using only %u machine check banks out of %u\n",
1270 			MAX_NR_BANKS, b);
1271 		b = MAX_NR_BANKS;
1272 	}
1273 
1274 	/* Don't support asymmetric configurations today */
1275 	WARN_ON(banks != 0 && b != banks);
1276 	banks = b;
1277 	if (!mce_banks) {
1278 		int err = __mcheck_cpu_mce_banks_init();
1279 
1280 		if (err)
1281 			return err;
1282 	}
1283 
1284 	/* Use accurate RIP reporting if available. */
1285 	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1286 		rip_msr = MSR_IA32_MCG_EIP;
1287 
1288 	if (cap & MCG_SER_P)
1289 		mce_ser = 1;
1290 
1291 	return 0;
1292 }
1293 
__mcheck_cpu_init_generic(void)1294 static void __mcheck_cpu_init_generic(void)
1295 {
1296 	mce_banks_t all_banks;
1297 	u64 cap;
1298 	int i;
1299 
1300 	/*
1301 	 * Log the machine checks left over from the previous reset.
1302 	 */
1303 	bitmap_fill(all_banks, MAX_NR_BANKS);
1304 	machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1305 
1306 	set_in_cr4(X86_CR4_MCE);
1307 
1308 	rdmsrl(MSR_IA32_MCG_CAP, cap);
1309 	if (cap & MCG_CTL_P)
1310 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1311 
1312 	for (i = 0; i < banks; i++) {
1313 		struct mce_bank *b = &mce_banks[i];
1314 
1315 		if (!b->init)
1316 			continue;
1317 		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1318 		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1319 	}
1320 }
1321 
1322 /* Add per CPU specific workarounds here */
__mcheck_cpu_apply_quirks(struct cpuinfo_x86 * c)1323 static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1324 {
1325 	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1326 		pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
1327 		return -EOPNOTSUPP;
1328 	}
1329 
1330 	/* This should be disabled by the BIOS, but isn't always */
1331 	if (c->x86_vendor == X86_VENDOR_AMD) {
1332 		if (c->x86 == 15 && banks > 4) {
1333 			/*
1334 			 * disable GART TBL walk error reporting, which
1335 			 * trips off incorrectly with the IOMMU & 3ware
1336 			 * & Cerberus:
1337 			 */
1338 			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1339 		}
1340 		if (c->x86 <= 17 && mce_bootlog < 0) {
1341 			/*
1342 			 * Lots of broken BIOS around that don't clear them
1343 			 * by default and leave crap in there. Don't log:
1344 			 */
1345 			mce_bootlog = 0;
1346 		}
1347 		/*
1348 		 * Various K7s with broken bank 0 around. Always disable
1349 		 * by default.
1350 		 */
1351 		 if (c->x86 == 6 && banks > 0)
1352 			mce_banks[0].ctl = 0;
1353 	}
1354 
1355 	if (c->x86_vendor == X86_VENDOR_INTEL) {
1356 		/*
1357 		 * SDM documents that on family 6 bank 0 should not be written
1358 		 * because it aliases to another special BIOS controlled
1359 		 * register.
1360 		 * But it's not aliased anymore on model 0x1a+
1361 		 * Don't ignore bank 0 completely because there could be a
1362 		 * valid event later, merely don't write CTL0.
1363 		 */
1364 
1365 		if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1366 			mce_banks[0].init = 0;
1367 
1368 		/*
1369 		 * All newer Intel systems support MCE broadcasting. Enable
1370 		 * synchronization with a one second timeout.
1371 		 */
1372 		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1373 			monarch_timeout < 0)
1374 			monarch_timeout = USEC_PER_SEC;
1375 
1376 		/*
1377 		 * There are also broken BIOSes on some Pentium M and
1378 		 * earlier systems:
1379 		 */
1380 		if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
1381 			mce_bootlog = 0;
1382 	}
1383 	if (monarch_timeout < 0)
1384 		monarch_timeout = 0;
1385 	if (mce_bootlog != 0)
1386 		mce_panic_timeout = 30;
1387 
1388 	return 0;
1389 }
1390 
__mcheck_cpu_ancient_init(struct cpuinfo_x86 * c)1391 static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1392 {
1393 	if (c->x86 != 5)
1394 		return 0;
1395 
1396 	switch (c->x86_vendor) {
1397 	case X86_VENDOR_INTEL:
1398 		intel_p5_mcheck_init(c);
1399 		return 1;
1400 		break;
1401 	case X86_VENDOR_CENTAUR:
1402 		winchip_mcheck_init(c);
1403 		return 1;
1404 		break;
1405 	}
1406 
1407 	return 0;
1408 }
1409 
__mcheck_cpu_init_vendor(struct cpuinfo_x86 * c)1410 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1411 {
1412 	switch (c->x86_vendor) {
1413 	case X86_VENDOR_INTEL:
1414 		mce_intel_feature_init(c);
1415 		break;
1416 	case X86_VENDOR_AMD:
1417 		mce_amd_feature_init(c);
1418 		break;
1419 	default:
1420 		break;
1421 	}
1422 }
1423 
__mcheck_cpu_init_timer(void)1424 static void __mcheck_cpu_init_timer(void)
1425 {
1426 	struct timer_list *t = &__get_cpu_var(mce_timer);
1427 	int *n = &__get_cpu_var(mce_next_interval);
1428 
1429 	setup_timer(t, mce_start_timer, smp_processor_id());
1430 
1431 	if (mce_ignore_ce)
1432 		return;
1433 
1434 	*n = check_interval * HZ;
1435 	if (!*n)
1436 		return;
1437 	t->expires = round_jiffies(jiffies + *n);
1438 	add_timer_on(t, smp_processor_id());
1439 }
1440 
1441 /* Handle unconfigured int18 (should never happen) */
unexpected_machine_check(struct pt_regs * regs,long error_code)1442 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1443 {
1444 	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
1445 	       smp_processor_id());
1446 }
1447 
1448 /* Call the installed machine check handler for this CPU setup. */
1449 void (*machine_check_vector)(struct pt_regs *, long error_code) =
1450 						unexpected_machine_check;
1451 
1452 /*
1453  * Called for each booted CPU to set up machine checks.
1454  * Must be called with preempt off:
1455  */
mcheck_cpu_init(struct cpuinfo_x86 * c)1456 void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1457 {
1458 	if (mce_disabled)
1459 		return;
1460 
1461 	if (__mcheck_cpu_ancient_init(c))
1462 		return;
1463 
1464 	if (!mce_available(c))
1465 		return;
1466 
1467 	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1468 		mce_disabled = 1;
1469 		return;
1470 	}
1471 
1472 	machine_check_vector = do_machine_check;
1473 
1474 	__mcheck_cpu_init_generic();
1475 	__mcheck_cpu_init_vendor(c);
1476 	__mcheck_cpu_init_timer();
1477 	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1478 	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
1479 }
1480 
1481 /*
1482  * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1483  */
1484 
1485 static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1486 static int mce_chrdev_open_count;	/* #times opened */
1487 static int mce_chrdev_open_exclu;	/* already open exclusive? */
1488 
mce_chrdev_open(struct inode * inode,struct file * file)1489 static int mce_chrdev_open(struct inode *inode, struct file *file)
1490 {
1491 	spin_lock(&mce_chrdev_state_lock);
1492 
1493 	if (mce_chrdev_open_exclu ||
1494 	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1495 		spin_unlock(&mce_chrdev_state_lock);
1496 
1497 		return -EBUSY;
1498 	}
1499 
1500 	if (file->f_flags & O_EXCL)
1501 		mce_chrdev_open_exclu = 1;
1502 	mce_chrdev_open_count++;
1503 
1504 	spin_unlock(&mce_chrdev_state_lock);
1505 
1506 	return nonseekable_open(inode, file);
1507 }
1508 
mce_chrdev_release(struct inode * inode,struct file * file)1509 static int mce_chrdev_release(struct inode *inode, struct file *file)
1510 {
1511 	spin_lock(&mce_chrdev_state_lock);
1512 
1513 	mce_chrdev_open_count--;
1514 	mce_chrdev_open_exclu = 0;
1515 
1516 	spin_unlock(&mce_chrdev_state_lock);
1517 
1518 	return 0;
1519 }
1520 
collect_tscs(void * data)1521 static void collect_tscs(void *data)
1522 {
1523 	unsigned long *cpu_tsc = (unsigned long *)data;
1524 
1525 	rdtscll(cpu_tsc[smp_processor_id()]);
1526 }
1527 
1528 static int mce_apei_read_done;
1529 
1530 /* Collect MCE record of previous boot in persistent storage via APEI ERST. */
__mce_read_apei(char __user ** ubuf,size_t usize)1531 static int __mce_read_apei(char __user **ubuf, size_t usize)
1532 {
1533 	int rc;
1534 	u64 record_id;
1535 	struct mce m;
1536 
1537 	if (usize < sizeof(struct mce))
1538 		return -EINVAL;
1539 
1540 	rc = apei_read_mce(&m, &record_id);
1541 	/* Error or no more MCE record */
1542 	if (rc <= 0) {
1543 		mce_apei_read_done = 1;
1544 		return rc;
1545 	}
1546 	rc = -EFAULT;
1547 	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1548 		return rc;
1549 	/*
1550 	 * In fact, we should have cleared the record after that has
1551 	 * been flushed to the disk or sent to network in
1552 	 * /sbin/mcelog, but we have no interface to support that now,
1553 	 * so just clear it to avoid duplication.
1554 	 */
1555 	rc = apei_clear_mce(record_id);
1556 	if (rc) {
1557 		mce_apei_read_done = 1;
1558 		return rc;
1559 	}
1560 	*ubuf += sizeof(struct mce);
1561 
1562 	return 0;
1563 }
1564 
mce_chrdev_read(struct file * filp,char __user * ubuf,size_t usize,loff_t * off)1565 static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1566 				size_t usize, loff_t *off)
1567 {
1568 	char __user *buf = ubuf;
1569 	unsigned long *cpu_tsc;
1570 	unsigned prev, next;
1571 	int i, err;
1572 
1573 	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1574 	if (!cpu_tsc)
1575 		return -ENOMEM;
1576 
1577 	mutex_lock(&mce_chrdev_read_mutex);
1578 
1579 	if (!mce_apei_read_done) {
1580 		err = __mce_read_apei(&buf, usize);
1581 		if (err || buf != ubuf)
1582 			goto out;
1583 	}
1584 
1585 	next = rcu_dereference_check_mce(mcelog.next);
1586 
1587 	/* Only supports full reads right now */
1588 	err = -EINVAL;
1589 	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1590 		goto out;
1591 
1592 	err = 0;
1593 	prev = 0;
1594 	do {
1595 		for (i = prev; i < next; i++) {
1596 			unsigned long start = jiffies;
1597 			struct mce *m = &mcelog.entry[i];
1598 
1599 			while (!m->finished) {
1600 				if (time_after_eq(jiffies, start + 2)) {
1601 					memset(m, 0, sizeof(*m));
1602 					goto timeout;
1603 				}
1604 				cpu_relax();
1605 			}
1606 			smp_rmb();
1607 			err |= copy_to_user(buf, m, sizeof(*m));
1608 			buf += sizeof(*m);
1609 timeout:
1610 			;
1611 		}
1612 
1613 		memset(mcelog.entry + prev, 0,
1614 		       (next - prev) * sizeof(struct mce));
1615 		prev = next;
1616 		next = cmpxchg(&mcelog.next, prev, 0);
1617 	} while (next != prev);
1618 
1619 	synchronize_sched();
1620 
1621 	/*
1622 	 * Collect entries that were still getting written before the
1623 	 * synchronize.
1624 	 */
1625 	on_each_cpu(collect_tscs, cpu_tsc, 1);
1626 
1627 	for (i = next; i < MCE_LOG_LEN; i++) {
1628 		struct mce *m = &mcelog.entry[i];
1629 
1630 		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1631 			err |= copy_to_user(buf, m, sizeof(*m));
1632 			smp_rmb();
1633 			buf += sizeof(*m);
1634 			memset(m, 0, sizeof(*m));
1635 		}
1636 	}
1637 
1638 	if (err)
1639 		err = -EFAULT;
1640 
1641 out:
1642 	mutex_unlock(&mce_chrdev_read_mutex);
1643 	kfree(cpu_tsc);
1644 
1645 	return err ? err : buf - ubuf;
1646 }
1647 
mce_chrdev_poll(struct file * file,poll_table * wait)1648 static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1649 {
1650 	poll_wait(file, &mce_chrdev_wait, wait);
1651 	if (rcu_access_index(mcelog.next))
1652 		return POLLIN | POLLRDNORM;
1653 	if (!mce_apei_read_done && apei_check_mce())
1654 		return POLLIN | POLLRDNORM;
1655 	return 0;
1656 }
1657 
mce_chrdev_ioctl(struct file * f,unsigned int cmd,unsigned long arg)1658 static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1659 				unsigned long arg)
1660 {
1661 	int __user *p = (int __user *)arg;
1662 
1663 	if (!capable(CAP_SYS_ADMIN))
1664 		return -EPERM;
1665 
1666 	switch (cmd) {
1667 	case MCE_GET_RECORD_LEN:
1668 		return put_user(sizeof(struct mce), p);
1669 	case MCE_GET_LOG_LEN:
1670 		return put_user(MCE_LOG_LEN, p);
1671 	case MCE_GETCLEAR_FLAGS: {
1672 		unsigned flags;
1673 
1674 		do {
1675 			flags = mcelog.flags;
1676 		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1677 
1678 		return put_user(flags, p);
1679 	}
1680 	default:
1681 		return -ENOTTY;
1682 	}
1683 }
1684 
1685 static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
1686 			    size_t usize, loff_t *off);
1687 
register_mce_write_callback(ssize_t (* fn)(struct file * filp,const char __user * ubuf,size_t usize,loff_t * off))1688 void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
1689 			     const char __user *ubuf,
1690 			     size_t usize, loff_t *off))
1691 {
1692 	mce_write = fn;
1693 }
1694 EXPORT_SYMBOL_GPL(register_mce_write_callback);
1695 
mce_chrdev_write(struct file * filp,const char __user * ubuf,size_t usize,loff_t * off)1696 ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1697 			 size_t usize, loff_t *off)
1698 {
1699 	if (mce_write)
1700 		return mce_write(filp, ubuf, usize, off);
1701 	else
1702 		return -EINVAL;
1703 }
1704 
1705 static const struct file_operations mce_chrdev_ops = {
1706 	.open			= mce_chrdev_open,
1707 	.release		= mce_chrdev_release,
1708 	.read			= mce_chrdev_read,
1709 	.write			= mce_chrdev_write,
1710 	.poll			= mce_chrdev_poll,
1711 	.unlocked_ioctl		= mce_chrdev_ioctl,
1712 	.llseek			= no_llseek,
1713 };
1714 
1715 static struct miscdevice mce_chrdev_device = {
1716 	MISC_MCELOG_MINOR,
1717 	"mcelog",
1718 	&mce_chrdev_ops,
1719 };
1720 
1721 /*
1722  * mce=off Disables machine check
1723  * mce=no_cmci Disables CMCI
1724  * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1725  * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1726  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1727  *	monarchtimeout is how long to wait for other CPUs on machine
1728  *	check, or 0 to not wait
1729  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1730  * mce=nobootlog Don't log MCEs from before booting.
1731  */
mcheck_enable(char * str)1732 static int __init mcheck_enable(char *str)
1733 {
1734 	if (*str == 0) {
1735 		enable_p5_mce();
1736 		return 1;
1737 	}
1738 	if (*str == '=')
1739 		str++;
1740 	if (!strcmp(str, "off"))
1741 		mce_disabled = 1;
1742 	else if (!strcmp(str, "no_cmci"))
1743 		mce_cmci_disabled = 1;
1744 	else if (!strcmp(str, "dont_log_ce"))
1745 		mce_dont_log_ce = 1;
1746 	else if (!strcmp(str, "ignore_ce"))
1747 		mce_ignore_ce = 1;
1748 	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1749 		mce_bootlog = (str[0] == 'b');
1750 	else if (isdigit(str[0])) {
1751 		get_option(&str, &tolerant);
1752 		if (*str == ',') {
1753 			++str;
1754 			get_option(&str, &monarch_timeout);
1755 		}
1756 	} else {
1757 		printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1758 		       str);
1759 		return 0;
1760 	}
1761 	return 1;
1762 }
1763 __setup("mce", mcheck_enable);
1764 
mcheck_init(void)1765 int __init mcheck_init(void)
1766 {
1767 	mcheck_intel_therm_init();
1768 
1769 	return 0;
1770 }
1771 
1772 /*
1773  * mce_syscore: PM support
1774  */
1775 
1776 /*
1777  * Disable machine checks on suspend and shutdown. We can't really handle
1778  * them later.
1779  */
mce_disable_error_reporting(void)1780 static int mce_disable_error_reporting(void)
1781 {
1782 	int i;
1783 
1784 	for (i = 0; i < banks; i++) {
1785 		struct mce_bank *b = &mce_banks[i];
1786 
1787 		if (b->init)
1788 			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1789 	}
1790 	return 0;
1791 }
1792 
mce_syscore_suspend(void)1793 static int mce_syscore_suspend(void)
1794 {
1795 	return mce_disable_error_reporting();
1796 }
1797 
mce_syscore_shutdown(void)1798 static void mce_syscore_shutdown(void)
1799 {
1800 	mce_disable_error_reporting();
1801 }
1802 
1803 /*
1804  * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
1805  * Only one CPU is active at this time, the others get re-added later using
1806  * CPU hotplug:
1807  */
mce_syscore_resume(void)1808 static void mce_syscore_resume(void)
1809 {
1810 	__mcheck_cpu_init_generic();
1811 	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1812 }
1813 
1814 static struct syscore_ops mce_syscore_ops = {
1815 	.suspend	= mce_syscore_suspend,
1816 	.shutdown	= mce_syscore_shutdown,
1817 	.resume		= mce_syscore_resume,
1818 };
1819 
1820 /*
1821  * mce_device: Sysfs support
1822  */
1823 
mce_cpu_restart(void * data)1824 static void mce_cpu_restart(void *data)
1825 {
1826 	if (!mce_available(__this_cpu_ptr(&cpu_info)))
1827 		return;
1828 	__mcheck_cpu_init_generic();
1829 	__mcheck_cpu_init_timer();
1830 }
1831 
1832 /* Reinit MCEs after user configuration changes */
mce_restart(void)1833 static void mce_restart(void)
1834 {
1835 	mce_timer_delete_all();
1836 	on_each_cpu(mce_cpu_restart, NULL, 1);
1837 }
1838 
1839 /* Toggle features for corrected errors */
mce_disable_cmci(void * data)1840 static void mce_disable_cmci(void *data)
1841 {
1842 	if (!mce_available(__this_cpu_ptr(&cpu_info)))
1843 		return;
1844 	cmci_clear();
1845 }
1846 
mce_enable_ce(void * all)1847 static void mce_enable_ce(void *all)
1848 {
1849 	if (!mce_available(__this_cpu_ptr(&cpu_info)))
1850 		return;
1851 	cmci_reenable();
1852 	cmci_recheck();
1853 	if (all)
1854 		__mcheck_cpu_init_timer();
1855 }
1856 
1857 static struct bus_type mce_subsys = {
1858 	.name		= "machinecheck",
1859 	.dev_name	= "machinecheck",
1860 };
1861 
1862 struct device *mce_device[CONFIG_NR_CPUS];
1863 
1864 __cpuinitdata
1865 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1866 
attr_to_bank(struct device_attribute * attr)1867 static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
1868 {
1869 	return container_of(attr, struct mce_bank, attr);
1870 }
1871 
show_bank(struct device * s,struct device_attribute * attr,char * buf)1872 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
1873 			 char *buf)
1874 {
1875 	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
1876 }
1877 
set_bank(struct device * s,struct device_attribute * attr,const char * buf,size_t size)1878 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
1879 			const char *buf, size_t size)
1880 {
1881 	u64 new;
1882 
1883 	if (strict_strtoull(buf, 0, &new) < 0)
1884 		return -EINVAL;
1885 
1886 	attr_to_bank(attr)->ctl = new;
1887 	mce_restart();
1888 
1889 	return size;
1890 }
1891 
1892 static ssize_t
show_trigger(struct device * s,struct device_attribute * attr,char * buf)1893 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
1894 {
1895 	strcpy(buf, mce_helper);
1896 	strcat(buf, "\n");
1897 	return strlen(mce_helper) + 1;
1898 }
1899 
set_trigger(struct device * s,struct device_attribute * attr,const char * buf,size_t siz)1900 static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
1901 				const char *buf, size_t siz)
1902 {
1903 	char *p;
1904 
1905 	strncpy(mce_helper, buf, sizeof(mce_helper));
1906 	mce_helper[sizeof(mce_helper)-1] = 0;
1907 	p = strchr(mce_helper, '\n');
1908 
1909 	if (p)
1910 		*p = 0;
1911 
1912 	return strlen(mce_helper) + !!p;
1913 }

static ssize_t set_ignore_ce(struct device *s,
			     struct device_attribute *attr,
			     const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_ignore_ce ^ !!new) {
		if (new) {
			/* disable ce features */
			mce_timer_delete_all();
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mce_ignore_ce = 1;
		} else {
			/* enable ce features */
			mce_ignore_ce = 0;
			on_each_cpu(mce_enable_ce, (void *)1, 1);
		}
	}
	return size;
}

static ssize_t set_cmci_disabled(struct device *s,
				 struct device_attribute *attr,
				 const char *buf, size_t size)
{
	u64 new;

	if (strict_strtoull(buf, 0, &new) < 0)
		return -EINVAL;

	if (mce_cmci_disabled ^ !!new) {
		if (new) {
			/* disable cmci */
			on_each_cpu(mce_disable_cmci, NULL, 1);
			mce_cmci_disabled = 1;
		} else {
			/* enable cmci */
			mce_cmci_disabled = 0;
			on_each_cpu(mce_enable_ce, NULL, 1);
		}
	}
	return size;
}

static ssize_t store_int_with_restart(struct device *s,
				      struct device_attribute *attr,
				      const char *buf, size_t size)
{
	ssize_t ret = device_store_int(s, attr, buf, size);
	mce_restart();
	return ret;
}

static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
	&check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
	__ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce),
	&mce_ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
	__ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled),
	&mce_cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
	&dev_attr_tolerant.attr,
	&dev_attr_check_interval.attr,
	&dev_attr_trigger,
	&dev_attr_monarch_timeout.attr,
	&dev_attr_dont_log_ce.attr,
	&dev_attr_ignore_ce.attr,
	&dev_attr_cmci_disabled.attr,
	NULL
};
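
/*
 * Each attribute in the array above is created per CPU device by
 * mce_device_create(), so they usually appear as files like (path
 * assumed, for illustration):
 *
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/tolerant
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/check_interval
 *	/sys/devices/system/machinecheck/machinecheck<cpu>/trigger
 *	...
 *
 * Writes to check_interval go through store_int_with_restart() and
 * therefore re-run mce_restart() on all CPUs.
 */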

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
	kfree(dev);
}

/* Per cpu device init. All of the cpus still share the same ctrl bank: */
static __cpuinit int mce_device_create(unsigned int cpu)
{
	struct device *dev;
	int err;
	int i, j;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	dev = kzalloc(sizeof *dev, GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->id  = cpu;
	dev->bus = &mce_subsys;
	dev->release = &mce_device_release;

	err = device_register(dev);
	if (err)
		return err;

	for (i = 0; mce_device_attrs[i]; i++) {
		err = device_create_file(dev, mce_device_attrs[i]);
		if (err)
			goto error;
	}
	for (j = 0; j < banks; j++) {
		err = device_create_file(dev, &mce_banks[j].attr);
		if (err)
			goto error2;
	}
	cpumask_set_cpu(cpu, mce_device_initialized);
	mce_device[cpu] = dev;

	return 0;
error2:
	while (--j >= 0)
		device_remove_file(dev, &mce_banks[j].attr);
error:
	while (--i >= 0)
		device_remove_file(dev, mce_device_attrs[i]);

	device_unregister(dev);

	return err;
}

static __cpuinit void mce_device_remove(unsigned int cpu)
{
	struct device *dev = mce_device[cpu];
	int i;

	if (!cpumask_test_cpu(cpu, mce_device_initialized))
		return;

	for (i = 0; mce_device_attrs[i]; i++)
		device_remove_file(dev, mce_device_attrs[i]);

	for (i = 0; i < banks; i++)
		device_remove_file(dev, &mce_banks[i].attr);

	device_unregister(dev);
	cpumask_clear_cpu(cpu, mce_device_initialized);
	mce_device[cpu] = NULL;
}

/* Make sure there are no machine checks on offlined CPUs. */
static void __cpuinit mce_disable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_clear();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
	}
}

static void __cpuinit mce_reenable_cpu(void *h)
{
	unsigned long action = *(unsigned long *)h;
	int i;

	if (!mce_available(__this_cpu_ptr(&cpu_info)))
		return;

	if (!(action & CPU_TASKS_FROZEN))
		cmci_reenable();
	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];

		if (b->init)
			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
	}
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int __cpuinit
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct timer_list *t = &per_cpu(mce_timer, cpu);

	switch (action) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		mce_device_create(cpu);
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (threshold_cpu_callback)
			threshold_cpu_callback(action, cpu);
		mce_device_remove(cpu);
		break;
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
		del_timer_sync(t);
		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
		break;
	case CPU_DOWN_FAILED:
	case CPU_DOWN_FAILED_FROZEN:
		if (!mce_ignore_ce && check_interval) {
			t->expires = round_jiffies(jiffies +
					   __get_cpu_var(mce_next_interval));
			add_timer_on(t, cpu);
		}
		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
		break;
	case CPU_POST_DEAD:
		/* intentionally ignoring frozen here */
		cmci_rediscover(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier __cpuinitdata = {
	.notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
	int i;

	for (i = 0; i < banks; i++) {
		struct mce_bank *b = &mce_banks[i];
		struct device_attribute *a = &b->attr;

		sysfs_attr_init(&a->attr);
		a->attr.name	= b->attrname;
		snprintf(b->attrname, ATTR_LEN, "bank%d", i);

		a->attr.mode	= 0644;
		a->show		= show_bank;
		a->store	= set_bank;
	}
}
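
/*
 * The bank<N> attributes initialized above sit next to the other per-CPU
 * files, e.g. (path assumed, for illustration):
 *
 *	# cat /sys/devices/system/machinecheck/machinecheck0/bank0
 *
 * Writing a new control value to bank<N> updates mce_banks[N].ctl via
 * set_bank() and then calls mce_restart() to apply it on every CPU.
 */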

static __init int mcheck_init_device(void)
{
	int err;
	int i = 0;

	if (!mce_available(&boot_cpu_data))
		return -EIO;

	zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL);

	mce_init_banks();

	err = subsys_system_register(&mce_subsys, NULL);
	if (err)
		return err;

	for_each_online_cpu(i) {
		err = mce_device_create(i);
		if (err)
			return err;
	}

	register_syscore_ops(&mce_syscore_ops);
	register_hotcpu_notifier(&mce_cpu_notifier);

	/* register character device /dev/mcelog */
	misc_register(&mce_chrdev_device);

	return err;
}
device_initcall(mcheck_init_device);
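
/*
 * Once this initcall has run, logged machine check records can be read
 * from the /dev/mcelog character device, typically by a userspace
 * consumer such as the mcelog(8) daemon (assumed consumer; nothing in
 * this file depends on it).
 */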

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
	mce_disabled = 1;
	return 1;
}
__setup("nomce", mcheck_disable);
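
/*
 * Example: booting with "nomce" on the kernel command line sets
 * mce_disabled = 1, so machine check support is not initialized.
 */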

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
	static struct dentry *dmce;

	if (!dmce)
		dmce = debugfs_create_dir("mce", NULL);

	return dmce;
}
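
/*
 * With debugfs mounted in its usual place, this directory shows up as
 * /sys/kernel/debug/mce/ (mount point assumed; it may differ).
 */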

static void mce_reset(void)
{
	cpu_missing = 0;
	atomic_set(&mce_fake_paniced, 0);
	atomic_set(&mce_executing, 0);
	atomic_set(&mce_callin, 0);
	atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
	*val = fake_panic;
	return 0;
}

static int fake_panic_set(void *data, u64 val)
{
	mce_reset();
	fake_panic = val;
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
			fake_panic_set, "%llu\n");
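
/*
 * The "fake_panic" debugfs file created below exposes the fake_panic
 * flag, which appears to be intended for MCE testing; writes go through
 * fake_panic_set(), which also resets the rendezvous state via
 * mce_reset().  Note the file is created with mode 0444, so write
 * access depends on the caller's privileges.
 */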

static int __init mcheck_debugfs_init(void)
{
	struct dentry *dmce, *ffake_panic;

	dmce = mce_get_debugfs_dir();
	if (!dmce)
		return -ENOMEM;
	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
					  &fake_panic_fops);
	if (!ffake_panic)
		return -ENOMEM;

	return 0;
}
late_initcall(mcheck_debugfs_init);
#endif