/*
 *  linux/arch/x86_64/entry.S
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 */

/*
 * entry.S contains the system-call and fault low-level handling routines.
 *
 * Some of this is documented in Documentation/x86/entry_64.txt
 *
 * NOTE: This code handles signal recognition, which happens after every
 * interrupt and after each system call.
 *
 * Normal syscalls and interrupts don't save a full stack frame; this is
 * only done for syscall tracing, signals or fork/exec et al.
 *
 * A note on terminology:
 * - top of stack: Architecture defined interrupt frame from SS to RIP
 * at the top of the kernel process stack.
 * - partial stack frame: partially saved registers up to R11.
 * - full stack frame: Like partial stack frame, but all registers saved.
 *
 * Some macro usage:
 * - CFI macros are used to generate dwarf2 unwind information for better
 * backtraces. They don't change any code.
 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
 * There are unfortunately lots of special cases where some registers are
 * not touched. The macro is a big mess that should be cleaned up.
 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
 * Gives a full stack frame.
 * - ENTRY/END - Define functions in the symbol table.
 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
 * frame that is otherwise undefined after a SYSCALL
 * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
 */

#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/dwarf2.h>
#include <asm/calling.h>
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
#include <asm/thread_info.h>
#include <asm/hw_irq.h>
#include <asm/page_types.h>
#include <asm/irqflags.h>
#include <asm/paravirt.h>
#include <asm/ftrace.h>
#include <asm/percpu.h>
#include <linux/err.h>

/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
#include <linux/elf-em.h>
#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#define __AUDIT_ARCH_64BIT 0x80000000
#define __AUDIT_ARCH_LE	   0x40000000

	.code64
	.section .entry.text, "ax"

#ifdef CONFIG_FUNCTION_TRACER
#ifdef CONFIG_DYNAMIC_FTRACE
ENTRY(mcount)
	retq
END(mcount)

ENTRY(ftrace_caller)
	cmpl $0, function_trace_stop
	jne  ftrace_stub

	MCOUNT_SAVE_FRAME

	movq 0x38(%rsp), %rdi
	movq 8(%rbp), %rsi
	subq $MCOUNT_INSN_SIZE, %rdi

GLOBAL(ftrace_call)
	call ftrace_stub

	MCOUNT_RESTORE_FRAME

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
GLOBAL(ftrace_graph_call)
	jmp ftrace_stub
#endif

GLOBAL(ftrace_stub)
	retq
END(ftrace_caller)

#else /* ! CONFIG_DYNAMIC_FTRACE */
ENTRY(mcount)
	cmpl $0, function_trace_stop
	jne  ftrace_stub

	cmpq $ftrace_stub, ftrace_trace_function
	jnz trace

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	cmpq $ftrace_stub, ftrace_graph_return
	jnz ftrace_graph_caller

	cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
	jnz ftrace_graph_caller
#endif

GLOBAL(ftrace_stub)
	retq

trace:
	MCOUNT_SAVE_FRAME

	movq 0x38(%rsp), %rdi
	movq 8(%rbp), %rsi
	subq $MCOUNT_INSN_SIZE, %rdi

	call   *ftrace_trace_function

	MCOUNT_RESTORE_FRAME

	jmp ftrace_stub
END(mcount)
#endif /* CONFIG_DYNAMIC_FTRACE */
#endif /* CONFIG_FUNCTION_TRACER */

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
ENTRY(ftrace_graph_caller)
	cmpl $0, function_trace_stop
	jne ftrace_stub

	MCOUNT_SAVE_FRAME

	leaq 8(%rbp), %rdi
	movq 0x38(%rsp), %rsi
	movq (%rbp), %rdx
	subq $MCOUNT_INSN_SIZE, %rsi

	call	prepare_ftrace_return

	MCOUNT_RESTORE_FRAME

	retq
END(ftrace_graph_caller)

GLOBAL(return_to_handler)
	subq  $24, %rsp

	/* Save the return values */
	movq %rax, (%rsp)
	movq %rdx, 8(%rsp)
	movq %rbp, %rdi

	call ftrace_return_to_handler

	movq %rax, %rdi
	movq 8(%rsp), %rdx
	movq (%rsp), %rax
	addq $24, %rsp
	jmp *%rdi
#endif


#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif

#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
	swapgs
	sysretq
ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */


.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
#ifdef CONFIG_TRACE_IRQFLAGS
	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
	jnc  1f
	TRACE_IRQS_ON
1:
#endif
.endm

/*
 * C code is not supposed to know about the undefined top of stack. Every time
 * a C function with a pt_regs argument is called from the SYSCALL based
 * fast path FIXUP_TOP_OF_STACK is needed.
 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
 * manipulation.
 */

	/* %rsp:at FRAMEEND */
	.macro FIXUP_TOP_OF_STACK tmp offset=0
	movq PER_CPU_VAR(old_rsp),\tmp
	movq \tmp,RSP+\offset(%rsp)
	movq $__USER_DS,SS+\offset(%rsp)
	movq $__USER_CS,CS+\offset(%rsp)
	movq $-1,RCX+\offset(%rsp)
	movq R11+\offset(%rsp),\tmp  /* get eflags */
	movq \tmp,EFLAGS+\offset(%rsp)
	.endm

	.macro RESTORE_TOP_OF_STACK tmp offset=0
	movq RSP+\offset(%rsp),\tmp
	movq \tmp,PER_CPU_VAR(old_rsp)
	movq EFLAGS+\offset(%rsp),\tmp
	movq \tmp,R11+\offset(%rsp)
	.endm
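
/*
 * Rough C-level sketch of the fixup above (illustrative only; field names
 * follow the pt_regs layout behind the offsets from calling.h):
 *
 *	regs->sp    = per_cpu(old_rsp);	/* user %rsp stashed at syscall entry */
 *	regs->ss    = __USER_DS;
 *	regs->cs    = __USER_CS;
 *	regs->cx    = -1;		/* %rcx was clobbered by SYSCALL */
 *	regs->flags = regs->r11;	/* SYSCALL saved RFLAGS in %r11 */
 *
 * RESTORE_TOP_OF_STACK is the inverse: it copies RSP and EFLAGS back into
 * the places SYSRET will consume (old_rsp and the R11 slot).
 */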

	.macro FAKE_STACK_FRAME child_rip
	/* push in order ss, rsp, eflags, cs, rip */
	xorl %eax, %eax
	pushq_cfi $__KERNEL_DS /* ss */
	/*CFI_REL_OFFSET	ss,0*/
	pushq_cfi %rax /* rsp */
	CFI_REL_OFFSET	rsp,0
	pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */
	/*CFI_REL_OFFSET	rflags,0*/
	pushq_cfi $__KERNEL_CS /* cs */
	/*CFI_REL_OFFSET	cs,0*/
	pushq_cfi \child_rip /* rip */
	CFI_REL_OFFSET	rip,0
	pushq_cfi %rax /* orig rax */
	.endm

	.macro UNFAKE_STACK_FRAME
	addq $8*6, %rsp
	CFI_ADJUST_CFA_OFFSET	-(6*8)
	.endm
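
/*
 * FAKE_STACK_FRAME pushes six quadwords (ss, rsp, eflags, cs, rip, orig_rax),
 * which is why UNFAKE_STACK_FRAME pops exactly 6*8 = 48 bytes.
 */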

/*
 * initial frame state for interrupts (and exceptions without error code)
 */
	.macro EMPTY_FRAME start=1 offset=0
	.if \start
	CFI_STARTPROC simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA rsp,8+\offset
	.else
	CFI_DEF_CFA_OFFSET 8+\offset
	.endif
	.endm

/*
 * initial frame state for interrupts (and exceptions without error code)
 */
	.macro INTR_FRAME start=1 offset=0
	EMPTY_FRAME \start, SS+8+\offset-RIP
	/*CFI_REL_OFFSET ss, SS+\offset-RIP*/
	CFI_REL_OFFSET rsp, RSP+\offset-RIP
	/*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
	/*CFI_REL_OFFSET cs, CS+\offset-RIP*/
	CFI_REL_OFFSET rip, RIP+\offset-RIP
	.endm

/*
 * initial frame state for exceptions with error code (and interrupts
 * with vector already pushed)
 */
	.macro XCPT_FRAME start=1 offset=0
	INTR_FRAME \start, RIP+\offset-ORIG_RAX
	/*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
	.endm

/*
 * frame that enables calling into C.
 */
	.macro PARTIAL_FRAME start=1 offset=0
	XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
	CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
	CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
	CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
	CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
	CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
	CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
	CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
	CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
	CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
	.endm

/*
 * frame that enables passing a complete pt_regs to a C function.
 */
	.macro DEFAULT_FRAME start=1 offset=0
	PARTIAL_FRAME \start, R11+\offset-R15
	CFI_REL_OFFSET rbx, RBX+\offset
	CFI_REL_OFFSET rbp, RBP+\offset
	CFI_REL_OFFSET r12, R12+\offset
	CFI_REL_OFFSET r13, R13+\offset
	CFI_REL_OFFSET r14, R14+\offset
	CFI_REL_OFFSET r15, R15+\offset
	.endm

/* save partial stack frame */
	.macro SAVE_ARGS_IRQ
	cld
	/* start from rbp in pt_regs and jump over */
	movq_cfi rdi, RDI-RBP
	movq_cfi rsi, RSI-RBP
	movq_cfi rdx, RDX-RBP
	movq_cfi rcx, RCX-RBP
	movq_cfi rax, RAX-RBP
	movq_cfi  r8,  R8-RBP
	movq_cfi  r9,  R9-RBP
	movq_cfi r10, R10-RBP
	movq_cfi r11, R11-RBP

	/* Save rbp so that we can unwind from get_irq_regs() */
	movq_cfi rbp, 0

	/* Save previous stack value */
	movq %rsp, %rsi

	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
	testl $3, CS(%rdi)
	je 1f
	SWAPGS
	/*
	 * irq_count is used to check if a CPU is already on an interrupt stack
	 * or not. While this is essentially redundant with preempt_count it is
	 * a little cheaper to use a separate counter in the PDA (short of
	 * moving irq_enter into assembly, which would be too much work)
	 */
1:	incl PER_CPU_VAR(irq_count)
	jne 2f
	mov PER_CPU_VAR(irq_stack_ptr),%rsp
	CFI_DEF_CFA_REGISTER	rsi

2:	/* Store previous stack value */
	pushq %rsi
	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
			0x77 /* DW_OP_breg7 */, 0, \
			0x06 /* DW_OP_deref */, \
			0x08 /* DW_OP_const1u */, SS+8-RBP, \
			0x22 /* DW_OP_plus */
	/* We entered an interrupt context - irqs are off: */
	TRACE_IRQS_OFF
	.endm
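
/*
 * Note on the CFI_ESCAPE in SAVE_ARGS_IRQ above (best-effort reading of the
 * DWARF expression): it defines the CFA as *(%rsp) + (SS+8-RBP), i.e. the
 * previous stack pointer we just pushed plus the size of the frame above the
 * saved rbp, so unwinders can step from the irq stack back to the
 * interrupted stack.
 */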

ENTRY(save_rest)
	PARTIAL_FRAME 1 REST_SKIP+8
	movq 5*8+16(%rsp), %r11	/* save return address */
	movq_cfi rbx, RBX+16
	movq_cfi rbp, RBP+16
	movq_cfi r12, R12+16
	movq_cfi r13, R13+16
	movq_cfi r14, R14+16
	movq_cfi r15, R15+16
	movq %r11, 8(%rsp)	/* return address */
	FIXUP_TOP_OF_STACK %r11, 16
	ret
	CFI_ENDPROC
END(save_rest)

/* save complete stack frame */
	.pushsection .kprobes.text, "ax"
ENTRY(save_paranoid)
	XCPT_FRAME 1 RDI+8
	cld
	movq_cfi rdi, RDI+8
	movq_cfi rsi, RSI+8
	movq_cfi rdx, RDX+8
	movq_cfi rcx, RCX+8
	movq_cfi rax, RAX+8
	movq_cfi r8, R8+8
	movq_cfi r9, R9+8
	movq_cfi r10, R10+8
	movq_cfi r11, R11+8
	movq_cfi rbx, RBX+8
	movq_cfi rbp, RBP+8
	movq_cfi r12, R12+8
	movq_cfi r13, R13+8
	movq_cfi r14, R14+8
	movq_cfi r15, R15+8
	movl $1,%ebx
	movl $MSR_GS_BASE,%ecx
	rdmsr
	testl %edx,%edx
	js 1f	/* negative -> in kernel */
	SWAPGS
	xorl %ebx,%ebx
1:	ret
	CFI_ENDPROC
END(save_paranoid)
	.popsection

/*
 * A newly forked process directly context switches into this address.
 *
 * rdi: prev task we switched from
 */
ENTRY(ret_from_fork)
	DEFAULT_FRAME

	LOCK ; btr $TIF_FORK,TI_flags(%r8)

	pushq_cfi kernel_eflags(%rip)
	popfq_cfi				# reset kernel eflags

	call schedule_tail			# rdi: 'prev' task parameter

	GET_THREAD_INFO(%rcx)

	RESTORE_REST

	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
	jz   retint_restore_args

	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
	jnz  int_ret_from_sys_call

	RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
	jmp ret_from_sys_call			# go to the SYSRET fastpath

	CFI_ENDPROC
END(ret_from_fork)

/*
 * System call entry. Up to 6 arguments in registers are supported.
 *
 * SYSCALL does not save anything on the stack and does not change the
 * stack pointer.
 */

/*
 * Register setup:
 * rax  system call number
 * rdi  arg0
 * rcx  return address for syscall/sysret, C arg3
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 	(--> moved to rcx for C)
 * r8   arg4
 * r9   arg5
 * r11  eflags for syscall/sysret, temporary for C
 * r12-r15,rbp,rbx saved by C code, not touched.
 *
 * Interrupts are off on entry.
 * Only called from user space.
 *
 * XXX	if we had a free scratch register we could save the RSP into the stack frame
 *      and report it properly in ps. Unfortunately we don't have one.
 *
 * When the user can change the frames, always force IRET. That is because
 * IRET deals with non-canonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */
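
/*
 * Example (illustrative): for write(2) the task arrives here with
 * rax = __NR_write (1), rdi = fd, rsi = buf, rdx = count, and the
 * return value goes back to user space in rax.
 */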

ENTRY(system_call)
	CFI_STARTPROC	simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA	rsp,KERNEL_STACK_OFFSET
	CFI_REGISTER	rip,rcx
	/*CFI_REGISTER	rflags,r11*/
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(system_call_after_swapgs)

	movq	%rsp,PER_CPU_VAR(old_rsp)
	movq	PER_CPU_VAR(kernel_stack),%rsp
	/*
	 * No need to follow this irqs off/on section - it's straight
	 * and short:
	 */
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_ARGS 8,0
	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
	movq  %rcx,RIP-ARGOFFSET(%rsp)
	CFI_REL_OFFSET rip,RIP-ARGOFFSET
	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
	jnz tracesys
system_call_fastpath:
	cmpq $__NR_syscall_max,%rax
	ja badsys
	movq %r10,%rcx
	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
	movq %rax,RAX-ARGOFFSET(%rsp)
/*
 * Syscall return path ending with SYSRET (fast path)
 * Has incomplete stack frame and undefined top of stack.
 */
ret_from_sys_call:
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	flagmask */
sysret_check:
	LOCKDEP_SYS_EXIT
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx
	andl %edi,%edx
	jnz  sysret_careful
	CFI_REMEMBER_STATE
	/*
	 * sysretq will re-enable interrupts:
	 */
	TRACE_IRQS_ON
	movq RIP-ARGOFFSET(%rsp),%rcx
	CFI_REGISTER	rip,rcx
	RESTORE_ARGS 1,-ARG_SKIP,0
	/*CFI_REGISTER	rflags,r11*/
	movq	PER_CPU_VAR(old_rsp), %rsp
	USERGS_SYSRET64

	CFI_RESTORE_STATE
	/* Handle reschedules */
	/* edx:	work, edi: workmask */
sysret_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc sysret_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq_cfi %rdi
	call schedule
	popq_cfi %rdi
	jmp sysret_check

	/* Handle a signal */
sysret_signal:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
#ifdef CONFIG_AUDITSYSCALL
	bt $TIF_SYSCALL_AUDIT,%edx
	jc sysret_audit
#endif
	/*
	 * We have a signal, or exit tracing or single-step.
	 * These all wind up with the iret return path anyway,
	 * so just join that path right now.
	 */
	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
	jmp int_check_syscall_exit_work

badsys:
	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
	jmp ret_from_sys_call


#ifdef CONFIG_AUDITSYSCALL
	/*
	 * Fast path for syscall audit without full syscall trace.
	 * We just call __audit_syscall_entry() directly, and then
	 * jump back to the normal fast path.
	 */
auditsys:
	movq %r10,%r9			/* 6th arg: 4th syscall arg */
	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
	movq %rax,%rsi			/* 2nd arg: syscall number */
	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
	call __audit_syscall_entry
	LOAD_ARGS 0		/* reload call-clobbered registers */
	jmp system_call_fastpath

	/*
	 * Return fast path for syscall audit.  Call __audit_syscall_exit()
	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
	 * masked off.
	 */
sysret_audit:
	movq RAX-ARGOFFSET(%rsp),%rsi	/* second arg, syscall return value */
	cmpq $-MAX_ERRNO,%rsi	/* is it < -MAX_ERRNO? */
	setbe %al		/* 1 if so, 0 if not */
	movzbl %al,%edi		/* zero-extend that into %edi */
	call __audit_syscall_exit
	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
	jmp sysret_check
#endif	/* CONFIG_AUDITSYSCALL */

	/* Do syscall tracing */
tracesys:
#ifdef CONFIG_AUDITSYSCALL
	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
	jz auditsys
#endif
	SAVE_REST
	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
	FIXUP_TOP_OF_STACK %rdi
	movq %rsp,%rdi
	call syscall_trace_enter
	/*
	 * Reload arg registers from stack in case ptrace changed them.
	 * We don't reload %rax because syscall_trace_enter() returned
	 * the value it wants us to use in the table lookup.
	 */
	LOAD_ARGS ARGOFFSET, 1
	RESTORE_REST
	cmpq $__NR_syscall_max,%rax
	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
	movq %r10,%rcx	/* fixup for C */
	call *sys_call_table(,%rax,8)
	movq %rax,RAX-ARGOFFSET(%rsp)
	/* Use IRET because user could have changed frame */

/*
 * Syscall return path ending with IRET.
 * Has correct top of stack, but partial stack frame.
 */
GLOBAL(int_ret_from_sys_call)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	movl $_TIF_ALLWORK_MASK,%edi
	/* edi:	mask to check */
GLOBAL(int_with_check)
	LOCKDEP_SYS_EXIT_IRQ
	GET_THREAD_INFO(%rcx)
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	jnz   int_careful
	andl    $~TS_COMPAT,TI_status(%rcx)
	jmp   retint_swapgs

	/* Either reschedule or signal or syscall exit tracking needed. */
	/* First do a reschedule test. */
	/* edx:	work, edi: workmask */
int_careful:
	bt $TIF_NEED_RESCHED,%edx
	jnc  int_very_careful
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq_cfi %rdi
	call schedule
	popq_cfi %rdi
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check

	/* handle signals and tracing -- both require a full stack frame */
int_very_careful:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
int_check_syscall_exit_work:
	SAVE_REST
	/* Check for syscall exit trace */
	testl $_TIF_WORK_SYSCALL_EXIT,%edx
	jz int_signal
	pushq_cfi %rdi
	leaq 8(%rsp),%rdi	# &ptregs -> arg1
	call syscall_trace_leave
	popq_cfi %rdi
	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
	jmp int_restore_rest

int_signal:
	testl $_TIF_DO_NOTIFY_MASK,%edx
	jz 1f
	movq %rsp,%rdi		# &ptregs -> arg1
	xorl %esi,%esi		# oldset -> arg2
	call do_notify_resume
1:	movl $_TIF_WORK_MASK,%edi
int_restore_rest:
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp int_with_check
	CFI_ENDPROC
END(system_call)

/*
 * Certain special system calls need to save a complete full stack frame.
 */
	.macro PTREGSCALL label,func,arg
ENTRY(\label)
	PARTIAL_FRAME 1 8		/* offset 8: return address */
	subq $REST_SKIP, %rsp
	CFI_ADJUST_CFA_OFFSET REST_SKIP
	call save_rest
	DEFAULT_FRAME 0 8		/* offset 8: return address */
	leaq 8(%rsp), \arg	/* pt_regs pointer */
	call \func
	jmp ptregscall_common
	CFI_ENDPROC
END(\label)
	.endm

	PTREGSCALL stub_clone, sys_clone, %r8
	PTREGSCALL stub_fork, sys_fork, %rdi
	PTREGSCALL stub_vfork, sys_vfork, %rdi
	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
	PTREGSCALL stub_iopl, sys_iopl, %rsi

ENTRY(ptregscall_common)
	DEFAULT_FRAME 1 8	/* offset 8: return address */
	RESTORE_TOP_OF_STACK %r11, 8
	movq_cfi_restore R15+8, r15
	movq_cfi_restore R14+8, r14
	movq_cfi_restore R13+8, r13
	movq_cfi_restore R12+8, r12
	movq_cfi_restore RBP+8, rbp
	movq_cfi_restore RBX+8, rbx
	ret $REST_SKIP		/* pop extended registers */
	CFI_ENDPROC
END(ptregscall_common)

ENTRY(stub_execve)
	CFI_STARTPROC
	addq $8, %rsp
	PARTIAL_FRAME 0
	SAVE_REST
	FIXUP_TOP_OF_STACK %r11
	movq %rsp, %rcx
	call sys_execve
	RESTORE_TOP_OF_STACK %r11
	movq %rax,RAX(%rsp)
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_execve)

/*
 * sigreturn is special because it needs to restore all registers on return.
 * This cannot be done with SYSRET, so use the IRET return path instead.
 */
ENTRY(stub_rt_sigreturn)
	CFI_STARTPROC
	addq $8, %rsp
	PARTIAL_FRAME 0
	SAVE_REST
	movq %rsp,%rdi
	FIXUP_TOP_OF_STACK %r11
	call sys_rt_sigreturn
	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
	RESTORE_REST
	jmp int_ret_from_sys_call
	CFI_ENDPROC
END(stub_rt_sigreturn)

/*
 * Build the entry stubs and pointer table with some assembler magic.
 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
 * single cache line on all modern x86 implementations.
 */
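/*
 * Size sketch (approximate): each stub below is a short push of the negated
 * vector plus a short jump, so seven stubs plus the shared jmp to
 * common_interrupt fit within one 32-byte, cache-line-aligned chunk.
 */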
	.section .init.rodata,"a"
ENTRY(interrupt)
	.section .entry.text
	.p2align 5
	.p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start)
	INTR_FRAME
vector=FIRST_EXTERNAL_VECTOR
.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
	.balign 32
  .rept	7
    .if vector < NR_VECTORS
      .if vector <> FIRST_EXTERNAL_VECTOR
	CFI_ADJUST_CFA_OFFSET -8
      .endif
1:	pushq_cfi $(~vector+0x80)	/* Note: always in signed byte range */
      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
	jmp 2f
      .endif
      .previous
	.quad 1b
      .section .entry.text
vector=vector+1
    .endif
  .endr
2:	jmp common_interrupt
.endr
	CFI_ENDPROC
END(irq_entries_start)

.previous
END(interrupt)
.previous

/*
 * Interrupt entry/exit.
 *
 * Interrupt entry points save only callee-clobbered registers in the fast path.
 *
 * Entry runs with interrupts off.
 */

/* 0(%rsp): ~(interrupt number) */
	.macro interrupt func
	/* reserve pt_regs for scratch regs and rbp */
	subq $ORIG_RAX-RBP, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
	SAVE_ARGS_IRQ
	call \func
	.endm
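
/*
 * The subq above reserves the pt_regs slots from rbp upward (rbp plus the
 * scratch registers); SAVE_ARGS_IRQ then fills them in and switches to the
 * per-cpu irq stack before \func is called.
 */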

/*
 * Interrupt entry/exit should be protected against kprobes
 */
	.pushsection .kprobes.text, "ax"
	/*
	 * The interrupt stubs push (~vector+0x80) onto the stack and
	 * then jump to common_interrupt.
	 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	XCPT_FRAME
	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
	interrupt do_IRQ
	/* 0(%rsp): old_rsp-ARGOFFSET */
ret_from_intr:
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	decl PER_CPU_VAR(irq_count)

	/* Restore saved previous stack */
	popq %rsi
	CFI_DEF_CFA_REGISTER	rsi
	leaq ARGOFFSET-RBP(%rsi), %rsp
	CFI_DEF_CFA_REGISTER	rsp
	CFI_ADJUST_CFA_OFFSET	RBP-ARGOFFSET

exit_intr:
	GET_THREAD_INFO(%rcx)
	testl $3,CS-ARGOFFSET(%rsp)
	je retint_kernel

	/* Interrupt came from user space */
	/*
	 * Has a correct top of stack, but a partial stack frame
	 * %rcx: thread info. Interrupts off.
	 */
retint_with_reschedule:
	movl $_TIF_WORK_MASK,%edi
retint_check:
	LOCKDEP_SYS_EXIT_IRQ
	movl TI_flags(%rcx),%edx
	andl %edi,%edx
	CFI_REMEMBER_STATE
	jnz  retint_careful

retint_swapgs:		/* return to user-space */
	/*
	 * The iretq could re-enable interrupts:
	 */
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_IRETQ
	SWAPGS
	jmp restore_args

retint_restore_args:	/* return to kernel space */
	DISABLE_INTERRUPTS(CLBR_ANY)
	/*
	 * The iretq could re-enable interrupts:
	 */
	TRACE_IRQS_IRETQ
restore_args:
	RESTORE_ARGS 1,8,1

irq_return:
	INTERRUPT_RETURN

	.section __ex_table, "a"
	.quad irq_return, bad_iret
	.previous

#ifdef CONFIG_PARAVIRT
ENTRY(native_iret)
	iretq

	.section __ex_table,"a"
	.quad native_iret, bad_iret
	.previous
#endif

	.section .fixup,"ax"
bad_iret:
	/*
	 * The iret traps when the %cs or %ss being restored is bogus.
	 * We've lost the original trap vector and error code.
	 * #GPF is the most likely one to get for an invalid selector.
	 * So pretend we completed the iret and took the #GPF in user mode.
	 *
	 * We are now running with the kernel GS after exception recovery.
	 * But error_entry expects us to have user GS to match the user %cs,
	 * so swap back.
	 */
	pushq $0

	SWAPGS
	jmp general_protection

	.previous

	/* edi: workmask, edx: work */
retint_careful:
	CFI_RESTORE_STATE
	bt    $TIF_NEED_RESCHED,%edx
	jnc   retint_signal
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	pushq_cfi %rdi
	call  schedule
	popq_cfi %rdi
	GET_THREAD_INFO(%rcx)
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp retint_check

retint_signal:
	testl $_TIF_DO_NOTIFY_MASK,%edx
	jz    retint_swapgs
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_REST
	movq $-1,ORIG_RAX(%rsp)
	xorl %esi,%esi		# oldset
	movq %rsp,%rdi		# &pt_regs
	call do_notify_resume
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	GET_THREAD_INFO(%rcx)
	jmp retint_with_reschedule

#ifdef CONFIG_PREEMPT
	/* Returning to kernel space. Check if we need preemption */
	/* rcx:	 threadinfo. interrupts off. */
ENTRY(retint_kernel)
	cmpl $0,TI_preempt_count(%rcx)
	jnz  retint_restore_args
	bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
	jnc  retint_restore_args
	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
	jnc  retint_restore_args
	call preempt_schedule_irq
	jmp exit_intr
#endif

	CFI_ENDPROC
END(common_interrupt)
/*
 * End of kprobes section
 */
       .popsection

/*
 * APIC interrupts.
 */
.macro apicinterrupt num sym do_sym
ENTRY(\sym)
	INTR_FRAME
	pushq_cfi $~(\num)
.Lcommon_\sym:
	interrupt \do_sym
	jmp ret_from_intr
	CFI_ENDPROC
END(\sym)
.endm

#ifdef CONFIG_SMP
apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
apicinterrupt REBOOT_VECTOR \
	reboot_interrupt smp_reboot_interrupt
#endif

#ifdef CONFIG_X86_UV
apicinterrupt UV_BAU_MESSAGE \
	uv_bau_message_intr1 uv_bau_message_interrupt
#endif
apicinterrupt LOCAL_TIMER_VECTOR \
	apic_timer_interrupt smp_apic_timer_interrupt
apicinterrupt X86_PLATFORM_IPI_VECTOR \
	x86_platform_ipi smp_x86_platform_ipi

#ifdef CONFIG_SMP
	ALIGN
	INTR_FRAME
.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
	16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
.if NUM_INVALIDATE_TLB_VECTORS > \idx
ENTRY(invalidate_interrupt\idx)
	pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
	jmp .Lcommon_invalidate_interrupt0
	CFI_ADJUST_CFA_OFFSET -8
END(invalidate_interrupt\idx)
.endif
.endr
	CFI_ENDPROC
apicinterrupt INVALIDATE_TLB_VECTOR_START, \
	invalidate_interrupt0, smp_invalidate_interrupt
#endif

apicinterrupt THRESHOLD_APIC_VECTOR \
	threshold_interrupt smp_threshold_interrupt
apicinterrupt THERMAL_APIC_VECTOR \
	thermal_interrupt smp_thermal_interrupt

#ifdef CONFIG_SMP
apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
	call_function_single_interrupt smp_call_function_single_interrupt
apicinterrupt CALL_FUNCTION_VECTOR \
	call_function_interrupt smp_call_function_interrupt
apicinterrupt RESCHEDULE_VECTOR \
	reschedule_interrupt smp_reschedule_interrupt
#endif

apicinterrupt ERROR_APIC_VECTOR \
	error_interrupt smp_error_interrupt
apicinterrupt SPURIOUS_APIC_VECTOR \
	spurious_interrupt smp_spurious_interrupt

#ifdef CONFIG_IRQ_WORK
apicinterrupt IRQ_WORK_VECTOR \
	irq_work_interrupt smp_irq_work_interrupt
#endif

/*
 * Exception entry points.
 */
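/*
 * Three flavors are defined below:
 *  - zeroentry: exceptions that push no error code (a -1 is stored in
 *    ORIG_RAX instead)
 *  - errorentry: exceptions where the CPU pushed an error code
 *  - paranoid*entry: variants that cannot assume the kernel GS base is
 *    already loaded and therefore go through save_paranoid
 */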
.macro zeroentry sym do_sym
ENTRY(\sym)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call error_entry
	DEFAULT_FRAME 0
	movq %rsp,%rdi		/* pt_regs pointer */
	xorl %esi,%esi		/* no error code */
	call \do_sym
	jmp error_exit		/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

.macro paranoidzeroentry sym do_sym
ENTRY(\sym)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call save_paranoid
	TRACE_IRQS_OFF
	movq %rsp,%rdi		/* pt_regs pointer */
	xorl %esi,%esi		/* no error code */
	call \do_sym
	jmp paranoid_exit	/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
.macro paranoidzeroentry_ist sym do_sym ist
ENTRY(\sym)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call save_paranoid
	TRACE_IRQS_OFF
	movq %rsp,%rdi		/* pt_regs pointer */
	xorl %esi,%esi		/* no error code */
	subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
	call \do_sym
	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
	jmp paranoid_exit	/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm
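
/*
 * The subq/addq of EXCEPTION_STKSZ around \do_sym above temporarily moves
 * the TSS IST pointer down, so that a nested exception of the same kind
 * taken while the handler runs gets a fresh stack instead of clobbering
 * the one in use (assumed intent; used for the debug-stack entries below).
 */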

.macro errorentry sym do_sym
ENTRY(\sym)
	XCPT_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call error_entry
	DEFAULT_FRAME 0
	movq %rsp,%rdi			/* pt_regs pointer */
	movq ORIG_RAX(%rsp),%rsi	/* get error code */
	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
	call \do_sym
	jmp error_exit			/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

	/* error code is on the stack already */
.macro paranoiderrorentry sym do_sym
ENTRY(\sym)
	XCPT_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	call save_paranoid
	DEFAULT_FRAME 0
	TRACE_IRQS_OFF
	movq %rsp,%rdi			/* pt_regs pointer */
	movq ORIG_RAX(%rsp),%rsi	/* get error code */
	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
	call \do_sym
	jmp paranoid_exit		/* %ebx: no swapgs flag */
	CFI_ENDPROC
END(\sym)
.endm

zeroentry divide_error do_divide_error
zeroentry overflow do_overflow
zeroentry bounds do_bounds
zeroentry invalid_op do_invalid_op
zeroentry device_not_available do_device_not_available
paranoiderrorentry double_fault do_double_fault
zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
errorentry invalid_TSS do_invalid_TSS
errorentry segment_not_present do_segment_not_present
zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
zeroentry coprocessor_error do_coprocessor_error
errorentry alignment_check do_alignment_check
zeroentry simd_coprocessor_error do_simd_coprocessor_error


	/* Reload gs selector with exception handling */
	/* edi:  new selector */
ENTRY(native_load_gs_index)
	CFI_STARTPROC
	pushfq_cfi
	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
	SWAPGS
gs_change:
	movl %edi,%gs
2:	mfence		/* workaround */
	SWAPGS
	popfq_cfi
	ret
	CFI_ENDPROC
END(native_load_gs_index)

	.section __ex_table,"a"
	.align 8
	.quad gs_change,bad_gs
	.previous
	.section .fixup,"ax"
	/* running with kernelgs */
bad_gs:
	SWAPGS			/* switch back to user gs */
	xorl %eax,%eax
	movl %eax,%gs
	jmp  2b
	.previous

ENTRY(kernel_thread_helper)
	pushq $0		# fake return address
	CFI_STARTPROC
	/*
	 * Here we are in the child and the registers are set as they were
	 * at kernel_thread() invocation in the parent.
	 */
	call *%rsi
	# exit
	mov %eax, %edi
	call do_exit
	ud2			# padding for call trace
	CFI_ENDPROC
END(kernel_thread_helper)

/*
 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
 *
 * C extern interface:
 *	 extern long execve(const char *name, char **argv, char **envp)
 *
 * asm input arguments:
 *	rdi: name, rsi: argv, rdx: envp
 *
 * We want to fall back into:
 *	extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
 *
 * do_sys_execve asm fallback arguments:
 *	rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
 */
ENTRY(kernel_execve)
	CFI_STARTPROC
	FAKE_STACK_FRAME $0
	SAVE_ALL
	movq %rsp,%rcx
	call sys_execve
	movq %rax, RAX(%rsp)
	RESTORE_REST
	testq %rax,%rax
	je int_ret_from_sys_call
	RESTORE_ARGS
	UNFAKE_STACK_FRAME
	ret
	CFI_ENDPROC
END(kernel_execve)

/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(call_softirq)
	CFI_STARTPROC
	pushq_cfi %rbp
	CFI_REL_OFFSET rbp,0
	mov  %rsp,%rbp
	CFI_DEF_CFA_REGISTER rbp
	incl PER_CPU_VAR(irq_count)
	cmove PER_CPU_VAR(irq_stack_ptr),%rsp
	push  %rbp			# backlink for old unwinder
	call __do_softirq
	leaveq
	CFI_RESTORE		rbp
	CFI_DEF_CFA_REGISTER	rsp
	CFI_ADJUST_CFA_OFFSET   -8
	decl PER_CPU_VAR(irq_count)
	ret
	CFI_ENDPROC
END(call_softirq)

#ifdef CONFIG_XEN
zeroentry xen_hypervisor_callback xen_do_hypervisor_callback

/*
 * A note on the "critical region" in our callback handler.
 * We want to avoid stacking callback handlers due to events occurring
 * during handling of the last event. To do this, we keep events disabled
 * until we've done all processing. HOWEVER, we must enable events before
 * popping the stack frame (can't be done atomically) and so it would still
 * be possible to get enough handler activations to overflow the stack.
 * Although unlikely, bugs of that kind are hard to track down, so we'd
 * like to avoid the possibility.
 * So, on entry to the handler we detect whether we interrupted an
 * existing activation in its critical region -- if so, we pop the current
 * activation and restart the handler using the previous one.
 */
ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
	CFI_STARTPROC
/*
 * Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
 * see the correct pointer to the pt_regs
 */
	movq %rdi, %rsp            # we don't return, adjust the stack frame
	CFI_ENDPROC
	DEFAULT_FRAME
11:	incl PER_CPU_VAR(irq_count)
	movq %rsp,%rbp
	CFI_DEF_CFA_REGISTER rbp
	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
	pushq %rbp			# backlink for old unwinder
	call xen_evtchn_do_upcall
	popq %rsp
	CFI_DEF_CFA_REGISTER rsp
	decl PER_CPU_VAR(irq_count)
	jmp  error_exit
	CFI_ENDPROC
END(xen_do_hypervisor_callback)

/*
 * Hypervisor uses this for application faults while it executes.
 * We get here for two reasons:
 *  1. Fault while reloading DS, ES, FS or GS
 *  2. Fault while executing IRET
 * Category 1 we do not need to fix up as Xen has already reloaded all segment
 * registers that could be reloaded and zeroed the others.
 * Category 2 we fix up by killing the current process. We cannot use the
 * normal Linux return path in this case because if we use the IRET hypercall
 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
 * We distinguish between categories by comparing each saved segment register
 * with its current contents: any discrepancy means we are in category 1.
 */
ENTRY(xen_failsafe_callback)
	INTR_FRAME 1 (6*8)
	/*CFI_REL_OFFSET gs,GS*/
	/*CFI_REL_OFFSET fs,FS*/
	/*CFI_REL_OFFSET es,ES*/
	/*CFI_REL_OFFSET ds,DS*/
	CFI_REL_OFFSET r11,8
	CFI_REL_OFFSET rcx,0
	movw %ds,%cx
	cmpw %cx,0x10(%rsp)
	CFI_REMEMBER_STATE
	jne 1f
	movw %es,%cx
	cmpw %cx,0x18(%rsp)
	jne 1f
	movw %fs,%cx
	cmpw %cx,0x20(%rsp)
	jne 1f
	movw %gs,%cx
	cmpw %cx,0x28(%rsp)
	jne 1f
	/* All segments match their saved values => Category 2 (Bad IRET). */
	movq (%rsp),%rcx
	CFI_RESTORE rcx
	movq 8(%rsp),%r11
	CFI_RESTORE r11
	addq $0x30,%rsp
	CFI_ADJUST_CFA_OFFSET -0x30
	pushq_cfi $0	/* RIP */
	pushq_cfi %r11
	pushq_cfi %rcx
	jmp general_protection
	CFI_RESTORE_STATE
1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
	movq (%rsp),%rcx
	CFI_RESTORE rcx
	movq 8(%rsp),%r11
	CFI_RESTORE r11
	addq $0x30,%rsp
	CFI_ADJUST_CFA_OFFSET -0x30
	pushq_cfi $0
	SAVE_ALL
	jmp error_exit
	CFI_ENDPROC
END(xen_failsafe_callback)

apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
	xen_hvm_callback_vector xen_evtchn_do_upcall

#endif /* CONFIG_XEN */

/*
 * Some functions should be protected against kprobes
 */
	.pushsection .kprobes.text, "ax"

paranoidzeroentry_ist debug do_debug DEBUG_STACK
paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
paranoiderrorentry stack_segment do_stack_segment
#ifdef CONFIG_XEN
zeroentry xen_debug do_debug
zeroentry xen_int3 do_int3
errorentry xen_stack_segment do_stack_segment
#endif
errorentry general_protection do_general_protection
errorentry page_fault do_page_fault
#ifdef CONFIG_KVM_GUEST
errorentry async_page_fault do_async_page_fault
#endif
#ifdef CONFIG_X86_MCE
paranoidzeroentry machine_check *machine_check_vector(%rip)
#endif

	/*
	 * "Paranoid" exit path from exception stack.
	 * Paranoid because this is used by NMIs and cannot take
	 * any kernel state for granted.
	 * We don't do kernel preemption checks here, because only
	 * NMI should be common and it does not enable IRQs and
	 * cannot get reschedule ticks.
	 *
	 * "trace" is 0 for the NMI handler only, because irq-tracing
	 * is fundamentally NMI-unsafe. (we cannot change the soft and
	 * hard flags at once, atomically)
	 */

	/* ebx:	no swapgs flag */
ENTRY(paranoid_exit)
	DEFAULT_FRAME
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	testl %ebx,%ebx				/* swapgs needed? */
	jnz paranoid_restore
	testl $3,CS(%rsp)
	jnz   paranoid_userspace
paranoid_swapgs:
	TRACE_IRQS_IRETQ 0
	SWAPGS_UNSAFE_STACK
	RESTORE_ALL 8
	jmp irq_return
paranoid_restore:
	TRACE_IRQS_IRETQ 0
	RESTORE_ALL 8
	jmp irq_return
paranoid_userspace:
	GET_THREAD_INFO(%rcx)
	movl TI_flags(%rcx),%ebx
	andl $_TIF_WORK_MASK,%ebx
	jz paranoid_swapgs
	movq %rsp,%rdi			/* &pt_regs */
	call sync_regs
	movq %rax,%rsp			/* switch stack for scheduling */
	testl $_TIF_NEED_RESCHED,%ebx
	jnz paranoid_schedule
	movl %ebx,%edx			/* arg3: thread flags */
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_NONE)
	xorl %esi,%esi 			/* arg2: oldset */
	movq %rsp,%rdi 			/* arg1: &pt_regs */
	call do_notify_resume
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	jmp paranoid_userspace
paranoid_schedule:
	TRACE_IRQS_ON
	ENABLE_INTERRUPTS(CLBR_ANY)
	call schedule
	DISABLE_INTERRUPTS(CLBR_ANY)
	TRACE_IRQS_OFF
	jmp paranoid_userspace
	CFI_ENDPROC
END(paranoid_exit)

/*
 * Exception entry point. This expects an error code/orig_rax on the stack.
 * Returns the "no swapgs" flag in %ebx.
 */
ENTRY(error_entry)
	XCPT_FRAME
	CFI_ADJUST_CFA_OFFSET 15*8
	/* oldrax contains error code */
	cld
	movq_cfi rdi, RDI+8
	movq_cfi rsi, RSI+8
	movq_cfi rdx, RDX+8
	movq_cfi rcx, RCX+8
	movq_cfi rax, RAX+8
	movq_cfi  r8,  R8+8
	movq_cfi  r9,  R9+8
	movq_cfi r10, R10+8
	movq_cfi r11, R11+8
	movq_cfi rbx, RBX+8
	movq_cfi rbp, RBP+8
	movq_cfi r12, R12+8
	movq_cfi r13, R13+8
	movq_cfi r14, R14+8
	movq_cfi r15, R15+8
	xorl %ebx,%ebx
	testl $3,CS+8(%rsp)
	je error_kernelspace
error_swapgs:
	SWAPGS
error_sti:
	TRACE_IRQS_OFF
	ret

/*
 * There are two places in the kernel that can potentially fault with
 * usergs. Handle them here. The exception handlers after iret run with
 * kernel gs again, so don't set the user space flag. B stepping K8s
 * sometimes report a truncated RIP for IRET exceptions returning to
 * compat mode. Check for these here too.
 */
error_kernelspace:
	incl %ebx
	leaq irq_return(%rip),%rcx
	cmpq %rcx,RIP+8(%rsp)
	je error_swapgs
	movl %ecx,%eax	/* zero extend */
	cmpq %rax,RIP+8(%rsp)
	je bstep_iret
	cmpq $gs_change,RIP+8(%rsp)
	je error_swapgs
	jmp error_sti

bstep_iret:
	/* Fix truncated RIP */
	movq %rcx,RIP+8(%rsp)
	jmp error_swapgs
	CFI_ENDPROC
END(error_entry)


/* ebx:	no swapgs flag (1: don't need swapgs, 0: need it) */
ENTRY(error_exit)
	DEFAULT_FRAME
	movl %ebx,%eax
	RESTORE_REST
	DISABLE_INTERRUPTS(CLBR_NONE)
	TRACE_IRQS_OFF
	GET_THREAD_INFO(%rcx)
	testl %eax,%eax
	jne retint_kernel
	LOCKDEP_SYS_EXIT_IRQ
	movl TI_flags(%rcx),%edx
	movl $_TIF_WORK_MASK,%edi
	andl %edi,%edx
	jnz retint_careful
	jmp retint_swapgs
	CFI_ENDPROC
END(error_exit)

/*
 * Test if a given stack is an NMI stack or not.
 */
	.macro test_in_nmi reg stack nmi_ret normal_ret
	cmpq %\reg, \stack
	ja \normal_ret
	subq $EXCEPTION_STKSZ, %\reg
	cmpq %\reg, \stack
	jb \normal_ret
	jmp \nmi_ret
	.endm
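
/*
 * In other words, test_in_nmi branches to \nmi_ret iff \stack lies within
 * [\reg - EXCEPTION_STKSZ, \reg], i.e. inside the exception stack whose top
 * is passed in \reg (used here with the NMI IST stack); otherwise it
 * branches to \normal_ret.
 */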

	/* runs on exception stack */
ENTRY(nmi)
	INTR_FRAME
	PARAVIRT_ADJUST_EXCEPTION_FRAME
	/*
	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
	 * the iretq it performs will take us out of NMI context.
	 * This means that we can have nested NMIs where the next
	 * NMI is using the top of the stack of the previous NMI. We
	 * can't let it execute because the nested NMI will corrupt the
	 * stack of the previous NMI. NMI handlers are not re-entrant
	 * anyway.
	 *
	 * To handle this case we do the following:
	 *  Check a special location on the stack that contains
	 *  a variable that is set when NMIs are executing.
	 *  The interrupted task's stack is also checked to see if it
	 *  is an NMI stack.
	 *  If the variable is not set and the stack is not the NMI
	 *  stack then:
	 *    o Set the special variable on the stack
	 *    o Copy the interrupt frame into a "saved" location on the stack
	 *    o Copy the interrupt frame into a "copy" location on the stack
	 *    o Continue processing the NMI
	 *  If the variable is set or the previous stack is the NMI stack:
	 *    o Modify the "copy" location to jump to repeat_nmi
	 *    o Return back to the first NMI
	 *
	 * Now on exit of the first NMI, we first clear the stack variable.
	 * The NMI stack will tell any nested NMIs at that point that it is
	 * nested. Then we pop the stack normally with iret, and if there was
	 * a nested NMI that updated the copy interrupt stack frame, a
	 * jump will be made to the repeat_nmi code that will handle the second
	 * NMI.
	 */

	/* Use %rdx as our temp variable throughout */
	pushq_cfi %rdx

	/*
	 * If %cs was not the kernel segment, then the NMI triggered in user
	 * space, which means it is definitely not nested.
	 */
	cmpl $__KERNEL_CS, 16(%rsp)
	jne first_nmi

	/*
	 * Check the special variable on the stack to see if NMIs are
	 * executing.
	 */
	cmpl $1, -8(%rsp)
	je nested_nmi

	/*
	 * Now test if the previous stack was an NMI stack.
	 * We need the double check. We check the NMI stack to satisfy the
	 * race when the first NMI clears the variable before returning.
	 * We check the variable because the first NMI could be in a
	 * breakpoint routine using a breakpoint stack.
	 */
	lea 6*8(%rsp), %rdx
	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi

nested_nmi:
	/*
	 * Do nothing if we interrupted the fixup in repeat_nmi.
	 * It's about to repeat the NMI handler, so we are fine
	 * with ignoring this one.
	 */
	movq $repeat_nmi, %rdx
	cmpq 8(%rsp), %rdx
	ja 1f
	movq $end_repeat_nmi, %rdx
	cmpq 8(%rsp), %rdx
	ja nested_nmi_out

1:
	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
	leaq -6*8(%rsp), %rdx
	movq %rdx, %rsp
	CFI_ADJUST_CFA_OFFSET 6*8
	pushq_cfi $__KERNEL_DS
	pushq_cfi %rdx
	pushfq_cfi
	pushq_cfi $__KERNEL_CS
	pushq_cfi $repeat_nmi

	/* Put stack back */
	addq $(11*8), %rsp
	CFI_ADJUST_CFA_OFFSET -11*8

nested_nmi_out:
	popq_cfi %rdx

	/* No need to check faults here */
	INTERRUPT_RETURN

first_nmi:
	/*
	 * Because nested NMIs will use the pushed location that we
	 * stored in rdx, we must keep that space available.
	 * Here's what our stack frame will look like:
	 * +-------------------------+
	 * | original SS             |
	 * | original Return RSP     |
	 * | original RFLAGS         |
	 * | original CS             |
	 * | original RIP            |
	 * +-------------------------+
	 * | temp storage for rdx    |
	 * +-------------------------+
	 * | NMI executing variable  |
	 * +-------------------------+
	 * | Saved SS                |
	 * | Saved Return RSP        |
	 * | Saved RFLAGS            |
	 * | Saved CS                |
	 * | Saved RIP               |
	 * +-------------------------+
	 * | copied SS               |
	 * | copied Return RSP       |
	 * | copied RFLAGS           |
	 * | copied CS               |
	 * | copied RIP              |
	 * +-------------------------+
	 * | pt_regs                 |
	 * +-------------------------+
	 *
	 * The saved RIP is used to fix up the copied RIP that a nested
	 * NMI may zero out. The original stack frame and the temp storage
	 * are also used by nested NMIs and cannot be trusted on exit.
	 */
	/* Set the NMI executing variable on the stack. */
	pushq_cfi $1

	/* Copy the stack frame to the Saved frame */
	.rept 5
	pushq_cfi 6*8(%rsp)
	.endr

	/* Make another copy, this one may be modified by nested NMIs */
	.rept 5
	pushq_cfi 4*8(%rsp)
	.endr

	/* Do not pop rdx, nested NMIs will corrupt it */
	movq 11*8(%rsp), %rdx

	/*
	 * Everything below this point can be preempted by a nested
	 * NMI if the first NMI took an exception. Repeated NMIs
	 * caused by an exception and nested NMI will start here, and
	 * can still be preempted by another NMI.
	 */
restart_nmi:
	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
	subq $ORIG_RAX-R15, %rsp
	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
	/*
	 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
	 * as we should not be calling schedule in NMI context,
	 * even with normal interrupts enabled. An NMI should not be
	 * setting NEED_RESCHED or anything that normal interrupts and
	 * exceptions might do.
	 */
	call save_paranoid
	DEFAULT_FRAME 0
	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
	movq %rsp,%rdi
	movq $-1,%rsi
	call do_nmi
	testl %ebx,%ebx				/* swapgs needed? */
	jnz nmi_restore
nmi_swapgs:
	SWAPGS_UNSAFE_STACK
nmi_restore:
	RESTORE_ALL 8
	/* Clear the NMI executing stack variable */
	movq $0, 10*8(%rsp)
	jmp irq_return
	CFI_ENDPROC
END(nmi)

	/*
	 * If an NMI hit an iret because of an exception or breakpoint,
	 * it can lose its NMI context, and a nested NMI may come in.
	 * In that case, the nested NMI will change the preempted NMI's
	 * stack to jump to here when it does the final iret.
	 */
repeat_nmi:
	INTR_FRAME
	/* Update the stack variable to say we are still in NMI */
	movq $1, 5*8(%rsp)

	/* copy the saved stack back to copy stack */
	.rept 5
	pushq_cfi 4*8(%rsp)
	.endr

	jmp restart_nmi
	CFI_ENDPROC
end_repeat_nmi:

ENTRY(ignore_sysret)
	CFI_STARTPROC
	mov $-ENOSYS,%eax
	sysret
	CFI_ENDPROC
END(ignore_sysret)

/*
 * End of kprobes section
 */
	.popsection
