Merge branch 'WIP.x86-pti.entry-for-linus' of git://git.kernel.org/pub/scm/linux...
authorLinus Torvalds <torvalds@linux-foundation.org>
Mon, 18 Dec 2017 16:59:15 +0000 (08:59 -0800)
committerLinus Torvalds <torvalds@linux-foundation.org>
Mon, 18 Dec 2017 16:59:15 +0000 (08:59 -0800)
Pull x86 syscall entry code changes for PTI from Ingo Molnar:
 "The main changes here are Andy Lutomirski's changes to switch the
  x86-64 entry code to use the 'per CPU entry trampoline stack'. This,
  besides helping fix KASLR leaks (the pending Page Table Isolation
  (PTI) work), also robustifies the x86 entry code"

* 'WIP.x86-pti.entry-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (26 commits)
  x86/cpufeatures: Make CPU bugs sticky
  x86/paravirt: Provide a way to check for hypervisors
  x86/paravirt: Dont patch flush_tlb_single
  x86/entry/64: Make cpu_entry_area.tss read-only
  x86/entry: Clean up the SYSENTER_stack code
  x86/entry/64: Remove the SYSENTER stack canary
  x86/entry/64: Move the IST stacks into struct cpu_entry_area
  x86/entry/64: Create a per-CPU SYSCALL entry trampoline
  x86/entry/64: Return to userspace from the trampoline stack
  x86/entry/64: Use a per-CPU trampoline stack for IDT entries
  x86/espfix/64: Stop assuming that pt_regs is on the entry stack
  x86/entry/64: Separate cpu_current_top_of_stack from TSS.sp0
  x86/entry: Remap the TSS into the CPU entry area
  x86/entry: Move SYSENTER_stack to the beginning of struct tss_struct
  x86/dumpstack: Handle stack overflow on all stacks
  x86/entry: Fix assumptions that the HW TSS is at the beginning of cpu_tss
  x86/kasan/64: Teach KASAN about the cpu_entry_area
  x86/mm/fixmap: Generalize the GDT fixmap mechanism, introduce struct cpu_entry_area
  x86/entry/gdt: Put per-CPU GDT remaps in ascending order
  x86/dumpstack: Add get_stack_info() support for the SYSENTER stack
  ...

40 files changed:
arch/x86/entry/entry_32.S
arch/x86/entry/entry_64.S
arch/x86/entry/entry_64_compat.S
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/desc.h
arch/x86/include/asm/fixmap.h
arch/x86/include/asm/hypervisor.h
arch/x86/include/asm/irqflags.h
arch/x86/include/asm/kdebug.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/stacktrace.h
arch/x86/include/asm/switch_to.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/traps.h
arch/x86/include/asm/unwind.h
arch/x86/kernel/asm-offsets.c
arch/x86/kernel/asm-offsets_32.c
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/doublefault.c
arch/x86/kernel/dumpstack.c
arch/x86/kernel/dumpstack_32.c
arch/x86/kernel/dumpstack_64.c
arch/x86/kernel/ioport.c
arch/x86/kernel/irq.c
arch/x86/kernel/irq_64.c
arch/x86/kernel/paravirt_patch_64.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/traps.c
arch/x86/kernel/unwind_orc.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/kvm/vmx.c
arch/x86/lib/delay.c
arch/x86/mm/kasan_init_64.c
arch/x86/power/cpu.c
arch/x86/xen/enlighten_pv.c
arch/x86/xen/mmu_pv.c

index 4838037f97f6edffda62b5b045c837fcc29402f0..bd8b57a5c874bc37ab23fb0489f840ba0b17168e 100644 (file)
@@ -941,7 +941,8 @@ ENTRY(debug)
        movl    %esp, %eax                      # pt_regs pointer
 
        /* Are we currently on the SYSENTER stack? */
-       PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+       movl    PER_CPU_VAR(cpu_entry_area), %ecx
+       addl    $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
        subl    %eax, %ecx      /* ecx = (end of SYSENTER_stack) - esp */
        cmpl    $SIZEOF_SYSENTER_stack, %ecx
        jb      .Ldebug_from_sysenter_stack
@@ -984,7 +985,8 @@ ENTRY(nmi)
        movl    %esp, %eax                      # pt_regs pointer
 
        /* Are we currently on the SYSENTER stack? */
-       PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
+       movl    PER_CPU_VAR(cpu_entry_area), %ecx
+       addl    $CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx
        subl    %eax, %ecx      /* ecx = (end of SYSENTER_stack) - esp */
        cmpl    $SIZEOF_SYSENTER_stack, %ecx
        jb      .Lnmi_from_sysenter_stack
index f81d50d7ceacdefa06d61482687937096c68421c..423885bee398c6c9cb80f3bd4a2ec8317e41062a 100644 (file)
@@ -140,6 +140,64 @@ END(native_usergs_sysret64)
  * with them due to bugs in both AMD and Intel CPUs.
  */
 
+       .pushsection .entry_trampoline, "ax"
+
+/*
+ * The code in here gets remapped into cpu_entry_area's trampoline.  This means
+ * that the assembler and linker have the wrong idea as to where this code
+ * lives (and, in fact, it's mapped more than once, so it's not even at a
+ * fixed address).  So we can't reference any symbols outside the entry
+ * trampoline and expect it to work.
+ *
+ * Instead, we carefully abuse %rip-relative addressing.
+ * _entry_trampoline(%rip) refers to the start of the remapped) entry
+ * trampoline.  We can thus find cpu_entry_area with this macro:
+ */
+
+#define CPU_ENTRY_AREA \
+       _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
+
+/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
+#define RSP_SCRATCH    CPU_ENTRY_AREA_SYSENTER_stack + \
+                       SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA
+
+ENTRY(entry_SYSCALL_64_trampoline)
+       UNWIND_HINT_EMPTY
+       swapgs
+
+       /* Stash the user RSP. */
+       movq    %rsp, RSP_SCRATCH
+
+       /* Load the top of the task stack into RSP */
+       movq    CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
+
+       /* Start building the simulated IRET frame. */
+       pushq   $__USER_DS                      /* pt_regs->ss */
+       pushq   RSP_SCRATCH                     /* pt_regs->sp */
+       pushq   %r11                            /* pt_regs->flags */
+       pushq   $__USER_CS                      /* pt_regs->cs */
+       pushq   %rcx                            /* pt_regs->ip */
+
+       /*
+        * x86 lacks a near absolute jump, and we can't jump to the real
+        * entry text with a relative jump.  We could push the target
+        * address and then use retq, but this destroys the pipeline on
+        * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead,
+        * spill RDI and restore it in a second-stage trampoline.
+        */
+       pushq   %rdi
+       movq    $entry_SYSCALL_64_stage2, %rdi
+       jmp     *%rdi
+END(entry_SYSCALL_64_trampoline)
+
+       .popsection
+
+ENTRY(entry_SYSCALL_64_stage2)
+       UNWIND_HINT_EMPTY
+       popq    %rdi
+       jmp     entry_SYSCALL_64_after_hwframe
+END(entry_SYSCALL_64_stage2)
+
 ENTRY(entry_SYSCALL_64)
        UNWIND_HINT_EMPTY
        /*
@@ -330,8 +388,24 @@ syscall_return_via_sysret:
        popq    %rsi    /* skip rcx */
        popq    %rdx
        popq    %rsi
+
+       /*
+        * Now all regs are restored except RSP and RDI.
+        * Save old stack pointer and switch to trampoline stack.
+        */
+       movq    %rsp, %rdi
+       movq    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+       pushq   RSP-RDI(%rdi)   /* RSP */
+       pushq   (%rdi)          /* RDI */
+
+       /*
+        * We are on the trampoline stack.  All regs except RDI are live.
+        * We can do future final exit work right here.
+        */
+
        popq    %rdi
-       movq    RSP-ORIG_RAX(%rsp), %rsp
+       popq    %rsp
        USERGS_SYSRET64
 END(entry_SYSCALL_64)
 
@@ -466,12 +540,13 @@ END(irq_entries_start)
 
 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
 #ifdef CONFIG_DEBUG_ENTRY
-       pushfq
-       testl $X86_EFLAGS_IF, (%rsp)
+       pushq %rax
+       SAVE_FLAGS(CLBR_RAX)
+       testl $X86_EFLAGS_IF, %eax
        jz .Lokay_\@
        ud2
 .Lokay_\@:
-       addq $8, %rsp
+       popq %rax
 #endif
 .endm
 
@@ -563,6 +638,13 @@ END(irq_entries_start)
 /* 0(%rsp): ~(interrupt number) */
        .macro interrupt func
        cld
+
+       testb   $3, CS-ORIG_RAX(%rsp)
+       jz      1f
+       SWAPGS
+       call    switch_to_thread_stack
+1:
+
        ALLOC_PT_GPREGS_ON_STACK
        SAVE_C_REGS
        SAVE_EXTRA_REGS
@@ -572,12 +654,8 @@ END(irq_entries_start)
        jz      1f
 
        /*
-        * IRQ from user mode.  Switch to kernel gsbase and inform context
-        * tracking that we're in kernel mode.
-        */
-       SWAPGS
-
-       /*
+        * IRQ from user mode.
+        *
         * We need to tell lockdep that IRQs are off.  We can't do this until
         * we fix gsbase, and we should do it before enter_from_user_mode
         * (which can take locks).  Since TRACE_IRQS_OFF idempotent,
@@ -630,10 +708,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
        ud2
 1:
 #endif
-       SWAPGS
        POP_EXTRA_REGS
-       POP_C_REGS
-       addq    $8, %rsp        /* skip regs->orig_ax */
+       popq    %r11
+       popq    %r10
+       popq    %r9
+       popq    %r8
+       popq    %rax
+       popq    %rcx
+       popq    %rdx
+       popq    %rsi
+
+       /*
+        * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
+        * Save old stack pointer and switch to trampoline stack.
+        */
+       movq    %rsp, %rdi
+       movq    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+       /* Copy the IRET frame to the trampoline stack. */
+       pushq   6*8(%rdi)       /* SS */
+       pushq   5*8(%rdi)       /* RSP */
+       pushq   4*8(%rdi)       /* EFLAGS */
+       pushq   3*8(%rdi)       /* CS */
+       pushq   2*8(%rdi)       /* RIP */
+
+       /* Push user RDI on the trampoline stack. */
+       pushq   (%rdi)
+
+       /*
+        * We are on the trampoline stack.  All regs except RDI are live.
+        * We can do future final exit work right here.
+        */
+
+       /* Restore RDI. */
+       popq    %rdi
+       SWAPGS
        INTERRUPT_RETURN
 
 
@@ -829,7 +938,33 @@ apicinterrupt IRQ_WORK_VECTOR                      irq_work_interrupt              smp_irq_work_interrupt
 /*
  * Exception entry points.
  */
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+
+/*
+ * Switch to the thread stack.  This is called with the IRET frame and
+ * orig_ax on the stack.  (That is, RDI..R12 are not on the stack and
+ * space has not been allocated for them.)
+ */
+ENTRY(switch_to_thread_stack)
+       UNWIND_HINT_FUNC
+
+       pushq   %rdi
+       movq    %rsp, %rdi
+       movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+       UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
+
+       pushq   7*8(%rdi)               /* regs->ss */
+       pushq   6*8(%rdi)               /* regs->rsp */
+       pushq   5*8(%rdi)               /* regs->eflags */
+       pushq   4*8(%rdi)               /* regs->cs */
+       pushq   3*8(%rdi)               /* regs->ip */
+       pushq   2*8(%rdi)               /* regs->orig_ax */
+       pushq   8(%rdi)                 /* return address */
+       UNWIND_HINT_FUNC
+
+       movq    (%rdi), %rdi
+       ret
+END(switch_to_thread_stack)
 
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
@@ -848,11 +983,12 @@ ENTRY(\sym)
 
        ALLOC_PT_GPREGS_ON_STACK
 
-       .if \paranoid
-       .if \paranoid == 1
+       .if \paranoid < 2
        testb   $3, CS(%rsp)                    /* If coming from userspace, switch stacks */
-       jnz     1f
+       jnz     .Lfrom_usermode_switch_stack_\@
        .endif
+
+       .if \paranoid
        call    paranoid_entry
        .else
        call    error_entry
@@ -894,20 +1030,15 @@ ENTRY(\sym)
        jmp     error_exit
        .endif
 
-       .if \paranoid == 1
+       .if \paranoid < 2
        /*
-        * Paranoid entry from userspace.  Switch stacks and treat it
+        * Entry from userspace.  Switch stacks and treat it
         * as a normal entry.  This means that paranoid handlers
         * run in real process context if user_mode(regs).
         */
-1:
+.Lfrom_usermode_switch_stack_\@:
        call    error_entry
 
-
-       movq    %rsp, %rdi                      /* pt_regs pointer */
-       call    sync_regs
-       movq    %rax, %rsp                      /* switch stack */
-
        movq    %rsp, %rdi                      /* pt_regs pointer */
 
        .if \has_error_code
@@ -1170,6 +1301,14 @@ ENTRY(error_entry)
        SWAPGS
 
 .Lerror_entry_from_usermode_after_swapgs:
+       /* Put us onto the real thread stack. */
+       popq    %r12                            /* save return addr in %12 */
+       movq    %rsp, %rdi                      /* arg0 = pt_regs pointer */
+       call    sync_regs
+       movq    %rax, %rsp                      /* switch stack */
+       ENCODE_FRAME_POINTER
+       pushq   %r12
+
        /*
         * We need to tell lockdep that IRQs are off.  We can't do this until
         * we fix gsbase, and we should do it before enter_from_user_mode
index 568e130d932cd2a7d44393e5fc52408cffe64f34..95ad40eb7effbdb6f605285df62d1e0bd33a6cac 100644 (file)
@@ -48,7 +48,7 @@
  */
 ENTRY(entry_SYSENTER_compat)
        /* Interrupts are off on entry. */
-       SWAPGS_UNSAFE_STACK
+       SWAPGS
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
        /*
@@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat)
         */
        movl    %eax, %eax
 
-       /* Construct struct pt_regs on stack (iret frame is already on stack) */
        pushq   %rax                    /* pt_regs->orig_ax */
+
+       /* switch to thread stack expects orig_ax to be pushed */
+       call    switch_to_thread_stack
+
        pushq   %rdi                    /* pt_regs->di */
        pushq   %rsi                    /* pt_regs->si */
        pushq   %rdx                    /* pt_regs->dx */
index bf6a76202a779ee131b4df8c89449ab52abd0a79..ea9a7dde62e5c4d551ba89e429f911fb5c6603fd 100644 (file)
@@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
        set_bit(bit, (unsigned long *)cpu_caps_set);    \
 } while (0)
 
+#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
+
 #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
 /*
  * Static testing of CPU features.  Used the same as boot_cpu_has().
index 4011cb03ef08e52db15f52779ce366c26359a34b..aab4fe9f49f868a03a5c2da5eeb788a6bb80c24d 100644 (file)
@@ -60,17 +60,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
        return this_cpu_ptr(&gdt_page)->gdt;
 }
 
-/* Get the fixmap index for a specific processor */
-static inline unsigned int get_cpu_gdt_ro_index(int cpu)
-{
-       return FIX_GDT_REMAP_BEGIN + cpu;
-}
-
 /* Provide the fixmap address of the remapped GDT */
 static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
 {
-       unsigned int idx = get_cpu_gdt_ro_index(cpu);
-       return (struct desc_struct *)__fix_to_virt(idx);
+       return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
 }
 
 /* Provide the current read-only GDT */
@@ -185,7 +178,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr,
 #endif
 }
 
-static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
 {
        struct desc_struct *d = get_cpu_gdt_rw(cpu);
        tss_desc tss;
index b0c505fe9a958c701fef6d96f281bb8ab1a773de..94fc4fa141275bcdd6eb43505fc8e5f20352aca4 100644 (file)
@@ -44,6 +44,45 @@ extern unsigned long __FIXADDR_TOP;
                         PAGE_SIZE)
 #endif
 
+/*
+ * cpu_entry_area is a percpu region in the fixmap that contains things
+ * needed by the CPU and early entry/exit code.  Real types aren't used
+ * for all fields here to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+       char gdt[PAGE_SIZE];
+
+       /*
+        * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as
+        * a a read-only guard page.
+        */
+       struct SYSENTER_stack_page SYSENTER_stack_page;
+
+       /*
+        * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
+        * we need task switches to work, and task switches write to the TSS.
+        */
+       struct tss_struct tss;
+
+       char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+       /*
+        * Exception stacks used for IST entries.
+        *
+        * In the future, this should have a separate slot for each stack
+        * with guard pages between them.
+        */
+       char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+};
+
+#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE)
+
+extern void setup_cpu_entry_areas(void);
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -101,8 +140,8 @@ enum fixed_addresses {
        FIX_LNW_VRTC,
 #endif
        /* Fixmap entries to remap the GDTs, one per processor. */
-       FIX_GDT_REMAP_BEGIN,
-       FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+       FIX_CPU_ENTRY_AREA_TOP,
+       FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,
 
 #ifdef CONFIG_ACPI_APEI_GHES
        /* Used for GHES mapping from assorted contexts */
@@ -191,5 +230,30 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,
 void __early_set_fixmap(enum fixed_addresses idx,
                        phys_addr_t phys, pgprot_t flags);
 
+static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page)
+{
+       BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+       return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page;
+}
+
+#define __get_cpu_entry_area_offset_index(cpu, offset) ({              \
+       BUILD_BUG_ON(offset % PAGE_SIZE != 0);                          \
+       __get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE);       \
+       })
+
+#define get_cpu_entry_area_index(cpu, field)                           \
+       __get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field))
+
+static inline struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+       return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0));
+}
+
+static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu)
+{
+       return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack;
+}
+
 #endif /* !__ASSEMBLY__ */
 #endif /* _ASM_X86_FIXMAP_H */
index 1b0a5abcd8aeb6e700013c5434aaeb0bba7a152f..96aa6b9884dc5b3bc8d54c9ef1c6258eea13a0d0 100644 (file)
 #ifndef _ASM_X86_HYPERVISOR_H
 #define _ASM_X86_HYPERVISOR_H
 
-#ifdef CONFIG_HYPERVISOR_GUEST
-
-#include <asm/kvm_para.h>
-#include <asm/x86_init.h>
-#include <asm/xen/hypervisor.h>
-
-/*
- * x86 hypervisor information
- */
-
+/* x86 hypervisor types  */
 enum x86_hypervisor_type {
        X86_HYPER_NATIVE = 0,
        X86_HYPER_VMWARE,
@@ -39,6 +30,12 @@ enum x86_hypervisor_type {
        X86_HYPER_KVM,
 };
 
+#ifdef CONFIG_HYPERVISOR_GUEST
+
+#include <asm/kvm_para.h>
+#include <asm/x86_init.h>
+#include <asm/xen/hypervisor.h>
+
 struct hypervisor_x86 {
        /* Hypervisor name */
        const char      *name;
@@ -58,7 +55,15 @@ struct hypervisor_x86 {
 
 extern enum x86_hypervisor_type x86_hyper_type;
 extern void init_hypervisor_platform(void);
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+       return x86_hyper_type == type;
+}
 #else
 static inline void init_hypervisor_platform(void) { }
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+       return type == X86_HYPER_NATIVE;
+}
 #endif /* CONFIG_HYPERVISOR_GUEST */
 #endif /* _ASM_X86_HYPERVISOR_H */
index c8ef23f2c28f17c59308b9c41179c47f85e075ad..89f08955fff733c688a5ce4f4a0b8d74050ee617 100644 (file)
@@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
        swapgs;                                 \
        sysretl
 
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(x)          pushfq; popq %rax
+#endif
 #else
 #define INTERRUPT_RETURN               iret
 #define ENABLE_INTERRUPTS_SYSEXIT      sti; sysexit
index f86a8caa561e8873c3f34e6e8b8cd509ebadd819..395c9631e000a3a17aa574c1b25fcc2cafd5b5fb 100644 (file)
@@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
 extern void show_stack_regs(struct pt_regs *regs);
 extern void __show_regs(struct pt_regs *regs, int all);
+extern void show_iret_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
 
index 283efcaac8aff86f2c004bc23e4b8642cbf3d527..892df375b6155a51f584760efb9f9e77c3f732e8 100644 (file)
@@ -927,6 +927,15 @@ extern void default_banner(void);
        PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),       \
                  CLBR_NONE,                                            \
                  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(clobbers)                                        \
+       PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
+                 PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);        \
+                 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl);    \
+                 PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+#endif
+
 #endif /* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
index cc16fa882e3e760a40351cf3e7476ac9f25ffe00..1f2434ee9f806c4355a38599ab4485140a8cd1df 100644 (file)
@@ -163,9 +163,9 @@ enum cpuid_regs_idx {
 extern struct cpuinfo_x86      boot_cpu_data;
 extern struct cpuinfo_x86      new_cpu_data;
 
-extern struct tss_struct       doublefault_tss;
-extern __u32                   cpu_caps_cleared[NCAPINTS];
-extern __u32                   cpu_caps_set[NCAPINTS];
+extern struct x86_hw_tss       doublefault_tss;
+extern __u32                   cpu_caps_cleared[NCAPINTS + NBUGINTS];
+extern __u32                   cpu_caps_set[NCAPINTS + NBUGINTS];
 
 #ifdef CONFIG_SMP
 DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
@@ -253,6 +253,11 @@ static inline void load_cr3(pgd_t *pgdir)
        write_cr3(__sme_pa(pgdir));
 }
 
+/*
+ * Note that while the legacy 'TSS' name comes from 'Task State Segment',
+ * on modern x86 CPUs the TSS also holds information important to 64-bit mode,
+ * unrelated to the task-switch mechanism:
+ */
 #ifdef CONFIG_X86_32
 /* This is the TSS defined by the hardware. */
 struct x86_hw_tss {
@@ -305,7 +310,13 @@ struct x86_hw_tss {
 struct x86_hw_tss {
        u32                     reserved1;
        u64                     sp0;
+
+       /*
+        * We store cpu_current_top_of_stack in sp1 so it's always accessible.
+        * Linux does not use ring 1, so sp1 is not otherwise needed.
+        */
        u64                     sp1;
+
        u64                     sp2;
        u64                     reserved2;
        u64                     ist[7];
@@ -323,12 +334,22 @@ struct x86_hw_tss {
 #define IO_BITMAP_BITS                 65536
 #define IO_BITMAP_BYTES                        (IO_BITMAP_BITS/8)
 #define IO_BITMAP_LONGS                        (IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET               offsetof(struct tss_struct, io_bitmap)
+#define IO_BITMAP_OFFSET               (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
 #define INVALID_IO_BITMAP_OFFSET       0x8000
 
+struct SYSENTER_stack {
+       unsigned long           words[64];
+};
+
+struct SYSENTER_stack_page {
+       struct SYSENTER_stack stack;
+} __aligned(PAGE_SIZE);
+
 struct tss_struct {
        /*
-        * The hardware state:
+        * The fixed hardware portion.  This must not cross a page boundary
+        * at risk of violating the SDM's advice and potentially triggering
+        * errata.
         */
        struct x86_hw_tss       x86_tss;
 
@@ -339,18 +360,9 @@ struct tss_struct {
         * be within the limit.
         */
        unsigned long           io_bitmap[IO_BITMAP_LONGS + 1];
+} __aligned(PAGE_SIZE);
 
-#ifdef CONFIG_X86_32
-       /*
-        * Space for the temporary SYSENTER stack.
-        */
-       unsigned long           SYSENTER_stack_canary;
-       unsigned long           SYSENTER_stack[64];
-#endif
-
-} ____cacheline_aligned;
-
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
 
 /*
  * sizeof(unsigned long) coming from an extra "long" at the end
@@ -364,6 +376,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
 
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#else
+/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
+#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
 #endif
 
 /*
@@ -523,7 +538,7 @@ static inline void native_set_iopl_mask(unsigned mask)
 static inline void
 native_load_sp0(unsigned long sp0)
 {
-       this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+       this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 }
 
 static inline void native_swapgs(void)
@@ -535,12 +550,12 @@ static inline void native_swapgs(void)
 
 static inline unsigned long current_top_of_stack(void)
 {
-#ifdef CONFIG_X86_64
-       return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
-#else
-       /* sp0 on x86_32 is special in and around vm86 mode. */
+       /*
+        *  We can't read directly from tss.sp0: sp0 on x86_32 is special in
+        *  and around vm86 mode and sp0 on x86_64 is special because of the
+        *  entry trampoline.
+        */
        return this_cpu_read_stable(cpu_current_top_of_stack);
-#endif
 }
 
 static inline bool on_thread_stack(void)
index 8da111b3c342bbb61a9e630e101c8a83422a15ea..f8062bfd43a072dc950b23f242893b6ca3310652 100644 (file)
@@ -16,6 +16,7 @@ enum stack_type {
        STACK_TYPE_TASK,
        STACK_TYPE_IRQ,
        STACK_TYPE_SOFTIRQ,
+       STACK_TYPE_SYSENTER,
        STACK_TYPE_EXCEPTION,
        STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
 };
@@ -28,6 +29,8 @@ struct stack_info {
 bool in_task_stack(unsigned long *stack, struct task_struct *task,
                   struct stack_info *info);
 
+bool in_sysenter_stack(unsigned long *stack, struct stack_info *info);
+
 int get_stack_info(unsigned long *stack, struct task_struct *task,
                   struct stack_info *info, unsigned long *visit_mask);
 
index 8c6bd6863db9d6b737cd0649324c154f9b9798a3..9b6df68d8fd1eba26f3651faa5c8b8f4dcf223f1 100644 (file)
@@ -79,10 +79,10 @@ do {                                                                        \
 static inline void refresh_sysenter_cs(struct thread_struct *thread)
 {
        /* Only happens when SEP is enabled, no need to test "SEP"arately: */
-       if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+       if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
                return;
 
-       this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+       this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
        wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
 }
 #endif
@@ -90,10 +90,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
 /* This is used when switching tasks or entering/exiting vm86 mode. */
 static inline void update_sp0(struct task_struct *task)
 {
+       /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
 #ifdef CONFIG_X86_32
        load_sp0(task->thread.sp0);
 #else
-       load_sp0(task_top_of_stack(task));
+       if (static_cpu_has(X86_FEATURE_XENPV))
+               load_sp0(task_top_of_stack(task));
 #endif
 }
 
index 70f425947dc50f3e99ca639c0ead0d7e1cce636d..00223333821a96616647a9cbb6fe729c4a18b7b6 100644 (file)
@@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack,
 #else /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
+# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
 #endif
 
 #endif
index 1fadd310ff680ece697fa65a8db410c380a8547e..31051f35cbb768e452c4f76a60c5415a45f572e7 100644 (file)
@@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
 dotraplinkage void do_stack_segment(struct pt_regs *, long);
 #ifdef CONFIG_X86_64
 dotraplinkage void do_double_fault(struct pt_regs *, long);
-asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *, long);
 dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
index e9cc6fe1fc6f953c38ddcc61fcf06fd90d72ab04..c1688c2d0a128f063053697dc60bcbfbca509765 100644 (file)
@@ -7,6 +7,9 @@
 #include <asm/ptrace.h>
 #include <asm/stacktrace.h>
 
+#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
+#define IRET_FRAME_SIZE   (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
+
 struct unwind_state {
        struct stack_info stack_info;
        unsigned long stack_mask;
@@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
 }
 
 #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
+/*
+ * WARNING: The entire pt_regs may not be safe to dereference.  In some cases,
+ * only the iret frame registers are accessible.  Use with caution!
+ */
 static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 {
        if (unwind_done(state))
index 8ea78275480dafeb702e11ba73364cd9e7c52f21..cd360a5e0dca30f2f1ad052b197606e55f701db0 100644 (file)
@@ -93,4 +93,10 @@ void common(void) {
 
        BLANK();
        DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+       /* Layout info for cpu_entry_area */
+       OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+       OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+       OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page);
+       DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack));
 }
index dedf428b20b68b0a4748fc1ac3032193c9121362..7d20d9c0b3d69cfaa717233a868218fe9d2cb694 100644 (file)
@@ -47,13 +47,8 @@ void foo(void)
        BLANK();
 
        /* Offset from the sysenter stack to tss.sp0 */
-       DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
-              offsetofend(struct tss_struct, SYSENTER_stack));
-
-       /* Offset from cpu_tss to SYSENTER_stack */
-       OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
-       /* Size of SYSENTER_stack */
-       DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+       DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
+              offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack));
 
 #ifdef CONFIG_CC_STACKPROTECTOR
        BLANK();
index 630212fa9b9da3f0498fc30d4c193c5926c43abb..bf51e51d808dd8914abd3b4bca69b37ce3ec023b 100644 (file)
@@ -23,6 +23,9 @@ int main(void)
 #ifdef CONFIG_PARAVIRT
        OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
        OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
+#ifdef CONFIG_DEBUG_ENTRY
+       OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
+#endif
        BLANK();
 #endif
 
@@ -63,6 +66,7 @@ int main(void)
 
        OFFSET(TSS_ist, tss_struct, x86_tss.ist);
        OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
+       OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
        BLANK();
 
 #ifdef CONFIG_CC_STACKPROTECTOR
index fa998ca8aa5aa5b4899dbe8a57c5b543f927009e..7416da3ec4dfa0b0f275dd10a5f9bfa12b884022 100644 (file)
@@ -476,8 +476,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
        return NULL;            /* Not found */
 }
 
-__u32 cpu_caps_cleared[NCAPINTS];
-__u32 cpu_caps_set[NCAPINTS];
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
 
 void load_percpu_segment(int cpu)
 {
@@ -490,27 +490,116 @@ void load_percpu_segment(int cpu)
        load_stack_canary_segment();
 }
 
-/* Setup the fixmap mapping only once per-processor */
-static inline void setup_fixmap_gdt(int cpu)
+#ifdef CONFIG_X86_32
+/* The 32-bit entry code needs to find cpu_entry_area. */
+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+#endif
+
+#ifdef CONFIG_X86_64
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+         [0 ... N_EXCEPTION_STACKS - 1]        = EXCEPTION_STKSZ,
+         [DEBUG_STACK - 1]                     = DEBUG_STKSZ
+};
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+       [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+#endif
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page,
+                                  SYSENTER_stack_storage);
+
+static void __init
+set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot)
+{
+       for ( ; pages; pages--, idx--, ptr += PAGE_SIZE)
+               __set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+/* Setup the fixmap mappings only once per-processor */
+static void __init setup_cpu_entry_area(int cpu)
 {
 #ifdef CONFIG_X86_64
-       /* On 64-bit systems, we use a read-only fixmap GDT. */
-       pgprot_t prot = PAGE_KERNEL_RO;
+       extern char _entry_trampoline[];
+
+       /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+       pgprot_t gdt_prot = PAGE_KERNEL_RO;
+       pgprot_t tss_prot = PAGE_KERNEL_RO;
 #else
        /*
         * On native 32-bit systems, the GDT cannot be read-only because
         * our double fault handler uses a task gate, and entering through
-        * a task gate needs to change an available TSS to busy.  If the GDT
-        * is read-only, that will triple fault.
+        * a task gate needs to change an available TSS to busy.  If the
+        * GDT is read-only, that will triple fault.  The TSS cannot be
+        * read-only because the CPU writes to it on task switches.
         *
-        * On Xen PV, the GDT must be read-only because the hypervisor requires
-        * it.
+        * On Xen PV, the GDT must be read-only because the hypervisor
+        * requires it.
         */
-       pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+       pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
                PAGE_KERNEL_RO : PAGE_KERNEL;
+       pgprot_t tss_prot = PAGE_KERNEL;
 #endif
 
-       __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
+       __set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot);
+       set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page),
+                               per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1,
+                               PAGE_KERNEL);
+
+       /*
+        * The Intel SDM says (Volume 3, 7.2.1):
+        *
+        *  Avoid placing a page boundary in the part of the TSS that the
+        *  processor reads during a task switch (the first 104 bytes). The
+        *  processor may not correctly perform address translations if a
+        *  boundary occurs in this area. During a task switch, the processor
+        *  reads and writes into the first 104 bytes of each TSS (using
+        *  contiguous physical addresses beginning with the physical address
+        *  of the first byte of the TSS). So, after TSS access begins, if
+        *  part of the 104 bytes is not physically contiguous, the processor
+        *  will access incorrect information without generating a page-fault
+        *  exception.
+        *
+        * There are also a lot of errata involving the TSS spanning a page
+        * boundary.  Assert that we're not doing that.
+        */
+       BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+                     offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+       BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+       set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss),
+                               &per_cpu(cpu_tss_rw, cpu),
+                               sizeof(struct tss_struct) / PAGE_SIZE,
+                               tss_prot);
+
+#ifdef CONFIG_X86_32
+       per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+       BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+       BUILD_BUG_ON(sizeof(exception_stacks) !=
+                    sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+       set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks),
+                               &per_cpu(exception_stacks, cpu),
+                               sizeof(exception_stacks) / PAGE_SIZE,
+                               PAGE_KERNEL);
+
+       __set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline),
+                    __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+       unsigned int cpu;
+
+       for_each_possible_cpu(cpu)
+               setup_cpu_entry_area(cpu);
 }
 
 /* Load the original GDT from the per-cpu structure */
@@ -747,7 +836,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
 {
        int i;
 
-       for (i = 0; i < NCAPINTS; i++) {
+       for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
                c->x86_capability[i] &= ~cpu_caps_cleared[i];
                c->x86_capability[i] |= cpu_caps_set[i];
        }
@@ -1250,7 +1339,7 @@ void enable_sep_cpu(void)
                return;
 
        cpu = get_cpu();
-       tss = &per_cpu(cpu_tss, cpu);
+       tss = &per_cpu(cpu_tss_rw, cpu);
 
        /*
         * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1259,11 +1348,7 @@ void enable_sep_cpu(void)
 
        tss->x86_tss.ss1 = __KERNEL_CS;
        wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
-
-       wrmsr(MSR_IA32_SYSENTER_ESP,
-             (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
-             0);
-
+       wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0);
        wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
 
        put_cpu();
@@ -1357,25 +1442,19 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
-/*
- * Special IST stacks which the CPU switches to when it calls
- * an IST-marked descriptor entry. Up to 7 stacks (hardware
- * limit), all of them are 4K, except the debug stack which
- * is 8K.
- */
-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
-         [0 ... N_EXCEPTION_STACKS - 1]        = EXCEPTION_STKSZ,
-         [DEBUG_STACK - 1]                     = DEBUG_STKSZ
-};
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
-       [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
-
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
+       extern char _entry_trampoline[];
+       extern char entry_SYSCALL_64_trampoline[];
+
+       int cpu = smp_processor_id();
+       unsigned long SYSCALL64_entry_trampoline =
+               (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
+               (entry_SYSCALL_64_trampoline - _entry_trampoline);
+
        wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-       wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+       wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
 
 #ifdef CONFIG_IA32_EMULATION
        wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@@ -1386,7 +1465,7 @@ void syscall_init(void)
         * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
         */
        wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-       wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+       wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
        wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
        wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@ -1530,7 +1609,7 @@ void cpu_init(void)
        if (cpu)
                load_ucode_ap();
 
-       t = &per_cpu(cpu_tss, cpu);
+       t = &per_cpu(cpu_tss_rw, cpu);
        oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
@@ -1569,7 +1648,7 @@ void cpu_init(void)
         * set up and load the per-CPU TSS
         */
        if (!oist->ist[0]) {
-               char *estacks = per_cpu(exception_stacks, cpu);
+               char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
 
                for (v = 0; v < N_EXCEPTION_STACKS; v++) {
                        estacks += exception_stack_sizes[v];
@@ -1580,7 +1659,7 @@ void cpu_init(void)
                }
        }
 
-       t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+       t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
 
        /*
         * <= is required because the CPU will access up to
@@ -1596,11 +1675,12 @@ void cpu_init(void)
        enter_lazy_tlb(&init_mm, me);
 
        /*
-        * Initialize the TSS.  Don't bother initializing sp0, as the initial
-        * task never enters user mode.
+        * Initialize the TSS.  sp0 points to the entry trampoline stack
+        * regardless of what task is running.
         */
-       set_tss_desc(cpu, t);
+       set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
        load_TR_desc();
+       load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1));
 
        load_mm_ldt(&init_mm);
 
@@ -1612,7 +1692,6 @@ void cpu_init(void)
        if (is_uv_system())
                uv_cpu_init();
 
-       setup_fixmap_gdt(cpu);
        load_fixmap_gdt(cpu);
 }
 
@@ -1622,7 +1701,7 @@ void cpu_init(void)
 {
        int cpu = smp_processor_id();
        struct task_struct *curr = current;
-       struct tss_struct *t = &per_cpu(cpu_tss, cpu);
+       struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
 
        wait_for_master_cpu(cpu);
 
@@ -1657,12 +1736,12 @@ void cpu_init(void)
         * Initialize the TSS.  Don't bother initializing sp0, as the initial
         * task never enters user mode.
         */
-       set_tss_desc(cpu, t);
+       set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
        load_TR_desc();
 
        load_mm_ldt(&init_mm);
 
-       t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+       t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
 
 #ifdef CONFIG_DOUBLEFAULT
        /* Set up doublefault TSS pointer in the GDT */
@@ -1674,7 +1753,6 @@ void cpu_init(void)
 
        fpu__init_cpu();
 
-       setup_fixmap_gdt(cpu);
        load_fixmap_gdt(cpu);
 }
 #endif
index 0e662c55ae902fedd5c78c1ed87a972b35a79856..0b8cedb20d6d92f2875a49292680c8cfecd5b044 100644 (file)
@@ -50,25 +50,23 @@ static void doublefault_fn(void)
                cpu_relax();
 }
 
-struct tss_struct doublefault_tss __cacheline_aligned = {
-       .x86_tss = {
-               .sp0            = STACK_START,
-               .ss0            = __KERNEL_DS,
-               .ldt            = 0,
-               .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
-
-               .ip             = (unsigned long) doublefault_fn,
-               /* 0x2 bit is always set */
-               .flags          = X86_EFLAGS_SF | 0x2,
-               .sp             = STACK_START,
-               .es             = __USER_DS,
-               .cs             = __KERNEL_CS,
-               .ss             = __KERNEL_DS,
-               .ds             = __USER_DS,
-               .fs             = __KERNEL_PERCPU,
-
-               .__cr3          = __pa_nodebug(swapper_pg_dir),
-       }
+struct x86_hw_tss doublefault_tss __cacheline_aligned = {
+       .sp0            = STACK_START,
+       .ss0            = __KERNEL_DS,
+       .ldt            = 0,
+       .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+
+       .ip             = (unsigned long) doublefault_fn,
+       /* 0x2 bit is always set */
+       .flags          = X86_EFLAGS_SF | 0x2,
+       .sp             = STACK_START,
+       .es             = __USER_DS,
+       .cs             = __KERNEL_CS,
+       .ss             = __KERNEL_DS,
+       .ds             = __USER_DS,
+       .fs             = __KERNEL_PERCPU,
+
+       .__cr3          = __pa_nodebug(swapper_pg_dir),
 };
 
 /* dummy for do_double_fault() call */
index f13b4c00a5de4b7a7b36c40d27311672bcc9d05c..bbd6d986e2d0fc22b5b3c23c794ade410b9f9973 100644 (file)
@@ -43,6 +43,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
        return true;
 }
 
+bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)
+{
+       struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id());
+
+       void *begin = ss;
+       void *end = ss + 1;
+
+       if ((void *)stack < begin || (void *)stack >= end)
+               return false;
+
+       info->type      = STACK_TYPE_SYSENTER;
+       info->begin     = begin;
+       info->end       = end;
+       info->next_sp   = NULL;
+
+       return true;
+}
+
 static void printk_stack_address(unsigned long address, int reliable,
                                 char *log_lvl)
 {
@@ -50,6 +68,28 @@ static void printk_stack_address(unsigned long address, int reliable,
        printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
 }
 
+void show_iret_regs(struct pt_regs *regs)
+{
+       printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
+       printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
+               regs->sp, regs->flags);
+}
+
+static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
+{
+       if (on_stack(info, regs, sizeof(*regs)))
+               __show_regs(regs, 0);
+       else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
+                         IRET_FRAME_SIZE)) {
+               /*
+                * When an interrupt or exception occurs in entry code, the
+                * full pt_regs might not have been saved yet.  In that case
+                * just print the iret frame.
+                */
+               show_iret_regs(regs);
+       }
+}
+
 void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
                        unsigned long *stack, char *log_lvl)
 {
@@ -71,31 +111,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
         * - task stack
         * - interrupt stack
         * - HW exception stacks (double fault, nmi, debug, mce)
+        * - SYSENTER stack
         *
-        * x86-32 can have up to three stacks:
+        * x86-32 can have up to four stacks:
         * - task stack
         * - softirq stack
         * - hardirq stack
+        * - SYSENTER stack
         */
        for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
                const char *stack_name;
 
-               /*
-                * If we overflowed the task stack into a guard page, jump back
-                * to the bottom of the usable stack.
-                */
-               if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
-                       stack = task_stack_page(task);
-
-               if (get_stack_info(stack, task, &stack_info, &visit_mask))
-                       break;
+               if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
+                       /*
+                        * We weren't on a valid stack.  It's possible that
+                        * we overflowed a valid stack into a guard page.
+                        * See if the next page up is valid so that we can
+                        * generate some kind of backtrace if this happens.
+                        */
+                       stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
+                       if (get_stack_info(stack, task, &stack_info, &visit_mask))
+                               break;
+               }
 
                stack_name = stack_type_name(stack_info.type);
                if (stack_name)
                        printk("%s <%s>\n", log_lvl, stack_name);
 
-               if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
-                       __show_regs(regs, 0);
+               if (regs)
+                       show_regs_safe(&stack_info, regs);
 
                /*
                 * Scan the stack, printing any text addresses we find.  At the
@@ -119,7 +163,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 
                        /*
                         * Don't print regs->ip again if it was already printed
-                        * by __show_regs() below.
+                        * by show_regs_safe() below.
                         */
                        if (regs && stack == &regs->ip)
                                goto next;
@@ -155,8 +199,8 @@ next:
 
                        /* if the frame has entry regs, print them */
                        regs = unwind_get_entry_regs(&state);
-                       if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
-                               __show_regs(regs, 0);
+                       if (regs)
+                               show_regs_safe(&stack_info, regs);
                }
 
                if (stack_name)
index daefae83a3aa86c59602b75bd3e6734c6e3b1030..5ff13a6b368069f68505099ce94267b8bf0f45b9 100644 (file)
@@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)
        if (type == STACK_TYPE_SOFTIRQ)
                return "SOFTIRQ";
 
+       if (type == STACK_TYPE_SYSENTER)
+               return "SYSENTER";
+
        return NULL;
 }
 
@@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
        if (task != current)
                goto unknown;
 
+       if (in_sysenter_stack(stack, info))
+               goto recursion_check;
+
        if (in_hardirq_stack(stack, info))
                goto recursion_check;
 
index 88ce2ffdb110303502ad33e64d357d8af5afd8c6..abc828f8c29785b4fae8398ec19775015447ee22 100644 (file)
@@ -37,6 +37,9 @@ const char *stack_type_name(enum stack_type type)
        if (type == STACK_TYPE_IRQ)
                return "IRQ";
 
+       if (type == STACK_TYPE_SYSENTER)
+               return "SYSENTER";
+
        if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
                return exception_stack_names[type - STACK_TYPE_EXCEPTION];
 
@@ -115,6 +118,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
        if (in_irq_stack(stack, info))
                goto recursion_check;
 
+       if (in_sysenter_stack(stack, info))
+               goto recursion_check;
+
        goto unknown;
 
 recursion_check:
index 3feb648781c470a7a49ee26749712ba7da891fe9..2f723301eb58fc5ad0d6796b342446ae2ee0c9e6 100644 (file)
@@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
         * because the ->io_bitmap_max value must match the bitmap
         * contents:
         */
-       tss = &per_cpu(cpu_tss, get_cpu());
+       tss = &per_cpu(cpu_tss_rw, get_cpu());
 
        if (turn_on)
                bitmap_clear(t->io_bitmap_ptr, from, num);
index 49cfd9fe7589fa5ef2bef5d4a5d6431b7007836f..68e1867cca8045d0ed728ffc6b75a866c25484ed 100644 (file)
@@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
        /* high bit used in ret_from_ code  */
        unsigned vector = ~regs->orig_ax;
 
-       /*
-        * NB: Unlike exception entries, IRQ entries do not reliably
-        * handle context tracking in the low-level entry code.  This is
-        * because syscall entries execute briefly with IRQs on before
-        * updating context tracking state, so we can take an IRQ from
-        * kernel mode with CONTEXT_USER.  The low-level entry code only
-        * updates the context if we came from user mode, so we won't
-        * switch to CONTEXT_KERNEL.  We'll fix that once the syscall
-        * code is cleaned up enough that we can cleanly defer enabling
-        * IRQs.
-        */
-
        entering_irq();
 
        /* entering_irq() tells RCU that we're not quiescent.  Check it. */
index 020efbf5786b35d343a8632cd14ac4f800465d9b..d86e344f5b3debfed504b72a7c0f83f36fe16387 100644 (file)
@@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
        if (regs->sp >= estack_top && regs->sp <= estack_bottom)
                return;
 
-       WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
+       WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
                current->comm, curbase, regs->sp,
                irq_stack_top, irq_stack_bottom,
-               estack_top, estack_bottom);
+               estack_top, estack_bottom, (void *)regs->ip);
 
        if (sysctl_panic_on_stackoverflow)
                panic("low stack detected by irq handler - check messages\n");
index ac0be8283325edfdc2752f862b4c0cef208a931c..9edadabf04f66c657f8a29bb56fe994b2559d5cf 100644 (file)
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
 DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
 
 DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                PATCH_SITE(pv_mmu_ops, read_cr2);
                PATCH_SITE(pv_mmu_ops, read_cr3);
                PATCH_SITE(pv_mmu_ops, write_cr3);
-               PATCH_SITE(pv_mmu_ops, flush_tlb_single);
                PATCH_SITE(pv_cpu_ops, wbinvd);
 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
                case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
index bb988a24db927d758f9120d45f90d1c160628790..aed9d94bd46f41bb049b8e0153a44a43d97e80b4 100644 (file)
@@ -47,7 +47,7 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
        .x86_tss = {
                /*
                 * .sp0 is only used when entering ring 0 from a lower
@@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
                 * Poison it.
                 */
                .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
+
+#ifdef CONFIG_X86_64
+               /*
+                * .sp1 is cpu_current_top_of_stack.  The init task never
+                * runs user code, but cpu_current_top_of_stack should still
+                * be well defined before the first context switch.
+                */
+               .sp1 = TOP_OF_INIT_STACK,
+#endif
+
 #ifdef CONFIG_X86_32
                .ss0 = __KERNEL_DS,
                .ss1 = __KERNEL_CS,
@@ -71,11 +81,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
          */
        .io_bitmap              = { [0 ... IO_BITMAP_LONGS] = ~0 },
 #endif
-#ifdef CONFIG_X86_32
-       .SYSENTER_stack_canary  = STACK_END_MAGIC,
-#endif
 };
-EXPORT_PER_CPU_SYMBOL(cpu_tss);
+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
 
 DEFINE_PER_CPU(bool, __tss_limit_invalid);
 EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@@ -104,7 +111,7 @@ void exit_thread(struct task_struct *tsk)
        struct fpu *fpu = &t->fpu;
 
        if (bp) {
-               struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
+               struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
 
                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
index 45bf0c5f93e15103060d67d5245756ab72ce8fe5..5224c609918416337b97440eb2d515d8052463ae 100644 (file)
@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        struct fpu *prev_fpu = &prev->fpu;
        struct fpu *next_fpu = &next->fpu;
        int cpu = smp_processor_id();
-       struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+       struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
 
        /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
index eeeb34f85c250e8c01188b6d32cf5a62bd1af8a0..c754662320163107ca3a254362ce0e404a8d3c11 100644 (file)
@@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;
 
-       printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
-       printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
-               regs->sp, regs->flags);
+       show_iret_regs(regs);
+
        if (regs->orig_ax != -1)
                pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
        else
@@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);
 
+       if (!all)
+               return;
+
        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
@@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
 
-       if (!all)
-               return;
-
        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = __read_cr3();
@@ -400,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        struct fpu *prev_fpu = &prev->fpu;
        struct fpu *next_fpu = &next->fpu;
        int cpu = smp_processor_id();
-       struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+       struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
 
        WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
                     this_cpu_read(irq_count) != -1);
@@ -462,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         * Switch the PDA and FPU contexts.
         */
        this_cpu_write(current_task, next_p);
+       this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
 
        /* Reload sp0. */
        update_sp0(next_p);
index 989514c94a55d8fa93a07192edd199be1a607bf8..e98f8b66a460b98b31d262cff23fa063be33ac5a 100644 (file)
@@ -348,9 +348,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 
        /*
         * If IRET takes a non-IST fault on the espfix64 stack, then we
-        * end up promoting it to a doublefault.  In that case, modify
-        * the stack to make it look like we just entered the #GP
-        * handler from user space, similar to bad_iret.
+        * end up promoting it to a doublefault.  In that case, take
+        * advantage of the fact that we're not using the normal (TSS.sp0)
+        * stack right now.  We can write a fake #GP(0) frame at TSS.sp0
+        * and then modify our own IRET frame so that, when we return,
+        * we land directly at the #GP(0) vector with the stack already
+        * set up according to its expectations.
+        *
+        * The net result is that our #GP handler will think that we
+        * entered from usermode with the bad user context.
         *
         * No need for ist_enter here because we don't use RCU.
         */
@@ -358,13 +364,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
                regs->cs == __KERNEL_CS &&
                regs->ip == (unsigned long)native_irq_return_iret)
        {
-               struct pt_regs *normal_regs = task_pt_regs(current);
+               struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
 
-               /* Fake a #GP(0) from userspace. */
-               memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
-               normal_regs->orig_ax = 0;  /* Missing (lost) #GP error code */
+               /*
+                * regs->sp points to the failing IRET frame on the
+                * ESPFIX64 stack.  Copy it to the entry stack.  This fills
+                * in gpregs->ss through gpregs->ip.
+                *
+                */
+               memmove(&gpregs->ip, (void *)regs->sp, 5*8);
+               gpregs->orig_ax = 0;  /* Missing (lost) #GP error code */
+
+               /*
+                * Adjust our frame so that we return straight to the #GP
+                * vector with the expected RSP value.  This is safe because
+                * we won't enable interupts or schedule before we invoke
+                * general_protection, so nothing will clobber the stack
+                * frame we just set up.
+                */
                regs->ip = (unsigned long)general_protection;
-               regs->sp = (unsigned long)&normal_regs->orig_ax;
+               regs->sp = (unsigned long)&gpregs->orig_ax;
 
                return;
        }
@@ -389,7 +408,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
         *
         *   Processors update CR2 whenever a page fault is detected. If a
         *   second page fault occurs while an earlier page fault is being
-        *   delivered, the faulting linear address of the second fault will
+        *   delivered, the faulting linear address of the second fault will
         *   overwrite the contents of CR2 (replacing the previous
         *   address). These updates to CR2 occur even if the page fault
         *   results in a double fault or occurs during the delivery of a
@@ -605,14 +624,15 @@ NOKPROBE_SYMBOL(do_int3);
 
 #ifdef CONFIG_X86_64
 /*
- * Help handler running on IST stack to switch off the IST stack if the
- * interrupted code was in user mode. The actual stack switch is done in
- * entry_64.S
+ * Help handler running on a per-cpu (IST or entry trampoline) stack
+ * to switch to the normal thread stack if the interrupted code was in
+ * user mode. The actual stack switch is done in entry_64.S
  */
 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
 {
-       struct pt_regs *regs = task_pt_regs(current);
-       *regs = *eregs;
+       struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
+       if (regs != eregs)
+               *regs = *eregs;
        return regs;
 }
 NOKPROBE_SYMBOL(sync_regs);
@@ -628,13 +648,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
        /*
         * This is called from entry_64.S early in handling a fault
         * caused by a bad iret to user mode.  To handle the fault
-        * correctly, we want move our stack frame to task_pt_regs
-        * and we want to pretend that the exception came from the
-        * iret target.
+        * correctly, we want to move our stack frame to where it would
+        * be had we entered directly on the entry stack (rather than
+        * just below the IRET frame) and we want to pretend that the
+        * exception came from the IRET target.
         */
        struct bad_iret_stack *new_stack =
-               container_of(task_pt_regs(current),
-                            struct bad_iret_stack, regs);
+               (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
 
        /* Copy the IRET target to the new stack. */
        memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
@@ -795,14 +815,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
        debug_stack_usage_dec();
 
 exit:
-#if defined(CONFIG_X86_32)
-       /*
-        * This is the most likely code path that involves non-trivial use
-        * of the SYSENTER stack.  Check that we haven't overrun it.
-        */
-       WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
-            "Overran or corrupted SYSENTER stack\n");
-#endif
        ist_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug);
@@ -929,6 +941,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 
 void __init trap_init(void)
 {
+       /* Init cpu_entry_area before IST entries are set up */
+       setup_cpu_entry_areas();
+
        idt_setup_traps();
 
        /*
index a3f973b2c97a03b121fe0173dbdc9298216721e6..be86a865087a6b9dc8e04031dbf2e2fbeeda1ed5 100644 (file)
@@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
        return NULL;
 }
 
-static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
+static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
                            size_t len)
 {
        struct stack_info *info = &state->stack_info;
+       void *addr = (void *)_addr;
 
-       /*
-        * If the address isn't on the current stack, switch to the next one.
-        *
-        * We may have to traverse multiple stacks to deal with the possibility
-        * that info->next_sp could point to an empty stack and the address
-        * could be on a subsequent stack.
-        */
-       while (!on_stack(info, (void *)addr, len))
-               if (get_stack_info(info->next_sp, state->task, info,
-                                  &state->stack_mask))
-                       return false;
+       if (!on_stack(info, addr, len) &&
+           (get_stack_info(addr, state->task, info, &state->stack_mask)))
+               return false;
 
        return true;
 }
@@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
        return true;
 }
 
-#define REGS_SIZE (sizeof(struct pt_regs))
-#define SP_OFFSET (offsetof(struct pt_regs, sp))
-#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
-#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
-
 static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
-                            unsigned long *ip, unsigned long *sp, bool full)
+                            unsigned long *ip, unsigned long *sp)
 {
-       size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
-       size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
-       struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
-
-       if (IS_ENABLED(CONFIG_X86_64)) {
-               if (!stack_access_ok(state, addr, regs_size))
-                       return false;
+       struct pt_regs *regs = (struct pt_regs *)addr;
 
-               *ip = regs->ip;
-               *sp = regs->sp;
+       /* x86-32 support will be more complicated due to the &regs->sp hack */
+       BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
 
-               return true;
-       }
-
-       if (!stack_access_ok(state, addr, sp_offset))
+       if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
                return false;
 
        *ip = regs->ip;
+       *sp = regs->sp;
+       return true;
+}
 
-       if (user_mode(regs)) {
-               if (!stack_access_ok(state, addr + sp_offset,
-                                    REGS_SIZE - SP_OFFSET))
-                       return false;
+static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
+                                 unsigned long *ip, unsigned long *sp)
+{
+       struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
 
-               *sp = regs->sp;
-       } else
-               *sp = (unsigned long)&regs->sp;
+       if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
+               return false;
 
+       *ip = regs->ip;
+       *sp = regs->sp;
        return true;
 }
 
@@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state)
        unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
        enum stack_type prev_type = state->stack_info.type;
        struct orc_entry *orc;
-       struct pt_regs *ptregs;
        bool indirect = false;
 
        if (unwind_done(state))
@@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state)
                break;
 
        case ORC_TYPE_REGS:
-               if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
+               if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
                        orc_warn("can't dereference registers at %p for ip %pB\n",
                                 (void *)sp, (void *)orig_ip);
                        goto done;
@@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state)
                break;
 
        case ORC_TYPE_REGS_IRET:
-               if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
+               if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
                        orc_warn("can't dereference iret registers at %p for ip %pB\n",
                                 (void *)sp, (void *)orig_ip);
                        goto done;
                }
 
-               ptregs = container_of((void *)sp, struct pt_regs, ip);
-               if ((unsigned long)ptregs >= prev_sp &&
-                   on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
-                       state->regs = ptregs;
-                       state->full_regs = false;
-               } else
-                       state->regs = NULL;
-
+               state->regs = (void *)sp - IRET_FRAME_OFFSET;
+               state->full_regs = false;
                state->signal = true;
                break;
 
@@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
        }
 
        if (get_stack_info((unsigned long *)state->sp, state->task,
-                          &state->stack_info, &state->stack_mask))
-               return;
+                          &state->stack_info, &state->stack_mask)) {
+               /*
+                * We weren't on a valid stack.  It's possible that
+                * we overflowed a valid stack into a guard page.
+                * See if the next page up is valid so that we can
+                * generate some kind of backtrace if this happens.
+                */
+               void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
+               if (get_stack_info(next_page, state->task, &state->stack_info,
+                                  &state->stack_mask))
+                       return;
+       }
 
        /*
         * The caller can provide the address of the first frame directly
index a4009fb9be8725ce7bda96cd5e8160e524903266..d2a8b5a24a44a554e2f81f3b30309ef39aba0d8a 100644 (file)
@@ -107,6 +107,15 @@ SECTIONS
                SOFTIRQENTRY_TEXT
                *(.fixup)
                *(.gnu.warning)
+
+#ifdef CONFIG_X86_64
+               . = ALIGN(PAGE_SIZE);
+               _entry_trampoline = .;
+               *(.entry_trampoline)
+               . = ALIGN(PAGE_SIZE);
+               ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
+#endif
+
                /* End of text section */
                _etext = .;
        } :text = 0x9090
index 8eba631c4dbd509d8687c6135e8dba267042f5e0..023afa0c8887002d6a79a8b121b46996feec1a61 100644 (file)
@@ -2302,7 +2302,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                 * processors.  See 22.2.4.
                 */
                vmcs_writel(HOST_TR_BASE,
-                           (unsigned long)this_cpu_ptr(&cpu_tss));
+                           (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
                vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
 
                /*
index 553f8fd23cc4733d0edafa862b95446f7a04bab1..4846eff7e4c8b1505501d7f1dcb64127d0a4c67c 100644 (file)
@@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops)
                delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
 
                /*
-                * Use cpu_tss as a cacheline-aligned, seldomly
+                * Use cpu_tss_rw as a cacheline-aligned, seldomly
                 * accessed per-cpu variable as the monitor target.
                 */
-               __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
+               __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
 
                /*
                 * AMD, like Intel, supports the EAX hint and EAX=0xf
index 99dfed6dfef8b2f9028f82b89ab8dc2bde8173c4..9ec70d780f1f4172e3c69068f55722d13f003b06 100644 (file)
@@ -277,6 +277,7 @@ void __init kasan_early_init(void)
 void __init kasan_init(void)
 {
        int i;
+       void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
 
 #ifdef CONFIG_KASAN_INLINE
        register_die_notifier(&kasan_die_notifier);
@@ -329,8 +330,23 @@ void __init kasan_init(void)
                              (unsigned long)kasan_mem_to_shadow(_end),
                              early_pfn_to_nid(__pa(_stext)));
 
+       shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
+       shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
+       shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
+                                               PAGE_SIZE);
+
+       shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
+       shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
+       shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
+                                       PAGE_SIZE);
+
        kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
-                       (void *)KASAN_SHADOW_END);
+                                  shadow_cpu_entry_begin);
+
+       kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
+                             (unsigned long)shadow_cpu_entry_end, 0);
+
+       kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END);
 
        load_cr3(init_top_pgt);
        __flush_tlb_all();
index 36a28eddb435e72d2abc5ffbdd1e78a46b56876e..a7d966964c6f20577c927cf5e618bc86b3331977 100644 (file)
@@ -152,17 +152,19 @@ static void do_fpu_end(void)
 static void fix_processor_context(void)
 {
        int cpu = smp_processor_id();
-       struct tss_struct *t = &per_cpu(cpu_tss, cpu);
 #ifdef CONFIG_X86_64
        struct desc_struct *desc = get_cpu_gdt_rw(cpu);
        tss_desc tss;
 #endif
-       set_tss_desc(cpu, t);   /*
-                                * This just modifies memory; should not be
-                                * necessary. But... This is necessary, because
-                                * 386 hardware has concept of busy TSS or some
-                                * similar stupidity.
-                                */
+
+       /*
+        * We need to reload TR, which requires that we change the
+        * GDT entry to indicate "available" first.
+        *
+        * XXX: This could probably all be replaced by a call to
+        * force_reload_TR().
+        */
+       set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 
 #ifdef CONFIG_X86_64
        memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
index f2414c6c5e7c455b43fc45773fbd1264cf86c24e..7beeee1443b32a3fbcf3ba6ad57594d46c7a359f 100644 (file)
@@ -826,7 +826,7 @@ static void xen_load_sp0(unsigned long sp0)
        mcs = xen_mc_entry(0);
        MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
        xen_mc_issue(PARAVIRT_LAZY_CPU);
-       this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+       this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 }
 
 void xen_set_iopl_mask(unsigned mask)
index fc048ec686e7699b263254c79b482ccf935c21ef..6cf801ca11428fa5fd9c2d3c9931354f28575580 100644 (file)
@@ -2272,7 +2272,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 #endif
        case FIX_TEXT_POKE0:
        case FIX_TEXT_POKE1:
-       case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
+       case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
                /* All local page mappings */
                pte = pfn_pte(phys, prot);
                break;