Merge tag 'gpio-v4.15-3' of git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 27 Dec 2017 02:17:18 +0000 (18:17 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 27 Dec 2017 02:17:18 +0000 (18:17 -0800)
Pull GPIO fixes from Linus Walleij:
 "Two fixes. They are both kind of important, so why not send a pull
  request on christmas eve.

   - Fix a build problem in the gpio single-register (gpio-reg) driver,
     introduced by refactorings.

   - Fix assignment of GPIO line names, something that was mangled by
     another patch"

* tag 'gpio-v4.15-3' of git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio:
  gpio: fix "gpio-line-names" property retrieval
  gpio: gpio-reg: fix build

285 files changed:
Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt
Documentation/x86/x86_64/mm.txt
Makefile
arch/arm/lib/csumpartialcopyuser.S
arch/arm64/kvm/hyp/debug-sr.c
arch/parisc/boot/compressed/misc.c
arch/parisc/include/asm/thread_info.h
arch/parisc/kernel/entry.S
arch/parisc/kernel/hpmc.S
arch/parisc/kernel/unwind.c
arch/parisc/lib/delay.c
arch/powerpc/include/asm/mmu_context.h
arch/powerpc/kernel/process.c
arch/powerpc/kvm/book3s_xive.c
arch/powerpc/net/bpf_jit_comp64.c
arch/powerpc/perf/core-book3s.c
arch/powerpc/perf/imc-pmu.c
arch/s390/net/bpf_jit_comp.c
arch/sparc/mm/fault_32.c
arch/sparc/mm/fault_64.c
arch/sparc/net/bpf_jit_comp_64.c
arch/um/include/asm/mmu_context.h
arch/um/kernel/trap.c
arch/unicore32/include/asm/mmu_context.h
arch/x86/Kconfig
arch/x86/entry/entry_32.S
arch/x86/entry/entry_64.S
arch/x86/entry/entry_64_compat.S
arch/x86/entry/vsyscall/vsyscall_64.c
arch/x86/include/asm/cpu_entry_area.h [new file with mode: 0644]
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/desc.h
arch/x86/include/asm/espfix.h
arch/x86/include/asm/fixmap.h
arch/x86/include/asm/hypervisor.h
arch/x86/include/asm/invpcid.h [new file with mode: 0644]
arch/x86/include/asm/irqflags.h
arch/x86/include/asm/kdebug.h
arch/x86/include/asm/mmu.h
arch/x86/include/asm/mmu_context.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/pgtable_32_types.h
arch/x86/include/asm/pgtable_64_types.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/stacktrace.h
arch/x86/include/asm/switch_to.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/tlbflush.h
arch/x86/include/asm/traps.h
arch/x86/include/asm/unwind.h
arch/x86/kernel/asm-offsets.c
arch/x86/kernel/asm-offsets_32.c
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/microcode/intel.c
arch/x86/kernel/doublefault.c
arch/x86/kernel/dumpstack.c
arch/x86/kernel/dumpstack_32.c
arch/x86/kernel/dumpstack_64.c
arch/x86/kernel/ioport.c
arch/x86/kernel/irq.c
arch/x86/kernel/irq_64.c
arch/x86/kernel/ldt.c
arch/x86/kernel/paravirt_patch_64.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/traps.c
arch/x86/kernel/unwind_orc.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/kvm/emulate.c
arch/x86/kvm/mmu.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
arch/x86/lib/delay.c
arch/x86/mm/Makefile
arch/x86/mm/cpu_entry_area.c [new file with mode: 0644]
arch/x86/mm/dump_pagetables.c
arch/x86/mm/fault.c
arch/x86/mm/init_32.c
arch/x86/mm/kasan_init_64.c
arch/x86/mm/pgtable_32.c
arch/x86/mm/tlb.c
arch/x86/platform/uv/tlb_uv.c
arch/x86/power/cpu.c
arch/x86/xen/enlighten.c
arch/x86/xen/enlighten_pv.c
arch/x86/xen/mmu_pv.c
arch/x86/xen/setup.c
block/bio.c
block/blk-map.c
block/blk-throttle.c
block/bounce.c
block/kyber-iosched.c
crypto/af_alg.c
crypto/algif_aead.c
crypto/algif_skcipher.c
crypto/mcryptd.c
crypto/skcipher.c
drivers/acpi/apei/erst.c
drivers/acpi/cppc_acpi.c
drivers/acpi/nfit/core.c
drivers/block/null_blk.c
drivers/clk/clk.c
drivers/clk/sunxi/clk-sun9i-mmc.c
drivers/cpufreq/cpufreq_governor.c
drivers/cpufreq/imx6q-cpufreq.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h
drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c
drivers/gpu/drm/amd/display/dc/core/dc_link.c
drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp.c
drivers/gpu/drm/drm_lease.c
drivers/gpu/drm/drm_plane.c
drivers/gpu/drm/drm_syncobj.c
drivers/gpu/drm/i915/i915_gem.c
drivers/gpu/drm/i915/i915_sw_fence.c
drivers/gpu/drm/i915/intel_breadcrumbs.c
drivers/gpu/drm/i915/intel_ddi.c
drivers/gpu/drm/i915/intel_display.c
drivers/gpu/drm/i915/intel_lpe_audio.c
drivers/gpu/drm/nouveau/nouveau_bo.c
drivers/gpu/drm/nouveau/nouveau_drv.h
drivers/gpu/drm/nouveau/nouveau_fbcon.c
drivers/gpu/drm/nouveau/nouveau_mem.c
drivers/gpu/drm/nouveau/nouveau_ttm.c
drivers/gpu/drm/nouveau/nouveau_vmm.c
drivers/gpu/drm/nouveau/nvkm/engine/device/base.c
drivers/gpu/drm/nouveau/nvkm/subdev/bios/dp.c
drivers/gpu/drm/nouveau/nvkm/subdev/instmem/nv50.c
drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c
drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c
drivers/gpu/drm/sun4i/sun4i_tcon.c
drivers/gpu/drm/ttm/ttm_page_alloc.c
drivers/mfd/cros_ec_spi.c
drivers/mfd/twl4030-audio.c
drivers/mfd/twl6040.c
drivers/mtd/mtdcore.c
drivers/mtd/nand/brcmnand/brcmnand.c
drivers/mtd/nand/gpio.c
drivers/mtd/nand/gpmi-nand/gpmi-nand.c
drivers/net/ethernet/arc/emac.h
drivers/net/ethernet/arc/emac_main.c
drivers/net/ethernet/broadcom/tg3.c
drivers/net/ethernet/marvell/mvneta.c
drivers/net/ethernet/mediatek/mtk_eth_soc.c
drivers/net/ethernet/mellanox/mlx5/core/cmd.c
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/eq.c
drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
drivers/net/ethernet/mellanox/mlx5/core/health.c
drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
drivers/net/ethernet/mellanox/mlx5/core/main.c
drivers/net/ethernet/mellanox/mlx5/core/qp.c
drivers/net/ethernet/mellanox/mlx5/core/rl.c
drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
drivers/net/ethernet/netronome/nfp/bpf/main.c
drivers/net/ethernet/netronome/nfp/bpf/main.h
drivers/net/ethernet/qualcomm/emac/emac.c
drivers/net/ethernet/stmicro/stmmac/common.h
drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c
drivers/net/ethernet/stmicro/stmmac/enh_desc.c
drivers/net/ethernet/stmicro/stmmac/norm_desc.c
drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c
drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
drivers/net/phy/marvell.c
drivers/net/phy/mdio-xgene.c
drivers/net/vxlan.c
drivers/net/wireless/mac80211_hwsim.c
drivers/nvdimm/btt.c
drivers/nvdimm/btt.h
drivers/nvdimm/pfn_devs.c
drivers/nvme/host/core.c
drivers/nvme/host/fc.c
drivers/parisc/lba_pci.c
drivers/pci/pci-driver.c
drivers/pinctrl/intel/pinctrl-cherryview.c
drivers/s390/net/qeth_core_main.c
drivers/scsi/aacraid/aacraid.h
drivers/scsi/aacraid/linit.c
drivers/scsi/osd/osd_initiator.c
drivers/scsi/scsi_devinfo.c
drivers/scsi/scsi_scan.c
drivers/scsi/scsi_sysfs.c
drivers/scsi/scsi_transport_spi.c
drivers/spi/spi-armada-3700.c
drivers/spi/spi-atmel.c
drivers/spi/spi-rspi.c
drivers/spi/spi-sun4i.c
drivers/spi/spi-xilinx.c
drivers/target/target_core_pscsi.c
drivers/xen/balloon.c
fs/xfs/libxfs/xfs_alloc.c
fs/xfs/libxfs/xfs_attr.c
fs/xfs/libxfs/xfs_attr_leaf.c
fs/xfs/libxfs/xfs_attr_leaf.h
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_defer.c
fs/xfs/libxfs/xfs_defer.h
fs/xfs/libxfs/xfs_iext_tree.c
fs/xfs/libxfs/xfs_refcount.c
fs/xfs/libxfs/xfs_rmap.c
fs/xfs/libxfs/xfs_rmap.h
fs/xfs/xfs_extfree_item.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_icache.c
fs/xfs/xfs_icache.h
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_reflink.c
fs/xfs/xfs_super.c
include/asm-generic/mm_hooks.h
include/asm-generic/pgtable.h
include/crypto/mcryptd.h
include/kvm/arm_arch_timer.h
include/linux/bio.h
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/bpf_verifier.h
include/linux/ipv6.h
include/linux/mfd/rtsx_pci.h
include/linux/mlx5/driver.h
include/linux/mlx5/mlx5_ifc.h
include/linux/spi/spi.h
include/net/cfg80211.h
include/net/pkt_cls.h
include/trace/events/clk.h
include/trace/events/kvm.h
include/xen/balloon.h
init/main.c
kernel/bpf/verifier.c
kernel/fork.c
lib/test_bpf.c
mm/backing-dev.c
net/bridge/br_netlink.c
net/core/dev.c
net/core/net_namespace.c
net/core/skbuff.c
net/ipv4/fib_frontend.c
net/ipv4/fib_semantics.c
net/ipv4/ip_gre.c
net/ipv6/af_inet6.c
net/ipv6/ip6_gre.c
net/ipv6/ip6_output.c
net/ipv6/ip6_tunnel.c
net/ipv6/ipv6_sockglue.c
net/ipv6/route.c
net/openvswitch/flow.c
net/sched/cls_bpf.c
net/sctp/debug.c
net/sctp/ulpqueue.c
net/tipc/group.c
net/wireless/Makefile
net/wireless/certs/sforshee.hex [new file with mode: 0644]
net/wireless/certs/sforshee.x509 [deleted file]
net/wireless/nl80211.c
sound/core/rawmidi.c
sound/pci/hda/patch_hdmi.c
sound/pci/hda/patch_realtek.c
sound/usb/mixer.c
sound/usb/quirks.c
tools/arch/s390/include/uapi/asm/bpf_perf_event.h
tools/kvm/kvm_stat/kvm_stat
tools/kvm/kvm_stat/kvm_stat.txt
tools/testing/selftests/bpf/Makefile
tools/testing/selftests/bpf/test_progs.c
tools/testing/selftests/bpf/test_verifier.c
tools/testing/selftests/net/config
tools/testing/selftests/x86/ldt_gdt.c
virt/kvm/arm/arch_timer.c
virt/kvm/arm/arm.c
virt/kvm/arm/mmio.c
virt/kvm/arm/mmu.c

index 376fa2f50e6bc9b41052928037acd4b3a382d380..956bb046e599d576e3f881b2901e0d369a3c9802 100644
--- a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
+++ b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
@@ -13,7 +13,6 @@ Required properties:
                  at25df321a
                  at25df641
                  at26df081a
-                 en25s64
                  mr25h128
                  mr25h256
                  mr25h10
@@ -33,7 +32,6 @@ Required properties:
                  s25fl008k
                  s25fl064k
                  sst25vf040b
-                 sst25wf040b
                  m25p40
                  m25p80
                  m25p16
index 5bf13960f7f4a3c826c10b1e15a618df82d82403..e3c48b20b1a691b37d0b425251a257c682a38eca 100644
--- a/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt
+++ b/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt
@@ -12,24 +12,30 @@ Required properties:
   - "fsl,imx53-ecspi" for SPI compatible with the one integrated on i.MX53 and later Soc
 - reg : Offset and length of the register set for the device
 - interrupts : Should contain CSPI/eCSPI interrupt
-- cs-gpios : Specifies the gpio pins to be used for chipselects.
 - clocks : Clock specifiers for both ipg and per clocks.
 - clock-names : Clock names should include both "ipg" and "per"
 See the clock consumer binding,
        Documentation/devicetree/bindings/clock/clock-bindings.txt
-- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
-               Documentation/devicetree/bindings/dma/dma.txt
-- dma-names: DMA request names should include "tx" and "rx" if present.
 
-Obsolete properties:
-- fsl,spi-num-chipselects : Contains the number of the chipselect
+Recommended properties:
+- cs-gpios : GPIOs to use as chip selects, see spi-bus.txt.  While the native chip
+select lines can be used, they appear to always generate a pulse between each
+word of a transfer.  Most use cases will require GPIO-based chip selects to
+generate a valid transaction.
 
 Optional properties:
+- num-cs :  Number of total chip selects, see spi-bus.txt.
+- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
+Documentation/devicetree/bindings/dma/dma.txt.
+- dma-names: DMA request names, if present, should include "tx" and "rx".
 - fsl,spi-rdy-drctl: Integer, representing the value of DRCTL, the register
 controlling the SPI_READY handling. Note that to enable the DRCTL consideration,
 the SPI_READY mode-flag needs to be set too.
 Valid values are: 0 (disabled), 1 (edge-triggered burst) and 2 (level-triggered burst).
 
+Obsolete properties:
+- fsl,spi-num-chipselects : Contains the number of the chipselect
+
 Example:
 
 ecspi@70010000 {
index 3448e675b4623ce81b5e0bc1116c52a12c411801..51101708a03ae1c22ad4a16c4b750a2af165a521 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -1,6 +1,4 @@
 
-<previous description obsolete, deleted>
-
 Virtual memory map with 4 level page tables:
 
 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
@@ -14,13 +12,15 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space (variable)
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
 Virtual memory map with 5 level page tables:
@@ -36,19 +36,22 @@ ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...
 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
 Architecture defines a 64-bit virtual address. Implementations can support
 less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
-through to the most-significant implemented bit are set to either all ones
-or all zero. This causes hole between user space and kernel addresses.
+through to the most-significant implemented bit are sign extended.
+This causes a hole between user space and kernel addresses if you interpret them
+as unsigned.
 
 The direct mapping covers all memory in the system up to the highest
 memory address (this means in some cases it can also include PCI memory
@@ -58,9 +61,6 @@ vmalloc space is lazily synchronized into the different PML4/PML5 pages of
 the processes using the page fault handler, with init_top_pgt as
 reference.
 
-Current X86-64 implementations support up to 46 bits of address space (64 TB),
-which is our current limit. This expands into MBZ space in the page tables.
-
 We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual
 memory window (this size is arbitrary, it can be raised later if needed).
 The mappings are not part of any other kernel PGD and are only available
@@ -72,5 +72,3 @@ following fixmap section.
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
-
--Andi Kleen, Jul 2004
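
A quick way to see the sign-extension rule above in code: a minimal user-space
sketch (an illustration only, not kernel code; it assumes 48 implemented VA
bits, i.e. 4-level paging — with 57-bit/5-level paging the shift would be 7
instead of 16):

    #include <stdbool.h>
    #include <stdint.h>

    /* Canonical iff bits 63..47 all equal bit 47.  Relies on the usual
     * arithmetic right shift of signed values. */
    static bool is_canonical_48(uint64_t vaddr)
    {
            /* Shift bit 47 up to bit 63, then shift back: this
             * replicates bit 47 through the upper 16 bits. */
            return ((int64_t)(vaddr << 16) >> 16) == (int64_t)vaddr;
    }
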
index 7e02f951b284187d5354c2b7bd39b0ef1bf5d903..ac8c441866b70d0b447b37ef3746374a4040849e 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 15
 SUBLEVEL = 0
-EXTRAVERSION = -rc4
+EXTRAVERSION = -rc5
 NAME = Fearless Coyote
 
 # *DOCUMENTATION*
index 1712f132b80d2402d94d72ea974a0c3326fa2f52..b83fdc06286a64ece150fb7e419bc587e47c3e34 100644
--- a/arch/arm/lib/csumpartialcopyuser.S
+++ b/arch/arm/lib/csumpartialcopyuser.S
                .pushsection .text.fixup,"ax"
                .align  4
 9001:          mov     r4, #-EFAULT
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+               ldr     r5, [sp, #9*4]          @ *err_ptr
+#else
                ldr     r5, [sp, #8*4]          @ *err_ptr
+#endif
                str     r4, [r5]
                ldmia   sp, {r1, r2}            @ retrieve dst, len
                add     r2, r2, r1
index 321c9c05dd9e09fc0c745a4543a286b7628f00a4..f4363d40e2cd7fd62d40d826d5296c95f15cde9f 100644
--- a/arch/arm64/kvm/hyp/debug-sr.c
+++ b/arch/arm64/kvm/hyp/debug-sr.c
@@ -74,6 +74,9 @@ static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1)
 {
        u64 reg;
 
+       /* Clear pmscr in case of early return */
+       *pmscr_el1 = 0;
+
        /* SPE present on this CPU? */
        if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
                                                  ID_AA64DFR0_PMSVER_SHIFT))
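
The hunk above is the generic "initialize the out-parameter before any early
return" fix: the restore path always consumes *pmscr_el1, so it must be
written even when SPE is absent and the function bails out early.  A
self-contained sketch of the bug class, with hypothetical names:

    static int feature_present(void) { return 0; }            /* stub */
    static unsigned long read_hw_state(void) { return 42; }   /* stub */

    static void save_state(unsigned long *state)
    {
            *state = 0;     /* clear in case of early return */

            if (!feature_present())
                    return; /* without the store above, the caller
                             * would read an uninitialized value */

            *state = read_hw_state();
    }
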
index 9345b44b86f036572e33721eb80e9bbbe4493aa4..f57118e1f6b4265257799ae2cf8ea356077e20b9 100644
--- a/arch/parisc/boot/compressed/misc.c
+++ b/arch/parisc/boot/compressed/misc.c
@@ -123,8 +123,8 @@ int puts(const char *s)
        while ((nuline = strchr(s, '\n')) != NULL) {
                if (nuline != s)
                        pdc_iodc_print(s, nuline - s);
-                       pdc_iodc_print("\r\n", 2);
-                       s = nuline + 1;
+               pdc_iodc_print("\r\n", 2);
+               s = nuline + 1;
        }
        if (*s != '\0')
                pdc_iodc_print(s, strlen(s));
index c980a02a52bc0dda0a23b205f59d1d86438553f2..598c8d60fa5e602cc9303e1986ada9680d64feb3 100644
--- a/arch/parisc/include/asm/thread_info.h
+++ b/arch/parisc/include/asm/thread_info.h
@@ -35,7 +35,12 @@ struct thread_info {
 
 /* thread information allocation */
 
+#ifdef CONFIG_IRQSTACKS
+#define THREAD_SIZE_ORDER      2 /* PA-RISC requires at least 16k stack */
+#else
 #define THREAD_SIZE_ORDER      3 /* PA-RISC requires at least 32k stack */
+#endif
+
 /* Be sure to hunt all references to this down when you change the size of
  * the kernel stack */
 #define THREAD_SIZE             (PAGE_SIZE << THREAD_SIZE_ORDER)
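
Assuming parisc's default 4 KiB base pages, the two orders match the comments
above: order 2 gives 16 KiB and order 3 gives 32 KiB.  A quick check:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL    /* assumed 4 KiB pages */

    int main(void)
    {
            printf("order 2: %lu KiB\n", (PAGE_SIZE << 2) / 1024);  /* 16 */
            printf("order 3: %lu KiB\n", (PAGE_SIZE << 3) / 1024);  /* 32 */
            return 0;
    }
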
index a4fd296c958e8e14f13a913aca50510b11eb49b7..f3cecf5117cf8ab14724f0ea3535220c3224d569 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -878,9 +878,6 @@ ENTRY_CFI(syscall_exit_rfi)
        STREG   %r19,PT_SR7(%r16)
 
 intr_return:
-       /* NOTE: Need to enable interrupts incase we schedule. */
-       ssm     PSW_SM_I, %r0
-
        /* check for reschedule */
        mfctl   %cr30,%r1
        LDREG   TI_FLAGS(%r1),%r19      /* sched.h: TIF_NEED_RESCHED */
@@ -907,6 +904,11 @@ intr_check_sig:
        LDREG   PT_IASQ1(%r16), %r20
        cmpib,COND(=),n 0,%r20,intr_restore /* backward */
 
+       /* NOTE: We need to enable interrupts if we have to deliver
+        * signals. We used to do this earlier but it caused kernel
+        * stack overflows. */
+       ssm     PSW_SM_I, %r0
+
        copy    %r0, %r25                       /* long in_syscall = 0 */
 #ifdef CONFIG_64BIT
        ldo     -16(%r30),%r29                  /* Reference param save area */
@@ -958,6 +960,10 @@ intr_do_resched:
        cmpib,COND(=)   0, %r20, intr_do_preempt
        nop
 
+       /* NOTE: We need to enable interrupts if we schedule.  We used
+        * to do this earlier but it caused kernel stack overflows. */
+       ssm     PSW_SM_I, %r0
+
 #ifdef CONFIG_64BIT
        ldo     -16(%r30),%r29          /* Reference param save area */
 #endif
index e3a8e5e4d5de75897adcea4134f87c7246f60646..8d072c44f300c16d45ba8f4ee0c2eee6435e4ddd 100644
--- a/arch/parisc/kernel/hpmc.S
+++ b/arch/parisc/kernel/hpmc.S
@@ -305,6 +305,7 @@ ENDPROC_CFI(os_hpmc)
 
 
        __INITRODATA
+       .align 4
        .export os_hpmc_size
 os_hpmc_size:
        .word .os_hpmc_end-.os_hpmc
index 5a657986ebbf4bef7beff4e8c8d20f1343872347..143f90e2f9f3c631616d4af52f0fe3fa08f44af9 100644
--- a/arch/parisc/kernel/unwind.c
+++ b/arch/parisc/kernel/unwind.c
@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 #include <linux/kallsyms.h>
 #include <linux/sort.h>
-#include <linux/sched.h>
 
 #include <linux/uaccess.h>
 #include <asm/assembly.h>
index 7eab4bb8abe630b14c54c3b457285b4228607dc6..66e506520505d8a3245d49d492831df5e3bbb42a 100644
--- a/arch/parisc/lib/delay.c
+++ b/arch/parisc/lib/delay.c
@@ -16,9 +16,7 @@
 #include <linux/preempt.h>
 #include <linux/init.h>
 
-#include <asm/processor.h>
 #include <asm/delay.h>
-
 #include <asm/special_insns.h>    /* for mfctl() */
 #include <asm/processor.h> /* for boot_cpu_data */
 
index 6177d43f0ce8afa9c1f6a1101e92ba161e47d97a..e2a2b8400490049143edee40316313a906ca6db7 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -160,9 +160,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
 #endif
 }
 
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-                                struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+                               struct mm_struct *mm)
 {
+       return 0;
 }
 
 #ifndef CONFIG_PPC_BOOK3S_64
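
This void-to-int change recurs below for um, unicore32 and x86: an
architecture can now fail mm duplication, and the fork path (kernel/fork.c is
in the diffstat above) propagates the error.  A self-contained sketch of the
caller-side pattern; the real dup_mmap() differs in detail:

    struct mm_struct { int dummy; };    /* stand-in for the sketch */

    /* Stub with the new signature; the real ones are in the hunks. */
    static int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
    {
            (void)oldmm; (void)mm;
            return 0;   /* or e.g. -ENOMEM on failure */
    }

    static int dup_mm_arch_step(struct mm_struct *oldmm, struct mm_struct *mm)
    {
            int err = arch_dup_mmap(oldmm, mm);

            if (err)    /* x86 below returns ldt_dup_context()'s result */
                    return err;
            return 0;
    }
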
index 5acb5a176dbe5c8bffe6ddb7458b7d3ac2b7019f..72be0c32e902a35fa45e5ed02036df91999dda58 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1403,7 +1403,7 @@ void show_regs(struct pt_regs * regs)
 
        printk("NIP:  "REG" LR: "REG" CTR: "REG"\n",
               regs->nip, regs->link, regs->ctr);
-       printk("REGS: %p TRAP: %04lx   %s  (%s)\n",
+       printk("REGS: %px TRAP: %04lx   %s  (%s)\n",
               regs, regs->trap, print_tainted(), init_utsname()->release);
        printk("MSR:  "REG" ", regs->msr);
        print_msr_bits(regs->msr);
index bf457843e03217b9aa02815d7791f0fce72aea2b..0d750d274c4e21a3324eb3505bbd73c86a58cdc9 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -725,7 +725,8 @@ u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu)
 
        /* Return the per-cpu state for state saving/migration */
        return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT |
-              (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT;
+              (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT |
+              (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT;
 }
 
 int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
@@ -1558,7 +1559,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
 
        /*
         * Restore P and Q. If the interrupt was pending, we
-        * force both P and Q, which will trigger a resend.
+        * force Q and !P, which will trigger a resend.
         *
         * That means that a guest that had both an interrupt
         * pending (queued) and Q set will restore with only
@@ -1566,7 +1567,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
         * is perfectly fine as coalescing interrupts that haven't
         * been presented yet is always allowed.
         */
-       if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
+       if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING))
                state->old_p = true;
        if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
                state->old_q = true;
index 46d74e81aff1b4caad4769e7686fa0a800695cd4..d183b4801bdbded832b90d2aa1a18e713f70695b 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -763,7 +763,8 @@ emit_clear:
                        func = (u8 *) __bpf_call_base + imm;
 
                        /* Save skb pointer if we need to re-cache skb data */
-                       if (bpf_helper_changes_pkt_data(func))
+                       if ((ctx->seen & SEEN_SKB) &&
+                           bpf_helper_changes_pkt_data(func))
                                PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));
 
                        bpf_jit_emit_func_call(image, ctx, (u64)func);
@@ -772,7 +773,8 @@ emit_clear:
                        PPC_MR(b2p[BPF_REG_0], 3);
 
                        /* refresh skb cache */
-                       if (bpf_helper_changes_pkt_data(func)) {
+                       if ((ctx->seen & SEEN_SKB) &&
+                           bpf_helper_changes_pkt_data(func)) {
                                /* reload skb pointer to r3 */
                                PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
                                bpf_jit_emit_skb_loads(image, ctx);
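
The same guard is added to the s390 and sparc JITs below: the cached skb
data/len registers only need to be spilled and reloaded around a helper call
when the program was actually seen using skb data.  The condition, distilled
into a standalone sketch:

    #include <stdbool.h>

    #define SEEN_SKB 0x1    /* mirrors the JITs' "seen" bookkeeping */

    struct jit_ctx { unsigned int seen; };

    /* Stand-in for the kernel's bpf_helper_changes_pkt_data(). */
    static bool helper_changes_pkt_data(void *func) { return func != 0; }

    static bool need_skb_recache(const struct jit_ctx *ctx, void *func)
    {
            return (ctx->seen & SEEN_SKB) && helper_changes_pkt_data(func);
    }
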
index 1538129663658381b6b1a425dcbf582b1ed09531..fce545774d50afc6093c28ad2f4127c24ed5331c 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -410,8 +410,12 @@ static __u64 power_pmu_bhrb_to(u64 addr)
        int ret;
        __u64 target;
 
-       if (is_kernel_addr(addr))
-               return branch_target((unsigned int *)addr);
+       if (is_kernel_addr(addr)) {
+               if (probe_kernel_read(&instr, (void *)addr, sizeof(instr)))
+                       return 0;
+
+               return branch_target(&instr);
+       }
 
        /* Userspace: need copy instruction here then translate it */
        pagefault_disable();
index 0ead3cd73caa2f8816e8c04f47cca691efba0560..be4e7f84f70a59db60e92a9bfe845678f71cc608 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -309,6 +309,19 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)
        if (!cpumask_test_and_clear_cpu(cpu, &nest_imc_cpumask))
                return 0;
 
+       /*
+        * Check whether nest_imc is registered. We could end up here if the
+        * cpuhotplug callback registration fails, i.e., the callback invokes the
+        * offline path for all successfully registered nodes. At this stage,
+        * nest_imc pmu will not be registered and we should return here.
+        *
+        * We return with a zero since this is not an offline failure. And
+        * cpuhp_setup_state() returns the actual failure reason to the caller,
+        * which in turn will call the cleanup routine.
+        */
+       if (!nest_pmus)
+               return 0;
+
        /*
         * Now that this cpu is one of the designated,
         * find a next cpu a) which is online and b) in same chip.
@@ -1171,6 +1184,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)
                if (nest_pmus == 1) {
                        cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE);
                        kfree(nest_imc_refc);
+                       kfree(per_nest_pmu_arr);
                }
 
                if (nest_pmus > 0)
@@ -1195,7 +1209,6 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)
                kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
        kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
        kfree(pmu_ptr);
-       kfree(per_nest_pmu_arr);
        return;
 }
 
@@ -1309,6 +1322,8 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id
                        ret = nest_pmu_cpumask_init();
                        if (ret) {
                                mutex_unlock(&nest_init_lock);
+                               kfree(nest_imc_refc);
+                               kfree(per_nest_pmu_arr);
                                goto err_free;
                        }
                }
index e81c16838b90f1bc9a5418bc1b4e5365e9cb0aef..9557d8b516df5a689dda995cd7fd501ddd6cf54c 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -55,8 +55,7 @@ struct bpf_jit {
 #define SEEN_LITERAL   8       /* code uses literals */
 #define SEEN_FUNC      16      /* calls C functions */
 #define SEEN_TAIL_CALL 32      /* code uses tail calls */
-#define SEEN_SKB_CHANGE        64      /* code changes skb data */
-#define SEEN_REG_AX    128     /* code uses constant blinding */
+#define SEEN_REG_AX    64      /* code uses constant blinding */
 #define SEEN_STACK     (SEEN_FUNC | SEEN_MEM | SEEN_SKB)
 
 /*
@@ -448,12 +447,12 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
                        EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
                                      REG_15, 152);
        }
-       if (jit->seen & SEEN_SKB)
+       if (jit->seen & SEEN_SKB) {
                emit_load_skb_data_hlen(jit);
-       if (jit->seen & SEEN_SKB_CHANGE)
                /* stg %b1,ST_OFF_SKBP(%r0,%r15) */
                EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15,
                              STK_OFF_SKBP);
+       }
 }
 
 /*
@@ -983,8 +982,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
                EMIT2(0x0d00, REG_14, REG_W1);
                /* lgr %b0,%r2: load return value into %b0 */
                EMIT4(0xb9040000, BPF_REG_0, REG_2);
-               if (bpf_helper_changes_pkt_data((void *)func)) {
-                       jit->seen |= SEEN_SKB_CHANGE;
+               if ((jit->seen & SEEN_SKB) &&
+                   bpf_helper_changes_pkt_data((void *)func)) {
                        /* lg %b1,ST_OFF_SKBP(%r15) */
                        EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,
                                      REG_15, STK_OFF_SKBP);
index be3136f142a9993e0c6c8cfa1d651b1685654a73..a8103a84b4ac4a2ec84c44c302862b3aed8b7e7f 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -113,7 +113,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
        if (!printk_ratelimit())
                return;
 
-       printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x",
+       printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
               task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
               tsk->comm, task_pid_nr(tsk), address,
               (void *)regs->pc, (void *)regs->u_regs[UREG_I7],
index 815c03d7a765524424b92866b1567ea2a43695d4..41363f46797bf9f74dd922fadbd2a3f190e8c9bb 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -154,7 +154,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
        if (!printk_ratelimit())
                return;
 
-       printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x",
+       printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
               task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
               tsk->comm, task_pid_nr(tsk), address,
               (void *)regs->tpc, (void *)regs->u_regs[UREG_I7],
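
These %p-to-%px conversions (and the similar ones in the powerpc hunk above
and the um hunk below) follow from 4.15's pointer hashing: plain %p now
prints a hashed value, and %px is the deliberate opt-out for places like
these segfault reports where the raw address is the point.  Kernel-context
sketch:

    /* %p: hashed since 4.15.  %px: raw value, leaks addresses, use
     * only where that is intended (as in the fault reports above). */
    printk(KERN_INFO "hashed=%p raw=%px\n", ptr, ptr);
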
index 5765e7e711f78248d2bff70f9c57ca48a4514355..ff5f9cb3039af1f91c8701915f08c051c21d0d81 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -1245,14 +1245,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
                u8 *func = ((u8 *)__bpf_call_base) + imm;
 
                ctx->saw_call = true;
+               if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
+                       emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx);
 
                emit_call((u32 *)func, ctx);
                emit_nop(ctx);
 
                emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);
 
-               if (bpf_helper_changes_pkt_data(func) && ctx->saw_ld_abs_ind)
-                       load_skb_regs(ctx, bpf2sparc[BPF_REG_6]);
+               if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
+                       load_skb_regs(ctx, L7);
                break;
        }
 
index b668e351fd6c2e4f7a4b75c8a67eada77449abc9..fca34b2177e28a055663055d01c4fb7d78420285 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -15,9 +15,10 @@ extern void uml_setup_stubs(struct mm_struct *mm);
 /*
  * Needed since we do not use the asm-generic/mm_hooks.h:
  */
-static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
        uml_setup_stubs(mm);
+       return 0;
 }
 extern void arch_exit_mmap(struct mm_struct *mm);
 static inline void arch_unmap(struct mm_struct *mm,
index 4e6fcb32620ffb2125f648622499e5bf7c950e72..428644175956231aad112a0ce221452913736635 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -150,7 +150,7 @@ static void show_segv_info(struct uml_pt_regs *regs)
        if (!printk_ratelimit())
                return;
 
-       printk("%s%s[%d]: segfault at %lx ip %p sp %p error %x",
+       printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x",
                task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
                tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
                (void *)UPT_IP(regs), (void *)UPT_SP(regs),
index 59b06b48f27d7a4e0d8b82fc147d3fdad7f75295..5c205a9cb5a6a4bb2c865255bc946d7ca4882db1 100644
--- a/arch/unicore32/include/asm/mmu_context.h
+++ b/arch/unicore32/include/asm/mmu_context.h
@@ -81,9 +81,10 @@ do { \
        } \
 } while (0)
 
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-                                struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+                               struct mm_struct *mm)
 {
+       return 0;
 }
 
 static inline void arch_unmap(struct mm_struct *mm,
index 8eed3f94bfc774de5e3f344590f8889a999dea9c..d4fc98c50378c40bc901f6446d2bfff68151eb6a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -926,7 +926,8 @@ config MAXSMP
 config NR_CPUS
        int "Maximum number of CPUs" if SMP && !MAXSMP
        range 2 8 if SMP && X86_32 && !X86_BIGSMP
-       range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK
+       range 2 64 if SMP && X86_32 && X86_BIGSMP
+       range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64
        range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64
        default "1" if !SMP
        default "8192" if MAXSMP
index 4838037f97f6edffda62b5b045c837fcc29402f0..ace8f321a5a1f2d1331cc4331a1922c9ed3d8bc1 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -941,9 +941,10 @@ ENTRY(debug)
        movl    %esp, %eax                      # pt_regs pointer
 
        /* Are we currently on the SYSENTER stack? */
-       PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
-       subl    %eax, %ecx      /* ecx = (end of SYSENTER_stack) - esp */
-       cmpl    $SIZEOF_SYSENTER_stack, %ecx
+       movl    PER_CPU_VAR(cpu_entry_area), %ecx
+       addl    $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+       subl    %eax, %ecx      /* ecx = (end of entry_stack) - esp */
+       cmpl    $SIZEOF_entry_stack, %ecx
        jb      .Ldebug_from_sysenter_stack
 
        TRACE_IRQS_OFF
@@ -984,9 +985,10 @@ ENTRY(nmi)
        movl    %esp, %eax                      # pt_regs pointer
 
        /* Are we currently on the SYSENTER stack? */
-       PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
-       subl    %eax, %ecx      /* ecx = (end of SYSENTER_stack) - esp */
-       cmpl    $SIZEOF_SYSENTER_stack, %ecx
+       movl    PER_CPU_VAR(cpu_entry_area), %ecx
+       addl    $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+       subl    %eax, %ecx      /* ecx = (end of entry_stack) - esp */
+       cmpl    $SIZEOF_entry_stack, %ecx
        jb      .Lnmi_from_sysenter_stack
 
        /* Not on SYSENTER stack. */
index f81d50d7ceacdefa06d61482687937096c68421c..3d19c830e1b1ab3c7e3115014039a35eb9607214 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -140,6 +140,64 @@ END(native_usergs_sysret64)
  * with them due to bugs in both AMD and Intel CPUs.
  */
 
+       .pushsection .entry_trampoline, "ax"
+
+/*
+ * The code in here gets remapped into cpu_entry_area's trampoline.  This means
+ * that the assembler and linker have the wrong idea as to where this code
+ * lives (and, in fact, it's mapped more than once, so it's not even at a
+ * fixed address).  So we can't reference any symbols outside the entry
+ * trampoline and expect it to work.
+ *
+ * Instead, we carefully abuse %rip-relative addressing.
+ * _entry_trampoline(%rip) refers to the start of the remapped entry
+ * trampoline.  We can thus find cpu_entry_area with this macro:
+ */
+
+#define CPU_ENTRY_AREA \
+       _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
+
+/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
+#define RSP_SCRATCH    CPU_ENTRY_AREA_entry_stack + \
+                       SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
+
+ENTRY(entry_SYSCALL_64_trampoline)
+       UNWIND_HINT_EMPTY
+       swapgs
+
+       /* Stash the user RSP. */
+       movq    %rsp, RSP_SCRATCH
+
+       /* Load the top of the task stack into RSP */
+       movq    CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
+
+       /* Start building the simulated IRET frame. */
+       pushq   $__USER_DS                      /* pt_regs->ss */
+       pushq   RSP_SCRATCH                     /* pt_regs->sp */
+       pushq   %r11                            /* pt_regs->flags */
+       pushq   $__USER_CS                      /* pt_regs->cs */
+       pushq   %rcx                            /* pt_regs->ip */
+
+       /*
+        * x86 lacks a near absolute jump, and we can't jump to the real
+        * entry text with a relative jump.  We could push the target
+        * address and then use retq, but this destroys the pipeline on
+        * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead,
+        * spill RDI and restore it in a second-stage trampoline.
+        */
+       pushq   %rdi
+       movq    $entry_SYSCALL_64_stage2, %rdi
+       jmp     *%rdi
+END(entry_SYSCALL_64_trampoline)
+
+       .popsection
+
+ENTRY(entry_SYSCALL_64_stage2)
+       UNWIND_HINT_EMPTY
+       popq    %rdi
+       jmp     entry_SYSCALL_64_after_hwframe
+END(entry_SYSCALL_64_stage2)
+
 ENTRY(entry_SYSCALL_64)
        UNWIND_HINT_EMPTY
        /*
@@ -330,8 +388,24 @@ syscall_return_via_sysret:
        popq    %rsi    /* skip rcx */
        popq    %rdx
        popq    %rsi
+
+       /*
+        * Now all regs are restored except RSP and RDI.
+        * Save old stack pointer and switch to trampoline stack.
+        */
+       movq    %rsp, %rdi
+       movq    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+       pushq   RSP-RDI(%rdi)   /* RSP */
+       pushq   (%rdi)          /* RDI */
+
+       /*
+        * We are on the trampoline stack.  All regs except RDI are live.
+        * We can do future final exit work right here.
+        */
+
        popq    %rdi
-       movq    RSP-ORIG_RAX(%rsp), %rsp
+       popq    %rsp
        USERGS_SYSRET64
 END(entry_SYSCALL_64)
 
@@ -466,12 +540,13 @@ END(irq_entries_start)
 
 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
 #ifdef CONFIG_DEBUG_ENTRY
-       pushfq
-       testl $X86_EFLAGS_IF, (%rsp)
+       pushq %rax
+       SAVE_FLAGS(CLBR_RAX)
+       testl $X86_EFLAGS_IF, %eax
        jz .Lokay_\@
        ud2
 .Lokay_\@:
-       addq $8, %rsp
+       popq %rax
 #endif
 .endm
 
@@ -563,6 +638,13 @@ END(irq_entries_start)
 /* 0(%rsp): ~(interrupt number) */
        .macro interrupt func
        cld
+
+       testb   $3, CS-ORIG_RAX(%rsp)
+       jz      1f
+       SWAPGS
+       call    switch_to_thread_stack
+1:
+
        ALLOC_PT_GPREGS_ON_STACK
        SAVE_C_REGS
        SAVE_EXTRA_REGS
@@ -572,12 +654,8 @@ END(irq_entries_start)
        jz      1f
 
        /*
-        * IRQ from user mode.  Switch to kernel gsbase and inform context
-        * tracking that we're in kernel mode.
-        */
-       SWAPGS
-
-       /*
+        * IRQ from user mode.
+        *
         * We need to tell lockdep that IRQs are off.  We can't do this until
         * we fix gsbase, and we should do it before enter_from_user_mode
         * (which can take locks).  Since TRACE_IRQS_OFF is idempotent,
@@ -630,10 +708,41 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
        ud2
 1:
 #endif
-       SWAPGS
        POP_EXTRA_REGS
-       POP_C_REGS
-       addq    $8, %rsp        /* skip regs->orig_ax */
+       popq    %r11
+       popq    %r10
+       popq    %r9
+       popq    %r8
+       popq    %rax
+       popq    %rcx
+       popq    %rdx
+       popq    %rsi
+
+       /*
+        * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
+        * Save old stack pointer and switch to trampoline stack.
+        */
+       movq    %rsp, %rdi
+       movq    PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+       /* Copy the IRET frame to the trampoline stack. */
+       pushq   6*8(%rdi)       /* SS */
+       pushq   5*8(%rdi)       /* RSP */
+       pushq   4*8(%rdi)       /* EFLAGS */
+       pushq   3*8(%rdi)       /* CS */
+       pushq   2*8(%rdi)       /* RIP */
+
+       /* Push user RDI on the trampoline stack. */
+       pushq   (%rdi)
+
+       /*
+        * We are on the trampoline stack.  All regs except RDI are live.
+        * We can do future final exit work right here.
+        */
+
+       /* Restore RDI. */
+       popq    %rdi
+       SWAPGS
        INTERRUPT_RETURN
 
 
@@ -829,7 +938,33 @@ apicinterrupt IRQ_WORK_VECTOR                      irq_work_interrupt              smp_irq_work_interrupt
 /*
  * Exception entry points.
  */
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+
+/*
+ * Switch to the thread stack.  This is called with the IRET frame and
+ * orig_ax on the stack.  (That is, RDI..R12 are not on the stack and
+ * space has not been allocated for them.)
+ */
+ENTRY(switch_to_thread_stack)
+       UNWIND_HINT_FUNC
+
+       pushq   %rdi
+       movq    %rsp, %rdi
+       movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+       UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
+
+       pushq   7*8(%rdi)               /* regs->ss */
+       pushq   6*8(%rdi)               /* regs->rsp */
+       pushq   5*8(%rdi)               /* regs->eflags */
+       pushq   4*8(%rdi)               /* regs->cs */
+       pushq   3*8(%rdi)               /* regs->ip */
+       pushq   2*8(%rdi)               /* regs->orig_ax */
+       pushq   8(%rdi)                 /* return address */
+       UNWIND_HINT_FUNC
+
+       movq    (%rdi), %rdi
+       ret
+END(switch_to_thread_stack)
 
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
@@ -848,11 +983,12 @@ ENTRY(\sym)
 
        ALLOC_PT_GPREGS_ON_STACK
 
-       .if \paranoid
-       .if \paranoid == 1
+       .if \paranoid < 2
        testb   $3, CS(%rsp)                    /* If coming from userspace, switch stacks */
-       jnz     1f
+       jnz     .Lfrom_usermode_switch_stack_\@
        .endif
+
+       .if \paranoid
        call    paranoid_entry
        .else
        call    error_entry
@@ -894,20 +1030,15 @@ ENTRY(\sym)
        jmp     error_exit
        .endif
 
-       .if \paranoid == 1
+       .if \paranoid < 2
        /*
-        * Paranoid entry from userspace.  Switch stacks and treat it
+        * Entry from userspace.  Switch stacks and treat it
         * as a normal entry.  This means that paranoid handlers
         * run in real process context if user_mode(regs).
         */
-1:
+.Lfrom_usermode_switch_stack_\@:
        call    error_entry
 
-
-       movq    %rsp, %rdi                      /* pt_regs pointer */
-       call    sync_regs
-       movq    %rax, %rsp                      /* switch stack */
-
        movq    %rsp, %rdi                      /* pt_regs pointer */
 
        .if \has_error_code
@@ -1170,6 +1301,14 @@ ENTRY(error_entry)
        SWAPGS
 
 .Lerror_entry_from_usermode_after_swapgs:
+       /* Put us onto the real thread stack. */
+       popq    %r12                            /* save return addr in %r12 */
+       movq    %rsp, %rdi                      /* arg0 = pt_regs pointer */
+       call    sync_regs
+       movq    %rax, %rsp                      /* switch stack */
+       ENCODE_FRAME_POINTER
+       pushq   %r12
+
        /*
         * We need to tell lockdep that IRQs are off.  We can't do this until
         * we fix gsbase, and we should do it before enter_from_user_mode
index 568e130d932cd2a7d44393e5fc52408cffe64f34..95ad40eb7effbdb6f605285df62d1e0bd33a6cac 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -48,7 +48,7 @@
  */
 ENTRY(entry_SYSENTER_compat)
        /* Interrupts are off on entry. */
-       SWAPGS_UNSAFE_STACK
+       SWAPGS
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
        /*
@@ -306,8 +306,11 @@ ENTRY(entry_INT80_compat)
         */
        movl    %eax, %eax
 
-       /* Construct struct pt_regs on stack (iret frame is already on stack) */
        pushq   %rax                    /* pt_regs->orig_ax */
+
+       /* switch_to_thread_stack expects orig_ax to be pushed */
+       call    switch_to_thread_stack
+
        pushq   %rdi                    /* pt_regs->di */
        pushq   %rsi                    /* pt_regs->si */
        pushq   %rdx                    /* pt_regs->dx */
index f279ba2643dc8933b9659242082e7ef2ea2d9dd6..1faf40f2dda9a862f974d4f06616402875e32ffc 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -37,6 +37,7 @@
 #include <asm/unistd.h>
 #include <asm/fixmap.h>
 #include <asm/traps.h>
+#include <asm/paravirt.h>
 
 #define CREATE_TRACE_POINTS
 #include "vsyscall_trace.h"
@@ -138,6 +139,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
        WARN_ON_ONCE(address != regs->ip);
 
+       /* This should be unreachable in NATIVE mode. */
+       if (WARN_ON(vsyscall_mode == NATIVE))
+               return false;
+
        if (vsyscall_mode == NONE) {
                warn_bad_vsyscall(KERN_INFO, regs,
                                  "vsyscall attempted with vsyscall=none");
@@ -329,16 +334,47 @@ int in_gate_area_no_mm(unsigned long addr)
        return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
 }
 
+/*
+ * The VSYSCALL page is the only user-accessible page in the kernel address
+ * range.  Normally, the kernel page tables can have _PAGE_USER clear, but
+ * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
+ * are enabled.
+ *
+ * Some day we may create a "minimal" vsyscall mode in which we emulate
+ * vsyscalls but leave the page not present.  If so, we skip calling
+ * this.
+ */
+static void __init set_vsyscall_pgtable_user_bits(void)
+{
+       pgd_t *pgd;
+       p4d_t *p4d;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       pgd = pgd_offset_k(VSYSCALL_ADDR);
+       set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
+       p4d = p4d_offset(pgd, VSYSCALL_ADDR);
+#if CONFIG_PGTABLE_LEVELS >= 5
+       p4d->p4d |= _PAGE_USER;
+#endif
+       pud = pud_offset(p4d, VSYSCALL_ADDR);
+       set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
+       pmd = pmd_offset(pud, VSYSCALL_ADDR);
+       set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
+}
+
 void __init map_vsyscall(void)
 {
        extern char __vsyscall_page;
        unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
 
-       if (vsyscall_mode != NONE)
+       if (vsyscall_mode != NONE) {
                __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
                             vsyscall_mode == NATIVE
                             ? PAGE_KERNEL_VSYSCALL
                             : PAGE_KERNEL_VVAR);
+               set_vsyscall_pgtable_user_bits();
+       }
 
        BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
                     (unsigned long)VSYSCALL_ADDR);
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
new file mode 100644 (file)
index 0000000..2fbc69a
--- /dev/null
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _ASM_X86_CPU_ENTRY_AREA_H
+#define _ASM_X86_CPU_ENTRY_AREA_H
+
+#include <linux/percpu-defs.h>
+#include <asm/processor.h>
+
+/*
+ * cpu_entry_area is a percpu region that contains things needed by the CPU
+ * and early entry/exit code.  Real types aren't used for all fields here
+ * to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+       char gdt[PAGE_SIZE];
+
+       /*
+        * The GDT is just below entry_stack and thus serves (on x86_64) as
+        * a read-only guard page.
+        */
+       struct entry_stack_page entry_stack_page;
+
+       /*
+        * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
+        * we need task switches to work, and task switches write to the TSS.
+        */
+       struct tss_struct tss;
+
+       char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+       /*
+        * Exception stacks used for IST entries.
+        *
+        * In the future, this should have a separate slot for each stack
+        * with guard pages between them.
+        */
+       char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+};
+
+#define CPU_ENTRY_AREA_SIZE    (sizeof(struct cpu_entry_area))
+#define CPU_ENTRY_AREA_TOT_SIZE        (CPU_ENTRY_AREA_SIZE * NR_CPUS)
+
+DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+
+extern void setup_cpu_entry_areas(void);
+extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
+
+#define        CPU_ENTRY_AREA_RO_IDT           CPU_ENTRY_AREA_BASE
+#define CPU_ENTRY_AREA_PER_CPU         (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
+
+#define CPU_ENTRY_AREA_RO_IDT_VADDR    ((void *)CPU_ENTRY_AREA_RO_IDT)
+
+#define CPU_ENTRY_AREA_MAP_SIZE                        \
+       (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE)
+
+extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
+
+static inline struct entry_stack *cpu_entry_stack(int cpu)
+{
+       return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
+}
+
+#endif
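
A consumer-side sketch for the accessors declared above (kernel context,
using only names from this header):

    /* Per-CPU fixed-VA alias of the entry trampoline page: */
    static void *entry_trampoline_of(int cpu)
    {
            return get_cpu_entry_area(cpu)->entry_trampoline;
    }

    /* The per-CPU entry stack comes from the same area: */
    static struct entry_stack *entry_stack_of(int cpu)
    {
            return cpu_entry_stack(cpu);
    }
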
index bf6a76202a779ee131b4df8c89449ab52abd0a79..ea9a7dde62e5c4d551ba89e429f911fb5c6603fd 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
        set_bit(bit, (unsigned long *)cpu_caps_set);    \
 } while (0)
 
+#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
+
 #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
 /*
  * Static testing of CPU features.  Used the same as boot_cpu_has().
index 4011cb03ef08e52db15f52779ce366c26359a34b..ec8be07c0cda5c9b240d351ca583409713c58406 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -7,6 +7,7 @@
 #include <asm/mmu.h>
 #include <asm/fixmap.h>
 #include <asm/irq_vectors.h>
+#include <asm/cpu_entry_area.h>
 
 #include <linux/smp.h>
 #include <linux/percpu.h>
@@ -60,17 +61,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
        return this_cpu_ptr(&gdt_page)->gdt;
 }
 
-/* Get the fixmap index for a specific processor */
-static inline unsigned int get_cpu_gdt_ro_index(int cpu)
-{
-       return FIX_GDT_REMAP_BEGIN + cpu;
-}
-
 /* Provide the fixmap address of the remapped GDT */
 static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
 {
-       unsigned int idx = get_cpu_gdt_ro_index(cpu);
-       return (struct desc_struct *)__fix_to_virt(idx);
+       return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
 }
 
 /* Provide the current read-only GDT */
@@ -185,7 +179,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr,
 #endif
 }
 
-static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
 {
        struct desc_struct *d = get_cpu_gdt_rw(cpu);
        tss_desc tss;
index 0211029076ea8b9ed6648b9bf298c99c8b2124ad..6777480d8a427eaaa07559f77985c125aa66bb6c 100644
--- a/arch/x86/include/asm/espfix.h
+++ b/arch/x86/include/asm/espfix.h
@@ -2,7 +2,7 @@
 #ifndef _ASM_X86_ESPFIX_H
 #define _ASM_X86_ESPFIX_H
 
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_ESPFIX64
 
 #include <asm/percpu.h>
 
@@ -11,7 +11,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
 
 extern void init_espfix_bsp(void);
 extern void init_espfix_ap(int cpu);
-
-#endif /* CONFIG_X86_64 */
+#else
+static inline void init_espfix_ap(int cpu) { }
+#endif
 
 #endif /* _ASM_X86_ESPFIX_H */
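
The stub added above is the usual kernel idiom: declare the real function
under the config option and supply an empty static inline otherwise, so
callers need no #ifdef of their own.  The shape, with a hypothetical option
and function name:

    #ifdef CONFIG_MY_FEATURE
    void my_feature_init(int cpu);
    #else
    static inline void my_feature_init(int cpu) { }
    #endif
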
index b0c505fe9a958c701fef6d96f281bb8ab1a773de..64c4a30e0d39621ff8587fc8da538cd3d1d9f144 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -44,7 +44,6 @@ extern unsigned long __FIXADDR_TOP;
                         PAGE_SIZE)
 #endif
 
-
 /*
  * Here we define all the compile-time 'special' virtual
  * addresses. The point is to have a constant address at
@@ -84,7 +83,6 @@ enum fixed_addresses {
        FIX_IO_APIC_BASE_0,
        FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
 #endif
-       FIX_RO_IDT,     /* Virtual mapping for read-only IDT */
 #ifdef CONFIG_X86_32
        FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
        FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
@@ -100,9 +98,6 @@ enum fixed_addresses {
 #ifdef CONFIG_X86_INTEL_MID
        FIX_LNW_VRTC,
 #endif
-       /* Fixmap entries to remap the GDTs, one per processor. */
-       FIX_GDT_REMAP_BEGIN,
-       FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
 
 #ifdef CONFIG_ACPI_APEI_GHES
        /* Used for GHES mapping from assorted contexts */
@@ -143,7 +138,7 @@ enum fixed_addresses {
 extern void reserve_top_address(unsigned long reserve);
 
 #define FIXADDR_SIZE   (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
-#define FIXADDR_START          (FIXADDR_TOP - FIXADDR_SIZE)
+#define FIXADDR_START  (FIXADDR_TOP - FIXADDR_SIZE)
 
 extern int fixmaps_set;
 
index 1b0a5abcd8aeb6e700013c5434aaeb0bba7a152f..96aa6b9884dc5b3bc8d54c9ef1c6258eea13a0d0 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
 #ifndef _ASM_X86_HYPERVISOR_H
 #define _ASM_X86_HYPERVISOR_H
 
-#ifdef CONFIG_HYPERVISOR_GUEST
-
-#include <asm/kvm_para.h>
-#include <asm/x86_init.h>
-#include <asm/xen/hypervisor.h>
-
-/*
- * x86 hypervisor information
- */
-
+/* x86 hypervisor types  */
 enum x86_hypervisor_type {
        X86_HYPER_NATIVE = 0,
        X86_HYPER_VMWARE,
@@ -39,6 +30,12 @@ enum x86_hypervisor_type {
        X86_HYPER_KVM,
 };
 
+#ifdef CONFIG_HYPERVISOR_GUEST
+
+#include <asm/kvm_para.h>
+#include <asm/x86_init.h>
+#include <asm/xen/hypervisor.h>
+
 struct hypervisor_x86 {
        /* Hypervisor name */
        const char      *name;
@@ -58,7 +55,15 @@ struct hypervisor_x86 {
 
 extern enum x86_hypervisor_type x86_hyper_type;
 extern void init_hypervisor_platform(void);
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+       return x86_hyper_type == type;
+}
 #else
 static inline void init_hypervisor_platform(void) { }
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+       return type == X86_HYPER_NATIVE;
+}
 #endif /* CONFIG_HYPERVISOR_GUEST */
 #endif /* _ASM_X86_HYPERVISOR_H */
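
Moving the enum above the #ifdef and providing both hypervisor_is_type()
variants lets callers compile unchanged whether or not
CONFIG_HYPERVISOR_GUEST is set.  Usage sketch (kernel context):

    static bool running_on_kvm(void)
    {
            /* Always false in !HYPERVISOR_GUEST builds, where only
             * X86_HYPER_NATIVE can match. */
            return hypervisor_is_type(X86_HYPER_KVM);
    }
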
diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h
new file mode 100644 (file)
index 0000000..989cfa8
--- /dev/null
+++ b/arch/x86/include/asm/invpcid.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INVPCID
+#define _ASM_X86_INVPCID
+
+static inline void __invpcid(unsigned long pcid, unsigned long addr,
+                            unsigned long type)
+{
+       struct { u64 d[2]; } desc = { { pcid, addr } };
+
+       /*
+        * The memory clobber is because the whole point is to invalidate
+        * stale TLB entries and, especially if we're flushing global
+        * mappings, we don't want the compiler to reorder any subsequent
+        * memory accesses before the TLB flush.
+        *
+        * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
+        * invpcid (%rcx), %rax in long mode.
+        */
+       asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
+                     : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+}
+
+#define INVPCID_TYPE_INDIV_ADDR                0
+#define INVPCID_TYPE_SINGLE_CTXT       1
+#define INVPCID_TYPE_ALL_INCL_GLOBAL   2
+#define INVPCID_TYPE_ALL_NON_GLOBAL    3
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invpcid_flush_one(unsigned long pcid,
+                                    unsigned long addr)
+{
+       __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invpcid_flush_single_context(unsigned long pcid)
+{
+       __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invpcid_flush_all(void)
+{
+       __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invpcid_flush_all_nonglobals(void)
+{
+       __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+}
+
+#endif /* _ASM_X86_INVPCID */
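
A caller-side sketch for the helpers above (kernel context; real callers
first verify that the CPU actually has INVPCID, e.g. via
cpu_feature_enabled(X86_FEATURE_INVPCID)):

    static void example_flushes(unsigned long pcid, unsigned long addr)
    {
            invpcid_flush_one(pcid, addr);  /* one address in one PCID */
            invpcid_flush_all();            /* all PCIDs, incl. globals */
    }
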
index c8ef23f2c28f17c59308b9c41179c47f85e075ad..89f08955fff733c688a5ce4f4a0b8d74050ee617 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
        swapgs;                                 \
        sysretl
 
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(x)          pushfq; popq %rax
+#endif
 #else
 #define INTERRUPT_RETURN               iret
 #define ENABLE_INTERRUPTS_SYSEXIT      sti; sysexit
index f86a8caa561e8873c3f34e6e8b8cd509ebadd819..395c9631e000a3a17aa574c1b25fcc2cafd5b5fb 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
 extern void show_stack_regs(struct pt_regs *regs);
 extern void __show_regs(struct pt_regs *regs, int all);
+extern void show_iret_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
 
index 9ea26f16749706fddd5b15e8bf557a9e6156e165..5ff3e8af2c2056b7fe19560ee2ba1ad7146aaf2a 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -3,6 +3,7 @@
 #define _ASM_X86_MMU_H
 
 #include <linux/spinlock.h>
+#include <linux/rwsem.h>
 #include <linux/mutex.h>
 #include <linux/atomic.h>
 
@@ -27,7 +28,8 @@ typedef struct {
        atomic64_t tlb_gen;
 
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
-       struct ldt_struct *ldt;
+       struct rw_semaphore     ldt_usr_sem;
+       struct ldt_struct       *ldt;
 #endif
 
 #ifdef CONFIG_X86_64
index 6d16d15d09a0daed96a1e3d670b6203d1779b98e..5ede7cae1d673e38effa7ce9d1cc5aaf4481ac46 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -57,11 +57,17 @@ struct ldt_struct {
 /*
  * Used for LDT copy/destruction.
  */
-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
+static inline void init_new_context_ldt(struct mm_struct *mm)
+{
+       mm->context.ldt = NULL;
+       init_rwsem(&mm->context.ldt_usr_sem);
+}
+int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
 #else  /* CONFIG_MODIFY_LDT_SYSCALL */
-static inline int init_new_context_ldt(struct task_struct *tsk,
-                                      struct mm_struct *mm)
+static inline void init_new_context_ldt(struct mm_struct *mm) { }
+static inline int ldt_dup_context(struct mm_struct *oldmm,
+                                 struct mm_struct *mm)
 {
        return 0;
 }
@@ -132,18 +138,21 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
 static inline int init_new_context(struct task_struct *tsk,
                                   struct mm_struct *mm)
 {
+       mutex_init(&mm->context.lock);
+
        mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
        atomic64_set(&mm->context.tlb_gen, 0);
 
-       #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
        if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
                /* pkey 0 is the default and always allocated */
                mm->context.pkey_allocation_map = 0x1;
                /* -1 means unallocated or invalid */
                mm->context.execute_only_pkey = -1;
        }
-       #endif
-       return init_new_context_ldt(tsk, mm);
+#endif
+       init_new_context_ldt(mm);
+       return 0;
 }
 static inline void destroy_context(struct mm_struct *mm)
 {
@@ -176,10 +185,10 @@ do {                                              \
 } while (0)
 #endif
 
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-                                struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
        paravirt_arch_dup_mmap(oldmm, mm);
+       return ldt_dup_context(oldmm, mm);
 }
 
 static inline void arch_exit_mmap(struct mm_struct *mm)
@@ -281,33 +290,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
        return __pkru_allows_pkey(vma_pkey(vma), write);
 }
 
-/*
- * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID
- * bits.  This serves two purposes.  It prevents a nasty situation in
- * which PCID-unaware code saves CR3, loads some other value (with PCID
- * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if
- * the saved ASID was nonzero.  It also means that any bugs involving
- * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger
- * deterministically.
- */
-
-static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
-{
-       if (static_cpu_has(X86_FEATURE_PCID)) {
-               VM_WARN_ON_ONCE(asid > 4094);
-               return __sme_pa(mm->pgd) | (asid + 1);
-       } else {
-               VM_WARN_ON_ONCE(asid != 0);
-               return __sme_pa(mm->pgd);
-       }
-}
-
-static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
-{
-       VM_WARN_ON_ONCE(asid > 4094);
-       return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH;
-}
-
 /*
  * This can be used from process context to figure out what the value of
  * CR3 is without needing to do a (slow) __read_cr3().
@@ -317,7 +299,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
  */
 static inline unsigned long __get_current_cr3_fast(void)
 {
-       unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm),
+       unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
                this_cpu_read(cpu_tlbstate.loaded_mm_asid));
 
        /* For now, be very restrictive about when this can be called. */
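A hedged sketch of where the LDT copy now happens on fork(): only
init_new_context() and arch_dup_mmap() are names from the hunks above; the
caller shape is paraphrased from core kernel code for illustration:

/* Sketch only -- error propagation from the LDT allocation is the point. */
static int fork_mm_sketch(struct mm_struct *oldmm, struct mm_struct *mm)
{
	int err = init_new_context(current, mm);   /* ldt = NULL, sem init */

	if (err)
		return err;

	/* ...core code duplicates the VMAs, then, at the end of dup_mmap(): */
	return arch_dup_mmap(oldmm, mm);  /* -> ldt_dup_context(), may fail */
}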
index 283efcaac8aff86f2c004bc23e4b8642cbf3d527..892df375b6155a51f584760efb9f9e77c3f732e8 100644 (file)
@@ -927,6 +927,15 @@ extern void default_banner(void);
        PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),       \
                  CLBR_NONE,                                            \
                  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(clobbers)                                        \
+       PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
+                 PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);        \
+                 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl);    \
+                 PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+#endif
+
 #endif /* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
index f2ca9b28fd68303f4494775564aa9da77ddcd53a..ce245b0cdfcaa42bd932a387bbb189ee7349bfef 100644 (file)
@@ -38,13 +38,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
 #define LAST_PKMAP 1024
 #endif
 
-#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1))     \
-                   & PMD_MASK)
+/*
+ * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
+ * to avoid include recursion hell
+ */
+#define CPU_ENTRY_AREA_PAGES   (NR_CPUS * 40)
+
+#define CPU_ENTRY_AREA_BASE                            \
+       ((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
+
+#define PKMAP_BASE             \
+       ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
 
 #ifdef CONFIG_HIGHMEM
 # define VMALLOC_END   (PKMAP_BASE - 2 * PAGE_SIZE)
 #else
-# define VMALLOC_END   (FIXADDR_START - 2 * PAGE_SIZE)
+# define VMALLOC_END   (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE)
 #endif
 
 #define MODULES_VADDR  VMALLOC_START
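To see what this does to the 32-bit layout, a standalone sketch evaluating the
macros above; FIXADDR_START and NR_CPUS are invented example values and
PMD_MASK assumes non-PAE 4MB PMDs:

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PMD_MASK		(~((1UL << 22) - 1))	/* non-PAE */
#define NR_CPUS			8UL
#define FIXADDR_START		0xfff15000UL		/* example only */

#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 40)
#define CPU_ENTRY_AREA_BASE \
	((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
#define PKMAP_BASE	((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
#define VMALLOC_END	(PKMAP_BASE - 2 * PAGE_SIZE)	/* CONFIG_HIGHMEM */

int main(void)
{
	printf("cpu_entry_area %#lx, pkmap %#lx, vmalloc_end %#lx\n",
	       CPU_ENTRY_AREA_BASE, PKMAP_BASE, VMALLOC_END);
	return 0;
}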
index 6d5f45dcd4a13caafbf184f323d0725c2d5f53e4..3d27831bc58dfac15e91a33d27083c0988851de2 100644 (file)
@@ -76,32 +76,41 @@ typedef struct { pteval_t pte; } pte_t;
 #define PGDIR_MASK     (~(PGDIR_SIZE - 1))
 
 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
-#define MAXMEM         _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+#define MAXMEM                 _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
+
 #ifdef CONFIG_X86_5LEVEL
-#define VMALLOC_SIZE_TB _AC(16384, UL)
-#define __VMALLOC_BASE _AC(0xff92000000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
+# define VMALLOC_SIZE_TB       _AC(16384, UL)
+# define __VMALLOC_BASE                _AC(0xff92000000000000, UL)
+# define __VMEMMAP_BASE                _AC(0xffd4000000000000, UL)
 #else
-#define VMALLOC_SIZE_TB        _AC(32, UL)
-#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
-#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
+# define VMALLOC_SIZE_TB       _AC(32, UL)
+# define __VMALLOC_BASE                _AC(0xffffc90000000000, UL)
+# define __VMEMMAP_BASE                _AC(0xffffea0000000000, UL)
 #endif
+
 #ifdef CONFIG_RANDOMIZE_MEMORY
-#define VMALLOC_START  vmalloc_base
-#define VMEMMAP_START  vmemmap_base
+# define VMALLOC_START         vmalloc_base
+# define VMEMMAP_START         vmemmap_base
 #else
-#define VMALLOC_START  __VMALLOC_BASE
-#define VMEMMAP_START  __VMEMMAP_BASE
+# define VMALLOC_START         __VMALLOC_BASE
+# define VMEMMAP_START         __VMEMMAP_BASE
 #endif /* CONFIG_RANDOMIZE_MEMORY */
-#define VMALLOC_END    (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
-#define MODULES_VADDR    (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
+
+#define VMALLOC_END            (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
+
+#define MODULES_VADDR          (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
 /* The module sections end with the start of the fixmap */
-#define MODULES_END   __fix_to_virt(__end_of_fixed_addresses + 1)
-#define MODULES_LEN   (MODULES_END - MODULES_VADDR)
-#define ESPFIX_PGD_ENTRY _AC(-2, UL)
-#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
-#define EFI_VA_START    ( -4 * (_AC(1, UL) << 30))
-#define EFI_VA_END      (-68 * (_AC(1, UL) << 30))
+#define MODULES_END            __fix_to_virt(__end_of_fixed_addresses + 1)
+#define MODULES_LEN            (MODULES_END - MODULES_VADDR)
+
+#define ESPFIX_PGD_ENTRY       _AC(-2, UL)
+#define ESPFIX_BASE_ADDR       (ESPFIX_PGD_ENTRY << P4D_SHIFT)
+
+#define CPU_ENTRY_AREA_PGD     _AC(-3, UL)
+#define CPU_ENTRY_AREA_BASE    (CPU_ENTRY_AREA_PGD << P4D_SHIFT)
+
+#define EFI_VA_START           ( -4 * (_AC(1, UL) << 30))
+#define EFI_VA_END             (-68 * (_AC(1, UL) << 30))
 
 #define EARLY_DYNAMIC_PAGE_TABLES      64
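The two new PGD-level slots are easiest to read numerically; a sketch assuming
4-level paging (P4D_SHIFT == 39, the !CONFIG_X86_5LEVEL case) and 64-bit longs:

#include <stdio.h>

#define P4D_SHIFT	39

int main(void)
{
	/* -2 and -3 are ESPFIX_PGD_ENTRY and CPU_ENTRY_AREA_PGD above */
	unsigned long espfix	= (unsigned long)-2 << P4D_SHIFT;
	unsigned long entryarea	= (unsigned long)-3 << P4D_SHIFT;

	/* prints 0xffffff0000000000 and 0xfffffe8000000000, matching the
	 * ranges documented in Documentation/x86/x86_64/mm.txt */
	printf("ESPFIX_BASE_ADDR    = %#lx\n", espfix);
	printf("CPU_ENTRY_AREA_BASE = %#lx\n", entryarea);
	return 0;
}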
 
index cc16fa882e3e760a40351cf3e7476ac9f25ffe00..cad8dab266bceefcd91a830371716d48679c7cc7 100644 (file)
@@ -163,9 +163,9 @@ enum cpuid_regs_idx {
 extern struct cpuinfo_x86      boot_cpu_data;
 extern struct cpuinfo_x86      new_cpu_data;
 
-extern struct tss_struct       doublefault_tss;
-extern __u32                   cpu_caps_cleared[NCAPINTS];
-extern __u32                   cpu_caps_set[NCAPINTS];
+extern struct x86_hw_tss       doublefault_tss;
+extern __u32                   cpu_caps_cleared[NCAPINTS + NBUGINTS];
+extern __u32                   cpu_caps_set[NCAPINTS + NBUGINTS];
 
 #ifdef CONFIG_SMP
 DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
@@ -253,6 +253,11 @@ static inline void load_cr3(pgd_t *pgdir)
        write_cr3(__sme_pa(pgdir));
 }
 
+/*
+ * Note that while the legacy 'TSS' name comes from 'Task State Segment',
+ * on modern x86 CPUs the TSS also holds information important to 64-bit mode,
+ * unrelated to the task-switch mechanism:
+ */
 #ifdef CONFIG_X86_32
 /* This is the TSS defined by the hardware. */
 struct x86_hw_tss {
@@ -305,7 +310,13 @@ struct x86_hw_tss {
 struct x86_hw_tss {
        u32                     reserved1;
        u64                     sp0;
+
+       /*
+        * We store cpu_current_top_of_stack in sp1 so it's always accessible.
+        * Linux does not use ring 1, so sp1 is not otherwise needed.
+        */
        u64                     sp1;
+
        u64                     sp2;
        u64                     reserved2;
        u64                     ist[7];
@@ -323,12 +334,22 @@ struct x86_hw_tss {
 #define IO_BITMAP_BITS                 65536
 #define IO_BITMAP_BYTES                        (IO_BITMAP_BITS/8)
 #define IO_BITMAP_LONGS                        (IO_BITMAP_BYTES/sizeof(long))
-#define IO_BITMAP_OFFSET               offsetof(struct tss_struct, io_bitmap)
+#define IO_BITMAP_OFFSET               (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))
 #define INVALID_IO_BITMAP_OFFSET       0x8000
 
+struct entry_stack {
+       unsigned long           words[64];
+};
+
+struct entry_stack_page {
+       struct entry_stack stack;
+} __aligned(PAGE_SIZE);
+
 struct tss_struct {
        /*
-        * The hardware state:
+        * The fixed hardware portion.  This must not cross a page boundary;
+        * doing so would violate the SDM's advice and could trigger errata.
         */
        struct x86_hw_tss       x86_tss;
 
@@ -339,18 +360,9 @@ struct tss_struct {
         * be within the limit.
         */
        unsigned long           io_bitmap[IO_BITMAP_LONGS + 1];
+} __aligned(PAGE_SIZE);
 
-#ifdef CONFIG_X86_32
-       /*
-        * Space for the temporary SYSENTER stack.
-        */
-       unsigned long           SYSENTER_stack_canary;
-       unsigned long           SYSENTER_stack[64];
-#endif
-
-} ____cacheline_aligned;
-
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
 
 /*
  * sizeof(unsigned long) coming from an extra "long" at the end
@@ -364,6 +376,9 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
 
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#else
+/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
+#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
 #endif
 
 /*
@@ -523,7 +538,7 @@ static inline void native_set_iopl_mask(unsigned mask)
 static inline void
 native_load_sp0(unsigned long sp0)
 {
-       this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+       this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 }
 
 static inline void native_swapgs(void)
@@ -535,12 +550,12 @@ static inline void native_swapgs(void)
 
 static inline unsigned long current_top_of_stack(void)
 {
-#ifdef CONFIG_X86_64
-       return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
-#else
-       /* sp0 on x86_32 is special in and around vm86 mode. */
+       /*
+        *  We can't read directly from tss.sp0: sp0 on x86_32 is special in
+        *  and around vm86 mode and sp0 on x86_64 is special because of the
+        *  entry trampoline.
+        */
        return this_cpu_read_stable(cpu_current_top_of_stack);
-#endif
 }
 
 static inline bool on_thread_stack(void)
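The new entry stack is deliberately tiny next to the page that carries it; a
C11 sketch using alignas() as a stand-in for the kernel's __aligned(PAGE_SIZE):

#include <stdio.h>
#include <stdalign.h>

struct entry_stack {
	unsigned long words[64];		/* 512 bytes on 64-bit */
};

struct entry_stack_page {
	alignas(4096) struct entry_stack stack;	/* __aligned(PAGE_SIZE) */
};

int main(void)
{
	printf("%zu stack bytes in a %zu-byte, %zu-aligned page\n",
	       sizeof(struct entry_stack), sizeof(struct entry_stack_page),
	       alignof(struct entry_stack_page));
	return 0;
}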
index 8da111b3c342bbb61a9e630e101c8a83422a15ea..f737068787729f045a578776845231b0a0ee3e0d 100644 (file)
@@ -16,6 +16,7 @@ enum stack_type {
        STACK_TYPE_TASK,
        STACK_TYPE_IRQ,
        STACK_TYPE_SOFTIRQ,
+       STACK_TYPE_ENTRY,
        STACK_TYPE_EXCEPTION,
        STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,
 };
@@ -28,6 +29,8 @@ struct stack_info {
 bool in_task_stack(unsigned long *stack, struct task_struct *task,
                   struct stack_info *info);
 
+bool in_entry_stack(unsigned long *stack, struct stack_info *info);
+
 int get_stack_info(unsigned long *stack, struct task_struct *task,
                   struct stack_info *info, unsigned long *visit_mask);
 
index 8c6bd6863db9d6b737cd0649324c154f9b9798a3..9b6df68d8fd1eba26f3651faa5c8b8f4dcf223f1 100644 (file)
@@ -79,10 +79,10 @@ do {                                                                        \
 static inline void refresh_sysenter_cs(struct thread_struct *thread)
 {
        /* Only happens when SEP is enabled, no need to test "SEP"arately: */
-       if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+       if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs))
                return;
 
-       this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+       this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs);
        wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
 }
 #endif
@@ -90,10 +90,12 @@ static inline void refresh_sysenter_cs(struct thread_struct *thread)
 /* This is used when switching tasks or entering/exiting vm86 mode. */
 static inline void update_sp0(struct task_struct *task)
 {
+       /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */
 #ifdef CONFIG_X86_32
        load_sp0(task->thread.sp0);
 #else
-       load_sp0(task_top_of_stack(task));
+       if (static_cpu_has(X86_FEATURE_XENPV))
+               load_sp0(task_top_of_stack(task));
 #endif
 }
 
index 70f425947dc50f3e99ca639c0ead0d7e1cce636d..00223333821a96616647a9cbb6fe729c4a18b7b6 100644 (file)
@@ -207,7 +207,7 @@ static inline int arch_within_stack_frames(const void * const stack,
 #else /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
+# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
 #endif
 
 #endif
index 877b5c1a1b1247116e20e7272dbade77e1874fc4..e1884cf35257b8133ca97f50d146ae3ebfcaa30f 100644 (file)
@@ -9,70 +9,66 @@
 #include <asm/cpufeature.h>
 #include <asm/special_insns.h>
 #include <asm/smp.h>
+#include <asm/invpcid.h>
 
-static inline void __invpcid(unsigned long pcid, unsigned long addr,
-                            unsigned long type)
+static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 {
-       struct { u64 d[2]; } desc = { { pcid, addr } };
-
        /*
-        * The memory clobber is because the whole point is to invalidate
-        * stale TLB entries and, especially if we're flushing global
-        * mappings, we don't want the compiler to reorder any subsequent
-        * memory accesses before the TLB flush.
-        *
-        * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
-        * invpcid (%rcx), %rax in long mode.
+        * Bump the generation count.  This also serves as a full barrier
+        * that synchronizes with switch_mm(): callers are required to order
+        * their read of mm_cpumask after their writes to the paging
+        * structures.
         */
-       asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
-                     : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+       return atomic64_inc_return(&mm->context.tlb_gen);
 }
 
-#define INVPCID_TYPE_INDIV_ADDR                0
-#define INVPCID_TYPE_SINGLE_CTXT       1
-#define INVPCID_TYPE_ALL_INCL_GLOBAL   2
-#define INVPCID_TYPE_ALL_NON_GLOBAL    3
+/* There are 12 bits of space for ASIDS in CR3 */
+#define CR3_HW_ASID_BITS               12
+/*
+ * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for
+ * user/kernel switches
+ */
+#define PTI_CONSUMED_ASID_BITS         0
 
-/* Flush all mappings for a given pcid and addr, not including globals. */
-static inline void invpcid_flush_one(unsigned long pcid,
-                                    unsigned long addr)
-{
-       __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
-}
+#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS)
+/*
+ * ASIDs are zero-based: 0->MAX_ASID_AVAILABLE are valid.  -1 below to account
+ * for them being zero-based.  Another -1 is because ASID 0 is reserved for
+ * use by non-PCID-aware users.
+ */
+#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2)
 
-/* Flush all mappings for a given PCID, not including globals. */
-static inline void invpcid_flush_single_context(unsigned long pcid)
+static inline u16 kern_pcid(u16 asid)
 {
-       __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+       VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+       /*
+        * If PCID is on, ASID-aware code paths put the ASID+1 into the
+        * PCID bits.  This serves two purposes.  It prevents a nasty
+        * situation in which PCID-unaware code saves CR3, loads some other
+        * value (with PCID == 0), and then restores CR3, thus corrupting
+        * the TLB for ASID 0 if the saved ASID was nonzero.  It also means
+        * that any bugs involving loading a PCID-enabled CR3 with
+        * CR4.PCIDE off will trigger deterministically.
+        */
+       return asid + 1;
 }
 
-/* Flush all mappings, including globals, for all PCIDs. */
-static inline void invpcid_flush_all(void)
+struct pgd_t;
+static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)
 {
-       __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+       if (static_cpu_has(X86_FEATURE_PCID)) {
+               return __sme_pa(pgd) | kern_pcid(asid);
+       } else {
+               VM_WARN_ON_ONCE(asid != 0);
+               return __sme_pa(pgd);
+       }
 }
 
-/* Flush all mappings for all PCIDs except globals. */
-static inline void invpcid_flush_all_nonglobals(void)
+static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 {
-       __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
-}
-
-static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
-{
-       u64 new_tlb_gen;
-
-       /*
-        * Bump the generation count.  This also serves as a full barrier
-        * that synchronizes with switch_mm(): callers are required to order
-        * their read of mm_cpumask after their writes to the paging
-        * structures.
-        */
-       smp_mb__before_atomic();
-       new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen);
-       smp_mb__after_atomic();
-
-       return new_tlb_gen;
+       VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE);
+       VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID));
+       return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;
 }
 
 #ifdef CONFIG_PARAVIRT
@@ -237,6 +233,9 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
 
 extern void initialize_tlbstate_and_flush(void);
 
+/*
+ * flush the entire current user mapping
+ */
 static inline void __native_flush_tlb(void)
 {
        /*
@@ -249,20 +248,12 @@ static inline void __native_flush_tlb(void)
        preempt_enable();
 }
 
-static inline void __native_flush_tlb_global_irq_disabled(void)
-{
-       unsigned long cr4;
-
-       cr4 = this_cpu_read(cpu_tlbstate.cr4);
-       /* clear PGE */
-       native_write_cr4(cr4 & ~X86_CR4_PGE);
-       /* write old PGE again and flush TLBs */
-       native_write_cr4(cr4);
-}
-
+/*
+ * flush everything
+ */
 static inline void __native_flush_tlb_global(void)
 {
-       unsigned long flags;
+       unsigned long cr4, flags;
 
        if (static_cpu_has(X86_FEATURE_INVPCID)) {
                /*
@@ -280,22 +271,36 @@ static inline void __native_flush_tlb_global(void)
         */
        raw_local_irq_save(flags);
 
-       __native_flush_tlb_global_irq_disabled();
+       cr4 = this_cpu_read(cpu_tlbstate.cr4);
+       /* toggle PGE */
+       native_write_cr4(cr4 ^ X86_CR4_PGE);
+       /* write old PGE again and flush TLBs */
+       native_write_cr4(cr4);
 
        raw_local_irq_restore(flags);
 }
 
+/*
+ * flush one page in the user mapping
+ */
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
        asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
 }
 
+/*
+ * flush everything
+ */
 static inline void __flush_tlb_all(void)
 {
-       if (boot_cpu_has(X86_FEATURE_PGE))
+       if (boot_cpu_has(X86_FEATURE_PGE)) {
                __flush_tlb_global();
-       else
+       } else {
+               /*
+                * !PGE -> !PCID (setup_pcid()), thus every flush is total.
+                */
                __flush_tlb();
+       }
 
        /*
         * Note: if we somehow had PCID but not PGE, then this wouldn't work --
@@ -306,6 +311,9 @@ static inline void __flush_tlb_all(void)
         */
 }
 
+/*
+ * flush one page in the kernel mapping
+ */
 static inline void __flush_tlb_one(unsigned long addr)
 {
        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
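A userspace sketch of the CR3 packing build_cr3() performs; the pgd physical
address and ASID below are invented, and the no-flush hint is CR3 bit 63:

#include <stdio.h>
#include <stdint.h>

#define CR3_NOFLUSH	(1ULL << 63)

static uint16_t kern_pcid(uint16_t asid)
{
	return asid + 1;	/* ASID 0 becomes PCID 1, see above */
}

int main(void)
{
	uint64_t pgd_pa = 0x1234000;	/* example pgd physical address */
	uint16_t asid = 0;

	printf("cr3         = %#llx\n",
	       (unsigned long long)(pgd_pa | kern_pcid(asid)));
	printf("cr3 noflush = %#llx\n",
	       (unsigned long long)(pgd_pa | kern_pcid(asid) | CR3_NOFLUSH));
	return 0;
}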
index 1fadd310ff680ece697fa65a8db410c380a8547e..31051f35cbb768e452c4f76a60c5415a45f572e7 100644 (file)
@@ -75,7 +75,6 @@ dotraplinkage void do_segment_not_present(struct pt_regs *, long);
 dotraplinkage void do_stack_segment(struct pt_regs *, long);
 #ifdef CONFIG_X86_64
 dotraplinkage void do_double_fault(struct pt_regs *, long);
-asmlinkage struct pt_regs *sync_regs(struct pt_regs *);
 #endif
 dotraplinkage void do_general_protection(struct pt_regs *, long);
 dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
index e9cc6fe1fc6f953c38ddcc61fcf06fd90d72ab04..c1688c2d0a128f063053697dc60bcbfbca509765 100644 (file)
@@ -7,6 +7,9 @@
 #include <asm/ptrace.h>
 #include <asm/stacktrace.h>
 
+#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip))
+#define IRET_FRAME_SIZE   (sizeof(struct pt_regs) - IRET_FRAME_OFFSET)
+
 struct unwind_state {
        struct stack_info stack_info;
        unsigned long stack_mask;
@@ -52,6 +55,10 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
 }
 
 #if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
+/*
+ * WARNING: The entire pt_regs may not be safe to dereference.  In some cases,
+ * only the iret frame registers are accessible.  Use with caution!
+ */
 static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 {
        if (unwind_done(state))
index 8ea78275480dafeb702e11ba73364cd9e7c52f21..676b7cf4b62bf84a72da2cf690efac745f124e77 100644 (file)
@@ -93,4 +93,10 @@ void common(void) {
 
        BLANK();
        DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
+
+       /* Layout info for cpu_entry_area */
+       OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
+       OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
+       OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
+       DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
 }
index dedf428b20b68b0a4748fc1ac3032193c9121362..fa1261eefa16e73cedf27aadb878753be693f919 100644 (file)
@@ -47,13 +47,8 @@ void foo(void)
        BLANK();
 
        /* Offset from the sysenter stack to tss.sp0 */
-       DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
-              offsetofend(struct tss_struct, SYSENTER_stack));
-
-       /* Offset from cpu_tss to SYSENTER_stack */
-       OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack);
-       /* Size of SYSENTER_stack */
-       DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack));
+       DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) -
+              offsetofend(struct cpu_entry_area, entry_stack_page.stack));
 
 #ifdef CONFIG_CC_STACKPROTECTOR
        BLANK();
index 630212fa9b9da3f0498fc30d4c193c5926c43abb..bf51e51d808dd8914abd3b4bca69b37ce3ec023b 100644 (file)
@@ -23,6 +23,9 @@ int main(void)
 #ifdef CONFIG_PARAVIRT
        OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
        OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
+#ifdef CONFIG_DEBUG_ENTRY
+       OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl);
+#endif
        BLANK();
 #endif
 
@@ -63,6 +66,7 @@ int main(void)
 
        OFFSET(TSS_ist, tss_struct, x86_tss.ist);
        OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
+       OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
        BLANK();
 
 #ifdef CONFIG_CC_STACKPROTECTOR
index fa998ca8aa5aa5b4899dbe8a57c5b543f927009e..c9757f07d738af73ce3bd14c51780c71a512395f 100644 (file)
@@ -476,8 +476,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
        return NULL;            /* Not found */
 }
 
-__u32 cpu_caps_cleared[NCAPINTS];
-__u32 cpu_caps_set[NCAPINTS];
+__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS];
+__u32 cpu_caps_set[NCAPINTS + NBUGINTS];
 
 void load_percpu_segment(int cpu)
 {
@@ -490,28 +490,23 @@ void load_percpu_segment(int cpu)
        load_stack_canary_segment();
 }
 
-/* Setup the fixmap mapping only once per-processor */
-static inline void setup_fixmap_gdt(int cpu)
-{
-#ifdef CONFIG_X86_64
-       /* On 64-bit systems, we use a read-only fixmap GDT. */
-       pgprot_t prot = PAGE_KERNEL_RO;
-#else
-       /*
-        * On native 32-bit systems, the GDT cannot be read-only because
-        * our double fault handler uses a task gate, and entering through
-        * a task gate needs to change an available TSS to busy.  If the GDT
-        * is read-only, that will triple fault.
-        *
-        * On Xen PV, the GDT must be read-only because the hypervisor requires
-        * it.
-        */
-       pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
-               PAGE_KERNEL_RO : PAGE_KERNEL;
+#ifdef CONFIG_X86_32
+/* The 32-bit entry code needs to find cpu_entry_area. */
+DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
 #endif
 
-       __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
-}
+#ifdef CONFIG_X86_64
+/*
+ * Special IST stacks which the CPU switches to when it calls
+ * an IST-marked descriptor entry. Up to 7 stacks (hardware
+ * limit), all of them are 4K, except the debug stack which
+ * is 8K.
+ */
+static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
+         [0 ... N_EXCEPTION_STACKS - 1]        = EXCEPTION_STKSZ,
+         [DEBUG_STACK - 1]                     = DEBUG_STKSZ
+};
+#endif
 
 /* Load the original GDT from the per-cpu structure */
 void load_direct_gdt(int cpu)
@@ -747,7 +742,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c)
 {
        int i;
 
-       for (i = 0; i < NCAPINTS; i++) {
+       for (i = 0; i < NCAPINTS + NBUGINTS; i++) {
                c->x86_capability[i] &= ~cpu_caps_cleared[i];
                c->x86_capability[i] |= cpu_caps_set[i];
        }
@@ -1250,7 +1245,7 @@ void enable_sep_cpu(void)
                return;
 
        cpu = get_cpu();
-       tss = &per_cpu(cpu_tss, cpu);
+       tss = &per_cpu(cpu_tss_rw, cpu);
 
        /*
         * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
@@ -1259,11 +1254,7 @@ void enable_sep_cpu(void)
 
        tss->x86_tss.ss1 = __KERNEL_CS;
        wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
-
-       wrmsr(MSR_IA32_SYSENTER_ESP,
-             (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
-             0);
-
+       wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);
        wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
 
        put_cpu();
@@ -1357,25 +1348,19 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
-/*
- * Special IST stacks which the CPU switches to when it calls
- * an IST-marked descriptor entry. Up to 7 stacks (hardware
- * limit), all of them are 4K, except the debug stack which
- * is 8K.
- */
-static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
-         [0 ... N_EXCEPTION_STACKS - 1]        = EXCEPTION_STKSZ,
-         [DEBUG_STACK - 1]                     = DEBUG_STKSZ
-};
-
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
-       [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
-
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
+       extern char _entry_trampoline[];
+       extern char entry_SYSCALL_64_trampoline[];
+
+       int cpu = smp_processor_id();
+       unsigned long SYSCALL64_entry_trampoline =
+               (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
+               (entry_SYSCALL_64_trampoline - _entry_trampoline);
+
        wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-       wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+       wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
 
 #ifdef CONFIG_IA32_EMULATION
        wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
@@ -1386,7 +1371,7 @@ void syscall_init(void)
         * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
         */
        wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-       wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+       wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));
        wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
        wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
@@ -1530,7 +1515,7 @@ void cpu_init(void)
        if (cpu)
                load_ucode_ap();
 
-       t = &per_cpu(cpu_tss, cpu);
+       t = &per_cpu(cpu_tss_rw, cpu);
        oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
@@ -1569,7 +1554,7 @@ void cpu_init(void)
         * set up and load the per-CPU TSS
         */
        if (!oist->ist[0]) {
-               char *estacks = per_cpu(exception_stacks, cpu);
+               char *estacks = get_cpu_entry_area(cpu)->exception_stacks;
 
                for (v = 0; v < N_EXCEPTION_STACKS; v++) {
                        estacks += exception_stack_sizes[v];
@@ -1580,7 +1565,7 @@ void cpu_init(void)
                }
        }
 
-       t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+       t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
 
        /*
         * <= is required because the CPU will access up to
@@ -1596,11 +1581,12 @@ void cpu_init(void)
        enter_lazy_tlb(&init_mm, me);
 
        /*
-        * Initialize the TSS.  Don't bother initializing sp0, as the initial
-        * task never enters user mode.
+        * Initialize the TSS.  sp0 points to the entry trampoline stack
+        * regardless of what task is running.
         */
-       set_tss_desc(cpu, t);
+       set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
        load_TR_desc();
+       load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));
 
        load_mm_ldt(&init_mm);
 
@@ -1612,7 +1598,6 @@ void cpu_init(void)
        if (is_uv_system())
                uv_cpu_init();
 
-       setup_fixmap_gdt(cpu);
        load_fixmap_gdt(cpu);
 }
 
@@ -1622,7 +1607,7 @@ void cpu_init(void)
 {
        int cpu = smp_processor_id();
        struct task_struct *curr = current;
-       struct tss_struct *t = &per_cpu(cpu_tss, cpu);
+       struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu);
 
        wait_for_master_cpu(cpu);
 
@@ -1657,12 +1642,12 @@ void cpu_init(void)
         * Initialize the TSS.  Don't bother initializing sp0, as the initial
         * task never enters user mode.
         */
-       set_tss_desc(cpu, t);
+       set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
        load_TR_desc();
 
        load_mm_ldt(&init_mm);
 
-       t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+       t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
 
 #ifdef CONFIG_DOUBLEFAULT
        /* Set up doublefault TSS pointer in the GDT */
@@ -1674,7 +1659,6 @@ void cpu_init(void)
 
        fpu__init_cpu();
 
-       setup_fixmap_gdt(cpu);
        load_fixmap_gdt(cpu);
 }
 #endif
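The MSR_LSTAR value above is plain symbol rebasing: take the SYSCALL entry's
offset inside the trampoline page and add it to this CPU's alias of that page
in the cpu_entry_area. A sketch with invented addresses:

#include <stdio.h>

int main(void)
{
	unsigned long _entry_trampoline           = 0xffffffff81a00000UL;
	unsigned long entry_SYSCALL_64_trampoline = 0xffffffff81a00040UL;
	unsigned long cea_alias                   = 0xfffffe8000002000UL;

	/* same arithmetic as syscall_init(), with made-up numbers */
	unsigned long lstar = cea_alias +
		(entry_SYSCALL_64_trampoline - _entry_trampoline);

	printf("MSR_LSTAR = %#lx\n", lstar);	/* alias + 0x40 */
	return 0;
}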
index 7dbcb7adf7975f7f29c38651c23c478ad315a34c..8ccdca6d3f9e9b876ee27f021ed8c021b1168220 100644 (file)
@@ -565,15 +565,6 @@ static void print_ucode(struct ucode_cpu_info *uci)
 }
 #else
 
-/*
- * Flush global tlb. We only do this in x86_64 where paging has been enabled
- * already and PGE should be enabled as well.
- */
-static inline void flush_tlb_early(void)
-{
-       __native_flush_tlb_global_irq_disabled();
-}
-
 static inline void print_ucode(struct ucode_cpu_info *uci)
 {
        struct microcode_intel *mc;
@@ -602,10 +593,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
        if (rev != mc->hdr.rev)
                return -1;
 
-#ifdef CONFIG_X86_64
-       /* Flush global tlb. This is precaution. */
-       flush_tlb_early();
-#endif
        uci->cpu_sig.rev = rev;
 
        if (early)
index 0e662c55ae902fedd5c78c1ed87a972b35a79856..0b8cedb20d6d92f2875a49292680c8cfecd5b044 100644 (file)
@@ -50,25 +50,23 @@ static void doublefault_fn(void)
                cpu_relax();
 }
 
-struct tss_struct doublefault_tss __cacheline_aligned = {
-       .x86_tss = {
-               .sp0            = STACK_START,
-               .ss0            = __KERNEL_DS,
-               .ldt            = 0,
-               .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
-
-               .ip             = (unsigned long) doublefault_fn,
-               /* 0x2 bit is always set */
-               .flags          = X86_EFLAGS_SF | 0x2,
-               .sp             = STACK_START,
-               .es             = __USER_DS,
-               .cs             = __KERNEL_CS,
-               .ss             = __KERNEL_DS,
-               .ds             = __USER_DS,
-               .fs             = __KERNEL_PERCPU,
-
-               .__cr3          = __pa_nodebug(swapper_pg_dir),
-       }
+struct x86_hw_tss doublefault_tss __cacheline_aligned = {
+       .sp0            = STACK_START,
+       .ss0            = __KERNEL_DS,
+       .ldt            = 0,
+       .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+
+       .ip             = (unsigned long) doublefault_fn,
+       /* 0x2 bit is always set */
+       .flags          = X86_EFLAGS_SF | 0x2,
+       .sp             = STACK_START,
+       .es             = __USER_DS,
+       .cs             = __KERNEL_CS,
+       .ss             = __KERNEL_DS,
+       .ds             = __USER_DS,
+       .fs             = __KERNEL_PERCPU,
+
+       .__cr3          = __pa_nodebug(swapper_pg_dir),
 };
 
 /* dummy for do_double_fault() call */
index f13b4c00a5de4b7a7b36c40d27311672bcc9d05c..36b17e0febe8629a9dde305625fccfc723943969 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/nmi.h>
 #include <linux/sysfs.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/stacktrace.h>
 #include <asm/unwind.h>
 
@@ -43,6 +44,24 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
        return true;
 }
 
+bool in_entry_stack(unsigned long *stack, struct stack_info *info)
+{
+       struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
+
+       void *begin = ss;
+       void *end = ss + 1;
+
+       if ((void *)stack < begin || (void *)stack >= end)
+               return false;
+
+       info->type      = STACK_TYPE_ENTRY;
+       info->begin     = begin;
+       info->end       = end;
+       info->next_sp   = NULL;
+
+       return true;
+}
+
 static void printk_stack_address(unsigned long address, int reliable,
                                 char *log_lvl)
 {
@@ -50,6 +69,28 @@ static void printk_stack_address(unsigned long address, int reliable,
        printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
 }
 
+void show_iret_regs(struct pt_regs *regs)
+{
+       printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip);
+       printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss,
+               regs->sp, regs->flags);
+}
+
+static void show_regs_safe(struct stack_info *info, struct pt_regs *regs)
+{
+       if (on_stack(info, regs, sizeof(*regs)))
+               __show_regs(regs, 0);
+       else if (on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
+                         IRET_FRAME_SIZE)) {
+               /*
+                * When an interrupt or exception occurs in entry code, the
+                * full pt_regs might not have been saved yet.  In that case
+                * just print the iret frame.
+                */
+               show_iret_regs(regs);
+       }
+}
+
 void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
                        unsigned long *stack, char *log_lvl)
 {
@@ -71,31 +112,35 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
         * - task stack
         * - interrupt stack
         * - HW exception stacks (double fault, nmi, debug, mce)
+        * - entry stack
         *
-        * x86-32 can have up to three stacks:
+        * x86-32 can have up to four stacks:
         * - task stack
         * - softirq stack
         * - hardirq stack
+        * - entry stack
         */
        for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
                const char *stack_name;
 
-               /*
-                * If we overflowed the task stack into a guard page, jump back
-                * to the bottom of the usable stack.
-                */
-               if (task_stack_page(task) - (void *)stack < PAGE_SIZE)
-                       stack = task_stack_page(task);
-
-               if (get_stack_info(stack, task, &stack_info, &visit_mask))
-                       break;
+               if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
+                       /*
+                        * We weren't on a valid stack.  It's possible that
+                        * we overflowed a valid stack into a guard page.
+                        * See if the next page up is valid so that we can
+                        * generate some kind of backtrace if this happens.
+                        */
+                       stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
+                       if (get_stack_info(stack, task, &stack_info, &visit_mask))
+                               break;
+               }
 
                stack_name = stack_type_name(stack_info.type);
                if (stack_name)
                        printk("%s <%s>\n", log_lvl, stack_name);
 
-               if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
-                       __show_regs(regs, 0);
+               if (regs)
+                       show_regs_safe(&stack_info, regs);
 
                /*
                 * Scan the stack, printing any text addresses we find.  At the
@@ -119,7 +164,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
 
                        /*
                         * Don't print regs->ip again if it was already printed
-                        * by __show_regs() below.
+                        * by show_regs_safe() below.
                         */
                        if (regs && stack == &regs->ip)
                                goto next;
@@ -155,8 +200,8 @@ next:
 
                        /* if the frame has entry regs, print them */
                        regs = unwind_get_entry_regs(&state);
-                       if (regs && on_stack(&stack_info, regs, sizeof(*regs)))
-                               __show_regs(regs, 0);
+                       if (regs)
+                               show_regs_safe(&stack_info, regs);
                }
 
                if (stack_name)
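show_regs_safe() leans on the iret frame being the five-word tail of pt_regs;
a standalone sketch of the IRET_FRAME_OFFSET/IRET_FRAME_SIZE arithmetic with a
stand-in layout:

#include <stdio.h>
#include <stddef.h>

struct pt_regs_sketch {
	unsigned long gprs[15];		/* stand-in for r15..di */
	unsigned long orig_ax;
	unsigned long ip, cs, flags, sp, ss;	/* the hardware iret frame */
};

#define IRET_FRAME_OFFSET	offsetof(struct pt_regs_sketch, ip)
#define IRET_FRAME_SIZE		(sizeof(struct pt_regs_sketch) - IRET_FRAME_OFFSET)

int main(void)
{
	/* 128-byte offset, 40 bytes (five words) on 64-bit */
	printf("iret frame: offset %zu, size %zu\n",
	       (size_t)IRET_FRAME_OFFSET, (size_t)IRET_FRAME_SIZE);
	return 0;
}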
index daefae83a3aa86c59602b75bd3e6734c6e3b1030..04170f63e3a1d567caac3deea641e014b7e10823 100644 (file)
@@ -26,6 +26,9 @@ const char *stack_type_name(enum stack_type type)
        if (type == STACK_TYPE_SOFTIRQ)
                return "SOFTIRQ";
 
+       if (type == STACK_TYPE_ENTRY)
+               return "ENTRY_TRAMPOLINE";
+
        return NULL;
 }
 
@@ -93,6 +96,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
        if (task != current)
                goto unknown;
 
+       if (in_entry_stack(stack, info))
+               goto recursion_check;
+
        if (in_hardirq_stack(stack, info))
                goto recursion_check;
 
index 88ce2ffdb110303502ad33e64d357d8af5afd8c6..563e28d14f2ca157178d9de3a139d8370aaf89fe 100644 (file)
@@ -37,6 +37,15 @@ const char *stack_type_name(enum stack_type type)
        if (type == STACK_TYPE_IRQ)
                return "IRQ";
 
+       if (type == STACK_TYPE_ENTRY) {
+               /*
+                * On 64-bit, we have a generic entry stack that we
+                * use for all the kernel entry points, including
+                * SYSENTER.
+                */
+               return "ENTRY_TRAMPOLINE";
+       }
+
        if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
                return exception_stack_names[type - STACK_TYPE_EXCEPTION];
 
@@ -115,6 +124,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,
        if (in_irq_stack(stack, info))
                goto recursion_check;
 
+       if (in_entry_stack(stack, info))
+               goto recursion_check;
+
        goto unknown;
 
 recursion_check:
index 3feb648781c470a7a49ee26749712ba7da891fe9..2f723301eb58fc5ad0d6796b342446ae2ee0c9e6 100644 (file)
@@ -67,7 +67,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
         * because the ->io_bitmap_max value must match the bitmap
         * contents:
         */
-       tss = &per_cpu(cpu_tss, get_cpu());
+       tss = &per_cpu(cpu_tss_rw, get_cpu());
 
        if (turn_on)
                bitmap_clear(t->io_bitmap_ptr, from, num);
index 49cfd9fe7589fa5ef2bef5d4a5d6431b7007836f..68e1867cca8045d0ed728ffc6b75a866c25484ed 100644 (file)
@@ -219,18 +219,6 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
        /* high bit used in ret_from_ code  */
        unsigned vector = ~regs->orig_ax;
 
-       /*
-        * NB: Unlike exception entries, IRQ entries do not reliably
-        * handle context tracking in the low-level entry code.  This is
-        * because syscall entries execute briefly with IRQs on before
-        * updating context tracking state, so we can take an IRQ from
-        * kernel mode with CONTEXT_USER.  The low-level entry code only
-        * updates the context if we came from user mode, so we won't
-        * switch to CONTEXT_KERNEL.  We'll fix that once the syscall
-        * code is cleaned up enough that we can cleanly defer enabling
-        * IRQs.
-        */
-
        entering_irq();
 
        /* entering_irq() tells RCU that we're not quiescent.  Check it. */
index 020efbf5786b35d343a8632cd14ac4f800465d9b..d86e344f5b3debfed504b72a7c0f83f36fe16387 100644 (file)
@@ -57,10 +57,10 @@ static inline void stack_overflow_check(struct pt_regs *regs)
        if (regs->sp >= estack_top && regs->sp <= estack_bottom)
                return;
 
-       WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
+       WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
                current->comm, curbase, regs->sp,
                irq_stack_top, irq_stack_bottom,
-               estack_top, estack_bottom);
+               estack_top, estack_bottom, (void *)regs->ip);
 
        if (sysctl_panic_on_stackoverflow)
                panic("low stack detected by irq handler - check messages\n");
index 1c1eae9613406b14c3154065e1fd036f985a384c..a6b5d62f45a737b84124411f56a64fdd869ca4f8 100644 (file)
@@ -5,6 +5,11 @@
  * Copyright (C) 2002 Andi Kleen
  *
  * This handles calls from both 32bit and 64bit mode.
+ *
+ * Lock order:
+ *     context.ldt_usr_sem
+ *       mmap_sem
+ *         context.lock
  */
 
 #include <linux/errno.h>
@@ -42,7 +47,7 @@ static void refresh_ldt_segments(void)
 #endif
 }
 
-/* context.lock is held for us, so we don't need any locking. */
+/* context.lock is held by the task which issued the smp function call */
 static void flush_ldt(void *__mm)
 {
        struct mm_struct *mm = __mm;
@@ -99,15 +104,17 @@ static void finalize_ldt_struct(struct ldt_struct *ldt)
        paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
 }
 
-/* context.lock is held */
-static void install_ldt(struct mm_struct *current_mm,
-                       struct ldt_struct *ldt)
+static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)
 {
+       mutex_lock(&mm->context.lock);
+
        /* Synchronizes with READ_ONCE in load_mm_ldt. */
-       smp_store_release(&current_mm->context.ldt, ldt);
+       smp_store_release(&mm->context.ldt, ldt);
 
-       /* Activate the LDT for all CPUs using current_mm. */
-       on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true);
+       /* Activate the LDT for all CPUs using current's mm. */
+       on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true);
+
+       mutex_unlock(&mm->context.lock);
 }
 
 static void free_ldt_struct(struct ldt_struct *ldt)
@@ -124,27 +131,20 @@ static void free_ldt_struct(struct ldt_struct *ldt)
 }
 
 /*
- * we do not have to muck with descriptors here, that is
- * done in switch_mm() as needed.
+ * Called on fork from arch_dup_mmap(). Just copy the current LDT state;
+ * the new task is not running, so nothing can be installed.
  */
-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
+int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
 {
        struct ldt_struct *new_ldt;
-       struct mm_struct *old_mm;
        int retval = 0;
 
-       mutex_init(&mm->context.lock);
-       old_mm = current->mm;
-       if (!old_mm) {
-               mm->context.ldt = NULL;
+       if (!old_mm)
                return 0;
-       }
 
        mutex_lock(&old_mm->context.lock);
-       if (!old_mm->context.ldt) {
-               mm->context.ldt = NULL;
+       if (!old_mm->context.ldt)
                goto out_unlock;
-       }
 
        new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
        if (!new_ldt) {
@@ -180,7 +180,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
        unsigned long entries_size;
        int retval;
 
-       mutex_lock(&mm->context.lock);
+       down_read(&mm->context.ldt_usr_sem);
 
        if (!mm->context.ldt) {
                retval = 0;
@@ -209,7 +209,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
        retval = bytecount;
 
 out_unlock:
-       mutex_unlock(&mm->context.lock);
+       up_read(&mm->context.ldt_usr_sem);
        return retval;
 }
 
@@ -269,7 +269,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
                        ldt.avl = 0;
        }
 
-       mutex_lock(&mm->context.lock);
+       if (down_write_killable(&mm->context.ldt_usr_sem))
+               return -EINTR;
 
        old_ldt       = mm->context.ldt;
        old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
@@ -291,7 +292,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
        error = 0;
 
 out_unlock:
-       mutex_unlock(&mm->context.lock);
+       up_write(&mm->context.ldt_usr_sem);
 out:
        return error;
 }
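The documented lock order is easiest to see as the shape of write_ldt(); a
hedged sketch with allocation and error handling elided:

/* Locking shape only -- not the full write_ldt() logic. */
static int write_ldt_sketch(struct mm_struct *mm, struct ldt_struct *new_ldt)
{
	if (down_write_killable(&mm->context.ldt_usr_sem))
		return -EINTR;		/* outermost: serializes user LDT ops */

	/* ...validate the descriptor and fill new_ldt here... */

	install_ldt(mm, new_ldt);	/* takes context.lock, then IPIs all
					 * CPUs currently running this mm */

	up_write(&mm->context.ldt_usr_sem);
	return 0;
}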
index ac0be8283325edfdc2752f862b4c0cef208a931c..9edadabf04f66c657f8a29bb56fe994b2559d5cf 100644 (file)
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
 DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
 
 DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -60,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                PATCH_SITE(pv_mmu_ops, read_cr2);
                PATCH_SITE(pv_mmu_ops, read_cr3);
                PATCH_SITE(pv_mmu_ops, write_cr3);
-               PATCH_SITE(pv_mmu_ops, flush_tlb_single);
                PATCH_SITE(pv_cpu_ops, wbinvd);
 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
                case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
index bb988a24db927d758f9120d45f90d1c160628790..aed9d94bd46f41bb049b8e0153a44a43d97e80b4 100644 (file)
@@ -47,7 +47,7 @@
  * section. Since TSSs are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss_rw) = {
        .x86_tss = {
                /*
                 * .sp0 is only used when entering ring 0 from a lower
@@ -56,6 +56,16 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
                 * Poison it.
                 */
                .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
+
+#ifdef CONFIG_X86_64
+               /*
+                * .sp1 is cpu_current_top_of_stack.  The init task never
+                * runs user code, but cpu_current_top_of_stack should still
+                * be well defined before the first context switch.
+                */
+               .sp1 = TOP_OF_INIT_STACK,
+#endif
+
 #ifdef CONFIG_X86_32
                .ss0 = __KERNEL_DS,
                .ss1 = __KERNEL_CS,
@@ -71,11 +81,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
          */
        .io_bitmap              = { [0 ... IO_BITMAP_LONGS] = ~0 },
 #endif
-#ifdef CONFIG_X86_32
-       .SYSENTER_stack_canary  = STACK_END_MAGIC,
-#endif
 };
-EXPORT_PER_CPU_SYMBOL(cpu_tss);
+EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);
 
 DEFINE_PER_CPU(bool, __tss_limit_invalid);
 EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);
@@ -104,7 +111,7 @@ void exit_thread(struct task_struct *tsk)
        struct fpu *fpu = &t->fpu;
 
        if (bp) {
-               struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
+               struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu());
 
                t->io_bitmap_ptr = NULL;
                clear_thread_flag(TIF_IO_BITMAP);
index 45bf0c5f93e15103060d67d5245756ab72ce8fe5..5224c609918416337b97440eb2d515d8052463ae 100644 (file)
@@ -234,7 +234,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        struct fpu *prev_fpu = &prev->fpu;
        struct fpu *next_fpu = &next->fpu;
        int cpu = smp_processor_id();
-       struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+       struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
 
        /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
index eeeb34f85c250e8c01188b6d32cf5a62bd1af8a0..c754662320163107ca3a254362ce0e404a8d3c11 100644 (file)
@@ -69,9 +69,8 @@ void __show_regs(struct pt_regs *regs, int all)
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;
 
-       printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs, (void *)regs->ip);
-       printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
-               regs->sp, regs->flags);
+       show_iret_regs(regs);
+
        if (regs->orig_ax != -1)
                pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
        else
@@ -88,6 +87,9 @@ void __show_regs(struct pt_regs *regs, int all)
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);
 
+       if (!all)
+               return;
+
        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
@@ -98,9 +100,6 @@ void __show_regs(struct pt_regs *regs, int all)
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
 
-       if (!all)
-               return;
-
        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = __read_cr3();
@@ -400,7 +399,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        struct fpu *prev_fpu = &prev->fpu;
        struct fpu *next_fpu = &next->fpu;
        int cpu = smp_processor_id();
-       struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+       struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu);
 
        WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
                     this_cpu_read(irq_count) != -1);
@@ -462,6 +461,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         * Switch the PDA and FPU contexts.
         */
        this_cpu_write(current_task, next_p);
+       this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
 
        /* Reload sp0. */
        update_sp0(next_p);
index 35cb20994e32d2bf05f0b1510ccc26cc7e7590a5..c5970efa85570ab324bd1cad2e57d464dba86f46 100644 (file)
@@ -932,12 +932,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
        initial_code = (unsigned long)start_secondary;
        initial_stack  = idle->thread.sp;
 
-       /*
-        * Enable the espfix hack for this CPU
-       */
-#ifdef CONFIG_X86_ESPFIX64
+       /* Enable the espfix hack for this CPU */
        init_espfix_ap(cpu);
-#endif
 
        /* So we see what's up */
        announce_cpu(cpu, apicid);
index 989514c94a55d8fa93a07192edd199be1a607bf8..f69dbd47d7332f4af7e5f274bb6aa9736f3014bd 100644 (file)
@@ -51,6 +51,7 @@
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/fpu/internal.h>
+#include <asm/cpu_entry_area.h>
 #include <asm/mce.h>
 #include <asm/fixmap.h>
 #include <asm/mach_traps.h>
@@ -348,9 +349,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 
        /*
         * If IRET takes a non-IST fault on the espfix64 stack, then we
-        * end up promoting it to a doublefault.  In that case, modify
-        * the stack to make it look like we just entered the #GP
-        * handler from user space, similar to bad_iret.
+        * end up promoting it to a doublefault.  In that case, take
+        * advantage of the fact that we're not using the normal (TSS.sp0)
+        * stack right now.  We can write a fake #GP(0) frame at TSS.sp0
+        * and then modify our own IRET frame so that, when we return,
+        * we land directly at the #GP(0) vector with the stack already
+        * set up according to its expectations.
+        *
+        * The net result is that our #GP handler will think that we
+        * entered from usermode with the bad user context.
         *
         * No need for ist_enter here because we don't use RCU.
         */
@@ -358,13 +365,26 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
                regs->cs == __KERNEL_CS &&
                regs->ip == (unsigned long)native_irq_return_iret)
        {
-               struct pt_regs *normal_regs = task_pt_regs(current);
+               struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
 
-               /* Fake a #GP(0) from userspace. */
-               memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
-               normal_regs->orig_ax = 0;  /* Missing (lost) #GP error code */
+               /*
+                * regs->sp points to the failing IRET frame on the
+                * ESPFIX64 stack.  Copy it to the entry stack.  This fills
+                * in gpregs->ss through gpregs->ip.
+                */
+               memmove(&gpregs->ip, (void *)regs->sp, 5*8);
+               gpregs->orig_ax = 0;  /* Missing (lost) #GP error code */
+
+               /*
+                * Adjust our frame so that we return straight to the #GP
+                * vector with the expected RSP value.  This is safe because
+                * we won't enable interrupts or schedule before we invoke
+                * general_protection, so nothing will clobber the stack
+                * frame we just set up.
+                */
                regs->ip = (unsigned long)general_protection;
-               regs->sp = (unsigned long)&normal_regs->orig_ax;
+               regs->sp = (unsigned long)&gpregs->orig_ax;
 
                return;
        }
@@ -389,7 +409,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
         *
         *   Processors update CR2 whenever a page fault is detected. If a
         *   second page fault occurs while an earlier page fault is being
-        *   delivered, the faulting linear address of the second fault will
+        *   delivered, the faulting linear address of the second fault will
         *   overwrite the contents of CR2 (replacing the previous
         *   address). These updates to CR2 occur even if the page fault
         *   results in a double fault or occurs during the delivery of a
@@ -605,14 +625,15 @@ NOKPROBE_SYMBOL(do_int3);
 
 #ifdef CONFIG_X86_64
 /*
- * Help handler running on IST stack to switch off the IST stack if the
- * interrupted code was in user mode. The actual stack switch is done in
- * entry_64.S
+ * Help a handler running on a per-cpu (IST or entry trampoline) stack
+ * switch to the normal thread stack if the interrupted code was in
+ * user mode. The actual stack switch is done in entry_64.S
  */
 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
 {
-       struct pt_regs *regs = task_pt_regs(current);
-       *regs = *eregs;
+       struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
+       if (regs != eregs)
+               *regs = *eregs;
        return regs;
 }
 NOKPROBE_SYMBOL(sync_regs);
@@ -628,13 +649,13 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
        /*
         * This is called from entry_64.S early in handling a fault
         * caused by a bad iret to user mode.  To handle the fault
-        * correctly, we want move our stack frame to task_pt_regs
-        * and we want to pretend that the exception came from the
-        * iret target.
+        * correctly, we want to move our stack frame to where it would
+        * be had we entered directly on the entry stack (rather than
+        * just below the IRET frame) and we want to pretend that the
+        * exception came from the IRET target.
         */
        struct bad_iret_stack *new_stack =
-               container_of(task_pt_regs(current),
-                            struct bad_iret_stack, regs);
+               (struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
 
        /* Copy the IRET target to the new stack. */
        memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
@@ -795,14 +816,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
        debug_stack_usage_dec();
 
 exit:
-#if defined(CONFIG_X86_32)
-       /*
-        * This is the most likely code path that involves non-trivial use
-        * of the SYSENTER stack.  Check that we haven't overrun it.
-        */
-       WARN(this_cpu_read(cpu_tss.SYSENTER_stack_canary) != STACK_END_MAGIC,
-            "Overran or corrupted SYSENTER stack\n");
-#endif
        ist_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug);
@@ -929,6 +942,9 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 
 void __init trap_init(void)
 {
+       /* Init cpu_entry_area before IST entries are set up */
+       setup_cpu_entry_areas();
+
        idt_setup_traps();
 
        /*
@@ -936,8 +952,9 @@ void __init trap_init(void)
         * "sidt" instruction will not leak the location of the kernel, and
         * to defend the IDT against arbitrary memory write vulnerabilities.
         * It will be reloaded in cpu_init() */
-       __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
-       idt_descr.address = fix_to_virt(FIX_RO_IDT);
+       cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
+                   PAGE_KERNEL_RO);
+       idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
 
        /*
         * Should be a barrier for any external CPU state:
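
The double-fault path above carves a fake #GP(0) frame out of the top of the entry stack by treating TSS.sp0 as a one-past-the-end pointer and backing up by one struct pt_regs. A minimal user-space sketch of that arithmetic; struct fake_regs and all values here are illustrative stand-ins, not the kernel's pt_regs layout:

```c
#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct fake_regs {                      /* simplified stand-in for pt_regs */
	uint64_t ip, cs, flags, sp, ss; /* the five words an IRET frame holds */
};

static unsigned char stack[4096];       /* stand-in for the entry stack */

int main(void)
{
	/* sp0 points one past the top of the stack, like TSS.sp0 */
	void *sp0 = stack + sizeof(stack);

	/* back up by exactly one frame: the "(struct pt_regs *)sp0 - 1" trick */
	struct fake_regs *gpregs = (struct fake_regs *)sp0 - 1;

	/* a made-up failing IRET frame, as found on the espfix64 stack */
	struct fake_regs iret_frame = { 0x1000, 0x10, 0x202, 0x2000, 0x18 };

	/* copy it into the freshly carved slot, filling ss through ip */
	memmove(gpregs, &iret_frame, sizeof(*gpregs));

	printf("frame at %p ends exactly at stack top %p\n",
	       (void *)gpregs, sp0);
	return 0;
}
```

The "- 1" is what reserves exactly one frame's worth of bytes below the stack top, which is why the copied words land where the #GP entry code expects them.
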
index a3f973b2c97a03b121fe0173dbdc9298216721e6..be86a865087a6b9dc8e04031dbf2e2fbeeda1ed5 100644 (file)
@@ -253,22 +253,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
        return NULL;
 }
 
-static bool stack_access_ok(struct unwind_state *state, unsigned long addr,
+static bool stack_access_ok(struct unwind_state *state, unsigned long _addr,
                            size_t len)
 {
        struct stack_info *info = &state->stack_info;
+       void *addr = (void *)_addr;
 
-       /*
-        * If the address isn't on the current stack, switch to the next one.
-        *
-        * We may have to traverse multiple stacks to deal with the possibility
-        * that info->next_sp could point to an empty stack and the address
-        * could be on a subsequent stack.
-        */
-       while (!on_stack(info, (void *)addr, len))
-               if (get_stack_info(info->next_sp, state->task, info,
-                                  &state->stack_mask))
-                       return false;
+       if (!on_stack(info, addr, len) &&
+           (get_stack_info(addr, state->task, info, &state->stack_mask)))
+               return false;
 
        return true;
 }
@@ -283,42 +276,32 @@ static bool deref_stack_reg(struct unwind_state *state, unsigned long addr,
        return true;
 }
 
-#define REGS_SIZE (sizeof(struct pt_regs))
-#define SP_OFFSET (offsetof(struct pt_regs, sp))
-#define IRET_REGS_SIZE (REGS_SIZE - offsetof(struct pt_regs, ip))
-#define IRET_SP_OFFSET (SP_OFFSET - offsetof(struct pt_regs, ip))
-
 static bool deref_stack_regs(struct unwind_state *state, unsigned long addr,
-                            unsigned long *ip, unsigned long *sp, bool full)
+                            unsigned long *ip, unsigned long *sp)
 {
-       size_t regs_size = full ? REGS_SIZE : IRET_REGS_SIZE;
-       size_t sp_offset = full ? SP_OFFSET : IRET_SP_OFFSET;
-       struct pt_regs *regs = (struct pt_regs *)(addr + regs_size - REGS_SIZE);
-
-       if (IS_ENABLED(CONFIG_X86_64)) {
-               if (!stack_access_ok(state, addr, regs_size))
-                       return false;
+       struct pt_regs *regs = (struct pt_regs *)addr;
 
-               *ip = regs->ip;
-               *sp = regs->sp;
+       /* x86-32 support will be more complicated due to the &regs->sp hack */
+       BUILD_BUG_ON(IS_ENABLED(CONFIG_X86_32));
 
-               return true;
-       }
-
-       if (!stack_access_ok(state, addr, sp_offset))
+       if (!stack_access_ok(state, addr, sizeof(struct pt_regs)))
                return false;
 
        *ip = regs->ip;
+       *sp = regs->sp;
+       return true;
+}
 
-       if (user_mode(regs)) {
-               if (!stack_access_ok(state, addr + sp_offset,
-                                    REGS_SIZE - SP_OFFSET))
-                       return false;
+static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr,
+                                 unsigned long *ip, unsigned long *sp)
+{
+       struct pt_regs *regs = (void *)addr - IRET_FRAME_OFFSET;
 
-               *sp = regs->sp;
-       } else
-               *sp = (unsigned long)&regs->sp;
+       if (!stack_access_ok(state, addr, IRET_FRAME_SIZE))
+               return false;
 
+       *ip = regs->ip;
+       *sp = regs->sp;
        return true;
 }
 
@@ -327,7 +310,6 @@ bool unwind_next_frame(struct unwind_state *state)
        unsigned long ip_p, sp, orig_ip, prev_sp = state->sp;
        enum stack_type prev_type = state->stack_info.type;
        struct orc_entry *orc;
-       struct pt_regs *ptregs;
        bool indirect = false;
 
        if (unwind_done(state))
@@ -435,7 +417,7 @@ bool unwind_next_frame(struct unwind_state *state)
                break;
 
        case ORC_TYPE_REGS:
-               if (!deref_stack_regs(state, sp, &state->ip, &state->sp, true)) {
+               if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
                        orc_warn("can't dereference registers at %p for ip %pB\n",
                                 (void *)sp, (void *)orig_ip);
                        goto done;
@@ -447,20 +429,14 @@ bool unwind_next_frame(struct unwind_state *state)
                break;
 
        case ORC_TYPE_REGS_IRET:
-               if (!deref_stack_regs(state, sp, &state->ip, &state->sp, false)) {
+               if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
                        orc_warn("can't dereference iret registers at %p for ip %pB\n",
                                 (void *)sp, (void *)orig_ip);
                        goto done;
                }
 
-               ptregs = container_of((void *)sp, struct pt_regs, ip);
-               if ((unsigned long)ptregs >= prev_sp &&
-                   on_stack(&state->stack_info, ptregs, REGS_SIZE)) {
-                       state->regs = ptregs;
-                       state->full_regs = false;
-               } else
-                       state->regs = NULL;
-
+               state->regs = (void *)sp - IRET_FRAME_OFFSET;
+               state->full_regs = false;
                state->signal = true;
                break;
 
@@ -553,8 +529,18 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
        }
 
        if (get_stack_info((unsigned long *)state->sp, state->task,
-                          &state->stack_info, &state->stack_mask))
-               return;
+                          &state->stack_info, &state->stack_mask)) {
+               /*
+                * We weren't on a valid stack.  It's possible that
+                * we overflowed a valid stack into a guard page.
+                * See if the next page up is valid so that we can
+                * generate some kind of backtrace if this happens.
+                */
+               void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
+               if (get_stack_info(next_page, state->task, &state->stack_info,
+                                  &state->stack_mask))
+                       return;
+       }
 
        /*
         * The caller can provide the address of the first frame directly
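
The fallback above assumes an overflowed stack pointer sits in a guard page just below an otherwise valid stack, so rounding it up to the next page boundary gives a probe-able address. A small sketch of that rounding; PAGE_ALIGN is written out locally and the sample sp is made up:

```c
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	/* hypothetical sp that overflowed into the guard page below a stack */
	unsigned long sp = 0x7f0000001fd8UL;
	unsigned long next_page = PAGE_ALIGN(sp);

	printf("sp %#lx -> probe next page at %#lx\n", sp, next_page);
	return 0;
}
```
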
index a4009fb9be8725ce7bda96cd5e8160e524903266..d2a8b5a24a44a554e2f81f3b30309ef39aba0d8a 100644 (file)
@@ -107,6 +107,15 @@ SECTIONS
                SOFTIRQENTRY_TEXT
                *(.fixup)
                *(.gnu.warning)
+
+#ifdef CONFIG_X86_64
+               . = ALIGN(PAGE_SIZE);
+               _entry_trampoline = .;
+               *(.entry_trampoline)
+               . = ALIGN(PAGE_SIZE);
+               ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
+#endif
+
                /* End of text section */
                _etext = .;
        } :text = 0x9090
index abe74f779f9d793e9a6c2f19417f23b5aa7ce484..b514b2b2845a334d4b53f28ed0b73c96f12d0e6a 100644 (file)
@@ -2390,9 +2390,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
 }
 
 static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
-                                    u64 cr0, u64 cr4)
+                                   u64 cr0, u64 cr3, u64 cr4)
 {
        int bad;
+       u64 pcid;
+
+       /* In order to later set CR4.PCIDE, CR3[11:0] must be zero.  */
+       pcid = 0;
+       if (cr4 & X86_CR4_PCIDE) {
+               pcid = cr3 & 0xfff;
+               cr3 &= ~0xfff;
+       }
+
+       bad = ctxt->ops->set_cr(ctxt, 3, cr3);
+       if (bad)
+               return X86EMUL_UNHANDLEABLE;
 
        /*
         * First enable PAE, long mode needs it before CR0.PG = 1 is set.
@@ -2411,6 +2423,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
                bad = ctxt->ops->set_cr(ctxt, 4, cr4);
                if (bad)
                        return X86EMUL_UNHANDLEABLE;
+               if (pcid) {
+                       bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
+                       if (bad)
+                               return X86EMUL_UNHANDLEABLE;
+               }
        }
 
        return X86EMUL_CONTINUE;
@@ -2421,11 +2439,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
        struct desc_struct desc;
        struct desc_ptr dt;
        u16 selector;
-       u32 val, cr0, cr4;
+       u32 val, cr0, cr3, cr4;
        int i;
 
        cr0 =                      GET_SMSTATE(u32, smbase, 0x7ffc);
-       ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8));
+       cr3 =                      GET_SMSTATE(u32, smbase, 0x7ff8);
        ctxt->eflags =             GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
        ctxt->_eip =               GET_SMSTATE(u32, smbase, 0x7ff0);
 
@@ -2467,14 +2485,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
 
        ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
 
-       return rsm_enter_protected_mode(ctxt, cr0, cr4);
+       return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
 }
 
 static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
 {
        struct desc_struct desc;
        struct desc_ptr dt;
-       u64 val, cr0, cr4;
+       u64 val, cr0, cr3, cr4;
        u32 base3;
        u16 selector;
        int i, r;
@@ -2491,7 +2509,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
        ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
 
        cr0 =                       GET_SMSTATE(u64, smbase, 0x7f58);
-       ctxt->ops->set_cr(ctxt, 3,  GET_SMSTATE(u64, smbase, 0x7f50));
+       cr3 =                       GET_SMSTATE(u64, smbase, 0x7f50);
        cr4 =                       GET_SMSTATE(u64, smbase, 0x7f48);
        ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
        val =                       GET_SMSTATE(u64, smbase, 0x7ed0);
@@ -2519,7 +2537,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
        dt.address =                GET_SMSTATE(u64, smbase, 0x7e68);
        ctxt->ops->set_gdt(ctxt, &dt);
 
-       r = rsm_enter_protected_mode(ctxt, cr0, cr4);
+       r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
        if (r != X86EMUL_CONTINUE)
                return r;
 
index e5e66e5c664057bb5cc5ad2660008ccbf19b69e5..c4deb1f34faa6ce7ffe6bcaaebddc3e87b2a9a69 100644 (file)
@@ -3395,7 +3395,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
                spin_lock(&vcpu->kvm->mmu_lock);
                if (make_mmu_pages_available(vcpu) < 0) {
                        spin_unlock(&vcpu->kvm->mmu_lock);
-                       return 1;
+                       return -ENOSPC;
                }
                sp = kvm_mmu_get_page(vcpu, 0, 0,
                                vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
@@ -3410,7 +3410,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
                        spin_lock(&vcpu->kvm->mmu_lock);
                        if (make_mmu_pages_available(vcpu) < 0) {
                                spin_unlock(&vcpu->kvm->mmu_lock);
-                               return 1;
+                               return -ENOSPC;
                        }
                        sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
                                        i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
@@ -3450,7 +3450,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                spin_lock(&vcpu->kvm->mmu_lock);
                if (make_mmu_pages_available(vcpu) < 0) {
                        spin_unlock(&vcpu->kvm->mmu_lock);
-                       return 1;
+                       return -ENOSPC;
                }
                sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
                                vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
@@ -3487,7 +3487,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
                spin_lock(&vcpu->kvm->mmu_lock);
                if (make_mmu_pages_available(vcpu) < 0) {
                        spin_unlock(&vcpu->kvm->mmu_lock);
-                       return 1;
+                       return -ENOSPC;
                }
                sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
                                      0, ACC_ALL);
index 8eba631c4dbd509d8687c6135e8dba267042f5e0..023afa0c8887002d6a79a8b121b46996feec1a61 100644 (file)
@@ -2302,7 +2302,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
                 * processors.  See 22.2.4.
                 */
                vmcs_writel(HOST_TR_BASE,
-                           (unsigned long)this_cpu_ptr(&cpu_tss));
+                           (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
                vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */
 
                /*
index faf843c9b916ead0992d0b155a138c6afdf7ae57..1cec2c62a0b08405d2bd7c8908d6b7f33de3b63c 100644 (file)
@@ -4384,7 +4384,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
                                         addr, n, v))
                    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
                        break;
-               trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
+               trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
                handled += n;
                addr += n;
                len -= n;
@@ -4643,7 +4643,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
 {
        if (vcpu->mmio_read_completed) {
                trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
-                              vcpu->mmio_fragments[0].gpa, *(u64 *)val);
+                              vcpu->mmio_fragments[0].gpa, val);
                vcpu->mmio_read_completed = 0;
                return 1;
        }
@@ -4665,14 +4665,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
 
 static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
 {
-       trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+       trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
        return vcpu_mmio_write(vcpu, gpa, bytes, val);
 }
 
 static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
                          void *val, int bytes)
 {
-       trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
+       trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
        return X86EMUL_IO_NEEDED;
 }
 
@@ -7264,13 +7264,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-       struct fpu *fpu = &current->thread.fpu;
        int r;
 
-       fpu__initialize(fpu);
-
        kvm_sigset_activate(vcpu);
 
+       kvm_load_guest_fpu(vcpu);
+
        if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
                if (kvm_run->immediate_exit) {
                        r = -EINTR;
@@ -7296,14 +7295,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
                }
        }
 
-       kvm_load_guest_fpu(vcpu);
-
        if (unlikely(vcpu->arch.complete_userspace_io)) {
                int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
                vcpu->arch.complete_userspace_io = NULL;
                r = cui(vcpu);
                if (r <= 0)
-                       goto out_fpu;
+                       goto out;
        } else
                WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
@@ -7312,9 +7309,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
        else
                r = vcpu_run(vcpu);
 
-out_fpu:
-       kvm_put_guest_fpu(vcpu);
 out:
+       kvm_put_guest_fpu(vcpu);
        post_kvm_run_save(vcpu);
        kvm_sigset_deactivate(vcpu);
 
@@ -7384,7 +7380,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 #endif
 
        kvm_rip_write(vcpu, regs->rip);
-       kvm_set_rflags(vcpu, regs->rflags);
+       kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
 
        vcpu->arch.exception.pending = false;
 
@@ -7498,6 +7494,29 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
 
+int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+       if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
+               /*
+                * When EFER.LME and CR0.PG are set, the processor is in
+                * 64-bit mode (though maybe in a 32-bit code segment).
+                * CR4.PAE and EFER.LMA must be set.
+                */
+               if (!(sregs->cr4 & X86_CR4_PAE)
+                   || !(sregs->efer & EFER_LMA))
+                       return -EINVAL;
+       } else {
+               /*
+                * Not in 64-bit mode: EFER.LMA is clear and the code
+                * segment cannot be 64-bit.
+                */
+               if (sregs->efer & EFER_LMA || sregs->cs.l)
+                       return -EINVAL;
+       }
+
+       return 0;
+}
+
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
 {
@@ -7510,6 +7529,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                        (sregs->cr4 & X86_CR4_OSXSAVE))
                return -EINVAL;
 
+       if (kvm_valid_sregs(vcpu, sregs))
+               return -EINVAL;
+
        apic_base_msr.data = sregs->apic_base;
        apic_base_msr.host_initiated = true;
        if (kvm_set_apic_base(vcpu, &apic_base_msr))
index 553f8fd23cc4733d0edafa862b95446f7a04bab1..4846eff7e4c8b1505501d7f1dcb64127d0a4c67c 100644 (file)
@@ -107,10 +107,10 @@ static void delay_mwaitx(unsigned long __loops)
                delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
 
                /*
-                * Use cpu_tss as a cacheline-aligned, seldomly
+                * Use cpu_tss_rw as a cacheline-aligned, seldom
                 * accessed per-cpu variable as the monitor target.
                 */
-               __monitorx(raw_cpu_ptr(&cpu_tss), 0, 0);
+               __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
 
                /*
                 * AMD, like Intel, supports the EAX hint and EAX=0xf
index 8e13b8cc6bedb0dc84eea64cd80ca6ae39037eaa..52195ee3f6d50ebd2005aa040b1cf0023edd6b33 100644 (file)
@@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o   = -pg
 endif
 
 obj-y  :=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-           pat.o pgtable.o physaddr.o setup_nx.o tlb.o
+           pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o
 
 # Make sure __phys_addr has no stackprotector
 nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
new file mode 100644 (file)
index 0000000..fe814fd
--- /dev/null
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/desc.h>
+
+static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
+
+#ifdef CONFIG_X86_64
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+       [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
+#endif
+
+struct cpu_entry_area *get_cpu_entry_area(int cpu)
+{
+       unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
+       BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+       return (struct cpu_entry_area *) va;
+}
+EXPORT_SYMBOL(get_cpu_entry_area);
+
+void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
+{
+       unsigned long va = (unsigned long) cea_vaddr;
+
+       set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags));
+}
+
+static void __init
+cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+{
+       for ( ; pages; pages--, cea_vaddr += PAGE_SIZE, ptr += PAGE_SIZE)
+               cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
+}
+
+/* Set up the fixmap mappings only once per processor */
+static void __init setup_cpu_entry_area(int cpu)
+{
+#ifdef CONFIG_X86_64
+       extern char _entry_trampoline[];
+
+       /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
+       pgprot_t gdt_prot = PAGE_KERNEL_RO;
+       pgprot_t tss_prot = PAGE_KERNEL_RO;
+#else
+       /*
+        * On native 32-bit systems, the GDT cannot be read-only because
+        * our double fault handler uses a task gate, and entering through
+        * a task gate needs to change an available TSS to busy.  If the
+        * GDT is read-only, that will triple fault.  The TSS cannot be
+        * read-only because the CPU writes to it on task switches.
+        *
+        * On Xen PV, the GDT must be read-only because the hypervisor
+        * requires it.
+        */
+       pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ?
+               PAGE_KERNEL_RO : PAGE_KERNEL;
+       pgprot_t tss_prot = PAGE_KERNEL;
+#endif
+
+       cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu),
+                   gdt_prot);
+
+       cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page,
+                            per_cpu_ptr(&entry_stack_storage, cpu), 1,
+                            PAGE_KERNEL);
+
+       /*
+        * The Intel SDM says (Volume 3, 7.2.1):
+        *
+        *  Avoid placing a page boundary in the part of the TSS that the
+        *  processor reads during a task switch (the first 104 bytes). The
+        *  processor may not correctly perform address translations if a
+        *  boundary occurs in this area. During a task switch, the processor
+        *  reads and writes into the first 104 bytes of each TSS (using
+        *  contiguous physical addresses beginning with the physical address
+        *  of the first byte of the TSS). So, after TSS access begins, if
+        *  part of the 104 bytes is not physically contiguous, the processor
+        *  will access incorrect information without generating a page-fault
+        *  exception.
+        *
+        * There are also a lot of errata involving the TSS spanning a page
+        * boundary.  Assert that we're not doing that.
+        */
+       BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
+                     offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
+       BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
+       cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss,
+                            &per_cpu(cpu_tss_rw, cpu),
+                            sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
+
+#ifdef CONFIG_X86_32
+       per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);
+#endif
+
+#ifdef CONFIG_X86_64
+       BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
+       BUILD_BUG_ON(sizeof(exception_stacks) !=
+                    sizeof(((struct cpu_entry_area *)0)->exception_stacks));
+       cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
+                            &per_cpu(exception_stacks, cpu),
+                            sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
+
+       cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
+                    __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
+#endif
+}
+
+static __init void setup_cpu_entry_area_ptes(void)
+{
+#ifdef CONFIG_X86_32
+       unsigned long start, end;
+
+       BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE);
+       BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
+
+       start = CPU_ENTRY_AREA_BASE;
+       end = start + CPU_ENTRY_AREA_MAP_SIZE;
+
+       /* Careful here: start + PMD_SIZE might wrap around */
+       for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE)
+               populate_extra_pte(start);
+#endif
+}
+
+void __init setup_cpu_entry_areas(void)
+{
+       unsigned int cpu;
+
+       setup_cpu_entry_area_ptes();
+
+       for_each_possible_cpu(cpu)
+               setup_cpu_entry_area(cpu);
+}
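
The BUILD_BUG_ON above verifies that the hardware TSS never straddles a page boundary by XOR-ing its start and end offsets: two offsets share a page exactly when the XOR leaves no bits above the page offset. A standalone sketch of that test with made-up sample offsets:

```c
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* two offsets lie in the same page iff their XOR clears PAGE_MASK bits */
static int same_page(unsigned long a, unsigned long b)
{
	return ((a ^ b) & PAGE_MASK) == 0;
}

int main(void)
{
	printf("%d\n", same_page(0x1010, 0x1f00)); /* 1: both inside page 1 */
	printf("%d\n", same_page(0x1ff0, 0x2008)); /* 0: spans a boundary */
	return 0;
}
```
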
index 5e3ac6fe6c9e32ed1906f4f9bf736310a7193c7d..43dedbfb7257a32b538dc06773e13ddd55fd92dd 100644 (file)
@@ -44,10 +44,12 @@ struct addr_marker {
        unsigned long max_lines;
 };
 
-/* indices for address_markers; keep sync'd w/ address_markers below */
+/* Address space marker hints */
+
+#ifdef CONFIG_X86_64
+
 enum address_markers_idx {
        USER_SPACE_NR = 0,
-#ifdef CONFIG_X86_64
        KERNEL_SPACE_NR,
        LOW_KERNEL_NR,
        VMALLOC_START_NR,
@@ -56,56 +58,74 @@ enum address_markers_idx {
        KASAN_SHADOW_START_NR,
        KASAN_SHADOW_END_NR,
 #endif
-# ifdef CONFIG_X86_ESPFIX64
+       CPU_ENTRY_AREA_NR,
+#ifdef CONFIG_X86_ESPFIX64
        ESPFIX_START_NR,
-# endif
+#endif
+#ifdef CONFIG_EFI
+       EFI_END_NR,
+#endif
        HIGH_KERNEL_NR,
        MODULES_VADDR_NR,
        MODULES_END_NR,
-#else
+       FIXADDR_START_NR,
+       END_OF_SPACE_NR,
+};
+
+static struct addr_marker address_markers[] = {
+       [USER_SPACE_NR]         = { 0,                  "User Space" },
+       [KERNEL_SPACE_NR]       = { (1UL << 63),        "Kernel Space" },
+       [LOW_KERNEL_NR]         = { 0UL,                "Low Kernel Mapping" },
+       [VMALLOC_START_NR]      = { 0UL,                "vmalloc() Area" },
+       [VMEMMAP_START_NR]      = { 0UL,                "Vmemmap" },
+#ifdef CONFIG_KASAN
+       [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
+       [KASAN_SHADOW_END_NR]   = { KASAN_SHADOW_END,   "KASAN shadow end" },
+#endif
+       [CPU_ENTRY_AREA_NR]     = { CPU_ENTRY_AREA_BASE, "CPU entry Area" },
+#ifdef CONFIG_X86_ESPFIX64
+       [ESPFIX_START_NR]       = { ESPFIX_BASE_ADDR,   "ESPfix Area", 16 },
+#endif
+#ifdef CONFIG_EFI
+       [EFI_END_NR]            = { EFI_VA_END,         "EFI Runtime Services" },
+#endif
+       [HIGH_KERNEL_NR]        = { __START_KERNEL_map, "High Kernel Mapping" },
+       [MODULES_VADDR_NR]      = { MODULES_VADDR,      "Modules" },
+       [MODULES_END_NR]        = { MODULES_END,        "End Modules" },
+       [FIXADDR_START_NR]      = { FIXADDR_START,      "Fixmap Area" },
+       [END_OF_SPACE_NR]       = { -1,                 NULL }
+};
+
+#else /* CONFIG_X86_64 */
+
+enum address_markers_idx {
+       USER_SPACE_NR = 0,
        KERNEL_SPACE_NR,
        VMALLOC_START_NR,
        VMALLOC_END_NR,
-# ifdef CONFIG_HIGHMEM
+#ifdef CONFIG_HIGHMEM
        PKMAP_BASE_NR,
-# endif
-       FIXADDR_START_NR,
 #endif
+       CPU_ENTRY_AREA_NR,
+       FIXADDR_START_NR,
+       END_OF_SPACE_NR,
 };
 
-/* Address space markers hints */
 static struct addr_marker address_markers[] = {
-       { 0, "User Space" },
-#ifdef CONFIG_X86_64
-       { 0x8000000000000000UL, "Kernel Space" },
-       { 0/* PAGE_OFFSET */,   "Low Kernel Mapping" },
-       { 0/* VMALLOC_START */, "vmalloc() Area" },
-       { 0/* VMEMMAP_START */, "Vmemmap" },
-#ifdef CONFIG_KASAN
-       { KASAN_SHADOW_START,   "KASAN shadow" },
-       { KASAN_SHADOW_END,     "KASAN shadow end" },
+       [USER_SPACE_NR]         = { 0,                  "User Space" },
+       [KERNEL_SPACE_NR]       = { PAGE_OFFSET,        "Kernel Mapping" },
+       [VMALLOC_START_NR]      = { 0UL,                "vmalloc() Area" },
+       [VMALLOC_END_NR]        = { 0UL,                "vmalloc() End" },
+#ifdef CONFIG_HIGHMEM
+       [PKMAP_BASE_NR]         = { 0UL,                "Persistent kmap() Area" },
 #endif
-# ifdef CONFIG_X86_ESPFIX64
-       { ESPFIX_BASE_ADDR,     "ESPfix Area", 16 },
-# endif
-# ifdef CONFIG_EFI
-       { EFI_VA_END,           "EFI Runtime Services" },
-# endif
-       { __START_KERNEL_map,   "High Kernel Mapping" },
-       { MODULES_VADDR,        "Modules" },
-       { MODULES_END,          "End Modules" },
-#else
-       { PAGE_OFFSET,          "Kernel Mapping" },
-       { 0/* VMALLOC_START */, "vmalloc() Area" },
-       { 0/*VMALLOC_END*/,     "vmalloc() End" },
-# ifdef CONFIG_HIGHMEM
-       { 0/*PKMAP_BASE*/,      "Persistent kmap() Area" },
-# endif
-       { 0/*FIXADDR_START*/,   "Fixmap Area" },
-#endif
-       { -1, NULL }            /* End of list */
+       [CPU_ENTRY_AREA_NR]     = { 0UL,                "CPU entry area" },
+       [FIXADDR_START_NR]      = { 0UL,                "Fixmap area" },
+       [END_OF_SPACE_NR]       = { -1,                 NULL }
 };
 
+#endif /* !CONFIG_X86_64 */
+
 /* Multipliers for offsets within the PTEs */
 #define PTE_LEVEL_MULT (PAGE_SIZE)
 #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
@@ -140,7 +160,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
        static const char * const level_name[] =
                { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
 
-       if (!pgprot_val(prot)) {
+       if (!(pr & _PAGE_PRESENT)) {
                /* Not present */
                pt_dump_cont_printf(m, dmsg, "                              ");
        } else {
@@ -525,8 +545,8 @@ static int __init pt_dump_init(void)
        address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
 # endif
        address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
+       address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
 #endif
-
        return 0;
 }
 __initcall(pt_dump_init);
index febf6980e6535572f998cf2fa0ee63d296bdc6f1..06fe3d51d385b88111961c0b5addc673fcd597a2 100644 (file)
@@ -860,7 +860,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
        if (!printk_ratelimit())
                return;
 
-       printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+       printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
                task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
                tsk->comm, task_pid_nr(tsk), address,
                (void *)regs->ip, (void *)regs->sp, error_code);
index 8a64a6f2848d9be2e73a341f4d87ab2dc35de09f..135c9a7898c7da908f1340f9750774b4327e63b3 100644 (file)
@@ -50,6 +50,7 @@
 #include <asm/setup.h>
 #include <asm/set_memory.h>
 #include <asm/page_types.h>
+#include <asm/cpu_entry_area.h>
 #include <asm/init.h>
 
 #include "mm_internal.h"
@@ -766,6 +767,7 @@ void __init mem_init(void)
        mem_init_print_info(NULL);
        printk(KERN_INFO "virtual kernel memory layout:\n"
                "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+               "  cpu_entry : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #ifdef CONFIG_HIGHMEM
                "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #endif
@@ -777,6 +779,10 @@ void __init mem_init(void)
                FIXADDR_START, FIXADDR_TOP,
                (FIXADDR_TOP - FIXADDR_START) >> 10,
 
+               CPU_ENTRY_AREA_BASE,
+               CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE,
+               CPU_ENTRY_AREA_MAP_SIZE >> 10,
+
 #ifdef CONFIG_HIGHMEM
                PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
                (LAST_PKMAP*PAGE_SIZE) >> 10,
index 99dfed6dfef8b2f9028f82b89ab8dc2bde8173c4..47388f0c0e59649ca3574d4e7c31b356dad7d247 100644 (file)
@@ -15,6 +15,7 @@
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
+#include <asm/cpu_entry_area.h>
 
 extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
@@ -277,6 +278,7 @@ void __init kasan_early_init(void)
 void __init kasan_init(void)
 {
        int i;
+       void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
 
 #ifdef CONFIG_KASAN_INLINE
        register_die_notifier(&kasan_die_notifier);
@@ -321,16 +323,33 @@ void __init kasan_init(void)
                map_range(&pfn_mapped[i]);
        }
 
+       shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
+       shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
+       shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
+                                               PAGE_SIZE);
+
+       shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
+                                       CPU_ENTRY_AREA_MAP_SIZE);
+       shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
+       shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
+                                       PAGE_SIZE);
+
        kasan_populate_zero_shadow(
                kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
-               kasan_mem_to_shadow((void *)__START_KERNEL_map));
+               shadow_cpu_entry_begin);
+
+       kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
+                             (unsigned long)shadow_cpu_entry_end, 0);
+
+       kasan_populate_zero_shadow(shadow_cpu_entry_end,
+                               kasan_mem_to_shadow((void *)__START_KERNEL_map));
 
        kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
                              (unsigned long)kasan_mem_to_shadow(_end),
                              early_pfn_to_nid(__pa(_stext)));
 
        kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
-                       (void *)KASAN_SHADOW_END);
+                               (void *)KASAN_SHADOW_END);
 
        load_cr3(init_top_pgt);
        __flush_tlb_all();
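
The shadow-range setup rounds the begin address down and the end address up to page granularity so the populated shadow fully covers the cpu entry area. A sketch of that rounding; the addr-to-shadow mapping, offset constant, base, and size below are assumptions for illustration:

```c
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define round_down(x, a) ((x) & ~((a) - 1))
#define round_up(x, a)   (((x) + (a) - 1) & ~((a) - 1))

/* assumed generic-KASAN mapping: one shadow byte per 8 bytes, plus offset */
#define KASAN_SHADOW_OFFSET 0xdffffc0000000000UL

static unsigned long mem_to_shadow(unsigned long addr)
{
	return (addr >> 3) + KASAN_SHADOW_OFFSET;
}

int main(void)
{
	unsigned long base = 0xfffffe0000000000UL; /* hypothetical area base */
	unsigned long size = 0x3b000UL * 64;       /* hypothetical map size */

	/* round begin down and end up so the shadow covers the whole area */
	unsigned long begin = round_down(mem_to_shadow(base), PAGE_SIZE);
	unsigned long end   = round_up(mem_to_shadow(base + size), PAGE_SIZE);

	printf("populate shadow [%#lx, %#lx)\n", begin, end);
	return 0;
}
```
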
index 6b9bf023a700559b87ae7ac89570d9bbd26d1f05..c3c5274410a908e762aed936406006d63c3116ac 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/fixmap.h>
index 3118392cdf756bfc913d7a4137d5f7e0d46b046d..0a1be3adc97eeefa1a45f12aa8b6e893162912b3 100644 (file)
@@ -128,7 +128,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
         * isn't free.
         */
 #ifdef CONFIG_DEBUG_VM
-       if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
+       if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
                /*
                 * If we were to BUG here, we'd be very likely to kill
                 * the system so hard that we don't see the call trace.
@@ -195,7 +195,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                if (need_flush) {
                        this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
                        this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-                       write_cr3(build_cr3(next, new_asid));
+                       write_cr3(build_cr3(next->pgd, new_asid));
 
                        /*
                         * NB: This gets called via leave_mm() in the idle path
@@ -208,7 +208,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                        trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
                } else {
                        /* The new ASID is already up to date. */
-                       write_cr3(build_cr3_noflush(next, new_asid));
+                       write_cr3(build_cr3_noflush(next->pgd, new_asid));
 
                        /* See above wrt _rcuidle. */
                        trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
@@ -288,7 +288,7 @@ void initialize_tlbstate_and_flush(void)
                !(cr4_read_shadow() & X86_CR4_PCIDE));
 
        /* Force ASID 0 and force a TLB flush. */
-       write_cr3(build_cr3(mm, 0));
+       write_cr3(build_cr3(mm->pgd, 0));
 
        /* Reinitialize tlbstate. */
        this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
@@ -551,7 +551,7 @@ static void do_kernel_range_flush(void *info)
 
        /* flush range by one by one 'invlpg' */
        for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
-               __flush_tlb_single(addr);
+               __flush_tlb_one(addr);
 }
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
index f44c0bc95aa2f45ad42462a5f23f4db4672d1257..8538a6723171a5606058a8823ed1cbb2d343fdb6 100644 (file)
@@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
                local_flush_tlb();
                stat->d_alltlb++;
        } else {
-               __flush_tlb_one(msg->address);
+               __flush_tlb_single(msg->address);
                stat->d_onetlb++;
        }
        stat->d_requestee++;
index 36a28eddb435e72d2abc5ffbdd1e78a46b56876e..a7d966964c6f20577c927cf5e618bc86b3331977 100644 (file)
@@ -152,17 +152,19 @@ static void do_fpu_end(void)
 static void fix_processor_context(void)
 {
        int cpu = smp_processor_id();
-       struct tss_struct *t = &per_cpu(cpu_tss, cpu);
 #ifdef CONFIG_X86_64
        struct desc_struct *desc = get_cpu_gdt_rw(cpu);
        tss_desc tss;
 #endif
-       set_tss_desc(cpu, t);   /*
-                                * This just modifies memory; should not be
-                                * necessary. But... This is necessary, because
-                                * 386 hardware has concept of busy TSS or some
-                                * similar stupidity.
-                                */
+
+       /*
+        * We need to reload TR, which requires that we change the
+        * GDT entry to indicate "available" first.
+        *
+        * XXX: This could probably all be replaced by a call to
+        * force_reload_TR().
+        */
+       set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
 
 #ifdef CONFIG_X86_64
        memcpy(&tss, &desc[GDT_ENTRY_TSS], sizeof(tss_desc));
index d669e9d890017770456abe458f1161eb2509c09e..c9081c6671f0b7a05ecfaaf206e7e1ed2b1f456a 100644 (file)
@@ -1,8 +1,12 @@
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+#include <linux/bootmem.h>
+#endif
 #include <linux/cpu.h>
 #include <linux/kexec.h>
 
 #include <xen/features.h>
 #include <xen/page.h>
+#include <xen/interface/memory.h>
 
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
@@ -331,3 +335,80 @@ void xen_arch_unregister_cpu(int num)
 }
 EXPORT_SYMBOL(xen_arch_unregister_cpu);
 #endif
+
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+void __init arch_xen_balloon_init(struct resource *hostmem_resource)
+{
+       struct xen_memory_map memmap;
+       int rc;
+       unsigned int i, last_guest_ram;
+       phys_addr_t max_addr = PFN_PHYS(max_pfn);
+       struct e820_table *xen_e820_table;
+       const struct e820_entry *entry;
+       struct resource *res;
+
+       if (!xen_initial_domain())
+               return;
+
+       xen_e820_table = kmalloc(sizeof(*xen_e820_table), GFP_KERNEL);
+       if (!xen_e820_table)
+               return;
+
+       memmap.nr_entries = ARRAY_SIZE(xen_e820_table->entries);
+       set_xen_guest_handle(memmap.buffer, xen_e820_table->entries);
+       rc = HYPERVISOR_memory_op(XENMEM_machine_memory_map, &memmap);
+       if (rc) {
+               pr_warn("%s: Can't read host e820 (%d)\n", __func__, rc);
+               goto out;
+       }
+
+       last_guest_ram = 0;
+       for (i = 0; i < memmap.nr_entries; i++) {
+               if (xen_e820_table->entries[i].addr >= max_addr)
+                       break;
+               if (xen_e820_table->entries[i].type == E820_TYPE_RAM)
+                       last_guest_ram = i;
+       }
+
+       entry = &xen_e820_table->entries[last_guest_ram];
+       if (max_addr >= entry->addr + entry->size)
+               goto out; /* No unallocated host RAM. */
+
+       hostmem_resource->start = max_addr;
+       hostmem_resource->end = entry->addr + entry->size;
+
+       /*
+        * Mark non-RAM regions between the end of dom0 RAM and end of host RAM
+        * as unavailable. The rest of that region can be used for hotplug-based
+        * ballooning.
+        */
+       for (; i < memmap.nr_entries; i++) {
+               entry = &xen_e820_table->entries[i];
+
+               if (entry->type == E820_TYPE_RAM)
+                       continue;
+
+               if (entry->addr >= hostmem_resource->end)
+                       break;
+
+               res = kzalloc(sizeof(*res), GFP_KERNEL);
+               if (!res)
+                       goto out;
+
+               res->name = "Unavailable host RAM";
+               res->start = entry->addr;
+               res->end = (entry->addr + entry->size < hostmem_resource->end) ?
+                           entry->addr + entry->size : hostmem_resource->end;
+               rc = insert_resource(hostmem_resource, res);
+               if (rc) {
+                       pr_warn("%s: Can't insert [%llx - %llx) (%d)\n",
+                               __func__, res->start, res->end, rc);
+                       kfree(res);
+                       goto out;
+               }
+       }
+
+ out:
+       kfree(xen_e820_table);
+}
+#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
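
arch_xen_balloon_init() walks the host memory map past the end of dom0 RAM and reserves each non-RAM entry, clamping its end to the resource boundary. A toy version of that walk-and-clamp loop; struct map_entry and every address in it are invented for illustration:

```c
#include <stdio.h>
#include <stdint.h>

struct map_entry {                 /* illustrative stand-in for e820 data */
	uint64_t addr, size;
	int is_ram;
};

int main(void)
{
	struct map_entry map[] = {
		{ 0x100000000, 0x40000000, 1 },
		{ 0x140000000, 0x10000000, 0 },  /* hole: must be reserved */
		{ 0x150000000, 0x80000000, 1 },
	};
	uint64_t res_end = 0x160000000;          /* hypothetical resource end */

	for (unsigned int i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
		if (map[i].is_ram)
			continue;
		if (map[i].addr >= res_end)
			break;
		/* clamp the reservation to the resource boundary */
		uint64_t end = map[i].addr + map[i].size < res_end ?
			       map[i].addr + map[i].size : res_end;
		printf("reserve [%#llx, %#llx)\n",
		       (unsigned long long)map[i].addr,
		       (unsigned long long)end);
	}
	return 0;
}
```
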
index f2414c6c5e7c455b43fc45773fbd1264cf86c24e..c047f42552e1a61ed0a5787d904681974cc05af1 100644 (file)
@@ -88,6 +88,8 @@
 #include "multicalls.h"
 #include "pmu.h"
 
+#include "../kernel/cpu/cpu.h" /* get_cpu_cap() */
+
 void *xen_initial_gdt;
 
 static int xen_cpu_up_prepare_pv(unsigned int cpu);
@@ -826,7 +828,7 @@ static void xen_load_sp0(unsigned long sp0)
        mcs = xen_mc_entry(0);
        MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
        xen_mc_issue(PARAVIRT_LAZY_CPU);
-       this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
+       this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
 }
 
 void xen_set_iopl_mask(unsigned mask)
@@ -1258,6 +1260,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
        __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
 
        /* Work out if we support NX */
+       get_cpu_cap(&boot_cpu_data);
        x86_configure_nx();
 
        /* Get mfn list */
index fc048ec686e7699b263254c79b482ccf935c21ef..4d62c071b166f65c848a12ca07bfe44ca20e198a 100644 (file)
@@ -1902,6 +1902,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
        /* Graft it onto L4[511][510] */
        copy_page(level2_kernel_pgt, l2);
 
+       /*
+        * Zap execute permission from the ident map. Due to the sharing of
+        * L1 entries we need to do this in the L2.
+        */
+       if (__supported_pte_mask & _PAGE_NX) {
+               for (i = 0; i < PTRS_PER_PMD; ++i) {
+                       if (pmd_none(level2_ident_pgt[i]))
+                               continue;
+                       level2_ident_pgt[i] = pmd_set_flags(level2_ident_pgt[i], _PAGE_NX);
+               }
+       }
+
        /* Copy the initial P->M table mappings if necessary. */
        i = pgd_index(xen_start_info->mfn_list);
        if (i && i < pgd_index(__START_KERNEL_map))
@@ -2261,7 +2273,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 
        switch (idx) {
        case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-       case FIX_RO_IDT:
 #ifdef CONFIG_X86_32
        case FIX_WP_TEST:
 # ifdef CONFIG_HIGHMEM
@@ -2272,7 +2283,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 #endif
        case FIX_TEXT_POKE0:
        case FIX_TEXT_POKE1:
-       case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
                /* All local page mappings */
                pte = pfn_pte(phys, prot);
                break;
index c114ca767b3b8a382918e2b0160983fa257318db..6e0d2086eacbf37326467b5142e59750151a5328 100644 (file)
@@ -808,7 +808,6 @@ char * __init xen_memory_setup(void)
        addr = xen_e820_table.entries[0].addr;
        size = xen_e820_table.entries[0].size;
        while (i < xen_e820_table.nr_entries) {
-               bool discard = false;
 
                chunk_size = size;
                type = xen_e820_table.entries[i].type;
@@ -824,11 +823,10 @@ char * __init xen_memory_setup(void)
                                xen_add_extra_mem(pfn_s, n_pfns);
                                xen_max_p2m_pfn = pfn_s + n_pfns;
                        } else
-                               discard = true;
+                               type = E820_TYPE_UNUSABLE;
                }
 
-               if (!discard)
-                       xen_align_and_add_e820_region(addr, chunk_size, type);
+               xen_align_and_add_e820_region(addr, chunk_size, type);
 
                addr += chunk_size;
                size -= chunk_size;
index 8bfdea58159ba9ffd972dd95717e0eee99101e0a..9ef6cf3addb38cae822d0e5c5ef18ba9e98cd2d7 100644 (file)
@@ -599,6 +599,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
        bio->bi_disk = bio_src->bi_disk;
        bio->bi_partno = bio_src->bi_partno;
        bio_set_flag(bio, BIO_CLONED);
+       if (bio_flagged(bio_src, BIO_THROTTLED))
+               bio_set_flag(bio, BIO_THROTTLED);
        bio->bi_opf = bio_src->bi_opf;
        bio->bi_write_hint = bio_src->bi_write_hint;
        bio->bi_iter = bio_src->bi_iter;
index b21f8e86f1207f9b76bf3e2083fcf72b5062f0b7..d3a94719f03fb2af81d6270d6fc9ed58f0dde373 100644 (file)
 #include "blk.h"
 
 /*
- * Append a bio to a passthrough request.  Only works can be merged into
- * the request based on the driver constraints.
+ * Append a bio to a passthrough request.  Only works if the bio can be merged
+ * into the request based on the driver constraints.
  */
-int blk_rq_append_bio(struct request *rq, struct bio *bio)
+int blk_rq_append_bio(struct request *rq, struct bio **bio)
 {
-       blk_queue_bounce(rq->q, &bio);
+       struct bio *orig_bio = *bio;
+
+       blk_queue_bounce(rq->q, bio);
 
        if (!rq->bio) {
-               blk_rq_bio_prep(rq->q, rq, bio);
+               blk_rq_bio_prep(rq->q, rq, *bio);
        } else {
-               if (!ll_back_merge_fn(rq->q, rq, bio))
+               if (!ll_back_merge_fn(rq->q, rq, *bio)) {
+                       if (orig_bio != *bio) {
+                               bio_put(*bio);
+                               *bio = orig_bio;
+                       }
                        return -EINVAL;
+               }
 
-               rq->biotail->bi_next = bio;
-               rq->biotail = bio;
-               rq->__data_len += bio->bi_iter.bi_size;
+               rq->biotail->bi_next = *bio;
+               rq->biotail = *bio;
+               rq->__data_len += (*bio)->bi_iter.bi_size;
        }
 
        return 0;
@@ -73,14 +80,12 @@ static int __blk_rq_map_user_iov(struct request *rq,
         * We link the bounce buffer in and could have to traverse it
         * later so we have to get a ref to prevent it from being freed
         */
-       ret = blk_rq_append_bio(rq, bio);
-       bio_get(bio);
+       ret = blk_rq_append_bio(rq, &bio);
        if (ret) {
-               bio_endio(bio);
                __blk_rq_unmap_user(orig_bio);
-               bio_put(bio);
                return ret;
        }
+       bio_get(bio);
 
        return 0;
 }
@@ -213,7 +218,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
        int reading = rq_data_dir(rq) == READ;
        unsigned long addr = (unsigned long) kbuf;
        int do_copy = 0;
-       struct bio *bio;
+       struct bio *bio, *orig_bio;
        int ret;
 
        if (len > (queue_max_hw_sectors(q) << 9))
@@ -236,10 +241,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
        if (do_copy)
                rq->rq_flags |= RQF_COPY_USER;
 
-       ret = blk_rq_append_bio(rq, bio);
+       orig_bio = bio;
+       ret = blk_rq_append_bio(rq, &bio);
        if (unlikely(ret)) {
                /* request is too big */
-               bio_put(bio);
+               bio_put(orig_bio);
                return ret;
        }
 
index 825bc29767e6699ac85675d319a9866b70cc9b84..d19f416d61012ac032c49608f0afe463c948e8bc 100644 (file)
@@ -2226,13 +2226,7 @@ again:
 out_unlock:
        spin_unlock_irq(q->queue_lock);
 out:
-       /*
-        * As multiple blk-throtls may stack in the same issue path, we
-        * don't want bios to leave with the flag set.  Clear the flag if
-        * being issued.
-        */
-       if (!throttled)
-               bio_clear_flag(bio, BIO_THROTTLED);
+       bio_set_flag(bio, BIO_THROTTLED);
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
        if (throttled || !td->track_bio_latency)
index fceb1a96480bfb9600e4664fa2b4992c8bb64210..1d05c422c932ad56d705f94deed6cce0891ff9d3 100644 (file)
@@ -200,6 +200,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
        unsigned i = 0;
        bool bounce = false;
        int sectors = 0;
+       bool passthrough = bio_is_passthrough(*bio_orig);
 
        bio_for_each_segment(from, *bio_orig, iter) {
                if (i++ < BIO_MAX_PAGES)
@@ -210,13 +211,14 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
        if (!bounce)
                return;
 
-       if (sectors < bio_sectors(*bio_orig)) {
+       if (!passthrough && sectors < bio_sectors(*bio_orig)) {
                bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
                bio_chain(bio, *bio_orig);
                generic_make_request(*bio_orig);
                *bio_orig = bio;
        }
-       bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
+       bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
+                       bounce_bio_set);
 
        bio_for_each_segment_all(to, bio, i) {
                struct page *page = to->bv_page;
index b4df317c291692f01138b91608dc6c80f71bb9aa..f95c60774ce8ca613417d3ccf54bee52010752ee 100644 (file)
@@ -100,9 +100,13 @@ struct kyber_hctx_data {
        unsigned int cur_domain;
        unsigned int batching;
        wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
+       struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
        atomic_t wait_index[KYBER_NUM_DOMAINS];
 };
 
+static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
+                            void *key);
+
 static int rq_sched_domain(const struct request *rq)
 {
        unsigned int op = rq->cmd_flags;
@@ -385,6 +389,9 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
 
        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                INIT_LIST_HEAD(&khd->rqs[i]);
+               init_waitqueue_func_entry(&khd->domain_wait[i],
+                                         kyber_domain_wake);
+               khd->domain_wait[i].private = hctx;
                INIT_LIST_HEAD(&khd->domain_wait[i].entry);
                atomic_set(&khd->wait_index[i], 0);
        }
@@ -524,35 +531,39 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd,
        int nr;
 
        nr = __sbitmap_queue_get(domain_tokens);
-       if (nr >= 0)
-               return nr;
 
        /*
         * If we failed to get a domain token, make sure the hardware queue is
         * run when one becomes available. Note that this is serialized on
         * khd->lock, but we still need to be careful about the waker.
         */
-       if (list_empty_careful(&wait->entry)) {
-               init_waitqueue_func_entry(wait, kyber_domain_wake);
-               wait->private = hctx;
+       if (nr < 0 && list_empty_careful(&wait->entry)) {
                ws = sbq_wait_ptr(domain_tokens,
                                  &khd->wait_index[sched_domain]);
+               khd->domain_ws[sched_domain] = ws;
                add_wait_queue(&ws->wait, wait);
 
                /*
                 * Try again in case a token was freed before we got on the wait
-                * queue. The waker may have already removed the entry from the
-                * wait queue, but list_del_init() is okay with that.
+                * queue.
                 */
                nr = __sbitmap_queue_get(domain_tokens);
-               if (nr >= 0) {
-                       unsigned long flags;
+       }
 
-                       spin_lock_irqsave(&ws->wait.lock, flags);
-                       list_del_init(&wait->entry);
-                       spin_unlock_irqrestore(&ws->wait.lock, flags);
-               }
+       /*
+        * If we got a token while we were on the wait queue, remove ourselves
+        * from the wait queue to ensure that all wake ups make forward
+        * progress. It's possible that the waker already deleted the entry
+        * between the !list_empty_careful() check and us grabbing the lock, but
+        * list_del_init() is okay with that.
+        */
+       if (nr >= 0 && !list_empty_careful(&wait->entry)) {
+               ws = khd->domain_ws[sched_domain];
+               spin_lock_irq(&ws->wait.lock);
+               list_del_init(&wait->entry);
+               spin_unlock_irq(&ws->wait.lock);
        }
+
        return nr;
 }
 
index 415a54ced4d6a490ae1e09170c8b80ef3eef135e..444a387df219e96a35fb7972f2a1a810e013a60b 100644 (file)
@@ -1138,12 +1138,6 @@ int af_alg_get_rsgl(struct sock *sk, struct msghdr *msg, int flags,
                if (!af_alg_readable(sk))
                        break;
 
-               if (!ctx->used) {
-                       err = af_alg_wait_for_data(sk, flags);
-                       if (err)
-                               return err;
-               }
-
                seglen = min_t(size_t, (maxsize - len),
                               msg_data_left(msg));
 
crypto/algif_aead.c
index 48b34e9c68342c55610ad83900557dc1c785af41..ddcc45f77edd367bf118e46aa757891c5c3d8869 100644
@@ -111,6 +111,12 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
        size_t usedpages = 0;           /* [in]  RX bufs to be used from user */
        size_t processed = 0;           /* [in]  TX bufs to be consumed */
 
+       if (!ctx->used) {
+               err = af_alg_wait_for_data(sk, flags);
+               if (err)
+                       return err;
+       }
+
        /*
         * Data length provided by caller via sendmsg/sendpage that has not
         * yet been processed.
@@ -285,6 +291,10 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
                /* AIO operation */
                sock_hold(sk);
                areq->iocb = msg->msg_iocb;
+
+               /* Remember output size that will be generated. */
+               areq->outlen = outlen;
+
                aead_request_set_callback(&areq->cra_u.aead_req,
                                          CRYPTO_TFM_REQ_MAY_BACKLOG,
                                          af_alg_async_cb, areq);
@@ -292,12 +302,8 @@ static int _aead_recvmsg(struct socket *sock, struct msghdr *msg,
                                 crypto_aead_decrypt(&areq->cra_u.aead_req);
 
                /* AIO operation in progress */
-               if (err == -EINPROGRESS || err == -EBUSY) {
-                       /* Remember output size that will be generated. */
-                       areq->outlen = outlen;
-
+               if (err == -EINPROGRESS || err == -EBUSY)
                        return -EIOCBQUEUED;
-               }
 
                sock_put(sk);
        } else {
crypto/algif_skcipher.c
index 30cff827dd8fff048fa3e2ca7de770ab73022749..baef9bfccddaa94728bea5933bea16c32b2a32b5 100644
@@ -72,6 +72,12 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
        int err = 0;
        size_t len = 0;
 
+       if (!ctx->used) {
+               err = af_alg_wait_for_data(sk, flags);
+               if (err)
+                       return err;
+       }
+
        /* Allocate cipher request for current operation. */
        areq = af_alg_alloc_areq(sk, sizeof(struct af_alg_async_req) +
                                     crypto_skcipher_reqsize(tfm));
@@ -119,6 +125,10 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
                /* AIO operation */
                sock_hold(sk);
                areq->iocb = msg->msg_iocb;
+
+               /* Remember output size that will be generated. */
+               areq->outlen = len;
+
                skcipher_request_set_callback(&areq->cra_u.skcipher_req,
                                              CRYPTO_TFM_REQ_MAY_SLEEP,
                                              af_alg_async_cb, areq);
@@ -127,12 +137,8 @@ static int _skcipher_recvmsg(struct socket *sock, struct msghdr *msg,
                        crypto_skcipher_decrypt(&areq->cra_u.skcipher_req);
 
                /* AIO operation in progress */
-               if (err == -EINPROGRESS || err == -EBUSY) {
-                       /* Remember output size that will be generated. */
-                       areq->outlen = len;
-
+               if (err == -EINPROGRESS || err == -EBUSY)
                        return -EIOCBQUEUED;
-               }
 
                sock_put(sk);
        } else {
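
Both algif hunks make the same pair of moves: the wait for socket data happens once at the top of recvmsg rather than inside the processing loop, and areq->outlen is stored before the asynchronous cipher call is issued, because af_alg_async_cb may run -- and read outlen -- before crypto_aead_decrypt() or crypto_skcipher_decrypt() even returns to the submitter. A compile-ready sketch of that publish-before-submit rule, with a plain thread standing in for the async engine (illustrative names, not the af_alg API):

    #include <pthread.h>
    #include <stdio.h>

    struct async_req {
        size_t outlen;                  /* consumed by the completion side */
    };

    static void *completion_cb(void *arg)
    {
        struct async_req *req = arg;
        /* may run at any point after submission -- possibly before the
         * submitter reaches its next statement */
        printf("completed, outlen=%zu\n", req->outlen);
        return NULL;
    }

    static int submit(pthread_t *t, struct async_req *req, size_t outlen)
    {
        req->outlen = outlen;           /* publish first...                 */
        return pthread_create(t, NULL, completion_cb, req); /* ...then start */
    }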
crypto/mcryptd.c
index 4e64726588524f137acd590809bef11673695ed2..eca04d3729b37c696c2dac4b0ac472422f30615d 100644
@@ -81,6 +81,7 @@ static int mcryptd_init_queue(struct mcryptd_queue *queue,
                pr_debug("cpu_queue #%d %p\n", cpu, queue->cpu_queue);
                crypto_init_queue(&cpu_queue->queue, max_cpu_qlen);
                INIT_WORK(&cpu_queue->work, mcryptd_queue_worker);
+               spin_lock_init(&cpu_queue->q_lock);
        }
        return 0;
 }
@@ -104,15 +105,16 @@ static int mcryptd_enqueue_request(struct mcryptd_queue *queue,
        int cpu, err;
        struct mcryptd_cpu_queue *cpu_queue;
 
-       cpu = get_cpu();
-       cpu_queue = this_cpu_ptr(queue->cpu_queue);
-       rctx->tag.cpu = cpu;
+       cpu_queue = raw_cpu_ptr(queue->cpu_queue);
+       spin_lock(&cpu_queue->q_lock);
+       cpu = smp_processor_id();
+       rctx->tag.cpu = smp_processor_id();
 
        err = crypto_enqueue_request(&cpu_queue->queue, request);
        pr_debug("enqueue request: cpu %d cpu_queue %p request %p\n",
                 cpu, cpu_queue, request);
+       spin_unlock(&cpu_queue->q_lock);
        queue_work_on(cpu, kcrypto_wq, &cpu_queue->work);
-       put_cpu();
 
        return err;
 }
@@ -161,16 +163,11 @@ static void mcryptd_queue_worker(struct work_struct *work)
        cpu_queue = container_of(work, struct mcryptd_cpu_queue, work);
        i = 0;
        while (i < MCRYPTD_BATCH || single_task_running()) {
-               /*
-                * preempt_disable/enable is used to prevent
-                * being preempted by mcryptd_enqueue_request()
-                */
-               local_bh_disable();
-               preempt_disable();
+
+               spin_lock_bh(&cpu_queue->q_lock);
                backlog = crypto_get_backlog(&cpu_queue->queue);
                req = crypto_dequeue_request(&cpu_queue->queue);
-               preempt_enable();
-               local_bh_enable();
+               spin_unlock_bh(&cpu_queue->q_lock);
 
                if (!req) {
                        mcryptd_opportunistic_flush();
@@ -185,7 +182,7 @@ static void mcryptd_queue_worker(struct work_struct *work)
                ++i;
        }
        if (cpu_queue->queue.qlen)
-               queue_work(kcrypto_wq, &cpu_queue->work);
+               queue_work_on(smp_processor_id(), kcrypto_wq, &cpu_queue->work);
 }
 
 void mcryptd_flusher(struct work_struct *__work)
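
The mcryptd change swaps preemption-based exclusion (get_cpu()/preempt_disable()), which stops being sufficient once the enqueue path can run in fully preemptible context, for an explicit per-CPU spinlock; queue_work_on() then keeps both enqueue and requeue on the queue's own CPU. Roughly, with a pthread mutex standing in for the new q_lock and a hypothetical singly linked queue:

    #include <pthread.h>
    #include <stddef.h>

    struct cpu_queue {
        pthread_mutex_t q_lock;         /* protects head, like q_lock */
        struct node { struct node *next; } *head;
    };

    static void enqueue(struct cpu_queue *q, struct node *n)
    {
        pthread_mutex_lock(&q->q_lock);
        n->next = q->head;              /* mutate only under the lock */
        q->head = n;
        pthread_mutex_unlock(&q->q_lock);
    }

    static struct node *dequeue(struct cpu_queue *q)
    {
        pthread_mutex_lock(&q->q_lock);
        struct node *n = q->head;
        if (n)
            q->head = n->next;
        pthread_mutex_unlock(&q->q_lock);
        return n;                       /* process outside the lock, as the worker does */
    }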
crypto/skcipher.c
index 778e0ff42bfa801eda5be848da9e6747ebbc2626..11af5fd6a443570550e1dac5b0a429b2cae801b1 100644
@@ -449,6 +449,8 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk,
 
        walk->total = req->cryptlen;
        walk->nbytes = 0;
+       walk->iv = req->iv;
+       walk->oiv = req->iv;
 
        if (unlikely(!walk->total))
                return 0;
@@ -456,9 +458,6 @@ static int skcipher_walk_skcipher(struct skcipher_walk *walk,
        scatterwalk_start(&walk->in, req->src);
        scatterwalk_start(&walk->out, req->dst);
 
-       walk->iv = req->iv;
-       walk->oiv = req->iv;
-
        walk->flags &= ~SKCIPHER_WALK_SLEEP;
        walk->flags |= req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP ?
                       SKCIPHER_WALK_SLEEP : 0;
@@ -510,6 +509,8 @@ static int skcipher_walk_aead_common(struct skcipher_walk *walk,
        int err;
 
        walk->nbytes = 0;
+       walk->iv = req->iv;
+       walk->oiv = req->iv;
 
        if (unlikely(!walk->total))
                return 0;
@@ -525,9 +526,6 @@ static int skcipher_walk_aead_common(struct skcipher_walk *walk,
        scatterwalk_done(&walk->in, 0, walk->total);
        scatterwalk_done(&walk->out, 0, walk->total);
 
-       walk->iv = req->iv;
-       walk->oiv = req->iv;
-
        if (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP)
                walk->flags |= SKCIPHER_WALK_SLEEP;
        else
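
Both walk initializers now assign walk->iv and walk->oiv ahead of the early return taken for zero-length requests, so cleanup paths that unconditionally look at walk->iv can no longer observe an uninitialized pointer. The pattern reduced to a standalone sketch, with made-up types:

    #include <stddef.h>

    struct walk { const void *iv; size_t total; };
    struct req  { const void *iv; size_t len;   };

    static int walk_init(struct walk *w, const struct req *r)
    {
        w->total = r->len;
        w->iv = r->iv;              /* set before any early return, as in the fix */

        if (w->total == 0)
            return 0;               /* caller may still inspect w->iv safely */

        /* ... set up scatterlists etc. ... */
        return 1;
    }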
drivers/acpi/apei/erst.c
index 6742f6c68034c5e833505d294902dd97c274c1b0..9bff853e85f37831d8d053a2aa363f139537c9b5 100644
@@ -1007,7 +1007,7 @@ skip:
        /* The record may be cleared by others, try read next record */
        if (len == -ENOENT)
                goto skip;
-       else if (len < sizeof(*rcd)) {
+       else if (len < 0 || len < sizeof(*rcd)) {
                rc = -EIO;
                goto out;
        }
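
The extra "len < 0" test matters because sizeof(*rcd) has type size_t: in "len < sizeof(*rcd)" the signed len is converted to unsigned, so a negative error code compares as a huge value and sails past the "too short" check. A standalone demonstration:

    #include <stdio.h>
    #include <sys/types.h>

    struct rcd { char hdr[16]; };

    int main(void)
    {
        ssize_t len = -5;                       /* e.g. an error return */

        if (len < sizeof(struct rcd))           /* -5 promotes to a huge size_t */
            printf("short record caught\n");
        else
            printf("missed: len promoted to %zu\n", (size_t)len);

        if (len < 0 || (size_t)len < sizeof(struct rcd))
            printf("fixed check catches it\n"); /* this branch fires */
        return 0;
    }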
drivers/acpi/cppc_acpi.c
index 30e84cc600ae6438c25aec2f2975ae4e3f144553..06ea4749ebd9826a3d7b8b0a9798a1cc797f4d61 100644
@@ -1171,7 +1171,7 @@ int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
        struct cpc_desc *cpc_desc = per_cpu(cpc_desc_ptr, cpu);
        struct cpc_register_resource *desired_reg;
        int pcc_ss_id = per_cpu(cpu_pcc_subspace_idx, cpu);
-       struct cppc_pcc_data *pcc_ss_data = pcc_data[pcc_ss_id];
+       struct cppc_pcc_data *pcc_ss_data;
        int ret = 0;
 
        if (!cpc_desc || pcc_ss_id < 0) {
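
The cppc_set_perf hunk pulls the pcc_data[pcc_ss_id] load out of the declaration: an initializer runs before the pcc_ss_id < 0 check below it, so a CPU without a PCC subspace would index the array with a negative subscript. Presumably the assignment now happens later in the function, after validation. The shape of the bug in miniature:

    #define NR_SS 8
    static int *ss_data[NR_SS];

    static int *lookup_ss(int ss_id)
    {
        /* WRONG: int *d = ss_data[ss_id];  -- indexes before the check */
        if (ss_id < 0 || ss_id >= NR_SS)
            return NULL;
        return ss_data[ss_id];          /* safe: validated first */
    }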
drivers/acpi/nfit/core.c
index ff2580e7611d18c6d56c58d50c2cbc3a2d54aa36..abeb4df4f22e43d7f0d1398af9962135a37af4b6 100644
@@ -1670,6 +1670,11 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
                                dev_name(&adev_dimm->dev));
                return -ENXIO;
        }
+       /*
+        * Record nfit_mem for the notification path to track back to
+        * the nfit sysfs attributes for this dimm device object.
+        */
+       dev_set_drvdata(&adev_dimm->dev, nfit_mem);
 
        /*
         * Until standardization materializes we need to consider 4
@@ -1752,9 +1757,11 @@ static void shutdown_dimm_notify(void *data)
                        sysfs_put(nfit_mem->flags_attr);
                        nfit_mem->flags_attr = NULL;
                }
-               if (adev_dimm)
+               if (adev_dimm) {
                        acpi_remove_notify_handler(adev_dimm->handle,
                                        ACPI_DEVICE_NOTIFY, acpi_nvdimm_notify);
+                       dev_set_drvdata(&adev_dimm->dev, NULL);
+               }
        }
        mutex_unlock(&acpi_desc->init_mutex);
 }
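
The nfit change pairs dev_set_drvdata() at handler-install time with dev_set_drvdata(..., NULL) at teardown, so the ACPI notify path can map the DIMM device back to its nfit_mem, or see NULL and bail, instead of chasing a stale pointer. Generic form of that pairing, with invented types:

    #include <stddef.h>

    struct dev  { void *drvdata; };
    struct dimm { int flags; };

    static void attach(struct dev *d, struct dimm *m) { d->drvdata = m; }
    static void detach(struct dev *d)                 { d->drvdata = NULL; }

    static void notify(struct dev *d)
    {
        struct dimm *m = d->drvdata;
        if (!m)
            return;                     /* raced with teardown; nothing to do */
        m->flags |= 1;
    }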
drivers/block/null_blk.c
index ccb9975a97fa3f214d658776450ab618bae26643..ad0477ae820f040affe54f4368d3a02d9da63350 100644
@@ -35,13 +35,13 @@ static inline u64 mb_per_tick(int mbps)
 struct nullb_cmd {
        struct list_head list;
        struct llist_node ll_list;
-       call_single_data_t csd;
+       struct __call_single_data csd;
        struct request *rq;
        struct bio *bio;
        unsigned int tag;
+       blk_status_t error;
        struct nullb_queue *nq;
        struct hrtimer timer;
-       blk_status_t error;
 };
 
 struct nullb_queue {
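
call_single_data_t is the cacheline-aligned typedef of struct __call_single_data; embedding it forces all of struct nullb_cmd to that alignment and inserts padding, which the switch to the unaligned struct (plus regrouping error next to tag) avoids. A standalone illustration of how one over-aligned member inflates a struct; the printed sizes assume LP64 and a 64-byte line:

    #include <stdio.h>
    #include <stdalign.h>

    struct csd { void *a, *b, *c, *d; };            /* 32 bytes on LP64 */

    struct cmd_aligned   { char tag; alignas(64) struct csd csd; };
    struct cmd_unaligned { char tag; struct csd csd; };

    int main(void)
    {
        printf("aligned member:   %zu bytes\n", sizeof(struct cmd_aligned));   /* 128 */
        printf("unaligned member: %zu bytes\n", sizeof(struct cmd_unaligned)); /*  40 */
        return 0;
    }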
drivers/clk/clk.c
index 647d056df88c8dd2a7d8288e35fa2eeba9b7705b..8a1860a36c778aba3b66de996a6cc7ee878cf97e 100644
@@ -1564,6 +1564,9 @@ static void clk_change_rate(struct clk_core *core)
                best_parent_rate = core->parent->rate;
        }
 
+       if (clk_pm_runtime_get(core))
+               return;
+
        if (core->flags & CLK_SET_RATE_UNGATE) {
                unsigned long flags;
 
@@ -1634,6 +1637,8 @@ static void clk_change_rate(struct clk_core *core)
        /* handle the new child who might not be in core->children yet */
        if (core->new_child)
                clk_change_rate(core->new_child);
+
+       clk_pm_runtime_put(core);
 }
 
 static int clk_core_set_rate_nolock(struct clk_core *core,
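
clk_change_rate() can now be reached while the clock's controller is runtime-suspended, so the patch brackets the hardware programming with clk_pm_runtime_get()/clk_pm_runtime_put() and bails out if powering up fails. Schematically, with stubbed helpers:

    struct clk_core;                    /* opaque for the sketch */

    static int  pm_get(struct clk_core *c) { (void)c; return 0; }  /* 0 = powered */
    static void pm_put(struct clk_core *c) { (void)c; }

    static void change_rate(struct clk_core *core)
    {
        if (pm_get(core))
            return;                     /* cannot touch hardware unpowered */

        /* ... program dividers, walk the children ... */

        pm_put(core);                   /* drop the reference on every exit */
    }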
drivers/clk/sunxi/clk-sun9i-mmc.c
index a1a634253d6f2299bfad888b2fa193c98b4ac019..f00d8758ba24f6e5ed537a88be76ff85e5a7c5e4 100644
@@ -16,6 +16,7 @@
 
 #include <linux/clk.h>
 #include <linux/clk-provider.h>
+#include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/of.h>
 #include <linux/of_device.h>
@@ -83,9 +84,20 @@ static int sun9i_mmc_reset_deassert(struct reset_controller_dev *rcdev,
        return 0;
 }
 
+static int sun9i_mmc_reset_reset(struct reset_controller_dev *rcdev,
+                                unsigned long id)
+{
+       sun9i_mmc_reset_assert(rcdev, id);
+       udelay(10);
+       sun9i_mmc_reset_deassert(rcdev, id);
+
+       return 0;
+}
+
 static const struct reset_control_ops sun9i_mmc_reset_ops = {
        .assert         = sun9i_mmc_reset_assert,
        .deassert       = sun9i_mmc_reset_deassert,
+       .reset          = sun9i_mmc_reset_reset,
 };
 
 static int sun9i_a80_mmc_config_clk_probe(struct platform_device *pdev)
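
Consumers that call reset_control_reset() need a .reset op; until now the sun9i MMC config-clock reset controller offered only separate assert/deassert. The new op simply composes the two around a 10us settle. In userspace dress (udelay becomes usleep, everything else stubbed):

    #include <unistd.h>

    struct rcdev;                       /* opaque stand-in */

    static int assert_line(struct rcdev *r, unsigned long id)
    { (void)r; (void)id; return 0; }
    static int deassert_line(struct rcdev *r, unsigned long id)
    { (void)r; (void)id; return 0; }

    static int reset_line(struct rcdev *r, unsigned long id)
    {
        assert_line(r, id);
        usleep(10);                     /* the driver's 10us settle time */
        deassert_line(r, id);
        return 0;
    }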
drivers/cpufreq/cpufreq_governor.c
index 58d4f4e1ad6a907991873a03027e6c7aa2f31fc4..ca38229b045ab288a2f250dddaf1b174e8c0572f 100644
@@ -22,6 +22,8 @@
 
 #include "cpufreq_governor.h"
 
+#define CPUFREQ_DBS_MIN_SAMPLING_INTERVAL      (2 * TICK_NSEC / NSEC_PER_USEC)
+
 static DEFINE_PER_CPU(struct cpu_dbs_info, cpu_dbs);
 
 static DEFINE_MUTEX(gov_dbs_data_mutex);
@@ -47,11 +49,15 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
 {
        struct dbs_data *dbs_data = to_dbs_data(attr_set);
        struct policy_dbs_info *policy_dbs;
+       unsigned int sampling_interval;
        int ret;
-       ret = sscanf(buf, "%u", &dbs_data->sampling_rate);
-       if (ret != 1)
+
+       ret = sscanf(buf, "%u", &sampling_interval);
+       if (ret != 1 || sampling_interval < CPUFREQ_DBS_MIN_SAMPLING_INTERVAL)
                return -EINVAL;
 
+       dbs_data->sampling_rate = sampling_interval;
+
        /*
         * We are operating under dbs_data->mutex and so the list and its
         * entries can't be freed concurrently.
@@ -430,7 +436,14 @@ int cpufreq_dbs_governor_init(struct cpufreq_policy *policy)
        if (ret)
                goto free_policy_dbs_info;
 
-       dbs_data->sampling_rate = cpufreq_policy_transition_delay_us(policy);
+       /*
+        * The sampling interval should not be less than the transition latency
+        * of the CPU and it also cannot be too small for dbs_update() to work
+        * correctly.
+        */
+       dbs_data->sampling_rate = max_t(unsigned int,
+                                       CPUFREQ_DBS_MIN_SAMPLING_INTERVAL,
+                                       cpufreq_policy_transition_delay_us(policy));
 
        if (!have_governor_per_policy())
                gov->gdbs_data = dbs_data;
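
dbs_update() estimates load over the sampling window, and that estimate stops being meaningful below roughly two scheduler ticks; hence the new floor, enforced as -EINVAL in the sysfs store and as a max_t() clamp at init. The clamp in miniature (the 4000us tick assumes HZ=250, purely for the example):

    #define TICK_USEC           4000u   /* assumed HZ=250 tick, demo only */
    #define MIN_SAMPLING_USEC   (2u * TICK_USEC)

    static unsigned int clamp_sampling_rate(unsigned int requested)
    {
        return requested < MIN_SAMPLING_USEC ? MIN_SAMPLING_USEC : requested;
    }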
drivers/cpufreq/imx6q-cpufreq.c
index 628fe899cb483da9dbf0f7661b537734bc82f784..d9b2c2de49c43f125c91b382f818ff81d0ffc6ac 100644
@@ -226,17 +226,18 @@ static void imx6q_opp_check_speed_grading(struct device *dev)
        val >>= OCOTP_CFG3_SPEED_SHIFT;
        val &= 0x3;
 
-       if ((val != OCOTP_CFG3_SPEED_1P2GHZ) &&
-            of_machine_is_compatible("fsl,imx6q"))
-               if (dev_pm_opp_disable(dev, 1200000000))
-                       dev_warn(dev, "failed to disable 1.2GHz OPP\n");
        if (val < OCOTP_CFG3_SPEED_996MHZ)
                if (dev_pm_opp_disable(dev, 996000000))
                        dev_warn(dev, "failed to disable 996MHz OPP\n");
-       if (of_machine_is_compatible("fsl,imx6q")) {
+
+       if (of_machine_is_compatible("fsl,imx6q") ||
+           of_machine_is_compatible("fsl,imx6qp")) {
                if (val != OCOTP_CFG3_SPEED_852MHZ)
                        if (dev_pm_opp_disable(dev, 852000000))
                                dev_warn(dev, "failed to disable 852MHz OPP\n");
+               if (val != OCOTP_CFG3_SPEED_1P2GHZ)
+                       if (dev_pm_opp_disable(dev, 1200000000))
+                               dev_warn(dev, "failed to disable 1.2GHz OPP\n");
        }
        iounmap(base);
 put_node:
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index da43813d67a4ad56ddecb79ac0a749afe29abc43..5aeb5f8816f3b9a68666cf57372cddeb12c2b36a 100644
@@ -2467,7 +2467,7 @@ static int gfx_v9_0_kiq_kcq_enable(struct amdgpu_device *adev)
                                  PACKET3_MAP_QUEUES_PIPE(ring->pipe) |
                                  PACKET3_MAP_QUEUES_ME((ring->me == 1 ? 0 : 1)) |
                                  PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /*queue_type: normal compute queue */
-                                 PACKET3_MAP_QUEUES_ALLOC_FORMAT(1) | /* alloc format: all_on_one_pipe */
+                                 PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
                                  PACKET3_MAP_QUEUES_ENGINE_SEL(0) | /* engine_sel: compute */
                                  PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
                amdgpu_ring_write(kiq_ring, PACKET3_MAP_QUEUES_DOORBELL_OFFSET(ring->doorbell_index));
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index f71fe6d2ddda795fd2fb914740b75845893c1298..bb5fa895fb6446097580ce229ef23dc473f979af 100644
@@ -2336,7 +2336,7 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector,
                       const struct dm_connector_state *dm_state)
 {
        struct drm_display_mode *preferred_mode = NULL;
-       const struct drm_connector *drm_connector;
+       struct drm_connector *drm_connector;
        struct dc_stream_state *stream = NULL;
        struct drm_display_mode mode = *drm_mode;
        bool native_mode_found = false;
@@ -2355,11 +2355,13 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector,
 
        if (!aconnector->dc_sink) {
                /*
-                * Exclude MST from creating fake_sink
-                * TODO: need to enable MST into fake_sink feature
+                * Create dc_sink when necessary to MST
+                * Don't apply fake_sink to MST
                 */
-               if (aconnector->mst_port)
-                       goto stream_create_fail;
+               if (aconnector->mst_port) {
+                       dm_dp_mst_dc_sink_create(drm_connector);
+                       goto mst_dc_sink_create_done;
+               }
 
                if (create_fake_sink(aconnector))
                        goto stream_create_fail;
@@ -2410,6 +2412,7 @@ create_stream_for_sink(struct amdgpu_dm_connector *aconnector,
 stream_create_fail:
 dm_state_null:
 drm_connector_null:
+mst_dc_sink_create_done:
        return stream;
 }
 
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h
index 117521c6a6ed26213c60ec5316df64b66eeaba12..0230250a1164bb01b41f3a2b22011960909e14bb 100644
@@ -189,6 +189,8 @@ struct amdgpu_dm_connector {
        struct mutex hpd_lock;
 
        bool fake_enable;
+
+       bool mst_connected;
 };
 
 #define to_amdgpu_dm_connector(x) container_of(x, struct amdgpu_dm_connector, base)
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
index f8efb98b1fa72f86ecbec4c568a653af164c7daa..638c2c2b5cd79069e7312b7d7f23a28b6f5eb3b6 100644
@@ -185,6 +185,42 @@ static int dm_connector_update_modes(struct drm_connector *connector,
        return ret;
 }
 
+void dm_dp_mst_dc_sink_create(struct drm_connector *connector)
+{
+       struct amdgpu_dm_connector *aconnector = to_amdgpu_dm_connector(connector);
+       struct edid *edid;
+       struct dc_sink *dc_sink;
+       struct dc_sink_init_data init_params = {
+                       .link = aconnector->dc_link,
+                       .sink_signal = SIGNAL_TYPE_DISPLAY_PORT_MST };
+
+       edid = drm_dp_mst_get_edid(connector, &aconnector->mst_port->mst_mgr, aconnector->port);
+
+       if (!edid) {
+               drm_mode_connector_update_edid_property(
+                       &aconnector->base,
+                       NULL);
+               return;
+       }
+
+       aconnector->edid = edid;
+
+       dc_sink = dc_link_add_remote_sink(
+               aconnector->dc_link,
+               (uint8_t *)aconnector->edid,
+               (aconnector->edid->extensions + 1) * EDID_LENGTH,
+               &init_params);
+
+       dc_sink->priv = aconnector;
+       aconnector->dc_sink = dc_sink;
+
+       amdgpu_dm_add_sink_to_freesync_module(
+                       connector, aconnector->edid);
+
+       drm_mode_connector_update_edid_property(
+                                       &aconnector->base, aconnector->edid);
+}
+
 static int dm_dp_mst_get_modes(struct drm_connector *connector)
 {
        struct amdgpu_dm_connector *aconnector = to_amdgpu_dm_connector(connector);
@@ -311,6 +347,7 @@ dm_dp_add_mst_connector(struct drm_dp_mst_topology_mgr *mgr,
                        drm_mode_connector_set_path_property(connector, pathprop);
 
                        drm_connector_list_iter_end(&conn_iter);
+                       aconnector->mst_connected = true;
                        return &aconnector->base;
                }
        }
@@ -363,6 +400,8 @@ dm_dp_add_mst_connector(struct drm_dp_mst_topology_mgr *mgr,
         */
        amdgpu_dm_connector_funcs_reset(connector);
 
+       aconnector->mst_connected = true;
+
        DRM_INFO("DM_MST: added connector: %p [id: %d] [master: %p]\n",
                        aconnector, connector->base.id, aconnector->mst_port);
 
@@ -394,6 +433,8 @@ static void dm_dp_destroy_mst_connector(struct drm_dp_mst_topology_mgr *mgr,
        drm_mode_connector_update_edid_property(
                        &aconnector->base,
                        NULL);
+
+       aconnector->mst_connected = false;
 }
 
 static void dm_dp_mst_hotplug(struct drm_dp_mst_topology_mgr *mgr)
@@ -404,10 +445,18 @@ static void dm_dp_mst_hotplug(struct drm_dp_mst_topology_mgr *mgr)
        drm_kms_helper_hotplug_event(dev);
 }
 
+static void dm_dp_mst_link_status_reset(struct drm_connector *connector)
+{
+       mutex_lock(&connector->dev->mode_config.mutex);
+       drm_mode_connector_set_link_status_property(connector, DRM_MODE_LINK_STATUS_BAD);
+       mutex_unlock(&connector->dev->mode_config.mutex);
+}
+
 static void dm_dp_mst_register_connector(struct drm_connector *connector)
 {
        struct drm_device *dev = connector->dev;
        struct amdgpu_device *adev = dev->dev_private;
+       struct amdgpu_dm_connector *aconnector = to_amdgpu_dm_connector(connector);
 
        if (adev->mode_info.rfbdev)
                drm_fb_helper_add_one_connector(&adev->mode_info.rfbdev->helper, connector);
@@ -416,6 +465,8 @@ static void dm_dp_mst_register_connector(struct drm_connector *connector)
 
        drm_connector_register(connector);
 
+       if (aconnector->mst_connected)
+               dm_dp_mst_link_status_reset(connector);
 }
 
 static const struct drm_dp_mst_topology_cbs dm_mst_cbs = {
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.h
index 2da851b40042aee9b79eb2c666d45c0f5061fee0..8cf51da26657e29e72062b34aeed7e5d827f9e21 100644
@@ -31,5 +31,6 @@ struct amdgpu_dm_connector;
 
 void amdgpu_dm_initialize_dp_connector(struct amdgpu_display_manager *dm,
                                       struct amdgpu_dm_connector *aconnector);
+void dm_dp_mst_dc_sink_create(struct drm_connector *connector);
 
 #endif
drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c
index 3dce35e66b0917d2ec93420063b3477443788302..b142629a105841b603501291800e45b9ade30591 100644
@@ -900,6 +900,15 @@ bool dcn_validate_bandwidth(
                        v->override_vta_ps[input_idx] = pipe->plane_res.scl_data.taps.v_taps;
                        v->override_hta_pschroma[input_idx] = pipe->plane_res.scl_data.taps.h_taps_c;
                        v->override_vta_pschroma[input_idx] = pipe->plane_res.scl_data.taps.v_taps_c;
+                       /*
+                        * Spreadsheet doesn't handle taps_c is one properly,
+                        * need to force Chroma to always be scaled to pass
+                        * bandwidth validation.
+                        */
+                       if (v->override_hta_pschroma[input_idx] == 1)
+                               v->override_hta_pschroma[input_idx] = 2;
+                       if (v->override_vta_pschroma[input_idx] == 1)
+                               v->override_vta_pschroma[input_idx] = 2;
                        v->source_scan[input_idx] = (pipe->plane_state->rotation % 2) ? dcn_bw_vert : dcn_bw_hor;
                }
                if (v->is_line_buffer_bpp_fixed == dcn_bw_yes)
drivers/gpu/drm/amd/display/dc/core/dc_link.c
index e27ed4a45265290690604b10e6d4df4fbee77514..42a111b9505dcb5190437a381c7dba8fda444719 100644
@@ -1801,7 +1801,7 @@ static void disable_link(struct dc_link *link, enum signal_type signal)
                link->link_enc->funcs->disable_output(link->link_enc, signal, link);
 }
 
-bool dp_active_dongle_validate_timing(
+static bool dp_active_dongle_validate_timing(
                const struct dc_crtc_timing *timing,
                const struct dc_dongle_caps *dongle_caps)
 {
@@ -1833,6 +1833,8 @@ bool dp_active_dongle_validate_timing(
        /* Check Color Depth and Pixel Clock */
        if (timing->pixel_encoding == PIXEL_ENCODING_YCBCR420)
                required_pix_clk /= 2;
+       else if (timing->pixel_encoding == PIXEL_ENCODING_YCBCR422)
+               required_pix_clk = required_pix_clk * 2 / 3;
 
        switch (timing->display_color_depth) {
        case COLOR_DEPTH_666:
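
The added branch accounts for chroma subsampling in the dongle bandwidth check: 4:2:0 carries half the samples of 4:4:4, and 4:2:2 carries two thirds, so for example a 600MHz 4:4:4 mode needs only 400MHz of effective pixel clock when sent as 4:2:2. As a hypothetical helper mirroring just this arithmetic:

    static unsigned int dongle_required_clk_khz(unsigned int pix_clk_khz,
                                                int is_ycbcr420, int is_ycbcr422)
    {
        if (is_ycbcr420)
            return pix_clk_khz / 2;     /* half the 4:4:4 sample count */
        if (is_ycbcr422)
            return pix_clk_khz * 2 / 3; /* two thirds of 4:4:4 */
        return pix_clk_khz;
    }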
drivers/gpu/drm/amd/display/dc/dce110/dce110_hw_sequencer.c
index 07ff8d2faf3f4630276d9241092f605274375cda..d844fadcd56f048739e374cb8d534cba10d235b3 100644
@@ -2866,16 +2866,19 @@ static void dce110_apply_ctx_for_surface(
                int num_planes,
                struct dc_state *context)
 {
-       int i, be_idx;
+       int i;
 
        if (num_planes == 0)
                return;
 
-       be_idx = -1;
        for (i = 0; i < dc->res_pool->pipe_count; i++) {
-               if (stream == context->res_ctx.pipe_ctx[i].stream) {
-                       be_idx = context->res_ctx.pipe_ctx[i].stream_res.tg->inst;
-                       break;
+               struct pipe_ctx *pipe_ctx = &context->res_ctx.pipe_ctx[i];
+               struct pipe_ctx *old_pipe_ctx = &dc->current_state->res_ctx.pipe_ctx[i];
+
+               if (stream == pipe_ctx->stream) {
+                       if (!pipe_ctx->top_pipe &&
+                               (pipe_ctx->plane_state || old_pipe_ctx->plane_state))
+                               dc->hwss.pipe_control_lock(dc, pipe_ctx, true);
                }
        }
 
@@ -2895,9 +2898,22 @@ static void dce110_apply_ctx_for_surface(
                                        context->stream_count);
 
                dce110_program_front_end_for_pipe(dc, pipe_ctx);
+
+               dc->hwss.update_plane_addr(dc, pipe_ctx);
+
                program_surface_visibility(dc, pipe_ctx);
 
        }
+
+       for (i = 0; i < dc->res_pool->pipe_count; i++) {
+               struct pipe_ctx *pipe_ctx = &context->res_ctx.pipe_ctx[i];
+               struct pipe_ctx *old_pipe_ctx = &dc->current_state->res_ctx.pipe_ctx[i];
+
+               if ((stream == pipe_ctx->stream) &&
+                       (!pipe_ctx->top_pipe) &&
+                       (pipe_ctx->plane_state || old_pipe_ctx->plane_state))
+                       dc->hwss.pipe_control_lock(dc, pipe_ctx, false);
+       }
 }
 
 static void dce110_power_down_fe(struct dc *dc, int fe_idx)
drivers/gpu/drm/amd/display/dc/dcn10/dcn10_dpp_dscl.c
index 74e7c82bdc76a71080d8e22dd6db90179ffee54e..a9d55d0dd69e009f4a31038c9529c86c05904ec7 100644
@@ -159,11 +159,10 @@ bool dpp_get_optimal_number_of_taps(
                        scl_data->taps.h_taps = 1;
                if (IDENTITY_RATIO(scl_data->ratios.vert))
                        scl_data->taps.v_taps = 1;
-               /*
-                * Spreadsheet doesn't handle taps_c is one properly,
-                * need to force Chroma to always be scaled to pass
-                * bandwidth validation.
-                */
+               if (IDENTITY_RATIO(scl_data->ratios.horz_c))
+                       scl_data->taps.h_taps_c = 1;
+               if (IDENTITY_RATIO(scl_data->ratios.vert_c))
+                       scl_data->taps.v_taps_c = 1;
        }
 
        return true;
drivers/gpu/drm/drm_lease.c
index 59849f02e2ad5bb74559ea85fbeb6fc1dd97bde6..1402c0e71b03d18866139056b12f0d5fd84b6afb 100644
@@ -220,17 +220,6 @@ static struct drm_master *drm_lease_create(struct drm_master *lessor, struct idr
 
        mutex_lock(&dev->mode_config.idr_mutex);
 
-       /* Insert the new lessee into the tree */
-       id = idr_alloc(&(drm_lease_owner(lessor)->lessee_idr), lessee, 1, 0, GFP_KERNEL);
-       if (id < 0) {
-               error = id;
-               goto out_lessee;
-       }
-
-       lessee->lessee_id = id;
-       lessee->lessor = drm_master_get(lessor);
-       list_add_tail(&lessee->lessee_list, &lessor->lessees);
-
        idr_for_each_entry(leases, entry, object) {
                error = 0;
                if (!idr_find(&dev->mode_config.crtc_idr, object))
@@ -246,6 +235,17 @@ static struct drm_master *drm_lease_create(struct drm_master *lessor, struct idr
                }
        }
 
+       /* Insert the new lessee into the tree */
+       id = idr_alloc(&(drm_lease_owner(lessor)->lessee_idr), lessee, 1, 0, GFP_KERNEL);
+       if (id < 0) {
+               error = id;
+               goto out_lessee;
+       }
+
+       lessee->lessee_id = id;
+       lessee->lessor = drm_master_get(lessor);
+       list_add_tail(&lessee->lessee_list, &lessor->lessees);
+
        /* Move the leases over */
        lessee->leases = *leases;
        DRM_DEBUG_LEASE("new lessee %d %p, lessor %d %p\n", lessee->lessee_id, lessee, lessor->lessee_id, lessor);
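
Moving idr_alloc() and the lessor linkage below the validation loop means a lease naming a bogus object now fails before the lessee is ever published, so the error path has nothing to unlink. The reordering in schematic, self-contained form (all names placeholders):

    #include <errno.h>
    #include <stdbool.h>

    struct lease { struct lease *next; };
    struct owner { struct lease *leases; int max_object; };

    static bool object_valid(const struct owner *o, int id)
    {
        return id > 0 && id <= o->max_object;
    }

    /*
     * Do everything that can fail before the new lessee becomes
     * reachable from shared structures.
     */
    static int create_lease(struct owner *o, struct lease *l,
                            const int *objs, int n)
    {
        for (int i = 0; i < n; i++)
            if (!object_valid(o, objs[i]))
                return -ENOENT;         /* nothing published yet */

        l->next = o->leases;            /* publish only after validation */
        o->leases = l;
        return 0;
    }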
drivers/gpu/drm/drm_plane.c
index 37a93cdffb4ad0e7986a634df4d70ccc3fef286e..2c90519576a3e8b63a4c8361f18672db853ebcec 100644
@@ -558,11 +558,10 @@ int drm_plane_check_pixel_format(const struct drm_plane *plane, u32 format)
 }
 
 /*
- * setplane_internal - setplane handler for internal callers
+ * __setplane_internal - setplane handler for internal callers
  *
- * Note that we assume an extra reference has already been taken on fb.  If the
- * update fails, this reference will be dropped before return; if it succeeds,
- * the previous framebuffer (if any) will be unreferenced instead.
+ * This function will take a reference on the new fb for the plane
+ * on success.
  *
  * src_{x,y,w,h} are provided in 16.16 fixed point format
  */
@@ -630,14 +629,12 @@ static int __setplane_internal(struct drm_plane *plane,
        if (!ret) {
                plane->crtc = crtc;
                plane->fb = fb;
-               fb = NULL;
+               drm_framebuffer_get(plane->fb);
        } else {
                plane->old_fb = NULL;
        }
 
 out:
-       if (fb)
-               drm_framebuffer_put(fb);
        if (plane->old_fb)
                drm_framebuffer_put(plane->old_fb);
        plane->old_fb = NULL;
@@ -685,6 +682,7 @@ int drm_mode_setplane(struct drm_device *dev, void *data,
        struct drm_plane *plane;
        struct drm_crtc *crtc = NULL;
        struct drm_framebuffer *fb = NULL;
+       int ret;
 
        if (!drm_core_check_feature(dev, DRIVER_MODESET))
                return -EINVAL;
@@ -717,15 +715,16 @@ int drm_mode_setplane(struct drm_device *dev, void *data,
                }
        }
 
-       /*
-        * setplane_internal will take care of deref'ing either the old or new
-        * framebuffer depending on success.
-        */
-       return setplane_internal(plane, crtc, fb,
-                                plane_req->crtc_x, plane_req->crtc_y,
-                                plane_req->crtc_w, plane_req->crtc_h,
-                                plane_req->src_x, plane_req->src_y,
-                                plane_req->src_w, plane_req->src_h);
+       ret = setplane_internal(plane, crtc, fb,
+                               plane_req->crtc_x, plane_req->crtc_y,
+                               plane_req->crtc_w, plane_req->crtc_h,
+                               plane_req->src_x, plane_req->src_y,
+                               plane_req->src_w, plane_req->src_h);
+
+       if (fb)
+               drm_framebuffer_put(fb);
+
+       return ret;
 }
 
 static int drm_mode_cursor_universal(struct drm_crtc *crtc,
@@ -788,13 +787,12 @@ static int drm_mode_cursor_universal(struct drm_crtc *crtc,
                src_h = fb->height << 16;
        }
 
-       /*
-        * setplane_internal will take care of deref'ing either the old or new
-        * framebuffer depending on success.
-        */
        ret = __setplane_internal(crtc->cursor, crtc, fb,
-                               crtc_x, crtc_y, crtc_w, crtc_h,
-                               0, 0, src_w, src_h, ctx);
+                                 crtc_x, crtc_y, crtc_w, crtc_h,
+                                 0, 0, src_w, src_h, ctx);
+
+       if (fb)
+               drm_framebuffer_put(fb);
 
        /* Update successful; save new cursor position, if necessary */
        if (ret == 0 && req->flags & DRM_MODE_CURSOR_MOVE) {
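
The drm_plane hunks change reference ownership: instead of callers pre-donating a reference that setplane consumed on failure and kept on success, setplane now takes its own reference with drm_framebuffer_get() when the update succeeds, and every caller drops its lookup reference unconditionally afterwards; the symmetric get/put is easier to audit. A toy version of the two sides of that balanced scheme:

    struct fb { int refs; };

    static void fb_get(struct fb *f) { f->refs++; }
    static void fb_put(struct fb *f) { f->refs--; }

    static int install(struct fb **slot, struct fb *f, int fail)
    {
        if (fail)
            return -1;                  /* failure: takes no reference */
        fb_get(f);                      /* success: takes its own reference */
        *slot = f;
        return 0;
    }

    static void caller(struct fb **slot, struct fb *f, int fail)
    {
        fb_get(f);                      /* lookup reference */
        install(slot, f, fail);
        fb_put(f);                      /* dropped either way -- balanced */
    }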
drivers/gpu/drm/drm_syncobj.c
index f776fc1cc543abf8e752a5133aaf1ca63fb2d8ff..cb4d09c70fd44647f30b6d10244f25e90db0835f 100644
@@ -369,40 +369,26 @@ static const struct file_operations drm_syncobj_file_fops = {
        .release = drm_syncobj_file_release,
 };
 
-static int drm_syncobj_alloc_file(struct drm_syncobj *syncobj)
-{
-       struct file *file = anon_inode_getfile("syncobj_file",
-                                              &drm_syncobj_file_fops,
-                                              syncobj, 0);
-       if (IS_ERR(file))
-               return PTR_ERR(file);
-
-       drm_syncobj_get(syncobj);
-       if (cmpxchg(&syncobj->file, NULL, file)) {
-               /* lost the race */
-               fput(file);
-       }
-
-       return 0;
-}
-
 int drm_syncobj_get_fd(struct drm_syncobj *syncobj, int *p_fd)
 {
-       int ret;
+       struct file *file;
        int fd;
 
        fd = get_unused_fd_flags(O_CLOEXEC);
        if (fd < 0)
                return fd;
 
-       if (!syncobj->file) {
-               ret = drm_syncobj_alloc_file(syncobj);
-               if (ret) {
-                       put_unused_fd(fd);
-                       return ret;
-               }
+       file = anon_inode_getfile("syncobj_file",
+                                 &drm_syncobj_file_fops,
+                                 syncobj, 0);
+       if (IS_ERR(file)) {
+               put_unused_fd(fd);
+               return PTR_ERR(file);
        }
-       fd_install(fd, syncobj->file);
+
+       drm_syncobj_get(syncobj);
+       fd_install(fd, file);
+
        *p_fd = fd;
        return 0;
 }
@@ -422,31 +408,24 @@ static int drm_syncobj_handle_to_fd(struct drm_file *file_private,
        return ret;
 }
 
-static struct drm_syncobj *drm_syncobj_fdget(int fd)
-{
-       struct file *file = fget(fd);
-
-       if (!file)
-               return NULL;
-       if (file->f_op != &drm_syncobj_file_fops)
-               goto err;
-
-       return file->private_data;
-err:
-       fput(file);
-       return NULL;
-};
-
 static int drm_syncobj_fd_to_handle(struct drm_file *file_private,
                                    int fd, u32 *handle)
 {
-       struct drm_syncobj *syncobj = drm_syncobj_fdget(fd);
+       struct drm_syncobj *syncobj;
+       struct file *file;
        int ret;
 
-       if (!syncobj)
+       file = fget(fd);
+       if (!file)
                return -EINVAL;
 
+       if (file->f_op != &drm_syncobj_file_fops) {
+               fput(file);
+               return -EINVAL;
+       }
+
        /* take a reference to put in the idr */
+       syncobj = file->private_data;
        drm_syncobj_get(syncobj);
 
        idr_preload(GFP_KERNEL);
@@ -455,12 +434,14 @@ static int drm_syncobj_fd_to_handle(struct drm_file *file_private,
        spin_unlock(&file_private->syncobj_table_lock);
        idr_preload_end();