Commit 7a69f9c6 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Continued work to add support for 5-level paging provided by future
     Intel CPUs. In particular we switch the x86 GUP code to the generic
     implementation. (Kirill A. Shutemov)

   - Continued work to add PCID CPU support to native kernels as well.
     In this round most of the focus is on reworking/refreshing the TLB
     flush infrastructure for the upcoming PCID changes. (Andy
     Lutomirski)"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits)
  x86/mm: Delete a big outdated comment about TLB flushing
  x86/mm: Don't reenter flush_tlb_func_common()
  x86/KASLR: Fix detection 32/64 bit bootloaders for 5-level paging
  x86/ftrace: Exclude functions in head64.c from function-tracing
  x86/mmap, ASLR: Do not treat unlimited-stack tasks as legacy mmap
  x86/mm: Remove reset_lazy_tlbstate()
  x86/ldt: Simplify the LDT switching logic
  x86/boot/64: Put __startup_64() into .head.text
  x86/mm: Add support for 5-level paging for KASLR
  x86/mm: Make kernel_physical_mapping_init() support 5-level paging
  x86/mm: Add sync_global_pgds() for configuration with 5-level paging
  x86/boot/64: Add support of additional page table level during early boot
  x86/boot/64: Rename init_level4_pgt and early_level4_pgt
  x86/boot/64: Rewrite startup_64() in C
  x86/boot/compressed: Enable 5-level paging during decompression stage
  x86/boot/efi: Define __KERNEL32_CS GDT on 64-bit configurations
  x86/boot/efi: Fix __KERNEL_CS definition of GDT entry on 64-bit configurations
  x86/boot/efi: Cleanup initialization of GDT entries
  x86/asm: Fix comment in return_from_SYSCALL_64()
  x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation
  ...
parents 9bc088ab 8781fb7e
......@@ -1638,7 +1638,7 @@ config ARCH_SELECT_MEMORY_MODEL
config HAVE_ARCH_PFN_VALID
def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM
config HAVE_GENERIC_RCU_GUP
config HAVE_GENERIC_GUP
def_bool y
depends on ARM_LPAE
......
......@@ -205,7 +205,7 @@ config GENERIC_CALIBRATE_DELAY
config ZONE_DMA
def_bool y
config HAVE_GENERIC_RCU_GUP
config HAVE_GENERIC_GUP
def_bool y
config ARCH_DMA_ADDR_T_64BIT
......
......@@ -184,7 +184,7 @@ config PPC
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS
select HAVE_GENERIC_RCU_GUP
select HAVE_GENERIC_GUP
select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx)
select HAVE_IDE
select HAVE_IOREMAP_PROT
......
......@@ -69,7 +69,7 @@ config X86
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select BUILDTIME_EXTABLE_SORT
......@@ -2793,6 +2793,9 @@ config X86_DMA_REMAP
bool
depends on STA2X11
config HAVE_GENERIC_GUP
def_bool y
source "net/Kconfig"
source "drivers/Kconfig"
......
......@@ -1046,9 +1046,31 @@ struct boot_params *efi_main(struct efi_config *c,
memset((char *)gdt->address, 0x0, gdt->size);
desc = (struct desc_struct *)gdt->address;
/* The first GDT is a dummy and the second is unused. */
desc += 2;
/* The first GDT is a dummy. */
desc++;
if (IS_ENABLED(CONFIG_X86_64)) {
/* __KERNEL32_CS */
desc->limit0 = 0xffff;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
desc->s = DESC_TYPE_CODE_DATA;
desc->dpl = 0;
desc->p = 1;
desc->limit = 0xf;
desc->avl = 0;
desc->l = 0;
desc->d = SEG_OP_SIZE_32BIT;
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
desc++;
} else {
/* Second entry is unused on 32-bit */
desc++;
}
/* __KERNEL_CS */
desc->limit0 = 0xffff;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
......@@ -1058,12 +1080,18 @@ struct boot_params *efi_main(struct efi_config *c,
desc->p = 1;
desc->limit = 0xf;
desc->avl = 0;
desc->l = 0;
desc->d = SEG_OP_SIZE_32BIT;
if (IS_ENABLED(CONFIG_X86_64)) {
desc->l = 1;
desc->d = 0;
} else {
desc->l = 0;
desc->d = SEG_OP_SIZE_32BIT;
}
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
desc++;
/* __KERNEL_DS */
desc->limit0 = 0xffff;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
......@@ -1077,24 +1105,25 @@ struct boot_params *efi_main(struct efi_config *c,
desc->d = SEG_OP_SIZE_32BIT;
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
#ifdef CONFIG_X86_64
/* Task segment value */
desc++;
desc->limit0 = 0x0000;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
desc->type = SEG_TYPE_TSS;
desc->s = 0;
desc->dpl = 0;
desc->p = 1;
desc->limit = 0x0;
desc->avl = 0;
desc->l = 0;
desc->d = 0;
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
#endif /* CONFIG_X86_64 */
if (IS_ENABLED(CONFIG_X86_64)) {
/* Task segment value */
desc->limit0 = 0x0000;
desc->base0 = 0x0000;
desc->base1 = 0x0000;
desc->type = SEG_TYPE_TSS;
desc->s = 0;
desc->dpl = 0;
desc->p = 1;
desc->limit = 0x0;
desc->avl = 0;
desc->l = 0;
desc->d = 0;
desc->g = SEG_GRANULARITY_4KB;
desc->base2 = 0x00;
desc++;
}
asm volatile("cli");
asm volatile ("lgdt %0" : : "m" (*gdt));
......
......@@ -346,6 +346,48 @@ preferred_addr:
/* Set up the stack */
leaq boot_stack_end(%rbx), %rsp
#ifdef CONFIG_X86_5LEVEL
/* Check if 5-level paging has already enabled */
movq %cr4, %rax
testl $X86_CR4_LA57, %eax
jnz lvl5
/*
* At this point we are in long mode with 4-level paging enabled,
* but we want to enable 5-level paging.
*
* The problem is that we cannot do it directly. Setting LA57 in
* long mode would trigger #GP. So we need to switch off long mode
* first.
*
* NOTE: This is not going to work if bootloader put us above 4G
* limit.
*
* The first step is go into compatibility mode.
*/
/* Clear additional page table */
leaq lvl5_pgtable(%rbx), %rdi
xorq %rax, %rax
movq $(PAGE_SIZE/8), %rcx
rep stosq
/*
* Setup current CR3 as the first and only entry in a new top level
* page table.
*/
movq %cr3, %rdi
leaq 0x7 (%rdi), %rax
movq %rax, lvl5_pgtable(%rbx)
/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
pushq $__KERNEL32_CS
leaq compatible_mode(%rip), %rax
pushq %rax
lretq
lvl5:
#endif
/* Zero EFLAGS */
pushq $0
popfq
......@@ -429,6 +471,44 @@ relocated:
jmp *%rax
.code32
#ifdef CONFIG_X86_5LEVEL
compatible_mode:
/* Setup data and stack segments */
movl $__KERNEL_DS, %eax
movl %eax, %ds
movl %eax, %ss
/* Disable paging */
movl %cr0, %eax
btrl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0
/* Point CR3 to 5-level paging */
leal lvl5_pgtable(%ebx), %eax
movl %eax, %cr3
/* Enable PAE and LA57 mode */
movl %cr4, %eax
orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
movl %eax, %cr4
/* Calculate address we are running at */
call 1f
1: popl %edi
subl $1b, %edi
/* Prepare stack for far return to Long Mode */
pushl $__KERNEL_CS
leal lvl5(%edi), %eax
push %eax
/* Enable paging back */
movl $(X86_CR0_PG | X86_CR0_PE), %eax
movl %eax, %cr0
lret
#endif
no_longmode:
/* This isn't an x86-64 CPU so hang */
1:
......@@ -442,7 +522,7 @@ gdt:
.word gdt_end - gdt
.long gdt
.word 0
.quad 0x0000000000000000 /* NULL descriptor */
.quad 0x00cf9a000000ffff /* __KERNEL32_CS */
.quad 0x00af9a000000ffff /* __KERNEL_CS */
.quad 0x00cf92000000ffff /* __KERNEL_DS */
.quad 0x0080890000000000 /* TS descriptor */
......@@ -486,3 +566,7 @@ boot_stack_end:
.balign 4096
pgtable:
.fill BOOT_PGT_SIZE, 1, 0
#ifdef CONFIG_X86_5LEVEL
lvl5_pgtable:
.fill PAGE_SIZE, 1, 0
#endif
......@@ -63,7 +63,7 @@ static void *alloc_pgt_page(void *context)
static struct alloc_pgt_data pgt_data;
/* The top level page table entry pointer. */
static unsigned long level4p;
static unsigned long top_level_pgt;
/*
* Mapping information structure passed to kernel_ident_mapping_init().
......@@ -91,9 +91,15 @@ void initialize_identity_maps(void)
* If we came here via startup_32(), cr3 will be _pgtable already
* and we must append to the existing area instead of entirely
* overwriting it.
*
* With 5-level paging, we use '_pgtable' to allocate the p4d page table,
* the top-level page table is allocated separately.
*
* p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
* cases. On 4-level paging it's equal to 'top_level_pgt'.
*/
level4p = read_cr3();
if (level4p == (unsigned long)_pgtable) {
top_level_pgt = read_cr3_pa();
if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
debug_putstr("booted via startup_32()\n");
pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
......@@ -103,7 +109,7 @@ void initialize_identity_maps(void)
pgt_data.pgt_buf = _pgtable;
pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
level4p = (unsigned long)alloc_pgt_page(&pgt_data);
top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
}
}
......@@ -123,7 +129,7 @@ void add_identity_map(unsigned long start, unsigned long size)
return;
/* Build the mapping. */
kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p,
kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
start, end);
}
......@@ -134,5 +140,5 @@ void add_identity_map(unsigned long start, unsigned long size)
*/
void finalize_identity_maps(void)
{
write_cr3(level4p);
write_cr3(top_level_pgt);
}
......@@ -265,7 +265,8 @@ return_from_SYSCALL_64:
* If width of "canonical tail" ever becomes variable, this will need
* to be updated to remain correct on both old and new CPUs.
*
* Change top 16 bits to be the sign-extension of 47th bit
* Change top bits to match most significant bit (47th or 56th bit
* depending on paging mode) in the address.
*/
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
......
......@@ -2111,8 +2111,7 @@ static int x86_pmu_event_init(struct perf_event *event)
static void refresh_pce(void *ignored)
{
if (current->active_mm)
load_mm_cr4(current->active_mm);
load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
}
static void x86_pmu_event_mapped(struct perf_event *event)
......@@ -2344,7 +2343,7 @@ static unsigned long get_segment_base(unsigned int segment)
/* IRQs are off, so this synchronizes with smp_store_release */
ldt = lockless_dereference(current->active_mm->context.ldt);
if (!ldt || idx > ldt->size)
if (!ldt || idx > ldt->nr_entries)
return 0;
desc = &ldt->entries[idx];
......
......@@ -74,7 +74,7 @@ struct efi_scratch {
__kernel_fpu_begin(); \
\
if (efi_scratch.use_pgd) { \
efi_scratch.prev_cr3 = read_cr3(); \
efi_scratch.prev_cr3 = __read_cr3(); \
write_cr3((unsigned long)efi_scratch.efi_pgt); \
__flush_tlb_all(); \
} \
......
......@@ -22,8 +22,8 @@ typedef struct {
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
unsigned int irq_tlb_count;
#endif
unsigned int irq_tlb_count;
#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
#endif
......
......@@ -37,12 +37,6 @@ typedef struct {
#endif
} mm_context_t;
#ifdef CONFIG_SMP
void leave_mm(int cpu);
#else
static inline void leave_mm(int cpu)
{
}
#endif
#endif /* _ASM_X86_MMU_H */
......@@ -47,7 +47,7 @@ struct ldt_struct {
* allocations, but it's not worth trying to optimize.
*/
struct desc_struct *entries;
unsigned int size;
unsigned int nr_entries;
};
/*
......@@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm)
*/
if (unlikely(ldt))
set_ldt(ldt->entries, ldt->size);
set_ldt(ldt->entries, ldt->nr_entries);
else
clear_LDT();
#else
clear_LDT();
#endif
}
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* Load the LDT if either the old or new mm had an LDT.
*
* An mm will never go from having an LDT to not having an LDT. Two
* mms never share an LDT, so we don't gain anything by checking to
* see whether the LDT changed. There's also no guarantee that
* prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
* then prev->context.ldt will also be non-NULL.
*
* If we really cared, we could optimize the case where prev == next
* and we're exiting lazy mode. Most of the time, if this happens,
* we don't actually need to reload LDTR, but modify_ldt() is mostly
* used by legacy code and emulators where we don't need this level of
* performance.
*
* This uses | instead of || because it generates better code.
*/
if (unlikely((unsigned long)prev->context.ldt |
(unsigned long)next->context.ldt))
load_mm_ldt(next);
#endif
DEBUG_LOCKS_WARN_ON(preemptible());
}
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
#endif
}
static inline int init_new_context(struct task_struct *tsk,
......@@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
}
#endif
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
u32 pkru = read_pkru();
if (!__pkru_allows_read(pkru, pkey))
return false;
if (write && !__pkru_allows_write(pkru, pkey))
return false;
return true;
}
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
......@@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}
/*
* This can be used from process context to figure out what the value of
* CR3 is without needing to do a (slow) __read_cr3().
*
* It's intended to be used for code like KVM that sneakily changes CR3
* and needs to restore it. It needs to be used very carefully.
*/
static inline unsigned long __get_current_cr3_fast(void)
{
unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || !in_atomic());
VM_BUG_ON(cr3 != __read_cr3());
return cr3;
}
#endif /* _ASM_X86_MMU_CONTEXT_H */
......@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
}
static inline unsigned long read_cr3(void)
static inline unsigned long __read_cr3(void)
{
return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
}
......@@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr)
}
static inline void flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start,
unsigned long end)
const struct flush_tlb_info *info)
{
PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
}
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
......
......@@ -51,6 +51,7 @@ struct mm_struct;
struct desc_struct;
struct task_struct;
struct cpumask;
struct flush_tlb_info;
/*
* Wrapper type for pointers to code which uses the non-standard
......@@ -223,9 +224,7 @@ struct pv_mmu_ops {
void (*flush_tlb_kernel)(void);
void (*flush_tlb_single)(unsigned long addr);
void (*flush_tlb_others)(const struct cpumask *cpus,
struct mm_struct *mm,
unsigned long start,
unsigned long end);
const struct flush_tlb_info *info);
/* Hooks for allocating and freeing a pagetable top-level */
int (*pgd_alloc)(struct mm_struct *mm);
......
......@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
#define gup_get_pte gup_get_pte
/*
* WARNING: only to be used in the get_user_pages_fast() implementation.
*
* With get_user_pages_fast(), we walk down the pagetables without taking
* any locks. For this we would like to load the pointers atomically,
* but that is not possible (without expensive cmpxchg8b) on PAE. What
* we do have is the guarantee that a PTE will only either go from not
* present to present, or present to not present or both -- it will not
* switch to a completely different present page without a TLB flush in
* between; something that we are blocking by holding interrupts off.
*
* Setting ptes from not present to present goes:
*
* ptep->pte_high = h;
* smp_wmb();
* ptep->pte_low = l;
*
* And present to not present goes:
*
* ptep->pte_low = 0;
* smp_wmb();
* ptep->pte_high = 0;
*
* We must ensure here that the load of pte_low sees 'l' iff pte_high
* sees 'h'. We load pte_high *after* loading pte_low, which ensures we
* don't see an older value of pte_high. *Then* we recheck pte_low,
* which ensures that we haven't picked up a changed pte high. We might
* have gotten rubbish values from pte_low and pte_high, but we are
* guaranteed that pte_low will not have the present bit set *unless*
* it is 'l'. Because get_user_pages_fast() only operates on present ptes
* we're safe.
*/
static inline pte_t gup_get_pte(pte_t *ptep)
{
pte_t pte;
do {
pte.pte_low = ptep->pte_low;
smp_rmb();
pte.pte_high = ptep->pte_high;
smp_rmb();
} while (unlikely(pte.pte_low != ptep->pte_low));
return pte;
}
#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
......@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud)
return 0;
}
#endif
static inline int pgd_devmap(pgd_t pgd)
{
return 0;
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
......@@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry;
static inline void __meminit init_trampoline_default(void)
{
/* Default trampoline pgd value */
trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)];
trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
}
# ifdef CONFIG_RANDOMIZE_MEMORY
void __meminit init_trampoline(void);
......@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
#endif
}
static inline bool __pkru_allows_pkey(u16 pkey, bool write)
{
u32 pkru = read_pkru();
if (!__pkru_allows_read(pkru, pkey))
return false;
if (write && !__pkru_allows_write(pkru, pkey))
return false;
return true;
}
/*
* 'pteval' can come from a PTE, PMD or PUD. We only check
* _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
* same value on all 3 types.
*/
static inline bool __pte_access_permitted(unsigned long pteval, bool write)
{
unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
if (write)
need_pte_bits |= _PAGE_RW;
if ((pteval & need_pte_bits) != need_pte_bits)
return 0;
return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
}
#define pte_access_permitted pte_access_permitted
static inline bool pte_access_permitted(pte_t pte, bool write)
{
return __pte_access_permitted(pte_val(pte), write);