Commit f5509cc1 authored by Kees Cook's avatar Kees Cook

mm: Hardened usercopy

This is the start of porting PAX_USERCOPY into the mainline kernel. This
is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The
work is based on code by PaX Team and Brad Spengler, and an earlier port
from Casey Schaufler. Additional non-slab page tests are from Rik van Riel.

This patch contains the logic for validating several conditions when
performing copy_to_user() and copy_from_user() on the kernel object
being copied to/from:
- address range doesn't wrap around
- address range isn't NULL or zero-allocated (with a non-zero copy size)
- if on the slab allocator:
  - object size must be less than or equal to copy size (when check is
    implemented in the allocator, which appear in subsequent patches)
- otherwise, object must not span page allocations (excepting Reserved
  and CMA ranges)
- if on the stack
  - object must not extend before/after the current process stack
  - object must be contained by a valid stack frame (when there is
    arch/build support for identifying stack frames)
- object must not overlap with kernel text
Signed-off-by: default avatarKees Cook <>
Tested-by: default avatarValdis Kletnieks <>
Tested-by: default avatarMichael Ellerman <>
parent 0f60a8ef
......@@ -155,6 +155,18 @@ void kfree(const void *);
void kzfree(const void *);
size_t ksize(const void *);
const char *__check_heap_object(const void *ptr, unsigned long n,
struct page *page);
static inline const char *__check_heap_object(const void *ptr,
unsigned long n,
struct page *page)
return NULL;
* Some archs want to perform DMA into kmalloc caches and need a guaranteed
* alignment larger than the alignment of a 64-bit integer.
......@@ -155,6 +155,21 @@ static inline int arch_within_stack_frames(const void * const stack,
extern void __check_object_size(const void *ptr, unsigned long n,
bool to_user);
static inline void check_object_size(const void *ptr, unsigned long n,
bool to_user)
__check_object_size(ptr, n, to_user);
static inline void check_object_size(const void *ptr, unsigned long n,
bool to_user)
{ }
#endif /* __KERNEL__ */
#endif /* _LINUX_THREAD_INFO_H */
......@@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n
KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
# Since __builtin_frame_address does work as used, disable the warning.
CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address)
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
......@@ -99,3 +102,4 @@ obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
* This implements the various checks for CONFIG_HARDENED_USERCOPY*,
* which are designed to protect kernel memory from needless exposure
* and overwrite under many unintended conditions. This code is based
* on PAX_USERCOPY, which is:
* Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
* Security Inc.
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
#include <linux/slab.h>
#include <asm/sections.h>
enum {
* Checks if a given pointer and length is contained by the current
* stack frame (if possible).
* Returns:
* NOT_STACK: not at all on the stack
* GOOD_FRAME: fully within a valid stack frame
* GOOD_STACK: fully on the stack (when can't do frame-checking)
* BAD_STACK: error condition (invalid stack position or bad stack frame)
static noinline int check_stack_object(const void *obj, unsigned long len)
const void * const stack = task_stack_page(current);
const void * const stackend = stack + THREAD_SIZE;
int ret;
/* Object is not on the stack at all. */
if (obj + len <= stack || stackend <= obj)
return NOT_STACK;
* Reject: object partially overlaps the stack (passing the
* the check above means at least one end is within the stack,
* so if this check fails, the other end is outside the stack).
if (obj < stack || stackend < obj + len)
return BAD_STACK;
/* Check if object is safely within a valid frame. */
ret = arch_within_stack_frames(stack, stackend, obj, len);
if (ret)
return ret;
return GOOD_STACK;
static void report_usercopy(const void *ptr, unsigned long len,
bool to_user, const char *type)
pr_emerg("kernel memory %s attempt detected %s %p (%s) (%lu bytes)\n",
to_user ? "exposure" : "overwrite",
to_user ? "from" : "to", ptr, type ? : "unknown", len);
* For greater effect, it would be nice to do do_group_exit(),
* but BUG() actually hooks all the lock-breaking and per-arch
* Oops code, so that is used here instead.
/* Returns true if any portion of [ptr,ptr+n) over laps with [low,high). */
static bool overlaps(const void *ptr, unsigned long n, unsigned long low,
unsigned long high)
unsigned long check_low = (uintptr_t)ptr;
unsigned long check_high = check_low + n;
/* Does not overlap if entirely above or entirely below. */
if (check_low >= high || check_high < low)
return false;
return true;
/* Is this address range in the kernel text area? */
static inline const char *check_kernel_text_object(const void *ptr,
unsigned long n)
unsigned long textlow = (unsigned long)_stext;
unsigned long texthigh = (unsigned long)_etext;
unsigned long textlow_linear, texthigh_linear;
if (overlaps(ptr, n, textlow, texthigh))
return "<kernel text>";
* Some architectures have virtual memory mappings with a secondary
* mapping of the kernel text, i.e. there is more than one virtual
* kernel address that points to the kernel image. It is usually
* when there is a separate linear physical memory mapping, in that
* __pa() is not just the reverse of __va(). This can be detected
* and checked:
textlow_linear = (unsigned long)__va(__pa(textlow));
/* No different mapping: we're done. */
if (textlow_linear == textlow)
return NULL;
/* Check the secondary mapping... */
texthigh_linear = (unsigned long)__va(__pa(texthigh));
if (overlaps(ptr, n, textlow_linear, texthigh_linear))
return "<linear kernel text>";
return NULL;
static inline const char *check_bogus_address(const void *ptr, unsigned long n)
/* Reject if object wraps past end of memory. */
if (ptr + n < ptr)
return "<wrapped address>";
/* Reject if NULL or ZERO-allocation. */
if (ZERO_OR_NULL_PTR(ptr))
return "<null>";
return NULL;
static inline const char *check_heap_object(const void *ptr, unsigned long n,
bool to_user)
struct page *page, *endpage;
const void *end = ptr + n - 1;
bool is_reserved, is_cma;
* Some architectures (arm64) return true for virt_addr_valid() on
* vmalloced addresses. Work around this by checking for vmalloc
* first.
if (is_vmalloc_addr(ptr))
return NULL;
if (!virt_addr_valid(ptr))
return NULL;
page = virt_to_head_page(ptr);
/* Check slab allocator for flags and size. */
if (PageSlab(page))
return __check_heap_object(ptr, n, page);
* Sometimes the kernel data regions are not marked Reserved (see
* check below). And sometimes [_sdata,_edata) does not cover
* rodata and/or bss, so check each range explicitly.
/* Allow reads of kernel rodata region (if not marked as Reserved). */
if (ptr >= (const void *)__start_rodata &&
end <= (const void *)__end_rodata) {
if (!to_user)
return "<rodata>";
return NULL;
/* Allow kernel data region (if not marked as Reserved). */
if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
return NULL;
/* Allow kernel bss region (if not marked as Reserved). */
if (ptr >= (const void *)__bss_start &&
end <= (const void *)__bss_stop)
return NULL;
/* Is the object wholly within one base page? */
if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
((unsigned long)end & (unsigned long)PAGE_MASK)))
return NULL;
/* Allow if start and end are inside the same compound page. */
endpage = virt_to_head_page(end);
if (likely(endpage == page))
return NULL;
* Reject if range is entirely either Reserved (i.e. special or
* device memory), or CMA. Otherwise, reject since the object spans
* several independently allocated pages.
is_reserved = PageReserved(page);
is_cma = is_migrate_cma_page(page);
if (!is_reserved && !is_cma)
goto reject;
for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
page = virt_to_head_page(ptr);
if (is_reserved && !PageReserved(page))
goto reject;
if (is_cma && !is_migrate_cma_page(page))
goto reject;
return NULL;
return "<spans multiple pages>";
* Validates that the given object is:
* - not bogus address
* - known-safe heap or stack object
* - not in kernel text
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
const char *err;
/* Skip all tests if size is zero. */
if (!n)
/* Check for invalid addresses. */
err = check_bogus_address(ptr, n);
if (err)
goto report;
/* Check for bad heap object. */
err = check_heap_object(ptr, n, to_user);
if (err)
goto report;
/* Check for bad stack object. */
switch (check_stack_object(ptr, n)) {
/* Object is not touching the current process stack. */
* Object is either in the correct frame (when it
* is possible to check) or just generally on the
* process stack (when frame checking not available).
err = "<process stack>";
goto report;
/* Check for object in kernel to avoid text exposure. */
err = check_kernel_text_object(ptr, n);
if (!err)
report_usercopy(ptr, n, to_user, err);
......@@ -118,6 +118,34 @@ config LSM_MMAP_MIN_ADDR
this low address space will need the permission specific to the
systems running LSM.
The heap allocator implements __check_heap_object() for
validating memory ranges against heap object sizes in
The architecture supports CONFIG_HARDENED_USERCOPY by
calling check_object_size() just before performing the
userspace copies in the low level implementation of
copy_to_user() and copy_from_user().
bool "Harden memory copies between kernel and userspace"
select BUG
This option checks for obviously wrong memory regions when
copying memory to/from the kernel (via copy_to_user() and
copy_from_user() functions) by rejecting memory ranges that
are larger than the specified heap object, span multiple
separately allocates pages, are not on the process stack,
or are part of the kernel text. This kills entire classes
of heap overflow exploits and similar kernel memory exposures.
source security/selinux/Kconfig
source security/smack/Kconfig
source security/tomoyo/Kconfig
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment