Merge branch 'akpm' (patches from Andrew)

Author:     Linus Torvalds <torvalds@linux-foundation.org>
AuthorDate: Sun, 22 Mar 2020 17:46:50 +0000 (10:46 -0700)
Commit:     Linus Torvalds <torvalds@linux-foundation.org>
CommitDate: Sun, 22 Mar 2020 17:46:50 +0000 (10:46 -0700)
Merge misc fixes from Andrew Morton:
 "10 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  x86/mm: split vmalloc_sync_all()
  mm, slub: prevent kmalloc_node crashes and memory leaks
  mm/mmu_notifier: silence PROVE_RCU_LIST warnings
  epoll: fix possible lost wakeup on epoll_ctl() path
  mm: do not allow MADV_PAGEOUT for CoW pages
  mm, memcg: throttle allocators based on ancestral memory.high
  mm, memcg: fix corruption on 64-bit divisor in memory.high throttling
  page-flags: fix a crash at SetPageError(THP_SWAP)
  mm/hotplug: fix hot remove failure in SPARSEMEM|!VMEMMAP case
  memcg: fix NULL pointer dereference in __mem_cgroup_usage_unregister_event

13 files changed:
arch/x86/mm/fault.c
drivers/acpi/apei/ghes.c
fs/eventpoll.c
include/linux/page-flags.h
include/linux/vmalloc.h
kernel/notifier.c
mm/madvise.c
mm/memcontrol.c
mm/mmu_notifier.c
mm/nommu.c
mm/slub.c
mm/sparse.c
mm/vmalloc.c

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fa4ea09..629fdf1 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -190,7 +190,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
        return pmd_k;
 }
 
-void vmalloc_sync_all(void)
+static void vmalloc_sync(void)
 {
        unsigned long address;
 
@@ -217,6 +217,16 @@ void vmalloc_sync_all(void)
        }
 }
 
+void vmalloc_sync_mappings(void)
+{
+       vmalloc_sync();
+}
+
+void vmalloc_sync_unmappings(void)
+{
+       vmalloc_sync();
+}
+
 /*
  * 32-bit:
  *
@@ -319,11 +329,23 @@ out:
 
 #else /* CONFIG_X86_64: */
 
-void vmalloc_sync_all(void)
+void vmalloc_sync_mappings(void)
 {
+       /*
+        * 64-bit mappings might allocate new p4d/pud pages
+        * that need to be propagated to all tasks' PGDs.
+        */
        sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
 }
 
+void vmalloc_sync_unmappings(void)
+{
+       /*
+        * Unmappings never allocate or free p4d/pud pages.
+        * No work is required here.
+        */
+}
+
 /*
  * 64-bit:
  *
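
The split matters to callers: syncing after a new mapping stays mandatory,
while syncing after an unmapping becomes a no-op on 64-bit. A minimal sketch
of the mapping side, for a hypothetical module that hands a fresh vmalloc()
area to code running on a foreign page-table (e.g. from NMI context); the
ghes driver below is the real in-tree user:

    #include <linux/errno.h>
    #include <linux/vmalloc.h>

    static void *nmi_buf;

    static int example_init(void)
    {
            nmi_buf = vmalloc(PAGE_SIZE);
            if (!nmi_buf)
                    return -ENOMEM;

            /*
             * vmalloc() may have allocated new p4d/pud pages; propagate
             * them to every task's PGD before code running on a foreign
             * page-table dereferences the buffer.
             */
            vmalloc_sync_mappings();
            return 0;
    }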
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 103acbb..24c9642 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -171,7 +171,7 @@ int ghes_estatus_pool_init(int num_ghes)
         * New allocation must be visible in all pgd before it can be found by
         * an NMI allocating from the pool.
         */
-       vmalloc_sync_all();
+       vmalloc_sync_mappings();
 
        rc = gen_pool_add(ghes_estatus_pool, addr, PAGE_ALIGN(len), -1);
        if (rc)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b041b66..eee3c92 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1854,9 +1854,9 @@ fetch_events:
                waiter = true;
                init_waitqueue_entry(&wait, current);
 
-               spin_lock_irq(&ep->wq.lock);
+               write_lock_irq(&ep->lock);
                __add_wait_queue_exclusive(&ep->wq, &wait);
-               spin_unlock_irq(&ep->wq.lock);
+               write_unlock_irq(&ep->lock);
        }
 
        for (;;) {
@@ -1904,9 +1904,9 @@ send_events:
                goto fetch_events;
 
        if (waiter) {
-               spin_lock_irq(&ep->wq.lock);
+               write_lock_irq(&ep->lock);
                __remove_wait_queue(&ep->wq, &wait);
-               spin_unlock_irq(&ep->wq.lock);
+               write_unlock_irq(&ep->lock);
        }
 
        return res;
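
For context, the race this closes: ep_poll_callback() decides whether to
wake anyone via a lockless waitqueue_active(&ep->wq) check made under
ep->lock, so the sleeper must publish its wait entry under that same lock;
with ep->wq.lock the two sides were unordered and the wakeup could be lost.
A sketch, not part of the patch:

    /* sleeper (ep_poll), after the fix */
    write_lock_irq(&ep->lock);
    __add_wait_queue_exclusive(&ep->wq, &wait); /* ordered vs. the check */
    write_unlock_irq(&ep->lock);

    /* waker (ep_poll_callback), unchanged */
    if (waitqueue_active(&ep->wq))              /* lockless, relies on ep->lock */
            wake_up(&ep->wq);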
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 1bf83c8..77de28b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -311,7 +311,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; }
 
 __PAGEFLAG(Locked, locked, PF_NO_TAIL)
 PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
-PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
+PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
 PAGEFLAG(Referenced, referenced, PF_HEAD)
        TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
        __SETPAGEFLAG(Referenced, referenced, PF_HEAD)
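
The policy swap is the whole fix: PF_NO_COMPOUND carries a VM_BUG_ON for any
compound page, so SetPageError() on a THP under swap writeback (THP_SWAP)
crashed outright, while PF_NO_TAIL accepts head pages and redirects the
operation through compound_head(). A sketch of the difference (thp_head is a
placeholder):

    struct page *page = thp_head;   /* head of a THP being swapped out */

    /*
     * PF_NO_COMPOUND: VM_BUG_ON_PGFLAGS(PageCompound(page), page),
     *                 i.e. a guaranteed crash for every THP.
     * PF_NO_TAIL:     the flag is set on compound_head(page); only
     *                 modifying a tail page directly remains a bug.
     */
    SetPageError(page);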
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index ec38132..0507a16 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -141,8 +141,9 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma,
 
 extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
                                                        unsigned long pgoff);
-void vmalloc_sync_all(void);
-
+void vmalloc_sync_mappings(void);
+void vmalloc_sync_unmappings(void);
+
 /*
  *     Lowlevel-APIs (not for driver use!)
  */
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 63d7501..5989bbb 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -519,7 +519,7 @@ NOKPROBE_SYMBOL(notify_die);
 
 int register_die_notifier(struct notifier_block *nb)
 {
-       vmalloc_sync_all();
+       vmalloc_sync_mappings();
        return atomic_notifier_chain_register(&die_chain, nb);
 }
 EXPORT_SYMBOL_GPL(register_die_notifier);
diff --git a/mm/madvise.c b/mm/madvise.c
index 43b47d3..4bb30ed 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -335,12 +335,14 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                }
 
                page = pmd_page(orig_pmd);
+
+               /* Do not interfere with other mappings of this page */
+               if (page_mapcount(page) != 1)
+                       goto huge_unlock;
+
                if (next - addr != HPAGE_PMD_SIZE) {
                        int err;
 
-                       if (page_mapcount(page) != 1)
-                               goto huge_unlock;
-
                        get_page(page);
                        spin_unlock(ptl);
                        lock_page(page);
@@ -426,6 +428,10 @@ regular_page:
                        continue;
                }
 
+               /* Do not interfere with other mappings of this page */
+               if (page_mapcount(page) != 1)
+                       continue;
+
                VM_BUG_ON_PAGE(PageTransCompound(page), page);
 
                if (pte_young(ptent)) {
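
The rule being enforced: a process may only page out memory it is the sole
mapper of, so after fork() a child can no longer evict the parent's
CoW-shared working set. A minimal userspace sketch of the now-rejected
pattern (MADV_PAGEOUT needs kernel headers from v5.4 or later):

    #define _GNU_SOURCE
    #include <stdlib.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            size_t len = 2UL << 20;
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;
            p[0] = 1;               /* populate at least one page */

            if (fork() == 0) {
                    /*
                     * Child: page_mapcount() is now 2, so the kernel
                     * skips these pages instead of evicting memory the
                     * parent still uses.
                     */
                    madvise(p, len, MADV_PAGEOUT);
                    _exit(0);
            }
            return 0;
    }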
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2058b8d..7a4bd8b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2297,28 +2297,41 @@ static void high_work_func(struct work_struct *work)
  #define MEMCG_DELAY_SCALING_SHIFT 14
 
 /*
- * Scheduled by try_charge() to be executed from the userland return path
- * and reclaims memory over the high limit.
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
  */
-void mem_cgroup_handle_over_high(void)
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+                                         unsigned int nr_pages)
 {
-       unsigned long usage, high, clamped_high;
-       unsigned long pflags;
-       unsigned long penalty_jiffies, overage;
-       unsigned int nr_pages = current->memcg_nr_pages_over_high;
-       struct mem_cgroup *memcg;
+       unsigned long penalty_jiffies;
+       u64 max_overage = 0;
 
-       if (likely(!nr_pages))
-               return;
+       do {
+               unsigned long usage, high;
+               u64 overage;
 
-       memcg = get_mem_cgroup_from_mm(current->mm);
-       reclaim_high(memcg, nr_pages, GFP_KERNEL);
-       current->memcg_nr_pages_over_high = 0;
+               usage = page_counter_read(&memcg->memory);
+               high = READ_ONCE(memcg->high);
+
+               /*
+                * Prevent division by 0 in overage calculation by acting as if
+                * it was a threshold of 1 page
+                */
+               high = max(high, 1UL);
+
+               overage = usage - high;
+               overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+               overage = div64_u64(overage, high);
+
+               if (overage > max_overage)
+                       max_overage = overage;
+       } while ((memcg = parent_mem_cgroup(memcg)) &&
+                !mem_cgroup_is_root(memcg));
+
+       if (!max_overage)
+               return 0;
 
        /*
-        * memory.high is breached and reclaim is unable to keep up. Throttle
-        * allocators proactively to slow down excessive growth.
-        *
         * We use overage compared to memory.high to calculate the number of
         * jiffies to sleep (penalty_jiffies). Ideally this value should be
         * fairly lenient on small overages, and increasingly harsh when the
@@ -2326,24 +2339,9 @@ void mem_cgroup_handle_over_high(void)
         * its crazy behaviour, so we exponentially increase the delay based on
         * overage amount.
         */
-
-       usage = page_counter_read(&memcg->memory);
-       high = READ_ONCE(memcg->high);
-
-       if (usage <= high)
-               goto out;
-
-       /*
-        * Prevent division by 0 in overage calculation by acting as if it was a
-        * threshold of 1 page
-        */
-       clamped_high = max(high, 1UL);
-
-       overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
-                         clamped_high);
-
-       penalty_jiffies = ((u64)overage * overage * HZ)
-               >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+       penalty_jiffies = max_overage * max_overage * HZ;
+       penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
+       penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
 
        /*
         * Factor in the task's own contribution to the overage, such that four
@@ -2360,7 +2358,32 @@ void mem_cgroup_handle_over_high(void)
         * application moving forwards and also permit diagnostics, albeit
         * extremely slowly.
         */
-       penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+       return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+}
+
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+       unsigned long penalty_jiffies;
+       unsigned long pflags;
+       unsigned int nr_pages = current->memcg_nr_pages_over_high;
+       struct mem_cgroup *memcg;
+
+       if (likely(!nr_pages))
+               return;
+
+       memcg = get_mem_cgroup_from_mm(current->mm);
+       reclaim_high(memcg, nr_pages, GFP_KERNEL);
+       current->memcg_nr_pages_over_high = 0;
+
+       /*
+        * memory.high is breached and reclaim is unable to keep up. Throttle
+        * allocators proactively to slow down excessive growth.
+        */
+       penalty_jiffies = calculate_high_delay(memcg, nr_pages);
 
        /*
         * Don't sleep if the amount of jiffies this memcg owes us is so low
@@ -4027,7 +4050,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
        struct mem_cgroup_thresholds *thresholds;
        struct mem_cgroup_threshold_ary *new;
        unsigned long usage;
-       int i, j, size;
+       int i, j, size, entries;
 
        mutex_lock(&memcg->thresholds_lock);
 
@@ -4047,14 +4070,20 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
        __mem_cgroup_threshold(memcg, type == _MEMSWAP);
 
        /* Calculate new number of threshold */
-       size = 0;
+       size = entries = 0;
        for (i = 0; i < thresholds->primary->size; i++) {
                if (thresholds->primary->entries[i].eventfd != eventfd)
                        size++;
+               else
+                       entries++;
        }
 
        new = thresholds->spare;
 
+       /* If no items related to eventfd have been cleared, nothing to do */
+       if (!entries)
+               goto unlock;
+
        /* Set thresholds array to NULL if we don't have thresholds */
        if (!size) {
                kfree(new);
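
Two fixes land in this file: the overage arithmetic stays in u64 and is
divided with div64_u64(), because div_u64() truncates its divisor to 32 bits
and corrupted the result for large memory.high values, and the walk to the
root throttles a cgroup on the worst overage anywhere in its ancestry, not
just its own. The shape of the resulting delay, as a sketch (assuming
MEMCG_DELAY_PRECISION_SHIFT is 20, as in mainline at the time; the clamp and
the per-task scaling are applied afterwards):

    static unsigned long sketch_delay(u64 usage, u64 high)
    {
            /* fixed-point overage fraction, as in the loop above */
            u64 overage = div64_u64((usage - high) << 20, high);

            return (overage * overage * HZ) >> 20 >> 14;
    }

    /*
     * Equivalent to 64 * HZ * f^2 with f = (usage - high) / high.
     * Example: usage of 9 MiB against a high of 8 MiB gives f = 1/8,
     * so about 64 * HZ / 64 = HZ -- roughly one second of throttling.
     */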
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index ef3973a..06852b8 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -307,7 +307,8 @@ static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions,
         * ->release returns.
         */
        id = srcu_read_lock(&srcu);
-       hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist)
+       hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+                                srcu_read_lock_held(&srcu))
                /*
                 * If ->release runs before mmu_notifier_unregister it must be
                 * handled, as it's the only way for the driver to flush all
@@ -370,7 +371,8 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 
        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(subscription,
-                                &mm->notifier_subscriptions->list, hlist) {
+                                &mm->notifier_subscriptions->list, hlist,
+                                srcu_read_lock_held(&srcu)) {
                if (subscription->ops->clear_flush_young)
                        young |= subscription->ops->clear_flush_young(
                                subscription, mm, start, end);
@@ -389,7 +391,8 @@ int __mmu_notifier_clear_young(struct mm_struct *mm,
 
        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(subscription,
-                                &mm->notifier_subscriptions->list, hlist) {
+                                &mm->notifier_subscriptions->list, hlist,
+                                srcu_read_lock_held(&srcu)) {
                if (subscription->ops->clear_young)
                        young |= subscription->ops->clear_young(subscription,
                                                                mm, start, end);
@@ -407,7 +410,8 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
 
        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(subscription,
-                                &mm->notifier_subscriptions->list, hlist) {
+                                &mm->notifier_subscriptions->list, hlist,
+                                srcu_read_lock_held(&srcu)) {
                if (subscription->ops->test_young) {
                        young = subscription->ops->test_young(subscription, mm,
                                                              address);
@@ -428,7 +432,8 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
 
        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(subscription,
-                                &mm->notifier_subscriptions->list, hlist) {
+                                &mm->notifier_subscriptions->list, hlist,
+                                srcu_read_lock_held(&srcu)) {
                if (subscription->ops->change_pte)
                        subscription->ops->change_pte(subscription, mm, address,
                                                      pte);
@@ -476,7 +481,8 @@ static int mn_hlist_invalidate_range_start(
        int id;
 
        id = srcu_read_lock(&srcu);
-       hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
+       hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+                                srcu_read_lock_held(&srcu)) {
                const struct mmu_notifier_ops *ops = subscription->ops;
 
                if (ops->invalidate_range_start) {
@@ -528,7 +534,8 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
        int id;
 
        id = srcu_read_lock(&srcu);
-       hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
+       hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+                                srcu_read_lock_held(&srcu)) {
                /*
                 * Call invalidate_range here too to avoid the need for the
                 * subsystem of having to register an invalidate_range_end
@@ -582,7 +589,8 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
 
        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(subscription,
-                                &mm->notifier_subscriptions->list, hlist) {
+                                &mm->notifier_subscriptions->list, hlist,
+                                srcu_read_lock_held(&srcu)) {
                if (subscription->ops->invalidate_range)
                        subscription->ops->invalidate_range(subscription, mm,
                                                            start, end);
@@ -714,7 +722,8 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
 
        spin_lock(&mm->notifier_subscriptions->lock);
        hlist_for_each_entry_rcu(subscription,
-                                &mm->notifier_subscriptions->list, hlist) {
+                                &mm->notifier_subscriptions->list, hlist,
+                                lockdep_is_held(&mm->notifier_subscriptions->lock)) {
                if (subscription->ops != ops)
                        continue;
 
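
The pattern is uniform: hlist_for_each_entry_rcu() takes an optional fourth
argument, a lockdep expression naming the condition that makes the lockless
traversal legal, so CONFIG_PROVE_RCU_LIST stops warning when the walk is
protected by SRCU or by the writer-side lock rather than rcu_read_lock().
The generic form (all names are placeholders):

    /* reader protected by SRCU */
    idx = srcu_read_lock(&my_srcu);
    hlist_for_each_entry_rcu(pos, &my_list, node,
                             srcu_read_lock_held(&my_srcu))
            use(pos);
    srcu_read_unlock(&my_srcu, idx);

    /* writer-side walk protected by a spinlock */
    spin_lock(&my_lock);
    hlist_for_each_entry_rcu(pos, &my_list, node,
                             lockdep_is_held(&my_lock))
            use(pos);
    spin_unlock(&my_lock);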
diff --git a/mm/nommu.c b/mm/nommu.c
index bd2b4e5..318df4e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -370,10 +370,14 @@ void vm_unmap_aliases(void)
 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
 
 /*
- * Implement a stub for vmalloc_sync_all() if the architecture chose not to
- * have one.
+ * Implement stubs for vmalloc_sync_[un]mappings() if the architecture
+ * chose not to have them.
  */
-void __weak vmalloc_sync_all(void)
+void __weak vmalloc_sync_mappings(void)
+{
+}
+
+void __weak vmalloc_sync_unmappings(void)
 {
 }
 
diff --git a/mm/slub.c b/mm/slub.c
index 97580b4..6589b41 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1973,8 +1973,6 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
 
        if (node == NUMA_NO_NODE)
                searchnode = numa_mem_id();
-       else if (!node_present_pages(node))
-               searchnode = node_to_mem_node(node);
 
        object = get_partial_node(s, get_node(s, searchnode), c, flags);
        if (object || node != NUMA_NO_NODE)
@@ -2563,17 +2561,27 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
        struct page *page;
 
        page = c->page;
-       if (!page)
+       if (!page) {
+               /*
+                * if the node is not online or has no normal memory, just
+                * ignore the node constraint
+                */
+               if (unlikely(node != NUMA_NO_NODE &&
+                            !node_state(node, N_NORMAL_MEMORY)))
+                       node = NUMA_NO_NODE;
                goto new_slab;
+       }
 redo:
 
        if (unlikely(!node_match(page, node))) {
-               int searchnode = node;
-
-               if (node != NUMA_NO_NODE && !node_present_pages(node))
-                       searchnode = node_to_mem_node(node);
-
-               if (unlikely(!node_match(page, searchnode))) {
+               /*
+                * same as above but node_match() being false already
+                * implies node != NUMA_NO_NODE
+                */
+               if (!node_state(node, N_NORMAL_MEMORY)) {
+                       node = NUMA_NO_NODE;
+                       goto redo;
+               } else {
                        stat(s, ALLOC_NODE_MISMATCH);
                        deactivate_slab(s, page, c->freelist, c);
                        goto new_slab;
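
Seen from a caller, the fix replaces a crash with a quiet fallback: a
node-constrained allocation naming a node without normal memory used to be
redirected via node_to_mem_node(), which can itself return a memoryless
node, ending in a NULL kmem_cache_node dereference or in slabs deactivated
onto the wrong node's lists. Hypothetical trigger (node 1 offline or
memoryless, as during memory hot-remove):

    /* now falls back to NUMA_NO_NODE instead of crashing or leaking */
    void *obj = kmalloc_node(64, GFP_KERNEL, 1);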
diff --git a/mm/sparse.c b/mm/sparse.c
index 596b2a4..aadb729 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -734,6 +734,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
        struct mem_section *ms = __pfn_to_section(pfn);
        bool section_is_early = early_section(ms);
        struct page *memmap = NULL;
+       bool empty;
        unsigned long *subsection_map = ms->usage
                ? &ms->usage->subsection_map[0] : NULL;
 
@@ -764,7 +765,8 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
         * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
         */
        bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
-       if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
+       empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION);
+       if (empty) {
                unsigned long section_nr = pfn_to_section_nr(pfn);
 
                /*
@@ -779,13 +781,15 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
                        ms->usage = NULL;
                }
                memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
-               ms->section_mem_map = (unsigned long)NULL;
        }
 
        if (section_is_early && memmap)
                free_map_bootmem(memmap);
        else
                depopulate_section_memmap(pfn, nr_pages, altmap);
+
+       if (empty)
+               ms->section_mem_map = (unsigned long)NULL;
 }
 
 static struct page * __meminit section_activate(int nid, unsigned long pfn,
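
The reorder matters only for SPARSEMEM without VMEMMAP, where
depopulate_section_memmap() locates the memmap via pfn_to_page(), which in
turn decodes ms->section_mem_map; clearing the field first handed a garbage
pointer to the free path. The fixed ordering, in outline:

    memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
    depopulate_section_memmap(pfn, nr_pages, altmap); /* still decodes it */
    ms->section_mem_map = (unsigned long)NULL;        /* clear only now */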
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1f46c3b..6b8eeb0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1295,7 +1295,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
         * First make sure the mappings are removed from all page-tables
         * before they are freed.
         */
-       vmalloc_sync_all();
+       vmalloc_sync_unmappings();
 
        /*
         * TODO: to calculate a flush range without looping.
@@ -3128,16 +3128,19 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 EXPORT_SYMBOL(remap_vmalloc_range);
 
 /*
- * Implement a stub for vmalloc_sync_all() if the architecture chose not to
- * have one.
+ * Implement stubs for vmalloc_sync_[un]mappings() if the architecture chose
+ * not to have them.
  *
  * The purpose of this function is to make sure the vmalloc area
  * mappings are identical in all page-tables in the system.
  */
-void __weak vmalloc_sync_all(void)
+void __weak vmalloc_sync_mappings(void)
 {
 }
 
+void __weak vmalloc_sync_unmappings(void)
+{
+}
 
 static int f(pte_t *pte, unsigned long addr, void *data)
 {