return pmd_k;
}
-void vmalloc_sync_all(void)
+static void vmalloc_sync(void)
{
unsigned long address;
}
}
+void vmalloc_sync_mappings(void)
+{
+ vmalloc_sync();
+}
+
+void vmalloc_sync_unmappings(void)
+{
+ vmalloc_sync();
+}
+
/*
* 32-bit:
*
#else /* CONFIG_X86_64: */
-void vmalloc_sync_all(void)
+void vmalloc_sync_mappings(void)
{
+ /*
+ * 64-bit mappings might allocate new p4d/pud pages
+ * that need to be propagated to all tasks' PGDs.
+ */
sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
}
+void vmalloc_sync_unmappings(void)
+{
+ /*
+ * Unmappings never allocate or free p4d/pud pages.
+ * No work is required here.
+ */
+}
+
/*
* 64-bit:
*
* New allocation must be visible in all pgd before it can be found by
* an NMI allocating from the pool.
*/
- vmalloc_sync_all();
+ vmalloc_sync_mappings();
rc = gen_pool_add(ghes_estatus_pool, addr, PAGE_ALIGN(len), -1);
if (rc)
waiter = true;
init_waitqueue_entry(&wait, current);
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
__add_wait_queue_exclusive(&ep->wq, &wait);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
for (;;) {
goto fetch_events;
if (waiter) {
- spin_lock_irq(&ep->wq.lock);
+ write_lock_irq(&ep->lock);
__remove_wait_queue(&ep->wq, &wait);
- spin_unlock_irq(&ep->wq.lock);
+ write_unlock_irq(&ep->lock);
}
return res;
__PAGEFLAG(Locked, locked, PF_NO_TAIL)
PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
-PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
+PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
PAGEFLAG(Referenced, referenced, PF_HEAD)
TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
__SETPAGEFLAG(Referenced, referenced, PF_HEAD)
extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff);
-void vmalloc_sync_all(void);
-
+void vmalloc_sync_mappings(void);
+void vmalloc_sync_unmappings(void);
+
/*
* Lowlevel-APIs (not for driver use!)
*/
int register_die_notifier(struct notifier_block *nb)
{
- vmalloc_sync_all();
+ vmalloc_sync_mappings();
return atomic_notifier_chain_register(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(register_die_notifier);
}
page = pmd_page(orig_pmd);
+
+ /* Do not interfere with other mappings of this page */
+ if (page_mapcount(page) != 1)
+ goto huge_unlock;
+
if (next - addr != HPAGE_PMD_SIZE) {
int err;
- if (page_mapcount(page) != 1)
- goto huge_unlock;
-
get_page(page);
spin_unlock(ptl);
lock_page(page);
continue;
}
+ /* Do not interfere with other mappings of this page */
+ if (page_mapcount(page) != 1)
+ continue;
+
VM_BUG_ON_PAGE(PageTransCompound(page), page);
if (pte_young(ptent)) {
#define MEMCG_DELAY_SCALING_SHIFT 14
/*
- * Scheduled by try_charge() to be executed from the userland return path
- * and reclaims memory over the high limit.
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
*/
-void mem_cgroup_handle_over_high(void)
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
{
- unsigned long usage, high, clamped_high;
- unsigned long pflags;
- unsigned long penalty_jiffies, overage;
- unsigned int nr_pages = current->memcg_nr_pages_over_high;
- struct mem_cgroup *memcg;
+ unsigned long penalty_jiffies;
+ u64 max_overage = 0;
- if (likely(!nr_pages))
- return;
+ do {
+ unsigned long usage, high;
+ u64 overage;
- memcg = get_mem_cgroup_from_mm(current->mm);
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
- current->memcg_nr_pages_over_high = 0;
+ usage = page_counter_read(&memcg->memory);
+ high = READ_ONCE(memcg->high);
+
+ /*
+ * Prevent division by 0 in overage calculation by acting as if
+ * it was a threshold of 1 page
+ */
+ high = max(high, 1UL);
+
+ overage = usage - high;
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+ overage = div64_u64(overage, high);
+
+ if (overage > max_overage)
+ max_overage = overage;
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+
+ if (!max_overage)
+ return 0;
/*
- * memory.high is breached and reclaim is unable to keep up. Throttle
- * allocators proactively to slow down excessive growth.
- *
* We use overage compared to memory.high to calculate the number of
* jiffies to sleep (penalty_jiffies). Ideally this value should be
* fairly lenient on small overages, and increasingly harsh when the
* its crazy behaviour, so we exponentially increase the delay based on
* overage amount.
*/
-
- usage = page_counter_read(&memcg->memory);
- high = READ_ONCE(memcg->high);
-
- if (usage <= high)
- goto out;
-
- /*
- * Prevent division by 0 in overage calculation by acting as if it was a
- * threshold of 1 page
- */
- clamped_high = max(high, 1UL);
-
- overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
- clamped_high);
-
- penalty_jiffies = ((u64)overage * overage * HZ)
- >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+ penalty_jiffies = max_overage * max_overage * HZ;
+ penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
+ penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
/*
* Factor in the task's own contribution to the overage, such that four
* application moving forwards and also permit diagnostics, albeit
* extremely slowly.
*/
- penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+ return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+}
+
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+ unsigned long penalty_jiffies;
+ unsigned long pflags;
+ unsigned int nr_pages = current->memcg_nr_pages_over_high;
+ struct mem_cgroup *memcg;
+
+ if (likely(!nr_pages))
+ return;
+
+ memcg = get_mem_cgroup_from_mm(current->mm);
+ reclaim_high(memcg, nr_pages, GFP_KERNEL);
+ current->memcg_nr_pages_over_high = 0;
+
+ /*
+ * memory.high is breached and reclaim is unable to keep up. Throttle
+ * allocators proactively to slow down excessive growth.
+ */
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages);
/*
* Don't sleep if the amount of jiffies this memcg owes us is so low
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
unsigned long usage;
- int i, j, size;
+ int i, j, size, entries;
mutex_lock(&memcg->thresholds_lock);
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
/* Calculate new number of threshold */
- size = 0;
+ size = entries = 0;
for (i = 0; i < thresholds->primary->size; i++) {
if (thresholds->primary->entries[i].eventfd != eventfd)
size++;
+ else
+ entries++;
}
new = thresholds->spare;
+ /* If no items related to eventfd have been cleared, nothing to do */
+ if (!entries)
+ goto unlock;
+
/* Set thresholds array to NULL if we don't have thresholds */
if (!size) {
kfree(new);
* ->release returns.
*/
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist)
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu))
/*
* If ->release runs before mmu_notifier_unregister it must be
* handled, as it's the only way for the driver to flush all
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
- &mm->notifier_subscriptions->list, hlist) {
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
if (subscription->ops->clear_flush_young)
young |= subscription->ops->clear_flush_young(
subscription, mm, start, end);
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
- &mm->notifier_subscriptions->list, hlist) {
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
if (subscription->ops->clear_young)
young |= subscription->ops->clear_young(subscription,
mm, start, end);
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
- &mm->notifier_subscriptions->list, hlist) {
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
if (subscription->ops->test_young) {
young = subscription->ops->test_young(subscription, mm,
address);
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
- &mm->notifier_subscriptions->list, hlist) {
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
if (subscription->ops->change_pte)
subscription->ops->change_pte(subscription, mm, address,
pte);
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
const struct mmu_notifier_ops *ops = subscription->ops;
if (ops->invalidate_range_start) {
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
/*
* Call invalidate_range here too to avoid the need for the
* subsystem of having to register an invalidate_range_end
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
- &mm->notifier_subscriptions->list, hlist) {
+ &mm->notifier_subscriptions->list, hlist,
+ srcu_read_lock_held(&srcu)) {
if (subscription->ops->invalidate_range)
subscription->ops->invalidate_range(subscription, mm,
start, end);
spin_lock(&mm->notifier_subscriptions->lock);
hlist_for_each_entry_rcu(subscription,
- &mm->notifier_subscriptions->list, hlist) {
+ &mm->notifier_subscriptions->list, hlist,
+ lockdep_is_held(&mm->notifier_subscriptions->lock)) {
if (subscription->ops != ops)
continue;
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
/*
- * Implement a stub for vmalloc_sync_all() if the architecture chose not to
- * have one.
+ * Implement a stub for vmalloc_sync_[un]mapping() if the architecture
+ * chose not to have one.
*/
-void __weak vmalloc_sync_all(void)
+void __weak vmalloc_sync_mappings(void)
+{
+}
+
+void __weak vmalloc_sync_unmappings(void)
{
}
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
- else if (!node_present_pages(node))
- searchnode = node_to_mem_node(node);
object = get_partial_node(s, get_node(s, searchnode), c, flags);
if (object || node != NUMA_NO_NODE)
struct page *page;
page = c->page;
- if (!page)
+ if (!page) {
+ /*
+ * if the node is not online or has no normal memory, just
+ * ignore the node constraint
+ */
+ if (unlikely(node != NUMA_NO_NODE &&
+ !node_state(node, N_NORMAL_MEMORY)))
+ node = NUMA_NO_NODE;
goto new_slab;
+ }
redo:
if (unlikely(!node_match(page, node))) {
- int searchnode = node;
-
- if (node != NUMA_NO_NODE && !node_present_pages(node))
- searchnode = node_to_mem_node(node);
-
- if (unlikely(!node_match(page, searchnode))) {
+ /*
+ * same as above but node_match() being false already
+ * implies node != NUMA_NO_NODE
+ */
+ if (!node_state(node, N_NORMAL_MEMORY)) {
+ node = NUMA_NO_NODE;
+ goto redo;
+ } else {
stat(s, ALLOC_NODE_MISMATCH);
deactivate_slab(s, page, c->freelist, c);
goto new_slab;
struct mem_section *ms = __pfn_to_section(pfn);
bool section_is_early = early_section(ms);
struct page *memmap = NULL;
+ bool empty;
unsigned long *subsection_map = ms->usage
? &ms->usage->subsection_map[0] : NULL;
* For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
*/
bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
- if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
+ empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION);
+ if (empty) {
unsigned long section_nr = pfn_to_section_nr(pfn);
/*
ms->usage = NULL;
}
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
- ms->section_mem_map = (unsigned long)NULL;
}
if (section_is_early && memmap)
free_map_bootmem(memmap);
else
depopulate_section_memmap(pfn, nr_pages, altmap);
+
+ if (empty)
+ ms->section_mem_map = (unsigned long)NULL;
}
static struct page * __meminit section_activate(int nid, unsigned long pfn,
* First make sure the mappings are removed from all page-tables
* before they are freed.
*/
- vmalloc_sync_all();
+ vmalloc_sync_unmappings();
/*
* TODO: to calculate a flush range without looping.
EXPORT_SYMBOL(remap_vmalloc_range);
/*
- * Implement a stub for vmalloc_sync_all() if the architecture chose not to
- * have one.
+ * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
+ * not to have one.
*
* The purpose of this function is to make sure the vmalloc area
* mappings are identical in all page-tables in the system.
*/
-void __weak vmalloc_sync_all(void)
+void __weak vmalloc_sync_mappings(void)
{
}
+void __weak vmalloc_sync_unmappings(void)
+{
+}
static int f(pte_t *pte, unsigned long addr, void *data)
{