Commit 38d8b4e6 authored by Huang Ying's avatar Huang Ying Committed by Linus Torvalds
Browse files

mm, THP, swap: delay splitting THP during swap out

Patch series "THP swap: Delay splitting THP during swapping out", v11.

This patchset is to optimize the performance of Transparent Huge Page
(THP) swap.

Recently, the performance of the storage devices improved so fast that
we cannot saturate the disk bandwidth with single logical CPU when do
page swap out even on a high-end server machine.  Because the
performance of the storage device improved faster than that of single
logical CPU.  And it seems that the trend will not change in the near
future.  On the other hand, the THP becomes more and more popular
because of increased memory size.  So it becomes necessary to optimize
THP swap performance.

The advantages of the THP swap support include:

 - Batch the swap operations for the THP to reduce lock
   acquiring/releasing, including allocating/freeing the swap space,
   adding/deleting to/from the swap cache, and writing/reading the swap
   space, etc. This will help improve the performance of the THP ...
parent 9d85e15f
......@@ -72,6 +72,7 @@ config X86
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANTS_THP_SWAP if X86_64
select BUILDTIME_EXTABLE_SORT
select CLKEVT_I8253
select CLOCKSOURCE_VALIDATE_LAST_CYCLE
......
......@@ -326,11 +326,14 @@ PAGEFLAG_FALSE(HighMem)
#ifdef CONFIG_SWAP
static __always_inline int PageSwapCache(struct page *page)
{
#ifdef CONFIG_THP_SWAP
page = compound_head(page);
#endif
return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags);
}
SETPAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND)
SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL)
#else
PAGEFLAG_FALSE(SwapCache)
#endif
......
......@@ -386,9 +386,9 @@ static inline long get_nr_swap_pages(void)
}
extern void si_swapinfo(struct sysinfo *);
extern swp_entry_t get_swap_page(void);
extern swp_entry_t get_swap_page(struct page *page);
extern swp_entry_t get_swap_page_of_type(int);
extern int get_swap_pages(int n, swp_entry_t swp_entries[]);
extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t);
extern int swap_duplicate(swp_entry_t);
......@@ -515,7 +515,7 @@ static inline int try_to_free_swap(struct page *page)
return 0;
}
static inline swp_entry_t get_swap_page(void)
static inline swp_entry_t get_swap_page(struct page *page)
{
swp_entry_t entry;
entry.val = 0;
......@@ -548,7 +548,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
#ifdef CONFIG_MEMCG_SWAP
extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
extern void mem_cgroup_uncharge_swap(swp_entry_t entry);
extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
extern bool mem_cgroup_swap_full(struct page *page);
#else
......@@ -562,7 +562,8 @@ static inline int mem_cgroup_try_charge_swap(struct page *page,
return 0;
}
static inline void mem_cgroup_uncharge_swap(swp_entry_t entry)
static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
unsigned int nr_pages)
{
}
......@@ -577,5 +578,13 @@ static inline bool mem_cgroup_swap_full(struct page *page)
}
#endif
#ifdef CONFIG_THP_SWAP
extern void swapcache_free_cluster(swp_entry_t entry);
#else
static inline void swapcache_free_cluster(swp_entry_t entry)
{
}
#endif
#endif /* __KERNEL__*/
#endif /* _LINUX_SWAP_H */
......@@ -7,7 +7,8 @@
extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
unsigned short old, unsigned short new);
extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id);
extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
unsigned int nr_ents);
extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
extern int swap_cgroup_swapon(int type, unsigned long max_pages);
extern void swap_cgroup_swapoff(int type);
......@@ -15,7 +16,8 @@ extern void swap_cgroup_swapoff(int type);
#else
static inline
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
unsigned int nr_ents)
{
return 0;
}
......
......@@ -446,6 +446,18 @@ choice
benefit.
endchoice
config ARCH_WANTS_THP_SWAP
def_bool n
config THP_SWAP
def_bool y
depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP
help
Swap transparent huge pages in one piece, without splitting.
XXX: For now this only does clustered swap space allocation.
For selection by architectures with reasonable THP sizes.
config TRANSPARENT_HUGE_PAGECACHE
def_bool y
depends on TRANSPARENT_HUGEPAGE
......
......@@ -2203,7 +2203,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
* atomic_set() here would be safe on all archs (and not only on x86),
* it's safer to use atomic_inc()/atomic_add().
*/
if (PageAnon(head)) {
if (PageAnon(head) && !PageSwapCache(head)) {
page_ref_inc(page_tail);
} else {
/* Additional pin to radix tree */
......@@ -2214,6 +2214,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
page_tail->flags |= (head->flags &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
(1L << PG_swapcache) |
(1L << PG_mlocked) |
(1L << PG_uptodate) |
(1L << PG_active) |
......@@ -2276,7 +2277,11 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageCompound(head);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
page_ref_inc(head);
/* Additional pin to radix tree of swap cache */
if (PageSwapCache(head))
page_ref_add(head, 2);
else
page_ref_inc(head);
} else {
/* Additional pin to radix tree */
page_ref_add(head, 2);
......@@ -2432,7 +2437,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
ret = -EBUSY;
goto out;
}
extra_pins = 0;
extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
......
......@@ -2376,10 +2376,9 @@ void mem_cgroup_split_huge_fixup(struct page *head)
#ifdef CONFIG_MEMCG_SWAP
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
int nr_entries)
{
int val = (charge) ? 1 : -1;
this_cpu_add(memcg->stat->count[MEMCG_SWAP], val);
this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries);
}
/**
......@@ -2405,8 +2404,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
new_id = mem_cgroup_id(to);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
mem_cgroup_swap_statistics(to, true);
mem_cgroup_swap_statistics(from, -1);
mem_cgroup_swap_statistics(to, 1);
return 0;
}
return -EINVAL;
......@@ -5445,7 +5444,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
* let's not wait for it. The page already received a
* memory+swap charge, drop the swap entry duplicate.
*/
mem_cgroup_uncharge_swap(entry);
mem_cgroup_uncharge_swap(entry, nr_pages);
}
}
......@@ -5873,9 +5872,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* ancestor for the swap instead and transfer the memory+swap charge.
*/
swap_memcg = mem_cgroup_id_get_online(memcg);
oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg));
oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1);
VM_BUG_ON_PAGE(oldid, page);
mem_cgroup_swap_statistics(swap_memcg, true);
mem_cgroup_swap_statistics(swap_memcg, 1);
page->mem_cgroup = NULL;
......@@ -5902,19 +5901,20 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
css_put(&memcg->css);
}
/*
* mem_cgroup_try_charge_swap - try charging a swap entry
/**
* mem_cgroup_try_charge_swap - try charging swap space for a page
* @page: page being added to swap
* @entry: swap entry to charge
*
* Try to charge @entry to the memcg that @page belongs to.
* Try to charge @page's memcg for the swap space at @entry.
*
* Returns 0 on success, -ENOMEM on failure.
*/
int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
{
struct mem_cgroup *memcg;
unsigned int nr_pages = hpage_nr_pages(page);
struct page_counter *counter;
struct mem_cgroup *memcg;
unsigned short oldid;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
......@@ -5929,25 +5929,27 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
memcg = mem_cgroup_id_get_online(memcg);
if (!mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, 1, &counter)) {
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
mem_cgroup_id_put(memcg);
return -ENOMEM;
}
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
/* Get references for the tail pages, too */
if (nr_pages > 1)
mem_cgroup_id_get_many(memcg, nr_pages - 1);
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
VM_BUG_ON_PAGE(oldid, page);
mem_cgroup_swap_statistics(memcg, true);
mem_cgroup_swap_statistics(memcg, nr_pages);
return 0;
}
/**
* mem_cgroup_uncharge_swap - uncharge a swap entry
* mem_cgroup_uncharge_swap - uncharge swap space
* @entry: swap entry to uncharge
*
* Drop the swap charge associated with @entry.
* @nr_pages: the amount of swap space to uncharge
*/
void mem_cgroup_uncharge_swap(swp_entry_t entry)
void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
struct mem_cgroup *memcg;
unsigned short id;
......@@ -5955,18 +5957,18 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
if (!do_swap_account)
return;
id = swap_cgroup_record(entry, 0);
id = swap_cgroup_record(entry, 0, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
if (!mem_cgroup_is_root(memcg)) {
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->swap, 1);
page_counter_uncharge(&memcg->swap, nr_pages);
else
page_counter_uncharge(&memcg->memsw, 1);
page_counter_uncharge(&memcg->memsw, nr_pages);
}
mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_id_put(memcg);
mem_cgroup_swap_statistics(memcg, -nr_pages);
mem_cgroup_id_put_many(memcg, nr_pages);
}
rcu_read_unlock();
}
......
......@@ -1291,7 +1291,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
SetPageUptodate(page);
}
swap = get_swap_page();
swap = get_swap_page(page);
if (!swap.val)
goto redirty;
......
......@@ -61,21 +61,27 @@ not_enough_page:
return -ENOMEM;
}
static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl,
pgoff_t offset)
{
struct page *mappage;
struct swap_cgroup *sc;
mappage = ctrl->map[offset / SC_PER_PAGE];
sc = page_address(mappage);
return sc + offset % SC_PER_PAGE;
}
static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
struct swap_cgroup_ctrl **ctrlp)
{
pgoff_t offset = swp_offset(ent);
struct swap_cgroup_ctrl *ctrl;
struct page *mappage;
struct swap_cgroup *sc;
ctrl = &swap_cgroup_ctrl[swp_type(ent)];
if (ctrlp)
*ctrlp = ctrl;
mappage = ctrl->map[offset / SC_PER_PAGE];
sc = page_address(mappage);
return sc + offset % SC_PER_PAGE;
return __lookup_swap_cgroup(ctrl, offset);
}
/**
......@@ -108,25 +114,39 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
}
/**
* swap_cgroup_record - record mem_cgroup for this swp_entry.
* @ent: swap entry to be recorded into
* swap_cgroup_record - record mem_cgroup for a set of swap entries
* @ent: the first swap entry to be recorded into
* @id: mem_cgroup to be recorded
* @nr_ents: number of swap entries to be recorded
*
* Returns old value at success, 0 at failure.
* (Of course, old value can be 0.)
*/
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
unsigned int nr_ents)
{
struct swap_cgroup_ctrl *ctrl;
struct swap_cgroup *sc;
unsigned short old;
unsigned long flags;
pgoff_t offset = swp_offset(ent);
pgoff_t end = offset + nr_ents;
sc = lookup_swap_cgroup(ent, &ctrl);
spin_lock_irqsave(&ctrl->lock, flags);
old = sc->id;
sc->id = id;
for (;;) {
VM_BUG_ON(sc->id != old);
sc->id = id;
offset++;
if (offset == end)
break;
if (offset % SC_PER_PAGE)
sc++;
else
sc = __lookup_swap_cgroup(ctrl, offset);
}
spin_unlock_irqrestore(&ctrl->lock, flags);
return old;
......
......@@ -263,7 +263,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
cache->cur = 0;
if (swap_slot_cache_active)
cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots);
cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false,
cache->slots);
return cache->nr;
}
......@@ -301,11 +302,19 @@ direct_free:
return 0;
}
swp_entry_t get_swap_page(void)
swp_entry_t get_swap_page(struct page *page)
{
swp_entry_t entry, *pentry;
struct swap_slots_cache *cache;
entry.val = 0;
if (PageTransHuge(page)) {
if (IS_ENABLED(CONFIG_THP_SWAP))
get_swap_pages(1, true, &entry);
return entry;
}
/*
* Preemption is allowed here, because we may sleep
* in refill_swap_slots_cache(). But it is safe, because
......@@ -317,7 +326,6 @@ swp_entry_t get_swap_page(void)
*/
cache = raw_cpu_ptr(&swp_slots);
entry.val = 0;
if (check_cache_active()) {
mutex_lock(&cache->alloc_lock);
if (cache->slots) {
......@@ -337,7 +345,7 @@ repeat:
return entry;
}
get_swap_pages(1, &entry);
get_swap_pages(1, false, &entry);
return entry;
}
......
......@@ -19,6 +19,7 @@
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
#include <asm/pgtable.h>
......@@ -38,6 +39,7 @@ struct address_space *swapper_spaces[MAX_SWAPFILES];
static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0)
static struct {
unsigned long add_total;
......@@ -90,39 +92,46 @@ void show_swap_cache_info(void)
*/
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
int error;
int error, i, nr = hpage_nr_pages(page);
struct address_space *address_space;
pgoff_t idx = swp_offset(entry);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
get_page(page);
page_ref_add(page, nr);
SetPageSwapCache(page);
set_page_private(page, entry.val);
address_space = swap_address_space(entry);
spin_lock_irq(&address_space->tree_lock);
error = radix_tree_insert(&address_space->page_tree,
swp_offset(entry), page);
if (likely(!error)) {
address_space->nrpages++;
__inc_node_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(add_total);
for (i = 0; i < nr; i++) {
set_page_private(page + i, entry.val + i);
error = radix_tree_insert(&address_space->page_tree,
idx + i, page + i);
if (unlikely(error))
break;
}
spin_unlock_irq(&address_space->tree_lock);
if (unlikely(error)) {
if (likely(!error)) {
address_space->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
ADD_CACHE_INFO(add_total, nr);
} else {
/*
* Only the context which have set SWAP_HAS_CACHE flag
* would call add_to_swap_cache().
* So add_to_swap_cache() doesn't returns -EEXIST.
*/
VM_BUG_ON(error == -EEXIST);
set_page_private(page, 0UL);
set_page_private(page + i, 0UL);
while (i--) {
radix_tree_delete(&address_space->page_tree, idx + i);
set_page_private(page + i, 0UL);
}
ClearPageSwapCache(page);
put_page(page);
page_ref_sub(page, nr);
}
spin_unlock_irq(&address_space->tree_lock);
return error;
}
......@@ -132,7 +141,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
int error;
error = radix_tree_maybe_preload(gfp_mask);
error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
if (!error) {
error = __add_to_swap_cache(page, entry);
radix_tree_preload_end();
......@@ -146,8 +155,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
*/
void __delete_from_swap_cache(struct page *page)
{
swp_entry_t entry;
struct address_space *address_space;
int i, nr = hpage_nr_pages(page);
swp_entry_t entry;
pgoff_t idx;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
......@@ -155,12 +166,15 @@ void __delete_from_swap_cache(struct page *page)
entry.val = page_private(page);
address_space = swap_address_space(entry);
radix_tree_delete(&address_space->page_tree, swp_offset(entry));
set_page_private(page, 0);
idx = swp_offset(entry);
for (i = 0; i < nr; i++) {
radix_tree_delete(&address_space->page_tree, idx + i);
set_page_private(page + i, 0);
}
ClearPageSwapCache(page);
address_space->nrpages--;
__dec_node_page_state(page, NR_FILE_PAGES);
INC_CACHE_INFO(del_total);
address_space->nrpages -= nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
ADD_CACHE_INFO(del_total, nr);
}
/**
......@@ -178,20 +192,12 @@ int add_to_swap(struct page *page, struct list_head *list)
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageUptodate(page), page);
entry = get_swap_page();
retry:
entry = get_swap_page(page);
if (!entry.val)
return 0;
if (mem_cgroup_try_charge_swap(page, entry)) {
swapcache_free(entry);
return 0;
}
if (unlikely(PageTransHuge(page)))
if (unlikely(split_huge_page_to_list(page, list))) {
swapcache_free(entry);
return 0;
}
goto fail;
if (mem_cgroup_try_charge_swap(page, entry))
goto fail_free;
/*
* Radix-tree node allocations from PF_MEMALLOC contexts could
......@@ -206,17 +212,33 @@ int add_to_swap(struct page *page, struct list_head *list)
*/
err = add_to_swap_cache(page, entry,
__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
if (!err) {
return 1;
} else { /* -ENOMEM radix-tree allocation failure */
/* -ENOMEM radix-tree allocation failure */
if (err)
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
*/
swapcache_free(entry);
return 0;
goto fail_free;
if (PageTransHuge(page)) {
err = split_huge_page_to_list(page, list);
if (err) {
delete_from_swap_cache(page);
return 0;
}
}
return 1;
fail_free:
if (PageTransHuge(page))
swapcache_free_cluster(entry);
else
swapcache_free(entry);
fail:
if (PageTransHuge(page) && !split_huge_page_to_list(page, list))
goto retry;
return 0;
}
/*
......@@ -237,8 +259,12 @@ void delete_from_swap_cache(struct page *page)
__delete_from_swap_cache(page);
spin_unlock_irq(&address_space->tree_lock);
swapcache_free(entry);
put_page(page);
if (PageTransHuge(page))
swapcache_free_cluster(entry);
else
swapcache_free(entry);
page_ref_sub(page, hpage_nr_pages(page));
}
/*
......@@ -295,7 +321,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
page = find_get_page(swap_address_space(entry), swp_offset(entry));
if (page) {
if (page && likely(!PageTransCompound(page))) {
INC_CACHE_INFO(find_success);