Commit 89c83fb5 authored by Michal Hocko's avatar Michal Hocko Committed by Linus Torvalds
Browse files

mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask

THP allocation mode is quite complex and it depends on the defrag mode.
This complexity is hidden in alloc_hugepage_direct_gfpmask from a large
part currently. The NUMA special casing (namely __GFP_THISNODE) is
however independent and placed in alloc_pages_vma currently. This both
adds an unnecessary branch to all vma based page allocation requests and
it makes the code more complex unnecessarily as well. Not to mention
that e.g. shmem THP used to do the node reclaiming unconditionally
regardless of the defrag mode until recently. This was not only
unexpected behavior but it was also hardly a good default behavior and I
strongly suspect it was just a side effect of the code sharing more than
a deliberate decision which suggests that such a layering is wrong.

Get rid of the thp special casing from alloc_pages_vma and move the
logic to alloc_hugepage_direct_gfpmask. __GFP_THISNODE is applied to the
resulting gfp mask only when the direct reclaim is not requested and
when there is no explicit numa binding to preserve the current logic.

Please note that there's also a slight difference wrt MPOL_BIND now. The
previous code would avoid using __GFP_THISNODE if the local node was
outside of policy_nodemask(). After this patch __GFP_THISNODE is avoided
for all MPOL_BIND policies. So there's a difference that if local node
is actually allowed by the bind policy's nodemask, previously
__GFP_THISNODE would be added, but now it won't be. From the behavior
POV this is still correct because the policy nodemask is used.


Signed-off-by: default avatarMichal Hocko <>
Acked-by: default avatarVlastimil Babka <>
Cc: Alex Williamson <>
Cc: Andrea Arcangeli <>
Cc: David Rientjes <>
Cc: "Kirill A. Shutemov" <>
Cc: Mel Gorman <>
Cc: Stefan Priebe - Profihost AG <>
Cc: Zi Yan <>
Signed-off-by: default avatarAndrew Morton <>
Signed-off-by: default avatarLinus Torvalds <>
parent 6194ae42
......@@ -510,22 +510,18 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
struct vm_area_struct *vma, unsigned long addr,
int node, bool hugepage);
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
int node);
#define alloc_pages(gfp_mask, order) \
alloc_pages_node(numa_node_id(), gfp_mask, order)
#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
alloc_pages(gfp_mask, order)
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\
alloc_pages(gfp_mask, order)
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
#define alloc_page_vma(gfp_mask, vma, addr) \
alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
#define alloc_page_vma_node(gfp_mask, vma, addr, node) \
alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
alloc_pages_vma(gfp_mask, 0, vma, addr, node)
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
......@@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
struct mempolicy *get_task_policy(struct task_struct *p);
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
unsigned long addr);
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
unsigned long addr);
bool vma_policy_mof(struct vm_area_struct *vma);
extern void numa_default_policy(void);
......@@ -629,21 +629,40 @@ release:
* available
* never: never stall for any thp allocation
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
gfp_t this_node = 0;
struct mempolicy *pol;
* __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
* specified, to express a general desire to stay on the current
* node for optimistic allocation attempts. If the defrag mode
* and/or madvise hint requires the direct reclaim then we prefer
* to fallback to other node rather than node reclaim because that
* can lead to excessive reclaim even though there is free memory
* on other nodes. We expect that NUMA preferences are specified
* by memory policies.
pol = get_vma_policy(vma, addr);
if (pol->mode != MPOL_BIND)
this_node = __GFP_THISNODE;
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
__GFP_KSWAPD_RECLAIM | this_node);
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
return GFP_TRANSHUGE_LIGHT | this_node;
/* Caller must hold page table lock. */
......@@ -715,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
pte_free(vma->vm_mm, pgtable);
return ret;
gfp = alloc_hugepage_direct_gfpmask(vma);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
if (unlikely(!page)) {
......@@ -1286,8 +1305,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
huge_gfp = alloc_hugepage_direct_gfpmask(vma);
new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
haddr, numa_node_id());
} else
new_page = NULL;
......@@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start)
} else if (PageTransHuge(page)) {
struct page *thp;
thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
address, numa_node_id());
if (!thp)
return NULL;
......@@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
* freeing by another task. It is the caller's responsibility to free the
* extra reference for shared policies.
static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
struct mempolicy *pol = __get_vma_policy(vma, addr);
......@@ -2011,7 +2011,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
* @node: Which node to prefer for allocation (modulo policy).
* @hugepage: for hugepages try only the preferred node if possible
* This function allocates a page from the kernel page pool and applies
* a NUMA policy associated with the VMA or the current process.
......@@ -2022,7 +2021,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node, bool hugepage)
unsigned long addr, int node)
struct mempolicy *pol;
struct page *page;
......@@ -2040,60 +2039,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
goto out;
if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
int hpage_node = node;
* For hugepage allocation and non-interleave policy which
* allows the current node (or other explicitly preferred
* node) we only try to allocate from the current/preferred
* node and don't fall back to other nodes, as the cost of
* remote accesses would likely offset THP benefits.
* If the policy is interleave, or does not allow the current
* node in its nodemask, we allocate the standard way.
if (pol->mode == MPOL_PREFERRED &&
!(pol->flags & MPOL_F_LOCAL))
hpage_node = pol->v.preferred_node;
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
* We cannot invoke reclaim if __GFP_THISNODE
* is set. Invoking reclaim with
* __GFP_THISNODE set, would cause THP
* allocations to trigger heavy swapping
* despite there may be tons of free memory
* (including potentially plenty of THP
* already available in the buddy) on all the
* other NUMA nodes.
* At most we could invoke compaction when
* __GFP_THISNODE is set (but we would need to
* refrain from invoking reclaim even if
* compaction returned COMPACT_SKIPPED because
* there wasn't not enough memory to succeed
* compaction). For now just avoid
* __GFP_THISNODE instead of limiting the
* allocation path to a strict and single
* compaction invocation.
* Supposedly if direct reclaim was enabled by
* the caller, the app prefers THP regardless
* of the node it comes from so this would be
* more desiderable behavior than only
* providing THP originated from the local
* node in such case.
if (!(gfp & __GFP_DIRECT_RECLAIM))
gfp |= __GFP_THISNODE;
page = __alloc_pages_node(hpage_node, gfp, order);
goto out;
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
......@@ -1435,7 +1435,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
shmem_pseudo_vma_init(&pvma, info, hindex);
page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
if (page)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment