Commit ba836a6f authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'akpm' (patches from Andrew)

Merge fixes from Andrew Morton:
 "27 fixes.

  There are three patches that aren't actually fixes. They're simple
  function renamings which are nice-to-have in mainline as ongoing net
  development depends on them."

* akpm: (27 commits)
  timerfd: export defines to userspace
  mm/hugetlb.c: fix reservation race when freeing surplus pages
  mm/slab.c: fix SLAB freelist randomization duplicate entries
  zram: support BDI_CAP_STABLE_WRITES
  zram: revalidate disk under init_lock
  mm: support anonymous stable page
  mm: add documentation for page fragment APIs
  mm: rename __page_frag functions to __page_frag_cache, drop order from drain
  mm: rename __alloc_page_frag to page_frag_alloc and __free_page_frag to page_frag_free
  mm, memcg: fix the active list aging for lowmem requests when memcg is enabled
  mm: don't dereference struct page fields of invalid pages
  mailmap: add names for nameless email commits
  signal: protect SIGNAL_UNKILLABLE from unintentional clearing.
  mm: pmd dirty emulation in page fault handler
  ipc/sem.c: fix incorrect sem_lock pairing
  lib/Kconfig.debug: fix frv build failure
  mm: get rid of __GFP_OTHER_NODE
  mm: fix remote numa hits statistics
  mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}
  ocfs2: fix crash caused by stale lvb with fsdlm plugin
parents cff3b2c4 575b1967
......@@ -137,6 +137,7 @@ Ricardo Ribalda Delgado <>
Rudolf Marek <>
Rui Saraiva <>
Sachin P Sant <>
Sarangdhar Joshi <>
Sam Ravnborg <>
Santosh Shilimkar <>
Santosh Shilimkar <>
......@@ -150,10 +151,13 @@ Shuah Khan <> <>
Simon Kelley <>
Stéphane Witzmann <>
Stephen Hemminger <>
Subash Abhinov Kasiviswanathan <>
Subhash Jadavani <>
Sudeep Holla <> Sudeep KarkadaNagesha <>
Sumit Semwal <>
Tejun Heo <>
Thomas Graf <>
Thomas Pedersen <>
Tony Luck <>
Tsuneo Yoshioka <>
Uwe Kleine-König <>
Page fragments
A page fragment is an arbitrary-length arbitrary-offset area of memory
which resides within a 0 or higher order compound page. Multiple
fragments within that page are individually refcounted, in the page's
reference counter.
The page_frag functions, page_frag_alloc and page_frag_free, provide a
simple allocation framework for page fragments. This is used by the
network stack and network device drivers to provide a backing region of
memory for use as either an sk_buff->head, or to be used in the "frags"
portion of skb_shared_info.
In order to make use of the page fragment APIs a backing page fragment
cache is needed. This provides a central point for the fragment allocation
and tracks allows multiple calls to make use of a cached page. The
advantage to doing this is that multiple calls to get_page can be avoided
which can be expensive at allocation time. However due to the nature of
this caching it is required that any calls to the cache be protected by
either a per-cpu limitation, or a per-cpu limitation and forcing interrupts
to be disabled when executing the fragment allocation.
The network stack uses two separate caches per CPU to handle fragment
allocation. The netdev_alloc_cache is used by callers making use of the
__netdev_alloc_frag and __netdev_alloc_skb calls. The napi_alloc_cache is
used by callers of the __napi_alloc_frag and __napi_alloc_skb calls. The
main difference between these two calls is the context in which they may be
called. The "netdev" prefixed functions are usable in any context as these
functions will disable interrupts, while the "napi" prefixed functions are
only usable within the softirq context.
Many network device drivers use a similar methodology for allocating page
fragments, but the page fragments are cached at the ring or descriptor
level. In order to enable these cases it is necessary to provide a generic
way of tearing down a page cache. For this reason __page_frag_cache_drain
was implemented. It allows for freeing multiple references from a single
page via a single call. The advantage to doing this is that it allows for
cleaning up the multiple references that were added to a page in order to
avoid calling get_page per allocation.
Alexander Duyck, Nov 29, 2016.
......@@ -81,7 +81,6 @@ Descriptions of section entries:
Q: Patchwork web based patch tracking system site
T: SCM tree type and location.
Type is one of: git, hg, quilt, stgit, topgit
B: Bug tracking system location.
S: Status, one of the following:
Supported: Someone is actually paid to look after this.
Maintained: Someone actually looks after it.
......@@ -25,6 +25,7 @@
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/err.h>
......@@ -112,6 +113,14 @@ static inline bool is_partial_io(struct bio_vec *bvec)
return bvec->bv_len != PAGE_SIZE;
static void zram_revalidate_disk(struct zram *zram)
/* revalidate_disk reset the BDI_CAP_STABLE_WRITES so set again */
zram->disk->queue->backing_dev_info.capabilities |=
* Check if request is within bounds and aligned on zram logical blocks.
......@@ -1095,15 +1104,9 @@ static ssize_t disksize_store(struct device *dev,
zram->comp = comp;
zram->disksize = disksize;
set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
* Revalidate disk out of the init_lock to avoid lockdep splat.
* It's okay because disk's capacity is protected by init_lock
* so that revalidate_disk always sees up-to-date capacity.
return len;
......@@ -1149,7 +1152,7 @@ static ssize_t reset_store(struct device *dev,
/* Make sure all the pending I/O are finished */
......@@ -3962,8 +3962,8 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
__page_frag_drain(buffer_info->page, 0,
buffer_info->page = NULL;
......@@ -6991,7 +6991,7 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring,
dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
__page_frag_drain(page, 0, rx_buffer->pagecnt_bias);
__page_frag_cache_drain(page, rx_buffer->pagecnt_bias);
/* clear contents of rx_buffer */
......@@ -691,8 +691,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
pgoff_t index, unsigned long pfn)
struct vm_area_struct *vma;
pte_t *ptep;
pte_t pte;
pte_t pte, *ptep = NULL;
pmd_t *pmdp = NULL;
spinlock_t *ptl;
bool changed;
......@@ -707,21 +707,42 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
address = pgoff_address(index, vma);
changed = false;
if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
if (pfn != pte_pfn(*ptep))
goto unlock;
if (!pte_dirty(*ptep) && !pte_write(*ptep))
goto unlock;
flush_cache_page(vma, address, pfn);
pte = ptep_clear_flush(vma, address, ptep);
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
set_pte_at(vma->vm_mm, address, ptep, pte);
changed = true;
pte_unmap_unlock(ptep, ptl);
if (pmdp) {
pmd_t pmd;
if (pfn != pmd_pfn(*pmdp))
goto unlock_pmd;
if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
goto unlock_pmd;
flush_cache_page(vma, address, pfn);
pmd = pmdp_huge_clear_flush(vma, address, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
changed = true;
} else {
if (pfn != pte_pfn(*ptep))
goto unlock_pte;
if (!pte_dirty(*ptep) && !pte_write(*ptep))
goto unlock_pte;
flush_cache_page(vma, address, pfn);
pte = ptep_clear_flush(vma, address, ptep);
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
set_pte_at(vma->vm_mm, address, ptep, pte);
changed = true;
pte_unmap_unlock(ptep, ptl);
if (changed)
mmu_notifier_invalidate_page(vma->vm_mm, address);
......@@ -3303,6 +3303,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
lockres->l_level, new_level);
* On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always
* expects DLM_LKF_VALBLK being set if the LKB has LVB, so that
* we can recover correctly from node failure. Otherwise, we may get
* invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
if (!ocfs2_is_o2cb_active() &&
lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
lvb = 1;
if (lvb)
dlm_flags |= DLM_LKF_VALBLK;
......@@ -48,6 +48,12 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
static struct ocfs2_stack_plugin *active_stack;
inline int ocfs2_is_o2cb_active(void)
return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB);
static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
struct ocfs2_stack_plugin *p;
......@@ -298,6 +298,9 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
int ocfs2_is_o2cb_active(void);
extern struct kset *ocfs2_kset;
#endif /* STACKGLUE_H */
......@@ -38,9 +38,8 @@ struct vm_area_struct;
#define ___GFP_ACCOUNT 0x100000u
#define ___GFP_NOTRACK 0x200000u
#define ___GFP_DIRECT_RECLAIM 0x400000u
#define ___GFP_OTHER_NODE 0x800000u
#define ___GFP_WRITE 0x1000000u
#define ___GFP_KSWAPD_RECLAIM 0x2000000u
#define ___GFP_WRITE 0x800000u
#define ___GFP_KSWAPD_RECLAIM 0x1000000u
/* If the above are modified, __GFP_BITS_SHIFT may need updating */
......@@ -172,11 +171,6 @@ struct vm_area_struct;
* __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of
* distinguishing in the source between false positives and allocations that
* cannot be supported (e.g. page tables).
* __GFP_OTHER_NODE is for allocations that are on a remote node but that
* should not be accounted for as a remote allocation in vmstat. A
* typical user would be khugepaged collapsing a huge page on a remote
* node.
#define __GFP_COLD ((__force gfp_t)___GFP_COLD)
#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN)
......@@ -184,10 +178,9 @@ struct vm_area_struct;
#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO)
#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK)
#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE)
/* Room for N __GFP_FOO bits */
#define __GFP_BITS_SHIFT 26
#define __GFP_BITS_SHIFT 25
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
......@@ -506,11 +499,10 @@ extern void free_hot_cold_page(struct page *page, bool cold);
extern void free_hot_cold_page_list(struct list_head *list, bool cold);
struct page_frag_cache;
extern void __page_frag_drain(struct page *page, unsigned int order,
unsigned int count);
extern void *__alloc_page_frag(struct page_frag_cache *nc,
unsigned int fragsz, gfp_t gfp_mask);
extern void __free_page_frag(void *addr);
extern void __page_frag_cache_drain(struct page *page, unsigned int count);
extern void *page_frag_alloc(struct page_frag_cache *nc,
unsigned int fragsz, gfp_t gfp_mask);
extern void page_frag_free(void *addr);
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)
......@@ -120,7 +120,7 @@ struct mem_cgroup_reclaim_iter {
struct mem_cgroup_per_node {
struct lruvec lruvec;
unsigned long lru_size[NR_LRU_LISTS];
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
......@@ -432,7 +432,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
int nr_pages);
int zid, int nr_pages);
unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
int nid, unsigned int lru_mask);
......@@ -441,9 +441,23 @@ static inline
unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
struct mem_cgroup_per_node *mz;
unsigned long nr_pages = 0;
int zid;
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
return mz->lru_size[lru];
for (zid = 0; zid < MAX_NR_ZONES; zid++)
nr_pages += mz->lru_zone_size[zid][lru];
return nr_pages;
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
enum lru_list lru, int zone_idx)
struct mem_cgroup_per_node *mz;
mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
return mz->lru_zone_size[zone_idx][lru];
void mem_cgroup_handle_over_high(void);
......@@ -671,6 +685,12 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
return 0;
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
enum lru_list lru, int zone_idx)
return 0;
static inline unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
......@@ -1210,8 +1210,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma);
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
spinlock_t **ptlp);
int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp);
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn);
int follow_phys(struct vm_area_struct *vma, unsigned long address,
......@@ -39,7 +39,7 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
__update_lru_size(lruvec, lru, zid, nr_pages);
mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
......@@ -854,6 +854,16 @@ struct signal_struct {
#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */
static inline void signal_set_stop_flags(struct signal_struct *sig,
unsigned int flags)
sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags;
/* If true, all threads except ->group_exit_task have pending SIGKILL */
static inline int signal_group_exit(const struct signal_struct *sig)
......@@ -2480,7 +2480,7 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
static inline void skb_free_frag(void *addr)
void *napi_alloc_frag(unsigned int fragsz);
......@@ -226,7 +226,7 @@ static inline const char *__check_heap_object(const void *ptr,
* (PAGE_SIZE*2). Larger requests are passed to the page allocator.
......@@ -239,7 +239,7 @@ static inline const char *__check_heap_object(const void *ptr,
* be allocated from the same page.
......@@ -150,8 +150,9 @@ enum {
SWP_FILE = (1 << 7), /* set after swap_activate success */
SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */
SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */
/* add others here before... */
SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */
SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */
......@@ -8,23 +8,7 @@
#include <linux/fcntl.h>
/* For _IO helpers */
#include <linux/ioctl.h>
* CAREFUL: Check include/asm-generic/fcntl.h when defining
* new flags, since they might collide with O_* ones. We want
* to re-use O_* flags that couldn't possibly have a meaning
* from eventfd, in order to leave a free define-space for
* shared O_* flags.
#define TFD_TIMER_ABSTIME (1 << 0)
#define TFD_TIMER_CANCEL_ON_SET (1 << 1)
#include <uapi/linux/timerfd.h>
/* Flags for timerfd_create. */
......@@ -32,6 +16,4 @@
/* Flags for timerfd_settime. */
#define TFD_IOC_SET_TICKS _IOW('T', 0, u64)
#endif /* _LINUX_TIMERFD_H */
......@@ -47,8 +47,7 @@
{(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \
{(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \
{(unsigned long)__GFP_OTHER_NODE, "__GFP_OTHER_NODE"} \
#define show_gfp_flags(flags) \
(flags) ? __print_flags(flags, "|", \
......@@ -414,6 +414,7 @@ header-y += telephony.h
header-y += termios.h
header-y += thermal.h
header-y += time.h
header-y += timerfd.h
header-y += times.h
header-y += timex.h
header-y += tiocl.h
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment