Commit 5d190420 authored by Aaron Lu, committed by Linus Torvalds
Browse files

mremap: fix race between mremap() and page cleaning

Prior to 3.15, there was a race between zap_pte_range() and
page_mkclean() where writes to a page could be lost.  Dave Hansen
discovered by inspection that there is a similar race between
move_ptes() and page_mkclean().

We've been able to reproduce the issue by enlarging the race window with
a msleep(), but have not been able to hit it without modifying the code.
So, we think it's a real issue, but one that is difficult or impossible
to hit in practice.

The zap_pte_range() issue was fixed by commit 1cf35d47 ("mm: split
'tlb_flush_mmu()' into tlb flushing and memory freeing parts").  This
patch fixes the race between page_mkclean() and mremap().

Here is one possible way to hit the race: suppose a process has mmapped a
file with READ | WRITE and SHARED, and it has two threads bound to two
different CPUs, e.g. CPU1 and CPU2.  mmap returned X, then thread 1
wrote to addr X, so that CPU1 now has a writable TLB entry for addr X.
Thread 2 starts ...
parent 961b708e
......@@ -22,7 +22,7 @@ extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned char *vec);
extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd);
pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush);
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot,
int prot_numa);
......
......@@ -1426,11 +1426,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
{
spinlock_t *old_ptl, *new_ptl;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
bool force_flush = false;
if ((old_addr & ~HPAGE_PMD_MASK) ||
(new_addr & ~HPAGE_PMD_MASK) ||
......@@ -1455,6 +1456,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
if (pmd_present(*old_pmd) && pmd_dirty(*old_pmd))
force_flush = true;
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
......@@ -1467,6 +1470,10 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
if (force_flush)
flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
else
*need_flush = true;
spin_unlock(old_ptl);
return true;
}
......
......@@ -104,11 +104,13 @@ static pte_t move_soft_dirty_pte(pte_t pte)
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
unsigned long old_addr, unsigned long old_end,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
unsigned long new_addr, bool need_rmap_locks)
unsigned long new_addr, bool need_rmap_locks, bool *need_flush)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
bool force_flush = false;
unsigned long len = old_end - old_addr;
/*
* When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
......@@ -146,6 +148,14 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
new_pte++, new_addr += PAGE_SIZE) {
if (pte_none(*old_pte))
continue;
/*
* We are remapping a dirty PTE, make sure to
* flush TLB before we drop the PTL for the
* old PTE or we may race with page_mkclean().
*/
if (pte_present(*old_pte) && pte_dirty(*old_pte))
force_flush = true;
pte = ptep_get_and_clear(mm, old_addr, old_pte);
pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
pte = move_soft_dirty_pte(pte);
......@@ -156,6 +166,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
pte_unmap(new_pte - 1);
if (force_flush)
flush_tlb_range(vma, old_end - len, old_end);
else
*need_flush = true;
pte_unmap_unlock(old_pte - 1, old_ptl);
if (need_rmap_locks)
drop_rmap_locks(vma);
......@@ -201,13 +215,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (need_rmap_locks)
take_rmap_locks(vma);
moved = move_huge_pmd(vma, old_addr, new_addr,
old_end, old_pmd, new_pmd);
old_end, old_pmd, new_pmd,
&need_flush);
if (need_rmap_locks)
drop_rmap_locks(vma);
if (moved) {
need_flush = true;
if (moved)
continue;
}
}
split_huge_pmd(vma, old_pmd, old_addr);
if (pmd_trans_unstable(old_pmd))
......@@ -220,11 +233,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
extent = next - new_addr;
if (extent > LATENCY_LIMIT)
extent = LATENCY_LIMIT;
move_ptes(vma, old_pmd, old_addr, old_addr + extent,
new_vma, new_pmd, new_addr, need_rmap_locks);
need_flush = true;
move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
new_pmd, new_addr, need_rmap_locks, &need_flush);
}
if (likely(need_flush))
if (need_flush)
flush_tlb_range(vma, old_end-len, old_addr);
mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment