Commit 58956317 authored by David Ahern's avatar David Ahern Committed by David S. Miller
Browse files

neighbor: Improve garbage collection



The existing garbage collection algorithm has a number of problems:

1. The gc algorithm will not evict PERMANENT entries as those entries
   are managed by userspace, yet the existing algorithm walks the entire
   hash table which means it always considers PERMANENT entries when
   looking for entries to evict. In some use cases (e.g., EVPN) there
   can be tens of thousands of PERMANENT entries leading to wasted
   CPU cycles when gc kicks in. As an example, with 32k permanent
   entries, neigh_alloc has been observed taking more than 4 msec per
   invocation.

2. Currently, when the number of neighbor entries hits gc_thresh2 and
   the last flush for the table was more than 5 seconds ago gc kicks in
   walks the entire hash table evicting *all* entries not in PERMANENT
   or REACHABLE state and not marked as externally learned. There is no
   discriminator on when the neigh entry was created or if it just moved
   from REACHABLE to another NUD_VALID state (e.g., NUD_STALE).

   It is possible for entries to be created or for established neighbor
   entries to be moved to STALE (e.g., an external node sends an ARP
   request) right before the 5 second window lapses:

        -----|---------x|----------|-----
            t-5         t         t+5

   If that happens those entries are evicted during gc causing unnecessary
   thrashing on neighbor entries and userspace caches trying to track them.

   Further, this contradicts the description of gc_thresh2 which says
   "Entries older than 5 seconds will be cleared".

   One workaround is to make gc_thresh2 == gc_thresh3 but that negates the
   whole point of having separate thresholds.

3. Clearing *all* neigh non-PERMANENT/REACHABLE/externally learned entries
   when gc_thresh2 is exceeded is over kill and contributes to trashing
   especially during startup.

This patch addresses these problems as follows:

1. Use of a separate list_head to track entries that can be garbage
   collected along with a separate counter. PERMANENT entries are not
   added to this list.

   The gc_thresh parameters are only compared to the new counter, not the
   total entries in the table. The forced_gc function is updated to only
   walk this new gc_list looking for entries to evict.

2. Entries are added to the list head at the tail and removed from the
   front.

3. Entries are only evicted if they were last updated more than 5 seconds
   ago, adhering to the original intent of gc_thresh2.

4. Forced gc is stopped once the number of gc_entries drops below
   gc_thresh2.

5. Since gc checks do not apply to PERMANENT entries, gc levels are skipped
   when allocating a new neighbor for a PERMANENT entry. By extension this
   means there are no explicit limits on the number of PERMANENT entries
   that can be created, but this is no different than FIB entries or FDB
   entries.
Signed-off-by: default avatarDavid Ahern <dsahern@gmail.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 12edfdfc
...@@ -108,8 +108,8 @@ neigh/default/gc_thresh2 - INTEGER ...@@ -108,8 +108,8 @@ neigh/default/gc_thresh2 - INTEGER
Default: 512 Default: 512
neigh/default/gc_thresh3 - INTEGER neigh/default/gc_thresh3 - INTEGER
Maximum number of neighbor entries allowed. Increase this Maximum number of non-PERMANENT neighbor entries allowed. Increase
when using large numbers of interfaces and when communicating this when using large numbers of interfaces and when communicating
with large numbers of directly-connected peers. with large numbers of directly-connected peers.
Default: 1024 Default: 1024
......
...@@ -154,6 +154,7 @@ struct neighbour { ...@@ -154,6 +154,7 @@ struct neighbour {
struct hh_cache hh; struct hh_cache hh;
int (*output)(struct neighbour *, struct sk_buff *); int (*output)(struct neighbour *, struct sk_buff *);
const struct neigh_ops *ops; const struct neigh_ops *ops;
struct list_head gc_list;
struct rcu_head rcu; struct rcu_head rcu;
struct net_device *dev; struct net_device *dev;
u8 primary_key[0]; u8 primary_key[0];
...@@ -214,6 +215,8 @@ struct neigh_table { ...@@ -214,6 +215,8 @@ struct neigh_table {
struct timer_list proxy_timer; struct timer_list proxy_timer;
struct sk_buff_head proxy_queue; struct sk_buff_head proxy_queue;
atomic_t entries; atomic_t entries;
atomic_t gc_entries;
struct list_head gc_list;
rwlock_t lock; rwlock_t lock;
unsigned long last_rand; unsigned long last_rand;
struct neigh_statistics __percpu *stats; struct neigh_statistics __percpu *stats;
......
...@@ -118,6 +118,34 @@ unsigned long neigh_rand_reach_time(unsigned long base) ...@@ -118,6 +118,34 @@ unsigned long neigh_rand_reach_time(unsigned long base)
} }
EXPORT_SYMBOL(neigh_rand_reach_time); EXPORT_SYMBOL(neigh_rand_reach_time);
static void neigh_mark_dead(struct neighbour *n)
{
n->dead = 1;
if (!list_empty(&n->gc_list)) {
list_del_init(&n->gc_list);
atomic_dec(&n->tbl->gc_entries);
}
}
static void neigh_change_state(struct neighbour *n, u8 new)
{
bool on_gc_list = !list_empty(&n->gc_list);
bool new_is_perm = new & NUD_PERMANENT;
n->nud_state = new;
/* remove from the gc list if new state is permanent;
* add to the gc list if new state is not permanent
*/
if (new_is_perm && on_gc_list) {
list_del_init(&n->gc_list);
atomic_dec(&n->tbl->gc_entries);
} else if (!new_is_perm && !on_gc_list) {
/* add entries to the tail; cleaning removes from the front */
list_add_tail(&n->gc_list, &n->tbl->gc_list);
atomic_inc(&n->tbl->gc_entries);
}
}
static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags,
struct neighbour __rcu **np, struct neigh_table *tbl) struct neighbour __rcu **np, struct neigh_table *tbl)
...@@ -132,7 +160,7 @@ static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags, ...@@ -132,7 +160,7 @@ static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags,
neigh = rcu_dereference_protected(n->next, neigh = rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock)); lockdep_is_held(&tbl->lock));
rcu_assign_pointer(*np, neigh); rcu_assign_pointer(*np, neigh);
n->dead = 1; neigh_mark_dead(n);
retval = true; retval = true;
} }
write_unlock(&n->lock); write_unlock(&n->lock);
...@@ -166,32 +194,31 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl) ...@@ -166,32 +194,31 @@ bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
static int neigh_forced_gc(struct neigh_table *tbl) static int neigh_forced_gc(struct neigh_table *tbl)
{ {
int max_clean = atomic_read(&tbl->gc_entries) - tbl->gc_thresh2;
unsigned long tref = jiffies - 5 * HZ;
u8 flags = NTF_EXT_LEARNED;
struct neighbour *n, *tmp;
u8 state = NUD_PERMANENT;
int shrunk = 0; int shrunk = 0;
int i;
struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
write_lock_bh(&tbl->lock); write_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (i = 0; i < (1 << nht->hash_shift); i++) {
struct neighbour *n;
struct neighbour __rcu **np;
np = &nht->hash_buckets[i]; list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) {
while ((n = rcu_dereference_protected(*np, if (refcount_read(&n->refcnt) == 1) {
lockdep_is_held(&tbl->lock))) != NULL) { bool remove = false;
/* Neighbour record may be discarded if:
* - nobody refers to it. write_lock(&n->lock);
* - it is not permanent if (!(n->nud_state & state) && !(n->flags & flags) &&
*/ time_after(tref, n->updated))
if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np, remove = true;
tbl)) { write_unlock(&n->lock);
shrunk = 1;
continue; if (remove && neigh_remove_one(n, tbl))
} shrunk++;
np = &n->next; if (shrunk >= max_clean)
break;
} }
} }
...@@ -260,8 +287,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, ...@@ -260,8 +287,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
lockdep_is_held(&tbl->lock))); lockdep_is_held(&tbl->lock)));
write_lock(&n->lock); write_lock(&n->lock);
neigh_del_timer(n); neigh_del_timer(n);
n->dead = 1; neigh_mark_dead(n);
if (refcount_read(&n->refcnt) != 1) { if (refcount_read(&n->refcnt) != 1) {
/* The most unpleasant situation. /* The most unpleasant situation.
We must destroy neighbour entry, We must destroy neighbour entry,
...@@ -321,13 +347,18 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) ...@@ -321,13 +347,18 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
} }
EXPORT_SYMBOL(neigh_ifdown); EXPORT_SYMBOL(neigh_ifdown);
static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) static struct neighbour *neigh_alloc(struct neigh_table *tbl,
struct net_device *dev,
bool permanent)
{ {
struct neighbour *n = NULL; struct neighbour *n = NULL;
unsigned long now = jiffies; unsigned long now = jiffies;
int entries; int entries;
entries = atomic_inc_return(&tbl->entries) - 1; if (permanent)
goto do_alloc;
entries = atomic_inc_return(&tbl->gc_entries) - 1;
if (entries >= tbl->gc_thresh3 || if (entries >= tbl->gc_thresh3 ||
(entries >= tbl->gc_thresh2 && (entries >= tbl->gc_thresh2 &&
time_after(now, tbl->last_flush + 5 * HZ))) { time_after(now, tbl->last_flush + 5 * HZ))) {
...@@ -340,6 +371,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device ...@@ -340,6 +371,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
} }
} }
do_alloc:
n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
if (!n) if (!n)
goto out_entries; goto out_entries;
...@@ -358,11 +390,19 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device ...@@ -358,11 +390,19 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
n->tbl = tbl; n->tbl = tbl;
refcount_set(&n->refcnt, 1); refcount_set(&n->refcnt, 1);
n->dead = 1; n->dead = 1;
if (!permanent)
list_add_tail(&n->gc_list, &n->tbl->gc_list);
else
INIT_LIST_HEAD(&n->gc_list);
atomic_inc(&tbl->entries);
out: out:
return n; return n;
out_entries: out_entries:
atomic_dec(&tbl->entries); if (!permanent)
atomic_dec(&tbl->gc_entries);
goto out; goto out;
} }
...@@ -505,13 +545,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, ...@@ -505,13 +545,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
} }
EXPORT_SYMBOL(neigh_lookup_nodev); EXPORT_SYMBOL(neigh_lookup_nodev);
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, static struct neighbour *___neigh_create(struct neigh_table *tbl,
struct net_device *dev, bool want_ref) const void *pkey,
struct net_device *dev,
bool permanent, bool want_ref)
{ {
struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, permanent);
u32 hash_val; u32 hash_val;
unsigned int key_len = tbl->key_len; unsigned int key_len = tbl->key_len;
int error; int error;
struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
struct neigh_hash_table *nht; struct neigh_hash_table *nht;
if (!n) { if (!n) {
...@@ -591,6 +633,12 @@ out_neigh_release: ...@@ -591,6 +633,12 @@ out_neigh_release:
neigh_release(n); neigh_release(n);
goto out; goto out;
} }
struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
struct net_device *dev, bool want_ref)
{
return ___neigh_create(tbl, pkey, dev, false, want_ref);
}
EXPORT_SYMBOL(__neigh_create); EXPORT_SYMBOL(__neigh_create);
static u32 pneigh_hash(const void *pkey, unsigned int key_len) static u32 pneigh_hash(const void *pkey, unsigned int key_len)
...@@ -854,7 +902,7 @@ static void neigh_periodic_work(struct work_struct *work) ...@@ -854,7 +902,7 @@ static void neigh_periodic_work(struct work_struct *work)
(state == NUD_FAILED || (state == NUD_FAILED ||
time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) { time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
*np = n->next; *np = n->next;
n->dead = 1; neigh_mark_dead(n);
write_unlock(&n->lock); write_unlock(&n->lock);
neigh_cleanup_and_release(n); neigh_cleanup_and_release(n);
continue; continue;
...@@ -1167,7 +1215,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, ...@@ -1167,7 +1215,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
neigh_del_timer(neigh); neigh_del_timer(neigh);
if (old & NUD_CONNECTED) if (old & NUD_CONNECTED)
neigh_suspect(neigh); neigh_suspect(neigh);
neigh->nud_state = new; neigh_change_state(neigh, new);
err = 0; err = 0;
notify = old & NUD_VALID; notify = old & NUD_VALID;
if ((old & (NUD_INCOMPLETE | NUD_PROBE)) && if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
...@@ -1246,7 +1294,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, ...@@ -1246,7 +1294,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
((new & NUD_REACHABLE) ? ((new & NUD_REACHABLE) ?
neigh->parms->reachable_time : neigh->parms->reachable_time :
0))); 0)));
neigh->nud_state = new; neigh_change_state(neigh, new);
notify = 1; notify = 1;
} }
...@@ -1582,6 +1630,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) ...@@ -1582,6 +1630,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
unsigned long phsize; unsigned long phsize;
INIT_LIST_HEAD(&tbl->parms_list); INIT_LIST_HEAD(&tbl->parms_list);
INIT_LIST_HEAD(&tbl->gc_list);
list_add(&tbl->parms.list, &tbl->parms_list); list_add(&tbl->parms.list, &tbl->parms_list);
write_pnet(&tbl->parms.net, &init_net); write_pnet(&tbl->parms.net, &init_net);
refcount_set(&tbl->parms.refcnt, 1); refcount_set(&tbl->parms.refcnt, 1);
...@@ -1813,7 +1862,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, ...@@ -1813,7 +1862,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out; goto out;
} }
neigh = __neigh_lookup_errno(tbl, dst, dev); neigh = ___neigh_create(tbl, dst, dev,
ndm->ndm_state & NUD_PERMANENT,
true);
if (IS_ERR(neigh)) { if (IS_ERR(neigh)) {
err = PTR_ERR(neigh); err = PTR_ERR(neigh);
goto out; goto out;
...@@ -2654,7 +2705,7 @@ void __neigh_for_each_release(struct neigh_table *tbl, ...@@ -2654,7 +2705,7 @@ void __neigh_for_each_release(struct neigh_table *tbl,
rcu_assign_pointer(*np, rcu_assign_pointer(*np,
rcu_dereference_protected(n->next, rcu_dereference_protected(n->next,
lockdep_is_held(&tbl->lock))); lockdep_is_held(&tbl->lock)));
n->dead = 1; neigh_mark_dead(n);
} else } else
np = &n->next; np = &n->next;
write_unlock(&n->lock); write_unlock(&n->lock);
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment