/*
 *  fs/eventpoll.c (Efficient event retrieval implementation)
 *  Copyright (C) 2001,...,2009	 Davide Libenzi
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>

/*
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
 * 1) epmutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (spinlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a spinlock (ep->lock) because we manipulate objects
 * from inside the poll callback, that might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * mutex (ep->mtx). It is acquired during the event transfer loop,
 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
 * Then we also need a global mutex to serialize eventpoll_release_file()
 * and ep_free().
 * This mutex is acquired by ep_free() during the epoll file
 * cleanup path and it is also acquired by eventpoll_release_file()
 * if a file has been pushed inside an epoll set and it is then
 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
 * It is also acquired when inserting an epoll fd onto another epoll
 * fd. We do this so that we walk the epoll tree and ensure that this
 * insertion does not create a cycle of epoll file descriptors, which
 * could lead to deadlock. We need a global mutex to prevent two
 * simultaneous inserts (A into B and B into A) from racing and
 * constructing a cycle without either insert observing that it is
 * going to.
 * It is necessary to acquire multiple "ep->mtx"es at once in the
 * case when one epoll fd is added to another. In this case, we
 * always acquire the locks in the order of nesting (i.e. after
 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
 * before e2->mtx). Since we disallow cycles of epoll file
 * descriptors, this ensures that the mutexes are well-ordered. In
 * order to communicate this nesting to lockdep, when walking a tree
 * of epoll file descriptors, we use the current recursion depth as
 * the lockdep subkey.
 * It is possible to drop the "ep->mtx" and to use the global
 * mutex "epmutex" (together with "ep->lock") to have it working,
 * but having "ep->mtx" will make the interface more scalable.
 * Events that require holding "epmutex" are very rare, while for
 * normal operations the epoll private "ep->mtx" will guarantee
 * a better scalability.
 */
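
/*
 * For illustration only: a user-space sequence that exercises the nested
 * "ep->mtx" ordering described above would look roughly like this
 * (error handling omitted):
 *
 *	e1 = epoll_create(1);
 *	e2 = epoll_create(1);
 *	epoll_ctl(e1, EPOLL_CTL_ADD, e2, &event);
 *
 * While servicing that last call, e1->mtx is taken before e2->mtx, and the
 * loop/path checks run under the global "epmutex".
 */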

/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET)

/* Maximum nesting depth allowed inside epoll sets */
#define EP_MAX_NESTS 4

#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

#define EP_UNACTIVE_PTR ((void *) -1L)

#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))
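
/*
 * EP_MAX_EVENTS bounds the "maxevents" count accepted from user space,
 * EP_UNACTIVE_PTR is the sentinel stored in epi->next while an item is not
 * linked into ep->ovflist, and EP_ITEM_COST is the approximate per-watch
 * memory cost (one epitem plus one poll wait entry) used when limiting the
 * number of watches per user.
 */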

struct epoll_filefd {
	struct file *file;
	int fd;
} __packed;

/*
 * Structure used to track possible nested calls, to detect too deep
 * recursion and loop cycles.
 */
struct nested_call_node {
	struct list_head llink;
	void *cookie;
	void *ctx;
};

/*
 * This structure is used as a collector for nested calls, to check for
 * maximum recursion depth and loop cycles.
 */
struct nested_calls {
	struct list_head tasks_call_list;
	spinlock_t lock;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
	union {
		/* RB tree node links this structure to the eventpoll RB tree */
		struct rb_node rbn;
		/* Used to free the struct epitem */
		struct rcu_head rcu;
	};

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together with "struct eventpoll"->ovflist in keeping the
	 * singly linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* Number of active wait queues attached to poll operations */
	int nwait;

	/* List containing poll wait queues */
	struct list_head pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* wakeup_source used when EPOLLWAKEUP is set */
	struct wakeup_source __rcu *ws;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/* Protect the access to this structure */
	spinlock_t lock;

	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. This is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* RB tree root used to store monitored fd structs */
	struct rb_root rbr;

	/*
	 * This is a singly linked list that chains all the "struct epitem" that
	 * were queued while transferring ready events to userspace without
	 * holding ->lock.
	 */
	struct epitem *ovflist;

	/* wakeup_source used when ep_scan_ready_list is running */
	struct wakeup_source *ws;

	/* The user that created the eventpoll descriptor */
	struct user_struct *user;

	struct file *file;

	/* used to optimize loop detection check */
	int visited;
	struct list_head visited_list_link;
};

/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct list_head llink;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_t wait;

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};

/* Used by the ep_send_events() function as callback private data */
struct ep_send_events_data {
	int maxevents;
	struct epoll_event __user *events;
};

/*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;

/*
 * This mutex is used to serialize ep_free() and eventpoll_release_file().
 */
static DEFINE_MUTEX(epmutex);

/* Used to check for epoll file descriptor inclusion loops */
static struct nested_calls poll_loop_ncalls;

/* Used for safe wake up implementation */
static struct nested_calls poll_safewake_ncalls;

/* Used to call file's f_op->poll() under the nested calls boundaries */
static struct nested_calls poll_readywalk_ncalls;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __read_mostly;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __read_mostly;

/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
static LIST_HEAD(visited_list);

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths. Protected by the epmutex.
 */
static LIST_HEAD(tfile_check_list);

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long zero;
static long long_max = LONG_MAX;

struct ctl_table epoll_table[] = {
	{
		.procname	= "max_user_watches",
		.data		= &max_user_watches,
		.maxlen		= sizeof(max_user_watches),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &zero,
		.extra2		= &long_max,
	},
	{ }
};
#endif /* CONFIG_SYSCTL */
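
/*
 * Example (illustrative): when CONFIG_SYSCTL is enabled, the limit above is
 * exposed to user space and can be inspected or raised by an administrator:
 *
 *	cat /proc/sys/fs/epoll/max_user_watches
 *	echo 1000000 > /proc/sys/fs/epoll/max_user_watches
 */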

static const struct file_operations eventpoll_fops;

static inline int is_file_epoll(struct file *f)
{
	return f->f_op == &eventpoll_fops;
}

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
			      struct file *file, int fd)
{
	ffd->file = file;
	ffd->fd = fd;
}

/* Compare RB tree keys */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
			     struct epoll_filefd *p2)
{
	return (p1->file > p2->file ? +1:
	        (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}

/* Tells us if the item is currently linked */
static inline int ep_is_linked(struct list_head *p)
{
	return !list_empty(p);
}

static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
{
	return container_of(p, struct eppoll_entry, wait);
}

/* Get the "struct epitem" from a wait queue pointer */
static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
{
	return container_of(p, struct eppoll_entry, wait)->base;
}

/* Get the "struct epitem" from an epoll queue wrapper */
static inline struct epitem *ep_item_from_epqueue(poll_table *p)
{
	return container_of(p, struct ep_pqueue, pt)->epi;
}

/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
	return op != EPOLL_CTL_DEL;
}

/* Initialize a nested calls tracking structure */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
	INIT_LIST_HEAD(&ncalls->tasks_call_list);
	spin_lock_init(&ncalls->lock);
}

/**
 * ep_events_available - Checks if ready events might be available.
 *
 * @ep: Pointer to the eventpoll context.
 *
 * Returns: a value different from zero if ready events are available,
 *          or zero otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
}

/**
 * ep_call_nested - Perform a bound (possibly) nested call, by checking
 *                  that the recursion limit is not exceeded, and that
 *                  the same nested call (in the sense of the same cookie) is
 *                  not re-entered.
 *
 * @ncalls: Pointer to the nested_calls structure to be used for this call.
 * @max_nests: Maximum number of allowed nesting calls.
 * @nproc: Nested call core function pointer.
 * @priv: Opaque data to be passed to the @nproc callback.
 * @cookie: Cookie to be used to identify this nested call.
 * @ctx: This instance context.
 *
 * Returns: the code returned by the @nproc callback, or -1 if
 *          the maximum recursion limit has been exceeded.
 */
static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
			  int (*nproc)(void *, void *, int), void *priv,
			  void *cookie, void *ctx)
{
	int error, call_nests = 0;
	unsigned long flags;
	struct list_head *lsthead = &ncalls->tasks_call_list;
	struct nested_call_node *tncur;
	struct nested_call_node tnode;

	spin_lock_irqsave(&ncalls->lock, flags);

	/*
	 * Try to see if the current task is already inside this wakeup call.
	 * We use a list here, since the population inside this set is always
	 * very much limited.
	 */
	list_for_each_entry(tncur, lsthead, llink) {
		if (tncur->ctx == ctx &&
		    (tncur->cookie == cookie || ++call_nests > max_nests)) {
			/*
			 * Oops ... loop detected or maximum nest level reached.
			 * We abort this wake by breaking the cycle itself.
			 */
			error = -1;
			goto out_unlock;
		}
	}

	/* Add the current task and cookie to the list */
	tnode.ctx = ctx;
	tnode.cookie = cookie;
	list_add(&tnode.llink, lsthead);

	spin_unlock_irqrestore(&ncalls->lock, flags);

	/* Call the nested function */
	error = (*nproc)(priv, cookie, call_nests);

	/* Remove the current task from the list */
	spin_lock_irqsave(&ncalls->lock, flags);
	list_del(&tnode.llink);
out_unlock:
	spin_unlock_irqrestore(&ncalls->lock, flags);

	return error;
}
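
/*
 * Note on usage: callers pass a per-invocation @cookie and @ctx so that the
 * checks above can tell re-entry apart from legitimate nesting. For example,
 * ep_poll_safewake() below uses the target wait queue head as the cookie and
 * the current CPU as the context, so a given queue is never re-entered and
 * the nesting depth stays within EP_MAX_NESTS.
 */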

/*
 * As described in commit 0ccf831cb ("lockdep: annotate epoll"),
 * the use of wait queues used by epoll is done in a very controlled
 * manner. Wake ups can nest inside each other, but are never done
 * with the same locking. For example:
 *
 *   dfd = socket(...);
 *   efd1 = epoll_create();
 *   efd2 = epoll_create();
 *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
 *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
 *
 * When a packet arrives to the device underneath "dfd", the net code will
 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
 * callback wakeup entry on that queue, and the wake_up() performed by the
 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
 * (efd1) notices that it may have some event ready, so it needs to wake up
 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
 * that ends up in another wake_up(), after having checked about the
 * recursion constraints. That is, no more than EP_MAX_NESTS, to
 * avoid stack blasting.
 *
 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
 * this special case of epoll.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
				     unsigned long events, int subclass)
{
	unsigned long flags;

	spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
	wake_up_locked_poll(wqueue, events);
	spin_unlock_irqrestore(&wqueue->lock, flags);
}
#else
static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
				     unsigned long events, int subclass)
{
	wake_up_poll(wqueue, events);
}
#endif

static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
{
	ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
			  1 + call_nests);
	return 0;
}

/*
 * Perform a safe wake up of the poll wait list. The problem is that
 * with the new callback'd wake up system, it is possible that the
 * poll callback is reentered from inside the call to wake_up() done
 * on the poll wait queue head. The rule is that we cannot reenter the
 * wake up code from the same task more than EP_MAX_NESTS times,
 * and we cannot reenter the same wait queue head at all. This allows
 * a hierarchy of epoll file descriptors of no more than EP_MAX_NESTS
 * deep.
 */
static void ep_poll_safewake(wait_queue_head_t *wq)
{
	int this_cpu = get_cpu();

	ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
		       ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu);

	put_cpu();
Linus Torvalds's avatar
}

static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
	wait_queue_head_t *whead;

	rcu_read_lock();
	/* If it is cleared by POLLFREE, it should be rcu-safe */
	whead = rcu_dereference(pwq->whead);
	if (whead)
		remove_wait_queue(whead, &pwq->wait);
	rcu_read_unlock();
}

/*
 * This function unregisters poll callbacks from the associated file
 * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
 * ep_free).
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
	struct list_head *lsthead = &epi->pwqlist;
	struct eppoll_entry *pwq;

	while (!list_empty(lsthead)) {
		pwq = list_first_entry(lsthead, struct eppoll_entry, llink);

		list_del(&pwq->llink);
		ep_remove_wait_queue(pwq);
		kmem_cache_free(pwq_cache, pwq);
	}
}

/* call only when ep->mtx is held */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
{
	return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));
}

/* call only when ep->mtx is held */
static inline void ep_pm_stay_awake(struct epitem *epi)
{
	struct wakeup_source *ws = ep_wakeup_source(epi);

	if (ws)
		__pm_stay_awake(ws);
}

static inline bool ep_has_wakeup_source(struct epitem *epi)
{
	return rcu_access_pointer(epi->ws) ? true : false;
}

/* call when ep->mtx cannot be held (ep_poll_callback) */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
{
	struct wakeup_source *ws;

	rcu_read_lock();
	ws = rcu_dereference(epi->ws);
	if (ws)
		__pm_stay_awake(ws);
	rcu_read_unlock();
}

/**
 * ep_scan_ready_list - Scans the ready list in a way that makes it possible for
 *                      the scan code to call f_op->poll(). Also allows for
 *                      O(NumReady) performance.
 *
 * @ep: Pointer to the epoll private data structure.
 * @sproc: Pointer to the scan callback.
 * @priv: Private opaque data passed to the @sproc callback.
 * @depth: The current depth of recursive f_op->poll calls.
 * @ep_locked: caller already holds ep->mtx
 *
 * Returns: The same integer error code returned by the @sproc callback.
 */
static int ep_scan_ready_list(struct eventpoll *ep,
			      int (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth, bool ep_locked)
{
	int error, pwake = 0;
	unsigned long flags;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl().
	 */

	if (!ep_locked)
		mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping without locks are not lost. We cannot
	 * have the poll callback to queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	list_splice_init(&ep->rdllist, &txlist);
	ep->ovflist = NULL;
	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Now call the callback function.
	 */
	error = (*sproc)(ep, &txlist, priv);

	spin_lock_irqsave(&ep->lock, flags);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 */
	for (nepi = ep->ovflist; (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		if (!ep_is_linked(&epi->rdllink)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ep_pm_stay_awake(epi);
		}
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist.
	 */
	ep->ovflist = EP_UNACTIVE_PTR;

	/*
	 * Quickly re-inject items left on "txlist".
	 */
	list_splice(&txlist, &ep->rdllist);
	__pm_relax(ep->ws);

	if (!list_empty(&ep->rdllist)) {
		/*
		 * Wake up (if active) both the eventpoll wait list and
		 * the ->poll() wait list (delayed after we release the lock).
		 */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}
	spin_unlock_irqrestore(&ep->lock, flags);

	if (!ep_locked)
		mutex_unlock(&ep->mtx);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return error;
}
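
/*
 * Note: ep_scan_ready_list() is used both when transferring events to user
 * space and when an epoll fd is itself polled; for instance,
 * ep_poll_readyevents_proc() below runs ep_read_events_proc() through it
 * with an increased nesting depth.
 */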

static void epi_rcu_free(struct rcu_head *head)
{
	struct epitem *epi = container_of(head, struct epitem, rcu);
	kmem_cache_free(epi_cache, epi);
}

/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
	unsigned long flags;
	struct file *file = epi->ffd.file;

	/*
	 * Removes poll wait queue hooks. We _have_ to do this without holding
	 * the "ep->lock" otherwise a deadlock might occur. This is because of the
	 * sequence of the lock acquisition. Here we do "ep->lock" then the wait
	 * queue head lock when unregistering the wait queue. The wakeup callback
	 * runs while holding the wait queue head lock and will call our callback
	 * that will try to get "ep->lock".
	 */
	ep_unregister_pollwait(ep, epi);

	/* Remove the current item from the list of epoll hooks */
	spin_lock(&file->f_lock);
	list_del_rcu(&epi->fllink);
	spin_unlock(&file->f_lock);

	rb_erase(&epi->rbn, &ep->rbr);

	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	wakeup_source_unregister(ep_wakeup_source(epi));
	/*
	 * At this point it is safe to free the eventpoll item. Use the union
	 * field epi->rcu, since we are trying to minimize the size of
	 * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
	 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
	 * use of the rbn field.
	 */
	call_rcu(&epi->rcu, epi_rcu_free);

	atomic_long_dec(&ep->user->epoll_watches);

	return 0;
}

static void ep_free(struct eventpoll *ep)
{
	struct rb_node *rbp;
	struct epitem *epi;

	/* We need to release all tasks waiting for this file */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(&ep->poll_wait);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
	 * We do not need to hold "ep->mtx" here because the epoll file
	 * is on the way to be removed and no one has references to it
	 * anymore. The only hit might come from eventpoll_release_file() but
	 * holding "epmutex" is sufficient here.
	 */
	mutex_lock(&epmutex);

	/*
	 * Walks through the whole tree by unregistering poll callbacks.
	 */
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
		cond_resched();
	}

	/*
	 * Walks through the whole tree by freeing each "struct epitem". At this
	 * point we are sure no poll callbacks will be lingering around, and also by
	 * holding "epmutex" we can be sure that no file cleanup code will hit
	 * us during this operation. So we can avoid the lock on "ep->lock".
	 * We do not need to lock ep->mtx, either, we only do it to prevent
	 * a lockdep warning.
	 */
	mutex_lock(&ep->mtx);
	while ((rbp = rb_first(&ep->rbr)) != NULL) {
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove(ep, epi);
		cond_resched();
	}
	mutex_unlock(&ep->mtx);

	mutex_unlock(&epmutex);
	mutex_destroy(&ep->mtx);
	free_uid(ep->user);
	wakeup_source_unregister(ep->ws);
	kfree(ep);
}

static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
	struct eventpoll *ep = file->private_data;

	if (ep)
		ep_free(ep);

	return 0;
}
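
/*
 * Poll the target file behind @epi and mask the result with the event bits
 * the caller registered interest in via epoll_ctl().
 */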

static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
{
	pt->_key = epi->event.events;

	return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
}

static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	struct epitem *epi, *tmp;
	poll_table pt;

	init_poll_funcptr(&pt, NULL);

	list_for_each_entry_safe(epi, tmp, head, rdllink) {
		if (ep_item_poll(epi, &pt))
			return POLLIN | POLLRDNORM;
		else {
			/*
			 * Item has been dropped into the ready list by the poll
			 * callback, but it's not actually ready, as far as
			 * caller requested events goes. We can remove it here.
			 */
			__pm_relax(ep_wakeup_source(epi));
			list_del_init(&epi->rdllink);
		}
	}

	return 0;
}

static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt);

struct readyevents_arg {
	struct eventpoll *ep;
	bool locked;
};

static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
{
	struct readyevents_arg *arg = priv;

	return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
				  call_nests + 1, arg->locked);
}

static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	int pollflags;
	struct eventpoll *ep = file->private_data;
	struct readyevents_arg arg;

	/*
	 * During ep_insert() we already hold the ep->mtx for the tfile.
	 * Prevent re-acquisition.
	 */
	arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
	arg.ep = ep;

	/* Insert inside our poll wait queue */
	poll_wait(file, &ep->poll_wait, wait);

	/*
	 * Proceed to find out if wanted events are really available inside
	 * the ready list. This needs to be done under ep_call_nested()
	 * supervision, since the call to f_op->poll() done on listed files
	 * could re-enter here.
	 */
	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
				   ep_poll_readyevents_proc, &arg, ep, current);

	return pollflags != -1 ? pollflags : 0;
}

#ifdef CONFIG_PROC_FS
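/*
 * Show the items attached to this epoll file in /proc/<pid>/fdinfo/<fd>.
 * Illustrative output (one line per watched descriptor, example values):
 *
 *	tfd:        5 events:       19 data:                0
 */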
static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct eventpoll *ep = f->private_data;
	struct rb_node *rbp;

	mutex_lock(&ep->mtx);
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);

		seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
			   epi->ffd.fd, epi->event.events,
			   (long long)epi->event.data);
		if (seq_has_overflowed(m))
			break;
	}
	mutex_unlock(&ep->mtx);
}
#endif

/* File callbacks that implement the eventpoll file behaviour */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= ep_show_fdinfo,
#endif
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll,
	.llseek		= noop_llseek,
};

/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need to have this facility to correctly clean up files that are
 * closed without being removed from the eventpoll interface.
 */
void eventpoll_release_file(struct file *file)
{
	struct eventpoll *ep;
	struct epitem *epi, *next;

	/*
	 * We don't want to get "file->f_lock" because it is not
	 * necessary. It is not necessary because we're in the "struct file"
	 * cleanup path, and this means that no one is using this file anymore.
	 * So, for example, epoll_ctl() cannot hit here since if we reach this
	 * point, the file counter already went to zero and fget() would fail.
	 * The only hit might come from ep_free() but holding the mutex
	 * will correctly serialize the operation. We do need to acquire
	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
	 * from anywhere but ep_free().
	 *
	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
	 */
	mutex_lock(&epmutex);
	list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
		ep = epi->ep;
		mutex_lock_nested(&ep->mtx, 0);
		ep_remove(ep, epi);
		mutex_unlock(&ep->mtx);
	}
	mutex_unlock(&epmutex);
}

static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	user = get_current_user();
	error = -ENOMEM;
	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
	if (unlikely(!ep))
		goto free_uid;

	spin_lock_init(&ep->lock);
	mutex_init(&ep->mtx);
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT;
	ep->ovflist = EP_UNACTIVE_PTR;
	ep->user = user;

	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}

/*
 * Search the file inside the eventpoll tree. The RB tree operations
 * are protected by the "mtx" mutex, and ep_find() must be called with
 * "mtx" held.
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
	int kcmp;
	struct rb_node *rbp;
	struct epitem *epi, *epir = NULL;
	struct epoll_filefd ffd;

	ep_set_ffd(&ffd, file, fd);
	for (rbp = ep->rbr.rb_node; rbp; ) {
		epi = rb_entry(rbp, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
		if (kcmp > 0)
			rbp = rbp->rb_right;
		else if (kcmp < 0)
			rbp = rbp->rb_left;
		else {
			epir = epi;
			break;
		}
	}

	return epir;
}

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 *  have events to report.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{