tun.c 85.8 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-or-later
Linus Torvalds's avatar
Linus Torvalds committed
2
3
4
5
6
7
8
9
10
11
/*
 *  TUN - Universal TUN/TAP device driver.
 *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
 *
 *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
 */

/*
 *  Changes:
 *
12
13
14
 *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
 *    Add TUNSETLINK ioctl to set the link encapsulation
 *
Linus Torvalds's avatar
Linus Torvalds committed
15
 *  Mark Smith <markzzzsmith@yahoo.com.au>
Joe Perches's avatar
Joe Perches committed
16
 *    Use eth_random_addr() for tap MAC address.
Linus Torvalds's avatar
Linus Torvalds committed
17
18
19
20
21
22
23
24
25
26
27
 *
 *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
 *    Fixes in packet dropping, queue length setting and queue wakeup.
 *    Increased default tx queue length.
 *    Added ethtool API.
 *    Minor cleanups
 *
 *  Daniel Podlejski <underley@underley.eu.org>
 *    Modifications for 2.3.99-pre5 kernel.
 */

28
29
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

Linus Torvalds's avatar
Linus Torvalds committed
30
31
32
33
34
35
36
37
#define DRV_NAME	"tun"
#define DRV_VERSION	"1.6"
#define DRV_DESCRIPTION	"Universal TUN/TAP device driver"
#define DRV_COPYRIGHT	"(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
38
#include <linux/sched/signal.h>
Linus Torvalds's avatar
Linus Torvalds committed
39
40
41
42
43
44
45
46
47
48
49
#include <linux/major.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fcntl.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/miscdevice.h>
#include <linux/ethtool.h>
#include <linux/rtnetlink.h>
50
#include <linux/compat.h>
Linus Torvalds's avatar
Linus Torvalds committed
51
52
53
54
#include <linux/if.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_tun.h>
Jason Wang's avatar
Jason Wang committed
55
#include <linux/if_vlan.h>
Linus Torvalds's avatar
Linus Torvalds committed
56
#include <linux/crc32.h>
57
#include <linux/nsproxy.h>
58
#include <linux/virtio_net.h>
59
#include <linux/rcupdate.h>
60
#include <net/net_namespace.h>
61
#include <net/netns/generic.h>
62
#include <net/rtnetlink.h>
63
#include <net/sock.h>
64
#include <net/xdp.h>
65
#include <linux/seq_file.h>
Herbert Xu's avatar
Herbert Xu committed
66
#include <linux/uio.h>
67
#include <linux/skb_array.h>
Jason Wang's avatar
Jason Wang committed
68
69
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
70
#include <linux/mutex.h>
Linus Torvalds's avatar
Linus Torvalds committed
71

72
#include <linux/uaccess.h>
73
#include <linux/proc_fs.h>
Linus Torvalds's avatar
Linus Torvalds committed
74

75
76
77
static void tun_default_link_ksettings(struct net_device *dev,
				       struct ethtool_link_ksettings *cmd);

78
79
80
/* Uncomment to enable debugging */
/* #define TUN_DEBUG 1 */

Linus Torvalds's avatar
Linus Torvalds committed
81
82
#ifdef TUN_DEBUG
static int debug;
83

84
85
86
87
88
89
90
91
92
93
#define tun_debug(level, tun, fmt, args...)			\
do {								\
	if (tun->debug)						\
		netdev_printk(level, tun->dev, fmt, ##args);	\
} while (0)
#define DBG1(level, fmt, args...)				\
do {								\
	if (debug == 2)						\
		printk(level fmt, ##args);			\
} while (0)
94
#else
95
96
97
98
99
100
101
102
103
104
#define tun_debug(level, tun, fmt, args...)			\
do {								\
	if (0)							\
		netdev_printk(level, tun->dev, fmt, ##args);	\
} while (0)
#define DBG1(level, fmt, args...)				\
do {								\
	if (0)							\
		printk(level fmt, ##args);			\
} while (0)
105
106
#endif

107
#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
108

109
110
111
112
113
114
/* TUN device flags */

/* IFF_ATTACH_QUEUE is never stored in device flags,
 * overload it to mean fasync when stored there.
 */
#define TUN_FASYNC	IFF_ATTACH_QUEUE
115
116
/* High bits in flags field are unused. */
#define TUN_VNET_LE     0x80000000
117
#define TUN_VNET_BE     0x40000000
118
119

#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
120
121
		      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)

122
123
#define GOODCOPY_LEN 128

124
125
126
127
128
129
130
#define FLT_EXACT_COUNT 8
struct tap_filter {
	unsigned int    count;    /* Number of addrs. Zero means disabled */
	u32             mask[2];  /* Mask of the hashed addrs */
	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
};

131
132
133
/* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal
 * to max number of VCPUs in guest. */
#define MAX_TAP_QUEUES 256
134
#define MAX_TAP_FLOWS  4096
Jason Wang's avatar
Jason Wang committed
135

136
137
#define TUN_FLOW_EXPIRE (3 * HZ)

138
139
140
141
142
143
144
145
146
147
148
/* Per-CPU packet/byte counters for a tun device; readers sum all CPUs.
 * syncp presumably guards the u64 counters on 32-bit hosts per the
 * u64_stats_sync convention — confirm at the update sites.
 */
struct tun_pcpu_stats {
	u64 rx_packets;		/* packets received from userspace */
	u64 rx_bytes;
	u64 tx_packets;		/* packets delivered to userspace */
	u64 tx_bytes;
	struct u64_stats_sync syncp;
	u32 rx_dropped;		/* plain u32 counters, updated without syncp */
	u32 tx_dropped;
	u32 rx_frame_errors;
};

Jason Wang's avatar
Jason Wang committed
149
/* A tun_file connects an open character device to a tuntap netdevice. It
stephen hemminger's avatar
stephen hemminger committed
150
 * also contains all socket related structures (except sock_fprog and tap_filter)
Jason Wang's avatar
Jason Wang committed
151
152
 * to serve as one transmit queue for tuntap device. The sock_fprog and
 * tap_filter were kept in tun_struct since they were used for filtering for the
Rami Rosen's avatar
Rami Rosen committed
153
 * netdevice not for a specific queue (at least I didn't see the requirement for
Jason Wang's avatar
Jason Wang committed
154
 * this).
155
156
 *
 * RCU usage:
Rami Rosen's avatar
Rami Rosen committed
157
 * The tun_file and tun_struct are loosely coupled, the pointer from one to the
158
 * other can only be read while rcu_read_lock or rtnl_lock is held.
Jason Wang's avatar
Jason Wang committed
159
 */
Eric W. Biederman's avatar
Eric W. Biederman committed
160
struct tun_file {
	struct sock sk;			/* must be first: cast target for sk <-> tfile */
	struct socket socket;
	struct tun_struct __rcu *tun;	/* attached device, NULL when detached; RCU/rtnl protected */
	struct fasync_struct *fasync;
	/* only used for fasync */
	unsigned int flags;
	union {
		u16 queue_index;	/* slot in tun->tfiles[] while attached */
		unsigned int ifindex;	/* requested ifindex before attach */
	};
	struct napi_struct napi;
	bool napi_enabled;
	bool napi_frags_enabled;
	struct mutex napi_mutex;	/* Protects access to the above napi */
	struct list_head next;		/* link on tun->disabled while detached */
	struct tun_struct *detached;	/* non-NULL while on the disabled list */
	struct ptr_ring tx_ring;	/* skb/xdp_frame pointers queued for read() */
	struct xdp_rxq_info xdp_rxq;
};

181
182
183
184
185
/* A page plus a pending reference count, used to batch page refcounting. */
struct tun_page {
	struct page *page;
	int count;
};

186
187
188
189
190
191
/* One rxhash -> queue mapping in tun->flows[]; freed via RCU. */
struct tun_flow_entry {
	struct hlist_node hash_link;	/* chained in tun->flows[tun_hashfn(rxhash)] */
	struct rcu_head rcu;
	struct tun_struct *tun;

	u32 rxhash;		/* flow hash this entry keys on */
	u32 rps_rxhash;		/* last hash recorded for RPS steering */
	int queue_index;	/* queue the flow was last seen on */
	/* own cache line: written on every packet of the flow */
	unsigned long updated ____cacheline_aligned_in_smp;
};

#define TUN_NUM_FLOW_ENTRIES 1024
198
#define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1)
199

200
/* RCU-managed wrapper for an attached eBPF program (steering or filter). */
struct tun_prog {
	struct rcu_head rcu;
	struct bpf_prog *prog;
};

Jason Wang's avatar
Jason Wang committed
205
/* Since the socket were moved to tun_file, to preserve the behavior of persist
Rami Rosen's avatar
Rami Rosen committed
206
 * device, socket filter, sndbuf and vnet header size were restore when the
Jason Wang's avatar
Jason Wang committed
207
208
 * file were attached to a persist device.
 */
209
struct tun_struct {
	struct tun_file __rcu	*tfiles[MAX_TAP_QUEUES];	/* attached queues */
	unsigned int            numqueues;	/* count of live entries in tfiles[] */
	unsigned int 		flags;		/* IFF_* device flags + TUN_VNET_* bits */
	kuid_t			owner;		/* restrict attach to this uid, if valid */
	kgid_t			group;		/* restrict attach to this gid, if valid */

	struct net_device	*dev;
	netdev_features_t	set_features;	/* features requested via TUNSETOFFLOAD */
#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
			  NETIF_F_TSO6)

	int			align;		/* packet alignment (TUNSETALIGN) */
	int			vnet_hdr_sz;	/* size of the virtio_net header, if any */
	int			sndbuf;
	struct tap_filter	txflt;		/* TAP MAC filter */
	struct sock_fprog	fprog;		/* saved socket filter for re-attach */
	/* protected by rtnl lock */
	bool			filter_attached;
#ifdef TUN_DEBUG
	int debug;
#endif
	spinlock_t lock;			/* guards flow table writers */
	struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
	struct timer_list flow_gc_timer;	/* ages out idle flow entries */
	unsigned long ageing_time;
	unsigned int numdisabled;		/* queues parked on 'disabled' */
	struct list_head disabled;
	void *security;				/* LSM blob */
	u32 flow_count;
	u32 rx_batched;				/* TUNSETVNETBE-era rx coalescing depth */
	struct tun_pcpu_stats __percpu *pcpu_stats;
	struct bpf_prog __rcu *xdp_prog;
	struct tun_prog __rcu *steering_prog;	/* eBPF queue selection */
	struct tun_prog __rcu *filter_prog;	/* eBPF egress filter */
	struct ethtool_link_ksettings link_ksettings;
};
Linus Torvalds's avatar
Linus Torvalds committed
246

247
248
249
/* Minimal 802.1Q tag layout used when inserting a VLAN header. */
struct veth {
	__be16 h_vlan_proto;	/* encapsulated protocol (e.g. ETH_P_8021Q) */
	__be16 h_vlan_TCI;	/* priority/CFI/VLAN id */
};
Linus Torvalds's avatar
Linus Torvalds committed
251

252
bool tun_is_xdp_frame(void *ptr)
Jason Wang's avatar
Jason Wang committed
253
254
255
{
	return (unsigned long)ptr & TUN_XDP_FLAG;
}
256
EXPORT_SYMBOL(tun_is_xdp_frame);
Jason Wang's avatar
Jason Wang committed
257
258
259
260
261
262
263
264
265
266
267
268
269

void *tun_xdp_to_ptr(void *ptr)
{
	return (void *)((unsigned long)ptr | TUN_XDP_FLAG);
}
EXPORT_SYMBOL(tun_xdp_to_ptr);

void *tun_ptr_to_xdp(void *ptr)
{
	return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
}
EXPORT_SYMBOL(tun_ptr_to_xdp);

270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
/* NAPI poll worker: drain skbs parked on the tfile's sk_write_queue
 * (presumably filled by the rx path when IFF_NAPI is on — confirm at the
 * producer) and feed up to @budget of them to GRO.  Returns how many
 * were handed to napi_gro_receive().
 */
static int tun_napi_receive(struct napi_struct *napi, int budget)
{
	struct tun_file *tfile = container_of(napi, struct tun_file, napi);
	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
	struct sk_buff_head process_queue;
	struct sk_buff *skb;
	int received = 0;

	__skb_queue_head_init(&process_queue);

	/* Splice the whole backlog onto a private list so the queue lock
	 * is not held while running GRO. */
	spin_lock(&queue->lock);
	skb_queue_splice_tail_init(queue, &process_queue);
	spin_unlock(&queue->lock);

	while (received < budget && (skb = __skb_dequeue(&process_queue))) {
		napi_gro_receive(napi, skb);
		++received;
	}

	/* Budget exhausted: splice leftovers back at the head so ordering
	 * is preserved for the next poll. */
	if (!skb_queue_empty(&process_queue)) {
		spin_lock(&queue->lock);
		skb_queue_splice(&process_queue, queue);
		spin_unlock(&queue->lock);
	}

	return received;
}

/* NAPI ->poll callback: process up to @budget packets and complete the
 * poll when the backlog was fully drained.
 */
static int tun_napi_poll(struct napi_struct *napi, int budget)
{
	unsigned int done = tun_napi_receive(napi, budget);

	if (done < budget)
		napi_complete_done(napi, done);

	return done;
}

/* Record the per-queue NAPI configuration and, when enabled, register and
 * start the NAPI instance.  napi_frags is only honoured together with
 * napi_en (frags mode requires NAPI).
 */
static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
			  bool napi_en, bool napi_frags)
{
	tfile->napi_enabled = napi_en;
	tfile->napi_frags_enabled = napi_en && napi_frags;
	if (napi_en) {
		netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
			       NAPI_POLL_WEIGHT);
		napi_enable(&tfile->napi);
	}
}

Eric Dumazet's avatar
Eric Dumazet committed
322
static void tun_napi_disable(struct tun_file *tfile)
323
{
324
	if (tfile->napi_enabled)
325
326
327
		napi_disable(&tfile->napi);
}

Eric Dumazet's avatar
Eric Dumazet committed
328
static void tun_napi_del(struct tun_file *tfile)
329
{
330
	if (tfile->napi_enabled)
331
332
333
		netif_napi_del(&tfile->napi);
}

Eric Dumazet's avatar
Eric Dumazet committed
334
/* True when this queue was attached with IFF_NAPI_FRAGS (and IFF_NAPI). */
static bool tun_napi_frags_enabled(const struct tun_file *tfile)
{
	return tfile->napi_frags_enabled;
}

339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
#ifdef CONFIG_TUN_VNET_CROSS_LE
/* Legacy virtio endianness: little-endian unless userspace forced
 * big-endian via TUNSETVNETBE (TUN_VNET_BE flag). */
static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
{
	return tun->flags & TUN_VNET_BE ? false :
		virtio_legacy_is_little_endian();
}

/* TUNGETVNETBE: report whether the vnet header is forced big-endian. */
static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
{
	int be = !!(tun->flags & TUN_VNET_BE);

	if (put_user(be, argp))
		return -EFAULT;

	return 0;
}

/* TUNSETVNETBE: set/clear the forced big-endian vnet header flag. */
static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
{
	int be;

	if (get_user(be, argp))
		return -EFAULT;

	if (be)
		tun->flags |= TUN_VNET_BE;
	else
		tun->flags &= ~TUN_VNET_BE;

	return 0;
}
#else
/* Without CONFIG_TUN_VNET_CROSS_LE the BE override does not exist and the
 * ioctls fail with -EINVAL. */
static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
{
	return virtio_legacy_is_little_endian();
}

static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
{
	return -EINVAL;
}

static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
{
	return -EINVAL;
}
#endif /* CONFIG_TUN_VNET_CROSS_LE */

387
388
static inline bool tun_is_little_endian(struct tun_struct *tun)
{
389
	return tun->flags & TUN_VNET_LE ||
390
		tun_legacy_is_little_endian(tun);
391
392
}

393
394
/* Convert a 16-bit vnet-header field to host order for this device. */
static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
{
	bool le = tun_is_little_endian(tun);

	return __virtio16_to_cpu(le, val);
}

/* Convert a host-order 16-bit value to this device's vnet-header order. */
static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val)
{
	bool le = tun_is_little_endian(tun);

	return __cpu_to_virtio16(le, val);
}

403
404
/* Map a flow hash to a bucket index in tun->flows[].  TUN_NUM_FLOW_ENTRIES
 * is a power of two, so the modulo compiles to the same mask operation. */
static inline u32 tun_hashfn(u32 rxhash)
{
	return rxhash % TUN_NUM_FLOW_ENTRIES;
}

/* Look up the flow entry for @rxhash in hash bucket @head.
 * Caller must be in an RCU read-side section (or hold tun->lock).
 * Returns NULL when the flow is not tracked.
 */
static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
{
	struct tun_flow_entry *entry;

	hlist_for_each_entry_rcu(entry, head, hash_link) {
		if (entry->rxhash == rxhash)
			return entry;
	}

	return NULL;
}

/* Allocate and insert a flow entry mapping @rxhash to @queue_index.
 * Called with tun->lock held (hlist_add_head_rcu + flow_count update).
 * GFP_ATOMIC because this runs from the packet path; returns NULL on
 * allocation failure, which callers tolerate (flow just isn't tracked).
 */
static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
					      struct hlist_head *head,
					      u32 rxhash, u16 queue_index)
{
	struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC);

	if (e) {
		tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n",
			  rxhash, queue_index);
		e->updated = jiffies;
		e->rxhash = rxhash;
		e->rps_rxhash = 0;
		e->queue_index = queue_index;
		e->tun = tun;
		hlist_add_head_rcu(&e->hash_link, head);
		++tun->flow_count;
	}
	return e;
}

/* Unlink a flow entry and free it after a grace period (readers may still
 * be traversing the hash chain under RCU).  Caller holds tun->lock.
 */
static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
{
	tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
		  e->rxhash, e->queue_index);
	hlist_del_rcu(&e->hash_link);
	kfree_rcu(e, rcu);
	--tun->flow_count;
}

static void tun_flow_flush(struct tun_struct *tun)
{
	int i;

	spin_lock_bh(&tun->lock);
	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
		struct tun_flow_entry *e;
455
		struct hlist_node *n;
456

457
		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
458
459
460
461
462
463
464
465
466
467
468
469
			tun_flow_delete(tun, e);
	}
	spin_unlock_bh(&tun->lock);
}

/* Drop all flow entries that currently steer to @queue_index (used when a
 * queue is detached and its slot is reused). */
static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
{
	int bucket;

	spin_lock_bh(&tun->lock);
	for (bucket = 0; bucket < TUN_NUM_FLOW_ENTRIES; bucket++) {
		struct tun_flow_entry *entry;
		struct hlist_node *tmp;

		hlist_for_each_entry_safe(entry, tmp, &tun->flows[bucket],
					  hash_link) {
			if (entry->queue_index == queue_index)
				tun_flow_delete(tun, entry);
		}
	}
	spin_unlock_bh(&tun->lock);
}

480
/* Flow table garbage collector (timer callback).  Deletes entries idle
 * for longer than tun->ageing_time and re-arms the timer for the next
 * soonest expiry only if live entries remain.
 */
static void tun_flow_cleanup(struct timer_list *t)
{
	struct tun_struct *tun = from_timer(tun, t, flow_gc_timer);
	unsigned long delay = tun->ageing_time;
	unsigned long next_timer = jiffies + delay;
	unsigned long count = 0;
	int i;

	tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n");

	/* Timer context: plain spin_lock is sufficient, BH already off. */
	spin_lock(&tun->lock);
	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
		struct tun_flow_entry *e;
		struct hlist_node *n;

		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
			unsigned long this_timer;

			this_timer = e->updated + delay;
			if (time_before_eq(this_timer, jiffies)) {
				/* Entry expired: drop it. */
				tun_flow_delete(tun, e);
				continue;
			}
			count++;
			/* Track the earliest upcoming expiry. */
			if (time_before(this_timer, next_timer))
				next_timer = this_timer;
		}
	}

	/* Only re-arm while there is something left to age out. */
	if (count)
		mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
	spin_unlock(&tun->lock);
}

514
/* Receive-path hook: remember that flow @rxhash was last seen on
 * @tfile's queue.  Fast path updates an existing entry locklessly under
 * RCU; only entry creation takes tun->lock.
 */
static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
			    struct tun_file *tfile)
{
	struct hlist_head *head;
	struct tun_flow_entry *e;
	unsigned long delay = tun->ageing_time;
	u16 queue_index = tfile->queue_index;

	head = &tun->flows[tun_hashfn(rxhash)];

	rcu_read_lock();

	e = tun_flow_find(head, rxhash);
	if (likely(e)) {
		/* TODO: keep queueing to old queue until it's empty? */
		/* Compare-before-store avoids dirtying shared cache lines
		 * when nothing changed. */
		if (e->queue_index != queue_index)
			e->queue_index = queue_index;
		if (e->updated != jiffies)
			e->updated = jiffies;
		sock_rps_record_flow_hash(e->rps_rxhash);
	} else {
		spin_lock_bh(&tun->lock);
		/* Re-check under the lock: another CPU may have created the
		 * entry meanwhile.  Also cap the total entry count. */
		if (!tun_flow_find(head, rxhash) &&
		    tun->flow_count < MAX_TAP_FLOWS)
			tun_flow_create(tun, head, rxhash, queue_index);

		/* Kick off the ager if it is not already scheduled. */
		if (!timer_pending(&tun->flow_gc_timer))
			mod_timer(&tun->flow_gc_timer,
				  round_jiffies_up(jiffies + delay));
		spin_unlock_bh(&tun->lock);
	}

	rcu_read_unlock();
}

549
550
551
552
553
554
/* Save the hash received in the stack receive path and update the
 * flow_hash table accordingly.  (Plain comment, not kernel-doc: no
 * parameter block.)  The compare avoids a store when the hash is
 * unchanged — presumably to keep the cache line clean; confirm.
 */
static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
{
	if (unlikely(e->rps_rxhash != hash))
		e->rps_rxhash = hash;
}

559
/* We try to identify a flow through its rxhash. The reason that
 * we do not check rxq no. is because some cards(e.g 82599), chooses
 * the rxq based on the txq where the last packet of the flow comes. As
 * the userspace application move between processors, we may get a
 * different rxq no. here.
 *
 * Returns a queue index in [0, numqueues): the flow's recorded queue if
 * tracked, otherwise a hash-proportional spread over the active queues.
 * Runs under rcu_read_lock (caller: tun_select_queue).
 */
static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
	struct tun_flow_entry *e;
	u32 txq = 0;
	u32 numqueues = 0;

	numqueues = READ_ONCE(tun->numqueues);

	txq = __skb_get_hash_symmetric(skb);
	e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
	if (e) {
		tun_flow_save_rps_rxhash(e, txq);
		txq = e->queue_index;
	} else {
		/* use multiply and shift instead of expensive divide */
		txq = ((u64)txq * numqueues) >> 32;
	}

	return txq;
}

586
587
static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
588
	struct tun_prog *prog;
589
	u32 numqueues;
590
591
	u16 ret = 0;

592
593
594
595
	numqueues = READ_ONCE(tun->numqueues);
	if (!numqueues)
		return 0;

596
597
598
599
	prog = rcu_dereference(tun->steering_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog->prog, skb);

600
	return ret % numqueues;
601
602
603
}

static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
604
			    struct net_device *sb_dev)
605
606
607
608
609
610
611
612
613
614
615
616
617
618
{
	struct tun_struct *tun = netdev_priv(dev);
	u16 ret;

	rcu_read_lock();
	if (rcu_dereference(tun->steering_prog))
		ret = tun_ebpf_select_queue(tun, skb);
	else
		ret = tun_automq_select_queue(tun, skb);
	rcu_read_unlock();

	return ret;
}

619
620
621
static inline bool tun_not_capable(struct tun_struct *tun)
{
	const struct cred *cred = current_cred();
622
	struct net *net = dev_net(tun->dev);
623
624
625

	return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
		  (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
626
		!ns_capable(net->user_ns, CAP_NET_ADMIN);
627
628
}

Jason Wang's avatar
Jason Wang committed
629
630
631
632
633
634
/* Tell the stack how many tx/rx queues are currently attached. */
static void tun_set_real_num_queues(struct tun_struct *tun)
{
	netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
	netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
}

635
636
637
638
639
640
641
/* Park a queue on the device's disabled list (persist-device detach).
 * Caller holds rtnl; tfile->detached marks it as parked. */
static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
{
	tfile->detached = tun;
	list_add_tail(&tfile->next, &tun->disabled);
	++tun->numdisabled;
}

Jason Wang's avatar
Jason Wang committed
642
/* Take a queue off the disabled list again.  Caller holds rtnl.
 * Returns the tun device it was parked on. */
static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
{
	struct tun_struct *tun = tfile->detached;

	tfile->detached = NULL;
	list_del_init(&tfile->next);
	--tun->numdisabled;
	return tun;
}

652
/* Free one tx_ring entry, which is either a tagged xdp_frame or an
 * sk_buff (see tun_is_xdp_frame()).  NULL is tolerated. */
void tun_ptr_free(void *ptr)
{
	if (!ptr)
		return;

	if (!tun_is_xdp_frame(ptr)) {
		__skb_array_destroy_skb(ptr);
		return;
	}

	xdp_return_frame(tun_ptr_to_xdp(ptr));
}
EXPORT_SYMBOL_GPL(tun_ptr_free);
Jason Wang's avatar
Jason Wang committed
665

666
667
/* Drop everything queued toward userspace on this queue: the tx_ring
 * entries plus any skbs parked on the socket's write/error queues. */
static void tun_queue_purge(struct tun_file *tfile)
{
	void *ptr;

	while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
		tun_ptr_free(ptr);

	skb_queue_purge(&tfile->sk.sk_write_queue);
	skb_queue_purge(&tfile->sk.sk_error_queue);
}

Jason Wang's avatar
Jason Wang committed
677
678
679
680
681
/* Detach one queue from its device.  Caller holds rtnl.
 * @clean=false parks the queue on the disabled list (persist device);
 * @clean=true tears it down completely, possibly unregistering the
 * netdevice when this was the last queue.
 */
static void __tun_detach(struct tun_file *tfile, bool clean)
{
	struct tun_file *ntfile;
	struct tun_struct *tun;

	tun = rtnl_dereference(tfile->tun);

	if (tun && clean) {
		/* Quiesce NAPI before the queue disappears. */
		tun_napi_disable(tfile);
		tun_napi_del(tfile);
	}

	if (tun && !tfile->detached) {
		u16 index = tfile->queue_index;
		BUG_ON(index >= tun->numqueues);

		/* Fill the hole with the last queue so tfiles[] stays dense. */
		rcu_assign_pointer(tun->tfiles[index],
				   tun->tfiles[tun->numqueues - 1]);
		ntfile = rtnl_dereference(tun->tfiles[index]);
		ntfile->queue_index = index;
		rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
				   NULL);

		--tun->numqueues;
		if (clean) {
			RCU_INIT_POINTER(tfile->tun, NULL);
			sock_put(&tfile->sk);
		} else
			tun_disable_queue(tun, tfile);

		/* Wait for in-flight readers before purging; flows that
		 * pointed at the moved slot are stale, drop them. */
		synchronize_net();
		tun_flow_delete_by_queue(tun, tun->numqueues + 1);
		/* Drop read queue */
		tun_queue_purge(tfile);
		tun_set_real_num_queues(tun);
	} else if (tfile->detached && clean) {
		/* Cleaning a parked queue: take it off the disabled list. */
		tun = tun_enable_queue(tfile);
		sock_put(&tfile->sk);
	}

	if (clean) {
		if (tun && tun->numqueues == 0 && tun->numdisabled == 0) {
			netif_carrier_off(tun->dev);

			/* Last queue gone and not persistent: the device
			 * goes away too. */
			if (!(tun->flags & IFF_PERSIST) &&
			    tun->dev->reg_state == NETREG_REGISTERED)
				unregister_netdevice(tun->dev);
		}
		if (tun)
			xdp_rxq_info_unreg(&tfile->xdp_rxq);
		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
		sock_put(&tfile->sk);
	}
}

/* rtnl-taking wrapper around __tun_detach(); also emits a netdev state
 * change notification while the device pointer is still valid. */
static void tun_detach(struct tun_file *tfile, bool clean)
{
	struct tun_struct *tun;
	struct net_device *dev;

	rtnl_lock();
	tun = rtnl_dereference(tfile->tun);
	/* Grab dev before __tun_detach() may clear tfile->tun. */
	dev = tun ? tun->dev : NULL;
	__tun_detach(tfile, clean);
	if (dev)
		netdev_state_change(dev);
	rtnl_unlock();
}

/* Detach every queue (attached and disabled) from the device; called on
 * netdevice teardown, under rtnl.  First pass publishes the detach
 * (shutdown + NULL tun pointer), then after synchronize_net() the second
 * pass frees per-queue resources safely.
 */
static void tun_detach_all(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);
	struct tun_file *tfile, *tmp;
	int i, n = tun->numqueues;

	for (i = 0; i < n; i++) {
		tfile = rtnl_dereference(tun->tfiles[i]);
		BUG_ON(!tfile);
		tun_napi_disable(tfile);
		/* Wake blocked readers so they see the shutdown. */
		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
		RCU_INIT_POINTER(tfile->tun, NULL);
		--tun->numqueues;
	}
	list_for_each_entry(tfile, &tun->disabled, next) {
		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
		RCU_INIT_POINTER(tfile->tun, NULL);
	}
	BUG_ON(tun->numqueues != 0);

	/* Ensure no reader still sees the old tfile->tun pointers. */
	synchronize_net();
	for (i = 0; i < n; i++) {
		tfile = rtnl_dereference(tun->tfiles[i]);
		tun_napi_del(tfile);
		/* Drop read queue */
		tun_queue_purge(tfile);
		xdp_rxq_info_unreg(&tfile->xdp_rxq);
		sock_put(&tfile->sk);
	}
	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
		tun_enable_queue(tfile);
		tun_queue_purge(tfile);
		xdp_rxq_info_unreg(&tfile->xdp_rxq);
		sock_put(&tfile->sk);
	}
	BUG_ON(tun->numdisabled != 0);

	/* Persistent devices pinned the module at creation time. */
	if (tun->flags & IFF_PERSIST)
		module_put(THIS_MODULE);
}

789
/* Attach the queue backing @file to @tun.  Caller holds rtnl.
 * @skip_filter: do not re-attach a saved socket filter.
 * @napi/@napi_frags: enable IFF_NAPI(_FRAGS) handling for this queue.
 * @publish_tun: when false, tfile->tun is left for the caller to publish.
 * Returns 0 on success or a negative errno.
 */
static int tun_attach(struct tun_struct *tun, struct file *file,
		      bool skip_filter, bool napi, bool napi_frags,
		      bool publish_tun)
{
	struct tun_file *tfile = file->private_data;
	struct net_device *dev = tun->dev;
	int err;

	err = security_tun_dev_attach(tfile->socket.sk, tun->security);
	if (err < 0)
		goto out;

	/* Already attached somewhere (and not merely parked)? */
	err = -EINVAL;
	if (rtnl_dereference(tfile->tun) && !tfile->detached)
		goto out;

	/* Single-queue device already has its one queue. */
	err = -EBUSY;
	if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1)
		goto out;

	err = -E2BIG;
	if (!tfile->detached &&
	    tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
		goto out;

	err = 0;

	/* Re-attach the filter to persist device */
	if (!skip_filter && (tun->filter_attached == true)) {
		lock_sock(tfile->socket.sk);
		err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
		release_sock(tfile->socket.sk);
		/* NOTE(review): this bails out when sk_attach_filter()
		 * SUCCEEDED (err == 0), skipping the queue attachment
		 * below — polarity looks inverted (expect `if (err < 0)`).
		 * Confirm against upstream history before relying on it. */
		if (!err)
			goto out;
	}

	/* Size the tx ring to the device's queue length. */
	if (!tfile->detached &&
	    ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len,
			    GFP_KERNEL, tun_ptr_free)) {
		err = -ENOMEM;
		goto out;
	}

	tfile->queue_index = tun->numqueues;
	tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;

	if (tfile->detached) {
		/* Re-attach detached tfile, updating XDP queue_index */
		WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq));

		if (tfile->xdp_rxq.queue_index    != tfile->queue_index)
			tfile->xdp_rxq.queue_index = tfile->queue_index;
	} else {
		/* Setup XDP RX-queue info, for new tfile getting attached */
		err = xdp_rxq_info_reg(&tfile->xdp_rxq,
				       tun->dev, tfile->queue_index);
		if (err < 0)
			goto out;
		err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);
		if (err < 0) {
			xdp_rxq_info_unreg(&tfile->xdp_rxq);
			goto out;
		}
		err = 0;
	}

	if (tfile->detached) {
		tun_enable_queue(tfile);
	} else {
		sock_hold(&tfile->sk);
		tun_napi_init(tun, tfile, napi, napi_frags);
	}

	if (rtnl_dereference(tun->xdp_prog))
		sock_set_flag(&tfile->sk, SOCK_XDP);

	/* device is allowed to go away first, so no need to hold extra
	 * refcnt.
	 */

	/* Publish tfile->tun and tun->tfiles only after we've fully
	 * initialized tfile; otherwise we risk using half-initialized
	 * object.
	 */
	if (publish_tun)
		rcu_assign_pointer(tfile->tun, tun);
	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
	tun->numqueues++;
	tun_set_real_num_queues(tun);
out:
	return err;
}

883
/* Get the device this queue is attached to, taking a dev reference so it
 * cannot be freed under the caller.  Returns NULL when detached.
 * Balance with tun_put().
 */
static struct tun_struct *tun_get(struct tun_file *tfile)
{
	struct tun_struct *tun;

	rcu_read_lock();
	tun = rcu_dereference(tfile->tun);
	if (tun)
		dev_hold(tun->dev);
	rcu_read_unlock();

	return tun;
}

/* Release the device reference taken by tun_get(). */
static void tun_put(struct tun_struct *tun)
{
	dev_put(tun->dev);
}

901
/* TAP filtering */
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
/* Set the hash bit for a MAC address in the 64-bit multicast mask.
 * The bucket is the top 6 bits of the CRC, so n is in [0, 63].
 * Use an unsigned constant for the shift: `1 << 31` would left-shift
 * into the sign bit of a signed int, which is undefined behaviour.
 */
static void addr_hash_set(u32 *mask, const u8 *addr)
{
	int n = ether_crc(ETH_ALEN, addr) >> 26;

	mask[n >> 5] |= (1U << (n & 31));
}

/* Test the hash bit for a MAC address in the 64-bit multicast mask.
 * Returns non-zero when the bit is set.  As in addr_hash_set(), use an
 * unsigned constant so shifting by 31 is well defined.
 */
static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
{
	int n = ether_crc(ETH_ALEN, addr) >> 26;

	return mask[n >> 5] & (1U << (n & 31));
}

/* TUNSETTXFILTER: rebuild the TAP MAC filter from a userspace
 * tun_filter + address array at @arg.  The first FLT_EXACT_COUNT
 * addresses become exact-match entries; remaining multicast addresses
 * go into the hash mask; any remaining unicast address disables the
 * filter.  Returns the number of exact entries, or a negative errno.
 */
static int update_filter(struct tap_filter *filter, void __user *arg)
{
	struct { u8 u[ETH_ALEN]; } *addr;
	struct tun_filter uf;
	int err, alen, n, nexact;

	if (copy_from_user(&uf, arg, sizeof(uf)))
		return -EFAULT;

	if (!uf.count) {
		/* Disabled */
		filter->count = 0;
		return 0;
	}

	alen = ETH_ALEN * uf.count;
	addr = memdup_user(arg + sizeof(uf), alen);
	if (IS_ERR(addr))
		return PTR_ERR(addr);

	/* The filter is updated without holding any locks. Which is
	 * perfectly safe. We disable it first and in the worst
	 * case we'll accept a few undesired packets. */
	filter->count = 0;
	wmb();

	/* Use first set of addresses as an exact filter */
	for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
		memcpy(filter->addr[n], addr[n].u, ETH_ALEN);

	nexact = n;

	/* Remaining multicast addresses are hashed,
	 * unicast will leave the filter disabled. */
	memset(filter->mask, 0, sizeof(filter->mask));
	for (; n < uf.count; n++) {
		if (!is_multicast_ether_addr(addr[n].u)) {
			err = 0; /* no filter */
			goto free_addr;
		}
		addr_hash_set(filter->mask, addr[n].u);
	}

	/* For ALLMULTI just set the mask to all ones.
	 * This overrides the mask populated above. */
	if ((uf.flags & TUN_FLT_ALLMULTI))
		memset(filter->mask, ~0, sizeof(filter->mask));

	/* Now enable the filter */
	wmb();
	filter->count = nexact;

	/* Return the number of exact filters */
	err = nexact;
free_addr:
	kfree(addr);
	return err;
}

/* Returns: 0 - drop, !=0 - accept.
 * Matches the destination MAC against the exact-entry table first, then
 * (for multicast destinations) against the hashed mask.
 */
static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
{
	/* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect
	 * at this point. */
	struct ethhdr *eh = (struct ethhdr *) skb->data;
	int i;

	/* Exact match */
	for (i = 0; i < filter->count; i++)
		if (ether_addr_equal(eh->h_dest, filter->addr[i]))
			return 1;

	/* Inexact match (multicast only) */
	if (is_multicast_ether_addr(eh->h_dest))
		return addr_hash_test(filter->mask, eh->h_dest);

	return 0;
}

/*
 * Checks whether the packet is accepted or not.
 * Returns: 0 - drop, !=0 - accept.  A zero count means the filter is
 * disabled and everything is accepted.
 */
static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
{
	return filter->count ? run_filter(filter, skb) : 1;
}

Linus Torvalds's avatar
Linus Torvalds committed
1005
1006
/* Network device part of the driver */

1007
static const struct ethtool_ops tun_ethtool_ops;
Linus Torvalds's avatar
Linus Torvalds committed
1008

1009
1010
1011
/* Net device detach from fd. */
/* ndo_uninit: detach every queue before the netdevice is destroyed. */
static void tun_net_uninit(struct net_device *dev)
{
	tun_detach_all(dev);
}

Linus Torvalds's avatar
Linus Torvalds committed
1015
1016
1017
/* Net device open. */
/* ndo_open: allow transmission on all queues. */
static int tun_net_open(struct net_device *dev)
{
	netif_tx_start_all_queues(dev);

	return 0;
}

/* Net device close. */
/* ndo_stop: stop transmission on all queues. */
static int tun_net_close(struct net_device *dev)
{
	netif_tx_stop_all_queues(dev);

	return 0;
}

/* Net device start xmit */
1031
/* Transmit-path helper for the automatic (non-eBPF) steering mode:
 * on a single-queue device with RPS active, record the packet's hash in
 * the flow table so the return traffic can be steered.
 */
static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{
#ifdef CONFIG_RPS
	if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
		/* Select queue was not called for the skbuff, so we extract the
		 * RPS hash and save it into the flow_table here.
		 */
		struct tun_flow_entry *e;
		__u32 rxhash;

		rxhash = __skb_get_hash_symmetric(skb);
		e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash);
		if (e)
			tun_flow_save_rps_rxhash(e, rxhash);
	}
#endif
}

1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
/* Run the attached egress eBPF filter, if any.  Returns the (possibly
 * truncated) length the program allows, or @len unchanged when no
 * program is attached.  Runs under rcu_read_lock.
 */
static unsigned int run_ebpf_filter(struct tun_struct *tun,
				    struct sk_buff *skb,
				    int len)
{
	struct tun_prog *prog = rcu_dereference(tun->filter_prog);

	if (!prog)
		return len;

	return bpf_prog_run_clear_cb(prog->prog, skb);
}

1061
1062
1063
1064
1065
1066
/* Net device start xmit */
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);
	int txq = skb->queue_mapping;
	struct tun_file *tfile;
1067
	int len = skb->len;
1068
1069
1070
1071
1072

	rcu_read_lock();
	tfile = rcu_dereference(tun->tfiles[txq]);

	/* Drop packet if interface is not attached */
1073
	if (!tfile)
1074
1075
1076
1077
		goto drop;

	if (!rcu_dereference(tun->steering_prog))
		tun_automq_xmit(tun, skb);
1078

1079
1080
	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);

Jason Wang's avatar
Jason Wang committed
1081
1082
	BUG_ON(!tfile);

1083
1084
1085
1086
1087
1088
	/* Drop if the filter does not like it.
	 * This is a noop if the filter is disabled.
	 * Filter can be enabled only for the TAP devices. */
	if (!check_filter(&tun->txflt, skb))
		goto drop;

Jason Wang's avatar
Jason Wang committed
1089
1090
	if (tfile->socket.sk->sk_filter &&
	    sk_filter(tfile->socket.sk, skb))
1091