/*
 *  TUN - Universal TUN/TAP device driver.
 *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
 */

/*
 *  Changes:
 *
 *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
 *    Add TUNSETLINK ioctl to set the link encapsulation
 *
 *  Mark Smith <markzzzsmith@yahoo.com.au>
 *    Use eth_random_addr() for tap MAC address.
 *
 *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
 *    Fixes in packet dropping, queue length setting and queue wakeup.
 *    Increased default tx queue length.
 *    Added ethtool API.
 *    Minor cleanups
 *
 *  Daniel Podlejski <underley@underley.eu.org>
 *    Modifications for 2.3.99-pre5 kernel.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#define DRV_NAME	"tun"
#define DRV_VERSION	"1.6"
#define DRV_DESCRIPTION	"Universal TUN/TAP device driver"
#define DRV_COPYRIGHT	"(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/major.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fcntl.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/miscdevice.h>
#include <linux/ethtool.h>
#include <linux/rtnetlink.h>
#include <linux/compat.h>
#include <linux/if.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_tun.h>
#include <linux/if_vlan.h>
#include <linux/crc32.h>
#include <linux/nsproxy.h>
#include <linux/virtio_net.h>
#include <linux/rcupdate.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <net/xdp.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/skb_array.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/mutex.h>

#include <linux/uaccess.h>
#include <linux/proc_fs.h>

static void tun_default_link_ksettings(struct net_device *dev,
				       struct ethtool_link_ksettings *cmd);

/* Uncomment to enable debugging */
/* #define TUN_DEBUG 1 */

#ifdef TUN_DEBUG
static int debug;

#define tun_debug(level, tun, fmt, args...)			\
do {								\
	if (tun->debug)						\
		netdev_printk(level, tun->dev, fmt, ##args);	\
} while (0)
#define DBG1(level, fmt, args...)				\
do {								\
	if (debug == 2)						\
		printk(level fmt, ##args);			\
} while (0)
#else
#define tun_debug(level, tun, fmt, args...)			\
do {								\
	if (0)							\
		netdev_printk(level, tun->dev, fmt, ##args);	\
} while (0)
#define DBG1(level, fmt, args...)				\
do {								\
	if (0)							\
		printk(level fmt, ##args);			\
} while (0)
#endif

#define TUN_HEADROOM 256
#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* TUN device flags */

/* IFF_ATTACH_QUEUE is never stored in device flags,
 * overload it to mean fasync when stored there.
 */
#define TUN_FASYNC	IFF_ATTACH_QUEUE
/* High bits in flags field are unused. */
#define TUN_VNET_LE     0x80000000
#define TUN_VNET_BE     0x40000000

#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
		      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)

#define GOODCOPY_LEN 128

#define FLT_EXACT_COUNT 8
struct tap_filter {
	unsigned int    count;    /* Number of addrs. Zero means disabled */
	u32             mask[2];  /* Mask of the hashed addrs */
	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
};

/* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal
 * to the max number of VCPUs in a guest. */
#define MAX_TAP_QUEUES 256
#define MAX_TAP_FLOWS  4096

#define TUN_FLOW_EXPIRE (3 * HZ)

struct tun_pcpu_stats {
	u64 rx_packets;
	u64 rx_bytes;
	u64 tx_packets;
	u64 tx_bytes;
	struct u64_stats_sync syncp;
	u32 rx_dropped;
	u32 tx_dropped;
	u32 rx_frame_errors;
};
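
/* Illustrative sketch, not part of the original source: a typical hot-path
 * update of these per-CPU counters (with a hypothetical frame length "len")
 * looks like
 *
 *	struct tun_pcpu_stats *stats = this_cpu_ptr(tun->pcpu_stats);
 *
 *	u64_stats_update_begin(&stats->syncp);
 *	stats->rx_packets++;
 *	stats->rx_bytes += len;
 *	u64_stats_update_end(&stats->syncp);
 *
 * while the u32 drop/error counters are plain counters bumped outside the
 * syncp sequence.
 */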

/* A tun_file connects an open character device to a tuntap netdevice. It
 * also contains all socket related structures (except sock_fprog and tap_filter)
 * to serve as one transmit queue for the tuntap device. The sock_fprog and
 * tap_filter are kept in tun_struct since they are used for filtering on the
 * netdevice, not for a specific queue (at least I didn't see the requirement
 * for this).
 *
 * RCU usage:
 * The tun_file and tun_struct are loosely coupled, the pointer from one to the
 * other can only be read while rcu_read_lock or rtnl_lock is held.
 */
struct tun_file {
	struct sock sk;
	struct socket socket;
	struct socket_wq wq;
	struct tun_struct __rcu *tun;
	struct fasync_struct *fasync;
	/* only used for fasync */
	unsigned int flags;
	union {
		u16 queue_index;
		unsigned int ifindex;
	};
	struct napi_struct napi;
	bool napi_enabled;
	struct mutex napi_mutex;	/* Protects access to the above napi */
	struct list_head next;
	struct tun_struct *detached;
	struct ptr_ring tx_ring;
	struct xdp_rxq_info xdp_rxq;
};

struct tun_flow_entry {
	struct hlist_node hash_link;
	struct rcu_head rcu;
	struct tun_struct *tun;

	u32 rxhash;
	u32 rps_rxhash;
	int queue_index;
	unsigned long updated;
};

#define TUN_NUM_FLOW_ENTRIES 1024
#define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1)

struct tun_prog {
	struct rcu_head rcu;
	struct bpf_prog *prog;
};

/* Since the socket was moved to tun_file, to preserve the behavior of a persist
 * device, the socket filter, sndbuf and vnet header size are restored when the
 * file is attached to a persist device.
 */
struct tun_struct {
	struct tun_file __rcu	*tfiles[MAX_TAP_QUEUES];
	unsigned int            numqueues;
	unsigned int 		flags;
	kuid_t			owner;
	kgid_t			group;

	struct net_device	*dev;
	netdev_features_t	set_features;
#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
			  NETIF_F_TSO6)

	int			align;
	int			vnet_hdr_sz;
	int			sndbuf;
	struct tap_filter	txflt;
	struct sock_fprog	fprog;
	/* protected by rtnl lock */
	bool			filter_attached;
#ifdef TUN_DEBUG
	int debug;
#endif
	spinlock_t lock;
	struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
	struct timer_list flow_gc_timer;
	unsigned long ageing_time;
	unsigned int numdisabled;
	struct list_head disabled;
	void *security;
	u32 flow_count;
	u32 rx_batched;
	struct tun_pcpu_stats __percpu *pcpu_stats;
	struct bpf_prog __rcu *xdp_prog;
	struct tun_prog __rcu *steering_prog;
	struct tun_prog __rcu *filter_prog;
	struct ethtool_link_ksettings link_ksettings;
};

struct veth {
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
};

bool tun_is_xdp_frame(void *ptr)
{
	return (unsigned long)ptr & TUN_XDP_FLAG;
}
EXPORT_SYMBOL(tun_is_xdp_frame);

void *tun_xdp_to_ptr(void *ptr)
{
	return (void *)((unsigned long)ptr | TUN_XDP_FLAG);
}
EXPORT_SYMBOL(tun_xdp_to_ptr);

void *tun_ptr_to_xdp(void *ptr)
{
	return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG);
}
EXPORT_SYMBOL(tun_ptr_to_xdp);
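
/* Illustrative sketch, not part of the original source: entries in the
 * per-queue ptr_ring are either sk_buff pointers or xdp_frame pointers tagged
 * with TUN_XDP_FLAG, so a consumer untags before use (NULL check omitted,
 * handle_xdp_frame()/handle_skb() are hypothetical helpers):
 *
 *	void *ptr = ptr_ring_consume(&tfile->tx_ring);
 *
 *	if (tun_is_xdp_frame(ptr))
 *		handle_xdp_frame(tun_ptr_to_xdp(ptr));
 *	else
 *		handle_skb((struct sk_buff *)ptr);
 */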

static int tun_napi_receive(struct napi_struct *napi, int budget)
{
	struct tun_file *tfile = container_of(napi, struct tun_file, napi);
	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
	struct sk_buff_head process_queue;
	struct sk_buff *skb;
	int received = 0;

	__skb_queue_head_init(&process_queue);

	spin_lock(&queue->lock);
	skb_queue_splice_tail_init(queue, &process_queue);
	spin_unlock(&queue->lock);

	while (received < budget && (skb = __skb_dequeue(&process_queue))) {
		napi_gro_receive(napi, skb);
		++received;
	}

	if (!skb_queue_empty(&process_queue)) {
		spin_lock(&queue->lock);
		skb_queue_splice(&process_queue, queue);
		spin_unlock(&queue->lock);
	}

	return received;
}

static int tun_napi_poll(struct napi_struct *napi, int budget)
{
	unsigned int received;

	received = tun_napi_receive(napi, budget);

	if (received < budget)
		napi_complete_done(napi, received);

	return received;
}

static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
			  bool napi_en)
{
	tfile->napi_enabled = napi_en;
	if (napi_en) {
		netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
			       NAPI_POLL_WEIGHT);
		napi_enable(&tfile->napi);
	}
}

static void tun_napi_disable(struct tun_file *tfile)
{
	if (tfile->napi_enabled)
		napi_disable(&tfile->napi);
}

static void tun_napi_del(struct tun_file *tfile)
{
	if (tfile->napi_enabled)
		netif_napi_del(&tfile->napi);
}

static bool tun_napi_frags_enabled(const struct tun_struct *tun)
{
	return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
}

#ifdef CONFIG_TUN_VNET_CROSS_LE
static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
{
	return tun->flags & TUN_VNET_BE ? false :
		virtio_legacy_is_little_endian();
}

static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
{
	int be = !!(tun->flags & TUN_VNET_BE);

	if (put_user(be, argp))
		return -EFAULT;

	return 0;
}

static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
{
	int be;

	if (get_user(be, argp))
		return -EFAULT;

	if (be)
		tun->flags |= TUN_VNET_BE;
	else
		tun->flags &= ~TUN_VNET_BE;

	return 0;
}
#else
static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
{
	return virtio_legacy_is_little_endian();
}

static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
{
	return -EINVAL;
}

static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
{
	return -EINVAL;
}
#endif /* CONFIG_TUN_VNET_CROSS_LE */

static inline bool tun_is_little_endian(struct tun_struct *tun)
{
	return tun->flags & TUN_VNET_LE ||
		tun_legacy_is_little_endian(tun);
}

static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
{
	return __virtio16_to_cpu(tun_is_little_endian(tun), val);
}

static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val)
{
	return __cpu_to_virtio16(tun_is_little_endian(tun), val);
}
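
/* Illustrative note (assumption, not part of the original source): the
 * __virtio16 fields of the vnet header exchanged with userspace go through
 * these helpers, e.g. a read of the GSO size would look like
 *
 *	u16 gso_size = tun16_to_cpu(tun, gso.gso_size);
 *
 * so TUN_VNET_LE, TUN_VNET_BE and the legacy native-endian case are all
 * handled in one place.
 */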

static inline u32 tun_hashfn(u32 rxhash)
{
	return rxhash & TUN_MASK_FLOW_ENTRIES;
}

static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
{
	struct tun_flow_entry *e;

	hlist_for_each_entry_rcu(e, head, hash_link) {
		if (e->rxhash == rxhash)
			return e;
	}
	return NULL;
}

static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
					      struct hlist_head *head,
					      u32 rxhash, u16 queue_index)
{
	struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC);

	if (e) {
		tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n",
			  rxhash, queue_index);
		e->updated = jiffies;
		e->rxhash = rxhash;
		e->rps_rxhash = 0;
		e->queue_index = queue_index;
		e->tun = tun;
		hlist_add_head_rcu(&e->hash_link, head);
		++tun->flow_count;
	}
	return e;
}

static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
{
	tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
		  e->rxhash, e->queue_index);
	hlist_del_rcu(&e->hash_link);
	kfree_rcu(e, rcu);
	--tun->flow_count;
}

static void tun_flow_flush(struct tun_struct *tun)
{
	int i;

	spin_lock_bh(&tun->lock);
	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
		struct tun_flow_entry *e;
		struct hlist_node *n;

		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
			tun_flow_delete(tun, e);
	}
	spin_unlock_bh(&tun->lock);
}

static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
{
	int i;

	spin_lock_bh(&tun->lock);
	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
		struct tun_flow_entry *e;
		struct hlist_node *n;

		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
			if (e->queue_index == queue_index)
				tun_flow_delete(tun, e);
		}
	}
	spin_unlock_bh(&tun->lock);
}

static void tun_flow_cleanup(struct timer_list *t)
{
	struct tun_struct *tun = from_timer(tun, t, flow_gc_timer);
	unsigned long delay = tun->ageing_time;
	unsigned long next_timer = jiffies + delay;
	unsigned long count = 0;
	int i;

	tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n");

	spin_lock(&tun->lock);
	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
		struct tun_flow_entry *e;
		struct hlist_node *n;

		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
			unsigned long this_timer;

			this_timer = e->updated + delay;
			if (time_before_eq(this_timer, jiffies)) {
				tun_flow_delete(tun, e);
				continue;
			}
			count++;
			if (time_before(this_timer, next_timer))
				next_timer = this_timer;
		}
	}

	if (count)
		mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
	spin_unlock(&tun->lock);
}

static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
			    struct tun_file *tfile)
{
	struct hlist_head *head;
	struct tun_flow_entry *e;
	unsigned long delay = tun->ageing_time;
	u16 queue_index = tfile->queue_index;

	if (!rxhash)
		return;
	else
		head = &tun->flows[tun_hashfn(rxhash)];

	rcu_read_lock();

	e = tun_flow_find(head, rxhash);
	if (likely(e)) {
		/* TODO: keep queueing to old queue until it's empty? */
		e->queue_index = queue_index;
		e->updated = jiffies;
		sock_rps_record_flow_hash(e->rps_rxhash);
	} else {
		spin_lock_bh(&tun->lock);
		if (!tun_flow_find(head, rxhash) &&
		    tun->flow_count < MAX_TAP_FLOWS)
			tun_flow_create(tun, head, rxhash, queue_index);

		if (!timer_pending(&tun->flow_gc_timer))
			mod_timer(&tun->flow_gc_timer,
				  round_jiffies_up(jiffies + delay));
		spin_unlock_bh(&tun->lock);
	}

	rcu_read_unlock();
}

/**
 * Save the hash received in the stack receive path and update the
 * flow_hash table accordingly.
 */
static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
{
	if (unlikely(e->rps_rxhash != hash))
		e->rps_rxhash = hash;
}

/* We try to identify a flow through its rxhash first. The reason that
 * we do not check rxq no. is because some cards (e.g. 82599) choose
 * the rxq based on the txq where the last packet of the flow comes. As
 * the userspace application moves between processors, we may get a
 * different rxq no. here. If we cannot get the rxhash, then we would
 * hope the rxq no. may help here.
 */
static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
	struct tun_flow_entry *e;
	u32 txq = 0;
	u32 numqueues = 0;

	numqueues = READ_ONCE(tun->numqueues);

	txq = __skb_get_hash_symmetric(skb);
	if (txq) {
		e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
		if (e) {
			tun_flow_save_rps_rxhash(e, txq);
			txq = e->queue_index;
		} else
			/* use multiply and shift instead of expensive divide */
			txq = ((u64)txq * numqueues) >> 32;
	} else if (likely(skb_rx_queue_recorded(skb))) {
		txq = skb_get_rx_queue(skb);
		while (unlikely(txq >= numqueues))
			txq -= numqueues;
	}

	return txq;
}
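
/* Worked example, not part of the original source: the multiply-and-shift
 * above maps a 32-bit hash uniformly onto [0, numqueues) without a divide.
 * With numqueues == 4 and txq == 0x80000000, ((u64)0x80000000 * 4) >> 32 == 2,
 * so the flow is steered to queue 2.
 */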

static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
	struct tun_prog *prog;
	u16 ret = 0;

	prog = rcu_dereference(tun->steering_prog);
	if (prog)
		ret = bpf_prog_run_clear_cb(prog->prog, skb);

	return ret % tun->numqueues;
}

static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
			    struct net_device *sb_dev,
			    select_queue_fallback_t fallback)
{
	struct tun_struct *tun = netdev_priv(dev);
	u16 ret;

	rcu_read_lock();
	if (rcu_dereference(tun->steering_prog))
		ret = tun_ebpf_select_queue(tun, skb);
	else
		ret = tun_automq_select_queue(tun, skb);
	rcu_read_unlock();

	return ret;
}

static inline bool tun_not_capable(struct tun_struct *tun)
{
	const struct cred *cred = current_cred();
	struct net *net = dev_net(tun->dev);

	return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
		  (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
		!ns_capable(net->user_ns, CAP_NET_ADMIN);
}

static void tun_set_real_num_queues(struct tun_struct *tun)
{
	netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
	netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
}

static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
{
	tfile->detached = tun;
	list_add_tail(&tfile->next, &tun->disabled);
	++tun->numdisabled;
}

static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
{
	struct tun_struct *tun = tfile->detached;

	tfile->detached = NULL;
	list_del_init(&tfile->next);
	--tun->numdisabled;
	return tun;
}

void tun_ptr_free(void *ptr)
{
	if (!ptr)
		return;
	if (tun_is_xdp_frame(ptr)) {
		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

		xdp_return_frame(xdpf);
	} else {
		__skb_array_destroy_skb(ptr);
	}
}
EXPORT_SYMBOL_GPL(tun_ptr_free);

static void tun_queue_purge(struct tun_file *tfile)
{
	void *ptr;

	while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
		tun_ptr_free(ptr);

	skb_queue_purge(&tfile->sk.sk_write_queue);
	skb_queue_purge(&tfile->sk.sk_error_queue);
}

static void __tun_detach(struct tun_file *tfile, bool clean)
{
	struct tun_file *ntfile;
	struct tun_struct *tun;

	tun = rtnl_dereference(tfile->tun);

	if (tun && clean) {
		tun_napi_disable(tfile);
		tun_napi_del(tfile);
	}

	if (tun && !tfile->detached) {
		u16 index = tfile->queue_index;
		BUG_ON(index >= tun->numqueues);

		rcu_assign_pointer(tun->tfiles[index],
				   tun->tfiles[tun->numqueues - 1]);
		ntfile = rtnl_dereference(tun->tfiles[index]);
		ntfile->queue_index = index;

		--tun->numqueues;
		if (clean) {
			RCU_INIT_POINTER(tfile->tun, NULL);
			sock_put(&tfile->sk);
		} else
			tun_disable_queue(tun, tfile);

		synchronize_net();
		tun_flow_delete_by_queue(tun, tun->numqueues + 1);
		/* Drop read queue */
		tun_queue_purge(tfile);
		tun_set_real_num_queues(tun);
	} else if (tfile->detached && clean) {
		tun = tun_enable_queue(tfile);
		sock_put(&tfile->sk);
	}

	if (clean) {
		if (tun && tun->numqueues == 0 && tun->numdisabled == 0) {
			netif_carrier_off(tun->dev);

			if (!(tun->flags & IFF_PERSIST) &&
			    tun->dev->reg_state == NETREG_REGISTERED)
				unregister_netdevice(tun->dev);
		}
		if (tun)
			xdp_rxq_info_unreg(&tfile->xdp_rxq);
		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
		sock_put(&tfile->sk);
	}
}

static void tun_detach(struct tun_file *tfile, bool clean)
{
	struct tun_struct *tun;
	struct net_device *dev;

	rtnl_lock();
	tun = rtnl_dereference(tfile->tun);
	dev = tun ? tun->dev : NULL;
	__tun_detach(tfile, clean);
	if (dev)
		netdev_state_change(dev);
	rtnl_unlock();
}

static void tun_detach_all(struct net_device *dev)
{
	struct tun_struct *tun = netdev_priv(dev);
	struct tun_file *tfile, *tmp;
	int i, n = tun->numqueues;

	for (i = 0; i < n; i++) {
		tfile = rtnl_dereference(tun->tfiles[i]);
		BUG_ON(!tfile);
		tun_napi_disable(tfile);
		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
		RCU_INIT_POINTER(tfile->tun, NULL);
		--tun->numqueues;
	}
	list_for_each_entry(tfile, &tun->disabled, next) {
		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
		RCU_INIT_POINTER(tfile->tun, NULL);
	}
	BUG_ON(tun->numqueues != 0);

	synchronize_net();
	for (i = 0; i < n; i++) {
		tfile = rtnl_dereference(tun->tfiles[i]);
		tun_napi_del(tfile);
		/* Drop read queue */
		tun_queue_purge(tfile);
		xdp_rxq_info_unreg(&tfile->xdp_rxq);
		sock_put(&tfile->sk);
	}
	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
		tun_enable_queue(tfile);
		tun_queue_purge(tfile);
		xdp_rxq_info_unreg(&tfile->xdp_rxq);
		sock_put(&tfile->sk);
	}
	BUG_ON(tun->numdisabled != 0);

	if (tun->flags & IFF_PERSIST)
		module_put(THIS_MODULE);
}

static int tun_attach(struct tun_struct *tun, struct file *file,
		      bool skip_filter, bool napi)
{
	struct tun_file *tfile = file->private_data;
	struct net_device *dev = tun->dev;
	int err;

	err = security_tun_dev_attach(tfile->socket.sk, tun->security);
	if (err < 0)
		goto out;

	err = -EINVAL;
	if (rtnl_dereference(tfile->tun) && !tfile->detached)
		goto out;

	err = -EBUSY;
	if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1)
		goto out;

	err = -E2BIG;
	if (!tfile->detached &&
	    tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
		goto out;

	err = 0;

	/* Re-attach the filter to persist device */
	if (!skip_filter && (tun->filter_attached == true)) {
		lock_sock(tfile->socket.sk);
		err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
		release_sock(tfile->socket.sk);
		if (!err)
			goto out;
	}

	if (!tfile->detached &&
	    ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len,
			    GFP_KERNEL, tun_ptr_free)) {
		err = -ENOMEM;
		goto out;
	}

	tfile->queue_index = tun->numqueues;
	tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;

	if (tfile->detached) {
		/* Re-attach detached tfile, updating XDP queue_index */
		WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq));

		if (tfile->xdp_rxq.queue_index    != tfile->queue_index)
			tfile->xdp_rxq.queue_index = tfile->queue_index;
	} else {
		/* Setup XDP RX-queue info, for new tfile getting attached */
		err = xdp_rxq_info_reg(&tfile->xdp_rxq,
				       tun->dev, tfile->queue_index);
		if (err < 0)
			goto out;
		err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,
						 MEM_TYPE_PAGE_SHARED, NULL);
		if (err < 0) {
			xdp_rxq_info_unreg(&tfile->xdp_rxq);
			goto out;
		}
		err = 0;
	}

	rcu_assign_pointer(tfile->tun, tun);
	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
	tun->numqueues++;

	if (tfile->detached) {
		tun_enable_queue(tfile);
	} else {
		sock_hold(&tfile->sk);
		tun_napi_init(tun, tfile, napi);
	}

	tun_set_real_num_queues(tun);

	/* device is allowed to go away first, so no need to hold extra
	 * refcnt.
	 */

out:
	return err;
}

static struct tun_struct *tun_get(struct tun_file *tfile)
{
	struct tun_struct *tun;

	rcu_read_lock();
	tun = rcu_dereference(tfile->tun);
	if (tun)
		dev_hold(tun->dev);
	rcu_read_unlock();

	return tun;
}

static void tun_put(struct tun_struct *tun)
{
	dev_put(tun->dev);
}

/* TAP filtering */
static void addr_hash_set(u32 *mask, const u8 *addr)
{
	int n = ether_crc(ETH_ALEN, addr) >> 26;
	mask[n >> 5] |= (1 << (n & 31));
}

static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
{
	int n = ether_crc(ETH_ALEN, addr) >> 26;
	return mask[n >> 5] & (1 << (n & 31));
}
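
/* Illustrative note, not part of the original source: ether_crc() >> 26
 * leaves a 6-bit bucket number n in [0, 63]; bit (n & 31) of mask[n >> 5]
 * marks that bucket.  A hypothetical address hashing to n == 40 therefore
 * sets bit 8 of mask[1].
 */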

static int update_filter(struct tap_filter *filter, void __user *arg)
{
	struct { u8 u[ETH_ALEN]; } *addr;
	struct tun_filter uf;
	int err, alen, n, nexact;

	if (copy_from_user(&uf, arg, sizeof(uf)))
		return -EFAULT;

	if (!uf.count) {
		/* Disabled */
		filter->count = 0;
		return 0;
	}

	alen = ETH_ALEN * uf.count;
	addr = memdup_user(arg + sizeof(uf), alen);
	if (IS_ERR(addr))
		return PTR_ERR(addr);

	/* The filter is updated without holding any locks, which is
	 * perfectly safe. We disable it first, so in the worst
	 * case we'll accept a few undesired packets. */
	filter->count = 0;
	wmb();

	/* Use first set of addresses as an exact filter */
	for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
		memcpy(filter->addr[n], addr[n].u, ETH_ALEN);

	nexact = n;

	/* Remaining multicast addresses are hashed,
	 * unicast will leave the filter disabled. */
	memset(filter->mask, 0, sizeof(filter->mask));
	for (; n < uf.count; n++) {
		if (!is_multicast_ether_addr(addr[n].u)) {
			err = 0; /* no filter */
			goto free_addr;
		}
		addr_hash_set(filter->mask, addr[n].u);
	}

	/* For ALLMULTI just set the mask to all ones.
	 * This overrides the mask populated above. */
	if ((uf.flags & TUN_FLT_ALLMULTI))
		memset(filter->mask, ~0, sizeof(filter->mask));

	/* Now enable the filter */
	wmb();
	filter->count = nexact;

	/* Return the number of exact filters */
	err = nexact;
free_addr:
	kfree(addr);
	return err;
}
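
/* Illustrative userspace sketch (assumption, not part of the original
 * source): update_filter() is normally reached via the TUNSETTXFILTER ioctl,
 * whose argument is a struct tun_filter immediately followed by the MAC
 * addresses, e.g.
 *
 *	struct {
 *		struct tun_filter uf;
 *		unsigned char addrs[2][ETH_ALEN];
 *	} req = { .uf = { .flags = 0, .count = 2 } };
 *
 *	memcpy(req.addrs[0], mac0, ETH_ALEN);
 *	memcpy(req.addrs[1], mac1, ETH_ALEN);
 *	ioctl(tap_fd, TUNSETTXFILTER, &req);
 *
 * where tap_fd, mac0 and mac1 are hypothetical.
 */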

/* Returns: 0 - drop, !=0 - accept */
static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
{
	/* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect
	 * at this point. */
	struct ethhdr *eh = (struct ethhdr *) skb->data;
	int i;

	/* Exact match */
	for (i = 0; i < filter->count; i++)
		if (ether_addr_equal(eh->h_dest, filter->addr[i]))
			return 1;

	/* Inexact match (multicast only) */
	if (is_multicast_ether_addr(eh->h_dest))
		return addr_hash_test(filter->mask, eh->h_dest);

	return 0;
}

/*
 * Checks whether the packet is accepted or not.
 * Returns: 0 - drop, !=0 - accept
 */
static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
{
	if (!filter->count)
		return 1;

	return run_filter(filter, skb);
}