/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The User Datagram Protocol (UDP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 * Fixes:
 *		Alan Cox	:	verify_area() calls
 *		Alan Cox	: 	stopped close while in use off icmp
 *					messages. Not a fix but a botch that
 *					for udp at least is 'valid'.
 *		Alan Cox	:	Fixed icmp handling properly
 *		Alan Cox	: 	Correct error for oversized datagrams
 *		Alan Cox	:	Tidied select() semantics.
 *		Alan Cox	:	udp_err() fixed properly, also now
 *					select and read wake correctly on errors
 *		Alan Cox	:	udp_send verify_area moved to avoid mem leak
 *		Alan Cox	:	UDP can count its memory
 *		Alan Cox	:	send to an unknown connection causes
 *					an ECONNREFUSED off the icmp, but
 *					does NOT close.
 *		Alan Cox	:	Switched to new sk_buff handlers. No more backlog!
 *		Alan Cox	:	Using generic datagram code. Even smaller and the PEEK
 *					bug no longer crashes it.
 *		Fred Van Kempen	: 	Net2e support for sk->broadcast.
 *		Alan Cox	:	Uses skb_free_datagram
 *		Alan Cox	:	Added get/set sockopt support.
 *		Alan Cox	:	Broadcasting without option set returns EACCES.
 *		Alan Cox	:	No wakeup calls. Instead we now use the callbacks.
 *		Alan Cox	:	Use ip_tos and ip_ttl
 *		Alan Cox	:	SNMP Mibs
 *		Alan Cox	:	MSG_DONTROUTE, and 0.0.0.0 support.
 *		Matt Dillon	:	UDP length checks.
 *		Alan Cox	:	Smarter af_inet used properly.
 *		Alan Cox	:	Use new kernel side addressing.
 *		Alan Cox	:	Incorrect return on truncated datagram receive.
 *	Arnt Gulbrandsen 	:	New udp_send and stuff
 *		Alan Cox	:	Cache last socket
 *		Alan Cox	:	Route cache
 *		Jon Peatfield	:	Minor efficiency fix to sendto().
 *		Mike Shaver	:	RFC1122 checks.
 *		Alan Cox	:	Nonblocking error fix.
 *	Willy Konynenberg	:	Transparent proxying support.
 *		Mike McLagan	:	Routing by source
 *		David S. Miller	:	New socket lookup architecture.
 *					Last socket cache retained as it
 *					does have a high hit rate.
 *		Olaf Kirch	:	Don't linearise iovec on sendmsg.
 *		Andi Kleen	:	Some cleanups, cache destination entry
 *					for connect.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Melvin Smith	:	Check msg_name not msg_namelen in sendto(),
 *					return ENOTCONN for unconnected sockets (POSIX)
 *		Janos Farkas	:	don't deliver multi/broadcasts to a different
 *					bound-to-device socket
 *	Hirokazu Takahashi	:	HW checksumming for outgoing UDP
 *					datagrams.
 *	Hirokazu Takahashi	:	sendfile() on UDP works now.
 *		Arnaldo C. Melo :	convert /proc/net/udp to seq_file
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov:		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 *	Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
 *	James Chapman		:	Add L2TP encapsulation type.
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
79

80
81
#define pr_fmt(fmt) "UDP: " fmt

82
#include <linux/uaccess.h>
Linus Torvalds's avatar
Linus Torvalds committed
83
#include <asm/ioctls.h>
84
#include <linux/memblock.h>
85
86
#include <linux/highmem.h>
#include <linux/swap.h>
Linus Torvalds's avatar
Linus Torvalds committed
87
88
89
90
91
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/sockios.h>
92
#include <linux/igmp.h>
93
#include <linux/inetdevice.h>
Linus Torvalds's avatar
Linus Torvalds committed
94
95
96
97
98
99
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
100
#include <linux/slab.h>
101
#include <net/tcp_states.h>
Linus Torvalds's avatar
Linus Torvalds committed
102
103
104
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
105
#include <net/net_namespace.h>
Linus Torvalds's avatar
Linus Torvalds committed
106
#include <net/icmp.h>
Shawn Bohrer's avatar
Shawn Bohrer committed
107
#include <net/inet_hashtables.h>
108
#include <net/ip_tunnels.h>
Linus Torvalds's avatar
Linus Torvalds committed
109
110
111
#include <net/route.h>
#include <net/checksum.h>
#include <net/xfrm.h>
112
#include <trace/events/udp.h>
113
#include <linux/static_key.h>
114
#include <trace/events/skb.h>
115
#include <net/busy_poll.h>
116
#include "udp_impl.h"
117
#include <net/sock_reuseport.h>
Eric Dumazet's avatar
Eric Dumazet committed
118
#include <net/addrconf.h>
119
#include <net/udp_tunnel.h>
Linus Torvalds's avatar
Linus Torvalds committed
120

121
struct udp_table udp_table __read_mostly;
122
EXPORT_SYMBOL(udp_table);
Linus Torvalds's avatar
Linus Torvalds committed
123

Eric Dumazet's avatar
Eric Dumazet committed
124
long sysctl_udp_mem[3] __read_mostly;
Hideo Aoki's avatar
Hideo Aoki committed
125
EXPORT_SYMBOL(sysctl_udp_mem);
Eric Dumazet's avatar
Eric Dumazet committed
126

Eric Dumazet's avatar
Eric Dumazet committed
127
atomic_long_t udp_memory_allocated;
Hideo Aoki's avatar
Hideo Aoki committed
128
129
EXPORT_SYMBOL(udp_memory_allocated);

130
131
#define MAX_UDP_PORTS 65536
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
132

133
134
135
136
137
138
139
140
141
142
143
/*
 * Tell whether a lookup for @skb has to match the bound device exactly.
 *
 * Only ever true when CONFIG_NET_L3_MASTER_DEV is enabled, the per-netns
 * udp_l3mdev_accept sysctl is off, and @skb (if any) carries the L3 slave
 * flag in its IP control block.
 *
 * IPCB reference means this can not be used from early demux.
 */
static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
	if (net->ipv4.sysctl_udp_l3mdev_accept)
		return false;
	if (!skb)
		return false;
	return ipv4_l3mdev_skb(IPCB(skb)->flags);
#else
	return false;
#endif
}

144
/*
 * Walk a primary hash chain looking for sockets that conflict with @sk.
 *
 * @net:    network namespace the candidate sockets must belong to
 * @num:    port being tested; only compared when @bitmap is NULL
 * @hslot:  primary hash slot whose chain is scanned
 * @bitmap: NULL to test the single port @num; otherwise a bitmap in which
 *          the port of every conflicting socket, shifted right by @log,
 *          gets marked
 * @sk:     socket asking for the port
 * @log:    right shift that turns a port number into a @bitmap index
 *
 * A chain entry is a candidate when it lives in @net, is not @sk itself,
 * has the tested port (single-port mode only), at least one of the two
 * sockets lacks sk_reuse, the bound devices are compatible and the receive
 * addresses compare equal.  A candidate is tolerated when both sockets have
 * sk_reuseport set, @sk->sk_reuseport_cb is still NULL and both sockets are
 * owned by the same uid.
 *
 * Single-port mode returns at the first candidate: 0 if it is tolerated,
 * 1 if it conflicts.  Bitmap mode records conflicts and always returns 0.
 */
static int udp_lib_lport_inuse(struct net *net, __u16 num,
			       const struct udp_hslot *hslot,
			       unsigned long *bitmap,
			       struct sock *sk, unsigned int log)
{
	struct sock *sk2;
	kuid_t uid = sock_i_uid(sk);

	sk_for_each(sk2, &hslot->head) {
		if (net_eq(sock_net(sk2), net) &&
		    sk2 != sk &&
		    (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
		    (!sk2->sk_reuse || !sk->sk_reuse) &&
		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
		    inet_rcv_saddr_equal(sk, sk2, true)) {
			if (sk2->sk_reuseport && sk->sk_reuseport &&
			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
			    uid_eq(uid, sock_i_uid(sk2))) {
				if (!bitmap)
					return 0;
			} else {
				if (!bitmap)
					return 1;
				__set_bit(udp_sk(sk2)->udp_port_hash >> log,
					  bitmap);
			}
		}
	}
	return 0;
}

Eric Dumazet's avatar
Eric Dumazet committed
176
177
178
179
180
/*
 * Check a secondary (port + address) hash chain for a socket that conflicts
 * with @sk binding to port @num.  The chain is walked under hslot2->lock and
 * the walk stops at the first candidate socket.
 *
 * Returns 1 if that candidate conflicts, 0 if it is a compatible reuseport
 * peer or if no candidate was found.
 *
 * Note: we still hold spinlock of primary hash chain, so no other writer
 * can insert/delete a socket with local_port == num
 */
static int udp_lib_lport_inuse2(struct net *net, __u16 num,
				struct udp_hslot *hslot2,
				struct sock *sk)
{
	struct sock *sk2;
	kuid_t uid = sock_i_uid(sk);
	int res = 0;

	spin_lock(&hslot2->lock);
	udp_portaddr_for_each_entry(sk2, &hslot2->head) {
		if (net_eq(sock_net(sk2), net) &&
		    sk2 != sk &&
		    (udp_sk(sk2)->udp_port_hash == num) &&
		    (!sk2->sk_reuse || !sk->sk_reuse) &&
		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
		    inet_rcv_saddr_equal(sk, sk2, true)) {
			/* Same tolerance rule as udp_lib_lport_inuse(). */
			if (sk2->sk_reuseport && sk->sk_reuseport &&
			    !rcu_access_pointer(sk->sk_reuseport_cb) &&
			    uid_eq(uid, sock_i_uid(sk2))) {
				res = 0;
			} else {
				res = 1;
			}
			break;
		}
	}
	spin_unlock(&hslot2->lock);
	return res;
}

211
static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
212
213
214
215
216
{
	struct net *net = sock_net(sk);
	kuid_t uid = sock_i_uid(sk);
	struct sock *sk2;

217
	sk_for_each(sk2, &hslot->head) {
218
219
220
221
222
223
224
		if (net_eq(sock_net(sk2), net) &&
		    sk2 != sk &&
		    sk2->sk_family == sk->sk_family &&
		    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
		    (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
		    (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
		    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
225
		    inet_rcv_saddr_equal(sk, sk2, false)) {
226
227
			return reuseport_add_sock(sk, sk2,
						  inet_rcv_saddr_any(sk));
228
229
230
		}
	}

231
	return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
232
233
}

234
/**
235
 *  udp_lib_get_port  -  UDP/-Lite port lookup for IPv4 and IPv6
236
237
238
 *
 *  @sk:          socket struct in question
 *  @snum:        port number to look up
Lucas De Marchi's avatar
Lucas De Marchi committed
239
 *  @hash2_nulladdr: AF-dependent hash value in secondary hash chains,
Eric Dumazet's avatar
Eric Dumazet committed
240
 *                   with NULL address
241
 */
242
int udp_lib_get_port(struct sock *sk, unsigned short snum,
Eric Dumazet's avatar
Eric Dumazet committed
243
		     unsigned int hash2_nulladdr)
244
{
245
	struct udp_hslot *hslot, *hslot2;
246
	struct udp_table *udptable = sk->sk_prot->h.udp_table;
247
	int    error = 1;
248
	struct net *net = sock_net(sk);
Linus Torvalds's avatar
Linus Torvalds committed
249

250
	if (!snum) {
Eric Dumazet's avatar
Eric Dumazet committed
251
		int low, high, remaining;
252
		unsigned int rand;
253
254
		unsigned short first, last;
		DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
255

256
		inet_get_local_port_range(net, &low, &high);
257
		remaining = (high - low) + 1;
258

259
		rand = prandom_u32();
260
		first = reciprocal_scale(rand, remaining) + low;
261
262
263
		/*
		 * force rand to be an odd multiple of UDP_HTABLE_SIZE
		 */
264
		rand = (rand | 1) * (udptable->mask + 1);
Eric Dumazet's avatar
Eric Dumazet committed
265
266
		last = first + udptable->mask + 1;
		do {
267
			hslot = udp_hashslot(udptable, net, first);
268
			bitmap_zero(bitmap, PORTS_PER_CHAIN);
269
			spin_lock_bh(&hslot->lock);
270
			udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
271
					    udptable->log);
272
273
274
275
276
277
278

			snum = first;
			/*
			 * Iterate on all possible values of snum for this hash.
			 * Using steps of an odd multiple of UDP_HTABLE_SIZE
			 * give us randomization and full range coverage.
			 */
Eric Dumazet's avatar
Eric Dumazet committed
279
			do {
280
				if (low <= snum && snum <= high &&
281
				    !test_bit(snum >> udptable->log, bitmap) &&
282
				    !inet_is_local_reserved_port(net, snum))
283
284
285
286
					goto found;
				snum += rand;
			} while (snum != first);
			spin_unlock_bh(&hslot->lock);
287
			cond_resched();
Eric Dumazet's avatar
Eric Dumazet committed
288
		} while (++first != last);
289
		goto fail;
290
	} else {
291
		hslot = udp_hashslot(udptable, net, snum);
292
		spin_lock_bh(&hslot->lock);
Eric Dumazet's avatar
Eric Dumazet committed
293
294
295
296
297
298
299
300
301
302
303
		if (hslot->count > 10) {
			int exist;
			unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;

			slot2          &= udptable->mask;
			hash2_nulladdr &= udptable->mask;

			hslot2 = udp_hashslot2(udptable, slot2);
			if (hslot->count < hslot2->count)
				goto scan_primary_hash;

304
			exist = udp_lib_lport_inuse2(net, snum, hslot2, sk);
Eric Dumazet's avatar
Eric Dumazet committed
305
306
307
			if (!exist && (hash2_nulladdr != slot2)) {
				hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
				exist = udp_lib_lport_inuse2(net, snum, hslot2,
308
							     sk);
Eric Dumazet's avatar
Eric Dumazet committed
309
310
311
312
313
314
315
			}
			if (exist)
				goto fail_unlock;
			else
				goto found;
		}
scan_primary_hash:
316
		if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0))
317
318
			goto fail_unlock;
	}
319
found:
320
	inet_sk(sk)->inet_num = snum;
321
322
	udp_sk(sk)->udp_port_hash = snum;
	udp_sk(sk)->udp_portaddr_hash ^= snum;
Linus Torvalds's avatar
Linus Torvalds committed
323
	if (sk_unhashed(sk)) {
324
		if (sk->sk_reuseport &&
325
		    udp_reuseport_add_sock(sk, hslot)) {
326
327
328
329
330
331
			inet_sk(sk)->inet_num = 0;
			udp_sk(sk)->udp_port_hash = 0;
			udp_sk(sk)->udp_portaddr_hash ^= snum;
			goto fail_unlock;
		}

332
		sk_add_node_rcu(sk, &hslot->head);
333
		hslot->count++;
334
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
335
336
337

		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
		spin_lock(&hslot2->lock);
338
		if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
339
340
341
		    sk->sk_family == AF_INET6)
			hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
					   &hslot2->head);
342
		else
343
344
			hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
					   &hslot2->head);
345
346
		hslot2->count++;
		spin_unlock(&hslot2->lock);
Linus Torvalds's avatar
Linus Torvalds committed
347
	}
348
	sock_set_flag(sk, SOCK_RCU_FREE);
349
	error = 0;
350
351
fail_unlock:
	spin_unlock_bh(&hslot->lock);
Linus Torvalds's avatar
Linus Torvalds committed
352
fail:
353
354
	return error;
}
Eric Dumazet's avatar
Eric Dumazet committed
355
EXPORT_SYMBOL(udp_lib_get_port);
356

357
int udp_v4_get_port(struct sock *sk, unsigned short snum)
358
{
Eric Dumazet's avatar
Eric Dumazet committed
359
	unsigned int hash2_nulladdr =
360
		ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
Eric Dumazet's avatar
Eric Dumazet committed
361
	unsigned int hash2_partial =
362
		ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
Eric Dumazet's avatar
Eric Dumazet committed
363

364
	/* precompute partial secondary hash */
Eric Dumazet's avatar
Eric Dumazet committed
365
	udp_sk(sk)->udp_portaddr_hash = hash2_partial;
366
	return udp_lib_get_port(sk, snum, hash2_nulladdr);
367
368
}

369
370
/*
 * Rate how well socket @sk matches an incoming datagram.
 *
 * Returns -1 when @sk cannot receive the datagram: wrong netns, wrong port,
 * IPv6-only socket, receive address different from @daddr, connected
 * address/port different from @saddr/@sport, or bound device not matching
 * @dif/@sdif.
 *
 * Otherwise the score starts at 2 for a PF_INET socket (1 for any other
 * family), gains 4 for a matching connected address, 4 for a matching
 * connected port, 4 once the device check has passed, and 1 when the
 * socket's sk_incoming_cpu is the current CPU.
 *
 * @exact_dif is accepted for the callers' benefit but is not consulted here.
 */
static int compute_score(struct sock *sk, struct net *net,
			 __be32 saddr, __be16 sport,
			 __be32 daddr, unsigned short hnum,
			 int dif, int sdif, bool exact_dif)
{
	int score;
	struct inet_sock *inet;
	bool dev_match;

	if (!net_eq(sock_net(sk), net) ||
	    udp_sk(sk)->udp_port_hash != hnum ||
	    ipv6_only_sock(sk))
		return -1;

	if (sk->sk_rcv_saddr != daddr)
		return -1;

	score = (sk->sk_family == PF_INET) ? 2 : 1;

	inet = inet_sk(sk);
	if (inet->inet_daddr) {
		if (inet->inet_daddr != saddr)
			return -1;
		score += 4;
	}

	if (inet->inet_dport) {
		if (inet->inet_dport != sport)
			return -1;
		score += 4;
	}

	dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
					dif, sdif);
	if (!dev_match)
		return -1;
	score += 4;

	if (sk->sk_incoming_cpu == raw_smp_processor_id())
		score++;
	return score;
}

412
413
414
static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
		       const __u16 lport, const __be32 faddr,
		       const __be16 fport)
415
{
416
417
418
419
	static u32 udp_ehash_secret __read_mostly;

	net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));

420
	return __inet_ehashfn(laddr, lport, faddr, fport,
421
			      udp_ehash_secret + net_hash_mix(net));
422
423
}

424
/* called with rcu_read_lock() */
425
static struct sock *udp4_lib_lookup2(struct net *net,
426
427
428
429
430
				     __be32 saddr, __be16 sport,
				     __be32 daddr, unsigned int hnum,
				     int dif, int sdif, bool exact_dif,
				     struct udp_hslot *hslot2,
				     struct sk_buff *skb)
431
432
{
	struct sock *sk, *result;
Paolo Abeni's avatar
Paolo Abeni committed
433
	int score, badness;
434
	u32 hash = 0;
435
436

	result = NULL;
437
	badness = 0;
438
	udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
439
		score = compute_score(sk, net, saddr, sport,
440
				      daddr, hnum, dif, sdif, exact_dif);
441
		if (score > badness) {
Paolo Abeni's avatar
Paolo Abeni committed
442
			if (sk->sk_reuseport) {
443
444
				hash = udp_ehashfn(net, daddr, hnum,
						   saddr, sport);
445
				result = reuseport_select_sock(sk, hash, skb,
446
							sizeof(struct udphdr));
447
448
				if (result)
					return result;
449
			}
450
451
			badness = score;
			result = sk;
452
453
454
455
456
		}
	}
	return result;
}

457
458
459
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
 * harder than this. -DaveM
 */
460
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
461
462
		__be16 sport, __be32 daddr, __be16 dport, int dif,
		int sdif, struct udp_table *udptable, struct sk_buff *skb)
463
{
464
	struct sock *result;
465
	unsigned short hnum = ntohs(dport);
466
467
	unsigned int hash2, slot2;
	struct udp_hslot *hslot2;
468
	bool exact_dif = udp_lib_exact_dif_match(net, skb);
469

470
471
472
473
474
475
476
477
478
	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
	slot2 = hash2 & udptable->mask;
	hslot2 = &udptable->hash2[slot2];

	result = udp4_lib_lookup2(net, saddr, sport,
				  daddr, hnum, dif, sdif,
				  exact_dif, hslot2, skb);
	if (!result) {
		hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
479
480
481
482
		slot2 = hash2 & udptable->mask;
		hslot2 = &udptable->hash2[slot2];

		result = udp4_lib_lookup2(net, saddr, sport,
483
					  htonl(INADDR_ANY), hnum, dif, sdif,
484
					  exact_dif, hslot2, skb);
485
	}
486
487
	if (unlikely(IS_ERR(result)))
		return NULL;
488
489
	return result;
}
490
EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
491

492
493
/*
 * Socket lookup driven by a received packet: addresses come from the IPv4
 * header of @skb, the netns from its device, and the interface indexes from
 * inet_iif()/inet_sdif().
 */
static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
						 __be16 sport, __be16 dport,
						 struct udp_table *udptable)
{
	const struct iphdr *iph = ip_hdr(skb);

	return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
				 iph->daddr, dport, inet_iif(skb),
				 inet_sdif(skb), udptable, skb);
}

503
504
505
/* Packet-driven socket lookup against the global udp_table. */
struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
				 __be16 sport, __be16 dport)
{
	return __udp4_lib_lookup_skb(skb, sport, dport, &udp_table);
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);

510
511
512
/* Must be called under rcu_read_lock().
 * Does increment socket refcount.
 *
 * Looks the socket up in the global udp_table (sdif 0, no skb) and returns
 * it only if a reference could be taken; a socket whose refcount has already
 * dropped to zero is reported as NULL.
 */
#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
			     __be32 daddr, __be16 dport, int dif)
{
	struct sock *sk;

	sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
			       dif, 0, &udp_table, NULL);
	if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
		sk = NULL;
	return sk;
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
#endif
527

Shawn Bohrer's avatar
Shawn Bohrer committed
528
529
530
/*
 * Decide whether @sk should get a copy of a multicast/broadcast datagram.
 *
 * The socket is rejected if it is in another netns, hashed on a different
 * port, connected to a different remote address or port, bound to a
 * different local address, IPv6-only, or bound to a device that is neither
 * @dif nor @sdif.  A socket passing those checks must additionally be
 * allowed by the multicast source filter (ip_mc_sf_allow()).
 */
static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
				       __be16 loc_port, __be32 loc_addr,
				       __be16 rmt_port, __be32 rmt_addr,
				       int dif, int sdif, unsigned short hnum)
{
	struct inet_sock *inet = inet_sk(sk);

	if (!net_eq(sock_net(sk), net) ||
	    udp_sk(sk)->udp_port_hash != hnum ||
	    (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
	    (inet->inet_dport != rmt_port && inet->inet_dport) ||
	    (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
	    ipv6_only_sock(sk) ||
	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
	     sk->sk_bound_dev_if != sdif))
		return false;
	if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
		return false;
	return true;
}

549
550
551
DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
void udp_encap_enable(void)
{
Paolo Abeni's avatar
Paolo Abeni committed
552
	static_branch_inc(&udp_encap_needed_key);
553
554
555
}
EXPORT_SYMBOL(udp_encap_enable);

556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
 * through error handlers in encapsulations looking for a match.
 *
 * Each registered encapsulation's err_handler is offered the error in slot
 * order; the first handler returning 0 claims it.
 *
 * Returns 0 if some handler accepted the error, -ENOENT otherwise.
 */
static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
{
	int i;

	for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
		int (*handler)(struct sk_buff *skb, u32 info);
		const struct ip_tunnel_encap_ops *encap;

		/*
		 * Fetch the slot exactly once: testing iptun_encaps[i] and
		 * then re-reading it for the member access would let the
		 * slot be cleared in between.
		 */
		encap = rcu_dereference(iptun_encaps[i]);
		if (!encap)
			continue;
		handler = encap->err_handler;
		if (handler && !handler(skb, info))
			return 0;
	}

	return -ENOENT;
}

576
577
578
579
580
581
582
/* Try to match ICMP errors to UDP tunnels by looking up a socket without
 * reversing source and destination port: this will match tunnels that force the
 * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
 * lwtunnels might actually break this assumption by being configured with
 * different destination ports on endpoints, in this case we won't be able to
 * trace ICMP messages back to them.
 *
 * If this doesn't match any socket, probe tunnels with arbitrary destination
 * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
 * we've sent packets to won't necessarily match the local destination port.
 *
 * Then ask the tunnel implementation to match the error against a valid
 * association.
 *
 * Return an error if we can't find a match, the socket if we need further
 * processing, zero otherwise.
 */
static struct sock *__udp4_lib_err_encap(struct net *net,
					 const struct iphdr *iph,
					 struct udphdr *uh,
					 struct udp_table *udptable,
					 struct sk_buff *skb, u32 info)
{
	int network_offset, transport_offset;
	struct sock *sk;

	/* Remember the current header offsets so they can be restored. */
	network_offset = skb_network_offset(skb);
	transport_offset = skb_transport_offset(skb);

	/* Network header needs to point to the outer IPv4 header inside ICMP */
	skb_reset_network_header(skb);

	/* Transport header needs to point to the UDP header */
	skb_set_transport_header(skb, iph->ihl << 2);

	sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
			       iph->saddr, uh->dest, skb->dev->ifindex, 0,
			       udptable, NULL);
	if (sk) {
		int (*lookup)(struct sock *sk, struct sk_buff *skb);
		struct udp_sock *up = udp_sk(sk);

		/* A socket without a lookup hook, or whose hook rejects the
		 * error (non-zero return), is not a match.
		 */
		lookup = READ_ONCE(up->encap_err_lookup);
		if (!lookup || lookup(sk, skb))
			sk = NULL;
	}

	if (!sk)
		sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));

	skb_set_transport_header(skb, transport_offset);
	skb_set_network_header(skb, network_offset);

	return sk;
}

632
633
634
635
636
637
638
639
640
641
642
/*
 * This routine is called by the ICMP module when it gets some
 * sort of error condition.  If err < 0 then the socket should
 * be closed and the error returned to the user.  If err > 0
 * it's just the icmp type << 8 | icmp code.
 * Header points to the ip header of the error packet. We move
 * on past this. Then (as it used to claim before adjustment)
 * header points to the first 8 bytes of the udp header.  We need
 * to find the appropriate port.
 */

643
int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
644
645
{
	struct inet_sock *inet;
646
	const struct iphdr *iph = (const struct iphdr *)skb->data;
Eric Dumazet's avatar
Eric Dumazet committed
647
	struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
648
649
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
650
	bool tunnel = false;
651
652
653
	struct sock *sk;
	int harderr;
	int err;
654
	struct net *net = dev_net(skb->dev);
655

656
	sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
657
658
			       iph->saddr, uh->source, skb->dev->ifindex,
			       inet_sdif(skb), udptable, NULL);
659
	if (!sk) {
660
		/* No socket for error: try tunnels before discarding */
661
662
663
664
665
666
667
		sk = ERR_PTR(-ENOENT);
		if (static_branch_unlikely(&udp_encap_needed_key)) {
			sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb,
						  info);
			if (!sk)
				return 0;
		}
668

669
		if (IS_ERR(sk)) {
670
			__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
671
			return PTR_ERR(sk);
672
		}
673

674
		tunnel = true;
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
	}

	err = 0;
	harderr = 0;
	inet = inet_sk(sk);

	switch (type) {
	default:
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	case ICMP_SOURCE_QUENCH:
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		harderr = 1;
		break;
	case ICMP_DEST_UNREACH:
		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
694
			ipv4_sk_update_pmtu(skb, sk, info);
695
696
697
698
699
700
701
702
703
704
705
706
707
			if (inet->pmtudisc != IP_PMTUDISC_DONT) {
				err = EMSGSIZE;
				harderr = 1;
				break;
			}
			goto out;
		}
		err = EHOSTUNREACH;
		if (code <= NR_ICMP_UNREACH) {
			harderr = icmp_err_convert[code].fatal;
			err = icmp_err_convert[code].errno;
		}
		break;
708
709
	case ICMP_REDIRECT:
		ipv4_sk_redirect(skb, sk);
710
		goto out;
711
712
713
714
715
716
	}

	/*
	 *      RFC1122: OK.  Passes ICMP errors back to application, as per
	 *	4.1.3.3.
	 */
717
718
719
720
	if (tunnel) {
		/* ...not for tunnels though: we don't have a sending socket */
		goto out;
	}
721
722
723
	if (!inet->recverr) {
		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
			goto out;
724
	} else
Eric Dumazet's avatar
Eric Dumazet committed
725
		ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
726

727
728
729
	sk->sk_err = err;
	sk->sk_error_report(sk);
out:
730
	return 0;
731
732
}

733
/* ICMP error entry point for plain UDP: uses the global udp_table. */
int udp_err(struct sk_buff *skb, u32 info)
{
	return __udp4_lib_err(skb, info, &udp_table);
}

/*
 * Throw away all pending data and cancel the corking. Socket is locked.
 */
741
void udp_flush_pending_frames(struct sock *sk)
742
743
744
745
746
747
748
749
750
{
	struct udp_sock *up = udp_sk(sk);

	if (up->pending) {
		up->len = 0;
		up->pending = 0;
		ip_flush_pending_frames(sk);
	}
}
751
EXPORT_SYMBOL(udp_flush_pending_frames);
752
753

/**
Herbert Xu's avatar
Herbert Xu committed
754
 * 	udp4_hwcsum  -  handle outgoing HW checksumming
755
756
 * 	@skb: 	sk_buff containing the filled-in UDP header
 * 	        (checksum field must be zeroed out)
Herbert Xu's avatar
Herbert Xu committed
757
758
 *	@src:	source IP address
 *	@dst:	destination IP address
759
 */
760
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst)
761
762
{
	struct udphdr *uh = udp_hdr(skb);
Herbert Xu's avatar
Herbert Xu committed
763
764
765
	int offset = skb_transport_offset(skb);
	int len = skb->len - offset;
	int hlen = len;
766
767
	__wsum csum = 0;

768
	if (!skb_has_frag_list(skb)) {
769
770
771
772
773
		/*
		 * Only one fragment on the socket.
		 */
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct udphdr, check);
Herbert Xu's avatar
Herbert Xu committed
774
775
		uh->check = ~csum_tcpudp_magic(src, dst, len,
					       IPPROTO_UDP, 0);
776
	} else {
777
778
		struct sk_buff *frags;

779
780
781
782
783
		/*
		 * HW-checksum won't work as there are two or more
		 * fragments on the socket so that all csums of sk_buffs
		 * should be together
		 */
784
		skb_walk_frags(skb, frags) {
Herbert Xu's avatar
Herbert Xu committed
785
786
			csum = csum_add(csum, frags->csum);
			hlen -= frags->len;
787
		}
788

Herbert Xu's avatar
Herbert Xu committed
789
		csum = skb_checksum(skb, offset, hlen, csum);
790
791
792
793
794
795
796
		skb->ip_summed = CHECKSUM_NONE;

		uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
		if (uh->check == 0)
			uh->check = CSUM_MANGLED_0;
	}
}
797
EXPORT_SYMBOL_GPL(udp4_hwcsum);
798

799
800
801
802
803
804
805
806
/* Function to set UDP checksum for an IPv4 UDP packet. This is intended
 * for the simple case like when setting the checksum for a UDP tunnel.
 *
 * @nocheck: true to send with a zero (absent) checksum
 * @skb:     packet whose transport header is the UDP header
 * @saddr:   source address for the pseudo-header
 * @daddr:   destination address for the pseudo-header
 * @len:     UDP length used for the pseudo-header
 */
void udp_set_csum(bool nocheck, struct sk_buff *skb,
		  __be32 saddr, __be32 daddr, int len)
{
	struct udphdr *uh = udp_hdr(skb);

	if (nocheck) {
		uh->check = 0;
	} else if (skb_is_gso(skb)) {
		/* GSO: seed with the inverted pseudo-header sum only. */
		uh->check = ~udp_v4_check(len, saddr, daddr, 0);
	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* Zero the field before lco_csum() sums over the header. */
		uh->check = 0;
		uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
		if (uh->check == 0)
			uh->check = CSUM_MANGLED_0;
	} else {
		/* Mark the packet CHECKSUM_PARTIAL and seed the field. */
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct udphdr, check);
		uh->check = ~udp_v4_check(len, saddr, daddr, 0);
	}
}
EXPORT_SYMBOL(udp_set_csum);

825
826
static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
			struct inet_cork *cork)
827
{
Herbert Xu's avatar
Herbert Xu committed
828
	struct sock *sk = skb->sk;
829
830
831
832
	struct inet_sock *inet = inet_sk(sk);
	struct udphdr *uh;
	int err = 0;
	int is_udplite = IS_UDPLITE(sk);
Herbert Xu's avatar
Herbert Xu committed
833
834
	int offset = skb_transport_offset(skb);
	int len = skb->len - offset;
835
836
837
838
839
840
	__wsum csum = 0;

	/*
	 * Create a UDP header
	 */
	uh = udp_hdr(skb);
Herbert Xu's avatar
Herbert Xu committed
841
	uh->source = inet->inet_sport;
842
	uh->dest = fl4->fl4_dport;
Herbert Xu's avatar
Herbert Xu committed
843
	uh->len = htons(len);
844
845
	uh->check = 0;

846
847
848
849
	if (cork->gso_size) {
		const int hlen = skb_network_header_len(skb) +
				 sizeof(struct udphdr);

850
851
		if (hlen + cork->gso_size > cork->fragsize) {
			kfree_skb(skb);
852
			return -EINVAL;
853
854
855
		}
		if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) {
			kfree_skb(skb);
856
			return -EINVAL;
857
858
859
		}
		if (sk->sk_no_check_tx) {
			kfree_skb(skb);
860
			return -EINVAL;
861
		}
862
		if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
863
864
		    dst_xfrm(skb_dst(skb))) {
			kfree_skb(skb);
865
			return -EIO;
866
		}
867
868
869

		skb_shinfo(skb)->gso_size = cork->gso_size;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
870
871
		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(uh),
							 cork->gso_size);
872
		goto csum_partial;
873
874
	}

875
	if (is_udplite)  				 /*     UDP-Lite      */
Herbert Xu's avatar
Herbert Xu committed
876
		csum = udplite_csum(skb);
877

878
	else if (sk->sk_no_check_tx) {			 /* UDP csum off */
879
880
881
882
883

		skb->ip_summed = CHECKSUM_NONE;
		goto send;

	} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
884
csum_partial:
885

886
		udp4_hwcsum(skb, fl4->saddr, fl4->daddr);
887
888
		goto send;

Herbert Xu's avatar
Herbert Xu committed
889
890
	} else
		csum = udp_csum(skb);
891
892

	/* add protocol-dependent pseudo-header */
893
	uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len,
Eric Dumazet's avatar
Eric Dumazet committed
894
				      sk->sk_protocol, csum);
895
896
897
898
	if (uh->check == 0)
		uh->check = CSUM_MANGLED_0;

send:
Eric Dumazet's avatar
Eric Dumazet committed
899
	err = ip_send_skb(sock_net(sk), skb);
Eric Dumazet's avatar
Eric Dumazet committed
900
901
	if (err) {
		if (err == -ENOBUFS && !inet->recverr) {
902
903
			UDP_INC_STATS(sock_net(sk),
				      UDP_MIB_SNDBUFERRORS, is_udplite);
Eric Dumazet's avatar
Eric Dumazet committed
904
905
906
			err = 0;
		}
	} else
907
908
		UDP_INC_STATS(sock_net(sk),
			      UDP_MIB_OUTDATAGRAMS, is_udplite);
Herbert Xu's avatar
Herbert Xu committed
909
910
911
912
913
914
	return err;
}

/*
 * Push out all pending data as one UDP datagram. Socket is locked.
 */
915
int udp_push_pending_frames(struct sock *sk)
Herbert Xu's avatar
Herbert Xu committed
916
917
918
{
	struct udp_sock  *up = udp_sk(sk);
	struct inet_sock *inet = inet_sk(sk);
David S. Miller's avatar
David S. Miller committed
919
	struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
Herbert Xu's avatar
Herbert Xu committed
920
921
922
	struct sk_buff *skb;
	int err = 0;

923
	skb = ip_finish_skb(sk, fl4);
Herbert Xu's avatar
Herbert Xu committed
924
925
926
	if (!skb)
		goto out;

927
	err = udp_send_skb(skb, fl4, &inet->cork.base);
Herbert Xu's avatar
Herbert Xu committed
928

929
930
931
932
933
out:
	up->len = 0;
	up->pending = 0;
	return err;
}
934
EXPORT_SYMBOL(udp_push_pending_frames);
935

Willem de Bruijn's avatar
Willem de Bruijn committed
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
/*
 * __udp_cmsg_send - parse one UDP-level control message
 * @cmsg:     control message to interpret
 * @gso_size: written with the segment size carried by a UDP_SEGMENT message
 *
 * UDP_SEGMENT is the only type accepted here, and its payload must be
 * exactly one __u16.  @gso_size is left untouched on failure.
 *
 * Returns 0 on success, -EINVAL for an unknown type or a wrong length.
 */
static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
{
	if (cmsg->cmsg_type != UDP_SEGMENT)
		return -EINVAL;

	/* The payload must be a single __u16, no more and no less. */
	if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
		return -EINVAL;

	*gso_size = *(__u16 *)CMSG_DATA(cmsg);
	return 0;
}

/*
 * udp_cmsg_send - process the UDP-level control messages of a sendmsg call
 * @sk:       sending socket (not referenced by this function)
 * @msg:      message header whose control messages are walked
 * @gso_size: forwarded to __udp_cmsg_send() for each SOL_UDP message
 *
 * Every control message is validated with CMSG_OK().  Messages at level
 * SOL_UDP are handed to __udp_cmsg_send(); messages at any other level are
 * only noted, not parsed (presumably so the caller can pass them on to the
 * IP-level parser - confirm against the caller).
 *
 * Returns a negative errno on a malformed or rejected control message,
 * 1 if at least one non-SOL_UDP message was seen, and 0 otherwise.
 */
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
{
	struct cmsghdr *cmsg;
	bool saw_other_level = false;

	for_each_cmsghdr(cmsg, msg) {
		int err;

		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;

		if (cmsg->cmsg_level == SOL_UDP) {
			err = __udp_cmsg_send(cmsg, gso_size);
			if (err)
				return err;
		} else {
			/* Not ours: just remember that one was present. */
			saw_other_level = true;
		}
	}

	return saw_other_level;
}
EXPORT_SYMBOL_GPL(udp_cmsg_send);

973
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
974
975
976
{
	struct inet_sock *inet = inet_sk(sk);
	struct udp_sock *up = udp_sk(sk);
Andrey Ignatov's avatar
Andrey Ignatov committed
977
	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
978
	struct flowi4 fl4_stack;
David S. Miller's avatar
David S. Miller committed
979
	struct flowi4 *fl4;
980
981
982
983
984
985
986
987
988
989
990
	int ulen = len;
	struct ipcm_cookie ipc;
	struct rtable *rt = NULL;
	int free = 0;
	int connected = 0;
	__be32 daddr, faddr, saddr;
	__be16 dport;
	u8  tos;
	int err, is_udplite = IS_UDPLITE(sk);
	int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
	int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
Herbert Xu's avatar
Herbert Xu committed
991
	struct sk_buff *skb;
992
	struct ip_options_data opt_copy;
993
994
995
996
997
998
999
1000

	if (len > 0xFFFF)
		return -EMSGSIZE;

	/*
	 *	Check the flags.
	 */

Eric Dumazet's avatar
Eric Dumazet committed
1001
	if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
1002
1003
		return -EOPNOTSUPP;

Herbert Xu's avatar
Herbert Xu committed
1004
1005
	getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;

1006
	fl4 = &inet->cork.fl.u.ip4;
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
	if (up->pending) {
		/*
		 * There are pending frames.
		 * The socket lock must be held while it's corked.
		 */
		lock_sock(sk);
		if (likely(up->pending)) {
			if (unlikely(up->pending != AF_INET)) {
				release_sock(sk);
				return -EINVAL;
			}
			goto do_append_data;
		}
		release_sock(sk);
	}
	ulen += sizeof(struct udphdr);

	/*
	 *	Get and verify the address.
	 */
Andrey Ignatov's avatar
Andrey Ignatov committed
1027
	if (usin) {
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
		if (msg->msg_namelen < sizeof(*usin))
			return -EINVAL;
		if (usin->sin_family != AF_INET) {
			if (usin->sin_family != AF_UNSPEC)
				return -EAFNOSUPPORT;
		}

		daddr = usin->sin_addr.