/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for the TCP module.
 *
 * Version:	@(#)tcp.h	1.0.5	05/23/93
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#ifndef _TCP_H
#define _TCP_H

#define FASTRETRANS_DEBUG 1

#include <linux/list.h>
#include <linux/tcp.h>
#include <linux/bug.h>
#include <linux/slab.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/skbuff.h>
#include <linux/cryptohash.h>
#include <linux/kref.h>
#include <linux/ktime.h>

#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
#include <net/inet_hashtables.h>
#include <net/checksum.h>
#include <net/request_sock.h>
#include <net/sock_reuseport.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/tcp_states.h>
#include <net/inet_ecn.h>
#include <net/dst.h>

#include <linux/seq_file.h>
#include <linux/memcontrol.h>

#include <linux/bpf-cgroup.h>

extern struct inet_hashinfo tcp_hashinfo;

extern struct percpu_counter tcp_orphan_count;
void tcp_time_wait(struct sock *sk, int state, int timeo);

#define MAX_TCP_HEADER	(128 + MAX_HEADER)
#define MAX_TCP_OPTION_SPACE 40
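
/* Note: the TCP data offset field is 4 bits and counts 32-bit words,
 * so a header is at most 15 * 4 = 60 bytes; subtracting the 20 byte
 * fixed header leaves the 40 bytes of option space reserved above.
 */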

/*
 * Never offer a window over 32767 without using window scaling. Some
 * poor stacks do signed 16bit maths!
 */
#define MAX_TCP_WINDOW		32767U

/* Minimal accepted MSS. It is (60+60+8) - (20+20). */
#define TCP_MIN_MSS		88U

/* The least MTU to use for probing */
#define TCP_BASE_MSS		1024

/* probing interval, defaults to 10 minutes as per RFC4821 */
#define TCP_PROBE_INTERVAL	600

/* MTU probing stops once the search range is narrower than this (bytes) */
#define TCP_PROBE_THRESHOLD	8

/* After receiving this amount of duplicate ACKs fast retransmit starts. */
#define TCP_FASTRETRANS_THRESH 3

/* Maximal number of ACKs sent quickly to accelerate slow-start. */
#define TCP_MAX_QUICKACKS	16U

/* Maximal value of window scale per RFC1323 */
#define TCP_MAX_WSCALE		14U

/* urg_data states */
#define TCP_URG_VALID	0x0100
#define TCP_URG_NOTYET	0x0200
#define TCP_URG_READ	0x0400

#define TCP_RETR1	3	/*
				 * This is how many retries it does before it
				 * tries to figure out if the gateway is
				 * down. Minimal RFC value is 3; it corresponds
				 * to ~3sec-8min depending on RTO.
				 */

#define TCP_RETR2	15	/*
				 * This should take at least
				 * 90 minutes to time out.
				 * RFC1122 says that the limit is 100 sec.
				 * 15 is ~13-30min depending on RTO.
				 */

#define TCP_SYN_RETRIES	 6	/* This is how many retries are done
				 * when active opening a connection.
				 * RFC1122 says the minimum retry MUST
				 * be at least 180secs.  Nevertheless
				 * this value corresponds to
				 * 63secs of retransmission with the
				 * current initial RTO.
				 */

#define TCP_SYNACK_RETRIES 5	/* This is how many retries are done
				 * when passive opening a connection.
				 * This corresponds to 31secs of
				 * retransmission with the current
				 * initial RTO.
				 */

#define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT
				  * state, about 60 seconds	*/
#define TCP_FIN_TIMEOUT	TCP_TIMEWAIT_LEN
                                 /* BSD style FIN_WAIT2 deadlock breaker.
				  * It used to be 3min, new value is 60sec,
				  * to combine FIN-WAIT-2 timeout with
				  * TIME-WAIT timer.
				  */

#define TCP_DELACK_MAX	((unsigned)(HZ/5))	/* maximal time to delay before sending an ACK */
#if HZ >= 100
#define TCP_DELACK_MIN	((unsigned)(HZ/25))	/* minimal time to delay before sending an ACK */
#define TCP_ATO_MIN	((unsigned)(HZ/25))
#else
#define TCP_DELACK_MIN	4U
#define TCP_ATO_MIN	4U
#endif
#define TCP_RTO_MAX	((unsigned)(120*HZ))
#define TCP_RTO_MIN	((unsigned)(HZ/5))
#define TCP_TIMEOUT_MIN	(2U) /* Min timeout for TCP timers in jiffies */
#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC6298 2.1 initial RTO value	*/

#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ))	/* RFC 1122 initial RTO value, now
						 * used as a fallback RTO for the
						 * initial data transmission if no
						 * valid RTT sample has been acquired,
						 * most likely due to retrans in 3WHS.
						 */
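
/* Example: with HZ=1000, TCP_RTO_MIN is 200 jiffies (200 ms),
 * TCP_RTO_MAX is 120000 jiffies (120 s), and TCP_TIMEOUT_INIT is
 * 1000 jiffies, i.e. the 1 second initial RTO of RFC6298.
 */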

#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
					                 * for local resources.
					                 */
#define TCP_KEEPALIVE_TIME	(120*60*HZ)	/* two hours */
#define TCP_KEEPALIVE_PROBES	9		/* Max of 9 keepalive probes	*/
#define TCP_KEEPALIVE_INTVL	(75*HZ)

#define MAX_TCP_KEEPIDLE	32767
#define MAX_TCP_KEEPINTVL	32767
#define MAX_TCP_KEEPCNT		127
#define MAX_TCP_SYNCNT		127

#define TCP_SYNQ_INTERVAL	(HZ/5)	/* Period of SYNACK timer */

#define TCP_PAWS_24DAYS	(60 * 60 * 24 * 24)
#define TCP_PAWS_MSL	60		/* Per-host timestamps are invalidated
					 * after this time. It should be equal to
					 * (or greater than) TCP_TIMEWAIT_LEN
					 * to provide reliability equal to that
					 * provided by the timewait state.
					 */
#define TCP_PAWS_WINDOW	1		/* Replay window for per-host
					 * timestamps. It must be less than
					 * minimal timewait lifetime.
					 */
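
/* Note: TCP_PAWS_24DAYS is 24 days expressed in seconds (2073600).
 * With a 1 ms timestamp tick, a signed 32-bit timestamp comparison
 * wraps after ~24.8 days, so older per-host timestamps cannot be
 * compared reliably.
 */
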
/*
 *	TCP option
 */

#define TCPOPT_NOP		1	/* Padding */
#define TCPOPT_EOL		0	/* End of options */
#define TCPOPT_MSS		2	/* Segment size negotiating */
#define TCPOPT_WINDOW		3	/* Window scaling */
#define TCPOPT_SACK_PERM        4       /* SACK Permitted */
#define TCPOPT_SACK             5       /* SACK Block */
#define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
#define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
#define TCPOPT_FASTOPEN		34	/* Fast open (RFC7413) */
#define TCPOPT_EXP		254	/* Experimental */
/* Magic number to be after the option value for sharing TCP
 * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
 */
#define TCPOPT_FASTOPEN_MAGIC	0xF989
#define TCPOPT_SMC_MAGIC	0xE2D4C3D9

/*
 *     TCP option lengths
 */

#define TCPOLEN_MSS            4
#define TCPOLEN_WINDOW         3
#define TCPOLEN_SACK_PERM      2
#define TCPOLEN_TIMESTAMP      10
#define TCPOLEN_MD5SIG         18
#define TCPOLEN_FASTOPEN_BASE  2
#define TCPOLEN_EXP_FASTOPEN_BASE  4
#define TCPOLEN_EXP_SMC_BASE   6

/* But this is what stacks really send out. */
#define TCPOLEN_TSTAMP_ALIGNED		12
#define TCPOLEN_WSCALE_ALIGNED		4
#define TCPOLEN_SACKPERM_ALIGNED	4
#define TCPOLEN_SACK_BASE		2
#define TCPOLEN_SACK_BASE_ALIGNED	4
#define TCPOLEN_SACK_PERBLOCK		8
#define TCPOLEN_MD5SIG_ALIGNED		20
#define TCPOLEN_MSS_ALIGNED		4
#define TCPOLEN_EXP_SMC_BASE_ALIGNED	8
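
/* Example: the 10 byte timestamp option is sent padded with two
 * TCPOPT_NOP bytes so it occupies 12 bytes, keeping the header length
 * a multiple of 4; hence TCPOLEN_TSTAMP_ALIGNED above.
 */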

/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF		1	/* Nagle's algo is disabled */
#define TCP_NAGLE_CORK		2	/* Socket is corked	    */
#define TCP_NAGLE_PUSH		4	/* Cork is overridden for already queued data */

/* TCP thin-stream limits */
#define TCP_THIN_LINEAR_RETRIES 6       /* After 6 linear retries, do exp. backoff */

/* TCP initial congestion window as per rfc6928 */
#define TCP_INIT_CWND		10

/* Bit Flags for sysctl_tcp_fastopen */
#define	TFO_CLIENT_ENABLE	1
#define	TFO_SERVER_ENABLE	2
#define	TFO_CLIENT_NO_COOKIE	4	/* Data in SYN w/o cookie option */

/* Accept SYN data w/o any cookie option */
#define	TFO_SERVER_COOKIE_NOT_REQD	0x200

/* Force enable TFO on all listeners, i.e., not requiring the
 * TCP_FASTOPEN socket option.
 */
#define	TFO_SERVER_WO_SOCKOPT1	0x400

/* sysctl variables for tcp */
extern int sysctl_tcp_max_orphans;
extern long sysctl_tcp_mem[3];

#define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
#define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
#define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */

extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
extern unsigned long tcp_memory_pressure;

/* optimized version of sk_under_memory_pressure() for TCP sockets */
static inline bool tcp_under_memory_pressure(const struct sock *sk)
{
	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    mem_cgroup_under_socket_pressure(sk->sk_memcg))
		return true;

	return tcp_memory_pressure;
}

/*
 * The next routines deal with comparing 32 bit unsigned ints
 * and worry about wraparound (automatic with unsigned arithmetic).
 */

static inline bool before(__u32 seq1, __u32 seq2)
{
	return (__s32)(seq1-seq2) < 0;
}
#define after(seq2, seq1) 	before(seq1, seq2)

/* is s2<=s1<=s3 ? */
static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3)
{
	return seq3 - seq2 >= seq1 - seq2;
}

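/* Example: the unsigned subtraction makes these helpers wrap-safe.
 * With seq1 = 0xfffffff0 and seq2 = 0x10, seq1 - seq2 is 0xffffffe0,
 * which is negative as __s32, so before(0xfffffff0, 0x10) is true:
 * 0xfffffff0 comes "before" 0x10 across the 32-bit wrap.
 */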

static inline bool tcp_out_of_memory(struct sock *sk)
{
	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
	    sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2))
		return true;
	return false;
}

void sk_forced_mem_schedule(struct sock *sk, int size);

static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
{
	struct percpu_counter *ocp = sk->sk_prot->orphan_count;
	int orphans = percpu_counter_read_positive(ocp);

	if (orphans << shift > sysctl_tcp_max_orphans) {
		orphans = percpu_counter_sum_positive(ocp);
		if (orphans << shift > sysctl_tcp_max_orphans)
			return true;
	}
	return false;
}

bool tcp_check_oom(struct sock *sk, int shift);


extern struct proto tcp_prot;

#define TCP_INC_STATS(net, field)	SNMP_INC_STATS((net)->mib.tcp_statistics, field)
#define __TCP_INC_STATS(net, field)	__SNMP_INC_STATS((net)->mib.tcp_statistics, field)
#define TCP_DEC_STATS(net, field)	SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
#define TCP_ADD_STATS(net, field, val)	SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)

void tcp_tasklet_init(void);

int tcp_v4_err(struct sk_buff *skb, u32);

void tcp_shutdown(struct sock *sk, int how);

int tcp_v4_early_demux(struct sk_buff *skb);
int tcp_v4_rcv(struct sk_buff *skb);

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
		 int flags);
int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
			size_t size, int flags);
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
		 size_t size, int flags);
void tcp_release_cb(struct sock *sk);
void tcp_wfree(struct sk_buff *skb);
void tcp_write_timer_handler(struct sock *sk);
void tcp_delack_timer_handler(struct sock *sk);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags);

void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks);
static inline void tcp_dec_quickack_mode(struct sock *sk,
					 const unsigned int pkts)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ack.quick) {
		if (pkts >= icsk->icsk_ack.quick) {
			icsk->icsk_ack.quick = 0;
			/* Leaving quickack mode we deflate ATO. */
			icsk->icsk_ack.ato   = TCP_ATO_MIN;
		} else
			icsk->icsk_ack.quick -= pkts;
	}
}

#define	TCP_ECN_OK		1
#define	TCP_ECN_QUEUE_CWR	2
#define	TCP_ECN_DEMAND_CWR	4
#define	TCP_ECN_SEEN		8

enum tcp_tw_status {
	TCP_TW_SUCCESS = 0,
	TCP_TW_RST = 1,
	TCP_TW_ACK = 2,
	TCP_TW_SYN = 3
};

enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
					      struct sk_buff *skb,
					      const struct tcphdr *th);
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req, bool fastopen,
			   bool *lost_race);
int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb);
void tcp_enter_loss(struct sock *sk);
void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag);
void tcp_clear_retrans(struct tcp_sock *tp);
void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
void tcp_metrics_init(void);
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
void tcp_close(struct sock *sk, long timeout);
void tcp_init_sock(struct sock *sk);
void tcp_init_transfer(struct sock *sk, int bpf_op);
__poll_t tcp_poll(struct file *file, struct socket *sock,
		      struct poll_table_struct *wait);
int tcp_getsockopt(struct sock *sk, int level, int optname,
		   char __user *optval, int __user *optlen);
int tcp_setsockopt(struct sock *sk, int level, int optname,
		   char __user *optval, unsigned int optlen);
int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int __user *optlen);
int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, unsigned int optlen);
void tcp_set_keepalive(struct sock *sk, int val);
void tcp_syn_ack_timeout(const struct request_sock *req);
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
		int flags, int *addr_len);
int tcp_set_rcvlowat(struct sock *sk, int val);
void tcp_data_ready(struct sock *sk);
int tcp_mmap(struct file *file, struct socket *sock,
	     struct vm_area_struct *vma);
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
		       struct tcp_options_received *opt_rx,
		       int estab, struct tcp_fastopen_cookie *foc);
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);

/*
 *	TCP v4 functions exported for the inet6 API
 */

void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
void tcp_v4_mtu_reduced(struct sock *sk);
void tcp_req_err(struct sock *sk, u32 seq, bool abort);
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_create_openreq_child(const struct sock *sk,
				      struct request_sock *req,
				      struct sk_buff *skb);
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req);
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int tcp_connect(struct sock *sk);
enum tcp_synack_type {
	TCP_SYNACK_NORMAL,
	TCP_SYNACK_FASTOPEN,
	TCP_SYNACK_COOKIE,
};
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
				struct request_sock *req,
				struct tcp_fastopen_cookie *foc,
				enum tcp_synack_type synack_type);
int tcp_disconnect(struct sock *sk, int flags);

void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size);
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);

/* From syncookies.c */
struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
				 struct request_sock *req,
				 struct dst_entry *dst, u32 tsoff);
int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
		      u32 cookie);
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
#ifdef CONFIG_SYN_COOKIES

/* Syncookies use a monotonic timer which increments every 60 seconds.
 * This counter is used both as a hash input and partially encoded into
 * the cookie value.  A cookie is only validated further if the delta
 * between the current counter value and the encoded one is less than this,
 * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if
 * the counter advances immediately after a cookie is generated).
 */
#define MAX_SYNCOOKIE_AGE	2
#define TCP_SYNCOOKIE_PERIOD	(60 * HZ)
#define TCP_SYNCOOKIE_VALID	(MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD)

/* syncookies: remember time of last synqueue overflow
 * But do not dirty this field too often (once per second is enough)
 * It is racy as we do not hold a lock, but race is very minor.
 */
static inline void tcp_synq_overflow(const struct sock *sk)
{
	unsigned int last_overflow;
	unsigned int now = jiffies;

	if (sk->sk_reuseport) {
		struct sock_reuseport *reuse;

		reuse = rcu_dereference(sk->sk_reuseport_cb);
		if (likely(reuse)) {
			last_overflow = READ_ONCE(reuse->synq_overflow_ts);
			if (time_after32(now, last_overflow + HZ))
				WRITE_ONCE(reuse->synq_overflow_ts, now);
			return;
		}
	}

	last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
	if (time_after32(now, last_overflow + HZ))
		tcp_sk(sk)->rx_opt.ts_recent_stamp = now;
}

/* syncookies: no recent synqueue overflow on this listening socket? */
static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
{
	unsigned int last_overflow;
	unsigned int now = jiffies;

	if (sk->sk_reuseport) {
		struct sock_reuseport *reuse;

		reuse = rcu_dereference(sk->sk_reuseport_cb);
		if (likely(reuse)) {
			last_overflow = READ_ONCE(reuse->synq_overflow_ts);
			return time_after32(now, last_overflow +
					    TCP_SYNCOOKIE_VALID);
		}
	}

	last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
	return time_after32(now, last_overflow + TCP_SYNCOOKIE_VALID);
}

static inline u32 tcp_cookie_time(void)
{
	u64 val = get_jiffies_64();

	do_div(val, TCP_SYNCOOKIE_PERIOD);
	return val;
}
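
/* Example: tcp_cookie_time() advances once per TCP_SYNCOOKIE_PERIOD,
 * i.e. once per minute of uptime regardless of HZ, so two consecutive
 * counter values span the TCP_SYNCOOKIE_VALID (two minute) window.
 */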

u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
			      u16 *mssp);
__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
u64 cookie_init_timestamp(struct request_sock *req);
bool cookie_timestamp_decode(const struct net *net,
			     struct tcp_options_received *opt);
bool cookie_ecn_ok(const struct tcp_options_received *opt,
		   const struct net *net, const struct dst_entry *dst);

/* From net/ipv6/syncookies.c */
int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
		      u32 cookie);
struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);

u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
			      const struct tcphdr *th, u16 *mssp);
__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
#endif

/* tcp_output.c */

void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle);
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
void tcp_retransmit_timer(struct sock *sk);
void tcp_xmit_retransmit_queue(struct sock *);
void tcp_simple_retransmit(struct sock *);
void tcp_enter_recovery(struct sock *sk, bool ece_ack);
int tcp_trim_head(struct sock *, struct sk_buff *, u32);
enum tcp_queue {
	TCP_FRAG_IN_WRITE_QUEUE,
	TCP_FRAG_IN_RTX_QUEUE,
};
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
		 struct sk_buff *skb, u32 len,
		 unsigned int mss_now, gfp_t gfp);

void tcp_send_probe0(struct sock *);
void tcp_send_partial(struct sock *);
int tcp_write_wakeup(struct sock *, int mib);
void tcp_send_fin(struct sock *sk);
void tcp_send_active_reset(struct sock *sk, gfp_t priority);
int tcp_send_synack(struct sock *);
void tcp_push_one(struct sock *, unsigned int mss_now);
void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
void tcp_send_ack(struct sock *sk);
void tcp_send_delayed_ack(struct sock *sk);
void tcp_send_loss_probe(struct sock *sk);
bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto);
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
			     const struct sk_buff *next_skb);

/* tcp_input.c */
void tcp_rearm_rto(struct sock *sk);
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
void tcp_reset(struct sock *sk);
void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
void tcp_fin(struct sock *sk);

/* tcp_timer.c */
void tcp_init_xmit_timers(struct sock *);
static inline void tcp_clear_xmit_timers(struct sock *sk)
{
	if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1)
		__sock_put(sk);

	if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1)
		__sock_put(sk);

	inet_csk_clear_xmit_timers(sk);
}

unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
unsigned int tcp_current_mss(struct sock *sk);

/* Bound MSS / TSO packet size with the half of the window */
static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
{
	int cutoff;

	/* When peer uses tiny windows, there is no use in packetizing
	 * to sub-MSS pieces for the sake of SWS or making sure there
	 * are enough packets in the pipe for fast recovery.
	 *
	 * On the other hand, for extremely large MSS devices, handling
	 * smaller than MSS windows in this way does make sense.
	 */
	if (tp->max_window > TCP_MSS_DEFAULT)
		cutoff = (tp->max_window >> 1);
	else
		cutoff = tp->max_window;

	if (cutoff && pktsize > cutoff)
		return max_t(int, cutoff, 68U - tp->tcp_header_len);
	else
		return pktsize;
}
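
/* Example: with tp->max_window = 65535 (> TCP_MSS_DEFAULT), cutoff is
 * 32767 and a 1460 byte pktsize passes through unchanged; with a tiny
 * peer window of 400 bytes, a 1460 byte pktsize is clamped to 400.
 */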

/* tcp.c */
void tcp_get_info(struct sock *, struct tcp_info *);

/* Read 'sendfile()'-style from a TCP socket */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor);

void tcp_initialize_rcv_mss(struct sock *sk);

int tcp_mtu_to_mss(struct sock *sk, int pmtu);
int tcp_mss_to_mtu(struct sock *sk, int mss);
void tcp_mtup_init(struct sock *sk);
void tcp_init_buffer_space(struct sock *sk);

static inline void tcp_bound_rto(const struct sock *sk)
{
	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
}

static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
{
	return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
}
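
/* This mirrors RFC6298 RTO = SRTT + 4*RTTVAR: srtt_us stores the
 * smoothed RTT left-shifted by 3, and rttvar_us already carries the
 * factor of four (the mean deviation is kept scaled by 4).
 */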

static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
			       ntohl(TCP_FLAG_ACK) |
			       snd_wnd);
}

static inline void tcp_fast_path_on(struct tcp_sock *tp)
{
	__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
}

static inline void tcp_fast_path_check(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
	    tp->rcv_wnd &&
	    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
	    !tp->urg_data)
		tcp_fast_path_on(tp);
}

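/* pred_flags packs the data offset, the ACK flag and the expected
 * window into one word for header prediction: shifting tcp_header_len
 * (in bytes) left by 26 equals shifting the word count (len/4) left by
 * 28, which is where the doff nibble sits in the 4th header word.
 */
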
/* Compute the actual rto_min value */
static inline u32 tcp_rto_min(struct sock *sk)
{
	const struct dst_entry *dst = __sk_dst_get(sk);
	u32 rto_min = TCP_RTO_MIN;

	if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
		rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
	return rto_min;
}

static inline u32 tcp_rto_min_us(struct sock *sk)
{
	return jiffies_to_usecs(tcp_rto_min(sk));
}

static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
{
	return dst_metric_locked(dst, RTAX_CC_ALGO);
}

/* Minimum RTT in usec. ~0 means not available. */
static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
{
	return minmax_get(&tp->rtt_min);
}

/* Compute the actual receive window we are currently advertising.
 * Rcv_nxt can be after the window if our peer pushes more data
 * than the offered window.
 */
static inline u32 tcp_receive_window(const struct tcp_sock *tp)
{
	s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;

	if (win < 0)
		win = 0;
	return (u32) win;
}
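
/* Example: rcv_wup is the rcv_nxt that was last advertised, so with
 * rcv_wup = 1000, rcv_wnd = 65535 and rcv_nxt = 2000 the remaining
 * advertised window is 1000 + 65535 - 2000 = 64535 bytes; it clamps
 * to 0 if the peer sent beyond the offered window.
 */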

/* Choose a new window, without checks for shrinking, and without
 * scaling applied to the result.  The caller does these things
 * if necessary.  This is a "raw" window selection.
 */
u32 __tcp_select_window(struct sock *sk);

void tcp_send_window_probe(struct sock *sk);

/* TCP uses 32bit jiffies to save some space.
 * Note that this is different from tcp_time_stamp, which
 * historically was the same value until linux-4.13.
 */
#define tcp_jiffies32 ((u32)jiffies)

/*
 * Deliver a 32bit value for TCP timestamp option (RFC 7323)
 * It is no longer tied to jiffies, but to 1 ms clock.
 * Note: double check if you want to use tcp_jiffies32 instead of this.
 */
#define TCP_TS_HZ	1000

static inline u64 tcp_clock_ns(void)
{
	return ktime_get_ns();
}

static inline u64 tcp_clock_us(void)
{
	return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
}

/* This should only be used in contexts where tp->tcp_mstamp is up to date */
static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
{
	return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
}

/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
static inline u32 tcp_time_stamp_raw(void)
{
	return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ);
}

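/* Example: with TCP_TS_HZ = 1000 the timestamp clock ticks once per
 * millisecond, so tcp_time_stamp_raw() is tcp_clock_ns() / 10^6 and
 * tcp_time_stamp() is tp->tcp_mstamp (in usec) / 10^3.
 */
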
void tcp_mstamp_refresh(struct tcp_sock *tp);

static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
{
	return max_t(s64, t1 - t0, 0);
}

static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
	return div_u64(skb->skb_mstamp_ns, NSEC_PER_SEC / TCP_TS_HZ);
}

/* provide the departure time in us unit */
static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
{
	return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
}

#define tcp_flag_byte(th) (((u_int8_t *)th)[13])

#define TCPHDR_FIN 0x01
#define TCPHDR_SYN 0x02
#define TCPHDR_RST 0x04
#define TCPHDR_PSH 0x08
#define TCPHDR_ACK 0x10
#define TCPHDR_URG 0x20
#define TCPHDR_ECE 0x40
#define TCPHDR_CWR 0x80

#define TCPHDR_SYN_ECN	(TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)

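/* Example: tcp_flag_byte(th) reads byte 13 of the TCP header, the
 * flags octet; a SYN-ACK segment has tcp_flag_byte(th) ==
 * (TCPHDR_SYN | TCPHDR_ACK) == 0x12.
 */
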
/* This is what the send packet queuing engine uses to pass
 * TCP per-packet control information to the transmission code.
 * We also store the host-order sequence numbers in here.
 * This is 44 bytes if IPV6 is enabled.
 * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
 */
struct tcp_skb_cb {
	__u32		seq;		/* Starting sequence number	*/
	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
	union {
		/* Note : tcp_tw_isn is used in input path only
		 *	  (isn chosen by tcp_timewait_state_process())
		 *
		 * 	  tcp_gso_segs/size are used in write queue only,
		 *	  cf tcp_skb_pcount()/tcp_skb_mss()
		 */
		__u32		tcp_tw_isn;
		struct {
			u16	tcp_gso_segs;
			u16	tcp_gso_size;
		};
	};
	__u8		tcp_flags;	/* TCP header flags. (tcp[13])	*/

	__u8		sacked;		/* State flags for SACK.	*/
#define TCPCB_SACKED_ACKED	0x01	/* SKB ACK'd by a SACK block	*/
#define TCPCB_SACKED_RETRANS	0x02	/* SKB retransmitted		*/
#define TCPCB_LOST		0x04	/* SKB is lost			*/
#define TCPCB_TAGBITS		0x07	/* All tag bits			*/
#define TCPCB_REPAIRED		0x10	/* SKB repaired (no skb_mstamp_ns)	*/
#define TCPCB_EVER_RETRANS	0x80	/* Ever retransmitted frame	*/
#define TCPCB_RETRANS		(TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
				TCPCB_REPAIRED)

	__u8		ip_dsfield;	/* IPv4 tos or IPv6 dsfield	*/
	__u8		txstamp_ack:1,	/* Record TX timestamp for ack? */
			eor:1,		/* Is skb MSG_EOR marked? */
			has_rxtstamp:1,	/* SKB has a RX timestamp	*/
			unused:5;
	__u32		ack_seq;	/* Sequence number ACK'd	*/
	union {
		struct {
			/* There is space for up to 24 bytes */
			__u32 in_flight:30,/* Bytes in flight at transmit */
			      is_app_limited:1, /* cwnd not fully used? */
			      unused:1;
			/* pkts S/ACKed so far upon tx of skb, incl retrans: */
			__u32 delivered;
			/* start of send pipeline phase */
			u64 first_tx_mstamp;
			/* when we reached the "delivered" count */
			u64 delivered_mstamp;
		} tx;   /* only used for outgoing skbs */
		union {
			struct inet_skb_parm	h4;
#if IS_ENABLED(CONFIG_IPV6)
			struct inet6_skb_parm	h6;
#endif
		} header;	/* For incoming skbs */
		struct {
			__u32 flags;
			struct sock *sk_redir;
			void *data_end;
		} bpf;
	};
};

#define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))

static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
}

static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
{
	return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
}

static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
{
	return TCP_SKB_CB(skb)->bpf.sk_redir;
}

static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
}

#if IS_ENABLED(CONFIG_IPV6)
/* This is the variant of inet6_iif() that must be used by TCP,
 * as TCP moves IP6CB into a different location in skb->cb[]
 */
static inline int tcp_v6_iif(const struct sk_buff *skb)
{
	return TCP_SKB_CB(skb)->header.h6.iif;
}

static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb)
{
	bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);

	return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
}

/* TCP_SKB_CB reference means this can not be used from early demux */
static inline int tcp_v6_sdif(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
	if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags))
		return TCP_SKB_CB(skb)->header.h6.iif;
#endif
	return 0;
}
#endif

static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
	if (!net->ipv4.sysctl_tcp_l3mdev_accept &&
	    skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
		return true;
#endif
	return false;
}

/* TCP_SKB_CB reference means this can not be used from early demux */
static inline int tcp_v4_sdif(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
	if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
		return TCP_SKB_CB(skb)->header.h4.iif;
#endif
	return 0;
}

/* Due to TSO, an SKB can be composed of multiple actual
 * packets.  To keep these tracked properly, we use this.
 */
static inline int tcp_skb_pcount(const struct sk_buff *skb)
{
	return TCP_SKB_CB(skb)->tcp_gso_segs;
}

static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs)
{
	TCP_SKB_CB(skb)->tcp_gso_segs = segs;
}

static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs)
{
	TCP_SKB_CB(skb)->tcp_gso_segs += segs;
}

/* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */
static inline int tcp_skb_mss(const struct sk_buff *skb)
{
	return TCP_SKB_CB(skb)->tcp_gso_size;
}

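/* Example: a single GSO skb carrying 20 MSS-sized segments of 1460
 * bytes has tcp_gso_size == 1460 and tcp_gso_segs == 20, so
 * tcp_skb_pcount() accounts it as 20 packets.
 */
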
static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb)
{
	return likely(!TCP_SKB_CB(skb)->eor);
}

/* Events passed to congestion control interface */
enum tcp_ca_event {
	CA_EVENT_TX_START,	/* first transmit when no packets in flight */
	CA_EVENT_CWND_RESTART,	/* congestion window restart */
	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
	CA_EVENT_LOSS,		/* loss timeout */
	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
};

/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
enum tcp_ca_ack_event_flags {
	CA_ACK_SLOWPATH		= (1 << 0),	/* In slow path processing */
	CA_ACK_WIN_UPDATE	= (1 << 1),	/* ACK updated window */
	CA_ACK_ECE		= (1 << 2),	/* ECE bit is set on ack */
};

/*
 * Interface for adding new TCP congestion control handlers
 */
#define TCP_CA_NAME_MAX	16
#define TCP_CA_MAX	128
#define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)

#define TCP_CA_UNSPEC	0

/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
#define TCP_CONG_NON_RESTRICTED 0x1
/* Requires ECN/ECT set on all packets */
#define TCP_CONG_NEEDS_ECN	0x2

union tcp_cc_info;

struct ack_sample {
	u32 pkts_acked;
	s32 rtt_us;
	u32 in_flight;
};

/* A rate sample measures the number of (original/retransmitted) data
 * packets delivered "delivered" over an interval of time "interval_us".
 * The tcp_rate.c code fills in the rate sample, and congestion
 * control modules that define a cong_control function to run at the end
 * of ACK processing can optionally choose to consult this sample when
 * setting cwnd and pacing rate.
 * A sample is invalid if "delivered" or "interval_us" is negative.
 */
struct rate_sample {
	u64  prior_mstamp; /* starting timestamp for interval */
	u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
	s32  delivered;		/* number of packets delivered over interval */
	long interval_us;	/* time for tp->delivered to incr "delivered" */
	u32 snd_interval_us;	/* snd interval for delivered packets */
	u32 rcv_interval_us;	/* rcv interval for delivered packets */
	long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
	int  losses;		/* number of packets marked lost upon ACK */
	u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
	u32  prior_in_flight;	/* in flight before this ACK */
	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
	bool is_retrans;	/* is sample from retransmission? */
	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
};

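/* Example: a cong_control module can estimate the delivery rate of a
 * valid sample as rs->delivered / rs->interval_us packets per usec,
 * skipping samples where either field is negative.
 */
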
struct tcp_congestion_ops {
	struct list_head	list;
	u32 key;
	u32 flags;

	/* initialize private data (optional) */
	void (*init)(struct sock *sk);
	/* cleanup private data  (optional) */
	void (*release)(struct sock *sk);

	/* return slow start threshold (required) */
	u32 (*ssthresh)(struct sock *sk);
	/* do new cwnd calculation (required) */
	void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);
	/* call before changing ca_state (optional) */
	void (*set_state)(struct sock *sk, u8 new_state);
	/* call when cwnd event occurs (optional) */
	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
	/* call when ack arrives (optional) */
	void (*in_ack_event)(struct sock *sk, u32 flags);
	/* new value of cwnd after loss (required) */
	u32  (*undo_cwnd)(struct sock *sk);
	/* hook for packet ack accounting (optional) */