/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for the TCP module.
 *
 * Version:	@(#)tcp.h	1.0.5	05/23/93
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#ifndef _TCP_H
#define _TCP_H

#define FASTRETRANS_DEBUG 1

#include <linux/list.h>
#include <linux/tcp.h>
#include <linux/bug.h>
#include <linux/slab.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/skbuff.h>
#include <linux/cryptohash.h>
#include <linux/kref.h>
#include <linux/ktime.h>

#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
#include <net/inet_hashtables.h>
#include <net/checksum.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/tcp_states.h>
#include <net/inet_ecn.h>
#include <net/dst.h>

#include <linux/seq_file.h>
#include <linux/memcontrol.h>
#include <linux/bpf-cgroup.h>

extern struct inet_hashinfo tcp_hashinfo;

extern struct percpu_counter tcp_orphan_count;
void tcp_time_wait(struct sock *sk, int state, int timeo);

#define MAX_TCP_HEADER	(128 + MAX_HEADER)
#define MAX_TCP_OPTION_SPACE 40

/*
 * Never offer a window over 32767 without using window scaling. Some
 * poor stacks do signed 16bit maths!
 */
#define MAX_TCP_WINDOW		32767U

/* Minimal accepted MSS. It is (60+60+8) - (20+20). */
#define TCP_MIN_MSS		88U

/* The least MTU to use for probing */
#define TCP_BASE_MSS		1024

/* probing interval, default to 10 minutes as per RFC4821 */
#define TCP_PROBE_INTERVAL	600

/* Specify interval when tcp mtu probing will stop */
#define TCP_PROBE_THRESHOLD	8

/* After receiving this amount of duplicate ACKs fast retransmit starts. */
#define TCP_FASTRETRANS_THRESH 3

/* Maximal number of ACKs sent quickly to accelerate slow-start. */
#define TCP_MAX_QUICKACKS	16U

/* Maximal number of window scale according to RFC1323 */
#define TCP_MAX_WSCALE		14U
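/* With the maximum scale of 14 the largest advertisable window is
 * 65535 << 14 bytes, i.e. roughly 1 GiB; that is the ceiling the
 * RFC 1323/7323 window scaling option can express.
 */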

/* urg_data states */
#define TCP_URG_VALID	0x0100
#define TCP_URG_NOTYET	0x0200
#define TCP_URG_READ	0x0400

#define TCP_RETR1	3	/*
				 * This is how many retries it does before it
				 * tries to figure out if the gateway is
				 * down. Minimal RFC value is 3; it corresponds
				 * to ~3sec-8min depending on RTO.
				 */

#define TCP_RETR2	15	/*
				 * This should take at least
				 * 90 minutes to time out.
				 * RFC1122 says that the limit is 100 sec.
				 * 15 is ~13-30min depending on RTO.
				 */

#define TCP_SYN_RETRIES	 6	/* This is how many retries are done
				 * when active opening a connection.
				 * RFC1122 says the minimum retry MUST
				 * be at least 180secs.  Nevertheless
				 * this value corresponds to
				 * 63secs of retransmission with the
				 * current initial RTO.
				 */

#define TCP_SYNACK_RETRIES 5	/* This is how many retries are done
				 * when passive opening a connection.
				 * This corresponds to 31secs of
				 * retransmission with the current
				 * initial RTO.
				 */

#define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT
				  * state, about 60 seconds	*/
#define TCP_FIN_TIMEOUT	TCP_TIMEWAIT_LEN
                                 /* BSD style FIN_WAIT2 deadlock breaker.
				  * It used to be 3min, new value is 60sec,
				  * to combine FIN-WAIT-2 timeout with
				  * TIME-WAIT timer.
				  */

#define TCP_DELACK_MAX	((unsigned)(HZ/5))	/* maximal time to delay before sending an ACK */
#if HZ >= 100
#define TCP_DELACK_MIN	((unsigned)(HZ/25))	/* minimal time to delay before sending an ACK */
#define TCP_ATO_MIN	((unsigned)(HZ/25))
#else
#define TCP_DELACK_MIN	4U
#define TCP_ATO_MIN	4U
#endif
#define TCP_RTO_MAX	((unsigned)(120*HZ))
#define TCP_RTO_MIN	((unsigned)(HZ/5))
#define TCP_TIMEOUT_MIN	(2U) /* Min timeout for TCP timers in jiffies */
#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))	/* RFC6298 2.1 initial RTO value	*/

#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ))	/* RFC 1122 initial RTO value, now
						 * used as a fallback RTO for the
						 * initial data transmission if no
						 * valid RTT sample has been acquired,
						 * most likely due to retrans in 3WHS.
						 */
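/* For example, with HZ=1000 the timer constants above evaluate to:
 * TCP_RTO_MIN = 200ms, TCP_RTO_MAX = 120s, TCP_DELACK_MIN = TCP_ATO_MIN
 * = 40ms and TCP_TIMEOUT_INIT = 1s, the RFC 6298 recommended initial RTO.
 */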

#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
					                 * for local resources.
					                 */
#define TCP_KEEPALIVE_TIME	(120*60*HZ)	/* two hours */
#define TCP_KEEPALIVE_PROBES	9		/* Max of 9 keepalive probes	*/
#define TCP_KEEPALIVE_INTVL	(75*HZ)

#define MAX_TCP_KEEPIDLE	32767
#define MAX_TCP_KEEPINTVL	32767
#define MAX_TCP_KEEPCNT		127
#define MAX_TCP_SYNCNT		127

#define TCP_SYNQ_INTERVAL	(HZ/5)	/* Period of SYNACK timer */

#define TCP_PAWS_24DAYS	(60 * 60 * 24 * 24)
#define TCP_PAWS_MSL	60		/* Per-host timestamps are invalidated
					 * after this time. It should be equal
					 * (or greater than) TCP_TIMEWAIT_LEN
					 * to provide reliability equal to one
					 * provided by timewait state.
					 */
#define TCP_PAWS_WINDOW	1		/* Replay window for per-host
					 * timestamps. It must be less than
					 * minimal timewait lifetime.
					 */
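/* The 24 day horizon above matches the wraparound period of a timestamp
 * clock ticking at roughly 1ms granularity: 2^31 ms is about 24.8 days.
 */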
/*
 *	TCP option
 */

#define TCPOPT_NOP		1	/* Padding */
#define TCPOPT_EOL		0	/* End of options */
#define TCPOPT_MSS		2	/* Segment size negotiating */
#define TCPOPT_WINDOW		3	/* Window scaling */
#define TCPOPT_SACK_PERM        4       /* SACK Permitted */
#define TCPOPT_SACK             5       /* SACK Block */
#define TCPOPT_TIMESTAMP	8	/* Better RTT estimations/PAWS */
#define TCPOPT_MD5SIG		19	/* MD5 Signature (RFC2385) */
#define TCPOPT_FASTOPEN		34	/* Fast open (RFC7413) */
#define TCPOPT_EXP		254	/* Experimental */
/* Magic number to be after the option value for sharing TCP
 * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
 */
#define TCPOPT_FASTOPEN_MAGIC	0xF989
#define TCPOPT_SMC_MAGIC	0xE2D4C3D9

/*
 *     TCP option lengths
 */

#define TCPOLEN_MSS            4
#define TCPOLEN_WINDOW         3
#define TCPOLEN_SACK_PERM      2
#define TCPOLEN_TIMESTAMP      10
#define TCPOLEN_MD5SIG         18
#define TCPOLEN_FASTOPEN_BASE  2
#define TCPOLEN_EXP_FASTOPEN_BASE  4
#define TCPOLEN_EXP_SMC_BASE   6

/* But this is what stacks really send out. */
#define TCPOLEN_TSTAMP_ALIGNED		12
#define TCPOLEN_WSCALE_ALIGNED		4
#define TCPOLEN_SACKPERM_ALIGNED	4
#define TCPOLEN_SACK_BASE		2
#define TCPOLEN_SACK_BASE_ALIGNED	4
#define TCPOLEN_SACK_PERBLOCK		8
#define TCPOLEN_MD5SIG_ALIGNED		20
#define TCPOLEN_MSS_ALIGNED		4
#define TCPOLEN_EXP_SMC_BASE_ALIGNED	8
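/* Example of the aligned form on the wire: the timestamp option is sent
 * as NOP, NOP, kind=8, len=10, tsval, tsecr, i.e. 12 bytes in total,
 * which is TCPOLEN_TSTAMP_ALIGNED above.
 */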

/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF		1	/* Nagle's algo is disabled */
#define TCP_NAGLE_CORK		2	/* Socket is corked	    */
#define TCP_NAGLE_PUSH		4	/* Cork is overridden for already queued data */

/* TCP thin-stream limits */
#define TCP_THIN_LINEAR_RETRIES 6       /* After 6 linear retries, do exp. backoff */

/* TCP initial congestion window as per rfc6928 */
#define TCP_INIT_CWND		10

/* Bit Flags for sysctl_tcp_fastopen */
#define	TFO_CLIENT_ENABLE	1
#define	TFO_SERVER_ENABLE	2
#define	TFO_CLIENT_NO_COOKIE	4	/* Data in SYN w/o cookie option */

/* Accept SYN data w/o any cookie option */
#define	TFO_SERVER_COOKIE_NOT_REQD	0x200

/* Force enable TFO on all listeners, i.e., not requiring the
 * TCP_FASTOPEN socket option.
 */
#define	TFO_SERVER_WO_SOCKOPT1	0x400

/* sysctl variables for tcp */
extern int sysctl_tcp_max_orphans;
extern long sysctl_tcp_mem[3];

#define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
#define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */

extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
extern unsigned long tcp_memory_pressure;

/* optimized version of sk_under_memory_pressure() for TCP sockets */
static inline bool tcp_under_memory_pressure(const struct sock *sk)
{
	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    mem_cgroup_under_socket_pressure(sk->sk_memcg))
		return true;

	return tcp_memory_pressure;
}

/*
 * The next routines deal with comparing 32 bit unsigned ints
 * and worry about wraparound (automatic with unsigned arithmetic).
 */

static inline bool before(__u32 seq1, __u32 seq2)
{
	return (__s32)(seq1-seq2) < 0;
}
#define after(seq2, seq1) 	before(seq1, seq2)

/* is s2<=s1<=s3 ? */
static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3)
{
	return seq3 - seq2 >= seq1 - seq2;
}
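
/* Example: these comparisons stay correct across sequence number
 * wraparound, e.g. before(0xfffffff0, 0x10) is true because
 * (__s32)(0xfffffff0 - 0x10) is negative, even though 0xfffffff0 is the
 * larger unsigned value.
 */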

static inline bool tcp_out_of_memory(struct sock *sk)
{
	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
	    sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2))
		return true;
	return false;
}

void sk_forced_mem_schedule(struct sock *sk, int size);

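/* The limit check below is two-phase: a cheap, possibly stale per-cpu
 * counter read first, and the exact (more expensive)
 * percpu_counter_sum_positive() only when the fast read suggests we
 * might be over the limit.
 */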
static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
{
	struct percpu_counter *ocp = sk->sk_prot->orphan_count;
	int orphans = percpu_counter_read_positive(ocp);

	if (orphans << shift > sysctl_tcp_max_orphans) {
		orphans = percpu_counter_sum_positive(ocp);
		if (orphans << shift > sysctl_tcp_max_orphans)
			return true;
	}
	return false;
}

bool tcp_check_oom(struct sock *sk, int shift);

extern struct proto tcp_prot;

#define TCP_INC_STATS(net, field)	SNMP_INC_STATS((net)->mib.tcp_statistics, field)
#define __TCP_INC_STATS(net, field)	__SNMP_INC_STATS((net)->mib.tcp_statistics, field)
#define TCP_DEC_STATS(net, field)	SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
#define TCP_ADD_STATS(net, field, val)	SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)

void tcp_tasklet_init(void);

void tcp_v4_err(struct sk_buff *skb, u32);

void tcp_shutdown(struct sock *sk, int how);

int tcp_v4_early_demux(struct sk_buff *skb);
int tcp_v4_rcv(struct sk_buff *skb);

int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
		 int flags);
int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
			size_t size, int flags);
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
		 size_t size, int flags);
void tcp_release_cb(struct sock *sk);
void tcp_wfree(struct sk_buff *skb);
void tcp_write_timer_handler(struct sock *sk);
void tcp_delack_timer_handler(struct sock *sk);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
			 const struct tcphdr *th);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
			struct pipe_inode_info *pipe, size_t len,
			unsigned int flags);

static inline void tcp_dec_quickack_mode(struct sock *sk,
					 const unsigned int pkts)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ack.quick) {
		if (pkts >= icsk->icsk_ack.quick) {
			icsk->icsk_ack.quick = 0;
			/* Leaving quickack mode we deflate ATO. */
			icsk->icsk_ack.ato   = TCP_ATO_MIN;
		} else
			icsk->icsk_ack.quick -= pkts;
	}
}

#define	TCP_ECN_OK		1
#define	TCP_ECN_QUEUE_CWR	2
#define	TCP_ECN_DEMAND_CWR	4
#define	TCP_ECN_SEEN		8

enum tcp_tw_status {
	TCP_TW_SUCCESS = 0,
	TCP_TW_RST = 1,
	TCP_TW_ACK = 2,
	TCP_TW_SYN = 3
};


enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
					      struct sk_buff *skb,
					      const struct tcphdr *th);
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
			   struct request_sock *req, bool fastopen,
			   bool *lost_race);
int tcp_child_process(struct sock *parent, struct sock *child,
		      struct sk_buff *skb);
void tcp_enter_loss(struct sock *sk);
void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag);
void tcp_clear_retrans(struct tcp_sock *tp);
void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
void tcp_metrics_init(void);
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
void tcp_close(struct sock *sk, long timeout);
void tcp_init_sock(struct sock *sk);
void tcp_init_transfer(struct sock *sk, int bpf_op);
__poll_t tcp_poll(struct file *file, struct socket *sock,
		      struct poll_table_struct *wait);
int tcp_getsockopt(struct sock *sk, int level, int optname,
		   char __user *optval, int __user *optlen);
int tcp_setsockopt(struct sock *sk, int level, int optname,
		   char __user *optval, unsigned int optlen);
int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int __user *optlen);
int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, unsigned int optlen);
void tcp_set_keepalive(struct sock *sk, int val);
void tcp_syn_ack_timeout(const struct request_sock *req);
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
		int flags, int *addr_len);
int tcp_set_rcvlowat(struct sock *sk, int val);
void tcp_data_ready(struct sock *sk);
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
		       struct tcp_options_received *opt_rx,
		       int estab, struct tcp_fastopen_cookie *foc);
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);

/*
 *	TCP v4 functions exported for the inet6 API
 */

void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
void tcp_v4_mtu_reduced(struct sock *sk);
void tcp_req_err(struct sock *sk, u32 seq, bool abort);
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_create_openreq_child(const struct sock *sk,
				      struct request_sock *req,
				      struct sk_buff *skb);
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
				  struct request_sock *req,
				  struct dst_entry *dst,
				  struct request_sock *req_unhash,
				  bool *own_req);
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int tcp_connect(struct sock *sk);
enum tcp_synack_type {
	TCP_SYNACK_NORMAL,
	TCP_SYNACK_FASTOPEN,
	TCP_SYNACK_COOKIE,
};
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
				struct request_sock *req,
				struct tcp_fastopen_cookie *foc,
				enum tcp_synack_type synack_type);
int tcp_disconnect(struct sock *sk, int flags);

void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size);
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);

/* From syncookies.c */
struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
				 struct request_sock *req,
				 struct dst_entry *dst, u32 tsoff);
int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
		      u32 cookie);
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
#ifdef CONFIG_SYN_COOKIES

/* Syncookies use a monotonic timer which increments every 60 seconds.
 * This counter is used both as a hash input and partially encoded into
 * the cookie value.  A cookie is only validated further if the delta
 * between the current counter value and the encoded one is less than this,
 * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if
 * the counter advances immediately after a cookie is generated).
 */
#define MAX_SYNCOOKIE_AGE	2
#define TCP_SYNCOOKIE_PERIOD	(60 * HZ)
#define TCP_SYNCOOKIE_VALID	(MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD)
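/* For example, with HZ=1000 the counter period is 60000 jiffies and
 * TCP_SYNCOOKIE_VALID is 120000 jiffies, i.e. a cookie is accepted for
 * at most 120 seconds after it was generated.
 */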

/* syncookies: remember time of last synqueue overflow
 * But do not dirty this field too often (once per second is enough)
 * It is racy as we do not hold a lock, but race is very minor.
 */
static inline void tcp_synq_overflow(const struct sock *sk)
{
	unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;
	unsigned long now = jiffies;

	if (time_after(now, last_overflow + HZ))
		tcp_sk(sk)->rx_opt.ts_recent_stamp = now;
}

/* syncookies: no recent synqueue overflow on this listening socket? */
static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
{
	unsigned long last_overflow = tcp_sk(sk)->rx_opt.ts_recent_stamp;

	return time_after(jiffies, last_overflow + TCP_SYNCOOKIE_VALID);
}

static inline u32 tcp_cookie_time(void)
{
	u64 val = get_jiffies_64();

	do_div(val, TCP_SYNCOOKIE_PERIOD);
	return val;
}

u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
			      u16 *mssp);
__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
u64 cookie_init_timestamp(struct request_sock *req);
bool cookie_timestamp_decode(const struct net *net,
			     struct tcp_options_received *opt);
bool cookie_ecn_ok(const struct tcp_options_received *opt,
		   const struct net *net, const struct dst_entry *dst);

/* From net/ipv6/syncookies.c */
int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
		      u32 cookie);
struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);

u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
			      const struct tcphdr *th, u16 *mssp);
__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
#endif
/* tcp_output.c */

void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle);
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
void tcp_retransmit_timer(struct sock *sk);
void tcp_xmit_retransmit_queue(struct sock *);
void tcp_simple_retransmit(struct sock *);
void tcp_enter_recovery(struct sock *sk, bool ece_ack);
int tcp_trim_head(struct sock *, struct sk_buff *, u32);
enum tcp_queue {
	TCP_FRAG_IN_WRITE_QUEUE,
	TCP_FRAG_IN_RTX_QUEUE,
};
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
		 struct sk_buff *skb, u32 len,
		 unsigned int mss_now, gfp_t gfp);

void tcp_send_probe0(struct sock *);
void tcp_send_partial(struct sock *);
int tcp_write_wakeup(struct sock *, int mib);
void tcp_send_fin(struct sock *sk);
void tcp_send_active_reset(struct sock *sk, gfp_t priority);
int tcp_send_synack(struct sock *);
void tcp_push_one(struct sock *, unsigned int mss_now);
void tcp_send_ack(struct sock *sk);
void tcp_send_delayed_ack(struct sock *sk);
void tcp_send_loss_probe(struct sock *sk);
bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto);
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
			     const struct sk_buff *next_skb);

/* tcp_input.c */
void tcp_rearm_rto(struct sock *sk);
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
void tcp_reset(struct sock *sk);
void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
void tcp_fin(struct sock *sk);

/* tcp_timer.c */
void tcp_init_xmit_timers(struct sock *);
static inline void tcp_clear_xmit_timers(struct sock *sk)
{
	hrtimer_cancel(&tcp_sk(sk)->pacing_timer);
	inet_csk_clear_xmit_timers(sk);
}

unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
unsigned int tcp_current_mss(struct sock *sk);

/* Bound MSS / TSO packet size with the half of the window */
static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
{
	int cutoff;

	/* When peer uses tiny windows, there is no use in packetizing
	 * to sub-MSS pieces for the sake of SWS or making sure there
	 * are enough packets in the pipe for fast recovery.
	 *
	 * On the other hand, for extremely large MSS devices, handling
	 * smaller than MSS windows in this way does make sense.
	 */
	if (tp->max_window > TCP_MSS_DEFAULT)
		cutoff = (tp->max_window >> 1);
	else
		cutoff = tp->max_window;

	if (cutoff && pktsize > cutoff)
		return max_t(int, cutoff, 68U - tp->tcp_header_len);
	else
		return pktsize;
}
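
/* Worked example: a peer advertising max_window = 64K (above
 * TCP_MSS_DEFAULT) gives cutoff = 32K, so a 48K TSO packet is bounded to
 * 32K; with a tiny max_window of 512 the cutoff is the whole window
 * rather than half of it, per the comment above.
 */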

/* tcp.c */
void tcp_get_info(struct sock *, struct tcp_info *);

/* Read 'sendfile()'-style from a TCP socket */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor);

void tcp_initialize_rcv_mss(struct sock *sk);

int tcp_mtu_to_mss(struct sock *sk, int pmtu);
int tcp_mss_to_mtu(struct sock *sk, int mss);
void tcp_mtup_init(struct sock *sk);
void tcp_init_buffer_space(struct sock *sk);

static inline void tcp_bound_rto(const struct sock *sk)
{
	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
}

static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
{
	return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
}

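/* tp->pred_flags below caches the expected 4th 32-bit word of the TCP
 * header (data offset, ACK flag and window) so the receive fast path in
 * tcp_rcv_established() can match incoming segments against it in a
 * single compare ("header prediction").
 */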
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
	tp->pred_flags = htonl((tp->tcp_header_len << 26) |
			       ntohl(TCP_FLAG_ACK) |
			       snd_wnd);
}

static inline void tcp_fast_path_on(struct tcp_sock *tp)
{
	__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
}

static inline void tcp_fast_path_check(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
	    tp->rcv_wnd &&
	    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
	    !tp->urg_data)
		tcp_fast_path_on(tp);
}

/* Compute the actual rto_min value */
static inline u32 tcp_rto_min(struct sock *sk)
{
	const struct dst_entry *dst = __sk_dst_get(sk);
	u32 rto_min = TCP_RTO_MIN;

	if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
		rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
	return rto_min;
}

static inline u32 tcp_rto_min_us(struct sock *sk)
{
	return jiffies_to_usecs(tcp_rto_min(sk));
}

static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
{
	return dst_metric_locked(dst, RTAX_CC_ALGO);
}

/* Minimum RTT in usec. ~0 means not available. */
static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
{
	return minmax_get(&tp->rtt_min);
}


/* Compute the actual receive window we are currently advertising.
 * Rcv_nxt can be after the window if our peer pushes more data
 * than the offered window.
 */
static inline u32 tcp_receive_window(const struct tcp_sock *tp)
{
	s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;

	if (win < 0)
		win = 0;
	return (u32) win;
}
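
/* For instance, with rcv_wup = 1000, rcv_wnd = 64000 and rcv_nxt = 1500
 * the remaining advertised window is 63500; if the peer overran the
 * offer and rcv_nxt passed rcv_wup + rcv_wnd, the result clamps to zero.
 */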

/* Choose a new window, without checks for shrinking, and without
 * scaling applied to the result.  The caller does these things
 * if necessary.  This is a "raw" window selection.
 */
u32 __tcp_select_window(struct sock *sk);

void tcp_send_window_probe(struct sock *sk);

/* TCP uses 32bit jiffies to save some space.
 * Note that this is different from tcp_time_stamp, which
 * historically has been the same until linux-4.13.
 */
#define tcp_jiffies32 ((u32)jiffies)

/*
 * Deliver a 32bit value for TCP timestamp option (RFC 7323)
 * It is no longer tied to jiffies, but to 1 ms clock.
 * Note: double check if you want to use tcp_jiffies32 instead of this.
 */
#define TCP_TS_HZ	1000

static inline u64 tcp_clock_ns(void)
{
	return local_clock();
}

static inline u64 tcp_clock_us(void)
{
	return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
}

/* This should only be used in contexts where tp->tcp_mstamp is up to date */
static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
{
	return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
}

/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
static inline u32 tcp_time_stamp_raw(void)
{
	return div_u64(tcp_clock_ns(), NSEC_PER_SEC / TCP_TS_HZ);
}
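
/* Unit summary: tcp_clock_ns() counts nanoseconds, tcp_clock_us()
 * microseconds, and with TCP_TS_HZ = 1000 the timestamp helpers above
 * tick in milliseconds, e.g. a tp->tcp_mstamp of 5000000 us maps to a
 * tcp_time_stamp() of 5000.
 */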


/* Refresh 1us clock of a TCP socket,
 * ensuring monotonically increasing values.
 */
static inline void tcp_mstamp_refresh(struct tcp_sock *tp)
{
	u64 val = tcp_clock_us();

	if (val > tp->tcp_mstamp)
		tp->tcp_mstamp = val;
}

static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
{
	return max_t(s64, t1 - t0, 0);
}

static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
	return div_u64(skb->skb_mstamp, USEC_PER_SEC / TCP_TS_HZ);
}


#define tcp_flag_byte(th) (((u_int8_t *)th)[13])

#define TCPHDR_FIN 0x01
#define TCPHDR_SYN 0x02
#define TCPHDR_RST 0x04
#define TCPHDR_PSH 0x08
#define TCPHDR_ACK 0x10
#define TCPHDR_URG 0x20
#define TCPHDR_ECE 0x40
#define TCPHDR_CWR 0x80

#define TCPHDR_SYN_ECN	(TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)
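
/* Example: a SYN-ACK carries tcp_flag_byte(th) == (TCPHDR_SYN | TCPHDR_ACK),
 * i.e. 0x12, and an ECN-setup SYN (RFC 3168) matches TCPHDR_SYN_ECN above.
 */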

/* This is what the send packet queuing engine uses to pass
 * TCP per-packet control information to the transmission code.
 * We also store the host-order sequence numbers in here too.
 * This is 44 bytes if IPV6 is enabled.
 * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
 */
struct tcp_skb_cb {
	__u32		seq;		/* Starting sequence number	*/
	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
	union {
		/* Note : tcp_tw_isn is used in input path only
		 *	  (isn chosen by tcp_timewait_state_process())
		 *
		 * 	  tcp_gso_segs/size are used in write queue only,
		 *	  cf tcp_skb_pcount()/tcp_skb_mss()
		 */
		__u32		tcp_tw_isn;
		struct {
			u16	tcp_gso_segs;
			u16	tcp_gso_size;
		};
	};
	__u8		tcp_flags;	/* TCP header flags. (tcp[13])	*/

	__u8		sacked;		/* State flags for SACK.	*/
#define TCPCB_SACKED_ACKED	0x01	/* SKB ACK'd by a SACK block	*/
#define TCPCB_SACKED_RETRANS	0x02	/* SKB retransmitted		*/
#define TCPCB_LOST		0x04	/* SKB is lost			*/
#define TCPCB_TAGBITS		0x07	/* All tag bits			*/
#define TCPCB_REPAIRED		0x10	/* SKB repaired (no skb_mstamp)	*/
#define TCPCB_EVER_RETRANS	0x80	/* Ever retransmitted frame	*/
#define TCPCB_RETRANS		(TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
				TCPCB_REPAIRED)

	__u8		ip_dsfield;	/* IPv4 tos or IPv6 dsfield	*/
	__u8		txstamp_ack:1,	/* Record TX timestamp for ack? */
			eor:1,		/* Is skb MSG_EOR marked? */
			has_rxtstamp:1,	/* SKB has a RX timestamp	*/
			unused:5;
	__u32		ack_seq;	/* Sequence number ACK'd	*/
	union {
		struct {
			/* There is space for up to 24 bytes */
			__u32 in_flight:30,/* Bytes in flight at transmit */
			      is_app_limited:1, /* cwnd not fully used? */
			      unused:1;
			/* pkts S/ACKed so far upon tx of skb, incl retrans: */
			__u32 delivered;
			/* start of send pipeline phase */
			u64 first_tx_mstamp;
			/* when we reached the "delivered" count */
			u64 delivered_mstamp;
		} tx;   /* only used for outgoing skbs */
		union {
			struct inet_skb_parm	h4;
#if IS_ENABLED(CONFIG_IPV6)
			struct inet6_skb_parm	h6;
#endif
		} header;	/* For incoming skbs */
		struct {
			__u32 key;
			__u32 flags;
			struct bpf_map *map;
			void *data_end;
		} bpf;
	};
};

#define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
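/* The TCP layer owns skb->cb[] while a packet sits in its queues, e.g.
 * TCP_SKB_CB(skb)->seq and TCP_SKB_CB(skb)->end_seq delimit the sequence
 * range the skb covers; the struct above must keep fitting in the 48 byte
 * cb[] array declared in skbuff.h.
 */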

#if IS_ENABLED(CONFIG_IPV6)
/* This is the variant of inet6_iif() that must be used by TCP,
 * as TCP moves IP6CB into a different location in skb->cb[]
 */
static inline int tcp_v6_iif(const struct sk_buff *skb)
{
	bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);

	return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
}

/* TCP_SKB_CB reference means this can not be used from early demux */
static inline int tcp_v6_sdif(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
	if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags))
		return TCP_SKB_CB(skb)->header.h6.iif;
#endif
	return 0;
}
#endif

static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
	if (!net->ipv4.sysctl_tcp_l3mdev_accept &&
	    skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
		return true;
#endif
	return false;
}

/* TCP_SKB_CB reference means this can not be used from early demux */
static inline int tcp_v4_sdif(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
	if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
		return TCP_SKB_CB(skb)->header.h4.iif;
#endif
	return 0;
}


/* Due to TSO, an SKB can be composed of multiple actual
 * packets.  To keep these tracked properly, we use this.
 */
static inline int tcp_skb_pcount(const struct sk_buff *skb)
{
	return TCP_SKB_CB(skb)->tcp_gso_segs;
}

static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs)
{
	TCP_SKB_CB(skb)->tcp_gso_segs = segs;
}

static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs)
{
	TCP_SKB_CB(skb)->tcp_gso_segs += segs;
}

/* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */
static inline int tcp_skb_mss(const struct sk_buff *skb)
{
	return TCP_SKB_CB(skb)->tcp_gso_size;
}

static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb)
{
	return likely(!TCP_SKB_CB(skb)->eor);
}

/* Events passed to congestion control interface */
enum tcp_ca_event {
	CA_EVENT_TX_START,	/* first transmit when no packets in flight */
	CA_EVENT_CWND_RESTART,	/* congestion window restart */
	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
	CA_EVENT_LOSS,		/* loss timeout */
	CA_EVENT_ECN_NO_CE,	/* ECT set, but not CE marked */
	CA_EVENT_ECN_IS_CE,	/* received CE marked IP packet */
	CA_EVENT_DELAYED_ACK,	/* Delayed ack is sent */
	CA_EVENT_NON_DELAYED_ACK,
};

/* Information about inbound ACK, passed to cong_ops->in_ack_event() */
enum tcp_ca_ack_event_flags {
	CA_ACK_SLOWPATH		= (1 << 0),	/* In slow path processing */
	CA_ACK_WIN_UPDATE	= (1 << 1),	/* ACK updated window */
	CA_ACK_ECE		= (1 << 2),	/* ECE bit is set on ack */
};

/*
 * Interface for adding new TCP congestion control handlers
 */
#define TCP_CA_NAME_MAX	16
#define TCP_CA_MAX	128
#define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)

#define TCP_CA_UNSPEC	0

/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
#define TCP_CONG_NON_RESTRICTED 0x1
/* Requires ECN/ECT set on all packets */
#define TCP_CONG_NEEDS_ECN	0x2

union tcp_cc_info;

struct ack_sample {
	u32 pkts_acked;
	s32 rtt_us;
	u32 in_flight;
};

/* A rate sample measures the number of (original/retransmitted) data
 * packets delivered "delivered" over an interval of time "interval_us".
 * The tcp_rate.c code fills in the rate sample, and congestion
 * control modules that define a cong_control function to run at the end
 * of ACK processing can optionally chose to consult this sample when
 * setting cwnd and pacing rate.
 * A sample is invalid if "delivered" or "interval_us" is negative.
 */
struct rate_sample {
	u64  prior_mstamp; /* starting timestamp for interval */
	u32  prior_delivered;	/* tp->delivered at "prior_mstamp" */
	s32  delivered;		/* number of packets delivered over interval */
	long interval_us;	/* time for tp->delivered to incr "delivered" */
	long rtt_us;		/* RTT of last (S)ACKed packet (or -1) */
	int  losses;		/* number of packets marked lost upon ACK */
	u32  acked_sacked;	/* number of packets newly (S)ACKed upon ACK */
	u32  prior_in_flight;	/* in flight before this ACK */
	bool is_app_limited;	/* is sample from packet with bubble in pipe? */
	bool is_retrans;	/* is sample from retransmission? */
	bool is_ack_delayed;	/* is this (likely) a delayed ACK? */
};
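
/* A rate sample translates directly into a delivery-rate estimate: a
 * BBR-style cong_control hook can take delivered / interval_us (scaled
 * by the MSS) as its bandwidth input, discounting samples where
 * is_app_limited is set unless they prove a higher rate.
 */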

struct tcp_congestion_ops {
	struct list_head	list;
	u32 key;
	u32 flags;

	/* initialize private data (optional) */
	void (*init)(struct sock *sk);
	/* cleanup private data  (optional) */
	void (*release)(struct sock *sk);

	/* return slow start threshold (required) */
	u32 (*ssthresh)(struct sock *sk);
	/* do new cwnd calculation (required) */
	void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked);
	/* call before changing ca_state (optional) */
	void (*set_state)(struct sock *sk, u8 new_state);
	/* call when cwnd event occurs (optional) */
	void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
	/* call when ack arrives (optional) */
	void (*in_ack_event)(struct sock *sk, u32 flags);
	/* new value of cwnd after loss (required) */
	u32  (*undo_cwnd)(struct sock *sk);
	/* hook for packet ack accounting (optional) */
	void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
	/* override sysctl_tcp_min_tso_segs */
	u32 (*min_tso_segs)(struct sock *sk);
	/* returns the multiplier used in tcp_sndbuf_expand (optional) */
	u32 (*sndbuf_expand)(struct sock *sk);
	/* call when packets are delivered to update cwnd and pacing rate,
	 * after all the ca_state processing. (optional)
	 */
	void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
	/* get info for inet_diag (optional) */
	size_t (*get_info)(struct sock *sk, u32 ext, int *attr,
			   union tcp_cc_info *info);

	char 		name[TCP_CA_NAME_MAX];
	struct module 	*owner;
};

int tcp_register_congestion_control(struct tcp_congestion_ops *type);
void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);

void tcp_assign_congestion_control(struct sock *sk);
void tcp_init_congestion_control(struct sock *sk);
void tcp_cleanup_congestion_control(struct sock *sk);
int tcp_set_default_congestion_control(struct net *net, const char *name);
void tcp_get_default_congestion_control(struct net *net, char *name);
void tcp_get_available_congestion_control(char *buf, size_t len);
void tcp_get_allowed_congestion_control(char *buf, size_t len);
int tcp_set_allowed_congestion_control(char *allowed);
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit);
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);

u32 tcp_reno_ssthresh(struct sock *sk);
u32 tcp_reno_undo_cwnd(struct sock *sk);
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno;
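
/* A minimal congestion control module sketch (illustrative only; the
 * "tcp_example" name is hypothetical). Only the required hooks -
 * ssthresh, cong_avoid and undo_cwnd - are filled in, reusing the Reno
 * helpers declared above:
 *
 *	static struct tcp_congestion_ops tcp_example __read_mostly = {
 *		.ssthresh	= tcp_reno_ssthresh,
 *		.cong_avoid	= tcp_reno_cong_avoid,
 *		.undo_cwnd	= tcp_reno_undo_cwnd,
 *		.name		= "example",
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init tcp_example_init(void)
 *	{
 *		return tcp_register_congestion_control(&tcp_example);
 *	}
 *
 *	static void __exit tcp_example_exit(void)
 *	{
 *		tcp_unregister_congestion_control(&tcp_example);
 *	}
 */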

struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca);
#ifdef CONFIG_INET
char *tcp_ca_get_name_by_key(u32 key, char *buffer);
#else
static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
	return NULL;
}
#endif

static inline bool tcp_ca_needs_ecn(const struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
}

static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
{