// SPDX-License-Identifier: GPL-2.0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *		Pedro Roque	:	Fast Retransmit/Recovery.
 *					Two receive queues.
 *					Retransmit queue handled by TCP.
 *					Better retransmit timer handling.
 *					New congestion avoidance.
 *					Header prediction.
 *					Variable renaming.
 *
 *		Eric		:	Fast Retransmit.
 *		Randy Scott	:	MSS option defines.
 *		Eric Schenk	:	Fixes to slow start algorithm.
 *		Eric Schenk	:	Yet another double ACK bug.
 *		Eric Schenk	:	Delayed ACK bug fixes.
 *		Eric Schenk	:	Floyd style fast retrans war avoidance.
 *		David S. Miller	:	Don't allow zero congestion window.
 *		Eric Schenk	:	Fix retransmitter so that it sends
 *					next packet on ack of previous packet.
 *		Andi Kleen	:	Moved open_request checking here
 *					and process RSTs for open_requests.
 *		Andi Kleen	:	Better prune_queue, and other fixes.
 *		Andrey Savochkin:	Fix RTT measurements in the presence of
 *					timestamps.
 *		Andrey Savochkin:	Check sequence numbers correctly when
 *					removing SACKs due to in sequence incoming
 *					data segments.
 *		Andi Kleen:		Make sure we never ack data there is not
 *					enough room for. Also make this condition
 *					a fatal error if it might still happen.
 *		Andi Kleen:		Add tcp_measure_rcv_mss to make
 *					connections with MSS<min(MTU,ann. MSS)
 *					work without delayed acks.
 *		Andi Kleen:		Process packets with PSH set in the
 *					fast path.
 *		J Hadi Salim:		ECN support
 *	 	Andrei Gurtov,
 *		Pasi Sarolahti,
 *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
 *					engine. Lots of bugs are found.
 *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <linux/errqueue.h>
#include <trace/events/tcp.h>
#include <linux/jump_label_ratelimit.h>
#include <net/busy_poll.h>
#include <net/mptcp.h>

int sysctl_tcp_max_orphans __read_mostly = NR_FILE;

#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
#define FLAG_ECE		0x40 /* ECE in this ACK				*/
#define FLAG_LOST_RETRANS	0x80 /* This ACK marks some retransmission lost */
#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SET_XMIT_TIMER	0x1000 /* Set TLP or RTO timer */
#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack()	*/
#define FLAG_ACK_MAYBE_DELAYED	0x10000 /* Likely a delayed ACK */

#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE|FLAG_DSACKING_ACK)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

#define REXMIT_NONE	0 /* no loss recovery to do */
#define REXMIT_LOST	1 /* retransmit packets marked lost */
#define REXMIT_NEW	2 /* FRTO-style transmit of unsent/new packets */

#if IS_ENABLED(CONFIG_TLS_DEVICE)
static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);

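/* Register a hook (used by TLS_DEVICE offload) invoked from the ACK
 * processing path with the newly advanced snd_una, so already-acked
 * data can be released early. The deferred static key keeps the extra
 * indirect call out of the fast path while no socket uses it.
 */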
void clean_acked_data_enable(struct inet_connection_sock *icsk,
			     void (*cad)(struct sock *sk, u32 ack_seq))
{
	icsk->icsk_clean_acked = cad;
	static_branch_deferred_inc(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_enable);

void clean_acked_data_disable(struct inet_connection_sock *icsk)
{
	static_branch_slow_dec_deferred(&clean_acked_data_enabled);
	icsk->icsk_clean_acked = NULL;
}
EXPORT_SYMBOL_GPL(clean_acked_data_disable);

void clean_acked_data_flush(void)
{
	static_key_deferred_flush(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_flush);
#endif

static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
			     unsigned int len)
{
	static bool __once __read_mostly;

	if (!__once) {
		struct net_device *dev;

		__once = true;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
		if (!dev || len >= dev->mtu)
			pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
				dev ? dev->name : "Unknown driver");
		rcu_read_unlock();
	}
}

/* Adapt the MSS value used to make delayed ack decision to the
 * real world.
 */
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const unsigned int lss = icsk->icsk_ack.last_seg_size;
	unsigned int len;

	icsk->icsk_ack.last_seg_size = 0;

	/* skb->len may jitter because of SACKs, even if peer
	 * sends good full-sized frames.
	 */
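	/* For GRO-aggregated skbs, gso_size preserves the size of the
	 * original segments as sent by the peer, so prefer it over the
	 * merged skb->len.
	 */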
	len = skb_shinfo(skb)->gso_size ? : skb->len;
	if (len >= icsk->icsk_ack.rcv_mss) {
		icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
					       tcp_sk(sk)->advmss);
		/* Account for possibly-removed options */
		if (unlikely(len > icsk->icsk_ack.rcv_mss +
				   MAX_TCP_OPTION_SPACE))
			tcp_gro_dev_warn(sk, skb, len);
	} else {
		/* Otherwise, we make a more careful check, taking into
		 * account that the SACK block is variable.
		 *
		 * "len" is invariant segment length, including TCP header.
		 */
		len += skb->data - skb_transport_header(skb);
		if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
		    /* If PSH is not set, the packet should be
		     * full sized, provided the peer TCP is not badly broken.
		     * This observation (if it is correct 8)) allows us
		     * to handle super-low MTU links fairly.
		     */
		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
		     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
			/* Also subtract the invariant part (if the peer is
			 * RFC compliant): TCP header plus fixed timestamp
			 * option length. The resulting "len" is the MSS,
			 * free of SACK jitter.
			 */
			len -= tcp_sk(sk)->tcp_header_len;
			icsk->icsk_ack.last_seg_size = len;
			if (len == lss) {
				icsk->icsk_ack.rcv_mss = len;
				return;
			}
		}
		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
	}
}

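/* Grant enough quick ACKs to cover a full receive window's worth of
 * incoming data at one ACK per two full-sized segments, clamped to at
 * least 2 and at most max_quickacks.
 */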
static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);

	if (quickacks == 0)
		quickacks = 2;
	quickacks = min(quickacks, max_quickacks);
	if (quickacks > icsk->icsk_ack.quick)
		icsk->icsk_ack.quick = quickacks;
}

void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	tcp_incr_quickack(sk, max_quickacks);
	inet_csk_exit_pingpong_mode(sk);
	icsk->icsk_ack.ato = TCP_ATO_MIN;
}
EXPORT_SYMBOL(tcp_enter_quickack_mode);

/* Send ACKs quickly, if "quick" count is not exhausted
 * and the session is not interactive.
 */

static bool tcp_in_quickack_mode(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);

	return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
		(icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
}

static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
{
	if (tp->ecn_flags & TCP_ECN_OK)
		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}

static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
{
	if (tcp_hdr(skb)->cwr) {
		tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;

		/* If the sender is telling us it has entered CWR, then its
		 * cwnd may be very low (even just 1 packet), so we should ACK
		 * immediately.
		 */
		if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
			inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
	}
}

static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
{
	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
}

static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
	case INET_ECN_NOT_ECT:
		/* Funny extension: if ECT is not set on a segment,
		 * and we have already seen ECT on a previous segment,
		 * it is probably a retransmit.
		 */
		if (tp->ecn_flags & TCP_ECN_SEEN)
			tcp_enter_quickack_mode(sk, 2);
		break;
	case INET_ECN_CE:
		if (tcp_ca_needs_ecn(sk))
			tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);

		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
			/* Better not delay acks, sender can have a very low cwnd */
			tcp_enter_quickack_mode(sk, 2);
			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
		}
		tp->ecn_flags |= TCP_ECN_SEEN;
		break;
	default:
		if (tcp_ca_needs_ecn(sk))
			tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
		tp->ecn_flags |= TCP_ECN_SEEN;
		break;
	}
}

static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
	if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
		__tcp_ecn_check_ce(sk, skb);
}

static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
		return true;
	return false;
}

/* Buffer size and advertised window tuning.
 *
 * 1. Tuning sk->sk_sndbuf, when connection enters established state.
 */

static void tcp_sndbuf_expand(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	int sndmem, per_mss;
	u32 nr_segs;

	/* Worst case is non GSO/TSO: each frame consumes one skb
	 * and skb->head is kmalloced from a power-of-two area of memory
	 */
	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
		  MAX_TCP_HEADER +
		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	per_mss = roundup_pow_of_two(per_mss) +
		  SKB_DATA_ALIGN(sizeof(struct sk_buff));

	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);

	/* Fast Recovery (RFC 5681 3.2) :
	 * Cubic needs 1.7 factor, rounded to 2 to include
	 * extra cushion (application might react slowly to EPOLLOUT)
	 */
	sndmem = ca_ops->sndbuf_expand ? ca_ops->sndbuf_expand(sk) : 2;
	sndmem *= nr_segs * per_mss;

	if (sk->sk_sndbuf < sndmem)
		WRITE_ONCE(sk->sk_sndbuf,
			   min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]));
}

/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
 *
 * All tcp_full_space() is split to two parts: "network" buffer, allocated
 * forward and advertised in receiver window (tp->rcv_wnd) and
 * "application buffer", required to isolate scheduling/application
 * latencies from network.
 * window_clamp is the maximal advertised window. It can be less than
 * tcp_full_space(), in which case tcp_full_space() - window_clamp
 * is reserved for the "application" buffer. The smaller window_clamp is,
 * the smoother our behaviour from the viewpoint of the network, but the
 * lower the throughput and the higher the connection's sensitivity to
 * losses. 8)
 *
 * rcv_ssthresh is a stricter window_clamp used during the "slow start"
 * phase to predict the further behaviour of this connection.
 * It is used for two goals:
 * - to enforce header prediction at sender, even when application
 *   requires some significant "application buffer". It is check #1.
 * - to prevent pruning of receive queue because of misprediction
 *   of receiver window. Check #2.
 *
 * The scheme does not work when sender sends good segments opening
 * window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */

/* Slow part of check#2. */
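/* Scale the skb's true memory cost and the maximal window down in
 * lock-step; growth (by twice the received MSS) is granted only if the
 * payload covers the scaled truesize before the window drops below
 * rcv_ssthresh.
 */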
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Optimize this! */
	int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
	int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;

	while (tp->rcv_ssthresh <= window) {
		if (truesize <= skb->len)
			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;

		truesize >>= 1;
		window >>= 1;
	}
	return 0;
}

static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int room;

	room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;

	/* Check #1 */
	if (room > 0 && !tcp_under_memory_pressure(sk)) {
		int incr;

		/* Check #2. Increase window, if an skb with such overhead
		 * will fit into rcvbuf in the future.
		 */
		if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
			incr = 2 * tp->advmss;
		else
			incr = __tcp_grow_window(sk, skb);

		if (incr) {
			incr = max_t(int, incr, 2 * skb->len);
			tp->rcv_ssthresh += min(room, incr);
			inet_csk(sk)->icsk_ack.quick |= 1;
		}
	}
}

/* 3. Try to fixup all. It is made immediately after connection enters
 *    established state.
 */
static void tcp_init_buffer_space(struct sock *sk)
{
	int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
	struct tcp_sock *tp = tcp_sk(sk);
	int maxwin;

	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
		tcp_sndbuf_expand(sk);

	tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss);
	tcp_mstamp_refresh(tp);
	tp->rcvq_space.time = tp->tcp_mstamp;
	tp->rcvq_space.seq = tp->copied_seq;
	maxwin = tcp_full_space(sk);

	if (tp->window_clamp >= maxwin) {
		tp->window_clamp = maxwin;

		if (tcp_app_win && maxwin > 4 * tp->advmss)
			tp->window_clamp = max(maxwin -
					       (maxwin >> tcp_app_win),
					       4 * tp->advmss);
	}

	/* Force reservation of one segment. */
	if (tcp_app_win &&
	    tp->window_clamp > 2 * tp->advmss &&
	    tp->window_clamp + tp->advmss > maxwin)
		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);

	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
	tp->snd_cwnd_stamp = tcp_jiffies32;
}

/* 4. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct net *net = sock_net(sk);

	icsk->icsk_ack.quick = 0;

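	/* If rcvbuf is not user-locked and we are under neither socket nor
	 * protocol memory pressure, let it grow to cover what is already
	 * queued, capped by sysctl_tcp_rmem[2].
	 */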
	if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
	    !tcp_under_memory_pressure(sk) &&
	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
		WRITE_ONCE(sk->sk_rcvbuf,
			   min(atomic_read(&sk->sk_rmem_alloc),
			       net->ipv4.sysctl_tcp_rmem[2]));
	}
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}

/* Initialize RCV_MSS value.
 * RCV_MSS is our guess about the MSS used by the peer.
 * We don't have any direct information about the MSS.
 * It's better to underestimate the RCV_MSS rather than overestimate it.
 * Overestimations make us ACK less frequently than needed.
 * Underestimations are easier to detect and fix by tcp_measure_rcv_mss().
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);

	hint = min(hint, tp->rcv_wnd / 2);
	hint = min(hint, TCP_MSS_DEFAULT);
	hint = max(hint, TCP_MIN_MSS);

	inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
EXPORT_SYMBOL(tcp_initialize_rcv_mss);

/* Receiver "autotuning" code.
 *
 * The algorithm for RTT estimation w/o timestamps is based on
 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
 * <http://public.lanl.gov/radiant/pubs.html#DRS>
 *
 * More detail on this code can be found at
 * <http://staff.psc.edu/jheffner/>,
 * though this reference is out of date.  A new paper
 * is pending.
 */
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
	u32 new_sample = tp->rcv_rtt_est.rtt_us;
	long m = sample;

	if (new_sample != 0) {
		/* If we sample in larger samples in the non-timestamp
		 * case, we could grossly overestimate the RTT especially
		 * with chatty applications or bulk transfer apps which
		 * are stalled on filesystem I/O.
		 *
		 * Also, since we are only going for a minimum in the
		 * non-timestamp case, we do not smooth things out,
		 * else with timestamps disabled convergence takes too
		 * long.
		 */
		if (!win_dep) {
			m -= (new_sample >> 3);
			new_sample += m;
		} else {
			m <<= 3;
			if (m < new_sample)
				new_sample = m;
		}
	} else {
		/* No previous measure. */
		new_sample = m << 3;
	}

	tp->rcv_rtt_est.rtt_us = new_sample;
}

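/* Non-timestamp receive RTT sample: measure the time it takes rcv_nxt
 * to advance by one full receive window; win_dep=1 makes
 * tcp_rcv_rtt_update() treat it as a minimum-tracking sample.
 */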
static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
	u32 delta_us;

	if (tp->rcv_rtt_est.time == 0)
		goto new_measure;
	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
		return;
	delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
	if (!delta_us)
		delta_us = 1;
	tcp_rcv_rtt_update(tp, delta_us, 1);

new_measure:
	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
	tp->rcv_rtt_est.time = tp->tcp_mstamp;
}

static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
					  const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

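	/* Use each echoed timestamp only once, and only for segments of
	 * at least rcv_mss bytes, so that delayed ACKs covering small
	 * segments do not inflate the estimate.
	 */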
	if (tp->rx_opt.rcv_tsecr == tp->rcv_rtt_last_tsecr)
		return;
	tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;

	if (TCP_SKB_CB(skb)->end_seq -
	    TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
		u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
		u32 delta_us;

		if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
			if (!delta)
				delta = 1;
			delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
			tcp_rcv_rtt_update(tp, delta_us, 0);
		}
	}
}

/*
 * This function should be called every time data is copied to user space.
 * It calculates the appropriate TCP receive buffer space.
 */
void tcp_rcv_space_adjust(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 copied;
	int time;

	trace_tcp_rcv_space_adjust(sk);

	tcp_mstamp_refresh(tp);
	time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
	if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
		return;

	/* Number of bytes copied to user in last RTT */
	copied = tp->copied_seq - tp->rcvq_space.seq;
	if (copied <= tp->rcvq_space.space)
		goto new_measure;

	/* A bit of theory :
	 * copied = bytes received in previous RTT, our base window
	 * To cope with packet losses, we need a 2x factor
	 * To cope with slow start and the sender growing its cwnd by 100 %
	 * every RTT, we need a 4x factor, because the ACK we are sending
	 * now is for the next RTT, not the current one :
	 * <prev RTT . ><current RTT .. ><next RTT .... >
	 */

	if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvmem, rcvbuf;
		u64 rcvwin, grow;

		/* minimal window to cope with packet losses, assuming
		 * steady state. Add some cushion because of small variations.
		 */
		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;

		/* Accommodate for sender rate increase (eg. slow start) */
		grow = rcvwin * (copied - tp->rcvq_space.space);
		do_div(grow, tp->rcvq_space.space);
		rcvwin += (grow << 1);

		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
		while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
			rcvmem += 128;

		do_div(rcvwin, tp->advmss);
		rcvbuf = min_t(u64, rcvwin * rcvmem,
			       sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
		if (rcvbuf > sk->sk_rcvbuf) {
			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);

			/* Make the window clamp follow along.  */
			tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
		}
	}
	tp->rcvq_space.space = copied;

new_measure:
	tp->rcvq_space.seq = tp->copied_seq;
	tp->rcvq_space.time = tp->tcp_mstamp;
}

/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  This means that until we send the first few ACK's the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now;

	inet_csk_schedule_ack(sk);

	tcp_measure_rcv_mss(sk, skb);

	tcp_rcv_rtt_measure(tp);

	now = tcp_jiffies32;

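	/* Adapt the delayed-ACK timeout (ato) to the observed inter-arrival
	 * gap m, keeping it near TCP_ATO_MIN for closely spaced segments
	 * and never letting it exceed the RTO.
	 */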
	if (!icsk->icsk_ack.ato) {
		/* The _first_ data packet received, initialize
		 * delayed ACK engine.
		 */
		tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
		icsk->icsk_ack.ato = TCP_ATO_MIN;
	} else {
		int m = now - icsk->icsk_ack.lrcvtime;

		if (m <= TCP_ATO_MIN / 2) {
			/* The fastest case is the first. */
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
		} else if (m < icsk->icsk_ack.ato) {
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
			if (icsk->icsk_ack.ato > icsk->icsk_rto)
				icsk->icsk_ack.ato = icsk->icsk_rto;
		} else if (m > icsk->icsk_rto) {
			/* Too long gap. Apparently sender failed to
			 * restart window, so that we send ACKs quickly.
			 */
			tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
			sk_mem_reclaim(sk);
		}
	}
	icsk->icsk_ack.lrcvtime = now;

	tcp_ecn_check_ce(sk, skb);

	if (skb->len >= 128)
		tcp_grow_window(sk, skb);
}

/* Called to compute a smoothed rtt estimate. The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
{
	struct tcp_sock *tp = tcp_sk(sk);
	long m = mrtt_us; /* RTT */
	u32 srtt = tp->srtt_us;

	/*	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88.  Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible
	 *	m stands for "measurement".
	 *
	 *	In a 1990 paper the rto value is changed to:
	 *	RTO = rtt + 4 * mdev
	 *
	 * Funny. This algorithm seems to be very broken.
	 * These formulae increase RTO when it should be decreased, increase
	 * too slowly when it should be increased quickly, decrease too
	 * quickly, etc. I guess in BSD RTO takes ONE value, so that it
	 * absolutely does not matter how to _calculate_ it. Seems, it was
	 * a trap that VJ failed to avoid. 8)
	 */
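	/* srtt_us is stored scaled by 8 and mdev_us/rttvar_us by 4, so the
	 * classic updates
	 *	srtt = 7/8 srtt + 1/8 m
	 *	mdev = 3/4 mdev + 1/4 |m - srtt|
	 * reduce to the shift-and-add arithmetic below, and
	 * rto = srtt + 4*mdev becomes (srtt_us >> 3) + rttvar_us.
	 */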
	if (srtt != 0) {
		m -= (srtt >> 3);	/* m is now error in rtt est */
		srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
		if (m < 0) {
			m = -m;		/* m is now abs(error) */
			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
			/* This is similar to one of Eifel findings.
			 * Eifel blocks mdev updates when rtt decreases.
			 * This solution is a bit different: we use finer gain
			 * for mdev in this case (alpha*beta).
			 * Like Eifel it also prevents growth of rto,
			 * but also it limits too fast rto decreases,
			 * happening in pure Eifel.
			 */
			if (m > 0)
				m >>= 3;
		} else {
			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
		}
		tp->mdev_us += m;		/* mdev = 3/4 mdev + 1/4 new */
		if (tp->mdev_us > tp->mdev_max_us) {
			tp->mdev_max_us = tp->mdev_us;
			if (tp->mdev_max_us > tp->rttvar_us)
				tp->rttvar_us = tp->mdev_max_us;
		}
		if (after(tp->snd_una, tp->rtt_seq)) {
			if (tp->mdev_max_us < tp->rttvar_us)
				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
			tp->rtt_seq = tp->snd_nxt;
			tp->mdev_max_us = tcp_rto_min_us(sk);

			tcp_bpf_rtt(sk);
		}
	} else {
		/* no previous measure. */
		srtt = m << 3;		/* take the measured time to be rtt */
		tp->mdev_us = m << 1;	/* make sure rto = 3*rtt */
		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
		tp->mdev_max_us = tp->rttvar_us;
		tp->rtt_seq = tp->snd_nxt;

		tcp_bpf_rtt(sk);
	}
	tp->srtt_us = max(1U, srtt);
}

static void tcp_update_pacing_rate(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u64 rate;

	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
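	/* The ratios below are percentages and srtt_us is scaled by 8,
	 * hence the (USEC_PER_SEC / 100) << 3 factor: after the division
	 * by srtt_us the result is ratio% of mss * cwnd / srtt in bytes/sec.
	 */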
	rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3);

	/* current rate is (cwnd * mss) / srtt
	 * In Slow Start [1], set sk_pacing_rate to 200 % the current rate.
	 * In Congestion Avoidance phase, set it to 120 % the current rate.
	 *
	 * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh)
	 *	 If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
	 *	 end of slow start and should slow down.
	 */
	if (tp->snd_cwnd < tp->snd_ssthresh / 2)
		rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
	else
		rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;

	rate *= max(tp->snd_cwnd, tp->packets_out);

	if (likely(tp->srtt_us))
		do_div(rate, tp->srtt_us);

	/* WRITE_ONCE() is needed because sch_fq fetches sk_pacing_rate
	 * without any lock. We want to make sure the compiler won't store
	 * intermediate values in this location.
	 */
	WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
					     sk->sk_max_pacing_rate));
}

/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 * routine referred to above.
 */
static void tcp_set_rto(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	/* Old crap is replaced with new one. 8)
	 *
	 * More seriously:
	 * 1. If rtt variance happened to be less than 50 msec, it is hallucination.
	 *    It cannot be less due to utterly erratic ACK generation made
	 *    at least by solaris and freebsd. "Erratic ACKs" have _nothing_
	 *    to do with delayed acks, because at cwnd>2 true delack timeout
	 *    is invisible. Actually, Linux-2.4 also generates erratic
	 *    ACKs in some circumstances.
	 */
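	/* __tcp_set_rto() computes the classic rto = srtt + 4*mdev, which
	 * in our scaled representation is (srtt_us >> 3) + rttvar_us.
	 */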
	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);

	/* 2. Fixups made earlier cannot be right.
	 *    If we do not estimate RTO correctly without them,
	 *    all the algo is pure shit and should be replaced
	 *    with a correct one. That is exactly what we pretend to do.
	 */

	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
	 * guarantees that rto is higher.
	 */
	tcp_bound_rto(sk);
}

__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);

	if (!cwnd)
		cwnd = TCP_INIT_CWND;
	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}

/* Take note that the peer is sending D-SACKs */
static void tcp_dsack_seen(struct tcp_sock *tp)
{
	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
	tp->rack.dsack_seen = 1;
	tp->dsack_dups++;
}

/* It's reordering when higher sequence was delivered (i.e. sacked) before
 * some lower never-retransmitted sequence ("low_seq"). The maximum reordering
 * distance is approximated in full-mss packet distance ("reordering").
 */
static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
				      const int ts)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const u32 mss = tp->mss_cache;
	u32 fack, metric;

	fack = tcp_highest_sack_seq(tp);
	if (!before(low_seq, fack))
		return;

	metric = fack - low_seq;
	if ((metric > tp->reordering * mss) && mss) {
#if FASTRETRANS_DEBUG > 1
		pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
			 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
			 tp->reordering,
			 0,
			 tp->sacked_out,
			 tp->undo_marker ? tp->undo_retrans : 0);
#endif
		tp->reordering = min_t(u32, (metric + mss - 1) / mss,
				       sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
	}

	/* This exciting event is worth remembering. 8) */
	tp->reord_seen++;
	NET_INC_STATS(sock_net(sk),
		      ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}

/* This must be called before lost_out is incremented */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
	if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
	    (tp->retransmit_skb_hint &&
	     before(TCP_SKB_CB(skb)->seq,
		    TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
		tp->retransmit_skb_hint = skb;
}

/* Sum the number of packets on the wire we have marked as lost.
 * There are two cases we care about here:
 * a) Packet hasn't been marked lost (nor retransmitted),
 *    and this is the first loss.
 * b) Packet has been marked both lost and retransmitted,
 *    and this means we think it was lost again.
 */
static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
	__u8 sacked = TCP_SKB_CB(skb)->sacked;

	if (!(sacked & TCPCB_LOST) ||
	    ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
		tp->lost += tcp_skb_pcount(skb);
}

static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
		tcp_verify_retransmit_hint(tp, skb);

		tp->lost_out += tcp_skb_pcount(skb);
		tcp_sum_lost(tp, skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
	}
}

void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
{
	tcp_verify_retransmit_hint(tp, skb);

	tcp_sum_lost(tp, skb);
	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
		tp->lost_out += tcp_skb_pcount(skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
	}
}

/* This procedure tags the retransmission queue when SACKs arrive.
 *
 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
 * Packets in queue with these bits set are counted in variables
 * sacked_out, retrans_out and lost_out, correspondingly.
 *
 * Valid combinations are:
 * Tag  InFlight	Description
 * 0	1		- orig segment is in flight.
 * S	0		- nothing flies, orig reached receiver.
 * L	0		- nothing flies, orig lost by net.
 * R	2		- both orig and retransmit are in flight.
 * L|R	1		- orig is lost, retransmit is in flight.
 * S|R  1		- orig reached receiver, retrans is still in flight.
 * (L|S|R is logically valid, it could occur when L|R is sacked,
 *  but it is equivalent to plain S and the code short-circuits it to S.
 *  L|S is logically invalid, it would mean -1 packet in flight 8))
 *
 * These 6 states form finite state machine, controlled by the following events:
 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
 * 3. Loss detection event of two flavors:
 *	A. Scoreboard estimator decided the packet is lost.
 *	   A'. Reno "three dupacks" marks head of queue lost.