/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *		Pedro Roque	:	Fast Retransmit/Recovery.
 *					Two receive queues.
 *					Retransmit queue handled by TCP.
 *					Better retransmit timer handling.
 *					New congestion avoidance.
 *					Header prediction.
 *					Variable renaming.
 *
 *		Eric		:	Fast Retransmit.
 *		Randy Scott	:	MSS option defines.
 *		Eric Schenk	:	Fixes to slow start algorithm.
 *		Eric Schenk	:	Yet another double ACK bug.
 *		Eric Schenk	:	Delayed ACK bug fixes.
 *		Eric Schenk	:	Floyd style fast retrans war avoidance.
 *		David S. Miller	:	Don't allow zero congestion window.
 *		Eric Schenk	:	Fix retransmitter so that it sends
 *					next packet on ack of previous packet.
 *		Andi Kleen	:	Moved open_request checking here
 *					and process RSTs for open_requests.
 *		Andi Kleen	:	Better prune_queue, and other fixes.
 *		Andrey Savochkin:	Fix RTT measurements in the presence of
 *					timestamps.
 *		Andrey Savochkin:	Check sequence numbers correctly when
 *					removing SACKs due to in sequence incoming
 *					data segments.
 *		Andi Kleen:		Make sure we never ack data there is not
 *					enough room for. Also make this condition
 *					a fatal error if it might still happen.
 *		Andi Kleen:		Add tcp_measure_rcv_mss to make
 *					connections with MSS<min(MTU,ann. MSS)
 *					work without delayed acks.
 *		Andi Kleen:		Process packets with PSH set in the
 *					fast path.
 *		J Hadi Salim:		ECN support
 *	 	Andrei Gurtov,
 *		Pasi Sarolahti,
 *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
 *					engine. Lots of bugs are found.
 *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
 */

#define pr_fmt(fmt) "TCP: " fmt

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <net/netdma.h>

int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
EXPORT_SYMBOL(sysctl_tcp_reordering);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);

/* rfc5961 challenge ack rate limiting */
int sysctl_tcp_challenge_ack_limit = 100;

int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;

int sysctl_tcp_thin_dupack __read_mostly;

int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_early_retrans __read_mostly = 3;

#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
#define FLAG_ECE		0x40 /* ECE in this ACK				*/
#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
#define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT	0x4000 /* tcp_replace_ts_recent() */

#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)
#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

/* Adapt the MSS value used to make delayed ack decision to the
 * real world.
 */
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const unsigned int lss = icsk->icsk_ack.last_seg_size;
	unsigned int len;

	icsk->icsk_ack.last_seg_size = 0;

	/* skb->len may jitter because of SACKs, even if peer
	 * sends good full-sized frames.
	 */
	len = skb_shinfo(skb)->gso_size ? : skb->len;
	if (len >= icsk->icsk_ack.rcv_mss) {
		icsk->icsk_ack.rcv_mss = len;
	} else {
		/* Otherwise, we make more careful check taking into account,
		 * that SACKs block is variable.
		 *
		 * "len" is invariant segment length, including TCP header.
		 */
		len += skb->data - skb_transport_header(skb);
		if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
		    /* If PSH is not set, packet should be
		     * full sized, provided peer TCP is not badly broken.
		     * This observation (if it is correct 8)) allows
		     * to handle super-low mtu links fairly.
		     */
		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
		     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
			/* Subtract also invariant (if peer is RFC compliant),
			 * tcp header plus fixed timestamp option length.
			 * Resulting "len" is MSS free of SACK jitter.
			 */
			len -= tcp_sk(sk)->tcp_header_len;
			icsk->icsk_ack.last_seg_size = len;
			if (len == lss) {
				icsk->icsk_ack.rcv_mss = len;
				return;
			}
		}
		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
	}
}

static void tcp_incr_quickack(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);

	if (quickacks == 0)
		quickacks = 2;
	if (quickacks > icsk->icsk_ack.quick)
		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}

static void tcp_enter_quickack_mode(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	tcp_incr_quickack(sk);
	icsk->icsk_ack.pingpong = 0;
	icsk->icsk_ack.ato = TCP_ATO_MIN;
}

/* Send ACKs quickly, if "quick" count is not exhausted
 * and the session is not interactive.
 */

static inline bool tcp_in_quickack_mode(const struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}

static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
{
	if (tp->ecn_flags & TCP_ECN_OK)
		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}

static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
{
	if (tcp_hdr(skb)->cwr)
		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
{
	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
{
	if (!(tp->ecn_flags & TCP_ECN_OK))
		return;

	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
	case INET_ECN_NOT_ECT:
		/* Funny extension: if ECT is not set on a segment,
		 * and we have already seen ECT on a previous segment,
		 * it is probably a retransmit.
		 */
		if (tp->ecn_flags & TCP_ECN_SEEN)
			tcp_enter_quickack_mode((struct sock *)tp);
		break;
	case INET_ECN_CE:
		if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
			/* Better not delay acks, sender can have a very low cwnd */
			tcp_enter_quickack_mode((struct sock *)tp);
			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
		}
		/* fall through */
	default:
		tp->ecn_flags |= TCP_ECN_SEEN;
	}
}

static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}

static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
{
	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
		tp->ecn_flags &= ~TCP_ECN_OK;
}
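
/* Note on the two helpers above: per RFC 3168 an ECN-setup SYN carries both
 * ECE and CWR, while an ECN-setup SYN-ACK carries ECE with CWR clear.
 * Anything else clears TCP_ECN_OK, i.e. the connection falls back to
 * non-ECN operation.
 */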

static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
{
	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
		return true;
	return false;
}

/* Buffer size and advertised window tuning.
 *
 * 1. Tuning sk->sk_sndbuf, when connection enters established state.
 */

static void tcp_sndbuf_expand(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	int sndmem, per_mss;
	u32 nr_segs;

	/* Worst case is non GSO/TSO : each frame consumes one skb
	 * and skb->head is kmalloced using power of two area of memory
	 */
	per_mss = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
		  MAX_TCP_HEADER +
		  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

	per_mss = roundup_pow_of_two(per_mss) +
		  SKB_DATA_ALIGN(sizeof(struct sk_buff));

	nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
	nr_segs = max_t(u32, nr_segs, tp->reordering + 1);

	/* Fast Recovery (RFC 5681 3.2) :
	 * Cubic needs 1.7 factor, rounded to 2 to include
	 * extra cushion (application might react slowly to POLLOUT)
	 */
	sndmem = 2 * nr_segs * per_mss;

	if (sk->sk_sndbuf < sndmem)
		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
}
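
/* Rough illustration (the exact values depend on the build configuration):
 * with a 1460-byte mss_clamp plus MAX_TCP_HEADER and the shared-info
 * overhead, roundup_pow_of_two() typically lands per_mss in the 4 KB area,
 * so the initial-cwnd case (nr_segs = 10) asks for sndmem of roughly 80 KB,
 * still capped by sysctl_tcp_wmem[2].
 */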

/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
 *
 * All tcp_full_space() is split to two parts: "network" buffer, allocated
 * forward and advertised in receiver window (tp->rcv_wnd) and
 * "application buffer", required to isolate scheduling/application
 * latencies from network.
 * window_clamp is maximal advertised window. It can be less than
 * tcp_full_space(), in this case tcp_full_space() - window_clamp
 * is reserved for "application" buffer. The less window_clamp is
 * the smoother our behaviour from viewpoint of network, but the lower
 * throughput and the higher sensitivity of the connection to losses. 8)
 *
 * rcv_ssthresh is more strict window_clamp used at "slow start"
 * phase to predict further behaviour of this connection.
 * It is used for two goals:
 * - to enforce header prediction at sender, even when application
 *   requires some significant "application buffer". It is check #1.
 * - to prevent pruning of receive queue because of misprediction
 *   of receiver window. Check #2.
 *
 * The scheme does not work when sender sends good segments opening
 * window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */

/* Slow part of check#2. */
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	/* Optimize this! */
	int truesize = tcp_win_from_space(skb->truesize) >> 1;
	int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;

	while (tp->rcv_ssthresh <= window) {
		if (truesize <= skb->len)
			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;

		truesize >>= 1;
		window >>= 1;
	}
	return 0;
}

static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Check #1 */
	if (tp->rcv_ssthresh < tp->window_clamp &&
	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
	    !sk_under_memory_pressure(sk)) {
		int incr;

		/* Check #2. Increase window, if skb with such overhead
		 * will fit to rcvbuf in future.
		 */
		if (tcp_win_from_space(skb->truesize) <= skb->len)
			incr = 2 * tp->advmss;
		else
			incr = __tcp_grow_window(sk, skb);

		if (incr) {
			incr = max_t(int, incr, 2 * skb->len);
			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
					       tp->window_clamp);
			inet_csk(sk)->icsk_ack.quick |= 1;
		}
	}
}

/* 3. Tuning rcvbuf, when connection enters established state. */
static void tcp_fixup_rcvbuf(struct sock *sk)
{
	u32 mss = tcp_sk(sk)->advmss;
	int rcvmem;

	rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
		 tcp_default_init_rwnd(mss);

	/* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
	 * Allow enough cushion so that sender is not limited by our window
	 */
	if (sysctl_tcp_moderate_rcvbuf)
		rcvmem <<= 2;

	if (sk->sk_rcvbuf < rcvmem)
		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
}

/* 4. Try to fixup all. It is made immediately after connection enters
 *    established state.
 */
void tcp_init_buffer_space(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int maxwin;

	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
		tcp_fixup_rcvbuf(sk);
	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
		tcp_sndbuf_expand(sk);

	tp->rcvq_space.space = tp->rcv_wnd;
	tp->rcvq_space.time = tcp_time_stamp;
	tp->rcvq_space.seq = tp->copied_seq;

	maxwin = tcp_full_space(sk);

	if (tp->window_clamp >= maxwin) {
		tp->window_clamp = maxwin;

		if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
			tp->window_clamp = max(maxwin -
					       (maxwin >> sysctl_tcp_app_win),
					       4 * tp->advmss);
	}

	/* Force reservation of one segment. */
	if (sysctl_tcp_app_win &&
	    tp->window_clamp > 2 * tp->advmss &&
	    tp->window_clamp + tp->advmss > maxwin)
		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);

	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
	tp->snd_cwnd_stamp = tcp_time_stamp;
}

/* 5. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_ack.quick = 0;

	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
	    !sk_under_memory_pressure(sk) &&
	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
				    sysctl_tcp_rmem[2]);
	}
	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}

/* Initialize RCV_MSS value.
 * RCV_MSS is our guess about the MSS used by the peer.
 * We have no direct information about the MSS.
 * It's better to underestimate the RCV_MSS rather than overestimate.
 * Overestimations make us ACKing less frequently than needed.
 * Underestimations are easier to detect and fix by tcp_measure_rcv_mss().
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);

	hint = min(hint, tp->rcv_wnd / 2);
	hint = min(hint, TCP_MSS_DEFAULT);
	hint = max(hint, TCP_MIN_MSS);

	inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
EXPORT_SYMBOL(tcp_initialize_rcv_mss);

/* Receiver "autotuning" code.
 *
 * The algorithm for RTT estimation w/o timestamps is based on
 * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
 * <http://public.lanl.gov/radiant/pubs.html#DRS>
 *
 * More detail on this code can be found at
 * <http://staff.psc.edu/jheffner/>,
 * though this reference is out of date.  A new paper
 * is pending.
 */
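/* Sketch of the update below: tp->rcv_rtt_est.rtt is kept scaled by 8.
 * For the timestamp-less (win_dep) path only the minimum over the window
 * is tracked, i.e. rtt8 = min(rtt8, 8 * sample); otherwise it is an EWMA
 * with gain 1/8, rtt8 += sample - (rtt8 >> 3), which is
 * rtt = 7/8 * rtt + 1/8 * sample in unscaled terms.
 */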
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
	u32 new_sample = tp->rcv_rtt_est.rtt;
	long m = sample;

	if (m == 0)
		m = 1;

	if (new_sample != 0) {
		/* If we sample in larger samples in the non-timestamp
		 * case, we could grossly overestimate the RTT especially
		 * with chatty applications or bulk transfer apps which
		 * are stalled on filesystem I/O.
		 *
		 * Also, since we are only going for a minimum in the
		 * non-timestamp case, we do not smooth things out;
		 * otherwise, with timestamps disabled, convergence
		 * takes too long.
		 */
		if (!win_dep) {
			m -= (new_sample >> 3);
			new_sample += m;
		} else {
			m <<= 3;
			if (m < new_sample)
				new_sample = m;
		}
	} else {
		/* No previous measure. */
		new_sample = m << 3;
	}

	if (tp->rcv_rtt_est.rtt != new_sample)
		tp->rcv_rtt_est.rtt = new_sample;
}

static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
{
	if (tp->rcv_rtt_est.time == 0)
		goto new_measure;
	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
		return;
	tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);

new_measure:
	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
	tp->rcv_rtt_est.time = tcp_time_stamp;
}

static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
					  const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	if (tp->rx_opt.rcv_tsecr &&
	    (TCP_SKB_CB(skb)->end_seq -
	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
		tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
}

/*
 * This function should be called every time data is copied to user space.
 * It calculates the appropriate TCP receive buffer space.
 */
void tcp_rcv_space_adjust(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int time;
	int copied;

	time = tcp_time_stamp - tp->rcvq_space.time;
	if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
		return;

	/* Number of bytes copied to user in last RTT */
	copied = tp->copied_seq - tp->rcvq_space.seq;
	if (copied <= tp->rcvq_space.space)
		goto new_measure;

	/* A bit of theory :
	 * copied = bytes received in previous RTT, our base window
	 * To cope with packet losses, we need a 2x factor
	 * To cope with slow start, and sender growing its cwin by 100 %
	 * every RTT, we need a 4x factor, because the ACK we are sending
	 * now is for the next RTT, not the current one :
	 * <prev RTT . ><current RTT .. ><next RTT .... >
	 */
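
	/* Illustrative numbers: with advmss = 1448 and copied = 100 kB in the
	 * last RTT, the baseline below is rcvwin = 2 * copied + 16 * advmss
	 * ~= 223 kB; if the rate grew by 50% or more it is doubled, and the
	 * resulting rcvbuf is scaled by the per-segment truesize overhead
	 * and capped by sysctl_tcp_rmem[2].
	 */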

	if (sysctl_tcp_moderate_rcvbuf &&
	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
		int rcvwin, rcvmem, rcvbuf;

		/* minimal window to cope with packet losses, assuming
		 * steady state. Add some cushion because of small variations.
		 */
		rcvwin = (copied << 1) + 16 * tp->advmss;

		/* If rate increased by 25%,
		 *	assume slow start, rcvwin = 3 * copied
		 * If rate increased by 50%,
		 *	assume sender can use 2x growth, rcvwin = 4 * copied
		 */
		if (copied >=
		    tp->rcvq_space.space + (tp->rcvq_space.space >> 2)) {
			if (copied >=
			    tp->rcvq_space.space + (tp->rcvq_space.space >> 1))
				rcvwin <<= 1;
			else
				rcvwin += (rcvwin >> 1);
		}

		rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
		while (tcp_win_from_space(rcvmem) < tp->advmss)
			rcvmem += 128;

		rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
		if (rcvbuf > sk->sk_rcvbuf) {
			sk->sk_rcvbuf = rcvbuf;

			/* Make the window clamp follow along.  */
			tp->window_clamp = rcvwin;
		}
	}
	tp->rcvq_space.space = copied;

new_measure:
	tp->rcvq_space.seq = tp->copied_seq;
	tp->rcvq_space.time = tcp_time_stamp;
}

/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  This means that until we send the first few ACK's the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now;

	inet_csk_schedule_ack(sk);

	tcp_measure_rcv_mss(sk, skb);

	tcp_rcv_rtt_measure(tp);

	now = tcp_time_stamp;

	if (!icsk->icsk_ack.ato) {
		/* The _first_ data packet received, initialize
		 * delayed ACK engine.
		 */
		tcp_incr_quickack(sk);
		icsk->icsk_ack.ato = TCP_ATO_MIN;
	} else {
		int m = now - icsk->icsk_ack.lrcvtime;

		if (m <= TCP_ATO_MIN / 2) {
			/* The fastest case is the first. */
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
		} else if (m < icsk->icsk_ack.ato) {
			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
			if (icsk->icsk_ack.ato > icsk->icsk_rto)
				icsk->icsk_ack.ato = icsk->icsk_rto;
		} else if (m > icsk->icsk_rto) {
			/* Too long gap. Apparently sender failed to
			 * restart window, so that we send ACKs quickly.
			 */
			tcp_incr_quickack(sk);
			sk_mem_reclaim(sk);
		}
	}
	icsk->icsk_ack.lrcvtime = now;

	TCP_ECN_check_ce(tp, skb);

	if (skb->len >= 128)
		tcp_grow_window(sk, skb);
}

/* Called to compute a smoothed rtt estimate. The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
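/* In the variables below, tp->srtt_us holds 8 * SRTT and tp->mdev_us holds
 * 4 * mdev (both in usec), giving the classic updates
 *	SRTT <- 7/8 * SRTT + 1/8 * sample
 *	mdev <- 3/4 * mdev + 1/4 * |sample - SRTT|
 * so that the RTO derived later is roughly SRTT + 4 * mdev.
 */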
static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
{
	struct tcp_sock *tp = tcp_sk(sk);
	long m = mrtt_us; /* RTT */
	u32 srtt = tp->srtt_us;

	/*	The following amusing code comes from Jacobson's
	 *	article in SIGCOMM '88.  Note that rtt and mdev
	 *	are scaled versions of rtt and mean deviation.
	 *	This is designed to be as fast as possible
	 *	m stands for "measurement".
	 *
	 *	On a 1990 paper the rto value is changed to:
	 *	RTO = rtt + 4 * mdev
	 *
	 * Funny. This algorithm seems to be very broken.
	 * These formulae increase RTO, when it should be decreased, increase
	 * too slowly, when it should be increased quickly, decrease too quickly
	 * etc. I guess in BSD RTO takes ONE value, so it absolutely
	 * does not matter how to _calculate_ it. Seems, it was a trap
	 * that VJ failed to avoid. 8)
	 */
	if (srtt != 0) {
		m -= (srtt >> 3);	/* m is now error in rtt est */
		srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
		if (m < 0) {
			m = -m;		/* m is now abs(error) */
			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
			/* This is similar to one of Eifel findings.
			 * Eifel blocks mdev updates when rtt decreases.
			 * This solution is a bit different: we use finer gain
			 * for mdev in this case (alpha*beta).
			 * Like Eifel it also prevents growth of rto,
			 * but also it limits too fast rto decreases,
			 * happening in pure Eifel.
			 */
			if (m > 0)
				m >>= 3;
		} else {
			m -= (tp->mdev_us >> 2);   /* similar update on mdev */
		}
		tp->mdev_us += m;		/* mdev = 3/4 mdev + 1/4 new */
		if (tp->mdev_us > tp->mdev_max_us) {
			tp->mdev_max_us = tp->mdev_us;
			if (tp->mdev_max_us > tp->rttvar_us)
				tp->rttvar_us = tp->mdev_max_us;
		}
		if (after(tp->snd_una, tp->rtt_seq)) {
			if (tp->mdev_max_us < tp->rttvar_us)
				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
			tp->rtt_seq = tp->snd_nxt;
			tp->mdev_max_us = tcp_rto_min_us(sk);
		}
	} else {
		/* no previous measure. */
		srtt = m << 3;		/* take the measured time to be rtt */
		tp->mdev_us = m << 1;	/* make sure rto = 3*rtt */
		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
		tp->mdev_max_us = tp->rttvar_us;
		tp->rtt_seq = tp->snd_nxt;
	}
	tp->srtt_us = max(1U, srtt);
}

/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
 * Note: TCP stack does not yet implement pacing.
 * FQ packet scheduler can be used to implement cheap but effective
 * TCP pacing, to smooth the burst on large writes when packets
 * in flight is significantly lower than cwnd (or rwin)
 */
static void tcp_update_pacing_rate(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	u64 rate;

	/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
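	/* For instance, with illustrative values mss_cache = 1448,
	 * snd_cwnd = 10 and srtt = 20 ms, this works out to
	 * 2 * 1448 * 10 / 0.020 s ~= 1.45 MB/s.
	 */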
	rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);

	rate *= max(tp->snd_cwnd, tp->packets_out);

	if (likely(tp->srtt_us))
		do_div(rate, tp->srtt_us);

	/* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
	 * without any lock. We want to make sure compiler won't store
	 * intermediate values in this location.
	 */
	ACCESS_ONCE(sk->sk_pacing_rate) = min_t(u64, rate,
						sk->sk_max_pacing_rate);
}

/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 * routine referred to above.
 */
static void tcp_set_rto(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	/* Old crap is replaced with new one. 8)
	 *
	 * More seriously:
	 * 1. If rtt variance happened to be less than 50msec, it is hallucination.
	 *    It cannot be less due to utterly erratic ACK generation made
	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
	 *    to do with delayed acks, because at cwnd>2 true delack timeout
	 *    is invisible. Actually, Linux-2.4 also generates erratic
	 *    ACKs in some circumstances.
	 */
	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);

	/* 2. Fixups made earlier cannot be right.
	 *    If we do not estimate RTO correctly without them,
	 *    all the algo is pure shit and should be replaced
	 *    with a correct one, which is exactly what we pretend to do.
	 */

	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
	 * guarantees that rto is higher.
	 */
	tcp_bound_rto(sk);
}

__u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
{
	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);

	if (!cwnd)
		cwnd = TCP_INIT_CWND;
	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}

/*
 * Packet counting of FACK is based on in-order assumptions, therefore TCP
 * disables it when reordering is detected
 */
void tcp_disable_fack(struct tcp_sock *tp)
{
	/* RFC3517 uses different metric in lost marker => reset on change */
	if (tcp_is_fack(tp))
		tp->lost_skb_hint = NULL;
	tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
}

/* Take note that the peer is sending D-SACKs */
static void tcp_dsack_seen(struct tcp_sock *tp)
{
	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
}

static void tcp_update_reordering(struct sock *sk, const int metric,
				  const int ts)
{
	struct tcp_sock *tp = tcp_sk(sk);
	if (metric > tp->reordering) {
		int mib_idx;

		tp->reordering = min(TCP_MAX_REORDERING, metric);

		/* This exciting event is worth remembering. 8) */
		if (ts)
			mib_idx = LINUX_MIB_TCPTSREORDER;
		else if (tcp_is_reno(tp))
			mib_idx = LINUX_MIB_TCPRENOREORDER;
		else if (tcp_is_fack(tp))
			mib_idx = LINUX_MIB_TCPFACKREORDER;
		else
			mib_idx = LINUX_MIB_TCPSACKREORDER;

		NET_INC_STATS_BH(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
		pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
			 tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
			 tp->reordering,
			 tp->fackets_out,
			 tp->sacked_out,
			 tp->undo_marker ? tp->undo_retrans : 0);
#endif
		tcp_disable_fack(tp);
	}

	if (metric > 0)
		tcp_disable_early_retrans(tp);
}

/* This must be called before lost_out is incremented */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
	if ((tp->retransmit_skb_hint == NULL) ||
	    before(TCP_SKB_CB(skb)->seq,
		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
		tp->retransmit_skb_hint = skb;

	if (!tp->lost_out ||
	    after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
		tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
}

static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
{
	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
		tcp_verify_retransmit_hint(tp, skb);

		tp->lost_out += tcp_skb_pcount(skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
	}
}

static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
					    struct sk_buff *skb)
{
	tcp_verify_retransmit_hint(tp, skb);

	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
		tp->lost_out += tcp_skb_pcount(skb);
		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
	}
}

/* This procedure tags the retransmission queue when SACKs arrive.
 *
 * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
 * Packets in queue with these bits set are counted in variables
 * sacked_out, retrans_out and lost_out, correspondingly.
 *
 * Valid combinations are:
 * Tag  InFlight	Description
 * 0	1		- orig segment is in flight.
 * S	0		- nothing flies, orig reached receiver.
 * L	0		- nothing flies, orig lost by net.
 * R	2		- both orig and retransmit are in flight.
 * L|R	1		- orig is lost, retransmit is in flight.
 * S|R  1		- orig reached receiver, retrans is still in flight.
 * (L|S|R is logically valid, it could occur when L|R is sacked,
 *  but it is equivalent to plain S and code short-circuits it to S.
 *  L|S is logically invalid, it would mean -1 packet in flight 8))
 *
 * These 6 states form finite state machine, controlled by the following events:
 * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
 * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
 * 3. Loss detection event of two flavors:
 *	A. Scoreboard estimator decided the packet is lost.
 *	   A'. Reno "three dupacks" marks head of queue lost.
 *	   A''. Its FACK modification, head until snd.fack is lost.
 *	B. SACK arrives sacking SND.NXT at the moment, when the
 *	   segment was retransmitted.
 * 4. D-SACK added new rule: D-SACK changes any tag to S.
 *
 * It is pleasant to note, that state diagram turns out to be commutative,
 * so that we are allowed not to be bothered by order of our actions,
 * when multiple events arrive simultaneously. (see the function below).
 *
 * Reordering detection.
 * --------------------
 * Reordering metric is maximal distance, which a packet can be displaced
 * in packet stream. With SACKs we can estimate it:
 *
 * 1. SACK fills old hole and the corresponding segment was not
 *    ever retransmitted -> reordering. Alas, we cannot use it
 *    when segment was retransmitted.
 * 2. The last flaw is solved with D-SACK. D-SACK arrives
 *    for retransmitted and already SACKed segment -> reordering..
 * Both of these heuristics are not used in Loss state, when we cannot
 * account for retransmits accurately.
 *
 * SACK block validation.
 * ----------------------
 *
 * SACK block range validation checks that the received SACK block fits to
 * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
 * Note that SND.UNA is not included to the range though being valid because
 * it means that the receiver is rather inconsistent with itself reporting
 * SACK reneging when it should advance SND.UNA. Such a SACK block is
 * perfectly valid, however, in light of RFC2018 which explicitly states
 * that "SACK block MUST reflect the newest segment.  Even if the newest
 * segment is going to be discarded ...", not that it looks very clever
 * in case of head skb. Due to potential receiver driven attacks, we
 * choose to avoid immediate execution of a walk in write queue due to
 * reneging and defer head skb's loss recovery to standard loss recovery
 * procedure that will eventually trigger (nothing forbids us doing this).
 *
 * Implements also blockage to start_seq wrap-around. Problem lies in the
 * fact that though start_seq (s) is before end_seq (i.e., not reversed),
 * there's no guarantee that it will be before snd_nxt (n). The problem
 * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
 * wrap (s_w):
 *
 *         <- outs wnd ->                          <- wrapzone ->
 *         u     e      n                         u_w   e_w  s n_w
 *         |     |      |                          |     |   |  |
 * |<------------+------+----- TCP seqno space --------------+---------->|
 * ...-- <2^31 ->|                                           |<--------...
 * ...---- >2^31 ------>|                                    |<--------...
 *
 * Current code wouldn't be vulnerable but it's better still to discard such
 * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
 * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
 * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
 * equal to the ideal case (infinite seqno space without wrap caused issues).
 *
 * With D-SACK the lower bound is extended to cover sequence space below
 * SND.UNA down to undo_marker, which is the last point of interest. Yet
 * again, D-SACK block must not go across snd_una (for the same reason as
 * for the normal SACK blocks, explained above). But there all simplicity
 * ends, TCP might receive valid D-SACKs below that. As long as they reside
 * fully below undo_marker they do not affect behavior in any way and can
 * therefore be safely ignored. In rare cases (which are more or less
 * theoretical ones), the D-SACK will nicely cross that boundary due to skb
 * fragmentation and packet reordering past skb's retransmission. To consider
 * them correctly, the acceptable range must be extended even more though
 * the exact amount is rather hard to quantify. However, tp->max_window can
 * be used as an exaggerated estimate.
 */
static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
				   u32 start_seq, u32 end_seq)
{
	/* Too far in future, or reversed (interpretation is ambiguous) */
	if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
		return false;

	/* Nasty start_seq wrap-around check (see comments above) */
	if (!before(start_seq, tp->snd_nxt))
		return false;

	/* In outstanding window? ...This is valid exit for D-SACKs too.
	 * start_seq == snd_una is non-sensical (see comments above)
	 */
	if (after(start_seq, tp->snd_una))
		return true;

	if (!is_dsack || !tp->undo_marker)
		return false;

	/* ...Then it's D-SACK, and must reside below snd_una completely */
	if (after(end_seq, tp->snd_una))
		return false;

	if (!before(start_seq, tp->undo_marker))
		return true;

	/* Too old */
	if (!after(end_seq, tp->undo_marker))
		return false;

	/* Undo_marker boundary crossing (overestimates a lot). Known already:
	 *   start_seq < undo_marker and end_seq >= undo_marker.
	 */
	return !before(start_seq, end_seq - tp->max_window);
}
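
/* Worked example (hypothetical sequence numbers): with snd_una = 1000 and
 * snd_nxt = 5000, a SACK block [2000,3000) passes all checks, [6000,7000)
 * is rejected as being beyond SND.NXT, and a block entirely below SND.UNA
 * is only considered when it is a D-SACK and then validated against
 * undo_marker (and tp->max_window in the boundary-crossing case).
 */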

/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
 * Event "B". Later note: FACK people cheated me again 8), we have to account
 * for reordering! Ugly, but should help.
 *
 * Search retransmitted skbs from write_queue that were sent when snd_nxt was
 * less than what is now known to be received by the other end (derived from
 * highest SACK block). Also calculate the lowest snd_nxt among the remaining
 * retransmitted skbs to avoid some costly processing per ACKs.
 */
static void tcp_mark_lost_retrans(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int cnt = 0;
	u32 new_low_seq = tp->snd_nxt;
	u32 received_upto = tcp_highest_sack_seq(tp);

	if (!tcp_is_fack(tp) || !tp->retrans_out ||
	    !after(received_upto, tp->lost_retrans_low) ||
	    icsk->icsk_ca_state != TCP_CA_Recovery)
		return;

	tcp_for_write_queue(skb, sk) {
		u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;

		if (skb == tcp_send_head(sk))
			break;
		if (cnt == tp->retrans_out)
			break;
		if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
			continue;

		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
			continue;

		/* TODO: We would like to get rid of tcp_is_fack(tp) only
		 * constraint here (see above) but figuring out that at
		 * least tp->reordering SACK blocks reside between ack_seq
		 * and received_upto is not an easy task to do cheaply with
		 * the available datastructures.
		 *
		 * Whether FACK should check here for tp->reordering segs
		 * in-between one could argue for either way (it would be
		 * rather simple to implement as we could count fack_count
		 * during the walk and do tp->fackets_out - fack_count).
		 */
		if (after(received_upto, ack_seq)) {
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
			tp->retrans_out -= tcp_skb_pcount(skb);

			tcp_skb_mark_lost_uncond_verify(tp, skb);
			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
		} else {
			if (before(ack_seq, new_low_seq))
				new_low_seq = ack_seq;
			cnt += tcp_skb_pcount(skb);
		}
	}

	if (tp->retrans_out)
		tp->lost_retrans_low = new_low_seq;
}

static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
			    struct tcp_sack_block_wire *sp, int num_sacks,
			    u32 prior_snd_una)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
	u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
	bool dup_sack = false;

	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
		dup_sack = true;
		tcp_dsack_seen(tp);
		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
	} else if (num_sacks > 1) {
		u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
		u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);

		if (!after(end_seq_0, end_seq_1) &&
		    !before(start_seq_0, start_seq_1)) {
			dup_sack = true;
			tcp_dsack_seen(tp);