svcsock.c 43.3 KB
Newer Older
Linus Torvalds's avatar
Linus Torvalds committed
1 2 3 4 5 6 7
/*
 * linux/net/sunrpc/svcsock.c
 *
 * These are the RPC server socket internals.
 *
 * The server scheduling algorithm does not always distribute the load
 * evenly when servicing a single client. May need to modify the
 * svc_xprt_enqueue procedure...
 *
 * TCP support is largely untested and may be a little slow. The problem
 * is that we currently do two separate recvfrom's, one for the 4-byte
 * record length, and the second for the actual record. This could possibly
 * be improved by always reading a minimum size of around 100 bytes and
 * tucking any superfluous bytes away in a temporary store. Still, that
 * leaves write requests out in the rain. An alternative may be to peek at
 * the first skb in the queue, and if it matches the next TCP sequence
 * number, to extract the record marker. Yuck.
 *
 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
 */

22
#include <linux/kernel.h>
Linus Torvalds's avatar
Linus Torvalds committed
23
#include <linux/sched.h>
24
#include <linux/module.h>
Linus Torvalds's avatar
Linus Torvalds committed
25 26 27 28 29 30
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/udp.h>
31
#include <linux/tcp.h>
Linus Torvalds's avatar
Linus Torvalds committed
32 33 34 35
#include <linux/unistd.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
36
#include <linux/file.h>
37
#include <linux/freezer.h>
Linus Torvalds's avatar
Linus Torvalds committed
38 39 40
#include <net/sock.h>
#include <net/checksum.h>
#include <net/ip.h>
41
#include <net/ipv6.h>
42
#include <net/tcp.h>
43
#include <net/tcp_states.h>
Linus Torvalds's avatar
Linus Torvalds committed
44 45
#include <asm/uaccess.h>
#include <asm/ioctls.h>
46
#include <trace/events/skb.h>
Linus Torvalds's avatar
Linus Torvalds committed
47 48

#include <linux/sunrpc/types.h>
49
#include <linux/sunrpc/clnt.h>
Linus Torvalds's avatar
Linus Torvalds committed
50
#include <linux/sunrpc/xdr.h>
51
#include <linux/sunrpc/msg_prot.h>
Linus Torvalds's avatar
Linus Torvalds committed
52 53
#include <linux/sunrpc/svcsock.h>
#include <linux/sunrpc/stats.h>
54
#include <linux/sunrpc/xprt.h>
Linus Torvalds's avatar
Linus Torvalds committed
55

56 57
#include "sunrpc.h"

58
#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
Linus Torvalds's avatar
Linus Torvalds committed
59 60 61


/* Forward declarations of file-local functions referenced before their definitions. */
static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
					 int flags);
static void		svc_udp_data_ready(struct sock *);
static int		svc_udp_recvfrom(struct svc_rqst *);
static int		svc_udp_sendto(struct svc_rqst *);
static void		svc_sock_detach(struct svc_xprt *);
static void		svc_tcp_sock_detach(struct svc_xprt *);
static void		svc_sock_free(struct svc_xprt *);

static struct svc_xprt *svc_create_socket(struct svc_serv *, int,
					  struct net *, struct sockaddr *,
					  int, int);
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int,
					     struct net *, struct sockaddr *,
					     int, int);
static void svc_bc_sock_free(struct svc_xprt *xprt);
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
79

80 81 82 83
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* One lock class key per address family: index 0 for AF_INET, 1 for AF_INET6. */
static struct lock_class_key svc_key[2];
static struct lock_class_key svc_slock_key[2];

/*
 * Re-initialise the lock classes of @sock's struct sock with NFSD-specific
 * names, chosen by the socket's address family.
 *
 * Returns without doing anything (after a one-time warning) when
 * sock_allow_reclassification() rejects the socket.  Any family other
 * than AF_INET or AF_INET6 hits BUG().
 */
static void svc_reclassify_socket(struct socket *sock)
{
	struct sock *sk = sock->sk;

	if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
		return;

	switch (sk->sk_family) {
	case AF_INET:
		sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD",
					      &svc_slock_key[0],
					      "sk_xprt.xpt_lock-AF_INET-NFSD",
					      &svc_key[0]);
		break;

	case AF_INET6:
		sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD",
					      &svc_slock_key[1],
					      "sk_xprt.xpt_lock-AF_INET6-NFSD",
					      &svc_key[1]);
		break;

	default:
		BUG();
	}
}
#else
/* Without CONFIG_DEBUG_LOCK_ALLOC there is nothing to reclassify. */
static void svc_reclassify_socket(struct socket *sock)
{
}
#endif

Linus Torvalds's avatar
Linus Torvalds committed
116 117 118
/*
 * Release an skbuff after use
 */
119
static void svc_release_skb(struct svc_rqst *rqstp)
Linus Torvalds's avatar
Linus Torvalds committed
120
{
121
	struct sk_buff *skb = rqstp->rq_xprt_ctxt;
Linus Torvalds's avatar
Linus Torvalds committed
122 123

	if (skb) {
124 125
		struct svc_sock *svsk =
			container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
126
		rqstp->rq_xprt_ctxt = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
127 128

		dprintk("svc: service %p, releasing skb %p\n", rqstp, skb);
129
		skb_free_datagram_locked(svsk->sk_sk, skb);
Linus Torvalds's avatar
Linus Torvalds committed
130 131 132
	}
}

133 134 135 136
/*
 * Storage big enough for either an IPv4 or an IPv6 pktinfo payload.
 * SVC_PKTINFO_SPACE is the control-message space needed to carry one.
 */
union svc_pktinfo_u {
	struct in_pktinfo pkti;
	struct in6_pktinfo pkti6;
};
#define SVC_PKTINFO_SPACE \
	CMSG_SPACE(sizeof(union svc_pktinfo_u))
139 140 141

static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
{
142 143 144
	struct svc_sock *svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
	switch (svsk->sk_sk->sk_family) {
145 146 147 148 149 150
	case AF_INET: {
			struct in_pktinfo *pki = CMSG_DATA(cmh);

			cmh->cmsg_level = SOL_IP;
			cmh->cmsg_type = IP_PKTINFO;
			pki->ipi_ifindex = 0;
151 152
			pki->ipi_spec_dst.s_addr =
				 svc_daddr_in(rqstp)->sin_addr.s_addr;
153 154 155
			cmh->cmsg_len = CMSG_LEN(sizeof(*pki));
		}
		break;
156

157 158
	case AF_INET6: {
			struct in6_pktinfo *pki = CMSG_DATA(cmh);
159
			struct sockaddr_in6 *daddr = svc_daddr_in6(rqstp);
160 161 162

			cmh->cmsg_level = SOL_IPV6;
			cmh->cmsg_type = IPV6_PKTINFO;
163
			pki->ipi6_ifindex = daddr->sin6_scope_id;
Alexey Dobriyan's avatar
Alexey Dobriyan committed
164
			pki->ipi6_addr = daddr->sin6_addr;
165 166 167 168 169 170
			cmh->cmsg_len = CMSG_LEN(sizeof(*pki));
		}
		break;
	}
}

Linus Torvalds's avatar
Linus Torvalds committed
171
/*
172
 * send routine intended to be shared by the fore- and back-channel
Linus Torvalds's avatar
Linus Torvalds committed
173
 */
174 175 176
int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
		    struct page *headpage, unsigned long headoffset,
		    struct page *tailpage, unsigned long tailoffset)
Linus Torvalds's avatar
Linus Torvalds committed
177 178 179 180 181 182
{
	int		result;
	int		size;
	struct page	**ppage = xdr->pages;
	size_t		base = xdr->page_base;
	unsigned int	pglen = xdr->page_len;
183
	unsigned int	flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
184 185
	int		slen;
	int		len = 0;
Linus Torvalds's avatar
Linus Torvalds committed
186 187 188 189 190 191

	slen = xdr->len;

	/* send head */
	if (slen == xdr->head[0].iov_len)
		flags = 0;
192
	len = kernel_sendpage(sock, headpage, headoffset,
193
				  xdr->head[0].iov_len, flags);
Linus Torvalds's avatar
Linus Torvalds committed
194 195 196 197 198 199 200 201 202 203 204
	if (len != xdr->head[0].iov_len)
		goto out;
	slen -= xdr->head[0].iov_len;
	if (slen == 0)
		goto out;

	/* send page data */
	size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
	while (pglen > 0) {
		if (slen == size)
			flags = 0;
205
		result = kernel_sendpage(sock, *ppage, base, size, flags);
Linus Torvalds's avatar
Linus Torvalds committed
206 207 208 209 210 211 212 213 214 215
		if (result > 0)
			len += result;
		if (result != size)
			goto out;
		slen -= size;
		pglen -= size;
		size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
		base = 0;
		ppage++;
	}
216

Linus Torvalds's avatar
Linus Torvalds committed
217 218
	/* send tail */
	if (xdr->tail[0].iov_len) {
219 220
		result = kernel_sendpage(sock, tailpage, tailoffset,
				   xdr->tail[0].iov_len, 0);
Linus Torvalds's avatar
Linus Torvalds committed
221 222 223
		if (result > 0)
			len += result;
	}
224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258

out:
	return len;
}


/*
 * Generic sendto routine
 */
static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
{
	struct svc_sock	*svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
	struct socket	*sock = svsk->sk_sock;
	union {
		struct cmsghdr	hdr;
		long		all[SVC_PKTINFO_SPACE / sizeof(long)];
	} buffer;
	struct cmsghdr *cmh = &buffer.hdr;
	int		len = 0;
	unsigned long tailoff;
	unsigned long headoff;
	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);

	if (rqstp->rq_prot == IPPROTO_UDP) {
		struct msghdr msg = {
			.msg_name	= &rqstp->rq_addr,
			.msg_namelen	= rqstp->rq_addrlen,
			.msg_control	= cmh,
			.msg_controllen	= sizeof(buffer),
			.msg_flags	= MSG_MORE,
		};

		svc_set_cmsg_data(rqstp, cmh);

259
		if (sock_sendmsg(sock, &msg) < 0)
260 261 262 263 264 265 266 267
			goto out;
	}

	tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1);
	headoff = 0;
	len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff,
			       rqstp->rq_respages[0], tailoff);

Linus Torvalds's avatar
Linus Torvalds committed
268
out:
269
	dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %s)\n",
270
		svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
271
		xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
Linus Torvalds's avatar
Linus Torvalds committed
272 273 274 275

	return len;
}

276 277 278
/*
 * Report socket names for nfsdfs
 */
279
static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining)
280
{
Chuck Lever's avatar
Chuck Lever committed
281 282 283
	const struct sock *sk = svsk->sk_sk;
	const char *proto_name = sk->sk_protocol == IPPROTO_UDP ?
							"udp" : "tcp";
284 285
	int len;

Chuck Lever's avatar
Chuck Lever committed
286
	switch (sk->sk_family) {
287 288
	case PF_INET:
		len = snprintf(buf, remaining, "ipv4 %s %pI4 %d\n",
Chuck Lever's avatar
Chuck Lever committed
289
				proto_name,
290 291
				&inet_sk(sk)->inet_rcv_saddr,
				inet_sk(sk)->inet_num);
292
		break;
293
#if IS_ENABLED(CONFIG_IPV6)
294 295
	case PF_INET6:
		len = snprintf(buf, remaining, "ipv6 %s %pI6 %d\n",
Chuck Lever's avatar
Chuck Lever committed
296
				proto_name,
297
				&sk->sk_v6_rcv_saddr,
298
				inet_sk(sk)->inet_num);
299
		break;
300
#endif
301
	default:
302
		len = snprintf(buf, remaining, "*unknown-%d*\n",
Chuck Lever's avatar
Chuck Lever committed
303
				sk->sk_family);
304
	}
305 306 307 308

	if (len >= remaining) {
		*buf = '\0';
		return -ENAMETOOLONG;
309 310 311 312
	}
	return len;
}

Linus Torvalds's avatar
Linus Torvalds committed
313 314 315
/*
 * Generic recvfrom routine.
 */
316 317
static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr,
			int buflen)
Linus Torvalds's avatar
Linus Torvalds committed
318
{
319 320
	struct svc_sock *svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
321 322 323 324
	struct msghdr msg = {
		.msg_flags	= MSG_DONTWAIT,
	};
	int len;
Linus Torvalds's avatar
Linus Torvalds committed
325

326 327
	rqstp->rq_xprt_hlen = 0;

328
	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
329 330
	len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen,
				msg.msg_flags);
331 332 333 334 335
	/* If we read a full record, then assume there may be more
	 * data to read (stream based sockets only!)
	 */
	if (len == buflen)
		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
Linus Torvalds's avatar
Linus Torvalds committed
336 337

	dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n",
338
		svsk, iov[0].iov_base, iov[0].iov_len, len);
Linus Torvalds's avatar
Linus Torvalds committed
339 340 341
	return len;
}

342 343 344 345 346
static int svc_partial_recvfrom(struct svc_rqst *rqstp,
				struct kvec *iov, int nr,
				int buflen, unsigned int base)
{
	size_t save_iovlen;
347
	void *save_iovbase;
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368
	unsigned int i;
	int ret;

	if (base == 0)
		return svc_recvfrom(rqstp, iov, nr, buflen);

	for (i = 0; i < nr; i++) {
		if (iov[i].iov_len > base)
			break;
		base -= iov[i].iov_len;
	}
	save_iovlen = iov[i].iov_len;
	save_iovbase = iov[i].iov_base;
	iov[i].iov_len -= base;
	iov[i].iov_base += base;
	ret = svc_recvfrom(rqstp, &iov[i], nr - i, buflen);
	iov[i].iov_len = save_iovlen;
	iov[i].iov_base = save_iovbase;
	return ret;
}

Linus Torvalds's avatar
Linus Torvalds committed
369 370 371
/*
 * Set socket snd and rcv buffer lengths
 */
372 373
static void svc_sock_setbufsize(struct socket *sock, unsigned int snd,
				unsigned int rcv)
Linus Torvalds's avatar
Linus Torvalds committed
374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390
{
#if 0
	mm_segment_t	oldfs;
	oldfs = get_fs(); set_fs(KERNEL_DS);
	sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
			(char*)&snd, sizeof(snd));
	sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
			(char*)&rcv, sizeof(rcv));
#else
	/* sock_setsockopt limits use to sysctl_?mem_max,
	 * which isn't acceptable.  Until that is made conditional
	 * on not having CAP_SYS_RESOURCE or similar, we go direct...
	 * DaveM said I could!
	 */
	lock_sock(sock->sk);
	sock->sk->sk_sndbuf = snd * 2;
	sock->sk->sk_rcvbuf = rcv * 2;
391
	sock->sk->sk_write_space(sock->sk);
Linus Torvalds's avatar
Linus Torvalds committed
392 393 394
	release_sock(sock->sk);
#endif
}
395 396 397 398 399 400

/*
 * Report whether the address returned by svc_addr() for this request
 * passes svc_port_is_privileged().
 */
static int svc_sock_secure_port(struct svc_rqst *rqstp)
{
	struct sockaddr *peer = svc_addr(rqstp);

	return svc_port_is_privileged(peer);
}

401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425
/*
 * NULL-tolerant wrapper around waitqueue_active(): returns false for a
 * NULL @wq, otherwise whatever waitqueue_active() reports.
 */
static bool sunrpc_waitqueue_active(wait_queue_head_t *wq)
{
	if (!wq)
		return false;
	/*
	 * There should normally be a memory barrier here--see
	 * wq_has_sleeper().
	 *
	 * It appears that isn't currently necessary, though, basically
	 * because callers all appear to have sufficient memory barriers
	 * between the time the relevant change is made and the
	 * time they call these callbacks.
	 *
	 * The nfsd code itself doesn't actually explicitly wait on
	 * these waitqueues, but it may wait on them for example in
	 * sendpage() or sendmsg() calls.  (And those may be the only
	 * places, since it uses nonblocking reads.)
	 *
	 * Maybe we should add the memory barriers anyway, but these are
	 * hot paths so we'd need to be convinced there's no significant
	 * penalty.
	 */
	return waitqueue_active(wq);
}

Linus Torvalds's avatar
Linus Torvalds committed
426 427 428
/*
 * INET callback when data has been received on the socket.
 */
429
static void svc_udp_data_ready(struct sock *sk)
Linus Torvalds's avatar
Linus Torvalds committed
430
{
431
	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;
432
	wait_queue_head_t *wq = sk_sleep(sk);
Linus Torvalds's avatar
Linus Torvalds committed
433

434
	if (svsk) {
435 436
		dprintk("svc: socket %p(inet %p), busy=%d\n",
			svsk, sk,
437 438
			test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
439
		svc_xprt_enqueue(&svsk->sk_xprt);
440
	}
441
	if (sunrpc_waitqueue_active(wq))
442
		wake_up_interruptible(wq);
Linus Torvalds's avatar
Linus Torvalds committed
443 444 445 446 447
}

/*
 * INET callback when space is newly available on the socket.
 */
448
static void svc_write_space(struct sock *sk)
Linus Torvalds's avatar
Linus Torvalds committed
449 450
{
	struct svc_sock	*svsk = (struct svc_sock *)(sk->sk_user_data);
451
	wait_queue_head_t *wq = sk_sleep(sk);
Linus Torvalds's avatar
Linus Torvalds committed
452 453 454

	if (svsk) {
		dprintk("svc: socket %p(inet %p), write_space busy=%d\n",
455
			svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags));
456
		svc_xprt_enqueue(&svsk->sk_xprt);
Linus Torvalds's avatar
Linus Torvalds committed
457 458
	}

459
	if (sunrpc_waitqueue_active(wq)) {
460
		dprintk("RPC svc_write_space: someone sleeping on %p\n",
Linus Torvalds's avatar
Linus Torvalds committed
461
		       svsk);
462
		wake_up_interruptible(wq);
Linus Torvalds's avatar
Linus Torvalds committed
463 464 465
	}
}

466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482
/*
 * Decide whether @xprt's TCP socket has room for another reply.
 *
 * Returns 1 for listener transports, when the stream write space covers
 * the currently reserved bytes plus one maximum-sized message, or when
 * nothing is reserved and sk_stream_min_wspace() is 0.  Otherwise sets
 * SOCK_NOSPACE on the socket and returns 0.
 */
static int svc_tcp_has_wspace(struct svc_xprt *xprt)
{
	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
	struct sock *sk;
	int needed;

	if (test_bit(XPT_LISTENER, &xprt->xpt_flags))
		return 1;

	sk = svsk->sk_sk;
	needed = atomic_read(&xprt->xpt_reserved) +
		 xprt->xpt_server->sv_max_mesg;
	if (sk_stream_wspace(sk) >= needed)
		return 1;
	if (sk_stream_min_wspace(sk) == 0 &&
	    atomic_read(&xprt->xpt_reserved) == 0)
		return 1;

	set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
	return 0;
}

483 484
static void svc_tcp_write_space(struct sock *sk)
{
485
	struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data);
486 487
	struct socket *sock = sk->sk_socket;

488 489 490
	if (!sk_stream_is_writeable(sk) || !sock)
		return;
	if (!svsk || svc_tcp_has_wspace(&svsk->sk_xprt))
491 492 493 494
		clear_bit(SOCK_NOSPACE, &sock->flags);
	svc_write_space(sk);
}

495 496 497 498 499 500 501 502
/*
 * Clear SOCK_NOSPACE on @xprt's socket when svc_tcp_has_wspace() says
 * there is room; otherwise leave the flag as svc_tcp_has_wspace() set it.
 */
static void svc_tcp_adjust_wspace(struct svc_xprt *xprt)
{
	struct svc_sock *svsk;

	if (!svc_tcp_has_wspace(xprt))
		return;

	svsk = container_of(xprt, struct svc_sock, sk_xprt);
	clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
}

503 504 505 506 507 508 509
/*
 * See net/ipv6/ip_sockglue.c : ip_cmsg_recv_pktinfo
 */
static int svc_udp_get_dest_address4(struct svc_rqst *rqstp,
				     struct cmsghdr *cmh)
{
	struct in_pktinfo *pki = CMSG_DATA(cmh);
510 511
	struct sockaddr_in *daddr = svc_daddr_in(rqstp);

512 513
	if (cmh->cmsg_type != IP_PKTINFO)
		return 0;
514 515 516

	daddr->sin_family = AF_INET;
	daddr->sin_addr.s_addr = pki->ipi_spec_dst.s_addr;
517 518 519 520
	return 1;
}

/*
521
 * See net/ipv6/datagram.c : ip6_datagram_recv_ctl
522 523 524 525 526
 */
static int svc_udp_get_dest_address6(struct svc_rqst *rqstp,
				     struct cmsghdr *cmh)
{
	struct in6_pktinfo *pki = CMSG_DATA(cmh);
527 528
	struct sockaddr_in6 *daddr = svc_daddr_in6(rqstp);

529 530
	if (cmh->cmsg_type != IPV6_PKTINFO)
		return 0;
531 532

	daddr->sin6_family = AF_INET6;
Alexey Dobriyan's avatar
Alexey Dobriyan committed
533
	daddr->sin6_addr = pki->ipi6_addr;
534
	daddr->sin6_scope_id = pki->ipi6_ifindex;
535 536 537
	return 1;
}

538 539 540 541 542 543 544
/*
 * Copy the UDP datagram's destination address to the rqstp structure.
 * The 'destination' address in this case is the address to which the
 * peer sent the datagram, i.e. our local address. For multihomed
 * hosts, this can change from msg to msg. Note that only the IP
 * address changes, the port number should remain the same.
 */
545 546
static int svc_udp_get_dest_address(struct svc_rqst *rqstp,
				    struct cmsghdr *cmh)
547
{
548 549 550 551 552
	switch (cmh->cmsg_level) {
	case SOL_IP:
		return svc_udp_get_dest_address4(rqstp, cmh);
	case SOL_IPV6:
		return svc_udp_get_dest_address6(rqstp, cmh);
553
	}
554 555

	return 0;
556 557
}

Linus Torvalds's avatar
Linus Torvalds committed
558 559 560
/*
 * Receive a datagram from a UDP socket.
 */
561
static int svc_udp_recvfrom(struct svc_rqst *rqstp)
Linus Torvalds's avatar
Linus Torvalds committed
562
{
563 564
	struct svc_sock	*svsk =
		container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
565
	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
Linus Torvalds's avatar
Linus Torvalds committed
566
	struct sk_buff	*skb;
567 568 569 570 571
	union {
		struct cmsghdr	hdr;
		long		all[SVC_PKTINFO_SPACE / sizeof(long)];
	} buffer;
	struct cmsghdr *cmh = &buffer.hdr;
572 573 574 575 576 577
	struct msghdr msg = {
		.msg_name = svc_addr(rqstp),
		.msg_control = cmh,
		.msg_controllen = sizeof(buffer),
		.msg_flags = MSG_DONTWAIT,
	};
578 579
	size_t len;
	int err;
Linus Torvalds's avatar
Linus Torvalds committed
580

581
	if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags))
Linus Torvalds's avatar
Linus Torvalds committed
582 583 584
	    /* udp sockets need large rcvbuf as all pending
	     * requests are still in that buffer.  sndbuf must
	     * also be large enough that there is enough space
585 586 587 588
	     * for one reply per thread.  We count all threads
	     * rather than threads in a particular pool, which
	     * provides an upper bound on the number of threads
	     * which will access the socket.
Linus Torvalds's avatar
Linus Torvalds committed
589 590
	     */
	    svc_sock_setbufsize(svsk->sk_sock,
591 592
				(serv->sv_nrthreads+3) * serv->sv_max_mesg,
				(serv->sv_nrthreads+3) * serv->sv_max_mesg);
Linus Torvalds's avatar
Linus Torvalds committed
593

594
	clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
595 596 597 598 599 600 601 602 603 604
	skb = NULL;
	err = kernel_recvmsg(svsk->sk_sock, &msg, NULL,
			     0, 0, MSG_PEEK | MSG_DONTWAIT);
	if (err >= 0)
		skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err);

	if (skb == NULL) {
		if (err != -EAGAIN) {
			/* possibly an icmp error */
			dprintk("svc: recvfrom returned error %d\n", -err);
605
			set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
Linus Torvalds's avatar
Linus Torvalds committed
606
		}
607
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
608
	}
609 610
	len = svc_addr_len(svc_addr(rqstp));
	rqstp->rq_addrlen = len;
611 612
	if (skb->tstamp.tv64 == 0) {
		skb->tstamp = ktime_get_real();
613
		/* Don't enable netstamp, sunrpc doesn't
Linus Torvalds's avatar
Linus Torvalds committed
614 615
		   need that much accuracy */
	}
616
	svsk->sk_sk->sk_stamp = skb->tstamp;
617
	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */
Linus Torvalds's avatar
Linus Torvalds committed
618

619
	len  = skb->len;
Linus Torvalds's avatar
Linus Torvalds committed
620 621
	rqstp->rq_arg.len = len;

622
	rqstp->rq_prot = IPPROTO_UDP;
623

624
	if (!svc_udp_get_dest_address(rqstp, cmh)) {
625 626
		net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n",
				     cmh->cmsg_level, cmh->cmsg_type);
627
		goto out_free;
628
	}
629
	rqstp->rq_daddrlen = svc_addr_len(svc_daddr(rqstp));
Linus Torvalds's avatar
Linus Torvalds committed
630 631 632 633 634 635 636

	if (skb_is_nonlinear(skb)) {
		/* we have to copy */
		local_bh_disable();
		if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) {
			local_bh_enable();
			/* checksum error */
637
			goto out_free;
Linus Torvalds's avatar
Linus Torvalds committed
638 639
		}
		local_bh_enable();
640
		skb_free_datagram_locked(svsk->sk_sk, skb);
Linus Torvalds's avatar
Linus Torvalds committed
641 642
	} else {
		/* we can use it in-place */
643
		rqstp->rq_arg.head[0].iov_base = skb->data;
Linus Torvalds's avatar
Linus Torvalds committed
644
		rqstp->rq_arg.head[0].iov_len = len;
645 646
		if (skb_checksum_complete(skb))
			goto out_free;
647
		rqstp->rq_xprt_ctxt = skb;
Linus Torvalds's avatar
Linus Torvalds committed
648 649 650 651 652 653
	}

	rqstp->rq_arg.page_base = 0;
	if (len <= rqstp->rq_arg.head[0].iov_len) {
		rqstp->rq_arg.head[0].iov_len = len;
		rqstp->rq_arg.page_len = 0;
654
		rqstp->rq_respages = rqstp->rq_pages+1;
Linus Torvalds's avatar
Linus Torvalds committed
655 656
	} else {
		rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len;
657
		rqstp->rq_respages = rqstp->rq_pages + 1 +
658
			DIV_ROUND_UP(rqstp->rq_arg.page_len, PAGE_SIZE);
Linus Torvalds's avatar
Linus Torvalds committed
659
	}
660
	rqstp->rq_next_page = rqstp->rq_respages+1;
Linus Torvalds's avatar
Linus Torvalds committed
661 662 663 664 665

	if (serv->sv_stats)
		serv->sv_stats->netudpcnt++;

	return len;
666 667 668 669
out_free:
	trace_kfree_skb(skb, svc_udp_recvfrom);
	skb_free_datagram_locked(svsk->sk_sk, skb);
	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
670 671 672 673 674 675 676 677 678 679 680 681 682 683 684
}

/*
 * Send the reply held in rq_res with svc_sendto().  A result of
 * -ECONNREFUSED is taken to be an ICMP error left over from an earlier
 * request, so the send is repeated once and the second result returned.
 */
static int
svc_udp_sendto(struct svc_rqst *rqstp)
{
	struct xdr_buf	*reply = &rqstp->rq_res;
	int		sent = svc_sendto(rqstp, reply);

	if (sent != -ECONNREFUSED)
		return sent;

	/* ICMP error on earlier request. */
	return svc_sendto(rqstp, reply);
}

Tom Tucker's avatar
Tom Tucker committed
685 686 687 688
/* Reply-header preparation hook for UDP: intentionally does nothing. */
static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp)
{
}

689 690 691
static int svc_udp_has_wspace(struct svc_xprt *xprt)
{
	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
692
	struct svc_serv	*serv = xprt->xpt_server;
693 694 695 696 697 698 699
	unsigned long required;

	/*
	 * Set the SOCK_NOSPACE flag before checking the available
	 * sock space.
	 */
	set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
Tom Tucker's avatar
Tom Tucker committed
700
	required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg;
701 702 703 704 705 706
	if (required*2 > sock_wspace(svsk->sk_sk))
		return 0;
	clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags);
	return 1;
}

707 708 709 710 711 712
/*
 * Accept method for the UDP transport.  The body is BUG(): reaching this
 * function is treated as a fatal programming error.
 */
static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt)
{
	BUG();
	return NULL;
}

713
static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
714
				       struct net *net,
715 716 717
				       struct sockaddr *sa, int salen,
				       int flags)
{
718
	return svc_create_socket(serv, IPPROTO_UDP, net, sa, salen, flags);
719 720
}

721
static struct svc_xprt_ops svc_udp_ops = {
722
	.xpo_create = svc_udp_create,
723 724
	.xpo_recvfrom = svc_udp_recvfrom,
	.xpo_sendto = svc_udp_sendto,
725
	.xpo_release_rqst = svc_release_skb,
726 727
	.xpo_detach = svc_sock_detach,
	.xpo_free = svc_sock_free,
Tom Tucker's avatar
Tom Tucker committed
728
	.xpo_prep_reply_hdr = svc_udp_prep_reply_hdr,
729
	.xpo_has_wspace = svc_udp_has_wspace,
730
	.xpo_accept = svc_udp_accept,
731
	.xpo_secure_port = svc_sock_secure_port,
732 733 734 735
};

static struct svc_xprt_class svc_udp_class = {
	.xcl_name = "udp",
736
	.xcl_owner = THIS_MODULE,
737
	.xcl_ops = &svc_udp_ops,
738
	.xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP,
739
	.xcl_ident = XPRT_TRANSPORT_UDP,
740 741
};

742
static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv)
Linus Torvalds's avatar
Linus Torvalds committed
743
{
744
	int err, level, optname, one = 1;
745

746 747
	svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class,
		      &svsk->sk_xprt, serv);
748
	clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);
Linus Torvalds's avatar
Linus Torvalds committed
749 750 751 752
	svsk->sk_sk->sk_data_ready = svc_udp_data_ready;
	svsk->sk_sk->sk_write_space = svc_write_space;

	/* initialise setting must have enough space to
753
	 * receive and respond to one request.
Linus Torvalds's avatar
Linus Torvalds committed
754 755 756
	 * svc_udp_recvfrom will re-adjust if necessary
	 */
	svc_sock_setbufsize(svsk->sk_sock,
757 758
			    3 * svsk->sk_xprt.xpt_server->sv_max_mesg,
			    3 * svsk->sk_xprt.xpt_server->sv_max_mesg);
Linus Torvalds's avatar
Linus Torvalds committed
759

760 761
	/* data might have come in before data_ready set up */
	set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
762
	set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags);
763 764

	/* make sure we get destination address info */
765 766 767 768 769 770 771 772 773 774 775 776 777 778 779
	switch (svsk->sk_sk->sk_family) {
	case AF_INET:
		level = SOL_IP;
		optname = IP_PKTINFO;
		break;
	case AF_INET6:
		level = SOL_IPV6;
		optname = IPV6_RECVPKTINFO;
		break;
	default:
		BUG();
	}
	err = kernel_setsockopt(svsk->sk_sock, level, optname,
					(char *)&one, sizeof(one));
	dprintk("svc: kernel_setsockopt returned %d\n", err);
Linus Torvalds's avatar
Linus Torvalds committed
780 781 782 783 784 785
}

/*
 * A data_ready event on a listening socket means there's a connection
 * pending. Do not use state_change as a substitute for it.
 */
786
static void svc_tcp_listen_data_ready(struct sock *sk)
Linus Torvalds's avatar
Linus Torvalds committed
787
{
788
	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;
789
	wait_queue_head_t *wq;
Linus Torvalds's avatar
Linus Torvalds committed
790 791

	dprintk("svc: socket %p TCP (listen) state change %d\n",
792
		sk, sk->sk_state);
Linus Torvalds's avatar
Linus Torvalds committed
793

794 795 796 797 798 799 800 801 802 803 804 805
	/*
	 * This callback may called twice when a new connection
	 * is established as a child socket inherits everything
	 * from a parent LISTEN socket.
	 * 1) data_ready method of the parent socket will be called
	 *    when one of child sockets become ESTABLISHED.
	 * 2) data_ready method of the child socket may be called
	 *    when it receives data before the socket is accepted.
	 * In case of 2, we should ignore it silently.
	 */
	if (sk->sk_state == TCP_LISTEN) {
		if (svsk) {
806
			set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
807
			svc_xprt_enqueue(&svsk->sk_xprt);
808 809
		} else
			printk("svc: socket %p: no user data\n", sk);
Linus Torvalds's avatar
Linus Torvalds committed
810
	}
811

812
	wq = sk_sleep(sk);
813
	if (sunrpc_waitqueue_active(wq))
814
		wake_up_interruptible_all(wq);
Linus Torvalds's avatar
Linus Torvalds committed
815 816 817 818 819
}

/*
 * A state change on a connected socket means it's dying or dead.
 */
820
static void svc_tcp_state_change(struct sock *sk)
Linus Torvalds's avatar
Linus Torvalds committed
821
{
822
	struct svc_sock	*svsk = (struct svc_sock *)sk->sk_user_data;
823
	wait_queue_head_t *wq = sk_sleep(sk);
Linus Torvalds's avatar
Linus Torvalds committed
824 825

	dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n",
826
		sk, sk->sk_state, sk->sk_user_data);
Linus Torvalds's avatar
Linus Torvalds committed
827

828
	if (!svsk)
Linus Torvalds's avatar
Linus Torvalds committed
829
		printk("svc: socket %p: no user data\n", sk);
830
	else {
831
		set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
832
		svc_xprt_enqueue(&svsk->sk_xprt);
Linus Torvalds's avatar
Linus Torvalds committed
833
	}
834
	if (sunrpc_waitqueue_active(wq))
835
		wake_up_interruptible_all(wq);
Linus Torvalds's avatar
Linus Torvalds committed
836 837
}

838
static void svc_tcp_data_ready(struct sock *sk)
Linus Torvalds's avatar
Linus Torvalds committed
839
{
840
	struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data;
841
	wait_queue_head_t *wq = sk_sleep(sk);
Linus Torvalds's avatar
Linus Torvalds committed
842 843

	dprintk("svc: socket %p TCP data ready (svsk %p)\n",
844 845
		sk, sk->sk_user_data);
	if (svsk) {
846
		set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
847
		svc_xprt_enqueue(&svsk->sk_xprt);
848
	}
849
	if (sunrpc_waitqueue_active(wq))
850
		wake_up_interruptible(wq);
Linus Torvalds's avatar
Linus Torvalds committed
851 852 853 854 855
}

/*
 * Accept a TCP connection
 */
856
static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt)
Linus Torvalds's avatar
Linus Torvalds committed
857
{
858
	struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
859 860
	struct sockaddr_storage addr;
	struct sockaddr	*sin = (struct sockaddr *) &addr;
861
	struct svc_serv	*serv = svsk->sk_xprt.xpt_server;
Linus Torvalds's avatar
Linus Torvalds committed
862 863 864 865
	struct socket	*sock = svsk->sk_sock;
	struct socket	*newsock;
	struct svc_sock	*newsvsk;
	int		err, slen;
866
	RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
Linus Torvalds's avatar
Linus Torvalds committed
867 868 869

	dprintk("svc: tcp_accept %p sock %p\n", svsk, sock);
	if (!sock)
870
		return NULL;
Linus Torvalds's avatar
Linus Torvalds committed
871

872
	clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
873 874
	err = kernel_accept(sock, &newsock, O_NONBLOCK);
	if (err < 0) {
Linus Torvalds's avatar
Linus Torvalds committed
875 876 877
		if (err == -ENOMEM)
			printk(KERN_WARNING "%s: no more sockets!\n",
			       serv->sv_name);
878 879 880
		else if (err != -EAGAIN)
			net_warn_ratelimited("%s: accept failed (err %d)!\n",
					     serv->sv_name, -err);
881
		return NULL;
Linus Torvalds's avatar
Linus Torvalds committed
882
	}
883
	set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags);
Linus Torvalds's avatar
Linus Torvalds committed
884

885
	err = kernel_getpeername(newsock, sin, &slen);
Linus Torvalds's avatar
Linus Torvalds committed
886
	if (err < 0) {
887 888
		net_warn_ratelimited("%s: peername failed (err %d)!\n",
				     serv->sv_name, -err);
Linus Torvalds's avatar
Linus Torvalds committed
889 890 891 892
		goto failed;		/* aborted connection or whatever */
	}

	/* Ideally, we would want to reject connections from unauthorized
893 894
	 * hosts here, but when we get encryption, the IP of the host won't
	 * tell us anything.  For now just warn about unpriv connections.
Linus Torvalds's avatar
Linus Torvalds committed
895
	 */
896
	if (!svc_port_is_privileged(sin)) {
897
		dprintk("%s: connect from unprivileged port: %s\n",
898
			serv->sv_name,
899
			__svc_print_addr(sin, buf, sizeof(buf)));
Linus Torvalds's avatar
Linus Torvalds committed
900
	}
901
	dprintk("%s: connect from %s\n", serv->sv_name,
902
		__svc_print_addr(sin, buf, sizeof(buf)));
Linus Torvalds's avatar
Linus Torvalds committed
903 904 905 906 907 908

	/* make sure that a write doesn't block forever when
	 * low on memory
	 */
	newsock->sk->sk_sndtimeo = HZ*30;

909 910 911
	newsvsk = svc_setup_socket(serv, newsock,
				 (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY));
	if (IS_ERR(newsvsk))
Linus Torvalds's avatar
Linus Torvalds committed
912
		goto failed;
913
	svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen);
914 915 916 917 918
	err = kernel_getsockname(newsock, sin, &slen);
	if (unlikely(err < 0)) {
		dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err);
		slen = offsetof(struct sockaddr, sa_data);
	}
919
	svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen);
920

921 922 923 924
	if (sock_is_loopback(newsock->sk))
		set_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags);
	else
		clear_bit(XPT_LOCAL, &newsvsk->sk_xprt.xpt_flags);
925 926 927 928 929 930 931 932 933 934
	if (serv->sv_stats)
		serv->sv_stats->nettcpconn++;

	return &newsvsk->sk_xprt;

failed:
	sock_release(newsock);
	return NULL;
}

935 936 937 938
static unsigned int svc_tcp_restore_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
{
	unsigned int i, len, npages;

939
	if (svsk->sk_datalen == 0)
940
		return 0;
941
	len = svsk->sk_datalen;
942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957
	npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	for (i = 0; i < npages; i++) {
		if (rqstp->rq_pages[i] != NULL)
			put_page(rqstp->rq_pages[i]);
		BUG_ON(svsk->sk_pages[i] == NULL);
		rqstp->rq_pages[i] = svsk->sk_pages[i];
		svsk->sk_pages[i] = NULL;
	}
	rqstp->rq_arg.head[0].iov_base = page_address(rqstp->rq_pages[0]);
	return len;
}

static void svc_tcp_save_pages(struct svc_sock *svsk, struct svc_rqst *rqstp)
{
	unsigned int i, len, npages;

958
	if (svsk->sk_datalen == 0)
959
		return;
960
	len = svsk->sk_datalen;
961 962 963 964 965 966 967 968 969 970 971
	npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	for (i = 0; i < npages; i++) {
		svsk->sk_pages[i] = rqstp->rq_pages[i];
		rqstp->rq_pages[i] = NULL;
	}
}

static void svc_tcp_clear_pages(struct svc_sock *svsk)
{
	unsigned int i, len, npages;

972
	if (svsk->sk_datalen == 0)
973
		goto out;
974
	len = svsk->sk_datalen;
975 976
	npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	for (i = 0; i < npages; i++) {
977 978 979 980
		if (svsk->sk_pages[i] == NULL) {
			WARN_ON_ONCE(1);
			continue;
		}
981 982 983 984 985
		put_page(svsk->sk_pages[i]);
		svsk->sk_pages[i] = NULL;
	}
out:
	svsk->sk_tcplen = 0;
986
	svsk->sk_datalen = 0;
987 988
}

Linus Torvalds's avatar
Linus Torvalds committed
989
/*
990
 * Receive fragment record header.
991
 * If we haven't gotten the record length yet, get the next four bytes.
Linus Torvalds's avatar
Linus Torvalds committed
992
 */