// SPDX-License-Identifier: GPL-2.0
/*
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 *  4 February 1994
 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 *     flag set in its personality we do *not* modify the given timeout
 *     parameter to reflect time remaining.
 *
 *  24 January 2000
 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */

18
#include <linux/kernel.h>
19 20
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
Linus Torvalds's avatar
Linus Torvalds committed
21
#include <linux/syscalls.h>
22
#include <linux/export.h>
Linus Torvalds's avatar
Linus Torvalds committed
23 24 25 26
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
#include <linux/file.h>
Al Viro's avatar
Al Viro committed
27
#include <linux/fdtable.h>
Linus Torvalds's avatar
Linus Torvalds committed
28
#include <linux/fs.h>
29
#include <linux/rcupdate.h>
30
#include <linux/hrtimer.h>
31
#include <linux/freezer.h>
32
#include <net/busy_poll.h>
33
#include <linux/vmalloc.h>
Linus Torvalds's avatar
Linus Torvalds committed
34

35
#include <linux/uaccess.h>
Linus Torvalds's avatar
Linus Torvalds committed
36

37 38 39 40 41 42 43 44 45 46 47 48 49

/*
 * Estimate expected accuracy in ns from a timeval.
 *
 * After quite a bit of churning around, we've settled on
 * a simple thing of taking 0.1% of the timeout as the
 * slack, with a cap of 100 msec.
 * "nice" tasks get a 0.5% slack instead.
 *
 * Consider this comment an open invitation to come up with even
 * better solutions..
 */

50 51
#define MAX_SLACK	(100 * NSEC_PER_MSEC)

52
static long __estimate_accuracy(struct timespec64 *tv)
53
{
54
	long slack;
55 56
	int divfactor = 1000;

57 58 59
	if (tv->tv_sec < 0)
		return 0;

60
	if (task_nice(current) > 0)
61 62
		divfactor = divfactor / 5;

63 64 65
	if (tv->tv_sec > MAX_SLACK / (NSEC_PER_SEC/divfactor))
		return MAX_SLACK;

66 67 68
	slack = tv->tv_nsec / divfactor;
	slack += tv->tv_sec * (NSEC_PER_SEC/divfactor);

69 70
	if (slack > MAX_SLACK)
		return MAX_SLACK;
71

72 73 74
	return slack;
}

75
u64 select_estimate_accuracy(struct timespec64 *tv)
76
{
77
	u64 ret;
78
	struct timespec64 now;
79 80 81 82 83

	/*
	 * Realtime tasks get a slack of 0 for obvious reasons.
	 */

84
	if (rt_task(current))
85 86
		return 0;

87 88
	ktime_get_ts64(&now);
	now = timespec64_sub(*tv, now);
89 90 91 92 93 94 95 96
	ret = __estimate_accuracy(&now);
	if (ret < current->timer_slack_ns)
		return current->timer_slack_ns;
	return ret;
}



Linus Torvalds's avatar
Linus Torvalds committed
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
/*
 * Header of one overflow page of poll table entries.
 *
 * @next:    next page in the singly linked list (NULL at the tail)
 * @entry:   first unused slot in @entries
 * @entries: the slots themselves; declared as a C99 flexible array member
 *           instead of a zero-length array, sizeof() is unchanged
 */
struct poll_table_page {
	struct poll_table_page *next;
	struct poll_table_entry *entry;
	struct poll_table_entry entries[];
};

/* True when one more entry after (table)->entry would run past the page. */
#define POLL_TABLE_FULL(table) \
	((unsigned long)((table)->entry + 1) > (unsigned long)(table) + PAGE_SIZE)

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait(), do all the
 * work.  poll_wait() is an inline function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
118 119
/* Forward declaration; poll_initwait() below installs it via init_poll_funcptr(). */
static void __pollwait(struct file *filp,
		       wait_queue_head_t *wait_address,
		       poll_table *p);
Linus Torvalds's avatar
Linus Torvalds committed
120 121 122 123

void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
124
	pwq->polling_task = current;
125
	pwq->triggered = 0;
Linus Torvalds's avatar
Linus Torvalds committed
126 127
	pwq->error = 0;
	pwq->table = NULL;
128
	pwq->inline_index = 0;
Linus Torvalds's avatar
Linus Torvalds committed
129 130 131
}
EXPORT_SYMBOL(poll_initwait);

132 133
static void free_poll_entry(struct poll_table_entry *entry)
{
WANG Cong's avatar
WANG Cong committed
134
	remove_wait_queue(entry->wait_address, &entry->wait);
135 136 137
	fput(entry->filp);
}

Linus Torvalds's avatar
Linus Torvalds committed
138 139 140
void poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page * p = pwq->table;
141 142 143
	int i;
	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);
Linus Torvalds's avatar
Linus Torvalds committed
144 145 146 147 148 149 150
	while (p) {
		struct poll_table_entry * entry;
		struct poll_table_page *old;

		entry = p->entry;
		do {
			entry--;
151
			free_poll_entry(entry);
Linus Torvalds's avatar
Linus Torvalds committed
152 153 154 155 156 157 158 159
		} while (entry > p->entries);
		old = p;
		p = p->next;
		free_page((unsigned long) old);
	}
}
EXPORT_SYMBOL(poll_freewait);

160
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
Linus Torvalds's avatar
Linus Torvalds committed
161 162 163
{
	struct poll_table_page *table = p->table;

164 165 166
	if (p->inline_index < N_INLINE_POLL_ENTRIES)
		return p->inline_entries + p->inline_index++;

Linus Torvalds's avatar
Linus Torvalds committed
167 168 169 170 171 172
	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		if (!new_table) {
			p->error = -ENOMEM;
173
			return NULL;
Linus Torvalds's avatar
Linus Torvalds committed
174 175 176 177 178 179 180
		}
		new_table->entry = new_table->entries;
		new_table->next = table;
		p->table = new_table;
		table = new_table;
	}

181 182 183
	return table->entry++;
}

184
/*
 * Mark the poll_wqueues behind @wait as triggered and wake its polling task.
 */
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_wqueues *queues = wait->private;
	DECLARE_WAITQUEUE(proxy_wait, queues->polling_task);

	/*
	 * This runs under the waitqueue lock, but LOCK does not imply a
	 * write barrier while users of wakeup functions expect one.  The
	 * smp_wmb() below is equivalent to the smp_wmb() in
	 * try_to_wake_up() and pairs with the smp_store_mb() in
	 * poll_schedule_timeout().
	 */
	smp_wmb();
	queues->triggered = 1;

	/*
	 * Do the default wake-up through a throwaway waitqueue entry.
	 *
	 * TODO: hacky, but there is currently no interface that takes
	 * @sync.  @sync is scheduled for removal; once gone,
	 * wake_up_process() can be called directly.
	 */
	return default_wake_function(&proxy_wait, mode, sync, key);
}

210
/*
 * Wake function installed on every poll table entry by __pollwait().
 *
 * A non-NULL @key is treated as a bitmask and compared with entry->key
 * (the mask captured at registration); if they share no bit the wake-up
 * is ignored and 0 is returned.  Otherwise __pollwake() does the work.
 */
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct poll_table_entry *entry =
		container_of(wait, struct poll_table_entry, wait);
	unsigned long events = (unsigned long)key;

	if (key && !(events & entry->key))
		return 0;

	return __pollwake(wait, mode, sync, key);
}

220 221 222 223
/*
 * Queueing callback: register the poll_wqueues that embeds @p on
 * @wait_address.
 *
 * Takes a reference on @filp, records the currently requested key, and
 * adds an entry using pollwake() as its wake function.  If no entry can
 * be obtained the function returns without registering anything
 * (poll_get_entry() has recorded the error).
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	struct poll_wqueues *queues = container_of(p, struct poll_wqueues, pt);
	struct poll_table_entry *slot;

	slot = poll_get_entry(queues);
	if (!slot)
		return;

	slot->key = p->_key;
	slot->wait_address = wait_address;
	slot->filp = get_file(filp);

	/* ->private is set after the init call, which also writes to the entry */
	init_waitqueue_func_entry(&slot->wait, pollwake);
	slot->wait.private = queues;

	add_wait_queue(wait_address, &slot->wait);
}

236 237 238 239 240 241 242
/*
 * Sleep in @state until woken or until @expires (absolute, with @slack).
 *
 * Returns -EINTR without sleeping if @pwq was already triggered,
 * otherwise whatever schedule_hrtimeout_range() returns.  The triggered
 * flag is always cleared before returning.
 */
int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int ret;

	set_current_state(state);
	if (pwq->triggered)
		ret = -EINTR;
	else
		ret = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
	__set_current_state(TASK_RUNNING);

	/*
	 * Get ready for the next iteration.
	 *
	 * smp_store_mb() does two jobs here.  It is the rmb counterpart
	 * of the wmb in pollwake(), so data written before the wake-up
	 * is visible afterwards.  And being a full barrier it keeps the
	 * clearing of ->triggered from passing the event check of the
	 * next iteration.  The first iteration does not have this
	 * problem because add_wait_queue() has full barrier semantics.
	 */
	smp_store_mb(pwq->triggered, 0);

	return ret;
}
EXPORT_SYMBOL(poll_schedule_timeout);

263 264
/**
 * poll_select_set_timeout - helper function to setup the timeout value
265
 * @to:		pointer to timespec64 variable for the final timeout
266 267 268 269 270 271 272 273
 * @sec:	seconds (from user space)
 * @nsec:	nanoseconds (from user space)
 *
 * Note, we do not use a timespec for the user space value here, That
 * way we can use the function for timeval and compat interfaces as well.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
274
int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
275
{
276
	struct timespec64 ts = {.tv_sec = sec, .tv_nsec = nsec};
277

278
	if (!timespec64_valid(&ts))
279 280 281 282 283 284
		return -EINVAL;

	/* Optimize for the zero timeout value here */
	if (!sec && !nsec) {
		to->tv_sec = to->tv_nsec = 0;
	} else {
285 286
		ktime_get_ts64(to);
		*to = timespec64_add_safe(*to, ts);
287 288 289 290
	}
	return 0;
}

291 292
/*
 * Write the time left until @end_time back to user space.
 *
 * @end_time: absolute expiry used for the wait
 * @p:        user pointer to update; NULL means nothing to do
 * @timeval:  non-zero to write a struct timeval, zero for struct timespec
 * @ret:      result of the wait, passed through
 *
 * Returns @ret, except that -ERESTARTNOHAND becomes -EINTR when the
 * remaining time was not written (STICKY_TIMEOUTS personality, or the
 * copy to user space failed).
 */
static int poll_select_copy_remaining(struct timespec64 *end_time,
				      void __user *p,
				      int timeval, int ret)
{
	struct timespec64 remaining;
	unsigned long uncopied;

	if (!p)
		return ret;

	if (!(current->personality & STICKY_TIMEOUTS)) {
		/* No update for zero timeout */
		if (!end_time->tv_sec && !end_time->tv_nsec)
			return ret;

		ktime_get_ts64(&remaining);
		remaining = timespec64_sub(*end_time, remaining);
		if (remaining.tv_sec < 0) {
			remaining.tv_nsec = 0;
			remaining.tv_sec = 0;
		}

		if (timeval) {
			struct timeval rtv;

			/* clear the struct first if it is larger than its two fields */
			if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
				memset(&rtv, 0, sizeof(rtv));
			rtv.tv_sec = remaining.tv_sec;
			rtv.tv_usec = remaining.tv_nsec / NSEC_PER_USEC;
			uncopied = copy_to_user(p, &rtv, sizeof(rtv));
		} else {
			struct timespec rts = timespec64_to_timespec(remaining);

			uncopied = copy_to_user(p, &rts, sizeof(rts));
		}

		if (!uncopied)
			return ret;
	}

	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	return ret;
}

342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388
/*
 * Scalable version of the fd_set.
 */

/*
 * The six bitmaps one select call works on: three requested sets and
 * three result sets.
 */
typedef struct {
	unsigned long *in;		/* requested: readable */
	unsigned long *out;		/* requested: writable */
	unsigned long *ex;		/* requested: exceptional */
	unsigned long *res_in;		/* result for in */
	unsigned long *res_out;		/* result for out */
	unsigned long *res_ex;		/* result for ex */
} fd_set_bits;

/*
 * Sizing helpers for an fd bitmap of "nr" bits:
 *   FDS_BITPERLONG - bits in one long
 *   FDS_LONGS(nr)  - longs needed, rounded up
 *   FDS_BYTES(nr)  - the same, in bytes
 */
#define FDS_BITPERLONG	(sizeof(long) * 8)
#define FDS_LONGS(nr)	(((nr) + (FDS_BITPERLONG - 1)) / FDS_BITPERLONG)
#define FDS_BYTES(nr)	(sizeof(long) * FDS_LONGS(nr))

/*
 * Fetch one fd bitmap of @nr bits from user space into @fdset.
 *
 * A NULL @ufdset is treated as an empty set: @fdset is zero-filled.
 * The copy length is FDS_BYTES(nr), i.e. a whole number of longs.
 *
 * Returns 0 on success, -EFAULT if the copy from user space fails.
 */
static inline
int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	unsigned long bytes = FDS_BYTES(nr);

	if (!ufdset) {
		memset(fdset, 0, bytes);
		return 0;
	}

	if (copy_from_user(fdset, ufdset, bytes))
		return -EFAULT;
	return 0;
}

/*
 * Copy one result bitmap of @nr bits back to user space.
 *
 * Returns 0 when @ufdset is NULL, otherwise the return value of
 * __copy_to_user() for FDS_BYTES(nr) bytes.
 */
static inline unsigned long __must_check
set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	if (!ufdset)
		return 0;

	return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
}

/* Clear an fd bitmap large enough for @nr bits. */
static inline
void zero_fd_set(unsigned long nr, unsigned long *fdset)
{
	unsigned long bytes = FDS_BYTES(nr);

	memset(fdset, 0, bytes);
}

Linus Torvalds's avatar
Linus Torvalds committed
389 390 391 392 393 394 395 396 397 398 399
/*
 * Address of word @n in the in/out/ex bitmap of @fds.  Both arguments are
 * parenthesized so that expressions such as "&set" or "i & 1" expand
 * correctly.
 */
#define FDS_IN(fds, n)		((fds)->in + (n))
#define FDS_OUT(fds, n)		((fds)->out + (n))
#define FDS_EX(fds, n)		((fds)->ex + (n))

/* OR of word @n across the three requested bitmaps of @fds. */
#define BITS(fds, n)	(*FDS_IN(fds, n) | *FDS_OUT(fds, n) | *FDS_EX(fds, n))

/* 1-based position of the highest set bit of a non-zero word. */
static inline int select_top_bit(unsigned long set)
{
	int pos = 0;

	do {
		pos++;
		set >>= 1;
	} while (set);

	return pos;
}

/*
 * Validate the requested fds below @n against the current task's open-fd
 * bitmap.
 *
 * Returns -EBADF if any requested fd is not marked open, otherwise the
 * highest requested fd plus one (0 when nothing is requested).
 */
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
	unsigned long tail_mask, word, set;
	unsigned long *open_fds;
	struct fdtable *fdt;
	int max = 0;

	/* bits of the last, possibly incomplete, long-word that are below n */
	tail_mask = ~(~0UL << (n & (BITS_PER_LONG-1)));
	word = n / BITS_PER_LONG;
	fdt = files_fdtable(current->files);
	open_fds = fdt->open_fds;

	if (tail_mask) {
		set = tail_mask & BITS(fds, word);
		if (set) {
			if (set & ~open_fds[word])
				return -EBADF;
			max = select_top_bit(set) + word * BITS_PER_LONG;
		}
	}

	/* remaining full words, highest first; only the first hit sets max */
	while (word) {
		word--;
		set = BITS(fds, word);
		if (!set)
			continue;
		if (set & ~open_fds[word])
			return -EBADF;
		if (!max)
			max = select_top_bit(set) + word * BITS_PER_LONG;
	}

	return max;
}

/* Poll result bits that count as "ready" for the in, out and ex sets. */
#define POLLIN_SET	(POLLIN | POLLRDNORM | POLLRDBAND | POLLHUP | POLLERR)
#define POLLOUT_SET	(POLLOUT | POLLWRNORM | POLLWRBAND | POLLERR)
#define POLLEX_SET	(POLLPRI)

441
/*
 * Set wait->_key for the fd selected by @bit: always POLLEX_SET and
 * @ll_flag, plus POLLIN_SET / POLLOUT_SET when @bit is set in @in / @out.
 */
static inline void wait_key_set(poll_table *wait, unsigned long in,
				unsigned long out, unsigned long bit,
				unsigned int ll_flag)
{
	unsigned long key = POLLEX_SET | ll_flag;

	if (in & bit)
		key |= POLLIN_SET;
	if (out & bit)
		key |= POLLOUT_SET;

	wait->_key = key;
}

452
/*
 * Core select loop.
 *
 * @n:        number of fds to examine; trimmed to the highest requested fd
 * @fds:      requested bitmaps (in/out/ex) and result bitmaps (res_*)
 * @end_time: absolute expiry, NULL for no timeout, zero for "do not wait"
 *
 * Returns the number of ready bits recorded in the result bitmaps, 0 on
 * timeout or pending signal, or a negative error from max_select_fd() or
 * from the poll table.
 */
static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
	struct poll_wqueues table;
	poll_table *wait = &table.pt;
	ktime_t expire, *to = NULL;
	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;
	u64 slack = 0;
	int timed_out = 0;
	int retval, i;

	rcu_read_lock();
	retval = max_select_fd(n, fds);
	rcu_read_unlock();

	if (retval < 0)
		return retval;
	n = retval;

	poll_initwait(&table);
	if (end_time) {
		if (!end_time->tv_sec && !end_time->tv_nsec) {
			/* zero timeout: one pass, no wait-queue registration */
			wait->_qproc = NULL;
			timed_out = 1;
		} else {
			slack = select_estimate_accuracy(end_time);
		}
	}

	retval = 0;
	for (;;) {
		bool can_busy_loop = false;
		unsigned long word;

		/* one iteration per long-word of the bitmaps */
		for (i = 0, word = 0; i < n; word++) {
			unsigned long in = fds->in[word];
			unsigned long out = fds->out[word];
			unsigned long ex = fds->ex[word];
			unsigned long all_bits = in | out | ex;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			unsigned long bit, mask, j;

			if (!all_bits) {
				i += BITS_PER_LONG;
				continue;
			}

			for (j = 0, bit = 1; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
				const struct file_operations *f_op;
				struct fd f;

				if (i >= n)
					break;
				if (!(all_bits & bit))
					continue;

				f = fdget(i);
				if (!f.file)
					continue;

				f_op = f.file->f_op;
				if (f_op->poll) {
					wait_key_set(wait, in, out, bit, busy_flag);
					mask = (*f_op->poll)(f.file, wait);
				} else {
					mask = DEFAULT_POLLMASK;
				}
				fdput(f);

				/* the first hit also stops further registration */
				if ((in & bit) && (mask & POLLIN_SET)) {
					res_in |= bit;
					retval++;
					wait->_qproc = NULL;
				}
				if ((out & bit) && (mask & POLLOUT_SET)) {
					res_out |= bit;
					retval++;
					wait->_qproc = NULL;
				}
				if ((ex & bit) && (mask & POLLEX_SET)) {
					res_ex |= bit;
					retval++;
					wait->_qproc = NULL;
				}

				if (retval) {
					/* got something, stop busy polling */
					can_busy_loop = false;
					busy_flag = 0;
				} else if (busy_flag & mask) {
					/*
					 * only remember a returned
					 * POLL_BUSY_LOOP if we asked for it
					 */
					can_busy_loop = true;
				}
			}

			if (res_in)
				fds->res_in[word] = res_in;
			if (res_out)
				fds->res_out[word] = res_out;
			if (res_ex)
				fds->res_ex[word] = res_ex;
			cond_resched();
		}

		wait->_qproc = NULL;
		if (retval || timed_out || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		if (can_busy_loop && !need_resched()) {
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		busy_flag = 0;

		/*
		 * First time we are about to sleep with a timeout: convert
		 * it to ktime_t and point "to" at the expiry value.
		 */
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}

		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, to, slack))
			timed_out = 1;
	}

	poll_freewait(&table);

	return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
600
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
601
			   fd_set __user *exp, struct timespec64 *end_time)
Linus Torvalds's avatar
Linus Torvalds committed
602 603
{
	fd_set_bits fds;
604
	void *bits;
605
	int ret, max_fds;
606
	size_t size, alloc_size;
607
	struct fdtable *fdt;
608
	/* Allocate small arguments on the stack to save memory and be faster */
609
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
Linus Torvalds's avatar
Linus Torvalds committed
610 611 612 613 614

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

615
	/* max_fds can increase, so grab it once to avoid race */
616
	rcu_read_lock();
617
	fdt = files_fdtable(current->files);
618
	max_fds = fdt->max_fds;
619
	rcu_read_unlock();
620 621
	if (n > max_fds)
		n = max_fds;
Linus Torvalds's avatar
Linus Torvalds committed
622 623 624 625 626 627 628

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words. 
	 */
	size = FDS_BYTES(n);
629 630 631 632
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		/* Not enough space in on-stack array; must use kmalloc */
		ret = -ENOMEM;
633 634 635 636
		if (size > (SIZE_MAX / 6))
			goto out_nofds;

		alloc_size = 6 * size;
637
		bits = kvmalloc(alloc_size, GFP_KERNEL);
638 639 640
		if (!bits)
			goto out_nofds;
	}
641 642 643 644 645 646
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;
Linus Torvalds's avatar
Linus Torvalds committed
647 648 649 650 651 652 653 654 655

	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

656
	ret = do_select(n, &fds, end_time);
Linus Torvalds's avatar
Linus Torvalds committed
657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672

	if (ret < 0)
		goto out;
	if (!ret) {
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
673
	if (bits != stack_fds)
674
		kvfree(bits);
Linus Torvalds's avatar
Linus Torvalds committed
675 676 677 678
out_nofds:
	return ret;
}

679 680
/*
 * select(2): read the optional timeval, turn it into an absolute expiry,
 * run core_sys_select() and report the remaining time back through @tvp.
 */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timeval __user *, tvp)
{
	struct timespec64 end_time, *to = NULL;
	struct timeval tv;
	int ret;

	if (tvp) {
		time64_t sec;
		long nsec;

		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		/* fold whole seconds contained in tv_usec into the seconds */
		sec = tv.tv_sec + (tv.tv_usec / USEC_PER_SEC);
		nsec = (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC;

		if (poll_select_set_timeout(&end_time, sec, nsec))
			return -EINVAL;
		to = &end_time;
	}

	ret = core_sys_select(n, inp, outp, exp, to);

	return poll_select_copy_remaining(&end_time, tvp, 1, ret);
}

703 704 705
/*
 * Common body of pselect: optional timespec timeout, optional temporary
 * signal mask around the wait.
 *
 * Returns the core_sys_select() result as adjusted by
 * poll_select_copy_remaining(), -EFAULT on a failed user copy, or
 * -EINVAL for a bad timeout or a @sigsetsize other than sizeof(sigset_t).
 */
static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
		       fd_set __user *exp, struct timespec __user *tsp,
		       const sigset_t __user *sigmask, size_t sigsetsize)
{
	struct timespec64 ts64, end_time, *to = NULL;
	sigset_t ksigmask, sigsaved;
	struct timespec ts;
	int ret;

	if (tsp) {
		if (copy_from_user(&ts, tsp, sizeof(ts)))
			return -EFAULT;
		ts64 = timespec_to_timespec64(ts);

		if (poll_select_set_timeout(&end_time, ts64.tv_sec, ts64.tv_nsec))
			return -EINVAL;
		to = &end_time;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
			return -EFAULT;

		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
	}

	ret = core_sys_select(n, inp, outp, exp, to);
	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);

	if (!sigmask)
		return ret;

	if (ret != -ERESTARTNOHAND) {
		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
		return ret;
	}

	/*
	 * Don't restore the signal mask yet. Let do_signal() deliver
	 * the signal on the way back to userspace, before the signal
	 * mask is restored.
	 */
	memcpy(&current->saved_sigmask, &sigsaved, sizeof(sigsaved));
	set_restore_sigmask();

	return ret;
}

/*
 * Most architectures can't handle 7-argument syscalls. So we provide a
 * 6-argument version where the sixth argument is a pointer to a structure
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
 */
759 760 761
SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timespec __user *, tsp,
		void __user *, sig)
762 763 764 765 766 767
{
	size_t sigsetsize = 0;
	sigset_t __user *up = NULL;

	if (sig) {
		if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t))
768
		    || __get_user(up, (sigset_t __user * __user *)sig)
769
		    || __get_user(sigsetsize,
770
				(size_t __user *)(sig+sizeof(void *))))
771 772 773
			return -EFAULT;
	}

774
	return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize);
775 776
}

777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793
#ifdef __ARCH_WANT_SYS_OLD_SELECT
/* Argument block read from user space by old_select. */
struct sel_arg_struct {
	unsigned long n;
	fd_set __user *inp, *outp, *exp;
	struct timeval __user *tvp;
};

/*
 * Old-style select entry point: all five arguments arrive in one
 * user-space structure, which is copied in and forwarded to sys_select().
 */
SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
{
	struct sel_arg_struct args;

	if (copy_from_user(&args, arg, sizeof(args)))
		return -EFAULT;

	return sys_select(args.n, args.inp, args.outp, args.exp, args.tvp);
}
#endif

Linus Torvalds's avatar
Linus Torvalds committed
794 795 796 797 798 799 800 801
/*
 * One chunk of pollfd entries copied in from user space.
 *
 * @next:    next chunk, NULL at the end of the list
 * @len:     number of valid elements in @entries
 * @entries: the pollfd array; declared as a C99 flexible array member
 *           instead of a zero-length array, sizeof() is unchanged
 */
struct poll_list {
	struct poll_list *next;
	int len;
	struct pollfd entries[];
};

/* How many pollfd entries fit in one page after the poll_list header. */
#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

802 803 804 805 806
/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
807
 * if pwait->_qproc is non-NULL.
808
 */
809
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
810 811
				     bool *can_busy_poll,
				     unsigned int busy_flag)
Linus Torvalds's avatar
Linus Torvalds committed
812
{
813 814 815 816 817 818
	unsigned int mask;
	int fd;

	mask = 0;
	fd = pollfd->fd;
	if (fd >= 0) {
819
		struct fd f = fdget(fd);
820
		mask = POLLNVAL;
821
		if (f.file) {
822
			mask = DEFAULT_POLLMASK;
Al Viro's avatar
Al Viro committed
823
			if (f.file->f_op->poll) {
824
				pwait->_key = pollfd->events|POLLERR|POLLHUP;
825
				pwait->_key |= busy_flag;
826
				mask = f.file->f_op->poll(f.file, pwait);
827 828
				if (mask & busy_flag)
					*can_busy_poll = true;
829
			}
830 831
			/* Mask out unneeded events. */
			mask &= pollfd->events | POLLERR | POLLHUP;
832
			fdput(f);
Linus Torvalds's avatar
Linus Torvalds committed
833 834
		}
	}
835 836 837
	pollfd->revents = mask;

	return mask;
Linus Torvalds's avatar
Linus Torvalds committed
838 839
}

840
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
841
		   struct timespec64 *end_time)
Linus Torvalds's avatar
Linus Torvalds committed
842 843
{
	poll_table* pt = &wait->pt;
844 845
	ktime_t expire, *to = NULL;
	int timed_out = 0, count = 0;
846
	u64 slack = 0;
847
	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
848
	unsigned long busy_start = 0;
Linus Torvalds's avatar
Linus Torvalds committed
849

850
	/* Optimise the no-wait case */
851
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
852
		pt->_qproc = NULL;
853 854
		timed_out = 1;
	}
855

856
	if (end_time && !timed_out)
857
		slack = select_estimate_accuracy(end_time);
858

Linus Torvalds's avatar
Linus Torvalds committed
859 860
	for (;;) {
		struct poll_list *walk;
861
		bool can_busy_loop = false;
862

863 864 865 866 867 868 869 870
		for (walk = list; walk != NULL; walk = walk->next) {
			struct pollfd * pfd, * pfd_end;

			pfd = walk->entries;
			pfd_end = pfd + walk->len;
			for (; pfd != pfd_end; pfd++) {
				/*
				 * Fish for events. If we found one, record it
871
				 * and kill poll_table->_qproc, so we don't
872 873 874 875
				 * needlessly register any other waiters after
				 * this. They'll get immediately deregistered
				 * when we break out and return.
				 */
876 877
				if (do_pollfd(pfd, pt, &can_busy_loop,
					      busy_flag)) {
878
					count++;
879
					pt->_qproc = NULL;
880 881 882
					/* found something, stop busy polling */
					busy_flag = 0;
					can_busy_loop = false;
883 884
				}
			}
Linus Torvalds's avatar
Linus Torvalds committed
885
		}
886 887
		/*
		 * All waiters have already been registered, so don't provide
888
		 * a poll_table->_qproc to them on the next loop iteration.
889
		 */
890
		pt->_qproc = NULL;
891 892 893 894 895
		if (!count) {
			count = wait->error;
			if (signal_pending(current))
				count = -EINTR;
		}
896
		if (count || timed_out)
Linus Torvalds's avatar
Linus Torvalds committed
897
			break;
898

899
		/* only if found POLL_BUSY_LOOP sockets && not out of time */
900
		if (can_busy_loop && !need_resched()) {
901 902
			if (!busy_start) {
				busy_start = busy_loop_current_time();
903 904
				continue;
			}
905
			if (!busy_loop_timeout(busy_start))
906 907 908
				continue;
		}
		busy_flag = 0;
909

910 911 912 913 914 915
		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		if (end_time && !to) {
916
			expire = timespec64_to_ktime(*end_time);
917
			to = &expire;
918 919
		}

920
		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
921
			timed_out = 1;
Linus Torvalds's avatar
Linus Torvalds committed
922 923 924 925
	}
	return count;
}

926 927 928
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
			sizeof(struct pollfd))

929
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
930
		struct timespec64 *end_time)
Linus Torvalds's avatar
Linus Torvalds committed
931 932
{
	struct poll_wqueues table;
933
 	int err = -EFAULT, fdcount, len, size;
934 935 936 937
	/* Allocate small arguments on the stack to save memory and be
	   faster - use long to make sure the buffer is aligned properly
	   on 64 bit archs to avoid unaligned access */
	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
938 939 940
	struct poll_list *const head = (struct poll_list *)stack_pps;
 	struct poll_list *walk = head;
 	unsigned long todo = nfds;
Linus Torvalds's avatar
Linus Torvalds committed
941

Jiri Slaby's avatar
Jiri Slaby committed
942
	if (nfds > rlimit(RLIMIT_NOFILE))
Linus Torvalds's avatar
Linus Torvalds committed
943 944
		return -EINVAL;

945 946 947 948 949 950
	len = min_t(unsigned int, nfds, N_STACK_PPS);
	for (;;) {
		walk->next = NULL;
		walk->len = len;
		if (!len)
			break;
Linus Torvalds's avatar
Linus Torvalds committed
951

952 953 954 955 956 957 958
		if (copy_from_user(walk->entries, ufds + nfds-todo,
					sizeof(struct pollfd) * walk->len))
			goto out_fds;

		todo -= walk->len;
		if (!todo)
			break;
Linus Torvalds's avatar
Linus Torvalds committed
959

960 961 962 963 964
		len = min(todo, POLLFD_PER_PAGE);
		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
		walk = walk->next = kmalloc(size, GFP_KERNEL);
		if (!walk) {
			err = -ENOMEM;
Linus Torvalds's avatar
Linus Torvalds committed
965 966 967
			goto out_fds;
		}
	}
968

969
	poll_initwait(&table);
970
	fdcount = do_poll(head, &table, end_time);
971
	poll_freewait(&table);
Linus Torvalds's avatar
Linus Torvalds committed
972

973
	for (walk = head; walk; walk = walk->next) {
Linus Torvalds's avatar
Linus Torvalds committed
974 975 976
		struct pollfd *fds = walk->entries;
		int j;

977 978
		for (j = 0; j < walk->len; j++, ufds++)
			if (__put_user(fds[j].revents, &ufds->revents))
Linus Torvalds's avatar
Linus Torvalds committed
979 980
				goto out_fds;
  	}
981

Linus Torvalds's avatar
Linus Torvalds committed
982 983
	err = fdcount;
out_fds:
984 985 986 987 988
	walk = head->next;
	while (walk) {
		struct poll_list *pos = walk;
		walk = walk->next;
		kfree(pos);
Linus Torvalds's avatar
Linus Torvalds committed
989
	}
990

Linus Torvalds's avatar
Linus Torvalds committed
991 992
	return err;
}
993

994 995
static long do_restart_poll(struct restart_block *restart_block)
{
996 997
	struct pollfd __user *ufds = restart_block->poll.ufds;
	int nfds = restart_block->poll.nfds;
998
	struct timespec64 *to = NULL, end_time;
999 1000
	int ret;

1001 1002 1003 1004 1005 1006 1007 1008
	if (restart_block->poll.has_timeout) {
		end_time.tv_sec = restart_block->poll.tv_sec;
		end_time.tv_nsec = restart_block->poll.tv_nsec;
		to = &end_time;
	}

	ret = do_sys_poll(ufds, nfds, to);

1009 1010 1011 1012 1013 1014 1015
	if (ret == -EINTR) {
		restart_block->fn = do_restart_poll;
		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}

1016
/*
 * poll() system call.
 *
 * @ufds:          user array of pollfd entries
 * @nfds:          number of entries in @ufds
 * @timeout_msecs: timeout in milliseconds; a negative value means no
 *                 timeout is handed to do_sys_poll()
 *
 * Returns the result of do_sys_poll().  If that result is -EINTR, the
 * arguments (and the computed end time, if any) are stored in the current
 * task's restart block with do_restart_poll() as the handler, and
 * -ERESTART_RESTARTBLOCK is returned instead.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec64 deadline;
	struct timespec64 *timeout = NULL;
	struct restart_block *rb;
	int rc;

	if (timeout_msecs >= 0) {
		timeout = &deadline;
		/* Split the millisecond count into seconds + nanoseconds. */
		poll_select_set_timeout(timeout, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	rc = do_sys_poll(ufds, nfds, timeout);
	if (rc != -EINTR)
		return rc;

	/* Interrupted: record everything do_restart_poll() needs. */
	rb = &current->restart_block;
	rb->fn = do_restart_poll;
	rb->poll.ufds = ufds;
	rb->poll.nfds = nfds;

	if (timeout_msecs >= 0) {
		rb->poll.tv_sec = deadline.tv_sec;
		rb->poll.tv_nsec = deadline.tv_nsec;
		rb->poll.has_timeout = 1;
	} else {
		rb->poll.has_timeout = 0;
	}

	return -ERESTART_RESTARTBLOCK;
}

1050 1051 1052
/*
 * ppoll() system call: poll with an optional timespec timeout and an
 * optional temporary signal mask.
 *
 * @ufds:       user array of pollfd entries
 * @nfds:       number of entries in @ufds
 * @tsp:        optional user timespec; NULL means no timeout is passed on
 * @sigmask:    optional user signal mask to install for the duration
 * @sigsetsize: must equal sizeof(sigset_t) when @sigmask is given
 *
 * Returns -EFAULT if @tsp or @sigmask cannot be read, -EINVAL if the
 * timeout is rejected by poll_select_set_timeout() or @sigsetsize is wrong;
 * otherwise the value produced by poll_select_copy_remaining() from the
 * do_sys_poll() result (with -EINTR first mapped to -ERESTARTNOHAND).
 */
SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
		struct timespec __user *, tsp, const sigset_t __user *, sigmask,
		size_t, sigsetsize)
{
	struct timespec64 deadline, *timeout = NULL;
	struct timespec user_ts;
	sigset_t new_mask, old_mask;
	int rc;

	if (tsp) {
		if (copy_from_user(&user_ts, tsp, sizeof(user_ts)))
			return -EFAULT;

		timeout = &deadline;
		if (poll_select_set_timeout(timeout, user_ts.tv_sec,
					    user_ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		/* XXX: Don't preclude handling different sized sigset_t's.  */
		if (sigsetsize != sizeof(sigset_t))
			return -EINVAL;
		if (copy_from_user(&new_mask, sigmask, sizeof(new_mask)))
			return -EFAULT;

		/* SIGKILL and SIGSTOP are never allowed into the mask. */
		sigdelsetmask(&new_mask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &new_mask, &old_mask);
	}

	rc = do_sys_poll(ufds, nfds, timeout);

	if (rc != -EINTR) {
		/* Not interrupted: put the caller's mask back right away. */
		if (sigmask)
			sigprocmask(SIG_SETMASK, &old_mask, NULL);
	} else {
		/*
		 * We can restart this syscall, usually.  Keep the temporary
		 * mask in place for now so do_signal() can deliver the signal
		 * on the way back to userspace; the saved mask is restored
		 * after that.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &old_mask,
					sizeof(old_mask));
			set_restore_sigmask();
		}
		rc = -ERESTARTNOHAND;
	}

	return poll_select_copy_remaining(&deadline, tsp, 0, rc);
}
1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165

#ifdef CONFIG_COMPAT
#define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))

/*
 * Write the time remaining until @end_time back to the compat user buffer
 * @p and settle the final return value of a compat select/poll call.
 *
 * @end_time: absolute end time of the call
 * @p:        user buffer for the remaining time; NULL means nothing to do
 * @timeval:  non-zero to write a struct compat_timeval, zero to write a
 *            struct compat_timespec
 * @ret:      result of the select/poll core
 *
 * Returns @ret unchanged when @p is NULL, when the end time is all zero,
 * or when the remaining time was copied out successfully.  If the task has
 * STICKY_TIMEOUTS set, or the copy to user space fails, nothing further is
 * written and -ERESTARTNOHAND is converted to -EINTR.
 */
static
int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p,
				      int timeval, int ret)
{
	struct timespec now, left;

	if (!p)
		return ret;

	if (!(current->personality & STICKY_TIMEOUTS)) {
		/* No update for zero timeout */
		if (end_time->tv_sec == 0 && end_time->tv_nsec == 0)
			return ret;

		ktime_get_ts(&now);
		left = timespec_sub(*end_time, now);
		if (left.tv_sec < 0) {
			/* Already past the end time: report zero remaining. */
			left.tv_sec = 0;
			left.tv_nsec = 0;
		}

		if (timeval) {
			struct compat_timeval out_tv;

			out_tv.tv_sec = left.tv_sec;
			out_tv.tv_usec = left.tv_nsec / NSEC_PER_USEC;

			if (copy_to_user(p, &out_tv, sizeof(out_tv)) == 0)
				return ret;
		} else {
			struct compat_timespec out_ts;

			out_ts.tv_sec = left.tv_sec;
			out_ts.tv_nsec = left.tv_nsec;

			if (copy_to_user(p, &out_ts, sizeof(out_ts)) == 0)
				return ret;
		}
		/*
		 * The copy failed.  An application may keep its timeval in
		 * read-only memory, and the Linux-specific update must not
		 * turn an otherwise successful select into a fault.  But
		 * since the timeval was not updated, the system call cannot
		 * be restarted either.
		 */
	}

	/* Timeout not updated: a restart request becomes a plain -EINTR. */
	return ret == -ERESTARTNOHAND ? -EINTR : ret;
}

/*
 * Fetch one fd bitmap of a compat select call into @fdset.
 *
 * The user-side bitmap is made of compat (32-bit) unsigned longs while the
 * kernel-side one uses native unsigned longs, so the conversion is left to
 * compat_get_bitmap().  A NULL @ufdset is treated as an empty set: @fdset
 * is cleared via zero_fd_set() and 0 is returned.
 *
 * @nr is passed as the bit count to both helpers.  Returns the result of
 * compat_get_bitmap() when @ufdset is non-NULL.
 */
static
int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
			unsigned long *fdset)
{
	if (!ufdset) {
		zero_fd_set(nr, fdset);
		return 0;
	}

	return compat_get_bitmap(fdset, ufdset, nr);
}

/*
 * Store the kernel bitmap @fdset into the compat user bitmap @ufdset via
 * compat_put_bitmap(), passing @nr as the bit count.  A NULL @ufdset means
 * there is nothing to write and 0 is returned.
 */
static
int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
		      unsigned long *fdset)
{
	return ufdset ? compat_put_bitmap(ufdset, fdset, nr) : 0;
}


/*
 * This is a near-copy of the native select implementation found earlier in
 * this file (fs/select.c) and should be compared to it from time to time.
 */

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 */
/*
 * Common body of the compat select-family system calls.
 *
 * @n:        number of descriptors to examine; clamped to the current
 *            fdtable's max_fds
 * @inp:      user read set, may be NULL
 * @outp:     user write set, may be NULL
 * @exp:      user exception set, may be NULL
 * @end_time: end time handed to do_select(), or NULL
 *
 * Copies the three compat bitmaps in, runs do_select() and copies the
 * three result bitmaps back out.
 *
 * Returns the do_select() result on success, or a negative value:
 * -EINVAL if @n is negative, -ENOMEM if the bitmap buffer cannot be
 * allocated, the error from compat_get_fd_set() if a set cannot be read,
 * -ERESTARTNOHAND if do_select() returned 0 with a signal pending, or
 * -EFAULT if a result set cannot be written back.
 */
static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
	struct timespec *end_time)
{
	fd_set_bits fds;
	void *bits;
	int size, max_fds, ret = -EINVAL;
	struct fdtable *fdt;
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	fdt = files_fdtable(current->files);
	max_fds = fdt->max_fds;
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	size = FDS_BYTES(n);
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		/*
		 * Too large for the on-stack buffer.  kmalloc_array() checks
		 * the 6 * size multiplication for overflow instead of
		 * leaving it open-coded.
		 */
		bits = kmalloc_array(6, size, GFP_KERNEL);
		ret = -ENOMEM;
		if (!bits)
			goto out_nofds;
	}

	/* Carve the buffer into six consecutive bitmaps of size bytes each. */
	fds.in      = (unsigned long *)  bits;
	fds.out     = (unsigned long *) (bits +   size);
	fds.ex      = (unsigned long *) (bits + 2*size);
	fds.res_in  = (unsigned long *) (bits + 3*size);
	fds.res_out = (unsigned long *) (bits + 4*size);
	fds.res_ex  = (unsigned long *) (bits + 5*size);

	if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
	    (ret = compat_get_fd_set(n, outp, fds.out)) ||
	    (ret = compat_get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, end_time);

	if (ret < 0)
		goto out;
	if (!ret) {
		/* Nothing ready: ask for a restart if a signal is pending. */
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	if (compat_set_fd_set(n, inp, fds.res_in) ||
	    compat_set_fd_set(n, outp, fds.res_out) ||
	    compat_set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;
out:
	/* Only free the buffer if it did not come from the stack. */
	if (bits != stack_fds)
		kfree(bits);
out_nofds:
	return ret;
}

/*
 * Compat select() system call.
 *
 * @n, @inp, @outp, @exp are forwarded to compat_core_sys_select().
 * @tvp is an optional user struct compat_timeval; NULL means no timeout
 * is passed on.
 *
 * Returns -EFAULT if @tvp cannot be read, -EINVAL if the timeout is
 * rejected by poll_select_set_timeout(); otherwise the value produced by
 * compat_poll_select_copy_remaining() from the core's result.
 */
COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
	struct compat_timeval __user *, tvp)
{
	struct timespec deadline, *timeout = NULL;
	struct compat_timeval user_tv;
	int rc;

	if (tvp) {
		if (copy_from_user(&user_tv, tvp, sizeof(user_tv)))
			return -EFAULT;

		timeout = &deadline;
		/* Fold whole seconds held in tv_usec into the seconds part. */
		if (poll_select_set_timeout(timeout,
				user_tv.tv_sec + (user_tv.tv_usec / USEC_PER_SEC),
				(user_tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
			return -EINVAL;
	}

	rc = compat_core_sys_select(n, inp, outp, exp, timeout);

	return compat_poll_select_copy_remaining(&deadline, tvp, 1, rc);
}

/*
 * Argument block of the compat old_select system call: the five select
 * arguments packed into a single user-space structure.  The pointer members
 * are compat user pointers and are converted with compat_ptr() before use.
 */
struct compat_sel_arg_struct {
	compat_ulong_t n;	/* forwarded as select's n argument */
	compat_uptr_t inp;	/* forwarded as select's inp argument */
	compat_uptr_t outp;	/* forwarded as select's outp argument */
	compat_uptr_t exp;	/* forwarded as select's exp argument */
	compat_uptr_t tvp;	/* forwarded as select's tvp argument */
};

/*
 * Compat old_select() system call: select with all arguments passed in one
 * user-space structure.
 *
 * Copies the structure from @arg and forwards its members to
 * compat_sys_select().  Returns -EFAULT if @arg cannot be read, otherwise
 * the result of compat_sys_select().
 */
COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
{
	struct compat_sel_arg_struct args;

	if (copy_from_user(&args, arg, sizeof(args)))
		return -EFAULT;

	return compat_sys_select(args.n,
				 compat_ptr(args.inp),
				 compat_ptr(args.outp),
				 compat_ptr(args.exp),
				 compat_ptr(args.tvp));
}

/*
 * Common body of the compat pselect system call: select with an optional
 * compat_timespec timeout and an optional temporary signal mask.
 *
 * @n, @inp, @outp, @exp are forwarded to compat_core_sys_select().
 * @tsp:        optional user timeout; NULL means no timeout is passed on
 * @sigmask:    optional user compat signal mask to install for the duration
 * @sigsetsize: must equal sizeof(compat_sigset_t) when @sigmask is given
 *
 * Returns -EFAULT if @tsp or @sigmask cannot be read, -EINVAL if the
 * timeout is rejected by poll_select_set_timeout() or @sigsetsize is wrong;
 * otherwise the value produced by compat_poll_select_copy_remaining() from
 * the core's result.
 */
static long do_compat_pselect(int n, compat_ulong_t __user *inp,
	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
	compat_size_t sigsetsize)
{
	struct timespec deadline, *timeout = NULL;
	struct compat_timespec user_ts;
	compat_sigset_t user_mask;
	sigset_t new_mask, old_mask;
	int rc;

	if (tsp) {
		if (copy_from_user(&user_ts, tsp, sizeof(user_ts)))
			return -EFAULT;

		timeout = &deadline;
		if (poll_select_set_timeout(timeout, user_ts.tv_sec,
					    user_ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		if (sigsetsize != sizeof(compat_sigset_t))
			return -EINVAL;
		if (copy_from_user(&user_mask, sigmask, sizeof(user_mask)))
			return -EFAULT;
		sigset_from_compat(&new_mask, &user_mask);

		/* SIGKILL and SIGSTOP are never allowed into the mask. */
		sigdelsetmask(&new_mask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &new_mask, &old_mask);
	}

	rc = compat_core_sys_select(n, inp, outp, exp, timeout);
	rc = compat_poll_select_copy_remaining(&deadline, tsp, 0, rc);

	/* No temporary mask was installed, so there is nothing to undo. */
	if (!sigmask)
		return rc;

	if (rc == -ERESTARTNOHAND) {
		/*
		 * Don't restore the signal mask yet. Let do_signal() deliver
		 * the signal on the way back to userspace, before the signal
		 * mask is restored.
		 */
		memcpy(&current->saved_sigmask, &old_mask,
				sizeof(old_mask));
		set_restore_sigmask();
	} else {
		sigprocmask(SIG_SETMASK, &old_mask, NULL);
	}

	return rc;
}

/*
 * Compat pselect6() system call.
 *
 * @sig, when non-NULL, points to a user-space pair laid out as a
 * compat_uptr_t (the signal mask pointer) immediately followed by a
 * compat_size_t (the signal set size).  Both are read here and handed to
 * do_compat_pselect(); with a NULL @sig a NULL mask and size 0 are passed.
 *
 * Returns -EFAULT if the pair cannot be read, otherwise the result of
 * do_compat_pselect().
 */
COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
	struct compat_timespec __user *, tsp, void __user *, sig)
{
	compat_uptr_t mask_uptr = 0;
	compat_size_t mask_size = 0;

	if (sig) {
		/* Validate the whole pair once, then fetch each half. */
		if (!access_ok(VERIFY_READ, sig,
				sizeof(compat_uptr_t)+sizeof(compat_size_t)))
			return -EFAULT;
		if (__get_user(mask_uptr, (compat_uptr_t __user *)sig))
			return -EFAULT;
		if (__get_user(mask_size,
				(compat_size_t __user *)(sig+sizeof(mask_uptr))))
			return -EFAULT;
	}

	return do_compat_pselect(n, inp, outp, exp, tsp,
				 compat_ptr(mask_uptr), mask_size);
}

/*
 * Compat ppoll() system call: poll with an optional compat_timespec
 * timeout and an optional temporary signal mask.
 *
 * @ufds:       user array of pollfd entries
 * @nfds:       number of entries in @ufds
 * @tsp:        optional user timeout; NULL means no timeout is passed on
 * @sigmask:    optional user compat signal mask to install for the duration
 * @sigsetsize: must equal sizeof(compat_sigset_t) when @sigmask is given
 *
 * Returns -EFAULT if @tsp or @sigmask cannot be read, -EINVAL if the
 * timeout is rejected by poll_select_set_timeout() or @sigsetsize is wrong;
 * otherwise the value produced by compat_poll_select_copy_remaining() from
 * the do_sys_poll() result (with -EINTR first mapped to -ERESTARTNOHAND).
 */
COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
	unsigned int,  nfds, struct compat_timespec __user *, tsp,
	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
	struct timespec deadline, *timeout = NULL;
	struct compat_timespec user_ts;
	compat_sigset_t user_mask;
	sigset_t new_mask, old_mask;
	int rc;

	if (tsp) {
		if (copy_from_user(&user_ts, tsp, sizeof(user_ts)))
			return -EFAULT;

		timeout = &deadline;
		if (poll_select_set_timeout(timeout, user_ts.tv_sec,
					    user_ts.tv_nsec))
			return -EINVAL;
	}

	if (sigmask) {
		if (sigsetsize != sizeof(compat_sigset_t))
			return -EINVAL;
		if (copy_from_user(&user_mask, sigmask, sizeof(user_mask)))
			return -EFAULT;
		sigset_from_compat(&new_mask, &user_mask);

		/* SIGKILL and SIGSTOP are never allowed into the mask. */
		sigdelsetmask(&new_mask, sigmask(SIGKILL)|sigmask(SIGSTOP));
		sigprocmask(SIG_SETMASK, &new_mask, &old_mask);
	}

	rc = do_sys_poll(ufds, nfds, timeout);

	if (rc != -EINTR) {
		/* Not interrupted: put the caller's mask back right away. */
		if (sigmask)
			sigprocmask(SIG_SETMASK, &old_mask, NULL);
	} else {
		/*
		 * We can restart this syscall, usually.  Keep the temporary
		 * mask in place for now so do_signal() can deliver the signal
		 * on the way back to userspace; the saved mask is restored
		 * after that.
		 */
		if (sigmask) {
			memcpy(&current->saved_sigmask, &old_mask,
				sizeof(old_mask));
			set_restore_sigmask();
		}
		rc = -ERESTARTNOHAND;
	}

	return compat_poll_select_copy_remaining(&deadline, tsp, 0, rc);
}
#endif