// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
19
#include <linux/export.h>
20
#include <linux/kernel.h>
Linus Torvalds's avatar
Linus Torvalds committed
21 22 23 24
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
Robert Love's avatar
Robert Love committed
25
#include <linux/fsnotify.h>
Linus Torvalds's avatar
Linus Torvalds committed
26 27
#include <linux/personality.h>
#include <linux/security.h>
Mimi Zohar's avatar
Mimi Zohar committed
28
#include <linux/ima.h>
Linus Torvalds's avatar
Linus Torvalds committed
29 30 31
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
32
#include <linux/capability.h>
33
#include <linux/file.h>
34
#include <linux/fcntl.h>
35
#include <linux/device_cgroup.h>
36
#include <linux/fs_struct.h>
37
#include <linux/posix_acl.h>
38
#include <linux/hash.h>
39
#include <linux/bitops.h>
40
#include <linux/init_task.h>
41
#include <linux/uaccess.h>
Linus Torvalds's avatar
Linus Torvalds committed
42

43
#include "internal.h"
44
#include "mount.h"
45

Linus Torvalds's avatar
Linus Torvalds committed
46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *	inside the path - always follow.
 *	in the last component in creation/removal/renaming - never follow.
 *	if LOOKUP_FOLLOW passed - follow.
 *	if the pathname has trailing slashes - follow.
 *	otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */
124

Al Viro's avatar
Al Viro committed
125
/*
 * Bytes available for a pathname stored in iname[] of a struct filename:
 * PATH_MAX less everything that precedes iname in the structure.
 */
#define EMBEDDED_NAME_MAX	(PATH_MAX - offsetof(struct filename, iname))
126

127
struct filename *
128 129
getname_flags(const char __user *filename, int flags, int *empty)
{
130
	struct filename *result;
131
	char *kname;
132
	int len;
133

134 135 136 137
	result = audit_reusename(filename);
	if (result)
		return result;

138
	result = __getname();
139
	if (unlikely(!result))
140 141
		return ERR_PTR(-ENOMEM);

142 143 144 145
	/*
	 * First, try to embed the struct filename inside the names_cache
	 * allocation
	 */
Al Viro's avatar
Al Viro committed
146
	kname = (char *)result->iname;
147
	result->name = kname;
148

149
	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
150
	if (unlikely(len < 0)) {
151 152
		__putname(result);
		return ERR_PTR(len);
153
	}
154

155 156 157 158 159 160
	/*
	 * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
	 * separate struct filename so we can dedicate the entire
	 * names_cache allocation for the pathname, and re-do the copy from
	 * userland.
	 */
161
	if (unlikely(len == EMBEDDED_NAME_MAX)) {
Al Viro's avatar
Al Viro committed
162
		const size_t size = offsetof(struct filename, iname[1]);
163 164
		kname = (char *)result;

Al Viro's avatar
Al Viro committed
165 166 167 168 169 170
		/*
		 * size is chosen that way we to guarantee that
		 * result->iname[0] is within the same object and that
		 * kname can't be equal to result->iname, no matter what.
		 */
		result = kzalloc(size, GFP_KERNEL);
171 172 173
		if (unlikely(!result)) {
			__putname(kname);
			return ERR_PTR(-ENOMEM);
174 175
		}
		result->name = kname;
176 177 178 179 180 181 182 183 184 185 186
		len = strncpy_from_user(kname, filename, PATH_MAX);
		if (unlikely(len < 0)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(len);
		}
		if (unlikely(len == PATH_MAX)) {
			__putname(kname);
			kfree(result);
			return ERR_PTR(-ENAMETOOLONG);
		}
187 188
	}

189
	result->refcnt = 1;
190 191 192
	/* The empty path is special. */
	if (unlikely(!len)) {
		if (empty)
193
			*empty = 1;
194 195 196 197
		if (!(flags & LOOKUP_EMPTY)) {
			putname(result);
			return ERR_PTR(-ENOENT);
		}
Linus Torvalds's avatar
Linus Torvalds committed
198
	}
199

200
	result->uptr = filename;
201
	result->aname = NULL;
202 203
	audit_getname(result);
	return result;
Linus Torvalds's avatar
Linus Torvalds committed
204 205
}

206 207
/*
 * getname - copy a userspace pathname with default handling
 * @filename: userspace pointer to the pathname
 *
 * Shorthand for getname_flags() with no flags and no "empty" out-parameter.
 */
struct filename *getname(const char __user *filename)
{
	return getname_flags(filename, 0, NULL);
}

212 213 214 215
struct filename *
getname_kernel(const char * filename)
{
	struct filename *result;
216
	int len = strlen(filename) + 1;
217 218 219 220 221

	result = __getname();
	if (unlikely(!result))
		return ERR_PTR(-ENOMEM);

222
	if (len <= EMBEDDED_NAME_MAX) {
Al Viro's avatar
Al Viro committed
223
		result->name = (char *)result->iname;
224
	} else if (len <= PATH_MAX) {
225
		const size_t size = offsetof(struct filename, iname[1]);
226 227
		struct filename *tmp;

228
		tmp = kmalloc(size, GFP_KERNEL);
229 230 231 232 233 234 235 236 237 238 239
		if (unlikely(!tmp)) {
			__putname(result);
			return ERR_PTR(-ENOMEM);
		}
		tmp->name = (char *)result;
		result = tmp;
	} else {
		__putname(result);
		return ERR_PTR(-ENAMETOOLONG);
	}
	memcpy((char *)result->name, filename, len);
240 241
	result->uptr = NULL;
	result->aname = NULL;
242
	result->refcnt = 1;
243
	audit_getname(result);
244 245 246 247

	return result;
}

248
void putname(struct filename *name)
Linus Torvalds's avatar
Linus Torvalds committed
249
{
250 251 252 253 254
	BUG_ON(name->refcnt <= 0);

	if (--name->refcnt > 0)
		return;

Al Viro's avatar
Al Viro committed
255
	if (name->name != name->iname) {
256 257 258 259
		__putname(name->name);
		kfree(name);
	} else
		__putname(name);
Linus Torvalds's avatar
Linus Torvalds committed
260 261
}

262 263
static int check_acl(struct inode *inode, int mask)
{
264
#ifdef CONFIG_FS_POSIX_ACL
265 266 267
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
268 269
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
	        if (!acl)
270
	                return -EAGAIN;
271
		/* no ->get_acl() calls in RCU mode... */
272
		if (is_uncached_acl(acl))
273
			return -ECHILD;
274
	        return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
275 276
	}

277 278 279
	acl = get_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
280 281 282 283 284
	if (acl) {
	        int error = posix_acl_permission(inode, acl, mask);
	        posix_acl_release(acl);
	        return error;
	}
285
#endif
286 287 288 289

	return -EAGAIN;
}

290
/*
291
 * This does the basic permission checking
Linus Torvalds's avatar
Linus Torvalds committed
292
 */
293
static int acl_permission_check(struct inode *inode, int mask)
Linus Torvalds's avatar
Linus Torvalds committed
294
{
295
	unsigned int mode = inode->i_mode;
Linus Torvalds's avatar
Linus Torvalds committed
296

297
	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
Linus Torvalds's avatar
Linus Torvalds committed
298 299
		mode >>= 6;
	else {
300
		if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
301
			int error = check_acl(inode, mask);
302 303
			if (error != -EAGAIN)
				return error;
Linus Torvalds's avatar
Linus Torvalds committed
304 305 306 307 308 309 310 311 312
		}

		if (in_group_p(inode->i_gid))
			mode >>= 3;
	}

	/*
	 * If the DACs are ok we don't need any capability check.
	 */
313
	if ((mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
Linus Torvalds's avatar
Linus Torvalds committed
314
		return 0;
315 316 317 318
	return -EACCES;
}

/**
319
 * generic_permission -  check for access rights on a Posix-like filesystem
320
 * @inode:	inode to check access rights for
321
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
322 323 324 325
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
326 327 328 329 330
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
331
 */
332
int generic_permission(struct inode *inode, int mask)
333 334 335 336
{
	int ret;

	/*
337
	 * Do the basic permission checks.
338
	 */
339
	ret = acl_permission_check(inode, mask);
340 341
	if (ret != -EACCES)
		return ret;
Linus Torvalds's avatar
Linus Torvalds committed
342

343 344 345
	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (!(mask & MAY_WRITE))
346 347
			if (capable_wrt_inode_uidgid(inode,
						     CAP_DAC_READ_SEARCH))
348
				return 0;
349
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
Linus Torvalds's avatar
Linus Torvalds committed
350
			return 0;
351 352
		return -EACCES;
	}
Linus Torvalds's avatar
Linus Torvalds committed
353 354 355 356

	/*
	 * Searching includes executable on directories, else just read.
	 */
357
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
358
	if (mask == MAY_READ)
359
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
Linus Torvalds's avatar
Linus Torvalds committed
360
			return 0;
361 362 363 364 365 366 367 368
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
			return 0;
Linus Torvalds's avatar
Linus Torvalds committed
369 370 371

	return -EACCES;
}
372
EXPORT_SYMBOL(generic_permission);
Linus Torvalds's avatar
Linus Torvalds committed
373

374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393
/*
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has no special
 * permission function, use the fast case".
 */
static inline int do_inode_permission(struct inode *inode, int mask)
{
	/* Fast case: already known to have no ->permission method. */
	if (likely(inode->i_opflags & IOP_FASTPERM))
		return generic_permission(inode, mask);

	if (likely(inode->i_op->permission))
		return inode->i_op->permission(inode, mask);

	/* This gets set once for the inode lifetime */
	spin_lock(&inode->i_lock);
	inode->i_opflags |= IOP_FASTPERM;
	spin_unlock(&inode->i_lock);

	return generic_permission(inode, mask);
}

Christoph Hellwig's avatar
Christoph Hellwig committed
394
/**
395 396 397
 * __inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
Christoph Hellwig's avatar
Christoph Hellwig committed
398
 *
399
 * Check for read/write/execute permissions on an inode.
400 401
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
402 403 404
 *
 * This does not check for a read-only file system.  You probably want
 * inode_permission().
Christoph Hellwig's avatar
Christoph Hellwig committed
405
 */
406
int __inode_permission(struct inode *inode, int mask)
Linus Torvalds's avatar
Linus Torvalds committed
407
{
408
	int retval;
Linus Torvalds's avatar
Linus Torvalds committed
409

410
	if (unlikely(mask & MAY_WRITE)) {
Linus Torvalds's avatar
Linus Torvalds committed
411 412 413 414
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (IS_IMMUTABLE(inode))
415
			return -EPERM;
416 417 418 419 420 421 422 423

		/*
		 * Updating mtime will likely cause i_uid and i_gid to be
		 * written back improperly if their true value is unknown
		 * to the vfs.
		 */
		if (HAS_UNMAPPED_ID(inode))
			return -EACCES;
Linus Torvalds's avatar
Linus Torvalds committed
424 425
	}

426
	retval = do_inode_permission(inode, mask);
Linus Torvalds's avatar
Linus Torvalds committed
427 428 429
	if (retval)
		return retval;

430 431 432 433
	retval = devcgroup_inode_permission(inode, mask);
	if (retval)
		return retval;

434
	return security_inode_permission(inode, mask);
Linus Torvalds's avatar
Linus Torvalds committed
435
}
436
EXPORT_SYMBOL(__inode_permission);
Linus Torvalds's avatar
Linus Torvalds committed
437

438 439 440
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
441
 * @inode: Inode to check permission on
442 443 444 445 446 447 448 449 450 451
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	if (unlikely(mask & MAY_WRITE)) {
		umode_t mode = inode->i_mode;

		/* Nobody gets write access to a read-only fs. */
452
		if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477
			return -EROFS;
	}
	return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 *
 * The superblock-level check runs first; __inode_permission() is consulted
 * only when it succeeds.
 */
int inode_permission(struct inode *inode, int mask)
{
	int retval = sb_permission(inode->i_sb, inode, mask);

	return retval ? retval : __inode_permission(inode, mask);
}
EXPORT_SYMBOL(inode_permission);
479

Jan Blunck's avatar
Jan Blunck committed
480 481 482 483 484 485
/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Given a path increment the reference count to the dentry and the vfsmount.
 */
486
void path_get(const struct path *path)
Jan Blunck's avatar
Jan Blunck committed
487 488 489 490 491 492
{
	mntget(path->mnt);
	dget(path->dentry);
}
EXPORT_SYMBOL(path_get);

Jan Blunck's avatar
Jan Blunck committed
493 494 495 496 497 498
/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Given a path decrement the reference count to the dentry and the vfsmount.
 */
499
void path_put(const struct path *path)
Linus Torvalds's avatar
Linus Torvalds committed
500
{
Jan Blunck's avatar
Jan Blunck committed
501 502
	dput(path->dentry);
	mntput(path->mnt);
Linus Torvalds's avatar
Linus Torvalds committed
503
}
Jan Blunck's avatar
Jan Blunck committed
504
EXPORT_SYMBOL(path_put);
Linus Torvalds's avatar
Linus Torvalds committed
505

506
#define EMBEDDED_LEVELS 2
507 508
struct nameidata {
	struct path	path;
Al Viro's avatar
Al Viro committed
509
	struct qstr	last;
510 511 512
	struct path	root;
	struct inode	*inode; /* path.dentry.d_inode */
	unsigned int	flags;
513
	unsigned	seq, m_seq;
514 515
	int		last_type;
	unsigned	depth;
516
	int		total_link_count;
517 518
	struct saved {
		struct path link;
519
		struct delayed_call done;
520
		const char *name;
521
		unsigned seq;
522
	} *stack, internal[EMBEDDED_LEVELS];
523 524
	struct filename	*name;
	struct nameidata *saved;
525
	struct inode	*link_inode;
526 527
	unsigned	root_seq;
	int		dfd;
528
} __randomize_layout;
529

530
static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
531
{
532 533
	struct nameidata *old = current->nameidata;
	p->stack = p->internal;
534 535
	p->dfd = dfd;
	p->name = name;
536
	p->total_link_count = old ? old->total_link_count : 0;
537
	p->saved = old;
538
	current->nameidata = p;
539 540
}

541
static void restore_nameidata(void)
542
{
543
	struct nameidata *now = current->nameidata, *old = now->saved;
544 545 546 547

	current->nameidata = old;
	if (old)
		old->total_link_count = now->total_link_count;
548
	if (now->stack != now->internal)
549
		kfree(now->stack);
550 551 552 553
}

static int __nd_alloc_stack(struct nameidata *nd)
{
554 555 556 557 558 559 560 561 562
	struct saved *p;

	if (nd->flags & LOOKUP_RCU) {
		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
				  GFP_ATOMIC);
		if (unlikely(!p))
			return -ECHILD;
	} else {
		p= kmalloc(MAXSYMLINKS * sizeof(struct saved),
563
				  GFP_KERNEL);
564 565 566
		if (unlikely(!p))
			return -ENOMEM;
	}
567 568 569 570 571
	memcpy(p, nd->internal, sizeof(nd->internal));
	nd->stack = p;
	return 0;
}

572 573 574 575 576 577 578 579 580 581
/**
 * path_connected - Verify that a path->dentry is below path->mnt.mnt_root
 * @path: nameidate to verify
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 */
static bool path_connected(const struct path *path)
{
	struct vfsmount *mnt = path->mnt;
582
	struct super_block *sb = mnt->mnt_sb;
583

584 585
	/* Bind mounts and multi-root filesystems can have disconnected paths */
	if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
586 587 588 589 590
		return true;

	return is_subdir(path->dentry, mnt->mnt_root);
}

591 592
static inline int nd_alloc_stack(struct nameidata *nd)
{
593
	if (likely(nd->depth != EMBEDDED_LEVELS))
594 595 596 597 598 599
		return 0;
	if (likely(nd->stack != nd->internal))
		return 0;
	return __nd_alloc_stack(nd);
}

600 601 602 603 604
/*
 * Run and then clear the delayed call of every saved symlink, from the
 * most recently pushed entry down to the first.
 */
static void drop_links(struct nameidata *nd)
{
	int i;

	for (i = nd->depth; i > 0; i--) {
		struct saved *last = &nd->stack[i - 1];

		do_delayed_call(&last->done);
		clear_delayed_call(&last->done);
	}
}

/*
 * Finish a path walk and release what it holds.
 *
 * In rcu-walk mode this clears LOOKUP_RCU, forgets the root (unless
 * LOOKUP_ROOT is set) and drops the RCU read lock.  In ref-walk mode it
 * puts the references on nd->path, on every saved symlink and, unless
 * LOOKUP_ROOT is set, on nd->root.  The symlink depth is reset to 0 in
 * both cases.
 */
static void terminate_walk(struct nameidata *nd)
{
	drop_links(nd);

	if (nd->flags & LOOKUP_RCU) {
		nd->flags &= ~LOOKUP_RCU;
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
		rcu_read_unlock();
	} else {
		int i;

		path_put(&nd->path);
		for (i = 0; i < nd->depth; i++)
			path_put(&nd->stack[i].link);
		if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
			path_put(&nd->root);
			nd->root.mnt = NULL;
		}
	}

	nd->depth = 0;
}

/* path_put is needed afterwards regardless of success or failure */
static bool legitimize_path(struct nameidata *nd,
			    struct path *path, unsigned seq)
{
	int res = __legitimize_mnt(path->mnt, nd->m_seq);
	if (unlikely(res)) {
		if (res > 0)
			path->mnt = NULL;
		path->dentry = NULL;
		return false;
	}
	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
		path->dentry = NULL;
		return false;
	}
	return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

/*
 * Legitimize every saved symlink path, oldest first.
 *
 * On the first failure the delayed calls of all links are dropped and
 * nd->depth is cut back to include the failing entry; false is returned.
 * Returns true when every entry was legitimized.
 */
static bool legitimize_links(struct nameidata *nd)
{
	int i;

	for (i = 0; i < nd->depth; i++) {
		struct saved *last = &nd->stack[i];

		if (likely(legitimize_path(nd, &last->link, last->seq)))
			continue;

		drop_links(nd);
		nd->depth = i + 1;
		return false;
	}

	return true;
}

663
/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
675 676
 * unlazy_walk - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
677
 * Returns: 0 on success, -ECHILD on failure
678
 *
Al Viro's avatar
Al Viro committed
679 680 681
 * unlazy_walk attempts to legitimize the current nd->path and nd->root
 * for ref-walk mode.
 * Must be called from rcu-walk context.
682 683
 * Nothing should touch nameidata between unlazy_walk() failure and
 * terminate_walk().
684
 */
Al Viro's avatar
Al Viro committed
685
static int unlazy_walk(struct nameidata *nd)
686 687 688 689
{
	struct dentry *parent = nd->path.dentry;

	BUG_ON(!(nd->flags & LOOKUP_RCU));
690

Al Viro's avatar
Al Viro committed
691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731
	nd->flags &= ~LOOKUP_RCU;
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
		goto out1;
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq)))
			goto out;
	}
	rcu_read_unlock();
	BUG_ON(nd->inode != parent->d_inode);
	return 0;

out2:
	nd->path.mnt = NULL;
	nd->path.dentry = NULL;
out1:
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
out:
	rcu_read_unlock();
	return -ECHILD;
}

/**
 * unlazy_child - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: child of nd->path.dentry
 * @seq: seq number to check dentry against
 * Returns: 0 on success, -ECHILD on failure
 *
 * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
 * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
 * @nd.  Must be called from rcu-walk context.
 * Nothing should touch nameidata between unlazy_child() failure and
 * terminate_walk().
 */
static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq)
{
	BUG_ON(!(nd->flags & LOOKUP_RCU));

732
	nd->flags &= ~LOOKUP_RCU;
733 734 735 736
	if (unlikely(!legitimize_links(nd)))
		goto out2;
	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
		goto out2;
Al Viro's avatar
Al Viro committed
737
	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
738
		goto out1;
Al Viro's avatar
Al Viro committed
739

740
	/*
Al Viro's avatar
Al Viro committed
741 742 743 744 745
	 * We need to move both the parent and the dentry from the RCU domain
	 * to be properly refcounted. And the sequence number in the dentry
	 * validates *both* dentry counters, since we checked the sequence
	 * number of the parent after we got the child sequence number. So we
	 * know the parent must still be valid if the child sequence number is
746
	 */
Al Viro's avatar
Al Viro committed
747 748 749 750 751 752
	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
		goto out;
	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) {
		rcu_read_unlock();
		dput(dentry);
		goto drop_root_mnt;
753
	}
754 755 756 757 758
	/*
	 * Sequence counts matched. Now make sure that the root is
	 * still valid and get it if required.
	 */
	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
759 760 761 762
		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
			rcu_read_unlock();
			dput(dentry);
			return -ECHILD;
763
		}
764 765
	}

766
	rcu_read_unlock();
767
	return 0;
768

769 770 771 772
out2:
	nd->path.mnt = NULL;
out1:
	nd->path.dentry = NULL;
773
out:
774
	rcu_read_unlock();
775 776 777
drop_root_mnt:
	if (!(nd->flags & LOOKUP_ROOT))
		nd->root.mnt = NULL;
778 779 780
	return -ECHILD;
}

781
static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
782
{
783 784 785 786
	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
		return dentry->d_op->d_revalidate(dentry, flags);
	else
		return 1;
787 788
}

789 790 791
/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
792
 *
793 794 795 796 797
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
798
 */
799
static int complete_walk(struct nameidata *nd)
800
{
801
	struct dentry *dentry = nd->path.dentry;
802 803
	int status;

804 805 806
	if (nd->flags & LOOKUP_RCU) {
		if (!(nd->flags & LOOKUP_ROOT))
			nd->root.mnt = NULL;
Al Viro's avatar
Al Viro committed
807
		if (unlikely(unlazy_walk(nd)))
808 809 810
			return -ECHILD;
	}

811 812 813
	if (likely(!(nd->flags & LOOKUP_JUMPED)))
		return 0;

814
	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
815 816
		return 0;

817
	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
818 819 820
	if (status > 0)
		return 0;

821
	if (!status)
822
		status = -ESTALE;
823

824 825 826
	return status;
}

827
static void set_root(struct nameidata *nd)
828
{
829
	struct fs_struct *fs = current->fs;
Nick Piggin's avatar
Nick Piggin committed
830

831 832 833 834 835 836 837 838 839 840 841
	if (nd->flags & LOOKUP_RCU) {
		unsigned seq;

		do {
			seq = read_seqcount_begin(&fs->seq);
			nd->root = fs->root;
			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
		} while (read_seqcount_retry(&fs->seq, seq));
	} else {
		get_fs_root(fs, &nd->root);
	}
842 843
}

Jan Blunck's avatar
Jan Blunck committed
844
static void path_put_conditional(struct path *path, struct nameidata *nd)
845 846
{
	dput(path->dentry);
847
	if (path->mnt != nd->path.mnt)
848 849 850
		mntput(path->mnt);
}

851 852
static inline void path_to_nameidata(const struct path *path,
					struct nameidata *nd)
853
{
854 855 856 857
	if (!(nd->flags & LOOKUP_RCU)) {
		dput(nd->path.dentry);
		if (nd->path.mnt != path->mnt)
			mntput(nd->path.mnt);
858
	}
859
	nd->path.mnt = path->mnt;
860
	nd->path.dentry = path->dentry;
861 862
}

863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882
/*
 * Move the walk to nd->root and set LOOKUP_JUMPED.
 *
 * In rcu-walk mode the root's saved sequence number is re-checked and
 * -ECHILD is returned if it no longer matches.  In ref-walk mode the
 * reference on the old path is exchanged for one on the root.
 * Returns 0 on success.
 */
static int nd_jump_root(struct nameidata *nd)
{
	if (!(nd->flags & LOOKUP_RCU)) {
		path_put(&nd->path);
		nd->path = nd->root;
		path_get(&nd->path);
		nd->inode = nd->path.dentry->d_inode;
	} else {
		struct dentry *root_dentry;

		nd->path = nd->root;
		root_dentry = nd->path.dentry;
		nd->inode = root_dentry->d_inode;
		nd->seq = nd->root_seq;
		if (unlikely(read_seqcount_retry(&root_dentry->d_seq, nd->seq)))
			return -ECHILD;
	}

	nd->flags |= LOOKUP_JUMPED;
	return 0;
}

Christoph Hellwig's avatar
Christoph Hellwig committed
883
/*
884
 * Helper to directly jump to a known parsed path from ->get_link,
Christoph Hellwig's avatar
Christoph Hellwig committed
885 886
 * caller must have taken a reference to path beforehand.
 */
887
void nd_jump_link(struct path *path)
Christoph Hellwig's avatar
Christoph Hellwig committed
888
{
889
	struct nameidata *nd = current->nameidata;
Christoph Hellwig's avatar
Christoph Hellwig committed
890 891 892 893 894 895 896
	path_put(&nd->path);

	nd->path = *path;
	nd->inode = nd->path.dentry->d_inode;
	nd->flags |= LOOKUP_JUMPED;
}

897
static inline void put_link(struct nameidata *nd)
898
{
899
	struct saved *last = nd->stack + --nd->depth;
900
	do_delayed_call(&last->done);
901 902
	if (!(nd->flags & LOOKUP_RCU))
		path_put(&last->link);
903 904
}

905 906
/*
 * Link protection switches, both 0 by default.  While
 * sysctl_protected_symlinks is 0, may_follow_link() allows every symlink
 * without further checks.  The code consulting sysctl_protected_hardlinks
 * is outside this part of the file.
 */
int sysctl_protected_symlinks __read_mostly = 0;
int sysctl_protected_hardlinks __read_mostly = 0;
Kees Cook's avatar
Kees Cook committed
907 908 909

/**
 * may_follow_link - Check symlink following for unsafe situations
910
 * @nd: nameidata pathwalk data
Kees Cook's avatar
Kees Cook committed
911 912 913 914 915 916 917 918 919 920 921 922
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
923
static inline int may_follow_link(struct nameidata *nd)
Kees Cook's avatar
Kees Cook committed
924 925 926
{
	const struct inode *inode;
	const struct inode *parent;
927
	kuid_t puid;
Kees Cook's avatar
Kees Cook committed
928 929 930 931 932

	if (!sysctl_protected_symlinks)
		return 0;

	/* Allowed if owner and follower match. */
933
	inode = nd->link_inode;
934
	if (uid_eq(current_cred()->fsuid, inode->i_uid))
Kees Cook's avatar
Kees Cook committed
935 936 937
		return 0;

	/* Allowed if parent directory not sticky and world-writable. */
938
	parent = nd->inode;
Kees Cook's avatar
Kees Cook committed
939 940 941 942
	if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
		return 0;

	/* Allowed if parent directory and link owner match. */
943 944
	puid = parent->i_uid;
	if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
Kees Cook's avatar
Kees Cook committed
945 946
		return 0;

947 948 949
	if (nd->flags & LOOKUP_RCU)
		return -ECHILD;

Al Viro's avatar
Al Viro committed
950
	audit_log_link_denied("follow_link", &nd->stack[0].link);
Kees Cook's avatar
Kees Cook committed
951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996
	return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @inode: the source inode to hardlink from
 *
 * A source is considered unsafe, and false is returned, if any of
 * these is true:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - the read+write permission check on the inode fails
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct inode *inode)
{
	const int sgid_exec = S_ISGID | S_IXGRP;
	const umode_t mode = inode->i_mode;

	/* Neither special files nor setuid files should get pinned. */
	if (!S_ISREG(mode) || (mode & S_ISUID))
		return false;

	/* Same for setgid files that are group-executable. */
	if ((mode & sgid_exec) == sgid_exec)
		return false;

	/* Hardlinking to unreadable or unwritable sources is dangerous. */
	return !inode_permission(inode, MAY_READ | MAY_WRITE);
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @link: the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
997
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
Kees Cook's avatar
Kees Cook committed
998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012
 *
 * Returns 0 if successful, -ve on error.
 */
static int may_linkat(struct path *link)
{
	struct inode *inode;

	if (!sysctl_protected_hardlinks)
		return 0;

	inode = link->dentry->d_inode;

	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
	 * otherwise, it must be a safe source.
	 */
1013
	if (safe_hardlink_source(inode) || inode_owner_or_capable(inode))
Kees Cook's avatar
Kees Cook committed
1014 1015
		return 0;

1016
	audit_log_link_denied("linkat", link);
Kees Cook's avatar
Kees Cook committed
1017 1018 1019
	return -EPERM;
}

1020 1021
static __always_inline
const char *get_link(struct nameidata *nd)
Linus Torvalds's avatar
Linus Torvalds committed
1022
{
1023
	struct saved *last = nd->stack + nd->depth - 1;
Al Viro's avatar
Al Viro committed
1024
	struct dentry *dentry = last->link.dentry;
1025
	struct inode *inode = nd->link_inode;
1026
	int error;
1027
	const char *res;
Linus Torvalds's avatar
Linus Torvalds committed
1028

1029 1030 1031
	if (!(nd->flags & LOOKUP_RCU)) {
		touch_atime(&last->link);
		cond_resched();
1032
	} else if (atime_needs_update_rcu(&last->link, inode)) {
Al Viro's avatar
Al Viro committed
1033
		if (unlikely(unlazy_walk(nd)))
1034
			return ERR_PTR(-ECHILD);
1035
		touch_atime(&last->link);
1036
	}
1037

1038 1039 1040
	error = security_inode_follow_link(dentry, inode,
					   nd->flags & LOOKUP_RCU);
	if (unlikely(error))
1041
		return ERR_PTR(error);
1042

1043
	nd->last_type = LAST_BIND;
1044 1045
	res = inode->i_link;
	if (!res) {
1046 1047 1048
		const char * (*get)(struct dentry *, struct inode *,
				struct delayed_call *);
		get = inode->i_op->get_link;
1049
		if (nd->flags & LOOKUP_RCU) {
1050
			res = get(NULL, inode, &last->done);
1051
			if (res == ERR_PTR(-ECHILD)) {
Al Viro's avatar
Al Viro committed
1052
				if (unlikely(unlazy_walk(nd)))
1053
					return ERR_PTR(-ECHILD);
1054
				res = get(dentry, inode, &last->done);
1055 1056
			}
		} else {
1057
			res = get(dentry, inode, &last->done);
1058
		}
1059
		if (IS_ERR_OR_NULL(res))
1060 1061 1062
			return res;
	}
	if (*res == '/') {
1063 1064
		if (!nd->root.mnt)
			set_root(nd);
1065 1066
		if (unlikely(nd_jump_root(nd)))
			return ERR_PTR(-ECHILD);
1067 1068
		while (unlikely(*++res == '/'))
			;
Linus Torvalds's avatar
Linus Torvalds committed
1069
	}
1070 1071
	if (!*res)
		res = NULL;
1072 1073
	return res;
}
1074

1075 1076 1077 1078 1079 1080 1081 1082 1083 1084
/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
1085
int follow_up(struct path *path)
Linus Torvalds's avatar
Linus Torvalds committed
1086
{
1087 1088
	struct mount *mnt = real_mount(path->mnt);
	struct mount *parent;
Linus Torvalds's avatar
Linus Torvalds committed
1089
	struct dentry *mountpoint;
Nick Piggin's avatar
Nick Piggin committed
1090

Al Viro's avatar
Al Viro committed
1091
	read_seqlock_excl(&mount_lock);
1092
	parent = mnt->mnt_parent;
Al Viro's avatar
Al Viro committed
1093
	if (parent == mnt) {
Al Viro's avatar
Al Viro committed
1094
		read_sequnlock_excl(&mount_lock);
Linus Torvalds's avatar
Linus Torvalds committed
1095 1096
		return 0;
	}
1097
	mntget(&parent->mnt);
1098
	mountpoint = dget(mnt->mnt_mountpoint);
Al Viro's avatar
Al Viro committed
1099
	read_sequnlock_excl(&mount_lock);
1100 1101 1102
	dput(path->dentry);
	path->dentry = mountpoint;
	mntput(path->mnt);
1103
	path->mnt = &parent->mnt;
Linus Torvalds's avatar
Linus Torvalds committed
1104 1105
	return 1;
}
1106
EXPORT_SYMBOL(follow_up);
Linus Torvalds's avatar
Linus Torvalds committed
1107

1108
/*
1109 1110 1111
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
Linus Torvalds's avatar
Linus Torvalds committed
1112
 */
1113
static int follow_automount(struct path *path, struct nameidata *nd,
1114
			    bool *need_mntput)
1115
{
1116
	struct vfsmount *mnt;
1117
	int err;
1118 1119 1120 1121

	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
		return -EREMOTE;

1122 1123 1124 1125 1126 1127 1128 1129 1130 1131
	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
1132
	 */
1133
	if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
1134 1135 1136
			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
	    path->dentry->d_inode)
		return -EISDIR;
1137

1138 1139
	nd->total_link_count++;
	if (nd->total_link_count >= 40)
1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152
		return -ELOOP;

	mnt = path->dentry->d_op->d_automount(path);
	if (IS_ERR(mnt)) {
		/*
		 * The filesystem is allowed to return -EISDIR here to indicate
		 * it doesn't want to automount.  For instance, autofs would do
		 * this so that its userspace daemon can mount on this dentry.
		 *
		 * However, we can only permit this if it's a terminal point in
		 * the path being looked up; if it wasn't then the remainder of
		 * the path is inaccessible and we should say so.
		 */
1153
		if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
1154 1155
			return -EREMOTE;
		return PTR_ERR(mnt);
1156
	}
1157

1158 1159
	if (!mnt) /* mount collision */
		return 0;
1160

1161 1162 1163 1164 1165
	if (!*need_mntput) {
		/* lock_mount() may release path->mnt on error */
		mntget(path->mnt);
		*need_mntput = true;
	}
1166
	err = finish_automount(mnt, path);
1167

1168 1169 1170
	switch (err) {
	case -EBUSY:
		/* Someone else made a mount here whilst we were busy */
1171
		return 0;
1172
	case 0:
1173
		path_put(path);
1174 1175 1176
		path->mnt = mnt;
		path->dentry = dget(mnt->mnt_root);
		return 0;
1177 1178
	default:
		return err;
1179
	}
1180

Al Viro's avatar
Al Viro committed
1181 1182
}

1183 1184
/*
 * Handle a dentry that is managed in some way.
1185
 * - Flagged for transit management (autofs)
1186 1187 1188 1189 1190 1191 1192
 * - Flagged as mountpoint
 * - Flagged as automount point
 *
 * This may only be called in refwalk mode.
 *
 * Serialization is taken care of in namespace.c
 */
1193
static int follow_managed(struct path *path, struct nameidata *nd)
Linus Torvalds's avatar
Linus Torvalds committed
1194
{
1195
	struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */
1196 1197
	unsigned managed;
	bool need_mntput = false;
1198
	int ret = 0;
1199 1200 1201 1202 1203 1204 1205

	/* Given that we're not holding a lock here, we retain the value in a
	 * local variable for each dentry as we look at it so that we don't see
	 * the components of that value change under us */
	while (managed = ACCESS_ONCE(path->dentry->d_flags),
	       managed &= DCACHE_MANAGED_DENTRY,
	       unlikely(managed != 0)) {
1206 1207 1208 1209 1210
		/* Allow the filesystem to manage the transit without i_mutex
		 * being held. */
		if (managed & DCACHE_MANAGE_TRANSIT) {
			BUG_ON(!path->dentry->d_op);
			BUG_ON(!path->dentry->d_op->d_manage);
1211
			ret = path->dentry->d_op->d_manage(path, false);
1212
			if (ret < 0)
1213
				break;
1214 1215
		}

1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230
		/* Transit to a mounted filesystem. */
		if (managed & DCACHE_MOUNTED) {
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				need_mntput = true;
				continue;
			}

			/* Something is mounted on this dentry in another
			 * namespace and/or whatever was mounted there in this
Al Viro's avatar
Al Viro committed
1231 1232
			 * namespace got unmounted before lookup_mnt() could
			 * get it */
1233 1234 1235 1236
		}

		/* Handle an automount point */
		if (managed & DCACHE_NEED_AUTOMOUNT) {
1237
			ret = follow_automount(path, nd, &need_mntput);
1238
			if (ret < 0)
1239
				break;
1240 1241 1242 1243 1244
			continue;
		}

		/* We didn't change the current path point */
		break;
Linus Torvalds's avatar
Linus Torvalds committed
1245
	}
1246 1247 1248

	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
1249 1250
	if (ret == -EISDIR || !ret)
		ret = 1;
1251 1252 1253 1254 1255
	if (need_mntput)
		nd->flags |= LOOKUP_JUMPED;
	if (unlikely(ret < 0))
		path_put_conditional(path, nd);
	return ret;
Linus Torvalds's avatar
Linus Torvalds committed
1256 1257
}

1258
int follow_down_one(struct path *path)
Linus Torvalds's avatar
Linus Torvalds committed
1259 1260 1261
{
	struct vfsmount *mounted;

Al Viro's avatar
Al Viro committed
1262
	mounted = lookup_mnt(path);
Linus Torvalds's avatar
Linus Torvalds committed
1263
	if (mounted) {
Al Viro's avatar
Al Viro committed
1264 1265 1266 1267
		dput(path->dentry);
		mntput(path->mnt);
		path->mnt = mounted;
		path->dentry = dget(mounted->mnt_root);
Linus Torvalds's avatar
Linus Torvalds committed
1268 1269 1270 1271
		return 1;
	}
	return 0;
}
1272
EXPORT_SYMBOL(follow_down_one);
Linus Torvalds's avatar
Linus Torvalds committed
1273

1274
static inline int managed_dentry_rcu(const struct path *path)
1275
{
1276 1277
	return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
		path->dentry->d_op->d_manage(path, true) : 0;
1278 1279
}

1280
/*
1281 1282
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
1283 1284
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1285
			       struct inode **inode, unsigned *seqp)
1286
{
1287
	for (;;) {
1288
		struct mount *mounted;
1289 1290 1291 1292
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
1293
		switch (managed_dentry_rcu(path)) {
1294 1295
		case -ECHILD:
		default:
1296
			return false;
1297 1298 1299 1300 1301
		case -EISDIR:
			return true;
		case 0:
			break;
		}
1302 1303

		if (!d_mountpoint(path->dentry))
1304
			return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1305

1306
		mounted = __lookup_mnt(path->mnt, path->dentry);
1307 1308
		if (!mounted)
			break;
1309 1310
		path->mnt = &mounted->mnt;
		path->dentry = mounted->mnt.mnt_root;
1311
		nd->flags |= LOOKUP_JUMPED;
1312
		*seqp = read_seqcount_begin(&path->dentry->d_seq);
1313 1314 1315 1316 1317 1318
		/*
		 * Update the inode too. We don't need to re-check the
		 * dentry sequence number here after this d_inode read,
		 * because a mount-point is always pinned.
		 */
		*inode = path->dentry->d_inode;
1319
	}
1320
	return !read_seqretry(&mount_lock, nd->m_seq) &&
1321
		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1322 1323
}

1324 1325
static int follow_dotdot_rcu(struct nameidata *nd)
{
1326
	struct inode *inode = nd->inode;
1327

1328
	while (1) {
1329
		if (path_equal(&nd->path, &nd->root))
1330 1331 1332 1333 1334 1335
			break;
		if (nd->path.dentry != nd->path.mnt->mnt_root) {
			struct dentry *old = nd->path.dentry;
			struct dentry *parent = old->d_parent;
			unsigned seq;

1336
			inode = parent->d_inode;
1337
			seq = read_seqcount_begin(&parent->d_seq);
1338 1339
			if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
				return -ECHILD;