/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2005 SGI, Christoph Lameter
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 * Copyright (C) 2016 Intel, Matthew Wilcox
 * Copyright (C) 2016 Intel, Ross Zwisler
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmemleak.h>
#include <linux/percpu.h>
#include <linux/preempt.h>		/* in_interrupt() */
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>


/* Number of nodes in fully populated tree of given height */
static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;

/*
 * Radix tree node cache.
 */
static struct kmem_cache *radix_tree_node_cachep;

/*
 * The radix tree is variable-height, so an insert operation not only has
 * to build the branch to its corresponding item, it also has to build the
 * branch to existing items if the size has to be increased (by
 * radix_tree_extend).
 *
 * The worst case is a zero height tree with just a single item at index 0,
 * and then inserting an item at index ULONG_MAX. This requires 2 new branches
 * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared.
 * Hence:
 */
#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1)
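/*
 * Worked example (illustrative; the exact values depend on the config):
 * on a 64-bit system with RADIX_TREE_MAP_SHIFT == 6, RADIX_TREE_MAX_PATH
 * is DIV_ROUND_UP(64, 6) == 11, so each preload stocks up to
 * 11 * 2 - 1 == 21 nodes per CPU.
 */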

/*
 * The IDR does not have to be as high as the radix tree since it uses
 * signed integers, not unsigned longs.
 */
#define IDR_INDEX_BITS		(8 /* CHAR_BIT */ * sizeof(int) - 1)
#define IDR_MAX_PATH		(DIV_ROUND_UP(IDR_INDEX_BITS, \
						RADIX_TREE_MAP_SHIFT))
#define IDR_PRELOAD_SIZE	(IDR_MAX_PATH * 2 - 1)
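/*
 * Worked example (illustrative; the exact values depend on the config):
 * with 32-bit ints and RADIX_TREE_MAP_SHIFT == 6, IDR_INDEX_BITS is 31,
 * so IDR_MAX_PATH is DIV_ROUND_UP(31, 6) == 6 and IDR_PRELOAD_SIZE is 11.
 */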

/*
 * The IDA is even shorter since it uses a bitmap at the last level.
 */
#define IDA_INDEX_BITS		(8 * sizeof(int) - 1 - ilog2(IDA_BITMAP_BITS))
#define IDA_MAX_PATH		(DIV_ROUND_UP(IDA_INDEX_BITS, \
						RADIX_TREE_MAP_SHIFT))
#define IDA_PRELOAD_SIZE	(IDA_MAX_PATH * 2 - 1)
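/*
 * Worked example (illustrative; assumes IDA_BITMAP_BITS == 1024): with
 * ilog2(1024) == 10, IDA_INDEX_BITS is 31 - 10 == 21, IDA_MAX_PATH is
 * DIV_ROUND_UP(21, 6) == 4 and IDA_PRELOAD_SIZE is 7.
 */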

/*
 * Per-cpu pool of preloaded nodes
 */
struct radix_tree_preload {
	unsigned nr;
	/* nodes->parent points to next preallocated node */
	struct radix_tree_node *nodes;
};
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };

static inline struct radix_tree_node *entry_to_node(void *ptr)
{
	return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
}

static inline void *node_to_entry(void *ptr)
{
	return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
}

#define RADIX_TREE_RETRY	node_to_entry(NULL)
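/*
 * Illustrative sketch of the tagging scheme above: radix tree nodes come
 * from a slab cache and are at least word-aligned, so the bottom bits of
 * a node pointer are free to mark internal entries.  If a node lives at
 * 0x1000, node_to_entry() yields 0x1001 and entry_to_node() recovers
 * 0x1000.  RADIX_TREE_RETRY is the otherwise-impossible internal entry
 * node_to_entry(NULL), used to make concurrent lookups restart.
 */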

#ifdef CONFIG_RADIX_TREE_MULTIORDER
/* Sibling slots point directly to another slot in the same node */
static inline
bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
{
	void __rcu **ptr = node;
	return (parent->slots <= ptr) &&
			(ptr < parent->slots + RADIX_TREE_MAP_SIZE);
}
#else
static inline
bool is_sibling_entry(const struct radix_tree_node *parent, void *node)
{
	return false;
}
#endif
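/*
 * Illustrative example of sibling entries (multiorder only): an order-2
 * entry stored in a shift-0 node occupies four slots.  Slot i holds the
 * item itself, and slots i+1..i+3 hold node_to_entry(&node->slots[i]),
 * which is_sibling_entry() recognises as a pointer back into the same
 * node's slot array.
 */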

static inline unsigned long
get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot)
{
	return slot - parent->slots;
}

static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
			struct radix_tree_node **nodep, unsigned long index)
{
	unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;
	void __rcu **entry = rcu_dereference_raw(parent->slots[offset]);

#ifdef CONFIG_RADIX_TREE_MULTIORDER
	if (radix_tree_is_internal_node(entry)) {
		if (is_sibling_entry(parent, entry)) {
			void __rcu **sibentry;
			sibentry = (void __rcu **) entry_to_node(entry);
			offset = get_slot_offset(parent, sibentry);
			entry = rcu_dereference_raw(*sibentry);
		}
	}
#endif

	*nodep = (void *)entry;
	return offset;
}

static inline gfp_t root_gfp_mask(const struct radix_tree_root *root)
{
	return root->gfp_mask & __GFP_BITS_MASK;
}

static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
		int offset)
{
	__set_bit(offset, node->tags[tag]);
}

static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
		int offset)
{
	__clear_bit(offset, node->tags[tag]);
}

static inline int tag_get(const struct radix_tree_node *node, unsigned int tag,
		int offset)
{
	return test_bit(offset, node->tags[tag]);
}

static inline void root_tag_set(struct radix_tree_root *root, unsigned tag)
{
	root->gfp_mask |= (__force gfp_t)(1 << (tag + ROOT_TAG_SHIFT));
}

static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
{
	root->gfp_mask &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT));
}

static inline void root_tag_clear_all(struct radix_tree_root *root)
{
	root->gfp_mask &= (1 << ROOT_TAG_SHIFT) - 1;
}

static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag)
{
	return (__force int)root->gfp_mask & (1 << (tag + ROOT_TAG_SHIFT));
}

static inline unsigned root_tags_get(const struct radix_tree_root *root)
{
	return (__force unsigned)root->gfp_mask >> ROOT_TAG_SHIFT;
}

static inline bool is_idr(const struct radix_tree_root *root)
{
	return !!(root->gfp_mask & ROOT_IS_IDR);
}

/*
 * Returns 1 if any slot in the node has this tag set.
 * Otherwise returns 0.
 */
static inline int any_tag_set(const struct radix_tree_node *node,
							unsigned int tag)
{
	unsigned idx;
	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
		if (node->tags[tag][idx])
			return 1;
	}
	return 0;
}

static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag)
{
	bitmap_fill(node->tags[tag], RADIX_TREE_MAP_SIZE);
}

/**
 * radix_tree_find_next_bit - find the next set bit in a node's tag bitmap
 *
 * @node: The node whose tag bitmap to search
 * @tag: The tag index to search
 * @offset: The bitnumber to start searching at
 *
 * Unrollable variant of find_next_bit() for constant size arrays.
 * Tail bits from RADIX_TREE_MAP_SIZE up to roundup(RADIX_TREE_MAP_SIZE,
 * BITS_PER_LONG) must be zero.
 * Returns the next set bit offset, or RADIX_TREE_MAP_SIZE if none is found.
 */
static __always_inline unsigned long
radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
			 unsigned long offset)
{
	const unsigned long *addr = node->tags[tag];

	if (offset < RADIX_TREE_MAP_SIZE) {
		unsigned long tmp;

		addr += offset / BITS_PER_LONG;
		tmp = *addr >> (offset % BITS_PER_LONG);
		if (tmp)
			return __ffs(tmp) + offset;
		offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
		while (offset < RADIX_TREE_MAP_SIZE) {
			tmp = *++addr;
			if (tmp)
				return __ffs(tmp) + offset;
			offset += BITS_PER_LONG;
		}
	}
	return RADIX_TREE_MAP_SIZE;
}
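/*
 * Worked example (illustrative): if the only set bit in node->tags[tag]
 * is bit 38 and the search starts at offset 3, the first word is shifted
 * right by 3, __ffs() yields 35, and 35 + 3 == 38 is returned.  If no
 * bit at or after @offset is set, RADIX_TREE_MAP_SIZE is returned.
 */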

static unsigned int iter_offset(const struct radix_tree_iter *iter)
{
	return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK;
}

/*
 * The maximum index which can be stored in a radix tree
 */
static inline unsigned long shift_maxindex(unsigned int shift)
{
	return (RADIX_TREE_MAP_SIZE << shift) - 1;
}
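/*
 * Worked example (illustrative): with RADIX_TREE_MAP_SIZE == 64, a node
 * of shift 0 covers indices 0-63, shift 6 covers 0-4095 and shift 12
 * covers 0-262143; each level multiplies the reachable index space by
 * RADIX_TREE_MAP_SIZE.
 */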

static inline unsigned long node_maxindex(const struct radix_tree_node *node)
{
	return shift_maxindex(node->shift);
}

static unsigned long next_index(unsigned long index,
				const struct radix_tree_node *node,
				unsigned long offset)
{
	return (index & ~node_maxindex(node)) + (offset << node->shift);
}

#ifndef __KERNEL__
static void dump_node(struct radix_tree_node *node, unsigned long index)
{
	unsigned long i;

	pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n",
		node, node->offset, index, index | node_maxindex(node),
		node->parent,
		node->tags[0][0], node->tags[1][0], node->tags[2][0],
		node->shift, node->count, node->exceptional);

	for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
		unsigned long first = index | (i << node->shift);
		unsigned long last = first | ((1UL << node->shift) - 1);
		void *entry = node->slots[i];
		if (!entry)
			continue;
		if (entry == RADIX_TREE_RETRY) {
			pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n",
					i, first, last, node);
		} else if (!radix_tree_is_internal_node(entry)) {
			pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n",
					entry, i, first, last, node);
		} else if (is_sibling_entry(node, entry)) {
			pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n",
					entry, i, first, last, node,
					*(void **)entry_to_node(entry));
		} else {
			dump_node(entry_to_node(entry), first);
		}
	}
}

/* For debug */
static void radix_tree_dump(struct radix_tree_root *root)
{
	pr_debug("radix root: %p rnode %p tags %x\n",
			root, root->rnode,
			root->gfp_mask >> ROOT_TAG_SHIFT);
	if (!radix_tree_is_internal_node(root->rnode))
		return;
	dump_node(entry_to_node(root->rnode), 0);
}

static void dump_ida_node(void *entry, unsigned long index)
{
	unsigned long i;

	if (!entry)
		return;

	if (radix_tree_is_internal_node(entry)) {
		struct radix_tree_node *node = entry_to_node(entry);

		pr_debug("ida node: %p offset %d indices %lu-%lu parent %p free %lx shift %d count %d\n",
			node, node->offset, index * IDA_BITMAP_BITS,
			((index | node_maxindex(node)) + 1) *
				IDA_BITMAP_BITS - 1,
			node->parent, node->tags[0][0], node->shift,
			node->count);
		for (i = 0; i < RADIX_TREE_MAP_SIZE; i++)
			dump_ida_node(node->slots[i],
					index | (i << node->shift));
	} else if (radix_tree_exceptional_entry(entry)) {
		pr_debug("ida excp: %p offset %d indices %lu-%lu data %lx\n",
				entry, (int)(index & RADIX_TREE_MAP_MASK),
				index * IDA_BITMAP_BITS,
				index * IDA_BITMAP_BITS + BITS_PER_LONG -
					RADIX_TREE_EXCEPTIONAL_SHIFT,
				(unsigned long)entry >>
					RADIX_TREE_EXCEPTIONAL_SHIFT);
	} else {
		struct ida_bitmap *bitmap = entry;

		pr_debug("ida btmp: %p offset %d indices %lu-%lu data", bitmap,
				(int)(index & RADIX_TREE_MAP_MASK),
				index * IDA_BITMAP_BITS,
				(index + 1) * IDA_BITMAP_BITS - 1);
		for (i = 0; i < IDA_BITMAP_LONGS; i++)
			pr_cont(" %lx", bitmap->bitmap[i]);
		pr_cont("\n");
	}
}

static void ida_dump(struct ida *ida)
{
	struct radix_tree_root *root = &ida->ida_rt;
	pr_debug("ida: %p node %p free %d\n", ida, root->rnode,
				root->gfp_mask >> ROOT_TAG_SHIFT);
	dump_ida_node(root->rnode, 0);
}
#endif

/*
 * This assumes that the caller has performed appropriate preallocation, and
 * that the caller has pinned this thread of control to the current CPU.
 */
static struct radix_tree_node *
radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
			struct radix_tree_root *root,
			unsigned int shift, unsigned int offset,
			unsigned int count, unsigned int exceptional)
{
	struct radix_tree_node *ret = NULL;

	/*
	 * Preload code isn't irq safe and it doesn't make sense to use
	 * preloading during an interrupt anyway as all the allocations have
	 * to be atomic. So just do normal allocation when in interrupt.
	 */
	if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
		struct radix_tree_preload *rtp;

		/*
		 * Even if the caller has preloaded, try to allocate from the
		 * cache first for the new node to get accounted to the memory
		 * cgroup.
		 */
		ret = kmem_cache_alloc(radix_tree_node_cachep,
				       gfp_mask | __GFP_NOWARN);
		if (ret)
			goto out;

		/*
		 * Provided the caller has preloaded here, we will always
		 * succeed in getting a node here (and never reach
		 * kmem_cache_alloc)
		 */
		rtp = this_cpu_ptr(&radix_tree_preloads);
		if (rtp->nr) {
			ret = rtp->nodes;
			rtp->nodes = ret->parent;
			rtp->nr--;
		}
		/*
		 * Update the allocation stack trace as this is more useful
		 * for debugging.
		 */
		kmemleak_update_trace(ret);
		goto out;
	}
	ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
out:
	BUG_ON(radix_tree_is_internal_node(ret));
	if (ret) {
		ret->shift = shift;
		ret->offset = offset;
		ret->count = count;
		ret->exceptional = exceptional;
		ret->parent = parent;
		ret->root = root;
	}
	return ret;
}

static void radix_tree_node_rcu_free(struct rcu_head *head)
{
	struct radix_tree_node *node =
			container_of(head, struct radix_tree_node, rcu_head);

	/*
	 * Must only free zeroed nodes into the slab.  We can be left with
	 * non-NULL entries by radix_tree_free_nodes, so clear the entries
	 * and tags here.
	 */
	memset(node->slots, 0, sizeof(node->slots));
	memset(node->tags, 0, sizeof(node->tags));
	INIT_LIST_HEAD(&node->private_list);

	kmem_cache_free(radix_tree_node_cachep, node);
}

static inline void
radix_tree_node_free(struct radix_tree_node *node)
{
	call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
{
	struct radix_tree_preload *rtp;
	struct radix_tree_node *node;
	int ret = -ENOMEM;

	/*
	 * Nodes preloaded by one cgroup can be used by another cgroup, so
	 * they should never be accounted to any particular memory cgroup.
	 */
	gfp_mask &= ~__GFP_ACCOUNT;

	preempt_disable();
	rtp = this_cpu_ptr(&radix_tree_preloads);
	while (rtp->nr < nr) {
		preempt_enable();
		node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
		if (node == NULL)
			goto out;
		preempt_disable();
		rtp = this_cpu_ptr(&radix_tree_preloads);
		if (rtp->nr < nr) {
			node->parent = rtp->nodes;
			rtp->nodes = node;
			rtp->nr++;
		} else {
			kmem_cache_free(radix_tree_node_cachep, node);
		}
	}
	ret = 0;
out:
	return ret;
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
int radix_tree_preload(gfp_t gfp_mask)
{
	/* Warn on non-sensical use... */
	WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
	return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
}
EXPORT_SYMBOL(radix_tree_preload);
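/*
 * A minimal usage sketch (the lock, tree and item are hypothetical):
 *
 *	if (radix_tree_preload(GFP_KERNEL))
 *		return -ENOMEM;
 *	spin_lock(&my_lock);
 *	err = radix_tree_insert(&my_tree, index, my_item);
 *	spin_unlock(&my_lock);
 *	radix_tree_preload_end();
 *
 * radix_tree_preload_end() re-enables the preemption that a successful
 * preload left disabled.
 */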

/*
 * The same as radix_tree_preload(), except we don't guarantee preloading
 * happens.  We do it if we decide it helps.  On success, return zero with
 * preemption
 * disabled. On error, return -ENOMEM with preemption not disabled.
 */
int radix_tree_maybe_preload(gfp_t gfp_mask)
{
	if (gfpflags_allow_blocking(gfp_mask))
		return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
	/* Preloading doesn't help anything with this gfp mask, skip it */
	preempt_disable();
	return 0;
}
EXPORT_SYMBOL(radix_tree_maybe_preload);

#ifdef CONFIG_RADIX_TREE_MULTIORDER
/*
 * Preload with enough objects to ensure that we can split a single entry
 * of order @old_order into many entries of size @new_order
 */
int radix_tree_split_preload(unsigned int old_order, unsigned int new_order,
							gfp_t gfp_mask)
{
	unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT);
	unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) -
				(new_order / RADIX_TREE_MAP_SHIFT);
	unsigned nr = 0;

	WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
	BUG_ON(new_order >= old_order);

	while (layers--)
		nr = nr * RADIX_TREE_MAP_SIZE + 1;
	return __radix_tree_preload(gfp_mask, top * nr);
}
#endif
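/*
 * Worked example (illustrative): splitting an order-9 entry into order-0
 * entries with RADIX_TREE_MAP_SHIFT == 6 gives top == 1 << 3, layers == 1
 * and nr == 1, so 8 nodes are preloaded: one new child for each slot the
 * old entry occupied.
 */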

/*
 * The same as radix_tree_maybe_preload(), but preload the number of nodes
 * required to insert (1 << order) contiguous naturally-aligned elements.
 */
int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
{
	unsigned long nr_subtrees;
	int nr_nodes, subtree_height;

	/* Preloading doesn't help anything with this gfp mask, skip it */
	if (!gfpflags_allow_blocking(gfp_mask)) {
		preempt_disable();
		return 0;
	}

	/*
	 * Calculate number and height of fully populated subtrees it takes to
	 * store (1 << order) elements.
	 */
	nr_subtrees = 1 << order;
	for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE;
			subtree_height++)
		nr_subtrees >>= RADIX_TREE_MAP_SHIFT;

	/*
	 * The worst case is zero height tree with a single item at index 0 and
	 * then inserting items starting at ULONG_MAX - (1 << order).
	 *
	 * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to
	 * 0-index item.
	 */
	nr_nodes = RADIX_TREE_MAX_PATH;

	/* Plus branch to fully populated subtrees. */
	nr_nodes += RADIX_TREE_MAX_PATH - subtree_height;

	/* Root node is shared. */
	nr_nodes--;

	/* Plus nodes required to build subtrees. */
	nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height];

	return __radix_tree_preload(gfp_mask, nr_nodes);
}
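/*
 * Worked example (illustrative; assumes height_to_maxnodes[1] == 1): for
 * order == 9 with RADIX_TREE_MAP_SHIFT == 6 on 64-bit, the 512 elements
 * collapse into 8 fully populated subtrees of height 1, so the preload
 * count works out to 11 + (11 - 1) - 1 + 8 * 1 == 28 nodes.
 */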

static unsigned radix_tree_load_root(const struct radix_tree_root *root,
		struct radix_tree_node **nodep, unsigned long *maxindex)
{
	struct radix_tree_node *node = rcu_dereference_raw(root->rnode);

	*nodep = node;

	if (likely(radix_tree_is_internal_node(node))) {
		node = entry_to_node(node);
		*maxindex = node_maxindex(node);
		return node->shift + RADIX_TREE_MAP_SHIFT;
	}

	*maxindex = 0;
	return 0;
}

/*
 *	Extend a radix tree so it can store key @index.
 */
static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
				unsigned long index, unsigned int shift)
{
	void *entry;
	unsigned int maxshift;
	int tag;

	/* Figure out what the shift should be.  */
	maxshift = shift;
	while (index > shift_maxindex(maxshift))
		maxshift += RADIX_TREE_MAP_SHIFT;

	entry = rcu_dereference_raw(root->rnode);
	if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
		goto out;

	do {
		struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL,
							root, shift, 0, 1, 0);
		if (!node)
			return -ENOMEM;

		if (is_idr(root)) {
			all_tag_set(node, IDR_FREE);
			if (!root_tag_get(root, IDR_FREE)) {
				tag_clear(node, IDR_FREE, 0);
				root_tag_set(root, IDR_FREE);
			}
		} else {
			/* Propagate the aggregated tag info to the new child */
			for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
				if (root_tag_get(root, tag))
					tag_set(node, tag, 0);
			}
		}

		BUG_ON(shift > BITS_PER_LONG);
		if (radix_tree_is_internal_node(entry)) {
			entry_to_node(entry)->parent = node;
		} else if (radix_tree_exceptional_entry(entry)) {
			/* Moving an exceptional root->rnode to a node */
			node->exceptional = 1;
		}
		/*
		 * entry was already in the radix tree, so we do not need
		 * rcu_assign_pointer here
		 */
		node->slots[0] = (void __rcu *)entry;
		entry = node_to_entry(node);
		rcu_assign_pointer(root->rnode, entry);
		shift += RADIX_TREE_MAP_SHIFT;
	} while (shift <= maxshift);
out:
	return maxshift + RADIX_TREE_MAP_SHIFT;
}
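/*
 * Illustrative example of extension: a tree whose root node has shift 0
 * (maxindex 63) that must now store index 4096 gains two levels.  Each
 * pass of the loop above allocates a node, makes the old root its
 * slot[0] child and republishes root->rnode, so concurrent readers
 * always see a consistent tree.
 */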

/**
 *	radix_tree_shrink    -    shrink radix tree to minimum height
 *	@root:		radix tree root
 *	@update_node:	callback for changing leaf nodes
 *	@private:	private data to pass to @update_node
 */
static inline bool radix_tree_shrink(struct radix_tree_root *root,
				     radix_tree_update_node_t update_node,
				     void *private)
{
	bool shrunk = false;

	for (;;) {
		struct radix_tree_node *node = rcu_dereference_raw(root->rnode);
		struct radix_tree_node *child;

		if (!radix_tree_is_internal_node(node))
			break;
		node = entry_to_node(node);

		/*
		 * If the candidate node has more than one child, or its child
		 * is not at the leftmost slot, or the child is a multiorder
		 * entry, we cannot shrink.
		 */
		if (node->count != 1)
			break;
		child = rcu_dereference_raw(node->slots[0]);
		if (!child)
			break;
		if (!radix_tree_is_internal_node(child) && node->shift)
			break;

		if (radix_tree_is_internal_node(child))
			entry_to_node(child)->parent = NULL;

		/*
		 * We don't need rcu_assign_pointer(), since we are simply
		 * moving the node from one part of the tree to another: if it
		 * was safe to dereference the old pointer to it
		 * (node->slots[0]), it will be safe to dereference the new
		 * one (root->rnode) as far as dependent read barriers go.
		 */
		root->rnode = (void __rcu *)child;
		if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
			root_tag_clear(root, IDR_FREE);

		/*
		 * We have a dilemma here. The node's slot[0] must not be
		 * NULLed in case there are concurrent lookups expecting to
		 * find the item. However if this was a bottom-level node,
		 * then it may be subject to the slot pointer being visible
		 * to callers dereferencing it. If the item corresponding to
		 * slot[0] is subsequently deleted, these callers would expect
		 * their slot to become empty sooner or later.
		 *
		 * For example, lockless pagecache will look up a slot, deref
		 * the page pointer, and if the page has 0 refcount it means it
		 * was concurrently deleted from pagecache so try the deref
		 * again. Fortunately there is already a requirement for logic
		 * to retry the entire slot lookup -- the indirect pointer
		 * problem (replacing direct root node with an indirect pointer
		 * also results in a stale slot). So tag the slot as indirect
		 * to force callers to retry.
		 */
		node->count = 0;
		if (!radix_tree_is_internal_node(child)) {
			node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
			if (update_node)
				update_node(node, private);
		}

		WARN_ON_ONCE(!list_empty(&node->private_list));
		radix_tree_node_free(node);
		shrunk = true;
	}

	return shrunk;
}
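/*
 * Illustrative example of shrinking: after deleting index 1 from a tree
 * that held indices 0 and 1, the remaining shift-0 node has count == 1
 * with its only entry in slot 0, so the loop above moves that entry into
 * root->rnode and poisons the old slot with RADIX_TREE_RETRY for the
 * benefit of concurrent lookups.
 */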

static bool delete_node(struct radix_tree_root *root,
			struct radix_tree_node *node,
			radix_tree_update_node_t update_node, void *private)
{
	bool deleted = false;

	do {
		struct radix_tree_node *parent;

		if (node->count) {
			if (node_to_entry(node) ==
					rcu_dereference_raw(root->rnode))
				deleted |= radix_tree_shrink(root, update_node,
								private);
			return deleted;
		}

		parent = node->parent;
		if (parent) {
			parent->slots[node->offset] = NULL;
			parent->count--;
		} else {
			/*
			 * Shouldn't the tags already have all been cleared
			 * by the caller?
			 */
			if (!is_idr(root))
				root_tag_clear_all(root);
			root->rnode = NULL;
		}

		WARN_ON_ONCE(!list_empty(&node->private_list));
		radix_tree_node_free(node);
		deleted = true;

		node = parent;
	} while (node);

	return deleted;
}

/**
 *	__radix_tree_create	-	create a slot in a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *	@order:		index occupies 2^order aligned slots
 *	@nodep:		returns node
 *	@slotp:		returns slot
 *
 *	Create, if necessary, and return the node and slot for an item
 *	at position @index in the radix tree @root.
 *
 *	Until there is more than one item in the tree, no nodes are
 *	allocated and @root->rnode is used as a direct slot instead of
 *	pointing to a node, in which case *@nodep will be NULL.
 *
 *	Returns -ENOMEM, or 0 for success.
 */
int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
			unsigned order, struct radix_tree_node **nodep,
			void __rcu ***slotp)
{
	struct radix_tree_node *node = NULL, *child;
	void __rcu **slot = (void __rcu **)&root->rnode;
	unsigned long maxindex;
	unsigned int shift, offset = 0;
	unsigned long max = index | ((1UL << order) - 1);
	gfp_t gfp = root_gfp_mask(root);

	shift = radix_tree_load_root(root, &child, &maxindex);

	/* Make sure the tree is high enough.  */
	if (order > 0 && max == ((1UL << order) - 1))
		max++;
	if (max > maxindex) {
		int error = radix_tree_extend(root, gfp, max, shift);
		if (error < 0)
			return error;
		shift = error;
		child = rcu_dereference_raw(root->rnode);
	}

	while (shift > order) {
		shift -= RADIX_TREE_MAP_SHIFT;
		if (child == NULL) {
			/* Have to add a child node.  */
			child = radix_tree_node_alloc(gfp, node, root, shift,
							offset, 0, 0);
			if (!child)
				return -ENOMEM;
			rcu_assign_pointer(*slot, node_to_entry(child));
			if (node)
				node->count++;
		} else if (!radix_tree_is_internal_node(child))
			break;

		/* Go a level down */
		node = entry_to_node(child);
		offset = radix_tree_descend(node, &child, index);
		slot = &node->slots[offset];
	}

	if (nodep)
		*nodep = node;
	if (slotp)
		*slotp = slot;
	return 0;
}

/*
 * Free any nodes below this node.  The tree is presumed to not need
 * shrinking, and any user data in the tree is presumed to not need a
 * destructor called on it.  If we need to add a destructor, we can
 * add that functionality later.  Note that we may not clear tags or
 * slots from the tree as an RCU walker may still have a pointer into
 * this subtree.  We could replace the entries with RADIX_TREE_RETRY,
 * but we'll still have to clear those in rcu_free.
 */
static void radix_tree_free_nodes(struct radix_tree_node *node)
{
	unsigned offset = 0;
	struct radix_tree_node *child = entry_to_node(node);

	for (;;) {
		void *entry = rcu_dereference_raw(child->slots[offset]);
		if (radix_tree_is_internal_node(entry) &&
					!is_sibling_entry(child, entry)) {
			child = entry_to_node(entry);
			offset = 0;
			continue;
		}
		offset++;
		while (offset == RADIX_TREE_MAP_SIZE) {
			struct radix_tree_node *old = child;
			offset = child->offset + 1;
			child = child->parent;
			WARN_ON_ONCE(!list_empty(&old->private_list));
			radix_tree_node_free(old);
			if (old == entry_to_node(node))
				return;
		}
	}
}

#ifdef CONFIG_RADIX_TREE_MULTIORDER
static inline int insert_entries(struct radix_tree_node *node,
		void __rcu **slot, void *item, unsigned order, bool replace)
{
	struct radix_tree_node *child;
	unsigned i, n, tag, offset, tags = 0;

	if (node) {
		if (order > node->shift)
			n = 1 << (order - node->shift);
		else
			n = 1;
		offset = get_slot_offset(node, slot);
	} else {
		n = 1;
		offset = 0;
	}

	if (n > 1) {
		offset = offset & ~(n - 1);
		slot = &node->slots[offset];
	}
	child = node_to_entry(slot);

	for (i = 0; i < n; i++) {
		if (slot[i]) {
			if (replace) {
				node->count--;
				for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
					if (tag_get(node, tag, offset + i))
						tags |= 1 << tag;
			} else
				return -EEXIST;
		}
	}

	for (i = 0; i < n; i++) {
		struct radix_tree_node *old = rcu_dereference_raw(slot[i]);
		if (i) {
			rcu_assign_pointer(slot[i], child);
			for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
				if (tags & (1 << tag))
					tag_clear(node, tag, offset + i);
		} else {
			rcu_assign_pointer(slot[i], item);
			for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
				if (tags & (1 << tag))
					tag_set(node, tag, offset);
		}
		if (radix_tree_is_internal_node(old) &&
					!is_sibling_entry(node, old) &&
					(old != RADIX_TREE_RETRY))
			radix_tree_free_nodes(old);
		if (radix_tree_exceptional_entry(old))
			node->exceptional--;
	}
	if (node) {
		node->count += n;
		if (radix_tree_exceptional_entry(item))
			node->exceptional += n;
	}
	return n;
}
#else
static inline int insert_entries(struct radix_tree_node *node,
		void __rcu **slot, void *item, unsigned order, bool replace)
{
	if (*slot)
		return -EEXIST;
	rcu_assign_pointer(*slot, item);
	if (node) {
		node->count++;
		if (radix_tree_exceptional_entry(item))
			node->exceptional++;
	}
	return 1;
}
#endif

/**
 *	__radix_tree_insert    -    insert into a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *	@order:		key covers the 2^order indices around index
 *	@item:		item to insert
 *
 *	Insert an item into the radix tree at position @index.
 */
int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
			unsigned order, void *item)
{
	struct radix_tree_node *node;
	void __rcu **slot;
	int error;

	BUG_ON(radix_tree_is_internal_node(item));

	error = __radix_tree_create(root, index, order, &node, &slot);
	if (error)
		return error;

	error = insert_entries(node, slot, item, order, false);
	if (error < 0)
		return error;

	if (node) {
		unsigned offset = get_slot_offset(node, slot);
		BUG_ON(tag_get(node, 0, offset));
		BUG_ON(tag_get(node, 1, offset));
		BUG_ON(tag_get(node, 2, offset));
	} else {
		BUG_ON(root_tags_get(root));
	}

	return 0;
}
EXPORT_SYMBOL(__radix_tree_insert);
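/*
 * A minimal usage sketch (tree, index and item are hypothetical;
 * radix_tree_insert() is the order-0 wrapper declared in radix-tree.h):
 *
 *	err = radix_tree_insert(&my_tree, index, my_item);
 *	if (err)
 *		return err;	(-EEXIST means the slot was occupied)
 */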

/**
 *	__radix_tree_lookup	-	lookup an item in a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *	@nodep:		returns node
 *	@slotp:		returns slot
 *
 *	Lookup and return the item at position @index in the radix
 *	tree @root.
 *
 *	Until there is more than one item in the tree, no nodes are
 *	allocated and @root->rnode is used as a direct slot instead of
 *	pointing to a node, in which case *@nodep will be NULL.
 */
void *__radix_tree_lookup(const struct radix_tree_root *root,
			  unsigned long index, struct radix_tree_node **nodep,
			  void __rcu ***slotp)
{
	struct radix_tree_node *node, *parent;
	unsigned long maxindex;
	void __rcu **slot;

 restart:
	parent = NULL;
	slot = (void __rcu **)&root->rnode;
	radix_tree_load_root(root, &node, &maxindex);
	if (index > maxindex)
		return NULL;

	while (radix_tree_is_internal_node(node)) {
		unsigned offset;

		if (node == RADIX_TREE_RETRY)
			goto restart;
		parent = entry_to_node(node);
		offset = radix_tree_descend(parent, &node, index);
		slot = parent->slots + offset;
	}

	if (nodep)
		*nodep = parent;
	if (slotp)
		*slotp = slot;
	return node;
}

/**
 *	radix_tree_lookup_slot    -    lookup a slot in a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *
 *	Returns:  the slot corresponding to the position @index in the
 *	radix tree @root. This is useful for update-if-exists operations.
 *
 *	This function can be called under rcu_read_lock iff the slot is not
 *	modified by radix_tree_replace_slot, otherwise it must be called
 *	exclusive from other writers. Any dereference of the slot must be done
 *	using radix_tree_deref_slot.
 */
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root,
				unsigned long index)
{
	void __rcu **slot;

	if (!__radix_tree_lookup(root, index, NULL, &slot))
		return NULL;
	return slot;
}
EXPORT_SYMBOL(radix_tree_lookup_slot);
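/*
 * A minimal RCU lookup sketch (the tree is hypothetical;
 * radix_tree_deref_slot() and radix_tree_deref_retry() are declared in
 * radix-tree.h):
 *
 *	rcu_read_lock();
 *	slot = radix_tree_lookup_slot(&my_tree, index);
 *	item = slot ? radix_tree_deref_slot(slot) : NULL;
 *	if (radix_tree_deref_retry(item))
 *		goto repeat;
 *	rcu_read_unlock();
 */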

/**
 *	radix_tree_lookup    -    perform lookup operation on a radix tree
 *	@root:		radix tree root
 *	@index:		index key
 *
 *	Lookup the item at the position @index in the radix tree @root.
 *
 *	This function can be called under rcu_read_lock, however the caller
 *	must manage lifetimes of leaf nodes (eg. RCU may also be used to free
 *	them safely). No RCU barriers are required to access or modify the
 *	returned item, however.
 */
void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
{
	return __radix_tree_lookup(root, index, NULL, NULL);
}
EXPORT_SYMBOL(radix_tree_lookup);

static inline void replace_sibling_entries(struct radix_tree_node *node,
				void __rcu **slot, int count, int exceptional)
{
#ifdef CONFIG_RADIX_TREE_MULTIORDER
	void *ptr = node_to_entry(slot);
	unsigned offset = get_slot_offset(node, slot) + 1;

	while (offset < RADIX_TREE_MAP_SIZE) {
		if (rcu_dereference_raw(node->slots[offset]) != ptr)
			break;
		if (count < 0) {
			node->slots[offset] = NULL;
			node->count--;
		}
		node->exceptional += exceptional;
		offset++;
	}
#endif
}

static void replace_slot(void __rcu **slot, void *item,
		struct radix_tree_node *node, int count, int exceptional)
{
	if (WARN_ON_ONCE(radix_tree_is_internal_node(item)))
		return;

	if (node && (count || exceptional)) {
		node->count += count;
		node->exceptional += exceptional;
		replace_sibling_entries(node, slot, count, exceptional);
	}

	rcu_assign_pointer(*slot, item);
}

static bool node_tag_get(const struct radix_tree_root *root,
				const struct radix_tree_node *node,
				unsigned int tag, unsigned int offset)
{
	if (node)
		return tag_get(node, tag, offset);
	return root_tag_get(root, tag);
}

/*
 * IDR users want to be able to store NULL in the tree, so if the slot isn't
 * free, don't adjust the count, even if it's transitioning between NULL and
 * non-NULL.  For the IDA, we mark slots as being IDR_FREE while they still
 * have empty bits, but it only stores NULL in slots when they're being
 * deleted.
 */
static int calculate_count(struct radix_tree_root *root,
				struct radix_tree_node *node, void __rcu **slot,
				void *item, void *old)
{
	if (is_idr(root)) {
		unsigned offset = get_slot_offset(node, slot);
		bool free = node_tag_get(root, node, IDR_FREE, offset);
		if (!free)
			return 0;
		if (!old)
			return 1;
	}
	return !!item - !!old;
}

/**
 * __radix_tree_replace		- replace item in a slot
 * @root:		radix tree root
 * @node:		pointer to tree node
 * @slot:		pointer to slot in @node
 * @item:		new item to store in the slot.
 * @update_node:	callback for changing leaf nodes
 * @private:		private data to pass to @update_node
 *
 * For use with __radix_tree_lookup().  Caller must hold tree write locked
 * across slot lookup and replacement.
 */
void __radix_tree_replace(struct radix_tree_root *root,
			  struct radix_tree_node *node,
			  void __rcu **slot, void *item,
			  radix_tree_update_node_t update_node, void *private)
{
	void *old = rcu_dereference_raw(*slot);
	int exceptional = !!radix_tree_exceptional_entry(item) -
				!!radix_tree_exceptional_entry(old);
	int count =