/*
 * linux/kernel/time/clocksource.c
 *
 * This file contains the functions which manage clocksource drivers.
 *
 * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * TODO WishList:
 *   o Allow clocksource drivers to be unregistered
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/device.h>
#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
#include <linux/tick.h>
#include <linux/kthread.h>
#include <linux/kallsyms.h>

#include "tick-internal.h"
#include "timekeeping_internal.h"

/**
 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
 * @mult:	pointer to mult variable
 * @shift:	pointer to shift variable
 * @from:	frequency to convert from
 * @to:		frequency to convert to
 * @maxsec:	guaranteed runtime conversion range in seconds
 *
 * The function evaluates the shift/mult pair for the scaled math
 * operations of clocksources and clockevents.
 *
 * @to and @from are frequency values in HZ. For clock sources @to is
 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
 * event sources @to is the counter frequency and @from is NSEC_PER_SEC.
 *
 * The @maxsec conversion range argument controls the time frame in
 * seconds which must be covered by the runtime conversion with the
 * calculated mult and shift factors. This guarantees that no 64bit
 * overflow happens when the input value of the conversion is
 * multiplied with the calculated mult factor. Larger ranges may
 * reduce the conversion accuracy by choosing smaller mult and shift
 * factors.
 */
void
clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
{
	u64 tmp;
	u32 sft, sftacc = 32;

	/*
	 * Calculate the shift factor which is limiting the conversion
	 * range:
	 */
	tmp = ((u64)maxsec * from) >> 32;
	while (tmp) {
		tmp >>= 1;
		sftacc--;
	}

	/*
	 * Find the conversion shift/mult pair which has the best
	 * accuracy and fits the maxsec conversion range:
	 */
	for (sft = 32; sft > 0; sft--) {
		tmp = (u64) to << sft;
		tmp += from / 2;
		do_div(tmp, from);
		if ((tmp >> sftacc) == 0)
			break;
	}
	*mult = tmp;
	*shift = sft;
}
EXPORT_SYMBOL_GPL(clocks_calc_mult_shift);
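
/*
 * Illustrative sketch (not part of the original file): for a hypothetical
 * 19.2 MHz counter that should be converted to nanoseconds and stay
 * overflow-free for at least 600 seconds, a caller would do roughly:
 *
 *	u32 mult, shift;
 *
 *	clocks_calc_mult_shift(&mult, &shift, 19200000, NSEC_PER_SEC, 600);
 *	ns = ((u64)cycles * mult) >> shift;
 *
 * A larger maxsec forces smaller mult/shift values and therefore slightly
 * coarser conversion accuracy.
 */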

/*[Clocksource internal variables]---------
 * curr_clocksource:
 *	currently selected clocksource.
 * clocksource_list:
 *	linked list with the registered clocksources
 * clocksource_mutex:
 *	protects manipulations to curr_clocksource and the clocksource_list
 * override_name:
 *	Name of the user-specified clocksource.
 */
static struct clocksource *curr_clocksource;
static LIST_HEAD(clocksource_list);
static DEFINE_MUTEX(clocksource_mutex);
static char override_name[CS_NAME_LEN];
static int finished_booting;

#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
static void clocksource_select(void);

static LIST_HEAD(watchdog_list);
static struct clocksource *watchdog;
static struct timer_list watchdog_timer;
static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;

static int clocksource_watchdog_kthread(void *data);
static void __clocksource_change_rating(struct clocksource *cs, int rating);

/*
 * Interval: 0.5sec Threshold: 0.0625s
 */
#define WATCHDOG_INTERVAL (HZ >> 1)
#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
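
/*
 * Worked numbers for the two defines above (illustrative only): with a
 * hypothetical HZ of 250, WATCHDOG_INTERVAL is 250 >> 1 = 125 jiffies,
 * i.e. 0.5 s between watchdog runs, and WATCHDOG_THRESHOLD is
 * NSEC_PER_SEC >> 4 = 62500000 ns, i.e. a clocksource is declared unstable
 * once it drifts more than 62.5 ms from the watchdog within one interval.
 */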

static void clocksource_watchdog_work(struct work_struct *work)
{
	/*
	 * If kthread_run fails the next watchdog scan over the
	 * watchdog_list will find the unstable clock again.
	 */
	kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
}

static void __clocksource_unstable(struct clocksource *cs)
{
	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
	cs->flags |= CLOCK_SOURCE_UNSTABLE;

	if (cs->mark_unstable)
		cs->mark_unstable(cs);

	if (finished_booting)
		schedule_work(&watchdog_work);
}

/**
 * clocksource_mark_unstable - mark clocksource unstable via watchdog
 * @cs:		clocksource to be marked unstable
 *
 * This function is called instead of clocksource_change_rating from
 * cpu hotplug code to avoid a deadlock between the clocksource mutex
 * and the cpu hotplug mutex. It defers the update of the clocksource
 * to the watchdog thread.
 */
void clocksource_mark_unstable(struct clocksource *cs)
{
	unsigned long flags;

	spin_lock_irqsave(&watchdog_lock, flags);
	if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
		if (list_empty(&cs->wd_list))
			list_add(&cs->wd_list, &watchdog_list);
		__clocksource_unstable(cs);
	}
	spin_unlock_irqrestore(&watchdog_lock, flags);
}

static void clocksource_watchdog(unsigned long data)
{
	struct clocksource *cs;
	u64 csnow, wdnow, cslast, wdlast, delta;
	int64_t wd_nsec, cs_nsec;
	int next_cpu, reset_pending;
#ifdef CONFIG_IPIPE
	u64 wdref;
#endif

	spin_lock(&watchdog_lock);
	if (!watchdog_running)
		goto out;

	reset_pending = atomic_read(&watchdog_reset_pending);

	list_for_each_entry(cs, &watchdog_list, wd_list) {

		/* Clocksource already marked unstable? */
		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
			if (finished_booting)
				schedule_work(&watchdog_work);
			continue;
		}

#ifdef CONFIG_IPIPE
retry:
#endif
		local_irq_disable();
#ifdef CONFIG_IPIPE
		wdref = watchdog->read(watchdog);
#endif
		csnow = cs->read(cs);
		wdnow = watchdog->read(watchdog);
		local_irq_enable();

#ifdef CONFIG_IPIPE
		wd_nsec = clocksource_cyc2ns((wdnow - wdref) & watchdog->mask,
					     watchdog->mult, watchdog->shift);
		if (wd_nsec > WATCHDOG_THRESHOLD)
			goto retry;
#endif

		/* Clocksource initialized? */
		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
		    atomic_read(&watchdog_reset_pending)) {
			cs->flags |= CLOCK_SOURCE_WATCHDOG;
			cs->wd_last = wdnow;
			cs->cs_last = csnow;
			continue;
		}

		delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
		wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
					     watchdog->shift);

		delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
		cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
		wdlast = cs->wd_last; /* save these in case we print them */
		cslast = cs->cs_last;
		cs->cs_last = csnow;
		cs->wd_last = wdnow;

		if (atomic_read(&watchdog_reset_pending))
			continue;

		/* Check the deviation from the watchdog clocksource. */
		if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
			pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
				smp_processor_id(), cs->name);
			pr_warn("                      '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
				watchdog->name, wdnow, wdlast, watchdog->mask);
			pr_warn("                      '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
				cs->name, csnow, cslast, cs->mask);
			__clocksource_unstable(cs);
			continue;
		}

		if (cs == curr_clocksource && cs->tick_stable)
			cs->tick_stable(cs);

		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
			/* Mark it valid for high-res. */
			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;

			/*
			 * clocksource_done_booting() will sort it if
			 * finished_booting is not set yet.
			 */
			if (!finished_booting)
				continue;

			/*
			 * If this is not the current clocksource let
			 * the watchdog thread reselect it. Due to the
			 * change to high res this clocksource might
			 * be preferred now. If it is the current
			 * clocksource let the tick code know about
			 * that change.
			 */
			if (cs != curr_clocksource) {
				cs->flags |= CLOCK_SOURCE_RESELECT;
				schedule_work(&watchdog_work);
			} else {
				tick_clock_notify();
			}
		}
	}

	/*
	 * We only clear watchdog_reset_pending when we did a full cycle
	 * through all clocksources.
	 */
	if (reset_pending)
		atomic_dec(&watchdog_reset_pending);

	/*
	 * Cycle through CPUs to check if the CPUs stay synchronized
	 * to each other.
	 */
	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
	if (next_cpu >= nr_cpu_ids)
		next_cpu = cpumask_first(cpu_online_mask);
	watchdog_timer.expires += WATCHDOG_INTERVAL;
	add_timer_on(&watchdog_timer, next_cpu);
out:
	spin_unlock(&watchdog_lock);
}

static inline void clocksource_start_watchdog(void)
{
	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
		return;
	init_timer(&watchdog_timer);
	watchdog_timer.function = clocksource_watchdog;
	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
	watchdog_running = 1;
}

static inline void clocksource_stop_watchdog(void)
{
	if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
		return;
	del_timer(&watchdog_timer);
	watchdog_running = 0;
}

static inline void clocksource_reset_watchdog(void)
{
	struct clocksource *cs;

	list_for_each_entry(cs, &watchdog_list, wd_list)
		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
}

static void clocksource_resume_watchdog(void)
{
	atomic_inc(&watchdog_reset_pending);
}

static void clocksource_enqueue_watchdog(struct clocksource *cs)
{
	unsigned long flags;

	spin_lock_irqsave(&watchdog_lock, flags);
	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
		/* cs is a clocksource to be watched. */
		list_add(&cs->wd_list, &watchdog_list);
		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
	} else {
		/* cs is a watchdog. */
		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
	}
	spin_unlock_irqrestore(&watchdog_lock, flags);
}

static void clocksource_select_watchdog(bool fallback)
{
	struct clocksource *cs, *old_wd;
	unsigned long flags;

	spin_lock_irqsave(&watchdog_lock, flags);
	/* save current watchdog */
	old_wd = watchdog;
	if (fallback)
		watchdog = NULL;

	list_for_each_entry(cs, &clocksource_list, list) {
		/* cs is a clocksource to be watched. */
		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
			continue;

		/* Skip current if we were requested for a fallback. */
		if (fallback && cs == old_wd)
			continue;

		/* Pick the best watchdog. */
		if (!watchdog || cs->rating > watchdog->rating)
			watchdog = cs;
	}
	/* If we failed to find a fallback restore the old one. */
	if (!watchdog)
		watchdog = old_wd;

	/* If we changed the watchdog we need to reset cycles. */
	if (watchdog != old_wd)
		clocksource_reset_watchdog();

	/* Check if the watchdog timer needs to be started. */
	clocksource_start_watchdog();
	spin_unlock_irqrestore(&watchdog_lock, flags);
}

static void clocksource_dequeue_watchdog(struct clocksource *cs)
{
	unsigned long flags;

	spin_lock_irqsave(&watchdog_lock, flags);
	if (cs != watchdog) {
		if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
			/* cs is a watched clocksource. */
			list_del_init(&cs->wd_list);
			/* Check if the watchdog timer needs to be stopped. */
			clocksource_stop_watchdog();
		}
	}
	spin_unlock_irqrestore(&watchdog_lock, flags);
}

static int __clocksource_watchdog_kthread(void)
{
	struct clocksource *cs, *tmp;
	unsigned long flags;
	LIST_HEAD(unstable);
	int select = 0;

	spin_lock_irqsave(&watchdog_lock, flags);
	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
			list_del_init(&cs->wd_list);
			list_add(&cs->wd_list, &unstable);
			select = 1;
		}
		if (cs->flags & CLOCK_SOURCE_RESELECT) {
			cs->flags &= ~CLOCK_SOURCE_RESELECT;
			select = 1;
		}
	}
	/* Check if the watchdog timer needs to be stopped. */
	clocksource_stop_watchdog();
	spin_unlock_irqrestore(&watchdog_lock, flags);

	/* Needs to be done outside of watchdog lock */
	list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
		list_del_init(&cs->wd_list);
		__clocksource_change_rating(cs, 0);
	}
	return select;
}

static int clocksource_watchdog_kthread(void *data)
{
	mutex_lock(&clocksource_mutex);
	if (__clocksource_watchdog_kthread())
		clocksource_select();
	mutex_unlock(&clocksource_mutex);
	return 0;
}

static bool clocksource_is_watchdog(struct clocksource *cs)
{
	return cs == watchdog;
}

#else /* CONFIG_CLOCKSOURCE_WATCHDOG */

static void clocksource_enqueue_watchdog(struct clocksource *cs)
{
	if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
		cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}

static void clocksource_select_watchdog(bool fallback) { }
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
static inline int __clocksource_watchdog_kthread(void) { return 0; }
static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
void clocksource_mark_unstable(struct clocksource *cs) { }

#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */

/**
 * clocksource_suspend - suspend the clocksource(s)
 */
void clocksource_suspend(void)
{
	struct clocksource *cs;

	list_for_each_entry_reverse(cs, &clocksource_list, list)
		if (cs->suspend)
			cs->suspend(cs);
}

/**
 * clocksource_resume - resume the clocksource(s)
 */
void clocksource_resume(void)
{
	struct clocksource *cs;

	list_for_each_entry(cs, &clocksource_list, list)
		if (cs->resume)
			cs->resume(cs);

	clocksource_resume_watchdog();
}

/**
 * clocksource_touch_watchdog - Update watchdog
 *
 * Update the watchdog after exception contexts such as kgdb so as not
 * to incorrectly trip the watchdog. This might fail when the kernel
 * was stopped in code which holds watchdog_lock.
 */
void clocksource_touch_watchdog(void)
{
	clocksource_resume_watchdog();
}

/**
 * clocksource_max_adjustment- Returns max adjustment amount
 * @cs:         Pointer to clocksource
 *
 */
static u32 clocksource_max_adjustment(struct clocksource *cs)
{
	u64 ret;
	/*
	 * We won't try to correct for more than 11% adjustments (110,000 ppm),
	 */
	ret = (u64)cs->mult * 11;
	do_div(ret, 100);
	return (u32)ret;
}
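
/*
 * Example (illustrative arithmetic, not from the original source): for a
 * clocksource with mult = 4000000, the function above returns
 * 4000000 * 11 / 100 = 440000, so frequency correction may move the
 * effective mult anywhere within 4000000 +/- 440000.
 */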

/**
 * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
 * @mult:	cycle to nanosecond multiplier
 * @shift:	cycle to nanosecond divisor (power of two)
 * @maxadj:	maximum adjustment value to mult (~11%)
 * @mask:	bitmask for two's complement subtraction of non 64 bit counters
 * @max_cyc:	maximum cycle value before potential overflow (does not include
 *		any safety margin)
 *
 * NOTE: This function includes a safety margin of 50%, in other words, we
 * return half the number of nanoseconds the hardware counter can technically
 * cover. This is done so that we can potentially detect problems caused by
 * delayed timers or bad hardware, which might result in time intervals that
 * are larger than what the math used can handle without overflows.
 */
u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc)
{
	u64 max_nsecs, max_cycles;

	/*
	 * Calculate the maximum number of cycles that we can pass to the
	 * cyc2ns() function without overflowing a 64-bit result.
	 */
	max_cycles = ULLONG_MAX;
	do_div(max_cycles, mult + maxadj);

	/*
	 * The actual maximum number of cycles we can defer the clocksource is
	 * determined by the minimum of max_cycles and mask.
	 * Note: Here we subtract the maxadj to make sure we don't sleep for
	 * too long if there's a large negative adjustment.
	 */
	max_cycles = min(max_cycles, mask);
	max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);

	/* return the max_cycles value as well if requested */
	if (max_cyc)
		*max_cyc = max_cycles;

	/* Return 50% of the actual maximum, so we can detect bad values */
	max_nsecs >>= 1;

	return max_nsecs;
}
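
/*
 * Rough example of what the 50% margin means in practice (illustrative
 * numbers, not taken from this file): a 32-bit counter running at 1 GHz
 * wraps after about 4.29 s. With mult = 1 << 24, shift = 24 and the ~11%
 * maxadj headroom subtracted, the conversion above yields roughly 3.8e9 ns,
 * and halving it gives a max_idle_ns in the neighbourhood of 1.9 s.
 */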

/**
 * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles
 * @cs:         Pointer to clocksource to be updated
 *
 */
static inline void clocksource_update_max_deferment(struct clocksource *cs)
{
	cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift,
						cs->maxadj, cs->mask,
						&cs->max_cycles);
}

#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET

static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
{
	struct clocksource *cs;

	if (!finished_booting || list_empty(&clocksource_list))
		return NULL;

	/*
	 * We pick the clocksource with the highest rating. If oneshot
	 * mode is active, we pick the highres valid clocksource with
	 * the best rating.
	 */
	list_for_each_entry(cs, &clocksource_list, list) {
		if (skipcur && cs == curr_clocksource)
			continue;
		if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
			continue;
		return cs;
	}
	return NULL;
}

static void __clocksource_select(bool skipcur)
{
	bool oneshot = tick_oneshot_mode_active();
	struct clocksource *best, *cs;

	/* Find the best suitable clocksource */
	best = clocksource_find_best(oneshot, skipcur);
	if (!best)
		return;

	/* Check for the override clocksource. */
	list_for_each_entry(cs, &clocksource_list, list) {
		if (skipcur && cs == curr_clocksource)
			continue;
		if (strcmp(cs->name, override_name) != 0)
			continue;
		/*
		 * Check to make sure we don't switch to a non-highres
		 * capable clocksource if the tick code is in oneshot
		 * mode (highres or nohz)
		 */
		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
			/* Override clocksource cannot be used. */
			if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
				pr_warn("Override clocksource %s is unstable and not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
					cs->name);
				override_name[0] = 0;
			} else {
				/*
				 * The override cannot be currently verified.
				 * Deferring to let the watchdog check.
				 */
				pr_info("Override clocksource %s is not currently HRT compatible - deferring\n",
					cs->name);
			}
		} else
			/* Override clocksource can be used. */
			best = cs;
		break;
	}

	if (curr_clocksource != best && !timekeeping_notify(best)) {
		pr_info("Switched to clocksource %s\n", best->name);
		curr_clocksource = best;
	}
}

/**
 * clocksource_select - Select the best clocksource available
 *
 * Private function. Must hold clocksource_mutex when called.
 *
 * Select the clocksource with the best rating, or the clocksource,
 * which is selected by userspace override.
 */
static void clocksource_select(void)
{
	__clocksource_select(false);
}

static void clocksource_select_fallback(void)
{
	__clocksource_select(true);
}

#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */
static inline void clocksource_select(void) { }
static inline void clocksource_select_fallback(void) { }

#endif

/*
 * clocksource_done_booting - Called near the end of core bootup
 *
 * Hack to avoid lots of clocksource churn at boot time.
 * We use fs_initcall because we want this to start before
 * device_initcall but after subsys_initcall.
 */
static int __init clocksource_done_booting(void)
{
	mutex_lock(&clocksource_mutex);
	curr_clocksource = clocksource_default_clock();
	finished_booting = 1;
	/*
	 * Run the watchdog first to eliminate unstable clock sources
	 */
	__clocksource_watchdog_kthread();
	clocksource_select();
	mutex_unlock(&clocksource_mutex);
	return 0;
}
fs_initcall(clocksource_done_booting);

#ifdef CONFIG_IPIPE_WANT_CLOCKSOURCE
unsigned long long __ipipe_cs_freq;
EXPORT_SYMBOL_GPL(__ipipe_cs_freq);

struct clocksource *__ipipe_cs;
EXPORT_SYMBOL_GPL(__ipipe_cs);

u64 (*__ipipe_cs_read)(struct clocksource *cs);
u64 __ipipe_cs_last_tsc;
u64 __ipipe_cs_mask;
unsigned __ipipe_cs_lat = 0xffffffff;

static void ipipe_check_clocksource(struct clocksource *cs)
{
	u64 (*cread)(struct clocksource *cs);
	u64 lat, mask, saved;
	unsigned long long freq;
	unsigned long flags;
	unsigned i;

	if (cs->ipipe_read) {
		mask = CLOCKSOURCE_MASK(64);
		cread = cs->ipipe_read;
	} else {
		mask = cs->mask;
		cread = cs->read;

		if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) == 0)
			return;

		/*
		 * We only support masks such that cs->mask + 1 is a power of 2,
		 * 64-bit masks, or masks smaller than 32 bits
		 */
		if (mask != CLOCKSOURCE_MASK(64)
		    && ((mask & (mask + 1)) != 0 || mask > 0xffffffff))
			return;
	}

	/*
	 * We prefer a clocksource with a better resolution than 1us
	 */
	if (cs->shift <= 34) {
		freq = 1000000000ULL << cs->shift;
		do_div(freq, cs->mult);
	} else {
		freq = 1000000ULL << cs->shift;
		do_div(freq, cs->mult);
		freq *= 1000;
	}
	if (freq < 1000000)
		return;

	/* Measure the clocksource latency */
	flags = hard_local_irq_save();
	saved = __ipipe_cs_last_tsc;
	lat = cread(cs);
	for (i = 0; i < 10; i++)
		cread(cs);
	lat = cread(cs) - lat;
	__ipipe_cs_last_tsc = saved;
	hard_local_irq_restore(flags);
	lat = (lat * cs->mult) >> cs->shift;
	do_div(lat, i + 1);

	if (!strcmp(cs->name, override_name))
		goto skip_tests;

	if (lat > __ipipe_cs_lat)
		return;

	if (__ipipe_cs && !strcmp(__ipipe_cs->name, override_name))
		return;

  skip_tests:
	flags = hard_local_irq_save();
	if (__ipipe_cs_last_tsc == 0) {
		__ipipe_cs_lat = lat;
		__ipipe_cs_freq = freq;
		__ipipe_cs = cs;
		__ipipe_cs_read = cread;
		__ipipe_cs_mask = mask;
	}
	hard_local_irq_restore(flags);
}
#else /* !CONFIG_IPIPE_WANT_CLOCKSOURCE */
#define ipipe_check_clocksource(cs)	do { } while (0)
#endif /* !CONFIG_IPIPE_WANT_CLOCKSOURCE */

/*
 * Enqueue the clocksource sorted by rating
 */
static void clocksource_enqueue(struct clocksource *cs)
{
	struct list_head *entry = &clocksource_list;
	struct clocksource *tmp;

	list_for_each_entry(tmp, &clocksource_list, list) {
		/* Keep track of the place where to insert */
		if (tmp->rating < cs->rating)
			break;
		entry = &tmp->list;
	}
	list_add(&cs->list, entry);

	ipipe_check_clocksource(cs);
}

/**
 * __clocksource_update_freq_scale - Used to update the clocksource with a new freq
 * @cs:		clocksource to be registered
 * @scale:	Scale factor multiplied against freq to get clocksource hz
 * @freq:	clocksource frequency (cycles per second) divided by scale
 *
 * This should only be called from the clocksource->enable() method.
 *
 * This *SHOULD NOT* be called directly! Please use the
 * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper
 * functions.
 */
void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
	u64 sec;

	/*
	 * Default clocksources are *special* and self-define their mult/shift.
	 * But, you're not special, so you should specify a freq value.
	 */
	if (freq) {
		/*
		 * Calc the maximum number of seconds which we can run before
		 * wrapping around. For clocksources which have a mask > 32-bit
		 * we need to limit the max sleep time to have a good
		 * conversion precision. 10 minutes is still a reasonable
		 * amount. That results in a shift value of 24 for a
		 * clocksource with mask >= 40-bit and f >= 4GHz. That maps to
		 * ~ 0.06ppm granularity for NTP.
		 */
		sec = cs->mask;
		do_div(sec, freq);
		do_div(sec, scale);
		if (!sec)
			sec = 1;
		else if (sec > 600 && cs->mask > UINT_MAX)
			sec = 600;

		clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
				       NSEC_PER_SEC / scale, sec * scale);
	}
	/*
	 * Ensure clocksources that have large 'mult' values don't overflow
	 * when adjusted.
	 */
	cs->maxadj = clocksource_max_adjustment(cs);
	while (freq && ((cs->mult + cs->maxadj < cs->mult)
		|| (cs->mult - cs->maxadj > cs->mult))) {
		cs->mult >>= 1;
		cs->shift--;
		cs->maxadj = clocksource_max_adjustment(cs);
	}

	/*
	 * Only warn for *special* clocksources that self-define
	 * their mult/shift values and don't specify a freq.
	 */
	WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
		"timekeeping: Clocksource %s might overflow on 11%% adjustment\n",
		cs->name);

	clocksource_update_max_deferment(cs);

	pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
		cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
}
EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);

/**
 * __clocksource_register_scale - Used to install new clocksources
 * @cs:		clocksource to be registered
 * @scale:	Scale factor multiplied against freq to get clocksource hz
 * @freq:	clocksource frequency (cycles per second) divided by scale
 *
 * Returns -EBUSY if registration fails, zero otherwise.
 *
 * This *SHOULD NOT* be called directly! Please use the
 * clocksource_register_hz() or clocksource_register_khz helper functions.
 */
int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
{

	/* Initialize mult/shift and max_idle_ns */
	__clocksource_update_freq_scale(cs, scale, freq);

	/* Add clocksource to the clocksource list */
	mutex_lock(&clocksource_mutex);
	clocksource_enqueue(cs);
	clocksource_enqueue_watchdog(cs);
	clocksource_select();
	clocksource_select_watchdog(false);
	mutex_unlock(&clocksource_mutex);
	return 0;
}
EXPORT_SYMBOL_GPL(__clocksource_register_scale);
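
/*
 * Typical driver-side usage is through the clocksource_register_hz()/
 * clocksource_register_khz() wrappers rather than this function directly.
 * A minimal sketch (hypothetical driver, illustrative values only):
 *
 *	static u64 foo_read(struct clocksource *cs)
 *	{
 *		return readl(foo_counter_base);	// hypothetical MMIO counter
 *	}
 *
 *	static struct clocksource foo_cs = {
 *		.name	= "foo-timer",
 *		.rating	= 300,
 *		.read	= foo_read,
 *		.mask	= CLOCKSOURCE_MASK(32),
 *		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
 *	};
 *
 *	clocksource_register_hz(&foo_cs, 19200000);
 *
 * which ends up in __clocksource_register_scale(&foo_cs, 1, 19200000).
 */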

static void __clocksource_change_rating(struct clocksource *cs, int rating)
{
	list_del(&cs->list);
	cs->rating = rating;
	clocksource_enqueue(cs);
}

/**
 * clocksource_change_rating - Change the rating of a registered clocksource
 * @cs:		clocksource to be changed
 * @rating:	new rating
 */
void clocksource_change_rating(struct clocksource *cs, int rating)
{
	mutex_lock(&clocksource_mutex);
	__clocksource_change_rating(cs, rating);
	clocksource_select();
	clocksource_select_watchdog(false);
	mutex_unlock(&clocksource_mutex);
}
EXPORT_SYMBOL(clocksource_change_rating);

/*
 * Unbind clocksource @cs. Called with clocksource_mutex held
 */
static int clocksource_unbind(struct clocksource *cs)
{
	if (clocksource_is_watchdog(cs)) {
		/* Select and try to install a replacement watchdog. */
		clocksource_select_watchdog(true);
		if (clocksource_is_watchdog(cs))
			return -EBUSY;
	}

	if (cs == curr_clocksource) {
		/* Select and try to install a replacement clock source */
		clocksource_select_fallback();
		if (curr_clocksource == cs)
			return -EBUSY;
	}
	clocksource_dequeue_watchdog(cs);
	list_del_init(&cs->list);
	return 0;
}

/**
 * clocksource_unregister - remove a registered clocksource
 * @cs:	clocksource to be unregistered
 */
int clocksource_unregister(struct clocksource *cs)
{
	int ret = 0;

	mutex_lock(&clocksource_mutex);
	if (!list_empty(&cs->list))
		ret = clocksource_unbind(cs);
	mutex_unlock(&clocksource_mutex);
	return ret;
}
EXPORT_SYMBOL(clocksource_unregister);

#ifdef CONFIG_SYSFS
/**
 * sysfs_show_current_clocksources - sysfs interface for current clocksource
 * @dev:	unused
 * @attr:	unused
 * @buf:	char buffer to be filled with clocksource list
 *
 * Provides sysfs interface for listing current clocksource.
 */
static ssize_t
sysfs_show_current_clocksources(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	ssize_t count = 0;

	mutex_lock(&clocksource_mutex);
	count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
	mutex_unlock(&clocksource_mutex);

	return count;
}

ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
{
	size_t ret = cnt;

	/* strings from sysfs write are not 0 terminated! */
	if (!cnt || cnt >= CS_NAME_LEN)
		return -EINVAL;

	/* strip off \n: */
	if (buf[cnt-1] == '\n')
		cnt--;
	if (cnt > 0)
		memcpy(dst, buf, cnt);
	dst[cnt] = 0;
	return ret;
}

/**
 * sysfs_override_clocksource - interface for manually overriding clocksource
 * @dev:	unused
 * @attr:	unused
 * @buf:	name of override clocksource
 * @count:	length of buffer
 *
 * Takes input from sysfs interface for manually overriding the default
 * clocksource selection.
 */
static ssize_t sysfs_override_clocksource(struct device *dev,
					  struct device_attribute *attr,
					  const char *buf, size_t count)
{
	ssize_t ret;

	mutex_lock(&clocksource_mutex);

	ret = sysfs_get_uname(buf, override_name, count);
	if (ret >= 0)
		clocksource_select();

	mutex_unlock(&clocksource_mutex);

	return ret;
}

/**
 * sysfs_unbind_current_clocksource - interface for manually unbinding clocksource
 * @dev:	unused
 * @attr:	unused
 * @buf:	unused
 * @count:	length of buffer
 *
 * Takes input from sysfs interface for manually unbinding a clocksource.
 */
static ssize_t sysfs_unbind_clocksource(struct device *dev,
					struct device_attribute *attr,
					const char *buf, size_t count)
{
	struct clocksource *cs;
	char name[CS_NAME_LEN];
	ssize_t ret;

	ret = sysfs_get_uname(buf, name, count);
	if (ret < 0)
		return ret;

	ret = -ENODEV;
	mutex_lock(&clocksource_mutex);
	list_for_each_entry(cs, &clocksource_list, list) {
		if (strcmp(cs->name, name))
			continue;
		ret = clocksource_unbind(cs);
		break;
	}
	mutex_unlock(&clocksource_mutex);

	return ret ? ret : count;
}

/**
 * sysfs_show_available_clocksources - sysfs interface for listing clocksource
 * @dev:	unused
 * @attr:	unused
 * @buf:	char buffer to be filled with clocksource list
 *
 * Provides sysfs interface for listing registered clocksources
 */
static ssize_t
sysfs_show_available_clocksources(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct clocksource *src;
	ssize_t count = 0;

	mutex_lock(&clocksource_mutex);