Commit f5a67a24 authored by Philippe Gerum, committed by Jan Kiszka

cobalt/sched: improve watchdog accuracy

The original watchdog mechanism was based on a sampling method: every
second (built-in value), it used to check the runtime mode of the
current task preempted on the ticking CPU. A per-cpu counter was
increased by one every time rt/primary mode was detected, then checked
against the trigger limit (CONFIG_XENO_OPT_WATCHDOG_TIMEOUT).
Otherwise, the counter was reset to zero.

With this fairly naive approach, it only takes a single hit with
CONFIG_XENO_OPT_WATCHDOG_TIMEOUT=1 to trigger the watchdog, i.e. if
the system-fixed 1s watchdog tick preempts any Xenomai task when it is
running in primary mode on the current CPU, the watchdog fires.

The default value of 4s papered over the inherent imprecision of such
a coarse-grained method, lengthening the odds of observing false
positive triggers.

To improve the accuracy of the watchdog, arm the watchdog timer to
fire at the final trigger date directly, right before switching the
CPU to primary mode (leave_root()), disarming it when the CPU is about
to switch back to secondary mode (enter_root()).

Better accuracy comes at the expense of slightly more overhead when
transitioning between primary and secondary modes, which should be
acceptable for a debug feature which is not affecting the hot path
anyway (i.e. there is no added cost for strictly rt context switches).
Signed-off-by: Philippe Gerum <rpm@xenomai.org>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
parent b4cd0f72
......@@ -104,8 +104,6 @@ struct xnsched {
#ifdef CONFIG_XENO_OPT_WATCHDOG
/*!< Watchdog timer object. */
struct xntimer wdtimer;
/*!< Watchdog tick count. */
int wdcount;
#endif
#ifdef CONFIG_XENO_OPT_STATS
/*!< Last account switch date (ticks). */
......@@ -387,17 +385,6 @@ xnsched_maybe_resched_after_unlocked_switch(struct xnsched *sched)
#endif /* !CONFIG_IPIPE_WANT_PREEMPTIBLE_SWITCH */
#ifdef CONFIG_XENO_OPT_WATCHDOG
static inline void xnsched_reset_watchdog(struct xnsched *sched)
{
sched->wdcount = 0;
}
#else /* !CONFIG_XENO_OPT_WATCHDOG */
static inline void xnsched_reset_watchdog(struct xnsched *sched)
{
}
#endif /* CONFIG_XENO_OPT_WATCHDOG */
bool xnsched_set_effective_priority(struct xnthread *thread,
int prio);
......
......@@ -93,14 +93,20 @@ void xnsched_register_classes(void)
static unsigned long wd_timeout_arg = CONFIG_XENO_OPT_WATCHDOG_TIMEOUT;
module_param_named(watchdog_timeout, wd_timeout_arg, ulong, 0644);
static inline xnticks_t get_watchdog_timeout(void)
{
return wd_timeout_arg * 1000000000ULL;
}
/**
* @internal
* @fn void watchdog_handler(struct xntimer *timer)
* @brief Process watchdog ticks.
*
* This internal routine handles incoming watchdog ticks to detect
* software lockups. It kills any offending thread which is found to
* monopolize the CPU so as to starve the Linux kernel for too long.
* This internal routine handles incoming watchdog triggers to detect
* software lockups. It forces the offending thread to stop
* monopolizing the CPU, either by kicking it out of primary mode if
* running in user space, or cancelling it if kernel-based.
*
* @coretags{coreirq-only, atomic-entry}
*/
......@@ -109,13 +115,14 @@ static void watchdog_handler(struct xntimer *timer)
struct xnsched *sched = xnsched_current();
struct xnthread *curr = sched->curr;
if (likely(xnthread_test_state(curr, XNROOT))) {
xnsched_reset_watchdog(sched);
return;
}
if (likely(++sched->wdcount < wd_timeout_arg))
return;
/*
* CAUTION: The watchdog tick might have been delayed while we
* were busy switching the CPU to secondary mode at the
* trigger date eventually. Make sure that we are not about to
* kick the incoming root thread.
*/
if (xnthread_test_state(curr, XNROOT))
return;
trace_cobalt_watchdog_signal(curr);
......@@ -137,8 +144,6 @@ static void watchdog_handler(struct xntimer *timer)
*/
xnthread_set_info(curr, XNKICKED|XNCANCELD);
}
xnsched_reset_watchdog(sched);
}
#endif /* CONFIG_XENO_OPT_WATCHDOG */
......@@ -908,6 +913,9 @@ static inline void enter_root(struct xnthread *root)
{
struct xnarchtcb *rootcb __maybe_unused = xnthread_archtcb(root);
#ifdef CONFIG_XENO_OPT_WATCHDOG
xntimer_stop(&root->sched->wdtimer);
#endif
#ifdef CONFIG_IPIPE_WANT_PREEMPTIBLE_SWITCH
if (rootcb->core.mm == NULL)
set_ti_thread_flag(rootcb->core.tip, TIF_MMSWITCH_INT);
......@@ -926,6 +934,11 @@ static inline void leave_root(struct xnthread *root)
rootcb->core.mm = rootcb->core.active_mm = ipipe_get_active_mm();
rootcb->core.tip = task_thread_info(p);
xnarch_leave_root(root);
#ifdef CONFIG_XENO_OPT_WATCHDOG
xntimer_start(&root->sched->wdtimer, get_watchdog_timeout(),
XN_INFINITE, XN_RELATIVE);
#endif
}
void __xnsched_run_handler(void) /* hw interrupts off. */
......@@ -983,9 +996,6 @@ reschedule:
trace_cobalt_switch_context(prev, next);
if (xnthread_test_state(next, XNROOT))
xnsched_reset_watchdog(sched);
sched->curr = next;
shadow = 1;
......
......@@ -894,10 +894,6 @@ int xntimer_grab_hardware(void)
else if (ret == 1)
xntimer_start(&sched->htimer, 0, 0, XN_RELATIVE);
#ifdef CONFIG_XENO_OPT_WATCHDOG
xntimer_start(&sched->wdtimer, 1000000000UL, 1000000000UL, XN_RELATIVE);
xnsched_reset_watchdog(sched);
#endif
xnlock_put_irqrestore(&nklock, s);
}
......@@ -909,9 +905,6 @@ fail:
xnlock_get_irqsave(&nklock, s);
sched = xnsched_struct(cpu);
xntimer_stop(&sched->htimer);
#ifdef CONFIG_XENO_OPT_WATCHDOG
xntimer_stop(&sched->wdtimer);
#endif
xnlock_put_irqrestore(&nklock, s);
ipipe_timer_stop(_cpu);
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment