Linux高精度定時器實現分析 | 碼農故事

老匹夫 2016-01-04

展開全文

hrtimer，是High-resolution kernel timers的縮寫，從字面意思就知道，這是一個高精度內核timer。

HRTIMER用法示例

先調用hrtimer_init初始化，然后設置function回調，例如，此處設置回調為coalesced_timer_fn。

/*
 * hrtimer expiry callback for the coalesced RTC timer.
 *
 * Hands the actual work over to the RTC workqueue and reports the timer as
 * one-shot.
 */
static enum hrtimer_restart coalesced_timer_fn(struct hrtimer *timer)
{
	struct kvm_rtc *owner;

	/* Recover the enclosing kvm_rtc from the embedded hrtimer. */
	owner = container_of(timer, struct kvm_rtc, coalesced_timer);

	/* NOTE(review): presumably bails out with HRTIMER_NORESTART while paused - macro not shown here */
	CHECK_PAUSE_RET(owner->pause, HRTIMER_NORESTART);

	queue_work(owner->wq, &owner->ws_coalesced);

	/*
	 * The return value matters: return HRTIMER_RESTART instead to make this
	 * a periodic timer (the next expiry must be set first).
	 */
	return HRTIMER_NORESTART;
}

hrtimer_init(&rtc->coalesced_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);

rtc->coalesced_timer.function = coalesced_timer_fn;

設置超時時間，設置一個超時的ABS時間。超時時間設置了，并不代表timer已經運行，還必須將其加入active隊列，start系列函數就做此事。

hrtimer_forward(&rtc->coalesced_timer, ns_to_ktime(now_time), ns_to_ktime(next_time-now_time));
hrtimer_add_expires_ns(&rtc->coalesced_timer, 1000000000);//1秒

加入active隊列。

hrtimer_start_expires(&rtc->coalesced_timer, HRTIMER_MODE_ABS);
hrtimer_restart(&rtc->coalesced_timer);

從active隊列刪除(如正在回調中，會等待回調運行完成)。

hrtimer_cancel(&rtc->coalesced_timer);

HRTIMER數據結構

Figure 1 數據結構示意圖

/* Index of each clock base kept per CPU. */
enum hrtimer_base_type {
	HRTIMER_BASE_MONOTONIC,
	HRTIMER_BASE_REALTIME,
	HRTIMER_BASE_BOOTTIME,
	HRTIMER_MAX_CLOCK_BASES, /* upper bound; used as the size of the clock_base[] array */
};

* struct hrtimer_cpu_base - the per cpu clock bases

struct hrtimer_cpu_base {

raw_spinlock_t lock; //lock protecting the base and associated clock bases and timers

unsigned int active_bases; //Bitfield to mark bases with active timers

unsigned int clock_was_set; //Indicates that clock was set from irq context.

#ifdef CONFIG_HIGH_RES_TIMERS

ktime_t expires_next; //absolute time of the next event which was scheduled, via clock_set_next_event()

int hres_active; //State of high resolution mode

int hang_detected; //The last hrtimer interrupt detected a hang

unsigned long nr_events; //Total number of hrtimer interrupt events

unsigned long nr_retries; //Total number of hrtimer interrupt retries

unsigned long nr_hangs; //Total number of hrtimer interrupt hangs

ktime_t max_hang_time; //Maximum time spent in hrtimer_interrupt

#endif

struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; //array of clock bases for this cpu

};

/*
 * Per-CPU hrtimer bases with one statically initialised clock base per
 * clock type. Each array element is its own brace-enclosed initializer.
 */
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
	.clock_base =
	{
		{
			.index = HRTIMER_BASE_MONOTONIC,
			.clockid = CLOCK_MONOTONIC,
			.get_time = &ktime_get,			/* monotonic time */
			.resolution = KTIME_LOW_RES,		/* every base starts out in low resolution */
		},
		{
			.index = HRTIMER_BASE_REALTIME,
			.clockid = CLOCK_REALTIME,
			.get_time = &ktime_get_real,		/* get the real (wall-) time, TOD */
			.resolution = KTIME_LOW_RES,
		},
		{
			.index = HRTIMER_BASE_BOOTTIME,
			.clockid = CLOCK_BOOTTIME,
			.get_time = &ktime_get_boottime,	/* monotonic time since boot */
			.resolution = KTIME_LOW_RES,
		},
	}
};

/**

* struct hrtimer_clock_base - the timer base for a specific clock

struct hrtimer_clock_base {

struct hrtimer_cpu_base * cpu_base; //per cpu clock base

int index; //clock type index for per_cpu support when moving a timer to a base on another cpu.

clockid_t clockid; //clock id for per_cpu support

struct timerqueue_head active; //red black tree root node for the active timers, active queue里面存放的，就是hrtimer

ktime_t resolution; //the resolution of the clock, in nanoseconds

ktime_t (*get_time)(void); //function to retrieve the current time of the clock

ktime_t softirq_time; //the time when running the hrtimer queue in the softirq

ktime_t offset; //offset of this clock to the monotonic base

};

/* Node linking one timer into a timerqueue (red-black tree). */
struct timerqueue_node {
	struct rb_node node;
	/*
	 * Article note: this is the expiry actually used when comparing
	 * times (the "hard" expiry); the split from the soft expiry is said
	 * to be a power-saving optimisation.
	 */
	ktime_t expires;
};

/* A single high-resolution timer. */
struct hrtimer {
	struct timerqueue_node node;
	ktime_t _softexpires;	/* the "soft" expiry; its counterpart timerqueue_node->expires is the "hard" expiry */
	enum hrtimer_restart (*function)(struct hrtimer *);	/* expiry callback */
	struct hrtimer_clock_base *base;
	unsigned long state;	/* state bits describing what the timer is currently doing */
#ifdef CONFIG_TIMER_STATS
	int start_pid;
	void *start_site;
	char start_comm[16];
#endif
};

HRTIMER初始化流程

/* Excerpt of start_kernel(): only the timer-related init calls are shown. */
asmlinkage void __init start_kernel(void)
{
...
	init_IRQ();
	init_timers();		/* set up the low-resolution timers */
	hrtimers_init();	/* set up the high-resolution timers */
	softirq_init();
	timekeeping_init();
	time_init();
...
}

/*
 * CPU notifier callback for hrtimers.
 *
 * @self:   the notifier block (unused here)
 * @action: CPU_* event code
 * @hcpu:   the affected CPU number, passed as a pointer-sized integer
 *
 * Always returns NOTIFY_OK.
 */
static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
					unsigned long action, void *hcpu)
{
	int scpu = (long)hcpu;

	switch (action) {

	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		/* initialise the clock bases inside this CPU's hrtimer_cpu_base */
		init_hrtimers_cpu(scpu);
		break;

#ifdef CONFIG_HOTPLUG_CPU /* CPU hotplug support */
	case CPU_DYING:
	case CPU_DYING_FROZEN:
		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	{
		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu);
		/* the CPU is dead: pull its timers over to the current CPU */
		migrate_hrtimers(scpu);
		break;
	}
#endif

	default:
		break;
	}

	return NOTIFY_OK;
}

/* Notifier block that routes CPU events to hrtimer_cpu_notify(). */
static struct notifier_block __cpuinitdata hrtimers_nb = {
	.notifier_call = hrtimer_cpu_notify,
};

/* Boot-time initialisation of the hrtimer subsystem. */
void __init hrtimers_init(void)
{
	/*
	 * Run the CPU_UP_PREPARE step by hand for the boot CPU, which is
	 * already up; the other CPUs get it through the notifier registered
	 * below.
	 */
	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
			  (void *)(long)smp_processor_id());
	/* CPU events will now invoke hrtimer_cpu_notify() */
	register_cpu_notifier(&hrtimers_nb);
#ifdef CONFIG_HIGH_RES_TIMERS
	/* register the hrtimer softirq handler used in high-resolution mode */
	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
#endif
}

* Functions related to boot-time initialization:

//對每一個CPU，初始化和CPU關(guān)聯(lián)的hrtimer_cpu_base結(jié)構(gòu)

static void __cpuinit init_hrtimers_cpu(int cpu)

{

struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);

int i;

//其實，也初始化了hrtimer_cpu_base中的hrtimer_clock_base數(shù)組

//每個clock base，需要將其存放hrtimer的queue初始化好，將回溯指針設(shè)置好

for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {

cpu_base->clock_base[i].cpu_base = cpu_base;

timerqueue_init_head(&cpu_base->clock_base[i].active);

}

hrtimer_init_hres(cpu_base);//例如初始化高精度為未激活狀態(tài)

}

HRTIMER CPU熱插拔支持

熱插拔支持，其核心功能，就是當一個CPU死掉的時候，將其上面的hrtimer遷移到本CPU上來，需要預編譯宏CONFIG_HOTPLUG_CPU。

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Move every hrtimer queued on the offline CPU @scpu to the current CPU.
 *
 * Interrupts are disabled for the whole migration and both cpu bases are
 * locked while the timers are moved.
 */
static void migrate_hrtimers(int scpu)
{
	struct hrtimer_cpu_base *old_base, *new_base;
	int i;

	BUG_ON(cpu_online(scpu));
	tick_cancel_sched_timer(scpu);

	/* interrupts stay off while timers are being moved */
	local_irq_disable();
	old_base = &per_cpu(hrtimer_bases, scpu);
	new_base = &__get_cpu_var(hrtimer_bases);
	/*
	 * The caller is globally serialized and nobody else
	 * takes two locks at once, deadlock is not possible.
	 */
	raw_spin_lock(&new_base->lock);
	raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);

	/* one cpu base holds several clock bases; migrate each of them */
	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
		migrate_hrtimer_list(&old_base->clock_base[i],
				     &new_base->clock_base[i]);
	}

	raw_spin_unlock(&old_base->lock);
	raw_spin_unlock(&new_base->lock);

	/* Check, if we got expired work to do */
	__hrtimer_peek_ahead_timers();
	local_irq_enable();
}

/*
 * Re-queue every timer that is active on @old_base onto @new_base.
 *
 * The loop repeatedly takes the first timer of the old queue, so it ends
 * once the old queue is empty.
 */
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
				 struct hrtimer_clock_base *new_base)
{
	struct hrtimer *timer;
	struct timerqueue_node *node;

	while ((node = timerqueue_getnext(&old_base->active))) {
		timer = container_of(node, struct hrtimer, node);
		BUG_ON(hrtimer_callback_running(timer));
		debug_deactivate(timer);

		/*
		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
		 * timer could be seen as !active and just vanish away
		 * under us on another CPU
		 */
		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
		timer->base = new_base;
		/*
		 * Enqueue the timers on the new cpu. This does not
		 * reprogram the event device in case the timer
		 * expires before the earliest on this CPU, but we run
		 * hrtimer_interrupt after we migrated everything to
		 * sort out already expired timers and reprogram the
		 * event device.
		 */
		enqueue_hrtimer(timer, new_base);

		/* Clear the migration state bit */
		timer->state &= ~HRTIMER_STATE_MIGRATE;
	}
}

#endif /* CONFIG_HOTPLUG_CPU */

時鐘設(shè)備

如下斜體部分文字摘自陳功的《Linux 時鐘管理》

tick device
Tick device 用來處理周期性的 tick event。Tick device 其實是時鐘事件設備的一個 wrapper，因此 tick device 也有 one-shot 和周期性這兩種中斷觸發模式。
每注冊一個時鐘事件設備，這個設備會自動被注冊為一個 tick device。全局的 tick device 用來更新諸如 jiffies 這樣的全局信息，per-CPU 的 tick device 則用來更新每個 CPU 相關的特定信息。
broadcast
Broadcast 的出現是為了應對這樣一種情況：假定 CPU 使用 Local APIC Timer 作為 per-CPU 的 tick device，但是某些特定的 CPU（如 Intel 的 Westmere 之前的 CPU）在進入 C3+ 的狀態時 Local APIC Timer 也會同時停止工作，進入睡眠狀態。在這種情形下 broadcast 可以替代 Local APIC Timer 繼續完成統計進程的執行時間等有關操作。本質上 broadcast 是發送一個 IPI（Inter-processor interrupt）中斷給其他所有的 CPU，當目標 CPU 收到這個 IPI 中斷后就會調用原先 Local APIC Timer 正常工作時的中斷處理函數，從而實現了同樣的功能。目前主要在 x86 以及 MIPS 下會用到 broadcast 功能。
Timekeeping & GTOD (Generic Time-of-Day)
Timekeeping（可以理解為時間測量或者計時）是內核時間管理的一個核心組成部分。沒有 Timekeeping，就無法更新系統時間，維持系統“心跳”。GTOD 是一個通用的框架，用來實現諸如獲取系統時間 gettimeofday 或者修改系統時間 settimeofday 等工作。

100

101

102

* The hpet clock event device

static struct clock_event_device hpet_clockevent = {

.name = "hpet",

.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,

.set_mode = hpet_legacy_set_mode,

.set_next_event = hpet_legacy_next_event,

.irq = 0,

.rating = 50,

};

/*
 * Excerpt of start_kernel(): the timer-related init calls, followed by the
 * optional late_time_init hook, which is only called when it has been set.
 */
asmlinkage void __init start_kernel(void)
{
...
	init_IRQ();
	init_timers();
	hrtimers_init();
	softirq_init();
	timekeeping_init();
	time_init();
...
	if (late_time_init)
		late_time_init();
...
}

/* Defer the x86 timer setup by installing it as the late_time_init hook. */
void __init time_init(void)
{
	late_time_init = x86_late_time_init;
}

/* Late timer setup: run the platform timer_init hook, then initialise the TSC. */
static __init void x86_late_time_init(void)
{
	x86_init.timers.timer_init();
	tsc_init();
}

/*
 * Excerpt of the default x86 init ops; only the .timers hooks are shown.
 * timer_init defaults to hpet_time_init.
 */
struct x86_init_ops x86_init __initdata = {
	/* ... members before .timers elided ... */

	.timers = {
		.setup_percpu_clockev	= setup_boot_APIC_clock,
		.tsc_pre_init		= x86_init_noop,
		.timer_init		= hpet_time_init,
		.wallclock_init		= x86_init_noop,
	},

	/* ... remaining members elided ... */
};

/* Default timer init function */
void __init hpet_time_init(void)
{
	/* HPET is tried first; the PIT is set up only when enabling HPET fails. */
	if (hpet_enable() == 0) {
		setup_pit_timer();
	}

	/* Install the handler for IRQ 0. */
	setup_default_timer_irq();
}

hpet_enable ->

/* Enable HPET legacy interrupts and register the hpet clock event device. */
static void hpet_legacy_clockevent_register(void)
{
	/* Start HPET legacy interrupts */
	hpet_enable_legacy_int();

	/*
	 * Start hpet with the boot cpu mask and make it
	 * global after the IO_APIC has been initialized.
	 */
	hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
	/* the key step: this registers the clock event device */
	clockevents_config_and_register(&hpet_clockevent, hpet_freq,
					HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
	/* Article note: from here on the IRQ 0 handler dispatches to the hpet device. */
	global_clock_event = &hpet_clockevent;
	printk(KERN_DEBUG "hpet clockevent registered\n");
}

clockevents_config_and_register ->

/*
 * Register a clock event device.
 *
 * @dev: device to register; it must still be in CLOCK_EVT_MODE_UNUSED.
 *       A missing cpumask defaults to the current CPU.
 */
void clockevents_register_device(struct clock_event_device *dev)
{
	unsigned long flags;

	BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
	if (!dev->cpumask) {
		WARN_ON(num_possible_cpus() > 1);
		dev->cpumask = cpumask_of(smp_processor_id());
	}

	raw_spin_lock_irqsave(&clockevents_lock, flags);

	/*
	 * Put the device on the clockevent_devices list. Article note: the
	 * list is used for callbacks on suspend, resume and other
	 * notifications.
	 */
	list_add(&dev->list, &clockevent_devices);
	/* announce the newly added device on the notifier chain */
	clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
	clockevents_notify_released();

	raw_spin_unlock_irqrestore(&clockevents_lock, flags);
}

* Notify about a clock event change. Called with clockevents_lock

* held.

static void clockevents_do_notify(unsigned long reason, void *dev)

{

raw_notifier_call_chain(&clockevents_chain, reason, dev);

}

CLOCK_EVT_NOTIFY_ADD通知會在tick_notify里收到，然后回調
tick_check_new_device -> tick_setup_device(td, newdev, cpu, cpumask_of(cpu));

* Setup the tick device

static void tick_setup_device(struct tick_device *td,

struct clock_event_device *newdev, int cpu,

const struct cpumask *cpumask)

{

ktime_t next_event;

void (*handler)(struct clock_event_device *) = NULL;

* First device setup ?

if (!td->evtdev) {

* If no cpu took the do_timer update, assign it to

* this cpu:

if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {

tick_do_timer_cpu = cpu;

tick_next_period = ktime_get();

tick_period = ktime_set(0, NSEC_PER_SEC / HZ);

}

* Startup in periodic mode first.

td->mode = TICKDEV_MODE_PERIODIC;//初始的時候，都是PERIODIC模式，當(dāng)高精度時鐘的時候，才是ONE SHORT

} else {

handler = td->evtdev->event_handler;

next_event = td->evtdev->next_event;

td->evtdev->event_handler = clockevents_handle_noop;

}

td->evtdev = newdev;

* When the device is not per cpu, pin the interrupt to the

* current cpu:

if (!cpumask_equal(newdev->cpumask, cpumask))

irq_set_affinity(newdev->irq, cpumask);

* When global broadcasting is active, check if the current

* device is registered as a placeholder for broadcast mode.

* This allows us to handle this x86 misfeature in a generic

* way.

if (tick_device_uses_broadcast(newdev, cpu))

return;

//初始化

if (td->mode == TICKDEV_MODE_PERIODIC)

tick_setup_periodic(newdev, 0);//這里面設(shè)置handle為tick_handle_periodic 或tick_handle_periodic_broadcast

else

tick_setup_oneshot(newdev, handler, next_event); //在高精度模式下，handler其實是hrtimer_interrupt

}

低精度模式

所以，周期時鐘(低精度)時，回調函數為tick_handle_periodic或tick_handle_periodic_broadcast

* Event handler for periodic ticks

void tick_handle_periodic(struct clock_event_device *dev)

{

int cpu = smp_processor_id();

ktime_t next;

tick_periodic(cpu);

if (dev->mode != CLOCK_EVT_MODE_ONESHOT)

return;

* Setup the next period for devices, which do not have

* periodic mode:

next = ktime_add(dev->next_event, tick_period);

for (;;) {

if (!clockevents_program_event(dev, next, false))

return;

* Have to be careful here. If we're in oneshot mode,

* before we call tick_periodic() in a loop, we need

* to be sure we're using a real hardware clocksource.

* Otherwise we could get trapped in an infinite

* loop, as the tick_periodic() increments jiffies,

* when then will increment time, posibly causing

* the loop to trigger again and again.

if (timekeeping_valid_for_hres())

tick_periodic(cpu);

next = ktime_add(next, tick_period);

}

* Periodic tick

static void tick_periodic(int cpu)

{

if (tick_do_timer_cpu == cpu) {

write_seqlock(&jiffies_lock);

/* Keep track of the next tick event */

tick_next_period = ktime_add(tick_next_period, tick_period);

do_timer(1);

write_sequnlock(&jiffies_lock);

}

update_process_times(user_mode(get_irq_regs()));//低精度下，運行此函數(shù)，更新進程時間，調(diào)用run_local_timers

profile_tick(CPU_PROFILING);

}

/*
 * Per-tick accounting for the current task.
 *
 * @user_tick: passed through to account_process_tick() and
 *             rcu_check_callbacks()
 */
void update_process_times(int user_tick)
{
	struct task_struct *p = current;
	int cpu = smp_processor_id();

	/* Note: this timer irq context must be accounted for as well. */
	account_process_tick(p, user_tick);
	run_local_timers();	/* run the local timers */
	rcu_check_callbacks(cpu, user_tick);
#ifdef CONFIG_IRQ_WORK
	if (in_irq())
		irq_work_run();
#endif
	scheduler_tick();
	run_posix_cpu_timers(p);
}

* Called by the local, per-CPU timer interrupt on SMP.

void run_local_timers(void)

{

hrtimer_run_queues();//hardirq context下運行所有到期的timer

raise_softirq(TIMER_SOFTIRQ); //TIMER軟中斷，不是HTIMER, softirq context下運行所有到期的timer，start_kernel->init_timers->open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 這里注冊的SOFTIRQ回調(diào)

}

這里的TIMER SOFTIRQ，是在start_kernel里面調用init_timers初始化的，會調用到run_timer_softirq

/* Boot-time initialisation of the low-resolution timers. */
void __init init_timers(void)
{
	int err;

	/* ensure there are enough low bits for flags in timer->base pointer */
	BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);

	/* run the CPU_UP_PREPARE step by hand for the current CPU */
	err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
			       (void *)(long)smp_processor_id());
	init_timer_stats();

	BUG_ON(err != NOTIFY_OK);
	register_cpu_notifier(&timers_nb);	/* the notifier itself is registered here */
	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);	/* register the TIMER_SOFTIRQ handler */
}

/*
 * TIMER_SOFTIRQ handler.
 *
 * First checks whether a switch to high-resolution mode is pending, then
 * runs the low-resolution timers of this CPU if any are due.
 */
static void run_timer_softirq(struct softirq_action *h)
{
	struct tvec_base *cpu_timers;

	cpu_timers = __this_cpu_read(tvec_bases);

	/* check whether we should switch over to high resolution */
	hrtimer_run_pending();

	/* nothing due yet */
	if (!time_after_eq(jiffies, cpu_timers->timer_jiffies))
		return;

	/* low-resolution mode: timer callbacks run here, in softirq context */
	__run_timers(cpu_timers);
}

上面的run_local_timers里，調用hrtimer_run_queues，目的是在低精度模式下，實現hrtimer功能

* Called from hardirq context every jiffy

void hrtimer_run_queues(void)

{

struct timerqueue_node *node;

struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);

struct hrtimer_clock_base *base;

int index, gettime = 1;

//如果開啟了高精度模式，這里就不會進來

//換言之，這里實現(xiàn)了低精度支持hrtimer的功能

//切記，這個調(diào)用是在時鐘設(shè)備的硬中斷中調(diào)用的

if (hrtimer_hres_active())

return;

for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {

base = &cpu_base->clock_base[index];//將當(dāng)前CPU上的所有類型的CLOCK上的所有timer都檢查一遍

if (!timerqueue_getnext(&base->active))

continue;

if (gettime) {

hrtimer_get_softirq_time(cpu_base);//這里會將所有的base的softirq_time更新為最新

gettime = 0;

}

raw_spin_lock(&cpu_base->lock);

while ((node = timerqueue_getnext(&base->active))) {

struct hrtimer *timer;

//這里比較的世界是node的時間，即hard expires

timer = container_of(node, struct hrtimer, node);

if (base->softirq_time.tv64 <=

hrtimer_get_expires_tv64(timer))//timer->node.expires.tv64;這里比較時間，用的是timer->node.expires，不是timer->_softexpires

break;

__run_hrtimer(timer, &base->softirq_time);//調(diào)用run timer

}

raw_spin_unlock(&cpu_base->lock);

}

低精度切換到高精度

上面低精度模式下，運行TIMER SOFTIRQ的時候(run_timer_softirq)，會檢查是否可以切換到高精度模式

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

/*
 * Check from softirq context whether the system can switch to
 * high-resolution (or nohz) mode, and switch if so.
 */
void hrtimer_run_pending(void)
{
	/* already in high-resolution mode */
	if (hrtimer_hres_active())
		return;

	/*
	 * This _is_ ugly: We have to check in the softirq context,
	 * whether we can switch to highres and / or nohz mode. The
	 * clocksource switch happens in the timer interrupt with
	 * xtime_lock held. Notification from there only sets the
	 * check bit in the tick_oneshot code, otherwise we might
	 * deadlock vs. xtime_lock.
	 */
	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
		hrtimer_switch_to_hres();
}

/*
 * Check whether a switch to one-shot operation is due.
 *
 * @allow_nohz: non-zero lets this function switch to nohz mode itself
 *
 * Returns 1 when the caller should switch to high-resolution mode, and 0
 * otherwise (including when nohz mode was entered here).
 */
int tick_check_oneshot_change(int allow_nohz)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

	/* bit 0 records whether the clock setup has changed */
	if (!test_and_clear_bit(0, &ts->check_clocks))
		return 0;

	/* nohz mode is already active */
	if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
		return 0;

	/* timekeeping not fit for high resolution, or no one-shot capable device */
	if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available())
		return 0;

	/*
	 * Returning 1 makes the caller switch to high resolution (handler
	 * hrtimer_interrupt, per the table below); otherwise switch to nohz
	 * here (handler tick_nohz_handler).
	 */
	if (!allow_nohz)
		return 1;

#if 0
low resolution mode High resolution mode
------------------+-----------------------+-----------------------
periodic tick | tick_handle_periodic | hrtimer_interrupt
dynamic tick | tick_nohz_handler | hrtimer_interrupt
------------------+-----------------------+-----------------------
#endif

	tick_nohz_switch_to_nohz();
	return 0;
}

* Switch to high resolution mode

static int hrtimer_switch_to_hres(void)

{

int i, cpu = smp_processor_id();

struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);

unsigned long flags;

if (base->hres_active)//已經(jīng)是高精度模式了

return 1;

local_irq_save(flags);

if (tick_init_highres()) { // => tick_switch_to_oneshot(hrtimer_interrupt)

//初始化錯了

local_irq_restore(flags);

printk(KERN_WARNING "Could not switch to high resolution "

"mode on CPU %dn", cpu);

return 0;

}

base->hres_active = 1;//這個標(biāo)志高精度模式active

for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)

base->clock_base[i].resolution = KTIME_HIGH_RES;//修改clock base的標(biāo)志

tick_setup_sched_timer();//低精度下要干的事，高精度下用一個hrtimer來做，為啥?我想是因為高精度的頻率比低精度高，而這些任務(wù)用不著高頻率處理

/* "Retrigger" the interrupt to get things going */

retrigger_next_event(NULL);

local_irq_restore(flags);

return 1;

}

/**

* tick_switch_to_oneshot - switch to oneshot mode

int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))

{//高精度支持，需要oneshot模式，方便及時切換頻率或停止啟動

struct tick_device *td = &__get_cpu_var(tick_cpu_device);

struct clock_event_device *dev = td->evtdev;

if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||

!tick_device_is_functional(dev)) {

//糟糕，設(shè)備不支持ONESHORT

printk(KERN_INFO "Clockevents: "

"could not switch to one-shot mode:");

if (!dev) {

printk(" no tick devicen");

} else {

if (!tick_device_is_functional(dev))

printk(" %s is not functional.n", dev->name);

else

printk(" %s does not support one-shot mode.n",

dev->name);

}

return -EINVAL;

}

td->mode = TICKDEV_MODE_ONESHOT;//修改為ONESHORT模式

dev->event_handler = handler;//現(xiàn)在，HANDLER也修改了，低精度的回調(diào)是在tick_setup_periodic里設(shè)置的，高精度是hrtimer_interrupt

clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);

tick_broadcast_switch_to_oneshot();//讓broadcast設(shè)備也切換到oneshot模式

return 0;

}

/**

* tick_setup_sched_timer - setup the tick emulation timer

void tick_setup_sched_timer(void)

{

struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

ktime_t now = ktime_get();

* Emulate tick processing via per-CPU hrtimers:

hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);

ts->sched_timer.function = tick_sched_timer;//主要更新下JIFFIES，進程運行時間等在低精度下也要做的工作

/* Get the next period (per cpu) */

hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());

/* Offset the tick to avert jiffies_lock contention. */

if (sched_skew_tick) {//這個不錯，防止jiffies lock競爭，讓超時每個CPU的超時周期和CPU ID做一個散列

u64 offset = ktime_to_ns(tick_period) >> 1;

do_div(offset, num_possible_cpus());

offset *= smp_processor_id();

hrtimer_add_expires_ns(&ts->sched_timer, offset);

}

for (;;) {//就是確保sched timer運行

hrtimer_forward(&ts->sched_timer, now, tick_period);

hrtimer_start_expires(&ts->sched_timer,

HRTIMER_MODE_ABS_PINNED);

/* Check, if the timer was already in the past */

if (hrtimer_active(&ts->sched_timer))

break;

now = ktime_get();

}

#ifdef CONFIG_NO_HZ

if (tick_nohz_enabled)

ts->nohz_mode = NOHZ_MODE_HIGHRES;

#endif

}

高精度模式

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

* High resolution timer interrupt

* Called with interrupts disabled

void hrtimer_interrupt(struct clock_event_device *dev)

{

struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);

ktime_t expires_next, now, entry_time, delta;

int i, retries = 0;

BUG_ON(!cpu_base->hres_active);

cpu_base->nr_events++;//統(tǒng)計總的interrupt次數(shù)

dev->next_event.tv64 = KTIME_MAX;

raw_spin_lock(&cpu_base->lock);

entry_time = now = hrtimer_update_base(cpu_base);//更新clock_base的時間

retry:

expires_next.tv64 = KTIME_MAX;

* We set expires_next to KTIME_MAX here with cpu_base->lock

* held to prevent that a timer is enqueued in our queue via

* the migration code. This does not affect enqueueing of

* timers which run their callback and need to be requeued on

* this CPU.

cpu_base->expires_next.tv64 = KTIME_MAX;

for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {

struct hrtimer_clock_base *base;

struct timerqueue_node *node;

ktime_t basenow;

if (!(cpu_base->active_bases & (1 << i)))//clock不是激活狀態(tài)，比如，clock base里面沒有timer，何必調(diào)用一次?

continue;

base = cpu_base->clock_base + i;//每一個CLOCK BASE

basenow = ktime_add(now, base->offset);//每一個CLOCK BASE的當(dāng)前時間

//取每一個CLOCK BASE的active紅黑樹中最頂端hrtimer，最可能超時

while ((node = timerqueue_getnext(&base->active))) {

struct hrtimer *timer;

timer = container_of(node, struct hrtimer, node);

* The immediate goal for using the softexpires is

* minimizing wakeups, not running timers at the

* earliest interrupt after their soft expiration.

* This allows us to avoid using a Priority Search

* Tree, which can answer a stabbing querry for

* overlapping intervals and instead use the simple

* BST we already have.

* We don't add extra wakeups by delaying timers that

* are right-of a not yet expired timer, because that

* timer will have to trigger a wakeup anyway.

//這里比較的是soft expires，如果soft expires超過了當(dāng)前CLOCK BASE的時間，表示還沒到期，當(dāng)前的CLOCK BASE可以中斷檢查

if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {

ktime_t expires;

expires = ktime_sub(hrtimer_get_expires(timer),

base->offset);//用未超時的timer的hard expires - base->offset，其實就是base 下次觸發(fā)的時間

if (expires.tv64 < 0)

expires.tv64 = KTIME_MAX;//溢出了?這不科學(xué)，設(shè)置為最大值

if (expires.tv64 < expires_next.tv64)

expires_next = expires;//expires其實就是next expires

break;

}

__run_hrtimer(timer, &basenow);//調(diào)用run timer

}

* Store the new expiry value so the migration code can verify

* against it.

cpu_base->expires_next = expires_next;

raw_spin_unlock(&cpu_base->lock);

/* Reprogramming necessary ? */

if (expires_next.tv64 == KTIME_MAX ||//不需要next expires 或設(shè)置硬件next正確

!tick_program_event(expires_next, 0)) {//設(shè)置對應(yīng)硬件的下一次超時，為表示正確

cpu_base->hang_detected = 0;

return;

}

* The next timer was already expired due to:

* - tracing

* - long lasting callbacks

* - being scheduled away when running in a VM

* We need to prevent that we loop forever in the hrtimer

* interrupt routine. We give it 3 attempts to avoid

* overreacting on some spurious event.

* Acquire base lock for updating the offsets and retrieving

* the current time.

raw_spin_lock(&cpu_base->lock);

//當(dāng)前時間已經(jīng)超過next time,嘗試修復(fù)，執(zhí)行次

now = hrtimer_update_base(cpu_base);

cpu_base->nr_retries++;

if (++retries < 3)

goto retry;

//還是不行?標(biāo)志hang了

* Give the system a chance to do something else than looping

* here. We stored the entry time, so we know exactly how long

* we spent here. We schedule the next event this amount of

* time away.

cpu_base->nr_hangs++;

cpu_base->hang_detected = 1;

raw_spin_unlock(&cpu_base->lock);

delta = ktime_sub(now, entry_time);//從剛進來到現(xiàn)在，耗時多長?delta

if (delta.tv64 > cpu_base->max_hang_time.tv64)

cpu_base->max_hang_time = delta;//保存最大的hang time就可以了

* Limit it to a sensible value as we enforce a longer

* delay. Give the CPU at least 100ms to catch up.

if (delta.tv64 > 100 * NSEC_PER_MSEC)

expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);

else

expires_next = ktime_add(now, delta);

tick_program_event(expires_next, 1);//設(shè)置長一些的超時最大ms

printk_once(KERN_WARNING "hrtimer: interrupt took %llu nsn",

ktime_to_ns(delta));

}

tick_program_event ->

/**

* clockevents_program_event - Reprogram the clock event device.

* @dev: device to program

* @expires: absolute expiry time (monotonic clock)

* @force: program minimum delay if expires can not be set

* Returns 0 on success, -ETIME when the event is in the past.

int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,

bool force)

{

unsigned long long clc;

int64_t delta;

int rc;

if (unlikely(expires.tv64 < 0)) {

WARN_ON_ONCE(1);

return -ETIME;

}

dev->next_event = expires;

if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)

return 0;

/* Shortcut for clockevent devices that can deal with ktime. */

if (dev->features & CLOCK_EVT_FEAT_KTIME)

return dev->set_next_ktime(expires, dev);

delta = ktime_to_ns(ktime_sub(expires, ktime_get()));

if (delta <= 0)//現(xiàn)在的時間，已經(jīng)超過了想要預(yù)設(shè)的超時，怎么辦?根據(jù)是否需要force決定是否設(shè)置為min delta

return force ? clockevents_program_min_delta(dev) : -ETIME;

delta = min(delta, (int64_t) dev->max_delta_ns);

delta = max(delta, (int64_t) dev->min_delta_ns);

clc = ((unsigned long long) delta * dev->mult) >> dev->shift;

rc = dev->set_next_event((unsigned long) clc, dev); //比如hpet，其回調(diào)為hpet_next_event

//返回非表示錯誤，如果需要force，那么強行設(shè)置為min delta

return (rc && force) ? clockevents_program_min_delta(dev) : rc;

}

/*
 * Run the callback of one expired timer.
 *
 * @timer: the expired timer
 * @now:   current time of the timer's clock base (used for tracing)
 *
 * cpu_base->lock is released around the callback and re-taken afterwards;
 * the unlock/lock pair in the body shows the caller holds it.
 */
static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
{
	struct hrtimer_clock_base *base = timer->base;
	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
	enum hrtimer_restart (*fn)(struct hrtimer *);
	int restart;

	WARN_ON(!irqs_disabled());

	debug_deactivate(timer);
	/* take the timer off its base first and mark it as running its callback */
	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
	timer_stats_account_hrtimer(timer);
	/*
	 * Article note: this is the callback pointer a user installs after
	 * hrtimer_init(), e.g.
	 *
	 *   hrtimer_init(&rtc->coalesced_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	 *   rtc->coalesced_timer.function = coalesced_timer_fn;
	 *
	 * so such a callback runs in hardirq context.
	 */
	fn = timer->function;

	/*
	 * Because we run timers from hardirq context, there is no chance
	 * they get migrated to another cpu, therefore its safe to unlock
	 * the timer base.
	 */
	raw_spin_unlock(&cpu_base->lock);
	trace_hrtimer_expire_entry(timer, now);
	restart = fn(timer);	/* invoke the user's callback */
	trace_hrtimer_expire_exit(timer);
	raw_spin_lock(&cpu_base->lock);

	/*
	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
	 * we do not reprogramm the event hardware. Happens either in
	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
	 */
	if (restart != HRTIMER_NORESTART) {
		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
		enqueue_hrtimer(timer, base);
	}

	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));

	timer->state &= ~HRTIMER_STATE_CALLBACK;
}

HRTIMER函數詳解

初始化

/*
 * hrtimer_init is simple: it zeroes the timer, binds it to the matching
 * clock base taken from __raw_get_cpu_var(hrtimer_bases) and initialises
 * its queue node. The timer is NOT inserted into any queue/RB tree here.
 *
 * @timer:    timer to initialise
 * @clock_id: clock the timer is based on
 * @mode:     HRTIMER_MODE_* value
 */
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
			   enum hrtimer_mode mode)
{
	struct hrtimer_cpu_base *cpu_base;
	int base;

	memset(timer, 0, sizeof(struct hrtimer));

	cpu_base = &__raw_get_cpu_var(hrtimer_bases);

	/* a CLOCK_REALTIME timer that is not absolute is put on CLOCK_MONOTONIC */
	if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
		clock_id = CLOCK_MONOTONIC;

	base = hrtimer_clockid_to_base(clock_id);
	timer->base = &cpu_base->clock_base[base];
	timerqueue_init(&timer->node);

#ifdef CONFIG_TIMER_STATS
	timer->start_site = NULL;
	timer->start_pid = -1;
	memset(timer->start_comm, 0, TASK_COMM_LEN);
#endif
}

設置超時

/**

* hrtimer_forward - forward the timer expiry

* @timer: hrtimer to forward

* @now: forward past this time

* @interval: the interval to forward

* Forward the timer expiry so it will expire in the future.

* Returns the number of overruns.

u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)

{

u64 orun = 1;

ktime_t delta;

delta = ktime_sub(now, hrtimer_get_expires(timer));

if (delta.tv64 < 0)//如果timer原本的超時時間還在想要定位的now之后，就不修改，因為這個函數(shù)的目的是將超時時間設(shè)置在now后

return 0;

if (interval.tv64 < timer->base->resolution.tv64)//如果interval過小，小于clock base所能達到的精度，當(dāng)然使用clock base的最小精度了

interval.tv64 = timer->base->resolution.tv64;

//欲修改的時間基準(zhǔn)與原有超時時間差，大于interval

//這個話使用的是unlikely，表示，這種情況還是不多的

//例如，你在一個周期回調(diào)里面，再次add，其interval肯定要大

if (unlikely(delta.tv64 >= interval.tv64)) {

s64 incr = ktime_to_ns(interval);

orun = ktime_divns(delta, incr);//相差有多少個interval

hrtimer_add_expires_ns(timer, incr * orun);

if (hrtimer_get_expires_tv64(timer) > now.tv64)//這個函數(shù)，不是簡單的add interval，而是觸發(fā)時間能夠>now就可以了，想想周期時鐘的用法，確實應(yīng)該是這樣

return orun;

* This (and the ktime_add() below) is the

* correction for exact:

orun++;

}

hrtimer_add_expires(timer, interval);//在上次超時的基礎(chǔ)上加上interval

return orun;

}

/*
 * Advance both expiry values of @timer by @time.
 *
 * Article note (citing "Linux 時鐘管理"): turning the single hard expiry
 * point into a range lets hrtimer interrupts be handled together, so the
 * CPU is woken less often and power is saved, while the timer's precision
 * is still guaranteed.
 */
static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time)
{
	ktime_t hard = ktime_add_safe(timer->node.expires, time);
	ktime_t soft = ktime_add_safe(timer->_softexpires, time);

	timer->node.expires = hard;
	timer->_softexpires = soft;
}

/*
 * Advance both expiry values of @timer (the soft expiry and the expiry
 * stored in the timerqueue node) by @ns nanoseconds, using ktime_add_ns().
 */
static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns)
{
	timer->_softexpires = ktime_add_ns(timer->_softexpires, ns);
	timer->node.expires = ktime_add_ns(timer->node.expires, ns);
}

啟動

/*
 * Start @timer with expiry @tim.
 *
 * @mode decides how @tim is read: with HRTIMER_MODE_REL it is relative to
 * the current time of the clock base, otherwise it is taken as an absolute
 * time. Forwards to __hrtimer_start_range_ns() with a zero range and wakeup
 * enabled, and returns its result.
 */
int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
{
	const unsigned long no_range_ns = 0;
	const int do_wakeup = 1;

	return __hrtimer_start_range_ns(timer, tim, no_range_ns, mode, do_wakeup);
}

/*
 * __hrtimer_start_range_ns - (re)queue a timer on a clock base
 * @timer:	the timer to start
 * @tim:	expiry time; relative to the base's current time when
 *		HRTIMER_MODE_REL is set in @mode, absolute otherwise
 * @delta_ns:	range passed on to hrtimer_set_expires_range_ns()
 * @mode:	expiry mode flags (HRTIMER_MODE_REL, HRTIMER_MODE_PINNED)
 * @wakeup:	selects raise_softirq_irqoff() (non-zero) or
 *		__raise_softirq_irqoff() (zero) when reprogramming fails
 *
 * Returns the value reported by remove_hrtimer() for the dequeue attempt
 * made at the start.
 */
int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
		unsigned long delta_ns, const enum hrtimer_mode mode,
		int wakeup)
{
	struct hrtimer_clock_base *base, *new_base;
	unsigned long flags;
	int ret, leftmost;

	base = lock_hrtimer_base(timer, &flags);

	/* Remove an active timer from the queue: */
	ret = remove_hrtimer(timer, base);

	/*
	 * Switch the timer base, if necessary: on start, check whether the
	 * timer has to move to the clock base of the current CPU.
	 */
	new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);

	if (mode & HRTIMER_MODE_REL) {
		tim = ktime_add_safe(tim, new_base->get_time());
		/*
		 * CONFIG_TIME_LOW_RES is a temporary way for architectures
		 * to signal that they simply return xtime in
		 * do_gettimeoffset(). In this case we want to round up by
		 * resolution when starting a relative timer, to avoid short
		 * timeouts. This will go away with the GTOD framework.
		 */
#ifdef CONFIG_TIME_LOW_RES
		tim = ktime_add_safe(tim, base->resolution);
#endif
	}

	hrtimer_set_expires_range_ns(timer, tim, delta_ns);

	timer_stats_hrtimer_set_start_info(timer);

	/* Insert into the red-black tree. */
	leftmost = enqueue_hrtimer(timer, new_base);

	/*
	 * Only allow reprogramming if the new base is on this CPU.
	 * (it might still be on another CPU if the timer was pending)
	 *
	 * XXX send_remote_softirq() ?
	 *
	 * leftmost is non-zero when this timer is now the earliest to expire.
	 * If it sits on this CPU's clock base, the event has to be
	 * reprogrammed because the previously programmed expiry is later.
	 */
	if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
		&& hrtimer_enqueue_reprogram(timer, new_base)) {
		/*
		 * Reprogramming failed: raise HRTIMER_SOFTIRQ so that the
		 * softirq checks whether the timer has expired.
		 */
		if (wakeup) {
			/*
			 * We need to drop cpu_base->lock to avoid a
			 * lock ordering issue vs. rq->lock.
			 */
			raw_spin_unlock(&new_base->cpu_base->lock);
			raise_softirq_irqoff(HRTIMER_SOFTIRQ);
			local_irq_restore(flags);
			return ret;
		} else {
			__raise_softirq_irqoff(HRTIMER_SOFTIRQ);
		}
	}

	unlock_hrtimer_base(timer, &flags);

	return ret;
}

刪除

/*
 * Report whether the HRTIMER_STATE_CALLBACK bit is set in @timer's state.
 * Returns non-zero when it is set, 0 otherwise.
 */
static inline int hrtimer_callback_running(struct hrtimer *timer)
{
	int in_callback = timer->state & HRTIMER_STATE_CALLBACK;

	return in_callback;
}

/**
 * hrtimer_try_to_cancel - try to deactivate a timer
 * @timer:	hrtimer to stop
 *
 * Returns:
 *  0 when the timer was not active
 *  1 when the timer was active
 * -1 when the timer is currently executing the callback function and
 *    cannot be stopped
 */
int hrtimer_try_to_cancel(struct hrtimer *timer)
{
	struct hrtimer_clock_base *base;
	unsigned long flags;
	int ret = -1;

	base = lock_hrtimer_base(timer, &flags);

	/* A timer whose callback is currently running is not cancelled. */
	if (!hrtimer_callback_running(timer))
		ret = remove_hrtimer(timer, base);

	unlock_hrtimer_base(timer, &flags);

	return ret;
}

/**
 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
 * @timer:	the timer to be cancelled
 *
 * Returns:
 *  0 when the timer was not active
 *  1 when the timer was active
 */
int hrtimer_cancel(struct hrtimer *timer)
{
	for (;;) {
		/*
		 * Keep retrying until the attempt succeeds; a negative result
		 * means the callback is still running.
		 */
		int ret = hrtimer_try_to_cancel(timer);

		if (ret >= 0)
			return ret;
		cpu_relax();
	}
}

狀態轉換

state為hrtimer的四個狀態：

/* hrtimer state values; the non-zero ones are distinct bits tested and combined with & and |. */
/* No state bit set. */
#define HRTIMER_STATE_INACTIVE 0x00

/* Per the surrounding notes: OR-ed into the state once the timer is queued by enqueue_hrtimer(). */
#define HRTIMER_STATE_ENQUEUED 0x01

/* Set while the callback runs; hrtimer_callback_running() tests this bit. */
#define HRTIMER_STATE_CALLBACK 0x02

/* NOTE(review): not used in the code shown here; presumably marks a timer being migrated - confirm. */
#define HRTIMER_STATE_MIGRATE 0x04

可以想象，HRTIMER_STATE_INACTIVE這個是初始值，調用hrtimer_init後，這個變量設置為HRTIMER_STATE_INACTIVE

調用enqueue_hrtimer將hrtimer*加入到RB TREE後，狀態會 OR 上 HRTIMER_STATE_ENQUEUED
什麼時候清除？當調用__remove_hrtimer從RB TREE裡刪除後，會“設置”為新狀態，此新狀態中一定不包含HRTIMER_STATE_ENQUEUED

在remove_hrtimer調用的時候，只保留了CALLBACK狀態
1
2
state = timer->state & HRTIMER_STATE_CALLBACK;
__remove_hrtimer(timer, base, state, reprogram);
在__run_hrtimer的時候，會先從RB TREE裡將hrtimer*刪除，設置狀態為CALLBACK，然後調用回調函數
先將timer從base中刪除，並設置timer的狀態為CALLBACK
1
2
3
4
5
6
__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
restart = fn(timer);
if (restart != HRTIMER_NORESTART) {
enqueue_hrtimer(timer, base);
}
timer->state &= ~HRTIMER_STATE_CALLBACK;

調用完fn後，又將CALLBACK狀態去除