2.6 彻底公正调度CFS
- 核心思想:度量每个进程的虚拟运转时刻,调度运转时刻最少的进程运转
- 虚拟运转时刻:由物理时钟、进程优先级确认,优先级越高,虚拟时钟越慢,也便是进程越简单被调度到
2.6.1 数据结构
- CFS的安排妥当行列
-
vruntime
:位于sched_entity
,记录进程在虚拟时钟上流逝的数量 -
min_vruntime
:是单调递增的,盯梢行列上一切进程的最小虚拟运转时刻,比最左面的树结点的vruntime
大些
-
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load; //安排妥当行列上进程的累积负荷值
unsigned long nr_running; //行列上可运转进程的数目
u64 min_vruntime; //一切进程的最小虚拟运转时刻
struct rb_root tasks_timeline; //管理红黑树一切进程
struct rb_node *rb_leftmost; //指向红黑树最左面的节点(需求被调度的进程)
struct sched_entity *curr; //当时履行进程的可调度实体
......
};
2.6.2 CFS虚拟时刻
1. 虚拟时钟核算
-
虚拟时钟:彻底公正调度算法依赖虚拟时钟,度量进程能得到的CPU时刻
- 依据:实践时钟、进程相关的负荷权重(进程优先级)
- 作业:更新当时进程履行的实践时钟、核算对应的虚拟时钟、更新行列的min_vruntime
// 核心函数:更新进程的物理时钟和虚拟时钟
static void update_curr(struct cfs_rq *cfs_rq) {
//curr表明安排妥当行列当时履行的进程
struct sched_entity *curr = cfs_rq->curr;
//now表明安排妥当行列的真实时钟
u64 now = rq_of(cfs_rq)->clock;
unsigned long delta_exec;
......
//核算 当时安排妥当行列时钟 和 上一次安排妥当行列时钟 的时刻差delta_exec
delta_exec = (unsigned long)(now - curr->exec_start);
//托付给__update_curr,更新当时进程在CPU花费的物理时刻和虚拟时刻
__update_curr(cfs_rq, curr, delta_exec);
curr->exec_start = now;
......
}
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
unsigned long delta_exec)
{
unsigned long delta_exec_weighted;
u64 vruntime;
......
//更新当时进程的履行总时刻(物理时刻)
curr->sum_exec_runtime += delta_exec;
......
//依据负荷权重核算当时进程添加的虚拟时钟(虚拟时刻)
delta_exec_weighted = delta_exec;
if (unlikely(curr->load.weight != NICE_0_LOAD)) {
delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
&curr->load);
}
curr->vruntime += delta_exec_weighted;
//对CFS行列更新min_vruntime(内核确保值单调添加)
if (first_fair(cfs_rq)) {
//假如红黑树有最左面结点(有进程在等待调度),vruntime=最左面结点进程的vruntime
vruntime = min_vruntime(curr->vruntime,
__pick_next_entity(cfs_rq)->vruntime);
} else {
//假如红黑树是空的,vruntime=当时进程的虚拟运转时刻
vruntime = curr->vruntime;
}
cfs_rq->min_vruntime =
max_vruntime(cfs_rq->min_vruntime, vruntime);
}
其间,虚拟时钟的增长量delta_exec_weighted
依据公式核算,取四舍五入:
delta_exec_weighted=delta_execNICE_0_LOADload_weightdelta\_exec\_weighted = delta\_exec \times \frac{NICE\_0\_LOAD}{load\_weight}
- 依据表
prio_to_weight
,nice=0
的负荷权重load_weight=1024
- nice=0(prio=120):delta_exec_weighted = delta_exec1024/1024 = delta_exec*
- nice=-10(prio=110):delta_exec_weighted = delta_exec1024/9548 = 0.107delta_exec
【总结】
- 关于
nice=0
的进程,虚拟时刻=物理时刻;关于其他优先级,依据负荷权重load_weight
从头衡定时刻。 - 进程优先级越高,
nice
值越小,进程负荷权重load_weight
越大,虚拟时钟跑得越慢- 进程运转时,
vruntime
稳定添加,越重要的进程虚拟时钟跑得越慢,在红黑树中方位更靠左,越简单被调度到 - 进程进入睡觉,
vruntime
坚持不变。每个行列min_vruntime
添加,睡觉进程醒来则在红黑树中方位更靠左,更简单被调度
- 进程运转时,
2. 进程调度时刻核算
- CFS没有时刻片的概念,某个调度实体分配到的时刻由以下公式决定:
slice=行列调度周期调度实体的权重安排妥当行列的权重slice = 行列调度周期 \times \frac {调度实体的权重} {安排妥当行列的权重}
- 其间安排妥当行列的权重便是一切调度实体权重之和,拜见
update_load_add()
/* 核算某个调度实体分配到的时刻 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) {
u64 slice = __sched_period(cfs_rq->nr_running);
slice *= se->load.weight;
do_div(slice, cfs_rq->load.weight);
return slice;
}
/* 核算安排妥当行列调度的总时刻 */
static u64 __sched_period(unsigned long nr_running) {
//默许推迟周期为20ms
u64 period = sysctl_sched_latency;
//安排妥当行列上活动进程数目,默许5
unsigned long nr_latency = sched_nr_latency;
//假如行列上数目超越默许值,总时刻要拉长(sysctl_sched_latency*nr_running/nr_latency)
if (unlikely(nr_running > nr_latency)) {
period *= nr_running;
do_div(period, nr_latency);
}
return period;
}
- 等价的虚拟时刻在
__sched_vslice
函数给出,公式:
vslice=timeNICE_0_LOADweightvslice = time \times \frac {NICE\_0\_LOAD} {weight}
- 进程调度时刻核算涉及到三个内核参数:用于盯梢CFS的调度推迟(物理时刻)
-
sysctl_sched_latency
:CPU 密集型使命的目标抢占推迟,默许值20ms。每个调度实体一定在20ms内会被调度到(留意:和时刻片不一样,时刻片是固定的) -
sysctl_sched_min_granularity
:CPU 密集型使命的最小抢占粒度,默许4ms。设的越小切换越频繁(个人理解为这4ms都是你的,不能被抢走) -
sched_nr_latency
:操控一个推迟周期中处理的最大活动进程数目。假如活动进程数目超越该上限,推迟周期成份额地线性扩展(20ms最多处理5个使命,每个使命4ms)
-
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
*
* NOTE: this latency value is not the same as the concept of
* 'timeslice length' - timeslices in CFS are of variable length
* and have no persistent notion like in traditional, time-slice
* based scheduling concepts.
*
* (to see the precise effective timeslice length of your workload,
* run vmstat and monitor the context-switches (cs) field)
*/
unsigned int sysctl_sched_latency = 20000000ULL;
/*
* Minimal preemption granularity for CPU-bound tasks:
* (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_min_granularity = 4000000ULL;
/*
* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
*/
static unsigned int sched_nr_latency = 5;
2.6.3 行列操作
1. 进程参加安排妥当行列
-
由
enqueue_task_fair
完结- 进程最近在运转,更新vruntime后,直接参加安排妥当行列红黑树中
- 进程此前在睡觉,或许其vruntime与行列的vruntime差的很远,所以要先调整后再加到红黑树里,确保进程不会一向履行并更优先被调度到
- 假如进程是新fork的,分两种情况
- 父进程先履行,参加到安排妥当行列最终面,确保进程最终才被调度到
- 子进程先履行(Linux 2.6.24默许):确保子进程虚拟时钟小于父进程,即子进程先履行(2.6.7节),并参加到安排妥当行列最终面,进程最终被调度到
/* 进程参加行列,wakeup表明入队的进程是否此前在睡觉,当时被唤醒并转换为运转态 */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) {
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
for_each_sched_entity(se) {
//已经在安排妥当行列中,直接返回
if (se->on_rq)
break;
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, wakeup);
wakeup = 1;
}
}
/* 核心函数,将调度实体参加行列 */
static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
{
//更新当时CFS行列时刻
update_curr(cfs_rq);
if (wakeup) {
//假如进程此前在睡觉,调整进程的虚拟时刻
place_entity(cfs_rq, se, 0);
enqueue_sleeper(cfs_rq, se);
}
......
if (se != cfs_rq->curr)
//内核函数,将调度实体置入红黑树中
__enqueue_entity(cfs_rq, se);
//安排妥当行列活动进程数cfs_rq->nr_running++;
account_entity_enqueue(cfs_rq, se);
}
/*更新新进程的vruntime值,以便把他插入红黑树 */
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime;
//以当时行列的min_vruntime作为基准时刻
vruntime = cfs_rq->min_vruntime;
......
//是新进程被加到系统中,那么新进程的vruntime需求加上整个调度周期的虚拟实践,确保最终才被调度到
if (initial)
vruntime += sched_vslice_add(cfs_rq, se);
//进程此前在睡觉,扣掉sysctl_sched_latency,确保进程唤醒后能够更早被调度到
if (!initial) {
......
vruntime -= sysctl_sched_latency;
//取se->vruntime和vruntime的最大值
vruntime = max_vruntime(se->vruntime, vruntime);
}
//更新进程的vruntime
se->vruntime = vruntime;
}
- 其间
__enqueue_entity
将进程置入红黑树中。参加的方位由entity_key
确认。- 当时进程的vruntime越小,则参加的方位越靠左,越简单被选到(2.6.4节)
static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) {
return se->vruntime - cfs_rq->min_vruntime;
}
2.6.4 挑选下一个进程
- 挑选红黑树最靠左的进程作为下一个进程
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) {
struct sched_entity *se = NULL;
if (first_fair(cfs_rq)) {
//挑选最靠左的进程作为下一个进程
se = __pick_next_entity(cfs_rq);
//将进程取出标记为运转进程
set_next_entity(cfs_rq, se);
}
return se;
}
/* 挑选进程后需求做一些作业,标记为运转进程 */
static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
......
//从安排妥当行列中移除最左面的进程
if (se->on_rq) {
__dequeue_entity(cfs_rq, se);
}
//安排妥当行列标记当时进程,并更新进程运转的时刻
update_stats_curr_start(cfs_rq, se);
cfs_rq->curr = se;
......
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
2.6.5 处理周期性调度器
- 内核按频率
HZ
履行周期性调度。由task_tick_fair
担任,实践作业由entity_tick
完结- 更新安排妥当行列实践时钟、虚拟时钟统计量
- 假如当时进程运转时刻过长,则从头调度
static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) {
//更新统计量
update_curr(cfs_rq);
if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
//看看进程运转的时刻是否过长,需求从头调度
check_preempt_tick(cfs_rq, curr);
}
static void check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) {
unsigned long ideal_runtime, delta_exec;
//核算当时进程的调度时刻
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
//假如当时进程的运转时刻过长,超越了调度时刻额度,需求从头调度
if (delta_exec > ideal_runtime)
resched_task(rq_of(cfs_rq)->curr);
}
- 从头调度
resched_task()
设置标志TIF_NEED_RESCHED
。进程在恰当机遇(比方系统调用返回、中止返回)建议schedule()
asmlinkage void __sched schedule(void)
{
need_resched:
......
prev = rq->curr;
......
prev->sched_class->put_prev_task(rq, prev); //将当时进程从头放回安排妥当行列中
next = pick_next_task(rq, prev); //挑选下一个进程调度履行
......
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
goto need_resched;
}
2.6.6 唤醒抢占
- 在
try_to_wake_up
和wake_up_new_task
中唤醒进程时,内核使用check_preempt_curr
看看是否新进程能够抢占当时进程 - 彻底公正调度器使用
check_preempt_wakeup
完成check_preempt_curr
- 假如是实时进程,会当即抢占CFS进程
- 假如是批处理进程,不抢占其他进程
- CFS进程抢占,要确保进程运转
sysctl_sched_wakeup_granularity
(默许4ms),避免频繁切换
/*
* Preempt the current task with a newly woken task if needed:
*/
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
{
struct task_struct *curr = rq->curr;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
struct sched_entity *se = &curr->se, *pse = &p->se;
unsigned long gran;
//假如是实时进程,会当即抢占CFS进程
if (unlikely(rt_prio(p->prio))) {
update_rq_clock(rq);
update_curr(cfs_rq);
resched_task(curr);
return;
}
//假如是批处理进程,不抢占其他进程
if (unlikely(p->policy == SCHED_BATCH))
return;
......
//CFS进程被新CFS进程抢占,需求至少运转4ms。留意这儿4ms要转成对应的虚拟时刻
gran = sysctl_sched_wakeup_granularity;
if (unlikely(se->load.weight != NICE_0_LOAD))
gran = calc_delta_fair(gran, &se->load);
//超越4ms则从头调度
if (pse->vruntime + gran < se->vruntime)
resched_task(curr);
}
2.6.7 处理新进程
- 由
sysctl_sched_child_runs_first
操控fork
进程后是父进程先履行还是子进程先履行- Linux 2.6.24默许值为1,表明子进程先履行。后续内核有修订
- 过程:更新虚拟时钟、确保子进程先履行、子进程参加安排妥当行列、从头调度(留意:这儿子进程参加安排妥当行列后不一定会当即履行)
long do_fork(unsigned long clone_flags, //标志调集,SIGCHLD表明fork后发送SIGCHLD信号给父进程
unsigned long stack_start, //用户状态下栈的起始地址
struct pt_regs *regs, //指向寄存器调集的指针
unsigned long stack_size, //用户状态下栈的大小,一般为0
int __user *parent_tidptr, //指向父进程PID
int __user *child_tidptr) //指向子进程PID
{
......
// 将子进程参加调度器行列,由调度器挑选它运转
if (!(clone_flags & CLONE_STOPPED))
wake_up_new_task(p, clone_flags);
......
}
//唤醒子进程
void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
//初始化子进程优先级
p->prio = effective_prio(p);
......
//子进程调度时刻初始化
p->sched_class->task_new(rq, p);
inc_nr_running(p, rq);
......
}
//彻底公正调度走task_new_fair
static void task_new_fair(struct rq *rq, struct task_struct *p) {
struct cfs_rq *cfs_rq = task_cfs_rq(p);
struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
int this_cpu = smp_processor_id();
sched_info_queued(p);
//更新虚拟时钟,更新se->vruntime。留意第三个参数initial=1
update_curr(cfs_rq);
place_entity(cfs_rq, se, 1);
//这儿确保子进程先履行
if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
curr && curr->vruntime < se->vruntime) {
/*
* Upon rescheduling, sched_class::put_prev_task() will place
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
}
//参加行列,从头请求调度
enqueue_task_fair(rq, p, 0);
resched_task(rq->curr);
}
- 代码中
swap
()的作用:假如父进程虚拟运转时刻curr->vruntime
小于子进程运转时刻se->vruntime
,表明父进程先于子进程履行。所以经过交换虚拟时钟的方法,确保子进程先履行。