【深入Linux内核架构笔记】第二章进程调度(1)-六虎

2.5 调度器的完成

2.5.1 概观

进程调度：CPU同一时刻只能履行一个任务。各个进程尽可能公正地共享CPU时刻，又要考虑任务的优先级

需要考虑的问题：
- 进程有不同优先级
- 进程不能切换得太频繁，因为有进程上下文切换开支
- 两个相邻任务切换之间，切换时刻不能太长
概念
- 不公正程度：实质与进程等候时刻有关。假如CPU经常挑选最高等候时刻的进程，不公正情况缓解，会均匀分布到体系的一切进程
- 安排妥当行列：办理活动进程的数据结构。彻底公正调度器运用红黑树结构

调度算法

经典调度算法：对进程核算时刻片。若一切进程时刻片竭尽则从头核算
彻底公正调度：只考虑进程在安排妥当行列的等候时刻，对CPU时刻需求最严厉的进程被调度履行（不需要传统的时刻片）
- 数据结构：一切可运转进程按时刻在红黑树中排序，等候CPU时刻最长的进程是最左边的项，下一次调度该进程
- 虚拟时钟：判断等候进程将取得多少CPU时刻。要比实际时钟慢（假设安排妥当行列有4进程，虚拟时钟将以1/4速度运转）。
  - fair_clock：安排妥当行列的虚拟时刻
  - wait_runtime：进程的等候时刻（也能够理解为不公正程度）
  - fair_clock-wait_runtime：进程的虚拟运转时刻，用于红黑树排序

2.5.2 数据结构

调度方法：1）进程自动抛弃CPU；2）内核经过周期性机制切换进程

1. task_struct成员

struct task_struct {
    int static_prio;            //静态优先级
    int prio, normal_prio;      //动态优先级
    unsigned int rt_priority;   //实时进程的优先级，不同于nice值
    const struct sched_class *sched_class;    //进程所属的调度器类
    struct sched_entity se;     //可调度实体，调度器依据实体操作各个task_struct
    unsigned int policy;        //进程的调度策略
    cpumask_t cpus_allowed;     //限制进程能够在哪些CPU上运转
    struct list_head run_list;  //用于循环实时调度器，维护包含个进程的运转表
    unsigned int time_slice;    //用于循环实时调度器，可运用CPU的剩下时刻段
};

静态优先级：进程启动时分配的优先级，一般进程运转期间坚持恒定（能够用nice和sched_setscheduler修正）
动态优先级：依据静态优先级和调度策略核算出的优先级，供调度器调度
进程的调度策略
- SCHED_NORMAL：用于一般进程，经过彻底公正调度器（CFS）处理
- SCHED_BATCH：用于非交互、CPU运用密布的批处理进程，经过CFS处理
- SCHED_IDLE：用于相对权重最小的进程，经过CFS处理（注意：SCHED_IDLE不负责调度闲暇进程，闲暇进程由内核供给单独机制处理）
- SCHED_RR：用于软实时进程，轮询调度，经过实时调度器类处理
- SCHED_FIFO：用于软实时进程，先进先出调度，经过实时调度器类处理

2. 调度器类：理解为调度器基类

struct sched_class {
    //向安排妥当行列添加进程
    void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
    //从安排妥当行列去除进程
    void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
    //进程自愿抛弃CPU
    void (*yield_task) (struct rq *rq);
    //唤醒新进程，抢占当时进程
    void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
    //挑选下一个要运转的进程（向进程供给CPU）
    struct task_struct * (*pick_next_task) (struct rq *rq);
    //用另一个进程替代当时运转的进程（进程吊销CPU）
    void (*put_prev_task) (struct rq *rq, struct task_struct *p);
    //进程调度策略变化时，调用set_curr_task
    void (*set_curr_task) (struct rq *rq);
    //每次激活周期性调度器时，由周期性调度器调用
    void (*task_tick) (struct rq *rq, struct task_struct *p);
    //每次新进程树立后，用task_new告诉调度器，树立进程和调度器的相关
    void (*task_new) (struct rq *rq, struct task_struct *p);
};

调度器类处理次序：调度器之间层次结构是平整的，优先级：实时进程>彻底公正进程>闲暇进程（CPU无事可做处于活动状况）

3. 安排妥当行列

安排妥当行列：办理活动进程的数据结构，各个活动进程只能出现在一个安排妥当行列中
- 体系的一切安排妥当行列坐落runqueues数组中，每个元素对应体系的一个CPU。单处理器体系只有一个安排妥当行列
行列负荷：与行列上当时活动进程的数目成正比，其中的各个进程又有优先级作为权重。影响安排妥当行列的虚拟时钟速度

struct rq {
	unsigned long nr_running;        //可运转进程数目
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];    //用于盯梢此前的负荷衡量
......
	/* capture load from *all* tasks on this cpu: */
	struct load_weight load;         //行列负荷
	struct cfs_rq cfs;               //子安排妥当行列，用于彻底公正调度器
	struct rt_rq rt;                 //子安排妥当行列，用于彻底公正调度器
	struct task_struct *curr, *idle; //当时运转进程、闲暇进程的task_struct
	u64 clock, prev_clock_raw;       //用于完成安排妥当行列本身的时钟
......
};

4. 调度实体

sched_entity：调度器调度的是调度实体，不是进程（进程是实体的特例）
- 每个task_struct有嵌入sched_entity的实例

struct sched_entity {
    struct load_weight    load;      //指定权重，实体占行列总负荷的比例
    struct rb_node        run_node;  //规范的数结点，使得实体能够在红黑树上排序
    unsigned int          on_rq;     //当时实体是否在被调度
    //以下用于彻底公正调度器
    u64    exec_start;               //实体开端运转的时刻
    u64    sum_exec_runtime;         //实体运转的时刻
    u64    vruntime;                 //虚拟时钟上流逝的时刻数量
    u64    prev_sum_exec_runtime;    //实体被吊销CPU时，prev_sum_exec_runtime保存sum_exec_runtime值
};

2.5.3 处理优先级

调度器优先级task_struct->prio

prio：调度器终究运用的优先级，规模0 ~ 139，值越低优先级越高
- 0 ~ 99：供实时进程运用
- 100 ~ 139：代表一般进程的nice值，对应-20 ~ +19

prio由static_prio和normal_prio互相相关得到
调度进口：set_user_nice和wake_up_new_task，经过一行指令

p->prio = effective_prio(p);
static int effective_prio(struct task_struct *p) {
        //核算normal_prio
	p->normal_prio = normal_prio(p);
        //假如是一般进程，回来normal_prio
	if (!rt_prio(p->prio))
		return p->normal_prio;
        //假如是实时进程或进程已经提高到实时优先级，则坚持优先级不变
	return p->prio;
}

静态优先级task_struct->static_prio

static_prio：静态优先级不会随时刻改动，内核不会自动修正它。用户空间经过nice指令设置，规模是-20 ~ +19
内核为了转换nice和static_prio，界说宏（其实就是规模映射，图2-14）

#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
/* 其中 */
#define MAX_USER_RT_PRIO	100
#define MAX_RT_PRIO		MAX_USER_RT_PRIO
#define MAX_PRIO		(MAX_RT_PRIO + 40)
#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)

注意：在fork出子进程时，子进程的静态优先级承继自父进程

归一化优先级task_struct->normal_prio

从代码能够看出，假如是实时进程，与rt_priority有关；假如是一般进程，与static_prio共同

static inline int normal_prio(struct task_struct *p) {
	int prio;
	if (task_has_rt_policy(p))                      //假如是实时进程
		prio = MAX_RT_PRIO-1 - p->rt_priority;  //prio = 99 - rt_priority
	else                                            //假如是一般进程
		prio = __normal_prio(p);                //prio = static_prio
	return prio;
}
static inline int __normal_prio(struct task_struct *p) {
	return p->static_prio;
}

实时优先级task_struct->rt_priority

rt_priority：表明实时进程的优先级，规模0 ~ 99，值越大优先级越高（不同于静态优先级！）

prio = MAX_RT_PRIO-1 - p->rt_priority;    //prio = 99 - rt_priority

优先级总结

负荷权重task_struct->se.load

负荷权重：进程的重要性除了与优先级外，还与负荷权重有关

struct load_weight {
	unsigned long weight;      //负荷权重
        unsigned long inv_weight;  //负荷权重倒数，2^32/weight
};
//内核优先级转换为权重值的因子
static const int prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

因子是怎么核算的？
- 每升高一个nice，则多取得10%的CPU时刻
- 假设有两个进程，进程A的nice=0，进程B的nice=1
  - 进程A得到：1024/(1024+820)=55%的CPU比例
  - 进程B得到：820/(1024+820)=45%的CPU比例 -> 产生了10%的差额

// 核算负荷权重
static void set_load_weight(struct task_struct *p) {
        // 实时进程的负荷权重是一般仅的2倍（本书是2.6.24内核，后续内核规矩有变）
	if (task_has_rt_policy(p)) {
		p->se.load.weight = prio_to_weight[0] * 2;     //nice=-20的权重*2
		p->se.load.inv_weight = prio_to_wmult[0] >> 1;
		return;
	}
	// SCHED_IDLE进程得到的权重最少
	if (p->policy == SCHED_IDLE) {
		p->se.load.weight = WEIGHT_IDLEPRIO;       //2
		p->se.load.inv_weight = WMULT_IDLEPRIO;    //1<<31
		return;
	}
        // 依据因子表，将静态优先级转为负荷权重
	p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
}

2.5.4 核心调度器

1. 周期性调度器scheduler_tick()：内核按照频率HZ自动调用该函数

void scheduler_tick(void) {
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;
	u64 next_tick = rq->tick_timestamp + TICK_NSEC;
	spin_lock(&rq->lock);
        // 1. 处理安排妥当行列时钟更新
        __update_rq_clock(rq);    //添加当时rq的时刻戳rq->clock
......
	update_cpu_load(rq);     bbbb //更新rq->cpu_load[]
        // 2. 运用调度器调度
	if (curr != rq->idle) /* FIXME: needed? */
		curr->sched_class->task_tick(rq, curr);
	spin_unlock(&rq->lock);
......
}

上面代码片段中，update_cpu_load比较有意思，运用一些取平均值的技巧来更新CPU负载的。cpu_load分为5个等级，保证负荷数组不会出现太多不连续跳变：第0级负荷数组改变速度最快，第5级负荷数组改变速度较慢。举个比如：

当时CPU负荷	10	10	10	10	10	10
cpu_load[0]	10	10	10	10	10	10
cpu_load[1]	5	8	9	10	10	10
cpu_load[2]	3	5	7	8	9	10

/*
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC).
 */
static void update_cpu_load(struct rq *this_rq) {
	unsigned long this_load = this_rq->load.weight;
	int i, scale;
	this_rq->nr_load_updates++;
	/* Update our load: */
	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load, new_load;
		/* scale is effectively 1 << i now, and >> i divides by scale */
		old_load = this_rq->cpu_load[i];
		new_load = this_load;
		/*
		 * Round up the averaging division if load is increasing. This
		 * prevents us from getting stuck on 9 if the load is 10, for
		 * example.
		 */
		if (new_load > old_load)
			new_load += scale-1;
		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
	}
}

2. 主调度器schedule()：CPU自动分配给另一进程

/*
 * schedule() is the main scheduler function.
 */
asmlinkage void __sched schedule(void)
{
......
need_resched:
......
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	prev = rq->curr;               //prev指向当时进程
......
	__update_rq_clock(rq);         //更新安排妥当行列时钟
	clear_tsk_need_resched(prev);  //铲除冲调度标志TIF_NEED_RESCHED
        //安排妥当行列移除当时进程，这儿会调用到sched_class->dequque_task
        //假如进程是可中断睡眠状况但现在收到信号，有必要提升为安排妥当状况
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
				unlikely(signal_pending(prev)))) {
			prev->state = TASK_RUNNING;
		} else {
			deactivate_task(rq, prev, 1);
		}
		switch_count = &prev->nvcsw;
	}
......
        //告诉调度器类，当时运转的进程将要被另一个进程替代
	prev->sched_class->put_prev_task(rq, prev);
        //调度器挑选下一个应该履行的进程
	next = pick_next_task(rq, prev);
	sched_info_switch(prev, next);
        //预备履行硬件级的进程切换
	if (likely(prev != next)) {
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;
		context_switch(rq, prev, next); //上下文切换
	}
......
        //检测TIF_NEED_RESCHED是否设置，如设置从头开端搜索一个新进程
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
		goto need_resched;
}

3. 与fork的交互schedule_fork()：运用fork体系调用时，子进程调度实体数据结构、确定进程的动态优先级

void sched_fork(struct task_struct *p, int clone_flags)
{
	int cpu = get_cpu();
        //初始化调度字段、树立数据结构（task_struct->se）、子进程设为TASK_RUNNING
	__sched_fork(p);
......
	//运用父进程的normal_prio作为子进程的动态优先级prio
	p->prio = current->normal_prio;
        //假如优先级不在实时规模，默许调度器是彻底公正调度CFS
	if (!rt_prio(p->prio))
		p->sched_class = &fair_sched_class;
......
}

4. 上下文切换context_switch()：其核心是switch_mm()和switch_to()

/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;
        //上下文切换预备、mm指向下个进程的页表、oldmm指向当时进程的活动页表
	prepare_task_switch(rq, prev, next);
	mm = next->mm;
	oldmm = prev->active_mm;
 ......
        //加载页表、刷出TLB（部分或全部）、向MMU供给新的信息
	if (unlikely(!mm)) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else {
		switch_mm(oldmm, mm, next);
        }
        //假如当时进程是内核线程(prev->mm == NULL)，断开与借用的地址空间的联络
	if (unlikely(!prev->mm)) {
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}
......
        //切换寄存器和内核栈
	switch_to(prev, next, prev);
        //内存屏障，保证switch_to和finish_task_switch的履行次序不变
	barrier();
	//整理工作，正确地开释锁等（这儿可能switch_to到随机调度的进程，乃至是别的CPU上的进程，所以整理工作整理的是当时的活泼进程，也要从头核算rq）
	finish_task_switch(this_rq(), prev);
}

5. switch_to

切换寄存器和内核栈，一般运用汇编语言完成

<asm-x86/system_32.h>
/*
 * Saving eflags is important. It switches not only IOPL between tasks,
 * it also protects other tasks from NT leaking through sysenter etc.
 */
#define switch_to(prev,next,last) do {					\
	unsigned long esi,edi;						\
	asm volatile("pushfl\n\t"		/* Save flags */	\
		     "pushl %%ebp\n\t"					\
		     "movl %%esp,%0\n\t"	/* save ESP */		\
		     "movl %5,%%esp\n\t"	/* restore ESP */	\
		     "movl $1f,%1\n\t"		/* save EIP */		\
		     "pushl %6\n\t"		/* restore EIP */	\
		     "jmp __switch_to\n"				\
		     "1:\t"						\
		     "popl %%ebp\n\t"					\
		     "popfl"						\
		     :"=m" (prev->thread.esp),"=m" (prev->thread.eip),	\
		      "=a" (last),"=S" (esi),"=D" (edi)			\
		     :"m" (next->thread.esp),"m" (next->thread.eip),	\
		      "2" (prev), "d" (next));				\
} while (0)

【阐明】switch_to：其实质是保存旧进程eflags、ESP和EIP寄存器到当时内核栈prev->thread中，加载新进程eflags、ESP和EIP

输出参数：
%0=prev->thread.esp，内存变量
%1=prev->thread.eip，内存变量
%2=eax=last
%3=esi
%4=edi
输入参数：
%5=next->thread.esp，内存变量
%6=next->thread.eip，内存变量
%7=eax=prev
%8=next
函数内容：
	pushfl                          //保存eflags
	pushl %ebp                      //保存栈底指针EBP
	movl %esp, %prev->thread.esp    //保存栈顶指针ESP
	movl %next->thread.esp, %esp    //加载新进程ESP
	movl $1f, %prev->thread.eip     //保存旧进程EIP为标号1的位置，后续切换后从1履行
	pushl %next->thread.eip         //加载新进程EIP
	jmp __switch_to                 //完成硬件的上下文切换
1:	popl %ebp                       //康复新进程栈底指针EBP
	popfl                           //康复新进程eflags

【补白】switch_to函数有个有意思的点，是传了3个实参：prev, next, last。为什么要传last？

书中比如：三个进程轮转调度会出现问题
- 进程A->进程B：switch_to后，内核栈A记载prev=A,next=B
- 进程B->进程C：switch_to后，内核栈B记载prev=B,next=C
- 进程C->进程A：switch_to后，内核栈C记载prev=C,next=A；内核栈A记载prev=A,next=B。此时康复内核栈A，内核为了直到进程A前运转的是进程C，加入了last
其实质是：prev作为宏传入，switch_to前后prev因为内核栈重载发生改动，所以引进last变量来得到prev的值，用于finish_task_switch的整理工作

【深入Linux内核架构笔记】第二章 进程调度(1)

2.5 调度器的完成

2.5.1 概观

2.5.2 数据结构

2.5.3 处理优先级

2.5.4 核心调度器

相关文章

全方位剖析iOS高级技术问题(十一)之算法

🌸 用canvas绘制一场浪漫樱花雨

应用层HTTP也不简单——HTTP连接管理

为什么大家都不想回家过年了

作者信息

【深入Linux内核架构笔记】第二章进程调度(1)