【深入Linux内核架构笔记】第二章进程调度(3)

2.6 彻底公正调度CFS

核心思想：度量每个进程的虚拟运转时刻，调度运转时刻最少的进程运转
虚拟运转时刻：由物理时钟、进程优先级确认，优先级越高，虚拟时钟越慢，也便是进程越简单被调度到

2.6.1 数据结构

CFS的安排妥当行列
- vruntime：位于sched_entity，记录进程在虚拟时钟上流逝的数量
- min_vruntime：是单调递增的，盯梢行列上一切进程的最小虚拟运转时刻，比最左面的树结点的vruntime大些

/* CFS-related fields in a runqueue */
struct cfs_rq {
  struct load_weight load;     //安排妥当行列上进程的累积负荷值
  unsigned long nr_running;    //行列上可运转进程的数目
  u64 min_vruntime;        //一切进程的最小虚拟运转时刻
  struct rb_root tasks_timeline;  //管理红黑树一切进程
  struct rb_node *rb_leftmost;   //指向红黑树最左面的节点（需求被调度的进程）
  struct sched_entity *curr;    //当时履行进程的可调度实体
......
};

2.6.2 CFS虚拟时刻

1. 虚拟时钟核算

虚拟时钟：彻底公正调度算法依赖虚拟时钟，度量进程能得到的CPU时刻
- 依据：实践时钟、进程相关的负荷权重（进程优先级）
- 作业：更新当时进程履行的实践时钟、核算对应的虚拟时钟、更新行列的min_vruntime

// 核心函数：更新进程的物理时钟和虚拟时钟
static void update_curr(struct cfs_rq *cfs_rq) {
  //curr表明安排妥当行列当时履行的进程
  struct sched_entity *curr = cfs_rq->curr;
  //now表明安排妥当行列的真实时钟
  u64 now = rq_of(cfs_rq)->clock;
  unsigned long delta_exec;
......
  //核算 当时安排妥当行列时钟 和 上一次安排妥当行列时钟 的时刻差delta_exec
  delta_exec = (unsigned long)(now - curr->exec_start);
   //托付给__update_curr，更新当时进程在CPU花费的物理时刻和虚拟时刻
  __update_curr(cfs_rq, curr, delta_exec);
  curr->exec_start = now;
......
}
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
     unsigned long delta_exec)
{
  unsigned long delta_exec_weighted;
  u64 vruntime;
......
  //更新当时进程的履行总时刻（物理时刻）
  curr->sum_exec_runtime += delta_exec;
......
  //依据负荷权重核算当时进程添加的虚拟时钟（虚拟时刻）
  delta_exec_weighted = delta_exec;
  if (unlikely(curr->load.weight != NICE_0_LOAD)) {
    delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
              &curr->load);
   }
  curr->vruntime += delta_exec_weighted;
  //对CFS行列更新min_vruntime（内核确保值单调添加）
  if (first_fair(cfs_rq)) {
    //假如红黑树有最左面结点（有进程在等待调度），vruntime=最左面结点进程的vruntime
    vruntime = min_vruntime(curr->vruntime,
        __pick_next_entity(cfs_rq)->vruntime);
   } else {
    //假如红黑树是空的，vruntime=当时进程的虚拟运转时刻
    vruntime = curr->vruntime;
   }
  cfs_rq->min_vruntime =
    max_vruntime(cfs_rq->min_vruntime, vruntime);
}

其间，虚拟时钟的增长量delta_exec_weighted依据公式核算，取四舍五入：

delta_exec_weighted=delta_execNICE_0_LOADload_weightdelta\_exec\_weighted = delta\_exec \times \frac{NICE\_0\_LOAD}{load\_weight}

依据表prio_to_weight，nice=0的负荷权重load_weight=1024
- nice=0（prio=120）：delta_exec_weighted = delta_exec1024/1024 = delta_exec*
- nice=-10（prio=110）：delta_exec_weighted = delta_exec1024/9548 = 0.107delta_exec

【总结】

关于nice=0的进程，虚拟时刻=物理时刻；关于其他优先级，依据负荷权重load_weight从头衡定时刻。
进程优先级越高，nice值越小，进程负荷权重load_weight越大，虚拟时钟跑得越慢
- 进程运转时，vruntime稳定添加，越重要的进程虚拟时钟跑得越慢，在红黑树中方位更靠左，越简单被调度到
- 进程进入睡觉，vruntime坚持不变。每个行列min_vruntime添加，睡觉进程醒来则在红黑树中方位更靠左，更简单被调度

2. 进程调度时刻核算

CFS没有时刻片的概念，某个调度实体分配到的时刻由以下公式决定：

\times \frac {调度实体的权重} {安排妥当行列的权重}

其间安排妥当行列的权重便是一切调度实体权重之和，拜见update_load_add()

/* 核算某个调度实体分配到的时刻 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) {
  u64 slice = __sched_period(cfs_rq->nr_running);
  slice *= se->load.weight;
  do_div(slice, cfs_rq->load.weight);
  return slice;
}
/* 核算安排妥当行列调度的总时刻 */
static u64 __sched_period(unsigned long nr_running) {
  //默许推迟周期为20ms
  u64 period = sysctl_sched_latency;
  //安排妥当行列上活动进程数目，默许5
  unsigned long nr_latency = sched_nr_latency;
  //假如行列上数目超越默许值，总时刻要拉长（sysctl_sched_latency*nr_running/nr_latency）
  if (unlikely(nr_running > nr_latency)) {
    period *= nr_running;
    do_div(period, nr_latency);
   }
  return period;
}

等价的虚拟时刻在__sched_vslice函数给出，公式：

vslice=timeNICE_0_LOADweightvslice = time \times \frac {NICE\_0\_LOAD} {weight}

进程调度时刻核算涉及到三个内核参数：用于盯梢CFS的调度推迟（物理时刻）
- sysctl_sched_latency：CPU 密集型使命的目标抢占推迟，默许值20ms。每个调度实体一定在20ms内会被调度到（留意：和时刻片不一样，时刻片是固定的）
- sysctl_sched_min_granularity：CPU 密集型使命的最小抢占粒度，默许4ms。设的越小切换越频繁（个人理解为这4ms都是你的，不能被抢走）
- sched_nr_latency：操控一个推迟周期中处理的最大活动进程数目。假如活动进程数目超越该上限，推迟周期成份额地线性扩展（20ms最多处理5个使命，每个使命4ms）

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 20000000ULL;
/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 4000000ULL;
/*
 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 5;

2.6.3 行列操作

1. 进程参加安排妥当行列

由enqueue_task_fair完结
- 进程最近在运转，更新vruntime后，直接参加安排妥当行列红黑树中
- 进程此前在睡觉，或许其vruntime与行列的vruntime差的很远，所以要先调整后再加到红黑树里，确保进程不会一向履行并更优先被调度到
- 假如进程是新fork的，分两种情况
  - 父进程先履行，参加到安排妥当行列最终面，确保进程最终才被调度到
  - 子进程先履行（Linux 2.6.24默许）：确保子进程虚拟时钟小于父进程，即子进程先履行（2.6.7节），并参加到安排妥当行列最终面，进程最终被调度到

/* 进程参加行列，wakeup表明入队的进程是否此前在睡觉，当时被唤醒并转换为运转态 */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) {
  struct cfs_rq *cfs_rq;
  struct sched_entity *se = &p->se;
  for_each_sched_entity(se) {
    //已经在安排妥当行列中，直接返回
    if (se->on_rq)
      break;
    cfs_rq = cfs_rq_of(se);
    enqueue_entity(cfs_rq, se, wakeup);
    wakeup = 1;
   }
}
/* 核心函数，将调度实体参加行列 */
static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
{
  //更新当时CFS行列时刻
  update_curr(cfs_rq);
 if (wakeup) {
    //假如进程此前在睡觉，调整进程的虚拟时刻
    place_entity(cfs_rq, se, 0);
    enqueue_sleeper(cfs_rq, se);
   }
......
  if (se != cfs_rq->curr)
    //内核函数，将调度实体置入红黑树中
    __enqueue_entity(cfs_rq, se);
  //安排妥当行列活动进程数cfs_rq->nr_running++;
  account_entity_enqueue(cfs_rq, se);
}
/*更新新进程的vruntime值，以便把他插入红黑树 */
static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
  u64 vruntime;
  //以当时行列的min_vruntime作为基准时刻
  vruntime = cfs_rq->min_vruntime;
......
  //是新进程被加到系统中，那么新进程的vruntime需求加上整个调度周期的虚拟实践，确保最终才被调度到
  if (initial)
    vruntime += sched_vslice_add(cfs_rq, se);
  //进程此前在睡觉，扣掉sysctl_sched_latency，确保进程唤醒后能够更早被调度到
  if (!initial) {
......
    vruntime -= sysctl_sched_latency;
    //取se->vruntime和vruntime的最大值
    vruntime = max_vruntime(se->vruntime, vruntime);
   }
  //更新进程的vruntime
  se->vruntime = vruntime;
}

其间__enqueue_entity将进程置入红黑树中。参加的方位由entity_key确认。
- 当时进程的vruntime越小，则参加的方位越靠左，越简单被选到（2.6.4节）

static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) {
  return se->vruntime - cfs_rq->min_vruntime;
}

2.6.4 挑选下一个进程

挑选红黑树最靠左的进程作为下一个进程

static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) {
  struct sched_entity *se = NULL;
  if (first_fair(cfs_rq)) {
    //挑选最靠左的进程作为下一个进程
    se = __pick_next_entity(cfs_rq);
    //将进程取出标记为运转进程
    set_next_entity(cfs_rq, se);
   }
  return se;
}
/* 挑选进程后需求做一些作业，标记为运转进程 */
static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
......
  //从安排妥当行列中移除最左面的进程
  if (se->on_rq) {
    __dequeue_entity(cfs_rq, se);
   }
  //安排妥当行列标记当时进程，并更新进程运转的时刻
  update_stats_curr_start(cfs_rq, se);
  cfs_rq->curr = se;
......
  se->prev_sum_exec_runtime = se->sum_exec_runtime;
}

2.6.5 处理周期性调度器

内核按频率HZ履行周期性调度。由task_tick_fair担任，实践作业由entity_tick完结
- 更新安排妥当行列实践时钟、虚拟时钟统计量
- 假如当时进程运转时刻过长，则从头调度

static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) {
  //更新统计量
  update_curr(cfs_rq);
  if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
    //看看进程运转的时刻是否过长，需求从头调度
    check_preempt_tick(cfs_rq, curr);
}
static void check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) {
  unsigned long ideal_runtime, delta_exec;
  //核算当时进程的调度时刻
  ideal_runtime = sched_slice(cfs_rq, curr);
  delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
  //假如当时进程的运转时刻过长，超越了调度时刻额度，需求从头调度
  if (delta_exec > ideal_runtime)
    resched_task(rq_of(cfs_rq)->curr);
}

从头调度resched_task()设置标志TIF_NEED_RESCHED。进程在恰当机遇（比方系统调用返回、中止返回）建议schedule()

asmlinkage void __sched schedule(void)
{
need_resched:
......
  prev = rq->curr;
......
  prev->sched_class->put_prev_task(rq, prev); //将当时进程从头放回安排妥当行列中
  next = pick_next_task(rq, prev);       //挑选下一个进程调度履行
......
  if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
    goto need_resched;
}

2.6.6 唤醒抢占

在try_to_wake_up和wake_up_new_task中唤醒进程时，内核使用check_preempt_curr看看是否新进程能够抢占当时进程
彻底公正调度器使用check_preempt_wakeup完成check_preempt_curr
- 假如是实时进程，会当即抢占CFS进程
- 假如是批处理进程，不抢占其他进程
- CFS进程抢占，要确保进程运转sysctl_sched_wakeup_granularity（默许4ms），避免频繁切换

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
{
  struct task_struct *curr = rq->curr;
  struct cfs_rq *cfs_rq = task_cfs_rq(curr);
  struct sched_entity *se = &curr->se, *pse = &p->se;
  unsigned long gran;
  //假如是实时进程，会当即抢占CFS进程
  if (unlikely(rt_prio(p->prio))) {
    update_rq_clock(rq);
    update_curr(cfs_rq);
    resched_task(curr);
    return;
   }
  //假如是批处理进程，不抢占其他进程
  if (unlikely(p->policy == SCHED_BATCH))
    return;
......
  //CFS进程被新CFS进程抢占，需求至少运转4ms。留意这儿4ms要转成对应的虚拟时刻
  gran = sysctl_sched_wakeup_granularity;
  if (unlikely(se->load.weight != NICE_0_LOAD))
    gran = calc_delta_fair(gran, &se->load);
  //超越4ms则从头调度
  if (pse->vruntime + gran < se->vruntime)
    resched_task(curr);
}

2.6.7 处理新进程

由sysctl_sched_child_runs_first操控fork进程后是父进程先履行还是子进程先履行
- Linux 2.6.24默许值为1，表明子进程先履行。后续内核有修订
- 过程：更新虚拟时钟、确保子进程先履行、子进程参加安排妥当行列、从头调度（留意：这儿子进程参加安排妥当行列后不一定会当即履行）

long do_fork(unsigned long clone_flags,  //标志调集，SIGCHLD表明fork后发送SIGCHLD信号给父进程
     unsigned long stack_start,  //用户状态下栈的起始地址
     struct pt_regs *regs,     //指向寄存器调集的指针
     unsigned long stack_size,   //用户状态下栈的大小，一般为0
     int __user *parent_tidptr,  //指向父进程PID
     int __user *child_tidptr)   //指向子进程PID
{
......
    // 将子进程参加调度器行列，由调度器挑选它运转
    if (!(clone_flags & CLONE_STOPPED))
        wake_up_new_task(p, clone_flags);
......
}
//唤醒子进程
void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
{
  //初始化子进程优先级
  p->prio = effective_prio(p);
......
  //子进程调度时刻初始化
  p->sched_class->task_new(rq, p);
  inc_nr_running(p, rq);
......
}
//彻底公正调度走task_new_fair
static void task_new_fair(struct rq *rq, struct task_struct *p) {
  struct cfs_rq *cfs_rq = task_cfs_rq(p);
  struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
  int this_cpu = smp_processor_id();
  sched_info_queued(p);
  //更新虚拟时钟，更新se->vruntime。留意第三个参数initial=1
  update_curr(cfs_rq);
  place_entity(cfs_rq, se, 1);
  //这儿确保子进程先履行
  if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
      curr && curr->vruntime < se->vruntime) {
    /*
     * Upon rescheduling, sched_class::put_prev_task() will place
     * 'current' within the tree based on its new key value.
     */
    swap(curr->vruntime, se->vruntime);
   }
  //参加行列，从头请求调度
  enqueue_task_fair(rq, p, 0);
  resched_task(rq->curr);
}

代码中swap()的作用：假如父进程虚拟运转时刻curr->vruntime小于子进程运转时刻se->vruntime，表明父进程先于子进程履行。所以经过交换虚拟时钟的方法，确保子进程先履行。

【深入Linux内核架构笔记】第二章进程调度(3)–完全公平调度CFS

2.6 彻底公正调度CFS

2.6.1 数据结构

2.6.2 CFS虚拟时刻

2.6.3 行列操作

2.6.4 挑选下一个进程

2.6.5 处理周期性调度器

2.6.6 唤醒抢占

2.6.7 处理新进程

作者信息

【深入Linux内核架构笔记】第二章 进程调度(3)–完全公平调度CFS

2.6 彻底公正调度CFS

2.6.1 数据结构

2.6.2 CFS虚拟时刻

2.6.3 行列操作

2.6.4 挑选下一个进程

2.6.5 处理周期性调度器

2.6.6 唤醒抢占

2.6.7 处理新进程

相关文章

AI大模型已经出现不可预测的能力

CBAM：融合通道和空间注意力的注意力模块

一次JVM GC长暂停的排查过程

如何为Azure SQL数据库部署Azure SQL Elastic Pools

作者信息

【深入Linux内核架构笔记】第二章进程调度(3)–完全公平调度CFS