seminix 0.1.60 - Docs.rs

// Pelt 负载衰减算法, 来自于 linux-4.18

use core::hint::unlikely;

use crate::time::sched_clock::sched_clock;

// 处理器算力能力值, 平衡可运行和瞬时负载
const SCHED_CAPACITY_SHIFT: usize = 10;
const SCHED_CAPACITY_SCALE: usize = 1 << SCHED_CAPACITY_SHIFT;

// 负载周期衰减系数, 计算过程复杂 linux 内核直接定义成常量
#[allow(non_upper_case_globals)]
const runnable_avg_yN_inv: [u32; 32] = [
    0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 0xe0ccdeeb, 0xdbfbb796,
    0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46,
    0xb504f333, 0xb123f581, 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
    0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, 0x85aac367, 0x82cd8698,
];

const LOAD_AVG_PERIOD: u32 = 32;
const LOAD_AVG_MAX: u32 = 47742;

const PELT_MIN_DIVIDER: u32 = LOAD_AVG_MAX - 1024;

// Approximate:
//   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
/// 负载衰减
fn decay_load(mut val: u64, n: u64) -> u64 {
    if unlikely(n > u64::from(LOAD_AVG_PERIOD) * 63) {
        return 0;
    }

    // after bounds checking we can collapse to 32-bit
    let mut local_n = n as u32;

    // As y^PERIOD = 1/2, we can combine
    //    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
    // With a look-up table which covers y^n (n<PERIOD)
    //
    // To achieve constant time decay_load.
    if unlikely(local_n >= LOAD_AVG_PERIOD) {
        val >>= local_n / LOAD_AVG_PERIOD;
        local_n %= LOAD_AVG_PERIOD;
    }

    ((u128::from(val) * u128::from(runnable_avg_yN_inv[local_n as usize])) >> 32) as u64
}

// 计算整个周期负载衰减值
fn __accumulate_pelt_segments(periods: u64, d1: u32, d3: u32) -> u32 {
    // y^0 == 1
    let c3 = d3;

    // c1 = d1 y^p
    let c1 = decay_load(u64::from(d1), periods);

    //            p-1
    // c2 = 1024 \Sum y^n
    //            n=1
    //
    //              inf        inf
    //    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
    //              n=0        n=p
    let c2 = u64::from(LOAD_AVG_MAX) - decay_load(u64::from(LOAD_AVG_MAX), periods) - 1024;

    c1 as u32 + c2 as u32 + c3
}

// 调度平均负载统计
#[repr(align(64))]
pub(crate) struct SchedAvg {
    last_update_time: u64,

    load_sum: u64,
    runnable_sum: u64,
    util_sum: u64,

    // 一个周期内贡献的时间片
    period_contrib: u64,

    // 统计系统平均负载, 反应整个系统的负载情况
    // 统计包括: 正在运行, 等待运行, 睡眠中任务
    load_avg: u64,
    // 统计系统可运行任务平均负载
    // 统计包括: 正在运行, 等待运行任务
    runnable_avg: u64,
    // 统计系统瞬时负载, 反应系统处理器真实时间负载
    // 仅统计正在运行任务
    util_avg: u64,
}

impl SchedAvg {
    pub(crate) fn new() -> Self {
        SchedAvg {
            last_update_time: sched_clock(),
            load_sum: 0,
            runnable_sum: 0,
            util_sum: 0,
            period_contrib: 0,
            load_avg: 0,
            runnable_avg: 0,
            util_avg: 0,
        }
    }

    // Accumulate the three separate parts of the sum; d1 the remainder
    // of the last (incomplete) period, d2 the span of full periods and d3
    // the remainder of the (incomplete) current period.
    //
    //           d1          d2           d3
    //           ^           ^            ^
    //           |           |            |
    //         |<->|<----------------->|<--->|
    // ... |---x---|------| ... |------|-----x (now)
    //
    //                           p-1
    // u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
    //                           n=1
    //
    //    = u y^p +					(Step 1)
    //
    //                     p-1
    //      d1 y^p + 1024 \Sum y^n + d3 y^0		(Step 2)
    //                     n=1
    #[inline(always)]
    fn accumulate_sum(&mut self, mut delta: u64, load: u64, runnable: u64, running: u64) -> bool {
        // p == 0 -> delta < 1024
        let mut contrib = delta as u32;

        delta += self.period_contrib;
        // A period is 1024us (~1ms)
        let periods = delta / 1024;

        // Step 1: decay old *_sum if we crossed period boundaries.
        if periods != 0 {
            self.load_sum = decay_load(self.load_sum, periods);
            self.runnable_sum = decay_load(self.runnable_sum, periods);
            self.util_sum = decay_load(self.util_sum, periods);

            // Step 2
            delta %= 1024;
            if load != 0 {
                // This relies on the:
                //
                // if (!load)
                // 	runnable = running = 0;
                //
                // clause from ___update_load_sum(); this results in
                // the below usage of @contrib to disappear entirely,
                // so no point in calculating it.
                contrib = __accumulate_pelt_segments(
                    periods,
                    1024 - self.period_contrib as u32,
                    delta as u32,
                );
            }
        }
        self.period_contrib = delta;

        if load != 0 {
            self.load_sum += load * u64::from(contrib);
        }
        if runnable != 0 {
            self.runnable_sum += (runnable * u64::from(contrib)) * SCHED_CAPACITY_SCALE as u64;
        }
        if running != 0 {
            self.util_sum += u64::from(contrib) * SCHED_CAPACITY_SCALE as u64;
        }

        periods != 0
    }

    // We can represent the historical contribution to runnable average as the
    // coefficients of a geometric series.  To do this we sub-divide our runnable
    // history into segments of approximately 1ms (1024us); label the segment that
    // occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
    //
    // [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
    //      p0            p1           p2
    //     (now)       (~1ms ago)  (~2ms ago)
    //
    // Let u_i denote the fraction of p_i that the entity was runnable.
    //
    // We then designate the fractions u_i as our co-efficients, yielding the
    // following representation of historical load:
    //   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
    //
    // We choose y based on the with of a reasonably scheduling period, fixing:
    //   y^32 = 0.5
    //
    // This means that the contribution to load ~32ms ago (u_32) will be weighted
    // approximately half as much as the contribution to load within the last ms
    // (u_0).
    //
    // When a period "rolls over" and we have new u_0`, multiplying the previous
    // sum again by y is sufficient to update:
    //   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
    //            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
    #[inline(always)]
    fn ___update_load_sum(
        &mut self,
        now: u64,
        load: u64,
        mut runnable: u64,
        mut running: u64,
    ) -> bool {
        let mut delta = now - self.last_update_time;

        // Use 1024ns as the unit of measurement since it's a reasonable
        // approximation of 1us and fast to compute.
        delta >>= 10;
        if delta == 0 {
            return false;
        }

        self.last_update_time += delta << 10;

        // running is a subset of runnable (weight) so running can't be set if
        // runnable is clear. But there are some corner cases where the current
        // se has been already dequeued but cfs_rq->curr still points to it.
        // This means that weight will be 0 but not running for a sched_entity
        // but also for a cfs_rq if the latter becomes idle. As an example,
        // this happens during idle_balance() which calls
        // update_blocked_averages().
        //
        // Also see the comment in accumulate_sum().
        if load == 0 {
            runnable = 0;
            running = 0;
        }

        // Now we know we crossed measurement unit boundaries. The *_avg
        // accrues by two steps:
        //
        // Step 1: accumulate *_sum since last_update_time. If we haven't
        // crossed period boundaries, finish.
        if self.accumulate_sum(delta, load, runnable, running) {
            return true;
        }

        false
    }

    const fn get_pelt_divider(&self) -> u64 {
        self.period_contrib + PELT_MIN_DIVIDER as u64
    }

    #[inline(always)]
    fn ___update_load_avg(&mut self, load: u64) {
        let divider = self.get_pelt_divider();

        // Step 2: update *_avg.
        self.load_avg = (load * self.load_sum) / divider;
        self.runnable_avg = self.runnable_sum / divider;
        self.util_avg = self.util_sum / divider;
    }

    // load 为当前处理器任务的负载权重
    // runnable 为当前处理器可运行任务数量
    // running 为当前处理器是否正在运行任务
    //
    // load_avg 的基准值为负载权重
    // runnable_avg 的基准值为一个任务 = 1024
    // running 的基准值范围为[0-1024], 1024 即意味着任务一直运行
    #[inline(always)]
    pub(crate) fn update_load_avg(
        &mut self,
        now: u64,
        load: u64,
        runnable: u64,
        running: u64,
    ) -> bool {
        if self.___update_load_sum(now, load, runnable, running) {
            self.___update_load_avg(load);
            return true;
        }
        false
    }

    #[inline(always)]
    pub(crate) fn load_avg(&self) -> u64 {
        self.load_avg
    }

    #[inline(always)]
    pub(crate) fn runnable_avg(&self) -> u64 {
        self.runnable_avg
    }

    #[inline(always)]
    pub(crate) fn util_avg(&self) -> u64 {
        self.util_avg
    }
}