#include <scx/common.bpf.h>
#include <scx/compat.bpf.h>
#include "intf.h"
char _license[] SEC("license") = "GPL";
const volatile u64 nr_cpu_ids = 1;
#define TRACE_SCHED 0
#define TIER_BATCH 0
#define TIER_INTERACTIVE 1
#define TIER_LAT_CRITICAL 2
#define LAT_CRI_THRESH_HIGH 32
#define LAT_CRI_THRESH_LOW 8
#define LAT_CRI_CAP 255
#define KTHREAD_HIPRI_STATIC_PRIO_MAX 110
#define WEIGHT_LAT_CRITICAL 256
#define WEIGHT_INTERACTIVE 192
#define WEIGHT_BATCH 128
#define EWMA_AGE_MATURE 8
#define EWMA_AGE_CAP 16
#define MAX_WAKEUP_FREQ 64
#define MAX_CSW_RATE 512
static u64 lag_cap_ns = 40000000ULL;
#define SLICE_MIN_NS 100000
#define K_Q16_SHIFT 16
#define K_SOJOURN_INTERVAL 19661u
#define K_OVERFLOW_RESCUE 16384u
#define K_CODEL_FLOOR 1147u
#define K_STARVATION_RESCUE 273285u
#define K_LONGRUN 3276800u
#define K_CODEL_MAX 3277u
#define K_VTIME_CEILING 196608u
#define K_LAG_CAP 65536u
#define K_SPILL_BUDGET 80000000ULL
#define K_AFFINITY_SEARCH 40000000ULL
#define K_OSC_PULL_THRESH_NS 10000000u
#define K_OSC_DAMP_THRESH_NS 8000000u
#define OSC_VELOCITY_CAP_PER_PULL 50000u
static u32 nr_nodes;
static u64 vtime_now;
static bool interactive_waiting;
static bool latcrit_waiting;
static u64 batch_enqueue_ns;
static u64 interactive_enqueue_ns;
static u64 pcpu_enqueue_ns[MAX_CPUS];
static u64 starvation_rescue_ns;
static u64 overflow_sojourn_rescue_ns;
static u32 pcpu_depth_base;
static u64 vtime_ceiling_window_ns;
static u32 longrun_preempt_shift;
#define OSCILLATOR_PULL_NS 8000
static u32 oscillator_damping_shift; static u32 oscillator_spring_shift; static u32 oscillator_pull_scale; static s64 oscillator_velocity_cap; u64 codel_target_floor_ns; static u64 sojourn_interval_ns; u64 codel_target_ns; static s64 oscillator_velocity_ns; static u64 prev_rescue_snapshot; static u64 global_rescue_count; static u64 pcpu_min_sojourn_ns[MAX_CPUS];
static u64 pcpu_stall_start_ns[MAX_CPUS];
static u64 longrun_thresh_ns = 2000000000ULL;
u64 codel_target_max_ns = 2000000ULL; static bool longrun_mode;
static u64 last_tau_snapshot;
static u64 codel_target_equilibrium_ns = 2000000ULL;
UEI_DEFINE(uei);
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 1);
__type(key, u32);
__type(value, struct tuning_knobs);
} tuning_knobs_map SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, 1);
__type(key, u32);
__type(value, struct pandemonium_stats);
} stats_map SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, MAX_CPUS);
__type(key, u32);
__type(value, u32);
} cache_domain SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_LRU_HASH);
__uint(max_entries, 512);
__type(key, char[16]);
__type(value, struct task_class_entry);
} task_class_observe SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 512);
__type(key, char[16]);
__type(value, struct task_class_entry);
} task_class_init SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, 32);
__type(key, char[16]);
__type(value, u8);
} compositor_map SEC(".maps");
#define MAX_L2_SIBLINGS 8
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 512);
__type(key, u32);
__type(value, u32);
} l2_siblings SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, MAX_CPUS * MAX_AFFINITY_CANDIDATES);
__type(key, u32);
__type(value, u32);
} affinity_rank SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, 36);
__type(key, u32);
__type(value, u64);
} wake_lat_hist SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, 4);
__type(key, u32);
__type(value, u64);
} sleep_hist SEC(".maps");
struct task_ctx {
u64 awake_vtime;
u64 last_run_at;
u64 wakeup_freq;
u64 last_woke_at;
u64 enqueue_at; u64 avg_runtime;
u64 runtime_dev; u64 cached_weight;
u64 prev_nvcsw;
u64 csw_rate;
u64 lat_cri;
u64 sleep_start_ns; u32 tier;
u32 ewma_age;
s32 last_cpu; u8 dispatch_path; u8 _pad[3];
};
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");
static __always_inline struct pandemonium_stats *get_stats(void)
{
u32 zero = 0;
return bpf_map_lookup_elem(&stats_map, &zero);
}
static __always_inline struct tuning_knobs *get_knobs(void)
{
u32 zero = 0;
return bpf_map_lookup_elem(&tuning_knobs_map, &zero);
}
static __always_inline struct task_ctx *lookup_task_ctx(const struct task_struct *p)
{
return bpf_task_storage_get(&task_ctx_stor,
(struct task_struct *)p, 0, 0);
}
static __always_inline struct task_ctx *ensure_task_ctx(struct task_struct *p)
{
struct task_ctx zero = {};
return bpf_task_storage_get(&task_ctx_stor, p, &zero,
BPF_LOCAL_STORAGE_GET_F_CREATE);
}
static __always_inline void count_l2_affinity(struct pandemonium_stats *s,
const struct task_ctx *tctx,
s32 cpu)
{
u32 lcpu = (u32)tctx->last_cpu;
u32 ncpu = (u32)cpu;
u32 *ld = bpf_map_lookup_elem(&cache_domain, &lcpu);
u32 *nd = bpf_map_lookup_elem(&cache_domain, &ncpu);
bool hit = ld && nd && *ld == *nd;
if (tctx->tier == TIER_BATCH) {
if (hit) s->nr_l2_hit_batch += 1;
else s->nr_l2_miss_batch += 1;
} else if (tctx->tier == TIER_INTERACTIVE) {
if (hit) s->nr_l2_hit_interactive += 1;
else s->nr_l2_miss_interactive += 1;
} else {
if (hit) s->nr_l2_hit_lat_crit += 1;
else s->nr_l2_miss_lat_crit += 1;
}
}
static __always_inline s32 find_idle_l2_sibling(const struct task_ctx *tctx,
const struct cpumask *allowed)
{
if (tctx->last_cpu < 0)
return -1;
u32 lcpu = (u32)tctx->last_cpu;
u32 *group = bpf_map_lookup_elem(&cache_domain, &lcpu);
if (!group)
return -1;
u32 base = *group * MAX_L2_SIBLINGS;
for (int i = 0; i < MAX_L2_SIBLINGS; i++) {
u32 key = base + i;
u32 *val = bpf_map_lookup_elem(&l2_siblings, &key);
if (!val || *val == (u32)-1)
break;
s32 cpu = (s32)*val;
if (allowed && !bpf_cpumask_test_cpu(cpu, allowed))
continue;
if (scx_bpf_test_and_clear_cpu_idle(cpu))
return cpu;
}
return -1;
}
static u32 affinity_search_online = 3;
static __always_inline s32 find_idle_by_affinity(s32 src_cpu,
const struct cpumask *allowed)
{
if (src_cpu < 0 || (u32)src_cpu >= nr_cpu_ids)
return -1;
u32 base = (u32)src_cpu * MAX_AFFINITY_CANDIDATES;
u32 checked = 0;
for (int i = 0; i < MAX_AFFINITY_CANDIDATES; i++) {
u32 key = base + (u32)i;
u32 *val = bpf_map_lookup_elem(&affinity_rank, &key);
if (!val || *val == (u32)-1)
break;
if (*val >= nr_cpu_ids)
continue;
if (allowed && !bpf_cpumask_test_cpu((s32)*val, allowed))
continue;
if (scx_bpf_test_and_clear_cpu_idle((s32)*val))
return (s32)*val;
if (++checked >= affinity_search_online)
break;
}
return -1;
}
static u32 pcpu_spill_search_budget = 6;
static __always_inline s32 find_pcpu_with_room(s32 src_cpu,
const struct cpumask *allowed)
{
if (src_cpu < 0 || (u32)src_cpu >= nr_cpu_ids)
return -1;
u32 base = (u32)src_cpu * MAX_AFFINITY_CANDIDATES;
u32 checked = 0;
for (int i = 0; i < MAX_AFFINITY_CANDIDATES; i++) {
u32 key = base + (u32)i;
u32 *val = bpf_map_lookup_elem(&affinity_rank, &key);
if (!val || *val == (u32)-1)
break;
if (*val >= nr_cpu_ids)
continue;
if (allowed && !bpf_cpumask_test_cpu((s32)*val, allowed))
continue;
if (scx_bpf_dsq_nr_queued((u64)*val) < pcpu_depth_base)
return (s32)*val;
if (++checked >= pcpu_spill_search_budget)
break;
}
return -1;
}
static __always_inline u64 pick_pcpu_dsq_with_spill(s32 src_cpu,
const struct cpumask *allowed,
s32 *out_cpu)
{
u64 now = bpf_ktime_get_ns();
bool src_ok = (u64)src_cpu < nr_cpu_ids &&
(!allowed || bpf_cpumask_test_cpu(src_cpu, allowed));
if (src_ok &&
scx_bpf_dsq_nr_queued((u64)src_cpu) < pcpu_depth_base) {
if ((u32)src_cpu < MAX_CPUS)
__sync_val_compare_and_swap(
&pcpu_enqueue_ns[src_cpu & (MAX_CPUS - 1)],
0, now);
*out_cpu = src_cpu;
return (u64)src_cpu;
}
s32 spill = find_pcpu_with_room(src_cpu, allowed);
if (spill >= 0) {
if ((u32)spill < MAX_CPUS)
__sync_val_compare_and_swap(
&pcpu_enqueue_ns[spill & (MAX_CPUS - 1)],
0, now);
*out_cpu = spill;
return (u64)spill;
}
s32 node = __COMPAT_scx_bpf_cpu_node(src_cpu);
if (node < 0 || (u32)node >= nr_nodes) node = 0;
__sync_val_compare_and_swap(&interactive_enqueue_ns, 0, now);
*out_cpu = src_cpu;
return nr_cpu_ids + (u64)node;
}
static __always_inline void arm_interactive_waiting(const struct task_ctx *tctx)
{
if (!tctx || tctx->tier == TIER_BATCH)
return;
interactive_waiting = true;
if (tctx->tier == TIER_LAT_CRITICAL)
latcrit_waiting = true;
}
static __always_inline void update_pcpu_sojourn(u32 cpu, u64 sojourn)
{
if (cpu >= MAX_CPUS) return;
if (sojourn < pcpu_min_sojourn_ns[cpu])
pcpu_min_sojourn_ns[cpu] = sojourn;
}
static __always_inline bool pcpu_dsq_is_stalled(u32 cpu, u64 now)
{
if (cpu >= MAX_CPUS) return false;
u64 min_s = pcpu_min_sojourn_ns[cpu];
if (min_s < codel_target_ns) {
pcpu_stall_start_ns[cpu] = 0;
pcpu_min_sojourn_ns[cpu] = ~0ULL;
return false;
}
if (pcpu_stall_start_ns[cpu] == 0) {
pcpu_stall_start_ns[cpu] = now + sojourn_interval_ns;
return false;
}
if (now >= pcpu_stall_start_ns[cpu]) {
pcpu_min_sojourn_ns[cpu] = ~0ULL;
pcpu_stall_start_ns[cpu] = 0;
return true;
}
return false;
}
static __always_inline bool sojourn_gate_pass(u64 now)
{
u64 ie = interactive_enqueue_ns;
u64 be = batch_enqueue_ns;
return (ie == 0 || (now - ie) <= overflow_sojourn_rescue_ns) &&
(be == 0 || (now - be) <= overflow_sojourn_rescue_ns);
}
static __always_inline bool try_service_older_overflow(u64 now,
u64 node_dsq,
u64 batch_dsq,
u64 thresh,
bool feed_oscillator)
{
u64 ie = interactive_enqueue_ns;
u64 be = batch_enqueue_ns;
u64 i_age = (ie > 0 && now > ie) ? (now - ie) : 0;
u64 b_age = (be > 0 && now > be) ? (now - be) : 0;
bool i_aged = i_age > thresh;
bool b_aged = b_age > thresh;
if (!i_aged && !b_aged)
return false;
bool serve_interactive = i_aged && (!b_aged || i_age >= b_age);
bool dispatched_any = false;
if (serve_interactive) {
if (scx_bpf_dsq_move_to_local(node_dsq, 0)) {
if (scx_bpf_dsq_nr_queued(node_dsq) == 0) {
u64 old = interactive_enqueue_ns;
if (old > 0)
__sync_val_compare_and_swap(&interactive_enqueue_ns, old, 0);
}
dispatched_any = true;
}
if (b_aged && scx_bpf_dsq_move_to_local(batch_dsq, 0)) {
if (scx_bpf_dsq_nr_queued(batch_dsq) == 0) {
u64 old = batch_enqueue_ns;
if (old > 0)
__sync_val_compare_and_swap(&batch_enqueue_ns, old, 0);
}
dispatched_any = true;
}
} else {
if (scx_bpf_dsq_move_to_local(batch_dsq, 0)) {
if (scx_bpf_dsq_nr_queued(batch_dsq) == 0) {
u64 old = batch_enqueue_ns;
if (old > 0)
__sync_val_compare_and_swap(&batch_enqueue_ns, old, 0);
}
dispatched_any = true;
}
if (i_aged && scx_bpf_dsq_move_to_local(node_dsq, 0)) {
if (scx_bpf_dsq_nr_queued(node_dsq) == 0) {
u64 old = interactive_enqueue_ns;
if (old > 0)
__sync_val_compare_and_swap(&interactive_enqueue_ns, old, 0);
}
dispatched_any = true;
}
}
if (!dispatched_any)
return false;
struct pandemonium_stats *s = get_stats();
if (s) {
s->nr_dispatches += 1;
if (feed_oscillator)
s->nr_overflow_rescue += 1;
}
if (feed_oscillator)
__sync_fetch_and_add(&global_rescue_count, 1);
return true;
}
static __always_inline u64 scale_tau(u64 tau_ns, u64 k_q16)
{
return (tau_ns * k_q16) >> K_Q16_SHIFT;
}
static __always_inline void apply_tau_scaling(u64 tau_ns, u64 codel_eq_ns)
{
if (tau_ns == 0)
return;
u64 snap = __sync_fetch_and_add(&last_tau_snapshot, 0);
if (tau_ns == snap)
return;
if (!__sync_bool_compare_and_swap(&last_tau_snapshot, snap, tau_ns))
return;
u64 v;
v = scale_tau(tau_ns, K_SOJOURN_INTERVAL);
if (v < 2000000ULL) v = 2000000ULL;
if (v > 12000000ULL) v = 12000000ULL;
sojourn_interval_ns = v;
v = scale_tau(tau_ns, K_OVERFLOW_RESCUE);
if (v < 4000000ULL) v = 4000000ULL;
if (v > 10000000ULL) v = 10000000ULL;
overflow_sojourn_rescue_ns = v;
v = scale_tau(tau_ns, K_STARVATION_RESCUE);
if (v < 20000000ULL) v = 20000000ULL;
if (v > 500000000ULL) v = 500000000ULL;
starvation_rescue_ns = v;
v = scale_tau(tau_ns, K_CODEL_FLOOR);
if (v < 200000ULL) v = 200000ULL;
if (v > 800000ULL) v = 800000ULL;
codel_target_floor_ns = v;
v = scale_tau(tau_ns, K_LONGRUN);
if (v < 500000000ULL) v = 500000000ULL; if (v > 8000000000ULL) v = 8000000000ULL; longrun_thresh_ns = v;
v = scale_tau(tau_ns, K_CODEL_MAX);
if (v < 1000000ULL) v = 1000000ULL; if (v > 8000000ULL) v = 8000000ULL; codel_target_max_ns = v;
if (codel_eq_ns >= 200000ULL && codel_eq_ns <= 8000000ULL) {
u64 eq = codel_eq_ns;
if (eq < codel_target_floor_ns) eq = codel_target_floor_ns;
if (eq > codel_target_max_ns) eq = codel_target_max_ns;
codel_target_equilibrium_ns = eq;
}
u32 pull = (u32)(tau_ns / K_OSC_PULL_THRESH_NS);
if (pull < 1) pull = 1;
if (pull > 4) pull = 4;
oscillator_pull_scale = pull;
u32 damp = (u32)(tau_ns / K_OSC_DAMP_THRESH_NS);
if (damp < 1) damp = 1;
if (damp > 5) damp = 5;
oscillator_damping_shift = damp;
oscillator_spring_shift = 2 * damp + 2;
oscillator_velocity_cap = (s64)((u64)OSC_VELOCITY_CAP_PER_PULL * (u64)pull);
v = scale_tau(tau_ns, K_VTIME_CEILING);
if (v < 16000000ULL) v = 16000000ULL; if (v > 160000000ULL) v = 160000000ULL; vtime_ceiling_window_ns = v;
pcpu_depth_base = (tau_ns >= 6000000ULL) ? 2 : 1;
longrun_preempt_shift = (tau_ns < 4000000ULL) ? 2 : 0;
{
u32 b = (u32)(K_SPILL_BUDGET / tau_ns);
u32 ceil = (nr_cpu_ids > 1) ? (u32)(nr_cpu_ids - 1) : 1;
if (ceil > MAX_AFFINITY_CANDIDATES) ceil = MAX_AFFINITY_CANDIDATES;
if (b < 6) b = 6;
if (b > ceil) b = ceil;
pcpu_spill_search_budget = b;
}
{
u32 b = (u32)(K_AFFINITY_SEARCH / tau_ns);
u32 ceil = (nr_cpu_ids > 1) ? (u32)(nr_cpu_ids - 1) : 1;
if (ceil > MAX_AFFINITY_CANDIDATES) ceil = MAX_AFFINITY_CANDIDATES;
if (b < 3) b = 3;
if (b > ceil) b = ceil;
affinity_search_online = b;
}
v = scale_tau(tau_ns, K_LAG_CAP);
if (v < 8000000ULL) v = 8000000ULL;
if (v > 80000000ULL) v = 80000000ULL;
lag_cap_ns = v;
}
static __always_inline void pcpu_drain_clear(u32 cpu)
{
if (cpu >= MAX_CPUS)
return;
if (scx_bpf_dsq_nr_queued((u64)cpu) != 0)
return;
u64 old = pcpu_enqueue_ns[cpu];
if (old > 0)
__sync_val_compare_and_swap(&pcpu_enqueue_ns[cpu], old, 0);
}
static __always_inline u32 lat_bucket(u64 lat_ns)
{
if (lat_ns <= 10000) return 0;
if (lat_ns <= 25000) return 1;
if (lat_ns <= 50000) return 2;
if (lat_ns <= 100000) return 3;
if (lat_ns <= 250000) return 4;
if (lat_ns <= 500000) return 5;
if (lat_ns <= 1000000) return 6;
if (lat_ns <= 2000000) return 7;
if (lat_ns <= 5000000) return 8;
if (lat_ns <= 10000000) return 9;
if (lat_ns <= 20000000) return 10;
return 11;
}
static __always_inline u32 sleep_bucket(u64 sleep_ns)
{
if (sleep_ns <= 1000000) return 0;
if (sleep_ns <= 10000000) return 1;
if (sleep_ns <= 100000000) return 2;
return 3;
}
static __always_inline u64 calc_avg(u64 old_val, u64 new_val, u32 age)
{
if (age < EWMA_AGE_MATURE)
return (old_val >> 1) + (new_val >> 1);
return old_val - (old_val >> 3) + (new_val >> 3);
}
static __always_inline u64 update_freq(u64 freq, u64 interval_ns, u32 age)
{
if (interval_ns == 0)
interval_ns = 1;
u64 new_freq = (100ULL * 1000000ULL) / interval_ns;
return calc_avg(freq, new_freq, age);
}
static __always_inline u64 compute_lat_cri(u64 wakeup_freq, u64 csw_rate,
u64 avg_runtime_ns,
u64 runtime_dev_ns)
{
u64 effective_runtime_ns = avg_runtime_ns + (runtime_dev_ns >> 1);
u64 avg_runtime_ms = effective_runtime_ns >> 20;
if (avg_runtime_ms == 0)
avg_runtime_ms = 1;
u64 score = (wakeup_freq * csw_rate) / avg_runtime_ms;
if (score > LAT_CRI_CAP)
score = LAT_CRI_CAP;
return score;
}
static __always_inline u32 classify_tier(u64 lat_cri,
const struct tuning_knobs *knobs)
{
u64 thresh_high = knobs ? knobs->lat_cri_thresh_high : LAT_CRI_THRESH_HIGH;
u64 thresh_low = knobs ? knobs->lat_cri_thresh_low : LAT_CRI_THRESH_LOW;
if (lat_cri >= thresh_high)
return TIER_LAT_CRITICAL;
if (lat_cri >= thresh_low)
return TIER_INTERACTIVE;
return TIER_BATCH;
}
static __always_inline bool is_compositor(const struct task_struct *p)
{
char key[16] = {};
unsigned int i;
for (i = 0; i < 15 && p->comm[i]; i++)
key[i] = p->comm[i];
return bpf_map_lookup_elem(&compositor_map, key) != NULL;
}
#if TRACE_SCHED
static __always_inline bool is_sched_task(const struct task_struct *p)
{
return p->comm[0] == 'p' && p->comm[1] == 'a' &&
p->comm[2] == 'n' && p->comm[3] == 'd';
}
#endif
static __always_inline u64 effective_weight(const struct task_struct *p,
const struct task_ctx *tctx)
{
u64 weight = p->scx.weight;
u64 behavioral;
if (tctx->tier == TIER_LAT_CRITICAL)
behavioral = WEIGHT_LAT_CRITICAL;
else if (tctx->tier == TIER_INTERACTIVE)
behavioral = WEIGHT_INTERACTIVE;
else
behavioral = WEIGHT_BATCH;
return weight * behavioral >> 7;
}
static __always_inline u64 task_deadline(struct task_struct *p,
struct task_ctx *tctx,
u64 dsq_id,
const struct tuning_knobs *knobs)
{
u64 knob_scale = knobs ? knobs->lag_scale : 4;
u64 lag_scale = (tctx->wakeup_freq * knob_scale) >> 2;
if (lag_scale < 1)
lag_scale = 1;
if (lag_scale > MAX_WAKEUP_FREQ)
lag_scale = MAX_WAKEUP_FREQ;
u64 nr_queued = scx_bpf_dsq_nr_queued(dsq_id);
if (nr_queued > 8)
lag_scale = 1;
else if (nr_queued > 4 && lag_scale > 2)
lag_scale >>= 1;
u64 vtime_floor = vtime_now - lag_cap_ns * lag_scale;
if (time_before(p->scx.dsq_vtime, vtime_floor))
p->scx.dsq_vtime = vtime_floor;
u64 awake_cap;
if (tctx->tier == TIER_LAT_CRITICAL)
awake_cap = lag_cap_ns >> 1;
else if (tctx->tier == TIER_INTERACTIVE)
awake_cap = (lag_cap_ns * 3) >> 2;
else
awake_cap = lag_cap_ns;
if (tctx->awake_vtime > awake_cap)
tctx->awake_vtime = awake_cap;
u64 dl = p->scx.dsq_vtime + tctx->awake_vtime;
u64 vtime_ceiling = vtime_now + vtime_ceiling_window_ns;
if (time_after(dl, vtime_ceiling))
dl = vtime_ceiling;
return dl;
}
static __always_inline u64 task_slice(const struct task_ctx *tctx,
const struct tuning_knobs *knobs)
{
u64 base_slice = knobs ? (longrun_mode
? knobs->burst_slice_ns : knobs->slice_ns) : 1000000;
u64 base;
if (tctx->tier == TIER_LAT_CRITICAL) {
base = tctx->avg_runtime + (tctx->avg_runtime >> 1);
if (base > base_slice)
base = base_slice;
if (base < SLICE_MIN_NS)
base = SLICE_MIN_NS;
return base;
}
if (tctx->tier == TIER_INTERACTIVE) {
base = tctx->avg_runtime << 1;
if (base > base_slice)
base = base_slice;
if (base < SLICE_MIN_NS)
base = SLICE_MIN_NS;
return base;
}
u64 batch_ceil = knobs ? knobs->batch_slice_ns : 20000000;
if (batch_ceil < SLICE_MIN_NS)
batch_ceil = SLICE_MIN_NS;
base = batch_ceil * tctx->cached_weight >> 7;
if (base > batch_ceil)
base = batch_ceil;
if (base < SLICE_MIN_NS)
base = SLICE_MIN_NS;
return base;
}
s32 BPF_STRUCT_OPS(pandemonium_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
bool is_idle = false;
if (wake_flags & SCX_WAKE_SYNC) {
struct task_struct *waker =
(struct task_struct *)bpf_get_current_task_btf();
if (waker) {
u32 wflips = BPF_CORE_READ(waker, wakee_flips);
u32 pflips = p->wakee_flips;
u32 thresh = nr_cpu_ids;
if (wflips <= thresh && pflips <= thresh) {
s32 waker_cpu = bpf_get_smp_processor_id();
if ((u64)waker_cpu >= nr_cpu_ids)
goto normal_path;
s32 target = find_idle_by_affinity(waker_cpu, p->cpus_ptr);
if (target >= 0) {
struct task_ctx *tctx = lookup_task_ctx(p);
struct tuning_knobs *knobs = get_knobs();
u64 sl = tctx ? task_slice(tctx, knobs)
: 1000000;
s32 dst_cpu;
u64 dst_dsq = pick_pcpu_dsq_with_spill(target, p->cpus_ptr, &dst_cpu);
u64 dl = tctx ? task_deadline(p, tctx,
dst_dsq, knobs) : vtime_now;
scx_bpf_dsq_insert_vtime(p,
dst_dsq, sl, dl, 0);
if (tctx) {
tctx->dispatch_path = 0;
tctx->enqueue_at = bpf_ktime_get_ns();
}
struct pandemonium_stats *s = get_stats();
if (s) {
s->nr_idle_hits += 1;
s->nr_dispatches += 1;
}
return dst_cpu;
}
if (!pcpu_dsq_is_stalled(
(u32)waker_cpu, bpf_ktime_get_ns())) {
struct task_ctx *tctx = lookup_task_ctx(p);
struct tuning_knobs *knobs = get_knobs();
u64 sl = tctx ? task_slice(tctx, knobs)
: 1000000;
s32 dst_cpu;
u64 dst_dsq = pick_pcpu_dsq_with_spill(waker_cpu, p->cpus_ptr, &dst_cpu);
u64 dl = tctx ? task_deadline(p, tctx,
dst_dsq, knobs) : vtime_now;
scx_bpf_dsq_insert_vtime(p,
dst_dsq, sl, dl, 0);
if (tctx) {
tctx->dispatch_path = 0;
tctx->enqueue_at = bpf_ktime_get_ns();
}
struct pandemonium_stats *s = get_stats();
if (s) {
s->nr_idle_hits += 1;
s->nr_dispatches += 1;
}
return dst_cpu;
}
}
}
}
normal_path:;
s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
s32 dst_cpu = cpu;
if (is_idle) {
struct task_ctx *tctx = lookup_task_ctx(p);
struct tuning_knobs *knobs = get_knobs();
u64 sl = tctx ? task_slice(tctx, knobs) : 1000000;
u64 dst_dsq = pick_pcpu_dsq_with_spill(cpu, p->cpus_ptr, &dst_cpu);
u64 dl = tctx ? task_deadline(p, tctx, dst_dsq, knobs)
: vtime_now;
scx_bpf_dsq_insert_vtime(p, dst_dsq, sl, dl, 0);
scx_bpf_kick_cpu(dst_cpu, SCX_KICK_IDLE);
if (tctx) {
tctx->dispatch_path = 0;
tctx->enqueue_at = bpf_ktime_get_ns();
}
struct pandemonium_stats *s = get_stats();
if (s) {
s->nr_idle_hits += 1;
s->nr_dispatches += 1;
if (tctx)
count_l2_affinity(s, tctx, dst_cpu);
}
#if TRACE_SCHED
if (is_sched_task(p))
bpf_printk("PAND: select_cpu pid=%d cpu=%d", p->pid, dst_cpu);
#endif
}
return dst_cpu;
}
void BPF_STRUCT_OPS(pandemonium_enqueue, struct task_struct *p,
u64 enq_flags)
{
s32 node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p));
if (node < 0 || (u32)node >= nr_nodes) node = 0;
u64 node_dsq = nr_cpu_ids + (u64)node;
struct task_ctx *tctx = lookup_task_ctx(p);
struct tuning_knobs *knobs = get_knobs();
u64 sl = tctx ? task_slice(tctx, knobs) : 1000000;
u64 dl;
bool is_wakeup = tctx && tctx->awake_vtime == 0;
s32 cpu = -1;
if (knobs && knobs->affinity_mode > 0 && tctx &&
tctx->tier != TIER_LAT_CRITICAL &&
!(p->flags & PF_KTHREAD)) {
cpu = find_idle_l2_sibling(tctx, p->cpus_ptr);
}
if (cpu < 0)
cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node, 0);
if (cpu >= 0 && (u64)cpu < nr_cpu_ids) {
dl = tctx ? task_deadline(p, tctx, node_dsq, knobs)
: vtime_now;
scx_bpf_dsq_insert_vtime(p, node_dsq, sl, dl, enq_flags);
__sync_val_compare_and_swap(&interactive_enqueue_ns, 0,
bpf_ktime_get_ns());
u64 kick_flag = (tctx && tctx->tier != TIER_BATCH)
? SCX_KICK_PREEMPT : SCX_KICK_IDLE;
scx_bpf_kick_cpu(cpu, kick_flag);
if (tctx) {
tctx->dispatch_path = 0;
tctx->enqueue_at = bpf_ktime_get_ns();
}
struct pandemonium_stats *s = get_stats();
if (s) {
s->nr_shared += 1;
s->nr_dispatches += 1;
if (is_wakeup)
s->nr_enq_wakeup += 1;
else
s->nr_enq_requeue += 1;
if (tctx)
count_l2_affinity(s, tctx, cpu);
}
#if TRACE_SCHED
if (is_sched_task(p))
bpf_printk("PAND: enq tier1 pid=%d cpu=%d", p->pid, cpu);
#endif
return;
}
if (tctx &&
(tctx->tier == TIER_LAT_CRITICAL || is_wakeup)) {
cpu = __COMPAT_scx_bpf_pick_any_cpu_node(
p->cpus_ptr, node, 0);
if (cpu >= 0 && (u64)cpu < nr_cpu_ids &&
__COMPAT_scx_bpf_cpu_curr(cpu)) {
s32 t2_cpu;
u64 tier2_dsq = pick_pcpu_dsq_with_spill(cpu, p->cpus_ptr, &t2_cpu);
dl = task_deadline(p, tctx, tier2_dsq, knobs);
scx_bpf_dsq_insert_vtime(p, tier2_dsq, sl, dl,
enq_flags);
u64 kick_flag = (is_wakeup ||
tctx->tier == TIER_LAT_CRITICAL)
? SCX_KICK_PREEMPT : SCX_KICK_IDLE;
scx_bpf_kick_cpu(t2_cpu, kick_flag);
if (tier2_dsq >= nr_cpu_ids)
arm_interactive_waiting(tctx);
tctx->dispatch_path = 1;
tctx->enqueue_at = bpf_ktime_get_ns();
struct pandemonium_stats *s = get_stats();
if (s) {
s->nr_shared += 1;
s->nr_dispatches += 1;
s->nr_hard_kicks += 1;
if (is_wakeup)
s->nr_enq_wakeup += 1;
else
s->nr_enq_requeue += 1;
}
#if TRACE_SCHED
if (is_sched_task(p))
bpf_printk("PAND: enq tier2 pid=%d cpu=%d dsq=%llu", p->pid, cpu, tier2_dsq);
#endif
return;
}
}
u64 target_dsq = (tctx && tctx->tier == TIER_BATCH)
? (nr_cpu_ids + nr_nodes + (u64)node)
: node_dsq;
if (target_dsq != node_dsq)
__sync_val_compare_and_swap(&batch_enqueue_ns, 0, bpf_ktime_get_ns());
if (target_dsq == node_dsq)
__sync_val_compare_and_swap(&interactive_enqueue_ns, 0, bpf_ktime_get_ns());
dl = tctx ? task_deadline(p, tctx, target_dsq, knobs) : vtime_now;
scx_bpf_dsq_insert_vtime(p, target_dsq, sl, dl, enq_flags);
if (tctx)
tctx->enqueue_at = bpf_ktime_get_ns();
#if TRACE_SCHED
if (is_sched_task(p))
bpf_printk("PAND: enq tier3 pid=%d dsq=%llu tier=%d", p->pid, target_dsq, tctx ? tctx->tier : -1);
#endif
arm_interactive_waiting(tctx);
u64 kick_flags = is_wakeup ? SCX_KICK_PREEMPT : 0;
scx_bpf_kick_cpu(scx_bpf_task_cpu(p), kick_flags);
if (tctx)
tctx->dispatch_path = is_wakeup ? 1 : 2;
struct pandemonium_stats *s = get_stats();
if (s) {
s->nr_shared += 1;
if (is_wakeup) {
s->nr_enq_wakeup += 1;
s->nr_hard_kicks += 1;
} else {
s->nr_enq_requeue += 1;
s->nr_soft_kicks += 1;
}
}
}
void BPF_STRUCT_OPS(pandemonium_dispatch, s32 cpu, struct task_struct *prev)
{
s32 node = __COMPAT_scx_bpf_cpu_node(cpu);
if (node < 0 || (u32)node >= nr_nodes) node = 0;
u64 node_dsq = nr_cpu_ids + (u64)node;
u64 batch_dsq = nr_cpu_ids + nr_nodes + (u64)node;
struct pandemonium_stats *s;
u64 now = bpf_ktime_get_ns();
if ((u64)cpu < nr_cpu_ids &&
scx_bpf_dsq_move_to_local((u64)cpu, 0)) {
pcpu_drain_clear((u32)cpu);
s = get_stats();
if (s)
s->nr_dispatches += 1;
if (sojourn_gate_pass(now))
return;
}
{
u32 my_cpu = (u32)cpu;
u32 base = my_cpu * MAX_AFFINITY_CANDIDATES;
u32 checked = 0;
for (int i = 0; i < MAX_AFFINITY_CANDIDATES; i++) {
u32 key = base + (u32)i;
u32 *val = bpf_map_lookup_elem(&affinity_rank, &key);
if (!val || *val == (u32)-1)
break;
u32 peer = *val;
if (peer >= nr_cpu_ids)
continue;
if (peer == my_cpu) {
if (++checked >= pcpu_spill_search_budget)
break;
continue;
}
if (scx_bpf_dsq_move_to_local((u64)peer, 0)) {
pcpu_drain_clear(peer);
s = get_stats();
if (s)
s->nr_dispatches += 1;
if (sojourn_gate_pass(now))
return;
break;
}
if (++checked >= pcpu_spill_search_budget)
break;
}
}
if (try_service_older_overflow(now, node_dsq, batch_dsq,
starvation_rescue_ns, false))
return;
if (try_service_older_overflow(now, node_dsq, batch_dsq,
overflow_sojourn_rescue_ns, true))
return;
if (scx_bpf_dsq_move_to_local(node_dsq, 0)) {
if (scx_bpf_dsq_nr_queued(node_dsq) == 0) {
u64 old_iens = interactive_enqueue_ns;
if (old_iens > 0)
__sync_val_compare_and_swap(&interactive_enqueue_ns, old_iens, 0);
}
s = get_stats();
if (s)
s->nr_dispatches += 1;
return;
}
if (scx_bpf_dsq_move_to_local(batch_dsq, 0)) {
if (scx_bpf_dsq_nr_queued(batch_dsq) == 0) {
u64 old_bens = batch_enqueue_ns;
if (old_bens > 0)
__sync_val_compare_and_swap(&batch_enqueue_ns, old_bens, 0);
}
s = get_stats();
if (s)
s->nr_dispatches += 1;
return;
}
for (u32 n = 0; n < nr_nodes && n < MAX_NODES; n++) {
if (n != (u32)node) {
if (scx_bpf_dsq_move_to_local(nr_cpu_ids + (u64)n, 0)) {
s = get_stats();
if (s)
s->nr_dispatches += 1;
return;
}
if (scx_bpf_dsq_move_to_local(nr_cpu_ids + nr_nodes + (u64)n, 0)) {
s = get_stats();
if (s)
s->nr_dispatches += 1;
return;
}
}
}
if (prev && !(prev->flags & PF_EXITING) &&
(prev->scx.flags & SCX_TASK_QUEUED)) {
struct task_ctx *tctx = lookup_task_ctx(prev);
struct tuning_knobs *knobs = get_knobs();
prev->scx.slice = tctx ? task_slice(tctx, knobs) :
(knobs ? knobs->slice_ns : 1000000);
s = get_stats();
if (s) {
s->nr_keep_running += 1;
s->nr_dispatches += 1;
}
}
}
void BPF_STRUCT_OPS(pandemonium_runnable, struct task_struct *p,
u64 enq_flags)
{
struct task_ctx *tctx = lookup_task_ctx(p);
if (!tctx)
return;
u64 now = bpf_ktime_get_ns();
tctx->awake_vtime = 0;
if (tctx->ewma_age < 2) {
tctx->last_woke_at = now;
tctx->prev_nvcsw = p->nvcsw;
tctx->ewma_age += 1;
return;
}
u64 delta_t = now > tctx->last_woke_at ? now - tctx->last_woke_at : 1;
tctx->wakeup_freq = update_freq(tctx->wakeup_freq, delta_t,
tctx->ewma_age);
if (tctx->wakeup_freq > MAX_WAKEUP_FREQ)
tctx->wakeup_freq = MAX_WAKEUP_FREQ;
tctx->last_woke_at = now;
if (tctx->ewma_age < EWMA_AGE_CAP)
tctx->ewma_age += 1;
u64 nvcsw = p->nvcsw;
u64 csw_delta = nvcsw > tctx->prev_nvcsw ? nvcsw - tctx->prev_nvcsw : 0;
tctx->prev_nvcsw = nvcsw;
if (csw_delta > 0 && delta_t > 0) {
u64 csw_freq = csw_delta * (100ULL * 1000000ULL) / delta_t;
tctx->csw_rate = calc_avg(tctx->csw_rate, csw_freq,
tctx->ewma_age);
} else {
tctx->csw_rate = calc_avg(tctx->csw_rate, 0, tctx->ewma_age);
}
if (tctx->csw_rate > MAX_CSW_RATE)
tctx->csw_rate = MAX_CSW_RATE;
tctx->lat_cri = compute_lat_cri(tctx->wakeup_freq, tctx->csw_rate,
tctx->avg_runtime, tctx->runtime_dev);
struct tuning_knobs *knobs = get_knobs();
u32 new_tier = classify_tier(tctx->lat_cri, knobs);
if (new_tier != TIER_LAT_CRITICAL && is_compositor(p))
new_tier = TIER_LAT_CRITICAL;
if (p->flags & PF_KTHREAD &&
p->static_prio <= KTHREAD_HIPRI_STATIC_PRIO_MAX)
new_tier = TIER_BATCH;
if (new_tier == TIER_BATCH && (p->flags & PF_WQ_WORKER))
new_tier = TIER_INTERACTIVE;
tctx->tier = new_tier;
}
void BPF_STRUCT_OPS(pandemonium_running, struct task_struct *p)
{
#if TRACE_SCHED
if (is_sched_task(p))
bpf_printk("PAND: running pid=%d cpu=%d", p->pid, bpf_get_smp_processor_id());
#endif
u64 cur = vtime_now;
if (time_before(cur, p->scx.dsq_vtime))
__sync_bool_compare_and_swap(&vtime_now, cur, p->scx.dsq_vtime);
struct task_ctx *tctx = lookup_task_ctx(p);
if (!tctx) {
struct tuning_knobs *knobs = get_knobs();
p->scx.slice = knobs ? knobs->slice_ns : 1000000;
return;
}
u64 now = bpf_ktime_get_ns();
tctx->last_run_at = now;
if (tctx->enqueue_at > 0 && now > tctx->enqueue_at) {
u64 sojourn = now - tctx->enqueue_at;
u32 cpu = bpf_get_smp_processor_id();
update_pcpu_sojourn(cpu, sojourn);
tctx->enqueue_at = 0;
}
if (tctx->last_woke_at && now > tctx->last_woke_at) {
u64 wake_lat = now - tctx->last_woke_at;
u8 path = tctx->dispatch_path;
u64 sleep_dur = 0;
if (tctx->sleep_start_ns > 0 &&
tctx->last_woke_at > tctx->sleep_start_ns) {
sleep_dur = tctx->last_woke_at - tctx->sleep_start_ns;
tctx->sleep_start_ns = 0;
}
tctx->last_woke_at = 0;
struct pandemonium_stats *s = get_stats();
if (s) {
s->wake_lat_samples += 1;
s->wake_lat_sum += wake_lat;
if (path == 0) {
s->wake_lat_idle_sum += wake_lat;
s->wake_lat_idle_cnt += 1;
} else if (path == 1) {
s->wake_lat_kick_sum += wake_lat;
s->wake_lat_kick_cnt += 1;
}
}
u32 tier_idx = (u32)tctx->tier;
if (tier_idx > 2) tier_idx = 2;
u32 bucket = lat_bucket(wake_lat);
u32 hist_key = tier_idx * 12 + bucket;
u64 *hist_val = bpf_map_lookup_elem(&wake_lat_hist, &hist_key);
if (hist_val)
*hist_val += 1;
if (sleep_dur > 0) {
u32 sbucket = sleep_bucket(sleep_dur);
u64 *sval = bpf_map_lookup_elem(&sleep_hist, &sbucket);
if (sval)
*sval += 1;
}
}
struct tuning_knobs *knobs = get_knobs();
p->scx.slice = task_slice(tctx, knobs);
}
void BPF_STRUCT_OPS(pandemonium_stopping, struct task_struct *p,
bool runnable)
{
struct task_ctx *tctx = lookup_task_ctx(p);
if (!tctx)
return;
tctx->cached_weight = effective_weight(p, tctx);
tctx->last_cpu = bpf_get_smp_processor_id();
u64 weight = tctx->cached_weight;
u64 now = bpf_ktime_get_ns();
u64 slice = now > tctx->last_run_at ? now - tctx->last_run_at : 0;
{
u64 avg = tctx->avg_runtime;
u64 diff = slice > avg ? slice - avg : avg - slice;
tctx->avg_runtime = calc_avg(avg, slice, tctx->ewma_age);
tctx->runtime_dev = calc_avg(tctx->runtime_dev, diff,
tctx->ewma_age);
}
{
struct tuning_knobs *kk = get_knobs();
u64 slice_cap = kk ? kk->slice_ns : 1000000;
if (tctx->tier == TIER_INTERACTIVE &&
tctx->avg_runtime * 4 >= slice_cap * 3 &&
tctx->ewma_age <= 4) {
tctx->tier = TIER_BATCH;
tctx->cached_weight = effective_weight(p, tctx);
}
}
if (tctx->ewma_age == EWMA_AGE_MATURE ||
(tctx->ewma_age > EWMA_AGE_MATURE && tctx->ewma_age % 64 == 0)) {
struct task_class_entry obs = {};
obs.tier = (u8)tctx->tier;
obs.avg_runtime = tctx->avg_runtime;
obs.runtime_dev = tctx->runtime_dev;
obs.wakeup_freq = tctx->wakeup_freq;
obs.csw_rate = tctx->csw_rate;
char key[16];
__builtin_memcpy(key, p->comm, 16);
bpf_map_update_elem(&task_class_observe, key, &obs, BPF_ANY);
}
u64 delta_vtime;
if (weight > 0)
delta_vtime = (slice << 7) / weight;
else
delta_vtime = slice;
p->scx.dsq_vtime += delta_vtime;
tctx->awake_vtime += delta_vtime;
}
void BPF_STRUCT_OPS(pandemonium_tick, struct task_struct *p)
{
struct pandemonium_stats *s = get_stats();
struct tuning_knobs *knobs = get_knobs();
if (bpf_get_smp_processor_id() == 0) {
apply_tau_scaling(knobs ? knobs->topology_tau_ns : 0,
knobs ? knobs->codel_eq_ns : 0);
{
u64 cur = __sync_fetch_and_add(&global_rescue_count, 0);
u64 delta = cur - prev_rescue_snapshot;
prev_rescue_snapshot = cur;
s64 impulse = 0;
if (delta > 0) {
u64 capped = delta > 8 ? 8 : delta;
impulse = -((s64)(capped * OSCILLATOR_PULL_NS *
oscillator_pull_scale));
}
oscillator_velocity_ns += impulse;
s64 disp = (s64)codel_target_ns -
(s64)codel_target_equilibrium_ns;
oscillator_velocity_ns -= disp >> oscillator_spring_shift;
oscillator_velocity_ns -= oscillator_velocity_ns >>
oscillator_damping_shift;
if (oscillator_velocity_ns > oscillator_velocity_cap)
oscillator_velocity_ns = oscillator_velocity_cap;
if (oscillator_velocity_ns < -oscillator_velocity_cap)
oscillator_velocity_ns = -oscillator_velocity_cap;
s64 nc = (s64)codel_target_ns + oscillator_velocity_ns;
if (nc < (s64)codel_target_floor_ns)
nc = (s64)codel_target_floor_ns;
if (nc > (s64)codel_target_max_ns)
nc = (s64)codel_target_max_ns;
codel_target_ns = (u64)nc;
}
}
if (s) {
s->longrun_mode_active = longrun_mode ? 1 : 0;
}
u64 bens = batch_enqueue_ns;
if (bens > 0) {
u64 now = bpf_ktime_get_ns();
u64 sojourn = now - bens;
if (s)
s->batch_sojourn_ns = sojourn;
if (bpf_get_smp_processor_id() == 0)
longrun_mode = sojourn > longrun_thresh_ns;
u64 sojourn_thresh = knobs ? knobs->sojourn_thresh_ns : 5000000;
if (sojourn > sojourn_thresh) {
struct task_ctx *tctx = lookup_task_ctx(p);
if (tctx && tctx->tier == TIER_BATCH) {
scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_PREEMPT);
return;
}
}
} else {
if (bpf_get_smp_processor_id() == 0)
longrun_mode = false;
if (s)
s->batch_sojourn_ns = 0;
}
{
u32 this_cpu = bpf_get_smp_processor_id();
u64 now2 = bpf_ktime_get_ns();
u64 pcpu_sojourn_thresh = knobs
? knobs->sojourn_thresh_ns : 5000000;
if (this_cpu < MAX_CPUS) {
u64 pcpu_oldest = pcpu_enqueue_ns[this_cpu];
if (pcpu_oldest > 0 &&
(now2 - pcpu_oldest) > pcpu_sojourn_thresh) {
scx_bpf_kick_cpu(this_cpu,
SCX_KICK_PREEMPT);
return;
}
}
if (nr_cpu_ids > 0) {
u32 nr = nr_cpu_ids;
if (nr <= 4) {
for (u32 i = 0; i < 4; i++) {
if (i >= nr)
break;
u32 scan_cpu = i;
if (scan_cpu == this_cpu)
continue;
u64 remote_stamp = pcpu_enqueue_ns[
scan_cpu & (MAX_CPUS - 1)];
if (remote_stamp > 0 &&
(now2 - remote_stamp) > pcpu_sojourn_thresh)
scx_bpf_kick_cpu(scan_cpu,
SCX_KICK_PREEMPT);
}
} else {
u32 scan_base = (u32)(now2 >> 20);
for (int i = 0; i < 4; i++) {
u32 scan_cpu =
(scan_base + (u32)i) % nr;
if (scan_cpu == this_cpu)
continue;
u64 remote_stamp = pcpu_enqueue_ns[
scan_cpu & (MAX_CPUS - 1)];
if (remote_stamp > 0 &&
(now2 - remote_stamp) > pcpu_sojourn_thresh)
scx_bpf_kick_cpu(scan_cpu,
SCX_KICK_PREEMPT);
}
}
}
}
if (!interactive_waiting)
return;
struct task_ctx *tctx = lookup_task_ctx(p);
if (!tctx)
return;
u64 base_thresh = knobs ? knobs->preempt_thresh_ns : 1000000;
u64 thresh = longrun_mode ? (base_thresh << longrun_preempt_shift)
: base_thresh;
if (latcrit_waiting)
thresh >>= 2;
u64 on_cpu = tctx->last_run_at > 0
? bpf_ktime_get_ns() - tctx->last_run_at : 0;
bool preemptible = tctx->tier == TIER_BATCH ||
(latcrit_waiting && tctx->tier == TIER_INTERACTIVE);
if (preemptible && on_cpu >= thresh) {
scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_PREEMPT);
interactive_waiting = false;
latcrit_waiting = false;
if (!s)
s = get_stats();
if (s)
s->nr_preempt += 1;
}
}
void BPF_STRUCT_OPS(pandemonium_enable, struct task_struct *p)
{
p->scx.dsq_vtime = vtime_now + vtime_ceiling_window_ns;
struct task_ctx *tctx = ensure_task_ctx(p);
if (tctx) {
tctx->awake_vtime = 0;
tctx->last_run_at = 0;
tctx->wakeup_freq = 20;
tctx->last_woke_at = bpf_ktime_get_ns();
tctx->avg_runtime = 100000;
tctx->cached_weight = WEIGHT_INTERACTIVE;
tctx->prev_nvcsw = p->nvcsw;
tctx->csw_rate = 0;
tctx->lat_cri = 0;
tctx->tier = TIER_INTERACTIVE;
tctx->ewma_age = 0;
tctx->dispatch_path = 0;
char key[16];
__builtin_memcpy(key, p->comm, 16);
struct task_class_entry *init_entry =
bpf_map_lookup_elem(&task_class_init, key);
if (init_entry) {
tctx->tier = (u32)init_entry->tier;
tctx->avg_runtime = init_entry->avg_runtime;
tctx->runtime_dev = init_entry->runtime_dev;
tctx->wakeup_freq = init_entry->wakeup_freq;
tctx->csw_rate = init_entry->csw_rate;
tctx->cached_weight = effective_weight(p, tctx);
}
}
}
s32 BPF_STRUCT_OPS_SLEEPABLE(pandemonium_init)
{
u32 zero = 0;
nr_nodes = __COMPAT_scx_bpf_nr_node_ids();
if (nr_nodes < 1)
nr_nodes = 1;
if (nr_nodes > nr_cpu_ids)
nr_nodes = nr_cpu_ids;
for (u32 i = 0; i < nr_cpu_ids && i < MAX_CPUS; i++)
scx_bpf_create_dsq(i, -1);
for (u32 i = 0; i < nr_nodes && i < MAX_NODES; i++)
scx_bpf_create_dsq(nr_cpu_ids + i, (s32)i);
for (u32 i = 0; i < nr_nodes && i < MAX_NODES; i++)
scx_bpf_create_dsq(nr_cpu_ids + nr_nodes + i, (s32)i);
starvation_rescue_ns = 100000000ULL; overflow_sojourn_rescue_ns = 6000000ULL; sojourn_interval_ns = 4000000ULL; codel_target_floor_ns = 500000ULL; vtime_ceiling_window_ns = 80000000ULL; pcpu_depth_base = 2; pcpu_spill_search_budget = 6; affinity_search_online = 3; lag_cap_ns = 40000000ULL; longrun_preempt_shift = 0; oscillator_damping_shift = 3;
oscillator_spring_shift = 8; oscillator_pull_scale = 3;
oscillator_velocity_cap = (s64)((u64)OSC_VELOCITY_CAP_PER_PULL * 3);
codel_target_ns = codel_target_max_ns;
oscillator_velocity_ns = 0;
prev_rescue_snapshot = 0;
global_rescue_count = 0;
for (u32 i = 0; i < nr_cpu_ids && i < MAX_CPUS; i++) {
pcpu_min_sojourn_ns[i] = ~0ULL;
pcpu_stall_start_ns[i] = 0;
}
longrun_mode = false;
struct tuning_knobs *knobs = bpf_map_lookup_elem(&tuning_knobs_map, &zero);
if (knobs) {
knobs->slice_ns = 1000000;
knobs->preempt_thresh_ns = 1000000;
knobs->lag_scale = 4;
knobs->batch_slice_ns = 20000000; knobs->lat_cri_thresh_high = LAT_CRI_THRESH_HIGH; knobs->lat_cri_thresh_low = LAT_CRI_THRESH_LOW; knobs->affinity_mode = 0; knobs->sojourn_thresh_ns = 5000000; knobs->burst_slice_ns = 1000000; knobs->topology_tau_ns = 0; knobs->codel_eq_ns = 0; }
return 0;
}
void BPF_STRUCT_OPS(pandemonium_exit, struct scx_exit_info *ei)
{
UEI_RECORD(uei, ei);
}
void BPF_STRUCT_OPS(pandemonium_quiescent, struct task_struct *p,
u64 deq_flags)
{
struct task_ctx *tctx = lookup_task_ctx(p);
if (tctx)
tctx->sleep_start_ns = bpf_ktime_get_ns();
}
void BPF_STRUCT_OPS(pandemonium_cpu_release, s32 cpu,
struct scx_cpu_release_args *args)
{
scx_bpf_reenqueue_local();
}
void BPF_STRUCT_OPS(pandemonium_cpu_online, s32 cpu)
{
if ((u32)cpu < MAX_CPUS) {
__sync_lock_test_and_set(&pcpu_enqueue_ns[cpu], 0);
pcpu_min_sojourn_ns[cpu] = ~0ULL;
pcpu_stall_start_ns[cpu] = 0;
}
__sync_lock_test_and_set(&last_tau_snapshot, 0);
}
void BPF_STRUCT_OPS(pandemonium_cpu_offline, s32 cpu)
{
if ((u32)cpu < MAX_CPUS) {
__sync_lock_test_and_set(&pcpu_enqueue_ns[cpu], 0);
pcpu_min_sojourn_ns[cpu] = ~0ULL;
pcpu_stall_start_ns[cpu] = 0;
}
__sync_lock_test_and_set(&last_tau_snapshot, 0);
__sync_lock_test_and_set(&interactive_enqueue_ns, 0);
__sync_lock_test_and_set(&batch_enqueue_ns, 0);
__sync_lock_test_and_set(&global_rescue_count, 0);
prev_rescue_snapshot = 0;
oscillator_velocity_ns = 0;
}
SCX_OPS_DEFINE(pandemonium_ops,
.select_cpu = (void *)pandemonium_select_cpu,
.enqueue = (void *)pandemonium_enqueue,
.dispatch = (void *)pandemonium_dispatch,
.runnable = (void *)pandemonium_runnable,
.running = (void *)pandemonium_running,
.stopping = (void *)pandemonium_stopping,
.tick = (void *)pandemonium_tick,
.enable = (void *)pandemonium_enable,
.quiescent = (void *)pandemonium_quiescent,
.cpu_release = (void *)pandemonium_cpu_release,
.cpu_online = (void *)pandemonium_cpu_online,
.cpu_offline = (void *)pandemonium_cpu_offline,
.init = (void *)pandemonium_init,
.exit = (void *)pandemonium_exit,
.flags = SCX_OPS_BUILTIN_IDLE_PER_NODE,
.name = "pandemonium");