#ifdef LSP
#define __bpf__
#include "../../../../include/scx/common.bpf.h"
#else
#include <scx/common.bpf.h>
#endif
#include "ravg_impl.bpf.h"
#include "sdt_task.h"
#include "intf.h"
#include "types.h"
#include "lb_domain.h"
#include <bpf_arena_common.bpf.h>
#include <errno.h>
#include <stdbool.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
char _license[] SEC("license") = "GPL";
UEI_DEFINE(uei);
const volatile u32 nr_doms = 32;
const volatile u32 nr_nodes = 32;
const volatile u32 nr_cpu_ids = 64;
const volatile u32 cpu_dom_id_map[MAX_CPUS];
const volatile u32 dom_numa_id_map[MAX_DOMS];
const volatile u64 dom_cpumasks[MAX_DOMS][MAX_CPUS / 64];
const volatile u64 numa_cpumasks[MAX_NUMA_NODES][MAX_CPUS / 64];
const volatile u32 load_half_life = 1000000000 ;
const volatile bool kthreads_local;
const volatile bool fifo_sched = false;
const volatile bool direct_greedy_numa;
const volatile bool mempolicy_affinity;
const volatile u32 greedy_threshold;
const volatile u32 greedy_threshold_x_numa;
const volatile u32 rusty_perf_mode;
const volatile u32 debug;
volatile u64 slice_ns;
struct bpfmask_wrapper {
struct bpf_cpumask __kptr *instance;
};
struct rusty_percpu_storage {
struct bpf_cpumask __kptr *bpfmask;
};
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, struct rusty_percpu_storage);
__uint(max_entries, 1);
} scx_percpu_bpfmask_map SEC(".maps");
static s32 create_save_cpumask(struct bpf_cpumask **kptr)
{
struct bpf_cpumask *cpumask;
cpumask = bpf_cpumask_create();
if (!cpumask) {
scx_bpf_error("Failed to create cpumask");
return -ENOMEM;
}
cpumask = bpf_kptr_xchg(kptr, cpumask);
if (cpumask) {
scx_bpf_error("kptr already had cpumask");
bpf_cpumask_release(cpumask);
}
return 0;
}
static int scx_rusty_percpu_storage_init(void)
{
void *map = &scx_percpu_bpfmask_map;
struct rusty_percpu_storage *storage;
const u32 zero = 0;
int ret, i;
bpf_for (i, 0, nr_cpu_ids) {
storage = bpf_map_lookup_percpu_elem(map, &zero, i);
if (!storage) {
scx_bpf_error("Did not find map entry");
return -EINVAL;
}
ret = create_save_cpumask(&storage->bpfmask);
if (ret)
return ret;
}
return 0;
}
static struct bpf_cpumask *scx_percpu_bpfmask(void)
{
struct rusty_percpu_storage *storage;
void *map = &scx_percpu_bpfmask_map;
const u32 zero = 0;
storage = bpf_map_lookup_elem(map, &zero);
if (!storage) {
scx_bpf_error("Did not find map entry");
return NULL;
}
if (!storage->bpfmask)
scx_bpf_error("Did not properly initialize singleton");
return storage->bpfmask;
}
struct pcpu_ctx {
u32 dom_rr_cur;
u32 dom_id;
u32 pad[8];
} __attribute__((aligned(CACHELINE_SIZE)));
struct pcpu_ctx pcpu_ctx[MAX_CPUS];
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, struct node_ctx);
__uint(max_entries, MAX_NUMA_NODES);
__uint(map_flags, 0);
} node_data SEC(".maps");
struct lock_wrapper {
struct bpf_spin_lock lock;
};
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, struct lock_wrapper);
__uint(max_entries, MAX_DOMS * LB_LOAD_BUCKETS);
__uint(map_flags, 0);
} dom_dcycle_locks SEC(".maps");
const u64 ravg_1 = 1 << RAVG_FRAC_BITS;
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, struct task_struct *);
__type(value, struct bpfmask_wrapper);
__uint(max_entries, 1000000);
__uint(map_flags, 0);
} task_masks SEC(".maps");
static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
{
struct task_ctx __arena *taskc = sdt_task_data(p);
cast_kern(taskc);
return (struct task_ctx *)taskc;
}
static struct bpf_cpumask *lookup_task_bpfmask(struct task_struct *p)
{
struct bpfmask_wrapper *wrapper;
wrapper = bpf_map_lookup_elem(&task_masks, &p);
if (!wrapper) {
scx_bpf_error("lookup_mask_bpfmask failed for task %p", p);
return NULL;
}
return wrapper->instance;
}
static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
struct task_ctx *taskc;
taskc = try_lookup_task_ctx(p);
if (!taskc)
scx_bpf_error("task_ctx lookup failed for task %p", p);
return taskc;
}
static struct task_ctx *lookup_task_ctx_mask(struct task_struct *p, struct bpf_cpumask **p_cpumaskp)
{
struct task_ctx *taskc;
if (p_cpumaskp == NULL) {
scx_bpf_error("no mask pointer provided");
return NULL;
}
taskc = lookup_task_ctx(p);
if (!taskc)
scx_bpf_error("task_ctx lookup failed for task %p", p);
*p_cpumaskp = lookup_task_bpfmask(p);
if (*p_cpumaskp == NULL)
scx_bpf_error("task bpf_cpumask lookup failed for task %p", p);
return taskc;
}
static dom_ptr task_domain(struct task_ctx *taskc)
{
dom_ptr domc = taskc->domc;
cast_kern(domc);
return domc;
}
static struct pcpu_ctx *lookup_pcpu_ctx(s32 cpu)
{
struct pcpu_ctx *pcpuc;
pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]);
if (!pcpuc)
scx_bpf_error("Failed to lookup pcpu ctx for %d", cpu);
return pcpuc;
}
static inline u32 weight_to_bucket_idx(u32 weight)
{
return weight * LB_LOAD_BUCKETS / LB_MAX_WEIGHT;
}
static void task_load_adj(struct task_ctx *taskc,
u64 now, bool runnable)
{
taskc->runnable = runnable;
ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life);
}
static struct bucket_ctx *lookup_dom_bucket(dom_ptr dom_ctx,
u32 weight, u32 *bucket_id)
{
u32 idx = weight_to_bucket_idx(weight);
struct bucket_ctx *bucket;
*bucket_id = idx;
bucket = (struct bucket_ctx *)&dom_ctx->buckets[idx];
if (bucket)
return bucket;
scx_bpf_error("Failed to lookup dom bucket");
return NULL;
}
static struct lock_wrapper *lookup_dom_bkt_lock(u32 dom_id, u32 weight)
{
u32 idx = dom_id * LB_LOAD_BUCKETS + weight_to_bucket_idx(weight);
struct lock_wrapper *lockw;
lockw = bpf_map_lookup_elem(&dom_dcycle_locks, &idx);
if (lockw)
return lockw;
scx_bpf_error("Failed to lookup dom lock");
return NULL;
}
static u64 scale_up_fair(u64 value, u64 weight)
{
return value * weight / 100;
}
static u64 scale_inverse_fair(u64 value, u64 weight)
{
return value * 100 / weight;
}
static void dom_dcycle_adj(dom_ptr domc, u32 weight, u64 now, bool runnable)
{
struct bucket_ctx *bucket;
struct lock_wrapper *lockw;
s64 adj = runnable ? 1 : -1;
u32 bucket_idx = 0;
u32 dom_id;
cast_kern(domc);
dom_id = domc->id;
bucket = lookup_dom_bucket(domc, weight, &bucket_idx);
lockw = lookup_dom_bkt_lock(dom_id, weight);
if (!bucket || !lockw)
return;
bpf_spin_lock(&lockw->lock);
bucket->dcycle += adj;
ravg_accumulate(&bucket->rd, bucket->dcycle, now, load_half_life);
bpf_spin_unlock(&lockw->lock);
if (adj < 0 && (s64)bucket->dcycle < 0)
scx_bpf_error("cpu%d dom%u bucket%u load underflow (dcycle=%lld adj=%lld)",
bpf_get_smp_processor_id(), dom_id, bucket_idx,
bucket->dcycle, adj);
if (debug >=2 &&
(!domc->dbg_dcycle_printed_at || now - domc->dbg_dcycle_printed_at >= 1000000000)) {
bpf_printk("DCYCLE ADJ dom=%u bucket=%u adj=%lld dcycle=%u avg_dcycle=%llu",
dom_id, bucket_idx, adj, bucket->dcycle,
ravg_read(&bucket->rd, now, load_half_life) >> RAVG_FRAC_BITS);
domc->dbg_dcycle_printed_at = now;
}
}
static void dom_dcycle_xfer_task(struct task_struct *p, struct task_ctx *taskc,
dom_ptr from_domc,
dom_ptr to_domc, u64 now)
{
struct bucket_ctx *from_bucket, *to_bucket;
u32 idx = 0, weight = taskc->weight;
struct lock_wrapper *from_lockw, *to_lockw;
struct ravg_data task_dcyc_rd;
u64 from_dcycle[2], to_dcycle[2], task_dcycle;
from_lockw = lookup_dom_bkt_lock(from_domc->id, weight);
to_lockw = lookup_dom_bkt_lock(to_domc->id, weight);
if (!from_lockw || !to_lockw)
return;
from_bucket = lookup_dom_bucket(from_domc, weight, &idx);
to_bucket = lookup_dom_bucket(to_domc, weight, &idx);
if (!from_bucket || !to_bucket)
return;
ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life);
task_dcyc_rd = taskc->dcyc_rd;
if (debug >= 2)
task_dcycle = ravg_read(&task_dcyc_rd, now, load_half_life);
bpf_spin_lock(&from_lockw->lock);
if (taskc->runnable)
from_bucket->dcycle--;
if (debug >= 2)
from_dcycle[0] = ravg_read(&from_bucket->rd, now, load_half_life);
ravg_transfer(&from_bucket->rd, from_bucket->dcycle,
&task_dcyc_rd, taskc->runnable, load_half_life, false);
if (debug >= 2)
from_dcycle[1] = ravg_read(&from_bucket->rd, now, load_half_life);
bpf_spin_unlock(&from_lockw->lock);
bpf_spin_lock(&to_lockw->lock);
if (taskc->runnable)
to_bucket->dcycle++;
if (debug >= 2)
to_dcycle[0] = ravg_read(&to_bucket->rd, now, load_half_life);
ravg_transfer(&to_bucket->rd, to_bucket->dcycle,
&task_dcyc_rd, taskc->runnable, load_half_life, true);
if (debug >= 2)
to_dcycle[1] = ravg_read(&to_bucket->rd, now, load_half_life);
bpf_spin_unlock(&to_lockw->lock);
if (debug >= 2)
bpf_printk("XFER DCYCLE dom%u->%u task=%lu from=%lu->%lu to=%lu->%lu",
from_domc->id, to_domc->id,
task_dcycle >> RAVG_FRAC_BITS,
from_dcycle[0] >> RAVG_FRAC_BITS,
from_dcycle[1] >> RAVG_FRAC_BITS,
to_dcycle[0] >> RAVG_FRAC_BITS,
to_dcycle[1] >> RAVG_FRAC_BITS);
}
static u64 dom_min_vruntime(dom_ptr domc)
{
return READ_ONCE(domc->min_vruntime);
}
int dom_xfer_task(struct task_struct *p __arg_trusted, u32 new_dom_id, u64 now)
{
dom_ptr from_domc, to_domc;
struct task_ctx *taskc;
taskc = lookup_task_ctx(p);
if (!taskc)
return 0;
from_domc = task_domain(taskc);
to_domc = lookup_dom_ctx(new_dom_id);
if (!from_domc || !to_domc || !taskc)
return 0;
dom_dcycle_xfer_task(p, taskc, from_domc, to_domc, now);
return 0;
}
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(key_size, sizeof(u32));
__uint(value_size, sizeof(u64));
__uint(max_entries, RUSTY_NR_STATS);
} stats SEC(".maps");
static inline void stat_add(enum stat_idx idx, u64 addend)
{
u32 idx_v = idx;
u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v);
if (cnt_p)
(*cnt_p) += addend;
}
struct tune_input{
u64 genn;
u64 slice_ns;
u64 direct_greedy_cpumask[MAX_CPUS / 64];
u64 kick_greedy_cpumask[MAX_CPUS / 64];
} tune_input;
u64 tune_params_gen;
private(A) struct bpf_cpumask __kptr *all_cpumask;
private(A) struct bpf_cpumask __kptr *direct_greedy_cpumask;
private(A) struct bpf_cpumask __kptr *kick_greedy_cpumask;
static u32 cpu_to_dom_id(s32 cpu)
{
const volatile u32 *dom_idp;
dom_idp = MEMBER_VPTR(cpu_dom_id_map, [cpu]);
if (!dom_idp)
return MAX_DOMS;
return *dom_idp;
}
static inline bool is_offline_cpu(s32 cpu)
{
return cpu_to_dom_id(cpu) > MAX_DOMS;
}
static void refresh_tune_params(void)
{
s32 cpu;
if (tune_params_gen == tune_input.genn)
return;
tune_params_gen = tune_input.genn;
slice_ns = tune_input.slice_ns;
bpf_for(cpu, 0, nr_cpu_ids) {
u32 dom_id = cpu_to_dom_id(cpu);
struct lb_domain *lb_domain;
if (is_offline_cpu(cpu))
continue;
if (!(lb_domain = lb_domain_get(dom_id)))
return;
if (tune_input.direct_greedy_cpumask[cpu / 64] & (1LLU << (cpu % 64))) {
if (direct_greedy_cpumask)
bpf_cpumask_set_cpu(cpu, direct_greedy_cpumask);
if (lb_domain->direct_greedy_cpumask)
bpf_cpumask_set_cpu(cpu, lb_domain->direct_greedy_cpumask);
} else {
if (direct_greedy_cpumask)
bpf_cpumask_clear_cpu(cpu, direct_greedy_cpumask);
if (lb_domain->direct_greedy_cpumask)
bpf_cpumask_clear_cpu(cpu, lb_domain->direct_greedy_cpumask);
}
if (tune_input.kick_greedy_cpumask[cpu / 64] & (1LLU << (cpu % 64))) {
if (kick_greedy_cpumask)
bpf_cpumask_set_cpu(cpu, kick_greedy_cpumask);
} else {
if (kick_greedy_cpumask)
bpf_cpumask_clear_cpu(cpu, kick_greedy_cpumask);
}
}
}
static u64 min(u64 a, u64 b)
{
return a <= b ? a : b;
}
const int sched_prio_to_weight[DL_MAX_LAT_PRIO + 1] = {
88761, 71755, 56483, 46273, 36291,
29154, 23254, 18705, 14949, 11916,
9548, 7620, 6100, 4904, 3906,
3121, 2501, 1991, 1586, 1277,
1024, 820, 655, 526, 423,
335, 272, 215, 172, 137,
110, 87, 70, 56, 45,
36, 29, 23, 18, 15,
};
static __noinline u64 sched_prio_to_latency_weight(u64 prio)
{
if (prio >= DL_MAX_LAT_PRIO) {
scx_bpf_error("Invalid prio index");
return 0;
}
return sched_prio_to_weight[DL_MAX_LAT_PRIO - prio - 1];
}
static u64 task_compute_dl(struct task_struct *p, struct task_ctx *taskc,
u64 enq_flags)
{
u64 waker_freq, blocked_freq;
u64 lat_prio, lat_scale, avg_run_raw, avg_run;
u64 freq_factor;
waker_freq = min(taskc->waker_freq, DL_FREQ_FT_MAX);
blocked_freq = min(taskc->blocked_freq, DL_FREQ_FT_MAX);
freq_factor = blocked_freq * waker_freq * waker_freq;
freq_factor = scale_up_fair(freq_factor, p->scx.weight);
lat_prio = log2_u64(freq_factor + 1);
lat_prio = min(lat_prio, DL_MAX_LAT_PRIO);
avg_run_raw = taskc->avg_runtime / DL_RUNTIME_SCALE;
avg_run_raw = min(avg_run_raw, DL_MAX_LATENCY_NS);
avg_run_raw = scale_inverse_fair(avg_run_raw, p->scx.weight);
avg_run = log2_u64(avg_run_raw + 1);
if (avg_run < lat_prio) {
lat_prio -= avg_run;
} else {
lat_prio = 0;
}
lat_scale = sched_prio_to_latency_weight(lat_prio);
lat_scale = min(lat_scale, LB_MAX_WEIGHT);
return scale_inverse_fair(taskc->avg_runtime, lat_scale);
}
static void clamp_task_vtime(struct task_struct *p, struct task_ctx *taskc, u64 enq_flags)
{
u64 dom_vruntime, min_vruntime;
dom_ptr domc;
if (!(domc = task_domain(taskc)))
return;
dom_vruntime = dom_min_vruntime(domc);
min_vruntime = dom_vruntime - slice_ns;
if (time_before(p->scx.dsq_vtime, min_vruntime)) {
p->scx.dsq_vtime = min_vruntime;
taskc->deadline = p->scx.dsq_vtime + task_compute_dl(p, taskc, enq_flags);
stat_add(RUSTY_STAT_DL_CLAMP, 1);
} else {
stat_add(RUSTY_STAT_DL_PRESET, 1);
}
}
static bool task_set_domain(struct task_struct *p __arg_trusted,
u32 new_dom_id, bool init_dsq_vtime)
{
dom_ptr old_domc, new_domc;
struct bpf_cpumask *d_cpumask, *t_cpumask;
struct lb_domain *new_lb_domain;
struct task_ctx *taskc;
u32 old_dom_id;
taskc = lookup_task_ctx_mask(p, &t_cpumask);
if (!taskc || !t_cpumask) {
scx_bpf_error("Failed to look up task cpumask");
return false;
}
old_dom_id = taskc->target_dom;
old_domc = lookup_dom_ctx(old_dom_id);
if (!old_domc)
return false;
if (new_dom_id == NO_DOM_FOUND) {
bpf_cpumask_clear(t_cpumask);
return !(p->scx.flags & SCX_TASK_QUEUED);
}
new_domc = try_lookup_dom_ctx_arena(new_dom_id);
if (!new_domc)
return false;
new_lb_domain = lb_domain_get(new_dom_id);
if (!new_lb_domain) {
scx_bpf_error("no lb_domain for dom%d\n", new_dom_id);
return false;
}
d_cpumask = new_lb_domain->cpumask;
if (!d_cpumask) {
scx_bpf_error("Failed to get dom%u cpumask kptr",
new_dom_id);
return false;
}
if (bpf_cpumask_intersects(cast_mask(d_cpumask), p->cpus_ptr)) {
u64 now = scx_bpf_now();
if (!init_dsq_vtime)
dom_xfer_task(p, new_dom_id, now);
taskc->target_dom = new_dom_id;
taskc->domc = new_domc;
cast_kern(new_domc);
p->scx.dsq_vtime = dom_min_vruntime(new_domc);
taskc->deadline = p->scx.dsq_vtime +
scale_inverse_fair(taskc->avg_runtime, taskc->weight);
bpf_cpumask_and(t_cpumask, cast_mask(d_cpumask), p->cpus_ptr);
}
return taskc->target_dom == new_dom_id;
}
static s32 try_sync_wakeup(struct task_struct *p, struct task_ctx *taskc,
s32 prev_cpu)
{
struct task_struct *current = (void *)bpf_get_current_task_btf();
s32 cpu;
const struct cpumask *idle_cpumask;
bool share_llc, has_idle;
struct lb_domain *lb_domain;
struct bpf_cpumask *d_cpumask;
struct pcpu_ctx *pcpuc;
cpu = bpf_get_smp_processor_id();
pcpuc = lookup_pcpu_ctx(cpu);
if (!pcpuc)
return -ENOENT;
lb_domain = lb_domain_get(pcpuc->dom_id);
if (!lb_domain)
return -ENOENT;
d_cpumask = lb_domain->cpumask;
if (!d_cpumask) {
scx_bpf_error("Failed to acquire dom%u cpumask kptr",
taskc->target_dom);
return -ENOENT;
}
idle_cpumask = scx_bpf_get_idle_cpumask();
share_llc = bpf_cpumask_test_cpu(prev_cpu, cast_mask(d_cpumask));
if (share_llc && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
stat_add(RUSTY_STAT_SYNC_PREV_IDLE, 1);
cpu = prev_cpu;
goto out;
}
has_idle = bpf_cpumask_intersects(cast_mask(d_cpumask), idle_cpumask);
if (has_idle && bpf_cpumask_test_cpu(cpu, p->cpus_ptr) &&
!(current->flags & PF_EXITING) && taskc->target_dom < MAX_DOMS &&
scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu) == 0) {
stat_add(RUSTY_STAT_WAKE_SYNC, 1);
goto out;
}
cpu = -ENOENT;
out:
scx_bpf_put_idle_cpumask(idle_cpumask);
return cpu;
}
s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu,
u64 wake_flags)
{
const struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask();
struct task_ctx *taskc;
bool prev_domestic, has_idle_cores;
struct bpf_cpumask *p_cpumask, *tmp_cpumask;
s32 cpu;
refresh_tune_params();
if (!(taskc = lookup_task_ctx_mask(p, &p_cpumask)) || !p_cpumask)
goto enoent;
if (p->nr_cpus_allowed == 1) {
cpu = prev_cpu;
if (kthreads_local && (p->flags & PF_KTHREAD)) {
stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1);
} else {
stat_add(RUSTY_STAT_PINNED, 1);
}
goto direct;
}
if (wake_flags & SCX_WAKE_SYNC) {
cpu = try_sync_wakeup(p, taskc, prev_cpu);
if (cpu >= 0)
goto direct;
}
has_idle_cores = !bpf_cpumask_empty(idle_smtmask);
prev_domestic = bpf_cpumask_test_cpu(prev_cpu, cast_mask(p_cpumask));
if (prev_domestic) {
if (bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) &&
scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
stat_add(RUSTY_STAT_PREV_IDLE, 1);
cpu = prev_cpu;
goto direct;
}
} else {
if (direct_greedy_cpumask &&
bpf_cpumask_test_cpu(prev_cpu, cast_mask(direct_greedy_cpumask)) &&
bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) &&
scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
stat_add(RUSTY_STAT_GREEDY_IDLE, 1);
cpu = prev_cpu;
goto direct;
}
}
if (has_idle_cores) {
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_cpumask), SCX_PICK_IDLE_CORE);
if (cpu >= 0) {
stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1);
goto direct;
}
}
if (prev_domestic && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1);
cpu = prev_cpu;
goto direct;
}
cpu = scx_bpf_pick_idle_cpu(cast_mask(p_cpumask), 0);
if (cpu >= 0) {
stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1);
goto direct;
}
if (taskc->all_cpus && direct_greedy_cpumask &&
!bpf_cpumask_empty(cast_mask(direct_greedy_cpumask))) {
u32 dom_id = cpu_to_dom_id(prev_cpu);
dom_ptr domc;
struct lb_domain *lb_domain;
struct bpf_cpumask *tmp_direct_greedy, *node_mask;
if (unlikely(is_offline_cpu(prev_cpu)))
domc = NULL;
else if (!(domc = lookup_dom_ctx(dom_id)))
goto enoent;
if (!(lb_domain = lb_domain_get(domc->id))) {
scx_bpf_error("Failed to lookup domain map value");
goto enoent;
}
tmp_direct_greedy = direct_greedy_cpumask;
if (!tmp_direct_greedy) {
scx_bpf_error("Failed to lookup direct_greedy mask");
goto enoent;
}
if (!direct_greedy_numa && domc) {
node_mask = lb_domain->node_cpumask;
if (!node_mask) {
scx_bpf_error("Failed to lookup node mask");
goto enoent;
}
tmp_cpumask = scx_percpu_bpfmask();
if (!tmp_cpumask) {
scx_bpf_error("Failed to lookup tmp cpumask");
goto enoent;
}
bpf_cpumask_and(tmp_cpumask,
cast_mask(node_mask),
cast_mask(tmp_direct_greedy));
tmp_direct_greedy = tmp_cpumask;
}
if (has_idle_cores) {
if (domc && lb_domain->direct_greedy_cpumask) {
cpu = scx_bpf_pick_idle_cpu(cast_mask(lb_domain->direct_greedy_cpumask),
SCX_PICK_IDLE_CORE);
if (cpu >= 0) {
stat_add(RUSTY_STAT_DIRECT_GREEDY, 1);
goto direct;
}
}
if (direct_greedy_cpumask) {
cpu = scx_bpf_pick_idle_cpu(cast_mask(tmp_direct_greedy),
SCX_PICK_IDLE_CORE);
if (cpu >= 0) {
stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1);
goto direct;
}
}
}
if (domc && lb_domain->direct_greedy_cpumask) {
cpu = scx_bpf_pick_idle_cpu(cast_mask(lb_domain->direct_greedy_cpumask), 0);
if (cpu >= 0) {
stat_add(RUSTY_STAT_DIRECT_GREEDY, 1);
goto direct;
}
}
if (direct_greedy_cpumask) {
cpu = scx_bpf_pick_idle_cpu(cast_mask(tmp_direct_greedy), 0);
if (cpu >= 0) {
stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1);
goto direct;
}
}
}
if (prev_domestic) {
cpu = prev_cpu;
} else {
cpu = bpf_cpumask_any_distribute(cast_mask(p_cpumask));
if (cpu >= nr_cpu_ids)
cpu = prev_cpu;
}
scx_bpf_put_idle_cpumask(idle_smtmask);
return cpu;
direct:
taskc->dispatch_local = true;
scx_bpf_put_idle_cpumask(idle_smtmask);
return cpu;
enoent:
scx_bpf_put_idle_cpumask(idle_smtmask);
return -ENOENT;
}
static void place_task_dl(struct task_struct *p, struct task_ctx *taskc,
u64 enq_flags)
{
clamp_task_vtime(p, taskc, enq_flags);
scx_bpf_dsq_insert_vtime(p, taskc->target_dom, slice_ns, taskc->deadline,
enq_flags);
}
void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p __arg_trusted, u64 enq_flags)
{
struct task_ctx *taskc;
dom_ptr domc;
struct bpf_cpumask *p_cpumask;
s32 cpu = -1;
if (!(taskc = lookup_task_ctx_mask(p, &p_cpumask)) || !p_cpumask)
return;
domc = task_domain(taskc);
if (!domc)
return;
if (domc->id != taskc->target_dom &&
task_set_domain(p, domc->id, false)) {
stat_add(RUSTY_STAT_LOAD_BALANCE, 1);
taskc->dispatch_local = false;
cpu = bpf_cpumask_any_distribute(cast_mask(p_cpumask));
if (cpu < nr_cpu_ids)
scx_bpf_kick_cpu(cpu, 0);
goto dom_queue;
}
if (taskc->dispatch_local) {
taskc->dispatch_local = false;
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
return;
}
if (!bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), cast_mask(p_cpumask))) {
cpu = bpf_cpumask_any_distribute(cast_mask(p_cpumask));
if (cpu < nr_cpu_ids)
scx_bpf_kick_cpu(cpu, 0);
stat_add(RUSTY_STAT_REPATRIATE, 1);
}
dom_queue:
if (fifo_sched)
scx_bpf_dsq_insert(p, taskc->target_dom, slice_ns, enq_flags);
else
place_task_dl(p, taskc, enq_flags);
if (taskc->all_cpus) {
const struct cpumask *idle_cpumask;
idle_cpumask = scx_bpf_get_idle_cpumask();
if (kick_greedy_cpumask) {
cpu = bpf_cpumask_any_and_distribute(cast_mask(kick_greedy_cpumask), idle_cpumask);
if (cpu >= nr_cpu_ids)
cpu = -EBUSY;
}
scx_bpf_put_cpumask(idle_cpumask);
if (cpu >= 0) {
stat_add(RUSTY_STAT_KICK_GREEDY, 1);
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}
}
}
static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id)
{
struct lb_domain *lb_domain;
struct bpf_cpumask *dmask;
lb_domain = lb_domain_get(dom_id);
if (!lb_domain)
return false;
dmask = lb_domain->cpumask;
if (!dmask)
return false;
return bpf_cpumask_intersects(cpumask, cast_mask(dmask));
}
u32 dom_node_id(u32 dom_id)
{
const volatile u32 *nid_ptr;
nid_ptr = MEMBER_VPTR(dom_numa_id_map, [dom_id]);
if (!nid_ptr) {
scx_bpf_error("Couldn't look up node ID for %d", dom_id);
return 0;
}
return *nid_ptr;
}
#if !defined(__TARGET_ARCH_arm64) && !defined(__aarch64__) && !defined(__TARGET_ARCH_riscv)
static u64 node_dom_mask(u32 node_id)
{
u64 mask = 0;
u32 dom_id = 0;
bpf_for(dom_id, 0, nr_doms) {
if (dom_node_id(dom_id) != node_id)
continue;
mask |= 1LLU << dom_id;
}
return mask;
}
static void task_set_preferred_mempolicy_dom_mask(struct task_struct *p,
struct task_ctx *taskc)
{
struct bpf_cpumask *p_cpumask;
struct mempolicy *mempolicy;
u32 node_id;
u32 val = 0;
void *mask;
taskc->preferred_dom_mask = 0;
mempolicy = BPF_CORE_READ(p, mempolicy);
if (!mempolicy)
return;
p_cpumask = lookup_task_bpfmask(p);
if (!mempolicy_affinity || !p_cpumask)
return;
if (!(mempolicy->mode & (MPOL_BIND|MPOL_PREFERRED|MPOL_PREFERRED_MANY)))
return;
if ((int)mempolicy->home_node >= 0) {
taskc->preferred_dom_mask =
node_dom_mask((u32)mempolicy->home_node);
return;
}
mask = BPF_CORE_READ(mempolicy, nodes.bits);
if (bpf_core_read(&val, sizeof(val), mask))
return;
bpf_for(node_id, 0, nr_nodes) {
if (!(val & 1 << node_id))
continue;
taskc->preferred_dom_mask |= node_dom_mask(node_id);
}
return;
}
#else
static void task_set_preferred_mempolicy_dom_mask(struct task_struct *p,
struct task_ctx *taskc)
{}
#endif
void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev)
{
u32 curr_dom = cpu_to_dom_id(cpu), dom;
struct pcpu_ctx *pcpuc;
u32 my_node;
if (unlikely(is_offline_cpu(cpu)))
return;
if (scx_bpf_dsq_move_to_local(curr_dom, 0)) {
stat_add(RUSTY_STAT_DSQ_DISPATCH, 1);
return;
}
if (!greedy_threshold)
return;
pcpuc = lookup_pcpu_ctx(cpu);
if (!pcpuc)
return;
my_node = dom_node_id(curr_dom);
bpf_repeat(nr_doms - 1) {
dom = pcpuc->dom_rr_cur++ % nr_doms;
if (dom == curr_dom || dom_node_id(dom) != my_node)
continue;
if (scx_bpf_dsq_move_to_local(dom, 0)) {
stat_add(RUSTY_STAT_GREEDY_LOCAL, 1);
return;
}
}
if (!greedy_threshold_x_numa || nr_nodes == 1)
return;
bpf_repeat(nr_doms - 1) {
dom = pcpuc->dom_rr_cur++ % nr_doms;
if (dom_node_id(dom) == my_node || dom == curr_dom ||
scx_bpf_dsq_nr_queued(dom) >= greedy_threshold_x_numa)
continue;
if (scx_bpf_dsq_move_to_local(dom, 0)) {
stat_add(RUSTY_STAT_GREEDY_XNUMA, 1);
return;
}
}
}
static u64 calc_avg(u64 old_val, u64 new_val)
{
return (old_val - (old_val >> 2)) + (new_val >> 2);
}
static u64 update_freq(u64 freq, u64 interval)
{
u64 new_freq;
new_freq = (100 * NSEC_PER_MSEC) / interval;
return calc_avg(freq, new_freq);
}
void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags)
{
u64 now = scx_bpf_now(), interval;
struct task_struct *waker;
struct task_ctx *wakee_ctx, *waker_ctx;
if (!(wakee_ctx = lookup_task_ctx(p)))
return;
wakee_ctx->is_kworker = p->flags & PF_WQ_WORKER;
task_load_adj(wakee_ctx, now, true);
dom_dcycle_adj(wakee_ctx->domc, wakee_ctx->weight, now, true);
if (fifo_sched)
return;
wakee_ctx->sum_runtime = 0;
waker = bpf_get_current_task_btf();
if (!(waker_ctx = try_lookup_task_ctx(waker)))
return;
interval = now - waker_ctx->last_woke_at;
waker_ctx->waker_freq = update_freq(waker_ctx->waker_freq, interval);
waker_ctx->last_woke_at = now;
}
static void running_update_vtime(struct task_struct *p,
struct task_ctx *taskc,
dom_ptr domc)
{
struct bpf_spin_lock * lock = lookup_dom_vtime_lock(domc);
if (!lock)
return;
bpf_spin_lock(lock);
if (time_before(dom_min_vruntime(domc), p->scx.dsq_vtime))
WRITE_ONCE(domc->min_vruntime, p->scx.dsq_vtime);
bpf_spin_unlock(lock);
}
void BPF_STRUCT_OPS(rusty_running, struct task_struct *p)
{
task_ptr usrptr = (task_ptr)sdt_task_data(p);
struct task_ctx *taskc;
dom_ptr domc;
u32 dap_gen;
if (!(taskc = lookup_task_ctx(p)))
return;
domc = task_domain(taskc);
if (!domc) {
scx_bpf_error("Invalid dom ID");
return;
}
dap_gen = domc->active_tasks.genn;
if (taskc->dom_active_tasks_gen != dap_gen) {
u64 idx = __sync_fetch_and_add(&domc->active_tasks.write_idx, 1) %
MAX_DOM_ACTIVE_TPTRS;
if (idx >= MAX_DOM_ACTIVE_TPTRS) {
scx_bpf_error("dom_active_tasks[%u][%llu] out of bounds indexing",
domc->id, idx);
return;
}
usrptr = (task_ptr)sdt_task_data(p);
cast_user(usrptr);
domc->active_tasks.tasks[idx] = usrptr;
taskc->dom_active_tasks_gen = dap_gen;
}
if (fifo_sched)
return;
running_update_vtime(p, taskc, domc);
taskc->last_run_at = scx_bpf_now();
}
static void stopping_update_vtime(struct task_struct *p,
struct task_ctx *taskc,
dom_ptr domc)
{
u64 now, delta;
now = scx_bpf_now();
delta = now - taskc->last_run_at;
taskc->sum_runtime += delta;
taskc->avg_runtime = calc_avg(taskc->avg_runtime, taskc->sum_runtime);
p->scx.dsq_vtime += scale_inverse_fair(delta, p->scx.weight);
taskc->deadline = p->scx.dsq_vtime + task_compute_dl(p, taskc, 0);
}
void BPF_STRUCT_OPS(rusty_stopping, struct task_struct *p, bool runnable)
{
struct task_ctx *taskc;
dom_ptr domc;
if (fifo_sched)
return;
if (!(taskc = lookup_task_ctx(p)))
return;
if (!(domc = task_domain(taskc)))
return;
stopping_update_vtime(p, taskc, domc);
}
void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags)
{
u64 now = scx_bpf_now(), interval;
struct task_ctx *taskc;
dom_ptr domc;
if (!(taskc = lookup_task_ctx(p)))
return;
if (!(domc = task_domain(taskc)))
return;
task_load_adj(taskc, now, false);
dom_dcycle_adj(domc, taskc->weight, now, false);
if (fifo_sched)
return;
interval = now - taskc->last_blocked_at;
taskc->blocked_freq = update_freq(taskc->blocked_freq, interval);
taskc->last_blocked_at = now;
}
void BPF_STRUCT_OPS(rusty_set_weight, struct task_struct *p, u32 weight)
{
struct task_ctx *taskc;
if (!(taskc = lookup_task_ctx(p)))
return;
if (debug >= 2)
bpf_printk("%s[%p]: SET_WEIGHT %u -> %u", p->comm, p,
taskc->weight, weight);
taskc->weight = weight;
}
static u32 task_pick_domain(struct task_ctx *taskc, struct task_struct *p,
const struct cpumask *cpumask)
{
s32 cpu = bpf_get_smp_processor_id();
u32 first_dom = NO_DOM_FOUND, dom, preferred_dom = NO_DOM_FOUND;
if (cpu < 0 || cpu >= MAX_CPUS)
return NO_DOM_FOUND;
taskc->dom_mask = 0;
dom = pcpu_ctx[cpu].dom_rr_cur++;
task_set_preferred_mempolicy_dom_mask(p, taskc);
bpf_repeat(nr_doms) {
dom = (dom + 1) % nr_doms;
if (cpumask_intersects_domain(cpumask, dom)) {
taskc->dom_mask |= 1LLU << dom;
if (first_dom == NO_DOM_FOUND)
first_dom = dom;
if (taskc->preferred_dom_mask == 0)
continue;
if (((1LLU << dom) & taskc->preferred_dom_mask)
&& preferred_dom == NO_DOM_FOUND)
preferred_dom = dom;
}
}
return preferred_dom != NO_DOM_FOUND ? preferred_dom: first_dom;
}
static void task_pick_and_set_domain(struct task_ctx *taskc,
struct task_struct *p,
const struct cpumask *cpumask,
bool init_dsq_vtime)
{
u32 dom_id = 0;
if (nr_doms > 1)
dom_id = task_pick_domain(taskc, p, cpumask);
if (!task_set_domain(p, dom_id, init_dsq_vtime))
scx_bpf_error("Failed to set dom%d for %s[%p]",
dom_id, p->comm, p);
}
void BPF_STRUCT_OPS(rusty_set_cpumask, struct task_struct *p,
const struct cpumask *cpumask)
{
struct task_ctx *taskc;
if (!(taskc = lookup_task_ctx(p)))
return;
task_pick_and_set_domain(taskc, p, cpumask, false);
if (all_cpumask)
taskc->all_cpus =
bpf_cpumask_subset(cast_mask(all_cpumask), cpumask);
}
s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init_task, struct task_struct *p,
struct scx_init_task_args *args)
{
u64 now = scx_bpf_now();
struct task_struct *p_map;
struct bpfmask_wrapper wrapper = {};
struct bpfmask_wrapper *mask_map_value;
struct task_ctx *taskc;
long ret;
taskc = (struct task_ctx *)sdt_task_alloc(p);
if (!taskc)
return -ENOMEM;
*taskc = (struct task_ctx) {
.dom_active_tasks_gen = -1,
.last_blocked_at = now,
.last_woke_at = now,
.preferred_dom_mask = 0,
.pid = p->pid,
};
if (debug >= 2)
bpf_printk("%s[%p]: INIT (weight %u))", p->comm, p, p->scx.weight);
p_map = p;
ret = bpf_map_update_elem(&task_masks, &p_map, &wrapper, 0 );
if (ret) {
sdt_task_free(p);
return ret;
}
mask_map_value = bpf_map_lookup_elem(&task_masks, &p_map);
if (!mask_map_value) {
sdt_task_free(p);
return -EINVAL;
}
ret = create_save_cpumask(&mask_map_value->instance);
if (ret) {
bpf_map_delete_elem(&task_masks, &p_map);
sdt_task_free(p);
return ret;
}
bpf_rcu_read_lock();
task_pick_and_set_domain(taskc, p, p->cpus_ptr, true);
bpf_rcu_read_unlock();
return 0;
}
void BPF_STRUCT_OPS(rusty_exit_task, struct task_struct *p,
struct scx_exit_task_args *args)
{
long ret;
sdt_task_free(p);
ret = bpf_map_delete_elem(&task_masks, &p);
if (ret) {
stat_add(RUSTY_STAT_TASK_GET_ERR, 1);
return;
}
}
static s32 create_node(u32 node_id)
{
u32 cpu;
struct bpf_cpumask *cpumask;
struct node_ctx *nodec;
s32 ret;
nodec = bpf_map_lookup_elem(&node_data, &node_id);
if (!nodec) {
scx_bpf_error("No node%u", node_id);
return -ENOENT;
}
ret = create_save_cpumask(&nodec->cpumask);
if (ret)
return ret;
bpf_rcu_read_lock();
cpumask = nodec->cpumask;
if (!cpumask) {
bpf_rcu_read_unlock();
scx_bpf_error("Failed to lookup node cpumask");
return -ENOENT;
}
bpf_for(cpu, 0, MAX_CPUS) {
const volatile u64 *nmask;
nmask = MEMBER_VPTR(numa_cpumasks, [node_id][cpu / 64]);
if (!nmask) {
scx_bpf_error("array index error");
ret = -ENOENT;
break;
}
if (*nmask & (1LLU << (cpu % 64)))
bpf_cpumask_set_cpu(cpu, cpumask);
}
bpf_rcu_read_unlock();
return ret;
}
__weak s32 create_dom(u32 dom_id)
{
dom_ptr domc;
struct node_ctx *nodec;
struct bpf_cpumask *dom_mask, *node_mask, *all_mask;
struct lb_domain *lb_domain;
u32 cpu, node_id;
int perf;
s32 ret;
if (dom_id >= MAX_DOMS) {
scx_bpf_error("Max dom ID %u exceeded (%u)", MAX_DOMS, dom_id);
return -EINVAL;
}
node_id = dom_node_id(dom_id);
domc = lb_domain_alloc(dom_id);
if (!domc)
return -ENOMEM;
dom_ctxs[dom_id] = domc;
cast_user(dom_ctxs[dom_id]);
lb_domain = lb_domain_get(dom_id);
if (!lb_domain) {
scx_bpf_error("could not retrieve dom%d data\n", dom_id);
lb_domain_free(domc);
return -EINVAL;
}
ret = scx_bpf_create_dsq(dom_id, node_id);
if (ret < 0) {
scx_bpf_error("Failed to create dsq %u (%d)", dom_id, ret);
return ret;
}
domc->id = dom_id;
ret = create_save_cpumask(&lb_domain->cpumask);
if (ret)
return ret;
bpf_printk("Created domain %d (%p)", dom_id, &lb_domain->cpumask);
if (!lb_domain->cpumask)
scx_bpf_error("NULL");
bpf_rcu_read_lock();
dom_mask = lb_domain->cpumask;
all_mask = all_cpumask;
if (!dom_mask || !all_mask) {
bpf_rcu_read_unlock();
scx_bpf_error("Could not find cpumask");
return -ENOENT;
}
bpf_for(cpu, 0, MAX_CPUS) {
const volatile u64 *dmask;
bool cpu_in_domain;
dmask = MEMBER_VPTR(dom_cpumasks, [dom_id][cpu / 64]);
if (!dmask) {
scx_bpf_error("array index error");
ret = -ENOENT;
break;
}
cpu_in_domain = *dmask & (1LLU << (cpu % 64));
if (!cpu_in_domain)
continue;
bpf_cpumask_set_cpu(cpu, dom_mask);
bpf_cpumask_set_cpu(cpu, all_mask);
perf = min(SCX_CPUPERF_ONE, rusty_perf_mode);
scx_bpf_cpuperf_set(cpu, perf);
}
bpf_rcu_read_unlock();
if (ret)
return ret;
ret = create_save_cpumask(&lb_domain->direct_greedy_cpumask);
if (ret)
return ret;
nodec = bpf_map_lookup_elem(&node_data, &node_id);
if (!nodec) {
scx_bpf_error("No node%u", node_id);
return -ENOENT;
}
ret = create_save_cpumask(&lb_domain->node_cpumask);
if (ret)
return ret;
bpf_rcu_read_lock();
node_mask = nodec->cpumask;
dom_mask = lb_domain->node_cpumask;
if (!node_mask || !dom_mask) {
bpf_rcu_read_unlock();
scx_bpf_error("cpumask lookup failed");
return -ENOENT;
}
bpf_cpumask_copy(dom_mask, cast_mask(node_mask));
bpf_rcu_read_unlock();
return 0;
}
static s32 initialize_cpu(s32 cpu)
{
struct pcpu_ctx *pcpuc = lookup_pcpu_ctx(cpu);
u32 i;
if (!pcpuc)
return -ENOENT;
pcpuc->dom_rr_cur = cpu;
bpf_for(i, 0, nr_doms) {
bool in_dom;
struct lb_domain *lb_domain;
lb_domain = lb_domain_get(i);
if (!lb_domain)
return -ENOENT;
bpf_rcu_read_lock();
if (!lb_domain->cpumask) {
bpf_rcu_read_unlock();
scx_bpf_error("Failed to lookup dom node %d cpumask %p",
i, &lb_domain->cpumask);
return -ENOENT;
}
in_dom = bpf_cpumask_test_cpu(cpu, cast_mask(lb_domain->cpumask));
bpf_rcu_read_unlock();
if (in_dom) {
pcpuc->dom_id = i;
return 0;
}
}
return -ENOENT;
}
s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init)
{
s32 i, ret;
ret = sdt_static_init(STATIC_ALLOC_PAGES_GRANULARITY);
if (ret)
return ret;
ret = sdt_task_init(sizeof(struct task_ctx));
if (ret)
return ret;
ret = lb_domain_init();
if (ret)
return ret;
ret = create_save_cpumask(&all_cpumask);
if (ret)
return ret;
ret = create_save_cpumask(&direct_greedy_cpumask);
if (ret)
return ret;
ret = create_save_cpumask(&kick_greedy_cpumask);
if (ret)
return ret;
ret = scx_rusty_percpu_storage_init();
if (ret)
return ret;
bpf_for(i, 0, nr_nodes) {
ret = create_node(i);
if (ret)
return ret;
}
bpf_for(i, 0, nr_doms) {
ret = create_dom(i);
if (ret)
return ret;
}
bpf_for(i, 0, nr_cpu_ids) {
if (is_offline_cpu(i))
continue;
ret = initialize_cpu(i);
if (ret)
return ret;
}
return 0;
}
void BPF_STRUCT_OPS(rusty_exit, struct scx_exit_info *ei)
{
UEI_RECORD(uei, ei);
}
SCX_OPS_DEFINE(rusty,
.select_cpu = (void *)rusty_select_cpu,
.enqueue = (void *)rusty_enqueue,
.dispatch = (void *)rusty_dispatch,
.runnable = (void *)rusty_runnable,
.running = (void *)rusty_running,
.stopping = (void *)rusty_stopping,
.quiescent = (void *)rusty_quiescent,
.set_weight = (void *)rusty_set_weight,
.set_cpumask = (void *)rusty_set_cpumask,
.init_task = (void *)rusty_init_task,
.exit_task = (void *)rusty_exit_task,
.init = (void *)rusty_init,
.exit = (void *)rusty_exit,
.timeout_ms = 10000,
.name = "rusty");