#ifdef LSP
#ifndef __bpf__
#define __bpf__
#endif
#include "../../../../include/scx/common.bpf.h"
#include "../../../../include/scx/namespace_impl.bpf.h"
#else
#include <scx/common.bpf.h>
#include <scx/namespace_impl.bpf.h>
#endif
#include <errno.h>
#include <stdbool.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "intf.h"
#include "timer.bpf.h"
#include "util.bpf.h"
/* License string placed in the "license" section of the BPF object. */
char _license[] SEC("license") = "GPL";
/* Kernel tick rate, resolved from kconfig. */
extern unsigned CONFIG_HZ __kconfig;
/*
 * Read-only configuration. These are const volatile globals; presumably they
 * are populated by the userspace loader before the program is loaded -- TODO
 * confirm against the loader.
 */
const volatile u32 debug;
const volatile u64 slice_ns;
/* Upper bound compared against cpuc->ran_current_for in keep_running(); 0 disables. */
const volatile u64 max_exec_ns;
const volatile u32 nr_cpu_ids = 1;
const volatile u32 nr_possible_cpus = 1;
/* Topology tables indexed by node / llc / cpu id. */
const volatile u64 numa_cpumasks[MAX_NUMA_NODES][MAX_CPUS / 64];
const volatile u32 llc_numa_id_map[MAX_LLCS];
const volatile u32 cpu_llc_id_map[MAX_CPUS];
const volatile u32 nr_layers = 1;
const volatile u32 nr_nodes = 32;
const volatile u32 nr_llcs = 32;
const volatile bool smt_enabled = true;
const volatile bool has_little_cores = true;
const volatile bool xnuma_preemption = false;
/* Per-CPU SMT sibling table; read through sibling_cpu(). */
const volatile s32 __sibling_cpu[MAX_CPUS];
const volatile bool monitor_disable = false;
const volatile unsigned char all_cpus[MAX_CPUS_U8];
const volatile u32 layer_iteration_order[MAX_LAYERS];
/* Per-kind layer counts; nr_excl_layers gates the exclusive-sibling logic below. */
const volatile u32 nr_op_layers;
const volatile u32 nr_on_layers;
const volatile u32 nr_gp_layers;
const volatile u32 nr_gn_layers;
const volatile u32 nr_excl_layers;
const volatile u64 min_open_layer_disallow_open_after_ns;
const volatile u64 min_open_layer_disallow_preempt_after_ns;
const volatile u64 lo_fb_wait_ns = 5000000;
const volatile u32 lo_fb_share_ppk = 128;
/* Knobs consulted by is_percpu_kthread_preempting(). */
const volatile bool percpu_kthread_preempt = true;
const volatile bool percpu_kthread_preempt_all = false;
/* Compared against taskc->layer_refresh_seq in pick_idle_cpu() to request a layer refresh. */
volatile u64 layer_refresh_seq_avgruntime;
const volatile bool enable_antistall = true;
const volatile bool enable_match_debug = false;
/* When false, save_gpu_tgid_pid() is a no-op. */
const volatile bool enable_gpu_support = false;
const volatile u64 antistall_sec = 3;
/* Constant zero key used for the single-entry per-CPU cpu_ctxs map. */
const u32 zero_u32 = 0;
/* Addresses compared against curr->sched_class in try_preempt_cpu(); 0 disables the check. */
const volatile u64 ext_sched_class_addr = 0;
const volatile u64 idle_sched_class_addr = 0;
/* CPUs whose cpu_ctx is not marked protected; maintained by refresh_layer_cpuc(). */
private(unprotected_cpumask) struct bpf_cpumask __kptr *unprotected_cpumask;
/* Bumped by refresh_cpumasks() whenever unprotected_cpumask membership changed. */
u64 unprotected_seq = 0;
private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask;
/* Mask of big cores; used by pick_idle_big_little(). */
private(big_cpumask) struct bpf_cpumask __kptr *big_cpumask;
/* Layer table indexed by layer id; see lookup_layer(). */
struct layer layers[MAX_LAYERS];
u32 fallback_cpu;
/* tgid recorded by initialize_pid_namespace(); matched by is_scheduler_task(). */
u32 layered_root_tgid = 0;
u32 empty_layer_ids[MAX_LAYERS];
u32 nr_empty_layer_ids;
UEI_DEFINE(uei);
/*
 * Per-task hint value stored in scx_layered_task_hint_map.
 * NOTE(review): no reader or writer of this map is visible in this part of
 * the file -- presumably written from outside the scheduler; confirm.
 */
struct task_hint {
u64 hint;
u64 __reserved[3];
};
/* Task-local storage map holding one struct task_hint per task. */
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct task_hint);
} scx_layered_task_hint_map SEC(".maps");
/*
 * Translate a static priority into a nice value by removing the fixed
 * offset of 120 (so a static priority of 120 yields nice 0).
 */
static inline s32 prio_to_nice(s32 static_prio)
{
	s32 nice = static_prio - 120;

	return nice;
}
static inline bool is_percpu_kthread(struct task_struct *p)
{
return (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1;
}
static inline bool is_percpu_kthread_preempting(struct task_struct *p)
{
return percpu_kthread_preempt &&
(percpu_kthread_preempt_all || p->scx.weight > 100);
}
/*
 * Look up the SMT sibling of @cpu in __sibling_cpu[].
 *
 * Returns -1 when SMT is disabled or when @cpu is not a valid index into
 * the table; otherwise returns the stored table entry.
 */
static inline s32 sibling_cpu(s32 cpu)
{
	const volatile s32 *entry;

	if (!smt_enabled)
		return -1;

	entry = MEMBER_VPTR(__sibling_cpu, [cpu]);
	return entry ? *entry : -1;
}
/*
 * Return the layer with index @id, or NULL after raising a scheduler
 * error when @id is not below nr_layers.
 */
static __always_inline struct layer *lookup_layer(u32 id)
{
	if (id < nr_layers)
		return &layers[id];

	scx_bpf_error("invalid layer %d", id);
	return NULL;
}
/*
 * Build the DSQ id for a (layer, llc) pair: the layer id is shifted up by
 * DSQ_ID_LAYER_SHIFT and the llc id is OR-ed into the low bits.
 */
static __noinline u64 layer_dsq_id(u32 layer_id, u32 llc_id)
{
	u64 dsq_id = (u64)layer_id << DSQ_ID_LAYER_SHIFT;

	dsq_id |= llc_id;
	return dsq_id;
}
/*
 * Map @cpu_id to its LLC id via cpu_llc_id_map[].
 *
 * Raises a scheduler error and returns 0 when @cpu_id is not a valid index.
 */
static __always_inline u32 cpu_to_llc_id(s32 cpu_id)
{
	const volatile u32 *entry = MEMBER_VPTR(cpu_llc_id_map, [cpu_id]);

	if (entry)
		return *entry;

	scx_bpf_error("Couldn't look up llc ID for cpu %d", cpu_id);
	return 0;
}
/*
 * Map @llc_id to its NUMA node id via llc_numa_id_map[].
 *
 * Raises a scheduler error and returns 0 when @llc_id is not a valid index
 * into the table.
 */
u32 llc_node_id(u32 llc_id)
{
	const volatile u32 *node_ptr;

	node_ptr = MEMBER_VPTR(llc_numa_id_map, [llc_id]);
	if (!node_ptr) {
		/* the failed lookup is the node id of an llc, not an llc id */
		scx_bpf_error("Couldn't look up node ID for llc %u", llc_id);
		return 0;
	}
	return *node_ptr;
}
/* Compose the high-priority fallback DSQ id for @llc_id. */
static u64 hi_fb_dsq_id(u32 llc_id)
{
	u64 dsq_id = HI_FB_DSQ_BASE;

	dsq_id |= llc_id;
	return dsq_id;
}
/* Compose the low-priority fallback DSQ id for @llc_id. */
static u64 lo_fb_dsq_id(u32 llc_id)
{
	u64 dsq_id = LO_FB_DSQ_BASE;

	dsq_id |= llc_id;
	return dsq_id;
}
/*
 * Return true when @p belongs to the thread group whose tgid was recorded
 * in layered_root_tgid.
 */
static __always_inline bool is_scheduler_task(struct task_struct *p)
{
	u32 tgid = (u32)p->tgid;

	return tgid == layered_root_tgid;
}
/*
 * Per-CPU array with a single slot (key 0) holding each CPU's struct cpu_ctx.
 * Accessed through lookup_cpu_ctx().
 */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, struct cpu_ctx);
__uint(max_entries, 1);
} cpu_ctxs SEC(".maps");
/*
 * Fetch the cpu_ctx for @cpu.
 *
 * A negative @cpu selects the entry of the CPU the program is running on;
 * otherwise the entry of the given CPU is looked up. Raises a scheduler
 * error and returns NULL when the lookup fails.
 */
static struct cpu_ctx *lookup_cpu_ctx(int cpu)
{
	struct cpu_ctx *ctx;

	ctx = cpu < 0 ? bpf_map_lookup_elem(&cpu_ctxs, &zero_u32)
		      : bpf_map_lookup_percpu_elem(&cpu_ctxs, &zero_u32, cpu);
	if (ctx)
		return ctx;

	scx_bpf_error("no cpu_ctx for cpu %d", cpu);
	return NULL;
}
/*
 * Tell whether the CPU described by @cpuc currently belongs to @layer.
 *
 * Open layers are matched through the in_open_layers flag; every other
 * kind is matched by comparing the CPU's layer_id with the layer's id.
 */
static bool cpuc_in_layer(struct cpu_ctx *cpuc, struct layer *layer)
{
	bool is_open = layer->kind == LAYER_KIND_OPEN;

	return is_open ? cpuc->in_open_layers : cpuc->layer_id == layer->id;
}
/*
 * u32 -> u32 hash map sized for MAX_TASKS.
 * NOTE(review): presumably debugging state for layer matching (see
 * enable_match_debug); its users are not visible in this part of the file.
 */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, u32);
__type(value, u32);
__uint(max_entries, MAX_TASKS);
__uint(map_flags, BPF_F_NO_PREALLOC);
} layer_match_dbg SEC(".maps");
/* Keyed by the upper 32 bits of bpf_get_current_pid_tgid(); filled by save_gpu_tgid_pid(). */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, u32);
__type(value, u32);
__uint(max_entries, MAX_GPU_PIDS);
__uint(map_flags, BPF_F_NO_PREALLOC);
} gpu_tgid SEC(".maps");
/* Keyed by the lower 32 bits of bpf_get_current_pid_tgid(); filled by save_gpu_tgid_pid(). */
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, u32);
__type(value, u32);
__uint(max_entries, MAX_GPU_PIDS);
__uint(map_flags, BPF_F_NO_PREALLOC);
} gpu_tid SEC(".maps");
/*
 * Record the current task in the GPU tracking maps.
 *
 * The upper half of bpf_get_current_pid_tgid() is stored as a key in
 * gpu_tgid and the lower half as a key in gpu_tid, both with value 0.
 * Does nothing when enable_gpu_support is false. Map update failures are
 * ignored. Always returns 0.
 */
int save_gpu_tgid_pid() {
	u64 ids;
	u32 tgid, thread_id, val = 0;

	if (!enable_gpu_support)
		return 0;

	ids = bpf_get_current_pid_tgid();
	tgid = ids >> 32;
	thread_id = ids;

	bpf_map_update_elem(&gpu_tid, &thread_id, &val, BPF_ANY);
	bpf_map_update_elem(&gpu_tgid, &tgid, &val, BPF_ANY);

	return 0;
}
/*
 * Optional kprobes (the leading '?' in the section name) on nvidia_poll,
 * nvidia_open and nvidia_mmap. Each one records the calling task through
 * save_gpu_tgid_pid().
 */
SEC("?kprobe/nvidia_poll")
int kprobe_nvidia_poll() {
return save_gpu_tgid_pid();
}
SEC("?kprobe/nvidia_open")
int kprobe_nvidia_open() {
return save_gpu_tgid_pid();
}
SEC("?kprobe/nvidia_mmap")
int kprobe_nvidia_mmap() {
return save_gpu_tgid_pid();
}
/* Array of struct node_ctx indexed by NUMA node id; see lookup_node_ctx(). */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, struct node_ctx);
__uint(max_entries, MAX_NUMA_NODES);
__uint(map_flags, 0);
} node_data SEC(".maps");
/*
 * Fetch the node_ctx for @node from node_data. Raises a scheduler error
 * and returns NULL when the entry does not exist.
 */
static struct node_ctx *lookup_node_ctx(u32 node)
{
	struct node_ctx *ctx = bpf_map_lookup_elem(&node_data, &node);

	if (!ctx)
		scx_bpf_error("no node_ctx");

	return ctx;
}
/* Array of struct llc_ctx indexed by LLC id; see lookup_llc_ctx(). */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, struct llc_ctx);
__uint(max_entries, MAX_LLCS);
__uint(map_flags, 0);
} llc_data SEC(".maps");
/*
 * Fetch the llc_ctx for @llc_id from llc_data. Raises a scheduler error
 * and returns NULL when the entry does not exist.
 */
static struct llc_ctx *lookup_llc_ctx(u32 llc_id)
{
	struct llc_ctx *ctx = bpf_map_lookup_elem(&llc_data, &llc_id);

	if (!ctx)
		scx_bpf_error("no llc_ctx");

	return ctx;
}
/*
 * Add @delta to global stat @id in @cpuc. An @id at or beyond NR_GSTATS
 * raises a scheduler error and leaves the stats untouched.
 */
static void gstat_add(u32 id, struct cpu_ctx *cpuc, s64 delta)
{
	if (id < NR_GSTATS) {
		cpuc->gstats[id] += delta;
		return;
	}

	scx_bpf_error("invalid global stat id %d", id);
}
/* Bump global stat @id in @cpuc by one. */
static void gstat_inc(u32 id, struct cpu_ctx *cpuc)
{
	const s64 one = 1;

	gstat_add(id, cpuc, one);
}
/*
 * Add @delta to per-layer stat @id of @layer in @cpuc.
 *
 * The slot is cpuc->lstats[layer->id][id]; when either index is out of
 * range a scheduler error is raised and nothing is modified.
 */
static void lstat_add(u32 id, struct layer *layer, struct cpu_ctx *cpuc, s64 delta)
{
	u64 *slot = MEMBER_VPTR(*cpuc, .lstats[layer->id][id]);

	if (!slot) {
		scx_bpf_error("invalid layer or stat ids: %d, %d", id, layer->id);
		return;
	}

	*slot += delta;
}
/* Bump per-layer stat @id of @layer in @cpuc by one. */
static void lstat_inc(u32 id, struct layer *layer, struct cpu_ctx *cpuc)
{
	const s64 one = 1;

	lstat_add(id, layer, cpuc, one);
}
/*
 * Map value holding the two kptr cpumasks kept per layer:
 * @cpumask is rebuilt from layers[].cpus by refresh_cpumasks(), and
 * @cpuset is rebuilt from layers[].cpuset by layer_cpuset_bpfmask().
 */
struct layer_cpumask_wrapper {
struct bpf_cpumask __kptr *cpumask;
struct bpf_cpumask __kptr *cpuset;
};
/* Array of layer_cpumask_wrapper indexed by layer id. */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, u32);
__type(value, struct layer_cpumask_wrapper);
__uint(max_entries, MAX_LAYERS);
__uint(map_flags, 0);
} layer_cpumasks SEC(".maps");
/*
 * Return the cpumask of layer @layer_id as a plain struct cpumask pointer.
 *
 * Raises a scheduler error and returns NULL when the layer has no entry in
 * layer_cpumasks. The returned pointer may itself be NULL if the kptr was
 * never set.
 */
static struct cpumask *lookup_layer_cpumask(u32 layer_id)
{
	struct layer_cpumask_wrapper *wrapper;

	wrapper = bpf_map_lookup_elem(&layer_cpumasks, &layer_id);
	if (!wrapper) {
		scx_bpf_error("no layer_cpumask for layer %d", layer_id);
		return NULL;
	}

	return (struct cpumask *)wrapper->cpumask;
}
/*
 * Return the cpuset mask of layer @layer_id.
 *
 * Raises a scheduler error and returns NULL when the layer has no entry in
 * layer_cpumasks. The returned pointer may itself be NULL if the kptr was
 * never set.
 */
static struct bpf_cpumask *lookup_layer_cpuset(u32 layer_id)
{
	struct layer_cpumask_wrapper *wrapper;

	wrapper = bpf_map_lookup_elem(&layer_cpumasks, &layer_id);
	if (!wrapper) {
		scx_bpf_error("no layer_cpuset for layer %d", layer_id);
		return NULL;
	}

	return wrapper->cpuset;
}
/* Atomically set bit @llc_id in @layer's llcs_to_drain bitmap. */
static void layer_llc_drain_enable(struct layer *layer, u32 llc_id)
{
	u64 llc_bit = 1LLU << llc_id;

	__sync_or_and_fetch(&layer->llcs_to_drain, llc_bit);
}
/* Atomically clear bit @llc_id in @layer's llcs_to_drain bitmap. */
static void layer_llc_drain_disable(struct layer *layer, u32 llc_id)
{
	u64 keep_mask = ~(1LLU << llc_id);

	__sync_and_and_fetch(&layer->llcs_to_drain, keep_mask);
}
/*
 * Make @cpuc reflect that its CPU is now assigned to @layer.
 *
 * Updates in_open_layers/layer_id and, when the layer's is_protected flag
 * differs from the CPU's, copies it and flips the CPU's bit in
 * unprotected_cpumask accordingly.
 *
 * Returns true only when the protection state changed and the mask was
 * updated; false otherwise (including when unprotected_cpumask is NULL,
 * which also raises a scheduler error).
 */
static inline bool refresh_layer_cpuc(struct cpu_ctx *cpuc, struct layer *layer)
{
cpuc->in_open_layers = (layer->kind == LAYER_KIND_OPEN);
/* CPUs of open layers carry MAX_LAYERS as layer_id */
cpuc->layer_id = (layer->kind == LAYER_KIND_OPEN) ? MAX_LAYERS : layer->id;
if (cpuc->is_protected == layer->is_protected)
return false;
/* note: the flag is updated before the mask pointer is validated */
cpuc->is_protected = layer->is_protected;
if (unlikely(!unprotected_cpumask)) {
scx_bpf_error("unprotected_cpumask not initialized");
return false;
}
if (cpuc->is_protected)
bpf_cpumask_clear_cpu(cpuc->cpu, unprotected_cpumask);
else
bpf_cpumask_set_cpu(cpuc->cpu, unprotected_cpumask);
return true;
}
/*
 * Rebuild the cpuset bpf_cpumask of layer @layer_id from the byte array
 * layers[layer_id].cpuset: for each CPU below nr_possible_cpus the matching
 * bit is set or cleared in the mask.
 *
 * Runs under bpf_rcu_read_lock(). On any lookup failure the lock is dropped,
 * a scheduler error is raised and the function returns early, possibly
 * leaving the mask partially updated.
 */
static void layer_cpuset_bpfmask(int layer_id)
{
struct bpf_cpumask *layer_cpuset;
u8 *u8_ptr;
int cpu;
bpf_rcu_read_lock();
bpf_for(cpu, 0, nr_possible_cpus) {
/* byte of the bitmap that contains this CPU's bit */
u8_ptr = MEMBER_VPTR(layers, [layer_id].cpuset[cpu / 8]);
if (!u8_ptr) {
bpf_rcu_read_unlock();
scx_bpf_error("could not find cpuset byte");
return;
}
/* the mask is looked up again on every iteration */
layer_cpuset = lookup_layer_cpuset(layer_id);
if (!layer_cpuset) {
bpf_rcu_read_unlock();
scx_bpf_error("uninitialized cpuset");
return;
}
if (*u8_ptr & (1 << (cpu % 8)))
bpf_cpumask_set_cpu(cpu, layer_cpuset);
else
bpf_cpumask_clear_cpu(cpu, layer_cpuset);
}
bpf_rcu_read_unlock();
}
/*
 * Rebuild the bpf_cpumask of layer @layer_id from layers[layer_id].cpus and
 * propagate the change to per-CPU contexts.
 *
 * Does nothing unless the layer's refresh_cpus flag was set; the flag is
 * atomically reset to 0 before the rebuild. After the rebuild:
 *  - unprotected_seq is bumped if any CPU's protection state changed,
 *  - the layer's cpus_seq is bumped,
 *  - the per-LLC drain bits are recomputed,
 *  - every CPU that now belongs to the layer is kicked if idle.
 *
 * Always returns 0.
 */
int refresh_cpumasks(u32 layer_id)
{
struct bpf_cpumask *layer_cpumask;
struct layer_cpumask_wrapper *cpumaskw;
bool protected_changed = false;
struct layer *layer;
struct cpu_ctx *cpuc;
int cpu, llc_id;
layer = MEMBER_VPTR(layers, [layer_id]);
if (!layer) {
scx_bpf_error("can't happen");
return 0;
}
/* proceed only if refresh_cpus was 1; swap it back to 0 */
if (!__sync_val_compare_and_swap(&layer->refresh_cpus, 1, 0))
return 0;
bpf_rcu_read_lock();
if (!(cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &layer_id)) ||
!(layer_cpumask = cpumaskw->cpumask)) {
bpf_rcu_read_unlock();
scx_bpf_error("can't happen");
return 0;
}
bpf_for(cpu, 0, nr_possible_cpus) {
u8 *u8_ptr;
if (!(cpuc = lookup_cpu_ctx(cpu))) {
bpf_rcu_read_unlock();
return 0;
}
if ((u8_ptr = MEMBER_VPTR(layers, [layer_id].cpus[cpu / 8]))) {
if (*u8_ptr & (1 << (cpu % 8))) {
/* CPU is in the layer: sync its ctx and set its mask bit */
protected_changed = refresh_layer_cpuc(cpuc, layer) || protected_changed;
bpf_cpumask_set_cpu(cpu, layer_cpumask);
} else {
/* CPU is not in the layer: drop any stale membership */
if (layer->kind == LAYER_KIND_OPEN)
cpuc->in_open_layers = false;
else if (cpuc->layer_id == layer_id)
cpuc->layer_id = MAX_LAYERS;
bpf_cpumask_clear_cpu(cpu, layer_cpumask);
}
} else {
scx_bpf_error("can't happen");
}
}
bpf_rcu_read_unlock();
if (protected_changed)
__sync_fetch_and_add(&unprotected_seq, 1);
__sync_fetch_and_add(&layer->cpus_seq, 1);
trace("LAYER[%d] now has %d cpus, seq=%llu", layer_id, layer->nr_cpus, layer->cpus_seq);
/*
 * An LLC in which the layer has CPUs needs no draining; an LLC without
 * CPUs but with queued tasks in the layer's DSQ is marked for draining.
 */
bpf_for(llc_id, 0, nr_llcs) {
if (layer->nr_llc_cpus[llc_id])
layer_llc_drain_disable(layer, llc_id);
else if (scx_bpf_dsq_nr_queued(layer_dsq_id(layer->id, llc_id)))
layer_llc_drain_enable(layer, llc_id);
}
/* wake idle CPUs that now belong to the layer */
bpf_for(cpu, 0, nr_possible_cpus) {
if (!(cpuc = lookup_cpu_ctx(cpu)))
return 0;
if (cpuc_in_layer(cpuc, layer))
scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}
return 0;
}
/*
 * Syscall program: run refresh_cpumasks() for every layer id below
 * nr_layers. Always returns 0.
 */
SEC("syscall")
int BPF_PROG(refresh_layer_cpumasks)
{
	u32 layer_id;

	bpf_for(layer_id, 0, nr_layers) {
		refresh_cpumasks(layer_id);
	}

	return 0;
}
/*
 * Tag describing what a cached cpumask was computed from. The cache is
 * considered stale when either field differs from the requested value (see
 * should_refresh_cached_cpus()).
 */
struct cached_cpus {
s64 id;
u64 seq;
};
/* Per-task scheduler state, stored in the task_ctxs task-storage map. */
struct task_ctx {
int pid;
int last_cpu;
/* index into layers[]; MAX_LAYERS is treated as "no layer" by select_cpu */
u32 layer_id;
pid_t last_waker;
/* set when the task's layer assignment should be re-evaluated */
bool refresh_layer;
/* cached p->cpus_ptr & layer cpumask, plus its tag */
struct cached_cpus layered_cpus;
struct bpf_cpumask __kptr *layered_mask;
/* cached layered_mask & LLC cpumask, plus its tag */
struct cached_cpus layered_cpus_llc;
struct bpf_cpumask __kptr *layered_llc_mask;
/* cached layered_mask & node cpumask, plus its tag */
struct cached_cpus layered_cpus_node;
struct bpf_cpumask __kptr *layered_node_mask;
/* cached (unprotected_cpumask | layered mask) & p->cpus_ptr, plus its tag */
struct cached_cpus layered_cpus_unprotected;
struct bpf_cpumask __kptr *layered_unprotected_mask;
bool all_cpuset_allowed;
bool cpus_node_aligned;
u64 runnable_at;
u64 running_at;
/* amount charged to / uncharged from llc_ctx queued_runtime on enqueue */
u64 runtime_avg;
/* DSQ the task was last inserted into */
u64 dsq_id;
u32 llc_id;
/* layer/LLC whose queued_runtime currently carries this task's charge */
u32 qrt_layer_id;
u32 qrt_llc_id;
/* payload of the last SCXCMD_OP_JOIN command; zeroed by SCXCMD_OP_LEAVE */
char join_layer[SCXCMD_COMLEN];
/* compared against layer_refresh_seq_avgruntime in pick_idle_cpu() */
u64 layer_refresh_seq;
};
/* Task-local storage map holding one struct task_ctx per task. */
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct task_ctx);
} task_ctxs SEC(".maps");
/* Forward declaration; the definition is not in this part of the file. */
static void refresh_cpus_flags(struct task_ctx *taskc,
const struct cpumask *cpumask);
/*
 * Return @p's task_ctx from task_ctxs without creating one; NULL when the
 * task has no storage. No error is raised on a miss.
 */
static struct task_ctx *lookup_task_ctx_may_fail(struct task_struct *p)
{
	struct task_ctx *taskc = bpf_task_storage_get(&task_ctxs, p, 0, 0);

	return taskc;
}
static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
struct task_ctx *taskc = lookup_task_ctx_may_fail(p);
if (!taskc)
scx_bpf_error("task_ctx lookup failed");
return taskc;
}
/*
 * cgroup_attach_task tracepoint: flag @leader for a layer refresh and, when
 * @threadgroup is set, walk the leader's thread list and flag every thread
 * as well.
 *
 * The walk holds one task reference at a time in @next: the successor is
 * computed from next->thread_node.next, the current reference is released,
 * and the successor is re-acquired by pid. The loop is bounded by MAX_TASKS
 * iterations. Always returns 0.
 */
SEC("tp_btf/cgroup_attach_task")
int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path,
struct task_struct *leader, bool threadgroup)
{
struct list_head *thread_head;
struct task_struct *next;
struct task_ctx *taskc;
/* a leader without a task_ctx is silently ignored */
if (!(taskc = lookup_task_ctx_may_fail(leader)))
return 0;
taskc->refresh_layer = true;
if (!threadgroup)
return 0;
thread_head = &leader->signal->thread_head;
if (!(next = bpf_task_acquire(leader))) {
scx_bpf_error("failed to acquire leader");
return 0;
}
bpf_repeat(MAX_TASKS) {
struct task_struct *p;
int pid;
p = container_of(next->thread_node.next, struct task_struct, thread_node);
bpf_task_release(next);
/* wrapped around to the list head: all threads visited */
if (&p->thread_node == thread_head) {
next = NULL;
break;
}
pid = BPF_CORE_READ(p, pid);
next = bpf_task_from_pid(pid);
if (!next) {
bpf_printk("scx_layered: tp_cgroup_attach_task: thread iteration failed");
break;
}
/* unlike the leader, a thread without a task_ctx raises a scheduler error */
if ((taskc = lookup_task_ctx(next)))
taskc->refresh_layer = true;
}
/* still holding a reference if the loop ran out of iterations */
if (next)
bpf_task_release(next);
return 0;
}
/*
 * Interpret @cmd as an scx command and apply it to @taskc.
 *
 * Commands whose prefix is not SCXCMD_PREFIX are ignored. SCXCMD_OP_JOIN
 * copies the command payload into taskc->join_layer, SCXCMD_OP_LEAVE zeroes
 * it, and any other opcode is a no-op. Always returns 0.
 */
static int handle_cmd(struct task_ctx *taskc, struct scx_cmd *cmd)
{
	_Static_assert(sizeof(*cmd) == MAX_COMM, "scx_cmd has wrong size");

	if (cmd->prefix != SCXCMD_PREFIX)
		return 0;

	if (cmd->opcode == SCXCMD_OP_JOIN)
		__builtin_memcpy(taskc->join_layer, cmd->cmd, SCXCMD_COMLEN);
	else if (cmd->opcode == SCXCMD_OP_LEAVE)
		__builtin_memset(taskc->join_layer, 0, SCXCMD_COMLEN);

	return 0;
}
/*
 * task_rename tracepoint: flag @p for a layer refresh and hand the new name
 * in @buf to handle_cmd() so that it can be interpreted as an scx command.
 *
 * Returns 0 on success, -EINVAL when @p has no task_ctx or the new name
 * cannot be read.
 */
SEC("tp_btf/task_rename")
int BPF_PROG(tp_task_rename, struct task_struct *p, const char *buf)
{
	struct task_ctx *taskc;
	struct scx_cmd cmd;
	int ret;

	if (!(taskc = lookup_task_ctx_may_fail(p))) {
		bpf_printk("could not find task on rename");
		return -EINVAL;
	}

	taskc->refresh_layer = true;

	/*
	 * bpf_probe_read_str() stops at the terminating NUL and does not pad
	 * the rest of the destination, so clear the buffer first. Otherwise a
	 * short name would leave stale stack bytes in the fields that
	 * handle_cmd() inspects and copies.
	 */
	__builtin_memset(&cmd, 0, sizeof(cmd));

	ret = bpf_probe_read_str(&cmd, sizeof(cmd), buf);
	if (ret < 0) {
		bpf_printk("could not read new task name on rename");
		return -EINVAL;
	}

	handle_cmd(taskc, &cmd);
	return 0;
}
/*
 * Syscall program: record the tgid of the calling task in
 * layered_root_tgid. Returns -ENOENT when the current task cannot be
 * obtained, 0 otherwise.
 */
SEC("syscall")
int BPF_PROG(initialize_pid_namespace)
{
	struct task_struct *p = (struct task_struct*)bpf_get_current_task_btf();

	if (!p)
		return -ENOENT;

	layered_root_tgid = BPF_PROBE_READ(p, tgid);
	trace("CFG layered running with tgid: %d", layered_root_tgid);

	return 0;
}
/*
 * A cached mask is stale when its recorded id or sequence number differs
 * from the requested @id / @cpus_seq.
 */
static bool should_refresh_cached_cpus(struct cached_cpus *ccpus, s64 id, u64 cpus_seq)
{
	if (ccpus->id != id)
		return true;

	return ccpus->seq != cpus_seq;
}
/*
 * Recompute a cached mask as @cpus_a & @cpus_b into @mask and stamp
 * @ccpus with the @id and @cpus_seq it was computed for.
 *
 * If any of the three mask pointers is NULL a scheduler error is raised
 * and nothing is updated.
 */
static __always_inline
void refresh_cached_cpus(struct bpf_cpumask *mask,
			 struct cached_cpus *ccpus,
			 s64 id, u64 cpus_seq,
			 const struct cpumask *cpus_a,
			 const struct cpumask *cpus_b)
{
	if (mask && cpus_a && cpus_b) {
		bpf_cpumask_and(mask, cpus_a, cpus_b);
		ccpus->id = id;
		ccpus->seq = cpus_seq;
		return;
	}

	scx_bpf_error("NULL ccpus->mask or cpus_a/b");
}
/*
 * Refresh taskc->layered_mask (p->cpus_ptr & @layer_cpumask) if the cache
 * tag does not match id 0 and @cpus_seq.
 */
static void maybe_refresh_layered_cpus(struct task_struct *p, struct task_ctx *taskc,
				       const struct cpumask *layer_cpumask,
				       u64 cpus_seq)
{
	if (!should_refresh_cached_cpus(&taskc->layered_cpus, 0, cpus_seq))
		return;

	refresh_cached_cpus(taskc->layered_mask, &taskc->layered_cpus, 0, cpus_seq,
			    p->cpus_ptr, layer_cpumask);
	trace("%s[%d] layered cpumask refreshed to seq=%llu",
	      p->comm, p->pid, taskc->layered_cpus.seq);
}
/*
 * Refresh taskc->layered_llc_mask (layered_mask & the cpumask of LLC
 * @llc_id) if the cache tag does not match @llc_id and @cpus_seq.
 * @layer_cpumask is not used by this function.
 */
static void maybe_refresh_layered_cpus_llc(struct task_struct *p, struct task_ctx *taskc,
					   const struct cpumask *layer_cpumask,
					   s32 llc_id, u64 cpus_seq)
{
	struct llc_ctx *llcc;

	if (!should_refresh_cached_cpus(&taskc->layered_cpus_llc, llc_id, cpus_seq))
		return;

	llcc = lookup_llc_ctx(llc_id);
	if (!llcc)
		return;

	refresh_cached_cpus(taskc->layered_llc_mask,
			    &taskc->layered_cpus_llc, llc_id, cpus_seq,
			    cast_mask(taskc->layered_mask),
			    cast_mask(llcc->cpumask));
	trace("%s[%d] layered llc cpumask refreshed to llc=%d seq=%llu",
	      p->comm, p->pid, taskc->layered_cpus_llc.id, taskc->layered_cpus_llc.seq);
}
/*
 * Refresh taskc->layered_node_mask (layered_mask & the cpumask of node
 * @node_id) if the cache tag does not match @node_id and @cpus_seq.
 * @layer_cpumask is not used by this function.
 */
static void maybe_refresh_layered_cpus_node(struct task_struct *p, struct task_ctx *taskc,
					    const struct cpumask *layer_cpumask,
					    s32 node_id, u64 cpus_seq)
{
	struct node_ctx *nodec;

	if (!should_refresh_cached_cpus(&taskc->layered_cpus_node, node_id, cpus_seq))
		return;

	nodec = lookup_node_ctx(node_id);
	if (!nodec)
		return;

	refresh_cached_cpus(taskc->layered_node_mask,
			    &taskc->layered_cpus_node, node_id, cpus_seq,
			    cast_mask(taskc->layered_mask),
			    cast_mask(nodec->cpumask));
	trace("%s[%d] layered node cpumask refreshed to node=%d seq=%llu",
	      p->comm, p->pid, taskc->layered_cpus_node.id, taskc->layered_cpus_node.seq);
}
/*
 * Refresh taskc->layered_unprotected_mask, defined as
 * (unprotected_cpumask | @layer_cpumask) & p->cpus_ptr, if the cache tag
 * does not match id 0 and the current unprotected_seq.
 *
 * Silently does nothing when any of the involved masks is NULL.
 */
static void maybe_refresh_layered_cpus_unprotected(struct task_struct *p, struct task_ctx *taskc,
						   const struct cpumask *layer_cpumask)
{
	struct bpf_cpumask *dst = taskc->layered_unprotected_mask;
	u64 seq_now = READ_ONCE(unprotected_seq);

	if (!dst || !layer_cpumask || !unprotected_cpumask)
		return;

	if (!should_refresh_cached_cpus(&taskc->layered_cpus_unprotected, 0, seq_now))
		return;

	bpf_cpumask_or(dst, cast_mask(unprotected_cpumask), layer_cpumask);
	bpf_cpumask_and(dst, cast_mask(dst), p->cpus_ptr);
	taskc->layered_cpus_unprotected.id = 0;
	taskc->layered_cpus_unprotected.seq = seq_now;
	trace("%s[%d] layered allowed cpumask refreshed to seq=%llu",
	      p->comm, p->pid, taskc->layered_cpus_unprotected.seq);
}
/*
 * Pick and claim an idle CPU out of @cand_cpumask, preferring @prev_cpu.
 *
 * With SMT enabled the order is: @prev_cpu (unconditionally if the layer
 * sets prev_over_idle_core, otherwise only if it is in @idle_smtmask), then
 * any CPU from scx_bpf_pick_idle_cpu() with SCX_PICK_IDLE_CORE. If that
 * fails and the layer is exclusive, -EBUSY is returned. Otherwise any idle
 * CPU is tried, skipping CPUs whose sibling is running or about to run an
 * exclusive task when exclusive layers exist.
 *
 * Returns the CPU id (>= 0) on success or a negative value when no CPU was
 * found or an input mask is NULL.
 */
static s32 pick_idle_cpu_from(const struct cpumask *cand_cpumask, s32 prev_cpu,
const struct cpumask *idle_smtmask, const struct layer *layer)
{
bool prev_in_cand;
s32 i, cpu = -1;
if (unlikely(!cand_cpumask || !idle_smtmask))
return -1;
prev_in_cand = bpf_cpumask_test_cpu(prev_cpu, cand_cpumask);
if (smt_enabled) {
/* layer prefers prev_cpu even over a fully idle core */
if (prev_in_cand &&
layer->prev_over_idle_core) {
if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
return prev_cpu;
/* prev_cpu was not idle; don't test it again below */
prev_in_cand = false;
}
/* prev_cpu is a candidate and is set in idle_smtmask */
if (prev_in_cand && bpf_cpumask_test_cpu(prev_cpu, idle_smtmask)) {
if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
return prev_cpu;
prev_in_cand = false;
}
cpu = scx_bpf_pick_idle_cpu(cand_cpumask, SCX_PICK_IDLE_CORE);
if (cpu >= 0)
return cpu;
/* exclusive layers do not fall back to the any-idle-CPU search */
if (nr_excl_layers && layer->excl)
return -EBUSY;
}
/*
 * Any idle CPU. Each iteration claims one CPU; the loop only continues
 * when the claimed CPU's sibling has current_excl or next_excl set.
 */
bpf_for(i, 0, nr_cpu_ids) {
struct cpu_ctx *sib_cpuc;
s32 sib;
if (prev_in_cand && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
cpu = prev_cpu;
prev_in_cand = false;
} else {
cpu = scx_bpf_pick_idle_cpu(cand_cpumask, 0);
if (cpu < 0)
break;
}
if (!nr_excl_layers ||
(sib = sibling_cpu(cpu)) < 0 || !(sib_cpuc = lookup_cpu_ctx(sib)) ||
(!sib_cpuc->current_excl && !sib_cpuc->next_excl))
break;
}
return cpu;
}
/*
 * Decide whether a task of @layer should attempt to preempt @cand before
 * looking for an idle CPU.
 *
 * Requires the layer to have both preempt and preempt_first set. Returns
 * false when a confined layer's task cannot use @cand, when @cand is
 * already running a preempting task (or its context cannot be found), or
 * when the layer is exclusive and @cand's sibling is running a preempting
 * task (or the sibling's context cannot be found).
 */
static __always_inline
bool should_try_preempt_first(s32 cand, struct layer *layer,
			      const struct cpumask *layered_cpumask)
{
	struct cpu_ctx *cand_cpuc, *sib_cpuc;
	s32 sib;

	if (!layer->preempt || !layer->preempt_first)
		return false;

	if (layer->kind == LAYER_KIND_CONFINED &&
	    !bpf_cpumask_test_cpu(cand, layered_cpumask))
		return false;

	cand_cpuc = lookup_cpu_ctx(cand);
	if (!cand_cpuc || cand_cpuc->current_preempt)
		return false;

	/* the sibling only matters for exclusive layers */
	if (!nr_excl_layers || !layer->excl)
		return true;

	sib = sibling_cpu(cand);
	if (sib < 0)
		return true;

	sib_cpuc = lookup_cpu_ctx(sib);
	return sib_cpuc && !sib_cpuc->current_preempt;
}
/*
 * Idle-CPU selection for layers whose growth algorithm distinguishes big
 * and little cores.
 *
 * GROWTH_ALGO_BIG_LITTLE searches layered_mask & big_cpumask;
 * GROWTH_ALGO_LITTLE_BIG searches layered_mask & ~big_cpumask. Any other
 * growth algorithm yields -1. A temporary cpumask is allocated for the
 * computation and released before returning.
 *
 * Returns the result of pick_idle_cpu_from(), or -1 when the system has no
 * little cores, a required mask is missing, or allocation fails.
 *
 * NOTE(review): the repeated NULL checks on the same pointers look
 * redundant; presumably they are there for the BPF verifier -- confirm
 * before removing.
 */
static __always_inline
s32 pick_idle_big_little(struct layer *layer, struct task_ctx *taskc,
const struct cpumask *idle_smtmask, s32 prev_cpu)
{
s32 cpu = -1;
if (!has_little_cores || !big_cpumask)
return cpu;
struct bpf_cpumask *tmp_cpumask;
if (!taskc->layered_mask || !big_cpumask)
return cpu;
/* scratch mask; allocated regardless of the layer's growth algorithm */
if (!(tmp_cpumask = bpf_cpumask_create()))
return cpu;
switch (layer->growth_algo) {
case GROWTH_ALGO_BIG_LITTLE: {
if (!taskc->layered_mask || !big_cpumask)
goto out_put;
bpf_cpumask_and(tmp_cpumask, cast_mask(taskc->layered_mask),
cast_mask(big_cpumask));
cpu = pick_idle_cpu_from(cast_mask(tmp_cpumask),
prev_cpu, idle_smtmask, layer);
goto out_put;
}
case GROWTH_ALGO_LITTLE_BIG: {
/* all-ones XOR big_cpumask == complement of big_cpumask */
bpf_cpumask_setall(tmp_cpumask);
if (!tmp_cpumask || !big_cpumask)
goto out_put;
bpf_cpumask_xor(tmp_cpumask, cast_mask(big_cpumask),
cast_mask(tmp_cpumask));
if (!tmp_cpumask || !taskc->layered_mask)
goto out_put;
bpf_cpumask_and(tmp_cpumask, cast_mask(taskc->layered_mask),
cast_mask(tmp_cpumask));
cpu = pick_idle_cpu_from(cast_mask(tmp_cpumask),
prev_cpu, idle_smtmask, layer);
goto out_put;
}
default:
goto out_put;
}
out_put:
bpf_cpumask_release(tmp_cpumask);
return cpu;
}
/*
 * Find and claim an idle CPU for @p, a task of @layer.
 *
 * @prev_cpu:    CPU the task last ran on; preferred where possible.
 * @cpuc:        context of the CPU running this program (stats, flags).
 * @taskc:       @p's task context, holding the cached cpumasks.
 * @from_selcpu: true when called from select_cpu; enables the
 *               preempt-first short-circuit.
 *
 * Search order for non-floating tasks: big/little selection, the LLC of
 * @prev_cpu, the node of @prev_cpu, the whole layered mask and finally, for
 * non-confined layers, the unprotected CPUs. Floating tasks start at the
 * node step.
 *
 * layer->check_no_idle caches the "no idle CPU" outcome so that a saturated
 * layer can bail out after a cheap mask intersection test.
 *
 * Returns the claimed CPU (>= 0) or -1.
 */
static __always_inline
s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu,
		  struct cpu_ctx *cpuc, struct task_ctx *taskc, struct layer *layer,
		  bool from_selcpu)
{
	const struct cpumask *idle_smtmask, *layer_cpumask, *layered_cpumask, *cpumask;
	bool is_float = layer->task_place == PLACEMENT_FLOAT;
	struct bpf_cpumask *unprot_mask;
	struct cpu_ctx *prev_cpuc;
	u32 layer_id = layer->id;
	u64 cpus_seq;
	s32 cpu;

	if (layer_id >= MAX_LAYERS || !(layer_cpumask = lookup_layer_cpumask(layer_id)))
		return -1;

	if (layer->periodically_refresh && taskc->layer_refresh_seq < layer_refresh_seq_avgruntime)
		taskc->refresh_layer = true;

	/* tasks pinned to a single CPU can only use prev_cpu */
	if (!is_float && (p->nr_cpus_allowed == 1 || is_migration_disabled(p))) {
		if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
			if (layer->kind == LAYER_KIND_CONFINED &&
			    !bpf_cpumask_test_cpu(prev_cpu, layer_cpumask))
				lstat_inc(LSTAT_AFFN_VIOL, layer, cpuc);
			return prev_cpu;
		} else {
			return -1;
		}
	}

	/*
	 * Use the sequence number of the task's own layer. Reading
	 * layers->cpus_seq would always pick up layer 0's counter and leave
	 * the cached masks of tasks in other layers stale after their layer's
	 * cpumask changes.
	 */
	cpus_seq = READ_ONCE(layer->cpus_seq);
	maybe_refresh_layered_cpus(p, taskc, layer_cpumask, cpus_seq);
	if (!(layered_cpumask = cast_mask(taskc->layered_mask)))
		return -1;

	if (from_selcpu && should_try_preempt_first(prev_cpu, layer, layered_cpumask)) {
		cpuc->try_preempt_first = true;
		return -1;
	}

	/* the previous search failed: test for any eligible idle CPU first */
	if (READ_ONCE(layer->check_no_idle)) {
		bool has_idle;

		cpumask = scx_bpf_get_idle_cpumask();

		if (layer->kind == LAYER_KIND_CONFINED) {
			has_idle = bpf_cpumask_intersects(layered_cpumask, cpumask);
		} else {
			maybe_refresh_layered_cpus_unprotected(p, taskc, layered_cpumask);
			unprot_mask = taskc->layered_unprotected_mask;
			if (!unprot_mask)
				unprot_mask = unprotected_cpumask;
			if (unlikely(!unprot_mask)) {
				scx_bpf_error("unprotected_cpumask not initialized");
				scx_bpf_put_idle_cpumask(cpumask);
				return -1;
			}
			has_idle = bpf_cpumask_intersects(cast_mask(unprot_mask), cpumask);
		}

		scx_bpf_put_idle_cpumask(cpumask);
		if (!has_idle)
			return -1;
	}

	/* prev_cpuc is only needed (and only set) with multiple LLCs or nodes */
	if ((nr_llcs > 1 || nr_nodes > 1) &&
	    !(prev_cpuc = lookup_cpu_ctx(prev_cpu)))
		return -1;
	if (!(idle_smtmask = scx_bpf_get_idle_smtmask()))
		return -1;

	if (is_float)
		goto no_locality;

	cpu = pick_idle_big_little(layer, taskc, idle_smtmask, prev_cpu);
	if (cpu >=0)
		goto out_put;

	/* try the LLC of prev_cpu */
	if (nr_llcs > 1) {
		struct llc_ctx *prev_llcc;

		maybe_refresh_layered_cpus_llc(p, taskc, layer_cpumask,
					       prev_cpuc->llc_id, cpus_seq);
		if (!(cpumask = cast_mask(taskc->layered_llc_mask))) {
			cpu = -1;
			goto out_put;
		}
		if ((cpu = pick_idle_cpu_from(cpumask, prev_cpu, idle_smtmask, layer)) >= 0)
			goto out_put;

		/* stay in the LLC unless enough runtime is queued there */
		if (!(prev_llcc = lookup_llc_ctx(prev_cpuc->llc_id)) ||
		    prev_llcc->queued_runtime[layer_id] < layer->xllc_mig_min_ns) {
			lstat_inc(LSTAT_XLLC_MIGRATION_SKIP, layer, cpuc);
			cpu = -1;
			goto out_put;
		}
	}

no_locality:
	/* try the node of prev_cpu */
	if (nr_nodes > 1) {
		maybe_refresh_layered_cpus_node(p, taskc, layer_cpumask,
						prev_cpuc->node_id, cpus_seq);
		if (!(cpumask = cast_mask(taskc->layered_node_mask))) {
			cpu = -1;
			goto out_put;
		}
		if ((cpu = pick_idle_cpu_from(cpumask, prev_cpu, idle_smtmask, layer)) >= 0)
			goto out_put;
	}

	/* anywhere in the layer */
	if ((cpu = pick_idle_cpu_from(layered_cpumask, prev_cpu, idle_smtmask, layer)) >= 0)
		goto out_put;

	/* non-confined layers may spill onto unprotected CPUs */
	if (layer->kind != LAYER_KIND_CONFINED) {
		maybe_refresh_layered_cpus_unprotected(p, taskc, layered_cpumask);
		unprot_mask = taskc->layered_unprotected_mask;
		if (!unprot_mask)
			unprot_mask = unprotected_cpumask;
		/* a NULL mask is rejected inside pick_idle_cpu_from() */
		if ((cpu = pick_idle_cpu_from(cast_mask(unprot_mask), prev_cpu, idle_smtmask, layer)) >= 0) {
			lstat_inc(LSTAT_OPEN_IDLE, layer, cpuc);
			goto out_put;
		}
	}

	cpu = -1;

out_put:
	/* remember the outcome so that the next call can short-circuit */
	if (cpu >= 0) {
		if (READ_ONCE(layer->check_no_idle))
			WRITE_ONCE(layer->check_no_idle, false);
	} else if (taskc->all_cpuset_allowed) {
		if (!READ_ONCE(layer->check_no_idle))
			WRITE_ONCE(layer->check_no_idle, true);
	}
	scx_bpf_put_idle_cpumask(idle_smtmask);
	return cpu;
}
/*
 * Move @p's LLC association to the LLC of @new_cpu if it differs from the
 * one recorded in @taskc.
 *
 * The task's dsq_vtime is rebased so that its offset from the old LLC's
 * vtime_now for the task's layer is preserved relative to the new LLC's
 * vtime_now.
 *
 * Returns true when the association was updated, false when nothing changed
 * or a lookup failed.
 */
static __always_inline
bool maybe_update_task_llc(struct task_struct *p, struct task_ctx *taskc, s32 new_cpu)
{
	struct llc_ctx *from_llcc, *to_llcc;
	u32 target_llc = cpu_to_llc_id(new_cpu);
	u32 lid;
	s64 lag;

	if (target_llc == taskc->llc_id)
		return false;

	lid = taskc->layer_id;
	if (lid >= MAX_LAYERS)
		return false;

	from_llcc = lookup_llc_ctx(taskc->llc_id);
	if (!from_llcc)
		return false;

	to_llcc = lookup_llc_ctx(target_llc);
	if (!to_llcc)
		return false;

	lag = p->scx.dsq_vtime - from_llcc->vtime_now[lid];
	p->scx.dsq_vtime = to_llcc->vtime_now[lid] + lag;
	taskc->llc_id = target_llc;

	return true;
}
/*
 * select_cpu callback.
 *
 * Layers with PLACEMENT_STICK keep the task on @prev_cpu; all others go
 * through pick_idle_cpu(). When a CPU is chosen the task is inserted
 * directly into SCX_DSQ_LOCAL with the layer's slice and that CPU is
 * returned. In every other case @prev_cpu is returned and nothing is
 * inserted.
 */
s32 BPF_STRUCT_OPS(layered_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
	struct cpu_ctx *cpuc;
	struct task_ctx *taskc;
	struct layer *layer;
	s32 cpu;

	cpuc = lookup_cpu_ctx(-1);
	if (!cpuc)
		return prev_cpu;

	taskc = lookup_task_ctx(p);
	if (!taskc)
		return prev_cpu;

	/* MAX_LAYERS is skipped without raising an error */
	if (taskc->layer_id == MAX_LAYERS)
		return prev_cpu;

	layer = lookup_layer(taskc->layer_id);
	if (!layer)
		return prev_cpu;

	cpu = layer->task_place == PLACEMENT_STICK ?
		prev_cpu : pick_idle_cpu(p, prev_cpu, cpuc, taskc, layer, true);
	if (cpu < 0)
		return prev_cpu;

	lstat_inc(LSTAT_SEL_LOCAL, layer, cpuc);
	taskc->dsq_id = SCX_DSQ_LOCAL;
	scx_bpf_dsq_insert(p, taskc->dsq_id, layer->slice_ns, 0);

	return cpu;
}
/* Flags accepted by try_preempt_cpu(). */
enum preempt_flags {
/* the attempt is a preempt-first attempt; only affects stat accounting */
PREEMPT_FIRST = 1LLU << 0,
/* skip the exclusive-sibling collision check */
PREEMPT_IGNORE_EXCL = 1LLU << 1,
};
/*
 * Try to place @p on CPU @cand by preempting whatever runs there.
 *
 * The attempt is refused when @cand is out of range or not allowed for @p,
 * when @cand already runs a preempting task, when its current task belongs
 * to a sched class other than ext/idle (if both class addresses are known),
 * when the CPU protects its owned task from preemption, or when @layer is
 * exclusive and the sibling runs a preempting task (unless
 * PREEMPT_IGNORE_EXCL is set in @flags).
 *
 * On success @p is inserted into @cand's local DSQ with SCX_ENQ_PREEMPT and
 * true is returned; otherwise false.
 */
static bool try_preempt_cpu(s32 cand, struct task_struct *p, struct task_ctx *taskc,
struct layer *layer, u64 flags)
{
struct cpu_ctx *cpuc, *cand_cpuc, *sib_cpuc = NULL;
struct rq *rq;
struct task_struct *curr;
const struct cpumask *idle_cpumask;
s32 sib;
if (cand >= nr_possible_cpus || !bpf_cpumask_test_cpu(cand, p->cpus_ptr))
return false;
if (!(cand_cpuc = lookup_cpu_ctx(cand)))
return false;
if (cand_cpuc->current_preempt)
return false;
rq = scx_bpf_cpu_rq(cand);
if (!rq)
return false;
curr = rq->curr;
/* don't preempt tasks of sched classes other than ext and idle */
if (ext_sched_class_addr && idle_sched_class_addr &&
((u64)curr->sched_class != ext_sched_class_addr) &&
((u64)curr->sched_class != idle_sched_class_addr)) {
if (!(cpuc = lookup_cpu_ctx(-1)))
return false;
gstat_inc(GSTAT_SKIP_PREEMPT, cpuc);
return false;
}
/* owned-task protection, except open layer vs. running open task */
if (cand_cpuc->protect_owned_preempt && cand_cpuc->running_owned &&
!(layer->kind == LAYER_KIND_OPEN && cand_cpuc->running_open))
return false;
/* sib_cpuc stays NULL unless this exclusive-layer check looks it up */
if (nr_excl_layers && !(flags & PREEMPT_IGNORE_EXCL) &&
layer->excl && (sib = sibling_cpu(cand)) >= 0 &&
(!(sib_cpuc = lookup_cpu_ctx(sib)) || sib_cpuc->current_preempt)) {
if (!(cpuc = lookup_cpu_ctx(-1)))
return false;
lstat_inc(LSTAT_EXCL_COLLISION, layer, cpuc);
return false;
}
taskc->dsq_id = SCX_DSQ_LOCAL_ON | cand;
scx_bpf_dsq_insert(p, taskc->dsq_id, layer->slice_ns, SCX_ENQ_PREEMPT);
/* the task is already inserted; the rest is kicks and stats */
if (!(cpuc = lookup_cpu_ctx(-1)))
return true;
idle_cpumask = scx_bpf_get_idle_cpumask();
if (nr_excl_layers && sib_cpuc &&
!bpf_cpumask_test_cpu(sib_cpuc->cpu, idle_cpumask)) {
lstat_inc(LSTAT_EXCL_PREEMPT, layer, cpuc);
/*
 * NOTE(review): next_excl is set on the ctx of the CPU running this
 * program (cpuc), not on cand_cpuc or sib_cpuc -- confirm intended.
 */
cpuc->next_excl = true;
scx_bpf_kick_cpu(sib, SCX_KICK_PREEMPT);
}
if (!bpf_cpumask_test_cpu(cand_cpuc->cpu, idle_cpumask)) {
lstat_inc(LSTAT_PREEMPT, layer, cpuc);
if (flags & PREEMPT_FIRST)
lstat_inc(LSTAT_PREEMPT_FIRST, layer, cpuc);
} else {
lstat_inc(LSTAT_PREEMPT_IDLE, layer, cpuc);
}
scx_bpf_put_idle_cpumask(idle_cpumask);
return true;
}
/*
 * Remove the task's runtime_avg from the queued_runtime counter it is
 * currently charged to (identified by qrt_layer_id / qrt_llc_id) and mark
 * the task as not charged by resetting both ids to their MAX_* sentinels.
 *
 * Does nothing when qrt_layer_id is out of range or the LLC lookup fails.
 */
static void task_uncharge_qrt(struct task_ctx *taskc)
{
	u32 charged_layer = taskc->qrt_layer_id;
	struct llc_ctx *llcc;

	if (charged_layer >= MAX_LAYERS)
		return;

	llcc = lookup_llc_ctx(taskc->qrt_llc_id);
	if (!llcc)
		return;

	__sync_fetch_and_sub(&llcc->queued_runtime[charged_layer], taskc->runtime_avg);

	taskc->qrt_layer_id = MAX_LAYERS;
	taskc->qrt_llc_id = MAX_LLCS;
}
/*
 * Pick one idle CPU from @layer's cpumask (using CPU 0 as the previous-CPU
 * hint) and kick it with SCX_KICK_IDLE. Does nothing when the layer's
 * cpumask or the idle SMT mask cannot be obtained.
 */
static void layer_kick_idle_cpu(struct layer *layer)
{
	const struct cpumask *layer_cpumask, *idle_smtmask;
	s32 target;

	layer_cpumask = lookup_layer_cpumask(layer->id);
	if (!layer_cpumask)
		return;

	idle_smtmask = scx_bpf_get_idle_smtmask();
	if (!idle_smtmask)
		return;

	target = pick_idle_cpu_from(layer_cpumask, 0, idle_smtmask, layer);
	if (target >= 0)
		scx_bpf_kick_cpu(target, SCX_KICK_IDLE);

	scx_bpf_put_idle_cpumask(idle_smtmask);
}
/*
 * enqueue callback. Decides which DSQ @p goes to, in this order:
 *
 *  1. preempt-first attempt on the task's CPU, if select_cpu requested it;
 *  2. direct dispatch to an idle CPU, if select_cpu was skipped or
 *     preempt-first was requested;
 *  3. preemption of the task's CPU or of CPUs from its proximity map, for
 *     preempting layers;
 *  4. local / high-priority fallback DSQ for affinitized kthreads and the
 *     scheduler's own tasks;
 *  5. low-priority fallback DSQ for tasks whose affinity does not fit the
 *     layer, or when the layer has no CPUs;
 *  6. the layer's per-LLC DSQ (FIFO or vtime ordered).
 */
void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
{
struct cpu_ctx *cpuc, *task_cpuc;
struct task_ctx *taskc;
struct llc_ctx *llcc;
struct layer *layer;
bool wakeup = enq_flags & SCX_ENQ_WAKEUP;
s32 cpu, task_cpu = scx_bpf_task_cpu(p);
u32 llc_id, layer_id;
bool yielding, try_preempt_first;
u64 queued_runtime;
u64 *lstats;
if (!(cpuc = lookup_cpu_ctx(-1)) || !(taskc = lookup_task_ctx(p)))
return;
layer_id = taskc->layer_id;
if (!(layer = lookup_layer(layer_id)))
return;
/* classify the enqueue for stats */
if (enq_flags & SCX_ENQ_REENQ) {
lstat_inc(LSTAT_ENQ_REENQ, layer, cpuc);
} else {
if (wakeup)
lstat_inc(LSTAT_ENQ_WAKEUP, layer, cpuc);
else
lstat_inc(LSTAT_ENQ_EXPIRE, layer, cpuc);
}
/* consume the per-CPU yielding flag */
yielding = cpuc->yielding;
if (yielding) {
lstat_inc(LSTAT_YIELD, layer, cpuc);
cpuc->yielding = false;
}
/* consume the preempt-first request left by pick_idle_cpu() */
try_preempt_first = cpuc->try_preempt_first;
cpuc->try_preempt_first = false;
if (try_preempt_first && wakeup && !yielding &&
try_preempt_cpu(task_cpu, p, taskc, layer, PREEMPT_FIRST))
return;
/* no CPU was selected by select_cpu, or it deferred to preempt-first */
if (!__COMPAT_is_enq_cpu_selected(enq_flags) || try_preempt_first) {
cpu = pick_idle_cpu(p, task_cpu, cpuc, taskc, layer, false);
if (cpu >= 0) {
lstat_inc(LSTAT_ENQ_LOCAL, layer, cpuc);
taskc->dsq_id = SCX_DSQ_LOCAL_ON | cpu;
scx_bpf_dsq_insert(p, taskc->dsq_id, layer->slice_ns, 0);
return;
}
}
if (!(task_cpuc = lookup_cpu_ctx(task_cpu)))
return;
if (layer->preempt && !yielding) {
/* the task's own CPU first, unless preempt-first already tried it */
if (!try_preempt_first && wakeup &&
try_preempt_cpu(task_cpu, p, taskc, layer, 0))
return;
if (p->nr_cpus_allowed > 1) {
struct cpu_prox_map *pmap = &task_cpuc->prox_map;
/* walk the proximity map starting at index 1, up to sys_end */
bpf_for(cpu, 1, MAX_CPUS) {
if (cpu >= pmap->sys_end)
break;
u16 *cpu_p = MEMBER_VPTR(pmap->cpus, [cpu]);
if (cpu_p && try_preempt_cpu(*cpu_p, p, taskc, layer, 0))
return;
}
/* exclusive layers retry from index 0, ignoring sibling collisions */
if (nr_excl_layers && layer->excl) {
bpf_for(cpu, 0, MAX_CPUS) {
if (cpu >= pmap->sys_end)
break;
u16 *cpu_p = MEMBER_VPTR(pmap->cpus, [cpu]);
if (cpu_p && try_preempt_cpu(*cpu_p, p, taskc, layer,
PREEMPT_IGNORE_EXCL))
return;
}
}
}
lstat_inc(LSTAT_PREEMPT_FAIL, layer, cpuc);
}
/* affinitized kthreads and the scheduler's own tasks bypass layer DSQs */
if (((p->flags & PF_KTHREAD) && p->nr_cpus_allowed < nr_possible_cpus) ||
is_scheduler_task(p)) {
struct cpumask *layer_cpumask;
if (layer->kind == LAYER_KIND_CONFINED &&
(layer_cpumask = lookup_layer_cpumask(taskc->layer_id)) &&
!bpf_cpumask_test_cpu(task_cpu, layer_cpumask))
lstat_inc(LSTAT_AFFN_VIOL, layer, cpuc);
if (p->nr_cpus_allowed == 1) {
taskc->dsq_id = SCX_DSQ_LOCAL;
if (is_percpu_kthread_preempting(p))
enq_flags |= SCX_ENQ_PREEMPT;
} else {
taskc->dsq_id = task_cpuc->hi_fb_dsq_id;
}
scx_bpf_dsq_insert(p, taskc->dsq_id, layer->slice_ns, enq_flags);
return;
}
llc_id = task_cpuc->llc_id;
if (llc_id >= MAX_LLCS || !(llcc = lookup_llc_ctx(llc_id)))
return;
maybe_update_task_llc(p, taskc, task_cpu);
/* clamp vtime to [vtime_now - slice, vtime_now + 8192 * slice] */
u64 vtime = p->scx.dsq_vtime;
u64 vtime_now = llcc->vtime_now[layer_id];
u64 vtime_min = vtime_now - layer->slice_ns;
u64 vtime_max = vtime_now + 8192 * layer->slice_ns;
if (time_before(vtime, vtime_min))
vtime = vtime_min;
if (unlikely(time_after(vtime, vtime_max))) {
gstat_inc(GSTAT_FIXUP_VTIME, cpuc);
vtime = vtime_max;
}
/* affinity doesn't fit the layer, or the layer has no CPUs: lo fallback */
if ((!taskc->all_cpuset_allowed &&
!(layer->allow_node_aligned && taskc->cpus_node_aligned)) ||
!layer->nr_cpus) {
taskc->dsq_id = task_cpuc->lo_fb_dsq_id;
/* bump lo_fb_seq when the DSQ goes from empty to non-empty */
if (!scx_bpf_dsq_nr_queued(taskc->dsq_id))
llcc->lo_fb_seq++;
scx_bpf_dsq_insert(p, taskc->dsq_id, layer->slice_ns, enq_flags);
return;
}
/* move the task's queued-runtime charge to this layer/LLC */
task_uncharge_qrt(taskc);
taskc->qrt_layer_id = layer_id;
taskc->qrt_llc_id = llc_id;
queued_runtime = __sync_fetch_and_add(&llcc->queued_runtime[layer_id],
taskc->runtime_avg);
queued_runtime += taskc->runtime_avg;
/* decaying average of queued runtime, weighted by LAYER_LAT_DECAY_FACTOR */
lstats = llcc->lstats[layer_id];
lstats[LLC_LSTAT_LAT] =
((LAYER_LAT_DECAY_FACTOR - 1) * lstats[LLC_LSTAT_LAT] + queued_runtime) /
LAYER_LAT_DECAY_FACTOR;
lstats[LLC_LSTAT_CNT]++;
taskc->dsq_id = layer_dsq_id(layer_id, llc_id);
if (layer->fifo)
scx_bpf_dsq_insert(p, taskc->dsq_id, layer->slice_ns, enq_flags);
else
scx_bpf_dsq_insert_vtime(p, taskc->dsq_id, layer->slice_ns, vtime, enq_flags);
lstat_inc(LSTAT_ENQ_DSQ, layer, cpuc);
/* the layer has no CPU in this LLC: mark it for draining and kick an idle CPU */
if (!layer->nr_llc_cpus[llc_id]) {
layer_llc_drain_enable(layer, llc_id);
layer_kick_idle_cpu(layer);
}
}
/*
 * Charge the time elapsed since cpuc->used_at to the CPU's usage counters.
 *
 * @cpuc:  context of the CPU the time is charged to
 * @taskc: context of the task that consumed the time
 * @now:   current timestamp; becomes the new cpuc->used_at
 *
 * Does nothing when no time has elapsed. Raises an scx error and bails out
 * when the task's layer id is out of range.
 */
static void account_used(struct cpu_ctx *cpuc, struct task_ctx *taskc, u64 now)
{
	u64 elapsed = now - cpuc->used_at;
	s32 lid;

	/* already accounted up to @now */
	if (!elapsed)
		return;

	lid = taskc->layer_id;
	if (unlikely(lid >= nr_layers)) {
		scx_bpf_error("invalid layer %d", lid);
		return;
	}

	cpuc->used_at = now;
	cpuc->usage += elapsed;

	/* per-layer breakdown: open execution vs. owned (and protected) */
	if (!cpuc->running_owned) {
		cpuc->layer_usages[lid][LAYER_USAGE_OPEN] += elapsed;
	} else {
		cpuc->layer_usages[lid][LAYER_USAGE_OWNED] += elapsed;
		if (cpuc->protect_owned)
			cpuc->layer_usages[lid][LAYER_USAGE_PROTECTED] += elapsed;
		if (cpuc->protect_owned_preempt)
			cpuc->layer_usages[lid][LAYER_USAGE_PROTECTED_PREEMPT] += elapsed;
	}

	/* fallback DSQ usage, keyed off the DSQ the task was dispatched from */
	if (taskc->dsq_id & HI_FB_DSQ_BASE)
		gstat_add(GSTAT_HI_FB_USAGE, cpuc, elapsed);
	else if (taskc->dsq_id & LO_FB_DSQ_BASE)
		gstat_add(GSTAT_LO_FB_USAGE, cpuc, elapsed);

	if (cpuc->running_fallback)
		gstat_add(GSTAT_FB_CPU_USAGE, cpuc, elapsed);
}
/*
 * Decide whether @p may keep running on this CPU instead of being switched
 * out.
 *
 * Returns true after refilling p->scx.slice with layer->slice_ns when @p may
 * continue. Returns false otherwise; every path through the "no" label also
 * resets cpuc->ran_current_for.
 */
static bool keep_running(struct cpu_ctx *cpuc, struct task_struct *p,
struct task_ctx *taskc, struct layer *layer)
{
/* never continue a yielding task, or when max_exec_ns is zero */
if (cpuc->yielding || !max_exec_ns)
goto no;
/* @p is not queued, i.e. it does not want to keep running */
if (!(p->scx.flags & SCX_TASK_QUEUED))
goto no;
/* tasks dispatched from the low fallback DSQ do not get to continue */
if (taskc->dsq_id & LO_FB_DSQ_BASE)
goto no;
/* tasks are waiting in this CPU's high fallback DSQ */
if (scx_bpf_dsq_nr_queued(cpuc->hi_fb_dsq_id))
goto no;
/* charge one more slice and enforce the max_exec_ns budget */
cpuc->ran_current_for += layer->slice_ns;
if (cpuc->ran_current_for > max_exec_ns) {
lstat_inc(LSTAT_KEEP_FAIL_MAX_EXEC, layer, cpuc);
goto no;
}
if (layer->preempt) {
/* preempting layer: continue only if the layer's DSQ on this LLC is empty */
u64 dsq_id = layer_dsq_id(layer->id, cpuc->llc_id);
if (!scx_bpf_dsq_nr_queued(dsq_id)) {
p->scx.slice = layer->slice_ns;
lstat_inc(LSTAT_KEEP, layer, cpuc);
return true;
}
} else {
/* non-preempting layer: continue only if some CPU is idle */
const struct cpumask *idle_cpumask = scx_bpf_get_idle_cpumask();
bool has_idle = false;
if (layer->kind != LAYER_KIND_CONFINED) {
/* any idle CPU in the system counts */
has_idle = !bpf_cpumask_empty(idle_cpumask);
} else {
/* confined layer: only idle CPUs inside the layer's cpumask count */
struct cpumask *layer_cpumask;
if ((layer_cpumask = lookup_layer_cpumask(layer->id)))
has_idle = bpf_cpumask_intersects(idle_cpumask,
layer_cpumask);
}
/* paired with scx_bpf_get_idle_cpumask() above */
scx_bpf_put_idle_cpumask(idle_cpumask);
if (has_idle) {
p->scx.slice = layer->slice_ns;
lstat_inc(LSTAT_KEEP, layer, cpuc);
return true;
}
}
lstat_inc(LSTAT_KEEP_FAIL_BUSY, layer, cpuc);
no:
cpuc->ran_current_for = 0;
return false;
}
/*
 * Per-CPU, single-entry map holding the id of the DSQ this CPU has been asked
 * to consume from by the antistall logic. antistall_set() stores a DSQ id and
 * antistall_consume() reads it; SCX_DSQ_INVALID means nothing is flagged.
 */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, 1);
} antistall_cpu_dsq SEC(".maps");
/*
 * Per-CPU, single-entry map holding the delay (in seconds, as computed by
 * get_delay_sec()) that antistall_set() recorded when it last flagged a DSQ
 * for this CPU.
 */
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, 1);
} antistall_cpu_max_delay SEC(".maps");
/*
 * Return how long @p has been runnable, in whole seconds.
 *
 * @p:           task whose p->scx.runnable_at (in jiffies) is inspected
 * @jiffies_now: current jiffies value to compare against
 *
 * Returns 0 when runnable_at is not before @jiffies_now.
 */
int get_delay_sec(struct task_struct *p, u64 jiffies_now)
{
	u64 queued_since = READ_ONCE(p->scx.runnable_at);

	if (!time_before(queued_since, jiffies_now))
		return 0;

	/* jiffies -> seconds */
	return (jiffies_now - queued_since) / CONFIG_HZ;
}
/*
 * Try to run a task from the DSQ that the antistall logic flagged for the
 * current CPU.
 *
 * Returns true when a task was moved to the local DSQ. The flag is cleared
 * (set to SCX_DSQ_INVALID) unless a task was consumed and the task now at the
 * head of the DSQ is still delayed by more than antistall_sec.
 */
bool antistall_consume(struct cpu_ctx *cpuc)
{
u64 *antistall_dsq, jiffies_now, cur_delay;
bool consumed;
struct task_struct *p;
cur_delay = 0;
consumed = false;
if (!enable_antistall || !cpuc)
return false;
/* this CPU's slot of the per-CPU map */
antistall_dsq = bpf_map_lookup_elem(&antistall_cpu_dsq, &zero_u32);
if (!antistall_dsq) {
scx_bpf_error("cant happen");
return false;
}
/* nothing flagged for this CPU */
if (*antistall_dsq == SCX_DSQ_INVALID)
return false;
consumed = scx_bpf_dsq_move_to_local(*antistall_dsq);
/* the flagged DSQ yielded nothing: drop the flag */
if (!consumed)
goto reset;
gstat_inc(GSTAT_ANTISTALL, cpuc);
jiffies_now = bpf_jiffies64();
/* only the first task of the DSQ is inspected; both branches leave the loop */
bpf_for_each(scx_dsq, p, *antistall_dsq, 0) {
cur_delay = get_delay_sec(p, jiffies_now);
/* still stalled: keep the flag so this CPU comes back to the DSQ */
if (cur_delay > antistall_sec) {
return consumed;
}
goto reset;
}
reset:
trace("antistall reset DSQ[%llu] SELECTED_CPU[%llu] DELAY[%llu]",
*antistall_dsq, cpuc->cpu, cur_delay);
*antistall_dsq = SCX_DSQ_INVALID;
return consumed;
}
/*
 * Try to consume a task from one of @layer's per-LLC DSQs that are marked in
 * layer->llcs_to_drain and for which the layer has no CPU in that LLC.
 *
 * Returns true when a task was moved to the local DSQ.
 */
static bool try_drain_layer_llcs(struct layer *layer, struct cpu_ctx *cpuc)
{
u32 cnt = layer->llc_drain_cnt++;
u32 u;
/* only every other call attempts a drain */
if (cnt & 1)
return false;
lstat_inc(LSTAT_LLC_DRAIN_TRY, layer, cpuc);
bpf_for(u, 0, nr_llcs) {
/* rotate the starting LLC with the attempt counter */
u32 llc_id = (u + cnt / 2) % nr_llcs;
u64 dsq_id = layer_dsq_id(layer->id, llc_id);
u32 *vptr;
bool disabled = false, consumed;
/* this LLC is not marked for draining */
if (!(layer->llcs_to_drain & (1LLU << llc_id)))
continue;
/* the layer has CPUs in this LLC, so no draining is needed */
if ((vptr = MEMBER_VPTR(layer->nr_llc_cpus, [llc_id])) && *vptr)
continue;
/* at most one task queued: disable draining before consuming it */
if (scx_bpf_dsq_nr_queued(dsq_id) <= 1) {
layer_llc_drain_disable(layer, llc_id);
disabled = true;
}
consumed = scx_bpf_dsq_move_to_local(dsq_id);
/* tasks are (still or again) queued after the disable: re-enable */
if (disabled && scx_bpf_dsq_nr_queued(dsq_id))
layer_llc_drain_enable(layer, llc_id);
if (consumed) {
lstat_inc(LSTAT_LLC_DRAIN, layer, cpuc);
return true;
}
}
return false;
}
/*
 * Try to move a task of layer @layer_id to the local DSQ, walking the LLCs
 * listed in @llcc's proximity map in order.
 *
 * Entry 0 of the map is always tried. Later entries are skipped when the
 * layer has skip_remote_node set and the LLC is on another node, or when the
 * LLC's queued runtime for the layer is below layer->xllc_mig_min_ns.
 *
 * Returns true when a task was consumed.
 */
static __always_inline bool try_consume_layer(u32 layer_id, struct cpu_ctx *cpuc,
struct llc_ctx *llcc)
{
struct llc_prox_map *llc_pmap = &llcc->prox_map;
struct layer *layer;
u32 nid = llc_node_id(llcc->id);
bool xllc_mig_skipped = false;
bool skip_remote_node;
u32 u;
if (!(layer = lookup_layer(layer_id)))
return false;
skip_remote_node = layer->skip_remote_node;
bpf_for(u, 0, llc_pmap->sys_end) {
u16 *llc_idp;
/* bounds-checked access into the proximity map */
if (!(llc_idp = MEMBER_VPTR(llc_pmap->llcs, [u]))) {
scx_bpf_error("llc_pmap->sys_end=%u too big", llc_pmap->sys_end);
return false;
}
/* the filters below apply to every entry except the first */
if (u > 0) {
struct llc_ctx *remote_llcc;
if (!(remote_llcc = lookup_llc_ctx(*llc_idp)))
return false;
if (skip_remote_node && nid != llc_node_id(remote_llcc->id)) {
lstat_inc(LSTAT_SKIP_REMOTE_NODE, layer, cpuc);
continue;
}
/* not enough queued runtime there to justify a cross-LLC pull */
if (remote_llcc->queued_runtime[layer_id] < layer->xllc_mig_min_ns) {
xllc_mig_skipped = true;
continue;
}
}
if (scx_bpf_dsq_move_to_local(layer_dsq_id(layer_id, *llc_idp)))
return true;
}
/* counted once per call, and only when nothing was consumed */
if (xllc_mig_skipped)
lstat_inc(LSTAT_XLLC_MIGRATION_SKIP, layer, cpuc);
return false;
}
/*
 * Try the layers listed in @layer_order[0..@nr) in order, skipping
 * @exclude_layer_id, and stop at the first one that yields a task.
 *
 * Callers pass MAX_LAYERS as @exclude_layer_id when nothing is to be
 * excluded. Returns true when a task was consumed.
 */
static __always_inline
bool try_consume_layers(u32 *layer_order, u32 nr, u32 exclude_layer_id,
struct cpu_ctx *cpuc, struct llc_ctx *llcc)
{
u32 u;
/*
 * NOTE(review): this also rejects nr == MAX_LAYERS. If the order arrays
 * hold MAX_LAYERS entries, the bound may be meant to be '>' - confirm
 * against the array declarations.
 */
if (nr >= MAX_LAYERS) {
scx_bpf_error("nr=%u too high", nr);
return false;
}
bpf_for(u, 0, nr) {
u32 layer_id = layer_order[u];
if (layer_id == exclude_layer_id)
continue;
if (try_consume_layer(layer_id, cpuc, llcc))
return true;
}
return false;
}
/*
 * Tell whether @cpu should stay idle because its sibling CPU is running, or
 * about to run, an exclusive task.
 *
 * Only evaluated when exclusive layers exist and @prev is either not
 * continuing (@prev_taskc is NULL) or belongs to a non-exclusive layer.
 * @prev_layer is dereferenced whenever @prev_taskc is non-NULL, so callers
 * must pass a valid layer in that case.
 *
 * Returns true when the CPU should not dispatch anything.
 */
bool __always_inline sib_keep_idle(s32 cpu, struct task_struct *prev __arg_trusted, struct layer *prev_layer,
struct task_ctx *prev_taskc, struct cpu_ctx *cpuc)
{
if (nr_excl_layers && (!prev_taskc || !prev_layer->excl)) {
struct cpu_ctx *sib_cpuc;
s32 sib;
/* sibling exists and is running or about to run an exclusive task */
if ((sib = sibling_cpu(cpu)) >= 0 && (sib_cpuc = lookup_cpu_ctx(sib)) &&
(sib_cpuc->current_excl || sib_cpuc->next_excl)) {
gstat_inc(GSTAT_EXCL_IDLE, cpuc);
return true;
}
}
return false;
}
/*
 * ops.dispatch(): pick the next task for @cpu.
 *
 * Sources are tried in this order, returning as soon as one yields a task:
 * the antistall DSQ, keeping @prev running, the high fallback DSQ, empty
 * layers (fallback CPU only), the low fallback DSQ when it is owed its share,
 * the layer DSQs in an order that depends on the CPU's ownership state, and
 * finally the low fallback DSQ unconditionally. If nothing was found and
 * @prev is still runnable, its slice is refilled.
 */
void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev)
{
struct task_ctx *prev_taskc = NULL;
struct layer *prev_layer = NULL;
struct cpu_ctx *cpuc;
struct llc_ctx *llcc;
bool tried_preempting = false, tried_lo_fb = false;
/* sizes of the combined order arrays used below */
u32 nr_ogp_layers = nr_op_layers + nr_gp_layers;
u32 nr_ogn_layers = nr_on_layers + nr_gn_layers;
if (!(cpuc = lookup_cpu_ctx(-1)))
return;
if (antistall_consume(cpuc))
return;
/* prev_taskc/prev_layer are only set when @prev is still queued (runnable) */
if (prev && (prev->scx.flags & SCX_TASK_QUEUED)) {
if (!(prev_taskc = lookup_task_ctx(prev)) ||
!(prev_layer = lookup_layer(prev_taskc->layer_id)))
return;
}
/* leave the CPU idle for an exclusive task on the sibling */
if (prev && sib_keep_idle(cpu, prev, prev_layer, prev_taskc, cpuc))
return;
/* dispatch nothing so that @prev keeps running with a fresh slice */
if (prev_taskc && keep_running(cpuc, prev, prev_taskc, prev_layer)) {
prev->scx.slice = prev_layer->slice_ns;
return;
}
if (!(llcc = lookup_llc_ctx(cpuc->llc_id)))
return;
/* the high fallback DSQ is always consumed first */
if (scx_bpf_dsq_move_to_local(cpuc->hi_fb_dsq_id))
return;
/* the fallback CPU additionally serves the layers listed in empty_layer_ids */
if (cpuc->cpu == fallback_cpu &&
try_consume_layers(empty_layer_ids, nr_empty_layer_ids,
MAX_LAYERS, cpuc, llcc)) {
cpuc->running_fallback = true;
return;
}
if (scx_bpf_dsq_nr_queued(cpuc->lo_fb_dsq_id)) {
u64 now = scx_bpf_now();
u64 dur, usage;
/* the LLC's lo_fb_seq changed: restart this CPU's measurement window */
if (cpuc->lo_fb_seq != llcc->lo_fb_seq) {
cpuc->lo_fb_seq_at = now;
cpuc->lo_fb_usage_base = cpuc->gstats[GSTAT_LO_FB_USAGE];
cpuc->lo_fb_seq = llcc->lo_fb_seq;
}
dur = now - cpuc->lo_fb_seq_at;
usage = cpuc->gstats[GSTAT_LO_FB_USAGE] + lo_fb_wait_ns -
cpuc->lo_fb_usage_base;
/*
 * After lo_fb_wait_ns has passed, run low fallback tasks while their
 * usage (plus the wait time) stays under lo_fb_share_ppk / 1024 of the
 * window.
 */
if (dur > lo_fb_wait_ns && 1024 * usage < lo_fb_share_ppk * dur) {
if (scx_bpf_dsq_move_to_local(cpuc->lo_fb_dsq_id))
return;
tried_lo_fb = true;
}
}
if (cpuc->in_open_layers) {
/* the two branches differ only in which order arrays are walked */
if (cpuc->protect_owned) {
if (try_consume_layers(cpuc->op_layer_order, nr_op_layers,
MAX_LAYERS, cpuc, llcc))
return;
if (try_consume_layers(cpuc->on_layer_order, nr_on_layers,
MAX_LAYERS, cpuc, llcc))
return;
if (try_consume_layers(cpuc->gp_layer_order, nr_gp_layers,
MAX_LAYERS, cpuc, llcc))
return;
if (try_consume_layers(cpuc->gn_layer_order, nr_gn_layers,
MAX_LAYERS, cpuc, llcc))
return;
} else {
if (try_consume_layers(cpuc->op_layer_order, nr_op_layers,
MAX_LAYERS, cpuc, llcc))
return;
if (try_consume_layers(cpuc->gp_layer_order, nr_gp_layers,
MAX_LAYERS, cpuc, llcc))
return;
if (try_consume_layers(cpuc->ogn_layer_order, nr_ogn_layers,
MAX_LAYERS, cpuc, llcc))
return;
}
} else {
struct layer *owner_layer = NULL;
/* cpuc->layer_id >= MAX_LAYERS means the CPU has no owning layer */
if (cpuc->layer_id < MAX_LAYERS)
owner_layer = &layers[cpuc->layer_id];
/*
 * Try the ogp_layer_order layers before the owner unless the owner is
 * protected, owned execution is protected, or the owner itself preempts.
 */
if (!owner_layer || (!owner_layer->is_protected && !cpuc->protect_owned && !owner_layer->preempt)) {
if (try_consume_layers(cpuc->ogp_layer_order, nr_ogp_layers,
cpuc->layer_id, cpuc, llcc))
return;
tried_preempting = true;
}
if (owner_layer) {
if (owner_layer->llcs_to_drain &&
try_drain_layer_llcs(owner_layer, cpuc))
return;
if (try_consume_layer(owner_layer->id, cpuc, llcc))
return;
/* a protected owner never runs other layers' tasks on this CPU */
if (owner_layer->is_protected)
goto replenish;
}
if (!tried_preempting &&
try_consume_layers(cpuc->ogp_layer_order, nr_ogp_layers,
cpuc->layer_id, cpuc, llcc))
return;
if (try_consume_layers(cpuc->ogn_layer_order, nr_ogn_layers,
cpuc->layer_id, cpuc, llcc))
return;
}
replenish:
/* last resort: the low fallback DSQ, unless it was already tried above */
if (!tried_lo_fb && scx_bpf_dsq_move_to_local(cpuc->lo_fb_dsq_id))
return;
/* nothing to run: let a still-runnable @prev continue with a fresh slice */
if (prev_taskc && prev_layer)
prev->scx.slice = prev_layer->slice_ns;
}
/*
 * ops.tick(): account the CPU time @p has used so far on the current CPU.
 */
void BPF_STRUCT_OPS(layered_tick, struct task_struct *p)
{
	struct cpu_ctx *cpuc = lookup_cpu_ctx(-1);
	struct task_ctx *taskc;

	if (!cpuc)
		return;

	taskc = lookup_task_ctx(p);
	if (!taskc)
		return;

	account_used(cpuc, taskc, scx_bpf_now());
}
/*
 * Evaluate a single match rule against a task.
 *
 * @match:     rule to evaluate; match->kind selects the comparison
 * @p:         task being classified
 * @cgrp_path: formatted cgroup path of @p, used by the cgroup matchers
 *
 * Returns the raw result of the comparison; match->exclude is applied by the
 * caller. An unknown kind raises an scx error and returns false.
 */
static __noinline bool match_one(struct layer_match *match,
				 struct task_struct *p, const char *cgrp_path)
{
	bool result = false;
	const struct cred *cred;

	switch (match->kind) {
	case MATCH_CGROUP_PREFIX: {
		return match_str(match->cgroup_prefix, cgrp_path, STR_PREFIX);
	}
	case MATCH_CGROUP_SUFFIX: {
		return match_str(match->cgroup_suffix, cgrp_path, STR_SUFFIX);
	}
	case MATCH_CGROUP_CONTAINS: {
		return match_str(match->cgroup_substr, cgrp_path, STR_SUBSTR);
	}
	case MATCH_COMM_PREFIX: {
		char comm[MAX_COMM];

		/* match against a stack copy of the task's comm */
		__builtin_memcpy(comm, p->comm, MAX_COMM);
		return match_str(match->comm_prefix, comm, STR_PREFIX);
	}
	case MATCH_PCOMM_PREFIX: {
		char pcomm[MAX_COMM];

		/* same as above but using the thread group leader's comm */
		__builtin_memcpy(pcomm, p->group_leader->comm, MAX_COMM);
		return match_str(match->pcomm_prefix, pcomm, STR_PREFIX);
	}
	case MATCH_NICE_ABOVE:
		return prio_to_nice((s32)p->static_prio) > match->nice;
	case MATCH_NICE_BELOW:
		return prio_to_nice((s32)p->static_prio) < match->nice;
	case MATCH_NICE_EQUALS:
		return prio_to_nice((s32)p->static_prio) == match->nice;
	case MATCH_USER_ID_EQUALS:
		/* the cred pointer is only dereferenced inside the RCU section */
		bpf_rcu_read_lock();
		cred = p->real_cred;
		if (cred)
			result = cred->euid.val == match->user_id;
		bpf_rcu_read_unlock();
		return result;
	case MATCH_GROUP_ID_EQUALS:
		bpf_rcu_read_lock();
		cred = p->real_cred;
		if (cred)
			result = cred->egid.val == match->group_id;
		bpf_rcu_read_unlock();
		return result;
	case MATCH_PID_EQUALS:
		return p->pid == match->pid;
	case MATCH_PPID_EQUALS:
		return p->real_parent->pid == match->ppid;
	case MATCH_TGID_EQUALS:
		return p->tgid == match->tgid;
	case MATCH_NSPID_EQUALS: {
		/* match on both the in-namespace pid and the namespace inode number */
		bpf_rcu_read_lock();
		struct pid *p_pid = get_task_pid_ptr(p, PIDTYPE_PID);
		struct pid_namespace *pid_ns = get_task_pid_ns(p, PIDTYPE_TGID);

		if (!p_pid || !pid_ns) {
			bpf_rcu_read_unlock();
			return result;
		}
		pid_t nspid = get_pid_nr_ns(p_pid, pid_ns);
		u64 nsid = BPF_CORE_READ(pid_ns, ns.inum);

		bpf_rcu_read_unlock();
		return (u32)nspid == match->pid && nsid == match->nsid;
	}
	case MATCH_NS_EQUALS: {
		/* match on the pid namespace inode number only */
		bpf_rcu_read_lock();
		struct pid *p_pid = get_task_pid_ptr(p, PIDTYPE_PID);
		struct pid_namespace *pid_ns = get_task_pid_ns(p, PIDTYPE_TGID);

		if (!p_pid || !pid_ns) {
			bpf_rcu_read_unlock();
			return result;
		}
		u64 nsid = BPF_CORE_READ(pid_ns, ns.inum);

		bpf_rcu_read_unlock();
		return nsid == match->nsid;
	}
	case MATCH_SCXCMD_JOIN: {
		struct task_ctx *taskc = lookup_task_ctx_may_fail(p);

		if (!taskc) {
			scx_bpf_error("could not find task");
			return false;
		}
		/* the task has no join_layer string set */
		if (!taskc->join_layer[0])
			return false;
		/* the pattern is read from match->comm_prefix for this kind */
		return match_str(match->comm_prefix, taskc->join_layer,
				 STR_PREFIX);
	}
	case MATCH_IS_GROUP_LEADER: {
		return (p->tgid == p->pid) == match->is_group_leader;
	}
	case MATCH_IS_KTHREAD:
		return p->flags & PF_KTHREAD;
	case MATCH_USED_GPU_TID: {
		u32 tid;
		bool pid_present = false;

		/* the error is raised but the lookup below still runs */
		if (!enable_gpu_support)
			scx_bpf_error("UsedGpuTid requires --enable_gpu_support");
		tid = p->pid;
		if (bpf_map_lookup_elem(&gpu_tid, &tid))
			pid_present = true;
		return pid_present == match->used_gpu_tid;
	}
	case MATCH_USED_GPU_PID: {
		u32 tgid;
		bool pid_present = false;

		/* the error is raised but the lookup below still runs */
		if (!enable_gpu_support)
			scx_bpf_error("UsedGpuPid requires --enable_gpu_support");
		tgid = p->tgid;
		if (bpf_map_lookup_elem(&gpu_tgid, &tgid))
			pid_present = true;
		return pid_present == match->used_gpu_pid;
	}
	case MATCH_AVG_RUNTIME: {
		struct task_ctx *taskc = lookup_task_ctx_may_fail(p);

		if (!taskc) {
			scx_bpf_error("could not find task");
			return false;
		}
		/* runtime_avg is divided by 1000 to compare against the *_us bounds */
		u64 avg_runtime_us = taskc->runtime_avg / 1000;

		/* half-open interval: [min_avg_runtime_us, max_avg_runtime_us) */
		return match->min_avg_runtime_us <= avg_runtime_us &&
		       avg_runtime_us < match->max_avg_runtime_us;
	}
	default:
		scx_bpf_error("invalid match kind %d", match->kind);
		return result;
	}
}
/*
 * Test whether task @p matches layer @layer_id.
 *
 * A layer holds a list of OR groups, each a list of AND-ed match rules. The
 * task matches when every rule of at least one OR group agrees with it, where
 * a rule with ->exclude set must NOT match.
 *
 * Returns 0 on match, -ENOENT when no OR group matches, and -EINVAL when the
 * layer id or one of the rule counts is out of range.
 */
int match_layer(u32 layer_id, struct task_struct *p __arg_trusted, const char *cgrp_path)
{
struct layer *layer;
u32 nr_match_ors, pid;
u64 or_id, and_id;
if (layer_id >= nr_layers)
return -EINVAL;
layer = &layers[layer_id];
nr_match_ors = layer->nr_match_ors;
if (nr_match_ors > MAX_LAYER_MATCH_ORS)
return -EINVAL;
bpf_for(or_id, 0, nr_match_ors) {
struct layer_match_ands *ands;
bool matched = true;
/* explicit re-check of the index right before the array access */
barrier_var(or_id);
if (or_id >= MAX_LAYER_MATCH_ORS)
return -EINVAL;
ands = &layer->matches[or_id];
if (ands->nr_match_ands > NR_LAYER_MATCH_KINDS)
return -EINVAL;
bpf_for(and_id, 0, ands->nr_match_ands) {
struct layer_match *match;
barrier_var(and_id);
if (and_id >= NR_LAYER_MATCH_KINDS)
return -EINVAL;
match = &ands->matches[and_id];
/* the rule fails when its result does not equal !exclude */
if (!(match_one(match, p, cgrp_path) == !match->exclude)) {
matched = false;
break;
}
}
if (matched) {
/* record which layer matched for debugging; pid 0 is skipped */
if (enable_match_debug && (pid = p->pid))
bpf_map_update_elem(&layer_match_dbg, &pid, &layer_id, BPF_ANY);
return 0;
}
}
return -ENOENT;
}
/*
 * Re-classify @p into a layer if taskc->refresh_layer is set.
 *
 * The task is removed from its current layer's nr_tasks count, matched
 * against the layers in index order, and attached to the first one that
 * matches. Failing to match any layer raises an scx error.
 */
static void maybe_refresh_layer(struct task_struct *p __arg_trusted, struct task_ctx *taskc)
{
const char *cgrp_path;
bool matched = false;
u64 layer_id;
if (!taskc->refresh_layer)
return;
taskc->refresh_layer = false;
taskc->layer_refresh_seq = layer_refresh_seq_avgruntime;
if (!(cgrp_path = format_cgrp_path(p->cgroups->dfl_cgrp)))
return;
/* leave the previous layer, if the task was in a valid one */
if (taskc->layer_id >= 0 && taskc->layer_id < nr_layers)
__sync_fetch_and_add(&layers[taskc->layer_id].nr_tasks, -1);
/* first matching layer wins */
bpf_for(layer_id, 0, nr_layers) {
if (match_layer(layer_id, p, cgrp_path) == 0) {
matched = true;
break;
}
}
if (matched) {
struct layer *layer = &layers[layer_id];
struct cpu_ctx *cpuc;
struct llc_ctx *llcc;
/*
 * NOTE(review): returning here leaves taskc->layer_id unchanged although
 * the old layer's nr_tasks was already decremented above.
 */
if (!(cpuc = lookup_cpu_ctx(scx_bpf_task_cpu(p))) ||
!(llcc = lookup_llc_ctx(cpuc->llc_id)))
return;
taskc->layer_id = layer_id;
taskc->llc_id = cpuc->llc_id;
/* reset the seq of the cached per-task cpumasks */
taskc->layered_cpus.seq = -1;
taskc->layered_cpus_llc.seq = -1;
taskc->layered_cpus_node.seq = -1;
__sync_fetch_and_add(&layer->nr_tasks, 1);
refresh_cpus_flags(taskc, p->cpus_ptr);
/* start from the new layer's current vtime on this LLC */
p->scx.dsq_vtime = llcc->vtime_now[layer_id];
} else {
scx_bpf_error("[%s]%d didn't match any layer", p->comm, p->pid);
}
/* assignments to the last layer are not traced */
if (taskc->layer_id < nr_layers - 1)
trace("LAYER=%d %s[%d] cgrp=\"%s\"",
taskc->layer_id, p->comm, p->pid, cgrp_path);
}
/*
 * Allocate a new BPF cpumask and store it in *@kptr.
 *
 * Returns -ENOMEM when the allocation fails and 0 otherwise. If *@kptr
 * already held a cpumask, an scx error is raised, the old mask is released,
 * and 0 is still returned.
 */
static s32 create_save_cpumask(struct bpf_cpumask **kptr)
{
	struct bpf_cpumask *fresh, *stale;

	fresh = bpf_cpumask_create();
	if (!fresh) {
		scx_bpf_error("Failed to create cpumask");
		return -ENOMEM;
	}

	/* the exchange hands back whatever was stored in the slot before */
	stale = bpf_kptr_xchg(kptr, fresh);
	if (!stale)
		return 0;

	scx_bpf_error("kptr already had cpumask");
	bpf_cpumask_release(stale);
	return 0;
}
/*
 * Initialize the context of NUMA node @node_id: allocate its cpumask, set
 * the CPUs listed in numa_cpumasks[node_id], and record the node id in each
 * member CPU's context.
 *
 * Returns 0 on success or a negative errno.
 */
static s32 create_node(u32 node_id)
{
u32 cpu;
struct bpf_cpumask *cpumask;
struct node_ctx *nodec;
struct cpu_ctx *cpuc;
s32 ret;
if (!(nodec = lookup_node_ctx(node_id)))
return -ENOENT;
nodec->id = node_id;
ret = create_save_cpumask(&nodec->cpumask);
if (ret)
return ret;
/* the cpumask kptr is read and used inside the RCU section below */
bpf_rcu_read_lock();
cpumask = nodec->cpumask;
if (!cpumask) {
bpf_rcu_read_unlock();
scx_bpf_error("Failed to lookup node cpumask");
return -ENOENT;
}
bpf_for(cpu, 0, nr_possible_cpus) {
const volatile u64 *nmask;
/* numa_cpumasks stores 64 CPUs per word */
nmask = MEMBER_VPTR(numa_cpumasks, [node_id][cpu / 64]);
if (!nmask) {
scx_bpf_error("array index error");
ret = -ENOENT;
break;
}
if (*nmask & (1LLU << (cpu % 64))) {
bpf_cpumask_set_cpu(cpu, cpumask);
if (!(cpuc = lookup_cpu_ctx(cpu))) {
scx_bpf_error("cpu ctx error");
ret = -ENOENT;
break;
}
cpuc->node_id = node_id;
nodec->nr_cpus++;
/*
 * NOTE(review): the bit set in llc_mask is indexed by node_id, not by
 * an LLC id - confirm this is intended.
 */
nodec->llc_mask |= (1LLU << node_id);
}
}
dbg("CFG creating node %d with %d cpus", node_id, nodec->nr_cpus);
bpf_rcu_read_unlock();
return ret;
}
/*
 * Initialize the context of LLC @llc_id: allocate its cpumask, add every CPU
 * that cpu_to_llc_id() maps to this LLC, and record the LLC id and fallback
 * DSQ ids in each member CPU's context. Afterwards the LLC's proximity map is
 * validated and dumped to the debug log.
 *
 * Returns 0 on success or a negative errno.
 */
static s32 create_llc(u32 llc_id)
{
	struct bpf_cpumask *cpumask;
	struct llc_ctx *llcc;
	struct cpu_ctx *cpuc;
	struct llc_prox_map *pmap;
	u32 cpu;
	s32 i, ret;

	if (!(llcc = lookup_llc_ctx(llc_id)))
		return -ENOENT;

	llcc->id = llc_id;

	ret = create_save_cpumask(&llcc->cpumask);
	if (ret)
		return ret;

	/* the cpumask kptr is read and used inside the RCU section below */
	bpf_rcu_read_lock();
	cpumask = llcc->cpumask;
	if (!cpumask) {
		bpf_rcu_read_unlock();
		scx_bpf_error("Failed to lookup llc cpumask");
		return -ENOENT;
	}

	bpf_for(cpu, 0, nr_possible_cpus) {
		if (!(cpuc = lookup_cpu_ctx(cpu))) {
			bpf_rcu_read_unlock();
			scx_bpf_error("cpu ctx error");
			return -ENOENT;
		}

		/* only CPUs that belong to this LLC are touched */
		if (cpu_to_llc_id(cpu) != llc_id)
			continue;

		bpf_cpumask_set_cpu(cpu, cpumask);
		llcc->nr_cpus++;
		cpuc->llc_id = llc_id;
		cpuc->hi_fb_dsq_id = hi_fb_dsq_id(llc_id);
		cpuc->lo_fb_dsq_id = lo_fb_dsq_id(llc_id);
	}

	dbg("CFG creating llc %d with %d cpus", llc_id, llcc->nr_cpus);
	bpf_rcu_read_unlock();

	pmap = &llcc->prox_map;
	dbg("CFG: LLC[%d] prox_map node/sys=%d/%d",
	    llc_id, pmap->node_end, pmap->sys_end);

	/*
	 * NOTE(review): the length is bounded by CPU counts although the map
	 * holds LLC ids - confirm the intended limit.
	 */
	if (pmap->sys_end > nr_possible_cpus || pmap->sys_end > MAX_CPUS) {
		/* report the LLC being set up, not the stale CPU loop variable */
		scx_bpf_error("LLC %d proximity map too long", llc_id);
		return -EINVAL;
	}

	bpf_for(i, 0, pmap->sys_end) {
		u16 *p = MEMBER_VPTR(pmap->llcs, [i]);

		if (p)
			dbg("CFG: LLC[%d] prox[%d]=%d", llc_id, i, *p);
	}

	return ret;
}
/*
 * Account cross-layer wakeups: called when @p is woken, with the current
 * task taken as the waker.
 *
 * Nothing is counted when the waker has no task context or is in the same
 * layer as @p. Otherwise LSTAT_XLAYER_WAKE is bumped for @p's layer, plus
 * LSTAT_XLAYER_REWAKE when the same waker also performed the previously
 * recorded wakeup.
 */
static __always_inline
void on_wakeup(struct task_struct *p, struct task_ctx *taskc)
{
	struct task_ctx *waker_taskc;
	struct task_struct *waker;
	struct layer *layer;
	struct cpu_ctx *cpuc;

	cpuc = lookup_cpu_ctx(-1);
	if (!cpuc)
		return;

	layer = lookup_layer(taskc->layer_id);
	if (!layer)
		return;

	waker = bpf_get_current_task_btf();
	if (!waker)
		return;

	waker_taskc = lookup_task_ctx_may_fail(waker);
	if (!waker_taskc)
		return;

	/* same-layer wakeups are not tracked */
	if (waker_taskc->layer_id == taskc->layer_id)
		return;

	if (taskc->last_waker == waker->pid)
		lstat_inc(LSTAT_XLAYER_REWAKE, layer, cpuc);

	taskc->last_waker = waker->pid;
	lstat_inc(LSTAT_XLAYER_WAKE, layer, cpuc);
}
/*
 * ops.runnable(): timestamp the moment @p became runnable, refresh its layer
 * assignment if one is pending, and account cross-layer wakeups.
 */
void BPF_STRUCT_OPS(layered_runnable, struct task_struct *p, u64 enq_flags)
{
	u64 now = scx_bpf_now();
	struct task_ctx *taskc = lookup_task_ctx(p);

	if (!taskc)
		return;

	taskc->runnable_at = now;
	maybe_refresh_layer(p, taskc);

	/* only wakeups are of interest to the cross-layer statistics */
	if (!(enq_flags & SCX_ENQ_WAKEUP))
		return;

	on_wakeup(p, taskc);
}
/*
 * ops.running(): @p is about to start executing on the current CPU.
 *
 * Drops the task's queued-runtime charge, counts migrations, advances the
 * layer's vtime on this LLC, and records in the CPU context what kind of
 * task is now running (preempting, protected, owned/open, exclusive).
 */
void BPF_STRUCT_OPS(layered_running, struct task_struct *p)
{
struct cpu_ctx *cpuc;
struct task_ctx *taskc;
struct layer *layer;
struct node_ctx *nodec;
struct llc_ctx *llcc;
s32 task_cpu = scx_bpf_task_cpu(p);
u64 now = scx_bpf_now();
u32 layer_id;
if (!(cpuc = lookup_cpu_ctx(-1)) || !(llcc = lookup_llc_ctx(cpuc->llc_id)) ||
!(taskc = lookup_task_ctx(p)))
return;
layer_id = taskc->layer_id;
if (!(layer = lookup_layer(layer_id)))
return;
/* the task is no longer queued, so remove its queued-runtime charge */
task_uncharge_qrt(taskc);
/* the task ran elsewhere last time: count the migration and its scope */
if (taskc->last_cpu >= 0 && taskc->last_cpu != task_cpu) {
lstat_inc(LSTAT_MIGRATION, layer, cpuc);
/* NOTE(review): this early return skips all of the bookkeeping below */
if (!(nodec = lookup_node_ctx(cpuc->node_id)))
return;
/* previous CPU is outside this CPU's node */
if (nodec->cpumask &&
!bpf_cpumask_test_cpu(taskc->last_cpu, cast_mask(nodec->cpumask)))
lstat_inc(LSTAT_XNUMA_MIGRATION, layer, cpuc);
/* previous CPU is outside this CPU's LLC */
if (llcc->cpumask &&
!bpf_cpumask_test_cpu(taskc->last_cpu, cast_mask(llcc->cpumask)))
lstat_inc(LSTAT_XLLC_MIGRATION, layer, cpuc);
}
taskc->last_cpu = task_cpu;
maybe_update_task_llc(p, taskc, task_cpu);
/* vtime_now only ever moves forward */
if (time_before(llcc->vtime_now[layer_id], p->scx.dsq_vtime))
llcc->vtime_now[layer_id] = p->scx.dsq_vtime;
cpuc->current_preempt = layer->preempt ||
(is_percpu_kthread(p) && is_percpu_kthread_preempting(p));
cpuc->task_layer_id = taskc->layer_id;
/* start of the interval charged by account_used() */
cpuc->used_at = now;
taskc->running_at = now;
cpuc->is_protected = layer->is_protected;
if (layer->kind == LAYER_KIND_OPEN) {
/* an open-layer task counts as owned when the CPU is in the open set */
cpuc->running_owned = cpuc->in_open_layers;
cpuc->running_open = true;
} else {
cpuc->running_owned = taskc->layer_id == cpuc->layer_id;
cpuc->running_open = false;
}
if (nr_excl_layers) {
cpuc->next_excl = false;
cpuc->current_excl = layer->excl;
/* switching from an exclusive to a non-exclusive task: kick the sibling if idle */
if (cpuc->prev_excl && !cpuc->current_excl) {
s32 sib = sibling_cpu(task_cpu);
struct cpu_ctx *sib_cpuc;
if (sib >= 0 && (sib_cpuc = lookup_cpu_ctx(sib))) {
gstat_inc(GSTAT_EXCL_WAKEUP, cpuc);
scx_bpf_kick_cpu(sib, SCX_KICK_IDLE);
}
}
}
/* apply the layer's perf target only when it differs from the cached value */
if (layer->perf > 0 && cpuc->perf != layer->perf) {
scx_bpf_cpuperf_set(task_cpu, layer->perf);
cpuc->perf = layer->perf;
}
}
/*
 * ops.stopping(): @p is about to stop executing on the current CPU.
 *
 * Updates the task's average runtime, accounts CPU usage, recomputes the
 * CPU's owned-execution protection flags, clears the per-run CPU state, and
 * advances the task's vtime by its weighted runtime.
 */
void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable)
{
struct cpu_ctx *cpuc;
struct task_ctx *taskc;
struct layer *task_layer;
struct task_hint *task_hint;
u64 now = scx_bpf_now();
u64 usage_since_idle;
s32 task_lid;
u64 runtime;
if (!(cpuc = lookup_cpu_ctx(-1)) || !(taskc = lookup_task_ctx(p)))
return;
task_lid = taskc->layer_id;
if (!(task_layer = lookup_layer(task_lid)))
return;
/* fold the length of this run into the task's decaying average */
runtime = now - taskc->running_at;
taskc->runtime_avg =
((RUNTIME_DECAY_FACTOR - 1) * taskc->runtime_avg + runtime) /
RUNTIME_DECAY_FACTOR;
account_used(cpuc, taskc, now);
if (taskc->dsq_id & HI_FB_DSQ_BASE)
gstat_inc(GSTAT_HI_FB_EVENTS, cpuc);
else if (taskc->dsq_id & LO_FB_DSQ_BASE)
gstat_inc(GSTAT_LO_FB_EVENTS, cpuc);
/* CPU time consumed since the CPU last went idle (see layered_update_idle) */
usage_since_idle = cpuc->usage - cpuc->usage_at_idle;
/* recompute the protection flags from scratch */
cpuc->protect_owned = false;
cpuc->protect_owned_preempt = false;
cpuc->is_protected = false;
if (cpuc->in_open_layers) {
/* open CPU: only non-preempting open-layer tasks arm the protection */
if (task_layer->kind == LAYER_KIND_OPEN && !task_layer->preempt) {
cpuc->protect_owned = usage_since_idle > min_open_layer_disallow_open_after_ns;
cpuc->protect_owned_preempt = usage_since_idle > min_open_layer_disallow_preempt_after_ns;
}
} else {
/* owned CPU: use the owning layer's thresholds */
struct layer *cpu_layer = NULL;
if (cpuc->layer_id != MAX_LAYERS &&
!(cpu_layer = lookup_layer(cpuc->layer_id)))
return;
if (cpu_layer) {
cpuc->protect_owned = usage_since_idle > cpu_layer->disallow_open_after_ns;
cpuc->protect_owned_preempt = usage_since_idle > cpu_layer->disallow_preempt_after_ns;
}
}
/* nothing is running on this CPU anymore */
cpuc->running_fallback = false;
cpuc->current_preempt = false;
cpuc->task_layer_id = MAX_LAYERS;
if (nr_excl_layers) {
cpuc->prev_excl = cpuc->current_excl;
cpuc->current_excl = false;
}
/* charge at least min_exec_ns and record how much was added */
if (runtime < task_layer->min_exec_ns) {
lstat_inc(LSTAT_MIN_EXEC, task_layer, cpuc);
lstat_add(LSTAT_MIN_EXEC_NS, task_layer, cpuc, task_layer->min_exec_ns - runtime);
runtime = task_layer->min_exec_ns;
}
/* a yielding task is charged at least a full slice */
if (cpuc->yielding && runtime < task_layer->slice_ns)
runtime = task_layer->slice_ns;
/* scale by weight: 100 charges 1:1, a higher weight charges less */
runtime = runtime * 100 / p->scx.weight;
task_hint = bpf_task_storage_get(&scx_layered_task_hint_map, p, NULL, 0);
if (task_hint) {
/* the hint is clamped to [1, 1024] and scales the charge by hint / 1024 */
u64 hint = task_hint->hint ?: 1;
hint = hint < 1024 ? hint : 1024;
runtime = (runtime * hint) / 1024;
}
p->scx.dsq_vtime += runtime;
}
/*
 * ops.yield(): @from asked to yield the CPU.
 *
 * With a zero yield_step_ns the request is ignored. Otherwise each yield
 * shortens the remaining slice by yield_step_ns; once the slice would be
 * used up it is zeroed and the CPU is marked as yielding. Always returns
 * false.
 */
bool BPF_STRUCT_OPS(layered_yield, struct task_struct *from, struct task_struct *to)
{
	struct task_ctx *taskc;
	struct cpu_ctx *cpuc;
	struct layer *layer;

	if (!(cpuc = lookup_cpu_ctx(-1)) || !(taskc = lookup_task_ctx(from)) ||
	    !(layer = lookup_layer(taskc->layer_id)))
		return false;

	/* yielding is disabled for this layer */
	if (!layer->yield_step_ns) {
		lstat_inc(LSTAT_YIELD_IGNORE, layer, cpuc);
		return false;
	}

	/* not enough slice left to absorb another step: give up the CPU */
	if (from->scx.slice <= layer->yield_step_ns) {
		from->scx.slice = 0;
		cpuc->yielding = true;
		return false;
	}

	from->scx.slice -= layer->yield_step_ns;
	lstat_inc(LSTAT_YIELD_IGNORE, layer, cpuc);
	return false;
}
/*
 * ops.set_weight(): flag @p so that its layer is re-evaluated the next time
 * maybe_refresh_layer() runs for it. @weight itself is not used here.
 */
void BPF_STRUCT_OPS(layered_set_weight, struct task_struct *p, u32 weight)
{
	struct task_ctx *taskc = lookup_task_ctx(p);

	if (!taskc)
		return;

	taskc->refresh_layer = true;
}
/*
 * Recompute the affinity flags cached in @taskc for the allowed-CPU mask
 * @cpumask:
 *  - all_cpuset_allowed: the layer's cpuset is a subset of @cpumask
 *  - cpus_node_aligned:  every node that intersects @cpumask is fully
 *                        contained in it
 */
static void refresh_cpus_flags(struct task_ctx *taskc,
const struct cpumask *cpumask)
{
const struct cpumask *cpuset;
u32 node_id;
cpuset = (const struct cpumask *)lookup_layer_cpuset(taskc->layer_id);
if (!cpuset) {
scx_bpf_error("no cpuset mask found");
return;
}
taskc->all_cpuset_allowed = bpf_cpumask_subset(cpuset, cpumask);
/* assume aligned until a partially covered node is found */
taskc->cpus_node_aligned = true;
bpf_for(node_id, 0, nr_nodes) {
struct node_ctx *nodec;
const struct cpumask *node_cpumask;
if (!(nodec = lookup_node_ctx(node_id)) ||
!(node_cpumask = cast_mask(nodec->cpumask)))
return;
/* the node overlaps @cpumask but is not entirely inside it */
if (bpf_cpumask_intersects(node_cpumask, cpumask) &&
!bpf_cpumask_subset(node_cpumask, cpumask)) {
taskc->cpus_node_aligned = false;
break;
}
}
}
/*
 * Reset a cached-cpumask descriptor by setting its id to -1.
 * Always returns 0.
 */
static int init_cached_cpus(struct cached_cpus *ccpus)
{
	ccpus->id = -1;

	return 0;
}
/*
 * Lazily allocate taskc->layered_unprotected_mask for tasks whose allowed
 * CPU mask is not full.
 *
 * Returns 0 when nothing needs to be done or on success, -ENOMEM when the
 * cpumask allocation fails, and -EINVAL when the slot unexpectedly already
 * held a mask at exchange time.
 */
static int maybe_init_task_unprotected_mask(struct task_struct *p, struct task_ctx *taskc)
{
	struct bpf_cpumask *mask;
	int err;

	/* unrestricted tasks and already-initialized tasks need nothing */
	if (bpf_cpumask_full(p->cpus_ptr) || taskc->layered_unprotected_mask)
		return 0;

	err = init_cached_cpus(&taskc->layered_cpus_unprotected);
	if (err)
		return err;

	mask = bpf_cpumask_create();
	if (!mask)
		return -ENOMEM;

	/* a non-NULL result is the previous occupant of the slot */
	mask = bpf_kptr_xchg(&taskc->layered_unprotected_mask, mask);
	if (mask) {
		bpf_cpumask_release(mask);
		return -EINVAL;
	}

	return 0;
}
/*
 * ops.set_cpumask(): the allowed CPUs of @p changed to @cpumask.
 *
 * Recomputes the affinity flags (only once the task has a layer), resets the
 * seq of every cached per-task cpumask, and makes sure the unprotected mask
 * exists if the task needs one.
 */
void BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p,
		    const struct cpumask *cpumask)
{
	struct task_ctx *taskc = lookup_task_ctx(p);

	if (!taskc)
		return;

	/* MAX_LAYERS is the "no layer yet" value set by layered_init_task() */
	if (taskc->layer_id != MAX_LAYERS)
		refresh_cpus_flags(taskc, cpumask);

	taskc->layered_cpus.seq = -1;
	taskc->layered_cpus_llc.seq = -1;
	taskc->layered_cpus_node.seq = -1;
	taskc->layered_cpus_unprotected.seq = -1;

	/* the return value is not checked here, as in the original flow */
	maybe_init_task_unprotected_mask(p, taskc);
}
/*
 * ops.update_idle(): @cpu is entering (@idle == true) or leaving idle.
 *
 * Only the transition into idle is handled: owned-execution protection is
 * dropped and the usage baseline used by layered_stopping() to compute
 * "usage since idle" is reset.
 */
void BPF_STRUCT_OPS(layered_update_idle, s32 cpu, bool idle)
{
	struct cpu_ctx *cpuc;

	if (!idle || !(cpuc = lookup_cpu_ctx(cpu)))
		return;

	/*
	 * Both protection flags derive from usage since idle, which restarts
	 * from zero here. Clear them together so that account_used() does not
	 * keep charging LAYER_USAGE_PROTECTED_PREEMPT from a stale flag until
	 * the next layered_stopping().
	 */
	cpuc->protect_owned = false;
	cpuc->protect_owned_preempt = false;
	cpuc->usage_at_idle = cpuc->usage;
}
/*
 * ops.cpu_release(): re-enqueue the tasks sitting on this CPU's local DSQ.
 * Neither @cpu nor @args is inspected.
 */
void BPF_STRUCT_OPS(layered_cpu_release, s32 cpu,
		    struct scx_cpu_release_args *args)
{
	scx_bpf_reenqueue_local();
}
/*
 * ops.init_task(): create and initialize the per-task context of @p.
 *
 * Allocates the task-local storage and the per-task cached cpumasks, then
 * fills in the defaults. Returns 0 on success, -ENOMEM on allocation failure,
 * and -EINVAL when a cpumask slot was unexpectedly already populated.
 */
s32 BPF_STRUCT_OPS(layered_init_task, struct task_struct *p,
struct scx_init_task_args *args)
{
struct task_ctx *taskc;
struct bpf_cpumask *cpumask;
s32 ret;
taskc = bpf_task_storage_get(&task_ctxs, p, 0,
BPF_LOCAL_STORAGE_GET_F_CREATE);
if (!taskc) {
scx_bpf_error("task_ctx allocation failure");
return -ENOMEM;
}
/* same pattern three times: reset the cache descriptor, then install a mask */
ret = init_cached_cpus(&taskc->layered_cpus);
if (ret)
return ret;
if (!(cpumask = bpf_cpumask_create()))
return -ENOMEM;
/* a non-NULL result means the slot already held a mask */
if ((cpumask = bpf_kptr_xchg(&taskc->layered_mask, cpumask))) {
bpf_cpumask_release(cpumask);
return -EINVAL;
}
ret = init_cached_cpus(&taskc->layered_cpus_llc);
if (ret)
return ret;
if (!(cpumask = bpf_cpumask_create()))
return -ENOMEM;
if ((cpumask = bpf_kptr_xchg(&taskc->layered_llc_mask, cpumask))) {
bpf_cpumask_release(cpumask);
return -EINVAL;
}
ret = init_cached_cpus(&taskc->layered_cpus_node);
if (ret)
return ret;
if (!(cpumask = bpf_cpumask_create()))
return -ENOMEM;
if ((cpumask = bpf_kptr_xchg(&taskc->layered_node_mask, cpumask))) {
bpf_cpumask_release(cpumask);
return -EINVAL;
}
/* only allocated when the task's allowed CPU mask is not full */
ret = maybe_init_task_unprotected_mask(p, taskc);
if (ret)
return ret;
taskc->pid = p->pid;
taskc->last_cpu = -1;
/* no layer yet; the refresh flag makes maybe_refresh_layer() assign one */
taskc->layer_id = MAX_LAYERS;
taskc->refresh_layer = true;
taskc->llc_id = MAX_LLCS;
/* NOTE(review): a layer id is initialized with MAX_LLCS rather than MAX_LAYERS - confirm */
taskc->qrt_layer_id = MAX_LLCS;
taskc->qrt_llc_id = MAX_LLCS;
/* seed the runtime average with a quarter of the default slice */
taskc->runtime_avg = slice_ns / 4;
return 0;
}
/*
 * ops.exit_task(): @p is leaving the scheduler.
 *
 * Cancelled exits are ignored. Otherwise the task's match-debug entry is
 * removed (when match debugging is enabled) and the task is subtracted from
 * its layer's nr_tasks count.
 */
void BPF_STRUCT_OPS(layered_exit_task, struct task_struct *p,
		    struct scx_exit_task_args *args)
{
	struct task_ctx *taskc;
	struct cpu_ctx *cpuc;
	u32 pid;

	if (args->cancelled)
		return;

	if (enable_match_debug) {
		pid = p->pid;
		/* pid 0 never gets a debug entry */
		if (pid)
			bpf_map_delete_elem(&layer_match_dbg, &pid);
	}

	cpuc = lookup_cpu_ctx(-1);
	if (!cpuc)
		return;

	taskc = lookup_task_ctx(p);
	if (!taskc)
		return;

	/* tasks that never got a valid layer have nothing to undo */
	if (taskc->layer_id < nr_layers)
		__sync_fetch_and_add(&layers[taskc->layer_id].nr_tasks, -1);
}
/*
 * ops.disable(): drop the queued-runtime charge held on behalf of @p.
 */
void BPF_STRUCT_OPS(layered_disable, struct task_struct *p)
{
	struct task_ctx *taskc = lookup_task_ctx(p);

	if (taskc)
		task_uncharge_qrt(taskc);
}
/*
 * Return, in milliseconds relative to @now, when the first task in @dsq_id
 * that has a task context became runnable. The value is negative when that
 * moment lies in the past. Returns 0 when no such task is found.
 */
static s64 dsq_first_runnable_at_ms(u64 dsq_id, u64 now)
{
struct task_struct *p;
bpf_for_each(scx_dsq, p, dsq_id, 0) {
struct task_ctx *taskc;
/* tasks without a context are skipped; the first one with a context decides */
if ((taskc = lookup_task_ctx(p))) {
u64 runnable_at = taskc->runnable_at;
/* subtract in the order that cannot underflow, then apply the sign */
if (runnable_at >= now)
return ((taskc->runnable_at - now) / 1000000);
else
return -((now - taskc->runnable_at) / 1000000);
}
}
return 0;
}
/*
 * Dump one 32-bit word of @cpumask as eight hex digits.
 *
 * @word:    index of the 32-CPU group to print (CPUs 32*word .. 32*word+31)
 * @cpumask: mask to read from
 *
 * CPUs at or beyond nr_cpu_ids are printed as zero bits.
 */
__hidden void dump_cpumask_word(s32 word, struct cpumask *cpumask)
{
	u32 u, v = 0;

	bpf_for(u, 0, 32) {
		s32 cpu = 32 * word + u;

		/* unsigned constant: shifting a signed 1 by 31 is undefined */
		if (cpu < nr_cpu_ids &&
		    bpf_cpumask_test_cpu(cpu, cpumask))
			v |= 1U << u;
	}

	scx_bpf_dump("%08x", v);
}
/*
 * Dump the cpumask of layer @id as comma-separated 32-bit hex words, highest
 * word first. Prints nothing when the layer has no cpumask.
 */
static void dump_layer_cpumask(int id)
{
	u32 nr_words = (nr_cpu_ids + 31) / 32;
	struct cpumask *mask = lookup_layer_cpumask(id);
	u32 idx;

	if (!mask)
		return;

	bpf_for(idx, 0, nr_words) {
		/* separator between words, not before the first one */
		if (idx != 0)
			scx_bpf_dump(",");
		dump_cpumask_word(nr_words - idx - 1, mask);
	}
}
/*
 * ops.dump(): emit scheduler state into the debug dump.
 *
 * For every layer, prints one line per LLC set in layer->llc_mask (DSQ id,
 * CPU count, queue depth, and age of the first queued task) followed by the
 * layer's cpumask. Then prints the high and low fallback DSQ of every LLC.
 */
void BPF_STRUCT_OPS(layered_dump, struct scx_dump_ctx *dctx)
{
	u64 now = scx_bpf_now();
	u64 dsq_id;
	int i, j;
	struct layer *layer;

	scx_bpf_dump_header();

	bpf_for(i, 0, nr_layers) {
		layer = lookup_layer(i);
		if (!layer) {
			scx_bpf_error("unabled to lookup layer %d", i);
			continue;
		}

		bpf_for(j, 0, nr_llcs) {
			/*
			 * Use a 64-bit constant as the other LLC bitmask tests in
			 * this file do; a plain int 1 is undefined for j >= 31 and
			 * cannot address LLC ids of 32 and above.
			 */
			if (!(layer->llc_mask & (1LLU << j)))
				continue;

			dsq_id = layer_dsq_id(layer->id, j);
			scx_bpf_dump("LAYER[%d](%s)-DSQ[%llx] nr_cpus=%u nr_queued=%d %+lldms\n",
				     i, layer->name, dsq_id, layer->nr_cpus,
				     scx_bpf_dsq_nr_queued(dsq_id),
				     dsq_first_runnable_at_ms(dsq_id, now));
		}

		scx_bpf_dump("LAYER[%d](%s) CPUS=", i, layer->name);
		dump_layer_cpumask(i);
		scx_bpf_dump("\n");
	}

	bpf_for(i, 0, nr_llcs) {
		/* label spelled out to mirror the LO_FALLBACK line below */
		dsq_id = hi_fb_dsq_id(i);
		scx_bpf_dump("HI_FALLBACK[%llx] nr_queued=%d %+lldms\n",
			     dsq_id, scx_bpf_dsq_nr_queued(dsq_id),
			     dsq_first_runnable_at_ms(dsq_id, now));

		dsq_id = lo_fb_dsq_id(i);
		scx_bpf_dump("LO_FALLBACK[%llx] nr_queued=%d %+lldms\n",
			     dsq_id, scx_bpf_dsq_nr_queued(dsq_id),
			     dsq_first_runnable_at_ms(dsq_id, now));
	}
}
/*
 * Callback of the monitor timer. Performs no work of its own; it returns
 * false when monitoring is disabled and true otherwise.
 */
static bool layered_monitor(void)
{
	return !monitor_disable;
}
/*
 * Inspect the head of DSQ @dsq_id and, if that task has been runnable for
 * more than antistall_sec seconds, flag the DSQ on one CPU the task may run
 * on so that antistall_consume() services it there.
 *
 * CPU selection takes two passes over the possible CPUs: the first picks a
 * CPU whose antistall slot is free, the second a CPU whose recorded delay is
 * smaller than this task's delay.
 *
 * Always returns 0.
 */
u64 antistall_set(u64 dsq_id, u64 jiffies_now)
{
struct task_struct *__p, *p = NULL;
struct task_ctx *taskc;
s32 cpu;
u64 *antistall_dsq, *delay, cur_delay;
int pass;
if (!dsq_id || !jiffies_now)
return 0;
bpf_rcu_read_lock();
bpf_for_each(scx_dsq, __p, dsq_id, 0) {
/* drop the reference taken in a previous iteration, if any */
if (p)
bpf_task_release(p);
/* obtain a referenced task pointer for the iterated task */
if (!(p = bpf_task_from_pid(__p->pid)))
continue;
if (!(taskc = lookup_task_ctx(p)))
goto unlock;
cur_delay = get_delay_sec(p, jiffies_now);
/* the first task that could be looked up is not stalled: nothing to do */
if (cur_delay <= antistall_sec)
goto unlock;
#pragma unroll
for (pass = 0; pass < 2; ++pass) bpf_for(cpu, 0, nr_possible_cpus) {
const struct cpumask *cpumask;
if (!(cpumask = cast_mask(taskc->layered_mask)))
goto unlock;
/* fall back to the task's allowed CPUs when the layered mask is empty */
if (bpf_cpumask_empty(cpumask))
cpumask = p->cpus_ptr;
if (!bpf_cpumask_test_cpu(cpu, cpumask))
continue;
antistall_dsq = bpf_map_lookup_percpu_elem(&antistall_cpu_dsq, &zero_u32, cpu);
delay = bpf_map_lookup_percpu_elem(&antistall_cpu_max_delay, &zero_u32, cpu);
if (!antistall_dsq || !delay) {
scx_bpf_error("cant happen");
goto unlock;
}
/* pass 0: take a free slot; pass 1: displace a less-delayed entry */
if ((pass == 0 && *antistall_dsq == SCX_DSQ_INVALID) ||
(pass != 0 && *delay < cur_delay)) {
trace("antistall set DSQ[%llu] SELECTED_CPU[%llu] DELAY[%llu]", dsq_id, cpu, cur_delay);
*delay = cur_delay;
*antistall_dsq = dsq_id;
goto unlock;
}
}
/* only the first resolvable task of the DSQ is ever examined */
goto unlock;
}
unlock:
if (p)
bpf_task_release(p);
bpf_rcu_read_unlock();
return 0;
}
/*
 * Callback of the antistall timer: run antistall_set() on every layer/LLC
 * DSQ and on both fallback DSQs of every LLC. Always returns true.
 */
static bool antistall_scan(void)
{
	u64 jiffies_now;
	u64 lid;
	s32 llc_id;

	if (!enable_antistall)
		return true;

	jiffies_now = bpf_jiffies64();

	/* per-layer, per-LLC DSQs */
	bpf_for(lid, 0, nr_layers) {
		bpf_for(llc_id, 0, nr_llcs) {
			antistall_set(layer_dsq_id(lid, llc_id), jiffies_now);
		}
	}

	/* per-LLC fallback DSQs */
	bpf_for(llc_id, 0, nr_llcs) {
		antistall_set(hi_fb_dsq_id(llc_id), jiffies_now);
		antistall_set(lo_fb_dsq_id(llc_id), jiffies_now);
	}

	return true;
}
/*
 * Dispatch the timer identified by @key to its callback and return the
 * callback's result. Keys without a callback (including NOOP_TIMER and
 * MAX_TIMERS) return false.
 */
bool run_timer_cb(int key)
{
	if (key == LAYERED_MONITOR)
		return layered_monitor();

	if (key == ANTISTALL_TIMER)
		return antistall_scan();

	return false;
}
/*
 * Timer table with one entry per timer key. The first field of each entry is
 * a duration in nanoseconds and the second a clock id.
 * NOTE(review): the entries presumably line up with LAYERED_MONITOR,
 * ANTISTALL_TIMER and NOOP_TIMER in that order, and the third field is
 * presumably a flags/init value - confirm against struct layered_timer and
 * the timer key enum.
 */
struct layered_timer layered_timers[MAX_TIMERS] = {
{15LLU * NSEC_PER_SEC, CLOCK_BOOTTIME, 0},
{1LLU * NSEC_PER_SEC, CLOCK_BOOTTIME, 0},
{0LLU, CLOCK_BOOTTIME, 0},
};
/*
 * Allocate the cpuset and cpumask of layer @layer_id and initialize the
 * cpumask as a copy of the cpuset.
 *
 * Returns 0 on success, -ENOENT when the layer has no wrapper entry in
 * layer_cpumasks, -ENOMEM when an allocation fails, and -EINVAL when the
 * masks cannot be read back.
 */
__weak int
init_layer_cpumasks(int layer_id)
{
struct bpf_cpumask *cpumask, *cpuset;
struct layer_cpumask_wrapper *cpumaskw;
if (!(cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &layer_id)))
return -ENOENT;
/* install a fresh cpuset, releasing whatever the slot held before */
cpuset = bpf_cpumask_create();
if (!cpuset)
return -ENOMEM;
cpuset = bpf_kptr_xchg(&cpumaskw->cpuset, cpuset);
if (cpuset)
bpf_cpumask_release(cpuset);
/* same for the cpumask */
cpumask = bpf_cpumask_create();
if (!cpumask)
return -ENOMEM;
cpumask = bpf_kptr_xchg(&cpumaskw->cpumask, cpumask);
if (cpumask)
bpf_cpumask_release(cpumask);
/* NOTE(review): presumably fills the layer's cpuset mask - confirm in the helper */
layer_cpuset_bpfmask(layer_id);
/* the masks are looked up again and read inside the RCU section */
bpf_rcu_read_lock();
if (!(cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &layer_id)) ||
!(cpumask = cpumaskw->cpumask) ||
!(cpuset = cpumaskw->cpuset)) {
bpf_rcu_read_unlock();
return -EINVAL;
}
bpf_cpumask_copy(cpumask, (const struct cpumask *)cpuset);
bpf_rcu_read_unlock();
return 0;
}
/*
 * Initialise one layer: log its configuration, validate and dump its match
 * rules, set up its cpumasks and create one DSQ per LLC.
 *
 * @layer_id: index into the layers[] array.
 *
 * Side effects: stores @layer_id in layer->id and sets
 * layer->periodically_refresh when any rule is of kind MATCH_AVG_RUNTIME.
 *
 * Returns 0 on success; -EINVAL (after scx_bpf_error()) when the match
 * configuration is malformed or contains an unknown kind; otherwise the
 * error returned by init_layer_cpumasks() or scx_bpf_create_dsq().
 */
static s32 init_layer(int layer_id)
{
	struct layer *layer = &layers[layer_id];
	int i, j, ret;

	dbg("CFG LAYER[%d][%s] min_exec_ns=%lu open=%d preempt=%d excl=%d",
	    layer_id, layer->name, layer->min_exec_ns,
	    layer->kind != LAYER_KIND_CONFINED,
	    layer->preempt, layer->excl);
	dbg("CFG disallow_open/preempt_after/protected=%lu/%lu/%d",
	    layer->disallow_open_after_ns, layer->disallow_preempt_after_ns,
	    layer->is_protected);

	layer->id = layer_id;

	if (layer->nr_match_ors > MAX_LAYER_MATCH_ORS) {
		scx_bpf_error("too many ORs");
		return -EINVAL;
	}

	/* Walk the OR-of-ANDs match table, validating and logging each rule. */
	bpf_for(i, 0, layer->nr_match_ors) {
		struct layer_match_ands *ands = MEMBER_VPTR(layers, [layer_id].matches[i]);

		if (!ands) {
			scx_bpf_error("shouldn't happen");
			return -EINVAL;
		}

		if (ands->nr_match_ands > NR_LAYER_MATCH_KINDS) {
			scx_bpf_error("too many ANDs");
			return -EINVAL;
		}

		dbg("CFG OR[%02d]", i);

		bpf_for(j, 0, ands->nr_match_ands) {
			char header[32];
			u64 header_data[1] = { j };
			struct layer_match *match;

			/* Common prefix for every debug line of this AND term. */
			bpf_snprintf(header, sizeof(header), "CFG AND[%02d]:",
				     header_data, sizeof(header_data));

			match = MEMBER_VPTR(layers, [layer_id].matches[i].matches[j]);
			if (!match) {
				scx_bpf_error("shouldn't happen");
				return -EINVAL;
			}

			switch (match->kind) {
			case MATCH_CGROUP_PREFIX:
				dbg("%s CGROUP_PREFIX \"%s\"", header, match->cgroup_prefix);
				break;
			case MATCH_COMM_PREFIX:
				dbg("%s COMM_PREFIX \"%s\"", header, match->comm_prefix);
				break;
			case MATCH_PCOMM_PREFIX:
				dbg("%s PCOMM_PREFIX \"%s\"", header, match->pcomm_prefix);
				break;
			case MATCH_NICE_ABOVE:
				dbg("%s NICE_ABOVE %d", header, match->nice);
				break;
			case MATCH_NICE_BELOW:
				dbg("%s NICE_BELOW %d", header, match->nice);
				break;
			case MATCH_NICE_EQUALS:
				dbg("%s NICE_EQUALS %d", header, match->nice);
				break;
			case MATCH_USER_ID_EQUALS:
				dbg("%s USER_ID %u", header, match->user_id);
				break;
			case MATCH_GROUP_ID_EQUALS:
				dbg("%s GROUP_ID %u", header, match->group_id);
				break;
			case MATCH_PID_EQUALS:
				dbg("%s PID %u", header, match->pid);
				break;
			case MATCH_PPID_EQUALS:
				dbg("%s PPID %u", header, match->ppid);
				break;
			case MATCH_TGID_EQUALS:
				dbg("%s TGID %u", header, match->tgid);
				break;
			case MATCH_NSPID_EQUALS:
				dbg("%s NSID %lld PID %d",
				    header, match->nsid, match->pid);
				break;
			case MATCH_NS_EQUALS:
				dbg("%s NSID %lld", header, match->nsid);
				break;
			case MATCH_SCXCMD_JOIN:
				dbg("%s SCXCMD_JOIN \"%s\"", header, match->comm_prefix);
				break;
			case MATCH_IS_GROUP_LEADER:
				dbg("%s IS_GROUP_LEADER %d", header, match->is_group_leader);
				break;
			case MATCH_IS_KTHREAD:
				dbg("%s IS_KTHREAD %d", header, match->is_kthread);
				break;
			case MATCH_USED_GPU_TID:
				dbg("%s GPU_TID %d", header, match->used_gpu_tid);
				break;
			case MATCH_USED_GPU_PID:
				dbg("%s GPU_PID %d", header, match->used_gpu_pid);
				break;
			case MATCH_AVG_RUNTIME:
				layer->periodically_refresh = true;
				dbg("%s AVG_RUNTIME [%lluus, %lluus)", header,
				    match->min_avg_runtime_us,
				    match->max_avg_runtime_us);
				/*
				 * Must not fall through: the next case would log
				 * a CGROUP_SUFFIX line from a field that does not
				 * belong to an AVG_RUNTIME match.
				 */
				break;
			case MATCH_CGROUP_SUFFIX:
				dbg("%s CGROUP_SUFFIX \"%s\"", header, match->cgroup_suffix);
				break;
			case MATCH_CGROUP_CONTAINS:
				dbg("%s CGROUP_CONTAINS \"%s\"", header, match->cgroup_substr);
				break;
			default:
				scx_bpf_error("%s Invalid kind", header);
				return -EINVAL;
			}
		}

		/* An OR term with no AND conditions is logged as the default rule. */
		if (ands->nr_match_ands == 0)
			dbg("CFG DEFAULT");
	}

	if ((ret = init_layer_cpumasks(layer_id))) {
		scx_bpf_error("could not initalize cpumasks");
		return ret;
	}

	/* One DSQ per (layer, LLC), created on the NUMA node of that LLC. */
	bpf_for(i, 0, nr_llcs) {
		u64 dsq_id = layer_dsq_id(layer_id, i);
		int node_id = llc_node_id(i);

		dbg("CFG creating DSQ 0x%llx for layer %d %s on LLC %d (node %d)",
		    dsq_id, layer_id, layer->name, i, node_id);
		ret = scx_bpf_create_dsq(dsq_id, node_id);
		if (ret < 0)
			return ret;
	}

	return 0;
}
/*
 * Per-CPU initialisation performed from layered_init().
 *
 * @cpu:                     CPU being initialised.
 * @nr_online_cpus:          counter incremented when @cpu has its bit set
 *                           in all_cpus.
 * @cpumask:                 mask that receives @cpu when its all_cpus bit
 *                           is set.
 * @tmp_big_cpumask:         mask that additionally receives @cpu when the
 *                           CPU context is flagged is_big.
 * @tmp_unprotected_cpumask: mask that receives every CPU passed in.
 *
 * Also resets the CPU's antistall DSQ slot to SCX_DSQ_INVALID, sets
 * task_layer_id to MAX_LAYERS, bounds-checks the proximity map and dumps
 * the per-CPU layer orderings through dbg().
 *
 * Returns 0 on success, -ENOMEM when the CPU context lookup fails, and
 * -EINVAL when the all_cpus byte is out of range or the proximity map is
 * longer than nr_possible_cpus / MAX_CPUS.
 */
static s32 init_cpu(s32 cpu, int *nr_online_cpus,
		    struct bpf_cpumask *cpumask,
		    struct bpf_cpumask *tmp_big_cpumask,
		    struct bpf_cpumask *tmp_unprotected_cpumask)
{
	const volatile u8 *mask_byte;
	struct cpu_prox_map *prox;
	struct cpu_ctx *ctx;
	u64 *stall_dsq;
	int idx;

	/* A missing antistall slot is tolerated; it is simply left untouched. */
	stall_dsq = bpf_map_lookup_percpu_elem(&antistall_cpu_dsq,
					       &zero_u32, cpu);
	if (stall_dsq)
		*stall_dsq = SCX_DSQ_INVALID;

	ctx = lookup_cpu_ctx(cpu);
	if (!ctx)
		return -ENOMEM;

	ctx->task_layer_id = MAX_LAYERS;
	bpf_cpumask_set_cpu(cpu, tmp_unprotected_cpumask);

	/* all_cpus is a byte-packed bitmap: byte cpu / 8, bit cpu % 8. */
	mask_byte = MEMBER_VPTR(all_cpus, [cpu / 8]);
	if (!mask_byte)
		return -EINVAL;

	if (*mask_byte & (1 << (cpu % 8))) {
		bpf_cpumask_set_cpu(cpu, cpumask);
		(*nr_online_cpus)++;
		if (ctx->is_big)
			bpf_cpumask_set_cpu(cpu, tmp_big_cpumask);
	}

	prox = &ctx->prox_map;
	dbg("CFG: CPU[%d] prox_map core/llc/node/sys=%d/%d/%d/%d",
	    cpu, prox->core_end, prox->llc_end, prox->node_end, prox->sys_end);

	if (prox->sys_end > nr_possible_cpus || prox->sys_end > MAX_CPUS) {
		scx_bpf_error("CPU %d proximity map too long", cpu);
		return -EINVAL;
	}

	/* The full proximity list is only dumped for CPU 0. */
	if (cpu == 0) {
		bpf_for(idx, 0, prox->sys_end) {
			u16 *entry = MEMBER_VPTR(prox->cpus, [idx]);

			if (entry)
				dbg("CFG: CPU[%d] prox[%d]=%d", cpu, idx, *entry);
		}
	}

	bpf_for(idx, 0, nr_op_layers + nr_gp_layers) {
		dbg("CFG: CPU[%d] ogp_layer_order[%d]=%d",
		    cpu, idx, ctx->ogp_layer_order[idx]);
	}

	bpf_for(idx, 0, nr_on_layers + nr_gn_layers) {
		dbg("CFG: CPU[%d] ogn_layer_order[%d]=%d",
		    cpu, idx, ctx->ogn_layer_order[idx]);
	}

	bpf_for(idx, 0, nr_op_layers) {
		dbg("CFG: CPU[%d] op_layer_order[%d]=%d",
		    cpu, idx, ctx->op_layer_order[idx]);
	}

	bpf_for(idx, 0, nr_on_layers) {
		dbg("CFG: CPU[%d] on_layer_order[%d]=%d",
		    cpu, idx, ctx->on_layer_order[idx]);
	}

	bpf_for(idx, 0, nr_gp_layers) {
		dbg("CFG: CPU[%d] gp_layer_order[%d]=%d",
		    cpu, idx, ctx->gp_layer_order[idx]);
	}

	bpf_for(idx, 0, nr_gn_layers) {
		dbg("CFG: CPU[%d] gn_layer_order[%d]=%d",
		    cpu, idx, ctx->gn_layer_order[idx]);
	}

	return 0;
}
/*
 * Scheduler init callback (wired up as .init in the ops table).
 *
 * Steps, in order:
 *   1. allocate three scratch cpumasks;
 *   2. run init_cpu() for every possible CPU to fill them;
 *   3. publish them into all_cpumask, big_cpumask and unprotected_cpumask;
 *   4. call create_node() for every node and create_llc() for every LLC;
 *   5. call init_layer() for every layer;
 *   6. create the hi and lo fallback DSQs of every LLC;
 *   7. start the timers via start_layered_timers().
 *
 * Returns 0 on success, -ENOMEM when a cpumask cannot be allocated, or the
 * first error reported by any of the steps above.
 */
s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
{
struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask;
int i, nr_online_cpus, ret;
/* Each allocation failure releases the masks acquired before it. */
cpumask = bpf_cpumask_create();
if (!cpumask)
return -ENOMEM;
tmp_big_cpumask = bpf_cpumask_create();
if (!tmp_big_cpumask) {
bpf_cpumask_release(cpumask);
return -ENOMEM;
}
tmp_unprotected_cpumask = bpf_cpumask_create();
if (!tmp_unprotected_cpumask) {
bpf_cpumask_release(tmp_big_cpumask);
bpf_cpumask_release(cpumask);
return -ENOMEM;
}
nr_online_cpus = 0;
bpf_for(i, 0, nr_possible_cpus) {
ret = init_cpu(i, &nr_online_cpus, cpumask, tmp_big_cpumask, tmp_unprotected_cpumask);
if (ret != 0) {
/* The masks are still owned here, so all three must be dropped. */
bpf_cpumask_release(cpumask);
bpf_cpumask_release(tmp_big_cpumask);
bpf_cpumask_release(tmp_unprotected_cpumask);
return ret;
}
}
/*
 * Move each scratch mask into its global slot. bpf_kptr_xchg() returns the
 * pointer previously stored there, which is released when non-NULL.
 */
cpumask = bpf_kptr_xchg(&all_cpumask, cpumask);
if (cpumask)
bpf_cpumask_release(cpumask);
tmp_big_cpumask = bpf_kptr_xchg(&big_cpumask, tmp_big_cpumask);
if (tmp_big_cpumask)
bpf_cpumask_release(tmp_big_cpumask);
tmp_unprotected_cpumask = bpf_kptr_xchg(&unprotected_cpumask, tmp_unprotected_cpumask);
if (tmp_unprotected_cpumask)
bpf_cpumask_release(tmp_unprotected_cpumask);
/* Topology objects: one per node, then one per LLC. */
bpf_for(i, 0, nr_nodes) {
ret = create_node(i);
if (ret)
return ret;
}
bpf_for(i, 0, nr_llcs) {
ret = create_llc(i);
if (ret)
return ret;
}
dbg("CFG: Dumping configuration, nr_online_cpus=%d smt_enabled=%d little_cores=%d",
nr_online_cpus, smt_enabled, has_little_cores);
dbg("CFG: min_open_layer_disallow_open/preempt_after=%lu/%lu",
min_open_layer_disallow_open_after_ns, min_open_layer_disallow_preempt_after_ns);
bpf_for(i, 0, nr_layers) {
ret = init_layer(i);
if (ret != 0)
return ret;
}
/* Per-LLC fallback DSQs, created on the node returned by llc_node_id(). */
bpf_for(i, 0, nr_llcs) {
u64 dsq_id;
dsq_id = hi_fb_dsq_id(i);
dbg("CFG creating hi fallback DSQ 0x%llx on LLC %d", dsq_id, i);
ret = scx_bpf_create_dsq(dsq_id, llc_node_id(i));
if (ret < 0)
return ret;
dsq_id = lo_fb_dsq_id(i);
dbg("CFG creating lo fallback DSQ 0x%llx on LLC %d", dsq_id, i);
ret = scx_bpf_create_dsq(dsq_id, llc_node_id(i));
if (ret < 0)
return ret;
}
/* Only a negative return from start_layered_timers() is an error. */
ret = start_layered_timers();
if (ret < 0)
return ret;
return 0;
}
/*
 * Scheduler exit callback (wired up as .exit in the ops table).
 *
 * @ei: exit information supplied by the caller; it is passed to UEI_RECORD()
 *      together with the uei object.
 */
void BPF_STRUCT_OPS(layered_exit, struct scx_exit_info *ei)
{
UEI_RECORD(uei, ei);
}
/*
 * Operations table for the scheduler named "layered". Each member points at
 * the matching layered_* callback; flags are SCX_OPS_KEEP_BUILTIN_IDLE and
 * SCX_OPS_ENQ_LAST.
 */
SCX_OPS_DEFINE(layered,
.select_cpu = (void *)layered_select_cpu,
.enqueue = (void *)layered_enqueue,
.dispatch = (void *)layered_dispatch,
.tick = (void *)layered_tick,
.runnable = (void *)layered_runnable,
.running = (void *)layered_running,
.stopping = (void *)layered_stopping,
.yield = (void *)layered_yield,
.set_weight = (void *)layered_set_weight,
.set_cpumask = (void *)layered_set_cpumask,
.update_idle = (void *)layered_update_idle,
.cpu_release = (void *)layered_cpu_release,
.init_task = (void *)layered_init_task,
.exit_task = (void *)layered_exit_task,
.disable = (void *)layered_disable,
.dump = (void *)layered_dump,
.init = (void *)layered_init,
.exit = (void *)layered_exit,
.flags = SCX_OPS_KEEP_BUILTIN_IDLE |
SCX_OPS_ENQ_LAST,
.name = "layered");