use serde::{Deserialize, Serialize};
use super::btf_offsets::{TaskEnrichmentOffsets, pid_type};
use super::guest::GuestKernel;
use super::idr::translate_any_kva;
/// Size in bytes of `task_struct::comm` (mirrors the Linux kernel's
/// `TASK_COMM_LEN`): a fixed 16-byte, NUL-terminated thread-name buffer.
const TASK_COMM_LEN: usize = 16;
/// Kernel virtual addresses of the known `*_sched_class` singletons,
/// resolved from the guest kernel's symbol table.
///
/// An entry is `None` when the symbol was not found — e.g. `ext` on kernels
/// built without sched_ext support.
#[derive(Debug, Clone, Default)]
pub struct SchedClassRegistry {
    /// KVA of `fair_sched_class` (CFS/EEVDF).
    pub fair: Option<u64>,
    /// KVA of `rt_sched_class`.
    pub rt: Option<u64>,
    /// KVA of `dl_sched_class` (deadline).
    pub dl: Option<u64>,
    /// KVA of `idle_sched_class`.
    pub idle: Option<u64>,
    /// KVA of `stop_sched_class`.
    pub stop: Option<u64>,
    /// KVA of `ext_sched_class` (sched_ext), if present.
    pub ext: Option<u64>,
}
#[allow(dead_code)]
impl SchedClassRegistry {
    /// Resolve every known `*_sched_class` symbol from the guest kernel's
    /// symbol table; absent symbols simply yield `None` entries.
    pub fn from_guest_kernel(kernel: &GuestKernel<'_>) -> Self {
        Self {
            fair: kernel.symbol_kva("fair_sched_class"),
            rt: kernel.symbol_kva("rt_sched_class"),
            dl: kernel.symbol_kva("dl_sched_class"),
            idle: kernel.symbol_kva("idle_sched_class"),
            stop: kernel.symbol_kva("stop_sched_class"),
            ext: kernel.symbol_kva("ext_sched_class"),
        }
    }

    /// Map a `task_struct::sched_class` pointer back to a class name.
    ///
    /// Returns `None` for a null pointer or for an address that matches no
    /// resolved class symbol.
    pub fn decode(&self, sched_class_kva: u64) -> Option<&'static str> {
        if sched_class_kva == 0 {
            return None;
        }
        // Address → name lookup table; unresolved (None) entries never match.
        let candidates = [
            (self.fair, "fair"),
            (self.rt, "rt"),
            (self.dl, "dl"),
            (self.idle, "idle"),
            (self.stop, "stop"),
            (self.ext, "ext"),
        ];
        candidates
            .into_iter()
            .find_map(|(kva, name)| (kva == Some(sched_class_kva)).then_some(name))
    }
}
/// Start addresses (KVAs) of the kernel's lock slow-path functions, used to
/// classify a sampled program counter as "waiting on a lock".
///
/// An entry is `None` when the corresponding symbol was not found in the
/// guest kernel.
#[derive(Debug, Clone, Default)]
pub struct LockSlowpathRegistry {
    /// Start of `queued_spin_lock_slowpath`.
    pub queued_spin_lock_slowpath: Option<u64>,
    /// Start of `__mutex_lock_slowpath`, or `__mutex_lock` as a fallback.
    pub mutex_lock_slowpath: Option<u64>,
    /// Start of `rwsem_down_read_slowpath`.
    pub rwsem_down_read_slowpath: Option<u64>,
    /// Start of `rwsem_down_write_slowpath`.
    pub rwsem_down_write_slowpath: Option<u64>,
}
/// Assumed upper bound (bytes) on the size of each lock slow-path function.
/// Only start addresses are available from the symbol table, so PC matching
/// uses the heuristic window `[start, start + LOCK_SLOWPATH_FN_MAX_SIZE)`.
const LOCK_SLOWPATH_FN_MAX_SIZE: u64 = 4096;
#[allow(dead_code)]
impl LockSlowpathRegistry {
    /// Resolve the lock slow-path entry points from guest kernel symbols.
    ///
    /// For mutexes, `__mutex_lock_slowpath` is tried first and `__mutex_lock`
    /// is used as a fallback for kernels where the dedicated slow-path symbol
    /// is absent.
    pub fn from_guest_kernel(kernel: &GuestKernel<'_>) -> Self {
        Self {
            queued_spin_lock_slowpath: kernel.symbol_kva("queued_spin_lock_slowpath"),
            mutex_lock_slowpath: kernel
                .symbol_kva("__mutex_lock_slowpath")
                .or_else(|| kernel.symbol_kva("__mutex_lock")),
            rwsem_down_read_slowpath: kernel.symbol_kva("rwsem_down_read_slowpath"),
            rwsem_down_write_slowpath: kernel.symbol_kva("rwsem_down_write_slowpath"),
        }
    }

    /// Return the name of the slow-path function whose heuristic window
    /// `[start, start + LOCK_SLOWPATH_FN_MAX_SIZE)` contains `pc`, if any.
    ///
    /// Fix: the previous implementation computed `start + MAX_SIZE` with
    /// `checked_add` and bailed out on overflow, so a `pc` inside a window
    /// that abuts the top of the 64-bit address space could never match.
    /// `pc.wrapping_sub(start) < MAX_SIZE` is overflow-proof: the wrapped
    /// difference is huge whenever `pc < start` (no match), and otherwise it
    /// is exactly the in-window distance — identical behavior everywhere
    /// except at the former overflow edge.
    pub fn match_pc(&self, pc: u64) -> Option<&'static str> {
        let probe = |start: Option<u64>, name: &'static str| -> Option<&'static str> {
            let s = start?;
            (pc.wrapping_sub(s) < LOCK_SLOWPATH_FN_MAX_SIZE).then_some(name)
        };
        probe(self.queued_spin_lock_slowpath, "queued_spin_lock_slowpath")
            .or_else(|| probe(self.mutex_lock_slowpath, "mutex_lock_slowpath"))
            .or_else(|| probe(self.rwsem_down_read_slowpath, "rwsem_down_read_slowpath"))
            .or_else(|| probe(self.rwsem_down_write_slowpath, "rwsem_down_write_slowpath"))
    }
}
/// Snapshot of scheduling and identity data extracted from a guest
/// `task_struct` by [`walk_task_enrichment`].
///
/// Optional fields are best-effort: they remain `None` when the relevant
/// guest pointer was null, untranslatable, or the field offset was unknown,
/// and are omitted from serialized output via `skip_serializing_if`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct TaskEnrichment {
    /// Thread id (`task_struct::pid`).
    pub pid: i32,
    /// Thread-group (process) id (`task_struct::tgid`).
    pub tgid: i32,
    /// Thread name from `task_struct::comm`, truncated at the first NUL.
    pub comm: String,
    /// Pid of the thread-group leader, followed via `group_leader`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub group_leader_pid: Option<i32>,
    /// Pid of `real_parent`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub real_parent_pid: Option<i32>,
    /// `comm` of `real_parent`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub real_parent_comm: Option<String>,
    /// Process-group id, read from `signal_struct::pids[PGID]`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pgid: Option<i32>,
    /// Session id, read from `signal_struct::pids[SID]`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sid: Option<i32>,
    /// `signal_struct::nr_threads`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub nr_threads: Option<i32>,
    // Scheduling parameters read directly from the task_struct.
    pub weight: u32,
    pub prio: i32,
    pub static_prio: i32,
    pub normal_prio: i32,
    pub rt_priority: u32,
    /// Decoded name of the task's current sched class ("fair", "rt", ...).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sched_class: Option<String>,
    /// Core-scheduling cookie, when the offset is known for this kernel.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub core_cookie: Option<u64>,
    /// True when the task is runnable in sched_ext yet its current class is
    /// not `ext_sched_class` (see `walk_task_enrichment`).
    pub pi_boosted_out_of_scx: bool,
    /// Voluntary context switches (`task_struct::nvcsw`).
    pub nvcsw: u64,
    /// Involuntary context switches (`task_struct::nivcsw`).
    pub nivcsw: u64,
    /// Aggregated voluntary switches from `signal_struct`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub signal_nvcsw: Option<u64>,
    /// Aggregated involuntary switches from `signal_struct`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub signal_nivcsw: Option<u64>,
    /// Name of the lock slow-path function the sampled PC fell into, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub lock_slowpath_match: Option<String>,
}
/// Walk the guest `task_struct` at `task_kva` and collect scheduling and
/// identity fields into a [`TaskEnrichment`].
///
/// * `offsets` — field offsets for `task_struct` / `signal_struct` / pid
///   structures (see [`TaskEnrichmentOffsets`]).
/// * `classes` — resolved `sched_class` addresses, used to name the task's
///   current class.
/// * `locks` — lock slow-path symbol windows, matched against `pc` if given.
/// * `is_runnable_in_scx` — caller-supplied flag, combined with the class
///   pointer to derive `pi_boosted_out_of_scx`.
///
/// Returns `None` only when `task_kva` itself cannot be translated to a
/// physical address. Failures while chasing secondary pointers (group
/// leader, real parent, `signal`) degrade to `None` fields instead of
/// aborting the walk.
#[allow(dead_code)]
pub fn walk_task_enrichment(
    kernel: &GuestKernel<'_>,
    task_kva: u64,
    offsets: &TaskEnrichmentOffsets,
    classes: &SchedClassRegistry,
    locks: &LockSlowpathRegistry,
    is_runnable_in_scx: bool,
    pc: Option<u64>,
) -> Option<TaskEnrichment> {
    let mem = kernel.mem();
    let walk = kernel.walk_context();
    // Hard requirement: the task pointer must translate or the walk aborts.
    let task_pa = translate_any_kva(
        mem,
        walk.cr3_pa,
        walk.page_offset,
        task_kva,
        walk.l5,
        walk.tcr_el1,
    )?;
    // Identity and priority fields read straight out of the task_struct.
    let pid = mem.read_u32(task_pa, offsets.task_struct_pid) as i32;
    let tgid = mem.read_u32(task_pa, offsets.task_struct_tgid) as i32;
    let comm = read_comm(mem, task_pa, offsets.task_struct_comm);
    let prio = mem.read_u32(task_pa, offsets.task_struct_prio) as i32;
    let static_prio = mem.read_u32(task_pa, offsets.task_struct_static_prio) as i32;
    let normal_prio = mem.read_u32(task_pa, offsets.task_struct_normal_prio) as i32;
    let rt_priority = mem.read_u32(task_pa, offsets.task_struct_rt_priority);
    let sched_class_kva = mem.read_u64(task_pa, offsets.task_struct_sched_class);
    let sched_class = classes.decode(sched_class_kva).map(str::to_string);
    // Weight lives inside the embedded scx entity (task->scx + see_weight).
    // NOTE(review): presumably `sched_ext_entity::weight` — confirm against
    // the btf_offsets derivation.
    let weight = mem.read_u32(task_pa, offsets.task_struct_scx + offsets.see_weight);
    // core_cookie offset is optional (kernel may lack core scheduling).
    let core_cookie = offsets
        .task_struct_core_cookie
        .map(|off| mem.read_u64(task_pa, off));
    // Runnable in sched_ext but currently on a different class: the task was
    // moved out of scx (field name suggests via priority inheritance).
    // Requires ext_sched_class to be resolved so "different" is meaningful.
    let pi_boosted_out_of_scx =
        is_runnable_in_scx && classes.ext.is_some() && Some(sched_class_kva) != classes.ext;
    let nvcsw = mem.read_u64(task_pa, offsets.task_struct_nvcsw);
    let nivcsw = mem.read_u64(task_pa, offsets.task_struct_nivcsw);
    // Best-effort pointer chases; each yields None on null/untranslatable.
    let group_leader_kva = mem.read_u64(task_pa, offsets.task_struct_group_leader);
    let group_leader_pid =
        follow_task_for_pid(mem, walk, group_leader_kva, offsets.task_struct_pid);
    let real_parent_kva = mem.read_u64(task_pa, offsets.task_struct_real_parent);
    let (real_parent_pid, real_parent_comm) = follow_task_for_pid_and_comm(
        mem,
        walk,
        real_parent_kva,
        offsets.task_struct_pid,
        offsets.task_struct_comm,
    );
    // signal_struct-derived fields: all five stay None if the pointer is
    // null or cannot be translated.
    let signal_kva = mem.read_u64(task_pa, offsets.task_struct_signal);
    let (nr_threads, signal_nvcsw, signal_nivcsw, pgid, sid) = if signal_kva == 0 {
        (None, None, None, None, None)
    } else {
        match translate_any_kva(
            mem,
            walk.cr3_pa,
            walk.page_offset,
            signal_kva,
            walk.l5,
            walk.tcr_el1,
        ) {
            None => (None, None, None, None, None),
            Some(signal_pa) => {
                let nr_threads_v = mem.read_u32(signal_pa, offsets.signal_struct_nr_threads) as i32;
                let signal_nvcsw_v = mem.read_u64(signal_pa, offsets.signal_struct_nvcsw);
                let signal_nivcsw_v = mem.read_u64(signal_pa, offsets.signal_struct_nivcsw);
                // pgid/sid come from the struct pid pointers stored in
                // signal->pids[PGID] and signal->pids[SID].
                let pgid_v = read_pid_nr_at_index(
                    mem,
                    walk,
                    signal_pa,
                    offsets.signal_struct_pids,
                    pid_type::PGID,
                    offsets.pid_numbers,
                    offsets.upid_size,
                    offsets.upid_nr,
                );
                let sid_v = read_pid_nr_at_index(
                    mem,
                    walk,
                    signal_pa,
                    offsets.signal_struct_pids,
                    pid_type::SID,
                    offsets.pid_numbers,
                    offsets.upid_size,
                    offsets.upid_nr,
                );
                (
                    Some(nr_threads_v),
                    Some(signal_nvcsw_v),
                    Some(signal_nivcsw_v),
                    pgid_v,
                    sid_v,
                )
            }
        }
    };
    // Only attempt lock classification when a sampled PC was supplied.
    let lock_slowpath_match = pc.and_then(|p| locks.match_pc(p)).map(str::to_string);
    Some(TaskEnrichment {
        pid,
        tgid,
        comm,
        group_leader_pid,
        real_parent_pid,
        real_parent_comm,
        pgid,
        sid,
        nr_threads,
        weight,
        prio,
        static_prio,
        normal_prio,
        rt_priority,
        sched_class,
        core_cookie,
        pi_boosted_out_of_scx,
        nvcsw,
        nivcsw,
        signal_nvcsw,
        signal_nivcsw,
        lock_slowpath_match,
    })
}
/// Read the fixed-size `comm` buffer at `task_pa + comm_off` from guest
/// memory and return the bytes before the first NUL, lossily decoded as
/// UTF-8 (a full 16-byte name with no terminator is returned whole).
fn read_comm(mem: &super::reader::GuestMem, task_pa: u64, comm_off: usize) -> String {
    let mut raw = [0u8; TASK_COMM_LEN];
    mem.read_bytes(task_pa + comm_off as u64, &mut raw);
    // Length up to (not including) the first NUL byte.
    let len = raw.iter().take_while(|&&b| b != 0).count();
    String::from_utf8_lossy(&raw[..len]).into_owned()
}
/// Dereference a `task_struct *` in guest memory and read its pid and comm.
/// Returns `(None, None)` for a null or untranslatable pointer.
fn follow_task_for_pid_and_comm(
    mem: &super::reader::GuestMem,
    walk: super::reader::WalkContext,
    task_kva: u64,
    pid_off: usize,
    comm_off: usize,
) -> (Option<i32>, Option<String>) {
    if task_kva == 0 {
        return (None, None);
    }
    match translate_any_kva(
        mem,
        walk.cr3_pa,
        walk.page_offset,
        task_kva,
        walk.l5,
        walk.tcr_el1,
    ) {
        Some(task_pa) => (
            Some(mem.read_u32(task_pa, pid_off) as i32),
            Some(read_comm(mem, task_pa, comm_off)),
        ),
        None => (None, None),
    }
}
/// Dereference a `task_struct *` in guest memory and read only its pid.
/// Returns `None` for a null or untranslatable pointer.
fn follow_task_for_pid(
    mem: &super::reader::GuestMem,
    walk: super::reader::WalkContext,
    task_kva: u64,
    pid_off: usize,
) -> Option<i32> {
    // Treat a null pointer the same as a failed translation.
    let task_pa = (task_kva != 0).then_some(task_kva).and_then(|kva| {
        translate_any_kva(
            mem,
            walk.cr3_pa,
            walk.page_offset,
            kva,
            walk.l5,
            walk.tcr_el1,
        )
    })?;
    Some(mem.read_u32(task_pa, pid_off) as i32)
}
/// Read `pid->numbers[0].nr` for the `struct pid` stored at
/// `signal->pids[idx]`, where `idx` is a `pid_type` index (PGID or SID at
/// the call sites).
///
/// Only the level-0 (root-namespace) `upid` entry is read, so the per-level
/// `upid` stride is not needed; the `_upid_size` parameter is retained for
/// signature stability with callers that pass it.
#[allow(clippy::too_many_arguments)]
fn read_pid_nr_at_index(
    mem: &super::reader::GuestMem,
    walk: super::reader::WalkContext,
    signal_pa: u64,
    pids_off: usize,
    idx: usize,
    numbers_off: usize,
    _upid_size: usize,
    nr_off: usize,
) -> Option<i32> {
    // signal_struct::pids is an array of `struct pid *` (8-byte slots).
    let pid_kva = mem.read_u64(signal_pa, pids_off + idx * 8);
    if pid_kva == 0 {
        return None;
    }
    let pid_pa = translate_any_kva(
        mem,
        walk.cr3_pa,
        walk.page_offset,
        pid_kva,
        walk.l5,
        walk.tcr_el1,
    )?;
    Some(mem.read_u32(pid_pa, numbers_off + nr_off) as i32)
}
#[cfg(test)]
mod tests {
    use super::*;

    // decode() maps each resolved class address back to its name.
    #[test]
    fn sched_class_registry_decode_known_class() {
        let r = SchedClassRegistry {
            fair: Some(0xffff_ffff_8000_1000),
            rt: Some(0xffff_ffff_8000_1100),
            dl: None,
            idle: None,
            stop: None,
            ext: Some(0xffff_ffff_8000_1300),
        };
        assert_eq!(r.decode(0xffff_ffff_8000_1000), Some("fair"));
        assert_eq!(r.decode(0xffff_ffff_8000_1100), Some("rt"));
        assert_eq!(r.decode(0xffff_ffff_8000_1300), Some("ext"));
    }

    // Unrecognized addresses and a null pointer both decode to None.
    #[test]
    fn sched_class_registry_decode_unknown_returns_none() {
        let r = SchedClassRegistry {
            fair: Some(0xffff_ffff_8000_1000),
            rt: None,
            dl: None,
            idle: None,
            stop: None,
            ext: None,
        };
        assert_eq!(r.decode(0xffff_ffff_8000_2000), None);
        assert_eq!(r.decode(0), None);
    }

    // match_pc() windows are [start, start + 4096): the last byte of the
    // window matches, one past it (and anything below start) does not.
    #[test]
    fn lock_slowpath_match_within_window() {
        let r = LockSlowpathRegistry {
            queued_spin_lock_slowpath: Some(0xffff_ffff_8001_0000),
            mutex_lock_slowpath: Some(0xffff_ffff_8002_0000),
            rwsem_down_read_slowpath: None,
            rwsem_down_write_slowpath: None,
        };
        assert_eq!(
            r.match_pc(0xffff_ffff_8001_0010),
            Some("queued_spin_lock_slowpath")
        );
        assert_eq!(
            r.match_pc(0xffff_ffff_8002_0fff),
            Some("mutex_lock_slowpath")
        );
        assert!(r.match_pc(0xffff_ffff_8001_2000).is_none());
        assert!(r.match_pc(0xffff_ffff_8000_ffff).is_none());
    }

    #[test]
    fn lock_slowpath_no_match_when_all_none() {
        let r = LockSlowpathRegistry::default();
        assert_eq!(r.match_pc(0xdeadbeef), None);
    }

    // None-valued optional fields must be omitted from the JSON entirely
    // (skip_serializing_if), while required fields are always present.
    #[test]
    fn task_enrichment_serde_skip_none_fields() {
        let e = TaskEnrichment {
            pid: 42,
            tgid: 42,
            comm: "ktstr_worker".to_string(),
            group_leader_pid: None,
            real_parent_pid: None,
            real_parent_comm: None,
            pgid: None,
            sid: None,
            nr_threads: None,
            weight: 100,
            prio: 120,
            static_prio: 120,
            normal_prio: 120,
            rt_priority: 0,
            sched_class: Some("fair".to_string()),
            core_cookie: None,
            pi_boosted_out_of_scx: false,
            nvcsw: 0,
            nivcsw: 0,
            signal_nvcsw: None,
            signal_nivcsw: None,
            lock_slowpath_match: None,
        };
        let json = serde_json::to_string(&e).unwrap();
        assert!(!json.contains("group_leader_pid"));
        assert!(!json.contains("real_parent_pid"));
        assert!(!json.contains("pgid"));
        assert!(!json.contains("nr_threads"));
        assert!(!json.contains("core_cookie"));
        assert!(!json.contains("signal_nvcsw"));
        assert!(!json.contains("lock_slowpath_match"));
        assert!(json.contains("\"pid\":42"));
        assert!(json.contains("\"comm\":\"ktstr_worker\""));
        assert!(json.contains("\"weight\":100"));
        assert!(json.contains("\"sched_class\":\"fair\""));
    }

    // Fully-populated struct survives a serialize/deserialize round trip.
    #[test]
    fn task_enrichment_serde_roundtrip_populated() {
        let e = TaskEnrichment {
            pid: 1234,
            tgid: 1230,
            comm: "stress-ng".to_string(),
            group_leader_pid: Some(1230),
            real_parent_pid: Some(1),
            real_parent_comm: Some("systemd".to_string()),
            pgid: Some(1230),
            sid: Some(1),
            nr_threads: Some(8),
            weight: 200,
            prio: 100,
            static_prio: 120,
            normal_prio: 100,
            rt_priority: 50,
            sched_class: Some("rt".to_string()),
            core_cookie: Some(0xc0c01e),
            pi_boosted_out_of_scx: true,
            nvcsw: 12345,
            nivcsw: 678,
            signal_nvcsw: Some(50_000),
            signal_nivcsw: Some(1_234),
            lock_slowpath_match: Some("queued_spin_lock_slowpath".to_string()),
        };
        let json = serde_json::to_string(&e).unwrap();
        let parsed: TaskEnrichment = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.pid, 1234);
        assert_eq!(parsed.comm, "stress-ng");
        assert_eq!(parsed.real_parent_comm.as_deref(), Some("systemd"));
        assert_eq!(parsed.nr_threads, Some(8));
        assert_eq!(parsed.core_cookie, Some(0xc0c01e));
        assert!(parsed.pi_boosted_out_of_scx);
        assert_eq!(
            parsed.lock_slowpath_match.as_deref(),
            Some("queued_spin_lock_slowpath"),
        );
    }
}