use crate::monitor::btf_offsets::{find_struct, nested_member_byte_offset};
use crate::monitor::guest::GuestKernel;
use crate::monitor::idr::translate_any_kva;
use crate::vmm::wire::{
KERNEL_OP_REASON_MAX, KernelOpDirection, KernelOpEntry, KernelOpReplyPayload,
KernelOpRequestPayload, KernelOpTarget, KernelOpValue,
};
use btf_rs::Btf;
/// Upper bound on the number of list entries (leaders plus threads) that
/// `find_task_by_pid` visits before it gives up with a "walker cap exceeded"
/// error.
const MAX_TASK_WALKER_NODES: u32 = 65536;
/// Bit tested against the value read at `task_struct.__state` in
/// `validate_task_for_field_op`; a task with this bit set is rejected.
const TASK_DEAD: u32 = 0x80;
/// Lowest address `validate_kva_target` accepts for a `Kva` target. The test
/// module asserts at compile time that this value is below the 4-level
/// canonical kernel-half constant, i.e. it is the more permissive threshold.
const KERNEL_HALF_CONSERVATIVE_5LEVEL: u64 = 0xFF00_0000_0000_0000;
/// Checks that the byte range `[kva, kva + len)` lies inside the direct map
/// `[page_offset, page_offset + dram_size]`.
///
/// Returns `Ok(())` when the range fits, otherwise an `Err` whose message
/// names the failed condition:
/// * `kva` is below `page_offset`;
/// * `page_offset + dram_size` overflows `u64` (reported as internal);
/// * `kva + len` overflows `u64`;
/// * the end of the range is past the end of the direct map.
fn validate_direct_target(
    kva: u64,
    len: u64,
    page_offset: u64,
    dram_size: u64,
) -> Result<(), String> {
    if page_offset > kva {
        return Err(format!(
            "Direct kva={kva:#x} below page_offset={page_offset:#x} \
             (kva_to_pa would wrap; use Kva target for vmalloc/vmemmap)"
        ));
    }

    let Some(map_limit) = page_offset.checked_add(dram_size) else {
        return Err(format!(
            "internal: page_offset+dram_size overflow ({page_offset:#x} + {dram_size:#x})"
        ));
    };

    match kva.checked_add(len) {
        None => Err(format!("Direct kva+len overflow ({kva:#x} + {len:#x})")),
        // An end address exactly equal to the limit is still accepted.
        Some(span_end) if span_end > map_limit => Err(format!(
            "Direct kva={kva:#x} len={len} overruns direct-map end {map_limit:#x}"
        )),
        Some(_) => Ok(()),
    }
}
/// Checks a `Kva` target before it is handed to the page-walking accessors.
///
/// Rejects addresses below `KERNEL_HALF_CONSERVATIVE_5LEVEL` (using the
/// message from `user_half_kva_rejection_reason`) and ranges whose end
/// `kva + len` overflows `u64`. Mapping is not checked here.
fn validate_kva_target(kva: u64, len: u64) -> Result<(), String> {
    let in_kernel_half = kva >= KERNEL_HALF_CONSERVATIVE_5LEVEL;
    if !in_kernel_half {
        return Err(user_half_kva_rejection_reason(kva));
    }
    match kva.checked_add(len) {
        Some(_) => Ok(()),
        None => Err(format!("Kva kva+len overflow ({kva:#x} + {len:#x})")),
    }
}
/// Builds the rejection message for a `Kva` target that falls below
/// `KERNEL_HALF_CONSERVATIVE_5LEVEL`. Both the offending address and the
/// threshold are printed in `0x`-prefixed hex.
pub(super) fn user_half_kva_rejection_reason(kva: u64) -> String {
    let threshold = KERNEL_HALF_CONSERVATIVE_5LEVEL;
    format!(
        "Kva={kva:#x} below kernel-half 5-level conservative threshold {threshold:#x}; \
         use Symbol target or a KVA in the kernel address space"
    )
}
pub(super) fn dispatch_kernel_op_batch(
kernel: &GuestKernel,
btf: Option<&Btf>,
kaslr_offset: u64,
req: &KernelOpRequestPayload,
) -> KernelOpReplyPayload {
let request_id = req.request_id;
match req.direction {
KernelOpDirection::Write => {
dispatch_write_batch(kernel, btf, kaslr_offset, request_id, &req.entries)
}
KernelOpDirection::Read => {
dispatch_read_batch(kernel, btf, kaslr_offset, request_id, &req.entries)
}
}
}
/// Applies every entry of a write batch in order.
///
/// Stops at the first entry that fails and returns an error reply whose
/// reason is prefixed with `entry[<index>]: `; entries after the failing one
/// are not attempted. When all entries succeed the reply has `success: true`,
/// an empty reason and no read values.
fn dispatch_write_batch(
    kernel: &GuestKernel,
    btf: Option<&Btf>,
    kaslr_offset: u64,
    request_id: u32,
    entries: &[KernelOpEntry],
) -> KernelOpReplyPayload {
    // `find_map` short-circuits on the first failing entry.
    let first_failure = entries.iter().enumerate().find_map(|(idx, entry)| {
        dispatch_one_write(kernel, btf, kaslr_offset, &entry.target, &entry.value)
            .err()
            .map(|reason| format!("entry[{idx}]: {reason}"))
    });

    match first_failure {
        Some(reason) => error_reply(request_id, reason),
        None => KernelOpReplyPayload {
            request_id,
            success: true,
            reason: String::new(),
            read_values: Vec::new(),
        },
    }
}
/// Performs every entry of a read batch in order and collects the values.
///
/// Each entry's `value` is used as the width hint for the read. The first
/// failing entry aborts the batch with an error reply whose reason is
/// prefixed with `entry[<index>]: `; on success the reply carries one value
/// per entry, in entry order.
fn dispatch_read_batch(
    kernel: &GuestKernel,
    btf: Option<&Btf>,
    kaslr_offset: u64,
    request_id: u32,
    entries: &[KernelOpEntry],
) -> KernelOpReplyPayload {
    // Collecting into `Result<Vec<_>, _>` stops at the first `Err`.
    let outcome: Result<Vec<KernelOpValue>, String> = entries
        .iter()
        .enumerate()
        .map(|(idx, entry)| {
            dispatch_one_read(kernel, btf, kaslr_offset, &entry.target, &entry.value)
                .map_err(|reason| format!("entry[{idx}]: {reason}"))
        })
        .collect();

    match outcome {
        Ok(read_values) => KernelOpReplyPayload {
            request_id,
            success: true,
            reason: String::new(),
            read_values,
        },
        Err(reason) => error_reply(request_id, reason),
    }
}
/// Applies one write entry to guest kernel memory, dispatching on the
/// `(target, value)` pair.
///
/// * `Symbol` targets go through the `GuestKernel` symbol accessors; accessor
///   errors are wrapped with the accessor name and the symbol.
/// * `Direct` targets are bounds-checked with `validate_direct_target` before
///   the direct-map accessor runs.
/// * `Kva` targets are checked with `validate_kva_target`; a `None` from the
///   accessor is reported as "page unmapped".
/// * `PerCpuField` and `TaskField` are delegated to their helpers, which also
///   decide which value kinds they accept.
///
/// `OrU32` is a read-modify-write: the current 32-bit value is read, OR-ed
/// with the supplied bits and written back inside the same match arm.
///
/// Returns `Err(reason)` with a human-readable message on any failure.
///
/// NOTE: the test module scans this function's source text (arm patterns,
/// marker comments, number of OR sites). Keep the arm headers textually
/// stable when editing.
fn dispatch_one_write(
kernel: &GuestKernel,
btf: Option<&Btf>,
kaslr_offset: u64,
target: &KernelOpTarget,
value: &KernelOpValue,
) -> Result<(), String> {
// Bounds of the direct map, used by every `Direct` arm below.
let page_offset = kernel.page_offset();
let dram_size = kernel.mem().size();
match (target, value) {
(KernelOpTarget::Symbol(name), KernelOpValue::U32(v)) => kernel
.write_symbol_u32(name, *v)
.map_err(|e| format!("write_symbol_u32('{name}'): {e:#}")),
(KernelOpTarget::Symbol(name), KernelOpValue::U64(v)) => kernel
.write_symbol_u64(name, *v)
.map_err(|e| format!("write_symbol_u64('{name}'): {e:#}")),
(KernelOpTarget::Symbol(name), KernelOpValue::Bytes(b)) => kernel
.write_symbol_bytes(name, b)
.map_err(|e| format!("write_symbol_bytes('{name}'): {e:#}")),
(KernelOpTarget::Symbol(name), KernelOpValue::OrU32(mask)) => {
// rmw-invariant-anchor: the read and the OR-ed write-back stay inline in
// this arm; the in-file source-scan test fails if they are moved elsewhere.
let cur = kernel
.read_symbol_u32(name)
.map_err(|e| format!("read_symbol_u32('{name}') for OrU32: {e:#}"))?;
kernel
.write_symbol_u32(name, cur | mask)
.map_err(|e| format!("write_symbol_u32('{name}') for OrU32: {e:#}"))
}
(KernelOpTarget::Direct(kva), KernelOpValue::U32(v)) => {
validate_direct_target(*kva, 4, page_offset, dram_size)?;
kernel.write_direct_u32(*kva, *v);
Ok(())
}
(KernelOpTarget::Direct(kva), KernelOpValue::U64(v)) => {
validate_direct_target(*kva, 8, page_offset, dram_size)?;
kernel.write_direct_u64(*kva, *v);
Ok(())
}
(KernelOpTarget::Direct(kva), KernelOpValue::Bytes(b)) => {
validate_direct_target(*kva, b.len() as u64, page_offset, dram_size)?;
kernel.write_direct_bytes(*kva, b);
Ok(())
}
(KernelOpTarget::Direct(kva), KernelOpValue::OrU32(mask)) => {
validate_direct_target(*kva, 4, page_offset, dram_size)?;
// rmw-invariant-anchor: the read and the OR-ed write-back stay inline in
// this arm; the in-file source-scan test fails if they are moved elsewhere.
let cur = kernel.read_direct_u32(*kva);
kernel.write_direct_u32(*kva, cur | mask);
Ok(())
}
(KernelOpTarget::Kva(kva), KernelOpValue::U32(v)) => {
validate_kva_target(*kva, 4)?;
kernel
.write_kva_u32(*kva, *v)
.ok_or_else(|| format!("write_kva_u32({kva:#x}): page unmapped"))
}
(KernelOpTarget::Kva(kva), KernelOpValue::U64(v)) => {
validate_kva_target(*kva, 8)?;
kernel
.write_kva_u64(*kva, *v)
.ok_or_else(|| format!("write_kva_u64({kva:#x}): page unmapped"))
}
(KernelOpTarget::Kva(kva), KernelOpValue::Bytes(b)) => {
validate_kva_target(*kva, b.len() as u64)?;
kernel
.write_kva_bytes_chunked(*kva, b)
.ok_or_else(|| format!("write_kva_bytes_chunked({kva:#x}): page unmapped or short"))
}
(KernelOpTarget::Kva(kva), KernelOpValue::OrU32(mask)) => {
validate_kva_target(*kva, 4)?;
// rmw-invariant-anchor: the read and the OR-ed write-back stay inline in
// this arm; the in-file source-scan test fails if they are moved elsewhere.
let cur = kernel
.read_kva_u32(*kva)
.ok_or_else(|| format!("read_kva_u32({kva:#x}) for OrU32: page unmapped"))?;
kernel
.write_kva_u32(*kva, cur | mask)
.ok_or_else(|| format!("write_kva_u32({kva:#x}) for OrU32: page unmapped"))
}
// Structured targets: the helper resolves the address and handles every
// value kind itself.
(KernelOpTarget::PerCpuField { symbol, field, cpu }, value) => {
dispatch_per_cpu_field_write(kernel, btf, kaslr_offset, symbol, field, *cpu, value)
}
(
KernelOpTarget::TaskField {
pid,
expected_start_time_ns,
field,
},
value,
) => dispatch_task_field_write(
kernel,
btf,
kaslr_offset,
*pid,
*expected_start_time_ns,
field,
value,
),
}
}
/// Performs one read from guest kernel memory, dispatching on the
/// `(target, width_hint)` pair.
///
/// `width_hint` only selects the width of the read: the payload of `U32` and
/// `U64` is ignored, and for `Bytes` only the placeholder's length is used.
/// `Direct` and `Kva` targets are validated with the same helpers as the
/// write path before memory is touched.
///
/// `OrU32` is not a valid read width. For `Symbol`, `Direct` and `Kva` targets
/// it falls through to the final catch-all arm; `PerCpuField` and `TaskField`
/// match earlier, so their helpers produce the rejection instead.
///
/// Returns the value read, or `Err(reason)` with a human-readable message.
///
/// NOTE: the test module counts the `Direct` and `Kva` arm headers in this
/// function's source text and expects a validation call shortly after each.
fn dispatch_one_read(
kernel: &GuestKernel,
btf: Option<&Btf>,
kaslr_offset: u64,
target: &KernelOpTarget,
width_hint: &KernelOpValue,
) -> Result<KernelOpValue, String> {
// Bounds of the direct map, used by every `Direct` arm below.
let page_offset = kernel.page_offset();
let dram_size = kernel.mem().size();
match (target, width_hint) {
(KernelOpTarget::Symbol(name), KernelOpValue::U32(_)) => kernel
.read_symbol_u32(name)
.map(KernelOpValue::U32)
.map_err(|e| format!("read_symbol_u32('{name}'): {e:#}")),
(KernelOpTarget::Symbol(name), KernelOpValue::U64(_)) => kernel
.read_symbol_u64(name)
.map(KernelOpValue::U64)
.map_err(|e| format!("read_symbol_u64('{name}'): {e:#}")),
(KernelOpTarget::Symbol(name), KernelOpValue::Bytes(placeholder)) => kernel
.read_symbol_bytes(name, placeholder.len())
.map(KernelOpValue::Bytes)
.map_err(|e| format!("read_symbol_bytes('{name}', {}): {e:#}", placeholder.len())),
(KernelOpTarget::Direct(kva), KernelOpValue::U32(_)) => {
validate_direct_target(*kva, 4, page_offset, dram_size)?;
Ok(KernelOpValue::U32(kernel.read_direct_u32(*kva)))
}
(KernelOpTarget::Direct(kva), KernelOpValue::U64(_)) => {
validate_direct_target(*kva, 8, page_offset, dram_size)?;
Ok(KernelOpValue::U64(kernel.read_direct_u64(*kva)))
}
(KernelOpTarget::Direct(kva), KernelOpValue::Bytes(placeholder)) => {
validate_direct_target(*kva, placeholder.len() as u64, page_offset, dram_size)?;
Ok(KernelOpValue::Bytes(
kernel.read_direct_bytes(*kva, placeholder.len()),
))
}
(KernelOpTarget::Kva(kva), KernelOpValue::U32(_)) => {
validate_kva_target(*kva, 4)?;
kernel
.read_kva_u32(*kva)
.map(KernelOpValue::U32)
.ok_or_else(|| format!("read_kva_u32({kva:#x}): page unmapped"))
}
(KernelOpTarget::Kva(kva), KernelOpValue::U64(_)) => {
validate_kva_target(*kva, 8)?;
kernel
.read_kva_u64(*kva)
.map(KernelOpValue::U64)
.ok_or_else(|| format!("read_kva_u64({kva:#x}): page unmapped"))
}
(KernelOpTarget::Kva(kva), KernelOpValue::Bytes(placeholder)) => {
validate_kva_target(*kva, placeholder.len() as u64)?;
kernel
.read_kva_bytes_chunked(*kva, placeholder.len())
.map(KernelOpValue::Bytes)
.ok_or_else(|| {
format!(
"read_kva_bytes_chunked({kva:#x}, {}): page unmapped or short",
placeholder.len()
)
})
}
// Structured targets: the helper resolves the address and handles every
// width hint itself, including the unsupported ones.
(KernelOpTarget::PerCpuField { symbol, field, cpu }, width_hint) => {
dispatch_per_cpu_field_read(kernel, btf, kaslr_offset, symbol, field, *cpu, width_hint)
}
(
KernelOpTarget::TaskField {
pid,
expected_start_time_ns,
field,
},
width_hint,
) => dispatch_task_field_read(
kernel,
btf,
kaslr_offset,
*pid,
*expected_start_time_ns,
field,
width_hint,
),
// Only Symbol / Direct / Kva targets with an OrU32 hint can reach this arm.
(_, KernelOpValue::OrU32(mask)) => Err(oru32_read_rejection_reason(*mask)),
}
}
/// Maps a supported per-CPU symbol name to the name of the struct whose
/// layout is looked up in BTF for it.
///
/// Returns `Err` with a message listing the supported symbols when `symbol`
/// is not one of them. Matching is exact and case-sensitive.
fn struct_name_for_per_cpu_symbol(symbol: &str) -> Result<&'static str, String> {
    // (per-CPU symbol, struct name) pairs supported by this dispatcher.
    const KNOWN: [(&str, &str); 4] = [
        ("runqueues", "rq"),
        ("kernel_cpustat", "kernel_cpustat"),
        ("kstat", "kernel_stat"),
        ("tick_cpu_sched", "tick_sched"),
    ];

    KNOWN
        .iter()
        .find(|(known_symbol, _)| *known_symbol == symbol)
        .map(|&(_, struct_name)| struct_name)
        .ok_or_else(|| {
            format!(
                "PerCpuField: unknown per-CPU symbol '{symbol}' (v1 supports: \
                 runqueues, kernel_cpustat, kstat, tick_cpu_sched); extend \
                 struct_name_for_per_cpu_symbol + KernelSymbols::from_elf to add"
            )
        })
}
/// Resolves the guest physical address of `field` inside CPU `cpu`'s instance
/// of the per-CPU variable `symbol`.
///
/// Steps, in order (each failure returns a message prefixed with
/// `PerCpuField <symbol>.<field>[cpu=<cpu>]`, except the unknown-symbol error
/// which comes from `struct_name_for_per_cpu_symbol` unchanged):
/// 1. require BTF and a supported `symbol`;
/// 2. look up `symbol` and `__per_cpu_offset` in the symbol table;
/// 3. read entry `cpu` of the `__per_cpu_offset` array (8 bytes per entry)
///    and reject a zero entry for any CPU other than 0;
/// 4. combine template address, KASLR offset and per-CPU offset, rejecting a
///    result below the walk context's `page_offset`;
/// 5. resolve the struct and the (possibly nested) field offset through BTF;
/// 6. translate the instance address with `translate_any_kva` and add the
///    field offset.
///
/// NOTE(review): the field offset is added to the physical address of the
/// instance base — this presumes the instance is physically contiguous up to
/// the field; confirm for large structs.
fn resolve_per_cpu_field_pa(
    kernel: &GuestKernel,
    btf: Option<&Btf>,
    kaslr_offset: u64,
    symbol: &str,
    field: &str,
    cpu: u32,
) -> Result<usize, String> {
    // Common prefix of every error produced directly by this function.
    let who = format!("PerCpuField {symbol}.{field}[cpu={cpu}]");

    let Some(btf) = btf else {
        return Err(format!(
            "{who}: BTF not loaded in this coordinator — cannot resolve struct layout \
             (vmlinux must carry CONFIG_DEBUG_INFO_BTF=y output)"
        ));
    };
    let struct_name = struct_name_for_per_cpu_symbol(symbol)?;

    let Some(template_kva) = kernel.symbol_kva(symbol) else {
        return Err(format!(
            "{who}: '{symbol}' symbol absent from vmlinux symtab"
        ));
    };
    let Some(offset_table_kva) = kernel.symbol_kva("__per_cpu_offset") else {
        return Err(format!(
            "{who}: '__per_cpu_offset' symbol absent — kernel built without SMP"
        ));
    };

    // One u64 slot per CPU.
    let offset_table_pa = kernel.text_kva_to_pa(offset_table_kva);
    let slot_byte_off = (cpu as usize) * 8;
    let per_cpu_offset = kernel.mem().read_u64(offset_table_pa, slot_byte_off);
    if cpu > 0 && per_cpu_offset == 0 {
        return Err(format!(
            "{who}: __per_cpu_offset[{cpu}]=0 (cpu beyond nr_cpu_ids; kernel zero-init slot)"
        ));
    }

    let per_cpu_kva =
        crate::monitor::symbols::per_cpu_kva(template_kva, kaslr_offset, per_cpu_offset);
    let floor = kernel.walk_context().page_offset;
    if per_cpu_kva < floor {
        return Err(format!(
            "{who}: per_cpu_kva={per_cpu_kva:#x} below kernel page_offset ({floor:#x}) — \
             arithmetic wrap or broken template KVA (template={template_kva:#x} + \
             kaslr={kaslr_offset:#x} + per_cpu_off={per_cpu_offset:#x})"
        ));
    }

    let struct_t = match find_struct(btf, struct_name) {
        Ok((found, _)) => found,
        Err(e) => {
            return Err(format!(
                "{who}: 'struct {struct_name}' BTF lookup: {e:#}"
            ));
        }
    };
    let field_off = match nested_member_byte_offset(btf, &struct_t, field) {
        Ok(off) => off,
        Err(e) => {
            return Err(format!(
                "{who}: BTF nested-offset for '{field}' within '{struct_name}': {e:#}"
            ));
        }
    };

    let walk = kernel.walk_context();
    let Some(pa) = translate_any_kva(
        kernel.mem(),
        walk.cr3_pa,
        walk.page_offset,
        per_cpu_kva,
        walk.l5,
        walk.tcr_el1,
    ) else {
        return Err(format!(
            "{who}: per_cpu_kva={per_cpu_kva:#x} unmapped (translate_any_kva returned None)"
        ));
    };

    let field_pa = pa + field_off as u64;
    Ok(field_pa as usize)
}
/// Writes `value` to a field of a per-CPU variable.
///
/// The field's physical address is resolved first (so resolution errors win
/// over value-kind errors). `U32` and `U64` store the value, `OrU32` reads the
/// current 32-bit value and stores it OR-ed with the supplied bits, and
/// `Bytes` is rejected.
fn dispatch_per_cpu_field_write(
    kernel: &GuestKernel,
    btf: Option<&Btf>,
    kaslr_offset: u64,
    symbol: &str,
    field: &str,
    cpu: u32,
    value: &KernelOpValue,
) -> Result<(), String> {
    let pa = resolve_per_cpu_field_pa(kernel, btf, kaslr_offset, symbol, field, cpu)? as u64;

    match value {
        KernelOpValue::Bytes(_) => {
            return Err(format!(
                "PerCpuField {symbol}.{field}[cpu={cpu}]: Bytes write not supported \
                 (per-CPU scheduler fields are scalars)"
            ));
        }
        KernelOpValue::U32(word) => {
            kernel.mem().write_u32(pa, 0, *word);
        }
        KernelOpValue::U64(quad) => {
            kernel.mem().write_u64(pa, 0, *quad);
        }
        KernelOpValue::OrU32(bits) => {
            let current = kernel.mem().read_u32(pa, 0);
            kernel.mem().write_u32(pa, 0, current | bits);
        }
    }
    Ok(())
}
/// Reads a field of a per-CPU variable with the width selected by
/// `width_hint`.
///
/// The field's physical address is resolved first. `U32` and `U64` hints
/// produce a value of that width; `Bytes` and `OrU32` hints are rejected.
fn dispatch_per_cpu_field_read(
    kernel: &GuestKernel,
    btf: Option<&Btf>,
    kaslr_offset: u64,
    symbol: &str,
    field: &str,
    cpu: u32,
    width_hint: &KernelOpValue,
) -> Result<KernelOpValue, String> {
    let pa = resolve_per_cpu_field_pa(kernel, btf, kaslr_offset, symbol, field, cpu)? as u64;

    let value = match width_hint {
        KernelOpValue::U32(_) => KernelOpValue::U32(kernel.mem().read_u32(pa, 0)),
        KernelOpValue::U64(_) => KernelOpValue::U64(kernel.mem().read_u64(pa, 0)),
        KernelOpValue::Bytes(_) => {
            return Err(format!(
                "PerCpuField {symbol}.{field}[cpu={cpu}]: Bytes read not supported"
            ));
        }
        KernelOpValue::OrU32(_) => {
            return Err(format!(
                "PerCpuField {symbol}.{field}[cpu={cpu}]: OrU32 has no read semantic"
            ));
        }
    };
    Ok(value)
}
/// Width, in nanoseconds, of the window in which `validate_task_for_field_op`
/// accepts a task's `start_time`: the observed value must be at or after the
/// expected value and less than this many nanoseconds later (10 ms).
/// NOTE(review): the name suggests this matches the tick granularity of the
/// value the requester obtained from procfs — confirm against the caller.
const START_TIME_PROC_TICK_NS: u64 = 10_000_000;
/// Byte offsets, resolved from BTF, of the `task_struct` / `signal_struct`
/// members that the task walker and the task validator read.
struct TaskValidationOffsets {
/// Offset of `task_struct.pid`.
pid: usize,
/// Offset of `task_struct.start_time`.
start_time: usize,
/// Offset of `task_struct.__state`.
state: usize,
/// Offset of `task_struct.on_rq`.
on_rq: usize,
/// Offset of `task_struct.scx.dsq`.
scx_dsq: usize,
/// Offset of `task_struct.scx.runnable_node`.
scx_runnable_node: usize,
/// Offset of `task_struct.sched_class`.
sched_class: usize,
/// Offset of `task_struct.start_boottime`.
start_boottime: usize,
/// Offset of `task_struct.tasks` (the list node walked from `init_task`).
tasks: usize,
/// Offset of `task_struct.signal`.
signal: usize,
/// Offset of `signal_struct.thread_head` (relative to `signal_struct`, not
/// `task_struct`).
signal_thread_head: usize,
/// Offset of `task_struct.thread_node`.
thread_node: usize,
}
impl TaskValidationOffsets {
    /// Resolves every offset from `btf`.
    ///
    /// Looks up `struct task_struct` and `struct signal_struct`, then each
    /// member path. The first lookup that fails aborts with an `Err` whose
    /// message names the struct or member involved.
    fn resolve_from_btf(btf: &Btf) -> Result<Self, String> {
        let task_struct_t = match find_struct(btf, "task_struct") {
            Ok((found, _)) => found,
            Err(e) => return Err(format!("BTF: 'struct task_struct' lookup: {e:#}")),
        };
        // Offset of a (possibly dotted) member path inside `task_struct`.
        let in_task = |path: &str| -> Result<usize, String> {
            match nested_member_byte_offset(btf, &task_struct_t, path) {
                Ok(off) => Ok(off),
                Err(e) => Err(format!("BTF: task_struct.{path} offset: {e:#}")),
            }
        };

        let signal_struct_t = match find_struct(btf, "signal_struct") {
            Ok((found, _)) => found,
            Err(e) => return Err(format!("BTF: 'struct signal_struct' lookup: {e:#}")),
        };
        let signal_thread_head =
            match nested_member_byte_offset(btf, &signal_struct_t, "thread_head") {
                Ok(off) => off,
                Err(e) => {
                    return Err(format!("BTF: signal_struct.thread_head offset: {e:#}"));
                }
            };

        // Resolved in this order so the first missing member determines the
        // reported error.
        let pid = in_task("pid")?;
        let start_time = in_task("start_time")?;
        let state = in_task("__state")?;
        let on_rq = in_task("on_rq")?;
        let scx_dsq = in_task("scx.dsq")?;
        let scx_runnable_node = in_task("scx.runnable_node")?;
        let sched_class = in_task("sched_class")?;
        let start_boottime = in_task("start_boottime")?;
        let tasks = in_task("tasks")?;
        let signal = in_task("signal")?;
        let thread_node = in_task("thread_node")?;

        Ok(Self {
            pid,
            start_time,
            state,
            on_rq,
            scx_dsq,
            scx_runnable_node,
            sched_class,
            start_boottime,
            tasks,
            signal,
            signal_thread_head,
            thread_node,
        })
    }
}
/// Walks the guest's task list looking for the `task_struct` whose `pid`
/// member equals `target_pid`, and returns that task's kernel virtual address.
///
/// The outer loop follows the list anchored at `init_task_kva + offs.tasks`.
/// For every entry it checks the entry's own pid and then follows the list at
/// `signal + offs.signal_thread_head` to check the other tasks linked through
/// `thread_node`. Both loops share the `visited` counter, which is capped at
/// `MAX_TASK_WALKER_NODES`.
///
/// Errors: head address overflow, a zero or self-pointing list head, cap
/// exceeded, an unmapped outer-list entry, a zero `next` pointer in the outer
/// list, or pid not found. Problems inside a thread list (unmapped entries,
/// zero pointers) end or skip that inner walk without an error.
///
/// NOTE(review): `head_kva` is derived from `init_task_kva` exactly as passed
/// in and is compared with pointers read from guest memory. Confirm the caller
/// passes an address in the same address space as those pointers; otherwise
/// the `node_kva != head_kva` test cannot end the walk and a miss would only
/// be reported through the cap.
fn find_task_by_pid(
kernel: &GuestKernel,
init_task_kva: u64,
offs: &TaskValidationOffsets,
target_pid: u32,
) -> Result<u64, String> {
let mem = kernel.mem();
let walk = kernel.walk_context();
let pid_off = offs.pid;
let tasks_off = offs.tasks;
let signal_off = offs.signal;
let signal_thread_head_off = offs.signal_thread_head;
let thread_node_off = offs.thread_node;
// Address of the list head embedded in init_task.
let head_kva = init_task_kva.checked_add(tasks_off as u64).ok_or_else(|| {
format!(
"find_task_by_pid: head_kva overflow init_task={init_task_kva:#x} + \
tasks_off={tasks_off}"
)
})?;
// The head is translated with the kernel-text translation, unlike the list
// entries below, which go through the page-table walk.
let head_pa = kernel.text_kva_to_pa(head_kva);
// First word of the list head is its `next` pointer.
let mut node_kva = mem.read_u64(head_pa, 0);
if node_kva == 0 {
return Err(format!(
"find_task_by_pid: init_task.tasks.next read as 0 at head_pa={head_pa:#x} \
— head bytes unmapped or torn read"
));
}
if node_kva == head_kva {
return Err(format!(
"find_task_by_pid: init_task.tasks is empty (head.next == head) — \
no user tasks exist; cannot resolve pid={target_pid}"
));
}
let mut visited: u32 = 0;
while node_kva != head_kva {
if visited >= MAX_TASK_WALKER_NODES {
return Err(format!(
"find_task_by_pid: walker cap {MAX_TASK_WALKER_NODES} exceeded \
scanning for pid={target_pid} (visited={visited}); list may be \
corrupted (cycle) or pid_max exceeded the cap"
));
}
visited += 1;
// Step back from the embedded list node to the start of its task_struct.
let leader_kva = node_kva.wrapping_sub(tasks_off as u64);
// NOTE(review): inside this loop node_kva != head_kva and
// head_kva == init_task_kva + tasks_off, so this equality looks unreachable;
// it reads as a defensive guard.
if leader_kva == init_task_kva {
return Err(format!(
"find_task_by_pid: candidate task_kva={leader_kva:#x} equals \
init_task_kva={init_task_kva:#x} (pid=0 swapper); init_task \
is not a writable target"
));
}
let Some(leader_pa) = translate_any_kva(
mem,
walk.cr3_pa,
walk.page_offset,
leader_kva,
walk.l5,
walk.tcr_el1,
) else {
return Err(format!(
"find_task_by_pid: leader task_kva={leader_kva:#x} unmapped \
(visited={visited}); task_struct slab page not present in guest memory"
));
};
let leader_pid = mem.read_u32(leader_pa, pid_off);
if leader_pid == target_pid {
return Ok(leader_kva);
}
// Not this entry itself: scan the tasks linked from its signal struct.
// A null signal pointer or an unmapped thread-list head skips the scan.
let signal_kva = mem.read_u64(leader_pa, signal_off);
if signal_kva != 0 {
let thread_head_kva = signal_kva.wrapping_add(signal_thread_head_off as u64);
if let Some(thread_head_pa) = translate_any_kva(
mem,
walk.cr3_pa,
walk.page_offset,
thread_head_kva,
walk.l5,
walk.tcr_el1,
) {
let mut thread_node_kva = mem.read_u64(thread_head_pa, 0);
// Inner walk ends on a zero pointer or when it comes back to the head.
while thread_node_kva != 0 && thread_node_kva != thread_head_kva {
if visited >= MAX_TASK_WALKER_NODES {
return Err(format!(
"find_task_by_pid: walker cap {MAX_TASK_WALKER_NODES} \
exceeded inside thread-group of leader_pid={leader_pid} \
scanning for pid={target_pid}"
));
}
visited += 1;
// Step back from the embedded thread_node to the start of its task_struct.
let thread_kva = thread_node_kva.wrapping_sub(thread_node_off as u64);
// The outer-list entry's pid was already compared above; skip it here.
if thread_kva != leader_kva {
let Some(thread_pa) = translate_any_kva(
mem,
walk.cr3_pa,
walk.page_offset,
thread_kva,
walk.l5,
walk.tcr_el1,
) else {
// Task base is unmapped: try to translate the node itself so the walk
// can still advance; give up on this thread list if that fails too.
let Some(thread_node_pa) = translate_any_kva(
mem,
walk.cr3_pa,
walk.page_offset,
thread_node_kva,
walk.l5,
walk.tcr_el1,
) else {
break; };
thread_node_kva = mem.read_u64(thread_node_pa, 0);
continue;
};
let thread_pid = mem.read_u32(thread_pa, pid_off);
if thread_pid == target_pid {
return Ok(thread_kva);
}
}
// Read this node's `next` pointer. The helper re-translates the task base
// (or the node) to get the node's physical address.
// NOTE(review): the helper returns physical address 0 when neither
// translation succeeds, in which case this reads guest PA 0 — confirm that
// case cannot be reached from here.
let next_kva = mem.read_u64(
thread_pa_or_node(
mem,
walk.cr3_pa,
walk.page_offset,
walk.l5,
walk.tcr_el1,
thread_kva,
thread_node_kva,
thread_node_off,
),
0,
);
if next_kva == 0 {
break; }
thread_node_kva = next_kva;
}
}
}
// Advance the outer walk: `next` pointer of this entry's `tasks` node.
let next_kva = mem.read_u64(leader_pa, tasks_off);
if next_kva == 0 {
return Err(format!(
"find_task_by_pid: list_head.next read as 0 at leader_kva={leader_kva:#x} \
(visited={visited}); chain broken before finding pid={target_pid}"
));
}
node_kva = next_kva;
}
Err(format!(
"find_task_by_pid: pid={target_pid} not found in init_task.tasks \
or any leader's signal->thread_head (visited={visited} entries across \
leaders + threads)"
))
}
/// Returns the physical address of a task's `thread_node` list node.
///
/// Prefers translating the task base `thread_kva` and adding
/// `thread_node_off`; if that translation fails, falls back to translating
/// `thread_node_kva` directly. Returns `0` when both translations fail.
#[allow(clippy::too_many_arguments)]
fn thread_pa_or_node(
    mem: &crate::monitor::reader::GuestMem,
    cr3_pa: u64,
    page_offset: u64,
    l5: bool,
    tcr_el1: u64,
    thread_kva: u64,
    thread_node_kva: u64,
    thread_node_off: usize,
) -> u64 {
    let translate = |kva: u64| translate_any_kva(mem, cr3_pa, page_offset, kva, l5, tcr_el1);

    match translate(thread_kva) {
        Some(task_pa) => task_pa + thread_node_off as u64,
        None => translate(thread_node_kva).unwrap_or(0),
    }
}
/// Re-checks, at physical address `task_pa`, that the task found by the
/// walker is still the intended one and is in a state where its fields may be
/// accessed.
///
/// Checks, in order; the first failure returns `Err` with an explanatory
/// message:
/// 1. `pid` still equals `target_pid`;
/// 2. `start_time` lies in
///    `[expected_start_time_ns, expected_start_time_ns + START_TIME_PROC_TICK_NS)`;
/// 3. the `TASK_DEAD` bit is clear in `__state`;
/// 4. `on_rq` is zero;
/// 5. `scx.dsq` is null;
/// 6. `scx.runnable_node.next` is null or points at the node itself;
/// 7. `sched_class` equals `ext_sched_class_kva`;
/// 8. `start_boottime` is non-zero.
///
/// The upper bound printed in the start_time error saturates at `u64::MAX`,
/// so an `expected_start_time_ns` close to `u64::MAX` cannot overflow while
/// the message is being built.
fn validate_task_for_field_op(
    kernel: &GuestKernel,
    task_pa: u64,
    target_pid: u32,
    expected_start_time_ns: u64,
    offs: &TaskValidationOffsets,
    ext_sched_class_kva: u64,
) -> Result<(), String> {
    let mem = kernel.mem();

    let pid = mem.read_u32(task_pa, offs.pid);
    if pid != target_pid {
        return Err(format!(
            "validate_task: pid mismatch at task_pa={task_pa:#x} — read pid={pid}, \
             expected {target_pid} (likely slab-recycle since walker found this task)"
        ));
    }

    let observed_start_time = mem.read_u64(task_pa, offs.start_time);
    // `checked_sub` is `None` when the task started before the expected time.
    let within_window = matches!(
        observed_start_time.checked_sub(expected_start_time_ns),
        Some(skew) if skew < START_TIME_PROC_TICK_NS
    );
    if !within_window {
        // Saturate: `expected_start_time_ns` comes from the request and may be
        // arbitrarily large.
        let window_last = expected_start_time_ns.saturating_add(START_TIME_PROC_TICK_NS - 1);
        return Err(format!(
            "validate_task: task pid={target_pid} start_time identity mismatch — \
             observed={observed_start_time}ns expected in \
             [{expected_start_time_ns}, {window_last}]ns; \
             original task exited and PID was recycled for an unrelated task"
        ));
    }

    let state = mem.read_u32(task_pa, offs.state);
    if state & TASK_DEAD != 0 {
        return Err(format!(
            "validate_task: task pid={target_pid} is TASK_DEAD (state={state:#x}); \
             mid-teardown task fields unsafe to write"
        ));
    }

    let on_rq = mem.read_u32(task_pa, offs.on_rq);
    if on_rq != 0 {
        return Err(format!(
            "validate_task: task pid={target_pid} is on_rq={on_rq} (TASK_ON_RQ_QUEUED \
             or MIGRATING); writing scheduler fields would corrupt rb-tree / DSQ \
             ordering. Test author must use a blocking workload pattern \
             (`WorkType::FutexPingPong`, `WorkType::WaitOnFutex`, `WorkType::Sleep`) \
             so the worker is sleeping at cold-op time"
        ));
    }

    let scx_dsq_ptr = mem.read_u64(task_pa, offs.scx_dsq);
    if scx_dsq_ptr != 0 {
        return Err(format!(
            "validate_task: task pid={target_pid} has scx.dsq={scx_dsq_ptr:#x} (queued \
             on an SCX DSQ); modifying ordering keys while queued mangles ordering \
             per include/linux/sched/ext.h:248-254 (dsq_vtime warning). Test author \
             must use a blocking workload pattern \
             (`WorkType::FutexPingPong`, `WorkType::WaitOnFutex`, `WorkType::Sleep`)"
        ));
    }

    // Rebuild the node's own virtual address so a self-pointing (unlinked)
    // node can be recognised. Assumes the task_struct is addressed through
    // the direct map (pa + page_offset) — TODO confirm.
    let task_kva = task_pa.wrapping_add(kernel.page_offset());
    let runnable_node_kva = task_kva.wrapping_add(offs.scx_runnable_node as u64);
    let runnable_node_next = mem.read_u64(task_pa, offs.scx_runnable_node);
    if runnable_node_next != 0 && runnable_node_next != runnable_node_kva {
        return Err(format!(
            "validate_task: task pid={target_pid} scx.runnable_node is linked \
             (next={runnable_node_next:#x} != self={runnable_node_kva:#x}); task is \
             on a per-rq runnable_list. Test author must use a blocking workload \
             pattern (`WorkType::FutexPingPong`, `WorkType::WaitOnFutex`, \
             `WorkType::Sleep`)"
        ));
    }

    let sched_class_kva = mem.read_u64(task_pa, offs.sched_class);
    if sched_class_kva != ext_sched_class_kva {
        return Err(format!(
            "validate_task: task pid={target_pid} sched_class={sched_class_kva:#x} \
             is not ext_sched_class={ext_sched_class_kva:#x}; TaskField writes target \
             SCX-managed tasks only (CFS / RT / DL / stop / idle classes have \
             different vtime semantics — EEVDF's place_entity overwrites se.vruntime \
             on enqueue, RT/DL have RT_BANDWIDTH instant-throttle hazards). Spawn \
             the worker under `SchedPolicy::Ext` to make it SCX-managed"
        ));
    }

    let start_boottime = mem.read_u64(task_pa, offs.start_boottime);
    if start_boottime == 0 {
        return Err(format!(
            "validate_task: task pid={target_pid} start_boottime=0 — possibly a \
             freshly-zeroed slab page mid-slab-recycle; reject rather than risk \
             writing to dead memory"
        ));
    }

    Ok(())
}
/// Locates the task with `pid`, validates it, and returns its physical
/// address together with the BTF `task_struct` type for the caller's field
/// lookup.
///
/// Requires BTF plus the `init_task` and `ext_sched_class` symbols. The
/// `ext_sched_class` symbol address is shifted by `kaslr_offset` before it is
/// compared with the task's `sched_class` pointer. Any failure is returned as
/// a message prefixed with `TaskField pid=<pid>` or produced by the walker /
/// validator.
fn resolve_and_validate_task_field(
    kernel: &GuestKernel,
    btf: Option<&Btf>,
    kaslr_offset: u64,
    pid: u32,
    expected_start_time_ns: u64,
) -> Result<(u64, btf_rs::Struct), String> {
    let Some(btf) = btf else {
        return Err(format!(
            "TaskField pid={pid}: BTF not loaded in this coordinator — cannot resolve \
             task_struct layout (vmlinux must carry CONFIG_DEBUG_INFO_BTF=y output)"
        ));
    };
    let Some(init_task_kva) = kernel.symbol_kva("init_task") else {
        return Err(format!(
            "TaskField pid={pid}: init_task symbol absent from vmlinux symtab \
             (heavily stripped vmlinux); cannot anchor the task-list walker"
        ));
    };
    let Some(ext_class_link_kva) = kernel.symbol_kva("ext_sched_class") else {
        return Err(format!(
            "TaskField pid={pid}: ext_sched_class symbol absent from vmlinux symtab \
             (kernel built without CONFIG_SCHED_CLASS_EXT=y); TaskField writes are \
             SCX-only and require sched_ext support"
        ));
    };
    let ext_sched_class_kva = ext_class_link_kva.wrapping_add(kaslr_offset);

    let offsets = TaskValidationOffsets::resolve_from_btf(btf)?;
    let task_kva = find_task_by_pid(kernel, init_task_kva, &offsets, pid)?;

    // Translate again here; the walker only hands back the virtual address.
    let walk = kernel.walk_context();
    let Some(task_pa) = translate_any_kva(
        kernel.mem(),
        walk.cr3_pa,
        walk.page_offset,
        task_kva,
        walk.l5,
        walk.tcr_el1,
    ) else {
        return Err(format!(
            "TaskField pid={pid}: task_kva={task_kva:#x} unmapped at validation step \
             (slab page disappeared between walker and validator — extreme race)"
        ));
    };

    validate_task_for_field_op(
        kernel,
        task_pa,
        pid,
        expected_start_time_ns,
        &offsets,
        ext_sched_class_kva,
    )?;

    match find_struct(btf, "task_struct") {
        Ok((task_struct_t, _)) => Ok((task_pa, task_struct_t)),
        Err(e) => Err(format!(
            "TaskField pid={pid}: 'struct task_struct' BTF lookup: {e:#}"
        )),
    }
}
/// Writes `value` to `field` of the task identified by `pid` and
/// `expected_start_time_ns`.
///
/// The task is located and validated first, then `field` is resolved through
/// BTF. `U32` and `U64` values are stored at the field's offset; `Bytes` and
/// `OrU32` are rejected (after the task and field have been resolved).
fn dispatch_task_field_write(
    kernel: &GuestKernel,
    btf: Option<&Btf>,
    kaslr_offset: u64,
    pid: u32,
    expected_start_time_ns: u64,
    field: &str,
    value: &KernelOpValue,
) -> Result<(), String> {
    let (task_pa, task_struct_t) =
        resolve_and_validate_task_field(kernel, btf, kaslr_offset, pid, expected_start_time_ns)?;
    // The resolver above already returned an error if BTF was absent.
    let btf = btf.expect("checked in resolve_and_validate_task_field");

    let field_off = match nested_member_byte_offset(btf, &task_struct_t, field) {
        Ok(off) => off,
        Err(e) => {
            return Err(format!(
                "TaskField pid={pid} field={field:?}: BTF nested-offset resolution: {e:#}"
            ));
        }
    };

    match value {
        KernelOpValue::Bytes(_) => {
            return Err(format!(
                "TaskField pid={pid} field={field:?}: Bytes write not supported in v1 — \
                 use U32 or U64 (per-task scheduler fields are scalars)"
            ));
        }
        KernelOpValue::OrU32(_) => {
            return Err(format!(
                "TaskField pid={pid} field={field:?}: OrU32 RMW not supported on TaskField \
                 in v1 (no current use case; per-task scheduler fields are scalars not flags)"
            ));
        }
        KernelOpValue::U32(word) => {
            kernel.mem().write_u32(task_pa, field_off, *word);
        }
        KernelOpValue::U64(quad) => {
            kernel.mem().write_u64(task_pa, field_off, *quad);
        }
    }
    Ok(())
}
/// Reads `field` of the task identified by `pid` and
/// `expected_start_time_ns`, with the width selected by `width_hint`.
///
/// The task is located and validated first, then `field` is resolved through
/// BTF. `U32` and `U64` hints produce a value of that width; `Bytes` and
/// `OrU32` hints are rejected.
fn dispatch_task_field_read(
    kernel: &GuestKernel,
    btf: Option<&Btf>,
    kaslr_offset: u64,
    pid: u32,
    expected_start_time_ns: u64,
    field: &str,
    width_hint: &KernelOpValue,
) -> Result<KernelOpValue, String> {
    let (task_pa, task_struct_t) =
        resolve_and_validate_task_field(kernel, btf, kaslr_offset, pid, expected_start_time_ns)?;
    // The resolver above already returned an error if BTF was absent.
    let btf = btf.expect("checked in resolve_and_validate_task_field");

    let field_off = match nested_member_byte_offset(btf, &task_struct_t, field) {
        Ok(off) => off,
        Err(e) => {
            return Err(format!(
                "TaskField pid={pid} field={field:?}: BTF nested-offset resolution: {e:#}"
            ));
        }
    };

    let value = match width_hint {
        KernelOpValue::U32(_) => KernelOpValue::U32(kernel.mem().read_u32(task_pa, field_off)),
        KernelOpValue::U64(_) => KernelOpValue::U64(kernel.mem().read_u64(task_pa, field_off)),
        KernelOpValue::Bytes(_) => {
            return Err(format!(
                "TaskField pid={pid} field={field:?}: Bytes read not supported in v1 — \
                 use U32 or U64 width hint"
            ));
        }
        KernelOpValue::OrU32(_) => {
            return Err(format!(
                "TaskField pid={pid} field={field:?}: OrU32 has no read semantic (covered \
                 by the dispatcher's read-direction catch-all but explicit here for clarity)"
            ));
        }
    };
    Ok(value)
}
/// Builds the rejection message returned when an `OrU32` value is supplied as
/// the width hint of a read. The offending bits are printed in `0x`-prefixed
/// hex.
pub(super) fn oru32_read_rejection_reason(mask: u32) -> String {
    let mut reason = format!("OrU32(mask={mask:#x}) cannot be used as a Read width — ");
    reason.push_str("RMW is a write operation. For 32-bit reads use ");
    reason.push_str("`KernelValueWidth::u32()` instead.");
    reason
}
/// Builds a failed reply for `request_id` carrying `reason`.
///
/// A reason longer than `KERNEL_OP_REASON_MAX` bytes is shortened to the
/// length returned by `utf8_safe_truncate_len`; shorter reasons are passed
/// through unchanged. The reply never carries read values.
fn error_reply(request_id: u32, reason: String) -> KernelOpReplyPayload {
    let over_cap = reason.len() > KERNEL_OP_REASON_MAX;
    let reason = if over_cap {
        let mut clipped = reason;
        let keep = super::utf8_safe_truncate_len(&clipped, KERNEL_OP_REASON_MAX);
        clipped.truncate(keep);
        clipped
    } else {
        reason
    };

    KernelOpReplyPayload {
        request_id,
        success: false,
        reason,
        read_values: Vec::new(),
    }
}
#[cfg(test)]
mod tests {
// Unit tests for the dispatcher. Several tests are "source-scan" tests: they
// load this file's own text with `include_str!`, cut it at the first
// occurrence of the test-module attribute, and assert on textual patterns in
// the production half. Those tests are sensitive to how match arms are
// spelled and to comments placed near them.
use super::*;
use crate::vmm::KERNEL_HALF_CANONICAL as KERNEL_HALF_CANONICAL_4LEVEL;
// Compile-time check: the threshold used for Kva validation must be lower
// (more permissive) than the 4-level canonical kernel-half constant.
const _: () = assert!(
KERNEL_HALF_CONSERVATIVE_5LEVEL < KERNEL_HALF_CANONICAL_4LEVEL,
"5-level threshold must be permissively lower than 4-level canonical",
);
// A reason under the cap is returned untouched.
#[test]
fn error_reply_passes_short_reason_unchanged() {
let reply = error_reply(7, "short".to_string());
assert!(!reply.success);
assert_eq!(reply.reason, "short");
}
// Exercises the rejection-message helper and `error_reply` together. It
// rebuilds the `entry[idx]: ` prefix by hand and does not call the
// dispatcher itself.
#[test]
fn read_direction_with_oru32_value_rejects() {
const MASK: u32 = 1 << 5;
const ENTRY_IDX: usize = 0;
let helper_reason = oru32_read_rejection_reason(MASK);
let batch_reason = format!("entry[{ENTRY_IDX}]: {helper_reason}");
let reply = error_reply(99, batch_reason.clone());
assert!(!reply.success);
assert_eq!(reply.request_id, 99);
assert_eq!(reply.reason, batch_reason);
assert!(helper_reason.contains("KernelValueWidth::u32()"));
assert!(helper_reason.contains("OrU32"));
assert!(helper_reason.contains(&format!("{MASK:#x}")));
}
// The unknown-symbol error must name the bad symbol and list every
// supported one.
#[test]
fn per_cpu_field_unknown_symbol_rejected() {
let err = struct_name_for_per_cpu_symbol("not_a_real_per_cpu_symbol")
.expect_err("unknown symbol must reject");
assert!(err.contains("PerCpuField"));
assert!(err.contains("not_a_real_per_cpu_symbol"));
assert!(err.contains("runqueues"));
assert!(err.contains("kernel_cpustat"));
assert!(err.contains("kstat"));
assert!(err.contains("tick_cpu_sched"));
}
// Pins the symbol -> struct-name table.
#[test]
fn per_cpu_field_known_symbol_mapping() {
assert_eq!(struct_name_for_per_cpu_symbol("runqueues").unwrap(), "rq");
assert_eq!(
struct_name_for_per_cpu_symbol("kernel_cpustat").unwrap(),
"kernel_cpustat"
);
assert_eq!(
struct_name_for_per_cpu_symbol("kstat").unwrap(),
"kernel_stat"
);
assert_eq!(
struct_name_for_per_cpu_symbol("tick_cpu_sched").unwrap(),
"tick_sched"
);
}
// Source-scan test: every Direct / Kva match arm in the production code must
// be followed, within 400 bytes, by a call to the matching validation
// helper, and there must be exactly 7 arms of each kind.
#[test]
fn dispatch_arms_call_validate_target_helpers() {
let full_src = include_str!("kernel_op_dispatch.rs");
let test_mod_start = full_src
.find("#[cfg(test)]")
.expect("test module must exist");
// Production half only, so the patterns quoted in this module are not counted.
let src = &full_src[..test_mod_start];
let direct_arms: Vec<_> = src
.match_indices("KernelOpTarget::Direct(kva), KernelOpValue::")
.collect();
assert_eq!(
direct_arms.len(),
7,
"expected exactly 7 Direct arms (4 write + 3 read); found {}",
direct_arms.len()
);
for (idx, _) in &direct_arms {
let window_end = (idx + 400).min(src.len());
let window = &src[*idx..window_end];
assert!(
window.contains("validate_direct_target("),
"Direct arm at byte offset {idx} is missing validate_direct_target() call; \
window: {window:?}"
);
}
let kva_arms: Vec<_> = src
.match_indices("KernelOpTarget::Kva(kva), KernelOpValue::")
.collect();
assert_eq!(
kva_arms.len(),
7,
"expected exactly 7 Kva arms (4 write + 3 read); found {}",
kva_arms.len()
);
for (idx, _) in &kva_arms {
let window_end = (idx + 400).min(src.len());
let window = &src[*idx..window_end];
assert!(
window.contains("validate_kva_target("),
"Kva arm at byte offset {idx} is missing validate_kva_target() call; \
window: {window:?}"
);
}
}
// Truncation must never exceed the cap nor split a code point, for 2-, 3- and
// 4-byte code points and for U+FEFF. `padding` is how far past the cap the
// input is grown.
#[test]
fn error_reply_truncates_at_utf8_boundary_classes() {
for (cp, label, padding) in [
("é", "2byte_U+00E9", 4), ("☃", "3byte_U+2603", 6), ("🦀", "4byte_U+1F980", 8), ("\u{FEFF}", "BOM_U+FEFF", 6), ] {
let mut s = String::new();
while s.len() < KERNEL_OP_REASON_MAX + padding {
s.push_str(cp);
}
let reply = error_reply(42, s);
assert!(
reply.reason.len() <= KERNEL_OP_REASON_MAX,
"{label}: reason.len()={} > cap={KERNEL_OP_REASON_MAX}",
reply.reason.len()
);
assert!(
reply.reason.is_char_boundary(reply.reason.len()),
"{label}: truncation landed mid-codepoint"
);
let _ = reply.reason.as_str();
}
}
// Same property with 1-, 2-, 3- and 4-byte code points interleaved.
#[test]
fn error_reply_truncates_mixed_width_input_at_boundary() {
let pattern = "Aé☃🦀";
let mut s = String::new();
while s.len() < KERNEL_OP_REASON_MAX + 10 {
s.push_str(pattern);
}
let reply = error_reply(99, s);
assert!(reply.reason.len() <= KERNEL_OP_REASON_MAX);
assert!(reply.reason.is_char_boundary(reply.reason.len()));
let _ = reply.reason.as_str();
}
// Pure ASCII input is cut at exactly the cap.
#[test]
fn error_reply_truncates_pure_ascii_no_walkback() {
let s = "A".repeat(KERNEL_OP_REASON_MAX + 16);
let reply = error_reply(1, s);
assert_eq!(reply.reason.len(), KERNEL_OP_REASON_MAX);
assert!(reply.reason.is_char_boundary(reply.reason.len()));
}
// An empty reason still yields a failed reply with an empty reason.
#[test]
fn error_reply_zero_length_reason_passes() {
let reply = error_reply(2, String::new());
assert!(!reply.success);
assert_eq!(reply.reason, "");
}
mod target_validation;
// Source-scan test for the OrU32 read-modify-write arms: exactly three such
// arms exist, each carries the `rmw-invariant-anchor` marker comment within
// 400 bytes, and exactly three OR-with-mask sites lie inside
// `dispatch_one_write`.
#[test]
fn or_u32_rmw_anchors_inside_dispatch_one_write() {
let full_src = include_str!("kernel_op_dispatch.rs");
let test_mod_start = full_src
.find("#[cfg(test)]")
.expect("test module must exist");
let src = &full_src[..test_mod_start];
let arm_sites: Vec<_> = src
.match_indices("KernelOpValue::OrU32(mask)) => {")
.collect();
assert_eq!(
arm_sites.len(),
3,
"expected exactly 3 OrU32 write arms (Symbol/Direct/Kva); \
found {} — if a 4th was added, add the rmw-invariant-anchor \
comment to it AND update this expected count",
arm_sites.len()
);
for (idx, _) in &arm_sites {
let window_end = (idx + 400).min(src.len());
let window = &src[*idx..window_end];
assert!(
window.contains("rmw-invariant-anchor"),
"OrU32 arm at byte offset {idx} is missing the \
// rmw-invariant-anchor comment; window: {window:?}"
);
}
// Byte range of `dispatch_one_write`: from its signature to the next
// top-level `fn` that starts a line.
let dow_start = src
.find("fn dispatch_one_write(")
.expect("dispatch_one_write must exist");
let dow_end = src[dow_start..]
.find("\nfn ")
.map(|rel| dow_start + rel)
.unwrap_or(src.len());
let global_or_mask: Vec<_> = src.match_indices("| mask").collect();
let inside_dow: Vec<_> = global_or_mask
.iter()
.filter(|(idx, _)| *idx >= dow_start && *idx < dow_end)
.collect();
for (idx, _) in &global_or_mask {
// NOTE(review): the lookahead is 6 bytes long but the needle `| mask)` is
// 7 bytes, so the `contains` below can never be true and the assertion
// inside never runs. Widening the lookahead to 7 would make this check
// live — and it would then flag the OR site in the per-CPU field write
// helper, which sits outside `dispatch_one_write`. Decide which behaviour
// is intended before changing it.
let lookahead_end = (idx + 6).min(src.len());
let lookahead = &src[*idx..lookahead_end];
if lookahead.contains("| mask)") {
assert!(
*idx >= dow_start && *idx < dow_end,
"Production `| mask)` OR-with-mask call at byte offset \
{idx} is OUTSIDE dispatch_one_write \
[start={dow_start}, end={dow_end}). \
A refactor extracted the OrU32 RMW into a helper, \
breaking the same-rendezvous-epoch invariant. \
Move it back inside dispatch_one_write OR (if \
intentional) update this test."
);
}
}
assert_eq!(
inside_dow.len(),
3,
"expected exactly 3 `| mask` production sites inside \
dispatch_one_write (one per Symbol/Direct/Kva OrU32 arm); \
found {}",
inside_dow.len()
);
}
mod task_field;
}