mod display;
mod render_map;
#[cfg(test)]
mod tests;
use btf_rs::Btf;
use render_map::*;
use serde::{Deserialize, Serialize};
use super::arena::{ArenaSnapshot, BpfArenaOffsets, snapshot_arena};
use super::bpf_map::{
BPF_MAP_TYPE_ARENA, BPF_MAP_TYPE_ARRAY, BpfMapAccessor, BpfMapInfo, GuestMemMapAccessor,
};
use super::btf_render::RenderedValue;
use super::sdt_alloc::{
SdtAllocOffsets, SdtAllocatorSnapshot, discover_payload_btf_id, walk_sdt_allocator,
};
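/// Inputs for sampling struct_ops program runtime statistics at dump time:
/// the program accessor plus the guest's per-CPU base offsets.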
pub struct ProgRuntimeCapture<'a> {
pub accessor: &'a dyn super::bpf_prog::BpfProgAccessor,
pub per_cpu_offsets: &'a [u64],
}
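/// Inputs for reading per-CPU time accounting out of guest memory: resolved
/// BTF offsets plus the KVAs of the `kernel_cpustat`, `kstat`, and
/// (optionally) `tick_cpu_sched` per-CPU symbols.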
pub struct CpuTimeCapture<'a> {
pub mem: &'a super::reader::GuestMem,
pub offsets: &'a super::btf_offsets::CpuTimeOffsets,
pub kernel_cpustat_kva: u64,
pub kstat_kva: u64,
pub tick_cpu_sched_kva: Option<u64>,
pub per_cpu_offsets: &'a [u64],
pub page_offset: u64,
}
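/// Inputs for enriching the tasks found by the task walker with
/// scheduling-class and lock-slowpath detail.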
pub struct TaskEnrichmentCapture<'a> {
pub kernel: &'a super::guest::GuestKernel,
pub offsets: &'a super::btf_offsets::TaskEnrichmentOffsets,
pub sched_classes: &'a super::task_enrichment::SchedClassRegistry,
pub lock_slowpaths: &'a super::task_enrichment::LockSlowpathRegistry,
pub tasks: &'a [TaskWalkerEntry],
}
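/// One task found by the task walker: its `task_struct` KVA, whether it is
/// runnable in sched_ext, and the program counter observed while it was
/// running, if any.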
#[derive(Debug, Clone, Copy)]
pub struct TaskWalkerEntry {
pub task_kva: u64,
pub is_runnable_in_scx: bool,
pub running_pc: Option<u64>,
}
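/// Per-CPU time snapshot: the `kernel_cpustat` buckets in nanoseconds plus
/// softirq and IRQ event counts from `kstat`.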
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct PerCpuTimeStats {
pub cpu: u32,
pub cpustat_user_ns: u64,
pub cpustat_nice_ns: u64,
pub cpustat_system_ns: u64,
pub cpustat_softirq_ns: u64,
pub cpustat_irq_ns: u64,
pub cpustat_idle_ns: u64,
pub cpustat_iowait_ns: u64,
pub cpustat_steal_ns: u64,
pub softirqs: [u64; super::btf_offsets::NR_SOFTIRQS],
pub irqs_sum: u64,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub iowait_sleeptime_ns: Option<u64>,
}
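/// Per-NUMA-node allocation counters. Currently never populated; see
/// `REASON_NO_NUMA_WALKER`.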
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct PerNodeNumaStats {
pub node: u32,
pub numa_hit: u64,
pub numa_miss: u64,
pub numa_foreign: u64,
pub numa_interleave_hit: u64,
pub numa_local: u64,
pub numa_other: u64,
}
pub const REASON_NO_NUMA_WALKER: &str = "no NUMA walker (host-side walker pending)";
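/// Monitor samples to fold into the event-counter timeline.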
pub struct EventCounterCapture<'a> {
pub samples: &'a [super::MonitorSample],
}
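/// Inputs for the sched_ext walkers: resolved offsets, the `scx_root` KVA,
/// and each CPU's runqueue address in both kernel-virtual and physical form.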
pub struct ScxWalkerCapture<'a> {
pub kernel: &'a super::guest::GuestKernel,
pub offsets: &'a super::btf_offsets::ScxWalkerOffsets,
pub scx_root_kva: u64,
pub rq_kvas: &'a [u64],
pub rq_pas: &'a [u64],
pub per_cpu_offsets: &'a [u64],
pub nr_nodes: u32,
}
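/// One timeline point: sched_ext event counters summed across all CPUs for
/// a single monitor sample.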
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct EventCounterSample {
pub elapsed_ms: u64,
pub select_cpu_fallback: i64,
pub dispatch_local_dsq_offline: i64,
pub dispatch_keep_last: i64,
pub enq_skip_exiting: i64,
pub enq_skip_migration_disabled: i64,
pub reenq_immed: i64,
pub reenq_local_repeat: i64,
pub refill_slice_dfl: i64,
pub bypass_duration: i64,
pub bypass_dispatch: i64,
pub bypass_activate: i64,
pub insert_not_owned: i64,
pub sub_bypass_dispatch: i64,
}
impl EventCounterSample {
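    /// Sum each event counter across every CPU in `sample`, saturating on
    /// overflow. Returns `None` if no CPU carried event counters.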
pub fn from_monitor_sample(sample: &super::MonitorSample) -> Option<Self> {
let mut any = false;
let mut out = Self {
elapsed_ms: sample.elapsed_ms,
..Self::default()
};
for cpu in &sample.cpus {
if let Some(ev) = &cpu.event_counters {
any = true;
out.select_cpu_fallback = out
.select_cpu_fallback
.saturating_add(ev.select_cpu_fallback);
out.dispatch_local_dsq_offline = out
.dispatch_local_dsq_offline
.saturating_add(ev.dispatch_local_dsq_offline);
out.dispatch_keep_last =
out.dispatch_keep_last.saturating_add(ev.dispatch_keep_last);
out.enq_skip_exiting = out.enq_skip_exiting.saturating_add(ev.enq_skip_exiting);
out.enq_skip_migration_disabled = out
.enq_skip_migration_disabled
.saturating_add(ev.enq_skip_migration_disabled);
out.reenq_immed = out.reenq_immed.saturating_add(ev.reenq_immed);
out.reenq_local_repeat =
out.reenq_local_repeat.saturating_add(ev.reenq_local_repeat);
out.refill_slice_dfl = out.refill_slice_dfl.saturating_add(ev.refill_slice_dfl);
out.bypass_duration = out.bypass_duration.saturating_add(ev.bypass_duration);
out.bypass_dispatch = out.bypass_dispatch.saturating_add(ev.bypass_dispatch);
out.bypass_activate = out.bypass_activate.saturating_add(ev.bypass_activate);
out.insert_not_owned = out.insert_not_owned.saturating_add(ev.insert_not_owned);
out.sub_bypass_dispatch = out
.sub_bypass_dispatch
.saturating_add(ev.sub_bypass_dispatch);
}
}
if any { Some(out) } else { None }
}
}
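/// Render `values` as a Unicode block-glyph sparkline. Each sample is
/// linearly scaled between the slice's min and max onto eight glyphs, so
/// `render_sparkline(&[0, 3, 7])` yields `"▁▄█"`. A flat series renders as
/// all-bottom glyphs when zero and all-middle glyphs otherwise.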
pub fn render_sparkline(values: &[u64]) -> String {
const GLYPHS: &[char] = &['▁', '▂', '▃', '▄', '▅', '▆', '▇', '█'];
if values.is_empty() {
return String::new();
}
let min = *values.iter().min().expect("non-empty");
let max = *values.iter().max().expect("non-empty");
let mut s = String::with_capacity(values.len() * 4);
if max == min {
let glyph = if max == 0 {
GLYPHS[0]
} else {
GLYPHS[GLYPHS.len() / 2]
};
for _ in values {
s.push(glyph);
}
return s;
}
let span = max - min;
let last_idx = (GLYPHS.len() - 1) as u64;
    for &v in values {
        // Scale in u128: (v - min) * last_idx can overflow u64 when the
        // value span approaches u64::MAX.
        let scaled = ((v - min) as u128 * u128::from(last_idx) / u128::from(span)) as u64;
        let idx = scaled.min(last_idx) as usize;
        s.push(GLYPHS[idx]);
    }
s
}
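/// Sparkline over signed values; negatives are clamped to zero before
/// scaling.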
pub fn render_sparkline_i64(values: &[i64]) -> String {
let widened: Vec<u64> = values.iter().map(|&v| v.max(0) as u64).collect();
render_sparkline(&widened)
}
pub use crate::vmm::exit_dispatch::VcpuRegSnapshot;
pub const SCHEMA_SINGLE: &str = "single";
pub const SCHEMA_DUAL: &str = "dual";
pub const REASON_NO_STRUCT_OPS_LOADED: &str = "no struct_ops programs loaded";
pub const REASON_PROG_ACCESSOR_UNAVAILABLE: &str = "prog accessor unavailable";
pub const REASON_TASK_WALKER_ZERO_TASKS: &str = "task walker yielded zero tasks";
pub const REASON_NO_TASK_WALKER: &str = "no task walker available";
pub const REASON_SCX_ROOT_NULL: &str = "scx_root is NULL (no scheduler attached)";
pub const REASON_SCX_WALKER_NO_STATE: &str = "scx walker reached no state";
pub const REASON_NO_SCX_WALKER: &str = "no scx walker capture";
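/// Probe-program counters summed across CPUs from its `.bss` per-CPU
/// counter array (see `decode_probe_counters_snapshot`).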
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[non_exhaustive]
pub struct ProbeBssCounters {
pub probe_count: u64,
pub kprobe_returns: u64,
pub meta_miss: u64,
pub ringbuf_drops: u64,
pub timeline_count: u64,
pub timeline_drops: u64,
pub pi_count: u64,
pub pi_orphan_fexits: u64,
pub pi_class_change_count: u64,
pub pi_drops: u64,
pub lock_contend_count: u64,
pub lock_contend_drops: u64,
pub preempt_disable_count: u64,
pub preempt_enable_count: u64,
pub trigger_count: u64,
}
fn default_schema_single() -> String {
SCHEMA_SINGLE.to_string()
}
fn default_schema_dual() -> String {
SCHEMA_DUAL.to_string()
}
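/// The single-schema failure dump. Optional capture sections pair their
/// data with an `*_unavailable` reason string so consumers can tell
/// "empty" apart from "could not be captured".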
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpReport {
#[serde(default = "default_schema_single")]
pub schema: String,
pub maps: Vec<FailureDumpMap>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub vcpu_regs: Vec<Option<VcpuRegSnapshot>>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub sdt_allocations: Vec<SdtAllocatorSnapshot>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub prog_runtime_stats: Vec<super::bpf_prog::ProgRuntimeStats>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub prog_runtime_stats_unavailable: Option<String>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub per_cpu_time: Vec<PerCpuTimeStats>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub per_node_numa: Vec<PerNodeNumaStats>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub per_node_numa_unavailable: Option<String>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub task_enrichments: Vec<super::task_enrichment::TaskEnrichment>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub task_enrichments_unavailable: Option<String>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub event_counter_timeline: Vec<EventCounterSample>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub rq_scx_states: Vec<super::scx_walker::RqScxState>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub dsq_states: Vec<super::scx_walker::DsqState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub scx_sched_state: Option<super::scx_walker::ScxSchedState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub scx_walker_unavailable: Option<String>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub vcpu_perf_at_freeze: Vec<Option<super::perf_counters::VcpuPerfSample>>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub dump_truncated_at_us: Option<u64>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub probe_counters: Option<ProbeBssCounters>,
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub is_placeholder: bool,
}
impl Default for FailureDumpReport {
fn default() -> Self {
Self {
schema: SCHEMA_SINGLE.to_string(),
maps: Vec::new(),
vcpu_regs: Vec::new(),
sdt_allocations: Vec::new(),
prog_runtime_stats: Vec::new(),
prog_runtime_stats_unavailable: None,
per_cpu_time: Vec::new(),
per_node_numa: Vec::new(),
per_node_numa_unavailable: None,
task_enrichments: Vec::new(),
task_enrichments_unavailable: None,
event_counter_timeline: Vec::new(),
rq_scx_states: Vec::new(),
dsq_states: Vec::new(),
scx_sched_state: None,
scx_walker_unavailable: None,
vcpu_perf_at_freeze: Vec::new(),
dump_truncated_at_us: None,
probe_counters: None,
is_placeholder: false,
}
}
}
impl FailureDumpReport {
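    /// Build a report that records only why nothing could be captured:
    /// every `*_unavailable` reason is set to `reason` and `is_placeholder`
    /// is true.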
pub fn placeholder(reason: impl Into<String>) -> Self {
let reason = reason.into();
Self {
prog_runtime_stats_unavailable: Some(reason.clone()),
per_node_numa_unavailable: Some(reason.clone()),
task_enrichments_unavailable: Some(reason.clone()),
scx_walker_unavailable: Some(reason),
is_placeholder: true,
..Self::default()
}
}
}
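/// The dual-schema dump: an optional early report alongside the mandatory
/// late one, plus the jiffies age/threshold bookkeeping that governed
/// whether the early capture was taken.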
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct DualFailureDumpReport {
#[serde(default = "default_schema_dual")]
pub schema: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub early: Option<FailureDumpReport>,
pub late: FailureDumpReport,
#[serde(default, skip_serializing_if = "is_zero_u64")]
pub early_max_age_jiffies: u64,
#[serde(default, skip_serializing_if = "is_zero_u64")]
pub early_threshold_jiffies: u64,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub early_skipped_reason: Option<String>,
}
fn is_zero_u64(v: &u64) -> bool {
*v == 0
}
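/// A report in either schema, dispatched on the JSON `schema` field; a
/// missing or empty field is treated as the single schema for backward
/// compatibility.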
#[non_exhaustive]
pub enum FailureDumpReportAny {
Single(Box<FailureDumpReport>),
Dual(Box<DualFailureDumpReport>),
}
impl FailureDumpReportAny {
pub fn from_json(json: &str) -> Option<Self> {
let value: serde_json::Value = serde_json::from_str(json).ok()?;
let schema = value.get("schema").and_then(|v| v.as_str()).unwrap_or("");
match schema {
SCHEMA_DUAL => serde_json::from_str(json)
.ok()
.map(|d| Self::Dual(Box::new(d))),
SCHEMA_SINGLE | "" => serde_json::from_str(json)
.ok()
.map(|r| Self::Single(Box::new(r))),
_ => None,
}
}
}
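/// One dumped BPF map. Which optional sections are populated depends on the
/// map type (rendered value, key/value entries, per-CPU entries, arena,
/// ringbuf, stack trace, or fd array); `error` records a per-map decode
/// failure.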
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpMap {
pub name: String,
pub map_type: u32,
pub value_size: u32,
pub max_entries: u32,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub value: Option<RenderedValue>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub entries: Vec<FailureDumpEntry>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub percpu_entries: Vec<FailureDumpPercpuEntry>,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub percpu_hash_entries: Vec<FailureDumpPercpuHashEntry>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub arena: Option<ArenaSnapshot>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub ringbuf: Option<FailureDumpRingbuf>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub stack_trace: Option<FailureDumpStackTrace>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub fd_array: Option<FailureDumpFdArray>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpRingbuf {
pub capacity: u64,
pub consumer_pos: u64,
pub producer_pos: u64,
pub pending_pos: u64,
pub pending_bytes: u64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpStackTrace {
pub n_buckets: u32,
pub entries: Vec<FailureDumpStackTraceEntry>,
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub truncated: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpStackTraceEntry {
pub bucket_id: u32,
pub nr: u32,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub pcs: Vec<u64>,
pub data_hex: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpFdArray {
pub populated: u32,
pub scanned: u32,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub indices: Vec<u32>,
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub truncated: bool,
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub indices_truncated: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpEntry {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub key: Option<RenderedValue>,
pub key_hex: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub value: Option<RenderedValue>,
pub value_hex: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub payload: Option<RenderedValue>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpPercpuEntry {
pub key: u32,
pub per_cpu: Vec<Option<RenderedValue>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct FailureDumpPercpuHashEntry {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub key: Option<RenderedValue>,
pub key_hex: String,
pub per_cpu: Vec<Option<RenderedValue>>,
}
pub(crate) const MAX_BTF_BLOB: usize = 32 * 1024 * 1024;
pub const MAX_ENRICHED_TASKS: usize = 4096;
const KTSTR_INTERNAL_MAPS: &[&str] = &[
"func_meta_map",
"probe_data",
"probe_scratch",
"ktstr_events",
];
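/// All inputs to `dump_state`. Capture groups are optional so callers can
/// omit whatever could not be resolved; absences surface as the matching
/// `*_unavailable` reasons where the report has one.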
pub struct DumpContext<'a> {
pub accessor: &'a GuestMemMapAccessor<'a>,
pub btf: &'a Btf,
pub num_cpus: u32,
pub arena_offsets: Option<&'a BpfArenaOffsets>,
pub prog_capture: Option<&'a ProgRuntimeCapture<'a>>,
pub cpu_time_capture: Option<&'a CpuTimeCapture<'a>>,
pub task_enrichment_capture: Option<&'a TaskEnrichmentCapture<'a>>,
pub event_counter_capture: Option<&'a EventCounterCapture<'a>>,
pub scx_walker_capture: Option<&'a ScxWalkerCapture<'a>>,
pub perf_capture: Option<&'a super::perf_counters::PerfCountersCapture>,
pub deadline: Option<std::time::Instant>,
}
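/// Sum the probe program's per-CPU `.bss` counters into one
/// `ProbeBssCounters`. The `ktstr_pcpu_counters` array is laid out
/// `[cpu][slot]` with a fixed 128-byte stride per slot (presumably
/// cache-line padding), so the value for `(cpu, slot)` lives at
/// `(cpu * PCPU_NR + slot) * PCPU_SLOT_STRIDE`. Negative per-CPU values
/// are skipped.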
fn decode_probe_counters_snapshot(
accessor: &GuestMemMapAccessor<'_>,
base_btf: &Btf,
) -> Option<ProbeBssCounters> {
const PCPU_PROBE_COUNT: usize = 0;
const PCPU_KPROBE_RETURNS: usize = 1;
const PCPU_META_MISS: usize = 2;
const PCPU_RINGBUF_DROPS: usize = 3;
const PCPU_TIMELINE_COUNT: usize = 4;
const PCPU_TIMELINE_DROPS: usize = 5;
const PCPU_PI_COUNT: usize = 6;
const PCPU_PI_ORPHAN_FEXITS: usize = 7;
const PCPU_PI_CLASS_CHANGE_COUNT: usize = 8;
const PCPU_PI_DROPS: usize = 9;
const PCPU_LOCK_CONTEND_COUNT: usize = 10;
const PCPU_LOCK_CONTEND_DROPS: usize = 11;
const PCPU_PREEMPT_DISABLE_COUNT: usize = 12;
const PCPU_PREEMPT_ENABLE_COUNT: usize = 13;
const PCPU_TRIGGER_COUNT: usize = 14;
const PCPU_NR: usize = 15;
const PCPU_SLOT_STRIDE: usize = 128;
const MAX_CPUS: usize = 256;
let bss_map = accessor.find_map("probe_bp.bss")?;
if bss_map.btf_kva == 0 {
return None;
}
let prog_btf = load_program_btf_kva(accessor, bss_map.btf_kva, base_btf)?;
let array_off = super::btf_offsets::resolve_var_offset_in_section(
&prog_btf,
".bss",
"ktstr_pcpu_counters",
)? as usize;
let total_bytes = MAX_CPUS * PCPU_NR * PCPU_SLOT_STRIDE;
let array_bytes = accessor.read_value(&bss_map, array_off, total_bytes)?;
if array_bytes.len() < total_bytes {
return None;
}
let sum_slot = |slot: usize| -> u64 {
let mut total: u64 = 0;
for cpu in 0..MAX_CPUS {
let off = (cpu * PCPU_NR + slot) * PCPU_SLOT_STRIDE;
let mut buf = [0u8; 8];
buf.copy_from_slice(&array_bytes[off..off + 8]);
let v = i64::from_le_bytes(buf);
if v > 0 {
total = total.saturating_add(v as u64);
}
}
total
};
Some(ProbeBssCounters {
probe_count: sum_slot(PCPU_PROBE_COUNT),
kprobe_returns: sum_slot(PCPU_KPROBE_RETURNS),
meta_miss: sum_slot(PCPU_META_MISS),
ringbuf_drops: sum_slot(PCPU_RINGBUF_DROPS),
timeline_count: sum_slot(PCPU_TIMELINE_COUNT),
timeline_drops: sum_slot(PCPU_TIMELINE_DROPS),
pi_count: sum_slot(PCPU_PI_COUNT),
pi_orphan_fexits: sum_slot(PCPU_PI_ORPHAN_FEXITS),
pi_class_change_count: sum_slot(PCPU_PI_CLASS_CHANGE_COUNT),
pi_drops: sum_slot(PCPU_PI_DROPS),
lock_contend_count: sum_slot(PCPU_LOCK_CONTEND_COUNT),
lock_contend_drops: sum_slot(PCPU_LOCK_CONTEND_DROPS),
preempt_disable_count: sum_slot(PCPU_PREEMPT_DISABLE_COUNT),
preempt_enable_count: sum_slot(PCPU_PREEMPT_ENABLE_COUNT),
trigger_count: sum_slot(PCPU_TRIGGER_COUNT),
})
}
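/// Fallback sched_ext state recovered from snapshot variables the probe
/// program wrote into its `.bss` at err-exit time. Returns `None` when the
/// snapshot kind is zero (nothing recorded) or any variable fails to
/// resolve.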
fn decode_probe_sched_state_snapshot(
accessor: &GuestMemMapAccessor<'_>,
base_btf: &Btf,
) -> Option<super::scx_walker::ScxSchedState> {
let bss_map = accessor.find_map("probe_bp.bss")?;
if bss_map.btf_kva == 0 {
return None;
}
let prog_btf = load_program_btf_kva(accessor, bss_map.btf_kva, base_btf)?;
let kind_off = super::btf_offsets::resolve_var_offset_in_section(
&prog_btf,
".bss",
"ktstr_exit_kind_snap",
)?;
let aborting_off = super::btf_offsets::resolve_var_offset_in_section(
&prog_btf,
".bss",
"ktstr_exit_aborting",
)?;
let bypass_depth_off = super::btf_offsets::resolve_var_offset_in_section(
&prog_btf,
".bss",
"ktstr_exit_bypass_depth",
)?;
let sched_kva_off = super::btf_offsets::resolve_var_offset_in_section(
&prog_btf,
".bss",
"ktstr_exit_sched_kva",
)?;
let watchdog_timeout_off = super::btf_offsets::resolve_var_offset_in_section(
&prog_btf,
".bss",
"ktstr_exit_watchdog_timeout",
)?;
let kind_bytes = accessor.read_value(&bss_map, kind_off as usize, 4)?;
let kind = u32::from_le_bytes(kind_bytes.as_slice().try_into().ok()?);
if kind == 0 {
return None;
}
let aborting_bytes = accessor.read_value(&bss_map, aborting_off as usize, 1)?;
let aborting = aborting_bytes.first().copied()? != 0;
let bypass_depth_bytes = accessor.read_value(&bss_map, bypass_depth_off as usize, 4)?;
let bypass_depth = i32::from_le_bytes(bypass_depth_bytes.as_slice().try_into().ok()?);
let sched_kva_bytes = accessor.read_value(&bss_map, sched_kva_off as usize, 8)?;
let sched_kva = u64::from_le_bytes(sched_kva_bytes.as_slice().try_into().ok()?);
let watchdog_timeout_bytes = accessor.read_value(&bss_map, watchdog_timeout_off as usize, 8)?;
let watchdog_timeout = u64::from_le_bytes(watchdog_timeout_bytes.as_slice().try_into().ok()?);
Some(super::scx_walker::ScxSchedState {
aborting,
bypass_depth,
exit_kind: kind,
watchdog_timeout: Some(watchdog_timeout),
source: Some(super::scx_walker::SCX_SCHED_STATE_SOURCE_BSS.to_string()),
sched_kva: if sched_kva == 0 {
None
} else {
Some(sched_kva)
},
})
}
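/// Walk every capture source in `ctx` and assemble a `FailureDumpReport`.
/// Later phases check the optional deadline and are skipped rather than
/// aborted once it passes; `dump_truncated_at_us` records when truncation
/// began.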
pub fn dump_state(ctx: DumpContext<'_>) -> FailureDumpReport {
let DumpContext {
accessor,
btf,
num_cpus,
arena_offsets,
prog_capture,
cpu_time_capture,
task_enrichment_capture,
event_counter_capture,
scx_walker_capture,
perf_capture,
deadline,
} = ctx;
let dump_start = std::time::Instant::now();
let mut truncated_at_us: Option<u64> = None;
let deadline_exceeded = |truncated_at: &mut Option<u64>| -> bool {
if let Some(deadline) = deadline {
let now = std::time::Instant::now();
if now > deadline {
if truncated_at.is_none() {
let elapsed_us = dump_start.elapsed().as_micros() as u64;
*truncated_at = Some(elapsed_us);
tracing::warn!(
elapsed_us,
"dump_state: deadline exceeded, truncating remaining phases"
);
}
return true;
}
}
false
};
let maps = accessor.maps();
let (prog_runtime_stats, prog_runtime_stats_unavailable) = match prog_capture {
Some(cap) => {
let stats = cap.accessor.struct_ops_runtime_stats(cap.per_cpu_offsets);
let reason = if stats.is_empty() {
Some(REASON_NO_STRUCT_OPS_LOADED.to_string())
} else {
None
};
(stats, reason)
}
None => (
Vec::new(),
Some(REASON_PROG_ACCESSOR_UNAVAILABLE.to_string()),
),
};
let per_cpu_time = match cpu_time_capture {
Some(cap) => collect_per_cpu_time(cap),
None => Vec::new(),
};
let task_enrichment_t0 = std::time::Instant::now();
let (task_enrichments, task_enrichments_unavailable) = match task_enrichment_capture {
Some(cap) => {
let total = cap.tasks.len();
let cap_n = total.min(MAX_ENRICHED_TASKS);
let mut enrichments = Vec::with_capacity(cap_n);
for entry in cap.tasks.iter().take(cap_n) {
if let Some(e) = super::task_enrichment::walk_task_enrichment(
cap.kernel,
entry.task_kva,
cap.offsets,
cap.sched_classes,
cap.lock_slowpaths,
entry.is_runnable_in_scx,
entry.running_pc,
) {
enrichments.push(e);
}
}
if total > cap_n {
tracing::warn!(
cap = MAX_ENRICHED_TASKS,
total,
"dump_state task_enrichment: capped at MAX_ENRICHED_TASKS, dropping tail"
);
}
let reason = if enrichments.is_empty() {
tracing::debug!(
tasks_count = total,
"dump_state task_enrichment: walker yielded zero entries — \
scx_tasks list and rq->scx.runnable_list both empty, or every \
walk_task_enrichment call returned None (translate failures)",
);
Some(REASON_TASK_WALKER_ZERO_TASKS.to_string())
} else {
None
};
(enrichments, reason)
}
None => {
tracing::debug!(
"dump_state task_enrichment: capture is None — \
freeze coordinator passed no TaskEnrichmentCapture \
(scx_owned, scx_walker_offsets, or task_enrichment_offsets unresolved)",
);
(Vec::new(), Some(REASON_NO_TASK_WALKER.to_string()))
}
};
tracing::debug!(
elapsed_us = task_enrichment_t0.elapsed().as_micros() as u64,
enriched = task_enrichments.len(),
"dump_state phase: walk_task_enrichment"
);
deadline_exceeded(&mut truncated_at_us);
let event_counter_timeline = match event_counter_capture {
Some(cap) => cap
.samples
.iter()
.filter_map(EventCounterSample::from_monitor_sample)
.collect(),
None => Vec::new(),
};
let (rq_scx_states, dsq_states, scx_sched_state, scx_walker_unavailable) =
match scx_walker_capture {
Some(cap) => {
let missing = cap.offsets.missing_groups();
let (sched_pa_opt, sched_state) = match super::scx_walker::read_scx_sched_state(
cap.kernel,
cap.scx_root_kva,
cap.offsets,
) {
Some((sched_kva, state)) => {
let mem = cap.kernel.mem();
let walk = cap.kernel.walk_context();
let pa = super::idr::translate_any_kva(
mem,
walk.cr3_pa,
walk.page_offset,
sched_kva,
walk.l5,
walk.tcr_el1,
);
(pa, Some(state))
}
None => {
let snap = decode_probe_sched_state_snapshot(accessor, btf);
if snap.is_some() {
tracing::debug!(
scx_root_kva = format_args!("{:#x}", cap.scx_root_kva),
"dump_state scx walker: live read returned None; \
BPF .bss snapshot fallback populated scx_sched_state \
(scheduler torn down before freeze, snapshot \
captured at err-exit instant)",
);
}
(None, snap)
}
};
let walk_rq_scx_t0 = std::time::Instant::now();
let mut rq_states = Vec::with_capacity(cap.rq_kvas.len());
if !deadline_exceeded(&mut truncated_at_us) {
for (cpu, (&rq_kva, &rq_pa)) in
cap.rq_kvas.iter().zip(cap.rq_pas.iter()).enumerate()
{
if let Some((state, _entries)) = super::scx_walker::walk_rq_scx(
cap.kernel,
cpu as u32,
rq_kva,
rq_pa,
cap.offsets,
) {
rq_states.push(state);
}
}
}
tracing::debug!(
elapsed_us = walk_rq_scx_t0.elapsed().as_micros() as u64,
cpus = cap.rq_kvas.len(),
rq_states = rq_states.len(),
"dump_state phase: walk_rq_scx"
);
let walk_local_dsqs_t0 = std::time::Instant::now();
let mut dsqs: Vec<super::scx_walker::DsqState> = Vec::new();
if !deadline_exceeded(&mut truncated_at_us)
&& let Some((local_states, _entries)) = super::scx_walker::walk_local_dsqs(
cap.kernel,
cap.rq_kvas,
cap.rq_pas,
cap.per_cpu_offsets,
cap.offsets,
)
{
dsqs.extend(local_states);
}
tracing::debug!(
elapsed_us = walk_local_dsqs_t0.elapsed().as_micros() as u64,
local_dsqs = dsqs.len(),
"dump_state phase: walk_local_dsqs"
);
let walk_dsqs_t0 = std::time::Instant::now();
if !deadline_exceeded(&mut truncated_at_us)
&& let Some(sched_pa) = sched_pa_opt
{
let (sched_states, _entries) = super::scx_walker::walk_dsqs(
cap.kernel,
sched_pa,
cap.per_cpu_offsets,
cap.nr_nodes,
cap.offsets,
);
dsqs.extend(sched_states);
}
tracing::debug!(
elapsed_us = walk_dsqs_t0.elapsed().as_micros() as u64,
total_dsqs = dsqs.len(),
"dump_state phase: walk_dsqs"
);
let unavail = if !missing.is_empty() {
tracing::debug!(
missing_groups = ?missing,
rq_states_count = rq_states.len(),
dsq_count = dsqs.len(),
sched_state_some = sched_state.is_some(),
"dump_state scx walker: partial degradation — missing BTF sub-groups",
);
Some(format!(
"scx walker partial: missing offset groups [{}]",
missing.join(", ")
))
} else if sched_state.is_none() {
tracing::debug!(
scx_root_kva = format_args!("{:#x}", cap.scx_root_kva),
rq_states_count = rq_states.len(),
dsq_count = dsqs.len(),
"dump_state scx walker: scx_root is NULL — no scheduler attached; \
rq->scx and local DSQ captures populated, sched/bypass/global/user passes blinded",
);
Some(REASON_SCX_ROOT_NULL.to_string())
} else if rq_states.is_empty() && dsqs.is_empty() {
tracing::debug!(
scx_root_kva = format_args!("{:#x}", cap.scx_root_kva),
"dump_state scx walker: every walker read failed — no rq->scx, no DSQ, but sched_state present",
);
Some(REASON_SCX_WALKER_NO_STATE.to_string())
} else {
None
};
(rq_states, dsqs, sched_state, unavail)
}
None => {
tracing::debug!(
"dump_state scx walker: capture is None — \
freeze coordinator passed no ScxWalkerCapture (offsets/symbols/per_cpu_offsets unresolved)",
);
(
Vec::new(),
Vec::new(),
None,
Some(REASON_NO_SCX_WALKER.to_string()),
)
}
};
let vcpu_perf_at_freeze: Vec<Option<super::perf_counters::VcpuPerfSample>> = match perf_capture
{
Some(cap) => cap.per_vcpu.iter().map(|p| p.read().ok()).collect(),
None => Vec::new(),
};
let probe_counters = decode_probe_counters_snapshot(accessor, btf);
let mut report = FailureDumpReport {
schema: SCHEMA_SINGLE.to_string(),
maps: Vec::with_capacity(maps.len()),
vcpu_regs: Vec::new(),
sdt_allocations: Vec::new(),
prog_runtime_stats,
prog_runtime_stats_unavailable,
per_cpu_time,
per_node_numa: Vec::new(),
per_node_numa_unavailable: Some(REASON_NO_NUMA_WALKER.to_string()),
task_enrichments,
task_enrichments_unavailable,
event_counter_timeline,
rq_scx_states,
dsq_states,
scx_sched_state,
scx_walker_unavailable,
vcpu_perf_at_freeze,
dump_truncated_at_us: None,
probe_counters,
is_placeholder: false,
};
    let mut program_btfs: std::collections::HashMap<u64, Btf> = std::collections::HashMap::new();
    // Maps owned by the tracing probes themselves (probe_bp / fentry_p and
    // the ktstr internal maps) are skipped in every pass below.
    let is_internal_map = |name: &str| {
        name.starts_with("probe_bp.")
            || name.starts_with("fentry_p.")
            || name == "probe_bp"
            || name == "fentry_p"
            || KTSTR_INTERNAL_MAPS.contains(&name)
    };
let shared_arena_snapshot: Option<(BpfMapInfo, ArenaSnapshot)> =
arena_offsets.and_then(|off| {
for info in &maps {
                if is_internal_map(&info.name()) {
                    continue;
                }
if info.map_type == BPF_MAP_TYPE_ARENA {
let snap = snapshot_arena(accessor.kernel(), info, off);
return Some((info.clone(), snap));
}
}
None
});
let shared_arena_ref: Option<(&ArenaSnapshot, u64)> = shared_arena_snapshot
.as_ref()
.map(|(info, snap)| (snap, info.map_kva));
let arena_kern_vm_start: u64 = shared_arena_snapshot
.as_ref()
.map(|(_, snap)| snap.kern_vm_start)
.unwrap_or(0);
    let mut sched_bss_bytes: Option<(Vec<u8>, u64)> = None;
    for info in &maps {
        let name = info.name();
        if is_internal_map(&name) {
            continue;
        }
if info.btf_kva != 0
&& !program_btfs.contains_key(&info.btf_kva)
&& let Some(loaded) = accessor.load_program_btf(info, btf)
{
program_btfs.insert(info.btf_kva, loaded);
}
if sched_bss_bytes.is_none()
&& info.map_type == BPF_MAP_TYPE_ARRAY
&& info.btf_kva != 0
&& name.ends_with(".bss")
&& let Some(bytes) = accessor.read_value(info, 0, info.value_size as usize)
{
sched_bss_bytes = Some((bytes, info.btf_kva));
}
}
let arena_page_index = crate::monitor::dump::render_map::build_arena_page_index(
shared_arena_snapshot.as_ref().map(|(_, snap)| snap),
);
let sdt_alloc_t0 = std::time::Instant::now();
let mut sdt_alloc_metas: Vec<crate::monitor::dump::render_map::SdtAllocMeta> = Vec::new();
if !deadline_exceeded(&mut truncated_at_us)
&& let Some((bss_bytes, btf_kva)) = sched_bss_bytes
&& arena_kern_vm_start != 0
&& let Some(prog_btf) = program_btfs.get(&btf_kva)
&& let Ok(sdt_offsets) = SdtAllocOffsets::from_btf(prog_btf)
{
let sdt_mem = accessor.mem_reader(
shared_arena_snapshot.as_ref().map(|(_, snap)| snap),
&arena_page_index,
num_cpus,
);
for (var_name, var_offset, var_type_id) in iter_bss_vars_with_type(prog_btf, ".bss") {
if !is_scx_allocator_type(prog_btf, var_type_id) {
continue;
}
let Some(slice_end) = var_offset.checked_add(sdt_offsets.allocator_size) else {
continue;
};
let slice = match bss_bytes.get(var_offset..slice_end) {
Some(s) => s,
None => continue,
};
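            // Offset of pool.elem_size within the allocator: the pool is
            // embedded in the allocator struct, so the two offsets compose.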
let pool_off = sdt_offsets.allocator_pool + sdt_offsets.pool_elem_size;
let elem_size = if pool_off + 8 <= slice.len() {
let mut buf = [0u8; 8];
buf.copy_from_slice(&slice[pool_off..pool_off + 8]);
u64::from_le_bytes(buf)
} else {
0
};
let payload_size =
elem_size.saturating_sub(sdt_offsets.data_header_size as u64) as usize;
let choice = discover_payload_btf_id(prog_btf, payload_size);
let snap = walk_sdt_allocator(
accessor.kernel(),
arena_kern_vm_start,
slice,
&sdt_offsets,
prog_btf,
choice.btf_type_id,
choice.reason.clone(),
var_name.clone(),
&sdt_mem,
);
if choice.btf_type_id != 0 {
sdt_alloc_metas.push(crate::monitor::dump::render_map::SdtAllocMeta {
allocator_name: var_name,
elem_size,
header_size: sdt_offsets.data_header_size,
payload_btf_type_id: choice.btf_type_id,
kern_vm_start: arena_kern_vm_start,
});
}
if !snap.entries.is_empty() || snap.elem_size != 0 {
report.sdt_allocations.push(snap);
}
}
}
tracing::debug!(
elapsed_us = sdt_alloc_t0.elapsed().as_micros() as u64,
allocations = report.sdt_allocations.len(),
"dump_state phase: sdt_alloc"
);
let render_map_t0 = std::time::Instant::now();
let mut maps_rendered: usize = 0;
let mut maps_truncated: usize = 0;
for info in maps {
        if is_internal_map(&info.name()) {
            continue;
        }
if deadline_exceeded(&mut truncated_at_us) {
maps_truncated += 1;
continue;
}
let map_btf: Option<&Btf> = if info.map_type == super::bpf_map::BPF_MAP_TYPE_STRUCT_OPS {
Some(btf)
} else if info.btf_kva != 0 {
program_btfs.get(&info.btf_kva)
} else {
Some(btf)
};
let rendered = render_map(
&RenderMapCtx {
accessor,
btf: map_btf,
num_cpus,
arena_offsets,
shared_arena: shared_arena_ref,
arena_page_index: &arena_page_index,
sdt_alloc_metas: &sdt_alloc_metas,
},
&info,
);
report.maps.push(rendered);
maps_rendered += 1;
}
tracing::debug!(
elapsed_us = render_map_t0.elapsed().as_micros() as u64,
rendered = maps_rendered,
truncated = maps_truncated,
"dump_state phase: per-map render"
);
report.dump_truncated_at_us = truncated_at_us;
report
}
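/// Read the `kernel_cpustat` buckets, `kstat` softirq/IRQ counters, and
/// (when the symbol and offset are known) the tick-sched iowait sleeptime
/// for every CPU, adding each per-CPU offset to the symbol KVAs and
/// translating through the `page_offset` linear mapping.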
fn collect_per_cpu_time(cap: &CpuTimeCapture<'_>) -> Vec<PerCpuTimeStats> {
use super::btf_offsets::{
CPUTIME_IDLE, CPUTIME_IOWAIT, CPUTIME_IRQ, CPUTIME_NICE, CPUTIME_SOFTIRQ, CPUTIME_STEAL,
CPUTIME_SYSTEM, CPUTIME_USER, NR_SOFTIRQS,
};
let mut out = Vec::with_capacity(cap.per_cpu_offsets.len());
for (cpu_idx, &per_cpu_off) in cap.per_cpu_offsets.iter().enumerate() {
let cpu = cpu_idx as u32;
let cpustat_kva = cap.kernel_cpustat_kva.wrapping_add(per_cpu_off);
let cpustat_pa = super::symbols::kva_to_pa(cpustat_kva, cap.page_offset);
let cpustat_base = cap.offsets.kernel_cpustat_cpustat;
let read_cpustat = |idx: usize| -> u64 {
cap.mem.read_u64(cpustat_pa, cpustat_base + idx * 8)
};
let cpustat_user_ns = read_cpustat(CPUTIME_USER);
let cpustat_nice_ns = read_cpustat(CPUTIME_NICE);
let cpustat_system_ns = read_cpustat(CPUTIME_SYSTEM);
let cpustat_softirq_ns = read_cpustat(CPUTIME_SOFTIRQ);
let cpustat_irq_ns = read_cpustat(CPUTIME_IRQ);
let cpustat_idle_ns = read_cpustat(CPUTIME_IDLE);
let cpustat_iowait_ns = read_cpustat(CPUTIME_IOWAIT);
let cpustat_steal_ns = read_cpustat(CPUTIME_STEAL);
let kstat_kva = cap.kstat_kva.wrapping_add(per_cpu_off);
let kstat_pa = super::symbols::kva_to_pa(kstat_kva, cap.page_offset);
let mut softirqs = [0u64; NR_SOFTIRQS];
for (i, slot) in softirqs.iter_mut().enumerate() {
*slot = cap
.mem
.read_u32(kstat_pa, cap.offsets.kstat_softirqs + i * 4) as u64;
}
let irqs_sum = cap.mem.read_u64(kstat_pa, cap.offsets.kstat_irqs_sum);
let iowait_sleeptime_ns = cap
.tick_cpu_sched_kva
.zip(cap.offsets.tick_sched_iowait_sleeptime)
.map(|(tick_sym_kva, off)| {
let kva = tick_sym_kva.wrapping_add(per_cpu_off);
let pa = super::symbols::kva_to_pa(kva, cap.page_offset);
cap.mem.read_u64(pa, off)
});
out.push(PerCpuTimeStats {
cpu,
cpustat_user_ns,
cpustat_nice_ns,
cpustat_system_ns,
cpustat_softirq_ns,
cpustat_irq_ns,
cpustat_idle_ns,
cpustat_iowait_ns,
cpustat_steal_ns,
softirqs,
irqs_sum,
iowait_sleeptime_ns,
});
}
out
}
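/// Enumerate `(name, section offset, type id)` for every variable in the
/// named DATASEC of `btf`, skipping entries that fail to resolve.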
fn iter_bss_vars_with_type(btf: &Btf, section_name: &str) -> Vec<(String, usize, u32)> {
use btf_rs::BtfType;
let mut out = Vec::new();
let Ok(candidates) = btf.resolve_types_by_name(section_name) else {
return out;
};
for ty in candidates {
let btf_rs::Type::Datasec(ds) = ty else {
continue;
};
for var_info in &ds.variables {
let Ok(chained) = btf.resolve_chained_type(var_info) else {
continue;
};
let btf_rs::Type::Var(var) = chained else {
continue;
};
let Ok(name) = btf.resolve_name(&var) else {
continue;
};
let Ok(type_id) = var.get_type_id() else {
continue;
};
out.push((name, var_info.offset() as usize, type_id));
}
}
out
}
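/// Whether `type_id` resolves, through up to 20 levels of qualifiers and
/// typedefs, to `struct scx_allocator`.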
fn is_scx_allocator_type(btf: &Btf, type_id: u32) -> bool {
use btf_rs::Type as T;
let Ok(mut t) = btf.resolve_type_by_id(type_id) else {
return false;
};
for _ in 0..20 {
match t {
T::Struct(s) => {
return btf.resolve_name(&s).is_ok_and(|n| n == "scx_allocator");
}
T::Const(_) | T::Volatile(_) | T::Typedef(_) | T::Restrict(_) | T::TypeTag(_) => {
let Some(btf_ty) = t.as_btf_type() else {
return false;
};
let Ok(next) = btf.resolve_chained_type(btf_ty) else {
return false;
};
t = next;
}
_ => return false,
}
}
false
}
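/// Load a program's BTF blob from guest memory given its `struct btf` KVA:
/// read the data pointer and size, bail on null, empty, or oversized
/// (> `MAX_BTF_BLOB`) blobs, and parse as split BTF against `base_btf`
/// when the kernel object records a base.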
pub(super) fn load_program_btf_kva(
accessor: &GuestMemMapAccessor<'_>,
btf_kva: u64,
base_btf: &Btf,
) -> Option<Btf> {
let kernel = accessor.kernel();
let offsets = accessor.offsets();
let mem = kernel.mem();
let walk = kernel.walk_context();
let btf_pa = super::idr::translate_any_kva(
mem,
walk.cr3_pa,
walk.page_offset,
btf_kva,
walk.l5,
walk.tcr_el1,
)?;
let data_kva = mem.read_u64(btf_pa, offsets.btf_data);
let data_size = mem.read_u32(btf_pa, offsets.btf_data_size) as usize;
let base_kva = mem.read_u64(btf_pa, offsets.btf_base_btf);
if data_kva == 0 || data_size == 0 {
return None;
}
if data_size > MAX_BTF_BLOB {
return None;
}
let blob = kernel.read_kva_bytes_chunked(data_kva, data_size)?;
if base_kva != 0 {
Btf::from_split_bytes(&blob, base_btf).ok()
} else {
Btf::from_bytes(&blob).ok()
}
}
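/// Render `bytes` as lowercase, space-separated hex: `[0xde, 0xad]` becomes
/// `"de ad"`.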
pub(crate) fn hex_dump(bytes: &[u8]) -> String {
use std::fmt::Write;
let mut s = String::with_capacity(bytes.len() * 3);
for (i, b) in bytes.iter().enumerate() {
if i > 0 {
s.push(' ');
}
let _ = write!(s, "{b:02x}");
}
s
}