use crate::{SimTrace, TraceEvent, TraceEventKind};
/// An invariant violation that `diagnose` is asked to explain.
#[derive(Debug, Clone)]
pub struct ViolationInfo {
/// Name of the violated invariant; shown in the header line of the pretty report.
pub invariant_name: String,
/// Tick of the violation; `diagnose` ignores trace events with a later tick.
pub tick: u64,
/// Description of the violation, printed as-is in the pretty report.
pub description: String,
}
/// Result of `diagnose`: the violation plus everything inferred about it.
#[derive(Debug, Clone)]
pub struct DiagnosisReport {
/// Copy of the violation that was diagnosed.
pub violation: ViolationInfo,
/// Last fault injected at or before the violation tick; `None` when the
/// trace holds no such fault.
pub suspected_root_cause: Option<FaultCause>,
/// Fault, heal, state-transition and storage events between the root-cause
/// tick (tick 0 when there is no root cause) and the violation tick.
/// `diagnose` keeps at most the first 50 of them.
pub causal_events: Vec<CausalEvent>,
/// Human-readable investigation hints; `diagnose` always ends the list
/// with a reproduction hint.
pub suggestions: Vec<String>,
}
/// The injected fault that `diagnose` suspects of having caused a violation.
#[derive(Debug, Clone)]
pub struct FaultCause {
/// `event_id` of the trace event that recorded the fault.
pub event_id: u64,
/// Tick at which the fault was injected.
pub tick: u64,
/// `node_id` of the trace event that recorded the fault.
pub node_id: u64,
/// Fault type string (e.g. "crash", "partition", "disk_full"); `diagnose`
/// selects its suggestions from this value.
pub fault_type: String,
/// Details string copied from the fault-injection event.
pub details: String,
}
/// One entry of the causal chain listed in a `DiagnosisReport`.
#[derive(Debug, Clone)]
pub struct CausalEvent {
/// Tick of the underlying trace event.
pub tick: u64,
/// `node_id` of the underlying trace event.
pub node_id: u64,
/// One-line rendering of the event kind, produced by `describe_event`.
pub description: String,
}
pub fn diagnose(violation: &ViolationInfo, trace: &SimTrace) -> DiagnosisReport {
let violation_tick = violation.tick;
let fault_events: Vec<&TraceEvent> = trace
.events()
.iter()
.filter(|e| e.tick <= violation_tick)
.filter(|e| matches!(&e.kind, TraceEventKind::FaultInjected { .. }))
.collect();
let suspected_root_cause = fault_events.last().map(|event| {
let (fault_type, details) = match &event.kind {
TraceEventKind::FaultInjected {
fault_type,
details,
} => (fault_type.clone(), details.clone()),
_ => ("unknown".to_string(), String::new()),
};
FaultCause {
event_id: event.event_id,
tick: event.tick,
node_id: event.node_id,
fault_type,
details,
}
});
let start_tick = suspected_root_cause.as_ref().map(|f| f.tick).unwrap_or(0);
let causal_events: Vec<CausalEvent> = trace
.events()
.iter()
.filter(|e| e.tick >= start_tick && e.tick <= violation_tick)
.filter(|e| {
matches!(
&e.kind,
TraceEventKind::FaultInjected { .. }
| TraceEventKind::FaultHealed { .. }
| TraceEventKind::StateTransition { .. }
| TraceEventKind::StorageOp { .. }
)
})
.take(50)
.map(|e| CausalEvent {
tick: e.tick,
node_id: e.node_id,
description: describe_event(&e.kind),
})
.collect();
let mut suggestions = Vec::new();
if let Some(ref cause) = suspected_root_cause {
match cause.fault_type.as_str() {
"crash" | "crash_at_step" | "cascade_crash" => {
suggestions.push(format!(
"Node {} crashed at tick {}. Check if committed data was lost on restart.",
cause.node_id, cause.tick
));
suggestions
.push("Replay the simulation to the crash tick to inspect state.".to_string());
}
"partition" | "one_way_partition" => {
suggestions.push(format!(
"Network partition at tick {}: {}. Check if split-brain occurred.",
cause.tick, cause.details
));
suggestions
.push("Check state transition events during the partition window.".to_string());
}
"clock_freeze" | "clock_warp" | "clock_skew" | "leap_second" => {
suggestions.push(format!(
"Clock anomaly on node {} at tick {}: {}. Check timestamp-dependent logic.",
cause.node_id, cause.tick, cause.details
));
}
"slow_node" | "link_degrade" => {
suggestions.push(format!(
"Node {} degraded at tick {}. Check for timeout-related failures.",
cause.node_id, cause.tick
));
}
"disk_full" => {
suggestions.push(format!(
"Disk full on node {} at tick {}. Check storage error handling paths.",
cause.node_id, cause.tick
));
}
_ => {
suggestions.push(format!(
"Fault '{}' on node {} at tick {}.",
cause.fault_type, cause.node_id, cause.tick
));
}
}
} else {
suggestions
.push("No fault events found before violation. This may be a logic bug.".to_string());
}
suggestions.push(format!(
"Reproduce: run with the same seed and use trace.events_between({}, {}) to inspect.",
start_tick, violation_tick
));
DiagnosisReport {
violation: violation.clone(),
suspected_root_cause,
causal_events,
suggestions,
}
}
/// Renders a trace event kind as the one-line text used in the causal chain.
///
/// Fault injections and heals are prefixed with `FAULT+` / `FAULT-`, state
/// transitions with `STATE` and storage operations with `STORAGE`. Any other
/// kind falls back to its `Debug` representation.
fn describe_event(kind: &TraceEventKind) -> String {
    match kind {
        TraceEventKind::FaultInjected {
            fault_type,
            details,
        } => format!("FAULT+ {fault_type}: {details}"),
        TraceEventKind::FaultHealed {
            fault_type,
            details,
        } => format!("FAULT- {fault_type}: {details}"),
        TraceEventKind::StateTransition {
            from_state,
            to_state,
            metadata,
        } => format!("STATE {from_state} -> {to_state} ({metadata})"),
        TraceEventKind::StorageOp { op_type, key_count } => {
            format!("STORAGE {op_type} keys={key_count}")
        }
        // Kinds without a dedicated rendering are shown in Debug form.
        other => format!("{:?}", other),
    }
}
impl DiagnosisReport {
pub fn to_string_pretty(&self) -> String {
let mut out = String::new();
out.push_str(&format!(
"=== Diagnosis: {} ===\n",
self.violation.invariant_name
));
out.push_str(&format!(
"Violation at tick {}: {}\n\n",
self.violation.tick, self.violation.description
));
if let Some(ref cause) = self.suspected_root_cause {
out.push_str(&format!(
"Suspected root cause: {} on node {} at tick {}\n",
cause.fault_type, cause.node_id, cause.tick
));
out.push_str(&format!(" Details: {}\n\n", cause.details));
} else {
out.push_str("No fault-based root cause identified.\n\n");
}
if !self.causal_events.is_empty() {
out.push_str("Causal chain:\n");
for event in &self.causal_events {
out.push_str(&format!(
" [tick {}] node {}: {}\n",
event.tick, event.node_id, event.description
));
}
out.push('\n');
}
out.push_str("Suggestions:\n");
for (i, s) in self.suggestions.iter().enumerate() {
out.push_str(&format!(" {}. {}\n", i + 1, s));
}
out
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Violation shared by all tests: `NoDataLoss` observed at tick 500.
    fn make_violation() -> ViolationInfo {
        ViolationInfo {
            invariant_name: "NoDataLoss".into(),
            description: "Key 'abc' missing on node 2".into(),
            tick: 500,
        }
    }

    /// Builds a fault-injection event kind.
    fn injected(fault_type: &'static str, details: &'static str) -> TraceEventKind {
        TraceEventKind::FaultInjected {
            fault_type: fault_type.into(),
            details: details.into(),
        }
    }

    /// Builds a fault-healed event kind.
    fn healed(fault_type: &'static str, details: &'static str) -> TraceEventKind {
        TraceEventKind::FaultHealed {
            fault_type: fault_type.into(),
            details: details.into(),
        }
    }

    /// Builds a state-transition event kind.
    fn transition(
        from_state: &'static str,
        to_state: &'static str,
        metadata: &'static str,
    ) -> TraceEventKind {
        TraceEventKind::StateTransition {
            from_state: from_state.into(),
            to_state: to_state.into(),
            metadata: metadata.into(),
        }
    }

    #[test]
    fn test_diagnosis_crash_induced() {
        let mut trace = SimTrace::new();
        trace.record(100, 1, transition("Follower", "Leader", "term=1"));
        trace.record(200, 2, injected("crash", "node 2 crashed"));
        trace.record(300, 3, transition("Follower", "Leader", "term=2"));

        let report = diagnose(&make_violation(), &trace);

        let cause = report
            .suspected_root_cause
            .expect("the crash should be reported as the root cause");
        assert_eq!(cause.fault_type, "crash");
        assert_eq!(cause.node_id, 2);
        assert_eq!(cause.tick, 200);
    }

    #[test]
    fn test_diagnosis_partition_induced() {
        let mut trace = SimTrace::new();
        trace.record(100, 0, injected("partition", "[1, 2] <-> [3]"));
        trace.record(200, 0, healed("partition", "healed"));

        let report = diagnose(&make_violation(), &trace);

        let cause = report.suspected_root_cause.unwrap();
        assert_eq!(cause.fault_type, "partition");
        assert!(report.suggestions.iter().any(|s| s.contains("split-brain")));
    }

    #[test]
    fn test_diagnosis_no_faults() {
        let report = diagnose(&make_violation(), &SimTrace::new());

        assert!(report.suspected_root_cause.is_none());
        assert!(report.suggestions.iter().any(|s| s.contains("logic bug")));
    }

    #[test]
    fn test_diagnosis_human_readable() {
        let mut trace = SimTrace::new();
        trace.record(100, 1, injected("crash", "node 1 crashed"));

        let pretty = diagnose(&make_violation(), &trace).to_string_pretty();

        for needle in ["Diagnosis:", "NoDataLoss", "Suspected root cause", "Suggestions"] {
            assert!(pretty.contains(needle));
        }
    }

    #[test]
    fn test_diagnosis_causal_chain() {
        let mut trace = SimTrace::new();
        trace.record(100, 0, injected("partition", "split"));
        trace.record(200, 1, transition("Follower", "Candidate", ""));
        trace.record(300, 1, transition("Candidate", "Leader", ""));
        trace.record(400, 0, healed("partition", "healed"));

        let report = diagnose(&make_violation(), &trace);

        assert!(report.causal_events.len() >= 3);
    }
}