Skip to main content

vortex_trace/
diagnosis.rs

//! Failure diagnosis helper for simulation invariant violations.
//!
//! When an invariant is violated, the diagnosis helper walks backward through
//! the trace to find the most likely root cause: which fault, at which tick,
//! on which node, led to the violation.
6
7use crate::{SimTrace, TraceEvent, TraceEventKind};
8
/// Lightweight violation info for diagnosis (avoids coupling to vortex-check).
///
/// Holds the three pieces of information the diagnosis code reads: which
/// invariant failed, when the failure was detected, and a description of it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ViolationInfo {
    /// Name of the invariant that was violated.
    pub invariant_name: String,
    /// Simulation tick when the violation was detected.
    pub tick: u64,
    /// Human-readable description.
    pub description: String,
}
19
20/// Result of diagnosing an invariant violation.
21#[derive(Debug, Clone)]
22pub struct DiagnosisReport {
23    /// The invariant violation being diagnosed.
24    pub violation: ViolationInfo,
25    /// Most likely root cause fault event (if found).
26    pub suspected_root_cause: Option<FaultCause>,
27    /// Chain of events leading to the violation.
28    pub causal_events: Vec<CausalEvent>,
29    /// Suggested investigation steps.
30    pub suggestions: Vec<String>,
31}
32
/// A fault event identified as a potential root cause.
///
/// Every field is copied from the fault-injection trace event that was
/// selected as the suspect.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FaultCause {
    /// Identifier of the trace event that recorded the fault.
    pub event_id: u64,
    /// Simulation tick of the fault event.
    pub tick: u64,
    /// Node recorded on the fault event.
    pub node_id: u64,
    /// Fault type string carried by the event (for example `"crash"` or
    /// `"partition"`).
    pub fault_type: String,
    /// Free-form details string carried by the event.
    pub details: String,
}
42
/// A significant event in the causal chain.
///
/// A flattened, display-ready view of one trace event: where and when it
/// happened plus a one-line description of what it was.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CausalEvent {
    /// Simulation tick of the underlying trace event.
    pub tick: u64,
    /// Node recorded on the underlying trace event.
    pub node_id: u64,
    /// Human-readable rendering of the event.
    pub description: String,
}
50
51/// Diagnose an invariant violation by examining the trace.
52///
53/// Walks backward from the violation tick to find the most recent fault event.
54/// Returns a human-readable diagnosis report with suspected root cause,
55/// causal chain, and investigation suggestions.
56pub fn diagnose(violation: &ViolationInfo, trace: &SimTrace) -> DiagnosisReport {
57    let violation_tick = violation.tick;
58
59    // Find all fault events before the violation
60    let fault_events: Vec<&TraceEvent> = trace
61        .events()
62        .iter()
63        .filter(|e| e.tick <= violation_tick)
64        .filter(|e| matches!(&e.kind, TraceEventKind::FaultInjected { .. }))
65        .collect();
66
67    // The most recent fault is our suspected root cause
68    let suspected_root_cause = fault_events.last().map(|event| {
69        let (fault_type, details) = match &event.kind {
70            TraceEventKind::FaultInjected {
71                fault_type,
72                details,
73            } => (fault_type.clone(), details.clone()),
74            _ => ("unknown".to_string(), String::new()),
75        };
76        FaultCause {
77            event_id: event.event_id,
78            tick: event.tick,
79            node_id: event.node_id,
80            fault_type,
81            details,
82        }
83    });
84
85    // Build causal chain: significant events between root cause and violation
86    let start_tick = suspected_root_cause.as_ref().map(|f| f.tick).unwrap_or(0);
87
88    let causal_events: Vec<CausalEvent> = trace
89        .events()
90        .iter()
91        .filter(|e| e.tick >= start_tick && e.tick <= violation_tick)
92        .filter(|e| {
93            matches!(
94                &e.kind,
95                TraceEventKind::FaultInjected { .. }
96                    | TraceEventKind::FaultHealed { .. }
97                    | TraceEventKind::StateTransition { .. }
98                    | TraceEventKind::StorageOp { .. }
99            )
100        })
101        .take(50)
102        .map(|e| CausalEvent {
103            tick: e.tick,
104            node_id: e.node_id,
105            description: describe_event(&e.kind),
106        })
107        .collect();
108
109    // Generate investigation suggestions
110    let mut suggestions = Vec::new();
111    if let Some(ref cause) = suspected_root_cause {
112        match cause.fault_type.as_str() {
113            "crash" | "crash_at_step" | "cascade_crash" => {
114                suggestions.push(format!(
115                    "Node {} crashed at tick {}. Check if committed data was lost on restart.",
116                    cause.node_id, cause.tick
117                ));
118                suggestions
119                    .push("Replay the simulation to the crash tick to inspect state.".to_string());
120            }
121            "partition" | "one_way_partition" => {
122                suggestions.push(format!(
123                    "Network partition at tick {}: {}. Check if split-brain occurred.",
124                    cause.tick, cause.details
125                ));
126                suggestions
127                    .push("Check state transition events during the partition window.".to_string());
128            }
129            "clock_freeze" | "clock_warp" | "clock_skew" | "leap_second" => {
130                suggestions.push(format!(
131                    "Clock anomaly on node {} at tick {}: {}. Check timestamp-dependent logic.",
132                    cause.node_id, cause.tick, cause.details
133                ));
134            }
135            "slow_node" | "link_degrade" => {
136                suggestions.push(format!(
137                    "Node {} degraded at tick {}. Check for timeout-related failures.",
138                    cause.node_id, cause.tick
139                ));
140            }
141            "disk_full" => {
142                suggestions.push(format!(
143                    "Disk full on node {} at tick {}. Check storage error handling paths.",
144                    cause.node_id, cause.tick
145                ));
146            }
147            _ => {
148                suggestions.push(format!(
149                    "Fault '{}' on node {} at tick {}.",
150                    cause.fault_type, cause.node_id, cause.tick
151                ));
152            }
153        }
154    } else {
155        suggestions
156            .push("No fault events found before violation. This may be a logic bug.".to_string());
157    }
158
159    suggestions.push(format!(
160        "Reproduce: run with the same seed and use trace.events_between({}, {}) to inspect.",
161        start_tick, violation_tick
162    ));
163
164    DiagnosisReport {
165        violation: violation.clone(),
166        suspected_root_cause,
167        causal_events,
168        suggestions,
169    }
170}
171
172/// Human-readable description of a trace event.
173fn describe_event(kind: &TraceEventKind) -> String {
174    match kind {
175        TraceEventKind::FaultInjected {
176            fault_type,
177            details,
178        } => {
179            format!("FAULT+ {fault_type}: {details}")
180        }
181        TraceEventKind::FaultHealed {
182            fault_type,
183            details,
184        } => {
185            format!("FAULT- {fault_type}: {details}")
186        }
187        TraceEventKind::StateTransition {
188            from_state,
189            to_state,
190            metadata,
191        } => {
192            format!("STATE {from_state} -> {to_state} ({metadata})")
193        }
194        TraceEventKind::StorageOp { op_type, key_count } => {
195            format!("STORAGE {op_type} keys={key_count}")
196        }
197        other => format!("{:?}", other),
198    }
199}
200
201impl DiagnosisReport {
202    /// Human-readable diagnosis output.
203    pub fn to_string_pretty(&self) -> String {
204        let mut out = String::new();
205        out.push_str(&format!(
206            "=== Diagnosis: {} ===\n",
207            self.violation.invariant_name
208        ));
209        out.push_str(&format!(
210            "Violation at tick {}: {}\n\n",
211            self.violation.tick, self.violation.description
212        ));
213
214        if let Some(ref cause) = self.suspected_root_cause {
215            out.push_str(&format!(
216                "Suspected root cause: {} on node {} at tick {}\n",
217                cause.fault_type, cause.node_id, cause.tick
218            ));
219            out.push_str(&format!("  Details: {}\n\n", cause.details));
220        } else {
221            out.push_str("No fault-based root cause identified.\n\n");
222        }
223
224        if !self.causal_events.is_empty() {
225            out.push_str("Causal chain:\n");
226            for event in &self.causal_events {
227                out.push_str(&format!(
228                    "  [tick {}] node {}: {}\n",
229                    event.tick, event.node_id, event.description
230                ));
231            }
232            out.push('\n');
233        }
234
235        out.push_str("Suggestions:\n");
236        for (i, s) in self.suggestions.iter().enumerate() {
237            out.push_str(&format!("  {}. {}\n", i + 1, s));
238        }
239
240        out
241    }
242}
243
#[cfg(test)]
mod tests {
    use super::*;

    /// Violation shared by every test: `NoDataLoss`, detected at tick 500.
    fn make_violation() -> ViolationInfo {
        ViolationInfo {
            invariant_name: "NoDataLoss".into(),
            description: "Key 'abc' missing on node 2".into(),
            tick: 500,
        }
    }

    /// Shorthand for a `FaultInjected` event kind.
    fn injected(fault_type: &str, details: &str) -> TraceEventKind {
        TraceEventKind::FaultInjected {
            fault_type: fault_type.into(),
            details: details.into(),
        }
    }

    /// Shorthand for a `FaultHealed` event kind.
    fn healed(fault_type: &str, details: &str) -> TraceEventKind {
        TraceEventKind::FaultHealed {
            fault_type: fault_type.into(),
            details: details.into(),
        }
    }

    /// Shorthand for a `StateTransition` event kind.
    fn transition(from_state: &str, to_state: &str, metadata: &str) -> TraceEventKind {
        TraceEventKind::StateTransition {
            from_state: from_state.into(),
            to_state: to_state.into(),
            metadata: metadata.into(),
        }
    }

    #[test]
    fn test_diagnosis_crash_induced() {
        let mut trace = SimTrace::new();
        trace.record(100, 1, transition("Follower", "Leader", "term=1"));
        trace.record(200, 2, injected("crash", "node 2 crashed"));
        trace.record(300, 3, transition("Follower", "Leader", "term=2"));

        let report = diagnose(&make_violation(), &trace);
        let cause = report
            .suspected_root_cause
            .as_ref()
            .expect("the crash should be reported as the root cause");
        assert_eq!(cause.fault_type, "crash");
        assert_eq!(cause.node_id, 2);
        assert_eq!(cause.tick, 200);

        // The transition at tick 100 precedes the fault and must not appear.
        let ticks: Vec<u64> = report.causal_events.iter().map(|e| e.tick).collect();
        assert_eq!(ticks, vec![200, 300]);
    }

    #[test]
    fn test_diagnosis_partition_induced() {
        let mut trace = SimTrace::new();
        trace.record(100, 0, injected("partition", "[1, 2] <-> [3]"));
        trace.record(200, 0, healed("partition", "healed"));

        let report = diagnose(&make_violation(), &trace);
        let cause = report
            .suspected_root_cause
            .as_ref()
            .expect("the partition should be reported as the root cause");
        assert_eq!(cause.fault_type, "partition");
        assert!(report.suggestions.iter().any(|s| s.contains("split-brain")));
    }

    #[test]
    fn test_diagnosis_no_faults() {
        let trace = SimTrace::new();
        let report = diagnose(&make_violation(), &trace);
        assert!(report.suspected_root_cause.is_none());
        assert!(report.causal_events.is_empty());
        assert!(report.suggestions.iter().any(|s| s.contains("logic bug")));
    }

    #[test]
    fn test_diagnosis_ignores_faults_after_violation() {
        let mut trace = SimTrace::new();
        // Tick 600 is later than the violation at tick 500.
        trace.record(600, 1, injected("crash", "node 1 crashed"));

        let report = diagnose(&make_violation(), &trace);
        assert!(report.suspected_root_cause.is_none());
        assert!(report.causal_events.is_empty());
        assert!(report.suggestions.iter().any(|s| s.contains("logic bug")));
    }

    #[test]
    fn test_diagnosis_human_readable() {
        let mut trace = SimTrace::new();
        trace.record(100, 1, injected("crash", "node 1 crashed"));

        let report = diagnose(&make_violation(), &trace);
        let pretty = report.to_string_pretty();
        assert!(pretty.contains("Diagnosis:"));
        assert!(pretty.contains("NoDataLoss"));
        assert!(pretty.contains("Suspected root cause"));
        assert!(pretty.contains("Suggestions"));
    }

    #[test]
    fn test_diagnosis_causal_chain() {
        let mut trace = SimTrace::new();
        trace.record(100, 0, injected("partition", "split"));
        trace.record(200, 1, transition("Follower", "Candidate", ""));
        trace.record(300, 1, transition("Candidate", "Leader", ""));
        trace.record(400, 0, healed("partition", "healed"));

        let report = diagnose(&make_violation(), &trace);
        // All four events fall inside the fault-to-violation window.
        let ticks: Vec<u64> = report.causal_events.iter().map(|e| e.tick).collect();
        assert_eq!(ticks, vec![100, 200, 300, 400]);
        assert_eq!(report.causal_events[0].description, "FAULT+ partition: split");
    }
}