vortex-trace 0.1.0

Structured event tracing and replay for Vortex simulations
Documentation
//! Failure diagnosis helper for simulation invariant violations.
//!
//! When an invariant is violated, the diagnosis helper walks backward through
//! the trace to find the most likely root cause: which fault, at which tick,
//! on which node, led to the violation.

use crate::{SimTrace, TraceEvent, TraceEventKind};

/// Lightweight violation info for diagnosis (avoids coupling to vortex-check).
///
/// Carries what [`diagnose`] needs about a failed invariant: the tick that
/// bounds the trace search, plus the name and description that are copied
/// into the resulting [`DiagnosisReport`].
#[derive(Debug, Clone)]
pub struct ViolationInfo {
    /// Name of the invariant that was violated.
    pub invariant_name: String,
    /// Simulation tick when the violation was detected. [`diagnose`] ignores
    /// trace events whose tick is later than this value.
    pub tick: u64,
    /// Human-readable description.
    pub description: String,
}

/// Result of diagnosing an invariant violation.
///
/// Produced by [`diagnose`]; render it for humans with
/// [`DiagnosisReport::to_string_pretty`].
#[derive(Debug, Clone)]
pub struct DiagnosisReport {
    /// The invariant violation being diagnosed (a clone of the caller's value).
    pub violation: ViolationInfo,
    /// Most likely root cause fault event (if found). `None` when the trace
    /// holds no injected fault at or before the violation tick.
    pub suspected_root_cause: Option<FaultCause>,
    /// Chain of events leading to the violation, in trace order. Holds at
    /// most 50 entries; later matches are dropped.
    pub causal_events: Vec<CausalEvent>,
    /// Suggested investigation steps. The last entry is always the
    /// reproduction hint.
    pub suggestions: Vec<String>,
}

/// A fault event identified as a potential root cause.
///
/// Every field is copied from the `FaultInjected` trace event that
/// [`diagnose`] selected.
#[derive(Debug, Clone)]
pub struct FaultCause {
    /// Identifier of the originating trace event.
    pub event_id: u64,
    /// Simulation tick at which the fault was injected.
    pub tick: u64,
    /// Node the fault event was recorded against.
    pub node_id: u64,
    /// Fault type string from the event (for example `"crash"` or
    /// `"partition"`); it selects which suggestions the report contains.
    pub fault_type: String,
    /// Free-form details string carried by the fault event.
    pub details: String,
}

/// A significant event in the causal chain.
///
/// [`diagnose`] emits one of these for each fault-injected, fault-healed,
/// state-transition or storage event inside the diagnosis window.
#[derive(Debug, Clone)]
pub struct CausalEvent {
    /// Simulation tick of the underlying trace event.
    pub tick: u64,
    /// Node the underlying trace event was recorded against.
    pub node_id: u64,
    /// One-line human-readable rendering of the event kind.
    pub description: String,
}

/// Diagnose an invariant violation by examining the trace.
///
/// Walks backward from the violation tick to find the most recent fault event.
/// Returns a human-readable diagnosis report with suspected root cause,
/// causal chain, and investigation suggestions.
pub fn diagnose(violation: &ViolationInfo, trace: &SimTrace) -> DiagnosisReport {
    let violation_tick = violation.tick;

    // Find all fault events before the violation
    let fault_events: Vec<&TraceEvent> = trace
        .events()
        .iter()
        .filter(|e| e.tick <= violation_tick)
        .filter(|e| matches!(&e.kind, TraceEventKind::FaultInjected { .. }))
        .collect();

    // The most recent fault is our suspected root cause
    let suspected_root_cause = fault_events.last().map(|event| {
        let (fault_type, details) = match &event.kind {
            TraceEventKind::FaultInjected {
                fault_type,
                details,
            } => (fault_type.clone(), details.clone()),
            _ => ("unknown".to_string(), String::new()),
        };
        FaultCause {
            event_id: event.event_id,
            tick: event.tick,
            node_id: event.node_id,
            fault_type,
            details,
        }
    });

    // Build causal chain: significant events between root cause and violation
    let start_tick = suspected_root_cause.as_ref().map(|f| f.tick).unwrap_or(0);

    let causal_events: Vec<CausalEvent> = trace
        .events()
        .iter()
        .filter(|e| e.tick >= start_tick && e.tick <= violation_tick)
        .filter(|e| {
            matches!(
                &e.kind,
                TraceEventKind::FaultInjected { .. }
                    | TraceEventKind::FaultHealed { .. }
                    | TraceEventKind::StateTransition { .. }
                    | TraceEventKind::StorageOp { .. }
            )
        })
        .take(50)
        .map(|e| CausalEvent {
            tick: e.tick,
            node_id: e.node_id,
            description: describe_event(&e.kind),
        })
        .collect();

    // Generate investigation suggestions
    let mut suggestions = Vec::new();
    if let Some(ref cause) = suspected_root_cause {
        match cause.fault_type.as_str() {
            "crash" | "crash_at_step" | "cascade_crash" => {
                suggestions.push(format!(
                    "Node {} crashed at tick {}. Check if committed data was lost on restart.",
                    cause.node_id, cause.tick
                ));
                suggestions
                    .push("Replay the simulation to the crash tick to inspect state.".to_string());
            }
            "partition" | "one_way_partition" => {
                suggestions.push(format!(
                    "Network partition at tick {}: {}. Check if split-brain occurred.",
                    cause.tick, cause.details
                ));
                suggestions
                    .push("Check state transition events during the partition window.".to_string());
            }
            "clock_freeze" | "clock_warp" | "clock_skew" | "leap_second" => {
                suggestions.push(format!(
                    "Clock anomaly on node {} at tick {}: {}. Check timestamp-dependent logic.",
                    cause.node_id, cause.tick, cause.details
                ));
            }
            "slow_node" | "link_degrade" => {
                suggestions.push(format!(
                    "Node {} degraded at tick {}. Check for timeout-related failures.",
                    cause.node_id, cause.tick
                ));
            }
            "disk_full" => {
                suggestions.push(format!(
                    "Disk full on node {} at tick {}. Check storage error handling paths.",
                    cause.node_id, cause.tick
                ));
            }
            _ => {
                suggestions.push(format!(
                    "Fault '{}' on node {} at tick {}.",
                    cause.fault_type, cause.node_id, cause.tick
                ));
            }
        }
    } else {
        suggestions
            .push("No fault events found before violation. This may be a logic bug.".to_string());
    }

    suggestions.push(format!(
        "Reproduce: run with the same seed and use trace.events_between({}, {}) to inspect.",
        start_tick, violation_tick
    ));

    DiagnosisReport {
        violation: violation.clone(),
        suspected_root_cause,
        causal_events,
        suggestions,
    }
}

/// Renders a trace event kind as a single line for the causal chain.
///
/// Fault injection/healing, state transitions and storage operations get a
/// compact tagged form (`FAULT+`, `FAULT-`, `STATE`, `STORAGE`); every other
/// kind falls back to its `Debug` representation.
fn describe_event(kind: &TraceEventKind) -> String {
    let rendered = match kind {
        TraceEventKind::FaultInjected {
            fault_type,
            details,
        } => format!("FAULT+ {}: {}", fault_type, details),
        TraceEventKind::FaultHealed {
            fault_type,
            details,
        } => format!("FAULT- {}: {}", fault_type, details),
        TraceEventKind::StateTransition {
            from_state,
            to_state,
            metadata,
        } => format!("STATE {} -> {} ({})", from_state, to_state, metadata),
        TraceEventKind::StorageOp { op_type, key_count } => {
            format!("STORAGE {} keys={}", op_type, key_count)
        }
        unlisted => format!("{unlisted:?}"),
    };
    rendered
}

impl DiagnosisReport {
    /// Human-readable diagnosis output.
    pub fn to_string_pretty(&self) -> String {
        let mut out = String::new();
        out.push_str(&format!(
            "=== Diagnosis: {} ===\n",
            self.violation.invariant_name
        ));
        out.push_str(&format!(
            "Violation at tick {}: {}\n\n",
            self.violation.tick, self.violation.description
        ));

        if let Some(ref cause) = self.suspected_root_cause {
            out.push_str(&format!(
                "Suspected root cause: {} on node {} at tick {}\n",
                cause.fault_type, cause.node_id, cause.tick
            ));
            out.push_str(&format!("  Details: {}\n\n", cause.details));
        } else {
            out.push_str("No fault-based root cause identified.\n\n");
        }

        if !self.causal_events.is_empty() {
            out.push_str("Causal chain:\n");
            for event in &self.causal_events {
                out.push_str(&format!(
                    "  [tick {}] node {}: {}\n",
                    event.tick, event.node_id, event.description
                ));
            }
            out.push('\n');
        }

        out.push_str("Suggestions:\n");
        for (i, s) in self.suggestions.iter().enumerate() {
            out.push_str(&format!("  {}. {}\n", i + 1, s));
        }

        out
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Violation fixture shared by every test: `NoDataLoss` detected at tick 500.
    fn make_violation() -> ViolationInfo {
        ViolationInfo {
            invariant_name: "NoDataLoss".into(),
            description: "Key 'abc' missing on node 2".into(),
            tick: 500,
        }
    }

    /// Shorthand for a `FaultInjected` event kind.
    fn injected(fault_type: &str, details: &str) -> TraceEventKind {
        TraceEventKind::FaultInjected {
            fault_type: fault_type.into(),
            details: details.into(),
        }
    }

    /// Shorthand for a `FaultHealed` event kind.
    fn healed(fault_type: &str, details: &str) -> TraceEventKind {
        TraceEventKind::FaultHealed {
            fault_type: fault_type.into(),
            details: details.into(),
        }
    }

    /// Shorthand for a `StateTransition` event kind.
    fn transition(from_state: &str, to_state: &str, metadata: &str) -> TraceEventKind {
        TraceEventKind::StateTransition {
            from_state: from_state.into(),
            to_state: to_state.into(),
            metadata: metadata.into(),
        }
    }

    /// A crash recorded between two leader elections is picked as the root cause.
    #[test]
    fn test_diagnosis_crash_induced() {
        let mut trace = SimTrace::new();
        trace.record(100, 1, transition("Follower", "Leader", "term=1"));
        trace.record(200, 2, injected("crash", "node 2 crashed"));
        trace.record(300, 3, transition("Follower", "Leader", "term=2"));

        let report = diagnose(&make_violation(), &trace);
        let cause = report
            .suspected_root_cause
            .expect("crash fault should be reported as the root cause");
        assert_eq!(cause.fault_type, "crash");
        assert_eq!(cause.node_id, 2);
        assert_eq!(cause.tick, 200);
    }

    /// A partition root cause yields the split-brain suggestion.
    #[test]
    fn test_diagnosis_partition_induced() {
        let mut trace = SimTrace::new();
        trace.record(100, 0, injected("partition", "[1, 2] <-> [3]"));
        trace.record(200, 0, healed("partition", "healed"));

        let report = diagnose(&make_violation(), &trace);
        let cause = report.suspected_root_cause.unwrap();
        assert_eq!(cause.fault_type, "partition");

        let mentions_split_brain = report
            .suggestions
            .iter()
            .any(|suggestion| suggestion.contains("split-brain"));
        assert!(mentions_split_brain);
    }

    /// An empty trace has no root cause and points at a logic bug instead.
    #[test]
    fn test_diagnosis_no_faults() {
        let report = diagnose(&make_violation(), &SimTrace::new());
        assert!(report.suspected_root_cause.is_none());

        let mentions_logic_bug = report
            .suggestions
            .iter()
            .any(|suggestion| suggestion.contains("logic bug"));
        assert!(mentions_logic_bug);
    }

    /// The pretty output contains every expected section marker.
    #[test]
    fn test_diagnosis_human_readable() {
        let mut trace = SimTrace::new();
        trace.record(100, 1, injected("crash", "node 1 crashed"));

        let pretty = diagnose(&make_violation(), &trace).to_string_pretty();
        for needle in [
            "Diagnosis:",
            "NoDataLoss",
            "Suspected root cause",
            "Suggestions",
        ] {
            assert!(pretty.contains(needle));
        }
    }

    /// Events between the partition and the violation appear in the causal chain.
    #[test]
    fn test_diagnosis_causal_chain() {
        let mut trace = SimTrace::new();
        trace.record(100, 0, injected("partition", "split"));
        trace.record(200, 1, transition("Follower", "Candidate", ""));
        trace.record(300, 1, transition("Candidate", "Leader", ""));
        trace.record(400, 0, healed("partition", "healed"));

        let report = diagnose(&make_violation(), &trace);
        assert!(report.causal_events.len() >= 3);
    }
}