Skip to main content

aivcs_core/
self_healing.rs

//! Verification and self-healing orchestration primitives.
//!
//! This module provides:
//! - failure taxonomy classification
//! - bounded auto-repair loop decisions
//! - auditable recovery artifacts with digest verification
7
8use std::path::{Path, PathBuf};
9
10use chrono::{DateTime, Utc};
11use serde::{Deserialize, Serialize};
12
13use crate::domain::{AivcsError, Result};
14use oxidized_state::storage_traits::ContentDigest;
15
16/// Coarse failure taxonomy used by the recovery planner.
17#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
18#[serde(rename_all = "snake_case")]
19pub enum FailureClass {
20    Build,
21    Test,
22    Runtime,
23    Integration,
24    Unknown,
25}
26
27/// Structured failure signal from verification/runtime stages.
28#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
29pub struct FailureSignal {
30    pub stage: String,
31    pub message: String,
32    pub exit_code: Option<i32>,
33    pub flaky_hint: bool,
34}
35
36impl FailureSignal {
37    pub fn new(stage: impl Into<String>, message: impl Into<String>) -> Self {
38        Self {
39            stage: stage.into(),
40            message: message.into(),
41            exit_code: None,
42            flaky_hint: false,
43        }
44    }
45}
46
47/// Recovery action selected per attempt.
48#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
49#[serde(rename_all = "snake_case")]
50pub enum RecoveryAction {
51    Retry,
52    PatchForward,
53    Rollback,
54    Escalate,
55}
56
57/// Recovery loop final state.
58#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
59#[serde(rename_all = "snake_case")]
60pub enum RecoveryOutcome {
61    Recovered,
62    Failed,
63}
64
65/// Bounded recovery policy.
66#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
67pub struct RecoveryPolicy {
68    pub max_attempts: u32,
69    pub max_flaky_retries: u32,
70    pub allow_patch_forward: bool,
71    pub allow_rollback: bool,
72}
73
74impl Default for RecoveryPolicy {
75    fn default() -> Self {
76        Self {
77            max_attempts: 3,
78            max_flaky_retries: 1,
79            allow_patch_forward: true,
80            allow_rollback: true,
81        }
82    }
83}
84
85/// One auditable decision in the recovery timeline.
86#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
87pub struct RecoveryDecision {
88    pub attempt: u32,
89    pub failure_class: FailureClass,
90    pub action: RecoveryAction,
91    pub rationale: String,
92}
93
94/// Result from one repair attempt.
95#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
96pub struct RecoveryAttemptResult {
97    pub success: bool,
98    pub next_failure: Option<FailureSignal>,
99}
100
101/// Full recovery log for artifacts/audit.
102#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
103pub struct RecoveryLog {
104    pub run_id: String,
105    pub policy: RecoveryPolicy,
106    pub initial_failure: FailureSignal,
107    pub decisions: Vec<RecoveryDecision>,
108    pub outcome: RecoveryOutcome,
109    pub attempts_used: u32,
110    pub final_failure: Option<FailureSignal>,
111    pub evaluated_at: DateTime<Utc>,
112}
113
114/// Classify a failure into a coarse category.
115pub fn classify_failure(signal: &FailureSignal) -> FailureClass {
116    let stage = signal.stage.to_lowercase();
117    let msg = signal.message.to_lowercase();
118
119    if stage.contains("build")
120        || stage.contains("compile")
121        || msg.contains("compil")
122        || msg.contains("linker error")
123    {
124        return FailureClass::Build;
125    }
126    if stage.contains("test")
127        || msg.contains("assertion")
128        || msg.contains("test failed")
129        || msg.contains("snapshot mismatch")
130    {
131        return FailureClass::Test;
132    }
133    if stage.contains("runtime")
134        || msg.contains("panic")
135        || msg.contains("segmentation fault")
136        || msg.contains("null pointer")
137    {
138        return FailureClass::Runtime;
139    }
140    if stage.contains("integration")
141        || msg.contains("contract")
142        || msg.contains("handshake")
143        || msg.contains("dependency unavailable")
144    {
145        return FailureClass::Integration;
146    }
147
148    FailureClass::Unknown
149}
150
151fn decide_action(
152    class: FailureClass,
153    signal: &FailureSignal,
154    policy: &RecoveryPolicy,
155    flaky_retries_used: u32,
156) -> (RecoveryAction, String) {
157    if class == FailureClass::Test
158        && signal.flaky_hint
159        && flaky_retries_used < policy.max_flaky_retries
160    {
161        return (
162            RecoveryAction::Retry,
163            "flaky signal detected; bounded retry permitted".to_string(),
164        );
165    }
166
167    if policy.allow_patch_forward && matches!(class, FailureClass::Build | FailureClass::Test) {
168        return (
169            RecoveryAction::PatchForward,
170            "build/test failure; patch-forward is enabled".to_string(),
171        );
172    }
173
174    if policy.allow_rollback && matches!(class, FailureClass::Runtime | FailureClass::Integration) {
175        return (
176            RecoveryAction::Rollback,
177            "runtime/integration failure; rollback is enabled".to_string(),
178        );
179    }
180
181    (
182        RecoveryAction::Escalate,
183        "no safe automated action available under policy".to_string(),
184    )
185}
186
187/// Run a bounded, policy-controlled recovery loop.
188pub fn execute_recovery_loop<F>(
189    run_id: &str,
190    initial_failure: FailureSignal,
191    policy: RecoveryPolicy,
192    mut apply_action: F,
193) -> RecoveryLog
194where
195    F: FnMut(u32, RecoveryAction, &FailureSignal) -> RecoveryAttemptResult,
196{
197    let mut current = initial_failure.clone();
198    let mut decisions = Vec::new();
199    let mut flaky_retries_used = 0u32;
200    let mut attempts_used = 0u32;
201
202    for attempt in 1..=policy.max_attempts {
203        attempts_used = attempt;
204        let class = classify_failure(&current);
205        let (action, rationale) = decide_action(class, &current, &policy, flaky_retries_used);
206        if action == RecoveryAction::Retry && current.flaky_hint {
207            flaky_retries_used += 1;
208        }
209
210        decisions.push(RecoveryDecision {
211            attempt,
212            failure_class: class,
213            action,
214            rationale,
215        });
216
217        if action == RecoveryAction::Escalate {
218            return RecoveryLog {
219                run_id: run_id.to_string(),
220                policy,
221                initial_failure,
222                decisions,
223                outcome: RecoveryOutcome::Failed,
224                attempts_used,
225                final_failure: Some(current),
226                evaluated_at: Utc::now(),
227            };
228        }
229
230        let attempt_result = apply_action(attempt, action, &current);
231        if attempt_result.success {
232            return RecoveryLog {
233                run_id: run_id.to_string(),
234                policy,
235                initial_failure,
236                decisions,
237                outcome: RecoveryOutcome::Recovered,
238                attempts_used,
239                final_failure: None,
240                evaluated_at: Utc::now(),
241            };
242        }
243
244        if let Some(next) = attempt_result.next_failure {
245            current = next;
246        }
247    }
248
249    RecoveryLog {
250        run_id: run_id.to_string(),
251        policy,
252        initial_failure,
253        decisions,
254        outcome: RecoveryOutcome::Failed,
255        attempts_used,
256        final_failure: Some(current),
257        evaluated_at: Utc::now(),
258    }
259}
260
261/// Persist `<dir>/<run_id>/recovery.json` and `<dir>/<run_id>/recovery.digest`.
262pub fn write_recovery_artifact(log: &RecoveryLog, dir: &Path) -> Result<PathBuf> {
263    let run_dir = dir.join(&log.run_id);
264    std::fs::create_dir_all(&run_dir)?;
265
266    let artifact_path = run_dir.join("recovery.json");
267    let digest_path = run_dir.join("recovery.digest");
268    let json = serde_json::to_vec_pretty(log)?;
269    let digest = ContentDigest::from_bytes(&json).as_str().to_string();
270
271    std::fs::write(&artifact_path, &json)?;
272    std::fs::write(&digest_path, digest.as_bytes())?;
273
274    Ok(artifact_path)
275}
276
277/// Read and verify `<dir>/<run_id>/recovery.json` integrity.
278pub fn read_recovery_artifact(run_id: &str, dir: &Path) -> Result<RecoveryLog> {
279    let run_dir = dir.join(run_id);
280    let artifact_path = run_dir.join("recovery.json");
281    let digest_path = run_dir.join("recovery.digest");
282
283    let json = std::fs::read(&artifact_path)?;
284    let digest = std::fs::read_to_string(&digest_path)?;
285    let actual = ContentDigest::from_bytes(&json).as_str().to_string();
286    if digest.trim() != actual {
287        return Err(AivcsError::DigestMismatch {
288            expected: digest.trim().to_string(),
289            actual,
290        });
291    }
292
293    Ok(serde_json::from_slice(&json)?)
294}