1use std::path::{Path, PathBuf};
9
10use chrono::{DateTime, Utc};
11use serde::{Deserialize, Serialize};
12
13use crate::domain::{AivcsError, Result};
14use oxidized_state::storage_traits::ContentDigest;
15
16#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
18#[serde(rename_all = "snake_case")]
19pub enum FailureClass {
20 Build,
21 Test,
22 Runtime,
23 Integration,
24 Unknown,
25}
26
27#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
29pub struct FailureSignal {
30 pub stage: String,
31 pub message: String,
32 pub exit_code: Option<i32>,
33 pub flaky_hint: bool,
34}
35
36impl FailureSignal {
37 pub fn new(stage: impl Into<String>, message: impl Into<String>) -> Self {
38 Self {
39 stage: stage.into(),
40 message: message.into(),
41 exit_code: None,
42 flaky_hint: false,
43 }
44 }
45}
46
47#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
49#[serde(rename_all = "snake_case")]
50pub enum RecoveryAction {
51 Retry,
52 PatchForward,
53 Rollback,
54 Escalate,
55}
56
57#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
59#[serde(rename_all = "snake_case")]
60pub enum RecoveryOutcome {
61 Recovered,
62 Failed,
63}
64
65#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
67pub struct RecoveryPolicy {
68 pub max_attempts: u32,
69 pub max_flaky_retries: u32,
70 pub allow_patch_forward: bool,
71 pub allow_rollback: bool,
72}
73
74impl Default for RecoveryPolicy {
75 fn default() -> Self {
76 Self {
77 max_attempts: 3,
78 max_flaky_retries: 1,
79 allow_patch_forward: true,
80 allow_rollback: true,
81 }
82 }
83}
84
85#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
87pub struct RecoveryDecision {
88 pub attempt: u32,
89 pub failure_class: FailureClass,
90 pub action: RecoveryAction,
91 pub rationale: String,
92}
93
94#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
96pub struct RecoveryAttemptResult {
97 pub success: bool,
98 pub next_failure: Option<FailureSignal>,
99}
100
101#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
103pub struct RecoveryLog {
104 pub run_id: String,
105 pub policy: RecoveryPolicy,
106 pub initial_failure: FailureSignal,
107 pub decisions: Vec<RecoveryDecision>,
108 pub outcome: RecoveryOutcome,
109 pub attempts_used: u32,
110 pub final_failure: Option<FailureSignal>,
111 pub evaluated_at: DateTime<Utc>,
112}
113
114pub fn classify_failure(signal: &FailureSignal) -> FailureClass {
116 let stage = signal.stage.to_lowercase();
117 let msg = signal.message.to_lowercase();
118
119 if stage.contains("build")
120 || stage.contains("compile")
121 || msg.contains("compil")
122 || msg.contains("linker error")
123 {
124 return FailureClass::Build;
125 }
126 if stage.contains("test")
127 || msg.contains("assertion")
128 || msg.contains("test failed")
129 || msg.contains("snapshot mismatch")
130 {
131 return FailureClass::Test;
132 }
133 if stage.contains("runtime")
134 || msg.contains("panic")
135 || msg.contains("segmentation fault")
136 || msg.contains("null pointer")
137 {
138 return FailureClass::Runtime;
139 }
140 if stage.contains("integration")
141 || msg.contains("contract")
142 || msg.contains("handshake")
143 || msg.contains("dependency unavailable")
144 {
145 return FailureClass::Integration;
146 }
147
148 FailureClass::Unknown
149}
150
151fn decide_action(
152 class: FailureClass,
153 signal: &FailureSignal,
154 policy: &RecoveryPolicy,
155 flaky_retries_used: u32,
156) -> (RecoveryAction, String) {
157 if class == FailureClass::Test
158 && signal.flaky_hint
159 && flaky_retries_used < policy.max_flaky_retries
160 {
161 return (
162 RecoveryAction::Retry,
163 "flaky signal detected; bounded retry permitted".to_string(),
164 );
165 }
166
167 if policy.allow_patch_forward && matches!(class, FailureClass::Build | FailureClass::Test) {
168 return (
169 RecoveryAction::PatchForward,
170 "build/test failure; patch-forward is enabled".to_string(),
171 );
172 }
173
174 if policy.allow_rollback && matches!(class, FailureClass::Runtime | FailureClass::Integration) {
175 return (
176 RecoveryAction::Rollback,
177 "runtime/integration failure; rollback is enabled".to_string(),
178 );
179 }
180
181 (
182 RecoveryAction::Escalate,
183 "no safe automated action available under policy".to_string(),
184 )
185}
186
187pub fn execute_recovery_loop<F>(
189 run_id: &str,
190 initial_failure: FailureSignal,
191 policy: RecoveryPolicy,
192 mut apply_action: F,
193) -> RecoveryLog
194where
195 F: FnMut(u32, RecoveryAction, &FailureSignal) -> RecoveryAttemptResult,
196{
197 let mut current = initial_failure.clone();
198 let mut decisions = Vec::new();
199 let mut flaky_retries_used = 0u32;
200 let mut attempts_used = 0u32;
201
202 for attempt in 1..=policy.max_attempts {
203 attempts_used = attempt;
204 let class = classify_failure(¤t);
205 let (action, rationale) = decide_action(class, ¤t, &policy, flaky_retries_used);
206 if action == RecoveryAction::Retry && current.flaky_hint {
207 flaky_retries_used += 1;
208 }
209
210 decisions.push(RecoveryDecision {
211 attempt,
212 failure_class: class,
213 action,
214 rationale,
215 });
216
217 if action == RecoveryAction::Escalate {
218 return RecoveryLog {
219 run_id: run_id.to_string(),
220 policy,
221 initial_failure,
222 decisions,
223 outcome: RecoveryOutcome::Failed,
224 attempts_used,
225 final_failure: Some(current),
226 evaluated_at: Utc::now(),
227 };
228 }
229
230 let attempt_result = apply_action(attempt, action, ¤t);
231 if attempt_result.success {
232 return RecoveryLog {
233 run_id: run_id.to_string(),
234 policy,
235 initial_failure,
236 decisions,
237 outcome: RecoveryOutcome::Recovered,
238 attempts_used,
239 final_failure: None,
240 evaluated_at: Utc::now(),
241 };
242 }
243
244 if let Some(next) = attempt_result.next_failure {
245 current = next;
246 }
247 }
248
249 RecoveryLog {
250 run_id: run_id.to_string(),
251 policy,
252 initial_failure,
253 decisions,
254 outcome: RecoveryOutcome::Failed,
255 attempts_used,
256 final_failure: Some(current),
257 evaluated_at: Utc::now(),
258 }
259}
260
261pub fn write_recovery_artifact(log: &RecoveryLog, dir: &Path) -> Result<PathBuf> {
263 let run_dir = dir.join(&log.run_id);
264 std::fs::create_dir_all(&run_dir)?;
265
266 let artifact_path = run_dir.join("recovery.json");
267 let digest_path = run_dir.join("recovery.digest");
268 let json = serde_json::to_vec_pretty(log)?;
269 let digest = ContentDigest::from_bytes(&json).as_str().to_string();
270
271 std::fs::write(&artifact_path, &json)?;
272 std::fs::write(&digest_path, digest.as_bytes())?;
273
274 Ok(artifact_path)
275}
276
277pub fn read_recovery_artifact(run_id: &str, dir: &Path) -> Result<RecoveryLog> {
279 let run_dir = dir.join(run_id);
280 let artifact_path = run_dir.join("recovery.json");
281 let digest_path = run_dir.join("recovery.digest");
282
283 let json = std::fs::read(&artifact_path)?;
284 let digest = std::fs::read_to_string(&digest_path)?;
285 let actual = ContentDigest::from_bytes(&json).as_str().to_string();
286 if digest.trim() != actual {
287 return Err(AivcsError::DigestMismatch {
288 expected: digest.trim().to_string(),
289 actual,
290 });
291 }
292
293 Ok(serde_json::from_slice(&json)?)
294}