1use crate::recommendations::{Recommendation, RecommendationCategory, RecommendationSeverity};
7use chrono::{DateTime, Duration, Utc};
8use parking_lot::RwLock;
9use serde::{Deserialize, Serialize};
10use std::collections::{HashMap, VecDeque};
11use std::sync::Arc;
12use uuid::Uuid;
13
/// Settings that control how [`RemediationEngine`] handles incoming recommendations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RemediationConfig {
    /// Master switch; `process_recommendation` returns an error while this is `false`.
    pub enabled: bool,
    /// Highest severity handled without approval; more severe recommendations are queued.
    pub max_auto_severity: RecommendationSeverity,
    /// Categories whose recommendations are always queued for approval.
    pub require_approval_categories: Vec<RecommendationCategory>,
    /// New recommendations are refused once this many actions are in the `Applying` state.
    pub max_concurrent: usize,
    /// Minutes that must pass after an applied action completes before a new one is accepted.
    pub cooldown_minutes: i64,
    /// NOTE(review): not read by the code visible here; presumably enables automatic
    /// rollback of failed actions — confirm against callers.
    pub auto_rollback: bool,
    /// When `true`, actions are marked as applied without recording any applied changes.
    pub dry_run: bool,
    /// NOTE(review): not read by the code visible here; presumably caps retries of a
    /// failed action — confirm against callers.
    pub max_retries: u32,
}
34
35impl Default for RemediationConfig {
36 fn default() -> Self {
37 Self {
38 enabled: false, max_auto_severity: RecommendationSeverity::Low,
40 require_approval_categories: vec![
41 RecommendationCategory::FaultInjection,
42 RecommendationCategory::CircuitBreaker,
43 ],
44 max_concurrent: 1,
45 cooldown_minutes: 30,
46 auto_rollback: true,
47 dry_run: false,
48 max_retries: 3,
49 }
50 }
51}
52
/// Lifecycle state of a [`RemediationAction`].
///
/// Serialized as the lowercased variant name (for example `awaitingapproval`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum RemediationStatus {
    /// Initial state of a newly created action.
    Pending,
    /// Queued in the approval queue, waiting for a decision.
    AwaitingApproval,
    /// Approved; set just before the action is applied.
    Approved,
    /// Rejected by a reviewer.
    Rejected,
    /// Currently being applied; counted against the concurrency limit.
    Applying,
    /// Changes were applied (or a dry-run completed).
    Applied,
    /// NOTE(review): never assigned by the code visible here — presumably set when
    /// applying fails; confirm.
    Failed,
    /// The applied changes were rolled back.
    RolledBack,
    /// NOTE(review): never assigned by the code visible here — confirm intended use.
    Cancelled,
}
67
/// A single remediation derived from a recommendation, together with its audit trail.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RemediationAction {
    /// Unique identifier of the form `action-<uuid v4>`.
    pub id: String,
    /// Identifier of the recommendation this action was created from.
    pub recommendation_id: String,
    /// Current lifecycle state.
    pub status: RemediationStatus,
    /// When the action was created.
    pub created_at: DateTime<Utc>,
    /// When the changes were applied; stays `None` for dry-runs.
    pub applied_at: Option<DateTime<Utc>>,
    /// When applying finished; consulted by the cooldown check.
    pub completed_at: Option<DateTime<Utc>>,
    /// Configuration keys and the values this action sets them to.
    pub config_changes: HashMap<String, String>,
    /// Information recorded at creation time for undoing the action.
    pub rollback_data: Option<RollbackData>,
    /// Human-readable log lines describing what happened to the action.
    pub logs: Vec<String>,
    /// Set to `true` once the changes have been applied outside of dry-run mode.
    pub success: bool,
    /// NOTE(review): only ever initialized to `None` in the code visible here —
    /// presumably holds the failure message; confirm.
    pub error: Option<String>,
    /// NOTE(review): initialized to `0` and never incremented in the code visible here.
    pub retry_count: u32,
    /// Name of the approver, when the action went through approval.
    pub approved_by: Option<String>,
    /// When the action was approved.
    pub approved_at: Option<DateTime<Utc>>,
}
100
/// Information captured when an action is created so it can later be rolled back.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RollbackData {
    /// Previous value per changed key; the engine currently stores the placeholder
    /// `"default"` for every key.
    pub previous_config: HashMap<String, String>,
    /// Commands associated with restoring the previous state; the engine's rollback
    /// writes them to the action log.
    pub restore_commands: Vec<String>,
    /// When this rollback record was created.
    pub created_at: DateTime<Utc>,
}
108
/// Outcome of applying a remediation action.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RemediationResult {
    /// Identifier of the action that was applied.
    pub action_id: String,
    /// Whether applying succeeded.
    pub success: bool,
    /// Human-readable summary of the outcome.
    pub message: String,
    /// Descriptions of the changes: `key = value` entries, or only the keys for a dry-run.
    pub applied_changes: Vec<String>,
    /// Time spent applying the action, in milliseconds.
    pub duration_ms: u64,
}
118
/// Before/after comparison recorded for an applied action.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EffectivenessMetrics {
    /// Recommendation the measured action originated from.
    pub recommendation_id: String,
    /// Action whose effect was measured; also the key under which the engine stores this.
    pub action_id: String,
    /// System metrics captured before the action.
    pub before_metrics: SystemMetrics,
    /// System metrics captured after the action.
    pub after_metrics: SystemMetrics,
    /// Weighted improvement score, clamped to `0.0..=1.0` by the engine.
    pub improvement_score: f64,
    /// Length of the measurement window, in hours, as supplied by the caller.
    pub measurement_period_hours: i64,
    /// When the measurement was recorded.
    pub measured_at: DateTime<Utc>,
}
134
/// Snapshot of system health used to score the effect of a remediation.
///
/// NOTE(review): the rate and score fields are presumably fractions in `0.0..=1.0`;
/// nothing visible here enforces a range — confirm with the producer of these values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemMetrics {
    /// Error rate; lower is better for scoring.
    pub error_rate: f64,
    /// Average latency in milliseconds; lower is better for scoring.
    pub avg_latency_ms: f64,
    /// 95th percentile latency in milliseconds (not used by the scoring visible here).
    pub p95_latency_ms: f64,
    /// 99th percentile latency in milliseconds (not used by the scoring visible here).
    pub p99_latency_ms: f64,
    /// Success rate; higher is better for scoring.
    pub success_rate: f64,
    /// Chaos impact (not used by the scoring visible here).
    pub chaos_impact: f64,
    /// Resilience score; higher is better for scoring.
    pub resilience_score: f64,
}
146
/// An entry in the approval queue describing an action that needs a human decision.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ApprovalRequest {
    /// Identifier of the action awaiting a decision.
    pub action_id: String,
    /// The recommendation that triggered the action.
    pub recommendation: Recommendation,
    /// Changes shown to the approver; the engine stores the recommendation's example
    /// under the key `"example"`.
    pub proposed_changes: HashMap<String, String>,
    /// Risk evaluation computed for the recommendation.
    pub risk_assessment: RiskAssessment,
    /// When the request was queued.
    pub created_at: DateTime<Utc>,
    /// Expiry timestamp; the engine sets it 24 hours after queuing.
    pub expires_at: DateTime<Utc>,
}
157
/// Risk evaluation attached to a remediation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RiskAssessment {
    /// Overall risk level; `High` and above force approval.
    pub risk_level: RiskLevel,
    /// Affected scope; the engine fills this with the recommendation's affected endpoints.
    pub impact_scope: Vec<String>,
    /// Whether the change can be undone; irreversible changes force approval.
    pub reversible: bool,
    /// Expected downtime in milliseconds.
    pub estimated_downtime_ms: u64,
    /// Individual safety checks that were evaluated.
    pub safety_checks: Vec<SafetyCheck>,
}
167
/// Risk classification of a remediation.
///
/// Variants are declared in ascending order, so the derived ordering gives
/// `Minimal < Low < Medium < High < Critical`. Serialized as the lowercased variant name.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
#[serde(rename_all = "lowercase")]
pub enum RiskLevel {
    /// Lowest risk level.
    Minimal,
    /// Low risk.
    Low,
    /// Medium risk.
    Medium,
    /// High risk; from this level upwards approval is required.
    High,
    /// Highest risk level.
    Critical,
}
178
/// Result of a single named safety check within a [`RiskAssessment`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SafetyCheck {
    /// Machine-friendly name of the check (for example `rollback_available`).
    pub name: String,
    /// Whether the check passed.
    pub passed: bool,
    /// Human-readable explanation of the result.
    pub message: String,
}
186
/// Turns recommendations into remediation actions, applying them directly or routing
/// them through an approval queue, and tracks their history and measured effectiveness.
///
/// All state sits behind `Arc<RwLock<..>>`, so the methods take `&self`.
pub struct RemediationEngine {
    /// Active configuration.
    config: Arc<RwLock<RemediationConfig>>,
    /// Every action created by this engine, keyed by action id.
    actions: Arc<RwLock<HashMap<String, RemediationAction>>>,
    /// Recorded effectiveness measurements, keyed by action id.
    effectiveness_metrics: Arc<RwLock<HashMap<String, EffectivenessMetrics>>>,
    /// Requests waiting for approval, oldest first.
    approval_queue: Arc<RwLock<VecDeque<ApprovalRequest>>>,
    /// Snapshots of applied actions, newest first.
    action_history: Arc<RwLock<VecDeque<RemediationAction>>>,
    /// Maximum number of entries kept in `action_history`.
    max_history: usize,
}
196
197impl RemediationEngine {
198 pub fn new() -> Self {
200 Self::with_config(RemediationConfig::default())
201 }
202
203 pub fn with_config(config: RemediationConfig) -> Self {
205 Self {
206 config: Arc::new(RwLock::new(config)),
207 actions: Arc::new(RwLock::new(HashMap::new())),
208 effectiveness_metrics: Arc::new(RwLock::new(HashMap::new())),
209 approval_queue: Arc::new(RwLock::new(VecDeque::new())),
210 action_history: Arc::new(RwLock::new(VecDeque::new())),
211 max_history: 1000,
212 }
213 }
214
215 pub fn update_config(&self, config: RemediationConfig) {
217 let mut cfg = self.config.write();
218 *cfg = config;
219 }
220
221 pub fn get_config(&self) -> RemediationConfig {
223 self.config.read().clone()
224 }
225
226 pub fn process_recommendation(
228 &self,
229 recommendation: &Recommendation,
230 ) -> Result<String, String> {
231 let config = self.config.read().clone();
232
233 if !config.enabled {
234 return Err("Auto-remediation is disabled".to_string());
235 }
236
237 if !self.check_cooldown(&config) {
239 return Err("Cooldown period not elapsed".to_string());
240 }
241
242 if !self.check_concurrent_limit(&config) {
244 return Err("Maximum concurrent remediations reached".to_string());
245 }
246
247 let risk_assessment = self.assess_risk(recommendation);
249
250 let needs_approval = self.needs_approval(recommendation, &config, &risk_assessment);
252
253 let action = self.create_action(recommendation, risk_assessment.clone());
255 let action_id = action.id.clone();
256
257 {
259 let mut actions = self.actions.write();
260 actions.insert(action_id.clone(), action.clone());
261 }
262
263 if needs_approval {
264 self.queue_for_approval(action_id.clone(), recommendation.clone(), risk_assessment);
266 self.update_action_status(&action_id, RemediationStatus::AwaitingApproval);
267 Ok(format!("Action {} queued for approval", action_id))
268 } else {
269 self.apply_action(&action_id)?;
271 Ok(format!("Action {} applied successfully", action_id))
272 }
273 }
274
275 fn create_action(
277 &self,
278 recommendation: &Recommendation,
279 _risk_assessment: RiskAssessment,
280 ) -> RemediationAction {
281 let config_changes = self.extract_config_changes(recommendation);
282 let rollback_data = self.create_rollback_data(&config_changes);
283
284 RemediationAction {
285 id: format!("action-{}", Uuid::new_v4()),
286 recommendation_id: recommendation.id.clone(),
287 status: RemediationStatus::Pending,
288 created_at: Utc::now(),
289 applied_at: None,
290 completed_at: None,
291 config_changes,
292 rollback_data: Some(rollback_data),
293 logs: vec![format!(
294 "Action created from recommendation: {}",
295 recommendation.title
296 )],
297 success: false,
298 error: None,
299 retry_count: 0,
300 approved_by: None,
301 approved_at: None,
302 }
303 }
304
305 fn extract_config_changes(&self, recommendation: &Recommendation) -> HashMap<String, String> {
307 let mut changes = HashMap::new();
308
309 match recommendation.category {
311 RecommendationCategory::Latency => {
312 if let Some(ref example) = recommendation.example {
313 if let Some(latency) = self.extract_latency_value(example) {
314 changes.insert("chaos_latency_ms".to_string(), latency.to_string());
315 }
316 }
317 }
318 RecommendationCategory::FaultInjection => {
319 changes.insert("chaos_fault_probability".to_string(), "0.3".to_string());
320 }
321 RecommendationCategory::RateLimit => {
322 changes.insert("chaos_rate_limit".to_string(), "100".to_string());
323 }
324 _ => {}
325 }
326
327 changes
328 }
329
330 fn extract_latency_value(&self, example: &str) -> Option<u64> {
332 example
334 .split_whitespace()
335 .position(|s| s == "--chaos-latency-ms")
336 .and_then(|i| example.split_whitespace().nth(i + 1))
337 .and_then(|v| v.parse().ok())
338 }
339
340 fn create_rollback_data(&self, config_changes: &HashMap<String, String>) -> RollbackData {
342 let mut previous_config = HashMap::new();
344 for key in config_changes.keys() {
345 previous_config.insert(key.clone(), "default".to_string());
346 }
347
348 RollbackData {
349 previous_config,
350 restore_commands: vec!["mockforge serve --reset-chaos".to_string()],
351 created_at: Utc::now(),
352 }
353 }
354
355 fn assess_risk(&self, recommendation: &Recommendation) -> RiskAssessment {
357 let risk_level = match recommendation.severity {
358 RecommendationSeverity::Info => RiskLevel::Minimal,
359 RecommendationSeverity::Low => RiskLevel::Low,
360 RecommendationSeverity::Medium => RiskLevel::Medium,
361 RecommendationSeverity::High => RiskLevel::High,
362 RecommendationSeverity::Critical => RiskLevel::Critical,
363 };
364
365 let safety_checks = vec![
366 SafetyCheck {
367 name: "configuration_valid".to_string(),
368 passed: true,
369 message: "Configuration changes are valid".to_string(),
370 },
371 SafetyCheck {
372 name: "rollback_available".to_string(),
373 passed: true,
374 message: "Rollback mechanism available".to_string(),
375 },
376 ];
377
378 RiskAssessment {
379 risk_level,
380 impact_scope: recommendation.affected_endpoints.clone(),
381 reversible: true,
382 estimated_downtime_ms: 0,
383 safety_checks,
384 }
385 }
386
387 fn needs_approval(
389 &self,
390 recommendation: &Recommendation,
391 config: &RemediationConfig,
392 risk: &RiskAssessment,
393 ) -> bool {
394 if recommendation.severity > config.max_auto_severity {
396 return true;
397 }
398
399 if config.require_approval_categories.contains(&recommendation.category) {
401 return true;
402 }
403
404 if risk.risk_level >= RiskLevel::High {
406 return true;
407 }
408
409 if !risk.reversible {
411 return true;
412 }
413
414 false
415 }
416
417 fn queue_for_approval(
419 &self,
420 action_id: String,
421 recommendation: Recommendation,
422 risk: RiskAssessment,
423 ) {
424 let mut changes = HashMap::new();
425 changes.insert("example".to_string(), recommendation.example.clone().unwrap_or_default());
426
427 let request = ApprovalRequest {
428 action_id,
429 recommendation,
430 proposed_changes: changes,
431 risk_assessment: risk,
432 created_at: Utc::now(),
433 expires_at: Utc::now() + Duration::hours(24),
434 };
435
436 let mut queue = self.approval_queue.write();
437 queue.push_back(request);
438 }
439
440 pub fn get_approval_queue(&self) -> Vec<ApprovalRequest> {
442 let queue = self.approval_queue.read();
443 queue.iter().cloned().collect()
444 }
445
446 pub fn approve_action(&self, action_id: &str, approver: &str) -> Result<(), String> {
448 {
450 let mut queue = self.approval_queue.write();
451 queue.retain(|req| req.action_id != action_id);
452 }
453
454 {
456 let mut actions = self.actions.write();
457 if let Some(action) = actions.get_mut(action_id) {
458 action.status = RemediationStatus::Approved;
459 action.approved_by = Some(approver.to_string());
460 action.approved_at = Some(Utc::now());
461 action.logs.push(format!("Approved by {}", approver));
462 } else {
463 return Err("Action not found".to_string());
464 }
465 }
466
467 self.apply_action(action_id)?;
469
470 Ok(())
471 }
472
473 pub fn reject_action(&self, action_id: &str, reason: &str) -> Result<(), String> {
475 {
477 let mut queue = self.approval_queue.write();
478 queue.retain(|req| req.action_id != action_id);
479 }
480
481 self.update_action_status(action_id, RemediationStatus::Rejected);
482 self.add_action_log(action_id, &format!("Rejected: {}", reason));
483
484 Ok(())
485 }
486
487 fn apply_action(&self, action_id: &str) -> Result<RemediationResult, String> {
489 let config = self.config.read().clone();
490 let start_time = Utc::now();
491
492 self.update_action_status(action_id, RemediationStatus::Applying);
493
494 let action = {
496 let actions = self.actions.read();
497 actions.get(action_id).cloned().ok_or_else(|| "Action not found".to_string())?
498 };
499
500 if config.dry_run {
501 self.add_action_log(action_id, "Dry-run mode: changes not actually applied");
502 self.update_action_status(action_id, RemediationStatus::Applied);
503
504 return Ok(RemediationResult {
505 action_id: action_id.to_string(),
506 success: true,
507 message: "Dry-run completed successfully".to_string(),
508 applied_changes: action.config_changes.keys().cloned().collect(),
509 duration_ms: (Utc::now() - start_time).num_milliseconds() as u64,
510 });
511 }
512
513 let applied_changes: Vec<String> =
515 action.config_changes.iter().map(|(k, v)| format!("{} = {}", k, v)).collect();
516
517 self.add_action_log(action_id, &format!("Applied changes: {:?}", applied_changes));
518
519 {
521 let mut actions = self.actions.write();
522 if let Some(action) = actions.get_mut(action_id) {
523 action.status = RemediationStatus::Applied;
524 action.success = true;
525 action.applied_at = Some(Utc::now());
526 action.completed_at = Some(Utc::now());
527 }
528 }
529
530 self.add_to_history(action);
532
533 Ok(RemediationResult {
534 action_id: action_id.to_string(),
535 success: true,
536 message: "Remediation applied successfully".to_string(),
537 applied_changes: applied_changes.to_vec(),
538 duration_ms: (Utc::now() - start_time).num_milliseconds() as u64,
539 })
540 }
541
542 pub fn rollback_action(&self, action_id: &str) -> Result<(), String> {
544 let action = {
545 let actions = self.actions.read();
546 actions.get(action_id).cloned().ok_or_else(|| "Action not found".to_string())?
547 };
548
549 if action.status != RemediationStatus::Applied {
550 return Err("Can only rollback applied actions".to_string());
551 }
552
553 let rollback_data =
554 action.rollback_data.ok_or_else(|| "No rollback data available".to_string())?;
555
556 self.add_action_log(action_id, "Rolling back changes");
557
558 for cmd in &rollback_data.restore_commands {
560 self.add_action_log(action_id, &format!("Executing: {}", cmd));
561 }
562
563 self.update_action_status(action_id, RemediationStatus::RolledBack);
564 self.add_action_log(action_id, "Rollback completed");
565
566 Ok(())
567 }
568
569 pub fn record_effectiveness(
571 &self,
572 recommendation_id: &str,
573 action_id: &str,
574 before: SystemMetrics,
575 after: SystemMetrics,
576 measurement_period_hours: i64,
577 ) {
578 let improvement_score = self.calculate_improvement_score(&before, &after);
579
580 let metrics = EffectivenessMetrics {
581 recommendation_id: recommendation_id.to_string(),
582 action_id: action_id.to_string(),
583 before_metrics: before,
584 after_metrics: after,
585 improvement_score,
586 measurement_period_hours,
587 measured_at: Utc::now(),
588 };
589
590 let mut effectiveness = self.effectiveness_metrics.write();
591 effectiveness.insert(action_id.to_string(), metrics);
592 }
593
594 fn calculate_improvement_score(&self, before: &SystemMetrics, after: &SystemMetrics) -> f64 {
596 let mut score = 0.0;
597 let mut weight_total = 0.0;
598
599 if before.error_rate > 0.0 {
601 let error_improvement = (before.error_rate - after.error_rate) / before.error_rate;
602 score += error_improvement * 0.3;
603 weight_total += 0.3;
604 }
605
606 if before.avg_latency_ms > 0.0 {
608 let latency_improvement =
609 (before.avg_latency_ms - after.avg_latency_ms) / before.avg_latency_ms;
610 score += latency_improvement * 0.2;
611 weight_total += 0.2;
612 }
613
614 let success_improvement = after.success_rate - before.success_rate;
616 score += success_improvement * 0.25;
617 weight_total += 0.25;
618
619 let resilience_improvement = after.resilience_score - before.resilience_score;
621 score += resilience_improvement * 0.25;
622 weight_total += 0.25;
623
624 if weight_total > 0.0 {
625 (score / weight_total).clamp(0.0, 1.0)
626 } else {
627 0.0
628 }
629 }
630
631 pub fn get_effectiveness(&self, action_id: &str) -> Option<EffectivenessMetrics> {
633 let metrics = self.effectiveness_metrics.read();
634 metrics.get(action_id).cloned()
635 }
636
637 pub fn get_all_effectiveness(&self) -> Vec<EffectivenessMetrics> {
639 let metrics = self.effectiveness_metrics.read();
640 metrics.values().cloned().collect()
641 }
642
643 pub fn get_action(&self, action_id: &str) -> Option<RemediationAction> {
645 let actions = self.actions.read();
646 actions.get(action_id).cloned()
647 }
648
649 pub fn get_active_actions(&self) -> Vec<RemediationAction> {
651 let actions = self.actions.read();
652 actions
653 .values()
654 .filter(|a| {
655 matches!(
656 a.status,
657 RemediationStatus::Pending
658 | RemediationStatus::Applying
659 | RemediationStatus::Applied
660 )
661 })
662 .cloned()
663 .collect()
664 }
665
666 pub fn get_history(&self, limit: usize) -> Vec<RemediationAction> {
668 let history = self.action_history.read();
669 history.iter().take(limit).cloned().collect()
670 }
671
672 pub fn get_stats(&self) -> RemediationStats {
674 let actions = self.actions.read();
675 let history = self.action_history.read();
676
677 let total_actions = actions.len() + history.len();
678 let successful = actions.values().filter(|a| a.success).count()
679 + history.iter().filter(|a| a.success).count();
680 let failed = actions.values().filter(|a| a.status == RemediationStatus::Failed).count()
681 + history.iter().filter(|a| a.status == RemediationStatus::Failed).count();
682 let pending_approval = actions
683 .values()
684 .filter(|a| a.status == RemediationStatus::AwaitingApproval)
685 .count();
686 let rolled_back =
687 history.iter().filter(|a| a.status == RemediationStatus::RolledBack).count();
688
689 let effectiveness_metrics = self.effectiveness_metrics.read();
690 let avg_improvement = if effectiveness_metrics.is_empty() {
691 0.0
692 } else {
693 effectiveness_metrics.values().map(|m| m.improvement_score).sum::<f64>()
694 / effectiveness_metrics.len() as f64
695 };
696
697 RemediationStats {
698 total_actions,
699 successful_actions: successful,
700 failed_actions: failed,
701 pending_approval,
702 rolled_back,
703 avg_improvement_score: avg_improvement,
704 total_effectiveness_measurements: effectiveness_metrics.len(),
705 }
706 }
707
708 fn check_cooldown(&self, config: &RemediationConfig) -> bool {
711 let actions = self.actions.read();
712 let cooldown_threshold = Utc::now() - Duration::minutes(config.cooldown_minutes);
713
714 !actions.values().any(|a| {
715 a.status == RemediationStatus::Applied
716 && a.completed_at.is_some_and(|t| t > cooldown_threshold)
717 })
718 }
719
720 fn check_concurrent_limit(&self, config: &RemediationConfig) -> bool {
721 let actions = self.actions.read();
722 let active_count = actions
723 .values()
724 .filter(|a| matches!(a.status, RemediationStatus::Applying))
725 .count();
726
727 active_count < config.max_concurrent
728 }
729
730 fn update_action_status(&self, action_id: &str, status: RemediationStatus) {
731 let mut actions = self.actions.write();
732 if let Some(action) = actions.get_mut(action_id) {
733 action.status = status;
734 }
735 }
736
737 fn add_action_log(&self, action_id: &str, message: &str) {
738 let mut actions = self.actions.write();
739 if let Some(action) = actions.get_mut(action_id) {
740 action
741 .logs
742 .push(format!("[{}] {}", Utc::now().format("%Y-%m-%d %H:%M:%S"), message));
743 }
744 }
745
746 fn add_to_history(&self, action: RemediationAction) {
747 let mut history = self.action_history.write();
748 history.push_front(action);
749 if history.len() > self.max_history {
750 history.pop_back();
751 }
752 }
753}
754
755impl Default for RemediationEngine {
756 fn default() -> Self {
757 Self::new()
758 }
759}
760
/// Aggregate statistics reported by `RemediationEngine::get_stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RemediationStats {
    /// Total number of remediation actions counted.
    pub total_actions: usize,
    /// Number of counted actions whose `success` flag is set.
    pub successful_actions: usize,
    /// Number of counted actions with status `Failed`.
    pub failed_actions: usize,
    /// Number of actions with status `AwaitingApproval`.
    pub pending_approval: usize,
    /// Number of counted actions with status `RolledBack`.
    pub rolled_back: usize,
    /// Mean improvement score over all effectiveness measurements; `0.0` when there are none.
    pub avg_improvement_score: f64,
    /// Number of recorded effectiveness measurements.
    pub total_effectiveness_measurements: usize,
}
772
#[cfg(test)]
mod tests {
    use super::*;

    /// Metrics describing a degraded system.
    fn degraded_metrics() -> SystemMetrics {
        SystemMetrics {
            error_rate: 0.5,
            avg_latency_ms: 1000.0,
            p95_latency_ms: 1500.0,
            p99_latency_ms: 2000.0,
            success_rate: 0.5,
            chaos_impact: 0.8,
            resilience_score: 0.3,
        }
    }

    /// Metrics describing the same system after an improvement.
    fn improved_metrics() -> SystemMetrics {
        SystemMetrics {
            error_rate: 0.2,
            avg_latency_ms: 500.0,
            p95_latency_ms: 750.0,
            p99_latency_ms: 1000.0,
            success_rate: 0.8,
            chaos_impact: 0.4,
            resilience_score: 0.7,
        }
    }

    /// A fresh engine uses the default configuration, which has remediation disabled.
    #[test]
    fn test_engine_creation() {
        let engine = RemediationEngine::new();
        assert!(!engine.get_config().enabled);
    }

    /// Updating the configuration is visible through `get_config`.
    #[test]
    fn test_config_update() {
        let engine = RemediationEngine::new();
        let config = RemediationConfig {
            enabled: true,
            ..Default::default()
        };
        engine.update_config(config);
        assert!(engine.get_config().enabled);
    }

    /// The score is the weighted sum of the individual improvements.
    #[test]
    fn test_improvement_score_calculation() {
        let engine = RemediationEngine::new();

        let before = degraded_metrics();
        let after = improved_metrics();

        let score = engine.calculate_improvement_score(&before, &after);
        assert!(score > 0.0 && score <= 1.0);

        // 0.3 * 0.6 (errors) + 0.2 * 0.5 (latency) + 0.25 * 0.3 (success)
        // + 0.25 * 0.4 (resilience) = 0.455, with a total weight of 1.0.
        assert!((score - 0.455).abs() < 1e-9, "unexpected score: {}", score);
    }

    /// A regression on every metric clamps to zero rather than going negative.
    #[test]
    fn test_improvement_score_clamps_regressions() {
        let engine = RemediationEngine::new();

        let before = improved_metrics();
        let after = degraded_metrics();

        let score = engine.calculate_improvement_score(&before, &after);
        assert_eq!(score, 0.0);
    }
}
821}