mockforge_chaos/
recommendations.rs

1//! AI-powered chaos recommendations
2//!
3//! Analyzes chaos engineering metrics and system behavior to generate
4//! intelligent recommendations for improving resilience testing.
5
6use crate::analytics::{ChaosImpact, MetricsBucket};
7use chrono::{DateTime, Utc};
8use parking_lot::RwLock;
9use serde::{Deserialize, Serialize};
10use std::collections::HashMap;
11use std::sync::Arc;
12
/// Recommendation category
///
/// Identifies which area of chaos testing a [`Recommendation`] is about.
/// Variants are (de)serialized with `snake_case` names via the serde
/// `rename_all` attribute below (for example `FaultInjection` becomes
/// `fault_injection`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RecommendationCategory {
    /// Latency testing recommendations
    Latency,
    /// Fault injection recommendations
    FaultInjection,
    /// Rate limiting recommendations
    RateLimit,
    /// Traffic shaping recommendations
    TrafficShaping,
    /// Circuit breaker recommendations
    CircuitBreaker,
    /// Bulkhead recommendations
    Bulkhead,
    /// Scenario recommendations
    Scenario,
    /// Coverage recommendations
    Coverage,
}
34
/// Recommendation severity/priority
///
/// Variants are declared from least to most urgent. Because `PartialOrd` and
/// `Ord` are derived, comparisons follow that declaration order:
/// `Info < Low < Medium < High < Critical`. Do not reorder the variants
/// without checking every comparison that relies on this.
///
/// Variants are (de)serialized with `snake_case` names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RecommendationSeverity {
    /// Informational
    Info,
    /// Low priority
    Low,
    /// Medium priority
    Medium,
    /// High priority
    High,
    /// Critical - should be addressed immediately
    Critical,
}
50
/// Confidence level in the recommendation
///
/// Newtype around an `f64`. The inner field is private, so code outside this
/// module builds values through `Confidence::new`, which clamps the input to
/// the range 0.0 - 1.0.
// NOTE(review): the derived `Deserialize` presumably fills the inner value
// directly without going through `Confidence::new` — confirm whether
// out-of-range values can arrive from serialized data.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Serialize, Deserialize)]
pub struct Confidence(f64);
54
55impl Confidence {
56    /// Create a new confidence value (0.0 - 1.0)
57    pub fn new(value: f64) -> Self {
58        Self(value.clamp(0.0, 1.0))
59    }
60
61    /// Get confidence value
62    pub fn value(&self) -> f64 {
63        self.0
64    }
65
66    /// Check if confidence is high (>= 0.7)
67    pub fn is_high(&self) -> bool {
68        self.0 >= 0.7
69    }
70
71    /// Check if confidence is medium (0.4 - 0.7)
72    pub fn is_medium(&self) -> bool {
73        self.0 >= 0.4 && self.0 < 0.7
74    }
75
76    /// Check if confidence is low (< 0.4)
77    pub fn is_low(&self) -> bool {
78        self.0 < 0.4
79    }
80}
81
/// A chaos engineering recommendation
///
/// Plain data record produced by the recommendation engine. All fields are
/// public and the type is serializable; use `Recommendation::score` to rank
/// recommendations against each other.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Recommendation {
    /// Unique identifier
    pub id: String,
    /// Category
    pub category: RecommendationCategory,
    /// Severity/priority
    pub severity: RecommendationSeverity,
    /// Confidence level (0.0 - 1.0)
    pub confidence: Confidence,
    /// Title
    pub title: String,
    /// Description
    pub description: String,
    /// Rationale - why this is recommended
    pub rationale: String,
    /// Suggested action
    pub action: String,
    /// Example configuration or command
    pub example: Option<String>,
    /// Affected endpoints/services
    pub affected_endpoints: Vec<String>,
    /// Related metrics, keyed by metric name
    pub metrics: HashMap<String, f64>,
    /// Generated timestamp
    pub generated_at: DateTime<Utc>,
    /// Expected impact score (0.0 - 1.0); a plain `f64`, so the range is a
    /// convention and is not enforced by this type
    pub expected_impact: f64,
}
112
113impl Recommendation {
114    /// Calculate overall recommendation score for prioritization
115    pub fn score(&self) -> f64 {
116        let severity_weight = match self.severity {
117            RecommendationSeverity::Info => 0.2,
118            RecommendationSeverity::Low => 0.4,
119            RecommendationSeverity::Medium => 0.6,
120            RecommendationSeverity::High => 0.8,
121            RecommendationSeverity::Critical => 1.0,
122        };
123
124        // Weighted combination of severity, confidence, and expected impact
125        (severity_weight * 0.4) + (self.confidence.value() * 0.3) + (self.expected_impact * 0.3)
126    }
127}
128
/// Pattern detected in chaos events
///
/// Internal intermediate result: the `detect_*_patterns` methods of the
/// engine produce these, and `pattern_to_recommendation` turns them into
/// [`Recommendation`]s by matching on `pattern_type`.
#[derive(Debug, Clone)]
struct ChaosPattern {
    /// Pattern type; a string tag such as `"high_latency"` or
    /// `"high_fault_rate"` that selects the recommendation builder
    pattern_type: String,
    /// Frequency of occurrence (a ratio computed by the detector that
    /// created the pattern)
    frequency: f64,
    /// Affected components
    affected: Vec<String>,
    /// Severity; the detectors in this file cap it at 1.0
    severity: f64,
}
141
/// Weakness detected in system behavior
///
/// Internal intermediate result produced by `detect_weaknesses` and turned
/// into a [`Recommendation`] by `weakness_to_recommendation`, which matches
/// on `weakness_type`.
#[derive(Debug, Clone)]
struct SystemWeakness {
    /// Weakness type; a string tag such as `"low_resilience"`
    weakness_type: String,
    /// Affected endpoints (may be empty)
    endpoints: Vec<String>,
    /// Evidence metrics, keyed by metric name (may be empty)
    evidence: HashMap<String, f64>,
}
152
/// AI-powered chaos recommendation engine
///
/// Holds the result of the most recent `analyze_and_recommend` call together
/// with the patterns it detected. Both collections sit behind an
/// `Arc<RwLock<..>>`, so they are updated through `&self`.
pub struct RecommendationEngine {
    /// Generated recommendations; replaced as a whole on every call to
    /// `analyze_and_recommend`
    recommendations: Arc<RwLock<Vec<Recommendation>>>,
    /// Historical patterns; overwritten with the latest detected patterns
    /// when `config.enable_learning` is set
    patterns: Arc<RwLock<Vec<ChaosPattern>>>,
    /// Configuration
    config: EngineConfig,
}
162
/// Engine configuration
///
/// Tunables for `RecommendationEngine`. See the `Default` implementation for
/// the values used by `RecommendationEngine::new`.
#[derive(Debug, Clone)]
pub struct EngineConfig {
    /// Minimum confidence threshold for recommendations; anything with a
    /// lower confidence is dropped by `analyze_and_recommend`
    pub min_confidence: f64,
    /// Maximum recommendations to generate; the ranked list is truncated to
    /// this length
    pub max_recommendations: usize,
    /// Enable pattern learning; when set, detected patterns are stored in
    /// the engine after each analysis
    pub enable_learning: bool,
    /// Analysis window (hours)
    // NOTE(review): no code visible in this part of the file reads this
    // field — confirm where (or whether) it is applied.
    pub analysis_window_hours: i64,
}
175
176impl Default for EngineConfig {
177    fn default() -> Self {
178        Self {
179            min_confidence: 0.5,
180            max_recommendations: 20,
181            enable_learning: true,
182            analysis_window_hours: 24,
183        }
184    }
185}
186
187impl RecommendationEngine {
188    /// Create a new recommendation engine
189    pub fn new() -> Self {
190        Self::with_config(EngineConfig::default())
191    }
192
193    /// Create with custom configuration
194    pub fn with_config(config: EngineConfig) -> Self {
195        Self {
196            recommendations: Arc::new(RwLock::new(Vec::new())),
197            patterns: Arc::new(RwLock::new(Vec::new())),
198            config,
199        }
200    }
201
202    /// Analyze metrics and generate recommendations
203    pub fn analyze_and_recommend(
204        &self,
205        buckets: &[MetricsBucket],
206        impact: &ChaosImpact,
207    ) -> Vec<Recommendation> {
208        let mut recommendations = Vec::new();
209
210        // Detect patterns
211        let patterns = self.detect_patterns(buckets);
212
213        // Detect weaknesses
214        let weaknesses = self.detect_weaknesses(buckets, impact);
215
216        // Generate recommendations from patterns
217        recommendations.extend(self.recommendations_from_patterns(&patterns));
218
219        // Generate recommendations from weaknesses
220        recommendations.extend(self.recommendations_from_weaknesses(&weaknesses));
221
222        // Generate coverage recommendations
223        recommendations.extend(self.coverage_recommendations(buckets, impact));
224
225        // Generate scenario recommendations
226        recommendations.extend(self.scenario_recommendations(impact));
227
228        // Score and filter by confidence
229        let mut filtered: Vec<_> = recommendations
230            .into_iter()
231            .filter(|r| r.confidence.value() >= self.config.min_confidence)
232            .collect();
233
234        // Sort by score (highest first)
235        filtered
236            .sort_by(|a, b| b.score().partial_cmp(&a.score()).unwrap_or(std::cmp::Ordering::Equal));
237
238        // Limit to max recommendations
239        filtered.truncate(self.config.max_recommendations);
240
241        // Store recommendations
242        {
243            let mut recs = self.recommendations.write();
244            *recs = filtered.clone();
245        }
246
247        // Update patterns if learning is enabled
248        if self.config.enable_learning {
249            let mut stored_patterns = self.patterns.write();
250            *stored_patterns = patterns;
251        }
252
253        filtered
254    }
255
256    /// Detect patterns in chaos events
257    fn detect_patterns(&self, buckets: &[MetricsBucket]) -> Vec<ChaosPattern> {
258        let mut patterns = Vec::new();
259
260        if buckets.is_empty() {
261            return patterns;
262        }
263
264        // Pattern 1: Endpoints with consistently high latency
265        let latency_endpoints = self.detect_latency_patterns(buckets);
266        patterns.extend(latency_endpoints);
267
268        // Pattern 2: Endpoints with high fault rates
269        let fault_endpoints = self.detect_fault_patterns(buckets);
270        patterns.extend(fault_endpoints);
271
272        // Pattern 3: Rate limit violations
273        let rate_limit_patterns = self.detect_rate_limit_patterns(buckets);
274        patterns.extend(rate_limit_patterns);
275
276        // Pattern 4: Time-based patterns
277        let time_patterns = self.detect_time_patterns(buckets);
278        patterns.extend(time_patterns);
279
280        patterns
281    }
282
283    /// Detect latency patterns
284    fn detect_latency_patterns(&self, buckets: &[MetricsBucket]) -> Vec<ChaosPattern> {
285        let mut endpoint_latencies: HashMap<String, Vec<f64>> = HashMap::new();
286
287        for bucket in buckets {
288            for endpoint in bucket.affected_endpoints.keys() {
289                endpoint_latencies
290                    .entry(endpoint.clone())
291                    .or_default()
292                    .push(bucket.avg_latency_ms);
293            }
294        }
295
296        endpoint_latencies
297            .into_iter()
298            .filter_map(|(endpoint, latencies)| {
299                if latencies.is_empty() {
300                    return None;
301                }
302
303                let avg = latencies.iter().sum::<f64>() / latencies.len() as f64;
304
305                // High latency pattern if average > 500ms
306                if avg > 500.0 {
307                    Some(ChaosPattern {
308                        pattern_type: "high_latency".to_string(),
309                        frequency: latencies.len() as f64 / buckets.len() as f64,
310                        affected: vec![endpoint],
311                        severity: (avg / 1000.0).min(1.0), // Normalize to 0-1
312                    })
313                } else {
314                    None
315                }
316            })
317            .collect()
318    }
319
320    /// Detect fault patterns
321    fn detect_fault_patterns(&self, buckets: &[MetricsBucket]) -> Vec<ChaosPattern> {
322        let mut endpoint_faults: HashMap<String, usize> = HashMap::new();
323        let mut total_events_per_endpoint: HashMap<String, usize> = HashMap::new();
324
325        for bucket in buckets {
326            for (endpoint, count) in &bucket.affected_endpoints {
327                *total_events_per_endpoint.entry(endpoint.clone()).or_insert(0) += count;
328            }
329            for (fault_type, count) in &bucket.faults_by_type {
330                // Track faults by endpoint (simplified - assumes fault type contains endpoint info)
331                *endpoint_faults.entry(fault_type.clone()).or_insert(0) += count;
332            }
333        }
334
335        endpoint_faults
336            .into_iter()
337            .filter_map(|(endpoint, fault_count)| {
338                let total = total_events_per_endpoint.get(&endpoint).copied().unwrap_or(1);
339                let fault_rate = fault_count as f64 / total as f64;
340
341                // High fault rate if > 20%
342                if fault_rate > 0.2 {
343                    Some(ChaosPattern {
344                        pattern_type: "high_fault_rate".to_string(),
345                        frequency: fault_rate,
346                        affected: vec![endpoint],
347                        severity: fault_rate.min(1.0),
348                    })
349                } else {
350                    None
351                }
352            })
353            .collect()
354    }
355
356    /// Detect rate limit patterns
357    fn detect_rate_limit_patterns(&self, buckets: &[MetricsBucket]) -> Vec<ChaosPattern> {
358        let total_violations: usize = buckets.iter().map(|b| b.rate_limit_violations).sum();
359        let total_events: usize = buckets.iter().map(|b| b.total_events).sum();
360
361        if total_events == 0 {
362            return Vec::new();
363        }
364
365        let violation_rate = total_violations as f64 / total_events as f64;
366
367        if violation_rate > 0.1 {
368            vec![ChaosPattern {
369                pattern_type: "frequent_rate_limits".to_string(),
370                frequency: violation_rate,
371                affected: vec!["global".to_string()],
372                severity: violation_rate.min(1.0),
373            }]
374        } else {
375            Vec::new()
376        }
377    }
378
379    /// Detect time-based patterns
380    fn detect_time_patterns(&self, buckets: &[MetricsBucket]) -> Vec<ChaosPattern> {
381        // Look for patterns like "more errors during certain hours"
382        // This is a simplified implementation
383        if buckets.len() < 10 {
384            return Vec::new();
385        }
386
387        let mut patterns = Vec::new();
388
389        // Check if there's an increasing trend in faults
390        let first_half = &buckets[..buckets.len() / 2];
391        let second_half = &buckets[buckets.len() / 2..];
392
393        let first_avg_faults: f64 = first_half.iter().map(|b| b.total_faults).sum::<usize>() as f64
394            / first_half.len() as f64;
395        let second_avg_faults: f64 = second_half.iter().map(|b| b.total_faults).sum::<usize>()
396            as f64
397            / second_half.len() as f64;
398
399        if second_avg_faults > first_avg_faults * 1.5 {
400            patterns.push(ChaosPattern {
401                pattern_type: "increasing_fault_trend".to_string(),
402                frequency: 1.0,
403                affected: vec!["system".to_string()],
404                severity: ((second_avg_faults - first_avg_faults) / first_avg_faults.max(1.0))
405                    .min(1.0),
406            });
407        }
408
409        patterns
410    }
411
412    /// Detect system weaknesses
413    fn detect_weaknesses(
414        &self,
415        buckets: &[MetricsBucket],
416        impact: &ChaosImpact,
417    ) -> Vec<SystemWeakness> {
418        let mut weaknesses = Vec::new();
419
420        // Weakness 1: No chaos testing on some endpoints
421        if let Some(coverage_weakness) = self.detect_coverage_weakness(buckets) {
422            weaknesses.push(coverage_weakness);
423        }
424
425        // Weakness 2: High impact from chaos (system not resilient)
426        if impact.severity_score > 0.7 {
427            weaknesses.push(SystemWeakness {
428                weakness_type: "low_resilience".to_string(),
429                endpoints: impact.top_affected_endpoints.iter().map(|(ep, _)| ep.clone()).collect(),
430                evidence: {
431                    let mut map = HashMap::new();
432                    map.insert("severity_score".to_string(), impact.severity_score);
433                    map.insert("degradation_percent".to_string(), impact.avg_degradation_percent);
434                    map
435                },
436            });
437        }
438
439        // Weakness 3: Insufficient fault coverage
440        if self.detect_insufficient_fault_coverage(buckets) {
441            weaknesses.push(SystemWeakness {
442                weakness_type: "insufficient_fault_coverage".to_string(),
443                endpoints: vec![],
444                evidence: HashMap::new(),
445            });
446        }
447
448        weaknesses
449    }
450
451    /// Detect coverage weakness
452    fn detect_coverage_weakness(&self, buckets: &[MetricsBucket]) -> Option<SystemWeakness> {
453        if buckets.is_empty() {
454            return Some(SystemWeakness {
455                weakness_type: "no_chaos_testing".to_string(),
456                endpoints: vec![],
457                evidence: HashMap::new(),
458            });
459        }
460
461        None
462    }
463
464    /// Detect insufficient fault coverage
465    fn detect_insufficient_fault_coverage(&self, buckets: &[MetricsBucket]) -> bool {
466        let fault_types: std::collections::HashSet<_> =
467            buckets.iter().flat_map(|b| b.faults_by_type.keys()).collect();
468
469        // Expect at least 3 different fault types for good coverage
470        fault_types.len() < 3
471    }
472
473    /// Generate recommendations from patterns
474    fn recommendations_from_patterns(&self, patterns: &[ChaosPattern]) -> Vec<Recommendation> {
475        patterns
476            .iter()
477            .filter_map(|pattern| self.pattern_to_recommendation(pattern))
478            .collect()
479    }
480
481    /// Convert pattern to recommendation
482    fn pattern_to_recommendation(&self, pattern: &ChaosPattern) -> Option<Recommendation> {
483        match pattern.pattern_type.as_str() {
484            "high_latency" => Some(self.create_latency_recommendation(pattern)),
485            "high_fault_rate" => Some(self.create_fault_recommendation(pattern)),
486            "frequent_rate_limits" => Some(self.create_rate_limit_recommendation(pattern)),
487            "increasing_fault_trend" => Some(self.create_trend_recommendation(pattern)),
488            _ => None,
489        }
490    }
491
492    /// Create latency recommendation
493    fn create_latency_recommendation(&self, pattern: &ChaosPattern) -> Recommendation {
494        let endpoint = pattern.affected.first().map(|s| s.as_str()).unwrap_or("unknown");
495
496        Recommendation {
497            id: format!("rec-latency-{}", uuid::Uuid::new_v4()),
498            category: RecommendationCategory::Latency,
499            severity: if pattern.severity > 0.7 {
500                RecommendationSeverity::High
501            } else {
502                RecommendationSeverity::Medium
503            },
504            confidence: Confidence::new(0.85),
505            title: format!("Increase latency testing for endpoint: {}", endpoint),
506            description: format!(
507                "Endpoint {} shows high average latency ({:.0}ms) under chaos conditions",
508                endpoint,
509                pattern.severity * 1000.0
510            ),
511            rationale: "High latency detected consistently across chaos experiments. \
512                        This indicates the endpoint may be sensitive to delays and needs \
513                        more comprehensive latency testing."
514                .to_string(),
515            action: format!(
516                "Add more aggressive latency scenarios for endpoint {}. \
517                 Test with latencies up to {}ms to validate timeout handling.",
518                endpoint,
519                (pattern.severity * 2000.0) as u64
520            ),
521            example: Some(format!(
522                "mockforge serve --chaos --chaos-latency-ms {} --chaos-latency-probability 0.8",
523                (pattern.severity * 1500.0) as u64
524            )),
525            affected_endpoints: pattern.affected.clone(),
526            metrics: {
527                let mut map = HashMap::new();
528                map.insert("avg_latency_ms".to_string(), pattern.severity * 1000.0);
529                map.insert("frequency".to_string(), pattern.frequency);
530                map
531            },
532            generated_at: Utc::now(),
533            expected_impact: pattern.severity * 0.8,
534        }
535    }
536
537    /// Create fault recommendation
538    fn create_fault_recommendation(&self, pattern: &ChaosPattern) -> Recommendation {
539        let endpoint = pattern.affected.first().map(|s| s.as_str()).unwrap_or("unknown");
540
541        Recommendation {
542            id: format!("rec-fault-{}", uuid::Uuid::new_v4()),
543            category: RecommendationCategory::FaultInjection,
544            severity: if pattern.severity > 0.5 {
545                RecommendationSeverity::High
546            } else {
547                RecommendationSeverity::Medium
548            },
549            confidence: Confidence::new(0.80),
550            title: format!("Endpoint {} shows high fault sensitivity", endpoint),
551            description: format!(
552                "Fault rate of {:.1}% detected for endpoint {}",
553                pattern.frequency * 100.0,
554                endpoint
555            ),
556            rationale: "High fault rate indicates insufficient error handling or retry logic. \
557                        Testing with more diverse fault types is recommended."
558                .to_string(),
559            action: format!(
560                "Implement comprehensive error handling for endpoint {}. \
561                 Test with multiple fault types (500, 502, 503, 504, connection errors).",
562                endpoint
563            ),
564            example: Some(
565                "mockforge serve --chaos --chaos-http-errors '500,502,503,504' \
566                 --chaos-http-error-probability 0.3"
567                    .to_string(),
568            ),
569            affected_endpoints: pattern.affected.clone(),
570            metrics: {
571                let mut map = HashMap::new();
572                map.insert("fault_rate".to_string(), pattern.frequency);
573                map.insert("severity".to_string(), pattern.severity);
574                map
575            },
576            generated_at: Utc::now(),
577            expected_impact: pattern.severity,
578        }
579    }
580
581    /// Create rate limit recommendation
582    fn create_rate_limit_recommendation(&self, pattern: &ChaosPattern) -> Recommendation {
583        Recommendation {
584            id: format!("rec-ratelimit-{}", uuid::Uuid::new_v4()),
585            category: RecommendationCategory::RateLimit,
586            severity: RecommendationSeverity::Medium,
587            confidence: Confidence::new(0.75),
588            title: "Frequent rate limit violations detected".to_string(),
589            description: format!(
590                "Rate limit violations occurring at {:.1}% of requests",
591                pattern.frequency * 100.0
592            ),
593            rationale: "High rate of rate limiting indicates need for better backpressure \
594                        handling and retry logic with exponential backoff."
595                .to_string(),
596            action: "Implement proper retry logic with exponential backoff. \
597                     Test with more aggressive rate limits to validate behavior."
598                .to_string(),
599            example: Some(
600                "mockforge serve --chaos --chaos-rate-limit 10 --chaos-scenario peak_traffic"
601                    .to_string(),
602            ),
603            affected_endpoints: pattern.affected.clone(),
604            metrics: {
605                let mut map = HashMap::new();
606                map.insert("violation_rate".to_string(), pattern.frequency);
607                map
608            },
609            generated_at: Utc::now(),
610            expected_impact: 0.6,
611        }
612    }
613
614    /// Create trend recommendation
615    fn create_trend_recommendation(&self, pattern: &ChaosPattern) -> Recommendation {
616        Recommendation {
617            id: format!("rec-trend-{}", uuid::Uuid::new_v4()),
618            category: RecommendationCategory::Scenario,
619            severity: RecommendationSeverity::High,
620            confidence: Confidence::new(0.70),
621            title: "Increasing fault trend detected - system degradation".to_string(),
622            description: "Fault rate increasing over time, indicating system degradation \
623                          or cascading failures."
624                .to_string(),
625            rationale: "Increasing fault trends suggest lack of circuit breaker or bulkhead \
626                        patterns. System may be experiencing cascading failures."
627                .to_string(),
628            action: "Implement circuit breaker and bulkhead patterns. \
629                     Test with cascading failure scenarios."
630                .to_string(),
631            example: Some("mockforge serve --chaos --chaos-scenario cascading_failure".to_string()),
632            affected_endpoints: pattern.affected.clone(),
633            metrics: {
634                let mut map = HashMap::new();
635                map.insert("severity".to_string(), pattern.severity);
636                map
637            },
638            generated_at: Utc::now(),
639            expected_impact: 0.9,
640        }
641    }
642
643    /// Generate recommendations from weaknesses
644    fn recommendations_from_weaknesses(
645        &self,
646        weaknesses: &[SystemWeakness],
647    ) -> Vec<Recommendation> {
648        weaknesses
649            .iter()
650            .filter_map(|weakness| self.weakness_to_recommendation(weakness))
651            .collect()
652    }
653
654    /// Convert weakness to recommendation
655    fn weakness_to_recommendation(&self, weakness: &SystemWeakness) -> Option<Recommendation> {
656        match weakness.weakness_type.as_str() {
657            "no_chaos_testing" => Some(self.create_no_testing_recommendation()),
658            "low_resilience" => Some(self.create_resilience_recommendation(weakness)),
659            "insufficient_fault_coverage" => Some(self.create_coverage_recommendation()),
660            _ => None,
661        }
662    }
663
664    /// Create no testing recommendation
665    fn create_no_testing_recommendation(&self) -> Recommendation {
666        Recommendation {
667            id: format!("rec-start-{}", uuid::Uuid::new_v4()),
668            category: RecommendationCategory::Coverage,
669            severity: RecommendationSeverity::Critical,
670            confidence: Confidence::new(1.0),
671            title: "Start chaos engineering testing".to_string(),
672            description: "No chaos testing detected. Begin with basic scenarios to build \
673                          confidence in system resilience."
674                .to_string(),
675            rationale: "Without chaos testing, you cannot validate how your system behaves \
676                        under failure conditions."
677                .to_string(),
678            action: "Start with the 'network_degradation' scenario to test basic resilience."
679                .to_string(),
680            example: Some(
681                "mockforge serve --chaos --chaos-scenario network_degradation".to_string(),
682            ),
683            affected_endpoints: vec![],
684            metrics: HashMap::new(),
685            generated_at: Utc::now(),
686            expected_impact: 1.0,
687        }
688    }
689
690    /// Create resilience recommendation
691    fn create_resilience_recommendation(&self, weakness: &SystemWeakness) -> Recommendation {
692        Recommendation {
693            id: format!("rec-resilience-{}", uuid::Uuid::new_v4()),
694            category: RecommendationCategory::CircuitBreaker,
695            severity: RecommendationSeverity::Critical,
696            confidence: Confidence::new(0.85),
697            title: "System shows low resilience - implement resilience patterns".to_string(),
698            description: format!(
699                "System degradation of {:.1}% under chaos - resilience patterns needed",
700                weakness.evidence.get("degradation_percent").unwrap_or(&0.0)
701            ),
702            rationale: "High system degradation indicates missing resilience patterns like \
703                        circuit breakers, bulkheads, and retry logic."
704                .to_string(),
705            action: "Implement circuit breaker and bulkhead patterns for critical endpoints. \
706                     Add retry logic with exponential backoff."
707                .to_string(),
708            example: Some(
709                "# Test with circuit breaker scenario\n\
710                 mockforge serve --chaos --chaos-scenario cascading_failure"
711                    .to_string(),
712            ),
713            affected_endpoints: weakness.endpoints.clone(),
714            metrics: weakness.evidence.clone(),
715            generated_at: Utc::now(),
716            expected_impact: 0.95,
717        }
718    }
719
720    /// Create coverage recommendation
721    fn create_coverage_recommendation(&self) -> Recommendation {
722        Recommendation {
723            id: format!("rec-coverage-{}", uuid::Uuid::new_v4()),
724            category: RecommendationCategory::Coverage,
725            severity: RecommendationSeverity::High,
726            confidence: Confidence::new(0.80),
727            title: "Insufficient fault type coverage".to_string(),
728            description: "Testing with limited fault types. Expand coverage to include \
729                          multiple error conditions."
730                .to_string(),
731            rationale: "Comprehensive chaos testing should include various fault types: \
732                        HTTP errors (500, 502, 503, 504), connection errors, and timeouts."
733                .to_string(),
734            action: "Add diverse fault injection scenarios covering all major failure modes."
735                .to_string(),
736            example: Some(
737                "mockforge serve --chaos --chaos-scenario service_instability".to_string(),
738            ),
739            affected_endpoints: vec![],
740            metrics: HashMap::new(),
741            generated_at: Utc::now(),
742            expected_impact: 0.7,
743        }
744    }
745
746    /// Generate coverage recommendations
747    fn coverage_recommendations(
748        &self,
749        buckets: &[MetricsBucket],
750        _impact: &ChaosImpact,
751    ) -> Vec<Recommendation> {
752        let mut recs = Vec::new();
753
754        // Check protocol coverage
755        let protocols_tested: std::collections::HashSet<_> =
756            buckets.iter().flat_map(|b| b.protocol_events.keys()).collect();
757
758        if protocols_tested.is_empty() || protocols_tested.len() < 2 {
759            recs.push(Recommendation {
760                id: format!("rec-protocol-{}", uuid::Uuid::new_v4()),
761                category: RecommendationCategory::Coverage,
762                severity: RecommendationSeverity::Medium,
763                confidence: Confidence::new(0.75),
764                title: "Expand protocol-specific chaos testing".to_string(),
765                description: "Limited protocol coverage. Test chaos scenarios across \
766                              HTTP, gRPC, WebSocket, and GraphQL."
767                    .to_string(),
768                rationale: "Different protocols have different failure modes. \
769                            Comprehensive testing should cover all protocols in use."
770                    .to_string(),
771                action: "Enable protocol-specific chaos scenarios.".to_string(),
772                example: Some(
773                    "# Test gRPC chaos\n\
774                     mockforge serve --chaos --grpc-port 50051"
775                        .to_string(),
776                ),
777                affected_endpoints: vec![],
778                metrics: HashMap::new(),
779                generated_at: Utc::now(),
780                expected_impact: 0.6,
781            });
782        }
783
784        recs
785    }
786
787    /// Generate scenario recommendations
788    fn scenario_recommendations(&self, impact: &ChaosImpact) -> Vec<Recommendation> {
789        let mut recs = Vec::new();
790
791        // Recommend progressive chaos testing
792        if impact.total_events < 100 {
793            recs.push(Recommendation {
794                id: format!("rec-progressive-{}", uuid::Uuid::new_v4()),
795                category: RecommendationCategory::Scenario,
796                severity: RecommendationSeverity::Medium,
797                confidence: Confidence::new(0.70),
798                title: "Implement progressive chaos testing".to_string(),
799                description: "Start with light chaos and gradually increase intensity \
800                              to identify breaking points."
801                    .to_string(),
802                rationale: "Progressive testing helps identify at what point your system \
803                            starts to degrade, allowing you to set appropriate limits."
804                    .to_string(),
805                action: "Run chaos scenarios in order of increasing intensity: \
806                         network_degradation → service_instability → cascading_failure"
807                    .to_string(),
808                example: Some(
809                    "# Phase 1: Light chaos\n\
810                     mockforge serve --chaos --chaos-scenario network_degradation\n\n\
811                     # Phase 2: Medium chaos\n\
812                     mockforge serve --chaos --chaos-scenario service_instability\n\n\
813                     # Phase 3: Heavy chaos\n\
814                     mockforge serve --chaos --chaos-scenario cascading_failure"
815                        .to_string(),
816                ),
817                affected_endpoints: vec![],
818                metrics: HashMap::new(),
819                generated_at: Utc::now(),
820                expected_impact: 0.75,
821            });
822        }
823
824        recs
825    }
826
827    /// Get all current recommendations
828    pub fn get_recommendations(&self) -> Vec<Recommendation> {
829        self.recommendations.read().clone()
830    }
831
832    /// Get recommendations by category
833    pub fn get_recommendations_by_category(
834        &self,
835        category: RecommendationCategory,
836    ) -> Vec<Recommendation> {
837        self.recommendations
838            .read()
839            .iter()
840            .filter(|r| r.category == category)
841            .cloned()
842            .collect()
843    }
844
845    /// Get recommendations by severity
846    pub fn get_recommendations_by_severity(
847        &self,
848        min_severity: RecommendationSeverity,
849    ) -> Vec<Recommendation> {
850        self.recommendations
851            .read()
852            .iter()
853            .filter(|r| r.severity >= min_severity)
854            .cloned()
855            .collect()
856    }
857
858    /// Clear all recommendations
859    pub fn clear(&self) {
860        self.recommendations.write().clear();
861    }
862}
863
864impl Default for RecommendationEngine {
865    fn default() -> Self {
866        Self::new()
867    }
868}
869
#[cfg(test)]
mod tests {
    use super::*;

    use std::collections::HashMap;

    /// A confidence of 0.8 is stored unchanged and classified as high only.
    #[test]
    fn test_confidence_creation() {
        let confidence = Confidence::new(0.8);

        assert_eq!(confidence.value(), 0.8);
        assert!(confidence.is_high());
        assert!(!confidence.is_medium());
        assert!(!confidence.is_low());
    }

    /// Out-of-range inputs are clamped into `0.0..=1.0`.
    #[test]
    fn test_confidence_clamping() {
        for &(raw, clamped) in &[(1.5, 1.0), (-0.5, 0.0)] {
            assert_eq!(Confidence::new(raw).value(), clamped);
        }
    }

    /// The score of a populated recommendation lies in `(0.0, 1.0]`.
    #[test]
    fn test_recommendation_score() {
        let placeholder = || "Test".to_string();

        let recommendation = Recommendation {
            id: "test".to_string(),
            category: RecommendationCategory::Latency,
            severity: RecommendationSeverity::High,
            confidence: Confidence::new(0.9),
            title: placeholder(),
            description: placeholder(),
            rationale: placeholder(),
            action: placeholder(),
            example: None,
            affected_endpoints: vec![],
            metrics: HashMap::new(),
            generated_at: Utc::now(),
            expected_impact: 0.8,
        };

        let score = recommendation.score();
        assert!(score > 0.0);
        assert!(score <= 1.0);
    }

    /// A freshly created engine holds no recommendations.
    #[test]
    fn test_engine_creation() {
        let engine = RecommendationEngine::new();
        assert!(engine.get_recommendations().is_empty());
    }

    /// One bucket with an 800 ms average latency yields one `high_latency` pattern.
    #[test]
    fn test_detect_latency_patterns() {
        let engine = RecommendationEngine::new();

        let mut slow_bucket =
            MetricsBucket::new(Utc::now(), crate::analytics::TimeBucket::Minute);
        slow_bucket.avg_latency_ms = 800.0;
        slow_bucket.affected_endpoints.insert("/api/slow".to_string(), 10);

        let buckets = [slow_bucket];
        let patterns = engine.detect_latency_patterns(&buckets);

        assert_eq!(patterns.len(), 1);
        assert_eq!(patterns[0].pattern_type, "high_latency");
    }

    /// With no buckets at all, the engine still emits a coverage recommendation.
    #[test]
    fn test_no_chaos_recommendation() {
        let engine = RecommendationEngine::new();
        let impact = ChaosImpact::from_buckets(&[]);

        let recs = engine.analyze_and_recommend(&[], &impact);

        // Should recommend starting chaos testing
        assert!(!recs.is_empty());
        let has_coverage = recs
            .iter()
            .any(|rec| matches!(rec.category, RecommendationCategory::Coverage));
        assert!(has_coverage);
    }
}