use crate::analytics::{ChaosImpact, MetricsBucket};
use chrono::{DateTime, Utc};
use parking_lot::RwLock;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RecommendationCategory {
Latency,
FaultInjection,
RateLimit,
TrafficShaping,
CircuitBreaker,
Bulkhead,
Scenario,
Coverage,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RecommendationSeverity {
Info,
Low,
Medium,
High,
Critical,
}
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, Serialize, Deserialize)]
pub struct Confidence(f64);
impl Confidence {
pub fn new(value: f64) -> Self {
Self(value.clamp(0.0, 1.0))
}
pub fn value(&self) -> f64 {
self.0
}
pub fn is_high(&self) -> bool {
self.0 >= 0.7
}
pub fn is_medium(&self) -> bool {
self.0 >= 0.4 && self.0 < 0.7
}
pub fn is_low(&self) -> bool {
self.0 < 0.4
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Recommendation {
pub id: String,
pub category: RecommendationCategory,
pub severity: RecommendationSeverity,
pub confidence: Confidence,
pub title: String,
pub description: String,
pub rationale: String,
pub action: String,
pub example: Option<String>,
pub affected_endpoints: Vec<String>,
pub metrics: HashMap<String, f64>,
pub generated_at: DateTime<Utc>,
pub expected_impact: f64,
}
impl Recommendation {
pub fn score(&self) -> f64 {
let severity_weight = match self.severity {
RecommendationSeverity::Info => 0.2,
RecommendationSeverity::Low => 0.4,
RecommendationSeverity::Medium => 0.6,
RecommendationSeverity::High => 0.8,
RecommendationSeverity::Critical => 1.0,
};
(severity_weight * 0.4) + (self.confidence.value() * 0.3) + (self.expected_impact * 0.3)
}
}
#[derive(Debug, Clone)]
struct ChaosPattern {
pattern_type: String,
frequency: f64,
affected: Vec<String>,
severity: f64,
}
#[derive(Debug, Clone)]
struct SystemWeakness {
weakness_type: String,
endpoints: Vec<String>,
evidence: HashMap<String, f64>,
}
pub struct RecommendationEngine {
recommendations: Arc<RwLock<Vec<Recommendation>>>,
patterns: Arc<RwLock<Vec<ChaosPattern>>>,
config: EngineConfig,
}
#[derive(Debug, Clone)]
pub struct EngineConfig {
pub min_confidence: f64,
pub max_recommendations: usize,
pub enable_learning: bool,
pub analysis_window_hours: i64,
}
impl Default for EngineConfig {
fn default() -> Self {
Self {
min_confidence: 0.5,
max_recommendations: 20,
enable_learning: true,
analysis_window_hours: 24,
}
}
}
impl RecommendationEngine {
pub fn new() -> Self {
Self::with_config(EngineConfig::default())
}
pub fn with_config(config: EngineConfig) -> Self {
Self {
recommendations: Arc::new(RwLock::new(Vec::new())),
patterns: Arc::new(RwLock::new(Vec::new())),
config,
}
}
pub fn analyze_and_recommend(
&self,
buckets: &[MetricsBucket],
impact: &ChaosImpact,
) -> Vec<Recommendation> {
let mut recommendations = Vec::new();
let patterns = self.detect_patterns(buckets);
let weaknesses = self.detect_weaknesses(buckets, impact);
recommendations.extend(self.recommendations_from_patterns(&patterns));
recommendations.extend(self.recommendations_from_weaknesses(&weaknesses));
recommendations.extend(self.coverage_recommendations(buckets, impact));
recommendations.extend(self.scenario_recommendations(impact));
let mut filtered: Vec<_> = recommendations
.into_iter()
.filter(|r| r.confidence.value() >= self.config.min_confidence)
.collect();
filtered
.sort_by(|a, b| b.score().partial_cmp(&a.score()).unwrap_or(std::cmp::Ordering::Equal));
filtered.truncate(self.config.max_recommendations);
{
let mut recs = self.recommendations.write();
*recs = filtered.clone();
}
if self.config.enable_learning {
let mut stored_patterns = self.patterns.write();
*stored_patterns = patterns;
}
filtered
}
fn detect_patterns(&self, buckets: &[MetricsBucket]) -> Vec<ChaosPattern> {
let mut patterns = Vec::new();
if buckets.is_empty() {
return patterns;
}
let latency_endpoints = self.detect_latency_patterns(buckets);
patterns.extend(latency_endpoints);
let fault_endpoints = self.detect_fault_patterns(buckets);
patterns.extend(fault_endpoints);
let rate_limit_patterns = self.detect_rate_limit_patterns(buckets);
patterns.extend(rate_limit_patterns);
let time_patterns = self.detect_time_patterns(buckets);
patterns.extend(time_patterns);
patterns
}
fn detect_latency_patterns(&self, buckets: &[MetricsBucket]) -> Vec<ChaosPattern> {
let mut endpoint_latencies: HashMap<String, Vec<f64>> = HashMap::new();
for bucket in buckets {
for endpoint in bucket.affected_endpoints.keys() {
endpoint_latencies
.entry(endpoint.clone())
.or_default()
.push(bucket.avg_latency_ms);
}
}
endpoint_latencies
.into_iter()
.filter_map(|(endpoint, latencies)| {
if latencies.is_empty() {
return None;
}
let avg = latencies.iter().sum::<f64>() / latencies.len() as f64;
if avg > 500.0 {
Some(ChaosPattern {
pattern_type: "high_latency".to_string(),
frequency: latencies.len() as f64 / buckets.len() as f64,
affected: vec![endpoint],
severity: (avg / 1000.0).min(1.0), })
} else {
None
}
})
.collect()
}
fn detect_fault_patterns(&self, buckets: &[MetricsBucket]) -> Vec<ChaosPattern> {
let mut endpoint_faults: HashMap<String, usize> = HashMap::new();
let mut total_events_per_endpoint: HashMap<String, usize> = HashMap::new();
for bucket in buckets {
for (endpoint, count) in &bucket.affected_endpoints {
*total_events_per_endpoint.entry(endpoint.clone()).or_insert(0) += count;
}
for (fault_type, count) in &bucket.faults_by_type {
*endpoint_faults.entry(fault_type.clone()).or_insert(0) += count;
}
}
endpoint_faults
.into_iter()
.filter_map(|(endpoint, fault_count)| {
let total = total_events_per_endpoint.get(&endpoint).copied().unwrap_or(1);
let fault_rate = fault_count as f64 / total as f64;
if fault_rate > 0.2 {
Some(ChaosPattern {
pattern_type: "high_fault_rate".to_string(),
frequency: fault_rate,
affected: vec![endpoint],
severity: fault_rate.min(1.0),
})
} else {
None
}
})
.collect()
}
fn detect_rate_limit_patterns(&self, buckets: &[MetricsBucket]) -> Vec<ChaosPattern> {
let total_violations: usize = buckets.iter().map(|b| b.rate_limit_violations).sum();
let total_events: usize = buckets.iter().map(|b| b.total_events).sum();
if total_events == 0 {
return Vec::new();
}
let violation_rate = total_violations as f64 / total_events as f64;
if violation_rate > 0.1 {
vec![ChaosPattern {
pattern_type: "frequent_rate_limits".to_string(),
frequency: violation_rate,
affected: vec!["global".to_string()],
severity: violation_rate.min(1.0),
}]
} else {
Vec::new()
}
}
fn detect_time_patterns(&self, buckets: &[MetricsBucket]) -> Vec<ChaosPattern> {
if buckets.len() < 10 {
return Vec::new();
}
let mut patterns = Vec::new();
let first_half = &buckets[..buckets.len() / 2];
let second_half = &buckets[buckets.len() / 2..];
let first_avg_faults: f64 = first_half.iter().map(|b| b.total_faults).sum::<usize>() as f64
/ first_half.len() as f64;
let second_avg_faults: f64 = second_half.iter().map(|b| b.total_faults).sum::<usize>()
as f64
/ second_half.len() as f64;
if second_avg_faults > first_avg_faults * 1.5 {
patterns.push(ChaosPattern {
pattern_type: "increasing_fault_trend".to_string(),
frequency: 1.0,
affected: vec!["system".to_string()],
severity: ((second_avg_faults - first_avg_faults) / first_avg_faults.max(1.0))
.min(1.0),
});
}
patterns
}
fn detect_weaknesses(
&self,
buckets: &[MetricsBucket],
impact: &ChaosImpact,
) -> Vec<SystemWeakness> {
let mut weaknesses = Vec::new();
if let Some(coverage_weakness) = self.detect_coverage_weakness(buckets) {
weaknesses.push(coverage_weakness);
}
if impact.severity_score > 0.7 {
weaknesses.push(SystemWeakness {
weakness_type: "low_resilience".to_string(),
endpoints: impact.top_affected_endpoints.iter().map(|(ep, _)| ep.clone()).collect(),
evidence: {
let mut map = HashMap::new();
map.insert("severity_score".to_string(), impact.severity_score);
map.insert("degradation_percent".to_string(), impact.avg_degradation_percent);
map
},
});
}
if self.detect_insufficient_fault_coverage(buckets) {
weaknesses.push(SystemWeakness {
weakness_type: "insufficient_fault_coverage".to_string(),
endpoints: vec![],
evidence: HashMap::new(),
});
}
weaknesses
}
fn detect_coverage_weakness(&self, buckets: &[MetricsBucket]) -> Option<SystemWeakness> {
if buckets.is_empty() {
return Some(SystemWeakness {
weakness_type: "no_chaos_testing".to_string(),
endpoints: vec![],
evidence: HashMap::new(),
});
}
None
}
fn detect_insufficient_fault_coverage(&self, buckets: &[MetricsBucket]) -> bool {
let fault_types: std::collections::HashSet<_> =
buckets.iter().flat_map(|b| b.faults_by_type.keys()).collect();
fault_types.len() < 3
}
fn recommendations_from_patterns(&self, patterns: &[ChaosPattern]) -> Vec<Recommendation> {
patterns
.iter()
.filter_map(|pattern| self.pattern_to_recommendation(pattern))
.collect()
}
fn pattern_to_recommendation(&self, pattern: &ChaosPattern) -> Option<Recommendation> {
match pattern.pattern_type.as_str() {
"high_latency" => Some(self.create_latency_recommendation(pattern)),
"high_fault_rate" => Some(self.create_fault_recommendation(pattern)),
"frequent_rate_limits" => Some(self.create_rate_limit_recommendation(pattern)),
"increasing_fault_trend" => Some(self.create_trend_recommendation(pattern)),
_ => None,
}
}
fn create_latency_recommendation(&self, pattern: &ChaosPattern) -> Recommendation {
let endpoint = pattern.affected.first().map(|s| s.as_str()).unwrap_or("unknown");
Recommendation {
id: format!("rec-latency-{}", uuid::Uuid::new_v4()),
category: RecommendationCategory::Latency,
severity: if pattern.severity > 0.7 {
RecommendationSeverity::High
} else {
RecommendationSeverity::Medium
},
confidence: Confidence::new(0.85),
title: format!("Increase latency testing for endpoint: {}", endpoint),
description: format!(
"Endpoint {} shows high average latency ({:.0}ms) under chaos conditions",
endpoint,
pattern.severity * 1000.0
),
rationale: "High latency detected consistently across chaos experiments. \
This indicates the endpoint may be sensitive to delays and needs \
more comprehensive latency testing."
.to_string(),
action: format!(
"Add more aggressive latency scenarios for endpoint {}. \
Test with latencies up to {}ms to validate timeout handling.",
endpoint,
(pattern.severity * 2000.0) as u64
),
example: Some(format!(
"mockforge serve --chaos --chaos-latency-ms {} --chaos-latency-probability 0.8",
(pattern.severity * 1500.0) as u64
)),
affected_endpoints: pattern.affected.clone(),
metrics: {
let mut map = HashMap::new();
map.insert("avg_latency_ms".to_string(), pattern.severity * 1000.0);
map.insert("frequency".to_string(), pattern.frequency);
map
},
generated_at: Utc::now(),
expected_impact: pattern.severity * 0.8,
}
}
fn create_fault_recommendation(&self, pattern: &ChaosPattern) -> Recommendation {
let endpoint = pattern.affected.first().map(|s| s.as_str()).unwrap_or("unknown");
Recommendation {
id: format!("rec-fault-{}", uuid::Uuid::new_v4()),
category: RecommendationCategory::FaultInjection,
severity: if pattern.severity > 0.5 {
RecommendationSeverity::High
} else {
RecommendationSeverity::Medium
},
confidence: Confidence::new(0.80),
title: format!("Endpoint {} shows high fault sensitivity", endpoint),
description: format!(
"Fault rate of {:.1}% detected for endpoint {}",
pattern.frequency * 100.0,
endpoint
),
rationale: "High fault rate indicates insufficient error handling or retry logic. \
Testing with more diverse fault types is recommended."
.to_string(),
action: format!(
"Implement comprehensive error handling for endpoint {}. \
Test with multiple fault types (500, 502, 503, 504, connection errors).",
endpoint
),
example: Some(
"mockforge serve --chaos --chaos-http-errors '500,502,503,504' \
--chaos-http-error-probability 0.3"
.to_string(),
),
affected_endpoints: pattern.affected.clone(),
metrics: {
let mut map = HashMap::new();
map.insert("fault_rate".to_string(), pattern.frequency);
map.insert("severity".to_string(), pattern.severity);
map
},
generated_at: Utc::now(),
expected_impact: pattern.severity,
}
}
fn create_rate_limit_recommendation(&self, pattern: &ChaosPattern) -> Recommendation {
Recommendation {
id: format!("rec-ratelimit-{}", uuid::Uuid::new_v4()),
category: RecommendationCategory::RateLimit,
severity: RecommendationSeverity::Medium,
confidence: Confidence::new(0.75),
title: "Frequent rate limit violations detected".to_string(),
description: format!(
"Rate limit violations occurring at {:.1}% of requests",
pattern.frequency * 100.0
),
rationale: "High rate of rate limiting indicates need for better backpressure \
handling and retry logic with exponential backoff."
.to_string(),
action: "Implement proper retry logic with exponential backoff. \
Test with more aggressive rate limits to validate behavior."
.to_string(),
example: Some(
"mockforge serve --chaos --chaos-rate-limit 10 --chaos-scenario peak_traffic"
.to_string(),
),
affected_endpoints: pattern.affected.clone(),
metrics: {
let mut map = HashMap::new();
map.insert("violation_rate".to_string(), pattern.frequency);
map
},
generated_at: Utc::now(),
expected_impact: 0.6,
}
}
fn create_trend_recommendation(&self, pattern: &ChaosPattern) -> Recommendation {
Recommendation {
id: format!("rec-trend-{}", uuid::Uuid::new_v4()),
category: RecommendationCategory::Scenario,
severity: RecommendationSeverity::High,
confidence: Confidence::new(0.70),
title: "Increasing fault trend detected - system degradation".to_string(),
description: "Fault rate increasing over time, indicating system degradation \
or cascading failures."
.to_string(),
rationale: "Increasing fault trends suggest lack of circuit breaker or bulkhead \
patterns. System may be experiencing cascading failures."
.to_string(),
action: "Implement circuit breaker and bulkhead patterns. \
Test with cascading failure scenarios."
.to_string(),
example: Some("mockforge serve --chaos --chaos-scenario cascading_failure".to_string()),
affected_endpoints: pattern.affected.clone(),
metrics: {
let mut map = HashMap::new();
map.insert("severity".to_string(), pattern.severity);
map
},
generated_at: Utc::now(),
expected_impact: 0.9,
}
}
fn recommendations_from_weaknesses(
&self,
weaknesses: &[SystemWeakness],
) -> Vec<Recommendation> {
weaknesses
.iter()
.filter_map(|weakness| self.weakness_to_recommendation(weakness))
.collect()
}
fn weakness_to_recommendation(&self, weakness: &SystemWeakness) -> Option<Recommendation> {
match weakness.weakness_type.as_str() {
"no_chaos_testing" => Some(self.create_no_testing_recommendation()),
"low_resilience" => Some(self.create_resilience_recommendation(weakness)),
"insufficient_fault_coverage" => Some(self.create_coverage_recommendation()),
_ => None,
}
}
fn create_no_testing_recommendation(&self) -> Recommendation {
Recommendation {
id: format!("rec-start-{}", uuid::Uuid::new_v4()),
category: RecommendationCategory::Coverage,
severity: RecommendationSeverity::Critical,
confidence: Confidence::new(1.0),
title: "Start chaos engineering testing".to_string(),
description: "No chaos testing detected. Begin with basic scenarios to build \
confidence in system resilience."
.to_string(),
rationale: "Without chaos testing, you cannot validate how your system behaves \
under failure conditions."
.to_string(),
action: "Start with the 'network_degradation' scenario to test basic resilience."
.to_string(),
example: Some(
"mockforge serve --chaos --chaos-scenario network_degradation".to_string(),
),
affected_endpoints: vec![],
metrics: HashMap::new(),
generated_at: Utc::now(),
expected_impact: 1.0,
}
}
fn create_resilience_recommendation(&self, weakness: &SystemWeakness) -> Recommendation {
Recommendation {
id: format!("rec-resilience-{}", uuid::Uuid::new_v4()),
category: RecommendationCategory::CircuitBreaker,
severity: RecommendationSeverity::Critical,
confidence: Confidence::new(0.85),
title: "System shows low resilience - implement resilience patterns".to_string(),
description: format!(
"System degradation of {:.1}% under chaos - resilience patterns needed",
weakness.evidence.get("degradation_percent").unwrap_or(&0.0)
),
rationale: "High system degradation indicates missing resilience patterns like \
circuit breakers, bulkheads, and retry logic."
.to_string(),
action: "Implement circuit breaker and bulkhead patterns for critical endpoints. \
Add retry logic with exponential backoff."
.to_string(),
example: Some(
"# Test with circuit breaker scenario\n\
mockforge serve --chaos --chaos-scenario cascading_failure"
.to_string(),
),
affected_endpoints: weakness.endpoints.clone(),
metrics: weakness.evidence.clone(),
generated_at: Utc::now(),
expected_impact: 0.95,
}
}
fn create_coverage_recommendation(&self) -> Recommendation {
Recommendation {
id: format!("rec-coverage-{}", uuid::Uuid::new_v4()),
category: RecommendationCategory::Coverage,
severity: RecommendationSeverity::High,
confidence: Confidence::new(0.80),
title: "Insufficient fault type coverage".to_string(),
description: "Testing with limited fault types. Expand coverage to include \
multiple error conditions."
.to_string(),
rationale: "Comprehensive chaos testing should include various fault types: \
HTTP errors (500, 502, 503, 504), connection errors, and timeouts."
.to_string(),
action: "Add diverse fault injection scenarios covering all major failure modes."
.to_string(),
example: Some(
"mockforge serve --chaos --chaos-scenario service_instability".to_string(),
),
affected_endpoints: vec![],
metrics: HashMap::new(),
generated_at: Utc::now(),
expected_impact: 0.7,
}
}
fn coverage_recommendations(
&self,
buckets: &[MetricsBucket],
_impact: &ChaosImpact,
) -> Vec<Recommendation> {
let mut recs = Vec::new();
let protocols_tested: std::collections::HashSet<_> =
buckets.iter().flat_map(|b| b.protocol_events.keys()).collect();
if protocols_tested.is_empty() || protocols_tested.len() < 2 {
recs.push(Recommendation {
id: format!("rec-protocol-{}", uuid::Uuid::new_v4()),
category: RecommendationCategory::Coverage,
severity: RecommendationSeverity::Medium,
confidence: Confidence::new(0.75),
title: "Expand protocol-specific chaos testing".to_string(),
description: "Limited protocol coverage. Test chaos scenarios across \
HTTP, gRPC, WebSocket, and GraphQL."
.to_string(),
rationale: "Different protocols have different failure modes. \
Comprehensive testing should cover all protocols in use."
.to_string(),
action: "Enable protocol-specific chaos scenarios.".to_string(),
example: Some(
"# Test gRPC chaos\n\
mockforge serve --chaos --grpc-port 50051"
.to_string(),
),
affected_endpoints: vec![],
metrics: HashMap::new(),
generated_at: Utc::now(),
expected_impact: 0.6,
});
}
recs
}
fn scenario_recommendations(&self, impact: &ChaosImpact) -> Vec<Recommendation> {
let mut recs = Vec::new();
if impact.total_events < 100 {
recs.push(Recommendation {
id: format!("rec-progressive-{}", uuid::Uuid::new_v4()),
category: RecommendationCategory::Scenario,
severity: RecommendationSeverity::Medium,
confidence: Confidence::new(0.70),
title: "Implement progressive chaos testing".to_string(),
description: "Start with light chaos and gradually increase intensity \
to identify breaking points."
.to_string(),
rationale: "Progressive testing helps identify at what point your system \
starts to degrade, allowing you to set appropriate limits."
.to_string(),
action: "Run chaos scenarios in order of increasing intensity: \
network_degradation → service_instability → cascading_failure"
.to_string(),
example: Some(
"# Phase 1: Light chaos\n\
mockforge serve --chaos --chaos-scenario network_degradation\n\n\
# Phase 2: Medium chaos\n\
mockforge serve --chaos --chaos-scenario service_instability\n\n\
# Phase 3: Heavy chaos\n\
mockforge serve --chaos --chaos-scenario cascading_failure"
.to_string(),
),
affected_endpoints: vec![],
metrics: HashMap::new(),
generated_at: Utc::now(),
expected_impact: 0.75,
});
}
recs
}
pub fn get_recommendations(&self) -> Vec<Recommendation> {
self.recommendations.read().clone()
}
pub fn get_recommendations_by_category(
&self,
category: RecommendationCategory,
) -> Vec<Recommendation> {
self.recommendations
.read()
.iter()
.filter(|r| r.category == category)
.cloned()
.collect()
}
pub fn get_recommendations_by_severity(
&self,
min_severity: RecommendationSeverity,
) -> Vec<Recommendation> {
self.recommendations
.read()
.iter()
.filter(|r| r.severity >= min_severity)
.cloned()
.collect()
}
pub fn clear(&self) {
self.recommendations.write().clear();
}
}
impl Default for RecommendationEngine {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::collections::HashMap;
#[test]
fn test_confidence_creation() {
let conf = Confidence::new(0.8);
assert_eq!(conf.value(), 0.8);
assert!(conf.is_high());
assert!(!conf.is_medium());
assert!(!conf.is_low());
}
#[test]
fn test_confidence_clamping() {
let conf = Confidence::new(1.5);
assert_eq!(conf.value(), 1.0);
let conf = Confidence::new(-0.5);
assert_eq!(conf.value(), 0.0);
}
#[test]
fn test_recommendation_score() {
let rec = Recommendation {
id: "test".to_string(),
category: RecommendationCategory::Latency,
severity: RecommendationSeverity::High,
confidence: Confidence::new(0.9),
title: "Test".to_string(),
description: "Test".to_string(),
rationale: "Test".to_string(),
action: "Test".to_string(),
example: None,
affected_endpoints: vec![],
metrics: HashMap::new(),
generated_at: Utc::now(),
expected_impact: 0.8,
};
let score = rec.score();
assert!(score > 0.0 && score <= 1.0);
}
#[test]
fn test_engine_creation() {
let engine = RecommendationEngine::new();
let recs = engine.get_recommendations();
assert_eq!(recs.len(), 0);
}
#[test]
fn test_detect_latency_patterns() {
let engine = RecommendationEngine::new();
let mut bucket = MetricsBucket::new(Utc::now(), crate::analytics::TimeBucket::Minute);
bucket.avg_latency_ms = 800.0;
bucket.affected_endpoints.insert("/api/slow".to_string(), 10);
let patterns = engine.detect_latency_patterns(&[bucket]);
assert_eq!(patterns.len(), 1);
assert_eq!(patterns[0].pattern_type, "high_latency");
}
#[test]
fn test_no_chaos_recommendation() {
let engine = RecommendationEngine::new();
let impact = ChaosImpact::from_buckets(&[]);
let recs = engine.analyze_and_recommend(&[], &impact);
assert!(!recs.is_empty());
assert!(recs.iter().any(|r| r.category == RecommendationCategory::Coverage));
}
}