mockforge_chaos/ml_parameter_optimizer.rs

//! ML-based parameter optimization for chaos scenarios
//!
//! Uses a Bayesian-optimization-inspired heuristic over historical run data to
//! recommend chaos parameters that balance effectiveness and system stability.
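//!
//! # Example
//!
//! A minimal usage sketch; the `use` path is an assumption about how this
//! module is exported, so adjust it to your crate layout:
//!
//! ```ignore
//! use mockforge_chaos::ml_parameter_optimizer::{OptimizerConfig, ParameterOptimizer};
//!
//! let mut optimizer = ParameterOptimizer::new(OptimizerConfig::default());
//! optimizer.add_runs(historical_runs); // your collected `OrchestrationRun`s
//! for rec in optimizer.optimize().expect("needs at least `min_runs` runs") {
//!     println!("{}: {:?} -> {:.2}", rec.parameter, rec.current_value, rec.recommended_value);
//! }
//! ```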

use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// Historical orchestration run
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OrchestrationRun {
    pub id: String,
    pub orchestration_id: String,
    pub parameters: HashMap<String, f64>,
    pub timestamp: DateTime<Utc>,
    pub duration_ms: u64,
    pub success: bool,
    pub metrics: RunMetrics,
}

/// Run metrics
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RunMetrics {
    pub chaos_effectiveness: f64, // How much chaos was actually induced (0-1)
    pub system_stability: f64,    // How stable the system remained (0-1)
    pub error_rate: f64,
    pub recovery_time_ms: u64,
    pub failures_detected: u32,
    pub false_positives: u32,
}

/// Parameter optimization recommendation
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationRecommendation {
    pub parameter: String,
    pub current_value: Option<f64>,
    pub recommended_value: f64,
    pub confidence: f64,
    pub reasoning: String,
    pub expected_impact: ExpectedImpact,
    pub based_on_runs: usize,
}

/// Expected impact of parameter change
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExpectedImpact {
    pub chaos_effectiveness_delta: f64,
    pub system_stability_delta: f64,
    pub overall_score_delta: f64,
}

/// Optimization objective
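///
/// Variants serialize in `snake_case`; e.g. `OptimizationObjective::MaxChaos`
/// becomes `"max_chaos"` on the wire. Illustrative check (assumes `serde_json`
/// is available):
///
/// ```ignore
/// let json = serde_json::to_string(&OptimizationObjective::MaxChaos).unwrap();
/// assert_eq!(json, "\"max_chaos\"");
/// ```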
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum OptimizationObjective {
    MaxChaos,      // Maximize chaos (for stress testing)
    Balanced,      // Balance chaos and stability
    SafeTesting,   // Minimize risk while still effective
    QuickRecovery, // Optimize for fast recovery
    MaxDetection,  // Maximize failure detection
}

/// Parameter bounds
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParameterBounds {
    pub min: f64,
    pub max: f64,
    pub step: Option<f64>,
}

/// Optimizer configuration
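///
/// Defaults (see the `Default` impl below) can be overridden per field, e.g.
/// (illustrative):
///
/// ```ignore
/// let config = OptimizerConfig {
///     objective: OptimizationObjective::SafeTesting,
///     min_runs: 25,
///     ..Default::default()
/// };
/// ```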
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizerConfig {
    pub objective: OptimizationObjective,
    pub min_runs: usize,
    pub confidence_threshold: f64,
    pub exploration_factor: f64,
    pub parameter_bounds: HashMap<String, ParameterBounds>,
    pub weights: ObjectiveWeights,
}

/// Weights for multi-objective optimization
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ObjectiveWeights {
    pub chaos_effectiveness: f64,
    pub system_stability: f64,
    pub recovery_time: f64,
    pub detection_rate: f64,
}

impl Default for OptimizerConfig {
    fn default() -> Self {
        let mut parameter_bounds = HashMap::new();

        // Common parameter bounds
        parameter_bounds.insert(
            "latency_ms".to_string(),
            ParameterBounds {
                min: 0.0,
                max: 5000.0,
                step: Some(10.0),
            },
        );
        parameter_bounds.insert(
            "error_rate".to_string(),
            ParameterBounds {
                min: 0.0,
                max: 1.0,
                step: Some(0.01),
            },
        );
        parameter_bounds.insert(
            "packet_loss".to_string(),
            ParameterBounds {
                min: 0.0,
                max: 1.0,
                step: Some(0.01),
            },
        );
        parameter_bounds.insert(
            "cpu_load".to_string(),
            ParameterBounds {
                min: 0.0,
                max: 1.0,
                step: Some(0.05),
            },
        );

        Self {
            objective: OptimizationObjective::Balanced,
            min_runs: 10,
            confidence_threshold: 0.7,
            exploration_factor: 0.2,
            parameter_bounds,
            weights: ObjectiveWeights {
                chaos_effectiveness: 0.3,
                system_stability: 0.4,
                recovery_time: 0.2,
                detection_rate: 0.1,
            },
        }
    }
}

/// ML-based parameter optimizer
pub struct ParameterOptimizer {
    config: OptimizerConfig,
    historical_runs: Vec<OrchestrationRun>,
}

impl ParameterOptimizer {
    /// Create a new optimizer
    pub fn new(config: OptimizerConfig) -> Self {
        Self {
            config,
            historical_runs: Vec::new(),
        }
    }

    /// Add historical run data
    pub fn add_run(&mut self, run: OrchestrationRun) {
        self.historical_runs.push(run);
    }

    /// Add multiple runs
    pub fn add_runs(&mut self, runs: Vec<OrchestrationRun>) {
        self.historical_runs.extend(runs);
    }

    /// Generate optimization recommendations
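    ///
    /// Returns `Err` until at least `min_runs` runs have been added; results are
    /// sorted by expected overall score improvement, best first. Usage sketch
    /// (`historical_runs` is a placeholder for data you have collected):
    ///
    /// ```ignore
    /// optimizer.add_runs(historical_runs); // needs >= config.min_runs entries
    /// let recs = optimizer.optimize().expect("not enough data yet");
    /// if let Some(best) = recs.first() {
    ///     println!("try {} = {:.2}", best.parameter, best.recommended_value);
    /// }
    /// ```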
    pub fn optimize(&self) -> Result<Vec<OptimizationRecommendation>, String> {
        if self.historical_runs.len() < self.config.min_runs {
            return Err(format!(
                "Insufficient data: need at least {} runs, have {}",
                self.config.min_runs,
                self.historical_runs.len()
            ));
        }

        let mut recommendations = Vec::new();

        // Extract all unique parameters
        let all_parameters = self.extract_parameter_names();

        for param_name in all_parameters {
            if let Some(recommendation) = self.optimize_parameter(&param_name)? {
                recommendations.push(recommendation);
            }
        }

        // Sort by expected impact, best first
        recommendations.sort_by(|a, b| {
            b.expected_impact
                .overall_score_delta
                .partial_cmp(&a.expected_impact.overall_score_delta)
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        Ok(recommendations)
    }

    /// Optimize a single parameter
    fn optimize_parameter(
        &self,
        param_name: &str,
    ) -> Result<Option<OptimizationRecommendation>, String> {
        // Collect all runs that used this parameter
        let relevant_runs: Vec<_> = self
            .historical_runs
            .iter()
            .filter(|run| run.parameters.contains_key(param_name))
            .collect();

        if relevant_runs.is_empty() {
            return Ok(None);
        }

        // Get parameter bounds
        let bounds = self
            .config
            .parameter_bounds
            .get(param_name)
            .ok_or_else(|| format!("No bounds defined for parameter '{}'", param_name))?;

        // Calculate a score for the parameter value observed in each run
        let mut value_scores: Vec<(f64, f64)> = Vec::new();

        for run in &relevant_runs {
            if let Some(&param_value) = run.parameters.get(param_name) {
                let score = self.calculate_run_score(run);
                value_scores.push((param_value, score));
            }
        }

        if value_scores.is_empty() {
            return Ok(None);
        }

        // Pick the best-scoring observed value, nudged by the exploration factor
        let optimal_value = self.find_optimal_value(&value_scores, bounds)?;

        // Use the mean observed value as the "current" baseline
        let current_value =
            value_scores.iter().map(|(v, _)| v).sum::<f64>() / value_scores.len() as f64;

        // Calculate confidence based on data density
        let confidence = self.calculate_confidence(&value_scores, optimal_value);

        if confidence < self.config.confidence_threshold {
            return Ok(None);
        }

        // Estimate expected impact
        let expected_impact =
            self.estimate_impact(param_name, current_value, optimal_value, &relevant_runs)?;

        // Generate reasoning
        let reasoning = self.generate_reasoning(
            param_name,
            current_value,
            optimal_value,
            &expected_impact,
            relevant_runs.len(),
        );

        Ok(Some(OptimizationRecommendation {
            parameter: param_name.to_string(),
            current_value: Some(current_value),
            recommended_value: optimal_value,
            confidence,
            reasoning,
            expected_impact,
            based_on_runs: relevant_runs.len(),
        }))
    }

    /// Calculate score for a run based on objective
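    ///
    /// Worked example (Balanced objective, default weights): chaos 0.6,
    /// stability 0.8, 500 ms recovery, and 5 detections vs 1 false positive
    /// gives `0.6*0.3 + 0.8*0.4 + (1.0 - 500/10000)*0.2 + (5/6)*0.1 ≈ 0.77`.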
    fn calculate_run_score(&self, run: &OrchestrationRun) -> f64 {
        let weights = &self.config.weights;

        let chaos_score = run.metrics.chaos_effectiveness;
        let stability_score = run.metrics.system_stability;
        // Normalize recovery time against a 10s ceiling (anything slower scores 0)
        let recovery_score = 1.0 - (run.metrics.recovery_time_ms as f64 / 10000.0).min(1.0);
        // Detection precision; falls back to a neutral 0.5 when there is no signal
        let detection_score = if run.metrics.failures_detected + run.metrics.false_positives > 0 {
            run.metrics.failures_detected as f64
                / (run.metrics.failures_detected + run.metrics.false_positives) as f64
        } else {
            0.5
        };

        // Apply objective-specific weightings
        let (chaos_w, stability_w, recovery_w, detection_w) = match self.config.objective {
            OptimizationObjective::MaxChaos => (0.7, 0.1, 0.1, 0.1),
            OptimizationObjective::Balanced => (
                weights.chaos_effectiveness,
                weights.system_stability,
                weights.recovery_time,
                weights.detection_rate,
            ),
            OptimizationObjective::SafeTesting => (0.2, 0.6, 0.1, 0.1),
            OptimizationObjective::QuickRecovery => (0.2, 0.3, 0.4, 0.1),
            OptimizationObjective::MaxDetection => (0.2, 0.2, 0.1, 0.5),
        };

        chaos_score * chaos_w
            + stability_score * stability_w
            + recovery_score * recovery_w
            + detection_score * detection_w
    }

    /// Find the optimal parameter value (best observed score with an exploration nudge)
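    ///
    /// Example: with bounds `[0, 5000]` and `exploration_factor` 0.2, a best
    /// observed value of 290 lies within 10% of the lower bound (< 500), so it
    /// is blended toward the sample mean as `290 * 0.8 + mean * 0.2`, then
    /// clamped to the bounds and rounded to `step`.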
    fn find_optimal_value(
        &self,
        value_scores: &[(f64, f64)],
        bounds: &ParameterBounds,
    ) -> Result<f64, String> {
        // Simple approach: take the value with the best observed score
        let best_value = value_scores
            .iter()
            .max_by(|(_, s1), (_, s2)| s1.partial_cmp(s2).unwrap_or(std::cmp::Ordering::Equal))
            .map(|(v, _)| *v)
            .ok_or("No valid values found")?;

        let exploration = self.config.exploration_factor;
        let range = bounds.max - bounds.min;

        let explored_values: Vec<f64> = value_scores.iter().map(|(v, _)| *v).collect();
        let mean = explored_values.iter().sum::<f64>() / explored_values.len() as f64;

        // If the best value sits near either bound, blend it toward the observed
        // mean so recommendations do not pile up at the extremes
        let optimal = if (best_value - bounds.min).abs() < range * 0.1
            || (best_value - bounds.max).abs() < range * 0.1
        {
            best_value * (1.0 - exploration) + mean * exploration
        } else {
            best_value
        };

        // Clamp to bounds
        let clamped = optimal.clamp(bounds.min, bounds.max);

        // Round to step if specified
        let final_value = if let Some(step) = bounds.step {
            (clamped / step).round() * step
        } else {
            clamped
        };

        Ok(final_value)
    }

    /// Calculate confidence based on data coverage
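    ///
    /// Example: 15 samples (15/20 = 0.75), a nearest sample within 10 units of
    /// the optimum (1.0), and score variance below 0.01 (0.9) gives
    /// `(0.75 + 1.0 + 0.9) / 3 ≈ 0.88`.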
    fn calculate_confidence(&self, value_scores: &[(f64, f64)], optimal_value: f64) -> f64 {
        if value_scores.is_empty() {
            return 0.0;
        }

        // Confidence based on:
        // 1. Number of samples (saturates at 20)
        let sample_confidence = (value_scores.len() as f64 / 20.0).min(1.0);

        // 2. How close the nearest sample is to the optimal value.
        // NOTE: these distance cutoffs are absolute and sized for latency-scale
        // parameters; for 0-1 parameters the nearest sample is always "close".
        let nearest_distance = value_scores
            .iter()
            .map(|(v, _)| (v - optimal_value).abs())
            .min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
            .unwrap_or(f64::MAX);

        let proximity_confidence = if nearest_distance < 10.0 {
            1.0
        } else if nearest_distance < 50.0 {
            0.8
        } else if nearest_distance < 100.0 {
            0.6
        } else {
            0.4
        };

        // 3. Score variance (lower variance means more consistent evidence)
        let scores: Vec<f64> = value_scores.iter().map(|(_, s)| *s).collect();
        let mean_score = scores.iter().sum::<f64>() / scores.len() as f64;
        let variance =
            scores.iter().map(|s| (s - mean_score).powi(2)).sum::<f64>() / scores.len() as f64;
        let consistency_confidence = if variance < 0.01 {
            0.9
        } else if variance < 0.05 {
            0.7
        } else {
            0.5
        };

        (sample_confidence + proximity_confidence + consistency_confidence) / 3.0
    }

    /// Estimate the impact of changing a parameter by comparing runs whose
    /// value for that parameter lies near the current vs. the recommended value
    fn estimate_impact(
        &self,
        param_name: &str,
        current_value: f64,
        optimal_value: f64,
        runs: &[&OrchestrationRun],
    ) -> Result<ExpectedImpact, String> {
        // Average a metric over a set of runs, falling back to a neutral 0.5
        // prior when no nearby runs exist
        fn avg(rs: &[&OrchestrationRun], metric: fn(&RunMetrics) -> f64) -> f64 {
            if rs.is_empty() {
                0.5
            } else {
                rs.iter().map(|r| metric(&r.metrics)).sum::<f64>() / rs.len() as f64
            }
        }

        // Find runs whose value for *this* parameter is close to the current and
        // optimal values. NOTE: the 10.0 neighborhood is an absolute threshold,
        // sized for latency-scale parameters.
        let current_runs: Vec<&OrchestrationRun> = runs
            .iter()
            .copied()
            .filter(|r| {
                r.parameters
                    .get(param_name)
                    .is_some_and(|&v| (v - current_value).abs() < 10.0)
            })
            .collect();

        let optimal_runs: Vec<&OrchestrationRun> = runs
            .iter()
            .copied()
            .filter(|r| {
                r.parameters
                    .get(param_name)
                    .is_some_and(|&v| (v - optimal_value).abs() < 10.0)
            })
            .collect();

        let current_avg_chaos = avg(&current_runs, |m| m.chaos_effectiveness);
        let optimal_avg_chaos = avg(&optimal_runs, |m| m.chaos_effectiveness);
        let current_avg_stability = avg(&current_runs, |m| m.system_stability);
        let optimal_avg_stability = avg(&optimal_runs, |m| m.system_stability);

        let chaos_delta = optimal_avg_chaos - current_avg_chaos;
        let stability_delta = optimal_avg_stability - current_avg_stability;

        // Calculate overall score delta
        let current_score = current_avg_chaos * self.config.weights.chaos_effectiveness
            + current_avg_stability * self.config.weights.system_stability;
        let optimal_score = optimal_avg_chaos * self.config.weights.chaos_effectiveness
            + optimal_avg_stability * self.config.weights.system_stability;

        Ok(ExpectedImpact {
            chaos_effectiveness_delta: chaos_delta,
            system_stability_delta: stability_delta,
            overall_score_delta: optimal_score - current_score,
        })
    }

    /// Generate human-readable reasoning
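    ///
    /// Example output (illustrative values): "Based on 15 historical runs, we
    /// recommend increasing 'latency_ms' from 190.00 to 270.00 (42.1% change).
    /// Expected impact: ..."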
    fn generate_reasoning(
        &self,
        param_name: &str,
        current_value: f64,
        optimal_value: f64,
        impact: &ExpectedImpact,
        sample_count: usize,
    ) -> String {
        let change_direction = if optimal_value > current_value {
            "increasing"
        } else if optimal_value < current_value {
            "decreasing"
        } else {
            "maintaining"
        };

        let change_pct = if current_value != 0.0 {
            ((optimal_value - current_value) / current_value * 100.0).abs()
        } else {
            0.0
        };

        let impact_desc = if impact.overall_score_delta > 0.1 {
            "a significant improvement"
        } else if impact.overall_score_delta > 0.05 {
            "a moderate improvement"
        } else if impact.overall_score_delta > 0.0 {
            "a slight improvement"
        } else {
            "minimal change"
        };

        format!(
            "Based on {} historical runs, we recommend {} '{}' from {:.2} to {:.2} ({:.1}% change). \
             Expected impact: {} in overall effectiveness (chaos: {:+.2}%, stability: {:+.2}%).",
            sample_count,
            change_direction,
            param_name,
            current_value,
            optimal_value,
            change_pct,
            impact_desc,
            impact.chaos_effectiveness_delta * 100.0,
            impact.system_stability_delta * 100.0
        )
    }

    /// Extract all unique parameter names
    fn extract_parameter_names(&self) -> Vec<String> {
        let mut params = std::collections::HashSet::new();
        for run in &self.historical_runs {
            for key in run.parameters.keys() {
                params.insert(key.clone());
            }
        }
        params.into_iter().collect()
    }

    /// Get number of historical runs
    pub fn run_count(&self) -> usize {
        self.historical_runs.len()
    }

    /// Clear historical data
    pub fn clear_runs(&mut self) {
        self.historical_runs.clear();
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn create_test_run(latency: f64, chaos_eff: f64, stability: f64) -> OrchestrationRun {
        let mut params = HashMap::new();
        params.insert("latency_ms".to_string(), latency);

        OrchestrationRun {
            id: format!("run-{}", latency),
            orchestration_id: "test-orch".to_string(),
            parameters: params,
            timestamp: Utc::now(),
            duration_ms: 1000,
            success: true,
            metrics: RunMetrics {
                chaos_effectiveness: chaos_eff,
                system_stability: stability,
                error_rate: 0.1,
                recovery_time_ms: 500,
                failures_detected: 5,
                false_positives: 1,
            },
        }
    }

    #[test]
    fn test_optimizer_creation() {
        let config = OptimizerConfig::default();
        let optimizer = ParameterOptimizer::new(config);
        assert_eq!(optimizer.run_count(), 0);
    }

    #[test]
    fn test_add_runs() {
        let config = OptimizerConfig::default();
        let mut optimizer = ParameterOptimizer::new(config);

        let runs = vec![
            create_test_run(100.0, 0.5, 0.8),
            create_test_run(200.0, 0.7, 0.6),
        ];

        optimizer.add_runs(runs);
        assert_eq!(optimizer.run_count(), 2);
    }

    #[test]
    fn test_optimize_with_sufficient_data() {
        let config = OptimizerConfig::default();
        let mut optimizer = ParameterOptimizer::new(config);

        // Add runs with varying latency values
        for i in 0..15 {
            let latency = 50.0 + (i as f64 * 20.0);
            let chaos_eff = 0.3 + (latency / 500.0).min(0.6);
            let stability = 0.9 - (latency / 1000.0).min(0.4);
            optimizer.add_run(create_test_run(latency, chaos_eff, stability));
        }

        let recommendations = optimizer.optimize().unwrap();
        assert!(!recommendations.is_empty());
        assert!(recommendations[0].confidence >= 0.0);
    }

    #[test]
    fn test_optimize_insufficient_data() {
        let config = OptimizerConfig::default();
        let mut optimizer = ParameterOptimizer::new(config);

        optimizer.add_run(create_test_run(100.0, 0.5, 0.8));

        let result = optimizer.optimize();
        assert!(result.is_err());
    }

    #[test]
    fn test_different_objectives() {
        let objectives = vec![
            OptimizationObjective::MaxChaos,
            OptimizationObjective::Balanced,
            OptimizationObjective::SafeTesting,
        ];

        for objective in objectives {
            let config = OptimizerConfig {
                objective,
                ..Default::default()
            };
            let optimizer = ParameterOptimizer::new(config);

            // Just verify it can be created with different objectives
            assert_eq!(optimizer.run_count(), 0);
        }
    }
}