1use super::config::PerformanceConfig;
4use super::meta_learning::MetaLearningResult;
5use super::TrainingMetrics;
6use crate::error::Result;
7use scirs2_core::ndarray::{Array1, Array2};
8use scirs2_core::numeric::Float;
9use serde::{Deserialize, Serialize};
10use std::collections::{BTreeMap, HashMap, VecDeque};
11use std::fmt::Debug;
12use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
13
/// Aggregated view of every metric family tracked for an optimizer session.
/// Each sub-collection is updated independently by the
/// `TransformerPerformanceTracker`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceMetrics {
    /// Epoch and meta-step statistics (loss, timing, convergence).
    pub training_metrics: TrainingMetricsCollection,
    /// Latency, throughput, and output-quality statistics for inference.
    pub inference_metrics: InferenceMetricsCollection,
    /// Peak/average memory usage and sample counters.
    pub memory_metrics: MemoryMetricsCollection,
    /// Step and per-operation wall-clock timings.
    pub timing_metrics: TimingMetricsCollection,
    /// Loss statistics, stability measures, and the alert log.
    pub quality_metrics: QualityMetricsCollection,
    /// CPU/memory/disk/network utilization histories.
    pub resource_metrics: ResourceMetricsCollection,
    /// Update-norm history and adaptive-learning counters.
    pub optimization_metrics: OptimizationMetricsCollection,
}
38
/// Central performance tracker for a transformer-based optimizer.
///
/// Collects bounded histories of losses, step timings, meta-learning
/// results, and memory samples; maintains sliding-window trend estimators;
/// and raises alerts when configured thresholds are crossed.
pub struct TransformerPerformanceTracker<T: Float + Debug + Send + Sync + 'static> {
    /// Tracker configuration (history bounds etc.).
    config: PerformanceConfig,
    /// Aggregated metric collections.
    metrics: PerformanceMetrics,
    /// Recent loss values, bounded by `config.max_history_size`.
    loss_history: VecDeque<f64>,
    /// Recent optimization-step durations, bounded by `config.max_history_size`.
    step_timings: VecDeque<Duration>,
    /// Recent meta-learning results (bounded history).
    meta_results_history: VecDeque<MetaLearningResult<T>>,
    /// Recent per-epoch training metrics, bounded by `config.max_history_size`.
    training_history: VecDeque<TrainingMetrics>,
    /// Recent timestamped memory samples, bounded by `config.max_history_size`.
    memory_samples: VecDeque<MemorySample>,
    /// Reference values set via `set_baselines`.
    baselines: PerformanceBaselines,
    /// Thresholds that trigger performance alerts.
    alert_thresholds: AlertThresholds,
    /// Sliding-window trend estimators (loss, convergence, memory, timing).
    trends: PerformanceTrends,
    /// Per-operation timing log filled by `profile_operation`.
    profiling_data: ProfilingData,
    /// Start of the current tracking session (reset by `reset`).
    session_start: Instant,
}
77
impl<T: Float + Debug + Send + Sync + 'static> Default for TransformerPerformanceTracker<T> {
    /// Equivalent to [`TransformerPerformanceTracker::new`].
    fn default() -> Self {
        Self::new()
    }
}
83
84impl<T: Float + Debug + Send + Sync + 'static> TransformerPerformanceTracker<T> {
85 pub fn new() -> Self {
87 let config = PerformanceConfig::default();
88
89 Self {
90 config,
91 metrics: PerformanceMetrics::new(),
92 loss_history: VecDeque::new(),
93 step_timings: VecDeque::new(),
94 meta_results_history: VecDeque::new(),
95 training_history: VecDeque::new(),
96 memory_samples: VecDeque::new(),
97 baselines: PerformanceBaselines::new(),
98 alert_thresholds: AlertThresholds::default(),
99 trends: PerformanceTrends::new(),
100 profiling_data: ProfilingData::new(),
101 session_start: Instant::now(),
102 }
103 }
104
105 pub fn with_config(config: PerformanceConfig) -> Self {
107 let mut tracker = Self::new();
108 tracker.config = config;
109 tracker
110 }
111
112 pub fn record_optimization_step(&mut self, duration: Duration, update: &Array1<T>) {
114 self.step_timings.push_back(duration);
116 if self.step_timings.len() > self.config.max_history_size {
117 self.step_timings.pop_front();
118 }
119
120 self.metrics.timing_metrics.record_step_time(duration);
122
123 let update_norm = self.compute_array_norm(update);
125 self.metrics
126 .optimization_metrics
127 .record_update_norm(update_norm);
128
129 self.trends.update_step_timing(duration);
131
132 self.check_performance_alerts(duration, update_norm);
134 }
135
136 pub fn record_training_epoch(&mut self, metrics: TrainingMetrics) {
138 self.training_history.push_back(metrics.clone());
139 if self.training_history.len() > self.config.max_history_size {
140 self.training_history.pop_front();
141 }
142
143 self.metrics.training_metrics.record_epoch(
145 metrics.loss,
146 metrics.training_time,
147 metrics.convergence_rate,
148 );
149
150 self.trends.update_training_loss(metrics.loss);
152 self.trends
153 .update_convergence_rate(metrics.convergence_rate);
154
155 self.check_convergence_alerts(&metrics);
157 }
158
159 pub fn record_meta_step(&mut self, result: MetaLearningResult<T>) {
161 self.meta_results_history.push_back(result.clone());
162 if self.meta_results_history.len() > 100 {
163 self.meta_results_history.pop_front();
164 }
165
166 self.metrics.training_metrics.record_meta_step(
168 result.meta_loss,
169 result.computation_time,
170 result.task_adaptations.len(),
171 );
172
173 self.trends.update_meta_loss(result.meta_loss);
175 self.trends
176 .update_adaptation_efficiency(result.task_adaptations.len() as f64);
177 }
178
179 pub fn record_inference(&mut self, input_size: usize, duration: Duration, output_quality: f64) {
181 self.metrics
182 .inference_metrics
183 .record_inference(input_size, duration, output_quality);
184
185 let throughput = input_size as f64 / duration.as_secs_f64();
187 self.metrics.inference_metrics.update_throughput(throughput);
188
189 self.trends.update_inference_latency(duration);
191 self.trends.update_inference_quality(output_quality);
192 }
193
194 pub fn record_memory_usage(&mut self, usage: MemoryUsage) {
196 let sample = MemorySample {
197 timestamp: std::time::SystemTime::now(),
198 usage,
199 };
200
201 self.memory_samples.push_back(sample.clone());
202 if self.memory_samples.len() > self.config.max_history_size {
203 self.memory_samples.pop_front();
204 }
205
206 self.metrics.memory_metrics.record_usage(usage);
208
209 self.trends.update_memory_usage(usage.total_memory);
211
212 self.check_memory_alerts(&usage);
214 }
215
216 pub fn record_loss(&mut self, loss: f64) {
218 self.loss_history.push_back(loss);
219 if self.loss_history.len() > self.config.max_history_size {
220 self.loss_history.pop_front();
221 }
222
223 self.metrics.quality_metrics.record_loss(loss);
225
226 self.trends.update_loss(loss);
228 }
229
230 pub fn profile_operation<F, R>(&mut self, operation_name: &str, operation: F) -> Result<R>
232 where
233 F: FnOnce() -> Result<R>,
234 {
235 let start_time = Instant::now();
236 let result = operation();
237 let duration = start_time.elapsed();
238
239 self.profiling_data
241 .record_operation(operation_name.to_string(), duration);
242
243 self.metrics
245 .timing_metrics
246 .record_operation_time(operation_name, duration);
247
248 result
249 }
250
251 pub fn generate_report(&self) -> PerformanceReport {
253 let session_duration = self.session_start.elapsed();
254
255 PerformanceReport {
256 session_duration,
257 total_optimization_steps: self.step_timings.len(),
258 total_training_epochs: self.training_history.len(),
259 total_meta_steps: self.meta_results_history.len(),
260
261 average_step_time: self.calculate_average_step_time(),
263 average_loss: self.calculate_average_loss(),
264 current_convergence_rate: self.calculate_current_convergence_rate(),
265
266 peak_memory_usage: self.calculate_peak_memory_usage(),
268 average_memory_usage: self.calculate_average_memory_usage(),
269
270 best_loss: self.calculate_best_loss(),
272 loss_improvement: self.calculate_loss_improvement(),
273
274 loss_trend: self.trends.get_loss_trend(),
276 convergence_trend: self.trends.get_convergence_trend(),
277 memory_trend: self.trends.get_memory_trend(),
278
279 performance_alerts: self.get_recent_alerts(),
281
282 cpu_utilization: self.metrics.resource_metrics.get_average_cpu_usage(),
284 memory_utilization: self.metrics.resource_metrics.get_average_memory_usage(),
285
286 quality_score: self.calculate_overall_quality_score(),
288
289 recommendations: self.generate_recommendations(),
291 }
292 }
293
294 pub fn get_loss_history(&self) -> &VecDeque<f64> {
296 &self.loss_history
297 }
298
299 pub fn get_metrics(&self) -> &PerformanceMetrics {
301 &self.metrics
302 }
303
304 pub fn get_trends(&self) -> &PerformanceTrends {
306 &self.trends
307 }
308
309 pub fn set_baselines(&mut self, baselines: PerformanceBaselines) {
311 self.baselines = baselines;
312 }
313
314 pub fn reset(&mut self) {
316 self.metrics = PerformanceMetrics::new();
317 self.loss_history.clear();
318 self.step_timings.clear();
319 self.meta_results_history.clear();
320 self.training_history.clear();
321 self.memory_samples.clear();
322 self.trends.reset();
323 self.profiling_data.reset();
324 self.session_start = Instant::now();
325 }
326
327 fn compute_array_norm(&self, array: &Array1<T>) -> f64 {
329 let sum_squares: T = array
330 .iter()
331 .map(|&x| x * x)
332 .fold(T::zero(), |acc, x| acc + x);
333 sum_squares.sqrt().to_f64().unwrap_or(0.0)
334 }
335
336 fn calculate_average_step_time(&self) -> Duration {
337 if self.step_timings.is_empty() {
338 Duration::new(0, 0)
339 } else {
340 let total: Duration = self.step_timings.iter().sum();
341 total / self.step_timings.len() as u32
342 }
343 }
344
345 fn calculate_average_loss(&self) -> f64 {
346 if self.loss_history.is_empty() {
347 0.0
348 } else {
349 self.loss_history.iter().sum::<f64>() / self.loss_history.len() as f64
350 }
351 }
352
353 fn calculate_current_convergence_rate(&self) -> f64 {
354 if let Some(recent_training) = self.training_history.back() {
355 recent_training.convergence_rate
356 } else {
357 0.0
358 }
359 }
360
361 fn calculate_peak_memory_usage(&self) -> usize {
362 self.memory_samples
363 .iter()
364 .map(|sample| sample.usage.total_memory)
365 .max()
366 .unwrap_or(0)
367 }
368
369 fn calculate_average_memory_usage(&self) -> f64 {
370 if self.memory_samples.is_empty() {
371 0.0
372 } else {
373 let total: usize = self
374 .memory_samples
375 .iter()
376 .map(|s| s.usage.total_memory)
377 .sum();
378 total as f64 / self.memory_samples.len() as f64
379 }
380 }
381
382 fn calculate_best_loss(&self) -> f64 {
383 self.loss_history
384 .iter()
385 .fold(f64::INFINITY, |min, &loss| min.min(loss))
386 }
387
388 fn calculate_loss_improvement(&self) -> f64 {
389 if self.loss_history.len() < 2 {
390 return 0.0;
391 }
392
393 let initial_loss = self.loss_history[0];
394 let final_loss = *self.loss_history.back().unwrap();
395
396 if initial_loss > 0.0 {
397 (initial_loss - final_loss) / initial_loss
398 } else {
399 0.0
400 }
401 }
402
403 fn calculate_overall_quality_score(&self) -> f64 {
404 let loss_score = 1.0 - (self.calculate_average_loss() / 10.0).min(1.0);
406 let convergence_score = self.calculate_current_convergence_rate();
407 let stability_score = 1.0 - self.trends.get_loss_volatility().min(1.0);
408
409 (loss_score * 0.4 + convergence_score * 0.3 + stability_score * 0.3).clamp(0.0, 1.0)
410 }
411
412 fn generate_recommendations(&self) -> Vec<String> {
413 let mut recommendations = Vec::new();
414
415 if self.trends.get_loss_trend() > 0.0 {
417 recommendations
418 .push("Loss is increasing. Consider reducing learning rate.".to_string());
419 }
420
421 if self.calculate_peak_memory_usage() > 1024 * 1024 * 1024 {
423 recommendations.push(
425 "High memory usage detected. Consider enabling memory compression.".to_string(),
426 );
427 }
428
429 if self.calculate_average_step_time() > Duration::from_millis(100) {
431 recommendations.push(
432 "Slow optimization steps. Consider reducing model size or batch size.".to_string(),
433 );
434 }
435
436 if self.calculate_current_convergence_rate() < 0.1 {
438 recommendations.push(
439 "Low convergence rate. Consider adjusting meta-learning parameters.".to_string(),
440 );
441 }
442
443 recommendations
444 }
445
446 fn check_performance_alerts(&mut self, duration: Duration, update_norm: f64) {
447 if duration > self.alert_thresholds.max_step_time {
449 self.record_alert(AlertType::SlowStep, format!("Step took {:?}", duration));
450 }
451
452 if update_norm > self.alert_thresholds.max_gradient_norm {
454 self.record_alert(
455 AlertType::GradientExplosion,
456 format!("Update norm: {:.6}", update_norm),
457 );
458 }
459
460 if update_norm < self.alert_thresholds.min_gradient_norm {
462 self.record_alert(
463 AlertType::GradientVanishing,
464 format!("Update norm: {:.6}", update_norm),
465 );
466 }
467 }
468
469 fn check_convergence_alerts(&mut self, metrics: &TrainingMetrics) {
470 if metrics.convergence_rate < self.alert_thresholds.min_convergence_rate {
472 self.record_alert(
473 AlertType::TrainingStagnation,
474 format!("Convergence rate: {:.6}", metrics.convergence_rate),
475 );
476 }
477
478 if let Some(previous) = self.training_history.iter().rev().nth(1) {
480 if metrics.loss > previous.loss * 1.1 {
481 self.record_alert(
483 AlertType::LossIncrease,
484 format!(
485 "Loss increased from {:.6} to {:.6}",
486 previous.loss, metrics.loss
487 ),
488 );
489 }
490 }
491 }
492
493 fn check_memory_alerts(&mut self, usage: &MemoryUsage) {
494 if usage.total_memory > self.alert_thresholds.max_memory_usage {
496 self.record_alert(
497 AlertType::HighMemoryUsage,
498 format!("Memory usage: {} bytes", usage.total_memory),
499 );
500 }
501
502 if let Some(previous) = self.memory_samples.back() {
504 let memory_increase = usage
505 .total_memory
506 .saturating_sub(previous.usage.total_memory);
507 if memory_increase > 1024 * 1024 * 100 {
508 self.record_alert(
510 AlertType::PossibleMemoryLeak,
511 format!("Memory increased by {} bytes", memory_increase),
512 );
513 }
514 }
515 }
516
517 fn record_alert(&mut self, alert_type: AlertType, message: String) {
518 let alert = PerformanceAlert {
519 alert_type,
520 message,
521 timestamp: std::time::SystemTime::now(),
522 };
523
524 self.metrics.quality_metrics.record_alert(alert);
525 }
526
527 fn get_recent_alerts(&self) -> Vec<PerformanceAlert> {
528 self.metrics.quality_metrics.get_recent_alerts(10)
529 }
530}
531
/// Cumulative statistics for training epochs and meta-learning steps.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrainingMetricsCollection {
    /// Number of epochs recorded.
    pub total_epochs: usize,
    /// Sum of all epoch durations.
    pub total_training_time: Duration,
    /// Running mean of epoch losses.
    pub average_loss: f64,
    /// Lowest loss seen across epochs and meta-steps.
    pub best_loss: f64,
    /// Convergence rate of the most recent epoch.
    pub convergence_rate: f64,
    /// Number of meta-learning steps recorded.
    pub meta_steps: usize,
    /// Total task adaptations across all meta-steps.
    pub task_adaptations: usize,
}

/// Cumulative statistics for inference calls.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InferenceMetricsCollection {
    /// Number of inferences recorded.
    pub total_inferences: usize,
    /// Running mean inference latency.
    pub average_latency: Duration,
    /// Highest observed throughput (items per second).
    pub peak_throughput: f64,
    /// Running mean output-quality score.
    pub average_quality: f64,
    /// Cache hit rate; not updated anywhere in this file.
    pub cache_hit_rate: f64,
}

/// Cumulative memory-usage statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryMetricsCollection {
    /// Highest total memory observed, in bytes.
    pub peak_usage: usize,
    /// Running mean of total memory, in bytes.
    pub average_usage: f64,
    /// Number of usage samples recorded.
    pub total_allocations: usize,
    /// Compression ratio (1.0 = uncompressed); not updated in this file.
    pub compression_ratio: f64,
    /// Fragmentation rate; not updated in this file.
    pub fragmentation_rate: f64,
}

/// Wall-clock timing statistics for steps and named operations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TimingMetricsCollection {
    /// Mean step duration; not updated in this file (the tracker computes
    /// averages from its own step history instead).
    pub average_step_time: Duration,
    /// Sum of all recorded step durations.
    pub total_computation_time: Duration,
    /// Most recent duration per named operation.
    pub operation_timings: HashMap<String, Duration>,
    /// Estimated profiling overhead; not updated in this file.
    pub profiling_overhead: Duration,
}

/// Quality-related statistics and the alert log.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityMetricsCollection {
    /// Min/max/mean/std-dev of recorded losses.
    pub loss_statistics: LossStatistics,
    /// Convergence-rate statistics.
    pub convergence_statistics: ConvergenceStatistics,
    /// Volatility / stability measures.
    pub stability_metrics: StabilityMetrics,
    /// Bounded log of raised alerts (newest at the back).
    pub performance_alerts: VecDeque<PerformanceAlert>,
}

/// System-resource utilization histories.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResourceMetricsCollection {
    /// CPU utilization samples; not populated in this file.
    pub cpu_usage_history: VecDeque<f64>,
    /// Memory utilization samples; not populated in this file.
    pub memory_usage_history: VecDeque<f64>,
    /// Disk read/write counters.
    pub disk_io_metrics: DiskIOMetrics,
    /// Network request/byte counters.
    pub network_metrics: NetworkMetrics,
}

/// Optimizer-update statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationMetricsCollection {
    /// Recent update L2 norms (bounded to 1000 entries).
    pub update_norm_history: VecDeque<f64>,
    /// Rate of parameter change; not updated in this file.
    pub parameter_change_rate: f64,
    /// Optimization efficiency score; not updated in this file.
    pub optimization_efficiency: f64,
    /// Counters for adaptive-learning adjustments.
    pub adaptive_learning_metrics: AdaptiveLearningMetrics,
}
593
/// A memory-usage reading paired with the wall-clock time it was taken.
#[derive(Debug, Clone)]
pub struct MemorySample {
    /// When the sample was taken.
    pub timestamp: std::time::SystemTime,
    /// The usage reading itself.
    pub usage: MemoryUsage,
}

/// Breakdown of memory consumption; all values are in bytes.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct MemoryUsage {
    /// Total memory in use.
    pub total_memory: usize,
    /// Memory attributed to the model.
    pub model_memory: usize,
    /// Memory held by caches.
    pub cache_memory: usize,
    /// Short-lived scratch memory.
    pub temporary_memory: usize,
}

/// Reference values a session can be compared against (set via
/// `TransformerPerformanceTracker::set_baselines`).
#[derive(Debug, Clone)]
pub struct PerformanceBaselines {
    /// Reference loss (`f64::INFINITY` means "no baseline yet").
    pub baseline_loss: f64,
    /// Reference step duration.
    pub baseline_step_time: Duration,
    /// Reference memory footprint in bytes.
    pub baseline_memory_usage: usize,
    /// Reference convergence rate.
    pub baseline_convergence_rate: f64,
}
616
impl Default for PerformanceBaselines {
    /// Equivalent to [`PerformanceBaselines::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl PerformanceBaselines {
    /// Creates "empty" baselines: an infinite loss and zeroed time, memory,
    /// and convergence references.
    pub fn new() -> Self {
        Self {
            baseline_loss: f64::INFINITY,
            baseline_step_time: Duration::new(0, 0),
            baseline_memory_usage: 0,
            baseline_convergence_rate: 0.0,
        }
    }
}
633
/// Limits that, when crossed, cause the tracker to raise alerts.
#[derive(Debug, Clone)]
pub struct AlertThresholds {
    /// Longest acceptable optimization-step duration.
    pub max_step_time: Duration,
    /// Lowest acceptable convergence rate before a stagnation alert.
    pub min_convergence_rate: f64,
    /// Memory ceiling in bytes before a high-memory alert.
    pub max_memory_usage: usize,
    /// Update-norm ceiling before a gradient-explosion alert.
    pub max_gradient_norm: f64,
    /// Update-norm floor before a gradient-vanishing alert.
    pub min_gradient_norm: f64,
}
642
643impl Default for AlertThresholds {
644 fn default() -> Self {
645 Self {
646 max_step_time: Duration::from_secs(1),
647 min_convergence_rate: 0.01,
648 max_memory_usage: 2 * 1024 * 1024 * 1024, max_gradient_norm: 10.0,
650 min_gradient_norm: 1e-8,
651 }
652 }
653}
654
/// Bundle of sliding-window trend estimators maintained by the tracker.
#[derive(Debug, Clone)]
pub struct PerformanceTrends {
    // Loss values (raw, training, and meta losses all feed this window).
    loss_trend: TrendAnalyzer,
    // Convergence rates and adaptation-efficiency values.
    convergence_trend: TrendAnalyzer,
    // Total memory usage in bytes, stored as f64.
    memory_trend: TrendAnalyzer,
    // Step and inference durations, in seconds.
    timing_trend: TrendAnalyzer,
}
662
impl Default for PerformanceTrends {
    /// Equivalent to [`PerformanceTrends::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl PerformanceTrends {
    /// Creates estimators with fixed window sizes: 100 samples for loss,
    /// memory, and timing; 50 for convergence.
    pub fn new() -> Self {
        Self {
            loss_trend: TrendAnalyzer::new(100),
            convergence_trend: TrendAnalyzer::new(50),
            memory_trend: TrendAnalyzer::new(100),
            timing_trend: TrendAnalyzer::new(100),
        }
    }

    /// Feeds a raw loss value into the shared loss window.
    pub fn update_loss(&mut self, loss: f64) {
        self.loss_trend.add_sample(loss);
    }

    /// Feeds a training-epoch loss into the shared loss window.
    pub fn update_training_loss(&mut self, loss: f64) {
        self.loss_trend.add_sample(loss);
    }

    /// Feeds a meta-learning loss into the shared loss window.
    pub fn update_meta_loss(&mut self, loss: f64) {
        self.loss_trend.add_sample(loss);
    }

    /// Feeds a convergence rate into the convergence window.
    pub fn update_convergence_rate(&mut self, rate: f64) {
        self.convergence_trend.add_sample(rate);
    }

    /// Feeds an adaptation count into the convergence window.
    /// NOTE(review): this mixes adaptation counts with convergence rates in
    /// the same window — confirm that is intentional.
    pub fn update_adaptation_efficiency(&mut self, efficiency: f64) {
        self.convergence_trend.add_sample(efficiency);
    }

    /// Feeds a memory reading (bytes) into the memory window.
    pub fn update_memory_usage(&mut self, usage: usize) {
        self.memory_trend.add_sample(usage as f64);
    }

    /// Feeds a step duration (seconds) into the timing window.
    pub fn update_step_timing(&mut self, duration: Duration) {
        self.timing_trend.add_sample(duration.as_secs_f64());
    }

    /// Feeds an inference latency (seconds) into the timing window.
    pub fn update_inference_latency(&mut self, duration: Duration) {
        self.timing_trend.add_sample(duration.as_secs_f64());
    }

    /// Currently a no-op; inference quality is not tracked as a trend.
    pub fn update_inference_quality(&mut self, _quality: f64) {
    }

    /// Slope of the loss window (positive = loss rising).
    pub fn get_loss_trend(&self) -> f64 {
        self.loss_trend.get_trend()
    }

    /// Slope of the convergence window.
    pub fn get_convergence_trend(&self) -> f64 {
        self.convergence_trend.get_trend()
    }

    /// Slope of the memory window.
    pub fn get_memory_trend(&self) -> f64 {
        self.memory_trend.get_trend()
    }

    /// Standard deviation of the loss window.
    pub fn get_loss_volatility(&self) -> f64 {
        self.loss_trend.get_volatility()
    }

    /// Clears all four windows.
    pub fn reset(&mut self) {
        self.loss_trend.reset();
        self.convergence_trend.reset();
        self.memory_trend.reset();
        self.timing_trend.reset();
    }
}
738
/// Sliding-window trend estimator.
///
/// Retains at most `max_samples` recent values and reports the slope of a
/// simple least-squares regression over them (`get_trend`) as well as their
/// population standard deviation (`get_volatility`).
#[derive(Debug, Clone)]
pub struct TrendAnalyzer {
    samples: VecDeque<f64>,
    max_samples: usize,
}

impl TrendAnalyzer {
    /// Creates an analyzer that keeps the most recent `max_samples` values.
    pub fn new(max_samples: usize) -> Self {
        Self {
            samples: VecDeque::new(),
            max_samples,
        }
    }

    /// Appends a value, evicting the oldest one once the window is full.
    pub fn add_sample(&mut self, value: f64) {
        self.samples.push_back(value);
        while self.samples.len() > self.max_samples {
            self.samples.pop_front();
        }
    }

    /// Least-squares slope of the samples against their window index.
    /// Returns 0.0 with fewer than two samples or a degenerate fit.
    pub fn get_trend(&self) -> f64 {
        if self.samples.len() < 2 {
            return 0.0;
        }

        let n = self.samples.len() as f64;
        let (mut x_sum, mut y_sum, mut xy_sum, mut x_sq_sum) = (0.0, 0.0, 0.0, 0.0);
        for (i, &y) in self.samples.iter().enumerate() {
            let x = i as f64;
            x_sum += x;
            y_sum += y;
            xy_sum += x * y;
            x_sq_sum += x * x;
        }

        let denominator = n * x_sq_sum - x_sum * x_sum;
        if denominator.abs() < 1e-10 {
            0.0
        } else {
            (n * xy_sum - x_sum * y_sum) / denominator
        }
    }

    /// Population standard deviation of the window; 0.0 with fewer than
    /// two samples.
    pub fn get_volatility(&self) -> f64 {
        if self.samples.len() < 2 {
            return 0.0;
        }

        let count = self.samples.len() as f64;
        let mean = self.samples.iter().sum::<f64>() / count;
        let variance = self
            .samples
            .iter()
            .map(|&x| (x - mean).powi(2))
            .sum::<f64>()
            / count;
        variance.sqrt()
    }

    /// Discards all retained samples.
    pub fn reset(&mut self) {
        self.samples.clear();
    }
}
807
/// Per-operation wall-clock timing log populated by `profile_operation`.
#[derive(Debug, Clone)]
pub struct ProfilingData {
    operation_timings: HashMap<String, VecDeque<Duration>>,
    total_operations: usize,
}

impl Default for ProfilingData {
    /// Equivalent to [`ProfilingData::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl ProfilingData {
    /// Creates an empty profiling log.
    pub fn new() -> Self {
        Self {
            operation_timings: HashMap::new(),
            total_operations: 0,
        }
    }

    /// Appends a duration under `operation`, keeping at most the 1000 most
    /// recent entries per operation name.
    pub fn record_operation(&mut self, operation: String, duration: Duration) {
        self.total_operations += 1;
        let entry = self.operation_timings.entry(operation).or_default();
        entry.push_back(duration);
        while entry.len() > 1000 {
            entry.pop_front();
        }
    }

    /// Mean recorded duration for `operation`, or `None` if that operation
    /// was never recorded.
    pub fn get_average_time(&self, operation: &str) -> Option<Duration> {
        let timings = self.operation_timings.get(operation)?;
        let total: Duration = timings.iter().sum();
        Some(total / timings.len() as u32)
    }

    /// Discards all recorded timings and resets the operation counter.
    pub fn reset(&mut self) {
        self.operation_timings.clear();
        self.total_operations = 0;
    }
}
849
/// A single threshold-violation event raised by the tracker.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceAlert {
    /// Which condition was violated.
    pub alert_type: AlertType,
    /// Human-readable detail (e.g. the offending value).
    pub message: String,
    /// When the alert was raised.
    pub timestamp: std::time::SystemTime,
}

/// Categories of performance alerts the tracker can raise.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum AlertType {
    /// An optimization step exceeded `max_step_time`.
    SlowStep,
    /// Update norm above `max_gradient_norm`.
    GradientExplosion,
    /// Update norm below `min_gradient_norm`.
    GradientVanishing,
    /// Convergence rate below `min_convergence_rate`.
    TrainingStagnation,
    /// Epoch loss rose more than 10% over the previous epoch.
    LossIncrease,
    /// Total memory above `max_memory_usage`.
    HighMemoryUsage,
    /// Memory grew sharply between consecutive samples.
    PossibleMemoryLeak,
    /// Reserved; not raised anywhere in this file.
    ConvergenceFailure,
}

/// Point-in-time session summary produced by `generate_report`.
#[derive(Debug, Clone)]
pub struct PerformanceReport {
    /// Time since the session started (or was last reset).
    pub session_duration: Duration,
    /// Number of retained optimization-step timings.
    pub total_optimization_steps: usize,
    /// Number of retained training epochs.
    pub total_training_epochs: usize,
    /// Number of retained meta-learning steps.
    pub total_meta_steps: usize,
    /// Mean optimization-step duration.
    pub average_step_time: Duration,
    /// Mean recorded loss.
    pub average_loss: f64,
    /// Convergence rate of the most recent epoch.
    pub current_convergence_rate: f64,
    /// Highest observed total memory, in bytes.
    pub peak_memory_usage: usize,
    /// Mean total memory across samples, in bytes.
    pub average_memory_usage: f64,
    /// Lowest recorded loss (`f64::INFINITY` if none recorded).
    pub best_loss: f64,
    /// Relative loss improvement from first to last retained loss.
    pub loss_improvement: f64,
    /// Slope of the loss trend window.
    pub loss_trend: f64,
    /// Slope of the convergence trend window.
    pub convergence_trend: f64,
    /// Slope of the memory trend window.
    pub memory_trend: f64,
    /// Up to the ten most recent alerts, newest first.
    pub performance_alerts: Vec<PerformanceAlert>,
    /// Mean of recorded CPU-utilization samples.
    pub cpu_utilization: f64,
    /// Mean of recorded memory-utilization samples.
    pub memory_utilization: f64,
    /// Heuristic overall quality score in [0, 1].
    pub quality_score: f64,
    /// Human-readable tuning suggestions.
    pub recommendations: Vec<String>,
}
893
/// Summary statistics over recorded losses.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LossStatistics {
    /// Mean loss; not updated anywhere in this file.
    pub mean: f64,
    /// Loss standard deviation; not updated anywhere in this file.
    pub std_dev: f64,
    /// Minimum recorded loss.
    pub min: f64,
    /// Maximum recorded loss.
    pub max: f64,
}

/// Summary statistics over convergence rates.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConvergenceStatistics {
    /// Mean convergence rate; not updated in this file.
    pub average_rate: f64,
    /// Best convergence rate; not updated in this file.
    pub best_rate: f64,
    /// Number of convergence episodes; not updated in this file.
    pub convergence_episodes: usize,
}

/// Stability measures for training dynamics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StabilityMetrics {
    /// Loss volatility; not updated in this file.
    pub loss_volatility: f64,
    /// Gradient stability; not updated in this file.
    pub gradient_stability: f64,
    /// Convergence stability; not updated in this file.
    pub convergence_stability: f64,
}

/// Disk I/O counters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiskIOMetrics {
    /// Number of read operations.
    pub total_reads: usize,
    /// Number of write operations.
    pub total_writes: usize,
    /// Bytes read.
    pub total_bytes_read: usize,
    /// Bytes written.
    pub total_bytes_written: usize,
}

/// Network I/O counters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NetworkMetrics {
    /// Number of requests issued.
    pub total_requests: usize,
    /// Bytes sent.
    pub total_bytes_sent: usize,
    /// Bytes received.
    pub total_bytes_received: usize,
    /// Mean request latency.
    pub average_latency: Duration,
}

/// Counters for adaptive-learning adjustments.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AdaptiveLearningMetrics {
    /// Learning-rate adjustments made.
    pub learning_rate_adjustments: usize,
    /// Batch-size adjustments made.
    pub batch_size_adjustments: usize,
    /// Architecture modifications made.
    pub architecture_modifications: usize,
}
939
impl Default for PerformanceMetrics {
    /// Equivalent to [`PerformanceMetrics::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl PerformanceMetrics {
    /// Creates a metrics bundle with every sub-collection zero-initialized.
    pub fn new() -> Self {
        Self {
            training_metrics: TrainingMetricsCollection::new(),
            inference_metrics: InferenceMetricsCollection::new(),
            memory_metrics: MemoryMetricsCollection::new(),
            timing_metrics: TimingMetricsCollection::new(),
            quality_metrics: QualityMetricsCollection::new(),
            resource_metrics: ResourceMetricsCollection::new(),
            optimization_metrics: OptimizationMetricsCollection::new(),
        }
    }
}
960
impl Default for TrainingMetricsCollection {
    /// Equivalent to [`TrainingMetricsCollection::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl TrainingMetricsCollection {
    /// Creates zeroed statistics (`best_loss` starts at infinity so the
    /// first recorded loss always becomes the best).
    pub fn new() -> Self {
        Self {
            total_epochs: 0,
            total_training_time: Duration::new(0, 0),
            average_loss: 0.0,
            best_loss: f64::INFINITY,
            convergence_rate: 0.0,
            meta_steps: 0,
            task_adaptations: 0,
        }
    }

    /// Folds one epoch into the running statistics using an incremental
    /// mean: new_avg = (old_avg * (n - 1) + loss) / n.
    pub fn record_epoch(&mut self, loss: f64, duration: Duration, convergence: f64) {
        self.total_epochs += 1;
        self.total_training_time += duration;
        self.average_loss =
            (self.average_loss * (self.total_epochs - 1) as f64 + loss) / self.total_epochs as f64;
        self.best_loss = self.best_loss.min(loss);
        // The most recent epoch's rate overwrites the previous one.
        self.convergence_rate = convergence;
    }

    /// Folds one meta-learning step into the statistics. The duration
    /// argument is currently unused.
    pub fn record_meta_step(&mut self, loss: f64, _duration: Duration, adaptations: usize) {
        self.meta_steps += 1;
        self.task_adaptations += adaptations;
        self.best_loss = self.best_loss.min(loss);
    }
}
995
impl Default for InferenceMetricsCollection {
    /// Equivalent to [`InferenceMetricsCollection::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl InferenceMetricsCollection {
    /// Creates zeroed inference statistics.
    pub fn new() -> Self {
        Self {
            total_inferences: 0,
            average_latency: Duration::new(0, 0),
            peak_throughput: 0.0,
            average_quality: 0.0,
            cache_hit_rate: 0.0,
        }
    }

    /// Folds one inference into the running latency and quality means.
    /// The input size is unused here — throughput is computed by the caller.
    pub fn record_inference(&mut self, _input_size: usize, duration: Duration, quality: f64) {
        self.total_inferences += 1;
        // Incremental means; the u32 casts assume fewer than 2^32 inferences.
        self.average_latency = (self.average_latency * (self.total_inferences - 1) as u32
            + duration)
            / self.total_inferences as u32;
        self.average_quality = (self.average_quality * (self.total_inferences - 1) as f64
            + quality)
            / self.total_inferences as f64;
    }

    /// Records a throughput observation, keeping the maximum seen.
    pub fn update_throughput(&mut self, throughput: f64) {
        self.peak_throughput = self.peak_throughput.max(throughput);
    }
}
1027
impl Default for MemoryMetricsCollection {
    /// Equivalent to [`MemoryMetricsCollection::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl MemoryMetricsCollection {
    /// Creates zeroed memory statistics (compression ratio starts at 1.0,
    /// i.e. uncompressed).
    pub fn new() -> Self {
        Self {
            peak_usage: 0,
            average_usage: 0.0,
            total_allocations: 0,
            compression_ratio: 1.0,
            fragmentation_rate: 0.0,
        }
    }

    /// Folds one usage sample into the peak and the incremental mean.
    pub fn record_usage(&mut self, usage: MemoryUsage) {
        self.peak_usage = self.peak_usage.max(usage.total_memory);
        self.total_allocations += 1;
        self.average_usage = (self.average_usage * (self.total_allocations - 1) as f64
            + usage.total_memory as f64)
            / self.total_allocations as f64;
    }
}
1053
impl Default for TimingMetricsCollection {
    /// Equivalent to [`TimingMetricsCollection::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl TimingMetricsCollection {
    /// Creates zeroed timing statistics.
    pub fn new() -> Self {
        Self {
            average_step_time: Duration::new(0, 0),
            total_computation_time: Duration::new(0, 0),
            operation_timings: HashMap::new(),
            profiling_overhead: Duration::new(0, 0),
        }
    }

    /// Adds a step duration to the cumulative computation time.
    /// NOTE(review): `average_step_time` is not maintained here — the
    /// tracker computes averages from its own step history instead.
    pub fn record_step_time(&mut self, duration: Duration) {
        self.total_computation_time += duration;
    }

    /// Stores the latest duration for a named operation, overwriting any
    /// previous entry for the same name.
    pub fn record_operation_time(&mut self, operation: &str, duration: Duration) {
        self.operation_timings
            .insert(operation.to_string(), duration);
    }
}
1080
impl Default for QualityMetricsCollection {
    /// Equivalent to [`QualityMetricsCollection::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl QualityMetricsCollection {
    /// Creates empty quality statistics; loss min/max start at +/- infinity
    /// so the first recorded loss initializes both.
    pub fn new() -> Self {
        Self {
            loss_statistics: LossStatistics {
                mean: 0.0,
                std_dev: 0.0,
                min: f64::INFINITY,
                max: f64::NEG_INFINITY,
            },
            convergence_statistics: ConvergenceStatistics {
                average_rate: 0.0,
                best_rate: 0.0,
                convergence_episodes: 0,
            },
            stability_metrics: StabilityMetrics {
                loss_volatility: 0.0,
                gradient_stability: 0.0,
                convergence_stability: 0.0,
            },
            performance_alerts: VecDeque::new(),
        }
    }

    /// Updates the loss min/max. NOTE(review): `mean` and `std_dev` are not
    /// maintained here (there is no sample count to update them with).
    pub fn record_loss(&mut self, loss: f64) {
        self.loss_statistics.min = self.loss_statistics.min.min(loss);
        self.loss_statistics.max = self.loss_statistics.max.max(loss);
    }

    /// Appends an alert, keeping at most the 100 most recent.
    pub fn record_alert(&mut self, alert: PerformanceAlert) {
        self.performance_alerts.push_back(alert);
        if self.performance_alerts.len() > 100 {
            self.performance_alerts.pop_front();
        }
    }

    /// Returns up to `count` most recent alerts, newest first.
    pub fn get_recent_alerts(&self, count: usize) -> Vec<PerformanceAlert> {
        self.performance_alerts
            .iter()
            .rev()
            .take(count)
            .cloned()
            .collect()
    }
}
1131
impl Default for ResourceMetricsCollection {
    /// Equivalent to [`ResourceMetricsCollection::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl ResourceMetricsCollection {
    /// Creates empty resource statistics.
    pub fn new() -> Self {
        Self {
            cpu_usage_history: VecDeque::new(),
            memory_usage_history: VecDeque::new(),
            disk_io_metrics: DiskIOMetrics {
                total_reads: 0,
                total_writes: 0,
                total_bytes_read: 0,
                total_bytes_written: 0,
            },
            network_metrics: NetworkMetrics {
                total_requests: 0,
                total_bytes_sent: 0,
                total_bytes_received: 0,
                average_latency: Duration::new(0, 0),
            },
        }
    }

    /// Mean of the CPU-usage samples; 0.0 when none recorded.
    /// NOTE(review): nothing in this file pushes CPU samples, so this
    /// returns 0.0 unless another module populates the history.
    pub fn get_average_cpu_usage(&self) -> f64 {
        if self.cpu_usage_history.is_empty() {
            0.0
        } else {
            self.cpu_usage_history.iter().sum::<f64>() / self.cpu_usage_history.len() as f64
        }
    }

    /// Mean of the memory-usage samples; 0.0 when none recorded.
    pub fn get_average_memory_usage(&self) -> f64 {
        if self.memory_usage_history.is_empty() {
            0.0
        } else {
            self.memory_usage_history.iter().sum::<f64>() / self.memory_usage_history.len() as f64
        }
    }
}
1174
impl Default for OptimizationMetricsCollection {
    /// Equivalent to [`OptimizationMetricsCollection::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl OptimizationMetricsCollection {
    /// Creates zeroed optimization statistics.
    pub fn new() -> Self {
        Self {
            update_norm_history: VecDeque::new(),
            parameter_change_rate: 0.0,
            optimization_efficiency: 0.0,
            adaptive_learning_metrics: AdaptiveLearningMetrics {
                learning_rate_adjustments: 0,
                batch_size_adjustments: 0,
                architecture_modifications: 0,
            },
        }
    }

    /// Appends an update norm, keeping at most the 1000 most recent.
    pub fn record_update_norm(&mut self, norm: f64) {
        self.update_norm_history.push_back(norm);
        if self.update_norm_history.len() > 1000 {
            self.update_norm_history.pop_front();
        }
    }
}
1202
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh tracker starts with empty histories.
    #[test]
    fn test_performance_tracker_creation() {
        let tracker = TransformerPerformanceTracker::<f32>::new();
        assert_eq!(tracker.loss_history.len(), 0);
        assert_eq!(tracker.step_timings.len(), 0);
    }

    /// Losses accumulate and the best (lowest) one is reported.
    #[test]
    fn test_record_loss() {
        let mut tracker = TransformerPerformanceTracker::<f32>::new();
        tracker.record_loss(1.5);
        tracker.record_loss(1.2);
        tracker.record_loss(0.9);

        assert_eq!(tracker.loss_history.len(), 3);
        assert_eq!(tracker.calculate_best_loss(), 0.9);
    }

    /// A strictly increasing series yields a positive regression slope.
    #[test]
    fn test_trend_analyzer() {
        let mut analyzer = TrendAnalyzer::new(10);

        for i in 0..5 {
            analyzer.add_sample(i as f64);
        }

        let trend = analyzer.get_trend();
        assert!(trend > 0.0);
    }

    /// Decreasing losses appear in the report as a positive improvement.
    #[test]
    fn test_performance_report_generation() {
        let mut tracker = TransformerPerformanceTracker::<f32>::new();
        tracker.record_loss(2.0);
        tracker.record_loss(1.5);
        tracker.record_loss(1.0);

        let report = tracker.generate_report();
        assert!(report.loss_improvement > 0.0);
        assert_eq!(report.best_loss, 1.0);
    }

    /// `profile_operation` passes the closure's result through unchanged.
    #[test]
    fn test_profiling() {
        let mut tracker = TransformerPerformanceTracker::<f32>::new();

        let result = tracker.profile_operation("test_op", || {
            std::thread::sleep(Duration::from_millis(10));
            Ok(42)
        });

        assert!(result.is_ok());
        assert_eq!(result.unwrap(), 42);
    }
}
1261}