eventuali_core/observability/profiling.rs

//! Performance profiling module for comprehensive performance analysis.
//!
//! This module provides:
//! - CPU profiling with sampling and flame graph generation
//! - Memory profiling with allocation tracking and leak detection
//! - I/O profiling with disk and network performance analysis
//! - Method-level performance tracking with call graphs
//! - Performance regression detection with alerting
//! - Bottleneck identification and optimization recommendations
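//!
//! # Example
//!
//! An illustrative sketch of the session lifecycle (assumes a Tokio runtime;
//! import paths are abbreviated and may differ from the crate's re-exports):
//!
//! ```ignore
//! use std::collections::HashMap;
//!
//! let profiler = PerformanceProfiler::new(ProfilingConfig::default());
//!
//! // Profile one operation end-to-end.
//! let session = profiler
//!     .start_profiling(ProfileType::Cpu, None, HashMap::new())
//!     .await?;
//! // ... run the workload being measured ...
//! let entry = profiler.end_profiling(&session).await?;
//! println!("profiled {:?} in {:?}", entry.profile_type, entry.duration);
//! ```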

use crate::error::{EventualiError, Result};
use crate::observability::CorrelationId;
use std::collections::{HashMap, VecDeque};
use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
use tokio::sync::RwLock;
use serde::{Deserialize, Serialize};

/// Types of profiling that can be performed
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ProfileType {
    /// CPU profiling with sampling
    Cpu,
    /// Memory allocation profiling
    Memory,
    /// I/O performance profiling
    Io,
    /// Method-level call profiling
    Method,
    /// Combined profiling (all types)
    Combined,
}

/// Profiling configuration options
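///
/// Illustrative override of the defaults via struct-update syntax (the
/// values shown are arbitrary, not tuning recommendations):
///
/// ```ignore
/// let config = ProfilingConfig {
///     cpu_sampling_interval_us: 500,     // sample every 500µs
///     regression_threshold_percent: 5.0, // flag >5% slowdowns
///     ..ProfilingConfig::default()
/// };
/// ```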
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfilingConfig {
    /// Enable profiling globally
    pub enabled: bool,
    /// Sampling interval for CPU profiling (microseconds)
    pub cpu_sampling_interval_us: u64,
    /// Memory allocation threshold for tracking (bytes)
    pub memory_allocation_threshold: usize,
    /// I/O operation threshold for tracking (microseconds)
    pub io_threshold_us: u64,
    /// Maximum number of stack frames to capture
    pub max_stack_frames: usize,
    /// Maximum duration to keep profiling data (seconds)
    pub data_retention_seconds: u64,
    /// Enable flame graph generation
    pub enable_flame_graphs: bool,
    /// Performance regression threshold (percentage)
    pub regression_threshold_percent: f64,
}

impl Default for ProfilingConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            cpu_sampling_interval_us: 1000,        // 1ms sampling
            memory_allocation_threshold: 1024,     // 1KB allocations
            io_threshold_us: 100,                  // 100µs I/O operations
            max_stack_frames: 32,
            data_retention_seconds: 3600,          // 1 hour
            enable_flame_graphs: true,
            regression_threshold_percent: 10.0,    // 10% regression
        }
    }
}

/// Performance profile entry representing a single profiling sample
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfileEntry {
    /// Unique ID for this profile entry
    pub id: String,
    /// Type of profiling
    pub profile_type: ProfileType,
    /// Timestamp when this sample was taken
    pub timestamp: SystemTime,
    /// Duration of the operation being profiled
    pub duration: Duration,
    /// Stack trace at the time of sampling
    pub stack_trace: Vec<String>,
    /// Memory information (if memory profiling)
    pub memory_info: Option<MemoryInfo>,
    /// I/O information (if I/O profiling)
    pub io_info: Option<IoInfo>,
    /// Correlation ID for tracing
    pub correlation_id: Option<CorrelationId>,
    /// Additional metadata
    pub metadata: HashMap<String, String>,
}

/// Memory profiling information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MemoryInfo {
    /// Total allocated bytes
    pub allocated_bytes: usize,
    /// Total deallocated bytes
    pub deallocated_bytes: usize,
    /// Current memory usage
    pub current_usage_bytes: usize,
    /// Peak memory usage
    pub peak_usage_bytes: usize,
    /// Number of allocations
    pub allocation_count: usize,
    /// Number of deallocations
    pub deallocation_count: usize,
}

/// I/O profiling information
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IoInfo {
    /// Type of I/O operation
    pub operation_type: String,
    /// Bytes read
    pub bytes_read: u64,
    /// Bytes written
    pub bytes_written: u64,
    /// Number of I/O operations
    pub operation_count: u64,
    /// Average I/O latency
    pub average_latency: Duration,
    /// I/O target (file path, network endpoint, etc.)
    pub target: String,
}

/// Call graph node for method-level profiling
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CallGraphNode {
    /// Method or function name
    pub name: String,
    /// Total time spent in this method (including children)
    pub total_time: Duration,
    /// Self time spent in this method (excluding children)
    pub self_time: Duration,
    /// Number of times this method was called
    pub call_count: u64,
    /// Children method calls
    pub children: HashMap<String, CallGraphNode>,
    /// Average time per call
    pub avg_time_per_call: Duration,
}

/// Performance regression detection result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RegressionDetection {
    /// Operation or method name
    pub operation: String,
    /// Current performance metrics
    pub current_metrics: PerformanceSnapshot,
    /// Baseline performance metrics
    pub baseline_metrics: PerformanceSnapshot,
    /// Percentage change from baseline
    pub change_percent: f64,
    /// Whether this is considered a regression
    pub is_regression: bool,
    /// Severity level of the regression
    pub severity: RegressionSeverity,
    /// Recommended actions
    pub recommendations: Vec<String>,
}

/// Performance snapshot for comparison
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceSnapshot {
    /// Average execution time
    pub avg_execution_time: Duration,
    /// 95th percentile execution time
    pub p95_execution_time: Duration,
    /// 99th percentile execution time
    pub p99_execution_time: Duration,
    /// Throughput (operations per second)
    pub throughput: f64,
    /// Memory usage
    pub memory_usage_bytes: usize,
    /// Error rate
    pub error_rate: f64,
    /// Timestamp of snapshot
    pub timestamp: SystemTime,
}

/// Severity levels for performance regressions
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RegressionSeverity {
    Low,      // 10-25% regression
    Medium,   // 25-50% regression
    High,     // 50-100% regression
    Critical, // >100% regression
}

/// Flame graph data structure
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FlameGraph {
    /// Root node of the flame graph
    pub root: FlameGraphNode,
    /// Total duration represented by the flame graph
    pub total_duration: Duration,
    /// Number of samples in the flame graph
    pub sample_count: usize,
    /// Metadata about the flame graph
    pub metadata: HashMap<String, String>,
}

/// Individual node in a flame graph
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FlameGraphNode {
    /// Name of the function/method
    pub name: String,
    /// Total time spent in this node and its children
    pub total_time: Duration,
    /// Self time spent in this node only
    pub self_time: Duration,
    /// Number of samples for this node
    pub sample_count: usize,
    /// Child nodes
    pub children: HashMap<String, FlameGraphNode>,
    /// Percentage of total execution time
    pub percentage: f64,
}

/// Bottleneck identification result
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BottleneckAnalysis {
    /// Top bottlenecks found
    pub bottlenecks: Vec<Bottleneck>,
    /// Analysis timestamp
    pub timestamp: SystemTime,
    /// Total analysis duration
    pub analysis_duration: Duration,
    /// Optimization suggestions
    pub optimization_suggestions: Vec<OptimizationSuggestion>,
}

/// Individual bottleneck identified
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Bottleneck {
    /// Location of the bottleneck (function, method, etc.)
    pub location: String,
    /// Type of bottleneck
    pub bottleneck_type: BottleneckType,
    /// Impact score (0-100)
    pub impact_score: f64,
    /// Time spent in this bottleneck
    pub time_spent: Duration,
    /// Percentage of total execution time
    pub percentage_of_total: f64,
    /// Call frequency
    pub call_frequency: u64,
    /// Description of the bottleneck
    pub description: String,
}

/// Types of bottlenecks that can be detected
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum BottleneckType {
    /// CPU-intensive computation
    Cpu,
    /// Memory allocation/deallocation
    Memory,
    /// I/O operations
    Io,
    /// Lock contention
    Lock,
    /// Network operations
    Network,
    /// Database operations
    Database,
    /// Serialization/deserialization
    Serialization,
}

/// Optimization suggestion
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationSuggestion {
    /// Target area for optimization
    pub target: String,
    /// Type of optimization
    pub optimization_type: String,
    /// Expected impact
    pub expected_impact: String,
    /// Implementation effort
    pub effort_level: String,
    /// Detailed description
    pub description: String,
    /// Code examples or references
    pub examples: Vec<String>,
}

/// Main performance profiler service
#[derive(Debug)]
pub struct PerformanceProfiler {
    config: ProfilingConfig,
    profile_data: Arc<RwLock<VecDeque<ProfileEntry>>>,
    active_profiles: Arc<RwLock<HashMap<String, ProfileSession>>>,
    baseline_metrics: Arc<RwLock<HashMap<String, PerformanceSnapshot>>>,
    regression_history: Arc<RwLock<VecDeque<RegressionDetection>>>,
    #[allow(dead_code)] // Call graph for method-level analysis (populated but not yet traversed)
    call_graph: Arc<RwLock<CallGraphNode>>,
}

/// Active profiling session
#[derive(Debug, Clone)]
pub struct ProfileSession {
    pub id: String,
    pub profile_type: ProfileType,
    pub start_time: Instant,
    pub correlation_id: Option<CorrelationId>,
    pub metadata: HashMap<String, String>,
}

impl PerformanceProfiler {
    /// Create a new performance profiler with the given configuration
    pub fn new(config: ProfilingConfig) -> Self {
        Self {
            config,
            profile_data: Arc::new(RwLock::new(VecDeque::new())),
            active_profiles: Arc::new(RwLock::new(HashMap::new())),
            baseline_metrics: Arc::new(RwLock::new(HashMap::new())),
            regression_history: Arc::new(RwLock::new(VecDeque::new())),
            call_graph: Arc::new(RwLock::new(CallGraphNode {
                name: "root".to_string(),
                total_time: Duration::ZERO,
                self_time: Duration::ZERO,
                call_count: 0,
                children: HashMap::new(),
                avg_time_per_call: Duration::ZERO,
            })),
        }
    }

    /// Start a new profiling session
    pub async fn start_profiling(
        &self,
        profile_type: ProfileType,
        correlation_id: Option<CorrelationId>,
        metadata: HashMap<String, String>,
    ) -> Result<String> {
        if !self.config.enabled {
            return Err(EventualiError::InvalidState("Profiling is disabled".to_string()));
        }

        let session_id = uuid::Uuid::new_v4().to_string();
        let session = ProfileSession {
            id: session_id.clone(),
            profile_type,
            start_time: Instant::now(),
            correlation_id,
            metadata,
        };

        let mut active_profiles = self.active_profiles.write().await;
        active_profiles.insert(session_id.clone(), session);

        tracing::info!("Started profiling session: {} (type: {:?})", session_id, profile_type);
        Ok(session_id)
    }

    /// End a profiling session and collect results
    pub async fn end_profiling(&self, session_id: &str) -> Result<ProfileEntry> {
        let mut active_profiles = self.active_profiles.write().await;
        let session = active_profiles.remove(session_id)
            .ok_or_else(|| EventualiError::InvalidState(format!("Profile session not found: {session_id}")))?;

        let duration = session.start_time.elapsed();
        let stack_trace = self.capture_stack_trace().await;

        let memory_info = if session.profile_type == ProfileType::Memory || session.profile_type == ProfileType::Combined {
            Some(self.collect_memory_info().await?)
        } else {
            None
        };

        let io_info = if session.profile_type == ProfileType::Io || session.profile_type == ProfileType::Combined {
            Some(self.collect_io_info().await?)
        } else {
            None
        };

        let entry = ProfileEntry {
            id: session_id.to_string(),
            profile_type: session.profile_type,
            timestamp: SystemTime::now(),
            duration,
            stack_trace,
            memory_info,
            io_info,
            correlation_id: session.correlation_id,
            metadata: session.metadata,
        };

        // Store the profile entry
        let mut profile_data = self.profile_data.write().await;
        profile_data.push_back(entry.clone());

        // Cleanup old data if necessary
        self.cleanup_old_data(&mut profile_data).await;

        tracing::info!("Ended profiling session: {} (duration: {:?})", session_id, duration);
        Ok(entry)
    }

    /// Capture current stack trace
    async fn capture_stack_trace(&self) -> Vec<String> {
        // In a real implementation, this would use platform-specific APIs
        // to capture the actual stack trace. For now, we'll simulate it.
        vec![
            "eventuali::event_store::append_events".to_string(),
            "eventuali::aggregate::apply_event".to_string(),
            "eventuali::performance::profile_operation".to_string(),
        ]
    }

    /// Collect current memory information
    async fn collect_memory_info(&self) -> Result<MemoryInfo> {
        // In a real implementation, this would collect actual memory statistics
        // using platform-specific APIs or memory profiling libraries
        Ok(MemoryInfo {
            allocated_bytes: 1024 * 1024,      // 1MB
            deallocated_bytes: 512 * 1024,     // 512KB
            current_usage_bytes: 512 * 1024,   // 512KB
            peak_usage_bytes: 2 * 1024 * 1024, // 2MB
            allocation_count: 100,
            deallocation_count: 50,
        })
    }

    /// Collect current I/O information
    async fn collect_io_info(&self) -> Result<IoInfo> {
        // In a real implementation, this would collect actual I/O statistics
        Ok(IoInfo {
            operation_type: "database_write".to_string(),
            bytes_read: 2048,
            bytes_written: 4096,
            operation_count: 10,
            average_latency: Duration::from_micros(250),
            target: "sqlite://events.db".to_string(),
        })
    }

    /// Generate a flame graph from collected profile data
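    ///
    /// Illustrative usage, assuming a `profiler` that has already collected
    /// samples (not a verbatim test):
    ///
    /// ```ignore
    /// let graph = profiler.generate_flame_graph(ProfileType::Cpu, None).await?;
    /// for (name, child) in &graph.root.children {
    ///     println!("{name}: {:.1}% ({} samples)", child.percentage, child.sample_count);
    /// }
    /// ```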
    pub async fn generate_flame_graph(
        &self,
        profile_type: ProfileType,
        time_range: Option<(SystemTime, SystemTime)>,
    ) -> Result<FlameGraph> {
        if !self.config.enable_flame_graphs {
            return Err(EventualiError::InvalidState("Flame graph generation is disabled".to_string()));
        }

        let profile_data = self.profile_data.read().await;
        let filtered_data: Vec<_> = profile_data.iter()
            .filter(|entry| {
                entry.profile_type == profile_type || profile_type == ProfileType::Combined
            })
            .filter(|entry| {
                if let Some((start, end)) = time_range {
                    entry.timestamp >= start && entry.timestamp <= end
                } else {
                    true
                }
            })
            .collect();

        let mut root = FlameGraphNode {
            name: "root".to_string(),
            total_time: Duration::ZERO,
            self_time: Duration::ZERO,
            sample_count: 0,
            children: HashMap::new(),
            percentage: 100.0,
        };

        let total_samples = filtered_data.len();
        let mut total_duration = Duration::ZERO;

        for entry in &filtered_data {
            total_duration += entry.duration;
            self.add_to_flame_graph(&mut root, &entry.stack_trace, entry.duration).await;
        }

        // Calculate percentages
        self.calculate_flame_graph_percentages(&mut root, total_duration).await;

        Ok(FlameGraph {
            root,
            total_duration,
            sample_count: total_samples,
            metadata: HashMap::from([
                ("profile_type".to_string(), format!("{profile_type:?}")),
                ("generation_time".to_string(), SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs().to_string()),
            ]),
        })
    }

    /// Add a stack trace to the flame graph
    async fn add_to_flame_graph(&self, node: &mut FlameGraphNode, stack_trace: &[String], duration: Duration) {
        if stack_trace.is_empty() {
            node.self_time += duration;
            node.sample_count += 1;
            return;
        }

        let frame = &stack_trace[0];
        let child = node.children.entry(frame.clone()).or_insert_with(|| FlameGraphNode {
            name: frame.clone(),
            total_time: Duration::ZERO,
            self_time: Duration::ZERO,
            sample_count: 0,
            children: HashMap::new(),
            percentage: 0.0,
        });

        child.total_time += duration;
        child.sample_count += 1;

        if stack_trace.len() > 1 {
            // Box::pin breaks the type cycle that a directly recursive async fn would create
            Box::pin(self.add_to_flame_graph(child, &stack_trace[1..], duration)).await;
        } else {
            child.self_time += duration;
        }
    }

    /// Calculate percentages for flame graph nodes
    async fn calculate_flame_graph_percentages(&self, node: &mut FlameGraphNode, total_duration: Duration) {
        if total_duration > Duration::ZERO {
            node.percentage = (node.total_time.as_nanos() as f64 / total_duration.as_nanos() as f64) * 100.0;
        }

        for child in node.children.values_mut() {
            Box::pin(self.calculate_flame_graph_percentages(child, total_duration)).await;
        }
    }

    /// Detect performance regressions by comparing current metrics with baselines
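    ///
    /// Sketch of the intended workflow: record a baseline first, then compare
    /// later runs against it (the operation name is illustrative):
    ///
    /// ```ignore
    /// profiler.set_baseline("append_events").await?;
    /// // ... more profiles are collected over time ...
    /// if let Some(detection) = profiler.detect_regressions("append_events").await? {
    ///     if detection.is_regression {
    ///         eprintln!("{} is {:.1}% slower ({:?})",
    ///             detection.operation, detection.change_percent, detection.severity);
    ///     }
    /// }
    /// ```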
    pub async fn detect_regressions(&self, operation: &str) -> Result<Option<RegressionDetection>> {
        let current_metrics = self.collect_current_metrics(operation).await?;

        let baseline_metrics = self.baseline_metrics.read().await;
        if let Some(baseline) = baseline_metrics.get(operation) {
            // An empty baseline (no samples yet) carries no signal; skip it
            // rather than dividing by zero below.
            let baseline_nanos = baseline.avg_execution_time.as_nanos() as f64;
            if baseline_nanos == 0.0 {
                return Ok(None);
            }
            let change_percent = ((current_metrics.avg_execution_time.as_nanos() as f64
                - baseline_nanos) / baseline_nanos) * 100.0;

            let is_regression = change_percent > self.config.regression_threshold_percent;
            let severity = match change_percent {
                x if x > 100.0 => RegressionSeverity::Critical,
                x if x > 50.0 => RegressionSeverity::High,
                x if x > 25.0 => RegressionSeverity::Medium,
                _ => RegressionSeverity::Low,
            };

            let recommendations = self.generate_optimization_recommendations(operation, &current_metrics, baseline).await;

            let detection = RegressionDetection {
                operation: operation.to_string(),
                current_metrics,
                baseline_metrics: baseline.clone(),
                change_percent,
                is_regression,
                severity,
                recommendations,
            };

            if is_regression {
                let mut regression_history = self.regression_history.write().await;
                regression_history.push_back(detection.clone());

                // Keep only recent regressions
                while regression_history.len() > 1000 {
                    regression_history.pop_front();
                }

                tracing::warn!("Performance regression detected for {}: {:.2}% slower", operation, change_percent);
            }

            Ok(Some(detection))
        } else {
            Ok(None)
        }
    }

    /// Collect current performance metrics for an operation
    async fn collect_current_metrics(&self, operation: &str) -> Result<PerformanceSnapshot> {
        let profile_data = self.profile_data.read().await;
        let operation_data: Vec<_> = profile_data.iter()
            .filter(|entry| entry.metadata.get("operation").is_some_and(|op| op == operation))
            .collect();

        if operation_data.is_empty() {
            return Ok(PerformanceSnapshot {
                avg_execution_time: Duration::ZERO,
                p95_execution_time: Duration::ZERO,
                p99_execution_time: Duration::ZERO,
                throughput: 0.0,
                memory_usage_bytes: 0,
                error_rate: 0.0,
                timestamp: SystemTime::now(),
            });
        }

        let mut durations: Vec<Duration> = operation_data.iter().map(|entry| entry.duration).collect();
        durations.sort();

        let avg_execution_time = Duration::from_nanos(
            (durations.iter().map(|d| d.as_nanos()).sum::<u128>() / durations.len() as u128) as u64
        );

        let p95_index = (durations.len() as f64 * 0.95) as usize;
        let p99_index = (durations.len() as f64 * 0.99) as usize;

        let p95_execution_time = durations.get(p95_index.min(durations.len() - 1)).copied().unwrap_or(Duration::ZERO);
        let p99_execution_time = durations.get(p99_index.min(durations.len() - 1)).copied().unwrap_or(Duration::ZERO);
        // `durations` cannot be empty here (we returned early above), but the
        // average can still round to zero; guard the division instead.
        let throughput = if avg_execution_time > Duration::ZERO {
            1.0 / avg_execution_time.as_secs_f64()
        } else {
            0.0
        };

        let memory_usage_bytes = operation_data.iter()
            .filter_map(|entry| entry.memory_info.as_ref())
            .map(|info| info.current_usage_bytes)
            .max()
            .unwrap_or(0);

        Ok(PerformanceSnapshot {
            avg_execution_time,
            p95_execution_time,
            p99_execution_time,
            throughput,
            memory_usage_bytes,
            error_rate: 0.0, // Would be calculated from error metrics
            timestamp: SystemTime::now(),
        })
    }

    /// Generate optimization recommendations
    async fn generate_optimization_recommendations(
        &self,
        _operation: &str,
        current: &PerformanceSnapshot,
        baseline: &PerformanceSnapshot,
    ) -> Vec<String> {
        let mut recommendations = Vec::new();

        // Performance degradation recommendations
        if current.avg_execution_time > baseline.avg_execution_time * 2 {
            recommendations.push("Consider profiling CPU usage - execution time has doubled".to_string());
        }

        // Memory usage recommendations
        if current.memory_usage_bytes > baseline.memory_usage_bytes * 2 {
            recommendations.push("Memory usage has significantly increased - check for memory leaks".to_string());
        }

        // Throughput recommendations
        if current.throughput < baseline.throughput * 0.5 {
            recommendations.push("Throughput has dropped significantly - consider optimizing bottlenecks".to_string());
        }

        // General recommendations
        recommendations.push("Use flame graphs to identify specific bottlenecks".to_string());
        recommendations.push("Consider enabling detailed I/O profiling".to_string());
        recommendations.push("Review recent code changes for performance impact".to_string());

        recommendations
    }

    /// Identify bottlenecks in the current profile data
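    ///
    /// Illustrative usage over previously collected data:
    ///
    /// ```ignore
    /// let analysis = profiler.identify_bottlenecks(ProfileType::Combined).await?;
    /// for b in &analysis.bottlenecks {
    ///     println!("{} ({:?}): {:.1}% of total time",
    ///         b.location, b.bottleneck_type, b.percentage_of_total);
    /// }
    /// ```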
    pub async fn identify_bottlenecks(&self, profile_type: ProfileType) -> Result<BottleneckAnalysis> {
        let start_time = Instant::now();
        let profile_data = self.profile_data.read().await;

        let filtered_data: Vec<_> = profile_data.iter()
            .filter(|entry| entry.profile_type == profile_type || profile_type == ProfileType::Combined)
            .collect();

        let mut bottlenecks = Vec::new();
        let mut function_times: HashMap<String, (Duration, u64)> = HashMap::new();
        let total_time: Duration = filtered_data.iter().map(|entry| entry.duration).sum();

        // Accumulate time and call counts per stack frame
        for entry in &filtered_data {
            for frame in &entry.stack_trace {
                let stats = function_times.entry(frame.clone()).or_insert((Duration::ZERO, 0));
                stats.0 += entry.duration;
                stats.1 += 1;
            }
        }

        // Identify top bottlenecks
        let mut sorted_functions: Vec<_> = function_times.iter().collect();
        sorted_functions.sort_by(|a, b| b.1.0.cmp(&a.1.0));

        for (function, (time, count)) in sorted_functions.iter().take(10) {
            // Guard against a zero total (e.g. all sampled durations were zero)
            let percentage = if total_time > Duration::ZERO {
                (time.as_nanos() as f64 / total_time.as_nanos() as f64) * 100.0
            } else {
                0.0
            };
            let bottleneck_type = self.classify_bottleneck_type(function);

            bottlenecks.push(Bottleneck {
                location: (*function).clone(),
                bottleneck_type,
                impact_score: percentage,
                time_spent: *time,
                percentage_of_total: percentage,
                call_frequency: *count,
                description: format!("Function {function} consumes {percentage:.2}% of total execution time"),
            });
        }

        let optimization_suggestions = self.generate_bottleneck_optimizations(&bottlenecks).await;

        Ok(BottleneckAnalysis {
            bottlenecks,
            timestamp: SystemTime::now(),
            analysis_duration: start_time.elapsed(),
            optimization_suggestions,
        })
    }

    /// Classify the type of bottleneck based on function name
    fn classify_bottleneck_type(&self, function_name: &str) -> BottleneckType {
        if function_name.contains("database") || function_name.contains("sql") {
            BottleneckType::Database
        } else if function_name.contains("serialize") || function_name.contains("deserialize") {
            BottleneckType::Serialization
        } else if function_name.contains("network") || function_name.contains("http") {
            BottleneckType::Network
        } else if function_name.contains("lock") || function_name.contains("mutex") {
            BottleneckType::Lock
        } else if function_name.contains("io") || function_name.contains("read") || function_name.contains("write") {
            BottleneckType::Io
        } else if function_name.contains("alloc") || function_name.contains("free") {
            BottleneckType::Memory
        } else {
            BottleneckType::Cpu
        }
    }

    /// Generate optimization suggestions for bottlenecks
    async fn generate_bottleneck_optimizations(&self, bottlenecks: &[Bottleneck]) -> Vec<OptimizationSuggestion> {
        let mut suggestions = Vec::new();

        for bottleneck in bottlenecks {
            let suggestion = match bottleneck.bottleneck_type {
                BottleneckType::Database => OptimizationSuggestion {
                    target: bottleneck.location.clone(),
                    optimization_type: "Database Optimization".to_string(),
                    expected_impact: "20-50% performance improvement".to_string(),
                    effort_level: "Medium".to_string(),
                    description: "Add database indexes, optimize queries, consider connection pooling".to_string(),
                    examples: vec![
                        "CREATE INDEX idx_events_aggregate_id ON events(aggregate_id)".to_string(),
                        "Use prepared statements for repeated queries".to_string(),
                    ],
                },
                BottleneckType::Serialization => OptimizationSuggestion {
                    target: bottleneck.location.clone(),
                    optimization_type: "Serialization Optimization".to_string(),
                    expected_impact: "10-30% performance improvement".to_string(),
                    effort_level: "Low".to_string(),
                    description: "Use more efficient serialization formats or optimize serialization code".to_string(),
                    examples: vec![
                        "Consider using Protocol Buffers instead of JSON".to_string(),
                        "Implement custom serialization for hot paths".to_string(),
                    ],
                },
                BottleneckType::Memory => OptimizationSuggestion {
                    target: bottleneck.location.clone(),
                    optimization_type: "Memory Optimization".to_string(),
                    expected_impact: "15-40% performance improvement".to_string(),
                    effort_level: "Medium".to_string(),
                    description: "Optimize memory allocation patterns, use object pooling".to_string(),
                    examples: vec![
                        "Use Vec::with_capacity() to pre-allocate vectors".to_string(),
                        "Implement object pooling for frequently allocated objects".to_string(),
                    ],
                },
                _ => OptimizationSuggestion {
                    target: bottleneck.location.clone(),
                    optimization_type: "General Optimization".to_string(),
                    expected_impact: "5-20% performance improvement".to_string(),
                    effort_level: "Variable".to_string(),
                    description: "Profile the specific function to identify optimization opportunities".to_string(),
                    examples: vec![
                        "Use more efficient algorithms".to_string(),
                        "Reduce unnecessary computations".to_string(),
                    ],
                },
            };
            suggestions.push(suggestion);
        }

        suggestions
    }

    /// Set baseline metrics for an operation
    pub async fn set_baseline(&self, operation: &str) -> Result<()> {
        let metrics = self.collect_current_metrics(operation).await?;
        let mut baseline_metrics = self.baseline_metrics.write().await;
        baseline_metrics.insert(operation.to_string(), metrics);

        tracing::info!("Set baseline metrics for operation: {}", operation);
        Ok(())
    }

    /// Get current profiling statistics
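    ///
    /// Illustrative usage (keys match those returned below):
    ///
    /// ```ignore
    /// let stats = profiler.get_statistics().await?;
    /// println!("active sessions: {}", stats["active_sessions"]);
    /// ```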
    pub async fn get_statistics(&self) -> Result<HashMap<String, serde_json::Value>> {
        let profile_data = self.profile_data.read().await;
        let active_profiles = self.active_profiles.read().await;
        let regression_history = self.regression_history.read().await;

        let total_profiles = profile_data.len();
        let active_sessions = active_profiles.len();
        let total_regressions = regression_history.len();

        let profile_type_counts = profile_data.iter()
            .fold(HashMap::new(), |mut acc, entry| {
                *acc.entry(format!("{:?}", entry.profile_type)).or_insert(0) += 1;
                acc
            });

        Ok(HashMap::from([
            ("total_profiles".to_string(), serde_json::Value::Number(total_profiles.into())),
            ("active_sessions".to_string(), serde_json::Value::Number(active_sessions.into())),
            ("total_regressions".to_string(), serde_json::Value::Number(total_regressions.into())),
            ("profile_type_counts".to_string(), serde_json::to_value(profile_type_counts)?),
            ("config".to_string(), serde_json::to_value(&self.config)?),
        ]))
    }

    /// Cleanup old profile data based on retention settings
    async fn cleanup_old_data(&self, profile_data: &mut VecDeque<ProfileEntry>) {
        let cutoff = SystemTime::now() - Duration::from_secs(self.config.data_retention_seconds);

        while let Some(entry) = profile_data.front() {
            if entry.timestamp < cutoff {
                profile_data.pop_front();
            } else {
                break;
            }
        }
    }

    /// Export profile data to various formats
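    ///
    /// Illustrative usage; `"json"` and `"csv"` are the only supported formats:
    ///
    /// ```ignore
    /// let csv = profiler.export_profile_data("csv").await?;
    /// std::fs::write("profiles.csv", csv).expect("failed to write export");
    /// ```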
    pub async fn export_profile_data(&self, format: &str) -> Result<String> {
        let profile_data = self.profile_data.read().await;

        match format.to_lowercase().as_str() {
            "json" => Ok(serde_json::to_string_pretty(&*profile_data)?),
            "csv" => {
                let mut csv = String::from("id,type,timestamp,duration_ns,stack_depth\n");
                for entry in profile_data.iter() {
                    csv.push_str(&format!(
                        "{},{:?},{},{},{}\n",
                        entry.id,
                        entry.profile_type,
                        entry.timestamp.duration_since(UNIX_EPOCH).unwrap().as_secs(),
                        entry.duration.as_nanos(),
                        entry.stack_trace.len()
                    ));
                }
                Ok(csv)
            },
            _ => Err(EventualiError::InvalidState(format!("Unsupported export format: {format}")))
        }
    }
}

/// Performance profiler builder for easy configuration
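///
/// Illustrative usage of the fluent API (values are arbitrary):
///
/// ```ignore
/// let profiler = PerformanceProfilerBuilder::new()
///     .with_cpu_sampling_interval(500)
///     .with_flame_graphs(true)
///     .with_regression_threshold(5.0)
///     .build();
/// ```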
pub struct PerformanceProfilerBuilder {
    config: ProfilingConfig,
}

impl PerformanceProfilerBuilder {
    pub fn new() -> Self {
        Self {
            config: ProfilingConfig::default(),
        }
    }

    pub fn with_cpu_sampling_interval(mut self, interval_us: u64) -> Self {
        self.config.cpu_sampling_interval_us = interval_us;
        self
    }

    pub fn with_memory_threshold(mut self, threshold_bytes: usize) -> Self {
        self.config.memory_allocation_threshold = threshold_bytes;
        self
    }

    pub fn with_io_threshold(mut self, threshold_us: u64) -> Self {
        self.config.io_threshold_us = threshold_us;
        self
    }

    pub fn with_flame_graphs(mut self, enabled: bool) -> Self {
        self.config.enable_flame_graphs = enabled;
        self
    }

    pub fn with_regression_threshold(mut self, threshold_percent: f64) -> Self {
        self.config.regression_threshold_percent = threshold_percent;
        self
    }

    pub fn build(self) -> PerformanceProfiler {
        PerformanceProfiler::new(self.config)
    }
}

impl Default for PerformanceProfilerBuilder {
    fn default() -> Self {
        Self::new()
    }
}