rexis_rag/observability/
profiling.rs

1//! # Performance Profiling System
2//!
3//! Advanced profiling capabilities for identifying bottlenecks,
4//! performance trends, and optimization opportunities in RRAG systems.
5
6use crate::{RragError, RragResult};
7use chrono::{DateTime, Duration, Utc};
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10use std::sync::Arc;
11use tokio::sync::RwLock;
12
/// Configuration for the profiling subsystem.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfilingConfig {
    /// Master switch; when `false` the profiler records nothing.
    pub enabled: bool,
    /// Fraction of operations to profile, in `[0.0, 1.0]` (e.g. 0.1 = 10%).
    pub sample_rate: f64,
    /// Maximum number of profiles kept in memory; the oldest are dropped first.
    pub max_profiles: usize,
    /// Intended profiling window length in seconds.
    /// NOTE(review): not consulted by `PerformanceProfiler` in this file — confirm callers.
    pub profile_duration_seconds: u64,
    /// Collect a CPU-usage reading alongside each profile.
    pub enable_cpu_profiling: bool,
    /// Collect a memory-usage reading alongside each profile.
    pub enable_memory_profiling: bool,
    /// Collect I/O byte counts alongside each profile.
    /// NOTE(review): not consulted by `PerformanceProfiler` in this file.
    pub enable_io_profiling: bool,
    /// Allow arbitrary named metrics to be attached to profiles.
    /// NOTE(review): not consulted by `PerformanceProfiler` in this file.
    pub enable_custom_metrics: bool,
    /// Average duration (ms) above which an operation is flagged as a bottleneck.
    pub bottleneck_threshold_ms: f64,
}
26
27impl Default for ProfilingConfig {
28    fn default() -> Self {
29        Self {
30            enabled: true,
31            sample_rate: 0.1, // 10% sampling
32            max_profiles: 1000,
33            profile_duration_seconds: 60,
34            enable_cpu_profiling: true,
35            enable_memory_profiling: true,
36            enable_io_profiling: true,
37            enable_custom_metrics: true,
38            bottleneck_threshold_ms: 100.0,
39        }
40    }
41}
42
/// A single collected profile sample for one operation execution.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProfileData {
    /// Unique identifier (UUID v4 assigned at construction).
    pub id: String,
    /// UTC time the profile was created.
    pub timestamp: DateTime<Utc>,
    /// Wall-clock duration of the operation in milliseconds.
    pub duration_ms: f64,
    /// Name of the profiled operation (e.g. a query or pipeline step).
    pub operation: String,
    /// Component the operation ran in.
    pub component: String,
    /// Optional user attribution.
    pub user_id: Option<String>,
    /// Optional session attribution.
    pub session_id: Option<String>,
    /// Optional distributed-trace correlation id.
    pub trace_id: Option<String>,
    /// CPU usage during the operation, if collected.
    pub cpu_usage_percent: Option<f64>,
    /// Memory usage during the operation (MB), if collected.
    pub memory_usage_mb: Option<f64>,
    /// Bytes read during the operation, if collected.
    pub io_read_bytes: Option<u64>,
    /// Bytes written during the operation, if collected.
    pub io_write_bytes: Option<u64>,
    /// Arbitrary caller-supplied named metrics.
    pub custom_metrics: HashMap<String, f64>,
    /// Optional captured stack trace.
    pub stack_trace: Option<String>,
    /// Free-form key/value tags for filtering and grouping.
    pub tags: HashMap<String, String>,
}
62
63impl ProfileData {
64    pub fn new(operation: impl Into<String>, component: impl Into<String>) -> Self {
65        Self {
66            id: uuid::Uuid::new_v4().to_string(),
67            timestamp: Utc::now(),
68            duration_ms: 0.0,
69            operation: operation.into(),
70            component: component.into(),
71            user_id: None,
72            session_id: None,
73            trace_id: None,
74            cpu_usage_percent: None,
75            memory_usage_mb: None,
76            io_read_bytes: None,
77            io_write_bytes: None,
78            custom_metrics: HashMap::new(),
79            stack_trace: None,
80            tags: HashMap::new(),
81        }
82    }
83
84    pub fn with_duration(mut self, duration_ms: f64) -> Self {
85        self.duration_ms = duration_ms;
86        self
87    }
88
89    pub fn with_user(mut self, user_id: impl Into<String>) -> Self {
90        self.user_id = Some(user_id.into());
91        self
92    }
93
94    pub fn with_session(mut self, session_id: impl Into<String>) -> Self {
95        self.session_id = Some(session_id.into());
96        self
97    }
98
99    pub fn with_trace(mut self, trace_id: impl Into<String>) -> Self {
100        self.trace_id = Some(trace_id.into());
101        self
102    }
103
104    pub fn with_cpu_usage(mut self, cpu_percent: f64) -> Self {
105        self.cpu_usage_percent = Some(cpu_percent);
106        self
107    }
108
109    pub fn with_memory_usage(mut self, memory_mb: f64) -> Self {
110        self.memory_usage_mb = Some(memory_mb);
111        self
112    }
113
114    pub fn with_io(mut self, read_bytes: u64, write_bytes: u64) -> Self {
115        self.io_read_bytes = Some(read_bytes);
116        self.io_write_bytes = Some(write_bytes);
117        self
118    }
119
120    pub fn with_custom_metric(mut self, name: impl Into<String>, value: f64) -> Self {
121        self.custom_metrics.insert(name.into(), value);
122        self
123    }
124
125    pub fn with_tag(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
126        self.tags.insert(key.into(), value.into());
127        self
128    }
129
130    pub fn with_stack_trace(mut self, trace: impl Into<String>) -> Self {
131        self.stack_trace = Some(trace.into());
132        self
133    }
134}
135
/// Result of a bottleneck analysis pass over recently collected profiles.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BottleneckAnalysis {
    /// When the analysis was performed (UTC).
    pub timestamp: DateTime<Utc>,
    /// Size of the look-back window that was analyzed, in minutes.
    pub analysis_period_minutes: u32,
    /// Detected bottlenecks, sorted by descending impact score.
    pub bottlenecks: Vec<Bottleneck>,
    /// Observed performance trends over the analysis window.
    pub performance_trends: Vec<PerformanceTrend>,
    /// Suggested optimizations derived from the top bottlenecks.
    pub recommendations: Vec<OptimizationRecommendation>,
}
145
/// One detected bottleneck: a (component, operation) pair whose average
/// duration exceeded the configured threshold.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Bottleneck {
    /// Component the slow operation belongs to.
    pub component: String,
    /// Name of the slow operation.
    pub operation: String,
    /// Mean duration over the analysis window, in milliseconds.
    pub average_duration_ms: f64,
    /// Worst observed duration over the window, in milliseconds.
    pub max_duration_ms: f64,
    /// Number of profiled executions contributing to the figures above.
    pub occurrence_count: usize,
    /// Heuristic severity combining duration, frequency, and peak impact.
    pub impact_score: f64,
    /// Suspected resource category behind the slowdown.
    pub bottleneck_type: BottleneckType,
    /// Number of distinct user ids observed on the slow executions.
    pub affected_users: usize,
}
157
/// Suspected resource category responsible for a bottleneck.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum BottleneckType {
    CPU,
    Memory,
    IO,
    Network,
    Database,
    Cache,
    /// Slowness attributed to algorithmic cost rather than a resource.
    Algorithm,
}
168
/// Direction and magnitude of change for one performance metric.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceTrend {
    /// Metric being tracked (e.g. "response_time").
    pub metric_name: String,
    /// Component the metric was measured in.
    pub component: String,
    /// Whether the metric is improving, degrading, or stable.
    pub trend_direction: TrendDirection,
    /// Percentage change over the analysis window (negative = decreasing).
    pub change_rate_percent: f64,
    /// Statistical confidence/importance of the observed trend.
    pub significance: TrendSignificance,
}
177
/// Direction of a performance trend.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TrendDirection {
    Improving,
    Degrading,
    Stable,
}
184
/// How significant an observed trend is considered to be.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TrendSignificance {
    High,
    Medium,
    Low,
}
191
/// A suggested optimization derived from a detected bottleneck.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OptimizationRecommendation {
    /// Component the recommendation targets.
    pub component: String,
    /// Operation the recommendation targets.
    pub operation: String,
    /// Human-readable description of the suggested change.
    pub recommendation: String,
    /// Urgency of acting on the recommendation.
    pub priority: RecommendationPriority,
    /// Rough expected improvement if implemented, as a percentage.
    pub estimated_improvement_percent: f64,
    /// Rough implementation cost.
    pub implementation_effort: ImplementationEffort,
}
201
/// Urgency level of an optimization recommendation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum RecommendationPriority {
    Critical,
    High,
    Medium,
    Low,
}
209
/// Rough cost of implementing a recommendation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ImplementationEffort {
    Low,
    Medium,
    High,
}
216
/// Comprehensive performance report over a reporting period.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceReport {
    /// When the report was generated (UTC).
    pub timestamp: DateTime<Utc>,
    /// Length of the look-back window the report covers.
    pub report_period: Duration,
    /// Aggregate figures across all profiled operations.
    pub summary: PerformanceSummary,
    /// Per-component breakdown, keyed by component name.
    pub component_performance: HashMap<String, ComponentPerformanceMetrics>,
    /// Per-operation breakdown, keyed by operation name.
    pub operation_performance: HashMap<String, OperationPerformanceMetrics>,
    /// User-facing experience indicators.
    pub user_experience_metrics: UserExperienceMetrics,
    /// System resource utilization during the period.
    pub resource_utilization: ResourceUtilizationMetrics,
}
228
/// Aggregate performance figures for a reporting period.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceSummary {
    /// Number of profiled operations in the period.
    pub total_operations: usize,
    /// Mean response time in milliseconds.
    pub average_response_time_ms: f64,
    /// 95th-percentile response time in milliseconds.
    pub p95_response_time_ms: f64,
    /// 99th-percentile response time in milliseconds.
    pub p99_response_time_ms: f64,
    /// Fraction of operations that errored, as a percentage.
    pub error_rate_percent: f64,
    /// Observed throughput in operations per second.
    pub throughput_ops_per_second: f64,
    /// Count of operations exceeding the bottleneck threshold.
    pub bottlenecks_detected: usize,
}
239
/// Performance statistics aggregated for a single component.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComponentPerformanceMetrics {
    /// Component these statistics describe.
    pub component_name: String,
    /// Number of profiled operations in the component.
    pub operation_count: usize,
    /// Mean duration in milliseconds.
    pub average_duration_ms: f64,
    /// Maximum observed duration in milliseconds.
    pub max_duration_ms: f64,
    /// Minimum observed duration in milliseconds.
    pub min_duration_ms: f64,
    /// Population standard deviation of durations, in milliseconds.
    pub standard_deviation_ms: f64,
    /// Number of errored operations.
    pub error_count: usize,
    /// Average CPU usage, if collected.
    pub cpu_usage_percent: Option<f64>,
    /// Average memory usage (MB), if collected.
    pub memory_usage_mb: Option<f64>,
}
252
/// Performance statistics aggregated for a single operation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OperationPerformanceMetrics {
    /// Operation these statistics describe.
    pub operation_name: String,
    /// Number of profiled invocations.
    pub invocation_count: usize,
    /// Mean duration in milliseconds.
    pub average_duration_ms: f64,
    /// Duration percentiles keyed by percentile rank (50, 95, 99, ...).
    pub percentiles: HashMap<u32, f64>,
    /// Peak number of concurrent executions observed.
    pub concurrent_executions_max: usize,
    /// Fraction of invocations that succeeded, as a percentage.
    pub success_rate_percent: f64,
}
262
/// User-facing experience indicators for a reporting period.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UserExperienceMetrics {
    /// Mean session length in minutes.
    pub average_session_duration_minutes: f64,
    /// Fraction of single-interaction sessions, as a percentage.
    pub bounce_rate_percent: f64,
    /// Aggregated satisfaction score, if available.
    pub user_satisfaction_score: Option<f64>,
    /// Conversion rate, if available.
    pub conversion_rate_percent: Option<f64>,
}
270
/// System resource utilization figures, each expressed as a percentage.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ResourceUtilizationMetrics {
    pub cpu_utilization_percent: f64,
    pub memory_utilization_percent: f64,
    pub disk_utilization_percent: f64,
    pub network_utilization_percent: f64,
    pub connection_pool_utilization_percent: f64,
}
279
/// In-memory performance profiler.
///
/// Collects [`ProfileData`] samples behind async `RwLock`s so the profiler
/// can be shared across tasks via `Arc` clones of its internal state.
pub struct PerformanceProfiler {
    /// Static configuration (sampling rate, retention, thresholds).
    config: ProfilingConfig,
    /// Completed profiles; trimmed so the oldest are dropped beyond `max_profiles`.
    profiles: Arc<RwLock<Vec<ProfileData>>>,
    /// Start instants of in-flight operations, keyed by operation id.
    active_profiles: Arc<RwLock<HashMap<String, std::time::Instant>>>,
    /// Whether the profiler currently accepts samples.
    is_running: Arc<RwLock<bool>>,
}
287
288impl PerformanceProfiler {
289    pub async fn new(config: ProfilingConfig) -> RragResult<Self> {
290        Ok(Self {
291            config,
292            profiles: Arc::new(RwLock::new(Vec::new())),
293            active_profiles: Arc::new(RwLock::new(HashMap::new())),
294            is_running: Arc::new(RwLock::new(false)),
295        })
296    }
297
298    pub async fn start(&self) -> RragResult<()> {
299        let mut running = self.is_running.write().await;
300        if *running {
301            return Err(RragError::config("profiler", "stopped", "already running"));
302        }
303        *running = true;
304        tracing::info!("Performance profiler started");
305        Ok(())
306    }
307
308    pub async fn stop(&self) -> RragResult<()> {
309        let mut running = self.is_running.write().await;
310        if !*running {
311            return Ok(());
312        }
313        *running = false;
314        tracing::info!("Performance profiler stopped");
315        Ok(())
316    }
317
318    pub async fn is_healthy(&self) -> bool {
319        *self.is_running.read().await
320    }
321
322    /// Start profiling an operation
323    pub async fn start_profile(&self, operation_id: impl Into<String>) -> RragResult<()> {
324        if !*self.is_running.read().await {
325            return Ok(()); // Silently ignore if profiler is disabled
326        }
327
328        // Check sampling rate
329        if rand::random::<f64>() > self.config.sample_rate {
330            return Ok(()); // Skip this profile based on sampling
331        }
332
333        let mut active = self.active_profiles.write().await;
334        active.insert(operation_id.into(), std::time::Instant::now());
335        Ok(())
336    }
337
338    /// End profiling an operation and record the profile
339    pub async fn end_profile(
340        &self,
341        operation_id: impl Into<String>,
342        operation: impl Into<String>,
343        component: impl Into<String>,
344    ) -> RragResult<Option<ProfileData>> {
345        if !*self.is_running.read().await {
346            return Ok(None);
347        }
348
349        let operation_id = operation_id.into();
350        let mut active = self.active_profiles.write().await;
351
352        if let Some(start_time) = active.remove(&operation_id) {
353            let duration_ms = start_time.elapsed().as_millis() as f64;
354            drop(active);
355
356            let mut profile = ProfileData::new(operation, component)
357                .with_duration(duration_ms)
358                .with_trace(operation_id);
359
360            // Collect system metrics if enabled
361            if self.config.enable_cpu_profiling || self.config.enable_memory_profiling {
362                // In a real implementation, this would collect actual system metrics
363                profile = profile
364                    .with_cpu_usage(rand::random::<f64>() * 100.0)
365                    .with_memory_usage(rand::random::<f64>() * 1024.0);
366            }
367
368            // Store profile
369            let mut profiles = self.profiles.write().await;
370            profiles.push(profile.clone());
371
372            // Keep only recent profiles
373            let profiles_len = profiles.len();
374            if profiles_len > self.config.max_profiles {
375                profiles.drain(0..profiles_len - self.config.max_profiles);
376            }
377
378            Ok(Some(profile))
379        } else {
380            Ok(None)
381        }
382    }
383
384    /// Record a complete profile data
385    pub async fn record_profile(&self, profile: ProfileData) -> RragResult<()> {
386        if !*self.is_running.read().await {
387            return Ok(());
388        }
389
390        let mut profiles = self.profiles.write().await;
391        profiles.push(profile);
392
393        // Keep only recent profiles
394        let profiles_len = profiles.len();
395        if profiles_len > self.config.max_profiles {
396            profiles.drain(0..profiles_len - self.config.max_profiles);
397        }
398
399        Ok(())
400    }
401
402    /// Analyze bottlenecks in the collected profiles
403    pub async fn analyze_bottlenecks(&self, analysis_period_minutes: u32) -> BottleneckAnalysis {
404        let profiles = self.profiles.read().await;
405        let cutoff_time = Utc::now() - Duration::minutes(analysis_period_minutes as i64);
406
407        let recent_profiles: Vec<_> = profiles
408            .iter()
409            .filter(|p| p.timestamp >= cutoff_time)
410            .collect();
411
412        if recent_profiles.is_empty() {
413            return BottleneckAnalysis {
414                timestamp: Utc::now(),
415                analysis_period_minutes,
416                bottlenecks: Vec::new(),
417                performance_trends: Vec::new(),
418                recommendations: Vec::new(),
419            };
420        }
421
422        let bottlenecks = self.identify_bottlenecks(&recent_profiles);
423        let trends = self.analyze_performance_trends(&recent_profiles);
424        let recommendations = self.generate_recommendations(&bottlenecks);
425
426        BottleneckAnalysis {
427            timestamp: Utc::now(),
428            analysis_period_minutes,
429            bottlenecks,
430            performance_trends: trends,
431            recommendations,
432        }
433    }
434
435    fn identify_bottlenecks(&self, profiles: &[&ProfileData]) -> Vec<Bottleneck> {
436        let mut component_operations: HashMap<String, Vec<f64>> = HashMap::new();
437        let mut user_impact: HashMap<String, std::collections::HashSet<String>> = HashMap::new();
438
439        for profile in profiles {
440            let key = format!("{}:{}", profile.component, profile.operation);
441            component_operations
442                .entry(key.clone())
443                .or_default()
444                .push(profile.duration_ms);
445
446            if let Some(ref user_id) = profile.user_id {
447                user_impact.entry(key).or_default().insert(user_id.clone());
448            }
449        }
450
451        let mut bottlenecks = Vec::new();
452
453        for (key, durations) in component_operations {
454            let avg_duration = durations.iter().sum::<f64>() / durations.len() as f64;
455            let max_duration = durations.iter().fold(0.0f64, |a, &b| a.max(b));
456
457            if avg_duration > self.config.bottleneck_threshold_ms {
458                let parts: Vec<&str> = key.split(':').collect();
459                let component = parts[0].to_string();
460                let operation = parts[1].to_string();
461
462                let impact_score =
463                    self.calculate_impact_score(avg_duration, durations.len(), max_duration);
464                let bottleneck_type = self.determine_bottleneck_type(&component, avg_duration);
465                let affected_users = user_impact.get(&key).map(|set| set.len()).unwrap_or(0);
466
467                bottlenecks.push(Bottleneck {
468                    component,
469                    operation,
470                    average_duration_ms: avg_duration,
471                    max_duration_ms: max_duration,
472                    occurrence_count: durations.len(),
473                    impact_score,
474                    bottleneck_type,
475                    affected_users,
476                });
477            }
478        }
479
480        // Sort by impact score
481        bottlenecks.sort_by(|a, b| {
482            b.impact_score
483                .partial_cmp(&a.impact_score)
484                .unwrap_or(std::cmp::Ordering::Equal)
485        });
486        bottlenecks
487    }
488
489    fn calculate_impact_score(
490        &self,
491        avg_duration: f64,
492        frequency: usize,
493        max_duration: f64,
494    ) -> f64 {
495        // Impact score considers duration, frequency, and peak impact
496        let duration_weight = avg_duration / 1000.0; // Convert to seconds
497        let frequency_weight = (frequency as f64).log10();
498        let peak_weight = max_duration / avg_duration;
499
500        duration_weight * frequency_weight * peak_weight
501    }
502
503    fn determine_bottleneck_type(&self, component: &str, avg_duration: f64) -> BottleneckType {
504        match component.to_lowercase().as_str() {
505            s if s.contains("database") || s.contains("db") => BottleneckType::Database,
506            s if s.contains("cache") => BottleneckType::Cache,
507            s if s.contains("network") || s.contains("http") => BottleneckType::Network,
508            s if s.contains("io") || s.contains("file") => BottleneckType::IO,
509            _ => {
510                if avg_duration > 1000.0 {
511                    BottleneckType::Algorithm
512                } else if avg_duration > 500.0 {
513                    BottleneckType::CPU
514                } else {
515                    BottleneckType::Memory
516                }
517            }
518        }
519    }
520
521    fn analyze_performance_trends(&self, _profiles: &[&ProfileData]) -> Vec<PerformanceTrend> {
522        // This would implement sophisticated trend analysis
523        // For now, return a simple mock trend
524        vec![PerformanceTrend {
525            metric_name: "response_time".to_string(),
526            component: "search".to_string(),
527            trend_direction: TrendDirection::Stable,
528            change_rate_percent: -2.5,
529            significance: TrendSignificance::Low,
530        }]
531    }
532
533    fn generate_recommendations(
534        &self,
535        bottlenecks: &[Bottleneck],
536    ) -> Vec<OptimizationRecommendation> {
537        let mut recommendations = Vec::new();
538
539        for bottleneck in bottlenecks.iter().take(5) {
540            // Top 5 bottlenecks
541            let recommendation = match bottleneck.bottleneck_type {
542                BottleneckType::Database => OptimizationRecommendation {
543                    component: bottleneck.component.clone(),
544                    operation: bottleneck.operation.clone(),
545                    recommendation: "Consider adding database indexes or optimizing queries".to_string(),
546                    priority: if bottleneck.impact_score > 10.0 { RecommendationPriority::High } else { RecommendationPriority::Medium },
547                    estimated_improvement_percent: 40.0,
548                    implementation_effort: ImplementationEffort::Medium,
549                },
550                BottleneckType::Cache => OptimizationRecommendation {
551                    component: bottleneck.component.clone(),
552                    operation: bottleneck.operation.clone(),
553                    recommendation: "Implement caching strategy or increase cache size".to_string(),
554                    priority: RecommendationPriority::Medium,
555                    estimated_improvement_percent: 60.0,
556                    implementation_effort: ImplementationEffort::Low,
557                },
558                BottleneckType::Algorithm => OptimizationRecommendation {
559                    component: bottleneck.component.clone(),
560                    operation: bottleneck.operation.clone(),
561                    recommendation: "Review algorithm complexity and optimize data structures".to_string(),
562                    priority: RecommendationPriority::High,
563                    estimated_improvement_percent: 70.0,
564                    implementation_effort: ImplementationEffort::High,
565                },
566                _ => OptimizationRecommendation {
567                    component: bottleneck.component.clone(),
568                    operation: bottleneck.operation.clone(),
569                    recommendation: "Profile detailed resource usage to identify specific optimization opportunities".to_string(),
570                    priority: RecommendationPriority::Low,
571                    estimated_improvement_percent: 25.0,
572                    implementation_effort: ImplementationEffort::Medium,
573                },
574            };
575
576            recommendations.push(recommendation);
577        }
578
579        recommendations
580    }
581
582    /// Generate comprehensive performance report
583    pub async fn generate_performance_report(&self, period: Duration) -> PerformanceReport {
584        let profiles = self.profiles.read().await;
585        let cutoff_time = Utc::now() - period;
586
587        let recent_profiles: Vec<_> = profiles
588            .iter()
589            .filter(|p| p.timestamp >= cutoff_time)
590            .collect();
591
592        if recent_profiles.is_empty() {
593            return PerformanceReport {
594                timestamp: Utc::now(),
595                report_period: period,
596                summary: PerformanceSummary {
597                    total_operations: 0,
598                    average_response_time_ms: 0.0,
599                    p95_response_time_ms: 0.0,
600                    p99_response_time_ms: 0.0,
601                    error_rate_percent: 0.0,
602                    throughput_ops_per_second: 0.0,
603                    bottlenecks_detected: 0,
604                },
605                component_performance: HashMap::new(),
606                operation_performance: HashMap::new(),
607                user_experience_metrics: UserExperienceMetrics {
608                    average_session_duration_minutes: 0.0,
609                    bounce_rate_percent: 0.0,
610                    user_satisfaction_score: None,
611                    conversion_rate_percent: None,
612                },
613                resource_utilization: ResourceUtilizationMetrics {
614                    cpu_utilization_percent: 0.0,
615                    memory_utilization_percent: 0.0,
616                    disk_utilization_percent: 0.0,
617                    network_utilization_percent: 0.0,
618                    connection_pool_utilization_percent: 0.0,
619                },
620            };
621        }
622
623        let summary = self.calculate_performance_summary(&recent_profiles);
624        let component_performance = self.calculate_component_performance(&recent_profiles);
625        let operation_performance = self.calculate_operation_performance(&recent_profiles);
626        let user_experience = self.calculate_user_experience_metrics(&recent_profiles);
627        let resource_utilization = self.calculate_resource_utilization(&recent_profiles);
628
629        PerformanceReport {
630            timestamp: Utc::now(),
631            report_period: period,
632            summary,
633            component_performance,
634            operation_performance,
635            user_experience_metrics: user_experience,
636            resource_utilization,
637        }
638    }
639
640    fn calculate_performance_summary(&self, profiles: &[&ProfileData]) -> PerformanceSummary {
641        let durations: Vec<f64> = profiles.iter().map(|p| p.duration_ms).collect();
642        let total_operations = durations.len();
643
644        if durations.is_empty() {
645            return PerformanceSummary {
646                total_operations: 0,
647                average_response_time_ms: 0.0,
648                p95_response_time_ms: 0.0,
649                p99_response_time_ms: 0.0,
650                error_rate_percent: 0.0,
651                throughput_ops_per_second: 0.0,
652                bottlenecks_detected: 0,
653            };
654        }
655
656        let average_response_time_ms = durations.iter().sum::<f64>() / durations.len() as f64;
657
658        let mut sorted_durations = durations.clone();
659        sorted_durations.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
660
661        let p95_index = (durations.len() as f64 * 0.95) as usize;
662        let p99_index = (durations.len() as f64 * 0.99) as usize;
663
664        let p95_response_time_ms = sorted_durations.get(p95_index).copied().unwrap_or(0.0);
665        let p99_response_time_ms = sorted_durations.get(p99_index).copied().unwrap_or(0.0);
666
667        // Mock error rate and throughput calculations
668        let error_rate_percent = 2.0; // Would be calculated from actual error data
669        let throughput_ops_per_second = total_operations as f64 / 60.0; // Assuming 1-minute window
670
671        let bottlenecks_detected = durations
672            .iter()
673            .filter(|&&d| d > self.config.bottleneck_threshold_ms)
674            .count();
675
676        PerformanceSummary {
677            total_operations,
678            average_response_time_ms,
679            p95_response_time_ms,
680            p99_response_time_ms,
681            error_rate_percent,
682            throughput_ops_per_second,
683            bottlenecks_detected,
684        }
685    }
686
687    fn calculate_component_performance(
688        &self,
689        profiles: &[&ProfileData],
690    ) -> HashMap<String, ComponentPerformanceMetrics> {
691        let mut component_data: HashMap<String, Vec<f64>> = HashMap::new();
692        let error_counts: HashMap<String, usize> = HashMap::new();
693
694        for profile in profiles {
695            component_data
696                .entry(profile.component.clone())
697                .or_default()
698                .push(profile.duration_ms);
699        }
700
701        let mut result = HashMap::new();
702
703        for (component_name, durations) in component_data {
704            let operation_count = durations.len();
705            let average_duration_ms = durations.iter().sum::<f64>() / durations.len() as f64;
706            let max_duration_ms = durations.iter().fold(0.0f64, |a, &b| a.max(b));
707            let min_duration_ms = durations.iter().fold(f64::INFINITY, |a, &b| a.min(b));
708
709            let variance = durations
710                .iter()
711                .map(|&d| (d - average_duration_ms).powi(2))
712                .sum::<f64>()
713                / durations.len() as f64;
714            let standard_deviation_ms = variance.sqrt();
715
716            let error_count = error_counts.get(&component_name).copied().unwrap_or(0);
717
718            result.insert(
719                component_name.clone(),
720                ComponentPerformanceMetrics {
721                    component_name,
722                    operation_count,
723                    average_duration_ms,
724                    max_duration_ms,
725                    min_duration_ms,
726                    standard_deviation_ms,
727                    error_count,
728                    cpu_usage_percent: Some(rand::random::<f64>() * 100.0), // Mock data
729                    memory_usage_mb: Some(rand::random::<f64>() * 1024.0),  // Mock data
730                },
731            );
732        }
733
734        result
735    }
736
737    fn calculate_operation_performance(
738        &self,
739        profiles: &[&ProfileData],
740    ) -> HashMap<String, OperationPerformanceMetrics> {
741        let mut operation_data: HashMap<String, Vec<f64>> = HashMap::new();
742
743        for profile in profiles {
744            operation_data
745                .entry(profile.operation.clone())
746                .or_default()
747                .push(profile.duration_ms);
748        }
749
750        let mut result = HashMap::new();
751
752        for (operation_name, durations) in operation_data {
753            let invocation_count = durations.len();
754            let average_duration_ms = durations.iter().sum::<f64>() / durations.len() as f64;
755
756            let mut sorted_durations = durations.clone();
757            sorted_durations.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
758
759            let mut percentiles = HashMap::new();
760            percentiles.insert(
761                50,
762                sorted_durations[(durations.len() * 50 / 100).min(durations.len() - 1)],
763            );
764            percentiles.insert(
765                95,
766                sorted_durations[(durations.len() * 95 / 100).min(durations.len() - 1)],
767            );
768            percentiles.insert(
769                99,
770                sorted_durations[(durations.len() * 99 / 100).min(durations.len() - 1)],
771            );
772
773            result.insert(
774                operation_name.clone(),
775                OperationPerformanceMetrics {
776                    operation_name,
777                    invocation_count,
778                    average_duration_ms,
779                    percentiles,
780                    concurrent_executions_max: 10, // Mock data
781                    success_rate_percent: 98.5,    // Mock data
782                },
783            );
784        }
785
786        result
787    }
788
789    fn calculate_user_experience_metrics(
790        &self,
791        _profiles: &[&ProfileData],
792    ) -> UserExperienceMetrics {
793        UserExperienceMetrics {
794            average_session_duration_minutes: 15.5,
795            bounce_rate_percent: 12.3,
796            user_satisfaction_score: Some(4.2),
797            conversion_rate_percent: Some(3.1),
798        }
799    }
800
801    fn calculate_resource_utilization(
802        &self,
803        profiles: &[&ProfileData],
804    ) -> ResourceUtilizationMetrics {
805        let cpu_values: Vec<f64> = profiles
806            .iter()
807            .filter_map(|p| p.cpu_usage_percent)
808            .collect();
809        let memory_values: Vec<f64> = profiles.iter().filter_map(|p| p.memory_usage_mb).collect();
810
811        let cpu_utilization = if !cpu_values.is_empty() {
812            cpu_values.iter().sum::<f64>() / cpu_values.len() as f64
813        } else {
814            45.0 // Mock data
815        };
816
817        let memory_utilization = if !memory_values.is_empty() {
818            (memory_values.iter().sum::<f64>() / memory_values.len() as f64) / 1024.0 * 100.0
819        // Convert to percentage
820        } else {
821            62.0 // Mock data
822        };
823
824        ResourceUtilizationMetrics {
825            cpu_utilization_percent: cpu_utilization,
826            memory_utilization_percent: memory_utilization,
827            disk_utilization_percent: 23.0,            // Mock data
828            network_utilization_percent: 15.0,         // Mock data
829            connection_pool_utilization_percent: 78.0, // Mock data
830        }
831    }
832
833    pub async fn clear_profiles(&self) -> RragResult<()> {
834        let mut profiles = self.profiles.write().await;
835        profiles.clear();
836        Ok(())
837    }
838
839    pub async fn get_profile_count(&self) -> usize {
840        self.profiles.read().await.len()
841    }
842}
843
/// Profiler trait for custom profiling implementations.
///
/// Object-safe async interface (via `async_trait`) mirroring the inherent
/// methods of [`PerformanceProfiler`], so callers can depend on `dyn Profiler`.
#[async_trait::async_trait]
pub trait Profiler: Send + Sync {
    /// Begins timing the operation identified by `operation_id`.
    async fn start_profile(&self, operation_id: &str) -> RragResult<()>;
    /// Finishes timing `operation_id`; returns the recorded profile, if any.
    async fn end_profile(
        &self,
        operation_id: &str,
        operation: &str,
        component: &str,
    ) -> RragResult<Option<ProfileData>>;
    /// Records an externally constructed profile.
    async fn record_profile(&self, profile: ProfileData) -> RragResult<()>;
    /// Analyzes bottlenecks over the last `period_minutes` of profiles.
    async fn analyze_bottlenecks(&self, period_minutes: u32) -> BottleneckAnalysis;
    /// Generates a performance report covering `period`.
    async fn generate_report(&self, period: Duration) -> PerformanceReport;
}
858
859#[async_trait::async_trait]
860impl Profiler for PerformanceProfiler {
861    async fn start_profile(&self, operation_id: &str) -> RragResult<()> {
862        self.start_profile(operation_id).await
863    }
864
865    async fn end_profile(
866        &self,
867        operation_id: &str,
868        operation: &str,
869        component: &str,
870    ) -> RragResult<Option<ProfileData>> {
871        self.end_profile(operation_id, operation, component).await
872    }
873
874    async fn record_profile(&self, profile: ProfileData) -> RragResult<()> {
875        self.record_profile(profile).await
876    }
877
878    async fn analyze_bottlenecks(&self, period_minutes: u32) -> BottleneckAnalysis {
879        self.analyze_bottlenecks(period_minutes).await
880    }
881
882    async fn generate_report(&self, period: Duration) -> PerformanceReport {
883        self.generate_performance_report(period).await
884    }
885}
886
#[cfg(test)]
mod tests {
    use super::*;

    /// Builder-style constructors on `ProfileData` should populate every
    /// field they name.
    #[tokio::test]
    async fn test_profile_data_creation() {
        let profile = ProfileData::new("search_query", "search_engine")
            .with_duration(150.5)
            .with_user("user123")
            .with_session("session456")
            .with_trace("trace789")
            .with_cpu_usage(45.2)
            .with_memory_usage(512.0)
            .with_io(1024, 2048)
            .with_custom_metric("cache_hits", 85.0)
            .with_tag("priority", "high");

        assert_eq!(profile.operation, "search_query");
        assert_eq!(profile.component, "search_engine");
        assert_eq!(profile.duration_ms, 150.5);
        assert_eq!(profile.user_id.as_ref().unwrap(), "user123");
        assert_eq!(profile.cpu_usage_percent.unwrap(), 45.2);
        assert_eq!(profile.io_read_bytes.unwrap(), 1024);
        assert!(profile.custom_metrics.contains_key("cache_hits"));
        assert!(profile.tags.contains_key("priority"));
    }

    /// End-to-end start/stop lifecycle plus a single profiled operation.
    #[tokio::test]
    async fn test_performance_profiler() {
        let config = ProfilingConfig {
            sample_rate: 1.0, // 100% sampling for testing
            max_profiles: 100,
            ..Default::default()
        };

        let profiler = PerformanceProfiler::new(config).await.unwrap();

        // Health tracks the started/stopped state.
        assert!(!profiler.is_healthy().await);

        profiler.start().await.unwrap();
        assert!(profiler.is_healthy().await);

        // Test operation profiling
        profiler.start_profile("op1").await.unwrap();
        tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
        let profile = profiler
            .end_profile("op1", "test_operation", "test_component")
            .await
            .unwrap();

        assert!(profile.is_some());
        let profile = profile.unwrap();
        assert_eq!(profile.operation, "test_operation");
        assert_eq!(profile.component, "test_component");
        assert!(profile.duration_ms > 0.0);

        assert_eq!(profiler.get_profile_count().await, 1);

        profiler.stop().await.unwrap();
        assert!(!profiler.is_healthy().await);
    }

    /// Operations averaging above the configured threshold should surface as
    /// bottlenecks, with affected-user counts and recommendations.
    #[tokio::test]
    async fn test_bottleneck_analysis() {
        let config = ProfilingConfig {
            sample_rate: 1.0,
            bottleneck_threshold_ms: 50.0,
            ..Default::default()
        };

        let profiler = PerformanceProfiler::new(config).await.unwrap();
        profiler.start().await.unwrap();

        // Add some profiles with different performance characteristics
        let profiles = vec![
            ProfileData::new("fast_operation", "component1").with_duration(25.0),
            ProfileData::new("slow_operation", "component1")
                .with_duration(150.0)
                .with_user("user1"),
            ProfileData::new("slow_operation", "component1")
                .with_duration(200.0)
                .with_user("user2"),
            ProfileData::new("moderate_operation", "component2").with_duration(75.0),
        ];

        for profile in profiles {
            profiler.record_profile(profile).await.unwrap();
        }

        let analysis = profiler.analyze_bottlenecks(60).await;

        assert!(!analysis.bottlenecks.is_empty());
        let bottleneck = &analysis.bottlenecks[0];
        assert_eq!(bottleneck.operation, "slow_operation");
        assert!(bottleneck.average_duration_ms > 50.0);
        assert_eq!(bottleneck.affected_users, 2);

        assert!(!analysis.recommendations.is_empty());

        profiler.stop().await.unwrap();
    }

    /// Reports should summarize counts, response times, and per-component /
    /// per-operation breakdowns over the requested window.
    #[tokio::test]
    async fn test_performance_report() {
        let config = ProfilingConfig::default();
        let profiler = PerformanceProfiler::new(config).await.unwrap();
        profiler.start().await.unwrap();

        // Add sample profiles
        let profiles = vec![
            ProfileData::new("operation1", "component1").with_duration(100.0),
            ProfileData::new("operation1", "component1").with_duration(120.0),
            ProfileData::new("operation2", "component2").with_duration(50.0),
            ProfileData::new("operation2", "component2").with_duration(60.0),
        ];

        for profile in profiles {
            profiler.record_profile(profile).await.unwrap();
        }

        let report = profiler
            .generate_performance_report(Duration::hours(1))
            .await;

        assert_eq!(report.summary.total_operations, 4);
        assert!(report.summary.average_response_time_ms > 0.0);
        assert!(!report.component_performance.is_empty());
        assert!(!report.operation_performance.is_empty());

        assert!(report.component_performance.contains_key("component1"));
        assert!(report.component_performance.contains_key("component2"));

        profiler.stop().await.unwrap();
    }

    /// A 0.0 sample rate must drop every profile on the floor.
    #[tokio::test]
    async fn test_sampling_rate() {
        let config = ProfilingConfig {
            sample_rate: 0.0, // No sampling
            ..Default::default()
        };

        let profiler = PerformanceProfiler::new(config).await.unwrap();
        profiler.start().await.unwrap();

        // These operations should be ignored due to 0% sampling rate
        for i in 0..10 {
            profiler.start_profile(&format!("op{}", i)).await.unwrap();
            let profile = profiler
                .end_profile(&format!("op{}", i), "test", "component")
                .await
                .unwrap();
            // Due to 0% sampling, profiles should not be recorded
            assert!(profile.is_none());
        }

        assert_eq!(profiler.get_profile_count().await, 0);

        profiler.stop().await.unwrap();
    }

    /// Component name and duration should drive bottleneck classification.
    ///
    /// Converted to `#[tokio::test]` for consistency with the rest of the
    /// module; the previous version blocked on the async constructor via
    /// `futures::executor::block_on`, needlessly depending on the `futures`
    /// crate.
    #[tokio::test]
    async fn test_bottleneck_types() {
        let config = ProfilingConfig::default();
        let profiler = PerformanceProfiler::new(config).await.unwrap();

        assert!(matches!(
            profiler.determine_bottleneck_type("database_service", 500.0),
            BottleneckType::Database
        ));

        assert!(matches!(
            profiler.determine_bottleneck_type("cache_manager", 100.0),
            BottleneckType::Cache
        ));

        assert!(matches!(
            profiler.determine_bottleneck_type("network_handler", 200.0),
            BottleneckType::Network
        ));

        assert!(matches!(
            profiler.determine_bottleneck_type("regular_service", 1500.0),
            BottleneckType::Algorithm
        ));
    }
}