codex_memory/monitoring/
profiling.rs

1use serde::{Deserialize, Serialize};
2use std::collections::{HashMap, VecDeque};
3use std::sync::{Arc, RwLock};
4use std::time::{Duration, Instant};
5use tracing::{debug, info, warn};
6use uuid::Uuid;
7
8/// Performance profiler for tracking system performance metrics
9#[derive(Debug, Clone)]
10pub struct PerformanceProfiler {
11    inner: Arc<RwLock<ProfilerInner>>,
12}
13
14#[derive(Debug)]
15struct ProfilerInner {
16    profiles: HashMap<String, OperationProfile>,
17    active_traces: HashMap<Uuid, ActiveTrace>,
18    config: ProfilerConfig,
19    start_time: Instant,
20}
21
22#[derive(Debug, Clone, Serialize, Deserialize)]
23pub struct ProfilerConfig {
24    pub max_traces: usize,
25    pub max_history_per_operation: usize,
26    pub slow_operation_threshold_ms: u64,
27    pub enabled: bool,
28}
29
30impl Default for ProfilerConfig {
31    fn default() -> Self {
32        Self {
33            max_traces: 1000,
34            max_history_per_operation: 100,
35            slow_operation_threshold_ms: 100,
36            enabled: true,
37        }
38    }
39}
40
41#[derive(Debug, Clone, Serialize, Deserialize)]
42pub struct OperationProfile {
43    pub name: String,
44    pub total_calls: u64,
45    pub total_duration_ms: u64,
46    pub avg_duration_ms: f64,
47    pub min_duration_ms: u64,
48    pub max_duration_ms: u64,
49    pub p95_duration_ms: u64,
50    pub p99_duration_ms: u64,
51    pub error_count: u64,
52    pub slow_operations: u64,
53    pub recent_durations: VecDeque<u64>,
54    pub last_updated: std::time::SystemTime,
55}
56
57#[derive(Debug, Clone)]
58pub struct ActiveTrace {
59    pub id: Uuid,
60    pub operation: String,
61    pub start_time: Instant,
62    pub metadata: HashMap<String, String>,
63}
64
65#[derive(Debug, Clone, Serialize, Deserialize)]
66pub struct TraceResult {
67    pub id: Uuid,
68    pub operation: String,
69    pub duration_ms: u64,
70    pub success: bool,
71    pub metadata: HashMap<String, String>,
72    pub timestamp: std::time::SystemTime,
73}
74
75impl Default for PerformanceProfiler {
76    fn default() -> Self {
77        Self::new()
78    }
79}
80
81impl PerformanceProfiler {
82    pub fn new() -> Self {
83        Self::with_config(ProfilerConfig::default())
84    }
85
86    pub fn with_config(config: ProfilerConfig) -> Self {
87        Self {
88            inner: Arc::new(RwLock::new(ProfilerInner {
89                profiles: HashMap::new(),
90                active_traces: HashMap::new(),
91                config,
92                start_time: Instant::now(),
93            })),
94        }
95    }
96
97    /// Start profiling an operation
98    pub fn start_trace(&self, operation: String) -> Option<TraceHandle> {
99        if !self.is_enabled() {
100            return None;
101        }
102
103        let trace_id = Uuid::new_v4();
104        let trace = ActiveTrace {
105            id: trace_id,
106            operation: operation.clone(),
107            start_time: Instant::now(),
108            metadata: HashMap::new(),
109        };
110
111        if let Ok(mut inner) = self.inner.write() {
112            // Clean up old traces if we're at the limit
113            if inner.active_traces.len() >= inner.config.max_traces {
114                self.cleanup_old_traces(&mut inner);
115            }
116
117            inner.active_traces.insert(trace_id, trace);
118            debug!("Started trace {} for operation {}", trace_id, operation);
119        }
120
121        Some(TraceHandle {
122            profiler: self.clone(),
123            trace_id,
124            operation,
125        })
126    }
127
128    /// Complete a trace and record the results
129    pub fn complete_trace(
130        &self,
131        trace_id: Uuid,
132        success: bool,
133        metadata: Option<HashMap<String, String>>,
134    ) {
135        if !self.is_enabled() {
136            return;
137        }
138
139        if let Ok(mut inner) = self.inner.write() {
140            if let Some(trace) = inner.active_traces.remove(&trace_id) {
141                let duration = trace.start_time.elapsed();
142                let duration_ms = duration.as_millis() as u64;
143
144                let mut final_metadata = trace.metadata;
145                if let Some(additional_metadata) = metadata {
146                    final_metadata.extend(additional_metadata);
147                }
148
149                // Update operation profile
150                let profile = inner
151                    .profiles
152                    .entry(trace.operation.clone())
153                    .or_insert_with(|| OperationProfile::new(trace.operation.clone()));
154
155                profile.record_operation(duration_ms, success);
156
157                let is_slow = duration_ms > inner.config.slow_operation_threshold_ms;
158                if is_slow {
159                    warn!(
160                        "Slow operation detected: {} took {}ms",
161                        trace.operation, duration_ms
162                    );
163                }
164
165                debug!(
166                    "Completed trace {} for {} in {}ms (success: {})",
167                    trace_id, trace.operation, duration_ms, success
168                );
169            }
170        }
171    }
172
173    /// Get performance profile for a specific operation
174    pub fn get_operation_profile(&self, operation: &str) -> Option<OperationProfile> {
175        if let Ok(inner) = self.inner.read() {
176            inner.profiles.get(operation).cloned()
177        } else {
178            None
179        }
180    }
181
182    /// Get all operation profiles
183    pub fn get_all_profiles(&self) -> HashMap<String, OperationProfile> {
184        if let Ok(inner) = self.inner.read() {
185            inner.profiles.clone()
186        } else {
187            HashMap::new()
188        }
189    }
190
191    /// Get performance summary
192    pub fn get_performance_summary(&self) -> PerformanceSummary {
193        if let Ok(inner) = self.inner.read() {
194            let total_operations: u64 = inner.profiles.values().map(|p| p.total_calls).sum();
195            let total_errors: u64 = inner.profiles.values().map(|p| p.error_count).sum();
196            let total_slow_operations: u64 =
197                inner.profiles.values().map(|p| p.slow_operations).sum();
198
199            let avg_response_time = if total_operations > 0 {
200                let total_duration: u64 =
201                    inner.profiles.values().map(|p| p.total_duration_ms).sum();
202                total_duration as f64 / total_operations as f64
203            } else {
204                0.0
205            };
206
207            let error_rate = if total_operations > 0 {
208                total_errors as f64 / total_operations as f64 * 100.0
209            } else {
210                0.0
211            };
212
213            let uptime_seconds = inner.start_time.elapsed().as_secs();
214            let operations_per_second = if uptime_seconds > 0 {
215                total_operations as f64 / uptime_seconds as f64
216            } else {
217                0.0
218            };
219
220            PerformanceSummary {
221                total_operations,
222                total_errors,
223                error_rate,
224                avg_response_time_ms: avg_response_time,
225                total_slow_operations,
226                active_traces: inner.active_traces.len(),
227                uptime_seconds,
228                operations_per_second,
229                top_operations: inner
230                    .profiles
231                    .values()
232                    .cloned()
233                    .collect::<Vec<_>>()
234                    .into_iter()
235                    .fold(Vec::new(), |mut acc, profile| {
236                        acc.push((profile.name.clone(), profile.total_calls));
237                        acc.sort_by(|a, b| b.1.cmp(&a.1));
238                        acc.truncate(10);
239                        acc
240                    }),
241            }
242        } else {
243            PerformanceSummary::default()
244        }
245    }
246
247    /// Enable or disable profiling
248    pub fn set_enabled(&self, enabled: bool) {
249        if let Ok(mut inner) = self.inner.write() {
250            inner.config.enabled = enabled;
251            if enabled {
252                info!("Performance profiling enabled");
253            } else {
254                info!("Performance profiling disabled");
255                inner.active_traces.clear();
256            }
257        }
258    }
259
260    /// Check if profiling is enabled
261    pub fn is_enabled(&self) -> bool {
262        if let Ok(inner) = self.inner.read() {
263            inner.config.enabled
264        } else {
265            false
266        }
267    }
268
269    /// Reset all profiling data
270    pub fn reset(&self) {
271        if let Ok(mut inner) = self.inner.write() {
272            inner.profiles.clear();
273            inner.active_traces.clear();
274            inner.start_time = Instant::now();
275            info!("Performance profiling data reset");
276        }
277    }
278
279    /// Clean up old traces
280    fn cleanup_old_traces(&self, inner: &mut ProfilerInner) {
281        let cutoff_time = Instant::now() - Duration::from_secs(300); // 5 minutes
282        let initial_count = inner.active_traces.len();
283
284        inner
285            .active_traces
286            .retain(|_, trace| trace.start_time > cutoff_time);
287
288        let removed = initial_count - inner.active_traces.len();
289        if removed > 0 {
290            warn!("Cleaned up {} stale traces", removed);
291        }
292    }
293
294    /// Export performance data for external analysis
295    pub fn export_data(&self) -> Option<PerformanceExport> {
296        if let Ok(inner) = self.inner.read() {
297            Some(PerformanceExport {
298                profiles: inner.profiles.clone(),
299                summary: self.get_performance_summary(),
300                config: inner.config.clone(),
301                exported_at: std::time::SystemTime::now(),
302            })
303        } else {
304            None
305        }
306    }
307}
308
309/// Handle for an active trace that automatically completes on drop
310pub struct TraceHandle {
311    profiler: PerformanceProfiler,
312    trace_id: Uuid,
313    #[allow(dead_code)]
314    operation: String,
315}
316
317impl TraceHandle {
318    /// Add metadata to the trace
319    pub fn add_metadata(&self, key: String, value: String) {
320        if let Ok(mut inner) = self.profiler.inner.write() {
321            if let Some(trace) = inner.active_traces.get_mut(&self.trace_id) {
322                trace.metadata.insert(key, value);
323            }
324        }
325    }
326
327    /// Complete the trace manually
328    pub fn complete(self, success: bool) {
329        self.profiler.complete_trace(self.trace_id, success, None);
330        std::mem::forget(self); // Prevent drop from running
331    }
332
333    /// Complete the trace with additional metadata
334    pub fn complete_with_metadata(self, success: bool, metadata: HashMap<String, String>) {
335        self.profiler
336            .complete_trace(self.trace_id, success, Some(metadata));
337        std::mem::forget(self);
338    }
339}
340
341impl Drop for TraceHandle {
342    fn drop(&mut self) {
343        // Auto-complete as successful if not manually completed
344        self.profiler.complete_trace(self.trace_id, true, None);
345    }
346}
347
348impl OperationProfile {
349    fn new(name: String) -> Self {
350        Self {
351            name,
352            total_calls: 0,
353            total_duration_ms: 0,
354            avg_duration_ms: 0.0,
355            min_duration_ms: u64::MAX,
356            max_duration_ms: 0,
357            p95_duration_ms: 0,
358            p99_duration_ms: 0,
359            error_count: 0,
360            slow_operations: 0,
361            recent_durations: VecDeque::new(),
362            last_updated: std::time::SystemTime::now(),
363        }
364    }
365
366    fn record_operation(&mut self, duration_ms: u64, success: bool) {
367        self.total_calls += 1;
368        self.total_duration_ms += duration_ms;
369        self.avg_duration_ms = self.total_duration_ms as f64 / self.total_calls as f64;
370
371        self.min_duration_ms = self.min_duration_ms.min(duration_ms);
372        self.max_duration_ms = self.max_duration_ms.max(duration_ms);
373
374        if !success {
375            self.error_count += 1;
376        }
377
378        if duration_ms > 100 {
379            // Slow operation threshold
380            self.slow_operations += 1;
381        }
382
383        // Track recent durations for percentile calculations
384        self.recent_durations.push_back(duration_ms);
385        if self.recent_durations.len() > 100 {
386            self.recent_durations.pop_front();
387        }
388
389        // Update percentiles
390        self.update_percentiles();
391        self.last_updated = std::time::SystemTime::now();
392    }
393
394    fn update_percentiles(&mut self) {
395        if self.recent_durations.is_empty() {
396            return;
397        }
398
399        let mut sorted: Vec<u64> = self.recent_durations.iter().cloned().collect();
400        sorted.sort_unstable();
401
402        let len = sorted.len();
403        if len > 0 {
404            let p95_index = ((len as f64) * 0.95) as usize;
405            let p99_index = ((len as f64) * 0.99) as usize;
406
407            self.p95_duration_ms = sorted[p95_index.min(len - 1)];
408            self.p99_duration_ms = sorted[p99_index.min(len - 1)];
409        }
410    }
411}
412
413#[derive(Debug, Clone, Serialize, Deserialize)]
414pub struct PerformanceSummary {
415    pub total_operations: u64,
416    pub total_errors: u64,
417    pub error_rate: f64,
418    pub avg_response_time_ms: f64,
419    pub total_slow_operations: u64,
420    pub active_traces: usize,
421    pub uptime_seconds: u64,
422    pub operations_per_second: f64,
423    pub top_operations: Vec<(String, u64)>,
424}
425
426impl Default for PerformanceSummary {
427    fn default() -> Self {
428        Self {
429            total_operations: 0,
430            total_errors: 0,
431            error_rate: 0.0,
432            avg_response_time_ms: 0.0,
433            total_slow_operations: 0,
434            active_traces: 0,
435            uptime_seconds: 0,
436            operations_per_second: 0.0,
437            top_operations: Vec::new(),
438        }
439    }
440}
441
442#[derive(Debug, Clone, Serialize, Deserialize)]
443pub struct PerformanceExport {
444    pub profiles: HashMap<String, OperationProfile>,
445    pub summary: PerformanceSummary,
446    pub config: ProfilerConfig,
447    pub exported_at: std::time::SystemTime,
448}
449
450#[cfg(test)]
451mod tests {
452    use super::*;
453    use std::thread;
454    use std::time::Duration;
455
456    #[test]
457    fn test_profiler_creation() {
458        let profiler = PerformanceProfiler::new();
459        assert!(profiler.is_enabled());
460        assert!(profiler.get_all_profiles().is_empty());
461    }
462
463    #[test]
464    fn test_trace_lifecycle() {
465        let profiler = PerformanceProfiler::new();
466
467        let handle = profiler.start_trace("test_operation".to_string()).unwrap();
468        assert_eq!(profiler.get_performance_summary().active_traces, 1);
469
470        // Simulate work
471        thread::sleep(Duration::from_millis(10));
472
473        handle.complete(true);
474        assert_eq!(profiler.get_performance_summary().active_traces, 0);
475        assert_eq!(profiler.get_performance_summary().total_operations, 1);
476    }
477
478    #[test]
479    fn test_auto_complete_on_drop() {
480        let profiler = PerformanceProfiler::new();
481
482        {
483            let _handle = profiler.start_trace("test_operation".to_string()).unwrap();
484            assert_eq!(profiler.get_performance_summary().active_traces, 1);
485            // Handle drops here
486        }
487
488        assert_eq!(profiler.get_performance_summary().active_traces, 0);
489        assert_eq!(profiler.get_performance_summary().total_operations, 1);
490    }
491
492    #[test]
493    fn test_operation_profile_updates() {
494        let profiler = PerformanceProfiler::new();
495
496        // Record multiple operations
497        for i in 0..10 {
498            let handle = profiler.start_trace("test_operation".to_string()).unwrap();
499            thread::sleep(Duration::from_millis(i * 2 + 5)); // Variable duration
500            handle.complete(i % 7 != 0); // Some failures
501        }
502
503        let profile = profiler.get_operation_profile("test_operation").unwrap();
504        assert_eq!(profile.total_calls, 10);
505        assert!(profile.error_count > 0);
506        assert!(profile.avg_duration_ms > 0.0);
507        assert!(profile.max_duration_ms >= profile.min_duration_ms);
508    }
509
510    #[test]
511    fn test_profiler_disable() {
512        let profiler = PerformanceProfiler::new();
513        profiler.set_enabled(false);
514        assert!(!profiler.is_enabled());
515
516        let handle = profiler.start_trace("test_operation".to_string());
517        assert!(handle.is_none());
518    }
519
520    #[test]
521    fn test_performance_summary() {
522        let profiler = PerformanceProfiler::new();
523
524        // Add some operations
525        for _ in 0..5 {
526            let handle = profiler.start_trace("fast_op".to_string()).unwrap();
527            handle.complete(true);
528        }
529
530        for _ in 0..3 {
531            let handle = profiler.start_trace("slow_op".to_string()).unwrap();
532            thread::sleep(Duration::from_millis(150)); // Slow operation
533            handle.complete(true);
534        }
535
536        let summary = profiler.get_performance_summary();
537        assert_eq!(summary.total_operations, 8);
538        assert!(summary.total_slow_operations > 0);
539        assert_eq!(summary.total_errors, 0);
540        // Operations per second might be 0.0 if test runs too quickly
541        assert!(summary.operations_per_second >= 0.0);
542    }
543
544    #[test]
545    fn test_trace_metadata() {
546        let profiler = PerformanceProfiler::new();
547
548        let handle = profiler.start_trace("test_operation".to_string()).unwrap();
549        handle.add_metadata("user_id".to_string(), "12345".to_string());
550        handle.add_metadata("request_size".to_string(), "1024".to_string());
551
552        let mut metadata = HashMap::new();
553        metadata.insert("result_count".to_string(), "42".to_string());
554
555        handle.complete_with_metadata(true, metadata);
556
557        let summary = profiler.get_performance_summary();
558        assert_eq!(summary.total_operations, 1);
559    }
560
561    #[test]
562    fn test_data_export() {
563        let profiler = PerformanceProfiler::new();
564
565        let handle = profiler.start_trace("test_operation".to_string()).unwrap();
566        handle.complete(true);
567
568        let export = profiler.export_data().unwrap();
569        assert!(!export.profiles.is_empty());
570        assert_eq!(export.summary.total_operations, 1);
571    }
572}