Skip to main content

vtcode_core/tools/
health.rs

1use crate::types::CompactStr;
2use hashbrown::HashMap;
3use parking_lot::RwLock;
4use smallvec::SmallVec;
5use std::collections::VecDeque;
6use std::sync::Arc;
7use std::time::Duration;
8
/// One recorded tool invocation, as stored in a tool's sliding history window.
#[derive(Debug, Clone, Copy, PartialEq)]
struct ExecutionResult {
    /// `true` when the invocation succeeded, `false` when it failed.
    success: bool,
    /// How long the invocation took, in milliseconds.
    latency_ms: f64,
}
15
16/// Metrics for a single tool.
17#[derive(Debug, Clone, Default)]
18pub struct ToolStats {
19    pub success_count: u64,
20    pub failure_count: u64,
21    pub total_count: u64,
22    pub consecutive_failures: u64,
23    pub avg_latency_ms: f64,
24    /// Number of failures in the recent history window.
25    pub recent_failure_count: u64,
26    /// Sliding window of recent executions (last N).
27    recent_history: VecDeque<ExecutionResult>,
28}
29
30/// Tracks health and performance of tools with sliding window.
31pub struct ToolHealthTracker {
32    stats: Arc<RwLock<HashMap<CompactStr, ToolStats>>>,
33    failure_threshold: u64,
34    window_size: usize,
35}
36
37impl ToolHealthTracker {
38    /// Create a new health tracker.
39    /// failure_threshold: number of consecutive failures before marking as unhealthy.
40    pub fn new(failure_threshold: u64) -> Self {
41        Self {
42            stats: Arc::new(RwLock::new(HashMap::new())),
43            failure_threshold,
44            window_size: 20, // Track last 20 executions for current health
45        }
46    }
47
48    /// Set the tracking window size.
49    pub fn set_window_size(&mut self, size: usize) {
50        self.window_size = size;
51    }
52
53    /// Record a tool execution result.
54    pub fn record_execution(&self, tool_name: &str, success: bool, latency: Duration) {
55        let mut stats_map = self.stats.write();
56
57        // Optimized lookup: only clone if inserting a new entry
58        let tool_stats = if let Some(stats) = stats_map.get_mut(tool_name) {
59            stats
60        } else {
61            stats_map.entry(CompactStr::from(tool_name)).or_default()
62        };
63
64        let latency_ms = latency.as_secs_f64() * 1000.0;
65
66        // Update lifetime stats
67        tool_stats.total_count += 1;
68
69        // Weighted average for lifetime latency (simple decay)
70        if tool_stats.total_count == 1 {
71            tool_stats.avg_latency_ms = latency_ms;
72        } else {
73            let n = tool_stats.total_count as f64;
74            tool_stats.avg_latency_ms =
75                tool_stats.avg_latency_ms * ((n - 1.0) / n) + latency_ms / n;
76        }
77
78        if success {
79            tool_stats.success_count += 1;
80            tool_stats.consecutive_failures = 0;
81        } else {
82            tool_stats.failure_count += 1;
83            tool_stats.consecutive_failures += 1;
84            tool_stats.recent_failure_count += 1;
85        }
86
87        // Update sliding window
88        tool_stats.recent_history.push_back(ExecutionResult {
89            success,
90            latency_ms,
91        });
92
93        if tool_stats.recent_history.len() > self.window_size
94            && let Some(removed) = tool_stats.recent_history.pop_front()
95            && !removed.success
96        {
97            tool_stats.recent_failure_count = tool_stats.recent_failure_count.saturating_sub(1);
98        }
99    }
100
101    /// Check if a tool is considered healthy.
102    pub fn is_healthy(&self, tool_name: &str) -> bool {
103        self.check_health(tool_name).0
104    }
105
106    /// Check health and returns (is_healthy, reason)
107    pub fn check_health(&self, tool_name: &str) -> (bool, Option<String>) {
108        let stats_map = self.stats.read();
109        if let Some(stats) = stats_map.get(tool_name) {
110            // Criterion 1: Consecutive failures (Circuit Breaker)
111            if stats.consecutive_failures >= self.failure_threshold {
112                return (
113                    false,
114                    Some(format!(
115                        "{} consecutive failures",
116                        stats.consecutive_failures
117                    )),
118                );
119            }
120
121            // Criterion 2: Recent error rate (Degradation).
122            // Only enforce if we have enough data (at least half window).
123            let history_len = stats.recent_history.len();
124            if history_len >= 5 {
125                let failure_rate = stats.recent_failure_count as f64 / history_len as f64;
126                if failure_rate > 0.6 {
127                    return (
128                        false,
129                        Some(format!(
130                            "High recent failure rate: {:.1}%",
131                            failure_rate * 100.0
132                        )),
133                    );
134                }
135            }
136        }
137        (true, None)
138    }
139
140    /// Get latency stats (avg, p95 estimate)
141    pub fn get_latency_stats(&self, tool: &str) -> Option<(f64, f64)> {
142        let map = self.stats.read();
143        let stats = map.get(tool)?;
144
145        // Simple average
146        let avg = stats.avg_latency_ms;
147
148        // Use recent window for "current" latency if available, else lifetime
149        if stats.recent_history.is_empty() {
150            return Some((avg, avg));
151        }
152
153        // Use SmallVec to avoid heap allocation for common window sizes (e.g., 20).
154        let mut sorted = SmallVec::<[f64; 32]>::new();
155        sorted.extend(stats.recent_history.iter().map(|r| r.latency_ms));
156        sorted.sort_unstable_by(f64::total_cmp);
157
158        let p95_idx = ((sorted.len() as f64 * 0.95).ceil() as usize).saturating_sub(1);
159        let p95 = sorted.get(p95_idx).copied().unwrap_or(avg);
160
161        Some((avg, p95))
162    }
163    /// Get snapshot of all tool stats
164    pub fn get_all_tool_stats(&self) -> Vec<(CompactStr, ToolStats)> {
165        self.stats
166            .read()
167            .iter()
168            .map(|(k, v)| (k.clone(), v.clone()))
169            .collect()
170    }
171}
172
173impl Default for ToolHealthTracker {
174    fn default() -> Self {
175        Self::new(50)
176    }
177}