// ai_lib/metrics.rs

//! 指标收集模块，提供可插拔的性能监控和统计功能
//!
//! Metrics collection module providing pluggable performance monitoring and statistics.
//!
//! This module defines the [`Metrics`] trait for collecting performance data,
//! usage statistics, and error rates from AI provider interactions.
//!
//! The trait is simple and injectable, used by adapters/clients. The surface is
//! kept minimal: counters, gauges, histograms, and an RAII-style timer helper.
11use async_trait::async_trait;
#[async_trait]
pub trait Metrics: Send + Sync + 'static {
    /// Increment a named counter by `value`.
    async fn incr_counter(&self, name: &str, value: u64);

    /// Record an instantaneous gauge value for `name`.
    async fn record_gauge(&self, name: &str, value: f64);

    /// Start a timer for a named operation.
    ///
    /// Returns a boxed [`Timer`] which should be stopped (via [`Timer::stop`])
    /// when the operation completes. Implementations may return `None` if
    /// timers aren't supported (e.g. [`NoopMetrics`] always returns `None`).
    async fn start_timer(&self, name: &str) -> Option<Box<dyn Timer + Send>>;

    /// Record a single observation in the named histogram.
    async fn record_histogram(&self, name: &str, value: f64);

    /// Record a histogram observation with tags/labels attached.
    async fn record_histogram_with_tags(&self, name: &str, value: f64, tags: &[(&str, &str)]);

    /// Increment a counter with tags/labels attached.
    async fn incr_counter_with_tags(&self, name: &str, value: u64, tags: &[(&str, &str)]);

    /// Record a gauge with tags/labels attached.
    async fn record_gauge_with_tags(&self, name: &str, value: f64, tags: &[(&str, &str)]);

    /// Record an error occurrence, classified by `error_type`.
    async fn record_error(&self, name: &str, error_type: &str);

    /// Record a success/failure boolean outcome for `name`.
    async fn record_success(&self, name: &str, success: bool);
}
42
/// Timer interface returned by [`Metrics::start_timer`].
///
/// `stop` consumes the boxed timer, so it can be called at most once on a
/// `Box<dyn Timer>`. NOTE(review): whether dropping a timer *without* calling
/// `stop` still records anything is implementation-defined — this trait
/// imposes no `Drop` behavior.
pub trait Timer: Send {
    /// Stop the timer and record the measured duration.
    fn stop(self: Box<Self>);
}
48
/// No-op metrics implementation suitable as a default.
///
/// Every trait method does nothing and `start_timer` returns `None` (see the
/// `impl Metrics` below). It is a zero-sized unit struct, so deriving
/// `Debug`/`Clone`/`Copy` is free and makes the type usable in logs and in
/// by-value contexts — public types should almost always be `Debug`.
#[derive(Debug, Clone, Copy)]
pub struct NoopMetrics;
51
#[async_trait]
impl Metrics for NoopMetrics {
    // Every operation is intentionally a no-op: values are dropped on the
    // floor and nothing is recorded anywhere.
    async fn incr_counter(&self, _name: &str, _value: u64) {}
    async fn record_gauge(&self, _name: &str, _value: f64) {}
    async fn start_timer(&self, _name: &str) -> Option<Box<dyn Timer + Send>> {
        // No timer support: callers get `None` rather than a dummy timer.
        None
    }
    async fn record_histogram(&self, _name: &str, _value: f64) {}
    async fn record_histogram_with_tags(&self, _name: &str, _value: f64, _tags: &[(&str, &str)]) {}
    async fn incr_counter_with_tags(&self, _name: &str, _value: u64, _tags: &[(&str, &str)]) {}
    async fn record_gauge_with_tags(&self, _name: &str, _value: f64, _tags: &[(&str, &str)]) {}
    async fn record_error(&self, _name: &str, _error_type: &str) {}
    async fn record_success(&self, _name: &str, _success: bool) {}
}
66
/// A no-op timer (returned when `start_timer` implementations want to return
/// a concrete value rather than `None`).
pub struct NoopTimer;
impl Timer for NoopTimer {
    // Stopping a no-op timer records nothing.
    fn stop(self: Box<Self>) {}
}
72
73impl NoopMetrics {
74    pub fn new() -> Self {
75        NoopMetrics
76    }
77}
78
79impl Default for NoopMetrics {
80    fn default() -> Self {
81        Self::new()
82    }
83}
84
85/// Convenience methods for common metric patterns
86#[allow(async_fn_in_trait)]
87pub trait MetricsExt: Metrics {
88    /// Record a request with timing and success/failure
89    async fn record_request(
90        &self,
91        name: &str,
92        timer: Option<Box<dyn Timer + Send>>,
93        success: bool,
94    ) {
95        if let Some(t) = timer {
96            t.stop();
97        }
98        self.record_success(name, success).await;
99    }
100
101    /// Record a request with timing, success/failure, and tags
102    async fn record_request_with_tags(
103        &self,
104        name: &str,
105        timer: Option<Box<dyn Timer + Send>>,
106        success: bool,
107        tags: &[(&str, &str)],
108    ) {
109        if let Some(t) = timer {
110            t.stop();
111        }
112        self.record_success(name, success).await;
113        // Record additional metrics with tags
114        self.incr_counter_with_tags(&format!("{}.total", name), 1, tags)
115            .await;
116        if success {
117            self.incr_counter_with_tags(&format!("{}.success", name), 1, tags)
118                .await;
119        } else {
120            self.incr_counter_with_tags(&format!("{}.failure", name), 1, tags)
121                .await;
122        }
123    }
124
125    /// Record an error with context
126    async fn record_error_with_context(&self, name: &str, error_type: &str, context: &str) {
127        self.record_error(name, error_type).await;
128        self.incr_counter_with_tags(name, 1, &[("error_type", error_type), ("context", context)])
129            .await;
130    }
131
132    /// Record a complete request with timing, status, and success metrics
133    async fn record_complete_request(
134        &self,
135        name: &str,
136        timer: Option<Box<dyn Timer + Send>>,
137        status_code: u16,
138        success: bool,
139        tags: &[(&str, &str)],
140    ) {
141        if let Some(t) = timer {
142            t.stop();
143        }
144
145        // Record basic metrics
146        self.record_success(name, success).await;
147        self.record_gauge_with_tags(&format!("{}.status_code", name), status_code as f64, tags)
148            .await;
149
150        // Record counters
151        self.incr_counter_with_tags(&format!("{}.total", name), 1, tags)
152            .await;
153        if success {
154            self.incr_counter_with_tags(&format!("{}.success", name), 1, tags)
155                .await;
156        } else {
157            self.incr_counter_with_tags(&format!("{}.failure", name), 1, tags)
158                .await;
159        }
160    }
161
162    /// Record latency percentiles for a batch of measurements
163    async fn record_batch_latency_percentiles(
164        &self,
165        name: &str,
166        measurements: &[f64],
167        tags: &[(&str, &str)],
168    ) {
169        if measurements.is_empty() {
170            return;
171        }
172
173        let mut sorted = measurements.to_vec();
174        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
175
176        let len = sorted.len();
177        let p50 = sorted[(len * 50 / 100).min(len - 1)];
178        let p95 = sorted[(len * 95 / 100).min(len - 1)];
179        let p99 = sorted[(len * 99 / 100).min(len - 1)];
180
181        self.record_gauge_with_tags(&format!("{}.latency.p50", name), p50, tags)
182            .await;
183        self.record_gauge_with_tags(&format!("{}.latency.p95", name), p95, tags)
184            .await;
185        self.record_gauge_with_tags(&format!("{}.latency.p99", name), p99, tags)
186            .await;
187    }
188}
189
// Blanket implementation: every `Metrics` implementor gets `MetricsExt` for free.
impl<T: Metrics> MetricsExt for T {}
191
/// Centralized metric key helpers to standardize naming across adapters.
///
/// All provider-scoped keys follow the `{provider}.{metric}` convention;
/// routing keys follow `routing.{route}.{metric}`.
pub mod keys {
    /// Per-provider request counter key.
    pub fn requests(provider: &str) -> String {
        format!("{provider}.requests")
    }
    /// Per-provider request duration timer key (milliseconds).
    pub fn request_duration_ms(provider: &str) -> String {
        format!("{provider}.request_duration_ms")
    }
    /// Explicit success counter key (success ratio can be derived from these).
    pub fn success(provider: &str) -> String {
        format!("{provider}.success")
    }
    /// Explicit failure counter key.
    pub fn failure(provider: &str) -> String {
        format!("{provider}.failure")
    }

    /// 50th-percentile latency key.
    pub fn latency_p50(provider: &str) -> String {
        format!("{provider}.latency_p50")
    }
    /// 95th-percentile latency key.
    pub fn latency_p95(provider: &str) -> String {
        format!("{provider}.latency_p95")
    }
    /// 99th-percentile latency key.
    pub fn latency_p99(provider: &str) -> String {
        format!("{provider}.latency_p99")
    }

    /// Status-code distribution key.
    pub fn status_codes(provider: &str) -> String {
        format!("{provider}.status_codes")
    }

    /// Error-rate key.
    pub fn error_rate(provider: &str) -> String {
        format!("{provider}.error_rate")
    }

    /// Throughput key (requests per second).
    pub fn throughput(provider: &str) -> String {
        format!("{provider}.throughput")
    }

    /// Total cost key (USD).
    pub fn cost_usd(provider: &str) -> String {
        format!("{provider}.cost_usd")
    }
    /// Per-request cost key.
    pub fn cost_per_request(provider: &str) -> String {
        format!("{provider}.cost_per_request")
    }
    /// Input token count key.
    pub fn tokens_input(provider: &str) -> String {
        format!("{provider}.tokens_input")
    }
    /// Output token count key.
    pub fn tokens_output(provider: &str) -> String {
        format!("{provider}.tokens_output")
    }

    /// Routing: requests observed for a route.
    pub fn routing_requests(route: &str) -> String {
        format!("routing.{route}.requests")
    }
    /// Routing: times a route was selected.
    pub fn routing_selected(route: &str) -> String {
        format!("routing.{route}.selected")
    }
    /// Routing: health-check failures for a route.
    pub fn routing_health_fail(route: &str) -> String {
        format!("routing.{route}.health_fail")
    }
}
261
/// Minimal cost accounting helper (feature-gated).
#[cfg(feature = "cost_metrics")]
pub mod cost {
    use crate::metrics::Metrics;

    /// Read a per-1K-token USD rate from the environment.
    ///
    /// Returns 0.0 when the variable is unset or fails to parse as `f64`.
    fn rate_per_1k(env_key: &str) -> f64 {
        std::env::var(env_key)
            .ok()
            .and_then(|raw| raw.parse::<f64>().ok())
            .unwrap_or(0.0)
    }

    /// Compute cost using env vars `COST_INPUT_PER_1K` and
    /// `COST_OUTPUT_PER_1K` (both USD per 1000 tokens).
    pub fn estimate_usd(input_tokens: u32, output_tokens: u32) -> f64 {
        let input_rate = rate_per_1k("COST_INPUT_PER_1K");
        let output_rate = rate_per_1k("COST_OUTPUT_PER_1K");
        f64::from(input_tokens) / 1000.0 * input_rate
            + f64::from(output_tokens) / 1000.0 * output_rate
    }

    /// Report cost via [`Metrics`] as a histogram (`cost.usd`) tagged with
    /// provider and model.
    pub async fn record_cost<M: Metrics + ?Sized>(m: &M, provider: &str, model: &str, usd: f64) {
        let tags = [("provider", provider), ("model", model)];
        m.record_histogram_with_tags("cost.usd", usd, &tags).await;
    }
}
286
287// Environment variables for optional features
288//
289// cost_metrics (if enabled):
290// - COST_INPUT_PER_1K: USD per 1000 input tokens
291// - COST_OUTPUT_PER_1K: USD per 1000 output tokens
292//
293// Note: In enterprise deployments (ai-lib PRO), these can be centrally managed
294// and hot-reloaded via external configuration providers.