Skip to main content

entrenar/monitor/llm/
metrics.rs

1//! LLM call metrics tracking.
2
3use chrono::{DateTime, Utc};
4use serde::{Deserialize, Serialize};
5use std::collections::HashMap;
6
7/// LLM call metrics
8#[derive(Debug, Clone, Serialize, Deserialize)]
9pub struct LLMMetrics {
10    /// Number of tokens in the prompt
11    pub prompt_tokens: u32,
12    /// Number of tokens in the completion
13    pub completion_tokens: u32,
14    /// Total tokens (prompt + completion)
15    pub total_tokens: u32,
16    /// Time to first token in milliseconds
17    pub time_to_first_token_ms: f64,
18    /// Tokens generated per second
19    pub tokens_per_second: f64,
20    /// Total latency in milliseconds
21    pub latency_ms: f64,
22    /// Estimated cost in USD (if known)
23    pub cost_usd: Option<f64>,
24    /// Model name (e.g., "gpt-4", "claude-3-opus")
25    pub model_name: String,
26    /// Timestamp of the call
27    pub timestamp: DateTime<Utc>,
28    /// Optional request ID
29    pub request_id: Option<String>,
30    /// Optional tags
31    pub tags: HashMap<String, String>,
32}
33
34impl LLMMetrics {
35    /// Create new LLM metrics with model name
36    pub fn new(model_name: &str) -> Self {
37        Self {
38            prompt_tokens: 0,
39            completion_tokens: 0,
40            total_tokens: 0,
41            time_to_first_token_ms: 0.0,
42            tokens_per_second: 0.0,
43            latency_ms: 0.0,
44            cost_usd: None,
45            model_name: model_name.to_string(),
46            timestamp: Utc::now(),
47            request_id: None,
48            tags: HashMap::new(),
49        }
50    }
51
52    /// Set token counts
53    pub fn with_tokens(mut self, prompt: u32, completion: u32) -> Self {
54        self.prompt_tokens = prompt;
55        self.completion_tokens = completion;
56        self.total_tokens = prompt + completion;
57        self
58    }
59
60    /// Set latency
61    pub fn with_latency(mut self, latency_ms: f64) -> Self {
62        self.latency_ms = latency_ms;
63        if latency_ms > 0.0 && self.completion_tokens > 0 {
64            self.tokens_per_second = f64::from(self.completion_tokens) / (latency_ms / 1000.0);
65        }
66        self
67    }
68
69    /// Set time to first token
70    pub fn with_ttft(mut self, ttft_ms: f64) -> Self {
71        self.time_to_first_token_ms = ttft_ms;
72        self
73    }
74
75    /// Set cost
76    pub fn with_cost(mut self, cost_usd: f64) -> Self {
77        self.cost_usd = Some(cost_usd);
78        self
79    }
80
81    /// Set request ID
82    pub fn with_request_id(mut self, id: &str) -> Self {
83        self.request_id = Some(id.to_string());
84        self
85    }
86
87    /// Add a tag
88    pub fn with_tag(mut self, key: &str, value: &str) -> Self {
89        self.tags.insert(key.to_string(), value.to_string());
90        self
91    }
92
93    /// Calculate cost based on model pricing (approximate)
94    ///
95    /// N-07 (Meyer DbC): Prices are approximate and may be stale. Order matters:
96    /// more specific patterns (e.g., "gpt-4-turbo") must precede generic ones
97    /// ("gpt-4") to avoid mis-categorization. Unknown models warn and use a
98    /// conservative default.
99    pub fn estimate_cost(&self) -> f64 {
100        // Approximate pricing per 1K tokens (as of late 2024).
101        // Table-driven: (pattern, prompt_price, completion_price).
102        // Order matters — more specific patterns first.
103        const PRICING: &[(&str, f64, f64)] = &[
104            ("gpt-4-turbo", 0.01, 0.03),
105            ("gpt-4o", 0.005, 0.015),
106            ("gpt-4", 0.03, 0.06),
107            ("gpt-3.5", 0.0005, 0.0015),
108            ("claude-3-opus", 0.015, 0.075),
109            ("claude-3-sonnet", 0.003, 0.015),
110            ("claude-3-haiku", 0.00025, 0.00125),
111            ("gemini", 0.00025, 0.0005),
112            ("mistral", 0.0002, 0.0006),
113            ("llama", 0.0002, 0.0006),
114        ];
115
116        let (prompt_price, completion_price) = PRICING
117            .iter()
118            .find(|(pattern, _, _)| self.model_name.contains(pattern))
119            .map_or_else(
120                || {
121                    eprintln!(
122                        "Warning: unknown model '{}' for cost estimation, using conservative default \
123                         ($0.001/$0.002 per 1K tokens)",
124                        self.model_name
125                    );
126                    (0.001, 0.002)
127                },
128                |&(_, p, c)| (p, c),
129            );
130
131        let prompt_cost = (f64::from(self.prompt_tokens) / 1000.0) * prompt_price;
132        let completion_cost = (f64::from(self.completion_tokens) / 1000.0) * completion_price;
133        prompt_cost + completion_cost
134    }
135}
136
#[cfg(test)]
mod tests {
    use super::*;

    /// `true` when `actual` lies strictly within `tolerance` of `expected`.
    fn within(actual: f64, expected: f64, tolerance: f64) -> bool {
        (actual - expected).abs() < tolerance
    }

    /// Estimated cost of a call to `model` with 1K prompt and 1K completion tokens.
    fn cost_of_1k_each(model: &str) -> f64 {
        LLMMetrics::new(model).with_tokens(1000, 1000).estimate_cost()
    }

    #[test]
    fn test_llm_metrics_new() {
        let fresh = LLMMetrics::new("gpt-4");
        assert_eq!(fresh.model_name, "gpt-4");
        assert_eq!(fresh.prompt_tokens, 0);
        assert_eq!(fresh.completion_tokens, 0);
        assert_eq!(fresh.total_tokens, 0);
        assert!(fresh.cost_usd.is_none());
        assert!(fresh.request_id.is_none());
        assert!(fresh.tags.is_empty());
    }

    #[test]
    fn test_llm_metrics_with_tokens() {
        let counted = LLMMetrics::new("gpt-4").with_tokens(100, 50);
        assert_eq!(counted.prompt_tokens, 100);
        assert_eq!(counted.completion_tokens, 50);
        assert_eq!(counted.total_tokens, 150);
    }

    #[test]
    fn test_llm_metrics_with_latency() {
        let timed = LLMMetrics::new("gpt-4").with_tokens(100, 100).with_latency(1000.0);
        assert!(within(timed.latency_ms, 1000.0, 1e-9));
        // 100 completion tokens over 1.0 s => 100 tokens/s
        assert!(within(timed.tokens_per_second, 100.0, 1e-6));
    }

    #[test]
    fn test_llm_metrics_with_latency_zero() {
        let timed = LLMMetrics::new("gpt-4").with_tokens(100, 100).with_latency(0.0);
        assert!(within(timed.latency_ms, 0.0, 1e-9));
        // Zero latency must not produce a throughput figure.
        assert!(within(timed.tokens_per_second, 0.0, 1e-9));
    }

    #[test]
    fn test_llm_metrics_with_ttft() {
        let timed = LLMMetrics::new("gpt-4").with_ttft(150.0);
        assert!(within(timed.time_to_first_token_ms, 150.0, 1e-9));
    }

    #[test]
    fn test_llm_metrics_with_cost() {
        let priced = LLMMetrics::new("gpt-4").with_cost(0.05);
        assert_eq!(priced.cost_usd, Some(0.05));
    }

    #[test]
    fn test_llm_metrics_with_request_id() {
        let identified = LLMMetrics::new("gpt-4").with_request_id("req-12345");
        assert_eq!(identified.request_id.as_deref(), Some("req-12345"));
    }

    #[test]
    fn test_llm_metrics_with_tag() {
        let tagged = LLMMetrics::new("gpt-4")
            .with_tag("environment", "production")
            .with_tag("user_id", "user123");
        let tag = |key: &str| tagged.tags.get(key).map(String::as_str);
        assert_eq!(tag("environment"), Some("production"));
        assert_eq!(tag("user_id"), Some("user123"));
    }

    #[test]
    fn test_llm_metrics_estimate_cost_gpt4() {
        // GPT-4: 0.03 per 1K prompt + 0.06 per 1K completion = 0.09
        assert!(within(cost_of_1k_each("gpt-4"), 0.09, 0.001));
    }

    #[test]
    fn test_llm_metrics_estimate_cost_gpt4_turbo() {
        // GPT-4-turbo: 0.01 + 0.03 = 0.04
        assert!(within(cost_of_1k_each("gpt-4-turbo"), 0.04, 0.001));
    }

    #[test]
    fn test_llm_metrics_estimate_cost_gpt35() {
        // GPT-3.5: 0.0005 + 0.0015 = 0.002
        assert!(within(cost_of_1k_each("gpt-3.5-turbo"), 0.002, 0.0001));
    }

    #[test]
    fn test_llm_metrics_estimate_cost_claude_opus() {
        // Claude-3-opus: 0.015 + 0.075 = 0.09
        assert!(within(cost_of_1k_each("claude-3-opus"), 0.09, 0.001));
    }

    #[test]
    fn test_llm_metrics_estimate_cost_claude_sonnet() {
        // Claude-3-sonnet: 0.003 + 0.015 = 0.018
        assert!(within(cost_of_1k_each("claude-3-sonnet"), 0.018, 0.001));
    }

    #[test]
    fn test_llm_metrics_estimate_cost_claude_haiku() {
        // Claude-3-haiku: 0.00025 + 0.00125 = 0.0015
        assert!(within(cost_of_1k_each("claude-3-haiku"), 0.0015, 0.0001));
    }

    #[test]
    fn test_llm_metrics_estimate_cost_unknown_model() {
        // Fallback pricing: 0.001 + 0.002 = 0.003
        assert!(within(cost_of_1k_each("some-unknown-model"), 0.003, 0.001));
    }

    #[test]
    fn test_llm_metrics_clone() {
        let original = LLMMetrics::new("gpt-4").with_tokens(100, 50).with_latency(500.0);
        let copy = original.clone();
        assert_eq!(original.model_name, copy.model_name);
        assert_eq!(original.prompt_tokens, copy.prompt_tokens);
    }

    #[test]
    fn test_llm_metrics_serde() {
        let original =
            LLMMetrics::new("gpt-4").with_tokens(100, 50).with_latency(500.0).with_cost(0.01);

        // Round-trip through JSON and compare a representative subset of fields.
        let encoded = serde_json::to_string(&original).expect("JSON serialization should succeed");
        let decoded: LLMMetrics =
            serde_json::from_str(&encoded).expect("JSON deserialization should succeed");

        assert_eq!(original.model_name, decoded.model_name);
        assert_eq!(original.prompt_tokens, decoded.prompt_tokens);
        assert_eq!(original.cost_usd, decoded.cost_usd);
    }

    #[test]
    fn test_llm_metrics_debug() {
        let rendered = format!("{:?}", LLMMetrics::new("gpt-4"));
        assert!(rendered.contains("LLMMetrics"));
        assert!(rendered.contains("gpt-4"));
    }

    #[test]
    fn test_llm_metrics_chained_builders() {
        let built = LLMMetrics::new("claude-3-opus")
            .with_tokens(500, 200)
            .with_latency(2000.0)
            .with_ttft(100.0)
            .with_cost(0.05)
            .with_request_id("req-abc")
            .with_tag("feature", "summarization");

        assert_eq!(built.model_name, "claude-3-opus");
        assert_eq!(built.total_tokens, 700);
        assert!(within(built.latency_ms, 2000.0, 1e-9));
        assert!(within(built.time_to_first_token_ms, 100.0, 1e-9));
        assert_eq!(built.cost_usd, Some(0.05));
        assert_eq!(built.request_id.as_deref(), Some("req-abc"));
        assert_eq!(built.tags.get("feature").map(String::as_str), Some("summarization"));
    }

    // =========================================================================
    // FALSIFY tests — contract violation sweep (N-07)
    // =========================================================================

    #[test]
    fn test_falsify_n07_gpt4_turbo_before_gpt4() {
        // N-07: "gpt-4-turbo-preview" must land in the "gpt-4-turbo" tier ($0.01),
        // not the "gpt-4" tier ($0.03). Order of pricing-table entries matters.
        let turbo_cost = LLMMetrics::new("gpt-4-turbo-preview").with_tokens(1000, 0).estimate_cost();
        let base_cost = LLMMetrics::new("gpt-4-0613").with_tokens(1000, 0).estimate_cost();

        assert!(
            turbo_cost < base_cost,
            "gpt-4-turbo-preview ({turbo_cost}) must be cheaper than gpt-4 ({base_cost})"
        );
    }

    #[test]
    fn test_falsify_n07_gpt4o_distinct_from_gpt4() {
        // N-07: "gpt-4o" must hit its own tier rather than falling through to "gpt-4".
        let gpt4o_cost = cost_of_1k_each("gpt-4o-2024-05-13");
        let gpt4_cost = cost_of_1k_each("gpt-4-0613");

        assert!(
            gpt4o_cost < gpt4_cost,
            "gpt-4o ({gpt4o_cost}) must be cheaper than gpt-4 ({gpt4_cost})"
        );
    }

    #[test]
    fn test_falsify_n07_unknown_model_uses_conservative_default() {
        // N-07: Unknown models must be priced at the conservative default, never $0.
        let cost = cost_of_1k_each("totally-unknown-model-v9");

        assert!(cost > 0.0, "Unknown model cost must be > 0, got {cost}");
        // Conservative default: $0.001 prompt + $0.002 completion = $0.003 per 1K
        assert!((cost - 0.003).abs() < 1e-6, "Expected conservative default ~$0.003, got {cost}");
    }

    #[test]
    fn test_estimate_cost_all_model_variants() {
        // (model_name, expected total cost for 1K prompt + 1K completion tokens)
        let models = [
            ("gpt-4-turbo-preview", 0.01 + 0.03),
            ("gpt-4o-2024-05-13", 0.005 + 0.015),
            ("gpt-4-0613", 0.03 + 0.06),
            ("gpt-3.5-turbo", 0.0005 + 0.0015),
            ("claude-3-opus-20240229", 0.015 + 0.075),
            ("claude-3-sonnet-20240229", 0.003 + 0.015),
            ("claude-3-haiku-20240307", 0.00025 + 0.00125),
            ("gemini-pro", 0.00025 + 0.0005),
            ("mistral-medium", 0.0002 + 0.0006),
            ("llama-3-70b", 0.0002 + 0.0006),
            ("unknown-model", 0.001 + 0.002),
        ];

        for (model_name, expected_cost) in models {
            let cost = cost_of_1k_each(model_name);
            assert!(
                within(cost, expected_cost, 1e-6),
                "cost mismatch for {model_name}: got {cost}, expected {expected_cost}"
            );
        }
    }
}