harn_vm/llm/api/
telemetry.rs

1//! Per-call provider telemetry envelope.
2//!
3//! Local runtimes (Ollama in particular) report enough server-side timing
4//! information to diagnose slow runs without scraping daemon logs: cold load
5//! vs steady state, prefill vs generation, and tokens-per-second ratios. The
6//! Anthropic and OpenAI hosted APIs don't expose comparable metrics, but the
7//! local runtimes Harn cares most about (Ollama, llama.cpp, MLX, vLLM) all
8//! ship at least a partial subset. This module normalizes whatever the
9//! provider reports into one stable envelope and represents missing fields
10//! as `None` so downstream evals can distinguish "not reported" from "zero".
11//!
12//! Conventions:
13//! - Durations are milliseconds (Ollama reports nanoseconds; we convert).
14//! - Token counts are signed `i64` to match the rest of `LlmResult`.
15//! - `source` identifies which wire format the values were lifted from so
16//!   eval scripts can route on it without re-deriving provider names.
17
18use crate::value::VmDictExt;
19
20use crate::value::VmValue;
21
/// Wire-format identifiers for `ProviderTelemetry::source`. Keep these in
/// sync with the matching strings in `docs/src/observability/*` and the
/// host eval aggregator.
pub mod source {
    /// Ollama `/api/chat` NDJSON stream — full timing breakdown.
    pub const OLLAMA_CHAT: &str = "ollama_chat";
    /// Ollama `/api/generate` (raw) — full timing breakdown.
    pub const OLLAMA_GENERATE: &str = "ollama_generate";
    /// OpenAI-style `usage` block (prompt/completion tokens, optional cache
    /// details). No server-side timings unless the runtime extends the
    /// schema.
    pub const OPENAI_USAGE: &str = "openai_usage";
    /// llama.cpp `usage.timings` extension (`prompt_ms`, `predicted_ms`,
    /// `predicted_n`, `prompt_n`, ...). Preserved verbatim from the
    /// non-OpenAI subset.
    pub const LLAMACPP_TIMINGS: &str = "llamacpp_timings";
    /// Anthropic Messages API — usage counts only; no timings.
    pub const ANTHROPIC_USAGE: &str = "anthropic_usage";
    /// Google Gemini `usageMetadata` block from `generateContent`.
    pub const GEMINI_USAGE: &str = "gemini_usage";
    /// Provider responded but we did not capture anything beyond what
    /// already lives on `LlmResult` (e.g. mock / fake providers, or a
    /// stream that finished without a usage frame).
    pub const UNKNOWN: &str = "unknown";
}
47
/// Whole milliseconds elapsed since `started`, saturating at `u64::MAX`
/// instead of wrapping if the `u128` millisecond count does not fit.
pub(crate) fn elapsed_ms(started: std::time::Instant) -> u64 {
    let millis = started.elapsed().as_millis();
    if millis > u128::from(u64::MAX) {
        u64::MAX
    } else {
        millis as u64
    }
}
51
/// Provider-side timing and runtime accounting captured per LLM call.
///
/// All fields default to `None` / empty. Producers fill in what they can
/// extract and leave the rest absent; consumers must treat missing fields as
/// "not reported by this provider", not "zero".
///
/// Serialization mirrors that contract: an empty `source` and every `None`
/// field are skipped entirely rather than emitted as `""` / `null`.
#[derive(Clone, Debug, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct ProviderTelemetry {
    /// Wire format the values came from (`ollama_chat`, `openai_usage`, ...).
    /// See [`source`] for the canonical strings. Empty when no telemetry was
    /// captured.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub source: String,
    /// Total server-side wall clock in milliseconds (Ollama
    /// `total_duration`). For llama.cpp `timings` this is derived as
    /// prefill + generation time and only set when that sum is non-zero.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub server_total_ms: Option<u64>,
    /// Time the server spent loading/warming the model (Ollama
    /// `load_duration`). Useful for detecting cold-start latency.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub server_load_ms: Option<u64>,
    /// Prompt-prefill time in milliseconds (Ollama `prompt_eval_duration`,
    /// llama.cpp `timings.prompt_ms`). This is the field evals key on for
    /// prefill regression detection.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub server_prompt_eval_ms: Option<u64>,
    /// Generation/decode time in milliseconds (Ollama `eval_duration`,
    /// llama.cpp `timings.predicted_ms`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub server_generation_ms: Option<u64>,
    /// Tokens the server reports it prefilled (Ollama `prompt_eval_count`,
    /// OpenAI-style `prompt_tokens` / `input_tokens`, llama.cpp
    /// `timings.prompt_n`, Gemini `promptTokenCount`).
    /// Distinct from `LlmResult::input_tokens` because hosted providers
    /// frequently bill different token boundaries than the on-device
    /// tokenizer reports.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub server_prompt_tokens: Option<i64>,
    /// Tokens the server reports it generated (Ollama `eval_count`,
    /// OpenAI-style `completion_tokens` / `output_tokens`, llama.cpp
    /// `timings.predicted_n`, Gemini `candidatesTokenCount`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub server_output_tokens: Option<i64>,
    /// Client-side wall clock around the HTTP request. Includes network and
    /// streaming latency the server-side counters omit. Recorded for every
    /// call regardless of provider.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub client_wall_ms: Option<u64>,
    /// Context window the model was loaded with (where the runtime
    /// reports it; `/api/ps` for Ollama).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub runtime_context_length: Option<u64>,
    /// Exact model id the server resolved. May differ from
    /// `LlmResult::model` when an alias / digest is rewritten upstream.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub runtime_loaded_model: Option<String>,
    /// Total resident bytes for the loaded model (Ollama `/api/ps.size`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub runtime_memory_bytes: Option<u64>,
    /// VRAM bytes for the loaded model (Ollama `/api/ps.size_vram`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub runtime_memory_vram_bytes: Option<u64>,
    /// When the server will unload the model (Ollama `/api/ps.expires_at`).
    /// Stored as the string the server sent; it is not parsed here.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub runtime_keep_alive_until: Option<String>,
    /// Provider-supplied request id (`x-request-id` / `request_id`).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub request_id: Option<String>,
}
114
115impl ProviderTelemetry {
116    pub fn new(source: &str) -> Self {
117        Self {
118            source: source.to_string(),
119            ..Self::default()
120        }
121    }
122
123    /// Returns `true` when no meaningful telemetry was captured. A bare
124    /// `client_wall_ms` is still "meaningful" — it lets evals reason about
125    /// per-call latency even for providers that report nothing else.
126    pub fn is_empty(&self) -> bool {
127        let Self {
128            source,
129            server_total_ms,
130            server_load_ms,
131            server_prompt_eval_ms,
132            server_generation_ms,
133            server_prompt_tokens,
134            server_output_tokens,
135            client_wall_ms,
136            runtime_context_length,
137            runtime_loaded_model,
138            runtime_memory_bytes,
139            runtime_memory_vram_bytes,
140            runtime_keep_alive_until,
141            request_id,
142        } = self;
143        source.is_empty()
144            && server_total_ms.is_none()
145            && server_load_ms.is_none()
146            && server_prompt_eval_ms.is_none()
147            && server_generation_ms.is_none()
148            && server_prompt_tokens.is_none()
149            && server_output_tokens.is_none()
150            && client_wall_ms.is_none()
151            && runtime_context_length.is_none()
152            && runtime_loaded_model.is_none()
153            && runtime_memory_bytes.is_none()
154            && runtime_memory_vram_bytes.is_none()
155            && runtime_keep_alive_until.is_none()
156            && request_id.is_none()
157    }
158
159    /// Convert nanoseconds (Ollama's reporting unit) to milliseconds with
160    /// integer rounding. Centralized so every Ollama timing field uses the
161    /// same conversion and zero-vs-None semantics line up.
162    pub fn ns_to_ms(ns: u64) -> u64 {
163        // Use floor division (the conversion is approximate by design); when
164        // the upstream reports 0 ns we want a 0 ms entry rather than None,
165        // so callers should pass through `Some(ns_to_ms(0))` consciously.
166        ns / 1_000_000
167    }
168
169    /// Extract Ollama-shape telemetry from a `done=true` chat or generate
170    /// frame. Returns a populated [`ProviderTelemetry`] whose `source` is
171    /// the caller-provided wire identifier.
172    pub fn from_ollama_done(frame: &serde_json::Value, source: &str) -> Self {
173        let mut telemetry = Self::new(source);
174        telemetry.server_total_ms = ns_field(frame, "total_duration");
175        telemetry.server_load_ms = ns_field(frame, "load_duration");
176        telemetry.server_prompt_eval_ms = ns_field(frame, "prompt_eval_duration");
177        telemetry.server_generation_ms = ns_field(frame, "eval_duration");
178        telemetry.server_prompt_tokens = frame
179            .get("prompt_eval_count")
180            .and_then(serde_json::Value::as_i64);
181        telemetry.server_output_tokens =
182            frame.get("eval_count").and_then(serde_json::Value::as_i64);
183        if let Some(model) = frame.get("model").and_then(serde_json::Value::as_str) {
184            telemetry.runtime_loaded_model = Some(model.to_string());
185        }
186        telemetry
187    }
188
189    /// Extract OpenAI-shape `usage` telemetry. Most OpenAI-compatible local
190    /// runtimes only report counts; llama.cpp's `usage.timings` extension is
191    /// folded in here as well so a single envelope captures both shapes.
192    pub fn from_openai_usage(usage: &serde_json::Value, request_id: Option<&str>) -> Self {
193        let mut telemetry = Self::new(source::OPENAI_USAGE);
194        telemetry.server_prompt_tokens = usage
195            .get("prompt_tokens")
196            .or_else(|| usage.get("input_tokens"))
197            .and_then(serde_json::Value::as_i64);
198        telemetry.server_output_tokens = usage
199            .get("completion_tokens")
200            .or_else(|| usage.get("output_tokens"))
201            .and_then(serde_json::Value::as_i64);
202        if let Some(timings) = usage.get("timings").filter(|value| value.is_object()) {
203            telemetry.source = source::LLAMACPP_TIMINGS.to_string();
204            telemetry.server_prompt_eval_ms = ms_or_round(timings.get("prompt_ms"));
205            telemetry.server_generation_ms = ms_or_round(timings.get("predicted_ms"));
206            // llama.cpp also reports `prompt_n` / `predicted_n` here when
207            // its usage breakdown is enabled; prefer them over the
208            // legacy top-level counts so prefill cache hits surface
209            // correctly.
210            if let Some(prefill) = timings.get("prompt_n").and_then(serde_json::Value::as_i64) {
211                telemetry.server_prompt_tokens = Some(prefill);
212            }
213            if let Some(predicted) = timings
214                .get("predicted_n")
215                .and_then(serde_json::Value::as_i64)
216            {
217                telemetry.server_output_tokens = Some(predicted);
218            }
219            let total = telemetry
220                .server_prompt_eval_ms
221                .unwrap_or(0)
222                .saturating_add(telemetry.server_generation_ms.unwrap_or(0));
223            if total > 0 {
224                telemetry.server_total_ms = Some(total);
225            }
226        }
227        if let Some(request_id) = request_id.filter(|value| !value.is_empty()) {
228            telemetry.request_id = Some(request_id.to_string());
229        }
230        telemetry
231    }
232
233    /// Extract Anthropic-shape `usage` telemetry. Anthropic only reports
234    /// input/output (and cache) counts — preserving the request id is the
235    /// most useful incremental signal beyond what `LlmResult` already
236    /// carries.
237    pub fn from_anthropic_usage(usage: &serde_json::Value, request_id: Option<&str>) -> Self {
238        let mut telemetry = Self::new(source::ANTHROPIC_USAGE);
239        telemetry.server_prompt_tokens = usage
240            .get("input_tokens")
241            .and_then(serde_json::Value::as_i64);
242        telemetry.server_output_tokens = usage
243            .get("output_tokens")
244            .and_then(serde_json::Value::as_i64);
245        if let Some(request_id) = request_id.filter(|value| !value.is_empty()) {
246            telemetry.request_id = Some(request_id.to_string());
247        }
248        telemetry
249    }
250
251    /// Extract Gemini `usageMetadata` counts. Cache-hit accounting stays on
252    /// `LlmResult`; telemetry keeps provider prompt/output counters plus the
253    /// request id for eval joins.
254    pub fn from_gemini_usage(usage: &serde_json::Value, request_id: Option<&str>) -> Self {
255        let mut telemetry = Self::new(source::GEMINI_USAGE);
256        telemetry.server_prompt_tokens = usage
257            .get("promptTokenCount")
258            .and_then(serde_json::Value::as_i64);
259        telemetry.server_output_tokens = usage
260            .get("candidatesTokenCount")
261            .and_then(serde_json::Value::as_i64);
262        if let Some(request_id) = request_id.filter(|value| !value.is_empty()) {
263            telemetry.request_id = Some(request_id.to_string());
264        }
265        telemetry
266    }
267
268    /// Merge a `/api/ps` snapshot of a loaded Ollama model into this
269    /// telemetry envelope. Only fills in fields that were not already
270    /// populated so a per-call extraction keeps precedence.
271    pub fn merge_ollama_ps(&mut self, ps: &OllamaPsModel) {
272        if self.runtime_loaded_model.is_none() {
273            self.runtime_loaded_model = ps.name.clone();
274        }
275        if self.runtime_context_length.is_none() {
276            self.runtime_context_length = ps.context_length;
277        }
278        if self.runtime_memory_bytes.is_none() {
279            self.runtime_memory_bytes = ps.size_bytes;
280        }
281        if self.runtime_memory_vram_bytes.is_none() {
282            self.runtime_memory_vram_bytes = ps.size_vram_bytes;
283        }
284        if self.runtime_keep_alive_until.is_none() {
285            self.runtime_keep_alive_until = ps.expires_at.clone();
286        }
287    }
288
289    /// Render this envelope into the dictionary shape `llm_call` returns.
290    /// Returns `None` if the envelope is empty so callers can omit the key
291    /// entirely.
292    pub fn as_vm_dict(&self) -> Option<VmValue> {
293        if self.is_empty() {
294            return None;
295        }
296        let mut dict: crate::value::DictMap = crate::value::DictMap::new();
297        if !self.source.is_empty() {
298            dict.put_str("source", self.source.as_str());
299        }
300        insert_opt_u64(&mut dict, "server_total_ms", self.server_total_ms);
301        insert_opt_u64(&mut dict, "server_load_ms", self.server_load_ms);
302        insert_opt_u64(
303            &mut dict,
304            "server_prompt_eval_ms",
305            self.server_prompt_eval_ms,
306        );
307        insert_opt_u64(&mut dict, "server_generation_ms", self.server_generation_ms);
308        insert_opt_i64(&mut dict, "server_prompt_tokens", self.server_prompt_tokens);
309        insert_opt_i64(&mut dict, "server_output_tokens", self.server_output_tokens);
310        insert_opt_u64(&mut dict, "client_wall_ms", self.client_wall_ms);
311        insert_opt_u64(
312            &mut dict,
313            "runtime_context_length",
314            self.runtime_context_length,
315        );
316        if let Some(ref model) = self.runtime_loaded_model {
317            dict.put_str("runtime_loaded_model", model.as_str());
318        }
319        insert_opt_u64(&mut dict, "runtime_memory_bytes", self.runtime_memory_bytes);
320        insert_opt_u64(
321            &mut dict,
322            "runtime_memory_vram_bytes",
323            self.runtime_memory_vram_bytes,
324        );
325        if let Some(ref expires) = self.runtime_keep_alive_until {
326            dict.put_str("runtime_keep_alive_until", expires.as_str());
327        }
328        if let Some(ref request_id) = self.request_id {
329            dict.put_str("request_id", request_id.as_str());
330        }
331        Some(VmValue::dict(dict))
332    }
333}
334
/// Loaded-model snapshot from Ollama's `/api/ps`. Shared with the CLI's
/// `harn local` family so we don't duplicate the response shape.
///
/// Every field is optional: a key that is missing from the entry, or has an
/// unexpected JSON type, is recorded as `None`.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct OllamaPsModel {
    /// Model identifier, read from the entry's `name` key with `model` as
    /// the fallback.
    pub name: Option<String>,
    /// Value of the entry's `size` key.
    pub size_bytes: Option<u64>,
    /// Value of the entry's `size_vram` key.
    pub size_vram_bytes: Option<u64>,
    /// Value of the entry's `expires_at` key, kept as the raw string.
    pub expires_at: Option<String>,
    /// Context window, read from the top-level `context_length` key or,
    /// failing that, from `details.context_length`.
    pub context_length: Option<u64>,
}
345
346impl OllamaPsModel {
347    /// Decode one `/api/ps` `models[]` entry. Returns `None` when the entry
348    /// has no usable identifier (an old daemon may emit completely empty
349    /// rows under load).
350    pub fn from_ps_entry(entry: &serde_json::Value) -> Option<Self> {
351        let name = entry
352            .get("name")
353            .and_then(serde_json::Value::as_str)
354            .or_else(|| entry.get("model").and_then(serde_json::Value::as_str))
355            .map(str::to_string);
356        let context_length = entry
357            .get("context_length")
358            .and_then(serde_json::Value::as_u64)
359            .or_else(|| {
360                entry
361                    .get("details")
362                    .and_then(|details| details.get("context_length"))
363                    .and_then(serde_json::Value::as_u64)
364            });
365        Some(Self {
366            name,
367            size_bytes: entry.get("size").and_then(serde_json::Value::as_u64),
368            size_vram_bytes: entry.get("size_vram").and_then(serde_json::Value::as_u64),
369            expires_at: entry
370                .get("expires_at")
371                .and_then(serde_json::Value::as_str)
372                .map(str::to_string),
373            context_length,
374        })
375    }
376}
377
378fn ns_field(frame: &serde_json::Value, key: &str) -> Option<u64> {
379    frame
380        .get(key)
381        .and_then(serde_json::Value::as_u64)
382        .map(ProviderTelemetry::ns_to_ms)
383}
384
385fn ms_or_round(value: Option<&serde_json::Value>) -> Option<u64> {
386    let value = value?;
387    if let Some(n) = value.as_u64() {
388        return Some(n);
389    }
390    value.as_f64().map(|n| n.round().max(0.0) as u64)
391}
392
393fn insert_opt_u64(dict: &mut crate::value::DictMap, key: &str, value: Option<u64>) {
394    if let Some(value) = value {
395        dict.insert(crate::value::intern_key(key), VmValue::Int(value as i64));
396    }
397}
398
399fn insert_opt_i64(dict: &mut crate::value::DictMap, key: &str, value: Option<i64>) {
400    if let Some(value) = value {
401        dict.insert(crate::value::intern_key(key), VmValue::Int(value));
402    }
403}
404
#[cfg(test)]
mod tests {
    use super::*;

    /// Read an integer entry out of a rendered telemetry dict.
    fn int_entry(dict: &crate::value::DictMap, key: &str) -> Option<i64> {
        dict.get(key).and_then(|v| match v {
            VmValue::Int(n) => Some(*n),
            _ => None,
        })
    }

    #[test]
    fn ollama_done_frame_extracts_full_breakdown() {
        let frame = serde_json::json!({
            "model": "devstral-small-2:24b",
            "total_duration": 7_400_000_000u64,
            "load_duration": 400_000_000u64,
            "prompt_eval_duration": 1_200_000_000u64,
            "eval_duration": 5_800_000_000u64,
            "prompt_eval_count": 1024,
            "eval_count": 64
        });

        let telemetry = ProviderTelemetry::from_ollama_done(&frame, source::OLLAMA_CHAT);

        assert_eq!(telemetry.source, source::OLLAMA_CHAT);
        assert_eq!(telemetry.server_total_ms, Some(7400));
        assert_eq!(telemetry.server_load_ms, Some(400));
        assert_eq!(telemetry.server_prompt_eval_ms, Some(1200));
        assert_eq!(telemetry.server_generation_ms, Some(5800));
        assert_eq!(telemetry.server_prompt_tokens, Some(1024));
        assert_eq!(telemetry.server_output_tokens, Some(64));
        assert_eq!(
            telemetry.runtime_loaded_model.as_deref(),
            Some("devstral-small-2:24b")
        );
        assert!(!telemetry.is_empty());
    }

    #[test]
    fn ollama_done_frame_leaves_missing_fields_as_none() {
        let frame = serde_json::json!({
            "model": "devstral-small-2:24b",
            // Older Ollama builds omit duration fields on early frames.
        });

        let telemetry = ProviderTelemetry::from_ollama_done(&frame, source::OLLAMA_CHAT);

        assert_eq!(telemetry.server_total_ms, None);
        assert_eq!(telemetry.server_load_ms, None);
        assert_eq!(telemetry.server_prompt_eval_ms, None);
        assert_eq!(telemetry.server_generation_ms, None);
        assert_eq!(telemetry.server_prompt_tokens, None);
        assert_eq!(telemetry.server_output_tokens, None);
    }

    #[test]
    fn ns_to_ms_truncates_sub_millisecond_remainders() {
        assert_eq!(ProviderTelemetry::ns_to_ms(0), 0);
        assert_eq!(ProviderTelemetry::ns_to_ms(999_999), 0);
        assert_eq!(ProviderTelemetry::ns_to_ms(1_000_000), 1);
        assert_eq!(ProviderTelemetry::ns_to_ms(1_999_999), 1);
    }

    #[test]
    fn openai_usage_partial_extracts_counts_only() {
        let usage = serde_json::json!({
            "prompt_tokens": 200,
            "completion_tokens": 50
        });

        let telemetry = ProviderTelemetry::from_openai_usage(&usage, Some("req-abc"));

        assert_eq!(telemetry.source, source::OPENAI_USAGE);
        assert_eq!(telemetry.server_prompt_tokens, Some(200));
        assert_eq!(telemetry.server_output_tokens, Some(50));
        assert_eq!(telemetry.server_prompt_eval_ms, None);
        assert_eq!(telemetry.request_id.as_deref(), Some("req-abc"));
    }

    #[test]
    fn llamacpp_timings_promotes_source_and_fills_durations() {
        let usage = serde_json::json!({
            "prompt_tokens": 220,
            "completion_tokens": 17,
            "timings": {
                "prompt_n": 200,
                "prompt_ms": 145.4,
                "predicted_n": 17,
                "predicted_ms": 89.1,
            }
        });

        let telemetry = ProviderTelemetry::from_openai_usage(&usage, None);

        assert_eq!(telemetry.source, source::LLAMACPP_TIMINGS);
        assert_eq!(telemetry.server_prompt_eval_ms, Some(145));
        assert_eq!(telemetry.server_generation_ms, Some(89));
        assert_eq!(telemetry.server_total_ms, Some(234));
        assert_eq!(telemetry.server_prompt_tokens, Some(200));
        assert_eq!(telemetry.server_output_tokens, Some(17));
        assert!(!telemetry.is_empty());
    }

    #[test]
    fn anthropic_usage_extracts_counts_and_drops_empty_request_id() {
        let usage = serde_json::json!({
            "input_tokens": 12,
            "output_tokens": 3
        });

        let telemetry = ProviderTelemetry::from_anthropic_usage(&usage, Some(""));

        assert_eq!(telemetry.source, source::ANTHROPIC_USAGE);
        assert_eq!(telemetry.server_prompt_tokens, Some(12));
        assert_eq!(telemetry.server_output_tokens, Some(3));
        assert_eq!(telemetry.server_total_ms, None);
        assert_eq!(telemetry.request_id, None);
    }

    #[test]
    fn gemini_usage_extracts_counts_and_request_id() {
        let usage = serde_json::json!({
            "promptTokenCount": 7,
            "candidatesTokenCount": 2
        });

        let telemetry = ProviderTelemetry::from_gemini_usage(&usage, Some("req-1"));

        assert_eq!(telemetry.source, source::GEMINI_USAGE);
        assert_eq!(telemetry.server_prompt_tokens, Some(7));
        assert_eq!(telemetry.server_output_tokens, Some(2));
        assert_eq!(telemetry.request_id.as_deref(), Some("req-1"));
    }

    #[test]
    fn ps_entry_pulls_context_length_from_top_level_or_details() {
        let entry = serde_json::json!({
            "name": "devstral-small-2:24b",
            "size": 4_700_000_000u64,
            "size_vram": 4_500_000_000u64,
            "expires_at": "2026-05-14T10:30:00Z",
            "context_length": 32768
        });
        let model = OllamaPsModel::from_ps_entry(&entry).expect("ps entry parses");
        assert_eq!(model.context_length, Some(32768));

        let entry_nested = serde_json::json!({
            "name": "devstral-small-2:24b",
            "details": {"context_length": 16384}
        });
        let nested = OllamaPsModel::from_ps_entry(&entry_nested).expect("ps entry parses");
        assert_eq!(nested.context_length, Some(16384));
    }

    #[test]
    fn merge_ollama_ps_preserves_call_level_values() {
        let mut telemetry = ProviderTelemetry::new(source::OLLAMA_CHAT);
        telemetry.runtime_loaded_model = Some("real-model".to_string());
        let ps = OllamaPsModel {
            name: Some("alias-model".to_string()),
            size_bytes: Some(1),
            size_vram_bytes: Some(2),
            expires_at: Some("forever".to_string()),
            context_length: Some(8192),
        };
        telemetry.merge_ollama_ps(&ps);
        assert_eq!(
            telemetry.runtime_loaded_model.as_deref(),
            Some("real-model")
        );
        assert_eq!(telemetry.runtime_memory_bytes, Some(1));
        assert_eq!(telemetry.runtime_memory_vram_bytes, Some(2));
        assert_eq!(
            telemetry.runtime_keep_alive_until.as_deref(),
            Some("forever")
        );
        assert_eq!(telemetry.runtime_context_length, Some(8192));
    }

    #[test]
    fn bare_client_wall_ms_is_not_empty() {
        let telemetry = ProviderTelemetry {
            client_wall_ms: Some(5),
            ..Default::default()
        };
        assert!(!telemetry.is_empty());
        assert!(telemetry.as_vm_dict().is_some());
    }

    #[test]
    fn as_vm_dict_returns_none_when_empty() {
        let telemetry = ProviderTelemetry::default();
        assert!(telemetry.is_empty());
        assert!(telemetry.as_vm_dict().is_none());
    }

    #[test]
    fn as_vm_dict_serializes_all_present_fields() {
        let telemetry = ProviderTelemetry {
            source: source::OLLAMA_CHAT.to_string(),
            server_total_ms: Some(100),
            client_wall_ms: Some(120),
            runtime_loaded_model: Some("qwen".to_string()),
            ..Default::default()
        };
        let value = telemetry.as_vm_dict().expect("dict present");
        let dict = value.as_dict().expect("dict body");
        assert_eq!(
            dict.get("source").map(VmValue::display).as_deref(),
            Some(source::OLLAMA_CHAT)
        );
        assert_eq!(int_entry(dict, "server_total_ms"), Some(100));
        assert_eq!(int_entry(dict, "client_wall_ms"), Some(120));
        assert_eq!(
            dict.get("runtime_loaded_model")
                .map(VmValue::display)
                .as_deref(),
            Some("qwen")
        );
        // Fields left as `None` must not appear in the rendered dict.
        assert!(dict.get("server_load_ms").is_none());
        assert!(dict.get("request_id").is_none());
    }
}
harn_vm/llm/api/telemetry.rs

harn_vm/llm/api/
telemetry.rs