strands_agents/telemetry/
mod.rs

1//! Telemetry and metrics for agent execution.
2
3pub mod config;
4pub mod tracer;
5
6pub use config::{OtelResource, StrandsTelemetry, StrandsTelemetryBuilder};
7pub use tracer::{get_tracer, serialize, Tracer, AttributeValue, Attributes};
8
9use std::collections::HashMap;
10use std::time::Instant;
11
12use serde::{Deserialize, Serialize};
13
14use crate::types::content::Message;
15use crate::types::streaming::{Metrics, Usage};
16use crate::types::tools::ToolUse;
17
18/// A trace representing a single operation or step in the execution flow.
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct Trace {
21    pub id: String,
22    pub name: String,
23    pub raw_name: Option<String>,
24    pub parent_id: Option<String>,
25    pub start_time: f64,
26    pub end_time: Option<f64>,
27    pub children: Vec<Trace>,
28    pub metadata: HashMap<String, serde_json::Value>,
29    #[serde(skip)]
30    start_instant: Option<Instant>,
31}
32
33impl Trace {
34    pub fn new(name: impl Into<String>) -> Self {
35        Self {
36            id: uuid::Uuid::new_v4().to_string(),
37            name: name.into(),
38            raw_name: None,
39            parent_id: None,
40            start_time: std::time::SystemTime::now()
41                .duration_since(std::time::UNIX_EPOCH)
42                .unwrap()
43                .as_secs_f64(),
44            end_time: None,
45            children: Vec::new(),
46            metadata: HashMap::new(),
47            start_instant: Some(Instant::now()),
48        }
49    }
50
51    pub fn child(name: impl Into<String>, parent_id: impl Into<String>) -> Self {
52        let mut trace = Self::new(name);
53        trace.parent_id = Some(parent_id.into());
54        trace
55    }
56
57    pub fn end(&mut self) {
58        self.end_time = Some(
59            std::time::SystemTime::now()
60                .duration_since(std::time::UNIX_EPOCH)
61                .unwrap()
62                .as_secs_f64(),
63        );
64    }
65
66    pub fn add_child(&mut self, child: Trace) {
67        self.children.push(child);
68    }
69
70    pub fn duration(&self) -> Option<f64> {
71
72        if self.end_time.is_some() {
73            if let Some(instant) = self.start_instant {
74                return Some(instant.elapsed().as_secs_f64());
75            }
76        }
77
78        self.end_time.map(|end| end - self.start_time)
79    }
80
81    pub fn duration_ms(&self) -> Option<u64> {
82        self.duration().map(|d| (d * 1000.0) as u64)
83    }
84
85    pub fn add_message(&mut self, _message: &Message) {
86
87    }
88
89    pub fn to_dict(&self) -> serde_json::Value {
90        serde_json::json!({
91            "id": self.id,
92            "name": self.name,
93            "raw_name": self.raw_name,
94            "parent_id": self.parent_id,
95            "start_time": self.start_time,
96            "end_time": self.end_time,
97            "duration": self.duration(),
98            "children": self.children.iter().map(|c| c.to_dict()).collect::<Vec<_>>(),
99            "metadata": self.metadata,
100        })
101    }
102}
103
104/// Metrics for a specific tool's usage.
105#[derive(Debug, Clone, Default, Serialize, Deserialize)]
106pub struct ToolMetrics {
107    pub tool_name: String,
108    pub tool_use_id: String,
109    pub call_count: u32,
110    pub success_count: u32,
111    pub error_count: u32,
112    pub total_time: f64,
113}
114
115impl ToolMetrics {
116    pub fn new(tool: &ToolUse) -> Self {
117        Self {
118            tool_name: tool.name.clone(),
119            tool_use_id: tool.tool_use_id.clone(),
120            call_count: 0,
121            success_count: 0,
122            error_count: 0,
123            total_time: 0.0,
124        }
125    }
126
127    pub fn add_call(&mut self, tool: &ToolUse, duration: f64, success: bool) {
128        self.tool_name = tool.name.clone();
129        self.tool_use_id = tool.tool_use_id.clone();
130        self.call_count += 1;
131        self.total_time += duration;
132        if success {
133            self.success_count += 1;
134        } else {
135            self.error_count += 1;
136        }
137    }
138
139    pub fn average_time(&self) -> f64 {
140        if self.call_count > 0 {
141            self.total_time / self.call_count as f64
142        } else {
143            0.0
144        }
145    }
146
147    pub fn success_rate(&self) -> f64 {
148        if self.call_count > 0 {
149            self.success_count as f64 / self.call_count as f64
150        } else {
151            0.0
152        }
153    }
154}
155
156/// Metrics collected during event loop execution.
157#[derive(Debug, Clone, Default, Serialize, Deserialize)]
158pub struct EventLoopMetrics {
159    pub cycle_count: u32,
160    pub tool_metrics: HashMap<String, ToolMetrics>,
161    pub cycle_durations: Vec<f64>,
162    pub traces: Vec<Trace>,
163    pub accumulated_usage: Usage,
164    pub accumulated_metrics: Metrics,
165    #[serde(skip)]
166    cycle_start: Option<Instant>,
167}
168
169impl EventLoopMetrics {
170    pub fn new() -> Self {
171        Self::default()
172    }
173
174    /// Start a new event loop cycle.
175    pub fn start_cycle(&mut self) -> Trace {
176        self.cycle_start = Some(Instant::now());
177        self.cycle_count += 1;
178        let trace = Trace::new(format!("Cycle {}", self.cycle_count));
179        self.traces.push(trace.clone());
180        trace
181    }
182
183    /// End the current cycle.
184    pub fn end_cycle(&mut self, cycle_trace: &mut Trace) {
185        if let Some(start) = self.cycle_start.take() {
186            let duration = start.elapsed().as_secs_f64();
187            self.cycle_durations.push(duration);
188            cycle_trace.end();
189        }
190    }
191
192    /// Record tool usage metrics.
193    pub fn add_tool_usage(
194        &mut self,
195        tool: &ToolUse,
196        duration: f64,
197        tool_trace: &mut Trace,
198        success: bool,
199        message: &Message,
200    ) {
201        tool_trace.metadata.insert(
202            "toolUseId".to_string(),
203            serde_json::Value::String(tool.tool_use_id.clone()),
204        );
205        tool_trace.metadata.insert(
206            "tool_name".to_string(),
207            serde_json::Value::String(tool.name.clone()),
208        );
209        tool_trace.raw_name = Some(format!("{} - {}", tool.name, tool.tool_use_id));
210        tool_trace.add_message(message);
211
212        self.tool_metrics
213            .entry(tool.name.clone())
214            .or_insert_with(|| ToolMetrics::new(tool))
215            .add_call(tool, duration, success);
216
217        tool_trace.end();
218    }
219
220    /// Update accumulated token usage.
221    pub fn update_usage(&mut self, usage: &Usage) {
222        self.accumulated_usage.add(usage);
223    }
224
225    /// Update accumulated performance metrics.
226    pub fn update_metrics(&mut self, metrics: &Metrics) {
227        self.accumulated_metrics.latency_ms += metrics.latency_ms;
228    }
229
230    /// Get total duration of all cycles.
231    pub fn total_duration(&self) -> f64 {
232        self.cycle_durations.iter().sum()
233    }
234
235    /// Get average cycle time.
236    pub fn average_cycle_time(&self) -> f64 {
237        if self.cycle_count > 0 {
238            self.total_duration() / self.cycle_count as f64
239        } else {
240            0.0
241        }
242    }
243
244    /// Generate a comprehensive summary.
245    pub fn get_summary(&self) -> serde_json::Value {
246        serde_json::json!({
247            "total_cycles": self.cycle_count,
248            "total_duration": self.total_duration(),
249            "average_cycle_time": self.average_cycle_time(),
250            "tool_usage": self.tool_metrics.iter().map(|(name, metrics)| {
251                (name.clone(), serde_json::json!({
252                    "tool_info": {
253                        "tool_use_id": metrics.tool_use_id,
254                        "name": metrics.tool_name,
255                    },
256                    "execution_stats": {
257                        "call_count": metrics.call_count,
258                        "success_count": metrics.success_count,
259                        "error_count": metrics.error_count,
260                        "total_time": metrics.total_time,
261                        "average_time": metrics.average_time(),
262                        "success_rate": metrics.success_rate(),
263                    }
264                }))
265            }).collect::<HashMap<_, _>>(),
266            "traces": self.traces.iter().map(|t| t.to_dict()).collect::<Vec<_>>(),
267            "accumulated_usage": {
268                "inputTokens": self.accumulated_usage.input_tokens,
269                "outputTokens": self.accumulated_usage.output_tokens,
270                "totalTokens": self.accumulated_usage.total_tokens,
271                "cacheReadInputTokens": self.accumulated_usage.cache_read_input_tokens,
272                "cacheWriteInputTokens": self.accumulated_usage.cache_write_input_tokens,
273            },
274            "accumulated_metrics": {
275                "latencyMs": self.accumulated_metrics.latency_ms,
276            },
277        })
278    }
279
280    pub fn total_input_tokens(&self) -> u32 {
281        self.accumulated_usage.input_tokens
282    }
283
284    pub fn total_output_tokens(&self) -> u32 {
285        self.accumulated_usage.output_tokens
286    }
287
288    pub fn total_latency_ms(&self) -> u64 {
289        self.accumulated_metrics.latency_ms
290    }
291}
292
293/// Convert metrics to a formatted string.
294pub fn metrics_to_string(metrics: &EventLoopMetrics) -> String {
295    let summary = metrics.get_summary();
296    let mut lines = Vec::new();
297
298    lines.push("Event Loop Metrics Summary:".to_string());
299    lines.push(format!(
300        "├─ Cycles: total={}, avg_time={:.3}s, total_time={:.3}s",
301        summary["total_cycles"],
302        summary["average_cycle_time"].as_f64().unwrap_or(0.0),
303        summary["total_duration"].as_f64().unwrap_or(0.0)
304    ));
305
306    let usage = &summary["accumulated_usage"];
307    let mut token_parts = vec![
308        format!("in={}", usage["inputTokens"]),
309        format!("out={}", usage["outputTokens"]),
310        format!("total={}", usage["totalTokens"]),
311    ];
312
313    if let Some(cache_read) = usage["cacheReadInputTokens"].as_u64() {
314        if cache_read > 0 {
315            token_parts.push(format!("cache_read={}", cache_read));
316        }
317    }
318    if let Some(cache_write) = usage["cacheWriteInputTokens"].as_u64() {
319        if cache_write > 0 {
320            token_parts.push(format!("cache_write={}", cache_write));
321        }
322    }
323
324    lines.push(format!("├─ Tokens: {}", token_parts.join(", ")));
325    lines.push(format!(
326        "├─ Latency: {}ms",
327        summary["accumulated_metrics"]["latencyMs"]
328    ));
329
330    lines.push("├─ Tool Usage:".to_string());
331    if let Some(tool_usage) = summary["tool_usage"].as_object() {
332        for (tool_name, data) in tool_usage {
333            let stats = &data["execution_stats"];
334            lines.push(format!("   └─ {}:", tool_name));
335            lines.push(format!(
336                "      ├─ Stats: calls={}, success={}, errors={}, success_rate={:.1}%",
337                stats["call_count"],
338                stats["success_count"],
339                stats["error_count"],
340                stats["success_rate"].as_f64().unwrap_or(0.0) * 100.0
341            ));
342            lines.push(format!(
343                "      └─ Timing: avg={:.3}s, total={:.3}s",
344                stats["average_time"].as_f64().unwrap_or(0.0),
345                stats["total_time"].as_f64().unwrap_or(0.0)
346            ));
347        }
348    }
349
350    lines.join("\n")
351}
352
/// Instrument names for Strands metrics (intended to match the Python SDK).
pub mod constants {
    // --- Event loop -------------------------------------------------------

    /// Name of the event loop cycle count instrument.
    pub const STRANDS_EVENT_LOOP_CYCLE_COUNT: &str = "strands.event_loop.cycle_count";
    /// Name of the cycle-start instrument.
    pub const STRANDS_EVENT_LOOP_START_CYCLE: &str = "strands.event_loop.start_cycle";
    /// Name of the cycle-end instrument.
    pub const STRANDS_EVENT_LOOP_END_CYCLE: &str = "strands.event_loop.end_cycle";
    /// Name of the cycle duration instrument.
    pub const STRANDS_EVENT_LOOP_CYCLE_DURATION: &str = "strands.event_loop.cycle_duration";
    /// Name of the latency instrument.
    pub const STRANDS_EVENT_LOOP_LATENCY: &str = "strands.event_loop.latency";

    // --- Token usage ------------------------------------------------------

    /// Name of the input token instrument.
    pub const STRANDS_EVENT_LOOP_INPUT_TOKENS: &str = "strands.event_loop.input.tokens";
    /// Name of the output token instrument.
    pub const STRANDS_EVENT_LOOP_OUTPUT_TOKENS: &str = "strands.event_loop.output.tokens";
    /// Name of the cache-read input token instrument.
    pub const STRANDS_EVENT_LOOP_CACHE_READ_INPUT_TOKENS: &str =
        "strands.event_loop.cache_read.input.tokens";
    /// Name of the cache-write input token instrument.
    pub const STRANDS_EVENT_LOOP_CACHE_WRITE_INPUT_TOKENS: &str =
        "strands.event_loop.cache_write.input.tokens";

    // --- Model ------------------------------------------------------------

    /// Name of the model time-to-first-token instrument.
    pub const STRANDS_MODEL_TIME_TO_FIRST_TOKEN: &str = "strands.model.time_to_first_token";

    // --- Tools ------------------------------------------------------------

    /// Name of the tool call count instrument.
    pub const STRANDS_TOOL_CALL_COUNT: &str = "strands.tool.call_count";
    /// Name of the tool success count instrument.
    pub const STRANDS_TOOL_SUCCESS_COUNT: &str = "strands.tool.success_count";
    /// Name of the tool error count instrument.
    pub const STRANDS_TOOL_ERROR_COUNT: &str = "strands.tool.error_count";
    /// Name of the tool duration instrument.
    pub const STRANDS_TOOL_DURATION: &str = "strands.tool.duration";
}
372
373use opentelemetry::metrics::{Counter, Histogram, Meter};
374use opentelemetry::KeyValue;
375
376/// Global singleton instance for MetricsClient.
377static METRICS_CLIENT_INSTANCE: std::sync::OnceLock<MetricsClient> = std::sync::OnceLock::new();
378
379/// Singleton client for managing OpenTelemetry metrics instruments.
380///
381/// The actual metrics export destination (console, OTLP endpoint, etc.) is configured
382/// through OpenTelemetry SDK configuration by users, not by this client.
383pub struct MetricsClient {
384    meter: Meter,
385
386    event_loop_cycle_count: Counter<u64>,
387    event_loop_start_cycle: Counter<u64>,
388    event_loop_end_cycle: Counter<u64>,
389    tool_call_count: Counter<u64>,
390    tool_success_count: Counter<u64>,
391    tool_error_count: Counter<u64>,
392
393    event_loop_cycle_duration: Histogram<f64>,
394    event_loop_latency: Histogram<f64>,
395    event_loop_input_tokens: Histogram<u64>,
396    event_loop_output_tokens: Histogram<u64>,
397    event_loop_cache_read_input_tokens: Histogram<u64>,
398    event_loop_cache_write_input_tokens: Histogram<u64>,
399    model_time_to_first_token: Histogram<f64>,
400    tool_duration: Histogram<f64>,
401}
402
403impl MetricsClient {
404    /// Create a new MetricsClient with the given meter.
405    fn new(meter: Meter) -> Self {
406        tracing::info!("Creating Strands MetricsClient with OpenTelemetry instruments");
407
408        Self {
409            event_loop_cycle_count: meter
410                .u64_counter(constants::STRANDS_EVENT_LOOP_CYCLE_COUNT)
411                .with_description("Number of event loop cycles")
412                .with_unit("count")
413                .build(),
414            event_loop_start_cycle: meter
415                .u64_counter(constants::STRANDS_EVENT_LOOP_START_CYCLE)
416                .with_description("Event loop cycle starts")
417                .with_unit("count")
418                .build(),
419            event_loop_end_cycle: meter
420                .u64_counter(constants::STRANDS_EVENT_LOOP_END_CYCLE)
421                .with_description("Event loop cycle ends")
422                .with_unit("count")
423                .build(),
424            tool_call_count: meter
425                .u64_counter(constants::STRANDS_TOOL_CALL_COUNT)
426                .with_description("Number of tool calls")
427                .with_unit("count")
428                .build(),
429            tool_success_count: meter
430                .u64_counter(constants::STRANDS_TOOL_SUCCESS_COUNT)
431                .with_description("Number of successful tool calls")
432                .with_unit("count")
433                .build(),
434            tool_error_count: meter
435                .u64_counter(constants::STRANDS_TOOL_ERROR_COUNT)
436                .with_description("Number of failed tool calls")
437                .with_unit("count")
438                .build(),
439            event_loop_cycle_duration: meter
440                .f64_histogram(constants::STRANDS_EVENT_LOOP_CYCLE_DURATION)
441                .with_description("Duration of event loop cycles")
442                .with_unit("s")
443                .build(),
444            event_loop_latency: meter
445                .f64_histogram(constants::STRANDS_EVENT_LOOP_LATENCY)
446                .with_description("Latency of model requests")
447                .with_unit("ms")
448                .build(),
449            event_loop_input_tokens: meter
450                .u64_histogram(constants::STRANDS_EVENT_LOOP_INPUT_TOKENS)
451                .with_description("Number of input tokens")
452                .with_unit("token")
453                .build(),
454            event_loop_output_tokens: meter
455                .u64_histogram(constants::STRANDS_EVENT_LOOP_OUTPUT_TOKENS)
456                .with_description("Number of output tokens")
457                .with_unit("token")
458                .build(),
459            event_loop_cache_read_input_tokens: meter
460                .u64_histogram(constants::STRANDS_EVENT_LOOP_CACHE_READ_INPUT_TOKENS)
461                .with_description("Number of cache read input tokens")
462                .with_unit("token")
463                .build(),
464            event_loop_cache_write_input_tokens: meter
465                .u64_histogram(constants::STRANDS_EVENT_LOOP_CACHE_WRITE_INPUT_TOKENS)
466                .with_description("Number of cache write input tokens")
467                .with_unit("token")
468                .build(),
469            model_time_to_first_token: meter
470                .f64_histogram(constants::STRANDS_MODEL_TIME_TO_FIRST_TOKEN)
471                .with_description("Time to first token from model")
472                .with_unit("ms")
473                .build(),
474            tool_duration: meter
475                .f64_histogram(constants::STRANDS_TOOL_DURATION)
476                .with_description("Duration of tool execution")
477                .with_unit("s")
478                .build(),
479            meter,
480        }
481    }
482
483    /// Get the singleton MetricsClient instance.
484    ///
485    /// Uses the global OpenTelemetry meter provider. Users should configure
486    /// the meter provider before calling this method.
487    pub fn global() -> &'static MetricsClient {
488        METRICS_CLIENT_INSTANCE.get_or_init(|| {
489            let meter = opentelemetry::global::meter("strands");
490            MetricsClient::new(meter)
491        })
492    }
493
494    /// Convert HashMap attributes to OpenTelemetry KeyValue pairs.
495    fn to_key_values(attributes: &HashMap<String, String>) -> Vec<KeyValue> {
496        attributes
497            .iter()
498            .map(|(k, v)| KeyValue::new(k.clone(), v.clone()))
499            .collect()
500    }
501
502    /// Record event loop cycle count.
503    pub fn record_cycle_count(&self, count: u64, attributes: &HashMap<String, String>) {
504        self.event_loop_cycle_count
505            .add(count, &Self::to_key_values(attributes));
506    }
507
508    /// Record event loop start cycle.
509    pub fn record_start_cycle(&self, attributes: &HashMap<String, String>) {
510        self.event_loop_start_cycle
511            .add(1, &Self::to_key_values(attributes));
512    }
513
514    /// Record event loop end cycle.
515    pub fn record_end_cycle(&self, attributes: &HashMap<String, String>) {
516        self.event_loop_end_cycle
517            .add(1, &Self::to_key_values(attributes));
518    }
519
520    /// Record cycle duration.
521    pub fn record_cycle_duration(&self, duration_secs: f64, attributes: &HashMap<String, String>) {
522        self.event_loop_cycle_duration
523            .record(duration_secs, &Self::to_key_values(attributes));
524    }
525
526    /// Record latency.
527    pub fn record_latency(&self, latency_ms: u64, attributes: &HashMap<String, String>) {
528        self.event_loop_latency
529            .record(latency_ms as f64, &Self::to_key_values(attributes));
530    }
531
532    /// Record input tokens.
533    pub fn record_input_tokens(&self, tokens: u32, attributes: &HashMap<String, String>) {
534        self.event_loop_input_tokens
535            .record(tokens as u64, &Self::to_key_values(attributes));
536    }
537
538    /// Record output tokens.
539    pub fn record_output_tokens(&self, tokens: u32, attributes: &HashMap<String, String>) {
540        self.event_loop_output_tokens
541            .record(tokens as u64, &Self::to_key_values(attributes));
542    }
543
544    /// Record cache read input tokens.
545    pub fn record_cache_read_input_tokens(&self, tokens: u32, attributes: &HashMap<String, String>) {
546        self.event_loop_cache_read_input_tokens
547            .record(tokens as u64, &Self::to_key_values(attributes));
548    }
549
550    /// Record cache write input tokens.
551    pub fn record_cache_write_input_tokens(&self, tokens: u32, attributes: &HashMap<String, String>) {
552        self.event_loop_cache_write_input_tokens
553            .record(tokens as u64, &Self::to_key_values(attributes));
554    }
555
556    /// Record model time to first token.
557    pub fn record_time_to_first_token(&self, time_ms: u64, attributes: &HashMap<String, String>) {
558        self.model_time_to_first_token
559            .record(time_ms as f64, &Self::to_key_values(attributes));
560    }
561
562    /// Record tool call count.
563    pub fn record_tool_call_count(&self, count: u64, attributes: &HashMap<String, String>) {
564        self.tool_call_count
565            .add(count, &Self::to_key_values(attributes));
566    }
567
568    /// Record tool success count.
569    pub fn record_tool_success_count(&self, count: u64, attributes: &HashMap<String, String>) {
570        self.tool_success_count
571            .add(count, &Self::to_key_values(attributes));
572    }
573
574    /// Record tool error count.
575    pub fn record_tool_error_count(&self, count: u64, attributes: &HashMap<String, String>) {
576        self.tool_error_count
577            .add(count, &Self::to_key_values(attributes));
578    }
579
580    /// Record tool duration.
581    pub fn record_tool_duration(&self, duration_secs: f64, attributes: &HashMap<String, String>) {
582        self.tool_duration
583            .record(duration_secs, &Self::to_key_values(attributes));
584    }
585
586    /// Get the underlying meter for creating custom metrics.
587    pub fn meter(&self) -> &Meter {
588        &self.meter
589    }
590}