1pub mod config;
4pub mod tracer;
5
6pub use config::{OtelResource, StrandsTelemetry, StrandsTelemetryBuilder};
7pub use tracer::{get_tracer, serialize, Tracer, AttributeValue, Attributes};
8
9use std::collections::HashMap;
10use std::time::Instant;
11
12use serde::{Deserialize, Serialize};
13
14use crate::types::content::Message;
15use crate::types::streaming::{Metrics, Usage};
16use crate::types::tools::ToolUse;
17
18#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct Trace {
21 pub id: String,
22 pub name: String,
23 pub raw_name: Option<String>,
24 pub parent_id: Option<String>,
25 pub start_time: f64,
26 pub end_time: Option<f64>,
27 pub children: Vec<Trace>,
28 pub metadata: HashMap<String, serde_json::Value>,
29 #[serde(skip)]
30 start_instant: Option<Instant>,
31}
32
33impl Trace {
34 pub fn new(name: impl Into<String>) -> Self {
35 Self {
36 id: uuid::Uuid::new_v4().to_string(),
37 name: name.into(),
38 raw_name: None,
39 parent_id: None,
40 start_time: std::time::SystemTime::now()
41 .duration_since(std::time::UNIX_EPOCH)
42 .unwrap()
43 .as_secs_f64(),
44 end_time: None,
45 children: Vec::new(),
46 metadata: HashMap::new(),
47 start_instant: Some(Instant::now()),
48 }
49 }
50
51 pub fn child(name: impl Into<String>, parent_id: impl Into<String>) -> Self {
52 let mut trace = Self::new(name);
53 trace.parent_id = Some(parent_id.into());
54 trace
55 }
56
57 pub fn end(&mut self) {
58 self.end_time = Some(
59 std::time::SystemTime::now()
60 .duration_since(std::time::UNIX_EPOCH)
61 .unwrap()
62 .as_secs_f64(),
63 );
64 }
65
66 pub fn add_child(&mut self, child: Trace) {
67 self.children.push(child);
68 }
69
70 pub fn duration(&self) -> Option<f64> {
71
72 if self.end_time.is_some() {
73 if let Some(instant) = self.start_instant {
74 return Some(instant.elapsed().as_secs_f64());
75 }
76 }
77
78 self.end_time.map(|end| end - self.start_time)
79 }
80
81 pub fn duration_ms(&self) -> Option<u64> {
82 self.duration().map(|d| (d * 1000.0) as u64)
83 }
84
85 pub fn add_message(&mut self, _message: &Message) {
86
87 }
88
89 pub fn to_dict(&self) -> serde_json::Value {
90 serde_json::json!({
91 "id": self.id,
92 "name": self.name,
93 "raw_name": self.raw_name,
94 "parent_id": self.parent_id,
95 "start_time": self.start_time,
96 "end_time": self.end_time,
97 "duration": self.duration(),
98 "children": self.children.iter().map(|c| c.to_dict()).collect::<Vec<_>>(),
99 "metadata": self.metadata,
100 })
101 }
102}
103
104#[derive(Debug, Clone, Default, Serialize, Deserialize)]
106pub struct ToolMetrics {
107 pub tool_name: String,
108 pub tool_use_id: String,
109 pub call_count: u32,
110 pub success_count: u32,
111 pub error_count: u32,
112 pub total_time: f64,
113}
114
115impl ToolMetrics {
116 pub fn new(tool: &ToolUse) -> Self {
117 Self {
118 tool_name: tool.name.clone(),
119 tool_use_id: tool.tool_use_id.clone(),
120 call_count: 0,
121 success_count: 0,
122 error_count: 0,
123 total_time: 0.0,
124 }
125 }
126
127 pub fn add_call(&mut self, tool: &ToolUse, duration: f64, success: bool) {
128 self.tool_name = tool.name.clone();
129 self.tool_use_id = tool.tool_use_id.clone();
130 self.call_count += 1;
131 self.total_time += duration;
132 if success {
133 self.success_count += 1;
134 } else {
135 self.error_count += 1;
136 }
137 }
138
139 pub fn average_time(&self) -> f64 {
140 if self.call_count > 0 {
141 self.total_time / self.call_count as f64
142 } else {
143 0.0
144 }
145 }
146
147 pub fn success_rate(&self) -> f64 {
148 if self.call_count > 0 {
149 self.success_count as f64 / self.call_count as f64
150 } else {
151 0.0
152 }
153 }
154}
155
156#[derive(Debug, Clone, Default, Serialize, Deserialize)]
158pub struct EventLoopMetrics {
159 pub cycle_count: u32,
160 pub tool_metrics: HashMap<String, ToolMetrics>,
161 pub cycle_durations: Vec<f64>,
162 pub traces: Vec<Trace>,
163 pub accumulated_usage: Usage,
164 pub accumulated_metrics: Metrics,
165 #[serde(skip)]
166 cycle_start: Option<Instant>,
167}
168
169impl EventLoopMetrics {
170 pub fn new() -> Self {
171 Self::default()
172 }
173
174 pub fn start_cycle(&mut self) -> Trace {
176 self.cycle_start = Some(Instant::now());
177 self.cycle_count += 1;
178 let trace = Trace::new(format!("Cycle {}", self.cycle_count));
179 self.traces.push(trace.clone());
180 trace
181 }
182
183 pub fn end_cycle(&mut self, cycle_trace: &mut Trace) {
185 if let Some(start) = self.cycle_start.take() {
186 let duration = start.elapsed().as_secs_f64();
187 self.cycle_durations.push(duration);
188 cycle_trace.end();
189 }
190 }
191
192 pub fn add_tool_usage(
194 &mut self,
195 tool: &ToolUse,
196 duration: f64,
197 tool_trace: &mut Trace,
198 success: bool,
199 message: &Message,
200 ) {
201 tool_trace.metadata.insert(
202 "toolUseId".to_string(),
203 serde_json::Value::String(tool.tool_use_id.clone()),
204 );
205 tool_trace.metadata.insert(
206 "tool_name".to_string(),
207 serde_json::Value::String(tool.name.clone()),
208 );
209 tool_trace.raw_name = Some(format!("{} - {}", tool.name, tool.tool_use_id));
210 tool_trace.add_message(message);
211
212 self.tool_metrics
213 .entry(tool.name.clone())
214 .or_insert_with(|| ToolMetrics::new(tool))
215 .add_call(tool, duration, success);
216
217 tool_trace.end();
218 }
219
220 pub fn update_usage(&mut self, usage: &Usage) {
222 self.accumulated_usage.add(usage);
223 }
224
225 pub fn update_metrics(&mut self, metrics: &Metrics) {
227 self.accumulated_metrics.latency_ms += metrics.latency_ms;
228 }
229
230 pub fn total_duration(&self) -> f64 {
232 self.cycle_durations.iter().sum()
233 }
234
235 pub fn average_cycle_time(&self) -> f64 {
237 if self.cycle_count > 0 {
238 self.total_duration() / self.cycle_count as f64
239 } else {
240 0.0
241 }
242 }
243
244 pub fn get_summary(&self) -> serde_json::Value {
246 serde_json::json!({
247 "total_cycles": self.cycle_count,
248 "total_duration": self.total_duration(),
249 "average_cycle_time": self.average_cycle_time(),
250 "tool_usage": self.tool_metrics.iter().map(|(name, metrics)| {
251 (name.clone(), serde_json::json!({
252 "tool_info": {
253 "tool_use_id": metrics.tool_use_id,
254 "name": metrics.tool_name,
255 },
256 "execution_stats": {
257 "call_count": metrics.call_count,
258 "success_count": metrics.success_count,
259 "error_count": metrics.error_count,
260 "total_time": metrics.total_time,
261 "average_time": metrics.average_time(),
262 "success_rate": metrics.success_rate(),
263 }
264 }))
265 }).collect::<HashMap<_, _>>(),
266 "traces": self.traces.iter().map(|t| t.to_dict()).collect::<Vec<_>>(),
267 "accumulated_usage": {
268 "inputTokens": self.accumulated_usage.input_tokens,
269 "outputTokens": self.accumulated_usage.output_tokens,
270 "totalTokens": self.accumulated_usage.total_tokens,
271 "cacheReadInputTokens": self.accumulated_usage.cache_read_input_tokens,
272 "cacheWriteInputTokens": self.accumulated_usage.cache_write_input_tokens,
273 },
274 "accumulated_metrics": {
275 "latencyMs": self.accumulated_metrics.latency_ms,
276 },
277 })
278 }
279
280 pub fn total_input_tokens(&self) -> u32 {
281 self.accumulated_usage.input_tokens
282 }
283
284 pub fn total_output_tokens(&self) -> u32 {
285 self.accumulated_usage.output_tokens
286 }
287
288 pub fn total_latency_ms(&self) -> u64 {
289 self.accumulated_metrics.latency_ms
290 }
291}
292
293pub fn metrics_to_string(metrics: &EventLoopMetrics) -> String {
295 let summary = metrics.get_summary();
296 let mut lines = Vec::new();
297
298 lines.push("Event Loop Metrics Summary:".to_string());
299 lines.push(format!(
300 "├─ Cycles: total={}, avg_time={:.3}s, total_time={:.3}s",
301 summary["total_cycles"],
302 summary["average_cycle_time"].as_f64().unwrap_or(0.0),
303 summary["total_duration"].as_f64().unwrap_or(0.0)
304 ));
305
306 let usage = &summary["accumulated_usage"];
307 let mut token_parts = vec![
308 format!("in={}", usage["inputTokens"]),
309 format!("out={}", usage["outputTokens"]),
310 format!("total={}", usage["totalTokens"]),
311 ];
312
313 if let Some(cache_read) = usage["cacheReadInputTokens"].as_u64() {
314 if cache_read > 0 {
315 token_parts.push(format!("cache_read={}", cache_read));
316 }
317 }
318 if let Some(cache_write) = usage["cacheWriteInputTokens"].as_u64() {
319 if cache_write > 0 {
320 token_parts.push(format!("cache_write={}", cache_write));
321 }
322 }
323
324 lines.push(format!("├─ Tokens: {}", token_parts.join(", ")));
325 lines.push(format!(
326 "├─ Latency: {}ms",
327 summary["accumulated_metrics"]["latencyMs"]
328 ));
329
330 lines.push("├─ Tool Usage:".to_string());
331 if let Some(tool_usage) = summary["tool_usage"].as_object() {
332 for (tool_name, data) in tool_usage {
333 let stats = &data["execution_stats"];
334 lines.push(format!(" └─ {}:", tool_name));
335 lines.push(format!(
336 " ├─ Stats: calls={}, success={}, errors={}, success_rate={:.1}%",
337 stats["call_count"],
338 stats["success_count"],
339 stats["error_count"],
340 stats["success_rate"].as_f64().unwrap_or(0.0) * 100.0
341 ));
342 lines.push(format!(
343 " └─ Timing: avg={:.3}s, total={:.3}s",
344 stats["average_time"].as_f64().unwrap_or(0.0),
345 stats["total_time"].as_f64().unwrap_or(0.0)
346 ));
347 }
348 }
349
350 lines.join("\n")
351}
352
/// Names of the OpenTelemetry instruments emitted by this module.
pub mod constants {
    // --- Event loop instruments -------------------------------------------

    /// Instrument name for the number of event loop cycles.
    pub const STRANDS_EVENT_LOOP_CYCLE_COUNT: &str = "strands.event_loop.cycle_count";
    /// Instrument name for event loop cycle starts.
    pub const STRANDS_EVENT_LOOP_START_CYCLE: &str = "strands.event_loop.start_cycle";
    /// Instrument name for event loop cycle ends.
    pub const STRANDS_EVENT_LOOP_END_CYCLE: &str = "strands.event_loop.end_cycle";
    /// Instrument name for event loop cycle durations.
    pub const STRANDS_EVENT_LOOP_CYCLE_DURATION: &str = "strands.event_loop.cycle_duration";
    /// Instrument name for model request latency.
    pub const STRANDS_EVENT_LOOP_LATENCY: &str = "strands.event_loop.latency";

    // --- Token instruments ------------------------------------------------

    /// Instrument name for input token counts.
    pub const STRANDS_EVENT_LOOP_INPUT_TOKENS: &str = "strands.event_loop.input.tokens";
    /// Instrument name for output token counts.
    pub const STRANDS_EVENT_LOOP_OUTPUT_TOKENS: &str = "strands.event_loop.output.tokens";
    /// Instrument name for cache-read input token counts.
    pub const STRANDS_EVENT_LOOP_CACHE_READ_INPUT_TOKENS: &str =
        "strands.event_loop.cache_read.input.tokens";
    /// Instrument name for cache-write input token counts.
    pub const STRANDS_EVENT_LOOP_CACHE_WRITE_INPUT_TOKENS: &str =
        "strands.event_loop.cache_write.input.tokens";

    // --- Model instruments ------------------------------------------------

    /// Instrument name for the model's time to first token.
    pub const STRANDS_MODEL_TIME_TO_FIRST_TOKEN: &str = "strands.model.time_to_first_token";

    // --- Tool instruments -------------------------------------------------

    /// Instrument name for the number of tool calls.
    pub const STRANDS_TOOL_CALL_COUNT: &str = "strands.tool.call_count";
    /// Instrument name for the number of successful tool calls.
    pub const STRANDS_TOOL_SUCCESS_COUNT: &str = "strands.tool.success_count";
    /// Instrument name for the number of failed tool calls.
    pub const STRANDS_TOOL_ERROR_COUNT: &str = "strands.tool.error_count";
    /// Instrument name for tool execution durations.
    pub const STRANDS_TOOL_DURATION: &str = "strands.tool.duration";
}
372
373use opentelemetry::metrics::{Counter, Histogram, Meter};
374use opentelemetry::KeyValue;
375
376static METRICS_CLIENT_INSTANCE: std::sync::OnceLock<MetricsClient> = std::sync::OnceLock::new();
378
379pub struct MetricsClient {
384 meter: Meter,
385
386 event_loop_cycle_count: Counter<u64>,
387 event_loop_start_cycle: Counter<u64>,
388 event_loop_end_cycle: Counter<u64>,
389 tool_call_count: Counter<u64>,
390 tool_success_count: Counter<u64>,
391 tool_error_count: Counter<u64>,
392
393 event_loop_cycle_duration: Histogram<f64>,
394 event_loop_latency: Histogram<f64>,
395 event_loop_input_tokens: Histogram<u64>,
396 event_loop_output_tokens: Histogram<u64>,
397 event_loop_cache_read_input_tokens: Histogram<u64>,
398 event_loop_cache_write_input_tokens: Histogram<u64>,
399 model_time_to_first_token: Histogram<f64>,
400 tool_duration: Histogram<f64>,
401}
402
403impl MetricsClient {
404 fn new(meter: Meter) -> Self {
406 tracing::info!("Creating Strands MetricsClient with OpenTelemetry instruments");
407
408 Self {
409 event_loop_cycle_count: meter
410 .u64_counter(constants::STRANDS_EVENT_LOOP_CYCLE_COUNT)
411 .with_description("Number of event loop cycles")
412 .with_unit("count")
413 .build(),
414 event_loop_start_cycle: meter
415 .u64_counter(constants::STRANDS_EVENT_LOOP_START_CYCLE)
416 .with_description("Event loop cycle starts")
417 .with_unit("count")
418 .build(),
419 event_loop_end_cycle: meter
420 .u64_counter(constants::STRANDS_EVENT_LOOP_END_CYCLE)
421 .with_description("Event loop cycle ends")
422 .with_unit("count")
423 .build(),
424 tool_call_count: meter
425 .u64_counter(constants::STRANDS_TOOL_CALL_COUNT)
426 .with_description("Number of tool calls")
427 .with_unit("count")
428 .build(),
429 tool_success_count: meter
430 .u64_counter(constants::STRANDS_TOOL_SUCCESS_COUNT)
431 .with_description("Number of successful tool calls")
432 .with_unit("count")
433 .build(),
434 tool_error_count: meter
435 .u64_counter(constants::STRANDS_TOOL_ERROR_COUNT)
436 .with_description("Number of failed tool calls")
437 .with_unit("count")
438 .build(),
439 event_loop_cycle_duration: meter
440 .f64_histogram(constants::STRANDS_EVENT_LOOP_CYCLE_DURATION)
441 .with_description("Duration of event loop cycles")
442 .with_unit("s")
443 .build(),
444 event_loop_latency: meter
445 .f64_histogram(constants::STRANDS_EVENT_LOOP_LATENCY)
446 .with_description("Latency of model requests")
447 .with_unit("ms")
448 .build(),
449 event_loop_input_tokens: meter
450 .u64_histogram(constants::STRANDS_EVENT_LOOP_INPUT_TOKENS)
451 .with_description("Number of input tokens")
452 .with_unit("token")
453 .build(),
454 event_loop_output_tokens: meter
455 .u64_histogram(constants::STRANDS_EVENT_LOOP_OUTPUT_TOKENS)
456 .with_description("Number of output tokens")
457 .with_unit("token")
458 .build(),
459 event_loop_cache_read_input_tokens: meter
460 .u64_histogram(constants::STRANDS_EVENT_LOOP_CACHE_READ_INPUT_TOKENS)
461 .with_description("Number of cache read input tokens")
462 .with_unit("token")
463 .build(),
464 event_loop_cache_write_input_tokens: meter
465 .u64_histogram(constants::STRANDS_EVENT_LOOP_CACHE_WRITE_INPUT_TOKENS)
466 .with_description("Number of cache write input tokens")
467 .with_unit("token")
468 .build(),
469 model_time_to_first_token: meter
470 .f64_histogram(constants::STRANDS_MODEL_TIME_TO_FIRST_TOKEN)
471 .with_description("Time to first token from model")
472 .with_unit("ms")
473 .build(),
474 tool_duration: meter
475 .f64_histogram(constants::STRANDS_TOOL_DURATION)
476 .with_description("Duration of tool execution")
477 .with_unit("s")
478 .build(),
479 meter,
480 }
481 }
482
483 pub fn global() -> &'static MetricsClient {
488 METRICS_CLIENT_INSTANCE.get_or_init(|| {
489 let meter = opentelemetry::global::meter("strands");
490 MetricsClient::new(meter)
491 })
492 }
493
494 fn to_key_values(attributes: &HashMap<String, String>) -> Vec<KeyValue> {
496 attributes
497 .iter()
498 .map(|(k, v)| KeyValue::new(k.clone(), v.clone()))
499 .collect()
500 }
501
502 pub fn record_cycle_count(&self, count: u64, attributes: &HashMap<String, String>) {
504 self.event_loop_cycle_count
505 .add(count, &Self::to_key_values(attributes));
506 }
507
508 pub fn record_start_cycle(&self, attributes: &HashMap<String, String>) {
510 self.event_loop_start_cycle
511 .add(1, &Self::to_key_values(attributes));
512 }
513
514 pub fn record_end_cycle(&self, attributes: &HashMap<String, String>) {
516 self.event_loop_end_cycle
517 .add(1, &Self::to_key_values(attributes));
518 }
519
520 pub fn record_cycle_duration(&self, duration_secs: f64, attributes: &HashMap<String, String>) {
522 self.event_loop_cycle_duration
523 .record(duration_secs, &Self::to_key_values(attributes));
524 }
525
526 pub fn record_latency(&self, latency_ms: u64, attributes: &HashMap<String, String>) {
528 self.event_loop_latency
529 .record(latency_ms as f64, &Self::to_key_values(attributes));
530 }
531
532 pub fn record_input_tokens(&self, tokens: u32, attributes: &HashMap<String, String>) {
534 self.event_loop_input_tokens
535 .record(tokens as u64, &Self::to_key_values(attributes));
536 }
537
538 pub fn record_output_tokens(&self, tokens: u32, attributes: &HashMap<String, String>) {
540 self.event_loop_output_tokens
541 .record(tokens as u64, &Self::to_key_values(attributes));
542 }
543
544 pub fn record_cache_read_input_tokens(&self, tokens: u32, attributes: &HashMap<String, String>) {
546 self.event_loop_cache_read_input_tokens
547 .record(tokens as u64, &Self::to_key_values(attributes));
548 }
549
550 pub fn record_cache_write_input_tokens(&self, tokens: u32, attributes: &HashMap<String, String>) {
552 self.event_loop_cache_write_input_tokens
553 .record(tokens as u64, &Self::to_key_values(attributes));
554 }
555
556 pub fn record_time_to_first_token(&self, time_ms: u64, attributes: &HashMap<String, String>) {
558 self.model_time_to_first_token
559 .record(time_ms as f64, &Self::to_key_values(attributes));
560 }
561
562 pub fn record_tool_call_count(&self, count: u64, attributes: &HashMap<String, String>) {
564 self.tool_call_count
565 .add(count, &Self::to_key_values(attributes));
566 }
567
568 pub fn record_tool_success_count(&self, count: u64, attributes: &HashMap<String, String>) {
570 self.tool_success_count
571 .add(count, &Self::to_key_values(attributes));
572 }
573
574 pub fn record_tool_error_count(&self, count: u64, attributes: &HashMap<String, String>) {
576 self.tool_error_count
577 .add(count, &Self::to_key_values(attributes));
578 }
579
580 pub fn record_tool_duration(&self, duration_secs: f64, attributes: &HashMap<String, String>) {
582 self.tool_duration
583 .record(duration_secs, &Self::to_key_values(attributes));
584 }
585
586 pub fn meter(&self) -> &Meter {
588 &self.meter
589 }
590}