1use std::collections::HashMap;
9use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
10use std::sync::{Arc, Mutex};
11
/// Lock-free histogram of latency samples measured in milliseconds.
///
/// Every sample is counted in exactly one bucket: the first bucket whose
/// inclusive upper bound (see `BOUNDS`) is at least as large as the sample.
/// A running sample count and sum are kept next to the buckets so that the
/// mean can be derived.
///
/// The count, the sum and the buckets are separate atomics that `record`
/// updates one after another with `Ordering::Relaxed`, so a reader racing
/// with a writer may observe them mid-update.
#[derive(Debug, Default)]
pub struct LatencyHistogram {
    /// Per-bucket sample counts, index-aligned with `Self::BOUNDS`.
    buckets: [AtomicU64; 7],
    /// Number of samples recorded so far.
    total_count: AtomicU64,
    /// Sum of all recorded samples, in milliseconds.
    total_sum_ms: AtomicU64,
}

impl LatencyHistogram {
    /// Inclusive upper bound, in milliseconds, of each bucket.
    ///
    /// The trailing `u64::MAX` entry is a catch-all, so every possible sample
    /// matches some bucket.
    const BOUNDS: [u64; 7] = [1, 5, 10, 50, 100, 500, u64::MAX];

    /// Records one latency sample of `ms` milliseconds.
    ///
    /// Bumps the sample count, adds `ms` to the running sum and increments the
    /// first bucket whose upper bound is `>= ms`.
    pub fn record(&self, ms: u64) {
        self.total_count.fetch_add(1, Ordering::Relaxed);
        self.total_sum_ms.fetch_add(ms, Ordering::Relaxed);

        let slot = Self::BOUNDS.iter().position(|&upper| ms <= upper);
        if let Some(idx) = slot {
            self.buckets[idx].fetch_add(1, Ordering::Relaxed);
        }
    }

    /// Returns the arithmetic mean of all recorded samples in milliseconds,
    /// or `0.0` when nothing has been recorded yet.
    pub fn mean_ms(&self) -> f64 {
        match self.total_count.load(Ordering::Relaxed) {
            0 => 0.0,
            samples => self.total_sum_ms.load(Ordering::Relaxed) as f64 / samples as f64,
        }
    }

    /// Returns the number of samples recorded so far.
    pub fn count(&self) -> u64 {
        self.total_count.load(Ordering::Relaxed)
    }

    /// Returns `(upper_bound_ms, sample_count)` pairs, one per bucket, in
    /// ascending bound order.
    pub fn buckets(&self) -> Vec<(u64, u64)> {
        let mut pairs = Vec::with_capacity(Self::BOUNDS.len());
        for (upper, counter) in Self::BOUNDS.iter().zip(&self.buckets) {
            pairs.push((*upper, counter.load(Ordering::Relaxed)));
        }
        pairs
    }
}
104
/// Owned, plain-data copy of the values held by `RuntimeMetrics`.
///
/// `RuntimeMetrics::snapshot` fills each field from the counter or map of the
/// same name; the default value has every counter at zero and every
/// collection empty.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct MetricsSnapshot {
    /// Value of the `active_sessions` counter at capture time.
    pub active_sessions: usize,
    /// Value of the `total_sessions` counter at capture time.
    pub total_sessions: u64,
    /// Value of the `total_steps` counter at capture time.
    pub total_steps: u64,
    /// Value of the `total_tool_calls` counter at capture time.
    pub total_tool_calls: u64,
    /// Value of the `failed_tool_calls` counter at capture time.
    pub failed_tool_calls: u64,
    /// Value of the `backpressure_shed_count` counter at capture time.
    pub backpressure_shed_count: u64,
    /// Value of the `memory_recall_count` counter at capture time.
    pub memory_recall_count: u64,
    /// Recorded call count, keyed by tool name.
    pub per_tool_calls: HashMap<String, u64>,
    /// Recorded failure count, keyed by tool name.
    pub per_tool_failures: HashMap<String, u64>,
    /// Step-latency histogram as `(upper_bound_ms, sample_count)` pairs.
    pub step_latency_buckets: Vec<(u64, u64)>,
    /// Mean step latency in milliseconds (`0.0` when no samples exist).
    pub step_latency_mean_ms: f64,
    /// Recorded call count, keyed by agent id and then by tool name.
    pub per_agent_tool_calls: HashMap<String, HashMap<String, u64>>,
    /// Recorded failure count, keyed by agent id and then by tool name.
    pub per_agent_tool_failures: HashMap<String, HashMap<String, u64>>,
}
153
154#[derive(Debug)]
156pub struct RuntimeMetrics {
157 pub active_sessions: AtomicUsize,
159 pub total_sessions: AtomicU64,
161 pub total_steps: AtomicU64,
163 pub total_tool_calls: AtomicU64,
165 pub failed_tool_calls: AtomicU64,
167 pub backpressure_shed_count: AtomicU64,
169 pub memory_recall_count: AtomicU64,
171 per_tool_calls: Mutex<HashMap<String, u64>>,
173 per_tool_failures: Mutex<HashMap<String, u64>>,
175 pub step_latency: LatencyHistogram,
177 per_agent_tool_calls: Mutex<HashMap<String, HashMap<String, u64>>>,
179 per_agent_tool_failures: Mutex<HashMap<String, HashMap<String, u64>>>,
181}
182
183impl Default for RuntimeMetrics {
184 fn default() -> Self {
185 Self {
186 active_sessions: AtomicUsize::new(0),
187 total_sessions: AtomicU64::new(0),
188 total_steps: AtomicU64::new(0),
189 total_tool_calls: AtomicU64::new(0),
190 failed_tool_calls: AtomicU64::new(0),
191 backpressure_shed_count: AtomicU64::new(0),
192 memory_recall_count: AtomicU64::new(0),
193 per_tool_calls: Mutex::new(HashMap::new()),
194 per_tool_failures: Mutex::new(HashMap::new()),
195 step_latency: LatencyHistogram::default(),
196 per_agent_tool_calls: Mutex::new(HashMap::new()),
197 per_agent_tool_failures: Mutex::new(HashMap::new()),
198 }
199 }
200}
201
202impl RuntimeMetrics {
203 pub fn new() -> Arc<Self> {
205 Arc::new(Self::default())
206 }
207
208 pub fn active_sessions(&self) -> usize {
210 self.active_sessions.load(Ordering::Relaxed)
211 }
212
213 pub fn total_sessions(&self) -> u64 {
215 self.total_sessions.load(Ordering::Relaxed)
216 }
217
218 pub fn total_steps(&self) -> u64 {
220 self.total_steps.load(Ordering::Relaxed)
221 }
222
223 pub fn total_tool_calls(&self) -> u64 {
225 self.total_tool_calls.load(Ordering::Relaxed)
226 }
227
228 pub fn failed_tool_calls(&self) -> u64 {
230 self.failed_tool_calls.load(Ordering::Relaxed)
231 }
232
233 pub fn backpressure_shed_count(&self) -> u64 {
235 self.backpressure_shed_count.load(Ordering::Relaxed)
236 }
237
238 pub fn memory_recall_count(&self) -> u64 {
240 self.memory_recall_count.load(Ordering::Relaxed)
241 }
242
243 pub fn record_tool_call(&self, tool_name: &str) {
247 self.total_tool_calls.fetch_add(1, Ordering::Relaxed);
248 if let Ok(mut map) = self.per_tool_calls.lock() {
249 *map.entry(tool_name.to_owned()).or_insert(0) += 1;
250 }
251 }
252
253 pub fn record_tool_failure(&self, tool_name: &str) {
257 self.failed_tool_calls.fetch_add(1, Ordering::Relaxed);
258 if let Ok(mut map) = self.per_tool_failures.lock() {
259 *map.entry(tool_name.to_owned()).or_insert(0) += 1;
260 }
261 }
262
263 pub fn per_tool_calls_snapshot(&self) -> HashMap<String, u64> {
265 self.per_tool_calls
266 .lock()
267 .map(|m| m.clone())
268 .unwrap_or_default()
269 }
270
271 pub fn per_tool_failures_snapshot(&self) -> HashMap<String, u64> {
273 self.per_tool_failures
274 .lock()
275 .map(|m| m.clone())
276 .unwrap_or_default()
277 }
278
279 pub fn record_agent_tool_call(&self, agent_id: &str, tool_name: &str) {
281 if let Ok(mut map) = self.per_agent_tool_calls.lock() {
282 *map.entry(agent_id.to_owned())
283 .or_default()
284 .entry(tool_name.to_owned())
285 .or_insert(0) += 1;
286 }
287 }
288
289 pub fn record_agent_tool_failure(&self, agent_id: &str, tool_name: &str) {
291 if let Ok(mut map) = self.per_agent_tool_failures.lock() {
292 *map.entry(agent_id.to_owned())
293 .or_default()
294 .entry(tool_name.to_owned())
295 .or_insert(0) += 1;
296 }
297 }
298
299 pub fn per_agent_tool_calls_snapshot(&self) -> HashMap<String, HashMap<String, u64>> {
301 self.per_agent_tool_calls
302 .lock()
303 .map(|m| m.clone())
304 .unwrap_or_default()
305 }
306
307 pub fn per_agent_tool_failures_snapshot(&self) -> HashMap<String, HashMap<String, u64>> {
309 self.per_agent_tool_failures
310 .lock()
311 .map(|m| m.clone())
312 .unwrap_or_default()
313 }
314
315 pub fn snapshot(&self) -> MetricsSnapshot {
322 MetricsSnapshot {
323 active_sessions: self.active_sessions.load(Ordering::Relaxed),
324 total_sessions: self.total_sessions.load(Ordering::Relaxed),
325 total_steps: self.total_steps.load(Ordering::Relaxed),
326 total_tool_calls: self.total_tool_calls.load(Ordering::Relaxed),
327 failed_tool_calls: self.failed_tool_calls.load(Ordering::Relaxed),
328 backpressure_shed_count: self.backpressure_shed_count.load(Ordering::Relaxed),
329 memory_recall_count: self.memory_recall_count.load(Ordering::Relaxed),
330 per_tool_calls: self.per_tool_calls_snapshot(),
331 per_tool_failures: self.per_tool_failures_snapshot(),
332 step_latency_buckets: self.step_latency.buckets(),
333 step_latency_mean_ms: self.step_latency.mean_ms(),
334 per_agent_tool_calls: self.per_agent_tool_calls_snapshot(),
335 per_agent_tool_failures: self.per_agent_tool_failures_snapshot(),
336 }
337 }
338
339 pub fn record_step_latency(&self, ms: u64) {
341 self.step_latency.record(ms);
342 }
343
344 pub fn reset(&self) {
348 self.active_sessions.store(0, Ordering::Relaxed);
349 self.total_sessions.store(0, Ordering::Relaxed);
350 self.total_steps.store(0, Ordering::Relaxed);
351 self.total_tool_calls.store(0, Ordering::Relaxed);
352 self.failed_tool_calls.store(0, Ordering::Relaxed);
353 self.backpressure_shed_count.store(0, Ordering::Relaxed);
354 self.memory_recall_count.store(0, Ordering::Relaxed);
355 if let Ok(mut m) = self.per_tool_calls.lock() {
356 m.clear();
357 }
358 if let Ok(mut m) = self.per_tool_failures.lock() {
359 m.clear();
360 }
361 if let Ok(mut m) = self.per_agent_tool_calls.lock() {
362 m.clear();
363 }
364 if let Ok(mut m) = self.per_agent_tool_failures.lock() {
365 m.clear();
366 }
367 self.step_latency.total_count.store(0, Ordering::Relaxed);
368 self.step_latency.total_sum_ms.store(0, Ordering::Relaxed);
369 for b in &self.step_latency.buckets {
370 b.store(0, Ordering::Relaxed);
371 }
372 }
373
374 pub fn to_snapshot(&self) -> (usize, u64, u64, u64, u64, u64, u64) {
385 (
386 self.active_sessions.load(Ordering::Relaxed),
387 self.total_sessions.load(Ordering::Relaxed),
388 self.total_steps.load(Ordering::Relaxed),
389 self.total_tool_calls.load(Ordering::Relaxed),
390 self.failed_tool_calls.load(Ordering::Relaxed),
391 self.backpressure_shed_count.load(Ordering::Relaxed),
392 self.memory_recall_count.load(Ordering::Relaxed),
393 )
394 }
395}
396
/// Unit tests for `RuntimeMetrics`, `MetricsSnapshot` and `LatencyHistogram`.
#[cfg(test)]
mod tests {
    use super::*;

    // ── Scalar counters ──────────────────────────────────────────────────

    #[test]
    fn test_metrics_new_returns_arc_with_zero_counters() {
        let m = RuntimeMetrics::new();
        assert_eq!(m.active_sessions(), 0);
        assert_eq!(m.total_sessions(), 0);
        assert_eq!(m.total_steps(), 0);
        assert_eq!(m.total_tool_calls(), 0);
        assert_eq!(m.failed_tool_calls(), 0);
        assert_eq!(m.backpressure_shed_count(), 0);
        assert_eq!(m.memory_recall_count(), 0);
    }

    #[test]
    fn test_active_sessions_increments_and_decrements() {
        let m = RuntimeMetrics::new();
        m.active_sessions.fetch_add(1, Ordering::Relaxed);
        assert_eq!(m.active_sessions(), 1);
        m.active_sessions.fetch_sub(1, Ordering::Relaxed);
        assert_eq!(m.active_sessions(), 0);
    }

    #[test]
    fn test_total_sessions_increments() {
        let m = RuntimeMetrics::new();
        m.total_sessions.fetch_add(1, Ordering::Relaxed);
        m.total_sessions.fetch_add(1, Ordering::Relaxed);
        assert_eq!(m.total_sessions(), 2);
    }

    #[test]
    fn test_total_steps_increments() {
        let m = RuntimeMetrics::new();
        m.total_steps.fetch_add(5, Ordering::Relaxed);
        assert_eq!(m.total_steps(), 5);
    }

    #[test]
    fn test_total_tool_calls_increments() {
        let m = RuntimeMetrics::new();
        m.total_tool_calls.fetch_add(3, Ordering::Relaxed);
        assert_eq!(m.total_tool_calls(), 3);
    }

    #[test]
    fn test_failed_tool_calls_increments() {
        let m = RuntimeMetrics::new();
        m.failed_tool_calls.fetch_add(2, Ordering::Relaxed);
        assert_eq!(m.failed_tool_calls(), 2);
    }

    #[test]
    fn test_backpressure_shed_count_increments() {
        let m = RuntimeMetrics::new();
        m.backpressure_shed_count.fetch_add(7, Ordering::Relaxed);
        assert_eq!(m.backpressure_shed_count(), 7);
    }

    #[test]
    fn test_memory_recall_count_increments() {
        let m = RuntimeMetrics::new();
        m.memory_recall_count.fetch_add(4, Ordering::Relaxed);
        assert_eq!(m.memory_recall_count(), 4);
    }

    #[test]
    fn test_reset_zeroes_all_counters() {
        let m = RuntimeMetrics::new();
        m.active_sessions.store(3, Ordering::Relaxed);
        m.total_sessions.store(10, Ordering::Relaxed);
        m.total_steps.store(50, Ordering::Relaxed);
        m.total_tool_calls.store(20, Ordering::Relaxed);
        m.failed_tool_calls.store(2, Ordering::Relaxed);
        m.backpressure_shed_count.store(1, Ordering::Relaxed);
        m.memory_recall_count.store(8, Ordering::Relaxed);

        m.reset();

        assert_eq!(m.active_sessions(), 0);
        assert_eq!(m.total_sessions(), 0);
        assert_eq!(m.total_steps(), 0);
        assert_eq!(m.total_tool_calls(), 0);
        assert_eq!(m.failed_tool_calls(), 0);
        assert_eq!(m.backpressure_shed_count(), 0);
        assert_eq!(m.memory_recall_count(), 0);
    }

    #[test]
    fn test_to_snapshot_captures_correct_values() {
        let m = RuntimeMetrics::new();
        m.active_sessions.store(1, Ordering::Relaxed);
        m.total_sessions.store(2, Ordering::Relaxed);
        m.total_steps.store(3, Ordering::Relaxed);
        m.total_tool_calls.store(4, Ordering::Relaxed);
        m.failed_tool_calls.store(5, Ordering::Relaxed);
        m.backpressure_shed_count.store(6, Ordering::Relaxed);
        m.memory_recall_count.store(7, Ordering::Relaxed);

        let snap = m.to_snapshot();
        assert_eq!(snap, (1, 2, 3, 4, 5, 6, 7));
    }

    #[test]
    fn test_metrics_is_send_sync() {
        fn assert_send_sync<T: Send + Sync>() {}
        assert_send_sync::<RuntimeMetrics>();
    }

    #[test]
    fn test_multiple_increments_are_cumulative() {
        let m = RuntimeMetrics::new();
        for _ in 0..100 {
            m.total_sessions.fetch_add(1, Ordering::Relaxed);
        }
        assert_eq!(m.total_sessions(), 100);
    }

    #[test]
    fn test_arc_clone_shares_state() {
        let m = RuntimeMetrics::new();
        let m2 = Arc::clone(&m);
        m.total_sessions.fetch_add(1, Ordering::Relaxed);
        assert_eq!(m2.total_sessions(), 1);
    }

    // ── Per-tool counters ────────────────────────────────────────────────

    #[test]
    fn test_record_tool_call_increments_global_and_per_tool() {
        let m = RuntimeMetrics::new();
        m.record_tool_call("search");
        m.record_tool_call("search");
        m.record_tool_call("lookup");
        assert_eq!(m.total_tool_calls(), 3);
        let snap = m.per_tool_calls_snapshot();
        assert_eq!(snap.get("search").copied(), Some(2));
        assert_eq!(snap.get("lookup").copied(), Some(1));
    }

    #[test]
    fn test_record_tool_failure_increments_global_and_per_tool() {
        let m = RuntimeMetrics::new();
        m.record_tool_failure("search");
        m.record_tool_failure("lookup");
        m.record_tool_failure("search");
        assert_eq!(m.failed_tool_calls(), 3);
        let snap = m.per_tool_failures_snapshot();
        assert_eq!(snap.get("search").copied(), Some(2));
        assert_eq!(snap.get("lookup").copied(), Some(1));
    }

    #[test]
    fn test_reset_clears_per_tool_counters() {
        let m = RuntimeMetrics::new();
        m.record_tool_call("foo");
        m.record_tool_failure("foo");
        m.reset();
        assert!(m.per_tool_calls_snapshot().is_empty());
        assert!(m.per_tool_failures_snapshot().is_empty());
    }

    #[test]
    fn test_per_tool_snapshot_is_independent_for_unknown_tools() {
        // A fresh instance has recorded nothing, so the snapshot is empty.
        let m = RuntimeMetrics::new();
        let snap = m.per_tool_calls_snapshot();
        assert!(snap.is_empty());
    }

    // ── Latency histogram ────────────────────────────────────────────────

    #[test]
    fn test_latency_histogram_records_sample() {
        let h = LatencyHistogram::default();
        h.record(10);
        assert_eq!(h.count(), 1);
    }

    #[test]
    fn test_latency_histogram_mean_ms() {
        let h = LatencyHistogram::default();
        h.record(10);
        h.record(20);
        assert!((h.mean_ms() - 15.0).abs() < 1e-5);
    }

    #[test]
    fn test_latency_histogram_mean_is_zero_when_empty() {
        // Guards the division-by-zero branch in `mean_ms`.
        let h = LatencyHistogram::default();
        assert_eq!(h.count(), 0);
        assert_eq!(h.mean_ms(), 0.0);
    }

    #[test]
    fn test_latency_histogram_buckets_correct_bucket() {
        let h = LatencyHistogram::default();
        h.record(3);
        let buckets = h.buckets();
        assert_eq!(buckets[1].1, 1, "3ms should land in ≤5ms bucket");
        assert_eq!(buckets[0].1, 0);
        assert_eq!(buckets[2].1, 0);
    }

    #[test]
    fn test_latency_histogram_bucket_boundaries() {
        // Upper bounds are inclusive and the last bucket catches everything.
        let h = LatencyHistogram::default();
        for &ms in &[0u64, 1, 5, 6, 500, 501, u64::MAX] {
            h.record(ms);
        }
        let counts: Vec<u64> = h.buckets().iter().map(|&(_, n)| n).collect();
        assert_eq!(counts, vec![2, 1, 1, 0, 0, 1, 2]);
        assert_eq!(h.count(), 7);
    }

    #[test]
    fn test_reset_clears_step_latency() {
        // `reset` must clear the histogram as well as the scalar counters.
        let m = RuntimeMetrics::new();
        m.record_step_latency(5);
        m.record_step_latency(200);
        m.reset();
        assert_eq!(m.step_latency.count(), 0);
        assert_eq!(m.step_latency.mean_ms(), 0.0);
        assert!(m.step_latency.buckets().iter().all(|&(_, n)| n == 0));
    }

    // ── Full snapshot ────────────────────────────────────────────────────

    #[test]
    fn test_snapshot_returns_all_fields() {
        let m = RuntimeMetrics::new();
        m.active_sessions.store(1, Ordering::Relaxed);
        m.total_sessions.store(2, Ordering::Relaxed);
        m.total_steps.store(3, Ordering::Relaxed);
        m.backpressure_shed_count.store(6, Ordering::Relaxed);
        m.memory_recall_count.store(7, Ordering::Relaxed);
        m.record_tool_call("my_tool");
        m.record_tool_call("my_tool");
        m.record_tool_failure("my_tool");

        let snap = m.snapshot();
        assert_eq!(snap.active_sessions, 1);
        assert_eq!(snap.total_sessions, 2);
        assert_eq!(snap.total_steps, 3);
        assert_eq!(snap.total_tool_calls, 2);
        assert_eq!(snap.failed_tool_calls, 1);
        assert_eq!(snap.backpressure_shed_count, 6);
        assert_eq!(snap.memory_recall_count, 7);
        assert_eq!(snap.per_tool_calls.get("my_tool").copied(), Some(2));
        assert_eq!(snap.per_tool_failures.get("my_tool").copied(), Some(1));
    }

    #[test]
    fn test_snapshot_default_is_zeroed() {
        let snap = MetricsSnapshot::default();
        assert_eq!(snap.active_sessions, 0);
        assert_eq!(snap.total_sessions, 0);
        assert_eq!(snap.total_steps, 0);
        assert!(snap.per_tool_calls.is_empty());
        assert!(snap.per_tool_failures.is_empty());
    }

    #[test]
    fn test_metrics_snapshot_contains_all_fields() {
        let m = RuntimeMetrics::new();
        m.record_step_latency(5);
        m.record_step_latency(200);
        let snap = m.snapshot();
        assert_eq!(snap.step_latency_buckets.len(), 7);
        assert!(snap.step_latency_mean_ms > 0.0);
    }

    // ── Per-agent counters ───────────────────────────────────────────────

    #[test]
    fn test_per_agent_tool_call_tracking() {
        let m = RuntimeMetrics::new();
        m.record_agent_tool_call("agent-1", "search");
        m.record_agent_tool_call("agent-1", "search");
        m.record_agent_tool_call("agent-2", "lookup");
        m.record_agent_tool_failure("agent-1", "search");

        let calls = m.per_agent_tool_calls_snapshot();
        assert_eq!(
            calls.get("agent-1").and_then(|t| t.get("search")).copied(),
            Some(2)
        );
        assert_eq!(
            calls.get("agent-2").and_then(|t| t.get("lookup")).copied(),
            Some(1)
        );

        let failures = m.per_agent_tool_failures_snapshot();
        assert_eq!(
            failures.get("agent-1").and_then(|t| t.get("search")).copied(),
            Some(1)
        );

        let snap = m.snapshot();
        assert_eq!(
            snap.per_agent_tool_calls
                .get("agent-1")
                .and_then(|t| t.get("search"))
                .copied(),
            Some(2)
        );

        m.reset();
        assert!(m.per_agent_tool_calls_snapshot().is_empty());
        assert!(m.per_agent_tool_failures_snapshot().is_empty());
    }
}