rustyclaw_core/observability/traits.rs
1//! Observability traits and event types.
2//!
3//! This module defines the [`Observer`] trait and event/metric types for
4//! recording agent runtime telemetry. Implementations integrate with various
5//! backends (console logging, Prometheus, OpenTelemetry).
6//!
7//! Adapted from ZeroClaw (MIT OR Apache-2.0 licensed).
8
9use std::time::Duration;
10
/// Discrete events emitted by the agent runtime for observability.
///
/// Each variant represents a lifecycle event that observers can record,
/// aggregate, or forward to external monitoring systems. Events carry
/// just enough context for tracing and diagnostics without exposing
/// sensitive prompt or response content.
///
/// `PartialEq` is derived so events can be compared by value (for example
/// with `assert_eq!` in tests). `Eq` is intentionally not derived because
/// [`ObserverEvent::AgentEnd`] carries an `f64` cost, which has no total
/// equality.
#[derive(Debug, Clone, PartialEq)]
pub enum ObserverEvent {
    /// The agent orchestration loop has started a new session.
    AgentStart { provider: String, model: String },
    /// A request is about to be sent to an LLM provider.
    ///
    /// This is emitted immediately before a provider call so observers can print
    /// user-facing progress without leaking prompt contents.
    LlmRequest {
        provider: String,
        model: String,
        messages_count: usize,
    },
    /// Result of a single LLM provider call.
    LlmResponse {
        provider: String,
        model: String,
        duration: Duration,
        success: bool,
        error_message: Option<String>,
        input_tokens: Option<u64>,
        output_tokens: Option<u64>,
    },
    /// The agent session has finished.
    ///
    /// Carries aggregate usage data (tokens, cost) when the provider reports it.
    AgentEnd {
        provider: String,
        model: String,
        duration: Duration,
        tokens_used: Option<u64>,
        cost_usd: Option<f64>,
    },
    /// A tool call is about to be executed.
    ToolCallStart { tool: String },
    /// A tool call has completed with a success/failure outcome.
    ToolCall {
        tool: String,
        duration: Duration,
        success: bool,
    },
    /// The agent produced a final answer for the current user message.
    TurnComplete,
    /// A message was sent or received through a channel.
    ChannelMessage {
        /// Channel name (e.g., `"telegram"`, `"discord"`).
        channel: String,
        /// `"inbound"` or `"outbound"`.
        direction: String,
    },
    /// Periodic heartbeat tick from the runtime keep-alive loop.
    HeartbeatTick,
    /// An error occurred in a named component.
    Error {
        /// Subsystem where the error originated (e.g., `"provider"`, `"gateway"`).
        component: String,
        /// Human-readable error description. Must not contain secrets or tokens.
        message: String,
    },
}
77
/// Numeric metrics emitted by the agent runtime.
///
/// Observers can aggregate these into dashboards, alerts, or structured logs.
/// Each variant carries a single scalar value with implicit units.
///
/// `PartialEq` and `Eq` are derived so samples can be compared by value
/// (for example with `assert_eq!` in tests); every payload type used here
/// (`Duration`, `u64`) supports total equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ObserverMetric {
    /// Time elapsed for a single LLM or tool request.
    RequestLatency(Duration),
    /// Number of tokens consumed by an LLM call.
    TokensUsed(u64),
    /// Current number of active concurrent sessions.
    ActiveSessions(u64),
    /// Current depth of the inbound message queue.
    QueueDepth(u64),
}
93
94/// Core observability trait for recording agent runtime telemetry.
95///
96/// Implement this trait to integrate with any monitoring backend (structured
97/// logging, Prometheus, OpenTelemetry, etc.). The agent runtime holds one or
98/// more `Observer` instances and calls [`record_event`](Observer::record_event)
99/// and [`record_metric`](Observer::record_metric) at key lifecycle points.
100///
101/// Implementations must be `Send + Sync + 'static` because the observer is
102/// shared across async tasks via `Arc`.
103pub trait Observer: Send + Sync + 'static {
104 /// Record a discrete lifecycle event.
105 ///
106 /// Called synchronously on the hot path; implementations should avoid
107 /// blocking I/O. Buffer events internally and flush asynchronously
108 /// when possible.
109 fn record_event(&self, event: &ObserverEvent);
110
111 /// Record a numeric metric sample.
112 ///
113 /// Called synchronously; same non-blocking guidance as
114 /// [`record_event`](Observer::record_event).
115 fn record_metric(&self, metric: &ObserverMetric);
116
117 /// Flush any buffered telemetry data to the backend.
118 ///
119 /// The runtime calls this during graceful shutdown. The default
120 /// implementation is a no-op, which is appropriate for backends
121 /// that write synchronously.
122 fn flush(&self) {}
123
124 /// Return the human-readable name of this observer backend.
125 ///
126 /// Used in logs and diagnostics (e.g., `"console"`, `"prometheus"`,
127 /// `"opentelemetry"`).
128 fn name(&self) -> &str;
129
130 /// Downcast to `Any` for backend-specific operations.
131 ///
132 /// Enables callers to access concrete observer types when needed
133 /// (e.g., retrieving a Prometheus registry handle for custom metrics).
134 fn as_any(&self) -> &dyn std::any::Any;
135}
136
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::Mutex;
    use std::time::Duration;

    /// Test double that only counts how many events and metrics it receives.
    #[derive(Default)]
    struct CountingObserver {
        event_count: Mutex<u64>,
        metric_count: Mutex<u64>,
    }

    impl CountingObserver {
        /// Add one to the given counter.
        fn bump(counter: &Mutex<u64>) {
            *counter.lock().unwrap() += 1;
        }

        /// Read the current value of the given counter.
        fn read(counter: &Mutex<u64>) -> u64 {
            *counter.lock().unwrap()
        }
    }

    impl Observer for CountingObserver {
        fn record_event(&self, _event: &ObserverEvent) {
            Self::bump(&self.event_count);
        }

        fn record_metric(&self, _metric: &ObserverMetric) {
            Self::bump(&self.metric_count);
        }

        fn name(&self) -> &str {
            "dummy-observer"
        }

        fn as_any(&self) -> &dyn std::any::Any {
            self
        }
    }

    /// Two events and one metric must be tallied independently.
    #[test]
    fn observer_records_events_and_metrics() {
        let sink = CountingObserver::default();

        let failure = ObserverEvent::Error {
            component: "test".into(),
            message: "boom".into(),
        };
        sink.record_event(&ObserverEvent::HeartbeatTick);
        sink.record_event(&failure);
        sink.record_metric(&ObserverMetric::TokensUsed(42));

        assert_eq!(CountingObserver::read(&sink.event_count), 2);
        assert_eq!(CountingObserver::read(&sink.metric_count), 1);
    }

    /// The default `flush` is callable, and `as_any` exposes the concrete type.
    #[test]
    fn observer_default_flush_and_as_any_work() {
        let sink = CountingObserver::default();

        sink.flush();
        assert_eq!(sink.name(), "dummy-observer");
        assert!(sink.as_any().is::<CountingObserver>());
    }

    /// Cloning keeps the variant of both events and metrics.
    #[test]
    fn observer_event_and_metric_are_cloneable() {
        let source_event = ObserverEvent::ToolCall {
            tool: "shell".into(),
            duration: Duration::from_millis(10),
            success: true,
        };
        let source_metric = ObserverMetric::RequestLatency(Duration::from_millis(8));

        let event_copy = source_event.clone();
        let metric_copy = source_metric.clone();

        assert!(matches!(event_copy, ObserverEvent::ToolCall { .. }));
        assert!(matches!(metric_copy, ObserverMetric::RequestLatency(_)));
    }
}