Skip to main content

rustyclaw_core/observability/
traits.rs

//! Observability traits and event types.
//!
//! This module defines the [`Observer`] trait and event/metric types for
//! recording agent runtime telemetry. Implementations integrate with various
//! backends (console logging, Prometheus, OpenTelemetry).
//!
//! Adapted from ZeroClaw (MIT OR Apache-2.0 licensed).
8
9use std::time::Duration;
10
/// Discrete events emitted by the agent runtime for observability.
///
/// Each variant represents a lifecycle event that observers can record,
/// aggregate, or forward to external monitoring systems. Events carry
/// just enough context for tracing and diagnostics without exposing
/// sensitive prompt or response content.
///
/// `PartialEq` is derived so events can be compared directly (for example
/// with `assert_eq!` in tests). `Eq` is intentionally not derived because
/// [`ObserverEvent::AgentEnd`] carries an `f64` cost.
#[derive(Debug, Clone, PartialEq)]
pub enum ObserverEvent {
    /// The agent orchestration loop has started a new session.
    AgentStart { provider: String, model: String },
    /// A request is about to be sent to an LLM provider.
    ///
    /// This is emitted immediately before a provider call so observers can print
    /// user-facing progress without leaking prompt contents.
    LlmRequest {
        provider: String,
        model: String,
        /// Message count only; the message contents are never carried.
        messages_count: usize,
    },
    /// Result of a single LLM provider call.
    LlmResponse {
        provider: String,
        model: String,
        /// Duration recorded for the call.
        duration: Duration,
        /// Whether the call succeeded.
        success: bool,
        /// Optional error description.
        /// NOTE(review): presumably `None` when `success` is `true` — confirm with emitters.
        error_message: Option<String>,
        /// Input token count, when available.
        input_tokens: Option<u64>,
        /// Output token count, when available.
        output_tokens: Option<u64>,
    },
    /// The agent session has finished.
    ///
    /// Carries aggregate usage data (tokens, cost) when the provider reports it.
    AgentEnd {
        provider: String,
        model: String,
        /// Duration recorded for the session.
        duration: Duration,
        /// Aggregate token usage, when the provider reports it.
        tokens_used: Option<u64>,
        /// Aggregate cost, when the provider reports it (US dollars, per the field name).
        cost_usd: Option<f64>,
    },
    /// A tool call is about to be executed.
    ToolCallStart { tool: String },
    /// A tool call has completed with a success/failure outcome.
    ToolCall {
        tool: String,
        /// Duration recorded for the tool call.
        duration: Duration,
        /// Whether the tool call succeeded.
        success: bool,
    },
    /// The agent produced a final answer for the current user message.
    TurnComplete,
    /// A message was sent or received through a channel.
    ChannelMessage {
        /// Channel name (e.g., `"telegram"`, `"discord"`).
        channel: String,
        /// `"inbound"` or `"outbound"`.
        direction: String,
    },
    /// Periodic heartbeat tick from the runtime keep-alive loop.
    HeartbeatTick,
    /// An error occurred in a named component.
    Error {
        /// Subsystem where the error originated (e.g., `"provider"`, `"gateway"`).
        component: String,
        /// Human-readable error description. Must not contain secrets or tokens.
        message: String,
    },
}
77
/// Numeric metrics emitted by the agent runtime.
///
/// Observers can aggregate these into dashboards, alerts, or structured logs.
/// Each variant carries a single scalar value with implicit units.
///
/// `PartialEq`/`Eq` are derived so samples can be compared directly; every
/// payload is either a [`Duration`] or a `u64`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ObserverMetric {
    /// Time elapsed for a single LLM or tool request.
    RequestLatency(Duration),
    /// Number of tokens consumed by an LLM call.
    TokensUsed(u64),
    /// Current number of active concurrent sessions.
    ActiveSessions(u64),
    /// Current depth of the inbound message queue.
    QueueDepth(u64),
}
93
94/// Core observability trait for recording agent runtime telemetry.
95///
96/// Implement this trait to integrate with any monitoring backend (structured
97/// logging, Prometheus, OpenTelemetry, etc.). The agent runtime holds one or
98/// more `Observer` instances and calls [`record_event`](Observer::record_event)
99/// and [`record_metric`](Observer::record_metric) at key lifecycle points.
100///
101/// Implementations must be `Send + Sync + 'static` because the observer is
102/// shared across async tasks via `Arc`.
103pub trait Observer: Send + Sync + 'static {
104    /// Record a discrete lifecycle event.
105    ///
106    /// Called synchronously on the hot path; implementations should avoid
107    /// blocking I/O. Buffer events internally and flush asynchronously
108    /// when possible.
109    fn record_event(&self, event: &ObserverEvent);
110
111    /// Record a numeric metric sample.
112    ///
113    /// Called synchronously; same non-blocking guidance as
114    /// [`record_event`](Observer::record_event).
115    fn record_metric(&self, metric: &ObserverMetric);
116
117    /// Flush any buffered telemetry data to the backend.
118    ///
119    /// The runtime calls this during graceful shutdown. The default
120    /// implementation is a no-op, which is appropriate for backends
121    /// that write synchronously.
122    fn flush(&self) {}
123
124    /// Return the human-readable name of this observer backend.
125    ///
126    /// Used in logs and diagnostics (e.g., `"console"`, `"prometheus"`,
127    /// `"opentelemetry"`).
128    fn name(&self) -> &str;
129
130    /// Downcast to `Any` for backend-specific operations.
131    ///
132    /// Enables callers to access concrete observer types when needed
133    /// (e.g., retrieving a Prometheus registry handle for custom metrics).
134    fn as_any(&self) -> &dyn std::any::Any;
135}
136
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::sync::Arc;
    use std::time::Duration;

    /// Minimal observer that only counts how many events and metrics it saw.
    ///
    /// Atomics are enough for plain counters and avoid the lock/unwrap
    /// ceremony a `Mutex<u64>` would need.
    #[derive(Default)]
    struct DummyObserver {
        events: AtomicU64,
        metrics: AtomicU64,
    }

    impl DummyObserver {
        /// Number of events recorded so far.
        fn event_count(&self) -> u64 {
            self.events.load(Ordering::SeqCst)
        }

        /// Number of metric samples recorded so far.
        fn metric_count(&self) -> u64 {
            self.metrics.load(Ordering::SeqCst)
        }
    }

    impl Observer for DummyObserver {
        fn record_event(&self, _event: &ObserverEvent) {
            self.events.fetch_add(1, Ordering::SeqCst);
        }

        fn record_metric(&self, _metric: &ObserverMetric) {
            self.metrics.fetch_add(1, Ordering::SeqCst);
        }

        fn name(&self) -> &str {
            "dummy-observer"
        }

        fn as_any(&self) -> &dyn std::any::Any {
            self
        }
    }

    #[test]
    fn observer_records_events_and_metrics() {
        let observer = DummyObserver::default();

        observer.record_event(&ObserverEvent::HeartbeatTick);
        observer.record_event(&ObserverEvent::Error {
            component: "test".into(),
            message: "boom".into(),
        });
        observer.record_metric(&ObserverMetric::TokensUsed(42));

        assert_eq!(observer.event_count(), 2);
        assert_eq!(observer.metric_count(), 1);
    }

    #[test]
    fn observer_default_flush_and_as_any_work() {
        let observer = DummyObserver::default();

        // `flush` is not overridden, so this exercises the trait's default no-op.
        observer.flush();
        assert_eq!(observer.name(), "dummy-observer");
        assert!(observer.as_any().downcast_ref::<DummyObserver>().is_some());
    }

    #[test]
    fn observer_event_and_metric_are_cloneable() {
        let event = ObserverEvent::ToolCall {
            tool: "shell".into(),
            duration: Duration::from_millis(10),
            success: true,
        };
        let metric = ObserverMetric::RequestLatency(Duration::from_millis(8));

        let cloned_event = event.clone();
        let cloned_metric = metric.clone();

        assert!(matches!(cloned_event, ObserverEvent::ToolCall { .. }));
        assert!(matches!(cloned_metric, ObserverMetric::RequestLatency(_)));
    }

    #[test]
    fn observer_is_usable_as_shared_trait_object() {
        // The trait is documented as being shared via `Arc` across tasks, so
        // check that a `dyn Observer` can be moved to another thread.
        let observer: Arc<dyn Observer> = Arc::new(DummyObserver::default());

        let worker = {
            let observer = Arc::clone(&observer);
            std::thread::spawn(move || observer.record_event(&ObserverEvent::TurnComplete))
        };
        worker.join().expect("worker thread panicked");
        observer.record_metric(&ObserverMetric::QueueDepth(3));

        let dummy = observer
            .as_any()
            .downcast_ref::<DummyObserver>()
            .expect("concrete type is DummyObserver");
        assert_eq!(dummy.event_count(), 1);
        assert_eq!(dummy.metric_count(), 1);
    }
}