attuned_core/telemetry/
mod.rs

1//! Observability infrastructure for Attuned.
2//!
3//! This module provides:
4//! - Structured logging with [`tracing`]
5//! - Metrics collection
6//! - Health check primitives
7//! - Audit event types
8//!
9//! ## Quick Start
10//!
11//! ```rust,ignore
12//! use attuned_core::telemetry::{init_tracing, TracingConfig};
13//!
14//! fn main() {
15//!     let _guard = init_tracing(TracingConfig::default());
16//!     // All Attuned operations now emit traces
17//! }
18//! ```
19
20mod setup;
21
22pub use setup::{init_tracing, init_tracing_from_env, TelemetryBuilder, TelemetryGuard};
23
24use serde::{Deserialize, Serialize};
25
26/// Configuration for tracing/logging.
27#[derive(Clone, Debug)]
28pub struct TracingConfig {
29    /// Output format.
30    pub format: TracingFormat,
31    /// Service name for distributed tracing.
32    pub service_name: String,
33    /// Minimum log level.
34    pub level: String,
35    /// Include source file and line in logs.
36    pub include_file_line: bool,
37    /// Include target (module path) in logs.
38    pub include_target: bool,
39}
40
41impl Default for TracingConfig {
42    fn default() -> Self {
43        Self {
44            format: TracingFormat::Pretty,
45            service_name: "attuned".to_string(),
46            level: "info".to_string(),
47            include_file_line: false,
48            include_target: true,
49        }
50    }
51}
52
/// Tracing output format.
///
/// [`TracingFormat::Pretty`] is the default. The enum implements
/// `PartialEq`/`Eq` so a configured format can be compared directly
/// (for example `config.format == TracingFormat::Json`).
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub enum TracingFormat {
    /// Human-readable colored output (development).
    #[default]
    Pretty,
    /// JSON structured output (production).
    Json,
    /// Compact single-line output.
    Compact,
}
64
/// Settings for exporting telemetry through OpenTelemetry.
///
/// All fields are public and plain values; this struct performs no
/// validation of its own.
#[derive(Debug, Clone)]
pub struct OtelConfig {
    /// URL of the OTLP endpoint to export to.
    pub endpoint: String,
    /// Name under which this service reports.
    pub service_name: String,
    /// Version string reported for this service.
    pub service_version: String,
    /// Sample rate, intended to lie between 0.0 and 1.0.
    pub sample_rate: f64,
}
77
78impl Default for OtelConfig {
79    fn default() -> Self {
80        Self {
81            endpoint: "http://localhost:4317".to_string(),
82            service_name: "attuned".to_string(),
83            service_version: env!("CARGO_PKG_VERSION").to_string(),
84            sample_rate: 1.0,
85        }
86    }
87}
88
89/// Health status of a component.
90#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
91#[serde(rename_all = "lowercase")]
92pub enum HealthState {
93    /// Component is fully operational.
94    Healthy,
95    /// Component is operational but with issues.
96    Degraded,
97    /// Component is not operational.
98    Unhealthy,
99}
100
101/// Health status of a single component.
102#[derive(Clone, Debug, Serialize, Deserialize)]
103pub struct ComponentHealth {
104    /// Component name.
105    pub name: String,
106    /// Component status.
107    pub status: HealthState,
108    /// Response latency in milliseconds.
109    pub latency_ms: Option<u64>,
110    /// Additional status message.
111    pub message: Option<String>,
112}
113
114impl ComponentHealth {
115    /// Create a healthy component status.
116    pub fn healthy(name: impl Into<String>) -> Self {
117        Self {
118            name: name.into(),
119            status: HealthState::Healthy,
120            latency_ms: None,
121            message: None,
122        }
123    }
124
125    /// Create a healthy component status with latency.
126    pub fn healthy_with_latency(name: impl Into<String>, latency_ms: u64) -> Self {
127        Self {
128            name: name.into(),
129            status: HealthState::Healthy,
130            latency_ms: Some(latency_ms),
131            message: None,
132        }
133    }
134
135    /// Create an unhealthy component status.
136    pub fn unhealthy(name: impl Into<String>, message: impl Into<String>) -> Self {
137        Self {
138            name: name.into(),
139            status: HealthState::Unhealthy,
140            latency_ms: None,
141            message: Some(message.into()),
142        }
143    }
144
145    /// Create a degraded component status.
146    pub fn degraded(name: impl Into<String>, message: impl Into<String>) -> Self {
147        Self {
148            name: name.into(),
149            status: HealthState::Degraded,
150            latency_ms: None,
151            message: Some(message.into()),
152        }
153    }
154}
155
156/// Overall system health status.
157#[derive(Clone, Debug, Serialize, Deserialize)]
158pub struct HealthStatus {
159    /// Overall status (worst of all components).
160    pub status: HealthState,
161    /// Service version.
162    pub version: String,
163    /// Uptime in seconds.
164    pub uptime_seconds: u64,
165    /// Individual component health checks.
166    pub checks: Vec<ComponentHealth>,
167}
168
169impl HealthStatus {
170    /// Create a new health status from component checks.
171    pub fn from_checks(checks: Vec<ComponentHealth>, uptime_seconds: u64) -> Self {
172        let status = checks
173            .iter()
174            .map(|c| &c.status)
175            .fold(HealthState::Healthy, |acc, s| match (&acc, s) {
176                (HealthState::Unhealthy, _) | (_, HealthState::Unhealthy) => HealthState::Unhealthy,
177                (HealthState::Degraded, _) | (_, HealthState::Degraded) => HealthState::Degraded,
178                _ => HealthState::Healthy,
179            });
180
181        Self {
182            status,
183            version: env!("CARGO_PKG_VERSION").to_string(),
184            uptime_seconds,
185            checks,
186        }
187    }
188}
189
/// Trait for components that can report their health.
///
/// Implementors must be `Send + Sync`, as required by the supertrait bounds.
/// The `async_trait` attribute is what permits the `async fn` in this trait
/// definition.
#[async_trait::async_trait]
pub trait HealthCheck: Send + Sync {
    /// Perform a health check and return the status.
    ///
    /// The method returns a [`ComponentHealth`] directly rather than a
    /// `Result`, so a failing probe is presumably meant to be reported through
    /// the returned value (e.g. [`ComponentHealth::unhealthy`]).
    async fn check(&self) -> ComponentHealth;
}
196
197/// Audit event types for state mutations.
198#[derive(Clone, Debug, Serialize, Deserialize)]
199#[serde(rename_all = "snake_case")]
200pub enum AuditEventType {
201    /// New state created for a user.
202    StateCreated,
203    /// Existing state updated.
204    StateUpdated,
205    /// State deleted.
206    StateDeleted,
207}
208
209/// Audit event for tracking state mutations.
210#[derive(Clone, Debug, Serialize, Deserialize)]
211pub struct AuditEvent {
212    /// Event timestamp.
213    pub timestamp: chrono::DateTime<chrono::Utc>,
214    /// Type of event.
215    pub event_type: AuditEventType,
216    /// User ID affected.
217    pub user_id: String,
218    /// Source of the state change.
219    pub source: crate::Source,
220    /// Which axes were modified.
221    pub axes_changed: Vec<String>,
222    /// Confidence of the new state.
223    pub confidence: f32,
224    /// Trace ID for correlation (if available).
225    pub trace_id: Option<String>,
226}
227
228impl AuditEvent {
229    /// Create a new audit event.
230    pub fn new(
231        event_type: AuditEventType,
232        user_id: impl Into<String>,
233        source: crate::Source,
234        axes_changed: Vec<String>,
235        confidence: f32,
236    ) -> Self {
237        Self {
238            timestamp: chrono::Utc::now(),
239            event_type,
240            user_id: user_id.into(),
241            source,
242            axes_changed,
243            confidence,
244            trace_id: None,
245        }
246    }
247
248    /// Set the trace ID for correlation.
249    pub fn with_trace_id(mut self, trace_id: impl Into<String>) -> Self {
250        self.trace_id = Some(trace_id.into());
251        self
252    }
253
254    /// Emit this audit event to the tracing system.
255    pub fn emit(&self) {
256        tracing::info!(
257            event_type = ?self.event_type,
258            user_id = %self.user_id,
259            source = %self.source,
260            axes_changed = ?self.axes_changed,
261            confidence = %self.confidence,
262            trace_id = ?self.trace_id,
263            "audit_event"
264        );
265    }
266}
267
/// Metric names used by Attuned.
///
/// Every name carries the `attuned_` prefix. Counter names end in `_total`
/// and duration histograms end in `_duration_seconds`.
pub mod metric_names {
    /// Counter: Total state update operations.
    pub const STATE_UPDATES_TOTAL: &str = "attuned_state_updates_total";
    /// Counter: Total state read operations.
    pub const STATE_READS_TOTAL: &str = "attuned_state_reads_total";
    /// Counter: Total translation operations.
    pub const TRANSLATIONS_TOTAL: &str = "attuned_translations_total";
    /// Counter: Total errors by type.
    pub const ERRORS_TOTAL: &str = "attuned_errors_total";
    /// Histogram: State update duration in seconds.
    pub const STATE_UPDATE_DURATION: &str = "attuned_state_update_duration_seconds";
    /// Histogram: State read duration in seconds.
    pub const STATE_READ_DURATION: &str = "attuned_state_read_duration_seconds";
    /// Histogram: Translation duration in seconds.
    pub const TRANSLATION_DURATION: &str = "attuned_translation_duration_seconds";
    /// Gauge: Number of active users (with state in last N minutes).
    pub const ACTIVE_USERS: &str = "attuned_active_users";
    /// Histogram: HTTP request duration in seconds.
    pub const HTTP_REQUEST_DURATION: &str = "attuned_http_request_duration_seconds";
}
289
/// Span names used by Attuned.
///
/// Every name starts with the `attuned.` prefix and uses dot-separated
/// segments.
pub mod span_names {
    /// Span for state upsert operations.
    pub const STORE_UPSERT: &str = "attuned.store.upsert";
    /// Span for state get operations.
    pub const STORE_GET: &str = "attuned.store.get";
    /// Span for state delete operations.
    pub const STORE_DELETE: &str = "attuned.store.delete";
    /// Span for translation operations.
    pub const TRANSLATE: &str = "attuned.translate";
    /// Span for health check operations.
    pub const HEALTH_CHECK: &str = "attuned.health_check";
    /// Span for HTTP request handling.
    pub const HTTP_REQUEST: &str = "attuned.http.request";
}
305
#[cfg(test)]
mod tests {
    use super::*;

    /// One degraded component downgrades an otherwise healthy report.
    #[test]
    fn test_health_status_aggregation() {
        let report = HealthStatus::from_checks(
            vec![
                ComponentHealth::healthy("store"),
                ComponentHealth::degraded("qdrant", "high latency"),
            ],
            100,
        );

        assert_eq!(report.status, HealthState::Degraded);
    }

    /// An unhealthy component outranks both healthy and degraded ones.
    #[test]
    fn test_health_status_unhealthy_dominates() {
        let report = HealthStatus::from_checks(
            vec![
                ComponentHealth::healthy("store"),
                ComponentHealth::unhealthy("qdrant", "connection failed"),
                ComponentHealth::degraded("cache", "high miss rate"),
            ],
            100,
        );

        assert_eq!(report.status, HealthState::Unhealthy);
    }

    /// The constructor stores the user ID and the changed axes it was given.
    #[test]
    fn test_audit_event_creation() {
        let axes = vec![String::from("warmth"), String::from("formality")];
        let event = AuditEvent::new(
            AuditEventType::StateUpdated,
            "user_123",
            crate::Source::SelfReport,
            axes,
            1.0,
        );

        assert_eq!(event.user_id, "user_123");
        assert_eq!(event.axes_changed.len(), 2);
    }
}