Skip to main content

smith_config/
observability.rs

//! Observability configuration for OpenTelemetry-based tracing and monitoring.
//!
//! This module provides configuration for the unified observability system
//! with OpenTelemetry traces, metrics, and logs, featuring PII redaction
//! and integration with ClickHouse, Phoenix, and HyperDX.
6
7use anyhow::{Context, Result};
8use serde::{Deserialize, Serialize};
9use std::collections::HashMap;
10
/// Serde default helper for boolean fields that should be `true` when omitted.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
14
15/// Redaction levels for PII and sensitive data
16#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
17#[serde(rename_all = "snake_case")]
18pub enum RedactionLevel {
19    /// Maximum redaction - only essential fields preserved, everything else hashed
20    Strict,
21    /// Balanced approach - preserve more context while still protecting sensitive data
22    Balanced,
23    /// Permissive mode - only redact obvious secrets (PII, credentials), keep conversational text
24    Permissive,
25}
26
27impl Default for RedactionLevel {
28    fn default() -> Self {
29        RedactionLevel::Permissive
30    }
31}
32
33/// Sampling strategies for OpenTelemetry traces
34#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
35#[serde(rename_all = "snake_case")]
36pub enum SamplingStrategy {
37    /// Always sample all traces
38    AlwaysOn,
39    /// Never sample traces (only for emergencies)
40    AlwaysOff,
41    /// Parent-based sampling with fallback
42    ParentBased { fallback_ratio: f64 },
43    /// Fixed ratio sampling
44    Ratio(f64),
45}
46
47impl Default for SamplingStrategy {
48    fn default() -> Self {
49        SamplingStrategy::ParentBased {
50            fallback_ratio: 0.1,
51        }
52    }
53}
54
55/// OpenTelemetry Collector configuration
56#[derive(Debug, Clone, Serialize, Deserialize)]
57pub struct CollectorConfig {
58    /// OTLP endpoint for receiving traces/metrics/logs
59    pub otlp_endpoint: String,
60
61    /// OTLP endpoint for HTTP (fallback)
62    pub otlp_http_endpoint: String,
63
64    /// ClickHouse configuration for trace storage
65    pub clickhouse: ClickHouseConfig,
66
67    /// Phoenix configuration for LLM/agent session replay
68    pub phoenix: PhoenixConfig,
69
70    /// HyperDX configuration for unified observability UI
71    pub hyperdx: HyperDxConfig,
72
73    /// Memory limits for the collector
74    pub memory_limit_mib: u64,
75
76    /// Batch processing configuration
77    pub batch_timeout_ms: u64,
78    pub batch_send_size: u32,
79}
80
81/// ClickHouse configuration for OLAP storage
82#[derive(Debug, Clone, Serialize, Deserialize)]
83pub struct ClickHouseConfig {
84    /// ClickHouse connection URL
85    pub url: String,
86
87    /// Database name for observability data
88    pub database: String,
89
90    /// Username for ClickHouse connection
91    pub username: String,
92
93    /// Password for ClickHouse connection (should be from env var)
94    pub password: String,
95
96    /// Enable compression for better performance
97    pub compression: bool,
98
99    /// Table TTL in days
100    pub ttl_days: u32,
101}
102
103/// Phoenix configuration for LLM/agent session replay
104#[derive(Debug, Clone, Serialize, Deserialize)]
105pub struct PhoenixConfig {
106    /// Phoenix OTLP ingestion endpoint
107    pub otlp_endpoint: String,
108
109    /// Phoenix web UI endpoint
110    pub web_endpoint: String,
111
112    /// Enable Phoenix ingestion
113    pub enabled: bool,
114}
115
116/// HyperDX configuration for unified observability
117#[derive(Debug, Clone, Serialize, Deserialize)]
118pub struct HyperDxConfig {
119    /// HyperDX web UI endpoint
120    pub web_endpoint: String,
121
122    /// HyperDX API endpoint
123    pub api_endpoint: String,
124
125    /// Enable HyperDX integration
126    pub enabled: bool,
127}
128
129/// Main observability configuration
130#[derive(Debug, Clone, Serialize, Deserialize)]
131pub struct ObservabilityConfig {
132    /// Master kill-switch for all observability features
133    pub enabled: bool,
134
135    /// Redaction level for PII and sensitive data
136    pub redaction_level: RedactionLevel,
137
138    /// OpenTelemetry service configuration
139    pub service_name: String,
140    pub service_version: String,
141    pub deployment_environment: String,
142
143    /// Sampling configuration
144    pub sampling: SamplingStrategy,
145
146    /// Resource attributes to add to all telemetry
147    pub resource_attributes: HashMap<String, String>,
148
149    /// OpenTelemetry Collector configuration
150    pub collector: CollectorConfig,
151
152    /// Enable different telemetry types
153    pub traces_enabled: bool,
154    pub metrics_enabled: bool,
155    pub logs_enabled: bool,
156
157    /// NATS trace propagation configuration
158    pub nats_propagation_enabled: bool,
159
160    /// Session management configuration
161    pub session_timeout_minutes: u64,
162
163    /// Cost tracking configuration
164    pub cost_tracking_enabled: bool,
165
166    /// Performance monitoring thresholds
167    pub performance_thresholds: PerformanceThresholds,
168
169    /// Optional chat bridge configuration for task notifications
170    #[serde(default)]
171    pub chat_bridge_tasks: Option<TasksBridgeConfig>,
172}
173
174/// Configuration for routing spans to a Mattermost tasks channel via chat bridge
175#[derive(Debug, Clone, Serialize, Deserialize)]
176pub struct TasksBridgeConfig {
177    /// Enable Mattermost task notifications
178    pub enabled: bool,
179
180    /// Mattermost connection details
181    pub mattermost: MattermostBridgeSettings,
182
183    /// Target channel configuration
184    pub channel: MattermostChannelSettings,
185}
186
187impl Default for TasksBridgeConfig {
188    fn default() -> Self {
189        Self {
190            enabled: false,
191            mattermost: MattermostBridgeSettings::default(),
192            channel: MattermostChannelSettings::default(),
193        }
194    }
195}
196
197/// Mattermost bot configuration for chat bridge
198#[derive(Debug, Clone, Serialize, Deserialize)]
199pub struct MattermostBridgeSettings {
200    /// Mattermost server base URL (e.g., <https://mattermost.example.com>)
201    pub base_url: String,
202
203    /// Personal access token for the bot user
204    pub access_token: String,
205
206    /// Prefer the Mattermost AI Agent bridge instead of the REST bot flow
207    #[serde(default)]
208    pub use_agent_bridge: bool,
209
210    /// Optional Mattermost plugin identifier hosting the bridge endpoint
211    #[serde(default)]
212    pub plugin_id: Option<String>,
213
214    /// Optional override for the bridge URL (defaults to the plugin bridge endpoint)
215    #[serde(default)]
216    pub bridge_url: Option<String>,
217
218    /// Shared secret used when calling the bridge endpoint
219    #[serde(default)]
220    pub webhook_secret: Option<String>,
221
222    /// Optional agent identifier scoped to this bridge
223    #[serde(default)]
224    pub agent_id: Option<String>,
225
226    /// Optional adapter label used to identify this bridge instance
227    #[serde(default)]
228    pub label: Option<String>,
229
230    /// Verify TLS certificates when connecting to Mattermost
231    #[serde(default = "default_true")]
232    pub verify_tls: bool,
233}
234
235impl Default for MattermostBridgeSettings {
236    fn default() -> Self {
237        Self {
238            base_url: "http://localhost:8065".to_string(),
239            access_token: String::new(),
240            use_agent_bridge: false,
241            plugin_id: None,
242            bridge_url: None,
243            webhook_secret: None,
244            agent_id: None,
245            label: Some("mattermost-tasks".to_string()),
246            verify_tls: true,
247        }
248    }
249}
250
251/// Target Mattermost channel for task notifications
252#[derive(Debug, Clone, Serialize, Deserialize)]
253pub struct MattermostChannelSettings {
254    /// Team ID that owns the channel
255    pub team_id: String,
256
257    /// Channel ID that should receive notifications
258    pub channel_id: String,
259
260    /// Optional channel display name
261    #[serde(default)]
262    pub channel_name: Option<String>,
263
264    /// Optional prefix added ahead of the trace header message
265    #[serde(default)]
266    pub thread_prefix: Option<String>,
267}
268
269impl Default for MattermostChannelSettings {
270    fn default() -> Self {
271        Self {
272            team_id: String::new(),
273            channel_id: String::new(),
274            channel_name: None,
275            thread_prefix: Some("#tasks".to_string()),
276        }
277    }
278}
279
280/// Performance monitoring thresholds for alerting
281#[derive(Debug, Clone, Serialize, Deserialize)]
282pub struct PerformanceThresholds {
283    /// Maximum allowed latency in milliseconds
284    pub max_latency_ms: u64,
285
286    /// Maximum token cost per operation in USD
287    pub max_cost_usd: f64,
288
289    /// Maximum memory usage in MB
290    pub max_memory_mb: u64,
291
292    /// CPU usage threshold percentage
293    pub cpu_threshold_percent: f32,
294}
295
296impl Default for CollectorConfig {
297    fn default() -> Self {
298        Self {
299            otlp_endpoint: "http://localhost:4317".to_string(),
300            otlp_http_endpoint: "http://localhost:4318".to_string(),
301            clickhouse: ClickHouseConfig::default(),
302            phoenix: PhoenixConfig::default(),
303            hyperdx: HyperDxConfig::default(),
304            memory_limit_mib: 512,
305            batch_timeout_ms: 5000,
306            batch_send_size: 512,
307        }
308    }
309}
310
311impl Default for ClickHouseConfig {
312    fn default() -> Self {
313        Self {
314            url: "http://localhost:8123".to_string(),
315            database: "otel".to_string(),
316            username: "default".to_string(),
317            password: "".to_string(),
318            compression: true,
319            ttl_days: 30,
320        }
321    }
322}
323
324impl Default for PhoenixConfig {
325    fn default() -> Self {
326        Self {
327            otlp_endpoint: "http://localhost:6006".to_string(),
328            web_endpoint: "http://localhost:6006".to_string(),
329            enabled: true,
330        }
331    }
332}
333
334impl Default for HyperDxConfig {
335    fn default() -> Self {
336        Self {
337            web_endpoint: "http://localhost:8080".to_string(),
338            api_endpoint: "http://localhost:8080/api".to_string(),
339            enabled: true,
340        }
341    }
342}
343
344impl Default for PerformanceThresholds {
345    fn default() -> Self {
346        Self {
347            max_latency_ms: 30000, // 30 seconds
348            max_cost_usd: 1.0,     // $1 per operation
349            max_memory_mb: 1024,   // 1GB
350            cpu_threshold_percent: 80.0,
351        }
352    }
353}
354
355impl Default for ObservabilityConfig {
356    fn default() -> Self {
357        Self {
358            enabled: false, // Safe default - must be explicitly enabled
359            redaction_level: RedactionLevel::Permissive,
360            service_name: "smith".to_string(),
361            service_version: "0.1.1".to_string(),
362            deployment_environment: "development".to_string(),
363            sampling: SamplingStrategy::default(),
364            resource_attributes: HashMap::new(),
365            collector: CollectorConfig::default(),
366            traces_enabled: true,
367            metrics_enabled: true,
368            logs_enabled: true,
369            nats_propagation_enabled: true,
370            session_timeout_minutes: 60,
371            cost_tracking_enabled: true,
372            performance_thresholds: PerformanceThresholds::default(),
373            chat_bridge_tasks: None,
374        }
375    }
376}
377
378impl ObservabilityConfig {
379    /// Validate observability configuration
380    pub fn validate(&self) -> Result<()> {
381        if self.service_name.is_empty() {
382            return Err(anyhow::anyhow!("Service name cannot be empty"));
383        }
384
385        if self.service_version.is_empty() {
386            return Err(anyhow::anyhow!("Service version cannot be empty"));
387        }
388
389        if self.deployment_environment.is_empty() {
390            return Err(anyhow::anyhow!("Deployment environment cannot be empty"));
391        }
392
393        if self.session_timeout_minutes == 0 {
394            return Err(anyhow::anyhow!("Session timeout cannot be zero"));
395        }
396
397        if self.session_timeout_minutes > 1440 {
398            return Err(anyhow::anyhow!("Session timeout cannot exceed 24 hours"));
399        }
400
401        // Validate sampling strategy
402        match &self.sampling {
403            SamplingStrategy::Ratio(ratio) => {
404                if *ratio < 0.0 || *ratio > 1.0 {
405                    return Err(anyhow::anyhow!(
406                        "Sampling ratio must be between 0.0 and 1.0"
407                    ));
408                }
409            }
410            SamplingStrategy::ParentBased { fallback_ratio } => {
411                if *fallback_ratio < 0.0 || *fallback_ratio > 1.0 {
412                    return Err(anyhow::anyhow!(
413                        "Fallback sampling ratio must be between 0.0 and 1.0"
414                    ));
415                }
416            }
417            _ => {}
418        }
419
420        // Validate collector configuration
421        self.collector
422            .validate()
423            .context("Collector configuration validation failed")?;
424
425        // Validate performance thresholds
426        self.performance_thresholds
427            .validate()
428            .context("Performance thresholds validation failed")?;
429
430        if let Some(tasks) = &self.chat_bridge_tasks {
431            if tasks.enabled {
432                if tasks.mattermost.base_url.trim().is_empty() {
433                    return Err(anyhow::anyhow!(
434                        "Mattermost base_url must be set when chat bridge tasks are enabled"
435                    ));
436                }
437                if tasks.mattermost.use_agent_bridge {
438                    let secret_empty = tasks
439                        .mattermost
440                        .webhook_secret
441                        .as_ref()
442                        .map(|secret| secret.trim().is_empty())
443                        .unwrap_or(true);
444                    if secret_empty {
445                        return Err(anyhow::anyhow!(
446                            "Mattermost webhook_secret must be set when use_agent_bridge is enabled"
447                        ));
448                    }
449                } else if tasks.mattermost.access_token.trim().is_empty() {
450                    return Err(anyhow::anyhow!(
451                        "Mattermost access_token must be set when chat bridge tasks are enabled"
452                    ));
453                }
454                if tasks.channel.team_id.trim().is_empty() {
455                    return Err(anyhow::anyhow!(
456                        "Mattermost team_id must be set for chat bridge tasks"
457                    ));
458                }
459                if tasks.channel.channel_id.trim().is_empty() {
460                    return Err(anyhow::anyhow!(
461                        "Mattermost channel_id must be set for chat bridge tasks"
462                    ));
463                }
464            }
465        }
466
467        Ok(())
468    }
469
470    /// Get development environment configuration
471    pub fn development() -> Self {
472        Self {
473            enabled: false, // Start disabled even in dev for safety
474            deployment_environment: "development".to_string(),
475            sampling: SamplingStrategy::AlwaysOn, // Full sampling in dev
476            collector: CollectorConfig {
477                memory_limit_mib: 256,  // Lower memory for dev
478                batch_timeout_ms: 1000, // Faster batching in dev
479                ..CollectorConfig::default()
480            },
481            ..Self::default()
482        }
483    }
484
485    /// Get production environment configuration
486    pub fn production() -> Self {
487        Self {
488            enabled: false, // Must be explicitly enabled
489            deployment_environment: "production".to_string(),
490            redaction_level: RedactionLevel::Strict, // Maximum protection
491            sampling: SamplingStrategy::ParentBased {
492                fallback_ratio: 0.1,
493            },
494            collector: CollectorConfig {
495                memory_limit_mib: 1024, // Higher memory for prod
496                ..CollectorConfig::default()
497            },
498            performance_thresholds: PerformanceThresholds {
499                max_latency_ms: 10000, // Stricter in prod
500                ..PerformanceThresholds::default()
501            },
502            ..Self::default()
503        }
504    }
505
506    /// Get testing environment configuration  
507    pub fn testing() -> Self {
508        Self {
509            enabled: false, // Disabled during tests by default
510            deployment_environment: "testing".to_string(),
511            sampling: SamplingStrategy::AlwaysOff, // No sampling during tests
512            traces_enabled: false,
513            metrics_enabled: false,
514            logs_enabled: false,
515            ..Self::default()
516        }
517    }
518}
519
520impl CollectorConfig {
521    pub fn validate(&self) -> Result<()> {
522        if self.otlp_endpoint.is_empty() {
523            return Err(anyhow::anyhow!("OTLP endpoint cannot be empty"));
524        }
525
526        if self.otlp_http_endpoint.is_empty() {
527            return Err(anyhow::anyhow!("OTLP HTTP endpoint cannot be empty"));
528        }
529
530        if self.memory_limit_mib == 0 {
531            return Err(anyhow::anyhow!("Memory limit cannot be zero"));
532        }
533
534        if self.memory_limit_mib < 64 {
535            return Err(anyhow::anyhow!(
536                "Memory limit too low, minimum 64 MiB required"
537            ));
538        }
539
540        if self.batch_timeout_ms == 0 {
541            return Err(anyhow::anyhow!("Batch timeout cannot be zero"));
542        }
543
544        if self.batch_send_size == 0 {
545            return Err(anyhow::anyhow!("Batch send size cannot be zero"));
546        }
547
548        self.clickhouse
549            .validate()
550            .context("ClickHouse configuration validation failed")?;
551
552        Ok(())
553    }
554}
555
556impl ClickHouseConfig {
557    pub fn validate(&self) -> Result<()> {
558        if self.url.is_empty() {
559            return Err(anyhow::anyhow!("ClickHouse URL cannot be empty"));
560        }
561
562        if self.database.is_empty() {
563            return Err(anyhow::anyhow!("ClickHouse database cannot be empty"));
564        }
565
566        if self.username.is_empty() {
567            return Err(anyhow::anyhow!("ClickHouse username cannot be empty"));
568        }
569
570        if self.ttl_days == 0 {
571            return Err(anyhow::anyhow!("TTL cannot be zero"));
572        }
573
574        if self.ttl_days > 365 {
575            tracing::warn!("TTL > 1 year may consume significant storage space");
576        }
577
578        Ok(())
579    }
580}
581
582impl PerformanceThresholds {
583    pub fn validate(&self) -> Result<()> {
584        if self.max_latency_ms == 0 {
585            return Err(anyhow::anyhow!("Maximum latency cannot be zero"));
586        }
587
588        if self.max_cost_usd < 0.0 {
589            return Err(anyhow::anyhow!("Maximum cost cannot be negative"));
590        }
591
592        if self.max_memory_mb == 0 {
593            return Err(anyhow::anyhow!("Maximum memory cannot be zero"));
594        }
595
596        if self.cpu_threshold_percent <= 0.0 || self.cpu_threshold_percent > 100.0 {
597            return Err(anyhow::anyhow!("CPU threshold must be between 0 and 100"));
598        }
599
600        Ok(())
601    }
602}
603
#[cfg(test)]
mod tests {
    use super::*;

    /// Default configuration with only the sampling strategy replaced.
    fn config_with_sampling(sampling: SamplingStrategy) -> ObservabilityConfig {
        ObservabilityConfig {
            sampling,
            ..ObservabilityConfig::default()
        }
    }

    #[test]
    fn test_default_observability_config() {
        let defaults = ObservabilityConfig::default();

        // Observability stays off until someone opts in.
        assert!(!defaults.enabled);
        assert_eq!(defaults.redaction_level, RedactionLevel::Permissive);
        assert!(defaults.validate().is_ok());
    }

    #[test]
    fn test_environment_configs() {
        let development = ObservabilityConfig::development();
        let production = ObservabilityConfig::production();
        let testing = ObservabilityConfig::testing();

        // Every preset must pass validation as shipped.
        assert!(development.validate().is_ok());
        assert!(production.validate().is_ok());
        assert!(testing.validate().is_ok());

        // Development samples everything.
        assert_eq!(development.sampling, SamplingStrategy::AlwaysOn);

        // Production redacts strictly.
        assert_eq!(production.redaction_level, RedactionLevel::Strict);

        // Testing turns all telemetry types off.
        assert!(!testing.traces_enabled);
        assert!(!testing.metrics_enabled);
        assert!(!testing.logs_enabled);
    }

    #[test]
    fn test_redaction_levels() {
        assert_ne!(RedactionLevel::Strict, RedactionLevel::Balanced);
        assert_ne!(RedactionLevel::Balanced, RedactionLevel::Permissive);
        assert_eq!(RedactionLevel::default(), RedactionLevel::Permissive);
    }

    #[test]
    fn test_sampling_validation() {
        let in_range = config_with_sampling(SamplingStrategy::Ratio(0.5));
        assert!(in_range.validate().is_ok());

        let below_range = config_with_sampling(SamplingStrategy::Ratio(-0.1));
        assert!(below_range.validate().is_err());

        let above_range = config_with_sampling(SamplingStrategy::Ratio(1.1));
        assert!(above_range.validate().is_err());
    }

    #[test]
    fn test_performance_thresholds_validation() {
        // The defaults are valid.
        assert!(PerformanceThresholds::default().validate().is_ok());

        // CPU threshold above 100%.
        let cpu_too_high = PerformanceThresholds {
            cpu_threshold_percent: 150.0,
            ..PerformanceThresholds::default()
        };
        assert!(cpu_too_high.validate().is_err());

        // Negative CPU threshold.
        let cpu_negative = PerformanceThresholds {
            cpu_threshold_percent: -10.0,
            ..PerformanceThresholds::default()
        };
        assert!(cpu_negative.validate().is_err());

        // Negative cost with an otherwise valid CPU threshold.
        let cost_negative = PerformanceThresholds {
            cpu_threshold_percent: 80.0,
            max_cost_usd: -1.0,
            ..PerformanceThresholds::default()
        };
        assert!(cost_negative.validate().is_err());
    }
}
687        thresholds.cpu_threshold_percent = 80.0;
688        thresholds.max_cost_usd = -1.0;
689        assert!(thresholds.validate().is_err());
690    }
691}