smith-config 0.1.2

Unified configuration management for agent services
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
//! Observability configuration for OpenTelemetry-based tracing and monitoring
//!
//! This module provides configuration for the unified observability system
//! with OpenTelemetry traces, metrics, and logs, featuring PII redaction
//! and integration with ClickHouse, Phoenix, and HyperDX.

use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// Serde default helper returning `true`.
///
/// Referenced by name from `#[serde(default = "default_true")]` so that a boolean
/// field missing from the input deserializes as enabled rather than `false`.
fn default_true() -> bool {
    true
}

/// Redaction levels for PII and sensitive data.
///
/// Variant names are (de)serialized in `snake_case` (`strict`, `balanced`,
/// `permissive`). The `Default` implementation in this module selects
/// [`RedactionLevel::Permissive`]; `ObservabilityConfig::production` overrides it
/// with [`RedactionLevel::Strict`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum RedactionLevel {
    /// Maximum redaction - only essential fields preserved, everything else hashed
    Strict,
    /// Balanced approach - preserve more context while still protecting sensitive data
    Balanced,
    /// Permissive mode - only redact obvious secrets (PII, credentials), keep conversational text
    Permissive,
}

impl Default for RedactionLevel {
    fn default() -> Self {
        RedactionLevel::Permissive
    }
}

/// Sampling strategies for OpenTelemetry traces.
///
/// Variant names are (de)serialized in `snake_case`. The ratios carried by
/// [`SamplingStrategy::ParentBased`] and [`SamplingStrategy::Ratio`] are checked by
/// `ObservabilityConfig::validate`, which requires them to lie between `0.0` and `1.0`.
/// The `Default` implementation in this module is `ParentBased` with a fallback
/// ratio of `0.1`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum SamplingStrategy {
    /// Always sample all traces
    AlwaysOn,
    /// Never sample traces (only for emergencies)
    AlwaysOff,
    /// Parent-based sampling with fallback
    ParentBased { fallback_ratio: f64 },
    /// Fixed ratio sampling
    Ratio(f64),
}

impl Default for SamplingStrategy {
    fn default() -> Self {
        SamplingStrategy::ParentBased {
            fallback_ratio: 0.1,
        }
    }
}

/// OpenTelemetry Collector configuration.
///
/// `CollectorConfig::validate` rejects empty endpoints, a memory limit below
/// 64 MiB, and zero batch settings, and also validates the nested ClickHouse
/// configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CollectorConfig {
    /// OTLP endpoint for receiving traces/metrics/logs
    /// (default: `http://localhost:4317`; must not be empty)
    pub otlp_endpoint: String,

    /// OTLP endpoint for HTTP (fallback)
    /// (default: `http://localhost:4318`; must not be empty)
    pub otlp_http_endpoint: String,

    /// ClickHouse configuration for trace storage
    pub clickhouse: ClickHouseConfig,

    /// Phoenix configuration for LLM/agent session replay
    pub phoenix: PhoenixConfig,

    /// HyperDX configuration for unified observability UI
    pub hyperdx: HyperDxConfig,

    /// Memory limit for the collector in MiB (default: 512; validation requires at least 64)
    pub memory_limit_mib: u64,

    /// Batch processing timeout in milliseconds (default: 5000; must be non-zero)
    pub batch_timeout_ms: u64,
    /// Batch send size (default: 512; must be non-zero)
    pub batch_send_size: u32,
}

/// ClickHouse configuration for OLAP storage.
///
/// NOTE(review): this type derives `Debug` and `Serialize`, so `password` is
/// included verbatim in `{:?}` output and in serialized configuration - avoid
/// logging whole config values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClickHouseConfig {
    /// ClickHouse connection URL (default: `http://localhost:8123`; must not be empty)
    pub url: String,

    /// Database name for observability data (default: `otel`; must not be empty)
    pub database: String,

    /// Username for ClickHouse connection (default: `default`; must not be empty)
    pub username: String,

    /// Password for ClickHouse connection (should be from env var).
    /// Defaults to the empty string and is not checked by `validate`.
    pub password: String,

    /// Enable compression for better performance (default: `true`)
    pub compression: bool,

    /// Table TTL in days (default: 30; must be non-zero, values above 365 only log a warning)
    pub ttl_days: u32,
}

/// Phoenix configuration for LLM/agent session replay.
///
/// None of these fields are checked by the validation routines in this module.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PhoenixConfig {
    /// Phoenix OTLP ingestion endpoint (default: `http://localhost:6006`)
    pub otlp_endpoint: String,

    /// Phoenix web UI endpoint (default: `http://localhost:6006`)
    pub web_endpoint: String,

    /// Enable Phoenix ingestion (default: `true`)
    pub enabled: bool,
}

/// HyperDX configuration for unified observability.
///
/// None of these fields are checked by the validation routines in this module.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HyperDxConfig {
    /// HyperDX web UI endpoint (default: `http://localhost:8080`)
    pub web_endpoint: String,

    /// HyperDX API endpoint (default: `http://localhost:8080/api`)
    pub api_endpoint: String,

    /// Enable HyperDX integration (default: `true`)
    pub enabled: bool,
}

/// Main observability configuration.
///
/// Use `ObservabilityConfig::validate` to check a loaded value. The presets
/// `development`, `production` and `testing` all start from `Default` and keep
/// `enabled` set to `false`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ObservabilityConfig {
    /// Master kill-switch for all observability features (default: `false`)
    pub enabled: bool,

    /// Redaction level for PII and sensitive data
    pub redaction_level: RedactionLevel,

    /// OpenTelemetry service name (default: `smith`; must not be empty)
    pub service_name: String,
    /// Service version reported with telemetry (default: `0.1.1`; must not be empty)
    pub service_version: String,
    /// Deployment environment label, e.g. `development`, `production`, `testing`
    /// (must not be empty)
    pub deployment_environment: String,

    /// Sampling configuration
    pub sampling: SamplingStrategy,

    /// Resource attributes to add to all telemetry
    pub resource_attributes: HashMap<String, String>,

    /// OpenTelemetry Collector configuration
    pub collector: CollectorConfig,

    /// Enable trace telemetry (default: `true`)
    pub traces_enabled: bool,
    /// Enable metric telemetry (default: `true`)
    pub metrics_enabled: bool,
    /// Enable log telemetry (default: `true`)
    pub logs_enabled: bool,

    /// NATS trace propagation configuration (default: `true`)
    pub nats_propagation_enabled: bool,

    /// Session timeout in minutes (default: 60; validation requires 1..=1440)
    pub session_timeout_minutes: u64,

    /// Cost tracking configuration (default: `true`)
    pub cost_tracking_enabled: bool,

    /// Performance monitoring thresholds
    pub performance_thresholds: PerformanceThresholds,

    /// Optional chat bridge configuration for task notifications.
    /// A missing key deserializes to `None` because of `#[serde(default)]`.
    #[serde(default)]
    pub chat_bridge_tasks: Option<TasksBridgeConfig>,
}

/// Configuration for routing spans to a Mattermost tasks channel via chat bridge.
///
/// `ObservabilityConfig::validate` only inspects the nested settings when
/// `enabled` is `true`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TasksBridgeConfig {
    /// Enable Mattermost task notifications (default: `false`)
    pub enabled: bool,

    /// Mattermost connection details
    pub mattermost: MattermostBridgeSettings,

    /// Target channel configuration
    pub channel: MattermostChannelSettings,
}

/// Defaults to a disabled bridge with default Mattermost and channel settings.
impl Default for TasksBridgeConfig {
    fn default() -> Self {
        let mattermost = MattermostBridgeSettings::default();
        let channel = MattermostChannelSettings::default();

        Self {
            enabled: false,
            mattermost,
            channel,
        }
    }
}

/// Mattermost bot configuration for chat bridge.
///
/// When the bridge is enabled, `ObservabilityConfig::validate` requires a
/// non-blank `base_url`, plus either a non-blank `webhook_secret` (when
/// `use_agent_bridge` is set) or a non-blank `access_token` (otherwise).
///
/// NOTE(review): the derived `Debug`/`Serialize` output includes `access_token`
/// and `webhook_secret` verbatim.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MattermostBridgeSettings {
    /// Mattermost server base URL (e.g., <https://mattermost.example.com>)
    pub base_url: String,

    /// Personal access token for the bot user
    pub access_token: String,

    /// Prefer the Mattermost AI Agent bridge instead of the REST bot flow
    /// (defaults to `false` when omitted)
    #[serde(default)]
    pub use_agent_bridge: bool,

    /// Optional Mattermost plugin identifier hosting the bridge endpoint
    #[serde(default)]
    pub plugin_id: Option<String>,

    /// Optional override for the bridge URL (defaults to the plugin bridge endpoint)
    #[serde(default)]
    pub bridge_url: Option<String>,

    /// Shared secret used when calling the bridge endpoint
    #[serde(default)]
    pub webhook_secret: Option<String>,

    /// Optional agent identifier scoped to this bridge
    #[serde(default)]
    pub agent_id: Option<String>,

    /// Optional adapter label used to identify this bridge instance.
    /// Note: an omitted key deserializes to `None`, whereas `Default::default()`
    /// uses `Some("mattermost-tasks")`.
    #[serde(default)]
    pub label: Option<String>,

    /// Verify TLS certificates when connecting to Mattermost
    /// (defaults to `true` when omitted, via `default_true`)
    #[serde(default = "default_true")]
    pub verify_tls: bool,
}

/// Defaults target a local Mattermost instance using the REST bot flow, with no
/// credentials configured and TLS verification switched on.
impl Default for MattermostBridgeSettings {
    fn default() -> Self {
        Self {
            base_url: String::from("http://localhost:8065"),
            // Credentials are intentionally blank; validation rejects them once
            // the bridge is enabled.
            access_token: String::new(),
            use_agent_bridge: false,
            plugin_id: None,
            bridge_url: None,
            webhook_secret: None,
            agent_id: None,
            label: Some(String::from("mattermost-tasks")),
            verify_tls: true,
        }
    }
}

/// Target Mattermost channel for task notifications.
///
/// When the bridge is enabled, `ObservabilityConfig::validate` requires both
/// `team_id` and `channel_id` to be non-blank.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MattermostChannelSettings {
    /// Team ID that owns the channel
    pub team_id: String,

    /// Channel ID that should receive notifications
    pub channel_id: String,

    /// Optional channel display name
    #[serde(default)]
    pub channel_name: Option<String>,

    /// Optional prefix added ahead of the trace header message.
    /// Note: an omitted key deserializes to `None`, whereas `Default::default()`
    /// uses `Some("#tasks")`.
    #[serde(default)]
    pub thread_prefix: Option<String>,
}

/// Defaults leave the team and channel identifiers blank and use `#tasks` as the
/// thread prefix.
impl Default for MattermostChannelSettings {
    fn default() -> Self {
        let thread_prefix = Some(String::from("#tasks"));

        Self {
            team_id: String::new(),
            channel_id: String::new(),
            channel_name: None,
            thread_prefix,
        }
    }
}

/// Performance monitoring thresholds for alerting.
///
/// See `PerformanceThresholds::validate` for the accepted ranges.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceThresholds {
    /// Maximum allowed latency in milliseconds (default: 30000; must be non-zero)
    pub max_latency_ms: u64,

    /// Maximum token cost per operation in USD (default: 1.0; must not be negative)
    pub max_cost_usd: f64,

    /// Maximum memory usage in MB (default: 1024; must be non-zero)
    pub max_memory_mb: u64,

    /// CPU usage threshold percentage (default: 80.0; must be greater than 0 and at most 100)
    pub cpu_threshold_percent: f32,
}

/// Defaults point at a collector on localhost (OTLP on 4317, OTLP/HTTP on 4318)
/// with a 512 MiB memory limit and 5 s / 512-item batching.
impl Default for CollectorConfig {
    fn default() -> Self {
        let clickhouse = ClickHouseConfig::default();
        let phoenix = PhoenixConfig::default();
        let hyperdx = HyperDxConfig::default();

        Self {
            otlp_endpoint: String::from("http://localhost:4317"),
            otlp_http_endpoint: String::from("http://localhost:4318"),
            clickhouse,
            phoenix,
            hyperdx,
            memory_limit_mib: 512,
            batch_timeout_ms: 5000,
            batch_send_size: 512,
        }
    }
}

/// Defaults target a local ClickHouse HTTP endpoint, the `otel` database and the
/// `default` user with an empty password, compression on and a 30-day TTL.
impl Default for ClickHouseConfig {
    fn default() -> Self {
        Self {
            url: String::from("http://localhost:8123"),
            database: String::from("otel"),
            username: String::from("default"),
            password: String::new(),
            compression: true,
            ttl_days: 30,
        }
    }
}

/// Defaults enable Phoenix and use `http://localhost:6006` for both ingestion and
/// the web UI.
impl Default for PhoenixConfig {
    fn default() -> Self {
        // Ingestion and UI share the same local address by default.
        let local_phoenix = "http://localhost:6006";

        Self {
            otlp_endpoint: String::from(local_phoenix),
            web_endpoint: String::from(local_phoenix),
            enabled: true,
        }
    }
}

/// Defaults enable HyperDX with the web UI on `http://localhost:8080` and the API
/// under `/api` on the same host.
impl Default for HyperDxConfig {
    fn default() -> Self {
        Self {
            web_endpoint: String::from("http://localhost:8080"),
            api_endpoint: String::from("http://localhost:8080/api"),
            enabled: true,
        }
    }
}

/// Defaults: 30 s latency, $1 per operation, 1 GB memory, 80% CPU.
impl Default for PerformanceThresholds {
    fn default() -> Self {
        // 30 seconds
        let max_latency_ms = 30000;
        // $1 per operation
        let max_cost_usd = 1.0;
        // 1GB
        let max_memory_mb = 1024;
        let cpu_threshold_percent = 80.0;

        Self {
            max_latency_ms,
            max_cost_usd,
            max_memory_mb,
            cpu_threshold_percent,
        }
    }
}

/// Defaults describe a disabled `smith` service in the `development` environment
/// with every telemetry type switched on (taking effect once `enabled` is set)
/// and no chat bridge.
impl Default for ObservabilityConfig {
    fn default() -> Self {
        let sampling = SamplingStrategy::default();
        let collector = CollectorConfig::default();
        let performance_thresholds = PerformanceThresholds::default();

        Self {
            // Safe default - must be explicitly enabled
            enabled: false,
            redaction_level: RedactionLevel::Permissive,
            service_name: String::from("smith"),
            service_version: String::from("0.1.1"),
            deployment_environment: String::from("development"),
            sampling,
            resource_attributes: HashMap::new(),
            collector,
            traces_enabled: true,
            metrics_enabled: true,
            logs_enabled: true,
            nats_propagation_enabled: true,
            session_timeout_minutes: 60,
            cost_tracking_enabled: true,
            performance_thresholds,
            chat_bridge_tasks: None,
        }
    }
}

impl ObservabilityConfig {
    /// Validate observability configuration.
    ///
    /// Checks are performed in this order and the first failure is returned:
    /// service identity fields, session timeout, sampling ratios, collector
    /// configuration, performance thresholds and, when a chat bridge is
    /// configured *and* enabled, the Mattermost connection and channel settings.
    ///
    /// # Errors
    ///
    /// Returns an error when:
    /// - `service_name`, `service_version` or `deployment_environment` is empty;
    /// - `session_timeout_minutes` is `0` or greater than `1440` (24 hours);
    /// - a sampling ratio is outside `0.0..=1.0` or is NaN;
    /// - the collector or performance-threshold validation fails;
    /// - the chat bridge is enabled but `base_url`, the relevant credential
    ///   (`webhook_secret` with `use_agent_bridge`, otherwise `access_token`),
    ///   `team_id` or `channel_id` is blank.
    pub fn validate(&self) -> Result<()> {
        if self.service_name.is_empty() {
            return Err(anyhow::anyhow!("Service name cannot be empty"));
        }

        if self.service_version.is_empty() {
            return Err(anyhow::anyhow!("Service version cannot be empty"));
        }

        if self.deployment_environment.is_empty() {
            return Err(anyhow::anyhow!("Deployment environment cannot be empty"));
        }

        if self.session_timeout_minutes == 0 {
            return Err(anyhow::anyhow!("Session timeout cannot be zero"));
        }

        if self.session_timeout_minutes > 1440 {
            return Err(anyhow::anyhow!("Session timeout cannot exceed 24 hours"));
        }

        // Validate sampling strategy. The range check is written with
        // `RangeInclusive::contains` so that NaN - for which both `< 0.0` and
        // `> 1.0` are false - is rejected instead of slipping through. The match
        // is exhaustive on purpose: adding a variant must force a decision here.
        match &self.sampling {
            SamplingStrategy::AlwaysOn | SamplingStrategy::AlwaysOff => {}
            SamplingStrategy::Ratio(ratio) => {
                if !(0.0..=1.0).contains(ratio) {
                    return Err(anyhow::anyhow!(
                        "Sampling ratio must be between 0.0 and 1.0"
                    ));
                }
            }
            SamplingStrategy::ParentBased { fallback_ratio } => {
                if !(0.0..=1.0).contains(fallback_ratio) {
                    return Err(anyhow::anyhow!(
                        "Fallback sampling ratio must be between 0.0 and 1.0"
                    ));
                }
            }
        }

        // Validate collector configuration
        self.collector
            .validate()
            .context("Collector configuration validation failed")?;

        // Validate performance thresholds
        self.performance_thresholds
            .validate()
            .context("Performance thresholds validation failed")?;

        // The chat bridge is only checked when present and switched on, so a
        // disabled bridge may carry placeholder values.
        if let Some(tasks) = &self.chat_bridge_tasks {
            if tasks.enabled {
                if tasks.mattermost.base_url.trim().is_empty() {
                    return Err(anyhow::anyhow!(
                        "Mattermost base_url must be set when chat bridge tasks are enabled"
                    ));
                }
                if tasks.mattermost.use_agent_bridge {
                    // The agent bridge authenticates with the shared secret; a
                    // missing or blank secret is treated the same way.
                    let secret_empty = tasks
                        .mattermost
                        .webhook_secret
                        .as_ref()
                        .map(|secret| secret.trim().is_empty())
                        .unwrap_or(true);
                    if secret_empty {
                        return Err(anyhow::anyhow!(
                            "Mattermost webhook_secret must be set when use_agent_bridge is enabled"
                        ));
                    }
                } else if tasks.mattermost.access_token.trim().is_empty() {
                    // The REST bot flow needs the personal access token instead.
                    return Err(anyhow::anyhow!(
                        "Mattermost access_token must be set when chat bridge tasks are enabled"
                    ));
                }
                if tasks.channel.team_id.trim().is_empty() {
                    return Err(anyhow::anyhow!(
                        "Mattermost team_id must be set for chat bridge tasks"
                    ));
                }
                if tasks.channel.channel_id.trim().is_empty() {
                    return Err(anyhow::anyhow!(
                        "Mattermost channel_id must be set for chat bridge tasks"
                    ));
                }
            }
        }

        Ok(())
    }

    /// Get development environment configuration.
    ///
    /// Starts from [`Default`] and samples every trace, with a smaller collector
    /// memory limit (256 MiB) and a shorter batch timeout (1000 ms).
    /// Observability remains disabled until `enabled` is set explicitly.
    pub fn development() -> Self {
        Self {
            enabled: false, // Start disabled even in dev for safety
            deployment_environment: "development".to_string(),
            sampling: SamplingStrategy::AlwaysOn, // Full sampling in dev
            collector: CollectorConfig {
                memory_limit_mib: 256,  // Lower memory for dev
                batch_timeout_ms: 1000, // Faster batching in dev
                ..CollectorConfig::default()
            },
            ..Self::default()
        }
    }

    /// Get production environment configuration.
    ///
    /// Starts from [`Default`] and switches to strict redaction, parent-based
    /// sampling with a `0.1` fallback ratio, a 1024 MiB collector memory limit
    /// and a 10 s latency threshold. Observability remains disabled until
    /// `enabled` is set explicitly.
    pub fn production() -> Self {
        Self {
            enabled: false, // Must be explicitly enabled
            deployment_environment: "production".to_string(),
            redaction_level: RedactionLevel::Strict, // Maximum protection
            sampling: SamplingStrategy::ParentBased {
                fallback_ratio: 0.1,
            },
            collector: CollectorConfig {
                memory_limit_mib: 1024, // Higher memory for prod
                ..CollectorConfig::default()
            },
            performance_thresholds: PerformanceThresholds {
                max_latency_ms: 10000, // Stricter in prod
                ..PerformanceThresholds::default()
            },
            ..Self::default()
        }
    }

    /// Get testing environment configuration.
    ///
    /// Starts from [`Default`] with sampling off and traces, metrics and logs
    /// all disabled.
    pub fn testing() -> Self {
        Self {
            enabled: false, // Disabled during tests by default
            deployment_environment: "testing".to_string(),
            sampling: SamplingStrategy::AlwaysOff, // No sampling during tests
            traces_enabled: false,
            metrics_enabled: false,
            logs_enabled: false,
            ..Self::default()
        }
    }
}

impl CollectorConfig {
    /// Validate the collector settings and the nested ClickHouse configuration.
    ///
    /// # Errors
    ///
    /// Returns the first failing check: an empty OTLP or OTLP/HTTP endpoint, a
    /// memory limit of zero or below 64 MiB, a zero batch timeout, a zero batch
    /// send size, or an invalid ClickHouse configuration.
    pub fn validate(&self) -> Result<()> {
        anyhow::ensure!(
            !self.otlp_endpoint.is_empty(),
            "OTLP endpoint cannot be empty"
        );
        anyhow::ensure!(
            !self.otlp_http_endpoint.is_empty(),
            "OTLP HTTP endpoint cannot be empty"
        );

        // Zero is reported separately from "too low" so the message stays specific.
        anyhow::ensure!(self.memory_limit_mib != 0, "Memory limit cannot be zero");
        anyhow::ensure!(
            self.memory_limit_mib >= 64,
            "Memory limit too low, minimum 64 MiB required"
        );

        anyhow::ensure!(self.batch_timeout_ms != 0, "Batch timeout cannot be zero");
        anyhow::ensure!(self.batch_send_size != 0, "Batch send size cannot be zero");

        let clickhouse_check = self.clickhouse.validate();
        clickhouse_check.context("ClickHouse configuration validation failed")?;

        Ok(())
    }
}

impl ClickHouseConfig {
    /// Validate the ClickHouse connection settings.
    ///
    /// A TTL above 365 days is accepted but logged as a warning. The password is
    /// not checked.
    ///
    /// # Errors
    ///
    /// Returns the first failing check: an empty URL, database or username, or a
    /// TTL of zero days.
    pub fn validate(&self) -> Result<()> {
        anyhow::ensure!(!self.url.is_empty(), "ClickHouse URL cannot be empty");
        anyhow::ensure!(
            !self.database.is_empty(),
            "ClickHouse database cannot be empty"
        );
        anyhow::ensure!(
            !self.username.is_empty(),
            "ClickHouse username cannot be empty"
        );

        match self.ttl_days {
            0 => return Err(anyhow::anyhow!("TTL cannot be zero")),
            days if days > 365 => {
                // Long retention is allowed; just make the storage cost visible.
                tracing::warn!("TTL > 1 year may consume significant storage space");
            }
            _ => {}
        }

        Ok(())
    }
}

impl PerformanceThresholds {
    /// Validate the alerting thresholds.
    ///
    /// # Errors
    ///
    /// Returns the first failing check:
    /// - `max_latency_ms` is zero;
    /// - `max_cost_usd` is NaN or negative;
    /// - `max_memory_mb` is zero;
    /// - `cpu_threshold_percent` is NaN, not greater than 0, or greater than 100.
    pub fn validate(&self) -> Result<()> {
        if self.max_latency_ms == 0 {
            return Err(anyhow::anyhow!("Maximum latency cannot be zero"));
        }

        // NaN compares false against everything, so `< 0.0` alone would let it
        // through; reject it explicitly with its own message.
        if self.max_cost_usd.is_nan() {
            return Err(anyhow::anyhow!("Maximum cost must be a valid number"));
        }

        if self.max_cost_usd < 0.0 {
            return Err(anyhow::anyhow!("Maximum cost cannot be negative"));
        }

        if self.max_memory_mb == 0 {
            return Err(anyhow::anyhow!("Maximum memory cannot be zero"));
        }

        // Expressed as "must be inside (0, 100]" so that NaN fails the check
        // instead of passing both of the original out-of-range comparisons.
        let cpu_in_range =
            self.cpu_threshold_percent > 0.0 && self.cpu_threshold_percent <= 100.0;
        if !cpu_in_range {
            return Err(anyhow::anyhow!("CPU threshold must be between 0 and 100"));
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The default configuration is disabled, permissive and valid.
    #[test]
    fn test_default_observability_config() {
        let defaults = ObservabilityConfig::default();

        // Should be disabled by default for safety
        assert!(!defaults.enabled);
        assert_eq!(defaults.redaction_level, RedactionLevel::Permissive);
        assert!(defaults.validate().is_ok());
    }

    /// Every environment preset validates and carries its distinguishing settings.
    #[test]
    fn test_environment_configs() {
        let dev = ObservabilityConfig::development();
        let prod = ObservabilityConfig::production();
        let testing = ObservabilityConfig::testing();

        assert!(dev.validate().is_ok());
        assert!(prod.validate().is_ok());
        assert!(testing.validate().is_ok());

        // Development should allow full sampling
        assert_eq!(dev.sampling, SamplingStrategy::AlwaysOn);

        // Production should be strict on redaction
        assert_eq!(prod.redaction_level, RedactionLevel::Strict);

        // Testing should disable telemetry
        assert!(!testing.traces_enabled);
        assert!(!testing.metrics_enabled);
        assert!(!testing.logs_enabled);
    }

    /// Redaction levels are distinct and default to permissive.
    #[test]
    fn test_redaction_levels() {
        assert_ne!(RedactionLevel::Strict, RedactionLevel::Balanced);
        assert_ne!(RedactionLevel::Balanced, RedactionLevel::Permissive);
        assert_eq!(RedactionLevel::default(), RedactionLevel::Permissive);
    }

    /// Fixed-ratio sampling accepts 0.5 and rejects values outside 0.0..=1.0.
    #[test]
    fn test_sampling_validation() {
        let with_ratio = |ratio: f64| ObservabilityConfig {
            sampling: SamplingStrategy::Ratio(ratio),
            ..ObservabilityConfig::default()
        };

        assert!(with_ratio(0.5).validate().is_ok());

        // One value below the range, one above it.
        for &out_of_range in &[-0.1, 1.1] {
            assert!(with_ratio(out_of_range).validate().is_err());
        }
    }

    /// Threshold validation rejects out-of-range CPU percentages and negative cost.
    #[test]
    fn test_performance_thresholds_validation() {
        // Valid thresholds
        assert!(PerformanceThresholds::default().validate().is_ok());

        // Invalid CPU threshold
        for &bad_cpu in &[150.0_f32, -10.0] {
            let thresholds = PerformanceThresholds {
                cpu_threshold_percent: bad_cpu,
                ..PerformanceThresholds::default()
            };
            assert!(thresholds.validate().is_err());
        }

        // Invalid cost
        let negative_cost = PerformanceThresholds {
            cpu_threshold_percent: 80.0,
            max_cost_usd: -1.0,
            ..PerformanceThresholds::default()
        };
        assert!(negative_cost.validate().is_err());
    }
}