kindly_guard_server/telemetry/
mod.rs

1// Copyright 2025 Kindly Software Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
//! Telemetry module for `KindlyGuard`.
//!
//! Provides observability through a trait-based architecture; providers may be backed by
//! OpenTelemetry or by a custom implementation.
16
17use anyhow::Result;
18use async_trait::async_trait;
19use serde::{Deserialize, Serialize};
20use std::sync::Arc;
21
/// Telemetry configuration.
///
/// Plain data holder with public fields; it derives `Serialize`/`Deserialize` so it can be
/// read from and written to configuration formats supported by serde. None of the fields
/// are validated by this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TelemetryConfig {
    /// Enable telemetry collection.
    pub enabled: bool,

    /// Service name reported with telemetry data.
    pub service_name: String,

    /// Service version reported with telemetry data.
    pub service_version: String,

    /// Export endpoint (e.g., an OTLP endpoint); `None` when no endpoint is configured.
    pub export_endpoint: Option<String>,

    /// Export interval, in seconds.
    pub export_interval_seconds: u64,

    /// Enable tracing.
    pub tracing_enabled: bool,

    /// Enable metrics.
    pub metrics_enabled: bool,

    /// Sampling rate, intended to lie between 0.0 and 1.0.
    /// The range is not enforced by this struct.
    pub sampling_rate: f64,
}
49
50impl Default for TelemetryConfig {
51    fn default() -> Self {
52        Self {
53            enabled: false,
54            service_name: "kindly-guard".to_string(),
55            service_version: env!("CARGO_PKG_VERSION").to_string(),
56            export_endpoint: None,
57            export_interval_seconds: 60,
58            tracing_enabled: true,
59            metrics_enabled: true,
60            sampling_rate: 0.1,
61        }
62    }
63}
64
/// Telemetry span for tracing.
///
/// A span is handed out by `TelemetryProvider::start_span` and later passed back to
/// `TelemetryProvider::end_span`.
#[derive(Debug, Clone)]
pub struct TelemetrySpan {
    /// Name of the span.
    pub name: String,
    /// Monotonic clock reading associated with the start of the span.
    pub start_time: std::time::Instant,
    /// Key/value attributes attached to the span.
    pub attributes: Vec<(String, String)>,
}
72
/// Metric types.
///
/// Each variant carries the single sample being recorded. `PartialEq` is derived so that
/// recorded values can be compared directly (for example in test assertions); the usual
/// floating-point caveat applies, i.e. a `NaN` payload never compares equal to itself.
#[derive(Debug, Clone, PartialEq)]
pub enum MetricValue {
    /// Counter sample; this module records increments as `Counter(1)`.
    Counter(u64),
    /// Gauge sample, a point-in-time floating-point value.
    Gauge(f64),
    /// Histogram sample; this module uses it for durations and batch sizes.
    Histogram(f64),
}
80
/// Telemetry metric.
///
/// One named sample together with its labels, as passed to
/// `TelemetryProvider::record_metric`.
#[derive(Debug, Clone)]
pub struct TelemetryMetric {
    /// Metric name (this module uses dotted names such as `kindly_guard.rate_limit.decisions`).
    pub name: String,
    /// The sample being recorded.
    pub value: MetricValue,
    /// Key/value labels attached to the sample.
    pub labels: Vec<(String, String)>,
}
88
/// Telemetry provider trait - implementations can use OpenTelemetry or custom solutions.
///
/// The `Send + Sync` supertraits allow a provider to be shared between threads; the rest of
/// this module passes providers around as `Arc<dyn TelemetryProvider>`.
#[async_trait]
pub trait TelemetryProvider: Send + Sync {
    /// Start a new span called `name` and return its handle.
    fn start_span(&self, name: &str) -> TelemetrySpan;

    /// End a span, consuming its handle.
    fn end_span(&self, span: TelemetrySpan);

    /// Record a metric sample.
    fn record_metric(&self, metric: TelemetryMetric);

    /// Add a named event with key/value `attributes` to the current span.
    ///
    /// No span is passed in, so what counts as the "current" span is left to the
    /// implementation.
    fn add_event(&self, name: &str, attributes: Vec<(&str, &str)>);

    /// Set the status of `span`: `is_error` flags a failure and `message` optionally
    /// describes it.
    fn set_status(&self, span: &TelemetrySpan, is_error: bool, message: Option<&str>);

    /// Flush telemetry data.
    ///
    /// # Errors
    /// Returns an error if the implementation fails to flush.
    async fn flush(&self) -> Result<()>;

    /// Shutdown telemetry.
    ///
    /// # Errors
    /// Returns an error if the implementation fails to shut down.
    async fn shutdown(&self) -> Result<()>;
}
113
/// Context for telemetry operations.
///
/// Identifies a position within a trace. Root contexts come from `TelemetryContext::new`,
/// nested ones from `TelemetryContext::child`.
#[derive(Debug, Clone)]
pub struct TelemetryContext {
    /// Trace identifier; `child` copies it unchanged, so it is shared by a whole trace.
    pub trace_id: String,
    /// Identifier of this context's span (a freshly generated UUID v4 string).
    pub span_id: String,
    /// `span_id` of the context this one was derived from; `None` for a root context.
    pub parent_span_id: Option<String>,
    /// Key/value pairs carried along the trace; `child` copies them to the new context.
    pub baggage: Vec<(String, String)>,
}
122
123impl Default for TelemetryContext {
124    fn default() -> Self {
125        Self::new()
126    }
127}
128
129impl TelemetryContext {
130    /// Create a new root context
131    pub fn new() -> Self {
132        Self {
133            trace_id: uuid::Uuid::new_v4().to_string(),
134            span_id: uuid::Uuid::new_v4().to_string(),
135            parent_span_id: None,
136            baggage: vec![],
137        }
138    }
139
140    /// Create a child context
141    pub fn child(&self) -> Self {
142        Self {
143            trace_id: self.trace_id.clone(),
144            span_id: uuid::Uuid::new_v4().to_string(),
145            parent_span_id: Some(self.span_id.clone()),
146            baggage: self.baggage.clone(),
147        }
148    }
149}
150
/// Factory for creating telemetry providers.
///
/// `Send + Sync` so that a factory itself can be shared between threads.
pub trait TelemetryProviderFactory: Send + Sync {
    /// Create a telemetry provider based on configuration.
    ///
    /// Returns the provider behind a shared `Arc<dyn TelemetryProvider>`.
    ///
    /// # Errors
    /// Returns an error if the implementation cannot build a provider from `config`.
    fn create(&self, config: &TelemetryConfig) -> Result<Arc<dyn TelemetryProvider>>;
}
156
/// Security-aware telemetry helper.
///
/// Thin wrapper around a shared [`TelemetryProvider`] whose methods emit the security,
/// performance, rate-limit and neutralization events and metrics of this crate.
pub struct SecureTelemetry {
    /// Provider that receives every event and metric emitted by this helper.
    provider: Arc<dyn TelemetryProvider>,
}
161
162impl SecureTelemetry {
163    pub fn new(provider: Arc<dyn TelemetryProvider>) -> Self {
164        Self { provider }
165    }
166
167    /// Record a security event with sanitized data
168    pub fn record_security_event(&self, event_type: &str, client_id: &str, threat_level: &str) {
169        // Sanitize client_id to prevent information leakage
170        let sanitized_client = if client_id.len() > 8 {
171            format!("{}...", &client_id[..8])
172        } else {
173            "anonymous".to_string()
174        };
175
176        self.provider.add_event(
177            "security.event",
178            vec![
179                ("event.type", event_type),
180                ("client.id", &sanitized_client),
181                ("threat.level", threat_level),
182            ],
183        );
184    }
185
186    /// Record performance metrics
187    pub fn record_performance(&self, operation: &str, duration_ms: f64) {
188        self.provider.record_metric(TelemetryMetric {
189            name: format!("kindly_guard.{operation}.duration"),
190            value: MetricValue::Histogram(duration_ms),
191            labels: vec![("operation".to_string(), operation.to_string())],
192        });
193    }
194
195    /// Record rate limit metrics
196    pub fn record_rate_limit(&self, client_id: &str, allowed: bool) {
197        let sanitized_client = if client_id.len() > 8 {
198            format!("{}...", &client_id[..8])
199        } else {
200            "anonymous".to_string()
201        };
202
203        self.provider.record_metric(TelemetryMetric {
204            name: "kindly_guard.rate_limit.decisions".to_string(),
205            value: MetricValue::Counter(1),
206            labels: vec![
207                ("client.id".to_string(), sanitized_client),
208                (
209                    "decision".to_string(),
210                    if allowed { "allow" } else { "deny" }.to_string(),
211                ),
212            ],
213        });
214    }
215
216    /// Record neutralization event
217    pub fn record_neutralization(
218        &self,
219        threat_type: &str,
220        action: &str,
221        duration_ms: f64,
222        success: bool,
223    ) {
224        // Record neutralization counter
225        self.provider.record_metric(TelemetryMetric {
226            name: "kindly_guard.neutralization.total".to_string(),
227            value: MetricValue::Counter(1),
228            labels: vec![
229                ("threat.type".to_string(), threat_type.to_string()),
230                ("action".to_string(), action.to_string()),
231                (
232                    "status".to_string(),
233                    if success { "success" } else { "failure" }.to_string(),
234                ),
235            ],
236        });
237
238        // Record neutralization duration
239        if success {
240            self.provider.record_metric(TelemetryMetric {
241                name: "kindly_guard.neutralization.duration_ms".to_string(),
242                value: MetricValue::Histogram(duration_ms),
243                labels: vec![
244                    ("threat.type".to_string(), threat_type.to_string()),
245                    ("action".to_string(), action.to_string()),
246                ],
247            });
248        }
249
250        // Add event to current span
251        self.provider.add_event(
252            "neutralization",
253            vec![
254                ("threat.type", threat_type),
255                ("action", action),
256                ("duration_ms", &duration_ms.to_string()),
257                ("success", if success { "true" } else { "false" }),
258            ],
259        );
260    }
261
262    /// Record neutralization batch metrics
263    pub fn record_neutralization_batch(
264        &self,
265        total_threats: usize,
266        neutralized: usize,
267        duration_ms: f64,
268    ) {
269        self.provider.record_metric(TelemetryMetric {
270            name: "kindly_guard.neutralization.batch.size".to_string(),
271            value: MetricValue::Histogram(total_threats as f64),
272            labels: vec![],
273        });
274
275        self.provider.record_metric(TelemetryMetric {
276            name: "kindly_guard.neutralization.batch.success_rate".to_string(),
277            value: MetricValue::Gauge(if total_threats > 0 {
278                (neutralized as f64 / total_threats as f64) * 100.0
279            } else {
280                0.0
281            }),
282            labels: vec![],
283        });
284
285        self.provider.record_metric(TelemetryMetric {
286            name: "kindly_guard.neutralization.batch.duration_ms".to_string(),
287            value: MetricValue::Histogram(duration_ms),
288            labels: vec![],
289        });
290    }
291}
292
293// Re-export implementations
294pub mod distributed;
295#[cfg(feature = "enhanced")]
296pub mod enhanced;
297pub mod metrics;
298pub mod standard;
299
300pub use distributed::{
301    ContextPropagator, DistributedSpan, DistributedTracingProvider, ProbabilitySampler,
302    SpanBuilder, SpanKind, SpanStatus, StatusCode, TracingSampler, W3CTraceContextPropagator,
303};
304pub use metrics::{CommandMetrics, MetricsCollector, MetricsSnapshot};
305pub use standard::StandardTelemetryProvider;
306
307/// Create a telemetry provider based on configuration
308pub fn create_telemetry_provider(_config: &crate::config::Config) -> Arc<dyn TelemetryProvider> {
309    // For now, use default telemetry config
310    // TODO: Add telemetry config to main Config struct
311    let telemetry_config = TelemetryConfig::default();
312
313    if telemetry_config.enabled {
314        Arc::new(StandardTelemetryProvider::new(telemetry_config))
315    } else {
316        // Return a no-op provider when disabled
317        Arc::new(NoOpTelemetryProvider)
318    }
319}
320
321/// No-op telemetry provider for when telemetry is disabled
322struct NoOpTelemetryProvider;
323
324#[async_trait]
325impl TelemetryProvider for NoOpTelemetryProvider {
326    fn start_span(&self, _name: &str) -> TelemetrySpan {
327        TelemetrySpan {
328            name: String::new(),
329            start_time: std::time::Instant::now(),
330            attributes: vec![],
331        }
332    }
333
334    fn end_span(&self, _span: TelemetrySpan) {}
335
336    fn record_metric(&self, _metric: TelemetryMetric) {}
337
338    fn add_event(&self, _name: &str, _attributes: Vec<(&str, &str)>) {}
339
340    fn set_status(&self, _span: &TelemetrySpan, _is_error: bool, _message: Option<&str>) {}
341
342    async fn flush(&self) -> Result<()> {
343        Ok(())
344    }
345
346    async fn shutdown(&self) -> Result<()> {
347        Ok(())
348    }
349}