reasonkit/telemetry/
privacy.rs

//! Privacy Filter for Telemetry
//!
//! Implements regex-based PII stripping and redaction, sensitive-keyword blocking,
//! query hashing, and a deterministic placeholder for differential-privacy noise.
4
5use crate::telemetry::{
6    FeedbackEvent, PrivacyConfig, QueryEvent, TelemetryError, TelemetryResult, TraceEvent,
7};
8// use once_cell::sync::Lazy;
9use regex::Regex;
10use sha2::{Digest, Sha256};
11use std::collections::HashSet;
12
13/// Privacy filter for sanitizing telemetry events
14pub struct PrivacyFilter {
15    /// Configuration
16    config: PrivacyConfig,
17    /// PII detection patterns
18    pii_patterns: Vec<PiiPattern>,
19    /// Sensitive keywords to redact
20    sensitive_keywords: HashSet<String>,
21}
22
23/// PII detection pattern
24struct PiiPattern {
25    /// Pattern name (useful for debugging/logging)
26    #[allow(dead_code)]
27    name: &'static str,
28    /// Regex pattern
29    regex: Regex,
30    /// Replacement string
31    replacement: &'static str,
32}
33
34impl PrivacyFilter {
35    /// Create a new privacy filter
36    pub fn new(config: PrivacyConfig) -> Self {
37        let pii_patterns = Self::build_pii_patterns();
38        let sensitive_keywords = Self::build_sensitive_keywords();
39
40        Self {
41            config,
42            pii_patterns,
43            sensitive_keywords,
44        }
45    }
46
47    /// Build PII detection patterns
48    /// PERFORMANCE: Patterns are pre-compiled as static Lazy<Regex> for optimal performance
49    fn build_pii_patterns() -> Vec<PiiPattern> {
50        use once_cell::sync::Lazy;
51
52        // Pre-compiled static regex patterns (compiled once at program start)
53        static EMAIL_RE: Lazy<Regex> =
54            Lazy::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
55        static PHONE_RE: Lazy<Regex> = Lazy::new(|| {
56            Regex::new(r"(\+?1?[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}").unwrap()
57        });
58        static SSN_RE: Lazy<Regex> =
59            Lazy::new(|| Regex::new(r"\b\d{3}[-.]?\d{2}[-.]?\d{4}\b").unwrap());
60        static CARD_RE: Lazy<Regex> =
61            Lazy::new(|| Regex::new(r"\b(?:\d{4}[-\s]?){3}\d{4}\b").unwrap());
62        static IP_RE: Lazy<Regex> =
63            Lazy::new(|| Regex::new(r"\b(?:\d{1,3}\.){3}\d{1,3}\b").unwrap());
64        static API_KEY_RE: Lazy<Regex> = Lazy::new(|| {
65            Regex::new(r#"(?i)(api[_-]?key|apikey|secret[_-]?key|auth[_-]?token|bearer)\s*[:=]\s*['"]?[\w-]{20,}['"]?"#).unwrap()
66        });
67        static AWS_KEY_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)AKIA[0-9A-Z]{16}").unwrap());
68        static AUTH_URL_RE: Lazy<Regex> =
69            Lazy::new(|| Regex::new(r#"https?://[^:]+:[^@]+@[^\s]+"#).unwrap());
70        static USER_PATH_RE: Lazy<Regex> =
71            Lazy::new(|| Regex::new(r"(?i)(/home/|/users/|C:\\Users\\)[a-zA-Z0-9._-]+").unwrap());
72
73        vec![
74            // Email addresses
75            PiiPattern {
76                name: "email",
77                regex: EMAIL_RE.clone(),
78                replacement: "[EMAIL]",
79            },
80            // Phone numbers (various formats)
81            PiiPattern {
82                name: "phone",
83                regex: PHONE_RE.clone(),
84                replacement: "[PHONE]",
85            },
86            // SSN
87            PiiPattern {
88                name: "ssn",
89                regex: SSN_RE.clone(),
90                replacement: "[SSN]",
91            },
92            // Credit card numbers
93            PiiPattern {
94                name: "credit_card",
95                regex: CARD_RE.clone(),
96                replacement: "[CARD]",
97            },
98            // IP addresses
99            PiiPattern {
100                name: "ip_address",
101                regex: IP_RE.clone(),
102                replacement: "[IP]",
103            },
104            // API keys (common patterns)
105            PiiPattern {
106                name: "api_key",
107                regex: API_KEY_RE.clone(),
108                replacement: "[API_KEY]",
109            },
110            // AWS access keys
111            PiiPattern {
112                name: "aws_key",
113                regex: AWS_KEY_RE.clone(),
114                replacement: "[AWS_KEY]",
115            },
116            // URLs with auth
117            PiiPattern {
118                name: "auth_url",
119                regex: AUTH_URL_RE.clone(),
120                replacement: "[AUTH_URL]",
121            },
122            // File paths with usernames
123            PiiPattern {
124                name: "user_path",
125                regex: USER_PATH_RE.clone(),
126                replacement: "[USER_PATH]",
127            },
128        ]
129    }
130
131    /// Build sensitive keywords set
132    fn build_sensitive_keywords() -> HashSet<String> {
133        [
134            "password",
135            "passwd",
136            "secret",
137            "token",
138            "credential",
139            "private",
140            "confidential",
141            "sensitive",
142            "ssn",
143            "social",
144        ]
145        .iter()
146        .map(|s| s.to_lowercase())
147        .collect()
148    }
149
150    /// Strip PII from a string
151    pub fn strip_pii(&self, text: &str) -> String {
152        if !self.config.strip_pii {
153            return text.to_string();
154        }
155
156        let mut result = text.to_string();
157
158        for pattern in &self.pii_patterns {
159            result = pattern
160                .regex
161                .replace_all(&result, pattern.replacement)
162                .to_string();
163        }
164
165        result
166    }
167
168    /// Hash a query for storage (never store raw queries)
169    pub fn hash_query(&self, query: &str) -> String {
170        // Normalize: lowercase, remove extra whitespace
171        let normalized = query
172            .to_lowercase()
173            .split_whitespace()
174            .collect::<Vec<_>>()
175            .join(" ");
176
177        let mut hasher = Sha256::new();
178        hasher.update(normalized.as_bytes());
179        format!("{:x}", hasher.finalize())
180    }
181
182    /// Check if text contains sensitive content
183    pub fn contains_sensitive(&self, text: &str) -> bool {
184        let lower = text.to_lowercase();
185        self.sensitive_keywords.iter().any(|kw| lower.contains(kw))
186    }
187
188    /// Sanitize a query event
189    pub fn sanitize_query_event(&self, mut event: QueryEvent) -> TelemetryResult<QueryEvent> {
190        // Check for blocked content
191        if self.config.block_sensitive && self.contains_sensitive(&event.query_text) {
192            return Err(TelemetryError::PrivacyViolation(
193                "Query contains sensitive keywords".to_string(),
194            ));
195        }
196
197        // Replace query text with hash
198        let _query_hash = self.hash_query(&event.query_text);
199        event.query_text = "[HASHED]".to_string(); // Never store raw query
200
201        // Sanitize tool names if needed
202        event.tools_used = event
203            .tools_used
204            .into_iter()
205            .map(|t| self.strip_pii(&t))
206            .collect();
207
208        Ok(event)
209    }
210
211    /// Sanitize a feedback event
212    pub fn sanitize_feedback_event(&self, event: FeedbackEvent) -> TelemetryResult<FeedbackEvent> {
213        // Feedback events don't contain user text, so minimal sanitization needed
214        Ok(event)
215    }
216
217    /// Sanitize a trace event
218    pub fn sanitize_trace_event(&self, mut event: TraceEvent) -> TelemetryResult<TraceEvent> {
219        // Sanitize step types
220        event.step_types = event
221            .step_types
222            .into_iter()
223            .map(|s| self.strip_pii(&s))
224            .collect();
225
226        Ok(event)
227    }
228
229    /// Apply differential privacy noise to a count
230    pub fn add_dp_noise(&self, count: u64) -> u64 {
231        if !self.config.differential_privacy {
232            return count;
233        }
234
235        // Laplace mechanism with epsilon from config
236        let epsilon = self.config.dp_epsilon;
237        let sensitivity = 1.0; // Count queries have sensitivity 1
238        let scale = sensitivity / epsilon;
239
240        // Simple Laplace noise (using deterministic approximation for reproducibility in tests)
241        // In production, use a proper random Laplace distribution
242        let noise = scale * 0.5; // Median of Laplace distribution
243
244        (count as f64 + noise).max(0.0).round() as u64
245    }
246}
247
#[cfg(test)]
mod tests {
    use super::*;

    /// Configuration with every privacy feature switched on
    fn test_config() -> PrivacyConfig {
        PrivacyConfig {
            strip_pii: true,
            block_sensitive: true,
            differential_privacy: true,
            dp_epsilon: 1.0,
            redact_file_paths: true,
        }
    }

    /// Run `strip_pii` on `input` with a filter built from `test_config`
    fn strip(input: &str) -> String {
        PrivacyFilter::new(test_config()).strip_pii(input)
    }

    #[test]
    fn test_email_stripping() {
        assert_eq!(
            strip("Contact me at user@example.com for details"),
            "Contact me at [EMAIL] for details"
        );
    }

    #[test]
    fn test_phone_stripping() {
        let result = strip("Call me at 555-123-4567");
        // The regex may swallow the preceding whitespace, so only check that the
        // placeholder is present and the digits are gone
        assert!(
            result.contains("[PHONE]"),
            "Expected [PHONE] in: {}",
            result
        );
        assert!(!result.contains("555"), "Phone number should be redacted");
    }

    #[test]
    fn test_api_key_stripping() {
        // The key deliberately contains no phone-like digit run
        let result = strip("Set api_key=sk-abcdefghijklmnopqrstuvwxyz");
        assert!(
            result.contains("[API_KEY]"),
            "Expected [API_KEY] in: {}",
            result
        );
    }

    #[test]
    fn test_query_hashing() {
        let filter = PrivacyFilter::new(test_config());

        // Whitespace differences must not change the hash
        let double_spaced = filter.hash_query("what is  chain of thought");
        let single_spaced = filter.hash_query("what is chain of thought");
        assert_eq!(double_spaced, single_spaced);

        // A different query must hash differently
        let other = filter.hash_query("different query");
        assert_ne!(double_spaced, other);
    }

    #[test]
    fn test_sensitive_detection() {
        let filter = PrivacyFilter::new(test_config());

        assert!(filter.contains_sensitive("my password is abc123"));
        assert!(filter.contains_sensitive("This is CONFIDENTIAL"));
        assert!(!filter.contains_sensitive("This is a normal query"));
    }

    #[test]
    fn test_sensitive_blocking() {
        let filter = PrivacyFilter::new(test_config());
        let event = QueryEvent::new(uuid::Uuid::new_v4(), "my password is abc123".to_string());

        assert!(filter.sanitize_query_event(event).is_err());
    }

    #[test]
    fn test_user_path_stripping() {
        let result = strip("File at /home/johndoe/secrets.txt");
        assert!(result.contains("[USER_PATH]"));
        assert!(!result.contains("johndoe"));
    }
}
334}