reasonkit-core 0.1.8

The Reasoning Engine — Auditable Reasoning for Production AI | Rust-Native | Turn Prompts into Protocols
//! Privacy Filter for Telemetry
//!
//! Implements PII stripping, differential privacy, and redaction.

use crate::telemetry::{
    FeedbackEvent, PrivacyConfig, QueryEvent, TelemetryError, TelemetryResult, TraceEvent,
};
// use once_cell::sync::Lazy;
use regex::Regex;
use sha2::{Digest, Sha256};
use std::collections::HashSet;

/// Privacy filter for sanitizing telemetry events
pub struct PrivacyFilter {
    /// Configuration
    config: PrivacyConfig,
    /// PII detection patterns
    pii_patterns: Vec<PiiPattern>,
    /// Sensitive keywords to redact
    sensitive_keywords: HashSet<String>,
}

/// PII detection pattern
struct PiiPattern {
    /// Pattern name (useful for debugging/logging)
    #[allow(dead_code)]
    name: &'static str,
    /// Regex pattern
    regex: Regex,
    /// Replacement string
    replacement: &'static str,
}

impl PrivacyFilter {
    /// Create a new privacy filter
    pub fn new(config: PrivacyConfig) -> Self {
        let pii_patterns = Self::build_pii_patterns();
        let sensitive_keywords = Self::build_sensitive_keywords();

        Self {
            config,
            pii_patterns,
            sensitive_keywords,
        }
    }

    /// Build PII detection patterns
    /// PERFORMANCE: Patterns are pre-compiled as static Lazy<Regex> for optimal performance
    fn build_pii_patterns() -> Vec<PiiPattern> {
        use once_cell::sync::Lazy;

        // Pre-compiled static regex patterns (compiled once at program start)
        static EMAIL_RE: Lazy<Regex> =
            Lazy::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
        static PHONE_RE: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r"(\+?1?[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}").unwrap()
        });
        static SSN_RE: Lazy<Regex> =
            Lazy::new(|| Regex::new(r"\b\d{3}[-.]?\d{2}[-.]?\d{4}\b").unwrap());
        static CARD_RE: Lazy<Regex> =
            Lazy::new(|| Regex::new(r"\b(?:\d{4}[-\s]?){3}\d{4}\b").unwrap());
        static IP_RE: Lazy<Regex> =
            Lazy::new(|| Regex::new(r"\b(?:\d{1,3}\.){3}\d{1,3}\b").unwrap());
        static API_KEY_RE: Lazy<Regex> = Lazy::new(|| {
            Regex::new(r#"(?i)(api[_-]?key|apikey|secret[_-]?key|auth[_-]?token|bearer)\s*[:=]\s*['"]?[\w-]{20,}['"]?"#).unwrap()
        });
        static AWS_KEY_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)AKIA[0-9A-Z]{16}").unwrap());
        static AUTH_URL_RE: Lazy<Regex> =
            Lazy::new(|| Regex::new(r#"https?://[^:]+:[^@]+@[^\s]+"#).unwrap());
        static USER_PATH_RE: Lazy<Regex> =
            Lazy::new(|| Regex::new(r"(?i)(/home/|/users/|C:\\Users\\)[a-zA-Z0-9._-]+").unwrap());

        vec![
            // Email addresses
            PiiPattern {
                name: "email",
                regex: EMAIL_RE.clone(),
                replacement: "[EMAIL]",
            },
            // Phone numbers (various formats)
            PiiPattern {
                name: "phone",
                regex: PHONE_RE.clone(),
                replacement: "[PHONE]",
            },
            // SSN
            PiiPattern {
                name: "ssn",
                regex: SSN_RE.clone(),
                replacement: "[SSN]",
            },
            // Credit card numbers
            PiiPattern {
                name: "credit_card",
                regex: CARD_RE.clone(),
                replacement: "[CARD]",
            },
            // IP addresses
            PiiPattern {
                name: "ip_address",
                regex: IP_RE.clone(),
                replacement: "[IP]",
            },
            // API keys (common patterns)
            PiiPattern {
                name: "api_key",
                regex: API_KEY_RE.clone(),
                replacement: "[API_KEY]",
            },
            // AWS access keys
            PiiPattern {
                name: "aws_key",
                regex: AWS_KEY_RE.clone(),
                replacement: "[AWS_KEY]",
            },
            // URLs with auth
            PiiPattern {
                name: "auth_url",
                regex: AUTH_URL_RE.clone(),
                replacement: "[AUTH_URL]",
            },
            // File paths with usernames
            PiiPattern {
                name: "user_path",
                regex: USER_PATH_RE.clone(),
                replacement: "[USER_PATH]",
            },
        ]
    }

    /// Build sensitive keywords set
    fn build_sensitive_keywords() -> HashSet<String> {
        [
            "password",
            "passwd",
            "secret",
            "token",
            "credential",
            "private",
            "confidential",
            "sensitive",
            "ssn",
            "social",
        ]
        .iter()
        .map(|s| s.to_lowercase())
        .collect()
    }

    /// Strip PII from a string
    pub fn strip_pii(&self, text: &str) -> String {
        if !self.config.strip_pii {
            return text.to_string();
        }

        let mut result = text.to_string();

        for pattern in &self.pii_patterns {
            result = pattern
                .regex
                .replace_all(&result, pattern.replacement)
                .to_string();
        }

        result
    }

    /// Hash a query for storage (never store raw queries)
    pub fn hash_query(&self, query: &str) -> String {
        // Normalize: lowercase, remove extra whitespace
        let normalized = query
            .to_lowercase()
            .split_whitespace()
            .collect::<Vec<_>>()
            .join(" ");

        let mut hasher = Sha256::new();
        hasher.update(normalized.as_bytes());
        format!("{:x}", hasher.finalize())
    }

    /// Check if text contains sensitive content
    pub fn contains_sensitive(&self, text: &str) -> bool {
        let lower = text.to_lowercase();
        self.sensitive_keywords.iter().any(|kw| lower.contains(kw))
    }

    /// Sanitize a query event
    pub fn sanitize_query_event(&self, mut event: QueryEvent) -> TelemetryResult<QueryEvent> {
        // Check for blocked content
        if self.config.block_sensitive && self.contains_sensitive(&event.query_text) {
            return Err(TelemetryError::PrivacyViolation(
                "Query contains sensitive keywords".to_string(),
            ));
        }

        // Replace query text with hash
        let _query_hash = self.hash_query(&event.query_text);
        event.query_text = "[HASHED]".to_string(); // Never store raw query

        // Sanitize tool names if needed
        event.tools_used = event
            .tools_used
            .into_iter()
            .map(|t| self.strip_pii(&t))
            .collect();

        Ok(event)
    }

    /// Sanitize a feedback event
    pub fn sanitize_feedback_event(&self, event: FeedbackEvent) -> TelemetryResult<FeedbackEvent> {
        // Feedback events don't contain user text, so minimal sanitization needed
        Ok(event)
    }

    /// Sanitize a trace event
    pub fn sanitize_trace_event(&self, mut event: TraceEvent) -> TelemetryResult<TraceEvent> {
        // Sanitize step types
        event.step_types = event
            .step_types
            .into_iter()
            .map(|s| self.strip_pii(&s))
            .collect();

        Ok(event)
    }

    /// Apply differential privacy noise to a count
    pub fn add_dp_noise(&self, count: u64) -> u64 {
        if !self.config.differential_privacy {
            return count;
        }

        // Laplace mechanism with epsilon from config
        let epsilon = self.config.dp_epsilon;
        let sensitivity = 1.0; // Count queries have sensitivity 1
        let scale = sensitivity / epsilon;

        // Simple Laplace noise (using deterministic approximation for reproducibility in tests)
        // In production, use a proper random Laplace distribution
        let noise = scale * 0.5; // Median of Laplace distribution

        (count as f64 + noise).max(0.0).round() as u64
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn test_config() -> PrivacyConfig {
        PrivacyConfig {
            strip_pii: true,
            block_sensitive: true,
            differential_privacy: true,
            dp_epsilon: 1.0,
            redact_file_paths: true,
        }
    }

    #[test]
    fn test_email_stripping() {
        let filter = PrivacyFilter::new(test_config());
        let result = filter.strip_pii("Contact me at user@example.com for details");
        assert_eq!(result, "Contact me at [EMAIL] for details");
    }

    #[test]
    fn test_phone_stripping() {
        let filter = PrivacyFilter::new(test_config());
        let result = filter.strip_pii("Call me at 555-123-4567");
        // Regex may capture preceding whitespace, so check for [PHONE] presence
        assert!(
            result.contains("[PHONE]"),
            "Expected [PHONE] in: {}",
            result
        );
        assert!(!result.contains("555"), "Phone number should be redacted");
    }

    #[test]
    fn test_api_key_stripping() {
        let filter = PrivacyFilter::new(test_config());
        // Use an API key without phone-like sequences to avoid phone regex matching first
        let result = filter.strip_pii("Set api_key=REDACTED_REDACTED_REDACTED");
        assert!(
            result.contains("[API_KEY]"),
            "Expected [API_KEY] in: {}",
            result
        );
    }

    #[test]
    fn test_query_hashing() {
        let filter = PrivacyFilter::new(test_config());

        // Same query (different whitespace) should produce same hash
        let hash1 = filter.hash_query("what is  chain of thought");
        let hash2 = filter.hash_query("what is chain of thought");
        assert_eq!(hash1, hash2);

        // Different queries should produce different hashes
        let hash3 = filter.hash_query("different query");
        assert_ne!(hash1, hash3);
    }

    #[test]
    fn test_sensitive_detection() {
        let filter = PrivacyFilter::new(test_config());

        assert!(filter.contains_sensitive("my password is abc123"));
        assert!(filter.contains_sensitive("This is CONFIDENTIAL"));
        assert!(!filter.contains_sensitive("This is a normal query"));
    }

    #[test]
    fn test_sensitive_blocking() {
        let filter = PrivacyFilter::new(test_config());

        let event = QueryEvent::new(uuid::Uuid::new_v4(), "my password is abc123".to_string());

        let result = filter.sanitize_query_event(event);
        assert!(result.is_err());
    }

    #[test]
    fn test_user_path_stripping() {
        let filter = PrivacyFilter::new(test_config());
        let result = filter.strip_pii("File at /home/johndoe/secrets.txt");
        assert!(result.contains("[USER_PATH]"));
        assert!(!result.contains("johndoe"));
    }
}