use crate::telemetry::{
FeedbackEvent, PrivacyConfig, QueryEvent, TelemetryError, TelemetryResult, TraceEvent,
};
use regex::Regex;
use sha2::{Digest, Sha256};
use std::collections::HashSet;
/// Sanitizes telemetry data: redacts PII with regex rules, detects sensitive
/// keywords, replaces query text with a placeholder, and perturbs counts.
pub struct PrivacyFilter {
/// Privacy switches and parameters consulted by the methods of this type
/// (`strip_pii`, `block_sensitive`, `differential_privacy`, `dp_epsilon`).
config: PrivacyConfig,
/// PII rules, applied in list order by `strip_pii`.
pii_patterns: Vec<PiiPattern>,
/// Lowercase keywords that `contains_sensitive` searches for as substrings.
sensitive_keywords: HashSet<String>,
}
/// One PII redaction rule: a compiled regex plus the placeholder that
/// replaces each of its matches.
struct PiiPattern {
/// Human-readable label for the rule. It is not read by any code visible in
/// this file, which is why the `dead_code` lint is silenced for it.
#[allow(dead_code)]
name: &'static str,
/// Compiled expression that recognises the PII.
regex: Regex,
/// Placeholder passed to `replace_all` for every match (e.g. `[EMAIL]`).
replacement: &'static str,
}
impl PrivacyFilter {
/// Creates a filter that owns `config`.
///
/// The PII rule list and the sensitive-keyword set are built eagerly here so
/// that the per-event methods only have to read them.
pub fn new(config: PrivacyConfig) -> Self {
    Self {
        config,
        pii_patterns: Self::build_pii_patterns(),
        sensitive_keywords: Self::build_sensitive_keywords(),
    }
}
/// Builds the ordered list of PII redaction rules that `strip_pii` applies.
///
/// The rules run one after another on the same text, so their order is
/// significant: a rule that fires early can consume part of what a later rule
/// needs to see. The list is arranged so that
///
/// - credential URLs are handled first (otherwise the e-mail rule rewrites the
///   `password@host.tld` tail and the user name plus the rest of the URL
///   survive), and
/// - API keys, AWS keys and card numbers are handled before the phone rule
///   (otherwise any ten-digit run inside them becomes `[PHONE]` and the
///   remaining characters leak).
///
/// # Panics
/// Panics if one of the hard-coded expressions fails to compile, which would
/// be a programming error in this function.
fn build_pii_patterns() -> Vec<PiiPattern> {
    use once_cell::sync::Lazy;

    // Each expression is compiled once per process and cloned into the list.
    static EMAIL_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
            .expect("email regex is valid")
    });
    // The optional prefix must contain the country code `1`; a prefix that
    // could consist of a lone separator would swallow the space (or dash) in
    // front of the number and glue the placeholder to the previous word.
    static PHONE_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}")
            .expect("phone regex is valid")
    });
    static SSN_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"\b\d{3}[-.]?\d{2}[-.]?\d{4}\b").expect("SSN regex is valid")
    });
    static CARD_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"\b(?:\d{4}[-\s]?){3}\d{4}\b").expect("card regex is valid")
    });
    static IP_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"\b(?:\d{1,3}\.){3}\d{1,3}\b").expect("IP regex is valid")
    });
    // NOTE(review): `bearer` is only matched when followed by `:` or `=`, so a
    // header value such as `Bearer <token>` is not covered — confirm intent.
    static API_KEY_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r#"(?i)(api[_-]?key|apikey|secret[_-]?key|auth[_-]?token|bearer)\s*[:=]\s*['"]?[\w-]{20,}['"]?"#)
            .expect("API key regex is valid")
    });
    static AWS_KEY_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?i)AKIA[0-9A-Z]{16}").expect("AWS key regex is valid")
    });
    // `scheme://user:password@rest`. Neither the user nor the password part
    // may contain whitespace, so a match can no longer stretch from a plain
    // URL across a sentence to some later `:` and `@`. The user part also
    // excludes `/` so that `https://host/path:x@y` is not mistaken for
    // credentials.
    static AUTH_URL_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"https?://[^\s:/@]+:[^\s@]+@[^\s]+").expect("auth URL regex is valid")
    });
    static USER_PATH_RE: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?i)(/home/|/users/|C:\\Users\\)[a-zA-Z0-9._-]+")
            .expect("user path regex is valid")
    });

    vec![
        // Must precede `email`: the URL contains `password@host`.
        PiiPattern {
            name: "auth_url",
            regex: AUTH_URL_RE.clone(),
            replacement: "[AUTH_URL]",
        },
        PiiPattern {
            name: "email",
            regex: EMAIL_RE.clone(),
            replacement: "[EMAIL]",
        },
        // The next three must precede `phone`, `ssn` and `ip_address`: keys and
        // card numbers may contain digit runs those rules would bite into.
        PiiPattern {
            name: "api_key",
            regex: API_KEY_RE.clone(),
            replacement: "[API_KEY]",
        },
        PiiPattern {
            name: "aws_key",
            regex: AWS_KEY_RE.clone(),
            replacement: "[AWS_KEY]",
        },
        PiiPattern {
            name: "credit_card",
            regex: CARD_RE.clone(),
            replacement: "[CARD]",
        },
        PiiPattern {
            name: "phone",
            regex: PHONE_RE.clone(),
            replacement: "[PHONE]",
        },
        PiiPattern {
            name: "ssn",
            regex: SSN_RE.clone(),
            replacement: "[SSN]",
        },
        PiiPattern {
            name: "ip_address",
            regex: IP_RE.clone(),
            replacement: "[IP]",
        },
        PiiPattern {
            name: "user_path",
            regex: USER_PATH_RE.clone(),
            replacement: "[USER_PATH]",
        },
    ]
}
/// Returns the set of keywords that mark a text as sensitive.
///
/// Every entry is stored lowercased so that it can be compared against
/// lowercased input.
fn build_sensitive_keywords() -> HashSet<String> {
    const KEYWORDS: [&str; 10] = [
        "password",
        "passwd",
        "secret",
        "token",
        "credential",
        "private",
        "confidential",
        "sensitive",
        "ssn",
        "social",
    ];

    let mut keywords = HashSet::with_capacity(KEYWORDS.len());
    for word in KEYWORDS.iter() {
        keywords.insert(word.to_lowercase());
    }
    keywords
}
/// Returns `text` with every match of every PII rule replaced by that rule's
/// placeholder.
///
/// If `strip_pii` is disabled in the configuration the text is returned
/// unchanged. Rules are applied in list order, each one on the output of the
/// previous one.
pub fn strip_pii(&self, text: &str) -> String {
    use std::borrow::Cow;

    if !self.config.strip_pii {
        return text.to_string();
    }
    let mut result = text.to_string();
    for pattern in &self.pii_patterns {
        // `replace_all` returns a borrowed `Cow` when nothing matched. Only
        // adopt a new buffer when a replacement actually happened, instead of
        // re-copying the whole text once per rule.
        let replaced = match pattern.regex.replace_all(&result, pattern.replacement) {
            Cow::Owned(updated) => Some(updated),
            Cow::Borrowed(_) => None,
        };
        if let Some(updated) = replaced {
            result = updated;
        }
    }
    result
}
/// Returns the lowercase-hex SHA-256 digest of the normalized query.
///
/// Normalization lowercases the query, drops leading and trailing whitespace
/// and collapses every inner whitespace run to a single space, so queries that
/// differ only in case or spacing hash to the same value.
pub fn hash_query(&self, query: &str) -> String {
    let lowered = query.to_lowercase();

    // Re-join the whitespace-separated words with single spaces.
    let mut normalized = String::with_capacity(lowered.len());
    for word in lowered.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }

    let digest = Sha256::digest(normalized.as_bytes());
    format!("{:x}", digest)
}
/// Reports whether `text` contains any sensitive keyword.
///
/// The comparison is case-insensitive and substring-based, so a keyword also
/// matches inside a longer word (`token` matches `tokenizer`).
pub fn contains_sensitive(&self, text: &str) -> bool {
    let haystack = text.to_lowercase();
    for keyword in &self.sensitive_keywords {
        if haystack.contains(keyword.as_str()) {
            return true;
        }
    }
    false
}
/// Sanitizes a query event before it is passed on.
///
/// The query text is replaced by the fixed placeholder `[HASHED]`, and every
/// entry of `tools_used` is run through `strip_pii`.
///
/// # Errors
/// Returns `TelemetryError::PrivacyViolation` when `block_sensitive` is enabled
/// and the query text contains a sensitive keyword.
pub fn sanitize_query_event(&self, mut event: QueryEvent) -> TelemetryResult<QueryEvent> {
    if self.config.block_sensitive && self.contains_sensitive(&event.query_text) {
        return Err(TelemetryError::PrivacyViolation(
            "Query contains sensitive keywords".to_string(),
        ));
    }
    // The query hash used to be computed here and then thrown away; that dead
    // SHA-256 pass has been removed.
    // NOTE(review): if the event is meant to carry the hash, store
    // `self.hash_query(&event.query_text)` in the appropriate field before the
    // text is overwritten — confirm against `QueryEvent`.
    event.query_text = "[HASHED]".to_string();
    event.tools_used = event
        .tools_used
        .into_iter()
        .map(|tool| self.strip_pii(&tool))
        .collect();
    Ok(event)
}
/// Sanitizes a feedback event.
///
/// Currently a pass-through: the event is returned unchanged and the call
/// always succeeds.
// NOTE(review): no field of `FeedbackEvent` is scrubbed here — confirm that it
// carries no free text that would need `strip_pii`.
pub fn sanitize_feedback_event(&self, event: FeedbackEvent) -> TelemetryResult<FeedbackEvent> {
Ok(event)
}
/// Sanitizes a trace event by running every entry of `step_types` through
/// `strip_pii`. All other fields are left as they are; the call always
/// succeeds.
pub fn sanitize_trace_event(&self, mut event: TraceEvent) -> TelemetryResult<TraceEvent> {
    let scrubbed = event
        .step_types
        .into_iter()
        .map(|step| self.strip_pii(&step))
        .collect();
    event.step_types = scrubbed;
    Ok(event)
}
/// Perturbs `count` with Laplace noise for differential privacy.
///
/// When `differential_privacy` is disabled, `count` is returned unchanged.
/// Otherwise noise is drawn from a Laplace distribution with scale
/// `sensitivity / dp_epsilon` (sensitivity is 1: one event changes a count by
/// at most one), added to the count, clamped at zero and rounded.
///
/// The previous implementation added the constant `0.5 / epsilon`, which is a
/// fixed bias rather than noise and gives no privacy protection; it also
/// returned `u64::MAX` for `epsilon == 0`.
///
/// A non-positive or NaN `dp_epsilon` cannot be honoured, so the count is
/// suppressed (0 is returned) rather than released exactly.
pub fn add_dp_noise(&self, count: u64) -> u64 {
    use std::collections::hash_map::RandomState;
    use std::hash::{BuildHasher, Hasher};

    if !self.config.differential_privacy {
        return count;
    }
    let epsilon = self.config.dp_epsilon;
    if epsilon.is_nan() || epsilon <= 0.0 {
        // Misconfiguration: fail closed instead of leaking the exact count.
        return 0;
    }
    let sensitivity = 1.0;
    let scale = sensitivity / epsilon;

    // The standard library has no RNG API; a freshly seeded `RandomState`
    // hasher yields 64 randomly keyed bits without adding a dependency.
    // NOTE(review): if the crate already depends on `rand`/`getrandom`, prefer
    // a vetted CSPRNG (and a DP-hardened sampler) here.
    let bits = RandomState::new().build_hasher().finish();

    // Map the top 52 bits to a uniform value strictly inside (0, 1); the half
    // step keeps both endpoints out so the logarithm below stays finite.
    let unit = ((bits >> 12) as f64 + 0.5) / (1u64 << 52) as f64;
    let centered = unit - 0.5;

    // Inverse-CDF sampling of Laplace(0, scale).
    let noise = -scale * centered.signum() * (1.0 - 2.0 * centered.abs()).ln();

    // The float-to-int cast saturates, so large positive noise cannot wrap.
    (count as f64 + noise).max(0.0).round() as u64
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Configuration with every privacy feature switched on.
    fn test_config() -> PrivacyConfig {
        PrivacyConfig {
            strip_pii: true,
            block_sensitive: true,
            differential_privacy: true,
            dp_epsilon: 1.0,
            redact_file_paths: true,
        }
    }

    /// Filter built from `test_config`, shared by all tests below.
    fn filter_under_test() -> PrivacyFilter {
        PrivacyFilter::new(test_config())
    }

    /// An e-mail address is replaced in place by `[EMAIL]`.
    #[test]
    fn test_email_stripping() {
        let redacted =
            filter_under_test().strip_pii("Contact me at user@example.com for details");
        assert_eq!(redacted, "Contact me at [EMAIL] for details");
    }

    /// A dashed phone number is replaced and none of its digits remain.
    #[test]
    fn test_phone_stripping() {
        let redacted = filter_under_test().strip_pii("Call me at 555-123-4567");
        assert!(
            redacted.contains("[PHONE]"),
            "Expected [PHONE] in: {}",
            redacted
        );
        assert!(!redacted.contains("555"), "Phone number should be redacted");
    }

    /// A `key=value` style API key is replaced by `[API_KEY]`.
    #[test]
    fn test_api_key_stripping() {
        let redacted = filter_under_test().strip_pii("Set api_key=REDACTED_REDACTED_REDACTED");
        assert!(
            redacted.contains("[API_KEY]"),
            "Expected [API_KEY] in: {}",
            redacted
        );
    }

    /// Equal queries hash equally; different queries hash differently.
    #[test]
    fn test_query_hashing() {
        let filter = filter_under_test();
        let first = filter.hash_query("what is chain of thought");
        let repeat = filter.hash_query("what is chain of thought");
        let other = filter.hash_query("different query");
        assert_eq!(first, repeat);
        assert_ne!(first, other);
    }

    /// Keyword detection is case-insensitive and ignores harmless text.
    #[test]
    fn test_sensitive_detection() {
        let filter = filter_under_test();
        assert!(filter.contains_sensitive("my password is abc123"));
        assert!(filter.contains_sensitive("This is CONFIDENTIAL"));
        assert!(!filter.contains_sensitive("This is a normal query"));
    }

    /// A query with a sensitive keyword is rejected when blocking is enabled.
    #[test]
    fn test_sensitive_blocking() {
        let event = QueryEvent::new(uuid::Uuid::new_v4(), "my password is abc123".to_string());
        let outcome = filter_under_test().sanitize_query_event(event);
        assert!(outcome.is_err());
    }

    /// The user name in a home-directory path is redacted.
    #[test]
    fn test_user_path_stripping() {
        let redacted = filter_under_test().strip_pii("File at /home/johndoe/secrets.txt");
        assert!(redacted.contains("[USER_PATH]"));
        assert!(!redacted.contains("johndoe"));
    }
}