Skip to main content

vtcode_core/exec/
pii_tokenizer.rs

//! PII (Personally Identifiable Information) tokenization for data privacy.
//!
//! Automatically detects and tokenizes sensitive data before MCP tool calls,
//! preventing PII from entering model context or being logged.
//!
//! Features:
//! - Pattern-based detection (email, phone, SSN, credit card, etc.)
//! - Secure token generation and storage
//! - Automatic de-tokenization of tool results
//! - Configurable patterns and policies
//! - Audit trail of tokenized data
12
13use anyhow::{Context, Result};
14use hashbrown::HashMap;
15use once_cell::sync::Lazy;
16use regex::Regex;
17use serde::{Deserialize, Serialize};
18use std::sync::{Arc, Mutex};
19use tracing::debug;
20
21/// Types of PII that can be tokenized.
22#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
23pub enum PiiType {
24    Email,
25    PhoneNumber,
26    SocialSecurityNumber,
27    CreditCard,
28    IpAddress,
29    ApiKey,
30    AuthToken,
31    Url,
32    Custom,
33}
34
35impl PiiType {
36    pub fn as_str(&self) -> &'static str {
37        match self {
38            Self::Email => "email",
39            Self::PhoneNumber => "phone_number",
40            Self::SocialSecurityNumber => "ssn",
41            Self::CreditCard => "credit_card",
42            Self::IpAddress => "ip_address",
43            Self::ApiKey => "api_key",
44            Self::AuthToken => "auth_token",
45            Self::Url => "url",
46            Self::Custom => "custom",
47        }
48    }
49}
50
51// Compile default PII patterns once to avoid repeated regex compilation overhead.
52static DEFAULT_PII_PATTERNS: Lazy<Result<Vec<(PiiType, Regex)>, String>> = Lazy::new(|| {
53    let patterns = vec![
54        (
55            PiiType::Email,
56            r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
57        ),
58        (
59            PiiType::PhoneNumber,
60            r"(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}",
61        ),
62        (PiiType::SocialSecurityNumber, r"[0-9]{3}-[0-9]{2}-[0-9]{4}"),
63        (
64            PiiType::CreditCard,
65            r"[0-9]{4}[\s-]?[0-9]{4}[\s-]?[0-9]{4}[\s-]?[0-9]{4}",
66        ),
67        (
68            PiiType::IpAddress,
69            r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
70        ),
71        (
72            PiiType::ApiKey,
73            r#"(?:api[_-]?key|apikey|API[_-]?KEY)\s*[:=]\s*['"]?[a-zA-Z0-9_-]{32,}['"]?"#,
74        ),
75        (
76            PiiType::AuthToken,
77            r"(?:bearer|token|authorization)\s+[a-zA-Z0-9._-]+",
78        ),
79    ];
80
81    let mut compiled = Vec::with_capacity(patterns.len());
82    for (pii_type, pattern) in patterns {
83        match Regex::new(pattern) {
84            Ok(regex) => compiled.push((pii_type, regex)),
85            Err(e) => {
86                return Err(format!(
87                    "Failed to compile PII regex for {:?}: {}",
88                    pii_type, e
89                ));
90            }
91        }
92    }
93    Ok(compiled)
94});
95
96/// Detected PII instance with location and type.
97#[derive(Debug, Clone, Serialize, Deserialize)]
98pub struct DetectedPii {
99    pub value: String,
100    pub pii_type: PiiType,
101    pub start: usize,
102    pub end: usize,
103    pub context: String,
104}
105
106/// Token for replacing PII.
107#[derive(Debug, Clone, Serialize, Deserialize)]
108pub struct PiiToken {
109    pub token: String,
110    pub original_value: String,
111    pub pii_type: PiiType,
112    pub created_at: String,
113}
114
115/// Manager for PII tokenization with configurable detection patterns.
116#[derive(Clone)]
117pub struct PiiTokenizer {
118    patterns: HashMap<PiiType, Regex>,
119    inner: Arc<Mutex<PiiTokenizerInner>>,
120}
121
122/// Inner state for PiiTokenizer
123struct PiiTokenizerInner {
124    token_store: HashMap<String, PiiToken>,
125}
126
127impl PiiTokenizer {
128    /// Create a new PII tokenizer with default patterns.
129    pub fn new() -> Result<Self> {
130        // Build patterns from static defaults (compiled once)
131        // Note: Cloning Regex is cheap (Arc-based internally)
132        let patterns = DEFAULT_PII_PATTERNS
133            .as_ref()
134            .map_err(|e| anyhow::anyhow!("PII pattern initialization failed: {}", e))?
135            .iter()
136            .map(|(pii_type, regex)| (*pii_type, regex.clone()))
137            .collect();
138
139        Ok(Self {
140            patterns,
141            inner: Arc::new(Mutex::new(PiiTokenizerInner {
142                token_store: HashMap::new(),
143            })),
144        })
145    }
146
147    /// Detect PII in a string.
148    pub fn detect_pii(&self, text: &str) -> Result<Vec<DetectedPii>> {
149        let mut detected = Vec::with_capacity(8);
150
151        for (pii_type, pattern) in &self.patterns {
152            for mat in pattern.find_iter(text) {
153                // Only allocate value when actually detected (lazy allocation)
154                let value = text[mat.start()..mat.end()].to_string();
155                let context_start = mat.start().saturating_sub(20);
156                let context_end = (mat.end() + 20).min(text.len());
157                let context = text[context_start..context_end]
158                    .replace('\n', "\\n")
159                    .replace('\r', "\\r");
160
161                debug!(
162                    pii_type = pii_type.as_str(),
163                    context = %context,
164                    "Detected PII in text"
165                );
166
167                detected.push(DetectedPii {
168                    value,
169                    pii_type: *pii_type,
170                    start: mat.start(),
171                    end: mat.end(),
172                    context,
173                });
174            }
175        }
176
177        Ok(detected)
178    }
179
180    /// Tokenize PII in a string, returning modified text and token map.
181    pub fn tokenize_string(&self, text: &str) -> Result<(String, HashMap<String, PiiToken>)> {
182        let mut detected = self.detect_pii(text)?;
183
184        if detected.is_empty() {
185            return Ok((text.to_string(), HashMap::new()));
186        }
187
188        // Ensure deterministic right-to-left replacement order by source offsets.
189        detected.sort_by_key(|d| d.start);
190
191        let mut result = text.to_string();
192        let mut new_tokens = HashMap::new();
193
194        // Process detections in reverse order to maintain offsets
195        for detection in detected.iter().rev() {
196            let token = self.generate_token(&detection.value, detection.pii_type)?;
197            let token_str = &token.token;
198            result.replace_range(detection.start..detection.end, token_str);
199            new_tokens.insert(token_str.clone(), token);
200        }
201
202        // Store tokens for later de-tokenization
203        {
204            let mut inner = self
205                .inner
206                .lock()
207                .map_err(|e| anyhow::anyhow!("Failed to acquire token store lock: {}", e))?;
208            // Clone tokens into the inner store so we can still return them
209            inner.token_store.extend(new_tokens.clone());
210        }
211
212        debug!(pii_count = detected.len(), "Tokenized PII in string");
213
214        Ok((result, new_tokens))
215    }
216
217    /// De-tokenize a string using stored token map.
218    pub fn detokenize_string(&self, text: &str) -> Result<String> {
219        let inner = self
220            .inner
221            .lock()
222            .map_err(|e| anyhow::anyhow!("Failed to acquire token store lock: {}", e))?;
223        let mut result = text.to_string();
224
225        for (token, pii_token) in inner.token_store.iter() {
226            result = result.replace(token, &pii_token.original_value);
227        }
228
229        Ok(result)
230    }
231
232    /// Clear all stored tokens (for security).
233    pub fn clear_tokens(&self) {
234        let mut inner = match self.inner.lock() {
235            Ok(guard) => guard,
236            Err(poisoned) => poisoned.into_inner(),
237        };
238        inner.token_store.clear();
239        debug!("Cleared all PII tokens");
240    }
241
242    /// Get audit trail of tokenized data.
243    pub fn audit_trail(&self) -> Result<Vec<(String, PiiType, String)>> {
244        let inner = self
245            .inner
246            .lock()
247            .map_err(|e| anyhow::anyhow!("Failed to acquire token store lock: {}", e))?;
248        Ok(inner
249            .token_store
250            .values()
251            .map(|t| (t.token.clone(), t.pii_type, t.created_at.clone()))
252            .collect())
253    }
254
255    /// Generate a secure token for PII value.
256    fn generate_token(&self, value: &str, pii_type: PiiType) -> Result<PiiToken> {
257        use std::collections::hash_map::DefaultHasher;
258        use std::hash::{Hash, Hasher};
259
260        let mut hasher = DefaultHasher::new();
261        value.hash(&mut hasher);
262        let hash = hasher.finish();
263
264        let token = format!("__PII_{}_{:x}__", pii_type.as_str(), hash);
265
266        Ok(PiiToken {
267            token,
268            original_value: value.to_string(),
269            pii_type,
270            created_at: chrono::Utc::now().to_rfc3339(),
271        })
272    }
273
274    /// Register custom PII pattern.
275    pub fn register_pattern(&mut self, pii_type: PiiType, pattern: &str) -> Result<()> {
276        let regex = Regex::new(pattern).context("invalid regex pattern for PII detection")?;
277        self.patterns.insert(pii_type, regex);
278        debug!(
279            pii_type = pii_type.as_str(),
280            pattern = pattern,
281            "Registered custom PII pattern"
282        );
283        Ok(())
284    }
285}
286
287impl Default for PiiTokenizer {
288    fn default() -> Self {
289        Self::new().unwrap_or_else(|_| Self {
290            patterns: Default::default(),
291            inner: Arc::new(Mutex::new(PiiTokenizerInner {
292                token_store: HashMap::new(),
293            })),
294        })
295    }
296}
297
#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::Result;

    /// An e-mail address embedded in prose is reported as `PiiType::Email`.
    #[test]
    fn test_detect_email() -> Result<()> {
        let tokenizer = PiiTokenizer::new()?;
        let text = "Contact me at john@example.com for more info";
        let detected = tokenizer.detect_pii(text)?;

        assert!(!detected.is_empty());
        assert!(detected.iter().any(|d| d.pii_type == PiiType::Email));
        Ok(())
    }

    /// A dash-separated US phone number is reported as `PiiType::PhoneNumber`.
    #[test]
    fn test_detect_phone() -> Result<()> {
        let tokenizer = PiiTokenizer::new()?;
        let text = "Call me at 555-123-4567";
        let detected = tokenizer.detect_pii(text)?;

        assert!(!detected.is_empty());
        assert!(detected.iter().any(|d| d.pii_type == PiiType::PhoneNumber));
        Ok(())
    }

    /// Tokenizing removes every detected value from the text, and
    /// de-tokenizing restores the original input exactly.
    #[test]
    fn test_tokenize_string() -> Result<()> {
        let tokenizer = PiiTokenizer::new()?;
        let text = "Email: john@example.com, Phone: 555-123-4567";
        let (tokenized, tokens) = tokenizer.tokenize_string(text)?;

        assert!(tokenized.contains("__PII_"));
        assert!(!tokenized.contains("john@example.com"));
        assert!(!tokenized.contains("555-123-4567"));
        assert!(!tokens.is_empty());

        // Round trip: the stored tokens must map back to the original text.
        assert_eq!(tokenizer.detokenize_string(&tokenized)?, text);
        Ok(())
    }

    /// Plain prose without sensitive data yields no detections.
    #[test]
    fn test_no_pii_detected() -> Result<()> {
        let tokenizer = PiiTokenizer::new()?;
        let text = "This is regular text with no sensitive information";
        let detected = tokenizer.detect_pii(text)?;

        assert!(detected.is_empty());
        Ok(())
    }
}
346}