use std::borrow::Cow;
use std::cmp::Ordering;

use regex::Regex;
use serde::{Deserialize, Serialize};
use thiserror::Error;
7
8#[derive(Debug, Error)]
9pub enum PrivacyError {
10 #[error("Invalid regex pattern: {0}")]
11 InvalidPattern(String),
12 #[error("Classification error: {0}")]
13 Classification(String),
14}
15
16#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
18pub enum SensitivityLevel {
19 Public,
20 Normal,
21 Sensitive,
22 HighlySensitive,
23 Critical,
24}
25
26impl std::fmt::Display for SensitivityLevel {
27 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
28 match self {
29 Self::Public => write!(f, "Public"),
30 Self::Normal => write!(f, "Normal"),
31 Self::Sensitive => write!(f, "Sensitive"),
32 Self::HighlySensitive => write!(f, "HighlySensitive"),
33 Self::Critical => write!(f, "Critical"),
34 }
35 }
36}
37
38impl Default for SensitivityLevel {
39 fn default() -> Self {
40 Self::Normal
41 }
42}
43
44impl PartialOrd for SensitivityLevel {
45 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
46 Some(self.cmp(other))
47 }
48}
49
50impl Ord for SensitivityLevel {
51 fn cmp(&self, other: &Self) -> Ordering {
52 let self_level = match self {
53 Self::Public => 0,
54 Self::Normal => 1,
55 Self::Sensitive => 2,
56 Self::HighlySensitive => 3,
57 Self::Critical => 4,
58 };
59 let other_level = match other {
60 Self::Public => 0,
61 Self::Normal => 1,
62 Self::Sensitive => 2,
63 Self::HighlySensitive => 3,
64 Self::Critical => 4,
65 };
66 self_level.cmp(&other_level)
67 }
68}
69
70#[derive(Debug, Clone, Serialize, Deserialize)]
72pub struct ClassificationRule {
73 pub name: String,
74 pub pattern: String,
75 pub level: SensitivityLevel,
76 pub description: String,
77}
78
79#[derive(Debug, Clone)]
81pub struct ClassificationMatch {
82 pub rule_name: String,
83 pub level: SensitivityLevel,
84 pub start: usize,
85 pub end: usize,
86 pub matched_text: String,
87}
88
89pub type PiiMatch = ClassificationMatch;
91
92#[derive(Debug, Clone)]
94pub struct ClassificationResult {
95 pub overall_level: SensitivityLevel,
96 pub matches: Vec<ClassificationMatch>,
97 pub requires_tee: bool,
98}
99
100#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
102pub enum RedactionStrategy {
103 Mask,
104 Remove,
105 Hash,
106}
107
108pub struct RegexClassifier {
110 rules: Vec<(String, Regex, SensitivityLevel)>,
111 default_level: SensitivityLevel,
112}
113
114impl RegexClassifier {
115 pub fn new(
117 rules: &[ClassificationRule],
118 default_level: SensitivityLevel,
119 ) -> Result<Self, PrivacyError> {
120 let compiled_rules = rules
121 .iter()
122 .map(|rule| {
123 let regex = Regex::new(&rule.pattern)
124 .map_err(|e| PrivacyError::InvalidPattern(format!("{}: {}", rule.name, e)))?;
125 Ok((rule.name.clone(), regex, rule.level))
126 })
127 .collect::<Result<Vec<_>, PrivacyError>>()?;
128
129 Ok(Self {
130 rules: compiled_rules,
131 default_level,
132 })
133 }
134
135 pub fn classify(&self, text: &str) -> ClassificationResult {
137 let mut matches = Vec::new();
138 let mut overall_level = self.default_level;
139
140 for (rule_name, regex, level) in &self.rules {
141 for mat in regex.find_iter(text) {
142 matches.push(ClassificationMatch {
143 rule_name: rule_name.clone(),
144 level: *level,
145 start: mat.start(),
146 end: mat.end(),
147 matched_text: mat.as_str().to_string(),
148 });
149 if *level > overall_level {
150 overall_level = *level;
151 }
152 }
153 }
154
155 let requires_tee = overall_level >= SensitivityLevel::Sensitive;
156
157 ClassificationResult {
158 overall_level,
159 matches,
160 requires_tee,
161 }
162 }
163
164 pub fn redact(&self, text: &str, strategy: RedactionStrategy) -> String {
166 let mut result = text.to_string();
167 let classification = self.classify(text);
168
169 let mut matches = classification.matches;
171 matches.sort_by(|a, b| b.start.cmp(&a.start));
172
173 for mat in matches {
174 let redacted = redact_text(&mat.matched_text, &mat.rule_name, strategy);
175 result.replace_range(mat.start..mat.end, &redacted);
176 }
177
178 result
179 }
180
181 pub fn contains_sensitive(&self, text: &str) -> bool {
183 self.classify(text).overall_level >= SensitivityLevel::Sensitive
184 }
185
186 pub fn get_sensitivity_level(&self, text: &str) -> SensitivityLevel {
188 self.classify(text).overall_level
189 }
190}
191
192pub fn redact_text(text: &str, rule_name: &str, strategy: RedactionStrategy) -> String {
194 match strategy {
195 RedactionStrategy::Mask => match rule_name {
196 "ssn" => "***-**-****".to_string(),
197 "email" => {
198 if let Some(at_pos) = text.find('@') {
199 format!("****{}", &text[at_pos..])
200 } else {
201 "[REDACTED]".to_string()
202 }
203 }
204 "credit_card" => {
205 let digits: String = text.chars().filter(|c| c.is_ascii_digit()).collect();
206 if digits.len() >= 4 {
207 format!("****-****-****-{}", &digits[digits.len() - 4..])
208 } else {
209 "****-****-****-****".to_string()
210 }
211 }
212 "phone" => "***-***-****".to_string(),
213 _ => "[REDACTED]".to_string(),
214 },
215 RedactionStrategy::Remove => String::new(),
216 RedactionStrategy::Hash => {
217 format!("[HASH:{}]", text.len())
218 }
219 }
220}
221
222pub fn default_classification_rules() -> Vec<ClassificationRule> {
224 vec![
225 ClassificationRule {
226 name: "credit_card".to_string(),
227 pattern: r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b".to_string(),
228 level: SensitivityLevel::HighlySensitive,
229 description: "Credit card number".to_string(),
230 },
231 ClassificationRule {
232 name: "ssn".to_string(),
233 pattern: r"\b\d{3}-\d{2}-\d{4}\b".to_string(),
234 level: SensitivityLevel::HighlySensitive,
235 description: "Social Security Number".to_string(),
236 },
237 ClassificationRule {
238 name: "email".to_string(),
239 pattern: r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b".to_string(),
240 level: SensitivityLevel::Sensitive,
241 description: "Email address".to_string(),
242 },
243 ClassificationRule {
244 name: "phone".to_string(),
245 pattern: r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b".to_string(),
246 level: SensitivityLevel::Sensitive,
247 description: "Phone number".to_string(),
248 },
249 ClassificationRule {
250 name: "api_key".to_string(),
251 pattern: r"\b[A-Za-z0-9_-]{32,}\b".to_string(),
252 level: SensitivityLevel::Critical,
253 description: "API key or token".to_string(),
254 },
255 ]
256}
257
/// Returns the built-in list of dangerous shell command fragments:
/// `rm -rf`, `dd if=`, `mkfs`, and the classic bash fork bomb.
///
/// NOTE(review): nothing visible in this file consumes the list; how callers
/// match against it (substring, prefix, …) should be confirmed at the call site.
pub fn default_dangerous_commands() -> Vec<String> {
    const COMMANDS: &[&str] = &["rm -rf", "dd if=", "mkfs", ":(){ :|:& };:"];

    COMMANDS.iter().map(|&command| command.to_owned()).collect()
}
267
268#[derive(Debug, Clone, Serialize, Deserialize)]
270pub struct KeywordMatcherConfig {
271 pub keywords: Vec<String>,
272 pub case_sensitive: bool,
273 pub sensitive_keywords: Vec<String>,
274 pub tee_threshold: SensitivityLevel,
275}
276
277impl Default for KeywordMatcherConfig {
278 fn default() -> Self {
279 Self {
280 keywords: Vec::new(),
281 case_sensitive: false,
282 sensitive_keywords: Vec::new(),
283 tee_threshold: SensitivityLevel::Sensitive,
284 }
285 }
286}
287
288pub struct KeywordMatcher {
290 keywords: Vec<String>,
291 case_sensitive: bool,
292 sensitive_keywords: Vec<String>,
293 tee_threshold: SensitivityLevel,
294}
295
296impl KeywordMatcher {
297 pub fn new(config: KeywordMatcherConfig) -> Self {
299 Self {
300 keywords: config.keywords,
301 case_sensitive: config.case_sensitive,
302 sensitive_keywords: config.sensitive_keywords,
303 tee_threshold: config.tee_threshold,
304 }
305 }
306
307 pub fn from_keywords(keywords: Vec<String>) -> Self {
309 Self {
310 keywords,
311 case_sensitive: false,
312 sensitive_keywords: Vec::new(),
313 tee_threshold: SensitivityLevel::Sensitive,
314 }
315 }
316
317 pub fn from_config(config: KeywordMatcherConfig) -> Self {
319 Self::new(config)
320 }
321
322 pub fn matches(&self, text: &str) -> bool {
324 let text_to_check = if self.case_sensitive {
325 text.to_string()
326 } else {
327 text.to_lowercase()
328 };
329
330 let mut all_keywords = self.keywords.iter().chain(self.sensitive_keywords.iter());
331
332 all_keywords.any(|keyword| {
333 let keyword_to_check = if self.case_sensitive {
334 keyword.clone()
335 } else {
336 keyword.to_lowercase()
337 };
338 text_to_check.contains(&keyword_to_check)
339 })
340 }
341
342 pub fn classify(&self, text: &str) -> SensitivityLevel {
344 let text_to_check = if self.case_sensitive {
345 text.to_string()
346 } else {
347 text.to_lowercase()
348 };
349
350 for keyword in &self.sensitive_keywords {
352 let keyword_to_check = if self.case_sensitive {
353 keyword.clone()
354 } else {
355 keyword.to_lowercase()
356 };
357 if text_to_check.contains(&keyword_to_check) {
358 return self.tee_threshold;
359 }
360 }
361
362 if self.matches(text) {
364 SensitivityLevel::Normal
365 } else {
366 SensitivityLevel::Public
367 }
368 }
369}
370
#[cfg(test)]
mod tests {
    use super::*;

    /// Classifier built from the built-in rules with `Normal` as the fallback level.
    fn default_classifier() -> RegexClassifier {
        RegexClassifier::new(&default_classification_rules(), SensitivityLevel::Normal)
            .expect("built-in rule patterns must compile")
    }

    /// Each level compares greater than the one below it.
    #[test]
    fn test_sensitivity_level_ordering() {
        assert!(SensitivityLevel::Critical > SensitivityLevel::HighlySensitive);
        assert!(SensitivityLevel::HighlySensitive > SensitivityLevel::Sensitive);
        assert!(SensitivityLevel::Sensitive > SensitivityLevel::Normal);
        assert!(SensitivityLevel::Normal > SensitivityLevel::Public);
    }

    /// A dash-separated card number yields exactly one `credit_card` match.
    #[test]
    fn test_classifier_credit_card() {
        let result = default_classifier().classify("My card is 4111-1111-1111-1111");

        assert_eq!(result.overall_level, SensitivityLevel::HighlySensitive);
        assert!(result.requires_tee);
        assert_eq!(result.matches.len(), 1);
        assert_eq!(result.matches[0].rule_name, "credit_card");
    }

    /// An e-mail address raises the level to `Sensitive` and requires a TEE.
    #[test]
    fn test_classifier_email() {
        let result = default_classifier().classify("Contact: test@example.com");

        assert_eq!(result.overall_level, SensitivityLevel::Sensitive);
        assert!(result.requires_tee);
    }

    /// The SSN mask is a fixed string.
    #[test]
    fn test_redact_ssn() {
        let redacted = redact_text("123-45-6789", "ssn", RedactionStrategy::Mask);
        assert_eq!(redacted, "***-**-****");
    }

    /// The credit card mask keeps only the last four digits.
    #[test]
    fn test_redact_credit_card() {
        let redacted = redact_text(
            "4111-1111-1111-1111",
            "credit_card",
            RedactionStrategy::Mask,
        );
        assert_eq!(redacted, "****-****-****-1111");
    }

    /// The e-mail mask hides the local part and keeps the domain.
    #[test]
    fn test_redact_email() {
        let redacted = redact_text("test@example.com", "email", RedactionStrategy::Mask);
        assert_eq!(redacted, "****@example.com");
    }

    /// `matches` fires on both ordinary and sensitive keywords.
    #[test]
    fn test_keyword_matcher() {
        let matcher = KeywordMatcher::new(KeywordMatcherConfig {
            keywords: vec!["secret".to_string()],
            case_sensitive: false,
            sensitive_keywords: vec!["password".to_string()],
            tee_threshold: SensitivityLevel::HighlySensitive,
        });

        assert!(matcher.matches("This is a secret message"));
        assert!(matcher.matches("Enter your password"));
        assert!(!matcher.matches("This is a normal message"));
    }

    /// A sensitive keyword yields the configured threshold; no keyword yields `Public`.
    #[test]
    fn test_keyword_matcher_classify() {
        let matcher = KeywordMatcher::new(KeywordMatcherConfig {
            keywords: vec![],
            case_sensitive: false,
            sensitive_keywords: vec!["confidential".to_string()],
            tee_threshold: SensitivityLevel::HighlySensitive,
        });

        assert_eq!(
            matcher.classify("This is confidential"),
            SensitivityLevel::HighlySensitive
        );
        assert_eq!(matcher.classify("This is public"), SensitivityLevel::Public);
    }
}