1#![deny(unsafe_code)]
18#![warn(missing_docs)]
19#![warn(rust_2018_idioms)]
20
21use aho_corasick::AhoCorasick;
22use regex::Regex;
23use serde::{Deserialize, Serialize};
24use thiserror::Error;
25
/// Convenience alias for results whose error type is [`MaskerError`].
pub type Result<T> = std::result::Result<T, MaskerError>;
28
/// Errors that can be returned while building a [`Masker`].
#[derive(Error, Debug)]
pub enum MaskerError {
    /// The configuration is unusable; the payload is a human-readable reason.
    #[error("invalid config: {0}")]
    InvalidConfig(String),
    /// A regular expression failed to compile.
    #[error("regex error: {0}")]
    Regex(#[from] regex::Error),
    /// An Aho-Corasick automaton could not be built.
    #[error("aho-corasick error: {0}")]
    Aho(#[from] aho_corasick::BuildError),
}
43
/// Detection rules that ship with the crate.
///
/// Each rule is backed by a regular expression. Variant names are
/// (de)serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum BuiltinRule {
    /// E-mail addresses (matched case-insensitively).
    Email,
    /// US-style phone numbers, with an optional `1` country code.
    UsPhone,
    /// US Social Security numbers written as `ddd-dd-dddd`.
    UsSsn,
    /// Dotted-quad IPv4 addresses.
    Ipv4,
    /// Colon-separated hexadecimal IPv6 addresses.
    Ipv6,
    /// Runs of 13 to 19 digits, optionally separated by spaces or hyphens.
    /// Candidates must also pass a Luhn check before they are masked.
    CreditCard,
    /// AWS access key ids: `AKIA` followed by 16 upper-case letters or digits.
    AwsAccessKey,
    /// GitHub tokens: a `ghp_`, `gho_`, `ghu_`, `ghr_` or `ghs_` prefix
    /// followed by at least 36 alphanumeric characters.
    GithubToken,
    /// JSON Web Tokens: three dot-separated segments, the first two starting
    /// with `eyJ`.
    Jwt,
    /// `http://` and `https://` URLs.
    Url,
    /// MAC addresses: six hex octets separated by `:` or `-`.
    MacAddress,
    /// IBANs: two upper-case letters, two digits, then 11 to 30 upper-case
    /// letters or digits.
    Iban,
}
74
75impl BuiltinRule {
76 pub fn tag(&self) -> &'static str {
78 match self {
79 Self::Email => "EMAIL",
80 Self::UsPhone => "US_PHONE",
81 Self::UsSsn => "US_SSN",
82 Self::Ipv4 => "IPV4",
83 Self::Ipv6 => "IPV6",
84 Self::CreditCard => "CREDIT_CARD",
85 Self::AwsAccessKey => "AWS_ACCESS_KEY",
86 Self::GithubToken => "GITHUB_TOKEN",
87 Self::Jwt => "JWT",
88 Self::Url => "URL",
89 Self::MacAddress => "MAC_ADDRESS",
90 Self::Iban => "IBAN",
91 }
92 }
93
94 fn pattern(&self) -> &'static str {
95 match self {
96 Self::Email => r"(?i)\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}\b",
99 Self::UsPhone => {
100 r"(?x)
101 \b
102 (?:\+?1[-.\ ]?)?
103 (?:\(\d{3}\)|\d{3})[-.\ ]?
104 \d{3}[-.\ ]?
105 \d{4}
106 \b
107 "
108 }
109 Self::UsSsn => r"\b\d{3}-\d{2}-\d{4}\b",
110 Self::Ipv4 => {
111 r"\b(?:(?:25[0-5]|2[0-4]\d|1?\d{1,2})\.){3}(?:25[0-5]|2[0-4]\d|1?\d{1,2})\b"
112 }
113 Self::Ipv6 => {
117 r"\b(?:[0-9a-fA-F]{1,4}:){2,7}[0-9a-fA-F]{1,4}\b|::(?:[0-9a-fA-F]{1,4}:?){1,7}\b"
118 }
119 Self::CreditCard => r"\b(?:\d[\ -]?){12,18}\d\b",
122 Self::AwsAccessKey => r"\bAKIA[0-9A-Z]{16}\b",
123 Self::GithubToken => r"\bgh[pours]_[A-Za-z0-9]{36,}\b",
126 Self::Jwt => r"\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b",
128 Self::Url => {
131 r"https?://[A-Za-z0-9.\-]+(?::\d+)?(?:/[A-Za-z0-9._~:/?#\[\]@!$&'()*+,;=%-]*)?"
132 }
133 Self::MacAddress => r"\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b",
135 Self::Iban => r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b",
138 }
139 }
140
141 pub fn all() -> [BuiltinRule; 12] {
143 [
144 Self::Email,
145 Self::UsPhone,
146 Self::UsSsn,
147 Self::Ipv4,
148 Self::Ipv6,
149 Self::CreditCard,
150 Self::AwsAccessKey,
151 Self::GithubToken,
152 Self::Jwt,
153 Self::Url,
154 Self::MacAddress,
155 Self::Iban,
156 ]
157 }
158}
159
/// How a detected value is rewritten in the masked output.
///
/// In the descriptions below, `KIND` stands for the match's kind label.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Strategy {
    /// Replace the value with `<KIND>`. This is the default strategy.
    #[default]
    Tag,
    /// Replace the value with `<KIND:xxxxxxxx>`, where `xxxxxxxx` is the
    /// first eight hex characters of the value's BLAKE3 hash.
    Hash,
    /// Replace every character of the value with `█`.
    Fixed,
    /// Delete the value, leaving nothing in its place.
    Remove,
    /// Keep the first `n` characters of the value, followed by `…<KIND>`.
    /// A value with at most `n` characters is replaced by `<KIND>` alone.
    Truncate(u8),
}
179
/// A single detected value and where it was found in the input.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct MaskMatch {
    /// Label of what was detected: a built-in rule's tag or the upper-cased
    /// label of a keyword set.
    pub kind: String,
    /// Byte offset in the input where the match starts (inclusive).
    pub start: usize,
    /// Byte offset in the input where the match ends (exclusive).
    pub end: usize,
    /// The matched text, exactly as it appeared in the input.
    pub value: String,
}
193
/// Outcome of masking one piece of text.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct MaskResult {
    /// The input text with every retained match rewritten.
    pub masked: String,
    /// The retained matches, ordered by start offset and non-overlapping.
    pub matches: Vec<MaskMatch>,
}
202
203pub struct Masker {
205 builtin: Vec<(BuiltinRule, Regex)>,
206 keywords: Vec<KeywordSet>,
207}
208
209struct KeywordSet {
210 label: String,
211 automaton: AhoCorasick,
212}
213
214impl Masker {
215 pub fn builder() -> MaskerBuilder {
217 MaskerBuilder::default()
218 }
219
220 pub fn with_all_builtins() -> Result<Self> {
222 let mut b = Self::builder();
223 for r in BuiltinRule::all() {
224 b = b.with_builtin(r);
225 }
226 b.build()
227 }
228
229 pub fn mask(&self, text: &str, strategy: Strategy) -> MaskResult {
231 let mut spans: Vec<MaskMatch> = Vec::new();
232
233 for (rule, regex) in &self.builtin {
235 for m in regex.find_iter(text) {
236 let value = &text[m.start()..m.end()];
237 if *rule == BuiltinRule::CreditCard && !is_luhn_valid(value) {
238 continue;
239 }
240 spans.push(MaskMatch {
241 kind: rule.tag().to_string(),
242 start: m.start(),
243 end: m.end(),
244 value: value.to_string(),
245 });
246 }
247 }
248
249 for set in &self.keywords {
251 for m in set.automaton.find_iter(text) {
252 spans.push(MaskMatch {
253 kind: set.label.clone(),
254 start: m.start(),
255 end: m.end(),
256 value: text[m.start()..m.end()].to_string(),
257 });
258 }
259 }
260
261 spans.sort_by(|a, b| a.start.cmp(&b.start).then(b.end.cmp(&a.end)));
263 let mut kept: Vec<MaskMatch> = Vec::with_capacity(spans.len());
264 let mut cursor = 0usize;
265 for m in spans {
266 if m.start < cursor {
267 continue;
268 }
269 cursor = m.end;
270 kept.push(m);
271 }
272
273 let mut out = String::with_capacity(text.len());
275 let mut last = 0usize;
276 for m in &kept {
277 out.push_str(&text[last..m.start]);
278 out.push_str(&render(&m.kind, &m.value, strategy));
279 last = m.end;
280 }
281 out.push_str(&text[last..]);
282 MaskResult {
283 masked: out,
284 matches: kept,
285 }
286 }
287}
288
289#[derive(Default)]
291pub struct MaskerBuilder {
292 builtins: Vec<BuiltinRule>,
293 keywords: Vec<(String, Vec<String>)>,
294}
295
296impl MaskerBuilder {
297 pub fn with_builtin(mut self, rule: BuiltinRule) -> Self {
299 if !self.builtins.contains(&rule) {
300 self.builtins.push(rule);
301 }
302 self
303 }
304
305 pub fn with_keywords<S: Into<String>>(mut self, label: S, needles: &[&str]) -> Self {
308 let label = label.into().to_uppercase();
309 let needles: Vec<String> = needles
310 .iter()
311 .filter(|s| !s.is_empty())
312 .map(|s| (*s).to_string())
313 .collect();
314 self.keywords.push((label, needles));
315 self
316 }
317
318 pub fn build(self) -> Result<Masker> {
320 if self.builtins.is_empty() && self.keywords.iter().all(|(_, n)| n.is_empty()) {
321 return Err(MaskerError::InvalidConfig(
322 "Masker has no rules; add at least one built-in or keyword set".into(),
323 ));
324 }
325 let mut builtin = Vec::with_capacity(self.builtins.len());
326 for r in self.builtins {
327 let re = Regex::new(r.pattern())?;
328 builtin.push((r, re));
329 }
330 let mut keywords = Vec::with_capacity(self.keywords.len());
331 for (label, needles) in self.keywords {
332 if needles.is_empty() {
333 continue;
334 }
335 let automaton = AhoCorasick::builder()
336 .ascii_case_insensitive(true)
337 .match_kind(aho_corasick::MatchKind::LeftmostLongest)
338 .build(&needles)?;
339 keywords.push(KeywordSet { label, automaton });
340 }
341 Ok(Masker { builtin, keywords })
342 }
343}
344
345fn render(kind: &str, value: &str, strategy: Strategy) -> String {
346 match strategy {
347 Strategy::Tag => format!("<{kind}>"),
348 Strategy::Hash => {
349 let mut hasher = blake3::Hasher::new();
350 hasher.update(value.as_bytes());
351 let h = hasher.finalize();
352 let hex = h.to_hex();
353 format!("<{kind}:{}>", &hex[..8])
354 }
355 Strategy::Fixed => "█".repeat(value.chars().count()),
356 Strategy::Remove => String::new(),
357 Strategy::Truncate(prefix) => {
358 let prefix = prefix as usize;
359 let kept: String = value.chars().take(prefix).collect();
360 if kept.chars().count() == value.chars().count() {
361 format!("<{kind}>")
364 } else {
365 format!("{kept}…<{kind}>")
366 }
367 }
368 }
369}
370
/// Reports whether the ASCII digits in `s` form a Luhn-valid number of 13 to
/// 19 digits. Every non-digit byte (spaces, hyphens, ...) is ignored.
fn is_luhn_valid(s: &str) -> bool {
    let digit_count = s.bytes().filter(u8::is_ascii_digit).count();
    if !(13..=19).contains(&digit_count) {
        return false;
    }

    // Walk the digits right to left, doubling every second one and folding
    // two-digit products back into a single digit.
    let checksum: u32 = s
        .bytes()
        .rev()
        .filter(u8::is_ascii_digit)
        .enumerate()
        .map(|(position, byte)| {
            let digit = u32::from(byte - b'0');
            if position % 2 == 0 {
                digit
            } else {
                let doubled = digit * 2;
                if doubled > 9 {
                    doubled - 9
                } else {
                    doubled
                }
            }
        })
        .sum();

    checksum % 10 == 0
}
396
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a masker with exactly one built-in rule enabled.
    fn masker_for(rule: BuiltinRule) -> Masker {
        Masker::builder().with_builtin(rule).build().unwrap()
    }

    /// Builds a masker that only detects e-mail addresses.
    fn email_masker() -> Masker {
        masker_for(BuiltinRule::Email)
    }

    #[test]
    fn detects_email() {
        let m = email_masker();
        let r = m.mask("hi alice@example.com bye", Strategy::Tag);
        assert_eq!(r.masked, "hi <EMAIL> bye");
        assert_eq!(r.matches.len(), 1);
        assert_eq!(r.matches[0].kind, "EMAIL");
        assert_eq!(r.matches[0].value, "alice@example.com");
    }

    #[test]
    fn no_match_returns_input_unchanged() {
        let m = email_masker();
        let r = m.mask("nothing here", Strategy::Tag);
        assert_eq!(r.masked, "nothing here");
        assert!(r.matches.is_empty());
    }

    #[test]
    fn luhn_valid_card_redacted() {
        let m = masker_for(BuiltinRule::CreditCard);
        let r = m.mask("paid 4111-1111-1111-1111 today", Strategy::Tag);
        assert_eq!(r.masked, "paid <CREDIT_CARD> today");
    }

    #[test]
    fn luhn_invalid_passes_through() {
        let m = masker_for(BuiltinRule::CreditCard);
        let r = m.mask("order 1234-5678-1234-5678", Strategy::Tag);
        assert_eq!(r.masked, "order 1234-5678-1234-5678");
        assert!(r.matches.is_empty());
    }

    #[test]
    fn ssn_redacted() {
        let m = masker_for(BuiltinRule::UsSsn);
        let r = m.mask("ssn 123-45-6789 ok", Strategy::Tag);
        assert_eq!(r.masked, "ssn <US_SSN> ok");
    }

    #[test]
    fn ipv4_redacted() {
        let m = masker_for(BuiltinRule::Ipv4);
        let r = m.mask("client 192.168.1.42", Strategy::Tag);
        assert_eq!(r.masked, "client <IPV4>");
    }

    #[test]
    fn aws_key_redacted() {
        let m = masker_for(BuiltinRule::AwsAccessKey);
        let r = m.mask("key AKIAIOSFODNN7EXAMPLE leaked", Strategy::Tag);
        assert_eq!(r.masked, "key <AWS_ACCESS_KEY> leaked");
    }

    #[test]
    fn github_token_redacted() {
        let m = masker_for(BuiltinRule::GithubToken);
        let token = "ghp_abcdefghijklmnopqrstuvwxyz0123456789";
        let r = m.mask(&format!("token {token} bad"), Strategy::Tag);
        assert_eq!(r.masked, "token <GITHUB_TOKEN> bad");
    }

    #[test]
    fn jwt_redacted() {
        let m = masker_for(BuiltinRule::Jwt);
        let jwt = "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1MSJ9.signature_part_long_enough";
        let r = m.mask(&format!("auth: {jwt} ok"), Strategy::Tag);
        assert_eq!(r.masked, "auth: <JWT> ok");
    }

    #[test]
    fn url_redacted() {
        let m = masker_for(BuiltinRule::Url);
        let r = m.mask("see https://example.com/path?x=1", Strategy::Tag);
        assert_eq!(r.masked, "see <URL>");
    }

    #[test]
    fn mac_address_redacted() {
        let m = masker_for(BuiltinRule::MacAddress);
        let r = m.mask("eth0 AA:BB:CC:DD:EE:FF up", Strategy::Tag);
        assert_eq!(r.masked, "eth0 <MAC_ADDRESS> up");
    }

    #[test]
    fn iban_redacted() {
        let m = masker_for(BuiltinRule::Iban);
        let r = m.mask("from DE89370400440532013000 today", Strategy::Tag);
        assert_eq!(r.masked, "from <IBAN> today");
    }

    #[test]
    fn truncate_strategy_keeps_prefix_and_appends_tag() {
        let m = masker_for(BuiltinRule::CreditCard);
        let r = m.mask("paid 4111-1111-1111-1111 today", Strategy::Truncate(4));
        assert_eq!(r.masked, "paid 4111…<CREDIT_CARD> today");
    }

    #[test]
    fn truncate_strategy_short_value_falls_back_to_tag() {
        let m = email_masker();
        let r = m.mask("hi a@b.com", Strategy::Truncate(20));
        assert_eq!(r.masked, "hi <EMAIL>");
    }

    #[test]
    fn all_returns_twelve_rules() {
        assert_eq!(BuiltinRule::all().len(), 12);
    }

    #[test]
    fn keywords_redacted_case_insensitive() {
        let m = Masker::builder()
            .with_keywords("customer", &["Acme Corp", "Globex"])
            .build()
            .unwrap();
        let r = m.mask("call ACME corp and globex tomorrow", Strategy::Tag);
        assert_eq!(r.masked, "call <CUSTOMER> and <CUSTOMER> tomorrow");
        assert_eq!(r.matches.len(), 2);
    }

    #[test]
    fn multiple_rules_resolved_left_to_right() {
        let m = Masker::builder()
            .with_builtin(BuiltinRule::Email)
            .with_builtin(BuiltinRule::CreditCard)
            .build()
            .unwrap();
        let text = "email alice@example.com card 4111-1111-1111-1111 done";
        let r = m.mask(text, Strategy::Tag);
        assert_eq!(r.masked, "email <EMAIL> card <CREDIT_CARD> done");
        assert_eq!(r.matches.len(), 2);
        assert!(r.matches[0].start < r.matches[1].start);
    }

    #[test]
    fn hash_strategy_is_stable_for_same_value() {
        let m = email_masker();
        let r1 = m.mask("a@b.com", Strategy::Hash);
        let r2 = m.mask("a@b.com", Strategy::Hash);
        assert_eq!(r1.masked, r2.masked);
        let r3 = m.mask("c@d.com", Strategy::Hash);
        assert_ne!(r1.masked, r3.masked);
    }

    #[test]
    fn hash_strategy_format() {
        let m = email_masker();
        let r = m.mask("a@b.com", Strategy::Hash);
        assert!(r.masked.starts_with("<EMAIL:"));
        assert!(r.masked.ends_with('>'));
        assert_eq!(r.masked.len(), "<EMAIL:".len() + 8 + 1);
    }

    #[test]
    fn fixed_strategy_preserves_length() {
        let m = email_masker();
        let input = "hi a@b.com bye";
        let r = m.mask(input, Strategy::Fixed);
        // One block glyph per character of the match, nothing else changed.
        let blocks = "█".repeat("a@b.com".chars().count());
        assert_eq!(r.masked, format!("hi {blocks} bye"));
        assert_eq!(r.masked.chars().count(), input.chars().count());
    }

    #[test]
    fn remove_strategy_strips_match() {
        let m = email_masker();
        let r = m.mask("hi a@b.com bye", Strategy::Remove);
        // Only the match is removed; the space on each side of it stays.
        assert_eq!(r.masked, "hi  bye");
    }

    #[test]
    fn empty_masker_rejected() {
        let r = Masker::builder().build();
        assert!(r.is_err());
    }

    #[test]
    fn with_all_builtins_works() {
        let m = Masker::with_all_builtins().unwrap();
        let r = m.mask("ip 10.0.0.1 ssn 123-45-6789", Strategy::Tag);
        assert!(r.matches.iter().any(|hit| hit.kind == "IPV4"));
        assert!(r.matches.iter().any(|hit| hit.kind == "US_SSN"));
    }

    #[test]
    fn luhn_known_valid_numbers() {
        for n in ["4111111111111111", "5500000000000004", "340000000000009"] {
            assert!(is_luhn_valid(n), "{n} should be Luhn-valid");
        }
    }

    #[test]
    fn luhn_known_invalid() {
        for n in ["1234567890123456", "0000000000000000123"] {
            assert!(!is_luhn_valid(n), "{n} should be Luhn-invalid");
        }
    }

    #[test]
    fn match_offsets_are_byte_accurate() {
        let m = email_masker();
        let text = "hi alice@example.com bye";
        let r = m.mask(text, Strategy::Tag);
        let m0 = &r.matches[0];
        assert_eq!(&text[m0.start..m0.end], "alice@example.com");
    }

    #[test]
    fn keyword_with_empty_needles_skipped() {
        let m = Masker::builder()
            .with_builtin(BuiltinRule::Email)
            .with_keywords("ignored", &[])
            .build()
            .unwrap();
        let r = m.mask("a@b.com", Strategy::Tag);
        assert_eq!(r.masked, "<EMAIL>");
    }

    #[test]
    fn unicode_safe_in_input() {
        let m = email_masker();
        let r = m.mask("hello 世界 a@b.com 🌍", Strategy::Tag);
        assert_eq!(r.masked, "hello 世界 <EMAIL> 🌍");
    }
}