Skip to main content

maskprompt_core/
lib.rs

//! Pure-Rust core for `maskprompt`. Detects common PII patterns plus
//! caller-supplied keywords and replaces them via one of five strategies.
3//!
4//! - **Built-in rules.** A small fixed set of regex patterns. Credit-card
5//!   matches are validated through Luhn before redaction so unrelated
6//!   13–19 digit strings (order numbers, etc.) are not flagged.
7//! - **Custom keywords.** An Aho-Corasick automaton over the union of all
8//!   user-supplied needles, so a list with thousands of entries (customer
9//!   names, internal project codes) still matches in linear time.
//! - **Strategies.** `Tag`, `Hash`, `Fixed`, `Remove`, `Truncate`. The `Hash`
//!   strategy uses blake3 truncated to 8 hex chars to give stable cross-run
//!   redaction without exposing the source value.
13//!
14//! Match resolution: when two rules overlap, the one that started earlier
15//! wins; ties go to the longer match.
16
17#![deny(unsafe_code)]
18#![warn(missing_docs)]
19#![warn(rust_2018_idioms)]
20
21use aho_corasick::AhoCorasick;
22use regex::Regex;
23use serde::{Deserialize, Serialize};
24use thiserror::Error;
25
/// Crate-wide result alias: `std::result::Result` with the error type fixed
/// to [`MaskerError`].
pub type Result<T> = std::result::Result<T, MaskerError>;
28
/// All errors surfaced by `maskprompt-core`.
///
/// The `Regex` and `Aho` variants carry `#[from]` conversions, so `?` applied
/// to the underlying library errors converts into this type automatically.
#[derive(Error, Debug)]
pub enum MaskerError {
    /// Caller supplied an invalid configuration. Returned by
    /// [`MaskerBuilder::build`] when neither a built-in rule nor a non-empty
    /// keyword set has been registered.
    #[error("invalid config: {0}")]
    InvalidConfig(String),
    /// A regex failed to compile. Should not happen at runtime; built-in
    /// patterns are tested. Surfaces if a future caller adds custom regex.
    #[error("regex error: {0}")]
    Regex(#[from] regex::Error),
    /// Aho-Corasick build failure while compiling a keyword set.
    #[error("aho-corasick error: {0}")]
    Aho(#[from] aho_corasick::BuildError),
}
43
/// Built-in detector identifiers.
///
/// Each variant maps to one regex (see `BuiltinRule::pattern`) and one
/// uppercase tag (see [`BuiltinRule::tag`]). Variants serialize in
/// `snake_case` through serde.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum BuiltinRule {
    /// Email addresses (RFC-5322-ish).
    Email,
    /// US phone numbers in the most common formats.
    UsPhone,
    /// US Social Security Numbers (`XXX-XX-XXXX`).
    UsSsn,
    /// IPv4 dotted-quad addresses.
    Ipv4,
    /// IPv6 addresses including `::` shorthand.
    Ipv6,
    /// 13-19 digit candidates that pass Luhn validation.
    CreditCard,
    /// AWS access key IDs (`AKIA…` + 16 alphanumerics).
    AwsAccessKey,
    /// GitHub personal access tokens / OAuth tokens (`ghp_`, `gho_`, `ghu_`,
    /// `ghr_`, `ghs_` prefixes).
    GithubToken,
    /// JWTs (three base64url segments separated by `.`).
    Jwt,
    /// HTTP/HTTPS URLs.
    Url,
    /// MAC addresses in colon or dash form (`AA:BB:CC:DD:EE:FF`).
    MacAddress,
    /// IBAN bank account numbers. Matches the standard 2-letter country
    /// code + 2 check digits + up to 30 alphanumerics.
    Iban,
}
74
75impl BuiltinRule {
76    /// Stable lowercase tag used as `<TAG>` and as the key in [`MaskMatch::kind`].
77    pub fn tag(&self) -> &'static str {
78        match self {
79            Self::Email => "EMAIL",
80            Self::UsPhone => "US_PHONE",
81            Self::UsSsn => "US_SSN",
82            Self::Ipv4 => "IPV4",
83            Self::Ipv6 => "IPV6",
84            Self::CreditCard => "CREDIT_CARD",
85            Self::AwsAccessKey => "AWS_ACCESS_KEY",
86            Self::GithubToken => "GITHUB_TOKEN",
87            Self::Jwt => "JWT",
88            Self::Url => "URL",
89            Self::MacAddress => "MAC_ADDRESS",
90            Self::Iban => "IBAN",
91        }
92    }
93
94    fn pattern(&self) -> &'static str {
95        match self {
96            // Conservative email pattern: no unicode escapes, no quoted local
97            // parts. Covers the >99% case in production logs.
98            Self::Email => r"(?i)\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}\b",
99            Self::UsPhone => {
100                r"(?x)
101                    \b
102                    (?:\+?1[-.\ ]?)?
103                    (?:\(\d{3}\)|\d{3})[-.\ ]?
104                    \d{3}[-.\ ]?
105                    \d{4}
106                    \b
107                "
108            }
109            Self::UsSsn => r"\b\d{3}-\d{2}-\d{4}\b",
110            Self::Ipv4 => {
111                r"\b(?:(?:25[0-5]|2[0-4]\d|1?\d{1,2})\.){3}(?:25[0-5]|2[0-4]\d|1?\d{1,2})\b"
112            }
113            // IPv6: simplified. Covers full and `::`-shorthand forms; will
114            // false-positive on a few invalid strings (we don't validate hex
115            // group counts strictly) but those are rare in production logs.
116            Self::Ipv6 => {
117                r"\b(?:[0-9a-fA-F]{1,4}:){2,7}[0-9a-fA-F]{1,4}\b|::(?:[0-9a-fA-F]{1,4}:?){1,7}\b"
118            }
119            // Credit card: 13-19 digits, optional dash/space separators every 4.
120            // Luhn validation runs after the regex match.
121            Self::CreditCard => r"\b(?:\d[\ -]?){12,18}\d\b",
122            Self::AwsAccessKey => r"\bAKIA[0-9A-Z]{16}\b",
123            // GitHub token formats: ghp_ (PAT), gho_ (OAuth), ghu_ (user-to-server),
124            // ghr_ (refresh), ghs_ (server-to-server).
125            Self::GithubToken => r"\bgh[pours]_[A-Za-z0-9]{36,}\b",
126            // JWT: three url-safe base64 segments. Allows long signatures.
127            Self::Jwt => r"\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b",
128            // URLs: http(s) + host + optional path. Conservative; doesn't
129            // try to catch unicode-heavy IDN forms.
130            Self::Url => {
131                r"https?://[A-Za-z0-9.\-]+(?::\d+)?(?:/[A-Za-z0-9._~:/?#\[\]@!$&'()*+,;=%-]*)?"
132            }
133            // MAC addresses in 6-group colon or dash notation.
134            Self::MacAddress => r"\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b",
135            // IBAN: 2 letters + 2 digits + 11–30 alphanumerics. Length cap
136            // matches the longest IBAN spec (Malta = 31 chars total).
137            Self::Iban => r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b",
138        }
139    }
140
141    /// Return all built-in rules in declaration order.
142    pub fn all() -> [BuiltinRule; 12] {
143        [
144            Self::Email,
145            Self::UsPhone,
146            Self::UsSsn,
147            Self::Ipv4,
148            Self::Ipv6,
149            Self::CreditCard,
150            Self::AwsAccessKey,
151            Self::GithubToken,
152            Self::Jwt,
153            Self::Url,
154            Self::MacAddress,
155            Self::Iban,
156        ]
157    }
158}
159
/// How matches are replaced in the masked string.
///
/// Serialized in `snake_case` through serde; [`Strategy::Tag`] is the
/// `Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Strategy {
    /// Replace with `<TAG>` (e.g. `<EMAIL>`). Default.
    #[default]
    Tag,
    /// Replace with `<TAG:abc12345>` where the suffix is blake3 of the
    /// original value, truncated to 8 hex characters. Stable across runs.
    Hash,
    /// Replace with `█` repeated once per character (`char`) of the original
    /// value, so the character count is preserved (the byte length is not).
    Fixed,
    /// Replace with the empty string.
    Remove,
    /// Keep the first `prefix` characters of the original value, then
    /// append `…<TAG>`. Useful when the prefix carries debugging signal
    /// (e.g. `4111…<CREDIT_CARD>` for the BIN of a card number).
    /// When the value has no more than `prefix` characters, the output is the
    /// bare `<TAG>` so the whole value is never echoed back.
    Truncate(u8),
}
179
/// One match found by [`Masker::mask`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct MaskMatch {
    /// Uppercase tag (built-in rules return their `tag()`, custom rules
    /// return the user-provided label uppercased).
    pub kind: String,
    /// Byte-offset of the match start in the original string.
    pub start: usize,
    /// Byte-offset of the match end in the original string (exclusive).
    pub end: usize,
    /// The matched substring (preserved so the caller can hash, log, etc.).
    /// This is the unredacted value; treat it as sensitive.
    pub value: String,
}
193
/// Output of [`Masker::mask`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct MaskResult {
    /// The redacted string.
    pub masked: String,
    /// One entry per redaction, in left-to-right order. Candidates dropped
    /// during overlap resolution are not listed.
    pub matches: Vec<MaskMatch>,
}
202
/// Compiled detector set. Build with [`Masker::builder`].
pub struct Masker {
    // Enabled built-in rules paired with their compiled regex, in the order
    // they were enabled on the builder.
    builtin: Vec<(BuiltinRule, Regex)>,
    // One automaton per non-empty keyword set registered on the builder.
    keywords: Vec<KeywordSet>,
}
208
/// A compiled custom keyword set: every needle shares one label.
struct KeywordSet {
    // Uppercased label reported as `MaskMatch::kind` for hits from this set.
    label: String,
    // Matcher over all needles of the set.
    automaton: AhoCorasick,
}
213
214impl Masker {
215    /// Start a new builder.
216    pub fn builder() -> MaskerBuilder {
217        MaskerBuilder::default()
218    }
219
220    /// Convenience: a Masker with every built-in rule and no custom keywords.
221    pub fn with_all_builtins() -> Result<Self> {
222        let mut b = Self::builder();
223        for r in BuiltinRule::all() {
224            b = b.with_builtin(r);
225        }
226        b.build()
227    }
228
229    /// Run all detectors against `text` and apply `strategy` to every match.
230    pub fn mask(&self, text: &str, strategy: Strategy) -> MaskResult {
231        let mut spans: Vec<MaskMatch> = Vec::new();
232
233        // Built-in detectors.
234        for (rule, regex) in &self.builtin {
235            for m in regex.find_iter(text) {
236                let value = &text[m.start()..m.end()];
237                if *rule == BuiltinRule::CreditCard && !is_luhn_valid(value) {
238                    continue;
239                }
240                spans.push(MaskMatch {
241                    kind: rule.tag().to_string(),
242                    start: m.start(),
243                    end: m.end(),
244                    value: value.to_string(),
245                });
246            }
247        }
248
249        // Custom keyword sets.
250        for set in &self.keywords {
251            for m in set.automaton.find_iter(text) {
252                spans.push(MaskMatch {
253                    kind: set.label.clone(),
254                    start: m.start(),
255                    end: m.end(),
256                    value: text[m.start()..m.end()].to_string(),
257                });
258            }
259        }
260
261        // Resolve overlaps: earliest start wins; ties broken by longer span.
262        spans.sort_by(|a, b| a.start.cmp(&b.start).then(b.end.cmp(&a.end)));
263        let mut kept: Vec<MaskMatch> = Vec::with_capacity(spans.len());
264        let mut cursor = 0usize;
265        for m in spans {
266            if m.start < cursor {
267                continue;
268            }
269            cursor = m.end;
270            kept.push(m);
271        }
272
273        // Build the masked string.
274        let mut out = String::with_capacity(text.len());
275        let mut last = 0usize;
276        for m in &kept {
277            out.push_str(&text[last..m.start]);
278            out.push_str(&render(&m.kind, &m.value, strategy));
279            last = m.end;
280        }
281        out.push_str(&text[last..]);
282        MaskResult {
283            masked: out,
284            matches: kept,
285        }
286    }
287}
288
/// Builder for [`Masker`].
///
/// Collects the enabled built-in rules and keyword sets; nothing is compiled
/// until [`MaskerBuilder::build`] is called.
#[derive(Default)]
pub struct MaskerBuilder {
    // Enabled built-in rules, de-duplicated, in the order they were added.
    builtins: Vec<BuiltinRule>,
    // `(uppercased label, non-empty needles)` per `with_keywords` call.
    keywords: Vec<(String, Vec<String>)>,
}
295
296impl MaskerBuilder {
297    /// Enable a built-in detector. Calling twice is idempotent.
298    pub fn with_builtin(mut self, rule: BuiltinRule) -> Self {
299        if !self.builtins.contains(&rule) {
300            self.builtins.push(rule);
301        }
302        self
303    }
304
305    /// Register a custom keyword set under `label`. Matching is
306    /// case-insensitive. Empty needles are silently dropped.
307    pub fn with_keywords<S: Into<String>>(mut self, label: S, needles: &[&str]) -> Self {
308        let label = label.into().to_uppercase();
309        let needles: Vec<String> = needles
310            .iter()
311            .filter(|s| !s.is_empty())
312            .map(|s| (*s).to_string())
313            .collect();
314        self.keywords.push((label, needles));
315        self
316    }
317
318    /// Build the [`Masker`].
319    pub fn build(self) -> Result<Masker> {
320        if self.builtins.is_empty() && self.keywords.iter().all(|(_, n)| n.is_empty()) {
321            return Err(MaskerError::InvalidConfig(
322                "Masker has no rules; add at least one built-in or keyword set".into(),
323            ));
324        }
325        let mut builtin = Vec::with_capacity(self.builtins.len());
326        for r in self.builtins {
327            let re = Regex::new(r.pattern())?;
328            builtin.push((r, re));
329        }
330        let mut keywords = Vec::with_capacity(self.keywords.len());
331        for (label, needles) in self.keywords {
332            if needles.is_empty() {
333                continue;
334            }
335            let automaton = AhoCorasick::builder()
336                .ascii_case_insensitive(true)
337                .match_kind(aho_corasick::MatchKind::LeftmostLongest)
338                .build(&needles)?;
339            keywords.push(KeywordSet { label, automaton });
340        }
341        Ok(Masker { builtin, keywords })
342    }
343}
344
345fn render(kind: &str, value: &str, strategy: Strategy) -> String {
346    match strategy {
347        Strategy::Tag => format!("<{kind}>"),
348        Strategy::Hash => {
349            let mut hasher = blake3::Hasher::new();
350            hasher.update(value.as_bytes());
351            let h = hasher.finalize();
352            let hex = h.to_hex();
353            format!("<{kind}:{}>", &hex[..8])
354        }
355        Strategy::Fixed => "█".repeat(value.chars().count()),
356        Strategy::Remove => String::new(),
357        Strategy::Truncate(prefix) => {
358            let prefix = prefix as usize;
359            let kept: String = value.chars().take(prefix).collect();
360            if kept.chars().count() == value.chars().count() {
361                // Whole value fit — no truncation needed; treat as Tag so we
362                // don't leak a value that was supposedly truncated.
363                format!("<{kind}>")
364            } else {
365                format!("{kept}…<{kind}>")
366            }
367        }
368    }
369}
370
/// Luhn checksum test for credit-card candidates.
///
/// Only ASCII digits take part; every other byte (spaces, dashes, anything
/// else) is ignored. Returns `false` unless there are 13 to 19 digits and the
/// Luhn sum is a multiple of 10.
fn is_luhn_valid(s: &str) -> bool {
    let digit_count = s.bytes().filter(u8::is_ascii_digit).count();
    if !matches!(digit_count, 13..=19) {
        return false;
    }

    // Walk the digits right to left; every second digit (odd index from the
    // right) is doubled, with 9 subtracted when the result has two digits.
    let checksum: u32 = s
        .bytes()
        .rev()
        .filter(u8::is_ascii_digit)
        .enumerate()
        .map(|(idx, byte)| {
            let digit = u32::from(byte - b'0');
            if idx % 2 == 0 {
                digit
            } else if digit * 2 > 9 {
                digit * 2 - 9
            } else {
                digit * 2
            }
        })
        .sum();

    checksum % 10 == 0
}
396
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a masker with exactly one built-in rule enabled.
    fn single_rule(rule: BuiltinRule) -> Masker {
        Masker::builder().with_builtin(rule).build().unwrap()
    }

    /// Masker that only detects email addresses.
    fn email_masker() -> Masker {
        single_rule(BuiltinRule::Email)
    }

    #[test]
    fn detects_email() {
        let m = email_masker();
        let r = m.mask("hi alice@example.com bye", Strategy::Tag);
        assert_eq!(r.masked, "hi <EMAIL> bye");
        assert_eq!(r.matches.len(), 1);
        assert_eq!(r.matches[0].kind, "EMAIL");
        assert_eq!(r.matches[0].value, "alice@example.com");
    }

    #[test]
    fn no_match_returns_input_unchanged() {
        let m = email_masker();
        let r = m.mask("nothing here", Strategy::Tag);
        assert_eq!(r.masked, "nothing here");
        assert!(r.matches.is_empty());
    }

    #[test]
    fn luhn_valid_card_redacted() {
        let m = single_rule(BuiltinRule::CreditCard);
        // Real Visa test number, passes Luhn.
        let r = m.mask("paid 4111-1111-1111-1111 today", Strategy::Tag);
        assert_eq!(r.masked, "paid <CREDIT_CARD> today");
    }

    #[test]
    fn luhn_invalid_passes_through() {
        let m = single_rule(BuiltinRule::CreditCard);
        // 16 digits but doesn't pass Luhn.
        let r = m.mask("order 1234-5678-1234-5678", Strategy::Tag);
        assert_eq!(r.masked, "order 1234-5678-1234-5678");
        assert!(r.matches.is_empty());
    }

    #[test]
    fn ssn_redacted() {
        let m = single_rule(BuiltinRule::UsSsn);
        let r = m.mask("ssn 123-45-6789 ok", Strategy::Tag);
        assert_eq!(r.masked, "ssn <US_SSN> ok");
    }

    #[test]
    fn ipv4_redacted() {
        let m = single_rule(BuiltinRule::Ipv4);
        let r = m.mask("client 192.168.1.42", Strategy::Tag);
        assert_eq!(r.masked, "client <IPV4>");
    }

    #[test]
    fn aws_key_redacted() {
        let m = single_rule(BuiltinRule::AwsAccessKey);
        let r = m.mask("key AKIAIOSFODNN7EXAMPLE leaked", Strategy::Tag);
        assert_eq!(r.masked, "key <AWS_ACCESS_KEY> leaked");
    }

    #[test]
    fn github_token_redacted() {
        let m = single_rule(BuiltinRule::GithubToken);
        let token = "ghp_abcdefghijklmnopqrstuvwxyz0123456789";
        let r = m.mask(&format!("token {token} bad"), Strategy::Tag);
        assert_eq!(r.masked, "token <GITHUB_TOKEN> bad");
    }

    #[test]
    fn jwt_redacted() {
        let m = single_rule(BuiltinRule::Jwt);
        let jwt = "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1MSJ9.signature_part_long_enough";
        let r = m.mask(&format!("auth: {jwt} ok"), Strategy::Tag);
        assert_eq!(r.masked, "auth: <JWT> ok");
    }

    #[test]
    fn url_redacted() {
        let m = single_rule(BuiltinRule::Url);
        let r = m.mask("see https://example.com/path?x=1", Strategy::Tag);
        assert_eq!(r.masked, "see <URL>");
    }

    #[test]
    fn mac_address_redacted() {
        let m = single_rule(BuiltinRule::MacAddress);
        let r = m.mask("eth0 AA:BB:CC:DD:EE:FF up", Strategy::Tag);
        assert_eq!(r.masked, "eth0 <MAC_ADDRESS> up");
    }

    #[test]
    fn iban_redacted() {
        let m = single_rule(BuiltinRule::Iban);
        // Real IBAN test value (Germany).
        let r = m.mask("from DE89370400440532013000 today", Strategy::Tag);
        assert_eq!(r.masked, "from <IBAN> today");
    }

    #[test]
    fn truncate_strategy_keeps_prefix_and_appends_tag() {
        let m = single_rule(BuiltinRule::CreditCard);
        // BIN-style preservation: keep first 4 digits.
        let r = m.mask("paid 4111-1111-1111-1111 today", Strategy::Truncate(4));
        assert_eq!(r.masked, "paid 4111…<CREDIT_CARD> today");
    }

    #[test]
    fn truncate_strategy_short_value_falls_back_to_tag() {
        let m = email_masker();
        // Value is "a@b.com" (7 chars). Truncate(20) wouldn't actually
        // truncate, so it falls back to <TAG> rather than leaking the value.
        let r = m.mask("hi a@b.com", Strategy::Truncate(20));
        assert_eq!(r.masked, "hi <EMAIL>");
    }

    #[test]
    fn all_returns_twelve_rules() {
        assert_eq!(BuiltinRule::all().len(), 12);
    }

    #[test]
    fn keywords_redacted_case_insensitive() {
        let m = Masker::builder()
            .with_keywords("customer", &["Acme Corp", "Globex"])
            .build()
            .unwrap();
        let r = m.mask("call ACME corp and globex tomorrow", Strategy::Tag);
        assert_eq!(r.masked, "call <CUSTOMER> and <CUSTOMER> tomorrow");
        assert_eq!(r.matches.len(), 2);
    }

    #[test]
    fn multiple_rules_resolved_left_to_right() {
        let m = Masker::builder()
            .with_builtin(BuiltinRule::Email)
            .with_builtin(BuiltinRule::CreditCard)
            .build()
            .unwrap();
        let text = "email alice@example.com card 4111-1111-1111-1111 done";
        let r = m.mask(text, Strategy::Tag);
        assert_eq!(r.masked, "email <EMAIL> card <CREDIT_CARD> done");
        assert_eq!(r.matches.len(), 2);
        assert!(r.matches[0].start < r.matches[1].start);
    }

    #[test]
    fn hash_strategy_is_stable_for_same_value() {
        let m = email_masker();
        let r1 = m.mask("a@b.com", Strategy::Hash);
        let r2 = m.mask("a@b.com", Strategy::Hash);
        assert_eq!(r1.masked, r2.masked);
        let r3 = m.mask("c@d.com", Strategy::Hash);
        assert_ne!(r1.masked, r3.masked);
    }

    #[test]
    fn hash_strategy_format() {
        let m = email_masker();
        let r = m.mask("a@b.com", Strategy::Hash);
        // <EMAIL:xxxxxxxx>
        assert!(r.masked.starts_with("<EMAIL:"));
        assert!(r.masked.ends_with('>'));
        assert_eq!(r.masked.len(), "<EMAIL:".len() + 8 + 1);
    }

    #[test]
    fn fixed_strategy_preserves_length() {
        let m = email_masker();
        let r = m.mask("hi a@b.com bye", Strategy::Fixed);
        // a@b.com -> 7 chars -> exactly 7 block characters. Compare the whole
        // string so a wrong block count fails the test (each block char is
        // 3 bytes in UTF-8, so length is counted in chars, not bytes).
        assert_eq!(r.masked, "hi ███████ bye");
        assert_eq!(
            r.masked.chars().count(),
            "hi a@b.com bye".chars().count()
        );
    }

    #[test]
    fn remove_strategy_strips_match() {
        let m = email_masker();
        let r = m.mask("hi a@b.com bye", Strategy::Remove);
        assert_eq!(r.masked, "hi  bye");
    }

    #[test]
    fn empty_masker_rejected() {
        let r = Masker::builder().build();
        assert!(r.is_err());
    }

    #[test]
    fn with_all_builtins_works() {
        let m = Masker::with_all_builtins().unwrap();
        let r = m.mask("ip 10.0.0.1 ssn 123-45-6789", Strategy::Tag);
        assert!(r.matches.iter().any(|m| m.kind == "IPV4"));
        assert!(r.matches.iter().any(|m| m.kind == "US_SSN"));
    }

    #[test]
    fn luhn_known_valid_numbers() {
        // Visa, MasterCard, Amex test numbers.
        for n in ["4111111111111111", "5500000000000004", "340000000000009"] {
            assert!(is_luhn_valid(n), "{n} should be Luhn-valid");
        }
    }

    #[test]
    fn luhn_known_invalid() {
        for n in ["1234567890123456", "0000000000000000123"] {
            assert!(!is_luhn_valid(n), "{n} should be Luhn-invalid");
        }
    }

    #[test]
    fn match_offsets_are_byte_accurate() {
        let m = email_masker();
        let text = "hi alice@example.com bye";
        let r = m.mask(text, Strategy::Tag);
        let m0 = &r.matches[0];
        assert_eq!(&text[m0.start..m0.end], "alice@example.com");
    }

    #[test]
    fn keyword_with_empty_needles_skipped() {
        // Empty needle list shouldn't crash AhoCorasick, just be ignored.
        let m = Masker::builder()
            .with_builtin(BuiltinRule::Email)
            .with_keywords("ignored", &[])
            .build()
            .unwrap();
        let r = m.mask("a@b.com", Strategy::Tag);
        assert_eq!(r.masked, "<EMAIL>");
    }

    #[test]
    fn unicode_safe_in_input() {
        let m = email_masker();
        let r = m.mask("hello 世界 a@b.com 🌍", Strategy::Tag);
        assert_eq!(r.masked, "hello 世界 <EMAIL> 🌍");
    }
}