Skip to main content

sanitize_engine/
generator.rs

1//! Replacement generation strategies.
2//!
3//! Two concrete implementations:
//! - `HmacGenerator`: Deterministic, seeded with a 32-byte key. Same seed + same
//!   input = same output across runs. Derives its bytes with HMAC-SHA256; the
//!   category tag is mixed into the MAC input for domain separation.
6//! - `RandomGenerator`: Cryptographically random replacements. Non-deterministic.
7//!
8//! Both produce category-aware, format-preserving replacements.
9//!
10//! # Design Note
11//!
12//! This module contains the category-aware formatters used by the CLI binary.
13//! For an extensible strategy API that allows custom replacement logic, see
14//! the [`crate::strategy`] module.
15
16use crate::category::Category;
17use hmac::{Hmac, Mac};
18use rand::Rng;
19use sha2::Sha256;
20use zeroize::Zeroize;
21
22// ---------------------------------------------------------------------------
23// Trait
24// ---------------------------------------------------------------------------
25
/// Strategy for generating a sanitized replacement value.
///
/// Callers expect per-run consistency: the same `(category, original)` pair
/// should always end up mapped to the same replacement. An implementation can
/// provide that on its own by being deterministic in its inputs (and internal
/// state / seed), as `HmacGenerator` is. Alternatively it can rely on a
/// `MappingStore` that calls `generate` only once per unique value and caches
/// the result, which is how `RandomGenerator` is documented to be used.
///
/// The `Send + Sync` supertraits require every implementation to be usable
/// from multiple threads.
pub trait ReplacementGenerator: Send + Sync {
    /// Produce a sanitized replacement for `original` classified as `category`.
    fn generate(&self, category: &Category, original: &str) -> String;
}
36
37// ---------------------------------------------------------------------------
38// HMAC-SHA256 deterministic generator
39// ---------------------------------------------------------------------------
40
/// Deterministic replacement generator seeded with a 32-byte key.
///
/// ```text
/// replacement = format(category, HMAC-SHA256(key, category_tag || "\x00" || original))
/// ```
///
/// The same key + same `(category, original)` always yields the same output.
/// Different keys yield completely different outputs with overwhelming probability.
///
/// The key is wiped by the type's `Drop` implementation.
pub struct HmacGenerator {
    // HMAC key / seed; kept private so it can only be set through the constructors.
    key: [u8; 32],
}
52
53impl Drop for HmacGenerator {
54    fn drop(&mut self) {
55        self.key.zeroize();
56    }
57}
58
59impl HmacGenerator {
60    /// Create a new generator from a 32-byte seed.
61    #[must_use]
62    pub fn new(key: [u8; 32]) -> Self {
63        Self { key }
64    }
65
66    /// Create a generator from a byte slice (must be exactly 32 bytes).
67    ///
68    /// # Errors
69    ///
70    /// Returns [`SanitizeError::InvalidSeedLength`](crate::error::SanitizeError::InvalidSeedLength) if `bytes.len() != 32`.
71    pub fn from_slice(bytes: &[u8]) -> crate::error::Result<Self> {
72        if bytes.len() != 32 {
73            return Err(crate::error::SanitizeError::InvalidSeedLength(bytes.len()));
74        }
75        let mut key = [0u8; 32];
76        key.copy_from_slice(bytes);
77        Ok(Self { key })
78    }
79
80    /// Derive the raw 32-byte HMAC digest for `(category, original)`.
81    fn derive(&self, category: &Category, original: &str) -> [u8; 32] {
82        type HmacSha256 = Hmac<Sha256>;
83        let mut mac = HmacSha256::new_from_slice(&self.key).expect("HMAC accepts any key length");
84        let tag = category.domain_tag_hmac();
85        mac.update(tag.as_bytes());
86        mac.update(b"\x00"); // domain separator
87        mac.update(original.as_bytes());
88        let result = mac.finalize();
89        let mut out = [0u8; 32];
90        out.copy_from_slice(&result.into_bytes());
91        out
92    }
93}
94
95impl ReplacementGenerator for HmacGenerator {
96    fn generate(&self, category: &Category, original: &str) -> String {
97        let hash = self.derive(category, original);
98        format_replacement(category, &hash, original)
99    }
100}
101
102// ---------------------------------------------------------------------------
103// Cryptographically-random generator (non-deterministic)
104// ---------------------------------------------------------------------------
105
/// Random replacement generator using OS CSPRNG.
///
/// Every `generate` call yields a fresh random value, so this type is not
/// deterministic by itself. Consistency comes from the `MappingStore`, which
/// calls `generate` only once per unique `(category, original)` pair and
/// caches the result.
pub struct RandomGenerator;

impl RandomGenerator {
    /// Create a generator. The type is a stateless unit struct.
    #[must_use]
    pub fn new() -> Self {
        RandomGenerator
    }
}

impl Default for RandomGenerator {
    /// Equivalent to [`RandomGenerator::new`].
    fn default() -> Self {
        RandomGenerator
    }
}
125
126impl ReplacementGenerator for RandomGenerator {
127    fn generate(&self, category: &Category, original: &str) -> String {
128        let mut rng = rand::rng();
129        let mut hash = [0u8; 32];
130        rng.fill(&mut hash);
131        format_replacement(category, &hash, original)
132    }
133}
134
135// ---------------------------------------------------------------------------
136// Category-aware formatting helpers
137// ---------------------------------------------------------------------------
138
139/// Format a 32-byte hash into a length-preserving replacement whose
140/// byte length exactly matches `original.len()`. The shape is
141/// category-aware and deterministic for the same `(hash, original)` pair.
142fn format_replacement(category: &Category, hash: &[u8; 32], original: &str) -> String {
143    let target = original.len();
144    if target == 0 {
145        return String::new();
146    }
147    let hex = hex_bytes(hash);
148    match category {
149        Category::Email => format_email_lp(&hex, original, target),
150        Category::Name => format_name_lp(hash, &hex, target),
151        Category::Phone | Category::CreditCard | Category::IpV4 => {
152            format_digits_lp(hash, original, target)
153        }
154        Category::IpV6 | Category::MacAddress | Category::Uuid | Category::ContainerId => {
155            format_hex_digits_lp(hash, original, target)
156        }
157        Category::Ssn => format_ssn_lp(hash, original, target),
158        Category::Hostname => format_hostname_lp(&hex, original, target),
159        Category::Jwt => format_jwt_lp(hash, original, target),
160        Category::FilePath => format_filepath_lp(&hex, original, target),
161        Category::WindowsSid => format_windows_sid_lp(hash, original, target),
162        Category::Url => format_url_lp(&hex, original, target),
163        Category::AwsArn => format_arn_lp(&hex, original, target),
164        Category::AzureResourceId => format_azure_resource_id_lp(&hex, original, target),
165        Category::AuthToken | Category::Custom(_) => format_custom_lp(&hex, target),
166    }
167}
168
169// ---------------------------------------------------------------------------
170// Length-preserving helpers
171// ---------------------------------------------------------------------------
172
/// Pad or truncate `s` so that the result is exactly `target` bytes long.
///
/// * `s.len() == target` — `s` is returned unchanged.
/// * `s.len() > target` — `s` is cut at the largest char boundary that is
///   `<= target`. If a multi-byte character straddles the cut it is dropped
///   as a whole (instead of panicking on a mid-character slice) and the
///   freed bytes are filled like padding.
/// * `s.len() < target` — filler is appended.
///
/// Filler bytes are read from `hex` starting at index 0 and wrapping around
/// after 64. The exact-length guarantee assumes `hex` holds ASCII bytes, so
/// that every filler byte becomes a one-byte character.
fn pad_or_truncate(s: &str, target: usize, hex: &[u8; 64]) -> String {
    if s.len() == target {
        return s.to_string();
    }

    // Longest prefix of `s` that fits into `target` bytes without splitting a
    // UTF-8 sequence. Index 0 is always a boundary, so the loop terminates.
    let mut cut = s.len().min(target);
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }

    let mut buf = String::with_capacity(target);
    buf.push_str(&s[..cut]);
    buf.extend(hex.iter().cycle().take(target - cut).map(|&b| b as char));
    buf
}
191
192/// Length-preserving email replacement.
193/// Preserves the domain from the original; generates a hex username
194/// sized so the total byte length matches the original.
195fn format_email_lp(hex: &[u8; 64], original: &str, target: usize) -> String {
196    let domain = original
197        .rfind('@')
198        .map_or("x.co", |pos| &original[pos + 1..]);
199    let at_domain = 1 + domain.len(); // "@" + domain
200    if target <= at_domain {
201        // Too short to fit @domain — use hex fallback.
202        return pad_or_truncate("", target, hex);
203    }
204    let user_len = target - at_domain;
205    let mut buf = String::with_capacity(target);
206    for i in 0..user_len {
207        buf.push(hex[i % 64] as char);
208    }
209    buf.push('@');
210    buf.push_str(domain);
211    buf
212}
213
214/// Length-preserving name replacement.
215/// Generates a synthetic name via the hash-indexed table, then
216/// truncates or pads to match `target` bytes.
217fn format_name_lp(hash: &[u8; 32], hex: &[u8; 64], target: usize) -> String {
218    let raw = format_name(hash);
219    pad_or_truncate(&raw, target, hex)
220}
221
/// Substitute every character of `original` that satisfies `is_replaceable`.
///
/// The n-th replaceable character (counting from 0) becomes
/// `replacement(original_char, hash[n % 32])`; all other characters are
/// copied through unchanged.
///
/// Returns `None` when nothing was replaceable, so the caller can fall back
/// to another format.
fn format_char_class_lp(
    hash: &[u8; 32],
    original: &str,
    is_replaceable: impl Fn(char) -> bool,
    replacement: impl Fn(char, u8) -> char,
) -> Option<String> {
    let mut hash_bytes = hash.iter().copied().cycle();
    let mut replaced_any = false;

    let out: String = original
        .chars()
        .map(|ch| {
            if !is_replaceable(ch) {
                return ch;
            }
            replaced_any = true;
            let byte = hash_bytes
                .next()
                .expect("cycling over a 32-byte array never ends");
            replacement(ch, byte)
        })
        .collect();

    replaced_any.then_some(out)
}
246
247/// Length-preserving digit replacement.
248/// Preserves every non-digit character in `original`; replaces each
249/// ASCII digit with a deterministic digit derived from `hash`.
250/// Falls back to hex if the original contains no digits.
251fn format_digits_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
252    let hex = hex_bytes(hash);
253    format_char_class_lp(
254        hash,
255        original,
256        |c| c.is_ascii_digit(),
257        |_, b| (b'0' + b % 10) as char,
258    )
259    .unwrap_or_else(|| pad_or_truncate("", target, &hex))
260}
261
262/// Length-preserving hex-digit replacement (for IPv6, UUID, MAC, container ID).
263/// Preserves non-hex characters (colons, dashes, etc.); replaces each
264/// ASCII hex digit with a deterministic hex digit from `hash`, preserving case.
265fn format_hex_digits_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
266    let hex = hex_bytes(hash);
267    format_char_class_lp(
268        hash,
269        original,
270        |c| c.is_ascii_hexdigit(),
271        |ch, b| {
272            let nibble = b % 16;
273            if ch.is_ascii_uppercase() {
274                b"0123456789ABCDEF"[nibble as usize] as char
275            } else {
276                b"0123456789abcdef"[nibble as usize] as char
277            }
278        },
279    )
280    .unwrap_or_else(|| pad_or_truncate("", target, &hex))
281}
282
283/// Length-preserving SSN replacement.
284/// Preserves all non-digit characters.  The first three digit positions
285/// are forced to '0' (never-issued area code, clearly synthetic).
286/// Remaining digit positions are filled with deterministic digits.
287fn format_ssn_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
288    let has_digit = original.chars().any(|c| c.is_ascii_digit());
289    if !has_digit {
290        let hex = hex_bytes(hash);
291        return pad_or_truncate("", target, &hex);
292    }
293    let mut buf = String::with_capacity(target);
294    let mut digit_idx = 0usize;
295    for ch in original.chars() {
296        if ch.is_ascii_digit() {
297            if digit_idx < 3 {
298                buf.push('0');
299            } else {
300                buf.push((b'0' + hash[(digit_idx - 3) % 32] % 10) as char);
301            }
302            digit_idx += 1;
303        } else {
304            buf.push(ch);
305        }
306    }
307    buf
308}
309
310/// Length-preserving hostname replacement.
311/// Preserves the suffix (everything from the first `.` onward) and
312/// fills the prefix with deterministic hex characters to match `target`.
313fn format_hostname_lp(hex: &[u8; 64], original: &str, target: usize) -> String {
314    let suffix = original.find('.').map_or("", |p| &original[p..]);
315    let prefix_len = target.saturating_sub(suffix.len());
316    if prefix_len == 0 {
317        return pad_or_truncate("", target, hex);
318    }
319    let mut buf = String::with_capacity(target);
320    for i in 0..prefix_len {
321        buf.push(hex[i % 64] as char);
322    }
323    buf.push_str(suffix);
324    buf
325}
326
327/// Length-preserving custom replacement.
328/// Uses `__SANITIZED_<hex>__` format when the target is long enough;
329/// falls back to bare hex for short targets.
330fn format_custom_lp(hex: &[u8; 64], target: usize) -> String {
331    let prefix = "__SANITIZED_";
332    let suffix = "__";
333    let overhead = prefix.len() + suffix.len(); // 14
334    if target <= overhead {
335        return pad_or_truncate("", target, hex);
336    }
337    let hex_len = target - overhead;
338    let mut buf = String::with_capacity(target);
339    buf.push_str(prefix);
340    for i in 0..hex_len {
341        buf.push(hex[i % 64] as char);
342    }
343    buf.push_str(suffix);
344    buf
345}
346
347/// Length-preserving JWT replacement.
348/// Preserves `.` separators; replaces base64url characters
349/// (`[A-Za-z0-9_-]`) with deterministic base64url characters.
350fn format_jwt_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
351    const B64URL: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-";
352    let mut buf = String::with_capacity(target);
353    let mut hi = 0usize;
354    let mut had_b64 = false;
355    for ch in original.chars() {
356        if ch == '.' || ch == '=' {
357            buf.push(ch);
358        } else if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
359            buf.push(B64URL[hash[hi % 32] as usize % B64URL.len()] as char);
360            hi += 1;
361            had_b64 = true;
362        } else {
363            // Non-base64url, non-structural: emit byte-preserving replacement.
364            for _ in 0..ch.len_utf8() {
365                buf.push(B64URL[hash[hi % 32] as usize % B64URL.len()] as char);
366                hi += 1;
367            }
368            had_b64 = true;
369        }
370    }
371    if !had_b64 {
372        let hex = hex_bytes(hash);
373        return pad_or_truncate("", target, &hex);
374    }
375    buf
376}
377
378/// Length-preserving file path replacement.
379/// Preserves separators (`/`, `\`) and the final extension (from last `.`
380/// in the last segment). Replaces other characters with deterministic hex.
381fn format_filepath_lp(hex: &[u8; 64], original: &str, target: usize) -> String {
382    // Find the last path separator position to identify the filename segment.
383    let last_sep = original.rfind(['/', '\\']).map_or(0, |p| p + 1);
384    let filename = &original[last_sep..];
385    // Find extension in the filename (last `.` that isn't at position 0).
386    let ext_start = filename.rfind('.').filter(|&p| p > 0).map(|p| last_sep + p);
387
388    let mut buf = String::with_capacity(target);
389    let mut hi = 0usize;
390
391    for (i, ch) in original.char_indices() {
392        if matches!(ch, '/' | '\\') || ext_start.is_some_and(|es| i >= es) {
393            // Preserve separators and the file extension.
394            buf.push(ch);
395        } else {
396            // Emit as many ASCII hex bytes as the original char's UTF-8 length.
397            for _ in 0..ch.len_utf8() {
398                buf.push(hex[hi % 64] as char);
399                hi += 1;
400            }
401        }
402    }
403    // Ensure exact length (should be equal for ASCII, but guard anyway).
404    if buf.len() != target {
405        return pad_or_truncate(&buf, target, hex);
406    }
407    buf
408}
409
410/// Length-preserving Windows SID replacement.
411/// Preserves the `S-` prefix and `-` separators; replaces digit groups
412/// with deterministic digits.
413fn format_windows_sid_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
414    let has_digit = original.chars().any(|c| c.is_ascii_digit());
415    if !has_digit {
416        let hex = hex_bytes(hash);
417        return pad_or_truncate("", target, &hex);
418    }
419    let mut buf = String::with_capacity(target);
420    let mut hi = 0usize;
421    for ch in original.chars() {
422        if ch == 'S' || ch == '-' {
423            buf.push(ch);
424        } else if ch.is_ascii_digit() {
425            buf.push((b'0' + hash[hi % 32] % 10) as char);
426            hi += 1;
427        } else {
428            // Non-digit, non-structural: emit byte-count-preserving hex.
429            for _ in 0..ch.len_utf8() {
430                buf.push((b'0' + hash[hi % 32] % 10) as char);
431                hi += 1;
432            }
433        }
434    }
435    buf
436}
437
/// Shared core for length-preserving hex replacement.
///
/// Characters for which `is_structural` returns `true` are copied through.
/// Every other character is replaced by one character from `hex` per UTF-8
/// byte it occupies; the position in `hex` advances across the whole string
/// and wraps after 64. `target` is used only as the capacity hint for the
/// output buffer.
///
/// Returns `None` when `original` had nothing to replace, in which case the
/// caller should fall back to [`pad_or_truncate`].
fn format_preserving_hex_lp(
    hex: &[u8; 64],
    original: &str,
    target: usize,
    is_structural: impl Fn(char) -> bool,
) -> Option<String> {
    let mut filler = hex.iter().cycle().map(|&b| b as char);
    let mut out = String::with_capacity(target);
    let mut replaced_any = false;

    for ch in original.chars() {
        if is_structural(ch) {
            out.push(ch);
            continue;
        }
        out.extend(filler.by_ref().take(ch.len_utf8()));
        replaced_any = true;
    }

    replaced_any.then_some(out)
}
469
470/// Length-preserving URL replacement.
471/// Preserves scheme prefix and structural characters
472/// (`://`, `/`, `?`, `=`, `&`, `#`, `:`); replaces content characters
473/// with deterministic hex.
474fn format_url_lp(hex: &[u8; 64], original: &str, target: usize) -> String {
475    format_preserving_hex_lp(hex, original, target, |ch| "/:?=&#@.".contains(ch))
476        .unwrap_or_else(|| pad_or_truncate("", target, hex))
477}
478
479/// Length-preserving AWS ARN replacement.
480/// Preserves `:` and `/` separators; replaces alphanumeric content
481/// in account/resource segments with deterministic hex.
482fn format_arn_lp(hex: &[u8; 64], original: &str, target: usize) -> String {
483    format_preserving_hex_lp(hex, original, target, |ch| ch == ':' || ch == '/')
484        .unwrap_or_else(|| pad_or_truncate("", target, hex))
485}
486
487/// Length-preserving Azure Resource ID replacement.
488/// Preserves `/` path separators and well-known Azure segment names
489/// (`subscriptions`, `resourceGroups`, `providers`, `resourcegroups`).
490/// Replaces variable segments (IDs, names) with deterministic hex.
491fn format_azure_resource_id_lp(hex: &[u8; 64], original: &str, target: usize) -> String {
492    const KNOWN_SEGMENTS: &[&str] = &[
493        "subscriptions",
494        "resourceGroups",
495        "resourcegroups",
496        "providers",
497    ];
498
499    let mut buf = String::with_capacity(target);
500    let mut hi = 0usize;
501
502    // Split on `/`, rebuild with deterministic replacement for non-known segments.
503    let mut prev_was_providers = false;
504    for (pi, part) in original.split('/').enumerate() {
505        if pi > 0 {
506            buf.push('/');
507        }
508        // Dotted segments (e.g. `Microsoft.Compute`) are only preserved when
509        // they immediately follow a `providers` segment. Preserving all dotted
510        // segments would accidentally pass through IPs or hostnames that appear
511        // elsewhere in the path.
512        let is_provider_namespace = prev_was_providers && part.contains('.');
513        if part.is_empty() || KNOWN_SEGMENTS.contains(&part) || is_provider_namespace {
514            buf.push_str(part);
515        } else {
516            // Replace this segment character-by-character to preserve byte length.
517            for ch in part.chars() {
518                for _ in 0..ch.len_utf8() {
519                    buf.push(hex[hi % 64] as char);
520                    hi += 1;
521                }
522            }
523        }
524        prev_was_providers = part == "providers" || part == "Providers";
525    }
526    if buf.len() != target {
527        return pad_or_truncate(&buf, target, hex);
528    }
529    buf
530}
531
/// Build a synthetic `"First Last"` name selected by the first two hash bytes.
///
/// `hash[0]` picks the first name and `hash[1]` the last name, each modulo the
/// size of its table.
fn format_name(hash: &[u8; 32]) -> String {
    // Small fixed tables of name fragments. The result is NOT meant to be
    // realistic — it is meant to be obviously synthetic while remaining
    // structurally plausible.
    const FIRST: &[&str] = &[
        "Alex", "Blake", "Casey", "Dana", "Ellis", "Finley", "Gray", "Harper", "Ira", "Jordan",
        "Kai", "Lane", "Morgan", "Noel", "Oakley", "Parker", "Quinn", "Reese", "Sage", "Taylor",
        "Uri", "Val", "Wren", "Xen", "Yael", "Zion", "Arden", "Blair", "Corin", "Drew", "Emery",
        "Frost",
    ];
    const LAST: &[&str] = &[
        "Ashford",
        "Blackwell",
        "Crawford",
        "Dalton",
        "Eastwood",
        "Fairbanks",
        "Garrison",
        "Hartley",
        "Irvine",
        "Jensen",
        "Kendrick",
        "Langley",
        "Mercer",
        "Newland",
        "Oakwood",
        "Preston",
        "Quinlan",
        "Redmond",
        "Shepard",
        "Thornton",
        "Underwood",
        "Vance",
        "Whitmore",
        "Xavier",
        "Yardley",
        "Zimmer",
        "Ashton",
        "Beckett",
        "Calloway",
        "Dempsey",
        "Eldridge",
        "Fletcher",
    ];

    let first = FIRST[usize::from(hash[0]) % FIRST.len()];
    let last = LAST[usize::from(hash[1]) % LAST.len()];

    let mut name = String::with_capacity(first.len() + 1 + last.len());
    name.push_str(first);
    name.push(' ');
    name.push_str(last);
    name
}
581
/// Hex-encode 32 bytes into 64 lowercase ASCII bytes, without allocating.
///
/// Each input byte yields two output bytes: high nibble first, then low nibble.
fn hex_bytes(bytes: &[u8; 32]) -> [u8; 64] {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut encoded = [0u8; 64];
    for (pair, &byte) in encoded.chunks_exact_mut(2).zip(bytes) {
        pair[0] = DIGITS[usize::from(byte >> 4)];
        pair[1] = DIGITS[usize::from(byte & 0x0f)];
    }
    encoded
}
592
593// ---------------------------------------------------------------------------
594// Tests
595// ---------------------------------------------------------------------------
596
597#[cfg(test)]
598mod tests {
599    use super::*;
600
601    #[test]
602    fn hmac_deterministic_same_input() {
603        let gen = HmacGenerator::new([42u8; 32]);
604        let a = gen.generate(&Category::Email, "alice@corp.com");
605        let b = gen.generate(&Category::Email, "alice@corp.com");
606        assert_eq!(a, b, "same seed + same input must produce same output");
607    }
608
609    #[test]
610    fn hmac_different_inputs_differ() {
611        let gen = HmacGenerator::new([42u8; 32]);
612        let a = gen.generate(&Category::Email, "alice@corp.com");
613        let b = gen.generate(&Category::Email, "bob@corp.com");
614        assert_ne!(a, b);
615    }
616
617    #[test]
618    fn hmac_different_seeds_differ() {
619        let g1 = HmacGenerator::new([1u8; 32]);
620        let g2 = HmacGenerator::new([2u8; 32]);
621        let a = g1.generate(&Category::Email, "alice@corp.com");
622        let b = g2.generate(&Category::Email, "alice@corp.com");
623        assert_ne!(a, b);
624    }
625
626    #[test]
627    fn hmac_different_categories_differ() {
628        let gen = HmacGenerator::new([42u8; 32]);
629        let a = gen.generate(&Category::Email, "test");
630        let b = gen.generate(&Category::Name, "test");
631        assert_ne!(a, b, "different categories must produce different outputs");
632    }
633
634    #[test]
635    fn email_format() {
636        let gen = HmacGenerator::new([0u8; 32]);
637        let orig = "alice@corp.com";
638        let out = gen.generate(&Category::Email, orig);
639        assert!(out.contains('@'), "email must contain @");
640        assert!(out.ends_with("@corp.com"), "email must preserve domain");
641        assert_eq!(out.len(), orig.len(), "email must preserve length");
642    }
643
644    #[test]
645    fn ipv4_format() {
646        let gen = HmacGenerator::new([0u8; 32]);
647        let orig = "192.168.1.1";
648        let out = gen.generate(&Category::IpV4, orig);
649        // Dots preserved, length preserved.
650        let parts: Vec<&str> = out.split('.').collect();
651        assert_eq!(parts.len(), 4);
652        assert_eq!(out.len(), orig.len(), "ipv4 must preserve length");
653    }
654
655    #[test]
656    fn ssn_format() {
657        let gen = HmacGenerator::new([7u8; 32]);
658        let orig = "123-45-6789";
659        let out = gen.generate(&Category::Ssn, orig);
660        assert!(out.starts_with("000-"), "SSN must start with 000");
661        assert_eq!(out.len(), orig.len(), "SSN must preserve length");
662    }
663
664    #[test]
665    fn phone_format() {
666        let gen = HmacGenerator::new([3u8; 32]);
667        let orig = "+1-212-555-0100";
668        let out = gen.generate(&Category::Phone, orig);
669        // Formatting characters preserved.
670        assert!(out.starts_with('+'));
671        assert_eq!(
672            out.chars().filter(|c| *c == '-').count(),
673            orig.chars().filter(|c| *c == '-').count(),
674            "dashes must be preserved"
675        );
676        assert_eq!(out.len(), orig.len(), "phone must preserve length");
677    }
678
679    #[test]
680    fn hostname_format() {
681        let gen = HmacGenerator::new([5u8; 32]);
682        let orig = "db-prod-01.internal";
683        let out = gen.generate(&Category::Hostname, orig);
684        assert!(out.ends_with(".internal"), "hostname must preserve suffix");
685        assert_eq!(out.len(), orig.len(), "hostname must preserve length");
686    }
687
688    #[test]
689    fn custom_format() {
690        let gen = HmacGenerator::new([9u8; 32]);
691        let cat = Category::Custom("api_key".into());
692        // Use an input long enough for the __SANITIZED_..__ wrapper (>14 chars).
693        let orig = "sk-abc123-very-long-key";
694        let out = gen.generate(&cat, orig);
695        assert!(out.starts_with("__SANITIZED_"));
696        assert!(out.ends_with("__"));
697        assert_eq!(out.len(), orig.len(), "custom must preserve length");
698    }
699
700    #[test]
701    fn custom_format_short() {
702        let gen = HmacGenerator::new([9u8; 32]);
703        let cat = Category::Custom("api_key".into());
704        // Short input falls back to hex.
705        let orig = "sk-abc123";
706        let out = gen.generate(&cat, orig);
707        assert_eq!(
708            out.len(),
709            orig.len(),
710            "custom must preserve length even for short inputs"
711        );
712    }
713
714    #[test]
715    fn random_generator_produces_valid_format() {
716        let gen = RandomGenerator::new();
717        let orig = "test@example.com";
718        let out = gen.generate(&Category::Email, orig);
719        assert!(out.contains('@'));
720        assert_eq!(
721            out.len(),
722            orig.len(),
723            "random generator must preserve length"
724        );
725    }
726
727    #[test]
728    fn from_slice_rejects_bad_length() {
729        let result = HmacGenerator::from_slice(&[0u8; 16]);
730        assert!(result.is_err());
731    }
732
733    #[test]
734    fn credit_card_format() {
735        let gen = HmacGenerator::new([11u8; 32]);
736        let orig = "4111-1111-1111-1111";
737        let out = gen.generate(&Category::CreditCard, orig);
738        // Should be ####-####-####-####
739        let parts: Vec<&str> = out.split('-').collect();
740        assert_eq!(parts.len(), 4);
741        for part in &parts {
742            assert_eq!(part.len(), 4);
743            assert!(part.chars().all(|c| c.is_ascii_digit()));
744        }
745        assert_eq!(out.len(), orig.len(), "credit card must preserve length");
746    }
747
748    #[test]
749    fn name_format() {
750        let gen = HmacGenerator::new([0u8; 32]);
751        let orig = "John Doe";
752        let out = gen.generate(&Category::Name, orig);
753        assert_eq!(out.len(), orig.len(), "name must preserve length");
754    }
755
756    #[test]
757    fn ipv6_format() {
758        let gen = HmacGenerator::new([0u8; 32]);
759        let orig = "fd00:abcd:1234:5678::1";
760        let out = gen.generate(&Category::IpV6, orig);
761        // Colons and :: preserved, length preserved.
762        assert_eq!(
763            out.chars().filter(|c| *c == ':').count(),
764            orig.chars().filter(|c| *c == ':').count(),
765            "colons must be preserved"
766        );
767        assert_eq!(out.len(), orig.len(), "ipv6 must preserve length");
768    }
769
770    #[test]
771    fn length_preserved_all_categories() {
772        let gen = HmacGenerator::new([42u8; 32]);
773        let cases: Vec<(Category, &str)> = vec![
774            (Category::Email, "alice@corp.com"),
775            (Category::Name, "John Doe"),
776            (Category::Phone, "+1-212-555-0100"),
777            (Category::IpV4, "192.168.1.1"),
778            (Category::IpV6, "fd00::1"),
779            (Category::CreditCard, "4111-1111-1111-1111"),
780            (Category::Ssn, "123-45-6789"),
781            (Category::Hostname, "db-prod-01.internal"),
782            (Category::MacAddress, "AA:BB:CC:DD:EE:FF"),
783            (Category::ContainerId, "a1b2c3d4e5f6"),
784            (Category::Uuid, "550e8400-e29b-41d4-a716-446655440000"),
785            (Category::Jwt, "eyJhbGciOiJI.eyJzdWIiOiIx.SflKxwRJSMeK"),
786            (Category::AuthToken, "ghp_abc123secrettoken"),
787            (Category::FilePath, "/home/jsmith/config.yaml"),
788            (Category::WindowsSid, "S-1-5-21-3623811015-3361044348"),
789            (Category::Url, "https://internal.corp.com/api"),
790            (Category::AwsArn, "arn:aws:iam::123456789012:user/admin"),
791            (
792                Category::AzureResourceId,
793                "/subscriptions/550e8400/resourceGroups/rg-prod",
794            ),
795            (Category::Custom("key".into()), "some-secret-value-here"),
796        ];
797        for (cat, orig) in &cases {
798            let out = gen.generate(cat, orig);
799            assert_eq!(
800                out.len(),
801                orig.len(),
802                "length mismatch for {:?}: '{}' ({}) -> '{}' ({})",
803                cat,
804                orig,
805                orig.len(),
806                out,
807                out.len()
808            );
809        }
810    }
811
812    #[test]
813    fn mac_address_format() {
814        let gen = HmacGenerator::new([7u8; 32]);
815        let orig = "AA:BB:CC:DD:EE:FF";
816        let out = gen.generate(&Category::MacAddress, orig);
817        assert_eq!(out.len(), orig.len(), "mac must preserve length");
818        assert_eq!(
819            out.chars().filter(|c| *c == ':').count(),
820            5,
821            "mac must preserve colons"
822        );
823    }
824
825    #[test]
826    fn mac_address_dash_format() {
827        let gen = HmacGenerator::new([7u8; 32]);
828        let orig = "AA-BB-CC-DD-EE-FF";
829        let out = gen.generate(&Category::MacAddress, orig);
830        assert_eq!(out.len(), orig.len());
831        assert_eq!(out.chars().filter(|c| *c == '-').count(), 5);
832    }
833
834    #[test]
835    fn uuid_format() {
836        let gen = HmacGenerator::new([3u8; 32]);
837        let orig = "550e8400-e29b-41d4-a716-446655440000";
838        let out = gen.generate(&Category::Uuid, orig);
839        assert_eq!(out.len(), orig.len(), "uuid must preserve length");
840        assert_eq!(
841            out.chars().filter(|c| *c == '-').count(),
842            4,
843            "uuid must preserve dashes"
844        );
845    }
846
847    #[test]
848    fn container_id_format() {
849        let gen = HmacGenerator::new([5u8; 32]);
850        let orig = "a1b2c3d4e5f6";
851        let out = gen.generate(&Category::ContainerId, orig);
852        assert_eq!(out.len(), orig.len(), "container id must preserve length");
853        assert!(out.chars().all(|c| c.is_ascii_hexdigit()));
854    }
855
856    #[test]
857    fn jwt_format() {
858        let gen = HmacGenerator::new([11u8; 32]);
859        let orig = "eyJhbGciOiJI.eyJzdWIiOiIx.SflKxwRJSMeK";
860        let out = gen.generate(&Category::Jwt, orig);
861        assert_eq!(out.len(), orig.len(), "jwt must preserve length");
862        let orig_dots = orig.chars().filter(|c| *c == '.').count();
863        let out_dots = out.chars().filter(|c| *c == '.').count();
864        assert_eq!(out_dots, orig_dots, "jwt must preserve dots");
865    }
866
867    #[test]
868    fn auth_token_format() {
869        let gen = HmacGenerator::new([9u8; 32]);
870        let orig = "ghp_abc123secrettoken";
871        let out = gen.generate(&Category::AuthToken, orig);
872        assert!(out.starts_with("__SANITIZED_"));
873        assert!(out.ends_with("__"));
874        assert_eq!(out.len(), orig.len(), "auth_token must preserve length");
875    }
876
877    #[test]
878    fn filepath_unix_format() {
879        let gen = HmacGenerator::new([13u8; 32]);
880        let orig = "/home/jsmith/config.yaml";
881        let out = gen.generate(&Category::FilePath, orig);
882        assert_eq!(out.len(), orig.len(), "filepath must preserve length");
883        assert_eq!(
884            std::path::Path::new(&out)
885                .extension()
886                .and_then(|e| e.to_str()),
887            Some("yaml"),
888            "filepath must preserve extension"
889        );
890        assert_eq!(
891            out.chars().filter(|c| *c == '/').count(),
892            orig.chars().filter(|c| *c == '/').count(),
893            "filepath must preserve separators"
894        );
895    }
896
897    #[test]
898    fn filepath_windows_format() {
899        let gen = HmacGenerator::new([13u8; 32]);
900        let orig = "C:\\Users\\admin\\secrets.txt";
901        let out = gen.generate(&Category::FilePath, orig);
902        assert_eq!(out.len(), orig.len(), "filepath must preserve length");
903        assert_eq!(
904            std::path::Path::new(&out)
905                .extension()
906                .and_then(|e| e.to_str()),
907            Some("txt"),
908            "filepath must preserve extension"
909        );
910        assert_eq!(
911            out.chars().filter(|c| *c == '\\').count(),
912            orig.chars().filter(|c| *c == '\\').count(),
913            "filepath must preserve backslashes"
914        );
915    }
916
917    #[test]
918    fn windows_sid_format() {
919        let gen = HmacGenerator::new([7u8; 32]);
920        let orig = "S-1-5-21-3623811015-3361044348-30300820-1013";
921        let out = gen.generate(&Category::WindowsSid, orig);
922        assert_eq!(out.len(), orig.len(), "SID must preserve length");
923        assert!(out.starts_with("S-"), "SID must start with S-");
924        assert_eq!(
925            out.chars().filter(|c| *c == '-').count(),
926            orig.chars().filter(|c| *c == '-').count(),
927            "SID must preserve dashes"
928        );
929    }
930
931    #[test]
932    fn url_format() {
933        let gen = HmacGenerator::new([5u8; 32]);
934        let orig = "https://internal.corp.com/api/users?token=abc123";
935        let out = gen.generate(&Category::Url, orig);
936        assert_eq!(out.len(), orig.len(), "url must preserve length");
937        // Structural characters preserved.
938        assert!(out.contains("://"));
939        assert!(out.contains('?'));
940        assert!(out.contains('='));
941    }
942
943    #[test]
944    fn aws_arn_format() {
945        let gen = HmacGenerator::new([3u8; 32]);
946        let orig = "arn:aws:iam::123456789012:user/admin";
947        let out = gen.generate(&Category::AwsArn, orig);
948        assert_eq!(out.len(), orig.len(), "ARN must preserve length");
949        assert_eq!(
950            out.chars().filter(|c| *c == ':').count(),
951            orig.chars().filter(|c| *c == ':').count(),
952            "ARN must preserve colons"
953        );
954        assert!(out.contains('/'), "ARN must preserve slash");
955    }
956
957    #[test]
958    fn azure_resource_id_format() {
959        let gen = HmacGenerator::new([11u8; 32]);
960        let orig = "/subscriptions/550e8400-e29b/resourceGroups/rg-prod/providers/Microsoft.Compute/virtualMachines/vm-01";
961        let out = gen.generate(&Category::AzureResourceId, orig);
962        assert_eq!(
963            out.len(),
964            orig.len(),
965            "Azure resource ID must preserve length"
966        );
967        assert!(
968            out.contains("/subscriptions/"),
969            "must preserve 'subscriptions'"
970        );
971        assert!(
972            out.contains("/resourceGroups/"),
973            "must preserve 'resourceGroups'"
974        );
975        assert!(out.contains("/providers/"), "must preserve 'providers'");
976        assert!(
977            out.contains("Microsoft.Compute"),
978            "must preserve dotted provider name"
979        );
980    }
981
982    #[test]
983    fn azure_dotted_segment_outside_providers_is_replaced() {
984        let gen = HmacGenerator::new([11u8; 32]);
985        // A dotted segment that is NOT immediately after `providers/` must be
986        // treated as a variable component and replaced, not passed through.
987        // Before the fix, part.contains('.') caused this to be preserved.
988        let orig = "/subscriptions/10.0.0.1/resourceGroups/rg-prod";
989        let out = gen.generate(&Category::AzureResourceId, orig);
990        assert_eq!(out.len(), orig.len(), "length must be preserved");
991        assert!(out.contains("/subscriptions/"), "subscriptions preserved");
992        assert!(out.contains("/resourceGroups/"), "resourceGroups preserved");
993        assert!(
994            !out.contains("10.0.0.1"),
995            "dotted non-provider segment must be replaced, got: {out}"
996        );
997        assert!(
998            !out.contains("rg-prod"),
999            "variable resource group name must be replaced, got: {out}"
1000        );
1001    }
1002}