Skip to main content

sanitize_engine/
generator.rs

//! Replacement generation strategies.
//!
//! Two concrete implementations:
//! - `HmacGenerator`: Deterministic, seeded with a 32-byte key. Same seed + same
//!   input = same output across runs. Uses HMAC-SHA256 keyed with the seed; a
//!   per-category tag mixed into the MAC input provides domain separation.
//! - `RandomGenerator`: Cryptographically random replacements. Non-deterministic.
//!
//! Both produce category-aware, format-preserving replacements.
//!
//! # Design Note
//!
//! This module contains the category-aware formatters used by the CLI binary.
//! For an extensible strategy API that allows custom replacement logic, see
//! the [`crate::strategy`] module.
15
16use crate::category::Category;
17use hmac::{Hmac, Mac};
18use rand::Rng;
19use sha2::Sha256;
20use zeroize::Zeroize;
21
22// ---------------------------------------------------------------------------
23// Trait
24// ---------------------------------------------------------------------------
25
26/// Strategy for generating a sanitized replacement value.
27///
28/// Implementations MUST be deterministic to their inputs: given the same
29/// `(category, original)` pair (and same internal state / seed), the output
30/// must be identical. This is what enables per-run consistency when backed
31/// by a `MappingStore` that calls `generate` only once per unique value.
32pub trait ReplacementGenerator: Send + Sync {
33    /// Produce a sanitized replacement for `original` classified as `category`.
34    fn generate(&self, category: &Category, original: &str) -> String;
35}
36
37// ---------------------------------------------------------------------------
38// HMAC-SHA256 deterministic generator
39// ---------------------------------------------------------------------------
40
41/// Deterministic replacement generator seeded with a 32-byte key.
42///
43/// ```text
44/// replacement = format(category, HMAC-SHA256(key, category_tag || "\x00" || original))
45/// ```
46///
47/// The same key + same `(category, original)` always yields the same output.
48/// Different keys yield completely different outputs with overwhelming probability.
49pub struct HmacGenerator {
50    key: [u8; 32],
51}
52
53impl Drop for HmacGenerator {
54    fn drop(&mut self) {
55        self.key.zeroize();
56    }
57}
58
59impl HmacGenerator {
60    /// Create a new generator from a 32-byte seed.
61    #[must_use]
62    pub fn new(key: [u8; 32]) -> Self {
63        Self { key }
64    }
65
66    /// Create a generator from a byte slice (must be exactly 32 bytes).
67    ///
68    /// # Errors
69    ///
70    /// Returns [`SanitizeError::InvalidSeedLength`](crate::error::SanitizeError::InvalidSeedLength) if `bytes.len() != 32`.
71    pub fn from_slice(bytes: &[u8]) -> crate::error::Result<Self> {
72        if bytes.len() != 32 {
73            return Err(crate::error::SanitizeError::InvalidSeedLength(bytes.len()));
74        }
75        let mut key = [0u8; 32];
76        key.copy_from_slice(bytes);
77        Ok(Self { key })
78    }
79
80    /// Derive the raw 32-byte HMAC digest for `(category, original)`.
81    fn derive(&self, category: &Category, original: &str) -> [u8; 32] {
82        type HmacSha256 = Hmac<Sha256>;
83        let mut mac = HmacSha256::new_from_slice(&self.key).expect("HMAC accepts any key length");
84        let tag = category.domain_tag_hmac();
85        mac.update(tag.as_bytes());
86        mac.update(b"\x00"); // domain separator
87        mac.update(original.as_bytes());
88        let result = mac.finalize();
89        let mut out = [0u8; 32];
90        out.copy_from_slice(&result.into_bytes());
91        out
92    }
93}
94
95impl ReplacementGenerator for HmacGenerator {
96    fn generate(&self, category: &Category, original: &str) -> String {
97        let hash = self.derive(category, original);
98        format_replacement(category, &hash, original)
99    }
100}
101
102// ---------------------------------------------------------------------------
103// Cryptographically-random generator (non-deterministic)
104// ---------------------------------------------------------------------------
105
/// Non-deterministic replacement generator backed by `rand::rng()`.
///
/// Every call to `generate` draws new random bytes, so repeated calls with
/// the same arguments give different results. Stable mappings are obtained
/// externally: the `MappingStore` calls `generate` only once per unique
/// `(category, original)` pair and caches the result.
pub struct RandomGenerator;

impl RandomGenerator {
    /// Create a generator. The type carries no state.
    #[must_use]
    pub fn new() -> Self {
        RandomGenerator
    }
}

impl Default for RandomGenerator {
    /// Same as [`RandomGenerator::new`].
    fn default() -> Self {
        RandomGenerator::new()
    }
}
125
126impl ReplacementGenerator for RandomGenerator {
127    fn generate(&self, category: &Category, original: &str) -> String {
128        let mut rng = rand::rng();
129        let mut hash = [0u8; 32];
130        rng.fill(&mut hash);
131        format_replacement(category, &hash, original)
132    }
133}
134
135// ---------------------------------------------------------------------------
136// Category-aware formatting helpers
137// ---------------------------------------------------------------------------
138
139/// Format a 32-byte hash into a length-preserving replacement whose
140/// byte length exactly matches `original.len()`. The shape is
141/// category-aware and deterministic for the same `(hash, original)` pair.
142fn format_replacement(category: &Category, hash: &[u8; 32], original: &str) -> String {
143    let target = original.len();
144    if target == 0 {
145        return String::new();
146    }
147    match category {
148        Category::Email => format_email_lp(hash, original, target),
149        Category::Name => format_name_lp(hash, target),
150        Category::Phone | Category::CreditCard | Category::IpV4 => {
151            format_digits_lp(hash, original, target)
152        }
153        Category::IpV6 | Category::MacAddress | Category::Uuid | Category::ContainerId => {
154            format_hex_digits_lp(hash, original, target)
155        }
156        Category::Ssn => format_ssn_lp(hash, original, target),
157        Category::Hostname => format_hostname_lp(hash, original, target),
158        Category::Jwt => format_jwt_lp(hash, original, target),
159        Category::FilePath => format_filepath_lp(hash, original, target),
160        Category::WindowsSid => format_windows_sid_lp(hash, original, target),
161        Category::Url => format_url_lp(hash, original, target),
162        Category::AwsArn => format_arn_lp(hash, original, target),
163        Category::AzureResourceId => format_azure_resource_id_lp(hash, original, target),
164        Category::AuthToken | Category::Custom(_) => format_custom_lp(hash, target),
165    }
166}
167
168// ---------------------------------------------------------------------------
169// Length-preserving helpers
170// ---------------------------------------------------------------------------
171
/// Truncate `s`, or pad it with deterministic lowercase hex characters
/// derived from `hash`, so that the result is exactly `target` bytes long.
///
/// * `s.len() == target`: `s` is returned unchanged.
/// * `s.len() > target`: `s` is cut to the longest prefix that fits in
///   `target` bytes and ends on a UTF-8 character boundary; if the cut had to
///   back off from the middle of a multi-byte character, the freed bytes are
///   filled with hex padding, so the length is still exactly `target`.
/// * `s.len() < target`: hex padding is appended.
///
/// The padding is the 64-character hex encoding of `hash` (high nibble
/// first), repeated as often as needed and always starting at its first
/// character.
fn pad_or_truncate(s: &str, target: usize, hash: &[u8; 32]) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    // Slicing a `str` off a character boundary panics, so walk back to the
    // nearest boundary. Offset 0 is always a boundary, so this terminates.
    let mut keep = s.len().min(target);
    while !s.is_char_boundary(keep) {
        keep -= 1;
    }

    let mut buf = String::with_capacity(target);
    buf.push_str(&s[..keep]);
    for i in 0..target - keep {
        // Character `j` of the hex encoding is a nibble of byte `j / 2`:
        // even positions hold the high nibble, odd positions the low one.
        let j = i % 64;
        let byte = hash[j / 2];
        let nibble = if j % 2 == 0 { byte >> 4 } else { byte & 0x0f };
        buf.push(char::from(HEX_DIGITS[usize::from(nibble)]));
    }
    buf
}
193
194/// Length-preserving email replacement.
195/// Preserves the domain from the original; generates a hex username
196/// sized so the total byte length matches the original.
197fn format_email_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
198    let domain = original
199        .rfind('@')
200        .map_or("x.co", |pos| &original[pos + 1..]);
201    let at_domain = 1 + domain.len(); // "@" + domain
202    if target <= at_domain {
203        // Too short to fit @domain — use hex fallback.
204        return pad_or_truncate("", target, hash);
205    }
206    let user_len = target - at_domain;
207    let hex = hex_encode(hash);
208    let hex_bytes = hex.as_bytes();
209    let mut buf = String::with_capacity(target);
210    for i in 0..user_len {
211        buf.push(hex_bytes[i % 64] as char);
212    }
213    buf.push('@');
214    buf.push_str(domain);
215    buf
216}
217
218/// Length-preserving name replacement.
219/// Generates a synthetic name via the hash-indexed table, then
220/// truncates or pads to match `target` bytes.
221fn format_name_lp(hash: &[u8; 32], target: usize) -> String {
222    let raw = format_name(hash);
223    pad_or_truncate(&raw, target, hash)
224}
225
/// Substitute every character accepted by `is_replaceable` with
/// `replacement(original_char, hash[k % 32])`, where `k` counts the
/// substitutions made so far. Characters that are not accepted are copied
/// through unchanged.
///
/// Returns `None` when nothing was substituted, so the caller can choose a
/// fallback.
fn format_char_class_lp(
    hash: &[u8; 32],
    original: &str,
    is_replaceable: impl Fn(char) -> bool,
    replacement: impl Fn(char, u8) -> char,
) -> Option<String> {
    let mut substitutions = 0usize;
    let out: String = original
        .chars()
        .map(|ch| {
            if !is_replaceable(ch) {
                return ch;
            }
            let substitute = replacement(ch, hash[substitutions % 32]);
            substitutions += 1;
            substitute
        })
        .collect();

    if substitutions > 0 {
        Some(out)
    } else {
        None
    }
}
250
251/// Length-preserving digit replacement.
252/// Preserves every non-digit character in `original`; replaces each
253/// ASCII digit with a deterministic digit derived from `hash`.
254/// Falls back to hex if the original contains no digits.
255fn format_digits_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
256    format_char_class_lp(
257        hash,
258        original,
259        |c| c.is_ascii_digit(),
260        |_, b| (b'0' + b % 10) as char,
261    )
262    .unwrap_or_else(|| pad_or_truncate("", target, hash))
263}
264
265/// Length-preserving hex-digit replacement (for IPv6, UUID, MAC, container ID).
266/// Preserves non-hex characters (colons, dashes, etc.); replaces each
267/// ASCII hex digit with a deterministic hex digit from `hash`, preserving case.
268fn format_hex_digits_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
269    format_char_class_lp(
270        hash,
271        original,
272        |c| c.is_ascii_hexdigit(),
273        |ch, b| {
274            let nibble = b % 16;
275            if ch.is_ascii_uppercase() {
276                b"0123456789ABCDEF"[nibble as usize] as char
277            } else {
278                b"0123456789abcdef"[nibble as usize] as char
279            }
280        },
281    )
282    .unwrap_or_else(|| pad_or_truncate("", target, hash))
283}
284
285/// Length-preserving SSN replacement.
286/// Preserves all non-digit characters.  The first three digit positions
287/// are forced to '0' (never-issued area code, clearly synthetic).
288/// Remaining digit positions are filled with deterministic digits.
289fn format_ssn_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
290    let has_digit = original.chars().any(|c| c.is_ascii_digit());
291    if !has_digit {
292        return pad_or_truncate("", target, hash);
293    }
294    let mut buf = String::with_capacity(target);
295    let mut digit_idx = 0usize;
296    for ch in original.chars() {
297        if ch.is_ascii_digit() {
298            if digit_idx < 3 {
299                buf.push('0');
300            } else {
301                buf.push((b'0' + hash[(digit_idx - 3) % 32] % 10) as char);
302            }
303            digit_idx += 1;
304        } else {
305            buf.push(ch);
306        }
307    }
308    buf
309}
310
311/// Length-preserving hostname replacement.
312/// Preserves the suffix (everything from the first `.` onward) and
313/// fills the prefix with deterministic hex characters to match `target`.
314fn format_hostname_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
315    let suffix = original.find('.').map_or("", |p| &original[p..]);
316    let prefix_len = target.saturating_sub(suffix.len());
317    if prefix_len == 0 {
318        return pad_or_truncate("", target, hash);
319    }
320    let hex = hex_encode(hash);
321    let hex_bytes = hex.as_bytes();
322    let mut buf = String::with_capacity(target);
323    for i in 0..prefix_len {
324        buf.push(hex_bytes[i % 64] as char);
325    }
326    buf.push_str(suffix);
327    buf
328}
329
330/// Length-preserving custom replacement.
331/// Uses `__SANITIZED_<hex>__` format when the target is long enough;
332/// falls back to bare hex for short targets.
333fn format_custom_lp(hash: &[u8; 32], target: usize) -> String {
334    let prefix = "__SANITIZED_";
335    let suffix = "__";
336    let overhead = prefix.len() + suffix.len(); // 14
337    let hex = hex_encode(hash);
338    if target <= overhead {
339        return pad_or_truncate("", target, hash);
340    }
341    let hex_len = target - overhead;
342    let hex_bytes = hex.as_bytes();
343    let mut buf = String::with_capacity(target);
344    buf.push_str(prefix);
345    for i in 0..hex_len {
346        buf.push(hex_bytes[i % 64] as char);
347    }
348    buf.push_str(suffix);
349    buf
350}
351
352/// Length-preserving JWT replacement.
353/// Preserves `.` separators; replaces base64url characters
354/// (`[A-Za-z0-9_-]`) with deterministic base64url characters.
355fn format_jwt_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
356    const B64URL: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-";
357    let mut buf = String::with_capacity(target);
358    let mut hi = 0usize;
359    let mut had_b64 = false;
360    for ch in original.chars() {
361        if ch == '.' || ch == '=' {
362            buf.push(ch);
363        } else if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
364            buf.push(B64URL[hash[hi % 32] as usize % B64URL.len()] as char);
365            hi += 1;
366            had_b64 = true;
367        } else {
368            // Non-base64url, non-structural: emit byte-preserving replacement.
369            for _ in 0..ch.len_utf8() {
370                buf.push(B64URL[hash[hi % 32] as usize % B64URL.len()] as char);
371                hi += 1;
372            }
373            had_b64 = true;
374        }
375    }
376    if !had_b64 {
377        return pad_or_truncate("", target, hash);
378    }
379    buf
380}
381
382/// Length-preserving file path replacement.
383/// Preserves separators (`/`, `\`) and the final extension (from last `.`
384/// in the last segment). Replaces other characters with deterministic hex.
385fn format_filepath_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
386    // Find the last path separator position to identify the filename segment.
387    let last_sep = original.rfind(['/', '\\']).map_or(0, |p| p + 1);
388    let filename = &original[last_sep..];
389    // Find extension in the filename (last `.` that isn't at position 0).
390    let ext_start = filename.rfind('.').filter(|&p| p > 0).map(|p| last_sep + p);
391
392    let hex = hex_encode(hash);
393    let hex_bytes = hex.as_bytes();
394    let mut buf = String::with_capacity(target);
395    let mut hi = 0usize;
396
397    for (i, ch) in original.char_indices() {
398        if matches!(ch, '/' | '\\') || ext_start.is_some_and(|es| i >= es) {
399            // Preserve separators and the file extension.
400            buf.push(ch);
401        } else {
402            // Emit as many ASCII hex bytes as the original char's UTF-8 length.
403            for _ in 0..ch.len_utf8() {
404                buf.push(hex_bytes[hi % 64] as char);
405                hi += 1;
406            }
407        }
408    }
409    // Ensure exact length (should be equal for ASCII, but guard anyway).
410    if buf.len() != target {
411        return pad_or_truncate(&buf, target, hash);
412    }
413    buf
414}
415
416/// Length-preserving Windows SID replacement.
417/// Preserves the `S-` prefix and `-` separators; replaces digit groups
418/// with deterministic digits.
419fn format_windows_sid_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
420    let has_digit = original.chars().any(|c| c.is_ascii_digit());
421    if !has_digit {
422        return pad_or_truncate("", target, hash);
423    }
424    let mut buf = String::with_capacity(target);
425    let mut hi = 0usize;
426    for ch in original.chars() {
427        if ch == 'S' || ch == '-' {
428            buf.push(ch);
429        } else if ch.is_ascii_digit() {
430            buf.push((b'0' + hash[hi % 32] % 10) as char);
431            hi += 1;
432        } else {
433            // Non-digit, non-structural: emit byte-count-preserving hex.
434            for _ in 0..ch.len_utf8() {
435                buf.push((b'0' + hash[hi % 32] % 10) as char);
436                hi += 1;
437            }
438        }
439    }
440    buf
441}
442
443/// Shared core for length-preserving hex replacement where a caller-supplied
444/// predicate identifies "structural" characters to preserve as-is.
445///
446/// All non-structural characters are replaced byte-by-byte with deterministic
447/// hex characters derived from `hash`.  Returns `None` if the original
448/// contained no replaceable content (caller should fall back to
449/// [`pad_or_truncate`]).
450fn format_preserving_hex_lp(
451    hash: &[u8; 32],
452    original: &str,
453    target: usize,
454    is_structural: impl Fn(char) -> bool,
455) -> Option<String> {
456    let hex = hex_encode(hash);
457    let hex_bytes = hex.as_bytes();
458    let mut buf = String::with_capacity(target);
459    let mut hi = 0usize;
460    let mut had_content = false;
461
462    for ch in original.chars() {
463        if is_structural(ch) {
464            buf.push(ch);
465        } else {
466            for _ in 0..ch.len_utf8() {
467                buf.push(hex_bytes[hi % 64] as char);
468                hi += 1;
469            }
470            had_content = true;
471        }
472    }
473
474    had_content.then_some(buf)
475}
476
477/// Length-preserving URL replacement.
478/// Preserves scheme prefix and structural characters
479/// (`://`, `/`, `?`, `=`, `&`, `#`, `:`); replaces content characters
480/// with deterministic hex.
481fn format_url_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
482    format_preserving_hex_lp(hash, original, target, |ch| "/:?=&#@.".contains(ch))
483        .unwrap_or_else(|| pad_or_truncate("", target, hash))
484}
485
486/// Length-preserving AWS ARN replacement.
487/// Preserves `:` and `/` separators; replaces alphanumeric content
488/// in account/resource segments with deterministic hex.
489fn format_arn_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
490    format_preserving_hex_lp(hash, original, target, |ch| ch == ':' || ch == '/')
491        .unwrap_or_else(|| pad_or_truncate("", target, hash))
492}
493
494/// Length-preserving Azure Resource ID replacement.
495/// Preserves `/` path separators and well-known Azure segment names
496/// (`subscriptions`, `resourceGroups`, `providers`, `resourcegroups`).
497/// Replaces variable segments (IDs, names) with deterministic hex.
498fn format_azure_resource_id_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
499    const KNOWN_SEGMENTS: &[&str] = &[
500        "subscriptions",
501        "resourceGroups",
502        "resourcegroups",
503        "providers",
504    ];
505
506    let hex = hex_encode(hash);
507    let hex_bytes = hex.as_bytes();
508    let mut buf = String::with_capacity(target);
509    let mut hi = 0usize;
510
511    // Split on `/`, rebuild with deterministic replacement for non-known segments.
512    let parts: Vec<&str> = original.split('/').collect();
513    for (pi, part) in parts.iter().enumerate() {
514        if pi > 0 {
515            buf.push('/');
516        }
517        if part.is_empty() || KNOWN_SEGMENTS.contains(part) || part.contains('.') {
518            // Preserve empty segments (leading `/`), known names, and
519            // dotted provider names like `Microsoft.Compute`.
520            buf.push_str(part);
521        } else {
522            // Replace this segment character-by-character to preserve byte length.
523            for ch in part.chars() {
524                for _ in 0..ch.len_utf8() {
525                    buf.push(hex_bytes[hi % 64] as char);
526                    hi += 1;
527                }
528            }
529        }
530    }
531    if buf.len() != target {
532        return pad_or_truncate(&buf, target, hash);
533    }
534    buf
535}
536
/// Build a synthetic "First Last" name selected by the first two hash bytes.
fn format_name(hash: &[u8; 32]) -> String {
    // Small fixed tables of name fragments. The result is NOT meant to be
    // realistic: it should be obviously synthetic while still having the
    // shape of a name.
    const FIRST: &[&str] = &[
        "Alex", "Blake", "Casey", "Dana", "Ellis", "Finley", "Gray", "Harper",
        "Ira", "Jordan", "Kai", "Lane", "Morgan", "Noel", "Oakley", "Parker",
        "Quinn", "Reese", "Sage", "Taylor", "Uri", "Val", "Wren", "Xen",
        "Yael", "Zion", "Arden", "Blair", "Corin", "Drew", "Emery", "Frost",
    ];
    const LAST: &[&str] = &[
        "Ashford", "Blackwell", "Crawford", "Dalton", "Eastwood", "Fairbanks",
        "Garrison", "Hartley", "Irvine", "Jensen", "Kendrick", "Langley",
        "Mercer", "Newland", "Oakwood", "Preston", "Quinlan", "Redmond",
        "Shepard", "Thornton", "Underwood", "Vance", "Whitmore", "Xavier",
        "Yardley", "Zimmer", "Ashton", "Beckett", "Calloway", "Dempsey",
        "Eldridge", "Fletcher",
    ];

    // Byte 0 selects the first name, byte 1 the last name.
    let given = FIRST[usize::from(hash[0]) % FIRST.len()];
    let family = LAST[usize::from(hash[1]) % LAST.len()];
    [given, family].join(" ")
}
586
/// Encode 32 bytes as a 64-character lowercase hex string, two digits per
/// byte with the high nibble first.
fn hex_encode(bytes: &[u8; 32]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut out = String::with_capacity(64);
    for &byte in bytes {
        out.push(char::from(DIGITS[usize::from(byte >> 4)]));
        out.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    out
}
596
597// ---------------------------------------------------------------------------
598// Tests
599// ---------------------------------------------------------------------------
600
601#[cfg(test)]
602mod tests {
603    use super::*;
604
/// One generator, identical arguments twice: the outputs must agree.
#[test]
fn hmac_deterministic_same_input() {
    let generator = HmacGenerator::new([42u8; 32]);
    let first = generator.generate(&Category::Email, "alice@corp.com");
    let second = generator.generate(&Category::Email, "alice@corp.com");
    assert_eq!(first, second, "same seed + same input must produce same output");
}
612
/// Same seed and category, different originals: the outputs must differ.
#[test]
fn hmac_different_inputs_differ() {
    let generator = HmacGenerator::new([42u8; 32]);
    let for_alice = generator.generate(&Category::Email, "alice@corp.com");
    let for_bob = generator.generate(&Category::Email, "bob@corp.com");
    assert_ne!(for_alice, for_bob);
}
620
/// Same input under two different seeds: the outputs must differ.
#[test]
fn hmac_different_seeds_differ() {
    let with_seed_one = HmacGenerator::new([1u8; 32]);
    let with_seed_two = HmacGenerator::new([2u8; 32]);
    let input = "alice@corp.com";
    assert_ne!(
        with_seed_one.generate(&Category::Email, input),
        with_seed_two.generate(&Category::Email, input)
    );
}
629
/// Same seed and original, different categories: the outputs must differ.
#[test]
fn hmac_different_categories_differ() {
    let generator = HmacGenerator::new([42u8; 32]);
    let as_email = generator.generate(&Category::Email, "test");
    let as_name = generator.generate(&Category::Name, "test");
    assert_ne!(as_email, as_name, "different categories must produce different outputs");
}
637
/// Email replacements keep the `@`, the domain and the byte length.
#[test]
fn email_format() {
    let original = "alice@corp.com";
    let replaced = HmacGenerator::new([0u8; 32]).generate(&Category::Email, original);
    assert!(replaced.contains('@'), "email must contain @");
    assert!(replaced.ends_with("@corp.com"), "email must preserve domain");
    assert_eq!(replaced.len(), original.len(), "email must preserve length");
}
647
/// IPv4 replacements keep the three dots (four groups) and the byte length.
#[test]
fn ipv4_format() {
    let original = "192.168.1.1";
    let replaced = HmacGenerator::new([0u8; 32]).generate(&Category::IpV4, original);
    let group_count = replaced.split('.').count();
    assert_eq!(group_count, 4);
    assert_eq!(replaced.len(), original.len(), "ipv4 must preserve length");
}
658
/// SSN replacements start with the synthetic `000-` area and keep the length.
#[test]
fn ssn_format() {
    let original = "123-45-6789";
    let replaced = HmacGenerator::new([7u8; 32]).generate(&Category::Ssn, original);
    assert!(replaced.starts_with("000-"), "SSN must start with 000");
    assert_eq!(replaced.len(), original.len(), "SSN must preserve length");
}
667
/// Phone replacements keep the leading `+`, every dash and the byte length.
#[test]
fn phone_format() {
    let original = "+1-212-555-0100";
    let replaced = HmacGenerator::new([3u8; 32]).generate(&Category::Phone, original);
    let dashes = |text: &str| text.matches('-').count();

    assert!(replaced.starts_with('+'));
    assert_eq!(dashes(&replaced), dashes(original), "dashes must be preserved");
    assert_eq!(replaced.len(), original.len(), "phone must preserve length");
}
682
/// Hostname replacements keep the dotted suffix and the byte length.
#[test]
fn hostname_format() {
    let original = "db-prod-01.internal";
    let replaced = HmacGenerator::new([5u8; 32]).generate(&Category::Hostname, original);
    assert!(replaced.ends_with(".internal"), "hostname must preserve suffix");
    assert_eq!(replaced.len(), original.len(), "hostname must preserve length");
}
691
/// Custom values longer than the 14-byte wrapper get the
/// `__SANITIZED_..__` form at the original length.
#[test]
fn custom_format() {
    let category = Category::Custom("api_key".into());
    // Long enough (>14 bytes) for the wrapper to fit.
    let original = "sk-abc123-very-long-key";
    let replaced = HmacGenerator::new([9u8; 32]).generate(&category, original);
    assert!(replaced.starts_with("__SANITIZED_"));
    assert!(replaced.ends_with("__"));
    assert_eq!(replaced.len(), original.len(), "custom must preserve length");
}
703
/// Custom values too short for the wrapper still keep their byte length.
#[test]
fn custom_format_short() {
    let category = Category::Custom("api_key".into());
    // Nine bytes: shorter than the wrapper, so the hex fallback is used.
    let original = "sk-abc123";
    let replaced = HmacGenerator::new([9u8; 32]).generate(&category, original);
    assert_eq!(
        replaced.len(),
        original.len(),
        "custom must preserve length even for short inputs"
    );
}
717
/// The random generator produces email-shaped output of the original length.
#[test]
fn random_generator_produces_valid_format() {
    let original = "test@example.com";
    let replaced = RandomGenerator::new().generate(&Category::Email, original);
    assert!(replaced.contains('@'));
    assert_eq!(
        replaced.len(),
        original.len(),
        "random generator must preserve length"
    );
}
730
/// A 16-byte seed is rejected by `from_slice`.
#[test]
fn from_slice_rejects_bad_length() {
    let short_seed = [0u8; 16];
    assert!(HmacGenerator::from_slice(&short_seed).is_err());
}
736
/// Credit card replacements keep the `####-####-####-####` shape.
#[test]
fn credit_card_format() {
    let original = "4111-1111-1111-1111";
    let replaced = HmacGenerator::new([11u8; 32]).generate(&Category::CreditCard, original);

    let groups: Vec<&str> = replaced.split('-').collect();
    assert_eq!(groups.len(), 4);
    for group in groups.iter() {
        assert_eq!(group.len(), 4);
        assert!(group.bytes().all(|b| b.is_ascii_digit()));
    }
    assert_eq!(replaced.len(), original.len(), "credit card must preserve length");
}
751
/// Name replacements keep the byte length of the original.
#[test]
fn name_format() {
    let original = "John Doe";
    let replaced = HmacGenerator::new([0u8; 32]).generate(&Category::Name, original);
    assert_eq!(replaced.len(), original.len(), "name must preserve length");
}
759
/// IPv6 replacements keep every colon (including `::`) and the byte length.
#[test]
fn ipv6_format() {
    let original = "fd00:abcd:1234:5678::1";
    let replaced = HmacGenerator::new([0u8; 32]).generate(&Category::IpV6, original);
    let colons = |text: &str| text.matches(':').count();

    assert_eq!(colons(&replaced), colons(original), "colons must be preserved");
    assert_eq!(replaced.len(), original.len(), "ipv6 must preserve length");
}
773
/// For one sample per category, the replacement has the same byte length as
/// the original.
#[test]
fn length_preserved_all_categories() {
    let generator = HmacGenerator::new([42u8; 32]);
    let samples = [
        (Category::Email, "alice@corp.com"),
        (Category::Name, "John Doe"),
        (Category::Phone, "+1-212-555-0100"),
        (Category::IpV4, "192.168.1.1"),
        (Category::IpV6, "fd00::1"),
        (Category::CreditCard, "4111-1111-1111-1111"),
        (Category::Ssn, "123-45-6789"),
        (Category::Hostname, "db-prod-01.internal"),
        (Category::MacAddress, "AA:BB:CC:DD:EE:FF"),
        (Category::ContainerId, "a1b2c3d4e5f6"),
        (Category::Uuid, "550e8400-e29b-41d4-a716-446655440000"),
        (Category::Jwt, "eyJhbGciOiJI.eyJzdWIiOiIx.SflKxwRJSMeK"),
        (Category::AuthToken, "ghp_abc123secrettoken"),
        (Category::FilePath, "/home/jsmith/config.yaml"),
        (Category::WindowsSid, "S-1-5-21-3623811015-3361044348"),
        (Category::Url, "https://internal.corp.com/api"),
        (Category::AwsArn, "arn:aws:iam::123456789012:user/admin"),
        (
            Category::AzureResourceId,
            "/subscriptions/550e8400/resourceGroups/rg-prod",
        ),
        (Category::Custom("key".into()), "some-secret-value-here"),
    ];

    for (cat, orig) in samples.iter() {
        let out = generator.generate(cat, orig);
        assert_eq!(
            out.len(),
            orig.len(),
            "length mismatch for {:?}: '{}' ({}) -> '{}' ({})",
            cat,
            orig,
            orig.len(),
            out,
            out.len()
        );
    }
}
815
/// Colon-separated MAC replacements keep the five colons and the length.
#[test]
fn mac_address_format() {
    let original = "AA:BB:CC:DD:EE:FF";
    let replaced = HmacGenerator::new([7u8; 32]).generate(&Category::MacAddress, original);
    assert_eq!(replaced.len(), original.len(), "mac must preserve length");
    assert_eq!(replaced.matches(':').count(), 5, "mac must preserve colons");
}
828
/// Dash-separated MAC replacements keep the five dashes and the length.
#[test]
fn mac_address_dash_format() {
    let original = "AA-BB-CC-DD-EE-FF";
    let replaced = HmacGenerator::new([7u8; 32]).generate(&Category::MacAddress, original);
    assert_eq!(replaced.len(), original.len());
    assert_eq!(replaced.matches('-').count(), 5);
}
837
838    #[test]
839    fn uuid_format() {
840        let gen = HmacGenerator::new([3u8; 32]);
841        let orig = "550e8400-e29b-41d4-a716-446655440000";
842        let out = gen.generate(&Category::Uuid, orig);
843        assert_eq!(out.len(), orig.len(), "uuid must preserve length");
844        assert_eq!(
845            out.chars().filter(|c| *c == '-').count(),
846            4,
847            "uuid must preserve dashes"
848        );
849    }
850
851    #[test]
852    fn container_id_format() {
853        let gen = HmacGenerator::new([5u8; 32]);
854        let orig = "a1b2c3d4e5f6";
855        let out = gen.generate(&Category::ContainerId, orig);
856        assert_eq!(out.len(), orig.len(), "container id must preserve length");
857        assert!(out.chars().all(|c| c.is_ascii_hexdigit()));
858    }
859
860    #[test]
861    fn jwt_format() {
862        let gen = HmacGenerator::new([11u8; 32]);
863        let orig = "eyJhbGciOiJI.eyJzdWIiOiIx.SflKxwRJSMeK";
864        let out = gen.generate(&Category::Jwt, orig);
865        assert_eq!(out.len(), orig.len(), "jwt must preserve length");
866        let orig_dots = orig.chars().filter(|c| *c == '.').count();
867        let out_dots = out.chars().filter(|c| *c == '.').count();
868        assert_eq!(out_dots, orig_dots, "jwt must preserve dots");
869    }
870
871    #[test]
872    fn auth_token_format() {
873        let gen = HmacGenerator::new([9u8; 32]);
874        let orig = "ghp_abc123secrettoken";
875        let out = gen.generate(&Category::AuthToken, orig);
876        assert!(out.starts_with("__SANITIZED_"));
877        assert!(out.ends_with("__"));
878        assert_eq!(out.len(), orig.len(), "auth_token must preserve length");
879    }
880
881    #[test]
882    fn filepath_unix_format() {
883        let gen = HmacGenerator::new([13u8; 32]);
884        let orig = "/home/jsmith/config.yaml";
885        let out = gen.generate(&Category::FilePath, orig);
886        assert_eq!(out.len(), orig.len(), "filepath must preserve length");
887        assert_eq!(
888            std::path::Path::new(&out)
889                .extension()
890                .and_then(|e| e.to_str()),
891            Some("yaml"),
892            "filepath must preserve extension"
893        );
894        assert_eq!(
895            out.chars().filter(|c| *c == '/').count(),
896            orig.chars().filter(|c| *c == '/').count(),
897            "filepath must preserve separators"
898        );
899    }
900
901    #[test]
902    fn filepath_windows_format() {
903        let gen = HmacGenerator::new([13u8; 32]);
904        let orig = "C:\\Users\\admin\\secrets.txt";
905        let out = gen.generate(&Category::FilePath, orig);
906        assert_eq!(out.len(), orig.len(), "filepath must preserve length");
907        assert_eq!(
908            std::path::Path::new(&out)
909                .extension()
910                .and_then(|e| e.to_str()),
911            Some("txt"),
912            "filepath must preserve extension"
913        );
914        assert_eq!(
915            out.chars().filter(|c| *c == '\\').count(),
916            orig.chars().filter(|c| *c == '\\').count(),
917            "filepath must preserve backslashes"
918        );
919    }
920
921    #[test]
922    fn windows_sid_format() {
923        let gen = HmacGenerator::new([7u8; 32]);
924        let orig = "S-1-5-21-3623811015-3361044348-30300820-1013";
925        let out = gen.generate(&Category::WindowsSid, orig);
926        assert_eq!(out.len(), orig.len(), "SID must preserve length");
927        assert!(out.starts_with("S-"), "SID must start with S-");
928        assert_eq!(
929            out.chars().filter(|c| *c == '-').count(),
930            orig.chars().filter(|c| *c == '-').count(),
931            "SID must preserve dashes"
932        );
933    }
934
935    #[test]
936    fn url_format() {
937        let gen = HmacGenerator::new([5u8; 32]);
938        let orig = "https://internal.corp.com/api/users?token=abc123";
939        let out = gen.generate(&Category::Url, orig);
940        assert_eq!(out.len(), orig.len(), "url must preserve length");
941        // Structural characters preserved.
942        assert!(out.contains("://"));
943        assert!(out.contains('?'));
944        assert!(out.contains('='));
945    }
946
947    #[test]
948    fn aws_arn_format() {
949        let gen = HmacGenerator::new([3u8; 32]);
950        let orig = "arn:aws:iam::123456789012:user/admin";
951        let out = gen.generate(&Category::AwsArn, orig);
952        assert_eq!(out.len(), orig.len(), "ARN must preserve length");
953        assert_eq!(
954            out.chars().filter(|c| *c == ':').count(),
955            orig.chars().filter(|c| *c == ':').count(),
956            "ARN must preserve colons"
957        );
958        assert!(out.contains('/'), "ARN must preserve slash");
959    }
960
961    #[test]
962    fn azure_resource_id_format() {
963        let gen = HmacGenerator::new([11u8; 32]);
964        let orig = "/subscriptions/550e8400-e29b/resourceGroups/rg-prod/providers/Microsoft.Compute/virtualMachines/vm-01";
965        let out = gen.generate(&Category::AzureResourceId, orig);
966        assert_eq!(
967            out.len(),
968            orig.len(),
969            "Azure resource ID must preserve length"
970        );
971        assert!(
972            out.contains("/subscriptions/"),
973            "must preserve 'subscriptions'"
974        );
975        assert!(
976            out.contains("/resourceGroups/"),
977            "must preserve 'resourceGroups'"
978        );
979        assert!(out.contains("/providers/"), "must preserve 'providers'");
980        assert!(
981            out.contains("Microsoft.Compute"),
982            "must preserve dotted provider name"
983        );
984    }
985}