Skip to main content

sanitize_engine/
generator.rs

1//! Replacement generation strategies.
2//!
3//! Two concrete implementations:
4//! - `HmacGenerator`: Deterministic, seeded with a 32-byte key. Same seed + same
5//!   input = same output across runs. Uses HMAC-SHA256 for domain separation.
6//! - `RandomGenerator`: Cryptographically random replacements. Non-deterministic.
7//!
8//! Both produce category-aware, format-preserving replacements.
9//!
10//! # Design Note
11//!
12//! This module contains the category-aware formatters used by the CLI binary.
13//! For an extensible strategy API that allows custom replacement logic, see
14//! the [`crate::strategy`] module.
15
16use crate::category::Category;
17use hmac::{Hmac, Mac};
18use rand::Rng;
19use sha2::Sha256;
20use zeroize::Zeroize;
21
22// ---------------------------------------------------------------------------
23// Trait
24// ---------------------------------------------------------------------------
25
/// Strategy for generating a sanitized replacement value.
///
/// Seeded implementations such as [`HmacGenerator`] are deterministic: the
/// same `(category, original)` pair with the same key always produces the
/// same output. [`RandomGenerator`] is intentionally *not* deterministic on
/// its own; per-run consistency for it comes from a `MappingStore` that
/// calls `generate` only once per unique value and caches the result.
///
/// The `Send + Sync` supertraits mean every implementation must be safe to
/// share between threads.
pub trait ReplacementGenerator: Send + Sync {
    /// Produce a sanitized replacement for `original` classified as `category`.
    fn generate(&self, category: &Category, original: &str) -> String;
}
36
37// ---------------------------------------------------------------------------
38// HMAC-SHA256 deterministic generator
39// ---------------------------------------------------------------------------
40
/// Deterministic replacement generator seeded with a 32-byte key.
///
/// ```text
/// replacement = format(category, HMAC-SHA256(key, category_tag || "\x00" || original))
/// ```
///
/// The same key + same `(category, original)` always yields the same output.
/// Different keys yield completely different outputs with overwhelming probability.
///
/// The key is wiped by the type's `Drop` implementation.
pub struct HmacGenerator {
    // HMAC secret; private so it can only be set through the constructors.
    key: [u8; 32],
}
52
53impl Drop for HmacGenerator {
54    fn drop(&mut self) {
55        self.key.zeroize();
56    }
57}
58
59impl HmacGenerator {
60    /// Create a new generator from a 32-byte seed.
61    #[must_use]
62    pub fn new(key: [u8; 32]) -> Self {
63        Self { key }
64    }
65
66    /// Create a generator from a byte slice (must be exactly 32 bytes).
67    ///
68    /// # Errors
69    ///
70    /// Returns [`SanitizeError::InvalidSeedLength`](crate::error::SanitizeError::InvalidSeedLength) if `bytes.len() != 32`.
71    pub fn from_slice(bytes: &[u8]) -> crate::error::Result<Self> {
72        if bytes.len() != 32 {
73            return Err(crate::error::SanitizeError::InvalidSeedLength(bytes.len()));
74        }
75        let mut key = [0u8; 32];
76        key.copy_from_slice(bytes);
77        Ok(Self { key })
78    }
79
80    /// Derive the raw 32-byte HMAC digest for `(category, original)`.
81    fn derive(&self, category: &Category, original: &str) -> [u8; 32] {
82        type HmacSha256 = Hmac<Sha256>;
83        let mut mac = HmacSha256::new_from_slice(&self.key).expect("HMAC accepts any key length");
84        let tag = category.domain_tag_hmac();
85        mac.update(tag.as_bytes());
86        mac.update(b"\x00"); // domain separator
87        mac.update(original.as_bytes());
88        let result = mac.finalize();
89        let mut out = [0u8; 32];
90        out.copy_from_slice(&result.into_bytes());
91        out
92    }
93}
94
95impl ReplacementGenerator for HmacGenerator {
96    fn generate(&self, category: &Category, original: &str) -> String {
97        let hash = self.derive(category, original);
98        format_replacement(category, &hash, original)
99    }
100}
101
102// ---------------------------------------------------------------------------
103// Cryptographically-random generator (non-deterministic)
104// ---------------------------------------------------------------------------
105
/// Random replacement generator backed by `rand::thread_rng()`.
///
/// Each call to `generate` produces a fresh random value. Determinism is
/// achieved externally by the `MappingStore`, which calls `generate` only
/// once per unique `(category, original)` pair and caches the result.
///
/// The type is a stateless unit struct, so it derives the common traits
/// (`Debug`, `Clone`, `Copy`); this lets containing types derive them too.
#[derive(Debug, Clone, Copy)]
pub struct RandomGenerator;
112
113impl RandomGenerator {
114    #[must_use]
115    pub fn new() -> Self {
116        Self
117    }
118}
119
120impl Default for RandomGenerator {
121    fn default() -> Self {
122        Self::new()
123    }
124}
125
126impl ReplacementGenerator for RandomGenerator {
127    fn generate(&self, category: &Category, original: &str) -> String {
128        let mut rng = rand::thread_rng();
129        let mut hash = [0u8; 32];
130        rng.fill(&mut hash);
131        format_replacement(category, &hash, original)
132    }
133}
134
135// ---------------------------------------------------------------------------
136// Category-aware formatting helpers
137// ---------------------------------------------------------------------------
138
139/// Format a 32-byte hash into a length-preserving replacement whose
140/// byte length exactly matches `original.len()`. The shape is
141/// category-aware and deterministic for the same `(hash, original)` pair.
142fn format_replacement(category: &Category, hash: &[u8; 32], original: &str) -> String {
143    let target = original.len();
144    if target == 0 {
145        return String::new();
146    }
147    match category {
148        Category::Email => format_email_lp(hash, original, target),
149        Category::Name => format_name_lp(hash, target),
150        Category::Phone | Category::CreditCard | Category::IpV4 => {
151            format_digits_lp(hash, original, target)
152        }
153        Category::IpV6 | Category::MacAddress | Category::Uuid | Category::ContainerId => {
154            format_hex_digits_lp(hash, original, target)
155        }
156        Category::Ssn => format_ssn_lp(hash, original, target),
157        Category::Hostname => format_hostname_lp(hash, original, target),
158        Category::Jwt => format_jwt_lp(hash, original, target),
159        Category::FilePath => format_filepath_lp(hash, original, target),
160        Category::WindowsSid => format_windows_sid_lp(hash, original, target),
161        Category::Url => format_url_lp(hash, original, target),
162        Category::AwsArn => format_arn_lp(hash, original, target),
163        Category::AzureResourceId => format_azure_resource_id_lp(hash, original, target),
164        Category::AuthToken | Category::Custom(_) => format_custom_lp(hash, target),
165    }
166}
167
168// ---------------------------------------------------------------------------
169// Length-preserving helpers
170// ---------------------------------------------------------------------------
171
/// Pad `s` with deterministic hex characters from `hash`, or truncate it,
/// so the result is exactly `target` bytes long.
///
/// * `s` already `target` bytes long: returned unchanged.
/// * `s` longer: cut at the largest character boundary not exceeding
///   `target`. If that lands short (the cut would have split a multi-byte
///   character), the shortfall is padded, so the function never panics on
///   non-ASCII input and still returns exactly `target` bytes.
/// * `s` shorter: padded on the right.
///
/// Padding characters are the lowercase hex encoding of `hash` (two
/// characters per byte, high nibble first), repeated every 64 characters.
fn pad_or_truncate(s: &str, target: usize, hash: &[u8; 32]) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    // Back off to a character boundary; offset 0 is always one, so this
    // loop terminates.
    let mut keep = s.len().min(target);
    while !s.is_char_boundary(keep) {
        keep -= 1;
    }

    let mut buf = String::with_capacity(target);
    buf.push_str(&s[..keep]);
    buf.extend((0..target - keep).map(|i| {
        // Character `i` of the hex encoding: byte `i / 2`, high nibble for
        // even `i`, low nibble for odd `i`, wrapping after 32 bytes.
        let byte = hash[(i / 2) % 32];
        let nibble = if i % 2 == 0 { byte >> 4 } else { byte & 0x0f };
        char::from(HEX_DIGITS[usize::from(nibble)])
    }));
    buf
}
193
194/// Length-preserving email replacement.
195/// Preserves the domain from the original; generates a hex username
196/// sized so the total byte length matches the original.
197fn format_email_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
198    let domain = original
199        .rfind('@')
200        .map_or("x.co", |pos| &original[pos + 1..]);
201    let at_domain = 1 + domain.len(); // "@" + domain
202    if target <= at_domain {
203        // Too short to fit @domain — use hex fallback.
204        return pad_or_truncate("", target, hash);
205    }
206    let user_len = target - at_domain;
207    let hex = hex_encode(hash);
208    let hex_bytes = hex.as_bytes();
209    let mut buf = String::with_capacity(target);
210    for i in 0..user_len {
211        buf.push(hex_bytes[i % 64] as char);
212    }
213    buf.push('@');
214    buf.push_str(domain);
215    buf
216}
217
218/// Length-preserving name replacement.
219/// Generates a synthetic name via the hash-indexed table, then
220/// truncates or pads to match `target` bytes.
221fn format_name_lp(hash: &[u8; 32], target: usize) -> String {
222    let raw = format_name(hash);
223    pad_or_truncate(&raw, target, hash)
224}
225
226/// Length-preserving digit replacement.
227/// Preserves every non-digit character in `original`; replaces each
228/// ASCII digit with a deterministic digit derived from `hash`.
229/// Falls back to hex if the original contains no digits.
230fn format_digits_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
231    let mut buf = String::with_capacity(target);
232    let mut hi = 0usize;
233    let mut had_digit = false;
234    for ch in original.chars() {
235        if ch.is_ascii_digit() {
236            buf.push((b'0' + hash[hi % 32] % 10) as char);
237            hi += 1;
238            had_digit = true;
239        } else {
240            buf.push(ch);
241        }
242    }
243    if !had_digit {
244        return pad_or_truncate("", target, hash);
245    }
246    // Guard against multi-byte chars causing length mismatch.
247    if buf.len() != target {
248        return pad_or_truncate(&buf, target, hash);
249    }
250    buf
251}
252
253/// Length-preserving hex-digit replacement (for IPv6).
254/// Preserves non-hex characters (colons, `::`, etc.); replaces each
255/// ASCII hex digit with a deterministic hex digit from `hash`.
256fn format_hex_digits_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
257    let mut buf = String::with_capacity(target);
258    let mut hi = 0usize;
259    let mut had_hex = false;
260    for ch in original.chars() {
261        if ch.is_ascii_hexdigit() {
262            let nibble = hash[hi % 32] % 16;
263            let replacement = if ch.is_ascii_uppercase() {
264                b"0123456789ABCDEF"[nibble as usize]
265            } else {
266                b"0123456789abcdef"[nibble as usize]
267            };
268            buf.push(replacement as char);
269            hi += 1;
270            had_hex = true;
271        } else {
272            buf.push(ch);
273        }
274    }
275    if !had_hex {
276        return pad_or_truncate("", target, hash);
277    }
278    buf
279}
280
281/// Length-preserving SSN replacement.
282/// Preserves all non-digit characters.  The first three digit positions
283/// are forced to '0' (never-issued area code, clearly synthetic).
284/// Remaining digit positions are filled with deterministic digits.
285fn format_ssn_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
286    let has_digit = original.chars().any(|c| c.is_ascii_digit());
287    if !has_digit {
288        return pad_or_truncate("", target, hash);
289    }
290    let mut buf = String::with_capacity(target);
291    let mut digit_idx = 0usize;
292    for ch in original.chars() {
293        if ch.is_ascii_digit() {
294            if digit_idx < 3 {
295                buf.push('0');
296            } else {
297                buf.push((b'0' + hash[(digit_idx - 3) % 32] % 10) as char);
298            }
299            digit_idx += 1;
300        } else {
301            buf.push(ch);
302        }
303    }
304    buf
305}
306
307/// Length-preserving hostname replacement.
308/// Preserves the suffix (everything from the first `.` onward) and
309/// fills the prefix with deterministic hex characters to match `target`.
310fn format_hostname_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
311    let suffix = original.find('.').map_or("", |p| &original[p..]);
312    let prefix_len = target.saturating_sub(suffix.len());
313    if prefix_len == 0 {
314        return pad_or_truncate("", target, hash);
315    }
316    let hex = hex_encode(hash);
317    let hex_bytes = hex.as_bytes();
318    let mut buf = String::with_capacity(target);
319    for i in 0..prefix_len {
320        buf.push(hex_bytes[i % 64] as char);
321    }
322    buf.push_str(suffix);
323    buf
324}
325
326/// Length-preserving custom replacement.
327/// Uses `__SANITIZED_<hex>__` format when the target is long enough;
328/// falls back to bare hex for short targets.
329fn format_custom_lp(hash: &[u8; 32], target: usize) -> String {
330    let prefix = "__SANITIZED_";
331    let suffix = "__";
332    let overhead = prefix.len() + suffix.len(); // 14
333    let hex = hex_encode(hash);
334    if target <= overhead {
335        return pad_or_truncate("", target, hash);
336    }
337    let hex_len = target - overhead;
338    let hex_bytes = hex.as_bytes();
339    let mut buf = String::with_capacity(target);
340    buf.push_str(prefix);
341    for i in 0..hex_len {
342        buf.push(hex_bytes[i % 64] as char);
343    }
344    buf.push_str(suffix);
345    buf
346}
347
348/// Length-preserving JWT replacement.
349/// Preserves `.` separators; replaces base64url characters
350/// (`[A-Za-z0-9_-]`) with deterministic base64url characters.
351fn format_jwt_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
352    const B64URL: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-";
353    let mut buf = String::with_capacity(target);
354    let mut hi = 0usize;
355    let mut had_b64 = false;
356    for ch in original.chars() {
357        if ch == '.' || ch == '=' {
358            buf.push(ch);
359        } else if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
360            buf.push(B64URL[hash[hi % 32] as usize % B64URL.len()] as char);
361            hi += 1;
362            had_b64 = true;
363        } else {
364            // Non-base64url, non-structural: emit byte-preserving replacement.
365            for _ in 0..ch.len_utf8() {
366                buf.push(B64URL[hash[hi % 32] as usize % B64URL.len()] as char);
367                hi += 1;
368            }
369            had_b64 = true;
370        }
371    }
372    if !had_b64 {
373        return pad_or_truncate("", target, hash);
374    }
375    buf
376}
377
378/// Length-preserving file path replacement.
379/// Preserves separators (`/`, `\`) and the final extension (from last `.`
380/// in the last segment). Replaces other characters with deterministic hex.
381fn format_filepath_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
382    // Find the last path separator position to identify the filename segment.
383    let last_sep = original.rfind(['/', '\\']).map_or(0, |p| p + 1);
384    let filename = &original[last_sep..];
385    // Find extension in the filename (last `.` that isn't at position 0).
386    let ext_start = filename.rfind('.').filter(|&p| p > 0).map(|p| last_sep + p);
387
388    let hex = hex_encode(hash);
389    let hex_bytes = hex.as_bytes();
390    let mut buf = String::with_capacity(target);
391    let mut hi = 0usize;
392
393    for (i, ch) in original.char_indices() {
394        if matches!(ch, '/' | '\\') || ext_start.is_some_and(|es| i >= es) {
395            // Preserve separators and the file extension.
396            buf.push(ch);
397        } else {
398            // Emit as many ASCII hex bytes as the original char's UTF-8 length.
399            for _ in 0..ch.len_utf8() {
400                buf.push(hex_bytes[hi % 64] as char);
401                hi += 1;
402            }
403        }
404    }
405    // Ensure exact length (should be equal for ASCII, but guard anyway).
406    if buf.len() != target {
407        return pad_or_truncate(&buf, target, hash);
408    }
409    buf
410}
411
412/// Length-preserving Windows SID replacement.
413/// Preserves the `S-` prefix and `-` separators; replaces digit groups
414/// with deterministic digits.
415fn format_windows_sid_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
416    let has_digit = original.chars().any(|c| c.is_ascii_digit());
417    if !has_digit {
418        return pad_or_truncate("", target, hash);
419    }
420    let mut buf = String::with_capacity(target);
421    let mut hi = 0usize;
422    for ch in original.chars() {
423        if ch == 'S' || ch == '-' {
424            buf.push(ch);
425        } else if ch.is_ascii_digit() {
426            buf.push((b'0' + hash[hi % 32] % 10) as char);
427            hi += 1;
428        } else {
429            // Non-digit, non-structural: emit byte-count-preserving hex.
430            for _ in 0..ch.len_utf8() {
431                buf.push((b'0' + hash[hi % 32] % 10) as char);
432                hi += 1;
433            }
434        }
435    }
436    buf
437}
438
439/// Shared core for length-preserving hex replacement where a caller-supplied
440/// predicate identifies "structural" characters to preserve as-is.
441///
442/// All non-structural characters are replaced byte-by-byte with deterministic
443/// hex characters derived from `hash`.  Returns `None` if the original
444/// contained no replaceable content (caller should fall back to
445/// [`pad_or_truncate`]).
446fn format_preserving_hex_lp(
447    hash: &[u8; 32],
448    original: &str,
449    target: usize,
450    is_structural: impl Fn(char) -> bool,
451) -> Option<String> {
452    let hex = hex_encode(hash);
453    let hex_bytes = hex.as_bytes();
454    let mut buf = String::with_capacity(target);
455    let mut hi = 0usize;
456    let mut had_content = false;
457
458    for ch in original.chars() {
459        if is_structural(ch) {
460            buf.push(ch);
461        } else {
462            for _ in 0..ch.len_utf8() {
463                buf.push(hex_bytes[hi % 64] as char);
464                hi += 1;
465            }
466            had_content = true;
467        }
468    }
469
470    had_content.then_some(buf)
471}
472
473/// Length-preserving URL replacement.
474/// Preserves scheme prefix and structural characters
475/// (`://`, `/`, `?`, `=`, `&`, `#`, `:`); replaces content characters
476/// with deterministic hex.
477fn format_url_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
478    format_preserving_hex_lp(hash, original, target, |ch| "/:?=&#@.".contains(ch))
479        .unwrap_or_else(|| pad_or_truncate("", target, hash))
480}
481
482/// Length-preserving AWS ARN replacement.
483/// Preserves `:` and `/` separators; replaces alphanumeric content
484/// in account/resource segments with deterministic hex.
485fn format_arn_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
486    format_preserving_hex_lp(hash, original, target, |ch| ch == ':' || ch == '/')
487        .unwrap_or_else(|| pad_or_truncate("", target, hash))
488}
489
490/// Length-preserving Azure Resource ID replacement.
491/// Preserves `/` path separators and well-known Azure segment names
492/// (`subscriptions`, `resourceGroups`, `providers`, `resourcegroups`).
493/// Replaces variable segments (IDs, names) with deterministic hex.
494fn format_azure_resource_id_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
495    const KNOWN_SEGMENTS: &[&str] = &[
496        "subscriptions",
497        "resourceGroups",
498        "resourcegroups",
499        "providers",
500    ];
501
502    let hex = hex_encode(hash);
503    let hex_bytes = hex.as_bytes();
504    let mut buf = String::with_capacity(target);
505    let mut hi = 0usize;
506
507    // Split on `/`, rebuild with deterministic replacement for non-known segments.
508    let parts: Vec<&str> = original.split('/').collect();
509    for (pi, part) in parts.iter().enumerate() {
510        if pi > 0 {
511            buf.push('/');
512        }
513        if part.is_empty() || KNOWN_SEGMENTS.contains(part) || part.contains('.') {
514            // Preserve empty segments (leading `/`), known names, and
515            // dotted provider names like `Microsoft.Compute`.
516            buf.push_str(part);
517        } else {
518            // Replace this segment character-by-character to preserve byte length.
519            for ch in part.chars() {
520                for _ in 0..ch.len_utf8() {
521                    buf.push(hex_bytes[hi % 64] as char);
522                    hi += 1;
523                }
524            }
525        }
526    }
527    if buf.len() != target {
528        return pad_or_truncate(&buf, target, hash);
529    }
530    buf
531}
532
/// Deterministic synthetic "First Last" name chosen by the hash.
///
/// `hash[0]` selects the first name and `hash[1]` the last name, each
/// reduced modulo its 32-entry table. The tables are small and fixed on
/// purpose: the result should look structurally like a name while being
/// obviously synthetic.
fn format_name(hash: &[u8; 32]) -> String {
    const FIRST: &[&str] = &[
        "Alex", "Blake", "Casey", "Dana", "Ellis", "Finley", "Gray", "Harper",
        "Ira", "Jordan", "Kai", "Lane", "Morgan", "Noel", "Oakley", "Parker",
        "Quinn", "Reese", "Sage", "Taylor", "Uri", "Val", "Wren", "Xen",
        "Yael", "Zion", "Arden", "Blair", "Corin", "Drew", "Emery", "Frost",
    ];
    const LAST: &[&str] = &[
        "Ashford", "Blackwell", "Crawford", "Dalton",
        "Eastwood", "Fairbanks", "Garrison", "Hartley",
        "Irvine", "Jensen", "Kendrick", "Langley",
        "Mercer", "Newland", "Oakwood", "Preston",
        "Quinlan", "Redmond", "Shepard", "Thornton",
        "Underwood", "Vance", "Whitmore", "Xavier",
        "Yardley", "Zimmer", "Ashton", "Beckett",
        "Calloway", "Dempsey", "Eldridge", "Fletcher",
    ];

    let first = FIRST[usize::from(hash[0]) % FIRST.len()];
    let last = LAST[usize::from(hash[1]) % LAST.len()];

    let mut name = String::with_capacity(first.len() + 1 + last.len());
    name.push_str(first);
    name.push(' ');
    name.push_str(last);
    name
}
582
/// Hex-encode 32 bytes into a 64-character lowercase string
/// (two characters per byte, high nibble first).
fn hex_encode(bytes: &[u8; 32]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut out = String::with_capacity(64);
    for &byte in bytes {
        out.push(char::from(DIGITS[usize::from(byte >> 4)]));
        out.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    out
}
592
593// ---------------------------------------------------------------------------
594// Tests
595// ---------------------------------------------------------------------------
596
597#[cfg(test)]
598mod tests {
599    use super::*;
600
601    #[test]
602    fn hmac_deterministic_same_input() {
603        let gen = HmacGenerator::new([42u8; 32]);
604        let a = gen.generate(&Category::Email, "alice@corp.com");
605        let b = gen.generate(&Category::Email, "alice@corp.com");
606        assert_eq!(a, b, "same seed + same input must produce same output");
607    }
608
609    #[test]
610    fn hmac_different_inputs_differ() {
611        let gen = HmacGenerator::new([42u8; 32]);
612        let a = gen.generate(&Category::Email, "alice@corp.com");
613        let b = gen.generate(&Category::Email, "bob@corp.com");
614        assert_ne!(a, b);
615    }
616
617    #[test]
618    fn hmac_different_seeds_differ() {
619        let g1 = HmacGenerator::new([1u8; 32]);
620        let g2 = HmacGenerator::new([2u8; 32]);
621        let a = g1.generate(&Category::Email, "alice@corp.com");
622        let b = g2.generate(&Category::Email, "alice@corp.com");
623        assert_ne!(a, b);
624    }
625
626    #[test]
627    fn hmac_different_categories_differ() {
628        let gen = HmacGenerator::new([42u8; 32]);
629        let a = gen.generate(&Category::Email, "test");
630        let b = gen.generate(&Category::Name, "test");
631        assert_ne!(a, b, "different categories must produce different outputs");
632    }
633
634    #[test]
635    fn email_format() {
636        let gen = HmacGenerator::new([0u8; 32]);
637        let orig = "alice@corp.com";
638        let out = gen.generate(&Category::Email, orig);
639        assert!(out.contains('@'), "email must contain @");
640        assert!(out.ends_with("@corp.com"), "email must preserve domain");
641        assert_eq!(out.len(), orig.len(), "email must preserve length");
642    }
643
644    #[test]
645    fn ipv4_format() {
646        let gen = HmacGenerator::new([0u8; 32]);
647        let orig = "192.168.1.1";
648        let out = gen.generate(&Category::IpV4, orig);
649        // Dots preserved, length preserved.
650        let parts: Vec<&str> = out.split('.').collect();
651        assert_eq!(parts.len(), 4);
652        assert_eq!(out.len(), orig.len(), "ipv4 must preserve length");
653    }
654
655    #[test]
656    fn ssn_format() {
657        let gen = HmacGenerator::new([7u8; 32]);
658        let orig = "123-45-6789";
659        let out = gen.generate(&Category::Ssn, orig);
660        assert!(out.starts_with("000-"), "SSN must start with 000");
661        assert_eq!(out.len(), orig.len(), "SSN must preserve length");
662    }
663
664    #[test]
665    fn phone_format() {
666        let gen = HmacGenerator::new([3u8; 32]);
667        let orig = "+1-212-555-0100";
668        let out = gen.generate(&Category::Phone, orig);
669        // Formatting characters preserved.
670        assert!(out.starts_with('+'));
671        assert_eq!(
672            out.chars().filter(|c| *c == '-').count(),
673            orig.chars().filter(|c| *c == '-').count(),
674            "dashes must be preserved"
675        );
676        assert_eq!(out.len(), orig.len(), "phone must preserve length");
677    }
678
679    #[test]
680    fn hostname_format() {
681        let gen = HmacGenerator::new([5u8; 32]);
682        let orig = "db-prod-01.internal";
683        let out = gen.generate(&Category::Hostname, orig);
684        assert!(out.ends_with(".internal"), "hostname must preserve suffix");
685        assert_eq!(out.len(), orig.len(), "hostname must preserve length");
686    }
687
688    #[test]
689    fn custom_format() {
690        let gen = HmacGenerator::new([9u8; 32]);
691        let cat = Category::Custom("api_key".into());
692        // Use an input long enough for the __SANITIZED_..__ wrapper (>14 chars).
693        let orig = "sk-abc123-very-long-key";
694        let out = gen.generate(&cat, orig);
695        assert!(out.starts_with("__SANITIZED_"));
696        assert!(out.ends_with("__"));
697        assert_eq!(out.len(), orig.len(), "custom must preserve length");
698    }
699
700    #[test]
701    fn custom_format_short() {
702        let gen = HmacGenerator::new([9u8; 32]);
703        let cat = Category::Custom("api_key".into());
704        // Short input falls back to hex.
705        let orig = "sk-abc123";
706        let out = gen.generate(&cat, orig);
707        assert_eq!(
708            out.len(),
709            orig.len(),
710            "custom must preserve length even for short inputs"
711        );
712    }
713
714    #[test]
715    fn random_generator_produces_valid_format() {
716        let gen = RandomGenerator::new();
717        let orig = "test@example.com";
718        let out = gen.generate(&Category::Email, orig);
719        assert!(out.contains('@'));
720        assert_eq!(
721            out.len(),
722            orig.len(),
723            "random generator must preserve length"
724        );
725    }
726
727    #[test]
728    fn from_slice_rejects_bad_length() {
729        let result = HmacGenerator::from_slice(&[0u8; 16]);
730        assert!(result.is_err());
731    }
732
733    #[test]
734    fn credit_card_format() {
735        let gen = HmacGenerator::new([11u8; 32]);
736        let orig = "4111-1111-1111-1111";
737        let out = gen.generate(&Category::CreditCard, orig);
738        // Should be ####-####-####-####
739        let parts: Vec<&str> = out.split('-').collect();
740        assert_eq!(parts.len(), 4);
741        for part in &parts {
742            assert_eq!(part.len(), 4);
743            assert!(part.chars().all(|c| c.is_ascii_digit()));
744        }
745        assert_eq!(out.len(), orig.len(), "credit card must preserve length");
746    }
747
748    #[test]
749    fn name_format() {
750        let gen = HmacGenerator::new([0u8; 32]);
751        let orig = "John Doe";
752        let out = gen.generate(&Category::Name, orig);
753        assert_eq!(out.len(), orig.len(), "name must preserve length");
754    }
755
756    #[test]
757    fn ipv6_format() {
758        let gen = HmacGenerator::new([0u8; 32]);
759        let orig = "fd00:abcd:1234:5678::1";
760        let out = gen.generate(&Category::IpV6, orig);
761        // Colons and :: preserved, length preserved.
762        assert_eq!(
763            out.chars().filter(|c| *c == ':').count(),
764            orig.chars().filter(|c| *c == ':').count(),
765            "colons must be preserved"
766        );
767        assert_eq!(out.len(), orig.len(), "ipv6 must preserve length");
768    }
769
770    #[test]
771    fn length_preserved_all_categories() {
772        let gen = HmacGenerator::new([42u8; 32]);
773        let cases: Vec<(Category, &str)> = vec![
774            (Category::Email, "alice@corp.com"),
775            (Category::Name, "John Doe"),
776            (Category::Phone, "+1-212-555-0100"),
777            (Category::IpV4, "192.168.1.1"),
778            (Category::IpV6, "fd00::1"),
779            (Category::CreditCard, "4111-1111-1111-1111"),
780            (Category::Ssn, "123-45-6789"),
781            (Category::Hostname, "db-prod-01.internal"),
782            (Category::MacAddress, "AA:BB:CC:DD:EE:FF"),
783            (Category::ContainerId, "a1b2c3d4e5f6"),
784            (Category::Uuid, "550e8400-e29b-41d4-a716-446655440000"),
785            (Category::Jwt, "eyJhbGciOiJI.eyJzdWIiOiIx.SflKxwRJSMeK"),
786            (Category::AuthToken, "ghp_abc123secrettoken"),
787            (Category::FilePath, "/home/jsmith/config.yaml"),
788            (Category::WindowsSid, "S-1-5-21-3623811015-3361044348"),
789            (Category::Url, "https://internal.corp.com/api"),
790            (Category::AwsArn, "arn:aws:iam::123456789012:user/admin"),
791            (
792                Category::AzureResourceId,
793                "/subscriptions/550e8400/resourceGroups/rg-prod",
794            ),
795            (Category::Custom("key".into()), "some-secret-value-here"),
796        ];
797        for (cat, orig) in &cases {
798            let out = gen.generate(cat, orig);
799            assert_eq!(
800                out.len(),
801                orig.len(),
802                "length mismatch for {:?}: '{}' ({}) -> '{}' ({})",
803                cat,
804                orig,
805                orig.len(),
806                out,
807                out.len()
808            );
809        }
810    }
811
812    #[test]
813    fn mac_address_format() {
814        let gen = HmacGenerator::new([7u8; 32]);
815        let orig = "AA:BB:CC:DD:EE:FF";
816        let out = gen.generate(&Category::MacAddress, orig);
817        assert_eq!(out.len(), orig.len(), "mac must preserve length");
818        assert_eq!(
819            out.chars().filter(|c| *c == ':').count(),
820            5,
821            "mac must preserve colons"
822        );
823    }
824
825    #[test]
826    fn mac_address_dash_format() {
827        let gen = HmacGenerator::new([7u8; 32]);
828        let orig = "AA-BB-CC-DD-EE-FF";
829        let out = gen.generate(&Category::MacAddress, orig);
830        assert_eq!(out.len(), orig.len());
831        assert_eq!(out.chars().filter(|c| *c == '-').count(), 5);
832    }
833
834    #[test]
835    fn uuid_format() {
836        let gen = HmacGenerator::new([3u8; 32]);
837        let orig = "550e8400-e29b-41d4-a716-446655440000";
838        let out = gen.generate(&Category::Uuid, orig);
839        assert_eq!(out.len(), orig.len(), "uuid must preserve length");
840        assert_eq!(
841            out.chars().filter(|c| *c == '-').count(),
842            4,
843            "uuid must preserve dashes"
844        );
845    }
846
847    #[test]
848    fn container_id_format() {
849        let gen = HmacGenerator::new([5u8; 32]);
850        let orig = "a1b2c3d4e5f6";
851        let out = gen.generate(&Category::ContainerId, orig);
852        assert_eq!(out.len(), orig.len(), "container id must preserve length");
853        assert!(out.chars().all(|c| c.is_ascii_hexdigit()));
854    }
855
856    #[test]
857    fn jwt_format() {
858        let gen = HmacGenerator::new([11u8; 32]);
859        let orig = "eyJhbGciOiJI.eyJzdWIiOiIx.SflKxwRJSMeK";
860        let out = gen.generate(&Category::Jwt, orig);
861        assert_eq!(out.len(), orig.len(), "jwt must preserve length");
862        let orig_dots = orig.chars().filter(|c| *c == '.').count();
863        let out_dots = out.chars().filter(|c| *c == '.').count();
864        assert_eq!(out_dots, orig_dots, "jwt must preserve dots");
865    }
866
867    #[test]
868    fn auth_token_format() {
869        let gen = HmacGenerator::new([9u8; 32]);
870        let orig = "ghp_abc123secrettoken";
871        let out = gen.generate(&Category::AuthToken, orig);
872        assert!(out.starts_with("__SANITIZED_"));
873        assert!(out.ends_with("__"));
874        assert_eq!(out.len(), orig.len(), "auth_token must preserve length");
875    }
876
877    #[test]
878    fn filepath_unix_format() {
879        let gen = HmacGenerator::new([13u8; 32]);
880        let orig = "/home/jsmith/config.yaml";
881        let out = gen.generate(&Category::FilePath, orig);
882        assert_eq!(out.len(), orig.len(), "filepath must preserve length");
883        assert_eq!(
884            std::path::Path::new(&out)
885                .extension()
886                .and_then(|e| e.to_str()),
887            Some("yaml"),
888            "filepath must preserve extension"
889        );
890        assert_eq!(
891            out.chars().filter(|c| *c == '/').count(),
892            orig.chars().filter(|c| *c == '/').count(),
893            "filepath must preserve separators"
894        );
895    }
896
897    #[test]
898    fn filepath_windows_format() {
899        let gen = HmacGenerator::new([13u8; 32]);
900        let orig = "C:\\Users\\admin\\secrets.txt";
901        let out = gen.generate(&Category::FilePath, orig);
902        assert_eq!(out.len(), orig.len(), "filepath must preserve length");
903        assert_eq!(
904            std::path::Path::new(&out)
905                .extension()
906                .and_then(|e| e.to_str()),
907            Some("txt"),
908            "filepath must preserve extension"
909        );
910        assert_eq!(
911            out.chars().filter(|c| *c == '\\').count(),
912            orig.chars().filter(|c| *c == '\\').count(),
913            "filepath must preserve backslashes"
914        );
915    }
916
917    #[test]
918    fn windows_sid_format() {
919        let gen = HmacGenerator::new([7u8; 32]);
920        let orig = "S-1-5-21-3623811015-3361044348-30300820-1013";
921        let out = gen.generate(&Category::WindowsSid, orig);
922        assert_eq!(out.len(), orig.len(), "SID must preserve length");
923        assert!(out.starts_with("S-"), "SID must start with S-");
924        assert_eq!(
925            out.chars().filter(|c| *c == '-').count(),
926            orig.chars().filter(|c| *c == '-').count(),
927            "SID must preserve dashes"
928        );
929    }
930
931    #[test]
932    fn url_format() {
933        let gen = HmacGenerator::new([5u8; 32]);
934        let orig = "https://internal.corp.com/api/users?token=abc123";
935        let out = gen.generate(&Category::Url, orig);
936        assert_eq!(out.len(), orig.len(), "url must preserve length");
937        // Structural characters preserved.
938        assert!(out.contains("://"));
939        assert!(out.contains('?'));
940        assert!(out.contains('='));
941    }
942
943    #[test]
944    fn aws_arn_format() {
945        let gen = HmacGenerator::new([3u8; 32]);
946        let orig = "arn:aws:iam::123456789012:user/admin";
947        let out = gen.generate(&Category::AwsArn, orig);
948        assert_eq!(out.len(), orig.len(), "ARN must preserve length");
949        assert_eq!(
950            out.chars().filter(|c| *c == ':').count(),
951            orig.chars().filter(|c| *c == ':').count(),
952            "ARN must preserve colons"
953        );
954        assert!(out.contains('/'), "ARN must preserve slash");
955    }
956
957    #[test]
958    fn azure_resource_id_format() {
959        let gen = HmacGenerator::new([11u8; 32]);
960        let orig = "/subscriptions/550e8400-e29b/resourceGroups/rg-prod/providers/Microsoft.Compute/virtualMachines/vm-01";
961        let out = gen.generate(&Category::AzureResourceId, orig);
962        assert_eq!(
963            out.len(),
964            orig.len(),
965            "Azure resource ID must preserve length"
966        );
967        assert!(
968            out.contains("/subscriptions/"),
969            "must preserve 'subscriptions'"
970        );
971        assert!(
972            out.contains("/resourceGroups/"),
973            "must preserve 'resourceGroups'"
974        );
975        assert!(out.contains("/providers/"), "must preserve 'providers'");
976        assert!(
977            out.contains("Microsoft.Compute"),
978            "must preserve dotted provider name"
979        );
980    }
981}