Skip to main content

sanitize_engine/
generator.rs

//! Replacement generation strategies.
//!
//! Two concrete implementations:
//! - `HmacGenerator`: Deterministic, seeded with a 32-byte key. The same key and
//!   the same input yield the same output across runs. The HMAC-SHA256 input is
//!   prefixed with a per-category tag for domain separation.
//! - `RandomGenerator`: Cryptographically random replacements. Non-deterministic.
//!
//! Both produce category-aware, length- and format-preserving replacements.
9
10use crate::category::Category;
11use hmac::{Hmac, Mac};
12use rand::Rng;
13use sha2::Sha256;
14use zeroize::Zeroize;
15
16// ---------------------------------------------------------------------------
17// Trait
18// ---------------------------------------------------------------------------
19
20/// Strategy for generating a sanitized replacement value.
21///
22/// Implementations MUST be deterministic to their inputs: given the same
23/// `(category, original)` pair (and same internal state / seed), the output
24/// must be identical. This is what enables per-run consistency when backed
25/// by a `MappingStore` that calls `generate` only once per unique value.
26pub trait ReplacementGenerator: Send + Sync {
27    /// Produce a sanitized replacement for `original` classified as `category`.
28    fn generate(&self, category: &Category, original: &str) -> String;
29}
30
31// ---------------------------------------------------------------------------
32// HMAC-SHA256 deterministic generator
33// ---------------------------------------------------------------------------
34
/// Deterministic replacement generator seeded with a 32-byte key.
///
/// ```text
/// replacement = format(category, HMAC-SHA256(key, category_tag || "\x00" || original))
/// ```
///
/// The same key + same `(category, original)` always yields the same output.
/// Different keys yield completely different outputs with overwhelming probability.
pub struct HmacGenerator {
    /// Secret HMAC key; never printed or exposed by this type.
    key: [u8; 32],
}

/// `Debug` output deliberately omits the key so the generator can appear in
/// logs or in `#[derive(Debug)]` containers without leaking the secret.
impl std::fmt::Debug for HmacGenerator {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("HmacGenerator")
            .field("key", &format_args!("<redacted>"))
            .finish()
    }
}
46
47impl Drop for HmacGenerator {
48    fn drop(&mut self) {
49        self.key.zeroize();
50    }
51}
52
53impl HmacGenerator {
54    /// Create a new generator from a 32-byte seed.
55    #[must_use]
56    pub fn new(key: [u8; 32]) -> Self {
57        Self { key }
58    }
59
60    /// Create a generator from a byte slice (must be exactly 32 bytes).
61    ///
62    /// # Errors
63    ///
64    /// Returns [`SanitizeError::InvalidSeedLength`](crate::error::SanitizeError::InvalidSeedLength) if `bytes.len() != 32`.
65    pub fn from_slice(bytes: &[u8]) -> crate::error::Result<Self> {
66        if bytes.len() != 32 {
67            return Err(crate::error::SanitizeError::InvalidSeedLength(bytes.len()));
68        }
69        let mut key = [0u8; 32];
70        key.copy_from_slice(bytes);
71        Ok(Self { key })
72    }
73
74    /// Derive the raw 32-byte HMAC digest for `(category, original)`.
75    fn derive(&self, category: &Category, original: &str) -> [u8; 32] {
76        type HmacSha256 = Hmac<Sha256>;
77        let mut mac = HmacSha256::new_from_slice(&self.key).expect("HMAC accepts any key length");
78        let tag = category.domain_tag_hmac();
79        mac.update(tag.as_bytes());
80        mac.update(b"\x00"); // domain separator
81        mac.update(original.as_bytes());
82        let result = mac.finalize();
83        let mut out = [0u8; 32];
84        out.copy_from_slice(&result.into_bytes());
85        out
86    }
87}
88
89impl ReplacementGenerator for HmacGenerator {
90    fn generate(&self, category: &Category, original: &str) -> String {
91        let hash = self.derive(category, original);
92        format_replacement(category, &hash, original)
93    }
94}
95
96// ---------------------------------------------------------------------------
97// Cryptographically-random generator (non-deterministic)
98// ---------------------------------------------------------------------------
99
/// Random replacement generator backed by `rand::thread_rng()`.
///
/// Each call to `generate` produces a fresh random value. Determinism is
/// achieved externally by the `MappingStore`, which calls `generate` only
/// once per unique `(category, original)` pair and caches the result.
#[derive(Debug, Clone, Copy)]
pub struct RandomGenerator;

impl RandomGenerator {
    /// Create a new random generator. The type holds no state.
    #[must_use]
    pub fn new() -> Self {
        Self
    }
}

impl Default for RandomGenerator {
    /// Equivalent to [`RandomGenerator::new`].
    fn default() -> Self {
        Self::new()
    }
}
119
120impl ReplacementGenerator for RandomGenerator {
121    fn generate(&self, category: &Category, original: &str) -> String {
122        let mut rng = rand::thread_rng();
123        let mut hash = [0u8; 32];
124        rng.fill(&mut hash);
125        format_replacement(category, &hash, original)
126    }
127}
128
129// ---------------------------------------------------------------------------
130// Category-aware formatting helpers
131// ---------------------------------------------------------------------------
132
133/// Format a 32-byte hash into a length-preserving replacement whose
134/// byte length exactly matches `original.len()`. The shape is
135/// category-aware and deterministic for the same `(hash, original)` pair.
136fn format_replacement(category: &Category, hash: &[u8; 32], original: &str) -> String {
137    let target = original.len();
138    if target == 0 {
139        return String::new();
140    }
141    match category {
142        Category::Email => format_email_lp(hash, original, target),
143        Category::Name => format_name_lp(hash, target),
144        Category::Phone | Category::CreditCard | Category::IpV4 => {
145            format_digits_lp(hash, original, target)
146        }
147        Category::IpV6 | Category::MacAddress | Category::Uuid | Category::ContainerId => {
148            format_hex_digits_lp(hash, original, target)
149        }
150        Category::Ssn => format_ssn_lp(hash, original, target),
151        Category::Hostname => format_hostname_lp(hash, original, target),
152        Category::Jwt => format_jwt_lp(hash, original, target),
153        Category::FilePath => format_filepath_lp(hash, original, target),
154        Category::WindowsSid => format_windows_sid_lp(hash, original, target),
155        Category::Url => format_url_lp(hash, original, target),
156        Category::AwsArn => format_arn_lp(hash, original, target),
157        Category::AzureResourceId => format_azure_resource_id_lp(hash, original, target),
158        Category::AuthToken | Category::Custom(_) => format_custom_lp(hash, target),
159    }
160}
161
162// ---------------------------------------------------------------------------
163// Length-preserving helpers
164// ---------------------------------------------------------------------------
165
166/// Pad `s` with deterministic hex characters from `hash`, or truncate,
167/// to reach exactly `target` bytes.  All generated content is ASCII so
168/// byte length equals character count for the produced output.
169fn pad_or_truncate(s: &str, target: usize, hash: &[u8; 32]) -> String {
170    let slen = s.len();
171    if slen == target {
172        return s.to_string();
173    }
174    if slen > target {
175        return s[..target].to_string();
176    }
177    // Pad with deterministic hex chars derived from the hash.
178    let hex = hex_encode(hash);
179    let hex_bytes = hex.as_bytes();
180    let mut buf = String::with_capacity(target);
181    buf.push_str(s);
182    for i in 0..(target - slen) {
183        buf.push(hex_bytes[i % 64] as char);
184    }
185    buf
186}
187
188/// Length-preserving email replacement.
189/// Preserves the domain from the original; generates a hex username
190/// sized so the total byte length matches the original.
191fn format_email_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
192    let domain = original
193        .rfind('@')
194        .map_or("x.co", |pos| &original[pos + 1..]);
195    let at_domain = 1 + domain.len(); // "@" + domain
196    if target <= at_domain {
197        // Too short to fit @domain — use hex fallback.
198        return pad_or_truncate("", target, hash);
199    }
200    let user_len = target - at_domain;
201    let hex = hex_encode(hash);
202    let hex_bytes = hex.as_bytes();
203    let mut buf = String::with_capacity(target);
204    for i in 0..user_len {
205        buf.push(hex_bytes[i % 64] as char);
206    }
207    buf.push('@');
208    buf.push_str(domain);
209    buf
210}
211
212/// Length-preserving name replacement.
213/// Generates a synthetic name via the hash-indexed table, then
214/// truncates or pads to match `target` bytes.
215fn format_name_lp(hash: &[u8; 32], target: usize) -> String {
216    let raw = format_name(hash);
217    pad_or_truncate(&raw, target, hash)
218}
219
220/// Length-preserving digit replacement.
221/// Preserves every non-digit character in `original`; replaces each
222/// ASCII digit with a deterministic digit derived from `hash`.
223/// Falls back to hex if the original contains no digits.
224fn format_digits_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
225    let mut buf = String::with_capacity(target);
226    let mut hi = 0usize;
227    let mut had_digit = false;
228    for ch in original.chars() {
229        if ch.is_ascii_digit() {
230            buf.push((b'0' + hash[hi % 32] % 10) as char);
231            hi += 1;
232            had_digit = true;
233        } else {
234            buf.push(ch);
235        }
236    }
237    if !had_digit {
238        return pad_or_truncate("", target, hash);
239    }
240    // Guard against multi-byte chars causing length mismatch.
241    if buf.len() != target {
242        return pad_or_truncate(&buf, target, hash);
243    }
244    buf
245}
246
247/// Length-preserving hex-digit replacement (for IPv6).
248/// Preserves non-hex characters (colons, `::`, etc.); replaces each
249/// ASCII hex digit with a deterministic hex digit from `hash`.
250fn format_hex_digits_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
251    let mut buf = String::with_capacity(target);
252    let mut hi = 0usize;
253    let mut had_hex = false;
254    for ch in original.chars() {
255        if ch.is_ascii_hexdigit() {
256            let nibble = hash[hi % 32] % 16;
257            let replacement = if ch.is_ascii_uppercase() {
258                b"0123456789ABCDEF"[nibble as usize]
259            } else {
260                b"0123456789abcdef"[nibble as usize]
261            };
262            buf.push(replacement as char);
263            hi += 1;
264            had_hex = true;
265        } else {
266            buf.push(ch);
267        }
268    }
269    if !had_hex {
270        return pad_or_truncate("", target, hash);
271    }
272    buf
273}
274
275/// Length-preserving SSN replacement.
276/// Preserves all non-digit characters.  The first three digit positions
277/// are forced to '0' (never-issued area code, clearly synthetic).
278/// Remaining digit positions are filled with deterministic digits.
279fn format_ssn_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
280    let has_digit = original.chars().any(|c| c.is_ascii_digit());
281    if !has_digit {
282        return pad_or_truncate("", target, hash);
283    }
284    let mut buf = String::with_capacity(target);
285    let mut digit_idx = 0usize;
286    for ch in original.chars() {
287        if ch.is_ascii_digit() {
288            if digit_idx < 3 {
289                buf.push('0');
290            } else {
291                buf.push((b'0' + hash[(digit_idx - 3) % 32] % 10) as char);
292            }
293            digit_idx += 1;
294        } else {
295            buf.push(ch);
296        }
297    }
298    buf
299}
300
301/// Length-preserving hostname replacement.
302/// Preserves the suffix (everything from the first `.` onward) and
303/// fills the prefix with deterministic hex characters to match `target`.
304fn format_hostname_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
305    let suffix = original.find('.').map_or("", |p| &original[p..]);
306    let prefix_len = target.saturating_sub(suffix.len());
307    if prefix_len == 0 {
308        return pad_or_truncate("", target, hash);
309    }
310    let hex = hex_encode(hash);
311    let hex_bytes = hex.as_bytes();
312    let mut buf = String::with_capacity(target);
313    for i in 0..prefix_len {
314        buf.push(hex_bytes[i % 64] as char);
315    }
316    buf.push_str(suffix);
317    buf
318}
319
320/// Length-preserving custom replacement.
321/// Uses `__SANITIZED_<hex>__` format when the target is long enough;
322/// falls back to bare hex for short targets.
323fn format_custom_lp(hash: &[u8; 32], target: usize) -> String {
324    let prefix = "__SANITIZED_";
325    let suffix = "__";
326    let overhead = prefix.len() + suffix.len(); // 14
327    let hex = hex_encode(hash);
328    if target <= overhead {
329        return pad_or_truncate("", target, hash);
330    }
331    let hex_len = target - overhead;
332    let hex_bytes = hex.as_bytes();
333    let mut buf = String::with_capacity(target);
334    buf.push_str(prefix);
335    for i in 0..hex_len {
336        buf.push(hex_bytes[i % 64] as char);
337    }
338    buf.push_str(suffix);
339    buf
340}
341
342/// Length-preserving JWT replacement.
343/// Preserves `.` separators; replaces base64url characters
344/// (`[A-Za-z0-9_-]`) with deterministic base64url characters.
345fn format_jwt_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
346    const B64URL: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-";
347    let mut buf = String::with_capacity(target);
348    let mut hi = 0usize;
349    let mut had_b64 = false;
350    for ch in original.chars() {
351        if ch == '.' || ch == '=' {
352            buf.push(ch);
353        } else if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
354            buf.push(B64URL[hash[hi % 32] as usize % B64URL.len()] as char);
355            hi += 1;
356            had_b64 = true;
357        } else {
358            // Non-base64url, non-structural: emit byte-preserving replacement.
359            for _ in 0..ch.len_utf8() {
360                buf.push(B64URL[hash[hi % 32] as usize % B64URL.len()] as char);
361                hi += 1;
362            }
363            had_b64 = true;
364        }
365    }
366    if !had_b64 {
367        return pad_or_truncate("", target, hash);
368    }
369    buf
370}
371
372/// Length-preserving file path replacement.
373/// Preserves separators (`/`, `\`) and the final extension (from last `.`
374/// in the last segment). Replaces other characters with deterministic hex.
375fn format_filepath_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
376    // Find the last path separator position to identify the filename segment.
377    let last_sep = original
378        .rfind(['/', '\\'])
379        .map_or(0, |p| p + 1);
380    let filename = &original[last_sep..];
381    // Find extension in the filename (last `.` that isn't at position 0).
382    let ext_start = filename.rfind('.').filter(|&p| p > 0).map(|p| last_sep + p);
383
384    let hex = hex_encode(hash);
385    let hex_bytes = hex.as_bytes();
386    let mut buf = String::with_capacity(target);
387    let mut hi = 0usize;
388
389    for (i, ch) in original.char_indices() {
390        if matches!(ch, '/' | '\\') || ext_start.is_some_and(|es| i >= es) {
391            // Preserve separators and the file extension.
392            buf.push(ch);
393        } else {
394            // Emit as many ASCII hex bytes as the original char's UTF-8 length.
395            for _ in 0..ch.len_utf8() {
396                buf.push(hex_bytes[hi % 64] as char);
397                hi += 1;
398            }
399        }
400    }
401    // Ensure exact length (should be equal for ASCII, but guard anyway).
402    if buf.len() != target {
403        return pad_or_truncate(&buf, target, hash);
404    }
405    buf
406}
407
408/// Length-preserving Windows SID replacement.
409/// Preserves the `S-` prefix and `-` separators; replaces digit groups
410/// with deterministic digits.
411fn format_windows_sid_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
412    let has_digit = original.chars().any(|c| c.is_ascii_digit());
413    if !has_digit {
414        return pad_or_truncate("", target, hash);
415    }
416    let mut buf = String::with_capacity(target);
417    let mut hi = 0usize;
418    for ch in original.chars() {
419        if ch == 'S' || ch == '-' {
420            buf.push(ch);
421        } else if ch.is_ascii_digit() {
422            buf.push((b'0' + hash[hi % 32] % 10) as char);
423            hi += 1;
424        } else {
425            // Non-digit, non-structural: emit byte-count-preserving hex.
426            for _ in 0..ch.len_utf8() {
427                buf.push((b'0' + hash[hi % 32] % 10) as char);
428                hi += 1;
429            }
430        }
431    }
432    buf
433}
434
435/// Length-preserving URL replacement.
436/// Preserves scheme prefix and structural characters
437/// (`://`, `/`, `?`, `=`, `&`, `#`, `:`); replaces content characters
438/// with deterministic hex.
439fn format_url_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
440    let hex = hex_encode(hash);
441    let hex_bytes = hex.as_bytes();
442    let mut buf = String::with_capacity(target);
443    let mut hi = 0usize;
444    let mut had_content = false;
445
446    for ch in original.chars() {
447        if "/:?=&#@.".contains(ch) {
448            buf.push(ch);
449        } else {
450            // Emit as many ASCII hex bytes as the original char's UTF-8 length.
451            for _ in 0..ch.len_utf8() {
452                buf.push(hex_bytes[hi % 64] as char);
453                hi += 1;
454            }
455            had_content = true;
456        }
457    }
458    if !had_content {
459        return pad_or_truncate("", target, hash);
460    }
461    buf
462}
463
464/// Length-preserving AWS ARN replacement.
465/// Preserves `:` and `/` separators; replaces alphanumeric content
466/// in account/resource segments with deterministic hex.
467fn format_arn_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
468    let hex = hex_encode(hash);
469    let hex_bytes = hex.as_bytes();
470    let mut buf = String::with_capacity(target);
471    let mut hi = 0usize;
472    let mut had_content = false;
473
474    for ch in original.chars() {
475        if ch == ':' || ch == '/' {
476            buf.push(ch);
477        } else {
478            for _ in 0..ch.len_utf8() {
479                buf.push(hex_bytes[hi % 64] as char);
480                hi += 1;
481            }
482            had_content = true;
483        }
484    }
485    if !had_content {
486        return pad_or_truncate("", target, hash);
487    }
488    buf
489}
490
491/// Length-preserving Azure Resource ID replacement.
492/// Preserves `/` path separators and well-known Azure segment names
493/// (`subscriptions`, `resourceGroups`, `providers`, `resourcegroups`).
494/// Replaces variable segments (IDs, names) with deterministic hex.
495fn format_azure_resource_id_lp(hash: &[u8; 32], original: &str, target: usize) -> String {
496    const KNOWN_SEGMENTS: &[&str] = &[
497        "subscriptions",
498        "resourceGroups",
499        "resourcegroups",
500        "providers",
501    ];
502
503    let hex = hex_encode(hash);
504    let hex_bytes = hex.as_bytes();
505    let mut buf = String::with_capacity(target);
506    let mut hi = 0usize;
507
508    // Split on `/`, rebuild with deterministic replacement for non-known segments.
509    let parts: Vec<&str> = original.split('/').collect();
510    for (pi, part) in parts.iter().enumerate() {
511        if pi > 0 {
512            buf.push('/');
513        }
514        if part.is_empty() || KNOWN_SEGMENTS.contains(part) || part.contains('.') {
515            // Preserve empty segments (leading `/`), known names, and
516            // dotted provider names like `Microsoft.Compute`.
517            buf.push_str(part);
518        } else {
519            // Replace this segment character-by-character to preserve byte length.
520            for ch in part.chars() {
521                for _ in 0..ch.len_utf8() {
522                    buf.push(hex_bytes[hi % 64] as char);
523                    hi += 1;
524                }
525            }
526        }
527    }
528    if buf.len() != target {
529        return pad_or_truncate(&buf, target, hash);
530    }
531    buf
532}
533
/// Deterministic synthetic name from hash bytes.
///
/// Returns `"<First> <Last>"`, where `hash[0]` selects the first name and
/// `hash[1]` the last name (each modulo its table length).
fn format_name(hash: &[u8; 32]) -> String {
    // We use a small, fixed table of first/last name fragments.
    // The hash selects indices. This is NOT meant to be realistic — it's
    // meant to be obviously synthetic while remaining structurally plausible.
    const FIRST: &[&str] = &[
        "Alex", "Blake", "Casey", "Dana", "Ellis", "Finley", "Gray", "Harper", "Ira", "Jordan",
        "Kai", "Lane", "Morgan", "Noel", "Oakley", "Parker", "Quinn", "Reese", "Sage", "Taylor",
        "Uri", "Val", "Wren", "Xen", "Yael", "Zion", "Arden", "Blair", "Corin", "Drew", "Emery",
        "Frost",
    ];
    const LAST: &[&str] = &[
        "Ashford",
        "Blackwell",
        "Crawford",
        "Dalton",
        "Eastwood",
        "Fairbanks",
        "Garrison",
        "Hartley",
        "Irvine",
        "Jensen",
        "Kendrick",
        "Langley",
        "Mercer",
        "Newland",
        "Oakwood",
        "Preston",
        "Quinlan",
        "Redmond",
        "Shepard",
        "Thornton",
        "Underwood",
        "Vance",
        "Whitmore",
        "Xavier",
        "Yardley",
        "Zimmer",
        "Ashton",
        "Beckett",
        "Calloway",
        "Dempsey",
        "Eldridge",
        "Fletcher",
    ];
    let first = FIRST[usize::from(hash[0]) % FIRST.len()];
    let last = LAST[usize::from(hash[1]) % LAST.len()];
    format!("{first} {last}")
}
583
/// Hex-encode 32 bytes → 64-char lowercase string.
///
/// Each byte becomes two characters, high nibble first.
fn hex_encode(bytes: &[u8; 32]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    let mut hex = String::with_capacity(64);
    for &byte in bytes {
        hex.push(char::from(DIGITS[usize::from(byte >> 4)]));
        hex.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    hex
}
593
594// ---------------------------------------------------------------------------
595// Tests
596// ---------------------------------------------------------------------------
597
598#[cfg(test)]
599mod tests {
600    use super::*;
601
602    #[test]
603    fn hmac_deterministic_same_input() {
604        let gen = HmacGenerator::new([42u8; 32]);
605        let a = gen.generate(&Category::Email, "alice@corp.com");
606        let b = gen.generate(&Category::Email, "alice@corp.com");
607        assert_eq!(a, b, "same seed + same input must produce same output");
608    }
609
610    #[test]
611    fn hmac_different_inputs_differ() {
612        let gen = HmacGenerator::new([42u8; 32]);
613        let a = gen.generate(&Category::Email, "alice@corp.com");
614        let b = gen.generate(&Category::Email, "bob@corp.com");
615        assert_ne!(a, b);
616    }
617
618    #[test]
619    fn hmac_different_seeds_differ() {
620        let g1 = HmacGenerator::new([1u8; 32]);
621        let g2 = HmacGenerator::new([2u8; 32]);
622        let a = g1.generate(&Category::Email, "alice@corp.com");
623        let b = g2.generate(&Category::Email, "alice@corp.com");
624        assert_ne!(a, b);
625    }
626
627    #[test]
628    fn hmac_different_categories_differ() {
629        let gen = HmacGenerator::new([42u8; 32]);
630        let a = gen.generate(&Category::Email, "test");
631        let b = gen.generate(&Category::Name, "test");
632        assert_ne!(a, b, "different categories must produce different outputs");
633    }
634
635    #[test]
636    fn email_format() {
637        let gen = HmacGenerator::new([0u8; 32]);
638        let orig = "alice@corp.com";
639        let out = gen.generate(&Category::Email, orig);
640        assert!(out.contains('@'), "email must contain @");
641        assert!(out.ends_with("@corp.com"), "email must preserve domain");
642        assert_eq!(out.len(), orig.len(), "email must preserve length");
643    }
644
645    #[test]
646    fn ipv4_format() {
647        let gen = HmacGenerator::new([0u8; 32]);
648        let orig = "192.168.1.1";
649        let out = gen.generate(&Category::IpV4, orig);
650        // Dots preserved, length preserved.
651        let parts: Vec<&str> = out.split('.').collect();
652        assert_eq!(parts.len(), 4);
653        assert_eq!(out.len(), orig.len(), "ipv4 must preserve length");
654    }
655
656    #[test]
657    fn ssn_format() {
658        let gen = HmacGenerator::new([7u8; 32]);
659        let orig = "123-45-6789";
660        let out = gen.generate(&Category::Ssn, orig);
661        assert!(out.starts_with("000-"), "SSN must start with 000");
662        assert_eq!(out.len(), orig.len(), "SSN must preserve length");
663    }
664
665    #[test]
666    fn phone_format() {
667        let gen = HmacGenerator::new([3u8; 32]);
668        let orig = "+1-212-555-0100";
669        let out = gen.generate(&Category::Phone, orig);
670        // Formatting characters preserved.
671        assert!(out.starts_with('+'));
672        assert_eq!(
673            out.chars().filter(|c| *c == '-').count(),
674            orig.chars().filter(|c| *c == '-').count(),
675            "dashes must be preserved"
676        );
677        assert_eq!(out.len(), orig.len(), "phone must preserve length");
678    }
679
680    #[test]
681    fn hostname_format() {
682        let gen = HmacGenerator::new([5u8; 32]);
683        let orig = "db-prod-01.internal";
684        let out = gen.generate(&Category::Hostname, orig);
685        assert!(out.ends_with(".internal"), "hostname must preserve suffix");
686        assert_eq!(out.len(), orig.len(), "hostname must preserve length");
687    }
688
689    #[test]
690    fn custom_format() {
691        let gen = HmacGenerator::new([9u8; 32]);
692        let cat = Category::Custom("api_key".into());
693        // Use an input long enough for the __SANITIZED_..__ wrapper (>14 chars).
694        let orig = "sk-abc123-very-long-key";
695        let out = gen.generate(&cat, orig);
696        assert!(out.starts_with("__SANITIZED_"));
697        assert!(out.ends_with("__"));
698        assert_eq!(out.len(), orig.len(), "custom must preserve length");
699    }
700
701    #[test]
702    fn custom_format_short() {
703        let gen = HmacGenerator::new([9u8; 32]);
704        let cat = Category::Custom("api_key".into());
705        // Short input falls back to hex.
706        let orig = "sk-abc123";
707        let out = gen.generate(&cat, orig);
708        assert_eq!(
709            out.len(),
710            orig.len(),
711            "custom must preserve length even for short inputs"
712        );
713    }
714
715    #[test]
716    fn random_generator_produces_valid_format() {
717        let gen = RandomGenerator::new();
718        let orig = "test@example.com";
719        let out = gen.generate(&Category::Email, orig);
720        assert!(out.contains('@'));
721        assert_eq!(
722            out.len(),
723            orig.len(),
724            "random generator must preserve length"
725        );
726    }
727
728    #[test]
729    fn from_slice_rejects_bad_length() {
730        let result = HmacGenerator::from_slice(&[0u8; 16]);
731        assert!(result.is_err());
732    }
733
734    #[test]
735    fn credit_card_format() {
736        let gen = HmacGenerator::new([11u8; 32]);
737        let orig = "4111-1111-1111-1111";
738        let out = gen.generate(&Category::CreditCard, orig);
739        // Should be ####-####-####-####
740        let parts: Vec<&str> = out.split('-').collect();
741        assert_eq!(parts.len(), 4);
742        for part in &parts {
743            assert_eq!(part.len(), 4);
744            assert!(part.chars().all(|c| c.is_ascii_digit()));
745        }
746        assert_eq!(out.len(), orig.len(), "credit card must preserve length");
747    }
748
749    #[test]
750    fn name_format() {
751        let gen = HmacGenerator::new([0u8; 32]);
752        let orig = "John Doe";
753        let out = gen.generate(&Category::Name, orig);
754        assert_eq!(out.len(), orig.len(), "name must preserve length");
755    }
756
757    #[test]
758    fn ipv6_format() {
759        let gen = HmacGenerator::new([0u8; 32]);
760        let orig = "fd00:abcd:1234:5678::1";
761        let out = gen.generate(&Category::IpV6, orig);
762        // Colons and :: preserved, length preserved.
763        assert_eq!(
764            out.chars().filter(|c| *c == ':').count(),
765            orig.chars().filter(|c| *c == ':').count(),
766            "colons must be preserved"
767        );
768        assert_eq!(out.len(), orig.len(), "ipv6 must preserve length");
769    }
770
771    #[test]
772    fn length_preserved_all_categories() {
773        let gen = HmacGenerator::new([42u8; 32]);
774        let cases: Vec<(Category, &str)> = vec![
775            (Category::Email, "alice@corp.com"),
776            (Category::Name, "John Doe"),
777            (Category::Phone, "+1-212-555-0100"),
778            (Category::IpV4, "192.168.1.1"),
779            (Category::IpV6, "fd00::1"),
780            (Category::CreditCard, "4111-1111-1111-1111"),
781            (Category::Ssn, "123-45-6789"),
782            (Category::Hostname, "db-prod-01.internal"),
783            (Category::MacAddress, "AA:BB:CC:DD:EE:FF"),
784            (Category::ContainerId, "a1b2c3d4e5f6"),
785            (Category::Uuid, "550e8400-e29b-41d4-a716-446655440000"),
786            (Category::Jwt, "eyJhbGciOiJI.eyJzdWIiOiIx.SflKxwRJSMeK"),
787            (Category::AuthToken, "ghp_abc123secrettoken"),
788            (Category::FilePath, "/home/jsmith/config.yaml"),
789            (Category::WindowsSid, "S-1-5-21-3623811015-3361044348"),
790            (Category::Url, "https://internal.corp.com/api"),
791            (Category::AwsArn, "arn:aws:iam::123456789012:user/admin"),
792            (
793                Category::AzureResourceId,
794                "/subscriptions/550e8400/resourceGroups/rg-prod",
795            ),
796            (Category::Custom("key".into()), "some-secret-value-here"),
797        ];
798        for (cat, orig) in &cases {
799            let out = gen.generate(cat, orig);
800            assert_eq!(
801                out.len(),
802                orig.len(),
803                "length mismatch for {:?}: '{}' ({}) -> '{}' ({})",
804                cat,
805                orig,
806                orig.len(),
807                out,
808                out.len()
809            );
810        }
811    }
812
813    #[test]
814    fn mac_address_format() {
815        let gen = HmacGenerator::new([7u8; 32]);
816        let orig = "AA:BB:CC:DD:EE:FF";
817        let out = gen.generate(&Category::MacAddress, orig);
818        assert_eq!(out.len(), orig.len(), "mac must preserve length");
819        assert_eq!(
820            out.chars().filter(|c| *c == ':').count(),
821            5,
822            "mac must preserve colons"
823        );
824    }
825
826    #[test]
827    fn mac_address_dash_format() {
828        let gen = HmacGenerator::new([7u8; 32]);
829        let orig = "AA-BB-CC-DD-EE-FF";
830        let out = gen.generate(&Category::MacAddress, orig);
831        assert_eq!(out.len(), orig.len());
832        assert_eq!(out.chars().filter(|c| *c == '-').count(), 5);
833    }
834
835    #[test]
836    fn uuid_format() {
837        let gen = HmacGenerator::new([3u8; 32]);
838        let orig = "550e8400-e29b-41d4-a716-446655440000";
839        let out = gen.generate(&Category::Uuid, orig);
840        assert_eq!(out.len(), orig.len(), "uuid must preserve length");
841        assert_eq!(
842            out.chars().filter(|c| *c == '-').count(),
843            4,
844            "uuid must preserve dashes"
845        );
846    }
847
848    #[test]
849    fn container_id_format() {
850        let gen = HmacGenerator::new([5u8; 32]);
851        let orig = "a1b2c3d4e5f6";
852        let out = gen.generate(&Category::ContainerId, orig);
853        assert_eq!(out.len(), orig.len(), "container id must preserve length");
854        assert!(out.chars().all(|c| c.is_ascii_hexdigit()));
855    }
856
857    #[test]
858    fn jwt_format() {
859        let gen = HmacGenerator::new([11u8; 32]);
860        let orig = "eyJhbGciOiJI.eyJzdWIiOiIx.SflKxwRJSMeK";
861        let out = gen.generate(&Category::Jwt, orig);
862        assert_eq!(out.len(), orig.len(), "jwt must preserve length");
863        let orig_dots = orig.chars().filter(|c| *c == '.').count();
864        let out_dots = out.chars().filter(|c| *c == '.').count();
865        assert_eq!(out_dots, orig_dots, "jwt must preserve dots");
866    }
867
868    #[test]
869    fn auth_token_format() {
870        let gen = HmacGenerator::new([9u8; 32]);
871        let orig = "ghp_abc123secrettoken";
872        let out = gen.generate(&Category::AuthToken, orig);
873        assert!(out.starts_with("__SANITIZED_"));
874        assert!(out.ends_with("__"));
875        assert_eq!(out.len(), orig.len(), "auth_token must preserve length");
876    }
877
878    #[test]
879    fn filepath_unix_format() {
880        let gen = HmacGenerator::new([13u8; 32]);
881        let orig = "/home/jsmith/config.yaml";
882        let out = gen.generate(&Category::FilePath, orig);
883        assert_eq!(out.len(), orig.len(), "filepath must preserve length");
884        assert_eq!(
885            std::path::Path::new(&out)
886                .extension()
887                .and_then(|e| e.to_str()),
888            Some("yaml"),
889            "filepath must preserve extension"
890        );
891        assert_eq!(
892            out.chars().filter(|c| *c == '/').count(),
893            orig.chars().filter(|c| *c == '/').count(),
894            "filepath must preserve separators"
895        );
896    }
897
898    #[test]
899    fn filepath_windows_format() {
900        let gen = HmacGenerator::new([13u8; 32]);
901        let orig = "C:\\Users\\admin\\secrets.txt";
902        let out = gen.generate(&Category::FilePath, orig);
903        assert_eq!(out.len(), orig.len(), "filepath must preserve length");
904        assert_eq!(
905            std::path::Path::new(&out)
906                .extension()
907                .and_then(|e| e.to_str()),
908            Some("txt"),
909            "filepath must preserve extension"
910        );
911        assert_eq!(
912            out.chars().filter(|c| *c == '\\').count(),
913            orig.chars().filter(|c| *c == '\\').count(),
914            "filepath must preserve backslashes"
915        );
916    }
917
918    #[test]
919    fn windows_sid_format() {
920        let gen = HmacGenerator::new([7u8; 32]);
921        let orig = "S-1-5-21-3623811015-3361044348-30300820-1013";
922        let out = gen.generate(&Category::WindowsSid, orig);
923        assert_eq!(out.len(), orig.len(), "SID must preserve length");
924        assert!(out.starts_with("S-"), "SID must start with S-");
925        assert_eq!(
926            out.chars().filter(|c| *c == '-').count(),
927            orig.chars().filter(|c| *c == '-').count(),
928            "SID must preserve dashes"
929        );
930    }
931
932    #[test]
933    fn url_format() {
934        let gen = HmacGenerator::new([5u8; 32]);
935        let orig = "https://internal.corp.com/api/users?token=abc123";
936        let out = gen.generate(&Category::Url, orig);
937        assert_eq!(out.len(), orig.len(), "url must preserve length");
938        // Structural characters preserved.
939        assert!(out.contains("://"));
940        assert!(out.contains('?'));
941        assert!(out.contains('='));
942    }
943
944    #[test]
945    fn aws_arn_format() {
946        let gen = HmacGenerator::new([3u8; 32]);
947        let orig = "arn:aws:iam::123456789012:user/admin";
948        let out = gen.generate(&Category::AwsArn, orig);
949        assert_eq!(out.len(), orig.len(), "ARN must preserve length");
950        assert_eq!(
951            out.chars().filter(|c| *c == ':').count(),
952            orig.chars().filter(|c| *c == ':').count(),
953            "ARN must preserve colons"
954        );
955        assert!(out.contains('/'), "ARN must preserve slash");
956    }
957
958    #[test]
959    fn azure_resource_id_format() {
960        let gen = HmacGenerator::new([11u8; 32]);
961        let orig = "/subscriptions/550e8400-e29b/resourceGroups/rg-prod/providers/Microsoft.Compute/virtualMachines/vm-01";
962        let out = gen.generate(&Category::AzureResourceId, orig);
963        assert_eq!(
964            out.len(),
965            orig.len(),
966            "Azure resource ID must preserve length"
967        );
968        assert!(
969            out.contains("/subscriptions/"),
970            "must preserve 'subscriptions'"
971        );
972        assert!(
973            out.contains("/resourceGroups/"),
974            "must preserve 'resourceGroups'"
975        );
976        assert!(out.contains("/providers/"), "must preserve 'providers'");
977        assert!(
978            out.contains("Microsoft.Compute"),
979            "must preserve dotted provider name"
980        );
981    }
982}