1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
use keyhog_scanner::unicode_hardening::*;
/// Proving test: Multiple zero-width characters are all removed.
/// Contract: "token=abcd\u{200B}\u{200C}\u{200D}efgh" normalizes to "token=abcdefgh".
#[test]
fn normalize_removes_multiple_zero_width_characters() {
// Mix of three different zero-width evasion characters:
// U+200B = Zero-Width Space
// U+200C = Zero-Width Non-Joiner
// U+200D = Zero-Width Joiner
let text = "token=abcd\u{200B}\u{200C}\u{200D}efgh";
let normalized = normalize_homoglyphs(text);
// All zero-width characters must be removed
assert_eq!(
normalized.as_ref(),
"token=abcdefgh",
"All three zero-width characters must be stripped; got: {normalized:?}"
);
// Result must not contain any of the zero-width characters
assert!(!normalized.contains('\u{200B}'));
assert!(!normalized.contains('\u{200C}'));
assert!(!normalized.contains('\u{200D}'));
assert!(normalized.is_ascii());
}
/// Proving test: Zero-width joiner (ZWJ) in credential body is removed.
/// Contract: normalize_homoglyphs("ghp_abc\u{200D}def") → "ghp_abcdef".
#[test]
fn normalize_removes_zero_width_joiner() {
// Zero-Width Joiner (U+200D) between credential parts
let text = "export PAT=\"ghp_\u{200D}abcdefghijklmnopqrstuvwxyz1234567890ab\"";
let normalized = normalize_homoglyphs(text);
// ZWJ must be removed, preserving the credential structure
assert!(
normalized.contains("ghp_abcdefghij"),
"ZWJ (U+200D) must be removed; got: {normalized:?}"
);
assert!(!normalized.contains('\u{200D}'));
}
/// Proving test: BOM (U+FEFF) zero-width no-break space is removed.
/// Contract: Text starting with BOM normalizes without it.
#[test]
fn normalize_removes_bom_zero_width_no_break_space() {
// U+FEFF can appear at start of file or embedded
let text = "\u{FEFF}ghp_abcdefghijklmnopqrstuvwxyz1234567890ab";
let normalized = normalize_homoglyphs(text);
// BOM must be stripped
assert!(
normalized.starts_with("ghp_"),
"BOM (U+FEFF) must be removed from start; got: {normalized:?}"
);
assert!(!normalized.contains('\u{FEFF}'));
}
/// Proving test: Word Joiner (U+2060) is removed.
/// Contract: normalize_homoglyphs with U+2060 removes it.
#[test]
fn normalize_removes_word_joiner() {
// U+2060 Word Joiner
let text = "token\u{2060}=abcd1234567890ab";
let normalized = normalize_homoglyphs(text);
// Word joiner must be removed
assert!(
normalized.contains("token=abcd"),
"Word Joiner (U+2060) must be removed; got: {normalized:?}"
);
assert!(!normalized.contains('\u{2060}'));
}
/// Proving test: Directional marks (LTR/RTL marks) are removed.
/// Contract: "text\u{200E}more" normalizes to "textmore".
#[test]
fn normalize_removes_directional_marks() {
// U+200E = Left-to-Right Mark
// U+200F = Right-to-Left Mark
let text = "ghp_abc\u{200E}def\u{200F}ghi";
let normalized = normalize_homoglyphs(text);
// Directional marks must be removed
assert!(
normalized.contains("ghp_abcdefghi"),
"Directional marks (U+200E/U+200F) must be removed; got: {normalized:?}"
);
assert!(!normalized.contains('\u{200E}'));
assert!(!normalized.contains('\u{200F}'));
}
/// Proving test: Directional isolate characters (U+2066-U+2069) are removed.
/// Contract: Text with LRI, RLI, FSI, PDI all strip cleanly.
#[test]
fn normalize_removes_directional_isolates() {
// U+2066 = Left-to-Right Isolate
// U+2067 = Right-to-Left Isolate
// U+2068 = First Strong Isolate
// U+2069 = Pop Directional Isolate
let text = "sk_\u{2066}live\u{2067}secret\u{2068}key\u{2069}end";
let normalized = normalize_homoglyphs(text);
// All isolate characters must be removed
assert!(
normalized.contains("sk_livesecretkeyend"),
"Directional isolates (U+2066-U+2069) must be removed; got: {normalized:?}"
);
assert!(!normalized.contains('\u{2066}'));
assert!(!normalized.contains('\u{2067}'));
assert!(!normalized.contains('\u{2068}'));
assert!(!normalized.contains('\u{2069}'));
}
/// Proving test: Soft hyphen (U+00AD) adjacent to alphanumeric is removed.
/// Contract: "secret\u{00AD}123" normalizes to "secret123" (not "secret-123").
#[test]
fn normalize_removes_soft_hyphen_not_promotes_to_hyphen() {
// U+00AD Soft Hyphen
let text = "password\u{00AD}1234567890";
let normalized = normalize_homoglyphs(text);
// Soft hyphen must be removed, not converted to '-'
assert_eq!(
normalized.as_ref(),
"password1234567890",
"Soft hyphen (U+00AD) must be removed, not converted; got: {normalized:?}"
);
assert!(!normalized.contains('-'));
}