1use std::collections::HashSet;
26
27#[derive(Debug, Clone, PartialEq, Eq)]
29pub struct SecurityAnalysis {
30 pub has_invisible_chars: bool,
32 pub has_bidi_overrides: bool,
34 pub has_mixed_scripts: bool,
36 pub has_confusables: bool,
38 pub invisible_chars: Vec<(usize, char, &'static str)>,
40 pub bidi_chars: Vec<(usize, char, &'static str)>,
42 pub scripts: HashSet<Script>,
44 pub risk_level: RiskLevel,
46}
47
/// Writing system a character is attributed to by [`get_script`].
///
/// The type is `Copy` and hashable so values can be stored in a `HashSet`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Script {
    /// Latin letters (and, in `get_script`, ASCII digits and punctuation).
    Latin,
    /// Cyrillic letters.
    Cyrillic,
    /// Greek letters.
    Greek,
    /// Arabic block characters.
    Arabic,
    /// Hebrew block characters.
    Hebrew,
    /// CJK unified ideographs.
    Chinese,
    /// Hiragana and Katakana.
    Japanese,
    /// Hangul syllables.
    Korean,
    /// Thai block characters.
    Thai,
    /// Devanagari block characters.
    Devanagari,
    /// Anything not covered above; the payload is a numeric tag
    /// (`get_script` stores the character's code point here, so two
    /// different unclassified characters compare as different scripts).
    Other(u32),
}
63
/// Overall severity assigned to an analysed text.
///
/// The derived ordering follows declaration order, so
/// `Low < Medium < High < Critical` and levels can be compared with `>=`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum RiskLevel {
    /// Nothing suspicious was found.
    Low,
    /// Minor indicators were found.
    Medium,
    /// Strong indicators were found.
    High,
    /// Several strong indicators were found together.
    Critical,
}
76
77pub fn analyze_text(text: &str) -> SecurityAnalysis {
100 let mut analysis = SecurityAnalysis {
101 has_invisible_chars: false,
102 has_bidi_overrides: false,
103 has_mixed_scripts: false,
104 has_confusables: false,
105 invisible_chars: Vec::new(),
106 bidi_chars: Vec::new(),
107 scripts: HashSet::new(),
108 risk_level: RiskLevel::Low,
109 };
110
111 for (pos, ch) in text.char_indices() {
112 if is_invisible_char(ch) {
114 analysis.has_invisible_chars = true;
115 analysis.invisible_chars.push((pos, ch, get_char_description(ch)));
116 }
117
118 if is_bidi_char(ch) {
120 analysis.has_bidi_overrides = true;
121 analysis.bidi_chars.push((pos, ch, get_char_description(ch)));
122 }
123
124 let script = get_script(ch);
126 analysis.scripts.insert(script);
127
128 if is_confusable_char(ch) {
130 analysis.has_confusables = true;
131 }
132 }
133
134 let non_latin_scripts: Vec<_> = analysis.scripts.iter()
138 .filter(|s| !matches!(s, Script::Latin))
139 .collect();
140
141 analysis.has_mixed_scripts = non_latin_scripts.len() > 1
142 || (non_latin_scripts.len() == 1 && analysis.scripts.contains(&Script::Latin));
143
144 analysis.risk_level = calculate_risk_level(&analysis);
146
147 analysis
148}
149
/// Returns `true` when `ch` is a character that normally renders as nothing
/// (zero-width characters, fillers, invisible operators and formatting
/// controls).
///
/// All bidirectional controls recognised by `is_bidi_char` are included,
/// among them the isolate controls U+2066–U+2069, which are just as
/// invisible as the embedding and override controls.
pub fn is_invisible_char(ch: char) -> bool {
    matches!(
        ch,
        '\u{00AD}' // soft hyphen
            | '\u{034F}' // combining grapheme joiner
            | '\u{061C}' // Arabic letter mark
            | '\u{115F}'..='\u{1160}' // Hangul choseong / jungseong fillers
            | '\u{17B4}'..='\u{17B5}' // Khmer inherent vowels
            | '\u{180E}' // Mongolian vowel separator
            | '\u{200B}'..='\u{200F}' // zero-width space/joiners, LRM, RLM
            | '\u{202A}'..='\u{202E}' // bidi embeddings, overrides, pop
            | '\u{2060}'..='\u{2064}' // word joiner, invisible math operators
            | '\u{2066}'..='\u{206F}' // bidi isolates, deprecated format controls
            | '\u{3164}' // Hangul filler
            | '\u{FEFF}' // zero-width no-break space (BOM)
            | '\u{FFA0}' // halfwidth Hangul filler
            | '\u{1D159}' // musical symbol null notehead
            | '\u{1D173}'..='\u{1D17A}' // musical format controls
    )
}
196
/// Returns `true` when `ch` is one of the Unicode bidirectional control
/// characters: the Arabic letter mark, the directional marks, the
/// embedding/override controls and the isolate controls.
pub fn is_bidi_char(ch: char) -> bool {
    matches!(
        ch,
        '\u{061C}' // Arabic letter mark
            | '\u{200E}'..='\u{200F}' // LRM, RLM
            | '\u{202A}'..='\u{202E}' // LRE, RLE, PDF, LRO, RLO
            | '\u{2066}'..='\u{2069}' // LRI, RLI, FSI, PDI
    )
}
214
/// Returns `true` when `ch` belongs to the built-in list of characters that
/// are treated as look-alikes of other letters.
///
/// Three groups are checked: a fixed set of Cyrillic letters, the lowercase
/// Greek letters alpha through omega (final sigma, U+03C2, is not part of
/// the list), and the mathematical bold capitals A through Z.
pub fn is_confusable_char(ch: char) -> bool {
    // NOTE: every literal in this first pattern is a Cyrillic letter, even
    // where the glyph is indistinguishable from a Latin one.
    let cyrillic_lookalike = matches!(
        ch,
        'а' | 'е' | 'о' | 'р' | 'с' | 'у' | 'х'
            | 'А' | 'В' | 'Е' | 'К' | 'М' | 'Н' | 'О' | 'Р' | 'С' | 'Т' | 'У' | 'Х'
    );
    // The range is split around U+03C2 (final sigma), which is excluded.
    let greek_lowercase = matches!(ch, 'α'..='ρ' | 'σ'..='ω');
    let math_bold_capital = matches!(ch, '𝐀'..='𝐙');

    cyrillic_lookalike || greek_lowercase || math_bold_capital
}
227
228pub fn get_script(ch: char) -> Script {
230 match ch {
231 'A'..='Z' | 'a'..='z' => Script::Latin,
232 'А'..='я' | 'Ё' | 'ё' => Script::Cyrillic,
233 'Α'..='ω' => Script::Greek,
234 '\u{0600}'..='\u{06FF}' => Script::Arabic,
235 '\u{0590}'..='\u{05FF}' => Script::Hebrew,
236 '\u{4E00}'..='\u{9FFF}' => Script::Chinese,
237 '\u{3040}'..='\u{309F}' | '\u{30A0}'..='\u{30FF}' => Script::Japanese,
238 '\u{AC00}'..='\u{D7AF}' => Script::Korean,
239 '\u{0E00}'..='\u{0E7F}' => Script::Thai,
240 '\u{0900}'..='\u{097F}' => Script::Devanagari,
241 '0'..='9' | ' ' | '\t' | '\n' | '\r' | '!' | '?' | '.' | ',' | ';' | ':' |
243 '"' | '\'' | '(' | ')' | '[' | ']' | '{' | '}' | '-' | '_' | '=' | '+' |
244 '*' | '/' | '\\' | '|' | '@' | '#' | '$' | '%' | '^' | '&' | '~' | '`' => Script::Latin,
245 _ => Script::Other(ch as u32),
246 }
247}
248
/// Returns a human-readable name for an invisible or bidirectional control
/// character.
///
/// Every character accepted by `is_invisible_char` or `is_bidi_char` has an
/// entry, using its Unicode character name. Any other character yields
/// `"Unknown Special Character"`.
pub fn get_char_description(ch: char) -> &'static str {
    match ch {
        '\u{00AD}' => "Soft Hyphen",
        '\u{034F}' => "Combining Grapheme Joiner",
        '\u{061C}' => "Arabic Letter Mark",
        '\u{115F}' => "Hangul Choseong Filler",
        '\u{1160}' => "Hangul Jungseong Filler",
        '\u{17B4}' => "Khmer Vowel Inherent Aq",
        '\u{17B5}' => "Khmer Vowel Inherent Aa",
        '\u{180E}' => "Mongolian Vowel Separator",
        '\u{200B}' => "Zero Width Space",
        '\u{200C}' => "Zero Width Non-Joiner",
        '\u{200D}' => "Zero Width Joiner",
        '\u{200E}' => "Left-to-Right Mark",
        '\u{200F}' => "Right-to-Left Mark",
        '\u{202A}' => "Left-to-Right Embedding",
        '\u{202B}' => "Right-to-Left Embedding",
        '\u{202C}' => "Pop Directional Formatting",
        '\u{202D}' => "Left-to-Right Override",
        '\u{202E}' => "Right-to-Left Override",
        '\u{2060}' => "Word Joiner",
        '\u{2061}' => "Function Application",
        '\u{2062}' => "Invisible Times",
        '\u{2063}' => "Invisible Separator",
        '\u{2064}' => "Invisible Plus",
        '\u{2066}' => "Left-to-Right Isolate",
        '\u{2067}' => "Right-to-Left Isolate",
        '\u{2068}' => "First Strong Isolate",
        '\u{2069}' => "Pop Directional Isolate",
        '\u{206A}' => "Inhibit Symmetric Swapping",
        '\u{206B}' => "Activate Symmetric Swapping",
        '\u{206C}' => "Inhibit Arabic Form Shaping",
        '\u{206D}' => "Activate Arabic Form Shaping",
        '\u{206E}' => "National Digit Shapes",
        '\u{206F}' => "Nominal Digit Shapes",
        '\u{3164}' => "Hangul Filler",
        '\u{FEFF}' => "Zero Width No-Break Space (BOM)",
        '\u{FFA0}' => "Halfwidth Hangul Filler",
        '\u{1D159}' => "Musical Symbol Null Notehead",
        '\u{1D173}' => "Musical Symbol Begin Beam",
        '\u{1D174}' => "Musical Symbol End Beam",
        '\u{1D175}' => "Musical Symbol Begin Tie",
        '\u{1D176}' => "Musical Symbol End Tie",
        '\u{1D177}' => "Musical Symbol Begin Slur",
        '\u{1D178}' => "Musical Symbol End Slur",
        '\u{1D179}' => "Musical Symbol Begin Phrase",
        '\u{1D17A}' => "Musical Symbol End Phrase",
        _ => "Unknown Special Character",
    }
}
268
269fn calculate_risk_level(analysis: &SecurityAnalysis) -> RiskLevel {
271 let mut score = 0;
272
273 if analysis.has_invisible_chars {
274 score += 3;
275 }
276 if analysis.has_bidi_overrides {
277 score += 4;
278 }
279 if analysis.has_mixed_scripts {
280 score += 2;
281 }
282 if analysis.has_confusables {
283 score += 2;
284 }
285
286 if analysis.invisible_chars.len() > 3 {
288 score += 2;
289 }
290 if analysis.bidi_chars.len() > 1 {
291 score += 2;
292 }
293
294 match score {
295 0 => RiskLevel::Low,
296 1..=3 => RiskLevel::Medium,
297 4..=6 => RiskLevel::High,
298 _ => RiskLevel::Critical,
299 }
300}
301
302pub fn sanitize_text(text: &str) -> String {
314 text.chars()
315 .filter(|&ch| !is_invisible_char(ch) && !is_bidi_char(ch))
316 .collect()
317}
318
319pub fn generate_security_report(text: &str) -> String {
330 let analysis = analyze_text(text);
331 let mut report = String::new();
332
333 report.push_str(&format!("Unicode Security Analysis\n"));
334 report.push_str(&format!("========================\n\n"));
335 report.push_str(&format!("Risk Level: {:?}\n\n", analysis.risk_level));
336
337 if analysis.has_invisible_chars {
338 report.push_str("⚠️ INVISIBLE CHARACTERS DETECTED:\n");
339 for (pos, ch, desc) in &analysis.invisible_chars {
340 report.push_str(&format!(" Position {}: U+{:04X} ({})\n", pos, *ch as u32, desc));
341 }
342 report.push('\n');
343 }
344
345 if analysis.has_bidi_overrides {
346 report.push_str("⚠️ BIDIRECTIONAL OVERRIDE CHARACTERS DETECTED:\n");
347 for (pos, ch, desc) in &analysis.bidi_chars {
348 report.push_str(&format!(" Position {}: U+{:04X} ({})\n", pos, *ch as u32, desc));
349 }
350 report.push('\n');
351 }
352
353 if analysis.has_mixed_scripts {
354 report.push_str("⚠️ MIXED SCRIPTS DETECTED (Potential Homograph Attack):\n");
355 for script in &analysis.scripts {
356 report.push_str(&format!(" {:?}\n", script));
357 }
358 report.push('\n');
359 }
360
361 if analysis.has_confusables {
362 report.push_str("⚠️ CONFUSABLE CHARACTERS DETECTED\n\n");
363 }
364
365 if analysis.risk_level == RiskLevel::Low {
366 report.push_str("✅ No security concerns detected.\n");
367 }
368
369 report
370}
371
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII text raises no flags and is rated `Low`.
    #[test]
    fn test_safe_text() {
        let result = analyze_text("Hello World");
        assert!(!result.has_invisible_chars);
        assert!(!result.has_bidi_overrides);
        assert!(!result.has_confusables);
        assert_eq!(result.risk_level, RiskLevel::Low);
    }

    /// A single zero-width space is reported once and rated at least `High`.
    #[test]
    fn test_invisible_characters() {
        let result = analyze_text("Hello\u{200B}World");
        assert!(result.has_invisible_chars);

        let found: Vec<char> = result.invisible_chars.iter().map(|&(_, ch, _)| ch).collect();
        assert_eq!(found, vec!['\u{200B}']);
        assert!(result.risk_level >= RiskLevel::High);
    }

    /// A right-to-left override inside a file name is rated `Critical`.
    #[test]
    fn test_bidi_override() {
        let result = analyze_text("filename\u{202E}gpj.exe");
        assert!(result.has_bidi_overrides);

        let found: Vec<char> = result.bidi_chars.iter().map(|&(_, ch, _)| ch).collect();
        assert_eq!(found, vec!['\u{202E}']);
        assert_eq!(result.risk_level, RiskLevel::Critical);
    }

    /// Cyrillic look-alikes followed by an ASCII suffix count as mixed scripts.
    #[test]
    fn test_mixed_scripts() {
        // The part before ".com" is written with Cyrillic letters.
        let result = analyze_text("раураӏ.com");
        assert!(result.has_mixed_scripts);
        assert!(result.scripts.len() >= 2);
        assert!(result.risk_level >= RiskLevel::High);
    }

    /// Sanitising strips the control characters and the result is rated `Low`.
    #[test]
    fn test_sanitization() {
        let cleaned = sanitize_text("Hello\u{200B}World\u{202E}Test");
        assert_eq!(cleaned, "HelloWorldTest");
        assert_eq!(analyze_text(&cleaned).risk_level, RiskLevel::Low);
    }

    /// Spot checks for the three character predicates.
    #[test]
    fn test_character_detection() {
        assert!(is_invisible_char('\u{200B}'));
        assert!(is_invisible_char('\u{FEFF}'));
        assert!(!is_invisible_char('a'));

        assert!(is_bidi_char('\u{202E}'));
        assert!(is_bidi_char('\u{200F}'));
        assert!(!is_bidi_char('a'));

        // U+0430 is Cyrillic "a"; the Latin letter must not be reported.
        assert!(is_confusable_char('\u{0430}'));
        assert!(is_confusable_char('α'));
        assert!(!is_confusable_char('a'));
    }

    /// One representative character per script.
    #[test]
    fn test_script_detection() {
        assert_eq!(get_script('a'), Script::Latin);
        // U+0410 is Cyrillic capital "A".
        assert_eq!(get_script('\u{0410}'), Script::Cyrillic);
        assert_eq!(get_script('α'), Script::Greek);
        assert_eq!(get_script('世'), Script::Chinese);
    }

    /// Risk levels for a clean, an invisible-character and a bidi input.
    #[test]
    fn test_risk_calculation() {
        assert_eq!(analyze_text("Hello World").risk_level, RiskLevel::Low);
        assert!(analyze_text("Hello\u{200B}World").risk_level >= RiskLevel::High);
        assert_eq!(analyze_text("test\u{202E}evil").risk_level, RiskLevel::Critical);
    }

    /// The report names the section, the code point and the description.
    #[test]
    fn test_security_report() {
        let report = generate_security_report("Hello\u{200B}World");
        let expected_fragments = ["INVISIBLE CHARACTERS DETECTED", "U+200B", "Zero Width Space"];
        for fragment in expected_fragments.iter() {
            assert!(report.contains(fragment));
        }
    }
}