unicode_rs/unicode/
security.rs

1//! Unicode security utilities
2//!
3//! This module provides utilities to detect and analyze potentially dangerous Unicode characters
4//! that could be used in security attacks such as:
5//! - Homograph attacks (visually similar characters)
6//! - Invisible character injection
7//! - Bidirectional text attacks
8//! - Mixed script attacks
9//!
10//! # Examples
11//!
12//! ```rust
13//! use unicode_rs::security::*;
14//!
15//! // Check for suspicious characters
16//! let result = analyze_text("Hello\u{200B}World"); // Zero-width space
17//! assert!(result.has_invisible_chars);
18//!
19//! // Detect homograph attacks
20//! let suspicious = "раураӏ.com"; // Cyrillic characters that look like "paypal.com"
21//! let analysis = analyze_text(suspicious);
22//! assert!(analysis.has_mixed_scripts);
23//! ```
24
25use std::collections::HashSet;
26
27/// Security analysis result for Unicode text
28#[derive(Debug, Clone, PartialEq, Eq)]
29pub struct SecurityAnalysis {
30    /// Contains invisible or zero-width characters
31    pub has_invisible_chars: bool,
32    /// Contains bidirectional override characters
33    pub has_bidi_overrides: bool,
34    /// Contains mixed scripts (potential homograph attack)
35    pub has_mixed_scripts: bool,
36    /// Contains confusable characters
37    pub has_confusables: bool,
38    /// List of detected invisible characters with their positions
39    pub invisible_chars: Vec<(usize, char, &'static str)>,
40    /// List of detected bidirectional characters with their positions
41    pub bidi_chars: Vec<(usize, char, &'static str)>,
42    /// Set of detected scripts
43    pub scripts: HashSet<Script>,
44    /// Overall risk level
45    pub risk_level: RiskLevel,
46}
47
/// Coarse script categories used to spot text that mixes writing systems.
///
/// This is a simplified classification, not the full Unicode `Script`
/// property: only the scripts listed here are recognised by name.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Script {
    /// Latin letters (also used for ASCII digits, whitespace and punctuation).
    Latin,
    /// Cyrillic letters.
    Cyrillic,
    /// Greek letters.
    Greek,
    /// Arabic block.
    Arabic,
    /// Hebrew block.
    Hebrew,
    /// CJK unified ideographs.
    Chinese,
    /// Hiragana and Katakana.
    Japanese,
    /// Hangul syllables.
    Korean,
    /// Thai block.
    Thai,
    /// Devanagari block.
    Devanagari,
    /// Any character not covered above.
    ///
    /// The payload is the character's Unicode code point (that is what
    /// `get_script` stores), not a script identifier, so two different
    /// unclassified characters compare as two different `Script` values.
    Other(u32),
}
63
/// Severity assigned to an analysed text.
///
/// Variants are declared from least to most severe, so the derived ordering
/// allows comparisons such as `level >= RiskLevel::High`.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
pub enum RiskLevel {
    /// Nothing suspicious was found.
    Low,
    /// Small findings that are probably harmless.
    Medium,
    /// Findings that deserve attention.
    High,
    /// Findings that indicate a serious risk.
    Critical,
}
76
77/// Analyze text for Unicode security issues
78///
79/// This function performs a comprehensive analysis of the input text to detect
80/// various Unicode-based security vulnerabilities.
81///
82/// # Arguments
83///
84/// * `text` - The text to analyze
85///
86/// # Examples
87///
88/// ```rust
89/// use unicode_rs::security::*;
90///
91/// // Safe text
92/// let safe = analyze_text("Hello World");
93/// assert_eq!(safe.risk_level, RiskLevel::Low);
94///
95/// // Text with invisible characters
96/// let suspicious = analyze_text("Hello\u{200B}World");
97/// assert_eq!(suspicious.risk_level, RiskLevel::High);
98/// ```
99pub fn analyze_text(text: &str) -> SecurityAnalysis {
100    let mut analysis = SecurityAnalysis {
101        has_invisible_chars: false,
102        has_bidi_overrides: false,
103        has_mixed_scripts: false,
104        has_confusables: false,
105        invisible_chars: Vec::new(),
106        bidi_chars: Vec::new(),
107        scripts: HashSet::new(),
108        risk_level: RiskLevel::Low,
109    };
110
111    for (pos, ch) in text.char_indices() {
112        // Check for invisible characters
113        if is_invisible_char(ch) {
114            analysis.has_invisible_chars = true;
115            analysis.invisible_chars.push((pos, ch, get_char_description(ch)));
116        }
117
118        // Check for bidirectional override characters
119        if is_bidi_char(ch) {
120            analysis.has_bidi_overrides = true;
121            analysis.bidi_chars.push((pos, ch, get_char_description(ch)));
122        }
123
124        // Collect scripts
125        let script = get_script(ch);
126        analysis.scripts.insert(script);
127
128        // Check for confusable characters
129        if is_confusable_char(ch) {
130            analysis.has_confusables = true;
131        }
132    }
133
134    // Determine if mixed scripts (potential homograph attack)
135    // Only consider it mixed scripts if we have non-Latin scripts mixed with Latin,
136    // or multiple non-Latin scripts
137    let non_latin_scripts: Vec<_> = analysis.scripts.iter()
138        .filter(|s| !matches!(s, Script::Latin))
139        .collect();
140
141    analysis.has_mixed_scripts = non_latin_scripts.len() > 1
142        || (non_latin_scripts.len() == 1 && analysis.scripts.contains(&Script::Latin));
143
144    // Calculate risk level
145    analysis.risk_level = calculate_risk_level(&analysis);
146
147    analysis
148}
149
/// Check if a character is invisible or zero-width
///
/// Covers soft hyphens, zero-width spaces and joiners, directional marks,
/// embeddings, overrides and isolates, invisible math operators, deprecated
/// format controls, Hangul fillers and invisible musical format symbols.
///
/// The directional isolates U+2066–U+2069 are included so that every
/// character accepted by `is_bidi_char` is also reported as invisible.
///
/// # Arguments
///
/// * `ch` - The character to classify
///
/// # Returns
///
/// `true` if `ch` is one of the listed invisible characters.
pub fn is_invisible_char(ch: char) -> bool {
    matches!(ch,
        '\u{00AD}'                  // Soft hyphen
        | '\u{034F}'                // Combining grapheme joiner
        | '\u{061C}'                // Arabic letter mark
        | '\u{115F}'                // Hangul choseong filler
        | '\u{1160}'                // Hangul jungseong filler
        | '\u{17B4}'                // Khmer vowel inherent AQ
        | '\u{17B5}'                // Khmer vowel inherent AA
        | '\u{180E}'                // Mongolian vowel separator
        | '\u{200B}'..='\u{200F}'   // Zero width space/non-joiner/joiner, LRM, RLM
        | '\u{202A}'..='\u{202E}'   // Directional embeddings, pop, overrides
        | '\u{2060}'..='\u{2064}'   // Word joiner and invisible math operators
        | '\u{2066}'..='\u{2069}'   // Directional isolates and pop isolate
        | '\u{206A}'..='\u{206F}'   // Deprecated swapping/shaping/digit-shape controls
        | '\u{3164}'                // Hangul filler
        | '\u{FEFF}'                // Zero width no-break space (BOM)
        | '\u{FFA0}'                // Halfwidth Hangul filler
        | '\u{1D159}'               // Musical symbol null notehead
        | '\u{1D173}'..='\u{1D17A}' // Musical begin/end beam, tie, slur, phrase
    )
}
196
/// Report whether `ch` is a bidirectional formatting control.
///
/// Recognised controls are the Arabic letter mark, the left-to-right and
/// right-to-left marks, the embedding/override group with its pop character,
/// and the isolate group with its pop character.
pub fn is_bidi_char(ch: char) -> bool {
    match ch {
        // Arabic letter mark
        '\u{061C}' => true,
        // Left-to-right mark, right-to-left mark
        '\u{200E}'..='\u{200F}' => true,
        // LRE, RLE, PDF, LRO, RLO
        '\u{202A}'..='\u{202E}' => true,
        // LRI, RLI, FSI, PDI
        '\u{2066}'..='\u{2069}' => true,
        _ => false,
    }
}
214
/// Report whether `ch` belongs to a small, hand-picked set of characters that
/// are often used to imitate Latin letters.
///
/// The set is deliberately simplified: selected Cyrillic letters, the Greek
/// lowercase letters alpha through omega (without final sigma), and the
/// mathematical bold capitals.
pub fn is_confusable_char(ch: char) -> bool {
    match ch {
        // Cyrillic lowercase: а е о р с у х
        '\u{0430}' | '\u{0435}' | '\u{043E}' | '\u{0440}' | '\u{0441}' | '\u{0443}'
        | '\u{0445}' => true,
        // Cyrillic uppercase: А В Е К М Н О Р С Т У Х
        '\u{0410}' | '\u{0412}' | '\u{0415}' | '\u{041A}' | '\u{041C}' | '\u{041D}'
        | '\u{041E}' | '\u{0420}' | '\u{0421}' | '\u{0422}' | '\u{0423}' | '\u{0425}' => true,
        // Greek lowercase α..ρ and σ..ω; final sigma (U+03C2) is not in the set
        '\u{03B1}'..='\u{03C1}' | '\u{03C3}'..='\u{03C9}' => true,
        // Mathematical bold capitals 𝐀..𝐙
        '\u{1D400}'..='\u{1D419}' => true,
        _ => false,
    }
}
227
228/// Get the script category for a character
229pub fn get_script(ch: char) -> Script {
230    match ch {
231        'A'..='Z' | 'a'..='z' => Script::Latin,
232        'А'..='я' | 'Ё' | 'ё' => Script::Cyrillic,
233        'Α'..='ω' => Script::Greek,
234        '\u{0600}'..='\u{06FF}' => Script::Arabic,
235        '\u{0590}'..='\u{05FF}' => Script::Hebrew,
236        '\u{4E00}'..='\u{9FFF}' => Script::Chinese,
237        '\u{3040}'..='\u{309F}' | '\u{30A0}'..='\u{30FF}' => Script::Japanese,
238        '\u{AC00}'..='\u{D7AF}' => Script::Korean,
239        '\u{0E00}'..='\u{0E7F}' => Script::Thai,
240        '\u{0900}'..='\u{097F}' => Script::Devanagari,
241        // Don't count common punctuation, digits, and whitespace as separate scripts
242        '0'..='9' | ' ' | '\t' | '\n' | '\r' | '!' | '?' | '.' | ',' | ';' | ':' |
243        '"' | '\'' | '(' | ')' | '[' | ']' | '{' | '}' | '-' | '_' | '=' | '+' |
244        '*' | '/' | '\\' | '|' | '@' | '#' | '$' | '%' | '^' | '&' | '~' | '`' => Script::Latin,
245        _ => Script::Other(ch as u32),
246    }
247}
248
/// Get a human-readable description of a character
///
/// Every character recognised by `is_invisible_char` or `is_bidi_char` has a
/// named entry here, so reports never label a detected character as unknown.
///
/// # Arguments
///
/// * `ch` - The character to describe
///
/// # Returns
///
/// A static, title-cased name for the character, or
/// `"Unknown Special Character"` when `ch` has no entry.
pub fn get_char_description(ch: char) -> &'static str {
    match ch {
        '\u{00AD}' => "Soft Hyphen",
        '\u{034F}' => "Combining Grapheme Joiner",
        '\u{061C}' => "Arabic Letter Mark",
        '\u{115F}' => "Hangul Choseong Filler",
        '\u{1160}' => "Hangul Jungseong Filler",
        '\u{17B4}' => "Khmer Vowel Inherent Aq",
        '\u{17B5}' => "Khmer Vowel Inherent Aa",
        '\u{180E}' => "Mongolian Vowel Separator",
        '\u{200B}' => "Zero Width Space",
        '\u{200C}' => "Zero Width Non-Joiner",
        '\u{200D}' => "Zero Width Joiner",
        '\u{200E}' => "Left-to-Right Mark",
        '\u{200F}' => "Right-to-Left Mark",
        '\u{202A}' => "Left-to-Right Embedding",
        '\u{202B}' => "Right-to-Left Embedding",
        '\u{202C}' => "Pop Directional Formatting",
        '\u{202D}' => "Left-to-Right Override",
        '\u{202E}' => "Right-to-Left Override",
        '\u{2060}' => "Word Joiner",
        '\u{2061}' => "Function Application",
        '\u{2062}' => "Invisible Times",
        '\u{2063}' => "Invisible Separator",
        '\u{2064}' => "Invisible Plus",
        '\u{2066}' => "Left-to-Right Isolate",
        '\u{2067}' => "Right-to-Left Isolate",
        '\u{2068}' => "First Strong Isolate",
        '\u{2069}' => "Pop Directional Isolate",
        '\u{206A}' => "Inhibit Symmetric Swapping",
        '\u{206B}' => "Activate Symmetric Swapping",
        '\u{206C}' => "Inhibit Arabic Form Shaping",
        '\u{206D}' => "Activate Arabic Form Shaping",
        '\u{206E}' => "National Digit Shapes",
        '\u{206F}' => "Nominal Digit Shapes",
        '\u{3164}' => "Hangul Filler",
        '\u{FEFF}' => "Zero Width No-Break Space (BOM)",
        '\u{FFA0}' => "Halfwidth Hangul Filler",
        '\u{1D159}' => "Musical Symbol Null Notehead",
        '\u{1D173}' => "Musical Symbol Begin Beam",
        '\u{1D174}' => "Musical Symbol End Beam",
        '\u{1D175}' => "Musical Symbol Begin Tie",
        '\u{1D176}' => "Musical Symbol End Tie",
        '\u{1D177}' => "Musical Symbol Begin Slur",
        '\u{1D178}' => "Musical Symbol End Slur",
        '\u{1D179}' => "Musical Symbol Begin Phrase",
        '\u{1D17A}' => "Musical Symbol End Phrase",
        _ => "Unknown Special Character",
    }
}
268
269/// Calculate the overall risk level based on analysis results
270fn calculate_risk_level(analysis: &SecurityAnalysis) -> RiskLevel {
271    let mut score = 0;
272
273    if analysis.has_invisible_chars {
274        score += 3;
275    }
276    if analysis.has_bidi_overrides {
277        score += 4;
278    }
279    if analysis.has_mixed_scripts {
280        score += 2;
281    }
282    if analysis.has_confusables {
283        score += 2;
284    }
285
286    // Additional scoring based on quantity
287    if analysis.invisible_chars.len() > 3 {
288        score += 2;
289    }
290    if analysis.bidi_chars.len() > 1 {
291        score += 2;
292    }
293
294    match score {
295        0 => RiskLevel::Low,
296        1..=3 => RiskLevel::Medium,
297        4..=6 => RiskLevel::High,
298        _ => RiskLevel::Critical,
299    }
300}
301
302/// Sanitize text by removing dangerous Unicode characters
303///
304/// # Examples
305///
306/// ```rust
307/// use unicode_rs::security::*;
308///
309/// let dangerous = "Hello\u{200B}World\u{202E}";
310/// let safe = sanitize_text(dangerous);
311/// assert_eq!(safe, "HelloWorld");
312/// ```
313pub fn sanitize_text(text: &str) -> String {
314    text.chars()
315        .filter(|&ch| !is_invisible_char(ch) && !is_bidi_char(ch))
316        .collect()
317}
318
319/// Generate a security report for the given text
320///
321/// # Examples
322///
323/// ```rust
324/// use unicode_rs::security::*;
325///
326/// let report = generate_security_report("Hello\u{200B}World");
327/// println!("{}", report);
328/// ```
329pub fn generate_security_report(text: &str) -> String {
330    let analysis = analyze_text(text);
331    let mut report = String::new();
332
333    report.push_str(&format!("Unicode Security Analysis\n"));
334    report.push_str(&format!("========================\n\n"));
335    report.push_str(&format!("Risk Level: {:?}\n\n", analysis.risk_level));
336
337    if analysis.has_invisible_chars {
338        report.push_str("⚠️  INVISIBLE CHARACTERS DETECTED:\n");
339        for (pos, ch, desc) in &analysis.invisible_chars {
340            report.push_str(&format!("  Position {}: U+{:04X} ({})\n", pos, *ch as u32, desc));
341        }
342        report.push('\n');
343    }
344
345    if analysis.has_bidi_overrides {
346        report.push_str("⚠️  BIDIRECTIONAL OVERRIDE CHARACTERS DETECTED:\n");
347        for (pos, ch, desc) in &analysis.bidi_chars {
348            report.push_str(&format!("  Position {}: U+{:04X} ({})\n", pos, *ch as u32, desc));
349        }
350        report.push('\n');
351    }
352
353    if analysis.has_mixed_scripts {
354        report.push_str("⚠️  MIXED SCRIPTS DETECTED (Potential Homograph Attack):\n");
355        for script in &analysis.scripts {
356            report.push_str(&format!("  {:?}\n", script));
357        }
358        report.push('\n');
359    }
360
361    if analysis.has_confusables {
362        report.push_str("⚠️  CONFUSABLE CHARACTERS DETECTED\n\n");
363    }
364
365    if analysis.risk_level == RiskLevel::Low {
366        report.push_str("✅ No security concerns detected.\n");
367    }
368
369    report
370}
371
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII text must not trigger any finding.
    #[test]
    fn test_safe_text() {
        let result = analyze_text("Hello World");
        assert_eq!(result.risk_level, RiskLevel::Low);
        assert!(!result.has_invisible_chars);
        assert!(!result.has_bidi_overrides);
        assert!(!result.has_confusables);
    }

    /// A single zero-width space is detected and rated at least high.
    #[test]
    fn test_invisible_characters() {
        let result = analyze_text("Hello\u{200B}World");
        assert!(result.has_invisible_chars);
        let found: Vec<char> = result.invisible_chars.iter().map(|entry| entry.1).collect();
        assert_eq!(found, vec!['\u{200B}']);
        assert!(result.risk_level >= RiskLevel::High);
    }

    /// A right-to-left override hidden in a file name is critical.
    #[test]
    fn test_bidi_override() {
        let result = analyze_text("filename\u{202E}gpj.exe");
        assert!(result.has_bidi_overrides);
        let found: Vec<char> = result.bidi_chars.iter().map(|entry| entry.1).collect();
        assert_eq!(found, vec!['\u{202E}']);
        assert_eq!(result.risk_level, RiskLevel::Critical);
    }

    /// Cyrillic look-alikes combined with a Latin suffix count as mixed scripts.
    #[test]
    fn test_mixed_scripts() {
        // Cyrillic letters imitating "paypal", followed by ".com"
        let spoofed = "раураӏ.com";
        let result = analyze_text(spoofed);
        assert!(result.has_mixed_scripts);
        assert!(result.scripts.len() > 1);
        assert!(result.risk_level >= RiskLevel::High);
    }

    /// Sanitizing removes the dangerous characters and leaves low-risk text.
    #[test]
    fn test_sanitization() {
        let cleaned = sanitize_text("Hello\u{200B}World\u{202E}Test");
        assert_eq!(cleaned, "HelloWorldTest");

        let result = analyze_text(&cleaned);
        assert_eq!(result.risk_level, RiskLevel::Low);
    }

    /// Spot checks for the three per-character predicates.
    #[test]
    fn test_character_detection() {
        // Invisible: zero width space and BOM, but not a plain letter
        assert!(is_invisible_char('\u{200B}'));
        assert!(is_invisible_char('\u{FEFF}'));
        assert!(!is_invisible_char('a'));

        // Bidi: right-to-left override and right-to-left mark, but not a plain letter
        assert!(is_bidi_char('\u{202E}'));
        assert!(is_bidi_char('\u{200F}'));
        assert!(!is_bidi_char('a'));

        // Confusable: Cyrillic 'а' and Greek alpha, but not Latin 'a'
        assert!(is_confusable_char('а'));
        assert!(is_confusable_char('α'));
        assert!(!is_confusable_char('a'));
    }

    /// One representative character per script.
    #[test]
    fn test_script_detection() {
        let expectations = [
            ('a', Script::Latin),
            ('А', Script::Cyrillic),
            ('α', Script::Greek),
            ('世', Script::Chinese),
        ];
        for (ch, expected) in expectations {
            assert_eq!(get_script(ch), expected);
        }
    }

    /// Risk levels for clean text, an invisible character and a bidi override.
    #[test]
    fn test_risk_calculation() {
        // Clean text is low risk
        assert_eq!(analyze_text("Hello World").risk_level, RiskLevel::Low);

        // An invisible character is at least high risk
        assert!(analyze_text("Hello\u{200B}World").risk_level >= RiskLevel::High);

        // A bidi override is critical
        assert_eq!(analyze_text("test\u{202E}evil").risk_level, RiskLevel::Critical);
    }

    /// The report names the finding, the code point and its description.
    #[test]
    fn test_security_report() {
        let report = generate_security_report("Hello\u{200B}World");
        for needle in ["INVISIBLE CHARACTERS DETECTED", "U+200B", "Zero Width Space"] {
            assert!(report.contains(needle));
        }
    }
}