use std::collections::HashSet;
/// Findings produced by `analyze_text` for a single input string.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SecurityAnalysis {
/// `true` when at least one character matched `is_invisible_char`.
pub has_invisible_chars: bool,
/// `true` when at least one character matched `is_bidi_char`.
pub has_bidi_overrides: bool,
/// `true` when the text contains two or more non-Latin scripts, or exactly
/// one non-Latin script together with Latin.
pub has_mixed_scripts: bool,
/// `true` when at least one character matched `is_confusable_char`.
pub has_confusables: bool,
/// One entry per invisible character found: byte offset into the input
/// (as yielded by `char_indices`), the character, and its description.
pub invisible_chars: Vec<(usize, char, &'static str)>,
/// One entry per bidirectional control character found, in the same
/// `(byte offset, character, description)` layout as `invisible_chars`.
pub bidi_chars: Vec<(usize, char, &'static str)>,
/// Every distinct `Script` value returned by `get_script` for the input's characters.
pub scripts: HashSet<Script>,
/// Overall rating derived from the fields above by `calculate_risk_level`.
pub risk_level: RiskLevel,
}
/// Coarse script category assigned to a single character by `get_script`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Script {
/// Latin letters; `get_script` also files ASCII digits, whitespace and
/// common ASCII punctuation under this variant.
Latin,
/// Cyrillic letters.
Cyrillic,
/// Greek letters.
Greek,
/// Characters in the Arabic block (U+0600–U+06FF).
Arabic,
/// Characters in the Hebrew block (U+0590–U+05FF).
Hebrew,
/// CJK Unified Ideographs (U+4E00–U+9FFF).
Chinese,
/// Hiragana (U+3040–U+309F) and Katakana (U+30A0–U+30FF).
Japanese,
/// Hangul syllables (U+AC00–U+D7AF).
Korean,
/// Characters in the Thai block (U+0E00–U+0E7F).
Thai,
/// Characters in the Devanagari block (U+0900–U+097F).
Devanagari,
/// Any character `get_script` does not otherwise classify; carries the
/// character's code point, so each distinct character is a distinct value.
Other(u32), }
/// Overall severity rating attached to a `SecurityAnalysis`.
///
/// The derived `Ord` follows declaration order, so
/// `Low < Medium < High < Critical` and levels can be compared with `>=`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum RiskLevel {
/// No finding contributed to the score.
Low,
/// Score of 1 to 3.
Medium,
/// Score of 4 to 6.
High,
/// Score of 7 or more.
Critical,
}
/// Scans `text` character by character and summarises the Unicode security
/// findings in a [`SecurityAnalysis`].
///
/// Recorded positions are byte offsets into `text`, as produced by
/// `char_indices`. Every character contributes its script to the script set,
/// including characters that are also flagged as invisible or bidirectional.
pub fn analyze_text(text: &str) -> SecurityAnalysis {
    let mut invisible_chars = Vec::new();
    let mut bidi_chars = Vec::new();
    let mut scripts = HashSet::new();
    let mut has_confusables = false;

    for (offset, c) in text.char_indices() {
        if is_invisible_char(c) {
            invisible_chars.push((offset, c, get_char_description(c)));
        }
        if is_bidi_char(c) {
            bidi_chars.push((offset, c, get_char_description(c)));
        }
        scripts.insert(get_script(c));
        has_confusables |= is_confusable_char(c);
    }

    // "Two or more non-Latin scripts, or one non-Latin script plus Latin" is
    // the same as "more than one distinct script overall".
    let has_mixed_scripts = scripts.len() > 1;

    let mut analysis = SecurityAnalysis {
        has_invisible_chars: !invisible_chars.is_empty(),
        has_bidi_overrides: !bidi_chars.is_empty(),
        has_mixed_scripts,
        has_confusables,
        invisible_chars,
        bidi_chars,
        scripts,
        risk_level: RiskLevel::Low,
    };
    // The score depends on every other field, so it is filled in last.
    analysis.risk_level = calculate_risk_level(&analysis);
    analysis
}
/// Returns `true` if `ch` is a character that normally renders with no
/// visible glyph (zero-width characters, directional formatting controls,
/// filler letters, invisible math operators, musical format controls).
///
/// The bidirectional isolate controls U+2066–U+2069 are included alongside
/// the embedding/override controls U+202A–U+202E: both groups are invisible
/// formatting characters, so they are treated alike here.
pub fn is_invisible_char(ch: char) -> bool {
    matches!(
        ch,
        '\u{00AD}'
            | '\u{034F}'
            | '\u{061C}'
            | '\u{115F}'..='\u{1160}'
            | '\u{17B4}'..='\u{17B5}'
            | '\u{180E}'
            | '\u{200B}'..='\u{200F}'
            | '\u{202A}'..='\u{202E}'
            | '\u{2060}'..='\u{2064}'
            // U+2065 is deliberately skipped; the range below starts at the
            // isolate controls and runs through the deprecated format chars.
            | '\u{2066}'..='\u{206F}'
            | '\u{3164}'
            | '\u{FEFF}'
            | '\u{FFA0}'
            | '\u{1D159}'
            | '\u{1D173}'..='\u{1D17A}'
    )
}
/// Returns `true` if `ch` is a bidirectional control character: the Arabic
/// letter mark, the LRM/RLM marks, the embedding/override controls, or the
/// isolate controls.
pub fn is_bidi_char(ch: char) -> bool {
    let is_mark = matches!(ch, '\u{061C}' | '\u{200E}'..='\u{200F}');
    let is_embedding_or_override = matches!(ch, '\u{202A}'..='\u{202E}');
    let is_isolate = matches!(ch, '\u{2066}'..='\u{2069}');
    is_mark || is_embedding_or_override || is_isolate
}
/// Returns `true` if `ch` belongs to one of the look-alike sets this module
/// tracks: selected Cyrillic letters, lowercase Greek letters (final sigma
/// excluded), or the mathematical bold capitals.
pub fn is_confusable_char(ch: char) -> bool {
    // Cyrillic letters listed individually; only these specific ones count.
    let cyrillic_lookalike = matches!(
        ch,
        'а' | 'е' | 'о' | 'р' | 'с' | 'у' | 'х'
            | 'А' | 'В' | 'Е' | 'К' | 'М' | 'Н' | 'О' | 'Р' | 'С' | 'Т' | 'У' | 'Х'
    );
    // Lowercase alpha through omega; the gap between rho and sigma skips the
    // final-sigma form, which is not part of the set.
    let greek_lowercase = matches!(ch, 'α'..='ρ' | 'σ'..='ω');
    // The 26 mathematical bold capitals form one contiguous run.
    let math_bold_capital = matches!(ch, '𝐀'..='𝐙');

    cyrillic_lookalike || greek_lowercase || math_bold_capital
}
/// Classifies `ch` into a coarse [`Script`] bucket based on its Unicode block.
///
/// ASCII digits, the whitespace characters space/tab/LF/CR and all ASCII
/// punctuation are reported as [`Script::Latin`] so that ordinary text does
/// not look mixed-script. Anything unrecognised becomes
/// [`Script::Other`] carrying the character's code point.
///
/// Classification is by block, not by the Unicode `Script` property, so it
/// is approximate: for example, ideographs used in Japanese text are reported
/// as [`Script::Chinese`].
pub fn get_script(ch: char) -> Script {
    match ch {
        // Basic Latin letters, Latin-1 Supplement letters (skipping the
        // multiplication sign U+00D7 and division sign U+00F7), Latin
        // Extended-A/B and Latin Extended Additional, so accented Latin text
        // such as "café" stays a single script.
        'A'..='Z'
        | 'a'..='z'
        | '\u{00C0}'..='\u{00D6}'
        | '\u{00D8}'..='\u{00F6}'
        | '\u{00F8}'..='\u{024F}'
        | '\u{1E00}'..='\u{1EFF}' => Script::Latin,
        // Whole Greek and Coptic block plus Greek Extended, so accented
        // letters outside U+0391–U+03C9 are still Greek.
        '\u{0370}'..='\u{03FF}' | '\u{1F00}'..='\u{1FFF}' => Script::Greek,
        // Whole Cyrillic block plus Cyrillic Supplement, so letters outside
        // the basic U+0410–U+044F run (e.g. palochka U+04CF) are still Cyrillic.
        '\u{0400}'..='\u{052F}' => Script::Cyrillic,
        '\u{0590}'..='\u{05FF}' => Script::Hebrew,
        '\u{0600}'..='\u{06FF}' => Script::Arabic,
        '\u{0900}'..='\u{097F}' => Script::Devanagari,
        '\u{0E00}'..='\u{0E7F}' => Script::Thai,
        '\u{3040}'..='\u{309F}' | '\u{30A0}'..='\u{30FF}' => Script::Japanese,
        '\u{4E00}'..='\u{9FFF}' => Script::Chinese,
        '\u{AC00}'..='\u{D7AF}' => Script::Korean,
        // Script-neutral ASCII is folded into Latin.
        ' ' | '\t' | '\n' | '\r' => Script::Latin,
        // `is_ascii_punctuation` covers every ASCII punctuation mark,
        // including '<' and '>' which a hand-written list is prone to miss.
        c if c.is_ascii_digit() || c.is_ascii_punctuation() => Script::Latin,
        _ => Script::Other(ch as u32),
    }
}
/// Returns the Unicode name (in title case) of an invisible or bidirectional
/// control character, or `"Unknown Special Character"` for anything else.
///
/// The table names every character flagged by `is_invisible_char` or
/// `is_bidi_char`, so reports do not fall back to the generic label for
/// characters the detectors themselves recognise.
pub fn get_char_description(ch: char) -> &'static str {
    match ch {
        '\u{00AD}' => "Soft Hyphen",
        '\u{034F}' => "Combining Grapheme Joiner",
        '\u{061C}' => "Arabic Letter Mark",
        '\u{115F}' => "Hangul Choseong Filler",
        '\u{1160}' => "Hangul Jungseong Filler",
        '\u{17B4}' => "Khmer Vowel Inherent Aq",
        '\u{17B5}' => "Khmer Vowel Inherent Aa",
        '\u{180E}' => "Mongolian Vowel Separator",
        '\u{200B}' => "Zero Width Space",
        '\u{200C}' => "Zero Width Non-Joiner",
        '\u{200D}' => "Zero Width Joiner",
        '\u{200E}' => "Left-to-Right Mark",
        '\u{200F}' => "Right-to-Left Mark",
        '\u{202A}' => "Left-to-Right Embedding",
        '\u{202B}' => "Right-to-Left Embedding",
        '\u{202C}' => "Pop Directional Formatting",
        '\u{202D}' => "Left-to-Right Override",
        '\u{202E}' => "Right-to-Left Override",
        '\u{2060}' => "Word Joiner",
        '\u{2061}' => "Function Application",
        '\u{2062}' => "Invisible Times",
        '\u{2063}' => "Invisible Separator",
        '\u{2064}' => "Invisible Plus",
        '\u{2066}' => "Left-to-Right Isolate",
        '\u{2067}' => "Right-to-Left Isolate",
        '\u{2068}' => "First Strong Isolate",
        '\u{2069}' => "Pop Directional Isolate",
        '\u{206A}' => "Inhibit Symmetric Swapping",
        '\u{206B}' => "Activate Symmetric Swapping",
        '\u{206C}' => "Inhibit Arabic Form Shaping",
        '\u{206D}' => "Activate Arabic Form Shaping",
        '\u{206E}' => "National Digit Shapes",
        '\u{206F}' => "Nominal Digit Shapes",
        '\u{3164}' => "Hangul Filler",
        '\u{FEFF}' => "Zero Width No-Break Space (BOM)",
        '\u{FFA0}' => "Halfwidth Hangul Filler",
        '\u{1D159}' => "Musical Symbol Null Notehead",
        '\u{1D173}' => "Musical Symbol Begin Beam",
        '\u{1D174}' => "Musical Symbol End Beam",
        '\u{1D175}' => "Musical Symbol Begin Tie",
        '\u{1D176}' => "Musical Symbol End Tie",
        '\u{1D177}' => "Musical Symbol Begin Slur",
        '\u{1D178}' => "Musical Symbol End Slur",
        '\u{1D179}' => "Musical Symbol Begin Phrase",
        '\u{1D17A}' => "Musical Symbol End Phrase",
        _ => "Unknown Special Character",
    }
}
/// Converts the findings in `analysis` into a [`RiskLevel`] by summing a
/// fixed weight for each signal that fired and bucketing the total.
///
/// Weights: invisible characters 3, bidi controls 4, mixed scripts 2,
/// confusables 2, more than three invisible characters 2, more than one bidi
/// control 2. Totals of 0 / 1–3 / 4–6 / 7+ map to Low / Medium / High /
/// Critical.
fn calculate_risk_level(analysis: &SecurityAnalysis) -> RiskLevel {
    // A signal contributes its weight only when it fired.
    let points = |fired: bool, weight: i32| if fired { weight } else { 0 };

    let total = points(analysis.has_invisible_chars, 3)
        + points(analysis.has_bidi_overrides, 4)
        + points(analysis.has_mixed_scripts, 2)
        + points(analysis.has_confusables, 2)
        + points(analysis.invisible_chars.len() > 3, 2)
        + points(analysis.bidi_chars.len() > 1, 2);

    if total == 0 {
        RiskLevel::Low
    } else if total <= 3 {
        RiskLevel::Medium
    } else if total <= 6 {
        RiskLevel::High
    } else {
        RiskLevel::Critical
    }
}
/// Returns a copy of `text` with every character flagged by
/// `is_invisible_char` or `is_bidi_char` removed; all other characters are
/// kept in their original order.
pub fn sanitize_text(text: &str) -> String {
    let mut cleaned = String::new();
    for c in text.chars() {
        if is_invisible_char(c) || is_bidi_char(c) {
            continue;
        }
        cleaned.push(c);
    }
    cleaned
}
/// Runs `analyze_text` on `text` and renders the findings as a plain-text,
/// multi-line report.
///
/// The report always starts with a title and the risk level, followed by one
/// section per kind of finding that was detected. Positions are the byte
/// offsets recorded by the analysis. Detected scripts are listed in sorted
/// order so that the same input always yields the same report text.
pub fn generate_security_report(text: &str) -> String {
    // Appends a section heading, one line per flagged character, and a
    // trailing blank line.
    fn push_flagged_chars(
        report: &mut String,
        heading: &str,
        flagged: &[(usize, char, &'static str)],
    ) {
        report.push_str(heading);
        for (pos, ch, desc) in flagged {
            report.push_str(&format!(" Position {}: U+{:04X} ({})\n", pos, *ch as u32, desc));
        }
        report.push('\n');
    }

    let analysis = analyze_text(text);
    let mut report = String::new();

    report.push_str("Unicode Security Analysis\n");
    report.push_str("========================\n\n");
    report.push_str(&format!("Risk Level: {:?}\n\n", analysis.risk_level));

    if analysis.has_invisible_chars {
        push_flagged_chars(
            &mut report,
            "⚠️ INVISIBLE CHARACTERS DETECTED:\n",
            &analysis.invisible_chars,
        );
    }

    if analysis.has_bidi_overrides {
        push_flagged_chars(
            &mut report,
            "⚠️ BIDIRECTIONAL OVERRIDE CHARACTERS DETECTED:\n",
            &analysis.bidi_chars,
        );
    }

    if analysis.has_mixed_scripts {
        report.push_str("⚠️ MIXED SCRIPTS DETECTED (Potential Homograph Attack):\n");
        // A HashSet iterates in arbitrary order; sort the rendered names so
        // the report is reproducible across runs.
        let mut script_names: Vec<String> = analysis
            .scripts
            .iter()
            .map(|script| format!("{:?}", script))
            .collect();
        script_names.sort_unstable();
        for name in &script_names {
            report.push_str(&format!(" {}\n", name));
        }
        report.push('\n');
    }

    if analysis.has_confusables {
        report.push_str("⚠️ CONFUSABLE CHARACTERS DETECTED\n\n");
    }

    if analysis.risk_level == RiskLevel::Low {
        report.push_str("✅ No security concerns detected.\n");
    }

    report
}
// Unit tests for the detectors, the risk scoring, sanitisation and the report.
#[cfg(test)]
mod tests {
use super::*;
// Plain ASCII text trips no detector and scores Low.
#[test]
fn test_safe_text() {
let analysis = analyze_text("Hello World");
assert_eq!(analysis.risk_level, RiskLevel::Low);
assert!(!analysis.has_invisible_chars);
assert!(!analysis.has_bidi_overrides);
assert!(!analysis.has_confusables);
}
// A single zero-width space is recorded once and pushes the risk to at
// least High.
#[test]
fn test_invisible_characters() {
let text = "Hello\u{200B}World";
let analysis = analyze_text(text);
assert!(analysis.has_invisible_chars);
assert_eq!(analysis.invisible_chars.len(), 1);
assert_eq!(analysis.invisible_chars[0].1, '\u{200B}');
assert!(analysis.risk_level >= RiskLevel::High);
}
// A right-to-left override embedded in a file name is recorded once and
// rated Critical.
#[test]
fn test_bidi_override() {
let text = "filename\u{202E}gpj.exe";
let analysis = analyze_text(text);
assert!(analysis.has_bidi_overrides);
assert_eq!(analysis.bidi_chars.len(), 1);
assert_eq!(analysis.bidi_chars[0].1, '\u{202E}');
assert_eq!(analysis.risk_level, RiskLevel::Critical);
}
// A domain spelled with Cyrillic look-alikes plus an ASCII ".com" suffix is
// reported as mixed-script and rated at least High.
#[test]
fn test_mixed_scripts() {
let text = "раураӏ.com"; let analysis = analyze_text(text);
assert!(analysis.has_mixed_scripts);
assert!(analysis.scripts.len() > 1);
assert!(analysis.risk_level >= RiskLevel::High);
}
// Sanitising strips the zero-width space and the override, leaving text
// that analyses as Low.
#[test]
fn test_sanitization() {
let dangerous = "Hello\u{200B}World\u{202E}Test";
let sanitized = sanitize_text(dangerous);
assert_eq!(sanitized, "HelloWorldTest");
let analysis = analyze_text(&sanitized);
assert_eq!(analysis.risk_level, RiskLevel::Low);
}
// Spot checks for each predicate; the final assertion uses the ASCII letter
// 'a', which must not count as confusable, unlike its Cyrillic look-alike.
#[test]
fn test_character_detection() {
assert!(is_invisible_char('\u{200B}')); assert!(is_invisible_char('\u{FEFF}')); assert!(!is_invisible_char('a'));
assert!(is_bidi_char('\u{202E}')); assert!(is_bidi_char('\u{200F}')); assert!(!is_bidi_char('a'));
assert!(is_confusable_char('а')); assert!(is_confusable_char('α')); assert!(!is_confusable_char('a')); }
// One representative character per script bucket.
#[test]
fn test_script_detection() {
assert_eq!(get_script('a'), Script::Latin);
assert_eq!(get_script('А'), Script::Cyrillic);
assert_eq!(get_script('α'), Script::Greek);
assert_eq!(get_script('世'), Script::Chinese);
}
// End-to-end scoring: clean text is Low, an invisible character is at least
// High, a bidi override is Critical.
#[test]
fn test_risk_calculation() {
let safe = analyze_text("Hello World");
assert_eq!(safe.risk_level, RiskLevel::Low);
let invisible = analyze_text("Hello\u{200B}World");
assert!(invisible.risk_level >= RiskLevel::High);
let bidi = analyze_text("test\u{202E}evil");
assert_eq!(bidi.risk_level, RiskLevel::Critical);
}
// The report names the section, the code point and the description of a
// detected zero-width space.
#[test]
fn test_security_report() {
let text = "Hello\u{200B}World";
let report = generate_security_report(text);
assert!(report.contains("INVISIBLE CHARACTERS DETECTED"));
assert!(report.contains("U+200B"));
assert!(report.contains("Zero Width Space"));
}
}