use crate::core::{Finding, Severity};
use crate::plugins::traits::{PluginError, PluginReport, ScanContext, ScanPhase, SecurityPlugin};
use async_trait::async_trait;
use std::path::Path;
use std::time::Instant;
/// Unicode ranges whose characters are reported as potential homoglyphs.
///
/// Each entry is `(first, last, label)`. `scan_for_homoglyphs` treats both
/// bounds as inclusive and uses `label` in the finding title, evidence and
/// description. Entries are checked in the order listed, which determines the
/// order of findings produced for a single line.
const HOMOGLYPH_RANGES: &[(char, char, &str)] = &[
('\u{0400}', '\u{04FF}', "Cyrillic"),
('\u{0370}', '\u{03FF}', "Greek"),
('\u{FF01}', '\u{FF5E}', "Fullwidth"),
('\u{1D400}', '\u{1D7FF}', "Mathematical"),
('\u{0180}', '\u{024F}', "Latin Extended-B"),
];
/// Bidirectional formatting characters reported by `scan_for_bidi`.
///
/// Each entry is `(character, display name)`; the display name is embedded in
/// the finding title and evidence. Entries are checked in the order listed for
/// every line, so one line can yield several findings (one per distinct
/// character present).
const BIDI_CHARS: &[(char, &str)] = &[
('\u{202A}', "Left-to-Right Embedding (LRE)"),
('\u{202B}', "Right-to-Left Embedding (RLE)"),
('\u{202C}', "Pop Directional Formatting (PDF)"),
('\u{202D}', "Left-to-Right Override (LRO)"),
('\u{202E}', "Right-to-Left Override (RLO)"),
('\u{2066}', "Left-to-Right Isolate (LRI)"),
('\u{2067}', "Right-to-Left Isolate (RLI)"),
('\u{2068}', "First Strong Isolate (FSI)"),
('\u{2069}', "Pop Directional Isolate (PDI)"),
('\u{200F}', "Right-to-Left Mark (RLM)"),
('\u{200E}', "Left-to-Right Mark (LRM)"),
];
/// Invisible / zero-width characters reported by `scan_for_invisible`.
///
/// Each entry is `(character, display name)`; the display name is embedded in
/// the finding title and evidence. U+FEFF is listed here as well:
/// `scan_for_invisible` exempts it when it appears as the very first character
/// of the content, where it acts as a byte-order mark.
const INVISIBLE_CHARS: &[(char, &str)] = &[
('\u{200B}', "Zero Width Space"),
('\u{200C}', "Zero Width Non-Joiner"),
('\u{200D}', "Zero Width Joiner"),
('\u{FEFF}', "Zero Width No-Break Space (BOM)"),
('\u{00AD}', "Soft Hyphen"),
('\u{034F}', "Combining Grapheme Joiner"),
('\u{2060}', "Word Joiner"),
('\u{2061}', "Function Application"),
('\u{2062}', "Invisible Times"),
('\u{2063}', "Invisible Separator"),
('\u{2064}', "Invisible Plus"),
];
/// File extensions (without the leading dot) that `is_source_file` accepts.
///
/// Values are compared against the result of `Path::extension()`, i.e. the
/// text after the final `.` of the file name.
const SOURCE_EXTENSIONS: &[&str] = &[
"py",
"js",
"ts",
"jsx",
"tsx",
"rs",
"go",
"c",
"cpp",
"h",
"hpp",
"java",
"kt",
"scala",
"rb",
"php",
"cs",
"vb",
"swift",
"sh",
"bash",
"zsh",
"ps1",
"bat",
"cmd",
"pl",
"pm",
"lua",
"r",
"jl",
"hs",
"html",
"htm",
"css",
"scss",
"less",
"yaml",
"yml",
"toml",
"json",
"xml",
"sql",
"tf",
"hcl",
// As extensions these two match names such as `app.Dockerfile` or
// `rules.Makefile`. A bare `Makefile` / `Dockerfile` has no extension and is
// matched by file name inside `is_source_file` instead.
"Makefile",
"Dockerfile",
"gradle",
];
/// Plugin that inspects source-file text for bidirectional control characters,
/// characters from homoglyph-prone Unicode blocks, and invisible characters.
///
/// This is a unit struct: it carries no configuration or state.
pub struct EncodingScanner;
impl Default for EncodingScanner {
fn default() -> Self {
Self::new()
}
}
impl EncodingScanner {
    /// Creates a scanner. The type is a unit struct, so there is nothing to configure.
    pub fn new() -> Self {
        Self
    }

    /// Decides whether `path` names a file this plugin should inspect.
    ///
    /// A path qualifies when its extension matches an entry of
    /// `SOURCE_EXTENSIONS` or when its file name is one of a few well-known
    /// extensionless build files. Extensions are compared ASCII
    /// case-insensitively so that names such as `AUTH.PY` or `deploy.BAT` are
    /// not silently exempted from scanning. An extension or file name that is
    /// not valid UTF-8 is treated as empty and therefore never matches.
    fn is_source_file(path: &Path) -> bool {
        const EXTENSIONLESS_NAMES: [&str; 5] =
            ["Makefile", "Dockerfile", "Jenkinsfile", "Gemfile", "Rakefile"];

        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
        let filename = path.file_name().and_then(|n| n.to_str()).unwrap_or("");

        SOURCE_EXTENSIONS
            .iter()
            .any(|known| known.eq_ignore_ascii_case(ext))
            || EXTENSIONLESS_NAMES.contains(&filename)
    }

    /// Appends one `Severity::Critical` finding per (line, BiDi character) pair.
    ///
    /// * `path` - recorded on each finding as the file location.
    /// * `content` - text to inspect; split with `str::lines`, reported line
    ///   numbers are 1-based.
    /// * `findings` - output vector. The numeric suffix of each finding id is
    ///   the 1-based position the finding takes in this vector, so it continues
    ///   from whatever the vector already holds.
    fn scan_for_bidi(path: &Path, content: &str, findings: &mut Vec<Finding>) {
        for (line_idx, line) in content.lines().enumerate() {
            // Every entry of BIDI_CHARS is non-ASCII; an ASCII-only line cannot match.
            if line.is_ascii() {
                continue;
            }
            for &(ch, name) in BIDI_CHARS {
                // Count once instead of `contains` followed by a second pass.
                let count = line.chars().filter(|&c| c == ch).count();
                if count == 0 {
                    continue;
                }
                let finding = Finding::new(
                    format!("ENC-BIDI-{:03}", findings.len() + 1),
                    format!("Bidirectional text override: {} (Trojan Source)", name),
                    Severity::Critical,
                )
                .with_file(path.to_path_buf())
                .with_line((line_idx + 1) as u32)
                .with_evidence(format!(
                    "Found {} occurrence(s) of {} (U+{:04X}) — CVE-2021-42574",
                    count, name, ch as u32
                ))
                .with_description(
                    "Bidirectional override characters make source code appear \
                     different than it actually executes. This is the Trojan Source \
                     attack (CVE-2021-42574). CWE-451."
                        .to_string(),
                );
                findings.push(finding);
            }
        }
    }

    /// Appends one `Severity::High` finding per (line, Unicode block) pair in
    /// which the line contains characters from a `HOMOGLYPH_RANGES` entry.
    ///
    /// Lines whose first non-whitespace text is `#`, `//` or `*` are skipped as
    /// a comment heuristic. The evidence lists at most the first five matching
    /// code points but reports the full count.
    ///
    /// * `path` - recorded on each finding as the file location.
    /// * `content` - text to inspect; reported line numbers are 1-based.
    /// * `findings` - output vector; the id suffix is the finding's 1-based
    ///   position in this vector.
    fn scan_for_homoglyphs(path: &Path, content: &str, findings: &mut Vec<Finding>) {
        for (line_idx, line) in content.lines().enumerate() {
            let trimmed = line.trim();
            if trimmed.starts_with('#') || trimmed.starts_with("//") || trimmed.starts_with('*') {
                continue;
            }
            // The lowest range starts at U+0180, so ASCII-only lines cannot match.
            if line.is_ascii() {
                continue;
            }
            for &(start, end, block_name) in HOMOGLYPH_RANGES {
                let suspicious: Vec<char> = line
                    .chars()
                    .filter(|c| (start..=end).contains(c))
                    .collect();
                if suspicious.is_empty() {
                    continue;
                }
                let chars_display = suspicious
                    .iter()
                    .take(5)
                    .map(|c| format!("U+{:04X}", *c as u32))
                    .collect::<Vec<_>>()
                    .join(", ");
                // The Cyrillic example letter is written as an escape so this
                // file does not itself contain the character it warns about;
                // the resulting string is unchanged.
                let finding = Finding::new(
                    format!("ENC-HOMO-{:03}", findings.len() + 1),
                    format!("Unicode homoglyph from {} block in source code", block_name),
                    Severity::High,
                )
                .with_file(path.to_path_buf())
                .with_line((line_idx + 1) as u32)
                .with_evidence(format!(
                    "Found {} {} character(s): {}",
                    suspicious.len(),
                    block_name,
                    chars_display
                ))
                .with_description(format!(
                    "{} characters in source code may be homoglyph attacks — \
                     visually identical to ASCII but semantically different. \
                     For example, Cyrillic '\u{0430}' (U+0430) looks identical to Latin 'a' (U+0061). CWE-1007.",
                    block_name
                ));
                findings.push(finding);
            }
        }
    }

    /// Appends one `Severity::High` finding per (line, invisible character) pair.
    ///
    /// A single U+FEFF at the very start of the content is treated as a
    /// byte-order mark and ignored. Any further U+FEFF on that first line is
    /// still reported (previously the whole U+FEFF check was skipped for a
    /// first line that began with a BOM).
    ///
    /// * `path` - recorded on each finding as the file location.
    /// * `content` - text to inspect; reported line numbers are 1-based.
    /// * `findings` - output vector; the id suffix is the finding's 1-based
    ///   position in this vector.
    fn scan_for_invisible(path: &Path, content: &str, findings: &mut Vec<Finding>) {
        for (line_idx, line) in content.lines().enumerate() {
            // Drop only the leading BOM of the first line; everything after it
            // is inspected like any other text.
            let text = if line_idx == 0 {
                line.strip_prefix('\u{FEFF}').unwrap_or(line)
            } else {
                line
            };
            // Every entry of INVISIBLE_CHARS is non-ASCII.
            if text.is_ascii() {
                continue;
            }
            for &(ch, name) in INVISIBLE_CHARS {
                let count = text.chars().filter(|&c| c == ch).count();
                if count == 0 {
                    continue;
                }
                let finding = Finding::new(
                    format!("ENC-INVIS-{:03}", findings.len() + 1),
                    format!("Invisible character: {} in source code", name),
                    Severity::High,
                )
                .with_file(path.to_path_buf())
                .with_line((line_idx + 1) as u32)
                .with_evidence(format!(
                    "Found {} occurrence(s) of {} (U+{:04X})",
                    count, name, ch as u32
                ))
                .with_description(
                    "Invisible/zero-width characters in source code can be used \
                     for steganography, identifier confusion, or to bypass string \
                     comparison checks. CWE-1007."
                        .to_string(),
                );
                findings.push(finding);
            }
        }
    }
}
#[async_trait]
impl SecurityPlugin for EncodingScanner {
    /// Plugin identifier; `scan` also uses it as the report name.
    fn name(&self) -> &str {
        "encoding"
    }

    /// Version string of this plugin.
    fn version(&self) -> &str {
        "0.1.0"
    }

    /// One-line human-readable summary of what the plugin detects.
    fn description(&self) -> &str {
        "Detect Unicode homoglyphs, BiDi overrides, and invisible characters (Trojan Source)"
    }

    /// Reports `ScanPhase::All` as the phase this plugin runs in.
    fn scan_phase(&self) -> ScanPhase {
        ScanPhase::All
    }

    /// No setup is required; always succeeds.
    async fn initialize(&mut self) -> Result<(), PluginError> {
        Ok(())
    }

    /// Runs the BiDi, homoglyph and invisible-character passes over one file.
    ///
    /// Returns an empty report (apart from `duration_ms`) when
    /// `context.path` is not recognised by `is_source_file` or when no file
    /// content was supplied. Otherwise the findings of all three passes are
    /// collected, in that order, and `scanned_files` is set to 1.
    ///
    /// This implementation never returns `Err`.
    async fn scan(&self, context: &ScanContext<'_>) -> Result<PluginReport, PluginError> {
        let start = Instant::now();
        let mut report = PluginReport::new(self.name().to_string());

        if Self::is_source_file(context.path) {
            if let Some(content) = context.file_content {
                // Decode lossily rather than skipping non-UTF-8 input: otherwise a
                // single invalid byte anywhere in the file would exempt it from
                // scanning. Invalid sequences become U+FFFD, which none of the
                // passes report, and line boundaries are unaffected.
                let text = String::from_utf8_lossy(content);
                Self::scan_for_bidi(context.path, &text, &mut report.findings);
                Self::scan_for_homoglyphs(context.path, &text, &mut report.findings);
                Self::scan_for_invisible(context.path, &text, &mut report.findings);
                // The file was scanned whether or not anything was found.
                report.scanned_files = 1;
            }
        }

        report.duration_ms = start.elapsed().as_millis() as u64;
        Ok(report)
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::plugins::traits::ScanContext;
    use std::collections::HashMap;

    /// Scans `content` as if it were the file `file_name` in the
    /// `PostExtract` phase with empty metadata, and returns the report.
    async fn run_scan(file_name: &str, content: &[u8]) -> PluginReport {
        let scanner = EncodingScanner::new();
        let context = ScanContext {
            path: Path::new(file_name),
            scan_phase: ScanPhase::PostExtract,
            file_content: Some(content),
            metadata: HashMap::new(),
        };
        scanner.scan(&context).await.unwrap()
    }

    /// True when any finding title in `report` contains `needle`.
    fn has_title_containing(report: &PluginReport, needle: &str) -> bool {
        report.findings.iter().any(|f| f.title.contains(needle))
    }

    // A right-to-left override inside code must be reported as a BiDi finding.
    #[tokio::test]
    async fn test_bidi_override() {
        let source = "fn check() { if \u{202E}access_level != \"user\" { grant(); } }";
        let report = run_scan("auth.py", source.as_bytes()).await;
        assert!(has_title_containing(&report, "Bidirectional"));
    }

    // A Cyrillic letter inside an identifier must be reported as a homoglyph.
    #[tokio::test]
    async fn test_cyrillic_homoglyph() {
        let source = "def \u{0430}dmin_check(): pass";
        let report = run_scan("auth.py", source.as_bytes()).await;
        assert!(has_title_containing(&report, "homoglyph"));
    }

    // A zero-width space after an identifier must be reported as invisible.
    #[tokio::test]
    async fn test_zero_width_space() {
        let source = "const password\u{200B} = 'secret';";
        let report = run_scan("config.js", source.as_bytes()).await;
        assert!(has_title_containing(&report, "Invisible"));
    }

    // Plain ASCII source produces no findings.
    #[tokio::test]
    async fn test_clean_ascii_file() {
        let report = run_scan("main.rs", b"fn main() { println!(\"hello\"); }").await;
        assert!(report.findings.is_empty());
    }

    // Files with an unrecognised extension are not inspected at all.
    #[tokio::test]
    async fn test_non_source_file_skipped() {
        let source = "Contains \u{202E} bidi but in a .png file";
        let report = run_scan("image.png", source.as_bytes()).await;
        assert!(report.findings.is_empty());
    }
}