use std::collections::HashMap;
use std::path::{Path, PathBuf};
use serde::Deserialize;
const CHARSET_BASE: &str = include_str!("charsets/base.json");
const CHARSET_CHINESE: &str = include_str!("charsets/chinese.json");
const CHARSET_JAPANESE: &str = include_str!("charsets/japanese.json");
const CHARSET_KOREAN: &str = include_str!("charsets/korean.json");
#[derive(Deserialize)]
struct RangeEntry {
start: String,
end: String,
}
#[derive(Deserialize)]
struct DangerousEntry {
code: String,
name: String,
}
#[derive(Deserialize)]
struct CharsetData {
#[serde(default)]
emoji_ranges: Vec<RangeEntry>,
#[serde(default)]
extra_ranges: Vec<RangeEntry>,
#[serde(default)]
dangerous: Vec<DangerousEntry>,
}
fn parse_hex(s: &str) -> Result<u32, String> {
let s = s.trim_start_matches("0x").trim_start_matches("0X");
u32::from_str_radix(s, 16)
.map_err(|_| format!("Invalid hex codepoint in charset: '{}'", s))
}
fn parse_charset_data(json: &str) -> Result<CharsetData, String> {
serde_json::from_str(json).map_err(|e| e.to_string())
}
fn load_builtin_charset(name: &str) -> Option<&'static str> {
match name {
"base" => Some(CHARSET_BASE),
"chinese" => Some(CHARSET_CHINESE),
"japanese" => Some(CHARSET_JAPANESE),
"korean" => Some(CHARSET_KOREAN),
_ => None,
}
}
fn load_charset(name_or_path: &str) -> Result<CharsetData, String> {
if name_or_path.contains(std::path::MAIN_SEPARATOR) || name_or_path.ends_with(".json") {
let content = std::fs::read_to_string(name_or_path)
.map_err(|_| format!("Charset file not found: {}", name_or_path))?;
return parse_charset_data(&content);
}
match load_builtin_charset(name_or_path) {
Some(json) => parse_charset_data(json),
None => Err(format!("Unknown charset: {}", name_or_path)),
}
}
fn parse_ranges(entries: &[RangeEntry]) -> Result<Vec<(u32, u32)>, String> {
entries
.iter()
.map(|e| Ok((parse_hex(&e.start)?, parse_hex(&e.end)?)))
.collect()
}
fn parse_dangerous(entries: &[DangerousEntry]) -> Result<HashMap<u32, String>, String> {
entries
.iter()
.map(|e| Ok((parse_hex(&e.code)?, e.name.clone())))
.collect()
}
pub fn resolve_charsets(
charset_names: &[String],
charset_files: &[String],
) -> Result<(Vec<(u32, u32)>, HashMap<u32, String>), String> {
let base = load_charset("base")?;
let mut ranges_set: std::collections::HashSet<(u32, u32)> = std::collections::HashSet::new();
ranges_set.extend(parse_ranges(&base.emoji_ranges)?);
ranges_set.extend(parse_ranges(&base.extra_ranges)?);
let mut dangerous = parse_dangerous(&base.dangerous)?;
for name in charset_names {
let data = load_charset(name)?;
ranges_set.extend(parse_ranges(&data.emoji_ranges)?);
ranges_set.extend(parse_ranges(&data.extra_ranges)?);
dangerous.extend(parse_dangerous(&data.dangerous)?);
}
for path in charset_files {
let data = load_charset(path)?;
ranges_set.extend(parse_ranges(&data.emoji_ranges)?);
ranges_set.extend(parse_ranges(&data.extra_ranges)?);
dangerous.extend(parse_dangerous(&data.dangerous)?);
}
let mut ranges: Vec<(u32, u32)> = ranges_set.into_iter().collect();
ranges.sort();
Ok((ranges, dangerous))
}
fn is_in_ranges(c: char, ranges: &[(u32, u32)]) -> bool {
let code = c as u32;
if code <= 127 {
return true;
}
let idx = ranges.partition_point(|&(start, _)| start <= code);
if idx == 0 {
return false;
}
code <= ranges[idx - 1].1
}
fn compute_comment_mask_python(chars: &[char]) -> Vec<bool> {
let n = chars.len();
let mut mask = vec![false; n];
let mut i = 0;
while i < n {
if i + 2 < n {
let is_triple_double =
chars[i] == '"' && chars[i + 1] == '"' && chars[i + 2] == '"';
let is_triple_single =
chars[i] == '\'' && chars[i + 1] == '\'' && chars[i + 2] == '\'';
if is_triple_double || is_triple_single {
let q = chars[i];
i += 3;
while i < n {
if chars[i] == '\\' && i + 1 < n {
i += 2;
continue;
}
if i + 2 < n && chars[i] == q && chars[i + 1] == q && chars[i + 2] == q {
i += 3;
break;
}
i += 1;
}
continue;
}
}
if chars[i] == '"' || chars[i] == '\'' {
let q = chars[i];
i += 1;
while i < n && chars[i] != '\n' {
if chars[i] == '\\' && i + 1 < n {
i += 2;
continue;
}
if chars[i] == q {
i += 1;
break;
}
i += 1;
}
continue;
}
if chars[i] == '#' {
while i < n && chars[i] != '\n' {
mask[i] = true;
i += 1;
}
continue;
}
i += 1;
}
mask
}
fn compute_comment_mask_js(chars: &[char]) -> Vec<bool> {
let n = chars.len();
let mut mask = vec![false; n];
let mut i = 0;
while i < n {
if chars[i] == '`' {
i += 1;
while i < n {
if chars[i] == '\\' && i + 1 < n {
i += 2;
continue;
}
if chars[i] == '`' {
i += 1;
break;
}
i += 1;
}
continue;
}
if chars[i] == '"' || chars[i] == '\'' {
let q = chars[i];
i += 1;
while i < n && chars[i] != '\n' {
if chars[i] == '\\' && i + 1 < n {
i += 2;
continue;
}
if chars[i] == q {
i += 1;
break;
}
i += 1;
}
continue;
}
if i + 1 < n && chars[i] == '/' && chars[i + 1] == '/' {
while i < n && chars[i] != '\n' {
mask[i] = true;
i += 1;
}
continue;
}
if i + 1 < n && chars[i] == '/' && chars[i + 1] == '*' {
while i < n {
if i + 1 < n && chars[i] == '*' && chars[i + 1] == '/' {
mask[i] = true;
mask[i + 1] = true;
i += 2;
break;
}
mask[i] = true;
i += 1;
}
continue;
}
i += 1;
}
mask
}
pub fn check_file(
path: &Path,
extra_ranges: &[(u32, u32)],
dangerous: &HashMap<u32, String>,
) -> Vec<String> {
let mut problems = Vec::new();
let content = match std::fs::read_to_string(path) {
Ok(c) => c,
Err(e) => {
problems.push(format!("Failed to read file: {}", e));
return problems;
}
};
let chars: Vec<char> = content.chars().collect();
let suffix = path
.extension()
.and_then(|e| e.to_str())
.unwrap_or("")
.to_lowercase();
let comment_mask: Vec<bool> = match suffix.as_str() {
"py" => compute_comment_mask_python(&chars),
"js" | "ts" | "tsx" | "jsx" | "mjs" | "cjs" | "rs" => compute_comment_mask_js(&chars),
_ => vec![false; chars.len()],
};
for (i, &c) in chars.iter().enumerate() {
let code = c as u32;
if let Some(name) = dangerous.get(&code) {
if !comment_mask[i] {
problems.push(format!(
"Dangerous character in code at position {}: U+{:04X} ({})",
i + 1,
code,
name
));
}
} else if !is_in_ranges(c, extra_ranges) {
problems.push(format!(
"Illegal character at position {}: {:?} (U+{:04X})",
i + 1,
c,
code
));
}
if problems.len() >= 5 {
break;
}
}
problems
}
const DEFAULT_DIRS: &[&str] = &["src", "tests", "examples"];
const DEFAULT_GLOBS: &[&str] = &["*.md", "*.yml", "*.yaml", "*.json", ".gitignore"];
const SKIP_SUFFIXES: &[&str] = &[
"pyc", "pyo",
"png", "jpg", "jpeg", "gif", "bmp", "ico", "svg", "webp",
"ttf", "otf", "woff", "woff2", "eot",
"zip", "tar", "gz", "bz2", "xz", "7z",
"so", "dylib", "dll", "exe", "o", "a", "whl", "egg",
"mp3", "mp4", "wav", "avi", "mov", "flac", "ogg",
"pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
"db", "sqlite", "sqlite3", "pickle", "pkl",
];
const SKIP_DIRS: &[&str] = &[
"__pycache__",
"node_modules",
".git",
".venv",
"venv",
".tox",
".mypy_cache",
".pytest_cache",
".ruff_cache",
"dist",
"build",
"target",
];
fn should_skip_suffix(path: &Path) -> bool {
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
let ext_lower = ext.to_lowercase();
SKIP_SUFFIXES.contains(&ext_lower.as_str())
} else {
false
}
}
fn walk_dir(directory: &Path) -> Vec<PathBuf> {
let mut files = Vec::new();
let mut entries: Vec<_> = match std::fs::read_dir(directory) {
Ok(r) => r.filter_map(|e| e.ok()).collect(),
Err(_) => return files,
};
entries.sort_by_key(|e| e.file_name());
for entry in entries {
let path = entry.path();
let name = entry.file_name();
let name_str = name.to_string_lossy();
if name_str.starts_with('.') {
continue;
}
if path.is_dir() {
if SKIP_DIRS.contains(&name_str.as_ref()) || name_str.ends_with(".egg-info") {
continue;
}
files.extend(walk_dir(&path));
} else if path.is_file() && !should_skip_suffix(&path) {
files.push(path);
}
}
files
}
fn matches_simple_glob(path: &Path, pattern: &str) -> bool {
let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
if let Some(ext) = pattern.strip_prefix("*.") {
name.ends_with(&format!(".{}", ext))
} else {
name == pattern
}
}
fn default_project_files() -> Vec<PathBuf> {
let cwd = std::env::current_dir().unwrap_or_default();
let mut files = Vec::new();
for &dirname in DEFAULT_DIRS {
let d = cwd.join(dirname);
if d.is_dir() {
files.extend(walk_dir(&d));
}
}
for &pattern in DEFAULT_GLOBS {
if let Ok(entries) = std::fs::read_dir(&cwd) {
let mut matched: Vec<PathBuf> = entries
.filter_map(|e| e.ok())
.map(|e| e.path())
.filter(|p| p.is_file() && matches_simple_glob(p, pattern))
.collect();
matched.sort();
files.extend(matched);
}
}
files
}
fn resolve_paths(paths: Vec<PathBuf>) -> Vec<PathBuf> {
if paths.is_empty() {
return default_project_files();
}
let mut result = Vec::new();
for p in paths {
if p.is_dir() {
result.extend(walk_dir(&p));
} else {
result.push(p);
}
}
result
}
pub fn check_paths(
paths: Vec<PathBuf>,
extra_ranges: &[(u32, u32)],
dangerous: &HashMap<u32, String>,
) -> i32 {
let resolved = resolve_paths(paths);
if resolved.is_empty() {
println!("No files to check.");
return 0;
}
let mut has_error = false;
let checked = resolved.len();
for path in &resolved {
let problems = check_file(path, extra_ranges, dangerous);
if !problems.is_empty() {
has_error = true;
println!("\n{} contains illegal characters:", path.display());
for p in &problems {
println!(" {}", p);
}
}
}
if !has_error {
println!("All {} files passed.", checked);
}
if has_error { 1 } else { 0 }
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
fn base_charsets() -> (Vec<(u32, u32)>, HashMap<u32, String>) {
resolve_charsets(&[], &[]).unwrap()
}
fn check_content(content: &str, suffix: &str) -> Vec<String> {
let mut f = NamedTempFile::with_suffix(suffix).unwrap();
f.write_all(content.as_bytes()).unwrap();
let (ranges, dangerous) = base_charsets();
check_file(f.path(), &ranges, &dangerous)
}
#[test]
fn test_ascii_passes() {
let problems = check_content("hello world\n", ".py");
assert!(problems.is_empty());
}
#[test]
fn test_emoji_passes() {
let problems = check_content("x = '😀'\n", ".py");
assert!(problems.is_empty());
}
#[test]
fn test_illegal_char_flagged() {
let problems = check_content("x = '你'\n", ".py");
assert!(!problems.is_empty());
assert!(problems[0].contains("U+4F60"));
}
#[test]
fn test_chinese_charset_allows_cjk() {
let (ranges, dangerous) = resolve_charsets(&["chinese".to_string()], &[]).unwrap();
let mut f = NamedTempFile::with_suffix(".py").unwrap();
f.write_all("x = '你好'\n".as_bytes()).unwrap();
let problems = check_file(f.path(), &ranges, &dangerous);
assert!(problems.is_empty());
}
#[test]
fn test_dangerous_char_in_code_flagged() {
let content = "x = 1\ny\u{202E} = 2\n";
let problems = check_content(content, ".py");
assert!(!problems.is_empty());
assert!(problems[0].contains("Dangerous character in code"));
assert!(problems[0].contains("U+202E"));
}
#[test]
fn test_dangerous_char_in_comment_allowed() {
let content = "x = 1\n# \u{202E} bidi in comment\n";
let problems = check_content(content, ".py");
assert!(problems.is_empty());
}
#[test]
fn test_dangerous_char_in_js_line_comment_allowed() {
let content = "const x = 1;\n// \u{202E} bidi\n";
let problems = check_content(content, ".js");
assert!(problems.is_empty());
}
#[test]
fn test_dangerous_char_in_rust_line_comment_allowed() {
let content = "let x = 1;\n// \u{202E} bidi\n";
let problems = check_content(content, ".rs");
assert!(problems.is_empty());
}
#[test]
fn test_dangerous_char_in_rust_block_comment_allowed() {
let content = "/* \u{202E} bidi */ let x = 1;\n";
let problems = check_content(content, ".rs");
assert!(problems.is_empty());
}
#[test]
fn test_dangerous_char_in_js_block_comment_allowed() {
let content = "/* \u{202E} bidi */ const x = 1;\n";
let problems = check_content(content, ".js");
assert!(problems.is_empty());
}
#[test]
fn test_max_5_problems_reported() {
let content = "你好世界再见哦";
let problems = check_content(content, ".txt");
assert_eq!(problems.len(), 5);
}
#[test]
fn test_resolve_charsets_unknown_name() {
let result = resolve_charsets(&["nonexistent".to_string()], &[]);
assert!(result.is_err());
assert!(result.unwrap_err().contains("Unknown charset"));
}
#[test]
fn test_resolve_charsets_deduplication() {
let (r1, _) = resolve_charsets(&["chinese".to_string()], &[]).unwrap();
let (r2, _) =
resolve_charsets(&["chinese".to_string(), "chinese".to_string()], &[]).unwrap();
assert_eq!(r1.len(), r2.len());
}
#[test]
fn test_non_utf8_file_returns_error() {
use std::fs;
let dir = tempfile::tempdir().unwrap();
let path = dir.path().join("bad.py");
fs::write(&path, b"\xFF\xFE invalid utf8").unwrap();
let (ranges, dangerous) = base_charsets();
let problems = check_file(&path, &ranges, &dangerous);
assert!(!problems.is_empty());
assert!(problems[0].contains("Failed to read file"));
}
#[test]
fn test_is_in_ranges_binary_search_boundary() {
let (ranges, _) = base_charsets();
assert!(is_in_ranges('\u{0080}', &ranges)); assert!(is_in_ranges('\u{00FF}', &ranges)); assert!(!is_in_ranges('\u{0100}', &ranges)); }
#[test]
fn test_parse_hex_invalid_propagates_error() {
let bad_json = r#"{"extra_ranges": [{"start": "0xZZZZ", "end": "0x00FF"}]}"#;
let result: Result<CharsetData, _> = serde_json::from_str(bad_json);
let data: CharsetData = result.unwrap();
let range_result = parse_ranges(&data.extra_ranges);
assert!(range_result.is_err());
assert!(range_result.unwrap_err().contains("Invalid hex codepoint"));
}
}