use anyhow::Result;
use once_cell::sync::Lazy;
use std::collections::HashMap;
use std::path::Path;
use tree_sitter::Language;
/// Static description of one language registered in `LANGUAGES`.
#[derive(Debug, Clone)]
pub struct LanguageInfo {
/// Language name; the registry stores each entry under this same string.
pub name: &'static str,
/// File extensions (written without a leading dot, e.g. `"rs"`) that map to
/// this language in `EXTENSION_MAP`.
pub extensions: &'static [&'static str],
/// tree-sitter grammar for the language; `parse_ast` hands it to the AST analyzer.
pub language: Language,
/// File names or globs associated with the language (e.g. `"Cargo.toml"`).
/// NOTE(review): within this file they are only printed by
/// `list_supported_languages`; confirm whether other modules use them for detection.
pub patterns: &'static [&'static str],
}
static LANGUAGES: Lazy<HashMap<&'static str, LanguageInfo>> = Lazy::new(|| {
let mut languages = HashMap::new();
macro_rules! register_language {
($name:expr, $lang:expr, $exts:expr, $patterns:expr) => {
languages.insert($name, LanguageInfo {
name: $name,
extensions: $exts,
language: $lang,
patterns: $patterns,
});
};
}
register_language!(
"rust",
tree_sitter_rust::language(),
&["rs"],
&["Cargo.toml", "Cargo.lock"]
);
register_language!(
"python",
tree_sitter_python::language(),
&["py", "pyw", "pyi"],
&["setup.py", "pyproject.toml", "requirements.txt"]
);
register_language!(
"javascript",
tree_sitter_javascript::language(),
&["js", "mjs", "cjs"],
&["package.json", ".eslintrc.js"]
);
register_language!(
"typescript",
tree_sitter_typescript::language_typescript(),
&["ts"],
&["tsconfig.json"]
);
register_language!(
"tsx",
tree_sitter_typescript::language_tsx(),
&["tsx"],
&[]
);
register_language!(
"go",
tree_sitter_go::language(),
&["go"],
&["go.mod", "go.sum"]
);
register_language!(
"java",
tree_sitter_java::language(),
&["java"],
&["pom.xml", "build.gradle"]
);
register_language!(
"c",
tree_sitter_c::language(),
&["c", "h"],
&["Makefile", "CMakeLists.txt"]
);
register_language!(
"cpp",
tree_sitter_cpp::language(),
&["cpp", "cc", "cxx", "hpp", "hxx", "h++"],
&["CMakeLists.txt"]
);
register_language!(
"csharp",
tree_sitter_c_sharp::language(),
&["cs"],
&["*.csproj", "*.sln"]
);
register_language!(
"ruby",
tree_sitter_ruby::language(),
&["rb"],
&["Gemfile", "Rakefile"]
);
register_language!(
"bash",
tree_sitter_bash::language(),
&["sh", "bash", "zsh"],
&[".bashrc", ".zshrc"]
);
register_language!(
"json",
tree_sitter_json::language(),
&["json"],
&["package.json", "tsconfig.json"]
);
register_language!(
"toml",
tree_sitter_toml::language(),
&["toml"],
&["Cargo.toml", "pyproject.toml"]
);
register_language!(
"css",
tree_sitter_css::language(),
&["css"],
&[]
);
languages
});
/// Reverse index from file extension (no leading dot) to the name of the
/// language in `LANGUAGES` that lists it.
static EXTENSION_MAP: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
    LANGUAGES
        .iter()
        .flat_map(|(name, info)| info.extensions.iter().map(move |ext| (*ext, *name)))
        .collect::<HashMap<_, _>>()
});
/// Exact file names that identify a file type on their own, mapped to the
/// type name reported by `detect_language`.
static SPECIAL_FILES: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
    const KNOWN_NAMES: &[(&str, &str)] = &[
        ("Dockerfile", "dockerfile"),
        ("Dockerfile.dev", "dockerfile"),
        ("Dockerfile.prod", "dockerfile"),
        ("Makefile", "make"),
        ("makefile", "make"),
        ("GNUmakefile", "make"),
        ("CMakeLists.txt", "cmake"),
        (".gitignore", "gitignore"),
        (".dockerignore", "dockerignore"),
        ("README.md", "markdown"),
        ("README.rst", "rst"),
    ];

    KNOWN_NAMES.iter().copied().collect::<HashMap<_, _>>()
});
/// Detects the language (or file type) of the file at `path`.
///
/// Checks are tried in this order, and the first hit wins:
///
/// 1. the exact file name, looked up in `SPECIAL_FILES`;
/// 2. the file extension, looked up in `EXTENSION_MAP` (case-sensitive);
/// 3. the interpreter named on a `#!` first line, read from disk.
///
/// Steps 1 and 2 never touch the file system. Step 3 reads only a bounded
/// prefix of the file, so large files are not loaded into memory.
///
/// Returns `None` when nothing matches or the file cannot be opened or read.
pub fn detect_language(path: &Path) -> Option<String> {
    use std::io::{BufRead, Read};

    // Upper bound on how much of the first line is examined for a shebang.
    const SHEBANG_SCAN_LIMIT: u64 = 1024;

    let by_file_name = path
        .file_name()
        .and_then(|name| name.to_str())
        .and_then(|name| SPECIAL_FILES.get(name));
    if let Some(lang) = by_file_name {
        return Some(lang.to_string());
    }

    let by_extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .and_then(|ext| EXTENSION_MAP.get(ext));
    if let Some(lang) = by_extension {
        return Some(lang.to_string());
    }

    // Read at most SHEBANG_SCAN_LIMIT bytes, stopping at the first newline.
    let file = std::fs::File::open(path).ok()?;
    let mut head = Vec::new();
    std::io::BufReader::new(file.take(SHEBANG_SCAN_LIMIT))
        .read_until(b'\n', &mut head)
        .ok()?;

    // The interpreter names compared below are ASCII, so a lossy decode is
    // enough and avoids rejecting a file because of bytes elsewhere in it.
    language_from_shebang(&String::from_utf8_lossy(&head)).map(str::to_string)
}

/// Maps a `#!` line to a language name by inspecting the interpreter's base name.
///
/// Matching the program name (rather than searching the whole line for
/// substrings) keeps paths such as `/usr/share/...` or interpreters such as
/// `fish` from being mistaken for a POSIX-style shell.
fn language_from_shebang(first_line: &str) -> Option<&'static str> {
    fn base_name(word: &str) -> &str {
        word.rsplit('/').next().unwrap_or(word)
    }

    let command = first_line.strip_prefix("#!")?;
    let mut words = command.split_whitespace();
    let mut program = base_name(words.next()?);

    if program == "env" {
        // `env [-opts] [VAR=value ...] program`: skip options and assignments.
        let target = words.find(|word| !word.starts_with('-') && !word.contains('='))?;
        program = base_name(target);
    }

    if program.starts_with("python") {
        // Covers python, python3, python3.11, ...
        Some("python")
    } else if matches!(program, "sh" | "bash" | "zsh" | "dash" | "ksh" | "ash") {
        Some("bash")
    } else if matches!(program, "node" | "nodejs") {
        Some("javascript")
    } else {
        None
    }
}
pub fn get_language(name: &str) -> Option<&'static LanguageInfo> {
LANGUAGES.get(name)
}
/// Parses `source` with the grammar registered under the name `language`.
///
/// # Errors
///
/// Returns an error when `language` is not registered, when the analyzer
/// cannot be constructed for the grammar, or when parsing itself fails.
pub fn parse_ast(source: &str, language: &str) -> Result<crate::ast::ParsedAst> {
    let grammar = match get_language(language) {
        Some(info) => info.language.clone(),
        None => return Err(anyhow::anyhow!("Unknown language: {}", language)),
    };

    let mut analyzer = crate::ast::AstAnalyzer::new(grammar)?;
    analyzer.parse(source)
}
pub fn list_supported_languages() {
println!("Supported languages:");
println!();
let mut languages: Vec<_> = LANGUAGES.values().collect();
languages.sort_by_key(|info| info.name);
for info in languages {
let extensions = info.extensions.join(", ");
println!(" {:<15} Extensions: {}", info.name, extensions);
if !info.patterns.is_empty() {
let patterns = info.patterns.join(", ");
println!(" {:<15} Patterns: {}", "", patterns);
}
println!();
}
println!("Special files:");
for (file, lang) in SPECIAL_FILES.iter() {
println!(" {:<15} -> {}", file, lang);
}
}
/// Builds a predicate that accepts paths whose detected language is one of `languages`.
///
/// An empty `languages` slice yields a predicate that accepts every path
/// without running detection. Otherwise a path is accepted only when
/// `detect_language` returns a name contained in the list; paths with no
/// detected language are rejected.
pub fn get_language_filter(languages: &[String]) -> Box<dyn Fn(&Path) -> bool + Send + Sync> {
    let wanted: Vec<String> = languages.to_vec();
    Box::new(move |candidate: &Path| {
        wanted.is_empty()
            || detect_language(candidate).map_or(false, |found| wanted.contains(&found))
    })
}
/// Heuristically decides whether the file at `path` is binary.
///
/// A file is considered binary when either
///
/// * its extension (compared case-insensitively) is in a fixed list of
///   executable, image, media, archive and office-document extensions; this
///   check does not touch the file system; or
/// * a NUL byte occurs within its first 8192 bytes.
///
/// Only that leading window is read from disk, so the cost does not grow with
/// file size. Returns `false` when the file cannot be opened or read.
pub fn is_binary_file(path: &Path) -> bool {
    use std::io::Read;

    // Number of leading bytes inspected for a NUL byte.
    const SNIFF_LEN: usize = 8192;

    if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
        match ext.to_lowercase().as_str() {
            "exe" | "dll" | "so" | "dylib" | "a" | "lib" | "obj" | "o" => return true,
            "jpg" | "jpeg" | "png" | "gif" | "bmp" | "ico" | "svg" => return true,
            "mp3" | "mp4" | "wav" | "avi" | "mov" | "mkv" => return true,
            "zip" | "tar" | "gz" | "rar" | "7z" => return true,
            "pdf" | "doc" | "docx" | "xls" | "xlsx" | "ppt" | "pptx" => return true,
            _ => {}
        }
    }

    let file = match std::fs::File::open(path) {
        Ok(file) => file,
        Err(_) => return false,
    };

    // `take` caps the read at SNIFF_LEN bytes; shorter files are read in full.
    let mut head = Vec::with_capacity(SNIFF_LEN);
    match file.take(SNIFF_LEN as u64).read_to_end(&mut head) {
        Ok(_) => head.contains(&0),
        Err(_) => false,
    }
}
/// Counts the files under `path` per detected language.
///
/// Walks the tree rooted at `path` without following symbolic links and
/// tallies every regular file for which `detect_language` returns a name.
/// Files with no detected language are not counted.
///
/// # Errors
///
/// Stops and returns the error as soon as the directory walk reports one.
pub fn get_language_stats(path: &Path) -> Result<HashMap<String, usize>> {
    use walkdir::WalkDir;

    let mut counts: HashMap<String, usize> = HashMap::new();
    let walker = WalkDir::new(path).follow_links(false);

    for item in walker {
        let item = item?;
        if !item.file_type().is_file() {
            continue;
        }
        if let Some(language) = detect_language(item.path()) {
            *counts.entry(language).or_default() += 1;
        }
    }

    Ok(counts)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Shorthand for the `PathBuf` arguments used throughout these tests.
    fn p(name: &str) -> PathBuf {
        PathBuf::from(name)
    }

    /// Detection by extension and by special file name; none of these paths
    /// needs to exist on disk.
    #[test]
    fn test_detect_language() {
        let expectations = [
            ("test.rs", "rust"),
            ("test.py", "python"),
            ("test.js", "javascript"),
            ("test.ts", "typescript"),
            ("Dockerfile", "dockerfile"),
            ("Makefile", "make"),
        ];
        for &(file, language) in expectations.iter() {
            assert_eq!(detect_language(&p(file)), Some(language.to_string()));
        }
    }

    /// Registered names resolve; an unregistered name does not.
    #[test]
    fn test_get_language() {
        for known in ["rust", "python", "javascript"].iter() {
            assert!(get_language(known).is_some());
        }
        assert!(get_language("unknown").is_none());
    }

    /// A non-empty filter is selective; an empty filter accepts everything.
    #[test]
    fn test_language_filter() {
        let rust_only = get_language_filter(&["rust".to_string()]);
        assert!(rust_only(&p("test.rs")));
        assert!(!rust_only(&p("test.py")));

        let accept_all = get_language_filter(&[]);
        assert!(accept_all(&p("test.rs")));
        assert!(accept_all(&p("test.py")));
    }

    /// Known binary extensions are flagged; source and text names are not.
    #[test]
    fn test_is_binary_file() {
        for binary in ["test.exe", "image.jpg"].iter() {
            assert!(is_binary_file(&p(binary)));
        }
        for text in ["test.rs", "README.md"].iter() {
            assert!(!is_binary_file(&p(text)));
        }
    }
}