tree_parser/
languages.rs

1//! Language detection and tree-sitter language loading
2
3use crate::{Error, Language};
4use std::path::Path;
5use tree_sitter::Language as TSLanguage;
6
7/// Get the tree-sitter language for a given Language enum
8pub fn get_tree_sitter_language(language: &Language) -> Result<TSLanguage, Error> {
9    match language {
10        #[cfg(feature = "python")]
11        Language::Python => Ok(tree_sitter_python::LANGUAGE.into()),
12        #[cfg(feature = "rust_lang")]
13        Language::Rust => Ok(tree_sitter_rust::LANGUAGE.into()),
14        #[cfg(feature = "javascript")]
15        Language::JavaScript => Ok(tree_sitter_javascript::LANGUAGE.into()),
16        #[cfg(feature = "typescript")]
17        Language::TypeScript => Ok(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()),
18        #[cfg(feature = "java")]
19        Language::Java => Ok(tree_sitter_java::LANGUAGE.into()),
20        #[cfg(feature = "c")]
21        Language::C => Ok(tree_sitter_c::LANGUAGE.into()),
22        #[cfg(feature = "cpp")]
23        Language::Cpp => Ok(tree_sitter_cpp::LANGUAGE.into()),
24        #[cfg(feature = "go")]
25        Language::Go => Ok(tree_sitter_go::LANGUAGE.into()),
26        _ => Err(Error::UnsupportedLanguage(format!("{:?}", language))),
27    }
28}
29
30/// Detect language by file extension
31pub fn detect_language_by_extension(file_path: &str) -> Option<Language> {
32    let path = Path::new(file_path);
33    let extension = path.extension()?.to_str()?.to_lowercase();
34    
35    match extension.as_str() {
36        "py" | "pyw" | "pyi" => Some(Language::Python),
37        "rs" => Some(Language::Rust),
38        "js" | "mjs" | "cjs" => Some(Language::JavaScript),
39        "ts" | "mts" | "cts" => Some(Language::TypeScript),
40        "java" => Some(Language::Java),
41        "c" | "h" => Some(Language::C),
42        "cpp" | "cc" | "cxx" | "c++" | "hpp" | "hh" | "hxx" | "h++" => Some(Language::Cpp),
43        "go" => Some(Language::Go),
44        "cs" => Some(Language::CSharp),
45        "php" | "phtml" | "php3" | "php4" | "php5" | "phps" => Some(Language::Php),
46        "rb" | "rbw" => Some(Language::Ruby),
47        "swift" => Some(Language::Swift),
48        "kt" | "kts" => Some(Language::Kotlin),
49        "scala" | "sc" => Some(Language::Scala),
50        "hs" | "lhs" => Some(Language::Haskell),
51        "lua" => Some(Language::Lua),
52        "pl" | "pm" | "t" | "pod" => Some(Language::Perl),
53        "r" | "R" => Some(Language::R),
54        "sh" | "bash" | "zsh" | "fish" => Some(Language::Bash),
55        "ps1" | "psm1" | "psd1" => Some(Language::PowerShell),
56        "html" | "htm" | "xhtml" => Some(Language::Html),
57        "css" => Some(Language::Css),
58        "sql" => Some(Language::Sql),
59        "json" => Some(Language::Json),
60        "yaml" | "yml" => Some(Language::Yaml),
61        "toml" => Some(Language::Toml),
62        "xml" | "xsd" | "xsl" | "xslt" => Some(Language::Xml),
63        _ => None,
64    }
65}
66
67/// Detect language by shebang line
68pub fn detect_language_by_shebang(content: &str) -> Option<Language> {
69    let first_line = content.lines().next()?;
70    if !first_line.starts_with("#!") {
71        return None;
72    }
73    
74    let shebang = first_line.to_lowercase();
75    
76    if shebang.contains("python") {
77        Some(Language::Python)
78    } else if shebang.contains("node") {
79        Some(Language::JavaScript)
80    } else if shebang.contains("bash") || shebang.contains("/bin/sh") {
81        Some(Language::Bash)
82    } else if shebang.contains("ruby") {
83        Some(Language::Ruby)
84    } else if shebang.contains("perl") {
85        Some(Language::Perl)
86    } else if shebang.contains("php") {
87        Some(Language::Php)
88    } else {
89        None
90    }
91}
92
93/// Detect language by file content patterns
94pub fn detect_language_by_content(content: &str) -> Option<Language> {
95    // Simple heuristics based on common patterns
96    let content_lower = content.to_lowercase();
97    
98    // Check for specific language patterns
99    if content_lower.contains("def ") && content_lower.contains("import ") {
100        return Some(Language::Python);
101    }
102    
103    if content_lower.contains("fn ") && content_lower.contains("use ") {
104        return Some(Language::Rust);
105    }
106    
107    if content_lower.contains("function ") && content_lower.contains("var ") {
108        return Some(Language::JavaScript);
109    }
110    
111    if content_lower.contains("public class ") && content_lower.contains("import ") {
112        return Some(Language::Java);
113    }
114    
115    if content_lower.contains("#include") && content_lower.contains("int main") {
116        return Some(Language::C);
117    }
118    
119    None
120}
121
122/// Combined language detection using multiple methods
123pub fn detect_language(file_path: &str, content: Option<&str>) -> Option<Language> {
124    // Try extension first
125    if let Some(lang) = detect_language_by_extension(file_path) {
126        return Some(lang);
127    }
128    
129    // If content is provided, try shebang and content analysis
130    if let Some(content) = content {
131        if let Some(lang) = detect_language_by_shebang(content) {
132            return Some(lang);
133        }
134        
135        if let Some(lang) = detect_language_by_content(content) {
136            return Some(lang);
137        }
138    }
139    
140    None
141}
142
143/// Get supported node types for a language
144pub fn get_supported_node_types(language: &Language) -> Vec<String> {
145    match language {
146        Language::Python => vec![
147            "function_definition".to_string(),
148            "class_definition".to_string(),
149            "import_statement".to_string(),
150            "import_from_statement".to_string(),
151            "assignment".to_string(),
152            "decorated_definition".to_string(),
153        ],
154        Language::Rust => vec![
155            "function_item".to_string(),
156            "struct_item".to_string(),
157            "enum_item".to_string(),
158            "impl_item".to_string(),
159            "trait_item".to_string(),
160            "mod_item".to_string(),
161            "use_declaration".to_string(),
162            "const_item".to_string(),
163            "static_item".to_string(),
164        ],
165        Language::JavaScript => vec![
166            "function_declaration".to_string(),
167            "function_expression".to_string(),
168            "arrow_function".to_string(),
169            "class_declaration".to_string(),
170            "method_definition".to_string(),
171            "variable_declaration".to_string(),
172            "import_statement".to_string(),
173            "export_statement".to_string(),
174        ],
175        Language::TypeScript => vec![
176            "function_declaration".to_string(),
177            "function_expression".to_string(),
178            "arrow_function".to_string(),
179            "class_declaration".to_string(),
180            "interface_declaration".to_string(),
181            "type_alias_declaration".to_string(),
182            "method_definition".to_string(),
183            "variable_declaration".to_string(),
184            "import_statement".to_string(),
185            "export_statement".to_string(),
186        ],
187        Language::Java => vec![
188            "class_declaration".to_string(),
189            "interface_declaration".to_string(),
190            "method_declaration".to_string(),
191            "constructor_declaration".to_string(),
192            "field_declaration".to_string(),
193            "import_declaration".to_string(),
194            "package_declaration".to_string(),
195        ],
196        Language::C => vec![
197            "function_definition".to_string(),
198            "declaration".to_string(),
199            "struct_specifier".to_string(),
200            "union_specifier".to_string(),
201            "enum_specifier".to_string(),
202            "preproc_include".to_string(),
203            "preproc_define".to_string(),
204        ],
205        Language::Cpp => vec![
206            "function_definition".to_string(),
207            "declaration".to_string(),
208            "class_specifier".to_string(),
209            "struct_specifier".to_string(),
210            "union_specifier".to_string(),
211            "enum_specifier".to_string(),
212            "namespace_definition".to_string(),
213            "preproc_include".to_string(),
214            "preproc_define".to_string(),
215        ],
216        Language::Go => vec![
217            "function_declaration".to_string(),
218            "method_declaration".to_string(),
219            "type_declaration".to_string(),
220            "var_declaration".to_string(),
221            "const_declaration".to_string(),
222            "import_declaration".to_string(),
223            "package_clause".to_string(),
224        ],
225        _ => vec![], // For unsupported languages
226    }
227}