Skip to main content

ripvec_core/
languages.rs

1//! Language registry mapping file extensions to tree-sitter grammars.
2//!
3//! Each supported language has a grammar and a tree-sitter query that
4//! extracts function, class, and method definitions. Compiled queries
5//! are cached so that repeated calls for the same extension are free.
6
7use std::sync::{Arc, OnceLock};
8
9use tree_sitter::{Language, Query};
10
11/// Configuration for extracting function calls from a language.
12///
13/// Wrapped in [`Arc`] so it can be shared across threads and returned
14/// from the cache without cloning the compiled [`Query`].
15pub struct CallConfig {
16    /// The tree-sitter Language grammar.
17    pub language: Language,
18    /// Query that extracts call sites (`@callee` captures).
19    pub query: Query,
20}
21
22/// Configuration for a supported source language.
23///
24/// Wrapped in [`Arc`] so it can be shared across threads and returned
25/// from the cache without cloning the compiled [`Query`].
26pub struct LangConfig {
27    /// The tree-sitter Language grammar.
28    pub language: Language,
29    /// Query that extracts semantic chunks (`@def` captures with `@name`).
30    pub query: Query,
31}
32
33/// Look up the language configuration for a file extension.
34///
35/// Compiled queries are cached per extension so repeated calls are free.
36/// Returns `None` for unsupported extensions.
37#[must_use]
38pub fn config_for_extension(ext: &str) -> Option<Arc<LangConfig>> {
39    // Cache of compiled configs, keyed by canonical extension.
40    static CACHE: OnceLock<std::collections::HashMap<&'static str, Arc<LangConfig>>> =
41        OnceLock::new();
42
43    let cache = CACHE.get_or_init(|| {
44        let mut m = std::collections::HashMap::new();
45        // Pre-compile all supported extensions
46        for &ext in &[
47            "rs", "py", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc", "cxx",
48            "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
49            "scala", "toml",
50        ] {
51            if let Some(cfg) = compile_config(ext) {
52                m.insert(ext, Arc::new(cfg));
53            }
54        }
55        m
56    });
57
58    cache.get(ext).cloned()
59}
60
61/// Compile a [`LangConfig`] for the given extension (uncached).
62#[expect(
63    clippy::too_many_lines,
64    reason = "one match arm per language — flat by design"
65)]
66fn compile_config(ext: &str) -> Option<LangConfig> {
67    let (lang, query_str): (Language, &str) = match ext {
68        // Rust: standalone functions, structs, and methods INSIDE impl/trait blocks.
69        // impl_item and trait_item are NOT captured as wholes — we extract their
70        // individual function_item children for method-level granularity.
71        "rs" => (
72            tree_sitter_rust::LANGUAGE.into(),
73            concat!(
74                "(function_item name: (identifier) @name) @def\n",
75                "(struct_item name: (type_identifier) @name) @def\n",
76                "(enum_item name: (type_identifier) @name) @def\n",
77                "(type_item name: (type_identifier) @name) @def\n",
78                "(field_declaration name: (field_identifier) @name) @def\n",
79                "(enum_variant name: (identifier) @name) @def\n",
80                "(impl_item type: (type_identifier) @name) @def\n",
81                "(trait_item name: (type_identifier) @name) @def\n",
82                "(const_item name: (identifier) @name) @def\n",
83                "(static_item name: (identifier) @name) @def\n",
84                "(mod_item name: (identifier) @name) @def",
85            ),
86        ),
87        // Python: top-level functions AND methods inside classes (function_definition
88        // matches at any nesting depth, so methods are captured individually).
89        "py" => (
90            tree_sitter_python::LANGUAGE.into(),
91            concat!(
92                "(function_definition name: (identifier) @name) @def\n",
93                "(class_definition name: (identifier) @name body: (block) @def)\n",
94                "(assignment left: (identifier) @name) @def",
95            ),
96        ),
97        // JS: functions, methods, and arrow functions assigned to variables.
98        "js" | "jsx" => (
99            tree_sitter_javascript::LANGUAGE.into(),
100            concat!(
101                "(function_declaration name: (identifier) @name) @def\n",
102                "(method_definition name: (property_identifier) @name) @def\n",
103                "(class_declaration name: (identifier) @name) @def\n",
104                "(variable_declarator name: (identifier) @name) @def",
105            ),
106        ),
107        "ts" => (
108            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
109            concat!(
110                "(function_declaration name: (identifier) @name) @def\n",
111                "(method_definition name: (property_identifier) @name) @def\n",
112                "(class_declaration name: (type_identifier) @name) @def\n",
113                "(interface_declaration name: (type_identifier) @name) @def\n",
114                "(variable_declarator name: (identifier) @name) @def\n",
115                "(type_alias_declaration name: (type_identifier) @name) @def\n",
116                "(enum_declaration name: (identifier) @name) @def",
117            ),
118        ),
119        "tsx" => (
120            tree_sitter_typescript::LANGUAGE_TSX.into(),
121            concat!(
122                "(function_declaration name: (identifier) @name) @def\n",
123                "(method_definition name: (property_identifier) @name) @def\n",
124                "(class_declaration name: (type_identifier) @name) @def\n",
125                "(interface_declaration name: (type_identifier) @name) @def\n",
126                "(variable_declarator name: (identifier) @name) @def\n",
127                "(type_alias_declaration name: (type_identifier) @name) @def\n",
128                "(enum_declaration name: (identifier) @name) @def",
129            ),
130        ),
131        "go" => (
132            tree_sitter_go::LANGUAGE.into(),
133            concat!(
134                "(function_declaration name: (identifier) @name) @def\n",
135                "(method_declaration name: (field_identifier) @name) @def\n",
136                "(type_declaration (type_spec name: (type_identifier) @name)) @def\n",
137                "(const_spec name: (identifier) @name) @def",
138            ),
139        ),
140        // Java: methods are already captured individually (method_declaration
141        // matches inside class bodies). Keep class for the signature/fields.
142        "java" => (
143            tree_sitter_java::LANGUAGE.into(),
144            concat!(
145                "(method_declaration name: (identifier) @name) @def\n",
146                "(class_declaration name: (identifier) @name) @def\n",
147                "(interface_declaration name: (identifier) @name) @def\n",
148                "(field_declaration declarator: (variable_declarator name: (identifier) @name)) @def\n",
149                "(enum_constant name: (identifier) @name) @def\n",
150                "(enum_declaration name: (identifier) @name) @def\n",
151                "(constructor_declaration name: (identifier) @name) @def",
152            ),
153        ),
154        "c" | "h" => (
155            tree_sitter_c::LANGUAGE.into(),
156            concat!(
157                "(function_definition declarator: (function_declarator declarator: (identifier) @name)) @def\n",
158                "(declaration declarator: (init_declarator declarator: (identifier) @name)) @def\n",
159                "(struct_specifier name: (type_identifier) @name) @def\n",
160                "(enum_specifier name: (type_identifier) @name) @def\n",
161                "(type_definition declarator: (type_identifier) @name) @def",
162            ),
163        ),
164        // C++: functions at any level, plus class signatures.
165        "cpp" | "cc" | "cxx" | "hpp" => (
166            tree_sitter_cpp::LANGUAGE.into(),
167            concat!(
168                "(function_definition declarator: (function_declarator declarator: (identifier) @name)) @def\n",
169                "(class_specifier name: (type_identifier) @name) @def\n",
170                "(declaration declarator: (init_declarator declarator: (identifier) @name)) @def\n",
171                "(struct_specifier name: (type_identifier) @name) @def\n",
172                "(enum_specifier name: (type_identifier) @name) @def\n",
173                "(type_definition declarator: (type_identifier) @name) @def\n",
174                "(namespace_definition name: (namespace_identifier) @name) @def\n",
175                "(field_declaration declarator: (field_identifier) @name) @def",
176            ),
177        ),
178        // Bash: function definitions (.bats = Bash Automated Testing System).
179        "sh" | "bash" | "bats" => (
180            tree_sitter_bash::LANGUAGE.into(),
181            concat!(
182                "(function_definition name: (word) @name) @def\n",
183                "(variable_assignment name: (variable_name) @name) @def",
184            ),
185        ),
186        // Ruby: methods, classes, and modules.
187        "rb" => (
188            tree_sitter_ruby::LANGUAGE.into(),
189            concat!(
190                "(method name: (identifier) @name) @def\n",
191                "(class name: (constant) @name) @def\n",
192                "(module name: (constant) @name) @def\n",
193                "(assignment left: (identifier) @name) @def\n",
194                "(assignment left: (constant) @name) @def",
195            ),
196        ),
197        // HCL (Terraform): resource, data, variable, and output blocks.
198        "tf" | "tfvars" | "hcl" => (
199            tree_sitter_hcl::LANGUAGE.into(),
200            "(block (identifier) @name) @def",
201        ),
202        // Kotlin: functions, classes, and objects.
203        "kt" | "kts" => (
204            tree_sitter_kotlin_ng::LANGUAGE.into(),
205            concat!(
206                "(function_declaration name: (identifier) @name) @def\n",
207                "(class_declaration name: (identifier) @name) @def\n",
208                "(object_declaration name: (identifier) @name) @def\n",
209                "(property_declaration (identifier) @name) @def\n",
210                "(enum_entry (identifier) @name) @def",
211            ),
212        ),
213        // Swift: functions, classes, structs, enums, and protocols.
214        "swift" => (
215            tree_sitter_swift::LANGUAGE.into(),
216            concat!(
217                "(function_declaration name: (simple_identifier) @name) @def\n",
218                "(class_declaration name: (type_identifier) @name) @def\n",
219                "(protocol_declaration name: (type_identifier) @name) @def\n",
220                "(property_declaration name: (pattern bound_identifier: (simple_identifier) @name)) @def\n",
221                "(typealias_declaration name: (type_identifier) @name) @def",
222            ),
223        ),
224        // Scala: functions, classes, traits, and objects.
225        "scala" => (
226            tree_sitter_scala::LANGUAGE.into(),
227            concat!(
228                "(function_definition name: (identifier) @name) @def\n",
229                "(class_definition name: (identifier) @name) @def\n",
230                "(trait_definition name: (identifier) @name) @def\n",
231                "(object_definition name: (identifier) @name) @def\n",
232                "(val_definition pattern: (identifier) @name) @def\n",
233                "(var_definition pattern: (identifier) @name) @def\n",
234                "(type_definition name: (type_identifier) @name) @def",
235            ),
236        ),
237        // TOML: table headers (sections).
238        "toml" => (
239            tree_sitter_toml_ng::LANGUAGE.into(),
240            concat!(
241                "(table (bare_key) @name) @def\n",
242                "(pair (bare_key) @name) @def",
243            ),
244        ),
245        _ => return None,
246    };
247    let query = match Query::new(&lang, query_str) {
248        Ok(q) => q,
249        Err(e) => {
250            tracing::warn!(ext, %e, "tree-sitter query compilation failed — language may be ABI-incompatible");
251            return None;
252        }
253    };
254    Some(LangConfig {
255        language: lang,
256        query,
257    })
258}
259
260/// Look up the call-extraction query for a file extension.
261///
262/// Compiled queries are cached per extension so repeated calls are free.
263/// Returns `None` for unsupported extensions (including TOML, which has
264/// no function calls).
265#[must_use]
266pub fn call_query_for_extension(ext: &str) -> Option<Arc<CallConfig>> {
267    static CACHE: OnceLock<std::collections::HashMap<&'static str, Arc<CallConfig>>> =
268        OnceLock::new();
269
270    let cache = CACHE.get_or_init(|| {
271        let mut m = std::collections::HashMap::new();
272        // Pre-compile for all extensions that have callable constructs.
273        // TOML is deliberately excluded — it has no function calls.
274        for &ext in &[
275            "rs", "py", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc", "cxx",
276            "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
277            "scala",
278        ] {
279            if let Some(cfg) = compile_call_config(ext) {
280                m.insert(ext, Arc::new(cfg));
281            }
282        }
283        m
284    });
285
286    cache.get(ext).cloned()
287}
288
289/// Compile a [`CallConfig`] for the given extension (uncached).
290///
291/// Each query extracts the callee identifier (`@callee`) from function
292/// and method calls, plus the whole call expression (`@call`).
293#[expect(
294    clippy::too_many_lines,
295    reason = "one match arm per language — flat by design"
296)]
297fn compile_call_config(ext: &str) -> Option<CallConfig> {
298    let (lang, query_str): (Language, &str) = match ext {
299        // Rust: free calls, method calls, and scoped (path) calls.
300        "rs" => (
301            tree_sitter_rust::LANGUAGE.into(),
302            concat!(
303                "(call_expression function: (identifier) @callee) @call\n",
304                "(call_expression function: (field_expression field: (field_identifier) @callee)) @call\n",
305                "(call_expression function: (scoped_identifier name: (identifier) @callee)) @call",
306            ),
307        ),
308        // Python: simple calls and attribute (method) calls.
309        "py" => (
310            tree_sitter_python::LANGUAGE.into(),
311            concat!(
312                "(call function: (identifier) @callee) @call\n",
313                "(call function: (attribute attribute: (identifier) @callee)) @call",
314            ),
315        ),
316        // JavaScript: function calls and member expression calls.
317        "js" | "jsx" => (
318            tree_sitter_javascript::LANGUAGE.into(),
319            concat!(
320                "(call_expression function: (identifier) @callee) @call\n",
321                "(call_expression function: (member_expression property: (property_identifier) @callee)) @call",
322            ),
323        ),
324        // TypeScript: same patterns as JavaScript.
325        "ts" => (
326            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
327            concat!(
328                "(call_expression function: (identifier) @callee) @call\n",
329                "(call_expression function: (member_expression property: (property_identifier) @callee)) @call",
330            ),
331        ),
332        // TSX: same patterns as JavaScript.
333        "tsx" => (
334            tree_sitter_typescript::LANGUAGE_TSX.into(),
335            concat!(
336                "(call_expression function: (identifier) @callee) @call\n",
337                "(call_expression function: (member_expression property: (property_identifier) @callee)) @call",
338            ),
339        ),
340        // Go: function calls and selector (method) calls.
341        "go" => (
342            tree_sitter_go::LANGUAGE.into(),
343            concat!(
344                "(call_expression function: (identifier) @callee) @call\n",
345                "(call_expression function: (selector_expression field: (field_identifier) @callee)) @call",
346            ),
347        ),
348        // Java: method invocations.
349        "java" => (
350            tree_sitter_java::LANGUAGE.into(),
351            "(method_invocation name: (identifier) @callee) @call",
352        ),
353        // C: function calls and field-expression calls (function pointers).
354        "c" | "h" => (
355            tree_sitter_c::LANGUAGE.into(),
356            concat!(
357                "(call_expression function: (identifier) @callee) @call\n",
358                "(call_expression function: (field_expression field: (field_identifier) @callee)) @call",
359            ),
360        ),
361        // C++: same patterns as C.
362        "cpp" | "cc" | "cxx" | "hpp" => (
363            tree_sitter_cpp::LANGUAGE.into(),
364            concat!(
365                "(call_expression function: (identifier) @callee) @call\n",
366                "(call_expression function: (field_expression field: (field_identifier) @callee)) @call",
367            ),
368        ),
369        // Bash: command invocations (.bats = Bash Automated Testing System).
370        "sh" | "bash" | "bats" => (
371            tree_sitter_bash::LANGUAGE.into(),
372            "(command name: (command_name (word) @callee)) @call",
373        ),
374        // Ruby: method calls.
375        "rb" => (
376            tree_sitter_ruby::LANGUAGE.into(),
377            "(call method: (identifier) @callee) @call",
378        ),
379        // HCL (Terraform): built-in function calls.
380        "tf" | "tfvars" | "hcl" => (
381            tree_sitter_hcl::LANGUAGE.into(),
382            "(function_call (identifier) @callee) @call",
383        ),
384        // Kotlin: call expressions — grammar uses unnamed children, so match
385        // identifier as first child of call_expression.
386        "kt" | "kts" => (
387            tree_sitter_kotlin_ng::LANGUAGE.into(),
388            "(call_expression (identifier) @callee) @call",
389        ),
390        // Swift: call expressions with simple identifiers.
391        "swift" => (
392            tree_sitter_swift::LANGUAGE.into(),
393            "(call_expression (simple_identifier) @callee) @call",
394        ),
395        // Scala: function calls and field-expression (method) calls.
396        "scala" => (
397            tree_sitter_scala::LANGUAGE.into(),
398            concat!(
399                "(call_expression function: (identifier) @callee) @call\n",
400                "(call_expression function: (field_expression field: (identifier) @callee)) @call",
401            ),
402        ),
403        _ => return None,
404    };
405    let query = match Query::new(&lang, query_str) {
406        Ok(q) => q,
407        Err(e) => {
408            tracing::warn!(ext, %e, "tree-sitter call query compilation failed");
409            return None;
410        }
411    };
412    Some(CallConfig {
413        language: lang,
414        query,
415    })
416}
417
#[cfg(test)]
mod tests {
    use super::*;

    /// Extensions that must have both a definition query and a call query.
    const CALLABLE_EXTS: &[&str] = &[
        "rs", "py", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc", "cxx",
        "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
        "scala",
    ];

    /// The Rust extension has a definition config.
    #[test]
    fn rust_extension_resolves() {
        let cfg = config_for_extension("rs");
        assert!(cfg.is_some());
    }

    /// The Python extension has a definition config.
    #[test]
    fn python_extension_resolves() {
        let cfg = config_for_extension("py");
        assert!(cfg.is_some());
    }

    /// An extension outside the registry yields no config.
    #[test]
    fn unknown_extension_returns_none() {
        let cfg = config_for_extension("xyz");
        assert!(cfg.is_none());
    }

    /// Every registered extension (the callable ones plus TOML) compiles.
    #[test]
    fn all_supported_extensions() {
        let every_ext = CALLABLE_EXTS.iter().copied().chain(std::iter::once("toml"));
        for ext in every_ext {
            assert!(config_for_extension(ext).is_some(), "failed for {ext}");
        }
    }

    /// Every extension with callable constructs has a call query.
    #[test]
    fn all_call_query_extensions() {
        for ext in CALLABLE_EXTS.iter().copied() {
            let cfg = call_query_for_extension(ext);
            assert!(cfg.is_some(), "call query failed for {ext}");
        }
    }

    /// TOML is registered for definitions but has no call query.
    #[test]
    fn toml_has_no_call_query() {
        let cfg = call_query_for_extension("toml");
        assert!(cfg.is_none());
    }
}
468}