Skip to main content

ripvec_core/
languages.rs

1//! Language registry mapping file extensions to tree-sitter grammars.
2//!
3//! Each supported language has a grammar and a tree-sitter query that
4//! extracts function, class, and method definitions. Compiled queries
5//! are cached so that repeated calls for the same extension are free.
6
7use std::sync::{Arc, OnceLock};
8
9use tree_sitter::{Language, Query};
10
11/// Configuration for extracting function calls from a language.
12///
13/// Wrapped in [`Arc`] so it can be shared across threads and returned
14/// from the cache without cloning the compiled [`Query`].
15pub struct CallConfig {
16    /// The tree-sitter Language grammar.
17    pub language: Language,
18    /// Query that extracts call sites (`@callee` captures).
19    pub query: Query,
20}
21
22/// Configuration for a supported source language.
23///
24/// Wrapped in [`Arc`] so it can be shared across threads and returned
25/// from the cache without cloning the compiled [`Query`].
26pub struct LangConfig {
27    /// The tree-sitter Language grammar.
28    pub language: Language,
29    /// Query that extracts semantic chunks (`@def` captures with `@name`).
30    pub query: Query,
31}
32
33/// Look up the language configuration for a file extension.
34///
35/// Compiled queries are cached per extension so repeated calls are free.
36/// Returns `None` for unsupported extensions.
37#[must_use]
38pub fn config_for_extension(ext: &str) -> Option<Arc<LangConfig>> {
39    // Cache of compiled configs, keyed by canonical extension.
40    static CACHE: OnceLock<std::collections::HashMap<&'static str, Arc<LangConfig>>> =
41        OnceLock::new();
42
43    let cache = CACHE.get_or_init(|| {
44        let mut m = std::collections::HashMap::new();
45        // Pre-compile all supported extensions
46        for &ext in &[
47            "rs", "py", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc", "cxx",
48            "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
49            "scala", "toml", "json", "yaml", "yml", "md",
50        ] {
51            if let Some(cfg) = compile_config(ext) {
52                m.insert(ext, Arc::new(cfg));
53            }
54        }
55        m
56    });
57
58    cache.get(ext).cloned()
59}
60
61/// Compile a [`LangConfig`] for the given extension (uncached).
62#[expect(
63    clippy::too_many_lines,
64    reason = "one match arm per language — flat by design"
65)]
66fn compile_config(ext: &str) -> Option<LangConfig> {
67    let (lang, query_str): (Language, &str) = match ext {
68        // Rust: standalone functions, structs, and methods INSIDE impl/trait blocks.
69        // impl_item and trait_item are NOT captured as wholes — we extract their
70        // individual function_item children for method-level granularity.
71        "rs" => (
72            tree_sitter_rust::LANGUAGE.into(),
73            concat!(
74                "(function_item name: (identifier) @name) @def\n",
75                "(struct_item name: (type_identifier) @name) @def\n",
76                "(enum_item name: (type_identifier) @name) @def\n",
77                "(type_item name: (type_identifier) @name) @def\n",
78                "(field_declaration name: (field_identifier) @name) @def\n",
79                "(enum_variant name: (identifier) @name) @def\n",
80                "(impl_item type: (type_identifier) @name) @def\n",
81                "(trait_item name: (type_identifier) @name) @def\n",
82                "(const_item name: (identifier) @name) @def\n",
83                "(static_item name: (identifier) @name) @def\n",
84                "(mod_item name: (identifier) @name) @def",
85            ),
86        ),
87        // Python: top-level functions AND methods inside classes (function_definition
88        // matches at any nesting depth, so methods are captured individually).
89        "py" => (
90            tree_sitter_python::LANGUAGE.into(),
91            concat!(
92                "(function_definition name: (identifier) @name) @def\n",
93                "(class_definition name: (identifier) @name body: (block) @def)\n",
94                "(assignment left: (identifier) @name) @def",
95            ),
96        ),
97        // JS: functions, methods, and arrow functions assigned to variables.
98        "js" | "jsx" => (
99            tree_sitter_javascript::LANGUAGE.into(),
100            concat!(
101                "(function_declaration name: (identifier) @name) @def\n",
102                "(method_definition name: (property_identifier) @name) @def\n",
103                "(class_declaration name: (identifier) @name) @def\n",
104                "(variable_declarator name: (identifier) @name) @def",
105            ),
106        ),
107        "ts" => (
108            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
109            concat!(
110                "(function_declaration name: (identifier) @name) @def\n",
111                "(method_definition name: (property_identifier) @name) @def\n",
112                "(class_declaration name: (type_identifier) @name) @def\n",
113                "(interface_declaration name: (type_identifier) @name) @def\n",
114                "(variable_declarator name: (identifier) @name) @def\n",
115                "(type_alias_declaration name: (type_identifier) @name) @def\n",
116                "(enum_declaration name: (identifier) @name) @def",
117            ),
118        ),
119        "tsx" => (
120            tree_sitter_typescript::LANGUAGE_TSX.into(),
121            concat!(
122                "(function_declaration name: (identifier) @name) @def\n",
123                "(method_definition name: (property_identifier) @name) @def\n",
124                "(class_declaration name: (type_identifier) @name) @def\n",
125                "(interface_declaration name: (type_identifier) @name) @def\n",
126                "(variable_declarator name: (identifier) @name) @def\n",
127                "(type_alias_declaration name: (type_identifier) @name) @def\n",
128                "(enum_declaration name: (identifier) @name) @def",
129            ),
130        ),
131        "go" => (
132            tree_sitter_go::LANGUAGE.into(),
133            concat!(
134                "(function_declaration name: (identifier) @name) @def\n",
135                "(method_declaration name: (field_identifier) @name) @def\n",
136                "(type_declaration (type_spec name: (type_identifier) @name)) @def\n",
137                "(const_spec name: (identifier) @name) @def",
138            ),
139        ),
140        // Java: methods are already captured individually (method_declaration
141        // matches inside class bodies). Keep class for the signature/fields.
142        "java" => (
143            tree_sitter_java::LANGUAGE.into(),
144            concat!(
145                "(method_declaration name: (identifier) @name) @def\n",
146                "(class_declaration name: (identifier) @name) @def\n",
147                "(interface_declaration name: (identifier) @name) @def\n",
148                "(field_declaration declarator: (variable_declarator name: (identifier) @name)) @def\n",
149                "(enum_constant name: (identifier) @name) @def\n",
150                "(enum_declaration name: (identifier) @name) @def\n",
151                "(constructor_declaration name: (identifier) @name) @def",
152            ),
153        ),
154        "c" | "h" => (
155            tree_sitter_c::LANGUAGE.into(),
156            concat!(
157                "(function_definition declarator: (function_declarator declarator: (identifier) @name)) @def\n",
158                "(declaration declarator: (init_declarator declarator: (identifier) @name)) @def\n",
159                "(struct_specifier name: (type_identifier) @name) @def\n",
160                "(enum_specifier name: (type_identifier) @name) @def\n",
161                "(type_definition declarator: (type_identifier) @name) @def",
162            ),
163        ),
164        // C++: functions at any level, plus class signatures.
165        "cpp" | "cc" | "cxx" | "hpp" => (
166            tree_sitter_cpp::LANGUAGE.into(),
167            concat!(
168                "(function_definition declarator: (function_declarator declarator: (identifier) @name)) @def\n",
169                "(class_specifier name: (type_identifier) @name) @def\n",
170                "(declaration declarator: (init_declarator declarator: (identifier) @name)) @def\n",
171                "(struct_specifier name: (type_identifier) @name) @def\n",
172                "(enum_specifier name: (type_identifier) @name) @def\n",
173                "(type_definition declarator: (type_identifier) @name) @def\n",
174                "(namespace_definition name: (namespace_identifier) @name) @def\n",
175                "(field_declaration declarator: (field_identifier) @name) @def",
176            ),
177        ),
178        // Bash: function definitions (.bats = Bash Automated Testing System).
179        "sh" | "bash" | "bats" => (
180            tree_sitter_bash::LANGUAGE.into(),
181            concat!(
182                "(function_definition name: (word) @name) @def\n",
183                "(variable_assignment name: (variable_name) @name) @def",
184            ),
185        ),
186        // Ruby: methods, classes, and modules.
187        "rb" => (
188            tree_sitter_ruby::LANGUAGE.into(),
189            concat!(
190                "(method name: (identifier) @name) @def\n",
191                "(class name: (constant) @name) @def\n",
192                "(module name: (constant) @name) @def\n",
193                "(assignment left: (identifier) @name) @def\n",
194                "(assignment left: (constant) @name) @def",
195            ),
196        ),
197        // HCL (Terraform): resource, data, variable, and output blocks.
198        "tf" | "tfvars" | "hcl" => (
199            tree_sitter_hcl::LANGUAGE.into(),
200            "(block (identifier) @name) @def",
201        ),
202        // Kotlin: functions, classes, and objects.
203        "kt" | "kts" => (
204            tree_sitter_kotlin_ng::LANGUAGE.into(),
205            concat!(
206                "(function_declaration name: (identifier) @name) @def\n",
207                "(class_declaration name: (identifier) @name) @def\n",
208                "(object_declaration name: (identifier) @name) @def\n",
209                "(property_declaration (identifier) @name) @def\n",
210                "(enum_entry (identifier) @name) @def",
211            ),
212        ),
213        // Swift: functions, classes, structs, enums, and protocols.
214        "swift" => (
215            tree_sitter_swift::LANGUAGE.into(),
216            concat!(
217                "(function_declaration name: (simple_identifier) @name) @def\n",
218                "(class_declaration name: (type_identifier) @name) @def\n",
219                "(protocol_declaration name: (type_identifier) @name) @def\n",
220                "(property_declaration name: (pattern bound_identifier: (simple_identifier) @name)) @def\n",
221                "(typealias_declaration name: (type_identifier) @name) @def",
222            ),
223        ),
224        // Scala: functions, classes, traits, and objects.
225        "scala" => (
226            tree_sitter_scala::LANGUAGE.into(),
227            concat!(
228                "(function_definition name: (identifier) @name) @def\n",
229                "(class_definition name: (identifier) @name) @def\n",
230                "(trait_definition name: (identifier) @name) @def\n",
231                "(object_definition name: (identifier) @name) @def\n",
232                "(val_definition pattern: (identifier) @name) @def\n",
233                "(var_definition pattern: (identifier) @name) @def\n",
234                "(type_definition name: (type_identifier) @name) @def",
235            ),
236        ),
237        // TOML: table headers (sections).
238        "toml" => (
239            tree_sitter_toml_ng::LANGUAGE.into(),
240            concat!(
241                "(table (bare_key) @name) @def\n",
242                "(pair (bare_key) @name) @def",
243            ),
244        ),
245        // JSON: key-value pairs, capturing the key string content.
246        "json" => (
247            tree_sitter_json::LANGUAGE.into(),
248            "(pair key: (string (string_content) @name)) @def",
249        ),
250        // YAML: block mapping pairs with plain scalar keys.
251        "yaml" | "yml" => (
252            tree_sitter_yaml::LANGUAGE.into(),
253            "(block_mapping_pair key: (flow_node (plain_scalar (string_scalar) @name))) @def",
254        ),
255        // Markdown: ATX headings (# through ######), capturing the heading text.
256        "md" => (
257            tree_sitter_md::LANGUAGE.into(),
258            "(atx_heading heading_content: (inline) @name) @def",
259        ),
260        _ => return None,
261    };
262    let query = match Query::new(&lang, query_str) {
263        Ok(q) => q,
264        Err(e) => {
265            tracing::warn!(ext, %e, "tree-sitter query compilation failed — language may be ABI-incompatible");
266            return None;
267        }
268    };
269    Some(LangConfig {
270        language: lang,
271        query,
272    })
273}
274
275/// Look up the call-extraction query for a file extension.
276///
277/// Compiled queries are cached per extension so repeated calls are free.
278/// Returns `None` for unsupported extensions (including TOML, which has
279/// no function calls).
280#[must_use]
281pub fn call_query_for_extension(ext: &str) -> Option<Arc<CallConfig>> {
282    static CACHE: OnceLock<std::collections::HashMap<&'static str, Arc<CallConfig>>> =
283        OnceLock::new();
284
285    let cache = CACHE.get_or_init(|| {
286        let mut m = std::collections::HashMap::new();
287        // Pre-compile for all extensions that have callable constructs.
288        // TOML is deliberately excluded — it has no function calls.
289        for &ext in &[
290            "rs", "py", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc", "cxx",
291            "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
292            "scala",
293        ] {
294            if let Some(cfg) = compile_call_config(ext) {
295                m.insert(ext, Arc::new(cfg));
296            }
297        }
298        m
299    });
300
301    cache.get(ext).cloned()
302}
303
304/// Compile a [`CallConfig`] for the given extension (uncached).
305///
306/// Each query extracts the callee identifier (`@callee`) from function
307/// and method calls, plus the whole call expression (`@call`).
308#[expect(
309    clippy::too_many_lines,
310    reason = "one match arm per language — flat by design"
311)]
312fn compile_call_config(ext: &str) -> Option<CallConfig> {
313    let (lang, query_str): (Language, &str) = match ext {
314        // Rust: free calls, method calls, and scoped (path) calls.
315        "rs" => (
316            tree_sitter_rust::LANGUAGE.into(),
317            concat!(
318                "(call_expression function: (identifier) @callee) @call\n",
319                "(call_expression function: (field_expression field: (field_identifier) @callee)) @call\n",
320                "(call_expression function: (scoped_identifier name: (identifier) @callee)) @call",
321            ),
322        ),
323        // Python: simple calls and attribute (method) calls.
324        "py" => (
325            tree_sitter_python::LANGUAGE.into(),
326            concat!(
327                "(call function: (identifier) @callee) @call\n",
328                "(call function: (attribute attribute: (identifier) @callee)) @call",
329            ),
330        ),
331        // JavaScript: function calls and member expression calls.
332        "js" | "jsx" => (
333            tree_sitter_javascript::LANGUAGE.into(),
334            concat!(
335                "(call_expression function: (identifier) @callee) @call\n",
336                "(call_expression function: (member_expression property: (property_identifier) @callee)) @call",
337            ),
338        ),
339        // TypeScript: same patterns as JavaScript.
340        "ts" => (
341            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
342            concat!(
343                "(call_expression function: (identifier) @callee) @call\n",
344                "(call_expression function: (member_expression property: (property_identifier) @callee)) @call",
345            ),
346        ),
347        // TSX: same patterns as JavaScript.
348        "tsx" => (
349            tree_sitter_typescript::LANGUAGE_TSX.into(),
350            concat!(
351                "(call_expression function: (identifier) @callee) @call\n",
352                "(call_expression function: (member_expression property: (property_identifier) @callee)) @call",
353            ),
354        ),
355        // Go: function calls and selector (method) calls.
356        "go" => (
357            tree_sitter_go::LANGUAGE.into(),
358            concat!(
359                "(call_expression function: (identifier) @callee) @call\n",
360                "(call_expression function: (selector_expression field: (field_identifier) @callee)) @call",
361            ),
362        ),
363        // Java: method invocations.
364        "java" => (
365            tree_sitter_java::LANGUAGE.into(),
366            "(method_invocation name: (identifier) @callee) @call",
367        ),
368        // C: function calls and field-expression calls (function pointers).
369        "c" | "h" => (
370            tree_sitter_c::LANGUAGE.into(),
371            concat!(
372                "(call_expression function: (identifier) @callee) @call\n",
373                "(call_expression function: (field_expression field: (field_identifier) @callee)) @call",
374            ),
375        ),
376        // C++: same patterns as C.
377        "cpp" | "cc" | "cxx" | "hpp" => (
378            tree_sitter_cpp::LANGUAGE.into(),
379            concat!(
380                "(call_expression function: (identifier) @callee) @call\n",
381                "(call_expression function: (field_expression field: (field_identifier) @callee)) @call",
382            ),
383        ),
384        // Bash: command invocations (.bats = Bash Automated Testing System).
385        "sh" | "bash" | "bats" => (
386            tree_sitter_bash::LANGUAGE.into(),
387            "(command name: (command_name (word) @callee)) @call",
388        ),
389        // Ruby: method calls.
390        "rb" => (
391            tree_sitter_ruby::LANGUAGE.into(),
392            "(call method: (identifier) @callee) @call",
393        ),
394        // HCL (Terraform): built-in function calls.
395        "tf" | "tfvars" | "hcl" => (
396            tree_sitter_hcl::LANGUAGE.into(),
397            "(function_call (identifier) @callee) @call",
398        ),
399        // Kotlin: call expressions — grammar uses unnamed children, so match
400        // identifier as first child of call_expression.
401        "kt" | "kts" => (
402            tree_sitter_kotlin_ng::LANGUAGE.into(),
403            "(call_expression (identifier) @callee) @call",
404        ),
405        // Swift: call expressions with simple identifiers.
406        "swift" => (
407            tree_sitter_swift::LANGUAGE.into(),
408            "(call_expression (simple_identifier) @callee) @call",
409        ),
410        // Scala: function calls and field-expression (method) calls.
411        "scala" => (
412            tree_sitter_scala::LANGUAGE.into(),
413            concat!(
414                "(call_expression function: (identifier) @callee) @call\n",
415                "(call_expression function: (field_expression field: (identifier) @callee)) @call",
416            ),
417        ),
418        _ => return None,
419    };
420    let query = match Query::new(&lang, query_str) {
421        Ok(q) => q,
422        Err(e) => {
423            tracing::warn!(ext, %e, "tree-sitter call query compilation failed");
424            return None;
425        }
426    };
427    Some(CallConfig {
428        language: lang,
429        query,
430    })
431}
432
#[cfg(test)]
mod tests {
    use super::*;

    /// Every extension expected to resolve to a definition query.
    const DEFINITION_EXTS: &[&str] = &[
        "rs", "py", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc", "cxx",
        "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
        "scala", "toml", "json", "yaml", "yml", "md",
    ];

    /// Every extension expected to resolve to a call query.
    const CALL_EXTS: &[&str] = &[
        "rs", "py", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc", "cxx",
        "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
        "scala",
    ];

    /// The Rust grammar and its definition query load.
    #[test]
    fn rust_extension_resolves() {
        let found = config_for_extension("rs");
        assert!(found.is_some());
    }

    /// The Python grammar and its definition query load.
    #[test]
    fn python_extension_resolves() {
        let found = config_for_extension("py");
        assert!(found.is_some());
    }

    /// An extension that is not in the table yields `None`.
    #[test]
    fn unknown_extension_returns_none() {
        let found = config_for_extension("xyz");
        assert!(found.is_none());
    }

    /// Every definition query in the table compiles against its grammar.
    #[test]
    fn all_supported_extensions() {
        for ext in DEFINITION_EXTS.iter().copied() {
            assert!(config_for_extension(ext).is_some(), "failed for {ext}");
        }
    }

    /// Every call query in the table compiles against its grammar.
    #[test]
    fn all_call_query_extensions() {
        for ext in CALL_EXTS.iter().copied() {
            assert!(
                call_query_for_extension(ext).is_some(),
                "call query failed for {ext}"
            );
        }
    }

    /// TOML has a definition query but deliberately no call query.
    #[test]
    fn toml_has_no_call_query() {
        let found = call_query_for_extension("toml");
        assert!(found.is_none());
    }
}