Skip to main content

ripvec_core/
languages.rs

1//! Language registry mapping file extensions to tree-sitter grammars.
2//!
3//! Each supported language has a grammar and a tree-sitter query that
4//! extracts function, class, and method definitions. Compiled queries
5//! are cached so that repeated calls for the same extension are free.
6
7use std::sync::{Arc, OnceLock};
8
9use tree_sitter::{Language, Query};
10
11/// Configuration for extracting function calls from a language.
12///
13/// Wrapped in [`Arc`] so it can be shared across threads and returned
14/// from the cache without cloning the compiled [`Query`].
15pub struct CallConfig {
16    /// The tree-sitter Language grammar.
17    pub language: Language,
18    /// Query that extracts call sites (`@callee` captures).
19    pub query: Query,
20}
21
22/// Configuration for a supported source language.
23///
24/// Wrapped in [`Arc`] so it can be shared across threads and returned
25/// from the cache without cloning the compiled [`Query`].
26pub struct LangConfig {
27    /// The tree-sitter Language grammar.
28    pub language: Language,
29    /// Query that extracts semantic chunks (`@def` captures with `@name`).
30    pub query: Query,
31}
32
33/// Look up the language configuration for a file extension.
34///
35/// Compiled queries are cached per extension so repeated calls are free.
36/// Returns `None` for unsupported extensions.
37#[must_use]
38pub fn config_for_extension(ext: &str) -> Option<Arc<LangConfig>> {
39    // Cache of compiled configs, keyed by canonical extension.
40    static CACHE: OnceLock<std::collections::HashMap<&'static str, Arc<LangConfig>>> =
41        OnceLock::new();
42
43    let cache = CACHE.get_or_init(|| {
44        let mut m = std::collections::HashMap::new();
45        // Pre-compile all supported extensions
46        for &ext in &[
47            "rs", "py", "pyi", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc",
48            "cxx", "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
49            "scala", "toml", "json", "yaml", "yml", "md", "xml", "rdf", "owl",
50        ] {
51            if let Some(cfg) = compile_config(ext) {
52                m.insert(ext, Arc::new(cfg));
53            }
54        }
55        m
56    });
57
58    cache.get(ext).cloned()
59}
60
61/// Compile a [`LangConfig`] for the given extension (uncached).
62#[expect(
63    clippy::too_many_lines,
64    reason = "one match arm per language — flat by design"
65)]
66fn compile_config(ext: &str) -> Option<LangConfig> {
67    let (lang, query_str): (Language, &str) = match ext {
68        // Rust: standalone functions, structs, and methods INSIDE impl/trait blocks.
69        // impl_item and trait_item are NOT captured as wholes — we extract their
70        // individual function_item children for method-level granularity.
71        "rs" => (
72            tree_sitter_rust::LANGUAGE.into(),
73            concat!(
74                "(function_item name: (identifier) @name) @def\n",
75                "(struct_item name: (type_identifier) @name) @def\n",
76                "(enum_item name: (type_identifier) @name) @def\n",
77                "(type_item name: (type_identifier) @name) @def\n",
78                "(field_declaration name: (field_identifier) @name) @def\n",
79                "(enum_variant name: (identifier) @name) @def\n",
80                "(impl_item type: (type_identifier) @name) @def\n",
81                "(trait_item name: (type_identifier) @name) @def\n",
82                "(const_item name: (identifier) @name) @def\n",
83                "(static_item name: (identifier) @name) @def\n",
84                "(mod_item name: (identifier) @name) @def",
85            ),
86        ),
87        // Python: top-level functions AND methods inside classes (function_definition
88        // matches at any nesting depth, so methods are captured individually).
89        "py" | "pyi" => (
90            tree_sitter_python::LANGUAGE.into(),
91            concat!(
92                "(function_definition name: (identifier) @name) @def\n",
93                "(class_definition name: (identifier) @name body: (block) @def)\n",
94                "(assignment left: (identifier) @name) @def",
95            ),
96        ),
97        // JS: functions, methods, and arrow functions assigned to variables.
98        "js" | "jsx" => (
99            tree_sitter_javascript::LANGUAGE.into(),
100            concat!(
101                "(function_declaration name: (identifier) @name) @def\n",
102                "(method_definition name: (property_identifier) @name) @def\n",
103                "(class_declaration name: (identifier) @name) @def\n",
104                "(variable_declarator name: (identifier) @name) @def",
105            ),
106        ),
107        "ts" => (
108            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
109            concat!(
110                "(function_declaration name: (identifier) @name) @def\n",
111                "(method_definition name: (property_identifier) @name) @def\n",
112                "(class_declaration name: (type_identifier) @name) @def\n",
113                "(interface_declaration name: (type_identifier) @name) @def\n",
114                "(variable_declarator name: (identifier) @name) @def\n",
115                "(type_alias_declaration name: (type_identifier) @name) @def\n",
116                "(enum_declaration name: (identifier) @name) @def",
117            ),
118        ),
119        "tsx" => (
120            tree_sitter_typescript::LANGUAGE_TSX.into(),
121            concat!(
122                "(function_declaration name: (identifier) @name) @def\n",
123                "(method_definition name: (property_identifier) @name) @def\n",
124                "(class_declaration name: (type_identifier) @name) @def\n",
125                "(interface_declaration name: (type_identifier) @name) @def\n",
126                "(variable_declarator name: (identifier) @name) @def\n",
127                "(type_alias_declaration name: (type_identifier) @name) @def\n",
128                "(enum_declaration name: (identifier) @name) @def",
129            ),
130        ),
131        "go" => (
132            tree_sitter_go::LANGUAGE.into(),
133            concat!(
134                "(function_declaration name: (identifier) @name) @def\n",
135                "(method_declaration name: (field_identifier) @name) @def\n",
136                "(type_declaration (type_spec name: (type_identifier) @name)) @def\n",
137                "(const_spec name: (identifier) @name) @def",
138            ),
139        ),
140        // Java: methods are already captured individually (method_declaration
141        // matches inside class bodies). Keep class for the signature/fields.
142        "java" => (
143            tree_sitter_java::LANGUAGE.into(),
144            concat!(
145                "(method_declaration name: (identifier) @name) @def\n",
146                "(class_declaration name: (identifier) @name) @def\n",
147                "(interface_declaration name: (identifier) @name) @def\n",
148                "(field_declaration declarator: (variable_declarator name: (identifier) @name)) @def\n",
149                "(enum_constant name: (identifier) @name) @def\n",
150                "(enum_declaration name: (identifier) @name) @def\n",
151                "(constructor_declaration name: (identifier) @name) @def",
152            ),
153        ),
154        "c" | "h" => (
155            tree_sitter_c::LANGUAGE.into(),
156            concat!(
157                "(function_definition declarator: (function_declarator declarator: (identifier) @name)) @def\n",
158                "(declaration declarator: (init_declarator declarator: (identifier) @name)) @def\n",
159                "(struct_specifier name: (type_identifier) @name) @def\n",
160                "(enum_specifier name: (type_identifier) @name) @def\n",
161                "(type_definition declarator: (type_identifier) @name) @def",
162            ),
163        ),
164        // C++: functions at any level, plus class signatures.
165        "cpp" | "cc" | "cxx" | "hpp" => (
166            tree_sitter_cpp::LANGUAGE.into(),
167            concat!(
168                "(function_definition declarator: (function_declarator declarator: (identifier) @name)) @def\n",
169                "(class_specifier name: (type_identifier) @name) @def\n",
170                "(declaration declarator: (init_declarator declarator: (identifier) @name)) @def\n",
171                "(struct_specifier name: (type_identifier) @name) @def\n",
172                "(enum_specifier name: (type_identifier) @name) @def\n",
173                "(type_definition declarator: (type_identifier) @name) @def\n",
174                "(namespace_definition name: (namespace_identifier) @name) @def\n",
175                "(field_declaration declarator: (field_identifier) @name) @def",
176            ),
177        ),
178        // Bash: function definitions (.bats = Bash Automated Testing System).
179        "sh" | "bash" | "bats" => (
180            tree_sitter_bash::LANGUAGE.into(),
181            concat!(
182                "(function_definition name: (word) @name) @def\n",
183                "(variable_assignment name: (variable_name) @name) @def",
184            ),
185        ),
186        // Ruby: methods, classes, and modules.
187        "rb" => (
188            tree_sitter_ruby::LANGUAGE.into(),
189            concat!(
190                "(method name: (identifier) @name) @def\n",
191                "(class name: (constant) @name) @def\n",
192                "(module name: (constant) @name) @def\n",
193                "(assignment left: (identifier) @name) @def\n",
194                "(assignment left: (constant) @name) @def",
195            ),
196        ),
197        // HCL (Terraform): resource, data, variable, and output blocks.
198        "tf" | "tfvars" | "hcl" => (
199            tree_sitter_hcl::LANGUAGE.into(),
200            "(block (identifier) @name) @def",
201        ),
202        // Kotlin: functions, classes, and objects.
203        "kt" | "kts" => (
204            tree_sitter_kotlin_ng::LANGUAGE.into(),
205            concat!(
206                "(function_declaration name: (identifier) @name) @def\n",
207                "(class_declaration name: (identifier) @name) @def\n",
208                "(object_declaration name: (identifier) @name) @def\n",
209                "(property_declaration (identifier) @name) @def\n",
210                "(enum_entry (identifier) @name) @def",
211            ),
212        ),
213        // Swift: functions, classes, structs, enums, and protocols.
214        "swift" => (
215            tree_sitter_swift::LANGUAGE.into(),
216            concat!(
217                "(function_declaration name: (simple_identifier) @name) @def\n",
218                "(class_declaration name: (type_identifier) @name) @def\n",
219                "(protocol_declaration name: (type_identifier) @name) @def\n",
220                "(property_declaration name: (pattern bound_identifier: (simple_identifier) @name)) @def\n",
221                "(typealias_declaration name: (type_identifier) @name) @def",
222            ),
223        ),
224        // Scala: functions, classes, traits, and objects.
225        "scala" => (
226            tree_sitter_scala::LANGUAGE.into(),
227            concat!(
228                "(function_definition name: (identifier) @name) @def\n",
229                "(class_definition name: (identifier) @name) @def\n",
230                "(trait_definition name: (identifier) @name) @def\n",
231                "(object_definition name: (identifier) @name) @def\n",
232                "(val_definition pattern: (identifier) @name) @def\n",
233                "(var_definition pattern: (identifier) @name) @def\n",
234                "(type_definition name: (type_identifier) @name) @def",
235            ),
236        ),
237        // TOML: table headers (sections).
238        "toml" => (
239            tree_sitter_toml_ng::LANGUAGE.into(),
240            concat!(
241                "(table (bare_key) @name) @def\n",
242                "(pair (bare_key) @name) @def",
243            ),
244        ),
245        // JSON: key-value pairs, capturing the key string content.
246        "json" => (
247            tree_sitter_json::LANGUAGE.into(),
248            "(pair key: (string (string_content) @name)) @def",
249        ),
250        // YAML: block mapping pairs with plain scalar keys.
251        "yaml" | "yml" => (
252            tree_sitter_yaml::LANGUAGE.into(),
253            "(block_mapping_pair key: (flow_node (plain_scalar (string_scalar) @name))) @def",
254        ),
255        // Markdown: ATX headings (# through ######), capturing the heading text.
256        "md" => (
257            tree_sitter_md::LANGUAGE.into(),
258            "(atx_heading heading_content: (inline) @name) @def",
259        ),
260        // RDF/XML and OWL/XML are XML documents; capture each element so
261        // ontology classes/properties become searchable semantic chunks.
262        "xml" | "rdf" | "owl" => (
263            tree_sitter_xml::LANGUAGE_XML.into(),
264            concat!(
265                "(element (STag (Name) @name)) @def\n",
266                "(element (EmptyElemTag (Name) @name)) @def",
267            ),
268        ),
269        _ => return None,
270    };
271    let query = match Query::new(&lang, query_str) {
272        Ok(q) => q,
273        Err(e) => {
274            tracing::warn!(ext, %e, "tree-sitter query compilation failed — language may be ABI-incompatible");
275            return None;
276        }
277    };
278    Some(LangConfig {
279        language: lang,
280        query,
281    })
282}
283
284/// Look up the call-extraction query for a file extension.
285///
286/// Compiled queries are cached per extension so repeated calls are free.
287/// Returns `None` for unsupported extensions (including TOML, which has
288/// no function calls).
289#[must_use]
290pub fn call_query_for_extension(ext: &str) -> Option<Arc<CallConfig>> {
291    static CACHE: OnceLock<std::collections::HashMap<&'static str, Arc<CallConfig>>> =
292        OnceLock::new();
293
294    let cache = CACHE.get_or_init(|| {
295        let mut m = std::collections::HashMap::new();
296        // Pre-compile for all extensions that have callable constructs.
297        // TOML is deliberately excluded — it has no function calls.
298        for &ext in &[
299            "rs", "py", "pyi", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc",
300            "cxx", "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
301            "scala",
302        ] {
303            if let Some(cfg) = compile_call_config(ext) {
304                m.insert(ext, Arc::new(cfg));
305            }
306        }
307        m
308    });
309
310    cache.get(ext).cloned()
311}
312
313/// Compile a [`CallConfig`] for the given extension (uncached).
314///
315/// Each query extracts the callee identifier (`@callee`) from function
316/// and method calls, plus the whole call expression (`@call`).
317#[expect(
318    clippy::too_many_lines,
319    reason = "one match arm per language — flat by design"
320)]
321fn compile_call_config(ext: &str) -> Option<CallConfig> {
322    let (lang, query_str): (Language, &str) = match ext {
323        // Rust: free calls, method calls, and scoped (path) calls.
324        "rs" => (
325            tree_sitter_rust::LANGUAGE.into(),
326            concat!(
327                "(call_expression function: (identifier) @callee) @call\n",
328                "(call_expression function: (field_expression field: (field_identifier) @callee)) @call\n",
329                "(call_expression function: (scoped_identifier name: (identifier) @callee)) @call",
330            ),
331        ),
332        // Python: simple calls and attribute (method) calls.
333        "py" | "pyi" => (
334            tree_sitter_python::LANGUAGE.into(),
335            concat!(
336                "(call function: (identifier) @callee) @call\n",
337                "(call function: (attribute attribute: (identifier) @callee)) @call",
338            ),
339        ),
340        // JavaScript: function calls and member expression calls.
341        "js" | "jsx" => (
342            tree_sitter_javascript::LANGUAGE.into(),
343            concat!(
344                "(call_expression function: (identifier) @callee) @call\n",
345                "(call_expression function: (member_expression property: (property_identifier) @callee)) @call",
346            ),
347        ),
348        // TypeScript: same patterns as JavaScript.
349        "ts" => (
350            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
351            concat!(
352                "(call_expression function: (identifier) @callee) @call\n",
353                "(call_expression function: (member_expression property: (property_identifier) @callee)) @call",
354            ),
355        ),
356        // TSX: same patterns as JavaScript.
357        "tsx" => (
358            tree_sitter_typescript::LANGUAGE_TSX.into(),
359            concat!(
360                "(call_expression function: (identifier) @callee) @call\n",
361                "(call_expression function: (member_expression property: (property_identifier) @callee)) @call",
362            ),
363        ),
364        // Go: function calls and selector (method) calls.
365        "go" => (
366            tree_sitter_go::LANGUAGE.into(),
367            concat!(
368                "(call_expression function: (identifier) @callee) @call\n",
369                "(call_expression function: (selector_expression field: (field_identifier) @callee)) @call",
370            ),
371        ),
372        // Java: method invocations.
373        "java" => (
374            tree_sitter_java::LANGUAGE.into(),
375            "(method_invocation name: (identifier) @callee) @call",
376        ),
377        // C: function calls and field-expression calls (function pointers).
378        "c" | "h" => (
379            tree_sitter_c::LANGUAGE.into(),
380            concat!(
381                "(call_expression function: (identifier) @callee) @call\n",
382                "(call_expression function: (field_expression field: (field_identifier) @callee)) @call",
383            ),
384        ),
385        // C++: same patterns as C.
386        "cpp" | "cc" | "cxx" | "hpp" => (
387            tree_sitter_cpp::LANGUAGE.into(),
388            concat!(
389                "(call_expression function: (identifier) @callee) @call\n",
390                "(call_expression function: (field_expression field: (field_identifier) @callee)) @call",
391            ),
392        ),
393        // Bash: command invocations (.bats = Bash Automated Testing System).
394        "sh" | "bash" | "bats" => (
395            tree_sitter_bash::LANGUAGE.into(),
396            "(command name: (command_name (word) @callee)) @call",
397        ),
398        // Ruby: method calls.
399        "rb" => (
400            tree_sitter_ruby::LANGUAGE.into(),
401            "(call method: (identifier) @callee) @call",
402        ),
403        // HCL (Terraform): built-in function calls.
404        "tf" | "tfvars" | "hcl" => (
405            tree_sitter_hcl::LANGUAGE.into(),
406            "(function_call (identifier) @callee) @call",
407        ),
408        // Kotlin: call expressions — grammar uses unnamed children, so match
409        // identifier as first child of call_expression.
410        "kt" | "kts" => (
411            tree_sitter_kotlin_ng::LANGUAGE.into(),
412            "(call_expression (identifier) @callee) @call",
413        ),
414        // Swift: call expressions with simple identifiers.
415        "swift" => (
416            tree_sitter_swift::LANGUAGE.into(),
417            "(call_expression (simple_identifier) @callee) @call",
418        ),
419        // Scala: function calls and field-expression (method) calls.
420        "scala" => (
421            tree_sitter_scala::LANGUAGE.into(),
422            concat!(
423                "(call_expression function: (identifier) @callee) @call\n",
424                "(call_expression function: (field_expression field: (identifier) @callee)) @call",
425            ),
426        ),
427        _ => return None,
428    };
429    let query = match Query::new(&lang, query_str) {
430        Ok(q) => q,
431        Err(e) => {
432            tracing::warn!(ext, %e, "tree-sitter call query compilation failed");
433            return None;
434        }
435    };
436    Some(CallConfig {
437        language: lang,
438        query,
439    })
440}
441
#[cfg(test)]
mod tests {
    use super::*;

    // Extensions that must resolve to a definition-query config.
    const DEFINITION_EXTS: &[&str] = &[
        "rs", "py", "pyi", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc", "cxx",
        "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift", "scala",
        "toml", "json", "yaml", "yml", "md", "xml", "rdf", "owl",
    ];

    // Extensions that must resolve to a call-query config.
    const CALL_EXTS: &[&str] = &[
        "rs", "py", "pyi", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc", "cxx",
        "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift", "scala",
    ];

    // Turtle-family RDF serialisations, handled outside tree-sitter.
    const TURTLE_FAMILY: [&str; 5] = ["ttl", "nt", "n3", "trig", "nq"];

    #[test]
    fn rust_extension_resolves() {
        let found = config_for_extension("rs");
        assert!(found.is_some());
    }

    #[test]
    fn python_extension_resolves() {
        let found = config_for_extension("py");
        assert!(found.is_some());
    }

    #[test]
    fn python_stub_extension_resolves() {
        let found = config_for_extension("pyi");
        assert!(found.is_some());
    }

    #[test]
    fn unknown_extension_returns_none() {
        let found = config_for_extension("xyz");
        assert!(found.is_none());
    }

    #[test]
    fn all_supported_extensions() {
        for &ext in DEFINITION_EXTS {
            let found = config_for_extension(ext);
            assert!(found.is_some(), "failed for {ext}");
        }
    }

    #[test]
    fn turtle_family_uses_rdf_text_chunking_not_tree_sitter() {
        for ext in TURTLE_FAMILY {
            let found = config_for_extension(ext);
            assert!(
                found.is_none(),
                "{ext} should be handled by RDF text chunking"
            );
            assert!(crate::chunk::is_rdf_text_extension(ext));
        }
    }

    #[test]
    fn all_call_query_extensions() {
        for &ext in CALL_EXTS {
            let found = call_query_for_extension(ext);
            assert!(found.is_some(), "call query failed for {ext}");
        }
    }

    #[test]
    fn toml_has_no_call_query() {
        let found = call_query_for_extension("toml");
        assert!(found.is_none());
    }
}