Skip to main content

ripvec_core/
languages.rs

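//! Language registry mapping file extensions to tree-sitter grammars.
//!
//! Each supported language has a grammar and a tree-sitter query that
//! extracts its definitions (functions, classes, methods and, for data and
//! markup formats, keys, headings, or elements). Most programming languages
//! also have a second query that extracts call sites. Compiled queries are
//! cached so that repeated calls for the same extension are free.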
6
7use std::sync::{Arc, OnceLock};
8
9use tree_sitter::{Language, Query};
10
11/// Configuration for extracting function calls from a language.
12///
13/// Wrapped in [`Arc`] so it can be shared across threads and returned
14/// from the cache without cloning the compiled [`Query`].
15pub struct CallConfig {
16    /// The tree-sitter Language grammar.
17    pub language: Language,
18    /// Query that extracts call sites (`@callee` captures).
19    pub query: Query,
20}
21
22/// Configuration for a supported source language.
23///
24/// Wrapped in [`Arc`] so it can be shared across threads and returned
25/// from the cache without cloning the compiled [`Query`].
26pub struct LangConfig {
27    /// The tree-sitter Language grammar.
28    pub language: Language,
29    /// Query that extracts semantic chunks (`@def` captures with `@name`).
30    pub query: Query,
31}
32
33/// Look up the language configuration for a file extension.
34///
35/// Compiled queries are cached per extension so repeated calls are free.
36/// Returns `None` for unsupported extensions.
37#[must_use]
38pub fn config_for_extension(ext: &str) -> Option<Arc<LangConfig>> {
39    // Cache of compiled configs, keyed by canonical extension.
40    static CACHE: OnceLock<std::collections::HashMap<&'static str, Arc<LangConfig>>> =
41        OnceLock::new();
42
43    let cache = CACHE.get_or_init(|| {
44        let mut m = std::collections::HashMap::new();
45        // Pre-compile all supported extensions
46        for &ext in &[
47            "rs", "py", "pyi", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc",
48            "cxx", "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
49            "scala", "toml", "json", "yaml", "yml", "md", "xml", "rdf", "owl",
50        ] {
51            if let Some(cfg) = compile_config(ext) {
52                m.insert(ext, Arc::new(cfg));
53            }
54        }
55        m
56    });
57
58    cache.get(ext).cloned()
59}
60
61/// Compile a [`LangConfig`] for the given extension (uncached).
62#[expect(
63    clippy::too_many_lines,
64    reason = "one match arm per language — flat by design"
65)]
66fn compile_config(ext: &str) -> Option<LangConfig> {
67    let (lang, query_str): (Language, &str) = match ext {
68        // Rust: standalone functions, structs, and methods INSIDE impl/trait blocks.
69        // impl_item and trait_item are NOT captured as wholes — we extract their
70        // individual function_item children for method-level granularity.
71        "rs" => (
72            tree_sitter_rust::LANGUAGE.into(),
73            concat!(
74                "(function_item name: (identifier) @name) @def\n",
75                "(struct_item name: (type_identifier) @name) @def\n",
76                "(enum_item name: (type_identifier) @name) @def\n",
77                "(type_item name: (type_identifier) @name) @def\n",
78                "(field_declaration name: (field_identifier) @name) @def\n",
79                "(enum_variant name: (identifier) @name) @def\n",
80                "(impl_item type: (type_identifier) @name) @def\n",
81                "(trait_item name: (type_identifier) @name) @def\n",
82                "(const_item name: (identifier) @name) @def\n",
83                "(static_item name: (identifier) @name) @def\n",
84                "(mod_item name: (identifier) @name) @def",
85            ),
86        ),
87        // Python: top-level functions AND methods inside classes (function_definition
88        // matches at any nesting depth, so methods are captured individually).
89        "py" | "pyi" => (
90            tree_sitter_python::LANGUAGE.into(),
91            concat!(
92                "(function_definition name: (identifier) @name) @def\n",
93                "(class_definition name: (identifier) @name body: (block) @def)\n",
94                "(assignment left: (identifier) @name) @def",
95            ),
96        ),
97        // JS: functions, methods, and arrow functions assigned to variables.
98        "js" | "jsx" => (
99            tree_sitter_javascript::LANGUAGE.into(),
100            concat!(
101                "(function_declaration name: (identifier) @name) @def\n",
102                "(method_definition name: (property_identifier) @name) @def\n",
103                "(class_declaration name: (identifier) @name) @def\n",
104                "(variable_declarator name: (identifier) @name) @def",
105            ),
106        ),
107        "ts" => (
108            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
109            concat!(
110                "(function_declaration name: (identifier) @name) @def\n",
111                "(method_definition name: (property_identifier) @name) @def\n",
112                "(class_declaration name: (type_identifier) @name) @def\n",
113                "(interface_declaration name: (type_identifier) @name) @def\n",
114                "(variable_declarator name: (identifier) @name) @def\n",
115                "(type_alias_declaration name: (type_identifier) @name) @def\n",
116                "(enum_declaration name: (identifier) @name) @def",
117            ),
118        ),
119        "tsx" => (
120            tree_sitter_typescript::LANGUAGE_TSX.into(),
121            concat!(
122                "(function_declaration name: (identifier) @name) @def\n",
123                "(method_definition name: (property_identifier) @name) @def\n",
124                "(class_declaration name: (type_identifier) @name) @def\n",
125                "(interface_declaration name: (type_identifier) @name) @def\n",
126                "(variable_declarator name: (identifier) @name) @def\n",
127                "(type_alias_declaration name: (type_identifier) @name) @def\n",
128                "(enum_declaration name: (identifier) @name) @def",
129            ),
130        ),
131        "go" => (
132            tree_sitter_go::LANGUAGE.into(),
133            concat!(
134                "(function_declaration name: (identifier) @name) @def\n",
135                "(method_declaration name: (field_identifier) @name) @def\n",
136                "(type_declaration (type_spec name: (type_identifier) @name)) @def\n",
137                "(const_spec name: (identifier) @name) @def",
138            ),
139        ),
140        // Java: methods are already captured individually (method_declaration
141        // matches inside class bodies). Keep class for the signature/fields.
142        "java" => (
143            tree_sitter_java::LANGUAGE.into(),
144            concat!(
145                "(method_declaration name: (identifier) @name) @def\n",
146                "(class_declaration name: (identifier) @name) @def\n",
147                "(interface_declaration name: (identifier) @name) @def\n",
148                "(field_declaration declarator: (variable_declarator name: (identifier) @name)) @def\n",
149                "(enum_constant name: (identifier) @name) @def\n",
150                "(enum_declaration name: (identifier) @name) @def\n",
151                "(constructor_declaration name: (identifier) @name) @def",
152            ),
153        ),
154        "c" | "h" => (
155            tree_sitter_c::LANGUAGE.into(),
156            concat!(
157                "(function_definition declarator: (function_declarator declarator: (identifier) @name)) @def\n",
158                "(declaration declarator: (init_declarator declarator: (identifier) @name)) @def\n",
159                "(struct_specifier name: (type_identifier) @name) @def\n",
160                "(enum_specifier name: (type_identifier) @name) @def\n",
161                "(type_definition declarator: (type_identifier) @name) @def",
162            ),
163        ),
164        // C++: functions at any level, plus class signatures.
165        "cpp" | "cc" | "cxx" | "hpp" => (
166            tree_sitter_cpp::LANGUAGE.into(),
167            concat!(
168                "(function_definition declarator: (function_declarator declarator: (identifier) @name)) @def\n",
169                "(class_specifier name: (type_identifier) @name) @def\n",
170                "(declaration declarator: (init_declarator declarator: (identifier) @name)) @def\n",
171                "(struct_specifier name: (type_identifier) @name) @def\n",
172                "(enum_specifier name: (type_identifier) @name) @def\n",
173                "(type_definition declarator: (type_identifier) @name) @def\n",
174                "(namespace_definition name: (namespace_identifier) @name) @def\n",
175                "(field_declaration declarator: (field_identifier) @name) @def",
176            ),
177        ),
178        // Bash: function definitions (.bats = Bash Automated Testing System).
179        "sh" | "bash" | "bats" => (
180            tree_sitter_bash::LANGUAGE.into(),
181            concat!(
182                "(function_definition name: (word) @name) @def\n",
183                "(variable_assignment name: (variable_name) @name) @def",
184            ),
185        ),
186        // Ruby: methods, classes, and modules.
187        "rb" => (
188            tree_sitter_ruby::LANGUAGE.into(),
189            concat!(
190                "(method name: (identifier) @name) @def\n",
191                "(class name: (constant) @name) @def\n",
192                "(module name: (constant) @name) @def\n",
193                "(assignment left: (identifier) @name) @def\n",
194                "(assignment left: (constant) @name) @def",
195            ),
196        ),
197        // HCL (Terraform): resource, data, variable, and output blocks.
198        "tf" | "tfvars" | "hcl" => (
199            tree_sitter_hcl::LANGUAGE.into(),
200            "(block (identifier) @name) @def",
201        ),
202        // Kotlin: functions, classes, and objects.
203        "kt" | "kts" => (
204            tree_sitter_kotlin_ng::LANGUAGE.into(),
205            concat!(
206                "(function_declaration name: (identifier) @name) @def\n",
207                "(class_declaration name: (identifier) @name) @def\n",
208                "(object_declaration name: (identifier) @name) @def\n",
209                "(property_declaration (identifier) @name) @def\n",
210                "(enum_entry (identifier) @name) @def",
211            ),
212        ),
213        // Swift: functions, classes, structs, enums, and protocols.
214        "swift" => (
215            tree_sitter_swift::LANGUAGE.into(),
216            concat!(
217                "(function_declaration name: (simple_identifier) @name) @def\n",
218                "(class_declaration name: (type_identifier) @name) @def\n",
219                "(protocol_declaration name: (type_identifier) @name) @def\n",
220                "(property_declaration name: (pattern bound_identifier: (simple_identifier) @name)) @def\n",
221                "(typealias_declaration name: (type_identifier) @name) @def",
222            ),
223        ),
224        // Scala: functions, classes, traits, and objects.
225        "scala" => (
226            tree_sitter_scala::LANGUAGE.into(),
227            concat!(
228                "(function_definition name: (identifier) @name) @def\n",
229                "(class_definition name: (identifier) @name) @def\n",
230                "(trait_definition name: (identifier) @name) @def\n",
231                "(object_definition name: (identifier) @name) @def\n",
232                "(val_definition pattern: (identifier) @name) @def\n",
233                "(var_definition pattern: (identifier) @name) @def\n",
234                "(type_definition name: (type_identifier) @name) @def",
235            ),
236        ),
237        // TOML: table headers (sections).
238        "toml" => (
239            tree_sitter_toml_ng::LANGUAGE.into(),
240            concat!(
241                "(table (bare_key) @name) @def\n",
242                "(pair (bare_key) @name) @def",
243            ),
244        ),
245        // JSON: key-value pairs, capturing the key string content.
246        "json" => (
247            tree_sitter_json::LANGUAGE.into(),
248            "(pair key: (string (string_content) @name)) @def",
249        ),
250        // YAML: block mapping pairs with plain scalar keys.
251        "yaml" | "yml" => (
252            tree_sitter_yaml::LANGUAGE.into(),
253            "(block_mapping_pair key: (flow_node (plain_scalar (string_scalar) @name))) @def",
254        ),
255        // Markdown: ATX headings (# through ######), capturing the heading text.
256        "md" => (
257            tree_sitter_md::LANGUAGE.into(),
258            "(atx_heading heading_content: (inline) @name) @def",
259        ),
260        // RDF/XML and OWL/XML are XML documents; capture each element so
261        // ontology classes/properties become searchable semantic chunks.
262        "xml" | "rdf" | "owl" => (
263            tree_sitter_xml::LANGUAGE_XML.into(),
264            concat!(
265                "(element (STag (Name) @name)) @def\n",
266                "(element (EmptyElemTag (Name) @name)) @def",
267            ),
268        ),
269        _ => return None,
270    };
271    let query = match Query::new(&lang, query_str) {
272        Ok(q) => q,
273        Err(e) => {
274            tracing::warn!(ext, %e, "tree-sitter query compilation failed — language may be ABI-incompatible");
275            return None;
276        }
277    };
278    Some(LangConfig {
279        language: lang,
280        query,
281    })
282}
283
284/// Look up the call-extraction query for a file extension.
285///
286/// Compiled queries are cached per extension so repeated calls are free.
287/// Returns `None` for unsupported extensions (including TOML, which has
288/// no function calls).
289#[must_use]
290pub fn call_query_for_extension(ext: &str) -> Option<Arc<CallConfig>> {
291    static CACHE: OnceLock<std::collections::HashMap<&'static str, Arc<CallConfig>>> =
292        OnceLock::new();
293
294    let cache = CACHE.get_or_init(|| {
295        let mut m = std::collections::HashMap::new();
296        // Pre-compile for all extensions that have callable constructs.
297        // TOML is deliberately excluded — it has no function calls.
298        for &ext in &[
299            "rs", "py", "pyi", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc",
300            "cxx", "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
301            "scala",
302        ] {
303            if let Some(cfg) = compile_call_config(ext) {
304                m.insert(ext, Arc::new(cfg));
305            }
306        }
307        m
308    });
309
310    cache.get(ext).cloned()
311}
312
313/// Compile a [`CallConfig`] for the given extension (uncached).
314///
315/// Each query extracts the callee identifier (`@callee`) from function
316/// and method calls, plus the whole call expression (`@call`).
317#[expect(
318    clippy::too_many_lines,
319    reason = "one match arm per language — flat by design"
320)]
321fn compile_call_config(ext: &str) -> Option<CallConfig> {
322    let (lang, query_str): (Language, &str) = match ext {
323        // Rust: free calls, method calls, and scoped (path) calls.
324        //
325        // For scoped calls, capture the full `scoped_identifier` node as @callee
326        // (not just the trailing `(identifier)` child). This preserves the qualified
327        // path so that `mod_a::foo()` records "mod_a::foo" rather than bare "foo",
328        // enabling cross-module disambiguation in `resolve_calls`.
329        "rs" => (
330            tree_sitter_rust::LANGUAGE.into(),
331            concat!(
332                "(call_expression function: (identifier) @callee) @call\n",
333                "(call_expression function: (field_expression field: (field_identifier) @callee)) @call\n",
334                "(call_expression function: (scoped_identifier) @callee) @call",
335            ),
336        ),
337        // Python: simple calls and attribute (method) calls.
338        "py" | "pyi" => (
339            tree_sitter_python::LANGUAGE.into(),
340            concat!(
341                "(call function: (identifier) @callee) @call\n",
342                "(call function: (attribute attribute: (identifier) @callee)) @call",
343            ),
344        ),
345        // JavaScript: function calls and member expression calls.
346        "js" | "jsx" => (
347            tree_sitter_javascript::LANGUAGE.into(),
348            concat!(
349                "(call_expression function: (identifier) @callee) @call\n",
350                "(call_expression function: (member_expression property: (property_identifier) @callee)) @call",
351            ),
352        ),
353        // TypeScript: same patterns as JavaScript.
354        "ts" => (
355            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
356            concat!(
357                "(call_expression function: (identifier) @callee) @call\n",
358                "(call_expression function: (member_expression property: (property_identifier) @callee)) @call",
359            ),
360        ),
361        // TSX: same patterns as JavaScript.
362        "tsx" => (
363            tree_sitter_typescript::LANGUAGE_TSX.into(),
364            concat!(
365                "(call_expression function: (identifier) @callee) @call\n",
366                "(call_expression function: (member_expression property: (property_identifier) @callee)) @call",
367            ),
368        ),
369        // Go: function calls and selector (method) calls.
370        "go" => (
371            tree_sitter_go::LANGUAGE.into(),
372            concat!(
373                "(call_expression function: (identifier) @callee) @call\n",
374                "(call_expression function: (selector_expression field: (field_identifier) @callee)) @call",
375            ),
376        ),
377        // Java: method invocations.
378        "java" => (
379            tree_sitter_java::LANGUAGE.into(),
380            "(method_invocation name: (identifier) @callee) @call",
381        ),
382        // C: function calls and field-expression calls (function pointers).
383        "c" | "h" => (
384            tree_sitter_c::LANGUAGE.into(),
385            concat!(
386                "(call_expression function: (identifier) @callee) @call\n",
387                "(call_expression function: (field_expression field: (field_identifier) @callee)) @call",
388            ),
389        ),
390        // C++: same patterns as C.
391        "cpp" | "cc" | "cxx" | "hpp" => (
392            tree_sitter_cpp::LANGUAGE.into(),
393            concat!(
394                "(call_expression function: (identifier) @callee) @call\n",
395                "(call_expression function: (field_expression field: (field_identifier) @callee)) @call",
396            ),
397        ),
398        // Bash: command invocations (.bats = Bash Automated Testing System).
399        "sh" | "bash" | "bats" => (
400            tree_sitter_bash::LANGUAGE.into(),
401            "(command name: (command_name (word) @callee)) @call",
402        ),
403        // Ruby: method calls.
404        "rb" => (
405            tree_sitter_ruby::LANGUAGE.into(),
406            "(call method: (identifier) @callee) @call",
407        ),
408        // HCL (Terraform): built-in function calls.
409        "tf" | "tfvars" | "hcl" => (
410            tree_sitter_hcl::LANGUAGE.into(),
411            "(function_call (identifier) @callee) @call",
412        ),
413        // Kotlin: call expressions — grammar uses unnamed children, so match
414        // identifier as first child of call_expression.
415        "kt" | "kts" => (
416            tree_sitter_kotlin_ng::LANGUAGE.into(),
417            "(call_expression (identifier) @callee) @call",
418        ),
419        // Swift: call expressions with simple identifiers.
420        "swift" => (
421            tree_sitter_swift::LANGUAGE.into(),
422            "(call_expression (simple_identifier) @callee) @call",
423        ),
424        // Scala: function calls and field-expression (method) calls.
425        "scala" => (
426            tree_sitter_scala::LANGUAGE.into(),
427            concat!(
428                "(call_expression function: (identifier) @callee) @call\n",
429                "(call_expression function: (field_expression field: (identifier) @callee)) @call",
430            ),
431        ),
432        _ => return None,
433    };
434    let query = match Query::new(&lang, query_str) {
435        Ok(q) => q,
436        Err(e) => {
437            tracing::warn!(ext, %e, "tree-sitter call query compilation failed");
438            return None;
439        }
440    };
441    Some(CallConfig {
442        language: lang,
443        query,
444    })
445}
446
#[cfg(test)]
mod tests {
    use super::*;

    /// Extensions that must resolve to a compiled call query.
    const CALL_EXTS: &[&str] = &[
        "rs", "py", "pyi", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc",
        "cxx", "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
        "scala",
    ];

    /// Extensions that must resolve to a compiled definition query.
    const DEFINITION_EXTS: &[&str] = &[
        "rs", "py", "pyi", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc",
        "cxx", "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
        "scala", "toml", "json", "yaml", "yml", "md", "xml", "rdf", "owl",
    ];

    #[test]
    fn rust_extension_resolves() {
        assert!(config_for_extension("rs").is_some());
    }

    #[test]
    fn python_extension_resolves() {
        assert!(config_for_extension("py").is_some());
    }

    #[test]
    fn python_stub_extension_resolves() {
        assert!(config_for_extension("pyi").is_some());
    }

    #[test]
    fn unknown_extension_returns_none() {
        assert!(config_for_extension("xyz").is_none());
    }

    #[test]
    fn all_supported_extensions() {
        for ext in DEFINITION_EXTS {
            assert!(config_for_extension(ext).is_some(), "failed for {ext}");
        }
    }

    /// Turtle-family RDF serializations have no tree-sitter config here and
    /// must instead be recognised by the RDF text chunker.
    #[test]
    fn turtle_family_uses_rdf_text_chunking_not_tree_sitter() {
        for ext in ["ttl", "nt", "n3", "trig", "nq"] {
            let tree_sitter_cfg = config_for_extension(ext);
            assert!(
                tree_sitter_cfg.is_none(),
                "{ext} should be handled by RDF text chunking"
            );
            assert!(crate::chunk::is_rdf_text_extension(ext));
        }
    }

    #[test]
    fn all_call_query_extensions() {
        for ext in CALL_EXTS {
            assert!(
                call_query_for_extension(ext).is_some(),
                "call query failed for {ext}"
            );
        }
    }

    #[test]
    fn toml_has_no_call_query() {
        assert!(call_query_for_extension("toml").is_none());
    }

    /// Regression test (R2.3 issue a): a `scoped_identifier` call must record
    /// the full path as `@callee`.
    ///
    /// `mod_a::foo()` has to yield `mod_a::foo`; the bare trailing segment
    /// `foo` must not be reported.
    #[test]
    fn test_scoped_identifier_call_query_captures_full_path() {
        use streaming_iterator::StreamingIterator as _;

        let source = "
fn caller() {
    mod_a::foo();
    std::io::stderr();
}
";
        let call_cfg = call_query_for_extension("rs").expect("rs call config");

        let mut parser = tree_sitter::Parser::new();
        parser
            .set_language(&call_cfg.language)
            .expect("set language");
        let tree = parser.parse(source, None).expect("parse");

        let capture_names = call_cfg.query.capture_names();
        let mut cursor = tree_sitter::QueryCursor::new();
        let mut matches = cursor.matches(&call_cfg.query, tree.root_node(), source.as_bytes());

        // Gather the source text of every `@callee` capture across all matches.
        let mut callees: Vec<String> = Vec::new();
        while let Some(found) = matches.next() {
            callees.extend(
                found
                    .captures
                    .iter()
                    .filter(|cap| capture_names[cap.index as usize] == "callee")
                    .map(|cap| source[cap.node.start_byte()..cap.node.end_byte()].to_string()),
            );
        }

        // The fully qualified path must be present...
        assert!(
            callees.iter().any(|c| c == "mod_a::foo"),
            "expected 'mod_a::foo' in callees, got: {callees:?}"
        );
        // ...and the bare trailing segment must not be.
        assert!(
            !callees.iter().any(|c| c == "foo"),
            "bare 'foo' must not appear for scoped call; got: {callees:?}"
        );
    }
}