Skip to main content

ripvec_core/
languages.rs

1//! Language registry mapping file extensions to tree-sitter grammars.
2//!
3//! Each supported language has a grammar and a tree-sitter query that
4//! extracts function, class, and method definitions. Compiled queries
5//! are cached so that repeated calls for the same extension are free.
6
7use std::sync::{Arc, OnceLock};
8
9use tree_sitter::{Language, Query};
10
11/// Configuration for a supported source language.
12///
13/// Wrapped in [`Arc`] so it can be shared across threads and returned
14/// from the cache without cloning the compiled [`Query`].
15pub struct LangConfig {
16    /// The tree-sitter Language grammar.
17    pub language: Language,
18    /// Query that extracts semantic chunks (`@def` captures with `@name`).
19    pub query: Query,
20}
21
22/// Look up the language configuration for a file extension.
23///
24/// Compiled queries are cached per extension so repeated calls are free.
25/// Returns `None` for unsupported extensions.
26#[must_use]
27pub fn config_for_extension(ext: &str) -> Option<Arc<LangConfig>> {
28    // Cache of compiled configs, keyed by canonical extension.
29    static CACHE: OnceLock<std::collections::HashMap<&'static str, Arc<LangConfig>>> =
30        OnceLock::new();
31
32    let cache = CACHE.get_or_init(|| {
33        let mut m = std::collections::HashMap::new();
34        // Pre-compile all supported extensions
35        for &ext in &[
36            "rs", "py", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc", "cxx",
37            "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
38            "scala", "toml",
39        ] {
40            if let Some(cfg) = compile_config(ext) {
41                m.insert(ext, Arc::new(cfg));
42            }
43        }
44        m
45    });
46
47    cache.get(ext).cloned()
48}
49
50/// Compile a [`LangConfig`] for the given extension (uncached).
51#[expect(
52    clippy::too_many_lines,
53    reason = "one match arm per language — flat by design"
54)]
55fn compile_config(ext: &str) -> Option<LangConfig> {
56    let (lang, query_str): (Language, &str) = match ext {
57        // Rust: standalone functions, structs, and methods INSIDE impl/trait blocks.
58        // impl_item and trait_item are NOT captured as wholes — we extract their
59        // individual function_item children for method-level granularity.
60        "rs" => (
61            tree_sitter_rust::LANGUAGE.into(),
62            concat!(
63                "(function_item name: (identifier) @name) @def\n",
64                "(struct_item name: (type_identifier) @name) @def\n",
65                "(enum_item name: (type_identifier) @name) @def\n",
66                "(type_item name: (type_identifier) @name) @def",
67            ),
68        ),
69        // Python: top-level functions AND methods inside classes (function_definition
70        // matches at any nesting depth, so methods are captured individually).
71        "py" => (
72            tree_sitter_python::LANGUAGE.into(),
73            concat!(
74                "(function_definition name: (identifier) @name) @def\n",
75                "(class_definition name: (identifier) @name body: (block) @def)",
76            ),
77        ),
78        // JS: functions, methods, and arrow functions assigned to variables.
79        "js" | "jsx" => (
80            tree_sitter_javascript::LANGUAGE.into(),
81            concat!(
82                "(function_declaration name: (identifier) @name) @def\n",
83                "(method_definition name: (property_identifier) @name) @def\n",
84                "(class_declaration name: (identifier) @name) @def",
85            ),
86        ),
87        "ts" => (
88            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
89            concat!(
90                "(function_declaration name: (identifier) @name) @def\n",
91                "(method_definition name: (property_identifier) @name) @def\n",
92                "(class_declaration name: (type_identifier) @name) @def\n",
93                "(interface_declaration name: (type_identifier) @name) @def",
94            ),
95        ),
96        "tsx" => (
97            tree_sitter_typescript::LANGUAGE_TSX.into(),
98            concat!(
99                "(function_declaration name: (identifier) @name) @def\n",
100                "(method_definition name: (property_identifier) @name) @def\n",
101                "(class_declaration name: (type_identifier) @name) @def\n",
102                "(interface_declaration name: (type_identifier) @name) @def",
103            ),
104        ),
105        "go" => (
106            tree_sitter_go::LANGUAGE.into(),
107            concat!(
108                "(function_declaration name: (identifier) @name) @def\n",
109                "(method_declaration name: (field_identifier) @name) @def",
110            ),
111        ),
112        // Java: methods are already captured individually (method_declaration
113        // matches inside class bodies). Keep class for the signature/fields.
114        "java" => (
115            tree_sitter_java::LANGUAGE.into(),
116            concat!(
117                "(method_declaration name: (identifier) @name) @def\n",
118                "(class_declaration name: (identifier) @name) @def\n",
119                "(interface_declaration name: (identifier) @name) @def",
120            ),
121        ),
122        "c" | "h" => (
123            tree_sitter_c::LANGUAGE.into(),
124            "(function_definition declarator: (function_declarator declarator: (identifier) @name)) @def",
125        ),
126        // C++: functions at any level, plus class signatures.
127        "cpp" | "cc" | "cxx" | "hpp" => (
128            tree_sitter_cpp::LANGUAGE.into(),
129            concat!(
130                "(function_definition declarator: (function_declarator declarator: (identifier) @name)) @def\n",
131                "(class_specifier name: (type_identifier) @name) @def",
132            ),
133        ),
134        // Bash: function definitions (.bats = Bash Automated Testing System).
135        "sh" | "bash" | "bats" => (
136            tree_sitter_bash::LANGUAGE.into(),
137            "(function_definition name: (word) @name) @def",
138        ),
139        // Ruby: methods, classes, and modules.
140        "rb" => (
141            tree_sitter_ruby::LANGUAGE.into(),
142            concat!(
143                "(method name: (identifier) @name) @def\n",
144                "(class name: (constant) @name) @def\n",
145                "(module name: (constant) @name) @def",
146            ),
147        ),
148        // HCL (Terraform): resource, data, variable, and output blocks.
149        "tf" | "tfvars" | "hcl" => (
150            tree_sitter_hcl::LANGUAGE.into(),
151            "(block (identifier) @name) @def",
152        ),
153        // Kotlin: functions, classes, and objects.
154        "kt" | "kts" => (
155            tree_sitter_kotlin_ng::LANGUAGE.into(),
156            concat!(
157                "(function_declaration name: (identifier) @name) @def\n",
158                "(class_declaration name: (identifier) @name) @def\n",
159                "(object_declaration name: (identifier) @name) @def",
160            ),
161        ),
162        // Swift: functions, classes, structs, enums, and protocols.
163        "swift" => (
164            tree_sitter_swift::LANGUAGE.into(),
165            concat!(
166                "(function_declaration name: (simple_identifier) @name) @def\n",
167                "(class_declaration name: (type_identifier) @name) @def\n",
168                "(protocol_declaration name: (type_identifier) @name) @def",
169            ),
170        ),
171        // Scala: functions, classes, traits, and objects.
172        "scala" => (
173            tree_sitter_scala::LANGUAGE.into(),
174            concat!(
175                "(function_definition name: (identifier) @name) @def\n",
176                "(class_definition name: (identifier) @name) @def\n",
177                "(trait_definition name: (identifier) @name) @def\n",
178                "(object_definition name: (identifier) @name) @def",
179            ),
180        ),
181        // TOML: table headers (sections).
182        "toml" => (
183            tree_sitter_toml_ng::LANGUAGE.into(),
184            "(table (bare_key) @name) @def",
185        ),
186        _ => return None,
187    };
188    let query = match Query::new(&lang, query_str) {
189        Ok(q) => q,
190        Err(e) => {
191            tracing::warn!(ext, %e, "tree-sitter query compilation failed — language may be ABI-incompatible");
192            return None;
193        }
194    };
195    Some(LangConfig {
196        language: lang,
197        query,
198    })
199}
200
#[cfg(test)]
mod tests {
    use super::config_for_extension;

    /// The Rust configuration compiles and can be looked up.
    #[test]
    fn rust_extension_resolves() {
        assert!(config_for_extension("rs").is_some());
    }

    /// The Python configuration compiles and can be looked up.
    #[test]
    fn python_extension_resolves() {
        assert!(config_for_extension("py").is_some());
    }

    /// An extension with no registered grammar yields `None`.
    #[test]
    fn unknown_extension_returns_none() {
        assert!(config_for_extension("xyz").is_none());
    }

    /// Every extension in the supported list resolves to a configuration.
    #[test]
    fn all_supported_extensions() {
        const EXTENSIONS: &[&str] = &[
            "rs", "py", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc", "cxx",
            "hpp", "sh", "bash", "bats", "rb", "tf", "tfvars", "hcl", "kt", "kts", "swift",
            "scala", "toml",
        ];
        EXTENSIONS.iter().for_each(|ext| {
            assert!(config_for_extension(ext).is_some(), "failed for {ext}");
        });
    }
}
231}