Skip to main content

ripvec_core/
languages.rs

1//! Language registry mapping file extensions to tree-sitter grammars.
2//!
3//! Each supported language has a grammar and a tree-sitter query that
4//! extracts function, class, and method definitions. Compiled queries
5//! are cached so that repeated calls for the same extension are free.
6
7use std::sync::{Arc, OnceLock};
8
9use tree_sitter::{Language, Query};
10
11/// Configuration for a supported source language.
12///
13/// Wrapped in [`Arc`] so it can be shared across threads and returned
14/// from the cache without cloning the compiled [`Query`].
15pub struct LangConfig {
16    /// The tree-sitter Language grammar.
17    pub language: Language,
18    /// Query that extracts semantic chunks (`@def` captures with `@name`).
19    pub query: Query,
20}
21
22/// Look up the language configuration for a file extension.
23///
24/// Compiled queries are cached per extension so repeated calls are free.
25/// Returns `None` for unsupported extensions.
26#[must_use]
27pub fn config_for_extension(ext: &str) -> Option<Arc<LangConfig>> {
28    // Cache of compiled configs, keyed by canonical extension.
29    static CACHE: OnceLock<std::collections::HashMap<&'static str, Arc<LangConfig>>> =
30        OnceLock::new();
31
32    let cache = CACHE.get_or_init(|| {
33        let mut m = std::collections::HashMap::new();
34        // Pre-compile all supported extensions
35        for &ext in &[
36            "rs", "py", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc", "cxx",
37            "hpp", "sh", "bash", "rb", "tf", "hcl", "kt", "kts", "swift", "scala", "toml",
38        ] {
39            if let Some(cfg) = compile_config(ext) {
40                m.insert(ext, Arc::new(cfg));
41            }
42        }
43        m
44    });
45
46    cache.get(ext).cloned()
47}
48
49/// Compile a [`LangConfig`] for the given extension (uncached).
50#[expect(
51    clippy::too_many_lines,
52    reason = "one match arm per language — flat by design"
53)]
54fn compile_config(ext: &str) -> Option<LangConfig> {
55    let (lang, query_str): (Language, &str) = match ext {
56        // Rust: standalone functions, structs, and methods INSIDE impl/trait blocks.
57        // impl_item and trait_item are NOT captured as wholes — we extract their
58        // individual function_item children for method-level granularity.
59        "rs" => (
60            tree_sitter_rust::LANGUAGE.into(),
61            concat!(
62                "(function_item name: (identifier) @name) @def\n",
63                "(struct_item name: (type_identifier) @name) @def\n",
64                "(enum_item name: (type_identifier) @name) @def\n",
65                "(type_item name: (type_identifier) @name) @def",
66            ),
67        ),
68        // Python: top-level functions AND methods inside classes (function_definition
69        // matches at any nesting depth, so methods are captured individually).
70        "py" => (
71            tree_sitter_python::LANGUAGE.into(),
72            concat!(
73                "(function_definition name: (identifier) @name) @def\n",
74                "(class_definition name: (identifier) @name body: (block) @def)",
75            ),
76        ),
77        // JS: functions, methods, and arrow functions assigned to variables.
78        "js" | "jsx" => (
79            tree_sitter_javascript::LANGUAGE.into(),
80            concat!(
81                "(function_declaration name: (identifier) @name) @def\n",
82                "(method_definition name: (property_identifier) @name) @def\n",
83                "(class_declaration name: (identifier) @name) @def",
84            ),
85        ),
86        "ts" => (
87            tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
88            concat!(
89                "(function_declaration name: (identifier) @name) @def\n",
90                "(method_definition name: (property_identifier) @name) @def\n",
91                "(class_declaration name: (type_identifier) @name) @def\n",
92                "(interface_declaration name: (type_identifier) @name) @def",
93            ),
94        ),
95        "tsx" => (
96            tree_sitter_typescript::LANGUAGE_TSX.into(),
97            concat!(
98                "(function_declaration name: (identifier) @name) @def\n",
99                "(method_definition name: (property_identifier) @name) @def\n",
100                "(class_declaration name: (type_identifier) @name) @def\n",
101                "(interface_declaration name: (type_identifier) @name) @def",
102            ),
103        ),
104        "go" => (
105            tree_sitter_go::LANGUAGE.into(),
106            concat!(
107                "(function_declaration name: (identifier) @name) @def\n",
108                "(method_declaration name: (field_identifier) @name) @def",
109            ),
110        ),
111        // Java: methods are already captured individually (method_declaration
112        // matches inside class bodies). Keep class for the signature/fields.
113        "java" => (
114            tree_sitter_java::LANGUAGE.into(),
115            concat!(
116                "(method_declaration name: (identifier) @name) @def\n",
117                "(class_declaration name: (identifier) @name) @def\n",
118                "(interface_declaration name: (identifier) @name) @def",
119            ),
120        ),
121        "c" | "h" => (
122            tree_sitter_c::LANGUAGE.into(),
123            "(function_definition declarator: (function_declarator declarator: (identifier) @name)) @def",
124        ),
125        // C++: functions at any level, plus class signatures.
126        "cpp" | "cc" | "cxx" | "hpp" => (
127            tree_sitter_cpp::LANGUAGE.into(),
128            concat!(
129                "(function_definition declarator: (function_declarator declarator: (identifier) @name)) @def\n",
130                "(class_specifier name: (type_identifier) @name) @def",
131            ),
132        ),
133        // Bash: function definitions.
134        "sh" | "bash" => (
135            tree_sitter_bash::LANGUAGE.into(),
136            "(function_definition name: (word) @name) @def",
137        ),
138        // Ruby: methods, classes, and modules.
139        "rb" => (
140            tree_sitter_ruby::LANGUAGE.into(),
141            concat!(
142                "(method name: (identifier) @name) @def\n",
143                "(class name: (constant) @name) @def\n",
144                "(module name: (constant) @name) @def",
145            ),
146        ),
147        // HCL (Terraform): resource, data, variable, and output blocks.
148        "tf" | "hcl" => (
149            tree_sitter_hcl::LANGUAGE.into(),
150            "(block (identifier) @name) @def",
151        ),
152        // Kotlin: functions, classes, and objects.
153        "kt" | "kts" => (
154            tree_sitter_kotlin_ng::LANGUAGE.into(),
155            concat!(
156                "(function_declaration name: (identifier) @name) @def\n",
157                "(class_declaration name: (identifier) @name) @def\n",
158                "(object_declaration name: (identifier) @name) @def",
159            ),
160        ),
161        // Swift: functions, classes, structs, enums, and protocols.
162        "swift" => (
163            tree_sitter_swift::LANGUAGE.into(),
164            concat!(
165                "(function_declaration name: (simple_identifier) @name) @def\n",
166                "(class_declaration name: (type_identifier) @name) @def\n",
167                "(protocol_declaration name: (type_identifier) @name) @def",
168            ),
169        ),
170        // Scala: functions, classes, traits, and objects.
171        "scala" => (
172            tree_sitter_scala::LANGUAGE.into(),
173            concat!(
174                "(function_definition name: (identifier) @name) @def\n",
175                "(class_definition name: (identifier) @name) @def\n",
176                "(trait_definition name: (identifier) @name) @def\n",
177                "(object_definition name: (identifier) @name) @def",
178            ),
179        ),
180        // TOML: table headers (sections).
181        "toml" => (
182            tree_sitter_toml_ng::LANGUAGE.into(),
183            "(table (bare_key) @name) @def",
184        ),
185        _ => return None,
186    };
187    let query = match Query::new(&lang, query_str) {
188        Ok(q) => q,
189        Err(e) => {
190            tracing::warn!(ext, %e, "tree-sitter query compilation failed — language may be ABI-incompatible");
191            return None;
192        }
193    };
194    Some(LangConfig {
195        language: lang,
196        query,
197    })
198}
199
#[cfg(test)]
mod tests {
    use super::*;

    /// Every extension the registry is expected to resolve.
    const SUPPORTED: [&str; 24] = [
        "rs", "py", "js", "jsx", "ts", "tsx", "go", "java", "c", "h", "cpp", "cc", "cxx",
        "hpp", "sh", "bash", "rb", "tf", "hcl", "kt", "kts", "swift", "scala", "toml",
    ];

    /// The Rust extension yields a configuration.
    #[test]
    fn rust_extension_resolves() {
        let resolved = config_for_extension("rs");
        assert!(resolved.is_some());
    }

    /// The Python extension yields a configuration.
    #[test]
    fn python_extension_resolves() {
        let resolved = config_for_extension("py");
        assert!(resolved.is_some());
    }

    /// An extension outside the table yields nothing.
    #[test]
    fn unknown_extension_returns_none() {
        let resolved = config_for_extension("xyz");
        assert!(resolved.is_none());
    }

    /// Each listed extension resolves; a failure names the offending one.
    #[test]
    fn all_supported_extensions() {
        for ext in SUPPORTED {
            assert!(config_for_extension(ext).is_some(), "failed for {ext}");
        }
    }
}