Skip to main content

normalize_languages/
grammar_loader.rs

1//! Dynamic grammar loading for tree-sitter.
2//!
3//! Loads tree-sitter grammars from shared libraries (.so/.dylib/.dll).
4//! Also loads highlight queries (.scm files) for syntax highlighting.
5//! Grammars are compiled from arborium sources via `cargo xtask build-grammars`.
6//!
7//! # ABI Compatibility
8//!
9//! Tree-sitter grammars have an ABI version embedded at compile time. The tree-sitter
10//! library only loads grammars within its supported version range:
11//! - tree-sitter 0.24: ABI 13-14
12//! - tree-sitter 0.25+: ABI 13-15
13//!
14//! Arborium grammar crates embed the ABI version in their parser.c source. When arborium
15//! updates to use newer tree-sitter, grammars must be recompiled. Stale grammars in
16//! `~/.config/moss/grammars/` may cause `LanguageError { version: N }` if incompatible.
17//!
18//! # Lifetime Requirements
19//!
20//! **IMPORTANT**: The `GrammarLoader` must outlive any `Language` or `Tree` obtained from it.
21//! The loader holds the shared library (`Library`) that contains the grammar's code. If the
22//! loader is dropped, the library is unloaded, and any `Language`/`Tree` references become
23//! dangling pointers (use-after-free, likely segfault).
24//!
25//! Safe patterns:
26//! - Use a global singleton loader (see `normalize::parsers::grammar_loader()`)
27//! - Keep the loader in scope for the duration of tree usage
28//! - Return `(Tree, GrammarLoader)` tuples from helper functions
29//!
30//! Unsafe pattern (causes segfault):
31//! ```ignore
32//! fn parse(code: &str) -> Tree {
33//!     let loader = GrammarLoader::new();  // Created here
34//!     let lang = loader.get("python").unwrap();
35//!     let mut parser = Parser::new();
36//!     parser.set_language(&lang).unwrap();
37//!     parser.parse(code, None).unwrap()   // Tree returned
38//! }  // loader dropped here - library unloaded!
39//! // Tree now has dangling pointers -> segfault on use
40//! ```
41
42use libloading::{Library, Symbol};
43use std::collections::HashMap;
44use std::path::{Path, PathBuf};
45use std::sync::{Arc, RwLock};
46use tree_sitter::Language;
47use tree_sitter_language::LanguageFn;
48
49/// Loaded grammar with its backing library.
50///
51/// The `_library` field keeps the shared library loaded in memory. The `language`
52/// field contains pointers into this library's memory. Dropping the library while
53/// the language is in use causes undefined behavior (typically segfault).
54struct LoadedGrammar {
55    /// Backing shared library - must outlive any use of `language`.
56    _library: Library,
57    /// Tree-sitter Language (contains pointers into `_library`).
58    language: Language,
59}
60
61/// Dynamic grammar loader with caching.
62pub struct GrammarLoader {
63    /// Search paths for grammar libraries.
64    search_paths: Vec<PathBuf>,
65    /// Cached loaded grammars.
66    cache: RwLock<HashMap<String, Arc<LoadedGrammar>>>,
67    /// Cached highlight queries.
68    highlight_cache: RwLock<HashMap<String, Arc<String>>>,
69    /// Cached injection queries.
70    injection_cache: RwLock<HashMap<String, Arc<String>>>,
71}
72
73impl GrammarLoader {
74    /// Create a new grammar loader with default search paths.
75    ///
76    /// Search order:
77    /// 1. `MOSS_GRAMMAR_PATH` environment variable (colon-separated)
78    /// 2. `~/.config/moss/grammars/`
79    pub fn new() -> Self {
80        let mut paths = Vec::new();
81
82        // Environment variable takes priority
83        if let Ok(env_path) = std::env::var("MOSS_GRAMMAR_PATH") {
84            for p in env_path.split(':') {
85                if !p.is_empty() {
86                    paths.push(PathBuf::from(p));
87                }
88            }
89        }
90
91        // User config directory
92        if let Some(config) = dirs::config_dir() {
93            paths.push(config.join("moss/grammars"));
94        }
95
96        Self {
97            search_paths: paths,
98            cache: RwLock::new(HashMap::new()),
99            highlight_cache: RwLock::new(HashMap::new()),
100            injection_cache: RwLock::new(HashMap::new()),
101        }
102    }
103
104    /// Create a loader with custom search paths.
105    pub fn with_paths(paths: Vec<PathBuf>) -> Self {
106        Self {
107            search_paths: paths,
108            cache: RwLock::new(HashMap::new()),
109            highlight_cache: RwLock::new(HashMap::new()),
110            injection_cache: RwLock::new(HashMap::new()),
111        }
112    }
113
114    /// Add a search path.
115    pub fn add_path(&mut self, path: PathBuf) {
116        self.search_paths.push(path);
117    }
118
119    /// Get a grammar by name.
120    ///
121    /// Returns None if grammar not found in search paths.
122    pub fn get(&self, name: &str) -> Option<Language> {
123        // Check cache first
124        if let Some(loaded) = self.cache.read().ok()?.get(name) {
125            return Some(loaded.language.clone());
126        }
127
128        self.load_external(name)
129    }
130
131    /// Get the highlight query for a grammar.
132    ///
133    /// Returns None if no highlight query found for the grammar.
134    /// Query files are {name}.highlights.scm in the grammar search paths.
135    pub fn get_highlights(&self, name: &str) -> Option<Arc<String>> {
136        // Check cache first
137        if let Some(query) = self.highlight_cache.read().ok()?.get(name) {
138            return Some(Arc::clone(query));
139        }
140
141        self.load_query(name, "highlights", &self.highlight_cache)
142    }
143
144    /// Get the injection query for a grammar.
145    ///
146    /// Returns None if no injection query found for the grammar.
147    /// Query files are {name}.injections.scm in the grammar search paths.
148    pub fn get_injections(&self, name: &str) -> Option<Arc<String>> {
149        // Check cache first
150        if let Some(query) = self.injection_cache.read().ok()?.get(name) {
151            return Some(Arc::clone(query));
152        }
153
154        self.load_query(name, "injections", &self.injection_cache)
155    }
156
157    /// Load a query file (.scm) from external file.
158    fn load_query(
159        &self,
160        name: &str,
161        query_type: &str,
162        cache: &RwLock<HashMap<String, Arc<String>>>,
163    ) -> Option<Arc<String>> {
164        let scm_name = format!("{name}.{query_type}.scm");
165
166        for search_path in &self.search_paths {
167            let scm_path = search_path.join(&scm_name);
168            if scm_path.exists() {
169                if let Ok(content) = std::fs::read_to_string(&scm_path) {
170                    let query = Arc::new(content);
171
172                    // Cache it
173                    if let Ok(mut c) = cache.write() {
174                        c.insert(name.to_string(), Arc::clone(&query));
175                    }
176
177                    return Some(query);
178                }
179            }
180        }
181
182        None
183    }
184
185    /// Load a grammar from external .so file.
186    fn load_external(&self, name: &str) -> Option<Language> {
187        let lib_name = grammar_lib_name(name);
188
189        for search_path in &self.search_paths {
190            let lib_path = search_path.join(&lib_name);
191            if lib_path.exists() {
192                if let Some(lang) = self.load_from_path(name, &lib_path) {
193                    return Some(lang);
194                }
195            }
196        }
197
198        None
199    }
200
201    /// Load grammar from a specific path.
202    fn load_from_path(&self, name: &str, path: &Path) -> Option<Language> {
203        // SAFETY: Loading shared libraries is inherently unsafe. We accept this risk because:
204        // 1. Grammars come from arborium (bundled) or user-configured search paths
205        // 2. The alternative (no dynamic loading) would require compiling all grammars statically
206        // 3. Tree-sitter grammars are widely used and well-tested
207        let library = unsafe { Library::new(path).ok()? };
208
209        let symbol_name = grammar_symbol_name(name);
210        // SAFETY: We call the tree-sitter grammar function which returns a Language pointer.
211        // The function signature is defined by tree-sitter's C ABI. We trust that:
212        // 1. The symbol exists (checked by library.get)
213        // 2. The function conforms to tree-sitter's expected signature
214        // 3. The returned Language is valid for the lifetime of the library
215        let language = unsafe {
216            let func: Symbol<unsafe extern "C" fn() -> *const ()> =
217                library.get(symbol_name.as_bytes()).ok()?;
218            let lang_fn = LanguageFn::from_raw(*func);
219            Language::new(lang_fn)
220        };
221
222        // Cache the loaded grammar
223        let loaded = Arc::new(LoadedGrammar {
224            _library: library,
225            language: language.clone(),
226        });
227
228        if let Ok(mut cache) = self.cache.write() {
229            cache.insert(name.to_string(), loaded);
230        }
231
232        Some(language)
233    }
234
235    /// List available grammars in search paths.
236    pub fn available_external(&self) -> Vec<String> {
237        let mut grammars = Vec::new();
238        let ext = grammar_extension();
239
240        for path in &self.search_paths {
241            if let Ok(entries) = std::fs::read_dir(path) {
242                for entry in entries.flatten() {
243                    let name = entry.file_name();
244                    let name_str = name.to_string_lossy();
245                    if name_str.ends_with(ext) {
246                        let grammar_name = name_str.trim_end_matches(ext);
247                        if !grammars.contains(&grammar_name.to_string()) {
248                            grammars.push(grammar_name.to_string());
249                        }
250                    }
251                }
252            }
253        }
254
255        grammars.sort();
256        grammars
257    }
258}
259
260impl Default for GrammarLoader {
261    fn default() -> Self {
262        Self::new()
263    }
264}
265
266/// Get the library file name for a grammar.
267fn grammar_lib_name(name: &str) -> String {
268    let ext = grammar_extension();
269    format!("{name}{ext}")
270}
271
/// Map a grammar name to the symbol its shared library is expected to export.
///
/// Two arborium grammars use non-standard symbol names and are special-cased;
/// every other grammar uses `tree_sitter_{name}` with hyphens turned into
/// underscores.
fn grammar_symbol_name(name: &str) -> String {
    match name {
        "rust" => String::from("tree_sitter_rust_orchard"),
        "vb" => String::from("tree_sitter_vb_dotnet"),
        other => format!("tree_sitter_{}", other.replace('-', "_")),
    }
}
284
/// Shared library extension (including the leading dot) for the platform this
/// crate was compiled for: `.dylib` on macOS, `.dll` on Windows, `.so` elsewhere.
fn grammar_extension() -> &'static str {
    match (cfg!(target_os = "macos"), cfg!(target_os = "windows")) {
        (true, _) => ".dylib",
        (false, true) => ".dll",
        (false, false) => ".so",
    }
}
295
#[cfg(test)]
mod tests {
    use super::*;

    /// The library file name starts with the grammar name and a dot.
    #[test]
    fn test_grammar_lib_name() {
        let name = grammar_lib_name("python");
        assert!(name.starts_with("python."));
    }

    /// Covers the default naming rule, hyphen normalization, and both special cases.
    #[test]
    fn test_grammar_symbol_name() {
        assert_eq!(grammar_symbol_name("python"), "tree_sitter_python");
        assert_eq!(grammar_symbol_name("rust"), "tree_sitter_rust_orchard");
        assert_eq!(grammar_symbol_name("ssh-config"), "tree_sitter_ssh_config");
        assert_eq!(grammar_symbol_name("vb"), "tree_sitter_vb_dotnet");
    }

    /// Loads the python grammar through `MOSS_GRAMMAR_PATH` when the grammars
    /// have been built; otherwise the test is skipped.
    #[test]
    fn test_load_from_env() {
        // Set up env var pointing to target/grammars
        let grammar_path = std::env::current_dir().unwrap().join("target/grammars");

        if !grammar_path.exists() {
            eprintln!("Skipping: run `cargo xtask build-grammars` first");
            return;
        }

        // SAFETY: mutating the environment is only sound while no other thread
        // reads or writes it. No other test in this module touches the environment.
        // NOTE(review): the default test harness runs tests on multiple threads —
        // confirm no test elsewhere in the crate uses the environment concurrently.
        unsafe {
            // Pass the path as an `OsStr`; no UTF-8 conversion that could panic.
            std::env::set_var("MOSS_GRAMMAR_PATH", &grammar_path);
        }

        // The loader reads the variable only while it is being constructed.
        let loader = GrammarLoader::new();

        // Clean up immediately, before any assertion can panic, so a failure
        // below never leaves the variable set for the rest of the test process.
        // SAFETY: same reasoning as for `set_var` above.
        unsafe {
            std::env::remove_var("MOSS_GRAMMAR_PATH");
        }

        // Should load python from .so
        let ext = grammar_extension();
        if grammar_path.join(format!("python{ext}")).exists() {
            let lang = loader.get("python");
            assert!(lang.is_some(), "Failed to load python grammar");
        }
    }
}
344}