normalize_languages/grammar_loader.rs
1//! Dynamic grammar loading for tree-sitter.
2//!
3//! Loads tree-sitter grammars from shared libraries (.so/.dylib/.dll).
4//! Also loads highlight queries (.scm files) for syntax highlighting.
5//! Grammars are compiled from arborium sources via `cargo xtask build-grammars`.
6//!
7//! # ABI Compatibility
8//!
9//! Tree-sitter grammars have an ABI version embedded at compile time. The tree-sitter
10//! library only loads grammars within its supported version range:
11//! - tree-sitter 0.24: ABI 13-14
12//! - tree-sitter 0.25+: ABI 13-15
13//!
14//! Arborium grammar crates embed the ABI version in their parser.c source. When arborium
15//! updates to use newer tree-sitter, grammars must be recompiled. Stale grammars in
16//! `~/.config/moss/grammars/` may cause `LanguageError { version: N }` if incompatible.
17//!
18//! # Lifetime Requirements
19//!
20//! **IMPORTANT**: The `GrammarLoader` must outlive any `Language` or `Tree` obtained from it.
21//! The loader holds the shared library (`Library`) that contains the grammar's code. If the
22//! loader is dropped, the library is unloaded, and any `Language`/`Tree` references become
23//! dangling pointers (use-after-free, likely segfault).
24//!
25//! Safe patterns:
26//! - Use a global singleton loader (see `normalize::parsers::grammar_loader()`)
27//! - Keep the loader in scope for the duration of tree usage
28//! - Return `(Tree, GrammarLoader)` tuples from helper functions
29//!
30//! Unsafe pattern (causes segfault):
31//! ```ignore
32//! fn parse(code: &str) -> Tree {
33//! let loader = GrammarLoader::new(); // Created here
34//! let lang = loader.get("python").unwrap();
35//! let mut parser = Parser::new();
36//! parser.set_language(&lang).unwrap();
37//! parser.parse(code, None).unwrap() // Tree returned
38//! } // loader dropped here - library unloaded!
39//! // Tree now has dangling pointers -> segfault on use
40//! ```
41
42use libloading::{Library, Symbol};
43use std::collections::HashMap;
44use std::path::{Path, PathBuf};
45use std::sync::{Arc, RwLock};
46use tree_sitter::Language;
47use tree_sitter_language::LanguageFn;
48
49/// Loaded grammar with its backing library.
50///
51/// The `_library` field keeps the shared library loaded in memory. The `language`
52/// field contains pointers into this library's memory. Dropping the library while
53/// the language is in use causes undefined behavior (typically segfault).
54struct LoadedGrammar {
55 /// Backing shared library - must outlive any use of `language`.
56 _library: Library,
57 /// Tree-sitter Language (contains pointers into `_library`).
58 language: Language,
59}
60
61/// Dynamic grammar loader with caching.
62pub struct GrammarLoader {
63 /// Search paths for grammar libraries.
64 search_paths: Vec<PathBuf>,
65 /// Cached loaded grammars.
66 cache: RwLock<HashMap<String, Arc<LoadedGrammar>>>,
67 /// Cached highlight queries.
68 highlight_cache: RwLock<HashMap<String, Arc<String>>>,
69 /// Cached injection queries.
70 injection_cache: RwLock<HashMap<String, Arc<String>>>,
71}
72
73impl GrammarLoader {
74 /// Create a new grammar loader with default search paths.
75 ///
76 /// Search order:
77 /// 1. `MOSS_GRAMMAR_PATH` environment variable (colon-separated)
78 /// 2. `~/.config/moss/grammars/`
79 pub fn new() -> Self {
80 let mut paths = Vec::new();
81
82 // Environment variable takes priority
83 if let Ok(env_path) = std::env::var("MOSS_GRAMMAR_PATH") {
84 for p in env_path.split(':') {
85 if !p.is_empty() {
86 paths.push(PathBuf::from(p));
87 }
88 }
89 }
90
91 // User config directory
92 if let Some(config) = dirs::config_dir() {
93 paths.push(config.join("moss/grammars"));
94 }
95
96 Self {
97 search_paths: paths,
98 cache: RwLock::new(HashMap::new()),
99 highlight_cache: RwLock::new(HashMap::new()),
100 injection_cache: RwLock::new(HashMap::new()),
101 }
102 }
103
104 /// Create a loader with custom search paths.
105 pub fn with_paths(paths: Vec<PathBuf>) -> Self {
106 Self {
107 search_paths: paths,
108 cache: RwLock::new(HashMap::new()),
109 highlight_cache: RwLock::new(HashMap::new()),
110 injection_cache: RwLock::new(HashMap::new()),
111 }
112 }
113
114 /// Add a search path.
115 pub fn add_path(&mut self, path: PathBuf) {
116 self.search_paths.push(path);
117 }
118
119 /// Get a grammar by name.
120 ///
121 /// Returns None if grammar not found in search paths.
122 pub fn get(&self, name: &str) -> Option<Language> {
123 // Check cache first
124 if let Some(loaded) = self.cache.read().ok()?.get(name) {
125 return Some(loaded.language.clone());
126 }
127
128 self.load_external(name)
129 }
130
131 /// Get the highlight query for a grammar.
132 ///
133 /// Returns None if no highlight query found for the grammar.
134 /// Query files are {name}.highlights.scm in the grammar search paths.
135 pub fn get_highlights(&self, name: &str) -> Option<Arc<String>> {
136 // Check cache first
137 if let Some(query) = self.highlight_cache.read().ok()?.get(name) {
138 return Some(Arc::clone(query));
139 }
140
141 self.load_query(name, "highlights", &self.highlight_cache)
142 }
143
144 /// Get the injection query for a grammar.
145 ///
146 /// Returns None if no injection query found for the grammar.
147 /// Query files are {name}.injections.scm in the grammar search paths.
148 pub fn get_injections(&self, name: &str) -> Option<Arc<String>> {
149 // Check cache first
150 if let Some(query) = self.injection_cache.read().ok()?.get(name) {
151 return Some(Arc::clone(query));
152 }
153
154 self.load_query(name, "injections", &self.injection_cache)
155 }
156
157 /// Load a query file (.scm) from external file.
158 fn load_query(
159 &self,
160 name: &str,
161 query_type: &str,
162 cache: &RwLock<HashMap<String, Arc<String>>>,
163 ) -> Option<Arc<String>> {
164 let scm_name = format!("{name}.{query_type}.scm");
165
166 for search_path in &self.search_paths {
167 let scm_path = search_path.join(&scm_name);
168 if scm_path.exists() {
169 if let Ok(content) = std::fs::read_to_string(&scm_path) {
170 let query = Arc::new(content);
171
172 // Cache it
173 if let Ok(mut c) = cache.write() {
174 c.insert(name.to_string(), Arc::clone(&query));
175 }
176
177 return Some(query);
178 }
179 }
180 }
181
182 None
183 }
184
185 /// Load a grammar from external .so file.
186 fn load_external(&self, name: &str) -> Option<Language> {
187 let lib_name = grammar_lib_name(name);
188
189 for search_path in &self.search_paths {
190 let lib_path = search_path.join(&lib_name);
191 if lib_path.exists() {
192 if let Some(lang) = self.load_from_path(name, &lib_path) {
193 return Some(lang);
194 }
195 }
196 }
197
198 None
199 }
200
201 /// Load grammar from a specific path.
202 fn load_from_path(&self, name: &str, path: &Path) -> Option<Language> {
203 // SAFETY: Loading shared libraries is inherently unsafe. We accept this risk because:
204 // 1. Grammars come from arborium (bundled) or user-configured search paths
205 // 2. The alternative (no dynamic loading) would require compiling all grammars statically
206 // 3. Tree-sitter grammars are widely used and well-tested
207 let library = unsafe { Library::new(path).ok()? };
208
209 let symbol_name = grammar_symbol_name(name);
210 // SAFETY: We call the tree-sitter grammar function which returns a Language pointer.
211 // The function signature is defined by tree-sitter's C ABI. We trust that:
212 // 1. The symbol exists (checked by library.get)
213 // 2. The function conforms to tree-sitter's expected signature
214 // 3. The returned Language is valid for the lifetime of the library
215 let language = unsafe {
216 let func: Symbol<unsafe extern "C" fn() -> *const ()> =
217 library.get(symbol_name.as_bytes()).ok()?;
218 let lang_fn = LanguageFn::from_raw(*func);
219 Language::new(lang_fn)
220 };
221
222 // Cache the loaded grammar
223 let loaded = Arc::new(LoadedGrammar {
224 _library: library,
225 language: language.clone(),
226 });
227
228 if let Ok(mut cache) = self.cache.write() {
229 cache.insert(name.to_string(), loaded);
230 }
231
232 Some(language)
233 }
234
235 /// List available grammars in search paths.
236 pub fn available_external(&self) -> Vec<String> {
237 let mut grammars = Vec::new();
238 let ext = grammar_extension();
239
240 for path in &self.search_paths {
241 if let Ok(entries) = std::fs::read_dir(path) {
242 for entry in entries.flatten() {
243 let name = entry.file_name();
244 let name_str = name.to_string_lossy();
245 if name_str.ends_with(ext) {
246 let grammar_name = name_str.trim_end_matches(ext);
247 if !grammars.contains(&grammar_name.to_string()) {
248 grammars.push(grammar_name.to_string());
249 }
250 }
251 }
252 }
253 }
254
255 grammars.sort();
256 grammars
257 }
258}
259
260impl Default for GrammarLoader {
261 fn default() -> Self {
262 Self::new()
263 }
264}
265
266/// Get the library file name for a grammar.
267fn grammar_lib_name(name: &str) -> String {
268 let ext = grammar_extension();
269 format!("{name}{ext}")
270}
271
/// Compute the exported symbol that yields the `Language` for a grammar.
///
/// The symbol is `tree_sitter_` followed by the grammar name with hyphens turned
/// into underscores. Two arborium grammars deviate from that scheme and are
/// mapped explicitly: `rust` and `vb`.
fn grammar_symbol_name(name: &str) -> String {
    let suffix = match name {
        // Arborium grammars with non-standard symbol names.
        "rust" => String::from("rust_orchard"),
        "vb" => String::from("vb_dotnet"),
        // Everything else: hyphens are not valid in C identifiers.
        other => other.replace('-', "_"),
    };

    format!("tree_sitter_{suffix}")
}
284
/// Shared library extension (including the leading dot) used for grammar files
/// on the platform this crate is compiled for.
fn grammar_extension() -> &'static str {
    match (cfg!(target_os = "macos"), cfg!(target_os = "windows")) {
        (true, _) => ".dylib",
        (false, true) => ".dll",
        // Every other target uses the ELF-style extension.
        (false, false) => ".so",
    }
}
295
#[cfg(test)]
mod tests {
    use super::*;

    /// The library name is the grammar name plus the platform extension.
    #[test]
    fn test_grammar_lib_name() {
        let name = grammar_lib_name("python");
        assert!(name.starts_with("python."));
        assert_eq!(name, format!("python{}", grammar_extension()));
    }

    /// Covers the default naming scheme, hyphen normalization and both special cases.
    #[test]
    fn test_grammar_symbol_name() {
        assert_eq!(grammar_symbol_name("python"), "tree_sitter_python");
        assert_eq!(grammar_symbol_name("rust"), "tree_sitter_rust_orchard");
        assert_eq!(grammar_symbol_name("ssh-config"), "tree_sitter_ssh_config");
        assert_eq!(grammar_symbol_name("vb"), "tree_sitter_vb_dotnet");
    }

    /// A loader without search paths finds neither grammars nor queries.
    #[test]
    fn test_missing_grammar_returns_none() {
        let loader = GrammarLoader::with_paths(Vec::new());
        assert!(loader.get("python").is_none());
        assert!(loader.get_highlights("python").is_none());
        assert!(loader.get_injections("python").is_none());
    }

    /// A loader without search paths lists no grammars.
    #[test]
    fn test_available_external_empty() {
        let loader = GrammarLoader::with_paths(Vec::new());
        assert!(loader.available_external().is_empty());
    }

    /// Loads a real grammar from `target/grammars` when it has been built.
    ///
    /// The search path is passed via `with_paths` instead of setting
    /// `MOSS_GRAMMAR_PATH`: the test harness runs tests on several threads by
    /// default, so mutating the process environment here would race with other
    /// tests and would leak into them if an assertion failed before cleanup.
    #[test]
    fn test_load_from_search_path() {
        let grammar_path = std::env::current_dir()
            .expect("current directory should be accessible in tests")
            .join("target/grammars");

        if !grammar_path.exists() {
            eprintln!("Skipping: run `cargo xtask build-grammars` first");
            return;
        }

        let python_lib = grammar_path.join(grammar_lib_name("python"));
        let loader = GrammarLoader::with_paths(vec![grammar_path]);

        // Should load python from the shared library, if it was built
        if python_lib.exists() {
            let lang = loader.get("python");
            assert!(lang.is_some(), "Failed to load python grammar");
        }
    }
}