libdiffsitter/
parse.rs

1//! Utilities for reading and parsing files with the diffsitter parser
2
3// Loads codegen methods from the build script
4// We only load for either static-grammar-libs or dynamic-grammar-libs. This is required
// because both of these features enable functions that need imports and functions
6#[cfg(feature = "static-grammar-libs")]
7include!(concat!(env!("OUT_DIR"), "/generated_grammar.rs"));
8
9#[cfg(feature = "static-grammar-libs")]
10use lazy_static::lazy_static;
11
#[cfg(feature = "static-grammar-libs")]
lazy_static! {
    /// The names of every language diffsitter was compiled with support for, sorted
    /// alphabetically.
    ///
    /// This list *only* covers statically compiled tree-sitter grammars; it is built from the
    /// keys of the generated `LANGUAGES` map.
    pub static ref SUPPORTED_LANGUAGES: Vec<&'static str> = {
        let mut names = LANGUAGES.keys().copied().collect::<Vec<_>>();
        // The map's iteration order is not meaningful to users, so expose a stable ordering.
        names.sort_unstable();
        names
    };
}
23
24#[cfg(not(feature = "static-grammar-libs"))]
25use phf::phf_map;
26
27#[cfg(not(feature = "static-grammar-libs"))]
28use tree_sitter::Language;
29
30use log::{debug, error, info};
31use logging_timer::time;
32use serde::{Deserialize, Serialize};
33use std::{
34    collections::HashMap,
35    fs, io,
36    path::{Path, PathBuf},
37};
38use thiserror::Error;
39use tree_sitter::{Parser, Tree, LANGUAGE_VERSION, MIN_COMPATIBLE_LANGUAGE_VERSION};
40
41/// A mapping of file extensions to their associated languages
42///
43/// The languages correspond to grammars from `tree-sitter`
44static FILE_EXTS: phf::Map<&'static str, &'static str> = phf_map! {
45    "hs" => "haskell",
46    "rs" => "rust",
47    "go" => "go",
48    "c" => "c",
49    "cc" => "cpp",
50    "cpp" => "cpp",
51    "cs" => "c_sharp",
52    "java" => "java",
53    "py" => "python",
54    "css" => "css",
55    "sh" => "bash",
56    "bash" => "bash",
57    "jl" => "julia",
58    "ml" => "ocaml",
59    "rb" => "ruby",
60    "scala" => "scala",
61    "sc" => "scala",
62    "swift" => "swift",
63    "php" => "php",
64    "json" => "json",
65    "hcl" => "hcl",
66    "ts" => "typescript",
67    "tsx" => "tsx",
68    "js" => "typescript",
69    "jsx" => "tsx",
70    "hpp" => "cpp",
71    "tpp" => "tpp",
72    "h" => "c",
73    "tf" => "hcl",
74    "md" => "markdown",
75};
76
77/// Possible errors that can arise when loading grammars
78#[derive(Error, Debug)]
79pub enum LoadingError {
80    #[cfg(feature = "static-grammar-libs")]
81    #[error("The program was not compiled with support for {0}")]
82    StaticNotCompiled(String),
83
84    #[error("This program was not compiled with support for any grammars")]
85    NoGrammars,
86
87    #[error("Unsupported extension: {0}")]
88    UnsupportedExt(String),
89
90    #[error("Did not find a valid file extension from filename {0}")]
91    NoFileExt(String),
92
93    #[error("tree-sitter had an error")]
94    LanguageError(#[from] tree_sitter::LanguageError),
95
96    #[error("could not parse {0} with tree-sitter")]
97    TSParseFailure(PathBuf),
98
99    #[error("Some IO error was encountered")]
100    IoError(#[from] io::Error),
101
102    #[error("Unable to dynamically load grammar")]
103    LibloadingError(#[from] libloading::Error),
104
105    #[error("Attempted to load a tree-sitter grammar with incompatible language ABI version: {0} (supported range: {1} - {2})")]
106    AbiOutOfRange(usize, usize, usize),
107}
108
109type StringMap = HashMap<String, String>;
110
111/// Configuration options pertaining to loading grammars and parsing files.
112#[derive(Debug, Eq, PartialEq, Serialize, Deserialize, Clone, Default)]
113#[serde(rename_all = "kebab-case")]
114pub struct GrammarConfig {
115    /// Set which dynamic library files should be used for different languages.
116    ///
117    /// This is a mapping from language strings to absolute file paths, relative filepaths, or
118    /// file names.
119    pub dylib_overrides: Option<StringMap>,
120
121    /// Override the languages that get resolved for different extensions.
122    ///
123    /// This is a mapping from extension names to language strings. For example:
124    /// ```txt
125    /// "cpp" => "cpp"
126    /// ```
127    pub file_associations: Option<StringMap>,
128}
129
/// Look up a statically compiled [tree sitter language](Language) by its language string.
///
/// # Errors
///
/// Returns [`LoadingError::StaticNotCompiled`] if `lang` is not a key of the generated
/// `LANGUAGES` map.
#[cfg(feature = "static-grammar-libs")]
fn generate_language_static(lang: &str) -> Result<Language, LoadingError> {
    info!("Using tree-sitter parser for language {}", lang);
    LANGUAGES
        .get(lang)
        // The map stores unsafe grammar constructors; invoke the one registered for `lang`.
        .map(|grammar_fn| unsafe { grammar_fn() })
        .ok_or_else(|| LoadingError::StaticNotCompiled(lang.to_string()))
}
141
/// Build the name of the constructor symbol exported by a tree-sitter grammar library.
///
/// The result is `tree_sitter_` followed by the language name with every dash (`-`) replaced by
/// an underscore (`_`).
///
/// # Arguments
///
/// - lang: The name of the language that corresponds to the parser. This must be the language name
///   that corresponds to the actual tree-sitter name for the language because it is used to
///   generate the name of the symbol from the shared object library that serves as the
///   constructor.
#[must_use]
pub fn tree_sitter_constructor_symbol_name(lang: &str) -> String {
    const SYMBOL_PREFIX: &str = "tree_sitter_";

    let mut symbol = String::with_capacity(SYMBOL_PREFIX.len() + lang.len());
    symbol.push_str(SYMBOL_PREFIX);
    symbol.extend(lang.chars().map(|c| if c == '-' { '_' } else { c }));
    symbol
}
157
/// Generate the name of the library to `dlopen` given the name of the language.
///
/// "libtree-sitter-" will be prepended to the name of the language, any underscores (_) will be
/// converted to dashes (-), and the appropriate extension will be applied based on the platform
/// this binary was compiled for.
163#[cfg(feature = "dynamic-grammar-libs")]
fn lib_name_from_lang(lang: &str) -> String {
    // Use the standard library's notion of the platform's shared library extension ("so",
    // "dylib", "dll", ...) instead of a hand-written OS list that panics on every other target.
    // On a platform without a usable extension the resulting file simply fails to load, which
    // the caller reports as a regular loading error.
    //
    // Library file names use dashes where tree-sitter language names use underscores.
    format!(
        "libtree-sitter-{}.{}",
        lang.replace('_', "-"),
        std::env::consts::DLL_EXTENSION
    )
}
176
177/// Create a tree sitter [Language] from a shared library object.
178///
179/// This creates a memory leak by leaking the shared library that's loaded from the file path
180/// (assuming that loading is succesful). This memory leak is *necessary* otherwise the program
181/// will segfault when trying to use the generated [Language] object with the tree-sitter library.
182/// The tree-sitter rust bindings wrap the tree-sitter C FFI interface, so the shared library
183/// object has to be loaded into memory while we want to use the [Language] object with any method
184/// in [`tree_sitter`].
185///
186/// # Arguments
187///
188/// - `language_name`: The tree-sitter language name.
189/// - `parser_path`: The path to the shared library object file.
190///
191/// # Errors
192///
193/// This will return an error if the file path doesn't exist or if there's an error trying to load
194/// symbols from the shared library object.
195///
196/// # Safety
197///
198/// This uses the [libloading] library to load symbols from the shared library object. This is
199/// inherently unsafe because it loads symbols from an arbitrary shared library object. Both the
200/// file path and the actual loaded symbol name can be generated from user input. This method does
201/// leak the shared library loaded with [libloading] to prevent segfaults because the parser loaded
202/// from the shared library may be used at any point for the duration of the program.
203pub fn construct_ts_lang_from_shared_lib(
204    language_name: &str,
205    parser_path: &Path,
206) -> Result<Language, LoadingError> {
207    info!(
208        "Loading dynamic library for language '{}' path '{}'",
209        language_name,
210        parser_path.to_string_lossy(),
211    );
212    let constructor_symbol_name = tree_sitter_constructor_symbol_name(language_name);
213    debug!(
214        "Using '{}' as symbol name for parser constructor method",
215        constructor_symbol_name
216    );
217    // We need to have the path as bytes for libloading
218    let grammar = unsafe {
219        // We leak the library because the symbol table has to be loaded in memory for the
220        // entire duration of the program up until the very end. There is probably a better way
221        // to do this that doesn't involve leaking memory, but I wasn't able to figure it out.
222        let shared_library = Box::new(libloading::Library::new(parser_path.as_os_str())?);
223        let static_shared_library = Box::leak(shared_library);
224        let constructor = static_shared_library.get::<libloading::Symbol<
225            unsafe extern "C" fn() -> Language,
226        >>(constructor_symbol_name.as_bytes())?;
227        constructor()
228    };
229    Ok(grammar)
230}
231
/// Try to load the tree-sitter grammar for `lang` from a shared library.
///
/// The library file name comes from `overrides` when it has an entry for `lang`; otherwise the
/// platform default name produced by `lib_name_from_lang` is used.
#[cfg(feature = "dynamic-grammar-libs")]
fn generate_language_dynamic(
    lang: &str,
    overrides: Option<&StringMap>,
) -> Result<Language, LoadingError> {
    let fallback_name = lib_name_from_lang(lang);

    let chosen_name = match overrides {
        Some(user_libs) => {
            debug!("Overriding dynamic library name because of user config");
            user_libs.get(lang).unwrap_or(&fallback_name)
        }
        None => &fallback_name,
    };
    construct_ts_lang_from_shared_lib(lang, Path::new(chosen_name))
}
249
250/// Generate a tree-sitter language from a language string.
251///
252/// This is a dispatch method that will attempt to load a statically linked grammar, and then fall
253/// back to loading the dynamic library for the grammar. If the user specifies an override for the
254/// dynamic library then that will be prioritized first.
255#[allow(clippy::vec_init_then_push)]
256// `config` is not used if the `dynamic-grammar-libs` build flag isn't enabled
257#[allow(unused)]
258pub fn generate_language(lang: &str, config: &GrammarConfig) -> Result<Language, LoadingError> {
259    // The candidates for the grammar, in order of precedence.
260    let mut grammar_candidates = Vec::new();
261
262    // Try the dynamic grammar first if there's a user override
263    #[cfg(feature = "dynamic-grammar-libs")]
264    if config.dylib_overrides.is_some() {
265        grammar_candidates.push(generate_language_dynamic(
266            lang,
267            config.dylib_overrides.as_ref(),
268        ));
269    }
270
271    // If there's no user override we prioritize the static/vendored grammar since there's much
272    // better guarantees of that working correctly.
273    #[cfg(feature = "static-grammar-libs")]
274    grammar_candidates.push(generate_language_static(lang));
275
276    #[cfg(feature = "dynamic-grammar-libs")]
277    if config.dylib_overrides.is_none() {
278        grammar_candidates.push(generate_language_dynamic(
279            lang,
280            config.dylib_overrides.as_ref(),
281        ));
282    }
283
284    // Need to get the length of the vector here to prevent issues with borrowing in the loop
285    let last_cand_idx = grammar_candidates.len() - 1;
286
287    for (i, candidate_result) in grammar_candidates.into_iter().enumerate() {
288        let is_last_cand = i == last_cand_idx;
289
290        match candidate_result {
291            Ok(grammar) => {
292                info!("Succeeded loading grammar for {}", lang);
293                ts_language_abi_checked(&grammar)?;
294                return Ok(grammar);
295            }
296            Err(e) => {
297                debug!("Failed to load candidate grammar for {}: {}", lang, &e);
298                // Only error out on the last candidate, otherwise we want to keep falling back to
299                // the next potential grammar
300                if is_last_cand {
301                    error!("Failed to load all candidate grammars for {}", lang);
302                    return Err(e);
303                }
304            }
305        };
306    }
307    error!("No grammars were loaded at all");
308    Err(LoadingError::NoGrammars)
309}
310
311/// Get the language string that corresponds to an extension.
312///
313/// The user is optionally allowed to supply a map of overrides for these extensions, if none are
314/// supplied, or if the given extension is not found in the map, this method will fall back to the
315/// default map, `FILE_EXTS`.
316#[must_use]
317pub fn resolve_language_str<'a>(
318    ext: &str,
319    overrides: Option<&'a HashMap<String, String>>,
320) -> Option<&'a str> {
321    let lang_from_override = {
322        if let Some(overrides) = overrides {
323            overrides.get(ext)
324        } else {
325            None
326        }
327    };
328    let lang_from_defaults = FILE_EXTS.get(ext);
329
330    if let Some(lang) = lang_from_override {
331        info!(
332            "Deduced language \"{}\" from extension \"{}\" provided from user mappings",
333            lang, ext
334        );
335        Some(lang)
336    } else if let Some(lang) = lang_from_defaults {
337        info!(
338            "Deduced language \"{}\" from extension \"{}\" from default mappings",
339            lang, ext
340        );
341        Some(lang)
342    } else {
343        error!(
344            "Was not able to find a language string for extension {}",
345            ext
346        );
347        None
348    }
349}
350
351/// Create an instance of a language from a file extension
352///
353/// The user may optionally provide a map of overrides or additional file extensions.
354#[deprecated(
355    since = "0.8.1",
356    note = "You should use lang_name_from_file_ext instead."
357)]
358pub fn language_from_ext(
359    ext: &str,
360    grammar_config: &GrammarConfig,
361) -> Result<Language, LoadingError> {
362    let language_str_cand = resolve_language_str(ext, grammar_config.file_associations.as_ref());
363
364    if let Some(language_str) = language_str_cand {
365        generate_language(language_str, grammar_config)
366    } else {
367        Err(LoadingError::UnsupportedExt(ext.to_string()))
368    }
369}
370
371/// Load a language name from a file extension.
372///
373/// This will return the name of a language, like "python" based on the file extension and
374/// configured file associations.
375///
376/// # Arguments
377///
378/// * `ext` - The file extension string, without the leading period character. For example: "md",
379///   "py".
380/// * `config` - The grammar config. This holds file associations between extensions and language
381///    names.
382///
383/// # Errors
384///
385/// This will return an error if an associated language is not found for the given file extension.
386/// If this is the case, this function returns an [`UnsupportedExt`](LoadingError::UnsupportedExt)
387/// error.
388///
389/// # Examples
390///
391/// ```
392/// use libdiffsitter::parse::{GrammarConfig, lang_name_from_file_ext};
393///
394/// let config = GrammarConfig::default();
395/// let lang_name = lang_name_from_file_ext("py", &config);
396///
397/// assert_eq!(lang_name.unwrap(), "python");
398/// ```
399pub fn lang_name_from_file_ext<'cfg>(
400    ext: &str,
401    grammar_config: &'cfg GrammarConfig,
402) -> Result<&'cfg str, LoadingError> {
403    let language_str_cand = resolve_language_str(ext, grammar_config.file_associations.as_ref());
404    match language_str_cand {
405        Some(s) => Ok(s),
406        None => Err(LoadingError::UnsupportedExt(ext.to_string())),
407    }
408}
409
410/// A convenience function to check of a tree-sitter language has a compatible ABI version for
411/// `diffsitter`.
412///
413/// Diffsitter has a version of the tree-sitter library it's build against and that library
414/// supports a certain range of tree-sitter ABIs. Each compiled tree-sitter grammar reports its ABI
415/// version, so we can check whether the ABI versions are compatible before loading the grammar
416/// as a tree-sitter parser, which should prevent segfaults due to these sorts of mismatches.
417pub fn ts_language_abi_checked(ts_language: &Language) -> Result<(), LoadingError> {
418    let loaded_ts_version = ts_language.abi_version();
419    let is_abi_compatible =
420        (MIN_COMPATIBLE_LANGUAGE_VERSION..=LANGUAGE_VERSION).contains(&loaded_ts_version);
421    if !is_abi_compatible {
422        return Err(LoadingError::AbiOutOfRange(
423            loaded_ts_version,
424            MIN_COMPATIBLE_LANGUAGE_VERSION,
425            LANGUAGE_VERSION,
426        ));
427    }
428    Ok(())
429}
430
431/// Parse a file to an AST
432///
433/// The user may optionally supply the language to use. If the language is not supplied, it will be
434/// inferrred from the file's extension.
435#[time("info", "parse::{}")]
436pub fn parse_file(
437    p: &Path,
438    language: Option<&str>,
439    config: &GrammarConfig,
440) -> Result<Tree, LoadingError> {
441    // Either use the provided language or infer the language to use with the parser from the file
442    // extension
443    let resolved_language = match language {
444        Some(lang) => Ok(lang),
445        None => {
446            if let Some(ext) = p.extension() {
447                lang_name_from_file_ext(&ext.to_string_lossy(), config)
448            } else {
449                Err(LoadingError::NoFileExt(p.to_string_lossy().to_string()))
450            }
451        }
452    }?;
453    let mut parser = Parser::new();
454    let ts_lang = generate_language(resolved_language, config)?;
455    parser.set_language(&ts_lang)?;
456    let text = fs::read_to_string(p)?;
457    match parser.parse(&text, None) {
458        Some(ast) => {
459            debug!("Parsed AST");
460            Ok(ast)
461        }
462        None => Err(LoadingError::TSParseFailure(p.to_owned())),
463    }
464}
465
#[cfg(test)]
mod tests {
    use super::*;

    /// Every statically compiled grammar must be accepted by a tree-sitter
    /// [parser](tree_sitter::Parser).
    #[cfg(feature = "static-grammar-libs")]
    #[test]
    fn static_load_parsers() {
        // Gather every failing language up front so the assertion message lists all of them
        // instead of stopping at the first one.
        let failures: Vec<_> = IntoIterator::into_iter(&LANGUAGES)
            .filter_map(|(&name, lang)| {
                let ts_lang = unsafe { lang() };
                tree_sitter::Parser::new()
                    .set_language(&ts_lang)
                    .err()
                    .map(|e| (name, e))
            })
            .collect();

        assert!(failures.is_empty(), "{failures:#?}");
    }

    /// Every language in the list below must be loadable from its default shared library.
    #[cfg(feature = "dynamic-grammar-libs")]
    #[test]
    #[ignore] // this test is only applicable in certain packaging scenarios
    fn dynamic_load_parsers() {
        let languages = [
            "rust", "cpp", "python", "bash", "ocaml", "go", "ruby", "java", "c_sharp", "css",
            "php", "json", "tsx", "hcl",
        ];

        let failures: Vec<&str> = languages
            .iter()
            .copied()
            .filter(|name| generate_language_dynamic(name, None).is_err())
            .collect();

        assert!(failures.is_empty(), "{:#?}", failures);
    }

    /// Every statically compiled grammar must pass the ABI compatibility check.
    #[cfg(feature = "static-grammar-libs")]
    #[test]
    fn test_static_grammar_tree_sitter_abi_compatibility() -> Result<(), LoadingError> {
        // Stops at, and returns, the first incompatible grammar.
        IntoIterator::into_iter(&LANGUAGES).try_for_each(|(_, language_ctor)| {
            let language = unsafe { language_ctor() };
            ts_language_abi_checked(&language)
        })
    }
}