tree_sitter_generate/
generate.rs

1use std::{collections::BTreeMap, sync::LazyLock};
2#[cfg(feature = "load")]
3use std::{
4    env, fs,
5    io::Write,
6    path::{Path, PathBuf},
7    process::{Command, Stdio},
8};
9
10use bitflags::bitflags;
11use log::warn;
12use node_types::VariableInfo;
13use regex::{Regex, RegexBuilder};
14use rules::{Alias, Symbol};
15#[cfg(feature = "load")]
16use semver::Version;
17#[cfg(feature = "load")]
18use serde::Deserialize;
19use serde::Serialize;
20use thiserror::Error;
21
22mod build_tables;
23mod dedup;
24mod grammars;
25mod nfa;
26mod node_types;
27pub mod parse_grammar;
28mod prepare_grammar;
29#[cfg(feature = "qjs-rt")]
30mod quickjs;
31mod render;
32mod rules;
33mod tables;
34
35use build_tables::build_tables;
36pub use build_tables::ParseTableBuilderError;
37use grammars::{InlinedProductionMap, InputGrammar, LexicalGrammar, SyntaxGrammar};
38pub use node_types::{SuperTypeCycleError, VariableInfoError};
39use parse_grammar::parse_grammar;
40pub use parse_grammar::ParseGrammarError;
41use prepare_grammar::prepare_grammar;
42pub use prepare_grammar::PrepareGrammarError;
43use render::render_c_code;
44pub use render::{ABI_VERSION_MAX, ABI_VERSION_MIN};
45
/// Matches a whole-line `//` comment: optional whitespace at the start of a
/// line (multi-line mode is enabled), then `//` and the rest of that line.
///
/// `generate_parser_for_grammar` replaces every match with a newline before
/// the text is passed to `parse_grammar`.
static JSON_COMMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
    RegexBuilder::new("^\\s*//.*")
        .multi_line(true)
        .build()
        // The pattern is a constant, so a build failure here is a programming error.
        .unwrap()
});
52
/// Result of `generate_node_types_from_grammar`: the pieces returned by
/// `prepare_grammar`, the computed variable info and, when the `load` feature
/// is enabled, the serialized node types.
struct JSONOutput {
    /// Pretty-printed node types JSON (only with the `load` feature).
    #[cfg(feature = "load")]
    node_types_json: String,
    /// Syntax grammar returned by `prepare_grammar`.
    syntax_grammar: SyntaxGrammar,
    /// Lexical grammar returned by `prepare_grammar`.
    lexical_grammar: LexicalGrammar,
    /// Inlined production map returned by `prepare_grammar`.
    inlines: InlinedProductionMap,
    /// Symbol-to-alias map returned by `prepare_grammar`.
    simple_aliases: BTreeMap<Symbol, Alias>,
    /// Output of `node_types::get_variable_info` for the prepared grammars.
    variable_info: Vec<VariableInfo>,
}
62
/// Output of `generate_parser_for_grammar_with_opts`.
struct GeneratedParser {
    /// C source returned by `render_c_code`.
    c_code: String,
    /// Pretty-printed node types JSON (only with the `load` feature).
    #[cfg(feature = "load")]
    node_types_json: String,
}
68
/// ABI version that `generate_parser_for_grammar` passes to the generator.
///
/// NOTE: This constant must be kept in sync with the definition of
/// `TREE_SITTER_LANGUAGE_VERSION` in `lib/include/tree_sitter/api.h`.
const LANGUAGE_VERSION: usize = 15;

/// Contents of `templates/alloc.h`, embedded at compile time. Written to
/// `tree_sitter/alloc.h` in the output directory by `generate_parser_in_directory`.
pub const ALLOC_HEADER: &str = include_str!("templates/alloc.h");
/// Contents of `templates/array.h`, embedded at compile time. Written to
/// `tree_sitter/array.h` in the output directory by `generate_parser_in_directory`.
pub const ARRAY_HEADER: &str = include_str!("templates/array.h");
/// Contents of `parser.h.inc`, embedded at compile time. Written to
/// `tree_sitter/parser.h` in the output directory by `generate_parser_in_directory`;
/// a unit test in this file checks that it matches `lib/src/parser.h`.
pub const PARSER_HEADER: &str = include_str!("parser.h.inc");

/// Shorthand for results whose error type is [`GenerateError`].
pub type GenerateResult<T> = Result<T, GenerateError>;
78
/// Top-level error type returned by the generation entry points in this file.
///
/// Most variants wrap the error of one generation stage and display it
/// transparently.
#[derive(Debug, Error, Serialize)]
pub enum GenerateError {
    /// Checking whether the supplied grammar path exists failed; holds the
    /// text of the underlying error.
    #[error("Error with specified path -- {0}")]
    GrammarPath(String),
    /// A filesystem operation failed.
    #[error(transparent)]
    IO(IoError),
    /// The grammar file could not be loaded (only with the `load` feature).
    #[cfg(feature = "load")]
    #[error(transparent)]
    LoadGrammarFile(#[from] LoadGrammarError),
    /// `parse_grammar` rejected the grammar JSON.
    #[error(transparent)]
    ParseGrammar(#[from] ParseGrammarError),
    /// `prepare_grammar` failed.
    #[error(transparent)]
    Prepare(#[from] PrepareGrammarError),
    /// Computing variable info for the grammar failed.
    #[error(transparent)]
    VariableInfo(#[from] VariableInfoError),
    /// `build_tables` failed.
    #[error(transparent)]
    BuildTables(#[from] ParseTableBuilderError),
    /// Reading the version from `tree-sitter.json` failed (only with the
    /// `load` feature).
    #[cfg(feature = "load")]
    #[error(transparent)]
    ParseVersion(#[from] ParseVersionError),
    /// A supertype cycle error reported by the `node_types` module.
    #[error(transparent)]
    SuperTypeCycle(#[from] SuperTypeCycleError),
}
102
/// Serializable description of an I/O failure.
///
/// Displayed as the error text, followed by the path in parentheses when one
/// is recorded.
#[derive(Debug, Error, Serialize)]
pub struct IoError {
    /// Text of the underlying `std::io::Error`.
    pub error: String,
    /// Path involved in the failed operation, if known (lossily converted to
    /// a string).
    pub path: Option<String>,
}
108
109impl IoError {
110    fn new(error: &std::io::Error, path: Option<&Path>) -> Self {
111        Self {
112            error: error.to_string(),
113            path: path.map(|p| p.to_string_lossy().to_string()),
114        }
115    }
116}
117
118impl std::fmt::Display for IoError {
119    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
120        write!(f, "{}", self.error)?;
121        if let Some(ref path) = self.path {
122            write!(f, " ({path})")?;
123        }
124        Ok(())
125    }
126}
127
/// Shorthand for results whose error type is [`LoadGrammarError`].
#[cfg(feature = "load")]
pub type LoadGrammarFileResult<T> = Result<T, LoadGrammarError>;

/// Errors returned by `load_grammar_file`.
#[cfg(feature = "load")]
#[derive(Debug, Error, Serialize)]
pub enum LoadGrammarError {
    /// The supplied grammar path is a directory rather than a file.
    #[error("Path to a grammar file with `.js` or `.json` extension is required")]
    InvalidPath,
    /// Evaluating a `.js` grammar failed.
    #[error("Failed to load grammar.js -- {0}")]
    LoadJSGrammarFile(#[from] JSError),
    /// Reading a `.json` grammar from disk failed.
    #[error("Failed to load grammar.json -- {0}")]
    IO(IoError),
    /// The grammar path's extension is neither `js` nor `json`; holds the
    /// offending path.
    #[error("Unknown grammar file extension: {0:?}")]
    FileExtension(PathBuf),
}
143
/// Errors returned by `read_grammar_version` while reading `tree-sitter.json`.
#[cfg(feature = "load")]
#[derive(Debug, Error, Serialize)]
pub enum ParseVersionError {
    /// The `metadata.version` string is not valid semver; holds the full message.
    #[error("{0}")]
    Version(String),
    /// The file contents could not be deserialized; holds the full message.
    #[error("{0}")]
    JSON(String),
    /// The file could not be read.
    #[error(transparent)]
    IO(IoError),
}
154
/// Shorthand for results whose error type is [`JSError`].
#[cfg(feature = "load")]
pub type JSResult<T> = Result<T, JSError>;

/// Errors raised while evaluating a `grammar.js` file with a JavaScript runtime.
#[cfg(feature = "load")]
#[derive(Debug, Error, Serialize)]
pub enum JSError {
    /// The runtime process could not be spawned.
    #[error("Failed to run `{runtime}` -- {error}")]
    JSRuntimeSpawn { runtime: String, error: String },
    /// The runtime's standard output was not valid UTF-8.
    #[error("Got invalid UTF8 from `{runtime}` -- {error}")]
    JSRuntimeUtf8 { runtime: String, error: String },
    /// The runtime exited unsuccessfully. `code` is `-1` when the process
    /// reported no exit code.
    #[error("`{runtime}` process exited with status {code}")]
    JSRuntimeExit { runtime: String, code: i32 },
    /// The runtime's standard input handle was unavailable.
    #[error("Failed to open stdin for `{runtime}`")]
    JSRuntimeStdin { runtime: String },
    /// Writing `item` to the runtime's standard input failed.
    #[error("Failed to write {item} to `{runtime}`'s stdin -- {error}")]
    JSRuntimeWrite {
        runtime: String,
        item: String,
        error: String,
    },
    /// Collecting the runtime's output failed.
    #[error("Failed to read output from `{runtime}` -- {error}")]
    JSRuntimeRead { runtime: String, error: String },
    /// A filesystem or standard-stream operation failed.
    #[error(transparent)]
    IO(IoError),
    /// A relative path could not be computed (only with the `qjs-rt` feature).
    #[cfg(feature = "qjs-rt")]
    #[error("Failed to get relative path")]
    RelativePath,
    /// This package's own version string is not valid semver.
    #[error("Could not parse this package's version as semver -- {0}")]
    Semver(String),
    /// A `serde_json` operation on the grammar JSON failed.
    ///
    /// The variant name keeps its historical misspelling because it is part
    /// of the public API; only the user-facing message is spelled correctly.
    #[error("Failed to serialize grammar JSON -- {0}")]
    Serialzation(String),
    /// The embedded QuickJS runtime reported an error (only with the `qjs-rt`
    /// feature).
    #[cfg(feature = "qjs-rt")]
    #[error("QuickJS error: {0}")]
    QuickJS(String),
}
190
#[cfg(feature = "load")]
impl From<serde_json::Error> for JSError {
    /// Stores the display text of a `serde_json` error in the
    /// `Serialzation` variant.
    fn from(json_error: serde_json::Error) -> Self {
        let message = json_error.to_string();
        Self::Serialzation(message)
    }
}
197
#[cfg(feature = "load")]
impl From<semver::Error> for JSError {
    /// Stores the display text of a `semver` error in the `Semver` variant.
    fn from(semver_error: semver::Error) -> Self {
        let message = semver_error.to_string();
        Self::Semver(message)
    }
}
204
#[cfg(feature = "qjs-rt")]
impl From<rquickjs::Error> for JSError {
    /// Stores the display text of a QuickJS error in the `QuickJS` variant.
    fn from(quickjs_error: rquickjs::Error) -> Self {
        let message = quickjs_error.to_string();
        Self::QuickJS(message)
    }
}
211
bitflags! {
    /// Bit set of optional optimizations; the generation functions in this
    /// file forward it to `build_tables`.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct OptLevel: u32 {
        /// NOTE(review): presumably enables merging of compatible parse
        /// states inside `build_tables` — confirm against that module.
        const MergeStates = 1 << 0;
    }
}
218
219impl Default for OptLevel {
220    fn default() -> Self {
221        Self::MergeStates
222    }
223}
224
/// Loads a grammar, generates its artifacts and writes them to disk.
///
/// The grammar file is located as follows:
/// * `grammar_path` is `None`: `<repo_path>/grammar.js`.
/// * `grammar_path` exists: that path is used as the grammar file.
/// * `grammar_path` does not exist: it is created as a directory, becomes the
///   repository root, and the grammar is expected at `<grammar_path>/grammar.js`.
///
/// Output goes to `out_path`, or to `<repo_path>/src` when `out_path` is
/// `None`. The directory is created if needed. Files written:
/// * `grammar.json`, unless the grammar file itself is named `grammar.json`;
/// * `node-types.json`;
/// * when `generate_parser` is `true`: `parser.c` and
///   `tree_sitter/{alloc,array,parser}.h`.
///
/// If no `tree-sitter.json` is found and `abi_version` is greater than
/// `ABI_VERSION_MIN`, a warning is logged and `ABI_VERSION_MIN` is used.
///
/// `report_symbol_name` and `optimizations` are forwarded to the table
/// builder; `js_runtime` is forwarded to `load_grammar_file`.
///
/// # Errors
///
/// Returns a [`GenerateError`] when the grammar cannot be located, loaded,
/// parsed or prepared, when table construction fails, when the version in
/// `tree-sitter.json` is invalid, or when any file cannot be written.
#[cfg(feature = "load")]
#[allow(clippy::too_many_arguments)]
pub fn generate_parser_in_directory<T, U, V>(
    repo_path: T,
    out_path: Option<U>,
    grammar_path: Option<V>,
    mut abi_version: usize,
    report_symbol_name: Option<&str>,
    js_runtime: Option<&str>,
    generate_parser: bool,
    optimizations: OptLevel,
) -> GenerateResult<()>
where
    T: Into<PathBuf>,
    U: Into<PathBuf>,
    V: Into<PathBuf>,
{
    let mut repo_path: PathBuf = repo_path.into();

    // Populate a new empty grammar directory.
    let grammar_path = if let Some(path) = grammar_path {
        let path_buf: PathBuf = path.into();
        if !path_buf
            .try_exists()
            .map_err(|e| GenerateError::GrammarPath(e.to_string()))?
        {
            fs::create_dir_all(&path_buf)
                .map_err(|e| GenerateError::IO(IoError::new(&e, Some(path_buf.as_path()))))?;
            repo_path = path_buf;
            repo_path.join("grammar.js")
        } else {
            path_buf
        }
    } else {
        repo_path.join("grammar.js")
    };

    // Read the grammar file.
    let grammar_json = load_grammar_file(&grammar_path, js_runtime)?;

    let src_path = out_path.map_or_else(|| repo_path.join("src"), Into::into);
    let header_path = src_path.join("tree_sitter");

    // Ensure that the output directory exists
    fs::create_dir_all(&src_path)
        .map_err(|e| GenerateError::IO(IoError::new(&e, Some(src_path.as_path()))))?;

    // Persist the grammar as JSON unless the input already is `grammar.json`.
    // Going through `write_file` makes a failure report the file that could
    // not be written rather than its parent directory.
    let input_is_grammar_json = grammar_path
        .file_name()
        .is_some_and(|name| name == "grammar.json");
    if !input_is_grammar_json {
        write_file(&src_path.join("grammar.json"), &grammar_json)?;
    }

    let input_grammar = parse_grammar(&grammar_json)?;

    // If our job is only to generate `grammar.json` and `node-types.json`,
    // and not `parser.c`, stop here.
    if !generate_parser {
        let node_types_json = generate_node_types_from_grammar(&input_grammar)?.node_types_json;
        write_file(&src_path.join("node-types.json"), node_types_json)?;
        return Ok(());
    }

    let semantic_version = read_grammar_version(&repo_path)?;

    if semantic_version.is_none() && abi_version > ABI_VERSION_MIN {
        warn!(
            concat!(
                "No `tree-sitter.json` file found in your grammar, ",
                "this file is required to generate with ABI {}. ",
                "Using ABI version {} instead.\n",
                "This file can be set up with `tree-sitter init`. ",
                "For more information, see https://tree-sitter.github.io/tree-sitter/cli/init."
            ),
            abi_version, ABI_VERSION_MIN
        );
        abi_version = ABI_VERSION_MIN;
    }

    // Generate the parser and related files.
    let GeneratedParser {
        c_code,
        node_types_json,
    } = generate_parser_for_grammar_with_opts(
        &input_grammar,
        abi_version,
        // NOTE(review): `as u8` silently truncates any version component
        // above 255; kept as-is to preserve existing behavior.
        semantic_version.map(|v| (v.major as u8, v.minor as u8, v.patch as u8)),
        report_symbol_name,
        optimizations,
    )?;

    write_file(&src_path.join("parser.c"), c_code)?;
    write_file(&src_path.join("node-types.json"), node_types_json)?;
    fs::create_dir_all(&header_path)
        .map_err(|e| GenerateError::IO(IoError::new(&e, Some(header_path.as_path()))))?;
    write_file(&header_path.join("alloc.h"), ALLOC_HEADER)?;
    write_file(&header_path.join("array.h"), ARRAY_HEADER)?;
    write_file(&header_path.join("parser.h"), PARSER_HEADER)?;

    Ok(())
}
324
325pub fn generate_parser_for_grammar(
326    grammar_json: &str,
327    semantic_version: Option<(u8, u8, u8)>,
328) -> GenerateResult<(String, String)> {
329    let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n");
330    let input_grammar = parse_grammar(&grammar_json)?;
331    let parser = generate_parser_for_grammar_with_opts(
332        &input_grammar,
333        LANGUAGE_VERSION,
334        semantic_version,
335        None,
336        OptLevel::empty(),
337    )?;
338    Ok((input_grammar.name, parser.c_code))
339}
340
341fn generate_node_types_from_grammar(input_grammar: &InputGrammar) -> GenerateResult<JSONOutput> {
342    let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
343        prepare_grammar(input_grammar)?;
344    let variable_info =
345        node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
346
347    #[cfg(feature = "load")]
348    let node_types_json = node_types::generate_node_types_json(
349        &syntax_grammar,
350        &lexical_grammar,
351        &simple_aliases,
352        &variable_info,
353    )?;
354    Ok(JSONOutput {
355        #[cfg(feature = "load")]
356        node_types_json: serde_json::to_string_pretty(&node_types_json).unwrap(),
357        syntax_grammar,
358        lexical_grammar,
359        inlines,
360        simple_aliases,
361        variable_info,
362    })
363}
364
365fn generate_parser_for_grammar_with_opts(
366    input_grammar: &InputGrammar,
367    abi_version: usize,
368    semantic_version: Option<(u8, u8, u8)>,
369    report_symbol_name: Option<&str>,
370    optimizations: OptLevel,
371) -> GenerateResult<GeneratedParser> {
372    let JSONOutput {
373        syntax_grammar,
374        lexical_grammar,
375        inlines,
376        simple_aliases,
377        variable_info,
378        #[cfg(feature = "load")]
379        node_types_json,
380    } = generate_node_types_from_grammar(input_grammar)?;
381    let supertype_symbol_map =
382        node_types::get_supertype_symbol_map(&syntax_grammar, &simple_aliases, &variable_info);
383    let tables = build_tables(
384        &syntax_grammar,
385        &lexical_grammar,
386        &simple_aliases,
387        &variable_info,
388        &inlines,
389        report_symbol_name,
390        optimizations,
391    )?;
392    let c_code = render_c_code(
393        &input_grammar.name,
394        tables,
395        syntax_grammar,
396        lexical_grammar,
397        simple_aliases,
398        abi_version,
399        semantic_version,
400        supertype_symbol_map,
401    );
402    Ok(GeneratedParser {
403        c_code,
404        #[cfg(feature = "load")]
405        node_types_json,
406    })
407}
408
/// Looks up the grammar's version in the nearest `tree-sitter.json`.
///
/// The search starts at `repo_path` and continues through each of its
/// ancestors in turn. The first existing `tree-sitter.json` decides the
/// result:
/// * no file found anywhere: `Ok(None)`, to maintain backwards compatibility;
/// * file found: its `metadata.version` parsed as semver;
/// * file unreadable, not deserializable, or version not valid semver: an error.
#[cfg(feature = "load")]
fn read_grammar_version(repo_path: &Path) -> Result<Option<Version>, ParseVersionError> {
    #[derive(Deserialize)]
    struct TreeSitterJson {
        metadata: Metadata,
    }

    #[derive(Deserialize)]
    struct Metadata {
        version: String,
    }

    const FILENAME: &str = "tree-sitter.json";

    for directory in repo_path.ancestors() {
        let candidate = directory.join(FILENAME);
        if !candidate.exists() {
            continue;
        }

        let contents = fs::read_to_string(candidate.as_path())
            .map_err(|e| ParseVersionError::IO(IoError::new(&e, Some(candidate.as_path()))))?;
        let config = serde_json::from_str::<TreeSitterJson>(&contents).map_err(|e| {
            ParseVersionError::JSON(format!("Failed to parse `{}` -- {e}", candidate.display()))
        })?;

        return match Version::parse(&config.metadata.version) {
            Ok(version) => Ok(Some(version)),
            Err(e) => Err(ParseVersionError::Version(format!(
                "Failed to parse `{}` version as semver -- {e}",
                candidate.display()
            ))),
        };
    }

    Ok(None)
}
457
/// Loads a grammar file and returns its JSON text.
///
/// * A directory is rejected with `LoadGrammarError::InvalidPath`.
/// * A `.js` file is evaluated through `load_js_grammar_file` using
///   `js_runtime`.
/// * A `.json` file is read from disk as-is.
/// * Any other extension (or none) yields `LoadGrammarError::FileExtension`.
#[cfg(feature = "load")]
pub fn load_grammar_file(
    grammar_path: &Path,
    js_runtime: Option<&str>,
) -> LoadGrammarFileResult<String> {
    if grammar_path.is_dir() {
        return Err(LoadGrammarError::InvalidPath);
    }

    let extension = grammar_path.extension().and_then(|ext| ext.to_str());
    match extension {
        Some("js") => load_js_grammar_file(grammar_path, js_runtime)
            .map_err(LoadGrammarError::LoadJSGrammarFile),
        Some("json") => fs::read_to_string(grammar_path)
            .map_err(|e| LoadGrammarError::IO(IoError::new(&e, Some(grammar_path)))),
        _ => Err(LoadGrammarError::FileExtension(grammar_path.to_owned())),
    }
}
473
/// Evaluates a `grammar.js` file with a JavaScript runtime and returns the
/// resulting grammar as pretty-printed JSON.
///
/// `js_runtime` defaults to `node`. With the `qjs-rt` feature, the value
/// `"native"` selects the embedded QuickJS runtime instead of a subprocess.
/// `node`, `bun` and `deno` get the arguments needed to read a script from
/// standard input; any other runtime is started without arguments.
///
/// The canonicalized grammar path is passed to the child process in the
/// `TREE_SITTER_GRAMMAR_PATH` environment variable. The CLI version globals
/// and the bundled `dsl.js` are written to the child's standard input.
///
/// On a zero exit status, everything the runtime printed before the last
/// newline is echoed to this process's standard output, and the text after
/// it is parsed as the grammar JSON.
///
/// # Errors
///
/// Returns a [`JSError`] when the path cannot be canonicalized, when the
/// runtime cannot be spawned, written to or read from, when it exits
/// unsuccessfully, or when its output is not valid UTF-8 or valid JSON.
#[cfg(feature = "load")]
fn load_js_grammar_file(grammar_path: &Path, js_runtime: Option<&str>) -> JSResult<String> {
    let grammar_path = dunce::canonicalize(grammar_path)
        .map_err(|e| JSError::IO(IoError::new(&e, Some(grammar_path))))?;

    #[cfg(feature = "qjs-rt")]
    if js_runtime == Some("native") {
        return quickjs::execute_native_runtime(&grammar_path);
    }

    // The "file:///" prefix is incompatible with the quickjs runtime, but is required
    // for node and bun
    #[cfg(windows)]
    let grammar_path = PathBuf::from(format!("file:///{}", grammar_path.display()));

    let js_runtime = js_runtime.unwrap_or("node");

    let mut js_command = Command::new(js_runtime);
    match js_runtime {
        "node" => {
            js_command.args(["--input-type=module", "-"]);
        }
        "bun" => {
            js_command.arg("-");
        }
        "deno" => {
            js_command.args(["run", "--allow-all", "-"]);
        }
        _ => {}
    }

    let mut js_process = js_command
        .env("TREE_SITTER_GRAMMAR_PATH", grammar_path)
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()
        .map_err(|e| JSError::JSRuntimeSpawn {
            runtime: js_runtime.to_string(),
            error: e.to_string(),
        })?;

    let mut js_stdin = js_process
        .stdin
        .take()
        .ok_or_else(|| JSError::JSRuntimeStdin {
            runtime: js_runtime.to_string(),
        })?;

    let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))?;
    write!(
        js_stdin,
        "globalThis.TREE_SITTER_CLI_VERSION_MAJOR = {};
         globalThis.TREE_SITTER_CLI_VERSION_MINOR = {};
         globalThis.TREE_SITTER_CLI_VERSION_PATCH = {};",
        cli_version.major, cli_version.minor, cli_version.patch,
    )
    .map_err(|e| JSError::JSRuntimeWrite {
        runtime: js_runtime.to_string(),
        item: "tree-sitter version".to_string(),
        error: e.to_string(),
    })?;
    // `write_all` is required here: a plain `write` may accept only part of
    // the buffer, which would hand the runtime a truncated DSL script.
    js_stdin
        .write_all(include_bytes!("./dsl.js"))
        .map_err(|e| JSError::JSRuntimeWrite {
            runtime: js_runtime.to_string(),
            item: "grammar dsl".to_string(),
            error: e.to_string(),
        })?;
    // Close stdin so the runtime sees end-of-input and starts executing.
    drop(js_stdin);

    let output = js_process
        .wait_with_output()
        .map_err(|e| JSError::JSRuntimeRead {
            runtime: js_runtime.to_string(),
            error: e.to_string(),
        })?;
    match output.status.code() {
        Some(0) => {
            let stdout = String::from_utf8(output.stdout).map_err(|e| JSError::JSRuntimeUtf8 {
                runtime: js_runtime.to_string(),
                error: e.to_string(),
            })?;

            // The grammar JSON is the text after the last newline. Anything
            // before it is output from the grammar script itself and is
            // forwarded to our own stdout.
            let grammar_json = match stdout.rsplit_once('\n') {
                Some((runtime_output, last_line)) => {
                    let mut out = std::io::stdout().lock();
                    out.write_all(runtime_output.as_bytes())
                        .map_err(|e| JSError::IO(IoError::new(&e, None)))?;
                    out.write_all(b"\n")
                        .map_err(|e| JSError::IO(IoError::new(&e, None)))?;
                    out.flush()
                        .map_err(|e| JSError::IO(IoError::new(&e, None)))?;
                    last_line
                }
                None => stdout.as_str(),
            };

            let grammar: serde_json::Value = serde_json::from_str(grammar_json)?;
            Ok(serde_json::to_string_pretty(&grammar)?)
        }
        Some(code) => Err(JSError::JSRuntimeExit {
            runtime: js_runtime.to_string(),
            code,
        }),
        // No exit code is available; report it as -1.
        None => Err(JSError::JSRuntimeExit {
            runtime: js_runtime.to_string(),
            code: -1,
        }),
    }
}
590
/// Writes `body` to `path`, replacing any existing contents.
///
/// A failure is returned as `GenerateError::IO` carrying `path`.
#[cfg(feature = "load")]
pub fn write_file(path: &Path, body: impl AsRef<[u8]>) -> GenerateResult<()> {
    let with_path = |io_error: std::io::Error| {
        GenerateError::IO(IoError::new(&io_error, Some(path)))
    };
    fs::write(path, body).map_err(with_path)
}
595
#[cfg(test)]
mod tests {
    use super::{LANGUAGE_VERSION, PARSER_HEADER};

    /// `LANGUAGE_VERSION` must equal `TREE_SITTER_LANGUAGE_VERSION` in `api.h`.
    #[test]
    fn test_language_versions_are_in_sync() {
        let api_h = include_str!("../../../lib/include/tree_sitter/api.h");
        // Use the first `#define` line whose value parses as an integer.
        let api_language_version = api_h
            .lines()
            .filter_map(|line| {
                line.trim()
                    .strip_prefix("#define TREE_SITTER_LANGUAGE_VERSION ")
            })
            .find_map(|value| value.parse::<usize>().ok())
            .expect("Failed to find TREE_SITTER_LANGUAGE_VERSION definition in api.h");
        assert_eq!(LANGUAGE_VERSION, api_language_version);
    }

    /// The embedded `parser.h.inc` must be identical to `lib/src/parser.h`.
    #[test]
    fn test_parser_header_in_sync() {
        let parser_h = include_str!("../../../lib/src/parser.h");
        let headers_match = parser_h == PARSER_HEADER;
        assert!(
            headers_match,
            "parser.h.inc is out of sync with lib/src/parser.h. Run: cp lib/src/parser.h crates/generate/src/parser.h.inc"
        );
    }
}