Skip to main content

tree_sitter_generate/
generate.rs

1use std::{collections::BTreeMap, sync::LazyLock};
2#[cfg(feature = "load")]
3use std::{
4    env, fs,
5    io::Write,
6    path::{Path, PathBuf},
7    process::{Command, Stdio},
8};
9
10use bitflags::bitflags;
11use log::warn;
12use node_types::VariableInfo;
13use regex::{Regex, RegexBuilder};
14use rules::{Alias, Symbol};
15#[cfg(feature = "load")]
16use semver::Version;
17#[cfg(feature = "load")]
18use serde::Deserialize;
19use serde::Serialize;
20use thiserror::Error;
21
22mod build_tables;
23mod dedup;
24mod grammars;
25mod nfa;
26mod node_types;
27pub mod parse_grammar;
28mod prepare_grammar;
29#[cfg(feature = "qjs-rt")]
30mod quickjs;
31mod render;
32mod rules;
33mod tables;
34
35use build_tables::build_tables;
36pub use build_tables::ParseTableBuilderError;
37use grammars::{InlinedProductionMap, InputGrammar, LexicalGrammar, SyntaxGrammar};
38pub use node_types::{SuperTypeCycleError, VariableInfoError};
39use parse_grammar::parse_grammar;
40pub use parse_grammar::ParseGrammarError;
41use prepare_grammar::prepare_grammar;
42pub use prepare_grammar::PrepareGrammarError;
43use render::render_c_code;
44pub use render::{RenderError, ABI_VERSION_MAX, ABI_VERSION_MIN};
45
46static JSON_COMMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
47    RegexBuilder::new("^\\s*//.*")
48        .multi_line(true)
49        .build()
50        .unwrap()
51});
52
/// Intermediate results produced by `generate_node_types_from_grammar`.
struct JSONOutput {
    /// Pretty-printed node types JSON; only present when the `load` feature is enabled.
    #[cfg(feature = "load")]
    node_types_json: String,
    /// Syntax grammar returned by `prepare_grammar`.
    syntax_grammar: SyntaxGrammar,
    /// Lexical grammar returned by `prepare_grammar`.
    lexical_grammar: LexicalGrammar,
    /// Inlined productions returned by `prepare_grammar`.
    inlines: InlinedProductionMap,
    /// Simple aliases returned by `prepare_grammar`.
    simple_aliases: BTreeMap<Symbol, Alias>,
    /// Variable info computed by `node_types::get_variable_info`.
    variable_info: Vec<VariableInfo>,
}
62
/// Result of `generate_parser_for_grammar_with_opts`.
struct GeneratedParser {
    /// C source produced by `render_c_code`.
    c_code: String,
    /// Pretty-printed node types JSON; only present when the `load` feature is enabled.
    #[cfg(feature = "load")]
    node_types_json: String,
}
68
// ABI version that `generate_parser_for_grammar` passes to the parser generator.
//
// NOTE: This constant must be kept in sync with the definition of
// `TREE_SITTER_LANGUAGE_VERSION` in `lib/include/tree_sitter/api.h`.
// The `test_language_versions_are_in_sync` test in this file checks this.
const LANGUAGE_VERSION: usize = 15;
72
/// Contents of `templates/alloc.h`, embedded at compile time and written out as
/// `tree_sitter/alloc.h` by `generate_parser_in_directory`.
pub const ALLOC_HEADER: &str = include_str!("templates/alloc.h");
/// Contents of `templates/array.h`, embedded at compile time and written out as
/// `tree_sitter/array.h` by `generate_parser_in_directory`.
pub const ARRAY_HEADER: &str = include_str!("templates/array.h");
/// Contents of `parser.h.inc`, embedded at compile time and written out as
/// `tree_sitter/parser.h` by `generate_parser_in_directory`. A test in this file asserts it
/// is identical to `lib/src/parser.h`.
pub const PARSER_HEADER: &str = include_str!("parser.h.inc");
76
/// Result type whose error is [`GenerateError`].
pub type GenerateResult<T> = Result<T, GenerateError>;
78
/// Top-level error returned by the generation entry points in this module.
///
/// Most variants wrap the error type of one generation stage and are displayed
/// transparently, i.e. with the wrapped error's own message.
#[derive(Debug, Error, Serialize)]
pub enum GenerateError {
    /// Checking whether the supplied grammar path exists failed; carries the I/O error text.
    #[error("Error with specified path -- {0}")]
    GrammarPath(String),
    /// A filesystem operation failed; see [`IoError`] for the message and optional path.
    #[error(transparent)]
    IO(IoError),
    /// The grammar file could not be loaded.
    #[cfg(feature = "load")]
    #[error(transparent)]
    LoadGrammarFile(#[from] LoadGrammarError),
    /// Error returned by `parse_grammar`.
    #[error(transparent)]
    ParseGrammar(#[from] ParseGrammarError),
    /// Error returned by `prepare_grammar`.
    #[error(transparent)]
    Prepare(#[from] PrepareGrammarError),
    /// Error while computing variable info / node types.
    #[error(transparent)]
    VariableInfo(#[from] VariableInfoError),
    /// Error returned by `build_tables`.
    #[error(transparent)]
    BuildTables(#[from] ParseTableBuilderError),
    /// Error returned by `render_c_code`.
    #[error(transparent)]
    Render(#[from] RenderError),
    /// The grammar version in `tree-sitter.json` could not be read or parsed.
    #[cfg(feature = "load")]
    #[error(transparent)]
    ParseVersion(#[from] ParseVersionError),
    /// Wraps a [`SuperTypeCycleError`] from the `node_types` module.
    #[error(transparent)]
    SuperTypeCycle(#[from] SuperTypeCycleError),
}
104
/// Serializable snapshot of a `std::io::Error`, optionally tagged with the path involved.
#[derive(Debug, Error, Serialize)]
pub struct IoError {
    /// Display text of the underlying I/O error.
    pub error: String,
    /// Path the failed operation was acting on, if known (lossily converted to UTF-8).
    pub path: Option<String>,
}
110
111impl IoError {
112    fn new(error: &std::io::Error, path: Option<&Path>) -> Self {
113        Self {
114            error: error.to_string(),
115            path: path.map(|p| p.to_string_lossy().to_string()),
116        }
117    }
118}
119
120impl std::fmt::Display for IoError {
121    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
122        write!(f, "{}", self.error)?;
123        if let Some(ref path) = self.path {
124            write!(f, " ({path})")?;
125        }
126        Ok(())
127    }
128}
129
/// Result type whose error is [`LoadGrammarError`].
#[cfg(feature = "load")]
pub type LoadGrammarFileResult<T> = Result<T, LoadGrammarError>;
132
/// Errors produced by `load_grammar_file`.
#[cfg(feature = "load")]
#[derive(Debug, Error, Serialize)]
pub enum LoadGrammarError {
    /// The supplied path is a directory rather than a grammar file.
    #[error("Path to a grammar file with `.js` or `.json` extension is required")]
    InvalidPath,
    /// Evaluating a `.js` grammar failed.
    #[error("Failed to load grammar.js -- {0}")]
    LoadJSGrammarFile(#[from] JSError),
    /// Reading a `.json` grammar from disk failed.
    #[error("Failed to load grammar.json -- {0}")]
    IO(IoError),
    /// The path's extension is neither `js` nor `json`; carries the offending path.
    #[error("Unknown grammar file extension: {0:?}")]
    FileExtension(PathBuf),
}
145
/// Errors produced while reading the grammar version from `tree-sitter.json`.
#[cfg(feature = "load")]
#[derive(Debug, Error, Serialize)]
pub enum ParseVersionError {
    /// The `metadata.version` string is not valid semver; carries the full message.
    #[error("{0}")]
    Version(String),
    /// The file could not be deserialized as JSON; carries the full message.
    #[error("{0}")]
    JSON(String),
    /// The file could not be read.
    #[error(transparent)]
    IO(IoError),
}
156
/// Result type whose error is [`JSError`].
#[cfg(feature = "load")]
pub type JSResult<T> = Result<T, JSError>;
159
/// Errors produced while evaluating a JavaScript grammar file.
///
/// `runtime` fields hold the name of the JavaScript runtime executable that was invoked.
#[cfg(feature = "load")]
#[derive(Debug, Error, Serialize)]
pub enum JSError {
    /// The runtime process could not be spawned.
    #[error("Failed to run `{runtime}` -- {error}")]
    JSRuntimeSpawn { runtime: String, error: String },
    /// The runtime's stdout was not valid UTF-8.
    #[error("Got invalid UTF8 from `{runtime}` -- {error}")]
    JSRuntimeUtf8 { runtime: String, error: String },
    /// The runtime exited unsuccessfully; `code` is `-1` when no exit code was available.
    #[error("`{runtime}` process exited with status {code}")]
    JSRuntimeExit { runtime: String, code: i32 },
    /// The runtime's stdin pipe was not available.
    #[error("Failed to open stdin for `{runtime}`")]
    JSRuntimeStdin { runtime: String },
    /// Writing `item` to the runtime's stdin failed.
    #[error("Failed to write {item} to `{runtime}`'s stdin -- {error}")]
    JSRuntimeWrite {
        runtime: String,
        item: String,
        error: String,
    },
    /// Waiting for the runtime and collecting its output failed.
    #[error("Failed to read output from `{runtime}` -- {error}")]
    JSRuntimeRead { runtime: String, error: String },
    /// A filesystem or stdout operation failed.
    #[error(transparent)]
    IO(IoError),
    #[cfg(feature = "qjs-rt")]
    #[error("Failed to get relative path")]
    RelativePath,
    /// This package's own version string is not valid semver.
    #[error("Could not parse this package's version as semver -- {0}")]
    Semver(String),
    /// Any `serde_json` error (see the `From<serde_json::Error>` impl).
    ///
    /// The variant name keeps its historical spelling for API compatibility; only the
    /// user-facing message is spelled correctly.
    #[error("Failed to serialize grammar JSON -- {0}")]
    Serialzation(String),
    /// An error reported by the embedded QuickJS runtime.
    #[cfg(feature = "qjs-rt")]
    #[error("QuickJS error: {0}")]
    QuickJS(String),
}
192
/// Stores the display text of a `serde_json` error in [`JSError::Serialzation`].
#[cfg(feature = "load")]
impl From<serde_json::Error> for JSError {
    fn from(err: serde_json::Error) -> Self {
        let message = err.to_string();
        Self::Serialzation(message)
    }
}
199
/// Stores the display text of a `semver` error in [`JSError::Semver`].
#[cfg(feature = "load")]
impl From<semver::Error> for JSError {
    fn from(err: semver::Error) -> Self {
        let message = err.to_string();
        Self::Semver(message)
    }
}
206
/// Stores the display text of a QuickJS error in [`JSError::QuickJS`].
#[cfg(feature = "qjs-rt")]
impl From<rquickjs::Error> for JSError {
    fn from(err: rquickjs::Error) -> Self {
        let message = err.to_string();
        Self::QuickJS(message)
    }
}
213
bitflags! {
    /// Set of optional optimizations; the value is forwarded to `build_tables`.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct OptLevel: u32 {
        /// NOTE(review): presumably enables merging of parse states while building
        /// tables; the effect is implemented in `build_tables` and is not visible here.
        const MergeStates = 1 << 0;
    }
}
220
/// By default the `MergeStates` flag is enabled.
impl Default for OptLevel {
    fn default() -> Self {
        Self::MergeStates
    }
}
226
/// Loads a grammar, generates its parser, and writes the resulting files to disk.
///
/// Files written into the output directory: `grammar.json` (unless the grammar was loaded
/// from a file of that name), `node-types.json`, and, when `generate_parser` is true,
/// `parser.c` plus `tree_sitter/{alloc,array,parser}.h`.
///
/// # Parameters
/// - `repo_path`: grammar repository root. Used for the default grammar location
///   (`<repo_path>/grammar.js`), the default output directory (`<repo_path>/src`), and as
///   the starting point of the `tree-sitter.json` lookup.
/// - `out_path`: output directory; defaults to `<repo_path>/src`.
/// - `grammar_path`: path of the grammar file. If the path does not exist it is created as
///   a directory, that directory replaces `repo_path`, and the grammar is expected at
///   `grammar.js` inside it.
/// - `abi_version`: ABI version to generate. Falls back to `ABI_VERSION_MIN` (with a
///   warning) when no `tree-sitter.json` is found and a higher version was requested.
/// - `report_symbol_name`, `optimizations`: forwarded to the parse table builder.
/// - `js_runtime`: forwarded to `load_grammar_file`.
/// - `generate_parser`: when false, stop after writing `grammar.json` and `node-types.json`.
///
/// # Errors
/// Returns a [`GenerateError`] if any path check, file read/write, grammar loading or
/// parsing step, version lookup, or generation stage fails.
#[cfg(feature = "load")]
#[allow(clippy::too_many_arguments)]
pub fn generate_parser_in_directory<T, U, V>(
    repo_path: T,
    out_path: Option<U>,
    grammar_path: Option<V>,
    mut abi_version: usize,
    report_symbol_name: Option<&str>,
    js_runtime: Option<&str>,
    generate_parser: bool,
    optimizations: OptLevel,
) -> GenerateResult<()>
where
    T: Into<PathBuf>,
    U: Into<PathBuf>,
    V: Into<PathBuf>,
{
    let mut repo_path: PathBuf = repo_path.into();

    // Populate a new empty grammar directory.
    let grammar_path = if let Some(path) = grammar_path {
        let path_buf: PathBuf = path.into();
        if !path_buf
            .try_exists()
            .map_err(|e| GenerateError::GrammarPath(e.to_string()))?
        {
            fs::create_dir_all(&path_buf)
                .map_err(|e| GenerateError::IO(IoError::new(&e, Some(path_buf.as_path()))))?;
            repo_path = path_buf;
            repo_path.join("grammar.js")
        } else {
            path_buf
        }
    } else {
        repo_path.join("grammar.js")
    };

    // Read the grammar file.
    let grammar_json = load_grammar_file(&grammar_path, js_runtime)?;

    let src_path = out_path.map_or_else(|| repo_path.join("src"), |p| p.into());
    let header_path = src_path.join("tree_sitter");

    // Ensure that the output directory exists
    fs::create_dir_all(&src_path)
        .map_err(|e| GenerateError::IO(IoError::new(&e, Some(src_path.as_path()))))?;

    // Store a JSON copy of the grammar next to the generated sources, unless the grammar
    // was itself loaded from a file named `grammar.json`. Going through `write_file` makes
    // a failure report the file that could not be written rather than its directory.
    if grammar_path.file_name() != Some(std::ffi::OsStr::new("grammar.json")) {
        write_file(&src_path.join("grammar.json"), &grammar_json)?;
    }

    let input_grammar = parse_grammar(&grammar_json)?;

    // If our job is only to generate `grammar.json` (and `node-types.json`) and not
    // `parser.c`, stop here.
    if !generate_parser {
        let node_types_json = generate_node_types_from_grammar(&input_grammar)?.node_types_json;
        write_file(&src_path.join("node-types.json"), node_types_json)?;
        return Ok(());
    }

    let semantic_version = read_grammar_version(&repo_path)?;

    if semantic_version.is_none() && abi_version > ABI_VERSION_MIN {
        warn!(
            concat!(
                "No `tree-sitter.json` file found in your grammar, ",
                "this file is required to generate with ABI {}. ",
                "Using ABI version {} instead.\n",
                "This file can be set up with `tree-sitter init`. ",
                "For more information, see https://tree-sitter.github.io/tree-sitter/cli/init."
            ),
            abi_version, ABI_VERSION_MIN
        );
        abi_version = ABI_VERSION_MIN;
    }

    // Generate the parser and related files.
    let GeneratedParser {
        c_code,
        node_types_json,
    } = generate_parser_for_grammar_with_opts(
        &input_grammar,
        abi_version,
        // NOTE(review): these casts silently truncate version components above 255
        // (semver components are `u64`); consider rejecting such versions explicitly.
        semantic_version.map(|v| (v.major as u8, v.minor as u8, v.patch as u8)),
        report_symbol_name,
        optimizations,
    )?;

    write_file(&src_path.join("parser.c"), c_code)?;
    write_file(&src_path.join("node-types.json"), node_types_json)?;
    fs::create_dir_all(&header_path)
        .map_err(|e| GenerateError::IO(IoError::new(&e, Some(header_path.as_path()))))?;
    write_file(&header_path.join("alloc.h"), ALLOC_HEADER)?;
    write_file(&header_path.join("array.h"), ARRAY_HEADER)?;
    write_file(&header_path.join("parser.h"), PARSER_HEADER)?;

    Ok(())
}
326
327pub fn generate_parser_for_grammar(
328    grammar_json: &str,
329    semantic_version: Option<(u8, u8, u8)>,
330) -> GenerateResult<(String, String)> {
331    let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n");
332    let input_grammar = parse_grammar(&grammar_json)?;
333    let parser = generate_parser_for_grammar_with_opts(
334        &input_grammar,
335        LANGUAGE_VERSION,
336        semantic_version,
337        None,
338        OptLevel::empty(),
339    )?;
340    Ok((input_grammar.name, parser.c_code))
341}
342
343fn generate_node_types_from_grammar(input_grammar: &InputGrammar) -> GenerateResult<JSONOutput> {
344    let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
345        prepare_grammar(input_grammar)?;
346    let variable_info =
347        node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
348
349    #[cfg(feature = "load")]
350    let node_types_json = node_types::generate_node_types_json(
351        &syntax_grammar,
352        &lexical_grammar,
353        &simple_aliases,
354        &variable_info,
355    )?;
356    Ok(JSONOutput {
357        #[cfg(feature = "load")]
358        node_types_json: serde_json::to_string_pretty(&node_types_json).unwrap(),
359        syntax_grammar,
360        lexical_grammar,
361        inlines,
362        simple_aliases,
363        variable_info,
364    })
365}
366
367fn generate_parser_for_grammar_with_opts(
368    input_grammar: &InputGrammar,
369    abi_version: usize,
370    semantic_version: Option<(u8, u8, u8)>,
371    report_symbol_name: Option<&str>,
372    optimizations: OptLevel,
373) -> GenerateResult<GeneratedParser> {
374    let JSONOutput {
375        syntax_grammar,
376        lexical_grammar,
377        inlines,
378        simple_aliases,
379        variable_info,
380        #[cfg(feature = "load")]
381        node_types_json,
382    } = generate_node_types_from_grammar(input_grammar)?;
383    let supertype_symbol_map =
384        node_types::get_supertype_symbol_map(&syntax_grammar, &simple_aliases, &variable_info);
385    let tables = build_tables(
386        &syntax_grammar,
387        &lexical_grammar,
388        &simple_aliases,
389        &variable_info,
390        &inlines,
391        report_symbol_name,
392        optimizations,
393    )?;
394    let c_code = render_c_code(
395        &input_grammar.name,
396        tables,
397        syntax_grammar,
398        lexical_grammar,
399        simple_aliases,
400        abi_version,
401        semantic_version,
402        supertype_symbol_map,
403    )?;
404    Ok(GeneratedParser {
405        c_code,
406        #[cfg(feature = "load")]
407        node_types_json,
408    })
409}
410
/// Reads the grammar's version from the nearest `tree-sitter.json` config file.
///
/// The lookup starts in `repo_path` and then walks up through its ancestor directories.
/// If no such file exists, `Ok(None)` is returned to maintain backwards compatibility. If
/// a file is found but cannot be read, cannot be deserialized, or its `metadata.version`
/// is not valid semver, an error is returned.
#[cfg(feature = "load")]
fn read_grammar_version(repo_path: &Path) -> Result<Option<Version>, ParseVersionError> {
    #[derive(Deserialize)]
    struct TreeSitterJson {
        metadata: Metadata,
    }

    #[derive(Deserialize)]
    struct Metadata {
        version: String,
    }

    const FILENAME: &str = "tree-sitter.json";

    for dir in repo_path.ancestors() {
        let candidate = dir.join(FILENAME);
        if !candidate.exists() {
            continue;
        }

        // The first config file found decides the outcome; the search never continues
        // past it, even if it turns out to be unreadable or invalid.
        let contents = fs::read_to_string(candidate.as_path())
            .map_err(|e| ParseVersionError::IO(IoError::new(&e, Some(candidate.as_path()))))?;
        let config = serde_json::from_str::<TreeSitterJson>(&contents).map_err(|e| {
            ParseVersionError::JSON(format!(
                "Failed to parse `{}` -- {e}",
                candidate.display()
            ))
        })?;
        let version = Version::parse(&config.metadata.version).map_err(|e| {
            ParseVersionError::Version(format!(
                "Failed to parse `{}` version as semver -- {e}",
                candidate.display()
            ))
        })?;
        return Ok(Some(version));
    }

    Ok(None)
}
459
/// Loads a grammar file and returns its JSON text.
///
/// `.js` grammars are evaluated via `load_js_grammar_file` (using `js_runtime` if given);
/// `.json` grammars are read from disk as-is.
///
/// # Errors
/// Returns [`LoadGrammarError::InvalidPath`] if `grammar_path` is a directory,
/// [`LoadGrammarError::FileExtension`] for any other extension, and the corresponding
/// load or I/O error if reading or evaluating the file fails.
#[cfg(feature = "load")]
pub fn load_grammar_file(
    grammar_path: &Path,
    js_runtime: Option<&str>,
) -> LoadGrammarFileResult<String> {
    if grammar_path.is_dir() {
        return Err(LoadGrammarError::InvalidPath);
    }

    let extension = grammar_path.extension().and_then(|ext| ext.to_str());
    match extension {
        Some("js") => {
            load_js_grammar_file(grammar_path, js_runtime).map_err(LoadGrammarError::from)
        }
        Some("json") => fs::read_to_string(grammar_path)
            .map_err(|e| LoadGrammarError::IO(IoError::new(&e, Some(grammar_path)))),
        _ => Err(LoadGrammarError::FileExtension(grammar_path.to_owned())),
    }
}
475
/// Evaluates a JavaScript grammar file and returns the grammar as pretty-printed JSON.
///
/// With the `qjs-rt` feature and `js_runtime == Some("native")`, evaluation is delegated to
/// the embedded QuickJS runtime. Otherwise an external runtime (default `node`) is spawned:
/// the canonicalized grammar path is passed via the `TREE_SITTER_GRAMMAR_PATH` environment
/// variable, and the CLI version globals followed by the bundled `dsl.js` script are piped
/// to the runtime's stdin.
///
/// The last line of the runtime's stdout is taken to be the grammar JSON; any output
/// before it is forwarded to this process's stdout.
///
/// # Errors
/// Returns a [`JSError`] if the path cannot be canonicalized, the runtime cannot be spawned
/// or written to, it exits unsuccessfully, or its output is not valid UTF-8 / JSON.
#[cfg(feature = "load")]
fn load_js_grammar_file(grammar_path: &Path, js_runtime: Option<&str>) -> JSResult<String> {
    let grammar_path = dunce::canonicalize(grammar_path)
        .map_err(|e| JSError::IO(IoError::new(&e, Some(grammar_path))))?;

    #[cfg(feature = "qjs-rt")]
    if js_runtime == Some("native") {
        return quickjs::execute_native_runtime(&grammar_path);
    }

    // The "file:///" prefix is incompatible with the quickjs runtime, but is required
    // for node and bun
    #[cfg(windows)]
    let grammar_path = PathBuf::from(format!("file:///{}", grammar_path.display()));

    let js_runtime = js_runtime.unwrap_or("node");

    // Each known runtime needs its own flags to read the script from stdin; unknown
    // runtimes are invoked without extra arguments.
    let mut js_command = Command::new(js_runtime);
    match js_runtime {
        "node" => {
            js_command.args(["--input-type=module", "-"]);
        }
        "bun" => {
            js_command.arg("-");
        }
        "deno" => {
            js_command.args(["run", "--allow-all", "-"]);
        }
        _ => {}
    }

    let mut js_process = js_command
        .env("TREE_SITTER_GRAMMAR_PATH", grammar_path)
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()
        .map_err(|e| JSError::JSRuntimeSpawn {
            runtime: js_runtime.to_string(),
            error: e.to_string(),
        })?;

    let mut js_stdin = js_process
        .stdin
        .take()
        .ok_or_else(|| JSError::JSRuntimeStdin {
            runtime: js_runtime.to_string(),
        })?;

    let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))?;
    write!(
        js_stdin,
        "globalThis.TREE_SITTER_CLI_VERSION_MAJOR = {};
         globalThis.TREE_SITTER_CLI_VERSION_MINOR = {};
         globalThis.TREE_SITTER_CLI_VERSION_PATCH = {};",
        cli_version.major, cli_version.minor, cli_version.patch,
    )
    .map_err(|e| JSError::JSRuntimeWrite {
        runtime: js_runtime.to_string(),
        item: "tree-sitter version".to_string(),
        error: e.to_string(),
    })?;
    // `write_all` is required here: a plain `write` may accept only part of the buffer,
    // which would hand the runtime a truncated script without reporting any error.
    js_stdin
        .write_all(include_bytes!("./dsl.js"))
        .map_err(|e| JSError::JSRuntimeWrite {
            runtime: js_runtime.to_string(),
            item: "grammar dsl".to_string(),
            error: e.to_string(),
        })?;
    // Dropping the handle closes the pipe so the runtime sees end-of-input.
    drop(js_stdin);

    let output = js_process
        .wait_with_output()
        .map_err(|e| JSError::JSRuntimeRead {
            runtime: js_runtime.to_string(),
            error: e.to_string(),
        })?;
    match output.status.code() {
        Some(0) => {
            let stdout = String::from_utf8(output.stdout).map_err(|e| JSError::JSRuntimeUtf8 {
                runtime: js_runtime.to_string(),
                error: e.to_string(),
            })?;

            let mut grammar_json = &stdout[..];

            if let Some(pos) = stdout.rfind('\n') {
                // If there's a newline, split the last line from the rest of the output
                let node_output = &stdout[..pos];
                grammar_json = &stdout[pos + 1..];

                // Forward everything the grammar script printed before the JSON line.
                let mut stdout = std::io::stdout().lock();
                stdout
                    .write_all(node_output.as_bytes())
                    .map_err(|e| JSError::IO(IoError::new(&e, None)))?;
                stdout
                    .write_all(b"\n")
                    .map_err(|e| JSError::IO(IoError::new(&e, None)))?;
                stdout
                    .flush()
                    .map_err(|e| JSError::IO(IoError::new(&e, None)))?;
            }

            // Round-trip through `serde_json::Value` to validate and pretty-print the JSON.
            Ok(serde_json::to_string_pretty(&serde_json::from_str::<
                serde_json::Value,
            >(grammar_json)?)?)
        }
        // A missing exit code is reported as -1.
        code => Err(JSError::JSRuntimeExit {
            runtime: js_runtime.to_string(),
            code: code.unwrap_or(-1),
        }),
    }
}
592
/// Writes `body` to `path`, replacing any existing file.
///
/// # Errors
/// Returns [`GenerateError::IO`] carrying the I/O error text and `path` if the write fails.
#[cfg(feature = "load")]
pub fn write_file(path: &Path, body: impl AsRef<[u8]>) -> GenerateResult<()> {
    let to_generate_error =
        |err: std::io::Error| GenerateError::IO(IoError::new(&err, Some(path)));
    fs::write(path, body).map_err(to_generate_error)
}
597
#[cfg(test)]
mod tests {
    use super::{LANGUAGE_VERSION, PARSER_HEADER};

    /// `LANGUAGE_VERSION` must equal `TREE_SITTER_LANGUAGE_VERSION` from the C API header.
    #[test]
    fn test_language_versions_are_in_sync() {
        const DEFINE_PREFIX: &str = "#define TREE_SITTER_LANGUAGE_VERSION ";

        let header = include_str!("../../../lib/include/tree_sitter/api.h");
        let header_version = header
            .lines()
            .map(str::trim)
            .find_map(|line| line.strip_prefix(DEFINE_PREFIX)?.parse::<usize>().ok())
            .expect("Failed to find TREE_SITTER_LANGUAGE_VERSION definition in api.h");

        assert_eq!(LANGUAGE_VERSION, header_version);
    }

    /// The embedded `parser.h.inc` must be an exact copy of the library's `parser.h`.
    #[test]
    fn test_parser_header_in_sync() {
        let library_header = include_str!("../../../lib/src/parser.h");
        assert!(
            library_header == PARSER_HEADER,
            "parser.h.inc is out of sync with lib/src/parser.h. Run: cp lib/src/parser.h crates/generate/src/parser.h.inc"
        );
    }
}