auto_lsp_codegen/
lib.rs

1/*
2This file is part of auto-lsp.
3Copyright (C) 2025 CLAUZEL Adrien
4
5auto-lsp is free software: you can redistribute it and/or modify
6it under the terms of the GNU General Public License as published by
7the Free Software Foundation, either version 3 of the License, or
8(at your option) any later version.
9
10This program is distributed in the hope that it will be useful,
11but WITHOUT ANY WARRANTY; without even the implied warranty of
12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License
16along with this program.  If not, see <http://www.gnu.org/licenses/>
17*/
18
19//! # Auto LSP Codegen
20//!
21//! To generate an AST, simply provide a Tree-sitter [node-types.json](https://tree-sitter.github.io/tree-sitter/using-parsers/6-static-node-types.html#static-node-types) and [LanguageFn](https://docs.rs/tree-sitter/latest/tree_sitter/struct.Language.html) of any language to the `generate` function of the `auto_lsp_codegen` crate.
22//!
23//! ```sh
24//! cargo add auto_lsp_codegen
25//! ```
26//!
27//! Although `auto_lsp_codegen` is a standalone crate, the generated code depends on the main `auto_lsp` crate.
28//!
29//! ## Usage
30//!
31//! The `auto_lsp_codegen` crate exposes a single `generate` function, which takes:
32//!
33//! - A [`node-types.json`](https://tree-sitter.github.io/tree-sitter/using-parsers/6-static-node-types.html),
34//! - A [`LanguageFn`](https://docs.rs/tree-sitter-language/0.1.5/tree_sitter_language/struct.LanguageFn.html)
//! - An optional `HashMap<&str, &str>` to rename tokens (pass `None` to keep the defaults; see [Custom Tokens](#custom-tokens))
36//! - And returns a **TokenStream**.
37//!
38//! How you choose to use the `TokenStream` is up to you.
39//!
40//! The most common setup is to call it from a **build.rs** script and write the generated code to a Rust file.
41//!
42//! Note, however, that the output can be quite large—for example, Python’s AST results in ~11,000 lines of code.
43//!
44//! ```rust, ignore
45//! use auto_lsp_codegen::generate;
46//! use std::{fs, path::PathBuf};
47//!
48//! fn main() {
49//!    if std::env::var("AST_GEN").unwrap_or("0".to_string()) == "0" {
50//!        return;
51//!    }
52//!
53//!    let output_path = PathBuf::from("./src/generated.rs");
54//!
55//!    fs::write(
56//!        output_path,
57//!        generate(
58//!            tree_sitter_python::NODE_TYPES,
59//!            &tree_sitter_python::LANGUAGE.into(),
60//!            None,
61//!        )
62//!        .to_string(),
63//!    )
64//!    .unwrap();
65//!}
66//! ```
67//!
68//! You can also invoke it from your own CLI or tool if needed.
69//!
70//! ## How Codegen Works
71//!
72//! The generated code structure depends on the Tree-sitter grammar.
73//!
74//! ## Structs for Rules
75//!
76//! Each rule in `node-types.json` becomes a dedicated Rust struct. For example, given the rule:
77//!
78//! ```js
79//! function_definition: $ => seq(
80//!      optional('async'),
81//!      'def',
82//!      field('name', $.identifier),
83//!      field('type_parameters', optional($.type_parameter)),
84//!      field('parameters', $.parameters),
85//!      optional(
86//!        seq(
87//!          '->',
88//!          field('return_type', $.type),
89//!        ),
90//!      ),
91//!      ':',
92//!      field('body', $._suite),
93//!    ),
94//! ```
95//!
96//! The generated struct would look like this:
97//!
98//! ```rust, ignore
99//!#[derive(Debug, Clone, PartialEq)]
100//!pub struct FunctionDefinition {
101//!    pub name: std::sync::Arc<Identifier>,
102//!    pub body: std::sync::Arc<Block>,
103//!    pub type_parameters: Option<std::sync::Arc<TypeParameter>>,
104//!    pub parameters: std::sync::Arc<Parameters>,
105//!    pub return_type: Option<std::sync::Arc<Type>>,
106//!    /* ... */
107//!}
108//! ```
109//!
110//! ## Field Matching
111//!
112//! To match fields, codegen uses the `field_id()` method from the Tree-sitter cursor.
113//!
114//! From the above example, the generated builder might look like this:
115//!
116//! ```rust, ignore
117//!builder.builder(db, &node, Some(id), |b| {
118//!  b.on_field_id::<Identifier, 19u16>(&mut name)?
119//!    .on_field_id::<Block, 6u16>(&mut body)?
120//!    .on_field_id::<TypeParameter, 31u16>(&mut type_parameters)?
121//!    .on_field_id::<Parameters, 23u16>(&mut parameters)?
122//!    .on_field_id::<Type, 24u16>(&mut return_type)
123//!});
124//! ```
125//!
126//! Each **u16** represents the unique field ID assigned by the Tree-sitter language parser.
127//!
128//! ## Handling Children
129//!
130//! If a node has no named fields, a children enum is generated to represent all possible variants.
131//!
//! - If the children are **unnamed**, a generic `Operators_<index>` enum is generated (one per distinct operator list)
133//! - If the children are **named**, the enum will be a concatenation of all possible child node types with underscores, using sanitized Rust-friendly names.
134//!
135//! For example, given the rule:
136//!
137//! ```js
138//!  _statement: $ => choice(
139//!      $._simple_statement,
140//!      $._compound_statement,
141//!    ),
142//! ```
143//!
144//! The generated enum would look like this:
145//!
146//! ```rust, ignore
147//! pub enum SimpleStatement_CompoundStatement {
148//!    SimpleStatement(SimpleStatement),
149//!    CompoundStatement(CompoundStatement),
150//! }
151//! ```
152//!
153//! If the generated enum name becomes too long, consider using a Tree-sitter
154//! <a href="https://tree-sitter.github.io/tree-sitter/using-parsers/6-static-node-types.html#supertype-nodes">supertype</a> to group nodes together.
155//!
156//! The `kind_id()` method is used to determine child kinds during traversal.
157//!
158//! The `AstNode::contains` method relies on this to check whether a node kind belongs to a specific struct or enum variant.
159//!
160//! ## Vec and Option Fields
161//!
162//! `repeat` and `repeat1` in the grammar will generate a `Vec` field.
163//!
164//! `optional(...)` will generate an `Option<T>` field.
165//!
166//! ## Token Naming
167//!
168//! Unnamed tokens are mapped to Rust enums using a built-in token map. For instance:
169//!
170//! ```json
171//!  { "type": "+", "named": false },
172//!  { "type": "+=", "named": false },
173//!  { "type": ",", "named": false },
174//!  { "type": "-", "named": false },
175//!  { "type": "-=", "named": false },
176//! ```
177//!
178//! Generates:
179//!
180//! ```rust, ignore
181//! pub enum Token_Plus {}
182//! pub enum Token_PlusEqual {}
183//! pub enum Token_Comma {}
184//! pub enum Token_Minus {}
185//! pub enum Token_MinusEqual {}
186//! ```
187//!
188//! Tokens with regular identifiers are converted to PascalCase.
189//!
190//! ## Custom Tokens
191//!
192//! If your grammar defines additional unnamed tokens not covered by the default map, you can provide a custom token mapping to generate appropriate Rust enum names.
193//!
194//! ```rust, ignore
195//!use auto_lsp_codegen::generate;
196//!
197//!let _result = generate(
198//!        &tree_sitter_python::NODE_TYPES,
199//!        &tree_sitter_python::LANGUAGE.into(),
200//!        Some(HashMap::from([
201//!            ("+", "Plus"),
202//!            ("+=", "PlusEqual"),
203//!            (",", "Comma"),
204//!            ("-", "Minus"),
205//!            ("-=", "MinusEqual"),
206//!        ])),
207//!    );
208//! ```
209//!
210//! Tokens that are not in the map will be added, and tokens that already exist in the map will be overwritten.
211//!
212//! ## Super Types
213//!
214//! Tree-sitter supports [supertypes](https://tree-sitter.github.io/tree-sitter/using-parsers/6-static-node-types.html#supertype-nodes), which allow grouping related nodes under a common type.
215//!
216//! For example, in the Python grammar:
217//!
218//! ```json
219//!  {
220//!    "type": "_compound_statement",
221//!    "named": true,
222//!    "subtypes": [
223//!      {
224//!        "type": "class_definition",
225//!        "named": true
226//!      },
227//!      {
228//!        "type": "decorated_definition",
229//!        "named": true
230//!      },
231//!      /* ... */
232//!      {
233//!        "type": "with_statement",
234//!        "named": true
235//!      }
236//!    ]
237//!  },
238//! ```
239//!
240//! This becomes a Rust enum:
241//!
242//! ```rust, ignore
243//! pub enum CompoundStatement {
244//!    ClassDefinition(ClassDefinition),
245//!    DecoratedDefinition(DecoratedDefinition),
246//!    /* ... */
247//!    WithStatement(WithStatement),
248//! }
249//! ```
250//!
251//! Some super types might contain other super types, in which case, the generated enum will flatten the hierarchy.
252
253mod ir;
254mod json;
255mod output;
256mod supertypes;
257mod tests;
258mod utils;
259
260use crate::json::{NodeType, TypeInfo};
261use crate::output::{generate_enum, generate_struct};
262use crate::supertypes::{generate_super_type, SuperType};
263use crate::utils::{sanitize_string, sanitize_string_to_pascal};
264use proc_macro2::TokenStream;
265use quote::{format_ident, quote, ToTokens};
266use std::collections::{HashMap, HashSet};
267use std::sync::{LazyLock, Mutex, RwLock};
268use utils::TOKENS;
269
/// Names of all named rules (nodes with `named: true`).
///
/// `generate` pushes the PascalCase-sanitized kind of every named node here,
/// in the order the nodes appear in `node-types.json`.
pub(crate) static NAMED_RULES: LazyLock<Mutex<Vec<String>>> =
    LazyLock::new(|| Mutex::new(Vec::new()));
272
/// A field/children entry that is only composed of operators.
///
/// `generate` emits one enum per `OperatorList`, named `Operators_{index}`.
pub(crate) struct OperatorList {
    /// Numeric suffix used in the generated enum name (`Operators_{index}`).
    index: usize,
    /// Operator types handed to `generate_enum` as the enum's variants.
    operators: Vec<TypeInfo>,
}
278
279pub(crate) static OPERATORS_RULES: LazyLock<Mutex<HashMap<String, OperatorList>>> =
280    LazyLock::new(Default::default);
281
282/// List of fields/children that are composed of multiple rules
283pub(crate) static INLINE_MULTIPLE_RULES: LazyLock<Mutex<HashMap<String, Vec<TypeInfo>>>> =
284    LazyLock::new(Default::default);
285
/// Set of anonymous rules (usually aliases created on the fly).
///
/// `generate` emits one struct per entry, named after the PascalCase-sanitized
/// rule name.
pub(crate) static ANONYMOUS_TYPES: LazyLock<Mutex<HashSet<String>>> =
    LazyLock::new(|| Mutex::new(HashSet::new()));
289
/// Map of node kind to named node id.
///
/// Filled by `generate` with `language.id_for_node_kind(kind, true)` for every
/// node that has `named: true`.
pub(crate) static NODE_ID_FOR_NAMED_NODE: LazyLock<Mutex<HashMap<String, u16>>> =
    LazyLock::new(|| Mutex::new(HashMap::new()));
293
/// Map of node kind to unnamed node id.
///
/// Filled by `generate` with `language.id_for_node_kind(kind, false)` for every
/// node that has `named: false`.
pub(crate) static NODE_ID_FOR_UNNAMED_NODE: LazyLock<Mutex<HashMap<String, u16>>> =
    LazyLock::new(|| Mutex::new(HashMap::new()));
297
/// Map of field name to field id.
///
/// Filled by `generate` with the id the Tree-sitter language reports for each
/// field name found on a named node.
pub(crate) static FIELD_ID_FOR_NAME: LazyLock<Mutex<HashMap<String, u16>>> =
    LazyLock::new(|| Mutex::new(HashMap::new()));
301
302/// List of super types
303pub(crate) static SUPER_TYPES: LazyLock<RwLock<HashMap<String, SuperType>>> =
304    LazyLock::new(Default::default);
305
306/// Generates the Rust code for a given Tree-sitter grammar
307///
308/// # Arguments
309///
310/// * `source` - node-types.json
311/// * `language` - tree-sitter language fn
312/// * `tokens` - optional map of tokens to enum names (since tokens can't be valid rust identifiers)
313///
314/// # Returns
315/// A TokenStream containing the generated code
316///
317/// # Example
318///
319/// ```rust
320/// use auto_lsp_codegen::generate;
321///
322/// let _result = generate(
323///        &tree_sitter_python::NODE_TYPES,
324///        &tree_sitter_python::LANGUAGE.into(),
325///        None,
326///    );
327/// ```
328///
329pub fn generate(
330    source: &str,
331    language: &tree_sitter::Language,
332    tokens: Option<HashMap<&'static str, &'static str>>,
333) -> TokenStream {
334    if let Some(tokens) = tokens {
335        // extend or overwrite the default tokens
336
337        let mut lock = TOKENS.write().unwrap();
338        for (k, v) in tokens {
339            lock.insert(k, v);
340        }
341    }
342
343    let nodes: Vec<NodeType> = serde_json::from_str(source).expect("Invalid JSON");
344
345    let mut output = quote! {
346        // Auto-generated file. Do not edit manually.
347        #![allow(clippy::all)]
348        #![allow(unused)]
349        #![allow(dead_code)]
350        #![allow(non_camel_case_types)]
351        #![allow(non_snake_case)]
352
353    };
354    for node in &nodes {
355        if node.named {
356            // Push the node kind to the list of named rules
357            NAMED_RULES
358                .lock()
359                .unwrap()
360                .push(sanitize_string_to_pascal(&node.kind));
361            // Push the node kind to the list of ids for named nodes
362            NODE_ID_FOR_NAMED_NODE.lock().unwrap().insert(
363                node.kind.clone(),
364                language.id_for_node_kind(&node.kind, true),
365            );
366            // If the node has fields, we need to add them to the list of fields
367            if let Some(fields) = &node.fields {
368                fields.iter().for_each(|(field_name, _)| {
369                    let field_id = language.field_id_for_name(field_name);
370                    FIELD_ID_FOR_NAME
371                        .lock()
372                        .unwrap()
373                        .insert(field_name.clone(), field_id.unwrap().get());
374                });
375            }
376        } else {
377            // Push the node kind to the list of ids for named nodes
378            NODE_ID_FOR_UNNAMED_NODE.lock().unwrap().insert(
379                node.kind.clone(),
380                language.id_for_node_kind(&node.kind, false),
381            );
382        }
383        // If node is a supertype, add it to the list of super types
384        if node.is_supertype() {
385            SUPER_TYPES
386                .write()
387                .unwrap()
388                .insert(node.kind.clone(), generate_super_type(node));
389        }
390    }
391
392    // Super types may contains other super types
393    // in this case we need to add the nested super types to the `types` field of the current super type
394    let mut super_types_lock = SUPER_TYPES.write().unwrap();
395    let mut new_super_types = HashMap::new();
396
397    for (super_type_name, super_type) in super_types_lock.iter() {
398        let mut new_super_type = SuperType::default();
399
400        // Iterate over the types of this super type
401        super_type.types.iter().enumerate().for_each(|(i, key)| {
402            if let Some(nested_super_type) = super_types_lock.get(key) {
403                // Some types are super types
404                new_super_type.types.extend(nested_super_type.types.clone());
405            } else {
406                // Otherwise, we just clone the type
407                new_super_type.types.push(key.clone());
408            }
409            new_super_type.variants.push(super_type.variants[i].clone())
410        });
411        new_super_types.insert(super_type_name.clone(), new_super_type);
412    }
413
414    // Now we need to merge the new super types with the existing ones
415    new_super_types.into_iter().for_each(|(name, s)| {
416        super_types_lock.insert(name.clone(), s.clone());
417    });
418
419    drop(super_types_lock);
420
421    // Generate the structs and enums for all rules
422    for node in &nodes {
423        output.extend(node.to_token_stream());
424    }
425
426    // Generate the list of operators
427    for operators in (*OPERATORS_RULES.lock().unwrap()).values() {
428        output.extend(generate_enum(
429            &format_ident!("Operators_{}", operators.index),
430            &operators.operators,
431        ));
432    }
433
434    // Generate the list of inline multiple rules
435    for (id, values) in &*INLINE_MULTIPLE_RULES.lock().unwrap() {
436        output.extend(generate_enum(
437            &format_ident!("{}", sanitize_string(id)),
438            values,
439        ));
440    }
441
442    // Generate the list of anonymous types
443    for name in ANONYMOUS_TYPES.lock().unwrap().iter() {
444        output.extend(generate_struct(
445            &format_ident!("{}", &sanitize_string_to_pascal(name)),
446            name,
447            &vec![],
448            &vec![],
449            &vec![],
450            &vec![],
451        ));
452    }
453
454    // Generate the list of super types
455    // We need to clone because generate_enum will also check if some variants are super types
456    for (super_type_name, super_type) in SUPER_TYPES.read().unwrap().iter() {
457        output.extend(generate_enum(
458            &format_ident!("{}", &sanitize_string_to_pascal(super_type_name)),
459            &super_type.variants,
460        ));
461    }
462
463    output
464}