Skip to main content

panproto_parse/languages/
common.rs

1//! Common language parser implementation shared by all tree-sitter-based parsers.
2//!
3//! Since the generic [`AstWalker`](crate::walker::AstWalker) handles all languages
4//! uniformly (the node kind IS the vertex kind, the field name IS the edge kind),
5//! per-language parsers are thin wrappers that provide:
6//!
7//! 1. The tree-sitter Language object
8//! 2. The embedded `NODE_TYPES` JSON
9//! 3. Language-specific [`WalkerConfig`](crate::walker::WalkerConfig) overrides
10//! 4. File extension mapping
11
12use panproto_schema::{Protocol, Schema};
13
14use crate::error::ParseError;
15use crate::registry::AstParser;
16use crate::theory_extract::{ExtractedTheoryMeta, extract_theory_from_node_types};
17use crate::walker::{AstWalker, WalkerConfig};
18
19/// A generic language parser built from a tree-sitter grammar.
20///
21/// This struct is the shared implementation behind all 10 language parsers.
22/// Each language constructs one with its specific grammar, node types, and config.
23pub struct LanguageParser {
24    /// The protocol name (e.g. `"typescript"`, `"python"`).
25    protocol_name: String,
26    /// File extensions this language handles.
27    extensions: Vec<&'static str>,
28    /// The resolved tree-sitter language.
29    language: tree_sitter::Language,
30    /// The auto-derived theory metadata.
31    theory_meta: ExtractedTheoryMeta,
32    /// The panproto protocol definition (used for `SchemaBuilder` validation).
33    protocol: Protocol,
34    /// Per-language walker configuration.
35    walker_config: WalkerConfig,
36}
37
38impl LanguageParser {
39    /// Create a new language parser from a pre-constructed [`Language`](tree_sitter::Language).
40    ///
41    /// # Errors
42    ///
43    /// Returns [`ParseError`] if theory extraction from `node_types_json` fails.
44    pub fn from_language(
45        protocol_name: &str,
46        extensions: Vec<&'static str>,
47        language: tree_sitter::Language,
48        node_types_json: &[u8],
49        walker_config: WalkerConfig,
50    ) -> Result<Self, ParseError> {
51        let theory_name = format!("Th{}FullAST", capitalize_first(protocol_name));
52        let theory_meta = extract_theory_from_node_types(&theory_name, node_types_json)?;
53        let protocol = build_full_ast_protocol(protocol_name, &theory_name);
54
55        Ok(Self {
56            protocol_name: protocol_name.to_owned(),
57            extensions,
58            language,
59            theory_meta,
60            protocol,
61            walker_config,
62        })
63    }
64}
65
66impl AstParser for LanguageParser {
67    fn protocol_name(&self) -> &str {
68        &self.protocol_name
69    }
70
71    fn parse(&self, source: &[u8], file_path: &str) -> Result<Schema, ParseError> {
72        let mut parser = tree_sitter::Parser::new();
73        parser
74            .set_language(&self.language)
75            .map_err(|e| ParseError::TreeSitterParse {
76                path: format!("{file_path}: set_language failed: {e}"),
77            })?;
78
79        let tree = parser
80            .parse(source, None)
81            .ok_or_else(|| ParseError::TreeSitterParse {
82                path: format!("{file_path}: parse returned None (timeout or cancellation)"),
83            })?;
84
85        let walker = AstWalker::new(
86            source,
87            &self.theory_meta,
88            &self.protocol,
89            self.walker_config.clone(),
90        );
91
92        walker.walk(&tree, file_path)
93    }
94
95    fn emit(&self, schema: &Schema) -> Result<Vec<u8>, ParseError> {
96        // Reconstruct source text from the schema's structural information.
97        //
98        // The walker stores two types of text constraints:
99        // 1. `literal-value` on leaf nodes: the source text of identifiers, literals, etc.
100        // 2. `interstitial-N` on parent nodes: the text between named children, which
101        //    contains keywords, punctuation, whitespace, and comments.
102        //
103        // The emitter walks the schema tree depth-first, interleaving interstitial text
104        // with child emissions to reconstruct the full source.
105        emit_from_schema(schema, &self.protocol_name)
106    }
107
108    fn supported_extensions(&self) -> &[&str] {
109        &self.extensions
110    }
111
112    fn theory_meta(&self) -> &ExtractedTheoryMeta {
113        &self.theory_meta
114    }
115}
116
117/// Reconstruct source text from a schema using interstitial text and leaf literals.
118///
119/// The walker stores two types of text data:
120/// - `literal-value` on leaf nodes: identifiers, literals, keywords that are named nodes
121/// - `interstitial-N` on parent nodes: text between named children (keywords, punctuation,
122///   whitespace, comments from anonymous/unnamed tokens)
123///
124/// The emitter reconstructs source by collecting ALL text fragments (both interstitials
125/// and leaf literals) and sorting them by their byte position in the original source.
126/// This produces exact round-trip fidelity: `emit(parse(source))` = `source`.
127fn emit_from_schema(schema: &Schema, protocol: &str) -> Result<Vec<u8>, ParseError> {
128    // Collect all text fragments with their byte positions.
129    // Each fragment is (start_byte, text).
130    let mut fragments: Vec<(usize, String)> = Vec::new();
131
132    for name in schema.vertices.keys() {
133        if let Some(constraints) = schema.constraints.get(name) {
134            // Get start-byte for this vertex.
135            let start_byte = constraints
136                .iter()
137                .find(|c| c.sort.as_ref() == "start-byte")
138                .and_then(|c| c.value.parse::<usize>().ok());
139
140            // Collect literal-value from leaf nodes.
141            let literal = constraints
142                .iter()
143                .find(|c| c.sort.as_ref() == "literal-value")
144                .map(|c| c.value.clone());
145
146            if let (Some(start), Some(text)) = (start_byte, literal) {
147                fragments.push((start, text));
148            }
149
150            // Collect interstitial text fragments.
151            // Each interstitial has a byte position derived from its parent and index.
152            for c in constraints {
153                let sort_str = c.sort.as_ref();
154                if sort_str.starts_with("interstitial-") {
155                    // The interstitial's position is encoded in a companion constraint.
156                    // We stored interstitial-N-start-byte alongside interstitial-N.
157                    let pos_sort = format!("{sort_str}-start-byte");
158                    let pos = constraints
159                        .iter()
160                        .find(|c2| c2.sort.as_ref() == pos_sort.as_str())
161                        .and_then(|c2| c2.value.parse::<usize>().ok());
162
163                    if let Some(p) = pos {
164                        fragments.push((p, c.value.clone()));
165                    }
166                }
167            }
168        }
169    }
170
171    if fragments.is_empty() {
172        return Err(ParseError::EmitFailed {
173            protocol: protocol.to_owned(),
174            reason: "schema has no text fragments".to_owned(),
175        });
176    }
177
178    // Sort by byte position and concatenate.
179    fragments.sort_by_key(|(pos, _)| *pos);
180
181    // Deduplicate overlapping fragments (parent interstitials may overlap with
182    // child literals). Keep the first fragment at each position.
183    let mut output = Vec::new();
184    let mut cursor = 0;
185
186    for (pos, text) in &fragments {
187        if *pos >= cursor {
188            output.extend_from_slice(text.as_bytes());
189            cursor = pos + text.len();
190        }
191    }
192
193    Ok(output)
194}
195
196/// Build the standard Protocol for a full-AST language parser.
197///
198/// Shared by `LanguageParser::new` and `LanguageParser::from_language`
199/// to avoid duplicating the constraint sorts and flag definitions.
200fn build_full_ast_protocol(protocol_name: &str, theory_name: &str) -> Protocol {
201    Protocol {
202        name: protocol_name.into(),
203        schema_theory: theory_name.into(),
204        instance_theory: format!("{theory_name}Instance"),
205        obj_kinds: vec![],
206        edge_rules: vec![],
207        constraint_sorts: vec![
208            "literal-value".into(),
209            "literal-type".into(),
210            "operator".into(),
211            "visibility".into(),
212            "mutability".into(),
213            "async".into(),
214            "static".into(),
215            "generator".into(),
216            "comment".into(),
217            "indent".into(),
218            "trailing-comma".into(),
219            "semicolon".into(),
220            "blank-lines-before".into(),
221            "start-byte".into(),
222            "end-byte".into(),
223        ],
224        has_order: true,
225        has_coproducts: false,
226        has_recursion: true,
227        has_causal: false,
228        nominal_identity: false,
229        has_defaults: false,
230        has_coercions: false,
231        has_mergers: false,
232        has_policies: false,
233    }
234}
235
/// Return `s` with its first character upper-cased and the rest unchanged.
///
/// An empty input yields an empty string. The upper-case form of a single
/// character may span several characters (e.g. `ß` becomes `SS`).
fn capitalize_first(s: &str) -> String {
    let mut rest = s.chars();
    match rest.next() {
        None => String::new(),
        Some(first) => {
            let mut out: String = first.to_uppercase().collect();
            // `rest` has advanced past the first character only.
            out.push_str(rest.as_str());
            out
        }
    }
}