panproto_parse/languages/common.rs
1//! Common language parser implementation shared by all tree-sitter-based parsers.
2//!
3//! Since the generic [`AstWalker`](crate::walker::AstWalker) handles all languages
4//! uniformly (the node kind IS the vertex kind, the field name IS the edge kind),
5//! per-language parsers are thin wrappers that provide:
6//!
7//! 1. The tree-sitter Language object
8//! 2. The embedded `NODE_TYPES` JSON
9//! 3. Language-specific [`WalkerConfig`](crate::walker::WalkerConfig) overrides
10//! 4. File extension mapping
11
12use std::sync::Mutex;
13
14use panproto_schema::{Protocol, Schema};
15
16use crate::error::ParseError;
17use crate::registry::AstParser;
18use crate::scope_detector::ScopeDetector;
19use crate::theory_extract::{ExtractedTheoryMeta, extract_theory_from_node_types};
20use crate::walker::{AstWalker, WalkerConfig};
21
22/// A generic language parser built from a tree-sitter grammar.
23///
24/// This struct is the shared implementation behind all language parsers.
25/// Each language constructs one with its specific grammar, node types,
26/// tags query, and config.
27pub struct LanguageParser {
28 /// The protocol name (e.g. `"typescript"`, `"python"`).
29 protocol_name: String,
30 /// File extensions this language handles.
31 extensions: Vec<&'static str>,
32 /// The resolved tree-sitter language.
33 language: tree_sitter::Language,
34 /// The grammar's bundled `tags.scm`, if any (for named-scope detection).
35 tags_query: Option<&'static str>,
36 /// Project-level tags-query override (concatenated in front of
37 /// `tags_query` when constructing the [`ScopeDetector`]).
38 project_tags_override: Option<String>,
39 /// The auto-derived theory metadata.
40 theory_meta: ExtractedTheoryMeta,
41 /// The panproto protocol definition (used for `SchemaBuilder` validation).
42 protocol: Protocol,
43 /// Per-language walker configuration.
44 walker_config: WalkerConfig,
45 /// A reusable [`ScopeDetector`] for this language.
46 ///
47 /// Held behind a `Mutex` because `parse()` on [`AstParser`] takes `&self`
48 /// but the detector's `TagsContext` (and internal `QueryCursor`) need
49 /// `&mut` access during a tags query run. A single parser instance is
50 /// typically used serially; contention here is rare.
51 scope_detector: Mutex<ScopeDetector>,
52}
53
54impl LanguageParser {
55 /// Create a new language parser from a pre-constructed [`Language`](tree_sitter::Language).
56 ///
57 /// `tags_query` is the grammar's `queries/tags.scm` content, usually
58 /// sourced from [`panproto_grammars::Grammar::tags_query`]; pass `None`
59 /// if the grammar does not ship one.
60 ///
61 /// # Errors
62 ///
63 /// Returns [`ParseError`] if theory extraction from `node_types_json`
64 /// fails, or if the grammar's tags query fails to compile.
65 pub fn from_language(
66 protocol_name: &str,
67 extensions: Vec<&'static str>,
68 language: tree_sitter::Language,
69 node_types_json: &[u8],
70 tags_query: Option<&'static str>,
71 walker_config: WalkerConfig,
72 ) -> Result<Self, ParseError> {
73 let theory_name = format!("Th{}FullAST", capitalize_first(protocol_name));
74 let theory_meta = extract_theory_from_node_types(&theory_name, node_types_json)?;
75 let protocol = build_full_ast_protocol(protocol_name, &theory_name);
76 let scope_detector = ScopeDetector::new(&language, tags_query, None)?;
77
78 Ok(Self {
79 protocol_name: protocol_name.to_owned(),
80 extensions,
81 language,
82 tags_query,
83 project_tags_override: None,
84 theory_meta,
85 protocol,
86 walker_config,
87 scope_detector: Mutex::new(scope_detector),
88 })
89 }
90
91 /// Install a project-level tags-query override.
92 ///
93 /// The override string is concatenated in front of the grammar's
94 /// bundled `tags.scm` when the detector is rebuilt. Tree-sitter unions
95 /// all patterns, so overrides augment the defaults without replacing
96 /// them. Pass `None` to clear an existing override.
97 ///
98 /// Typical source: `panproto.toml`'s `[parse.tags.<lang>] path = "..."`.
99 ///
100 /// # Errors
101 ///
102 /// Returns [`ParseError::ScopeQueryCompile`] if the combined query
103 /// fails to compile against this language.
104 pub fn set_tags_override(&mut self, override_query: Option<String>) -> Result<(), ParseError> {
105 let detector =
106 ScopeDetector::new(&self.language, self.tags_query, override_query.as_deref())?;
107 self.project_tags_override = override_query;
108 if let Ok(mut guard) = self.scope_detector.lock() {
109 *guard = detector;
110 }
111 Ok(())
112 }
113}
114
115impl AstParser for LanguageParser {
116 fn protocol_name(&self) -> &str {
117 &self.protocol_name
118 }
119
120 fn parse(&self, source: &[u8], file_path: &str) -> Result<Schema, ParseError> {
121 let mut parser = tree_sitter::Parser::new();
122 parser
123 .set_language(&self.language)
124 .map_err(|e| ParseError::TreeSitterParse {
125 path: format!("{file_path}: set_language failed: {e}"),
126 })?;
127
128 let tree = parser
129 .parse(source, None)
130 .ok_or_else(|| ParseError::TreeSitterParse {
131 path: format!("{file_path}: parse returned None (timeout or cancellation)"),
132 })?;
133
134 // Build the walker (which runs the tags query once via the
135 // detector) inside the guard scope, then drop the guard before
136 // walking the tree. The scope map is copied into the walker, so
137 // the detector lock is no longer needed past that point.
138 let walker = {
139 let mut detector_guard =
140 self.scope_detector
141 .lock()
142 .map_err(|_| ParseError::SchemaConstruction {
143 reason: "scope-detector mutex poisoned".to_owned(),
144 })?;
145 AstWalker::new(
146 source,
147 &self.theory_meta,
148 &self.protocol,
149 self.walker_config.clone(),
150 Some(&mut *detector_guard),
151 )
152 };
153
154 walker.walk(&tree, file_path)
155 }
156
157 fn emit(&self, schema: &Schema) -> Result<Vec<u8>, ParseError> {
158 // Reconstruct source text from the schema's structural information.
159 //
160 // The walker stores two types of text constraints:
161 // 1. `literal-value` on leaf nodes: the source text of identifiers, literals, etc.
162 // 2. `interstitial-N` on parent nodes: the text between named children, which
163 // contains keywords, punctuation, whitespace, and comments.
164 //
165 // The emitter walks the schema tree depth-first, interleaving interstitial text
166 // with child emissions to reconstruct the full source.
167 emit_from_schema(schema, &self.protocol_name)
168 }
169
170 fn supported_extensions(&self) -> &[&str] {
171 &self.extensions
172 }
173
174 fn theory_meta(&self) -> &ExtractedTheoryMeta {
175 &self.theory_meta
176 }
177}
178
179/// Reconstruct source text from a schema using interstitial text and leaf literals.
180///
181/// The walker stores two types of text data:
182/// - `literal-value` on leaf nodes: identifiers, literals, keywords that are named nodes
183/// - `interstitial-N` on parent nodes: text between named children (keywords, punctuation,
184/// whitespace, comments from anonymous/unnamed tokens)
185///
186/// The emitter reconstructs source by collecting ALL text fragments (both interstitials
187/// and leaf literals) and sorting them by their byte position in the original source.
188/// This produces exact round-trip fidelity: `emit(parse(source))` = `source`.
189fn emit_from_schema(schema: &Schema, protocol: &str) -> Result<Vec<u8>, ParseError> {
190 // Collect all text fragments with their byte positions.
191 // Each fragment is (start_byte, text).
192 let mut fragments: Vec<(usize, String)> = Vec::new();
193
194 for name in schema.vertices.keys() {
195 if let Some(constraints) = schema.constraints.get(name) {
196 // Get start-byte for this vertex.
197 let start_byte = constraints
198 .iter()
199 .find(|c| c.sort.as_ref() == "start-byte")
200 .and_then(|c| c.value.parse::<usize>().ok());
201
202 // Collect literal-value from leaf nodes.
203 let literal = constraints
204 .iter()
205 .find(|c| c.sort.as_ref() == "literal-value")
206 .map(|c| c.value.clone());
207
208 if let (Some(start), Some(text)) = (start_byte, literal) {
209 fragments.push((start, text));
210 }
211
212 // Collect interstitial text fragments.
213 // Each interstitial has a byte position derived from its parent and index.
214 for c in constraints {
215 let sort_str = c.sort.as_ref();
216 if sort_str.starts_with("interstitial-") {
217 // The interstitial's position is encoded in a companion constraint.
218 // We stored interstitial-N-start-byte alongside interstitial-N.
219 let pos_sort = format!("{sort_str}-start-byte");
220 let pos = constraints
221 .iter()
222 .find(|c2| c2.sort.as_ref() == pos_sort.as_str())
223 .and_then(|c2| c2.value.parse::<usize>().ok());
224
225 if let Some(p) = pos {
226 fragments.push((p, c.value.clone()));
227 }
228 }
229 }
230 }
231 }
232
233 if fragments.is_empty() {
234 return Err(ParseError::EmitFailed {
235 protocol: protocol.to_owned(),
236 reason: "schema has no text fragments".to_owned(),
237 });
238 }
239
240 // Sort by byte position and concatenate.
241 fragments.sort_by_key(|(pos, _)| *pos);
242
243 // Deduplicate overlapping fragments (parent interstitials may overlap with
244 // child literals). Keep the first fragment at each position.
245 let mut output = Vec::new();
246 let mut cursor = 0;
247
248 for (pos, text) in &fragments {
249 if *pos >= cursor {
250 output.extend_from_slice(text.as_bytes());
251 cursor = pos + text.len();
252 }
253 }
254
255 Ok(output)
256}
257
258/// Build the standard Protocol for a full-AST language parser.
259///
260/// Shared by `LanguageParser::new` and `LanguageParser::from_language`
261/// to avoid duplicating the constraint sorts and flag definitions.
262fn build_full_ast_protocol(protocol_name: &str, theory_name: &str) -> Protocol {
263 Protocol {
264 name: protocol_name.into(),
265 schema_theory: theory_name.into(),
266 instance_theory: format!("{theory_name}Instance"),
267 schema_composition: None,
268 instance_composition: None,
269 obj_kinds: vec![],
270 edge_rules: vec![],
271 constraint_sorts: vec![
272 "literal-value".into(),
273 "literal-type".into(),
274 "operator".into(),
275 "visibility".into(),
276 "mutability".into(),
277 "async".into(),
278 "static".into(),
279 "generator".into(),
280 "comment".into(),
281 "indent".into(),
282 "trailing-comma".into(),
283 "semicolon".into(),
284 "blank-lines-before".into(),
285 "start-byte".into(),
286 "end-byte".into(),
287 ],
288 has_order: true,
289 has_coproducts: false,
290 has_recursion: true,
291 has_causal: false,
292 nominal_identity: false,
293 has_defaults: false,
294 has_coercions: false,
295 has_mergers: false,
296 has_policies: false,
297 }
298}
299
/// Return `s` with its first character upper-cased.
///
/// The remaining characters are copied unchanged; an empty input yields an
/// empty string. Upper-casing uses the full Unicode mapping, so a single
/// character may expand to several (e.g. `ß` becomes `SS`).
fn capitalize_first(s: &str) -> String {
    let mut rest = s.chars();
    match rest.next() {
        None => String::new(),
        Some(first) => {
            let mut out = String::with_capacity(s.len());
            out.extend(first.to_uppercase());
            // `rest` now points just past the first character.
            out.push_str(rest.as_str());
            out
        }
    }
}