// zerodds_idl/parser.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2026 ZeroDDS Contributors
3//! Public Top-Level-Parser-API (T5.4).
4//!
5//! Wickelt die Pipeline `Tokenize → Recognize → CST → AST` zu einer
6//! ergonomischen Funktion. Default-Konfig:
7//!
8//! ```
9//! let ast = zerodds_idl::parse("module Empty {};", &Default::default())
10//!     .expect("parse must succeed");
11//! assert_eq!(ast.definitions.len(), 1);
12//! ```
13//!
14//! Die Pipeline ist bewusst simpel — fuer Caching, Recovery oder
15//! Inkremental-Parsing wird in Phase 1 ein dedizierter Session-Layer
16//! aufgesetzt (RFC 0001 §6).
17
18use crate::ast::{self, Specification};
19use crate::config::ParserConfig;
20use crate::engine::{Engine, EngineError, Recognizer};
21use crate::errors::{ParseError, Span};
22use crate::grammar::TokenKind;
23use crate::grammar::compile::CompiledGrammar;
24use crate::grammar::compose::compose;
25use crate::grammar::deltas::GrammarDelta;
26use crate::grammar::idl42::IDL_42;
27use crate::grammar::validate::validate;
28use crate::lexer::{Token, TokenRules, Tokenizer};
29
/// Maximum `{`/`}` nesting depth of the token sequence.
///
/// Protects the CST and AST builders from stack overflow on adversarial,
/// deeply nested IDL inputs (TS-1 finding 1,
/// `docs/test-harness/plan.md`). The cap sits well above realistic IDL
/// code bases (typical module hierarchies go 4-6 levels deep; even CCM
/// component stacks stay below 16) and below the stack-frame limit of
/// the recursive CST builder in debug builds (from ~128 nested modules,
/// test threads trigger a stack overflow).
pub const MAX_NESTING_DEPTH: usize = 64;
41
/// Maximum number of consecutive `@` annotations before a declaration.
///
/// The annotation grammar is left-recursive (`seq -> seq appl |
/// empty`); in the recursive CST builder this leads to quadratic
/// behavior in `try_match_symbols` (TS-1 finding 2). Realistic IDL
/// declarations carry 1-5 annotations; >64 is adversarial.
pub const MAX_CONSECUTIVE_ANNOTATIONS: usize = 64;
50
51/// Pre-Tokenization-Validation:
52///
53/// 1. Zaehlt die maximale `{`-Tiefe — Cap [`MAX_NESTING_DEPTH`]
54///    schuetzt CST-Builder vor Stack-Overflow.
55/// 2. Zaehlt aufeinanderfolgende `@`-Annotations — Cap
56///    [`MAX_CONSECUTIVE_ANNOTATIONS`] schuetzt vor quadratischem
57///    Verhalten im linksrekursiven `annotation_appl_seq`.
58///
59/// Beide Pruefungen laufen vor Engine-Recognize, sodass adversarial
60/// Inputs ohne Penalty zurueckgewiesen werden.
61fn check_nesting_depth(tokens: &[Token<'_>]) -> Result<(), Error> {
62    let mut depth: usize = 0;
63    let mut consecutive_at: usize = 0;
64    for t in tokens {
65        let TokenKind::Punct(p) = t.kind else {
66            continue;
67        };
68        if p == "{" {
69            depth += 1;
70            consecutive_at = 0;
71            if depth > MAX_NESTING_DEPTH {
72                return Err(Error::DepthLimit {
73                    limit: MAX_NESTING_DEPTH,
74                    span: t.span,
75                });
76            }
77        } else if p == "}" {
78            depth = depth.saturating_sub(1);
79            consecutive_at = 0;
80        } else if p == "@" {
81            consecutive_at += 1;
82            if consecutive_at > MAX_CONSECUTIVE_ANNOTATIONS {
83                return Err(Error::AnnotationLimit {
84                    limit: MAX_CONSECUTIVE_ANNOTATIONS,
85                    span: t.span,
86                });
87            }
88        } else if p == ";" {
89            // Ein Semicolon trennt Decl-Sequenzen — Annotations
90            // davor gehoeren zur abgeschlossenen Decl.
91            consecutive_at = 0;
92        }
93    }
94    Ok(())
95}
96
/// High-level error of the top-level parser.
///
/// Unifies lexer, recognition and builder errors under one type so that
/// consumers only need to handle a single error variant.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Error {
    /// Lexer or recognition failure. Carries the existing
    /// [`ParseError`] variant for token/EOF diagnostics.
    Parse(ParseError),
    /// Grammar construction failure. Only occurs on a corrupted build
    /// of the grammar constant — should never happen for `IDL_42`.
    InvalidGrammar(String),
    /// AST builder failure. Indicates a builder bug or grammar drift.
    AstBuild(ast::BuilderError),
    /// A construct was used that requires a feature disabled in
    /// [`ParserConfig::features`]. Lists all violations.
    FeaturesDisabled(Vec<crate::features::gate::FeatureGateError>),
    /// `{`/`}` nesting exceeds [`MAX_NESTING_DEPTH`]. Protection
    /// against stack overflow in the recursive CST builder
    /// (TS-1 finding 1).
    DepthLimit {
        /// Current cap.
        limit: usize,
        /// Position of the `{` token that exceeded the cap.
        span: Span,
    },
    /// Consecutive `@` annotations exceed
    /// [`MAX_CONSECUTIVE_ANNOTATIONS`]. Protection against quadratic
    /// behavior in the left-recursive annotation-sequence CST build
    /// (TS-1 finding 2).
    AnnotationLimit {
        /// Current cap.
        limit: usize,
        /// Position of the `@` token that exceeded the cap.
        span: Span,
    },
}
134
135impl core::fmt::Display for Error {
136    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
137        match self {
138            Self::Parse(e) => write!(f, "parse error: {e:?}"),
139            Self::InvalidGrammar(msg) => write!(f, "invalid grammar: {msg}"),
140            Self::AstBuild(e) => write!(f, "ast build error: {e}"),
141            Self::FeaturesDisabled(errs) => {
142                writeln!(f, "{} feature-gate violation(s):", errs.len())?;
143                for e in errs {
144                    writeln!(f, "  - {e}")?;
145                }
146                Ok(())
147            }
148            Self::DepthLimit { limit, span } => {
149                write!(
150                    f,
151                    "brace nesting exceeds {limit} at {span} — refusing to parse to protect \
152                     against stack overflow"
153                )
154            }
155            Self::AnnotationLimit { limit, span } => {
156                write!(
157                    f,
158                    "more than {limit} consecutive annotations at {span} — refusing to parse \
159                     to protect against quadratic CST-build cost"
160                )
161            }
162        }
163    }
164}
165
// `Display` and `Debug` are both implemented, so the std error trait
// needs no extra methods here.
impl std::error::Error for Error {}

/// Lifts lexer/recognition errors into the unified parser error.
impl From<ParseError> for Error {
    fn from(e: ParseError) -> Self {
        Self::Parse(e)
    }
}

/// Lifts AST builder errors into the unified parser error.
impl From<ast::BuilderError> for Error {
    fn from(e: ast::BuilderError) -> Self {
        Self::AstBuild(e)
    }
}
179
180/// Parst IDL-Source zu einer typisierten [`Specification`].
181///
182/// Pipeline: Tokenize → Earley-Recognize → CST-Build → AST-Build.
183///
184/// In Phase 0 wird `cfg.version`, `cfg.compat` und `cfg.vendor`
185/// noch nicht ausgewertet — die Grammar ist hardgecodet [`IDL_42`].
186/// Mit T6.x werden Versions-/Compat-/Vendor-Deltas wirksam (siehe
187/// [`crate::config`]).
188///
189/// # Errors
190/// - [`Error::Parse`]: Lexer-Fehler, Token-Mismatch, oder die Grammar
191///   akzeptiert die Token-Sequenz nicht.
192/// - [`Error::InvalidGrammar`]: Sollte nur bei korrupter Grammar-
193///   Konstante auftreten.
194/// - [`Error::AstBuild`]: CST-Struktur weicht von Grammar ab — Bug.
195pub fn parse(src: &str, cfg: &ParserConfig) -> Result<Specification, Error> {
196    let tokenizer = Tokenizer::for_grammar(&IDL_42);
197    let stream = tokenizer.tokenize(src).map_err(Error::Parse)?;
198    check_nesting_depth(stream.tokens())?;
199    let engine = Engine::new(&IDL_42);
200    let result = match engine.recognize(stream.tokens()) {
201        Ok(r) => r,
202        Err(EngineError::InvalidGrammar(report)) => {
203            return Err(Error::InvalidGrammar(format!(
204                "{} validation issues",
205                report.errors().count()
206            )));
207        }
208        Err(EngineError::NotAccepted { last_consumed }) => {
209            return Err(Error::Parse(ParseError::UnexpectedToken {
210                found: stream
211                    .tokens()
212                    .get(last_consumed)
213                    .map(|t| t.kind)
214                    .unwrap_or(crate::grammar::TokenKind::Ident),
215                expected: Vec::new(),
216                span: stream
217                    .tokens()
218                    .get(last_consumed)
219                    .map(|t| t.span)
220                    .unwrap_or(Span::SYNTHETIC),
221            }));
222        }
223    };
224    let cst = crate::cst::build_cst(engine.compiled_grammar(), stream.tokens(), &result)
225        .ok_or_else(|| {
226            Error::AstBuild(ast::BuilderError {
227                message: "CST reconstruction failed (recognition succeeded but tree invalid)"
228                    .to_string(),
229                span: Span::SYNTHETIC,
230            })
231        })?;
232    // Feature-Gate-Pass: lehne Konstrukte ab deren Feature in
233    // `cfg.features` aus ist. Bei Violations: alle gesammelt liefern.
234    let gate_errors = crate::features::gate::validate(&cst, &cfg.features);
235    if !gate_errors.is_empty() {
236        return Err(Error::FeaturesDisabled(gate_errors));
237    }
238    let ast = ast::build(&cst)?;
239    Ok(ast)
240}
241
242/// Wie [`parse`], aber mit zusaetzlichen Vendor-Grammar-Deltas (T6.5).
243///
244/// Komposition: Base-Grammar `IDL_42` + Deltas → [`CompiledGrammar`].
245/// Tokenizer-Rules werden aus der composed Grammar abgeleitet, damit
246/// Vendor-spezifische Keywords/Punctuation erkannt werden.
247///
248/// # Beispiel
249/// ```
250/// use zerodds_idl::config::ParserConfig;
251/// use zerodds_idl::grammar::deltas::RTI_CONNEXT;
252/// use zerodds_idl::parser::parse_with_deltas;
253///
254/// let src = r"
255///     struct Sensor { long id; double value; };
256///     keylist Sensor (id);
257/// ";
258/// let ast = parse_with_deltas(src, &ParserConfig::default(), &[&RTI_CONNEXT])
259///     .expect("RTI delta accepts keylist");
260/// assert_eq!(ast.definitions.len(), 2);
261/// ```
262///
263/// # Errors
264/// Wie [`parse`].
265pub fn parse_with_deltas(
266    src: &str,
267    cfg: &ParserConfig,
268    deltas: &[&GrammarDelta],
269) -> Result<Specification, Error> {
270    let _ = cfg; // aktuell ungenutzt.
271    let composed: CompiledGrammar = compose(&IDL_42, deltas);
272
273    // Token-Rules aus composed Grammar — damit Vendor-Keywords erkannt
274    // werden.
275    let rules = TokenRules::from_productions(composed.productions_iter());
276    let tokenizer = Tokenizer::new(rules);
277    let stream = tokenizer.tokenize(src).map_err(Error::Parse)?;
278    check_nesting_depth(stream.tokens())?;
279
280    // Validation auf der Base-Grammar (Delta-Validation kommt mit T6.9).
281    let base_report = validate(&IDL_42);
282    if base_report.has_errors() {
283        return Err(Error::InvalidGrammar(format!(
284            "{} validation issues",
285            base_report.errors().count()
286        )));
287    }
288
289    let result = Recognizer::new(&composed).recognize(stream.tokens());
290    if !result.accepted {
291        let last = stream.tokens().len();
292        return Err(Error::Parse(ParseError::UnexpectedToken {
293            found: stream
294                .tokens()
295                .get(last)
296                .map(|t| t.kind)
297                .unwrap_or(crate::grammar::TokenKind::Ident),
298            expected: Vec::new(),
299            span: stream
300                .tokens()
301                .get(last)
302                .map(|t| t.span)
303                .unwrap_or(Span::SYNTHETIC),
304        }));
305    }
306
307    let cst = crate::cst::build_cst(&composed, stream.tokens(), &result).ok_or_else(|| {
308        Error::AstBuild(ast::BuilderError {
309            message: "CST reconstruction failed (composed grammar)".to_string(),
310            span: Span::SYNTHETIC,
311        })
312    })?;
313    ast::build(&cst).map_err(Error::AstBuild)
314}
315
#[cfg(test)]
mod tests {
    #![allow(clippy::expect_used, clippy::panic, clippy::unwrap_used)]

    use super::*;
    use crate::ast::{Definition, TypeDecl};

    // Happy path: default config parses a minimal module.
    #[test]
    fn parse_empty_module_with_default_config() {
        let ast = parse("module Empty {};", &ParserConfig::default()).expect("parse");
        assert_eq!(ast.definitions.len(), 1);
        assert!(matches!(ast.definitions[0], Definition::Module(_)));
    }

    // A struct with members maps to a constructed type declaration.
    #[test]
    fn parse_struct_with_members() {
        let ast = parse(
            "struct Point { long x; long y; };",
            &ParserConfig::default(),
        )
        .expect("parse");
        assert_eq!(ast.definitions.len(), 1);
        assert!(matches!(
            ast.definitions[0],
            Definition::Type(TypeDecl::Constr(_))
        ));
    }

    // A lexer failure (non-IDL code point) surfaces as `Error::Parse`.
    #[test]
    fn parse_returns_error_for_lex_failure() {
        let res = parse("\u{4711} not idl", &ParserConfig::default());
        assert!(matches!(res, Err(Error::Parse(_))));
    }

    // Grammar rejection (unclosed brace) also surfaces as `Error::Parse`.
    #[test]
    fn parse_returns_error_for_grammar_rejection() {
        let res = parse("struct Foo {", &ParserConfig::default());
        assert!(matches!(res, Err(Error::Parse(_))));
    }

    #[test]
    fn parse_with_pragmatic_config_parses_empty_struct_member_list() {
        // Vendor-pragmatic: an empty member_list is allowed.
        let ast = parse("struct Empty {};", &ParserConfig::pragmatic_4_2()).expect("parse");
        assert_eq!(ast.definitions.len(), 1);
    }

    // Representative DDS topic: struct-level and member-level annotations.
    #[test]
    fn parse_handles_complex_dds_topic_pattern() {
        let src = r#"
            @topic
            @appendable
            struct Sensor {
                @key long sensor_id;
                double value;
                @optional string label;
            };
        "#;
        let ast = parse(src, &ParserConfig::default()).expect("parse");
        assert_eq!(ast.definitions.len(), 1);
    }

    // The RTI delta extends the grammar with the `keylist` construct.
    #[test]
    fn parse_with_rti_delta_accepts_keylist() {
        use crate::grammar::deltas::RTI_CONNEXT;
        let src = r"
            struct Sensor { long id; double value; };
            keylist Sensor (id);
        ";
        let result = parse_with_deltas(src, &ParserConfig::default(), &[&RTI_CONNEXT]);
        assert!(result.is_ok(), "expected ok, got {result:?}");
    }

    // Without the delta, `keylist` must be rejected by the base grammar.
    #[test]
    fn parse_without_rti_delta_rejects_keylist() {
        let src = r"
            struct Sensor { long id; double value; };
            keylist Sensor (id);
        ";
        let result = parse(src, &ParserConfig::default());
        assert!(
            matches!(result, Err(Error::Parse(_))),
            "expected parse error, got {result:?}"
        );
    }

    #[test]
    fn parse_with_rti_delta_accepts_multi_field_keylist() {
        use crate::grammar::deltas::RTI_CONNEXT;
        let src = r"
            struct Coord { long x; long y; long z; };
            keylist Coord (x, y, z);
        ";
        let result = parse_with_deltas(src, &ParserConfig::default(), &[&RTI_CONNEXT]);
        assert!(result.is_ok(), "expected ok, got {result:?}");
    }

    // Smoke test for the `Display` impl on the unified error type.
    #[test]
    fn error_display_does_not_panic() {
        let err = Error::AstBuild(ast::BuilderError {
            message: "x".to_string(),
            span: Span::new(0, 1),
        });
        let s = format!("{err}");
        assert!(s.contains("ast build error"));
    }
}