// dmc_parser/parser.rs

1use crate::ast::*;
2use crate::refs::{RefMap, parse_link_ref_def};
3use dmc_diagnostic::Code;
4use dmc_diagnostic::metadata::{Origin, SourceMeta};
5use dmc_lexer::Lexer;
6use dmc_lexer::token::{Token, TokenKind};
7use duck_diagnostic::{Diagnostic, DiagnosticEngine, Span};
8use std::sync::Arc;
9
/// Dialect knobs. Default is MDX-friendly; spec runners flip these to match
/// strict CommonMark / GFM semantics.
///
/// Every flag defaults to `false` (derived `Default`). The type is a plain
/// bundle of `bool`s, so it is `Copy` and comparable with `==`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ParseOptions {
  /// CM 4.6 strict raw-HTML detection. Treats uppercase JSX (`<Warning>`) as
  /// type-7 raw HTML instead of `JsxElement`. Spec runner only.
  pub cm_strict_html_blocks: bool,
  /// GFM autolink extension. Wraps `http(s)://` / `www....` in `Link` nodes
  /// at parse time. Default off so the `BareUrlAutolink` transformer owns
  /// this for MDX consumers.
  pub gfm_autolinks: bool,
  /// Legacy GFM 0.29 emphasis: flatten redundant nested `<strong>` / `<em>`.
  /// Lets the GFM spec runner keep older delimiter behavior without
  /// regressing CM 0.31.2.
  pub legacy_gfm_emphasis: bool,
}
26
/// Token-stream cursor + diagnostic engine. `'tokens` ties borrowed lexemes
/// to the source; `'eng` ties the engine borrow to the caller.
pub struct Parser<'eng, 'tokens> {
  /// Token stream being parsed; `pos` indexes into it.
  pub tokens: Vec<Token<'tokens>>,
  /// Source metadata. `meta.path` labels the spans of diagnostics built by
  /// `diag`.
  pub meta: Arc<SourceMeta>,
  /// Index of the current (next unconsumed) token in `tokens`.
  pub pos: usize,
  /// Link reference definitions harvested from `LinkRefDef` tokens by the
  /// `collect_refs` pre-pass that `parse` runs before any block parsing.
  pub refs: RefMap,
  /// Caller-owned engine that receives every diagnostic the parser emits.
  pub diag_engine: &'eng mut DiagnosticEngine<Code>,
  /// Dialect options in effect for this parse.
  pub options: ParseOptions,
  /// Original source (`with_source`). Enables a provenance-correct
  /// byte-offset reslice in `raw_source_for_token_range`.
  pub source: Option<&'tokens str>,
  /// Current `[...]` link-label nesting depth. Unresolved-shortcut replay is
  /// super-linear in this depth; above [`MAX_LINK_LABEL_DEPTH`] a `[` becomes
  /// literal text. CM forbids links inside link text so this only bounds
  /// adversarial `[[[[[...` input.
  pub link_label_depth: u16,
  /// JSX elements currently being parsed (outermost first). `parse_jsx`
  /// pushes the open-tag name and pops on close. Inline / block collection
  /// consults this so a `JsxCloseTagStart` for an enclosing element
  /// terminates the run instead of leaking as literal text. Lowercase HTML
  /// tags never push here; only MDX component names.
  pub jsx_open_stack: Vec<String>,
  /// Current recursive-descent block-nesting depth (lists, blockquotes,
  /// JSX children). Bounded by [`MAX_BLOCK_NESTING_DEPTH`] so adversarial
  /// deeply-nested input (`>>>>...`, `- - - ...`, nested `<div>`) cannot
  /// overflow the stack. See SEC-003.
  pub block_depth: usize,
}
56
/// Maximum recursive-descent block-nesting depth before the parser stops
/// recursing and treats the remaining content as literal text. A
/// recursive-descent parser uses real stack frames per level; 128 leaves
/// generous headroom for legitimate documents while bounding hostile
/// input well below a stack overflow. Compared against
/// `Parser::block_depth`.
pub(crate) const MAX_BLOCK_NESTING_DEPTH: usize = 128;

/// Maximum `[...]` link-label nesting before `[` is treated as literal. The
/// unresolved-shortcut fallback re-parses its label into the outer delimiter
/// stack, so total work is exponential in this depth on `[[[[...]]]]` input.
/// Typed `u16` to match `Parser::link_label_depth`.
pub(crate) const MAX_LINK_LABEL_DEPTH: u16 = 12;
68
69impl<'eng, 'tokens> Parser<'eng, 'tokens> {
70  pub fn new(
71    tokens: Vec<Token<'tokens>>,
72    meta: Arc<SourceMeta>,
73    diag_engine: &'eng mut DiagnosticEngine<Code>,
74  ) -> Self {
75    Self {
76      tokens,
77      meta,
78      pos: 0,
79      refs: RefMap::new(),
80      diag_engine,
81      options: ParseOptions::default(),
82      source: None,
83      link_label_depth: 0,
84      jsx_open_stack: Vec::new(),
85      block_depth: 0,
86    }
87  }
88
89  pub fn new_with_options(
90    tokens: Vec<Token<'tokens>>,
91    meta: Arc<SourceMeta>,
92    diag_engine: &'eng mut DiagnosticEngine<Code>,
93    options: ParseOptions,
94  ) -> Self {
95    Self {
96      tokens,
97      meta,
98      pos: 0,
99      refs: RefMap::new(),
100      diag_engine,
101      options,
102      source: None,
103      link_label_depth: 0,
104      jsx_open_stack: Vec::new(),
105      block_depth: 0,
106    }
107  }
108
109  /// Attach the original source so verbatim-slice reconstruction (raw HTML
110  /// blocks, malformed-link bodies) can reslice it directly.
111  pub fn with_source(mut self, source: &'tokens str) -> Self {
112    self.source = Some(source);
113    self
114  }
115
116  /// Drive the top-level loop until EOF. Force-advances on no-progress so a
117  /// malformed token cannot wedge the cursor.
118  pub fn parse(&mut self) -> Document {
119    self.collect_refs();
120    let span = self.tokens.first().map(|t| t.span.clone()).unwrap_or_else(default_span);
121    let mut children = Vec::new();
122    while !self.is_eof() {
123      let before = self.pos;
124      if let Some(node) = self.parse_block() {
125        children.push(node);
126      }
127      if self.pos == before {
128        self.advance();
129      }
130    }
131    Document { children, span }
132  }
133
134  /// First pass: harvest every `LinkRefDef` token's `[label]: url "title"`
135  /// payload into `self.refs`. CM 4.7: a ref-def cannot interrupt a
136  /// paragraph, so skip defs on lines whose predecessor was paragraph text.
137  fn collect_refs(&mut self) {
138    let mut in_paragraph = false;
139    let mut on_heading_line = false;
140    for tok in &self.tokens {
141      match &tok.kind {
142        TokenKind::LinkRefDef => {
143          if !in_paragraph && let Some((label, url, title)) = parse_link_ref_def(tok.raw) {
144            let url = crate::inline::decode_entities_in(&unescape_link_part(&url));
145            let title = title.map(|t| crate::inline::decode_entities_in(&unescape_link_part(&t)));
146            self.refs.insert(&label, url, title);
147          }
148        },
149        TokenKind::BlankLine
150        | TokenKind::CodeFenceOpen(_, _)
151        | TokenKind::CodeFenceClose(_, _)
152        | TokenKind::ThematicBreak
153        | TokenKind::FrontmatterEnd(_) => {
154          in_paragraph = false;
155          on_heading_line = false;
156        },
157        TokenKind::Heading(_) => {
158          in_paragraph = false;
159          on_heading_line = true;
160        },
161        TokenKind::BlockQuoteMarker => {
162          in_paragraph = false;
163          on_heading_line = false;
164        },
165        TokenKind::SoftBreak | TokenKind::HardBreak => {
166          if on_heading_line {
167            in_paragraph = false;
168          }
169          on_heading_line = false;
170        },
171        TokenKind::Whitespace(_) | TokenKind::Eof => {},
172        _ => {
173          if !on_heading_line {
174            in_paragraph = true;
175          }
176        },
177      }
178    }
179  }
180
181  pub(crate) fn emit_diagnostic(&mut self, diagnostic: Diagnostic<Code>) {
182    self.diag_engine.emit(diagnostic);
183  }
184
185  pub(crate) fn diag(&mut self, code: Code, message: impl Into<String>) {
186    let (line, column) = self.tokens.get(self.pos).map(|t| (t.span.line, t.span.column)).unwrap_or((0, 0));
187    let span = Span::from_zero_based(self.meta.path.clone(), line, column, 1);
188    self.emit_diagnostic(duck_diagnostic::diag!(code, span, message.into()));
189  }
190
191  pub(crate) fn warn(&mut self, code: Code, message: impl Into<String>) {
192    self.diag(code, message);
193  }
194
195  pub(crate) fn span_at(&self, pos: usize) -> Span {
196    self.tokens.get(pos).map(|t| t.span.clone()).unwrap_or_else(default_span)
197  }
198
199  /// Rebuild the verbatim source slice covered by `tokens[start..end)`.
200  /// With `with_source`, reslices the original `&str` directly. Without it,
201  /// concatenates token lexemes - loses any JSX-internal whitespace the
202  /// lexer normalized away, but callers that need that whitespace always
203  /// attach a source.
204  pub(crate) fn raw_source_for_token_range(&self, start: usize, end: usize) -> String {
205    if start >= end {
206      return String::new();
207    }
208    let Some(start_tok) = self.tokens.get(start) else {
209      return String::new();
210    };
211    let Some(end_tok) = self.tokens.get(end - 1) else {
212      return String::new();
213    };
214
215    if let Some(source) = self.source {
216      let base = source.as_ptr() as usize;
217      let src_lo = base;
218      let src_hi = base + source.len();
219      let lo = start_tok.raw.as_ptr() as usize;
220      let hi = end_tok.raw.as_ptr() as usize + end_tok.raw.len();
221      debug_assert!(lo <= hi, "token slice start pointer exceeded end pointer");
222      debug_assert!(lo >= src_lo, "token slice start pointer fell before the source buffer");
223      debug_assert!(hi <= src_hi, "token slice end pointer exceeded the source buffer");
224      if lo < src_lo || hi > src_hi || lo > hi {
225        return String::new();
226      }
227      let off_lo = lo - base;
228      let off_hi = hi - base;
229      return source.get(off_lo..off_hi).map(|s| s.to_string()).unwrap_or_default();
230    }
231
232    let mut out = String::new();
233    for tok in &self.tokens[start..end] {
234      out.push_str(tok.raw);
235    }
236    out
237  }
238
239  pub(crate) fn current_span(&self) -> Span {
240    self.tokens.get(self.pos).map(|t| t.span.clone()).unwrap_or_else(default_span)
241  }
242
243  pub(crate) fn peek(&'_ self) -> Option<&'_ Token<'_>> {
244    self.tokens.get(self.pos)
245  }
246
247  pub(crate) fn peek_kind(&self) -> Option<&TokenKind> {
248    self.tokens.get(self.pos).map(|t| &t.kind)
249  }
250
251  /// Raw lexeme with its source-tied `'tokens` lifetime, decoupled from the
252  /// `&self` borrow so callers can hold it across mutations.
253  pub(crate) fn peek_raw(&self) -> Option<&'tokens str> {
254    self.tokens.get(self.pos).map(|t| t.raw)
255  }
256
257  pub(crate) fn advance(&'_ mut self) -> Option<&'_ Token<'_>> {
258    let t = self.tokens.get(self.pos);
259    if t.is_some() {
260      self.pos += 1;
261    }
262    t
263  }
264
265  pub(crate) fn is_eof(&self) -> bool {
266    matches!(self.peek_kind(), Some(TokenKind::Eof) | None)
267  }
268}
269
/// CM-escape decoder for link destinations/titles in `LinkRefDef` tokens.
/// Mirrors the inline path's `unescape_markdown`.
///
/// A backslash followed by an ASCII punctuation character is replaced by
/// that character; every other backslash (including a trailing one) is kept
/// literally. Input without a backslash is returned unchanged.
///
/// The scan works on `char`s, not bytes, so non-ASCII text such as `café`
/// survives intact; pushing individual UTF-8 bytes as `char`s would
/// re-encode each byte as a Latin-1 code point and corrupt the string.
fn unescape_link_part(s: &str) -> String {
  if !s.contains('\\') {
    return s.to_string();
  }
  let mut out = String::with_capacity(s.len());
  let mut chars = s.chars().peekable();
  while let Some(c) = chars.next() {
    if c == '\\' {
      // Consume the next char only if it is escapable; otherwise the
      // backslash is literal and the next char is handled on its own turn.
      if let Some(escaped) = chars.next_if(|n| n.is_ascii_punctuation()) {
        out.push(escaped);
        continue;
      }
    }
    out.push(c);
  }
  out
}
327
328/// Lex + parse in one shot, dropping all diagnostics. Tests + the `parse`
329/// bin; production callers should construct their own `DiagnosticEngine`.
330pub fn parse(source: &str) -> Document {
331  parse_with(source, ParseOptions::default())
332}
333
334/// `parse` with explicit `ParseOptions`.
335pub fn parse_with(source: &str, options: ParseOptions) -> Document {
336  let meta = Arc::from(SourceMeta { path: Arc::from("<inline>"), origin: Origin::Inline("<inline>") });
337  let mut lex_engine = DiagnosticEngine::new();
338  let mut lexer = Lexer::new(source, meta.clone(), &mut lex_engine);
339  let _ = lexer.scan_tokens();
340  let tokens = std::mem::take(&mut lexer.tokens);
341  drop(lexer);
342
343  let mut parse_engine = DiagnosticEngine::new();
344  let mut p = Parser::new_with_options(tokens, meta, &mut parse_engine, options).with_source(source);
345  p.parse()
346}
347
348/// Lex `s` and run the inline parser. Used by table cells, which receive
349/// raw cell strings rather than pre-tokenised inline content.
350pub fn parse_inline_str(s: &str) -> Vec<crate::ast::Node> {
351  let meta = Arc::from(SourceMeta { path: Arc::from("<inline>"), origin: Origin::Inline("<inline>") });
352  let mut lex_engine = DiagnosticEngine::new();
353  let mut lexer = Lexer::new(s, meta.clone(), &mut lex_engine);
354  let _ = lexer.scan_tokens();
355  let tokens = std::mem::take(&mut lexer.tokens);
356  drop(lexer);
357  let mut parse_engine = DiagnosticEngine::new();
358  let mut p = Parser::new(tokens, meta, &mut parse_engine).with_source(s);
359  p.collect_inline_until_break()
360}