Skip to main content

dmc_parser/
parser.rs

1use crate::ast::*;
2use crate::refs::{RefMap, parse_link_ref_def};
3use dmc_diagnostic::Code;
4use dmc_diagnostic::metadata::{Origin, SourceMeta};
5use dmc_lexer::Lexer;
6use dmc_lexer::token::{Token, TokenKind};
7use duck_diagnostic::{Diagnostic, DiagnosticEngine, Span};
8use std::sync::Arc;
9
/// Dialect knobs that change parse behavior between strict CommonMark and
/// MDX. Default is MDX-friendly so capitalized JSX components round-trip as
/// `JsxElement` nodes; spec runners can flip `cm_strict_html_blocks` to
/// treat those capitalized tags as CM 4.6 type-7 raw HTML instead.
///
/// Every field is a `bool`, so the derived `Default` leaves all knobs off.
#[derive(Debug, Clone, Copy, Default)]
pub struct ParseOptions {
  /// CM 4.6 strict raw-HTML block detection. Treats uppercase JSX
  /// (`<Warning>`) as type-7 raw HTML instead of routing through the
  /// MDX `JsxElement` path. Spec runner only.
  pub cm_strict_html_blocks: bool,
  /// GFM autolink extension. Wraps `http(s)://` and `www....` runs in
  /// `Link` nodes during inline parsing. Default off so the
  /// `BareUrlAutolink` transformer owns this for MDX consumers.
  pub gfm_autolinks: bool,
  /// Legacy GFM 0.29 emphasis rendering. Flattens redundant nested
  /// `<strong>` / `<em>` structure so the GFM spec runner can keep the
  /// older delimiter behavior without regressing CommonMark 0.31.2.
  pub legacy_gfm_emphasis: bool,
}
29
/// Token-stream cursor + diagnostic engine. `'tokens` ties borrowed lexemes to
/// the source; `'eng` ties the engine borrow to the caller.
pub struct Parser<'eng, 'tokens> {
  /// Token stream being parsed; `pos` indexes into it.
  pub tokens: Vec<Token<'tokens>>,
  /// Shared source metadata; its `path` is attached to emitted diagnostics.
  pub meta: Arc<SourceMeta>,
  /// Cursor: index of the next token to be consumed from `tokens`.
  pub pos: usize,
  /// Link reference definitions harvested by the `collect_refs` pre-pass.
  pub refs: RefMap,
  /// Caller-owned sink that receives every diagnostic the parser emits.
  pub diag_engine: &'eng mut DiagnosticEngine<Code>,
  /// Dialect knobs in effect for this parse.
  pub options: ParseOptions,
  /// Original source string, if the caller supplied it (via
  /// `with_source`). Enables a safe, provenance-correct byte-offset
  /// reslice in `raw_source_for_token_range` instead of pointer
  /// arithmetic across token slices.
  pub source: Option<&'tokens str>,
  /// Current `[...]` link-label nesting depth. Recursive label parsing
  /// (and the unresolved-shortcut replay) is super-linear in the number
  /// of nested brackets; once this exceeds [`MAX_LINK_LABEL_DEPTH`] a
  /// `[` is treated as literal text instead of opening yet another
  /// recursive parse. No real document nests link labels that deeply
  /// (CM forbids links inside link text), so this only bounds adversarial
  /// `[[[[[...` input.
  pub link_label_depth: u16,
  /// Names of the JSX elements currently being parsed (outermost first).
  /// `parse_jsx` pushes the open-tag name before walking the element's
  /// children and pops it afterwards. Inline / block collection consults
  /// this stack so a `JsxCloseTagStart` that closes an *enclosing* JSX
  /// element terminates the run instead of being emitted as literal
  /// `</`, tag-name, `>` text. Empty at top level; lowercase HTML tags
  /// never enter here (they route through the CM raw-HTML path), so the
  /// stack only ever holds MDX component names.
  pub jsx_open_stack: Vec<String>,
}
62
/// Maximum `[...]` link-label nesting before `[` is treated as literal.
/// Compared against `Parser::link_label_depth` (same `u16` width).
/// Kept small because an unresolved-shortcut fallback re-parses its
/// label into the outer delimiter stack, so total work is exponential
/// in this depth on adversarial `[[[[...]]]]` input. CommonMark never
/// nests link labels more than a couple deep (links cannot contain
/// links), so 12 is far more than any real document needs.
pub(crate) const MAX_LINK_LABEL_DEPTH: u16 = 12;
70
71impl<'eng, 'tokens> Parser<'eng, 'tokens> {
72  /// Build a parser positioned at the first token.
73  pub fn new(
74    tokens: Vec<Token<'tokens>>,
75    meta: Arc<SourceMeta>,
76    diag_engine: &'eng mut DiagnosticEngine<Code>,
77  ) -> Self {
78    Self {
79      tokens,
80      meta,
81      pos: 0,
82      refs: RefMap::new(),
83      diag_engine,
84      options: ParseOptions::default(),
85      source: None,
86      link_label_depth: 0,
87      jsx_open_stack: Vec::new(),
88    }
89  }
90
91  /// Build a parser with explicit `ParseOptions`.
92  pub fn new_with_options(
93    tokens: Vec<Token<'tokens>>,
94    meta: Arc<SourceMeta>,
95    diag_engine: &'eng mut DiagnosticEngine<Code>,
96    options: ParseOptions,
97  ) -> Self {
98    Self {
99      tokens,
100      meta,
101      pos: 0,
102      refs: RefMap::new(),
103      diag_engine,
104      options,
105      source: None,
106      link_label_depth: 0,
107      jsx_open_stack: Vec::new(),
108    }
109  }
110
111  /// Attach the original source string so verbatim-slice reconstruction
112  /// (raw HTML blocks, malformed-link bodies) can reslice it directly
113  /// instead of reconstructing a pointer range across token lexemes.
114  pub fn with_source(mut self, source: &'tokens str) -> Self {
115    self.source = Some(source);
116    self
117  }
118
119  /// Drive the top-level loop until EOF. Force-advances on no-progress so a
120  /// malformed token cannot wedge the parser.
121  pub fn parse(&mut self) -> Document {
122    self.collect_refs();
123    let span = self.tokens.first().map(|t| t.span.clone()).unwrap_or_else(default_span);
124    let mut children = Vec::new();
125    while !self.is_eof() {
126      let before = self.pos;
127      if let Some(node) = self.parse_block() {
128        children.push(node);
129      }
130      if self.pos == before {
131        self.advance();
132      }
133    }
134    Document { children, span }
135  }
136
137  /// First pass: harvest every `LinkRefDef` token's `[label]: url "title"`
138  /// payload into `self.refs`. Cursor is left untouched; the main parse
139  /// loop then resolves shortcut / full / collapsed refs against the map.
140  fn collect_refs(&mut self) {
141    // CM 4.7: a link reference definition cannot interrupt a paragraph.
142    // Track per-line whether the current line started with a paragraph-
143    // worthy inline run; the line ends at SoftBreak/HardBreak. If a
144    // LinkRefDef appears on a line whose predecessor line was paragraph
145    // text (no intervening blank / heading / etc.), skip the def.
146    let mut in_paragraph = false;
147    let mut on_heading_line = false;
148    for tok in &self.tokens {
149      match &tok.kind {
150        TokenKind::LinkRefDef => {
151          if !in_paragraph && let Some((label, url, title)) = parse_link_ref_def(tok.raw) {
152            let url = crate::inline::decode_entities_in(&unescape_link_part(&url));
153            let title = title.map(|t| crate::inline::decode_entities_in(&unescape_link_part(&t)));
154            self.refs.insert(&label, url, title);
155          }
156        },
157        TokenKind::BlankLine
158        | TokenKind::CodeFenceOpen(_, _)
159        | TokenKind::CodeFenceClose(_, _)
160        | TokenKind::ThematicBreak
161        | TokenKind::FrontmatterEnd(_) => {
162          in_paragraph = false;
163          on_heading_line = false;
164        },
165        TokenKind::Heading(_) => {
166          // ATX heading line: content on this line is heading content,
167          // not a paragraph. After the line break, in_paragraph resets.
168          in_paragraph = false;
169          on_heading_line = true;
170        },
171        TokenKind::BlockQuoteMarker => {
172          in_paragraph = false;
173          on_heading_line = false;
174        },
175        TokenKind::SoftBreak | TokenKind::HardBreak => {
176          if on_heading_line {
177            in_paragraph = false;
178          }
179          on_heading_line = false;
180        },
181        TokenKind::Whitespace(_) | TokenKind::Eof => {},
182        _ => {
183          if !on_heading_line {
184            in_paragraph = true;
185          }
186        },
187      }
188    }
189  }
190
  /// Forward a fully-built diagnostic to the engine.
  ///
  /// Thin wrapper over `self.diag_engine.emit`; the diagnostic is handed
  /// over unchanged.
  pub(crate) fn emit_diagnostic(&mut self, diagnostic: Diagnostic<Code>) {
    self.diag_engine.emit(diagnostic);
  }
195
196  /// Build a primary-labelled diagnostic at the cursor and emit it.
197  pub(crate) fn diag(&mut self, code: Code, message: impl Into<String>) {
198    let (line, column) = self.tokens.get(self.pos).map(|t| (t.span.line, t.span.column)).unwrap_or((0, 0));
199    let span = Span::from_zero_based(self.meta.path.clone(), line, column, 1);
200    self.emit_diagnostic(duck_diagnostic::diag!(code, span, message.into()));
201  }
202
  /// Sugar for emitting a warning-severity diagnostic.
  ///
  /// Currently a straight pass-through to [`Self::diag`]: nothing here
  /// sets a severity, so the warning classification presumably comes from
  /// `code` itself (NOTE(review): confirm against `Code` / the `diag!` macro).
  pub(crate) fn warn(&mut self, code: Code, message: impl Into<String>) {
    self.diag(code, message);
  }
207
208  /// Span of an arbitrary token position, or a default EOF-adjacent span.
209  pub(crate) fn span_at(&self, pos: usize) -> Span {
210    self.tokens.get(pos).map(|t| t.span.clone()).unwrap_or_else(default_span)
211  }
212
213  /// Rebuild the verbatim source slice covered by `tokens[start..end)`.
214  /// Returns an empty string for empty / invalid ranges.
215  ///
216  /// When the caller attached the original source (`with_source`), the
217  /// span is recovered as a safe byte-offset reslice of that `&str` -
218  /// no `unsafe`, no provenance hazard. Without it (a few sample bins
219  /// and the inline-string helper), we fall back to concatenating the
220  /// covered tokens' lexemes; that loses any JSX-internal whitespace
221  /// the lexer normalized away, but those callers don't reconstruct
222  /// raw HTML blocks where that distinction matters.
223  pub(crate) fn raw_source_for_token_range(&self, start: usize, end: usize) -> String {
224    if start >= end {
225      return String::new();
226    }
227    let Some(start_tok) = self.tokens.get(start) else {
228      return String::new();
229    };
230    let Some(end_tok) = self.tokens.get(end - 1) else {
231      return String::new();
232    };
233
234    if let Some(source) = self.source {
235      let base = source.as_ptr() as usize;
236      let src_lo = base;
237      let src_hi = base + source.len();
238      let lo = start_tok.raw.as_ptr() as usize;
239      let hi = end_tok.raw.as_ptr() as usize + end_tok.raw.len();
240      debug_assert!(lo <= hi, "token slice start pointer exceeded end pointer");
241      debug_assert!(lo >= src_lo, "token slice start pointer fell before the source buffer");
242      debug_assert!(hi <= src_hi, "token slice end pointer exceeded the source buffer");
243      if lo < src_lo || hi > src_hi || lo > hi {
244        return String::new();
245      }
246      let off_lo = lo - base;
247      let off_hi = hi - base;
248      // `&str` indexing handles the UTF-8 boundary check; these offsets
249      // came from `Token.raw` slices of `source`, so they're aligned.
250      return source.get(off_lo..off_hi).map(|s| s.to_string()).unwrap_or_default();
251    }
252
253    // Fallback: concatenate the covered tokens' raw lexemes.
254    let mut out = String::new();
255    for tok in &self.tokens[start..end] {
256      out.push_str(tok.raw);
257    }
258    out
259  }
260
261  /// Span of the token at the cursor, or a default span at EOF.
262  pub(crate) fn current_span(&self) -> Span {
263    self.tokens.get(self.pos).map(|t| t.span.clone()).unwrap_or_else(default_span)
264  }
265
  /// Token under the cursor (no consume). `None` once the cursor is past
  /// the last token.
  ///
  /// The returned reference (and the token's lexeme lifetime) is tied to
  /// this `&self` borrow; use `peek_raw` when the lexeme must outlive it.
  pub(crate) fn peek(&'_ self) -> Option<&'_ Token<'_>> {
    self.tokens.get(self.pos)
  }
270
271  /// Kind of the token under the cursor (no consume).
272  pub(crate) fn peek_kind(&self) -> Option<&TokenKind> {
273    self.tokens.get(self.pos).map(|t| &t.kind)
274  }
275
276  /// Raw lexeme of the upcoming token with its source-tied `'tokens` lifetime,
277  /// decoupled from the `&self` borrow so callers can hold it across mutations.
278  pub(crate) fn peek_raw(&self) -> Option<&'tokens str> {
279    self.tokens.get(self.pos).map(|t| t.raw)
280  }
281
282  /// Consume one token and return it. No-op at EOF.
283  pub(crate) fn advance(&'_ mut self) -> Option<&'_ Token<'_>> {
284    let t = self.tokens.get(self.pos);
285    if t.is_some() {
286      self.pos += 1;
287    }
288    t
289  }
290
291  /// True at the `Eof` token or past the end of the stream.
292  pub(crate) fn is_eof(&self) -> bool {
293    matches!(self.peek_kind(), Some(TokenKind::Eof) | None)
294  }
295}
296
/// CM-escape decoder for link destinations and titles harvested from
/// `LinkRefDef` tokens. Mirrors the inline path's `unescape_markdown`.
///
/// A backslash followed by an ASCII punctuation character is replaced by
/// that character; every other character - including a backslash that
/// precedes a non-punctuation character or ends the string - is copied
/// through unchanged.
///
/// The scan is `char`-based so multi-byte UTF-8 text survives intact.
/// (Pushing individual bytes as `char`s would re-encode each byte of a
/// non-ASCII character separately and corrupt it.)
fn unescape_link_part(s: &str) -> String {
  // Fast path: nothing to decode.
  if !s.contains('\\') {
    return s.to_string();
  }
  let mut out = String::with_capacity(s.len());
  let mut chars = s.chars().peekable();
  while let Some(ch) = chars.next() {
    // Only consume the following character when it is escapable; otherwise
    // the backslash is literal and the next character is handled normally.
    let escaped = if ch == '\\' { chars.next_if(char::is_ascii_punctuation) } else { None };
    out.push(escaped.unwrap_or(ch));
  }
  out
}
354
355/// Lex + parse `source` in one shot, dropping all diagnostics. Convenience for
356/// tests + the `parse` bin; production callers should construct their own
357/// `DiagnosticEngine`.
358pub fn parse(source: &str) -> Document {
359  parse_with(source, ParseOptions::default())
360}
361
362/// `parse` variant with explicit `ParseOptions`. Used by the CM spec
363/// runner to opt into CM-strict HTML block detection.
364pub fn parse_with(source: &str, options: ParseOptions) -> Document {
365  let meta = Arc::from(SourceMeta { path: Arc::from("<inline>"), origin: Origin::Inline("<inline>") });
366  let mut lex_engine = DiagnosticEngine::new();
367  let mut lexer = Lexer::new(source, meta.clone(), &mut lex_engine);
368  let _ = lexer.scan_tokens();
369  let tokens = std::mem::take(&mut lexer.tokens);
370  drop(lexer);
371
372  let mut parse_engine = DiagnosticEngine::new();
373  let mut p = Parser::new_with_options(tokens, meta, &mut parse_engine, options).with_source(source);
374  p.parse()
375}
376
377/// Lex `s` and run the inline parser on it. Returns the inline `Node`
378/// list (Text, InlineCode, Bold, Italic, Strikethrough, Link, ...).
379/// Used by table cells, which receive raw cell strings rather than
380/// pre-tokenised inline content.
381pub fn parse_inline_str(s: &str) -> Vec<crate::ast::Node> {
382  let meta = Arc::from(SourceMeta { path: Arc::from("<inline>"), origin: Origin::Inline("<inline>") });
383  let mut lex_engine = DiagnosticEngine::new();
384  let mut lexer = Lexer::new(s, meta.clone(), &mut lex_engine);
385  let _ = lexer.scan_tokens();
386  let tokens = std::mem::take(&mut lexer.tokens);
387  drop(lexer);
388  let mut parse_engine = DiagnosticEngine::new();
389  let mut p = Parser::new(tokens, meta, &mut parse_engine).with_source(s);
390  p.collect_inline_until_break()
391}