Skip to main content

dmc_parser/
parser.rs

1use crate::ast::*;
2use crate::refs::{RefMap, parse_link_ref_def};
3use dmc_diagnostic::Code;
4use dmc_diagnostic::metadata::{Origin, SourceMeta};
5use dmc_lexer::Lexer;
6use dmc_lexer::token::{Token, TokenKind};
7use duck_diagnostic::{Diagnostic, DiagnosticEngine, Span};
8use std::sync::Arc;
9
/// Dialect knobs that change parse behavior between strict CommonMark and
/// MDX. Default is MDX-friendly so capital JSX components round-trip as
/// `JsxElement` nodes; spec runners can flip `cm_strict_html_blocks` to
/// treat capitalized tags as CM 4.6 type-7 raw HTML instead.
///
/// Every knob is a `bool`, so the derived `Default` leaves all of them off.
#[derive(Debug, Clone, Copy, Default)]
pub struct ParseOptions {
  /// CM 4.6 strict raw-HTML block detection. Treats uppercase JSX
  /// (`<Warning>`) as type-7 raw HTML instead of routing through the
  /// MDX `JsxElement` path. Spec runner only.
  pub cm_strict_html_blocks: bool,
  /// GFM autolink extension. Wraps `http(s)://` and `www....` runs in
  /// `Link` nodes during inline parsing. Default off so the
  /// `BareUrlAutolink` transformer owns this for MDX consumers.
  pub gfm_autolinks: bool,
  /// Legacy GFM 0.29 emphasis rendering. Flattens redundant nested
  /// `<strong>` / `<em>` structure so the GFM spec runner can keep the
  /// older delimiter behavior without regressing CommonMark 0.31.2.
  pub legacy_gfm_emphasis: bool,
}
29
/// Token-stream cursor + diagnostic engine. `'tokens` ties borrowed lexemes to
/// the source; `'eng` ties the engine borrow to the caller.
pub struct Parser<'eng, 'tokens> {
  /// Token stream being parsed; `pos` indexes into it.
  pub tokens: Vec<Token<'tokens>>,
  /// Metadata of the source being parsed. Its `path` is stamped onto the
  /// spans of diagnostics built by `diag`.
  pub meta: Arc<SourceMeta>,
  /// Index of the token under the cursor. Starts at 0 and is bumped by
  /// `advance`.
  pub pos: usize,
  /// Link reference definitions harvested by `collect_refs` before the
  /// main parse loop runs.
  pub refs: RefMap,
  /// Caller-owned engine that receives every diagnostic emitted here.
  pub diag_engine: &'eng mut DiagnosticEngine<Code>,
  /// Dialect knobs; `ParseOptions::default()` unless the parser was built
  /// with `new_with_options`.
  pub options: ParseOptions,
  /// Original source string, if the caller supplied it (via
  /// `with_source`). Lets `raw_source_for_token_range` recover a token
  /// range as a bounds-checked byte-offset reslice of this string instead
  /// of concatenating the covered tokens' lexemes.
  pub source: Option<&'tokens str>,
  /// Current `[...]` link-label nesting depth. Recursive label parsing
  /// (and the unresolved-shortcut replay) is super-linear in the number
  /// of nested brackets; once this exceeds [`MAX_LINK_LABEL_DEPTH`] a
  /// `[` is treated as literal text instead of opening yet another
  /// recursive parse. No real document nests link labels that deeply
  /// (CM forbids links inside link text), so this only bounds adversarial
  /// `[[[[[...` input.
  pub link_label_depth: u16,
}
53
/// Maximum `[...]` link-label nesting before `[` is treated as literal.
/// Compared against `Parser::link_label_depth`, hence the matching `u16`.
///
/// Kept small because an unresolved-shortcut fallback re-parses its
/// label into the outer delimiter stack, so total work is exponential
/// in this depth on adversarial `[[[[...]]]]` input. CommonMark never
/// nests link labels more than a couple deep (links cannot contain
/// links), so 12 is far more than any real document needs.
pub(crate) const MAX_LINK_LABEL_DEPTH: u16 = 12;
61
62impl<'eng, 'tokens> Parser<'eng, 'tokens> {
63  /// Build a parser positioned at the first token.
64  pub fn new(
65    tokens: Vec<Token<'tokens>>,
66    meta: Arc<SourceMeta>,
67    diag_engine: &'eng mut DiagnosticEngine<Code>,
68  ) -> Self {
69    Self {
70      tokens,
71      meta,
72      pos: 0,
73      refs: RefMap::new(),
74      diag_engine,
75      options: ParseOptions::default(),
76      source: None,
77      link_label_depth: 0,
78    }
79  }
80
81  /// Build a parser with explicit `ParseOptions`.
82  pub fn new_with_options(
83    tokens: Vec<Token<'tokens>>,
84    meta: Arc<SourceMeta>,
85    diag_engine: &'eng mut DiagnosticEngine<Code>,
86    options: ParseOptions,
87  ) -> Self {
88    Self { tokens, meta, pos: 0, refs: RefMap::new(), diag_engine, options, source: None, link_label_depth: 0 }
89  }
90
91  /// Attach the original source string so verbatim-slice reconstruction
92  /// (raw HTML blocks, malformed-link bodies) can reslice it directly
93  /// instead of reconstructing a pointer range across token lexemes.
94  pub fn with_source(mut self, source: &'tokens str) -> Self {
95    self.source = Some(source);
96    self
97  }
98
99  /// Drive the top-level loop until EOF. Force-advances on no-progress so a
100  /// malformed token cannot wedge the parser.
101  pub fn parse(&mut self) -> Document {
102    self.collect_refs();
103    let span = self.tokens.first().map(|t| t.span.clone()).unwrap_or_else(default_span);
104    let mut children = Vec::new();
105    while !self.is_eof() {
106      let before = self.pos;
107      if let Some(node) = self.parse_block() {
108        children.push(node);
109      }
110      if self.pos == before {
111        self.advance();
112      }
113    }
114    Document { children, span }
115  }
116
117  /// First pass: harvest every `LinkRefDef` token's `[label]: url "title"`
118  /// payload into `self.refs`. Cursor is left untouched; the main parse
119  /// loop then resolves shortcut / full / collapsed refs against the map.
120  fn collect_refs(&mut self) {
121    // CM 4.7: a link reference definition cannot interrupt a paragraph.
122    // Track per-line whether the current line started with a paragraph-
123    // worthy inline run; the line ends at SoftBreak/HardBreak. If a
124    // LinkRefDef appears on a line whose predecessor line was paragraph
125    // text (no intervening blank / heading / etc.), skip the def.
126    let mut in_paragraph = false;
127    let mut on_heading_line = false;
128    for tok in &self.tokens {
129      match &tok.kind {
130        TokenKind::LinkRefDef => {
131          if !in_paragraph && let Some((label, url, title)) = parse_link_ref_def(tok.raw) {
132            let url = crate::inline::decode_entities_in(&unescape_link_part(&url));
133            let title = title.map(|t| crate::inline::decode_entities_in(&unescape_link_part(&t)));
134            self.refs.insert(&label, url, title);
135          }
136        },
137        TokenKind::BlankLine
138        | TokenKind::CodeFenceOpen(_, _)
139        | TokenKind::CodeFenceClose(_, _)
140        | TokenKind::ThematicBreak
141        | TokenKind::FrontmatterEnd(_) => {
142          in_paragraph = false;
143          on_heading_line = false;
144        },
145        TokenKind::Heading(_) => {
146          // ATX heading line: content on this line is heading content,
147          // not a paragraph. After the line break, in_paragraph resets.
148          in_paragraph = false;
149          on_heading_line = true;
150        },
151        TokenKind::BlockQuoteMarker => {
152          in_paragraph = false;
153          on_heading_line = false;
154        },
155        TokenKind::SoftBreak | TokenKind::HardBreak => {
156          if on_heading_line {
157            in_paragraph = false;
158          }
159          on_heading_line = false;
160        },
161        TokenKind::Whitespace(_) | TokenKind::Eof => {},
162        _ => {
163          if !on_heading_line {
164            in_paragraph = true;
165          }
166        },
167      }
168    }
169  }
170
  /// Forward a fully-built diagnostic to the engine borrowed by this
  /// parser (`diag_engine`). The diagnostic is passed through unmodified.
  pub(crate) fn emit_diagnostic(&mut self, diagnostic: Diagnostic<Code>) {
    self.diag_engine.emit(diagnostic);
  }
175
176  /// Build a primary-labelled diagnostic at the cursor and emit it.
177  pub(crate) fn diag(&mut self, code: Code, message: impl Into<String>) {
178    let (line, column) = self.tokens.get(self.pos).map(|t| (t.span.line, t.span.column)).unwrap_or((0, 0));
179    let span = Span::from_zero_based(self.meta.path.clone(), line, column, 1);
180    self.emit_diagnostic(duck_diagnostic::diag!(code, span, message.into()));
181  }
182
  /// Sugar intended for warning-severity diagnostics. Currently forwards
  /// `code` and `message` to `diag` unchanged.
  // NOTE(review): nothing in this method sets a severity; presumably it is
  // derived from `code` by the diagnostic machinery — confirm.
  pub(crate) fn warn(&mut self, code: Code, message: impl Into<String>) {
    self.diag(code, message);
  }
187
188  /// Span of an arbitrary token position, or a default EOF-adjacent span.
189  pub(crate) fn span_at(&self, pos: usize) -> Span {
190    self.tokens.get(pos).map(|t| t.span.clone()).unwrap_or_else(default_span)
191  }
192
193  /// Rebuild the verbatim source slice covered by `tokens[start..end)`.
194  /// Returns an empty string for empty / invalid ranges.
195  ///
196  /// When the caller attached the original source (`with_source`), the
197  /// span is recovered as a safe byte-offset reslice of that `&str` -
198  /// no `unsafe`, no provenance hazard. Without it (a few sample bins
199  /// and the inline-string helper), we fall back to concatenating the
200  /// covered tokens' lexemes; that loses any JSX-internal whitespace
201  /// the lexer normalized away, but those callers don't reconstruct
202  /// raw HTML blocks where that distinction matters.
203  pub(crate) fn raw_source_for_token_range(&self, start: usize, end: usize) -> String {
204    if start >= end {
205      return String::new();
206    }
207    let Some(start_tok) = self.tokens.get(start) else {
208      return String::new();
209    };
210    let Some(end_tok) = self.tokens.get(end - 1) else {
211      return String::new();
212    };
213
214    if let Some(source) = self.source {
215      let base = source.as_ptr() as usize;
216      let src_lo = base;
217      let src_hi = base + source.len();
218      let lo = start_tok.raw.as_ptr() as usize;
219      let hi = end_tok.raw.as_ptr() as usize + end_tok.raw.len();
220      debug_assert!(lo <= hi, "token slice start pointer exceeded end pointer");
221      debug_assert!(lo >= src_lo, "token slice start pointer fell before the source buffer");
222      debug_assert!(hi <= src_hi, "token slice end pointer exceeded the source buffer");
223      if lo < src_lo || hi > src_hi || lo > hi {
224        return String::new();
225      }
226      let off_lo = lo - base;
227      let off_hi = hi - base;
228      // `&str` indexing handles the UTF-8 boundary check; these offsets
229      // came from `Token.raw` slices of `source`, so they're aligned.
230      return source.get(off_lo..off_hi).map(|s| s.to_string()).unwrap_or_default();
231    }
232
233    // Fallback: concatenate the covered tokens' raw lexemes.
234    let mut out = String::new();
235    for tok in &self.tokens[start..end] {
236      out.push_str(tok.raw);
237    }
238    out
239  }
240
241  /// Span of the token at the cursor, or a default span at EOF.
242  pub(crate) fn current_span(&self) -> Span {
243    self.tokens.get(self.pos).map(|t| t.span.clone()).unwrap_or_else(default_span)
244  }
245
  /// Token under the cursor (no consume). Returns `None` once `pos` is
  /// past the end of `tokens`.
  pub(crate) fn peek(&'_ self) -> Option<&'_ Token<'_>> {
    self.tokens.get(self.pos)
  }
250
251  /// Kind of the token under the cursor (no consume).
252  pub(crate) fn peek_kind(&self) -> Option<&TokenKind> {
253    self.tokens.get(self.pos).map(|t| &t.kind)
254  }
255
256  /// Raw lexeme of the upcoming token with its source-tied `'tokens` lifetime,
257  /// decoupled from the `&self` borrow so callers can hold it across mutations.
258  pub(crate) fn peek_raw(&self) -> Option<&'tokens str> {
259    self.tokens.get(self.pos).map(|t| t.raw)
260  }
261
262  /// Consume one token and return it. No-op at EOF.
263  pub(crate) fn advance(&'_ mut self) -> Option<&'_ Token<'_>> {
264    let t = self.tokens.get(self.pos);
265    if t.is_some() {
266      self.pos += 1;
267    }
268    t
269  }
270
271  /// True at the `Eof` token or past the end of the stream.
272  pub(crate) fn is_eof(&self) -> bool {
273    matches!(self.peek_kind(), Some(TokenKind::Eof) | None)
274  }
275}
276
/// CM-escape decoder for link destinations and titles harvested from
/// `LinkRefDef` tokens. Mirrors the inline path's `unescape_markdown`.
///
/// A backslash followed by an ASCII punctuation character is replaced by
/// that character. Any other backslash (before a non-punctuation
/// character, or at the very end of the input) is kept literally.
///
/// Decoding walks `char`s rather than bytes so that non-ASCII text next
/// to an escape survives intact; pushing individual UTF-8 bytes as
/// `char`s would turn every multi-byte character into mojibake.
fn unescape_link_part(s: &str) -> String {
  // Fast path: nothing to decode.
  if !s.contains('\\') {
    return s.to_string();
  }
  let mut out = String::with_capacity(s.len());
  let mut chars = s.chars().peekable();
  while let Some(c) = chars.next() {
    if c == '\\' {
      // Only ASCII punctuation is escapable; `is_ascii_punctuation`
      // covers exactly the 32 characters ``!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~``.
      if let Some(&escaped) = chars.peek() {
        if escaped.is_ascii_punctuation() {
          out.push(escaped);
          chars.next();
          continue;
        }
      }
    }
    out.push(c);
  }
  out
}
334
/// Lex + parse `source` in one shot, dropping all diagnostics. Convenience for
/// tests + the `parse` bin; production callers should construct their own
/// `DiagnosticEngine`.
///
/// Equivalent to [`parse_with`] called with `ParseOptions::default()`.
pub fn parse(source: &str) -> Document {
  parse_with(source, ParseOptions::default())
}
341
342/// `parse` variant with explicit `ParseOptions`. Used by the CM spec
343/// runner to opt into CM-strict HTML block detection.
344pub fn parse_with(source: &str, options: ParseOptions) -> Document {
345  let meta = Arc::from(SourceMeta { path: Arc::from("<inline>"), origin: Origin::Inline("<inline>") });
346  let mut lex_engine = DiagnosticEngine::new();
347  let mut lexer = Lexer::new(source, meta.clone(), &mut lex_engine);
348  let _ = lexer.scan_tokens();
349  let tokens = std::mem::take(&mut lexer.tokens);
350  drop(lexer);
351
352  let mut parse_engine = DiagnosticEngine::new();
353  let mut p = Parser::new_with_options(tokens, meta, &mut parse_engine, options).with_source(source);
354  p.parse()
355}
356
357/// Lex `s` and run the inline parser on it. Returns the inline `Node`
358/// list (Text, InlineCode, Bold, Italic, Strikethrough, Link, ...).
359/// Used by table cells, which receive raw cell strings rather than
360/// pre-tokenised inline content.
361pub fn parse_inline_str(s: &str) -> Vec<crate::ast::Node> {
362  let meta = Arc::from(SourceMeta { path: Arc::from("<inline>"), origin: Origin::Inline("<inline>") });
363  let mut lex_engine = DiagnosticEngine::new();
364  let mut lexer = Lexer::new(s, meta.clone(), &mut lex_engine);
365  let _ = lexer.scan_tokens();
366  let tokens = std::mem::take(&mut lexer.tokens);
367  drop(lexer);
368  let mut parse_engine = DiagnosticEngine::new();
369  let mut p = Parser::new(tokens, meta, &mut parse_engine).with_source(s);
370  p.collect_inline_until_break()
371}