1use crate::ast::*;
2use crate::refs::{RefMap, parse_link_ref_def};
3use dmc_diagnostic::Code;
4use dmc_diagnostic::metadata::{Origin, SourceMeta};
5use dmc_lexer::Lexer;
6use dmc_lexer::token::{Token, TokenKind};
7use duck_diagnostic::{Diagnostic, DiagnosticEngine, Span};
8use std::sync::Arc;
9
/// Optional switches that adjust how a [`Parser`] behaves.
///
/// Every flag is `false` in the derived `Default`. The type is `Copy`, so it
/// can be passed around by value.
///
/// NOTE(review): none of the flags is read in this part of the file; the
/// per-field notes below are inferred from the field names only and should be
/// confirmed against the code that consumes them.
#[derive(Clone, Copy, Debug, Default)]
pub struct ParseOptions {
    /// Presumably enables strict CommonMark handling of HTML blocks -- TODO confirm.
    pub cm_strict_html_blocks: bool,
    /// Presumably enables GFM-style autolink recognition -- TODO confirm.
    pub gfm_autolinks: bool,
    /// Presumably selects a legacy GFM emphasis behaviour -- TODO confirm.
    pub legacy_gfm_emphasis: bool,
}
29
30pub struct Parser<'eng, 'tokens> {
33 pub tokens: Vec<Token<'tokens>>,
34 pub meta: Arc<SourceMeta>,
35 pub pos: usize,
36 pub refs: RefMap,
37 pub diag_engine: &'eng mut DiagnosticEngine<Code>,
38 pub options: ParseOptions,
39 pub source: Option<&'tokens str>,
44 pub link_label_depth: u16,
52}
53
/// Upper bound for a link-label depth counter.
///
/// NOTE(review): presumably compared against `Parser::link_label_depth`; the
/// code that enforces the limit is not in this part of the file -- confirm.
pub(crate) const MAX_LINK_LABEL_DEPTH: u16 = 12;
61
62impl<'eng, 'tokens> Parser<'eng, 'tokens> {
63 pub fn new(
65 tokens: Vec<Token<'tokens>>,
66 meta: Arc<SourceMeta>,
67 diag_engine: &'eng mut DiagnosticEngine<Code>,
68 ) -> Self {
69 Self {
70 tokens,
71 meta,
72 pos: 0,
73 refs: RefMap::new(),
74 diag_engine,
75 options: ParseOptions::default(),
76 source: None,
77 link_label_depth: 0,
78 }
79 }
80
81 pub fn new_with_options(
83 tokens: Vec<Token<'tokens>>,
84 meta: Arc<SourceMeta>,
85 diag_engine: &'eng mut DiagnosticEngine<Code>,
86 options: ParseOptions,
87 ) -> Self {
88 Self { tokens, meta, pos: 0, refs: RefMap::new(), diag_engine, options, source: None, link_label_depth: 0 }
89 }
90
91 pub fn with_source(mut self, source: &'tokens str) -> Self {
95 self.source = Some(source);
96 self
97 }
98
99 pub fn parse(&mut self) -> Document {
102 self.collect_refs();
103 let span = self.tokens.first().map(|t| t.span.clone()).unwrap_or_else(default_span);
104 let mut children = Vec::new();
105 while !self.is_eof() {
106 let before = self.pos;
107 if let Some(node) = self.parse_block() {
108 children.push(node);
109 }
110 if self.pos == before {
111 self.advance();
112 }
113 }
114 Document { children, span }
115 }
116
117 fn collect_refs(&mut self) {
121 let mut in_paragraph = false;
127 let mut on_heading_line = false;
128 for tok in &self.tokens {
129 match &tok.kind {
130 TokenKind::LinkRefDef => {
131 if !in_paragraph && let Some((label, url, title)) = parse_link_ref_def(tok.raw) {
132 let url = crate::inline::decode_entities_in(&unescape_link_part(&url));
133 let title = title.map(|t| crate::inline::decode_entities_in(&unescape_link_part(&t)));
134 self.refs.insert(&label, url, title);
135 }
136 },
137 TokenKind::BlankLine
138 | TokenKind::CodeFenceOpen(_, _)
139 | TokenKind::CodeFenceClose(_, _)
140 | TokenKind::ThematicBreak
141 | TokenKind::FrontmatterEnd(_) => {
142 in_paragraph = false;
143 on_heading_line = false;
144 },
145 TokenKind::Heading(_) => {
146 in_paragraph = false;
149 on_heading_line = true;
150 },
151 TokenKind::BlockQuoteMarker => {
152 in_paragraph = false;
153 on_heading_line = false;
154 },
155 TokenKind::SoftBreak | TokenKind::HardBreak => {
156 if on_heading_line {
157 in_paragraph = false;
158 }
159 on_heading_line = false;
160 },
161 TokenKind::Whitespace(_) | TokenKind::Eof => {},
162 _ => {
163 if !on_heading_line {
164 in_paragraph = true;
165 }
166 },
167 }
168 }
169 }
170
171 pub(crate) fn emit_diagnostic(&mut self, diagnostic: Diagnostic<Code>) {
173 self.diag_engine.emit(diagnostic);
174 }
175
176 pub(crate) fn diag(&mut self, code: Code, message: impl Into<String>) {
178 let (line, column) = self.tokens.get(self.pos).map(|t| (t.span.line, t.span.column)).unwrap_or((0, 0));
179 let span = Span::from_zero_based(self.meta.path.clone(), line, column, 1);
180 self.emit_diagnostic(duck_diagnostic::diag!(code, span, message.into()));
181 }
182
183 pub(crate) fn warn(&mut self, code: Code, message: impl Into<String>) {
185 self.diag(code, message);
186 }
187
188 pub(crate) fn span_at(&self, pos: usize) -> Span {
190 self.tokens.get(pos).map(|t| t.span.clone()).unwrap_or_else(default_span)
191 }
192
193 pub(crate) fn raw_source_for_token_range(&self, start: usize, end: usize) -> String {
204 if start >= end {
205 return String::new();
206 }
207 let Some(start_tok) = self.tokens.get(start) else {
208 return String::new();
209 };
210 let Some(end_tok) = self.tokens.get(end - 1) else {
211 return String::new();
212 };
213
214 if let Some(source) = self.source {
215 let base = source.as_ptr() as usize;
216 let src_lo = base;
217 let src_hi = base + source.len();
218 let lo = start_tok.raw.as_ptr() as usize;
219 let hi = end_tok.raw.as_ptr() as usize + end_tok.raw.len();
220 debug_assert!(lo <= hi, "token slice start pointer exceeded end pointer");
221 debug_assert!(lo >= src_lo, "token slice start pointer fell before the source buffer");
222 debug_assert!(hi <= src_hi, "token slice end pointer exceeded the source buffer");
223 if lo < src_lo || hi > src_hi || lo > hi {
224 return String::new();
225 }
226 let off_lo = lo - base;
227 let off_hi = hi - base;
228 return source.get(off_lo..off_hi).map(|s| s.to_string()).unwrap_or_default();
231 }
232
233 let mut out = String::new();
235 for tok in &self.tokens[start..end] {
236 out.push_str(tok.raw);
237 }
238 out
239 }
240
241 pub(crate) fn current_span(&self) -> Span {
243 self.tokens.get(self.pos).map(|t| t.span.clone()).unwrap_or_else(default_span)
244 }
245
246 pub(crate) fn peek(&'_ self) -> Option<&'_ Token<'_>> {
248 self.tokens.get(self.pos)
249 }
250
251 pub(crate) fn peek_kind(&self) -> Option<&TokenKind> {
253 self.tokens.get(self.pos).map(|t| &t.kind)
254 }
255
256 pub(crate) fn peek_raw(&self) -> Option<&'tokens str> {
259 self.tokens.get(self.pos).map(|t| t.raw)
260 }
261
262 pub(crate) fn advance(&'_ mut self) -> Option<&'_ Token<'_>> {
264 let t = self.tokens.get(self.pos);
265 if t.is_some() {
266 self.pos += 1;
267 }
268 t
269 }
270
271 pub(crate) fn is_eof(&self) -> bool {
273 matches!(self.peek_kind(), Some(TokenKind::Eof) | None)
274 }
275}
276
/// Removes backslash escapes from a link destination or title.
///
/// A backslash followed by an ASCII punctuation character is replaced by that
/// character alone. A backslash before any other character, or at the very end
/// of the input, is kept literally.
///
/// The input is processed character by character rather than byte by byte, so
/// non-ASCII text passes through intact (pushing individual UTF-8 bytes as
/// `char`s would turn `é` into `Ã©`).
///
/// Returns a new `String`; input without any backslash is copied unchanged.
fn unescape_link_part(s: &str) -> String {
    // Fast path: nothing to unescape.
    if !s.contains('\\') {
        return s.to_string();
    }
    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(c) = chars.next() {
        let resolved = match c {
            // Consume the following character only when it is escapable;
            // otherwise the backslash itself is emitted.
            '\\' => chars.next_if(char::is_ascii_punctuation).unwrap_or('\\'),
            other => other,
        };
        out.push(resolved);
    }
    out
}
334
335pub fn parse(source: &str) -> Document {
339 parse_with(source, ParseOptions::default())
340}
341
342pub fn parse_with(source: &str, options: ParseOptions) -> Document {
345 let meta = Arc::from(SourceMeta { path: Arc::from("<inline>"), origin: Origin::Inline("<inline>") });
346 let mut lex_engine = DiagnosticEngine::new();
347 let mut lexer = Lexer::new(source, meta.clone(), &mut lex_engine);
348 let _ = lexer.scan_tokens();
349 let tokens = std::mem::take(&mut lexer.tokens);
350 drop(lexer);
351
352 let mut parse_engine = DiagnosticEngine::new();
353 let mut p = Parser::new_with_options(tokens, meta, &mut parse_engine, options).with_source(source);
354 p.parse()
355}
356
357pub fn parse_inline_str(s: &str) -> Vec<crate::ast::Node> {
362 let meta = Arc::from(SourceMeta { path: Arc::from("<inline>"), origin: Origin::Inline("<inline>") });
363 let mut lex_engine = DiagnosticEngine::new();
364 let mut lexer = Lexer::new(s, meta.clone(), &mut lex_engine);
365 let _ = lexer.scan_tokens();
366 let tokens = std::mem::take(&mut lexer.tokens);
367 drop(lexer);
368 let mut parse_engine = DiagnosticEngine::new();
369 let mut p = Parser::new(tokens, meta, &mut parse_engine).with_source(s);
370 p.collect_inline_until_break()
371}