asciidork_parser/
parser.rs

1use std::{cell::RefCell, rc::Rc};
2
3use crate::internal::*;
4
/// Recursive-descent AsciiDoc parser. All parse products are allocated
/// in the caller-supplied bump arena tied to lifetime `'arena`.
pub struct Parser<'arena> {
  // arena used for all parse-time allocations
  pub(super) bump: &'arena Bump,
  pub(super) lexer: Lexer<'arena>,
  // the document being built up as parsing proceeds
  pub(super) document: Document<'arena>,
  // lines read ahead of the current position; refilled via `restore_lines`
  pub(super) peeked_lines: Option<ContiguousLines<'arena>>,
  // chunk meta read ahead; refilled via `restore_peeked_meta`
  pub(super) peeked_meta: Option<ChunkMeta<'arena>>,
  pub(super) ctx: ParseContext<'arena>,
  // diagnostics accumulated during the parse; RefCell so errors can be
  // recorded from `&self` contexts
  pub(super) errors: RefCell<Vec<Diagnostic>>,
  pub(super) strict: bool, // todo: naming...
  pub(super) include_resolver: Option<Box<dyn IncludeResolver>>,
  #[cfg(feature = "attr_ref_observation")]
  pub(super) attr_ref_observer: Option<Box<dyn AttrRefObserver>>,
}
18
/// Successful outcome of [`Parser::parse`]: the finished document plus
/// any non-fatal diagnostics gathered along the way.
#[derive(Debug)]
pub struct ParseResult<'arena> {
  pub document: Document<'arena>,
  pub warnings: Vec<Diagnostic>,
}
24
25impl<'arena> Parser<'arena> {
26  pub fn new(src: BumpVec<'arena, u8>, file: SourceFile, bump: &'arena Bump) -> Self {
27    Parser::from_lexer(Lexer::new(src, file, bump))
28  }
29
30  pub fn from_str(src: &str, file: SourceFile, bump: &'arena Bump) -> Self {
31    Parser::from_lexer(Lexer::from_str(bump, file, src))
32  }
33
34  fn from_lexer(lexer: Lexer<'arena>) -> Self {
35    let mut parser = Parser {
36      bump: lexer.bump,
37      document: Document::new(lexer.bump),
38      peeked_lines: None,
39      peeked_meta: None,
40      ctx: ParseContext::new(lexer.bump),
41      errors: RefCell::new(Vec::new()),
42      strict: true,
43      include_resolver: None,
44      lexer,
45      #[cfg(feature = "attr_ref_observation")]
46      attr_ref_observer: None,
47    };
48    parser.set_source_file_attrs();
49    parser
50  }
51
52  pub fn apply_job_settings(&mut self, settings: JobSettings) {
53    if let Some(leveloffset) = settings.job_attrs.get("leveloffset") {
54      Parser::adjust_leveloffset(&mut self.ctx.leveloffset, &leveloffset.value);
55    }
56    self.strict = settings.strict;
57    self.ctx.max_include_depth = settings.job_attrs.u16("max-include-depth").unwrap_or(64);
58    self.document.meta = settings.into();
59    self.set_source_file_attrs();
60  }
61
62  pub fn provide_timestamps(
63    &mut self,
64    now: u64,
65    input_modified_time: Option<u64>,
66    reproducible_override: Option<u64>,
67  ) {
68    self.set_datetime_attrs(now, input_modified_time, reproducible_override);
69  }
70
71  pub fn set_resolver(&mut self, resolver: Box<dyn IncludeResolver>) {
72    self.include_resolver = Some(resolver);
73  }
74
75  #[cfg(feature = "attr_ref_observation")]
76  pub fn set_attr_ref_observer(&mut self, observer: Box<dyn AttrRefObserver>) {
77    self.attr_ref_observer = Some(observer);
78  }
79
80  pub fn cell_parser(&mut self, src: BumpVec<'arena, u8>, offset: u32) -> Parser<'arena> {
81    let mut cell_parser = Parser::new(src, self.lexer.source_file().clone(), self.bump);
82    cell_parser.include_resolver = self.include_resolver.as_ref().map(|r| r.clone_box());
83    cell_parser.strict = self.strict;
84    cell_parser.lexer.adjust_offset(offset);
85    cell_parser.ctx = self.ctx.clone_for_cell(self.bump);
86    cell_parser.document.meta = self.document.meta.clone_for_cell();
87    cell_parser.document.anchors = Rc::clone(&self.document.anchors);
88    cell_parser
89  }
90
91  pub(crate) fn loc(&self) -> SourceLocation {
92    self
93      .peeked_lines
94      .as_ref()
95      .and_then(|lines| lines.first_loc())
96      .unwrap_or_else(|| self.lexer.loc())
97  }
98
99  pub(crate) fn read_line(&mut self) -> Result<Option<Line<'arena>>> {
100    assert!(self.peeked_lines.is_none());
101    if self.lexer.is_eof() {
102      return Ok(None);
103    }
104
105    let mut drop_line = false;
106    let mut line = Line::empty(self.bump);
107    while !self.lexer.at_newline() && !self.lexer.is_eof() {
108      let token = self.lexer.next_token();
109      self.push_token_replacing_attr_ref(token, &mut line, &mut drop_line)?;
110    }
111    self.lexer.skip_newline();
112    if drop_line {
113      return self.read_line();
114    }
115    if line.starts(TokenKind::Directive) && !self.ctx.within_block_comment() {
116      match self.try_process_directive(&mut line)? {
117        DirectiveAction::Passthrough => Ok(Some(line)),
118        DirectiveAction::SubstituteLine(line) => Ok(Some(line)),
119        DirectiveAction::ReadNextLine => self.read_line(),
120        DirectiveAction::SkipLinesUntilEndIf => self.skip_lines_until_endif(&line),
121      }
122    } else {
123      Ok(Some(line))
124    }
125  }
126
127  pub(crate) fn read_lines(&mut self) -> Result<Option<ContiguousLines<'arena>>> {
128    self.ctx.comment_delim_in_lines = false;
129    if let Some(peeked) = self.peeked_lines.take() {
130      return Ok(Some(peeked));
131    }
132    self.lexer.consume_empty_lines();
133    if self.lexer.is_eof() {
134      return Ok(None);
135    }
136    let mut lines = Deq::new(self.bump);
137    while let Some(line) = self.read_line()? {
138      if line.is_emptyish() {
139        if lines.is_empty() {
140          // this case can happen if our first non-empty line was an include directive
141          // that then resolved to an initial empty line, otherwise consume_empty_lines
142          // would have skipped over it, so we keep going
143          continue;
144        } else {
145          // this case happens only when we DROP a line
146          break;
147        }
148      }
149      if line.is_delimiter_kind(DelimiterKind::Comment) {
150        self.ctx.comment_delim_in_lines = true;
151      }
152      lines.push(line);
153      if self.lexer.at_newline() {
154        break;
155      }
156    }
157    if lines.is_empty() {
158      Ok(None)
159    } else {
160      Ok(Some(ContiguousLines::new(lines)))
161    }
162  }
163
164  pub(crate) fn read_lines_until(
165    &mut self,
166    delimiter: Delimiter,
167  ) -> Result<Option<ContiguousLines<'arena>>> {
168    let Some(mut lines) = self.read_lines()? else {
169      return Ok(None);
170    };
171    if lines.any(|l| l.is_delimiter(delimiter)) {
172      return Ok(Some(lines));
173    }
174
175    let mut additional_lines = BumpVec::new_in(self.bump);
176    while !self.lexer.is_eof() && !self.at_delimiter(delimiter) {
177      additional_lines.push(self.read_line()?.unwrap());
178    }
179    lines.extend(additional_lines);
180
181    while lines.last().map(|l| l.is_empty()) == Some(true) {
182      lines.pop();
183    }
184    Ok(Some(lines))
185  }
186
187  fn at_delimiter(&self, delimiter: Delimiter) -> bool {
188    match delimiter.kind {
189      DelimiterKind::BlockQuote => self.lexer.at_delimiter_line() == Some((4, b'_')),
190      DelimiterKind::Example => {
191        self.lexer.at_delimiter_line() == Some((delimiter.len as u32, b'='))
192      }
193      DelimiterKind::Open => self.lexer.at_delimiter_line() == Some((2, b'-')),
194      DelimiterKind::Sidebar => self.lexer.at_delimiter_line() == Some((4, b'*')),
195      DelimiterKind::Listing => self.lexer.at_delimiter_line() == Some((4, b'-')),
196      DelimiterKind::Literal => self.lexer.at_delimiter_line() == Some((4, b'.')),
197      DelimiterKind::Passthrough => self.lexer.at_delimiter_line() == Some((4, b'+')),
198      DelimiterKind::Comment => self.lexer.at_delimiter_line() == Some((4, b'/')),
199    }
200  }
201
202  pub(crate) fn restore_lines(&mut self, lines: ContiguousLines<'arena>) {
203    debug_assert!(self.peeked_lines.is_none());
204    if !lines.is_empty() {
205      self.peeked_lines = Some(lines);
206    }
207  }
208
209  pub(crate) fn restore_peeked_meta(&mut self, meta: ChunkMeta<'arena>) {
210    if !meta.is_empty() {
211      debug_assert!(self.peeked_meta.is_none());
212      self.peeked_meta = Some(meta);
213    }
214  }
215
216  pub(crate) fn restore_peeked(&mut self, lines: ContiguousLines<'arena>, meta: ChunkMeta<'arena>) {
217    self.restore_lines(lines);
218    self.restore_peeked_meta(meta);
219  }
220
221  pub fn parse(mut self) -> std::result::Result<ParseResult<'arena>, Vec<Diagnostic>> {
222    self.parse_document_header()?;
223    self.prepare_toc();
224
225    // ensure we only read a single "paragraph" for `inline` doc_type
226    // https://docs.asciidoctor.org/asciidoc/latest/document/doctype/#inline-doctype-rules
227    if self.document.meta.get_doctype() == DocType::Inline {
228      if self.peeked_lines.is_none() {
229        // tmp:
230        self.peeked_lines = self.read_lines().expect("tmp");
231      }
232      self.lexer.truncate();
233    }
234
235    if let Some(book_content) = self.parse_book()? {
236      self.document.content = book_content;
237    } else {
238      let sectioned = self.parse_sectioned()?;
239      self.document.content = sectioned.into_doc_content(self.bump);
240    }
241
242    // so the backend can see them replayed in decl order
243    self.document.meta.clear_doc_attrs();
244
245    self.diagnose_document()?;
246
247    Ok(ParseResult {
248      document: self.document,
249      warnings: self.errors.into_inner(),
250    })
251  }
252
253  pub(crate) fn parse_sectioned(&mut self) -> Result<Sectioned<'arena>> {
254    let mut blocks = bvec![in self.bump];
255    while let Some(block) = self.parse_block()? {
256      blocks.push(block);
257    }
258    let preamble = if blocks.is_empty() { None } else { Some(blocks) };
259    let mut sections = bvec![in self.bump];
260    while let Some(section) = self.parse_section()? {
261      sections.push(section);
262    }
263    Ok(Sectioned { preamble, sections })
264  }
265
266  pub(crate) fn parse_chunk_meta(
267    &mut self,
268    lines: &mut ContiguousLines<'arena>,
269  ) -> Result<ChunkMeta<'arena>> {
270    if let Some(meta) = self.peeked_meta.take() {
271      return Ok(meta);
272    }
273    assert!(!lines.is_empty());
274    let start_loc = lines.current_token().unwrap().loc;
275    let mut attrs = MultiAttrList::new_in(self.bump);
276    let mut title = None;
277    if !lines.current().unwrap().is_fully_unconsumed() {
278      return Ok(ChunkMeta::new(attrs, title, start_loc));
279    }
280    loop {
281      match lines.current() {
282        Some(line) if line.is_chunk_title() => {
283          let mut line = lines.consume_current().unwrap();
284          line.discard_assert(TokenKind::Dots);
285          title = Some(self.parse_inlines(&mut line.into_lines())?);
286        }
287        Some(line) if line.is_block_attr_list() => {
288          let mut line = lines.consume_current().unwrap();
289          line.discard_assert(TokenKind::OpenBracket);
290          attrs.push(self.parse_block_attr_list(&mut line)?);
291        }
292        Some(line) if line.is_block_anchor() => {
293          let mut line = lines.consume_current().unwrap();
294          let first = line.discard_assert(TokenKind::OpenBracket);
295          line.discard_assert(TokenKind::OpenBracket);
296          let Some(anchor) = self.parse_block_anchor(&mut line)? else {
297            self.err_line_starting("Invalid block anchor", first.loc)?;
298            return Ok(ChunkMeta::new(attrs, title, start_loc));
299          };
300          let mut anchor_attrs = AttrList::new(anchor.loc, self.bump);
301          anchor_attrs.id = Some(anchor.id);
302          anchor_attrs.positional.push(anchor.reftext);
303          attrs.push(anchor_attrs);
304        }
305        // consume trailing comment lines for valid meta
306        Some(line) if line.is_comment() && (!attrs.is_empty() || title.is_some()) => {
307          lines.consume_current();
308        }
309        _ => break,
310      }
311    }
312    Ok(ChunkMeta::new(attrs, title, start_loc))
313  }
314
315  pub(crate) fn string(&self, s: &str) -> BumpString<'arena> {
316    BumpString::from_str_in(s, self.bump)
317  }
318}
319
/// Access to the bump arena, plus a convenience for minting tokens
/// whose lexemes are allocated in that arena.
pub trait HasArena<'arena> {
  fn bump(&self) -> &'arena Bump;
  /// Build a token with `lexeme` copied into the arena.
  fn token(&self, kind: TokenKind, lexeme: &str, loc: SourceLocation) -> Token<'arena> {
    Token::new(kind, loc, BumpString::from_str_in(lexeme, self.bump()))
  }
}
326
impl<'arena> HasArena<'arena> for Parser<'arena> {
  // expose the parser's own arena handle
  fn bump(&self) -> &'arena Bump {
    self.bump
  }
}
332
/// Outcome of processing a preprocessor directive line in `read_line`.
pub enum DirectiveAction<'arena> {
  // the directive line is returned to the caller unchanged
  Passthrough,
  // the line was fully handled; read the following line instead
  ReadNextLine,
  // skip lines until a matching endif directive
  SkipLinesUntilEndIf,
  // return this replacement line in place of the directive line
  SubstituteLine(Line<'arena>),
}
339
/// Identity of the source being parsed: stdin (with a working dir for
/// resolving includes), a file path, or an anonymous temp buffer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceFile {
  Stdin { cwd: Path },
  Path(Path),
  Tmp,
}
346
347impl SourceFile {
348  pub fn file_name(&self) -> &str {
349    match self {
350      SourceFile::Stdin { .. } => "<stdin>",
351      SourceFile::Path(path) => path.file_name(),
352      SourceFile::Tmp => "<temp-buffer>",
353    }
354  }
355
356  pub fn matches_xref_target(&self, target: &str) -> bool {
357    let SourceFile::Path(path) = self else {
358      return false;
359    };
360    let filename = path.file_name();
361    if filename == target {
362      return true;
363    }
364    let xref_ext = file::ext(target);
365    let path_ext = file::ext(filename);
366    if xref_ext.is_some() && xref_ext != path_ext {
367      return false;
368    }
369    let fullpath = path.to_string();
370    if fullpath.ends_with(target) {
371      true
372    } else if xref_ext.is_some() {
373      false
374    } else {
375      file::remove_ext(&fullpath).ends_with(target)
376    }
377  }
378}
379
380impl From<Diagnostic> for Vec<Diagnostic> {
381  fn from(diagnostic: Diagnostic) -> Self {
382    vec![diagnostic]
383  }
384}
385
#[cfg(test)]
mod tests {
  use super::*;
  use test_utils::*;

  /// Build a mock resolver that resolves every include target to `src`.
  fn resolve(src: &'static str) -> Box<dyn IncludeResolver> {
    #[derive(Clone)]
    struct MockResolver(pub Vec<u8>);
    impl IncludeResolver for MockResolver {
      fn resolve(
        &mut self,
        _: IncludeTarget,
        buffer: &mut dyn IncludeBuffer,
      ) -> std::result::Result<usize, ResolveError> {
        buffer.initialize(self.0.len());
        let bytes = buffer.as_bytes_mut();
        bytes.copy_from_slice(&self.0);
        Ok(self.0.len())
      }
      fn get_base_dir(&self) -> Option<String> {
        Some("/".to_string())
      }
      fn clone_box(&self) -> Box<dyn IncludeResolver> {
        Box::new(self.clone())
      }
    }
    Box::new(MockResolver(Vec::from(src.as_bytes())))
  }

  /// Join the source text of `lines` back into one newline-separated string.
  fn reassemble(lines: ContiguousLines) -> String {
    lines
      .iter()
      .map(|l| l.reassemble_src())
      .collect::<Vec<_>>()
      .join("\n")
  }

  // attr refs expand in place, with inserted tokens sharing the ref's loc
  #[test]
  fn test_attr_ref() {
    let mut parser = test_parser!("hello {foo} world");
    parser
      .document
      .meta
      .insert_doc_attr("foo", "_bar_")
      .unwrap();
    let mut lines = parser.read_lines().unwrap().unwrap();
    let line = lines.consume_current().unwrap();
    let tokens = line.into_iter().collect::<Vec<_>>();
    expect_eq!(
      &tokens,
      &[
        Token::new(TokenKind::Word, loc!(0..5), bstr!("hello")),
        Token::new(TokenKind::Whitespace, loc!(5..6), bstr!(" ")),
        Token::new(TokenKind::AttrRef, loc!(6..11), bstr!("{foo}")),
        // these are inserted as an inline preprocessing step
        // NB: we will use the source loc of the attr ref token to know how
        // to skip over the resolve attribute in no-attr-ref subs contexts
        Token::new(TokenKind::Underscore, loc!(6..11), bstr!("_")),
        Token::new(TokenKind::Word, loc!(6..11), bstr!("bar")),
        Token::new(TokenKind::Underscore, loc!(6..11), bstr!("_")),
        // end inserted.
        Token::new(TokenKind::Whitespace, loc!(11..12), bstr!(" ")),
        Token::new(TokenKind::Word, loc!(12..17), bstr!("world")),
      ]
    );
  }

  // a malformed include directive is not an error: it reads as plain text
  #[test]
  fn invalid_directive_line_passed_thru() {
    let input = adoc! {"
      foo
      include::invalid []
      bar
    "};

    let mut parser = test_parser!(input);
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      input.trim_end()
    );
  }

  // in secure mode, includes are rewritten into links with role=include
  #[test]
  fn safe_mode_include_to_link() {
    let input = adoc! {"
      foo
      include::include-file.adoc[]
      baz
    "};

    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      adoc! {"
        foo
        link:include-file.adoc[role=include,]
        baz"
      }
    );

    // assert on the tokens and positions
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());

    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      line.consume_current().unwrap(),
      Token::new(TokenKind::Word, loc!(0..3), bstr!("foo"))
    );
    assert!(line.consume_current().is_none());

    // sanity-check the raw offsets the token assertions below rely on
    assert_eq!(&input[8..13], "ude::");
    assert_eq!(&input[30..32], "[]");

    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      std::array::from_fn(|_| line.consume_current().unwrap()),
      [
        // we "drop" positions 4-7, the `inc` of `include::`
        // which becomes `••••link:`, keeping rest of token positions
        Token::new(TokenKind::MacroName, loc!(8..13), bstr!("link:")),
        Token::new(TokenKind::Word, loc!(13..20), bstr!("include")),
        Token::new(TokenKind::Dashes, loc!(20..21), bstr!("-")),
        Token::new(TokenKind::Word, loc!(21..25), bstr!("file")),
        Token::new(TokenKind::Dots, loc!(25..26), bstr!(".")),
        Token::new(TokenKind::Word, loc!(26..30), bstr!("adoc")),
        Token::new(TokenKind::OpenBracket, loc!(30..31), bstr!("[")),
        // these tokens are inserted, they have no true source so we
        // represent their position as empty at the insertion point
        Token::new(TokenKind::Word, loc!(31..31), bstr!("role")),
        Token::new(TokenKind::EqualSigns, loc!(31..31), bstr!("=")),
        Token::new(TokenKind::Word, loc!(31..31), bstr!("include")),
        Token::new(TokenKind::Comma, loc!(31..31), bstr!(",")),
        // /end `role=include` inserted tokens
        Token::new(TokenKind::CloseBracket, loc!(31..32), bstr!("]")),
      ]
    );
    assert!(line.consume_current().is_none());
  }

  // existing include attrs survive the secure-mode link rewrite
  #[test]
  fn attrs_preserved_when_replacing_include() {
    let input = "include::some-file.adoc[leveloffset+=1]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:some-file.adoc[role=include,leveloffset+=1]"
    );
  }

  // spaced file names are wrapped in a pass macro so the link target is valid
  #[test]
  fn spaces_in_include_file_to_pass_macro_link() {
    let input = "include::foo bar baz.adoc[]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[foo bar baz.adoc][role=include,]"
    );
  }

  #[test]
  fn uri_read_not_allowed_include_non_strict() {
    // non-strict mode replaced with link
    let input = "include::https://my.com/foo bar.adoc[]";
    let mut parser = test_parser!(input);
    let mut settings = JobSettings::r#unsafe();
    settings.strict = false;
    parser.apply_job_settings(settings);
    expect_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[https://my.com/foo bar.adoc][role=include,]",
      from: input
    );
  }
}