asciidork_parser/
parser.rs

1use std::fmt::Debug;
2use std::{cell::RefCell, rc::Rc};
3
4use crate::internal::*;
5
pub struct Parser<'arena> {
  // arena allocator backing all parsed nodes
  pub(super) bump: &'arena Bump,
  pub(super) lexer: Lexer<'arena>,
  // the document being assembled as parsing proceeds
  pub(super) document: Document<'arena>,
  // lines read ahead of the current position (see `restore_lines`)
  pub(super) peeked_lines: Option<ContiguousLines<'arena>>,
  // chunk metadata read ahead of the current position (see `restore_peeked_meta`)
  pub(super) peeked_meta: Option<ChunkMeta<'arena>>,
  pub(super) ctx: ParseContext<'arena>,
  // diagnostics accumulated during the parse
  pub(super) errors: RefCell<Vec<Diagnostic>>,
  pub(super) strict: bool, // todo: naming...
  // attribute source locations; meaning of the bool isn't visible here — see usages
  pub(super) attr_locs: Vec<(SourceLocation, bool)>,
  // resolves `include::` directive targets, when configured
  pub(super) include_resolver: Option<Box<dyn IncludeResolver>>,
  #[cfg(feature = "attr_ref_observation")]
  pub(super) attr_ref_observer: Option<Box<dyn AttrRefObserver>>,
}
20
21impl<'arena> Parser<'arena> {
22  pub fn new(src: BumpVec<'arena, u8>, file: SourceFile, bump: &'arena Bump) -> Self {
23    Parser::from_lexer(Lexer::new(src, file, bump))
24  }
25
26  pub fn from_str(src: &str, file: SourceFile, bump: &'arena Bump) -> Self {
27    Parser::from_lexer(Lexer::from_str(bump, file, src))
28  }
29
30  fn from_lexer(lexer: Lexer<'arena>) -> Self {
31    let mut parser = Parser {
32      bump: lexer.bump,
33      document: Document::new(lexer.bump),
34      peeked_lines: None,
35      peeked_meta: None,
36      ctx: ParseContext::new(lexer.bump),
37      errors: RefCell::new(Vec::new()),
38      strict: true,
39      include_resolver: None,
40      lexer,
41      attr_locs: Vec::with_capacity(16),
42      #[cfg(feature = "attr_ref_observation")]
43      attr_ref_observer: None,
44    };
45    parser.set_source_file_attrs();
46    parser
47  }
48
49  pub fn apply_job_settings(&mut self, settings: JobSettings) {
50    if let Some(leveloffset) = settings.job_attrs.get("leveloffset") {
51      Parser::adjust_leveloffset(&mut self.ctx.leveloffset, &leveloffset.value);
52    }
53    self.strict = settings.strict;
54    self.ctx.max_include_depth = settings.job_attrs.u16("max-include-depth").unwrap_or(64);
55    self.document.meta = settings.into();
56    self.set_source_file_attrs();
57  }
58
  /// Supplies the timestamps (epoch seconds) used to set the date/time
  /// document attributes; delegates to `set_datetime_attrs`.
  pub fn provide_timestamps(
    &mut self,
    now: u64,
    input_modified_time: Option<u64>,
    reproducible_override: Option<u64>,
  ) {
    self.set_datetime_attrs(now, input_modified_time, reproducible_override);
  }
67
  /// Installs the resolver used to load `include::` directive targets,
  /// replacing any previously configured resolver.
  pub fn set_resolver(&mut self, resolver: Box<dyn IncludeResolver>) {
    self.include_resolver = Some(resolver);
  }
71
  /// Installs an observer notified of attribute references (feature-gated),
  /// replacing any previously configured observer.
  #[cfg(feature = "attr_ref_observation")]
  pub fn set_attr_ref_observer(&mut self, observer: Box<dyn AttrRefObserver>) {
    self.attr_ref_observer = Some(observer);
  }
76
  /// Creates a nested parser for the source of a single cell, inheriting
  /// the relevant state (resolver, strictness, context, doc meta, anchors)
  /// from this parser. `offset` is the cell's byte offset within the parent
  /// source, so locations in the cell line up with the parent document.
  pub fn cell_parser(&mut self, src: BumpVec<'arena, u8>, offset: u32) -> Parser<'arena> {
    let mut cell_parser = Parser::new(src, self.lexer.source_file().clone(), self.bump);
    cell_parser.include_resolver = self.include_resolver.as_ref().map(|r| r.clone_box());
    cell_parser.strict = self.strict;
    cell_parser.lexer.adjust_offset(offset);
    cell_parser.ctx = self.ctx.clone_for_cell(self.bump);
    cell_parser.document.meta = self.document.meta.clone_for_cell();
    // anchors are shared (not cloned) so cell anchors register document-wide
    cell_parser.document.anchors = Rc::clone(&self.document.anchors);

    #[cfg(feature = "attr_ref_observation")]
    {
      // NOTE(review): `take()` moves the observer out of the parent, leaving
      // it without one afterwards — presumably intentional since observers
      // aren't cloneable; verify callers don't rely on the parent's observer
      cell_parser.attr_ref_observer = self.attr_ref_observer.take();
    }

    cell_parser
  }
93
94  pub(crate) fn loc(&self) -> SourceLocation {
95    self
96      .peeked_lines
97      .as_ref()
98      .and_then(|lines| lines.first_loc())
99      .unwrap_or_else(|| self.lexer.loc())
100  }
101
102  pub(crate) fn read_line(&mut self) -> Result<Option<Line<'arena>>> {
103    Ok(self._read_line(false)?.map(|(line, _)| line))
104  }
105
  /// Reads one line of tokens from the lexer, processing preprocessor
  /// directives that may pass through, substitute, drop, or skip lines.
  ///
  /// The bool in the returned tuple propagates `ignored_last`: it is `true`
  /// when the returned line was preceded by an include directive that was
  /// dropped (`DirectiveAction::IgnoreNotIncluded`).
  fn _read_line(&mut self, ignored_last: bool) -> Result<Option<(Line<'arena>, bool)>> {
    assert!(self.peeked_lines.is_none());
    if self.lexer.is_eof() {
      return Ok(None);
    }

    let mut drop_line = false;
    let mut line = Line::empty(self.bump);
    while !self.lexer.at_newline() && !self.lexer.is_eof() {
      let token = self.lexer.next_token();
      // attr-ref replacement may expand a token or flag the line for dropping
      self.push_token_replacing_attr_ref(token, &mut line, &mut drop_line)?;
    }
    self.lexer.skip_newline();
    if drop_line {
      return self._read_line(false);
    }
    // directives are inert inside block comments
    if line.starts(TokenKind::Directive) && !self.ctx.within_block_comment() {
      match self.try_process_directive(&mut line)? {
        DirectiveAction::Passthrough => Ok(Some((line, ignored_last))),
        DirectiveAction::SubstituteLine(line) => Ok(Some((line, ignored_last))),
        DirectiveAction::IgnoreNotIncluded => self._read_line(true),
        DirectiveAction::ReadNextLine => self._read_line(false),
        DirectiveAction::SkipLinesUntilEndIf => Ok(
          self
            .skip_lines_until_endif(&line)?
            .map(|l| (l, ignored_last)),
        ),
      }
    } else {
      Ok(Some((line, ignored_last)))
    }
  }
138
  /// Reads the next run of contiguous (not-empty-separated) lines, returning
  /// previously restored lines first if any were peeked back.
  pub(crate) fn read_lines(&mut self) -> Result<Option<ContiguousLines<'arena>>> {
    self.ctx.comment_delim_in_lines = false;
    if let Some(peeked) = self.peeked_lines.take() {
      return Ok(Some(peeked));
    }
    self.lexer.consume_empty_lines();
    if self.lexer.is_eof() {
      return Ok(None);
    }
    let mut lines = Deq::new(self.bump);
    while let Some((line, ignored_removed_include_line)) = self._read_line(false)? {
      if line.is_emptyish() {
        if lines.is_empty() {
          // this case can happen if our first non-empty line was an include directive
          // that then resolved to an initial empty line, otherwise consume_empty_lines
          // would have skipped over it, so we keep going
          continue;
        } else if !ignored_removed_include_line {
          // an empty-ish line normally ends the contiguous run; the exception
          // (flag set) is when it directly follows a DROPPED include directive
          // line, in which case we keep the line and continue reading
          break;
        }
      }
      if line.is_delimiter_kind(DelimiterKind::Comment) {
        self.ctx.comment_delim_in_lines = true;
      }
      lines.push(line);
      if self.lexer.at_newline() {
        break;
      }
    }
    if lines.is_empty() {
      Ok(None)
    } else {
      Ok(Some(ContiguousLines::new(lines)))
    }
  }
176
177  pub(crate) fn read_lines_until(
178    &mut self,
179    delimiter: Delimiter,
180  ) -> Result<Option<ContiguousLines<'arena>>> {
181    let Some(mut lines) = self.read_lines()? else {
182      return Ok(None);
183    };
184    if lines.any(|l| l.is_delimiter(delimiter)) {
185      return Ok(Some(lines));
186    }
187
188    let mut additional_lines = BumpVec::new_in(self.bump);
189    while !self.lexer.is_eof() && !self.at_delimiter(delimiter) {
190      additional_lines.push(self.read_line()?.unwrap());
191    }
192    lines.extend(additional_lines);
193
194    while lines.last().map(|l| l.is_empty()) == Some(true) {
195      lines.pop();
196    }
197    Ok(Some(lines))
198  }
199
200  fn at_delimiter(&self, delimiter: Delimiter) -> bool {
201    match delimiter.kind {
202      DelimiterKind::BlockQuote => self.lexer.at_delimiter_line() == Some((4, b'_')),
203      DelimiterKind::Example => {
204        self.lexer.at_delimiter_line() == Some((delimiter.len as u32, b'='))
205      }
206      DelimiterKind::Open => self.lexer.at_delimiter_line() == Some((2, b'-')),
207      DelimiterKind::Sidebar => self.lexer.at_delimiter_line() == Some((4, b'*')),
208      DelimiterKind::Listing => self.lexer.at_delimiter_line() == Some((4, b'-')),
209      DelimiterKind::Literal => self.lexer.at_delimiter_line() == Some((4, b'.')),
210      DelimiterKind::Passthrough => self.lexer.at_delimiter_line() == Some((4, b'+')),
211      DelimiterKind::Comment => self.lexer.at_delimiter_line() == Some((4, b'/')),
212    }
213  }
214
215  pub(crate) fn restore_lines(&mut self, lines: ContiguousLines<'arena>) {
216    debug_assert!(self.peeked_lines.is_none());
217    if !lines.is_empty() {
218      self.peeked_lines = Some(lines);
219    }
220  }
221
222  pub(crate) fn restore_peeked_meta(&mut self, meta: ChunkMeta<'arena>) {
223    if !meta.is_empty() {
224      debug_assert!(self.peeked_meta.is_none());
225      self.peeked_meta = Some(meta);
226    }
227  }
228
  /// Convenience: restores both peeked lines and peeked chunk metadata.
  pub(crate) fn restore_peeked(&mut self, lines: ContiguousLines<'arena>, meta: ChunkMeta<'arena>) {
    self.restore_lines(lines);
    self.restore_peeked_meta(meta);
  }
233
  /// Top-level entry point: parses the whole document, consuming the
  /// parser. On failure, returns the accumulated diagnostics.
  pub fn parse(mut self) -> std::result::Result<ParseResult<'arena>, Vec<Diagnostic>> {
    self.parse_document_header()?;
    self.prepare_toc();

    // ensure we only read a single "paragraph" for `inline` doc_type
    // https://docs.asciidoctor.org/asciidoc/latest/document/doctype/#inline-doctype-rules
    if self.document.meta.get_doctype() == DocType::Inline {
      if self.peeked_lines.is_none() {
        // tmp: read errors here are panicked instead of surfaced — TODO revisit
        self.peeked_lines = self.read_lines().expect("tmp");
      }
      // discard everything after the first paragraph
      self.lexer.truncate();
    }

    if let Some(book_content) = self.parse_book()? {
      self.document.content = book_content;
    } else {
      let sectioned = self.parse_sectioned()?;
      self.document.content = sectioned.into_doc_content(self.bump);
    }

    // so the backend can see them replayed in decl order
    self.document.meta.clear_doc_attrs();
    self.diagnose_document()?;
    Ok(self.into())
  }
260
261  pub(crate) fn parse_sectioned(&mut self) -> Result<Sectioned<'arena>> {
262    let mut blocks = bvec![in self.bump];
263    while let Some(block) = self.parse_block()? {
264      blocks.push(block);
265    }
266    let preamble = if blocks.is_empty() { None } else { Some(blocks) };
267    let mut sections = bvec![in self.bump];
268    while let Some(section) = self.parse_section()? {
269      sections.push(section);
270    }
271    Ok(Sectioned { preamble, sections })
272  }
273
  /// Parses and consumes the metadata lines that may precede a block —
  /// a chunk title (`.Title`), attribute lists (`[...]`), block anchors
  /// (`[[id]]`), and trailing comments — returning previously restored
  /// meta if one was peeked back via `restore_peeked_meta`.
  pub(crate) fn parse_chunk_meta(
    &mut self,
    lines: &mut ContiguousLines<'arena>,
  ) -> Result<ChunkMeta<'arena>> {
    if let Some(meta) = self.peeked_meta.take() {
      return Ok(meta);
    }
    assert!(!lines.is_empty());
    let start_loc = lines.current_token().unwrap().loc;
    let mut attrs = MultiAttrList::new_in(self.bump);
    let mut title = None;
    // a partially consumed first line can't begin chunk metadata
    if !lines.current().unwrap().is_fully_unconsumed() {
      return Ok(ChunkMeta::new(attrs, title, start_loc));
    }
    loop {
      match lines.current() {
        // `.Block title`
        Some(line) if line.is_chunk_title() => {
          let mut line = lines.consume_current().unwrap();
          line.discard_assert(TokenKind::Dots);
          title = Some(self.parse_inlines(&mut line.into_lines())?);
        }
        // `[positional, name=value]`
        Some(line) if line.is_block_attr_list() => {
          let mut line = lines.consume_current().unwrap();
          line.discard_assert(TokenKind::OpenBracket);
          attrs.push(self.parse_block_attr_list(&mut line)?);
        }
        // `[[anchor-id,reftext]]` — folded into an attr list entry
        Some(line) if line.is_block_anchor() => {
          let mut line = lines.consume_current().unwrap();
          let first = line.discard_assert(TokenKind::OpenBracket);
          line.discard_assert(TokenKind::OpenBracket);
          let Some(anchor) = self.parse_block_anchor(&mut line)? else {
            self.err_line_starting("Invalid block anchor", first.loc)?;
            return Ok(ChunkMeta::new(attrs, title, start_loc));
          };
          let mut anchor_attrs = AttrList::new(anchor.loc, self.bump);
          anchor_attrs.id = Some(anchor.id);
          anchor_attrs.positional.push(anchor.reftext);
          attrs.push(anchor_attrs);
        }
        // consume trailing comment lines for valid meta
        Some(line) if line.is_comment() && (!attrs.is_empty() || title.is_some()) => {
          lines.consume_current();
        }
        _ => break,
      }
    }
    Ok(ChunkMeta::new(attrs, title, start_loc))
  }
322
  /// Copies `s` into the parser's arena.
  pub(crate) fn string(&self, s: &str) -> BumpString<'arena> {
    BumpString::from_str_in(s, self.bump)
  }
326}
327
/// Access to the arena allocator, plus arena-allocating conveniences.
pub trait HasArena<'arena> {
  fn bump(&self) -> &'arena Bump;
  /// Creates a token whose lexeme is copied into the arena.
  fn token(&self, kind: TokenKind, lexeme: &str, loc: SourceLocation) -> Token<'arena> {
    Token::new(kind, loc, BumpString::from_str_in(lexeme, self.bump()))
  }
}
334
impl<'arena> HasArena<'arena> for Parser<'arena> {
  fn bump(&self) -> &'arena Bump {
    self.bump
  }
}
340
/// Outcome of processing a preprocessor directive line (see `_read_line`).
pub enum DirectiveAction<'arena> {
  /// keep the directive line verbatim (e.g. an unprocessable directive)
  Passthrough,
  /// the directive line was consumed; read the next line
  ReadNextLine,
  /// an include directive was dropped; read on, flagging the drop
  IgnoreNotIncluded,
  /// a conditional didn't match; skip lines until the matching endif
  SkipLinesUntilEndIf,
  /// replace the directive line with this line (e.g. include -> link)
  SubstituteLine(Line<'arena>),
}
348
/// Identifies where the source text being parsed came from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceFile {
  /// piped input; `cwd` is the working directory
  Stdin { cwd: Path },
  /// a real file on disk
  Path(Path),
  /// an ephemeral in-memory buffer
  Tmp,
}
355
356impl SourceFile {
357  pub fn file_name(&self) -> &str {
358    match self {
359      SourceFile::Stdin { .. } => "<stdin>",
360      SourceFile::Path(path) => path.file_name(),
361      SourceFile::Tmp => "<temp-buffer>",
362    }
363  }
364
365  pub fn matches_xref_target(&self, target: &str) -> bool {
366    let SourceFile::Path(path) = self else {
367      return false;
368    };
369    let filename = path.file_name();
370    if filename == target {
371      return true;
372    }
373    let xref_ext = file::ext(target);
374    let path_ext = file::ext(filename);
375    if xref_ext.is_some() && xref_ext != path_ext {
376      return false;
377    }
378    let fullpath = path.to_string();
379    if fullpath.ends_with(target) {
380      true
381    } else if xref_ext.is_some() {
382      false
383    } else {
384      file::remove_ext(&fullpath).ends_with(target)
385    }
386  }
387}
388
389impl From<Diagnostic> for Vec<Diagnostic> {
390  fn from(diagnostic: Diagnostic) -> Self {
391    vec![diagnostic]
392  }
393}
394
395#[cfg(test)]
396mod tests {
397  use super::*;
398  use test_utils::*;
399
  /// Builds an `IncludeResolver` that resolves every target to the given
  /// bytes, regardless of what was requested.
  fn resolve(src: &'static str) -> Box<dyn IncludeResolver> {
    #[derive(Clone)]
    struct MockResolver(pub Vec<u8>);
    impl IncludeResolver for MockResolver {
      fn resolve(
        &mut self,
        _: IncludeTarget,
        buffer: &mut dyn IncludeBuffer,
      ) -> std::result::Result<usize, ResolveError> {
        buffer.initialize(self.0.len());
        let bytes = buffer.as_bytes_mut();
        bytes.copy_from_slice(&self.0);
        Ok(self.0.len())
      }
      fn get_base_dir(&self) -> Option<String> {
        Some("/".to_string())
      }
      fn clone_box(&self) -> Box<dyn IncludeResolver> {
        Box::new(self.clone())
      }
    }
    Box::new(MockResolver(Vec::from(src.as_bytes())))
  }
423
424  fn reassemble(lines: ContiguousLines) -> String {
425    lines
426      .iter()
427      .map(|l| l.reassemble_src())
428      .collect::<Vec<_>>()
429      .join("\n")
430  }
431
  // verifies attr-ref expansion: the `{foo}` ref token is kept, followed by
  // the resolved value's tokens, all sharing the ref's source location
  #[test]
  fn test_attr_ref() {
    let mut parser = test_parser!("hello {foo} world");
    parser
      .document
      .meta
      .insert_doc_attr("foo", "_bar_")
      .unwrap();
    let mut lines = parser.read_lines().unwrap().unwrap();
    let line = lines.consume_current().unwrap();
    let tokens = line.into_iter().collect::<Vec<_>>();
    expect_eq!(
      &tokens,
      &[
        Token::new(TokenKind::Word, loc!(0..5), bstr!("hello")),
        Token::new(TokenKind::Whitespace, loc!(5..6), bstr!(" ")),
        Token::new(TokenKind::AttrRef, loc!(6..11), bstr!("{foo}")),
        // these are inserted as an inline preprocessing step
        // NB: we will use the source loc of the attr ref token to know how
        // to skip over the resolve attribute in no-attr-ref subs contexts
        Token::new(TokenKind::Underscore, loc!(6..11), bstr!("_")),
        Token::new(TokenKind::Word, loc!(6..11), bstr!("bar")),
        Token::new(TokenKind::Underscore, loc!(6..11), bstr!("_")),
        // end inserted.
        Token::new(TokenKind::Whitespace, loc!(11..12), bstr!(" ")),
        Token::new(TokenKind::Word, loc!(12..17), bstr!("world")),
      ]
    );
  }
461
  // a malformed include directive (space before `[]`) is not processed,
  // so the line survives verbatim in the contiguous lines
  #[test]
  fn invalid_directive_line_passed_thru() {
    let input = adoc! {"
      foo
      include::invalid []
      bar
    "};

    let mut parser = test_parser!(input);
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      input.trim_end()
    );
  }
476
  // in secure mode, an include directive is rewritten into a link macro
  // (`link:...[role=include,]`); also pins the exact substituted tokens
  // and their source locations
  #[test]
  fn safe_mode_include_to_link() {
    let input = adoc! {"
      foo
      include::include-file.adoc[]
      baz
    "};

    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      adoc! {"
        foo
        link:include-file.adoc[role=include,]
        baz"
      }
    );

    // assert on the tokens and positions
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());

    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      line.consume_current().unwrap(),
      Token::new(TokenKind::Word, loc!(0..3), bstr!("foo"))
    );
    assert!(line.consume_current().is_none());

    // sanity-check the raw input offsets referenced below
    assert_eq!(&input[8..13], "ude::");
    assert_eq!(&input[30..32], "[]");

    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      std::array::from_fn(|_| line.consume_current().unwrap()),
      [
        // we "drop" positions 4-7, the `inc` of `include::`
        // which becomes `••••link:`, keeping rest of token positions
        Token::new(TokenKind::MacroName, loc!(8..13), bstr!("link:")),
        Token::new(TokenKind::Word, loc!(13..20), bstr!("include")),
        Token::new(TokenKind::Dashes, loc!(20..21), bstr!("-")),
        Token::new(TokenKind::Word, loc!(21..25), bstr!("file")),
        Token::new(TokenKind::Dots, loc!(25..26), bstr!(".")),
        Token::new(TokenKind::Word, loc!(26..30), bstr!("adoc")),
        Token::new(TokenKind::OpenBracket, loc!(30..31), bstr!("[")),
        // these tokens are inserted, they have no true source so we
        // represent their position as empty at the insertion point
        Token::new(TokenKind::Word, loc!(31..31), bstr!("role")),
        Token::new(TokenKind::EqualSigns, loc!(31..31), bstr!("=")),
        Token::new(TokenKind::Word, loc!(31..31), bstr!("include")),
        Token::new(TokenKind::Comma, loc!(31..31), bstr!(",")),
        // /end `role=include` inserted tokens
        Token::new(TokenKind::CloseBracket, loc!(31..32), bstr!("]")),
      ]
    );
    assert!(line.consume_current().is_none());
  }
535
  // attrs on the original include directive survive the include -> link
  // rewrite, appended after the inserted `role=include,`
  #[test]
  fn attrs_preserved_when_replacing_include() {
    let input = "include::some-file.adoc[leveloffset+=1]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:some-file.adoc[role=include,leveloffset+=1]"
    );
  }
546
  // an include target containing spaces is wrapped in a `pass:c[...]`
  // macro so the generated link target stays parseable
  #[test]
  fn spaces_in_include_file_to_pass_macro_link() {
    let input = "include::foo bar baz.adoc[]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[foo bar baz.adoc][role=include,]"
    );
  }
557
  // when URI reads aren't allowed and strict mode is off, a URI include
  // degrades to a link rather than producing an error
  #[test]
  fn uri_read_not_allowed_include_non_strict() {
    // non-strict mode replaced with link
    let input = "include::https://my.com/foo bar.adoc[]";
    let mut parser = test_parser!(input);
    let mut settings = JobSettings::r#unsafe();
    settings.strict = false;
    parser.apply_job_settings(settings);
    expect_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[https://my.com/foo bar.adoc][role=include,]",
      from: input
    );
  }
572}