// asciidork_parser/parser.rs

1use std::fmt::Debug;
2use std::{cell::RefCell, rc::Rc};
3
4use crate::internal::*;
5
6pub struct Parser<'arena> {
7  pub(super) bump: &'arena Bump,
8  pub(super) lexer: Lexer<'arena>,
9  pub(super) document: Document<'arena>,
10  pub(super) peeked_lines: Option<ContiguousLines<'arena>>,
11  pub(super) peeked_meta: Option<ChunkMeta<'arena>>,
12  pub(super) ctx: ParseContext<'arena>,
13  pub(super) errors: RefCell<Vec<Diagnostic>>,
14  pub(super) strict: bool, // todo: naming...
15  pub(super) include_resolver: Option<Box<dyn IncludeResolver>>,
16  #[cfg(feature = "attr_ref_observation")]
17  pub(super) attr_ref_observer: Option<Box<dyn AttrRefObserver>>,
18}
19
20impl<'arena> Parser<'arena> {
21  pub fn new(src: BumpVec<'arena, u8>, file: SourceFile, bump: &'arena Bump) -> Self {
22    Parser::from_lexer(Lexer::new(src, file, bump))
23  }
24
25  pub fn from_str(src: &str, file: SourceFile, bump: &'arena Bump) -> Self {
26    Parser::from_lexer(Lexer::from_str(bump, file, src))
27  }
28
29  fn from_lexer(lexer: Lexer<'arena>) -> Self {
30    let mut parser = Parser {
31      bump: lexer.bump,
32      document: Document::new(lexer.bump),
33      peeked_lines: None,
34      peeked_meta: None,
35      ctx: ParseContext::new(lexer.bump),
36      errors: RefCell::new(Vec::new()),
37      strict: true,
38      include_resolver: None,
39      lexer,
40      #[cfg(feature = "attr_ref_observation")]
41      attr_ref_observer: None,
42    };
43    parser.set_source_file_attrs();
44    parser
45  }
46
47  pub fn apply_job_settings(&mut self, settings: JobSettings) {
48    if let Some(leveloffset) = settings.job_attrs.get("leveloffset") {
49      Parser::adjust_leveloffset(&mut self.ctx.leveloffset, &leveloffset.value);
50    }
51    self.strict = settings.strict;
52    self.ctx.max_include_depth = settings.job_attrs.u16("max-include-depth").unwrap_or(64);
53    self.document.meta = settings.into();
54    self.set_source_file_attrs();
55  }
56
57  pub fn register_plugin_macros(&mut self, names: &[impl AsRef<str>]) {
58    self.lexer.register_plugin_macros(names);
59  }
60
61  pub fn provide_timestamps(
62    &mut self,
63    now: u64,
64    input_modified_time: Option<u64>,
65    reproducible_override: Option<u64>,
66  ) {
67    self.set_datetime_attrs(now, input_modified_time, reproducible_override);
68  }
69
70  pub fn set_resolver(&mut self, resolver: Box<dyn IncludeResolver>) {
71    self.include_resolver = Some(resolver);
72  }
73
74  #[cfg(feature = "attr_ref_observation")]
75  pub fn set_attr_ref_observer(&mut self, observer: Box<dyn AttrRefObserver>) {
76    self.attr_ref_observer = Some(observer);
77  }
78
79  pub fn cell_parser(&mut self, src: BumpVec<'arena, u8>, offset: u32) -> Parser<'arena> {
80    let mut cell_parser = Parser::new(src, self.lexer.source_file().clone(), self.bump);
81    cell_parser.include_resolver = self.include_resolver.as_ref().map(|r| r.clone_box());
82    cell_parser.strict = self.strict;
83    cell_parser.lexer.adjust_offset(offset);
84    cell_parser.ctx = self.ctx.clone_for_cell(self.bump);
85    cell_parser.document.meta = self.document.meta.clone_for_cell();
86    cell_parser.document.anchors = Rc::clone(&self.document.anchors);
87
88    #[cfg(feature = "attr_ref_observation")]
89    {
90      cell_parser.attr_ref_observer = self.attr_ref_observer.take();
91    }
92
93    cell_parser
94  }
95
96  pub(crate) fn loc(&self) -> SourceLocation {
97    self
98      .peeked_lines
99      .as_ref()
100      .and_then(|lines| lines.first_loc())
101      .unwrap_or_else(|| self.lexer.loc())
102  }
103
104  pub(crate) fn read_line(&mut self) -> Result<Option<Line<'arena>>> {
105    Ok(self._read_line(false)?.map(|(line, _)| line))
106  }
107
108  fn _read_line(&mut self, ignored_last: bool) -> Result<Option<(Line<'arena>, bool)>> {
109    assert!(self.peeked_lines.is_none());
110    if self.lexer.is_eof() {
111      return Ok(None);
112    }
113
114    use TokenKind::*;
115    let mut drop_line = false;
116    let mut line = Line::empty(self.bump);
117    while !self.lexer.at_newline() && !self.lexer.is_eof() {
118      let mut token = self.lexer.next_token();
119      if line.is_empty() {
120        // if we encounter a line like `|===` in the very first paragraph,
121        // we know we're not in the header anymore, so any attrs refs can be set properly
122        if self.ctx.in_header
123          && !matches!(
124            token.kind,
125            Colon | EqualSigns | Word | ForwardSlashes | Directive | OpenBracket
126          )
127        {
128          self.ctx.in_header = false;
129        } else if token.kind == Colon && self.ctx.subs.attr_refs() {
130          self.try_parse_attr_def(&mut token)?;
131        }
132      }
133      self.push_token_replacing_attr_ref(token, &mut line, &mut drop_line)?;
134    }
135    self.lexer.skip_newline();
136    if drop_line {
137      return self._read_line(false);
138    }
139    if line.starts(TokenKind::Directive) && !self.ctx.within_block_comment() {
140      match self.try_process_directive(&mut line)? {
141        DirectiveAction::Passthrough => Ok(Some((line, ignored_last))),
142        DirectiveAction::SubstituteLine(line) => Ok(Some((line, ignored_last))),
143        DirectiveAction::IgnoreNotIncluded => self._read_line(true),
144        DirectiveAction::ReadNextLine => self._read_line(false),
145        DirectiveAction::SkipLinesUntilEndIf => Ok(
146          self
147            .skip_lines_until_endif(&line)?
148            .map(|l| (l, ignored_last)),
149        ),
150      }
151    } else {
152      Ok(Some((line, ignored_last)))
153    }
154  }
155
156  pub(crate) fn read_lines(&mut self) -> Result<Option<ContiguousLines<'arena>>> {
157    self.ctx.comment_delim_in_lines = false;
158    if let Some(peeked) = self.peeked_lines.take() {
159      return Ok(Some(peeked));
160    }
161    self.lexer.consume_empty_lines();
162    if self.lexer.is_eof() {
163      return Ok(None);
164    }
165    let mut lines = Deq::new(self.bump);
166    while let Some((line, ignored_removed_include_line)) = self._read_line(false)? {
167      if line.is_emptyish() {
168        if lines.is_empty() {
169          // this case can happen if our first non-empty line was an include directive
170          // that then resolved to an initial empty line, otherwise consume_empty_lines
171          // would have skipped over it, so we keep going
172          continue;
173        } else if !ignored_removed_include_line {
174          // this case can happen if our first non-empty line was an include directive
175          // this case happens only when we DROP a line
176          break;
177        }
178      }
179      if line.is_delimiter_kind(DelimiterKind::Comment) {
180        self.ctx.comment_delim_in_lines = true;
181      }
182      lines.push(line);
183      if self.lexer.at_newline() {
184        break;
185      }
186    }
187    if lines.is_empty() {
188      Ok(None)
189    } else {
190      Ok(Some(ContiguousLines::new(lines)))
191    }
192  }
193
194  pub(crate) fn read_lines_until(
195    &mut self,
196    delimiter: Delimiter,
197  ) -> Result<Option<ContiguousLines<'arena>>> {
198    let Some(mut lines) = self.read_lines()? else {
199      return Ok(None);
200    };
201    if lines.any(|l| l.is_delimiter(delimiter)) {
202      return Ok(Some(lines));
203    }
204
205    let mut additional_lines = BumpVec::new_in(self.bump);
206    while !self.lexer.is_eof() && !self.at_delimiter(delimiter) {
207      additional_lines.push(self.read_line()?.unwrap());
208    }
209    lines.extend(additional_lines);
210
211    while lines.last().map(|l| l.is_empty()) == Some(true) {
212      lines.pop();
213    }
214    Ok(Some(lines))
215  }
216
217  fn at_delimiter(&self, delimiter: Delimiter) -> bool {
218    match delimiter.kind {
219      DelimiterKind::BlockQuote => self.lexer.at_delimiter_line() == Some((4, b'_')),
220      DelimiterKind::Example => {
221        self.lexer.at_delimiter_line() == Some((delimiter.len as u32, b'='))
222      }
223      DelimiterKind::Open => self.lexer.at_delimiter_line() == Some((2, b'-')),
224      DelimiterKind::Sidebar => self.lexer.at_delimiter_line() == Some((4, b'*')),
225      DelimiterKind::Listing => {
226        if delimiter.len == 3 {
227          self.lexer.at_delimiter_line() == Some((3, b'`'))
228        } else {
229          self.lexer.at_delimiter_line() == Some((4, b'-'))
230        }
231      }
232      DelimiterKind::Literal => self.lexer.at_delimiter_line() == Some((4, b'.')),
233      DelimiterKind::Passthrough => self.lexer.at_delimiter_line() == Some((4, b'+')),
234      DelimiterKind::Comment => self.lexer.at_delimiter_line() == Some((4, b'/')),
235    }
236  }
237
238  pub(crate) fn restore_lines(&mut self, lines: ContiguousLines<'arena>) {
239    debug_assert!(self.peeked_lines.is_none());
240    if !lines.is_empty() {
241      self.peeked_lines = Some(lines);
242    }
243  }
244
245  pub(crate) fn restore_peeked_meta(&mut self, meta: ChunkMeta<'arena>) {
246    if !meta.is_empty() {
247      debug_assert!(self.peeked_meta.is_none());
248      self.peeked_meta = Some(meta);
249    }
250  }
251
252  pub(crate) fn restore_peeked(&mut self, lines: ContiguousLines<'arena>, meta: ChunkMeta<'arena>) {
253    self.restore_lines(lines);
254    self.restore_peeked_meta(meta);
255  }
256
257  pub fn parse(mut self) -> std::result::Result<ParseResult<'arena>, Vec<Diagnostic>> {
258    self.lockdown_secure_mode();
259    self.parse_document_header()?;
260    self.prepare_toc();
261
262    // ensure we only read a single "paragraph" for `inline` doc_type
263    // https://docs.asciidoctor.org/asciidoc/latest/document/doctype/#inline-doctype-rules
264    if self.document.meta.get_doctype() == DocType::Inline {
265      if self.peeked_lines.is_none() {
266        // tmp:
267        self.peeked_lines = self.read_lines().expect("tmp");
268      }
269      self.lexer.truncate();
270    }
271
272    if let Some(book_content) = self.parse_book()? {
273      self.document.content = book_content;
274    } else {
275      let sectioned = self.parse_sectioned()?;
276      self.document.content = sectioned.into_doc_content(self.bump);
277    }
278
279    self.resolve_docinfo();
280
281    // so the backend can see them replayed in decl order
282    self.document.meta.clear_doc_attrs();
283    self.diagnose_document()?;
284    Ok(self.into())
285  }
286
287  pub(crate) fn parse_sectioned(&mut self) -> Result<Sectioned<'arena>> {
288    let mut blocks = bvec![in self.bump];
289    while let Some(block) = self.parse_block()? {
290      blocks.push(block);
291    }
292    let preamble = if blocks.is_empty() { None } else { Some(blocks) };
293    let mut sections = bvec![in self.bump];
294    while let Some(section) = self.parse_section()? {
295      sections.push(section);
296    }
297    Ok(Sectioned { preamble, sections })
298  }
299
300  pub(crate) fn parse_chunk_meta(
301    &mut self,
302    lines: &mut ContiguousLines<'arena>,
303  ) -> Result<ChunkMeta<'arena>> {
304    if let Some(meta) = self.peeked_meta.take() {
305      return Ok(meta);
306    }
307    assert!(!lines.is_empty());
308    let start_loc = lines.current_token().unwrap().loc;
309    let mut attrs = MultiAttrList::new_in(self.bump);
310    let mut title = None;
311    if !lines.current().unwrap().is_fully_unconsumed() {
312      return Ok(ChunkMeta::new(attrs, title, start_loc));
313    }
314    loop {
315      match lines.current() {
316        Some(line) if line.is_chunk_title() => {
317          let mut line = lines.consume_current().unwrap();
318          line.discard_assert(TokenKind::Dots);
319          title = Some(self.parse_inlines(&mut line.into_lines())?);
320        }
321        Some(line) if line.is_block_attr_list() => {
322          let mut line = lines.consume_current().unwrap();
323          line.discard_assert(TokenKind::OpenBracket);
324          attrs.push(self.parse_block_attr_list(&mut line)?);
325        }
326        Some(line) if line.is_block_anchor() => {
327          let mut line = lines.consume_current().unwrap();
328          let first = line.discard_assert(TokenKind::OpenBracket);
329          line.discard_assert(TokenKind::OpenBracket);
330          let Some(anchor) = self.parse_block_anchor(&mut line)? else {
331            self.err_line_starting("Invalid block anchor", first.loc)?;
332            return Ok(ChunkMeta::new(attrs, title, start_loc));
333          };
334          let mut anchor_attrs = AttrList::new(anchor.loc, self.bump);
335          anchor_attrs.id = Some(anchor.id);
336          anchor_attrs.positional.push(anchor.reftext);
337          attrs.push(anchor_attrs);
338        }
339        // consume trailing comment lines for valid meta
340        Some(line) if line.is_comment() && (!attrs.is_empty() || title.is_some()) => {
341          lines.consume_current();
342        }
343        _ => break,
344      }
345    }
346    Ok(ChunkMeta::new(attrs, title, start_loc))
347  }
348
349  pub(crate) fn string(&self, s: &str) -> BumpString<'arena> {
350    BumpString::from_str_in(s, self.bump)
351  }
352
353  fn lockdown_secure_mode(&mut self) {
354    let meta = &mut self.document.meta;
355    if meta.safe_mode == SafeMode::Secure {
356      _ = meta.insert_job_attr("data-uri", JobAttr::readonly(false));
357      if meta.is_unset("max-attribute-value-size") {
358        _ = meta.insert_job_attr("max-attribute-value-size", JobAttr::readonly("4096"));
359      }
360      if meta.is_unset("linkcss") {
361        _ = meta.insert_job_attr("linkcss", JobAttr::readonly(""));
362      }
363      if meta.is_unset("icons") {
364        _ = meta.insert_job_attr("icons", JobAttr::readonly(false));
365      }
366    }
367  }
368}
369
370pub trait HasArena<'arena> {
371  fn bump(&self) -> &'arena Bump;
372  fn token(&self, kind: TokenKind, lexeme: &str, loc: SourceLocation) -> Token<'arena> {
373    Token::new(kind, loc, BumpString::from_str_in(lexeme, self.bump()))
374  }
375}
376
377impl<'arena> HasArena<'arena> for Parser<'arena> {
378  fn bump(&self) -> &'arena Bump {
379    self.bump
380  }
381}
382
383pub enum DirectiveAction<'arena> {
384  Passthrough,
385  ReadNextLine,
386  IgnoreNotIncluded,
387  SkipLinesUntilEndIf,
388  SubstituteLine(Line<'arena>),
389}
390
391#[derive(Debug, Clone, PartialEq, Eq)]
392pub enum SourceFile {
393  Stdin { cwd: Path },
394  Path(Path),
395  Tmp,
396}
397
398impl SourceFile {
399  pub fn file_name(&self) -> &str {
400    match self {
401      SourceFile::Stdin { .. } => "<stdin>",
402      SourceFile::Path(path) => path.file_name(),
403      SourceFile::Tmp => "<temp-buffer>",
404    }
405  }
406
407  pub fn matches_xref_target(&self, target: &str) -> bool {
408    let SourceFile::Path(path) = self else {
409      return false;
410    };
411    let filename = path.file_name();
412    if filename == target {
413      return true;
414    }
415    let xref_ext = file::ext(target);
416    let path_ext = file::ext(filename);
417    if xref_ext.is_some() && xref_ext != path_ext {
418      return false;
419    }
420    let fullpath = path.to_string();
421    if fullpath.ends_with(target) {
422      true
423    } else if xref_ext.is_some() {
424      false
425    } else {
426      file::remove_ext(&fullpath).ends_with(target)
427    }
428  }
429}
430
431impl From<Diagnostic> for Vec<Diagnostic> {
432  fn from(diagnostic: Diagnostic) -> Self {
433    vec![diagnostic]
434  }
435}
436
437#[cfg(test)]
438mod tests {
439  use super::*;
440  use test_utils::*;
441
442  fn resolve(src: &'static str) -> Box<dyn IncludeResolver> {
443    #[derive(Clone)]
444    struct MockResolver(pub Vec<u8>);
445    impl IncludeResolver for MockResolver {
446      fn resolve(
447        &mut self,
448        _: IncludeTarget,
449        buffer: &mut dyn IncludeBuffer,
450        _: SafeMode,
451      ) -> std::result::Result<usize, ResolveError> {
452        buffer.initialize(self.0.len());
453        let bytes = buffer.as_bytes_mut();
454        bytes.copy_from_slice(&self.0);
455        Ok(self.0.len())
456      }
457      fn get_base_dir(&self) -> Option<String> {
458        Some("/".to_string())
459      }
460      fn clone_box(&self) -> Box<dyn IncludeResolver> {
461        Box::new(self.clone())
462      }
463    }
464    Box::new(MockResolver(Vec::from(src.as_bytes())))
465  }
466
467  fn reassemble(lines: ContiguousLines) -> String {
468    lines
469      .iter()
470      .map(|l| l.reassemble_src())
471      .collect::<Vec<_>>()
472      .join("\n")
473  }
474
475  #[test]
476  fn test_attr_ref() {
477    let mut parser = test_parser!("hello {foo} world");
478    parser
479      .document
480      .meta
481      .insert_doc_attr("foo", "_bar_")
482      .unwrap();
483    let mut lines = parser.read_lines().unwrap().unwrap();
484    let line = lines.consume_current().unwrap();
485    let tokens = line.into_iter().collect::<Vec<_>>();
486    expect_eq!(
487      &tokens,
488      &[
489        Token::new(TokenKind::Word, loc!(0..5), bstr!("hello")),
490        Token::new(TokenKind::Whitespace, loc!(5..6), bstr!(" ")),
491        Token::new(TokenKind::AttrRef, loc!(6..11), bstr!("{foo}")),
492        // these are inserted as an inline preprocessing step
493        // NB: we will use the source loc of the attr ref token to know how
494        // to skip over the resolve attribute in no-attr-ref subs contexts
495        Token {
496          kind: TokenKind::Underscore,
497          loc: loc!(6..11),
498          lexeme: bstr!("_"),
499          attr_replacement: true
500        },
501        Token {
502          kind: TokenKind::Word,
503          loc: loc!(6..11),
504          lexeme: bstr!("bar"),
505          attr_replacement: true
506        },
507        Token {
508          kind: TokenKind::Underscore,
509          loc: loc!(6..11),
510          lexeme: bstr!("_"),
511          attr_replacement: true
512        },
513        // end inserted.
514        Token::new(TokenKind::Whitespace, loc!(11..12), bstr!(" ")),
515        Token::new(TokenKind::Word, loc!(12..17), bstr!("world")),
516      ]
517    );
518  }
519
520  #[test]
521  fn invalid_directive_line_passed_thru() {
522    let input = adoc! {"
523      foo
524      include::invalid []
525      bar
526    "};
527
528    let mut parser = test_parser!(input);
529    assert_eq!(
530      reassemble(parser.read_lines().unwrap().unwrap()),
531      input.trim_end()
532    );
533  }
534
535  #[test]
536  fn safe_mode_include_to_link() {
537    let input = adoc! {"
538      foo
539      include::include-file.adoc[]
540      baz
541    "};
542
543    let mut parser = test_parser!(input);
544    parser.apply_job_settings(JobSettings::secure());
545    assert_eq!(
546      reassemble(parser.read_lines().unwrap().unwrap()),
547      adoc! {"
548        foo
549        link:include-file.adoc[role=include,]
550        baz"
551      }
552    );
553
554    // assert on the tokens and positions
555    let mut parser = test_parser!(input);
556    parser.apply_job_settings(JobSettings::secure());
557
558    let mut line = parser.read_line().unwrap().unwrap();
559    expect_eq!(
560      line.consume_current().unwrap(),
561      Token::new(TokenKind::Word, loc!(0..3), bstr!("foo"))
562    );
563    assert!(line.consume_current().is_none());
564
565    assert_eq!(&input[8..13], "ude::");
566    assert_eq!(&input[30..32], "[]");
567
568    let mut line = parser.read_line().unwrap().unwrap();
569    expect_eq!(
570      std::array::from_fn(|_| line.consume_current().unwrap()),
571      [
572        // we "drop" positions 4-7, the `inc` of `include::`
573        // which becomes `••••link:`, keeping rest of token positions
574        Token::new(TokenKind::MacroName, loc!(8..13), bstr!("link:")),
575        Token::new(TokenKind::Word, loc!(13..20), bstr!("include")),
576        Token::new(TokenKind::Dashes, loc!(20..21), bstr!("-")),
577        Token::new(TokenKind::Word, loc!(21..25), bstr!("file")),
578        Token::new(TokenKind::Dots, loc!(25..26), bstr!(".")),
579        Token::new(TokenKind::Word, loc!(26..30), bstr!("adoc")),
580        Token::new(TokenKind::OpenBracket, loc!(30..31), bstr!("[")),
581        // these tokens are inserted, they have no true source so we
582        // represent their position as empty at the insertion point
583        Token::new(TokenKind::Word, loc!(31..31), bstr!("role")),
584        Token::new(TokenKind::EqualSigns, loc!(31..31), bstr!("=")),
585        Token::new(TokenKind::Word, loc!(31..31), bstr!("include")),
586        Token::new(TokenKind::Comma, loc!(31..31), bstr!(",")),
587        // /end `role=include` inserted tokens
588        Token::new(TokenKind::CloseBracket, loc!(31..32), bstr!("]")),
589      ]
590    );
591    assert!(line.consume_current().is_none());
592  }
593
594  #[test]
595  fn attrs_preserved_when_replacing_include() {
596    let input = "include::some-file.adoc[leveloffset+=1]";
597    let mut parser = test_parser!(input);
598    parser.apply_job_settings(JobSettings::secure());
599    assert_eq!(
600      parser.read_line().unwrap().unwrap().reassemble_src(),
601      "link:some-file.adoc[role=include,leveloffset+=1]"
602    );
603  }
604
605  #[test]
606  fn spaces_in_include_file_to_pass_macro_link() {
607    let input = "include::foo bar baz.adoc[]";
608    let mut parser = test_parser!(input);
609    parser.apply_job_settings(JobSettings::secure());
610    assert_eq!(
611      parser.read_line().unwrap().unwrap().reassemble_src(),
612      "link:pass:c[foo bar baz.adoc][role=include,]"
613    );
614  }
615
616  #[test]
617  fn uri_read_not_allowed_include_non_strict() {
618    // non-strict mode replaced with link
619    let input = "include::https://my.com/foo bar.adoc[]";
620    let mut parser = test_parser!(input);
621    let mut settings = JobSettings::r#unsafe();
622    settings.strict = false;
623    parser.apply_job_settings(settings);
624    expect_eq!(
625      parser.read_line().unwrap().unwrap().reassemble_src(),
626      "link:pass:c[https://my.com/foo bar.adoc][role=include,]",
627      from: input
628    );
629  }
630}