asciidork_parser/
parser.rs

1use std::fmt::{Debug, Formatter};
2use std::{cell::RefCell, rc::Rc};
3
4use crate::internal::*;
5
/// Stateful AsciiDoc parser; every allocation it makes lives in the
/// `'arena` bump arena it was constructed with.
pub struct Parser<'arena> {
  pub(super) bump: &'arena Bump,
  pub(super) lexer: Lexer<'arena>,
  // the document being built up as parsing proceeds
  pub(super) document: Document<'arena>,
  // lookahead buffer: lines already read but pushed back via `restore_lines`
  pub(super) peeked_lines: Option<ContiguousLines<'arena>>,
  // lookahead buffer: chunk meta pushed back via `restore_peeked_meta`
  pub(super) peeked_meta: Option<ChunkMeta<'arena>>,
  pub(super) ctx: ParseContext<'arena>,
  // diagnostics gathered while parsing; RefCell so `&self` helpers can record
  pub(super) errors: RefCell<Vec<Diagnostic>>,
  pub(super) strict: bool, // todo: naming...
  // resolves `include::` directive targets, when one has been installed
  pub(super) include_resolver: Option<Box<dyn IncludeResolver>>,
  #[cfg(feature = "attr_ref_observation")]
  pub(super) attr_ref_observer: Option<Box<dyn AttrRefObserver>>,
}
19
/// Successful output of [`Parser::parse`]: the finished document plus any
/// non-fatal diagnostics accumulated along the way.
pub struct ParseResult<'arena> {
  pub document: Document<'arena>,
  // non-fatal diagnostics (fatal ones surface as the `Err` of `parse`)
  pub warnings: Vec<Diagnostic>,
  // the observer is handed back so the caller can inspect what it recorded
  #[cfg(feature = "attr_ref_observation")]
  pub attr_ref_observer: Option<Box<dyn AttrRefObserver>>,
}
26
impl<'arena> Parser<'arena> {
  /// Constructs a parser over an already-arena-allocated byte buffer.
  pub fn new(src: BumpVec<'arena, u8>, file: SourceFile, bump: &'arena Bump) -> Self {
    Parser::from_lexer(Lexer::new(src, file, bump))
  }

  /// Constructs a parser from a `&str`, copying the source into the arena.
  pub fn from_str(src: &str, file: SourceFile, bump: &'arena Bump) -> Self {
    Parser::from_lexer(Lexer::from_str(bump, file, src))
  }

  // Shared constructor: every component borrows the lexer's bump arena.
  fn from_lexer(lexer: Lexer<'arena>) -> Self {
    let mut parser = Parser {
      bump: lexer.bump,
      document: Document::new(lexer.bump),
      peeked_lines: None,
      peeked_meta: None,
      ctx: ParseContext::new(lexer.bump),
      errors: RefCell::new(Vec::new()),
      strict: true,
      include_resolver: None,
      lexer,
      #[cfg(feature = "attr_ref_observation")]
      attr_ref_observer: None,
    };
    parser.set_source_file_attrs();
    parser
  }

  /// Applies invocation-level job settings: leveloffset adjustment,
  /// strictness, max include depth, and the starting document meta.
  pub fn apply_job_settings(&mut self, settings: JobSettings) {
    if let Some(leveloffset) = settings.job_attrs.get("leveloffset") {
      Parser::adjust_leveloffset(&mut self.ctx.leveloffset, &leveloffset.value);
    }
    self.strict = settings.strict;
    // 64 appears to mirror asciidoctor's default max include depth — TODO confirm
    self.ctx.max_include_depth = settings.job_attrs.u16("max-include-depth").unwrap_or(64);
    self.document.meta = settings.into();
    self.set_source_file_attrs();
  }

  /// Seeds the datetime-related document attributes from the given timestamps.
  pub fn provide_timestamps(
    &mut self,
    now: u64,
    input_modified_time: Option<u64>,
    reproducible_override: Option<u64>,
  ) {
    self.set_datetime_attrs(now, input_modified_time, reproducible_override);
  }

  /// Installs the resolver used to load `include::` directive targets.
  pub fn set_resolver(&mut self, resolver: Box<dyn IncludeResolver>) {
    self.include_resolver = Some(resolver);
  }

  /// Installs an observer notified when attribute references are resolved.
  #[cfg(feature = "attr_ref_observation")]
  pub fn set_attr_ref_observer(&mut self, observer: Box<dyn AttrRefObserver>) {
    self.attr_ref_observer = Some(observer);
  }

  /// Creates a child parser for nested cell content (e.g. a table cell),
  /// sharing the arena and anchors, cloning the resolver, and carrying
  /// cell-scoped copies of the parse context and document meta.
  pub fn cell_parser(&mut self, src: BumpVec<'arena, u8>, offset: u32) -> Parser<'arena> {
    let mut cell_parser = Parser::new(src, self.lexer.source_file().clone(), self.bump);
    cell_parser.include_resolver = self.include_resolver.as_ref().map(|r| r.clone_box());
    cell_parser.strict = self.strict;
    // shift token locations so they map back into the enclosing document
    cell_parser.lexer.adjust_offset(offset);
    cell_parser.ctx = self.ctx.clone_for_cell(self.bump);
    cell_parser.document.meta = self.document.meta.clone_for_cell();
    cell_parser.document.anchors = Rc::clone(&self.document.anchors);

    #[cfg(feature = "attr_ref_observation")]
    {
      // NB: the observer is *moved* out of self — presumably the caller
      // restores it after the cell parse; verify at call sites
      cell_parser.attr_ref_observer = self.attr_ref_observer.take();
    }

    cell_parser
  }

  /// Current source location: the start of any peeked-back lines, otherwise
  /// wherever the lexer is.
  pub(crate) fn loc(&self) -> SourceLocation {
    self
      .peeked_lines
      .as_ref()
      .and_then(|lines| lines.first_loc())
      .unwrap_or_else(|| self.lexer.loc())
  }

  /// Reads the next line, transparently processing preprocessor directives.
  pub(crate) fn read_line(&mut self) -> Result<Option<Line<'arena>>> {
    Ok(self._read_line(false)?.map(|(line, _)| line))
  }

  // Core line reader. The bool in the returned tuple reports whether the
  // previous line was dropped because an include directive was ignored
  // (`DirectiveAction::IgnoreNotIncluded`) — `read_lines` uses this to decide
  // whether an emptyish line should end a contiguous block.
  fn _read_line(&mut self, ignored_last: bool) -> Result<Option<(Line<'arena>, bool)>> {
    assert!(self.peeked_lines.is_none());
    if self.lexer.is_eof() {
      return Ok(None);
    }

    let mut drop_line = false;
    let mut line = Line::empty(self.bump);
    // accumulate tokens to end-of-line, resolving attr refs as we go;
    // resolution may flag the whole line to be dropped
    while !self.lexer.at_newline() && !self.lexer.is_eof() {
      let token = self.lexer.next_token();
      self.push_token_replacing_attr_ref(token, &mut line, &mut drop_line)?;
    }
    self.lexer.skip_newline();
    if drop_line {
      return self._read_line(false);
    }
    // directives are inert inside block comments
    if line.starts(TokenKind::Directive) && !self.ctx.within_block_comment() {
      match self.try_process_directive(&mut line)? {
        DirectiveAction::Passthrough => Ok(Some((line, ignored_last))),
        DirectiveAction::SubstituteLine(line) => Ok(Some((line, ignored_last))),
        DirectiveAction::IgnoreNotIncluded => self._read_line(true),
        DirectiveAction::ReadNextLine => self._read_line(false),
        DirectiveAction::SkipLinesUntilEndIf => Ok(
          self
            .skip_lines_until_endif(&line)?
            .map(|l| (l, ignored_last)),
        ),
      }
    } else {
      Ok(Some((line, ignored_last)))
    }
  }

  /// Reads the next run of contiguous (non-empty-separated) lines, honoring
  /// any lines previously pushed back via `restore_lines`.
  pub(crate) fn read_lines(&mut self) -> Result<Option<ContiguousLines<'arena>>> {
    self.ctx.comment_delim_in_lines = false;
    if let Some(peeked) = self.peeked_lines.take() {
      return Ok(Some(peeked));
    }
    self.lexer.consume_empty_lines();
    if self.lexer.is_eof() {
      return Ok(None);
    }
    let mut lines = Deq::new(self.bump);
    while let Some((line, ignored_removed_include_line)) = self._read_line(false)? {
      if line.is_emptyish() {
        if lines.is_empty() {
          // this case can happen if our first non-empty line was an include directive
          // that then resolved to an initial empty line, otherwise consume_empty_lines
          // would have skipped over it, so we keep going
          continue;
        } else if !ignored_removed_include_line {
          // a genuinely empty line ends the contiguous block; but if it only
          // looks empty because the previous line was a dropped (ignored)
          // include directive, we keep accumulating instead
          break;
        }
      }
      if line.is_delimiter_kind(DelimiterKind::Comment) {
        self.ctx.comment_delim_in_lines = true;
      }
      lines.push(line);
      if self.lexer.at_newline() {
        break;
      }
    }
    if lines.is_empty() {
      Ok(None)
    } else {
      Ok(Some(ContiguousLines::new(lines)))
    }
  }

  /// Reads lines up to (and including the run containing) the given
  /// delimiter, even across empty lines; trailing empty lines are trimmed.
  pub(crate) fn read_lines_until(
    &mut self,
    delimiter: Delimiter,
  ) -> Result<Option<ContiguousLines<'arena>>> {
    let Some(mut lines) = self.read_lines()? else {
      return Ok(None);
    };
    // fast path: the delimiter already appears in the first contiguous run
    if lines.any(|l| l.is_delimiter(delimiter)) {
      return Ok(Some(lines));
    }

    let mut additional_lines = BumpVec::new_in(self.bump);
    while !self.lexer.is_eof() && !self.at_delimiter(delimiter) {
      additional_lines.push(self.read_line()?.unwrap());
    }
    lines.extend(additional_lines);

    // drop trailing empty lines gathered before the delimiter (or EOF)
    while lines.last().map(|l| l.is_empty()) == Some(true) {
      lines.pop();
    }
    Ok(Some(lines))
  }

  // Is the lexer sitting on the delimiter line for this kind? Each kind maps
  // to a (length, byte) pair, e.g. `****` for sidebar, `----` for listing.
  fn at_delimiter(&self, delimiter: Delimiter) -> bool {
    match delimiter.kind {
      DelimiterKind::BlockQuote => self.lexer.at_delimiter_line() == Some((4, b'_')),
      DelimiterKind::Example => {
        // example delimiters match the opening delimiter's exact length
        self.lexer.at_delimiter_line() == Some((delimiter.len as u32, b'='))
      }
      DelimiterKind::Open => self.lexer.at_delimiter_line() == Some((2, b'-')),
      DelimiterKind::Sidebar => self.lexer.at_delimiter_line() == Some((4, b'*')),
      DelimiterKind::Listing => self.lexer.at_delimiter_line() == Some((4, b'-')),
      DelimiterKind::Literal => self.lexer.at_delimiter_line() == Some((4, b'.')),
      DelimiterKind::Passthrough => self.lexer.at_delimiter_line() == Some((4, b'+')),
      DelimiterKind::Comment => self.lexer.at_delimiter_line() == Some((4, b'/')),
    }
  }

  /// Pushes a run of lines back so the next `read_lines` returns it.
  pub(crate) fn restore_lines(&mut self, lines: ContiguousLines<'arena>) {
    debug_assert!(self.peeked_lines.is_none());
    if !lines.is_empty() {
      self.peeked_lines = Some(lines);
    }
  }

  /// Pushes chunk meta back so the next `parse_chunk_meta` returns it.
  pub(crate) fn restore_peeked_meta(&mut self, meta: ChunkMeta<'arena>) {
    if !meta.is_empty() {
      debug_assert!(self.peeked_meta.is_none());
      self.peeked_meta = Some(meta);
    }
  }

  /// Convenience: restore both lookahead lines and chunk meta.
  pub(crate) fn restore_peeked(&mut self, lines: ContiguousLines<'arena>, meta: ChunkMeta<'arena>) {
    self.restore_lines(lines);
    self.restore_peeked_meta(meta);
  }

  /// Consumes the parser, producing the parsed document and warnings, or all
  /// diagnostics on failure.
  pub fn parse(mut self) -> std::result::Result<ParseResult<'arena>, Vec<Diagnostic>> {
    self.parse_document_header()?;
    self.prepare_toc();

    // ensure we only read a single "paragraph" for `inline` doc_type
    // https://docs.asciidoctor.org/asciidoc/latest/document/doctype/#inline-doctype-rules
    if self.document.meta.get_doctype() == DocType::Inline {
      if self.peeked_lines.is_none() {
        // tmp:
        self.peeked_lines = self.read_lines().expect("tmp");
      }
      self.lexer.truncate();
    }

    if let Some(book_content) = self.parse_book()? {
      self.document.content = book_content;
    } else {
      let sectioned = self.parse_sectioned()?;
      self.document.content = sectioned.into_doc_content(self.bump);
    }

    // so the backend can see them replayed in decl order
    self.document.meta.clear_doc_attrs();

    self.diagnose_document()?;

    Ok(ParseResult {
      document: self.document,
      warnings: self.errors.into_inner(),
      #[cfg(feature = "attr_ref_observation")]
      attr_ref_observer: self.attr_ref_observer,
    })
  }

  /// Parses an optional preamble (blocks before any section) followed by a
  /// sequence of sections.
  pub(crate) fn parse_sectioned(&mut self) -> Result<Sectioned<'arena>> {
    let mut blocks = bvec![in self.bump];
    while let Some(block) = self.parse_block()? {
      blocks.push(block);
    }
    let preamble = if blocks.is_empty() { None } else { Some(blocks) };
    let mut sections = bvec![in self.bump];
    while let Some(section) = self.parse_section()? {
      sections.push(section);
    }
    Ok(Sectioned { preamble, sections })
  }

  /// Consumes leading chunk metadata from `lines`: block titles (`.Title`),
  /// attribute lists (`[...]`), and block anchors (`[[id]]`), in any order.
  pub(crate) fn parse_chunk_meta(
    &mut self,
    lines: &mut ContiguousLines<'arena>,
  ) -> Result<ChunkMeta<'arena>> {
    if let Some(meta) = self.peeked_meta.take() {
      return Ok(meta);
    }
    assert!(!lines.is_empty());
    let start_loc = lines.current_token().unwrap().loc;
    let mut attrs = MultiAttrList::new_in(self.bump);
    let mut title = None;
    // a partially-consumed line can't begin chunk meta
    if !lines.current().unwrap().is_fully_unconsumed() {
      return Ok(ChunkMeta::new(attrs, title, start_loc));
    }
    loop {
      match lines.current() {
        Some(line) if line.is_chunk_title() => {
          let mut line = lines.consume_current().unwrap();
          line.discard_assert(TokenKind::Dots);
          title = Some(self.parse_inlines(&mut line.into_lines())?);
        }
        Some(line) if line.is_block_attr_list() => {
          let mut line = lines.consume_current().unwrap();
          line.discard_assert(TokenKind::OpenBracket);
          attrs.push(self.parse_block_attr_list(&mut line)?);
        }
        Some(line) if line.is_block_anchor() => {
          let mut line = lines.consume_current().unwrap();
          // anchors open with `[[` — discard both brackets
          let first = line.discard_assert(TokenKind::OpenBracket);
          line.discard_assert(TokenKind::OpenBracket);
          let Some(anchor) = self.parse_block_anchor(&mut line)? else {
            self.err_line_starting("Invalid block anchor", first.loc)?;
            return Ok(ChunkMeta::new(attrs, title, start_loc));
          };
          // represent the anchor as an attr list with id + reftext
          let mut anchor_attrs = AttrList::new(anchor.loc, self.bump);
          anchor_attrs.id = Some(anchor.id);
          anchor_attrs.positional.push(anchor.reftext);
          attrs.push(anchor_attrs);
        }
        // consume trailing comment lines for valid meta
        Some(line) if line.is_comment() && (!attrs.is_empty() || title.is_some()) => {
          lines.consume_current();
        }
        _ => break,
      }
    }
    Ok(ChunkMeta::new(attrs, title, start_loc))
  }

  /// Copies `s` into an arena-allocated string.
  pub(crate) fn string(&self, s: &str) -> BumpString<'arena> {
    BumpString::from_str_in(s, self.bump)
  }

  /// Translates a source location into a (line number, column offset) pair.
  pub fn line_number_with_offset(&self, loc: SourceLocation) -> (u32, u32) {
    self.lexer.line_number_with_offset(loc)
  }

  /// Looks up the source file registered under the given index.
  pub fn source_file_at(&self, idx: u16) -> &SourceFile {
    self.lexer.source_file_at(idx)
  }
}
347
/// Anything that can hand out the bump arena, with a convenience for
/// constructing arena-allocated tokens.
pub trait HasArena<'arena> {
  fn bump(&self) -> &'arena Bump;
  /// Builds a token whose lexeme is copied into the arena.
  fn token(&self, kind: TokenKind, lexeme: &str, loc: SourceLocation) -> Token<'arena> {
    Token::new(kind, loc, BumpString::from_str_in(lexeme, self.bump()))
  }
}
354
impl<'arena> HasArena<'arena> for Parser<'arena> {
  fn bump(&self) -> &'arena Bump {
    self.bump
  }
}
360
/// What `try_process_directive` decided should happen to the current line.
pub enum DirectiveAction<'arena> {
  // not actually a directive to act on — keep the line as-is
  Passthrough,
  // directive consumed; fetch the next line
  ReadNextLine,
  // directive (e.g. an include) was ignored; drop the line but remember
  // that we did so (see `_read_line`'s tuple bool)
  IgnoreNotIncluded,
  // conditional directive was false; skip forward to the matching endif
  SkipLinesUntilEndIf,
  // directive rewritten into a replacement line (e.g. include -> link macro)
  SubstituteLine(Line<'arena>),
}
368
/// Identifies where the text being parsed came from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceFile {
  // standard input; `cwd` presumably anchors relative path resolution — TODO confirm
  Stdin { cwd: Path },
  // a real file on disk
  Path(Path),
  // an in-memory scratch buffer (displays as "<temp-buffer>")
  Tmp,
}
375
376impl SourceFile {
377  pub fn file_name(&self) -> &str {
378    match self {
379      SourceFile::Stdin { .. } => "<stdin>",
380      SourceFile::Path(path) => path.file_name(),
381      SourceFile::Tmp => "<temp-buffer>",
382    }
383  }
384
385  pub fn matches_xref_target(&self, target: &str) -> bool {
386    let SourceFile::Path(path) = self else {
387      return false;
388    };
389    let filename = path.file_name();
390    if filename == target {
391      return true;
392    }
393    let xref_ext = file::ext(target);
394    let path_ext = file::ext(filename);
395    if xref_ext.is_some() && xref_ext != path_ext {
396      return false;
397    }
398    let fullpath = path.to_string();
399    if fullpath.ends_with(target) {
400      true
401    } else if xref_ext.is_some() {
402      false
403    } else {
404      file::remove_ext(&fullpath).ends_with(target)
405    }
406  }
407}
408
409impl From<Diagnostic> for Vec<Diagnostic> {
410  fn from(diagnostic: Diagnostic) -> Self {
411    vec![diagnostic]
412  }
413}
414
impl Debug for ParseResult<'_> {
  fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
    // NB: the attr_ref_observer field (when the feature is enabled) is
    // deliberately omitted — trait objects aren't Debug here
    f.debug_struct("ParseResult")
      .field("document", &self.document)
      .field("warnings", &self.warnings)
      .finish()
  }
}
423
#[cfg(test)]
mod tests {
  use super::*;
  use test_utils::*;

  // Builds an include resolver that always yields `src`, whatever the target.
  fn resolve(src: &'static str) -> Box<dyn IncludeResolver> {
    #[derive(Clone)]
    struct MockResolver(pub Vec<u8>);
    impl IncludeResolver for MockResolver {
      fn resolve(
        &mut self,
        _: IncludeTarget,
        buffer: &mut dyn IncludeBuffer,
      ) -> std::result::Result<usize, ResolveError> {
        buffer.initialize(self.0.len());
        let bytes = buffer.as_bytes_mut();
        bytes.copy_from_slice(&self.0);
        Ok(self.0.len())
      }
      fn get_base_dir(&self) -> Option<String> {
        Some("/".to_string())
      }
      fn clone_box(&self) -> Box<dyn IncludeResolver> {
        Box::new(self.clone())
      }
    }
    Box::new(MockResolver(Vec::from(src.as_bytes())))
  }

  // Re-joins parsed lines into a newline-separated string for easy asserts.
  fn reassemble(lines: ContiguousLines) -> String {
    lines
      .iter()
      .map(|l| l.reassemble_src())
      .collect::<Vec<_>>()
      .join("\n")
  }

  #[test]
  fn test_attr_ref() {
    let mut parser = test_parser!("hello {foo} world");
    parser
      .document
      .meta
      .insert_doc_attr("foo", "_bar_")
      .unwrap();
    let mut lines = parser.read_lines().unwrap().unwrap();
    let line = lines.consume_current().unwrap();
    let tokens = line.into_iter().collect::<Vec<_>>();
    expect_eq!(
      &tokens,
      &[
        Token::new(TokenKind::Word, loc!(0..5), bstr!("hello")),
        Token::new(TokenKind::Whitespace, loc!(5..6), bstr!(" ")),
        Token::new(TokenKind::AttrRef, loc!(6..11), bstr!("{foo}")),
        // these are inserted as an inline preprocessing step
        // NB: we will use the source loc of the attr ref token to know how
        // to skip over the resolve attribute in no-attr-ref subs contexts
        Token::new(TokenKind::Underscore, loc!(6..11), bstr!("_")),
        Token::new(TokenKind::Word, loc!(6..11), bstr!("bar")),
        Token::new(TokenKind::Underscore, loc!(6..11), bstr!("_")),
        // end inserted.
        Token::new(TokenKind::Whitespace, loc!(11..12), bstr!(" ")),
        Token::new(TokenKind::Word, loc!(12..17), bstr!("world")),
      ]
    );
  }

  #[test]
  fn invalid_directive_line_passed_thru() {
    // `include::invalid []` is not a well-formed directive, so the line
    // should survive untouched
    let input = adoc! {"
      foo
      include::invalid []
      bar
    "};

    let mut parser = test_parser!(input);
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      input.trim_end()
    );
  }

  #[test]
  fn safe_mode_include_to_link() {
    let input = adoc! {"
      foo
      include::include-file.adoc[]
      baz
    "};

    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      adoc! {"
        foo
        link:include-file.adoc[role=include,]
        baz"
      }
    );

    // assert on the tokens and positions
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());

    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      line.consume_current().unwrap(),
      Token::new(TokenKind::Word, loc!(0..3), bstr!("foo"))
    );
    assert!(line.consume_current().is_none());

    // sanity-check the raw input offsets referenced below
    assert_eq!(&input[8..13], "ude::");
    assert_eq!(&input[30..32], "[]");

    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      std::array::from_fn(|_| line.consume_current().unwrap()),
      [
        // we "drop" positions 4-7, the `inc` of `include::`
        // which becomes `••••link:`, keeping rest of token positions
        Token::new(TokenKind::MacroName, loc!(8..13), bstr!("link:")),
        Token::new(TokenKind::Word, loc!(13..20), bstr!("include")),
        Token::new(TokenKind::Dashes, loc!(20..21), bstr!("-")),
        Token::new(TokenKind::Word, loc!(21..25), bstr!("file")),
        Token::new(TokenKind::Dots, loc!(25..26), bstr!(".")),
        Token::new(TokenKind::Word, loc!(26..30), bstr!("adoc")),
        Token::new(TokenKind::OpenBracket, loc!(30..31), bstr!("[")),
        // these tokens are inserted, they have no true source so we
        // represent their position as empty at the insertion point
        Token::new(TokenKind::Word, loc!(31..31), bstr!("role")),
        Token::new(TokenKind::EqualSigns, loc!(31..31), bstr!("=")),
        Token::new(TokenKind::Word, loc!(31..31), bstr!("include")),
        Token::new(TokenKind::Comma, loc!(31..31), bstr!(",")),
        // /end `role=include` inserted tokens
        Token::new(TokenKind::CloseBracket, loc!(31..32), bstr!("]")),
      ]
    );
    assert!(line.consume_current().is_none());
  }

  #[test]
  fn attrs_preserved_when_replacing_include() {
    // existing attrs on the include must survive the link rewrite
    let input = "include::some-file.adoc[leveloffset+=1]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:some-file.adoc[role=include,leveloffset+=1]"
    );
  }

  #[test]
  fn spaces_in_include_file_to_pass_macro_link() {
    // spaces in the target force wrapping in a `pass:c[...]` macro
    let input = "include::foo bar baz.adoc[]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[foo bar baz.adoc][role=include,]"
    );
  }

  #[test]
  fn uri_read_not_allowed_include_non_strict() {
    // non-strict mode replaced with link
    let input = "include::https://my.com/foo bar.adoc[]";
    let mut parser = test_parser!(input);
    let mut settings = JobSettings::r#unsafe();
    settings.strict = false;
    parser.apply_job_settings(settings);
    expect_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[https://my.com/foo bar.adoc][role=include,]",
      from: input
    );
  }
}