asciidork_parser/
parser.rs

1use std::{cell::RefCell, rc::Rc};
2
3use crate::internal::*;
4
/// Recursive-descent AsciiDoc parser. Owns the lexer, the document being
/// built, and all transient parse state; allocations live in the `'arena` bump.
pub struct Parser<'arena> {
  // arena allocator shared with the lexer and document
  pub(super) bump: &'arena Bump,
  pub(super) lexer: Lexer<'arena>,
  // the document under construction; returned from `parse()`
  pub(super) document: Document<'arena>,
  // one-slot pushback buffer for lines read ahead and restored
  pub(super) peeked_lines: Option<ContiguousLines<'arena>>,
  // one-slot pushback buffer for chunk metadata parsed ahead
  pub(super) peeked_meta: Option<ChunkMeta<'arena>>,
  pub(super) ctx: ParseContext<'arena>,
  // interior mutability so diagnostics can be pushed from `&self` methods
  pub(super) errors: RefCell<Vec<Diagnostic>>,
  pub(super) strict: bool, // todo: naming...
  // pluggable resolver for `include::` directives; `None` disables resolution
  pub(super) include_resolver: Option<Box<dyn IncludeResolver>>,
}
16
/// Successful output of `Parser::parse`: the finished document plus any
/// non-fatal diagnostics gathered along the way.
pub struct ParseResult<'arena> {
  pub document: Document<'arena>,
  // NOTE(review): `parse()` currently always returns this empty — confirm
  // whether warnings are meant to be drained from `Parser::errors`
  pub warnings: Vec<Diagnostic>,
}
21
impl<'arena> Parser<'arena> {
  /// Creates a parser over an arena-allocated byte buffer.
  pub fn new(src: BumpVec<'arena, u8>, file: SourceFile, bump: &'arena Bump) -> Self {
    Parser::from_lexer(Lexer::new(src, file, bump))
  }

  /// Creates a parser by copying `src` into the arena.
  pub fn from_str(src: &str, file: SourceFile, bump: &'arena Bump) -> Self {
    Parser::from_lexer(Lexer::from_str(bump, file, src))
  }

  /// Shared constructor: wires parser state around an existing lexer.
  fn from_lexer(lexer: Lexer<'arena>) -> Self {
    let mut parser = Parser {
      bump: lexer.bump,
      document: Document::new(lexer.bump),
      peeked_lines: None,
      peeked_meta: None,
      ctx: ParseContext::new(lexer.bump),
      errors: RefCell::new(Vec::new()),
      strict: true, // strict by default; `apply_job_settings` may relax this
      include_resolver: None,
      // moved last so the fields above can read `lexer.bump` first
      lexer,
    };
    parser.set_source_file_attrs();
    parser
  }

  /// Applies job-level settings: leveloffset, strictness, include depth,
  /// and the job attributes (which become the document meta).
  pub fn apply_job_settings(&mut self, settings: JobSettings) {
    if let Some(leveloffset) = settings.job_attrs.get("leveloffset") {
      Parser::adjust_leveloffset(&mut self.ctx.leveloffset, &leveloffset.value);
    }
    self.strict = settings.strict;
    // 64 is the fallback when `max-include-depth` is unset or not a valid u16
    self.ctx.max_include_depth = settings.job_attrs.u16("max-include-depth").unwrap_or(64);
    self.document.meta = settings.into();
    self.set_source_file_attrs();
  }

  /// Seeds the datetime-related document attributes.
  /// All times are unix epoch seconds; `reproducible_override` pins the
  /// timestamps for reproducible builds.
  pub fn provide_timestamps(
    &mut self,
    now: u64,
    input_modified_time: Option<u64>,
    reproducible_override: Option<u64>,
  ) {
    self.set_datetime_attrs(now, input_modified_time, reproducible_override);
  }

  /// Installs the resolver used to load `include::` targets.
  pub fn set_resolver(&mut self, resolver: Box<dyn IncludeResolver>) {
    self.include_resolver = Some(resolver);
  }

  /// Creates a child parser for a table cell. The child shares the arena and
  /// anchors, inherits strictness and a cell-scoped copy of context/meta, and
  /// shifts its source offsets by `offset` so locations stay document-relative.
  pub fn cell_parser(&mut self, src: BumpVec<'arena, u8>, offset: u32) -> Parser<'arena> {
    let mut cell_parser = Parser::new(src, self.lexer.source_file().clone(), self.bump);
    cell_parser.strict = self.strict;
    cell_parser.lexer.adjust_offset(offset);
    cell_parser.ctx = self.ctx.clone_for_cell(self.bump);
    cell_parser.document.meta = self.document.meta.clone_for_cell();
    // anchors are shared (Rc) so cell xrefs resolve against the whole doc
    cell_parser.document.anchors = Rc::clone(&self.document.anchors);
    cell_parser
  }

  /// Current source location: the start of any peeked lines if present,
  /// otherwise the lexer's position.
  pub(crate) fn loc(&self) -> SourceLocation {
    self
      .peeked_lines
      .as_ref()
      .and_then(|lines| lines.first_loc())
      .unwrap_or_else(|| self.lexer.loc())
  }

  /// Reads the next logical line, resolving attribute references and
  /// preprocessor directives. Returns `Ok(None)` at end of input.
  /// Must not be called while lines are peeked (see assert).
  pub(crate) fn read_line(&mut self) -> Result<Option<Line<'arena>>> {
    assert!(self.peeked_lines.is_none());
    if self.lexer.is_eof() {
      return Ok(None);
    }

    let mut drop_line = false;
    let mut line = Line::empty(self.bump);
    while !self.lexer.at_newline() && !self.lexer.is_eof() {
      let token = self.lexer.next_token();
      // may expand attr refs in place, and may flag the whole line for dropping
      self.push_token_replacing_attr_ref(token, &mut line, &mut drop_line)?;
    }
    self.lexer.skip_newline();
    if drop_line {
      // recurse: the dropped line is discarded entirely
      return self.read_line();
    }
    if line.starts(TokenKind::Directive) {
      match self.try_process_directive(&mut line)? {
        // directive was invalid/inapplicable: hand the raw line through
        DirectiveAction::Passthrough => Ok(Some(line)),
        // directive rewrote the line (e.g. include -> link in safe mode)
        DirectiveAction::SubstituteLine(line) => Ok(Some(line)),
        // directive consumed the line (e.g. include spliced into the lexer)
        DirectiveAction::ReadNextLine => self.read_line(),
        // inside a false ifdef/ifndef: skip to the matching endif
        DirectiveAction::SkipLinesUntilEndIf => self.skip_lines_until_endif(&line),
      }
    } else {
      Ok(Some(line))
    }
  }

  /// Reads the next paragraph-like run of contiguous non-empty lines,
  /// honoring any previously restored (peeked) lines first.
  pub(crate) fn read_lines(&mut self) -> Result<Option<ContiguousLines<'arena>>> {
    if let Some(peeked) = self.peeked_lines.take() {
      return Ok(Some(peeked));
    }
    self.lexer.consume_empty_lines();
    if self.lexer.is_eof() {
      return Ok(None);
    }
    let mut lines = Deq::new(self.bump);
    while let Some(line) = self.read_line()? {
      if line.is_emptyish() {
        if lines.is_empty() {
          // this case can happen if our first non-empty line was an include directive
          // that then resolved to an initial empty line, otherwise consume_empty_lines
          // would have skipped over it, so we keep going
          continue;
        } else {
          // this case happens only when we DROP a line
          break;
        }
      }
      lines.push(line);
      if self.lexer.at_newline() {
        // blank line ends the contiguous run
        break;
      }
    }
    if lines.is_empty() {
      Ok(None)
    } else {
      Ok(Some(ContiguousLines::new(lines)))
    }
  }

  /// Like `read_lines`, but keeps reading across blank lines until the given
  /// block `delimiter` is reached (or EOF), for delimited-block bodies.
  pub(crate) fn read_lines_until(
    &mut self,
    delimiter: Delimiter,
  ) -> Result<Option<ContiguousLines<'arena>>> {
    let Some(mut lines) = self.read_lines()? else {
      return Ok(None);
    };
    // fast path: the closing delimiter is already inside this run
    if lines.any(|l| l.is_delimiter(delimiter)) {
      return Ok(Some(lines));
    }

    let mut additional_lines = BumpVec::new_in(self.bump);
    while !self.lexer.is_eof() && !self.at_delimiter(delimiter) {
      // NOTE(review): read_line can return Ok(None) despite !is_eof here
      // (e.g. a trailing dropped line) — confirm this unwrap cannot panic
      additional_lines.push(self.read_line()?.unwrap());
    }
    lines.extend(additional_lines);
    Ok(Some(lines))
  }

  /// True if the lexer is sitting on the closing line for `delimiter`,
  /// matched as a (length, byte) pair, e.g. `____` for block quotes.
  fn at_delimiter(&self, delimiter: Delimiter) -> bool {
    match delimiter.kind {
      DelimiterKind::BlockQuote => self.lexer.at_delimiter_line() == Some((4, b'_')),
      DelimiterKind::Example => {
        // example delimiters may be longer than 4; match the opening length
        self.lexer.at_delimiter_line() == Some((delimiter.len as u32, b'='))
      }
      DelimiterKind::Open => self.lexer.at_delimiter_line() == Some((2, b'-')),
      DelimiterKind::Sidebar => self.lexer.at_delimiter_line() == Some((4, b'*')),
      DelimiterKind::Listing => self.lexer.at_delimiter_line() == Some((4, b'-')),
      DelimiterKind::Literal => self.lexer.at_delimiter_line() == Some((4, b'.')),
      DelimiterKind::Passthrough => self.lexer.at_delimiter_line() == Some((4, b'+')),
      DelimiterKind::Comment => self.lexer.at_delimiter_line() == Some((4, b'/')),
    }
  }

  /// Pushes unconsumed lines back so the next `read_lines` returns them.
  /// The pushback slot must be empty; empty input is silently ignored.
  pub(crate) fn restore_lines(&mut self, lines: ContiguousLines<'arena>) {
    debug_assert!(self.peeked_lines.is_none());
    if !lines.is_empty() {
      self.peeked_lines = Some(lines);
    }
  }

  /// Pushes parsed-but-unused chunk meta back for the next `parse_chunk_meta`.
  pub(crate) fn restore_peeked_meta(&mut self, meta: ChunkMeta<'arena>) {
    if !meta.is_empty() {
      debug_assert!(self.peeked_meta.is_none());
      self.peeked_meta = Some(meta);
    }
  }

  /// Convenience: restore both lines and meta in one call.
  pub(crate) fn restore_peeked(&mut self, lines: ContiguousLines<'arena>, meta: ChunkMeta<'arena>) {
    self.restore_lines(lines);
    self.restore_peeked_meta(meta);
  }

  /// Consumes the parser and produces the parsed document, or all collected
  /// diagnostics on failure.
  pub fn parse(mut self) -> std::result::Result<ParseResult<'arena>, Vec<Diagnostic>> {
    self.parse_document_header()?;
    self.prepare_toc();

    // ensure we only read a single "paragraph" for `inline` doc_type
    // https://docs.asciidoctor.org/asciidoc/latest/document/doctype/#inline-doctype-rules
    if self.document.meta.get_doctype() == DocType::Inline {
      if self.peeked_lines.is_none() {
        // tmp:
        self.peeked_lines = self.read_lines().expect("tmp");
      }
      // discard all remaining input beyond the first paragraph
      self.lexer.truncate();
    }

    while let Some(chunk) = self.parse_chunk()? {
      match chunk {
        Chunk::Block(block) => self.document.content.push_block(block, self.bump),
        Chunk::Section(section) => self.document.content.push_section(section, self.bump),
      }
    }

    // clear the doc attrs so the backend can see them replayed in decl order
    self.document.meta.clear_doc_attrs();

    self.diagnose_document()?;

    Ok(ParseResult {
      document: self.document,
      // NOTE(review): warnings are never populated — intended?
      warnings: vec![],
    })
  }

  /// Parses the next top-level chunk: a section if one starts here,
  /// otherwise a block; `None` at end of input.
  fn parse_chunk(&mut self) -> Result<Option<Chunk<'arena>>> {
    match self.parse_section()? {
      Some(section) => Ok(Some(Chunk::Section(section))),
      None => Ok(self.parse_block()?.map(Chunk::Block)),
    }
  }

  /// Parses leading chunk metadata (`.title`, `[attr]` lists, `[[anchors]]`)
  /// off the front of `lines`, returning previously restored meta if any.
  pub(crate) fn parse_chunk_meta(
    &mut self,
    lines: &mut ContiguousLines<'arena>,
  ) -> Result<ChunkMeta<'arena>> {
    if let Some(meta) = self.peeked_meta.take() {
      return Ok(meta);
    }
    assert!(!lines.is_empty());
    let start_loc = lines.current_token().unwrap().loc;
    let mut attrs = MultiAttrList::new_in(self.bump);
    let mut title = None;
    // a partially consumed first line can't begin chunk meta
    if !lines.current().unwrap().is_fully_unconsumed() {
      return Ok(ChunkMeta::new(attrs, title, start_loc));
    }
    loop {
      match lines.current() {
        // `.Block title` line
        Some(line) if line.is_chunk_title() => {
          let mut line = lines.consume_current().unwrap();
          line.discard_assert(TokenKind::Dots);
          title = Some(self.parse_inlines(&mut line.into_lines())?);
        }
        // `[positional,named=value]` attribute list line
        Some(line) if line.is_block_attr_list() => {
          let mut line = lines.consume_current().unwrap();
          line.discard_assert(TokenKind::OpenBracket);
          attrs.push(self.parse_block_attr_list(&mut line)?);
        }
        // `[[id,reftext]]` block anchor line, folded into an attr list
        Some(line) if line.is_block_anchor() => {
          let mut line = lines.consume_current().unwrap();
          line.discard_assert(TokenKind::OpenBracket);
          line.discard_assert(TokenKind::OpenBracket);
          let anchor = self.parse_block_anchor(&mut line)?.unwrap();
          let mut anchor_attrs = AttrList::new(anchor.loc, self.bump);
          anchor_attrs.id = Some(anchor.id);
          anchor_attrs.positional.push(anchor.reftext);
          attrs.push(anchor_attrs);
        }
        // consume trailing comment lines for valid meta
        Some(line) if line.is_comment() && (!attrs.is_empty() || title.is_some()) => {
          lines.consume_current();
        }
        _ => break,
      }
    }
    Ok(ChunkMeta::new(attrs, title, start_loc))
  }

  /// Copies `s` into the arena.
  pub(crate) fn string(&self, s: &str) -> BumpString<'arena> {
    BumpString::from_str_in(s, self.bump)
  }
}
291
292pub trait HasArena<'arena> {
293  fn bump(&self) -> &'arena Bump;
294  fn token(&self, kind: TokenKind, lexeme: &str, loc: SourceLocation) -> Token<'arena> {
295    Token::new(kind, loc, BumpString::from_str_in(lexeme, self.bump()))
296  }
297}
298
// The parser hands out its own arena.
impl<'arena> HasArena<'arena> for Parser<'arena> {
  fn bump(&self) -> &'arena Bump {
    self.bump
  }
}
304
/// A top-level unit of document content produced by `Parser::parse_chunk`.
#[derive(Debug)]
pub enum Chunk<'arena> {
  Block(Block<'arena>),
  Section(Section<'arena>),
}
310
/// How `read_line` should proceed after a preprocessor directive is handled.
pub enum DirectiveAction<'arena> {
  // keep the raw directive line as ordinary content
  Passthrough,
  // the directive was consumed; read the following line instead
  ReadNextLine,
  // conditional evaluated false; skip until the matching endif
  SkipLinesUntilEndIf,
  // the directive was rewritten into this replacement line
  SubstituteLine(Line<'arena>),
}
317
/// Identifies where the source text being parsed came from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceFile {
  // stdin has no path of its own, but includes resolve relative to `cwd`
  Stdin { cwd: Path },
  Path(Path),
  // scratch buffer, e.g. for tests or nested parses
  Tmp,
}
324
325impl SourceFile {
326  pub fn file_name(&self) -> &str {
327    match self {
328      SourceFile::Stdin { .. } => "<stdin>",
329      SourceFile::Path(path) => path.file_name(),
330      SourceFile::Tmp => "<temp-buffer>",
331    }
332  }
333
334  pub fn matches_xref_target(&self, target: &str) -> bool {
335    let SourceFile::Path(path) = self else {
336      return false;
337    };
338    let filename = path.file_name();
339    if filename == target {
340      return true;
341    }
342    let xref_ext = file::ext(target);
343    let path_ext = file::ext(filename);
344    if xref_ext.is_some() && xref_ext != path_ext {
345      return false;
346    }
347    let fullpath = path.to_string();
348    if fullpath.ends_with(target) {
349      true
350    } else if xref_ext.is_some() {
351      false
352    } else {
353      file::remove_ext(&fullpath).ends_with(target)
354    }
355  }
356}
357
358impl From<Diagnostic> for Vec<Diagnostic> {
359  fn from(diagnostic: Diagnostic) -> Self {
360    vec![diagnostic]
361  }
362}
363
#[cfg(test)]
mod tests {
  use super::*;
  use test_utils::*;

  /// Builds a resolver that yields `src` for any include target.
  // NOTE(review): not referenced by the tests visible in this chunk;
  // presumably used by tests elsewhere — confirm before removing
  fn resolve(src: &'static str) -> Box<dyn IncludeResolver> {
    struct MockResolver(pub Vec<u8>);
    impl IncludeResolver for MockResolver {
      fn resolve(
        &mut self,
        _: IncludeTarget,
        buffer: &mut dyn IncludeBuffer,
      ) -> std::result::Result<usize, ResolveError> {
        buffer.initialize(self.0.len());
        let bytes = buffer.as_bytes_mut();
        bytes.copy_from_slice(&self.0);
        Ok(self.0.len())
      }
      fn get_base_dir(&self) -> Option<String> {
        Some("/".to_string())
      }
    }
    Box::new(MockResolver(Vec::from(src.as_bytes())))
  }

  /// Rejoins parsed lines back into source text for round-trip assertions.
  fn reassemble(lines: ContiguousLines) -> String {
    lines
      .iter()
      .map(|l| l.reassemble_src())
      .collect::<Vec<_>>()
      .join("\n")
  }

  // attr refs are expanded inline, with the replacement tokens carrying
  // the source location of the `{ref}` token they replaced
  #[test]
  fn test_attr_ref() {
    let mut parser = test_parser!("hello {foo} world");
    parser
      .document
      .meta
      .insert_doc_attr("foo", "_bar_")
      .unwrap();
    let mut lines = parser.read_lines().unwrap().unwrap();
    let line = lines.consume_current().unwrap();
    let tokens = line.into_iter().collect::<Vec<_>>();
    expect_eq!(
      &tokens,
      &[
        Token::new(TokenKind::Word, 0..5, bstr!("hello")),
        Token::new(TokenKind::Whitespace, 5..6, bstr!(" ")),
        Token::new(TokenKind::AttrRef, 6..11, bstr!("{foo}")),
        // these are inserted as an inline preprocessing step
        // NB: we will use the source loc of the attr ref token to know how
        // to skip over the resolve attribute in no-attr-ref subs contexts
        Token::new(TokenKind::Underscore, 6..11, bstr!("_")),
        Token::new(TokenKind::Word, 6..11, bstr!("bar")),
        Token::new(TokenKind::Underscore, 6..11, bstr!("_")),
        // end inserted.
        Token::new(TokenKind::Whitespace, 11..12, bstr!(" ")),
        Token::new(TokenKind::Word, 12..17, bstr!("world")),
      ]
    );
  }

  // a malformed include directive (space before `[]`) is not a directive;
  // it must survive untouched in the output
  #[test]
  fn invalid_directive_line_passed_thru() {
    let input = adoc! {"
      foo
      include::invalid []
      bar
    "};

    let mut parser = test_parser!(input);
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      input.trim_end()
    );
  }

  // in secure mode, includes are rewritten into links instead of resolved
  #[test]
  fn safe_mode_include_to_link() {
    let input = adoc! {"
      foo
      include::include-file.adoc[]
      baz
    "};

    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      adoc! {"
        foo
        link:include-file.adoc[role=include,]
        baz"
      }
    );

    // assert on the tokens and positions
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());

    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      line.consume_current().unwrap(),
      Token::new(TokenKind::Word, 0..3, bstr!("foo"))
    );
    assert!(line.consume_current().is_none());

    // sanity-check the raw input offsets referenced below
    assert_eq!(&input[8..13], "ude::");
    assert_eq!(&input[30..32], "[]");

    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      std::array::from_fn(|_| line.consume_current().unwrap()),
      [
        // we "drop" positions 4-7, the `inc` of `include::`
        // which becomes `••••link:`, keeping rest of token positions
        Token::new(TokenKind::MacroName, 8..13, bstr!("link:")),
        Token::new(TokenKind::Word, 13..20, bstr!("include")),
        Token::new(TokenKind::Dashes, 20..21, bstr!("-")),
        Token::new(TokenKind::Word, 21..25, bstr!("file")),
        Token::new(TokenKind::Dots, 25..26, bstr!(".")),
        Token::new(TokenKind::Word, 26..30, bstr!("adoc")),
        Token::new(TokenKind::OpenBracket, 30..31, bstr!("[")),
        // these tokens are inserted, they have no true source so we
        // represent their position as empty at the insertion point
        Token::new(TokenKind::Word, 31..31, bstr!("role")),
        Token::new(TokenKind::EqualSigns, 31..31, bstr!("=")),
        Token::new(TokenKind::Word, 31..31, bstr!("include")),
        Token::new(TokenKind::Comma, 31..31, bstr!(",")),
        // /end `role=include` inserted tokens
        Token::new(TokenKind::CloseBracket, 31..32, bstr!("]")),
      ]
    );
    assert!(line.consume_current().is_none());
  }

  // existing attrs on the include line survive the include -> link rewrite
  #[test]
  fn attrs_preserved_when_replacing_include() {
    let input = "include::some-file.adoc[leveloffset+=1]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:some-file.adoc[role=include,leveloffset+=1]"
    );
  }

  // targets containing spaces get wrapped in a pass macro so the link parses
  #[test]
  fn spaces_in_include_file_to_pass_macro_link() {
    let input = "include::foo bar baz.adoc[]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[foo bar baz.adoc][role=include,]"
    );
  }

  // URI includes that can't be read degrade to links when not strict
  #[test]
  fn uri_read_not_allowed_include_non_strict() {
    // non-strict mode replaced with link
    let input = "include::https://my.com/foo bar.adoc[]";
    let mut parser = test_parser!(input);
    let mut settings = JobSettings::r#unsafe();
    settings.strict = false;
    parser.apply_job_settings(settings);
    expect_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[https://my.com/foo bar.adoc][role=include,]",
      from: input
    );
  }
}