1use std::fmt::{Debug, Formatter};
2use std::{cell::RefCell, rc::Rc};
3
4use crate::internal::*;
5
6pub struct Parser<'arena> {
7 pub(super) bump: &'arena Bump,
8 pub(super) lexer: Lexer<'arena>,
9 pub(super) document: Document<'arena>,
10 pub(super) peeked_lines: Option<ContiguousLines<'arena>>,
11 pub(super) peeked_meta: Option<ChunkMeta<'arena>>,
12 pub(super) ctx: ParseContext<'arena>,
13 pub(super) errors: RefCell<Vec<Diagnostic>>,
14 pub(super) strict: bool, pub(super) include_resolver: Option<Box<dyn IncludeResolver>>,
16 #[cfg(feature = "attr_ref_observation")]
17 pub(super) attr_ref_observer: Option<Box<dyn AttrRefObserver>>,
18}
19
/// The final product of a successful parse: the completed document plus any
/// non-fatal diagnostics that were recorded along the way.
pub struct ParseResult<'arena> {
  pub document: Document<'arena>,
  /// Non-fatal diagnostics; a fatal parse instead returns `Err(Vec<Diagnostic>)`.
  pub warnings: Vec<Diagnostic>,
  #[cfg(feature = "attr_ref_observation")]
  /// Observer handed back to the caller after parsing (moved out of the parser).
  pub attr_ref_observer: Option<Box<dyn AttrRefObserver>>,
}
26
27impl<'arena> Parser<'arena> {
28 pub fn new(src: BumpVec<'arena, u8>, file: SourceFile, bump: &'arena Bump) -> Self {
29 Parser::from_lexer(Lexer::new(src, file, bump))
30 }
31
32 pub fn from_str(src: &str, file: SourceFile, bump: &'arena Bump) -> Self {
33 Parser::from_lexer(Lexer::from_str(bump, file, src))
34 }
35
36 fn from_lexer(lexer: Lexer<'arena>) -> Self {
37 let mut parser = Parser {
38 bump: lexer.bump,
39 document: Document::new(lexer.bump),
40 peeked_lines: None,
41 peeked_meta: None,
42 ctx: ParseContext::new(lexer.bump),
43 errors: RefCell::new(Vec::new()),
44 strict: true,
45 include_resolver: None,
46 lexer,
47 #[cfg(feature = "attr_ref_observation")]
48 attr_ref_observer: None,
49 };
50 parser.set_source_file_attrs();
51 parser
52 }
53
54 pub fn apply_job_settings(&mut self, settings: JobSettings) {
55 if let Some(leveloffset) = settings.job_attrs.get("leveloffset") {
56 Parser::adjust_leveloffset(&mut self.ctx.leveloffset, &leveloffset.value);
57 }
58 self.strict = settings.strict;
59 self.ctx.max_include_depth = settings.job_attrs.u16("max-include-depth").unwrap_or(64);
60 self.document.meta = settings.into();
61 self.set_source_file_attrs();
62 }
63
64 pub fn provide_timestamps(
65 &mut self,
66 now: u64,
67 input_modified_time: Option<u64>,
68 reproducible_override: Option<u64>,
69 ) {
70 self.set_datetime_attrs(now, input_modified_time, reproducible_override);
71 }
72
73 pub fn set_resolver(&mut self, resolver: Box<dyn IncludeResolver>) {
74 self.include_resolver = Some(resolver);
75 }
76
77 #[cfg(feature = "attr_ref_observation")]
78 pub fn set_attr_ref_observer(&mut self, observer: Box<dyn AttrRefObserver>) {
79 self.attr_ref_observer = Some(observer);
80 }
81
82 pub fn cell_parser(&mut self, src: BumpVec<'arena, u8>, offset: u32) -> Parser<'arena> {
83 let mut cell_parser = Parser::new(src, self.lexer.source_file().clone(), self.bump);
84 cell_parser.include_resolver = self.include_resolver.as_ref().map(|r| r.clone_box());
85 cell_parser.strict = self.strict;
86 cell_parser.lexer.adjust_offset(offset);
87 cell_parser.ctx = self.ctx.clone_for_cell(self.bump);
88 cell_parser.document.meta = self.document.meta.clone_for_cell();
89 cell_parser.document.anchors = Rc::clone(&self.document.anchors);
90
91 #[cfg(feature = "attr_ref_observation")]
92 {
93 cell_parser.attr_ref_observer = self.attr_ref_observer.take();
94 }
95
96 cell_parser
97 }
98
99 pub(crate) fn loc(&self) -> SourceLocation {
100 self
101 .peeked_lines
102 .as_ref()
103 .and_then(|lines| lines.first_loc())
104 .unwrap_or_else(|| self.lexer.loc())
105 }
106
107 pub(crate) fn read_line(&mut self) -> Result<Option<Line<'arena>>> {
108 Ok(self._read_line(false)?.map(|(line, _)| line))
109 }
110
111 fn _read_line(&mut self, ignored_last: bool) -> Result<Option<(Line<'arena>, bool)>> {
112 assert!(self.peeked_lines.is_none());
113 if self.lexer.is_eof() {
114 return Ok(None);
115 }
116
117 let mut drop_line = false;
118 let mut line = Line::empty(self.bump);
119 while !self.lexer.at_newline() && !self.lexer.is_eof() {
120 let token = self.lexer.next_token();
121 self.push_token_replacing_attr_ref(token, &mut line, &mut drop_line)?;
122 }
123 self.lexer.skip_newline();
124 if drop_line {
125 return self._read_line(false);
126 }
127 if line.starts(TokenKind::Directive) && !self.ctx.within_block_comment() {
128 match self.try_process_directive(&mut line)? {
129 DirectiveAction::Passthrough => Ok(Some((line, ignored_last))),
130 DirectiveAction::SubstituteLine(line) => Ok(Some((line, ignored_last))),
131 DirectiveAction::IgnoreNotIncluded => self._read_line(true),
132 DirectiveAction::ReadNextLine => self._read_line(false),
133 DirectiveAction::SkipLinesUntilEndIf => Ok(
134 self
135 .skip_lines_until_endif(&line)?
136 .map(|l| (l, ignored_last)),
137 ),
138 }
139 } else {
140 Ok(Some((line, ignored_last)))
141 }
142 }
143
144 pub(crate) fn read_lines(&mut self) -> Result<Option<ContiguousLines<'arena>>> {
145 self.ctx.comment_delim_in_lines = false;
146 if let Some(peeked) = self.peeked_lines.take() {
147 return Ok(Some(peeked));
148 }
149 self.lexer.consume_empty_lines();
150 if self.lexer.is_eof() {
151 return Ok(None);
152 }
153 let mut lines = Deq::new(self.bump);
154 while let Some((line, ignored_removed_include_line)) = self._read_line(false)? {
155 if line.is_emptyish() {
156 if lines.is_empty() {
157 continue;
161 } else if !ignored_removed_include_line {
162 break;
165 }
166 }
167 if line.is_delimiter_kind(DelimiterKind::Comment) {
168 self.ctx.comment_delim_in_lines = true;
169 }
170 lines.push(line);
171 if self.lexer.at_newline() {
172 break;
173 }
174 }
175 if lines.is_empty() {
176 Ok(None)
177 } else {
178 Ok(Some(ContiguousLines::new(lines)))
179 }
180 }
181
182 pub(crate) fn read_lines_until(
183 &mut self,
184 delimiter: Delimiter,
185 ) -> Result<Option<ContiguousLines<'arena>>> {
186 let Some(mut lines) = self.read_lines()? else {
187 return Ok(None);
188 };
189 if lines.any(|l| l.is_delimiter(delimiter)) {
190 return Ok(Some(lines));
191 }
192
193 let mut additional_lines = BumpVec::new_in(self.bump);
194 while !self.lexer.is_eof() && !self.at_delimiter(delimiter) {
195 additional_lines.push(self.read_line()?.unwrap());
196 }
197 lines.extend(additional_lines);
198
199 while lines.last().map(|l| l.is_empty()) == Some(true) {
200 lines.pop();
201 }
202 Ok(Some(lines))
203 }
204
205 fn at_delimiter(&self, delimiter: Delimiter) -> bool {
206 match delimiter.kind {
207 DelimiterKind::BlockQuote => self.lexer.at_delimiter_line() == Some((4, b'_')),
208 DelimiterKind::Example => {
209 self.lexer.at_delimiter_line() == Some((delimiter.len as u32, b'='))
210 }
211 DelimiterKind::Open => self.lexer.at_delimiter_line() == Some((2, b'-')),
212 DelimiterKind::Sidebar => self.lexer.at_delimiter_line() == Some((4, b'*')),
213 DelimiterKind::Listing => self.lexer.at_delimiter_line() == Some((4, b'-')),
214 DelimiterKind::Literal => self.lexer.at_delimiter_line() == Some((4, b'.')),
215 DelimiterKind::Passthrough => self.lexer.at_delimiter_line() == Some((4, b'+')),
216 DelimiterKind::Comment => self.lexer.at_delimiter_line() == Some((4, b'/')),
217 }
218 }
219
220 pub(crate) fn restore_lines(&mut self, lines: ContiguousLines<'arena>) {
221 debug_assert!(self.peeked_lines.is_none());
222 if !lines.is_empty() {
223 self.peeked_lines = Some(lines);
224 }
225 }
226
227 pub(crate) fn restore_peeked_meta(&mut self, meta: ChunkMeta<'arena>) {
228 if !meta.is_empty() {
229 debug_assert!(self.peeked_meta.is_none());
230 self.peeked_meta = Some(meta);
231 }
232 }
233
234 pub(crate) fn restore_peeked(&mut self, lines: ContiguousLines<'arena>, meta: ChunkMeta<'arena>) {
235 self.restore_lines(lines);
236 self.restore_peeked_meta(meta);
237 }
238
239 pub fn parse(mut self) -> std::result::Result<ParseResult<'arena>, Vec<Diagnostic>> {
240 self.parse_document_header()?;
241 self.prepare_toc();
242
243 if self.document.meta.get_doctype() == DocType::Inline {
246 if self.peeked_lines.is_none() {
247 self.peeked_lines = self.read_lines().expect("tmp");
249 }
250 self.lexer.truncate();
251 }
252
253 if let Some(book_content) = self.parse_book()? {
254 self.document.content = book_content;
255 } else {
256 let sectioned = self.parse_sectioned()?;
257 self.document.content = sectioned.into_doc_content(self.bump);
258 }
259
260 self.document.meta.clear_doc_attrs();
262
263 self.diagnose_document()?;
264
265 Ok(ParseResult {
266 document: self.document,
267 warnings: self.errors.into_inner(),
268 #[cfg(feature = "attr_ref_observation")]
269 attr_ref_observer: self.attr_ref_observer,
270 })
271 }
272
273 pub(crate) fn parse_sectioned(&mut self) -> Result<Sectioned<'arena>> {
274 let mut blocks = bvec![in self.bump];
275 while let Some(block) = self.parse_block()? {
276 blocks.push(block);
277 }
278 let preamble = if blocks.is_empty() { None } else { Some(blocks) };
279 let mut sections = bvec![in self.bump];
280 while let Some(section) = self.parse_section()? {
281 sections.push(section);
282 }
283 Ok(Sectioned { preamble, sections })
284 }
285
286 pub(crate) fn parse_chunk_meta(
287 &mut self,
288 lines: &mut ContiguousLines<'arena>,
289 ) -> Result<ChunkMeta<'arena>> {
290 if let Some(meta) = self.peeked_meta.take() {
291 return Ok(meta);
292 }
293 assert!(!lines.is_empty());
294 let start_loc = lines.current_token().unwrap().loc;
295 let mut attrs = MultiAttrList::new_in(self.bump);
296 let mut title = None;
297 if !lines.current().unwrap().is_fully_unconsumed() {
298 return Ok(ChunkMeta::new(attrs, title, start_loc));
299 }
300 loop {
301 match lines.current() {
302 Some(line) if line.is_chunk_title() => {
303 let mut line = lines.consume_current().unwrap();
304 line.discard_assert(TokenKind::Dots);
305 title = Some(self.parse_inlines(&mut line.into_lines())?);
306 }
307 Some(line) if line.is_block_attr_list() => {
308 let mut line = lines.consume_current().unwrap();
309 line.discard_assert(TokenKind::OpenBracket);
310 attrs.push(self.parse_block_attr_list(&mut line)?);
311 }
312 Some(line) if line.is_block_anchor() => {
313 let mut line = lines.consume_current().unwrap();
314 let first = line.discard_assert(TokenKind::OpenBracket);
315 line.discard_assert(TokenKind::OpenBracket);
316 let Some(anchor) = self.parse_block_anchor(&mut line)? else {
317 self.err_line_starting("Invalid block anchor", first.loc)?;
318 return Ok(ChunkMeta::new(attrs, title, start_loc));
319 };
320 let mut anchor_attrs = AttrList::new(anchor.loc, self.bump);
321 anchor_attrs.id = Some(anchor.id);
322 anchor_attrs.positional.push(anchor.reftext);
323 attrs.push(anchor_attrs);
324 }
325 Some(line) if line.is_comment() && (!attrs.is_empty() || title.is_some()) => {
327 lines.consume_current();
328 }
329 _ => break,
330 }
331 }
332 Ok(ChunkMeta::new(attrs, title, start_loc))
333 }
334
335 pub(crate) fn string(&self, s: &str) -> BumpString<'arena> {
336 BumpString::from_str_in(s, self.bump)
337 }
338
339 pub fn line_number_with_offset(&self, loc: SourceLocation) -> (u32, u32) {
340 self.lexer.line_number_with_offset(loc)
341 }
342
343 pub fn source_file_at(&self, idx: u16) -> &SourceFile {
344 self.lexer.source_file_at(idx)
345 }
346}
347
/// Abstraction over types that carry a reference to the parse arena,
/// with a convenience constructor for arena-allocated tokens.
pub trait HasArena<'arena> {
  fn bump(&self) -> &'arena Bump;
  /// Builds a token whose lexeme is copied into the arena.
  fn token(&self, kind: TokenKind, lexeme: &str, loc: SourceLocation) -> Token<'arena> {
    Token::new(kind, loc, BumpString::from_str_in(lexeme, self.bump()))
  }
}
354
impl<'arena> HasArena<'arena> for Parser<'arena> {
  // The parser simply exposes its own arena.
  fn bump(&self) -> &'arena Bump {
    self.bump
  }
}
360
/// Outcome of processing a preprocessor directive line (see `_read_line`).
pub enum DirectiveAction<'arena> {
  /// Hand the line through unchanged.
  Passthrough,
  /// Directive fully consumed; continue with the next line.
  ReadNextLine,
  /// Line dropped because its content was not included; continue, flagging
  /// the caller that the previous line was ignored.
  IgnoreNotIncluded,
  /// Skip forward until the matching endif.
  SkipLinesUntilEndIf,
  /// Replace the directive line with the provided line.
  SubstituteLine(Line<'arena>),
}
368
/// Identity of a parsed source: stdin (with its working dir), a file path,
/// or an anonymous temporary buffer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceFile {
  Stdin { cwd: Path },
  Path(Path),
  Tmp,
}
375
376impl SourceFile {
377 pub fn file_name(&self) -> &str {
378 match self {
379 SourceFile::Stdin { .. } => "<stdin>",
380 SourceFile::Path(path) => path.file_name(),
381 SourceFile::Tmp => "<temp-buffer>",
382 }
383 }
384
385 pub fn matches_xref_target(&self, target: &str) -> bool {
386 let SourceFile::Path(path) = self else {
387 return false;
388 };
389 let filename = path.file_name();
390 if filename == target {
391 return true;
392 }
393 let xref_ext = file::ext(target);
394 let path_ext = file::ext(filename);
395 if xref_ext.is_some() && xref_ext != path_ext {
396 return false;
397 }
398 let fullpath = path.to_string();
399 if fullpath.ends_with(target) {
400 true
401 } else if xref_ext.is_some() {
402 false
403 } else {
404 file::remove_ext(&fullpath).ends_with(target)
405 }
406 }
407}
408
409impl From<Diagnostic> for Vec<Diagnostic> {
410 fn from(diagnostic: Diagnostic) -> Self {
411 vec![diagnostic]
412 }
413}
414
impl Debug for ParseResult<'_> {
  fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
    // `attr_ref_observer` is intentionally omitted: trait objects aren't Debug
    f.debug_struct("ParseResult")
      .field("document", &self.document)
      .field("warnings", &self.warnings)
      .finish()
  }
}
423
#[cfg(test)]
mod tests {
  use super::*;
  use test_utils::*;

  /// Builds a resolver that satisfies any include target with `src`.
  /// NOTE(review): not referenced by the tests visible in this chunk —
  /// presumably used by tests elsewhere or kept for convenience.
  fn resolve(src: &'static str) -> Box<dyn IncludeResolver> {
    #[derive(Clone)]
    struct MockResolver(pub Vec<u8>);
    impl IncludeResolver for MockResolver {
      fn resolve(
        &mut self,
        _: IncludeTarget,
        buffer: &mut dyn IncludeBuffer,
      ) -> std::result::Result<usize, ResolveError> {
        buffer.initialize(self.0.len());
        let bytes = buffer.as_bytes_mut();
        bytes.copy_from_slice(&self.0);
        Ok(self.0.len())
      }
      fn get_base_dir(&self) -> Option<String> {
        Some("/".to_string())
      }
      fn clone_box(&self) -> Box<dyn IncludeResolver> {
        Box::new(self.clone())
      }
    }
    Box::new(MockResolver(Vec::from(src.as_bytes())))
  }

  /// Joins a group of lines back into their source text for comparison.
  fn reassemble(lines: ContiguousLines) -> String {
    lines
      .iter()
      .map(|l| l.reassemble_src())
      .collect::<Vec<_>>()
      .join("\n")
  }

  // An attr ref `{foo}` is replaced inline by its value's tokens; the
  // substituted tokens all carry the location of the original `{foo}` span.
  #[test]
  fn test_attr_ref() {
    let mut parser = test_parser!("hello {foo} world");
    parser
      .document
      .meta
      .insert_doc_attr("foo", "_bar_")
      .unwrap();
    let mut lines = parser.read_lines().unwrap().unwrap();
    let line = lines.consume_current().unwrap();
    let tokens = line.into_iter().collect::<Vec<_>>();
    expect_eq!(
      &tokens,
      &[
        Token::new(TokenKind::Word, loc!(0..5), bstr!("hello")),
        Token::new(TokenKind::Whitespace, loc!(5..6), bstr!(" ")),
        Token::new(TokenKind::AttrRef, loc!(6..11), bstr!("{foo}")),
        Token::new(TokenKind::Underscore, loc!(6..11), bstr!("_")),
        Token::new(TokenKind::Word, loc!(6..11), bstr!("bar")),
        Token::new(TokenKind::Underscore, loc!(6..11), bstr!("_")),
        Token::new(TokenKind::Whitespace, loc!(11..12), bstr!(" ")),
        Token::new(TokenKind::Word, loc!(12..17), bstr!("world")),
      ]
    );
  }

  // A malformed include directive (space before `[]`) is not treated as a
  // directive and passes through unchanged.
  #[test]
  fn invalid_directive_line_passed_thru() {
    let input = adoc! {"
      foo
      include::invalid []
      bar
    "};

    let mut parser = test_parser!(input);
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      input.trim_end()
    );
  }

  // In secure mode, includes are rewritten to `link:` macros with an
  // `include` role rather than being resolved.
  #[test]
  fn safe_mode_include_to_link() {
    let input = adoc! {"
      foo
      include::include-file.adoc[]
      baz
    "};

    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      adoc! {"
        foo
        link:include-file.adoc[role=include,]
        baz"
      }
    );

    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());

    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      line.consume_current().unwrap(),
      Token::new(TokenKind::Word, loc!(0..3), bstr!("foo"))
    );
    assert!(line.consume_current().is_none());

    // sanity-check the raw offsets the rewritten tokens should map back to
    assert_eq!(&input[8..13], "ude::");
    assert_eq!(&input[30..32], "[]");

    // synthesized tokens (role=include,) carry zero-width locs at offset 31
    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      std::array::from_fn(|_| line.consume_current().unwrap()),
      [
        Token::new(TokenKind::MacroName, loc!(8..13), bstr!("link:")),
        Token::new(TokenKind::Word, loc!(13..20), bstr!("include")),
        Token::new(TokenKind::Dashes, loc!(20..21), bstr!("-")),
        Token::new(TokenKind::Word, loc!(21..25), bstr!("file")),
        Token::new(TokenKind::Dots, loc!(25..26), bstr!(".")),
        Token::new(TokenKind::Word, loc!(26..30), bstr!("adoc")),
        Token::new(TokenKind::OpenBracket, loc!(30..31), bstr!("[")),
        Token::new(TokenKind::Word, loc!(31..31), bstr!("role")),
        Token::new(TokenKind::EqualSigns, loc!(31..31), bstr!("=")),
        Token::new(TokenKind::Word, loc!(31..31), bstr!("include")),
        Token::new(TokenKind::Comma, loc!(31..31), bstr!(",")),
        Token::new(TokenKind::CloseBracket, loc!(30..32), bstr!("]")),
      ]
    );
    assert!(line.consume_current().is_none());
  }

  // Existing attrs on the include line survive the link rewrite.
  #[test]
  fn attrs_preserved_when_replacing_include() {
    let input = "include::some-file.adoc[leveloffset+=1]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:some-file.adoc[role=include,leveloffset+=1]"
    );
  }

  // Targets containing spaces get wrapped in a `pass:c[...]` macro.
  #[test]
  fn spaces_in_include_file_to_pass_macro_link() {
    let input = "include::foo bar baz.adoc[]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[foo bar baz.adoc][role=include,]"
    );
  }

  // With no resolver and strict mode off, a URI include degrades to a link.
  #[test]
  fn uri_read_not_allowed_include_non_strict() {
    let input = "include::https://my.com/foo bar.adoc[]";
    let mut parser = test_parser!(input);
    let mut settings = JobSettings::r#unsafe();
    settings.strict = false;
    parser.apply_job_settings(settings);
    expect_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[https://my.com/foo bar.adoc][role=include,]",
      from: input
    );
  }
}