1use std::fmt::Debug;
2use std::{cell::RefCell, rc::Rc};
3
4use crate::internal::*;
5
6pub struct Parser<'arena> {
7 pub(super) bump: &'arena Bump,
8 pub(super) lexer: Lexer<'arena>,
9 pub(super) document: Document<'arena>,
10 pub(super) peeked_lines: Option<ContiguousLines<'arena>>,
11 pub(super) peeked_meta: Option<ChunkMeta<'arena>>,
12 pub(super) ctx: ParseContext<'arena>,
13 pub(super) errors: RefCell<Vec<Diagnostic>>,
14 pub(super) strict: bool, pub(super) include_resolver: Option<Box<dyn IncludeResolver>>,
16 #[cfg(feature = "attr_ref_observation")]
17 pub(super) attr_ref_observer: Option<Box<dyn AttrRefObserver>>,
18}
19
20impl<'arena> Parser<'arena> {
21 pub fn new(src: BumpVec<'arena, u8>, file: SourceFile, bump: &'arena Bump) -> Self {
22 Parser::from_lexer(Lexer::new(src, file, bump))
23 }
24
25 pub fn from_str(src: &str, file: SourceFile, bump: &'arena Bump) -> Self {
26 Parser::from_lexer(Lexer::from_str(bump, file, src))
27 }
28
29 fn from_lexer(lexer: Lexer<'arena>) -> Self {
30 let mut parser = Parser {
31 bump: lexer.bump,
32 document: Document::new(lexer.bump),
33 peeked_lines: None,
34 peeked_meta: None,
35 ctx: ParseContext::new(lexer.bump),
36 errors: RefCell::new(Vec::new()),
37 strict: true,
38 include_resolver: None,
39 lexer,
40 #[cfg(feature = "attr_ref_observation")]
41 attr_ref_observer: None,
42 };
43 parser.set_source_file_attrs();
44 parser
45 }
46
47 pub fn apply_job_settings(&mut self, settings: JobSettings) {
48 if let Some(leveloffset) = settings.job_attrs.get("leveloffset") {
49 Parser::adjust_leveloffset(&mut self.ctx.leveloffset, &leveloffset.value);
50 }
51 self.strict = settings.strict;
52 self.ctx.max_include_depth = settings.job_attrs.u16("max-include-depth").unwrap_or(64);
53 self.document.meta = settings.into();
54 self.set_source_file_attrs();
55 }
56
57 pub fn register_plugin_macros(&mut self, names: &[impl AsRef<str>]) {
58 self.lexer.register_plugin_macros(names);
59 }
60
61 pub fn provide_timestamps(
62 &mut self,
63 now: u64,
64 input_modified_time: Option<u64>,
65 reproducible_override: Option<u64>,
66 ) {
67 self.set_datetime_attrs(now, input_modified_time, reproducible_override);
68 }
69
70 pub fn set_resolver(&mut self, resolver: Box<dyn IncludeResolver>) {
71 self.include_resolver = Some(resolver);
72 }
73
74 #[cfg(feature = "attr_ref_observation")]
75 pub fn set_attr_ref_observer(&mut self, observer: Box<dyn AttrRefObserver>) {
76 self.attr_ref_observer = Some(observer);
77 }
78
79 pub fn cell_parser(&mut self, src: BumpVec<'arena, u8>, offset: u32) -> Parser<'arena> {
80 let mut cell_parser = Parser::new(src, self.lexer.source_file().clone(), self.bump);
81 cell_parser.include_resolver = self.include_resolver.as_ref().map(|r| r.clone_box());
82 cell_parser.strict = self.strict;
83 cell_parser.lexer.adjust_offset(offset);
84 cell_parser.ctx = self.ctx.clone_for_cell(self.bump);
85 cell_parser.document.meta = self.document.meta.clone_for_cell();
86 cell_parser.document.anchors = Rc::clone(&self.document.anchors);
87
88 #[cfg(feature = "attr_ref_observation")]
89 {
90 cell_parser.attr_ref_observer = self.attr_ref_observer.take();
91 }
92
93 cell_parser
94 }
95
96 pub(crate) fn loc(&self) -> SourceLocation {
97 self
98 .peeked_lines
99 .as_ref()
100 .and_then(|lines| lines.first_loc())
101 .unwrap_or_else(|| self.lexer.loc())
102 }
103
104 pub(crate) fn read_line(&mut self) -> Result<Option<Line<'arena>>> {
105 Ok(self._read_line(false)?.map(|(line, _)| line))
106 }
107
108 fn _read_line(&mut self, ignored_last: bool) -> Result<Option<(Line<'arena>, bool)>> {
109 assert!(self.peeked_lines.is_none());
110 if self.lexer.is_eof() {
111 return Ok(None);
112 }
113
114 use TokenKind::*;
115 let mut drop_line = false;
116 let mut line = Line::empty(self.bump);
117 while !self.lexer.at_newline() && !self.lexer.is_eof() {
118 let mut token = self.lexer.next_token();
119 if line.is_empty() {
120 if self.ctx.in_header
123 && !matches!(
124 token.kind,
125 Colon | EqualSigns | Word | ForwardSlashes | Directive | OpenBracket
126 )
127 {
128 self.ctx.in_header = false;
129 } else if token.kind == Colon && self.ctx.subs.attr_refs() {
130 self.try_parse_attr_def(&mut token)?;
131 }
132 }
133 self.push_token_replacing_attr_ref(token, &mut line, &mut drop_line)?;
134 }
135 self.lexer.skip_newline();
136 if drop_line {
137 return self._read_line(false);
138 }
139 if line.starts(TokenKind::Directive) && !self.ctx.within_block_comment() {
140 match self.try_process_directive(&mut line)? {
141 DirectiveAction::Passthrough => Ok(Some((line, ignored_last))),
142 DirectiveAction::SubstituteLine(line) => Ok(Some((line, ignored_last))),
143 DirectiveAction::IgnoreNotIncluded => self._read_line(true),
144 DirectiveAction::ReadNextLine => self._read_line(false),
145 DirectiveAction::SkipLinesUntilEndIf => Ok(
146 self
147 .skip_lines_until_endif(&line)?
148 .map(|l| (l, ignored_last)),
149 ),
150 }
151 } else {
152 Ok(Some((line, ignored_last)))
153 }
154 }
155
156 pub(crate) fn read_lines(&mut self) -> Result<Option<ContiguousLines<'arena>>> {
157 self.ctx.comment_delim_in_lines = false;
158 if let Some(peeked) = self.peeked_lines.take() {
159 return Ok(Some(peeked));
160 }
161 self.lexer.consume_empty_lines();
162 if self.lexer.is_eof() {
163 return Ok(None);
164 }
165 let mut lines = Deq::new(self.bump);
166 while let Some((line, ignored_removed_include_line)) = self._read_line(false)? {
167 if line.is_emptyish() {
168 if lines.is_empty() {
169 continue;
173 } else if !ignored_removed_include_line {
174 break;
177 }
178 }
179 if line.is_delimiter_kind(DelimiterKind::Comment) {
180 self.ctx.comment_delim_in_lines = true;
181 }
182 lines.push(line);
183 if self.lexer.at_newline() {
184 break;
185 }
186 }
187 if lines.is_empty() {
188 Ok(None)
189 } else {
190 Ok(Some(ContiguousLines::new(lines)))
191 }
192 }
193
194 pub(crate) fn read_lines_until(
195 &mut self,
196 delimiter: Delimiter,
197 ) -> Result<Option<ContiguousLines<'arena>>> {
198 let Some(mut lines) = self.read_lines()? else {
199 return Ok(None);
200 };
201 if lines.any(|l| l.is_delimiter(delimiter)) {
202 return Ok(Some(lines));
203 }
204
205 let mut additional_lines = BumpVec::new_in(self.bump);
206 while !self.lexer.is_eof() && !self.at_delimiter(delimiter) {
207 additional_lines.push(self.read_line()?.unwrap());
208 }
209 lines.extend(additional_lines);
210
211 while lines.last().map(|l| l.is_empty()) == Some(true) {
212 lines.pop();
213 }
214 Ok(Some(lines))
215 }
216
217 fn at_delimiter(&self, delimiter: Delimiter) -> bool {
218 match delimiter.kind {
219 DelimiterKind::BlockQuote => self.lexer.at_delimiter_line() == Some((4, b'_')),
220 DelimiterKind::Example => {
221 self.lexer.at_delimiter_line() == Some((delimiter.len as u32, b'='))
222 }
223 DelimiterKind::Open => self.lexer.at_delimiter_line() == Some((2, b'-')),
224 DelimiterKind::Sidebar => self.lexer.at_delimiter_line() == Some((4, b'*')),
225 DelimiterKind::Listing => {
226 if delimiter.len == 3 {
227 self.lexer.at_delimiter_line() == Some((3, b'`'))
228 } else {
229 self.lexer.at_delimiter_line() == Some((4, b'-'))
230 }
231 }
232 DelimiterKind::Literal => self.lexer.at_delimiter_line() == Some((4, b'.')),
233 DelimiterKind::Passthrough => self.lexer.at_delimiter_line() == Some((4, b'+')),
234 DelimiterKind::Comment => self.lexer.at_delimiter_line() == Some((4, b'/')),
235 }
236 }
237
238 pub(crate) fn restore_lines(&mut self, lines: ContiguousLines<'arena>) {
239 debug_assert!(self.peeked_lines.is_none());
240 if !lines.is_empty() {
241 self.peeked_lines = Some(lines);
242 }
243 }
244
245 pub(crate) fn restore_peeked_meta(&mut self, meta: ChunkMeta<'arena>) {
246 if !meta.is_empty() {
247 debug_assert!(self.peeked_meta.is_none());
248 self.peeked_meta = Some(meta);
249 }
250 }
251
252 pub(crate) fn restore_peeked(&mut self, lines: ContiguousLines<'arena>, meta: ChunkMeta<'arena>) {
253 self.restore_lines(lines);
254 self.restore_peeked_meta(meta);
255 }
256
257 pub fn parse(mut self) -> std::result::Result<ParseResult<'arena>, Vec<Diagnostic>> {
258 self.lockdown_secure_mode();
259 self.parse_document_header()?;
260 self.prepare_toc();
261
262 if self.document.meta.get_doctype() == DocType::Inline {
265 if self.peeked_lines.is_none() {
266 self.peeked_lines = self.read_lines().expect("tmp");
268 }
269 self.lexer.truncate();
270 }
271
272 if let Some(book_content) = self.parse_book()? {
273 self.document.content = book_content;
274 } else {
275 let sectioned = self.parse_sectioned()?;
276 self.document.content = sectioned.into_doc_content(self.bump);
277 }
278
279 self.resolve_docinfo();
280
281 self.document.meta.clear_doc_attrs();
283 self.diagnose_document()?;
284 Ok(self.into())
285 }
286
287 pub(crate) fn parse_sectioned(&mut self) -> Result<Sectioned<'arena>> {
288 let mut blocks = bvec![in self.bump];
289 while let Some(block) = self.parse_block()? {
290 blocks.push(block);
291 }
292 let preamble = if blocks.is_empty() { None } else { Some(blocks) };
293 let mut sections = bvec![in self.bump];
294 while let Some(section) = self.parse_section()? {
295 sections.push(section);
296 }
297 Ok(Sectioned { preamble, sections })
298 }
299
300 pub(crate) fn parse_chunk_meta(
301 &mut self,
302 lines: &mut ContiguousLines<'arena>,
303 ) -> Result<ChunkMeta<'arena>> {
304 if let Some(meta) = self.peeked_meta.take() {
305 return Ok(meta);
306 }
307 assert!(!lines.is_empty());
308 let start_loc = lines.current_token().unwrap().loc;
309 let mut attrs = MultiAttrList::new_in(self.bump);
310 let mut title = None;
311 if !lines.current().unwrap().is_fully_unconsumed() {
312 return Ok(ChunkMeta::new(attrs, title, start_loc));
313 }
314 loop {
315 match lines.current() {
316 Some(line) if line.is_chunk_title() => {
317 let mut line = lines.consume_current().unwrap();
318 line.discard_assert(TokenKind::Dots);
319 title = Some(self.parse_inlines(&mut line.into_lines())?);
320 }
321 Some(line) if line.is_block_attr_list() => {
322 let mut line = lines.consume_current().unwrap();
323 line.discard_assert(TokenKind::OpenBracket);
324 attrs.push(self.parse_block_attr_list(&mut line)?);
325 }
326 Some(line) if line.is_block_anchor() => {
327 let mut line = lines.consume_current().unwrap();
328 let first = line.discard_assert(TokenKind::OpenBracket);
329 line.discard_assert(TokenKind::OpenBracket);
330 let Some(anchor) = self.parse_block_anchor(&mut line)? else {
331 self.err_line_starting("Invalid block anchor", first.loc)?;
332 return Ok(ChunkMeta::new(attrs, title, start_loc));
333 };
334 let mut anchor_attrs = AttrList::new(anchor.loc, self.bump);
335 anchor_attrs.id = Some(anchor.id);
336 anchor_attrs.positional.push(anchor.reftext);
337 attrs.push(anchor_attrs);
338 }
339 Some(line) if line.is_comment() && (!attrs.is_empty() || title.is_some()) => {
341 lines.consume_current();
342 }
343 _ => break,
344 }
345 }
346 Ok(ChunkMeta::new(attrs, title, start_loc))
347 }
348
349 pub(crate) fn string(&self, s: &str) -> BumpString<'arena> {
350 BumpString::from_str_in(s, self.bump)
351 }
352
353 fn lockdown_secure_mode(&mut self) {
354 let meta = &mut self.document.meta;
355 if meta.safe_mode == SafeMode::Secure {
356 _ = meta.insert_job_attr("data-uri", JobAttr::readonly(false));
357 if meta.is_unset("max-attribute-value-size") {
358 _ = meta.insert_job_attr("max-attribute-value-size", JobAttr::readonly("4096"));
359 }
360 if meta.is_unset("linkcss") {
361 _ = meta.insert_job_attr("linkcss", JobAttr::readonly(""));
362 }
363 if meta.is_unset("icons") {
364 _ = meta.insert_job_attr("icons", JobAttr::readonly(false));
365 }
366 }
367 }
368}
369
370pub trait HasArena<'arena> {
371 fn bump(&self) -> &'arena Bump;
372 fn token(&self, kind: TokenKind, lexeme: &str, loc: SourceLocation) -> Token<'arena> {
373 Token::new(kind, loc, BumpString::from_str_in(lexeme, self.bump()))
374 }
375}
376
377impl<'arena> HasArena<'arena> for Parser<'arena> {
378 fn bump(&self) -> &'arena Bump {
379 self.bump
380 }
381}
382
383pub enum DirectiveAction<'arena> {
384 Passthrough,
385 ReadNextLine,
386 IgnoreNotIncluded,
387 SkipLinesUntilEndIf,
388 SubstituteLine(Line<'arena>),
389}
390
391#[derive(Debug, Clone, PartialEq, Eq)]
392pub enum SourceFile {
393 Stdin { cwd: Path },
394 Path(Path),
395 Tmp,
396}
397
398impl SourceFile {
399 pub fn file_name(&self) -> &str {
400 match self {
401 SourceFile::Stdin { .. } => "<stdin>",
402 SourceFile::Path(path) => path.file_name(),
403 SourceFile::Tmp => "<temp-buffer>",
404 }
405 }
406
407 pub fn matches_xref_target(&self, target: &str) -> bool {
408 let SourceFile::Path(path) = self else {
409 return false;
410 };
411 let filename = path.file_name();
412 if filename == target {
413 return true;
414 }
415 let xref_ext = file::ext(target);
416 let path_ext = file::ext(filename);
417 if xref_ext.is_some() && xref_ext != path_ext {
418 return false;
419 }
420 let fullpath = path.to_string();
421 if fullpath.ends_with(target) {
422 true
423 } else if xref_ext.is_some() {
424 false
425 } else {
426 file::remove_ext(&fullpath).ends_with(target)
427 }
428 }
429}
430
431impl From<Diagnostic> for Vec<Diagnostic> {
432 fn from(diagnostic: Diagnostic) -> Self {
433 vec![diagnostic]
434 }
435}
436
#[cfg(test)]
mod tests {
    use super::*;
    use test_utils::*;

    /// Builds a boxed include resolver that always yields the bytes of `src`.
    fn resolve(src: &'static str) -> Box<dyn IncludeResolver> {
        #[derive(Clone)]
        struct MockResolver(pub Vec<u8>);

        impl IncludeResolver for MockResolver {
            fn resolve(
                &mut self,
                _: IncludeTarget,
                buffer: &mut dyn IncludeBuffer,
                _: SafeMode,
            ) -> std::result::Result<usize, ResolveError> {
                let len = self.0.len();
                buffer.initialize(len);
                buffer.as_bytes_mut().copy_from_slice(&self.0);
                Ok(len)
            }

            fn get_base_dir(&self) -> Option<String> {
                Some(String::from("/"))
            }

            fn clone_box(&self) -> Box<dyn IncludeResolver> {
                Box::new(self.clone())
            }
        }

        Box::new(MockResolver(src.as_bytes().to_vec()))
    }

    /// Joins the reassembled source of every line with `\n`.
    fn reassemble(lines: ContiguousLines) -> String {
        let sources: Vec<_> = lines.iter().map(|line| line.reassemble_src()).collect();
        sources.join("\n")
    }

    #[test]
    fn test_attr_ref() {
        let mut parser = test_parser!("hello {foo} world");
        let inserted = parser.document.meta.insert_doc_attr("foo", "_bar_");
        inserted.unwrap();

        let mut lines = parser.read_lines().unwrap().unwrap();
        let first_line = lines.consume_current().unwrap();
        let actual = first_line.into_iter().collect::<Vec<_>>();

        // The replacement tokens all carry the location of the `{foo}` reference.
        expect_eq!(
            &actual,
            &[
                Token::new(TokenKind::Word, loc!(0..5), bstr!("hello")),
                Token::new(TokenKind::Whitespace, loc!(5..6), bstr!(" ")),
                Token::new(TokenKind::AttrRef, loc!(6..11), bstr!("{foo}")),
                Token {
                    kind: TokenKind::Underscore,
                    loc: loc!(6..11),
                    lexeme: bstr!("_"),
                    attr_replacement: true
                },
                Token {
                    kind: TokenKind::Word,
                    loc: loc!(6..11),
                    lexeme: bstr!("bar"),
                    attr_replacement: true
                },
                Token {
                    kind: TokenKind::Underscore,
                    loc: loc!(6..11),
                    lexeme: bstr!("_"),
                    attr_replacement: true
                },
                Token::new(TokenKind::Whitespace, loc!(11..12), bstr!(" ")),
                Token::new(TokenKind::Word, loc!(12..17), bstr!("world")),
            ]
        );
    }

    #[test]
    fn invalid_directive_line_passed_thru() {
        let input = adoc! {"
            foo
            include::invalid []
            bar
        "};

        let mut parser = test_parser!(input);
        let lines = parser.read_lines().unwrap().unwrap();
        assert_eq!(reassemble(lines), input.trim_end());
    }

    #[test]
    fn safe_mode_include_to_link() {
        let input = adoc! {"
            foo
            include::include-file.adoc[]
            baz
        "};

        // Whole-chunk view: the include line is rewritten to a link macro.
        let mut parser = test_parser!(input);
        parser.apply_job_settings(JobSettings::secure());
        let lines = parser.read_lines().unwrap().unwrap();
        assert_eq!(
            reassemble(lines),
            adoc! {"
                foo
                link:include-file.adoc[role=include,]
                baz"
            }
        );

        // Token-level view of the same input, read line by line.
        let mut parser = test_parser!(input);
        parser.apply_job_settings(JobSettings::secure());

        let mut first_line = parser.read_line().unwrap().unwrap();
        expect_eq!(
            first_line.consume_current().unwrap(),
            Token::new(TokenKind::Word, loc!(0..3), bstr!("foo"))
        );
        assert!(first_line.consume_current().is_none());

        // Sanity-check the byte offsets the expected locations below rely on.
        assert_eq!(&input[8..13], "ude::");
        assert_eq!(&input[30..32], "[]");

        let mut second_line = parser.read_line().unwrap().unwrap();
        expect_eq!(
            std::array::from_fn(|_| second_line.consume_current().unwrap()),
            [
                Token::new(TokenKind::MacroName, loc!(8..13), bstr!("link:")),
                Token::new(TokenKind::Word, loc!(13..20), bstr!("include")),
                Token::new(TokenKind::Dashes, loc!(20..21), bstr!("-")),
                Token::new(TokenKind::Word, loc!(21..25), bstr!("file")),
                Token::new(TokenKind::Dots, loc!(25..26), bstr!(".")),
                Token::new(TokenKind::Word, loc!(26..30), bstr!("adoc")),
                Token::new(TokenKind::OpenBracket, loc!(30..31), bstr!("[")),
                Token::new(TokenKind::Word, loc!(31..31), bstr!("role")),
                Token::new(TokenKind::EqualSigns, loc!(31..31), bstr!("=")),
                Token::new(TokenKind::Word, loc!(31..31), bstr!("include")),
                Token::new(TokenKind::Comma, loc!(31..31), bstr!(",")),
                Token::new(TokenKind::CloseBracket, loc!(31..32), bstr!("]")),
            ]
        );
        assert!(second_line.consume_current().is_none());
    }

    #[test]
    fn attrs_preserved_when_replacing_include() {
        let input = "include::some-file.adoc[leveloffset+=1]";
        let mut parser = test_parser!(input);
        parser.apply_job_settings(JobSettings::secure());

        let line = parser.read_line().unwrap().unwrap();
        assert_eq!(
            line.reassemble_src(),
            "link:some-file.adoc[role=include,leveloffset+=1]"
        );
    }

    #[test]
    fn spaces_in_include_file_to_pass_macro_link() {
        let input = "include::foo bar baz.adoc[]";
        let mut parser = test_parser!(input);
        parser.apply_job_settings(JobSettings::secure());

        let line = parser.read_line().unwrap().unwrap();
        assert_eq!(
            line.reassemble_src(),
            "link:pass:c[foo bar baz.adoc][role=include,]"
        );
    }

    #[test]
    fn uri_read_not_allowed_include_non_strict() {
        let input = "include::https://my.com/foo bar.adoc[]";
        let mut parser = test_parser!(input);

        let mut settings = JobSettings::r#unsafe();
        settings.strict = false;
        parser.apply_job_settings(settings);

        let line = parser.read_line().unwrap().unwrap();
        expect_eq!(
            line.reassemble_src(),
            "link:pass:c[https://my.com/foo bar.adoc][role=include,]",
            from: input
        );
    }
}