1use std::fmt::Debug;
2use std::{cell::RefCell, rc::Rc};
3
4use crate::internal::*;
5
6pub struct Parser<'arena> {
7 pub(super) bump: &'arena Bump,
8 pub(super) lexer: Lexer<'arena>,
9 pub(super) document: Document<'arena>,
10 pub(super) peeked_lines: Option<ContiguousLines<'arena>>,
11 pub(super) peeked_meta: Option<ChunkMeta<'arena>>,
12 pub(super) ctx: ParseContext<'arena>,
13 pub(super) errors: RefCell<Vec<Diagnostic>>,
14 pub(super) strict: bool, pub(super) attr_locs: Vec<(SourceLocation, bool)>,
16 pub(super) include_resolver: Option<Box<dyn IncludeResolver>>,
17 #[cfg(feature = "attr_ref_observation")]
18 pub(super) attr_ref_observer: Option<Box<dyn AttrRefObserver>>,
19}
20
21impl<'arena> Parser<'arena> {
22 pub fn new(src: BumpVec<'arena, u8>, file: SourceFile, bump: &'arena Bump) -> Self {
23 Parser::from_lexer(Lexer::new(src, file, bump))
24 }
25
26 pub fn from_str(src: &str, file: SourceFile, bump: &'arena Bump) -> Self {
27 Parser::from_lexer(Lexer::from_str(bump, file, src))
28 }
29
30 fn from_lexer(lexer: Lexer<'arena>) -> Self {
31 let mut parser = Parser {
32 bump: lexer.bump,
33 document: Document::new(lexer.bump),
34 peeked_lines: None,
35 peeked_meta: None,
36 ctx: ParseContext::new(lexer.bump),
37 errors: RefCell::new(Vec::new()),
38 strict: true,
39 include_resolver: None,
40 lexer,
41 attr_locs: Vec::with_capacity(16),
42 #[cfg(feature = "attr_ref_observation")]
43 attr_ref_observer: None,
44 };
45 parser.set_source_file_attrs();
46 parser
47 }
48
49 pub fn apply_job_settings(&mut self, settings: JobSettings) {
50 if let Some(leveloffset) = settings.job_attrs.get("leveloffset") {
51 Parser::adjust_leveloffset(&mut self.ctx.leveloffset, &leveloffset.value);
52 }
53 self.strict = settings.strict;
54 self.ctx.max_include_depth = settings.job_attrs.u16("max-include-depth").unwrap_or(64);
55 self.document.meta = settings.into();
56 self.set_source_file_attrs();
57 }
58
59 pub fn provide_timestamps(
60 &mut self,
61 now: u64,
62 input_modified_time: Option<u64>,
63 reproducible_override: Option<u64>,
64 ) {
65 self.set_datetime_attrs(now, input_modified_time, reproducible_override);
66 }
67
68 pub fn set_resolver(&mut self, resolver: Box<dyn IncludeResolver>) {
69 self.include_resolver = Some(resolver);
70 }
71
72 #[cfg(feature = "attr_ref_observation")]
73 pub fn set_attr_ref_observer(&mut self, observer: Box<dyn AttrRefObserver>) {
74 self.attr_ref_observer = Some(observer);
75 }
76
77 pub fn cell_parser(&mut self, src: BumpVec<'arena, u8>, offset: u32) -> Parser<'arena> {
78 let mut cell_parser = Parser::new(src, self.lexer.source_file().clone(), self.bump);
79 cell_parser.include_resolver = self.include_resolver.as_ref().map(|r| r.clone_box());
80 cell_parser.strict = self.strict;
81 cell_parser.lexer.adjust_offset(offset);
82 cell_parser.ctx = self.ctx.clone_for_cell(self.bump);
83 cell_parser.document.meta = self.document.meta.clone_for_cell();
84 cell_parser.document.anchors = Rc::clone(&self.document.anchors);
85
86 #[cfg(feature = "attr_ref_observation")]
87 {
88 cell_parser.attr_ref_observer = self.attr_ref_observer.take();
89 }
90
91 cell_parser
92 }
93
94 pub(crate) fn loc(&self) -> SourceLocation {
95 self
96 .peeked_lines
97 .as_ref()
98 .and_then(|lines| lines.first_loc())
99 .unwrap_or_else(|| self.lexer.loc())
100 }
101
102 pub(crate) fn read_line(&mut self) -> Result<Option<Line<'arena>>> {
103 Ok(self._read_line(false)?.map(|(line, _)| line))
104 }
105
106 fn _read_line(&mut self, ignored_last: bool) -> Result<Option<(Line<'arena>, bool)>> {
107 assert!(self.peeked_lines.is_none());
108 if self.lexer.is_eof() {
109 return Ok(None);
110 }
111
112 let mut drop_line = false;
113 let mut line = Line::empty(self.bump);
114 while !self.lexer.at_newline() && !self.lexer.is_eof() {
115 let token = self.lexer.next_token();
116 self.push_token_replacing_attr_ref(token, &mut line, &mut drop_line)?;
117 }
118 self.lexer.skip_newline();
119 if drop_line {
120 return self._read_line(false);
121 }
122 if line.starts(TokenKind::Directive) && !self.ctx.within_block_comment() {
123 match self.try_process_directive(&mut line)? {
124 DirectiveAction::Passthrough => Ok(Some((line, ignored_last))),
125 DirectiveAction::SubstituteLine(line) => Ok(Some((line, ignored_last))),
126 DirectiveAction::IgnoreNotIncluded => self._read_line(true),
127 DirectiveAction::ReadNextLine => self._read_line(false),
128 DirectiveAction::SkipLinesUntilEndIf => Ok(
129 self
130 .skip_lines_until_endif(&line)?
131 .map(|l| (l, ignored_last)),
132 ),
133 }
134 } else {
135 Ok(Some((line, ignored_last)))
136 }
137 }
138
139 pub(crate) fn read_lines(&mut self) -> Result<Option<ContiguousLines<'arena>>> {
140 self.ctx.comment_delim_in_lines = false;
141 if let Some(peeked) = self.peeked_lines.take() {
142 return Ok(Some(peeked));
143 }
144 self.lexer.consume_empty_lines();
145 if self.lexer.is_eof() {
146 return Ok(None);
147 }
148 let mut lines = Deq::new(self.bump);
149 while let Some((line, ignored_removed_include_line)) = self._read_line(false)? {
150 if line.is_emptyish() {
151 if lines.is_empty() {
152 continue;
156 } else if !ignored_removed_include_line {
157 break;
160 }
161 }
162 if line.is_delimiter_kind(DelimiterKind::Comment) {
163 self.ctx.comment_delim_in_lines = true;
164 }
165 lines.push(line);
166 if self.lexer.at_newline() {
167 break;
168 }
169 }
170 if lines.is_empty() {
171 Ok(None)
172 } else {
173 Ok(Some(ContiguousLines::new(lines)))
174 }
175 }
176
177 pub(crate) fn read_lines_until(
178 &mut self,
179 delimiter: Delimiter,
180 ) -> Result<Option<ContiguousLines<'arena>>> {
181 let Some(mut lines) = self.read_lines()? else {
182 return Ok(None);
183 };
184 if lines.any(|l| l.is_delimiter(delimiter)) {
185 return Ok(Some(lines));
186 }
187
188 let mut additional_lines = BumpVec::new_in(self.bump);
189 while !self.lexer.is_eof() && !self.at_delimiter(delimiter) {
190 additional_lines.push(self.read_line()?.unwrap());
191 }
192 lines.extend(additional_lines);
193
194 while lines.last().map(|l| l.is_empty()) == Some(true) {
195 lines.pop();
196 }
197 Ok(Some(lines))
198 }
199
200 fn at_delimiter(&self, delimiter: Delimiter) -> bool {
201 match delimiter.kind {
202 DelimiterKind::BlockQuote => self.lexer.at_delimiter_line() == Some((4, b'_')),
203 DelimiterKind::Example => {
204 self.lexer.at_delimiter_line() == Some((delimiter.len as u32, b'='))
205 }
206 DelimiterKind::Open => self.lexer.at_delimiter_line() == Some((2, b'-')),
207 DelimiterKind::Sidebar => self.lexer.at_delimiter_line() == Some((4, b'*')),
208 DelimiterKind::Listing => self.lexer.at_delimiter_line() == Some((4, b'-')),
209 DelimiterKind::Literal => self.lexer.at_delimiter_line() == Some((4, b'.')),
210 DelimiterKind::Passthrough => self.lexer.at_delimiter_line() == Some((4, b'+')),
211 DelimiterKind::Comment => self.lexer.at_delimiter_line() == Some((4, b'/')),
212 }
213 }
214
215 pub(crate) fn restore_lines(&mut self, lines: ContiguousLines<'arena>) {
216 debug_assert!(self.peeked_lines.is_none());
217 if !lines.is_empty() {
218 self.peeked_lines = Some(lines);
219 }
220 }
221
222 pub(crate) fn restore_peeked_meta(&mut self, meta: ChunkMeta<'arena>) {
223 if !meta.is_empty() {
224 debug_assert!(self.peeked_meta.is_none());
225 self.peeked_meta = Some(meta);
226 }
227 }
228
229 pub(crate) fn restore_peeked(&mut self, lines: ContiguousLines<'arena>, meta: ChunkMeta<'arena>) {
230 self.restore_lines(lines);
231 self.restore_peeked_meta(meta);
232 }
233
234 pub fn parse(mut self) -> std::result::Result<ParseResult<'arena>, Vec<Diagnostic>> {
235 self.parse_document_header()?;
236 self.prepare_toc();
237
238 if self.document.meta.get_doctype() == DocType::Inline {
241 if self.peeked_lines.is_none() {
242 self.peeked_lines = self.read_lines().expect("tmp");
244 }
245 self.lexer.truncate();
246 }
247
248 if let Some(book_content) = self.parse_book()? {
249 self.document.content = book_content;
250 } else {
251 let sectioned = self.parse_sectioned()?;
252 self.document.content = sectioned.into_doc_content(self.bump);
253 }
254
255 self.document.meta.clear_doc_attrs();
257 self.diagnose_document()?;
258 Ok(self.into())
259 }
260
261 pub(crate) fn parse_sectioned(&mut self) -> Result<Sectioned<'arena>> {
262 let mut blocks = bvec![in self.bump];
263 while let Some(block) = self.parse_block()? {
264 blocks.push(block);
265 }
266 let preamble = if blocks.is_empty() { None } else { Some(blocks) };
267 let mut sections = bvec![in self.bump];
268 while let Some(section) = self.parse_section()? {
269 sections.push(section);
270 }
271 Ok(Sectioned { preamble, sections })
272 }
273
274 pub(crate) fn parse_chunk_meta(
275 &mut self,
276 lines: &mut ContiguousLines<'arena>,
277 ) -> Result<ChunkMeta<'arena>> {
278 if let Some(meta) = self.peeked_meta.take() {
279 return Ok(meta);
280 }
281 assert!(!lines.is_empty());
282 let start_loc = lines.current_token().unwrap().loc;
283 let mut attrs = MultiAttrList::new_in(self.bump);
284 let mut title = None;
285 if !lines.current().unwrap().is_fully_unconsumed() {
286 return Ok(ChunkMeta::new(attrs, title, start_loc));
287 }
288 loop {
289 match lines.current() {
290 Some(line) if line.is_chunk_title() => {
291 let mut line = lines.consume_current().unwrap();
292 line.discard_assert(TokenKind::Dots);
293 title = Some(self.parse_inlines(&mut line.into_lines())?);
294 }
295 Some(line) if line.is_block_attr_list() => {
296 let mut line = lines.consume_current().unwrap();
297 line.discard_assert(TokenKind::OpenBracket);
298 attrs.push(self.parse_block_attr_list(&mut line)?);
299 }
300 Some(line) if line.is_block_anchor() => {
301 let mut line = lines.consume_current().unwrap();
302 let first = line.discard_assert(TokenKind::OpenBracket);
303 line.discard_assert(TokenKind::OpenBracket);
304 let Some(anchor) = self.parse_block_anchor(&mut line)? else {
305 self.err_line_starting("Invalid block anchor", first.loc)?;
306 return Ok(ChunkMeta::new(attrs, title, start_loc));
307 };
308 let mut anchor_attrs = AttrList::new(anchor.loc, self.bump);
309 anchor_attrs.id = Some(anchor.id);
310 anchor_attrs.positional.push(anchor.reftext);
311 attrs.push(anchor_attrs);
312 }
313 Some(line) if line.is_comment() && (!attrs.is_empty() || title.is_some()) => {
315 lines.consume_current();
316 }
317 _ => break,
318 }
319 }
320 Ok(ChunkMeta::new(attrs, title, start_loc))
321 }
322
323 pub(crate) fn string(&self, s: &str) -> BumpString<'arena> {
324 BumpString::from_str_in(s, self.bump)
325 }
326}
327
328pub trait HasArena<'arena> {
329 fn bump(&self) -> &'arena Bump;
330 fn token(&self, kind: TokenKind, lexeme: &str, loc: SourceLocation) -> Token<'arena> {
331 Token::new(kind, loc, BumpString::from_str_in(lexeme, self.bump()))
332 }
333}
334
335impl<'arena> HasArena<'arena> for Parser<'arena> {
336 fn bump(&self) -> &'arena Bump {
337 self.bump
338 }
339}
340
341pub enum DirectiveAction<'arena> {
342 Passthrough,
343 ReadNextLine,
344 IgnoreNotIncluded,
345 SkipLinesUntilEndIf,
346 SubstituteLine(Line<'arena>),
347}
348
349#[derive(Debug, Clone, PartialEq, Eq)]
350pub enum SourceFile {
351 Stdin { cwd: Path },
352 Path(Path),
353 Tmp,
354}
355
356impl SourceFile {
357 pub fn file_name(&self) -> &str {
358 match self {
359 SourceFile::Stdin { .. } => "<stdin>",
360 SourceFile::Path(path) => path.file_name(),
361 SourceFile::Tmp => "<temp-buffer>",
362 }
363 }
364
365 pub fn matches_xref_target(&self, target: &str) -> bool {
366 let SourceFile::Path(path) = self else {
367 return false;
368 };
369 let filename = path.file_name();
370 if filename == target {
371 return true;
372 }
373 let xref_ext = file::ext(target);
374 let path_ext = file::ext(filename);
375 if xref_ext.is_some() && xref_ext != path_ext {
376 return false;
377 }
378 let fullpath = path.to_string();
379 if fullpath.ends_with(target) {
380 true
381 } else if xref_ext.is_some() {
382 false
383 } else {
384 file::remove_ext(&fullpath).ends_with(target)
385 }
386 }
387}
388
389impl From<Diagnostic> for Vec<Diagnostic> {
390 fn from(diagnostic: Diagnostic) -> Self {
391 vec![diagnostic]
392 }
393}
394
395#[cfg(test)]
396mod tests {
397 use super::*;
398 use test_utils::*;
399
400 fn resolve(src: &'static str) -> Box<dyn IncludeResolver> {
401 #[derive(Clone)]
402 struct MockResolver(pub Vec<u8>);
403 impl IncludeResolver for MockResolver {
404 fn resolve(
405 &mut self,
406 _: IncludeTarget,
407 buffer: &mut dyn IncludeBuffer,
408 ) -> std::result::Result<usize, ResolveError> {
409 buffer.initialize(self.0.len());
410 let bytes = buffer.as_bytes_mut();
411 bytes.copy_from_slice(&self.0);
412 Ok(self.0.len())
413 }
414 fn get_base_dir(&self) -> Option<String> {
415 Some("/".to_string())
416 }
417 fn clone_box(&self) -> Box<dyn IncludeResolver> {
418 Box::new(self.clone())
419 }
420 }
421 Box::new(MockResolver(Vec::from(src.as_bytes())))
422 }
423
424 fn reassemble(lines: ContiguousLines) -> String {
425 lines
426 .iter()
427 .map(|l| l.reassemble_src())
428 .collect::<Vec<_>>()
429 .join("\n")
430 }
431
432 #[test]
433 fn test_attr_ref() {
434 let mut parser = test_parser!("hello {foo} world");
435 parser
436 .document
437 .meta
438 .insert_doc_attr("foo", "_bar_")
439 .unwrap();
440 let mut lines = parser.read_lines().unwrap().unwrap();
441 let line = lines.consume_current().unwrap();
442 let tokens = line.into_iter().collect::<Vec<_>>();
443 expect_eq!(
444 &tokens,
445 &[
446 Token::new(TokenKind::Word, loc!(0..5), bstr!("hello")),
447 Token::new(TokenKind::Whitespace, loc!(5..6), bstr!(" ")),
448 Token::new(TokenKind::AttrRef, loc!(6..11), bstr!("{foo}")),
449 Token::new(TokenKind::Underscore, loc!(6..11), bstr!("_")),
453 Token::new(TokenKind::Word, loc!(6..11), bstr!("bar")),
454 Token::new(TokenKind::Underscore, loc!(6..11), bstr!("_")),
455 Token::new(TokenKind::Whitespace, loc!(11..12), bstr!(" ")),
457 Token::new(TokenKind::Word, loc!(12..17), bstr!("world")),
458 ]
459 );
460 }
461
462 #[test]
463 fn invalid_directive_line_passed_thru() {
464 let input = adoc! {"
465 foo
466 include::invalid []
467 bar
468 "};
469
470 let mut parser = test_parser!(input);
471 assert_eq!(
472 reassemble(parser.read_lines().unwrap().unwrap()),
473 input.trim_end()
474 );
475 }
476
477 #[test]
478 fn safe_mode_include_to_link() {
479 let input = adoc! {"
480 foo
481 include::include-file.adoc[]
482 baz
483 "};
484
485 let mut parser = test_parser!(input);
486 parser.apply_job_settings(JobSettings::secure());
487 assert_eq!(
488 reassemble(parser.read_lines().unwrap().unwrap()),
489 adoc! {"
490 foo
491 link:include-file.adoc[role=include,]
492 baz"
493 }
494 );
495
496 let mut parser = test_parser!(input);
498 parser.apply_job_settings(JobSettings::secure());
499
500 let mut line = parser.read_line().unwrap().unwrap();
501 expect_eq!(
502 line.consume_current().unwrap(),
503 Token::new(TokenKind::Word, loc!(0..3), bstr!("foo"))
504 );
505 assert!(line.consume_current().is_none());
506
507 assert_eq!(&input[8..13], "ude::");
508 assert_eq!(&input[30..32], "[]");
509
510 let mut line = parser.read_line().unwrap().unwrap();
511 expect_eq!(
512 std::array::from_fn(|_| line.consume_current().unwrap()),
513 [
514 Token::new(TokenKind::MacroName, loc!(8..13), bstr!("link:")),
517 Token::new(TokenKind::Word, loc!(13..20), bstr!("include")),
518 Token::new(TokenKind::Dashes, loc!(20..21), bstr!("-")),
519 Token::new(TokenKind::Word, loc!(21..25), bstr!("file")),
520 Token::new(TokenKind::Dots, loc!(25..26), bstr!(".")),
521 Token::new(TokenKind::Word, loc!(26..30), bstr!("adoc")),
522 Token::new(TokenKind::OpenBracket, loc!(30..31), bstr!("[")),
523 Token::new(TokenKind::Word, loc!(31..31), bstr!("role")),
526 Token::new(TokenKind::EqualSigns, loc!(31..31), bstr!("=")),
527 Token::new(TokenKind::Word, loc!(31..31), bstr!("include")),
528 Token::new(TokenKind::Comma, loc!(31..31), bstr!(",")),
529 Token::new(TokenKind::CloseBracket, loc!(31..32), bstr!("]")),
531 ]
532 );
533 assert!(line.consume_current().is_none());
534 }
535
536 #[test]
537 fn attrs_preserved_when_replacing_include() {
538 let input = "include::some-file.adoc[leveloffset+=1]";
539 let mut parser = test_parser!(input);
540 parser.apply_job_settings(JobSettings::secure());
541 assert_eq!(
542 parser.read_line().unwrap().unwrap().reassemble_src(),
543 "link:some-file.adoc[role=include,leveloffset+=1]"
544 );
545 }
546
547 #[test]
548 fn spaces_in_include_file_to_pass_macro_link() {
549 let input = "include::foo bar baz.adoc[]";
550 let mut parser = test_parser!(input);
551 parser.apply_job_settings(JobSettings::secure());
552 assert_eq!(
553 parser.read_line().unwrap().unwrap().reassemble_src(),
554 "link:pass:c[foo bar baz.adoc][role=include,]"
555 );
556 }
557
558 #[test]
559 fn uri_read_not_allowed_include_non_strict() {
560 let input = "include::https://my.com/foo bar.adoc[]";
562 let mut parser = test_parser!(input);
563 let mut settings = JobSettings::r#unsafe();
564 settings.strict = false;
565 parser.apply_job_settings(settings);
566 expect_eq!(
567 parser.read_line().unwrap().unwrap().reassemble_src(),
568 "link:pass:c[https://my.com/foo bar.adoc][role=include,]",
569 from: input
570 );
571 }
572}