1use std::{cell::RefCell, rc::Rc};
2
3use crate::internal::*;
4
/// Parser state for one source document; all arena-backed data lives for `'arena`.
pub struct Parser<'arena> {
  /// Bump allocator, taken from the lexer the parser was built from.
  pub(super) bump: &'arena Bump,
  /// Token source that `read_line` pulls from.
  pub(super) lexer: Lexer<'arena>,
  /// Document being built; moved into the `ParseResult` returned by `parse`.
  pub(super) document: Document<'arena>,
  /// Lines handed back through `restore_lines`; returned by the next `read_lines` call.
  pub(super) peeked_lines: Option<ContiguousLines<'arena>>,
  /// Chunk metadata handed back through `restore_peeked_meta`; returned by the next
  /// `parse_chunk_meta` call.
  pub(super) peeked_meta: Option<ChunkMeta<'arena>>,
  /// Mutable parse context (e.g. `leveloffset`, `max_include_depth`, `comment_delim_in_lines`).
  pub(super) ctx: ParseContext<'arena>,
  /// Collected diagnostics; `parse` returns them as `ParseResult::warnings`.
  pub(super) errors: RefCell<Vec<Diagnostic>>,
  /// Starts as `true`; overwritten from `JobSettings::strict` in `apply_job_settings`.
  pub(super) strict: bool,
  /// Optional include resolver installed with `set_resolver`.
  pub(super) include_resolver: Option<Box<dyn IncludeResolver>>,
  /// Optional observer installed with `set_attr_ref_observer` (feature-gated).
  #[cfg(feature = "attr_ref_observation")]
  pub(super) attr_ref_observer: Option<Box<dyn AttrRefObserver>>,
}
18
/// Successful outcome of `Parser::parse`.
#[derive(Debug)]
pub struct ParseResult<'arena> {
  /// The fully parsed document.
  pub document: Document<'arena>,
  /// Diagnostics that were collected by the parser without aborting the parse.
  pub warnings: Vec<Diagnostic>,
}
24
25impl<'arena> Parser<'arena> {
26 pub fn new(src: BumpVec<'arena, u8>, file: SourceFile, bump: &'arena Bump) -> Self {
27 Parser::from_lexer(Lexer::new(src, file, bump))
28 }
29
30 pub fn from_str(src: &str, file: SourceFile, bump: &'arena Bump) -> Self {
31 Parser::from_lexer(Lexer::from_str(bump, file, src))
32 }
33
34 fn from_lexer(lexer: Lexer<'arena>) -> Self {
35 let mut parser = Parser {
36 bump: lexer.bump,
37 document: Document::new(lexer.bump),
38 peeked_lines: None,
39 peeked_meta: None,
40 ctx: ParseContext::new(lexer.bump),
41 errors: RefCell::new(Vec::new()),
42 strict: true,
43 include_resolver: None,
44 lexer,
45 #[cfg(feature = "attr_ref_observation")]
46 attr_ref_observer: None,
47 };
48 parser.set_source_file_attrs();
49 parser
50 }
51
52 pub fn apply_job_settings(&mut self, settings: JobSettings) {
53 if let Some(leveloffset) = settings.job_attrs.get("leveloffset") {
54 Parser::adjust_leveloffset(&mut self.ctx.leveloffset, &leveloffset.value);
55 }
56 self.strict = settings.strict;
57 self.ctx.max_include_depth = settings.job_attrs.u16("max-include-depth").unwrap_or(64);
58 self.document.meta = settings.into();
59 self.set_source_file_attrs();
60 }
61
62 pub fn provide_timestamps(
63 &mut self,
64 now: u64,
65 input_modified_time: Option<u64>,
66 reproducible_override: Option<u64>,
67 ) {
68 self.set_datetime_attrs(now, input_modified_time, reproducible_override);
69 }
70
71 pub fn set_resolver(&mut self, resolver: Box<dyn IncludeResolver>) {
72 self.include_resolver = Some(resolver);
73 }
74
75 #[cfg(feature = "attr_ref_observation")]
76 pub fn set_attr_ref_observer(&mut self, observer: Box<dyn AttrRefObserver>) {
77 self.attr_ref_observer = Some(observer);
78 }
79
80 pub fn cell_parser(&mut self, src: BumpVec<'arena, u8>, offset: u32) -> Parser<'arena> {
81 let mut cell_parser = Parser::new(src, self.lexer.source_file().clone(), self.bump);
82 cell_parser.include_resolver = self.include_resolver.as_ref().map(|r| r.clone_box());
83 cell_parser.strict = self.strict;
84 cell_parser.lexer.adjust_offset(offset);
85 cell_parser.ctx = self.ctx.clone_for_cell(self.bump);
86 cell_parser.document.meta = self.document.meta.clone_for_cell();
87 cell_parser.document.anchors = Rc::clone(&self.document.anchors);
88 cell_parser
89 }
90
91 pub(crate) fn loc(&self) -> SourceLocation {
92 self
93 .peeked_lines
94 .as_ref()
95 .and_then(|lines| lines.first_loc())
96 .unwrap_or_else(|| self.lexer.loc())
97 }
98
99 pub(crate) fn read_line(&mut self) -> Result<Option<Line<'arena>>> {
100 assert!(self.peeked_lines.is_none());
101 if self.lexer.is_eof() {
102 return Ok(None);
103 }
104
105 let mut drop_line = false;
106 let mut line = Line::empty(self.bump);
107 while !self.lexer.at_newline() && !self.lexer.is_eof() {
108 let token = self.lexer.next_token();
109 self.push_token_replacing_attr_ref(token, &mut line, &mut drop_line)?;
110 }
111 self.lexer.skip_newline();
112 if drop_line {
113 return self.read_line();
114 }
115 if line.starts(TokenKind::Directive) && !self.ctx.within_block_comment() {
116 match self.try_process_directive(&mut line)? {
117 DirectiveAction::Passthrough => Ok(Some(line)),
118 DirectiveAction::SubstituteLine(line) => Ok(Some(line)),
119 DirectiveAction::ReadNextLine => self.read_line(),
120 DirectiveAction::SkipLinesUntilEndIf => self.skip_lines_until_endif(&line),
121 }
122 } else {
123 Ok(Some(line))
124 }
125 }
126
127 pub(crate) fn read_lines(&mut self) -> Result<Option<ContiguousLines<'arena>>> {
128 self.ctx.comment_delim_in_lines = false;
129 if let Some(peeked) = self.peeked_lines.take() {
130 return Ok(Some(peeked));
131 }
132 self.lexer.consume_empty_lines();
133 if self.lexer.is_eof() {
134 return Ok(None);
135 }
136 let mut lines = Deq::new(self.bump);
137 while let Some(line) = self.read_line()? {
138 if line.is_emptyish() {
139 if lines.is_empty() {
140 continue;
144 } else {
145 break;
147 }
148 }
149 if line.is_delimiter_kind(DelimiterKind::Comment) {
150 self.ctx.comment_delim_in_lines = true;
151 }
152 lines.push(line);
153 if self.lexer.at_newline() {
154 break;
155 }
156 }
157 if lines.is_empty() {
158 Ok(None)
159 } else {
160 Ok(Some(ContiguousLines::new(lines)))
161 }
162 }
163
164 pub(crate) fn read_lines_until(
165 &mut self,
166 delimiter: Delimiter,
167 ) -> Result<Option<ContiguousLines<'arena>>> {
168 let Some(mut lines) = self.read_lines()? else {
169 return Ok(None);
170 };
171 if lines.any(|l| l.is_delimiter(delimiter)) {
172 return Ok(Some(lines));
173 }
174
175 let mut additional_lines = BumpVec::new_in(self.bump);
176 while !self.lexer.is_eof() && !self.at_delimiter(delimiter) {
177 additional_lines.push(self.read_line()?.unwrap());
178 }
179 lines.extend(additional_lines);
180
181 while lines.last().map(|l| l.is_empty()) == Some(true) {
182 lines.pop();
183 }
184 Ok(Some(lines))
185 }
186
187 fn at_delimiter(&self, delimiter: Delimiter) -> bool {
188 match delimiter.kind {
189 DelimiterKind::BlockQuote => self.lexer.at_delimiter_line() == Some((4, b'_')),
190 DelimiterKind::Example => {
191 self.lexer.at_delimiter_line() == Some((delimiter.len as u32, b'='))
192 }
193 DelimiterKind::Open => self.lexer.at_delimiter_line() == Some((2, b'-')),
194 DelimiterKind::Sidebar => self.lexer.at_delimiter_line() == Some((4, b'*')),
195 DelimiterKind::Listing => self.lexer.at_delimiter_line() == Some((4, b'-')),
196 DelimiterKind::Literal => self.lexer.at_delimiter_line() == Some((4, b'.')),
197 DelimiterKind::Passthrough => self.lexer.at_delimiter_line() == Some((4, b'+')),
198 DelimiterKind::Comment => self.lexer.at_delimiter_line() == Some((4, b'/')),
199 }
200 }
201
202 pub(crate) fn restore_lines(&mut self, lines: ContiguousLines<'arena>) {
203 debug_assert!(self.peeked_lines.is_none());
204 if !lines.is_empty() {
205 self.peeked_lines = Some(lines);
206 }
207 }
208
209 pub(crate) fn restore_peeked_meta(&mut self, meta: ChunkMeta<'arena>) {
210 if !meta.is_empty() {
211 debug_assert!(self.peeked_meta.is_none());
212 self.peeked_meta = Some(meta);
213 }
214 }
215
216 pub(crate) fn restore_peeked(&mut self, lines: ContiguousLines<'arena>, meta: ChunkMeta<'arena>) {
217 self.restore_lines(lines);
218 self.restore_peeked_meta(meta);
219 }
220
221 pub fn parse(mut self) -> std::result::Result<ParseResult<'arena>, Vec<Diagnostic>> {
222 self.parse_document_header()?;
223 self.prepare_toc();
224
225 if self.document.meta.get_doctype() == DocType::Inline {
228 if self.peeked_lines.is_none() {
229 self.peeked_lines = self.read_lines().expect("tmp");
231 }
232 self.lexer.truncate();
233 }
234
235 if let Some(book_content) = self.parse_book()? {
236 self.document.content = book_content;
237 } else {
238 let sectioned = self.parse_sectioned()?;
239 self.document.content = sectioned.into_doc_content(self.bump);
240 }
241
242 self.document.meta.clear_doc_attrs();
244
245 self.diagnose_document()?;
246
247 Ok(ParseResult {
248 document: self.document,
249 warnings: self.errors.into_inner(),
250 })
251 }
252
253 pub(crate) fn parse_sectioned(&mut self) -> Result<Sectioned<'arena>> {
254 let mut blocks = bvec![in self.bump];
255 while let Some(block) = self.parse_block()? {
256 blocks.push(block);
257 }
258 let preamble = if blocks.is_empty() { None } else { Some(blocks) };
259 let mut sections = bvec![in self.bump];
260 while let Some(section) = self.parse_section()? {
261 sections.push(section);
262 }
263 Ok(Sectioned { preamble, sections })
264 }
265
  /// Parses the metadata lines (title, attribute lists, anchors) that precede a chunk.
  ///
  /// If meta was previously handed back via `restore_peeked_meta`, it is
  /// returned immediately and `lines` is left untouched. Otherwise leading
  /// meta lines are consumed from `lines` and collected into a `ChunkMeta`
  /// whose start location is that of the current token.
  ///
  /// # Errors
  /// Propagates errors from parsing the title inlines, attribute lists or
  /// anchors, and from reporting an invalid block anchor.
  ///
  /// # Panics
  /// Panics if `lines` is empty or has no current token.
  pub(crate) fn parse_chunk_meta(
    &mut self,
    lines: &mut ContiguousLines<'arena>,
  ) -> Result<ChunkMeta<'arena>> {
    if let Some(meta) = self.peeked_meta.take() {
      return Ok(meta);
    }
    assert!(!lines.is_empty());
    let start_loc = lines.current_token().unwrap().loc;
    let mut attrs = MultiAttrList::new_in(self.bump);
    let mut title = None;
    // A partially consumed first line cannot start a meta line: return empty meta.
    if !lines.current().unwrap().is_fully_unconsumed() {
      return Ok(ChunkMeta::new(attrs, title, start_loc));
    }
    loop {
      match lines.current() {
        // Title line: drop the leading dots token, parse the rest as inlines.
        Some(line) if line.is_chunk_title() => {
          let mut line = lines.consume_current().unwrap();
          line.discard_assert(TokenKind::Dots);
          title = Some(self.parse_inlines(&mut line.into_lines())?);
        }
        // Attribute list line: drop the opening bracket, parse the list.
        Some(line) if line.is_block_attr_list() => {
          let mut line = lines.consume_current().unwrap();
          line.discard_assert(TokenKind::OpenBracket);
          attrs.push(self.parse_block_attr_list(&mut line)?);
        }
        // Block anchor line: drop both opening brackets, then parse the anchor.
        Some(line) if line.is_block_anchor() => {
          let mut line = lines.consume_current().unwrap();
          let first = line.discard_assert(TokenKind::OpenBracket);
          line.discard_assert(TokenKind::OpenBracket);
          let Some(anchor) = self.parse_block_anchor(&mut line)? else {
            // Report the bad anchor; if reporting does not abort, return what was collected.
            self.err_line_starting("Invalid block anchor", first.loc)?;
            return Ok(ChunkMeta::new(attrs, title, start_loc));
          };
          // Represent the anchor as an attribute list carrying its id and reftext.
          let mut anchor_attrs = AttrList::new(anchor.loc, self.bump);
          anchor_attrs.id = Some(anchor.id);
          anchor_attrs.positional.push(anchor.reftext);
          attrs.push(anchor_attrs);
        }
        // Comment lines are consumed only once some meta has been collected.
        Some(line) if line.is_comment() && (!attrs.is_empty() || title.is_some()) => {
          lines.consume_current();
        }
        _ => break,
      }
    }
    Ok(ChunkMeta::new(attrs, title, start_loc))
  }
314
315 pub(crate) fn string(&self, s: &str) -> BumpString<'arena> {
316 BumpString::from_str_in(s, self.bump)
317 }
318}
319
/// Implemented by types that can hand out their `'arena` bump allocator.
pub trait HasArena<'arena> {
  /// Returns the bump allocator.
  fn bump(&self) -> &'arena Bump;
  /// Builds a token of `kind` at `loc`, copying `lexeme` into the arena.
  fn token(&self, kind: TokenKind, lexeme: &str, loc: SourceLocation) -> Token<'arena> {
    Token::new(kind, loc, BumpString::from_str_in(lexeme, self.bump()))
  }
}
326
/// The parser exposes the arena it was constructed with.
impl<'arena> HasArena<'arena> for Parser<'arena> {
  fn bump(&self) -> &'arena Bump {
    self.bump
  }
}
332
/// Outcome of processing a directive line, as interpreted by `Parser::read_line`.
pub enum DirectiveAction<'arena> {
  /// Return the directive line unchanged.
  Passthrough,
  /// Discard the directive line and read the next line instead.
  ReadNextLine,
  /// Continue via `skip_lines_until_endif`, passing the directive line.
  SkipLinesUntilEndIf,
  /// Return the contained line in place of the directive line.
  SubstituteLine(Line<'arena>),
}
339
/// Where the source text being parsed came from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceFile {
  /// Input read from stdin; reported as `<stdin>`.
  /// NOTE(review): `cwd` is presumably the working directory at read time — confirm.
  Stdin { cwd: Path },
  /// Input backed by a path.
  Path(Path),
  /// Input with no backing path; reported as `<temp-buffer>`.
  Tmp,
}
346
347impl SourceFile {
348 pub fn file_name(&self) -> &str {
349 match self {
350 SourceFile::Stdin { .. } => "<stdin>",
351 SourceFile::Path(path) => path.file_name(),
352 SourceFile::Tmp => "<temp-buffer>",
353 }
354 }
355
356 pub fn matches_xref_target(&self, target: &str) -> bool {
357 let SourceFile::Path(path) = self else {
358 return false;
359 };
360 let filename = path.file_name();
361 if filename == target {
362 return true;
363 }
364 let xref_ext = file::ext(target);
365 let path_ext = file::ext(filename);
366 if xref_ext.is_some() && xref_ext != path_ext {
367 return false;
368 }
369 let fullpath = path.to_string();
370 if fullpath.ends_with(target) {
371 true
372 } else if xref_ext.is_some() {
373 false
374 } else {
375 file::remove_ext(&fullpath).ends_with(target)
376 }
377 }
378}
379
380impl From<Diagnostic> for Vec<Diagnostic> {
381 fn from(diagnostic: Diagnostic) -> Self {
382 vec![diagnostic]
383 }
384}
385
386#[cfg(test)]
387mod tests {
388 use super::*;
389 use test_utils::*;
390
  /// Builds a mock include resolver that answers every include with the bytes of `src`.
  /// NOTE(review): no test visible in this module calls it — confirm it is still needed.
  fn resolve(src: &'static str) -> Box<dyn IncludeResolver> {
    #[derive(Clone)]
    struct MockResolver(pub Vec<u8>);
    impl IncludeResolver for MockResolver {
      // Ignores the target: sizes the buffer, copies the stored bytes in, returns their length.
      fn resolve(
        &mut self,
        _: IncludeTarget,
        buffer: &mut dyn IncludeBuffer,
      ) -> std::result::Result<usize, ResolveError> {
        buffer.initialize(self.0.len());
        let bytes = buffer.as_bytes_mut();
        bytes.copy_from_slice(&self.0);
        Ok(self.0.len())
      }
      fn get_base_dir(&self) -> Option<String> {
        Some("/".to_string())
      }
      fn clone_box(&self) -> Box<dyn IncludeResolver> {
        Box::new(self.clone())
      }
    }
    Box::new(MockResolver(Vec::from(src.as_bytes())))
  }
414
415 fn reassemble(lines: ContiguousLines) -> String {
416 lines
417 .iter()
418 .map(|l| l.reassemble_src())
419 .collect::<Vec<_>>()
420 .join("\n")
421 }
422
  /// An attribute reference is kept as an `AttrRef` token and followed by the
  /// tokens of its value; the inserted tokens all carry the reference's location.
  #[test]
  fn test_attr_ref() {
    let mut parser = test_parser!("hello {foo} world");
    parser
      .document
      .meta
      .insert_doc_attr("foo", "_bar_")
      .unwrap();
    let mut lines = parser.read_lines().unwrap().unwrap();
    let line = lines.consume_current().unwrap();
    let tokens = line.into_iter().collect::<Vec<_>>();
    expect_eq!(
      &tokens,
      &[
        Token::new(TokenKind::Word, loc!(0..5), bstr!("hello")),
        Token::new(TokenKind::Whitespace, loc!(5..6), bstr!(" ")),
        Token::new(TokenKind::AttrRef, loc!(6..11), bstr!("{foo}")),
        // Tokens lexed from the attribute value `_bar_` reuse the `{foo}` location.
        Token::new(TokenKind::Underscore, loc!(6..11), bstr!("_")),
        Token::new(TokenKind::Word, loc!(6..11), bstr!("bar")),
        Token::new(TokenKind::Underscore, loc!(6..11), bstr!("_")),
        Token::new(TokenKind::Whitespace, loc!(11..12), bstr!(" ")),
        Token::new(TokenKind::Word, loc!(12..17), bstr!("world")),
      ]
    );
  }
452
  /// A malformed include directive line is returned unchanged among its neighbors.
  #[test]
  fn invalid_directive_line_passed_thru() {
    let input = adoc! {"
      foo
      include::invalid []
      bar
    "};

    let mut parser = test_parser!(input);
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      input.trim_end()
    );
  }
467
  /// With secure job settings an include directive line is replaced by a
  /// `link:` macro line; also pins the exact tokens and locations produced.
  #[test]
  fn safe_mode_include_to_link() {
    let input = adoc! {"
      foo
      include::include-file.adoc[]
      baz
    "};

    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      adoc! {"
        foo
        link:include-file.adoc[role=include,]
        baz"
      }
    );

    // Second pass: read line by line to inspect the substituted tokens.
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());

    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      line.consume_current().unwrap(),
      Token::new(TokenKind::Word, loc!(0..3), bstr!("foo"))
    );
    assert!(line.consume_current().is_none());

    // Sanity-check the input offsets that the expected locations below refer to.
    assert_eq!(&input[8..13], "ude::");
    assert_eq!(&input[30..32], "[]");

    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      std::array::from_fn(|_| line.consume_current().unwrap()),
      [
        // `link:` takes the location of the last five bytes of `include::`.
        Token::new(TokenKind::MacroName, loc!(8..13), bstr!("link:")),
        Token::new(TokenKind::Word, loc!(13..20), bstr!("include")),
        Token::new(TokenKind::Dashes, loc!(20..21), bstr!("-")),
        Token::new(TokenKind::Word, loc!(21..25), bstr!("file")),
        Token::new(TokenKind::Dots, loc!(25..26), bstr!(".")),
        Token::new(TokenKind::Word, loc!(26..30), bstr!("adoc")),
        Token::new(TokenKind::OpenBracket, loc!(30..31), bstr!("[")),
        // The inserted `role=include,` tokens have zero-width locations.
        Token::new(TokenKind::Word, loc!(31..31), bstr!("role")),
        Token::new(TokenKind::EqualSigns, loc!(31..31), bstr!("=")),
        Token::new(TokenKind::Word, loc!(31..31), bstr!("include")),
        Token::new(TokenKind::Comma, loc!(31..31), bstr!(",")),
        Token::new(TokenKind::CloseBracket, loc!(31..32), bstr!("]")),
      ]
    );
    assert!(line.consume_current().is_none());
  }
526
  /// Attributes on the include directive are kept after the inserted `role=include,`.
  #[test]
  fn attrs_preserved_when_replacing_include() {
    let input = "include::some-file.adoc[leveloffset+=1]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:some-file.adoc[role=include,leveloffset+=1]"
    );
  }
537
  /// An include target containing spaces is wrapped in `pass:c[...]` in the link.
  #[test]
  fn spaces_in_include_file_to_pass_macro_link() {
    let input = "include::foo bar baz.adoc[]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[foo bar baz.adoc][role=include,]"
    );
  }
548
  /// With unsafe settings and `strict` turned off, an include of a URI target
  /// (here containing a space) is expected to read back as a `link:` macro
  /// wrapping the target in `pass:c[...]`.
  #[test]
  fn uri_read_not_allowed_include_non_strict() {
    let input = "include::https://my.com/foo bar.adoc[]";
    let mut parser = test_parser!(input);
    let mut settings = JobSettings::r#unsafe();
    settings.strict = false;
    parser.apply_job_settings(settings);
    expect_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[https://my.com/foo bar.adoc][role=include,]",
      from: input
    );
  }
563}