use std::{cell::RefCell, rc::Rc};

use crate::internal::*;

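/// Arena-allocated parser for a single AsciiDoc source. All AST nodes live in
/// the supplied bump arena for the `'arena` lifetime.
///
/// A minimal usage sketch (assuming `Bump` is the bumpalo arena re-exported
/// through `crate::internal`):
///
/// ```ignore
/// let bump = Bump::new();
/// let parser = Parser::from_str("hello, *world*", SourceFile::Tmp, &bump);
/// let result = parser.parse().expect("document should parse");
/// let document = result.document;
/// ```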
pub struct Parser<'arena> {
  pub(super) bump: &'arena Bump,
  pub(super) lexer: Lexer<'arena>,
  pub(super) document: Document<'arena>,
  pub(super) peeked_lines: Option<ContiguousLines<'arena>>,
  pub(super) peeked_meta: Option<ChunkMeta<'arena>>,
  pub(super) ctx: ParseContext<'arena>,
  pub(super) errors: RefCell<Vec<Diagnostic>>,
  pub(super) strict: bool,
  pub(super) include_resolver: Option<Box<dyn IncludeResolver>>,
}

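/// The successful output of [`Parser::parse`]: the finished document plus any
/// non-fatal diagnostics gathered along the way.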
pub struct ParseResult<'arena> {
  pub document: Document<'arena>,
  pub warnings: Vec<Diagnostic>,
}

impl<'arena> Parser<'arena> {
  pub fn new(src: BumpVec<'arena, u8>, file: SourceFile, bump: &'arena Bump) -> Self {
    Parser::from_lexer(Lexer::new(src, file, bump))
  }

  pub fn from_str(src: &str, file: SourceFile, bump: &'arena Bump) -> Self {
    Parser::from_lexer(Lexer::from_str(bump, file, src))
  }

  fn from_lexer(lexer: Lexer<'arena>) -> Self {
    let mut parser = Parser {
      bump: lexer.bump,
      document: Document::new(lexer.bump),
      peeked_lines: None,
      peeked_meta: None,
      ctx: ParseContext::new(lexer.bump),
      errors: RefCell::new(Vec::new()),
      strict: true,
      include_resolver: None,
      lexer,
    };
    parser.set_source_file_attrs();
    parser
  }

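  /// Applies invocation-level settings before parsing begins: strictness, and
  /// job attributes such as `leveloffset` and `max-include-depth`.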
  pub fn apply_job_settings(&mut self, settings: JobSettings) {
    if let Some(leveloffset) = settings.job_attrs.get("leveloffset") {
      Parser::adjust_leveloffset(&mut self.ctx.leveloffset, &leveloffset.value);
    }
    self.strict = settings.strict;
    self.ctx.max_include_depth = settings.job_attrs.u16("max-include-depth").unwrap_or(64);
    self.document.meta = settings.into();
    self.set_source_file_attrs();
  }

  pub fn provide_timestamps(
    &mut self,
    now: u64,
    input_modified_time: Option<u64>,
    reproducible_override: Option<u64>,
  ) {
    self.set_datetime_attrs(now, input_modified_time, reproducible_override);
  }

  pub fn set_resolver(&mut self, resolver: Box<dyn IncludeResolver>) {
    self.include_resolver = Some(resolver);
  }

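  /// Creates a nested parser for an embedded source (e.g. an AsciiDoc table
  /// cell), sharing this parser's arena, registered anchors, and document
  /// meta, with source locations shifted by `offset`.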
  pub fn cell_parser(&mut self, src: BumpVec<'arena, u8>, offset: u32) -> Parser<'arena> {
    let mut cell_parser = Parser::new(src, self.lexer.source_file().clone(), self.bump);
    cell_parser.strict = self.strict;
    cell_parser.lexer.adjust_offset(offset);
    cell_parser.ctx = self.ctx.clone_for_cell(self.bump);
    cell_parser.document.meta = self.document.meta.clone_for_cell();
    cell_parser.document.anchors = Rc::clone(&self.document.anchors);
    cell_parser
  }

  pub(crate) fn loc(&self) -> SourceLocation {
    self
      .peeked_lines
      .as_ref()
      .and_then(|lines| lines.first_loc())
      .unwrap_or_else(|| self.lexer.loc())
  }

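  /// Reads the next line of tokens from the lexer, substituting attribute
  /// references and handling preprocessor directives (includes, conditionals)
  /// along the way. Returns `Ok(None)` at end of input.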
  pub(crate) fn read_line(&mut self) -> Result<Option<Line<'arena>>> {
    assert!(self.peeked_lines.is_none());
    if self.lexer.is_eof() {
      return Ok(None);
    }

    let mut drop_line = false;
    let mut line = Line::empty(self.bump);
    while !self.lexer.at_newline() && !self.lexer.is_eof() {
      let token = self.lexer.next_token();
      self.push_token_replacing_attr_ref(token, &mut line, &mut drop_line)?;
    }
    self.lexer.skip_newline();
    if drop_line {
      return self.read_line();
    }
    if line.starts(TokenKind::Directive) {
      match self.try_process_directive(&mut line)? {
        DirectiveAction::Passthrough => Ok(Some(line)),
        DirectiveAction::SubstituteLine(line) => Ok(Some(line)),
        DirectiveAction::ReadNextLine => self.read_line(),
        DirectiveAction::SkipLinesUntilEndIf => self.skip_lines_until_endif(&line),
      }
    } else {
      Ok(Some(line))
    }
  }

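  /// Returns the next group of contiguous (non-empty) lines, preferring any
  /// lines previously put back via [`Parser::restore_lines`]. Returns
  /// `Ok(None)` once the input is exhausted.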
  pub(crate) fn read_lines(&mut self) -> Result<Option<ContiguousLines<'arena>>> {
    if let Some(peeked) = self.peeked_lines.take() {
      return Ok(Some(peeked));
    }
    self.lexer.consume_empty_lines();
    if self.lexer.is_eof() {
      return Ok(None);
    }
    let mut lines = Deq::new(self.bump);
    while let Some(line) = self.read_line()? {
      if line.is_emptyish() {
        if lines.is_empty() {
          continue;
        } else {
          break;
        }
      }
      lines.push(line);
      if self.lexer.at_newline() {
        break;
      }
    }
    if lines.is_empty() {
      Ok(None)
    } else {
      Ok(Some(ContiguousLines::new(lines)))
    }
  }

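  /// Like [`Parser::read_lines`], but keeps reading across empty lines until
  /// a matching delimiter line (or end of input) is reached, so delimited
  /// blocks may contain blank lines.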
  pub(crate) fn read_lines_until(
    &mut self,
    delimiter: Delimiter,
  ) -> Result<Option<ContiguousLines<'arena>>> {
    let Some(mut lines) = self.read_lines()? else {
      return Ok(None);
    };
    if lines.any(|l| l.is_delimiter(delimiter)) {
      return Ok(Some(lines));
    }

    let mut additional_lines = BumpVec::new_in(self.bump);
    while !self.lexer.is_eof() && !self.at_delimiter(delimiter) {
      additional_lines.push(self.read_line()?.unwrap());
    }
    lines.extend(additional_lines);
    Ok(Some(lines))
  }

  fn at_delimiter(&self, delimiter: Delimiter) -> bool {
    match delimiter.kind {
      DelimiterKind::BlockQuote => self.lexer.at_delimiter_line() == Some((4, b'_')),
      DelimiterKind::Example => {
        self.lexer.at_delimiter_line() == Some((delimiter.len as u32, b'='))
      }
      DelimiterKind::Open => self.lexer.at_delimiter_line() == Some((2, b'-')),
      DelimiterKind::Sidebar => self.lexer.at_delimiter_line() == Some((4, b'*')),
      DelimiterKind::Listing => self.lexer.at_delimiter_line() == Some((4, b'-')),
      DelimiterKind::Literal => self.lexer.at_delimiter_line() == Some((4, b'.')),
      DelimiterKind::Passthrough => self.lexer.at_delimiter_line() == Some((4, b'+')),
      DelimiterKind::Comment => self.lexer.at_delimiter_line() == Some((4, b'/')),
    }
  }

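  /// Puts unconsumed lines back so the next call to [`Parser::read_lines`]
  /// returns them again; an empty group is silently dropped.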
  pub(crate) fn restore_lines(&mut self, lines: ContiguousLines<'arena>) {
    debug_assert!(self.peeked_lines.is_none());
    if !lines.is_empty() {
      self.peeked_lines = Some(lines);
    }
  }

  pub(crate) fn restore_peeked_meta(&mut self, meta: ChunkMeta<'arena>) {
    if !meta.is_empty() {
      debug_assert!(self.peeked_meta.is_none());
      self.peeked_meta = Some(meta);
    }
  }

  pub(crate) fn restore_peeked(&mut self, lines: ContiguousLines<'arena>, meta: ChunkMeta<'arena>) {
    self.restore_lines(lines);
    self.restore_peeked_meta(meta);
  }

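  /// Consumes the parser and produces the finished document: parses the
  /// document header, then sections and blocks until the input is exhausted.
  /// On failure, all accumulated diagnostics are returned as the error value.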
  pub fn parse(mut self) -> std::result::Result<ParseResult<'arena>, Vec<Diagnostic>> {
    self.parse_document_header()?;
    self.prepare_toc();

    if self.document.meta.get_doctype() == DocType::Inline {
      if self.peeked_lines.is_none() {
        self.peeked_lines = self.read_lines().expect("tmp");
      }
      self.lexer.truncate();
    }

    while let Some(chunk) = self.parse_chunk()? {
      match chunk {
        Chunk::Block(block) => self.document.content.push_block(block, self.bump),
        Chunk::Section(section) => self.document.content.push_section(section, self.bump),
      }
    }

    self.document.meta.clear_doc_attrs();

    self.diagnose_document()?;

    Ok(ParseResult {
      document: self.document,
      warnings: vec![],
    })
  }

  fn parse_chunk(&mut self) -> Result<Option<Chunk<'arena>>> {
    match self.parse_section()? {
      Some(section) => Ok(Some(Chunk::Section(section))),
      None => Ok(self.parse_block()?.map(Chunk::Block)),
    }
  }

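  /// Gathers metadata lines preceding a block or section, consuming them from
  /// `lines`: a block title (`.Title`), attribute lists (`[...]`), and block
  /// anchors (`[[id]]`), plus any comment lines interleaved among them.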
  pub(crate) fn parse_chunk_meta(
    &mut self,
    lines: &mut ContiguousLines<'arena>,
  ) -> Result<ChunkMeta<'arena>> {
    if let Some(meta) = self.peeked_meta.take() {
      return Ok(meta);
    }
    assert!(!lines.is_empty());
    let start_loc = lines.current_token().unwrap().loc;
    let mut attrs = MultiAttrList::new_in(self.bump);
    let mut title = None;
    if !lines.current().unwrap().is_fully_unconsumed() {
      return Ok(ChunkMeta::new(attrs, title, start_loc));
    }
    loop {
      match lines.current() {
        Some(line) if line.is_chunk_title() => {
          let mut line = lines.consume_current().unwrap();
          line.discard_assert(TokenKind::Dots);
          title = Some(self.parse_inlines(&mut line.into_lines())?);
        }
        Some(line) if line.is_block_attr_list() => {
          let mut line = lines.consume_current().unwrap();
          line.discard_assert(TokenKind::OpenBracket);
          attrs.push(self.parse_block_attr_list(&mut line)?);
        }
        Some(line) if line.is_block_anchor() => {
          let mut line = lines.consume_current().unwrap();
          line.discard_assert(TokenKind::OpenBracket);
          line.discard_assert(TokenKind::OpenBracket);
          let anchor = self.parse_block_anchor(&mut line)?.unwrap();
          let mut anchor_attrs = AttrList::new(anchor.loc, self.bump);
          anchor_attrs.id = Some(anchor.id);
          anchor_attrs.positional.push(anchor.reftext);
          attrs.push(anchor_attrs);
        }
        Some(line) if line.is_comment() && (!attrs.is_empty() || title.is_some()) => {
          lines.consume_current();
        }
        _ => break,
      }
    }
    Ok(ChunkMeta::new(attrs, title, start_loc))
  }

  pub(crate) fn string(&self, s: &str) -> BumpString<'arena> {
    BumpString::from_str_in(s, self.bump)
  }
}

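/// Access to the parse arena, with a convenience constructor for tokens
/// allocated in it.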
pub trait HasArena<'arena> {
  fn bump(&self) -> &'arena Bump;
  fn token(&self, kind: TokenKind, lexeme: &str, loc: SourceLocation) -> Token<'arena> {
    Token::new(kind, loc, BumpString::from_str_in(lexeme, self.bump()))
  }
}

impl<'arena> HasArena<'arena> for Parser<'arena> {
  fn bump(&self) -> &'arena Bump {
    self.bump
  }
}

#[derive(Debug)]
pub enum Chunk<'arena> {
  Block(Block<'arena>),
  Section(Section<'arena>),
}

pub enum DirectiveAction<'arena> {
  Passthrough,
  ReadNextLine,
  SkipLinesUntilEndIf,
  SubstituteLine(Line<'arena>),
}

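/// Identifies where the source being parsed came from: stdin (with its
/// working directory), a file path, or a temporary in-memory buffer.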
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SourceFile {
  Stdin { cwd: Path },
  Path(Path),
  Tmp,
}

impl SourceFile {
  pub fn file_name(&self) -> &str {
    match self {
      SourceFile::Stdin { .. } => "<stdin>",
      SourceFile::Path(path) => path.file_name(),
      SourceFile::Tmp => "<temp-buffer>",
    }
  }

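  /// Returns `true` if this source file is a plausible target of an
  /// inter-document xref: the target must match the file name or a suffix of
  /// the full path, compared with or without the file extension.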
  pub fn matches_xref_target(&self, target: &str) -> bool {
    let SourceFile::Path(path) = self else {
      return false;
    };
    let filename = path.file_name();
    if filename == target {
      return true;
    }
    let xref_ext = file::ext(target);
    let path_ext = file::ext(filename);
    if xref_ext.is_some() && xref_ext != path_ext {
      return false;
    }
    let fullpath = path.to_string();
    if fullpath.ends_with(target) {
      true
    } else if xref_ext.is_some() {
      false
    } else {
      file::remove_ext(&fullpath).ends_with(target)
    }
  }
}

impl From<Diagnostic> for Vec<Diagnostic> {
  fn from(diagnostic: Diagnostic) -> Self {
    vec![diagnostic]
  }
}

#[cfg(test)]
mod tests {
  use super::*;
  use test_utils::*;

  fn resolve(src: &'static str) -> Box<dyn IncludeResolver> {
    struct MockResolver(pub Vec<u8>);
    impl IncludeResolver for MockResolver {
      fn resolve(
        &mut self,
        _: IncludeTarget,
        buffer: &mut dyn IncludeBuffer,
      ) -> std::result::Result<usize, ResolveError> {
        buffer.initialize(self.0.len());
        let bytes = buffer.as_bytes_mut();
        bytes.copy_from_slice(&self.0);
        Ok(self.0.len())
      }
      fn get_base_dir(&self) -> Option<String> {
        Some("/".to_string())
      }
    }
    Box::new(MockResolver(Vec::from(src.as_bytes())))
  }

  fn reassemble(lines: ContiguousLines) -> String {
    lines
      .iter()
      .map(|l| l.reassemble_src())
      .collect::<Vec<_>>()
      .join("\n")
  }

  #[test]
  fn test_attr_ref() {
    let mut parser = test_parser!("hello {foo} world");
    parser
      .document
      .meta
      .insert_doc_attr("foo", "_bar_")
      .unwrap();
    let mut lines = parser.read_lines().unwrap().unwrap();
    let line = lines.consume_current().unwrap();
    let tokens = line.into_iter().collect::<Vec<_>>();
    expect_eq!(
      &tokens,
      &[
        Token::new(TokenKind::Word, 0..5, bstr!("hello")),
        Token::new(TokenKind::Whitespace, 5..6, bstr!(" ")),
        Token::new(TokenKind::AttrRef, 6..11, bstr!("{foo}")),
        Token::new(TokenKind::Underscore, 6..11, bstr!("_")),
        Token::new(TokenKind::Word, 6..11, bstr!("bar")),
        Token::new(TokenKind::Underscore, 6..11, bstr!("_")),
        Token::new(TokenKind::Whitespace, 11..12, bstr!(" ")),
        Token::new(TokenKind::Word, 12..17, bstr!("world")),
      ]
    );
  }

  #[test]
  fn invalid_directive_line_passed_thru() {
    let input = adoc! {"
      foo
      include::invalid []
      bar
    "};

    let mut parser = test_parser!(input);
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      input.trim_end()
    );
  }

  #[test]
  fn safe_mode_include_to_link() {
    let input = adoc! {"
      foo
      include::include-file.adoc[]
      baz
    "};

    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      reassemble(parser.read_lines().unwrap().unwrap()),
      adoc! {"
        foo
        link:include-file.adoc[role=include,]
        baz"
      }
    );

    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());

    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      line.consume_current().unwrap(),
      Token::new(TokenKind::Word, 0..3, bstr!("foo"))
    );
    assert!(line.consume_current().is_none());

    assert_eq!(&input[8..13], "ude::");
    assert_eq!(&input[30..32], "[]");

    let mut line = parser.read_line().unwrap().unwrap();
    expect_eq!(
      std::array::from_fn(|_| line.consume_current().unwrap()),
      [
        Token::new(TokenKind::MacroName, 8..13, bstr!("link:")),
        Token::new(TokenKind::Word, 13..20, bstr!("include")),
        Token::new(TokenKind::Dashes, 20..21, bstr!("-")),
        Token::new(TokenKind::Word, 21..25, bstr!("file")),
        Token::new(TokenKind::Dots, 25..26, bstr!(".")),
        Token::new(TokenKind::Word, 26..30, bstr!("adoc")),
        Token::new(TokenKind::OpenBracket, 30..31, bstr!("[")),
        Token::new(TokenKind::Word, 31..31, bstr!("role")),
        Token::new(TokenKind::EqualSigns, 31..31, bstr!("=")),
        Token::new(TokenKind::Word, 31..31, bstr!("include")),
        Token::new(TokenKind::Comma, 31..31, bstr!(",")),
        Token::new(TokenKind::CloseBracket, 31..32, bstr!("]")),
      ]
    );
    assert!(line.consume_current().is_none());
  }

  #[test]
  fn attrs_preserved_when_replacing_include() {
    let input = "include::some-file.adoc[leveloffset+=1]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:some-file.adoc[role=include,leveloffset+=1]"
    );
  }

  #[test]
  fn spaces_in_include_file_to_pass_macro_link() {
    let input = "include::foo bar baz.adoc[]";
    let mut parser = test_parser!(input);
    parser.apply_job_settings(JobSettings::secure());
    assert_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[foo bar baz.adoc][role=include,]"
    );
  }

  #[test]
  fn uri_read_not_allowed_include_non_strict() {
    let input = "include::https://my.com/foo bar.adoc[]";
    let mut parser = test_parser!(input);
    let mut settings = JobSettings::r#unsafe();
    settings.strict = false;
    parser.apply_job_settings(settings);
    expect_eq!(
      parser.read_line().unwrap().unwrap().reassemble_src(),
      "link:pass:c[https://my.com/foo bar.adoc][role=include,]",
      from: input
    );
  }
}