acdc_parser/lib.rs
1#![deny(clippy::pedantic)]
2#![warn(clippy::all)]
3//! `AsciiDoc` parser.
4//!
5//! This module provides a parser for the `AsciiDoc` markup language. The parser is
6//! implemented using the `peg` parser generator.
7//!
8//! # Quick Start
9//!
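//! The crate exposes free functions for parsing `AsciiDoc` content, plus a
//! [`Parser`] struct that wraps `parse` and `parse_inline` in a fluent API:
//!
//! - `parse`: parses a string containing `AsciiDoc` content.
//! - `parse_file`: parses the content of a file containing `AsciiDoc` content.
//! - `parse_from_reader`: parses `AsciiDoc` content read from any `std::io::Read`.
//! - `parse_inline`: parses a string as inline `AsciiDoc` elements only.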
15//!
16//! ```rust
17//! use acdc_parser::{Document, parse};
18//!
19//! let content = r#"= Document Title
20//!
21//! This is a paragraph.
22//!
23//! == Section Title
24//!
25//! This is a subsection."#;
26//!
27//! let options = acdc_parser::Options::default();
28//! let document = parse(content, &options).unwrap();
29//!
30//! println!("{:?}", document);
31//! ```
32//!
33//! # Features
34//!
35//! - Full support for `AsciiDoc` syntax, including blocks, inline elements, attributes, and more.
36//! - Configurable options for parsing behaviour, including safe mode and timing. Just
37//! like `asciidoctor`, you can choose to enable or disable certain features based on your
38//! needs.
39//! - Detailed error reporting with source location information.
40//! - Support for parsing from strings, files, and readers.
41//!
42
43use std::{
44 path::{Path, PathBuf},
45 string::ToString,
46};
47
48use tracing::instrument;
49
50mod blocks;
51mod constants;
52mod error;
53pub(crate) mod grammar;
54mod model;
55mod options;
56mod preprocessor;
57mod safe_mode;
58
59pub(crate) use grammar::{InlinePreprocessorParserState, ProcessedContent, inline_preprocessing};
60use preprocessor::Preprocessor;
61
62pub use error::{Error, Positioning, SourceLocation};
63pub use grammar::parse_text_for_quotes;
64pub use model::{
65 Admonition, AdmonitionVariant, Anchor, AttributeName, AttributeValue, Attribution, Audio,
66 Author, Autolink, Block, BlockMetadata, Bold, Button, CalloutList, CalloutListItem, CalloutRef,
67 CalloutRefKind, CiteTitle, ColumnFormat, ColumnStyle, ColumnWidth, Comment, CrossReference,
68 CurvedApostrophe, CurvedQuotation, DelimitedBlock, DelimitedBlockType, DescriptionList,
69 DescriptionListItem, DiscreteHeader, Document, DocumentAttribute, DocumentAttributes,
70 ElementAttributes, Footnote, Form, HEADER, Header, Highlight, HorizontalAlignment, ICON_SIZES,
71 Icon, Image, IndexTerm, IndexTermKind, InlineMacro, InlineNode, Italic, Keyboard, LineBreak,
72 Link, ListItem, ListItemCheckedStatus, Location, MAX_SECTION_LEVELS, MAX_TOC_LEVELS, Mailto,
73 Menu, Monospace, NORMAL, OrderedList, PageBreak, Paragraph, Pass, PassthroughKind, Plain,
74 Position, Raw, Role, Section, Source, SourceUrl, StandaloneCurvedApostrophe, Stem, StemContent,
75 StemNotation, Subscript, Substitution, SubstitutionOp, SubstitutionSpec, Subtitle, Superscript,
76 Table, TableColumn, TableOfContents, TableRow, ThematicBreak, Title, TocEntry,
77 UNNUMBERED_SECTION_STYLES, UnorderedList, Url, VERBATIM, Verbatim, VerticalAlignment, Video,
78 inlines_to_string, strip_quotes, substitute,
79};
80pub use options::{Options, OptionsBuilder, SafeMode};
81
82/// Type-based parser for `AsciiDoc` content.
83///
84/// `Parser` provides a more discoverable, fluent API for parsing `AsciiDoc` documents.
85///
86/// # Examples
87///
88/// Basic usage:
89///
90/// ```
91/// use acdc_parser::Parser;
92///
93/// let content = "= Document Title\n\nParagraph text.";
94/// let doc = Parser::new(content).parse()?;
95/// # Ok::<(), acdc_parser::Error>(())
96/// ```
97///
98/// With options:
99///
100/// ```
101/// use acdc_parser::{Parser, Options, SafeMode};
102///
103/// let content = "= Document Title\n\nParagraph text.";
104/// let options = Options::builder()
105/// .with_safe_mode(SafeMode::Safe)
106/// .with_timings()
107/// .build();
108///
109/// let doc = Parser::new(content)
110/// .with_options(options)
111/// .parse()?;
112/// # Ok::<(), acdc_parser::Error>(())
113/// ```
114///
115/// For file-based parsing, read the file first:
116///
117/// ```no_run
118/// use acdc_parser::Parser;
119/// use std::fs;
120///
121/// let content = fs::read_to_string("document.adoc")?;
122/// let doc = Parser::new(&content).parse()?;
123/// # Ok::<(), Box<dyn std::error::Error>>(())
124/// ```
125#[derive(Debug)]
126pub struct Parser<'input> {
127 input: &'input str,
128 options: Options,
129}
130
131impl<'input> Parser<'input> {
132 /// Create a new parser for the given input string.
133 ///
134 /// The parser will use default options. Use `with_options` to customize.
135 ///
136 /// # Example
137 ///
138 /// ```
139 /// use acdc_parser::Parser;
140 ///
141 /// let parser = Parser::new("= Title\n\nContent");
142 /// let doc = parser.parse()?;
143 /// # Ok::<(), acdc_parser::Error>(())
144 /// ```
145 #[must_use]
146 pub fn new(input: &'input str) -> Self {
147 Self {
148 input,
149 options: Options::default(),
150 }
151 }
152
153 /// Set the options for this parser.
154 ///
155 /// This consumes the parser and returns a new one with the specified options.
156 ///
157 /// # Example
158 ///
159 /// ```
160 /// use acdc_parser::{Parser, Options, SafeMode};
161 ///
162 /// let options = Options::builder()
163 /// .with_safe_mode(SafeMode::Safe)
164 /// .build();
165 ///
166 /// let parser = Parser::new("= Title")
167 /// .with_options(options);
168 /// # Ok::<(), acdc_parser::Error>(())
169 /// ```
170 #[must_use]
171 pub fn with_options(mut self, options: Options) -> Self {
172 self.options = options;
173 self
174 }
175
176 /// Parse the input into a Document.
177 ///
178 /// # Example
179 ///
180 /// ```
181 /// use acdc_parser::Parser;
182 ///
183 /// let doc = Parser::new("= Title\n\nContent").parse()?;
184 /// # Ok::<(), acdc_parser::Error>(())
185 /// ```
186 ///
187 /// # Errors
188 ///
189 /// Returns an error if the input cannot be parsed as valid `AsciiDoc`.
190 pub fn parse(self) -> Result<Document, Error> {
191 parse(self.input, &self.options)
192 }
193
194 /// Parse only inline elements from the input.
195 ///
196 /// This is useful for parsing fragments of `AsciiDoc` that contain only
197 /// inline markup like bold, italic, links, etc.
198 ///
199 /// # Example
200 ///
201 /// ```
202 /// use acdc_parser::Parser;
203 ///
204 /// let inlines = Parser::new("This is *bold* text").parse_inline()?;
205 /// # Ok::<(), acdc_parser::Error>(())
206 /// ```
207 ///
208 /// # Errors
209 ///
210 /// Returns an error if the input cannot be parsed.
211 pub fn parse_inline(self) -> Result<Vec<InlineNode>, Error> {
212 parse_inline(self.input, &self.options)
213 }
214}
215
216/// Parse `AsciiDoc` content from a reader.
217///
218/// This function reads the content from the provided reader and parses it as `AsciiDoc`.
219///
220/// # Example
221///
222/// ```
223/// use acdc_parser::{Options, SafeMode, parse_from_reader};
224/// use std::fs::File;
225///
226/// let options = Options::builder()
227/// .with_safe_mode(SafeMode::Unsafe)
228/// .build();
229/// let file = File::open("fixtures/samples/README.adoc").unwrap();
230/// let document = parse_from_reader(file, &options).unwrap();
231/// ```
232///
233/// # Errors
234/// This function returns an error if the content cannot be parsed.
235#[instrument(skip(reader))]
236pub fn parse_from_reader<R: std::io::Read>(
237 reader: R,
238 options: &Options,
239) -> Result<Document, Error> {
240 let result = Preprocessor.process_reader(reader, options)?;
241 parse_input(
242 &result.text,
243 options,
244 None,
245 result.leveloffset_ranges,
246 result.source_ranges,
247 )
248}
249
250/// Parse `AsciiDoc` content from a string.
251///
252/// This function parses the provided string as `AsciiDoc`.
253///
254/// # Example
255///
256/// ```
257/// use acdc_parser::{Options, SafeMode, parse};
258///
259/// let options = Options::builder()
260/// .with_safe_mode(SafeMode::Unsafe)
261/// .build();
262/// let content = "= Document Title\n\nThis is a paragraph.\n\n== Section Title\n\nThis is a subsection.";
263/// let document = parse(content, &options).unwrap();
264/// ```
265///
266/// # Errors
267/// This function returns an error if the content cannot be parsed.
268#[instrument]
269pub fn parse(input: &str, options: &Options) -> Result<Document, Error> {
270 let result = Preprocessor.process(input, options)?;
271 parse_input(
272 &result.text,
273 options,
274 None,
275 result.leveloffset_ranges,
276 result.source_ranges,
277 )
278}
279
280/// Parse `AsciiDoc` content from a file.
281///
282/// This function reads the content from the provided file and parses it as `AsciiDoc`.
283///
284/// # Example
285///
286/// ```
287/// use std::path::Path;
288/// use acdc_parser::{Options, SafeMode, parse_file};
289///
290/// let options = Options::builder()
291/// .with_safe_mode(SafeMode::Unsafe)
292/// .build();
293/// let file_path = Path::new("fixtures/samples/README.adoc");
294/// let document = parse_file(file_path, &options).unwrap();
295/// ```
296///
297/// # Errors
298/// This function returns an error if the content cannot be parsed.
299#[instrument(skip(file_path))]
300pub fn parse_file<P: AsRef<Path>>(file_path: P, options: &Options) -> Result<Document, Error> {
301 let path = file_path.as_ref().to_path_buf();
302 let result = Preprocessor.process_file(file_path, options)?;
303 parse_input(
304 &result.text,
305 options,
306 Some(path),
307 result.leveloffset_ranges,
308 result.source_ranges,
309 )
310}
311
312/// Helper to convert a PEG parse error to our `SourceLocation` type,
313/// resolving the correct file and line for included content.
314fn peg_error_to_source_location(
315 error: &peg::error::ParseError<peg::str::LineCol>,
316 state: &grammar::ParserState,
317) -> SourceLocation {
318 let offset = error.location.offset;
319 if let Some(range) = state
320 .source_ranges
321 .iter()
322 .rev()
323 .find(|r| r.contains(offset))
324 {
325 let line_in_file = state
326 .input
327 .get(range.start_offset..offset)
328 .map_or(0, |s| s.matches('\n').count());
329 SourceLocation {
330 file: Some(range.file.clone()),
331 positioning: Positioning::Position(Position {
332 line: range.start_line + line_in_file,
333 column: error.location.column,
334 }),
335 }
336 } else {
337 SourceLocation {
338 file: state.current_file.clone(),
339 positioning: Positioning::Position(Position {
340 line: error.location.line,
341 column: error.location.column,
342 }),
343 }
344 }
345}
346
347#[instrument]
348fn parse_input(
349 input: &str,
350 options: &Options,
351 file_path: Option<PathBuf>,
352 leveloffset_ranges: Vec<model::LeveloffsetRange>,
353 source_ranges: Vec<model::SourceRange>,
354) -> Result<Document, Error> {
355 tracing::trace!(?input, "post preprocessor");
356 let mut state = grammar::ParserState::new(input);
357 state.document_attributes = options.document_attributes.clone();
358 state.options = options.clone();
359 state.current_file = file_path;
360 state.leveloffset_ranges = leveloffset_ranges;
361 state.source_ranges = source_ranges;
362 let result = match grammar::document_parser::document(input, &mut state) {
363 Ok(doc) => doc,
364 Err(error) => {
365 tracing::error!(?error, "error parsing document content");
366 let source_location = peg_error_to_source_location(&error, &state);
367 Err(Error::Parse(Box::new(source_location), error.to_string()))
368 }
369 };
370 state.emit_warnings();
371 result
372}
373
374/// Parse inline `AsciiDoc` content from a string.
375///
376/// This function parses the provided string as inline `AsciiDoc` elements, returning a
377/// vector of inline nodes instead of a complete document structure. This is useful for
378/// parsing fragments of `AsciiDoc` content that contain inline markup like emphasis,
379/// strong text, links, macros, and other inline elements.
380///
381/// NOTE: This function exists pretty much just for the sake of the TCK tests, which rely
382/// on an "inline" type output.
383///
384/// # Example
385///
386/// ```
387/// use acdc_parser::{Options, SafeMode, parse_inline};
388///
389/// let options = Options::builder()
390/// .with_safe_mode(SafeMode::Unsafe)
391/// .build();
392/// let content = "This is *strong* text with a https://example.com[link].";
393/// let inline_nodes = parse_inline(content, &options).unwrap();
394/// ```
395///
396/// # Errors
397/// This function returns an error if the inline content cannot be parsed.
398#[instrument]
399pub fn parse_inline(input: &str, options: &Options) -> Result<Vec<InlineNode>, Error> {
400 tracing::trace!(?input, "post preprocessor");
401 let mut state = grammar::ParserState::new(input);
402 state.document_attributes = options.document_attributes.clone();
403 state.options = options.clone();
404 let result = match grammar::inline_parser::inlines(
405 input,
406 &mut state,
407 0,
408 &grammar::BlockParsingMetadata::default(),
409 ) {
410 Ok(inlines) => Ok(inlines),
411 Err(error) => {
412 tracing::error!(?error, "error parsing inline content");
413 Err(Error::Parse(
414 Box::new(peg_error_to_source_location(&error, &state)),
415 error.to_string(),
416 ))
417 }
418 };
419 state.emit_warnings();
420 result
421}
422
423#[cfg(test)]
424mod proptests;
425
426#[cfg(test)]
427#[allow(clippy::unwrap_used)]
428#[allow(clippy::panic)]
429#[allow(clippy::expect_used)]
430mod tests {
431 use super::*;
432 use pretty_assertions::assert_eq;
433
434 fn read_file_contents_with_extension(
435 path: &std::path::PathBuf,
436 ext: &str,
437 ) -> Result<String, Error> {
438 let test_file_path = path.with_extension(ext);
439 let file_contents = std::fs::read_to_string(&test_file_path).inspect_err(
440 |e| tracing::warn!(?path, ?test_file_path, error = %e, "test file not found"),
441 )?;
442 Ok(file_contents)
443 }
444
445 #[rstest::rstest]
446 #[tracing_test::traced_test]
447 fn test_with_fixtures(
448 #[files("fixtures/tests/**/*.adoc")] path: std::path::PathBuf,
449 ) -> Result<(), Error> {
450 let options = Options::builder().with_safe_mode(SafeMode::Unsafe).build();
451
452 match parse_file(&path, &options) {
453 Ok(result) => {
454 let expected = read_file_contents_with_extension(&path, "json")?;
455 let actual =
456 serde_json::to_string_pretty(&result).expect("could not serialize result");
457 assert_eq!(expected, actual);
458 }
459 Err(e) => {
460 let file_contents = read_file_contents_with_extension(&path, "error")?;
461 // Error fixtures contain expected error message as plain text
462 let expected = file_contents.trim();
463 assert_eq!(expected, e.to_string());
464 }
465 }
466 Ok(())
467 }
468
469 #[cfg(test)]
470 mod empty_document_tests {
471 use crate::{Options, parse};
472
473 #[test]
474 fn test_whitespace_only_documents() {
475 let test_cases = vec![
476 "\n", "\n\n", "\t", " \n\t\n ", " ",
477 /* The original proptest failing case -> */ "\n\n\t",
478 ];
479
480 for input in test_cases {
481 let options = Options::default();
482 let result = parse(input, &options);
483
484 match result {
485 Ok(doc) => {
486 // Validate the invariant using absolute offsets
487 assert!(
488 doc.location.absolute_start <= doc.location.absolute_end,
489 "Failed for input {input:?}: absolute_start {} > absolute_end {}",
490 doc.location.absolute_start,
491 doc.location.absolute_end
492 );
493
494 // Validate with our helper
495 doc.location.validate(input).unwrap_or_else(|e| {
496 panic!("Location validation failed for {input:?}: {e}")
497 });
498 }
499 Err(e) => {
500 panic!("Failed to parse {input:?}: {e}");
501 }
502 }
503 }
504 }
505
506 #[test]
507 fn test_document_with_content_after_whitespace() {
508 let test_cases = vec!["\n\nHello", "\t\tWorld", " \n = Title"];
509
510 for input in test_cases {
511 let options = Options::default();
512 let doc =
513 parse(input, &options).unwrap_or_else(|_| panic!("Should parse {input:?}"));
514
515 assert!(
516 doc.location.absolute_start <= doc.location.absolute_end,
517 "Failed for input {input:?}: absolute_start {} > absolute_end {}",
518 doc.location.absolute_start,
519 doc.location.absolute_end
520 );
521
522 // Validate with our helper
523 doc.location
524 .validate(input)
525 .unwrap_or_else(|e| panic!("Location validation failed for {input:?}: {e}"));
526 }
527 }
528
529 #[test]
530 fn test_unicode_characters() {
531 // Test that UTF-8 safety is maintained
532 let test_cases = vec![
533 "π", // 4-byte emoji
534 "Χ", // 2-byte Hebrew
535 "Hello δΈη", // Mixed content
536 "\u{200b}", // Zero-width space
537 ];
538
539 for input in test_cases {
540 let options = Options::default();
541 let result = parse(input, &options);
542
543 match result {
544 Ok(doc) => {
545 // All offsets should be on UTF-8 boundaries
546 assert!(
547 input.is_char_boundary(doc.location.absolute_start),
548 "Absolute start {} not on UTF-8 boundary for {input:?}",
549 doc.location.absolute_start,
550 );
551 assert!(
552 input.is_char_boundary(doc.location.absolute_end),
553 "Absolute end {} not on UTF-8 boundary for {input:?}",
554 doc.location.absolute_end,
555 );
556
557 // Validate with our helper
558 doc.location.validate(input).unwrap_or_else(|e| {
559 panic!("Location validation failed for {input:?}: {e}");
560 });
561 }
562 Err(e) => {
563 // Some of these might fail to parse, which is OK for now
564 // We're just testing that if they parse, the locations are valid
565 println!("Failed to parse {input:?}: {e} (this might be expected)",);
566 }
567 }
568 }
569 }
570 }
571
572 /// Integration tests for attribute resolution behavior.
573 ///
574 /// These tests verify that acdc matches asciidoctor's attribute resolution semantics:
575 /// - Attributes are resolved at definition time (not reference time)
576 /// - If {bar} is undefined when :foo: {bar} is parsed, foo stores literal "{bar}"
577 /// - If {bar} IS defined when :foo: {bar} is parsed, foo stores bar's resolved value
578 mod warning_deduplication_tests {
579 use crate::{Options, parse};
580
581 #[test]
582 #[tracing_test::traced_test]
583 fn counter_reference_emits_single_warning() {
584 // A document with the same counter referenced multiple times should
585 // produce exactly one warning after parsing (not one per PEG attempt).
586 let input = "= Title\n\n{counter:hits} then {counter:hits} again";
587 let options = Options::default();
588 let _doc = parse(input, &options).expect("should parse");
589 assert!(logs_contain("Counters"));
590 logs_assert(|lines: &[&str]| {
591 let count = lines
592 .iter()
593 .filter(|l| l.contains("not supported and will be removed"))
594 .count();
595 if count == 1 {
596 Ok(())
597 } else {
598 Err(format!("expected exactly 1 counter warning, got {count}"))
599 }
600 });
601 }
602
603 #[test]
604 #[tracing_test::traced_test]
605 fn distinct_warnings_all_emitted() {
606 // Different warnings should each appear once.
607 let input = "= Title\n\n{counter:a} and {counter2:b}";
608 let options = Options::default();
609 let _doc = parse(input, &options).expect("should parse");
610 assert!(logs_contain(
611 "Counters ({counter:a}) are not supported and will be removed from output"
612 ));
613 assert!(logs_contain(
614 "Counters ({counter2:b}) are not supported and will be removed from output"
615 ));
616 }
617 }
618
619 mod attribute_resolution_tests {
620 use crate::{AttributeValue, Options, parse};
621
622 #[test]
623 fn test_definition_time_resolution_bar_defined_first() {
624 // When bar is defined BEFORE foo, {bar} in foo's value should be expanded
625 let input = r":bar: resolved-bar
626:foo: {bar}
627
628{foo}
629";
630 let options = Options::default();
631 let doc = parse(input, &options).expect("should parse");
632
633 // foo should have bar's value expanded at definition time
634 assert_eq!(
635 doc.attributes.get("foo"),
636 Some(&AttributeValue::String("resolved-bar".to_string()))
637 );
638 }
639
640 #[test]
641 fn test_definition_time_resolution_bar_defined_after() {
642 // When bar is defined AFTER foo, {bar} should stay literal in foo's value
643 let input = r":foo: {bar}
644:bar: resolved-bar
645
646{foo}
647";
648 let options = Options::default();
649 let doc = parse(input, &options).expect("should parse");
650
651 // foo should keep {bar} as literal since bar wasn't defined yet
652 assert_eq!(
653 doc.attributes.get("foo"),
654 Some(&AttributeValue::String("{bar}".to_string()))
655 );
656 }
657
658 #[test]
659 fn test_chained_attribute_resolution() {
660 // When attributes form a chain: a -> b -> c, each should resolve
661 // based on what's defined at each definition point
662 let input = r":c: final-value
663:b: {c}
664:a: {b}
665
666{a}
667";
668 let options = Options::default();
669 let doc = parse(input, &options).expect("should parse");
670
671 // c is defined first, so b gets "final-value", then a gets "final-value"
672 assert_eq!(
673 doc.attributes.get("c"),
674 Some(&AttributeValue::String("final-value".to_string()))
675 );
676 assert_eq!(
677 doc.attributes.get("b"),
678 Some(&AttributeValue::String("final-value".to_string()))
679 );
680 assert_eq!(
681 doc.attributes.get("a"),
682 Some(&AttributeValue::String("final-value".to_string()))
683 );
684 }
685 }
686}