Skip to main content

acdc_parser/
lib.rs

1#![deny(clippy::pedantic)]
2#![warn(clippy::all)]
3//! `AsciiDoc` parser.
4//!
5//! This module provides a parser for the `AsciiDoc` markup language. The parser is
6//! implemented using the `peg` parser generator.
7//!
8//! # Quick Start
9//!
//! The crate exposes free functions for parsing `AsciiDoc` content, plus a [`Parser`]
//! struct that wraps them in a fluent API. The two main entry points are:
12//!
13//! - `parse`: parses a string containing `AsciiDoc` content.
14//! - `parse_file`: parses the content of a file containing `AsciiDoc` content.
15//!
16//! ```rust
17//! use acdc_parser::{Document, parse};
18//!
19//! let content = r#"= Document Title
20//!
21//! This is a paragraph.
22//!
23//! == Section Title
24//!
25//! This is a subsection."#;
26//!
27//! let options = acdc_parser::Options::default();
28//! let document = parse(content, &options).unwrap();
29//!
30//! println!("{:?}", document);
31//! ```
32//!
33//! # Features
34//!
35//! - Full support for `AsciiDoc` syntax, including blocks, inline elements, attributes, and more.
36//! - Configurable options for parsing behaviour, including safe mode and timing. Just
37//!   like `asciidoctor`, you can choose to enable or disable certain features based on your
38//!   needs.
39//! - Detailed error reporting with source location information.
40//! - Support for parsing from strings, files, and readers.
41//!
42
43use std::{
44    path::{Path, PathBuf},
45    string::ToString,
46};
47
48use tracing::instrument;
49
50mod blocks;
51mod constants;
52mod error;
53pub(crate) mod grammar;
54mod model;
55mod options;
56mod preprocessor;
57mod safe_mode;
58
59pub(crate) use grammar::{InlinePreprocessorParserState, ProcessedContent, inline_preprocessing};
60use preprocessor::Preprocessor;
61
62pub use error::{Error, Positioning, SourceLocation};
63pub use grammar::parse_text_for_quotes;
64pub use model::{
65    Admonition, AdmonitionVariant, Anchor, AttributeName, AttributeValue, Attribution, Audio,
66    Author, Autolink, Block, BlockMetadata, Bold, Button, CalloutList, CalloutListItem, CalloutRef,
67    CalloutRefKind, CiteTitle, ColumnFormat, ColumnStyle, ColumnWidth, Comment, CrossReference,
68    CurvedApostrophe, CurvedQuotation, DelimitedBlock, DelimitedBlockType, DescriptionList,
69    DescriptionListItem, DiscreteHeader, Document, DocumentAttribute, DocumentAttributes,
70    ElementAttributes, Footnote, Form, HEADER, Header, Highlight, HorizontalAlignment, ICON_SIZES,
71    Icon, Image, IndexTerm, IndexTermKind, InlineMacro, InlineNode, Italic, Keyboard, LineBreak,
72    Link, ListItem, ListItemCheckedStatus, Location, MAX_SECTION_LEVELS, MAX_TOC_LEVELS, Mailto,
73    Menu, Monospace, NORMAL, OrderedList, PageBreak, Paragraph, Pass, PassthroughKind, Plain,
74    Position, Raw, Role, Section, Source, SourceUrl, StandaloneCurvedApostrophe, Stem, StemContent,
75    StemNotation, Subscript, Substitution, SubstitutionOp, SubstitutionSpec, Subtitle, Superscript,
76    Table, TableColumn, TableOfContents, TableRow, ThematicBreak, Title, TocEntry,
77    UNNUMBERED_SECTION_STYLES, UnorderedList, Url, VERBATIM, Verbatim, VerticalAlignment, Video,
78    inlines_to_string, strip_quotes, substitute,
79};
80pub use options::{Options, OptionsBuilder, SafeMode};
81
82/// Type-based parser for `AsciiDoc` content.
83///
84/// `Parser` provides a more discoverable, fluent API for parsing `AsciiDoc` documents.
85///
86/// # Examples
87///
88/// Basic usage:
89///
90/// ```
91/// use acdc_parser::Parser;
92///
93/// let content = "= Document Title\n\nParagraph text.";
94/// let doc = Parser::new(content).parse()?;
95/// # Ok::<(), acdc_parser::Error>(())
96/// ```
97///
98/// With options:
99///
100/// ```
101/// use acdc_parser::{Parser, Options, SafeMode};
102///
103/// let content = "= Document Title\n\nParagraph text.";
104/// let options = Options::builder()
105///     .with_safe_mode(SafeMode::Safe)
106///     .with_timings()
107///     .build();
108///
109/// let doc = Parser::new(content)
110///     .with_options(options)
111///     .parse()?;
112/// # Ok::<(), acdc_parser::Error>(())
113/// ```
114///
115/// For file-based parsing, read the file first:
116///
117/// ```no_run
118/// use acdc_parser::Parser;
119/// use std::fs;
120///
121/// let content = fs::read_to_string("document.adoc")?;
122/// let doc = Parser::new(&content).parse()?;
123/// # Ok::<(), Box<dyn std::error::Error>>(())
124/// ```
125#[derive(Debug)]
126pub struct Parser<'input> {
127    input: &'input str,
128    options: Options,
129}
130
131impl<'input> Parser<'input> {
132    /// Create a new parser for the given input string.
133    ///
134    /// The parser will use default options. Use `with_options` to customize.
135    ///
136    /// # Example
137    ///
138    /// ```
139    /// use acdc_parser::Parser;
140    ///
141    /// let parser = Parser::new("= Title\n\nContent");
142    /// let doc = parser.parse()?;
143    /// # Ok::<(), acdc_parser::Error>(())
144    /// ```
145    #[must_use]
146    pub fn new(input: &'input str) -> Self {
147        Self {
148            input,
149            options: Options::default(),
150        }
151    }
152
153    /// Set the options for this parser.
154    ///
155    /// This consumes the parser and returns a new one with the specified options.
156    ///
157    /// # Example
158    ///
159    /// ```
160    /// use acdc_parser::{Parser, Options, SafeMode};
161    ///
162    /// let options = Options::builder()
163    ///     .with_safe_mode(SafeMode::Safe)
164    ///     .build();
165    ///
166    /// let parser = Parser::new("= Title")
167    ///     .with_options(options);
168    /// # Ok::<(), acdc_parser::Error>(())
169    /// ```
170    #[must_use]
171    pub fn with_options(mut self, options: Options) -> Self {
172        self.options = options;
173        self
174    }
175
176    /// Parse the input into a Document.
177    ///
178    /// # Example
179    ///
180    /// ```
181    /// use acdc_parser::Parser;
182    ///
183    /// let doc = Parser::new("= Title\n\nContent").parse()?;
184    /// # Ok::<(), acdc_parser::Error>(())
185    /// ```
186    ///
187    /// # Errors
188    ///
189    /// Returns an error if the input cannot be parsed as valid `AsciiDoc`.
190    pub fn parse(self) -> Result<Document, Error> {
191        parse(self.input, &self.options)
192    }
193
194    /// Parse only inline elements from the input.
195    ///
196    /// This is useful for parsing fragments of `AsciiDoc` that contain only
197    /// inline markup like bold, italic, links, etc.
198    ///
199    /// # Example
200    ///
201    /// ```
202    /// use acdc_parser::Parser;
203    ///
204    /// let inlines = Parser::new("This is *bold* text").parse_inline()?;
205    /// # Ok::<(), acdc_parser::Error>(())
206    /// ```
207    ///
208    /// # Errors
209    ///
210    /// Returns an error if the input cannot be parsed.
211    pub fn parse_inline(self) -> Result<Vec<InlineNode>, Error> {
212        parse_inline(self.input, &self.options)
213    }
214}
215
216/// Parse `AsciiDoc` content from a reader.
217///
218/// This function reads the content from the provided reader and parses it as `AsciiDoc`.
219///
220/// # Example
221///
222/// ```
223/// use acdc_parser::{Options, SafeMode, parse_from_reader};
224/// use std::fs::File;
225///
226/// let options = Options::builder()
227///     .with_safe_mode(SafeMode::Unsafe)
228///     .build();
229/// let file = File::open("fixtures/samples/README.adoc").unwrap();
230/// let document = parse_from_reader(file, &options).unwrap();
231/// ```
232///
233/// # Errors
234/// This function returns an error if the content cannot be parsed.
235#[instrument(skip(reader))]
236pub fn parse_from_reader<R: std::io::Read>(
237    reader: R,
238    options: &Options,
239) -> Result<Document, Error> {
240    let result = Preprocessor.process_reader(reader, options)?;
241    parse_input(
242        &result.text,
243        options,
244        None,
245        result.leveloffset_ranges,
246        result.source_ranges,
247    )
248}
249
250/// Parse `AsciiDoc` content from a string.
251///
252/// This function parses the provided string as `AsciiDoc`.
253///
254/// # Example
255///
256/// ```
257/// use acdc_parser::{Options, SafeMode, parse};
258///
259/// let options = Options::builder()
260///     .with_safe_mode(SafeMode::Unsafe)
261///     .build();
262/// let content = "= Document Title\n\nThis is a paragraph.\n\n== Section Title\n\nThis is a subsection.";
263/// let document = parse(content, &options).unwrap();
264/// ```
265///
266/// # Errors
267/// This function returns an error if the content cannot be parsed.
268#[instrument]
269pub fn parse(input: &str, options: &Options) -> Result<Document, Error> {
270    let result = Preprocessor.process(input, options)?;
271    parse_input(
272        &result.text,
273        options,
274        None,
275        result.leveloffset_ranges,
276        result.source_ranges,
277    )
278}
279
280/// Parse `AsciiDoc` content from a file.
281///
282/// This function reads the content from the provided file and parses it as `AsciiDoc`.
283///
284/// # Example
285///
286/// ```
287/// use std::path::Path;
288/// use acdc_parser::{Options, SafeMode, parse_file};
289///
290/// let options = Options::builder()
291///     .with_safe_mode(SafeMode::Unsafe)
292///     .build();
293/// let file_path = Path::new("fixtures/samples/README.adoc");
294/// let document = parse_file(file_path, &options).unwrap();
295/// ```
296///
297/// # Errors
298/// This function returns an error if the content cannot be parsed.
299#[instrument(skip(file_path))]
300pub fn parse_file<P: AsRef<Path>>(file_path: P, options: &Options) -> Result<Document, Error> {
301    let path = file_path.as_ref().to_path_buf();
302    let result = Preprocessor.process_file(file_path, options)?;
303    parse_input(
304        &result.text,
305        options,
306        Some(path),
307        result.leveloffset_ranges,
308        result.source_ranges,
309    )
310}
311
312/// Helper to convert a PEG parse error to our `SourceLocation` type,
313/// resolving the correct file and line for included content.
314fn peg_error_to_source_location(
315    error: &peg::error::ParseError<peg::str::LineCol>,
316    state: &grammar::ParserState,
317) -> SourceLocation {
318    let offset = error.location.offset;
319    if let Some(range) = state
320        .source_ranges
321        .iter()
322        .rev()
323        .find(|r| r.contains(offset))
324    {
325        let line_in_file = state
326            .input
327            .get(range.start_offset..offset)
328            .map_or(0, |s| s.matches('\n').count());
329        SourceLocation {
330            file: Some(range.file.clone()),
331            positioning: Positioning::Position(Position {
332                line: range.start_line + line_in_file,
333                column: error.location.column,
334            }),
335        }
336    } else {
337        SourceLocation {
338            file: state.current_file.clone(),
339            positioning: Positioning::Position(Position {
340                line: error.location.line,
341                column: error.location.column,
342            }),
343        }
344    }
345}
346
347#[instrument]
348fn parse_input(
349    input: &str,
350    options: &Options,
351    file_path: Option<PathBuf>,
352    leveloffset_ranges: Vec<model::LeveloffsetRange>,
353    source_ranges: Vec<model::SourceRange>,
354) -> Result<Document, Error> {
355    tracing::trace!(?input, "post preprocessor");
356    let mut state = grammar::ParserState::new(input);
357    state.document_attributes = options.document_attributes.clone();
358    state.options = options.clone();
359    state.current_file = file_path;
360    state.leveloffset_ranges = leveloffset_ranges;
361    state.source_ranges = source_ranges;
362    let result = match grammar::document_parser::document(input, &mut state) {
363        Ok(doc) => doc,
364        Err(error) => {
365            tracing::error!(?error, "error parsing document content");
366            let source_location = peg_error_to_source_location(&error, &state);
367            Err(Error::Parse(Box::new(source_location), error.to_string()))
368        }
369    };
370    state.emit_warnings();
371    result
372}
373
374/// Parse inline `AsciiDoc` content from a string.
375///
376/// This function parses the provided string as inline `AsciiDoc` elements, returning a
377/// vector of inline nodes instead of a complete document structure. This is useful for
378/// parsing fragments of `AsciiDoc` content that contain inline markup like emphasis,
379/// strong text, links, macros, and other inline elements.
380///
381/// NOTE: This function exists pretty much just for the sake of the TCK tests, which rely
382/// on an "inline" type output.
383///
384/// # Example
385///
386/// ```
387/// use acdc_parser::{Options, SafeMode, parse_inline};
388///
389/// let options = Options::builder()
390///     .with_safe_mode(SafeMode::Unsafe)
391///     .build();
392/// let content = "This is *strong* text with a https://example.com[link].";
393/// let inline_nodes = parse_inline(content, &options).unwrap();
394/// ```
395///
396/// # Errors
397/// This function returns an error if the inline content cannot be parsed.
398#[instrument]
399pub fn parse_inline(input: &str, options: &Options) -> Result<Vec<InlineNode>, Error> {
400    tracing::trace!(?input, "post preprocessor");
401    let mut state = grammar::ParserState::new(input);
402    state.document_attributes = options.document_attributes.clone();
403    state.options = options.clone();
404    let result = match grammar::inline_parser::inlines(
405        input,
406        &mut state,
407        0,
408        &grammar::BlockParsingMetadata::default(),
409    ) {
410        Ok(inlines) => Ok(inlines),
411        Err(error) => {
412            tracing::error!(?error, "error parsing inline content");
413            Err(Error::Parse(
414                Box::new(peg_error_to_source_location(&error, &state)),
415                error.to_string(),
416            ))
417        }
418    };
419    state.emit_warnings();
420    result
421}
422
423#[cfg(test)]
424mod proptests;
425
426#[cfg(test)]
427#[allow(clippy::unwrap_used)]
428#[allow(clippy::panic)]
429#[allow(clippy::expect_used)]
430mod tests {
431    use super::*;
432    use pretty_assertions::assert_eq;
433
434    fn read_file_contents_with_extension(
435        path: &std::path::PathBuf,
436        ext: &str,
437    ) -> Result<String, Error> {
438        let test_file_path = path.with_extension(ext);
439        let file_contents = std::fs::read_to_string(&test_file_path).inspect_err(
440            |e| tracing::warn!(?path, ?test_file_path, error = %e, "test file not found"),
441        )?;
442        Ok(file_contents)
443    }
444
445    #[rstest::rstest]
446    #[tracing_test::traced_test]
447    fn test_with_fixtures(
448        #[files("fixtures/tests/**/*.adoc")] path: std::path::PathBuf,
449    ) -> Result<(), Error> {
450        let options = Options::builder().with_safe_mode(SafeMode::Unsafe).build();
451
452        match parse_file(&path, &options) {
453            Ok(result) => {
454                let expected = read_file_contents_with_extension(&path, "json")?;
455                let actual =
456                    serde_json::to_string_pretty(&result).expect("could not serialize result");
457                assert_eq!(expected, actual);
458            }
459            Err(e) => {
460                let file_contents = read_file_contents_with_extension(&path, "error")?;
461                // Error fixtures contain expected error message as plain text
462                let expected = file_contents.trim();
463                assert_eq!(expected, e.to_string());
464            }
465        }
466        Ok(())
467    }
468
469    #[cfg(test)]
470    mod empty_document_tests {
471        use crate::{Options, parse};
472
473        #[test]
474        fn test_whitespace_only_documents() {
475            let test_cases = vec![
476                "\n", "\n\n", "\t", " \n\t\n ", "   ",
477                /* The original proptest failing case -> */ "\n\n\t",
478            ];
479
480            for input in test_cases {
481                let options = Options::default();
482                let result = parse(input, &options);
483
484                match result {
485                    Ok(doc) => {
486                        // Validate the invariant using absolute offsets
487                        assert!(
488                            doc.location.absolute_start <= doc.location.absolute_end,
489                            "Failed for input {input:?}: absolute_start {} > absolute_end {}",
490                            doc.location.absolute_start,
491                            doc.location.absolute_end
492                        );
493
494                        // Validate with our helper
495                        doc.location.validate(input).unwrap_or_else(|e| {
496                            panic!("Location validation failed for {input:?}: {e}")
497                        });
498                    }
499                    Err(e) => {
500                        panic!("Failed to parse {input:?}: {e}");
501                    }
502                }
503            }
504        }
505
506        #[test]
507        fn test_document_with_content_after_whitespace() {
508            let test_cases = vec!["\n\nHello", "\t\tWorld", "  \n  = Title"];
509
510            for input in test_cases {
511                let options = Options::default();
512                let doc =
513                    parse(input, &options).unwrap_or_else(|_| panic!("Should parse {input:?}"));
514
515                assert!(
516                    doc.location.absolute_start <= doc.location.absolute_end,
517                    "Failed for input {input:?}: absolute_start {} > absolute_end {}",
518                    doc.location.absolute_start,
519                    doc.location.absolute_end
520                );
521
522                // Validate with our helper
523                doc.location
524                    .validate(input)
525                    .unwrap_or_else(|e| panic!("Location validation failed for {input:?}: {e}"));
526            }
527        }
528
529        #[test]
530        fn test_unicode_characters() {
531            // Test that UTF-8 safety is maintained
532            let test_cases = vec![
533                "πŸ˜€",         // 4-byte emoji
534                "א",          // 2-byte Hebrew
535                "Hello δΈ–η•Œ", // Mixed content
536                "\u{200b}",   // Zero-width space
537            ];
538
539            for input in test_cases {
540                let options = Options::default();
541                let result = parse(input, &options);
542
543                match result {
544                    Ok(doc) => {
545                        // All offsets should be on UTF-8 boundaries
546                        assert!(
547                            input.is_char_boundary(doc.location.absolute_start),
548                            "Absolute start {} not on UTF-8 boundary for {input:?}",
549                            doc.location.absolute_start,
550                        );
551                        assert!(
552                            input.is_char_boundary(doc.location.absolute_end),
553                            "Absolute end {} not on UTF-8 boundary for {input:?}",
554                            doc.location.absolute_end,
555                        );
556
557                        // Validate with our helper
558                        doc.location.validate(input).unwrap_or_else(|e| {
559                            panic!("Location validation failed for {input:?}: {e}");
560                        });
561                    }
562                    Err(e) => {
563                        // Some of these might fail to parse, which is OK for now
564                        // We're just testing that if they parse, the locations are valid
565                        println!("Failed to parse {input:?}: {e} (this might be expected)",);
566                    }
567                }
568            }
569        }
570    }
571
572    /// Integration tests for attribute resolution behavior.
573    ///
574    /// These tests verify that acdc matches asciidoctor's attribute resolution semantics:
575    /// - Attributes are resolved at definition time (not reference time)
576    /// - If {bar} is undefined when :foo: {bar} is parsed, foo stores literal "{bar}"
577    /// - If {bar} IS defined when :foo: {bar} is parsed, foo stores bar's resolved value
578    mod warning_deduplication_tests {
579        use crate::{Options, parse};
580
581        #[test]
582        #[tracing_test::traced_test]
583        fn counter_reference_emits_single_warning() {
584            // A document with the same counter referenced multiple times should
585            // produce exactly one warning after parsing (not one per PEG attempt).
586            let input = "= Title\n\n{counter:hits} then {counter:hits} again";
587            let options = Options::default();
588            let _doc = parse(input, &options).expect("should parse");
589            assert!(logs_contain("Counters"));
590            logs_assert(|lines: &[&str]| {
591                let count = lines
592                    .iter()
593                    .filter(|l| l.contains("not supported and will be removed"))
594                    .count();
595                if count == 1 {
596                    Ok(())
597                } else {
598                    Err(format!("expected exactly 1 counter warning, got {count}"))
599                }
600            });
601        }
602
603        #[test]
604        #[tracing_test::traced_test]
605        fn distinct_warnings_all_emitted() {
606            // Different warnings should each appear once.
607            let input = "= Title\n\n{counter:a} and {counter2:b}";
608            let options = Options::default();
609            let _doc = parse(input, &options).expect("should parse");
610            assert!(logs_contain(
611                "Counters ({counter:a}) are not supported and will be removed from output"
612            ));
613            assert!(logs_contain(
614                "Counters ({counter2:b}) are not supported and will be removed from output"
615            ));
616        }
617    }
618
619    mod attribute_resolution_tests {
620        use crate::{AttributeValue, Options, parse};
621
622        #[test]
623        fn test_definition_time_resolution_bar_defined_first() {
624            // When bar is defined BEFORE foo, {bar} in foo's value should be expanded
625            let input = r":bar: resolved-bar
626:foo: {bar}
627
628{foo}
629";
630            let options = Options::default();
631            let doc = parse(input, &options).expect("should parse");
632
633            // foo should have bar's value expanded at definition time
634            assert_eq!(
635                doc.attributes.get("foo"),
636                Some(&AttributeValue::String("resolved-bar".to_string()))
637            );
638        }
639
640        #[test]
641        fn test_definition_time_resolution_bar_defined_after() {
642            // When bar is defined AFTER foo, {bar} should stay literal in foo's value
643            let input = r":foo: {bar}
644:bar: resolved-bar
645
646{foo}
647";
648            let options = Options::default();
649            let doc = parse(input, &options).expect("should parse");
650
651            // foo should keep {bar} as literal since bar wasn't defined yet
652            assert_eq!(
653                doc.attributes.get("foo"),
654                Some(&AttributeValue::String("{bar}".to_string()))
655            );
656        }
657
658        #[test]
659        fn test_chained_attribute_resolution() {
660            // When attributes form a chain: a -> b -> c, each should resolve
661            // based on what's defined at each definition point
662            let input = r":c: final-value
663:b: {c}
664:a: {b}
665
666{a}
667";
668            let options = Options::default();
669            let doc = parse(input, &options).expect("should parse");
670
671            // c is defined first, so b gets "final-value", then a gets "final-value"
672            assert_eq!(
673                doc.attributes.get("c"),
674                Some(&AttributeValue::String("final-value".to_string()))
675            );
676            assert_eq!(
677                doc.attributes.get("b"),
678                Some(&AttributeValue::String("final-value".to_string()))
679            );
680            assert_eq!(
681                doc.attributes.get("a"),
682                Some(&AttributeValue::String("final-value".to_string()))
683            );
684        }
685    }
686}