Skip to main content

acdc_parser/
lib.rs

1#![deny(clippy::pedantic)]
2#![warn(clippy::all)]
3//! `AsciiDoc` parser.
4//!
5//! This module provides a parser for the `AsciiDoc` markup language. The parser is
6//! implemented using the `peg` parser generator.
7//!
8//! # Quick Start
9//!
//! The crate exposes free functions for parsing `AsciiDoc` content, plus the
//! `Parser` struct, which wraps them in a fluent API. The two most common
//! entry points are:
//!
//! - `parse`: parses a string containing `AsciiDoc` content.
//! - `parse_file`: parses the content of a file containing `AsciiDoc` content.
15//!
16//! ```rust
17//!
18//! use acdc_parser::{Document, parse};
19//!
20//! let content = r#"= Document Title
21//!
22//! This is a paragraph.
23//!
24//! == Section Title
25//!
26//! This is a subsection."#;
27//!
28//! let options = acdc_parser::Options::default();
29//! let document = parse(content, &options).unwrap();
30//!
//! println!("{:?}", document);
//! ```
32use std::{
33    path::{Path, PathBuf},
34    string::ToString,
35};
36
37use tracing::instrument;
38
39mod blocks;
40mod constants;
41mod error;
42pub(crate) mod grammar;
43mod model;
44mod options;
45mod preprocessor;
46mod safe_mode;
47
48pub(crate) use grammar::{InlinePreprocessorParserState, ProcessedContent, inline_preprocessing};
49use preprocessor::Preprocessor;
50
51pub use error::{Error, Positioning, SourceLocation};
52pub use grammar::parse_text_for_quotes;
53pub use model::{
54    Admonition, AdmonitionVariant, Anchor, AttributeName, AttributeValue, Attribution, Audio,
55    Author, Autolink, Block, BlockMetadata, Bold, Button, CalloutList, CalloutListItem, CalloutRef,
56    CalloutRefKind, CiteTitle, ColumnFormat, ColumnStyle, ColumnWidth, Comment, CrossReference,
57    CurvedApostrophe, CurvedQuotation, DelimitedBlock, DelimitedBlockType, DescriptionList,
58    DescriptionListItem, DiscreteHeader, Document, DocumentAttribute, DocumentAttributes,
59    ElementAttributes, Footnote, Form, HEADER, Header, Highlight, HorizontalAlignment, ICON_SIZES,
60    Icon, Image, IndexTerm, IndexTermKind, InlineMacro, InlineNode, Italic, Keyboard, LineBreak,
61    Link, ListItem, ListItemCheckedStatus, Location, MAX_SECTION_LEVELS, MAX_TOC_LEVELS, Mailto,
62    Menu, Monospace, NORMAL, OrderedList, PageBreak, Paragraph, Pass, PassthroughKind, Plain,
63    Position, Raw, Role, Section, Source, SourceUrl, StandaloneCurvedApostrophe, Stem, StemContent,
64    StemNotation, Subscript, Substitution, SubstitutionOp, SubstitutionSpec, Subtitle, Superscript,
65    Table, TableColumn, TableOfContents, TableRow, ThematicBreak, Title, TocEntry,
66    UNNUMBERED_SECTION_STYLES, UnorderedList, Url, VERBATIM, Verbatim, VerticalAlignment, Video,
67    inlines_to_string, substitute,
68};
69pub use options::{Options, OptionsBuilder, SafeMode};
70
/// Type-based parser for `AsciiDoc` content.
///
/// `Parser` provides a more discoverable, fluent API for parsing `AsciiDoc` documents.
/// It borrows the input text for the lifetime `'input` and owns its [`Options`].
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use acdc_parser::Parser;
///
/// let content = "= Document Title\n\nParagraph text.";
/// let doc = Parser::new(content).parse()?;
/// # Ok::<(), acdc_parser::Error>(())
/// ```
///
/// With options:
///
/// ```
/// use acdc_parser::{Parser, Options, SafeMode};
///
/// let content = "= Document Title\n\nParagraph text.";
/// let options = Options::builder()
///     .with_safe_mode(SafeMode::Safe)
///     .with_timings()
///     .build();
///
/// let doc = Parser::new(content)
///     .with_options(options)
///     .parse()?;
/// # Ok::<(), acdc_parser::Error>(())
/// ```
///
/// For file-based parsing, read the file first:
///
/// ```no_run
/// use acdc_parser::Parser;
/// use std::fs;
///
/// let content = fs::read_to_string("document.adoc")?;
/// let doc = Parser::new(&content).parse()?;
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Debug)]
pub struct Parser<'input> {
    /// The `AsciiDoc` text handed to `parse` / `parse_inline`.
    input: &'input str,
    /// Parser configuration; `Options::default()` unless replaced via `with_options`.
    options: Options,
}
119
120impl<'input> Parser<'input> {
121    /// Create a new parser for the given input string.
122    ///
123    /// The parser will use default options. Use `with_options` to customize.
124    ///
125    /// # Example
126    ///
127    /// ```
128    /// use acdc_parser::Parser;
129    ///
130    /// let parser = Parser::new("= Title\n\nContent");
131    /// let doc = parser.parse()?;
132    /// # Ok::<(), acdc_parser::Error>(())
133    /// ```
134    #[must_use]
135    pub fn new(input: &'input str) -> Self {
136        Self {
137            input,
138            options: Options::default(),
139        }
140    }
141
142    /// Set the options for this parser.
143    ///
144    /// This consumes the parser and returns a new one with the specified options.
145    ///
146    /// # Example
147    ///
148    /// ```
149    /// use acdc_parser::{Parser, Options, SafeMode};
150    ///
151    /// let options = Options::builder()
152    ///     .with_safe_mode(SafeMode::Safe)
153    ///     .build();
154    ///
155    /// let parser = Parser::new("= Title")
156    ///     .with_options(options);
157    /// # Ok::<(), acdc_parser::Error>(())
158    /// ```
159    #[must_use]
160    pub fn with_options(mut self, options: Options) -> Self {
161        self.options = options;
162        self
163    }
164
165    /// Parse the input into a Document.
166    ///
167    /// # Example
168    ///
169    /// ```
170    /// use acdc_parser::Parser;
171    ///
172    /// let doc = Parser::new("= Title\n\nContent").parse()?;
173    /// # Ok::<(), acdc_parser::Error>(())
174    /// ```
175    ///
176    /// # Errors
177    ///
178    /// Returns an error if the input cannot be parsed as valid `AsciiDoc`.
179    pub fn parse(self) -> Result<Document, Error> {
180        parse(self.input, &self.options)
181    }
182
183    /// Parse only inline elements from the input.
184    ///
185    /// This is useful for parsing fragments of `AsciiDoc` that contain only
186    /// inline markup like bold, italic, links, etc.
187    ///
188    /// # Example
189    ///
190    /// ```
191    /// use acdc_parser::Parser;
192    ///
193    /// let inlines = Parser::new("This is *bold* text").parse_inline()?;
194    /// # Ok::<(), acdc_parser::Error>(())
195    /// ```
196    ///
197    /// # Errors
198    ///
199    /// Returns an error if the input cannot be parsed.
200    pub fn parse_inline(self) -> Result<Vec<InlineNode>, Error> {
201        parse_inline(self.input, &self.options)
202    }
203}
204
205/// Parse `AsciiDoc` content from a reader.
206///
207/// This function reads the content from the provided reader and parses it as `AsciiDoc`.
208///
209/// # Example
210///
211/// ```
212/// use acdc_parser::{Options, SafeMode, parse_from_reader};
213/// use std::fs::File;
214///
215/// let options = Options::builder()
216///     .with_safe_mode(SafeMode::Unsafe)
217///     .build();
218/// let file = File::open("fixtures/samples/README.adoc").unwrap();
219/// let document = parse_from_reader(file, &options).unwrap();
220/// ```
221///
222/// # Errors
223/// This function returns an error if the content cannot be parsed.
224#[instrument(skip(reader))]
225pub fn parse_from_reader<R: std::io::Read>(
226    reader: R,
227    options: &Options,
228) -> Result<Document, Error> {
229    let result = Preprocessor.process_reader(reader, options)?;
230    parse_input(
231        &result.text,
232        options,
233        None,
234        result.leveloffset_ranges,
235        result.source_ranges,
236    )
237}
238
239/// Parse `AsciiDoc` content from a string.
240///
241/// This function parses the provided string as `AsciiDoc`.
242///
243/// # Example
244///
245/// ```
246/// use acdc_parser::{Options, SafeMode, parse};
247///
248/// let options = Options::builder()
249///     .with_safe_mode(SafeMode::Unsafe)
250///     .build();
251/// let content = "= Document Title\n\nThis is a paragraph.\n\n== Section Title\n\nThis is a subsection.";
252/// let document = parse(content, &options).unwrap();
253/// ```
254///
255/// # Errors
256/// This function returns an error if the content cannot be parsed.
257#[instrument]
258pub fn parse(input: &str, options: &Options) -> Result<Document, Error> {
259    let result = Preprocessor.process(input, options)?;
260    parse_input(
261        &result.text,
262        options,
263        None,
264        result.leveloffset_ranges,
265        result.source_ranges,
266    )
267}
268
269/// Parse `AsciiDoc` content from a file.
270///
271/// This function reads the content from the provided file and parses it as `AsciiDoc`.
272///
273/// # Example
274///
275/// ```
276/// use std::path::Path;
277/// use acdc_parser::{Options, SafeMode, parse_file};
278///
279/// let options = Options::builder()
280///     .with_safe_mode(SafeMode::Unsafe)
281///     .build();
282/// let file_path = Path::new("fixtures/samples/README.adoc");
283/// let document = parse_file(file_path, &options).unwrap();
284/// ```
285///
286/// # Errors
287/// This function returns an error if the content cannot be parsed.
288#[instrument(skip(file_path))]
289pub fn parse_file<P: AsRef<Path>>(file_path: P, options: &Options) -> Result<Document, Error> {
290    let path = file_path.as_ref().to_path_buf();
291    let result = Preprocessor.process_file(file_path, options)?;
292    parse_input(
293        &result.text,
294        options,
295        Some(path),
296        result.leveloffset_ranges,
297        result.source_ranges,
298    )
299}
300
301/// Helper to convert a PEG parse error to our `SourceLocation` type,
302/// resolving the correct file and line for included content.
303fn peg_error_to_source_location(
304    error: &peg::error::ParseError<peg::str::LineCol>,
305    state: &grammar::ParserState,
306) -> SourceLocation {
307    let offset = error.location.offset;
308    if let Some(range) = state
309        .source_ranges
310        .iter()
311        .rev()
312        .find(|r| r.contains(offset))
313    {
314        let line_in_file = state
315            .input
316            .get(range.start_offset..offset)
317            .map_or(0, |s| s.matches('\n').count());
318        SourceLocation {
319            file: Some(range.file.clone()),
320            positioning: Positioning::Position(Position {
321                line: range.start_line + line_in_file,
322                column: error.location.column,
323            }),
324        }
325    } else {
326        SourceLocation {
327            file: state.current_file.clone(),
328            positioning: Positioning::Position(Position {
329                line: error.location.line,
330                column: error.location.column,
331            }),
332        }
333    }
334}
335
336#[instrument]
337fn parse_input(
338    input: &str,
339    options: &Options,
340    file_path: Option<PathBuf>,
341    leveloffset_ranges: Vec<model::LeveloffsetRange>,
342    source_ranges: Vec<model::SourceRange>,
343) -> Result<Document, Error> {
344    tracing::trace!(?input, "post preprocessor");
345    let mut state = grammar::ParserState::new(input);
346    state.document_attributes = options.document_attributes.clone();
347    state.options = options.clone();
348    state.current_file = file_path;
349    state.leveloffset_ranges = leveloffset_ranges;
350    state.source_ranges = source_ranges;
351    let result = match grammar::document_parser::document(input, &mut state) {
352        Ok(doc) => doc,
353        Err(error) => {
354            tracing::error!(?error, "error parsing document content");
355            let source_location = peg_error_to_source_location(&error, &state);
356            Err(Error::Parse(Box::new(source_location), error.to_string()))
357        }
358    };
359    state.emit_warnings();
360    result
361}
362
363/// Parse inline `AsciiDoc` content from a string.
364///
365/// This function parses the provided string as inline `AsciiDoc` elements, returning a
366/// vector of inline nodes instead of a complete document structure. This is useful for
367/// parsing fragments of `AsciiDoc` content that contain inline markup like emphasis,
368/// strong text, links, macros, and other inline elements.
369///
370/// NOTE: This function exists pretty much just for the sake of the TCK tests, which rely
371/// on an "inline" type output.
372///
373/// # Example
374///
375/// ```
376/// use acdc_parser::{Options, SafeMode, parse_inline};
377///
378/// let options = Options::builder()
379///     .with_safe_mode(SafeMode::Unsafe)
380///     .build();
381/// let content = "This is *strong* text with a https://example.com[link].";
382/// let inline_nodes = parse_inline(content, &options).unwrap();
383/// ```
384///
385/// # Errors
386/// This function returns an error if the inline content cannot be parsed.
387#[instrument]
388pub fn parse_inline(input: &str, options: &Options) -> Result<Vec<InlineNode>, Error> {
389    tracing::trace!(?input, "post preprocessor");
390    let mut state = grammar::ParserState::new(input);
391    state.document_attributes = options.document_attributes.clone();
392    state.options = options.clone();
393    let result = match grammar::document_parser::inlines(
394        input,
395        &mut state,
396        0,
397        &grammar::BlockParsingMetadata::default(),
398    ) {
399        Ok(inlines) => Ok(inlines),
400        Err(error) => {
401            tracing::error!(?error, "error parsing inline content");
402            Err(Error::Parse(
403                Box::new(peg_error_to_source_location(&error, &state)),
404                error.to_string(),
405            ))
406        }
407    };
408    state.emit_warnings();
409    result
410}
411
412#[cfg(test)]
413mod proptests;
414
415#[cfg(test)]
416#[allow(clippy::unwrap_used)]
417#[allow(clippy::panic)]
418#[allow(clippy::expect_used)]
419mod tests {
420    use super::*;
421    use pretty_assertions::assert_eq;
422
423    fn read_file_contents_with_extension(
424        path: &std::path::PathBuf,
425        ext: &str,
426    ) -> Result<String, Error> {
427        let test_file_path = path.with_extension(ext);
428        let file_contents = std::fs::read_to_string(&test_file_path).inspect_err(
429            |e| tracing::warn!(?path, ?test_file_path, error = %e, "test file not found"),
430        )?;
431        Ok(file_contents)
432    }
433
434    #[rstest::rstest]
435    #[tracing_test::traced_test]
436    fn test_with_fixtures(
437        #[files("fixtures/tests/**/*.adoc")] path: std::path::PathBuf,
438    ) -> Result<(), Error> {
439        let options = Options::builder().with_safe_mode(SafeMode::Unsafe).build();
440
441        match parse_file(&path, &options) {
442            Ok(result) => {
443                let expected = read_file_contents_with_extension(&path, "json")?;
444                let actual =
445                    serde_json::to_string_pretty(&result).expect("could not serialize result");
446                assert_eq!(expected, actual);
447            }
448            Err(e) => {
449                let file_contents = read_file_contents_with_extension(&path, "error")?;
450                // Error fixtures contain expected error message as plain text
451                let expected = file_contents.trim();
452                assert_eq!(expected, e.to_string());
453            }
454        }
455        Ok(())
456    }
457
458    #[cfg(test)]
459    mod empty_document_tests {
460        use crate::{Options, parse};
461
462        #[test]
463        fn test_whitespace_only_documents() {
464            let test_cases = vec![
465                "\n", "\n\n", "\t", " \n\t\n ", "   ",
466                /* The original proptest failing case -> */ "\n\n\t",
467            ];
468
469            for input in test_cases {
470                let options = Options::default();
471                let result = parse(input, &options);
472
473                match result {
474                    Ok(doc) => {
475                        // Validate the invariant using absolute offsets
476                        assert!(
477                            doc.location.absolute_start <= doc.location.absolute_end,
478                            "Failed for input {input:?}: absolute_start {} > absolute_end {}",
479                            doc.location.absolute_start,
480                            doc.location.absolute_end
481                        );
482
483                        // Validate with our helper
484                        doc.location.validate(input).unwrap_or_else(|e| {
485                            panic!("Location validation failed for {input:?}: {e}")
486                        });
487                    }
488                    Err(e) => {
489                        panic!("Failed to parse {input:?}: {e}");
490                    }
491                }
492            }
493        }
494
495        #[test]
496        fn test_document_with_content_after_whitespace() {
497            let test_cases = vec!["\n\nHello", "\t\tWorld", "  \n  = Title"];
498
499            for input in test_cases {
500                let options = Options::default();
501                let doc =
502                    parse(input, &options).unwrap_or_else(|_| panic!("Should parse {input:?}"));
503
504                assert!(
505                    doc.location.absolute_start <= doc.location.absolute_end,
506                    "Failed for input {input:?}: absolute_start {} > absolute_end {}",
507                    doc.location.absolute_start,
508                    doc.location.absolute_end
509                );
510
511                // Validate with our helper
512                doc.location
513                    .validate(input)
514                    .unwrap_or_else(|e| panic!("Location validation failed for {input:?}: {e}"));
515            }
516        }
517
518        #[test]
519        fn test_unicode_characters() {
520            // Test that UTF-8 safety is maintained
521            let test_cases = vec![
522                "πŸ˜€",         // 4-byte emoji
523                "א",          // 2-byte Hebrew
524                "Hello δΈ–η•Œ", // Mixed content
525                "\u{200b}",   // Zero-width space
526            ];
527
528            for input in test_cases {
529                let options = Options::default();
530                let result = parse(input, &options);
531
532                match result {
533                    Ok(doc) => {
534                        // All offsets should be on UTF-8 boundaries
535                        assert!(
536                            input.is_char_boundary(doc.location.absolute_start),
537                            "Absolute start {} not on UTF-8 boundary for {input:?}",
538                            doc.location.absolute_start,
539                        );
540                        assert!(
541                            input.is_char_boundary(doc.location.absolute_end),
542                            "Absolute end {} not on UTF-8 boundary for {input:?}",
543                            doc.location.absolute_end,
544                        );
545
546                        // Validate with our helper
547                        doc.location.validate(input).unwrap_or_else(|e| {
548                            panic!("Location validation failed for {input:?}: {e}");
549                        });
550                    }
551                    Err(e) => {
552                        // Some of these might fail to parse, which is OK for now
553                        // We're just testing that if they parse, the locations are valid
554                        println!("Failed to parse {input:?}: {e} (this might be expected)",);
555                    }
556                }
557            }
558        }
559    }
560
561    /// Integration tests for attribute resolution behavior.
562    ///
563    /// These tests verify that acdc matches asciidoctor's attribute resolution semantics:
564    /// - Attributes are resolved at definition time (not reference time)
565    /// - If {bar} is undefined when :foo: {bar} is parsed, foo stores literal "{bar}"
566    /// - If {bar} IS defined when :foo: {bar} is parsed, foo stores bar's resolved value
567    mod warning_deduplication_tests {
568        use crate::{Options, parse};
569
570        #[test]
571        #[tracing_test::traced_test]
572        fn counter_reference_emits_single_warning() {
573            // A document with the same counter referenced multiple times should
574            // produce exactly one warning after parsing (not one per PEG attempt).
575            let input = "= Title\n\n{counter:hits} then {counter:hits} again";
576            let options = Options::default();
577            let _doc = parse(input, &options).expect("should parse");
578            assert!(logs_contain("Counters"));
579            logs_assert(|lines: &[&str]| {
580                let count = lines
581                    .iter()
582                    .filter(|l| l.contains("not supported and will be removed"))
583                    .count();
584                if count == 1 {
585                    Ok(())
586                } else {
587                    Err(format!("expected exactly 1 counter warning, got {count}"))
588                }
589            });
590        }
591
592        #[test]
593        #[tracing_test::traced_test]
594        fn distinct_warnings_all_emitted() {
595            // Different warnings should each appear once.
596            let input = "= Title\n\n{counter:a} and {counter2:b}";
597            let options = Options::default();
598            let _doc = parse(input, &options).expect("should parse");
599            assert!(logs_contain(
600                "Counters ({counter:a}) are not supported and will be removed from output"
601            ));
602            assert!(logs_contain(
603                "Counters ({counter2:b}) are not supported and will be removed from output"
604            ));
605        }
606    }
607
608    mod attribute_resolution_tests {
609        use crate::{AttributeValue, Options, parse};
610
611        #[test]
612        fn test_definition_time_resolution_bar_defined_first() {
613            // When bar is defined BEFORE foo, {bar} in foo's value should be expanded
614            let input = r":bar: resolved-bar
615:foo: {bar}
616
617{foo}
618";
619            let options = Options::default();
620            let doc = parse(input, &options).expect("should parse");
621
622            // foo should have bar's value expanded at definition time
623            assert_eq!(
624                doc.attributes.get("foo"),
625                Some(&AttributeValue::String("resolved-bar".to_string()))
626            );
627        }
628
629        #[test]
630        fn test_definition_time_resolution_bar_defined_after() {
631            // When bar is defined AFTER foo, {bar} should stay literal in foo's value
632            let input = r":foo: {bar}
633:bar: resolved-bar
634
635{foo}
636";
637            let options = Options::default();
638            let doc = parse(input, &options).expect("should parse");
639
640            // foo should keep {bar} as literal since bar wasn't defined yet
641            assert_eq!(
642                doc.attributes.get("foo"),
643                Some(&AttributeValue::String("{bar}".to_string()))
644            );
645        }
646
647        #[test]
648        fn test_chained_attribute_resolution() {
649            // When attributes form a chain: a -> b -> c, each should resolve
650            // based on what's defined at each definition point
651            let input = r":c: final-value
652:b: {c}
653:a: {b}
654
655{a}
656";
657            let options = Options::default();
658            let doc = parse(input, &options).expect("should parse");
659
660            // c is defined first, so b gets "final-value", then a gets "final-value"
661            assert_eq!(
662                doc.attributes.get("c"),
663                Some(&AttributeValue::String("final-value".to_string()))
664            );
665            assert_eq!(
666                doc.attributes.get("b"),
667                Some(&AttributeValue::String("final-value".to_string()))
668            );
669            assert_eq!(
670                doc.attributes.get("a"),
671                Some(&AttributeValue::String("final-value".to_string()))
672            );
673        }
674    }
675}