Skip to main content

acdc_parser/
lib.rs

1#![deny(clippy::pedantic)]
2#![warn(clippy::all)]
//! `AsciiDoc` parser.
//!
//! This module provides a parser for the `AsciiDoc` markup language. The parser is
//! implemented using the `peg` parser generator.
//!
//! # Quick Start
//!
//! Parsing is available through the `Parser` struct and through free functions.
//! Two of the free functions for parsing `AsciiDoc` content are:
//!
//! - `parse`: parses a string containing `AsciiDoc` content.
//! - `parse_file`: parses the content of a file containing `AsciiDoc` content.
//!
//! ```rust
//!
//! use acdc_parser::{Document, parse};
//!
//! let content = r#"= Document Title
//!
//! This is a paragraph.
//!
//! == Section Title
//!
//! This is a subsection."#;
//!
//! let options = acdc_parser::Options::default();
//! let document = parse(content, &options).unwrap();
//!
//! println!("{:?}", document);
//! ```
32use std::{
33    path::{Path, PathBuf},
34    string::ToString,
35};
36
37use tracing::instrument;
38
39mod blocks;
40mod constants;
41mod error;
42pub(crate) mod grammar;
43mod model;
44mod options;
45mod preprocessor;
46mod safe_mode;
47
48pub(crate) use grammar::{InlinePreprocessorParserState, ProcessedContent, inline_preprocessing};
49use preprocessor::Preprocessor;
50
51pub use error::{Error, Positioning, SourceLocation};
52pub use grammar::parse_text_for_quotes;
53pub use model::{
54    Admonition, AdmonitionVariant, Anchor, AttributeName, AttributeValue, Audio, Author, Autolink,
55    Block, BlockMetadata, Bold, Button, CalloutList, CalloutListItem, CalloutRef, CalloutRefKind,
56    ColumnFormat, ColumnStyle, ColumnWidth, Comment, CrossReference, CurvedApostrophe,
57    CurvedQuotation, DelimitedBlock, DelimitedBlockType, DescriptionList, DescriptionListItem,
58    DiscreteHeader, Document, DocumentAttribute, DocumentAttributes, ElementAttributes, Footnote,
59    Form, HEADER, Header, Highlight, HorizontalAlignment, ICON_SIZES, Icon, Image, IndexTerm,
60    IndexTermKind, InlineMacro, InlineNode, Italic, Keyboard, LineBreak, Link, ListItem,
61    ListItemCheckedStatus, Location, MAX_SECTION_LEVELS, MAX_TOC_LEVELS, Mailto, Menu, Monospace,
62    NORMAL, OrderedList, PageBreak, Paragraph, Pass, PassthroughKind, Plain, Position, Raw, Role,
63    Section, Source, StandaloneCurvedApostrophe, Stem, StemContent, StemNotation, Subscript,
64    Substitution, SubstitutionOp, SubstitutionSpec, Subtitle, Superscript, Table, TableColumn,
65    TableOfContents, TableRow, ThematicBreak, Title, TocEntry, UNNUMBERED_SECTION_STYLES,
66    UnorderedList, Url, VERBATIM, Verbatim, VerticalAlignment, Video, inlines_to_string,
67    substitute,
68};
69pub use options::{Options, OptionsBuilder, SafeMode};
70
71/// Type-based parser for `AsciiDoc` content.
72///
73/// `Parser` provides a more discoverable, fluent API for parsing `AsciiDoc` documents.
74///
75/// # Examples
76///
77/// Basic usage:
78///
79/// ```
80/// use acdc_parser::Parser;
81///
82/// let content = "= Document Title\n\nParagraph text.";
83/// let doc = Parser::new(content).parse()?;
84/// # Ok::<(), acdc_parser::Error>(())
85/// ```
86///
87/// With options:
88///
89/// ```
90/// use acdc_parser::{Parser, Options, SafeMode};
91///
92/// let content = "= Document Title\n\nParagraph text.";
93/// let options = Options::builder()
94///     .with_safe_mode(SafeMode::Safe)
95///     .with_timings()
96///     .build();
97///
98/// let doc = Parser::new(content)
99///     .with_options(options)
100///     .parse()?;
101/// # Ok::<(), acdc_parser::Error>(())
102/// ```
103///
104/// For file-based parsing, read the file first:
105///
106/// ```no_run
107/// use acdc_parser::Parser;
108/// use std::fs;
109///
110/// let content = fs::read_to_string("document.adoc")?;
111/// let doc = Parser::new(&content).parse()?;
112/// # Ok::<(), Box<dyn std::error::Error>>(())
113/// ```
114#[derive(Debug)]
115pub struct Parser<'input> {
116    input: &'input str,
117    options: Options,
118}
119
120impl<'input> Parser<'input> {
121    /// Create a new parser for the given input string.
122    ///
123    /// The parser will use default options. Use `with_options` to customize.
124    ///
125    /// # Example
126    ///
127    /// ```
128    /// use acdc_parser::Parser;
129    ///
130    /// let parser = Parser::new("= Title\n\nContent");
131    /// let doc = parser.parse()?;
132    /// # Ok::<(), acdc_parser::Error>(())
133    /// ```
134    #[must_use]
135    pub fn new(input: &'input str) -> Self {
136        Self {
137            input,
138            options: Options::default(),
139        }
140    }
141
142    /// Set the options for this parser.
143    ///
144    /// This consumes the parser and returns a new one with the specified options.
145    ///
146    /// # Example
147    ///
148    /// ```
149    /// use acdc_parser::{Parser, Options, SafeMode};
150    ///
151    /// let options = Options::builder()
152    ///     .with_safe_mode(SafeMode::Safe)
153    ///     .build();
154    ///
155    /// let parser = Parser::new("= Title")
156    ///     .with_options(options);
157    /// # Ok::<(), acdc_parser::Error>(())
158    /// ```
159    #[must_use]
160    pub fn with_options(mut self, options: Options) -> Self {
161        self.options = options;
162        self
163    }
164
165    /// Parse the input into a Document.
166    ///
167    /// # Example
168    ///
169    /// ```
170    /// use acdc_parser::Parser;
171    ///
172    /// let doc = Parser::new("= Title\n\nContent").parse()?;
173    /// # Ok::<(), acdc_parser::Error>(())
174    /// ```
175    ///
176    /// # Errors
177    ///
178    /// Returns an error if the input cannot be parsed as valid `AsciiDoc`.
179    pub fn parse(self) -> Result<Document, Error> {
180        parse(self.input, &self.options)
181    }
182
183    /// Parse only inline elements from the input.
184    ///
185    /// This is useful for parsing fragments of `AsciiDoc` that contain only
186    /// inline markup like bold, italic, links, etc.
187    ///
188    /// # Example
189    ///
190    /// ```
191    /// use acdc_parser::Parser;
192    ///
193    /// let inlines = Parser::new("This is *bold* text").parse_inline()?;
194    /// # Ok::<(), acdc_parser::Error>(())
195    /// ```
196    ///
197    /// # Errors
198    ///
199    /// Returns an error if the input cannot be parsed.
200    pub fn parse_inline(self) -> Result<Vec<InlineNode>, Error> {
201        parse_inline(self.input, &self.options)
202    }
203}
204
205/// Parse `AsciiDoc` content from a reader.
206///
207/// This function reads the content from the provided reader and parses it as `AsciiDoc`.
208///
209/// # Example
210///
211/// ```
212/// use acdc_parser::{Options, SafeMode, parse_from_reader};
213/// use std::fs::File;
214///
215/// let options = Options::builder()
216///     .with_safe_mode(SafeMode::Unsafe)
217///     .build();
218/// let file = File::open("fixtures/samples/README.adoc").unwrap();
219/// let document = parse_from_reader(file, &options).unwrap();
220/// ```
221///
222/// # Errors
223/// This function returns an error if the content cannot be parsed.
224#[instrument(skip(reader))]
225pub fn parse_from_reader<R: std::io::Read>(
226    reader: R,
227    options: &Options,
228) -> Result<Document, Error> {
229    let result = Preprocessor.process_reader(reader, options)?;
230    parse_input(&result.text, options, None, result.leveloffset_ranges)
231}
232
233/// Parse `AsciiDoc` content from a string.
234///
235/// This function parses the provided string as `AsciiDoc`.
236///
237/// # Example
238///
239/// ```
240/// use acdc_parser::{Options, SafeMode, parse};
241///
242/// let options = Options::builder()
243///     .with_safe_mode(SafeMode::Unsafe)
244///     .build();
245/// let content = "= Document Title\n\nThis is a paragraph.\n\n== Section Title\n\nThis is a subsection.";
246/// let document = parse(content, &options).unwrap();
247/// ```
248///
249/// # Errors
250/// This function returns an error if the content cannot be parsed.
251#[instrument]
252pub fn parse(input: &str, options: &Options) -> Result<Document, Error> {
253    let result = Preprocessor.process(input, options)?;
254    parse_input(&result.text, options, None, result.leveloffset_ranges)
255}
256
257/// Parse `AsciiDoc` content from a file.
258///
259/// This function reads the content from the provided file and parses it as `AsciiDoc`.
260///
261/// # Example
262///
263/// ```
264/// use std::path::Path;
265/// use acdc_parser::{Options, SafeMode, parse_file};
266///
267/// let options = Options::builder()
268///     .with_safe_mode(SafeMode::Unsafe)
269///     .build();
270/// let file_path = Path::new("fixtures/samples/README.adoc");
271/// let document = parse_file(file_path, &options).unwrap();
272/// ```
273///
274/// # Errors
275/// This function returns an error if the content cannot be parsed.
276#[instrument(skip(file_path))]
277pub fn parse_file<P: AsRef<Path>>(file_path: P, options: &Options) -> Result<Document, Error> {
278    let path = file_path.as_ref().to_path_buf();
279    let result = Preprocessor.process_file(file_path, options)?;
280    parse_input(&result.text, options, Some(path), result.leveloffset_ranges)
281}
282
283/// Helper to convert a PEG parse error to our `SourceLocation` type
284fn peg_error_to_source_location(
285    error: &peg::error::ParseError<peg::str::LineCol>,
286    file: Option<PathBuf>,
287) -> SourceLocation {
288    SourceLocation {
289        file,
290        positioning: Positioning::Position(Position {
291            line: error.location.line,
292            column: error.location.column,
293        }),
294    }
295}
296
297#[instrument]
298fn parse_input(
299    input: &str,
300    options: &Options,
301    file_path: Option<PathBuf>,
302    leveloffset_ranges: Vec<model::LeveloffsetRange>,
303) -> Result<Document, Error> {
304    tracing::trace!(?input, "post preprocessor");
305    let mut state = grammar::ParserState::new(input);
306    state.document_attributes = options.document_attributes.clone();
307    state.options = options.clone();
308    state.current_file.clone_from(&file_path);
309    state.leveloffset_ranges = leveloffset_ranges;
310    let result = match grammar::document_parser::document(input, &mut state) {
311        Ok(doc) => doc,
312        Err(error) => {
313            tracing::error!(?error, "error parsing document content");
314            let source_location = peg_error_to_source_location(&error, file_path);
315            Err(Error::Parse(Box::new(source_location), error.to_string()))
316        }
317    };
318    state.emit_warnings();
319    result
320}
321
322/// Parse inline `AsciiDoc` content from a string.
323///
324/// This function parses the provided string as inline `AsciiDoc` elements, returning a
325/// vector of inline nodes instead of a complete document structure. This is useful for
326/// parsing fragments of `AsciiDoc` content that contain inline markup like emphasis,
327/// strong text, links, macros, and other inline elements.
328///
329/// NOTE: This function exists pretty much just for the sake of the TCK tests, which rely
330/// on an "inline" type output.
331///
332/// # Example
333///
334/// ```
335/// use acdc_parser::{Options, SafeMode, parse_inline};
336///
337/// let options = Options::builder()
338///     .with_safe_mode(SafeMode::Unsafe)
339///     .build();
340/// let content = "This is *strong* text with a https://example.com[link].";
341/// let inline_nodes = parse_inline(content, &options).unwrap();
342/// ```
343///
344/// # Errors
345/// This function returns an error if the inline content cannot be parsed.
346#[instrument]
347pub fn parse_inline(input: &str, options: &Options) -> Result<Vec<InlineNode>, Error> {
348    tracing::trace!(?input, "post preprocessor");
349    let mut state = grammar::ParserState::new(input);
350    state.document_attributes = options.document_attributes.clone();
351    state.options = options.clone();
352    let result = match grammar::document_parser::inlines(
353        input,
354        &mut state,
355        0,
356        &grammar::BlockParsingMetadata::default(),
357    ) {
358        Ok(inlines) => Ok(inlines),
359        Err(error) => {
360            tracing::error!(?error, "error parsing inline content");
361            Err(Error::Parse(
362                Box::new(peg_error_to_source_location(&error, None)),
363                error.to_string(),
364            ))
365        }
366    };
367    state.emit_warnings();
368    result
369}
370
371#[cfg(test)]
372mod proptests;
373
374#[cfg(test)]
375#[allow(clippy::unwrap_used)]
376#[allow(clippy::panic)]
377#[allow(clippy::expect_used)]
378mod tests {
379    use super::*;
380    use pretty_assertions::assert_eq;
381
382    fn read_file_contents_with_extension(
383        path: &std::path::PathBuf,
384        ext: &str,
385    ) -> Result<String, Error> {
386        let test_file_path = path.with_extension(ext);
387        let file_contents = std::fs::read_to_string(&test_file_path).inspect_err(
388            |e| tracing::warn!(?path, ?test_file_path, error = %e, "test file not found"),
389        )?;
390        Ok(file_contents)
391    }
392
393    #[rstest::rstest]
394    #[tracing_test::traced_test]
395    fn test_with_fixtures(
396        #[files("fixtures/tests/**/*.adoc")] path: std::path::PathBuf,
397    ) -> Result<(), Error> {
398        let options = Options::builder().with_safe_mode(SafeMode::Unsafe).build();
399
400        match parse_file(&path, &options) {
401            Ok(result) => {
402                let expected = read_file_contents_with_extension(&path, "json")?;
403                let actual =
404                    serde_json::to_string_pretty(&result).expect("could not serialize result");
405                assert_eq!(expected, actual);
406            }
407            Err(e) => {
408                let file_contents = read_file_contents_with_extension(&path, "error")?;
409                // Error fixtures contain expected error message as plain text
410                let expected = file_contents.trim();
411                assert_eq!(expected, e.to_string());
412            }
413        }
414        Ok(())
415    }
416
417    #[cfg(test)]
418    mod empty_document_tests {
419        use crate::{Options, parse};
420
421        #[test]
422        fn test_whitespace_only_documents() {
423            let test_cases = vec![
424                "\n", "\n\n", "\t", " \n\t\n ", "   ",
425                /* The original proptest failing case -> */ "\n\n\t",
426            ];
427
428            for input in test_cases {
429                let options = Options::default();
430                let result = parse(input, &options);
431
432                match result {
433                    Ok(doc) => {
434                        // Validate the invariant using absolute offsets
435                        assert!(
436                            doc.location.absolute_start <= doc.location.absolute_end,
437                            "Failed for input {input:?}: absolute_start {} > absolute_end {}",
438                            doc.location.absolute_start,
439                            doc.location.absolute_end
440                        );
441
442                        // Validate with our helper
443                        doc.location.validate(input).unwrap_or_else(|e| {
444                            panic!("Location validation failed for {input:?}: {e}")
445                        });
446                    }
447                    Err(e) => {
448                        panic!("Failed to parse {input:?}: {e}");
449                    }
450                }
451            }
452        }
453
454        #[test]
455        fn test_document_with_content_after_whitespace() {
456            let test_cases = vec!["\n\nHello", "\t\tWorld", "  \n  = Title"];
457
458            for input in test_cases {
459                let options = Options::default();
460                let doc =
461                    parse(input, &options).unwrap_or_else(|_| panic!("Should parse {input:?}"));
462
463                assert!(
464                    doc.location.absolute_start <= doc.location.absolute_end,
465                    "Failed for input {input:?}: absolute_start {} > absolute_end {}",
466                    doc.location.absolute_start,
467                    doc.location.absolute_end
468                );
469
470                // Validate with our helper
471                doc.location
472                    .validate(input)
473                    .unwrap_or_else(|e| panic!("Location validation failed for {input:?}: {e}"));
474            }
475        }
476
477        #[test]
478        fn test_unicode_characters() {
479            // Test that UTF-8 safety is maintained
480            let test_cases = vec![
481                "πŸ˜€",         // 4-byte emoji
482                "א",          // 2-byte Hebrew
483                "Hello δΈ–η•Œ", // Mixed content
484                "\u{200b}",   // Zero-width space
485            ];
486
487            for input in test_cases {
488                let options = Options::default();
489                let result = parse(input, &options);
490
491                match result {
492                    Ok(doc) => {
493                        // All offsets should be on UTF-8 boundaries
494                        assert!(
495                            input.is_char_boundary(doc.location.absolute_start),
496                            "Absolute start {} not on UTF-8 boundary for {input:?}",
497                            doc.location.absolute_start,
498                        );
499                        assert!(
500                            input.is_char_boundary(doc.location.absolute_end),
501                            "Absolute end {} not on UTF-8 boundary for {input:?}",
502                            doc.location.absolute_end,
503                        );
504
505                        // Validate with our helper
506                        doc.location.validate(input).unwrap_or_else(|e| {
507                            panic!("Location validation failed for {input:?}: {e}");
508                        });
509                    }
510                    Err(e) => {
511                        // Some of these might fail to parse, which is OK for now
512                        // We're just testing that if they parse, the locations are valid
513                        println!("Failed to parse {input:?}: {e} (this might be expected)",);
514                    }
515                }
516            }
517        }
518    }
519
520    /// Integration tests for attribute resolution behavior.
521    ///
522    /// These tests verify that acdc matches asciidoctor's attribute resolution semantics:
523    /// - Attributes are resolved at definition time (not reference time)
524    /// - If {bar} is undefined when :foo: {bar} is parsed, foo stores literal "{bar}"
525    /// - If {bar} IS defined when :foo: {bar} is parsed, foo stores bar's resolved value
526    mod warning_deduplication_tests {
527        use crate::{Options, parse};
528
529        #[test]
530        #[tracing_test::traced_test]
531        fn counter_reference_emits_single_warning() {
532            // A document with the same counter referenced multiple times should
533            // produce exactly one warning after parsing (not one per PEG attempt).
534            let input = "= Title\n\n{counter:hits} then {counter:hits} again";
535            let options = Options::default();
536            let _doc = parse(input, &options).expect("should parse");
537            assert!(logs_contain("Counters"));
538            logs_assert(|lines: &[&str]| {
539                let count = lines
540                    .iter()
541                    .filter(|l| l.contains("not supported and will be removed"))
542                    .count();
543                if count == 1 {
544                    Ok(())
545                } else {
546                    Err(format!("expected exactly 1 counter warning, got {count}"))
547                }
548            });
549        }
550
551        #[test]
552        #[tracing_test::traced_test]
553        fn distinct_warnings_all_emitted() {
554            // Different warnings should each appear once.
555            let input = "= Title\n\n{counter:a} and {counter2:b}";
556            let options = Options::default();
557            let _doc = parse(input, &options).expect("should parse");
558            assert!(logs_contain(
559                "Counters ({counter:a}) are not supported and will be removed from output"
560            ));
561            assert!(logs_contain(
562                "Counters ({counter2:b}) are not supported and will be removed from output"
563            ));
564        }
565    }
566
567    mod attribute_resolution_tests {
568        use crate::{AttributeValue, Options, parse};
569
570        #[test]
571        fn test_definition_time_resolution_bar_defined_first() {
572            // When bar is defined BEFORE foo, {bar} in foo's value should be expanded
573            let input = r":bar: resolved-bar
574:foo: {bar}
575
576{foo}
577";
578            let options = Options::default();
579            let doc = parse(input, &options).expect("should parse");
580
581            // foo should have bar's value expanded at definition time
582            assert_eq!(
583                doc.attributes.get("foo"),
584                Some(&AttributeValue::String("resolved-bar".to_string()))
585            );
586        }
587
588        #[test]
589        fn test_definition_time_resolution_bar_defined_after() {
590            // When bar is defined AFTER foo, {bar} should stay literal in foo's value
591            let input = r":foo: {bar}
592:bar: resolved-bar
593
594{foo}
595";
596            let options = Options::default();
597            let doc = parse(input, &options).expect("should parse");
598
599            // foo should keep {bar} as literal since bar wasn't defined yet
600            assert_eq!(
601                doc.attributes.get("foo"),
602                Some(&AttributeValue::String("{bar}".to_string()))
603            );
604        }
605
606        #[test]
607        fn test_chained_attribute_resolution() {
608            // When attributes form a chain: a -> b -> c, each should resolve
609            // based on what's defined at each definition point
610            let input = r":c: final-value
611:b: {c}
612:a: {b}
613
614{a}
615";
616            let options = Options::default();
617            let doc = parse(input, &options).expect("should parse");
618
619            // c is defined first, so b gets "final-value", then a gets "final-value"
620            assert_eq!(
621                doc.attributes.get("c"),
622                Some(&AttributeValue::String("final-value".to_string()))
623            );
624            assert_eq!(
625                doc.attributes.get("b"),
626                Some(&AttributeValue::String("final-value".to_string()))
627            );
628            assert_eq!(
629                doc.attributes.get("a"),
630                Some(&AttributeValue::String("final-value".to_string()))
631            );
632        }
633    }
634}