Skip to main content

acdc_parser/
lib.rs

1#![deny(clippy::pedantic)]
2#![warn(clippy::all)]
3//! `AsciiDoc` parser.
4//!
5//! This module provides a parser for the `AsciiDoc` markup language. The parser is
6//! implemented using the `peg` parser generator.
7//!
8//! # Quick Start
9//!
//! The crate exposes free functions for parsing `AsciiDoc` content (the `Parser`
//! struct wraps them in a fluent, type-based API). Two of these functions are:
12//!
13//! - `parse`: parses a string containing `AsciiDoc` content.
14//! - `parse_file`: parses the content of a file containing `AsciiDoc` content.
15//!
16//! ```rust
17//!
18//! use acdc_parser::{Document, parse};
19//!
20//! let content = r#"= Document Title
21//!
22//! This is a paragraph.
23//!
24//! == Section Title
25//!
26//! This is a subsection."#;
27//!
28//! let options = acdc_parser::Options::default();
29//! let document = parse(content, &options).unwrap();
30//!
//! println!("{:?}", document);
//! ```
32use std::{
33    path::{Path, PathBuf},
34    string::ToString,
35};
36
37use tracing::instrument;
38
39mod blocks;
40mod constants;
41mod error;
42pub(crate) mod grammar;
43mod model;
44mod options;
45mod preprocessor;
46mod safe_mode;
47
48pub(crate) use grammar::{InlinePreprocessorParserState, ProcessedContent, inline_preprocessing};
49use preprocessor::Preprocessor;
50
51pub use error::{Error, Positioning, SourceLocation};
52pub use grammar::parse_text_for_quotes;
53pub use model::{
54    Admonition, AdmonitionVariant, Anchor, AttributeName, AttributeValue, Audio, Author, Autolink,
55    Block, BlockMetadata, Bold, Button, CalloutList, CalloutListItem, CalloutRef, CalloutRefKind,
56    ColumnFormat, ColumnStyle, ColumnWidth, Comment, CrossReference, CurvedApostrophe,
57    CurvedQuotation, DelimitedBlock, DelimitedBlockType, DescriptionList, DescriptionListItem,
58    DiscreteHeader, Document, DocumentAttribute, DocumentAttributes, ElementAttributes, Footnote,
59    Form, HEADER, Header, Highlight, HorizontalAlignment, ICON_SIZES, Icon, Image, IndexTerm,
60    IndexTermKind, InlineMacro, InlineNode, Italic, Keyboard, LineBreak, Link, ListItem,
61    ListItemCheckedStatus, Location, MAX_SECTION_LEVELS, MAX_TOC_LEVELS, Mailto, Menu, Monospace,
62    NORMAL, OrderedList, PageBreak, Paragraph, Pass, PassthroughKind, Plain, Position, Raw, Role,
63    Section, Source, StandaloneCurvedApostrophe, Stem, StemContent, StemNotation, Subscript,
64    Substitution, SubstitutionOp, SubstitutionSpec, Subtitle, Superscript, Table, TableColumn,
65    TableOfContents, TableRow, ThematicBreak, Title, TocEntry, UNNUMBERED_SECTION_STYLES,
66    UnorderedList, Url, VERBATIM, Verbatim, VerticalAlignment, Video, inlines_to_string,
67    substitute,
68};
69pub use options::{Options, OptionsBuilder, SafeMode};
70
71/// Type-based parser for `AsciiDoc` content.
72///
73/// `Parser` provides a more discoverable, fluent API for parsing `AsciiDoc` documents.
74///
75/// # Examples
76///
77/// Basic usage:
78///
79/// ```
80/// use acdc_parser::Parser;
81///
82/// let content = "= Document Title\n\nParagraph text.";
83/// let doc = Parser::new(content).parse()?;
84/// # Ok::<(), acdc_parser::Error>(())
85/// ```
86///
87/// With options:
88///
89/// ```
90/// use acdc_parser::{Parser, Options, SafeMode};
91///
92/// let content = "= Document Title\n\nParagraph text.";
93/// let options = Options::builder()
94///     .with_safe_mode(SafeMode::Safe)
95///     .with_timings()
96///     .build();
97///
98/// let doc = Parser::new(content)
99///     .with_options(options)
100///     .parse()?;
101/// # Ok::<(), acdc_parser::Error>(())
102/// ```
103///
104/// For file-based parsing, read the file first:
105///
106/// ```no_run
107/// use acdc_parser::Parser;
108/// use std::fs;
109///
110/// let content = fs::read_to_string("document.adoc")?;
111/// let doc = Parser::new(&content).parse()?;
112/// # Ok::<(), Box<dyn std::error::Error>>(())
113/// ```
114#[derive(Debug)]
115pub struct Parser<'input> {
116    input: &'input str,
117    options: Options,
118}
119
120impl<'input> Parser<'input> {
121    /// Create a new parser for the given input string.
122    ///
123    /// The parser will use default options. Use `with_options` to customize.
124    ///
125    /// # Example
126    ///
127    /// ```
128    /// use acdc_parser::Parser;
129    ///
130    /// let parser = Parser::new("= Title\n\nContent");
131    /// let doc = parser.parse()?;
132    /// # Ok::<(), acdc_parser::Error>(())
133    /// ```
134    #[must_use]
135    pub fn new(input: &'input str) -> Self {
136        Self {
137            input,
138            options: Options::default(),
139        }
140    }
141
142    /// Set the options for this parser.
143    ///
144    /// This consumes the parser and returns a new one with the specified options.
145    ///
146    /// # Example
147    ///
148    /// ```
149    /// use acdc_parser::{Parser, Options, SafeMode};
150    ///
151    /// let options = Options::builder()
152    ///     .with_safe_mode(SafeMode::Safe)
153    ///     .build();
154    ///
155    /// let parser = Parser::new("= Title")
156    ///     .with_options(options);
157    /// # Ok::<(), acdc_parser::Error>(())
158    /// ```
159    #[must_use]
160    pub fn with_options(mut self, options: Options) -> Self {
161        self.options = options;
162        self
163    }
164
165    /// Parse the input into a Document.
166    ///
167    /// # Example
168    ///
169    /// ```
170    /// use acdc_parser::Parser;
171    ///
172    /// let doc = Parser::new("= Title\n\nContent").parse()?;
173    /// # Ok::<(), acdc_parser::Error>(())
174    /// ```
175    ///
176    /// # Errors
177    ///
178    /// Returns an error if the input cannot be parsed as valid `AsciiDoc`.
179    pub fn parse(self) -> Result<Document, Error> {
180        parse(self.input, &self.options)
181    }
182
183    /// Parse only inline elements from the input.
184    ///
185    /// This is useful for parsing fragments of `AsciiDoc` that contain only
186    /// inline markup like bold, italic, links, etc.
187    ///
188    /// # Example
189    ///
190    /// ```
191    /// use acdc_parser::Parser;
192    ///
193    /// let inlines = Parser::new("This is *bold* text").parse_inline()?;
194    /// # Ok::<(), acdc_parser::Error>(())
195    /// ```
196    ///
197    /// # Errors
198    ///
199    /// Returns an error if the input cannot be parsed.
200    pub fn parse_inline(self) -> Result<Vec<InlineNode>, Error> {
201        parse_inline(self.input, &self.options)
202    }
203}
204
205/// Parse `AsciiDoc` content from a reader.
206///
207/// This function reads the content from the provided reader and parses it as `AsciiDoc`.
208///
209/// # Example
210///
211/// ```
212/// use acdc_parser::{Options, SafeMode, parse_from_reader};
213/// use std::fs::File;
214///
215/// let options = Options::builder()
216///     .with_safe_mode(SafeMode::Unsafe)
217///     .build();
218/// let file = File::open("fixtures/samples/README.adoc").unwrap();
219/// let document = parse_from_reader(file, &options).unwrap();
220/// ```
221///
222/// # Errors
223/// This function returns an error if the content cannot be parsed.
224#[instrument(skip(reader))]
225pub fn parse_from_reader<R: std::io::Read>(
226    reader: R,
227    options: &Options,
228) -> Result<Document, Error> {
229    let result = Preprocessor.process_reader(reader, options)?;
230    parse_input(&result.text, options, None, result.leveloffset_ranges)
231}
232
233/// Parse `AsciiDoc` content from a string.
234///
235/// This function parses the provided string as `AsciiDoc`.
236///
237/// # Example
238///
239/// ```
240/// use acdc_parser::{Options, SafeMode, parse};
241///
242/// let options = Options::builder()
243///     .with_safe_mode(SafeMode::Unsafe)
244///     .build();
245/// let content = "= Document Title\n\nThis is a paragraph.\n\n== Section Title\n\nThis is a subsection.";
246/// let document = parse(content, &options).unwrap();
247/// ```
248///
249/// # Errors
250/// This function returns an error if the content cannot be parsed.
251#[instrument]
252pub fn parse(input: &str, options: &Options) -> Result<Document, Error> {
253    let result = Preprocessor.process(input, options)?;
254    parse_input(&result.text, options, None, result.leveloffset_ranges)
255}
256
257/// Parse `AsciiDoc` content from a file.
258///
259/// This function reads the content from the provided file and parses it as `AsciiDoc`.
260///
261/// # Example
262///
263/// ```
264/// use std::path::Path;
265/// use acdc_parser::{Options, SafeMode, parse_file};
266///
267/// let options = Options::builder()
268///     .with_safe_mode(SafeMode::Unsafe)
269///     .build();
270/// let file_path = Path::new("fixtures/samples/README.adoc");
271/// let document = parse_file(file_path, &options).unwrap();
272/// ```
273///
274/// # Errors
275/// This function returns an error if the content cannot be parsed.
276#[instrument(skip(file_path))]
277pub fn parse_file<P: AsRef<Path>>(file_path: P, options: &Options) -> Result<Document, Error> {
278    let path = file_path.as_ref().to_path_buf();
279    let result = Preprocessor.process_file(file_path, options)?;
280    parse_input(&result.text, options, Some(path), result.leveloffset_ranges)
281}
282
283/// Helper to convert a PEG parse error to our `SourceLocation` type
284fn peg_error_to_source_location(
285    error: &peg::error::ParseError<peg::str::LineCol>,
286    file: Option<PathBuf>,
287) -> SourceLocation {
288    SourceLocation {
289        file,
290        positioning: Positioning::Position(Position {
291            line: error.location.line,
292            column: error.location.column,
293        }),
294    }
295}
296
297#[instrument]
298fn parse_input(
299    input: &str,
300    options: &Options,
301    file_path: Option<PathBuf>,
302    leveloffset_ranges: Vec<model::LeveloffsetRange>,
303) -> Result<Document, Error> {
304    tracing::trace!(?input, "post preprocessor");
305    let mut state = grammar::ParserState::new(input);
306    state.document_attributes = options.document_attributes.clone();
307    state.options = options.clone();
308    state.current_file.clone_from(&file_path);
309    state.leveloffset_ranges = leveloffset_ranges;
310    match grammar::document_parser::document(input, &mut state) {
311        Ok(doc) => doc,
312        Err(error) => {
313            tracing::error!(?error, "error parsing document content");
314            let source_location = peg_error_to_source_location(&error, file_path);
315            Err(Error::Parse(Box::new(source_location), error.to_string()))
316        }
317    }
318}
319
320/// Parse inline `AsciiDoc` content from a string.
321///
322/// This function parses the provided string as inline `AsciiDoc` elements, returning a
323/// vector of inline nodes instead of a complete document structure. This is useful for
324/// parsing fragments of `AsciiDoc` content that contain inline markup like emphasis,
325/// strong text, links, macros, and other inline elements.
326///
327/// NOTE: This function exists pretty much just for the sake of the TCK tests, which rely
328/// on an "inline" type output.
329///
330/// # Example
331///
332/// ```
333/// use acdc_parser::{Options, SafeMode, parse_inline};
334///
335/// let options = Options::builder()
336///     .with_safe_mode(SafeMode::Unsafe)
337///     .build();
338/// let content = "This is *strong* text with a https://example.com[link].";
339/// let inline_nodes = parse_inline(content, &options).unwrap();
340/// ```
341///
342/// # Errors
343/// This function returns an error if the inline content cannot be parsed.
344#[instrument]
345pub fn parse_inline(input: &str, options: &Options) -> Result<Vec<InlineNode>, Error> {
346    tracing::trace!(?input, "post preprocessor");
347    let mut state = grammar::ParserState::new(input);
348    state.document_attributes = options.document_attributes.clone();
349    state.options = options.clone();
350    match grammar::document_parser::inlines(
351        input,
352        &mut state,
353        0,
354        &grammar::BlockParsingMetadata::default(),
355    ) {
356        Ok(inlines) => Ok(inlines),
357        Err(error) => {
358            tracing::error!(?error, "error parsing inline content");
359            Err(Error::Parse(
360                Box::new(peg_error_to_source_location(&error, None)),
361                error.to_string(),
362            ))
363        }
364    }
365}
366
367#[cfg(test)]
368mod proptests;
369
370#[cfg(test)]
371#[allow(clippy::unwrap_used)]
372#[allow(clippy::panic)]
373#[allow(clippy::expect_used)]
374mod tests {
375    use super::*;
376    use pretty_assertions::assert_eq;
377
378    fn read_file_contents_with_extension(
379        path: &std::path::PathBuf,
380        ext: &str,
381    ) -> Result<String, Error> {
382        let test_file_path = path.with_extension(ext);
383        let file_contents = std::fs::read_to_string(&test_file_path).inspect_err(
384            |e| tracing::warn!(?path, ?test_file_path, error = %e, "test file not found"),
385        )?;
386        Ok(file_contents)
387    }
388
389    #[rstest::rstest]
390    #[tracing_test::traced_test]
391    fn test_with_fixtures(
392        #[files("fixtures/tests/**/*.adoc")] path: std::path::PathBuf,
393    ) -> Result<(), Error> {
394        let options = Options::builder().with_safe_mode(SafeMode::Unsafe).build();
395
396        match parse_file(&path, &options) {
397            Ok(result) => {
398                let expected = read_file_contents_with_extension(&path, "json")?;
399                let actual =
400                    serde_json::to_string_pretty(&result).expect("could not serialize result");
401                assert_eq!(expected, actual);
402            }
403            Err(e) => {
404                let file_contents = read_file_contents_with_extension(&path, "error")?;
405                // Error fixtures contain expected error message as plain text
406                let expected = file_contents.trim();
407                assert_eq!(expected, e.to_string());
408            }
409        }
410        Ok(())
411    }
412
413    #[cfg(test)]
414    mod empty_document_tests {
415        use crate::{Options, parse};
416
417        #[test]
418        fn test_whitespace_only_documents() {
419            let test_cases = vec![
420                "\n", "\n\n", "\t", " \n\t\n ", "   ",
421                /* The original proptest failing case -> */ "\n\n\t",
422            ];
423
424            for input in test_cases {
425                let options = Options::default();
426                let result = parse(input, &options);
427
428                match result {
429                    Ok(doc) => {
430                        // Validate the invariant using absolute offsets
431                        assert!(
432                            doc.location.absolute_start <= doc.location.absolute_end,
433                            "Failed for input {input:?}: absolute_start {} > absolute_end {}",
434                            doc.location.absolute_start,
435                            doc.location.absolute_end
436                        );
437
438                        // Validate with our helper
439                        doc.location.validate(input).unwrap_or_else(|e| {
440                            panic!("Location validation failed for {input:?}: {e}")
441                        });
442                    }
443                    Err(e) => {
444                        panic!("Failed to parse {input:?}: {e}");
445                    }
446                }
447            }
448        }
449
450        #[test]
451        fn test_document_with_content_after_whitespace() {
452            let test_cases = vec!["\n\nHello", "\t\tWorld", "  \n  = Title"];
453
454            for input in test_cases {
455                let options = Options::default();
456                let doc =
457                    parse(input, &options).unwrap_or_else(|_| panic!("Should parse {input:?}"));
458
459                assert!(
460                    doc.location.absolute_start <= doc.location.absolute_end,
461                    "Failed for input {input:?}: absolute_start {} > absolute_end {}",
462                    doc.location.absolute_start,
463                    doc.location.absolute_end
464                );
465
466                // Validate with our helper
467                doc.location
468                    .validate(input)
469                    .unwrap_or_else(|e| panic!("Location validation failed for {input:?}: {e}"));
470            }
471        }
472
473        #[test]
474        fn test_unicode_characters() {
475            // Test that UTF-8 safety is maintained
476            let test_cases = vec![
477                "πŸ˜€",         // 4-byte emoji
478                "א",          // 2-byte Hebrew
479                "Hello δΈ–η•Œ", // Mixed content
480                "\u{200b}",   // Zero-width space
481            ];
482
483            for input in test_cases {
484                let options = Options::default();
485                let result = parse(input, &options);
486
487                match result {
488                    Ok(doc) => {
489                        // All offsets should be on UTF-8 boundaries
490                        assert!(
491                            input.is_char_boundary(doc.location.absolute_start),
492                            "Absolute start {} not on UTF-8 boundary for {input:?}",
493                            doc.location.absolute_start,
494                        );
495                        assert!(
496                            input.is_char_boundary(doc.location.absolute_end),
497                            "Absolute end {} not on UTF-8 boundary for {input:?}",
498                            doc.location.absolute_end,
499                        );
500
501                        // Validate with our helper
502                        doc.location.validate(input).unwrap_or_else(|e| {
503                            panic!("Location validation failed for {input:?}: {e}");
504                        });
505                    }
506                    Err(e) => {
507                        // Some of these might fail to parse, which is OK for now
508                        // We're just testing that if they parse, the locations are valid
509                        println!("Failed to parse {input:?}: {e} (this might be expected)",);
510                    }
511                }
512            }
513        }
514    }
515
516    /// Integration tests for attribute resolution behavior.
517    ///
518    /// These tests verify that acdc matches asciidoctor's attribute resolution semantics:
519    /// - Attributes are resolved at definition time (not reference time)
520    /// - If {bar} is undefined when :foo: {bar} is parsed, foo stores literal "{bar}"
521    /// - If {bar} IS defined when :foo: {bar} is parsed, foo stores bar's resolved value
522    mod attribute_resolution_tests {
523        use crate::{AttributeValue, Options, parse};
524
525        #[test]
526        fn test_definition_time_resolution_bar_defined_first() {
527            // When bar is defined BEFORE foo, {bar} in foo's value should be expanded
528            let input = r":bar: resolved-bar
529:foo: {bar}
530
531{foo}
532";
533            let options = Options::default();
534            let doc = parse(input, &options).expect("should parse");
535
536            // foo should have bar's value expanded at definition time
537            assert_eq!(
538                doc.attributes.get("foo"),
539                Some(&AttributeValue::String("resolved-bar".to_string()))
540            );
541        }
542
543        #[test]
544        fn test_definition_time_resolution_bar_defined_after() {
545            // When bar is defined AFTER foo, {bar} should stay literal in foo's value
546            let input = r":foo: {bar}
547:bar: resolved-bar
548
549{foo}
550";
551            let options = Options::default();
552            let doc = parse(input, &options).expect("should parse");
553
554            // foo should keep {bar} as literal since bar wasn't defined yet
555            assert_eq!(
556                doc.attributes.get("foo"),
557                Some(&AttributeValue::String("{bar}".to_string()))
558            );
559        }
560
561        #[test]
562        fn test_chained_attribute_resolution() {
563            // When attributes form a chain: a -> b -> c, each should resolve
564            // based on what's defined at each definition point
565            let input = r":c: final-value
566:b: {c}
567:a: {b}
568
569{a}
570";
571            let options = Options::default();
572            let doc = parse(input, &options).expect("should parse");
573
574            // c is defined first, so b gets "final-value", then a gets "final-value"
575            assert_eq!(
576                doc.attributes.get("c"),
577                Some(&AttributeValue::String("final-value".to_string()))
578            );
579            assert_eq!(
580                doc.attributes.get("b"),
581                Some(&AttributeValue::String("final-value".to_string()))
582            );
583            assert_eq!(
584                doc.attributes.get("a"),
585                Some(&AttributeValue::String("final-value".to_string()))
586            );
587        }
588    }
589}