acdc_parser/
lib.rs

1#![deny(clippy::pedantic)]
2#![warn(clippy::all)]
3//! `AsciiDoc` parser.
4//!
5//! This module provides a parser for the `AsciiDoc` markup language. The parser is
6//! implemented using the `peg` parser generator.
7//!
8//! # Quick Start
9//!
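//! The crate exposes free functions for parsing `AsciiDoc` content, as well as the
//! [`Parser`] struct, which wraps the string-based functions in a fluent API:
//!
//! - [`parse`]: parses a string containing `AsciiDoc` content.
//! - [`parse_file`]: parses the content of a file containing `AsciiDoc` content.
//! - [`parse_from_reader`]: parses `AsciiDoc` content read from a `std::io::Read` source.
//! - [`parse_inline`]: parses a string containing only inline `AsciiDoc` markup.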
15//!
16//! ```rust
17//!
18//! use acdc_parser::{Document, parse};
19//!
20//! let content = r#"= Document Title
21//!
22//! This is a paragraph.
23//!
24//! == Section Title
25//!
26//! This is a subsection."#;
27//!
28//! let options = acdc_parser::Options::default();
29//! let document = parse(content, &options).unwrap();
30//!
//! println!("{:?}", document);
//! ```
32use std::{
33    path::{Path, PathBuf},
34    string::ToString,
35};
36
37use tracing::instrument;
38
39mod blocks;
40mod constants;
41mod error;
42pub(crate) mod grammar;
43mod model;
44mod options;
45mod preprocessor;
46mod safe_mode;
47
48pub(crate) use grammar::{InlinePreprocessorParserState, ProcessedContent, inline_preprocessing};
49use preprocessor::Preprocessor;
50
51pub use error::{Error, Positioning, SourceLocation};
52pub use model::{
53    Admonition, AdmonitionVariant, Anchor, AttributeName, AttributeValue, Audio, Author, Autolink,
54    Block, BlockMetadata, Bold, Button, CalloutList, CalloutListItem, CalloutRef, CalloutRefKind,
55    ColumnFormat, ColumnStyle, ColumnWidth, Comment, CrossReference, CurvedApostrophe,
56    CurvedQuotation, DelimitedBlock, DelimitedBlockType, DescriptionList, DescriptionListItem,
57    DiscreteHeader, Document, DocumentAttribute, DocumentAttributes, ElementAttributes, Footnote,
58    Form, Header, Highlight, HorizontalAlignment, ICON_SIZES, Icon, Image, IndexTerm,
59    IndexTermKind, InlineMacro, InlineNode, Italic, Keyboard, LineBreak, Link, ListItem,
60    ListItemCheckedStatus, Location, Mailto, Menu, Monospace, OrderedList, PageBreak, Paragraph,
61    Pass, PassthroughKind, Plain, Position, Raw, Role, Section, Source, StandaloneCurvedApostrophe,
62    Stem, StemContent, StemNotation, Subscript, Substitution, Subtitle, Superscript, Table,
63    TableColumn, TableOfContents, TableRow, ThematicBreak, Title, TocEntry, UnorderedList, Url,
64    Verbatim, VerticalAlignment, Video, inlines_to_string,
65};
66pub use options::{Options, OptionsBuilder, SafeMode};
67
68/// Type-based parser for `AsciiDoc` content.
69///
70/// `Parser` provides a more discoverable, fluent API for parsing `AsciiDoc` documents.
71///
72/// # Examples
73///
74/// Basic usage:
75///
76/// ```
77/// use acdc_parser::Parser;
78///
79/// let content = "= Document Title\n\nParagraph text.";
80/// let doc = Parser::new(content).parse()?;
81/// # Ok::<(), acdc_parser::Error>(())
82/// ```
83///
84/// With options:
85///
86/// ```
87/// use acdc_parser::{Parser, Options, SafeMode};
88///
89/// let content = "= Document Title\n\nParagraph text.";
90/// let options = Options::builder()
91///     .with_safe_mode(SafeMode::Safe)
92///     .with_timings()
93///     .build();
94///
95/// let doc = Parser::new(content)
96///     .with_options(options)
97///     .parse()?;
98/// # Ok::<(), acdc_parser::Error>(())
99/// ```
100///
101/// For file-based parsing, read the file first:
102///
103/// ```no_run
104/// use acdc_parser::Parser;
105/// use std::fs;
106///
107/// let content = fs::read_to_string("document.adoc")?;
108/// let doc = Parser::new(&content).parse()?;
109/// # Ok::<(), Box<dyn std::error::Error>>(())
110/// ```
111#[derive(Debug)]
112pub struct Parser<'input> {
113    input: &'input str,
114    options: Options,
115}
116
117impl<'input> Parser<'input> {
118    /// Create a new parser for the given input string.
119    ///
120    /// The parser will use default options. Use `with_options` to customize.
121    ///
122    /// # Example
123    ///
124    /// ```
125    /// use acdc_parser::Parser;
126    ///
127    /// let parser = Parser::new("= Title\n\nContent");
128    /// let doc = parser.parse()?;
129    /// # Ok::<(), acdc_parser::Error>(())
130    /// ```
131    #[must_use]
132    pub fn new(input: &'input str) -> Self {
133        Self {
134            input,
135            options: Options::default(),
136        }
137    }
138
139    /// Set the options for this parser.
140    ///
141    /// This consumes the parser and returns a new one with the specified options.
142    ///
143    /// # Example
144    ///
145    /// ```
146    /// use acdc_parser::{Parser, Options, SafeMode};
147    ///
148    /// let options = Options::builder()
149    ///     .with_safe_mode(SafeMode::Safe)
150    ///     .build();
151    ///
152    /// let parser = Parser::new("= Title")
153    ///     .with_options(options);
154    /// # Ok::<(), acdc_parser::Error>(())
155    /// ```
156    #[must_use]
157    pub fn with_options(mut self, options: Options) -> Self {
158        self.options = options;
159        self
160    }
161
162    /// Parse the input into a Document.
163    ///
164    /// # Example
165    ///
166    /// ```
167    /// use acdc_parser::Parser;
168    ///
169    /// let doc = Parser::new("= Title\n\nContent").parse()?;
170    /// # Ok::<(), acdc_parser::Error>(())
171    /// ```
172    ///
173    /// # Errors
174    ///
175    /// Returns an error if the input cannot be parsed as valid `AsciiDoc`.
176    pub fn parse(self) -> Result<Document, Error> {
177        parse(self.input, &self.options)
178    }
179
180    /// Parse only inline elements from the input.
181    ///
182    /// This is useful for parsing fragments of `AsciiDoc` that contain only
183    /// inline markup like bold, italic, links, etc.
184    ///
185    /// # Example
186    ///
187    /// ```
188    /// use acdc_parser::Parser;
189    ///
190    /// let inlines = Parser::new("This is *bold* text").parse_inline()?;
191    /// # Ok::<(), acdc_parser::Error>(())
192    /// ```
193    ///
194    /// # Errors
195    ///
196    /// Returns an error if the input cannot be parsed.
197    pub fn parse_inline(self) -> Result<Vec<InlineNode>, Error> {
198        parse_inline(self.input, &self.options)
199    }
200}
201
202/// Parse `AsciiDoc` content from a reader.
203///
204/// This function reads the content from the provided reader and parses it as `AsciiDoc`.
205///
206/// # Example
207///
208/// ```
209/// use acdc_parser::{Options, SafeMode, parse_from_reader};
210/// use std::fs::File;
211///
212/// let options = Options::builder()
213///     .with_safe_mode(SafeMode::Unsafe)
214///     .build();
215/// let file = File::open("fixtures/samples/README.adoc").unwrap();
216/// let document = parse_from_reader(file, &options).unwrap();
217/// ```
218///
219/// # Errors
220/// This function returns an error if the content cannot be parsed.
221#[instrument(skip(reader))]
222pub fn parse_from_reader<R: std::io::Read>(
223    reader: R,
224    options: &Options,
225) -> Result<Document, Error> {
226    let input = Preprocessor.process_reader(reader, options)?;
227    parse_input(&input, options, None)
228}
229
230/// Parse `AsciiDoc` content from a string.
231///
232/// This function parses the provided string as `AsciiDoc`.
233///
234/// # Example
235///
236/// ```
237/// use acdc_parser::{Options, SafeMode, parse};
238///
239/// let options = Options::builder()
240///     .with_safe_mode(SafeMode::Unsafe)
241///     .build();
242/// let content = "= Document Title\n\nThis is a paragraph.\n\n== Section Title\n\nThis is a subsection.";
243/// let document = parse(content, &options).unwrap();
244/// ```
245///
246/// # Errors
247/// This function returns an error if the content cannot be parsed.
248#[instrument]
249pub fn parse(input: &str, options: &Options) -> Result<Document, Error> {
250    let input = Preprocessor.process(input, options)?;
251    parse_input(&input, options, None)
252}
253
254/// Parse `AsciiDoc` content from a file.
255///
256/// This function reads the content from the provided file and parses it as `AsciiDoc`.
257///
258/// # Example
259///
260/// ```
261/// use std::path::Path;
262/// use acdc_parser::{Options, SafeMode, parse_file};
263///
264/// let options = Options::builder()
265///     .with_safe_mode(SafeMode::Unsafe)
266///     .build();
267/// let file_path = Path::new("fixtures/samples/README.adoc");
268/// let document = parse_file(file_path, &options).unwrap();
269/// ```
270///
271/// # Errors
272/// This function returns an error if the content cannot be parsed.
273#[instrument(skip(file_path))]
274pub fn parse_file<P: AsRef<Path>>(file_path: P, options: &Options) -> Result<Document, Error> {
275    let path = file_path.as_ref().to_path_buf();
276    let input = Preprocessor.process_file(file_path, options)?;
277    parse_input(&input, options, Some(path))
278}
279
280/// Helper to convert a PEG parse error to our `SourceLocation` type
281fn peg_error_to_source_location(
282    error: &peg::error::ParseError<peg::str::LineCol>,
283    file: Option<PathBuf>,
284) -> SourceLocation {
285    SourceLocation {
286        file,
287        positioning: Positioning::Position(Position {
288            line: error.location.line,
289            column: error.location.column,
290        }),
291    }
292}
293
294#[instrument]
295fn parse_input(
296    input: &str,
297    options: &Options,
298    file_path: Option<PathBuf>,
299) -> Result<Document, Error> {
300    tracing::trace!(?input, "post preprocessor");
301    let mut state = grammar::ParserState::new(input);
302    state.document_attributes = options.document_attributes.clone();
303    state.options = options.clone();
304    state.current_file.clone_from(&file_path);
305    match grammar::document_parser::document(input, &mut state) {
306        Ok(doc) => doc,
307        Err(error) => {
308            tracing::error!(?error, "error parsing document content");
309            let source_location = peg_error_to_source_location(&error, file_path);
310            Err(Error::Parse(Box::new(source_location), error.to_string()))
311        }
312    }
313}
314
315/// Parse inline `AsciiDoc` content from a string.
316///
317/// This function parses the provided string as inline `AsciiDoc` elements, returning a
318/// vector of inline nodes instead of a complete document structure. This is useful for
319/// parsing fragments of `AsciiDoc` content that contain inline markup like emphasis,
320/// strong text, links, macros, and other inline elements.
321///
322/// NOTE: This function exists pretty much just for the sake of the TCK tests, which rely
323/// on an "inline" type output.
324///
325/// # Example
326///
327/// ```
328/// use acdc_parser::{Options, SafeMode, parse_inline};
329///
330/// let options = Options::builder()
331///     .with_safe_mode(SafeMode::Unsafe)
332///     .build();
333/// let content = "This is *strong* text with a https://example.com[link].";
334/// let inline_nodes = parse_inline(content, &options).unwrap();
335/// ```
336///
337/// # Errors
338/// This function returns an error if the inline content cannot be parsed.
339#[instrument]
340pub fn parse_inline(input: &str, options: &Options) -> Result<Vec<InlineNode>, Error> {
341    tracing::trace!(?input, "post preprocessor");
342    let mut state = grammar::ParserState::new(input);
343    state.document_attributes = options.document_attributes.clone();
344    state.options = options.clone();
345    match grammar::document_parser::inlines(
346        input,
347        &mut state,
348        0,
349        &grammar::BlockParsingMetadata::default(),
350    ) {
351        Ok(inlines) => Ok(inlines),
352        Err(error) => {
353            tracing::error!(?error, "error parsing inline content");
354            Err(Error::Parse(
355                Box::new(peg_error_to_source_location(&error, None)),
356                error.to_string(),
357            ))
358        }
359    }
360}
361
362#[cfg(test)]
363mod proptests;
364
#[cfg(test)]
// Test code unwraps, expects and panics on purpose so failures carry a clear message.
#[allow(clippy::unwrap_used)]
#[allow(clippy::panic)]
#[allow(clippy::expect_used)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Parses each `.adoc` fixture and compares the outcome with its sibling `.json` file.
    ///
    /// On a successful parse the JSON is read as a `Document` and both sides are compared
    /// in serialized form. On a failed parse the JSON is read as an `Error` and the
    /// display strings are compared. Fixtures without a `.json` file only log a warning.
    #[rstest::rstest]
    #[tracing_test::traced_test]
    fn test_with_fixtures(
        #[files("fixtures/tests/**/*.adoc")] path: std::path::PathBuf,
    ) -> Result<(), Error> {
        let test_file_path = path.with_extension("json");
        let options = Options::builder().with_safe_mode(SafeMode::Unsafe).build();

        // We do this check because we have files that won't have a test file, namely ones
        // that are supposed to error out!
        if test_file_path.exists() {
            let test_file_contents = std::fs::read_to_string(test_file_path)?;
            match parse_file(&path, &options) {
                Ok(result) => {
                    let result_str =
                        serde_json::to_string(&result).expect("could not serialize result");
                    // Round-trip the expected JSON through `Document` so both sides are
                    // serialized the same way before comparing.
                    let test: Document = serde_json::from_str(&test_file_contents)
                        .expect("could not deserialize test");
                    let test_str = serde_json::to_string(&test).expect("could not serialize test");
                    assert_eq!(test_str, result_str);
                }
                Err(e) => {
                    let test: Error = serde_json::from_str(&test_file_contents)
                        .expect("could not deserialize test");
                    assert_eq!(test.to_string(), e.to_string());
                }
            }
        } else {
            tracing::warn!(?path, "test file not found");
        }
        Ok(())
    }

    /// Location invariants for empty, whitespace-only and small Unicode documents.
    #[cfg(test)]
    mod empty_document_tests {
        use crate::{Options, parse};

        /// Whitespace-only input must parse and yield a valid document location.
        #[test]
        fn test_whitespace_only_documents() {
            let test_cases = [
                "\n", "\n\n", "\t", " \n\t\n ", "   ",
                /* The original proptest failing case -> */ "\n\n\t",
            ];

            for input in test_cases {
                let options = Options::default();
                let result = parse(input, &options);

                match result {
                    Ok(doc) => {
                        // Validate the invariant using absolute offsets
                        assert!(
                            doc.location.absolute_start <= doc.location.absolute_end,
                            "Failed for input {input:?}: absolute_start {} > absolute_end {}",
                            doc.location.absolute_start,
                            doc.location.absolute_end
                        );

                        // Validate with our helper
                        doc.location.validate(input).unwrap_or_else(|e| {
                            panic!("Location validation failed for {input:?}: {e}")
                        });
                    }
                    Err(e) => {
                        panic!("Failed to parse {input:?}: {e}");
                    }
                }
            }
        }

        /// Leading whitespace followed by content must parse and yield a valid location.
        #[test]
        fn test_document_with_content_after_whitespace() {
            let test_cases = ["\n\nHello", "\t\tWorld", "  \n  = Title"];

            for input in test_cases {
                let options = Options::default();
                let doc =
                    parse(input, &options).unwrap_or_else(|_| panic!("Should parse {input:?}"));

                assert!(
                    doc.location.absolute_start <= doc.location.absolute_end,
                    "Failed for input {input:?}: absolute_start {} > absolute_end {}",
                    doc.location.absolute_start,
                    doc.location.absolute_end
                );

                // Validate with our helper
                doc.location
                    .validate(input)
                    .unwrap_or_else(|e| panic!("Location validation failed for {input:?}: {e}"));
            }
        }

        /// Document offsets must fall on UTF-8 character boundaries for multi-byte input.
        #[test]
        fn test_unicode_characters() {
            // Test that UTF-8 safety is maintained. The inputs are written as escapes so
            // the intended code points survive any re-encoding of this source file.
            let test_cases = [
                "\u{1F600}",            // 4-byte emoji (grinning face)
                "\u{05D0}",             // 2-byte Hebrew (alef)
                "Hello \u{4E16}\u{754C}", // Mixed content: ASCII plus two 3-byte CJK chars
                "\u{200b}",             // Zero-width space
            ];

            for input in test_cases {
                let options = Options::default();
                let result = parse(input, &options);

                match result {
                    Ok(doc) => {
                        // All offsets should be on UTF-8 boundaries
                        assert!(
                            input.is_char_boundary(doc.location.absolute_start),
                            "Absolute start {} not on UTF-8 boundary for {input:?}",
                            doc.location.absolute_start,
                        );
                        assert!(
                            input.is_char_boundary(doc.location.absolute_end),
                            "Absolute end {} not on UTF-8 boundary for {input:?}",
                            doc.location.absolute_end,
                        );

                        // Validate with our helper
                        doc.location.validate(input).unwrap_or_else(|e| {
                            panic!("Location validation failed for {input:?}: {e}");
                        });
                    }
                    Err(e) => {
                        // Some of these might fail to parse, which is OK for now
                        // We're just testing that if they parse, the locations are valid
                        println!("Failed to parse {input:?}: {e} (this might be expected)",);
                    }
                }
            }
        }
    }
}
506        }
507    }
508}