acdc_parser/
lib.rs

1#![deny(clippy::pedantic)]
2#![warn(clippy::all)]
3//! `AsciiDoc` parser.
4//!
5//! This module provides a parser for the `AsciiDoc` markup language. The parser is
6//! implemented using the `peg` parser generator.
7//!
8//! # Quick Start
9//!
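//! The main entry point is the [`Parser`] struct, which offers a fluent API. The
//! crate also exposes free functions for parsing `AsciiDoc` content directly:
//!
//! - [`parse`]: parses a string containing `AsciiDoc` content.
//! - [`parse_file`]: parses the content of a file containing `AsciiDoc` content.
//! - [`parse_from_reader`]: parses `AsciiDoc` content read from a reader.
//! - [`parse_inline`]: parses a string as inline `AsciiDoc` elements only.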
15//!
16//! ```rust
17//!
18//! use acdc_parser::{Document, parse};
19//!
20//! let content = r#"= Document Title
21//!
22//! This is a paragraph.
23//!
24//! == Section Title
25//!
26//! This is a subsection."#;
27//!
28//! let options = acdc_parser::Options::default();
29//! let document = parse(content, &options).unwrap();
30//!
//! println!("{:?}", document);
//! ```
32use std::{
33    path::{Path, PathBuf},
34    string::ToString,
35};
36
37use tracing::instrument;
38
39mod blocks;
40mod constants;
41mod error;
42pub(crate) mod grammar;
43mod model;
44mod options;
45mod preprocessor;
46mod safe_mode;
47
48pub(crate) use grammar::{InlinePreprocessorParserState, ProcessedContent, inline_preprocessing};
49use preprocessor::Preprocessor;
50
51pub use error::{Error, Positioning, SourceLocation};
52pub use model::{
53    Admonition, AdmonitionVariant, Anchor, AttributeName, AttributeValue, Audio, Author, Autolink,
54    Block, BlockMetadata, Bold, Button, CalloutList, ColumnFormat, ColumnStyle, ColumnWidth,
55    Comment, CrossReference, CurvedApostrophe, CurvedQuotation, DelimitedBlock, DelimitedBlockType,
56    DescriptionList, DescriptionListItem, DiscreteHeader, Document, DocumentAttribute,
57    DocumentAttributes, ElementAttributes, Footnote, Form, Header, Highlight, HorizontalAlignment,
58    ICON_SIZES, Icon, Image, InlineMacro, InlineNode, Italic, Keyboard, LineBreak, Link, ListItem,
59    ListItemCheckedStatus, Location, Mailto, Menu, Monospace, OrderedList, PageBreak, Paragraph,
60    Pass, PassthroughKind, Plain, Position, Raw, Role, Section, Source, StandaloneCurvedApostrophe,
61    Stem, StemContent, StemNotation, Subscript, Substitution, Subtitle, Superscript, Table,
62    TableColumn, TableOfContents, TableRow, ThematicBreak, Title, TocEntry, UnorderedList, Url,
63    Verbatim, VerticalAlignment, Video, inlines_to_string,
64};
65pub use options::{Options, OptionsBuilder, SafeMode};
66
67/// Type-based parser for `AsciiDoc` content.
68///
69/// `Parser` provides a more discoverable, fluent API for parsing `AsciiDoc` documents.
70///
71/// # Examples
72///
73/// Basic usage:
74///
75/// ```
76/// use acdc_parser::Parser;
77///
78/// let content = "= Document Title\n\nParagraph text.";
79/// let doc = Parser::new(content).parse()?;
80/// # Ok::<(), acdc_parser::Error>(())
81/// ```
82///
83/// With options:
84///
85/// ```
86/// use acdc_parser::{Parser, Options, SafeMode};
87///
88/// let content = "= Document Title\n\nParagraph text.";
89/// let options = Options::builder()
90///     .with_safe_mode(SafeMode::Safe)
91///     .with_timings()
92///     .build();
93///
94/// let doc = Parser::new(content)
95///     .with_options(options)
96///     .parse()?;
97/// # Ok::<(), acdc_parser::Error>(())
98/// ```
99///
100/// For file-based parsing, read the file first:
101///
102/// ```no_run
103/// use acdc_parser::Parser;
104/// use std::fs;
105///
106/// let content = fs::read_to_string("document.adoc")?;
107/// let doc = Parser::new(&content).parse()?;
108/// # Ok::<(), Box<dyn std::error::Error>>(())
109/// ```
110#[derive(Debug)]
111pub struct Parser<'input> {
112    input: &'input str,
113    options: Options,
114}
115
116impl<'input> Parser<'input> {
117    /// Create a new parser for the given input string.
118    ///
119    /// The parser will use default options. Use `with_options` to customize.
120    ///
121    /// # Example
122    ///
123    /// ```
124    /// use acdc_parser::Parser;
125    ///
126    /// let parser = Parser::new("= Title\n\nContent");
127    /// let doc = parser.parse()?;
128    /// # Ok::<(), acdc_parser::Error>(())
129    /// ```
130    #[must_use]
131    pub fn new(input: &'input str) -> Self {
132        Self {
133            input,
134            options: Options::default(),
135        }
136    }
137
138    /// Set the options for this parser.
139    ///
140    /// This consumes the parser and returns a new one with the specified options.
141    ///
142    /// # Example
143    ///
144    /// ```
145    /// use acdc_parser::{Parser, Options, SafeMode};
146    ///
147    /// let options = Options::builder()
148    ///     .with_safe_mode(SafeMode::Safe)
149    ///     .build();
150    ///
151    /// let parser = Parser::new("= Title")
152    ///     .with_options(options);
153    /// # Ok::<(), acdc_parser::Error>(())
154    /// ```
155    #[must_use]
156    pub fn with_options(mut self, options: Options) -> Self {
157        self.options = options;
158        self
159    }
160
161    /// Parse the input into a Document.
162    ///
163    /// # Example
164    ///
165    /// ```
166    /// use acdc_parser::Parser;
167    ///
168    /// let doc = Parser::new("= Title\n\nContent").parse()?;
169    /// # Ok::<(), acdc_parser::Error>(())
170    /// ```
171    ///
172    /// # Errors
173    ///
174    /// Returns an error if the input cannot be parsed as valid `AsciiDoc`.
175    pub fn parse(self) -> Result<Document, Error> {
176        parse(self.input, &self.options)
177    }
178
179    /// Parse only inline elements from the input.
180    ///
181    /// This is useful for parsing fragments of `AsciiDoc` that contain only
182    /// inline markup like bold, italic, links, etc.
183    ///
184    /// # Example
185    ///
186    /// ```
187    /// use acdc_parser::Parser;
188    ///
189    /// let inlines = Parser::new("This is *bold* text").parse_inline()?;
190    /// # Ok::<(), acdc_parser::Error>(())
191    /// ```
192    ///
193    /// # Errors
194    ///
195    /// Returns an error if the input cannot be parsed.
196    pub fn parse_inline(self) -> Result<Vec<InlineNode>, Error> {
197        parse_inline(self.input, &self.options)
198    }
199}
200
201/// Parse `AsciiDoc` content from a reader.
202///
203/// This function reads the content from the provided reader and parses it as `AsciiDoc`.
204///
205/// # Example
206///
207/// ```
208/// use acdc_parser::{Options, SafeMode, parse_from_reader};
209/// use std::fs::File;
210///
211/// let options = Options::builder()
212///     .with_safe_mode(SafeMode::Unsafe)
213///     .build();
214/// let file = File::open("fixtures/samples/README.adoc").unwrap();
215/// let document = parse_from_reader(file, &options).unwrap();
216/// ```
217///
218/// # Errors
219/// This function returns an error if the content cannot be parsed.
220#[instrument(skip(reader))]
221pub fn parse_from_reader<R: std::io::Read>(
222    reader: R,
223    options: &Options,
224) -> Result<Document, Error> {
225    let input = Preprocessor.process_reader(reader, options)?;
226    parse_input(&input, options, None)
227}
228
229/// Parse `AsciiDoc` content from a string.
230///
231/// This function parses the provided string as `AsciiDoc`.
232///
233/// # Example
234///
235/// ```
236/// use acdc_parser::{Options, SafeMode, parse};
237///
238/// let options = Options::builder()
239///     .with_safe_mode(SafeMode::Unsafe)
240///     .build();
241/// let content = "= Document Title\n\nThis is a paragraph.\n\n== Section Title\n\nThis is a subsection.";
242/// let document = parse(content, &options).unwrap();
243/// ```
244///
245/// # Errors
246/// This function returns an error if the content cannot be parsed.
247#[instrument]
248pub fn parse(input: &str, options: &Options) -> Result<Document, Error> {
249    let input = Preprocessor.process(input, options)?;
250    parse_input(&input, options, None)
251}
252
253/// Parse `AsciiDoc` content from a file.
254///
255/// This function reads the content from the provided file and parses it as `AsciiDoc`.
256///
257/// # Example
258///
259/// ```
260/// use std::path::Path;
261/// use acdc_parser::{Options, SafeMode, parse_file};
262///
263/// let options = Options::builder()
264///     .with_safe_mode(SafeMode::Unsafe)
265///     .build();
266/// let file_path = Path::new("fixtures/samples/README.adoc");
267/// let document = parse_file(file_path, &options).unwrap();
268/// ```
269///
270/// # Errors
271/// This function returns an error if the content cannot be parsed.
272#[instrument(skip(file_path))]
273pub fn parse_file<P: AsRef<Path>>(file_path: P, options: &Options) -> Result<Document, Error> {
274    let path = file_path.as_ref().to_path_buf();
275    let input = Preprocessor.process_file(file_path, options)?;
276    parse_input(&input, options, Some(path))
277}
278
279/// Helper to convert a PEG parse error to our `SourceLocation` type
280fn peg_error_to_source_location(
281    error: &peg::error::ParseError<peg::str::LineCol>,
282    file: Option<PathBuf>,
283) -> SourceLocation {
284    SourceLocation {
285        file,
286        positioning: Positioning::Position(Position {
287            line: error.location.line,
288            column: error.location.column,
289        }),
290    }
291}
292
293#[instrument]
294fn parse_input(
295    input: &str,
296    options: &Options,
297    file_path: Option<PathBuf>,
298) -> Result<Document, Error> {
299    tracing::trace!(?input, "post preprocessor");
300    let mut state = grammar::ParserState::new(input);
301    state.document_attributes = options.document_attributes.clone();
302    state.options = options.clone();
303    state.current_file.clone_from(&file_path);
304    match grammar::document_parser::document(input, &mut state) {
305        Ok(doc) => doc,
306        Err(error) => {
307            tracing::error!(?error, "error parsing document content");
308            let source_location = peg_error_to_source_location(&error, file_path);
309            Err(Error::Parse(Box::new(source_location), error.to_string()))
310        }
311    }
312}
313
314/// Parse inline `AsciiDoc` content from a string.
315///
316/// This function parses the provided string as inline `AsciiDoc` elements, returning a
317/// vector of inline nodes instead of a complete document structure. This is useful for
318/// parsing fragments of `AsciiDoc` content that contain inline markup like emphasis,
319/// strong text, links, macros, and other inline elements.
320///
321/// NOTE: This function exists pretty much just for the sake of the TCK tests, which rely
322/// on an "inline" type output.
323///
324/// # Example
325///
326/// ```
327/// use acdc_parser::{Options, SafeMode, parse_inline};
328///
329/// let options = Options::builder()
330///     .with_safe_mode(SafeMode::Unsafe)
331///     .build();
332/// let content = "This is *strong* text with a https://example.com[link].";
333/// let inline_nodes = parse_inline(content, &options).unwrap();
334/// ```
335///
336/// # Errors
337/// This function returns an error if the inline content cannot be parsed.
338#[instrument]
339pub fn parse_inline(input: &str, options: &Options) -> Result<Vec<InlineNode>, Error> {
340    tracing::trace!(?input, "post preprocessor");
341    let mut state = grammar::ParserState::new(input);
342    state.document_attributes = options.document_attributes.clone();
343    state.options = options.clone();
344    match grammar::document_parser::inlines(
345        input,
346        &mut state,
347        0,
348        &grammar::BlockParsingMetadata::default(),
349    ) {
350        Ok(inlines) => Ok(inlines),
351        Err(error) => {
352            tracing::error!(?error, "error parsing inline content");
353            Err(Error::Parse(
354                Box::new(peg_error_to_source_location(&error, None)),
355                error.to_string(),
356            ))
357        }
358    }
359}
360
361#[cfg(test)]
362mod proptests;
363
364#[cfg(test)]
365#[allow(clippy::unwrap_used)]
366#[allow(clippy::panic)]
367#[allow(clippy::expect_used)]
368mod tests {
369    use super::*;
370    use pretty_assertions::assert_eq;
371
372    #[rstest::rstest]
373    #[tracing_test::traced_test]
374    fn test_with_fixtures(
375        #[files("fixtures/tests/**/*.adoc")] path: std::path::PathBuf,
376    ) -> Result<(), Error> {
377        let test_file_path = path.with_extension("json");
378        let options = Options::builder().with_safe_mode(SafeMode::Unsafe).build();
379
380        // We do this check because we have files that won't have a test file, namely ones
381        // that are supposed to error out!
382        if test_file_path.exists() {
383            let test_file_contents = std::fs::read_to_string(test_file_path)?;
384            match parse_file(&path, &options) {
385                Ok(result) => {
386                    let result_str =
387                        serde_json::to_string(&result).expect("could not serialize result");
388                    let test: Document = serde_json::from_str(&test_file_contents)
389                        .expect("could not deserialize test");
390                    let test_str = serde_json::to_string(&test).expect("could not serialize test");
391                    assert_eq!(test_str, result_str);
392                }
393                Err(e) => {
394                    let test: Error = serde_json::from_str(&test_file_contents)
395                        .expect("could not deserialize test");
396                    assert_eq!(test.to_string(), e.to_string());
397                }
398            }
399        } else {
400            tracing::warn!(?path, "test file not found");
401        }
402        Ok(())
403    }
404
405    #[cfg(test)]
406    mod empty_document_tests {
407        use crate::{Options, parse};
408
409        #[test]
410        fn test_whitespace_only_documents() {
411            let test_cases = vec![
412                "\n", "\n\n", "\t", " \n\t\n ", "   ",
413                /* The original proptest failing case -> */ "\n\n\t",
414            ];
415
416            for input in test_cases {
417                let options = Options::default();
418                let result = parse(input, &options);
419
420                match result {
421                    Ok(doc) => {
422                        // Validate the invariant using absolute offsets
423                        assert!(
424                            doc.location.absolute_start <= doc.location.absolute_end,
425                            "Failed for input {input:?}: absolute_start {} > absolute_end {}",
426                            doc.location.absolute_start,
427                            doc.location.absolute_end
428                        );
429
430                        // Validate with our helper
431                        doc.location.validate(input).unwrap_or_else(|e| {
432                            panic!("Location validation failed for {input:?}: {e}")
433                        });
434                    }
435                    Err(e) => {
436                        panic!("Failed to parse {input:?}: {e}");
437                    }
438                }
439            }
440        }
441
442        #[test]
443        fn test_document_with_content_after_whitespace() {
444            let test_cases = vec!["\n\nHello", "\t\tWorld", "  \n  = Title"];
445
446            for input in test_cases {
447                let options = Options::default();
448                let doc =
449                    parse(input, &options).unwrap_or_else(|_| panic!("Should parse {input:?}"));
450
451                assert!(
452                    doc.location.absolute_start <= doc.location.absolute_end,
453                    "Failed for input {input:?}: absolute_start {} > absolute_end {}",
454                    doc.location.absolute_start,
455                    doc.location.absolute_end
456                );
457
458                // Validate with our helper
459                doc.location
460                    .validate(input)
461                    .unwrap_or_else(|e| panic!("Location validation failed for {input:?}: {e}"));
462            }
463        }
464
465        #[test]
466        fn test_unicode_characters() {
467            // Test that UTF-8 safety is maintained
468            let test_cases = vec![
469                "πŸ˜€",         // 4-byte emoji
470                "א",          // 2-byte Hebrew
471                "Hello δΈ–η•Œ", // Mixed content
472                "\u{200b}",   // Zero-width space
473            ];
474
475            for input in test_cases {
476                let options = Options::default();
477                let result = parse(input, &options);
478
479                match result {
480                    Ok(doc) => {
481                        // All offsets should be on UTF-8 boundaries
482                        assert!(
483                            input.is_char_boundary(doc.location.absolute_start),
484                            "Absolute start {} not on UTF-8 boundary for {input:?}",
485                            doc.location.absolute_start,
486                        );
487                        assert!(
488                            input.is_char_boundary(doc.location.absolute_end),
489                            "Absolute end {} not on UTF-8 boundary for {input:?}",
490                            doc.location.absolute_end,
491                        );
492
493                        // Validate with our helper
494                        doc.location.validate(input).unwrap_or_else(|e| {
495                            panic!("Location validation failed for {input:?}: {e}");
496                        });
497                    }
498                    Err(e) => {
499                        // Some of these might fail to parse, which is OK for now
500                        // We're just testing that if they parse, the locations are valid
501                        println!("Failed to parse {input:?}: {e} (this might be expected)",);
502                    }
503                }
504            }
505        }
506    }
507}