acdc_parser/
lib.rs

1#![deny(clippy::pedantic)]
2#![warn(clippy::all)]
3//! `AsciiDoc` parser.
4//!
5//! This module provides a parser for the `AsciiDoc` markup language. The parser is
6//! implemented using the `peg` parser generator.
7//!
8//! # Quick Start
9//!
//! The parser is exposed through the [`Parser`] struct as well as a set of free
//! functions for parsing `AsciiDoc` content:
//!
//! - `parse`: parses a string containing `AsciiDoc` content.
//! - `parse_file`: parses the content of a file containing `AsciiDoc` content.
//! - `parse_from_reader`: parses `AsciiDoc` content read from any `std::io::Read` source.
//! - `parse_inline`: parses a string as inline `AsciiDoc` elements only.
15//!
16//! ```rust
17//!
18//! use acdc_parser::{Document, parse};
19//!
20//! let content = r#"= Document Title
21//!
22//! This is a paragraph.
23//!
24//! == Section Title
25//!
26//! This is a subsection."#;
27//!
28//! let options = acdc_parser::Options::default();
29//! let document = parse(content, &options).unwrap();
30//!
//! println!("{:?}", document);
//! ```
32use std::{
33    path::{Path, PathBuf},
34    string::ToString,
35};
36
37use tracing::instrument;
38
39mod blocks;
40mod constants;
41mod error;
42pub(crate) mod grammar;
43mod model;
44mod options;
45mod preprocessor;
46
47pub(crate) use grammar::{InlinePreprocessorParserState, ProcessedContent, inline_preprocessing};
48use preprocessor::Preprocessor;
49
50pub use error::{Error, Positioning, SourceLocation};
51pub use model::{
52    Admonition, AdmonitionVariant, Anchor, AttributeName, AttributeValue, Audio, Author, Autolink,
53    Block, BlockMetadata, Bold, Button, CalloutList, ColumnFormat, ColumnStyle, ColumnWidth,
54    Comment, CrossReference, CurvedApostrophe, CurvedQuotation, DelimitedBlock, DelimitedBlockType,
55    DescriptionList, DescriptionListItem, DiscreteHeader, Document, DocumentAttribute,
56    DocumentAttributes, ElementAttributes, Footnote, Form, Header, Highlight, HorizontalAlignment,
57    ICON_SIZES, Icon, Image, InlineMacro, InlineNode, Italic, Keyboard, LineBreak, Link, ListItem,
58    ListItemCheckedStatus, Location, Mailto, Menu, Monospace, OrderedList, PageBreak, Paragraph,
59    Pass, PassthroughKind, Plain, Position, Raw, Role, Section, Source, StandaloneCurvedApostrophe,
60    Stem, StemContent, StemNotation, Subscript, Substitution, Subtitle, Superscript, Table,
61    TableColumn, TableOfContents, TableRow, ThematicBreak, Title, TocEntry, UnorderedList, Url,
62    Verbatim, VerticalAlignment, Video, inlines_to_string,
63};
64pub use options::{Options, OptionsBuilder, SafeMode};
65
/// Type-based parser for `AsciiDoc` content.
///
/// `Parser` provides a more discoverable, fluent API for parsing `AsciiDoc` documents.
/// It borrows the source text for the `'input` lifetime and owns the [`Options`]
/// that are handed to the parsing functions.
///
/// # Examples
///
/// Basic usage:
///
/// ```
/// use acdc_parser::Parser;
///
/// let content = "= Document Title\n\nParagraph text.";
/// let doc = Parser::new(content).parse()?;
/// # Ok::<(), acdc_parser::Error>(())
/// ```
///
/// With options:
///
/// ```
/// use acdc_parser::{Options, Parser, SafeMode};
///
/// let content = "= Document Title\n\nParagraph text.";
/// let options = Options::builder()
///     .with_safe_mode(SafeMode::Safe)
///     .with_timings()
///     .build();
///
/// let doc = Parser::new(content)
///     .with_options(options)
///     .parse()?;
/// # Ok::<(), acdc_parser::Error>(())
/// ```
///
/// For file-based parsing, read the file first:
///
/// ```no_run
/// use acdc_parser::Parser;
/// use std::fs;
///
/// let content = fs::read_to_string("document.adoc")?;
/// let doc = Parser::new(&content).parse()?;
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Debug)]
pub struct Parser<'input> {
    /// The `AsciiDoc` source text to parse, borrowed from the caller.
    input: &'input str,
    /// Options forwarded to the parsing functions when `parse` or `parse_inline`
    /// is called.
    options: Options,
}
115
116impl<'input> Parser<'input> {
117    /// Create a new parser for the given input string.
118    ///
119    /// The parser will use default options. Use `with_options` to customize.
120    ///
121    /// # Example
122    ///
123    /// ```
124    /// use acdc_parser::Parser;
125    ///
126    /// let parser = Parser::new("= Title\n\nContent");
127    /// let doc = parser.parse()?;
128    /// # Ok::<(), acdc_parser::Error>(())
129    /// ```
130    #[must_use]
131    pub fn new(input: &'input str) -> Self {
132        Self {
133            input,
134            options: Options::default(),
135        }
136    }
137
138    /// Set the options for this parser.
139    ///
140    /// This consumes the parser and returns a new one with the specified options.
141    ///
142    /// # Example
143    ///
144    /// ```
145    /// use acdc_core::SafeMode;
146    /// use acdc_parser::{Parser, Options};
147    ///
148    /// let options = Options::builder()
149    ///     .with_safe_mode(SafeMode::Safe)
150    ///     .build();
151    ///
152    /// let parser = Parser::new("= Title")
153    ///     .with_options(options);
154    /// # Ok::<(), acdc_parser::Error>(())
155    /// ```
156    #[must_use]
157    pub fn with_options(mut self, options: Options) -> Self {
158        self.options = options;
159        self
160    }
161
162    /// Parse the input into a Document.
163    ///
164    /// # Example
165    ///
166    /// ```
167    /// use acdc_parser::Parser;
168    ///
169    /// let doc = Parser::new("= Title\n\nContent").parse()?;
170    /// # Ok::<(), acdc_parser::Error>(())
171    /// ```
172    ///
173    /// # Errors
174    ///
175    /// Returns an error if the input cannot be parsed as valid `AsciiDoc`.
176    pub fn parse(self) -> Result<Document, Error> {
177        parse(self.input, &self.options)
178    }
179
180    /// Parse only inline elements from the input.
181    ///
182    /// This is useful for parsing fragments of `AsciiDoc` that contain only
183    /// inline markup like bold, italic, links, etc.
184    ///
185    /// # Example
186    ///
187    /// ```
188    /// use acdc_parser::Parser;
189    ///
190    /// let inlines = Parser::new("This is *bold* text").parse_inline()?;
191    /// # Ok::<(), acdc_parser::Error>(())
192    /// ```
193    ///
194    /// # Errors
195    ///
196    /// Returns an error if the input cannot be parsed.
197    pub fn parse_inline(self) -> Result<Vec<InlineNode>, Error> {
198        parse_inline(self.input, &self.options)
199    }
200}
201
202/// Parse `AsciiDoc` content from a reader.
203///
204/// This function reads the content from the provided reader and parses it as `AsciiDoc`.
205///
206/// # Example
207///
208/// ```
209/// use acdc_core::SafeMode;
210/// use acdc_parser::{Options, parse_from_reader};
211/// use std::fs::File;
212///
213/// let options = Options::builder()
214///     .with_safe_mode(SafeMode::Unsafe)
215///     .build();
216/// let file = File::open("fixtures/samples/README.adoc").unwrap();
217/// let document = parse_from_reader(file, &options).unwrap();
218/// ```
219///
220/// # Errors
221/// This function returns an error if the content cannot be parsed.
222#[instrument(skip(reader))]
223pub fn parse_from_reader<R: std::io::Read>(
224    reader: R,
225    options: &Options,
226) -> Result<Document, Error> {
227    let input = Preprocessor.process_reader(reader, options)?;
228    parse_input(&input, options, None)
229}
230
231/// Parse `AsciiDoc` content from a string.
232///
233/// This function parses the provided string as `AsciiDoc`.
234///
235/// # Example
236///
237/// ```
238/// use acdc_core::SafeMode;
239/// use acdc_parser::{Options, parse};
240///
241/// let options = Options::builder()
242///     .with_safe_mode(SafeMode::Unsafe)
243///     .build();
244/// let content = "= Document Title\n\nThis is a paragraph.\n\n== Section Title\n\nThis is a subsection.";
245/// let document = parse(content, &options).unwrap();
246/// ```
247///
248/// # Errors
249/// This function returns an error if the content cannot be parsed.
250#[instrument]
251pub fn parse(input: &str, options: &Options) -> Result<Document, Error> {
252    let input = Preprocessor.process(input, options)?;
253    parse_input(&input, options, None)
254}
255
256/// Parse `AsciiDoc` content from a file.
257///
258/// This function reads the content from the provided file and parses it as `AsciiDoc`.
259///
260/// # Example
261///
262/// ```
263/// use std::path::Path;
264///
265/// use acdc_core::SafeMode;
266/// use acdc_parser::{Options, parse_file};
267///
268/// let options = Options::builder()
269///     .with_safe_mode(SafeMode::Unsafe)
270///     .build();
271/// let file_path = Path::new("fixtures/samples/README.adoc");
272/// let document = parse_file(file_path, &options).unwrap();
273/// ```
274///
275/// # Errors
276/// This function returns an error if the content cannot be parsed.
277#[instrument(skip(file_path))]
278pub fn parse_file<P: AsRef<Path>>(file_path: P, options: &Options) -> Result<Document, Error> {
279    let path = file_path.as_ref().to_path_buf();
280    let input = Preprocessor.process_file(file_path, options)?;
281    parse_input(&input, options, Some(path))
282}
283
284/// Helper to convert a PEG parse error to our `SourceLocation` type
285fn peg_error_to_source_location(
286    error: &peg::error::ParseError<peg::str::LineCol>,
287    file: Option<PathBuf>,
288) -> SourceLocation {
289    SourceLocation {
290        file,
291        positioning: Positioning::Position(Position {
292            line: error.location.line,
293            column: error.location.column,
294        }),
295    }
296}
297
298#[instrument]
299fn parse_input(
300    input: &str,
301    options: &Options,
302    file_path: Option<PathBuf>,
303) -> Result<Document, Error> {
304    tracing::trace!(?input, "post preprocessor");
305    let mut state = grammar::ParserState::new(input);
306    state.document_attributes = options.document_attributes.clone();
307    state.options = options.clone();
308    state.current_file.clone_from(&file_path);
309    match grammar::document_parser::document(input, &mut state) {
310        Ok(doc) => doc,
311        Err(error) => {
312            tracing::error!(?error, "error parsing document content");
313            let source_location = peg_error_to_source_location(&error, file_path);
314            Err(Error::Parse(Box::new(source_location), error.to_string()))
315        }
316    }
317}
318
319/// Parse inline `AsciiDoc` content from a string.
320///
321/// This function parses the provided string as inline `AsciiDoc` elements, returning a
322/// vector of inline nodes instead of a complete document structure. This is useful for
323/// parsing fragments of `AsciiDoc` content that contain inline markup like emphasis,
324/// strong text, links, macros, and other inline elements.
325///
326/// NOTE: This function exists pretty much just for the sake of the TCK tests, which rely
327/// on an "inline" type output.
328///
329/// # Example
330///
331/// ```
332/// use acdc_core::SafeMode;
333/// use acdc_parser::{parse_inline, Options};
334///
335/// let options = Options::builder()
336///     .with_safe_mode(SafeMode::Unsafe)
337///     .build();
338/// let content = "This is *strong* text with a https://example.com[link].";
339/// let inline_nodes = parse_inline(content, &options).unwrap();
340/// ```
341///
342/// # Errors
343/// This function returns an error if the inline content cannot be parsed.
344#[instrument]
345pub fn parse_inline(input: &str, options: &Options) -> Result<Vec<InlineNode>, Error> {
346    tracing::trace!(?input, "post preprocessor");
347    let mut state = grammar::ParserState::new(input);
348    state.document_attributes = options.document_attributes.clone();
349    state.options = options.clone();
350    match grammar::document_parser::inlines(
351        input,
352        &mut state,
353        0,
354        &grammar::BlockParsingMetadata::default(),
355    ) {
356        Ok(inlines) => Ok(inlines),
357        Err(error) => {
358            tracing::error!(?error, "error parsing inline content");
359            Err(Error::Parse(
360                Box::new(peg_error_to_source_location(&error, None)),
361                error.to_string(),
362            ))
363        }
364    }
365}
366
367#[cfg(test)]
368mod proptests;
369
#[cfg(test)]
// Test code deliberately unwraps, expects and panics so that failures surface with a
// descriptive message.
#[allow(clippy::unwrap_used)]
#[allow(clippy::panic)]
#[allow(clippy::expect_used)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Parse every `.adoc` fixture and compare the outcome with the sibling `.json`
    /// file: a serialized `Document` when parsing succeeds, or a serialized `Error`
    /// when it fails.
    #[rstest::rstest]
    #[tracing_test::traced_test]
    fn test_with_fixtures(
        #[files("fixtures/tests/**/*.adoc")] path: std::path::PathBuf,
    ) -> Result<(), Error> {
        let test_file_path = path.with_extension("json");
        let options = Options::builder().with_safe_mode(SafeMode::Unsafe).build();

        // We do this check because we have files that won't have a test file, namely ones
        // that are supposed to error out!
        if test_file_path.exists() {
            let test_file_contents = std::fs::read_to_string(test_file_path)?;
            match parse_file(&path, &options) {
                Ok(result) => {
                    // Round-trip the expected document through serde so both sides are
                    // compared in the same serialized form.
                    let result_str =
                        serde_json::to_string(&result).expect("could not serialize result");
                    let test: Document = serde_json::from_str(&test_file_contents)
                        .expect("could not deserialize test");
                    let test_str = serde_json::to_string(&test).expect("could not serialize test");
                    assert_eq!(test_str, result_str);
                }
                Err(e) => {
                    // A failing parse is only acceptable if the expectation file holds
                    // the very same error.
                    let test: Error = serde_json::from_str(&test_file_contents)
                        .expect("could not deserialize test");
                    assert_eq!(test.to_string(), e.to_string());
                }
            }
        } else {
            tracing::warn!(?path, "test file not found");
        }
        Ok(())
    }

    /// Regression tests for document locations on empty, whitespace-only and
    /// non-ASCII inputs.
    mod empty_document_tests {
        use crate::{Options, parse};

        /// Whitespace-only input must parse and yield a valid document location.
        #[test]
        fn test_whitespace_only_documents() {
            let test_cases = [
                "\n", "\n\n", "\t", " \n\t\n ", "   ",
                /* The original proptest failing case -> */ "\n\n\t",
            ];

            for input in test_cases {
                let options = Options::default();
                let result = parse(input, &options);

                match result {
                    Ok(doc) => {
                        // Validate the invariant using absolute offsets
                        assert!(
                            doc.location.absolute_start <= doc.location.absolute_end,
                            "Failed for input {input:?}: absolute_start {} > absolute_end {}",
                            doc.location.absolute_start,
                            doc.location.absolute_end
                        );

                        // Validate with our helper
                        doc.location.validate(input).unwrap_or_else(|e| {
                            panic!("Location validation failed for {input:?}: {e}")
                        });
                    }
                    Err(e) => {
                        panic!("Failed to parse {input:?}: {e}");
                    }
                }
            }
        }

        /// Leading whitespace followed by content must parse and yield a valid
        /// document location.
        #[test]
        fn test_document_with_content_after_whitespace() {
            let test_cases = ["\n\nHello", "\t\tWorld", "  \n  = Title"];

            for input in test_cases {
                let options = Options::default();
                let doc =
                    parse(input, &options).unwrap_or_else(|_| panic!("Should parse {input:?}"));

                assert!(
                    doc.location.absolute_start <= doc.location.absolute_end,
                    "Failed for input {input:?}: absolute_start {} > absolute_end {}",
                    doc.location.absolute_start,
                    doc.location.absolute_end
                );

                // Validate with our helper
                doc.location
                    .validate(input)
                    .unwrap_or_else(|e| panic!("Location validation failed for {input:?}: {e}"));
            }
        }

        /// Multi-byte input must never produce offsets that split a UTF-8 sequence.
        #[test]
        fn test_unicode_characters() {
            // Test that UTF-8 safety is maintained.
            // Non-ASCII code points are written as escapes so the inputs cannot be
            // corrupted by an editor or tool that mishandles the file encoding.
            let test_cases = [
                "\u{1F600}",               // 4-byte emoji (grinning face)
                "א",                       // 2-byte Hebrew
                "Hello \u{4E16}\u{754C}",  // Mixed content (ASCII + 3-byte CJK)
                "\u{200b}",                // Zero-width space
            ];

            for input in test_cases {
                let options = Options::default();
                let result = parse(input, &options);

                match result {
                    Ok(doc) => {
                        // All offsets should be on UTF-8 boundaries
                        assert!(
                            input.is_char_boundary(doc.location.absolute_start),
                            "Absolute start {} not on UTF-8 boundary for {input:?}",
                            doc.location.absolute_start,
                        );
                        assert!(
                            input.is_char_boundary(doc.location.absolute_end),
                            "Absolute end {} not on UTF-8 boundary for {input:?}",
                            doc.location.absolute_end,
                        );

                        // Validate with our helper
                        doc.location.validate(input).unwrap_or_else(|e| {
                            panic!("Location validation failed for {input:?}: {e}");
                        });
                    }
                    Err(e) => {
                        // Some of these might fail to parse, which is OK for now
                        // We're just testing that if they parse, the locations are valid
                        println!("Failed to parse {input:?}: {e} (this might be expected)");
                    }
                }
            }
        }
    }
}