acdc_parser/lib.rs
1#![deny(clippy::pedantic)]
2#![warn(clippy::all)]
3//! `AsciiDoc` parser.
4//!
5//! This module provides a parser for the `AsciiDoc` markup language. The parser is
6//! implemented using the `peg` parser generator.
7//!
8//! # Quick Start
9//!
//! Parsing is exposed through free functions; the [`Parser`] struct offers a fluent
//! wrapper around string parsing. Among the free functions for parsing `AsciiDoc` content:
12//!
13//! - `parse`: parses a string containing `AsciiDoc` content.
14//! - `parse_file`: parses the content of a file containing `AsciiDoc` content.
15//!
16//! ```rust
17//!
18//! use acdc_parser::{Document, parse};
19//!
20//! let content = r#"= Document Title
21//!
22//! This is a paragraph.
23//!
24//! == Section Title
25//!
26//! This is a subsection."#;
27//!
28//! let options = acdc_parser::Options::default();
29//! let document = parse(content, &options).unwrap();
30//!
//! println!("{:?}", document);
//! ```
32use std::{
33 path::{Path, PathBuf},
34 string::ToString,
35};
36
37use tracing::instrument;
38
39mod blocks;
40mod constants;
41mod error;
42pub(crate) mod grammar;
43mod model;
44mod options;
45mod preprocessor;
46mod safe_mode;
47
48pub(crate) use grammar::{InlinePreprocessorParserState, ProcessedContent, inline_preprocessing};
49use preprocessor::Preprocessor;
50
51pub use error::{Error, Positioning, SourceLocation};
52pub use model::{
53 Admonition, AdmonitionVariant, Anchor, AttributeName, AttributeValue, Audio, Author, Autolink,
54 Block, BlockMetadata, Bold, Button, CalloutList, CalloutListItem, CalloutRef, CalloutRefKind,
55 ColumnFormat, ColumnStyle, ColumnWidth, Comment, CrossReference, CurvedApostrophe,
56 CurvedQuotation, DelimitedBlock, DelimitedBlockType, DescriptionList, DescriptionListItem,
57 DiscreteHeader, Document, DocumentAttribute, DocumentAttributes, ElementAttributes, Footnote,
58 Form, Header, Highlight, HorizontalAlignment, ICON_SIZES, Icon, Image, IndexTerm,
59 IndexTermKind, InlineMacro, InlineNode, Italic, Keyboard, LineBreak, Link, ListItem,
60 ListItemCheckedStatus, Location, Mailto, Menu, Monospace, OrderedList, PageBreak, Paragraph,
61 Pass, PassthroughKind, Plain, Position, Raw, Role, Section, Source, StandaloneCurvedApostrophe,
62 Stem, StemContent, StemNotation, Subscript, Substitution, Subtitle, Superscript, Table,
63 TableColumn, TableOfContents, TableRow, ThematicBreak, Title, TocEntry, UnorderedList, Url,
64 Verbatim, VerticalAlignment, Video, inlines_to_string,
65};
66pub use options::{Options, OptionsBuilder, SafeMode};
67
68/// Type-based parser for `AsciiDoc` content.
69///
70/// `Parser` provides a more discoverable, fluent API for parsing `AsciiDoc` documents.
71///
72/// # Examples
73///
74/// Basic usage:
75///
76/// ```
77/// use acdc_parser::Parser;
78///
79/// let content = "= Document Title\n\nParagraph text.";
80/// let doc = Parser::new(content).parse()?;
81/// # Ok::<(), acdc_parser::Error>(())
82/// ```
83///
84/// With options:
85///
86/// ```
87/// use acdc_parser::{Parser, Options, SafeMode};
88///
89/// let content = "= Document Title\n\nParagraph text.";
90/// let options = Options::builder()
91/// .with_safe_mode(SafeMode::Safe)
92/// .with_timings()
93/// .build();
94///
95/// let doc = Parser::new(content)
96/// .with_options(options)
97/// .parse()?;
98/// # Ok::<(), acdc_parser::Error>(())
99/// ```
100///
101/// For file-based parsing, read the file first:
102///
103/// ```no_run
104/// use acdc_parser::Parser;
105/// use std::fs;
106///
107/// let content = fs::read_to_string("document.adoc")?;
108/// let doc = Parser::new(&content).parse()?;
109/// # Ok::<(), Box<dyn std::error::Error>>(())
110/// ```
111#[derive(Debug)]
112pub struct Parser<'input> {
113 input: &'input str,
114 options: Options,
115}
116
117impl<'input> Parser<'input> {
118 /// Create a new parser for the given input string.
119 ///
120 /// The parser will use default options. Use `with_options` to customize.
121 ///
122 /// # Example
123 ///
124 /// ```
125 /// use acdc_parser::Parser;
126 ///
127 /// let parser = Parser::new("= Title\n\nContent");
128 /// let doc = parser.parse()?;
129 /// # Ok::<(), acdc_parser::Error>(())
130 /// ```
131 #[must_use]
132 pub fn new(input: &'input str) -> Self {
133 Self {
134 input,
135 options: Options::default(),
136 }
137 }
138
139 /// Set the options for this parser.
140 ///
141 /// This consumes the parser and returns a new one with the specified options.
142 ///
143 /// # Example
144 ///
145 /// ```
146 /// use acdc_parser::{Parser, Options, SafeMode};
147 ///
148 /// let options = Options::builder()
149 /// .with_safe_mode(SafeMode::Safe)
150 /// .build();
151 ///
152 /// let parser = Parser::new("= Title")
153 /// .with_options(options);
154 /// # Ok::<(), acdc_parser::Error>(())
155 /// ```
156 #[must_use]
157 pub fn with_options(mut self, options: Options) -> Self {
158 self.options = options;
159 self
160 }
161
162 /// Parse the input into a Document.
163 ///
164 /// # Example
165 ///
166 /// ```
167 /// use acdc_parser::Parser;
168 ///
169 /// let doc = Parser::new("= Title\n\nContent").parse()?;
170 /// # Ok::<(), acdc_parser::Error>(())
171 /// ```
172 ///
173 /// # Errors
174 ///
175 /// Returns an error if the input cannot be parsed as valid `AsciiDoc`.
176 pub fn parse(self) -> Result<Document, Error> {
177 parse(self.input, &self.options)
178 }
179
180 /// Parse only inline elements from the input.
181 ///
182 /// This is useful for parsing fragments of `AsciiDoc` that contain only
183 /// inline markup like bold, italic, links, etc.
184 ///
185 /// # Example
186 ///
187 /// ```
188 /// use acdc_parser::Parser;
189 ///
190 /// let inlines = Parser::new("This is *bold* text").parse_inline()?;
191 /// # Ok::<(), acdc_parser::Error>(())
192 /// ```
193 ///
194 /// # Errors
195 ///
196 /// Returns an error if the input cannot be parsed.
197 pub fn parse_inline(self) -> Result<Vec<InlineNode>, Error> {
198 parse_inline(self.input, &self.options)
199 }
200}
201
202/// Parse `AsciiDoc` content from a reader.
203///
204/// This function reads the content from the provided reader and parses it as `AsciiDoc`.
205///
206/// # Example
207///
208/// ```
209/// use acdc_parser::{Options, SafeMode, parse_from_reader};
210/// use std::fs::File;
211///
212/// let options = Options::builder()
213/// .with_safe_mode(SafeMode::Unsafe)
214/// .build();
215/// let file = File::open("fixtures/samples/README.adoc").unwrap();
216/// let document = parse_from_reader(file, &options).unwrap();
217/// ```
218///
219/// # Errors
220/// This function returns an error if the content cannot be parsed.
221#[instrument(skip(reader))]
222pub fn parse_from_reader<R: std::io::Read>(
223 reader: R,
224 options: &Options,
225) -> Result<Document, Error> {
226 let input = Preprocessor.process_reader(reader, options)?;
227 parse_input(&input, options, None)
228}
229
230/// Parse `AsciiDoc` content from a string.
231///
232/// This function parses the provided string as `AsciiDoc`.
233///
234/// # Example
235///
236/// ```
237/// use acdc_parser::{Options, SafeMode, parse};
238///
239/// let options = Options::builder()
240/// .with_safe_mode(SafeMode::Unsafe)
241/// .build();
242/// let content = "= Document Title\n\nThis is a paragraph.\n\n== Section Title\n\nThis is a subsection.";
243/// let document = parse(content, &options).unwrap();
244/// ```
245///
246/// # Errors
247/// This function returns an error if the content cannot be parsed.
248#[instrument]
249pub fn parse(input: &str, options: &Options) -> Result<Document, Error> {
250 let input = Preprocessor.process(input, options)?;
251 parse_input(&input, options, None)
252}
253
254/// Parse `AsciiDoc` content from a file.
255///
256/// This function reads the content from the provided file and parses it as `AsciiDoc`.
257///
258/// # Example
259///
260/// ```
261/// use std::path::Path;
262/// use acdc_parser::{Options, SafeMode, parse_file};
263///
264/// let options = Options::builder()
265/// .with_safe_mode(SafeMode::Unsafe)
266/// .build();
267/// let file_path = Path::new("fixtures/samples/README.adoc");
268/// let document = parse_file(file_path, &options).unwrap();
269/// ```
270///
271/// # Errors
272/// This function returns an error if the content cannot be parsed.
273#[instrument(skip(file_path))]
274pub fn parse_file<P: AsRef<Path>>(file_path: P, options: &Options) -> Result<Document, Error> {
275 let path = file_path.as_ref().to_path_buf();
276 let input = Preprocessor.process_file(file_path, options)?;
277 parse_input(&input, options, Some(path))
278}
279
280/// Helper to convert a PEG parse error to our `SourceLocation` type
281fn peg_error_to_source_location(
282 error: &peg::error::ParseError<peg::str::LineCol>,
283 file: Option<PathBuf>,
284) -> SourceLocation {
285 SourceLocation {
286 file,
287 positioning: Positioning::Position(Position {
288 line: error.location.line,
289 column: error.location.column,
290 }),
291 }
292}
293
294#[instrument]
295fn parse_input(
296 input: &str,
297 options: &Options,
298 file_path: Option<PathBuf>,
299) -> Result<Document, Error> {
300 tracing::trace!(?input, "post preprocessor");
301 let mut state = grammar::ParserState::new(input);
302 state.document_attributes = options.document_attributes.clone();
303 state.options = options.clone();
304 state.current_file.clone_from(&file_path);
305 match grammar::document_parser::document(input, &mut state) {
306 Ok(doc) => doc,
307 Err(error) => {
308 tracing::error!(?error, "error parsing document content");
309 let source_location = peg_error_to_source_location(&error, file_path);
310 Err(Error::Parse(Box::new(source_location), error.to_string()))
311 }
312 }
313}
314
315/// Parse inline `AsciiDoc` content from a string.
316///
317/// This function parses the provided string as inline `AsciiDoc` elements, returning a
318/// vector of inline nodes instead of a complete document structure. This is useful for
319/// parsing fragments of `AsciiDoc` content that contain inline markup like emphasis,
320/// strong text, links, macros, and other inline elements.
321///
322/// NOTE: This function exists pretty much just for the sake of the TCK tests, which rely
323/// on an "inline" type output.
324///
325/// # Example
326///
327/// ```
328/// use acdc_parser::{Options, SafeMode, parse_inline};
329///
330/// let options = Options::builder()
331/// .with_safe_mode(SafeMode::Unsafe)
332/// .build();
333/// let content = "This is *strong* text with a https://example.com[link].";
334/// let inline_nodes = parse_inline(content, &options).unwrap();
335/// ```
336///
337/// # Errors
338/// This function returns an error if the inline content cannot be parsed.
339#[instrument]
340pub fn parse_inline(input: &str, options: &Options) -> Result<Vec<InlineNode>, Error> {
341 tracing::trace!(?input, "post preprocessor");
342 let mut state = grammar::ParserState::new(input);
343 state.document_attributes = options.document_attributes.clone();
344 state.options = options.clone();
345 match grammar::document_parser::inlines(
346 input,
347 &mut state,
348 0,
349 &grammar::BlockParsingMetadata::default(),
350 ) {
351 Ok(inlines) => Ok(inlines),
352 Err(error) => {
353 tracing::error!(?error, "error parsing inline content");
354 Err(Error::Parse(
355 Box::new(peg_error_to_source_location(&error, None)),
356 error.to_string(),
357 ))
358 }
359 }
360}
361
362#[cfg(test)]
363mod proptests;
364
#[cfg(test)]
// Tests are allowed to unwrap, expect and panic: a failure there is the test failing.
#[allow(clippy::unwrap_used)]
#[allow(clippy::panic)]
#[allow(clippy::expect_used)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Parse every `.adoc` fixture and compare the outcome with the `.json` file that
    /// sits next to it, when one exists.
    #[rstest::rstest]
    #[tracing_test::traced_test]
    fn test_with_fixtures(
        #[files("fixtures/tests/**/*.adoc")] path: std::path::PathBuf,
    ) -> Result<(), Error> {
        let test_file_path = path.with_extension("json");
        let options = Options::builder().with_safe_mode(SafeMode::Unsafe).build();

        // We do this check because we have files that won't have a test file, namely ones
        // that are supposed to error out!
        if test_file_path.exists() {
            let test_file_contents = std::fs::read_to_string(test_file_path)?;
            match parse_file(&path, &options) {
                Ok(result) => {
                    // Round-trip the expected document through serde so both sides are
                    // compared in the same serialized form.
                    let result_str =
                        serde_json::to_string(&result).expect("could not serialize result");
                    let test: Document = serde_json::from_str(&test_file_contents)
                        .expect("could not deserialize test");
                    let test_str = serde_json::to_string(&test).expect("could not serialize test");
                    assert_eq!(test_str, result_str);
                }
                Err(e) => {
                    // When parsing fails, the JSON file is expected to hold the error.
                    let test: Error = serde_json::from_str(&test_file_contents)
                        .expect("could not deserialize test");
                    assert_eq!(test.to_string(), e.to_string());
                }
            }
        } else {
            tracing::warn!(?path, "test file not found");
        }
        Ok(())
    }

    #[cfg(test)]
    mod empty_document_tests {
        use crate::{Options, parse};

        /// Whitespace-only input must parse and yield a consistent document location.
        #[test]
        fn test_whitespace_only_documents() {
            let test_cases = vec![
                "\n", "\n\n", "\t", " \n\t\n ", " ",
                /* The original proptest failing case -> */ "\n\n\t",
            ];

            for input in test_cases {
                let options = Options::default();
                let result = parse(input, &options);

                match result {
                    Ok(doc) => {
                        // Validate the invariant using absolute offsets
                        assert!(
                            doc.location.absolute_start <= doc.location.absolute_end,
                            "Failed for input {input:?}: absolute_start {} > absolute_end {}",
                            doc.location.absolute_start,
                            doc.location.absolute_end
                        );

                        // Validate with our helper
                        doc.location.validate(input).unwrap_or_else(|e| {
                            panic!("Location validation failed for {input:?}: {e}")
                        });
                    }
                    Err(e) => {
                        panic!("Failed to parse {input:?}: {e}");
                    }
                }
            }
        }

        /// Leading whitespace followed by content must parse with a consistent location.
        #[test]
        fn test_document_with_content_after_whitespace() {
            let test_cases = vec!["\n\nHello", "\t\tWorld", " \n = Title"];

            for input in test_cases {
                let options = Options::default();
                let doc =
                    parse(input, &options).unwrap_or_else(|_| panic!("Should parse {input:?}"));

                assert!(
                    doc.location.absolute_start <= doc.location.absolute_end,
                    "Failed for input {input:?}: absolute_start {} > absolute_end {}",
                    doc.location.absolute_start,
                    doc.location.absolute_end
                );

                // Validate with our helper
                doc.location
                    .validate(input)
                    .unwrap_or_else(|e| panic!("Location validation failed for {input:?}: {e}"));
            }
        }

        /// Document offsets must fall on UTF-8 character boundaries for multi-byte input.
        #[test]
        fn test_unicode_characters() {
            // Test that UTF-8 safety is maintained.
            //
            // The inputs are written as Unicode escapes so that the multi-byte sequences
            // under test cannot be altered by a re-encoding of this source file.
            let test_cases = vec![
                "\u{1F600}",              // 4-byte emoji (U+1F600)
                "\u{05D0}",               // 2-byte Hebrew letter (U+05D0)
                "Hello \u{4E16}\u{754C}", // Mixed content: ASCII plus two 3-byte characters
                "\u{200b}",               // Zero-width space
            ];

            for input in test_cases {
                let options = Options::default();
                let result = parse(input, &options);

                match result {
                    Ok(doc) => {
                        // All offsets should be on UTF-8 boundaries
                        assert!(
                            input.is_char_boundary(doc.location.absolute_start),
                            "Absolute start {} not on UTF-8 boundary for {input:?}",
                            doc.location.absolute_start,
                        );
                        assert!(
                            input.is_char_boundary(doc.location.absolute_end),
                            "Absolute end {} not on UTF-8 boundary for {input:?}",
                            doc.location.absolute_end,
                        );

                        // Validate with our helper
                        doc.location.validate(input).unwrap_or_else(|e| {
                            panic!("Location validation failed for {input:?}: {e}");
                        });
                    }
                    Err(e) => {
                        // Some of these might fail to parse, which is OK for now
                        // We're just testing that if they parse, the locations are valid
                        println!("Failed to parse {input:?}: {e} (this might be expected)");
                    }
                }
            }
        }
    }
}