acdc_parser/lib.rs
1#![deny(clippy::pedantic)]
2#![warn(clippy::all)]
3//! `AsciiDoc` parser.
4//!
5//! This module provides a parser for the `AsciiDoc` markup language. The parser is
6//! implemented using the `peg` parser generator.
7//!
8//! # Quick Start
9//!
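//! The crate exposes free functions for parsing `AsciiDoc` content, plus a [`Parser`]
//! struct that wraps `parse` and `parse_inline` in a fluent API:
//!
//! - [`parse`]: parses a string containing `AsciiDoc` content.
//! - [`parse_file`]: parses the content of a file containing `AsciiDoc` content.
//! - [`parse_from_reader`]: parses `AsciiDoc` content read from a [`std::io::Read`] source.
//! - [`parse_inline`]: parses a string as inline `AsciiDoc` elements only.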
15//!
16//! ```rust
17//!
18//! use acdc_parser::{Document, parse};
19//!
20//! let content = r#"= Document Title
21//!
22//! This is a paragraph.
23//!
24//! == Section Title
25//!
26//! This is a subsection."#;
27//!
28//! let options = acdc_parser::Options::default();
29//! let document = parse(content, &options).unwrap();
30//!
//! println!("{:?}", document);
//! ```
32use std::{
33 path::{Path, PathBuf},
34 string::ToString,
35};
36
37use tracing::instrument;
38
39mod blocks;
40mod constants;
41mod error;
42pub(crate) mod grammar;
43mod model;
44mod options;
45mod preprocessor;
46mod safe_mode;
47
48pub(crate) use grammar::{InlinePreprocessorParserState, ProcessedContent, inline_preprocessing};
49use preprocessor::Preprocessor;
50
51pub use error::{Error, Positioning, SourceLocation};
52pub use model::{
53 Admonition, AdmonitionVariant, Anchor, AttributeName, AttributeValue, Audio, Author, Autolink,
54 Block, BlockMetadata, Bold, Button, CalloutList, ColumnFormat, ColumnStyle, ColumnWidth,
55 Comment, CrossReference, CurvedApostrophe, CurvedQuotation, DelimitedBlock, DelimitedBlockType,
56 DescriptionList, DescriptionListItem, DiscreteHeader, Document, DocumentAttribute,
57 DocumentAttributes, ElementAttributes, Footnote, Form, Header, Highlight, HorizontalAlignment,
58 ICON_SIZES, Icon, Image, InlineMacro, InlineNode, Italic, Keyboard, LineBreak, Link, ListItem,
59 ListItemCheckedStatus, Location, Mailto, Menu, Monospace, OrderedList, PageBreak, Paragraph,
60 Pass, PassthroughKind, Plain, Position, Raw, Role, Section, Source, StandaloneCurvedApostrophe,
61 Stem, StemContent, StemNotation, Subscript, Substitution, Subtitle, Superscript, Table,
62 TableColumn, TableOfContents, TableRow, ThematicBreak, Title, TocEntry, UnorderedList, Url,
63 Verbatim, VerticalAlignment, Video, inlines_to_string,
64};
65pub use options::{Options, OptionsBuilder, SafeMode};
66
67/// Type-based parser for `AsciiDoc` content.
68///
69/// `Parser` provides a more discoverable, fluent API for parsing `AsciiDoc` documents.
70///
71/// # Examples
72///
73/// Basic usage:
74///
75/// ```
76/// use acdc_parser::Parser;
77///
78/// let content = "= Document Title\n\nParagraph text.";
79/// let doc = Parser::new(content).parse()?;
80/// # Ok::<(), acdc_parser::Error>(())
81/// ```
82///
83/// With options:
84///
85/// ```
86/// use acdc_parser::{Parser, Options, SafeMode};
87///
88/// let content = "= Document Title\n\nParagraph text.";
89/// let options = Options::builder()
90/// .with_safe_mode(SafeMode::Safe)
91/// .with_timings()
92/// .build();
93///
94/// let doc = Parser::new(content)
95/// .with_options(options)
96/// .parse()?;
97/// # Ok::<(), acdc_parser::Error>(())
98/// ```
99///
100/// For file-based parsing, read the file first:
101///
102/// ```no_run
103/// use acdc_parser::Parser;
104/// use std::fs;
105///
106/// let content = fs::read_to_string("document.adoc")?;
107/// let doc = Parser::new(&content).parse()?;
108/// # Ok::<(), Box<dyn std::error::Error>>(())
109/// ```
110#[derive(Debug)]
111pub struct Parser<'input> {
112 input: &'input str,
113 options: Options,
114}
115
116impl<'input> Parser<'input> {
117 /// Create a new parser for the given input string.
118 ///
119 /// The parser will use default options. Use `with_options` to customize.
120 ///
121 /// # Example
122 ///
123 /// ```
124 /// use acdc_parser::Parser;
125 ///
126 /// let parser = Parser::new("= Title\n\nContent");
127 /// let doc = parser.parse()?;
128 /// # Ok::<(), acdc_parser::Error>(())
129 /// ```
130 #[must_use]
131 pub fn new(input: &'input str) -> Self {
132 Self {
133 input,
134 options: Options::default(),
135 }
136 }
137
138 /// Set the options for this parser.
139 ///
140 /// This consumes the parser and returns a new one with the specified options.
141 ///
142 /// # Example
143 ///
144 /// ```
145 /// use acdc_parser::{Parser, Options, SafeMode};
146 ///
147 /// let options = Options::builder()
148 /// .with_safe_mode(SafeMode::Safe)
149 /// .build();
150 ///
151 /// let parser = Parser::new("= Title")
152 /// .with_options(options);
153 /// # Ok::<(), acdc_parser::Error>(())
154 /// ```
155 #[must_use]
156 pub fn with_options(mut self, options: Options) -> Self {
157 self.options = options;
158 self
159 }
160
161 /// Parse the input into a Document.
162 ///
163 /// # Example
164 ///
165 /// ```
166 /// use acdc_parser::Parser;
167 ///
168 /// let doc = Parser::new("= Title\n\nContent").parse()?;
169 /// # Ok::<(), acdc_parser::Error>(())
170 /// ```
171 ///
172 /// # Errors
173 ///
174 /// Returns an error if the input cannot be parsed as valid `AsciiDoc`.
175 pub fn parse(self) -> Result<Document, Error> {
176 parse(self.input, &self.options)
177 }
178
179 /// Parse only inline elements from the input.
180 ///
181 /// This is useful for parsing fragments of `AsciiDoc` that contain only
182 /// inline markup like bold, italic, links, etc.
183 ///
184 /// # Example
185 ///
186 /// ```
187 /// use acdc_parser::Parser;
188 ///
189 /// let inlines = Parser::new("This is *bold* text").parse_inline()?;
190 /// # Ok::<(), acdc_parser::Error>(())
191 /// ```
192 ///
193 /// # Errors
194 ///
195 /// Returns an error if the input cannot be parsed.
196 pub fn parse_inline(self) -> Result<Vec<InlineNode>, Error> {
197 parse_inline(self.input, &self.options)
198 }
199}
200
201/// Parse `AsciiDoc` content from a reader.
202///
203/// This function reads the content from the provided reader and parses it as `AsciiDoc`.
204///
205/// # Example
206///
207/// ```
208/// use acdc_parser::{Options, SafeMode, parse_from_reader};
209/// use std::fs::File;
210///
211/// let options = Options::builder()
212/// .with_safe_mode(SafeMode::Unsafe)
213/// .build();
214/// let file = File::open("fixtures/samples/README.adoc").unwrap();
215/// let document = parse_from_reader(file, &options).unwrap();
216/// ```
217///
218/// # Errors
219/// This function returns an error if the content cannot be parsed.
220#[instrument(skip(reader))]
221pub fn parse_from_reader<R: std::io::Read>(
222 reader: R,
223 options: &Options,
224) -> Result<Document, Error> {
225 let input = Preprocessor.process_reader(reader, options)?;
226 parse_input(&input, options, None)
227}
228
229/// Parse `AsciiDoc` content from a string.
230///
231/// This function parses the provided string as `AsciiDoc`.
232///
233/// # Example
234///
235/// ```
236/// use acdc_parser::{Options, SafeMode, parse};
237///
238/// let options = Options::builder()
239/// .with_safe_mode(SafeMode::Unsafe)
240/// .build();
241/// let content = "= Document Title\n\nThis is a paragraph.\n\n== Section Title\n\nThis is a subsection.";
242/// let document = parse(content, &options).unwrap();
243/// ```
244///
245/// # Errors
246/// This function returns an error if the content cannot be parsed.
247#[instrument]
248pub fn parse(input: &str, options: &Options) -> Result<Document, Error> {
249 let input = Preprocessor.process(input, options)?;
250 parse_input(&input, options, None)
251}
252
253/// Parse `AsciiDoc` content from a file.
254///
255/// This function reads the content from the provided file and parses it as `AsciiDoc`.
256///
257/// # Example
258///
259/// ```
260/// use std::path::Path;
261/// use acdc_parser::{Options, SafeMode, parse_file};
262///
263/// let options = Options::builder()
264/// .with_safe_mode(SafeMode::Unsafe)
265/// .build();
266/// let file_path = Path::new("fixtures/samples/README.adoc");
267/// let document = parse_file(file_path, &options).unwrap();
268/// ```
269///
270/// # Errors
271/// This function returns an error if the content cannot be parsed.
272#[instrument(skip(file_path))]
273pub fn parse_file<P: AsRef<Path>>(file_path: P, options: &Options) -> Result<Document, Error> {
274 let path = file_path.as_ref().to_path_buf();
275 let input = Preprocessor.process_file(file_path, options)?;
276 parse_input(&input, options, Some(path))
277}
278
279/// Helper to convert a PEG parse error to our `SourceLocation` type
280fn peg_error_to_source_location(
281 error: &peg::error::ParseError<peg::str::LineCol>,
282 file: Option<PathBuf>,
283) -> SourceLocation {
284 SourceLocation {
285 file,
286 positioning: Positioning::Position(Position {
287 line: error.location.line,
288 column: error.location.column,
289 }),
290 }
291}
292
293#[instrument]
294fn parse_input(
295 input: &str,
296 options: &Options,
297 file_path: Option<PathBuf>,
298) -> Result<Document, Error> {
299 tracing::trace!(?input, "post preprocessor");
300 let mut state = grammar::ParserState::new(input);
301 state.document_attributes = options.document_attributes.clone();
302 state.options = options.clone();
303 state.current_file.clone_from(&file_path);
304 match grammar::document_parser::document(input, &mut state) {
305 Ok(doc) => doc,
306 Err(error) => {
307 tracing::error!(?error, "error parsing document content");
308 let source_location = peg_error_to_source_location(&error, file_path);
309 Err(Error::Parse(Box::new(source_location), error.to_string()))
310 }
311 }
312}
313
314/// Parse inline `AsciiDoc` content from a string.
315///
316/// This function parses the provided string as inline `AsciiDoc` elements, returning a
317/// vector of inline nodes instead of a complete document structure. This is useful for
318/// parsing fragments of `AsciiDoc` content that contain inline markup like emphasis,
319/// strong text, links, macros, and other inline elements.
320///
321/// NOTE: This function exists pretty much just for the sake of the TCK tests, which rely
322/// on an "inline" type output.
323///
324/// # Example
325///
326/// ```
327/// use acdc_parser::{Options, SafeMode, parse_inline};
328///
329/// let options = Options::builder()
330/// .with_safe_mode(SafeMode::Unsafe)
331/// .build();
332/// let content = "This is *strong* text with a https://example.com[link].";
333/// let inline_nodes = parse_inline(content, &options).unwrap();
334/// ```
335///
336/// # Errors
337/// This function returns an error if the inline content cannot be parsed.
338#[instrument]
339pub fn parse_inline(input: &str, options: &Options) -> Result<Vec<InlineNode>, Error> {
340 tracing::trace!(?input, "post preprocessor");
341 let mut state = grammar::ParserState::new(input);
342 state.document_attributes = options.document_attributes.clone();
343 state.options = options.clone();
344 match grammar::document_parser::inlines(
345 input,
346 &mut state,
347 0,
348 &grammar::BlockParsingMetadata::default(),
349 ) {
350 Ok(inlines) => Ok(inlines),
351 Err(error) => {
352 tracing::error!(?error, "error parsing inline content");
353 Err(Error::Parse(
354 Box::new(peg_error_to_source_location(&error, None)),
355 error.to_string(),
356 ))
357 }
358 }
359}
360
361#[cfg(test)]
362mod proptests;
363
#[cfg(test)]
#[allow(clippy::unwrap_used)]
#[allow(clippy::panic)]
#[allow(clippy::expect_used)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Parse every `.adoc` fixture and compare the outcome with its sibling `.json` file.
    ///
    /// A successful parse is compared (via its JSON serialisation) with the expected
    /// document; a failed parse is compared with the expected error's display string.
    #[rstest::rstest]
    #[tracing_test::traced_test]
    fn test_with_fixtures(
        #[files("fixtures/tests/**/*.adoc")] path: std::path::PathBuf,
    ) -> Result<(), Error> {
        let test_file_path = path.with_extension("json");
        let options = Options::builder().with_safe_mode(SafeMode::Unsafe).build();

        // We do this check because we have files that won't have a test file, namely ones
        // that are supposed to error out!
        if test_file_path.exists() {
            let test_file_contents = std::fs::read_to_string(test_file_path)?;
            match parse_file(&path, &options) {
                Ok(result) => {
                    let result_str =
                        serde_json::to_string(&result).expect("could not serialize result");
                    // Round-trip the expectation through `Document` so both sides are
                    // serialised the same way before comparing.
                    let test: Document = serde_json::from_str(&test_file_contents)
                        .expect("could not deserialize test");
                    let test_str = serde_json::to_string(&test).expect("could not serialize test");
                    assert_eq!(test_str, result_str);
                }
                Err(e) => {
                    let test: Error = serde_json::from_str(&test_file_contents)
                        .expect("could not deserialize test");
                    assert_eq!(test.to_string(), e.to_string());
                }
            }
        } else {
            tracing::warn!(?path, "test file not found");
        }
        Ok(())
    }

    #[cfg(test)]
    mod empty_document_tests {
        use crate::{Options, parse};

        /// Whitespace-only inputs must parse and yield a valid document location.
        #[test]
        fn test_whitespace_only_documents() {
            let test_cases = [
                "\n", "\n\n", "\t", " \n\t\n ", " ",
                /* The original proptest failing case -> */ "\n\n\t",
            ];

            for input in test_cases {
                let options = Options::default();
                let result = parse(input, &options);

                match result {
                    Ok(doc) => {
                        // Validate the invariant using absolute offsets
                        assert!(
                            doc.location.absolute_start <= doc.location.absolute_end,
                            "Failed for input {input:?}: absolute_start {} > absolute_end {}",
                            doc.location.absolute_start,
                            doc.location.absolute_end
                        );

                        // Validate with our helper
                        doc.location.validate(input).unwrap_or_else(|e| {
                            panic!("Location validation failed for {input:?}: {e}")
                        });
                    }
                    Err(e) => {
                        panic!("Failed to parse {input:?}: {e}");
                    }
                }
            }
        }

        /// Leading whitespace followed by content must parse with a valid location.
        #[test]
        fn test_document_with_content_after_whitespace() {
            let test_cases = ["\n\nHello", "\t\tWorld", " \n = Title"];

            for input in test_cases {
                let options = Options::default();
                let doc =
                    parse(input, &options).unwrap_or_else(|_| panic!("Should parse {input:?}"));

                assert!(
                    doc.location.absolute_start <= doc.location.absolute_end,
                    "Failed for input {input:?}: absolute_start {} > absolute_end {}",
                    doc.location.absolute_start,
                    doc.location.absolute_end
                );

                // Validate with our helper
                doc.location
                    .validate(input)
                    .unwrap_or_else(|e| panic!("Location validation failed for {input:?}: {e}"));
            }
        }

        /// Document offsets must land on UTF-8 character boundaries for multi-byte input.
        #[test]
        fn test_unicode_characters() {
            // Test that UTF-8 safety is maintained.
            //
            // The inputs are written as `\u{...}` escapes so that the multi-byte
            // sequences survive any re-encoding of this source file.
            let test_cases = [
                "\u{1F600}",              // 4-byte emoji
                "\u{05D0}",               // 2-byte Hebrew
                "Hello \u{4E16}\u{754C}", // Mixed content
                "\u{200b}",               // Zero-width space
            ];

            for input in test_cases {
                let options = Options::default();
                let result = parse(input, &options);

                match result {
                    Ok(doc) => {
                        // All offsets should be on UTF-8 boundaries
                        assert!(
                            input.is_char_boundary(doc.location.absolute_start),
                            "Absolute start {} not on UTF-8 boundary for {input:?}",
                            doc.location.absolute_start,
                        );
                        assert!(
                            input.is_char_boundary(doc.location.absolute_end),
                            "Absolute end {} not on UTF-8 boundary for {input:?}",
                            doc.location.absolute_end,
                        );

                        // Validate with our helper
                        doc.location.validate(input).unwrap_or_else(|e| {
                            panic!("Location validation failed for {input:?}: {e}");
                        });
                    }
                    Err(e) => {
                        // Some of these might fail to parse, which is OK for now
                        // We're just testing that if they parse, the locations are valid
                        println!("Failed to parse {input:?}: {e} (this might be expected)",);
                    }
                }
            }
        }
    }
}