acdc_parser/lib.rs
1#![deny(clippy::pedantic)]
2#![warn(clippy::all)]
3//! `AsciiDoc` parser.
4//!
5//! This module provides a parser for the `AsciiDoc` markup language. The parser is
6//! implemented using the `peg` parser generator.
7//!
8//! # Quick Start
9//!
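//! The simplest entry points are the free functions exported from this crate:
//!
//! - `parse`: parses a string containing `AsciiDoc` content.
//! - `parse_file`: parses the content of a file containing `AsciiDoc` content.
//! - `parse_from_reader`: parses `AsciiDoc` content read from any `std::io::Read`.
//! - `parse_inline`: parses a string containing only inline `AsciiDoc` markup.
//!
//! The `Parser` struct offers the same string-based parsing through a fluent API.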
15//!
16//! ```rust
17//!
18//! use acdc_parser::{Document, parse};
19//!
20//! let content = r#"= Document Title
21//!
22//! This is a paragraph.
23//!
24//! == Section Title
25//!
26//! This is a subsection."#;
27//!
28//! let options = acdc_parser::Options::default();
29//! let document = parse(content, &options).unwrap();
30//!
//! println!("{:?}", document);
//! ```
32use std::{
33 path::{Path, PathBuf},
34 string::ToString,
35};
36
37use tracing::instrument;
38
39mod blocks;
40mod constants;
41mod error;
42pub(crate) mod grammar;
43mod model;
44mod options;
45mod preprocessor;
46
47pub(crate) use grammar::{InlinePreprocessorParserState, ProcessedContent, inline_preprocessing};
48use preprocessor::Preprocessor;
49
50pub use error::{Error, Positioning, SourceLocation};
51pub use model::{
52 Admonition, AdmonitionVariant, Anchor, AttributeName, AttributeValue, Audio, Author, Autolink,
53 Block, BlockMetadata, Bold, Button, CalloutList, ColumnFormat, ColumnStyle, ColumnWidth,
54 Comment, CrossReference, CurvedApostrophe, CurvedQuotation, DelimitedBlock, DelimitedBlockType,
55 DescriptionList, DescriptionListItem, DiscreteHeader, Document, DocumentAttribute,
56 DocumentAttributes, ElementAttributes, Footnote, Form, Header, Highlight, HorizontalAlignment,
57 ICON_SIZES, Icon, Image, InlineMacro, InlineNode, Italic, Keyboard, LineBreak, Link, ListItem,
58 ListItemCheckedStatus, Location, Mailto, Menu, Monospace, OrderedList, PageBreak, Paragraph,
59 Pass, PassthroughKind, Plain, Position, Raw, Role, Section, Source, StandaloneCurvedApostrophe,
60 Stem, StemContent, StemNotation, Subscript, Substitution, Subtitle, Superscript, Table,
61 TableColumn, TableOfContents, TableRow, ThematicBreak, Title, TocEntry, UnorderedList, Url,
62 Verbatim, VerticalAlignment, Video, inlines_to_string,
63};
64pub use options::{Options, OptionsBuilder, SafeMode};
65
66/// Type-based parser for `AsciiDoc` content.
67///
68/// `Parser` provides a more discoverable, fluent API for parsing `AsciiDoc` documents.
69///
70/// # Examples
71///
72/// Basic usage:
73///
74/// ```
75/// use acdc_parser::Parser;
76///
77/// let content = "= Document Title\n\nParagraph text.";
78/// let doc = Parser::new(content).parse()?;
79/// # Ok::<(), acdc_parser::Error>(())
80/// ```
81///
82/// With options:
83///
84/// ```
85/// use acdc_core::SafeMode;
86/// use acdc_parser::{Parser, Options};
87///
88/// let content = "= Document Title\n\nParagraph text.";
89/// let options = Options::builder()
90/// .with_safe_mode(SafeMode::Safe)
91/// .with_timings()
92/// .build();
93///
94/// let doc = Parser::new(content)
95/// .with_options(options)
96/// .parse()?;
97/// # Ok::<(), acdc_parser::Error>(())
98/// ```
99///
100/// For file-based parsing, read the file first:
101///
102/// ```no_run
103/// use acdc_parser::Parser;
104/// use std::fs;
105///
106/// let content = fs::read_to_string("document.adoc")?;
107/// let doc = Parser::new(&content).parse()?;
108/// # Ok::<(), Box<dyn std::error::Error>>(())
109/// ```
110#[derive(Debug)]
111pub struct Parser<'input> {
112 input: &'input str,
113 options: Options,
114}
115
116impl<'input> Parser<'input> {
117 /// Create a new parser for the given input string.
118 ///
119 /// The parser will use default options. Use `with_options` to customize.
120 ///
121 /// # Example
122 ///
123 /// ```
124 /// use acdc_parser::Parser;
125 ///
126 /// let parser = Parser::new("= Title\n\nContent");
127 /// let doc = parser.parse()?;
128 /// # Ok::<(), acdc_parser::Error>(())
129 /// ```
130 #[must_use]
131 pub fn new(input: &'input str) -> Self {
132 Self {
133 input,
134 options: Options::default(),
135 }
136 }
137
138 /// Set the options for this parser.
139 ///
140 /// This consumes the parser and returns a new one with the specified options.
141 ///
142 /// # Example
143 ///
144 /// ```
145 /// use acdc_core::SafeMode;
146 /// use acdc_parser::{Parser, Options};
147 ///
148 /// let options = Options::builder()
149 /// .with_safe_mode(SafeMode::Safe)
150 /// .build();
151 ///
152 /// let parser = Parser::new("= Title")
153 /// .with_options(options);
154 /// # Ok::<(), acdc_parser::Error>(())
155 /// ```
156 #[must_use]
157 pub fn with_options(mut self, options: Options) -> Self {
158 self.options = options;
159 self
160 }
161
162 /// Parse the input into a Document.
163 ///
164 /// # Example
165 ///
166 /// ```
167 /// use acdc_parser::Parser;
168 ///
169 /// let doc = Parser::new("= Title\n\nContent").parse()?;
170 /// # Ok::<(), acdc_parser::Error>(())
171 /// ```
172 ///
173 /// # Errors
174 ///
175 /// Returns an error if the input cannot be parsed as valid `AsciiDoc`.
176 pub fn parse(self) -> Result<Document, Error> {
177 parse(self.input, &self.options)
178 }
179
180 /// Parse only inline elements from the input.
181 ///
182 /// This is useful for parsing fragments of `AsciiDoc` that contain only
183 /// inline markup like bold, italic, links, etc.
184 ///
185 /// # Example
186 ///
187 /// ```
188 /// use acdc_parser::Parser;
189 ///
190 /// let inlines = Parser::new("This is *bold* text").parse_inline()?;
191 /// # Ok::<(), acdc_parser::Error>(())
192 /// ```
193 ///
194 /// # Errors
195 ///
196 /// Returns an error if the input cannot be parsed.
197 pub fn parse_inline(self) -> Result<Vec<InlineNode>, Error> {
198 parse_inline(self.input, &self.options)
199 }
200}
201
202/// Parse `AsciiDoc` content from a reader.
203///
204/// This function reads the content from the provided reader and parses it as `AsciiDoc`.
205///
206/// # Example
207///
208/// ```
209/// use acdc_core::SafeMode;
210/// use acdc_parser::{Options, parse_from_reader};
211/// use std::fs::File;
212///
213/// let options = Options::builder()
214/// .with_safe_mode(SafeMode::Unsafe)
215/// .build();
216/// let file = File::open("fixtures/samples/README.adoc").unwrap();
217/// let document = parse_from_reader(file, &options).unwrap();
218/// ```
219///
220/// # Errors
221/// This function returns an error if the content cannot be parsed.
222#[instrument(skip(reader))]
223pub fn parse_from_reader<R: std::io::Read>(
224 reader: R,
225 options: &Options,
226) -> Result<Document, Error> {
227 let input = Preprocessor.process_reader(reader, options)?;
228 parse_input(&input, options, None)
229}
230
231/// Parse `AsciiDoc` content from a string.
232///
233/// This function parses the provided string as `AsciiDoc`.
234///
235/// # Example
236///
237/// ```
238/// use acdc_core::SafeMode;
239/// use acdc_parser::{Options, parse};
240///
241/// let options = Options::builder()
242/// .with_safe_mode(SafeMode::Unsafe)
243/// .build();
244/// let content = "= Document Title\n\nThis is a paragraph.\n\n== Section Title\n\nThis is a subsection.";
245/// let document = parse(content, &options).unwrap();
246/// ```
247///
248/// # Errors
249/// This function returns an error if the content cannot be parsed.
250#[instrument]
251pub fn parse(input: &str, options: &Options) -> Result<Document, Error> {
252 let input = Preprocessor.process(input, options)?;
253 parse_input(&input, options, None)
254}
255
256/// Parse `AsciiDoc` content from a file.
257///
258/// This function reads the content from the provided file and parses it as `AsciiDoc`.
259///
260/// # Example
261///
262/// ```
263/// use std::path::Path;
264///
265/// use acdc_core::SafeMode;
266/// use acdc_parser::{Options, parse_file};
267///
268/// let options = Options::builder()
269/// .with_safe_mode(SafeMode::Unsafe)
270/// .build();
271/// let file_path = Path::new("fixtures/samples/README.adoc");
272/// let document = parse_file(file_path, &options).unwrap();
273/// ```
274///
275/// # Errors
276/// This function returns an error if the content cannot be parsed.
277#[instrument(skip(file_path))]
278pub fn parse_file<P: AsRef<Path>>(file_path: P, options: &Options) -> Result<Document, Error> {
279 let path = file_path.as_ref().to_path_buf();
280 let input = Preprocessor.process_file(file_path, options)?;
281 parse_input(&input, options, Some(path))
282}
283
284/// Helper to convert a PEG parse error to our `SourceLocation` type
285fn peg_error_to_source_location(
286 error: &peg::error::ParseError<peg::str::LineCol>,
287 file: Option<PathBuf>,
288) -> SourceLocation {
289 SourceLocation {
290 file,
291 positioning: Positioning::Position(Position {
292 line: error.location.line,
293 column: error.location.column,
294 }),
295 }
296}
297
298#[instrument]
299fn parse_input(
300 input: &str,
301 options: &Options,
302 file_path: Option<PathBuf>,
303) -> Result<Document, Error> {
304 tracing::trace!(?input, "post preprocessor");
305 let mut state = grammar::ParserState::new(input);
306 state.document_attributes = options.document_attributes.clone();
307 state.options = options.clone();
308 state.current_file.clone_from(&file_path);
309 match grammar::document_parser::document(input, &mut state) {
310 Ok(doc) => doc,
311 Err(error) => {
312 tracing::error!(?error, "error parsing document content");
313 let source_location = peg_error_to_source_location(&error, file_path);
314 Err(Error::Parse(Box::new(source_location), error.to_string()))
315 }
316 }
317}
318
319/// Parse inline `AsciiDoc` content from a string.
320///
321/// This function parses the provided string as inline `AsciiDoc` elements, returning a
322/// vector of inline nodes instead of a complete document structure. This is useful for
323/// parsing fragments of `AsciiDoc` content that contain inline markup like emphasis,
324/// strong text, links, macros, and other inline elements.
325///
326/// NOTE: This function exists pretty much just for the sake of the TCK tests, which rely
327/// on an "inline" type output.
328///
329/// # Example
330///
331/// ```
332/// use acdc_core::SafeMode;
333/// use acdc_parser::{parse_inline, Options};
334///
335/// let options = Options::builder()
336/// .with_safe_mode(SafeMode::Unsafe)
337/// .build();
338/// let content = "This is *strong* text with a https://example.com[link].";
339/// let inline_nodes = parse_inline(content, &options).unwrap();
340/// ```
341///
342/// # Errors
343/// This function returns an error if the inline content cannot be parsed.
344#[instrument]
345pub fn parse_inline(input: &str, options: &Options) -> Result<Vec<InlineNode>, Error> {
346 tracing::trace!(?input, "post preprocessor");
347 let mut state = grammar::ParserState::new(input);
348 state.document_attributes = options.document_attributes.clone();
349 state.options = options.clone();
350 match grammar::document_parser::inlines(
351 input,
352 &mut state,
353 0,
354 &grammar::BlockParsingMetadata::default(),
355 ) {
356 Ok(inlines) => Ok(inlines),
357 Err(error) => {
358 tracing::error!(?error, "error parsing inline content");
359 Err(Error::Parse(
360 Box::new(peg_error_to_source_location(&error, None)),
361 error.to_string(),
362 ))
363 }
364 }
365}
366
367#[cfg(test)]
368mod proptests;
369
#[cfg(test)]
// Test code may unwrap/expect/panic freely: a panic here is simply a failing test.
#[allow(clippy::unwrap_used)]
#[allow(clippy::panic)]
#[allow(clippy::expect_used)]
mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Parses every `.adoc` fixture and compares the outcome with the `.json` file
    /// that sits next to it.
    ///
    /// A successful parse is compared as serialized JSON against the expected
    /// document; a failed parse is compared by error message against the expected
    /// error. Fixtures without a `.json` companion are skipped with a warning.
    #[rstest::rstest]
    #[tracing_test::traced_test]
    fn test_with_fixtures(
        #[files("fixtures/tests/**/*.adoc")] path: std::path::PathBuf,
    ) -> Result<(), Error> {
        let test_file_path = path.with_extension("json");
        let options = Options::builder().with_safe_mode(SafeMode::Unsafe).build();

        // We do this check because we have files that won't have a test file, namely ones
        // that are supposed to error out!
        if test_file_path.exists() {
            let test_file_contents = std::fs::read_to_string(test_file_path)?;
            match parse_file(&path, &options) {
                Ok(result) => {
                    let result_str =
                        serde_json::to_string(&result).expect("could not serialize result");
                    // Round-trip the expected document through serde so both sides are
                    // serialized the same way before comparing.
                    let test: Document = serde_json::from_str(&test_file_contents)
                        .expect("could not deserialize test");
                    let test_str = serde_json::to_string(&test).expect("could not serialize test");
                    assert_eq!(test_str, result_str);
                }
                Err(e) => {
                    let test: Error = serde_json::from_str(&test_file_contents)
                        .expect("could not deserialize test");
                    assert_eq!(test.to_string(), e.to_string());
                }
            }
        } else {
            tracing::warn!(?path, "test file not found");
        }
        Ok(())
    }

    /// Location invariants for empty, whitespace-only and non-ASCII documents.
    mod empty_document_tests {
        use crate::{Options, parse};

        /// Whitespace-only input must parse and yield a valid document location.
        #[test]
        fn test_whitespace_only_documents() {
            let test_cases = vec![
                "\n", "\n\n", "\t", " \n\t\n ", " ",
                /* The original proptest failing case -> */ "\n\n\t",
            ];

            for input in test_cases {
                let options = Options::default();
                let result = parse(input, &options);

                match result {
                    Ok(doc) => {
                        // Validate the invariant using absolute offsets
                        assert!(
                            doc.location.absolute_start <= doc.location.absolute_end,
                            "Failed for input {input:?}: absolute_start {} > absolute_end {}",
                            doc.location.absolute_start,
                            doc.location.absolute_end
                        );

                        // Validate with our helper
                        doc.location.validate(input).unwrap_or_else(|e| {
                            panic!("Location validation failed for {input:?}: {e}")
                        });
                    }
                    Err(e) => {
                        panic!("Failed to parse {input:?}: {e}");
                    }
                }
            }
        }

        /// Leading whitespace followed by content must parse with a valid location.
        #[test]
        fn test_document_with_content_after_whitespace() {
            let test_cases = vec!["\n\nHello", "\t\tWorld", " \n = Title"];

            for input in test_cases {
                let options = Options::default();
                let doc =
                    parse(input, &options).unwrap_or_else(|_| panic!("Should parse {input:?}"));

                assert!(
                    doc.location.absolute_start <= doc.location.absolute_end,
                    "Failed for input {input:?}: absolute_start {} > absolute_end {}",
                    doc.location.absolute_start,
                    doc.location.absolute_end
                );

                // Validate with our helper
                doc.location
                    .validate(input)
                    .unwrap_or_else(|e| panic!("Location validation failed for {input:?}: {e}"));
            }
        }

        /// Document offsets must fall on UTF-8 character boundaries for multi-byte input.
        #[test]
        fn test_unicode_characters() {
            // Test that UTF-8 safety is maintained. The inputs are written as Unicode
            // escapes so they cannot be corrupted by a wrong source-file encoding.
            let test_cases = vec![
                "\u{1F600}",              // 4-byte emoji (grinning face)
                "\u{05D0}",               // 2-byte Hebrew letter (alef)
                "Hello \u{4E16}\u{754C}", // Mixed content: ASCII plus 3-byte CJK
                "\u{200b}",               // Zero-width space
            ];

            for input in test_cases {
                let options = Options::default();
                let result = parse(input, &options);

                match result {
                    Ok(doc) => {
                        // All offsets should be on UTF-8 boundaries
                        assert!(
                            input.is_char_boundary(doc.location.absolute_start),
                            "Absolute start {} not on UTF-8 boundary for {input:?}",
                            doc.location.absolute_start,
                        );
                        assert!(
                            input.is_char_boundary(doc.location.absolute_end),
                            "Absolute end {} not on UTF-8 boundary for {input:?}",
                            doc.location.absolute_end,
                        );

                        // Validate with our helper
                        doc.location.validate(input).unwrap_or_else(|e| {
                            panic!("Location validation failed for {input:?}: {e}");
                        });
                    }
                    Err(e) => {
                        // Some of these might fail to parse, which is OK for now
                        // We're just testing that if they parse, the locations are valid
                        println!("Failed to parse {input:?}: {e} (this might be expected)",);
                    }
                }
            }
        }
    }
}