acdc_parser/lib.rs
1#![deny(clippy::pedantic)]
2#![warn(clippy::all)]
3//! `AsciiDoc` parser.
4//!
5//! This module provides a parser for the `AsciiDoc` markup language. The parser is
6//! implemented using the `peg` parser generator.
7//!
8//! # Quick Start
9//!
//! Parsing is exposed through free functions; the `Parser` struct is a fluent
//! wrapper around `parse` and `parse_inline`:
//!
//! - `parse`: parses a string containing `AsciiDoc` content.
//! - `parse_file`: parses the content of a file containing `AsciiDoc` content.
//! - `parse_from_reader`: parses `AsciiDoc` content read from a `std::io::Read` source.
//! - `parse_inline`: parses a string as inline `AsciiDoc` elements only.
15//!
16//! ```rust
17//!
18//! use acdc_parser::{Document, parse};
19//!
20//! let content = r#"= Document Title
21//!
22//! This is a paragraph.
23//!
24//! == Section Title
25//!
26//! This is a subsection."#;
27//!
28//! let options = acdc_parser::Options::default();
29//! let document = parse(content, &options).unwrap();
30//!
//! println!("{:?}", document);
//! ```
32use std::{
33 path::{Path, PathBuf},
34 string::ToString,
35};
36
37use tracing::instrument;
38
39mod blocks;
40mod constants;
41mod error;
42pub(crate) mod grammar;
43mod model;
44mod options;
45mod preprocessor;
46mod safe_mode;
47
48pub(crate) use grammar::{InlinePreprocessorParserState, ProcessedContent, inline_preprocessing};
49use preprocessor::Preprocessor;
50
51pub use error::{Error, Positioning, SourceLocation};
52pub use grammar::parse_text_for_quotes;
53pub use model::{
54 Admonition, AdmonitionVariant, Anchor, AttributeName, AttributeValue, Audio, Author, Autolink,
55 Block, BlockMetadata, Bold, Button, CalloutList, CalloutListItem, CalloutRef, CalloutRefKind,
56 ColumnFormat, ColumnStyle, ColumnWidth, Comment, CrossReference, CurvedApostrophe,
57 CurvedQuotation, DelimitedBlock, DelimitedBlockType, DescriptionList, DescriptionListItem,
58 DiscreteHeader, Document, DocumentAttribute, DocumentAttributes, ElementAttributes, Footnote,
59 Form, HEADER, Header, Highlight, HorizontalAlignment, ICON_SIZES, Icon, Image, IndexTerm,
60 IndexTermKind, InlineMacro, InlineNode, Italic, Keyboard, LineBreak, Link, ListItem,
61 ListItemCheckedStatus, Location, MAX_SECTION_LEVELS, MAX_TOC_LEVELS, Mailto, Menu, Monospace,
62 NORMAL, OrderedList, PageBreak, Paragraph, Pass, PassthroughKind, Plain, Position, Raw, Role,
63 Section, Source, StandaloneCurvedApostrophe, Stem, StemContent, StemNotation, Subscript,
64 Substitution, SubstitutionOp, SubstitutionSpec, Subtitle, Superscript, Table, TableColumn,
65 TableOfContents, TableRow, ThematicBreak, Title, TocEntry, UNNUMBERED_SECTION_STYLES,
66 UnorderedList, Url, VERBATIM, Verbatim, VerticalAlignment, Video, inlines_to_string,
67 substitute,
68};
69pub use options::{Options, OptionsBuilder, SafeMode};
70
71/// Type-based parser for `AsciiDoc` content.
72///
73/// `Parser` provides a more discoverable, fluent API for parsing `AsciiDoc` documents.
74///
75/// # Examples
76///
77/// Basic usage:
78///
79/// ```
80/// use acdc_parser::Parser;
81///
82/// let content = "= Document Title\n\nParagraph text.";
83/// let doc = Parser::new(content).parse()?;
84/// # Ok::<(), acdc_parser::Error>(())
85/// ```
86///
87/// With options:
88///
89/// ```
90/// use acdc_parser::{Parser, Options, SafeMode};
91///
92/// let content = "= Document Title\n\nParagraph text.";
93/// let options = Options::builder()
94/// .with_safe_mode(SafeMode::Safe)
95/// .with_timings()
96/// .build();
97///
98/// let doc = Parser::new(content)
99/// .with_options(options)
100/// .parse()?;
101/// # Ok::<(), acdc_parser::Error>(())
102/// ```
103///
104/// For file-based parsing, read the file first:
105///
106/// ```no_run
107/// use acdc_parser::Parser;
108/// use std::fs;
109///
110/// let content = fs::read_to_string("document.adoc")?;
111/// let doc = Parser::new(&content).parse()?;
112/// # Ok::<(), Box<dyn std::error::Error>>(())
113/// ```
114#[derive(Debug)]
115pub struct Parser<'input> {
116 input: &'input str,
117 options: Options,
118}
119
120impl<'input> Parser<'input> {
121 /// Create a new parser for the given input string.
122 ///
123 /// The parser will use default options. Use `with_options` to customize.
124 ///
125 /// # Example
126 ///
127 /// ```
128 /// use acdc_parser::Parser;
129 ///
130 /// let parser = Parser::new("= Title\n\nContent");
131 /// let doc = parser.parse()?;
132 /// # Ok::<(), acdc_parser::Error>(())
133 /// ```
134 #[must_use]
135 pub fn new(input: &'input str) -> Self {
136 Self {
137 input,
138 options: Options::default(),
139 }
140 }
141
142 /// Set the options for this parser.
143 ///
144 /// This consumes the parser and returns a new one with the specified options.
145 ///
146 /// # Example
147 ///
148 /// ```
149 /// use acdc_parser::{Parser, Options, SafeMode};
150 ///
151 /// let options = Options::builder()
152 /// .with_safe_mode(SafeMode::Safe)
153 /// .build();
154 ///
155 /// let parser = Parser::new("= Title")
156 /// .with_options(options);
157 /// # Ok::<(), acdc_parser::Error>(())
158 /// ```
159 #[must_use]
160 pub fn with_options(mut self, options: Options) -> Self {
161 self.options = options;
162 self
163 }
164
165 /// Parse the input into a Document.
166 ///
167 /// # Example
168 ///
169 /// ```
170 /// use acdc_parser::Parser;
171 ///
172 /// let doc = Parser::new("= Title\n\nContent").parse()?;
173 /// # Ok::<(), acdc_parser::Error>(())
174 /// ```
175 ///
176 /// # Errors
177 ///
178 /// Returns an error if the input cannot be parsed as valid `AsciiDoc`.
179 pub fn parse(self) -> Result<Document, Error> {
180 parse(self.input, &self.options)
181 }
182
183 /// Parse only inline elements from the input.
184 ///
185 /// This is useful for parsing fragments of `AsciiDoc` that contain only
186 /// inline markup like bold, italic, links, etc.
187 ///
188 /// # Example
189 ///
190 /// ```
191 /// use acdc_parser::Parser;
192 ///
193 /// let inlines = Parser::new("This is *bold* text").parse_inline()?;
194 /// # Ok::<(), acdc_parser::Error>(())
195 /// ```
196 ///
197 /// # Errors
198 ///
199 /// Returns an error if the input cannot be parsed.
200 pub fn parse_inline(self) -> Result<Vec<InlineNode>, Error> {
201 parse_inline(self.input, &self.options)
202 }
203}
204
205/// Parse `AsciiDoc` content from a reader.
206///
207/// This function reads the content from the provided reader and parses it as `AsciiDoc`.
208///
209/// # Example
210///
211/// ```
212/// use acdc_parser::{Options, SafeMode, parse_from_reader};
213/// use std::fs::File;
214///
215/// let options = Options::builder()
216/// .with_safe_mode(SafeMode::Unsafe)
217/// .build();
218/// let file = File::open("fixtures/samples/README.adoc").unwrap();
219/// let document = parse_from_reader(file, &options).unwrap();
220/// ```
221///
222/// # Errors
223/// This function returns an error if the content cannot be parsed.
224#[instrument(skip(reader))]
225pub fn parse_from_reader<R: std::io::Read>(
226 reader: R,
227 options: &Options,
228) -> Result<Document, Error> {
229 let result = Preprocessor.process_reader(reader, options)?;
230 parse_input(&result.text, options, None, result.leveloffset_ranges)
231}
232
233/// Parse `AsciiDoc` content from a string.
234///
235/// This function parses the provided string as `AsciiDoc`.
236///
237/// # Example
238///
239/// ```
240/// use acdc_parser::{Options, SafeMode, parse};
241///
242/// let options = Options::builder()
243/// .with_safe_mode(SafeMode::Unsafe)
244/// .build();
245/// let content = "= Document Title\n\nThis is a paragraph.\n\n== Section Title\n\nThis is a subsection.";
246/// let document = parse(content, &options).unwrap();
247/// ```
248///
249/// # Errors
250/// This function returns an error if the content cannot be parsed.
251#[instrument]
252pub fn parse(input: &str, options: &Options) -> Result<Document, Error> {
253 let result = Preprocessor.process(input, options)?;
254 parse_input(&result.text, options, None, result.leveloffset_ranges)
255}
256
257/// Parse `AsciiDoc` content from a file.
258///
259/// This function reads the content from the provided file and parses it as `AsciiDoc`.
260///
261/// # Example
262///
263/// ```
264/// use std::path::Path;
265/// use acdc_parser::{Options, SafeMode, parse_file};
266///
267/// let options = Options::builder()
268/// .with_safe_mode(SafeMode::Unsafe)
269/// .build();
270/// let file_path = Path::new("fixtures/samples/README.adoc");
271/// let document = parse_file(file_path, &options).unwrap();
272/// ```
273///
274/// # Errors
275/// This function returns an error if the content cannot be parsed.
276#[instrument(skip(file_path))]
277pub fn parse_file<P: AsRef<Path>>(file_path: P, options: &Options) -> Result<Document, Error> {
278 let path = file_path.as_ref().to_path_buf();
279 let result = Preprocessor.process_file(file_path, options)?;
280 parse_input(&result.text, options, Some(path), result.leveloffset_ranges)
281}
282
283/// Helper to convert a PEG parse error to our `SourceLocation` type
284fn peg_error_to_source_location(
285 error: &peg::error::ParseError<peg::str::LineCol>,
286 file: Option<PathBuf>,
287) -> SourceLocation {
288 SourceLocation {
289 file,
290 positioning: Positioning::Position(Position {
291 line: error.location.line,
292 column: error.location.column,
293 }),
294 }
295}
296
297#[instrument]
298fn parse_input(
299 input: &str,
300 options: &Options,
301 file_path: Option<PathBuf>,
302 leveloffset_ranges: Vec<model::LeveloffsetRange>,
303) -> Result<Document, Error> {
304 tracing::trace!(?input, "post preprocessor");
305 let mut state = grammar::ParserState::new(input);
306 state.document_attributes = options.document_attributes.clone();
307 state.options = options.clone();
308 state.current_file.clone_from(&file_path);
309 state.leveloffset_ranges = leveloffset_ranges;
310 match grammar::document_parser::document(input, &mut state) {
311 Ok(doc) => doc,
312 Err(error) => {
313 tracing::error!(?error, "error parsing document content");
314 let source_location = peg_error_to_source_location(&error, file_path);
315 Err(Error::Parse(Box::new(source_location), error.to_string()))
316 }
317 }
318}
319
320/// Parse inline `AsciiDoc` content from a string.
321///
322/// This function parses the provided string as inline `AsciiDoc` elements, returning a
323/// vector of inline nodes instead of a complete document structure. This is useful for
324/// parsing fragments of `AsciiDoc` content that contain inline markup like emphasis,
325/// strong text, links, macros, and other inline elements.
326///
327/// NOTE: This function exists pretty much just for the sake of the TCK tests, which rely
328/// on an "inline" type output.
329///
330/// # Example
331///
332/// ```
333/// use acdc_parser::{Options, SafeMode, parse_inline};
334///
335/// let options = Options::builder()
336/// .with_safe_mode(SafeMode::Unsafe)
337/// .build();
338/// let content = "This is *strong* text with a https://example.com[link].";
339/// let inline_nodes = parse_inline(content, &options).unwrap();
340/// ```
341///
342/// # Errors
343/// This function returns an error if the inline content cannot be parsed.
344#[instrument]
345pub fn parse_inline(input: &str, options: &Options) -> Result<Vec<InlineNode>, Error> {
346 tracing::trace!(?input, "post preprocessor");
347 let mut state = grammar::ParserState::new(input);
348 state.document_attributes = options.document_attributes.clone();
349 state.options = options.clone();
350 match grammar::document_parser::inlines(
351 input,
352 &mut state,
353 0,
354 &grammar::BlockParsingMetadata::default(),
355 ) {
356 Ok(inlines) => Ok(inlines),
357 Err(error) => {
358 tracing::error!(?error, "error parsing inline content");
359 Err(Error::Parse(
360 Box::new(peg_error_to_source_location(&error, None)),
361 error.to_string(),
362 ))
363 }
364 }
365}
366
367#[cfg(test)]
368mod proptests;
369
370#[cfg(test)]
371#[allow(clippy::unwrap_used)]
372#[allow(clippy::panic)]
373#[allow(clippy::expect_used)]
374mod tests {
375 use super::*;
376 use pretty_assertions::assert_eq;
377
378 fn read_file_contents_with_extension(
379 path: &std::path::PathBuf,
380 ext: &str,
381 ) -> Result<String, Error> {
382 let test_file_path = path.with_extension(ext);
383 let file_contents = std::fs::read_to_string(&test_file_path).inspect_err(
384 |e| tracing::warn!(?path, ?test_file_path, error = %e, "test file not found"),
385 )?;
386 Ok(file_contents)
387 }
388
389 #[rstest::rstest]
390 #[tracing_test::traced_test]
391 fn test_with_fixtures(
392 #[files("fixtures/tests/**/*.adoc")] path: std::path::PathBuf,
393 ) -> Result<(), Error> {
394 let options = Options::builder().with_safe_mode(SafeMode::Unsafe).build();
395
396 match parse_file(&path, &options) {
397 Ok(result) => {
398 let expected = read_file_contents_with_extension(&path, "json")?;
399 let actual =
400 serde_json::to_string_pretty(&result).expect("could not serialize result");
401 assert_eq!(expected, actual);
402 }
403 Err(e) => {
404 let file_contents = read_file_contents_with_extension(&path, "error")?;
405 // Error fixtures contain expected error message as plain text
406 let expected = file_contents.trim();
407 assert_eq!(expected, e.to_string());
408 }
409 }
410 Ok(())
411 }
412
413 #[cfg(test)]
414 mod empty_document_tests {
415 use crate::{Options, parse};
416
/// Whitespace-only inputs must parse, and the document location must be valid.
#[test]
fn test_whitespace_only_documents() {
    let options = Options::default();
    let inputs = [
        "\n", "\n\n", "\t", " \n\t\n ", " ",
        // The original proptest failing case.
        "\n\n\t",
    ];

    for input in inputs {
        let doc = parse(input, &options)
            .unwrap_or_else(|e| panic!("Failed to parse {input:?}: {e}"));

        // Invariant on the absolute offsets: start never exceeds end.
        let (start, end) = (doc.location.absolute_start, doc.location.absolute_end);
        assert!(
            start <= end,
            "Failed for input {input:?}: absolute_start {} > absolute_end {}",
            start,
            end
        );

        // Full check through the location's own validator.
        if let Err(e) = doc.location.validate(input) {
            panic!("Location validation failed for {input:?}: {e}");
        }
    }
}
449
/// Leading whitespace followed by real content must parse with a valid location.
#[test]
fn test_document_with_content_after_whitespace() {
    let options = Options::default();

    for input in ["\n\nHello", "\t\tWorld", " \n = Title"] {
        let Ok(doc) = parse(input, &options) else {
            panic!("Should parse {input:?}");
        };

        let (start, end) = (doc.location.absolute_start, doc.location.absolute_end);
        assert!(
            start <= end,
            "Failed for input {input:?}: absolute_start {} > absolute_end {}",
            start,
            end
        );

        // Full check through the location's own validator.
        if let Err(e) = doc.location.validate(input) {
            panic!("Location validation failed for {input:?}: {e}");
        }
    }
}
472
/// Document locations must fall on UTF-8 character boundaries for multi-byte input.
///
/// The inputs are written as `\u{…}` escapes so the intended code points survive
/// any re-encoding of this source file.
#[test]
fn test_unicode_characters() {
    // Test that UTF-8 safety is maintained
    let test_cases = vec![
        "\u{1F600}",              // 4-byte emoji
        "\u{05D0}",               // 2-byte Hebrew letter
        "Hello \u{4E16}\u{754C}", // Mixed content: ASCII followed by 3-byte CJK
        "\u{200b}",               // Zero-width space
    ];

    for input in test_cases {
        let options = Options::default();
        let result = parse(input, &options);

        match result {
            Ok(doc) => {
                // All offsets should be on UTF-8 boundaries
                assert!(
                    input.is_char_boundary(doc.location.absolute_start),
                    "Absolute start {} not on UTF-8 boundary for {input:?}",
                    doc.location.absolute_start,
                );
                assert!(
                    input.is_char_boundary(doc.location.absolute_end),
                    "Absolute end {} not on UTF-8 boundary for {input:?}",
                    doc.location.absolute_end,
                );

                // Validate with our helper
                doc.location.validate(input).unwrap_or_else(|e| {
                    panic!("Location validation failed for {input:?}: {e}");
                });
            }
            Err(e) => {
                // Some of these might fail to parse, which is OK for now
                // We're just testing that if they parse, the locations are valid
                println!("Failed to parse {input:?}: {e} (this might be expected)",);
            }
        }
    }
}
514 }
515
516 /// Integration tests for attribute resolution behavior.
517 ///
518 /// These tests verify that acdc matches asciidoctor's attribute resolution semantics:
519 /// - Attributes are resolved at definition time (not reference time)
520 /// - If {bar} is undefined when :foo: {bar} is parsed, foo stores literal "{bar}"
521 /// - If {bar} IS defined when :foo: {bar} is parsed, foo stores bar's resolved value
522 mod attribute_resolution_tests {
523 use crate::{AttributeValue, Options, parse};
524
/// `bar` is defined before `foo`, so `{bar}` inside foo's value is expanded
/// when `foo` is defined.
#[test]
fn test_definition_time_resolution_bar_defined_first() {
    // One attribute entry per line, a blank line, then a paragraph using {foo}.
    let input = ":bar: resolved-bar\n:foo: {bar}\n\n{foo}\n";

    let doc = parse(input, &Options::default()).expect("should parse");

    // foo holds bar's value, substituted at definition time.
    let expected = AttributeValue::String("resolved-bar".to_string());
    assert_eq!(doc.attributes.get("foo"), Some(&expected));
}
542
/// `bar` is defined after `foo`, so `{bar}` stays literal in foo's value.
#[test]
fn test_definition_time_resolution_bar_defined_after() {
    // One attribute entry per line, a blank line, then a paragraph using {foo}.
    let input = ":foo: {bar}\n:bar: resolved-bar\n\n{foo}\n";

    let doc = parse(input, &Options::default()).expect("should parse");

    // bar did not exist yet when foo was defined, so the reference is kept verbatim.
    let expected = AttributeValue::String("{bar}".to_string());
    assert_eq!(doc.attributes.get("foo"), Some(&expected));
}
560
/// A chain a -> b -> c resolves link by link, each against what is already
/// defined at its own definition point.
#[test]
fn test_chained_attribute_resolution() {
    // c is defined first, then b referencing c, then a referencing b.
    let input = ":c: final-value\n:b: {c}\n:a: {b}\n\n{a}\n";

    let doc = parse(input, &Options::default()).expect("should parse");

    // c is defined first, so b gets "final-value", and then a gets it too.
    let expected = AttributeValue::String("final-value".to_string());
    for name in ["c", "b", "a"] {
        assert_eq!(doc.attributes.get(name), Some(&expected));
    }
}
588 }
589}