acdc_parser/lib.rs
1#![deny(clippy::pedantic)]
2#![warn(clippy::all)]
3//! `AsciiDoc` parser.
4//!
5//! This module provides a parser for the `AsciiDoc` markup language. The parser is
6//! implemented using the `peg` parser generator.
7//!
8//! # Quick Start
9//!
//! The crate offers two ways to parse `AsciiDoc` content: the [`Parser`] struct,
//! which provides a fluent API over an in-memory string, and free functions,
//! including:
//!
//! - `parse`: parses a string containing `AsciiDoc` content.
//! - `parse_file`: parses the content of a file containing `AsciiDoc` content.
15//!
16//! ```rust
17//!
18//! use acdc_parser::{Document, parse};
19//!
20//! let content = r#"= Document Title
21//!
22//! This is a paragraph.
23//!
24//! == Section Title
25//!
26//! This is a subsection."#;
27//!
28//! let options = acdc_parser::Options::default();
29//! let document = parse(content, &options).unwrap();
30//!
//! println!("{:?}", document);
//! ```
32use std::{
33 path::{Path, PathBuf},
34 string::ToString,
35};
36
37use tracing::instrument;
38
39mod blocks;
40mod constants;
41mod error;
42pub(crate) mod grammar;
43mod model;
44mod options;
45mod preprocessor;
46mod safe_mode;
47
48pub(crate) use grammar::{InlinePreprocessorParserState, ProcessedContent, inline_preprocessing};
49use preprocessor::Preprocessor;
50
51pub use error::{Error, Positioning, SourceLocation};
52pub use grammar::parse_text_for_quotes;
53pub use model::{
54 Admonition, AdmonitionVariant, Anchor, AttributeName, AttributeValue, Audio, Author, Autolink,
55 Block, BlockMetadata, Bold, Button, CalloutList, CalloutListItem, CalloutRef, CalloutRefKind,
56 ColumnFormat, ColumnStyle, ColumnWidth, Comment, CrossReference, CurvedApostrophe,
57 CurvedQuotation, DelimitedBlock, DelimitedBlockType, DescriptionList, DescriptionListItem,
58 DiscreteHeader, Document, DocumentAttribute, DocumentAttributes, ElementAttributes, Footnote,
59 Form, HEADER, Header, Highlight, HorizontalAlignment, ICON_SIZES, Icon, Image, IndexTerm,
60 IndexTermKind, InlineMacro, InlineNode, Italic, Keyboard, LineBreak, Link, ListItem,
61 ListItemCheckedStatus, Location, MAX_SECTION_LEVELS, MAX_TOC_LEVELS, Mailto, Menu, Monospace,
62 NORMAL, OrderedList, PageBreak, Paragraph, Pass, PassthroughKind, Plain, Position, Raw, Role,
63 Section, Source, StandaloneCurvedApostrophe, Stem, StemContent, StemNotation, Subscript,
64 Substitution, SubstitutionOp, SubstitutionSpec, Subtitle, Superscript, Table, TableColumn,
65 TableOfContents, TableRow, ThematicBreak, Title, TocEntry, UNNUMBERED_SECTION_STYLES,
66 UnorderedList, Url, VERBATIM, Verbatim, VerticalAlignment, Video, inlines_to_string,
67 substitute,
68};
69pub use options::{Options, OptionsBuilder, SafeMode};
70
71/// Type-based parser for `AsciiDoc` content.
72///
73/// `Parser` provides a more discoverable, fluent API for parsing `AsciiDoc` documents.
74///
75/// # Examples
76///
77/// Basic usage:
78///
79/// ```
80/// use acdc_parser::Parser;
81///
82/// let content = "= Document Title\n\nParagraph text.";
83/// let doc = Parser::new(content).parse()?;
84/// # Ok::<(), acdc_parser::Error>(())
85/// ```
86///
87/// With options:
88///
89/// ```
90/// use acdc_parser::{Parser, Options, SafeMode};
91///
92/// let content = "= Document Title\n\nParagraph text.";
93/// let options = Options::builder()
94/// .with_safe_mode(SafeMode::Safe)
95/// .with_timings()
96/// .build();
97///
98/// let doc = Parser::new(content)
99/// .with_options(options)
100/// .parse()?;
101/// # Ok::<(), acdc_parser::Error>(())
102/// ```
103///
104/// For file-based parsing, read the file first:
105///
106/// ```no_run
107/// use acdc_parser::Parser;
108/// use std::fs;
109///
110/// let content = fs::read_to_string("document.adoc")?;
111/// let doc = Parser::new(&content).parse()?;
112/// # Ok::<(), Box<dyn std::error::Error>>(())
113/// ```
114#[derive(Debug)]
115pub struct Parser<'input> {
116 input: &'input str,
117 options: Options,
118}
119
120impl<'input> Parser<'input> {
121 /// Create a new parser for the given input string.
122 ///
123 /// The parser will use default options. Use `with_options` to customize.
124 ///
125 /// # Example
126 ///
127 /// ```
128 /// use acdc_parser::Parser;
129 ///
130 /// let parser = Parser::new("= Title\n\nContent");
131 /// let doc = parser.parse()?;
132 /// # Ok::<(), acdc_parser::Error>(())
133 /// ```
134 #[must_use]
135 pub fn new(input: &'input str) -> Self {
136 Self {
137 input,
138 options: Options::default(),
139 }
140 }
141
142 /// Set the options for this parser.
143 ///
144 /// This consumes the parser and returns a new one with the specified options.
145 ///
146 /// # Example
147 ///
148 /// ```
149 /// use acdc_parser::{Parser, Options, SafeMode};
150 ///
151 /// let options = Options::builder()
152 /// .with_safe_mode(SafeMode::Safe)
153 /// .build();
154 ///
155 /// let parser = Parser::new("= Title")
156 /// .with_options(options);
157 /// # Ok::<(), acdc_parser::Error>(())
158 /// ```
159 #[must_use]
160 pub fn with_options(mut self, options: Options) -> Self {
161 self.options = options;
162 self
163 }
164
165 /// Parse the input into a Document.
166 ///
167 /// # Example
168 ///
169 /// ```
170 /// use acdc_parser::Parser;
171 ///
172 /// let doc = Parser::new("= Title\n\nContent").parse()?;
173 /// # Ok::<(), acdc_parser::Error>(())
174 /// ```
175 ///
176 /// # Errors
177 ///
178 /// Returns an error if the input cannot be parsed as valid `AsciiDoc`.
179 pub fn parse(self) -> Result<Document, Error> {
180 parse(self.input, &self.options)
181 }
182
183 /// Parse only inline elements from the input.
184 ///
185 /// This is useful for parsing fragments of `AsciiDoc` that contain only
186 /// inline markup like bold, italic, links, etc.
187 ///
188 /// # Example
189 ///
190 /// ```
191 /// use acdc_parser::Parser;
192 ///
193 /// let inlines = Parser::new("This is *bold* text").parse_inline()?;
194 /// # Ok::<(), acdc_parser::Error>(())
195 /// ```
196 ///
197 /// # Errors
198 ///
199 /// Returns an error if the input cannot be parsed.
200 pub fn parse_inline(self) -> Result<Vec<InlineNode>, Error> {
201 parse_inline(self.input, &self.options)
202 }
203}
204
205/// Parse `AsciiDoc` content from a reader.
206///
207/// This function reads the content from the provided reader and parses it as `AsciiDoc`.
208///
209/// # Example
210///
211/// ```
212/// use acdc_parser::{Options, SafeMode, parse_from_reader};
213/// use std::fs::File;
214///
215/// let options = Options::builder()
216/// .with_safe_mode(SafeMode::Unsafe)
217/// .build();
218/// let file = File::open("fixtures/samples/README.adoc").unwrap();
219/// let document = parse_from_reader(file, &options).unwrap();
220/// ```
221///
222/// # Errors
223/// This function returns an error if the content cannot be parsed.
224#[instrument(skip(reader))]
225pub fn parse_from_reader<R: std::io::Read>(
226 reader: R,
227 options: &Options,
228) -> Result<Document, Error> {
229 let result = Preprocessor.process_reader(reader, options)?;
230 parse_input(&result.text, options, None, result.leveloffset_ranges)
231}
232
233/// Parse `AsciiDoc` content from a string.
234///
235/// This function parses the provided string as `AsciiDoc`.
236///
237/// # Example
238///
239/// ```
240/// use acdc_parser::{Options, SafeMode, parse};
241///
242/// let options = Options::builder()
243/// .with_safe_mode(SafeMode::Unsafe)
244/// .build();
245/// let content = "= Document Title\n\nThis is a paragraph.\n\n== Section Title\n\nThis is a subsection.";
246/// let document = parse(content, &options).unwrap();
247/// ```
248///
249/// # Errors
250/// This function returns an error if the content cannot be parsed.
251#[instrument]
252pub fn parse(input: &str, options: &Options) -> Result<Document, Error> {
253 let result = Preprocessor.process(input, options)?;
254 parse_input(&result.text, options, None, result.leveloffset_ranges)
255}
256
257/// Parse `AsciiDoc` content from a file.
258///
259/// This function reads the content from the provided file and parses it as `AsciiDoc`.
260///
261/// # Example
262///
263/// ```
264/// use std::path::Path;
265/// use acdc_parser::{Options, SafeMode, parse_file};
266///
267/// let options = Options::builder()
268/// .with_safe_mode(SafeMode::Unsafe)
269/// .build();
270/// let file_path = Path::new("fixtures/samples/README.adoc");
271/// let document = parse_file(file_path, &options).unwrap();
272/// ```
273///
274/// # Errors
275/// This function returns an error if the content cannot be parsed.
276#[instrument(skip(file_path))]
277pub fn parse_file<P: AsRef<Path>>(file_path: P, options: &Options) -> Result<Document, Error> {
278 let path = file_path.as_ref().to_path_buf();
279 let result = Preprocessor.process_file(file_path, options)?;
280 parse_input(&result.text, options, Some(path), result.leveloffset_ranges)
281}
282
283/// Helper to convert a PEG parse error to our `SourceLocation` type
284fn peg_error_to_source_location(
285 error: &peg::error::ParseError<peg::str::LineCol>,
286 file: Option<PathBuf>,
287) -> SourceLocation {
288 SourceLocation {
289 file,
290 positioning: Positioning::Position(Position {
291 line: error.location.line,
292 column: error.location.column,
293 }),
294 }
295}
296
297#[instrument]
298fn parse_input(
299 input: &str,
300 options: &Options,
301 file_path: Option<PathBuf>,
302 leveloffset_ranges: Vec<model::LeveloffsetRange>,
303) -> Result<Document, Error> {
304 tracing::trace!(?input, "post preprocessor");
305 let mut state = grammar::ParserState::new(input);
306 state.document_attributes = options.document_attributes.clone();
307 state.options = options.clone();
308 state.current_file.clone_from(&file_path);
309 state.leveloffset_ranges = leveloffset_ranges;
310 let result = match grammar::document_parser::document(input, &mut state) {
311 Ok(doc) => doc,
312 Err(error) => {
313 tracing::error!(?error, "error parsing document content");
314 let source_location = peg_error_to_source_location(&error, file_path);
315 Err(Error::Parse(Box::new(source_location), error.to_string()))
316 }
317 };
318 state.emit_warnings();
319 result
320}
321
322/// Parse inline `AsciiDoc` content from a string.
323///
324/// This function parses the provided string as inline `AsciiDoc` elements, returning a
325/// vector of inline nodes instead of a complete document structure. This is useful for
326/// parsing fragments of `AsciiDoc` content that contain inline markup like emphasis,
327/// strong text, links, macros, and other inline elements.
328///
329/// NOTE: This function exists pretty much just for the sake of the TCK tests, which rely
330/// on an "inline" type output.
331///
332/// # Example
333///
334/// ```
335/// use acdc_parser::{Options, SafeMode, parse_inline};
336///
337/// let options = Options::builder()
338/// .with_safe_mode(SafeMode::Unsafe)
339/// .build();
340/// let content = "This is *strong* text with a https://example.com[link].";
341/// let inline_nodes = parse_inline(content, &options).unwrap();
342/// ```
343///
344/// # Errors
345/// This function returns an error if the inline content cannot be parsed.
346#[instrument]
347pub fn parse_inline(input: &str, options: &Options) -> Result<Vec<InlineNode>, Error> {
348 tracing::trace!(?input, "post preprocessor");
349 let mut state = grammar::ParserState::new(input);
350 state.document_attributes = options.document_attributes.clone();
351 state.options = options.clone();
352 let result = match grammar::document_parser::inlines(
353 input,
354 &mut state,
355 0,
356 &grammar::BlockParsingMetadata::default(),
357 ) {
358 Ok(inlines) => Ok(inlines),
359 Err(error) => {
360 tracing::error!(?error, "error parsing inline content");
361 Err(Error::Parse(
362 Box::new(peg_error_to_source_location(&error, None)),
363 error.to_string(),
364 ))
365 }
366 };
367 state.emit_warnings();
368 result
369}
370
371#[cfg(test)]
372mod proptests;
373
374#[cfg(test)]
375#[allow(clippy::unwrap_used)]
376#[allow(clippy::panic)]
377#[allow(clippy::expect_used)]
378mod tests {
379 use super::*;
380 use pretty_assertions::assert_eq;
381
382 fn read_file_contents_with_extension(
383 path: &std::path::PathBuf,
384 ext: &str,
385 ) -> Result<String, Error> {
386 let test_file_path = path.with_extension(ext);
387 let file_contents = std::fs::read_to_string(&test_file_path).inspect_err(
388 |e| tracing::warn!(?path, ?test_file_path, error = %e, "test file not found"),
389 )?;
390 Ok(file_contents)
391 }
392
393 #[rstest::rstest]
394 #[tracing_test::traced_test]
395 fn test_with_fixtures(
396 #[files("fixtures/tests/**/*.adoc")] path: std::path::PathBuf,
397 ) -> Result<(), Error> {
398 let options = Options::builder().with_safe_mode(SafeMode::Unsafe).build();
399
400 match parse_file(&path, &options) {
401 Ok(result) => {
402 let expected = read_file_contents_with_extension(&path, "json")?;
403 let actual =
404 serde_json::to_string_pretty(&result).expect("could not serialize result");
405 assert_eq!(expected, actual);
406 }
407 Err(e) => {
408 let file_contents = read_file_contents_with_extension(&path, "error")?;
409 // Error fixtures contain expected error message as plain text
410 let expected = file_contents.trim();
411 assert_eq!(expected, e.to_string());
412 }
413 }
414 Ok(())
415 }
416
417 #[cfg(test)]
418 mod empty_document_tests {
419 use crate::{Options, parse};
420
421 #[test]
422 fn test_whitespace_only_documents() {
423 let test_cases = vec![
424 "\n", "\n\n", "\t", " \n\t\n ", " ",
425 /* The original proptest failing case -> */ "\n\n\t",
426 ];
427
428 for input in test_cases {
429 let options = Options::default();
430 let result = parse(input, &options);
431
432 match result {
433 Ok(doc) => {
434 // Validate the invariant using absolute offsets
435 assert!(
436 doc.location.absolute_start <= doc.location.absolute_end,
437 "Failed for input {input:?}: absolute_start {} > absolute_end {}",
438 doc.location.absolute_start,
439 doc.location.absolute_end
440 );
441
442 // Validate with our helper
443 doc.location.validate(input).unwrap_or_else(|e| {
444 panic!("Location validation failed for {input:?}: {e}")
445 });
446 }
447 Err(e) => {
448 panic!("Failed to parse {input:?}: {e}");
449 }
450 }
451 }
452 }
453
454 #[test]
455 fn test_document_with_content_after_whitespace() {
456 let test_cases = vec!["\n\nHello", "\t\tWorld", " \n = Title"];
457
458 for input in test_cases {
459 let options = Options::default();
460 let doc =
461 parse(input, &options).unwrap_or_else(|_| panic!("Should parse {input:?}"));
462
463 assert!(
464 doc.location.absolute_start <= doc.location.absolute_end,
465 "Failed for input {input:?}: absolute_start {} > absolute_end {}",
466 doc.location.absolute_start,
467 doc.location.absolute_end
468 );
469
470 // Validate with our helper
471 doc.location
472 .validate(input)
473 .unwrap_or_else(|e| panic!("Location validation failed for {input:?}: {e}"));
474 }
475 }
476
477 #[test]
478 fn test_unicode_characters() {
479 // Test that UTF-8 safety is maintained
480 let test_cases = vec![
481 "π", // 4-byte emoji
482 "Χ", // 2-byte Hebrew
483 "Hello δΈη", // Mixed content
484 "\u{200b}", // Zero-width space
485 ];
486
487 for input in test_cases {
488 let options = Options::default();
489 let result = parse(input, &options);
490
491 match result {
492 Ok(doc) => {
493 // All offsets should be on UTF-8 boundaries
494 assert!(
495 input.is_char_boundary(doc.location.absolute_start),
496 "Absolute start {} not on UTF-8 boundary for {input:?}",
497 doc.location.absolute_start,
498 );
499 assert!(
500 input.is_char_boundary(doc.location.absolute_end),
501 "Absolute end {} not on UTF-8 boundary for {input:?}",
502 doc.location.absolute_end,
503 );
504
505 // Validate with our helper
506 doc.location.validate(input).unwrap_or_else(|e| {
507 panic!("Location validation failed for {input:?}: {e}");
508 });
509 }
510 Err(e) => {
511 // Some of these might fail to parse, which is OK for now
512 // We're just testing that if they parse, the locations are valid
513 println!("Failed to parse {input:?}: {e} (this might be expected)",);
514 }
515 }
516 }
517 }
518 }
519
520 /// Integration tests for attribute resolution behavior.
521 ///
522 /// These tests verify that acdc matches asciidoctor's attribute resolution semantics:
523 /// - Attributes are resolved at definition time (not reference time)
524 /// - If {bar} is undefined when :foo: {bar} is parsed, foo stores literal "{bar}"
525 /// - If {bar} IS defined when :foo: {bar} is parsed, foo stores bar's resolved value
526 mod warning_deduplication_tests {
527 use crate::{Options, parse};
528
529 #[test]
530 #[tracing_test::traced_test]
531 fn counter_reference_emits_single_warning() {
532 // A document with the same counter referenced multiple times should
533 // produce exactly one warning after parsing (not one per PEG attempt).
534 let input = "= Title\n\n{counter:hits} then {counter:hits} again";
535 let options = Options::default();
536 let _doc = parse(input, &options).expect("should parse");
537 assert!(logs_contain("Counters"));
538 logs_assert(|lines: &[&str]| {
539 let count = lines
540 .iter()
541 .filter(|l| l.contains("not supported and will be removed"))
542 .count();
543 if count == 1 {
544 Ok(())
545 } else {
546 Err(format!("expected exactly 1 counter warning, got {count}"))
547 }
548 });
549 }
550
551 #[test]
552 #[tracing_test::traced_test]
553 fn distinct_warnings_all_emitted() {
554 // Different warnings should each appear once.
555 let input = "= Title\n\n{counter:a} and {counter2:b}";
556 let options = Options::default();
557 let _doc = parse(input, &options).expect("should parse");
558 assert!(logs_contain(
559 "Counters ({counter:a}) are not supported and will be removed from output"
560 ));
561 assert!(logs_contain(
562 "Counters ({counter2:b}) are not supported and will be removed from output"
563 ));
564 }
565 }
566
567 mod attribute_resolution_tests {
568 use crate::{AttributeValue, Options, parse};
569
570 #[test]
571 fn test_definition_time_resolution_bar_defined_first() {
572 // When bar is defined BEFORE foo, {bar} in foo's value should be expanded
573 let input = r":bar: resolved-bar
574:foo: {bar}
575
576{foo}
577";
578 let options = Options::default();
579 let doc = parse(input, &options).expect("should parse");
580
581 // foo should have bar's value expanded at definition time
582 assert_eq!(
583 doc.attributes.get("foo"),
584 Some(&AttributeValue::String("resolved-bar".to_string()))
585 );
586 }
587
588 #[test]
589 fn test_definition_time_resolution_bar_defined_after() {
590 // When bar is defined AFTER foo, {bar} should stay literal in foo's value
591 let input = r":foo: {bar}
592:bar: resolved-bar
593
594{foo}
595";
596 let options = Options::default();
597 let doc = parse(input, &options).expect("should parse");
598
599 // foo should keep {bar} as literal since bar wasn't defined yet
600 assert_eq!(
601 doc.attributes.get("foo"),
602 Some(&AttributeValue::String("{bar}".to_string()))
603 );
604 }
605
606 #[test]
607 fn test_chained_attribute_resolution() {
608 // When attributes form a chain: a -> b -> c, each should resolve
609 // based on what's defined at each definition point
610 let input = r":c: final-value
611:b: {c}
612:a: {b}
613
614{a}
615";
616 let options = Options::default();
617 let doc = parse(input, &options).expect("should parse");
618
619 // c is defined first, so b gets "final-value", then a gets "final-value"
620 assert_eq!(
621 doc.attributes.get("c"),
622 Some(&AttributeValue::String("final-value".to_string()))
623 );
624 assert_eq!(
625 doc.attributes.get("b"),
626 Some(&AttributeValue::String("final-value".to_string()))
627 );
628 assert_eq!(
629 doc.attributes.get("a"),
630 Some(&AttributeValue::String("final-value".to_string()))
631 );
632 }
633 }
634}