html_to_markdown_rs/
lib.rs

1//! High-performance HTML to Markdown converter.
2//!
3//! Built with html5ever for fast, memory-efficient HTML parsing.
4//!
5//! ## Optional inline image extraction
6//!
7//! Enable the `inline-images` Cargo feature to collect embedded data URI images and inline SVG
8//! assets alongside the produced Markdown.
9
10pub mod converter;
11pub mod error;
12pub mod hocr;
13#[cfg(feature = "inline-images")]
14mod inline_images;
15pub mod options;
16pub mod sanitizer;
17pub mod text;
18pub mod wrapper;
19
20pub use error::{ConversionError, Result};
21#[cfg(feature = "inline-images")]
22pub use inline_images::{
23    HtmlExtraction, InlineImage, InlineImageConfig, InlineImageFormat, InlineImageSource, InlineImageWarning,
24};
25pub use options::{
26    CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
27    PreprocessingOptions, PreprocessingPreset, WhitespaceMode,
28};
29
30/// Convert HTML to Markdown.
31///
32/// # Arguments
33///
34/// * `html` - The HTML string to convert
35/// * `options` - Optional conversion options (defaults to ConversionOptions::default())
36///
37/// # Example
38///
39/// ```
40/// use html_to_markdown_rs::{convert, ConversionOptions};
41///
42/// let html = "<h1>Hello World</h1>";
43/// let markdown = convert(html, None).unwrap();
44/// assert!(markdown.contains("Hello World"));
45/// ```
46pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
47    let options = options.unwrap_or_default();
48
49    let normalized_html = html.replace("\r\n", "\n").replace('\r', "\n");
50
51    let clean_html = if options.preprocessing.enabled {
52        sanitizer::sanitize(&normalized_html, &options.preprocessing)?
53    } else {
54        normalized_html
55    };
56
57    let markdown = converter::convert_html(&clean_html, &options)?;
58
59    if options.wrap {
60        Ok(wrapper::wrap_markdown(&markdown, &options))
61    } else {
62        Ok(markdown)
63    }
64}
65
/// Convert HTML to Markdown and gather inline image assets in the same pass
/// (requires the `inline-images` feature).
///
/// The HTML is prepared exactly as in the plain conversion path: `\r\n` and lone `\r`
/// are rewritten to `\n`, and the sanitizer runs when `options.preprocessing.enabled`
/// is set. The conversion is then performed with a shared inline image collector, and
/// the Markdown is wrapped afterwards when `options.wrap` is set.
///
/// # Arguments
///
/// * `html` - The HTML string to convert
/// * `options` - Conversion options; `None` falls back to `ConversionOptions::default()`
/// * `image_cfg` - Configuration controlling inline image extraction
///
/// # Returns
///
/// An [`HtmlExtraction`] holding the Markdown, the collected inline images, and any
/// warnings reported by the collector.
///
/// # Errors
///
/// Propagates errors from the sanitizer, from constructing the collector, and from the
/// converter. Returns `ConversionError::Other` if the collector is still shared once
/// conversion has finished and therefore cannot be taken back.
#[cfg(feature = "inline-images")]
pub fn convert_with_inline_images(
    html: &str,
    options: Option<ConversionOptions>,
    image_cfg: InlineImageConfig,
) -> Result<HtmlExtraction> {
    use std::cell::RefCell;
    use std::rc::Rc;

    let opts = options.unwrap_or_default();

    // CRLF pairs are collapsed before lone CRs so that a Windows line ending
    // yields exactly one newline instead of two.
    let mut prepared = html.replace("\r\n", "\n").replace('\r', "\n");
    if opts.preprocessing.enabled {
        prepared = sanitizer::sanitize(&prepared, &opts.preprocessing)?;
    }

    // The collector is handed to the converter behind `Rc<RefCell<_>>`, while this
    // function keeps its own handle to read the results back afterwards.
    let image_state = inline_images::InlineImageCollector::new(image_cfg)?;
    let shared = Rc::new(RefCell::new(image_state));

    let mut markdown = converter::convert_html_with_inline_collector(&prepared, &opts, Rc::clone(&shared))?;
    if opts.wrap {
        markdown = wrapper::wrap_markdown(&markdown, &opts);
    }

    // Taking the collector back by value only succeeds when no other `Rc` handle
    // to it is still alive.
    let image_state = match Rc::try_unwrap(shared) {
        Ok(cell) => cell.into_inner(),
        Err(_) => {
            return Err(ConversionError::Other(
                "failed to recover inline image state".to_string(),
            ));
        }
    };
    let (inline_images, warnings) = image_state.finish();

    Ok(HtmlExtraction {
        markdown,
        inline_images,
        warnings,
    })
}