html_to_markdown_rs/
lib.rs

1//! High-performance HTML to Markdown converter.
2//!
3//! Built with html5ever for fast, memory-efficient HTML parsing.
4//!
5//! ## Optional inline image extraction
6//!
7//! Enable the `inline-images` Cargo feature to collect embedded data URI images and inline SVG
8//! assets alongside the produced Markdown.
9
10use std::borrow::Cow;
11
12pub mod converter;
13pub mod error;
14pub mod hocr;
15#[cfg(feature = "inline-images")]
16mod inline_images;
17pub mod options;
18pub mod text;
19pub mod wrapper;
20
21pub use error::{ConversionError, Result};
22#[cfg(feature = "inline-images")]
23pub use inline_images::{
24    HtmlExtraction, InlineImage, InlineImageConfig, InlineImageFormat, InlineImageSource, InlineImageWarning,
25};
26pub use options::{
27    CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
28    PreprocessingOptions, PreprocessingPreset, WhitespaceMode,
29};
30
31/// Convert HTML to Markdown.
32///
33/// # Arguments
34///
35/// * `html` - The HTML string to convert
36/// * `options` - Optional conversion options (defaults to ConversionOptions::default())
37///
38/// # Example
39///
40/// ```
41/// use html_to_markdown_rs::{convert, ConversionOptions};
42///
43/// let html = "<h1>Hello World</h1>";
44/// let markdown = convert(html, None).unwrap();
45/// assert!(markdown.contains("Hello World"));
46/// ```
47pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
48    let options = options.unwrap_or_default();
49
50    let normalized_html = if html.contains('\r') {
51        Cow::Owned(html.replace("\r\n", "\n").replace('\r', "\n"))
52    } else {
53        Cow::Borrowed(html)
54    };
55
56    let markdown = converter::convert_html(normalized_html.as_ref(), &options)?;
57
58    if options.wrap {
59        Ok(wrapper::wrap_markdown(&markdown, &options))
60    } else {
61        Ok(markdown)
62    }
63}
64
#[cfg(feature = "inline-images")]
/// Convert HTML to Markdown and gather inline image assets in the same pass
/// (only available with the `inline-images` feature).
///
/// Inline image data URIs and inline `<svg>` elements are extracted alongside the Markdown
/// conversion. As a first step, every `\r\n` pair and every remaining lone `\r` in the input
/// is rewritten to `\n`. When `options.wrap` is set, the Markdown is run through
/// `wrapper::wrap_markdown` before it is stored in the result.
///
/// # Arguments
///
/// * `html` - The HTML string to convert
/// * `options` - Optional conversion options (`None` falls back to `ConversionOptions::default()`)
/// * `image_cfg` - Configuration controlling inline image extraction
///
/// # Errors
///
/// Propagates errors from constructing the inline image collector and from the conversion
/// itself. Returns `ConversionError::Other` if the collector is still shared once conversion
/// has finished and therefore cannot be recovered.
pub fn convert_with_inline_images(
    html: &str,
    options: Option<ConversionOptions>,
    image_cfg: InlineImageConfig,
) -> Result<HtmlExtraction> {
    use std::cell::RefCell;
    use std::rc::Rc;

    let opts = options.unwrap_or_default();

    // Only allocate a new buffer when there is at least one carriage return to rewrite.
    let unix_html: Cow<'_, str> = if html.contains('\r') {
        // CRLF pairs first, so that a pair collapses to a single `\n` rather than two.
        let without_crlf = html.replace("\r\n", "\n");
        Cow::Owned(without_crlf.replace('\r', "\n"))
    } else {
        Cow::Borrowed(html)
    };

    // The converter receives its own handle to the collector; this one is kept so the
    // collected state can be taken back out afterwards.
    let image_sink = inline_images::InlineImageCollector::new(image_cfg)?;
    let shared_sink = Rc::new(RefCell::new(image_sink));

    let mut markdown =
        converter::convert_html_with_inline_collector(unix_html.as_ref(), &opts, Rc::clone(&shared_sink))?;
    if opts.wrap {
        markdown = wrapper::wrap_markdown(&markdown, &opts);
    }

    // Unwrapping only succeeds when no other handle to the collector is still alive.
    let sink_cell = Rc::try_unwrap(shared_sink)
        .map_err(|_| ConversionError::Other(String::from("failed to recover inline image state")))?;
    let (inline_images, warnings) = sink_cell.into_inner().finish();

    Ok(HtmlExtraction {
        markdown,
        inline_images,
        warnings,
    })
}