html_to_markdown_rs/
lib.rs

1//! High-performance HTML to Markdown converter.
2//!
3//! Built with html5ever for fast, memory-efficient HTML parsing.
4//!
5//! ## Optional inline image extraction
6//!
7//! Enable the `inline-images` Cargo feature to collect embedded data URI images and inline SVG
8//! assets alongside the produced Markdown.
9
10use std::borrow::Cow;
11
12pub mod converter;
13pub mod error;
14pub mod hocr;
15#[cfg(feature = "inline-images")]
16mod inline_images;
17#[cfg(feature = "metadata")]
18pub mod metadata;
19pub mod options;
20pub mod safety;
21pub mod text;
22pub mod wrapper;
23
24pub use error::{ConversionError, Result};
25#[cfg(feature = "inline-images")]
26pub use inline_images::{
27    HtmlExtraction, InlineImage, InlineImageConfig, InlineImageFormat, InlineImageSource, InlineImageWarning,
28};
29#[cfg(feature = "metadata")]
30pub use metadata::{
31    DocumentMetadata, ExtendedMetadata, HeaderMetadata, ImageMetadata, ImageType, LinkMetadata, LinkType,
32    MetadataConfig, StructuredData, StructuredDataType, TextDirection,
33};
34pub use options::{
35    CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
36    PreprocessingOptions, PreprocessingPreset, WhitespaceMode,
37};
38
39/// Convert HTML to Markdown.
40///
41/// # Arguments
42///
43/// * `html` - The HTML string to convert
44/// * `options` - Optional conversion options (defaults to ConversionOptions::default())
45///
46/// # Example
47///
48/// ```
49/// use html_to_markdown_rs::{convert, ConversionOptions};
50///
51/// let html = "<h1>Hello World</h1>";
52/// let markdown = convert(html, None).unwrap();
53/// assert!(markdown.contains("Hello World"));
54/// ```
55pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
56    let options = options.unwrap_or_default();
57
58    let normalized_html = if html.contains('\r') {
59        Cow::Owned(html.replace("\r\n", "\n").replace('\r', "\n"))
60    } else {
61        Cow::Borrowed(html)
62    };
63
64    let markdown = converter::convert_html(normalized_html.as_ref(), &options)?;
65
66    if options.wrap {
67        Ok(wrapper::wrap_markdown(&markdown, &options))
68    } else {
69        Ok(markdown)
70    }
71}
72
#[cfg(feature = "inline-images")]
/// Convert HTML to Markdown and return the inline image assets gathered along the way
/// (requires the `inline-images` feature).
///
/// An `InlineImageCollector` built from `image_cfg` is shared with the converter for the
/// duration of the call. Once conversion finishes, the collector is consumed and its images
/// and warnings are returned together with the Markdown.
///
/// Line endings are normalized (`\r\n` and lone `\r` become `\n`) before conversion, and the
/// Markdown is passed through `wrapper::wrap_markdown` when `options.wrap` is enabled.
///
/// # Arguments
///
/// * `html` - The HTML source text
/// * `options` - Conversion settings; `None` selects `ConversionOptions::default()`
/// * `image_cfg` - Configuration used to construct the inline image collector
///
/// # Returns
///
/// An `HtmlExtraction` holding the generated `markdown`, the collected `inline_images`, and
/// any `warnings` reported by the collector.
///
/// # Errors
///
/// Fails if the collector cannot be constructed from `image_cfg`, if the conversion itself
/// fails, or with `ConversionError::Other` if the collector is still shared after conversion
/// and therefore cannot be recovered.
pub fn convert_with_inline_images(
    html: &str,
    options: Option<ConversionOptions>,
    image_cfg: InlineImageConfig,
) -> Result<HtmlExtraction> {
    use std::cell::RefCell;
    use std::rc::Rc;

    let opts = options.unwrap_or_default();

    // Borrow the caller's buffer unless a carriage return forces a rewritten copy.
    let source: Cow<'_, str> = html
        .contains('\r')
        .then(|| html.replace("\r\n", "\n").replace('\r', "\n"))
        .map_or(Cow::Borrowed(html), Cow::Owned);

    let shared = Rc::new(RefCell::new(inline_images::InlineImageCollector::new(image_cfg)?));

    let rendered = converter::convert_html_with_inline_collector(&*source, &opts, Rc::clone(&shared))?;
    let markdown = if opts.wrap {
        wrapper::wrap_markdown(&rendered, &opts)
    } else {
        rendered
    };

    // The converter received a clone of the handle; it must have been dropped by now for the
    // collector to be taken back by value.
    let cell = Rc::try_unwrap(shared)
        .map_err(|_| ConversionError::Other("failed to recover inline image state".to_string()))?;
    let (inline_images, warnings) = cell.into_inner().finish();

    Ok(HtmlExtraction {
        markdown,
        inline_images,
        warnings,
    })
}
121
#[cfg(feature = "metadata")]
/// Convert HTML to Markdown and return the metadata gathered during conversion
/// (requires the `metadata` feature).
///
/// A `MetadataCollector` built from `metadata_cfg` is shared with the converter for the
/// duration of the call. Once conversion finishes, the collector is consumed and its
/// `ExtendedMetadata` is returned next to the Markdown.
///
/// Line endings are normalized (`\r\n` and lone `\r` become `\n`) before conversion, and the
/// Markdown is passed through `wrapper::wrap_markdown` when `options.wrap` is enabled.
///
/// # Arguments
///
/// * `html` - The HTML source text
/// * `options` - Conversion settings; `None` selects `ConversionOptions::default()`
/// * `metadata_cfg` - Configuration selecting which metadata is extracted
///
/// # Returns
///
/// A tuple of `(markdown: String, metadata: ExtendedMetadata)` on success.
///
/// # Errors
///
/// Fails if the conversion itself fails, or with `ConversionError::Other` if the collector
/// is still shared after conversion and therefore cannot be recovered.
///
/// # Example
///
/// ```ignore
/// use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
///
/// let html = r#"<html lang="en"><head><title>Test</title></head><body><h1 id="main">Hello</h1></body></html>"#;
/// let (markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default()).unwrap();
///
/// assert_eq!(metadata.document.title, Some("Test".to_string()));
/// assert_eq!(metadata.headers.len(), 1);
/// assert_eq!(metadata.document.language, Some("en".to_string()));
/// ```
pub fn convert_with_metadata(
    html: &str,
    options: Option<ConversionOptions>,
    metadata_cfg: MetadataConfig,
) -> Result<(String, ExtendedMetadata)> {
    use std::cell::RefCell;
    use std::rc::Rc;

    let opts = options.unwrap_or_default();

    // Borrow the caller's buffer unless a carriage return forces a rewritten copy.
    let source: Cow<'_, str> = html
        .contains('\r')
        .then(|| html.replace("\r\n", "\n").replace('\r', "\n"))
        .map_or(Cow::Borrowed(html), Cow::Owned);

    let shared = Rc::new(RefCell::new(metadata::MetadataCollector::new(metadata_cfg)));

    let rendered = converter::convert_html_with_metadata(&*source, &opts, Rc::clone(&shared))?;
    let markdown = if opts.wrap {
        wrapper::wrap_markdown(&rendered, &opts)
    } else {
        rendered
    };

    // The converter received a clone of the handle; it must have been dropped by now for the
    // collector to be taken back by value.
    let cell = Rc::try_unwrap(shared)
        .map_err(|_| ConversionError::Other("failed to recover metadata state".to_string()))?;
    let metadata = cell.into_inner().finish();

    Ok((markdown, metadata))
}
184
/// End-to-end tests for [`convert_with_metadata`]; compiled only for test builds with the
/// `metadata` feature enabled.
#[cfg(all(test, feature = "metadata"))]
mod tests {
    use super::*;

    /// Exercises headers, links, and images together with every extraction flag enabled.
    #[test]
    fn test_convert_with_metadata_full_workflow() {
        let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";

        let config = MetadataConfig {
            extract_headers: true,
            extract_links: true,
            extract_images: true,
            extract_structured_data: true,
            max_structured_data_size: 1_000_000,
        };

        let (markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        // Verify markdown was generated
        assert!(!markdown.is_empty());
        assert!(markdown.contains("Main Title"));
        assert!(markdown.contains("Subsection"));

        // Verify the language was extracted (the input also sets `dir`, which is not
        // asserted here)
        assert_eq!(metadata.document.language, Some("en".to_string()));

        // Verify headers were extracted
        assert_eq!(metadata.headers.len(), 2);
        assert_eq!(metadata.headers[0].level, 1);
        assert_eq!(metadata.headers[0].text, "Main Title");
        assert_eq!(metadata.headers[0].id, Some("main-title".to_string()));
        assert_eq!(metadata.headers[1].level, 2);
        assert_eq!(metadata.headers[1].text, "Subsection");

        // Verify links were extracted: at least one external and one in-page anchor link
        assert!(metadata.links.len() >= 2);
        assert!(metadata.links.iter().any(|l| l.link_type == LinkType::External));
        assert!(metadata.links.iter().any(|l| l.link_type == LinkType::Anchor));

        // Verify images were extracted
        assert_eq!(metadata.images.len(), 1);
        assert_eq!(metadata.images[0].alt, Some("Test image".to_string()));
        assert_eq!(metadata.images[0].title, Some("Image title".to_string()));
        assert_eq!(metadata.images[0].image_type, ImageType::External);
    }

    /// Checks the document-level fields taken from `<title>`, `<meta>`, and Open Graph tags.
    #[test]
    fn test_convert_with_metadata_document_fields() {
        let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";

        let (_markdown, metadata) =
            convert_with_metadata(html, None, MetadataConfig::default()).expect("conversion should succeed");

        assert_eq!(
            metadata.document.title,
            Some("Test Article".to_string()),
            "document: {:?}",
            metadata.document
        );
        assert_eq!(metadata.document.description, Some("Desc".to_string()));
        assert_eq!(metadata.document.author, Some("Author".to_string()));
        assert_eq!(metadata.document.language, Some("en".to_string()));
        assert_eq!(metadata.document.open_graph.get("title"), Some(&"OG Title".to_string()));
        assert_eq!(
            metadata.document.open_graph.get("description"),
            Some(&"OG Desc".to_string())
        );
    }

    /// With every extraction flag off, the collections stay empty but document metadata is
    /// still populated.
    #[test]
    fn test_convert_with_metadata_empty_config() {
        let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";

        let config = MetadataConfig {
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 0,
        };

        let (_markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        // With extraction disabled, collections should be empty
        assert!(metadata.headers.is_empty());
        assert!(metadata.links.is_empty());
        assert!(metadata.images.is_empty());
        // Document metadata extraction includes language from html tag
        assert_eq!(metadata.document.language, Some("en".to_string()));
    }

    /// An `<img>` whose `src` is a `data:` URI must be classified as `ImageType::DataUri`.
    #[test]
    fn test_convert_with_metadata_data_uri_image() {
        // The source must actually be a data URI (here a 1x1 PNG) for the classification
        // asserted below to be meaningful; an empty `src` would not exercise it.
        let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==\" alt=\"Pixel\"></body></html>";

        let config = MetadataConfig::default();

        let (_markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        assert_eq!(metadata.images.len(), 1);
        assert_eq!(metadata.images[0].image_type, ImageType::DataUri);
        assert_eq!(metadata.images[0].alt, Some("Pixel".to_string()));
    }

    /// Root-relative and parent-relative hrefs are both expected to count as internal links.
    #[test]
    fn test_convert_with_metadata_relative_paths() {
        let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;

        let config = MetadataConfig::default();

        let (_markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        let internal_links: Vec<_> = metadata
            .links
            .iter()
            .filter(|l| l.link_type == LinkType::Internal)
            .collect();
        assert_eq!(internal_links.len(), 2);
    }
}