html_to_markdown_rs/
lib.rs

1//! High-performance HTML to Markdown converter.
2//!
3//! Built with html5ever for fast, memory-efficient HTML parsing.
4//!
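//! ## Optional inline image extraction
//!
//! Enable the `inline-images` Cargo feature to collect embedded data URI images and inline SVG
//! assets alongside the produced Markdown (see `convert_with_inline_images`).
//!
//! ## Optional metadata extraction
//!
//! Enable the `metadata` Cargo feature to collect document metadata (title, headers, links,
//! images, ...) during conversion (see `convert_with_metadata`).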
9
10use std::borrow::Cow;
11
12pub mod converter;
13pub mod error;
14pub mod hocr;
15#[cfg(feature = "inline-images")]
16mod inline_images;
17#[cfg(feature = "metadata")]
18pub mod metadata;
19pub mod options;
20pub mod safety;
21pub mod text;
22pub mod wrapper;
23
24pub use error::{ConversionError, Result};
25#[cfg(feature = "inline-images")]
26pub use inline_images::{
27    HtmlExtraction, InlineImage, InlineImageConfig, InlineImageFormat, InlineImageSource, InlineImageWarning,
28};
29#[cfg(feature = "metadata")]
30pub use metadata::{
31    DEFAULT_MAX_STRUCTURED_DATA_SIZE, DocumentMetadata, ExtendedMetadata, HeaderMetadata, ImageMetadata, ImageType,
32    LinkMetadata, LinkType, MetadataConfig, StructuredData, StructuredDataType, TextDirection,
33};
34pub use options::{
35    CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
36    PreprocessingOptions, PreprocessingPreset, WhitespaceMode,
37};
38
39/// Convert HTML to Markdown.
40///
41/// # Arguments
42///
43/// * `html` - The HTML string to convert
44/// * `options` - Optional conversion options (defaults to ConversionOptions::default())
45///
46/// # Example
47///
48/// ```
49/// use html_to_markdown_rs::{convert, ConversionOptions};
50///
51/// let html = "<h1>Hello World</h1>";
52/// let markdown = convert(html, None).unwrap();
53/// assert!(markdown.contains("Hello World"));
54/// ```
55pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
56    let options = options.unwrap_or_default();
57
58    let normalized_html = if html.contains('\r') {
59        Cow::Owned(html.replace("\r\n", "\n").replace('\r', "\n"))
60    } else {
61        Cow::Borrowed(html)
62    };
63
64    let markdown = converter::convert_html(normalized_html.as_ref(), &options)?;
65
66    if options.wrap {
67        Ok(wrapper::wrap_markdown(&markdown, &options))
68    } else {
69        Ok(markdown)
70    }
71}
72
#[cfg(feature = "inline-images")]
/// Convert HTML to Markdown and gather inline image assets on the way (requires the
/// `inline-images` feature).
///
/// Inline image data URIs and inline `<svg>` elements are collected while the Markdown is
/// produced. Line endings are normalized first (`\r\n` and lone `\r` become `\n`).
///
/// # Arguments
///
/// * `html` - The HTML string to convert
/// * `options` - Conversion options; `None` falls back to `ConversionOptions::default()`
/// * `image_cfg` - Configuration controlling inline image extraction
///
/// # Returns
///
/// An [`HtmlExtraction`] holding the Markdown (wrapped when `options.wrap` is set), the collected
/// inline images, and any warnings the collector recorded.
///
/// # Errors
///
/// * Any error from constructing the image collector with `image_cfg`.
/// * Any error reported by the underlying converter.
/// * `ConversionError::Other` if the collector is still shared once conversion has finished and
///   its state therefore cannot be taken back.
pub fn convert_with_inline_images(
    html: &str,
    options: Option<ConversionOptions>,
    image_cfg: InlineImageConfig,
) -> Result<HtmlExtraction> {
    use std::cell::RefCell;
    use std::rc::Rc;

    let opts = options.unwrap_or_default();

    // Only allocate a normalized copy when there is a carriage return to rewrite.
    let source: Cow<'_, str> = if !html.contains('\r') {
        Cow::Borrowed(html)
    } else {
        Cow::Owned(html.replace("\r\n", "\n").replace('\r', "\n"))
    };

    // The converter gets its own handle; this one is kept to read the results back afterwards.
    let shared = Rc::new(RefCell::new(inline_images::InlineImageCollector::new(image_cfg)?));

    let rendered = converter::convert_html_with_inline_collector(&*source, &opts, Rc::clone(&shared))?;

    let markdown = match opts.wrap {
        true => wrapper::wrap_markdown(&rendered, &opts),
        false => rendered,
    };

    // Unwrapping only succeeds when the converter has dropped every clone of the handle.
    let state = Rc::try_unwrap(shared)
        .ok()
        .ok_or_else(|| ConversionError::Other("failed to recover inline image state".to_string()))?
        .into_inner();
    let (inline_images, warnings) = state.finish();

    Ok(HtmlExtraction {
        markdown,
        inline_images,
        warnings,
    })
}
121
#[cfg(feature = "metadata")]
/// Convert HTML to Markdown with metadata extraction (requires the `metadata` feature).
///
/// A metadata collector is handed to the converter, so the Markdown and the metadata are produced
/// by the same conversion call. Useful for content analysis, SEO tooling, and document indexing
/// workflows.
///
/// # Arguments
///
/// * `html` - The HTML string to convert. Line endings are normalized first: `\r\n` and lone `\r`
///   both become `\n`.
/// * `options` - Optional conversion configuration. Defaults to `ConversionOptions::default()` if `None`.
/// * `metadata_cfg` - Configuration for metadata extraction granularity. Use `MetadataConfig::default()`
///   or set the individual `extract_*` flags to choose what is collected.
///
/// # Returns
///
/// On success, returns a tuple of:
/// - `String`: The converted Markdown output (wrapped when `options.wrap` is set)
/// - [`ExtendedMetadata`]: The collected metadata, including:
///   - `document`: Title, description, author, language, and Open Graph properties
///   - `headers`: Heading elements with level, text, and `id`
///   - `links`: Hyperlinks with a [`LinkType`] classification (e.g. anchor, internal, external)
///   - `images`: Images with alt text, title, and an [`ImageType`] classification
///   - structured data, when enabled via `extract_structured_data` (see [`ExtendedMetadata`])
///
/// # Errors
///
/// Returns a [`ConversionError`] if:
/// - the underlying converter reports an error (propagated unchanged), or
/// - the metadata collector is still shared after conversion, in which case
///   `ConversionError::Other("failed to recover metadata state")` is returned.
///
/// # Performance Notes
///
/// - The converter is invoked once; metadata is gathered during that same call
/// - This function is compiled out entirely when the `metadata` feature is disabled
/// - Structured data is size-limited via `MetadataConfig::max_structured_data_size`
///
/// # Example: Basic Usage
///
/// ```ignore
/// use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
///
/// let html = r#"
///   <html lang="en">
///     <head><title>My Article</title></head>
///     <body>
///       <h1 id="intro">Introduction</h1>
///       <p>Welcome to <a href="https://example.com">our site</a></p>
///     </body>
///   </html>
/// "#;
///
/// let (markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default())?;
///
/// assert_eq!(metadata.document.title, Some("My Article".to_string()));
/// assert_eq!(metadata.document.language, Some("en".to_string()));
/// assert_eq!(metadata.headers[0].text, "Introduction");
/// assert_eq!(metadata.headers[0].id, Some("intro".to_string()));
/// assert_eq!(metadata.links.len(), 1);
/// # Ok::<(), html_to_markdown_rs::ConversionError>(())
/// ```
///
/// # Example: Selective Metadata Extraction
///
/// ```ignore
/// use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
///
/// let html = "<html><body><h1>Title</h1><a href='#anchor'>Link</a></body></html>";
///
/// // Extract only headers and document metadata, skip links/images
/// let config = MetadataConfig {
///     extract_document: true,
///     extract_headers: true,
///     extract_links: false,
///     extract_images: false,
///     extract_structured_data: false,
///     max_structured_data_size: 0,
/// };
///
/// let (markdown, metadata) = convert_with_metadata(html, None, config)?;
/// assert!(metadata.headers.len() > 0);
/// assert!(metadata.links.is_empty());  // Not extracted
/// # Ok::<(), html_to_markdown_rs::ConversionError>(())
/// ```
///
/// # Example: With Conversion Options and Metadata Config
///
/// ```ignore
/// use html_to_markdown_rs::{convert_with_metadata, ConversionOptions, MetadataConfig, HeadingStyle};
///
/// let html = "<html><head><title>Blog Post</title></head><body><h1>Hello</h1></body></html>";
///
/// let options = ConversionOptions {
///     heading_style: HeadingStyle::Atx,
///     wrap: true,
///     wrap_width: 80,
///     ..Default::default()
/// };
///
/// let metadata_cfg = MetadataConfig::default();
///
/// let (markdown, metadata) = convert_with_metadata(html, Some(options), metadata_cfg)?;
/// // Markdown will use ATX-style headings (# H1, ## H2, etc.)
/// // Wrapped at 80 characters
/// // All metadata extracted
/// # Ok::<(), html_to_markdown_rs::ConversionError>(())
/// ```
///
/// # See Also
///
/// - [`convert`] - Simple HTML to Markdown conversion without metadata
/// - `convert_with_inline_images` - Conversion with inline image extraction (behind the
///   `inline-images` feature)
/// - [`MetadataConfig`] - Configuration for metadata extraction
/// - [`ExtendedMetadata`] - Metadata structure documentation
/// - [`metadata`] module - Detailed type documentation for metadata components
pub fn convert_with_metadata(
    html: &str,
    options: Option<ConversionOptions>,
    metadata_cfg: MetadataConfig,
) -> Result<(String, ExtendedMetadata)> {
    use std::cell::RefCell;
    use std::rc::Rc;

    let options = options.unwrap_or_default();

    // Borrow the input untouched unless it actually contains a carriage return to rewrite.
    let normalized_html = if html.contains('\r') {
        Cow::Owned(html.replace("\r\n", "\n").replace('\r', "\n"))
    } else {
        Cow::Borrowed(html)
    };

    // The converter receives its own handle to the collector; this one is kept so the collected
    // metadata can be taken back once conversion is done.
    let metadata_collector = Rc::new(RefCell::new(metadata::MetadataCollector::new(metadata_cfg)));

    let markdown =
        converter::convert_html_with_metadata(normalized_html.as_ref(), &options, Rc::clone(&metadata_collector))?;

    let markdown = if options.wrap {
        wrapper::wrap_markdown(&markdown, &options)
    } else {
        markdown
    };

    // `try_unwrap` only succeeds when no other clone of the handle is still alive.
    let metadata_collector = Rc::try_unwrap(metadata_collector)
        .map_err(|_| ConversionError::Other("failed to recover metadata state".to_string()))?
        .into_inner();
    let metadata = metadata_collector.finish();

    Ok((markdown, metadata))
}
274
#[cfg(all(test, feature = "metadata"))]
mod tests {
    use super::*;

    /// End-to-end check: Markdown output plus every metadata category with extraction enabled.
    #[test]
    fn test_convert_with_metadata_full_workflow() {
        let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";

        let everything = MetadataConfig {
            extract_document: true,
            extract_headers: true,
            extract_links: true,
            extract_images: true,
            extract_structured_data: true,
            max_structured_data_size: metadata::DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };

        let (md, meta) = convert_with_metadata(html, None, everything).expect("conversion should succeed");

        // Markdown output carries both headings.
        assert!(!md.is_empty());
        assert!(md.contains("Main Title"));
        assert!(md.contains("Subsection"));

        // Document language is picked up (only the language is asserted here).
        assert_eq!(meta.document.language.as_deref(), Some("en"));

        // Both headings are reported, in document order.
        assert_eq!(meta.headers.len(), 2);
        let (first, second) = (&meta.headers[0], &meta.headers[1]);
        assert_eq!(first.level, 1);
        assert_eq!(first.text, "Main Title");
        assert_eq!(first.id.as_deref(), Some("main-title"));
        assert_eq!(second.level, 2);
        assert_eq!(second.text, "Subsection");

        // At least one external and one anchor link are classified.
        assert!(meta.links.len() >= 2);
        assert!(meta.links.iter().any(|link| link.link_type == LinkType::External));
        assert!(meta.links.iter().any(|link| link.link_type == LinkType::Anchor));

        // The single image keeps its alt text, title, and classification.
        assert_eq!(meta.images.len(), 1);
        let image = &meta.images[0];
        assert_eq!(image.alt.as_deref(), Some("Test image"));
        assert_eq!(image.title.as_deref(), Some("Image title"));
        assert_eq!(image.image_type, ImageType::External);
    }

    /// `<title>`, `<meta>` tags, `lang`, and Open Graph properties land in `document`.
    #[test]
    fn test_convert_with_metadata_document_fields() {
        let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";

        let (_md, meta) =
            convert_with_metadata(html, None, MetadataConfig::default()).expect("conversion should succeed");
        let doc = &meta.document;

        assert_eq!(doc.title.as_deref(), Some("Test Article"), "document: {:?}", doc);
        assert_eq!(doc.description.as_deref(), Some("Desc"));
        assert_eq!(doc.author.as_deref(), Some("Author"));
        assert_eq!(doc.language.as_deref(), Some("en"));
        assert_eq!(doc.open_graph.get("title").map(String::as_str), Some("OG Title"));
        assert_eq!(
            doc.open_graph.get("description").map(String::as_str),
            Some("OG Desc")
        );
    }

    /// With every `extract_*` flag off, the header/link/image collections stay empty.
    #[test]
    fn test_convert_with_metadata_empty_config() {
        let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";

        let nothing = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 0,
        };

        let (_md, meta) = convert_with_metadata(html, None, nothing).expect("conversion should succeed");

        // With extraction disabled, collections should be empty
        assert!(meta.headers.is_empty());
        assert!(meta.links.is_empty());
        assert!(meta.images.is_empty());
        // This test pins that the `<html lang>` value is still reported even though
        // `extract_document` is false.
        assert_eq!(meta.document.language.as_deref(), Some("en"));
    }

    /// An `<img>` whose `src` is a `data:` URI is classified as `ImageType::DataUri`.
    #[test]
    fn test_convert_with_metadata_data_uri_image() {
        let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"Pixel\"></body></html>";

        let (_md, meta) =
            convert_with_metadata(html, None, MetadataConfig::default()).expect("conversion should succeed");

        assert_eq!(meta.images.len(), 1);
        let image = &meta.images[0];
        assert_eq!(image.image_type, ImageType::DataUri);
        assert_eq!(image.alt.as_deref(), Some("Pixel"));
    }

    /// Root-relative and parent-relative hrefs are both classified as internal links.
    #[test]
    fn test_convert_with_metadata_relative_paths() {
        let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;

        let (_md, meta) =
            convert_with_metadata(html, None, MetadataConfig::default()).expect("conversion should succeed");

        let internal_count = meta
            .links
            .iter()
            .filter(|link| link.link_type == LinkType::Internal)
            .count();
        assert_eq!(internal_count, 2);
    }
}