html_to_markdown_rs/
lib.rs

1//! High-performance HTML to Markdown converter.
2//!
3//! Built with html5ever for fast, memory-efficient HTML parsing.
4//!
5//! ## Optional inline image extraction
6//!
7//! Enable the `inline-images` Cargo feature to collect embedded data URI images and inline SVG
8//! assets alongside the produced Markdown.
9
10use std::borrow::Cow;
11
12pub mod converter;
13pub mod error;
14pub mod hocr;
15#[cfg(feature = "inline-images")]
16mod inline_images;
17#[cfg(feature = "metadata")]
18pub mod metadata;
19pub mod options;
20pub mod safety;
21pub mod text;
22#[cfg(feature = "visitor")]
23pub mod visitor;
24#[cfg(feature = "visitor")]
25pub mod visitor_helpers;
26#[cfg(feature = "async-visitor")]
27pub use visitor_helpers::AsyncVisitorHandle;
28pub mod wrapper;
29
30pub use error::{ConversionError, Result};
31#[cfg(feature = "inline-images")]
32pub use inline_images::{
33    DEFAULT_INLINE_IMAGE_LIMIT, HtmlExtraction, InlineImage, InlineImageConfig, InlineImageConfigUpdate,
34    InlineImageFormat, InlineImageSource, InlineImageWarning,
35};
36#[cfg(feature = "metadata")]
37pub use metadata::{
38    DEFAULT_MAX_STRUCTURED_DATA_SIZE, DocumentMetadata, ExtendedMetadata, HeaderMetadata, ImageMetadata, ImageType,
39    LinkMetadata, LinkType, MetadataConfig, MetadataConfigUpdate, StructuredData, StructuredDataType, TextDirection,
40};
41pub use options::{
42    CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, ListIndentType,
43    NewlineStyle, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset, WhitespaceMode,
44};
45
/// Maximum number of leading input bytes sampled by `validate_input` when counting
/// NUL and control bytes.
const BINARY_SCAN_LIMIT: usize = 8192;
/// Share of control bytes in the sample above which `validate_input` rejects the input.
const BINARY_CONTROL_RATIO: f64 = 0.3;
/// Minimum share of NUL bytes in the sample before `detect_utf16_hint` considers the
/// input to be BOM-less UTF-16.
const BINARY_UTF16_NULL_RATIO: f64 = 0.2;

/// Leading byte signatures of well-known binary formats, each paired with the
/// human-readable label that `validate_input` embeds in its error message.
const BINARY_MAGIC_PREFIXES: &[(&[u8], &str)] = &[
    (b"\x1F\x8B", "gzip-compressed data"),
    (b"\x28\xB5\x2F\xFD", "zstd-compressed data"),
    (b"PK\x03\x04", "zip archive"),
    (b"PK\x05\x06", "zip archive"),
    (b"PK\x07\x08", "zip archive"),
    (b"%PDF-", "PDF data"),
];
58
59fn validate_input(html: &str) -> Result<()> {
60    let bytes = html.as_bytes();
61    if bytes.is_empty() {
62        return Ok(());
63    }
64
65    if let Some(label) = detect_binary_magic(bytes) {
66        return Err(ConversionError::InvalidInput(format!(
67            "binary data detected ({label}); decode/decompress to UTF-8 HTML first"
68        )));
69    }
70
71    let sample_len = bytes.len().min(BINARY_SCAN_LIMIT);
72    let mut control_count = 0usize;
73    let mut nul_count = 0usize;
74    let mut even_nul_count = 0usize;
75    let mut odd_nul_count = 0usize;
76
77    for (idx, &byte) in bytes[..sample_len].iter().enumerate() {
78        if byte == 0 {
79            nul_count += 1;
80            if idx % 2 == 0 {
81                even_nul_count += 1;
82            } else {
83                odd_nul_count += 1;
84            }
85        }
86        let is_control = (byte < 0x09) || (0x0E..0x20).contains(&byte);
87        if is_control {
88            control_count += 1;
89        }
90    }
91
92    if nul_count > 0 {
93        if let Some(label) = detect_utf16_hint(bytes, sample_len, nul_count, even_nul_count, odd_nul_count) {
94            return Err(ConversionError::InvalidInput(format!(
95                "binary data detected ({label}); decode to UTF-8 HTML first"
96            )));
97        }
98        return Err(ConversionError::InvalidInput("binary data detected".to_string()));
99    }
100
101    let control_ratio = control_count as f64 / sample_len as f64;
102    if control_ratio > BINARY_CONTROL_RATIO {
103        return Err(ConversionError::InvalidInput(
104            "binary data detected (excess control bytes)".to_string(),
105        ));
106    }
107
108    Ok(())
109}
110
111fn detect_binary_magic(bytes: &[u8]) -> Option<&'static str> {
112    for (prefix, label) in BINARY_MAGIC_PREFIXES {
113        if bytes.starts_with(prefix) {
114            return Some(*label);
115        }
116    }
117    None
118}
119
120fn detect_utf16_hint(
121    bytes: &[u8],
122    sample_len: usize,
123    nul_count: usize,
124    even_nul_count: usize,
125    odd_nul_count: usize,
126) -> Option<&'static str> {
127    if bytes.len() >= 2 {
128        if bytes.starts_with(b"\xFF\xFE") {
129            return Some("UTF-16LE BOM");
130        }
131        if bytes.starts_with(b"\xFE\xFF") {
132            return Some("UTF-16BE BOM");
133        }
134    }
135
136    let nul_ratio = nul_count as f64 / sample_len as f64;
137    if nul_ratio < BINARY_UTF16_NULL_RATIO {
138        return None;
139    }
140
141    let dominant_ratio = (even_nul_count.max(odd_nul_count) as f64) / nul_count as f64;
142    if dominant_ratio >= 0.9 {
143        Some("UTF-16 data without BOM")
144    } else {
145        None
146    }
147}
148
/// Convert CRLF and lone CR line endings to LF.
///
/// Borrows the input unchanged when it contains no `\r`, so the common case
/// does not allocate.
fn normalize_line_endings(html: &str) -> Cow<'_, str> {
    if !html.contains('\r') {
        return Cow::Borrowed(html);
    }

    // Collapse CRLF pairs first so the remaining CRs are all standalone.
    let without_crlf = html.replace("\r\n", "\n");
    Cow::Owned(without_crlf.replace('\r', "\n"))
}
156
157fn fast_text_only(html: &str, options: &ConversionOptions) -> Option<String> {
158    if html.contains('<') {
159        return None;
160    }
161
162    let mut decoded = text::decode_html_entities_cow(html);
163    if options.strip_newlines && (decoded.contains('\n') || decoded.contains('\r')) {
164        decoded = Cow::Owned(decoded.replace(&['\r', '\n'][..], " "));
165    }
166    let trimmed = decoded.trim_end_matches('\n');
167    if trimmed.is_empty() {
168        return Some(String::new());
169    }
170
171    let normalized = if options.whitespace_mode == WhitespaceMode::Normalized {
172        text::normalize_whitespace_cow(trimmed)
173    } else {
174        Cow::Borrowed(trimmed)
175    };
176
177    let escaped =
178        if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
179            text::escape(
180                normalized.as_ref(),
181                options.escape_misc,
182                options.escape_asterisks,
183                options.escape_underscores,
184                options.escape_ascii,
185            )
186        } else {
187            normalized.into_owned()
188        };
189
190    let mut output = String::with_capacity(escaped.len() + 1);
191    output.push_str(&escaped);
192    while output.ends_with(' ') || output.ends_with('\t') {
193        output.pop();
194    }
195    output.push('\n');
196    Some(output)
197}
198
/// Deserialize `json` into `T`, mapping any parse failure to
/// `ConversionError::ConfigError` carrying the parser's message.
#[cfg(any(feature = "serde", feature = "metadata"))]
fn parse_json<T>(json: &str) -> Result<T>
where
    T: serde::de::DeserializeOwned,
{
    match serde_json::from_str(json) {
        Ok(value) => Ok(value),
        Err(err) => Err(ConversionError::ConfigError(err.to_string())),
    }
}
203
/// Build [`ConversionOptions`] from a JSON document.
///
/// The JSON is parsed as a [`ConversionOptionsUpdate`] and then converted into
/// full options.
///
/// # Errors
///
/// Returns `ConversionError::ConfigError` when the JSON cannot be deserialized.
#[cfg(any(feature = "serde", feature = "metadata"))]
pub fn conversion_options_from_json(json: &str) -> Result<ConversionOptions> {
    parse_json::<ConversionOptionsUpdate>(json).map(ConversionOptions::from)
}
209
/// Parse a JSON document into a [`ConversionOptionsUpdate`].
///
/// # Errors
///
/// Returns `ConversionError::ConfigError` when the JSON cannot be deserialized.
#[cfg(any(feature = "serde", feature = "metadata"))]
pub fn conversion_options_update_from_json(json: &str) -> Result<ConversionOptionsUpdate> {
    parse_json::<ConversionOptionsUpdate>(json)
}
214
/// Build an [`InlineImageConfig`] from a JSON document.
///
/// The JSON is parsed as an [`InlineImageConfigUpdate`] and turned into a config
/// via `InlineImageConfig::from_update`.
///
/// # Errors
///
/// Returns `ConversionError::ConfigError` when the JSON cannot be deserialized.
#[cfg(all(feature = "inline-images", any(feature = "serde", feature = "metadata")))]
pub fn inline_image_config_from_json(json: &str) -> Result<InlineImageConfig> {
    parse_json::<InlineImageConfigUpdate>(json).map(InlineImageConfig::from_update)
}
220
/// Build a [`MetadataConfig`] from a JSON document.
///
/// The JSON is parsed as a [`MetadataConfigUpdate`] and then converted into a
/// full config.
///
/// # Errors
///
/// Returns `ConversionError::ConfigError` when the JSON cannot be deserialized.
// `all(metadata, any(serde, metadata))` reduces to just `metadata`.
#[cfg(feature = "metadata")]
pub fn metadata_config_from_json(json: &str) -> Result<MetadataConfig> {
    let update: MetadataConfigUpdate = parse_json(json)?;
    Ok(MetadataConfig::from(update))
}
226
227/// Convert HTML to Markdown.
228///
229/// # Arguments
230///
231/// * `html` - The HTML string to convert
232/// * `options` - Optional conversion options (defaults to ConversionOptions::default())
233///
234/// # Example
235///
236/// ```
237/// use html_to_markdown_rs::{convert, ConversionOptions};
238///
239/// let html = "<h1>Hello World</h1>";
240/// let markdown = convert(html, None).unwrap();
241/// assert!(markdown.contains("Hello World"));
242/// ```
243pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
244    validate_input(html)?;
245    let options = options.unwrap_or_default();
246
247    let normalized_html = normalize_line_endings(html);
248
249    if !options.wrap {
250        if let Some(markdown) = fast_text_only(normalized_html.as_ref(), &options) {
251            return Ok(markdown);
252        }
253    }
254
255    let markdown = converter::convert_html(normalized_html.as_ref(), &options)?;
256
257    if options.wrap {
258        Ok(wrapper::wrap_markdown(&markdown, &options))
259    } else {
260        Ok(markdown)
261    }
262}
263
#[cfg(feature = "inline-images")]
/// Convert HTML to Markdown while collecting inline image assets (requires the
/// `inline-images` feature).
///
/// Extracts inline image data URIs and inline `<svg>` elements alongside Markdown conversion.
///
/// # Arguments
///
/// * `html` - The HTML string to convert
/// * `options` - Optional conversion options (defaults to `ConversionOptions::default()`)
/// * `image_cfg` - Configuration controlling inline image extraction
///
/// # Errors
///
/// Returns `ConversionError::InvalidInput` when the input looks like binary data,
/// propagates errors from building the collector and from the converter, and
/// returns `ConversionError::Other` if the collector is still shared after
/// conversion.
pub fn convert_with_inline_images(
    html: &str,
    options: Option<ConversionOptions>,
    image_cfg: InlineImageConfig,
) -> Result<HtmlExtraction> {
    use std::cell::RefCell;
    use std::rc::Rc;

    validate_input(html)?;
    let options = options.unwrap_or_default();
    let normalized_html = normalize_line_endings(html);

    // The converter receives its own handle to the collector; ours is reclaimed below.
    let image_collector = inline_images::InlineImageCollector::new(image_cfg)?;
    let shared = Rc::new(RefCell::new(image_collector));

    let raw_markdown =
        converter::convert_html_with_inline_collector(normalized_html.as_ref(), &options, Rc::clone(&shared))?;
    let markdown = if options.wrap {
        wrapper::wrap_markdown(&raw_markdown, &options)
    } else {
        raw_markdown
    };

    // Sole ownership is required to take the collected state back out.
    let image_collector = match Rc::try_unwrap(shared) {
        Ok(cell) => cell.into_inner(),
        Err(_) => {
            return Err(ConversionError::Other(
                "failed to recover inline image state".to_string(),
            ));
        }
    };
    let (inline_images, warnings) = image_collector.finish();

    Ok(HtmlExtraction {
        markdown,
        inline_images,
        warnings,
    })
}
309
#[cfg(feature = "metadata")]
/// Convert HTML to Markdown with comprehensive metadata extraction (requires the `metadata` feature).
///
/// Performs HTML-to-Markdown conversion while simultaneously extracting structured metadata in a
/// single pass for maximum efficiency. Ideal for content analysis, SEO optimization, and document
/// indexing workflows.
///
/// When `metadata_cfg` has no extraction enabled, this delegates to [`convert`] and returns
/// `ExtendedMetadata::default()`.
///
/// # Arguments
///
/// * `html` - The HTML string to convert. Line endings are normalized (CRLF and lone CR → LF).
/// * `options` - Optional conversion configuration. Defaults to `ConversionOptions::default()` if `None`.
///   Controls heading style, list indentation, escape behavior, wrapping, and other output formatting.
/// * `metadata_cfg` - Configuration for metadata extraction granularity. Use `MetadataConfig::default()`
///   to extract all metadata types, or customize with selective extraction flags.
///
/// # Returns
///
/// On success, returns a tuple of:
/// - `String`: The converted Markdown output
/// - `ExtendedMetadata`: Comprehensive metadata containing:
///   - `document`: Title, description, author, language, Open Graph, Twitter Card, and other meta tags
///   - `headers`: All heading elements (h1-h6) with hierarchy and IDs
///   - `links`: Hyperlinks classified as anchor, internal, external, email, or phone
///   - `images`: Image elements with source, dimensions, and alt text
///   - `structured_data`: JSON-LD, Microdata, and RDFa blocks
///
/// # Errors
///
/// Returns `ConversionError` if:
/// - The input looks like binary data (`ConversionError::InvalidInput`)
/// - HTML parsing fails
/// - Internal panic during conversion (wrapped in `ConversionError::Panic`)
/// - Configuration size limits exceeded
/// - The metadata collector is still shared after conversion (`ConversionError::Other`)
///
/// # Performance Notes
///
/// - Single-pass collection: metadata extraction has minimal overhead
/// - Zero cost when metadata feature is disabled
/// - Pre-allocated buffers: typically handles 50+ headers, 100+ links, 20+ images efficiently
/// - Structured data size-limited to prevent memory exhaustion (configurable)
///
/// # Example: Basic Usage
///
/// ```ignore
/// use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
///
/// let html = r#"
///   <html lang="en">
///     <head><title>My Article</title></head>
///     <body>
///       <h1 id="intro">Introduction</h1>
///       <p>Welcome to <a href="https://example.com">our site</a></p>
///     </body>
///   </html>
/// "#;
///
/// let (markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default())?;
///
/// assert_eq!(metadata.document.title, Some("My Article".to_string()));
/// assert_eq!(metadata.document.language, Some("en".to_string()));
/// assert_eq!(metadata.headers[0].text, "Introduction");
/// assert_eq!(metadata.headers[0].id, Some("intro".to_string()));
/// assert_eq!(metadata.links.len(), 1);
/// # Ok::<(), html_to_markdown_rs::ConversionError>(())
/// ```
///
/// # Example: Selective Metadata Extraction
///
/// ```ignore
/// use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
///
/// let html = "<html><body><h1>Title</h1><a href='#anchor'>Link</a></body></html>";
///
/// // Extract only headers and document metadata, skip links/images
/// let config = MetadataConfig {
///     extract_document: true,
///     extract_headers: true,
///     extract_links: false,
///     extract_images: false,
///     extract_structured_data: false,
///     max_structured_data_size: 0,
/// };
///
/// let (markdown, metadata) = convert_with_metadata(html, None, config)?;
/// assert!(metadata.headers.len() > 0);
/// assert!(metadata.links.is_empty());  // Not extracted
/// # Ok::<(), html_to_markdown_rs::ConversionError>(())
/// ```
///
/// # Example: With Conversion Options and Metadata Config
///
/// ```ignore
/// use html_to_markdown_rs::{convert_with_metadata, ConversionOptions, MetadataConfig, HeadingStyle};
///
/// let html = "<html><head><title>Blog Post</title></head><body><h1>Hello</h1></body></html>";
///
/// let options = ConversionOptions {
///     heading_style: HeadingStyle::Atx,
///     wrap: true,
///     wrap_width: 80,
///     ..Default::default()
/// };
///
/// let metadata_cfg = MetadataConfig::default();
///
/// let (markdown, metadata) = convert_with_metadata(html, Some(options), metadata_cfg)?;
/// // Markdown will use ATX-style headings (# H1, ## H2, etc.)
/// // Wrapped at 80 characters
/// // All metadata extracted
/// # Ok::<(), html_to_markdown_rs::ConversionError>(())
/// ```
///
/// # See Also
///
/// - [`convert`] - Simple HTML to Markdown conversion without metadata
/// - [`convert_with_inline_images`] - Conversion with inline image extraction
/// - [`MetadataConfig`] - Configuration for metadata extraction
/// - [`ExtendedMetadata`] - Metadata structure documentation
/// - [`metadata`] module - Detailed type documentation for metadata components
pub fn convert_with_metadata(
    html: &str,
    options: Option<ConversionOptions>,
    metadata_cfg: MetadataConfig,
) -> Result<(String, ExtendedMetadata)> {
    use std::cell::RefCell;
    use std::rc::Rc;

    if !metadata_cfg.any_enabled() {
        // Nothing to collect: `convert` validates the input and applies default
        // options itself, so delegating here avoids scanning the input twice.
        let markdown = convert(html, options)?;
        return Ok((markdown, ExtendedMetadata::default()));
    }

    validate_input(html)?;
    let options = options.unwrap_or_default();

    let normalized_html = normalize_line_endings(html);

    // The converter receives its own handle to the collector; ours is reclaimed below.
    let metadata_collector = Rc::new(RefCell::new(metadata::MetadataCollector::new(metadata_cfg)));

    let markdown =
        converter::convert_html_with_metadata(normalized_html.as_ref(), &options, Rc::clone(&metadata_collector))?;

    let markdown = if options.wrap {
        wrapper::wrap_markdown(&markdown, &options)
    } else {
        markdown
    };

    // Sole ownership is required to take the collected metadata back out.
    let metadata_collector = Rc::try_unwrap(metadata_collector)
        .map_err(|_| ConversionError::Other("failed to recover metadata state".to_string()))?
        .into_inner();
    let metadata = metadata_collector.finish();

    Ok((markdown, metadata))
}
463
/// Convert HTML to Markdown with a custom visitor callback.
///
/// This function allows you to provide a visitor implementation that can inspect,
/// modify, or replace the default conversion behavior for any HTML element type.
///
/// # Arguments
///
/// * `html` - The HTML input to convert
/// * `options` - Optional conversion options (uses defaults if None)
/// * `visitor` - Optional [`visitor::VisitorHandle`] wrapping the visitor implementation;
///   pass `None` to convert without a visitor
///
/// # Errors
///
/// Returns `ConversionError::InvalidInput` when the input looks like binary data,
/// and propagates any error reported by the converter.
///
/// # Example
///
/// ```ignore
/// use html_to_markdown_rs::convert_with_visitor;
/// use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult, VisitorHandle};
/// use std::cell::RefCell;
/// use std::rc::Rc;
///
/// #[derive(Debug)]
/// struct CustomVisitor;
///
/// impl HtmlVisitor for CustomVisitor {
///     fn visit_code_block(
///         &mut self,
///         _ctx: &NodeContext,
///         code: &str,
///         language: Option<&str>,
///     ) -> VisitResult {
///         VisitResult::Custom(format!("```{}\n{}\n```", language.unwrap_or(""), code))
///     }
/// }
///
/// let html = "<pre><code class=\"language-rust\">fn main() {}</code></pre>";
/// // NOTE: assumes `VisitorHandle` is `Rc<RefCell<dyn HtmlVisitor>>`; confirm against
/// // the `visitor` module before relying on this construction.
/// let visitor: VisitorHandle = Rc::new(RefCell::new(CustomVisitor));
/// let markdown = convert_with_visitor(html, None, Some(visitor)).unwrap();
/// ```
#[cfg(feature = "visitor")]
pub fn convert_with_visitor(
    html: &str,
    options: Option<ConversionOptions>,
    visitor: Option<visitor::VisitorHandle>,
) -> Result<String> {
    validate_input(html)?;
    let options = options.unwrap_or_default();

    let normalized_html = normalize_line_endings(html);

    let markdown = converter::convert_html_with_visitor(normalized_html.as_ref(), &options, visitor)?;

    Ok(if options.wrap {
        wrapper::wrap_markdown(&markdown, &options)
    } else {
        markdown
    })
}
518
#[cfg(feature = "async-visitor")]
/// Convert HTML to Markdown with an async visitor callback.
///
/// This async function allows you to provide an async visitor implementation that can inspect,
/// modify, or replace the default conversion behavior for any HTML element type.
///
/// This function is useful for:
/// - Python async functions (with `async def` and `asyncio`)
/// - TypeScript/JavaScript async functions (with `Promise`-based callbacks)
/// - Elixir processes (with message-passing async operations)
///
/// For synchronous languages (Ruby, PHP, Go, Java, C#), use `convert_with_visitor` instead.
///
/// # Note
///
/// The async visitor trait (`AsyncHtmlVisitor`) and async dispatch helpers are designed to be
/// consumed by language bindings (PyO3, NAPI-RS, Magnus, etc.) which can bridge async/await
/// semantics from their host languages. The conversion pipeline itself runs synchronously,
/// but visitor callbacks are defined as async to support languages with native async/await.
///
/// Binding implementations will be responsible for running async callbacks on appropriate
/// event loops (asyncio for Python, Promise chains for TypeScript, etc.).
///
/// # Arguments
///
/// * `html` - The HTML input to convert
/// * `options` - Optional conversion options (uses defaults if None)
/// * `visitor` - Optional async visitor implementing `AsyncHtmlVisitor` trait for customization.
///   Currently accepted but not used by this function (see the implementation note below).
///
/// # Example (Rust-like async)
///
/// ```ignore
/// use html_to_markdown_rs::convert_with_async_visitor;
/// use html_to_markdown_rs::visitor::{AsyncHtmlVisitor, NodeContext, VisitResult};
/// use async_trait::async_trait;
/// use std::rc::Rc;
/// use std::cell::RefCell;
///
/// #[derive(Debug)]
/// struct CustomAsyncVisitor;
///
/// #[async_trait]
/// impl AsyncHtmlVisitor for CustomAsyncVisitor {
///     async fn visit_code_block(
///         &mut self,
///         _ctx: &NodeContext,
///         code: &str,
///         language: Option<&str>,
///     ) -> VisitResult {
///         // Can perform async operations here (e.g., syntax highlighting via service)
///         VisitResult::Custom(format!("```{}\n{}\n```", language.unwrap_or(""), code))
///     }
/// }
///
/// let html = "<pre><code class=\"language-rust\">fn main() {}</code></pre>";
/// let visitor = Some(Rc::new(RefCell::new(CustomAsyncVisitor) as _));
/// let markdown = convert_with_async_visitor(html, None, visitor).await.unwrap();
/// ```
///
/// # Implementation Note
///
/// Currently, this function placeholder documents the async visitor interface for binding
/// implementations. The actual async dispatch integration will be implemented in binding
/// layers (Python, TypeScript, etc.) to properly bridge async/await semantics to the
/// synchronous Rust conversion core.
pub async fn convert_with_async_visitor(
    html: &str,
    options: Option<ConversionOptions>,
    _visitor: Option<visitor_helpers::AsyncVisitorHandle>,
) -> Result<String> {
    validate_input(html)?;
    let options = options.unwrap_or_default();
    let normalized_html = normalize_line_endings(html);

    // TODO: Implement async dispatch in conversion pipeline
    // Until then the visitor is ignored and the plain converter is used.
    let raw_markdown = converter::convert_html(normalized_html.as_ref(), &options)?;

    Ok(if options.wrap {
        wrapper::wrap_markdown(&raw_markdown, &options)
    } else {
        raw_markdown
    })
}
603
// Tests for `convert_with_metadata`; compiled only when the `metadata` feature is on.
#[cfg(all(test, feature = "metadata"))]
mod tests {
    use super::*;

    /// Every extraction flag enabled: checks Markdown output plus document, header,
    /// link and image metadata for a small complete page.
    #[test]
    fn test_convert_with_metadata_full_workflow() {
        let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";

        let config = MetadataConfig {
            extract_document: true,
            extract_headers: true,
            extract_links: true,
            extract_images: true,
            extract_structured_data: true,
            max_structured_data_size: metadata::DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };

        let (markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        assert!(!markdown.is_empty());
        assert!(markdown.contains("Main Title"));
        assert!(markdown.contains("Subsection"));

        assert_eq!(metadata.document.language, Some("en".to_string()));

        assert_eq!(metadata.headers.len(), 2);
        assert_eq!(metadata.headers[0].level, 1);
        assert_eq!(metadata.headers[0].text, "Main Title");
        assert_eq!(metadata.headers[0].id, Some("main-title".to_string()));
        assert_eq!(metadata.headers[1].level, 2);
        assert_eq!(metadata.headers[1].text, "Subsection");

        // One absolute URL (external) and one `#fragment` link (anchor) are expected.
        assert!(metadata.links.len() >= 2);
        let external_link = metadata.links.iter().find(|l| l.link_type == LinkType::External);
        assert!(external_link.is_some());
        let anchor_link = metadata.links.iter().find(|l| l.link_type == LinkType::Anchor);
        assert!(anchor_link.is_some());

        assert_eq!(metadata.images.len(), 1);
        assert_eq!(metadata.images[0].alt, Some("Test image".to_string()));
        assert_eq!(metadata.images[0].title, Some("Image title".to_string()));
        assert_eq!(metadata.images[0].image_type, ImageType::External);
    }

    /// Default config: `<title>`, `<meta name=...>` and `og:*` properties end up in
    /// `metadata.document`.
    #[test]
    fn test_convert_with_metadata_document_fields() {
        let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";

        let (_markdown, metadata) =
            convert_with_metadata(html, None, MetadataConfig::default()).expect("conversion should succeed");

        assert_eq!(
            metadata.document.title,
            Some("Test Article".to_string()),
            "document: {:?}",
            metadata.document
        );
        assert_eq!(metadata.document.description, Some("Desc".to_string()));
        assert_eq!(metadata.document.author, Some("Author".to_string()));
        assert_eq!(metadata.document.language, Some("en".to_string()));
        // Open Graph entries are looked up without their `og:` prefix.
        assert_eq!(metadata.document.open_graph.get("title"), Some(&"OG Title".to_string()));
        assert_eq!(
            metadata.document.open_graph.get("description"),
            Some(&"OG Desc".to_string())
        );
    }

    /// All extraction flags off: the returned metadata must be empty.
    #[test]
    fn test_convert_with_metadata_empty_config() {
        let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";

        let config = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 0,
        };

        let (_markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        assert!(metadata.headers.is_empty());
        assert!(metadata.links.is_empty());
        assert!(metadata.images.is_empty());
        assert_eq!(metadata.document.language, None);
    }

    /// An `<img>` whose `src` is a `data:` URI is classified as `ImageType::DataUri`.
    #[test]
    fn test_convert_with_metadata_data_uri_image() {
        let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"Pixel\"></body></html>";

        let config = MetadataConfig::default();

        let (_markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        assert_eq!(metadata.images.len(), 1);
        assert_eq!(metadata.images[0].image_type, ImageType::DataUri);
        assert_eq!(metadata.images[0].alt, Some("Pixel".to_string()));
    }

    /// Root-relative (`/page`) and parent-relative (`../other`) hrefs both count as
    /// `LinkType::Internal`.
    #[test]
    fn test_convert_with_metadata_relative_paths() {
        let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;

        let config = MetadataConfig::default();

        let (_markdown, metadata) = convert_with_metadata(html, None, config).expect("conversion should succeed");

        let internal_links: Vec<_> = metadata
            .links
            .iter()
            .filter(|l| l.link_type == LinkType::Internal)
            .collect();
        assert_eq!(internal_links.len(), 2);
    }
}
721
// Tests for input validation and the plain-text path of `convert`.
#[cfg(test)]
mod basic_tests {
    use super::*;

    /// Any NUL byte in the input is rejected as binary data.
    #[test]
    fn test_binary_input_rejected() {
        let html = "PDF\0DATA";
        let result = convert(html, None);
        assert!(matches!(result, Err(ConversionError::InvalidInput(_))));
    }

    /// Lossily decoded gzip header bytes are rejected.
    ///
    /// `from_utf8_lossy` replaces the invalid `\x8B` byte with U+FFFD, so the gzip
    /// magic prefix itself no longer matches; the rejection comes from the NUL byte.
    #[test]
    fn test_binary_magic_rejected() {
        let html = String::from_utf8_lossy(b"\x1F\x8B\x08\x00gzip").to_string();
        let result = convert(&html, None);
        assert!(matches!(result, Err(ConversionError::InvalidInput(_))));
    }

    /// A magic prefix that is valid UTF-8 is rejected by the prefix check itself,
    /// with the format label included in the error message.
    #[test]
    fn test_pdf_magic_rejected_with_label() {
        let result = convert("%PDF-1.7 not html", None);
        assert!(matches!(
            result,
            Err(ConversionError::InvalidInput(ref msg)) if msg.contains("PDF data")
        ));
    }

    /// Lossily decoded UTF-16LE text is rejected.
    ///
    /// The BOM bytes become U+FFFD during lossy decoding, so this exercises the
    /// NUL-pattern heuristic rather than the BOM check.
    #[test]
    fn test_utf16_hint_rejected() {
        let html = String::from_utf8_lossy(b"\xFF\xFE<\0h\0t\0m\0l\0>\0").to_string();
        let result = convert(&html, None);
        assert!(matches!(result, Err(ConversionError::InvalidInput(_))));
    }

    /// NUL-free input dominated by control bytes is rejected by the ratio check.
    #[test]
    fn test_excess_control_bytes_rejected() {
        let result = convert("\u{1}\u{2}\u{3}a", None);
        assert!(matches!(
            result,
            Err(ConversionError::InvalidInput(ref msg)) if msg.contains("excess control bytes")
        ));
    }

    /// Markup-free text is converted and kept in the output.
    #[test]
    fn test_plain_text_allowed() {
        let result = convert("Just text", None).unwrap();
        assert!(result.contains("Just text"));
    }

    /// Escaping options apply to markup-free text as well.
    #[test]
    fn test_plain_text_escaped_when_enabled() {
        let options = ConversionOptions {
            escape_asterisks: true,
            escape_underscores: true,
            ..ConversionOptions::default()
        };
        let result = convert("Text *asterisks* _underscores_", Some(options)).unwrap();
        assert!(result.contains(r"\*asterisks\*"));
        assert!(result.contains(r"\_underscores\_"));
    }
}
html_to_markdown_rs/lib.rs

html_to_markdown_rs/
lib.rs