html_to_markdown_rs/
lib.rs

1#![allow(
2    clippy::too_many_lines,
3    clippy::option_if_let_else,
4    clippy::match_wildcard_for_single_variants,
5    clippy::needless_pass_by_value,
6    clippy::struct_excessive_bools,
7    clippy::fn_params_excessive_bools,
8    clippy::branches_sharing_code,
9    clippy::match_same_arms,
10    clippy::missing_errors_doc,
11    clippy::items_after_statements,
12    clippy::doc_markdown,
13    clippy::cast_sign_loss,
14    clippy::default_trait_access,
15    clippy::unused_self,
16    clippy::cast_precision_loss,
17    clippy::collapsible_if,
18    clippy::too_many_arguments,
19    clippy::collapsible_else_if,
20    clippy::extra_unused_lifetimes,
21    clippy::unnecessary_lazy_evaluations,
22    clippy::must_use_candidate,
23    clippy::trivially_copy_pass_by_ref,
24    clippy::explicit_iter_loop,
25    clippy::missing_const_for_fn,
26    clippy::manual_assert,
27    clippy::return_self_not_must_use,
28    clippy::collapsible_match,
29    clippy::cast_possible_truncation,
30    clippy::map_unwrap_or,
31    clippy::manual_let_else,
32    clippy::used_underscore_binding,
33    clippy::assigning_clones,
34    clippy::uninlined_format_args
35)]
36#![allow(dead_code)]
37
38//! High-performance HTML to Markdown converter.
39//!
40//! Built with html5ever for fast, memory-efficient HTML parsing.
41//!
42//! ## Optional inline image extraction
43//!
44//! Enable the `inline-images` Cargo feature to collect embedded data URI images and inline SVG
45//! assets alongside the produced Markdown.
46use std::borrow::Cow;
47
48pub mod converter;
49pub mod error;
50pub mod hocr;
51#[cfg(feature = "inline-images")]
52mod inline_images;
53#[cfg(feature = "metadata")]
54pub mod metadata;
55pub mod options;
56pub mod safety;
57pub mod text;
58#[cfg(feature = "visitor")]
59pub mod visitor;
60#[cfg(feature = "visitor")]
61pub mod visitor_helpers;
62#[cfg(feature = "async-visitor")]
63pub use visitor_helpers::AsyncVisitorHandle;
64pub mod wrapper;
65
66pub use error::{ConversionError, Result};
67#[cfg(feature = "inline-images")]
68pub use inline_images::{
69    DEFAULT_INLINE_IMAGE_LIMIT, HtmlExtraction, InlineImage, InlineImageConfig, InlineImageConfigUpdate,
70    InlineImageFormat, InlineImageSource, InlineImageWarning,
71};
72#[cfg(feature = "metadata")]
73pub use metadata::{
74    DEFAULT_MAX_STRUCTURED_DATA_SIZE, DocumentMetadata, ExtendedMetadata, HeaderMetadata, ImageMetadata, ImageType,
75    LinkMetadata, LinkType, MetadataConfig, MetadataConfigUpdate, StructuredData, StructuredDataType, TextDirection,
76};
77pub use options::{
78    CodeBlockStyle, ConversionOptions, ConversionOptionsUpdate, HeadingStyle, HighlightStyle, ListIndentType,
79    NewlineStyle, PreprocessingOptions, PreprocessingOptionsUpdate, PreprocessingPreset, WhitespaceMode,
80};
81
/// Upper bound on how many leading bytes `validate_input` samples when looking for binary content.
const BINARY_SCAN_LIMIT: usize = 8 * 1024;
/// Share of control bytes in the sample above which the input is rejected as binary.
const BINARY_CONTROL_RATIO: f64 = 0.3;
/// Minimum share of NUL bytes in the sample before BOM-less UTF-16 is considered.
const BINARY_UTF16_NULL_RATIO: f64 = 0.2;

/// Byte signatures checked against the start of the input, each paired with the
/// label that is embedded in the resulting error message.
const BINARY_MAGIC_PREFIXES: &[(&[u8], &str)] = &[
    (&[0x1F, 0x8B], "gzip-compressed data"),
    (&[0x28, 0xB5, 0x2F, 0xFD], "zstd-compressed data"),
    (&[b'P', b'K', 0x03, 0x04], "zip archive"),
    (&[b'P', b'K', 0x05, 0x06], "zip archive"),
    (&[b'P', b'K', 0x07, 0x08], "zip archive"),
    (b"%PDF-", "PDF data"),
];
94
95#[allow(clippy::cast_precision_loss)]
96fn validate_input(html: &str) -> Result<()> {
97    let bytes = html.as_bytes();
98    if bytes.is_empty() {
99        return Ok(());
100    }
101
102    if let Some(label) = detect_binary_magic(bytes) {
103        return Err(ConversionError::InvalidInput(format!(
104            "binary data detected ({label}); decode/decompress to UTF-8 HTML first"
105        )));
106    }
107
108    let sample_len = bytes.len().min(BINARY_SCAN_LIMIT);
109    let mut control_count = 0usize;
110    let mut nul_count = 0usize;
111    let mut even_nul_count = 0usize;
112    let mut odd_nul_count = 0usize;
113
114    for (idx, &byte) in bytes[..sample_len].iter().enumerate() {
115        if byte == 0 {
116            nul_count += 1;
117            if idx % 2 == 0 {
118                even_nul_count += 1;
119            } else {
120                odd_nul_count += 1;
121            }
122        }
123        let is_control = (byte < 0x09) || (0x0E..0x20).contains(&byte);
124        if is_control {
125            control_count += 1;
126        }
127    }
128
129    if nul_count > 0 {
130        if let Some(label) = detect_utf16_hint(bytes, sample_len, nul_count, even_nul_count, odd_nul_count) {
131            return Err(ConversionError::InvalidInput(format!(
132                "binary data detected ({label}); decode to UTF-8 HTML first"
133            )));
134        }
135        return Err(ConversionError::InvalidInput("binary data detected".to_string()));
136    }
137
138    let control_ratio = control_count as f64 / sample_len as f64;
139    if control_ratio > BINARY_CONTROL_RATIO {
140        return Err(ConversionError::InvalidInput(
141            "binary data detected (excess control bytes)".to_string(),
142        ));
143    }
144
145    Ok(())
146}
147
148fn detect_binary_magic(bytes: &[u8]) -> Option<&'static str> {
149    for (prefix, label) in BINARY_MAGIC_PREFIXES {
150        if bytes.starts_with(prefix) {
151            return Some(*label);
152        }
153    }
154    None
155}
156
157#[allow(clippy::cast_precision_loss)]
158fn detect_utf16_hint(
159    bytes: &[u8],
160    sample_len: usize,
161    nul_count: usize,
162    even_nul_count: usize,
163    odd_nul_count: usize,
164) -> Option<&'static str> {
165    if bytes.len() >= 2 {
166        if bytes.starts_with(b"\xFF\xFE") {
167            return Some("UTF-16LE BOM");
168        }
169        if bytes.starts_with(b"\xFE\xFF") {
170            return Some("UTF-16BE BOM");
171        }
172    }
173
174    #[allow(clippy::cast_precision_loss)]
175    let nul_ratio = nul_count as f64 / sample_len as f64;
176    if nul_ratio < BINARY_UTF16_NULL_RATIO {
177        return None;
178    }
179
180    #[allow(clippy::cast_precision_loss)]
181    let dominant_ratio = (even_nul_count.max(odd_nul_count) as f64) / nul_count as f64;
182    if dominant_ratio >= 0.9 {
183        Some("UTF-16 data without BOM")
184    } else {
185        None
186    }
187}
188
/// Convert CRLF pairs and lone CR characters to LF.
///
/// Input without any `\r` is returned borrowed; otherwise a new string is built
/// in a single pass.
fn normalize_line_endings(html: &str) -> Cow<'_, str> {
    if !html.contains('\r') {
        return Cow::Borrowed(html);
    }

    let mut normalized = String::with_capacity(html.len());
    let mut rest = html;
    while let Some(cr_at) = rest.find('\r') {
        normalized.push_str(&rest[..cr_at]);
        normalized.push('\n');
        rest = &rest[cr_at + 1..];
        // A CRLF pair collapses into the single LF that was just pushed.
        if let Some(after_lf) = rest.strip_prefix('\n') {
            rest = after_lf;
        }
    }
    normalized.push_str(rest);
    Cow::Owned(normalized)
}
196
197fn fast_text_only(html: &str, options: &ConversionOptions) -> Option<String> {
198    if html.contains('<') {
199        return None;
200    }
201
202    let mut decoded = text::decode_html_entities_cow(html);
203    if options.strip_newlines && (decoded.contains('\n') || decoded.contains('\r')) {
204        decoded = Cow::Owned(decoded.replace(&['\r', '\n'][..], " "));
205    }
206    let trimmed = decoded.trim_end_matches('\n');
207    if trimmed.is_empty() {
208        return Some(String::new());
209    }
210
211    let normalized = if options.whitespace_mode == WhitespaceMode::Normalized {
212        text::normalize_whitespace_cow(trimmed)
213    } else {
214        Cow::Borrowed(trimmed)
215    };
216
217    let escaped =
218        if options.escape_misc || options.escape_asterisks || options.escape_underscores || options.escape_ascii {
219            text::escape(
220                normalized.as_ref(),
221                options.escape_misc,
222                options.escape_asterisks,
223                options.escape_underscores,
224                options.escape_ascii,
225            )
226        } else {
227            normalized.into_owned()
228        };
229
230    let mut output = String::with_capacity(escaped.len() + 1);
231    output.push_str(&escaped);
232    while output.ends_with(' ') || output.ends_with('\t') {
233        output.pop();
234    }
235    output.push('\n');
236    Some(output)
237}
238
/// Deserialize `json` into `T`, reporting any serde error as
/// `ConversionError::ConfigError` carrying the error's message.
#[cfg(any(feature = "serde", feature = "metadata"))]
fn parse_json<T: serde::de::DeserializeOwned>(json: &str) -> Result<T> {
    match serde_json::from_str(json) {
        Ok(value) => Ok(value),
        Err(err) => Err(ConversionError::ConfigError(err.to_string())),
    }
}
243
/// Build `ConversionOptions` from a JSON string.
///
/// The JSON is first deserialized as a `ConversionOptionsUpdate` and then
/// converted into `ConversionOptions` via `From`, so the JSON may describe a
/// complete or a partial options object.
///
/// # Arguments
///
/// * `json` - JSON string representing conversion options
///
/// # Returns
///
/// The `ConversionOptions` produced from the parsed update.
///
/// # Errors
///
/// Returns `ConversionError::ConfigError` if the JSON cannot be deserialized.
#[cfg(any(feature = "serde", feature = "metadata"))]
pub fn conversion_options_from_json(json: &str) -> Result<ConversionOptions> {
    parse_json::<ConversionOptionsUpdate>(json).map(ConversionOptions::from)
}
265
/// Parse a JSON string into a `ConversionOptionsUpdate`.
///
/// Unlike [`conversion_options_from_json`], the parsed update is returned as-is
/// rather than being converted into full `ConversionOptions`.
///
/// # Arguments
///
/// * `json` - JSON string representing partial conversion options
///
/// # Returns
///
/// The deserialized `ConversionOptionsUpdate`.
///
/// # Errors
///
/// Returns `ConversionError::ConfigError` if the JSON cannot be deserialized.
#[cfg(any(feature = "serde", feature = "metadata"))]
pub fn conversion_options_update_from_json(json: &str) -> Result<ConversionOptionsUpdate> {
    let update: ConversionOptionsUpdate = parse_json(json)?;
    Ok(update)
}
286
/// Build an `InlineImageConfig` from a JSON string (requires the `inline-images` feature).
///
/// The JSON is deserialized as an `InlineImageConfigUpdate` and passed to
/// `InlineImageConfig::from_update`, so it may describe a complete or a partial
/// configuration object.
///
/// # Arguments
///
/// * `json` - JSON string representing inline image configuration
///
/// # Returns
///
/// The `InlineImageConfig` produced from the parsed update.
///
/// # Errors
///
/// Returns `ConversionError::ConfigError` if the JSON cannot be deserialized.
#[cfg(all(feature = "inline-images", any(feature = "serde", feature = "metadata")))]
pub fn inline_image_config_from_json(json: &str) -> Result<InlineImageConfig> {
    parse_json::<InlineImageConfigUpdate>(json).map(InlineImageConfig::from_update)
}
308
/// Build a `MetadataConfig` from a JSON string (requires the `metadata` feature).
///
/// The JSON is deserialized as a `MetadataConfigUpdate` and converted into a
/// `MetadataConfig` via `From`, so it may describe a complete or a partial
/// configuration object.
///
/// # Arguments
///
/// * `json` - JSON string representing metadata extraction configuration
///
/// # Returns
///
/// The `MetadataConfig` produced from the parsed update.
///
/// # Errors
///
/// Returns `ConversionError::ConfigError` if the JSON cannot be deserialized.
// `all(metadata, any(serde, metadata))` is satisfied exactly when `metadata` is enabled.
#[cfg(feature = "metadata")]
pub fn metadata_config_from_json(json: &str) -> Result<MetadataConfig> {
    parse_json::<MetadataConfigUpdate>(json).map(MetadataConfig::from)
}
330
331/// Convert HTML to Markdown.
332///
333/// # Arguments
334///
335/// * `html` - The HTML string to convert
336/// * `options` - Optional conversion options (defaults to `ConversionOptions::default()`)
337///
338/// # Example
339///
340/// ```
341/// use html_to_markdown_rs::{convert, ConversionOptions};
342///
343/// let html = "<h1>Hello World</h1>";
344/// let markdown = convert(html, None).unwrap();
345/// assert!(markdown.contains("Hello World"));
346/// ```
347/// # Errors
348///
349/// Returns an error if HTML parsing fails or if the input contains invalid UTF-8.
350pub fn convert(html: &str, options: Option<ConversionOptions>) -> Result<String> {
351    validate_input(html)?;
352    let options = options.unwrap_or_default();
353
354    let normalized_html = normalize_line_endings(html);
355
356    if !options.wrap {
357        if let Some(markdown) = fast_text_only(normalized_html.as_ref(), &options) {
358            return Ok(markdown);
359        }
360    }
361
362    let markdown = converter::convert_html(normalized_html.as_ref(), &options)?;
363
364    if options.wrap {
365        Ok(wrapper::wrap_markdown(&markdown, &options))
366    } else {
367        Ok(markdown)
368    }
369}
370
/// Convert HTML to Markdown while collecting inline image assets (requires the `inline-images` feature).
///
/// Extracts inline image data URIs and inline `<svg>` elements alongside Markdown conversion.
///
/// # Arguments
///
/// * `html` - The HTML string to convert
/// * `options` - Optional conversion options (defaults to `ConversionOptions::default()`)
/// * `image_cfg` - Configuration controlling inline image extraction
/// * `visitor` - Optional visitor for customizing conversion behavior. Only used if `visitor` feature is enabled.
///
/// # Returns
///
/// An `HtmlExtraction` holding the Markdown (wrapped when `options.wrap` is set),
/// the collected inline images, and any warnings raised by the collector.
///
/// # Errors
///
/// Returns `ConversionError::InvalidInput` when the input looks like binary data,
/// propagates errors from creating the image collector and from the converter,
/// and returns `ConversionError::Other` if the collector is still shared after
/// conversion and its state cannot be recovered.
#[cfg(feature = "inline-images")]
pub fn convert_with_inline_images(
    html: &str,
    options: Option<ConversionOptions>,
    image_cfg: InlineImageConfig,
    #[cfg(feature = "visitor")] visitor: Option<visitor::VisitorHandle>,
    #[cfg(not(feature = "visitor"))] _visitor: Option<()>,
) -> Result<HtmlExtraction> {
    use std::cell::RefCell;
    use std::rc::Rc;

    validate_input(html)?;
    let options = options.unwrap_or_default();
    let normalized_html = normalize_line_endings(html);

    // Without the `visitor` feature the converter is always called with no visitor.
    #[cfg(not(feature = "visitor"))]
    let visitor = None;

    let shared_collector = Rc::new(RefCell::new(inline_images::InlineImageCollector::new(image_cfg)?));

    let raw_markdown = converter::convert_html_impl(
        normalized_html.as_ref(),
        &options,
        Some(Rc::clone(&shared_collector)),
        None,
        visitor,
    )?;

    let markdown = if options.wrap {
        wrapper::wrap_markdown(&raw_markdown, &options)
    } else {
        raw_markdown
    };

    // The converter must have released its clone for the collector to be unwrapped.
    let collector = match Rc::try_unwrap(shared_collector) {
        Ok(cell) => cell.into_inner(),
        Err(_) => {
            return Err(ConversionError::Other(
                "failed to recover inline image state".to_string(),
            ));
        }
    };
    let (inline_images, warnings) = collector.finish();

    Ok(HtmlExtraction {
        markdown,
        inline_images,
        warnings,
    })
}
436
/// Convert HTML to Markdown with comprehensive metadata extraction (requires the `metadata` feature).
///
/// Performs HTML-to-Markdown conversion while simultaneously extracting structured metadata in a
/// single pass for maximum efficiency. Ideal for content analysis, SEO optimization, and document
/// indexing workflows.
///
/// # Arguments
///
/// * `html` - The HTML string to convert. Line endings are normalized first (CRLF and lone CR → LF).
/// * `options` - Optional conversion configuration. Defaults to `ConversionOptions::default()` if `None`.
///   Controls heading style, list indentation, escape behavior, wrapping, and other output formatting.
/// * `metadata_cfg` - Configuration for metadata extraction granularity. Use `MetadataConfig::default()`
///   to extract all metadata types, or customize with selective extraction flags.
/// * `visitor` - Optional visitor for customizing conversion behavior. Only used if `visitor` feature is enabled.
///
/// # Returns
///
/// On success, returns a tuple of:
/// - `String`: The converted Markdown output
/// - `ExtendedMetadata`: Comprehensive metadata containing:
///   - `document`: Title, description, author, language, Open Graph, Twitter Card, and other meta tags
///   - `headers`: All heading elements (h1-h6) with hierarchy and IDs
///   - `links`: Hyperlinks classified as anchor, internal, external, email, or phone
///   - `images`: Image elements with source, dimensions, and alt text
///   - `structured_data`: JSON-LD, Microdata, and `RDFa` blocks
///
/// When `metadata_cfg.any_enabled()` is `false`, no collector is created and
/// `ExtendedMetadata::default()` is returned alongside the Markdown.
///
/// # Errors
///
/// Returns `ConversionError` if:
/// - The input looks like binary data (known binary signature, NUL bytes, or too many
///   control bytes); reported as `ConversionError::InvalidInput`
/// - HTML parsing fails
/// - Internal panic during conversion (wrapped in `ConversionError::Panic`)
/// - Configuration size limits exceeded
/// - The metadata collector is still shared after conversion and its state cannot be
///   recovered; reported as `ConversionError::Other`
///
/// # Performance Notes
///
/// - Single-pass collection: metadata extraction has minimal overhead
/// - Zero cost when metadata feature is disabled
/// - Pre-allocated buffers: typically handles 50+ headers, 100+ links, 20+ images efficiently
/// - Structured data size-limited to prevent memory exhaustion (configurable)
///
/// # Example: Basic Usage
///
/// ```ignore
/// use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
///
/// let html = r#"
///   <html lang="en">
///     <head><title>My Article</title></head>
///     <body>
///       <h1 id="intro">Introduction</h1>
///       <p>Welcome to <a href="https://example.com">our site</a></p>
///     </body>
///   </html>
/// "#;
///
/// let (markdown, metadata) = convert_with_metadata(html, None, MetadataConfig::default(), None)?;
///
/// assert_eq!(metadata.document.title, Some("My Article".to_string()));
/// assert_eq!(metadata.document.language, Some("en".to_string()));
/// assert_eq!(metadata.headers[0].text, "Introduction");
/// assert_eq!(metadata.headers[0].id, Some("intro".to_string()));
/// assert_eq!(metadata.links.len(), 1);
/// # Ok::<(), html_to_markdown_rs::ConversionError>(())
/// ```
///
/// # Example: Selective Metadata Extraction
///
/// ```ignore
/// use html_to_markdown_rs::{convert_with_metadata, MetadataConfig};
///
/// let html = "<html><body><h1>Title</h1><a href='#anchor'>Link</a></body></html>";
///
/// // Extract only headers and document metadata, skip links/images
/// let config = MetadataConfig {
///     extract_document: true,
///     extract_headers: true,
///     extract_links: false,
///     extract_images: false,
///     extract_structured_data: false,
///     max_structured_data_size: 0,
/// };
///
/// let (markdown, metadata) = convert_with_metadata(html, None, config, None)?;
/// assert!(metadata.headers.len() > 0);
/// assert!(metadata.links.is_empty());  // Not extracted
/// # Ok::<(), html_to_markdown_rs::ConversionError>(())
/// ```
///
/// # Example: With Conversion Options and Metadata Config
///
/// ```ignore
/// use html_to_markdown_rs::{convert_with_metadata, ConversionOptions, MetadataConfig, HeadingStyle};
///
/// let html = "<html><head><title>Blog Post</title></head><body><h1>Hello</h1></body></html>";
///
/// let options = ConversionOptions {
///     heading_style: HeadingStyle::Atx,
///     wrap: true,
///     wrap_width: 80,
///     ..Default::default()
/// };
///
/// let metadata_cfg = MetadataConfig::default();
///
/// let (markdown, metadata) = convert_with_metadata(html, Some(options), metadata_cfg, None)?;
/// // Markdown will use ATX-style headings (# H1, ## H2, etc.)
/// // Wrapped at 80 characters
/// // All metadata extracted
/// # Ok::<(), html_to_markdown_rs::ConversionError>(())
/// ```
///
/// # See Also
///
/// - [`convert`] - Simple HTML to Markdown conversion without metadata
/// - [`convert_with_inline_images`] - Conversion with inline image extraction
/// - [`MetadataConfig`] - Configuration for metadata extraction
/// - [`ExtendedMetadata`] - Metadata structure documentation
/// - [`metadata`] module - Detailed type documentation for metadata components
#[cfg(feature = "metadata")]
pub fn convert_with_metadata(
    html: &str,
    options: Option<ConversionOptions>,
    metadata_cfg: MetadataConfig,
    #[cfg(feature = "visitor")] visitor: Option<visitor::VisitorHandle>,
    #[cfg(not(feature = "visitor"))] _visitor: Option<()>,
) -> Result<(String, ExtendedMetadata)> {
    use std::cell::RefCell;
    use std::rc::Rc;

    validate_input(html)?;
    let options = options.unwrap_or_default();
    let normalized_html = normalize_line_endings(html);

    // Without the `visitor` feature the converter is always called with no visitor.
    #[cfg(not(feature = "visitor"))]
    let visitor = None;

    // A collector is only created when at least one metadata category is requested;
    // otherwise the converter runs without one and default metadata is returned.
    let collector = if metadata_cfg.any_enabled() {
        Some(Rc::new(RefCell::new(metadata::MetadataCollector::new(metadata_cfg))))
    } else {
        None
    };

    let markdown = converter::convert_html_impl(
        normalized_html.as_ref(),
        &options,
        None,
        collector.as_ref().map(Rc::clone),
        visitor,
    )?;

    let markdown = if options.wrap {
        wrapper::wrap_markdown(&markdown, &options)
    } else {
        markdown
    };

    let metadata = match collector {
        Some(shared) => Rc::try_unwrap(shared)
            .map_err(|_| ConversionError::Other("failed to recover metadata state".to_string()))?
            .into_inner()
            .finish(),
        None => ExtendedMetadata::default(),
    };

    Ok((markdown, metadata))
}
616
/// Convert HTML to Markdown with a custom visitor callback.
///
/// This function allows you to provide a visitor implementation that can inspect,
/// modify, or replace the default conversion behavior for any HTML element type.
///
/// # Arguments
///
/// * `html` - The HTML input to convert
/// * `options` - Optional conversion options (uses defaults if None)
/// * `visitor` - Optional `visitor::VisitorHandle` wrapping the visitor implementation;
///   it is forwarded to the converter unchanged
///
/// # Example
///
/// ```ignore
/// use html_to_markdown_rs::convert_with_visitor;
/// use html_to_markdown_rs::visitor::{HtmlVisitor, NodeContext, VisitResult};
/// use std::cell::RefCell;
/// use std::rc::Rc;
///
/// #[derive(Debug)]
/// struct CustomVisitor;
///
/// impl HtmlVisitor for CustomVisitor {
///     fn visit_code_block(
///         &mut self,
///         _ctx: &NodeContext,
///         code: &str,
///         language: Option<&str>,
///     ) -> VisitResult {
///         VisitResult::Custom(format!("```{}\n{}\n```", language.unwrap_or(""), code))
///     }
/// }
///
/// let html = "<pre><code class=\"language-rust\">fn main() {}</code></pre>";
/// // NOTE(review): assumes `VisitorHandle` is an `Rc<RefCell<dyn HtmlVisitor>>`,
/// // mirroring the async visitor example — confirm against the `visitor` module.
/// let visitor = Some(Rc::new(RefCell::new(CustomVisitor) as _));
/// let markdown = convert_with_visitor(html, None, visitor).unwrap();
/// ```
///
/// # Errors
///
/// Returns `ConversionError::InvalidInput` when the input looks like binary data
/// (known binary signature, NUL bytes, or too many control bytes), and propagates
/// any error reported by the converter.
#[cfg(feature = "visitor")]
pub fn convert_with_visitor(
    html: &str,
    options: Option<ConversionOptions>,
    visitor: Option<visitor::VisitorHandle>,
) -> Result<String> {
    validate_input(html)?;
    let options = options.unwrap_or_default();

    let normalized_html = normalize_line_endings(html);

    let markdown = converter::convert_html_with_visitor(normalized_html.as_ref(), &options, visitor)?;

    if options.wrap {
        Ok(wrapper::wrap_markdown(&markdown, &options))
    } else {
        Ok(markdown)
    }
}
674
/// Convert HTML to Markdown with an async visitor callback.
///
/// This async entry point documents the interface through which an async visitor can
/// inspect, modify, or replace the default conversion behavior for any HTML element type.
///
/// It is intended for hosts with native async support:
/// - Python async functions (with `async def` and `asyncio`)
/// - TypeScript/JavaScript async functions (with `Promise`-based callbacks)
/// - Elixir processes (with message-passing async operations)
///
/// For synchronous languages (Ruby, PHP, Go, Java, C#), use `convert_with_visitor` instead.
///
/// # Note
///
/// The async visitor trait (`AsyncHtmlVisitor`) and async dispatch helpers are designed to be
/// consumed by language bindings (`PyO3`, NAPI-RS, Magnus, etc.) which can bridge async/await
/// semantics from their host languages. The conversion pipeline itself runs synchronously,
/// but visitor callbacks are defined as async to support languages with native async/await.
///
/// Binding implementations will be responsible for running async callbacks on appropriate
/// event loops (asyncio for Python, Promise chains for TypeScript, etc.).
///
/// # Arguments
///
/// * `html` - The HTML input to convert
/// * `options` - Optional conversion options (uses defaults if None)
/// * `visitor` - Optional async visitor implementing `AsyncHtmlVisitor` trait for customization
///
/// # Example (Rust-like async)
///
/// ```ignore
/// use html_to_markdown_rs::convert_with_async_visitor;
/// use html_to_markdown_rs::visitor::{AsyncHtmlVisitor, NodeContext, VisitResult};
/// use async_trait::async_trait;
/// use std::rc::Rc;
/// use std::cell::RefCell;
///
/// #[derive(Debug)]
/// struct CustomAsyncVisitor;
///
/// #[async_trait]
/// impl AsyncHtmlVisitor for CustomAsyncVisitor {
///     async fn visit_code_block(
///         &mut self,
///         _ctx: &NodeContext,
///         code: &str,
///         language: Option<&str>,
///     ) -> VisitResult {
///         // Can perform async operations here (e.g., syntax highlighting via service)
///         VisitResult::Custom(format!("```{}\n{}\n```", language.unwrap_or(""), code))
///     }
/// }
///
/// let html = "<pre><code class=\"language-rust\">fn main() {}</code></pre>";
/// let visitor = Some(Rc::new(RefCell::new(CustomAsyncVisitor) as _));
/// let markdown = convert_with_async_visitor(html, None, visitor).await.unwrap();
/// ```
///
/// # Implementation Note
///
/// This function is currently a placeholder: the visitor argument is accepted but not
/// used, and the input goes through the regular synchronous converter. The actual async
/// dispatch integration will be implemented in binding layers (Python, TypeScript, etc.)
/// to properly bridge async/await semantics to the synchronous Rust conversion core.
///
/// # Errors
///
/// Returns `ConversionError::InvalidInput` when the input looks like binary data, and
/// propagates any error reported by the converter.
#[cfg(feature = "async-visitor")]
#[allow(clippy::future_not_send, clippy::unused_async)]
pub async fn convert_with_async_visitor(
    html: &str,
    options: Option<ConversionOptions>,
    _visitor: Option<visitor_helpers::AsyncVisitorHandle>,
) -> Result<String> {
    validate_input(html)?;
    let options = options.unwrap_or_default();
    let normalized_html = normalize_line_endings(html);

    // TODO: Implement async dispatch in conversion pipeline
    let unwrapped = converter::convert_html(normalized_html.as_ref(), &options)?;

    let markdown = if options.wrap {
        wrapper::wrap_markdown(&unwrapped, &options)
    } else {
        unwrapped
    };
    Ok(markdown)
}
764
#[cfg(all(test, feature = "metadata"))]
mod tests {
    use super::*;

    /// Run `convert_with_metadata` with default options and no visitor, keeping only the metadata.
    fn metadata_for(html: &str, config: MetadataConfig) -> ExtendedMetadata {
        let (_markdown, metadata) = convert_with_metadata(html, None, config, None).expect("conversion should succeed");
        metadata
    }

    #[test]
    fn test_convert_with_metadata_full_workflow() {
        let html = "<html lang=\"en\" dir=\"ltr\"><head><title>Test Article</title></head><body><h1 id=\"main-title\">Main Title</h1><p>This is a paragraph with a <a href=\"https://example.com\">link</a>.</p><h2>Subsection</h2><p>Another paragraph with <a href=\"#main-title\">internal link</a>.</p><img src=\"https://example.com/image.jpg\" alt=\"Test image\" title=\"Image title\"></body></html>";

        let everything = MetadataConfig {
            extract_document: true,
            extract_headers: true,
            extract_links: true,
            extract_images: true,
            extract_structured_data: true,
            max_structured_data_size: metadata::DEFAULT_MAX_STRUCTURED_DATA_SIZE,
        };

        let (markdown, metadata) =
            convert_with_metadata(html, None, everything, None).expect("conversion should succeed");

        // Markdown output
        assert!(!markdown.is_empty());
        assert!(markdown.contains("Main Title"));
        assert!(markdown.contains("Subsection"));

        // Document-level metadata
        assert_eq!(metadata.document.language.as_deref(), Some("en"));

        // Headings
        assert_eq!(metadata.headers.len(), 2);
        let top_heading = &metadata.headers[0];
        assert_eq!(top_heading.level, 1);
        assert_eq!(top_heading.text, "Main Title");
        assert_eq!(top_heading.id.as_deref(), Some("main-title"));
        let sub_heading = &metadata.headers[1];
        assert_eq!(sub_heading.level, 2);
        assert_eq!(sub_heading.text, "Subsection");

        // Links
        assert!(metadata.links.len() >= 2);
        assert!(metadata.links.iter().any(|link| link.link_type == LinkType::External));
        assert!(metadata.links.iter().any(|link| link.link_type == LinkType::Anchor));

        // Images
        assert_eq!(metadata.images.len(), 1);
        let image = &metadata.images[0];
        assert_eq!(image.alt.as_deref(), Some("Test image"));
        assert_eq!(image.title.as_deref(), Some("Image title"));
        assert_eq!(image.image_type, ImageType::External);
    }

    #[test]
    fn test_convert_with_metadata_document_fields() {
        let html = "<html lang=\"en\"><head><title>Test Article</title><meta name=\"description\" content=\"Desc\"><meta name=\"author\" content=\"Author\"><meta property=\"og:title\" content=\"OG Title\"><meta property=\"og:description\" content=\"OG Desc\"></head><body><h1>Heading</h1></body></html>";

        let metadata = metadata_for(html, MetadataConfig::default());
        let document = &metadata.document;

        assert_eq!(
            document.title.as_deref(),
            Some("Test Article"),
            "document: {:?}",
            metadata.document
        );
        assert_eq!(document.description.as_deref(), Some("Desc"));
        assert_eq!(document.author.as_deref(), Some("Author"));
        assert_eq!(document.language.as_deref(), Some("en"));
        assert_eq!(document.open_graph.get("title").map(String::as_str), Some("OG Title"));
        assert_eq!(
            document.open_graph.get("description").map(String::as_str),
            Some("OG Desc")
        );
    }

    #[test]
    fn test_convert_with_metadata_empty_config() {
        let html = "<html lang=\"en\"><head><title>Test</title></head><body><h1>Title</h1><a href=\"#\">Link</a></body></html>";

        let nothing = MetadataConfig {
            extract_document: false,
            extract_headers: false,
            extract_links: false,
            extract_images: false,
            extract_structured_data: false,
            max_structured_data_size: 0,
        };

        let metadata = metadata_for(html, nothing);

        assert!(metadata.headers.is_empty());
        assert!(metadata.links.is_empty());
        assert!(metadata.images.is_empty());
        assert!(metadata.document.language.is_none());
    }

    #[test]
    fn test_convert_with_metadata_data_uri_image() {
        let html = "<html><body><img src=\"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"Pixel\"></body></html>";

        let metadata = metadata_for(html, MetadataConfig::default());

        assert_eq!(metadata.images.len(), 1);
        let pixel = &metadata.images[0];
        assert_eq!(pixel.image_type, ImageType::DataUri);
        assert_eq!(pixel.alt.as_deref(), Some("Pixel"));
    }

    #[test]
    fn test_convert_with_metadata_relative_paths() {
        let html = r#"<html><body><a href="/page">Internal</a><a href="../other">Relative</a></body></html>"#;

        let metadata = metadata_for(html, MetadataConfig::default());

        let internal_count = metadata
            .links
            .iter()
            .filter(|link| link.link_type == LinkType::Internal)
            .count();
        assert_eq!(internal_count, 2);
    }
}
882
#[cfg(test)]
mod basic_tests {
    use super::*;

    /// True when `convert` refuses `input` with `ConversionError::InvalidInput`.
    fn rejected_as_invalid_input(input: &str) -> bool {
        matches!(convert(input, None), Err(ConversionError::InvalidInput(_)))
    }

    #[test]
    fn test_binary_input_rejected() {
        assert!(rejected_as_invalid_input("PDF\0DATA"));
    }

    #[test]
    fn test_binary_magic_rejected() {
        let lossy = String::from_utf8_lossy(b"\x1F\x8B\x08\x00gzip").into_owned();
        assert!(rejected_as_invalid_input(&lossy));
    }

    #[test]
    fn test_utf16_hint_rejected() {
        let lossy = String::from_utf8_lossy(b"\xFF\xFE<\0h\0t\0m\0l\0>\0").into_owned();
        assert!(rejected_as_invalid_input(&lossy));
    }

    #[test]
    fn test_plain_text_allowed() {
        let markdown = convert("Just text", None).unwrap();
        assert!(markdown.contains("Just text"));
    }

    #[test]
    fn test_plain_text_escaped_when_enabled() {
        let escaping = ConversionOptions {
            escape_asterisks: true,
            escape_underscores: true,
            ..Default::default()
        };
        let markdown = convert("Text *asterisks* _underscores_", Some(escaping)).unwrap();
        for expected in &[r"\*asterisks\*", r"\_underscores\_"] {
            assert!(markdown.contains(*expected));
        }
    }
}
html_to_markdown_rs/lib.rs

html_to_markdown_rs/
lib.rs