fast_h2m 0.4.2

High-performance HTML to Markdown converter
Documentation
#![allow(clippy::too_many_arguments, clippy::trivially_copy_pass_by_ref)]
#![cfg_attr(all(feature = "simd", nightly), feature(portable_simd))]

//! High-performance HTML to Markdown converter.
//!
//! Built with html5ever for fast, memory-efficient HTML parsing.
//!
//! ## Optional inline image extraction
//!
//! Enable the `inline-images` Cargo feature to collect embedded data URI images and inline SVG
//! assets alongside the produced Markdown.

// ============================================================================
// Module Declarations
// ============================================================================

pub mod error;
#[cfg(feature = "metadata")]
pub mod metadata;
pub mod options;
pub mod types;
#[cfg(feature = "visitor")]
pub mod visitor;

// Internal modules (not part of public API)
mod convert_api;
#[allow(dead_code)]
pub(crate) mod converter;
mod exports;
mod mdream_adapter;
pub(crate) mod tl_types;

// Re-export internal test/benchmark modules when compiling tests or when the
// `testkit` feature is active. This lets integration tests and the bench harness
// access `prescan` and `tier1` without making them part of the stable public API.
//
// The modules are exposed twice: through the `testkit` module
// (`crate::testkit::prescan`) and through the crate-root re-exports that follow
// it (`crate::prescan`), in addition to the original `crate::converter::prescan` path.
/// Internal `prescan` and `tier1` converter modules, surfaced for integration
/// tests and the bench harness.
///
/// Only compiled for test builds or when the `testkit` feature is enabled.
#[cfg(any(test, feature = "testkit"))]
#[allow(unused_imports)]
pub mod testkit {
    pub use crate::converter::{prescan, tier1};
}
#[cfg(any(test, feature = "testkit"))]
pub use converter::prescan;
#[cfg(any(test, feature = "testkit"))]
pub use converter::tier1;
#[cfg(feature = "inline-images")]
mod inline_images;
pub(crate) mod prelude;
mod rcdom;
#[cfg(all(feature = "simd", nightly))]
mod simd_scan;
pub(crate) mod text;
mod validation;
#[cfg(feature = "visitor")]
pub(crate) mod visitor_helpers;
pub(crate) mod wrapper;

// ============================================================================
// Public Re-exports (from exports module)
// ============================================================================

pub use exports::*;
pub use types::{
    AnnotationKind, ConversionResult, DocumentNode, DocumentStructure, GridCell, NodeContent,
    ProcessingWarning, TableData, TableGrid, TextAnnotation, WarningKind,
};
#[cfg(feature = "visitor")]
pub use visitor::{NodeContext, NodeType, VisitResult};

// ============================================================================
// Main Public API Functions
// ============================================================================

pub use convert_api::convert;

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod basic_tests {
    use super::*;

    /// Input containing a run of 20 NUL bytes must be rejected as invalid input.
    #[test]
    fn test_binary_input_rejected() {
        let nul_run = "\0".repeat(20);
        let html = ["abc", nul_run.as_str(), "def"].concat();
        let outcome = convert(&html, None);
        assert!(matches!(outcome, Err(ConversionError::InvalidInput(_))));
    }

    /// Input that starts with a PDF header must be rejected as invalid input.
    #[test]
    fn test_binary_magic_rejected() {
        let outcome = convert("%PDF-1.7", None);
        assert!(matches!(outcome, Err(ConversionError::InvalidInput(_))));
    }

    /// Bytes shaped like UTF-16LE (BOM followed by NUL-interleaved ASCII), once
    /// lossily decoded to a `String`, must still convert successfully.
    #[test]
    fn test_utf16_hint_recovered() {
        let raw: &[u8] = b"\xFF\xFE<\0h\0t\0m\0l\0>\0";
        let html = String::from_utf8_lossy(raw).into_owned();
        let outcome = convert(&html, None);
        assert!(
            outcome.is_ok(),
            "UTF-16 input should be recovered instead of rejected"
        );
    }

    /// Markup-free text is accepted and shows up in the converted content.
    #[test]
    fn test_plain_text_allowed() {
        let converted = convert("Just text", None).unwrap();
        let markdown = converted.content.unwrap_or_default();
        assert!(markdown.contains("Just text"));
    }

    /// With asterisk and underscore escaping switched on, those characters are
    /// backslash-escaped in plain-text input.
    #[test]
    fn test_plain_text_escaped_when_enabled() {
        let escaping = ConversionOptions {
            escape_asterisks: true,
            escape_underscores: true,
            ..Default::default()
        };
        let markdown = convert("Text *asterisks* _underscores_", Some(escaping))
            .unwrap()
            .content
            .unwrap_or_default();
        for expected in [r"\*asterisks\*", r"\_underscores\_"] {
            assert!(markdown.contains(expected));
        }
    }
}