fast_h2m 0.1.0

High-performance HTML to Markdown converter using the rustedbytes-tl parser. Part of the Kreuzberg ecosystem.
Documentation
// Clippy lint suppressions. Because this is an inner attribute (`#![...]`), it
// applies to this file and to everything declared inside it, rather than to a
// single item. Each entry names one Clippy lint that is allowed (not reported).
// NOTE(review): the reason each lint is allowed is not recorded here — consider
// annotating entries that are suppressed for a specific, non-obvious reason.
#![allow(
    clippy::too_many_lines,
    clippy::option_if_let_else,
    clippy::match_wildcard_for_single_variants,
    clippy::needless_pass_by_value,
    clippy::struct_excessive_bools,
    clippy::fn_params_excessive_bools,
    clippy::branches_sharing_code,
    clippy::match_same_arms,
    clippy::missing_errors_doc,
    clippy::items_after_statements,
    clippy::doc_markdown,
    clippy::cast_sign_loss,
    clippy::default_trait_access,
    clippy::unused_self,
    clippy::cast_precision_loss,
    clippy::collapsible_if,
    clippy::too_many_arguments,
    clippy::collapsible_else_if,
    clippy::extra_unused_lifetimes,
    clippy::unnecessary_lazy_evaluations,
    clippy::must_use_candidate,
    clippy::trivially_copy_pass_by_ref,
    clippy::explicit_iter_loop,
    clippy::missing_const_for_fn,
    clippy::manual_assert,
    clippy::return_self_not_must_use,
    clippy::collapsible_match,
    clippy::cast_possible_truncation,
    clippy::map_unwrap_or,
    clippy::manual_let_else,
    clippy::used_underscore_binding,
    clippy::assigning_clones,
    clippy::uninlined_format_args
)]

//! High-performance HTML to Markdown converter.
//!
//! Built with html5ever for fast, memory-efficient HTML parsing.
//!
//! ## Optional inline image extraction
//!
//! Enable the `inline-images` Cargo feature to collect embedded data URI images and inline SVG
//! assets alongside the produced Markdown.

// ============================================================================
// Module Declarations
// ============================================================================

pub mod error;
#[cfg(feature = "metadata")]
pub mod metadata;
pub mod options;
pub mod types;
#[cfg(feature = "visitor")]
pub mod visitor;

// Internal modules (not part of public API)
mod convert_api;
#[allow(dead_code)]
pub(crate) mod converter;
mod exports;
pub(crate) mod tl_types;

// Re-export internal test/benchmark modules when building tests or when the
// `testkit` feature is active. This lets integration tests and the bench harness
// access `prescan` and `tier1` without making them part of the stable public API.
//
// The `testkit` module groups them under one path (`crate::testkit::prescan`), and
// the re-exports below it additionally expose the short path (`crate::prescan`)
// alongside the original in-crate path (`crate::converter::prescan`).
/// Grouped access to the internal `prescan` and `tier1` converter modules, intended
/// for integration tests and the bench harness.
///
/// Only compiled for test builds or when the `testkit` feature is enabled.
#[cfg(any(test, feature = "testkit"))]
#[allow(unused_imports)]
pub mod testkit {
    pub use crate::converter::{prescan, tier1};
}
#[cfg(any(test, feature = "testkit"))]
pub use converter::prescan;
#[cfg(any(test, feature = "testkit"))]
pub use converter::tier1;
#[cfg(feature = "inline-images")]
mod inline_images;
pub(crate) mod prelude;
mod rcdom;
pub(crate) mod text;
mod validation;
#[cfg(feature = "visitor")]
#[allow(clippy::ref_option)]
pub(crate) mod visitor_helpers;
pub(crate) mod wrapper;

// ============================================================================
// Public Re-exports (from exports module)
// ============================================================================

pub use exports::*;
pub use types::{
    AnnotationKind, ConversionResult, DocumentNode, DocumentStructure, GridCell, NodeContent,
    ProcessingWarning, TableData, TableGrid, TextAnnotation, WarningKind,
};
#[cfg(feature = "visitor")]
pub use visitor::{NodeContext, NodeType, VisitResult};

// ============================================================================
// Main Public API Functions
// ============================================================================

pub use convert_api::convert;

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod basic_tests {
    use super::*;

    /// Text interrupted by a run of twenty NUL bytes must be reported as invalid input.
    #[test]
    fn test_binary_input_rejected() {
        let nul_run = "\0".repeat(20);
        let payload = format!("abc{}def", nul_run);

        let outcome = convert(&payload, None);

        assert!(matches!(outcome, Err(ConversionError::InvalidInput(_))));
    }

    /// Input that begins with a PDF-style header (`%PDF-1.7`) must be reported as
    /// invalid input.
    #[test]
    fn test_binary_magic_rejected() {
        let outcome = convert("%PDF-1.7", None);

        assert!(matches!(outcome, Err(ConversionError::InvalidInput(_))));
    }

    /// Bytes shaped like UTF-16LE (`FF FE` followed by NUL-interleaved `<html>`) must
    /// still convert successfully.
    #[test]
    fn test_utf16_hint_recovered() {
        // The two leading bytes are not valid UTF-8, so the lossy conversion replaces
        // each of them with U+FFFD before the string reaches `convert`.
        let raw: &[u8] = b"\xFF\xFE<\0h\0t\0m\0l\0>\0";
        let payload = String::from_utf8_lossy(raw).into_owned();

        let outcome = convert(&payload, None);

        assert!(
            outcome.is_ok(),
            "UTF-16 input should be recovered instead of rejected"
        );
    }

    /// Markup-free text is accepted and appears in the produced content.
    #[test]
    fn test_plain_text_allowed() {
        let converted = convert("Just text", None).unwrap();
        let markdown = converted.content.unwrap_or_default();

        assert!(markdown.contains("Just text"));
    }

    /// With asterisk and underscore escaping switched on, plain text has those
    /// characters backslash-escaped in the output.
    #[test]
    fn test_plain_text_escaped_when_enabled() {
        let escaping = ConversionOptions {
            escape_asterisks: true,
            escape_underscores: true,
            ..Default::default()
        };

        let converted = convert("Text *asterisks* _underscores_", Some(escaping)).unwrap();
        let markdown = converted.content.unwrap_or_default();

        let has_escaped_asterisks = markdown.contains(r"\*asterisks\*");
        let has_escaped_underscores = markdown.contains(r"\_underscores\_");
        assert!(has_escaped_asterisks);
        assert!(has_escaped_underscores);
    }
}