html-to-markdown-rs 2.4.2

High-performance HTML to Markdown converter using tl parser. Part of the Kreuzberg ecosystem.
Documentation

html-to-markdown-rs

High-performance HTML to Markdown converter built with Rust.

Crates.io · License: MIT · docs.rs

Fast, reliable HTML to Markdown conversion with full CommonMark compliance. Built with html5ever for correctness and ammonia for safe HTML preprocessing.

Installation

[dependencies]
html-to-markdown-rs = "2.3"

Basic Usage

use html_to_markdown_rs::{convert, ConversionOptions};

/// Converts a small HTML fragment to Markdown without custom options and
/// prints the result to standard output.
///
/// # Errors
///
/// Propagates any error returned by `convert`.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A raw string literal keeps the markup free of escape sequences.
    let input = r#"
        <h1>Welcome</h1>
        <p>This is <strong>fast</strong> conversion!</p>
        <ul>
            <li>Built with Rust</li>
            <li>CommonMark compliant</li>
        </ul>
    "#;

    // `None` means no `ConversionOptions` are supplied.
    let rendered = convert(input, None)?;
    println!("{}", rendered);

    Ok(())
}

Configuration

use html_to_markdown_rs::{
    convert, ConversionOptions, HeadingStyle, ListIndentType,
    PreprocessingOptions, PreprocessingPreset,
};

// Override only the fields of interest; `..Default::default()` supplies
// every remaining field of `ConversionOptions`.
let custom = ConversionOptions {
    // Headings
    heading_style: HeadingStyle::Atx,

    // Lists
    bullets: String::from("-"),
    list_indent_type: ListIndentType::Spaces,
    list_indent_width: 2,

    // Emphasis marker and escaping
    strong_em_symbol: '*',
    escape_asterisks: false,
    escape_underscores: false,

    // Line breaks and code blocks
    newline_style: html_to_markdown_rs::NewlineStyle::Backslash,
    code_block_style: html_to_markdown_rs::CodeBlockStyle::Backticks,

    ..Default::default()
};

// Pass the customised options in place of `None`.
let markdown = convert(html, Some(custom))?;

Web Scraping with Preprocessing

use html_to_markdown_rs::{convert, ConversionOptions, PreprocessingOptions};

// Start from the default options, then adjust only the preprocessing section.
let mut scrape_options = ConversionOptions::default();
{
    // One mutable borrow of the nested preprocessing settings keeps the
    // assignments below short.
    let pre = &mut scrape_options.preprocessing;
    pre.enabled = true;
    pre.preset = html_to_markdown_rs::PreprocessingPreset::Aggressive;
    pre.remove_navigation = true;
    pre.remove_forms = true;
}

// Convert the scraped page with the preprocessing-enabled options.
let markdown = convert(scraped_html, Some(scrape_options))?;

hOCR Table Extraction

use html_to_markdown_rs::convert;

// hOCR documents (from Tesseract, etc.) are detected automatically.
// Tables and spatial layout are reconstructed without additional options.
// Accordingly, `None` is passed for the options argument. `hocr_html` is not
// defined in this snippet; it is assumed to hold the hOCR markup as a string.
let markdown = convert(hocr_html, None)?;

Inline Image Extraction

use html_to_markdown_rs::{convert_with_inline_images, InlineImageConfig};

// Size limit handed to `InlineImageConfig::new`: 5 * 1024 * 1024 bytes.
let max_image_bytes = 5 * 1024 * 1024;

// Build the extraction settings with the builder-style `with_*` methods.
let image_config = InlineImageConfig::new(max_image_bytes)
    .with_infer_dimensions(true)
    .with_filename_prefix(String::from("img_"));

// Convert the document and collect its inline images in one call;
// `None` means no custom conversion options are supplied.
let extraction = convert_with_inline_images(html, None, image_config)?;

// Print the Markdown first, then one summary line per extracted image.
println!("{}", extraction.markdown);
for (index, image) in extraction.inline_images.iter().enumerate() {
    let byte_len = image.data.len();
    println!("Image {}: {} ({} bytes)", index, image.format, byte_len);
}

Other Language Bindings

This is the core Rust library. Bindings for other languages are covered in the project documentation.

Documentation

Performance

10-30x faster than pure Python/JavaScript implementations, delivering 150-210 MB/s throughput.

License

MIT