web-capture 0.3.31

CLI and microservice to render web pages as HTML, Markdown, or PNG
Documentation
//! Kreuzberg html-to-markdown integration module.
//!
//! Provides high-performance HTML to Markdown conversion using the
//! `html-to-markdown-rs` crate (Rust-powered, 150-280 MB/s).
//!
//! This converter is available as an alternative to the default `html2md`-based
//! converter, providing structured results with metadata, table, and image extraction.
//!
//! # References
//!
//! - <https://github.com/kreuzberg-dev/html-to-markdown>
//! - <https://crates.io/crates/html-to-markdown-rs>

use crate::html::convert_relative_urls;
use crate::Result;
use base64::{engine::general_purpose::STANDARD, Engine as _};
use serde::{Deserialize, Serialize};
use tracing::{debug, info};

/// Structured result from kreuzberg HTML-to-Markdown conversion.
///
/// Produced by [`convert_with_kreuzberg`]. Apart from `content` and `warnings`, every field is
/// held as loosely-typed JSON, so this struct does not expose the converter crate's own types.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KreuzbergResult {
    /// The converted markdown content; empty when the converter returned no content.
    pub content: String,
    /// Extracted metadata (title, links, headings, images, structured data).
    ///
    /// `None` when the metadata serializes to JSON `null` or cannot be serialized.
    pub metadata: Option<serde_json::Value>,
    /// Extracted table data, one JSON value per table that could be serialized.
    pub tables: Vec<serde_json::Value>,
    /// Extracted inline image data, one JSON object per image with the raw payload
    /// base64-encoded under the `data` key.
    pub images: Vec<serde_json::Value>,
    /// Non-fatal conversion warnings.
    pub warnings: Vec<String>,
}

/// Convert HTML to Markdown using the kreuzberg html-to-markdown library.
///
/// Returns a structured result with content, metadata, tables, images, and warnings.
///
/// # Arguments
///
/// * `html` - The HTML content to convert
/// * `base_url` - Optional base URL for converting relative URLs to absolute
///
/// # Returns
///
/// A `KreuzbergResult` with structured conversion output
///
/// # Errors
///
/// Returns an error if conversion fails
pub fn convert_with_kreuzberg(html: &str, base_url: Option<&str>) -> Result<KreuzbergResult> {
    info!("Converting HTML to Markdown using kreuzberg");

    // Convert relative URLs to absolute if base_url is provided
    let processed_html = base_url.map_or_else(
        || html.to_string(),
        |base| convert_relative_urls(html, base),
    );

    let options = html_to_markdown_rs::ConversionOptions {
        extract_metadata: true,
        extract_images: true,
        ..html_to_markdown_rs::ConversionOptions::default()
    };

    let result = html_to_markdown_rs::convert(&processed_html, Some(options))
        .map_err(|e| crate::WebCaptureError::MarkdownError(format!("kreuzberg: {e}")))?;

    let content = result.content.unwrap_or_default();

    // Convert metadata to JSON value
    let metadata = serde_json::to_value(&result.metadata)
        .ok()
        .filter(|v| !v.is_null());

    // Convert tables to JSON values
    let tables: Vec<serde_json::Value> = result
        .tables
        .iter()
        .filter_map(|t| serde_json::to_value(t).ok())
        .collect();

    // Convert inline image payloads into a JSON-compatible structure.
    let images: Vec<serde_json::Value> = result.images.iter().map(inline_image_to_json).collect();

    // Convert warnings to strings
    let warnings: Vec<String> = result.warnings.iter().map(|w| w.message.clone()).collect();

    debug!(
        "Kreuzberg conversion complete: {} bytes content, {} tables, {} images, {} warnings",
        content.len(),
        tables.len(),
        images.len(),
        warnings.len()
    );

    Ok(KreuzbergResult {
        content,
        metadata,
        tables,
        images,
        warnings,
    })
}

fn inline_image_to_json(image: &html_to_markdown_rs::InlineImage) -> serde_json::Value {
    let dimensions = image.dimensions.map(|dimensions| {
        serde_json::json!({
            "width": dimensions.width,
            "height": dimensions.height,
        })
    });

    serde_json::json!({
        "data": STANDARD.encode(&image.data),
        "format": image.format.to_string(),
        "filename": image.filename.as_deref(),
        "description": image.description.as_deref(),
        "dimensions": dimensions,
        "source": image.source.to_string(),
        "attributes": &image.attributes,
    })
}

#[cfg(test)]
mod tests {
    use super::inline_image_to_json;
    use html_to_markdown_rs::{ImageDimensions, InlineImage, InlineImageFormat, InlineImageSource};
    use std::collections::BTreeMap;

    /// Regression guard for issue #137. Since html-to-markdown-rs 3.6,
    /// `InlineImage.dimensions` is an `Option<ImageDimensions>` struct instead of an
    /// `Option<(u32, u32)>` tuple; the emitted JSON must still expose the
    /// `{ "width", "height" }` object that consumers read.
    #[test]
    fn inline_image_dimensions_serialize_with_width_height_keys() {
        let size = ImageDimensions {
            width: 800,
            height: 600,
        };
        let png = InlineImage {
            data: vec![1, 2, 3],
            format: InlineImageFormat::Png,
            filename: Some(String::from("pixel.png")),
            description: Some(String::from("one pixel")),
            dimensions: Some(size),
            source: InlineImageSource::ImgDataUri,
            attributes: BTreeMap::new(),
        };

        let encoded = inline_image_to_json(&png);
        let dimensions = &encoded["dimensions"];

        assert_eq!(dimensions["width"], 800);
        assert_eq!(dimensions["height"], 600);
        assert_eq!(encoded["format"], "png");
        assert_eq!(encoded["filename"], "pixel.png");
    }

    /// An image whose dimensions could not be inferred must yield a JSON `null` for
    /// `dimensions` instead of panicking on the `ImageDimensions` type.
    #[test]
    fn inline_image_without_dimensions_serializes_null() {
        let svg = InlineImage {
            data: Vec::new(),
            format: InlineImageFormat::Svg,
            filename: None,
            description: None,
            dimensions: None,
            source: InlineImageSource::SvgElement,
            attributes: BTreeMap::new(),
        };

        let encoded = inline_image_to_json(&svg);

        assert!(encoded["dimensions"].is_null());
    }
}