use crate::html::convert_relative_urls;
use crate::Result;
use base64::{engine::general_purpose::STANDARD, Engine as _};
use serde::{Deserialize, Serialize};
use tracing::{debug, info};
/// Serializable outcome of [`convert_with_kreuzberg`].
///
/// Structured parts (metadata, tables, images) are carried as loosely typed
/// JSON values rather than the converter's own types.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KreuzbergResult {
/// Converted text; an empty string when the converter returned no content.
pub content: String,
/// Converter metadata serialized to JSON, or `None` when serialization
/// failed or produced JSON `null`.
pub metadata: Option<serde_json::Value>,
/// Tables reported by the converter, one JSON value per table that
/// serialized successfully.
pub tables: Vec<serde_json::Value>,
/// Extracted inline images, one JSON object per image (see
/// `inline_image_to_json` for the object layout).
pub images: Vec<serde_json::Value>,
/// Messages of the warnings reported by the converter.
pub warnings: Vec<String>,
}
/// Converts an HTML document with the `html_to_markdown_rs` converter and
/// packages the outcome as a [`KreuzbergResult`].
///
/// When `base_url` is given, `html` is first run through
/// [`convert_relative_urls`] with that base; otherwise it is converted as-is.
/// Both metadata extraction and image extraction are switched on.
///
/// Serialization of the structured parts is best-effort: metadata that fails
/// to serialize, or serializes to JSON `null`, is reported as `None`, and any
/// table that fails to serialize is left out of the result.
///
/// # Errors
///
/// Returns [`crate::WebCaptureError::MarkdownError`], with a message prefixed
/// by `kreuzberg: `, when the underlying conversion fails.
pub fn convert_with_kreuzberg(html: &str, base_url: Option<&str>) -> Result<KreuzbergResult> {
    info!("Converting HTML to Markdown using kreuzberg");

    // Resolve relative URLs only when the caller supplied a base.
    let processed_html = match base_url {
        Some(base) => convert_relative_urls(html, base),
        None => html.to_string(),
    };

    let options = html_to_markdown_rs::ConversionOptions {
        extract_metadata: true,
        extract_images: true,
        ..Default::default()
    };

    let converted = html_to_markdown_rs::convert(&processed_html, Some(options))
        .map_err(|err| crate::WebCaptureError::MarkdownError(format!("kreuzberg: {err}")))?;

    let content = converted.content.unwrap_or_default();

    // A serialization failure and a JSON `null` both collapse to `None`.
    let metadata = match serde_json::to_value(&converted.metadata) {
        Ok(value) if !value.is_null() => Some(value),
        _ => None,
    };

    // Tables that cannot be serialized are skipped rather than failing the call.
    let mut tables: Vec<serde_json::Value> = Vec::new();
    for table in converted.tables.iter() {
        if let Ok(value) = serde_json::to_value(table) {
            tables.push(value);
        }
    }

    let images = converted
        .images
        .iter()
        .map(inline_image_to_json)
        .collect::<Vec<serde_json::Value>>();

    let warnings = converted
        .warnings
        .iter()
        .map(|warning| warning.message.clone())
        .collect::<Vec<String>>();

    debug!(
        "Kreuzberg conversion complete: {} bytes content, {} tables, {} images, {} warnings",
        content.len(),
        tables.len(),
        images.len(),
        warnings.len()
    );

    Ok(KreuzbergResult {
        content,
        metadata,
        tables,
        images,
        warnings,
    })
}
fn inline_image_to_json(image: &html_to_markdown_rs::InlineImage) -> serde_json::Value {
let dimensions = image.dimensions.map(|dimensions| {
serde_json::json!({
"width": dimensions.width,
"height": dimensions.height,
})
});
serde_json::json!({
"data": STANDARD.encode(&image.data),
"format": image.format.to_string(),
"filename": image.filename.as_deref(),
"description": image.description.as_deref(),
"dimensions": dimensions,
"source": image.source.to_string(),
"attributes": &image.attributes,
})
}
#[cfg(test)]
mod tests {
    use super::inline_image_to_json;
    use html_to_markdown_rs::{ImageDimensions, InlineImage, InlineImageFormat, InlineImageSource};
    use std::collections::BTreeMap;

    /// Dimensions must appear under `width`/`height`, alongside the format
    /// and filename strings.
    #[test]
    fn inline_image_dimensions_serialize_with_width_height_keys() {
        let size = ImageDimensions {
            width: 800,
            height: 600,
        };
        let png = InlineImage {
            data: vec![1, 2, 3],
            format: InlineImageFormat::Png,
            filename: Some(String::from("pixel.png")),
            description: Some(String::from("one pixel")),
            dimensions: Some(size),
            source: InlineImageSource::ImgDataUri,
            attributes: BTreeMap::new(),
        };

        let serialized = inline_image_to_json(&png);

        assert_eq!(serialized["dimensions"]["width"], 800);
        assert_eq!(serialized["dimensions"]["height"], 600);
        assert_eq!(serialized["format"], "png");
        assert_eq!(serialized["filename"], "pixel.png");
    }

    /// An image without dimensions must serialize `dimensions` as JSON `null`.
    #[test]
    fn inline_image_without_dimensions_serializes_null() {
        let svg = InlineImage {
            data: Vec::new(),
            format: InlineImageFormat::Svg,
            filename: None,
            description: None,
            dimensions: None,
            source: InlineImageSource::SvgElement,
            attributes: BTreeMap::new(),
        };

        let serialized = inline_image_to_json(&svg);

        assert!(serialized["dimensions"].is_null());
    }
}