use crate::html::convert_relative_urls;
use crate::Result;
use base64::{engine::general_purpose::STANDARD, Engine as _};
use serde::{Deserialize, Serialize};
use tracing::{debug, info};
/// Outcome of an HTML-to-Markdown conversion performed by [`convert_with_kreuzberg`].
///
/// Structured parts of the converter output (metadata, tables, images) are stored as
/// loosely-typed JSON values rather than as the converter's own types.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct KreuzbergResult {
/// Converted content; an empty string when the converter returned no content.
pub content: String,
/// Converter metadata serialized to JSON; `None` when it serializes to `null`
/// or cannot be serialized.
pub metadata: Option<serde_json::Value>,
/// Tables reported by the converter, each serialized to JSON.
pub tables: Vec<serde_json::Value>,
/// Inline images reported by the converter, one JSON object per image.
pub images: Vec<serde_json::Value>,
/// Messages of the warnings reported by the converter.
pub warnings: Vec<String>,
}
pub fn convert_with_kreuzberg(html: &str, base_url: Option<&str>) -> Result<KreuzbergResult> {
info!("Converting HTML to Markdown using kreuzberg");
let processed_html = base_url.map_or_else(
|| html.to_string(),
|base| convert_relative_urls(html, base),
);
let options = html_to_markdown_rs::ConversionOptions {
extract_metadata: true,
extract_images: true,
..html_to_markdown_rs::ConversionOptions::default()
};
let result = html_to_markdown_rs::convert(&processed_html, Some(options))
.map_err(|e| crate::WebCaptureError::MarkdownError(format!("kreuzberg: {e}")))?;
let content = result.content.unwrap_or_default();
let metadata = serde_json::to_value(&result.metadata)
.ok()
.filter(|v| !v.is_null());
let tables: Vec<serde_json::Value> = result
.tables
.iter()
.filter_map(|t| serde_json::to_value(t).ok())
.collect();
let images: Vec<serde_json::Value> = result.images.iter().map(inline_image_to_json).collect();
let warnings: Vec<String> = result.warnings.iter().map(|w| w.message.clone()).collect();
debug!(
"Kreuzberg conversion complete: {} bytes content, {} tables, {} images, {} warnings",
content.len(),
tables.len(),
images.len(),
warnings.len()
);
Ok(KreuzbergResult {
content,
metadata,
tables,
images,
warnings,
})
}
fn inline_image_to_json(image: &html_to_markdown_rs::InlineImage) -> serde_json::Value {
let dimensions = image.dimensions.map(|(width, height)| {
serde_json::json!({
"width": width,
"height": height,
})
});
serde_json::json!({
"data": STANDARD.encode(&image.data),
"format": image.format.to_string(),
"filename": image.filename.as_deref(),
"description": image.description.as_deref(),
"dimensions": dimensions,
"source": image.source.to_string(),
"attributes": &image.attributes,
})
}