pub mod budget;
pub mod diff;
pub mod diff_format;
pub mod focus;
pub mod html;
pub mod image;
pub mod link_extract;
pub mod media;
pub mod ocr;
#[cfg(feature = "pdf")]
pub mod pdf;
pub mod pdf_light;
pub mod plain;
pub mod quality;
pub mod readability;
pub mod response_classifier;
pub mod snapshot_store;
pub mod spa_extract;
pub mod structured;
#[cfg(feature = "pdf")]
pub mod table;
#[cfg(feature = "pdf")]
pub mod types;
use anyhow::Result;
/// Outcome of converting raw content bytes into Markdown.
///
/// Returned by [`ContentHandler::to_markdown`] and by the [`ContentRouter`]
/// conversion methods.
#[derive(Debug, Clone)]
pub struct ConversionResult {
/// Markdown text produced by the handler.
pub markdown: String,
/// Page count, when the handler reports one. The router tests expect `None`
/// for HTML input; presumably set for paginated formats such as PDF (confirm
/// against the PDF handlers).
pub page_count: Option<usize>,
/// Content type reported by the handler (the router tests expect
/// `"text/html"` for HTML input).
pub content_type: String,
/// NOTE(review): presumably the conversion time in milliseconds; the value is
/// filled in by the individual handlers, which are not visible here.
pub elapsed_ms: f64,
/// Optional quality assessment of the produced Markdown, see
/// [`quality::QualityScore`].
pub quality: Option<quality::QualityScore>,
}
/// A converter that turns the raw bytes of certain content types into Markdown.
///
/// Implementations must be `Send + Sync`; [`ContentRouter`] stores them as
/// boxed trait objects.
pub trait ContentHandler: Send + Sync {
/// Media types this handler accepts.
///
/// [`ContentRouter`] compares these entries for exact equality against the
/// lowercased, parameter-free media type (e.g. `text/plain`), so entries
/// should be lowercase and must not carry parameters such as `; charset=…`.
fn supported_types(&self) -> &[&str];
/// Converts `bytes` to Markdown.
///
/// `content_type` is the value the caller supplied to the router, passed
/// through unnormalised (it may still contain parameters or uppercase
/// letters).
///
/// # Errors
///
/// Returns an error when the handler cannot convert the input.
fn to_markdown(&self, bytes: &[u8], content_type: &str) -> Result<ConversionResult>;
}
/// Dispatches raw content to the [`ContentHandler`] that matches its content type.
pub struct ContentRouter {
// Registered handlers; the first one whose `supported_types` contains the
// requested media type wins, so order matters.
handlers: Vec<Box<dyn ContentHandler>>,
// Options forwarded to the HTML handler on every HTML conversion.
html_options: html::HtmlConversionOptions,
}
impl ContentRouter {
    /// Builds a router with the built-in handlers and default HTML conversion
    /// options.
    ///
    /// Handlers are consulted in registration order: the PDF handler first
    /// (`pdf::PdfHandler` when the `pdf` feature is enabled,
    /// `pdf_light::PdfLightHandler` otherwise), then HTML, image and plain text.
    #[must_use]
    pub fn new() -> Self {
        // Only the PDF handler depends on the feature flag; the rest of the
        // list is shared so the two configurations cannot drift apart.
        #[cfg(feature = "pdf")]
        let pdf_handler: Box<dyn ContentHandler> = Box::new(pdf::PdfHandler::new());
        #[cfg(not(feature = "pdf"))]
        let pdf_handler: Box<dyn ContentHandler> = Box::new(pdf_light::PdfLightHandler);

        Self {
            handlers: vec![
                pdf_handler,
                Box::new(html::HtmlHandler),
                Box::new(image::ImageHandler),
                Box::new(plain::PlainHandler),
            ],
            html_options: html::HtmlConversionOptions::default(),
        }
    }

    /// Builds a router like [`ContentRouter::new`], but using `html_options`
    /// for every HTML conversion.
    #[must_use]
    pub fn with_html_options(html_options: html::HtmlConversionOptions) -> Self {
        Self {
            html_options,
            ..Self::new()
        }
    }

    /// Converts `bytes` to Markdown according to `content_type`.
    ///
    /// Shorthand for [`ContentRouter::convert_with_url`] with no URL.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by the handler that was selected.
    pub fn convert(&self, bytes: &[u8], content_type: &str) -> Result<ConversionResult> {
        self.convert_with_url(bytes, content_type, None)
    }

    /// Converts `bytes` to Markdown according to `content_type`.
    ///
    /// Routing rules, in order:
    ///
    /// 1. `text/html` and `application/xhtml+xml` go to the HTML handler with
    ///    this router's HTML options and `url`.
    /// 2. Otherwise the first registered handler whose supported types contain
    ///    the media type is used.
    /// 3. Otherwise, if the body looks like HTML, it is converted as
    ///    `text/html`.
    /// 4. Otherwise the plain-text handler is used.
    ///
    /// The media type is taken from `content_type` by dropping any parameters
    /// (`; charset=…`), trimming and lowercasing. `url` is only forwarded to
    /// the HTML handler.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by the handler that was selected.
    pub fn convert_with_url(
        &self,
        bytes: &[u8],
        content_type: &str,
        url: Option<&str>,
    ) -> Result<ConversionResult> {
        // Only the bare media type matters for routing.
        let mime = content_type
            .split(';')
            .next()
            .unwrap_or(content_type)
            .trim()
            .to_lowercase();

        // HTML bypasses the generic handler list so that the URL and the
        // router-level options reach the HTML handler.
        if matches!(mime.as_str(), "text/html" | "application/xhtml+xml") {
            return html::HtmlHandler.to_markdown_with_url_and_options(
                bytes,
                content_type,
                url,
                self.html_options,
            );
        }

        let registered = self
            .handlers
            .iter()
            .find(|handler| handler.supported_types().contains(&mime.as_str()));
        if let Some(handler) = registered {
            return handler.to_markdown(bytes, content_type);
        }

        // Unknown or mislabelled content type: sniff the body before giving up
        // and treating it as plain text.
        if Self::looks_like_html(bytes) {
            return html::HtmlHandler.to_markdown_with_url_and_options(
                bytes,
                "text/html",
                url,
                self.html_options,
            );
        }

        plain::PlainHandler.to_markdown(bytes, content_type)
    }

    /// Returns `true` when `bytes` start like an HTML document.
    ///
    /// A leading UTF-8 byte-order mark and leading ASCII whitespace are
    /// skipped. The remainder must begin with `<!` (doctype or comment) or
    /// with `<html` in any letter case.
    fn looks_like_html(bytes: &[u8]) -> bool {
        const UTF8_BOM: &[u8] = b"\xEF\xBB\xBF";

        let body = bytes.strip_prefix(UTF8_BOM).unwrap_or(bytes);
        let first_visible = body
            .iter()
            .position(|byte| !byte.is_ascii_whitespace())
            .unwrap_or(body.len());
        let body = &body[first_visible..];

        body.starts_with(b"<!")
            || body
                .get(..5)
                .map_or(false, |tag| tag.eq_ignore_ascii_case(b"<html"))
    }
}
impl Default for ContentRouter {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `body` through a freshly built default router and unwraps the
    /// result, so a conversion error fails the calling test.
    fn convert_ok(body: &[u8], content_type: &str) -> ConversionResult {
        ContentRouter::new().convert(body, content_type).unwrap()
    }

    #[test]
    fn router_dispatches_html_to_html_handler() {
        let converted = convert_ok(
            b"<html><body><h1>Title</h1><p>Body text</p></body></html>",
            "text/html",
        );

        assert!(converted.markdown.contains("Title"));
        assert!(converted.markdown.contains("Body text"));
        assert_eq!(converted.content_type, "text/html");
        assert!(converted.page_count.is_none());
    }

    #[test]
    fn router_dispatches_xhtml_to_html_handler() {
        let converted = convert_ok(
            b"<html><body><p>XHTML content</p></body></html>",
            "application/xhtml+xml",
        );

        assert!(converted.markdown.contains("XHTML content"));
    }

    #[test]
    fn router_dispatches_plain_text() {
        let converted = convert_ok(b"Hello, plain world!", "text/plain");

        assert_eq!(converted.markdown, "Hello, plain world!");
    }

    #[test]
    fn router_dispatches_json() {
        let converted = convert_ok(br#"{"key": "value"}"#, "application/json");

        assert!(converted.markdown.contains(r#""key""#));
    }

    #[test]
    fn router_handles_content_type_with_charset() {
        // Parameters after `;` must not prevent HTML routing.
        let converted = convert_ok(
            b"<html><body>Charset test</body></html>",
            "text/html; charset=utf-8",
        );

        assert!(converted.markdown.contains("Charset test"));
    }

    #[test]
    fn router_falls_back_to_html_for_html_like_bytes() {
        // Content type is not HTML, but the body starts with a doctype.
        let converted = convert_ok(
            b"<!DOCTYPE html><html><body>Fallback</body></html>",
            "application/octet-stream",
        );

        assert!(converted.markdown.contains("Fallback"));
    }

    #[test]
    fn router_dispatches_markdown_to_plain_handler() {
        let converted = convert_ok(b"# Title\n\nParagraph with **bold**.", "text/markdown");

        assert!(converted.markdown.contains("# Title"));
        assert!(converted.markdown.contains("**bold**"));
    }

    #[test]
    fn router_falls_back_to_plain_for_unknown() {
        let converted = convert_ok(b"Some unknown binary-ish data", "application/octet-stream");

        assert!(converted.markdown.contains("unknown binary"));
    }
}