use crate::{Document, Extractor, Result};
use std::path::Path;
#[cfg(test)]
use crate::Error;
/// HTML-to-Markdown extractor backed by the `html2md` crate.
///
/// The type carries no state, so it is cheap to create, copy, and share.
#[derive(Clone, Copy, Debug, Default)]
pub struct Html2mdExtractor;
impl Html2mdExtractor {
    /// Creates a new extractor.
    ///
    /// The extractor holds no state, so this is a `const fn` and can be used
    /// to initialise `const` and `static` items.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
impl Extractor for Html2mdExtractor {
    /// File extensions (without the leading dot) handled by this extractor.
    fn extensions(&self) -> &[&'static str] {
        &["html", "htm"]
    }

    /// Short identifier of this backend.
    fn name(&self) -> &'static str {
        "html2md"
    }

    /// Reads the file at `path` and converts its HTML content to Markdown.
    ///
    /// The file is read as raw bytes and handed to `extract_bytes`, so both
    /// entry points decode their input the same way: invalid UTF-8 sequences
    /// are replaced with U+FFFD rather than failing the whole extraction.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be read.
    fn extract(&self, path: &Path) -> Result<Document> {
        let bytes = std::fs::read(path)?;
        // `extract_bytes` currently ignores the extension; the real one is
        // forwarded anyway so the call stays correct if that ever changes.
        let ext = path
            .extension()
            .and_then(std::ffi::OsStr::to_str)
            .unwrap_or_default();
        self.extract_bytes(&bytes, ext)
    }

    /// Converts in-memory HTML to Markdown.
    ///
    /// `bytes` is decoded as UTF-8, with invalid sequences replaced by U+FFFD.
    /// The extension argument is unused. The returned document has no title
    /// and empty metadata.
    fn extract_bytes(&self, bytes: &[u8], _ext: &str) -> Result<Document> {
        // Borrows `bytes` when it is already valid UTF-8 and allocates only
        // when replacement characters have to be inserted.
        let html = String::from_utf8_lossy(bytes);
        Ok(Document {
            markdown: html2md::parse_html(&html),
            title: None,
            metadata: std::collections::HashMap::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Both HTML file extensions are advertised, in order.
    #[test]
    fn handles_html_and_htm_extensions() {
        let extractor = Html2mdExtractor::new();
        assert_eq!(extractor.extensions(), &["html", "htm"]);
    }

    /// The backend reports its own name.
    #[test]
    fn name_identifies_backend() {
        let extractor = Html2mdExtractor::new();
        assert_eq!(extractor.name(), "html2md");
    }

    /// A small HTML file on disk keeps its visible text after conversion.
    #[test]
    fn converts_basic_html_to_markdown() {
        let mut page = tempfile::Builder::new().suffix(".html").tempfile().unwrap();
        page.write_all(b"<html><body><h1>Hello</h1><p>World</p></body></html>")
            .unwrap();
        page.flush().unwrap();

        let rendered = Html2mdExtractor::new()
            .extract(page.path())
            .unwrap()
            .markdown;
        assert!(
            rendered.contains("Hello"),
            "expected 'Hello' in output: {:?}",
            rendered
        );
        assert!(
            rendered.contains("World"),
            "expected 'World' in output: {:?}",
            rendered
        );
    }

    /// The in-memory entry point converts HTML without touching the disk.
    #[test]
    fn extract_bytes_works_too() {
        let rendered = Html2mdExtractor::new()
            .extract_bytes(b"<h1>From Bytes</h1>", "html")
            .unwrap()
            .markdown;
        assert!(
            rendered.contains("From Bytes"),
            "expected 'From Bytes' in output: {:?}",
            rendered
        );
    }

    /// A path that does not exist surfaces as an I/O error.
    #[test]
    fn missing_file_returns_io_error() {
        let missing = std::path::Path::new("/nonexistent-html-file-here.html");
        let outcome = Html2mdExtractor::new().extract(missing);
        assert!(matches!(outcome, Err(Error::Io(_))));
    }
}