#![doc(html_root_url = "https://docs.rs/mdkit")]
#![cfg_attr(docsrs, feature(doc_cfg))]
use std::collections::HashMap;
use std::path::Path;
// Private error module; its `Error` and `Result` items are re-exported at the
// crate root so callers can write `crate::Error` / `crate::Result`.
mod error;
pub use error::{Error, Result};
// Optional public modules. Each one is compiled only when the Cargo feature
// named in its `cfg` attribute is enabled.
#[cfg(feature = "pdf")]
pub mod pdf;
#[cfg(feature = "calamine")]
pub mod calamine;
#[cfg(feature = "csv")]
pub mod csv;
#[cfg(feature = "html")]
pub mod html;
#[cfg(feature = "pandoc")]
pub mod pandoc;
// Requires the `ocr-platform` feature AND a macOS target; absent elsewhere.
#[cfg(all(feature = "ocr-platform", target_os = "macos"))]
pub mod ocr_macos;
/// Result of an extraction: a Markdown body plus optional descriptive data.
///
/// `Default` yields an empty body, no title and no metadata.
#[derive(Debug, Clone, Default)]
pub struct Document {
    /// Markdown text of the document.
    pub markdown: String,
    /// Title of the document; `None` when not set.
    pub title: Option<String>,
    /// String key/value pairs attached to the document.
    pub metadata: HashMap<String, String>,
}

impl Document {
    /// Builds a document whose body is `markdown`, with no title and empty
    /// metadata.
    pub fn new(markdown: impl Into<String>) -> Self {
        let markdown = markdown.into();
        // Everything except the body takes its derived default
        // (`None` title, empty map).
        Self {
            markdown,
            ..Self::default()
        }
    }

    /// Length of the Markdown body in Unicode scalar values (`char`s), not
    /// bytes. This walks the whole string on every call.
    pub fn len(&self) -> usize {
        let body: &str = &self.markdown;
        body.chars().count()
    }

    /// Returns `true` when the Markdown body is the empty string.
    pub fn is_empty(&self) -> bool {
        let body: &str = &self.markdown;
        body.is_empty()
    }
}
pub trait Extractor: Send + Sync {
fn extensions(&self) -> &[&'static str];
fn extract(&self, path: &Path) -> Result<Document>;
fn extract_bytes(&self, _bytes: &[u8], _ext: &str) -> Result<Document> {
Err(Error::UnsupportedOperation(
"this extractor does not support in-memory extraction".into(),
))
}
fn name(&self) -> &'static str {
std::any::type_name::<Self>()
}
}
/// Ordered collection of [`Extractor`]s that routes input to the first
/// registered extractor claiming its file extension.
pub struct Engine {
    /// Extractors in registration order; lookups scan front to back, so an
    /// earlier registration shadows a later one for the same extension.
    extractors: Vec<Box<dyn Extractor>>,
}

impl Engine {
    /// Creates an engine with no extractors registered.
    pub fn new() -> Self {
        Self {
            extractors: Vec::new(),
        }
    }

    /// Creates an engine holding every extractor enabled through Cargo
    /// features, silently dropping any whose construction failed.
    ///
    /// Use [`Engine::with_defaults_diagnostic`] to learn why one is missing.
    pub fn with_defaults() -> Self {
        let (engine, _errors) = Self::with_defaults_diagnostic();
        engine
    }

    /// Like [`Engine::with_defaults`], but also returns the construction
    /// failures, each paired with a short label (`"pdf"`, `"pandoc"`).
    ///
    /// Registration order (and therefore lookup priority) is: pdf, calamine,
    /// csv, html, macOS OCR, pandoc — limited to the enabled features.
    pub fn with_defaults_diagnostic() -> (Self, Vec<(&'static str, Error)>) {
        // With no extractor feature enabled neither binding is ever mutated,
        // so both need the allow to keep such builds free of `unused_mut`.
        #[allow(unused_mut)]
        let mut engine = Self::new();
        #[allow(unused_mut)]
        let mut errors: Vec<(&'static str, Error)> = Vec::new();
        #[cfg(feature = "pdf")]
        {
            // Fallible constructor: record the failure instead of aborting.
            match crate::pdf::PdfiumExtractor::new() {
                Ok(ext) => {
                    engine.register(Box::new(ext));
                }
                Err(e) => errors.push(("pdf", e)),
            }
        }
        #[cfg(feature = "calamine")]
        {
            engine.register(Box::new(crate::calamine::CalamineExtractor::new()));
        }
        #[cfg(feature = "csv")]
        {
            engine.register(Box::new(crate::csv::CsvExtractor::new()));
        }
        #[cfg(feature = "html")]
        {
            engine.register(Box::new(crate::html::Html2mdExtractor::new()));
        }
        #[cfg(all(feature = "ocr-platform", target_os = "macos"))]
        {
            engine.register(Box::new(crate::ocr_macos::VisionOcrExtractor::new()));
        }
        #[cfg(feature = "pandoc")]
        {
            // Registered last, so every extractor above takes priority for
            // any extension they share with it.
            match crate::pandoc::PandocExtractor::new() {
                Ok(ext) => {
                    engine.register(Box::new(ext));
                }
                Err(e) => errors.push(("pandoc", e)),
            }
        }
        (engine, errors)
    }

    /// Appends `extractor` to the lookup order and returns `self` so calls
    /// can be chained.
    pub fn register(&mut self, extractor: Box<dyn Extractor>) -> &mut Self {
        self.extractors.push(extractor);
        self
    }

    /// Number of registered extractors.
    pub fn len(&self) -> usize {
        self.extractors.len()
    }

    /// Returns `true` when no extractor is registered.
    pub fn is_empty(&self) -> bool {
        self.extractors.is_empty()
    }

    /// Extracts the file at `path` with the first extractor registered for
    /// its extension.
    ///
    /// # Errors
    ///
    /// Returns `Error::UnsupportedFormat` when no extension can be taken
    /// from `path` or no registered extractor claims it; otherwise returns
    /// whatever the chosen extractor returns.
    pub fn extract(&self, path: &Path) -> Result<Document> {
        let ext = extension_of(path).ok_or_else(|| {
            Error::UnsupportedFormat(format!("no file extension on {}", path.display()))
        })?;
        let extractor = self.find(&ext).ok_or_else(|| {
            Error::UnsupportedFormat(format!("no extractor registered for .{ext}"))
        })?;
        extractor.extract(path)
    }

    /// Extracts from an in-memory buffer, choosing the extractor by `ext`.
    ///
    /// `ext` may be given with or without leading dots and in any ASCII
    /// case; the extractor receives the normalized (dot-less, lowercase)
    /// form.
    ///
    /// # Errors
    ///
    /// Returns `Error::UnsupportedFormat` when no registered extractor
    /// claims `ext`; otherwise returns whatever the chosen extractor returns.
    pub fn extract_bytes(&self, bytes: &[u8], ext: &str) -> Result<Document> {
        let lower = ext.trim_start_matches('.').to_ascii_lowercase();
        let extractor = self.find(&lower).ok_or_else(|| {
            Error::UnsupportedFormat(format!("no extractor registered for .{lower}"))
        })?;
        extractor.extract_bytes(bytes, &lower)
    }

    /// Returns the first registered extractor claiming `ext`.
    ///
    /// The comparison ignores ASCII case so that an extractor declaring
    /// `"PDF"` is still reachable; callers always pass a lowercased key.
    fn find(&self, ext: &str) -> Option<&dyn Extractor> {
        self.extractors
            .iter()
            .find(|e| {
                e.extensions()
                    .iter()
                    .any(|declared| declared.eq_ignore_ascii_case(ext))
            })
            .map(std::convert::AsRef::as_ref)
    }
}

impl Default for Engine {
    /// Equivalent to [`Engine::with_defaults`].
    fn default() -> Self {
        Self::with_defaults()
    }
}
/// Returns the ASCII-lowercased extension of `path`, or `None` when the path
/// has no extension.
///
/// An extension that is not valid UTF-8 is converted lossily (invalid
/// sequences become U+FFFD) instead of being dropped, so a caller does not
/// mistake such a file for one that has no extension at all.
fn extension_of(path: &Path) -> Option<String> {
    path.extension()
        .map(|os| os.to_string_lossy().to_ascii_lowercase())
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Test double that returns its input verbatim as the document body.
    struct EchoExtractor {
        exts: &'static [&'static str],
    }

    impl Extractor for EchoExtractor {
        fn extensions(&self) -> &[&'static str] {
            self.exts
        }
        fn extract(&self, path: &Path) -> Result<Document> {
            Ok(Document::new(std::fs::read_to_string(path)?))
        }
        fn extract_bytes(&self, bytes: &[u8], _ext: &str) -> Result<Document> {
            Ok(Document::new(String::from_utf8_lossy(bytes).into_owned()))
        }
    }

    /// Test double that ignores its input and always yields `output`, so a
    /// test can tell which of several registered extractors was dispatched.
    struct FixedExtractor {
        exts: &'static [&'static str],
        output: &'static str,
    }

    impl Extractor for FixedExtractor {
        fn extensions(&self) -> &[&'static str] {
            self.exts
        }
        fn extract(&self, _path: &Path) -> Result<Document> {
            Ok(Document::new(self.output))
        }
        fn extract_bytes(&self, _bytes: &[u8], _ext: &str) -> Result<Document> {
            Ok(Document::new(self.output))
        }
    }

    /// An engine with no extractors answers `UnsupportedFormat`.
    #[test]
    fn empty_engine_rejects_all_files() {
        let engine = Engine::new();
        let f = NamedTempFile::new().unwrap();
        let err = engine.extract(f.path()).unwrap_err();
        assert!(matches!(err, Error::UnsupportedFormat(_)));
    }

    /// A file is routed to the extractor registered for its extension.
    #[test]
    fn dispatches_by_extension() {
        let mut engine = Engine::new();
        engine.register(Box::new(EchoExtractor { exts: &["txt"] }));
        let mut f = tempfile::Builder::new().suffix(".txt").tempfile().unwrap();
        write!(f, "hello world").unwrap();
        f.flush().unwrap();
        let doc = engine.extract(f.path()).unwrap();
        assert_eq!(doc.markdown, "hello world");
    }

    /// An upper-case extension on disk still reaches a lower-case registration.
    #[test]
    fn extension_match_is_case_insensitive() {
        let mut engine = Engine::new();
        engine.register(Box::new(EchoExtractor { exts: &["pdf"] }));
        let mut f = tempfile::Builder::new().suffix(".PDF").tempfile().unwrap();
        write!(f, "fake pdf").unwrap();
        f.flush().unwrap();
        let doc = engine.extract(f.path()).unwrap();
        assert_eq!(doc.markdown, "fake pdf");
    }

    /// When two extractors claim the same extension, the one registered
    /// first handles the input.
    #[test]
    fn first_registered_extractor_wins() {
        let mut engine = Engine::new();
        engine.register(Box::new(FixedExtractor {
            exts: &["md"],
            output: "first",
        }));
        engine.register(Box::new(FixedExtractor {
            exts: &["md"],
            output: "second",
        }));
        assert_eq!(engine.len(), 2);
        // Both stay registered, but dispatch must pick the earlier one.
        let doc = engine.extract_bytes(b"ignored", "md").unwrap();
        assert_eq!(doc.markdown, "first");
    }

    /// `extract_bytes` accepts the extension with or without a leading dot.
    #[test]
    fn extract_bytes_uses_explicit_extension() {
        let mut engine = Engine::new();
        engine.register(Box::new(EchoExtractor { exts: &["html"] }));
        let doc = engine.extract_bytes(b"<p>hi</p>", "html").unwrap();
        assert_eq!(doc.markdown, "<p>hi</p>");
        let doc2 = engine.extract_bytes(b"<p>hi</p>", ".html").unwrap();
        assert_eq!(doc2.markdown, "<p>hi</p>");
    }

    /// A path without an extension produces `UnsupportedFormat`, not a panic.
    #[test]
    fn missing_extension_is_a_clean_error() {
        let engine = Engine::with_defaults();
        let f = tempfile::Builder::new().tempfile().unwrap();
        let err = engine.extract(f.path()).unwrap_err();
        assert!(matches!(err, Error::UnsupportedFormat(_)));
    }

    /// `Document::len` and `Document::is_empty` track the Markdown body.
    #[test]
    fn document_helpers_work() {
        let mut doc = Document::new("hello");
        assert_eq!(doc.len(), 5);
        assert!(!doc.is_empty());
        doc.markdown.clear();
        assert!(doc.is_empty());
    }
}