Skip to main content

mdkit/
lib.rs

1//! # mdkit — get markdown out of any document.
2//!
3//! See the [README](https://github.com/mdkit-project/mdkit) for the full
4//! design rationale; the short version is: dispatch by file extension to
5//! the best backend per format. Pandoc for DOCX/PPTX/EPUB/RTF/ODT/LaTeX,
6//! Pdfium for PDF, OS-native APIs for OCR, `calamine` for spreadsheets.
7//!
8//! ## Quick start
9//!
10//! ```no_run
11//! use mdkit::Engine;
12//! use std::path::Path;
13//!
14//! let engine = Engine::with_defaults();
15//! let doc = engine.extract(Path::new("report.pdf"))?;
16//! println!("{}", doc.markdown);
17//! # Ok::<(), mdkit::Error>(())
18//! ```
19//!
20//! ## Custom extractor
21//!
22//! Implement [`Extractor`] for your own format and register it on an
23//! [`Engine`]:
24//!
25//! ```
26//! use mdkit::{Document, Engine, Extractor, Result};
27//! use std::path::Path;
28//!
29//! struct MyParser;
30//!
31//! impl Extractor for MyParser {
32//!     fn extensions(&self) -> &[&'static str] { &["custom"] }
33//!     fn extract(&self, path: &Path) -> Result<Document> {
34//!         Ok(Document::new(std::fs::read_to_string(path)?))
35//!     }
36//! }
37//!
38//! let mut engine = Engine::new();
39//! engine.register(Box::new(MyParser));
40//! ```
41
42#![doc(html_root_url = "https://docs.rs/mdkit")]
43#![cfg_attr(docsrs, feature(doc_cfg))]
44
45use std::collections::HashMap;
46use std::path::Path;
47
48mod error;
49pub use error::{Error, Result};
50
51#[cfg(feature = "pdf")]
52pub mod pdf;
53
54#[cfg(feature = "calamine")]
55pub mod calamine;
56
57#[cfg(feature = "csv")]
58pub mod csv;
59
60#[cfg(feature = "html")]
61pub mod html;
62
63#[cfg(feature = "pandoc")]
64pub mod pandoc;
65
66// Platform-native OCR. Each module is gated by both the
67// `ocr-platform` feature AND the matching `target_os`, because the
68// underlying FFI deps (objc2-vision on macOS, the `windows` crate on
// Windows) are platform-specific by definition. On Linux with
// `ocr-platform` enabled, neither module is compiled in, so Linux
// users get no platform OCR backend; the ONNX-based fallback ships
// in v0.6 via the separate `ocr-onnx` feature.
73#[cfg(all(feature = "ocr-platform", target_os = "macos"))]
74pub mod ocr_macos;
75
76#[cfg(all(feature = "ocr-platform", target_os = "windows"))]
77pub mod ocr_windows;
78
79// Cross-platform ONNX OCR via the `oar-ocr` crate. Unlike the
80// platform-native modules above this one isn't gated by `target_os`
81// — `oar-ocr` works on Linux, macOS, Windows, and WebAssembly.
82// `Engine::with_defaults` does NOT auto-register an ONNX extractor
83// because construction requires caller-supplied model paths; users
84// wire it in explicitly. See module docs for the registration
85// pattern.
86#[cfg(feature = "ocr-onnx")]
87pub mod ocr_onnx;
88
89// ---------------------------------------------------------------------------
90// Document — the unit of output
91// ---------------------------------------------------------------------------
92
/// The result of extracting one document.
///
/// `markdown` is always present; `title` and `metadata` are best-effort
/// and may be empty depending on the backend.
///
/// Two documents are equal when all three fields are equal, so whole
/// documents can be compared directly (for example with `assert_eq!`).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Document {
    /// The extracted markdown text.
    pub markdown: String,
    /// Document title if the backend could derive one (DOCX core
    /// properties, PDF metadata, HTML `<title>`, etc.). `None` when
    /// unknown.
    pub title: Option<String>,
    /// Backend-specific metadata. Stable keys are documented per-backend;
    /// callers should treat unknown keys as opaque.
    pub metadata: HashMap<String, String>,
}
108
109impl Document {
110    /// Convenience constructor for the common case where you only have
111    /// markdown text.
112    pub fn new(markdown: impl Into<String>) -> Self {
113        Self {
114            markdown: markdown.into(),
115            title: None,
116            metadata: HashMap::new(),
117        }
118    }
119
120    /// Returns the document's character count. Useful for capping logged
121    /// payloads or tracking extraction throughput.
122    pub fn len(&self) -> usize {
123        self.markdown.chars().count()
124    }
125
126    /// Returns true if the extracted markdown is empty.
127    pub fn is_empty(&self) -> bool {
128        self.markdown.is_empty()
129    }
130}
131
132// ---------------------------------------------------------------------------
133// Extractor — the per-format trait
134// ---------------------------------------------------------------------------
135
136/// A backend that knows how to convert one or more file formats to
137/// markdown. Implementors register themselves with an [`Engine`].
138///
139/// `Send + Sync` is required so engines can be shared across threads.
140/// All public methods take `&self` so implementors can wrap their
141/// internals in `Arc<Mutex<...>>` if they need interior state.
142pub trait Extractor: Send + Sync {
143    /// Lowercase file extensions this extractor handles, **without**
144    /// the leading dot. For example: `&["pdf"]`, `&["docx", "doc"]`.
145    fn extensions(&self) -> &[&'static str];
146
147    /// Convert the document at `path` to markdown. Returns
148    /// [`Error::Io`] for filesystem failures, [`Error::ParseError`]
149    /// for backend-specific failures.
150    fn extract(&self, path: &Path) -> Result<Document>;
151
152    /// Convert from in-memory bytes. Default implementation returns
153    /// [`Error::UnsupportedOperation`] — backends that can support it
154    /// (PDF, HTML) should override.
155    fn extract_bytes(&self, _bytes: &[u8], _ext: &str) -> Result<Document> {
156        Err(Error::UnsupportedOperation(
157            "this extractor does not support in-memory extraction".into(),
158        ))
159    }
160
161    /// Human-readable backend name, used in error messages and audit
162    /// logs (e.g. `"pandoc"`, `"pdfium-render"`, `"calamine"`).
163    fn name(&self) -> &'static str {
164        std::any::type_name::<Self>()
165    }
166}
167
168// ---------------------------------------------------------------------------
169// Engine — the dispatcher
170// ---------------------------------------------------------------------------
171
172/// Dispatches `extract` calls to the registered [`Extractor`] for the
173/// file's extension. Construct with [`Engine::new`] for an empty
174/// engine, or [`Engine::with_defaults`] to populate the defaults that
175/// match enabled feature flags.
176pub struct Engine {
177    extractors: Vec<Box<dyn Extractor>>,
178}
179
180impl Engine {
181    /// New engine with no extractors registered. Useful when you want
182    /// full control over the backend set.
183    pub fn new() -> Self {
184        Self {
185            extractors: Vec::new(),
186        }
187    }
188
189    /// New engine with the default backends for the enabled feature
190    /// flags. Backends register themselves silently — if a backend
191    /// can't initialize (e.g. libpdfium isn't on the system library
192    /// path for the `pdf` feature), it's skipped rather than failing
193    /// the whole construction. Use [`with_defaults_diagnostic`] if
194    /// you want to surface those failures to the user.
195    ///
196    /// [`with_defaults_diagnostic`]: Self::with_defaults_diagnostic
197    pub fn with_defaults() -> Self {
198        let (engine, _errors) = Self::with_defaults_diagnostic();
199        engine
200    }
201
202    /// Like [`with_defaults`](Self::with_defaults) but returns the
203    /// list of backend-init errors alongside the engine, so callers
204    /// can log "PDF support disabled: libpdfium not found" rather
205    /// than silently shipping a degraded experience.
206    pub fn with_defaults_diagnostic() -> (Self, Vec<(&'static str, Error)>) {
207        // `mut` is conditionally needed: when --no-default-features is
208        // set and no optional backends are enabled, neither `engine`
209        // nor `errors` ever gets a mutating call. The allow keeps that
210        // valid configuration buildable under -D warnings.
211        #[allow(unused_mut)]
212        let mut engine = Self::new();
213        #[allow(unused_mut)]
214        let mut errors: Vec<(&'static str, Error)> = Vec::new();
215
216        // Registration order matters: the Engine dispatcher returns
217        // the FIRST registered extractor that claims a given file
218        // extension. We register cheap in-process Rust backends first
219        // so they win over the (heavier) Pandoc sidecar for any
220        // overlapping format — most importantly HTML, which both
221        // Html2mdExtractor and PandocExtractor handle. Pandoc is the
222        // last registered, so it picks up DOCX/PPTX/EPUB/RTF/ODT/LaTeX
223        // (which nothing else handles) and ALSO acts as the fallback
224        // HTML reader if the `html` feature is disabled.
225
226        #[cfg(feature = "pdf")]
227        {
228            match crate::pdf::PdfiumExtractor::new() {
229                Ok(ext) => {
230                    // Wire the platform OCR backend in as a fallback
231                    // for scanned (image-only) PDFs. Pdfium can't
232                    // extract text from those — without this hop,
233                    // PdfiumExtractor returns empty markdown silently.
234                    // We construct a SECOND OCR-extractor instance
235                    // here (the standalone image-OCR registration is
236                    // separate); both are stateless so duplication is
237                    // free.
238                    #[allow(unused_mut)]
239                    let mut ext = ext;
240                    #[cfg(all(feature = "ocr-platform", target_os = "macos"))]
241                    {
242                        ext = ext.with_ocr_fallback(Box::new(
243                            crate::ocr_macos::VisionOcrExtractor::new(),
244                        ));
245                    }
246                    #[cfg(all(feature = "ocr-platform", target_os = "windows"))]
247                    {
248                        ext = ext.with_ocr_fallback(Box::new(
249                            crate::ocr_windows::WindowsOcrExtractor::new(),
250                        ));
251                    }
252                    engine.register(Box::new(ext));
253                }
254                Err(e) => errors.push(("pdf", e)),
255            }
256        }
257
258        #[cfg(feature = "calamine")]
259        {
260            engine.register(Box::new(crate::calamine::CalamineExtractor::new()));
261        }
262
263        #[cfg(feature = "csv")]
264        {
265            engine.register(Box::new(crate::csv::CsvExtractor::new()));
266        }
267
268        #[cfg(feature = "html")]
269        {
270            engine.register(Box::new(crate::html::Html2mdExtractor::new()));
271        }
272
273        #[cfg(all(feature = "ocr-platform", target_os = "macos"))]
274        {
275            // Vision is part of macOS — no init failure mode.
276            engine.register(Box::new(crate::ocr_macos::VisionOcrExtractor::new()));
277        }
278
279        #[cfg(all(feature = "ocr-platform", target_os = "windows"))]
280        {
281            // Windows.Media.Ocr is part of Windows — no init failure
282            // at construction time. (Per-call init may still fail if
283            // the user has no OCR-capable language pack installed; we
284            // surface that as a typed error from `extract`.)
285            engine.register(Box::new(crate::ocr_windows::WindowsOcrExtractor::new()));
286        }
287
288        #[cfg(feature = "pandoc")]
289        {
290            match crate::pandoc::PandocExtractor::new() {
291                Ok(ext) => {
292                    engine.register(Box::new(ext));
293                }
294                Err(e) => errors.push(("pandoc", e)),
295            }
296        }
297
298        (engine, errors)
299    }
300
301    /// Register a backend. Multiple backends can claim the same
302    /// extension; the first registered wins on dispatch (so you can
303    /// override defaults by registering your own extractor first).
304    pub fn register(&mut self, extractor: Box<dyn Extractor>) -> &mut Self {
305        self.extractors.push(extractor);
306        self
307    }
308
309    /// Returns the number of registered extractors.
310    pub fn len(&self) -> usize {
311        self.extractors.len()
312    }
313
314    /// Returns true when no extractors are registered.
315    pub fn is_empty(&self) -> bool {
316        self.extractors.is_empty()
317    }
318
319    /// Extract `path` to markdown, dispatching by file extension.
320    /// Returns [`Error::UnsupportedFormat`] if no registered extractor
321    /// claims the extension.
322    pub fn extract(&self, path: &Path) -> Result<Document> {
323        let ext = extension_of(path).ok_or_else(|| {
324            Error::UnsupportedFormat(format!("no file extension on {}", path.display()))
325        })?;
326        let extractor = self.find(&ext).ok_or_else(|| {
327            Error::UnsupportedFormat(format!("no extractor registered for .{ext}"))
328        })?;
329        extractor.extract(path)
330    }
331
332    /// Same as [`extract`](Self::extract) but takes bytes + an explicit
333    /// extension. Backends that don't implement
334    /// [`Extractor::extract_bytes`] return
335    /// [`Error::UnsupportedOperation`].
336    pub fn extract_bytes(&self, bytes: &[u8], ext: &str) -> Result<Document> {
337        let lower = ext.trim_start_matches('.').to_ascii_lowercase();
338        let extractor = self.find(&lower).ok_or_else(|| {
339            Error::UnsupportedFormat(format!("no extractor registered for .{lower}"))
340        })?;
341        extractor.extract_bytes(bytes, &lower)
342    }
343
344    fn find(&self, ext: &str) -> Option<&dyn Extractor> {
345        self.extractors
346            .iter()
347            .find(|e| e.extensions().contains(&ext))
348            .map(std::convert::AsRef::as_ref)
349    }
350}
351
352impl Default for Engine {
353    fn default() -> Self {
354        Self::with_defaults()
355    }
356}
357
/// Returns the ASCII-lowercased extension of `path`, without the dot.
///
/// Yields `None` when the path has no extension or when the extension
/// is not valid UTF-8.
fn extension_of(path: &Path) -> Option<String> {
    let raw = path.extension()?;
    let text = raw.to_str()?;
    Some(text.to_ascii_lowercase())
}
363
364// ---------------------------------------------------------------------------
365// Tests
366// ---------------------------------------------------------------------------
367
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// A minimal extractor used in unit tests: returns the raw file
    /// content as the markdown body. Stand-in for real backends until
    /// they land per the roadmap.
    struct EchoExtractor {
        exts: &'static [&'static str],
    }

    impl Extractor for EchoExtractor {
        fn extensions(&self) -> &[&'static str] {
            self.exts
        }
        fn extract(&self, path: &Path) -> Result<Document> {
            Ok(Document::new(std::fs::read_to_string(path)?))
        }
        fn extract_bytes(&self, bytes: &[u8], _ext: &str) -> Result<Document> {
            Ok(Document::new(String::from_utf8_lossy(bytes).into_owned()))
        }
    }

    /// An extractor that ignores its input and always answers with
    /// `body`. Lets a test tell apart which of several registered
    /// extractors the engine actually dispatched to.
    struct FixedExtractor {
        exts: &'static [&'static str],
        body: &'static str,
    }

    impl Extractor for FixedExtractor {
        fn extensions(&self) -> &[&'static str] {
            self.exts
        }
        fn extract(&self, _path: &Path) -> Result<Document> {
            Ok(Document::new(self.body))
        }
        fn extract_bytes(&self, _bytes: &[u8], _ext: &str) -> Result<Document> {
            Ok(Document::new(self.body))
        }
    }

    #[test]
    fn empty_engine_rejects_all_files() {
        let engine = Engine::new();
        assert!(engine.is_empty());

        // Give the file a real extension so the failure comes from the
        // "no extractor registered" path; the missing-extension path
        // is covered by `missing_extension_is_a_clean_error`.
        let f = tempfile::Builder::new().suffix(".txt").tempfile().unwrap();
        let err = engine.extract(f.path()).unwrap_err();
        assert!(matches!(err, Error::UnsupportedFormat(_)));
    }

    #[test]
    fn dispatches_by_extension() {
        let mut engine = Engine::new();
        engine.register(Box::new(EchoExtractor { exts: &["txt"] }));

        let mut f = tempfile::Builder::new().suffix(".txt").tempfile().unwrap();
        write!(f, "hello world").unwrap();
        f.flush().unwrap();

        let doc = engine.extract(f.path()).unwrap();
        assert_eq!(doc.markdown, "hello world");
    }

    #[test]
    fn extension_match_is_case_insensitive() {
        let mut engine = Engine::new();
        engine.register(Box::new(EchoExtractor { exts: &["pdf"] }));

        let mut f = tempfile::Builder::new().suffix(".PDF").tempfile().unwrap();
        write!(f, "fake pdf").unwrap();
        f.flush().unwrap();

        // Engine should normalize the extension to lowercase before
        // looking up the extractor — `EchoExtractor` registered as "pdf"
        // must still match a file ending ".PDF".
        let doc = engine.extract(f.path()).unwrap();
        assert_eq!(doc.markdown, "fake pdf");
    }

    #[test]
    fn first_registered_extractor_wins() {
        let mut engine = Engine::new();
        engine.register(Box::new(FixedExtractor {
            exts: &["md"],
            body: "first",
        }));
        // A second extractor for the same extension is accepted but
        // must never be chosen — the dispatcher picks the first match.
        engine.register(Box::new(FixedExtractor {
            exts: &["md"],
            body: "second",
        }));
        assert_eq!(engine.len(), 2);

        // Both entry points must route to the first registration.
        // `FixedExtractor` never touches the filesystem, so the path
        // does not need to exist.
        let from_bytes = engine.extract_bytes(b"ignored", "md").unwrap();
        assert_eq!(from_bytes.markdown, "first");
        let from_path = engine.extract(Path::new("notes.md")).unwrap();
        assert_eq!(from_path.markdown, "first");
    }

    #[test]
    fn extract_bytes_uses_explicit_extension() {
        let mut engine = Engine::new();
        engine.register(Box::new(EchoExtractor { exts: &["html"] }));

        let doc = engine.extract_bytes(b"<p>hi</p>", "html").unwrap();
        assert_eq!(doc.markdown, "<p>hi</p>");

        // Leading dot is tolerated.
        let doc2 = engine.extract_bytes(b"<p>hi</p>", ".html").unwrap();
        assert_eq!(doc2.markdown, "<p>hi</p>");
    }

    #[test]
    fn missing_extension_is_a_clean_error() {
        let engine = Engine::with_defaults();
        let f = tempfile::Builder::new().tempfile().unwrap();
        let err = engine.extract(f.path()).unwrap_err();
        assert!(matches!(err, Error::UnsupportedFormat(_)));
    }

    #[test]
    fn document_helpers_work() {
        let mut doc = Document::new("hello");
        assert_eq!(doc.len(), 5);
        assert!(!doc.is_empty());
        doc.markdown.clear();
        assert!(doc.is_empty());
        assert_eq!(doc.len(), 0);

        // `len` counts characters, not bytes: "é" is two bytes in
        // UTF-8 but a single character.
        let accented = Document::new("héllo");
        assert_eq!(accented.markdown.len(), 6);
        assert_eq!(accented.len(), 5);
    }
}