mdkit 0.5.5

Get markdown out of any document — Pandoc + pdfium + platform-native OCR, dispatched per format.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
//! # mdkit — get markdown out of any document.
//!
//! See the [README](https://github.com/mdkit-project/mdkit) for the full
//! design rationale; the short version is: dispatch by file extension to
//! the best backend per format. Pandoc for DOCX/PPTX/EPUB/RTF/ODT/LaTeX,
//! Pdfium for PDF, OS-native APIs for OCR, `calamine` for spreadsheets.
//!
//! ## Quick start
//!
//! ```no_run
//! use mdkit::Engine;
//! use std::path::Path;
//!
//! let engine = Engine::with_defaults();
//! let doc = engine.extract(Path::new("report.pdf"))?;
//! println!("{}", doc.markdown);
//! # Ok::<(), mdkit::Error>(())
//! ```
//!
//! ## Custom extractor
//!
//! Implement [`Extractor`] for your own format and register it on an
//! [`Engine`]:
//!
//! ```
//! use mdkit::{Document, Engine, Extractor, Result};
//! use std::path::Path;
//!
//! struct MyParser;
//!
//! impl Extractor for MyParser {
//!     fn extensions(&self) -> &[&'static str] { &["custom"] }
//!     fn extract(&self, path: &Path) -> Result<Document> {
//!         Ok(Document::new(std::fs::read_to_string(path)?))
//!     }
//! }
//!
//! let mut engine = Engine::new();
//! engine.register(Box::new(MyParser));
//! ```

#![doc(html_root_url = "https://docs.rs/mdkit")]
#![cfg_attr(docsrs, feature(doc_cfg))]

use std::collections::HashMap;
use std::path::Path;

mod error;
pub use error::{Error, Result};

#[cfg(feature = "pdf")]
pub mod pdf;

#[cfg(feature = "calamine")]
pub mod calamine;

#[cfg(feature = "csv")]
pub mod csv;

#[cfg(feature = "html")]
pub mod html;

#[cfg(feature = "pandoc")]
pub mod pandoc;

// Platform-native OCR. Each module is gated by both the
// `ocr-platform` feature AND the matching `target_os`, because the
// underlying FFI deps (objc2-vision on macOS, the `windows` crate on
// Windows) are platform-specific by definition. On Linux with
// `ocr-platform` enabled, neither module compiles — Linux users get
// no platform OCR backend; ONNX-based fallback ships in v0.6 via
// the separate `ocr-onnx` feature.
#[cfg(all(feature = "ocr-platform", target_os = "macos"))]
pub mod ocr_macos;

#[cfg(all(feature = "ocr-platform", target_os = "windows"))]
pub mod ocr_windows;

// ---------------------------------------------------------------------------
// Document — the unit of output
// ---------------------------------------------------------------------------

/// Output of a single extraction run. `markdown` is always populated
/// (possibly with an empty string); `title` and `metadata` are filled
/// in on a best-effort basis and may stay empty depending on the
/// backend.
#[derive(Debug, Clone, Default)]
pub struct Document {
    /// Markdown text produced by the backend.
    pub markdown: String,
    /// Title of the document when the backend managed to derive one
    /// (DOCX core properties, PDF metadata, HTML `<title>`, etc.);
    /// `None` when it is unknown.
    pub title: Option<String>,
    /// Backend-specific key/value metadata. Each backend documents its
    /// stable keys; callers should treat any other key as opaque.
    pub metadata: HashMap<String, String>,
}

impl Document {
    /// Builds a document from markdown text alone, leaving `title`
    /// unset and `metadata` empty.
    pub fn new(markdown: impl Into<String>) -> Self {
        Self {
            markdown: markdown.into(),
            // Everything else starts out at its default: no title,
            // no metadata entries.
            ..Self::default()
        }
    }

    /// Number of `char`s (Unicode scalar values) in the markdown —
    /// this is NOT the byte length, and computing it walks the whole
    /// string. Handy for capping logged payloads or tracking
    /// extraction throughput.
    pub fn len(&self) -> usize {
        let text: &str = &self.markdown;
        text.chars().count()
    }

    /// Whether the extracted markdown is the empty string.
    pub fn is_empty(&self) -> bool {
        let text: &str = &self.markdown;
        text.is_empty()
    }
}

// ---------------------------------------------------------------------------
// Extractor — the per-format trait
// ---------------------------------------------------------------------------

/// A backend that knows how to convert one or more file formats to
/// markdown. Implementors register themselves with an [`Engine`].
///
/// `Send + Sync` is required so engines can be shared across threads.
/// All public methods take `&self` so implementors can wrap their
/// internals in `Arc<Mutex<...>>` if they need interior state.
pub trait Extractor: Send + Sync {
    /// Lowercase file extensions this extractor handles, **without**
    /// the leading dot. For example: `&["pdf"]`, `&["docx", "doc"]`.
    fn extensions(&self) -> &[&'static str];

    /// Convert the document at `path` to markdown. Returns
    /// [`Error::Io`] for filesystem failures, [`Error::ParseError`]
    /// for backend-specific failures.
    fn extract(&self, path: &Path) -> Result<Document>;

    /// Convert from in-memory bytes. Default implementation returns
    /// [`Error::UnsupportedOperation`] — backends that can support it
    /// (PDF, HTML) should override.
    fn extract_bytes(&self, _bytes: &[u8], _ext: &str) -> Result<Document> {
        Err(Error::UnsupportedOperation(
            "this extractor does not support in-memory extraction".into(),
        ))
    }

    /// Human-readable backend name, used in error messages and audit
    /// logs (e.g. `"pandoc"`, `"pdfium-render"`, `"calamine"`).
    fn name(&self) -> &'static str {
        std::any::type_name::<Self>()
    }
}

// ---------------------------------------------------------------------------
// Engine — the dispatcher
// ---------------------------------------------------------------------------

/// Dispatches `extract` calls to the registered [`Extractor`] for the
/// file's extension. Construct with [`Engine::new`] for an empty
/// engine, or [`Engine::with_defaults`] to populate the defaults that
/// match enabled feature flags.
pub struct Engine {
    /// Registered backends, kept in registration order. Lookup scans
    /// from the front, so when several backends claim the same
    /// extension the one registered earliest is used.
    extractors: Vec<Box<dyn Extractor>>,
}

impl Engine {
    /// New engine with no extractors registered. Useful when you want
    /// full control over the backend set.
    pub fn new() -> Self {
        Self {
            extractors: Vec::new(),
        }
    }

    /// New engine with the default backends for the enabled feature
    /// flags. Backends register themselves silently — if a backend
    /// can't initialize (e.g. libpdfium isn't on the system library
    /// path for the `pdf` feature), it's skipped rather than failing
    /// the whole construction. Use [`with_defaults_diagnostic`] if
    /// you want to surface those failures to the user.
    ///
    /// [`with_defaults_diagnostic`]: Self::with_defaults_diagnostic
    pub fn with_defaults() -> Self {
        let (engine, _errors) = Self::with_defaults_diagnostic();
        engine
    }

    /// Like [`with_defaults`](Self::with_defaults) but returns the
    /// list of backend-init errors alongside the engine, so callers
    /// can log "PDF support disabled: libpdfium not found" rather
    /// than silently shipping a degraded experience.
    pub fn with_defaults_diagnostic() -> (Self, Vec<(&'static str, Error)>) {
        // `mut` is conditionally needed: when --no-default-features is
        // set and no optional backends are enabled, neither `engine`
        // nor `errors` ever gets a mutating call. The allow keeps that
        // valid configuration buildable under -D warnings.
        #[allow(unused_mut)]
        let mut engine = Self::new();
        #[allow(unused_mut)]
        let mut errors: Vec<(&'static str, Error)> = Vec::new();

        // Registration order matters: the Engine dispatcher returns
        // the FIRST registered extractor that claims a given file
        // extension. We register cheap in-process Rust backends first
        // so they win over the (heavier) Pandoc sidecar for any
        // overlapping format — most importantly HTML, which both
        // Html2mdExtractor and PandocExtractor handle. Pandoc is the
        // last registered, so it picks up DOCX/PPTX/EPUB/RTF/ODT/LaTeX
        // (which nothing else handles) and ALSO acts as the fallback
        // HTML reader if the `html` feature is disabled.

        #[cfg(feature = "pdf")]
        {
            match crate::pdf::PdfiumExtractor::new() {
                Ok(ext) => {
                    // Wire the platform OCR backend in as a fallback
                    // for scanned (image-only) PDFs. Pdfium can't
                    // extract text from those — without this hop,
                    // PdfiumExtractor returns empty markdown silently.
                    // We construct a SECOND OCR-extractor instance
                    // here (the standalone image-OCR registration is
                    // separate); both are stateless so duplication is
                    // free.
                    #[allow(unused_mut)]
                    let mut ext = ext;
                    #[cfg(all(feature = "ocr-platform", target_os = "macos"))]
                    {
                        ext = ext.with_ocr_fallback(Box::new(
                            crate::ocr_macos::VisionOcrExtractor::new(),
                        ));
                    }
                    #[cfg(all(feature = "ocr-platform", target_os = "windows"))]
                    {
                        ext = ext.with_ocr_fallback(Box::new(
                            crate::ocr_windows::WindowsOcrExtractor::new(),
                        ));
                    }
                    engine.register(Box::new(ext));
                }
                Err(e) => errors.push(("pdf", e)),
            }
        }

        #[cfg(feature = "calamine")]
        {
            engine.register(Box::new(crate::calamine::CalamineExtractor::new()));
        }

        #[cfg(feature = "csv")]
        {
            engine.register(Box::new(crate::csv::CsvExtractor::new()));
        }

        #[cfg(feature = "html")]
        {
            engine.register(Box::new(crate::html::Html2mdExtractor::new()));
        }

        #[cfg(all(feature = "ocr-platform", target_os = "macos"))]
        {
            // Vision is part of macOS — no init failure mode.
            engine.register(Box::new(crate::ocr_macos::VisionOcrExtractor::new()));
        }

        #[cfg(all(feature = "ocr-platform", target_os = "windows"))]
        {
            // Windows.Media.Ocr is part of Windows — no init failure
            // at construction time. (Per-call init may still fail if
            // the user has no OCR-capable language pack installed; we
            // surface that as a typed error from `extract`.)
            engine.register(Box::new(crate::ocr_windows::WindowsOcrExtractor::new()));
        }

        #[cfg(feature = "pandoc")]
        {
            match crate::pandoc::PandocExtractor::new() {
                Ok(ext) => {
                    engine.register(Box::new(ext));
                }
                Err(e) => errors.push(("pandoc", e)),
            }
        }

        (engine, errors)
    }

    /// Register a backend. Multiple backends can claim the same
    /// extension; the first registered wins on dispatch (so you can
    /// override defaults by registering your own extractor first).
    pub fn register(&mut self, extractor: Box<dyn Extractor>) -> &mut Self {
        self.extractors.push(extractor);
        self
    }

    /// Returns the number of registered extractors.
    pub fn len(&self) -> usize {
        self.extractors.len()
    }

    /// Returns true when no extractors are registered.
    pub fn is_empty(&self) -> bool {
        self.extractors.is_empty()
    }

    /// Extract `path` to markdown, dispatching by file extension.
    /// Returns [`Error::UnsupportedFormat`] if no registered extractor
    /// claims the extension.
    pub fn extract(&self, path: &Path) -> Result<Document> {
        let ext = extension_of(path).ok_or_else(|| {
            Error::UnsupportedFormat(format!("no file extension on {}", path.display()))
        })?;
        let extractor = self.find(&ext).ok_or_else(|| {
            Error::UnsupportedFormat(format!("no extractor registered for .{ext}"))
        })?;
        extractor.extract(path)
    }

    /// Same as [`extract`](Self::extract) but takes bytes + an explicit
    /// extension. Backends that don't implement
    /// [`Extractor::extract_bytes`] return
    /// [`Error::UnsupportedOperation`].
    pub fn extract_bytes(&self, bytes: &[u8], ext: &str) -> Result<Document> {
        let lower = ext.trim_start_matches('.').to_ascii_lowercase();
        let extractor = self.find(&lower).ok_or_else(|| {
            Error::UnsupportedFormat(format!("no extractor registered for .{lower}"))
        })?;
        extractor.extract_bytes(bytes, &lower)
    }

    fn find(&self, ext: &str) -> Option<&dyn Extractor> {
        self.extractors
            .iter()
            .find(|e| e.extensions().contains(&ext))
            .map(std::convert::AsRef::as_ref)
    }
}

impl Default for Engine {
    fn default() -> Self {
        Self::with_defaults()
    }
}

/// Returns the extension of `path`, ASCII-lowercased and without the
/// leading dot. Yields `None` when the path has no extension or when
/// the extension is not valid UTF-8.
fn extension_of(path: &Path) -> Option<String> {
    let raw = path.extension()?;
    let text = raw.to_str()?;
    Some(text.to_ascii_lowercase())
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// A minimal extractor used in unit tests: returns the raw file
    /// content as the markdown body. Stand-in for real backends until
    /// they land per the roadmap.
    struct EchoExtractor {
        exts: &'static [&'static str],
    }

    impl Extractor for EchoExtractor {
        fn extensions(&self) -> &[&'static str] {
            self.exts
        }
        fn extract(&self, path: &Path) -> Result<Document> {
            Ok(Document::new(std::fs::read_to_string(path)?))
        }
        fn extract_bytes(&self, bytes: &[u8], _ext: &str) -> Result<Document> {
            Ok(Document::new(String::from_utf8_lossy(bytes).into_owned()))
        }
    }

    /// Test backend that ignores its input entirely and answers with a
    /// fixed tag. Two of these registered for the same extension make
    /// it observable which one the dispatcher actually selected.
    struct TagExtractor {
        exts: &'static [&'static str],
        tag: &'static str,
    }

    impl Extractor for TagExtractor {
        fn extensions(&self) -> &[&'static str] {
            self.exts
        }
        fn extract(&self, _path: &Path) -> Result<Document> {
            Ok(Document::new(self.tag))
        }
        fn extract_bytes(&self, _bytes: &[u8], _ext: &str) -> Result<Document> {
            Ok(Document::new(self.tag))
        }
    }

    #[test]
    fn empty_engine_rejects_all_files() {
        let engine = Engine::new();

        // A default-named temp file is expected to carry no extension,
        // so this presumably exercises the "no file extension" path.
        let f = NamedTempFile::new().unwrap();
        let err = engine.extract(f.path()).unwrap_err();
        assert!(matches!(err, Error::UnsupportedFormat(_)));

        // A path WITH an extension must be rejected as well: this is
        // the "no extractor registered" path. Lookup fails before any
        // backend touches the filesystem, so the file need not exist.
        let err = engine.extract(Path::new("notes.txt")).unwrap_err();
        assert!(matches!(err, Error::UnsupportedFormat(_)));
    }

    #[test]
    fn dispatches_by_extension() {
        let mut engine = Engine::new();
        engine.register(Box::new(EchoExtractor { exts: &["txt"] }));

        let mut f = tempfile::Builder::new().suffix(".txt").tempfile().unwrap();
        write!(f, "hello world").unwrap();
        f.flush().unwrap();

        let doc = engine.extract(f.path()).unwrap();
        assert_eq!(doc.markdown, "hello world");
    }

    #[test]
    fn extension_match_is_case_insensitive() {
        let mut engine = Engine::new();
        engine.register(Box::new(EchoExtractor { exts: &["pdf"] }));

        let mut f = tempfile::Builder::new().suffix(".PDF").tempfile().unwrap();
        write!(f, "fake pdf").unwrap();
        f.flush().unwrap();

        // Engine should normalize the extension to lowercase before
        // looking up the extractor — `EchoExtractor` registered as "pdf"
        // must still match a file ending ".PDF".
        let doc = engine.extract(f.path()).unwrap();
        assert_eq!(doc.markdown, "fake pdf");
    }

    #[test]
    fn first_registered_extractor_wins() {
        let mut engine = Engine::new();
        engine.register(Box::new(TagExtractor {
            exts: &["md"],
            tag: "first",
        }));
        // A second extractor for the same extension should be reachable
        // only via direct calls — the dispatcher picks the first match.
        engine.register(Box::new(TagExtractor {
            exts: &["md"],
            tag: "second",
        }));
        assert_eq!(engine.len(), 2);

        // Both dispatch entry points must select the earliest
        // registration. `TagExtractor` never reads its input, so the
        // path does not have to exist.
        let from_path = engine.extract(Path::new("notes.md")).unwrap();
        assert_eq!(from_path.markdown, "first");
        let from_bytes = engine.extract_bytes(b"ignored", "md").unwrap();
        assert_eq!(from_bytes.markdown, "first");
    }

    #[test]
    fn extract_bytes_uses_explicit_extension() {
        let mut engine = Engine::new();
        engine.register(Box::new(EchoExtractor { exts: &["html"] }));

        let doc = engine.extract_bytes(b"<p>hi</p>", "html").unwrap();
        assert_eq!(doc.markdown, "<p>hi</p>");

        // Leading dot is tolerated.
        let doc2 = engine.extract_bytes(b"<p>hi</p>", ".html").unwrap();
        assert_eq!(doc2.markdown, "<p>hi</p>");

        // The explicit extension is lowercased before lookup, just
        // like extensions taken from a path.
        let doc3 = engine.extract_bytes(b"<p>hi</p>", "HTML").unwrap();
        assert_eq!(doc3.markdown, "<p>hi</p>");
    }

    #[test]
    fn missing_extension_is_a_clean_error() {
        let engine = Engine::with_defaults();
        let f = tempfile::Builder::new().tempfile().unwrap();
        let err = engine.extract(f.path()).unwrap_err();
        assert!(matches!(err, Error::UnsupportedFormat(_)));
    }

    #[test]
    fn document_helpers_work() {
        let mut doc = Document::new("hello");
        assert_eq!(doc.len(), 5);
        assert!(!doc.is_empty());
        doc.markdown.clear();
        assert!(doc.is_empty());
    }
}