mdkit/lib.rs
1//! # mdkit — get markdown out of any document.
2//!
3//! See the [README](https://github.com/mdkit-project/mdkit) for the full
4//! design rationale; the short version is: dispatch by file extension to
5//! the best backend per format. Pandoc for DOCX/PPTX/EPUB/RTF/ODT/LaTeX,
6//! Pdfium for PDF, OS-native APIs for OCR, `calamine` for spreadsheets.
7//!
8//! ## Quick start
9//!
10//! ```no_run
11//! use mdkit::Engine;
12//! use std::path::Path;
13//!
14//! let engine = Engine::with_defaults();
15//! let doc = engine.extract(Path::new("report.pdf"))?;
16//! println!("{}", doc.markdown);
17//! # Ok::<(), mdkit::Error>(())
18//! ```
19//!
20//! ## Custom extractor
21//!
22//! Implement [`Extractor`] for your own format and register it on an
23//! [`Engine`]:
24//!
25//! ```
26//! use mdkit::{Document, Engine, Extractor, Result};
27//! use std::path::Path;
28//!
29//! struct MyParser;
30//!
//! impl Extractor for MyParser {
//!     fn extensions(&self) -> &[&'static str] { &["custom"] }
//!     fn extract(&self, path: &Path) -> Result<Document> {
//!         Ok(Document::new(std::fs::read_to_string(path)?))
//!     }
//! }
37//!
38//! let mut engine = Engine::new();
39//! engine.register(Box::new(MyParser));
40//! ```
41
42#![doc(html_root_url = "https://docs.rs/mdkit")]
43#![cfg_attr(docsrs, feature(doc_cfg))]
44
45use std::collections::HashMap;
46use std::path::Path;
47
48mod error;
49pub use error::{Error, Result};
50
51#[cfg(feature = "pdf")]
52pub mod pdf;
53
54#[cfg(feature = "calamine")]
55pub mod calamine;
56
57#[cfg(feature = "csv")]
58pub mod csv;
59
60#[cfg(feature = "html")]
61pub mod html;
62
63#[cfg(feature = "pandoc")]
64pub mod pandoc;
65
66// Platform-native OCR. Each module is gated by both the
67// `ocr-platform` feature AND the matching `target_os`, because the
68// underlying FFI deps (objc2-vision on macOS, the `windows` crate on
69// Windows) are platform-specific by definition. On Linux with
70// `ocr-platform` enabled, neither module compiles — Linux users get
71// no platform OCR backend; ONNX-based fallback ships in v0.6 via
72// the separate `ocr-onnx` feature.
73#[cfg(all(feature = "ocr-platform", target_os = "macos"))]
74pub mod ocr_macos;
75
76#[cfg(all(feature = "ocr-platform", target_os = "windows"))]
77pub mod ocr_windows;
78
79// Cross-platform ONNX OCR via the `oar-ocr` crate. Unlike the
80// platform-native modules above this one isn't gated by `target_os`
81// — `oar-ocr` works on Linux, macOS, Windows, and WebAssembly.
82// `Engine::with_defaults` does NOT auto-register an ONNX extractor
83// because construction requires caller-supplied model paths; users
84// wire it in explicitly. See module docs for the registration
85// pattern.
86#[cfg(feature = "ocr-onnx")]
87pub mod ocr_onnx;
88
89// ---------------------------------------------------------------------------
90// Document — the unit of output
91// ---------------------------------------------------------------------------
92
/// The result of extracting one document.
///
/// `markdown` is always present (it may be an empty string); `title`
/// and `metadata` are best-effort and may be empty depending on the
/// backend.
///
/// Two documents compare equal when their markdown, title, and metadata
/// are all equal.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Document {
    /// The extracted markdown text.
    pub markdown: String,
    /// Document title if the backend could derive one (DOCX core
    /// properties, PDF metadata, HTML `<title>`, etc.). `None` when
    /// unknown.
    pub title: Option<String>,
    /// Backend-specific metadata. Stable keys are documented per-backend;
    /// callers should treat unknown keys as opaque.
    pub metadata: HashMap<String, String>,
}

impl Document {
    /// Builds a document that carries only markdown text: `title` is
    /// `None` and `metadata` is empty.
    #[must_use]
    pub fn new(markdown: impl Into<String>) -> Self {
        Self {
            markdown: markdown.into(),
            // Remaining fields take their `Default` values
            // (`None` title, empty metadata map).
            ..Self::default()
        }
    }

    /// Returns the number of `char`s (Unicode scalar values) in the
    /// markdown — not bytes and not grapheme clusters. Useful for
    /// capping logged payloads or tracking extraction throughput.
    ///
    /// This walks the whole string, so it is O(n) in the markdown
    /// length; cache the result if you need it repeatedly.
    #[must_use]
    pub fn len(&self) -> usize {
        self.markdown.chars().count()
    }

    /// Returns `true` if the extracted markdown is the empty string.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.markdown.is_empty()
    }
}
131
132// ---------------------------------------------------------------------------
133// Extractor — the per-format trait
134// ---------------------------------------------------------------------------
135
136/// A backend that knows how to convert one or more file formats to
137/// markdown. Implementors register themselves with an [`Engine`].
138///
139/// `Send + Sync` is required so engines can be shared across threads.
140/// All public methods take `&self` so implementors can wrap their
141/// internals in `Arc<Mutex<...>>` if they need interior state.
142pub trait Extractor: Send + Sync {
143 /// Lowercase file extensions this extractor handles, **without**
144 /// the leading dot. For example: `&["pdf"]`, `&["docx", "doc"]`.
145 fn extensions(&self) -> &[&'static str];
146
147 /// Convert the document at `path` to markdown. Returns
148 /// [`Error::Io`] for filesystem failures, [`Error::ParseError`]
149 /// for backend-specific failures.
150 fn extract(&self, path: &Path) -> Result<Document>;
151
152 /// Convert from in-memory bytes. Default implementation returns
153 /// [`Error::UnsupportedOperation`] — backends that can support it
154 /// (PDF, HTML) should override.
155 fn extract_bytes(&self, _bytes: &[u8], _ext: &str) -> Result<Document> {
156 Err(Error::UnsupportedOperation(
157 "this extractor does not support in-memory extraction".into(),
158 ))
159 }
160
161 /// Human-readable backend name, used in error messages and audit
162 /// logs (e.g. `"pandoc"`, `"pdfium-render"`, `"calamine"`).
163 fn name(&self) -> &'static str {
164 std::any::type_name::<Self>()
165 }
166}
167
168// ---------------------------------------------------------------------------
169// Engine — the dispatcher
170// ---------------------------------------------------------------------------
171
172/// Dispatches `extract` calls to the registered [`Extractor`] for the
173/// file's extension. Construct with [`Engine::new`] for an empty
174/// engine, or [`Engine::with_defaults`] to populate the defaults that
175/// match enabled feature flags.
176pub struct Engine {
177 extractors: Vec<Box<dyn Extractor>>,
178}
179
180impl Engine {
181 /// New engine with no extractors registered. Useful when you want
182 /// full control over the backend set.
183 pub fn new() -> Self {
184 Self {
185 extractors: Vec::new(),
186 }
187 }
188
189 /// New engine with the default backends for the enabled feature
190 /// flags. Backends register themselves silently — if a backend
191 /// can't initialize (e.g. libpdfium isn't on the system library
192 /// path for the `pdf` feature), it's skipped rather than failing
193 /// the whole construction. Use [`with_defaults_diagnostic`] if
194 /// you want to surface those failures to the user.
195 ///
196 /// [`with_defaults_diagnostic`]: Self::with_defaults_diagnostic
197 pub fn with_defaults() -> Self {
198 let (engine, _errors) = Self::with_defaults_diagnostic();
199 engine
200 }
201
202 /// Like [`with_defaults`](Self::with_defaults) but returns the
203 /// list of backend-init errors alongside the engine, so callers
204 /// can log "PDF support disabled: libpdfium not found" rather
205 /// than silently shipping a degraded experience.
206 pub fn with_defaults_diagnostic() -> (Self, Vec<(&'static str, Error)>) {
207 // `mut` is conditionally needed: when --no-default-features is
208 // set and no optional backends are enabled, neither `engine`
209 // nor `errors` ever gets a mutating call. The allow keeps that
210 // valid configuration buildable under -D warnings.
211 #[allow(unused_mut)]
212 let mut engine = Self::new();
213 #[allow(unused_mut)]
214 let mut errors: Vec<(&'static str, Error)> = Vec::new();
215
216 // Registration order matters: the Engine dispatcher returns
217 // the FIRST registered extractor that claims a given file
218 // extension. We register cheap in-process Rust backends first
219 // so they win over the (heavier) Pandoc sidecar for any
220 // overlapping format — most importantly HTML, which both
221 // Html2mdExtractor and PandocExtractor handle. Pandoc is the
222 // last registered, so it picks up DOCX/PPTX/EPUB/RTF/ODT/LaTeX
223 // (which nothing else handles) and ALSO acts as the fallback
224 // HTML reader if the `html` feature is disabled.
225
226 #[cfg(feature = "pdf")]
227 {
228 match crate::pdf::PdfiumExtractor::new() {
229 Ok(ext) => {
230 // Wire the platform OCR backend in as a fallback
231 // for scanned (image-only) PDFs. Pdfium can't
232 // extract text from those — without this hop,
233 // PdfiumExtractor returns empty markdown silently.
234 // We construct a SECOND OCR-extractor instance
235 // here (the standalone image-OCR registration is
236 // separate); both are stateless so duplication is
237 // free.
238 #[allow(unused_mut)]
239 let mut ext = ext;
240 #[cfg(all(feature = "ocr-platform", target_os = "macos"))]
241 {
242 ext = ext.with_ocr_fallback(Box::new(
243 crate::ocr_macos::VisionOcrExtractor::new(),
244 ));
245 }
246 #[cfg(all(feature = "ocr-platform", target_os = "windows"))]
247 {
248 ext = ext.with_ocr_fallback(Box::new(
249 crate::ocr_windows::WindowsOcrExtractor::new(),
250 ));
251 }
252 engine.register(Box::new(ext));
253 }
254 Err(e) => errors.push(("pdf", e)),
255 }
256 }
257
258 #[cfg(feature = "calamine")]
259 {
260 engine.register(Box::new(crate::calamine::CalamineExtractor::new()));
261 }
262
263 #[cfg(feature = "csv")]
264 {
265 engine.register(Box::new(crate::csv::CsvExtractor::new()));
266 }
267
268 #[cfg(feature = "html")]
269 {
270 engine.register(Box::new(crate::html::Html2mdExtractor::new()));
271 }
272
273 #[cfg(all(feature = "ocr-platform", target_os = "macos"))]
274 {
275 // Vision is part of macOS — no init failure mode.
276 engine.register(Box::new(crate::ocr_macos::VisionOcrExtractor::new()));
277 }
278
279 #[cfg(all(feature = "ocr-platform", target_os = "windows"))]
280 {
281 // Windows.Media.Ocr is part of Windows — no init failure
282 // at construction time. (Per-call init may still fail if
283 // the user has no OCR-capable language pack installed; we
284 // surface that as a typed error from `extract`.)
285 engine.register(Box::new(crate::ocr_windows::WindowsOcrExtractor::new()));
286 }
287
288 #[cfg(feature = "pandoc")]
289 {
290 match crate::pandoc::PandocExtractor::new() {
291 Ok(ext) => {
292 engine.register(Box::new(ext));
293 }
294 Err(e) => errors.push(("pandoc", e)),
295 }
296 }
297
298 (engine, errors)
299 }
300
301 /// Register a backend. Multiple backends can claim the same
302 /// extension; the first registered wins on dispatch (so you can
303 /// override defaults by registering your own extractor first).
304 pub fn register(&mut self, extractor: Box<dyn Extractor>) -> &mut Self {
305 self.extractors.push(extractor);
306 self
307 }
308
309 /// Returns the number of registered extractors.
310 pub fn len(&self) -> usize {
311 self.extractors.len()
312 }
313
314 /// Returns true when no extractors are registered.
315 pub fn is_empty(&self) -> bool {
316 self.extractors.is_empty()
317 }
318
319 /// Extract `path` to markdown, dispatching by file extension.
320 /// Returns [`Error::UnsupportedFormat`] if no registered extractor
321 /// claims the extension.
322 pub fn extract(&self, path: &Path) -> Result<Document> {
323 let ext = extension_of(path).ok_or_else(|| {
324 Error::UnsupportedFormat(format!("no file extension on {}", path.display()))
325 })?;
326 let extractor = self.find(&ext).ok_or_else(|| {
327 Error::UnsupportedFormat(format!("no extractor registered for .{ext}"))
328 })?;
329 extractor.extract(path)
330 }
331
332 /// Same as [`extract`](Self::extract) but takes bytes + an explicit
333 /// extension. Backends that don't implement
334 /// [`Extractor::extract_bytes`] return
335 /// [`Error::UnsupportedOperation`].
336 pub fn extract_bytes(&self, bytes: &[u8], ext: &str) -> Result<Document> {
337 let lower = ext.trim_start_matches('.').to_ascii_lowercase();
338 let extractor = self.find(&lower).ok_or_else(|| {
339 Error::UnsupportedFormat(format!("no extractor registered for .{lower}"))
340 })?;
341 extractor.extract_bytes(bytes, &lower)
342 }
343
344 fn find(&self, ext: &str) -> Option<&dyn Extractor> {
345 self.extractors
346 .iter()
347 .find(|e| e.extensions().contains(&ext))
348 .map(std::convert::AsRef::as_ref)
349 }
350}
351
352impl Default for Engine {
353 fn default() -> Self {
354 Self::with_defaults()
355 }
356}
357
/// Returns the ASCII-lowercased extension of `path`, without the
/// leading dot. Yields `None` when the path has no extension or the
/// extension is not valid UTF-8.
fn extension_of(path: &Path) -> Option<String> {
    let raw = path.extension()?;
    let text = raw.to_str()?;
    Some(text.to_ascii_lowercase())
}
363
364// ---------------------------------------------------------------------------
365// Tests
366// ---------------------------------------------------------------------------
367
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// A minimal extractor used in unit tests: returns the raw file
    /// content (or the raw bytes, lossily decoded) as the markdown
    /// body.
    struct EchoExtractor {
        exts: &'static [&'static str],
    }

    impl Extractor for EchoExtractor {
        fn extensions(&self) -> &[&'static str] {
            self.exts
        }
        fn extract(&self, path: &Path) -> Result<Document> {
            Ok(Document::new(std::fs::read_to_string(path)?))
        }
        fn extract_bytes(&self, bytes: &[u8], _ext: &str) -> Result<Document> {
            Ok(Document::new(String::from_utf8_lossy(bytes).into_owned()))
        }
    }

    /// An extractor that ignores its input and returns a fixed tag as
    /// the markdown body, so a test can tell WHICH registered backend
    /// the engine dispatched to.
    struct TaggedExtractor {
        exts: &'static [&'static str],
        tag: &'static str,
    }

    impl Extractor for TaggedExtractor {
        fn extensions(&self) -> &[&'static str] {
            self.exts
        }
        fn extract(&self, _path: &Path) -> Result<Document> {
            Ok(Document::new(self.tag))
        }
        fn extract_bytes(&self, _bytes: &[u8], _ext: &str) -> Result<Document> {
            Ok(Document::new(self.tag))
        }
    }

    #[test]
    fn empty_engine_rejects_all_files() {
        let engine = Engine::new();
        let f = NamedTempFile::new().unwrap();
        let err = engine.extract(f.path()).unwrap_err();
        assert!(matches!(err, Error::UnsupportedFormat(_)));
    }

    #[test]
    fn dispatches_by_extension() {
        let mut engine = Engine::new();
        engine.register(Box::new(EchoExtractor { exts: &["txt"] }));

        let mut f = tempfile::Builder::new().suffix(".txt").tempfile().unwrap();
        write!(f, "hello world").unwrap();
        f.flush().unwrap();

        let doc = engine.extract(f.path()).unwrap();
        assert_eq!(doc.markdown, "hello world");
    }

    #[test]
    fn extension_match_is_case_insensitive() {
        let mut engine = Engine::new();
        engine.register(Box::new(EchoExtractor { exts: &["pdf"] }));

        let mut f = tempfile::Builder::new().suffix(".PDF").tempfile().unwrap();
        write!(f, "fake pdf").unwrap();
        f.flush().unwrap();

        // Engine should normalize the extension to lowercase before
        // looking up the extractor — `EchoExtractor` registered as "pdf"
        // must still match a file ending ".PDF".
        let doc = engine.extract(f.path()).unwrap();
        assert_eq!(doc.markdown, "fake pdf");
    }

    #[test]
    fn first_registered_extractor_wins() {
        let mut engine = Engine::new();
        engine.register(Box::new(TaggedExtractor {
            exts: &["md"],
            tag: "first",
        }));
        // A second extractor for the same extension is accepted, but
        // the dispatcher must keep picking the first match.
        engine.register(Box::new(TaggedExtractor {
            exts: &["md"],
            tag: "second",
        }));
        assert_eq!(engine.len(), 2);

        // Actually exercise dispatch: counting registrations alone
        // would pass even if the LAST registration won.
        let doc = engine.extract_bytes(b"ignored", "md").unwrap();
        assert_eq!(doc.markdown, "first");
    }

    #[test]
    fn extract_bytes_uses_explicit_extension() {
        let mut engine = Engine::new();
        engine.register(Box::new(EchoExtractor { exts: &["html"] }));

        let doc = engine.extract_bytes(b"<p>hi</p>", "html").unwrap();
        assert_eq!(doc.markdown, "<p>hi</p>");

        // Leading dot is tolerated.
        let doc2 = engine.extract_bytes(b"<p>hi</p>", ".html").unwrap();
        assert_eq!(doc2.markdown, "<p>hi</p>");

        // The supplied extension is lowercased before lookup.
        let doc3 = engine.extract_bytes(b"<p>hi</p>", "HTML").unwrap();
        assert_eq!(doc3.markdown, "<p>hi</p>");

        // An extension nobody claims is a clean error.
        let err = engine.extract_bytes(b"<p>hi</p>", "xyz").unwrap_err();
        assert!(matches!(err, Error::UnsupportedFormat(_)));
    }

    #[test]
    fn missing_extension_is_a_clean_error() {
        let engine = Engine::with_defaults();
        let f = tempfile::Builder::new().tempfile().unwrap();
        let err = engine.extract(f.path()).unwrap_err();
        assert!(matches!(err, Error::UnsupportedFormat(_)));
    }

    #[test]
    fn document_helpers_work() {
        let mut doc = Document::new("hello");
        assert_eq!(doc.len(), 5);
        assert!(!doc.is_empty());
        doc.markdown.clear();
        assert!(doc.is_empty());
    }
}