# Cargo manifest for the mdkit crate.
[package]
name = "mdkit"
version = "0.7.4"
edition = "2021"
rust-version = "1.88"
authors = ["mdkit contributors"]
description = "Get markdown out of any document — Pandoc + pdfium + platform-native OCR, dispatched per format."
documentation = "https://docs.rs/mdkit"
homepage = "https://github.com/seryai/mdkit"
repository = "https://github.com/seryai/mdkit"
license = "MIT OR Apache-2.0"
readme = "README.md"
keywords = ["markdown", "pdf", "docx", "pandoc", "ocr"]
categories = ["text-processing", "parser-implementations", "filesystem"]
# Paths left out of the published package: CI configuration and the
# large test fixtures.
exclude = [
    "/.github",
    "/tests/fixtures/large/*",
]
[features]
# The default set is the in-process Rust backends — no sidecar binaries
# to bundle, no platform-specific FFI. Adds ~7 MB to a release build.
default = ["pdf", "calamine", "csv", "html", "ipynb"]

# In-process Rust backends.
pdf = ["dep:pdfium-render", "dep:tempfile"] # PDF text via Google Pdfium (libpdfium required at runtime); tempfile spools pages for OCR fallback
calamine = ["dep:calamine"] # XLSX / XLS / XLSB / ODS spreadsheets
csv = ["dep:csv"] # CSV / TSV
html = ["dep:html2md"] # HTML / HTM (lighter alternative to Pandoc for HTML)
ipynb = ["dep:serde_json"] # Jupyter notebooks (.ipynb) — pure-Rust JSON parse, no external deps

# Sidecar backend — shells out to the `pandoc` binary on PATH (or a
# caller-provided absolute path via PandocExtractor::with_binary).
# No Rust dependency needed; the binary itself is the dependency.
pandoc = []

# OCR backends. ocr-platform is preferred at runtime on macOS / Windows
# (uses the OS-native OCR engine — Vision / Windows.Media.Ocr). ocr-onnx
# is the cross-platform fallback (works on Linux + everywhere else)
# backed by the `oar-ocr` crate, which wraps PaddleOCR ONNX models via
# `ort`. Both can coexist; the caller decides which extractor to
# register, or uses `Engine::with_defaults` to get the platform-native
# one when available.
ocr-platform = ["dep:objc2", "dep:objc2-foundation", "dep:objc2-app-kit", "dep:objc2-core-graphics", "dep:objc2-vision", "dep:windows", "dep:windows-future"]
ocr-onnx = ["dep:oar-ocr", "dep:image"]

# Opt-in: let oar-ocr fetch the ONNX Runtime native library at build
# time / first use. Without this feature, the consumer is responsible
# for shipping libonnxruntime alongside their binary (or installing
# the system package). Same shape as the libpdfium runtime
# requirement for the `pdf` feature.
ocr-onnx-download = ["ocr-onnx", "oar-ocr/download-binaries"]
# Build-everything convenience for downstream tests.
# NOTE(review): the feature name to the left of `=` is missing on the next
# line, and no other line visible in this file refers to it — restore it
# from version control rather than guessing, since the name is public API.
= ["pdf", "pandoc", "ocr-platform", "ocr-onnx-download", "calamine", "csv", "html", "ipynb"]
[dependencies]
# NOTE(review): the crate name for this unconditional `"2"` requirement is
# missing, and nothing else visible in this file names it — restore it from
# version control before building.
= "2"
# Optional backends. Each is gated by the corresponding feature flag
# above. Dependencies use `default-features = false` and only enable
# the minimum surface mdkit consumes — keeps the dependency graph
# small for downstream consumers.
pdfium-render = { version = "0.9", optional = true, default-features = false, features = ["thread_safe", "pdfium_latest", "image_latest"] }
calamine = { version = "0.34", optional = true }
csv = { version = "1", optional = true }
html2md = { version = "0.2", optional = true }

# Jupyter notebooks (.ipynb) are JSON; parse directly with serde_json.
# No other JSON-handling code in mdkit consumes this — it stays gated
# behind the `ipynb` feature so backends that don't need it pay nothing.
serde_json = { version = "1", optional = true }

# Used by PdfiumExtractor's scanned-PDF → OCR fallback (v0.5.3) to
# spool rendered page PNGs into a per-call TempDir before handing them
# to a registered OCR extractor. Only pulled in when the `pdf`
# feature is active.
tempfile = { version = "3", optional = true }

# ONNX-runtime OCR via `oar-ocr` (v0.6.0). Wraps PaddleOCR ONNX models
# through `ort`; works on Linux + macOS + Windows + WebAssembly.
# Caller supplies the three model files (det + rec + dict) — see
# `OnnxOcrExtractor::with_models`. We disable oar-ocr's default
# features (which include `download-binaries` for ONNX runtime DLL
# auto-fetch) so consumers opt in explicitly via the
# `ocr-onnx-download` feature.
oar-ocr = { version = "0.6", optional = true, default-features = false }

# `image` is a direct dep when EITHER `pdf` (PNG encoding of rendered
# pages) OR `ocr-onnx` (image decoding before handing to oar-ocr) is
# enabled. We disable default features and enable only the formats
# the OCR backends actually accept, keeping compile time + binary
# size down.
# NOTE(review): in the feature lists visible in this file, `dep:image`
# appears only alongside `dep:oar-ocr`; the entry that enables
# `dep:pdfium-render` does not list it. Confirm whether the `pdf`
# feature should also enable `dep:image` or whether this comment is stale.
image = { version = "0.25", optional = true, default-features = false, features = ["png", "jpeg", "bmp", "gif", "tiff"] }
# macOS Vision.framework OCR (gated by `ocr-platform` feature). The
# objc2 ecosystem deps are macOS-only by definition; on other targets
# the feature is a no-op (the module itself is gated by both feature
# AND target_os, so no compile error on Linux / Windows).
[target.'cfg(target_os = "macos")'.dependencies]
objc2 = { version = "0.6", optional = true }
objc2-foundation = { version = "0.3", optional = true }
objc2-app-kit = { version = "0.3", optional = true, features = ["NSImage", "NSImageRep"] }
objc2-core-graphics = { version = "0.3", optional = true }
objc2-vision = { version = "0.3", optional = true, features = ["alloc", "VNRequest", "VNRequestHandler", "VNRecognizeTextRequest", "VNObservation", "objc2-core-graphics"] }
# Windows.Media.Ocr (gated by `ocr-platform` feature). The `windows`
# crate is the official Microsoft windows-rs binding; it's Windows-only
# by definition, so on other targets the feature is a no-op (the module
# itself is gated by both feature AND target_os, so no compile error
# on macOS / Linux).
[target.'cfg(target_os = "windows")'.dependencies]
windows = { version = "0.62", optional = true, features = [
    "Storage",
    "Storage_Streams",
    "Graphics_Imaging",
    "Media_Ocr",
    "Globalization",
    "Foundation",
    "Win32_System_WinRT",
] }
# IAsyncOperation + AsyncStatus + the .get() blocking helper live in
# the split-out windows-future crate, not the umbrella `windows` crate.
windows-future = { version = "0.3", optional = true }
# NOTE(review): both the table name and the crate name are missing from
# this two-line section, and nothing visible in this file identifies
# them — restore both from version control.
[]
= "3"
[lints.rust]
# `deny` (rather than `forbid`) so backends that legitimately need FFI
# (the macOS Vision OCR module is the first such case) can opt in via
# per-module #[allow(unsafe_code)] with a clear safety comment. Core
# dispatch and trait-only backends remain unsafe-free.
unsafe_code = "deny"
# NOTE(review): the lint name for this `"warn"` entry is missing, and
# nothing visible in this file names it — restore it from version control.
= "warn"
[lints.clippy]
# Lint groups run at priority -1 so the individual overrides below win.
# NOTE(review): the two group entries carry identical settings. One is
# `pedantic` (per the comment below); the name of the other is missing
# and cannot be recovered from this file — restore it from version control.
= { level = "warn", priority = -1 }
pedantic = { level = "warn", priority = -1 }
# Selectively allow the pedantic lints that fight common idioms.
# NOTE(review): the three lint names below are missing, and nothing
# visible in this file identifies them — restore them from version control.
= "allow"
= "allow"
= "allow"