# mdkit 0.7.4
#
# Get markdown out of any document — Pandoc + pdfium + platform-native OCR, dispatched per format.
# Documentation: https://docs.rs/mdkit
[package]
name = "mdkit"
version = "0.7.4"
authors = ["mdkit contributors"]
categories = ["text-processing", "parser-implementations", "filesystem"]
documentation = "https://docs.rs/mdkit"
edition = "2021"
# Paths left out of the packaged crate: CI configuration and the large
# test fixtures.
exclude = ["/.github", "/tests/fixtures/large/*"]
homepage = "https://github.com/seryai/mdkit"
keywords = ["markdown", "pdf", "docx", "pandoc", "ocr"]
license = "MIT OR Apache-2.0"
readme = "README.md"
repository = "https://github.com/seryai/mdkit"
rust-version = "1.88"
description = "Get markdown out of any document — Pandoc + pdfium + platform-native OCR, dispatched per format."

[features]
# Enabled by default: the backends that run in-process as Rust crates.
# None of them needs a sidecar binary to bundle or platform-specific
# FFI (note that `pdf` still expects libpdfium at runtime). Adds ~7 MB
# to a release build.
default = ["pdf", "calamine", "csv", "html", "ipynb"]

# In-process Rust backends.
#
# PDF text via Google Pdfium (libpdfium required at runtime); `tempfile`
# spools pages for the OCR fallback.
pdf = ["dep:pdfium-render", "dep:tempfile"]
# XLSX / XLS / XLSB / ODS spreadsheets.
calamine = ["dep:calamine"]
# CSV / TSV.
csv = ["dep:csv"]
# HTML / HTM (lighter alternative to Pandoc for HTML).
html = ["dep:html2md"]
# Jupyter notebooks (.ipynb) — pure-Rust JSON parse, no external deps.
ipynb = ["dep:serde_json"]

# Sidecar backend: shells out to the `pandoc` binary on PATH, or to a
# caller-provided absolute path given via PandocExtractor::with_binary.
# The feature enables no Rust dependency; the binary itself is the
# dependency.
pandoc = []

# OCR backends.
#
# `ocr-platform` is preferred at runtime on macOS / Windows, where it
# uses the OS-native OCR engine (Vision / Windows.Media.Ocr).
# `ocr-onnx` is the cross-platform fallback (Linux and everywhere else),
# backed by the `oar-ocr` crate, which wraps PaddleOCR ONNX models via
# `ort`. The two can be enabled together; the caller either picks which
# extractor to register or calls `Engine::with_defaults` to get the
# platform-native one when it is available.
ocr-platform = [
    "dep:objc2",
    "dep:objc2-foundation",
    "dep:objc2-app-kit",
    "dep:objc2-core-graphics",
    "dep:objc2-vision",
    "dep:windows",
    "dep:windows-future",
]
ocr-onnx = ["dep:oar-ocr", "dep:image"]
# Opt-in: lets oar-ocr fetch the ONNX Runtime native library at build
# time / first use. Without it, the consumer must ship libonnxruntime
# next to their binary or install the system package — the same kind of
# runtime requirement that `pdf` has for libpdfium.
ocr-onnx-download = ["ocr-onnx", "oar-ocr/download-binaries"]

# Convenience feature for downstream tests: turns on every backend.
full = [
    "pdf",
    "pandoc",
    "ocr-platform",
    "ocr-onnx-download",
    "calamine",
    "csv",
    "html",
    "ipynb",
]

[dependencies]
# The only unconditional dependency.
thiserror = "2"

# Optional backends, sorted by name. Each one is activated by the
# matching flag in [features]. Entries that set
# `default-features = false` enable only the features listed on that
# entry, which keeps the dependency graph small for downstream
# consumers; the remaining entries use their crates' default features.
calamine = { version = "0.34", optional = true }
csv = { version = "1", optional = true }
# NOTE(review): confirm html2md's license is compatible with this
# crate's "MIT OR Apache-2.0" — it is part of the default feature set
# and its license cannot be checked from this manifest.
html2md = { version = "0.2", optional = true }
# `image` becomes a direct dependency only through the `ocr-onnx`
# feature (image decoding before handing to oar-ocr). Default features
# are disabled and only the formats the OCR backends actually accept
# are enabled, keeping compile time + binary size down.
# NOTE(review): PNG encoding of rendered PDF pages may also rely on
# `image`, yet the `pdf` feature does not list `dep:image`; presumably
# pdfium-render's `image_latest` feature supplies it transitively —
# confirm, or add `dep:image` to `pdf`.
image = { version = "0.25", optional = true, default-features = false, features = ["png", "jpeg", "bmp", "gif", "tiff"] }
# ONNX-runtime OCR via `oar-ocr` (v0.6.0). Wraps PaddleOCR ONNX models
# through `ort`; works on Linux + macOS + Windows + WebAssembly. The
# caller supplies the three model files (det + rec + dict) — see
# `OnnxOcrExtractor::with_models`. oar-ocr's default features (which
# include `download-binaries` for ONNX runtime DLL auto-fetch) are
# disabled so that consumers opt in explicitly through the
# `ocr-onnx-download` feature.
oar-ocr = { version = "0.6", optional = true, default-features = false }
pdfium-render = { version = "0.9", optional = true, default-features = false, features = ["thread_safe", "pdfium_latest", "image_latest"] }
# Jupyter notebooks (.ipynb) are JSON and are parsed directly with
# serde_json. No other JSON-handling code in mdkit consumes it, so it
# stays behind the `ipynb` feature and other backends pay nothing.
serde_json = { version = "1", optional = true }
# Used by PdfiumExtractor's scanned-PDF → OCR fallback (v0.5.3): rendered
# page PNGs are spooled into a per-call TempDir before being handed to a
# registered OCR extractor. Only pulled in when the `pdf` feature is
# active.
tempfile = { version = "3", optional = true }

# macOS OCR through Vision.framework, switched on by the `ocr-platform`
# feature. The objc2 ecosystem crates are macOS-only, so this table is
# restricted to macOS targets and the crates are not pulled in anywhere
# else. The module itself is gated on both the feature AND target_os,
# so Linux / Windows builds do not hit a compile error.
[target.'cfg(target_os = "macos")'.dependencies]
objc2 = { version = "0.6", optional = true }
objc2-foundation = { version = "0.3", optional = true }
objc2-app-kit = { version = "0.3", optional = true, features = [
    "NSImage",
    "NSImageRep",
] }
objc2-core-graphics = { version = "0.3", optional = true }
objc2-vision = { version = "0.3", optional = true, features = [
    "alloc",
    "VNRequest",
    "VNRequestHandler",
    "VNRecognizeTextRequest",
    "VNObservation",
    "objc2-core-graphics",
] }

# Windows OCR through Windows.Media.Ocr, switched on by the
# `ocr-platform` feature. `windows` is the official Microsoft windows-rs
# binding and is Windows-only, so this table is restricted to Windows
# targets and the crates are not pulled in anywhere else. The module
# itself is gated on both the feature AND target_os, so macOS / Linux
# builds do not hit a compile error.
[target.'cfg(target_os = "windows")'.dependencies]
windows = { version = "0.62", optional = true, features = [
    "Foundation",
    "Globalization",
    "Graphics_Imaging",
    "Media_Ocr",
    "Storage",
    "Storage_Streams",
    "Win32_System_WinRT",
] }
# windows-future is the split-out crate that provides IAsyncOperation,
# AsyncStatus and the blocking .get() helper; they are not in the
# umbrella `windows` crate.
windows-future = { version = "0.3", optional = true }

[dev-dependencies]
# Listed here unconditionally so tests can use it even when the `pdf`
# feature (which enables the optional `tempfile` dependency) is off.
tempfile = "3"

[lints.rust]
# `deny` (rather than `forbid`) so backends that legitimately need FFI
# (the macOS Vision OCR module is the first such case) can opt in via
# per-module #[allow(unsafe_code)] with a clear safety comment. Core
# dispatch and trait-only backends remain unsafe-free.
unsafe_code = "deny"
# Warn when a public item has no documentation.
missing_docs = "warn"

[lints.clippy]
# The two lint groups get priority -1 so that the individual lint
# levels set further down (default priority 0) override them.
all = { level = "warn", priority = -1 }
pedantic = { level = "warn", priority = -1 }
# Selectively allow the pedantic lints that fight common idioms.
module_name_repetitions = "allow"
must_use_candidate = "allow"
missing_errors_doc = "allow"