# mdkit 0.5.3
#
# Get markdown out of any document — Pandoc + pdfium + platform-native OCR, dispatched per format.
# Documentation
[package]
name = "mdkit"
version = "0.5.3"
authors = ["mdkit contributors"]
categories = ["text-processing", "parser-implementations", "filesystem"]
documentation = "https://docs.rs/mdkit"
edition = "2021"
# Paths left out of the packaged crate.
exclude = ["/.github", "/tests/fixtures/large/*"]
homepage = "https://github.com/mdkit-project/mdkit"
keywords = ["markdown", "pdf", "docx", "pandoc", "ocr"]
license = "MIT OR Apache-2.0"
readme = "README.md"
repository = "https://github.com/mdkit-project/mdkit"
# Minimum supported Rust toolchain.
rust-version = "1.88"
description = "Get markdown out of any document — Pandoc + pdfium + platform-native OCR, dispatched per format."

[features]
# Enabled by default: the in-process Rust backends. No sidecar binaries
# to bundle and no platform-specific FFI; adds ~7 MB to a release build.
default = ["pdf", "calamine", "csv", "html"]

# In-process Rust backends.
#
# PDF text via Google Pdfium (libpdfium required at runtime); tempfile
# spools pages for the OCR fallback.
pdf = ["dep:pdfium-render", "dep:tempfile"]
# XLSX / XLS / XLSB / ODS spreadsheets.
calamine = ["dep:calamine"]
# CSV / TSV.
csv = ["dep:csv"]
# HTML / HTM (lighter alternative to Pandoc for HTML).
html = ["dep:html2md"]

# Sidecar backend: shells out to the `pandoc` binary on PATH (or a
# caller-provided absolute path via PandocExtractor::with_binary).
# The binary itself is the dependency, so no Rust crate is needed.
pandoc = []

# OCR backends. Enable either or both; when both are on, ocr-platform
# is preferred at runtime where available (macOS / Windows), falling
# back to ocr-onnx.
ocr-platform = [
    "dep:objc2",
    "dep:objc2-foundation",
    "dep:objc2-app-kit",
    "dep:objc2-core-graphics",
    "dep:objc2-vision",
    "dep:windows",
    "dep:windows-future",
]
# Placeholder; Surya/ONNX in v0.6.
ocr-onnx = []

# Build-everything convenience for downstream tests.
full = [
    "pdf",
    "pandoc",
    "ocr-platform",
    "ocr-onnx",
    "calamine",
    "csv",
    "html",
]

[dependencies]
thiserror = "2"

# Optional backends, each gated by the corresponding feature flag in
# [features]. pdfium-render is pulled in with `default-features = false`
# and only the features mdkit consumes, which keeps the dependency
# graph small for downstream consumers; the remaining crates use their
# default feature sets.
calamine = { version = "0.34", optional = true }
csv = { version = "1", optional = true }
html2md = { version = "0.2", optional = true }
pdfium-render = { version = "0.9", optional = true, default-features = false, features = ["thread_safe", "pdfium_latest", "image_latest"] }
# Used by PdfiumExtractor's scanned-PDF → OCR fallback (v0.5.3): rendered
# page PNGs are spooled into a per-call TempDir before being handed to
# a registered OCR extractor. Only pulled in when the `pdf` feature is
# active.
tempfile = { version = "3", optional = true }

# macOS Vision.framework OCR, enabled by the `ocr-platform` feature.
# The objc2 ecosystem crates are macOS-only by definition, so on other
# targets the feature is a no-op: the module itself is gated by both
# the feature AND target_os, which avoids compile errors on Linux /
# Windows.
[target.'cfg(target_os = "macos")'.dependencies]
objc2 = { version = "0.6", optional = true }
objc2-app-kit = { version = "0.3", optional = true, features = ["NSImage", "NSImageRep"] }
objc2-core-graphics = { version = "0.3", optional = true }
objc2-foundation = { version = "0.3", optional = true }
objc2-vision = { version = "0.3", optional = true, features = [
    "alloc",
    "VNRequest",
    "VNRequestHandler",
    "VNRecognizeTextRequest",
    "VNObservation",
    "objc2-core-graphics",
] }

# Windows.Media.Ocr, enabled by the `ocr-platform` feature. The
# `windows` crate is the official Microsoft windows-rs binding and is
# Windows-only by definition, so on other targets the feature is a
# no-op: the module itself is gated by both the feature AND target_os,
# which avoids compile errors on macOS / Linux.
[target.'cfg(target_os = "windows")'.dependencies]
# IAsyncOperation + AsyncStatus + the .get() blocking helper live in
# the split-out windows-future crate, not the umbrella `windows` crate.
# Kept ahead of the `windows` sub-table below so it stays a key of this
# dependencies table.
windows-future = { version = "0.3", optional = true }

[target.'cfg(target_os = "windows")'.dependencies.windows]
version = "0.62"
optional = true
features = [
    "Storage",
    "Storage_Streams",
    "Graphics_Imaging",
    "Media_Ocr",
    "Globalization",
    "Foundation",
    "Win32_System_WinRT",
]

# Test-only dependencies. `tempfile` is also declared as an optional
# regular dependency behind the `pdf` feature; listing it here makes it
# available to tests even when that feature is disabled.
[dev-dependencies]
tempfile = "3"

[lints.rust]
missing_docs = "warn"
# `deny` rather than `forbid`: backends that legitimately need FFI (the
# macOS Vision OCR module is the first such case) can opt in with a
# per-module #[allow(unsafe_code)] and a clear safety comment, while
# core dispatch and trait-only backends remain unsafe-free.
unsafe_code = "deny"

[lints.clippy]
# The lint groups carry priority -1 so that the individual lint levels
# set below take precedence over them.
all = { level = "warn", priority = -1 }
pedantic = { level = "warn", priority = -1 }
# Pedantic lints that fight common idioms are selectively allowed.
missing_errors_doc = "allow"
module_name_repetitions = "allow"
must_use_candidate = "allow"