# transmutation 0.2.0
#
# High-performance document conversion engine for AI/LLM embeddings - 27 formats supported
# Documentation
# Crate identity and publishing metadata.
# Key order: name, version, remaining keys alphabetically, description last.
[package]
name = "transmutation"
version = "0.2.0"
authors = ["HiveLLM Team <team@hivellm.org>"]
categories = ["parser-implementations", "text-processing", "multimedia"]
documentation = "https://docs.rs/transmutation"
# Edition 2024 requires Rust 1.85 or newer, matching `rust-version` below.
edition = "2024"
# Paths left out of the packaged crate.
exclude = [
    "data/*",
    "docling_parse",
    "docling-parse/",
    "build_*",
    "libs/",
    "*.pdf",
    "*.mp3",
    "*.mp4",
    ".github/",
]
homepage = "https://hivellm.org/transmutation"
keywords = ["document", "conversion", "pdf", "llm", "embedding"]
license = "MIT"
readme = "README.md"
repository = "https://github.com/hivellm/transmutation"
rust-version = "1.85"
description = "High-performance document conversion engine for AI/LLM embeddings - 27 formats supported"

# WiX Installer metadata for Windows MSI generation
[package.metadata.wix]
# NOTE(review): both GUIDs below are sequential-digit patterns and look like
# placeholders rather than generated values. Confirm before shipping an MSI;
# if no installer has been released with them yet, replace them with freshly
# generated GUIDs. Do not change them after a release without checking the
# impact on upgrades of existing installations.
upgrade-guid = "12345678-1234-1234-1234-123456789012"
path-guid = "87654321-4321-4321-4321-210987654321"
# License text shown by the installer.
license = "wix/License.rtf"
eula = true

# Library target. Both values are spelled out explicitly; they coincide with
# what Cargo would infer for this package (crate name and src/lib.rs).
[lib]
name = "transmutation"
path = "src/lib.rs"

# Command-line binary. It is only built when the `cli` feature is enabled;
# `cli` is not part of `default`, so a plain `cargo build` skips this target.
[[bin]]
name = "transmutation"
path = "src/bin/transmutation.rs"
required-features = ["cli"]

# Runtime dependencies. Entries marked `optional = true` are only compiled
# when a feature that references them is enabled (see [features]).
[dependencies]
# Async runtime
tokio = { version = "1.47", features = ["full"] }
async-trait = "0.1"
futures = "0.3"

# Error handling
thiserror = "2.0"
anyhow = "1.0"

# Serialization
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"

# File I/O and path handling
walkdir = "2.5"
tempfile = "3.20"
mime = "0.3"
mime_guess = "2.0"

# Hashing and crypto
sha2 = "0.10"
blake3 = "1.5"

# Logging and tracing
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
# Optional, but not referenced by any feature in the [features] table; it is
# only reachable through its implicit same-named feature.
tracing-opentelemetry = { version = "0.30", optional = true }

# File type detection
file-format = "0.26"

# NOTE: We are NOT integrating with Docling - we are building a pure Rust competitor
# The docling repository is for reference only, not as a dependency

# OCR (Tesseract) - enabled by the `tesseract` feature
tesseract = { version = "0.15", optional = true }
leptess = { version = "0.14", optional = true }

# Image processing
image = { version = "0.25", features = ["png", "jpeg", "gif", "bmp", "tiff", "webp"] }
imageproc = "0.25"

# ML & Computer Vision (for docling-ffi feature)
ort = { version = "2.0.0-rc.10", optional = true, features = ["download-binaries"] }
ndarray = { version = "0.15", optional = true }

# Spatial indexing (for docling-ffi feature)
rstar = { version = "0.12", optional = true }

# Document parsing - core formats are always enabled; entries marked
# `optional = true` below are extras gated behind features
# PDF
lopdf = "0.35"
pdf-extract = "0.7"
pdfium-render = { version = "0.8", optional = true }  # Optional: pulled in by the pdf-to-image and docling-ffi features

# HTML/XML
scraper = "0.21"
html5ever = "0.29"
quick-xml = "0.37"
# Optional, but not referenced by any feature in the [features] table; it is
# only reachable through its implicit same-named feature.
roxmltree = { version = "0.21", optional = true }

# Archives (ZIP always enabled for core functionality)
zip = "6.0"
# TAR, GZ and 7Z support is enabled by the `archives-extended` feature
tar = { version = "0.4", optional = true }
flate2 = { version = "1.0", optional = true }
sevenz-rust = { version = "0.6", optional = true }

# Office formats (optional, enabled by the `office` feature, which is on by default)
docx-rs = { version = "0.4", optional = true }
umya-spreadsheet = { version = "2.3", optional = true }

# Markdown
pulldown-cmark = "0.13"
comrak = "0.29"
regex = "1.11"
once_cell = "1.20"

# Note: Audio/Video use external ffmpeg and whisper CLI tools (no Rust crates needed)

# Parallelism
rayon = "1.10"
num_cpus = "1.16"

# System directories
dirs = "5.0"

# CLI dependencies (enabled by the `cli` feature)
clap = { version = "4.5", features = ["derive", "cargo", "env"], optional = true }
indicatif = { version = "0.17", optional = true }
console = { version = "0.15", optional = true }
colored = { version = "2.2", optional = true }

# Note: External integrations removed (cache, HTTP, metrics)
# Transmutation is a library/CLI - external features are handled by HiveLLM Vectorizer

# Test and benchmark tooling, sorted alphabetically.
[dev-dependencies]
criterion = { version = "0.6", features = ["async_tokio", "html_reports"] }
mockall = "0.13"
pretty_assertions = "1.4"
proptest = "1.6"
# Also declared under [dependencies] with the same version requirement.
tempfile = "3.20"
tokio-test = "0.4"

[build-dependencies]
# Windows resource compiler for embedding icons
# NOTE(review): declared for every target, not only Windows. Presumably the
# build script guards its use by target - confirm against build.rs.
winres = "0.1"

# Feature flags. Names listed without a `dep:` prefix that match an optional
# dependency (e.g. "tar", "clap") refer to that dependency's implicit feature.
[features]
default = ["office"]  # Core formats (PDF, HTML, XML, ZIP) are always enabled

# Core engines (pure Rust implementations)
# NOTE(review): `tesseract` and `leptess` are bindings to native OCR libraries,
# so the "pure Rust" label above does not look accurate for this entry - confirm.
tesseract = ["dep:tesseract", "dep:leptess"]

# Format support (pure Rust implementations)
# Note: PDF, HTML, XML, and basic ZIP support are ALWAYS enabled (no feature flags)
pdf-to-image = ["dep:pdfium-render"]  # PDF rendering to images per page (optional)
office = ["docx-rs", "umya-spreadsheet"]  # Office formats (DOCX, XLSX, PPTX)
image-ocr = ["tesseract"]  # Alias that turns on the `tesseract` feature above
audio = []  # Audio transcription (requires external whisper CLI)
video = []  # Video transcription (requires external ffmpeg + whisper CLI)
archives-extended = ["tar", "flate2", "sevenz-rust"]  # Extended archive support (TAR, GZ, 7Z)

# Advanced layout analysis (C++ FFI to docling-parse + ML models)
# NOTE(review): this description conflicts with the note in [dependencies]
# stating that Docling is not integrated; only ort, ndarray, rstar and
# pdfium-render are enabled here - confirm which statement is current.
docling-ffi = ["dep:ort", "dep:ndarray", "dep:rstar", "dep:pdfium-render"]  # Enable C++ docling-parse + ONNX ML models

# CLI (required by the `transmutation` binary target)
cli = ["clap", "indicatif", "console", "colored"]

# Convenience bundle (no external integrations - Transmutation is a library/CLI)
# Despite the name, `docling-ffi` is NOT part of this list; `tesseract` is
# included indirectly through `image-ocr`.
full = [
    "pdf-to-image",
    "office",
    "image-ocr",
    "audio",
    "video",
    "archives-extended",
    "cli",
]

# Benchmarks removed - files don't exist yet
# TODO: Re-add when benchmark files are created
# [[bench]]
# name = "conversion_benchmarks"
# harness = false
#
# [[bench]]
# name = "pipeline_benchmark"
# harness = false

# Release builds: maximum optimization, whole-program ("fat") LTO, a single
# codegen unit, and symbols stripped from the final binary.
[profile.release]
codegen-units = 1
lto = "fat"
opt-level = 3
strip = "symbols"

# Benchmark builds: same optimization level and LTO mode as release.
[profile.bench]
lto = "fat"
opt-level = 3

# docs.rs build configuration: document every feature and pass `--cfg docsrs`
# to rustdoc.
# NOTE(review): `all-features` also enables `tesseract` and `docling-ffi`,
# which pull in crates backed by native libraries - confirm these build in the
# docs.rs environment.
[package.metadata.docs.rs]
all-features = true
rustdoc-args = ["--cfg", "docsrs"]

# =============================================================================
# PACKAGE LINTS (Based on Qdrant standards)
# =============================================================================
# Extra Clippy lints raised to warnings, kept in alphabetical order.
[lints.clippy]
cast_lossless = "warn"
doc_link_with_quotes = "warn"
enum_glob_use = "warn"
explicit_into_iter_loop = "warn"
filter_map_next = "warn"
flat_map_option = "warn"
from_iter_instead_of_collect = "warn"
implicit_clone = "warn"
inconsistent_struct_constructor = "warn"
inefficient_to_string = "warn"
manual_is_variant_and = "warn"
manual_let_else = "warn"
match_wildcard_for_single_variants = "warn"
needless_continue = "warn"
needless_pass_by_ref_mut = "warn"
needless_raw_string_hashes = "warn"
ptr_as_ptr = "warn"
ref_option_ref = "warn"
uninlined_format_args = "warn"
unnecessary_wraps = "warn"
unused_self = "warn"
used_underscore_binding = "warn"

# Warn on unknown cfg names, while declaring `tokio_unstable` as expected.
[lints.rust.unexpected_cfgs]
level = "warn"
check-cfg = ['cfg(tokio_unstable)']

# Intra-doc links to private items are permitted.
[lints.rustdoc]
private_intra_doc_links = "allow"

# =============================================================================
# DEBIAN PACKAGE METADATA
# =============================================================================
# Settings for building a .deb package from the release binary.
[package.metadata.deb]
maintainer = "HiveLLM Team <team@hivellm.org>"
# "$auto" lets the packaging tool derive the runtime library dependencies.
depends = "$auto"
# The packaged binary is declared with `required-features = ["cli"]`, so the
# release build has to enable that feature; otherwise the
# `target/release/transmutation` asset listed below is never produced.
# Default features remain enabled.
features = ["cli"]
# License file path, followed by the number of leading lines to skip.
license-file = ["LICENSE", "0"]
section = "text"
priority = "optional"
extended-description = """\
Transmutation is a high-performance document conversion engine for AI/LLM embeddings. \
Supports 27+ formats including PDF, Office docs, images, audio, and video. \
Built in Rust for maximum performance and safety.\
"""
# Each entry is [source path, destination inside the package, file mode].
assets = [
    ["target/release/transmutation", "usr/bin/", "755"],
    ["README.md", "usr/share/doc/transmutation/README", "644"],
    ["LICENSE", "usr/share/doc/transmutation/LICENSE", "644"],
]
conf-files = []
maintainer-scripts = "debian/"