# transmutation 0.3.1
#
# High-performance document conversion engine for AI/LLM embeddings - 27 formats supported
# Documentation
# Crate identity and crates.io metadata.
# Key order: name, version, remaining keys alphabetically, description last.
[package]
name = "transmutation"
version = "0.3.1"
authors = ["HiveLLM Team <team@hivellm.org>"]
categories = ["parser-implementations", "text-processing", "multimedia"]
documentation = "https://docs.rs/transmutation"
# Edition 2024 needs Rust 1.85 or newer; keep in step with `rust-version` below.
edition = "2024"
# Paths kept out of the packaged crate.
exclude = [
    "data/*",
    "docling_parse",
    "docling-parse/",
    "build_*",
    "libs/",
    "*.pdf",
    "*.mp3",
    "*.mp4",
    ".github/",
]
homepage = "https://hivellm.org/transmutation"
keywords = ["document", "conversion", "pdf", "llm", "embedding"]
license = "MIT"
readme = "README.md"
repository = "https://github.com/hivellm/transmutation"
rust-version = "1.85"
description = "High-performance document conversion engine for AI/LLM embeddings - 27 formats supported"



# WiX installer metadata, used when generating the Windows MSI.
# NOTE(review): both GUIDs below look like placeholder values rather than
# generated ones - confirm they are intentional before shipping an installer.
[package.metadata.wix]
eula = true
license = "wix/License.rtf"
path-guid = "87654321-4321-4321-4321-210987654321"
upgrade-guid = "12345678-1234-1234-1234-123456789012"



# Library target.
[lib]
path = "src/lib.rs"
name = "transmutation"



# Command-line binary; Cargo only builds it when the `cli` feature is enabled.
[[bin]]
name = "transmutation"
required-features = ["cli"]
path = "src/bin/transmutation.rs"



# Runtime dependencies, grouped by purpose and sorted by name within each group.
# Entries marked `optional = true` are only compiled when a feature pulls them in.
[dependencies]
# Async runtime
async-trait = "0.1"
futures = "0.3"
tokio = { version = "1.47", features = ["full"] }

# Error handling
anyhow = "1.0"
thiserror = "2.0"

# Serialization
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"

# File I/O and path handling
mime = "0.3"
mime_guess = "2.0"
tempfile = "3.20"
walkdir = "2.5"

# Hashing and crypto
blake3 = "1.5"
sha2 = "0.10"

# Logging and tracing
tracing = "0.1"
tracing-opentelemetry = { version = "0.30", optional = true }
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }

# File type detection
file-format = "0.26"

# NOTE: Docling is deliberately not a dependency - this crate is a pure Rust
# competitor, and the docling repository serves as a reference only.

# OCR (Tesseract)
leptess = { version = "0.14", optional = true }
tesseract = { version = "0.15", optional = true }

# Image processing
image = { version = "0.25", features = ["png", "jpeg", "gif", "bmp", "tiff", "webp"] }
imageproc = "0.25"

# ML & computer vision (for the `docling-ffi` feature)
ndarray = { version = "0.15", optional = true }
ort = { version = "2.0.0-rc.10", optional = true, features = ["download-binaries"] }

# Spatial indexing
rstar = { version = "0.12", optional = true }

# Document parsing - core formats (always enabled)
# PDF
lopdf = "0.35"
pdf-extract = "0.7"
# Optional PDF-to-image rendering; enabled by `pdf-to-image` and `docling-ffi`.
pdfium-render = { version = "0.8", optional = true }

# HTML/XML
html5ever = "0.29"
quick-xml = "0.37"
roxmltree = { version = "0.21", optional = true }
scraper = "0.21"

# Archives (ZIP is always enabled for core functionality)
flate2 = { version = "1.0", optional = true }
sevenz-rust = { version = "0.6", optional = true }
tar = { version = "0.4", optional = true }
zip = "6.0"

# Office formats (optional)
docx-rs = { version = "0.4", optional = true }
umya-spreadsheet = { version = "2.3", optional = true }

# Markdown
comrak = "0.29"
once_cell = "1.20"
pulldown-cmark = "0.13"
regex = "1.11"

# Note: audio/video rely on the external ffmpeg and whisper CLI tools, so no
# Rust crates are needed for them.

# Parallelism
num_cpus = "1.16"
rayon = "1.10"

# System directories
dirs = "5.0"

# CLI dependencies
clap = { version = "4.5", features = ["derive", "cargo", "env"], optional = true }
colored = { version = "2.2", optional = true }
console = { version = "0.15", optional = true }
indicatif = { version = "0.17", optional = true }

# Note: external integrations (cache, HTTP, metrics) were removed.
# Transmutation is a library/CLI; those concerns are handled by HiveLLM Vectorizer.



# Dependencies used only by tests, examples and benchmarks.
[dev-dependencies]
criterion = { version = "0.6", features = ["async_tokio", "html_reports"] }
mockall = "0.13"
pretty_assertions = "1.4"
proptest = "1.6"
tempfile = "3.20"
tokio-test = "0.4"



# Dependencies available to the build script.
[build-dependencies]
# Windows resource compiler, used to embed icons.
winres = "0.1"



# Feature flags. PDF, HTML, XML and basic ZIP support have no flag and are
# always compiled in.
[features]
# Only Office support is switched on by default.
default = ["office"]

# OCR support, backed by the optional `tesseract` and `leptess` crates.
tesseract = ["dep:tesseract", "dep:leptess"]

# Format support
# PDF rendering to images, one per page.
pdf-to-image = ["dep:pdfium-render"]
# Office formats (DOCX, XLSX, PPTX).
office = ["docx-rs", "umya-spreadsheet"]
# OCR on images; an alias for the `tesseract` feature.
image-ocr = ["tesseract"]
# Audio transcription (requires the external whisper CLI).
audio = []
# Video transcription (requires the external ffmpeg and whisper CLIs).
video = []
# Extended archive support (TAR, GZ, 7Z).
archives-extended = ["tar", "flate2", "sevenz-rust"]

# Advanced layout analysis: C++ FFI to docling-parse plus ONNX ML models.
docling-ffi = ["dep:ort", "dep:ndarray", "dep:rstar", "dep:pdfium-render"]

# Command-line interface; required by the `transmutation` binary.
cli = ["clap", "indicatif", "console", "colored"]

# Convenience bundle of every feature above except `docling-ffi`.
# No external integrations - Transmutation is a library/CLI.
full = [
    "pdf-to-image",
    "office",
    "image-ocr",
    "audio",
    "video",
    "archives-extended",
    "cli",
]



# Benchmarks removed - files don't exist yet

# TODO: Re-add when benchmark files are created

# [[bench]]

# name = "conversion_benchmarks"

# harness = false

#

# [[bench]]

# name = "pipeline_benchmark"

# harness = false



# Release builds: maximum optimization, with link-time optimization, a single
# codegen unit and stripped binaries.
[profile.release]
codegen-units = 1
lto = true
opt-level = 3
strip = true



# Benchmark builds: maximum optimization with link-time optimization.
[profile.bench]
lto = true
opt-level = 3



# docs.rs build settings: document every feature and pass `--cfg docsrs` to rustdoc.
[package.metadata.docs.rs]
rustdoc-args = ["--cfg", "docsrs"]
all-features = true



# =============================================================================
# LINTS (based on Qdrant standards; declared at package level via [lints])
# =============================================================================
# Extra Clippy lints reported as warnings, sorted by name.
[lints.clippy]
cast_lossless = "warn"
doc_link_with_quotes = "warn"
enum_glob_use = "warn"
explicit_into_iter_loop = "warn"
filter_map_next = "warn"
flat_map_option = "warn"
from_iter_instead_of_collect = "warn"
implicit_clone = "warn"
inconsistent_struct_constructor = "warn"
inefficient_to_string = "warn"
manual_is_variant_and = "warn"
manual_let_else = "warn"
match_wildcard_for_single_variants = "warn"
needless_continue = "warn"
needless_pass_by_ref_mut = "warn"
needless_raw_string_hashes = "warn"
ptr_as_ptr = "warn"
ref_option_ref = "warn"
uninlined_format_args = "warn"
unnecessary_wraps = "warn"
unused_self = "warn"
used_underscore_binding = "warn"

[lints.rust]
# Declare `tokio_unstable` as an expected cfg so it does not trigger
# `unexpected_cfgs` warnings.
unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tokio_unstable)"] }

[lints.rustdoc]
private_intra_doc_links = "allow"



# =============================================================================
# DEBIAN PACKAGE METADATA
# =============================================================================
[package.metadata.deb]
maintainer = "HiveLLM Team <team@hivellm.org>"
section = "text"
priority = "optional"
depends = "$auto"
license-file = ["LICENSE", "0"]
# Trailing backslashes fold the lines below into a single paragraph.
extended-description = """\
Transmutation is a high-performance document conversion engine for AI/LLM embeddings. \
Supports 27+ formats including PDF, Office docs, images, audio, and video. \
Built in Rust for maximum performance and safety.\
"""
# Each asset is listed as [source path, destination path, file mode].
assets = [
    ["target/release/transmutation", "usr/bin/", "755"],
    ["README.md", "usr/share/doc/transmutation/README", "644"],
    ["LICENSE", "usr/share/doc/transmutation/LICENSE", "644"],
]
conf-files = []
maintainer-scripts = "debian/"