[package]
name = "three-dcf-core"
version = "0.2.0"
authors = ["3DCF Contributors"]
# Target auto-discovery is switched off for every target kind, so Cargo only
# builds the targets that are declared explicitly in this manifest.
autobenches = false
autobins = false
autoexamples = false
autolib = false
autotests = false
build = "build.rs"
categories = ["encoding", "text-processing", "parser-implementations"]
documentation = "https://docs.rs/three-dcf-core"
edition = "2021"
keywords = ["llm", "dataset", "pdf", "document", "ml"]
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/3DCF-Labs/doc2dataset"
# Minimum supported Rust toolchain.
rust-version = "1.75"
description = "Document-to-dataset encoding library for LLM training data preparation. Converts PDFs, Markdown, HTML into structured formats optimized for machine learning."
[package.metadata.docs.rs]
# Settings read by docs.rs when it builds the hosted documentation:
# turn on every Cargo feature and pass `--cfg docsrs` to rustdoc.
rustdoc-args = ["--cfg", "docsrs"]
all-features = true
[features]
# Only `text` is enabled by default.
default = ["text"]
# Convenience feature that turns everything on. `text` is listed explicitly
# so that `--no-default-features --features full` still enables it instead of
# silently dropping the one feature that is normally on by default.
full = ["text", "pdfium", "ocr"]
# Each of the next two features pulls in exactly one optional dependency.
ocr = ["dep:leptess"]
pdfium = ["dep:pdfium-render"]
# Enables no dependencies and no other features.
text = []
[lib]
# Library target, declared by hand because `autolib` is disabled in [package].
path = "src/lib.rs"
name = "three_dcf_core"
# Example targets, listed by hand because `autoexamples` is disabled in
# [package].
[[example]]
path = "examples/batch_process.rs"
name = "batch_process"

[[example]]
path = "examples/custom_config.rs"
name = "custom_config"

[[example]]
path = "examples/encode_pdf.rs"
name = "encode_pdf"
# Integration-test targets, listed by hand because `autotests` is disabled in
# [package].
[[test]]
path = "tests/property_roundtrip.rs"
name = "property_roundtrip"

[[test]]
path = "tests/roundtrip.rs"
name = "roundtrip"
# Runtime dependencies, sorted alphabetically. A bare string is a version
# requirement; an inline table is used only where extra keys are needed.
[dependencies]
anyhow = "1"
base64 = "0.21"
blake3 = "1"
bytes = "1"
chrono = { version = "0.4", features = ["serde"] }
hdrhistogram = "7"
hex = "0.4"
html2text = "0.5"
# Default features are off; only the listed image formats are enabled.
image = { version = "0.25", default-features = false, features = ["png", "jpeg", "gif", "tiff", "bmp"] }
indexmap = { version = "2", features = ["serde"] }
itertools = "0.12"
# Optional: pulled in by the `ocr` feature.
leptess = { version = "0.5", optional = true }
lopdf = "0.34"
once_cell = "1"
pdf-extract = "0.10"
# Optional: pulled in by the `pdfium` feature.
pdfium-render = { version = "0.8", optional = true }
prost = "0.12"
prost-types = "0.12"
pulldown-cmark = "0.10"
rand = "0.8"
rayon = "1"
regex = "1"
rstar = "0.11"
rustc-hash = "1.1"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
sha1 = "0.10"
sha2 = "0.10"
strsim = "0.11"
sysinfo = "0.30"
tempfile = "3"
thiserror = "1"
tiktoken-rs = "0.5"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
unicode-normalization = "0.1"
unicode-segmentation = "1"
walkdir = "2"
# Default features are off; only `zstdmt` is enabled.
zstd = { version = "0.13", default-features = false, features = ["zstdmt"] }
# Dependencies that are only compiled for tests, examples and benchmarks.
[dev-dependencies]
insta = { version = "1", features = ["yaml"] }
proptest = "1"
# Dependencies available only to the build script (`build = "build.rs"` in
# [package]).
[build-dependencies]
prost-build = "0.12"
protoc-bin-vendored = "3"