# Crate identity and crates.io metadata.
[package]
name = "dataprof"
version = "0.6.2"
authors = ["Andrea Bozzo"]
categories = ["command-line-utilities", "data-structures"]
documentation = "https://docs.rs/dataprof"
edition = "2024"
# Paths left out of the packaged crate: media assets, examples, Python
# tooling directories and CI configuration.
exclude = [
    ".github/*",
    ".mypy_cache/*",
    ".venv/*",
    "assets/animations/*",
    "assets/images/*.png",
    "examples/*",
    "python/*",
]
homepage = "https://github.com/AndreaBozzo/dataprof"
keywords = ["data", "analysis", "cli", "data-quality", "parquet"]
license = "MIT OR Apache-2.0"
readme = "README.md"
repository = "https://github.com/AndreaBozzo/dataprof"
description = "High-performance data profiler with ISO 8000/25012 quality metrics for CSV, JSON/JSONL, and Parquet files"

# docs.rs build settings: document every feature and pass `--cfg docsrs`
# to rustdoc.
[package.metadata.docs.rs]
all-features = true
rustdoc-args = [
    "--cfg",
    "docsrs",
]

# Crate-wide lint levels that Cargo passes to rustc and Clippy.
[lints.rust]
# Report every use of `unsafe` as a warning.
unsafe_code = "warn"

[lints.clippy]
# `all` is a lint *group*. It is given a lower priority than the default (0)
# so that any individual Clippy lint configured in this table later takes
# precedence over the group instead of conflicting with it.
all = { level = "deny", priority = -1 }

[dependencies]
# Always compiled, regardless of the selected features.
anyhow = "1.0"
arrow = { version = "57.3.0", features = ["ffi"] }
chrono = { version = "0.4", features = ["serde"] }
csv = "1.4"
futures = "0.3.31"
glob = "0.3"
log = "0.4"
memmap2 = "0.9"
num_cpus = "1.16"
parquet = { version = "57.3.0", features = ["arrow"] }
rand = { version = "0.9", features = ["small_rng"] }
rand_chacha = "0.9"
rand_distr = "0.5"
rayon = "1.8"
regex = "1.12"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.149"
sysinfo = "0.38"
thiserror = "2.0"
toml = "0.9.8"
uuid = { version = "1.0", features = ["v4"] }
wide = "1.1.1"

# Optional: each of these is only pulled in through a `dep:` entry of a
# feature declared in the [features] table.
async-trait = { version = "0.1", optional = true }
bytes = { version = "1.10", optional = true }
clap = { version = "4.5.54", features = ["derive"], optional = true }
colored = { version = "3.0", optional = true }
datafusion = { version = "52.2.0", optional = true }
env_logger = { version = "0.11", optional = true }
indicatif = { version = "0.18", optional = true }
is-terminal = { version = "0.4", optional = true }
pyo3 = { version = "0.27", features = ["extension-module"], optional = true }
pyo3-async-runtimes = { version = "0.27", features = ["tokio-runtime"], optional = true }
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "stream"], optional = true }
sqlx = { version = "0.8.1", default-features = false, features = ["runtime-tokio-rustls", "chrono", "uuid"], optional = true }
tokio = { version = "1.49", features = ["full"], optional = true }
tokio-util = { version = "0.7", features = ["io-util"], optional = true }
url = { version = "2.5.8", optional = true }

# Crates used only by tests, benches and examples.
# NOTE(review): `anyhow` and `serde_json` are also non-optional entries in
# [dependencies]; `env_logger` is optional there, so it is repeated here to
# make it available unconditionally to dev targets.
[dev-dependencies]
anyhow = "1.0"
criterion = { version = "0.8", features = ["html_reports"] }
env_logger = "0.11"
predicates = "3.1"
proptest = "1.9"
serde_json = "1.0"
serial_test = "3.4.0"
tempfile = "3.24"

# Command-line executable. Cargo skips this target unless the `cli` feature
# is enabled.
[[bin]]
name = "dataprof-cli"
required-features = ["cli"]
path = "src/main.rs"

# Benchmark target. The built-in libtest harness is switched off so the
# benchmark supplies its own `main` (presumably via the `criterion`
# dev-dependency; confirm against the bench source).
[[bench]]
harness = false
name = "benchmarks"

# Development builds: unoptimized, with debug info, incremental compilation
# and many codegen units.
[profile.dev]
codegen-units = 256
debug = true
incremental = true
opt-level = 0
split-debuginfo = "packed"

# Test builds: light optimization (level 1) with full debug info.
[profile.test]
codegen-units = 256
debug = 2
incremental = true
opt-level = 1

# Release builds: LTO with a single codegen unit, stripped output, and
# panics that abort instead of unwinding.
[profile.release]
codegen-units = 1
lto = true
panic = "abort"
strip = true

# Named alias of `release`; it overrides no settings of its own.
[profile.release-lto]
inherits = "release"

# Custom profile derived from `test` (presumably for CI runs): no
# optimization, no incremental compilation, one codegen unit.
[profile.ci]
inherits = "test"
codegen-units = 1
debug = 2
incremental = false
opt-level = 0

# Per-package overrides: `"*"` matches every dependency (non-workspace
# package), which is compiled at opt-level 1 in both dev and test builds.
[profile.dev.package."*"]
opt-level = 1

[profile.test.package."*"]
opt-level = 1

[features]
default = ["cli"]

# Enables nothing by itself.
minimal = []

# Command-line front end; required by the `dataprof-cli` binary target.
cli = [
    "dep:clap",
    "dep:colored",
    "dep:env_logger",
    "dep:indicatif",
    "dep:is-terminal",
]

# Python bindings.
python = ["dep:pyo3"]
python-async = ["python", "dep:pyo3-async-runtimes", "dep:tokio"]

# Async I/O and query-engine integrations.
async-streaming = [
    "dep:async-trait",
    "dep:bytes",
    "dep:tokio",
    "dep:tokio-util",
]
datafusion = ["dep:datafusion", "dep:tokio"]
parquet-async = ["async-streaming", "dep:reqwest", "parquet/async"]

# Database support: `database` is the shared base, and each backend adds
# `sqlx` with the matching driver feature.
database = ["dep:async-trait", "dep:tokio", "dep:url"]
mysql = ["database", "dep:sqlx", "sqlx/mysql"]
postgres = ["database", "dep:sqlx", "sqlx/postgres"]
sqlite = ["database", "dep:sqlx", "sqlx/sqlite"]

# Convenience bundles.
all-db = ["mysql", "postgres", "sqlite"]
full-cli = ["all-db", "cli"]
production = ["mysql", "postgres"]

# Library target, built as a regular Rust library (`rlib`) only.
[lib]
crate-type = ["rlib"]
name = "dataprof"