# dataprof 0.6.2
#
# High-performance data profiler with ISO 8000/25012 quality metrics for CSV, JSON/JSONL, and Parquet files
# Documentation: https://docs.rs/dataprof
# Crate identity and crates.io metadata.
# Key order follows the Cargo convention: name, version, the rest
# alphabetically, description last.
[package]
name = "dataprof"
version = "0.6.2"
authors = ["Andrea Bozzo"]
categories = ["command-line-utilities", "data-structures"]
documentation = "https://docs.rs/dataprof"
edition = "2024"
# Paths left out of the packaged crate (assets, examples, Python tooling, CI).
exclude = [
    "assets/animations/*",
    "assets/images/*.png",
    "examples/*",
    ".venv/*",
    ".mypy_cache/*",
    "python/*",
    ".github/*",
]
homepage = "https://github.com/AndreaBozzo/dataprof"
keywords = ["data", "analysis", "cli", "data-quality", "parquet"]
license = "MIT OR Apache-2.0"
readme = "README.md"
repository = "https://github.com/AndreaBozzo/dataprof"
description = "High-performance data profiler with ISO 8000/25012 quality metrics for CSV, JSON/JSONL, and Parquet files"

# Configuration for docs.rs:
# - build the documentation with every Cargo feature enabled;
# - pass `--cfg docsrs` to rustdoc so code can test `cfg(docsrs)`.
[package.metadata.docs.rs]
all-features = true
rustdoc-args = ["--cfg", "docsrs"]

[lints.rust]
# `unsafe` code is reported as a warning rather than rejected outright.
unsafe_code = "warn"

[lints.clippy]
# Every lint in Clippy's `all` group is a hard error.
all = "deny"



[dependencies]
# Serialization: CSV parsing, serde derive, JSON support
csv = "1.4"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.149"

# CLI (optional; all five are switched on together by the `cli` feature)
clap = { version = "4.5.54", features = ["derive"], optional = true }
colored = { version = "3.0", optional = true }
env_logger = { version = "0.11", optional = true }
indicatif = { version = "0.18", optional = true }
is-terminal = { version = "0.4", optional = true }

# Error handling
anyhow = "1.0"
thiserror = "2.0"

# Configuration & logging
log = "0.4"
toml = "0.9.8"

# Pattern detection
regex = "1.12"

# Performance: parallel processing (rayon), SIMD acceleration (wide),
# system info for adaptive chunking (sysinfo)
rayon = "1.8"
sysinfo = "0.38"
wide = "1.1.1"

# Memory mapping for large files
memmap2 = "0.9"

# Batch processing support
glob = "0.3"
num_cpus = "1.16"

# Random number generation for sampling, plus statistical distributions
rand = { version = "0.9", features = ["small_rng"] }
rand_chacha = "0.9"
rand_distr = "0.5"

# Date/time values (with serde support) and v4 UUIDs
chrono = { version = "0.4", features = ["serde"] }
uuid = { version = "1.0", features = ["v4"] }

# Note: the async runtime (tokio) is an optional dependency declared further
# down in this table. It is enabled by the `python-async`, `database`,
# `datafusion` and `async-streaming` features (and by anything built on them).



# Python bindings (optional; enabled by the `python` / `python-async` features)
pyo3 = { version = "0.27", optional = true, features = ["extension-module"] }
pyo3-async-runtimes = { version = "0.27", optional = true, features = ["tokio-runtime"] }

# Async runtime, streaming, database and HTTP clients.
# All optional; the [features] table decides which ones are compiled in.
async-trait = { version = "0.1", optional = true }
bytes = { version = "1.10", optional = true }
reqwest = { version = "0.12", optional = true, default-features = false, features = ["rustls-tls", "stream"] }
# Database drivers are added per backend by the `postgres`, `mysql` and
# `sqlite` features; only the runtime/TLS and type integrations are fixed here.
sqlx = { version = "0.8.1", optional = true, default-features = false, features = [
    "runtime-tokio-rustls",
    "chrono",
    "uuid",
] }
tokio = { version = "1.49", optional = true, features = ["full"] }
tokio-util = { version = "0.7", optional = true, features = ["io-util"] }
url = { version = "2.5.8", optional = true }


# Columnar data stack. `arrow` and `parquet` are always compiled in;
# `datafusion` is optional and enabled by the feature of the same name.
# NOTE(review): arrow and parquet share one version number here, and datafusion
# depends on arrow itself; confirm all three stay compatible when bumping any.
arrow = { version = "57.3.0", features = ["ffi"] }
datafusion = { version = "52.2.0", optional = true }
futures = "0.3.31"
parquet = { version = "57.3.0", features = ["arrow"] }

# Crates used only by tests and benchmarks.
# `anyhow` and `serde_json` are also regular dependencies. `env_logger` is only
# an optional one there, so it is listed here to be available without `cli`.
[dev-dependencies]
anyhow = "1.0"
criterion = { version = "0.8", features = ["html_reports"] }
env_logger = "0.11"
predicates = "3.1"
proptest = "1.9"
serde_json = "1.0"
serial_test = "3.4.0"
tempfile = "3.24"

# Command-line binary. It is only built when the `cli` feature is enabled;
# `cli` is part of the default feature set, so a plain `cargo build` includes it.
[[bin]]
name = "dataprof-cli"
path = "src/main.rs"
required-features = ["cli"]

# Benchmark target. `harness = false` disables the built-in libtest harness,
# so the benchmark source must provide its own entry point.
# NOTE(review): presumably driven by criterion from [dev-dependencies]; confirm.
[[bench]]
name = "benchmarks"
harness = false



# Development builds: no optimization, full debug info, fast rebuilds.
[profile.dev]
opt-level = 0
debug = true
# Debug info is packed into a single file kept separate from the binary.
split-debuginfo = "packed"
incremental = true
# Many codegen units favour compile speed over generated-code quality.
codegen-units = 256


# Test builds: light optimization with full debug info and incremental,
# highly parallel compilation.
[profile.test]
opt-level = 1
debug = 2
incremental = true
codegen-units = 256

# Release builds: tuned for runtime speed and small binaries, not compile time.
[profile.release]
# Whole-program ("fat") link-time optimization.
lto = true
# A single codegen unit gives the optimizer the widest view of the crate.
codegen-units = 1
# Panics terminate the process instead of unwinding.
# NOTE(review): under `abort`, panics cannot be intercepted with `catch_unwind`;
# confirm this is acceptable for release builds of the optional Python bindings.
panic = "abort"
# Strip symbols from the final binary.
strip = true

# Named alias of the release profile: it inherits everything and overrides nothing.
[profile.release-lto]
inherits = "release"


# Custom profile for CI, selected with `--profile ci`.
# It starts from the test profile, then disables optimization and incremental
# compilation and uses a single codegen unit.
[profile.ci]
inherits = "test"
opt-level = 0
# Same value as the inherited test profile; restated explicitly.
debug = 2
incremental = false
codegen-units = 1

# Dependencies (every package that is not a workspace member) get basic
# optimization in dev builds, even though this crate's own code is built at
# opt-level 0.
[profile.dev.package."*"]
opt-level = 1

# The same override for test builds.
[profile.test.package."*"]
opt-level = 1


[features]
default = ["cli"]

# Command-line interface: pulls in clap, colored, is-terminal, indicatif, env_logger.
cli = ["dep:clap", "dep:colored", "dep:is-terminal", "dep:indicatif", "dep:env_logger"]

# Python bindings; the async variant adds the tokio bridge.
python = ["dep:pyo3"]
python-async = ["python", "dep:pyo3-async-runtimes", "dep:tokio"]

# Async streaming, remote Parquet reading and the DataFusion query engine.
async-streaming = ["dep:tokio", "dep:async-trait", "dep:bytes", "dep:tokio-util"]
parquet-async = ["parquet/async", "dep:reqwest", "async-streaming"]
datafusion = ["dep:datafusion", "dep:tokio"]

# Databases: `database` is the shared base; each backend adds sqlx plus its driver.
database = ["dep:tokio", "dep:async-trait", "dep:url"]
postgres = ["database", "dep:sqlx", "sqlx/postgres"]
mysql = ["database", "dep:sqlx", "sqlx/mysql"]
sqlite = ["database", "dep:sqlx", "sqlx/sqlite"]

# Convenience bundles
# Enables no optional dependency; pair with `--no-default-features` for the
# smallest build.
minimal = []
# PostgreSQL and MySQL backends.
production = ["postgres", "mysql"]
# Every database backend.
all-db = ["postgres", "mysql", "sqlite"]
# CLI plus every database backend.
full-cli = ["cli", "all-db"]


# Library target: built as an `rlib` only, for use from Rust.
# The `cdylib` crate type needed for Python bindings is not listed here;
# maturin adds it automatically when it builds them.
[lib]
name = "dataprof"
crate-type = ["rlib"]