# dataprof 0.4.78
#
# High-performance data profiler with ISO 8000/25012 quality metrics for CSV, JSON/JSONL, and Parquet files
# Documentation
# Package metadata: name and version first, remaining keys alphabetical,
# description last.
[package]
name = "dataprof"
version = "0.4.78"
authors = ["Andrea Bozzo"]
categories = ["command-line-utilities", "data-structures"]
edition = "2021"
keywords = ["data", "analysis", "cli", "data-quality", "parquet"]
license = "MIT"
repository = "https://github.com/AndreaBozzo/dataprof"
description = "High-performance data profiler with ISO 8000/25012 quality metrics for CSV, JSON/JSONL, and Parquet files"

[dependencies]
# --- Always-on dependencies (alphabetical). The trailing note on each line
# --- records the purpose the crate is pulled in for.
anyhow = "1.0" # error handling
chrono = { version = "0.4", features = ["serde"] } # statistical analysis and benchmarking
clap = { version = "4.5", features = ["derive"] } # CLI
colored = "3.0" # terminal output
csv = "1.3" # CSV parsing
glob = "0.3" # batch processing support
glob-match = "0.2" # batch processing support
handlebars = "5.1" # template engine for HTML generation
indicatif = "0.17" # enhanced CLI features (core functionality)
is-terminal = "0.4" # terminal output
lazy_static = "1.4" # pattern detection
log = "0.4" # enhanced CLI features (core functionality)
memmap2 = "0.9" # memory mapping for large files
num_cpus = "1.16" # batch processing support
rand = { version = "0.8", features = ["small_rng"] } # random number generation for sampling
rand_chacha = "0.3" # random number generation for sampling
rand_distr = "0.4" # statistical analysis and benchmarking
rayon = "1.8" # parallel processing
regex = "1.10" # pattern detection
serde = { version = "1.0", features = ["derive"] } # listed alongside CSV parsing
serde_json = "1.0" # JSON support
sysinfo = "0.30" # system info for adaptive chunking
thiserror = "2.0" # error handling
toml = "0.8" # enhanced CLI features (core functionality)
walkdir = "2.5" # batch processing support
wide = "0.7" # SIMD acceleration

# Shell completions: temporarily disabled for Windows compatibility.
# clap_complete = "4.4"

# Note: async support is temporarily disabled.

# --- Optional dependencies, grouped by the feature that switches them on
# --- (see the [features] table).

# `python` feature: Python bindings.
pyo3 = { version = "0.24.1", features = ["extension-module"], optional = true }
# pyo3-asyncio = { version = "0.20", features = ["tokio-runtime"], optional = true }

# `database` feature: async runtime and connector plumbing.
async-trait = { version = "0.1", optional = true }
tokio = { version = "1.0", features = ["full"], optional = true }
url = { version = "2.5", optional = true }

# `postgres` / `mysql` / `sqlite` features: SQL driver. Default features are
# off; the backend is selected through the matching `sqlx/...` feature.
sqlx = { version = "0.8.1", default-features = false, features = ["runtime-tokio-rustls", "chrono", "uuid"], optional = true }

# `arrow` / `parquet` features: Apache Arrow columnar processing.
arrow = { version = "56.1", optional = true }
parquet = { version = "56.1", features = ["arrow"], optional = true }

[dev-dependencies]
# Test and benchmark-only crates, sorted alphabetically.
assert_matches = "1.5" # additional test utilities
criterion = { version = "0.5", features = ["html_reports"] } # performance benchmarking
fake = { version = "2.9", features = ["derive"] } # additional test utilities
lazy_static = "1.4" # also declared under [dependencies] with the same version
proptest = "1.5" # property-based testing
quickcheck = "1.0" # additional test utilities
quickcheck_macros = "1.0" # additional test utilities
serde_json = "1.0" # also declared under [dependencies] with the same version
serial_test = "3.1" # additional test utilities
tempfile = "3.10"

# Memory profiling for benchmarks is disabled: pprof 0.13 has a Windows
# compatibility issue.
# pprof = { version = "0.13", features = ["criterion", "protobuf-codec"] }

# Coverage reporting is not a dependency; install the tool separately with
#   cargo install cargo-tarpaulin
# and run it with
#   cargo tarpaulin --out Html --output-dir coverage

# CLI integration testing is temporarily disabled due to Windows registry issues.
# assert_cmd = "1.0"
# predicates = "2.1"

# Command-line executable; its entry point is src/main.rs.
[[bin]]
name = "dataprof-cli"
path = "src/main.rs"

# Benchmark targets. Each one turns off the built-in libtest harness
# (`harness = false`), so the benchmark source must supply its own `main`
# (presumably via Criterion, which is listed in [dev-dependencies] - confirm).
[[bench]]
name = "unified_benchmarks"
harness = false

[[bench]]
name = "domain_benchmarks"
harness = false

[[bench]]
name = "statistical_benchmark"
harness = false


# Debug profile for fast development builds.
[profile.dev]
# No optimization, for faster compilation.
opt-level = 0
# Keep debug symbols.
debug = true
# Original note: "on Windows".
split-debuginfo = "packed"
# Incremental builds.
incremental = true
# Maximum parallelism for dev builds.
codegen-units = 256

# Profile for fast tests: light optimization, full debug info, and the same
# incremental / codegen-unit settings as the dev profile.
[profile.test]
codegen-units = 256
debug = 2
incremental = true
opt-level = 1

# Optimized release profile.
[profile.release]
opt-level = 3
# Thin LTO: faster to build than `lto = true`.
lto = "thin"
codegen-units = 1
# Abort on panic to reduce binary size.
# NOTE(review): the [lib] target also builds a cdylib for Python bindings; with
# "abort" a Rust panic terminates the whole host process - confirm this is intended.
panic = "abort"

# Bench profile: takes every setting from the release profile.
# No `panic` override is given on purpose. Cargo ignores the `panic` setting
# for the `bench` and `test` profiles (benchmarks and tests are always built
# with unwinding) and prints "`panic` setting is ignored for `bench` profile"
# when the key is present, so release's `panic = "abort"` does not apply here.
[profile.bench]
inherits = "release"


# Per-dependency overrides: the "*" selector sets opt-level 1 for every
# dependency package, so third-party code is lightly optimized in both dev
# and test builds.
[profile.dev.package."*"]
opt-level = 1

[profile.test.package."*"]
opt-level = 1

# Feature flags
[features]
# Nothing is enabled by default: minimal builds compile MUCH faster.
default = []
# Just CSV processing - fastest builds.
minimal = []

# Individual features, each enabling the optional dependencies it names.
python = ["dep:pyo3"]
# python-async = ["python", "dep:pyo3-asyncio", "dep:tokio"]
arrow = ["dep:arrow"]
parquet = ["arrow", "dep:parquet"]
database = ["dep:tokio", "dep:async-trait", "dep:url"]

# Database backends: each builds on `database` and selects one sqlx driver.
postgres = ["database", "dep:sqlx", "sqlx/postgres"]
mysql = ["database", "dep:sqlx", "sqlx/mysql"]
sqlite = ["database", "dep:sqlx", "sqlx/sqlite"]

# Sensible feature combinations.
# Most common production databases.
production = ["postgres", "mysql"]
# All databases.
all-db = ["postgres", "mysql", "sqlite"]


# Library target. Two artifact kinds are produced:
#   rlib   - for internal use
#   cdylib - for the Python bindings
[lib]
name = "dataprof"
crate-type = [
    "rlib",
    "cdylib",
]