# rust-data-processing 0.3.4
#
# Schema-first ingestion (CSV, JSON, Parquet, Excel) into an in-memory DataSet, plus Polars-backed pipelines, SQL, profiling, validation, and map/reduce-style processing.
# Crate identity and publishing metadata. Keys follow the usual Cargo layout: `name`, `version`,
# the remaining keys alphabetically, and `description` last.
[package]
name = "rust-data-processing"
version = "0.3.4"
categories = ["database-implementations", "parser-implementations"]
documentation = "https://docs.rs/rust-data-processing"
edition = "2024"
# Paths listed here are left out of the packaged crate.
exclude = [
    "bindings/",
    "spikes/",
    "Planning/",
    "python-wrapper/",
    "docs/python/",
    "docs/landing/",
    # The top-level README is the monorepo landing page (Python + Rust), not the crate README.
    # The README that ships with the crate is README_CRATE.md (see `readme`).
    "README.md",
    "test_run.log",
    ".github/",
    "scripts/",
]
keywords = ["csv", "json", "parquet", "dataset", "etl"]
license = "MIT OR Apache-2.0"
readme = "README_CRATE.md"
repository = "https://github.com/scorpio-datalake/rust-data-processing"
rust-version = "1.85"
description = "Schema-first ingestion (CSV, JSON, Parquet, Excel) into an in-memory DataSet, plus Polars-backed pipelines, SQL, profiling, validation, and map/reduce-style processing."

# The docs.rs builder is bounded in both memory and time, and Polars + Arrow bring in a large
# dependency graph. Running a single rustc job at a time avoids OOM kills on the builder.
# Separately, keep `cargo doc` clean under `RUSTDOCFLAGS=-D warnings` (intra-doc links, etc.).
# References: https://docs.rs/about/metadata and https://docs.rs/about/builds
[package.metadata.docs.rs]
cargo-args = ["-j", "1"]

[dependencies]
# Kept in alphabetical order. Entries with `optional = true` are switched on through the
# `dep:<name>` items listed in `[features]`.
arrow = { version = "54", optional = true }
calamine = { version = "0.33.0", optional = true }
connectorx = { version = "0.4.5", default-features = false, features = ["dst_arrow", "fptr"], optional = true }
csv = "1.3.1"
glob = "0.3"
libssh2-sys = { version = "0.3.1", features = ["vendored-openssl"], optional = true }
object_store = { version = "0.13.1", default-features = false, features = ["aws", "gcp", "azure", "fs"], optional = true }
parquet = "57"
polars = { version = "0.53.0", features = ["lazy", "csv", "parquet", "json", "fmt", "ipc"] }
polars-sql = { version = "0.53.0", optional = true }
# Version floor for the postgres stack (RUSTSEC-2026-0178/0179/0180), which ConnectorX pulls in
# transitively; the lockfile pins the exact patch release. `tokio-postgres` is the other half.
postgres-protocol = { version = ">=0.6.12, <0.7", optional = true }
quick-xml = "0.37.5"
rayon = "1"
rdkafka = { version = "0.36.2", default-features = false, features = ["cmake-build"], optional = true }
rust_xlsxwriter = { version = "0.93", optional = true }
serde = { version = "1", features = ["derive"] }
serde_arrow = { version = "0.14.0", default-features = false, features = ["arrow-54"], optional = true }
serde_json = "1.0.139"
sha2 = "0.10"
ssh2 = { version = "0.9.4", optional = true }
suppaftp = { version = "6.3.0", default-features = false, features = ["rustls"], optional = true }
thiserror = "2.0.11"
tokio = { version = "1", features = ["rt"], optional = true }
# Same RUSTSEC-2026-0178/0179/0180 version floor as `postgres-protocol` (ConnectorX transitive).
tokio-postgres = { version = ">=0.7.18, <0.8", default-features = false, features = ["runtime"], optional = true }
ureq = { version = "2.12", default-features = false, optional = true }
url = { version = "2", optional = true }
walkdir = "2"
webpki-roots = { version = "0.26", optional = true }

[features]
# The default dependency surface stays small; every connector is opt-in.
default = ["sql"]
# Feature list below is alphabetical.
arrow = ["dep:arrow"]
# Mirrors the "expanded" job in `rust_ci.yml`: deep tests + Arrow/serde_arrow + Excel writer.
# `db_connectorx` is intentionally **not** part of it (ConnectorX → OpenSSL; needs Perl or
# system SSL on Windows).
ci_expanded = ["deep_tests", "excel_test_writer", "arrow", "serde_arrow"]
# Umbrella over the object-store based connectors plus file transfer.
cloud_connectors = ["object_store", "delta_lake", "snowflake", "file_transfer"]
# ConnectorX database sources, together with the postgres crates listed as optional dependencies.
db_connectorx = [
    "arrow",
    "dep:connectorx",
    "dep:postgres-protocol",
    "dep:tokio-postgres",
    "connectorx/src_postgres",
    "connectorx/src_mysql",
    "connectorx/src_mssql",
    "connectorx/src_oracle",
]
# Criterion benches + large fixture tests (omit from default `clippy --all-targets` to save
# linker RAM).
deep_tests = []
# Enables nothing beyond `object_store` at the manifest level.
delta_lake = ["object_store"]
# Excel ingestion (adds `calamine`).
excel = ["dep:calamine"]
# Excel integration tests / benchmarks that generate an `.xlsx` at runtime (adds
# `rust_xlsxwriter`).
excel_test_writer = ["excel", "dep:rust_xlsxwriter"]
file_transfer = [
    "dep:libssh2-sys",
    "dep:ssh2",
    "dep:suppaftp",
    "dep:url",
    "dep:webpki-roots",
]
# Every batch connector in docs/CONNECTORS.md for integration builds (parity with rdp_jvm_sys
# `full`). JVM-only DB sinks (PostgreSQL COPY, Oracle OCI) live in bindings/jvm-sys, not here.
integration_full = ["db_connectorx", "cloud_connectors", "excel"]
# Kafka streaming via librdkafka (system lib on Linux CI; see
# docs/adr/007-kafka-rdkafka-connector-abi.md).
kafka = ["dep:rdkafka"]
object_store = ["dep:object_store", "dep:tokio", "dep:url", "dep:ureq"]
serde_arrow = ["arrow", "dep:serde_arrow"]
# Enables nothing beyond `object_store` at the manifest level.
snowflake = ["object_store"]
sql = ["dep:polars-sql"]

[dev-dependencies]
# Benchmark framework for the `[[bench]]` targets in this manifest, which set `harness = false`.
criterion = "0.5"

# Benchmark targets. Each one sets `harness = false` (the bench provides its own entry point
# instead of the built-in libtest harness) and lists `deep_tests` under `required-features`, so
# Cargo only builds it when that feature is enabled.
[[bench]]
name = "pipelines"
harness = false
required-features = ["deep_tests"]

[[bench]]
name = "ingestion"
harness = false
required-features = ["deep_tests"]

[[bench]]
name = "map_reduce"
harness = false
required-features = ["deep_tests"]

[[bench]]
name = "profiling"
harness = false
required-features = ["deep_tests"]

[[bench]]
name = "validation"
harness = false
required-features = ["deep_tests"]

[[bench]]
name = "outliers"
harness = false
required-features = ["deep_tests"]

# Feature-gated targets, grouped by kind (tests, then examples, then binaries). Cargo skips a
# target unless every feature in its `required-features` list is enabled.
[[test]]
name = "file_transfer_ftp_integration"
required-features = ["cloud_connectors"]

[[test]]
name = "kafka_elt_load"
required-features = ["kafka"]

[[example]]
name = "file_transfer_ingest"
required-features = ["cloud_connectors"]

[[example]]
name = "kafka_elt_byo_load"
required-features = ["kafka"]

[[example]]
name = "kafka_elt_stream"
required-features = ["kafka"]

# Helper binary; only built together with the Excel writer feature.
[[bin]]
name = "generate_people_xlsx_fixture"
path = "src/bin/generate_people_xlsx_fixture.rs"
required-features = ["excel_test_writer"]

# Dependencies are still compiled under `dev`. Full debug info (`debug = 2`) makes the `.rlib`s
# enormous and routinely exhausts the linker during `--lib` / integration test links on 8–16 GB
# machines (Windows LNK1102 / Linux rust-lld SIGBUS). `debug = 1` keeps line tables while cutting
# link pressure. On Linux, `.cargo/config.toml` additionally selects bfd ld.
[profile.dev]
debug = 1

# Integration tests link the Polars + Arrow graphs; no debug info keeps linker memory down.
[profile.test]
strip = "debuginfo"
debug = 0

# Criterion benches link the same Polars/Arrow graph, so debug info is kept off the final link
# here as well.
[profile.bench]
strip = "debuginfo"
debug = 0

# Used to prebuild the integration connectors (`cargo test --no-run`); links faster than a full
# release build.
# NOTE(review): `debug = 1` is combined with `strip = "debuginfo"`, so the debug info that gets
# generated is presumably stripped from the final artifact again — confirm that is intended.
[profile.integration]
inherits = "release"
codegen-units = 16
debug = 1
lto = false
opt-level = 1
strip = "debuginfo"

[workspace]
# Feature resolver version, set explicitly for the whole workspace.
# NOTE(review): with `edition = "2024"` on the root package Cargo would presumably default to
# resolver "3"; confirm that pinning "2" here is deliberate.
resolver = "2"
# "." is the root crate defined in this manifest; the other entries are the language bindings
# and the per-system integration test crates.
members = [
    ".",
    "bindings/jvm-sys",
    "python-wrapper",
    "integration_testing/Oracle/rust",
    "integration_testing/PostgreSQL/rust",
    "integration_testing/SQLServer/rust",
    "integration_testing/Snowflake/rust",
    "integration_testing/Databricks/rust",
    "integration_testing/Spark/rust",
    "integration_testing/CloudConnectors/rust",
    "integration_testing/Kafka/rust",
]