# Package metadata for the root crate. Keys follow the Cargo convention:
# `name`, `version`, the rest alphabetically, `description` last.
[package]
name = "rust-data-processing"
version = "0.3.4"
categories = ["database-implementations", "parser-implementations"]
documentation = "https://docs.rs/rust-data-processing"
edition = "2024"
# Paths left out of the packaged crate. The repository-level README.md is
# excluded; the crate's readme is README_CRATE.md (see `readme` below).
exclude = [
    "bindings/",
    "spikes/",
    "Planning/",
    "python-wrapper/",
    "docs/python/",
    "docs/landing/",
    "README.md",
    "test_run.log",
    ".github/",
    "scripts/",
]
keywords = ["csv", "json", "parquet", "dataset", "etl"]
license = "MIT OR Apache-2.0"
readme = "README_CRATE.md"
repository = "https://github.com/scorpio-datalake/rust-data-processing"
rust-version = "1.85"
description = "Schema-first ingestion (CSV, JSON, Parquet, Excel) into an in-memory DataSet, plus Polars-backed pipelines, SQL, profiling, validation, and map/reduce-style processing."

# docs.rs build settings: passes `-j 1` to cargo so the documentation build
# runs one job at a time.
# NOTE(review): presumably this limits resource use on the docs.rs builder —
# confirm the reason before removing.
[package.metadata.docs.rs]
cargo-args = ["-j", "1"]
# Runtime dependencies, sorted alphabetically.
# Every entry marked `optional = true` is switched on through a `dep:<name>`
# reference in the [features] table; entries without it are always compiled.
# Inline-table keys use one order throughout:
# version, default-features, features, optional.
# NOTE(review): `arrow = "54"` and serde_arrow's `arrow-54` feature name the
# same Arrow major version — presumably they must be bumped together; confirm.
[dependencies]
arrow = { version = "54", optional = true }
calamine = { version = "0.33.0", optional = true }
connectorx = { version = "0.4.5", default-features = false, features = ["dst_arrow", "fptr"], optional = true }
csv = "1.3.1"
glob = "0.3"
libssh2-sys = { version = "0.3.1", features = ["vendored-openssl"], optional = true }
object_store = { version = "0.13.1", default-features = false, features = ["aws", "gcp", "azure", "fs"], optional = true }
parquet = "57"
polars = { version = "0.53.0", features = ["lazy", "csv", "parquet", "json", "fmt", "ipc"] }
polars-sql = { version = "0.53.0", optional = true }
postgres-protocol = { version = ">=0.6.12, <0.7", optional = true }
quick-xml = "0.37.5"
rayon = "1"
rdkafka = { version = "0.36.2", default-features = false, features = ["cmake-build"], optional = true }
rust_xlsxwriter = { version = "0.93", optional = true }
serde = { version = "1", features = ["derive"] }
serde_arrow = { version = "0.14.0", default-features = false, features = ["arrow-54"], optional = true }
serde_json = "1.0.139"
sha2 = "0.10"
ssh2 = { version = "0.9.4", optional = true }
suppaftp = { version = "6.3.0", default-features = false, features = ["rustls"], optional = true }
thiserror = "2.0.11"
tokio = { version = "1", features = ["rt"], optional = true }
tokio-postgres = { version = ">=0.7.18, <0.8", default-features = false, features = ["runtime"], optional = true }
ureq = { version = "2.12", default-features = false, optional = true }
url = { version = "2", optional = true }
walkdir = "2"
webpki-roots = { version = "0.26", optional = true }

# Cargo features. Only `sql` is enabled by default; everything else is opt-in.
# `dep:<name>` entries switch on the matching optional dependency.
[features]
default = ["sql"]
# Excel support: pulls in calamine.
excel = ["dep:calamine"]
# `excel` plus rust_xlsxwriter; required by the generate_people_xlsx_fixture bin.
excel_test_writer = [
    "excel",
    "dep:rust_xlsxwriter",
]
# Enables no dependencies; the [[bench]] targets list it in `required-features`.
deep_tests = []
# SQL support: pulls in polars-sql.
sql = ["dep:polars-sql"]
# Arrow support: pulls in the arrow crate.
arrow = ["dep:arrow"]
# serde_arrow on top of the `arrow` feature.
serde_arrow = [
    "arrow",
    "dep:serde_arrow",
]
# Database access through connectorx with its postgres, mysql, mssql and
# oracle source features, plus the postgres-protocol / tokio-postgres crates.
db_connectorx = [
    "arrow",
    "dep:connectorx",
    "dep:postgres-protocol",
    "dep:tokio-postgres",
    "connectorx/src_postgres",
    "connectorx/src_mysql",
    "connectorx/src_mssql",
    "connectorx/src_oracle",
]
# Object-store access: object_store, tokio, url and ureq.
object_store = [
    "dep:object_store",
    "dep:tokio",
    "dep:url",
    "dep:ureq",
]
# Both of these enable only `object_store` in this manifest; no extra
# dependencies are attached to them here.
delta_lake = ["object_store"]
snowflake = ["object_store"]
# File transfer: libssh2-sys, ssh2, suppaftp, url and webpki-roots.
file_transfer = [
    "dep:libssh2-sys",
    "dep:ssh2",
    "dep:suppaftp",
    "dep:url",
    "dep:webpki-roots",
]
# Umbrella feature for the object-store and file-transfer connectors.
cloud_connectors = [
    "object_store",
    "delta_lake",
    "snowflake",
    "file_transfer",
]
# Umbrella feature: database, cloud connectors and Excel (does not include `kafka`).
integration_full = [
    "db_connectorx",
    "cloud_connectors",
    "excel",
]
# Kafka support: pulls in rdkafka.
kafka = ["dep:rdkafka"]
# Feature bundle named for CI use: deep tests, the Excel test writer and Arrow.
ci_expanded = [
    "deep_tests",
    "excel_test_writer",
    "arrow",
    "serde_arrow",
]

# Dependencies used only for tests, examples and benchmarks.
# NOTE(review): criterion is presumably the harness behind the [[bench]]
# targets, which all set `harness = false` — confirm.
[dev-dependencies]
criterion = "0.5"
# Benchmark targets. Each one disables the default bench harness
# (`harness = false`) and is only built when `deep_tests` is enabled.
[[bench]]
name = "pipelines"
required-features = ["deep_tests"]
harness = false

[[bench]]
name = "ingestion"
required-features = ["deep_tests"]
harness = false

[[bench]]
name = "map_reduce"
required-features = ["deep_tests"]
harness = false

[[bench]]
name = "profiling"
required-features = ["deep_tests"]
harness = false

[[bench]]
name = "validation"
required-features = ["deep_tests"]
harness = false

[[bench]]
name = "outliers"
required-features = ["deep_tests"]
harness = false

# Feature-gated targets, grouped by kind. Each is built only when its
# `required-features` are enabled.
# NOTE(review): the two file_transfer_* targets require the umbrella
# `cloud_connectors` feature; their names suggest `file_transfer` alone might
# suffice — confirm against the sources before narrowing.

# Integration tests.
[[test]]
name = "file_transfer_ftp_integration"
required-features = ["cloud_connectors"]

[[test]]
name = "kafka_elt_load"
required-features = ["kafka"]

# Examples.
[[example]]
name = "file_transfer_ingest"
required-features = ["cloud_connectors"]

[[example]]
name = "kafka_elt_byo_load"
required-features = ["kafka"]

[[example]]
name = "kafka_elt_stream"
required-features = ["kafka"]

# Binary built only with `excel_test_writer` enabled.
[[bin]]
name = "generate_people_xlsx_fixture"
required-features = ["excel_test_writer"]
path = "src/bin/generate_people_xlsx_fixture.rs"

# Build profiles.

# dev: debug level 1.
[profile.dev]
debug = 1

# test and bench: no debug info is generated (`debug = 0`) and debuginfo is
# stripped from the produced binaries.
[profile.test]
strip = "debuginfo"
debug = 0

[profile.bench]
strip = "debuginfo"
debug = 0

# integration: custom profile derived from `release`, with a lower
# optimization level, LTO off and 16 codegen units.
# NOTE(review): `debug = 1` asks for debug info while `strip = "debuginfo"`
# removes it from the final binary; these look contradictory — confirm which
# one is intended (drop `strip` to keep the info, or set `debug = 0`).
[profile.integration]
inherits = "release"
codegen-units = 16
debug = 1
lto = false
opt-level = 1
strip = "debuginfo"

# Workspace rooted at this crate ("."). The other members are the crates under
# bindings/, python-wrapper/ and integration_testing/. bindings/ and
# python-wrapper/ are also listed in [package].exclude, so they belong to the
# workspace but not to the packaged crate.
# NOTE(review): the root package declares edition 2024, but the resolver is
# pinned to "2" here — confirm this is intentional rather than a leftover.
[workspace]
resolver = "2"
members = [
    ".",
    "bindings/jvm-sys",
    "python-wrapper",
    "integration_testing/Oracle/rust",
    "integration_testing/PostgreSQL/rust",
    "integration_testing/SQLServer/rust",
    "integration_testing/Snowflake/rust",
    "integration_testing/Databricks/rust",
    "integration_testing/Spark/rust",
    "integration_testing/CloudConnectors/rust",
    "integration_testing/Kafka/rust",
]