[package]
name = "rust-data-processing"
version = "0.3.3"
edition = "2024"
rust-version = "1.85"
license = "MIT OR Apache-2.0"
repository = "https://github.com/scorpio-datalake/rust-data-processing"
documentation = "https://docs.rs/rust-data-processing"
# Packaged readme; the repository-level README.md is listed in `exclude`.
readme = "README_CRATE.md"
keywords = ["csv", "json", "parquet", "dataset", "etl"]
categories = ["database-implementations", "parser-implementations"]
# NOTE(review): edition 2024 defaults to resolver "3"; confirm that "2" is
# pinned on purpose.
resolver = "2"
# No build script and no target auto-discovery: the lib, bin, example, test
# and bench targets are all declared explicitly in this manifest.
build = false
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
# Paths left out of the published package.
exclude = [
    "bindings/",
    "spikes/",
    "Planning/",
    "python-wrapper/",
    "docs/python/",
    "docs/landing/",
    "README.md",
    "test_run.log",
    ".github/",
    "scripts/",
]
description = "Schema-first ingestion (CSV, JSON, Parquet, Excel) into an in-memory DataSet, plus Polars-backed pipelines, SQL, profiling, validation, and map/reduce-style processing."

# docs.rs build settings: hand `-j 1` to cargo so the documentation build
# runs a single job at a time.
[package.metadata.docs.rs]
cargo-args = ["-j", "1"]

[features]
# Only SQL support is switched on by default.
default = ["sql"]

# Features that enable exactly one optional dependency.
arrow = ["dep:arrow"]
excel = ["dep:calamine"]
kafka = ["dep:rdkafka"]
sql = ["dep:polars-sql"]

# Features layered on top of another feature.
excel_test_writer = ["excel", "dep:rust_xlsxwriter"]
serde_arrow = ["arrow", "dep:serde_arrow"]

# Declares no dependencies of its own; every [[bench]] target requires it.
deep_tests = []

# Database access through connectorx, with its Postgres, MySQL, MSSQL and
# Oracle source features turned on.
db_connectorx = [
    "arrow",
    "dep:connectorx",
    "dep:postgres-protocol",
    "dep:tokio-postgres",
    "connectorx/src_postgres",
    "connectorx/src_mysql",
    "connectorx/src_mssql",
    "connectorx/src_oracle",
]

# Remote storage and file transfer.
object_store = ["dep:object_store", "dep:tokio", "dep:url", "dep:ureq"]
file_transfer = [
    "dep:libssh2-sys",
    "dep:ssh2",
    "dep:suppaftp",
    "dep:url",
    "dep:webpki-roots",
]
# At the manifest level these two only forward to `object_store`.
delta_lake = ["object_store"]
snowflake = ["object_store"]

# Umbrella features that bundle the ones above.
cloud_connectors = ["object_store", "delta_lake", "snowflake", "file_transfer"]
integration_full = ["db_connectorx", "cloud_connectors", "excel"]
ci_expanded = ["deep_tests", "excel_test_writer", "arrow", "serde_arrow"]

# Library target; the crate name is the package name with underscores in
# place of hyphens.
[lib]
path = "src/lib.rs"
name = "rust_data_processing"

# Built only when the `excel_test_writer` feature is enabled.
[[bin]]
name = "generate_people_xlsx_fixture"
required-features = ["excel_test_writer"]
path = "src/bin/generate_people_xlsx_fixture.rs"

# Ungated executable that shares the package name.
[[bin]]
name = "rust-data-processing"
path = "src/main.rs"

# Every example is feature-gated: one needs `cloud_connectors`, the two
# Kafka examples need `kafka`.
[[example]]
name = "file_transfer_ingest"
required-features = ["cloud_connectors"]
path = "examples/file_transfer_ingest.rs"

[[example]]
name = "kafka_elt_byo_load"
required-features = ["kafka"]
path = "examples/kafka_elt_byo_load.rs"

[[example]]
name = "kafka_elt_stream"
required-features = ["kafka"]
path = "examples/kafka_elt_stream.rs"

# Integration test targets. Only `file_transfer_ftp_integration` and
# `kafka_elt_load` carry `required-features`; every other target is built in
# all feature configurations.
# NOTE(review): several ungated targets share a name with an optional feature
# (`deep_tests`, `sql`, the `excel_*` targets, `arrow_record_batches`).
# Presumably they are cfg-gated inside the source files; confirm.
[[test]]
name = "arrow_record_batches"
path = "tests/arrow_record_batches.rs"

[[test]]
name = "csv_ingestion"
path = "tests/csv_ingestion.rs"

[[test]]
name = "dataframe_centric_pipeline_fixtures"
path = "tests/dataframe_centric_pipeline_fixtures.rs"

[[test]]
name = "deep_tests"
path = "tests/deep_tests.rs"

[[test]]
name = "excel_ingestion"
path = "tests/excel_ingestion.rs"

[[test]]
name = "excel_snippets_fixtures"
path = "tests/excel_snippets_fixtures.rs"

# Built only when the `cloud_connectors` feature is enabled.
[[test]]
name = "file_transfer_ftp_integration"
required-features = ["cloud_connectors"]
path = "tests/file_transfer_ftp_integration.rs"

[[test]]
name = "ghcn_json_xml_parquet_pipeline_fixtures"
path = "tests/ghcn_json_xml_parquet_pipeline_fixtures.rs"

[[test]]
name = "ingestion_observability"
path = "tests/ingestion_observability.rs"

[[test]]
name = "json_ingestion"
path = "tests/json_ingestion.rs"

[[test]]
name = "json_parquet_excel_snippets_fixtures"
path = "tests/json_parquet_excel_snippets_fixtures.rs"

# Built only when the `kafka` feature is enabled.
[[test]]
name = "kafka_elt_load"
required-features = ["kafka"]
path = "tests/kafka_elt_load.rs"

[[test]]
name = "mapping_spec"
path = "tests/mapping_spec.rs"

[[test]]
name = "ordered_batch_ingestion"
path = "tests/ordered_batch_ingestion.rs"

[[test]]
name = "parquet_ingestion"
path = "tests/parquet_ingestion.rs"

[[test]]
name = "parquet_snippets_fixtures"
path = "tests/parquet_snippets_fixtures.rs"

[[test]]
name = "partition_discovery"
path = "tests/partition_discovery.rs"

[[test]]
name = "path_from_directory_scan_fixtures"
path = "tests/path_from_directory_scan_fixtures.rs"

[[test]]
name = "sql"
path = "tests/sql.rs"

[[test]]
name = "student_etl_fixtures"
path = "tests/student_etl_fixtures.rs"

[[test]]
name = "unified_ingestion"
path = "tests/unified_ingestion.rs"

[[test]]
name = "watermark_incremental"
path = "tests/watermark_incremental.rs"

# Benchmarks. Each one turns off the default libtest harness
# (`harness = false`) and is built only when `deep_tests` is enabled.
# NOTE(review): criterion is the only dev-dependency, so it presumably
# provides the harness; confirm.
[[bench]]
name = "ingestion"
harness = false
required-features = ["deep_tests"]
path = "benches/ingestion.rs"

[[bench]]
name = "map_reduce"
harness = false
required-features = ["deep_tests"]
path = "benches/map_reduce.rs"

[[bench]]
name = "outliers"
harness = false
required-features = ["deep_tests"]
path = "benches/outliers.rs"

[[bench]]
name = "pipelines"
harness = false
required-features = ["deep_tests"]
path = "benches/pipelines.rs"

[[bench]]
name = "profiling"
harness = false
required-features = ["deep_tests"]
path = "benches/profiling.rs"

[[bench]]
name = "validation"
harness = false
required-features = ["deep_tests"]
path = "benches/validation.rs"

[dependencies]
# Always-on dependencies.
csv = { version = "1.3.1" }
glob = { version = "0.3" }
# NOTE(review): `parquet` is on 57 while the optional `arrow` dependency is
# on 54; confirm the two are never expected to exchange arrow types.
parquet = { version = "57" }
polars = { version = "0.53.0", features = ["lazy", "csv", "parquet", "json", "fmt", "ipc"] }
quick-xml = { version = "0.37.5" }
rayon = { version = "1" }
serde = { version = "1", features = ["derive"] }
serde_json = { version = "1.0.139" }
sha2 = { version = "0.10" }
thiserror = { version = "2.0.11" }
walkdir = { version = "2" }

# Optional dependencies; each is pulled in by the feature named on its right.
arrow = { version = "54", optional = true }  # arrow
calamine = { version = "0.33.0", optional = true }  # excel
connectorx = { version = "0.4.5", default-features = false, features = ["dst_arrow", "fptr"], optional = true }  # db_connectorx
libssh2-sys = { version = "0.3.1", features = ["vendored-openssl"], optional = true }  # file_transfer
object_store = { version = "0.13.1", default-features = false, features = ["aws", "gcp", "azure", "fs"], optional = true }  # object_store
polars-sql = { version = "0.53.0", optional = true }  # sql
postgres-protocol = { version = ">=0.6.12, <0.7", optional = true }  # db_connectorx
rdkafka = { version = "0.36.2", default-features = false, features = ["cmake-build"], optional = true }  # kafka
rust_xlsxwriter = { version = "0.93", optional = true }  # excel_test_writer
serde_arrow = { version = "0.14.0", default-features = false, features = ["arrow-54"], optional = true }  # serde_arrow
ssh2 = { version = "0.9.4", optional = true }  # file_transfer
suppaftp = { version = "6.3.0", default-features = false, features = ["rustls"], optional = true }  # file_transfer
tokio = { version = "1", features = ["rt"], optional = true }  # object_store
tokio-postgres = { version = ">=0.7.18, <0.8", default-features = false, features = ["runtime"], optional = true }  # db_connectorx
# NOTE(review): default features are off and none are requested; confirm
# that this bare `ureq` build is what the `object_store` feature needs.
ureq = { version = "2.12", default-features = false, optional = true }  # object_store
url = { version = "2", optional = true }  # file_transfer, object_store
webpki-roots = { version = "0.26", optional = true }  # file_transfer

# Needed only for building tests, examples and benches.
[dev-dependencies]
criterion = { version = "0.5" }

# Development builds emit debug level 1.
[profile.dev]
debug = 1

# Test and bench builds emit no debug info and strip any that remains.
[profile.test]
strip = "debuginfo"
debug = 0

[profile.bench]
strip = "debuginfo"
debug = 0

# Custom profile built on `release`, with a lower optimisation level, LTO
# off and 16 codegen units.
# NOTE(review): this sets `debug = 1` and also strips debuginfo from the
# output; confirm both are intended.
[profile.integration]
inherits = "release"
opt-level = 1
codegen-units = 16
lto = false
debug = 1
strip = "debuginfo"