rust-data-processing 0.2.2

Schema-first ingestion (CSV, JSON, Parquet, Excel) into an in-memory DataSet, plus Polars-backed pipelines, SQL, profiling, validation, and map/reduce-style processing.
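
Example (from the crate's integration tests, compiled only with the `arrow` feature): concatenating Arrow RecordBatches into a single DataSet.
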
#![cfg(feature = "arrow")]

use std::sync::Arc;

use arrow::array::{Int64Array, StringArray};
use arrow::datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema};
use arrow::record_batch::RecordBatch;

use rust_data_processing::transform::arrow::record_batches_to_dataset;
use rust_data_processing::types::{DataType, Field as DField, Schema, Value};

// First Arrow batch: two rows with an Int64 `id` and a Utf8 `name` column.
fn batch1() -> RecordBatch {
    let schema = Arc::new(ArrowSchema::new(vec![
        Field::new("id", ArrowDataType::Int64, true),
        Field::new("name", ArrowDataType::Utf8, true),
    ]));
    let ids = Int64Array::from(vec![1, 2]);
    let names = StringArray::from(vec!["a", "b"]);
    RecordBatch::try_new(schema, vec![Arc::new(ids), Arc::new(names)]).unwrap()
}

// Second Arrow batch with the same schema, contributing one more row.
fn batch2() -> RecordBatch {
    let schema = Arc::new(ArrowSchema::new(vec![
        Field::new("id", ArrowDataType::Int64, true),
        Field::new("name", ArrowDataType::Utf8, true),
    ]));
    let ids = Int64Array::from(vec![3]);
    let names = StringArray::from(vec!["c"]);
    RecordBatch::try_new(schema, vec![Arc::new(ids), Arc::new(names)]).unwrap()
}

#[test]
fn concat_record_batches_into_dataset() {
    // The target DataSet schema mirrors the Arrow schema shared by both batches.
    let ds_schema = Schema::new(vec![
        DField::new("id", DataType::Int64),
        DField::new("name", DataType::Utf8),
    ]);
    // Batches are concatenated in order: two rows from batch1, then one from batch2.
    let ds = record_batches_to_dataset(&[batch1(), batch2()], &ds_schema).unwrap();
    assert_eq!(ds.row_count(), 3);
    assert_eq!(ds.rows[2][0], Value::Int64(3));
    assert_eq!(ds.rows[2][1], Value::Utf8("c".into()));
}
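
Sketch: the resulting DataSet keeps its materialized rows accessible, so quick map/reduce-style checks can be written with ordinary iterators. The test below stays in the same file and uses only the `rows` field and `Value` variants already exercised above; the crate's own pipeline and aggregation helpers are not shown here, so this is plain Rust rather than the library's processing API.

#[test]
fn sum_ids_with_plain_iterators() {
    let ds_schema = Schema::new(vec![
        DField::new("id", DataType::Int64),
        DField::new("name", DataType::Utf8),
    ]);
    let ds = record_batches_to_dataset(&[batch1(), batch2()], &ds_schema).unwrap();

    // Map each row to its id and reduce by summing: 1 + 2 + 3 = 6.
    let id_sum: i64 = ds
        .rows
        .iter()
        .map(|row| match &row[0] {
            Value::Int64(v) => *v,
            other => panic!("expected Int64 id, got {other:?}"),
        })
        .sum();
    assert_eq!(id_sum, 6);
}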