arrow2 0.18.0

Unofficial implementation of Apache Arrow spec in safe Rust
Documentation
use arrow2::array::*;
use arrow2::datatypes::*;
use arrow2::error::Result;
use arrow2::io::json::read;

#[test]
fn read_json() -> Result<()> {
    let data = br#"[
        {
            "a": 1
        },
        {
            "a": 2
        },
        {
            "a": 3
        }
    ]"#;

    let json = json_deserializer::parse(data)?;

    let data_type = read::infer(&json)?;

    let result = read::deserialize(&json, data_type)?;

    let expected = StructArray::new(
        DataType::Struct(vec![Field::new("a", DataType::Int64, true)]),
        vec![Box::new(Int64Array::from_slice([1, 2, 3])) as _],
        None,
    );

    assert_eq!(expected, result.as_ref());

    Ok(())
}

#[test]
fn read_json_records() -> Result<()> {
    let data = br#"[
        {
            "a": [
                [1.1, 2, 3],
                [2, 3],
                [4, 5, 6]
            ],
            "b": [1, 2, 3],
            "c": ["test"],
            "d": [true]
        },
        {
            "a": [
                [3, 2, 1],
                [3, 2],
                [6, 5, 4]
            ]
        },
        {
            "b": [7, 8, 9],
            "c": ["string"],
            "d": [false]
        }
    ]"#;

    let a_iter = vec![
        vec![
            Some(vec![Some(1.1), Some(2.), Some(3.)]),
            Some(vec![Some(2.), Some(3.)]),
            Some(vec![Some(4.), Some(5.), Some(6.)]),
        ],
        vec![
            Some(vec![Some(3.), Some(2.), Some(1.)]),
            Some(vec![Some(3.), Some(2.)]),
            Some(vec![Some(6.), Some(5.), Some(4.)]),
        ],
    ];

    let a_iter = a_iter.into_iter().map(Some);
    let a_inner = MutableListArray::<i32, MutablePrimitiveArray<f64>>::new_with_field(
        MutablePrimitiveArray::<f64>::new(),
        "item",
        true,
    );
    let mut a_outer =
        MutableListArray::<i32, MutableListArray<i32, MutablePrimitiveArray<f64>>>::new_with_field(
            a_inner, "item", true,
        );
    a_outer.try_extend(a_iter).unwrap();
    let a_expected: ListArray<i32> = a_outer.into();

    let b_iter = vec![
        vec![Some(1), Some(2), Some(3)],
        vec![Some(7), Some(8), Some(9)],
    ];
    let b_iter = b_iter.into_iter().map(Some);
    let mut b = MutableListArray::<i32, MutablePrimitiveArray<i64>>::new_with_field(
        MutablePrimitiveArray::<i64>::new(),
        "item",
        true,
    );
    b.try_extend(b_iter).unwrap();
    let b_expected: ListArray<i32> = b.into();

    let c_iter = vec![vec![Some("test")], vec![Some("string")]];

    let c_iter = c_iter.into_iter().map(Some);
    let mut c = MutableListArray::<i32, MutableUtf8Array<i32>>::new_with_field(
        MutableUtf8Array::<i32>::new(),
        "item",
        true,
    );

    c.try_extend(c_iter).unwrap();
    let c_expected: ListArray<i32> = c.into();

    let d_iter = vec![vec![Some(true)], vec![Some(false)]];

    let d_iter = d_iter.into_iter().map(Some);
    let mut d = MutableListArray::<i32, MutableBooleanArray>::new_with_field(
        MutableBooleanArray::new(),
        "item",
        true,
    );

    d.try_extend(d_iter).unwrap();
    let d_expected: ListArray<i32> = d.into();

    let json = json_deserializer::parse(data)?;

    let schema = read::infer_records_schema(&json)?;
    let actual = read::deserialize_records(&json, &schema)?;

    for (f, arr) in schema.fields.iter().zip(actual.arrays().iter()) {
        let (expected, actual) = if f.name == "a" {
            (&a_expected, arr.as_ref())
        } else if f.name == "b" {
            (&b_expected, arr.as_ref())
        } else if f.name == "c" {
            (&c_expected, arr.as_ref())
        } else if f.name == "d" {
            (&d_expected, arr.as_ref())
        } else {
            panic!("unexpected field found: {}", f.name);
        };

        assert_eq!(expected.to_boxed().as_ref(), actual);
    }

    Ok(())
}

#[test]
fn read_json_fixed_size_records() -> Result<()> {
    let data = br#"[
        {
            "a": [1, 2.2, 3, 4]
        },
        {
            "a": [5, 6, 7, 8]
        },
        {
            "a": [7, 8, 9]
        }
    ]"#;

    let a_iter = vec![
        Some(vec![Some(1.), Some(2.2), Some(3.), Some(4.)]),
        Some(vec![Some(5.), Some(6.), Some(7.), Some(8.)]),
        None,
    ];

    let a_iter = a_iter.into_iter();
    let mut a = MutableFixedSizeListArray::<MutablePrimitiveArray<f64>>::new_with_field(
        MutablePrimitiveArray::<f64>::new(),
        "inner",
        true,
        4,
    );
    a.try_extend(a_iter).unwrap();
    let a_expected: FixedSizeListArray = a.into();

    let json = json_deserializer::parse(data)?;

    let schema: Schema = vec![Field::new("a", a_expected.data_type().clone(), true)].into();
    let actual = read::deserialize_records(&json, &schema)?;

    for (f, arr) in schema.fields.iter().zip(actual.arrays().iter()) {
        let (expected, actual) = if f.name == "a" {
            (&a_expected, arr.as_ref())
        } else {
            panic!("unexpected field found: {}", f.name);
        };

        assert_eq!(expected.to_boxed().as_ref(), actual);
    }

    Ok(())
}

#[test]
fn read_json_records_with_schema() -> Result<()> {
    let raw = b"[{\"matrix\":[0.0,2.0]},{\"matrix\":[0.0,0.0,2.1,3.0]}]";
    let schema = Schema {
        fields: vec![Field::new(
            "matrix",
            DataType::List(Box::new(Field::new("inner", DataType::Float32, false))),
            false,
        )],
        metadata: Metadata::default(),
    };

    let json = json_deserializer::parse(raw)?;
    let actual = read::deserialize_records(&json, &schema)?;
    assert_eq!(
        format!("{:?}", actual.arrays()[0]),
        "ListArray[[0, 2], [0, 0, 2.1, 3]]"
    );

    let schema = read::infer_records_schema(&json)?;
    let actual = read::deserialize_records(&json, &schema)?;
    assert_eq!(
        format!("{:?}", actual.arrays()[0]),
        "ListArray[[0, 2], [0, 0, 2.1, 3]]"
    );

    Ok(())
}

#[test]
fn deserialize_timestamp_string_ns() -> Result<()> {
    let data = br#"["2023-04-07T12:23:34.000000001Z"]"#;

    let json = json_deserializer::parse(data)?;

    let data_type = DataType::List(Box::new(Field::new(
        "item",
        DataType::Timestamp(TimeUnit::Nanosecond, None),
        false,
    )));

    let result = read::deserialize(&json, data_type)?;

    let expected = Int64Array::from([Some(1680870214000000001)])
        .to(DataType::Timestamp(TimeUnit::Nanosecond, None));

    assert_eq!(expected, result.as_ref());

    Ok(())
}

#[test]
fn deserialize_timestamp_string_us() -> Result<()> {
    let data = br#"["2023-04-07T12:23:34.000000001Z"]"#;

    let json = json_deserializer::parse(data)?;

    let data_type = DataType::List(Box::new(Field::new(
        "item",
        DataType::Timestamp(TimeUnit::Microsecond, None),
        false,
    )));

    let result = read::deserialize(&json, data_type)?;

    let expected = Int64Array::from([Some(1680870214000000)])
        .to(DataType::Timestamp(TimeUnit::Microsecond, None));

    assert_eq!(expected, result.as_ref());

    Ok(())
}

#[test]
fn deserialize_timestamp_string_ms() -> Result<()> {
    let data = br#"["2023-04-07T12:23:34.000000001Z"]"#;

    let json = json_deserializer::parse(data)?;

    let data_type = DataType::List(Box::new(Field::new(
        "item",
        DataType::Timestamp(TimeUnit::Millisecond, None),
        false,
    )));

    let result = read::deserialize(&json, data_type)?;

    let expected = Int64Array::from([Some(1680870214000)])
        .to(DataType::Timestamp(TimeUnit::Millisecond, None));

    assert_eq!(expected, result.as_ref());

    Ok(())
}

#[test]
fn deserialize_timestamp_string_s() -> Result<()> {
    let data = br#"["2023-04-07T12:23:34.000000001Z"]"#;

    let json = json_deserializer::parse(data)?;

    let data_type = DataType::List(Box::new(Field::new(
        "item",
        DataType::Timestamp(TimeUnit::Second, None),
        false,
    )));

    let result = read::deserialize(&json, data_type)?;

    let expected =
        Int64Array::from([Some(1680870214)]).to(DataType::Timestamp(TimeUnit::Second, None));

    assert_eq!(expected, result.as_ref());

    Ok(())
}

#[test]
fn deserialize_timestamp_string_tz_s() -> Result<()> {
    let data = br#"["2023-04-07T12:23:34.000000001+00:00"]"#;

    let json = json_deserializer::parse(data)?;

    let data_type = DataType::List(Box::new(Field::new(
        "item",
        DataType::Timestamp(TimeUnit::Second, Some("+01:00".to_string())),
        false,
    )));

    let result = read::deserialize(&json, data_type)?;

    let expected = Int64Array::from([Some(1680870214)]).to(DataType::Timestamp(
        TimeUnit::Second,
        Some("+01:00".to_string()),
    ));

    assert_eq!(expected, result.as_ref());

    Ok(())
}

#[test]
fn deserialize_timestamp_int_ns() -> Result<()> {
    let data = br#"[1680870214000000001]"#;

    let json = json_deserializer::parse(data)?;

    let data_type = DataType::List(Box::new(Field::new(
        "item",
        DataType::Timestamp(TimeUnit::Nanosecond, None),
        false,
    )));

    let result = read::deserialize(&json, data_type)?;

    let expected = Int64Array::from([Some(1680870214000000001)])
        .to(DataType::Timestamp(TimeUnit::Nanosecond, None));

    assert_eq!(expected, result.as_ref());

    Ok(())
}