arrow-json 58.1.0

Support for parsing JSON format to and from the Arrow format
Documentation
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Transfer data between the Arrow memory format and JSON line-delimited records.
//!
//! See the module level documentation for the
//! [`reader`] and [`writer`] for usage examples.
//!
//! # Binary Data uses `Base16` Encoding
//!
//! As per [RFC7159] JSON cannot encode arbitrary binary data. This crate works around that
//! limitation by encoding/decoding binary data as a [hexadecimal] string (i.e.
//! [`Base16` encoding]).
//!
//! Note that `Base16` only has 50% space efficiency (i.e., the encoded data is twice as large
//! as the original). If that is an issue, we recommend converting binary data to/from a different
//! encoding format such as `Base64` instead. See the following example for details.
//!
//! ## `Base64` Encoding Example
//!
//! [`Base64`] is a common [binary-to-text encoding] scheme with a space efficiency of 75%. The
//! following example shows how to use the [`arrow_cast`] crate to encode binary data to `Base64`
//! before converting it to JSON and how to decode it back.
//!
//! ```
//! # use std::io::Cursor;
//! # use std::sync::Arc;
//! # use arrow_array::{BinaryArray, RecordBatch, StringArray};
//! # use arrow_array::cast::AsArray;
//! use arrow_cast::base64::{b64_decode, b64_encode, BASE64_STANDARD};
//! # use arrow_json::{LineDelimitedWriter, ReaderBuilder};
//! #
//! // The data we want to write
//! let input = BinaryArray::from(vec![b"\xDE\x00\xFF".as_ref()]);
//!
//! // Base64 encode it to a string
//! let encoded: StringArray = b64_encode(&BASE64_STANDARD, &input);
//!
//! // Write the StringArray to JSON
//! let batch = RecordBatch::try_from_iter([("col", Arc::new(encoded) as _)]).unwrap();
//! let mut buf = Vec::with_capacity(1024);
//! let mut writer = LineDelimitedWriter::new(&mut buf);
//! writer.write(&batch).unwrap();
//! writer.finish().unwrap();
//!
//! // Read the JSON data
//! let cursor = Cursor::new(buf);
//! let mut reader = ReaderBuilder::new(batch.schema()).build(cursor).unwrap();
//! let batch = reader.next().unwrap().unwrap();
//!
//! // Reverse the base64 encoding
//! let col: BinaryArray = batch.column(0).as_string::<i32>().clone().into();
//! let output = b64_decode(&BASE64_STANDARD, &col).unwrap();
//!
//! assert_eq!(input, output);
//! ```
//!
//! [RFC7159]: https://datatracker.ietf.org/doc/html/rfc7159#section-8.1
//! [binary-to-text encoding]: https://en.wikipedia.org/wiki/Binary-to-text_encoding
//! [hexadecimal]: https://en.wikipedia.org/wiki/Hexadecimal
//! [`Base16` encoding]: https://en.wikipedia.org/wiki/Base16#Base16
//! [`Base64`]: https://en.wikipedia.org/wiki/Base64

#![doc(
    html_logo_url = "https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_white-bg.svg",
    html_favicon_url = "https://arrow.apache.org/img/arrow-logo_chevrons_black-txt_transparent-bg.svg"
)]
#![cfg_attr(docsrs, feature(doc_cfg))]
#![deny(rustdoc::broken_intra_doc_links)]
#![warn(missing_docs)]

pub mod reader;
pub mod writer;

pub use self::reader::{Reader, ReaderBuilder};
pub use self::writer::{
    ArrayWriter, Encoder, EncoderFactory, EncoderOptions, LineDelimitedWriter, Writer,
    WriterBuilder,
};
use half::f16;
use serde_json::{Number, Value};

/// Specifies what is considered valid JSON when reading or writing
/// RecordBatches or StructArrays.
///
/// This enum controls which form(s) the Reader will accept and which form the
/// Writer will produce. For example, if the RecordBatch Schema is
/// `[("a", Int32), ("r", Struct("b": Boolean, "c": Utf8))]`
/// then a Reader with [`StructMode::ObjectOnly`] would read rows of the form
/// `{"a": 1, "r": {"b": true, "c": "cat"}}` while one with [`StructMode::ListOnly`]
/// would read rows of the form `[1, [true, "cat"]]`. A Writer would produce
/// rows formatted similarly.
///
/// The list encoding is more compact if the schema is known, and is used by
/// tools such as [Presto] and [Trino].
///
/// When reading objects, the order of the keys does not matter. When reading
/// lists, the entries must be the same number and in the same order as the
/// struct fields. Map columns are not affected by this option.
///
/// [Presto]: https://prestodb.io/docs/current/develop/client-protocol.html#important-queryresults-attributes
/// [Trino]: https://trino.io/docs/current/develop/client-protocol.html#important-queryresults-attributes
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
pub enum StructMode {
    #[default]
    /// Encode/decode structs as objects (e.g., `{"a": 1, "b": "c"}`).
    ///
    /// This is the default mode.
    ObjectOnly,
    /// Encode/decode structs as lists (e.g., `[1, "c"]`).
    ListOnly,
}

/// Trait declaring any type that is serializable to JSON. This includes all primitive types (bool, i32, etc.).
pub trait JsonSerializable: 'static {
    /// Converts `self` into a JSON [`Value`], if possible.
    ///
    /// Returns `None` when the value has no JSON representation; for
    /// instance, converting `f32::NAN` yields `None`.
    fn into_json_value(self) -> Option<Value>;
}

macro_rules! json_serializable {
    ($t:ty) => {
        impl JsonSerializable for $t {
            fn into_json_value(self) -> Option<Value> {
                Some(self.into())
            }
        }
    };
}

json_serializable!(bool);
json_serializable!(u8);
json_serializable!(u16);
json_serializable!(u32);
json_serializable!(u64);
json_serializable!(i8);
json_serializable!(i16);
json_serializable!(i32);
json_serializable!(i64);

impl JsonSerializable for i128 {
    fn into_json_value(self) -> Option<Value> {
        // Serialize as string to avoid issues with arbitrary_precision serde_json feature
        // - https://github.com/serde-rs/json/issues/559
        // - https://github.com/serde-rs/json/issues/845
        // - https://github.com/serde-rs/json/issues/846
        Some(self.to_string().into())
    }
}

impl JsonSerializable for f16 {
    fn into_json_value(self) -> Option<Value> {
        // Widen to f64, then keep three decimal places before building the JSON number.
        let widened = f64::from(self);
        let rounded = (widened * 1000.0).round() / 1000.0;
        let number = Number::from_f64(rounded)?;
        Some(Value::Number(number))
    }
}

impl JsonSerializable for f32 {
    fn into_json_value(self) -> Option<Value> {
        // Widen to f64 (lossless), then keep three decimal places before building
        // the JSON number. `f32::NAN` ends up as `None`.
        let widened = f64::from(self);
        let rounded = (widened * 1000.0).round() / 1000.0;
        let number = Number::from_f64(rounded)?;
        Some(Value::Number(number))
    }
}

impl JsonSerializable for f64 {
    fn into_json_value(self) -> Option<Value> {
        // No rounding here: the value is passed to `Number::from_f64` unchanged.
        let number = Number::from_f64(self)?;
        Some(Value::Number(number))
    }
}

#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use crate::writer::JsonArray;

    use super::*;

    use arrow_array::{
        ArrayRef, GenericBinaryArray, GenericByteViewArray, RecordBatch, RecordBatchWriter,
        builder::FixedSizeBinaryBuilder, types::BinaryViewType,
    };
    use serde_json::Value::{Bool, Number as VNumber, String as VString};

    /// Checks `JsonSerializable::into_json_value` for each native type implemented in this file.
    #[test]
    fn test_arrow_native_type_to_json() {
        assert_eq!(Some(Bool(true)), true.into_json_value());
        assert_eq!(Some(VNumber(Number::from(1))), 1i8.into_json_value());
        assert_eq!(Some(VNumber(Number::from(1))), 1i16.into_json_value());
        assert_eq!(Some(VNumber(Number::from(1))), 1i32.into_json_value());
        assert_eq!(Some(VNumber(Number::from(1))), 1i64.into_json_value());
        // i128 is serialized as a string, not as a JSON number.
        assert_eq!(Some(VString("1".to_string())), 1i128.into_json_value());
        assert_eq!(Some(VNumber(Number::from(1))), 1u8.into_json_value());
        assert_eq!(Some(VNumber(Number::from(1))), 1u16.into_json_value());
        assert_eq!(Some(VNumber(Number::from(1))), 1u32.into_json_value());
        assert_eq!(Some(VNumber(Number::from(1))), 1u64.into_json_value());
        // An unsuffixed float literal falls back to f64, which would only repeat the f64
        // assertion below. Use an explicit f32 so the widen-and-round path of the f32
        // implementation is exercised with a finite value: 0.01f32 widens to
        // 0.00999999977..., which rounds back to exactly 0.01.
        assert_eq!(
            Some(VNumber(Number::from_f64(0.01f64).unwrap())),
            0.01f32.into_json_value()
        );
        assert_eq!(
            Some(VNumber(Number::from_f64(0.01f64).unwrap())),
            0.01f64.into_json_value()
        );
        // NaN has no JSON number representation.
        assert_eq!(None, f32::NAN.into_json_value());
    }

    /// Reads line-delimited JSON in each `StructMode` and checks that writing the
    /// decoded batches back with the same mode reproduces the input bytes.
    #[test]
    fn test_json_roundtrip_structs() {
        use crate::writer::LineDelimited;
        use arrow_schema::{DataType, Field, Fields, Schema};
        use std::sync::Arc;

        // c1: Struct { c11: Int32 (nullable), c12: Struct { c121: Utf8 } }, c2: Utf8
        let c12 = Field::new(
            "c12",
            DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)].into()),
            false,
        );
        let c1 = Field::new(
            "c1",
            DataType::Struct(Fields::from(vec![
                Field::new("c11", DataType::Int32, true),
                c12,
            ])),
            false,
        );
        let c2 = Field::new("c2", DataType::Utf8, false);
        let schema = Arc::new(Schema::new(vec![c1, c2]));

        // The same three rows, once encoded as objects and once as lists.
        let cases = [
            (
                StructMode::ObjectOnly,
                r#"{"c1":{"c11":1,"c12":{"c121":"e"}},"c2":"a"}
{"c1":{"c12":{"c121":"f"}},"c2":"b"}
{"c1":{"c11":5,"c12":{"c121":"g"}},"c2":"c"}
"#,
            ),
            (
                StructMode::ListOnly,
                r#"[[1,["e"]],"a"]
[[null,["f"]],"b"]
[[5,["g"]],"c"]
"#,
            ),
        ];

        for (mode, text) in cases {
            let expected = text.as_bytes();
            let reader = ReaderBuilder::new(schema.clone())
                .with_struct_mode(mode)
                .build(expected)
                .unwrap();

            let mut written: Vec<u8> = Vec::new();
            let mut writer = WriterBuilder::new()
                .with_struct_mode(mode)
                .build::<_, LineDelimited>(&mut written);
            for batch in reader {
                writer.write(&batch.unwrap()).unwrap();
            }
            assert_eq!(expected, &written);
        }
    }

    /// Round-trips the same binary payloads (including a null and a non-UTF-8
    /// value) through JSON for each binary array flavour.
    // The lint is silenced because the check below deliberately hands invalid
    // UTF-8 to `str::from_utf8`.
    #[test]
    #[allow(invalid_from_utf8)]
    fn test_json_roundtrip_binary() {
        let invalid_utf8: &[u8] = b"Not UTF8 \xa0\xa1!";
        assert!(str::from_utf8(invalid_utf8).is_err());

        // Every payload is 12 bytes long so the rows also fit the fixed-size case.
        let rows: &[Option<&[u8]>] = &[
            Some(b"Ned Flanders".as_slice()),
            None,
            Some(b"Troy McClure".as_slice()),
            Some(invalid_utf8),
        ];

        // Binary:
        let binary: ArrayRef = Arc::new(GenericBinaryArray::<i32>::from_iter(rows));
        assert_binary_json(binary);

        // LargeBinary:
        let large_binary: ArrayRef = Arc::new(GenericBinaryArray::<i64>::from_iter(rows));
        assert_binary_json(large_binary);

        // FixedSizeBinary:
        let fixed_size = build_array_fixed_size_binary(12, rows);
        assert_binary_json(fixed_size);

        // BinaryView:
        let binary_view: ArrayRef =
            Arc::new(GenericByteViewArray::<BinaryViewType>::from_iter(rows));
        assert_binary_json(binary_view);
    }

    /// Builds a fixed-size binary array of width `byte_width` from `values`,
    /// appending a null for every `None`.
    ///
    /// Panics if appending a value fails.
    fn build_array_fixed_size_binary(byte_width: i32, values: &[Option<&[u8]>]) -> ArrayRef {
        let mut fixed = FixedSizeBinaryBuilder::new(byte_width);
        values.iter().for_each(|entry| {
            if let Some(bytes) = entry {
                fixed.append_value(bytes).unwrap();
            } else {
                fixed.append_null();
            }
        });
        Arc::new(fixed.finish())
    }

    /// Runs the JSON round-trip check on `array` twice: first with explicit
    /// nulls enabled on the writer, then with them disabled.
    fn assert_binary_json(array: ArrayRef) {
        for explicit_nulls in [true, false] {
            let builder = WriterBuilder::new().with_explicit_nulls(explicit_nulls);
            assert_binary_json_with_writer(array.clone(), builder);
        }
    }

    /// Writes `array` as a single-column batch named "bytes" using `builder`
    /// in `JsonArray` format, parses the output with serde_json, decodes it
    /// again with the batch's schema, and asserts the result equals the
    /// original batch.
    fn assert_binary_json_with_writer(array: ArrayRef, builder: WriterBuilder) {
        let batch = RecordBatch::try_from_iter([("bytes", array)]).unwrap();

        // Encode the batch into an in-memory JSON array.
        let mut encoded = Vec::new();
        let mut writer = builder.build::<_, JsonArray>(&mut encoded);
        writer.write(&batch).unwrap();
        writer.close().unwrap();

        // The output must parse as a JSON array.
        let parsed: Value = serde_json::from_slice(&encoded).unwrap();
        let rows = parsed.as_array().unwrap();

        // Decode the parsed rows using the schema of the original batch.
        let mut decoder = ReaderBuilder::new(batch.schema())
            .build_decoder()
            .unwrap();
        decoder.serialize(rows).unwrap();
        let decoded = decoder.flush().unwrap().unwrap();

        assert_eq!(batch, decoded);
    }
}