orc-format 0.2.0

Unofficial implementation of Apache ORC spec in safe Rust
Documentation
import random

import pyorc

data = {
    "a": [1.0, 2.0, None, 4.0, 5.0],
    "b": [True, False, None, True, False],
    "str_direct": ["a", "cccccc", None, "ddd", "ee"],
    "d": ["a", "bb", None, "ccc", "ddd"],
    "e": ["ddd", "cc", None, "bb", "a"],
    "f": ["aaaaa", "bbbbb", None, "ccccc", "ddddd"],
    "int_short_repeated": [5, 5, None, 5, 5],
    "int_neg_short_repeated": [-5, -5, None, -5, -5],
    "int_delta": [1, 2, None, 4, 5],
    "int_neg_delta": [5, 4, None, 2, 1],
    "int_direct": [1, 6, None, 3, 2],
    "int_neg_direct": [-1, -6, None, -3, -2],
}


def _write(
    schema: str,
    data,
    file_name: str,
    compression=pyorc.CompressionKind.NONE,
    dict_key_size_threshold=0.0,
):
    output = open(file_name, "wb")
    writer = pyorc.Writer(
        output,
        schema,
        dict_key_size_threshold=dict_key_size_threshold,
        # use a small number to ensure that compression crosses value boundaries
        compression_block_size=32,
        compression=compression,
    )
    num_rows = len(list(data.values())[0])
    for x in range(num_rows):
        row = tuple(values[x] for values in data.values())
        writer.write(row)
    writer.close()

    with open(file_name, "rb") as f:
        reader = pyorc.Reader(f)
        list(reader)


_write(
    "struct<a:float,b:boolean,str_direct:string,d:string,e:string,f:string,int_short_repeated:int,int_neg_short_repeated:int,int_delta:int,int_neg_delta:int,int_direct:int,int_neg_direct:int>",
    data,
    "test.orc",
)

data_boolean = {
    "long": [True] * 32,
}

_write("struct<long:boolean>", data_boolean, "long_bool.orc")

_write("struct<long:boolean>", data_boolean, "long_bool_gzip.orc", pyorc.CompressionKind.ZLIB)

data_dict = {
    "dict": ["abcd", "efgh"] * 32,
}

_write("struct<dict:string>", data_dict, "string_long.orc")

data_dict = {
    "dict": ["abc", "efgh"] * 32,
}

_write("struct<dict:string>", data_dict, "string_dict.orc", dict_key_size_threshold=0.1)

_write("struct<dict:string>", data_dict, "string_dict_gzip.orc", pyorc.CompressionKind.ZLIB)

data_dict = {
    "dict": ["abcd", "efgh"] * (10**4 // 2),
}

_write("struct<dict:string>", data_dict, "string_long_long.orc")
_write("struct<dict:string>", data_dict, "string_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)

long_f32 = {
    "dict": [random.uniform(0, 1) for _ in range(10**6)],
}

_write("struct<dict:float>", long_f32, "f32_long_long_gzip.orc", pyorc.CompressionKind.ZLIB)