// connector_arrow 0.12.0
//
// Load data from databases to Apache Arrow, the fastest way.
// Documentation
#![allow(dead_code)]

use arrow::datatypes::*;
use std::sync::Arc;

/// A list of column specifications.
///
/// NOTE(review): presumably describes the columns of one generated Arrow
/// batch — confirm against the code that consumes this type.
pub type ArrowGenSpec = Vec<ColumnSpec>;

/// Strategy used to produce the values of one segment of a generated column.
///
/// The generator that interprets these variants is not part of this file, so
/// the per-variant notes below are assumptions to be confirmed against it.
///
/// `Debug`, `PartialEq` and `Eq` are derived so that specs can be printed and
/// compared in test assertions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ValueGenProcess {
    /// Presumably produces null values. `domains_to_batch_spec` replaces this
    /// with [`ValueGenProcess::RandomUniform`] for non-nullable columns.
    Null,
    /// Presumably the lowest value of the column's data type — TODO confirm.
    Low,
    /// Presumably the highest value of the column's data type — TODO confirm.
    High,
    /// Presumably a "unit" (e.g. one-like) value of the data type — TODO confirm.
    Unit,
    /// Presumably values drawn uniformly at random — TODO confirm.
    RandomUniform,
}

/// Every [`ValueGenProcess`] variant, used as the value-generation domain by
/// most spec functions in this file.
///
/// The order here (`Low`, `High`, `Unit`, `Null`, `RandomUniform`) differs
/// from the enum's declaration order. `domains_to_batch_spec` walks the
/// domain in order, so this is the order of the resulting
/// `ColumnSpec::values` entries.
const VALUE_GEN_PROCESS_ALL: [ValueGenProcess; 5] = [
    ValueGenProcess::Low,
    ValueGenProcess::High,
    ValueGenProcess::Unit,
    ValueGenProcess::Null,
    ValueGenProcess::RandomUniform,
];

/// One segment of a column's values: which generation process to use and a
/// repeat count.
pub struct ValuesSpec {
    /// Process used to generate the values of this segment.
    pub gen_process: ValueGenProcess,
    /// Repeat count for this segment. `domains_to_batch_spec` always sets it
    /// to 1. NOTE(review): presumably the number of values to generate with
    /// `gen_process` — confirm against the generator.
    pub repeat: usize,
}

/// Specification of a single column: its name, nullability, Arrow data type
/// and the ordered list of value segments it should contain.
pub struct ColumnSpec {
    /// Name of the field. `domains_to_batch_spec` derives it from the data
    /// type's display string and appends `_null` for nullable columns.
    pub field_name: String,
    /// Whether the column is declared nullable.
    pub is_nullable: bool,
    /// Arrow data type of the column.
    pub data_type: DataType,
    /// Value segments, in order.
    pub values: Vec<ValuesSpec>,
}

/// Column specs for the null, boolean, integer and floating-point data types.
///
/// Each type is emitted as a non-nullable and a nullable column (the `Null`
/// type only as nullable), and every column carries a single
/// [`ValueGenProcess::High`] value entry.
pub fn basic_types() -> Vec<ColumnSpec> {
    let data_types = [
        DataType::Null,
        DataType::Boolean,
        DataType::Int8,
        DataType::Int16,
        DataType::Int32,
        DataType::Int64,
        DataType::UInt8,
        DataType::UInt16,
        DataType::UInt32,
        DataType::UInt64,
        DataType::Float16,
        DataType::Float32,
        DataType::Float64,
        // Temporal types are currently left out of this spec:
        // DataType::Timestamp(TimeUnit::Nanosecond, None),
        // DataType::Timestamp(TimeUnit::Microsecond, None),
        // DataType::Timestamp(TimeUnit::Millisecond, None),
        // DataType::Timestamp(TimeUnit::Second, None),
        // DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from("+07:30"))),
        // DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("+07:30"))),
        // DataType::Timestamp(TimeUnit::Millisecond, Some(Arc::from("+07:30"))),
        // DataType::Timestamp(TimeUnit::Second, Some(Arc::from("+07:30"))),
        // DataType::Time32(TimeUnit::Millisecond),
        // DataType::Time32(TimeUnit::Second),
        // DataType::Time64(TimeUnit::Nanosecond),
        // DataType::Time64(TimeUnit::Microsecond),
        // DataType::Duration(TimeUnit::Nanosecond),
        // DataType::Duration(TimeUnit::Microsecond),
        // DataType::Duration(TimeUnit::Millisecond),
        // DataType::Duration(TimeUnit::Second),
        // DataType::Interval(IntervalUnit::YearMonth),
        // DataType::Interval(IntervalUnit::MonthDayNano),
        // DataType::Interval(IntervalUnit::DayTime),
    ];
    let nullability = [false, true];
    let gen_processes = [ValueGenProcess::High];

    domains_to_batch_spec(&data_types, &nullability, &gen_processes)
}

pub fn empty() -> Vec<ColumnSpec> {
    domains_to_batch_spec(
        &[DataType::Null, DataType::Int64, DataType::Float64],
        &[false, true],
        &[],
    )
}

/// Column specs for the `Null` and `Boolean` data types, using every value
/// generation process.
pub fn null_bool() -> Vec<ColumnSpec> {
    let data_types = [DataType::Null, DataType::Boolean];
    let nullability = [false, true];

    domains_to_batch_spec(&data_types, &nullability, &VALUE_GEN_PROCESS_ALL)
}

/// Column specs for the signed integer data types (8 to 64 bits), each as a
/// non-nullable and a nullable column, using every value generation process.
pub fn int() -> Vec<ColumnSpec> {
    let signed = [
        DataType::Int8,
        DataType::Int16,
        DataType::Int32,
        DataType::Int64,
    ];
    let nullability = [false, true];

    domains_to_batch_spec(&signed, &nullability, &VALUE_GEN_PROCESS_ALL)
}

/// Column specs for the unsigned integer data types (8 to 64 bits), each as a
/// non-nullable and a nullable column, using every value generation process.
pub fn uint() -> Vec<ColumnSpec> {
    let unsigned = [
        DataType::UInt8,
        DataType::UInt16,
        DataType::UInt32,
        DataType::UInt64,
    ];
    let nullability = [false, true];

    domains_to_batch_spec(&unsigned, &nullability, &VALUE_GEN_PROCESS_ALL)
}

/// Column specs for the 16-, 32- and 64-bit floating-point data types, each
/// as a non-nullable and a nullable column, using every value generation
/// process.
pub fn float() -> Vec<ColumnSpec> {
    let floats = [DataType::Float16, DataType::Float32, DataType::Float64];
    let nullability = [false, true];

    domains_to_batch_spec(&floats, &nullability, &VALUE_GEN_PROCESS_ALL)
}

/// Column specs for 128- and 256-bit decimals.
///
/// For each width three (precision, scale) combinations are covered: a
/// mid-range one, maximum precision with zero scale, and maximum precision
/// with maximum scale.
pub fn decimal() -> Vec<ColumnSpec> {
    let decimals = [
        // 128-bit
        DataType::Decimal128(15, 4),
        DataType::Decimal128(Decimal128Type::MAX_PRECISION, 0),
        DataType::Decimal128(Decimal128Type::MAX_PRECISION, Decimal128Type::MAX_SCALE),
        // 256-bit
        DataType::Decimal256(45, 12),
        DataType::Decimal256(Decimal256Type::MAX_PRECISION, 0),
        DataType::Decimal256(Decimal256Type::MAX_PRECISION, Decimal256Type::MAX_SCALE),
    ];
    let nullability = [false, true];

    domains_to_batch_spec(&decimals, &nullability, &VALUE_GEN_PROCESS_ALL)
}

/// Column specs for timestamps in every time unit, first without a timezone
/// and then with the fixed offset `+07:30`.
pub fn timestamp() -> Vec<ColumnSpec> {
    // Fixed UTC offset used for the timezone-aware variants.
    const OFFSET: &str = "+07:30";

    let timestamps = [
        // Without timezone.
        DataType::Timestamp(TimeUnit::Nanosecond, None),
        DataType::Timestamp(TimeUnit::Microsecond, None),
        DataType::Timestamp(TimeUnit::Millisecond, None),
        DataType::Timestamp(TimeUnit::Second, None),
        // With timezone.
        DataType::Timestamp(TimeUnit::Nanosecond, Some(Arc::from(OFFSET))),
        DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from(OFFSET))),
        DataType::Timestamp(TimeUnit::Millisecond, Some(Arc::from(OFFSET))),
        DataType::Timestamp(TimeUnit::Second, Some(Arc::from(OFFSET))),
    ];
    let nullability = [false, true];

    domains_to_batch_spec(&timestamps, &nullability, &VALUE_GEN_PROCESS_ALL)
}

/// Column specs for the `Date32` and `Date64` data types, each as a
/// non-nullable and a nullable column, using every value generation process.
pub fn date() -> Vec<ColumnSpec> {
    let dates = [DataType::Date32, DataType::Date64];
    let nullability = [false, true];

    domains_to_batch_spec(&dates, &nullability, &VALUE_GEN_PROCESS_ALL)
}

/// Column specs for time-of-day types: `Time32` in milliseconds and seconds,
/// and `Time64` in nanoseconds and microseconds.
pub fn time() -> Vec<ColumnSpec> {
    let times = [
        DataType::Time32(TimeUnit::Millisecond),
        DataType::Time32(TimeUnit::Second),
        DataType::Time64(TimeUnit::Nanosecond),
        DataType::Time64(TimeUnit::Microsecond),
    ];
    let nullability = [false, true];

    domains_to_batch_spec(&times, &nullability, &VALUE_GEN_PROCESS_ALL)
}

/// Column specs for durations in every time unit, each as a non-nullable and
/// a nullable column, using every value generation process.
pub fn duration() -> Vec<ColumnSpec> {
    let durations = [
        DataType::Duration(TimeUnit::Nanosecond),
        DataType::Duration(TimeUnit::Microsecond),
        DataType::Duration(TimeUnit::Millisecond),
        DataType::Duration(TimeUnit::Second),
    ];
    let nullability = [false, true];

    domains_to_batch_spec(&durations, &nullability, &VALUE_GEN_PROCESS_ALL)
}

/// Column specs for the year-month, month-day-nano and day-time interval
/// types, each as a non-nullable and a nullable column, using every value
/// generation process.
pub fn interval() -> Vec<ColumnSpec> {
    let intervals = [
        DataType::Interval(IntervalUnit::YearMonth),
        DataType::Interval(IntervalUnit::MonthDayNano),
        DataType::Interval(IntervalUnit::DayTime),
    ];
    let nullability = [false, true];

    domains_to_batch_spec(&intervals, &nullability, &VALUE_GEN_PROCESS_ALL)
}

/// Column specs for the `Utf8` data type, as a non-nullable and a nullable
/// column, using every value generation process.
pub fn utf8() -> Vec<ColumnSpec> {
    let strings = [DataType::Utf8];
    let nullability = [false, true];

    domains_to_batch_spec(&strings, &nullability, &VALUE_GEN_PROCESS_ALL)
}

/// Column specs for both the `Utf8` and `LargeUtf8` data types, each as a
/// non-nullable and a nullable column, using every value generation process.
pub fn utf8_large() -> Vec<ColumnSpec> {
    let strings = [DataType::Utf8, DataType::LargeUtf8];
    let nullability = [false, true];

    domains_to_batch_spec(&strings, &nullability, &VALUE_GEN_PROCESS_ALL)
}

/// Column specs for `Binary` followed by fixed-size binaries of width 1
/// through 4, each as a non-nullable and a nullable column, using every value
/// generation process.
pub fn binary() -> Vec<ColumnSpec> {
    let mut data_types = vec![DataType::Binary];
    // Fixed-size widths 1, 2, 3, 4 — in that order.
    data_types.extend((1..=4).map(DataType::FixedSizeBinary));

    domains_to_batch_spec(&data_types, &[false, true], &VALUE_GEN_PROCESS_ALL)
}

/// Column specs for `Binary`, `LargeBinary` and fixed-size binaries of widths
/// 15, 1, 2, 3, 4, 5 and 0 (in that order), each as a non-nullable and a
/// nullable column, using every value generation process.
pub fn binary_large() -> Vec<ColumnSpec> {
    let mut data_types = vec![DataType::Binary, DataType::LargeBinary];
    for width in [15, 1, 2, 3, 4, 5, 0] {
        data_types.push(DataType::FixedSizeBinary(width));
    }

    domains_to_batch_spec(&data_types, &[false, true], &VALUE_GEN_PROCESS_ALL)
}

fn domains_to_batch_spec(
    data_types_domain: &[DataType],
    is_nullable_domain: &[bool],
    value_gen_process_domain: &[ValueGenProcess],
) -> Vec<ColumnSpec> {
    let mut columns = Vec::new();
    for data_type in data_types_domain {
        for is_nullable in is_nullable_domain {
            let is_nullable = *is_nullable;
            if matches!(data_type, &DataType::Null) && !is_nullable {
                continue;
            }

            let mut field_name = data_type
                .to_string()
                .replace("(", "_")
                .replace(")", "_")
                .replace(", ", "_")
                .replace("\"", "")
                .replace(":", "")
                .replace("+", "");
            if is_nullable {
                field_name += "_null";
            }
            let mut col = ColumnSpec {
                field_name,
                data_type: data_type.clone(),
                is_nullable,
                values: Vec::new(),
            };

            for gen_process in value_gen_process_domain {
                col.values.push(ValuesSpec {
                    gen_process: if matches!(gen_process, ValueGenProcess::Null) && !is_nullable {
                        ValueGenProcess::RandomUniform
                    } else {
                        *gen_process
                    },
                    repeat: 1,
                });
            }
            columns.push(col);
        }
    }
    columns
}