polars_parquet/parquet/
types.rs

1use arrow::types::{
2    AlignedBytes, AlignedBytesCast, Bytes12Alignment4, Bytes4Alignment4, Bytes8Alignment8,
3};
4
5use crate::parquet::schema::types::PhysicalType;
6
7/// A physical native representation of a Parquet fixed-sized type.
8pub trait NativeType:
9    std::fmt::Debug + Send + Sync + 'static + Copy + Clone + AlignedBytesCast<Self::AlignedBytes>
10{
11    type Bytes: AsRef<[u8]>
12        + bytemuck::Pod
13        + IntoIterator<Item = u8>
14        + for<'a> TryFrom<&'a [u8], Error = std::array::TryFromSliceError>
15        + std::fmt::Debug
16        + Clone
17        + Copy;
18    type AlignedBytes: AlignedBytes<Unaligned = Self::Bytes> + From<Self> + Into<Self>;
19
20    fn to_le_bytes(&self) -> Self::Bytes;
21
22    fn from_le_bytes(bytes: Self::Bytes) -> Self;
23
24    fn ord(&self, other: &Self) -> std::cmp::Ordering;
25
26    const TYPE: PhysicalType;
27}
28
/// Implements [`NativeType`] for a primitive numeric type.
///
/// * `$native` - the primitive type receiving the implementation.
/// * `$aligned` - the type used as `NativeType::AlignedBytes`.
/// * `$physical` - the `PhysicalType` expression stored in `NativeType::TYPE`.
macro_rules! native {
    ($native:ty, $aligned:ty, $physical:expr) => {
        impl NativeType for $native {
            const TYPE: PhysicalType = $physical;

            type Bytes = [u8; size_of::<Self>()];
            type AlignedBytes = $aligned;

            /// Delegates to the primitive's inherent `to_le_bytes`.
            #[inline]
            fn to_le_bytes(&self) -> Self::Bytes {
                <$native>::to_le_bytes(*self)
            }

            /// Delegates to the primitive's inherent `from_le_bytes`.
            #[inline]
            fn from_le_bytes(bytes: Self::Bytes) -> Self {
                <$native>::from_le_bytes(bytes)
            }

            /// Orders two values through `PartialOrd`.
            #[inline]
            fn ord(&self, other: &Self) -> std::cmp::Ordering {
                match self.partial_cmp(other) {
                    Some(ordering) => ordering,
                    // Operands without a defined order (e.g. a float NaN)
                    // are reported as equal.
                    None => std::cmp::Ordering::Equal,
                }
            }
        }
    };
}
54
// Implement `NativeType` for the primitive numeric types. Each invocation
// lists the Rust type, the type used as its `AlignedBytes`, and the
// `PhysicalType` stored in its `TYPE` constant.
native!(i32, Bytes4Alignment4, PhysicalType::Int32);
native!(i64, Bytes8Alignment8, PhysicalType::Int64);
native!(f32, Bytes4Alignment4, PhysicalType::Float);
native!(f64, Bytes8Alignment8, PhysicalType::Double);
59
60impl NativeType for [u32; 3] {
61    const TYPE: PhysicalType = PhysicalType::Int96;
62
63    type Bytes = [u8; size_of::<Self>()];
64    type AlignedBytes = Bytes12Alignment4;
65
66    #[inline]
67    fn to_le_bytes(&self) -> Self::Bytes {
68        let mut bytes = [0; 12];
69        let first = self[0].to_le_bytes();
70        bytes[0] = first[0];
71        bytes[1] = first[1];
72        bytes[2] = first[2];
73        bytes[3] = first[3];
74        let second = self[1].to_le_bytes();
75        bytes[4] = second[0];
76        bytes[5] = second[1];
77        bytes[6] = second[2];
78        bytes[7] = second[3];
79        let third = self[2].to_le_bytes();
80        bytes[8] = third[0];
81        bytes[9] = third[1];
82        bytes[10] = third[2];
83        bytes[11] = third[3];
84        bytes
85    }
86
87    #[inline]
88    fn from_le_bytes(bytes: Self::Bytes) -> Self {
89        let mut first = [0; 4];
90        first[0] = bytes[0];
91        first[1] = bytes[1];
92        first[2] = bytes[2];
93        first[3] = bytes[3];
94        let mut second = [0; 4];
95        second[0] = bytes[4];
96        second[1] = bytes[5];
97        second[2] = bytes[6];
98        second[3] = bytes[7];
99        let mut third = [0; 4];
100        third[0] = bytes[8];
101        third[1] = bytes[9];
102        third[2] = bytes[10];
103        third[3] = bytes[11];
104        [
105            u32::from_le_bytes(first),
106            u32::from_le_bytes(second),
107            u32::from_le_bytes(third),
108        ]
109    }
110
111    #[inline]
112    fn ord(&self, other: &Self) -> std::cmp::Ordering {
113        int96_to_i64_ns(*self).ord(&int96_to_i64_ns(*other))
114    }
115}
116
/// Converts an INT96 value into nanoseconds relative to the Unix epoch.
///
/// `value[2]` is read as a Julian day number, and `value[1]` / `value[0]` as
/// the high / low 32 bits of the nanoseconds elapsed within that day.
///
/// The result is computed with wrapping arithmetic: a day far enough from the
/// epoch that the nanosecond count does not fit in an `i64` wraps around
/// instead of panicking in overflow-checked builds, so malformed input cannot
/// abort the caller and all build profiles agree on the result.
#[inline]
pub fn int96_to_i64_ns(value: [u32; 3]) -> i64 {
    const JULIAN_DAY_OF_EPOCH: i64 = 2_440_588;
    const SECONDS_PER_DAY: i64 = 86_400;
    const NANOS_PER_SECOND: i64 = 1_000_000_000;

    let day = i64::from(value[2]);
    // The two halves occupy disjoint bits, so OR-ing them equals adding them.
    let nanoseconds = (i64::from(value[1]) << 32) | i64::from(value[0]);
    // `day` fits in 32 bits, so this product cannot overflow an `i64`.
    let seconds = (day - JULIAN_DAY_OF_EPOCH) * SECONDS_PER_DAY;

    seconds
        .wrapping_mul(NANOS_PER_SECOND)
        .wrapping_add(nanoseconds)
}
129
/// Returns the ordering of two binary values.
///
/// The comparison is lexicographic over unsigned bytes: the first differing
/// byte decides the result, and when one value is a prefix of the other the
/// shorter value orders first. Two values are `Equal` only when they have the
/// same length and the same bytes.
pub fn ord_binary<'a>(a: &'a [u8], b: &'a [u8]) -> std::cmp::Ordering {
    // `Ord` for `[u8]` is exactly this comparison, including the length
    // tie-break when all shared positions are equal.
    a.cmp(b)
}
148
149#[inline]
150pub fn decode<T: NativeType>(chunk: &[u8]) -> T {
151    assert!(chunk.len() >= size_of::<<T as NativeType>::Bytes>());
152    unsafe { decode_unchecked(chunk) }
153}
154
155/// Convert a Little-Endian byte-slice into the `T`
156///
157/// # Safety
158///
159/// This is safe if the length is properly checked.
160#[inline]
161pub unsafe fn decode_unchecked<T: NativeType>(chunk: &[u8]) -> T {
162    let chunk: <T as NativeType>::Bytes = unsafe { chunk.try_into().unwrap_unchecked() };
163    T::from_le_bytes(chunk)
164}