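//! `parquet_lite/types.rs`
//!
//! Physical type, encoding, compression and page-type enums, the crate error
//! type, and the column / row-group / file metadata structs.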

use thiserror::Error;
2
/// Core Parquet physical types.
///
/// These are the storage-level types a column chunk can hold. Values are
/// normally obtained from a Thrift type code via `ParquetType::from_thrift`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ParquetType {
    /// Boolean values.
    Boolean,
    /// 32-bit integers.
    Int32,
    /// 64-bit integers.
    Int64,
    /// 96-bit values.
    Int96,
    /// Single-precision floating point.
    Float,
    /// Double-precision floating point.
    Double,
    /// Variable-length byte arrays.
    ByteArray,
    /// Fixed-length byte arrays. The payload is the declared length;
    /// `from_thrift` yields a placeholder of `0` that is expected to be
    /// filled in later.
    FixedLenByteArray(i32),
}
15
16impl ParquetType {
17    /// Convert from the Thrift type code to our enum
18    pub fn from_thrift(code: i32) -> Result<Self> {
19        match code {
20            0 => Ok(ParquetType::Boolean),
21            1 => Ok(ParquetType::Int32),
22            2 => Ok(ParquetType::Int64),
23            3 => Ok(ParquetType::Int96),
24            4 => Ok(ParquetType::Float),
25            5 => Ok(ParquetType::Double),
26            6 => Ok(ParquetType::ByteArray),
27            7 => Ok(ParquetType::FixedLenByteArray(0)), // length set later
28            _ => Err(ParquetError::UnsupportedType(format!(
29                "Unknown physical type code: {code}"
30            ))),
31        }
32    }
33}
34
/// Encoding types supported by Parquet.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Encoding {
    /// Plain (unencoded) values.
    Plain,
    /// Run-length / bit-packed encoding.
    RleBitPacked,
    /// Delta encoding with binary packing.
    DeltaBinaryPacked,
    /// Delta-encoded lengths for byte arrays.
    DeltaLengthByteArray,
    /// Delta (prefix) encoding for byte arrays.
    DeltaByteArray,
}
44
45impl Encoding {
46    /// Convert from the Thrift encoding code
47    pub fn from_thrift(code: i32) -> Result<Self> {
48        match code {
49            0 => Ok(Encoding::Plain),
50            4 => Ok(Encoding::RleBitPacked),
51            5 => Ok(Encoding::DeltaBinaryPacked),
52            6 => Ok(Encoding::DeltaLengthByteArray),
53            7 => Ok(Encoding::DeltaByteArray),
54            // Treat unknown encodings as Plain for forward compat
55            _ => Ok(Encoding::Plain),
56        }
57    }
58}
59
/// Compression codecs supported by Parquet.
///
/// Variant order matches the Thrift codes `0..=6` accepted by
/// `Compression::from_thrift`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Compression {
    /// No compression.
    Uncompressed,
    /// Snappy codec.
    Snappy,
    /// Gzip codec.
    Gzip,
    /// LZO codec.
    Lzo,
    /// Brotli codec.
    Brotli,
    /// LZ4 codec.
    Lz4,
    /// Zstandard codec.
    Zstd,
}
71
72impl Compression {
73    /// Convert from the Thrift compression code
74    pub fn from_thrift(code: i32) -> Result<Self> {
75        match code {
76            0 => Ok(Compression::Uncompressed),
77            1 => Ok(Compression::Snappy),
78            2 => Ok(Compression::Gzip),
79            3 => Ok(Compression::Lzo),
80            4 => Ok(Compression::Brotli),
81            5 => Ok(Compression::Lz4),
82            6 => Ok(Compression::Zstd),
83            _ => Err(ParquetError::UnsupportedCompression(format!(
84                "Unknown compression code: {code}"
85            ))),
86        }
87    }
88}
89
/// Page types within a column chunk.
///
/// Variant order matches the Thrift codes `0..=3` accepted by
/// `PageType::from_thrift`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PageType {
    /// Data page.
    DataPage,
    /// Index page.
    IndexPage,
    /// Dictionary page.
    DictionaryPage,
    /// Data page, version 2 layout.
    DataPageV2,
}
98
99impl PageType {
100    pub fn from_thrift(code: i32) -> Result<Self> {
101        match code {
102            0 => Ok(PageType::DataPage),
103            1 => Ok(PageType::IndexPage),
104            2 => Ok(PageType::DictionaryPage),
105            3 => Ok(PageType::DataPageV2),
106            _ => Err(ParquetError::DataError(format!(
107                "Unknown page type: {code}"
108            ))),
109        }
110    }
111}
112
113/// Parquet error types
114#[derive(Debug, Error)]
115pub enum ParquetError {
116    #[error("IO error: {0}")]
117    IoError(#[from] std::io::Error),
118
119    #[error("Invalid Parquet file: {0}")]
120    InvalidFile(String),
121
122    #[error("Unsupported physical type: {0}")]
123    UnsupportedType(String),
124
125    #[error("Unsupported compression: {0}")]
126    UnsupportedCompression(String),
127
128    #[error("Unsupported encoding: {0}")]
129    UnsupportedEncoding(String),
130
131    #[error("Data error: {0}")]
132    DataError(String),
133
134    #[error("Arrow conversion error: {0}")]
135    ArrowError(String),
136
137    #[error("Column index {0} out of range")]
138    ColumnOutOfRange(usize),
139}
140
141/// Convenience Result type
142pub type Result<T> = std::result::Result<T, ParquetError>;
143
144/// Metadata for a single column chunk
145#[derive(Debug, Clone)]
146pub struct ColumnMetadata {
147    /// Column name extracted from schema
148    pub name: String,
149    /// Physical type of the column
150    pub physical_type: ParquetType,
151    /// Encoding used
152    pub encoding: Encoding,
153    /// Compression codec used
154    pub compression: Compression,
155    /// Number of values in this column chunk
156    pub num_values: i64,
157    /// Byte offset of the column chunk data in the file
158    pub data_offset: i64,
159    /// Total compressed size in bytes
160    pub total_compressed_size: i64,
161    /// Total uncompressed size in bytes
162    pub total_uncompressed_size: i64,
163}
164
165/// Metadata for a row group
166#[derive(Debug, Clone)]
167pub struct RowGroupMetadata {
168    /// Column chunks in this row group
169    pub columns: Vec<ColumnMetadata>,
170    /// Total number of rows
171    pub num_rows: i64,
172    /// Total byte size of the row group
173    pub total_byte_size: i64,
174}
175
176/// Top-level file metadata
177#[derive(Debug, Clone)]
178pub struct ParquetMetadata {
179    /// Parquet format version
180    pub version: i32,
181    /// Total number of rows across all row groups
182    pub num_rows: i64,
183    /// Number of columns
184    pub num_columns: usize,
185    /// Schema element names
186    pub schema_names: Vec<String>,
187    /// Row groups
188    pub row_groups: Vec<RowGroupMetadata>,
189    /// Flattened column metadata (first row group, for convenience)
190    pub columns: Vec<ColumnMetadata>,
191    /// Created by string
192    pub created_by: Option<String>,
193}