Skip to main content

parquet_lite/
schema.rs

1use crate::types::*;
2
3/// Schema definition for a single column
4#[derive(Debug, Clone)]
5pub struct ColumnSchema {
6    pub name: String,
7    pub physical_type: ParquetType,
8    pub logical_type: LogicalType,
9    pub encoding: Encoding,
10    pub compression: Compression,
11    pub required: bool,
12}
13
/// Logical type annotations that give semantic meaning to physical types.
///
/// Besides `PartialEq`, the type is `Eq` and `Hash` (every payload is an
/// integer or a unit-only enum), so it can be used as a `HashMap` key or
/// stored in a `HashSet`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LogicalType {
    /// String annotation.
    String,
    /// Integer annotation.
    Integer,
    /// Floating-point annotation.
    Float,
    /// Boolean annotation.
    Boolean,
    /// Timestamp annotation carrying its precision unit.
    Timestamp(TimestampUnit),
    /// Date annotation.
    Date,
    /// Decimal annotation carrying its precision and scale.
    Decimal { precision: u8, scale: u8 },
}

/// Timestamp precision units.
///
/// `Eq` and `Hash` are derived so that [`LogicalType`], which embeds this
/// enum, can derive them as well.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TimestampUnit {
    /// Millisecond precision.
    Millis,
    /// Microsecond precision.
    Micros,
    /// Nanosecond precision.
    Nanos,
}
33
34/// Fluent builder for constructing column schemas
35pub struct SchemaBuilder {
36    columns: Vec<ColumnSchema>,
37}
38
39impl SchemaBuilder {
40    pub fn new() -> Self {
41        SchemaBuilder {
42            columns: Vec::new(),
43        }
44    }
45
46    /// Add a required column with specified types
47    pub fn add_column(
48        mut self,
49        name: impl Into<String>,
50        physical_type: ParquetType,
51        logical_type: LogicalType,
52    ) -> Self {
53        self.columns.push(ColumnSchema {
54            name: name.into(),
55            physical_type,
56            logical_type,
57            encoding: Encoding::Plain,
58            compression: Compression::Uncompressed,
59            required: true,
60        });
61        self
62    }
63
64    /// Add an optional (nullable) column
65    pub fn add_optional_column(
66        mut self,
67        name: impl Into<String>,
68        physical_type: ParquetType,
69        logical_type: LogicalType,
70    ) -> Self {
71        self.columns.push(ColumnSchema {
72            name: name.into(),
73            physical_type,
74            logical_type,
75            encoding: Encoding::Plain,
76            compression: Compression::Uncompressed,
77            required: false,
78        });
79        self
80    }
81
82    /// Set compression on the most recently added column
83    pub fn with_compression(mut self, compression: Compression) -> Self {
84        if let Some(col) = self.columns.last_mut() {
85            col.compression = compression;
86        }
87        self
88    }
89
90    /// Set encoding on the most recently added column
91    pub fn with_encoding(mut self, encoding: Encoding) -> Self {
92        if let Some(col) = self.columns.last_mut() {
93            col.encoding = encoding;
94        }
95        self
96    }
97
98    /// Consume the builder and return the column schemas
99    pub fn build(self) -> Vec<ColumnSchema> {
100        self.columns
101    }
102}
103
104impl Default for SchemaBuilder {
105    fn default() -> Self {
106        Self::new()
107    }
108}
109
110impl ColumnSchema {
111    /// Create a column schema with sensible defaults inferred from the physical type
112    pub fn default_for_type(name: impl Into<String>, parquet_type: ParquetType) -> Self {
113        let logical_type = match parquet_type {
114            ParquetType::Boolean => LogicalType::Boolean,
115            ParquetType::Int32 => LogicalType::Integer,
116            ParquetType::Int64 => LogicalType::Integer,
117            ParquetType::Float => LogicalType::Float,
118            ParquetType::Double => LogicalType::Float,
119            ParquetType::ByteArray => LogicalType::String,
120            ParquetType::FixedLenByteArray(_) => LogicalType::String,
121            ParquetType::Int96 => LogicalType::Timestamp(TimestampUnit::Nanos),
122        };
123
124        ColumnSchema {
125            name: name.into(),
126            physical_type: parquet_type,
127            logical_type,
128            encoding: Encoding::Plain,
129            compression: Compression::Uncompressed,
130            required: true,
131        }
132    }
133}
134
#[cfg(test)]
mod tests {
    use super::*;

    /// Chains three columns through the builder and checks their order, the
    /// `required` flag, and that compression lands on the last column added.
    #[test]
    fn test_schema_builder() {
        let builder = SchemaBuilder::new()
            .add_column("id", ParquetType::Int64, LogicalType::Integer)
            .add_column("name", ParquetType::ByteArray, LogicalType::String)
            .add_optional_column("score", ParquetType::Double, LogicalType::Float)
            .with_compression(Compression::Snappy);
        let columns = builder.build();

        assert_eq!(columns.len(), 3);

        let id = &columns[0];
        assert_eq!(id.name, "id");
        assert!(id.required);

        let score = &columns[2];
        assert_eq!(score.name, "score");
        assert!(!score.required);
        assert_eq!(score.compression, Compression::Snappy);
    }

    /// Checks the logical type inferred for an `Int32` and a `ByteArray`
    /// column, and that inferred columns are required.
    #[test]
    fn test_default_for_type() {
        let age = ColumnSchema::default_for_type("age", ParquetType::Int32);
        assert_eq!(age.logical_type, LogicalType::Integer);
        assert!(age.required);

        let name = ColumnSchema::default_for_type("name", ParquetType::ByteArray);
        assert_eq!(name.logical_type, LogicalType::String);
    }
}