parquet_lite/lib.rs
1//! # parquet-lite
2//!
3//! A lightweight, pure-Rust alternative to the official Apache Parquet crate.
4//!
5//! Designed for projects where the full `parquet` crate is overkill.
6//! `parquet-lite` provides read-path essentials in a fraction of the size,
7//! with zero unsafe code and full WASM compatibility.
8//!
//! ## Key Differences from the Official Crate
10//!
11//! | Feature | `parquet` (official) | `parquet-lite` |
12//! |------------------------|----------------------|--------------------|
13//! | Binary size | Large | Small |
14//! | Dependencies | ~80 | ~15 |
15//! | Thrift dependency | Yes | No (hand-rolled) |
16//! | Read support | Full | Flat schemas |
17//! | Write support | Yes | Not yet |
18//! | Arrow integration | Yes | Yes |
19//! | WASM compatible | Partial | Full |
20//!
21//! ## Quick Start
22//!
23//! ```rust,no_run
24//! use parquet_lite::*;
25//! use std::fs;
26//!
27//! let data = fs::read("data.parquet").unwrap();
28//!
29//! // Read metadata
30//! let metadata = read_metadata(&data).unwrap();
31//! println!("Rows: {}, Columns: {}", metadata.num_rows, metadata.num_columns);
32//!
33//! // Read as Arrow batches
34//! let batches = read_to_arrow_batches(&data, 1024).unwrap();
//! for batch in batches {
//!     let batch = batch.unwrap();
//!     println!("Batch: {} rows", batch.num_rows());
//! }
39//! ```
40//!
41//! ## Feature Flags
42//!
43//! | Feature | Default | Description |
44//! |----------|---------|------------------------------------------|
45//! | `snappy` | ✅ | Snappy compression/decompression |
46//! | `serde` | ❌ | Serde serialization for metadata |
47//! | `wasm` | ❌ | WASM bindings via wasm-bindgen |
48//! | `full` | ❌ | All features enabled |
49
50pub mod types;
51pub mod schema;
52pub mod metadata;
53pub mod codecs;
54pub mod reader;
55pub mod arrow_convert;
56pub mod batch_iter_advanced;
57pub mod streaming_reader;
58pub mod statistics;
59
60#[cfg(target_arch = "wasm32")]
61pub mod wasm;
62
63// Re-export key types for convenience
64pub use types::{
65 Compression, Encoding, ParquetError, ParquetMetadata, ParquetType, Result,
66 ColumnMetadata, RowGroupMetadata,
67};
68pub use schema::{ColumnSchema, LogicalType, SchemaBuilder, TimestampUnit};
69pub use reader::{ColumnData, ParquetReader};
70pub use arrow_convert::ArrowConverter;
71pub use batch_iter_advanced::SelectiveBatchIterator;
72pub use streaming_reader::StreamingParquetReader;
73pub use statistics::{ColumnStatistics, StatisticsCollector};
74
75// ---------------------------------------------------------------------------
76// Top-level convenience API
77// ---------------------------------------------------------------------------
78
79/// Parse metadata from raw Parquet file bytes.
80///
81/// This is a lightweight operation — only the footer is parsed,
82/// no column data is read.
83pub fn read_metadata(data: &[u8]) -> Result<ParquetMetadata> {
84 metadata::MetadataReader::read_metadata(data)
85}
86
87/// Create a batch iterator over Arrow RecordBatches.
88///
89/// Reads all columns and yields batches of `batch_size` rows.
90pub fn read_to_arrow_batches(
91 data: &[u8],
92 batch_size: usize,
93) -> Result<SelectiveBatchIterator> {
94 let reader = ParquetReader::new(data)?;
95 Ok(SelectiveBatchIterator::new(reader, batch_size))
96}
97
98/// Create a batch iterator reading only the specified columns.
99pub fn read_columns_to_arrow_batches(
100 data: &[u8],
101 batch_size: usize,
102 columns: Vec<usize>,
103) -> Result<SelectiveBatchIterator> {
104 let reader = ParquetReader::new(data)?;
105 Ok(SelectiveBatchIterator::new(reader, batch_size).with_columns(columns))
106}
107
108/// Print a formatted statistics summary to stdout.
109pub fn print_stats(data: &[u8]) -> Result<()> {
110 let metadata = read_metadata(data)?;
111 StatisticsCollector::print_summary(&metadata);
112 Ok(())
113}
114
#[cfg(test)]
mod tests {
    use super::*;

    /// Arbitrary non-Parquet bytes must be rejected by `read_metadata`.
    #[test]
    fn test_invalid_magic() {
        assert!(read_metadata(b"NOT_PARQUET_DATA_AT_ALL!!").is_err());
    }

    /// A buffer holding only the four bytes `PAR1` must be rejected.
    #[test]
    fn test_too_small() {
        assert!(read_metadata(b"PAR1").is_err());
    }

    /// The builder keeps columns in insertion order and records optionality.
    #[test]
    fn test_schema_builder_integration() {
        let builder = SchemaBuilder::new()
            .add_column("id", ParquetType::Int64, LogicalType::Integer)
            .add_column("name", ParquetType::ByteArray, LogicalType::String)
            .add_optional_column("value", ParquetType::Double, LogicalType::Float)
            .with_compression(Compression::Snappy);
        let schema = builder.build();

        assert_eq!(schema.len(), 3);
        // Same three name checks as before, in the same order.
        for (position, expected_name) in [(0, "id"), (1, "name"), (2, "value")] {
            assert_eq!(schema[position].name, expected_name);
        }
        assert!(!schema[2].required);
    }
}
148}