Skip to main content

parquet_lite/
lib.rs

1//! # parquet-lite
2//!
3//! A lightweight, pure-Rust alternative to the official Apache Parquet crate.
4//!
5//! Designed for projects where the full `parquet` crate is overkill.
6//! `parquet-lite` provides read-path essentials in a fraction of the size,
7//! with zero unsafe code and full WASM compatibility.
8//!
9//! ## Key Differences from Official Crate
10//!
//! | Feature                | `parquet` (official) | `parquet-lite`     |
//! |------------------------|----------------------|--------------------|
//! | Binary size            | Large                | Small              |
//! | Dependencies           | ~80                  | ~15                |
//! | Thrift dependency      | Yes                  | No (hand-rolled)   |
//! | Read support           | Full                 | Flat schemas       |
//! | Write support          | Yes                  | Not yet            |
//! | Arrow integration      | Yes                  | Yes                |
//! | WASM compatible        | Partial              | Full               |
20//!
21//! ## Quick Start
22//!
23//! ```rust,no_run
24//! use parquet_lite::*;
25//! use std::fs;
26//!
27//! let data = fs::read("data.parquet").unwrap();
28//!
29//! // Read metadata
30//! let metadata = read_metadata(&data).unwrap();
31//! println!("Rows: {}, Columns: {}", metadata.num_rows, metadata.num_columns);
32//!
33//! // Read as Arrow batches
34//! let batches = read_to_arrow_batches(&data, 1024).unwrap();
35//! for batch in batches {
36//!     let batch = batch.unwrap();
37//!     println!("Batch: {} rows", batch.num_rows());
38//! }
39//! ```
40//!
41//! ## Feature Flags
42//!
43//! | Feature  | Default | Description                              |
44//! |----------|---------|------------------------------------------|
45//! | `snappy` | ✅      | Snappy compression/decompression         |
46//! | `serde`  | ❌      | Serde serialization for metadata         |
47//! | `wasm`   | ❌      | WASM bindings via wasm-bindgen           |
48//! | `full`   | ❌      | All features enabled                     |
49
50pub mod types;
51pub mod schema;
52pub mod metadata;
53pub mod codecs;
54pub mod reader;
55pub mod arrow_convert;
56pub mod batch_iter_advanced;
57pub mod streaming_reader;
58pub mod statistics;
59
60#[cfg(target_arch = "wasm32")]
61pub mod wasm;
62
63// Re-export key types for convenience
64pub use types::{
65    Compression, Encoding, ParquetError, ParquetMetadata, ParquetType, Result,
66    ColumnMetadata, RowGroupMetadata,
67};
68pub use schema::{ColumnSchema, LogicalType, SchemaBuilder, TimestampUnit};
69pub use reader::{ColumnData, ParquetReader};
70pub use arrow_convert::ArrowConverter;
71pub use batch_iter_advanced::SelectiveBatchIterator;
72pub use streaming_reader::StreamingParquetReader;
73pub use statistics::{ColumnStatistics, StatisticsCollector};
74
75// ---------------------------------------------------------------------------
76// Top-level convenience API
77// ---------------------------------------------------------------------------
78
79/// Parse metadata from raw Parquet file bytes.
80///
81/// This is a lightweight operation — only the footer is parsed,
82/// no column data is read.
83pub fn read_metadata(data: &[u8]) -> Result<ParquetMetadata> {
84    metadata::MetadataReader::read_metadata(data)
85}
86
87/// Create a batch iterator over Arrow RecordBatches.
88///
89/// Reads all columns and yields batches of `batch_size` rows.
90pub fn read_to_arrow_batches(
91    data: &[u8],
92    batch_size: usize,
93) -> Result<SelectiveBatchIterator> {
94    let reader = ParquetReader::new(data)?;
95    Ok(SelectiveBatchIterator::new(reader, batch_size))
96}
97
98/// Create a batch iterator reading only the specified columns.
99pub fn read_columns_to_arrow_batches(
100    data: &[u8],
101    batch_size: usize,
102    columns: Vec<usize>,
103) -> Result<SelectiveBatchIterator> {
104    let reader = ParquetReader::new(data)?;
105    Ok(SelectiveBatchIterator::new(reader, batch_size).with_columns(columns))
106}
107
108/// Print a formatted statistics summary to stdout.
109pub fn print_stats(data: &[u8]) -> Result<()> {
110    let metadata = read_metadata(data)?;
111    StatisticsCollector::print_summary(&metadata);
112    Ok(())
113}
114
#[cfg(test)]
mod tests {
    use super::*;

    /// Bytes that do not look like a Parquet file must be rejected.
    #[test]
    fn test_invalid_magic() {
        assert!(read_metadata(b"NOT_PARQUET_DATA_AT_ALL!!").is_err());
    }

    /// A four-byte input is rejected rather than parsed.
    #[test]
    fn test_too_small() {
        assert!(read_metadata(b"PAR1").is_err());
    }

    /// The re-exported schema builder keeps columns in insertion order and
    /// marks columns added via `add_optional_column` as not required.
    #[test]
    fn test_schema_builder_integration() {
        let builder = SchemaBuilder::new()
            .add_column("id", ParquetType::Int64, LogicalType::Integer)
            .add_column("name", ParquetType::ByteArray, LogicalType::String)
            .add_optional_column("value", ParquetType::Double, LogicalType::Float)
            .with_compression(Compression::Snappy);
        let schema = builder.build();

        let expected_names = ["id", "name", "value"];
        assert_eq!(schema.len(), 3);
        for (position, expected) in expected_names.iter().enumerate() {
            assert_eq!(schema[position].name, *expected);
        }
        assert!(!schema[2].required);
    }
}