Module parquet::arrow

Apache Arrow is a cross-language development platform for in-memory data.

This module provides an API for converting between Arrow and Parquet.

Example of writing an Arrow record batch to a Parquet file

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use parquet::arrow::arrow_writer::ArrowWriter;
use parquet::file::properties::WriterProperties;
use std::fs::File;
use std::sync::Arc;

// Build two Int32 columns and a matching two-field schema
let ids = Int32Array::from(vec![1, 2, 3, 4]);
let vals = Int32Array::from(vec![5, 6, 7, 8]);
let schema = Arc::new(Schema::new(vec![
    Field::new("id", DataType::Int32, false),
    Field::new("val", DataType::Int32, false),
]));

let file = File::create("data.parquet").unwrap();

let batch =
    RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(ids), Arc::new(vals)]).unwrap();
let batches = vec![batch];

// Default writer properties
let props = WriterProperties::builder().build();

let mut writer = ArrowWriter::try_new(file, Arc::clone(&schema), Some(props)).unwrap();

for batch in batches {
    writer.write(&batch).expect("Writing batch");
}

// close() flushes any buffered data and writes the Parquet file footer
writer.close().unwrap();

WriterProperties can be used to set Parquet file options such as compression, encoding, and format version

use parquet::basic::{Compression, Encoding};
use parquet::file::properties::{WriterProperties, WriterVersion};

// File-level compression, default column encoding, and format version
let props = WriterProperties::builder()
    .set_writer_version(WriterVersion::PARQUET_2_0)
    .set_compression(Compression::SNAPPY)
    .set_encoding(Encoding::PLAIN)
    .build();
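
Properties can also be set for individual columns, overriding the file-level defaults. A minimal sketch, assuming the "id" column from the writer example above:

use parquet::basic::{Compression, Encoding};
use parquet::file::properties::WriterProperties;
use parquet::schema::types::ColumnPath;

// The column-level encoding overrides the file-level default for "id" only
let props = WriterProperties::builder()
    .set_compression(Compression::SNAPPY)
    .set_column_encoding(ColumnPath::from("id"), Encoding::DELTA_BINARY_PACKED)
    .build();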

Example of reading a Parquet file into Arrow record batches

use parquet::arrow::{ArrowReader, ParquetFileArrowReader};
use parquet::file::reader::SerializedFileReader;
use std::fs::File;
use std::sync::Arc;

let file = File::open("data.parquet").unwrap();
let file_reader = SerializedFileReader::new(file).unwrap();
let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader));

println!("Converted arrow schema is: {}", arrow_reader.get_schema().unwrap());
// Project the schema to the first leaf column only
println!("Arrow schema after projection is: {}",
    arrow_reader.get_schema_by_columns(vec![0], true).unwrap());

// Read the file in batches of up to 2048 rows
let record_batch_reader = arrow_reader.get_record_reader(2048).unwrap();

for maybe_record_batch in record_batch_reader {
    let record_batch = maybe_record_batch.unwrap();
    if record_batch.num_rows() > 0 {
        println!("Read {} records.", record_batch.num_rows());
    } else {
        println!("End of file!");
    }
}
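
The reader can also project columns at read time, so only the requested columns are decoded. A minimal sketch, assuming the two-column "data.parquet" file written above:

use parquet::arrow::{ArrowReader, ParquetFileArrowReader};
use parquet::file::reader::SerializedFileReader;
use std::fs::File;
use std::sync::Arc;

let file = File::open("data.parquet").unwrap();
let file_reader = SerializedFileReader::new(file).unwrap();
let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader));

// Read only the first column (index 0), 2048 rows per batch
let record_batch_reader = arrow_reader
    .get_record_reader_by_columns(vec![0], 2048)
    .unwrap();

for maybe_batch in record_batch_reader {
    let batch = maybe_batch.unwrap();
    println!("Read {} rows and {} columns.", batch.num_rows(), batch.num_columns());
}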

Re-exports

pub use self::arrow_reader::ArrowReader;
pub use self::arrow_reader::ParquetFileArrowReader;
pub use self::arrow_writer::ArrowWriter;
pub use self::schema::arrow_to_parquet_schema;
pub use self::schema::parquet_to_arrow_schema;
pub use self::schema::parquet_to_arrow_schema_by_columns;

Modules

arrow_reader - Contains a reader that reads Parquet data into Arrow arrays.

arrow_writer - Contains a writer that writes Arrow data into Parquet format.

schema - Provides an API for converting Parquet schemas to Arrow schemas and vice versa, as sketched below.
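
For example, an Arrow schema can be converted to its Parquet equivalent with the re-exported arrow_to_parquet_schema. A minimal sketch, reusing the schema from the writer example above:

use arrow::datatypes::{DataType, Field, Schema};
use parquet::arrow::arrow_to_parquet_schema;

let schema = Schema::new(vec![
    Field::new("id", DataType::Int32, false),
    Field::new("val", DataType::Int32, false),
]);

// Derive the equivalent Parquet SchemaDescriptor from the Arrow schema
let parquet_schema = arrow_to_parquet_schema(&schema).unwrap();
println!("Parquet schema has {} leaf columns", parquet_schema.num_columns());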

Constants

ARROW_SCHEMA_META_KEY - Schema metadata key used to store the serialized Arrow IPC schema.
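
Files written by ArrowWriter carry the serialized Arrow IPC schema under this key in the file's key-value metadata. A minimal sketch of checking for the key, assuming a file previously written by ArrowWriter such as the "data.parquet" created above:

use parquet::arrow::ARROW_SCHEMA_META_KEY;
use parquet::file::reader::{FileReader, SerializedFileReader};
use std::fs::File;

let file = File::open("data.parquet").unwrap();
let reader = SerializedFileReader::new(file).unwrap();
let meta = reader.metadata().file_metadata();

// Look for the Arrow IPC schema entry in the file's key-value metadata
if let Some(kvs) = meta.key_value_metadata() {
    let has_arrow_schema = kvs.iter().any(|kv| kv.key == ARROW_SCHEMA_META_KEY);
    println!("Contains serialized Arrow schema: {}", has_arrow_schema);
}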