Expand description

Apache Arrow is a cross-language development platform for in-memory data.

This module provides APIs for converting between Arrow in-memory data and the Parquet file format.

Example of writing an Arrow record batch to a Parquet file:

 use arrow::array::Int32Array;
 use arrow::datatypes::{DataType, Field, Schema};
 use arrow::record_batch::RecordBatch;
 use parquet::arrow::arrow_writer::ArrowWriter;
 use parquet::file::properties::WriterProperties;
 use std::fs::File;
 use std::sync::Arc;
 let ids = Int32Array::from(vec![1, 2, 3, 4]);
 let vals = Int32Array::from(vec![5, 6, 7, 8]);
 let schema = Arc::new(Schema::new(vec![
     Field::new("id", DataType::Int32, false),
     Field::new("val", DataType::Int32, false),
 ]));

 let file = File::create("data.parquet").unwrap();

 let batch =
     RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(ids), Arc::new(vals)]).unwrap();
 let batches = vec![batch];

 // Default writer properties
 let props = WriterProperties::builder().build();

 let mut writer = ArrowWriter::try_new(file, Arc::clone(&schema), Some(props)).unwrap();

 for batch in batches {
     writer.write(&batch).expect("Writing batch");
 }
 writer.close().unwrap();

`WriterProperties` can be used to set Parquet file options such as compression, encoding, and writer version:

use parquet::file::properties::WriterProperties;
use parquet::basic::{ Compression, Encoding };
use parquet::file::properties::WriterVersion;

// File compression
let props = WriterProperties::builder()
    .set_compression(Compression::SNAPPY)
    .build();

Example of reading a Parquet file into an Arrow record batch:

use arrow::record_batch::RecordBatchReader;
use parquet::file::reader::SerializedFileReader;
use parquet::arrow::{ParquetFileArrowReader, ArrowReader};
use std::sync::Arc;
use std::fs::File;


let file = File::open("data.parquet").unwrap();
let file_reader = SerializedFileReader::new(file).unwrap();
let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader));

println!("Converted arrow schema is: {}", arrow_reader.get_schema().unwrap());
println!("Arrow schema after projection is: {}",
   arrow_reader.get_schema_by_columns(vec![0], true).unwrap());

let mut record_batch_reader = arrow_reader.get_record_reader(2048).unwrap();

for maybe_record_batch in record_batch_reader {
   let record_batch = maybe_record_batch.unwrap();
   if record_batch.num_rows() > 0 {
       println!("Read {} records.", record_batch.num_rows());
   } else {
       println!("End of file!");
   }
}

Re-exports

pub use self::arrow_reader::ArrowReader;
pub use self::arrow_reader::ParquetFileArrowReader;
pub use self::arrow_writer::ArrowWriter;

Modules

Contains a reader that reads Parquet data into Arrow arrays.

Contains a writer that writes Arrow data into Parquet files.

Constants

Schema metadata key used to store the serialized Arrow IPC schema.