use arrow::array::{ArrayRef, RecordBatch};
use arrow::datatypes::SchemaRef as ArrowSchemaRef;
use evolution_builder::builder::ParquetBuilder;
use evolution_common::error::{Result, SetupError};
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties as ArrowWriterProperties;
use std::fs::{File, OpenOptions};
use std::path::PathBuf;
/// Writer that turns the columns of a [`ParquetBuilder`] into Arrow record batches
/// and writes them to a file through an [`ArrowWriter`].
///
/// Instances are created through [`ParquetWriter::builder`].
pub struct ParquetWriter {
/// The underlying Arrow writer, backed by the opened output file.
inner: ArrowWriter<File>,
/// Column count derived from the schema at build time; used as the capacity hint
/// for the per-batch column buffer.
n_columns: usize,
}
impl ParquetWriter {
pub fn builder() -> ParquetWriterBuilder {
ParquetWriterBuilder {
..Default::default()
}
}
pub fn try_write_from_builder(&mut self, builder: &mut ParquetBuilder) -> Result<()> {
let mut buffer: Vec<(&str, ArrayRef)> = Vec::with_capacity(self.n_columns);
for column_builder in builder.columns().iter_mut() {
buffer.push(column_builder.finish());
}
let record_batch: RecordBatch = RecordBatch::try_from_iter(buffer)?;
self.inner.write(&record_batch)?;
Ok(())
}
pub fn try_finish(&mut self) -> Result<()> {
self.inner.finish()?;
Ok(())
}
}
/// Builder for [`ParquetWriter`].
///
/// All fields start out as `None` (via `Default`) and are populated through the
/// `with_*` methods before calling `try_build` or `build`.
#[derive(Default)]
pub struct ParquetWriterBuilder {
/// Path of the output file. Required: `try_build` returns an error when unset.
out_path: Option<PathBuf>,
/// Arrow schema handed to the Arrow writer. Required: `try_build` returns an error when unset.
schema: Option<ArrowSchemaRef>,
/// Optional writer properties, passed through to `ArrowWriter::try_new` unchanged.
properties: Option<ArrowWriterProperties>,
}
impl ParquetWriterBuilder {
    /// Sets the path of the file the writer will write to.
    ///
    /// This field is required by [`ParquetWriterBuilder::try_build`].
    pub fn with_out_path(mut self, out_path: PathBuf) -> Self {
        self.out_path = Some(out_path);
        self
    }

    /// Sets the Arrow schema that the writer is created with.
    ///
    /// This field is required by [`ParquetWriterBuilder::try_build`].
    pub fn with_arrow_schema(mut self, schema: ArrowSchemaRef) -> Self {
        self.schema = Some(schema);
        self
    }

    /// Sets the optional writer properties; `None` clears any previously set value.
    pub fn with_properties(mut self, properties: Option<ArrowWriterProperties>) -> Self {
        self.properties = properties;
        self
    }

    /// Validates the builder, opens the output file and creates the [`ParquetWriter`].
    ///
    /// Both required fields are checked before the filesystem is touched, so a
    /// missing `schema` no longer leaves a freshly created, empty output file behind.
    ///
    /// # Errors
    /// Returns a [`SetupError`] if `out_path` or `schema` was not provided, and
    /// propagates any error from opening the file or creating the Arrow writer.
    pub fn try_build(self) -> Result<ParquetWriter> {
        let out_path: PathBuf = match self.out_path {
            Some(p) => p,
            None => {
                return Err(Box::new(SetupError::new(
                    "Required field 'out_path' was not provided, exiting...",
                )))
            }
        };
        let schema: ArrowSchemaRef = self.schema.ok_or_else(|| {
            Box::new(SetupError::new(
                "Required field 'schema' was not provided, exiting...",
            ))
        })?;

        // Only top-level fields map to columns of a record batch; `all_fields` is
        // deprecated and would additionally count nested child fields.
        let n_columns: usize = schema.fields().len();

        // NOTE(review): the file is opened in append mode, so the bytes of a
        // pre-existing, non-empty file stay in front of the newly written data.
        // Presumably callers always target a fresh path - confirm, since appending
        // to an existing Parquet file is unlikely to produce a readable file.
        let out_file: File = OpenOptions::new()
            .create(true)
            .append(true)
            .open(out_path)?;

        let inner: ArrowWriter<File> = ArrowWriter::try_new(out_file, schema, self.properties)?;
        Ok(ParquetWriter { inner, n_columns })
    }

    /// Infallible-looking convenience wrapper around [`ParquetWriterBuilder::try_build`].
    ///
    /// # Panics
    /// Panics if [`ParquetWriterBuilder::try_build`] returns an error, i.e. when a
    /// required field is missing, the output file cannot be opened, or the Arrow
    /// writer cannot be created.
    pub fn build(self) -> ParquetWriter {
        self.try_build()
            .expect("failed to build ParquetWriter from ParquetWriterBuilder")
    }
}