use crate::error::Result;
use crate::parquet::reader::ParquetReader;
use crate::{ExcelReader, ExcelWriter};
use std::path::Path;
/// Converts a Parquet file into an Excel workbook.
///
/// The converter only remembers where the source file lives; the file itself
/// is opened each time a conversion method is called.
#[derive(Debug, Clone)]
pub struct ParquetToExcelConverter {
    /// Location of the source Parquet file, kept as UTF-8 text.
    parquet_path: String,
}
impl ParquetToExcelConverter {
pub fn new<P: AsRef<Path>>(parquet_path: P) -> Result<Self> {
let path_str = parquet_path
.as_ref()
.to_str()
.ok_or_else(|| {
crate::error::ExcelError::InvalidState("Invalid parquet path".to_string())
})?
.to_string();
Ok(Self {
parquet_path: path_str,
})
}
pub fn convert_to_excel<P: AsRef<Path>>(&self, excel_path: P) -> Result<usize> {
let reader = ParquetReader::open(&self.parquet_path)?;
let mut writer = ExcelWriter::new(excel_path)?;
let headers = reader.column_names();
writer.write_header_bold(&headers)?;
let mut row_count = 0;
for row in reader.rows()? {
let row_data = row?;
writer.write_row(&row_data)?;
row_count += 1;
}
writer.save()?;
Ok(row_count)
}
pub fn convert_with_progress<P, F>(&self, excel_path: P, mut callback: F) -> Result<usize>
where
P: AsRef<Path>,
F: FnMut(usize, usize),
{
let reader = ParquetReader::open(&self.parquet_path)?;
let total_rows = reader.row_count();
let mut writer = ExcelWriter::new(excel_path)?;
let headers = reader.column_names();
writer.write_header_bold(&headers)?;
let mut row_count = 0;
for (idx, row) in reader.rows()?.enumerate() {
let row_data = row?;
writer.write_row(&row_data)?;
row_count += 1;
if (idx + 1) % 1000 == 0 || idx + 1 == total_rows {
callback(idx + 1, total_rows);
}
}
writer.save()?;
Ok(row_count)
}
}
/// Converts an Excel workbook into a Parquet file.
///
/// The converter only remembers where the source workbook lives; the file
/// itself is opened each time a conversion method is called.
#[derive(Debug, Clone)]
pub struct ExcelToParquetConverter {
    /// Location of the source Excel file, kept as UTF-8 text.
    excel_path: String,
}
impl ExcelToParquetConverter {
pub fn new<P: AsRef<Path>>(excel_path: P) -> Result<Self> {
let path_str = excel_path
.as_ref()
.to_str()
.ok_or_else(|| {
crate::error::ExcelError::InvalidState("Invalid excel path".to_string())
})?
.to_string();
Ok(Self {
excel_path: path_str,
})
}
pub fn convert_to_parquet<P: AsRef<Path>>(&self, parquet_path: P) -> Result<usize> {
use arrow::datatypes::{DataType, Field, Schema};
use parquet::arrow::arrow_writer::ArrowWriter;
use parquet::file::properties::WriterProperties;
use std::fs::File;
use std::sync::Arc;
const BATCH_SIZE: usize = 10_000;
let mut reader = ExcelReader::open(&self.excel_path)?;
let sheet_names = reader.sheet_names();
if sheet_names.is_empty() {
return Err(crate::error::ExcelError::ReadError(
"No sheets found in Excel file".to_string(),
));
}
let sheet_name = &sheet_names[0];
let mut rows_iter = reader.rows(sheet_name)?;
let headers = match rows_iter.next() {
Some(Ok(row)) => row.to_strings(),
Some(Err(e)) => return Err(e),
None => {
return Err(crate::error::ExcelError::ReadError(
"No data found in Excel file".to_string(),
))
}
};
let fields: Vec<Field> = headers
.iter()
.map(|name| Field::new(name, DataType::Utf8, true))
.collect();
let schema = Arc::new(Schema::new(fields));
let num_columns = headers.len();
let file = File::create(parquet_path)?;
let props = WriterProperties::builder().build();
let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props))
.map_err(|e| crate::error::ExcelError::WriteError(e.to_string()))?;
let mut total_rows = 0;
let mut batch_buffer: Vec<Vec<String>> = Vec::with_capacity(BATCH_SIZE);
for row_result in rows_iter {
let row = row_result?;
batch_buffer.push(row.to_strings());
if batch_buffer.len() >= BATCH_SIZE {
Self::write_batch(&mut writer, &schema, &batch_buffer, num_columns)?;
total_rows += batch_buffer.len();
batch_buffer.clear(); }
}
if !batch_buffer.is_empty() {
Self::write_batch(&mut writer, &schema, &batch_buffer, num_columns)?;
total_rows += batch_buffer.len();
}
writer
.close()
.map_err(|e| crate::error::ExcelError::WriteError(e.to_string()))?;
Ok(total_rows)
}
fn write_batch(
writer: &mut parquet::arrow::arrow_writer::ArrowWriter<std::fs::File>,
schema: &std::sync::Arc<arrow::datatypes::Schema>,
rows: &[Vec<String>],
num_columns: usize,
) -> Result<()> {
use arrow::array::{ArrayRef, StringArray};
use arrow::record_batch::RecordBatch;
use std::sync::Arc;
if rows.is_empty() {
return Ok(());
}
let mut columns: Vec<ArrayRef> = Vec::with_capacity(num_columns);
for col_idx in 0..num_columns {
let col_data: Vec<Option<&str>> = rows
.iter()
.map(|row| {
if col_idx < row.len() && !row[col_idx].is_empty() {
Some(row[col_idx].as_str())
} else {
None
}
})
.collect();
let array = StringArray::from(col_data);
columns.push(Arc::new(array) as ArrayRef);
}
let batch = RecordBatch::try_new(schema.clone(), columns)
.map_err(|e| crate::error::ExcelError::WriteError(e.to_string()))?;
writer
.write(&batch)
.map_err(|e| crate::error::ExcelError::WriteError(e.to_string()))?;
Ok(())
}
pub fn convert_with_progress<P, F>(&self, parquet_path: P, mut callback: F) -> Result<usize>
where
P: AsRef<Path>,
F: FnMut(&str),
{
callback("Reading Excel file...");
let row_count = self.convert_to_parquet(parquet_path)?;
callback(&format!("Converted {} rows to Parquet", row_count));
Ok(row_count)
}
}