use crate::core::{Content, Error, ExtractionResult, Metadata, MetadataValue, Result};
use crate::parsers::Parser;
use crate::utils::security::{validate_file_size, validate_zip_structure, FileSizeLimits};
use calamine::{open_workbook_auto_from_rs, Data, Reader, Sheets};
use std::io::Cursor;
/// Stateless parser that extracts text and sheet metadata from XLSX workbooks.
pub struct XlsxParser;
impl XlsxParser {
fn extract_sheets(workbook: &mut Sheets<Cursor<Vec<u8>>>) -> Result<String> {
let mut content = String::with_capacity(8192);
let sheet_names = workbook.sheet_names().to_vec();
for sheet_name in &sheet_names {
if !content.is_empty() {
content.push_str("\n\n");
}
content.push_str(&format!("=== Sheet: {} ===\n", sheet_name));
if let Ok(range) = workbook.worksheet_range(sheet_name) {
for row in range.rows() {
let row_str = row
.iter()
.map(|cell| match cell {
Data::Empty => String::new(),
Data::String(s) => s.clone(),
Data::Float(f) => f.to_string(),
Data::Int(i) => i.to_string(),
Data::Bool(b) => b.to_string(),
Data::Error(e) => format!("#ERROR: {:?}", e),
Data::DateTime(dt) => dt.to_string(),
Data::DateTimeIso(dt) => dt.clone(),
Data::DurationIso(d) => d.clone(),
})
.collect::<Vec<_>>()
.join(",");
content.push_str(&row_str);
content.push('\n');
}
}
}
Ok(content)
}
fn extract_metadata(workbook: &mut Sheets<Cursor<Vec<u8>>>) -> Metadata {
let mut metadata = Metadata::new();
let sheet_names = workbook.sheet_names().to_vec();
metadata.insert(
"sheet_count".to_string(),
MetadataValue::Number(sheet_names.len() as i64),
);
metadata.insert(
"sheet_names".to_string(),
MetadataValue::Text(sheet_names.join(", ")),
);
let mut total_rows = 0;
let mut max_columns = 0;
for sheet_name in &sheet_names {
if let Ok(range) = workbook.worksheet_range(sheet_name) {
let (rows, cols) = range.get_size();
total_rows += rows;
max_columns = max_columns.max(cols);
metadata.insert(
format!("sheet_{}_rows", sheet_name),
MetadataValue::Number(rows as i64),
);
metadata.insert(
format!("sheet_{}_columns", sheet_name),
MetadataValue::Number(cols as i64),
);
}
}
metadata.insert(
"total_rows".to_string(),
MetadataValue::Number(total_rows as i64),
);
metadata.insert(
"max_columns".to_string(),
MetadataValue::Number(max_columns as i64),
);
metadata
}
}
impl Parser for XlsxParser {
    /// Returns the parser's identifying name.
    fn name(&self) -> &str {
        "XlsxParser"
    }

    /// Returns the MIME types this parser accepts.
    fn supported_types(&self) -> &[&str] {
        &[
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            "application/xlsx",
        ]
    }

    /// Parses `data` as a workbook and returns its text content and metadata.
    ///
    /// The input is first passed through `validate_file_size` and
    /// `validate_zip_structure`; their errors are propagated unchanged. The
    /// bytes are then copied into an owned buffer and opened with calamine's
    /// format auto-detection.
    ///
    /// `mime_type` is echoed back verbatim in the result, and
    /// `detection_confidence` is fixed at `0.95`.
    ///
    /// # Errors
    ///
    /// When the workbook cannot be opened, the underlying message is
    /// classified by keyword:
    /// * `Error::ParseError` with a fixed message if it mentions "password" or
    ///   "encrypted";
    /// * `Error::CorruptedFile` if it mentions "corrupt" or "invalid";
    /// * `Error::ParseError` carrying the original message otherwise.
    fn parse(&self, data: &[u8], mime_type: &str) -> Result<ExtractionResult> {
        validate_file_size(data, FileSizeLimits::XLSX, "XLSX")?;
        // NOTE(review): presumably the list names archive entries that must be
        // present — confirm against `validate_zip_structure`.
        validate_zip_structure(data, Some(&["[Content_Types].xml"]))?;

        // The helper functions take `Sheets<Cursor<Vec<u8>>>`, so an owned copy
        // of the input is required here.
        let cursor = Cursor::new(data.to_vec());
        let mut workbook = open_workbook_auto_from_rs(cursor).map_err(|e| {
            let error_msg = e.to_string();
            // Error messages are free-form text and may capitalise the
            // keywords (e.g. "Invalid ..."), so match case-insensitively while
            // reporting the original, unmodified message.
            let lowered = error_msg.to_ascii_lowercase();
            if lowered.contains("password") || lowered.contains("encrypted") {
                Error::ParseError("File is password-protected or encrypted".to_string())
            } else if lowered.contains("corrupt") || lowered.contains("invalid") {
                Error::CorruptedFile(format!("Corrupted XLSX file: {}", error_msg))
            } else {
                Error::ParseError(format!("Failed to parse XLSX: {}", error_msg))
            }
        })?;

        let content_text = Self::extract_sheets(&mut workbook)?;
        let metadata = Self::extract_metadata(&mut workbook);

        Ok(ExtractionResult {
            mime_type: mime_type.to_string(),
            content: Content::Text(content_text),
            metadata,
            detection_confidence: 0.95,
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Official XLSX MIME type, shared by the tests below.
    const XLSX_MIME: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";

    /// Both advertised MIME types must be reported by the parser.
    #[test]
    fn test_supported_types() {
        let parser = XlsxParser;
        let advertised = parser.supported_types();
        assert!(advertised.contains(&XLSX_MIME));
        assert!(advertised.contains(&"application/xlsx"));
    }

    /// The parser identifies itself by its type name.
    #[test]
    fn test_parser_name() {
        assert_eq!(XlsxParser.name(), "XlsxParser");
    }

    /// Bytes that are not a workbook must be rejected.
    #[test]
    fn test_error_handling_invalid_data() {
        let parser = XlsxParser;
        let outcome = parser.parse(b"This is not an XLSX file", XLSX_MIME);
        assert!(outcome.is_err(), "Expected error for invalid XLSX data");
    }

    /// Empty input must be rejected.
    #[test]
    fn test_error_handling_empty_data() {
        let parser = XlsxParser;
        let outcome = parser.parse(b"", XLSX_MIME);
        assert!(outcome.is_err(), "Expected error for empty XLSX data");
    }
}