use crate::core::{Content, Error, ExtractionResult, Metadata, MetadataValue, Result};
use crate::parsers::Parser;
use crate::utils::security::{validate_file_size, validate_zip_structure, check_xml_bomb, FileSizeLimits};
use quick_xml::events::Event;
use quick_xml::Reader;
use std::io::{Cursor, Read};
use zip::ZipArchive;
/// Parser for OpenDocument Spreadsheet (ODS) files.
///
/// This is a unit struct with no fields; all behaviour lives in its
/// `Parser` implementation.
pub struct OdsParser;
impl Parser for OdsParser {
fn name(&self) -> &str {
"OdsParser"
}
fn supported_types(&self) -> &[&str] {
&[
"application/vnd.oasis.opendocument.spreadsheet",
"application/ods",
]
}
fn parse(&self, data: &[u8], mime_type: &str) -> Result<ExtractionResult> {
validate_file_size(data, FileSizeLimits::ODS, "ODS")?;
validate_zip_structure(data, Some(&["content.xml", "meta.xml"]))?;
let cursor = Cursor::new(data);
let mut archive = ZipArchive::new(cursor).map_err(|e| {
Error::ParseError(format!("Failed to open ODS archive: {}", e))
})?;
let (text, table_info) = extract_tables(&mut archive)?;
let mut metadata = extract_metadata(&mut archive)?;
metadata.insert(
"table_count".to_string(),
MetadataValue::Number(table_info.len() as i64),
);
let table_names: Vec<String> = table_info.iter().map(|t| t.name.clone()).collect();
metadata.insert(
"table_names".to_string(),
MetadataValue::Text(table_names.join(", ")),
);
Ok(ExtractionResult {
mime_type: mime_type.to_string(),
content: Content::Text(text),
metadata,
detection_confidence: 0.95,
})
}
}
/// Summary of one table encountered while parsing `content.xml`.
struct TableInfo {
// Table name, taken from the table element's `name` attribute (empty when absent).
name: String,
// Number of rows collected for the table.
row_count: usize,
// Width of the widest collected row, counted in cells.
column_count: usize,
}
fn extract_tables(archive: &mut ZipArchive<Cursor<&[u8]>>) -> Result<(String, Vec<TableInfo>)> {
let mut content_file = archive
.by_name("content.xml")
.map_err(|e| Error::ParseError(format!("Failed to find content.xml: {}", e)))?;
let mut xml_content = String::new();
content_file
.read_to_string(&mut xml_content)
.map_err(|e| Error::ParseError(format!("Failed to read content.xml: {}", e)))?;
check_xml_bomb(&xml_content)?;
parse_tables(&xml_content)
}
fn parse_tables(xml_content: &str) -> Result<(String, Vec<TableInfo>)> {
let mut reader = Reader::from_str(xml_content);
reader.trim_text(true);
let mut all_text = String::new();
let mut table_info = Vec::new();
let mut buf = Vec::new();
let mut in_table = false;
let mut in_row = false;
let mut in_cell = false;
let mut current_table_name = String::new();
let mut current_row = Vec::new();
let mut all_rows = Vec::new();
let mut max_columns = 0;
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
let name = e.name();
let local_name_bytes = name.local_name();
let local_name = local_name_bytes.as_ref();
match local_name {
b"table" => {
in_table = true;
for attr in e.attributes() {
if let Ok(attr) = attr {
if attr.key.local_name().as_ref() == b"name" {
current_table_name = String::from_utf8_lossy(&attr.value).to_string();
break;
}
}
}
}
b"table-row" if in_table => {
in_row = true;
current_row.clear();
}
b"table-cell" if in_row => {
in_cell = true;
let mut repeat_count = 1;
for attr in e.attributes() {
if let Ok(attr) = attr {
if attr.key.local_name().as_ref() == b"number-columns-repeated" {
if let Ok(count_str) = String::from_utf8(attr.value.to_vec()) {
repeat_count = count_str.parse().unwrap_or(1);
}
}
}
}
for _ in 0..repeat_count {
current_row.push(String::new());
}
}
b"p" if in_cell => {
}
_ => {}
}
}
Ok(Event::Text(e)) if in_cell => {
let content = e
.unescape()
.map_err(|e| Error::ParseError(format!("Failed to unescape text: {}", e)))?;
let text = content.trim();
if !text.is_empty() && !current_row.is_empty() {
if let Some(last_cell) = current_row.last_mut() {
if !last_cell.is_empty() {
last_cell.push(' ');
}
last_cell.push_str(text);
}
}
}
Ok(Event::End(ref e)) => {
let name = e.name();
let local_name_bytes = name.local_name();
let local_name = local_name_bytes.as_ref();
match local_name {
b"table-cell" => {
in_cell = false;
}
b"table-row" => {
in_row = false;
if !current_row.is_empty() {
max_columns = max_columns.max(current_row.len());
all_rows.push(current_row.clone());
}
}
b"table" => {
in_table = false;
if !all_text.is_empty() {
all_text.push_str("\n\n");
}
all_text.push_str(&format!("=== Table: {} ===\n", current_table_name));
for row in &all_rows {
all_text.push_str(&row.join(","));
all_text.push('\n');
}
table_info.push(TableInfo {
name: current_table_name.clone(),
row_count: all_rows.len(),
column_count: max_columns,
});
all_rows.clear();
max_columns = 0;
current_table_name.clear();
}
_ => {}
}
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(Error::ParseError(format!(
"Error parsing content.xml: {}",
e
)))
}
_ => {}
}
buf.clear();
}
Ok((all_text, table_info))
}
fn extract_metadata(archive: &mut ZipArchive<Cursor<&[u8]>>) -> Result<Metadata> {
let mut metadata = Metadata::new();
let meta_result = archive.by_name("meta.xml");
if let Ok(mut meta_file) = meta_result {
let mut xml_content = String::new();
if meta_file.read_to_string(&mut xml_content).is_ok() {
parse_meta_properties(&xml_content, &mut metadata)?;
}
}
Ok(metadata)
}
fn parse_meta_properties(xml_content: &str, metadata: &mut Metadata) -> Result<()> {
let mut reader = Reader::from_str(xml_content);
reader.trim_text(true);
let mut buf = Vec::new();
let mut current_element = String::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(Event::Start(ref e)) => {
let name = e.name();
let local_name_bytes = name.local_name();
let local_name = String::from_utf8_lossy(local_name_bytes.as_ref());
current_element = local_name.to_string();
}
Ok(Event::Text(e)) => {
if !current_element.is_empty() {
let content = e
.unescape()
.map_err(|e| Error::ParseError(format!("Failed to unescape text: {}", e)))?;
let text = content.trim().to_string();
if !text.is_empty() {
match current_element.as_str() {
"title" => {
metadata.insert("title".to_string(), MetadataValue::Text(text));
}
"initial-creator" | "creator" => {
metadata.insert("author".to_string(), MetadataValue::Text(text));
}
"subject" => {
metadata.insert("subject".to_string(), MetadataValue::Text(text));
}
"description" => {
metadata
.insert("description".to_string(), MetadataValue::Text(text));
}
"creation-date" => {
metadata
.insert("creation_date".to_string(), MetadataValue::Text(text));
}
"date" => {
metadata
.insert("modified_date".to_string(), MetadataValue::Text(text));
}
_ => {}
}
}
}
}
Ok(Event::End(_)) => {
current_element.clear();
}
Ok(Event::Eof) => break,
Err(e) => {
return Err(Error::ParseError(format!("Error parsing meta.xml: {}", e)))
}
_ => {}
}
buf.clear();
}
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Both advertised MIME types must be present in `supported_types`.
    #[test]
    fn test_supported_types() {
        let advertised = OdsParser.supported_types();
        for expected in &[
            "application/vnd.oasis.opendocument.spreadsheet",
            "application/ods",
        ] {
            assert!(advertised.contains(expected));
        }
    }

    /// The parser reports its fixed name.
    #[test]
    fn test_parser_name() {
        assert_eq!(OdsParser.name(), "OdsParser");
    }
}