use crate::common::international::extract_numbers_international;
use std::io::Read;
use std::path::Path;
pub fn parse_opendocument_file(file_path: &Path) -> crate::error::Result<Vec<f64>> {
let extension = file_path
.extension()
.and_then(|ext| ext.to_str())
.unwrap_or("")
.to_lowercase();
match extension.as_str() {
"odt" => parse_odt_file(file_path),
"ods" => {
crate::common::input::formats::excel::parse_excel_file(file_path)
}
_ => Err(crate::error::BenfError::ParseError(format!(
"Unsupported OpenDocument file extension: {extension}"
))),
}
}
fn parse_odt_file(file_path: &Path) -> crate::error::Result<Vec<f64>> {
if !file_path.exists() {
return Err(crate::error::BenfError::FileError(format!(
"OpenDocument file not found: {}",
file_path.display()
)));
}
let file = std::fs::File::open(file_path).map_err(|e| {
crate::error::BenfError::FileError(format!("Failed to open OpenDocument file: {e}"))
})?;
let mut archive = zip::ZipArchive::new(file).map_err(|e| {
crate::error::BenfError::ParseError(format!(
"Invalid OpenDocument file format (not a ZIP archive): {e}"
))
})?;
let mut content_xml = None;
for i in 0..archive.len() {
let file = archive.by_index(i).map_err(|e| {
crate::error::BenfError::ParseError(format!("Failed to read ZIP entry: {e}"))
})?;
if file.name() == "content.xml" {
content_xml = Some(i);
break;
}
}
let content_index = content_xml.ok_or_else(|| {
crate::error::BenfError::ParseError(
"content.xml not found in OpenDocument file".to_string(),
)
})?;
let mut file = archive.by_index(content_index).map_err(|e| {
crate::error::BenfError::ParseError(format!("Failed to read content.xml: {e}"))
})?;
let mut contents = String::new();
file.read_to_string(&mut contents).map_err(|e| {
crate::error::BenfError::ParseError(format!("Failed to read content.xml data: {e}"))
})?;
let text_content = extract_text_from_odt_xml(&contents)?;
if text_content.trim().is_empty() {
return Err(crate::error::BenfError::NoNumbersFound);
}
let numbers = extract_numbers_international(&text_content);
if numbers.is_empty() {
Err(crate::error::BenfError::NoNumbersFound)
} else {
Ok(numbers)
}
}
fn extract_text_from_odt_xml(xml_content: &str) -> crate::error::Result<String> {
use regex::Regex;
let text_regex = Regex::new(r"<text:[^>]*>(.*?)</text:[^>]*>").map_err(|e| {
crate::error::BenfError::ParseError(format!("Failed to compile regex: {e}"))
})?;
let table_regex = Regex::new(r"<table:[^>]*>(.*?)</table:[^>]*>").map_err(|e| {
crate::error::BenfError::ParseError(format!("Failed to compile table regex: {e}"))
})?;
let mut extracted_text = Vec::new();
for cap in text_regex.captures_iter(xml_content) {
if let Some(text_match) = cap.get(1) {
let inner_text = extract_inner_text(text_match.as_str());
if !inner_text.trim().is_empty() {
extracted_text.push(inner_text);
}
}
}
for cap in table_regex.captures_iter(xml_content) {
if let Some(text_match) = cap.get(1) {
let inner_text = extract_inner_text(text_match.as_str());
if !inner_text.trim().is_empty() {
extracted_text.push(inner_text);
}
}
}
Ok(extracted_text.join(" "))
}
fn extract_inner_text(xml_fragment: &str) -> String {
use regex::Regex;
let tag_regex = Regex::new(r"<[^>]*>").unwrap();
let text_only = tag_regex.replace_all(xml_fragment, " ");
decode_xml_entities(&text_only)
}
fn decode_xml_entities(text: &str) -> String {
text.replace("<", "<")
.replace(">", ">")
.replace("&", "&")
.replace(""", "\"")
.replace("'", "'")
.replace(" ", " ")
}
#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
#[test]
fn test_odt_parsing_concept() {
let test_path = PathBuf::from("nonexistent.odt");
let result = parse_opendocument_file(&test_path);
assert!(result.is_err());
match result {
Err(crate::error::BenfError::FileError(_)) => {
}
_ => panic!("Expected file error for non-existent ODT file"),
}
}
#[test]
fn test_odt_real_file() {
let test_path = PathBuf::from("tests/fixtures/sample_document.odt");
if test_path.exists() {
let result = parse_opendocument_file(&test_path);
match result {
Ok(numbers) => {
println!(
"✅ ODT parsing succeeded! Found {count} numbers",
count = numbers.len()
);
assert!(
!numbers.is_empty(),
"Should extract at least some numbers from ODT"
);
println!(
"First 10 numbers: {first_10:?}",
first_10 = numbers.iter().take(10).collect::<Vec<_>>()
);
}
Err(e) => {
println!("ODT parsing failed: {e}");
}
}
} else {
println!("Test ODT file not found at {test_path:?}, skipping real file test");
}
}
#[test]
fn test_odt_xml_text_extraction() {
let sample_xml = r#"
<office:document-content xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0">
<office:body>
<office:text>
<text:p>売上高: 1,234,567円</text:p>
<text:p>純利益: 2,345,678円</text:p>
<text:h>第3四半期実績</text:h>
<text:span>前年比: 345%増加</text:span>
</office:text>
</office:body>
</office:document-content>
"#;
let result = extract_text_from_odt_xml(sample_xml);
assert!(result.is_ok());
let text = result.unwrap();
assert!(text.contains("1,234,567"));
assert!(text.contains("2,345,678"));
assert!(text.contains("345"));
println!("Extracted ODT text: {text}");
}
#[test]
fn test_odt_inner_text_extraction() {
let xml_with_nested_tags = r#"売上: <text:span style="font-weight:bold">1,000,000</text:span>円 利益: <text:span>500,000</text:span>円"#;
let result = extract_inner_text(xml_with_nested_tags);
assert!(result.contains("1,000,000"));
assert!(result.contains("500,000"));
assert!(!result.contains("<text:span"));
println!("Inner text: {result}");
}
#[test]
fn test_odt_xml_entity_decoding() {
let text_with_entities =
"Profit & Loss: <500,000> "estimated" margin";
let decoded = decode_xml_entities(text_with_entities);
assert_eq!(decoded, "Profit & Loss: <500,000> \"estimated\" margin");
}
#[test]
fn test_ods_redirect() {
let test_path = PathBuf::from("tests/fixtures/sample_data.ods");
if test_path.exists() {
let result = parse_opendocument_file(&test_path);
match result {
Ok(numbers) => {
println!(
"✅ ODS parsing (via Excel parser) succeeded! Found {count} numbers",
count = numbers.len()
);
}
Err(e) => {
println!("ODS parsing failed: {e}");
}
}
} else {
println!("Test ODS file not found, skipping ODS redirect test");
}
}
}