use crate::common::international::extract_numbers_international;
use docx_rs::*;
use std::path::Path;
pub fn parse_word_file(file_path: &Path) -> crate::error::Result<Vec<f64>> {
let extension = file_path
.extension()
.and_then(|ext| ext.to_str())
.unwrap_or("")
.to_lowercase();
match extension.as_str() {
"docx" => parse_docx_file(file_path),
"doc" => {
Err(crate::error::BenfError::ParseError(
"Legacy .doc format not supported. Please convert to .docx format.".to_string(),
))
}
_ => Err(crate::error::BenfError::ParseError(format!(
"Unsupported Word file extension: {extension}"
))),
}
}
/// Reads a `.docx` file and extracts the numbers found in its paragraph text.
///
/// The whole file is read into memory and parsed with `docx_rs::read_docx`.
/// Text is gathered from `Text` nodes inside `Run` nodes inside the document's
/// top-level `Paragraph` children; other top-level children are not traversed
/// here. Each text fragment is followed by a space and each paragraph by a
/// newline before the combined text is passed to
/// `extract_numbers_international`.
///
/// # Errors
///
/// * `BenfError::FileError` if the file cannot be read.
/// * `BenfError::ParseError` if the bytes are not a readable DOCX document.
/// * `BenfError::NoNumbersFound` if the extractor returns no numbers.
fn parse_docx_file(file_path: &Path) -> crate::error::Result<Vec<f64>> {
    let file_bytes = std::fs::read(file_path).map_err(|e| {
        crate::error::BenfError::FileError(format!("Failed to read Word file: {e}"))
    })?;
    let doc = read_docx(&file_bytes).map_err(|e| {
        crate::error::BenfError::ParseError(format!("Failed to parse DOCX file: {e:?}"))
    })?;

    let mut all_text = String::new();
    for child in &doc.document.children {
        if let DocumentChild::Paragraph(paragraph) = child {
            for run_child in &paragraph.children {
                if let ParagraphChild::Run(run) = run_child {
                    for text_child in &run.children {
                        if let RunChild::Text(text) = text_child {
                            all_text.push_str(&text.text);
                            // Separator so fragments from adjacent text nodes
                            // are not glued together.
                            all_text.push(' ');
                        }
                    }
                }
            }
            // One line of output per paragraph.
            all_text.push('\n');
        }
    }

    let numbers = extract_numbers_international(&all_text);
    if numbers.is_empty() {
        return Err(crate::error::BenfError::NoNumbersFound);
    }
    Ok(numbers)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// A `.docx` path that does not exist must surface as a file error.
    #[test]
    fn test_word_parsing_concept() {
        let test_path = PathBuf::from("nonexistent.docx");
        let result = parse_word_file(&test_path);
        assert!(
            matches!(result, Err(crate::error::BenfError::FileError(_))),
            "Expected file error for non-existent Word file"
        );
    }

    /// Legacy `.doc` files are rejected by extension alone, so the file does
    /// not need to exist.
    #[test]
    fn test_doc_format_rejection() {
        let test_path = PathBuf::from("test.doc");
        let result = parse_word_file(&test_path);
        match result {
            Err(crate::error::BenfError::ParseError(msg)) => {
                assert!(msg.contains("Legacy .doc format not supported"));
            }
            _ => panic!("Expected parse error for .doc file"),
        }
    }

    /// End-to-end check against the sample report fixture.
    ///
    /// The test is skipped when the fixture is absent. When the fixture is
    /// present, a parse failure is a real failure and must not pass silently.
    #[test]
    fn test_real_word_file() {
        let test_path = PathBuf::from("tests/fixtures/sample_report.docx");
        if !test_path.exists() {
            println!("Test Word file not found at {test_path:?}, skipping real file test");
            return;
        }

        let numbers = match parse_word_file(&test_path) {
            Ok(numbers) => numbers,
            // The fixture exists at this point, so an error is a genuine
            // regression rather than a missing-file condition.
            Err(e) => panic!("Word parsing failed for existing fixture {test_path:?}: {e}"),
        };

        assert!(!numbers.is_empty(), "Should extract at least some numbers");
        println!(
            "Extracted {count} numbers from Word file",
            count = numbers.len()
        );
        println!("All extracted numbers: {numbers:?}");
        assert!(
            numbers.contains(&567.89),
            "Should contain decimal amount 567.89"
        );
        assert!(
            numbers.contains(&234.0),
            "Should contain partial amount 234"
        );
        assert!(numbers.contains(&2023.0), "Should contain year 2023");
        assert!(numbers.contains(&12.5), "Should contain percentage 12.5");
        assert!(numbers.contains(&8.9), "Should contain percentage 8.9");
        assert!(
            numbers.len() >= 30,
            "Should extract at least 30 numbers, got {count}",
            count = numbers.len()
        );
        println!("✅ Word parsing test passed! Extracted financial report data.");
    }
}