use crate::common::international::extract_numbers_international;
use std::io::Read;
use std::path::Path;
pub fn parse_powerpoint_file(file_path: &Path) -> crate::error::Result<Vec<f64>> {
let extension = file_path
.extension()
.and_then(|ext| ext.to_str())
.unwrap_or("")
.to_lowercase();
match extension.as_str() {
"pptx" => parse_pptx_file(file_path),
"ppt" => {
Err(crate::error::BenfError::ParseError(
"Legacy .ppt format not supported. Please convert to .pptx format.".to_string(),
))
}
_ => Err(crate::error::BenfError::ParseError(format!(
"Unsupported PowerPoint file extension: {extension}"
))),
}
}
fn parse_pptx_file(file_path: &Path) -> crate::error::Result<Vec<f64>> {
if !file_path.exists() {
return Err(crate::error::BenfError::FileError(format!(
"PowerPoint file not found: {}",
file_path.display()
)));
}
let file = std::fs::File::open(file_path).map_err(|e| {
crate::error::BenfError::FileError(format!("Failed to open PowerPoint file: {e}"))
})?;
let mut archive = zip::ZipArchive::new(file).map_err(|e| {
crate::error::BenfError::ParseError(format!(
"Invalid PowerPoint file format (not a ZIP archive): {e}"
))
})?;
let mut all_text = String::new();
for i in 0..archive.len() {
let mut file = archive.by_index(i).map_err(|e| {
crate::error::BenfError::ParseError(format!("Failed to read ZIP entry: {e}"))
})?;
let file_name = file.name().to_string();
if file_name.starts_with("ppt/slides/slide") && file_name.ends_with(".xml") {
let mut contents = String::new();
file.read_to_string(&mut contents).map_err(|e| {
crate::error::BenfError::ParseError(format!("Failed to read slide XML: {e}"))
})?;
let slide_text = extract_text_from_slide_xml(&contents)?;
all_text.push_str(&slide_text);
all_text.push(' '); }
}
if all_text.trim().is_empty() {
return Err(crate::error::BenfError::NoNumbersFound);
}
let numbers = extract_numbers_international(&all_text);
if numbers.is_empty() {
Err(crate::error::BenfError::NoNumbersFound)
} else {
Ok(numbers)
}
}
/// Collects the contents of every DrawingML text run (`<a:t>…</a:t>`) in a
/// slide's XML and joins them with single spaces.
///
/// Each run is passed through `decode_xml_entities` before joining. Input
/// without any text runs yields an empty string.
///
/// # Errors
///
/// Returns `BenfError::ParseError` if the pattern fails to compile.
fn extract_text_from_slide_xml(xml_content: &str) -> crate::error::Result<String> {
    use regex::Regex;

    // The opening tag must be exactly `a:t`, optionally followed by
    // whitespace and attributes. A looser `<a:t[^>]*>` also matches
    // `<a:tbl>`, `<a:tc>`, `<a:tr h="…">`, `<a:txBody>` and the self-closing
    // `<a:t/>`, which would pull raw markup (including numeric attribute
    // values) into the captured text.
    // `(?s)` lets `.` cross line breaks so multi-line runs are not dropped.
    let text_regex = Regex::new(r"(?s)<a:t(?:\s[^>]*)?>(.*?)</a:t>").map_err(|e| {
        crate::error::BenfError::ParseError(format!("Failed to compile regex: {e}"))
    })?;

    let runs: Vec<String> = text_regex
        .captures_iter(xml_content)
        .filter_map(|cap| cap.get(1))
        .map(|text_match| decode_xml_entities(text_match.as_str()))
        .collect();

    Ok(runs.join(" "))
}
/// Replaces the five predefined XML entities in `text` with the characters
/// they stand for: `&lt;` → `<`, `&gt;` → `>`, `&amp;` → `&`,
/// `&quot;` → `"`, `&apos;` → `'`.
///
/// The input is decoded in a single left-to-right pass, so output produced
/// by one replacement is never decoded a second time (`&amp;quot;` becomes
/// `&quot;`, not `"`). An `&` that does not start one of the five entities
/// is copied through unchanged.
fn decode_xml_entities(text: &str) -> String {
    // Entity names without their leading '&', paired with the decoded char.
    const ENTITIES: [(&str, char); 5] = [
        ("lt;", '<'),
        ("gt;", '>'),
        ("amp;", '&'),
        ("quot;", '"'),
        ("apos;", '\''),
    ];

    let mut decoded = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp_pos) = rest.find('&') {
        // '&' is a single ASCII byte, so both slice points are char boundaries.
        decoded.push_str(&rest[..amp_pos]);
        let after_amp = &rest[amp_pos + 1..];

        let hit = ENTITIES
            .iter()
            .find_map(|&(name, ch)| after_amp.strip_prefix(name).map(|tail| (ch, tail)));

        match hit {
            Some((ch, tail)) => {
                decoded.push(ch);
                rest = tail;
            }
            None => {
                // Not a recognised entity: keep the ampersand literally.
                decoded.push('&');
                rest = after_amp;
            }
        }
    }

    decoded.push_str(rest);
    decoded
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// A `.pptx` path that does not exist must be reported as a file error.
    #[test]
    fn test_powerpoint_parsing_concept() {
        let test_path = PathBuf::from("nonexistent.pptx");
        let result = parse_powerpoint_file(&test_path);
        assert!(
            matches!(result, Err(crate::error::BenfError::FileError(_))),
            "Expected file error for non-existent PowerPoint file"
        );
    }

    /// Legacy `.ppt` files are rejected by extension, without touching disk.
    #[test]
    fn test_ppt_format_rejection() {
        let test_path = PathBuf::from("test.ppt");
        let result = parse_powerpoint_file(&test_path);
        match result {
            Err(crate::error::BenfError::ParseError(msg)) => {
                assert!(msg.contains("Legacy .ppt format not supported"));
            }
            _ => panic!("Expected parse error for .ppt file"),
        }
    }

    /// Smoke test against an optional fixture. It is skipped when the
    /// fixture is absent, and a parse failure is only logged.
    #[test]
    fn test_powerpoint_real_file() {
        let test_path = PathBuf::from("tests/fixtures/sample_presentation.pptx");
        if !test_path.exists() {
            println!("Test PowerPoint file not found at {test_path:?}, skipping real file test");
            return;
        }

        match parse_powerpoint_file(&test_path) {
            Ok(numbers) => {
                println!(
                    "✅ PowerPoint parsing succeeded! Found {count} numbers",
                    count = numbers.len()
                );
                assert!(
                    !numbers.is_empty(),
                    "Should extract at least some numbers from PowerPoint"
                );
                println!(
                    "First 10 numbers: {first_10:?}",
                    first_10 = numbers.iter().take(10).collect::<Vec<_>>()
                );
            }
            Err(e) => {
                println!("PowerPoint parsing failed: {e}");
            }
        }
    }

    /// Text runs are pulled out of DrawingML, with and without a preceding
    /// run-properties element.
    #[test]
    fn test_xml_text_extraction() {
        let sample_xml = r#"
<a:p xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
<a:r>
<a:rPr lang="ja-JP"/>
<a:t>売上: 1,234,567円</a:t>
</a:r>
</a:p>
<a:p>
<a:r>
<a:t>利益: 2,345,678円</a:t>
</a:r>
</a:p>
"#;
        let result = extract_text_from_slide_xml(sample_xml);
        assert!(result.is_ok());
        let text = result.unwrap();
        assert!(text.contains("1,234,567"));
        assert!(text.contains("2,345,678"));
        println!("Extracted text: {text}");
    }

    /// The predefined XML entities are turned back into literal characters.
    #[test]
    fn test_xml_entity_decoding() {
        let text_with_entities = "Sales &amp; Marketing: &lt;100,000&gt; &quot;profit&quot;";
        let decoded = decode_xml_entities(text_with_entities);
        assert_eq!(decoded, "Sales & Marketing: <100,000> \"profit\"");
    }
}