lawkit-core 2.6.1

Core library for statistical law analysis with international number support
Documentation
use crate::common::international::extract_numbers_international;
use std::io::Read;
use std::path::Path;

/// Parse PowerPoint files (.pptx, .ppt) and extract numbers from slide content
pub fn parse_powerpoint_file(file_path: &Path) -> crate::error::Result<Vec<f64>> {
    let extension = file_path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("")
        .to_lowercase();

    match extension.as_str() {
        "pptx" => parse_pptx_file(file_path),
        "ppt" => {
            // .ppt files require different handling (legacy format)
            // For now, return an error suggesting conversion to .pptx
            Err(crate::error::BenfError::ParseError(
                "Legacy .ppt format not supported. Please convert to .pptx format.".to_string(),
            ))
        }
        _ => Err(crate::error::BenfError::ParseError(format!(
            "Unsupported PowerPoint file extension: {extension}"
        ))),
    }
}

/// Parse PPTX files using ZIP extraction and XML parsing
fn parse_pptx_file(file_path: &Path) -> crate::error::Result<Vec<f64>> {
    // PowerPoint (.pptx) files are ZIP archives containing XML files
    // The slide content is stored in ppt/slides/slide*.xml files
    // Text content is in <a:t> elements within the XML structure

    // Attempt basic file validation
    if !file_path.exists() {
        return Err(crate::error::BenfError::FileError(format!(
            "PowerPoint file not found: {}",
            file_path.display()
        )));
    }

    // Open the PPTX file as a ZIP archive
    let file = std::fs::File::open(file_path).map_err(|e| {
        crate::error::BenfError::FileError(format!("Failed to open PowerPoint file: {e}"))
    })?;

    let mut archive = zip::ZipArchive::new(file).map_err(|e| {
        crate::error::BenfError::ParseError(format!(
            "Invalid PowerPoint file format (not a ZIP archive): {e}"
        ))
    })?;

    let mut all_text = String::new();

    // Iterate through all files in the ZIP archive
    for i in 0..archive.len() {
        let mut file = archive.by_index(i).map_err(|e| {
            crate::error::BenfError::ParseError(format!("Failed to read ZIP entry: {e}"))
        })?;

        let file_name = file.name().to_string();

        // Look for slide XML files
        if file_name.starts_with("ppt/slides/slide") && file_name.ends_with(".xml") {
            let mut contents = String::new();
            file.read_to_string(&mut contents).map_err(|e| {
                crate::error::BenfError::ParseError(format!("Failed to read slide XML: {e}"))
            })?;

            // Extract text content from XML
            let slide_text = extract_text_from_slide_xml(&contents)?;
            all_text.push_str(&slide_text);
            all_text.push(' '); // Add separator between slides
        }
    }

    if all_text.trim().is_empty() {
        return Err(crate::error::BenfError::NoNumbersFound);
    }

    // Extract numbers using international number processing
    let numbers = extract_numbers_international(&all_text);

    if numbers.is_empty() {
        Err(crate::error::BenfError::NoNumbersFound)
    } else {
        Ok(numbers)
    }
}

/// Extract text content from a PowerPoint slide XML
fn extract_text_from_slide_xml(xml_content: &str) -> crate::error::Result<String> {
    use regex::Regex;

    // PowerPoint slide text is contained in <a:t> elements
    // We'll use regex to extract text content from these elements
    let text_regex = Regex::new(r"<a:t[^>]*>(.*?)</a:t>").map_err(|e| {
        crate::error::BenfError::ParseError(format!("Failed to compile regex: {e}"))
    })?;

    let mut extracted_text = Vec::new();

    for cap in text_regex.captures_iter(xml_content) {
        if let Some(text_match) = cap.get(1) {
            let text = text_match.as_str();
            // Decode XML entities
            let decoded_text = decode_xml_entities(text);
            extracted_text.push(decoded_text);
        }
    }

    Ok(extracted_text.join(" "))
}

/// Decode basic XML entities
fn decode_xml_entities(text: &str) -> String {
    text.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&amp;", "&")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    #[test]
    fn test_powerpoint_parsing_concept() {
        // Test with non-existent file
        let test_path = PathBuf::from("nonexistent.pptx");

        let result = parse_powerpoint_file(&test_path);
        assert!(result.is_err());

        // Check error type
        match result {
            Err(crate::error::BenfError::FileError(_)) => {
                // Expected file error for non-existent file
            }
            _ => panic!("Expected file error for non-existent PowerPoint file"),
        }
    }

    #[test]
    fn test_ppt_format_rejection() {
        // Test that .ppt files are properly rejected
        let test_path = PathBuf::from("test.ppt");

        let result = parse_powerpoint_file(&test_path);
        assert!(result.is_err());

        match result {
            Err(crate::error::BenfError::ParseError(msg)) => {
                assert!(msg.contains("Legacy .ppt format not supported"));
            }
            _ => panic!("Expected parse error for .ppt file"),
        }
    }

    #[test]
    fn test_powerpoint_real_file() {
        // Test with real PowerPoint file
        let test_path = PathBuf::from("tests/fixtures/sample_presentation.pptx");

        if test_path.exists() {
            let result = parse_powerpoint_file(&test_path);
            match result {
                Ok(numbers) => {
                    println!(
                        "✅ PowerPoint parsing succeeded! Found {count} numbers",
                        count = numbers.len()
                    );
                    assert!(
                        !numbers.is_empty(),
                        "Should extract at least some numbers from PowerPoint"
                    );

                    // Print first few numbers for verification
                    println!(
                        "First 10 numbers: {first_10:?}",
                        first_10 = numbers.iter().take(10).collect::<Vec<_>>()
                    );
                }
                Err(e) => {
                    println!("PowerPoint parsing failed: {e}");
                    // For now, we'll allow this to fail as the implementation is new
                    // In the future, this should be changed to assert!(false)
                }
            }
        } else {
            println!("Test PowerPoint file not found at {test_path:?}, skipping real file test");
        }
    }

    #[test]
    fn test_xml_text_extraction() {
        // Test XML text extraction function
        let sample_xml = r#"
            <a:p xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
                <a:r>
                    <a:rPr lang="ja-JP"/>
                    <a:t>売上: 1,234,567円</a:t>
                </a:r>
            </a:p>
            <a:p>
                <a:r>
                    <a:t>利益: 2,345,678円</a:t>
                </a:r>
            </a:p>
        "#;

        let result = extract_text_from_slide_xml(sample_xml);
        assert!(result.is_ok());

        let text = result.unwrap();
        assert!(text.contains("1,234,567"));
        assert!(text.contains("2,345,678"));
        println!("Extracted text: {text}");
    }

    #[test]
    fn test_xml_entity_decoding() {
        let text_with_entities = "Sales &amp; Marketing: &lt;100,000&gt; &quot;profit&quot;";
        let decoded = decode_xml_entities(text_with_entities);
        assert_eq!(decoded, "Sales & Marketing: <100,000> \"profit\"");
    }
}