lawkit-core 2.1.0

Core library for statistical law analysis with international number support
Documentation
use crate::common::international::extract_numbers_international;
use std::io::Read;
use std::path::Path;

/// Parse OpenDocument files (.odt, .ods) and extract numbers from content
pub fn parse_opendocument_file(file_path: &Path) -> crate::error::Result<Vec<f64>> {
    let extension = file_path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("")
        .to_lowercase();

    match extension.as_str() {
        "odt" => parse_odt_file(file_path),
        "ods" => {
            // .ods files are already handled by the Excel parser (calamine)
            // Redirect to the existing Excel parser
            crate::common::input::formats::excel::parse_excel_file(file_path)
        }
        _ => Err(crate::error::BenfError::ParseError(format!(
            "Unsupported OpenDocument file extension: {extension}"
        ))),
    }
}

/// Parse ODT (OpenDocument Text) files using ZIP extraction and XML parsing
fn parse_odt_file(file_path: &Path) -> crate::error::Result<Vec<f64>> {
    // OpenDocument Text (.odt) files are ZIP archives containing XML files
    // The main content is stored in content.xml
    // Text content is in various elements like <text:p>, <text:span>, etc.

    // Attempt basic file validation
    if !file_path.exists() {
        return Err(crate::error::BenfError::FileError(format!(
            "OpenDocument file not found: {}",
            file_path.display()
        )));
    }

    // Open the ODT file as a ZIP archive
    let file = std::fs::File::open(file_path).map_err(|e| {
        crate::error::BenfError::FileError(format!("Failed to open OpenDocument file: {e}"))
    })?;

    let mut archive = zip::ZipArchive::new(file).map_err(|e| {
        crate::error::BenfError::ParseError(format!(
            "Invalid OpenDocument file format (not a ZIP archive): {e}"
        ))
    })?;

    // Look for content.xml (main document content)
    let mut content_xml = None;
    for i in 0..archive.len() {
        let file = archive.by_index(i).map_err(|e| {
            crate::error::BenfError::ParseError(format!("Failed to read ZIP entry: {e}"))
        })?;

        if file.name() == "content.xml" {
            content_xml = Some(i);
            break;
        }
    }

    let content_index = content_xml.ok_or_else(|| {
        crate::error::BenfError::ParseError(
            "content.xml not found in OpenDocument file".to_string(),
        )
    })?;

    // Extract content from content.xml
    let mut file = archive.by_index(content_index).map_err(|e| {
        crate::error::BenfError::ParseError(format!("Failed to read content.xml: {e}"))
    })?;

    let mut contents = String::new();
    file.read_to_string(&mut contents).map_err(|e| {
        crate::error::BenfError::ParseError(format!("Failed to read content.xml data: {e}"))
    })?;

    // Extract text content from XML
    let text_content = extract_text_from_odt_xml(&contents)?;

    if text_content.trim().is_empty() {
        return Err(crate::error::BenfError::NoNumbersFound);
    }

    // Extract numbers using international number processing
    let numbers = extract_numbers_international(&text_content);

    if numbers.is_empty() {
        Err(crate::error::BenfError::NoNumbersFound)
    } else {
        Ok(numbers)
    }
}

/// Extract text content from OpenDocument XML
fn extract_text_from_odt_xml(xml_content: &str) -> crate::error::Result<String> {
    use regex::Regex;

    // OpenDocument text content is in various elements
    // Common text elements: <text:p>, <text:span>, <text:h>, etc.
    let text_regex = Regex::new(r"<text:[^>]*>(.*?)</text:[^>]*>").map_err(|e| {
        crate::error::BenfError::ParseError(format!("Failed to compile regex: {e}"))
    })?;

    // Also extract from table cells
    let table_regex = Regex::new(r"<table:[^>]*>(.*?)</table:[^>]*>").map_err(|e| {
        crate::error::BenfError::ParseError(format!("Failed to compile table regex: {e}"))
    })?;

    let mut extracted_text = Vec::new();

    // Extract text from text elements
    for cap in text_regex.captures_iter(xml_content) {
        if let Some(text_match) = cap.get(1) {
            let inner_text = extract_inner_text(text_match.as_str());
            if !inner_text.trim().is_empty() {
                extracted_text.push(inner_text);
            }
        }
    }

    // Extract text from table elements
    for cap in table_regex.captures_iter(xml_content) {
        if let Some(text_match) = cap.get(1) {
            let inner_text = extract_inner_text(text_match.as_str());
            if !inner_text.trim().is_empty() {
                extracted_text.push(inner_text);
            }
        }
    }

    Ok(extracted_text.join(" "))
}

/// Extract inner text from XML, removing nested tags
fn extract_inner_text(xml_fragment: &str) -> String {
    use regex::Regex;

    // Remove all XML tags, keeping only text content
    let tag_regex = Regex::new(r"<[^>]*>").unwrap();
    let text_only = tag_regex.replace_all(xml_fragment, " ");

    // Decode XML entities
    decode_xml_entities(&text_only)
}

/// Decode basic XML entities
fn decode_xml_entities(text: &str) -> String {
    text.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&amp;", "&")
        .replace("&quot;", "\"")
        .replace("&apos;", "'")
        .replace("&nbsp;", " ")
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    #[test]
    fn test_odt_parsing_concept() {
        // Test with non-existent file
        let test_path = PathBuf::from("nonexistent.odt");

        let result = parse_opendocument_file(&test_path);
        assert!(result.is_err());

        // Check error type
        match result {
            Err(crate::error::BenfError::FileError(_)) => {
                // Expected file error for non-existent file
            }
            _ => panic!("Expected file error for non-existent ODT file"),
        }
    }

    #[test]
    fn test_odt_real_file() {
        // Test with real ODT file if it exists
        let test_path = PathBuf::from("tests/fixtures/sample_document.odt");

        if test_path.exists() {
            let result = parse_opendocument_file(&test_path);
            match result {
                Ok(numbers) => {
                    println!(
                        "✅ ODT parsing succeeded! Found {count} numbers",
                        count = numbers.len()
                    );
                    assert!(
                        !numbers.is_empty(),
                        "Should extract at least some numbers from ODT"
                    );

                    // Print first few numbers for verification
                    println!(
                        "First 10 numbers: {first_10:?}",
                        first_10 = numbers.iter().take(10).collect::<Vec<_>>()
                    );
                }
                Err(e) => {
                    println!("ODT parsing failed: {e}");
                    // For now, we'll allow this to fail as the test file might not exist
                }
            }
        } else {
            println!("Test ODT file not found at {test_path:?}, skipping real file test");
        }
    }

    #[test]
    fn test_odt_xml_text_extraction() {
        // Test ODT XML text extraction function
        let sample_xml = r#"
            <office:document-content xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
                                   xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0">
                <office:body>
                    <office:text>
                        <text:p>売上高: 1,234,567円</text:p>
                        <text:p>純利益: 2,345,678円</text:p>
                        <text:h>第3四半期実績</text:h>
                        <text:span>前年比: 345%増加</text:span>
                    </office:text>
                </office:body>
            </office:document-content>
        "#;

        let result = extract_text_from_odt_xml(sample_xml);
        assert!(result.is_ok());

        let text = result.unwrap();
        assert!(text.contains("1,234,567"));
        assert!(text.contains("2,345,678"));
        assert!(text.contains("345"));
        println!("Extracted ODT text: {text}");
    }

    #[test]
    fn test_odt_inner_text_extraction() {
        let xml_with_nested_tags = r#"売上: <text:span style="font-weight:bold">1,000,000</text:span>円 利益: <text:span>500,000</text:span>円"#;
        let result = extract_inner_text(xml_with_nested_tags);
        assert!(result.contains("1,000,000"));
        assert!(result.contains("500,000"));
        assert!(!result.contains("<text:span"));
        println!("Inner text: {result}");
    }

    #[test]
    fn test_odt_xml_entity_decoding() {
        let text_with_entities =
            "Profit &amp; Loss: &lt;500,000&gt; &quot;estimated&quot; &nbsp;margin";
        let decoded = decode_xml_entities(text_with_entities);
        assert_eq!(decoded, "Profit & Loss: <500,000> \"estimated\"  margin");
    }

    #[test]
    fn test_ods_redirect() {
        // Test that .ods files are redirected to Excel parser
        let test_path = PathBuf::from("tests/fixtures/sample_data.ods");

        if test_path.exists() {
            let result = parse_opendocument_file(&test_path);
            // This should work if the .ods file exists and is valid
            match result {
                Ok(numbers) => {
                    println!(
                        "✅ ODS parsing (via Excel parser) succeeded! Found {count} numbers",
                        count = numbers.len()
                    );
                }
                Err(e) => {
                    println!("ODS parsing failed: {e}");
                    // This might fail if file doesn't exist, which is fine for testing
                }
            }
        } else {
            println!("Test ODS file not found, skipping ODS redirect test");
        }
    }
}