lawkit-core 2.6.1

Core library for statistical law analysis with international number support
Documentation
use crate::common::international::extract_numbers_international;
use docx_rs::*;
use std::path::Path;

/// Parse Word files (.docx, .doc) and extract numbers from text content
pub fn parse_word_file(file_path: &Path) -> crate::error::Result<Vec<f64>> {
    let extension = file_path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("")
        .to_lowercase();

    match extension.as_str() {
        "docx" => parse_docx_file(file_path),
        "doc" => {
            // .doc files require different handling (legacy format)
            // For now, return an error suggesting conversion to .docx
            Err(crate::error::BenfError::ParseError(
                "Legacy .doc format not supported. Please convert to .docx format.".to_string(),
            ))
        }
        _ => Err(crate::error::BenfError::ParseError(format!(
            "Unsupported Word file extension: {extension}"
        ))),
    }
}

/// Parse DOCX files specifically using docx-rs
fn parse_docx_file(file_path: &Path) -> crate::error::Result<Vec<f64>> {
    // Read the DOCX file
    let file_bytes = std::fs::read(file_path).map_err(|e| {
        crate::error::BenfError::FileError(format!("Failed to read Word file: {e}"))
    })?;

    // Parse the DOCX document
    let doc = read_docx(&file_bytes).map_err(|e| {
        crate::error::BenfError::ParseError(format!("Failed to parse DOCX file: {e:?}"))
    })?;

    // Extract all text content from the document
    let mut all_text = String::new();

    // Extract text from paragraphs
    for child in &doc.document.children {
        if let DocumentChild::Paragraph(paragraph) = child {
            for run_child in &paragraph.children {
                if let ParagraphChild::Run(run) = run_child {
                    for text_child in &run.children {
                        if let RunChild::Text(text) = text_child {
                            all_text.push_str(&text.text);
                            all_text.push(' ');
                        }
                    }
                }
            }
            all_text.push('\n'); // Add line break after each paragraph
        }
    }

    // Extract numbers from the collected text (including international numerals)
    let numbers = extract_numbers_international(&all_text);

    if numbers.is_empty() {
        return Err(crate::error::BenfError::NoNumbersFound);
    }

    Ok(numbers)
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    #[test]
    fn test_word_parsing_concept() {
        // Test with non-existent file
        let test_path = PathBuf::from("nonexistent.docx");

        let result = parse_word_file(&test_path);
        assert!(result.is_err());

        // Check error type
        match result {
            Err(crate::error::BenfError::FileError(_)) => {
                // Expected file error for non-existent file
            }
            _ => panic!("Expected file error for non-existent Word file"),
        }
    }

    #[test]
    fn test_doc_format_rejection() {
        // Test that .doc files are properly rejected
        let test_path = PathBuf::from("test.doc");

        let result = parse_word_file(&test_path);
        assert!(result.is_err());

        match result {
            Err(crate::error::BenfError::ParseError(msg)) => {
                assert!(msg.contains("Legacy .doc format not supported"));
            }
            _ => panic!("Expected parse error for .doc file"),
        }
    }

    #[test]
    fn test_real_word_file() {
        // Test with actual Word file if it exists
        let test_path = PathBuf::from("tests/fixtures/sample_report.docx");

        if test_path.exists() {
            let result = parse_word_file(&test_path);
            match result {
                Ok(numbers) => {
                    // Should extract numbers from the test Word file
                    assert!(!numbers.is_empty(), "Should extract at least some numbers");

                    println!(
                        "Extracted {count} numbers from Word file",
                        count = numbers.len()
                    );
                    println!("All extracted numbers: {numbers:?}");

                    // Note: The extract_numbers_international function treats comma-separated
                    // numbers as separate values, so "1,234,567.89" becomes [1, 234, 567.89]
                    // This is the expected behavior for international numeral processing

                    // Check for some of the extracted values
                    assert!(
                        numbers.contains(&567.89),
                        "Should contain decimal amount 567.89"
                    );
                    assert!(
                        numbers.contains(&234.0),
                        "Should contain partial amount 234"
                    );
                    assert!(numbers.contains(&2023.0), "Should contain year 2023");
                    assert!(numbers.contains(&12.5), "Should contain percentage 12.5");
                    assert!(numbers.contains(&8.9), "Should contain percentage 8.9");

                    // Should extract many numbers from the financial report
                    assert!(
                        numbers.len() >= 30,
                        "Should extract at least 30 numbers, got {count}",
                        count = numbers.len()
                    );

                    println!("✅ Word parsing test passed! Extracted financial report data.");
                }
                Err(e) => {
                    // If the test file is missing or corrupt, that's also a valid test result
                    println!("Word parsing failed (expected if test file is missing): {e}");
                }
            }
        } else {
            println!("Test Word file not found at {test_path:?}, skipping real file test");
        }
    }
}