lawkit-core 2.1.0

Core library for statistical law analysis with international number support
Documentation
use crate::common::international::extract_numbers_international;
use pdf_extract::extract_text;
use std::path::Path;

/// Parse PDF files and extract numbers from text content
pub fn parse_pdf_file(file_path: &Path) -> crate::error::Result<Vec<f64>> {
    // Extract text from PDF file path
    let text = extract_text(file_path).map_err(|e| {
        crate::error::BenfError::ParseError(format!("Failed to extract text from PDF: {e}"))
    })?;

    // Extract numbers from the text (including international numerals)
    let numbers = extract_numbers_international(&text);

    if numbers.is_empty() {
        return Err(crate::error::BenfError::NoNumbersFound);
    }

    Ok(numbers)
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    #[test]
    fn test_pdf_parsing_concept() {
        // This test demonstrates the concept - actual PDF files would be needed for real testing
        let test_path = PathBuf::from("nonexistent.pdf");

        // Test should fail gracefully for non-existent file
        let result = parse_pdf_file(&test_path);
        assert!(result.is_err());

        // Check error type - pdf-extract returns ParseError for non-existent files
        match result {
            Err(crate::error::BenfError::ParseError(_)) => {
                // Expected parse error for non-existent file
            }
            Err(crate::error::BenfError::FileError(_)) => {
                // Also acceptable as file error
            }
            _ => panic!("Expected parse or file error for non-existent PDF file"),
        }
    }

    #[test]
    fn test_real_pdf_file() {
        // Test with actual PDF file if it exists
        let test_path = PathBuf::from("tests/fixtures/sample_report.pdf");

        if test_path.exists() {
            let result = parse_pdf_file(&test_path);
            match result {
                Ok(numbers) => {
                    // Should extract numbers from the test PDF file
                    assert!(!numbers.is_empty(), "Should extract at least some numbers");

                    println!(
                        "Extracted {count} numbers from PDF file",
                        count = numbers.len()
                    );
                    println!("All extracted numbers: {numbers:?}");

                    // Expected numbers from our financial report:
                    // Same content as Word document, so similar number extraction expected
                    // Note: PDF text extraction might have different spacing/formatting

                    // Check for some key numbers (allowing for PDF formatting differences)
                    assert!(
                        numbers.contains(&567.89) || numbers.contains(&567.0),
                        "Should contain amount 567.89 or 567"
                    );
                    assert!(numbers.contains(&2023.0), "Should contain year 2023");
                    assert!(
                        numbers.contains(&12.5) || numbers.contains(&12.0),
                        "Should contain percentage 12.5 or 12"
                    );

                    // Should extract reasonable number of values from financial report
                    assert!(
                        numbers.len() >= 20,
                        "Should extract at least 20 numbers, got {count}",
                        count = numbers.len()
                    );

                    println!("✅ PDF parsing test passed! Extracted financial report data.");
                }
                Err(e) => {
                    // If the test file is missing or corrupt, that's also a valid test result
                    println!("PDF parsing failed (expected if test file is missing): {e}");
                }
            }
        } else {
            println!("Test PDF file not found at {test_path:?}, skipping real file test");
        }
    }
}