// unpdf 0.4.5
//
// High-performance PDF content extraction to Markdown, text, and JSON
// Documentation
use std::path::Path;
use unpdf::{parse_file, ExtractionQuality};

/// Checks the metrics `ExtractionQuality::from_text` reports for a clean
/// English pangram: exact character and word counts, no replacement
/// characters, a "good" verdict, and no warning message.
#[test]
fn test_extraction_quality_from_text() {
    let pangram = "The quick brown fox jumps over the lazy dog";
    let quality = ExtractionQuality::from_text(pangram);

    // Raw counts expected for the pangram (43 characters, 9 words).
    assert_eq!(quality.char_count, 43);
    assert_eq!(quality.word_count, 9);
    assert_eq!(quality.replacement_char_count, 0);

    // Clean text is rated good and carries no warning.
    assert!(quality.is_good());
    assert!(quality.warning_message().is_none());
}

/// Checks that text consisting mostly of U+FFFD replacement characters is
/// reported as not good and produces a warning quoting the counts.
#[test]
fn test_extraction_quality_low() {
    // Three U+FFFD characters followed by "ab": 3 of 5 characters
    // (ratio 0.6) are replacement characters.
    let garbled = "\u{FFFD}\u{FFFD}\u{FFFD}ab";
    let quality = ExtractionQuality::from_text(garbled);

    assert_eq!(quality.char_count, 5);
    assert_eq!(quality.replacement_char_count, 3);
    assert!(!quality.is_good());

    // A warning must be present and mention "3 of 5".
    let warning = quality.warning_message().unwrap();
    assert!(warning.contains("3 of 5"));
}

/// Checks the metrics reported for empty input: zero characters and words,
/// a "not good" verdict, and a warning message being present.
#[test]
fn test_extraction_quality_empty() {
    let quality = ExtractionQuality::from_text("");

    // Nothing was extracted, so both counters are zero.
    assert_eq!(quality.char_count, 0);
    assert_eq!(quality.word_count, 0);

    // Empty output is treated as a quality problem and is accompanied by a warning.
    assert!(!quality.is_good());
    assert!(quality.warning_message().is_some());
}

/// Parses the trivial fixture and checks that its extraction-quality metrics
/// report at least one character and one word.
///
/// When the fixture file is absent the test makes no assertions.
#[test]
fn test_basic_pdf_has_quality_metrics() {
    let fixture = Path::new("test-files/basic/trivial.pdf");
    if fixture.exists() {
        let doc = parse_file(fixture).unwrap();
        let quality = &doc.extraction_quality;
        assert!(quality.char_count > 0);
        assert!(quality.word_count > 0);
    }
}

/// Parses a password-protected fixture and accepts either outcome:
///
/// * `Ok`: the document must have extracted characters or be flagged as
///   encrypted in its metadata;
/// * `Err`: the error text must contain one of the keywords `encrypted`,
///   `Encrypted`, `password`, or `supported`.
///
/// When the fixture file is absent the test makes no assertions.
#[test]
fn test_encrypted_pdf_handling() {
    let fixture = Path::new("test-files/encrypted/password-protected.pdf");
    if !fixture.exists() {
        return;
    }

    // Should either succeed (if empty password works) or return a clear error.
    match parse_file(fixture) {
        // Successfully decrypted with empty password.
        Ok(doc) => assert!(
            doc.extraction_quality.char_count > 0 || doc.metadata.encrypted,
            "Decrypted PDF should have content or report encrypted"
        ),
        Err(err) => {
            let msg = err.to_string();
            // Keywords are tried in this order; the first match is enough.
            let mentions_encryption = ["encrypted", "Encrypted", "password", "supported"]
                .iter()
                .any(|&keyword| msg.contains(keyword));
            assert!(
                mentions_encryption,
                "Error should be about encryption: {}",
                msg
            );
        }
    }
}

/// Parses the multicolumn fixture and checks that some plain text comes out.
///
/// Only non-emptiness is asserted; the order of the extracted text is not
/// inspected. When the fixture file is absent the test makes no assertions.
#[test]
fn test_multicolumn_reading_order() {
    let fixture = Path::new("test-files/complex/multicolumn.pdf");
    if fixture.exists() {
        let doc = parse_file(fixture).unwrap();
        let extracted = doc.plain_text();
        assert!(
            !extracted.is_empty(),
            "Should extract text from multicolumn PDF"
        );
    }
}

/// Parses the two-column fixture and checks that some plain text comes out.
///
/// Only non-emptiness is asserted; the order of the extracted text is not
/// inspected. When the fixture file is absent the test makes no assertions.
#[test]
fn test_two_column_reading_order() {
    let fixture = Path::new("test-files/complex/two-column.pdf");
    if fixture.exists() {
        let doc = parse_file(fixture).unwrap();
        let extracted = doc.plain_text();
        assert!(
            !extracted.is_empty(),
            "Should extract text from two-column PDF"
        );
    }
}

/// Runs the `Standard` cleanup preset over table-of-contents style lines and
/// checks that the long dot-leader run is gone while the surrounding words
/// and an ordinary paragraph line are still present.
#[test]
fn test_toc_dot_leader_removal() {
    use unpdf::render::{CleanupPipeline, CleanupPreset};

    // Two TOC entries with dot leaders, followed by one ordinary line.
    let toc_sample = concat!(
        "Chapter 1: Introduction ................................ 6\n",
        "Chapter 2: Methods ...................................... 12\n",
        "Normal paragraph text without dots.",
    );
    let dot_leader = "................................";

    let cleaner = CleanupPipeline::from_preset(CleanupPreset::Standard);
    let cleaned = cleaner.process(toc_sample);

    assert!(
        !cleaned.contains(dot_leader),
        "Dot leaders should be removed"
    );
    // Real content must not be removed along with the leaders.
    assert!(cleaned.contains("Introduction"));
    assert!(cleaned.contains("Normal paragraph text"));
}

/// Parses a fixture that contains images and checks that its plain text is
/// not empty.
///
/// When the fixture file is absent the test makes no assertions.
#[test]
fn test_image_pdf_has_content() {
    let fixture = Path::new("test-files/images/sample-with-images.pdf");
    if fixture.exists() {
        let doc = parse_file(fixture).unwrap();
        let extracted = doc.plain_text();
        assert!(
            !extracted.is_empty(),
            "Should extract text from PDF with images"
        );
    }
}

/// Parses the trivial fixture end to end and checks that plain text is still
/// produced.
///
/// Despite the name, no page-number pattern is inspected here: this exercises
/// the overall extraction rather than the internal function. When the fixture
/// file is absent the test makes no assertions.
#[test]
fn test_page_number_pattern() {
    let fixture = Path::new("test-files/basic/trivial.pdf");
    if fixture.exists() {
        let doc = parse_file(fixture).unwrap();
        let extracted = doc.plain_text();
        // Basic PDFs should still extract content.
        assert!(!extracted.is_empty());
    }
}

/// Parses the table fixture and checks that at least one page element is a
/// `Block::Table`.
///
/// The test makes no assertions when the fixture file is absent or when
/// parsing returns an error (for example an encrypted or unreadable file).
#[test]
fn test_table_extraction_basic() {
    let path = Path::new("test-files/tables/sample-tables.pdf");
    if !path.exists() {
        return;
    }

    // Skip if file is encrypted or unreadable. Binding the document in the
    // same `match` that handles the error avoids a separate `is_err()` check
    // followed by `unwrap()`.
    let doc = match parse_file(path) {
        Ok(doc) => doc,
        Err(_) => return,
    };

    // Should have some table blocks: scan every element of every page.
    let has_tables = doc
        .pages
        .iter()
        .flat_map(|page| page.elements.iter())
        .any(|block| matches!(block, unpdf::model::Block::Table(_)));
    assert!(has_tables, "Table PDF should detect tables");
}