pdf_oxide 0.3.22

The fastest Rust PDF library with text extraction: 0.8ms mean, 100% pass rate on 3,830 PDFs. 5× faster than pdf_extract, 17× faster than oxidize_pdf. Extract, create, and edit PDFs.
Documentation
//! Tests for ISO 32000-1:2008 Section 9.10.2 Character-to-Unicode Mapping Priorities
//!
//! The PDF spec defines a 5-level priority order for character-to-Unicode mapping:
//! 1. ToUnicode CMap (highest priority)
//! 2. Adobe Glyph List (fallback 1)
//! 3. Predefined CMaps (fallback 2)
//! 4. ActualText attribute (fallback 3)
//! 5. Font encoding (lowest priority)

use pdf_oxide::fonts::character_mapper::CharacterMapper;
use std::collections::HashMap;

/// A ToUnicode entry for code 0x41 must be the value `map_character` returns.
#[test]
fn test_priority_1_tounicode_cmap_highest() {
    // Single-entry ToUnicode CMap: 0x41 -> "A".
    let cmap = HashMap::from([(0x41, String::from("A"))]);

    let mut mapper = CharacterMapper::new();
    mapper.set_tounicode_cmap(Some(cmap));

    // The lookup is expected to be answered by the ToUnicode CMap (priority 1).
    assert_eq!(mapper.map_character(0x41).as_deref(), Some("A"));
}

/// With no ToUnicode CMap installed by the test, code 0x47 is expected to map to "G".
#[test]
fn test_priority_2_adobe_glyph_list_fallback() {
    // A freshly constructed mapper is queried directly; the answer should come from
    // the fallback this file describes as the Adobe Glyph List.
    let mapped = CharacterMapper::new().map_character(0x47);

    assert_eq!(mapped, Some(String::from("G")));
}

/// A ToUnicode entry must win over the default mapping for the same code.
#[test]
fn test_priority_1_overrides_adobe_glyph_list() {
    let mut mapper = CharacterMapper::new();

    // Deliberately map 0x47 to "X" so the result is distinguishable from the "G"
    // that the fallback is expected to produce for this code.
    mapper.set_tounicode_cmap(Some(HashMap::from([(0x47, "X".to_owned())])));

    let mapped = mapper.map_character(0x47);
    assert_eq!(mapped.as_deref(), Some("X"));
}

/// A default mapper must produce some mapping for a few common character codes.
#[test]
fn test_adobe_glyph_list_symbol_fonts() {
    let mapper = CharacterMapper::new();

    // Only the presence of a mapping is checked, not its value.
    let space = mapper.map_character(0x20);
    let upper_a = mapper.map_character(0x41);
    let lower_z = mapper.map_character(0x7A);

    assert!(space.is_some());
    assert!(upper_a.is_some());
    assert!(lower_z.is_some());
}

/// Lookups by glyph name must succeed for a handful of letter and symbol names.
#[test]
fn test_adobe_glyph_list_special_glyphs() {
    let mapper = CharacterMapper::new();

    // Only the existence of a mapping is asserted; the mapped value is not inspected.
    for glyph in ["A", "B", "ampersand", "infinity"] {
        assert!(
            mapper.map_glyph_name(glyph).is_some(),
            "glyph name `{}` should resolve",
            glyph
        );
    }
}

/// Lookups by glyph name must succeed for the `fi`, `fl` and `ff` ligature names.
#[test]
fn test_adobe_glyph_list_ligatures() {
    let mapper = CharacterMapper::new();

    // Small helper so each assertion names the ligature it checks.
    let resolves = |glyph: &str| -> bool { mapper.map_glyph_name(glyph).is_some() };

    assert!(resolves("fi"));
    assert!(resolves("fl"));
    assert!(resolves("ff"));
}

/// A ToUnicode entry for code 0x50 must be returned by `map_character`.
#[test]
fn test_priority_chain_tounicode_wins() {
    // ToUnicode CMap with one entry: 0x50 -> "P".
    let only_p = HashMap::from([(0x50, "P".to_owned())]);

    let mut mapper = CharacterMapper::new();
    mapper.set_tounicode_cmap(Some(only_p));

    // The installed ToUnicode value is expected regardless of any lower-priority source.
    let mapped = mapper.map_character(0x50);
    assert_eq!(mapped, Some("P".to_owned()));
}

/// Codes missing from a partial ToUnicode CMap must still be mapped by the fallback.
#[test]
fn test_incomplete_tounicode_falls_back_to_adobe() {
    let mut mapper = CharacterMapper::new();

    // The CMap covers 0x50 only; 0x51 is intentionally left out.
    mapper.set_tounicode_cmap(Some(HashMap::from([(0x50, String::from("P"))])));

    // Covered code: answered by the ToUnicode entry.
    assert_eq!(mapper.map_character(0x50).as_deref(), Some("P"));

    // Uncovered code: expected to resolve to "Q" through the fallback
    // (described in this file as the Adobe Glyph List).
    assert_eq!(mapper.map_character(0x51).as_deref(), Some("Q"));
}

/// An empty ToUnicode CMap must not prevent the fallback mapping from being used.
#[test]
fn test_empty_tounicode_uses_adobe_glyph_list() {
    let mut mapper = CharacterMapper::new();

    // Install a CMap that is present but has no entries at all.
    let no_entries = HashMap::new();
    mapper.set_tounicode_cmap(Some(no_entries));

    // 0x41 is still expected to resolve to "A" via the fallback.
    let mapped = mapper.map_character(0x41);
    assert_eq!(mapped.as_deref(), Some("A"));
}

/// Configuring one mapper must not change the results of another, separately built mapper.
#[test]
fn test_character_mapper_state_isolation() {
    let mut mapper1 = CharacterMapper::new();
    // mapper2 is never configured, so it does not need to be mutable.
    let mapper2 = CharacterMapper::new();

    // Give mapper1 (and only mapper1) a ToUnicode CMap that remaps 0x41 to "X".
    let tounicode = {
        let mut cmap = HashMap::new();
        cmap.insert(0x41, "X".to_string());
        cmap
    };
    mapper1.set_tounicode_cmap(Some(tounicode));

    // mapper1 should answer from its ToUnicode CMap.
    assert_eq!(mapper1.map_character(0x41), Some("X".to_string()));

    // mapper2 should be unaffected and still give the default "A".
    assert_eq!(mapper2.map_character(0x41), Some("A".to_string()));
}

/// A font-encoding entry is used on its own, and is overridden once ToUnicode covers the code.
#[test]
fn test_character_mapping_with_font_encoding() {
    let mut mapper = CharacterMapper::new();

    // Step 1: only a font encoding is installed, mapping 0xAA to 'Z'.
    mapper.set_font_encoding(Some(HashMap::from([(0xAA, 'Z')])));
    assert_eq!(mapper.map_character(0xAA).as_deref(), Some("Z"));

    // Step 2: a ToUnicode CMap for the same code is added on top; the font
    // encoding stays installed, so this checks which source takes precedence.
    mapper.set_tounicode_cmap(Some(HashMap::from([(0xAA, String::from("Y"))])));
    assert_eq!(mapper.map_character(0xAA).as_deref(), Some("Y"));
}

/// An unmapped code must yield either `None` or the U+FFFD replacement character.
#[test]
fn test_no_mapping_returns_none() {
    let mapper = CharacterMapper::new();

    // 0xFFFE is used as a code that is not expected to have any mapping.
    let result = mapper.map_character(0xFFFE);

    // Both outcomes are accepted: no mapping at all, or the replacement character.
    assert!(matches!(result.as_deref(), None | Some("\u{FFFD}")));
}

/// Every printable ASCII code (0x20 through 0x7E) must have a mapping on a default mapper.
#[test]
fn test_ascii_range_always_mapped() {
    let mapper = CharacterMapper::new();

    // The range is typed as u32 up front so no cast is needed at the call site.
    for code in 0x20u32..=0x7E {
        let mapped = mapper.map_character(code);
        assert!(mapped.is_some(), "ASCII code 0x{:02X} should map", code);
    }
}

/// Setting a ToUnicode CMap a second time must replace the mapping from the first one.
#[test]
fn test_custom_tounicode_overwrites_previous() {
    let mut mapper = CharacterMapper::new();

    // Install two CMaps in turn for the same code 0x41, first "A" and then "X";
    // after each installation the lookup must reflect the most recent CMap.
    for replacement in ["A", "X"] {
        let cmap = HashMap::from([(0x41, replacement.to_string())]);
        mapper.set_tounicode_cmap(Some(cmap));
        assert_eq!(mapper.map_character(0x41), Some(replacement.to_string()));
    }
}

/// A ToUnicode entry may expand one character code to a multi-character string.
#[test]
fn test_ligature_expansion() {
    // Code 0x0C stands for the "fi" ligature and expands to the two letters "fi".
    let ligatures = HashMap::from([(0x0C, String::from("fi"))]);

    let mut mapper = CharacterMapper::new();
    mapper.set_tounicode_cmap(Some(ligatures));

    // The whole two-character string must come back, not just its first letter.
    assert_eq!(mapper.map_character(0x0C).as_deref(), Some("fi"));
}

/// A ToUnicode value outside the Basic Multilingual Plane must be returned unchanged.
#[test]
fn test_utf16_surrogate_pairs() {
    // Mathematical italic small rho, U+1D70C (UTF-16 surrogate pair D835 DF0C).
    // The map is given the already-decoded string, so this test checks that the
    // value survives the lookup, not the surrogate-pair decoding itself.
    let rho = "𝜌";

    let mut mapper = CharacterMapper::new();
    mapper.set_tounicode_cmap(Some(HashMap::from([(0x0001, rho.to_string())])));

    assert_eq!(mapper.map_character(0x0001), Some(rho.to_string()));
}

/// Looking up a 16-bit code (0x3042, hiragana "A") on a default mapper must not panic.
#[test]
fn test_cjk_font_support() {
    let mapper = CharacterMapper::new();

    // Smoke test only: the result may be `Some` or `None`, so it is discarded.
    // The test passes as long as the call returns.
    let _ = mapper.map_character(0x3042);
}

/// Codes 0x41 through 0x45 must map to "A" through "E" on a default mapper.
#[test]
fn test_batch_character_mapping() {
    let mapper = CharacterMapper::new();

    // Each code sits next to its expected text in one fixed-size table, so the
    // pairs cannot drift apart and no heap allocation is needed for test data.
    let cases: [(u32, &str); 5] = [
        (0x41, "A"),
        (0x42, "B"),
        (0x43, "C"),
        (0x44, "D"),
        (0x45, "E"),
    ];

    for (code, expected) in cases {
        // The message names the failing code, which a bare assert in a loop would not.
        assert_eq!(
            mapper.map_character(code),
            Some(expected.to_string()),
            "code 0x{:02X} should map to {:?}",
            code,
            expected
        );
    }
}

/// Checks the top of the priority order: ToUnicode first, then the default fallback.
///
/// The order under test is the one listed in this file's header for
/// ISO 32000-1:2008, Section 9.10.2.
#[test]
fn test_spec_compliance_priority_order() {
    let mut mapper = CharacterMapper::new();

    // Priority 1: an explicit ToUnicode entry for 0x41 with a value that no other
    // source could plausibly produce.
    mapper.set_tounicode_cmap(Some(HashMap::from([(
        0x41,
        String::from("FromToUnicode"),
    )])));

    assert_eq!(
        mapper.map_character(0x41).as_deref(),
        Some("FromToUnicode"),
        "ToUnicode CMap should have highest priority"
    );

    // 0x42 has no ToUnicode entry, so it is expected to resolve to "B" through
    // the fallback (described in this file as the Adobe Glyph List).
    let fallback = mapper.map_character(0x42);
    assert_eq!(fallback.as_deref(), Some("B"));
}