pdf_oxide 0.3.59

The fastest Rust PDF library with text extraction: 0.8ms mean, 100% pass rate on 3,830 PDFs. 5× faster than pdf_extract, 17× faster than oxidize_pdf. Extract, create, and edit PDFs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
//! Unicode encoding support for PDF text.
//!
//! This module handles encoding conversions for PDF text strings,
//! particularly for CID fonts with Identity-H encoding.
//!
//! # Identity-H Encoding
//!
//! Per PDF spec Section 9.7.5.2, Identity-H is a predefined CMap that
//! maps each 2-byte character code to the CID with the same value (and
//! for TrueType-based CID fonts with an identity CID-to-GID map, CID = GID).
//! This allows using glyph IDs directly in content streams as:
//!
//! ```text
//! <0001002300A4> Tj
//! ```
//!
//! Where each 4-digit hex value is a glyph ID (big-endian u16).

use std::collections::HashMap;

/// Unicode encoder for PDF text strings.
///
/// Converts Unicode text to the PDF string syntaxes used by the different
/// encodings: Identity-H glyph-ID hex strings, escaped literal strings and
/// UTF-16BE hex strings.
///
/// Only [`encode_identity_h`](Self::encode_identity_h) uses instance state
/// (the glyph cache); the remaining encoders are associated functions.
#[derive(Debug)]
pub struct UnicodeEncoder {
    /// Successful `glyph_lookup` results, keyed by Unicode codepoint.
    glyph_cache: HashMap<u32, u16>,
}

impl UnicodeEncoder {
    /// Create a new Unicode encoder with an empty glyph cache.
    pub fn new() -> Self {
        Self {
            glyph_cache: HashMap::new(),
        }
    }

    /// Append `value` to `out` as exactly four uppercase hex digits.
    ///
    /// Writes straight into the output buffer so the per-character encoders
    /// do not allocate a temporary `String` for every code unit.
    fn push_hex_u16(out: &mut String, value: u16) {
        const HEX_DIGITS: &[u8; 16] = b"0123456789ABCDEF";
        for shift in [12u32, 8, 4, 0] {
            let nibble = usize::from((value >> shift) & 0xF);
            out.push(char::from(HEX_DIGITS[nibble]));
        }
    }

    /// Append `byte` to `out` as a three-digit octal escape (`\ddd`).
    fn push_octal_escape(out: &mut String, byte: u8) {
        out.push('\\');
        for shift in [6u32, 3, 0] {
            out.push(char::from(b'0' + ((byte >> shift) & 0o7)));
        }
    }

    /// Whether `c` has the same single-byte value in Latin-1 and in
    /// PDFDocEncoding, i.e. whether writing its code value into a literal
    /// text string round-trips.
    ///
    /// PDFDocEncoding reuses 0x18-0x1F and 0x80-0x9E for other glyphs, puts
    /// the Euro sign at 0xA0 and leaves 0x7F, 0x9F and 0xAD undefined, so
    /// those code values (and the remaining C0 controls) are excluded.
    fn is_pdfdoc_latin1_identical(c: char) -> bool {
        matches!(
            u32::from(c),
            0x09 | 0x0A | 0x0D | 0x20..=0x7E | 0xA1..=0xAC | 0xAE..=0xFF
        )
    }

    /// Encode a Unicode string to Identity-H format (hex string of glyph IDs).
    ///
    /// # Arguments
    /// * `text` - Unicode text to encode
    /// * `glyph_lookup` - Function to convert Unicode codepoint to glyph ID
    ///
    /// # Returns
    /// Hex-encoded string suitable for Tj/TJ operators, e.g., "<00410042>".
    /// Codepoints for which `glyph_lookup` returns `None` are written as
    /// glyph 0 (`.notdef`) and are not cached, so they are looked up again
    /// on every occurrence.
    ///
    /// # Caching
    /// Successful lookups are cached by codepoint only, and the cache is
    /// consulted before `glyph_lookup`. Call
    /// [`clear_cache`](Self::clear_cache) (or use a separate encoder) before
    /// passing a lookup that maps codepoints differently, otherwise stale
    /// glyph IDs from the previous lookup are returned.
    pub fn encode_identity_h(
        &mut self,
        text: &str,
        glyph_lookup: impl Fn(u32) -> Option<u16>,
    ) -> String {
        let mut hex = String::with_capacity(text.len() * 4 + 2);
        hex.push('<');

        for ch in text.chars() {
            let codepoint = u32::from(ch);
            // Copy the cached value out first so the map is free to be
            // mutated on a miss.
            let cached = self.glyph_cache.get(&codepoint).copied();
            let glyph_id = match cached {
                Some(gid) => gid,
                None => match glyph_lookup(codepoint) {
                    Some(gid) => {
                        self.glyph_cache.insert(codepoint, gid);
                        gid
                    },
                    // Use .notdef for missing glyphs
                    None => 0,
                },
            };

            Self::push_hex_u16(&mut hex, glyph_id);
        }

        hex.push('>');
        hex
    }

    /// Encode a single glyph ID to Identity-H format, e.g. `<0041>`.
    pub fn encode_char_identity_h(&self, glyph_id: u16) -> String {
        format!("<{:04X}>", glyph_id)
    }

    /// Encode text as an escaped PDF literal string of single-byte codes.
    ///
    /// Each character is written as its Latin-1 code value: printable ASCII
    /// verbatim, `(`, `)`, `\`, LF, CR and TAB as backslash escapes, and
    /// every other codepoint below 256 as a three-digit octal escape. Those
    /// values coincide with WinAnsi for 0xA0-0xFF; no WinAnsi- or
    /// MacRoman-specific remapping is applied.
    ///
    /// Characters above U+00FF are replaced with '?'.
    pub fn encode_literal(text: &str) -> String {
        let mut result = String::with_capacity(text.len() + 2);
        result.push('(');

        for ch in text.chars() {
            match ch {
                // Escape special characters
                '(' => result.push_str("\\("),
                ')' => result.push_str("\\)"),
                '\\' => result.push_str("\\\\"),
                '\n' => result.push_str("\\n"),
                '\r' => result.push_str("\\r"),
                '\t' => result.push_str("\\t"),
                // Printable ASCII
                c if c.is_ascii() && c >= ' ' => result.push(c),
                // Remaining controls and non-ASCII Latin-1 - octal escape.
                // The guard makes the narrowing cast lossless.
                c if u32::from(c) < 256 => Self::push_octal_escape(&mut result, c as u8),
                // Characters outside Latin-1 - replace with ?
                _ => result.push('?'),
            }
        }

        result.push(')');
        result
    }

    /// Encode text as PDF hex string for UTF-16BE.
    ///
    /// Used for metadata strings and bookmarks that need full Unicode. The
    /// output starts with the `FEFF` byte-order mark; characters outside the
    /// BMP are written as surrogate pairs.
    pub fn encode_utf16be(text: &str) -> String {
        // Every UTF-8 byte expands to at most four hex digits; the extra six
        // cover "<FEFF" and ">".
        let mut hex = String::with_capacity(text.len() * 4 + 6);
        hex.push('<');

        // BOM for UTF-16BE
        hex.push_str("FEFF");

        // The standard library produces the surrogate pairs for us.
        for unit in text.encode_utf16() {
            Self::push_hex_u16(&mut hex, unit);
        }

        hex.push('>');
        hex
    }

    /// Encode text as PDF literal string if possible, otherwise as UTF-16BE hex.
    ///
    /// This is the recommended approach for general PDF strings.
    ///
    /// * Printable ASCII without `(`, `)` or `\` is wrapped in parentheses
    ///   unchanged.
    /// * Text made only of characters whose Latin-1 code value means the
    ///   same thing in PDFDocEncoding is written as an escaped literal.
    /// * Everything else is written as UTF-16BE. This includes characters
    ///   below U+0100 that PDFDocEncoding assigns differently or not at all
    ///   (e.g. U+00A0, whose byte 0xA0 would be read back as the Euro sign),
    ///   because a literal would silently change the text.
    pub fn encode_text(text: &str) -> String {
        let is_plain = |c: char| matches!(c, ' '..='~') && !matches!(c, '(' | ')' | '\\');

        if text.chars().all(is_plain) {
            // Simple ASCII - use literal string
            format!("({})", text)
        } else if text.chars().all(Self::is_pdfdoc_latin1_identical) {
            // Single-byte compatible - use escaped literal
            Self::encode_literal(text)
        } else {
            // Full Unicode - use UTF-16BE
            Self::encode_utf16be(text)
        }
    }

    /// Clear the glyph cache.
    pub fn clear_cache(&mut self) {
        self.glyph_cache.clear();
    }

    /// Number of codepoints currently held in the glyph cache.
    pub fn cache_size(&self) -> usize {
        self.glyph_cache.len()
    }
}

impl Default for UnicodeEncoder {
    /// Equivalent to [`UnicodeEncoder::new`].
    fn default() -> Self {
        Self::new()
    }
}

/// Map a Mathematical Alphanumeric Symbol (U+1D400-U+1D7FF) to its plain
/// Latin, Greek or digit base codepoint.
///
/// This block holds styled letter/digit forms (italic, bold, script,
/// fraktur, double-struck, sans-serif, monospace) used in mathematical
/// notation. None of them have glyphs in the standard 14 PDF fonts, but
/// they collapse cleanly to plain letters: `𝑥` (U+1D465) → `x`, `𝛽`
/// (U+1D6FD) → `β`, etc. The block is structured as fixed-stride 26-letter
/// or 10-digit ranges, so the mapping is purely arithmetic.
///
/// Reserved slots inside the Latin runs (for example U+1D455, the italic
/// `h` position, whose character is encoded elsewhere in Unicode) still
/// resolve to the letter their position implies, so callers get a usable
/// fallback for them as well.
///
/// Returns `None` for codepoints outside the block and for positions not
/// covered by the tables below: nabla, partial differential and the six
/// variant Greek forms that follow each lowercase run, plus anything lying
/// between the listed ranges.
pub fn math_alphanumeric_base(codepoint: u32) -> Option<u32> {
    // Outside the block — nothing to do.
    if !(0x1D400..=0x1D7FF).contains(&codepoint) {
        return None;
    }

    // Bold / Italic / Bold-Italic / Script / Bold-Script / Fraktur /
    // Double-Struck / Bold-Fraktur / Sans / Sans-Bold / Sans-Italic /
    // Sans-Bold-Italic / Mono Latin (each 52 chars: A-Z then a-z).
    // The reserved holes need no special casing: each sits at the stride
    // position of the letter it stands for.
    const LATIN_RANGES: &[(u32, u32)] = &[
        (0x1D400, 0x41), // A-Z bold
        (0x1D41A, 0x61), // a-z bold
        (0x1D434, 0x41), // A-Z italic
        (0x1D44E, 0x61), // a-z italic
        (0x1D468, 0x41), // bold italic
        (0x1D482, 0x61),
        (0x1D49C, 0x41), // script
        (0x1D4B6, 0x61),
        (0x1D4D0, 0x41), // bold script
        (0x1D4EA, 0x61),
        (0x1D504, 0x41), // fraktur
        (0x1D51E, 0x61),
        (0x1D538, 0x41), // double-struck
        (0x1D552, 0x61),
        (0x1D56C, 0x41), // bold fraktur
        (0x1D586, 0x61),
        (0x1D5A0, 0x41), // sans-serif
        (0x1D5BA, 0x61),
        (0x1D5D4, 0x41), // sans-serif bold
        (0x1D5EE, 0x61),
        (0x1D608, 0x41), // sans-serif italic
        (0x1D622, 0x61),
        (0x1D63C, 0x41), // sans-serif bold italic
        (0x1D656, 0x61),
        (0x1D670, 0x41), // monospace
        (0x1D68A, 0x61),
    ];
    for &(start, base) in LATIN_RANGES {
        if (start..start + 26).contains(&codepoint) {
            return Some(base + (codepoint - start));
        }
    }

    // Greek bold / italic / bold-italic / sans-bold / sans-bold-italic.
    // Each run is 58 chars: 25 capitals (Α-Ω, with the capital theta symbol
    // at offset 17), nabla at +25, 25 lowercase α-ω at +26, partial
    // differential at +51, then 6 variant forms.
    const GREEK_RANGES: &[u32] = &[0x1D6A8, 0x1D6E2, 0x1D71C, 0x1D756, 0x1D790];
    const GREEK_RUN_LEN: u32 = 58;
    for &start in GREEK_RANGES {
        if !(start..start + GREEK_RUN_LEN).contains(&codepoint) {
            continue;
        }
        let offset = codepoint - start;
        return match offset {
            // Plain Greek has no letter at U+03A2 (0x0391 + 17); the styled
            // character in this slot is the capital theta symbol, whose
            // plain form is U+03F4.
            17 => Some(0x03F4),
            // Capital Greek letters.
            0..=16 | 18..=24 => Some(0x0391 + offset),
            // Lowercase Greek letters (including final sigma).
            26..=50 => Some(0x03B1 + (offset - 26)),
            // Nabla, partial differential and the variant forms.
            _ => None,
        };
    }

    // Math digits 0-9 in five styles.
    const DIGIT_STARTS: &[u32] = &[0x1D7CE, 0x1D7D8, 0x1D7E2, 0x1D7EC, 0x1D7F6];
    for &start in DIGIT_STARTS {
        if (start..start + 10).contains(&codepoint) {
            return Some(0x30 + (codepoint - start));
        }
    }

    None
}

/// Convert a Unicode codepoint to its WinAnsi (Windows-1252) byte.
///
/// Codepoints 0x00-0x7F and 0xA0-0xFF map to the byte of the same value.
/// The bytes in 0x80-0x9F are reached through the dedicated table below,
/// since that is where WinAnsi departs from Latin-1. Any other codepoint
/// yields `None`.
pub fn unicode_to_winansi(codepoint: u32) -> Option<u8> {
    // (Unicode codepoint, WinAnsi byte) pairs for the 0x80-0x9F range.
    const HIGH_TABLE: &[(u32, u8)] = &[
        (0x20AC, 0x80), // Euro sign
        (0x201A, 0x82), // Single low-9 quotation mark
        (0x0192, 0x83), // Latin small letter f with hook
        (0x201E, 0x84), // Double low-9 quotation mark
        (0x2026, 0x85), // Horizontal ellipsis
        (0x2020, 0x86), // Dagger
        (0x2021, 0x87), // Double dagger
        (0x02C6, 0x88), // Modifier letter circumflex accent
        (0x2030, 0x89), // Per mille sign
        (0x0160, 0x8A), // Latin capital letter S with caron
        (0x2039, 0x8B), // Single left-pointing angle quotation mark
        (0x0152, 0x8C), // Latin capital ligature OE
        (0x017D, 0x8E), // Latin capital letter Z with caron
        (0x2018, 0x91), // Left single quotation mark
        (0x2019, 0x92), // Right single quotation mark
        (0x201C, 0x93), // Left double quotation mark
        (0x201D, 0x94), // Right double quotation mark
        (0x2022, 0x95), // Bullet
        (0x2013, 0x96), // En dash
        (0x2014, 0x97), // Em dash
        (0x02DC, 0x98), // Small tilde
        (0x2122, 0x99), // Trade mark sign
        (0x0161, 0x9A), // Latin small letter s with caron
        (0x203A, 0x9B), // Single right-pointing angle quotation mark
        (0x0153, 0x9C), // Latin small ligature oe
        (0x017E, 0x9E), // Latin small letter z with caron
        (0x0178, 0x9F), // Latin capital letter Y with diaeresis
    ];

    match codepoint {
        // Identity ranges; the pattern bounds keep the cast lossless.
        0x00..=0x7F | 0xA0..=0xFF => Some(codepoint as u8),
        _ => HIGH_TABLE
            .iter()
            .find(|&&(unicode, _)| unicode == codepoint)
            .map(|&(_, byte)| byte),
    }
}

/// Report whether `ch` has a WinAnsi byte, i.e. whether
/// `unicode_to_winansi` returns `Some` for its codepoint.
pub fn is_winansi_char(ch: char) -> bool {
    let encoded = unicode_to_winansi(u32::from(ch));
    encoded.is_some()
}

/// Produce the PDF literal-string representation of a single byte.
///
/// Parentheses, backslash and the LF/CR/TAB/BS/FF controls use their
/// named backslash escapes, other printable ASCII (0x20-0x7E) is returned
/// verbatim, and every remaining byte becomes a three-digit octal escape.
fn escape_byte_for_literal(b: u8) -> String {
    let named_escape = match b {
        b'(' => "\\(",
        b')' => "\\)",
        b'\\' => "\\\\",
        b'\n' => "\\n",
        b'\r' => "\\r",
        b'\t' => "\\t",
        0x08 => "\\b",
        0x0C => "\\f",
        // Printable ASCII passes through untouched.
        0x20..=0x7E => return char::from(b).to_string(),
        // Everything else: remaining controls, DEL and high bytes.
        _ => return format!("\\{:03o}", b),
    };
    named_escape.to_owned()
}

/// Render raw bytes as a parenthesised PDF literal string.
///
/// Every byte is passed through `escape_byte_for_literal`, so the result
/// is the escaped bytes in order, wrapped in `(` and `)`.
pub fn encode_bytes_as_literal(bytes: &[u8]) -> String {
    let mut literal = String::with_capacity(bytes.len() * 2 + 2);
    literal.push('(');
    literal.extend(bytes.iter().map(|&byte| escape_byte_for_literal(byte)));
    literal.push(')');
    literal
}

/// Render raw bytes as a PDF hex string: two uppercase hex digits per
/// byte, wrapped in `<` and `>`.
pub fn encode_bytes_as_hex(bytes: &[u8]) -> String {
    const UPPER_HEX: &[u8; 16] = b"0123456789ABCDEF";

    let mut hex = String::with_capacity(bytes.len() * 2 + 2);
    hex.push('<');
    for &byte in bytes {
        // High nibble first, then low nibble.
        hex.push(char::from(UPPER_HEX[usize::from(byte >> 4)]));
        hex.push(char::from(UPPER_HEX[usize::from(byte & 0x0F)]));
    }
    hex.push('>');
    hex
}

// Unit tests for the encoders and mapping helpers defined in this module.
#[cfg(test)]
mod tests {
    use super::*;

    // Glyph IDs returned by the lookup are emitted as 4-digit hex, in order.
    #[test]
    fn test_encode_identity_h() {
        let mut encoder = UnicodeEncoder::new();
        // Simple lookup: A=1, B=2
        let lookup = |cp: u32| match cp {
            0x41 => Some(1_u16),
            0x42 => Some(2_u16),
            _ => None,
        };

        let result = encoder.encode_identity_h("AB", lookup);
        assert_eq!(result, "<00010002>");
    }

    // A codepoint the lookup cannot resolve falls back to glyph 0.
    #[test]
    fn test_encode_identity_h_missing_glyph() {
        let mut encoder = UnicodeEncoder::new();
        let lookup = |_: u32| None;

        let result = encoder.encode_identity_h("A", lookup);
        assert_eq!(result, "<0000>"); // .notdef
    }

    // Plain ASCII is wrapped in parentheses unchanged.
    #[test]
    fn test_encode_literal_simple() {
        let result = UnicodeEncoder::encode_literal("Hello");
        assert_eq!(result, "(Hello)");
    }

    // Parentheses and backslashes are backslash-escaped.
    #[test]
    fn test_encode_literal_escapes() {
        let result = UnicodeEncoder::encode_literal("(test)");
        assert_eq!(result, "(\\(test\\))");

        let result = UnicodeEncoder::encode_literal("back\\slash");
        assert_eq!(result, "(back\\\\slash)");
    }

    // BMP characters become one 4-digit code unit after the FEFF BOM.
    #[test]
    fn test_encode_utf16be() {
        // Simple ASCII
        let result = UnicodeEncoder::encode_utf16be("A");
        assert_eq!(result, "<FEFF0041>");

        // BMP character (Euro sign)
        let result = UnicodeEncoder::encode_utf16be("\u{20AC}");
        assert_eq!(result, "<FEFF20AC>");
    }

    // Characters above U+FFFF become a surrogate pair.
    #[test]
    fn test_encode_utf16be_supplementary() {
        // Emoji (U+1F600 = GRINNING FACE)
        let result = UnicodeEncoder::encode_utf16be("\u{1F600}");
        // U+1F600 = surrogate pair D83D DE00
        assert_eq!(result, "<FEFFD83DDE00>");
    }

    // encode_text picks the literal form for ASCII and UTF-16BE otherwise.
    #[test]
    fn test_encode_text_auto() {
        // Pure ASCII -> literal
        let result = UnicodeEncoder::encode_text("Hello");
        assert_eq!(result, "(Hello)");

        // Unicode -> UTF-16BE
        let result = UnicodeEncoder::encode_text("Hello\u{20AC}World");
        assert!(result.starts_with("<FEFF"));
    }

    // Identity range, 0x80-0x9F specials, and an unmappable codepoint.
    #[test]
    fn test_winansi_mapping() {
        assert_eq!(unicode_to_winansi(0x41), Some(0x41)); // 'A'
        assert_eq!(unicode_to_winansi(0x20AC), Some(0x80)); // Euro
        assert_eq!(unicode_to_winansi(0x2019), Some(0x92)); // Right single quote
        assert_eq!(unicode_to_winansi(0x10000), None); // Outside range
    }

    // Latin styles, a reserved hole, and codepoints outside the block.
    #[test]
    fn test_math_alphanumeric_basic() {
        // Italic small x (U+1D465) → 'x'
        assert_eq!(math_alphanumeric_base(0x1D465), Some(0x78));
        // Italic capital A (U+1D434) → 'A'
        assert_eq!(math_alphanumeric_base(0x1D434), Some(0x41));
        // Bold small a (U+1D41A) → 'a'
        assert_eq!(math_alphanumeric_base(0x1D41A), Some(0x61));
        // Math italic h hole (U+1D455) → 'h'
        assert_eq!(math_alphanumeric_base(0x1D455), Some(0x68));
        // Outside the block
        assert_eq!(math_alphanumeric_base(0x41), None);
        assert_eq!(math_alphanumeric_base(0x1D800), None);
    }

    // Styled Greek letters collapse to the plain Greek block.
    #[test]
    fn test_math_alphanumeric_greek() {
        // Mathematical italic small beta (U+1D6FD) → β (U+03B2)
        assert_eq!(math_alphanumeric_base(0x1D6FD), Some(0x03B2));
        // Mathematical italic small sigma (U+1D70E) → σ (U+03C3)
        assert_eq!(math_alphanumeric_base(0x1D70E), Some(0x03C3));
        // Mathematical italic capital Alpha (U+1D6E2) → Α (U+0391)
        assert_eq!(math_alphanumeric_base(0x1D6E2), Some(0x0391));
    }

    // First and last codepoints of the styled-digit ranges.
    #[test]
    fn test_math_alphanumeric_digits() {
        // Bold zero (U+1D7CE) → '0'
        assert_eq!(math_alphanumeric_base(0x1D7CE), Some(0x30));
        // Mono nine (U+1D7FF) → '9'
        assert_eq!(math_alphanumeric_base(0x1D7FF), Some(0x39));
    }

    // is_winansi_char agrees with unicode_to_winansi.
    #[test]
    fn test_is_winansi_char() {
        assert!(is_winansi_char('A'));
        assert!(is_winansi_char('\u{20AC}')); // Euro
        assert!(!is_winansi_char('\u{4E2D}')); // Chinese character
    }

    // Each byte becomes two uppercase hex digits inside <>.
    #[test]
    fn test_encode_bytes_as_hex() {
        let result = encode_bytes_as_hex(&[0x41, 0x42, 0x43]);
        assert_eq!(result, "<414243>");
    }

    // Printable bytes pass through; parentheses are escaped.
    #[test]
    fn test_encode_bytes_as_literal() {
        let result = encode_bytes_as_literal(b"ABC");
        assert_eq!(result, "(ABC)");

        let result = encode_bytes_as_literal(&[0x28, 0x29]); // '(' ')'
        assert_eq!(result, "(\\(\\))");
    }

    // The cache holds one entry per distinct codepoint and can be cleared.
    #[test]
    fn test_encoder_caching() {
        let mut encoder = UnicodeEncoder::new();
        let lookup = |cp: u32| Some(cp as u16);

        encoder.encode_identity_h("AAA", lookup);
        assert_eq!(encoder.cache_size(), 1); // Only 'A' cached

        encoder.encode_identity_h("ABC", lookup);
        assert_eq!(encoder.cache_size(), 3); // A, B, C cached

        encoder.clear_cache();
        assert_eq!(encoder.cache_size(), 0);
    }
}