pdf_oxide 0.3.38

The fastest Rust PDF library with text extraction: 0.8ms mean, 100% pass rate on 3,830 PDFs. 5× faster than pdf_extract, 17× faster than oxidize_pdf. Extract, create, and edit PDFs.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
//! Right-to-Left (RTL) Script Support
//!
//! This module provides comprehensive support for Arabic and Hebrew scripts,
//! including:
//! - Script detection (Arabic, Hebrew, supplements, presentation forms)
//! - Diacritical mark handling (no boundaries before marks)
//! - Letter and punctuation detection
//! - Number handling (Western and Eastern Arabic digits)
//! - LAM-ALEF ligature support
//! - Contextual form normalization
//! - RTL-specific word boundary detection
//!
//! The implementation follows Unicode standards and common RTL text processing rules.

use crate::text::{BoundaryContext, CharacterInfo};

/// Detected RTL script types
///
/// Each variant names one contiguous Unicode code point range; the ranges
/// are the ones matched by `detect_rtl_script`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RTLScript {
    /// Arabic main block (U+0600-U+06FF)
    Arabic,
    /// Arabic Supplement block (U+0750-U+077F)
    ArabicSupplement,
    /// Arabic Extended-A block (U+08A0-U+08FF)
    ArabicExtendedA,
    /// Hebrew block (U+0590-U+05FF)
    Hebrew,
    /// Arabic Presentation Forms-A block (U+FB50-U+FDFF)
    PresentationFormsA,
    /// Arabic Presentation Forms-B block (U+FE70-U+FEFF)
    PresentationFormsB,
}

// ============================================================================
// SCRIPT DETECTION
// ============================================================================

/// Classify a code point into the RTL script block it belongs to.
///
/// Returns `Some(script)` when `code` lies inside one of the Arabic or Hebrew
/// ranges listed below, and `None` for every other code point. The lookup is
/// a fixed set of range comparisons, so its cost does not depend on the input.
///
/// # Ranges
/// - U+0600-U+06FF: [`RTLScript::Arabic`] (listed first as the most common case)
/// - U+0590-U+05FF: [`RTLScript::Hebrew`]
/// - U+0750-U+077F: [`RTLScript::ArabicSupplement`]
/// - U+08A0-U+08FF: [`RTLScript::ArabicExtendedA`]
/// - U+FB50-U+FDFF: [`RTLScript::PresentationFormsA`]
/// - U+FE70-U+FEFF: [`RTLScript::PresentationFormsB`]
pub fn detect_rtl_script(code: u32) -> Option<RTLScript> {
    // The ranges are disjoint, so arm order does not affect the result.
    let script = match code {
        0x0600..=0x06FF => RTLScript::Arabic,
        0x0590..=0x05FF => RTLScript::Hebrew,
        0x0750..=0x077F => RTLScript::ArabicSupplement,
        0x08A0..=0x08FF => RTLScript::ArabicExtendedA,
        0xFB50..=0xFDFF => RTLScript::PresentationFormsA,
        0xFE70..=0xFEFF => RTLScript::PresentationFormsB,
        // Anything else is not an RTL code point as far as this module is concerned.
        _ => return None,
    };
    Some(script)
}

/// Check if a character code is any RTL text
///
/// Convenience wrapper around `detect_rtl_script`: returns `true` exactly
/// when that function classifies `code` into one of its Arabic or Hebrew
/// ranges, and `false` otherwise.
#[inline]
pub fn is_rtl_text(code: u32) -> bool {
    // Only the presence of a script matters here; the variant is discarded.
    detect_rtl_script(code).is_some()
}

// ============================================================================
// ARABIC DIACRITICS
// ============================================================================

/// Check if a code is an Arabic diacritical (combining) mark
///
/// Covers the nonspacing marks of the Arabic main block:
/// - Honorific and Koranic annotation signs (U+0610-U+061A)
/// - Tashkil (U+064B-U+065F): FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA,
///   KASRA, SHADDA, SUKUN and the marks that follow them in the block
/// - SUPERSCRIPT ALEF (U+0670)
/// - Extended marks (U+06D6-U+06ED): various small high and low marks,
///   excluding the non-combining signs interleaved in that span
///
/// Diacritics should never create word boundaries, so every combining mark a
/// PDF may emit after a base letter has to be recognised here.
pub fn is_arabic_diacritic(code: u32) -> bool {
    matches!(code,
        0x0610..=0x061A |  // Honorific / Koranic annotation signs
        0x064B..=0x065F |  // Tashkil (through U+065F, not just U+0658)
        0x0670 |           // SUPERSCRIPT ALEF
        0x06D6..=0x06DC |  // Small high marks
        0x06DF..=0x06E4 |  // Small high marks continued
        0x06E7..=0x06E8 |  // Small high marks continued
        0x06EA..=0x06ED    // Small low marks
    )
}

/// Check if a code is an Arabic letter (not diacritic, digit or punctuation)
///
/// Includes letters from:
/// - Arabic main block: U+0620-U+063F, U+0641-U+064A, U+066E-U+066F,
///   U+0671-U+06D3, U+06D5, U+06EE-U+06EF, U+06FA-U+06FC, U+06FF.
///   This covers the extended letters used by Persian, Urdu and other
///   languages (e.g. PEH U+067E, GAF U+06AF, FARSI YEH U+06CC) as well as
///   ALEF WASLA (U+0671).
/// - Arabic Supplement (U+0750-U+077F)
/// - Arabic Extended-A (U+08A0-U+08B4, U+08B6-U+08BD)
///
/// Deliberately excluded: TATWEEL (U+0640), combining marks such as
/// U+064B-U+065F and U+0670, ARABIC FULL STOP (U+06D4), and the digit ranges
/// U+0660-U+0669 / U+06F0-U+06F9.
pub fn is_arabic_letter(code: u32) -> bool {
    matches!(code,
        0x0620..=0x063F |  // Arabic letters (excluding TATWEEL at 0x0640)
        0x0641..=0x064A |  // FEH through YEH
        0x066E..=0x066F |  // DOTLESS BEH, DOTLESS QAF
        0x0671..=0x06D3 |  // ALEF WASLA and extended letters (Persian, Urdu, ...)
        0x06D5 |           // AE
        0x06EE..=0x06EF |  // DAL / REH WITH INVERTED V
        0x06FA..=0x06FC |  // SHEEN / DAD / GHAIN WITH DOT BELOW
        0x06FF |           // HEH WITH INVERTED V
        0x0750..=0x077F |  // Arabic Supplement letters
        0x08A0..=0x08B4 |  // Arabic Extended-A letters
        0x08B6..=0x08BD    // More Extended-A letters
    )
}

// ============================================================================
// HEBREW DIACRITICS AND PUNCTUATION
// ============================================================================

/// Report whether `code` is one of the Hebrew points handled by this module.
///
/// Accepted code points:
/// - U+05B0-U+05BD: the vowel points (SHEVA, HATAF SEGOL, HOLAM, ...)
///   followed by DAGESH (U+05BC) and METEG (U+05BD)
/// - U+05BF: RAFE
/// - U+05C1, U+05C2: SHIN DOT, SIN DOT
/// - U+05C4, U+05C5: UPPER DOT, LOWER DOT
/// - U+05C7: QAMATS QATAN
///
/// The punctuation code points interleaved with these (U+05BE, U+05C0,
/// U+05C3, U+05C6) are not accepted. Diacritics should never create word
/// boundaries.
pub fn is_hebrew_diacritic(code: u32) -> bool {
    matches!(
        code,
        0x05B0..=0x05BD | 0x05BF | 0x05C1 | 0x05C2 | 0x05C4 | 0x05C5 | 0x05C7
    )
}

/// Report whether `code` is a letter of the Hebrew alphabet.
///
/// The accepted span is U+05D0-U+05EA inclusive (ALEF through TAV).
pub fn is_hebrew_letter(code: u32) -> bool {
    const ALEF: u32 = 0x05D0;
    const TAV: u32 = 0x05EA;
    (ALEF..=TAV).contains(&code)
}

/// Report whether `code` is one of the two Hebrew punctuation marks handled here.
///
/// Accepted code points:
/// - GERESH (U+05F3): used for abbreviations
/// - GERSHAYIM (U+05F4): used for acronyms and abbreviations
pub fn is_hebrew_punctuation(code: u32) -> bool {
    const GERESH: u32 = 0x05F3;
    const GERSHAYIM: u32 = 0x05F4;
    code == GERESH || code == GERSHAYIM
}

// ============================================================================
// SHARED DIACRITIC DETECTION
// ============================================================================

/// Check if a code is any RTL diacritical mark (Arabic or Hebrew)
///
/// Returns `true` when either `is_arabic_diacritic` or `is_hebrew_diacritic`
/// accepts `code`; the set of recognised marks is therefore exactly the union
/// of those two functions.
#[inline]
pub fn is_rtl_diacritic(code: u32) -> bool {
    is_arabic_diacritic(code) || is_hebrew_diacritic(code)
}

// ============================================================================
// ARABIC CONTEXTUAL FORMS AND LIGATURES
// ============================================================================

/// Normalize Arabic contextual form to base character
///
/// Arabic letters have multiple presentation forms (isolated, final, initial,
/// medial). This function maps a presentation-form code point back to the
/// base letter in the Arabic main block.
///
/// Handles:
/// - Presentation Forms-B letters (U+FE80-U+FEF4): every isolated / final /
///   initial / medial form of HAMZA through YEH.
/// - Presentation Forms-A: only ALEF WASLA (U+FB50, U+FB51). The remaining
///   Forms-A code points are returned unchanged.
///
/// Not mapped (returned unchanged):
/// - LAM-ALEF ligatures (U+FEF5-U+FEFC): they expand to two characters and
///   cannot be expressed as a single `u32`; see `decompose_lam_alef`.
/// - Spacing / tatweel forms of the tashkil (U+FE70-U+FE7F).
/// - Any code point that is not a presentation form.
///
/// Returns the base character for a mapped presentation form, otherwise the
/// original `code`.
pub fn normalize_arabic_contextual_form(code: u32) -> u32 {
    match code {
        // Presentation Forms-A (partial): ALEF WASLA isolated / final
        0xFB50..=0xFB51 => 0x0671,

        // Presentation Forms-B: letters with two forms (isolated, final) or
        // four forms (isolated, final, initial, medial), in block order.
        0xFE80 => 0x0621,          // HAMZA
        0xFE81..=0xFE82 => 0x0622, // ALEF WITH MADDA ABOVE
        0xFE83..=0xFE84 => 0x0623, // ALEF WITH HAMZA ABOVE
        0xFE85..=0xFE86 => 0x0624, // WAW WITH HAMZA ABOVE
        0xFE87..=0xFE88 => 0x0625, // ALEF WITH HAMZA BELOW
        0xFE89..=0xFE8C => 0x0626, // YEH WITH HAMZA ABOVE
        0xFE8D..=0xFE8E => 0x0627, // ALEF
        0xFE8F..=0xFE92 => 0x0628, // BEH
        0xFE93..=0xFE94 => 0x0629, // TEH MARBUTA
        0xFE95..=0xFE98 => 0x062A, // TEH
        0xFE99..=0xFE9C => 0x062B, // THEH
        0xFE9D..=0xFEA0 => 0x062C, // JEEM
        0xFEA1..=0xFEA4 => 0x062D, // HAH
        0xFEA5..=0xFEA8 => 0x062E, // KHAH
        0xFEA9..=0xFEAA => 0x062F, // DAL
        0xFEAB..=0xFEAC => 0x0630, // THAL
        0xFEAD..=0xFEAE => 0x0631, // REH
        0xFEAF..=0xFEB0 => 0x0632, // ZAIN
        0xFEB1..=0xFEB4 => 0x0633, // SEEN
        0xFEB5..=0xFEB8 => 0x0634, // SHEEN
        0xFEB9..=0xFEBC => 0x0635, // SAD
        0xFEBD..=0xFEC0 => 0x0636, // DAD
        0xFEC1..=0xFEC4 => 0x0637, // TAH
        0xFEC5..=0xFEC8 => 0x0638, // ZAH
        0xFEC9..=0xFECC => 0x0639, // AIN
        0xFECD..=0xFED0 => 0x063A, // GHAIN
        0xFED1..=0xFED4 => 0x0641, // FEH
        0xFED5..=0xFED8 => 0x0642, // QAF
        0xFED9..=0xFEDC => 0x0643, // KAF
        0xFEDD..=0xFEE0 => 0x0644, // LAM
        0xFEE1..=0xFEE4 => 0x0645, // MEEM
        0xFEE5..=0xFEE8 => 0x0646, // NOON
        0xFEE9..=0xFEEC => 0x0647, // HEH
        0xFEED..=0xFEEE => 0x0648, // WAW
        0xFEEF..=0xFEF0 => 0x0649, // ALEF MAKSURA
        0xFEF1..=0xFEF4 => 0x064A, // YEH

        // Unmapped presentation forms and everything else: return unchanged
        _ => code,
    }
}

/// Report whether `code` is one of the dedicated LAM-ALEF ligature code points.
///
/// LAM-ALEF is a mandatory ligature in Arabic consisting of LAM (ل) + ALEF (ا)
/// or an ALEF variant. Unicode encodes the ligatures contiguously:
/// - U+FEF5-U+FEFA: LAM with ALEF variants (MADDA, HAMZA ABOVE, HAMZA BELOW)
/// - U+FEFB, U+FEFC: LAM with plain ALEF
pub fn is_lam_alef_ligature(code: u32) -> bool {
    const FIRST_LIGATURE: u32 = 0xFEF5;
    const LAST_LIGATURE: u32 = 0xFEFC;
    (FIRST_LIGATURE..=LAST_LIGATURE).contains(&code)
}

/// Split a LAM-ALEF ligature into the two letters it stands for.
///
/// Returns `Some((LAM, ALEF_VARIANT))` for the ligature code points
/// U+FEF5-U+FEFC and `None` for any other input. The first element is always
/// LAM (U+0644); the second depends on the ligature:
/// - U+FEF5, U+FEF6: ALEF WITH MADDA ABOVE (U+0622)
/// - U+FEF7, U+FEF8: ALEF WITH HAMZA ABOVE (U+0623)
/// - U+FEF9, U+FEFA: ALEF WITH HAMZA BELOW (U+0625)
/// - U+FEFB, U+FEFC: ALEF (U+0627)
pub fn decompose_lam_alef(code: u32) -> Option<(u32, u32)> {
    const LAM: u32 = 0x0644;

    // Each ligature occupies two consecutive code points, so one range arm
    // per ALEF variant is enough.
    let alef_variant = match code {
        0xFEF5..=0xFEF6 => 0x0622,
        0xFEF7..=0xFEF8 => 0x0623,
        0xFEF9..=0xFEFA => 0x0625,
        0xFEFB..=0xFEFC => 0x0627,
        _ => return None,
    };

    Some((LAM, alef_variant))
}

// ============================================================================
// NUMBER HANDLING
// ============================================================================

/// Check if a code is an Eastern Arabic digit
///
/// Two Unicode digit sets are accepted:
/// - Arabic-Indic digits (٠-٩): U+0660-U+0669, the form used in Arabic text
/// - Extended Arabic-Indic digits (۰-۹): U+06F0-U+06F9, the variant commonly
///   used in Persian and Urdu
///
/// Western digits (U+0030-U+0039) are not accepted here.
pub fn is_eastern_arabic_digit(code: u32) -> bool {
    matches!(code,
        0x0660..=0x0669 |  // Arabic-Indic digits ٠-٩
        0x06F0..=0x06F9    // Extended Arabic-Indic digits ۰-۹
    )
}

/// Check if a code is a digit that may appear in RTL text
///
/// Includes:
/// - Western digits (0-9): U+0030-U+0039
/// - Arabic-Indic digits (٠-٩): U+0660-U+0669
/// - Extended Arabic-Indic digits (۰-۹): U+06F0-U+06F9 (Persian, Urdu)
///
/// In RTL text, all of these digit sets may appear and digit runs should be
/// kept together.
pub fn is_arabic_number(code: u32) -> bool {
    matches!(code,
        0x0030..=0x0039 |  // Western digits 0-9
        0x0660..=0x0669 |  // Arabic-Indic digits ٠-٩
        0x06F0..=0x06F9    // Extended Arabic-Indic digits ۰-۹
    )
}

// ============================================================================
// RTL BOUNDARY DETECTION
// ============================================================================

/// Determine if a word boundary should be created between two characters in RTL context
///
/// Returns:
/// - `Some(true)`: Definitely create a boundary
/// - `Some(false)`: Definitely do NOT create a boundary
/// - `None`: Not applicable (let other detectors handle)
///
/// # Boundary Rules (in evaluation order)
///
/// 1. **Space (U+0020)** on either side: always a boundary. This check runs
///    before the RTL check, so it also applies to non-RTL pairs.
/// 2. **Number sequences** (rule 8 below, evaluated early): two digits
///    accepted by `is_arabic_number` never split. This also runs before the
///    RTL check because Western digits are not RTL code points.
/// 3. **Neither character is RTL**: `None`.
/// 4. **TATWEEL (U+0640)** on either side: never a boundary (Arabic kashida
///    for elongation).
/// 5. **Diacritics**: no boundary before a diacritic, so a mark stays with
///    its base character. This also covers several marks stacked on the same
///    base, since each later mark is itself the current character.
/// 6. **TJ offset**: a `tj_offset` on the previous character below -50
///    creates a boundary.
/// 7. **Script transitions**: RTL-to-LTR or LTR-to-RTL creates a boundary.
/// 8. **RTL punctuation** as the current character creates a boundary.
/// 9. **Everything else**: at this point both characters are RTL (letters,
///    presentation forms, ...), and no boundary is created.
///
/// # Arguments
///
/// * `prev_char` - The previous character
/// * `curr_char` - The current character
/// * `_context` - Optional boundary context (unused for RTL currently)
pub fn should_split_at_rtl_boundary(
    prev_char: &CharacterInfo,
    curr_char: &CharacterInfo,
    _context: Option<&BoundaryContext>,
) -> Option<bool> {
    let prev_code = prev_char.code;
    let curr_code = curr_char.code;

    // Rule 1: Space always creates boundary
    if curr_code == 0x0020 || prev_code == 0x0020 {
        return Some(true);
    }

    // Rule 8 (early): Number sequences - no boundaries between digits
    // This must come before the RTL check because Western digits (0-9) are not RTL
    if is_arabic_number(prev_code) && is_arabic_number(curr_code) {
        return Some(false);
    }

    let prev_is_rtl = is_rtl_text(prev_code);
    let curr_is_rtl = is_rtl_text(curr_code);

    // If neither character is RTL (and not numbers), return None (not our concern)
    if !prev_is_rtl && !curr_is_rtl {
        return None;
    }

    // Rule 2: TATWEEL (Arabic kashida) never creates boundary
    if curr_code == 0x0640 || prev_code == 0x0640 {
        return Some(false);
    }

    // Rules 3 and 4: never split before a diacritical mark. A separate
    // "previous is also a mark" check would be unreachable, because it too
    // requires the current character to be a mark.
    if is_rtl_diacritic(curr_code) {
        return Some(false);
    }

    // Rule 5: TJ offset - large negative offset in RTL creates boundary
    if let Some(tj_offset) = prev_char.tj_offset {
        if tj_offset < -50 {
            return Some(true);
        }
    }

    // Rule 6: RTL-to-LTR or LTR-to-RTL transitions create boundary.
    // Digit pairs need no exclusion here: they already returned above.
    if prev_is_rtl != curr_is_rtl {
        return Some(true);
    }

    // Rule 7: RTL punctuation creates boundary
    if is_arabic_punctuation(curr_code) || is_hebrew_punctuation(curr_code) {
        return Some(true);
    }

    // Rule 9: at least one character is RTL and the two agree (rule 6), so
    // both are RTL here. Letter runs and every other same-direction RTL pair
    // stay together.
    Some(false)
}

// ============================================================================
// HELPER FUNCTIONS
// ============================================================================

/// Report whether `code` is one of the Arabic punctuation marks that should
/// create a word boundary.
///
/// Accepted code points:
/// - U+060C ARABIC COMMA
/// - U+061B ARABIC SEMICOLON
/// - U+061F ARABIC QUESTION MARK
/// - U+066A ARABIC PERCENT SIGN
/// - U+066B ARABIC DECIMAL SEPARATOR
/// - U+066C ARABIC THOUSANDS SEPARATOR
/// - U+066D ARABIC FIVE POINTED STAR
fn is_arabic_punctuation(code: u32) -> bool {
    // U+066A through U+066D are contiguous, so they share one range arm.
    matches!(code, 0x060C | 0x061B | 0x061F | 0x066A..=0x066D)
}

// Smoke tests: one or two representative code points per public classifier.
#[cfg(test)]
mod tests {
    use super::*;

    /// One Arabic letter, one Hebrew letter, and one non-RTL code point.
    #[test]
    fn test_basic_script_detection() {
        assert_eq!(detect_rtl_script(0x0627), Some(RTLScript::Arabic)); // ARABIC LETTER ALEF
        assert_eq!(detect_rtl_script(0x05D0), Some(RTLScript::Hebrew)); // HEBREW LETTER ALEF
        assert_eq!(detect_rtl_script(0x0041), None); // Latin 'A'
    }

    /// A mark from each script is accepted; a base letter is rejected.
    #[test]
    fn test_basic_diacritic_detection() {
        assert!(is_arabic_diacritic(0x064E)); // FATHA
        assert!(is_hebrew_diacritic(0x05BC)); // DAGESH
        assert!(!is_arabic_diacritic(0x0627)); // ALEF (letter)
    }

    /// A letter from each script is accepted; a diacritic is rejected.
    #[test]
    fn test_basic_letter_detection() {
        assert!(is_arabic_letter(0x0628)); // BEH
        assert!(is_hebrew_letter(0x05D1)); // BET
        assert!(!is_arabic_letter(0x064B)); // FATHATAN (diacritic)
    }

    /// U+FEFC is detected as a ligature and decomposes to LAM + ALEF.
    #[test]
    fn test_lam_alef_basic() {
        assert!(is_lam_alef_ligature(0xFEFC));
        assert_eq!(decompose_lam_alef(0xFEFC), Some((0x0644, 0x0627)));
    }
}