rpdfium_text/textpage/
mod.rs

1// Derived from PDFium's fpdf_text.cpp text extraction and analysis
2// Original: Copyright 2014 The PDFium Authors
3// Licensed under BSD-3-Clause / Apache-2.0
4// See pdfium-upstream/LICENSE for the original license.
5
6//! Text extraction from a single PDF page with character-level position data.
7//! Implements text extraction, segmentation, reading order, and search.
8
9pub mod fx_bidi;
10
11use std::collections::HashMap;
12
13#[cfg(feature = "icu")]
14use unicode_normalization::UnicodeNormalization;
15
16use rpdfium_core::{Matrix, Name};
17use rpdfium_graphics::{
18    BlendMode, ClipPath, Color, ColorSpaceFamily, ImageRef, PathOp, PathStyle, TextRenderingMode,
19};
20use rpdfium_page::display::{DisplayVisitor, SoftMask, TextRun};
21use rpdfium_page::shading::ShadingDict;
22use rpdfium_parser::Operand;
23
/// Classification of how an extracted character came to exist.
///
/// The default classification is [`CharType::Normal`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum CharType {
    /// Normal character decoded from glyph bytes.
    #[default]
    Normal,
    /// Synthesized space inserted by gap detection.
    Generated,
    /// Soft or hard hyphen character (U+00AD, U+002D, U+2010 or U+2011).
    Hyphen,
    /// Second or later character produced when a single source character
    /// expands to several Unicode characters (e.g. a decomposed ligature).
    Piece,
    /// Character code could not be mapped to Unicode (upstream `kNotUnicode`).
    NotUnicode,
}
38
/// A single extracted character with its position, size, and font metadata.
///
/// All fields are private, matching upstream PDFium where `CharInfo` is a
/// private struct inside `CPDF_TextPage`.  External access goes through
/// indexed getters on [`TextPage`] (e.g. `get_unicode()`, `get_font_size()`).
#[derive(Debug, Clone)]
pub struct TextCharacter {
    /// The Unicode character.
    unicode: char,
    /// Raw font encoding code before Unicode mapping (upstream `CharInfo::char_code`).
    /// ActualText characters store `0`; synthesized spaces store `0x20`.
    char_code: u32,
    /// Tight bounding box in page space (upstream `CharInfo::char_box_`).
    char_box: CharRect,
    /// Font size in text space units.
    font_size: f32,
    /// Name of the font used for this character.
    font_name: String,
    /// Width of the space character for this font in page space (if available).
    /// Used for dynamic word-gap threshold in segmentation.
    space_width: Option<f32>,
    /// True if this character is a soft hyphen (U+00AD).
    is_soft_hyphen: bool,
    /// Classification of this character's origin.
    char_type: CharType,
    /// Per-character text matrix `[a, b, c, d, e, f]` in page space.
    /// `a/b/c/d` are from the run matrix; `e/f` are the character's page-space origin
    /// (the left/bottom corner of `char_box`).
    matrix: [f32; 6],
    /// Expanded bounding box using font ascent/descent (upstream `loose_char_box`).
    /// `None` for ActualText and generated characters.
    loose_char_box: Option<CharRect>,
    /// Fill color at the time this character was rendered (upstream `FPDFText_GetFillColor`).
    fill_color: Option<Color>,
    /// Stroke color at the time this character was rendered (upstream `FPDFText_GetStrokeColor`).
    stroke_color: Option<Color>,
    /// Font weight (100–900, CSS-style) from the font descriptor (upstream `FPDFText_GetFontWeight`).
    /// `None` when the run has no resolved font or the font reports no weight.
    font_weight: Option<i32>,
    /// PDF font descriptor flags (Table 123) from the font descriptor (upstream `FPDFText_GetFontInfo` flags param).
    /// `None` when the run has no resolved font or the font reports no flags.
    font_flags: Option<u32>,
    /// Text rendering mode at the time this character was rendered (upstream `FPDFText_GetTextRenderMode`).
    rendering_mode: TextRenderingMode,
}
81
/// Maximum number of recently seen characters kept for duplicate detection.
///
/// Once the buffer holds this many entries, the oldest one is evicted before
/// a new character is recorded.
const RECENT_RING_CAPACITY: usize = 7;
84
/// Extracts text characters from a `DisplayTree` by visiting text nodes.
///
/// Used as a `DisplayVisitor`: every visited text run is decoded into
/// [`TextCharacter`]s, near-identical repeats of recently seen characters are
/// dropped, and the result is retrieved with `into_characters()`.
pub struct TextExtractor {
    /// Characters accepted so far, in emission order.
    characters: Vec<TextCharacter>,
    /// Parallel array of run IDs (same length as `characters`).
    /// Tracks which text run each character belongs to; synthesized
    /// spaces are recorded with `None`.
    run_ids: Vec<Option<u32>>,
    /// Ring buffer of recently added characters for duplicate detection.
    /// Holds at most `RECENT_RING_CAPACITY` entries and also records
    /// characters that were rejected as duplicates.
    recent_chars: Vec<TextCharacter>,
    /// Last emitted ActualText span ID for deduplication.
    last_actual_text_id: Option<u64>,
    /// Monotonic counter for assigning run IDs to characters.
    next_run_id: u32,
    /// When true, use RTL base direction for bidi reordering.
    rtl: bool,
}
100
101impl Default for TextExtractor {
102    fn default() -> Self {
103        Self::new()
104    }
105}
106
107impl TextExtractor {
108    /// Create a new empty text extractor.
109    pub fn new() -> Self {
110        Self {
111            characters: Vec::new(),
112            run_ids: Vec::new(),
113            recent_chars: Vec::with_capacity(RECENT_RING_CAPACITY),
114            last_actual_text_id: None,
115            next_run_id: 0,
116            rtl: false,
117        }
118    }
119
120    /// Create a new text extractor with an RTL base direction hint.
121    ///
122    /// When `rtl` is true and the `icu` feature is enabled, bidi reordering
123    /// uses RTL as the default paragraph direction.
124    pub fn with_rtl(rtl: bool) -> Self {
125        Self {
126            characters: Vec::new(),
127            run_ids: Vec::new(),
128            recent_chars: Vec::with_capacity(RECENT_RING_CAPACITY),
129            last_actual_text_id: None,
130            next_run_id: 0,
131            rtl,
132        }
133    }
134
    /// Returns the RTL hint configured for this extractor.
    ///
    /// This is the value passed to `with_rtl`, or `false` for an extractor
    /// created with `new`.
    pub fn is_rtl(&self) -> bool {
        self.rtl
    }
139
140    /// Check if a character is a duplicate of a recently added character.
141    ///
142    /// A character is considered a duplicate if it has the same unicode codepoint,
143    /// the same font name, and both x and y positions are within 1% of font_size.
144    fn is_duplicate(&self, ch: &TextCharacter) -> bool {
145        let tolerance = ch.font_size * 0.01;
146        self.recent_chars.iter().any(|recent| {
147            recent.unicode == ch.unicode
148                && recent.font_name == ch.font_name
149                && (recent.char_box.left - ch.char_box.left).abs() < tolerance
150                && (recent.char_box.bottom - ch.char_box.bottom).abs() < tolerance
151        })
152    }
153
154    /// Add a character to the ring buffer (always, whether duplicate or not).
155    fn push_recent(&mut self, ch: &TextCharacter) {
156        if self.recent_chars.len() >= RECENT_RING_CAPACITY {
157            self.recent_chars.remove(0);
158        }
159        self.recent_chars.push(ch.clone());
160    }
161
162    /// Consume the extractor and return all collected characters with their run IDs.
163    pub fn into_characters(self) -> (Vec<TextCharacter>, Vec<Option<u32>>) {
164        (self.characters, self.run_ids)
165    }
166
167    /// Try to add a character, skipping if it's a duplicate.
168    /// Always adds the character to the ring buffer.
169    fn try_add_character(&mut self, ch: TextCharacter, run_id: Option<u32>) {
170        let is_dup = self.is_duplicate(&ch);
171        self.push_recent(&ch);
172        if !is_dup {
173            self.characters.push(ch);
174            self.run_ids.push(run_id);
175        }
176    }
177
178    /// Extract characters from a single `TextRun`.
179    fn extract_run(&mut self, run: &TextRun) {
180        // Skip invisible text (rendering mode 3 = invisible)
181        if run.rendering_mode == TextRenderingMode::Invisible {
182            return;
183        }
184
185        // Deduplicate runs with the same ActualText span ID.
186        // Multiple text runs within a single BDC ActualText sequence share
187        // the same ID — only the first should emit characters.
188        if let Some(id) = run.actual_text_id {
189            if self.last_actual_text_id == Some(id) {
190                return;
191            }
192            if run.actual_text.is_some() {
193                self.last_actual_text_id = Some(id);
194            }
195        }
196
197        // Assign a run ID to all characters from this run.
198        let run_id = self.next_run_id;
199        self.next_run_id += 1;
200
201        let font_name_str = run.font_name.as_str().to_string();
202        let font_size = run.font_size;
203        let matrix = &run.matrix;
204
205        // Compute effective height from the matrix: scale factor on y-axis
206        let height = (matrix.b * matrix.b + matrix.d * matrix.d).sqrt() as f32 * font_size;
207
208        // Precompute matrix scale factor for width calculation (constant per run)
209        let width_scale = (matrix.a * matrix.a + matrix.c * matrix.c).sqrt() as f32;
210
211        // Precompute run matrix components for per-character matrix field
212        let mat_a = matrix.a as f32;
213        let mat_b = matrix.b as f32;
214        let mat_c = matrix.c as f32;
215        let mat_d = matrix.d as f32;
216
217        // Compute font ascent/descent for loose_char_box (in 1/1000 em units)
218        let (font_ascent, font_descent) = run
219            .resolved_font
220            .as_ref()
221            .map(|rf| (rf.ascent as f32, rf.descent as f32))
222            .unwrap_or((750.0, -250.0));
223
224        // Detect vertical CID font mode (WMode=1 with CID font type).
225        // Vertical CID fonts advance downward and use vert_origins / VerticalMetrics
226        // for bounding box computation instead of ascent/descent.
227        let is_vertical_cid = run.is_vertical
228            && run
229                .resolved_font
230                .as_ref()
231                .map(|rf| rf.is_cid_font())
232                .unwrap_or(false);
233
234        // Compute space width from the resolved font for dynamic word-gap threshold
235        let space_width = run.resolved_font.as_ref().map(|rf| {
236            let w = rf.char_width(32) as f32;
237            w * font_size / 1000.0 * width_scale
238        });
239
240        // If /ActualText is present, emit those characters instead of decoding glyphs.
241        // Distribute the total advance width proportionally across ActualText characters.
242        if let Some(ref actual) = run.actual_text {
243            let total_advance: f32 = run.positions.iter().sum();
244            let total_width = width_scale * total_advance;
245            let decomposed_actual = decompose_ligatures(actual);
246            let actual_chars: Vec<char> = decomposed_actual.chars().collect();
247            let char_count = actual_chars.len().max(1);
248            let per_char_width = total_width / char_count as f32;
249
250            for (i, ch) in actual_chars.iter().enumerate() {
251                let tx = total_advance * i as f32 / char_count as f32;
252                let ty = run.rise;
253                let page_x = (matrix.a * tx as f64 + matrix.c * ty as f64 + matrix.e) as f32;
254                let page_y = (matrix.b * tx as f64 + matrix.d * ty as f64 + matrix.f) as f32;
255
256                let is_hyphen = *ch == '\u{00AD}'
257                    || *ch == '\u{002D}'
258                    || *ch == '\u{2010}'
259                    || *ch == '\u{2011}';
260                self.try_add_character(
261                    TextCharacter {
262                        unicode: *ch,
263                        char_code: 0,
264                        char_box: CharRect {
265                            left: page_x,
266                            bottom: page_y,
267                            right: page_x + per_char_width,
268                            top: page_y + height,
269                        },
270                        font_size,
271                        font_name: font_name_str.clone(),
272                        space_width,
273                        is_soft_hyphen: *ch == '\u{00AD}',
274                        char_type: if is_hyphen {
275                            CharType::Hyphen
276                        } else {
277                            CharType::Normal
278                        },
279                        matrix: [mat_a, mat_b, mat_c, mat_d, page_x, page_y],
280                        loose_char_box: None,
281                        fill_color: run.fill_color.clone(),
282                        stroke_color: run.stroke_color.clone(),
283                        font_weight: run.resolved_font.as_ref().and_then(|rf| rf.weight),
284                        font_flags: run.resolved_font.as_ref().and_then(|rf| rf.flags),
285                        rendering_mode: run.rendering_mode,
286                    },
287                    Some(run_id),
288                );
289            }
290            return;
291        }
292
293        // Determine character codes depending on whether we have a CID font
294        let char_codes: Vec<(u32, usize)> = if let Some(ref resolved) = run.resolved_font {
295            if resolved.is_cid_font() {
296                if let Some(ref cmap) = resolved.cid_cmap {
297                    cmap.extract_char_codes(&run.text)
298                } else {
299                    // Default: 2-byte codes for CID fonts without explicit CMap
300                    extract_two_byte_codes(&run.text)
301                }
302            } else {
303                // Simple fonts: 1 byte per character
304                run.text.iter().map(|&b| (b as u32, 1)).collect()
305            }
306        } else {
307            // No resolved font: treat as 1-byte codes
308            run.text.iter().map(|&b| (b as u32, 1)).collect()
309        };
310
311        // Compute the space gap threshold for intra-run space insertion.
312        let gap_threshold =
313            space_width.map(|sw| space_threshold(sw as f64, font_size as f64) as f32);
314
315        // Track previous character's page-space x position for intra-run gap detection.
316        let mut prev_page_end_x: Option<f32> = None;
317
318        // Walk through character codes, using the positions array for advances.
319        // Track accumulated advance incrementally to avoid O(n^2) re-summation.
320        let mut accumulated_advance: f32 = 0.0;
321        for (pos_idx, (code, _byte_len)) in char_codes.iter().enumerate() {
322            let code = *code;
323
324            // Decode to Unicode
325            let unicode_str = if let Some(ref resolved) = run.resolved_font {
326                resolved.unicode_from_char_code(code)
327            } else {
328                // Fallback: try ASCII if in range
329                if (0x20..0x7F).contains(&code) {
330                    Some((code as u8 as char).to_string())
331                } else {
332                    None
333                }
334            };
335
336            // Get advance width from positions array
337            let advance = if pos_idx < run.positions.len() {
338                run.positions[pos_idx]
339            } else {
340                0.0
341            };
342
343            if let Some(ustr) = unicode_str {
344                let normalized = normalize_text(&ustr);
345                // Decompose ligatures
346                let decomposed = decompose_ligatures(&normalized);
347                let chars: Vec<char> = decomposed.chars().collect();
348                let char_count = chars.len().max(1);
349
350                // Check for intra-run space: gap between previous char end and current char start
351                let tx = accumulated_advance;
352                let ty = run.rise;
353                let page_x = (matrix.a * tx as f64 + matrix.c * ty as f64 + matrix.e) as f32;
354                let page_y = (matrix.b * tx as f64 + matrix.d * ty as f64 + matrix.f) as f32;
355
356                if let (Some(prev_end), Some(threshold)) = (prev_page_end_x, gap_threshold) {
357                    let gap = page_x - prev_end;
358                    if gap > 0.0 && gap > threshold {
359                        self.try_add_character(
360                            TextCharacter {
361                                unicode: ' ',
362                                char_code: 0x20,
363                                char_box: CharRect {
364                                    left: prev_end,
365                                    bottom: page_y,
366                                    right: prev_end + gap,
367                                    top: page_y + height,
368                                },
369                                font_size,
370                                font_name: font_name_str.clone(),
371                                space_width,
372                                is_soft_hyphen: false,
373                                char_type: CharType::Generated,
374                                matrix: [mat_a, mat_b, mat_c, mat_d, prev_end, page_y],
375                                loose_char_box: None,
376                                fill_color: run.fill_color.clone(),
377                                stroke_color: run.stroke_color.clone(),
378                                font_weight: run.resolved_font.as_ref().and_then(|rf| rf.weight),
379                                font_flags: run.resolved_font.as_ref().and_then(|rf| rf.flags),
380                                rendering_mode: run.rendering_mode,
381                            },
382                            None,
383                        );
384                    }
385                }
386
387                // Width in page space distributed among decomposed characters
388                let total_width = width_scale * advance;
389                let per_char_width = total_width / char_count as f32;
390
391                for (ci, ch) in chars.iter().enumerate() {
392                    let char_offset = per_char_width * ci as f32;
393                    let char_x = page_x + char_offset;
394
395                    let is_hyphen = *ch == '\u{00AD}'
396                        || *ch == '\u{002D}'
397                        || *ch == '\u{2010}'
398                        || *ch == '\u{2011}';
399                    let ctype = if ci > 0 {
400                        CharType::Piece
401                    } else if is_hyphen {
402                        CharType::Hyphen
403                    } else {
404                        CharType::Normal
405                    };
406                    // For vertical CID fonts, use vert_origin + w1y metrics
407                    // matching PDFium's CPDF_TextPage::GenerateCharacter() which calls
408                    // pFont->GetVertOriginX/Y() and pFont->GetVertWidth() instead of
409                    // ascent/descent when the font has WMode=1.
410                    let loose = if is_vertical_cid {
411                        // Look up vertical metrics for this CID from the font's
412                        // /DW2 / /W2 dictionaries (VerticalMetrics::lookup).
413                        // (w1y, vx, vy) — w1y is the vertical advance (typically −1000),
414                        // vy is the vertical origin Y offset (typically 880).
415                        let (w1y, vy) = run
416                            .resolved_font
417                            .as_ref()
418                            .and_then(|rf| rf.vertical_metrics.as_ref())
419                            .map(|vm| {
420                                let (w1y, _vx, vy) = vm.lookup(code as u16);
421                                (w1y as f32, vy as f32)
422                            })
423                            .unwrap_or((-1000.0, 880.0));
424                        // Per-glyph vert_origins may override vy when the interpreter
425                        // has already resolved the origin from the font dictionary.
426                        let vy = run
427                            .vert_origins
428                            .get(pos_idx)
429                            .map(|&(_vx, run_vy)| run_vy as f32)
430                            .unwrap_or(vy);
431                        compute_loose_char_box_vertical(
432                            char_x,
433                            page_y,
434                            per_char_width,
435                            font_size,
436                            vy,
437                            w1y,
438                            mat_a,
439                            mat_b,
440                            mat_c,
441                            mat_d,
442                        )
443                    } else {
444                        compute_loose_char_box(
445                            char_x,
446                            page_y,
447                            per_char_width,
448                            font_size,
449                            font_ascent,
450                            font_descent,
451                            mat_a,
452                            mat_b,
453                            mat_c,
454                            mat_d,
455                        )
456                    };
457                    self.try_add_character(
458                        TextCharacter {
459                            unicode: *ch,
460                            char_code: code,
461                            char_box: CharRect {
462                                left: char_x,
463                                bottom: page_y,
464                                right: char_x + per_char_width,
465                                top: page_y + height,
466                            },
467                            font_size,
468                            font_name: font_name_str.clone(),
469                            space_width,
470                            is_soft_hyphen: *ch == '\u{00AD}',
471                            char_type: ctype,
472                            matrix: [mat_a, mat_b, mat_c, mat_d, char_x, page_y],
473                            loose_char_box: Some(loose),
474                            fill_color: run.fill_color.clone(),
475                            stroke_color: run.stroke_color.clone(),
476                            font_weight: run.resolved_font.as_ref().and_then(|rf| rf.weight),
477                            font_flags: run.resolved_font.as_ref().and_then(|rf| rf.flags),
478                            rendering_mode: run.rendering_mode,
479                        },
480                        Some(run_id),
481                    );
482                }
483
484                prev_page_end_x = Some(page_x + total_width);
485            }
486
487            // Incrementally accumulate for the next character
488            accumulated_advance += advance;
489        }
490    }
491}
492
493/// Compute the loose bounding box for a character using font ascent/descent.
494///
495/// The loose box expands the character's tight bounding box vertically
496/// to account for diacritics and descenders, using the font's ascent and
497/// descent metrics (in 1/1000 em units). The box is transformed to page
498/// space using the character's matrix components.
499#[allow(clippy::too_many_arguments)]
500fn compute_loose_char_box(
501    x: f32,
502    y: f32,
503    width: f32,
504    font_size: f32,
505    font_ascent: f32,
506    font_descent: f32,
507    mat_a: f32,
508    mat_b: f32,
509    mat_c: f32,
510    mat_d: f32,
511) -> CharRect {
512    let ascent_page = font_ascent * font_size / 1000.0;
513    let descent_page = font_descent * font_size / 1000.0;
514
515    // Transform the four corners of the loose box through the matrix
516    // The text-space box is [0, descent_page] to [width, ascent_page]
517    // relative to the character origin, but we apply only the a/b/c/d
518    // rotation/scale components (e/f is already in x/y).
519    let scale = (mat_a * mat_a + mat_c * mat_c).sqrt();
520    let width_scaled = if scale > 0.0 { width / scale } else { width };
521
522    // Compute corners in text space then transform
523    let corners = [
524        (0.0f32, descent_page),
525        (width_scaled, descent_page),
526        (width_scaled, ascent_page),
527        (0.0, ascent_page),
528    ];
529
530    let mut min_x = f32::INFINITY;
531    let mut min_y = f32::INFINITY;
532    let mut max_x = f32::NEG_INFINITY;
533    let mut max_y = f32::NEG_INFINITY;
534
535    for &(cx, cy) in &corners {
536        let px = x + mat_a * cx + mat_c * cy;
537        let py = y + mat_b * cx + mat_d * cy;
538        min_x = min_x.min(px);
539        min_y = min_y.min(py);
540        max_x = max_x.max(px);
541        max_y = max_y.max(py);
542    }
543
544    CharRect {
545        left: min_x,
546        bottom: min_y,
547        right: max_x,
548        top: max_y,
549    }
550}
551
552/// Compute the loose bounding box for a character in a vertical CID font.
553///
554/// For vertical CID fonts (WMode=1), the character box is oriented around the
555/// vertical writing direction. The vertical origin Y (`vert_origin_y`) and
556/// vertical advance (`vert_w1y`) from the font's `/DW2`/`/W2` dictionaries
557/// replace the ascent/descent used for horizontal text.
558///
559/// This matches PDFium's `CPDF_TextPage::GenerateCharacter()` which calls
560/// `pFont->GetVertOriginY()` and `pFont->GetVertWidth()` for CID fonts with
561/// `WMode=1` (checked via `pFont->IsVertWriting()`).
562///
563/// # Parameters
564/// - `x`, `y`: character origin in page space
565/// - `width`: character advance width in page space
566/// - `font_size`: font size in text space units
567/// - `vert_origin_y`: vertical origin Y in 1/1000 em (typically 880)
568/// - `vert_w1y`: vertical advance in 1/1000 em (typically −1000, negative = downward)
569/// - `mat_a..mat_d`: text-to-page matrix rotation/scale components
570#[allow(clippy::too_many_arguments)]
571fn compute_loose_char_box_vertical(
572    x: f32,
573    y: f32,
574    width: f32,
575    font_size: f32,
576    vert_origin_y: f32,
577    vert_w1y: f32,
578    mat_a: f32,
579    mat_b: f32,
580    mat_c: f32,
581    mat_d: f32,
582) -> CharRect {
583    // In vertical writing mode the glyph's bounding box is defined by:
584    //   top    = vert_origin_y scaled to page units (offset above writing pos)
585    //   bottom = top + vert_w1y scaled to page units (vert_w1y < 0 → descends)
586    //   left   = 0 (writing position x)
587    //   right  = width (glyph advance width, which is the cell width)
588    let top_page = vert_origin_y * font_size / 1000.0;
589    let bottom_page = (vert_origin_y + vert_w1y) * font_size / 1000.0;
590
591    // Build the four corners in text-relative space and transform through
592    // the matrix a/b/c/d components (x/y translation is already in x, y).
593    let scale = (mat_a * mat_a + mat_c * mat_c).sqrt();
594    let width_scaled = if scale > 0.0 { width / scale } else { width };
595
596    let corners = [
597        (0.0f32, bottom_page),
598        (width_scaled, bottom_page),
599        (width_scaled, top_page),
600        (0.0, top_page),
601    ];
602
603    let mut min_x = f32::INFINITY;
604    let mut min_y = f32::INFINITY;
605    let mut max_x = f32::NEG_INFINITY;
606    let mut max_y = f32::NEG_INFINITY;
607
608    for &(cx, cy) in &corners {
609        let px = x + mat_a * cx + mat_c * cy;
610        let py = y + mat_b * cx + mat_d * cy;
611        min_x = min_x.min(px);
612        min_y = min_y.min(py);
613        max_x = max_x.max(px);
614        max_y = max_y.max(py);
615    }
616
617    CharRect {
618        left: min_x,
619        bottom: min_y,
620        right: max_x,
621        top: max_y,
622    }
623}
624
/// Expand Latin ligature codepoints (U+FB00–U+FB06) into their component letters.
///
/// All other characters are copied through untouched, so text without
/// ligatures comes back with identical content.
fn decompose_ligatures(text: &str) -> String {
    /// Component letters for a ligature codepoint, or `None` for anything else.
    fn expansion(ch: char) -> Option<&'static str> {
        let letters = match ch {
            '\u{FB00}' => "ff",
            '\u{FB01}' => "fi",
            '\u{FB02}' => "fl",
            '\u{FB03}' => "ffi",
            '\u{FB04}' => "ffl",
            '\u{FB05}' | '\u{FB06}' => "st",
            _ => return None,
        };
        Some(letters)
    }

    text.chars()
        .fold(String::with_capacity(text.len()), |mut out, ch| {
            match expansion(ch) {
                Some(letters) => out.push_str(letters),
                None => out.push(ch),
            }
            out
        })
}
643
/// Return `text` in Unicode Normalization Form C (built with the `icu` feature).
#[cfg(feature = "icu")]
pub fn normalize_text(text: &str) -> String {
    text.nfc().collect::<String>()
}

/// Return an owned copy of `text` unchanged (built without the `icu` feature).
#[cfg(not(feature = "icu"))]
pub fn normalize_text(text: &str) -> String {
    text.to_owned()
}
655
/// Split raw bytes into big-endian 2-byte character codes (for CID fonts
/// without a CMap).
///
/// Each entry is `(code, byte_length)`. A trailing odd byte is emitted as a
/// final 1-byte code.
fn extract_two_byte_codes(data: &[u8]) -> Vec<(u32, usize)> {
    let pairs = data.chunks_exact(2);
    // At most one byte is left over after pairing.
    let leftover = pairs.remainder();

    let mut codes: Vec<(u32, usize)> = pairs
        .map(|pair| ((u32::from(pair[0]) << 8) | u32::from(pair[1]), 2))
        .collect();
    codes.extend(leftover.iter().map(|&byte| (u32::from(byte), 1)));
    codes
}
671
672impl DisplayVisitor for TextExtractor {
673    fn enter_group(
674        &mut self,
675        _blend_mode: BlendMode,
676        _clip: Option<&ClipPath>,
677        _opacity: f32,
678        _isolated: bool,
679        _knockout: bool,
680        _soft_mask: &Option<Box<SoftMask>>,
681    ) -> bool {
682        true // always descend
683    }
684
685    fn leave_group(&mut self) {
686        // no-op
687    }
688
689    fn visit_path(
690        &mut self,
691        _ops: &[PathOp],
692        _style: &PathStyle,
693        _matrix: &Matrix,
694        _fill_color: Option<&Color>,
695        _stroke_color: Option<&Color>,
696        _fill_color_space: Option<&ColorSpaceFamily>,
697        _stroke_color_space: Option<&ColorSpaceFamily>,
698        _transfer_function: Option<&rpdfium_page::function::TransferFunction>,
699        _overprint: bool,
700        _overprint_mode: u32,
701    ) {
702        // no-op for text extraction
703    }
704
705    fn visit_image(
706        &mut self,
707        _image_ref: &ImageRef,
708        _matrix: &Matrix,
709        _mask: Option<&rpdfium_page::display::ImageMask>,
710        _fill_color: Option<&Color>,
711        _transfer_function: Option<&rpdfium_page::function::TransferFunction>,
712    ) {
713        // no-op for text extraction
714    }
715
716    fn visit_inline_image(
717        &mut self,
718        _properties: &HashMap<Name, Operand>,
719        _data: &[u8],
720        _matrix: &Matrix,
721    ) {
722        // no-op for text extraction
723    }
724
725    fn visit_shading_fill(&mut self, _shading: &ShadingDict, _matrix: &Matrix) {
726        // no-op for text extraction
727    }
728
729    fn visit_pattern_fill(
730        &mut self,
731        _path_ops: &[PathOp],
732        _fill_rule: rpdfium_graphics::FillRule,
733        _pattern: &rpdfium_page::pattern::TilingPattern,
734        _pattern_tree: &rpdfium_page::display::DisplayTree,
735        _fill_color: Option<&Color>,
736        _matrix: &rpdfium_core::Matrix,
737    ) {
738        // no-op for text extraction
739    }
740
741    fn visit_text(&mut self, runs: &[TextRun]) {
742        for run in runs {
743            self.extract_run(run);
744        }
745    }
746}
747
/// A word extracted from the page — a sequence of adjacent characters.
///
/// All coordinates and extents are expressed in page space.
#[derive(Debug, Clone)]
pub struct TextWord {
    /// The concatenated Unicode text of this word.
    pub text: String,
    /// X coordinate of the word's left edge in page space.
    pub x: f32,
    /// Y coordinate of the word's baseline in page space.
    pub y: f32,
    /// Total advance width of the word in page space.
    pub width: f32,
    /// Height of the word (maximum character height).
    pub height: f32,
}
762
/// A line of text extracted from the page — a sequence of words on the
/// same baseline.
#[derive(Debug, Clone)]
pub struct TextLine {
    /// The full text of the line, with words separated by spaces.
    pub text: String,
    /// Words composing this line, in reading order (left to right).
    // NOTE(review): the ordering for right-to-left content is not established
    // by this definition — confirm against the line-building code.
    pub words: Vec<TextWord>,
    /// Y coordinate of the line's baseline.
    pub y: f32,
    /// Height of the line (maximum word height).
    pub height: f32,
}
776
/// Fallback threshold multiplier for gap-based word breaking.
///
/// When neither of two consecutive characters carries a known space width,
/// `segment_words` starts a new word at any gap larger than
/// `average_char_width * WORD_GAP_THRESHOLD`. When a space width is known,
/// the threshold comes from `space_threshold` instead.
const WORD_GAP_THRESHOLD: f32 = 0.3;
782
/// Upstream-faithful NormalizeThreshold (cpdf_textpage.cpp:47-60).
///
/// Scales `threshold` down by a divisor chosen from the tier it falls in:
/// below `t1` → 2, below `t2` → 4, below `t3` → 5, otherwise 6.
///
/// `threshold` is in 1/1000 of text space units and the result is in the
/// same units. The tier bounds must be strictly increasing (checked in
/// debug builds only).
fn normalize_threshold(threshold: f32, t1: i32, t2: i32, t3: i32) -> f32 {
    debug_assert!(t1 < t2 && t2 < t3);
    let divisor = match threshold {
        w if w < t1 as f32 => 2.0,
        w if w < t2 as f32 => 4.0,
        w if w < t3 as f32 => 5.0,
        _ => 6.0,
    };
    threshold / divisor
}
799
800/// Compute space detection threshold for a character.
801///
802/// Input: `char_width_thou` is the space character width in 1/1000 text space
803/// units (from `ResolvedFont::char_width(32)`). `font_size_h` is the horizontal
804/// font size in page space (font_size * |matrix.a|).
805///
806/// Uses upstream tier boundaries (300, 500, 700) for character-width threshold.
807pub fn space_threshold(char_width_thou: f64, font_size_h: f64) -> f64 {
808    let normalized = normalize_threshold(char_width_thou as f32, 300, 500, 700);
809    font_size_h * normalized as f64 / 1000.0
810}
811
/// Y-tolerance factor for grouping characters into lines.
///
/// `segment_lines` keeps a character on the current line while its box
/// bottom is within `font_size * LINE_Y_TOLERANCE` of the bottom of the
/// line's first character. `segment_words` uses the same factor (with the
/// previous character's font size) to break a word on a vertical jump.
const LINE_Y_TOLERANCE: f32 = 0.5;
817
818/// Segment a sequence of characters into words by detecting gaps.
819///
820/// Characters are grouped left-to-right. A new word starts when the
821/// horizontal gap between two consecutive characters exceeds
822/// `average_char_width * WORD_GAP_THRESHOLD`, or when the y-coordinate
823/// changes significantly.
824pub fn segment_words(chars: &[TextCharacter]) -> Vec<TextWord> {
825    if chars.is_empty() {
826        return Vec::new();
827    }
828
829    let total: f32 = chars
830        .iter()
831        .map(|c| c.char_box.right - c.char_box.left)
832        .sum();
833    let avg_width = total / chars.len() as f32;
834
835    let fallback_threshold = avg_width * WORD_GAP_THRESHOLD;
836
837    let mut words = Vec::new();
838    let mut current_chars: Vec<&TextCharacter> = vec![&chars[0]];
839
840    for i in 1..chars.len() {
841        let prev = &chars[i - 1];
842        let curr = &chars[i];
843
844        // Check if there's a significant horizontal gap or vertical change.
845        // A negative gap means characters overlap (e.g. kerning) — never break.
846        let gap = curr.char_box.left - prev.char_box.right;
847        let y_diff = (curr.char_box.bottom - prev.char_box.bottom).abs();
848        let y_threshold = prev.font_size * LINE_Y_TOLERANCE;
849
850        // Soft hyphens at end-of-line continue the word (don't break).
851        if prev.is_soft_hyphen {
852            current_chars.push(curr);
853            continue;
854        }
855
856        // CJK ideographs are treated as individual words regardless of gap.
857        let cjk_break = is_cjk_ideograph(curr.unicode) || is_cjk_ideograph(prev.unicode);
858
859        // Use space_width from the current character (or previous) if available,
860        // falling back to average width * WORD_GAP_THRESHOLD.
861        // Applies piecewise NormalizeThreshold when space_width is known.
862        let gap_threshold = curr
863            .space_width
864            .or(prev.space_width)
865            .map(|sw| space_threshold(sw as f64, curr.font_size as f64) as f32)
866            .unwrap_or(fallback_threshold);
867
868        if cjk_break || (gap > 0.0 && gap > gap_threshold) || y_diff > y_threshold {
869            // Finish current word
870            words.push(build_word(&current_chars));
871            current_chars.clear();
872        }
873
874        current_chars.push(curr);
875    }
876
877    // Finish last word
878    if !current_chars.is_empty() {
879        words.push(build_word(&current_chars));
880    }
881
882    words
883}
884
885/// Segment a sequence of characters into lines by grouping on y-position.
886///
887/// Characters close in y-coordinate are grouped into the same line,
888/// then sorted left-to-right. Lines are sorted top-to-bottom (descending y
889/// in PDF coordinates).
890pub fn segment_lines(chars: &[TextCharacter]) -> Vec<TextLine> {
891    if chars.is_empty() {
892        return Vec::new();
893    }
894
895    // Sort characters by y (descending, since PDF y=0 is bottom) then by x
896    let mut sorted: Vec<&TextCharacter> = chars.iter().collect();
897    sorted.sort_by(|a, b| {
898        b.char_box
899            .bottom
900            .partial_cmp(&a.char_box.bottom)
901            .unwrap_or(std::cmp::Ordering::Equal)
902            .then_with(|| {
903                a.char_box
904                    .left
905                    .partial_cmp(&b.char_box.left)
906                    .unwrap_or(std::cmp::Ordering::Equal)
907            })
908    });
909
910    // Group into lines by y-proximity
911    let mut lines: Vec<Vec<&TextCharacter>> = Vec::new();
912    let mut current_line: Vec<&TextCharacter> = vec![sorted[0]];
913    let mut line_y = sorted[0].char_box.bottom;
914
915    for &ch in &sorted[1..] {
916        let y_threshold = ch.font_size * LINE_Y_TOLERANCE;
917        if (ch.char_box.bottom - line_y).abs() <= y_threshold {
918            current_line.push(ch);
919        } else {
920            lines.push(current_line);
921            current_line = vec![ch];
922            line_y = ch.char_box.bottom;
923        }
924    }
925    lines.push(current_line);
926
927    // Build TextLine from each group
928    lines
929        .into_iter()
930        .map(|line_chars| {
931            // Sort by x within the line
932            let mut sorted_line = line_chars;
933            sorted_line.sort_by(|a, b| {
934                a.char_box
935                    .left
936                    .partial_cmp(&b.char_box.left)
937                    .unwrap_or(std::cmp::Ordering::Equal)
938            });
939
940            // Collect owned chars for word segmentation
941            let owned: Vec<TextCharacter> = sorted_line.iter().map(|&c| c.clone()).collect();
942            let words = segment_words(&owned);
943
944            let y = sorted_line[0].char_box.bottom;
945            let height = sorted_line
946                .iter()
947                .map(|c| c.char_box.top - c.char_box.bottom)
948                .fold(0.0f32, f32::max);
949            let text = words
950                .iter()
951                .map(|w| w.text.as_str())
952                .collect::<Vec<_>>()
953                .join(" ");
954
955            TextLine {
956                text,
957                words,
958                y,
959                height,
960            }
961        })
962        .collect()
963}
964
965/// Group lines into paragraphs based on vertical spacing.
966///
967/// A new paragraph starts when the vertical gap between consecutive lines
968/// exceeds the average line spacing by a factor of 1.5, or when the first
969/// word of the next line is significantly indented.
970pub fn segment_paragraphs(lines: &[TextLine]) -> Vec<Vec<TextLine>> {
971    if lines.is_empty() {
972        return Vec::new();
973    }
974    if lines.len() == 1 {
975        return vec![lines.to_vec()];
976    }
977
978    // Calculate average line gap
979    let mut gaps: Vec<f32> = Vec::new();
980    for i in 0..lines.len() - 1 {
981        let gap = (lines[i].y - lines[i + 1].y).abs();
982        gaps.push(gap);
983    }
984    let avg_gap = if gaps.is_empty() {
985        0.0
986    } else {
987        gaps.iter().sum::<f32>() / gaps.len() as f32
988    };
989
990    let para_threshold = avg_gap * 1.5;
991
992    let mut paragraphs = Vec::new();
993    let mut current_para = vec![lines[0].clone()];
994
995    for i in 1..lines.len() {
996        let gap = (lines[i - 1].y - lines[i].y).abs();
997        if para_threshold > 0.0 && gap > para_threshold {
998            paragraphs.push(current_para);
999            current_para = vec![lines[i].clone()];
1000        } else {
1001            current_para.push(lines[i].clone());
1002        }
1003    }
1004    paragraphs.push(current_para);
1005
1006    paragraphs
1007}
1008
/// Returns `true` if the character is a CJK ideograph.
///
/// CJK ideographs are treated as individual words in segmentation because
/// they are not separated by spaces in typical CJK text.
///
/// Covers the BMP ideograph blocks as well as the supplementary-plane
/// extensions (B through I) and the Compatibility Ideographs Supplement, so
/// rarer ideographs are segmented the same way as common ones. Kana, Hangul
/// and CJK punctuation are not ideographs and return `false`.
fn is_cjk_ideograph(c: char) -> bool {
    matches!(c,
        '\u{4E00}'..='\u{9FFF}'     // CJK Unified Ideographs
        | '\u{3400}'..='\u{4DBF}'   // CJK Unified Ideographs Extension A
        | '\u{F900}'..='\u{FAFF}'   // CJK Compatibility Ideographs
        | '\u{20000}'..='\u{2A6DF}' // CJK Unified Ideographs Extension B
        | '\u{2A700}'..='\u{2B73F}' // CJK Unified Ideographs Extension C
        | '\u{2B740}'..='\u{2B81F}' // CJK Unified Ideographs Extension D
        | '\u{2B820}'..='\u{2CEAF}' // CJK Unified Ideographs Extension E
        | '\u{2CEB0}'..='\u{2EBEF}' // CJK Unified Ideographs Extension F
        | '\u{2EBF0}'..='\u{2EE5F}' // CJK Unified Ideographs Extension I
        | '\u{2F800}'..='\u{2FA1F}' // CJK Compatibility Ideographs Supplement
        | '\u{30000}'..='\u{3134F}' // CJK Unified Ideographs Extension G
        | '\u{31350}'..='\u{323AF}' // CJK Unified Ideographs Extension H
    )
}
1020
1021fn build_word(chars: &[&TextCharacter]) -> TextWord {
1022    let text: String = chars.iter().map(|c| c.unicode).collect();
1023    let x = chars[0].char_box.left;
1024    let y = chars[0].char_box.bottom;
1025    let last = chars.last().unwrap();
1026    let width = last.char_box.right - x;
1027    let height = chars
1028        .iter()
1029        .map(|c| c.char_box.top - c.char_box.bottom)
1030        .fold(0.0f32, f32::max);
1031
1032    TextWord {
1033        text,
1034        x,
1035        y,
1036        width,
1037        height,
1038    }
1039}
1040
1041pub use rpdfium_core::fx_bidi::mirror_char;
1042
/// A detected column boundary in page space.
///
/// Describes only the horizontal extent of a column; `detect_columns`
/// returns one value per detected column.
#[derive(Debug, Clone)]
pub struct ColumnBound {
    /// Left edge of the column.
    pub x_start: f32,
    /// Right edge of the column.
    pub x_end: f32,
}
1051
/// Dominant text flow orientation within a column or page.
///
/// Produced by `detect_orientation`; `sort_reading_order` reverses the
/// column order when the flow is `Vertical`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TextFlowOrientation {
    /// Left-to-right (or right-to-left) horizontal text.
    Horizontal,
    /// Top-to-bottom vertical text (CJK convention).
    Vertical,
    /// Could not determine orientation (upstream `kUnknown`).
    Unknown,
}
1062
/// Minimum column width as a fraction of the total text extent (rightmost
/// line end minus leftmost line start, as measured by `detect_columns`).
/// Columns narrower than this are rejected as noise.
const MIN_COLUMN_WIDTH_FRACTION: f32 = 0.10;
1066
/// Minimum number of lines that must fall within a candidate column
/// for it to be accepted. Single-line "columns" are likely noise.
/// A line counts for a column when its horizontal center lies within the
/// column bounds (see `count_lines_in_column`).
const MIN_LINES_PER_COLUMN: usize = 2;
1070
1071/// Sort lines into reading order: top-to-bottom, left-to-right.
1072///
1073/// For single-column layouts, this simply sorts by descending y (top first).
1074/// For multi-column layouts, lines are grouped by column, then sorted
1075/// column-by-column (left column first), top-to-bottom within each column.
1076///
1077/// Vertical text columns (detected by character aspect ratios) are sorted
1078/// right-to-left (traditional CJK convention), top-to-bottom within each
1079/// column.
1080pub fn sort_reading_order(lines: &mut [TextLine]) {
1081    if lines.len() <= 1 {
1082        return;
1083    }
1084
1085    let columns = detect_columns(lines);
1086    let orientation = detect_orientation_from_lines(lines);
1087
1088    if columns.len() <= 1 {
1089        // Single column — sort by y descending (top of page first)
1090        lines.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal));
1091    } else if orientation == TextFlowOrientation::Vertical {
1092        // Vertical text: columns right-to-left, within each column top-to-bottom
1093        lines.sort_by(|a, b| {
1094            let col_a = column_index_for_line(a, &columns);
1095            let col_b = column_index_for_line(b, &columns);
1096            // Reverse column order: higher column index (rightmost) first
1097            col_b
1098                .cmp(&col_a)
1099                .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal))
1100        });
1101    } else {
1102        // Horizontal multi-column — assign each line to a column, then sort by
1103        // (column_index, descending y)
1104        lines.sort_by(|a, b| {
1105            let col_a = column_index_for_line(a, &columns);
1106            let col_b = column_index_for_line(b, &columns);
1107            col_a
1108                .cmp(&col_b)
1109                .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal))
1110        });
1111    }
1112}
1113
1114/// Detect column boundaries from line positions.
1115///
1116/// Uses histogram-based gap detection: builds a histogram of x-gaps between
1117/// line spans, finds natural break points, and validates candidate columns
1118/// against minimum width and line count thresholds.
1119///
1120/// Returns one `ColumnBound` per detected column, sorted left-to-right.
1121/// Falls back to a single column when detection is ambiguous.
1122pub fn detect_columns(lines: &[TextLine]) -> Vec<ColumnBound> {
1123    if lines.is_empty() {
1124        return Vec::new();
1125    }
1126
1127    // Collect (x_start, x_end) for each line, skipping empty lines
1128    let mut line_spans: Vec<(f32, f32)> = lines
1129        .iter()
1130        .filter_map(|line| {
1131            if line.words.is_empty() {
1132                None
1133            } else {
1134                let x_start = line.words[0].x;
1135                let last_word = line.words.last().unwrap();
1136                let x_end = last_word.x + last_word.width;
1137                Some((x_start, x_end))
1138            }
1139        })
1140        .collect();
1141
1142    if line_spans.is_empty() {
1143        return Vec::new();
1144    }
1145
1146    // Find the overall extent
1147    let overall_left = line_spans
1148        .iter()
1149        .map(|(s, _)| *s)
1150        .fold(f32::INFINITY, f32::min);
1151    let overall_right = line_spans
1152        .iter()
1153        .map(|(_, e)| *e)
1154        .fold(f32::NEG_INFINITY, f32::max);
1155
1156    let page_width = overall_right - overall_left;
1157    if page_width < 1.0 {
1158        return vec![ColumnBound {
1159            x_start: overall_left,
1160            x_end: overall_right,
1161        }];
1162    }
1163
1164    let min_column_width = page_width * MIN_COLUMN_WIDTH_FRACTION;
1165
1166    // Sort by x_start for gap analysis
1167    line_spans.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));
1168
1169    // Build histogram of x-gaps between non-overlapping spans.
1170    // Merge overlapping spans into clusters, then measure gaps between clusters.
1171    let mut merged: Vec<(f32, f32)> = Vec::new();
1172    for &(start, end) in &line_spans {
1173        if let Some(last) = merged.last_mut() {
1174            // Merge if overlapping or touching
1175            if start <= last.1 {
1176                last.1 = last.1.max(end);
1177            } else {
1178                merged.push((start, end));
1179            }
1180        } else {
1181            merged.push((start, end));
1182        }
1183    }
1184
1185    if merged.len() <= 1 {
1186        return vec![ColumnBound {
1187            x_start: overall_left,
1188            x_end: overall_right,
1189        }];
1190    }
1191
1192    // Collect inter-cluster gaps
1193    let mut gaps: Vec<(f32, usize)> = Vec::new(); // (gap_size, index_before_gap)
1194    for i in 0..merged.len() - 1 {
1195        let gap = merged[i + 1].0 - merged[i].1;
1196        if gap > 0.0 {
1197            gaps.push((gap, i));
1198        }
1199    }
1200
1201    if gaps.is_empty() {
1202        return vec![ColumnBound {
1203            x_start: overall_left,
1204            x_end: overall_right,
1205        }];
1206    }
1207
1208    // Find the natural break point: the largest gap that produces valid columns.
1209    // Sort gaps descending by size.
1210    gaps.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1211
1212    // Try the largest gap first; verify it produces valid columns
1213    for &(_gap_size, gap_idx) in &gaps {
1214        let left_col = ColumnBound {
1215            x_start: merged[0].0,
1216            x_end: merged[gap_idx].1,
1217        };
1218        let right_col = ColumnBound {
1219            x_start: merged[gap_idx + 1].0,
1220            x_end: merged.last().unwrap().1,
1221        };
1222
1223        // Validate minimum column width
1224        let left_width = left_col.x_end - left_col.x_start;
1225        let right_width = right_col.x_end - right_col.x_start;
1226        if left_width < min_column_width || right_width < min_column_width {
1227            continue;
1228        }
1229
1230        // Validate minimum line count per column
1231        let left_count = count_lines_in_column(lines, &left_col);
1232        let right_count = count_lines_in_column(lines, &right_col);
1233        if left_count < MIN_LINES_PER_COLUMN || right_count < MIN_LINES_PER_COLUMN {
1234            continue;
1235        }
1236
1237        // Check for ambiguity: if any line spans both columns, fall back
1238        let has_spanning_line = lines.iter().any(|line| {
1239            if line.words.is_empty() {
1240                return false;
1241            }
1242            let lx = line.words[0].x;
1243            let last_w = line.words.last().unwrap();
1244            let rx = last_w.x + last_w.width;
1245            lx < left_col.x_end && rx > right_col.x_start
1246        });
1247        if has_spanning_line {
1248            continue;
1249        }
1250
1251        return vec![left_col, right_col];
1252    }
1253
1254    // No valid split found — single column
1255    vec![ColumnBound {
1256        x_start: overall_left,
1257        x_end: overall_right,
1258    }]
1259}
1260
1261/// Count how many lines have their center within the given column bounds.
1262fn count_lines_in_column(lines: &[TextLine], col: &ColumnBound) -> usize {
1263    lines
1264        .iter()
1265        .filter(|line| {
1266            if line.words.is_empty() {
1267                return false;
1268            }
1269            let first = &line.words[0];
1270            let last = line.words.last().unwrap();
1271            let center = (first.x + last.x + last.width) / 2.0;
1272            center >= col.x_start && center <= col.x_end
1273        })
1274        .count()
1275}
1276
1277/// Detect text flow orientation using coverage masks (upstream `FindTextlineFlowOrientation`).
1278///
1279/// Uses `TextCharacter` bounding boxes as proxy for page objects.
1280/// Builds horizontal and vertical coverage masks, comparing fill ratios
1281/// to determine whether text flows horizontally or vertically.
1282pub fn detect_orientation(
1283    chars: &[TextCharacter],
1284    page_width: f32,
1285    page_height: f32,
1286) -> TextFlowOrientation {
1287    let pw = page_width as usize;
1288    let ph = page_height as usize;
1289    if pw == 0 || ph == 0 || chars.is_empty() {
1290        return TextFlowOrientation::Unknown;
1291    }
1292
1293    // Clamp dimensions to avoid excessive allocation (8192px max)
1294    let pw = pw.min(8192);
1295    let ph = ph.min(8192);
1296
1297    let mut h_mask = vec![false; pw];
1298    let mut v_mask = vec![false; ph];
1299    let mut line_height: f32 = 0.0;
1300    let (mut start_h, mut end_h) = (pw, 0usize);
1301    let (mut start_v, mut end_v) = (ph, 0usize);
1302
1303    for ch in chars {
1304        if ch.char_type == CharType::Generated {
1305            continue;
1306        }
1307        let min_h = (ch.char_box.left.max(0.0) as usize).min(pw);
1308        let max_h = (ch.char_box.right.max(0.0) as usize).min(pw);
1309        let min_v = (ch.char_box.bottom.max(0.0) as usize).min(ph);
1310        let max_v = (ch.char_box.top.max(0.0) as usize).min(ph);
1311        if min_h >= max_h || min_v >= max_v {
1312            continue;
1313        }
1314
1315        for cell in &mut h_mask[min_h..max_h] {
1316            *cell = true;
1317        }
1318        for cell in &mut v_mask[min_v..max_v] {
1319            *cell = true;
1320        }
1321
1322        start_h = start_h.min(min_h);
1323        end_h = end_h.max(max_h);
1324        start_v = start_v.min(min_v);
1325        end_v = end_v.max(max_v);
1326
1327        if line_height <= 0.0 {
1328            line_height = ch.char_box.top - ch.char_box.bottom;
1329        }
1330    }
1331
1332    let double_lh = (2.0 * line_height) as usize;
1333    if end_v.saturating_sub(start_v) < double_lh {
1334        return TextFlowOrientation::Horizontal;
1335    }
1336    if end_h.saturating_sub(start_h) < double_lh {
1337        return TextFlowOrientation::Vertical;
1338    }
1339
1340    let sum_h = mask_percent_filled(&h_mask, start_h, end_h);
1341    if sum_h > 0.8 {
1342        return TextFlowOrientation::Horizontal;
1343    }
1344    let sum_v = mask_percent_filled(&v_mask, start_v, end_v);
1345    if sum_h > sum_v {
1346        TextFlowOrientation::Horizontal
1347    } else if sum_h < sum_v {
1348        TextFlowOrientation::Vertical
1349    } else {
1350        TextFlowOrientation::Unknown
1351    }
1352}
1353
/// Compute the fraction (0.0 to 1.0) of `true` cells in `mask[start..end]`.
///
/// Returns `0.0` when the range is empty or inverted (`start >= end`).
///
/// # Panics
///
/// Panics if `end` exceeds `mask.len()` for a non-empty range.
fn mask_percent_filled(mask: &[bool], start: usize, end: usize) -> f32 {
    if end <= start {
        return 0.0;
    }
    let window = &mask[start..end];
    let filled = window
        .iter()
        .fold(0usize, |total, &cell| total + usize::from(cell));
    filled as f32 / window.len() as f32
}
1362
1363/// Line-based orientation detection (backward-compatible wrapper).
1364///
1365/// Derives approximate character extents from line/word geometry
1366/// and delegates to the coverage mask algorithm.
1367fn detect_orientation_from_lines(lines: &[TextLine]) -> TextFlowOrientation {
1368    if lines.is_empty() {
1369        return TextFlowOrientation::Unknown;
1370    }
1371
1372    let mut vertical_count = 0usize;
1373    let mut total_count = 0usize;
1374
1375    for line in lines {
1376        if line.words.is_empty() {
1377            continue;
1378        }
1379        total_count += 1;
1380        let first = &line.words[0];
1381        let last = line.words.last().unwrap();
1382        let line_width = (last.x + last.width) - first.x;
1383        if line_width > 0.0 && line_width < line.height * 1.5 {
1384            vertical_count += 1;
1385        }
1386    }
1387
1388    if total_count > 0 && vertical_count * 2 > total_count {
1389        TextFlowOrientation::Vertical
1390    } else {
1391        TextFlowOrientation::Horizontal
1392    }
1393}
1394
1395pub use self::fx_bidi::reorder_bidi;
1396#[cfg(feature = "icu")]
1397use self::fx_bidi::reorder_bidi_with_direction;
1398
1399fn column_index_for_line(line: &TextLine, columns: &[ColumnBound]) -> usize {
1400    if line.words.is_empty() {
1401        return 0;
1402    }
1403    let line_center = {
1404        let first = &line.words[0];
1405        let last = line.words.last().unwrap();
1406        (first.x + last.x + last.width) / 2.0
1407    };
1408
1409    columns
1410        .iter()
1411        .enumerate()
1412        .min_by(|(_, a), (_, b)| {
1413            let mid_a = (a.x_start + a.x_end) / 2.0;
1414            let mid_b = (b.x_start + b.x_end) / 2.0;
1415            let dist_a = (line_center - mid_a).abs();
1416            let dist_b = (line_center - mid_b).abs();
1417            dist_a
1418                .partial_cmp(&dist_b)
1419                .unwrap_or(std::cmp::Ordering::Equal)
1420        })
1421        .map(|(idx, _)| idx)
1422        .unwrap_or(0)
1423}
1424
/// Bounding rectangle for a character or merged selection region.
#[derive(Debug, Clone, PartialEq)]
pub struct CharRect {
    /// Left edge.
    pub left: f32,
    /// Bottom edge.
    pub bottom: f32,
    /// Right edge.
    pub right: f32,
    /// Top edge.
    pub top: f32,
}
1433
/// Page-space origin of a character (upstream `CFX_PointF` from `CharInfo::origin_`).
///
/// Returned by [`TextPage::get_char_origin`] (upstream `FPDFText_GetCharOrigin`).
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct CharOrigin {
    /// X coordinate of the origin.
    pub x: f32,
    /// Y coordinate of the origin.
    pub y: f32,
}
1442
/// Extracted text from a single PDF page, with character-level position data.
///
/// All three fields are established by `new_with_run_ids`: the characters
/// and their run IDs are bidi-reordered together, and `text` is built from
/// the reordered characters and then normalized.
#[derive(Debug, Clone)]
pub struct TextPage {
    /// Extracted characters, in the order left by bidi reordering.
    characters: Vec<TextCharacter>,
    /// Normalized concatenation of the characters' Unicode values.
    text: String,
    /// Run IDs for each character (same length as `characters`).
    /// Internal tracking for `text_object()` / `text_by_object()`.
    run_ids: Vec<Option<u32>>,
}
1452
1453impl TextPage {
1454    /// Create a new `TextPage` from extracted characters.
1455    ///
1456    /// Run IDs default to `None` for all characters. Use
1457    /// [`new_with_run_ids`](Self::new_with_run_ids) to supply run tracking data.
1458    pub fn new(characters: Vec<TextCharacter>) -> Self {
1459        let run_ids = vec![None; characters.len()];
1460        Self::new_with_run_ids(characters, run_ids, false)
1461    }
1462
1463    /// Create a new `TextPage` with an explicit RTL direction hint.
1464    ///
1465    /// When `rtl` is true and the `icu` feature is enabled, bidi reordering
1466    /// uses RTL as the default paragraph direction.
1467    pub fn new_with_direction(characters: Vec<TextCharacter>, rtl: bool) -> Self {
1468        let run_ids = vec![None; characters.len()];
1469        Self::new_with_run_ids(characters, run_ids, rtl)
1470    }
1471
1472    /// Create a new `TextPage` with run ID tracking and optional RTL hint.
1473    ///
1474    /// `run_ids` must have the same length as `characters`. Each entry maps
1475    /// a character to its originating text run (or `None` for generated characters).
1476    pub fn new_with_run_ids(
1477        characters: Vec<TextCharacter>,
1478        run_ids: Vec<Option<u32>>,
1479        rtl: bool,
1480    ) -> Self {
1481        debug_assert_eq!(characters.len(), run_ids.len());
1482        let mut characters = characters;
1483        let mut run_ids = run_ids;
1484        if rtl {
1485            #[cfg(feature = "icu")]
1486            reorder_bidi_with_direction(&mut characters, &mut run_ids, true);
1487            #[cfg(not(feature = "icu"))]
1488            reorder_bidi(&mut characters, &mut run_ids);
1489        } else {
1490            reorder_bidi(&mut characters, &mut run_ids);
1491        }
1492        // Rebuild text from reordered characters (bidi may change order)
1493        let text: String = characters.iter().map(|c| c.unicode).collect();
1494        let text = normalize_text(&text);
1495        Self {
1496            characters,
1497            text,
1498            run_ids,
1499        }
1500    }
1501
1502    /// Returns all extracted text from the page (upstream `GetAllPageText`).
1503    pub fn all_page_text(&self) -> &str {
1504        &self.text
1505    }
1506
    /// Upstream-aligned alias for [`all_page_text()`](Self::all_page_text).
    ///
    /// Forwards to `all_page_text()` and returns its result unchanged.
    #[inline]
    pub fn get_all_page_text(&self) -> &str {
        self.all_page_text()
    }
1512
1513    /// All extracted characters with position metadata.
1514    pub fn characters(&self) -> &[TextCharacter] {
1515        &self.characters
1516    }
1517
    /// Returns the number of extracted characters (upstream `CountChars`).
    ///
    /// This is the length of the slice returned by `characters()`.
    pub fn char_count(&self) -> usize {
        self.characters.len()
    }
1522
    /// Upstream-aligned alias for [`char_count()`](Self::char_count).
    ///
    /// Corresponds to `FPDFText_CountChars`. Forwards to `char_count()`.
    #[inline]
    pub fn text_count_chars(&self) -> usize {
        self.char_count()
    }
1530
    /// Deprecated alias: use [`text_count_chars()`](Self::text_count_chars)
    /// instead (upstream `FPDFText_CountChars`).
    ///
    /// Returns the same value as `char_count()`.
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_count_chars()` (upstream `FPDFText_CountChars`)"
    )]
    #[inline]
    pub fn count_chars(&self) -> usize {
        self.char_count()
    }
1540
    /// Returns the number of extracted characters.
    ///
    /// Upstream-aligned alias for [`char_count()`](Self::char_count).
    ///
    /// Corresponds to `CPDF_TextPage::size()`. Forwards to `char_count()`.
    #[inline]
    pub fn size(&self) -> usize {
        self.char_count()
    }
1550
1551    // --- Indexed character property getters (upstream FPDF_Text alignment) ---
1552
    /// Get a reference to the character at `index`, or `None` if out of bounds.
    ///
    /// Private lookup shared by the indexed property getters.
    fn get_char(&self, index: usize) -> Option<&TextCharacter> {
        self.characters.get(index)
    }
1557
1558    /// Get the character info at `index` (upstream `CPDF_TextPage::GetCharInfo()`).
1559    ///
1560    /// Returns `None` if `index` is out of bounds.
1561    /// The returned `TextCharacter` holds unicode, char_code, char_box,
1562    /// font_size, font_name, matrix, loose_char_box, fill/stroke colors, and more.
1563    pub fn char_info(&self, index: usize) -> Option<&TextCharacter> {
1564        self.characters.get(index)
1565    }
1566
    /// Upstream-aligned alias for [`char_info()`](Self::char_info).
    ///
    /// Corresponds to `CPDF_TextPage::GetCharInfo()`. Forwards to
    /// `char_info()`, so it is `None` when `index` is out of bounds.
    #[inline]
    pub fn get_char_info(&self, index: usize) -> Option<&TextCharacter> {
        self.char_info(index)
    }
1574
1575    /// Get the Unicode character at `index` (upstream `FPDFText_GetUnicode`).
1576    pub fn unicode(&self, index: usize) -> Option<char> {
1577        self.get_char(index).map(|c| c.unicode)
1578    }
1579
    /// Upstream-aligned alias for [`unicode()`](Self::unicode).
    ///
    /// Corresponds to `FPDFText_GetUnicode`. Forwards to `unicode()`.
    #[inline]
    pub fn text_get_unicode(&self, index: usize) -> Option<char> {
        self.unicode(index)
    }
1587
    /// Deprecated alias: use [`text_get_unicode()`](Self::text_get_unicode)
    /// instead (upstream `FPDFText_GetUnicode`).
    ///
    /// Returns the same value as `unicode()`.
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_get_unicode()` (upstream `FPDFText_GetUnicode`)"
    )]
    #[inline]
    pub fn get_unicode(&self, index: usize) -> Option<char> {
        self.unicode(index)
    }
1597
1598    /// Get the raw font encoding code before Unicode mapping at `index`
1599    /// (upstream `CharInfo::char_code`).
1600    ///
1601    /// Returns `0` for ActualText runs and generated spaces.
1602    pub fn char_code(&self, index: usize) -> Option<u32> {
1603        self.get_char(index).map(|c| c.char_code)
1604    }
1605
    /// Deprecated alias for [`char_code()`](Self::char_code).
    ///
    /// Note: there is no upstream `FPDFText_GetCharCode` or `GetCharCode()`
    /// C/C++ function. `char_code` is a field on `CharInfo`, not a method on
    /// `CPDF_TextPage`. Use [`char_code()`](Self::char_code) directly; this
    /// method only forwards to it.
    #[deprecated(
        since = "0.1.0",
        note = "Use `char_code()` directly; no upstream GetCharCode() method exists"
    )]
    #[inline]
    pub fn get_char_code(&self, index: usize) -> Option<u32> {
        self.char_code(index)
    }
1619
1620    /// Check if the character at `index` is generated (upstream `FPDFText_IsGenerated`).
1621    pub fn is_generated(&self, index: usize) -> Option<bool> {
1622        self.get_char(index)
1623            .map(|c| c.char_type == CharType::Generated)
1624    }
1625
    /// Upstream-aligned alias for [`is_generated()`](Self::is_generated).
    ///
    /// Corresponds to `FPDFText_IsGenerated`. Forwards to `is_generated()`.
    #[inline]
    pub fn text_is_generated(&self, index: usize) -> Option<bool> {
        self.is_generated(index)
    }
1633
1634    /// Check if the character at `index` is a hyphen (upstream `FPDFText_IsHyphen`).
1635    pub fn is_hyphen(&self, index: usize) -> Option<bool> {
1636        self.get_char(index)
1637            .map(|c| c.char_type == CharType::Hyphen || c.is_soft_hyphen)
1638    }
1639
1640    /// Upstream-aligned alias for [`is_hyphen()`](Self::is_hyphen).
1641    ///
1642    /// Corresponds to `FPDFText_IsHyphen`.
1643    #[inline]
1644    pub fn text_is_hyphen(&self, index: usize) -> Option<bool> {
1645        self.is_hyphen(index)
1646    }
1647
1648    /// Check if the character at `index` had a Unicode mapping error
1649    /// (upstream `FPDFText_HasUnicodeMapError`).
1650    pub fn has_unicode_map_error(&self, index: usize) -> Option<bool> {
1651        self.get_char(index)
1652            .map(|c| c.char_type == CharType::NotUnicode)
1653    }
1654
1655    /// Upstream-aligned alias for [`has_unicode_map_error()`](Self::has_unicode_map_error).
1656    ///
1657    /// Corresponds to `FPDFText_HasUnicodeMapError`.
1658    #[inline]
1659    pub fn text_has_unicode_map_error(&self, index: usize) -> Option<bool> {
1660        self.has_unicode_map_error(index)
1661    }
1662
1663    /// Get the font size of the character at `index`
1664    /// (upstream `CPDF_TextPage::GetCharFontSize()` / `FPDFText_GetFontSize`).
1665    pub fn font_size(&self, index: usize) -> Option<f32> {
1666        self.get_char(index).map(|c| c.font_size)
1667    }
1668
1669    /// Upstream-aligned alias for [`font_size()`](Self::font_size).
1670    ///
1671    /// Corresponds to `CPDF_TextPage::GetCharFontSize()`.
1672    #[inline]
1673    pub fn get_char_font_size(&self, index: usize) -> Option<f32> {
1674        self.font_size(index)
1675    }
1676
1677    /// Upstream-aligned alias for [`font_size()`](Self::font_size).
1678    ///
1679    /// Corresponds to `FPDFText_GetFontSize`.
1680    #[inline]
1681    pub fn text_get_font_size(&self, index: usize) -> Option<f32> {
1682        self.font_size(index)
1683    }
1684
1685    /// Use [`text_get_font_size()`](Self::text_get_font_size) instead (upstream `FPDFText_GetFontSize`).
1686    #[deprecated(
1687        since = "0.1.0",
1688        note = "Use `text_get_font_size()` (upstream `FPDFText_GetFontSize`)"
1689    )]
1690    #[inline]
1691    pub fn get_font_size(&self, index: usize) -> Option<f32> {
1692        self.font_size(index)
1693    }
1694
1695    /// Get the font name of the character at `index` (upstream `FPDFText_GetFontInfo`).
1696    pub fn font_info(&self, index: usize) -> Option<&str> {
1697        self.get_char(index).map(|c| c.font_name.as_str())
1698    }
1699
1700    /// Upstream-aligned alias for [`font_info()`](Self::font_info).
1701    ///
1702    /// Corresponds to `FPDFText_GetFontInfo`.
1703    #[inline]
1704    pub fn text_get_font_info(&self, index: usize) -> Option<&str> {
1705        self.font_info(index)
1706    }
1707
1708    /// Use [`text_get_font_info()`](Self::text_get_font_info) instead (upstream `FPDFText_GetFontInfo`).
1709    #[deprecated(
1710        since = "0.1.0",
1711        note = "Use `text_get_font_info()` (upstream `FPDFText_GetFontInfo`)"
1712    )]
1713    #[inline]
1714    pub fn get_font_info(&self, index: usize) -> Option<&str> {
1715        self.font_info(index)
1716    }
1717
1718    /// Get the rotation angle of the character at `index` in radians,
1719    /// normalized to `[0, 2*PI)` (upstream `FPDFText_GetCharAngle`).
1720    pub fn char_angle(&self, index: usize) -> Option<f32> {
1721        self.get_char(index).map(|c| {
1722            let angle = c.matrix[1].atan2(c.matrix[0]);
1723            if angle < 0.0 {
1724                angle + std::f32::consts::TAU
1725            } else {
1726                angle
1727            }
1728        })
1729    }
1730
1731    /// Upstream-aligned alias for [`char_angle()`](Self::char_angle).
1732    ///
1733    /// Corresponds to `FPDFText_GetCharAngle`.
1734    #[inline]
1735    pub fn text_get_char_angle(&self, index: usize) -> Option<f32> {
1736        self.char_angle(index)
1737    }
1738
1739    /// Use [`text_get_char_angle()`](Self::text_get_char_angle) instead (upstream `FPDFText_GetCharAngle`).
1740    #[deprecated(
1741        since = "0.1.0",
1742        note = "Use `text_get_char_angle()` (upstream `FPDFText_GetCharAngle`)"
1743    )]
1744    #[inline]
1745    pub fn get_char_angle(&self, index: usize) -> Option<f32> {
1746        self.char_angle(index)
1747    }
1748
1749    /// Get the tight bounding box of the character at `index`
1750    /// (upstream `FPDFText_GetCharBox`).
1751    pub fn char_box(&self, index: usize) -> Option<CharRect> {
1752        self.get_char(index).map(|c| c.char_box.clone())
1753    }
1754
1755    /// Upstream-aligned alias for [`char_box()`](Self::char_box).
1756    ///
1757    /// Corresponds to `FPDFText_GetCharBox`.
1758    #[inline]
1759    pub fn text_get_char_box(&self, index: usize) -> Option<CharRect> {
1760        self.char_box(index)
1761    }
1762
1763    /// Use [`text_get_char_box()`](Self::text_get_char_box) instead (upstream `FPDFText_GetCharBox`).
1764    #[deprecated(
1765        since = "0.1.0",
1766        note = "Use `text_get_char_box()` (upstream `FPDFText_GetCharBox`)"
1767    )]
1768    #[inline]
1769    pub fn get_char_box(&self, index: usize) -> Option<CharRect> {
1770        self.char_box(index)
1771    }
1772
1773    /// Get the loose bounding box of the character at `index`, using font
1774    /// ascent/descent metrics
1775    /// (upstream `CPDF_TextPage::GetCharLooseBounds()` / `FPDFText_GetLooseCharBox`).
1776    ///
1777    /// Returns `None` if the index is out of bounds or the character has no
1778    /// loose box (e.g. generated/ActualText characters).
1779    pub fn loose_char_box(&self, index: usize) -> Option<CharRect> {
1780        self.get_char(index).and_then(|c| c.loose_char_box.clone())
1781    }
1782
1783    /// Corresponds to internal `CPDF_TextPage::GetCharLooseBounds()` (not a public FPDF_* API).
1784    /// Use [`text_get_loose_char_box()`](Self::text_get_loose_char_box) — matches public
1785    /// upstream `FPDFText_GetLooseCharBox`.
1786    #[deprecated(
1787        note = "use `text_get_loose_char_box()` — matches upstream `FPDFText_GetLooseCharBox`"
1788    )]
1789    #[inline]
1790    pub fn get_char_loose_bounds(&self, index: usize) -> Option<CharRect> {
1791        self.loose_char_box(index)
1792    }
1793
1794    /// Upstream-aligned alias for [`loose_char_box()`](Self::loose_char_box).
1795    ///
1796    /// Corresponds to `FPDFText_GetLooseCharBox`.
1797    #[inline]
1798    pub fn text_get_loose_char_box(&self, index: usize) -> Option<CharRect> {
1799        self.loose_char_box(index)
1800    }
1801
1802    /// Use [`text_get_loose_char_box()`](Self::text_get_loose_char_box) instead (upstream `FPDFText_GetLooseCharBox`).
1803    #[deprecated(
1804        since = "0.1.0",
1805        note = "Use `text_get_loose_char_box()` (upstream `FPDFText_GetLooseCharBox`)"
1806    )]
1807    #[inline]
1808    pub fn get_loose_char_box(&self, index: usize) -> Option<CharRect> {
1809        self.loose_char_box(index)
1810    }
1811
1812    /// Get the text matrix `[a, b, c, d, e, f]` of the character at `index`
1813    /// (upstream `FPDFText_GetMatrix`).
1814    pub fn matrix(&self, index: usize) -> Option<[f32; 6]> {
1815        self.get_char(index).map(|c| c.matrix)
1816    }
1817
1818    /// Upstream-aligned alias for [`matrix()`](Self::matrix).
1819    ///
1820    /// Corresponds to `FPDFText_GetMatrix`.
1821    #[inline]
1822    pub fn text_get_matrix(&self, index: usize) -> Option<[f32; 6]> {
1823        self.matrix(index)
1824    }
1825
1826    /// Use [`text_get_matrix()`](Self::text_get_matrix) instead (upstream `FPDFText_GetMatrix`).
1827    #[deprecated(
1828        since = "0.1.0",
1829        note = "Use `text_get_matrix()` (upstream `FPDFText_GetMatrix`)"
1830    )]
1831    #[inline]
1832    pub fn get_matrix(&self, index: usize) -> Option<[f32; 6]> {
1833        self.matrix(index)
1834    }
1835
1836    /// Get the page-space origin of the character at `index`
1837    /// (upstream `FPDFText_GetCharOrigin`).
1838    pub fn char_origin(&self, index: usize) -> Option<CharOrigin> {
1839        self.get_char(index).map(|c| CharOrigin {
1840            x: c.matrix[4],
1841            y: c.matrix[5],
1842        })
1843    }
1844
1845    /// Upstream-aligned alias for [`char_origin()`](Self::char_origin).
1846    ///
1847    /// Corresponds to `FPDFText_GetCharOrigin`.
1848    #[inline]
1849    pub fn text_get_char_origin(&self, index: usize) -> Option<CharOrigin> {
1850        self.char_origin(index)
1851    }
1852
1853    /// Use [`text_get_char_origin()`](Self::text_get_char_origin) instead (upstream `FPDFText_GetCharOrigin`).
1854    #[deprecated(
1855        since = "0.1.0",
1856        note = "Use `text_get_char_origin()` (upstream `FPDFText_GetCharOrigin`)"
1857    )]
1858    #[inline]
1859    pub fn get_char_origin(&self, index: usize) -> Option<CharOrigin> {
1860        self.char_origin(index)
1861    }
1862
1863    /// Get the text run ID of the character at `index`
1864    /// (upstream `FPDFText_GetTextObject`).
1865    pub fn text_object(&self, index: usize) -> Option<u32> {
1866        self.run_ids.get(index).copied().flatten()
1867    }
1868
1869    /// Upstream-aligned alias for [`text_object()`](Self::text_object).
1870    ///
1871    /// Corresponds to `FPDFText_GetTextObject`.
1872    #[inline]
1873    pub fn text_get_text_object(&self, index: usize) -> Option<u32> {
1874        self.text_object(index)
1875    }
1876
1877    /// Deprecated: use [`text_get_text_object()`](Self::text_get_text_object) — matches upstream `FPDFText_GetTextObject`.
1878    #[deprecated(note = "use `text_get_text_object()` — matches upstream `FPDFText_GetTextObject`")]
1879    #[inline]
1880    pub fn get_text_object(&self, index: usize) -> Option<u32> {
1881        self.text_object(index)
1882    }
1883
1884    /// Get the fill color of the character at `index`
1885    /// (upstream `FPDFText_GetFillColor`).
1886    pub fn fill_color(&self, index: usize) -> Option<&Color> {
1887        self.get_char(index).and_then(|c| c.fill_color.as_ref())
1888    }
1889
1890    /// Upstream-aligned alias for [`fill_color()`](Self::fill_color).
1891    ///
1892    /// Corresponds to `FPDFText_GetFillColor`.
1893    #[inline]
1894    pub fn text_get_fill_color(&self, index: usize) -> Option<&Color> {
1895        self.fill_color(index)
1896    }
1897
1898    /// Use [`text_get_fill_color()`](Self::text_get_fill_color) instead (upstream `FPDFText_GetFillColor`).
1899    #[deprecated(
1900        since = "0.1.0",
1901        note = "Use `text_get_fill_color()` (upstream `FPDFText_GetFillColor`)"
1902    )]
1903    #[inline]
1904    pub fn get_fill_color(&self, index: usize) -> Option<&Color> {
1905        self.fill_color(index)
1906    }
1907
1908    /// Get the stroke color of the character at `index`
1909    /// (upstream `FPDFText_GetStrokeColor`).
1910    pub fn stroke_color(&self, index: usize) -> Option<&Color> {
1911        self.get_char(index).and_then(|c| c.stroke_color.as_ref())
1912    }
1913
1914    /// Upstream-aligned alias for [`stroke_color()`](Self::stroke_color).
1915    ///
1916    /// Corresponds to `FPDFText_GetStrokeColor`.
1917    #[inline]
1918    pub fn text_get_stroke_color(&self, index: usize) -> Option<&Color> {
1919        self.stroke_color(index)
1920    }
1921
1922    /// Use [`text_get_stroke_color()`](Self::text_get_stroke_color) instead (upstream `FPDFText_GetStrokeColor`).
1923    #[deprecated(
1924        since = "0.1.0",
1925        note = "Use `text_get_stroke_color()` (upstream `FPDFText_GetStrokeColor`)"
1926    )]
1927    #[inline]
1928    pub fn get_stroke_color(&self, index: usize) -> Option<&Color> {
1929        self.stroke_color(index)
1930    }
1931
1932    /// Get the font weight of the character at `index` (100-900, CSS-style)
1933    /// (upstream `FPDFText_GetFontWeight`).
1934    pub fn font_weight(&self, index: usize) -> Option<i32> {
1935        self.get_char(index).and_then(|c| c.font_weight)
1936    }
1937
1938    /// Upstream-aligned alias for [`font_weight()`](Self::font_weight).
1939    ///
1940    /// Corresponds to `FPDFText_GetFontWeight`.
1941    #[inline]
1942    pub fn text_get_font_weight(&self, index: usize) -> Option<i32> {
1943        self.font_weight(index)
1944    }
1945
1946    /// Use [`text_get_font_weight()`](Self::text_get_font_weight) instead (upstream `FPDFText_GetFontWeight`).
1947    #[deprecated(
1948        since = "0.1.0",
1949        note = "Use `text_get_font_weight()` (upstream `FPDFText_GetFontWeight`)"
1950    )]
1951    #[inline]
1952    pub fn get_font_weight(&self, index: usize) -> Option<i32> {
1953        self.font_weight(index)
1954    }
1955
1956    /// Get the font descriptor flags of the character at `index`
1957    /// (upstream `FPDFText_GetFontInfo` flags output parameter).
1958    pub fn font_flags(&self, index: usize) -> Option<u32> {
1959        self.get_char(index).and_then(|c| c.font_flags)
1960    }
1961
1962    /// No dedicated upstream `FPDF_Text_GetFontFlags` function exists; flags are a
1963    /// sub-value of [`text_get_font_info()`](Self::text_get_font_info).
1964    #[deprecated(
1965        note = "use `font_flags()` directly; no upstream `FPDFText_GetFontFlags()` method exists"
1966    )]
1967    #[inline]
1968    pub fn get_font_flags(&self, index: usize) -> Option<u32> {
1969        self.font_flags(index)
1970    }
1971
1972    /// Get the text rendering mode of the character at `index`
1973    /// (upstream `FPDFTextObj_GetTextRenderMode`, cached at extraction time).
1974    pub fn char_render_mode(&self, index: usize) -> Option<TextRenderingMode> {
1975        self.get_char(index).map(|c| c.rendering_mode)
1976    }
1977
1978    /// Deprecated — use [`char_render_mode()`](Self::char_render_mode).
1979    ///
1980    /// Note: there is no public `FPDFText_GetTextRenderMode` in the PDFium C API.
1981    /// The page-object-level API is `FPDFTextObj_GetTextRenderMode` (available as
1982    /// [`text_obj_get_text_render_mode()`](crate::textpage::TextPage) on `TextObject` in
1983    /// rpdfium-edit). The rendering mode is cached per character at text page extraction time.
1984    #[deprecated(
1985        note = "use `char_render_mode()` — no public `FPDFText_GetTextRenderMode`; for page-object API see `text_obj_get_text_render_mode()` on TextObject"
1986    )]
1987    #[inline]
1988    pub fn text_get_text_render_mode(&self, index: usize) -> Option<TextRenderingMode> {
1989        self.char_render_mode(index)
1990    }
1991
1992    #[deprecated(
1993        note = "use `char_render_mode()` — no public `FPDFText_GetTextRenderMode`; for page-object API see `text_obj_get_text_render_mode()` on TextObject"
1994    )]
1995    #[inline]
1996    pub fn get_text_render_mode(&self, index: usize) -> Option<TextRenderingMode> {
1997        self.char_render_mode(index)
1998    }
1999
2000    /// Get the fill color as RGBA u8 tuple.
2001    ///
2002    /// Convenience helper — not a direct upstream method.
2003    /// The upstream C API `FPDFText_GetFillColor` is covered by
2004    /// [`fill_color()`](Self::fill_color) / [`get_fill_color()`](Self::get_fill_color).
2005    pub fn fill_color_rgba(&self, index: usize) -> Option<(u8, u8, u8, u8)> {
2006        self.fill_color(index).map(|c| c.to_rgba_u8())
2007    }
2008
2009    /// Get the stroke color as RGBA u8 tuple.
2010    ///
2011    /// Convenience helper — not a direct upstream method.
2012    /// The upstream C API `FPDFText_GetStrokeColor` is covered by
2013    /// [`stroke_color()`](Self::stroke_color) / [`get_stroke_color()`](Self::get_stroke_color).
2014    pub fn stroke_color_rgba(&self, index: usize) -> Option<(u8, u8, u8, u8)> {
2015        self.stroke_color(index).map(|c| c.to_rgba_u8())
2016    }
2017
2018    /// Get text belonging to a specific text object (upstream `GetTextByObject`).
2019    ///
2020    /// Uses run IDs as proxy for the upstream `CPDF_TextObject*` pointer.
2021    /// Iterates all characters, collecting those matching the given run ID.
2022    /// Trailing spaces after matched characters are included.
2023    /// Line breaks (`\r\n`) are inserted when the Y position changes between
2024    /// non-adjacent matched characters.
2025    ///
2026    /// Corresponds to `CPDF_TextPage::GetTextByObject()`.
2027    pub fn text_by_object(&self, run_id: u32) -> String {
2028        let mut result = String::new();
2029        let mut prev_y: f32 = 0.0;
2030        let mut has_prev = false;
2031        let mut need_line_feed = false;
2032
2033        for (ch, rid) in self.characters.iter().zip(self.run_ids.iter()) {
2034            if *rid == Some(run_id) {
2035                if need_line_feed
2036                    && !has_prev
2037                    && (prev_y - ch.char_box.bottom).abs() > 0.0
2038                    && !result.is_empty()
2039                {
2040                    result.push_str("\r\n");
2041                }
2042                prev_y = ch.char_box.bottom;
2043                has_prev = true;
2044                need_line_feed = false;
2045                if ch.unicode != '\0' {
2046                    result.push(ch.unicode);
2047                }
2048            } else if ch.unicode == ' ' {
2049                if has_prev {
2050                    result.push(' ');
2051                    has_prev = false;
2052                    need_line_feed = false;
2053                }
2054            } else {
2055                has_prev = false;
2056                need_line_feed = true;
2057            }
2058        }
2059        result
2060    }
2061
2062    /// Upstream-aligned alias for [`text_by_object()`](Self::text_by_object).
2063    ///
2064    /// Corresponds to `CPDF_TextPage::GetTextByObject()`.
2065    #[inline]
2066    pub fn get_text_by_object(&self, run_id: u32) -> String {
2067        self.text_by_object(run_id)
2068    }
2069
2070    /// Rust-idiomatic alias for [`text_by_object()`](Self::text_by_object).
2071    ///
2072    /// Prefer [`text_by_object()`](Self::text_by_object) which matches the upstream C++ name.
2073    #[deprecated(since = "0.1.0", note = "Use `text_by_object()` instead")]
2074    #[inline]
2075    pub fn text_by_run(&self, run_id: u32) -> String {
2076        self.text_by_object(run_id)
2077    }
2078
2079    /// Check whether two characters (by index) belong to the same text object.
2080    ///
2081    /// Two characters are considered from the same text object if they have
2082    /// the same run ID. Falls back to comparing unicode, font name,
2083    /// and position within 1% tolerance when run IDs are not available.
2084    pub fn is_same_text_object(&self, a: usize, b: usize) -> bool {
2085        if let (Some(Some(id_a)), Some(Some(id_b))) = (self.run_ids.get(a), self.run_ids.get(b)) {
2086            return id_a == id_b;
2087        }
2088        // Fallback: position + font comparison
2089        if let (Some(ca), Some(cb)) = (self.get_char(a), self.get_char(b)) {
2090            let tolerance = ca.font_size * 0.01;
2091            ca.unicode == cb.unicode
2092                && ca.font_name == cb.font_name
2093                && (ca.char_box.left - cb.char_box.left).abs() < tolerance
2094                && (ca.char_box.bottom - cb.char_box.bottom).abs() < tolerance
2095        } else {
2096            false
2097        }
2098    }
2099
2100    /// Extract links (URLs and email addresses) from the page text.
2101    pub fn extract_links(&self) -> Vec<crate::linkextract::Link> {
2102        crate::linkextract::extract_links(&self.text)
2103    }
2104
2105    /// Upstream-aligned alias for [`extract_links()`](Self::extract_links).
2106    ///
2107    /// Corresponds to `FPDFLink_LoadWebLinks`.
2108    #[inline]
2109    pub fn link_load_web_links(&self) -> Vec<crate::linkextract::Link> {
2110        self.extract_links()
2111    }
2112
2113    /// Use [`link_load_web_links()`](Self::link_load_web_links) instead (upstream `FPDFLink_LoadWebLinks`).
2114    #[deprecated(
2115        since = "0.1.0",
2116        note = "Use `link_load_web_links()` (upstream `FPDFLink_LoadWebLinks`)"
2117    )]
2118    #[inline]
2119    pub fn load_web_links(&self) -> Vec<crate::linkextract::Link> {
2120        self.extract_links()
2121    }
2122
2123    /// Return the page text with soft hyphens (U+00AD) removed and
2124    /// hyphenated words re-joined.
2125    pub fn text_without_soft_hyphens(&self) -> String {
2126        self.characters
2127            .iter()
2128            .filter(|c| !c.is_soft_hyphen)
2129            .map(|c| c.unicode)
2130            .collect()
2131    }
2132
2133    /// Find the character index closest to the given position within tolerance.
2134    ///
2135    /// Matches upstream PDFium's `CPDF_TextPage::GetIndexAtPos` algorithm:
2136    /// 1. Direct hit (point inside char_box) → return immediately.
2137    /// 2. Tolerance: expand char_box by `x_tol/2` and `y_tol/2` on each side,
2138    ///    compute Manhattan distance to nearest edge, track minimum.
2139    pub fn index_at_pos(&self, x: f32, y: f32, x_tol: f32, y_tol: f32) -> Option<usize> {
2140        let mut nearest: Option<(usize, f32)> = None;
2141        for (i, ch) in self.characters.iter().enumerate() {
2142            let r = &ch.char_box;
2143            // Direct hit: point is inside the tight bounding box.
2144            if x >= r.left && x <= r.right && y >= r.bottom && y <= r.top {
2145                return Some(i);
2146            }
2147            // Skip if no tolerance is specified.
2148            if x_tol <= 0.0 && y_tol <= 0.0 {
2149                continue;
2150            }
2151            // Check expanded rect (char_box expanded by tol/2 on each side).
2152            let ext_left = r.left - x_tol / 2.0;
2153            let ext_right = r.right + x_tol / 2.0;
2154            let ext_bottom = r.bottom - y_tol / 2.0;
2155            let ext_top = r.top + y_tol / 2.0;
2156            if x < ext_left || x > ext_right || y < ext_bottom || y > ext_top {
2157                continue;
2158            }
2159            // Manhattan distance to nearest edge of the original (non-expanded) box.
2160            let dx = (x - r.left).abs().min((x - r.right).abs());
2161            let dy = (y - r.bottom).abs().min((y - r.top).abs());
2162            let dist = dx + dy;
2163            if nearest.is_none_or(|(_, d)| dist < d) {
2164                nearest = Some((i, dist));
2165            }
2166        }
2167        nearest.map(|(i, _)| i)
2168    }
2169
2170    /// Upstream-aligned alias for [`index_at_pos()`](Self::index_at_pos).
2171    #[inline]
2172    pub fn get_index_at_pos(&self, x: f32, y: f32, x_tol: f32, y_tol: f32) -> Option<usize> {
2173        self.index_at_pos(x, y, x_tol, y_tol)
2174    }
2175
2176    /// Find the character index at the given position with tolerance
2177    /// (upstream `FPDFText_GetCharIndexAtPos`).
2178    ///
2179    /// This is a convenience wrapper around [`index_at_pos()`](Self::index_at_pos)
2180    /// that accepts `f64` parameters to match the upstream C API signature.
2181    pub fn char_index_at_pos(&self, x: f64, y: f64, x_tol: f64, y_tol: f64) -> Option<usize> {
2182        self.index_at_pos(x as f32, y as f32, x_tol as f32, y_tol as f32)
2183    }
2184
2185    /// Upstream-aligned alias for [`char_index_at_pos()`](Self::char_index_at_pos).
2186    ///
2187    /// Corresponds to `FPDFText_GetCharIndexAtPos`.
2188    #[inline]
2189    pub fn text_get_char_index_at_pos(
2190        &self,
2191        x: f64,
2192        y: f64,
2193        x_tol: f64,
2194        y_tol: f64,
2195    ) -> Option<usize> {
2196        self.char_index_at_pos(x, y, x_tol, y_tol)
2197    }
2198
2199    /// Use [`text_get_char_index_at_pos()`](Self::text_get_char_index_at_pos) instead (upstream `FPDFText_GetCharIndexAtPos`).
2200    #[deprecated(
2201        since = "0.1.0",
2202        note = "Use `text_get_char_index_at_pos()` (upstream `FPDFText_GetCharIndexAtPos`)"
2203    )]
2204    #[inline]
2205    pub fn get_char_index_at_pos(&self, x: f64, y: f64, x_tol: f64, y_tol: f64) -> Option<usize> {
2206        self.char_index_at_pos(x, y, x_tol, y_tol)
2207    }
2208
2209    /// Get bounding rectangles for each character in range `[start, end)`.
2210    pub fn char_rects(&self, start: usize, end: usize) -> Vec<CharRect> {
2211        let end = end.min(self.characters.len());
2212        let start = start.min(end);
2213        self.characters[start..end]
2214            .iter()
2215            .map(|ch| ch.char_box.clone())
2216            .collect()
2217    }
2218
2219    /// Get selection rectangles that merge adjacent same-run characters.
2220    ///
2221    /// Matches upstream PDFium's `CPDF_TextPage::GetRectArray` algorithm:
2222    /// - Skips `CharType::Generated` characters (synthesized spaces).
2223    /// - Skips zero-size bounding boxes (width or height below epsilon).
2224    /// - Merges consecutive characters that share the same non-None run ID
2225    ///   into a single rectangle (equivalent to grouping by text object).
2226    /// - Starts a new rectangle whenever the run ID changes.
2227    pub fn rect_array(&self, start: usize, end: usize) -> Vec<CharRect> {
2228        let end_clamped = end.min(self.characters.len());
2229        let start_clamped = start.min(end_clamped);
2230        if start_clamped >= end_clamped {
2231            return Vec::new();
2232        }
2233
2234        let chars = &self.characters[start_clamped..end_clamped];
2235        let run_ids = &self.run_ids[start_clamped..end_clamped];
2236
2237        const EPSILON: f32 = 1e-6;
2238        let mut merged: Vec<CharRect> = Vec::new();
2239        let mut current_run_id: Option<Option<u32>> = None;
2240
2241        for (ch, &rid) in chars.iter().zip(run_ids.iter()) {
2242            // Skip generated (synthesized) characters.
2243            if ch.char_type == CharType::Generated {
2244                continue;
2245            }
2246            let r = &ch.char_box;
2247            // Skip zero-size boxes.
2248            let w = r.right - r.left;
2249            let h = r.top - r.bottom;
2250            if w < EPSILON || h < EPSILON {
2251                continue;
2252            }
2253
2254            // Determine whether this character starts a new run group.
2255            // Two characters are in the same group iff they share the same
2256            // non-None run ID. None run IDs always start a new group.
2257            let same_run = match (current_run_id, rid) {
2258                (Some(Some(prev)), Some(curr)) => prev == curr,
2259                _ => false,
2260            };
2261
2262            if same_run {
2263                let last = merged.last_mut().unwrap();
2264                last.left = last.left.min(r.left);
2265                last.bottom = last.bottom.min(r.bottom);
2266                last.right = last.right.max(r.right);
2267                last.top = last.top.max(r.top);
2268            } else {
2269                merged.push(r.clone());
2270                current_run_id = Some(rid);
2271            }
2272        }
2273
2274        merged
2275    }
2276
2277    /// Upstream-aligned alias for [`rect_array()`](Self::rect_array).
2278    #[inline]
2279    pub fn get_rect_array(&self, start: usize, end: usize) -> Vec<CharRect> {
2280        self.rect_array(start, end)
2281    }
2282
2283    /// Count the bounding rectangles for characters in `[start_index, start_index + count)`.
2284    ///
2285    /// Corresponds to `FPDFText_CountRects()`.  Upstream caches the result for a
2286    /// subsequent `FPDFText_GetRect()` call; rpdfium computes this statelessly —
2287    /// call [`rect_array()`](Self::rect_array) to retrieve the rectangles directly.
2288    ///
2289    /// Returns the number of merged character rectangles, or `0` if out of range.
2290    pub fn rect_count(&self, start_index: usize, count: usize) -> usize {
2291        self.rect_array(start_index, start_index.saturating_add(count))
2292            .len()
2293    }
2294
2295    /// Upstream-aligned alias for [`rect_count()`](Self::rect_count).
2296    ///
2297    /// Corresponds to `FPDFText_CountRects`.
2298    #[inline]
2299    pub fn text_count_rects(&self, start_index: usize, count: usize) -> usize {
2300        self.rect_count(start_index, count)
2301    }
2302
2303    /// Use [`text_count_rects()`](Self::text_count_rects) instead (upstream `FPDFText_CountRects`).
2304    #[deprecated(
2305        since = "0.1.0",
2306        note = "Use `text_count_rects()` (upstream `FPDFText_CountRects`)"
2307    )]
2308    #[inline]
2309    pub fn count_rects(&self, start_index: usize, count: usize) -> usize {
2310        self.rect_count(start_index, count)
2311    }
2312
2313    /// Return the bounding rectangle at `rect_index` within the set computed for
2314    /// `[start_index, start_index + total_count)`.
2315    ///
2316    /// Corresponds to `FPDFText_GetRect()`. Upstream retrieves by index from an
2317    /// internal cache set by `FPDFText_CountRects()`; rpdfium recomputes statelessly.
2318    /// Prefer [`rect_array()`](Self::rect_array) when all rectangles are needed.
2319    ///
2320    /// Returns `None` if `rect_index` is out of range.
2321    pub fn rect_at(
2322        &self,
2323        start_index: usize,
2324        total_count: usize,
2325        rect_index: usize,
2326    ) -> Option<CharRect> {
2327        self.rect_array(start_index, start_index.saturating_add(total_count))
2328            .into_iter()
2329            .nth(rect_index)
2330    }
2331
2332    /// Upstream-aligned alias for [`rect_at()`](Self::rect_at).
2333    ///
2334    /// Corresponds to `FPDFText_GetRect`.
2335    #[inline]
2336    pub fn text_get_rect(
2337        &self,
2338        start_index: usize,
2339        total_count: usize,
2340        rect_index: usize,
2341    ) -> Option<CharRect> {
2342        self.rect_at(start_index, total_count, rect_index)
2343    }
2344
2345    /// Use [`text_get_rect()`](Self::text_get_rect) instead (upstream `FPDFText_GetRect`).
2346    #[deprecated(
2347        since = "0.1.0",
2348        note = "Use `text_get_rect()` (upstream `FPDFText_GetRect`)"
2349    )]
2350    #[inline]
2351    pub fn get_rect(
2352        &self,
2353        start_index: usize,
2354        total_count: usize,
2355        rect_index: usize,
2356    ) -> Option<CharRect> {
2357        self.rect_at(start_index, total_count, rect_index)
2358    }
2359
2360    /// Return the page text with line breaks (`\r\n`) inserted between lines.
2361    ///
2362    /// Uses [`segment_lines`] to detect
2363    /// line boundaries and joins consecutive lines with `\r\n`.
2364    pub fn text_with_line_breaks(&self) -> String {
2365        let lines = segment_lines(&self.characters);
2366        if lines.is_empty() {
2367            return String::new();
2368        }
2369        lines
2370            .iter()
2371            .map(|l| l.text.as_str())
2372            .collect::<Vec<_>>()
2373            .join("\r\n")
2374    }
2375
2376    /// Extract text within a rectangular region (in page coordinates).
2377    ///
2378    /// Returns the concatenated Unicode characters whose origin falls
2379    /// within the rectangle defined by `(x1, y1)` (bottom-left) and
2380    /// `(x2, y2)` (top-right).
2381    pub fn text_by_rect(&self, x1: f32, y1: f32, x2: f32, y2: f32) -> String {
2382        self.characters
2383            .iter()
2384            .filter(|c| {
2385                c.char_box.left >= x1
2386                    && c.char_box.left <= x2
2387                    && c.char_box.bottom >= y1
2388                    && c.char_box.bottom <= y2
2389            })
2390            .map(|c| c.unicode)
2391            .collect()
2392    }
2393
2394    /// ADR-019 alias for [`text_by_rect()`](Self::text_by_rect).
2395    ///
2396    /// Corresponds to `CPDF_TextPage::GetTextByRect`.
2397    #[inline]
2398    pub fn get_text_by_rect(&self, x1: f32, y1: f32, x2: f32, y2: f32) -> String {
2399        self.text_by_rect(x1, y1, x2, y2)
2400    }
2401
2402    /// Return all text whose character boxes intersect the given rectangle
2403    /// (upstream `FPDFText_GetBoundedText`).
2404    ///
2405    /// Parameters use the upstream convention: `(left, top, right, bottom)`.
2406    /// This is a convenience wrapper around [`text_by_rect()`](Self::text_by_rect)
2407    /// that accepts `f64` parameters and reorders to `(left, bottom, right, top)`.
2408    pub fn text_in_rect(&self, left: f64, top: f64, right: f64, bottom: f64) -> String {
2409        self.text_by_rect(left as f32, bottom as f32, right as f32, top as f32)
2410    }
2411
2412    /// Non-upstream convenience wrapper for [`text_in_rect()`](Self::text_in_rect).
2413    ///
2414    /// There is no `GetTextInRect()` in upstream PDFium. Use
2415    /// [`get_bounded_text()`](Self::get_bounded_text) which corresponds to the
2416    /// real upstream `FPDFText_GetBoundedText`.
2417    #[deprecated(
2418        since = "0.1.0",
2419        note = "Use `text_in_rect()` or `get_bounded_text()` (upstream `FPDFText_GetBoundedText`) instead"
2420    )]
2421    #[inline]
2422    pub fn get_text_in_rect(&self, left: f64, top: f64, right: f64, bottom: f64) -> String {
2423        self.text_in_rect(left, top, right, bottom)
2424    }
2425
2426    /// Upstream-aligned alias for [`text_in_rect()`](Self::text_in_rect).
2427    ///
2428    /// Corresponds to `FPDFText_GetBoundedText`.
2429    #[inline]
2430    pub fn text_get_bounded_text(&self, left: f64, top: f64, right: f64, bottom: f64) -> String {
2431        self.text_in_rect(left, top, right, bottom)
2432    }
2433
2434    /// Use [`text_get_bounded_text()`](Self::text_get_bounded_text) instead (upstream `FPDFText_GetBoundedText`).
2435    #[deprecated(
2436        since = "0.1.0",
2437        note = "Use `text_get_bounded_text()` (upstream `FPDFText_GetBoundedText`)"
2438    )]
2439    #[inline]
2440    pub fn get_bounded_text(&self, left: f64, top: f64, right: f64, bottom: f64) -> String {
2441        self.text_in_rect(left, top, right, bottom)
2442    }
2443
2444    /// Get a substring of the page text starting at character index `start`
2445    /// for `count` characters (upstream `GetPageText(start, count)`).
2446    ///
2447    /// Indices are clamped to the character count.
2448    pub fn page_text(&self, start: usize, count: usize) -> String {
2449        let len = self.characters.len();
2450        let s = start.min(len);
2451        let e = (s + count).min(len);
2452        self.characters[s..e].iter().map(|c| c.unicode).collect()
2453    }
2454
2455    /// Upstream-aligned alias for [`page_text()`](Self::page_text).
2456    #[inline]
2457    pub fn get_page_text(&self, start: usize, count: usize) -> String {
2458        self.page_text(start, count)
2459    }
2460
2461    /// Upstream-aligned alias for [`page_text()`](Self::page_text).
2462    ///
2463    /// Corresponds to `FPDFText_GetText`.
2464    #[inline]
2465    pub fn text_get_text(&self, start: usize, count: usize) -> String {
2466        self.page_text(start, count)
2467    }
2468
2469    /// Use [`text_get_text()`](Self::text_get_text) instead (upstream `FPDFText_GetText`).
2470    #[deprecated(
2471        since = "0.1.0",
2472        note = "Use `text_get_text()` (upstream `FPDFText_GetText`)"
2473    )]
2474    #[inline]
2475    pub fn get_text(&self, start: usize, count: usize) -> String {
2476        self.page_text(start, count)
2477    }
2478
2479    /// Convert a text index (position in the full text string) to a
2480    /// character index (position in the characters array).
2481    ///
2482    /// # Identity mapping
2483    ///
2484    /// rpdfium uses an identity mapping (text index == char index) because
2485    /// every character stored in `self.characters` appears exactly once in
2486    /// `self.text`.  This contrasts with upstream PDFium, which maintains a
2487    /// separate `char_indices_` segment array to skip over "silent" kHyphen
2488    /// characters (U+FFFE) that are stored in `char_list_` but omitted from
2489    /// `text_buf_`.  rpdfium never inserts such silent characters into
2490    /// `self.characters`, so the mapping is always 1:1.
2491    ///
2492    /// Returns `None` if `text_index` is out of bounds (`≥ char_count()`).
2493    pub fn char_index_from_text_index(&self, text_index: usize) -> Option<usize> {
2494        if text_index < self.characters.len() {
2495            Some(text_index)
2496        } else {
2497            None
2498        }
2499    }
2500
2501    /// Upstream-aligned alias for [`char_index_from_text_index()`](Self::char_index_from_text_index).
2502    ///
2503    /// Corresponds to `FPDFText_GetCharIndexFromTextIndex`.
2504    #[inline]
2505    pub fn text_get_char_index_from_text_index(&self, text_index: usize) -> Option<usize> {
2506        self.char_index_from_text_index(text_index)
2507    }
2508
2509    /// Deprecated: use [`text_get_char_index_from_text_index()`](Self::text_get_char_index_from_text_index) — matches upstream `FPDFText_GetCharIndexFromTextIndex`.
2510    #[deprecated(
2511        note = "use `text_get_char_index_from_text_index()` — matches upstream `FPDFText_GetCharIndexFromTextIndex`"
2512    )]
2513    #[inline]
2514    pub fn get_char_index_from_text_index(&self, text_index: usize) -> Option<usize> {
2515        self.char_index_from_text_index(text_index)
2516    }
2517
2518    /// Convert a character index (position in the characters array) to a
2519    /// text index (position in the full text string).
2520    ///
2521    /// # Identity mapping
2522    ///
2523    /// rpdfium uses an identity mapping (char index == text index) because
2524    /// every entry in `self.characters` maps directly to one position in
2525    /// `self.text`.  See [`char_index_from_text_index`](Self::char_index_from_text_index)
2526    /// for the rationale.
2527    ///
2528    /// Returns `None` if `char_index` is out of bounds (`≥ char_count()`).
2529    pub fn text_index_from_char_index(&self, char_index: usize) -> Option<usize> {
2530        if char_index < self.characters.len() {
2531            Some(char_index)
2532        } else {
2533            None
2534        }
2535    }
2536
2537    /// Upstream-aligned alias for [`text_index_from_char_index()`](Self::text_index_from_char_index).
2538    ///
2539    /// Corresponds to `FPDFText_GetTextIndexFromCharIndex`.
2540    #[inline]
2541    pub fn text_get_text_index_from_char_index(&self, char_index: usize) -> Option<usize> {
2542        self.text_index_from_char_index(char_index)
2543    }
2544
2545    /// Deprecated: use [`text_get_text_index_from_char_index()`](Self::text_get_text_index_from_char_index) — matches upstream `FPDFText_GetTextIndexFromCharIndex`.
2546    #[deprecated(
2547        note = "use `text_get_text_index_from_char_index()` — matches upstream `FPDFText_GetTextIndexFromCharIndex`"
2548    )]
2549    #[inline]
2550    pub fn get_text_index_from_char_index(&self, char_index: usize) -> Option<usize> {
2551        self.text_index_from_char_index(char_index)
2552    }
2553
2554    /// Returns the index of the page object (in the edit layer) corresponding to the
2555    /// character at `char_index`, if any.
2556    ///
2557    /// # Not Supported
2558    ///
2559    /// Mapping text characters to page object indices requires cross-layer access
2560    /// between the text extraction layer and the edit layer, which is not currently
2561    /// wired up in rpdfium.
2562    ///
2563    /// Corresponds to `FPDFText_GetCharObject`.
2564    pub fn char_object_index(&self, _char_index: usize) -> Option<usize> {
2565        None
2566    }
2567
2568    /// Upstream-aligned alias for [`char_object_index()`](Self::char_object_index).
2569    ///
2570    /// Corresponds to `FPDFText_GetCharObject`.
2571    #[inline]
2572    pub fn text_get_char_object(&self, char_index: usize) -> Option<usize> {
2573        self.char_object_index(char_index)
2574    }
2575
2576    /// Deprecated — use [`text_get_char_object()`](Self::text_get_char_object) — matches upstream `FPDFText_GetCharObject`.
2577    #[deprecated(note = "use `text_get_char_object()` — matches upstream `FPDFText_GetCharObject`")]
2578    #[inline]
2579    pub fn get_char_object(&self, char_index: usize) -> Option<usize> {
2580        self.char_object_index(char_index)
2581    }
2582
2583    /// Returns `true` if the character at `char_index` has an associated page object.
2584    ///
2585    /// Corresponds to `FPDFText_HasTextObjectForChar`.
2586    pub fn has_text_object_for_char(&self, char_index: usize) -> bool {
2587        self.char_object_index(char_index).is_some()
2588    }
2589
2590    /// Get bounding rectangles for a link's text span.
2591    ///
2592    /// Converts the link's byte indices to character indices and
2593    /// delegates to [`rect_array()`](TextPage::rect_array).
2594    pub fn link_rects(&self, link: &crate::linkextract::Link) -> Vec<CharRect> {
2595        // Link indices are byte offsets in the text string.
2596        // Convert to char indices.
2597        let start_char = self.text[..link.start_index.min(self.text.len())]
2598            .chars()
2599            .count();
2600        let end_char = start_char
2601            + self.text[link.start_index.min(self.text.len())..link.end_index.min(self.text.len())]
2602                .chars()
2603                .count();
2604        self.rect_array(start_char, end_char)
2605    }
2606}
2607
2608#[cfg(test)]
2609mod tests {
2610    use super::*;
2611
2612    /// Helper to build a `TextCharacter` with minimal defaults.
2613    fn make_char(unicode: char, x: f32, y: f32) -> TextCharacter {
2614        TextCharacter {
2615            unicode,
2616            char_code: unicode as u32,
2617            char_box: CharRect {
2618                left: x,
2619                bottom: y,
2620                right: x + 10.0,
2621                top: y + 12.0,
2622            },
2623            font_size: 12.0,
2624            font_name: "TestFont".to_string(),
2625            space_width: Some(4.0),
2626            is_soft_hyphen: false,
2627            char_type: CharType::Normal,
2628            matrix: [1.0, 0.0, 0.0, 1.0, x, y],
2629            loose_char_box: None,
2630            fill_color: None,
2631            stroke_color: None,
2632            font_weight: None,
2633            font_flags: None,
2634            rendering_mode: TextRenderingMode::Fill,
2635        }
2636    }
2637
2638    // --- Phase A: TextCharacter field tests ---
2639
2640    #[test]
2641    fn test_text_character_has_char_code_field() {
2642        let ch = make_char('A', 0.0, 0.0);
2643        assert_eq!(ch.char_code, 65);
2644    }
2645
2646    #[test]
2647    fn test_text_character_has_matrix_field() {
2648        let ch = make_char('B', 10.0, 20.0);
2649        assert_eq!(ch.matrix, [1.0, 0.0, 0.0, 1.0, 10.0, 20.0]);
2650    }
2651
2652    #[test]
2653    fn test_get_text_object_via_run_ids() {
2654        let chars = vec![make_char('C', 0.0, 0.0)];
2655        let run_ids = vec![Some(42)];
2656        let page = TextPage::new_with_run_ids(chars, run_ids, false);
2657        assert_eq!(page.text_object(0), Some(42));
2658    }
2659
2660    #[test]
2661    fn test_char_type_not_unicode_variant() {
2662        let ct = CharType::NotUnicode;
2663        assert_ne!(ct, CharType::Normal);
2664        assert_eq!(ct, CharType::NotUnicode);
2665    }
2666
2667    // --- Phase B: loose_char_box tests ---
2668
2669    #[test]
2670    fn test_compute_loose_char_box_identity_matrix() {
2671        let rect = compute_loose_char_box(
2672            10.0, 20.0, 8.0, 12.0, // x, y, width, font_size
2673            750.0, -250.0, // ascent, descent
2674            1.0, 0.0, 0.0, 1.0, // identity matrix
2675        );
2676        // Descent = -250 * 12 / 1000 = -3.0
2677        // Ascent  = 750 * 12 / 1000 = 9.0
2678        assert!(rect.left <= 10.0);
2679        assert!(rect.bottom < 20.0);
2680        assert!(rect.right >= 18.0);
2681        assert!(rect.top > 20.0);
2682    }
2683
2684    #[test]
2685    fn test_compute_loose_char_box_scaled_matrix() {
2686        let rect = compute_loose_char_box(0.0, 0.0, 16.0, 12.0, 750.0, -250.0, 2.0, 0.0, 0.0, 2.0);
2687        // Width of 16.0 at scale 2 → text-space width = 8
2688        // Vertical: descent = -3 * 2 = -6, ascent = 9 * 2 = 18
2689        assert!(rect.top > 0.0);
2690        assert!(rect.bottom < 0.0);
2691    }
2692
2693    #[test]
2694    fn test_compute_loose_char_box_vertical_identity_matrix() {
2695        // vert_origin_y = 880, vert_w1y = -1000 (typical CJK defaults)
2696        // font_size = 12, identity matrix
2697        // top_page    = 880 * 12 / 1000 = 10.56
2698        // bottom_page = (880 + -1000) * 12 / 1000 = -120 * 12 / 1000 = -1.44
2699        let rect = compute_loose_char_box_vertical(
2700            0.0, 0.0, // x, y (origin)
2701            10.0, 12.0, // width, font_size
2702            880.0, -1000.0, // vert_origin_y, vert_w1y
2703            1.0, 0.0, 0.0, 1.0, // identity matrix
2704        );
2705        // top should be above writing position (positive)
2706        assert!(rect.top > 0.0, "top={}", rect.top);
2707        // bottom should be below writing position (negative)
2708        assert!(rect.bottom < 0.0, "bottom={}", rect.bottom);
2709        // right should be > left (valid width)
2710        assert!(rect.right > rect.left);
2711        // approximate values
2712        assert!((rect.top - 10.56).abs() < 0.01, "top={}", rect.top);
2713        assert!(
2714            (rect.bottom - (-1.44)).abs() < 0.01,
2715            "bottom={}",
2716            rect.bottom
2717        );
2718    }
2719
2720    #[test]
2721    fn test_compute_loose_char_box_vertical_with_offset() {
2722        // With a non-zero origin: x=5, y=100
2723        // vert_origin_y=880, vert_w1y=-1000, font_size=10, identity matrix
2724        // top_page    = 880 * 10 / 1000 = 8.8 → abs top  = 100 + 8.8 = 108.8
2725        // bottom_page = (880 - 1000) * 10 / 1000 = -1.2 → abs bot = 100 - 1.2 = 98.8
2726        let rect = compute_loose_char_box_vertical(
2727            5.0, 100.0, 10.0, 10.0, 880.0, -1000.0, 1.0, 0.0, 0.0, 1.0,
2728        );
2729        assert!((rect.top - 108.8).abs() < 0.01, "top={}", rect.top);
2730        assert!((rect.bottom - 98.8).abs() < 0.01, "bottom={}", rect.bottom);
2731    }
2732
2733    #[test]
2734    fn test_loose_char_box_is_none_for_generated_chars() {
2735        let mut ch = make_char(' ', 0.0, 0.0);
2736        ch.char_type = CharType::Generated;
2737        ch.loose_char_box = None;
2738        assert!(ch.loose_char_box.is_none());
2739    }
2740
2741    // --- Phase C: TextPage methods ---
2742
2743    #[test]
2744    fn test_get_page_text_basic() {
2745        let chars = vec![make_char('H', 0.0, 0.0), make_char('i', 10.0, 0.0)];
2746        let page = TextPage::new(chars);
2747        assert_eq!(page.page_text(0, 2), "Hi");
2748        assert_eq!(page.page_text(0, 1), "H");
2749        assert_eq!(page.page_text(1, 1), "i");
2750    }
2751
2752    #[test]
2753    fn test_get_page_text_clamped() {
2754        let chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
2755        let page = TextPage::new(chars);
2756        assert_eq!(page.page_text(0, 100), "AB");
2757        assert_eq!(page.page_text(5, 10), "");
2758    }
2759
2760    #[test]
2761    fn test_char_index_from_text_index_identity() {
2762        let chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
2763        let page = TextPage::new(chars);
2764        assert_eq!(page.char_index_from_text_index(0), Some(0));
2765        assert_eq!(page.char_index_from_text_index(1), Some(1));
2766        assert_eq!(page.char_index_from_text_index(2), None);
2767    }
2768
2769    #[test]
2770    fn test_text_index_from_char_index_identity() {
2771        let chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
2772        let page = TextPage::new(chars);
2773        assert_eq!(page.text_index_from_char_index(0), Some(0));
2774        assert_eq!(page.text_index_from_char_index(1), Some(1));
2775        assert_eq!(page.text_index_from_char_index(2), None);
2776    }
2777
2778    #[test]
2779    fn test_is_same_text_object_same_run() {
2780        let chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
2781        let run_ids = vec![Some(5), Some(5)];
2782        let page = TextPage::new_with_run_ids(chars, run_ids, false);
2783        assert!(page.is_same_text_object(0, 1));
2784    }
2785
2786    #[test]
2787    fn test_is_same_text_object_different_run() {
2788        let chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
2789        let run_ids = vec![Some(5), Some(6)];
2790        let page = TextPage::new_with_run_ids(chars, run_ids, false);
2791        assert!(!page.is_same_text_object(0, 1));
2792    }
2793
2794    #[test]
2795    fn test_is_same_text_object_fallback_no_run_id() {
2796        let chars = vec![make_char('A', 10.0, 20.0), make_char('A', 10.05, 20.05)];
2797        let run_ids = vec![None, None];
2798        let page = TextPage::new_with_run_ids(chars, run_ids, false);
2799        // Within 1% of font_size (0.12) → same
2800        assert!(page.is_same_text_object(0, 1));
2801    }
2802
2803    #[test]
2804    fn test_is_same_text_object_fallback_different_position() {
2805        let chars = vec![make_char('A', 10.0, 20.0), make_char('A', 50.0, 20.0)];
2806        let run_ids = vec![None, None];
2807        let page = TextPage::new_with_run_ids(chars, run_ids, false);
2808        assert!(!page.is_same_text_object(0, 1));
2809    }
2810
2811    // --- TextExtractor constructor tests ---
2812
2813    #[test]
2814    fn test_text_extractor_default_not_rtl() {
2815        let ext = TextExtractor::new();
2816        assert!(!ext.is_rtl());
2817    }
2818
2819    #[test]
2820    fn test_text_extractor_with_rtl() {
2821        let ext = TextExtractor::with_rtl(true);
2822        assert!(ext.is_rtl());
2823    }
2824
2825    #[test]
2826    fn test_text_extractor_with_rtl_false() {
2827        let ext = TextExtractor::with_rtl(false);
2828        assert!(!ext.is_rtl());
2829    }
2830
2831    // --- TextPage new_with_direction ---
2832
2833    #[test]
2834    fn test_text_page_new_with_direction_false() {
2835        let chars = vec![make_char('H', 0.0, 0.0), make_char('i', 10.0, 0.0)];
2836        let page = TextPage::new_with_direction(chars, false);
2837        assert_eq!(page.all_page_text(), "Hi");
2838    }
2839
2840    #[test]
2841    fn test_text_page_new_with_direction_true() {
2842        let chars = vec![make_char('H', 0.0, 0.0), make_char('i', 10.0, 0.0)];
2843        let page = TextPage::new_with_direction(chars, true);
2844        // LTR text with RTL hint: bidi reordering may or may not reorder
2845        // plain LTR text, but the result should still be valid
2846        assert_eq!(page.char_count(), 2);
2847    }
2848
2849    // --- link_rects ---
2850
2851    // --- Phase D: Indexed getter tests ---
2852
2853    #[test]
2854    fn test_get_unicode_returns_char() {
2855        let chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
2856        let page = TextPage::new(chars);
2857        assert_eq!(page.unicode(0), Some('A'));
2858        assert_eq!(page.unicode(1), Some('B'));
2859        assert_eq!(page.unicode(2), None);
2860    }
2861
2862    #[test]
2863    fn test_is_generated_false_for_normal() {
2864        let chars = vec![make_char('A', 0.0, 0.0)];
2865        let page = TextPage::new(chars);
2866        assert_eq!(page.is_generated(0), Some(false));
2867    }
2868
2869    #[test]
2870    fn test_is_generated_true_for_generated() {
2871        let mut ch = make_char(' ', 0.0, 0.0);
2872        ch.char_type = CharType::Generated;
2873        let page = TextPage::new(vec![ch]);
2874        assert_eq!(page.is_generated(0), Some(true));
2875    }
2876
2877    #[test]
2878    fn test_is_generated_out_of_bounds() {
2879        let page = TextPage::new(vec![]);
2880        assert_eq!(page.is_generated(0), None);
2881    }
2882
2883    #[test]
2884    fn test_is_hyphen_true_for_hyphen_char_type() {
2885        let mut ch = make_char('-', 0.0, 0.0);
2886        ch.char_type = CharType::Hyphen;
2887        let page = TextPage::new(vec![ch]);
2888        assert_eq!(page.is_hyphen(0), Some(true));
2889    }
2890
2891    #[test]
2892    fn test_is_hyphen_true_for_soft_hyphen() {
2893        let mut ch = make_char('\u{00AD}', 0.0, 0.0);
2894        ch.is_soft_hyphen = true;
2895        let page = TextPage::new(vec![ch]);
2896        assert_eq!(page.is_hyphen(0), Some(true));
2897    }
2898
2899    #[test]
2900    fn test_is_hyphen_false_for_normal() {
2901        let chars = vec![make_char('A', 0.0, 0.0)];
2902        let page = TextPage::new(chars);
2903        assert_eq!(page.is_hyphen(0), Some(false));
2904    }
2905
2906    #[test]
2907    fn test_has_unicode_map_error_true_for_not_unicode() {
2908        let mut ch = make_char('\u{FFFD}', 0.0, 0.0);
2909        ch.char_type = CharType::NotUnicode;
2910        let page = TextPage::new(vec![ch]);
2911        assert_eq!(page.has_unicode_map_error(0), Some(true));
2912    }
2913
2914    #[test]
2915    fn test_has_unicode_map_error_false_for_normal() {
2916        let chars = vec![make_char('A', 0.0, 0.0)];
2917        let page = TextPage::new(chars);
2918        assert_eq!(page.has_unicode_map_error(0), Some(false));
2919    }
2920
2921    #[test]
2922    fn test_get_font_size_returns_size() {
2923        let chars = vec![make_char('A', 0.0, 0.0)];
2924        let page = TextPage::new(chars);
2925        assert_eq!(page.font_size(0), Some(12.0));
2926        assert_eq!(page.font_size(1), None);
2927    }
2928
2929    #[test]
2930    fn test_get_font_info_returns_name() {
2931        let chars = vec![make_char('A', 0.0, 0.0)];
2932        let page = TextPage::new(chars);
2933        assert_eq!(page.font_info(0), Some("TestFont"));
2934        assert_eq!(page.font_info(1), None);
2935    }
2936
2937    #[test]
2938    fn test_get_char_angle_identity_matrix() {
2939        let chars = vec![make_char('A', 0.0, 0.0)];
2940        let page = TextPage::new(chars);
2941        let angle = page.char_angle(0).unwrap();
2942        // atan2(0, 1) = 0
2943        assert!(angle.abs() < 0.001);
2944    }
2945
2946    #[test]
2947    fn test_get_char_angle_rotated() {
2948        let mut ch = make_char('A', 0.0, 0.0);
2949        // 90-degree rotation: a=0, b=1, c=-1, d=0
2950        ch.matrix = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0];
2951        let page = TextPage::new(vec![ch]);
2952        let angle = page.char_angle(0).unwrap();
2953        assert!((angle - std::f32::consts::FRAC_PI_2).abs() < 0.001);
2954    }
2955
2956    #[test]
2957    fn test_get_char_angle_negative_normalized() {
2958        let mut ch = make_char('A', 0.0, 0.0);
2959        // -90 degree rotation: a=0, b=-1
2960        ch.matrix = [0.0, -1.0, 1.0, 0.0, 0.0, 0.0];
2961        let page = TextPage::new(vec![ch]);
2962        let angle = page.char_angle(0).unwrap();
2963        // Should be normalized to >= 0 (3*PI/2)
2964        assert!(angle >= 0.0);
2965        assert!((angle - 3.0 * std::f32::consts::FRAC_PI_2).abs() < 0.001);
2966    }
2967
2968    #[test]
2969    fn test_get_char_box_basic() {
2970        let chars = vec![make_char('A', 10.0, 20.0)];
2971        let page = TextPage::new(chars);
2972        let rect = page.char_box(0).unwrap();
2973        assert_eq!(rect.left, 10.0);
2974        assert_eq!(rect.bottom, 20.0);
2975        assert_eq!(rect.right, 20.0); // 10 + width(10)
2976        assert_eq!(rect.top, 32.0); // 20 + height(12)
2977    }
2978
2979    #[test]
2980    fn test_get_char_box_out_of_bounds() {
2981        let page = TextPage::new(vec![]);
2982        assert!(page.char_box(0).is_none());
2983    }
2984
2985    #[test]
2986    fn test_get_loose_char_box_none_when_absent() {
2987        let chars = vec![make_char('A', 0.0, 0.0)];
2988        let page = TextPage::new(chars);
2989        // make_char sets loose_char_box to None
2990        assert!(page.loose_char_box(0).is_none());
2991    }
2992
2993    #[test]
2994    fn test_get_loose_char_box_some_when_present() {
2995        let mut ch = make_char('A', 0.0, 0.0);
2996        ch.loose_char_box = Some(CharRect {
2997            left: -1.0,
2998            bottom: -3.0,
2999            right: 11.0,
3000            top: 9.0,
3001        });
3002        let page = TextPage::new(vec![ch]);
3003        let rect = page.loose_char_box(0).unwrap();
3004        assert_eq!(rect.left, -1.0);
3005        assert_eq!(rect.top, 9.0);
3006    }
3007
3008    #[test]
3009    fn test_get_matrix_returns_matrix() {
3010        let chars = vec![make_char('A', 5.0, 10.0)];
3011        let page = TextPage::new(chars);
3012        let m = page.matrix(0).unwrap();
3013        assert_eq!(m, [1.0, 0.0, 0.0, 1.0, 5.0, 10.0]);
3014        assert!(page.matrix(1).is_none());
3015    }
3016
3017    #[test]
3018    fn test_get_char_origin_returns_ef() {
3019        let chars = vec![make_char('A', 100.0, 200.0)];
3020        let page = TextPage::new(chars);
3021        let o = page.char_origin(0).unwrap();
3022        assert_eq!(o.x, 100.0);
3023        assert_eq!(o.y, 200.0);
3024    }
3025
3026    #[test]
3027    fn test_get_text_object_returns_run_id() {
3028        let chars = vec![make_char('A', 0.0, 0.0)];
3029        let run_ids = vec![Some(42)];
3030        let page = TextPage::new_with_run_ids(chars, run_ids, false);
3031        assert_eq!(page.text_object(0), Some(42));
3032    }
3033
3034    #[test]
3035    fn test_get_text_object_none_for_generated() {
3036        let mut ch = make_char(' ', 0.0, 0.0);
3037        ch.char_type = CharType::Generated;
3038        let run_ids = vec![None];
3039        let page = TextPage::new_with_run_ids(vec![ch], run_ids, false);
3040        assert_eq!(page.text_object(0), None);
3041    }
3042
3043    #[test]
3044    fn test_get_fill_color_none_by_default() {
3045        let chars = vec![make_char('A', 0.0, 0.0)];
3046        let page = TextPage::new(chars);
3047        assert!(page.fill_color(0).is_none());
3048    }
3049
3050    #[test]
3051    fn test_get_fill_color_some_when_set() {
3052        let mut ch = make_char('A', 0.0, 0.0);
3053        ch.fill_color = Some(Color::rgb(1.0, 0.0, 0.0));
3054        let page = TextPage::new(vec![ch]);
3055        let color = page.fill_color(0).unwrap();
3056        assert_eq!(color.components[0], 1.0);
3057        assert_eq!(color.components[1], 0.0);
3058    }
3059
3060    #[test]
3061    fn test_get_stroke_color_none_by_default() {
3062        let chars = vec![make_char('A', 0.0, 0.0)];
3063        let page = TextPage::new(chars);
3064        assert!(page.stroke_color(0).is_none());
3065    }
3066
3067    #[test]
3068    fn test_get_stroke_color_some_when_set() {
3069        let mut ch = make_char('A', 0.0, 0.0);
3070        ch.stroke_color = Some(Color::gray(0.5));
3071        let page = TextPage::new(vec![ch]);
3072        let color = page.stroke_color(0).unwrap();
3073        assert_eq!(color.components[0], 0.5);
3074    }
3075
3076    #[test]
3077    fn test_get_font_weight_none_by_default() {
3078        let chars = vec![make_char('A', 0.0, 0.0)];
3079        let page = TextPage::new(chars);
3080        assert_eq!(page.font_weight(0), None);
3081    }
3082
3083    #[test]
3084    fn test_get_font_weight_some_when_set() {
3085        let mut ch = make_char('A', 0.0, 0.0);
3086        ch.font_weight = Some(700);
3087        let page = TextPage::new(vec![ch]);
3088        assert_eq!(page.font_weight(0), Some(700));
3089    }
3090
3091    #[test]
3092    fn test_get_font_weight_out_of_bounds() {
3093        let page = TextPage::new(vec![]);
3094        assert_eq!(page.font_weight(0), None);
3095    }
3096
3097    #[test]
3098    fn test_link_rects_basic() {
3099        let chars: Vec<TextCharacter> = "Hello"
3100            .chars()
3101            .enumerate()
3102            .map(|(i, c)| make_char(c, i as f32 * 10.0, 0.0))
3103            .collect();
3104        // All 5 chars share run ID 1 — they merge into a single rect.
3105        let run_ids: Vec<Option<u32>> = vec![Some(1); 5];
3106        let page = TextPage::new_with_run_ids(chars, run_ids, false);
3107        let link = crate::linkextract::Link {
3108            url: "Hello".to_string(),
3109            start_index: 0,
3110            end_index: 5,
3111            kind: crate::linkextract::LinkKind::WebUrl,
3112        };
3113        let rects = page.link_rects(&link);
3114        assert!(!rects.is_empty());
3115        // All chars share the same run ID — should merge to 1 rect.
3116        assert_eq!(rects.len(), 1);
3117    }
3118
3119    // --- G1: font_flags getter tests ---
3120
3121    #[test]
3122    fn test_get_font_flags_none_by_default() {
3123        let chars = vec![make_char('A', 0.0, 0.0)];
3124        let page = TextPage::new(chars);
3125        assert_eq!(page.font_flags(0), None);
3126    }
3127
3128    #[test]
3129    fn test_get_font_flags_returns_value() {
3130        let mut ch = make_char('A', 0.0, 0.0);
3131        ch.font_flags = Some(0x42); // Serif + Symbolic
3132        let page = TextPage::new(vec![ch]);
3133        assert_eq!(page.font_flags(0), Some(0x42));
3134    }
3135
3136    #[test]
3137    fn test_get_font_flags_out_of_bounds() {
3138        let page = TextPage::new(vec![]);
3139        assert_eq!(page.font_flags(0), None);
3140    }
3141
3142    // --- G2: RGBA convenience getter tests ---
3143
3144    #[test]
3145    fn test_get_fill_color_rgba_returns_tuple() {
3146        let mut ch = make_char('A', 0.0, 0.0);
3147        ch.fill_color = Some(Color::rgb(1.0, 0.0, 0.0));
3148        let page = TextPage::new(vec![ch]);
3149        assert_eq!(page.fill_color_rgba(0), Some((255, 0, 0, 255)));
3150    }
3151
3152    #[test]
3153    fn test_get_stroke_color_rgba_returns_tuple() {
3154        let mut ch = make_char('A', 0.0, 0.0);
3155        ch.stroke_color = Some(Color::rgb(0.0, 1.0, 0.0));
3156        let page = TextPage::new(vec![ch]);
3157        assert_eq!(page.stroke_color_rgba(0), Some((0, 255, 0, 255)));
3158    }
3159
3160    #[test]
3161    fn test_get_fill_color_rgba_none_when_no_color() {
3162        let chars = vec![make_char('A', 0.0, 0.0)];
3163        let page = TextPage::new(chars);
3164        assert_eq!(page.fill_color_rgba(0), None);
3165    }
3166
3167    // --- G3: text_by_object / get_text_by_object tests ---
3168
3169    #[test]
3170    fn test_text_by_object_single_run() {
3171        let chars = vec![make_char('H', 0.0, 0.0), make_char('i', 10.0, 0.0)];
3172        let run_ids = vec![Some(1), Some(1)];
3173        let page = TextPage::new_with_run_ids(chars, run_ids, false);
3174        assert_eq!(page.text_by_object(1), "Hi");
3175    }
3176
3177    #[test]
3178    fn test_text_by_object_nonexistent() {
3179        let chars = vec![make_char('A', 0.0, 0.0)];
3180        let run_ids = vec![Some(1)];
3181        let page = TextPage::new_with_run_ids(chars, run_ids, false);
3182        assert_eq!(page.text_by_object(99), "");
3183    }
3184
3185    #[test]
3186    fn test_text_by_object_with_trailing_space() {
3187        let mut chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
3188        let mut run_ids: Vec<Option<u32>> = vec![Some(1), Some(1)];
3189        // Add a space after the run
3190        let mut sp = make_char(' ', 20.0, 0.0);
3191        sp.char_type = CharType::Generated;
3192        chars.push(sp);
3193        run_ids.push(None);
3194        // Add a different run
3195        chars.push(make_char('X', 30.0, 0.0));
3196        run_ids.push(Some(2));
3197        let page = TextPage::new_with_run_ids(chars, run_ids, false);
3198        // Trailing space after run 1 should be included
3199        assert_eq!(page.text_by_object(1), "AB ");
3200    }
3201
3202    #[test]
3203    fn test_text_by_object_multi_line() {
3204        let mut chars = vec![make_char('A', 0.0, 100.0), make_char('B', 10.0, 100.0)];
3205        let mut run_ids: Vec<Option<u32>> = vec![Some(1), Some(1)];
3206        // Different run on a different line
3207        chars.push(make_char('C', 0.0, 80.0));
3208        run_ids.push(Some(2));
3209        // Same run 1 on a new line
3210        chars.push(make_char('D', 0.0, 60.0));
3211        run_ids.push(Some(1));
3212        let page = TextPage::new_with_run_ids(chars, run_ids, false);
3213        let text = page.text_by_object(1);
3214        assert!(text.contains("AB"));
3215        assert!(text.contains("D"));
3216        // Line break between AB and D
3217        assert!(text.contains("\r\n"));
3218    }
3219
3220    // --- A1: NormalizeThreshold tests ---
3221
3222    #[test]
3223    fn test_normalize_threshold_below_t1() {
3224        // threshold < 300 → threshold / 2.0
3225        let result = normalize_threshold(200.0, 300, 500, 700);
3226        assert!((result - 100.0).abs() < 0.01);
3227    }
3228
3229    #[test]
3230    fn test_normalize_threshold_between_t1_t2() {
3231        // 300 <= threshold < 500 → threshold / 4.0
3232        let result = normalize_threshold(400.0, 300, 500, 700);
3233        assert!((result - 100.0).abs() < 0.01);
3234    }
3235
3236    #[test]
3237    fn test_normalize_threshold_between_t2_t3() {
3238        // 500 <= threshold < 700 → threshold / 5.0
3239        let result = normalize_threshold(600.0, 300, 500, 700);
3240        assert!((result - 120.0).abs() < 0.01);
3241    }
3242
3243    #[test]
3244    fn test_normalize_threshold_above_t3() {
3245        // threshold >= 700 → threshold / 6.0
3246        let result = normalize_threshold(900.0, 300, 500, 700);
3247        assert!((result - 150.0).abs() < 0.01);
3248    }
3249
3250    #[test]
3251    fn test_normalize_threshold_zero() {
3252        let result = normalize_threshold(0.0, 300, 500, 700);
3253        assert!((result - 0.0).abs() < 0.01);
3254    }
3255
3256    // --- A2: Orientation detection tests ---
3257
3258    #[test]
3259    fn test_detect_orientation_empty() {
3260        let result = detect_orientation(&[], 612.0, 792.0);
3261        assert_eq!(result, TextFlowOrientation::Unknown);
3262    }
3263
3264    #[test]
3265    fn test_detect_orientation_horizontal_text() {
3266        // Wide characters spread across the page horizontally
3267        let chars: Vec<TextCharacter> = (0..20)
3268            .map(|i| {
3269                let mut c = make_char('A', i as f32 * 30.0, 700.0);
3270                c.char_box = CharRect {
3271                    left: c.char_box.left,
3272                    bottom: c.char_box.bottom,
3273                    right: c.char_box.left + 25.0,
3274                    top: c.char_box.bottom + 12.0,
3275                };
3276                c
3277            })
3278            .collect();
3279        let result = detect_orientation(&chars, 612.0, 792.0);
3280        assert_eq!(result, TextFlowOrientation::Horizontal);
3281    }
3282
3283    #[test]
3284    fn test_detect_orientation_vertical_text() {
3285        // Narrow characters stacked vertically
3286        let chars: Vec<TextCharacter> = (0..20)
3287            .map(|i| {
3288                let mut c = make_char('A', 500.0, 700.0 - i as f32 * 30.0);
3289                c.char_box = CharRect {
3290                    left: c.char_box.left,
3291                    bottom: c.char_box.bottom,
3292                    right: c.char_box.left + 12.0,
3293                    top: c.char_box.bottom + 25.0,
3294                };
3295                c
3296            })
3297            .collect();
3298        let result = detect_orientation(&chars, 612.0, 792.0);
3299        assert_eq!(result, TextFlowOrientation::Vertical);
3300    }
3301
3302    #[test]
3303    fn test_detect_orientation_zero_dimensions() {
3304        let chars = vec![make_char('A', 0.0, 0.0)];
3305        let result = detect_orientation(&chars, 0.0, 0.0);
3306        assert_eq!(result, TextFlowOrientation::Unknown);
3307    }
3308
3309    // --- Bidi reordering tests (audit gap: no bidirectional integration tests) ---
3310
3311    #[test]
3312    fn test_bidi_reordering_rtl_chars_reordered() {
3313        // Arabic characters: BA (U+0628) and ALEF (U+0627) — both have bidi class AL.
3314        // TextPage::new() calls reorder_bidi(), which may reorder them visually.
3315        // Regardless of reordering, all characters must be preserved.
3316        let chars = vec![
3317            make_char('\u{0628}', 10.0, 0.0), // ب BA
3318            make_char('\u{0627}', 0.0, 0.0),  // ا ALEF
3319        ];
3320        let page = TextPage::new(chars);
3321        assert_eq!(
3322            page.char_count(),
3323            2,
3324            "bidi reordering must not lose Arabic characters"
3325        );
3326        let text = page.all_page_text();
3327        assert!(
3328            text.contains('\u{0628}') && text.contains('\u{0627}'),
3329            "both Arabic characters must be present after bidi reordering, got: {:?}",
3330            text
3331        );
3332    }
3333
3334    #[test]
3335    fn test_bidi_reordering_mixed_ltr_rtl() {
3336        // Mix of Latin LTR characters and Arabic RTL characters.
3337        // Bidi reordering may change visual order but must preserve all chars.
3338        let chars = vec![
3339            make_char('H', 0.0, 0.0),
3340            make_char('i', 10.0, 0.0),
3341            make_char('\u{0627}', 20.0, 0.0), // ا ALEF (RTL)
3342            make_char('\u{0628}', 30.0, 0.0), // ب BA (RTL)
3343        ];
3344        let page = TextPage::new(chars);
3345        assert_eq!(
3346            page.char_count(),
3347            4,
3348            "mixed bidi must preserve all 4 characters"
3349        );
3350        let text = page.all_page_text();
3351        assert!(
3352            text.contains('H') && text.contains('i'),
3353            "LTR characters must survive bidi reordering: {:?}",
3354            text
3355        );
3356        assert!(
3357            text.contains('\u{0627}') && text.contains('\u{0628}'),
3358            "RTL characters must survive bidi reordering: {:?}",
3359            text
3360        );
3361    }
3362
3363    // --- Large-page / stress tests (upstream analogue: BigtableTextExtraction, Bug921, CountRects, TextSearch) ---
3364
3365    #[test]
3366    fn test_large_page_count_chars_500() {
3367        // Upstream analogue: FPDFText_CountChars on a large page.
3368        // Bug921 embedder test uses 278 chars; scaled here to 500.
3369        let chars: Vec<TextCharacter> = (0..500u32)
3370            .map(|i| {
3371                let col = (i % 50) as f32;
3372                let row = (i / 50) as f32;
3373                let ch = (b'A' + (i % 26) as u8) as char;
3374                make_char(ch, col * 10.0, 700.0 - row * 20.0)
3375            })
3376            .collect();
3377        let page = TextPage::new(chars);
3378        assert_eq!(page.char_count(), 500);
3379    }
3380
3381    #[test]
3382    fn test_large_page_get_page_text_complete() {
3383        // Upstream analogue: FPDFText_GetText looping all chars on a large page
3384        // (Bug921 embedder test loops all 278 chars; scaled here to 500).
3385        // Verifies get_all_page_text() and get_page_text(start, count) are consistent.
3386        let chars: Vec<TextCharacter> = (0..500u32)
3387            .map(|i| {
3388                let col = (i % 50) as f32;
3389                let row = (i / 50) as f32;
3390                let ch = (b'A' + (i % 26) as u8) as char;
3391                make_char(ch, col * 10.0, 700.0 - row * 20.0)
3392            })
3393            .collect();
3394        let expected: String = (0..500u32)
3395            .map(|i| (b'A' + (i % 26) as u8) as char)
3396            .collect();
3397        let page = TextPage::new(chars);
3398        assert_eq!(page.all_page_text(), expected);
3399        // Subrange: chars 10..20
3400        let sub: String = (10..20u32)
3401            .map(|i| (b'A' + (i % 26) as u8) as char)
3402            .collect();
3403        assert_eq!(page.page_text(10, 10), sub);
3404    }
3405
3406    #[test]
3407    fn test_large_page_all_char_getters_return_some() {
3408        // Upstream analogue: FPDFText_GetUnicode / GetFontSize / GetCharBox /
3409        // GetMatrix / GetCharOrigin / IsGenerated / IsHyphen / HasUnicodeMapError
3410        // for every valid index.
3411        // BigtableTextExtraction loops all 65 chars; scaled here to 300.
3412        let chars: Vec<TextCharacter> = (0..300u32)
3413            .map(|i| make_char((b'A' + (i % 26) as u8) as char, i as f32 * 10.0, 700.0))
3414            .collect();
3415        let page = TextPage::new(chars);
3416        for i in 0..300 {
3417            assert!(page.unicode(i).is_some(), "get_unicode({i})");
3418            assert!(page.char_code(i).is_some(), "get_char_code({i})");
3419            assert!(page.font_size(i).is_some(), "get_font_size({i})");
3420            assert!(page.font_info(i).is_some(), "get_font_info({i})");
3421            assert!(page.char_angle(i).is_some(), "get_char_angle({i})");
3422            assert!(page.char_box(i).is_some(), "get_char_box({i})");
3423            assert!(page.matrix(i).is_some(), "get_matrix({i})");
3424            assert!(page.char_origin(i).is_some(), "get_char_origin({i})");
3425            assert!(page.is_generated(i).is_some(), "is_generated({i})");
3426            assert!(page.is_hyphen(i).is_some(), "is_hyphen({i})");
3427            assert!(
3428                page.has_unicode_map_error(i).is_some(),
3429                "has_unicode_map_error({i})"
3430            );
3431        }
3432        // Out-of-bounds must return None (upstream: invalid index returns false/−1)
3433        assert!(page.unicode(300).is_none());
3434        assert!(page.char_box(300).is_none());
3435        assert!(page.char_origin(300).is_none());
3436        assert!(page.matrix(300).is_none());
3437    }
3438
3439    #[test]
3440    fn test_large_page_char_box_valid_bounds() {
3441        // Upstream analogue: CharBox test extended to large N.
3442        // Upstream checks exact char-box dimensions for 9 chars in font_matrix.pdf;
3443        // here we verify the structural invariant (left ≤ right, bottom ≤ top) for 200 chars.
3444        let chars: Vec<TextCharacter> = (0..200u32)
3445            .map(|i| make_char('A', i as f32 * 12.0, 500.0))
3446            .collect();
3447        let page = TextPage::new(chars);
3448        for i in 0..200 {
3449            let rect = page.char_box(i).expect("get_char_box should be Some");
3450            assert!(
3451                rect.right >= rect.left,
3452                "rect.right >= rect.left violated at index {i}"
3453            );
3454            assert!(
3455                rect.top >= rect.bottom,
3456                "rect.top >= rect.bottom violated at index {i}"
3457            );
3458        }
3459    }
3460
3461    #[test]
3462    fn test_large_page_char_origin_matches_matrix_ef() {
3463        // Upstream analogue: FPDFText_GetCharOrigin returns (matrix.e, matrix.f) per spec.
3464        // The GetMatrix embedder test verifies this relationship; here we verify
3465        // the invariant holds for all 200 characters.
3466        let chars: Vec<TextCharacter> = (0..200u32)
3467            .map(|i| make_char('X', i as f32 * 7.5, 300.0 + (i % 10) as f32 * 15.0))
3468            .collect();
3469        let page = TextPage::new(chars);
3470        for i in 0..200 {
3471            let origin = page.char_origin(i).expect("get_char_origin should be Some");
3472            let matrix = page.matrix(i).expect("get_matrix should be Some");
3473            assert_eq!(
3474                origin.x, matrix[4],
3475                "CharOrigin.x != matrix[4] at index {i}"
3476            );
3477            assert_eq!(
3478                origin.y, matrix[5],
3479                "CharOrigin.y != matrix[5] at index {i}"
3480            );
3481        }
3482    }
3483
3484    #[test]
3485    fn test_large_page_segmentation_scales_to_10_lines() {
3486        // Upstream analogue: orientation detection / line segmentation on a multi-line page.
3487        // Build 200 chars across 10 lines (20 chars per line).
3488        let chars: Vec<TextCharacter> = (0..200u32)
3489            .map(|i| {
3490                let col = (i % 20) as f32;
3491                let row = (i / 20) as f32;
3492                make_char(
3493                    (b'A' + (i % 26) as u8) as char,
3494                    col * 10.0,
3495                    700.0 - row * 20.0,
3496                )
3497            })
3498            .collect();
3499        let lines = segment_lines(&chars);
3500        assert_eq!(
3501            lines.len(),
3502            10,
3503            "expected 10 lines from 200 chars at 20/line"
3504        );
3505        for (idx, line) in lines.iter().enumerate() {
3506            assert!(
3507                !line.words.is_empty(),
3508                "line {idx} should have at least one word"
3509            );
3510        }
3511    }
3512
3513    #[test]
3514    fn test_large_page_search_finds_repeated_pattern() {
3515        // Upstream analogue: FPDFText_FindNext loops until no more results.
3516        // TextSearch embedder test uses "world" in a 30-char page; scaled here to
3517        // 300 chars = 100 repetitions of "ABC".
3518        let chars: Vec<TextCharacter> = (0..300u32)
3519            .map(|i| {
3520                let ch = match i % 3 {
3521                    0 => 'A',
3522                    1 => 'B',
3523                    _ => 'C',
3524                };
3525                make_char(ch, i as f32 * 8.0, 500.0)
3526            })
3527            .collect();
3528        let page = TextPage::new(chars);
3529        let text = page.all_page_text().to_string();
3530        let mut finder = crate::textpagefind::TextPageFind::new(
3531            &text,
3532            "ABC",
3533            crate::textpagefind::SearchOptions::default(),
3534        );
3535        let mut count = 0usize;
3536        while finder.find_next().is_some() {
3537            count += 1;
3538        }
3539        assert_eq!(
3540            count, 100,
3541            "expected 100 occurrences of 'ABC' in 300-char page"
3542        );
3543    }
3544
3545    #[test]
3546    fn test_large_page_rect_array_single_line_merges_to_one() {
3547        // Upstream analogue: FPDFText_CountRects / FPDFText_GetRect (CountRects embedder test).
3548        // 50 chars on a single horizontal line with the same run ID should merge into 1 rect.
3549        let chars: Vec<TextCharacter> = (0..50u32)
3550            .map(|i| make_char('A', i as f32 * 10.0, 500.0))
3551            .collect();
3552        // All chars share run ID 1 — they should merge into a single rect.
3553        let run_ids: Vec<Option<u32>> = (0..50).map(|_| Some(1)).collect();
3554        let page = TextPage::new_with_run_ids(chars, run_ids, false);
3555        let rects = page.rect_array(0, 50);
3556        assert_eq!(
3557            rects.len(),
3558            1,
3559            "50 chars with same run ID should merge into 1 rect, got {}",
3560            rects.len()
3561        );
3562        assert_eq!(rects[0].left, 0.0);
3563        assert!(rects[0].right > rects[0].left);
3564    }
3565
3566    // --- Duplicate detection tests (audit gap: no duplicate text object tests) ---
3567
3568    #[test]
3569    fn test_duplicate_same_position_same_run_deduplicated() {
3570        // Two characters with identical unicode, font_name, and position should
3571        // be deduplicated by the TextExtractor ring buffer mechanism.
3572        // Tolerance = font_size * 0.01 = 12 * 0.01 = 0.12 → both at (72, 700) → same.
3573        let ch1 = make_char('A', 72.0, 700.0);
3574        let ch2 = make_char('A', 72.0, 700.0);
3575        let mut extractor = TextExtractor::new();
3576        extractor.try_add_character(ch1, Some(1));
3577        extractor.try_add_character(ch2, Some(1));
3578        let (chars, run_ids) = extractor.into_characters();
3579        let page = TextPage::new_with_run_ids(chars, run_ids, false);
3580        // The second 'A' is a duplicate — only 1 should remain.
3581        assert!(
3582            page.char_count() < 2,
3583            "expected duplicate 'A' at same position to be removed, got {} chars",
3584            page.char_count()
3585        );
3586    }
3587
3588    #[test]
3589    fn test_duplicate_overlapping_position_no_dedup_different_content() {
3590        // Two characters at the same position but with different unicode codepoints
3591        // must NOT be deduplicated — they represent distinct rendered glyphs.
3592        let ch1 = make_char('A', 72.0, 700.0);
3593        let ch2 = make_char('B', 72.0, 700.0);
3594        let mut extractor = TextExtractor::new();
3595        extractor.try_add_character(ch1, Some(1));
3596        extractor.try_add_character(ch2, Some(2));
3597        let (chars, run_ids) = extractor.into_characters();
3598        let page = TextPage::new_with_run_ids(chars, run_ids, false);
3599        // 'A' and 'B' have different unicode — dedup must not fire.
3600        assert_eq!(
3601            page.char_count(),
3602            2,
3603            "expected both 'A' and 'B' to be preserved (different unicode)"
3604        );
3605    }
3606
3607    // --- Item 2: CharIndex / TextIndex round-trip invariant tests ---
3608    //
3609    // rpdfium uses an identity mapping between text indices and char indices.
3610    // This is architecturally correct because rpdfium never stores "silent"
3611    // characters in `self.characters` that are filtered out of the text string.
3612    // Upstream PDFium's `char_indices_` segment array is needed because
3613    // PDFium stores kHyphen (U+FFFE) characters in char_list_ but excludes
3614    // them from text_buf_.  rpdfium instead simply does not add those
3615    // characters to `self.characters` at all, so the mapping is always 1:1.
3616    //
3617    // The tests below verify:
3618    //   1. index 0 maps to 0 in both directions
3619    //   2. full round-trip: text_index_from_char_index(char_index_from_text_index(i)) == i
3620    //   3. out-of-bounds returns None in both directions
3621    //   4. the mapping is consistent with char_count() / text length
3622
3623    #[test]
3624    fn test_char_index_from_text_index_first_is_zero() {
3625        let chars = vec![make_char('H', 0.0, 0.0), make_char('i', 10.0, 0.0)];
3626        let page = TextPage::new(chars);
3627        assert_eq!(
3628            page.char_index_from_text_index(0),
3629            Some(0),
3630            "first text index must map to first char index"
3631        );
3632    }
3633
3634    #[test]
3635    fn test_char_text_index_round_trip_all_positions() {
3636        let chars = vec![
3637            make_char('H', 0.0, 0.0),
3638            make_char('e', 10.0, 0.0),
3639            make_char('l', 20.0, 0.0),
3640            make_char('l', 30.0, 0.0),
3641            make_char('o', 40.0, 0.0),
3642        ];
3643        let page = TextPage::new(chars);
3644        for i in 0..5 {
3645            let char_idx = page.char_index_from_text_index(i);
3646            assert!(char_idx.is_some(), "text index {} must be in bounds", i);
3647            let round_trip = page.text_index_from_char_index(char_idx.unwrap());
3648            assert_eq!(
3649                round_trip,
3650                Some(i),
3651                "round-trip text->char->text failed at index {}",
3652                i
3653            );
3654        }
3655    }
3656
3657    #[test]
3658    fn test_char_index_from_text_index_out_of_bounds_returns_none() {
3659        let chars = vec![make_char('X', 0.0, 0.0)];
3660        let page = TextPage::new(chars);
3661        // Index equal to char_count() is out of bounds.
3662        assert_eq!(page.char_index_from_text_index(1), None);
3663        // Large index is also out of bounds.
3664        assert_eq!(page.char_index_from_text_index(999), None);
3665    }
3666
3667    #[test]
3668    fn test_text_index_from_char_index_out_of_bounds_returns_none() {
3669        let chars = vec![make_char('Y', 0.0, 0.0)];
3670        let page = TextPage::new(chars);
3671        assert_eq!(page.text_index_from_char_index(1), None);
3672        assert_eq!(page.text_index_from_char_index(usize::MAX), None);
3673    }
3674
3675    #[test]
3676    fn test_char_text_index_mapping_consistent_with_char_count() {
3677        let chars = vec![
3678            make_char('A', 0.0, 0.0),
3679            make_char('B', 10.0, 0.0),
3680            make_char('C', 20.0, 0.0),
3681        ];
3682        let page = TextPage::new(chars);
3683        let n = page.char_count();
3684        // All indices in [0, n) must be valid in both directions.
3685        for i in 0..n {
3686            assert!(
3687                page.char_index_from_text_index(i).is_some(),
3688                "text index {} should be valid (char_count={})",
3689                i,
3690                n
3691            );
3692            assert!(
3693                page.text_index_from_char_index(i).is_some(),
3694                "char index {} should be valid (char_count={})",
3695                i,
3696                n
3697            );
3698        }
3699        // Index n itself must be out of bounds.
3700        assert!(page.char_index_from_text_index(n).is_none());
3701        assert!(page.text_index_from_char_index(n).is_none());
3702    }
3703
3704    #[test]
3705    fn test_char_text_index_empty_page_both_out_of_bounds() {
3706        let page = TextPage::new(vec![]);
3707        assert_eq!(page.char_index_from_text_index(0), None);
3708        assert_eq!(page.text_index_from_char_index(0), None);
3709    }
3710
3711    // --- char_render_mode tests ---
3712
3713    #[test]
3714    fn test_char_render_mode_default_is_fill() {
3715        let chars = vec![make_char('A', 0.0, 0.0)];
3716        let page = TextPage::new(chars);
3717        assert_eq!(page.char_render_mode(0), Some(TextRenderingMode::Fill));
3718    }
3719
3720    #[test]
3721    fn test_char_render_mode_stroke() {
3722        let mut ch = make_char('A', 0.0, 0.0);
3723        ch.rendering_mode = TextRenderingMode::Stroke;
3724        let page = TextPage::new(vec![ch]);
3725        assert_eq!(page.char_render_mode(0), Some(TextRenderingMode::Stroke));
3726    }
3727
3728    #[test]
3729    fn test_char_render_mode_fill_stroke_clip() {
3730        let mut ch = make_char('A', 0.0, 0.0);
3731        ch.rendering_mode = TextRenderingMode::FillStrokeClip;
3732        let page = TextPage::new(vec![ch]);
3733        assert_eq!(
3734            page.char_render_mode(0),
3735            Some(TextRenderingMode::FillStrokeClip)
3736        );
3737    }
3738
3739    #[test]
3740    fn test_char_render_mode_out_of_bounds() {
3741        let page = TextPage::new(vec![]);
3742        assert_eq!(page.char_render_mode(0), None);
3743    }
3744
3745    #[test]
3746    fn test_char_render_mode_returns_mode() {
3747        let mut ch = make_char('B', 0.0, 0.0);
3748        ch.rendering_mode = TextRenderingMode::Invisible;
3749        let page = TextPage::new(vec![ch]);
3750        assert_eq!(page.char_render_mode(0), Some(TextRenderingMode::Invisible));
3751    }
3752
3753    // --- char_index_at_pos f64 tests ---
3754
3755    #[test]
3756    fn test_char_index_at_pos_f64_basic() {
3757        let chars = vec![make_char('A', 10.0, 20.0), make_char('B', 30.0, 20.0)];
3758        let page = TextPage::new(chars);
3759        // Center of 'A' is at (15, 26), hit with zero tolerance
3760        assert_eq!(page.char_index_at_pos(15.0, 26.0, 0.0, 0.0), Some(0));
3761    }
3762
3763    #[test]
3764    fn test_char_index_at_pos_f64_with_tolerance() {
3765        // make_char('A', 10.0, 20.0) → char_box = [10..20, 20..32].
3766        // New upstream algorithm: expanded box = [left-tol/2 .. right+tol/2, bottom-tol/2 .. top+tol/2].
3767        // With x_tol=6, y_tol=2: expanded = [10-3=7 .. 20+3=23, 20-1=19 .. 32+1=33].
3768        // Point (8.0, 26.0) is inside the expanded box (x: 8≥7, y: 26 in [19,33]).
3769        let chars = vec![make_char('A', 10.0, 20.0)];
3770        let page = TextPage::new(chars);
3771        // Point just outside the tight box but within tol/2 — should hit via tolerance pass.
3772        assert_eq!(page.char_index_at_pos(8.0, 26.0, 6.0, 2.0), Some(0));
3773        // Point too far outside: x=4 < ext_left=7, should return None.
3774        assert_eq!(page.char_index_at_pos(4.0, 26.0, 6.0, 2.0), None);
3775    }
3776
3777    #[test]
3778    fn test_char_index_at_pos_f64_no_match() {
3779        let chars = vec![make_char('A', 10.0, 20.0)];
3780        let page = TextPage::new(chars);
3781        assert_eq!(page.char_index_at_pos(100.0, 100.0, 1.0, 1.0), None);
3782    }
3783
3784    // --- text_in_rect f64 tests ---
3785
3786    #[test]
3787    fn test_text_in_rect_basic() {
3788        let chars = vec![
3789            make_char('A', 10.0, 20.0),
3790            make_char('B', 30.0, 20.0),
3791            make_char('C', 50.0, 50.0),
3792        ];
3793        let page = TextPage::new(chars);
3794        // Rect encompassing A and B (left=0, top=40, right=40, bottom=10)
3795        assert_eq!(page.text_in_rect(0.0, 40.0, 40.0, 10.0), "AB");
3796    }
3797
3798    #[test]
3799    fn test_text_in_rect_empty() {
3800        let chars = vec![make_char('A', 10.0, 20.0)];
3801        let page = TextPage::new(chars);
3802        assert_eq!(page.text_in_rect(100.0, 200.0, 200.0, 100.0), "");
3803    }
3804
3805    #[allow(deprecated)]
3806    #[test]
3807    fn test_text_in_rect_alias() {
3808        let chars = vec![make_char('X', 5.0, 5.0)];
3809        let page = TextPage::new(chars);
3810        assert_eq!(page.get_text_in_rect(0.0, 20.0, 20.0, 0.0), "X");
3811    }
3812
3813    // -----------------------------------------------------------------------
3814    // Batch 17: FPDFText_GetCharObject / FPDFText_HasTextObjectForChar stubs
3815    // -----------------------------------------------------------------------
3816
3817    #[test]
3818    fn test_text_page_char_object_index_returns_none() {
3819        let chars = vec![make_char('A', 0.0, 0.0)];
3820        let page = TextPage::new(chars);
3821        // char_object_index is a None stub — cross-layer access not wired yet
3822        assert!(page.char_object_index(0).is_none());
3823        assert!(page.text_get_char_object(0).is_none());
3824    }
3825
3826    #[test]
3827    fn test_text_page_has_text_object_for_char_false() {
3828        let chars = vec![make_char('B', 10.0, 10.0)];
3829        let page = TextPage::new(chars);
3830        // has_text_object_for_char is false since char_object_index returns None
3831        assert!(!page.has_text_object_for_char(0));
3832        assert!(!page.has_text_object_for_char(99));
3833    }
3834}
rpdfium_text/textpage/mod.rs

rpdfium_text/textpage/
mod.rs