1pub mod fx_bidi;
10
11use std::collections::HashMap;
12
13#[cfg(feature = "icu")]
14use unicode_normalization::UnicodeNormalization;
15
16use rpdfium_core::{Matrix, Name};
17use rpdfium_graphics::{
18 BlendMode, ClipPath, Color, ColorSpaceFamily, ImageRef, PathOp, PathStyle, TextRenderingMode,
19};
20use rpdfium_page::display::{DisplayVisitor, SoftMask, TextRun};
21use rpdfium_page::shading::ShadingDict;
22use rpdfium_parser::Operand;
23
/// Classification attached to every extracted character.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum CharType {
    /// Ordinary character read from the content stream (the default).
    #[default]
    Normal,
    /// Character synthesized by the extractor, e.g. the space inserted when
    /// a large enough horizontal gap is found between two glyphs.
    Generated,
    /// Hyphen-like character: U+00AD, U+002D, U+2010 or U+2011.
    Hyphen,
    /// Second or later character produced when a single character code
    /// expands to several Unicode characters (e.g. a decomposed ligature).
    Piece,
    /// Character flagged as having a Unicode mapping error.
    NotUnicode,
}
38
/// One extracted character together with its geometry and styling.
#[derive(Debug, Clone)]
pub struct TextCharacter {
    /// Unicode scalar value of the character.
    unicode: char,
    /// Character code from the content stream (`0` for characters taken from
    /// an ActualText replacement, `0x20` for generated spaces).
    char_code: u32,
    /// Tight box: origin at the glyph position, `width` wide and one scaled
    /// font size tall.
    char_box: CharRect,
    /// Font size of the run the character came from.
    font_size: f32,
    /// Name of the font of the run the character came from.
    font_name: String,
    /// Scaled width of character code 32 in the run's font, if the font was
    /// resolved.
    space_width: Option<f32>,
    /// `true` when the character is U+00AD (soft hyphen).
    is_soft_hyphen: bool,
    /// Classification of the character.
    char_type: CharType,
    /// `[a, b, c, d, x, y]`: the run matrix's linear part plus the
    /// character's page position.
    matrix: [f32; 6],
    /// Box derived from font ascent/descent (or vertical metrics); `None`
    /// for ActualText characters and generated spaces.
    loose_char_box: Option<CharRect>,
    /// Fill color of the run, if any.
    fill_color: Option<Color>,
    /// Stroke color of the run, if any.
    stroke_color: Option<Color>,
    /// Weight reported by the resolved font, if any.
    font_weight: Option<i32>,
    /// Flags reported by the resolved font, if any.
    font_flags: Option<u32>,
    /// Text rendering mode of the run.
    rendering_mode: TextRenderingMode,
}
81
/// Maximum number of most recently seen characters kept for duplicate
/// detection in [`TextExtractor`].
const RECENT_RING_CAPACITY: usize = 7;
84
/// Display-tree visitor that collects [`TextCharacter`]s from text runs.
pub struct TextExtractor {
    /// Characters accepted so far, in extraction order.
    characters: Vec<TextCharacter>,
    /// Run id of each accepted character (parallel to `characters`);
    /// `None` for generated spaces.
    run_ids: Vec<Option<u32>>,
    /// Sliding window of the last few characters seen, used to drop
    /// duplicates.
    recent_chars: Vec<TextCharacter>,
    /// Id of the last ActualText span whose replacement text was emitted.
    last_actual_text_id: Option<u64>,
    /// Id that will be assigned to the next processed run.
    next_run_id: u32,
    /// Base-direction flag supplied at construction time.
    rtl: bool,
}
100
/// `Default` is equivalent to [`TextExtractor::new`].
impl Default for TextExtractor {
    /// Returns an empty extractor, exactly as [`TextExtractor::new`] does.
    fn default() -> Self {
        Self::new()
    }
}
106
107impl TextExtractor {
108 pub fn new() -> Self {
110 Self {
111 characters: Vec::new(),
112 run_ids: Vec::new(),
113 recent_chars: Vec::with_capacity(RECENT_RING_CAPACITY),
114 last_actual_text_id: None,
115 next_run_id: 0,
116 rtl: false,
117 }
118 }
119
    /// Creates an empty extractor with an explicit right-to-left flag.
    ///
    /// The flag is only stored here; it can be read back with `is_rtl`.
    pub fn with_rtl(rtl: bool) -> Self {
        Self {
            characters: Vec::new(),
            run_ids: Vec::new(),
            // Pre-size the duplicate-detection window to its maximum length.
            recent_chars: Vec::with_capacity(RECENT_RING_CAPACITY),
            last_actual_text_id: None,
            next_run_id: 0,
            rtl,
        }
    }
134
    /// Returns the right-to-left flag this extractor was created with.
    pub fn is_rtl(&self) -> bool {
        self.rtl
    }
139
140 fn is_duplicate(&self, ch: &TextCharacter) -> bool {
145 let tolerance = ch.font_size * 0.01;
146 self.recent_chars.iter().any(|recent| {
147 recent.unicode == ch.unicode
148 && recent.font_name == ch.font_name
149 && (recent.char_box.left - ch.char_box.left).abs() < tolerance
150 && (recent.char_box.bottom - ch.char_box.bottom).abs() < tolerance
151 })
152 }
153
154 fn push_recent(&mut self, ch: &TextCharacter) {
156 if self.recent_chars.len() >= RECENT_RING_CAPACITY {
157 self.recent_chars.remove(0);
158 }
159 self.recent_chars.push(ch.clone());
160 }
161
162 pub fn into_characters(self) -> (Vec<TextCharacter>, Vec<Option<u32>>) {
164 (self.characters, self.run_ids)
165 }
166
167 fn try_add_character(&mut self, ch: TextCharacter, run_id: Option<u32>) {
170 let is_dup = self.is_duplicate(&ch);
171 self.push_recent(&ch);
172 if !is_dup {
173 self.characters.push(ch);
174 self.run_ids.push(run_id);
175 }
176 }
177
    /// Converts one text run into [`TextCharacter`]s and feeds them through
    /// `try_add_character`.
    ///
    /// * Runs with the invisible rendering mode are ignored.
    /// * When the run carries ActualText, the replacement text is emitted
    ///   once, spread evenly over the run's total advance; later runs with
    ///   the same ActualText id are skipped.
    /// * Otherwise every character code is mapped to Unicode, ligatures are
    ///   decomposed, and a generated space is inserted when the horizontal
    ///   gap to the previous glyph exceeds the space threshold.
    fn extract_run(&mut self, run: &TextRun) {
        // Invisible text produces no characters at all.
        if run.rendering_mode == TextRenderingMode::Invisible {
            return;
        }

        if let Some(id) = run.actual_text_id {
            // Replacement text for this span was already emitted by an
            // earlier run: skip the remaining runs of the span.
            if self.last_actual_text_id == Some(id) {
                return;
            }
            if run.actual_text.is_some() {
                self.last_actual_text_id = Some(id);
            }
        }

        // Every processed run gets its own id, even if it yields no characters.
        let run_id = self.next_run_id;
        self.next_run_id += 1;

        let font_name_str = run.font_name.as_str().to_string();
        let font_size = run.font_size;
        let matrix = &run.matrix;

        // Length of the matrix's (b, d) column times the font size: box height.
        let height = (matrix.b * matrix.b + matrix.d * matrix.d).sqrt() as f32 * font_size;

        // Length of the matrix's (a, c) column: scale applied to advances.
        let width_scale = (matrix.a * matrix.a + matrix.c * matrix.c).sqrt() as f32;

        let mat_a = matrix.a as f32;
        let mat_b = matrix.b as f32;
        let mat_c = matrix.c as f32;
        let mat_d = matrix.d as f32;

        // Fall back to 750 / -250 when the font could not be resolved.
        let (font_ascent, font_descent) = run
            .resolved_font
            .as_ref()
            .map(|rf| (rf.ascent as f32, rf.descent as f32))
            .unwrap_or((750.0, -250.0));

        // Vertical metrics are only used for vertical runs set in a CID font.
        let is_vertical_cid = run.is_vertical
            && run
                .resolved_font
                .as_ref()
                .map(|rf| rf.is_cid_font())
                .unwrap_or(false);

        // Width of character code 32 scaled by font size and matrix scale.
        let space_width = run.resolved_font.as_ref().map(|rf| {
            let w = rf.char_width(32) as f32;
            w * font_size / 1000.0 * width_scale
        });

        if let Some(ref actual) = run.actual_text {
            // ActualText: distribute the replacement characters evenly over
            // the run's total advance.
            let total_advance: f32 = run.positions.iter().sum();
            let total_width = width_scale * total_advance;
            let decomposed_actual = decompose_ligatures(actual);
            let actual_chars: Vec<char> = decomposed_actual.chars().collect();
            // `max(1)` avoids a division by zero for empty replacement text.
            let char_count = actual_chars.len().max(1);
            let per_char_width = total_width / char_count as f32;

            for (i, ch) in actual_chars.iter().enumerate() {
                let tx = total_advance * i as f32 / char_count as f32;
                let ty = run.rise;
                // Transform the text-space offset (tx, ty) through the run matrix.
                let page_x = (matrix.a * tx as f64 + matrix.c * ty as f64 + matrix.e) as f32;
                let page_y = (matrix.b * tx as f64 + matrix.d * ty as f64 + matrix.f) as f32;

                let is_hyphen = *ch == '\u{00AD}'
                    || *ch == '\u{002D}'
                    || *ch == '\u{2010}'
                    || *ch == '\u{2011}';
                self.try_add_character(
                    TextCharacter {
                        unicode: *ch,
                        char_code: 0,
                        char_box: CharRect {
                            left: page_x,
                            bottom: page_y,
                            right: page_x + per_char_width,
                            top: page_y + height,
                        },
                        font_size,
                        font_name: font_name_str.clone(),
                        space_width,
                        is_soft_hyphen: *ch == '\u{00AD}',
                        char_type: if is_hyphen {
                            CharType::Hyphen
                        } else {
                            CharType::Normal
                        },
                        matrix: [mat_a, mat_b, mat_c, mat_d, page_x, page_y],
                        loose_char_box: None,
                        fill_color: run.fill_color.clone(),
                        stroke_color: run.stroke_color.clone(),
                        font_weight: run.resolved_font.as_ref().and_then(|rf| rf.weight),
                        font_flags: run.resolved_font.as_ref().and_then(|rf| rf.flags),
                        rendering_mode: run.rendering_mode,
                    },
                    Some(run_id),
                );
            }
            return;
        }

        // Split the raw bytes into (character code, byte length) pairs.
        let char_codes: Vec<(u32, usize)> = if let Some(ref resolved) = run.resolved_font {
            if resolved.is_cid_font() {
                if let Some(ref cmap) = resolved.cid_cmap {
                    cmap.extract_char_codes(&run.text)
                } else {
                    // CID font without a CMap: read big-endian two-byte codes.
                    extract_two_byte_codes(&run.text)
                }
            } else {
                // Simple font: one byte per code.
                run.text.iter().map(|&b| (b as u32, 1)).collect()
            }
        } else {
            // Unresolved font: treat every byte as its own code.
            run.text.iter().map(|&b| (b as u32, 1)).collect()
        };

        // NOTE(review): `space_width` is already scaled by font size / 1000,
        // while `space_threshold` names its first parameter
        // `char_width_thou` — confirm the intended unit.
        let gap_threshold =
            space_width.map(|sw| space_threshold(sw as f64, font_size as f64) as f32);

        // Page-space x where the previously emitted glyph ended.
        let mut prev_page_end_x: Option<f32> = None;

        // Sum of the advances of all codes processed so far.
        let mut accumulated_advance: f32 = 0.0;
        for (pos_idx, (code, _byte_len)) in char_codes.iter().enumerate() {
            let code = *code;

            let unicode_str = if let Some(ref resolved) = run.resolved_font {
                resolved.unicode_from_char_code(code)
            } else {
                // Without a font only codes in 0x20..0x7F are mapped.
                if (0x20..0x7F).contains(&code) {
                    Some((code as u8 as char).to_string())
                } else {
                    None
                }
            };

            // Codes without a recorded advance contribute zero width.
            let advance = if pos_idx < run.positions.len() {
                run.positions[pos_idx]
            } else {
                0.0
            };

            // Codes without a Unicode mapping emit nothing but still advance.
            if let Some(ustr) = unicode_str {
                let normalized = normalize_text(&ustr);
                let decomposed = decompose_ligatures(&normalized);
                let chars: Vec<char> = decomposed.chars().collect();
                let char_count = chars.len().max(1);

                let tx = accumulated_advance;
                let ty = run.rise;
                let page_x = (matrix.a * tx as f64 + matrix.c * ty as f64 + matrix.e) as f32;
                let page_y = (matrix.b * tx as f64 + matrix.d * ty as f64 + matrix.f) as f32;

                // Insert a generated space when the horizontal gap to the
                // previous glyph exceeds the threshold.
                if let (Some(prev_end), Some(threshold)) = (prev_page_end_x, gap_threshold) {
                    let gap = page_x - prev_end;
                    if gap > 0.0 && gap > threshold {
                        self.try_add_character(
                            TextCharacter {
                                unicode: ' ',
                                char_code: 0x20,
                                char_box: CharRect {
                                    left: prev_end,
                                    bottom: page_y,
                                    right: prev_end + gap,
                                    top: page_y + height,
                                },
                                font_size,
                                font_name: font_name_str.clone(),
                                space_width,
                                is_soft_hyphen: false,
                                char_type: CharType::Generated,
                                matrix: [mat_a, mat_b, mat_c, mat_d, prev_end, page_y],
                                loose_char_box: None,
                                fill_color: run.fill_color.clone(),
                                stroke_color: run.stroke_color.clone(),
                                font_weight: run.resolved_font.as_ref().and_then(|rf| rf.weight),
                                font_flags: run.resolved_font.as_ref().and_then(|rf| rf.flags),
                                rendering_mode: run.rendering_mode,
                            },
                            // Generated spaces are not attributed to any run.
                            None,
                        );
                    }
                }

                // The glyph's advance is shared equally among the characters
                // it expands to.
                let total_width = width_scale * advance;
                let per_char_width = total_width / char_count as f32;

                for (ci, ch) in chars.iter().enumerate() {
                    let char_offset = per_char_width * ci as f32;
                    let char_x = page_x + char_offset;

                    let is_hyphen = *ch == '\u{00AD}'
                        || *ch == '\u{002D}'
                        || *ch == '\u{2010}'
                        || *ch == '\u{2011}';
                    // Every character after the first of an expansion is a `Piece`.
                    let ctype = if ci > 0 {
                        CharType::Piece
                    } else if is_hyphen {
                        CharType::Hyphen
                    } else {
                        CharType::Normal
                    };
                    let loose = if is_vertical_cid {
                        // Font-level vertical metrics, or -1000 / 880 when
                        // the font has none.
                        let (w1y, vy) = run
                            .resolved_font
                            .as_ref()
                            .and_then(|rf| rf.vertical_metrics.as_ref())
                            .map(|vm| {
                                let (w1y, _vx, vy) = vm.lookup(code as u16);
                                (w1y as f32, vy as f32)
                            })
                            .unwrap_or((-1000.0, 880.0));
                        // A per-glyph origin recorded on the run overrides
                        // the font-level value.
                        let vy = run
                            .vert_origins
                            .get(pos_idx)
                            .map(|&(_vx, run_vy)| run_vy as f32)
                            .unwrap_or(vy);
                        compute_loose_char_box_vertical(
                            char_x,
                            page_y,
                            per_char_width,
                            font_size,
                            vy,
                            w1y,
                            mat_a,
                            mat_b,
                            mat_c,
                            mat_d,
                        )
                    } else {
                        compute_loose_char_box(
                            char_x,
                            page_y,
                            per_char_width,
                            font_size,
                            font_ascent,
                            font_descent,
                            mat_a,
                            mat_b,
                            mat_c,
                            mat_d,
                        )
                    };
                    self.try_add_character(
                        TextCharacter {
                            unicode: *ch,
                            char_code: code,
                            char_box: CharRect {
                                left: char_x,
                                bottom: page_y,
                                right: char_x + per_char_width,
                                top: page_y + height,
                            },
                            font_size,
                            font_name: font_name_str.clone(),
                            space_width,
                            is_soft_hyphen: *ch == '\u{00AD}',
                            char_type: ctype,
                            matrix: [mat_a, mat_b, mat_c, mat_d, char_x, page_y],
                            loose_char_box: Some(loose),
                            fill_color: run.fill_color.clone(),
                            stroke_color: run.stroke_color.clone(),
                            font_weight: run.resolved_font.as_ref().and_then(|rf| rf.weight),
                            font_flags: run.resolved_font.as_ref().and_then(|rf| rf.flags),
                            rendering_mode: run.rendering_mode,
                        },
                        Some(run_id),
                    );
                }

                prev_page_end_x = Some(page_x + total_width);
            }

            accumulated_advance += advance;
        }
    }
491}
492
493#[allow(clippy::too_many_arguments)]
500fn compute_loose_char_box(
501 x: f32,
502 y: f32,
503 width: f32,
504 font_size: f32,
505 font_ascent: f32,
506 font_descent: f32,
507 mat_a: f32,
508 mat_b: f32,
509 mat_c: f32,
510 mat_d: f32,
511) -> CharRect {
512 let ascent_page = font_ascent * font_size / 1000.0;
513 let descent_page = font_descent * font_size / 1000.0;
514
515 let scale = (mat_a * mat_a + mat_c * mat_c).sqrt();
520 let width_scaled = if scale > 0.0 { width / scale } else { width };
521
522 let corners = [
524 (0.0f32, descent_page),
525 (width_scaled, descent_page),
526 (width_scaled, ascent_page),
527 (0.0, ascent_page),
528 ];
529
530 let mut min_x = f32::INFINITY;
531 let mut min_y = f32::INFINITY;
532 let mut max_x = f32::NEG_INFINITY;
533 let mut max_y = f32::NEG_INFINITY;
534
535 for &(cx, cy) in &corners {
536 let px = x + mat_a * cx + mat_c * cy;
537 let py = y + mat_b * cx + mat_d * cy;
538 min_x = min_x.min(px);
539 min_y = min_y.min(py);
540 max_x = max_x.max(px);
541 max_y = max_y.max(py);
542 }
543
544 CharRect {
545 left: min_x,
546 bottom: min_y,
547 right: max_x,
548 top: max_y,
549 }
550}
551
552#[allow(clippy::too_many_arguments)]
571fn compute_loose_char_box_vertical(
572 x: f32,
573 y: f32,
574 width: f32,
575 font_size: f32,
576 vert_origin_y: f32,
577 vert_w1y: f32,
578 mat_a: f32,
579 mat_b: f32,
580 mat_c: f32,
581 mat_d: f32,
582) -> CharRect {
583 let top_page = vert_origin_y * font_size / 1000.0;
589 let bottom_page = (vert_origin_y + vert_w1y) * font_size / 1000.0;
590
591 let scale = (mat_a * mat_a + mat_c * mat_c).sqrt();
594 let width_scaled = if scale > 0.0 { width / scale } else { width };
595
596 let corners = [
597 (0.0f32, bottom_page),
598 (width_scaled, bottom_page),
599 (width_scaled, top_page),
600 (0.0, top_page),
601 ];
602
603 let mut min_x = f32::INFINITY;
604 let mut min_y = f32::INFINITY;
605 let mut max_x = f32::NEG_INFINITY;
606 let mut max_y = f32::NEG_INFINITY;
607
608 for &(cx, cy) in &corners {
609 let px = x + mat_a * cx + mat_c * cy;
610 let py = y + mat_b * cx + mat_d * cy;
611 min_x = min_x.min(px);
612 min_y = min_y.min(py);
613 max_x = max_x.max(px);
614 max_y = max_y.max(py);
615 }
616
617 CharRect {
618 left: min_x,
619 bottom: min_y,
620 right: max_x,
621 top: max_y,
622 }
623}
624
/// Replaces the Latin presentation-form ligatures U+FB00..=U+FB06 with
/// their plain-letter spellings; all other characters are copied unchanged.
fn decompose_ligatures(text: &str) -> String {
    /// Spelled-out form of a ligature, or `None` for any other character.
    fn expansion(ch: char) -> Option<&'static str> {
        let letters = match ch {
            '\u{FB00}' => "ff",
            '\u{FB01}' => "fi",
            '\u{FB02}' => "fl",
            '\u{FB03}' => "ffi",
            '\u{FB04}' => "ffl",
            '\u{FB05}' | '\u{FB06}' => "st",
            _ => return None,
        };
        Some(letters)
    }

    text.chars()
        .fold(String::with_capacity(text.len()), |mut out, ch| {
            match expansion(ch) {
                Some(letters) => out.push_str(letters),
                None => out.push(ch),
            }
            out
        })
}
643
/// Returns `text` in Unicode Normalization Form C (built with the `icu`
/// feature).
#[cfg(feature = "icu")]
pub fn normalize_text(text: &str) -> String {
    text.nfc().collect::<String>()
}

/// Returns an unchanged copy of `text` (built without the `icu` feature,
/// so no normalization is applied).
#[cfg(not(feature = "icu"))]
pub fn normalize_text(text: &str) -> String {
    String::from(text)
}
655
/// Splits `data` into big-endian two-byte character codes.
///
/// Each full pair yields `(code, 2)`; a trailing odd byte yields
/// `(byte, 1)`.
fn extract_two_byte_codes(data: &[u8]) -> Vec<(u32, usize)> {
    let pairs = data.chunks_exact(2);
    let leftover = pairs.remainder();

    let mut codes: Vec<(u32, usize)> = pairs
        .map(|pair| (u32::from(u16::from_be_bytes([pair[0], pair[1]])), 2))
        .collect();
    codes.extend(leftover.iter().map(|&byte| (u32::from(byte), 1)));
    codes
}
671
672impl DisplayVisitor for TextExtractor {
673 fn enter_group(
674 &mut self,
675 _blend_mode: BlendMode,
676 _clip: Option<&ClipPath>,
677 _opacity: f32,
678 _isolated: bool,
679 _knockout: bool,
680 _soft_mask: &Option<Box<SoftMask>>,
681 ) -> bool {
682 true }
684
685 fn leave_group(&mut self) {
686 }
688
689 fn visit_path(
690 &mut self,
691 _ops: &[PathOp],
692 _style: &PathStyle,
693 _matrix: &Matrix,
694 _fill_color: Option<&Color>,
695 _stroke_color: Option<&Color>,
696 _fill_color_space: Option<&ColorSpaceFamily>,
697 _stroke_color_space: Option<&ColorSpaceFamily>,
698 _transfer_function: Option<&rpdfium_page::function::TransferFunction>,
699 _overprint: bool,
700 _overprint_mode: u32,
701 ) {
702 }
704
705 fn visit_image(
706 &mut self,
707 _image_ref: &ImageRef,
708 _matrix: &Matrix,
709 _mask: Option<&rpdfium_page::display::ImageMask>,
710 _fill_color: Option<&Color>,
711 _transfer_function: Option<&rpdfium_page::function::TransferFunction>,
712 ) {
713 }
715
716 fn visit_inline_image(
717 &mut self,
718 _properties: &HashMap<Name, Operand>,
719 _data: &[u8],
720 _matrix: &Matrix,
721 ) {
722 }
724
725 fn visit_shading_fill(&mut self, _shading: &ShadingDict, _matrix: &Matrix) {
726 }
728
729 fn visit_pattern_fill(
730 &mut self,
731 _path_ops: &[PathOp],
732 _fill_rule: rpdfium_graphics::FillRule,
733 _pattern: &rpdfium_page::pattern::TilingPattern,
734 _pattern_tree: &rpdfium_page::display::DisplayTree,
735 _fill_color: Option<&Color>,
736 _matrix: &rpdfium_core::Matrix,
737 ) {
738 }
740
741 fn visit_text(&mut self, runs: &[TextRun]) {
742 for run in runs {
743 self.extract_run(run);
744 }
745 }
746}
747
/// A run of characters grouped into one word.
#[derive(Debug, Clone)]
pub struct TextWord {
    /// Concatenated Unicode characters of the word.
    pub text: String,
    /// Left edge of the word's first character.
    pub x: f32,
    /// Bottom edge of the word's first character.
    pub y: f32,
    /// Distance from `x` to the right edge of the word's last character.
    pub width: f32,
    /// Height of the tallest character box in the word.
    pub height: f32,
}
762
/// A line of text made up of words.
#[derive(Debug, Clone)]
pub struct TextLine {
    /// Line text: the words joined with single spaces.
    pub text: String,
    /// Words of the line.
    pub words: Vec<TextWord>,
    /// Vertical position of the line (bottom edge of its leftmost character
    /// when built by `segment_lines`).
    pub y: f32,
    /// Height of the tallest character box on the line.
    pub height: f32,
}
776
/// Fraction of the average character width used as the word-gap threshold
/// when neither neighbouring character has a known space width.
const WORD_GAP_THRESHOLD: f32 = 0.3;
782
/// Scales `threshold` down by a divisor chosen from the band it falls in:
/// below `t1` → 2, below `t2` → 4, below `t3` → 5, otherwise 6.
///
/// The limits must be strictly increasing (checked in debug builds).
fn normalize_threshold(threshold: f32, t1: i32, t2: i32, t3: i32) -> f32 {
    debug_assert!(t1 < t2 && t2 < t3);
    let divisor = match threshold {
        w if w < t1 as f32 => 2.0,
        w if w < t2 as f32 => 4.0,
        w if w < t3 as f32 => 5.0,
        _ => 6.0,
    };
    threshold / divisor
}
799
800pub fn space_threshold(char_width_thou: f64, font_size_h: f64) -> f64 {
808 let normalized = normalize_threshold(char_width_thou as f32, 300, 500, 700);
809 font_size_h * normalized as f64 / 1000.0
810}
811
/// Fraction of the font size by which two baselines may differ and still be
/// treated as belonging to the same line.
const LINE_Y_TOLERANCE: f32 = 0.5;
817
818pub fn segment_words(chars: &[TextCharacter]) -> Vec<TextWord> {
825 if chars.is_empty() {
826 return Vec::new();
827 }
828
829 let total: f32 = chars
830 .iter()
831 .map(|c| c.char_box.right - c.char_box.left)
832 .sum();
833 let avg_width = total / chars.len() as f32;
834
835 let fallback_threshold = avg_width * WORD_GAP_THRESHOLD;
836
837 let mut words = Vec::new();
838 let mut current_chars: Vec<&TextCharacter> = vec![&chars[0]];
839
840 for i in 1..chars.len() {
841 let prev = &chars[i - 1];
842 let curr = &chars[i];
843
844 let gap = curr.char_box.left - prev.char_box.right;
847 let y_diff = (curr.char_box.bottom - prev.char_box.bottom).abs();
848 let y_threshold = prev.font_size * LINE_Y_TOLERANCE;
849
850 if prev.is_soft_hyphen {
852 current_chars.push(curr);
853 continue;
854 }
855
856 let cjk_break = is_cjk_ideograph(curr.unicode) || is_cjk_ideograph(prev.unicode);
858
859 let gap_threshold = curr
863 .space_width
864 .or(prev.space_width)
865 .map(|sw| space_threshold(sw as f64, curr.font_size as f64) as f32)
866 .unwrap_or(fallback_threshold);
867
868 if cjk_break || (gap > 0.0 && gap > gap_threshold) || y_diff > y_threshold {
869 words.push(build_word(¤t_chars));
871 current_chars.clear();
872 }
873
874 current_chars.push(curr);
875 }
876
877 if !current_chars.is_empty() {
879 words.push(build_word(¤t_chars));
880 }
881
882 words
883}
884
885pub fn segment_lines(chars: &[TextCharacter]) -> Vec<TextLine> {
891 if chars.is_empty() {
892 return Vec::new();
893 }
894
895 let mut sorted: Vec<&TextCharacter> = chars.iter().collect();
897 sorted.sort_by(|a, b| {
898 b.char_box
899 .bottom
900 .partial_cmp(&a.char_box.bottom)
901 .unwrap_or(std::cmp::Ordering::Equal)
902 .then_with(|| {
903 a.char_box
904 .left
905 .partial_cmp(&b.char_box.left)
906 .unwrap_or(std::cmp::Ordering::Equal)
907 })
908 });
909
910 let mut lines: Vec<Vec<&TextCharacter>> = Vec::new();
912 let mut current_line: Vec<&TextCharacter> = vec![sorted[0]];
913 let mut line_y = sorted[0].char_box.bottom;
914
915 for &ch in &sorted[1..] {
916 let y_threshold = ch.font_size * LINE_Y_TOLERANCE;
917 if (ch.char_box.bottom - line_y).abs() <= y_threshold {
918 current_line.push(ch);
919 } else {
920 lines.push(current_line);
921 current_line = vec![ch];
922 line_y = ch.char_box.bottom;
923 }
924 }
925 lines.push(current_line);
926
927 lines
929 .into_iter()
930 .map(|line_chars| {
931 let mut sorted_line = line_chars;
933 sorted_line.sort_by(|a, b| {
934 a.char_box
935 .left
936 .partial_cmp(&b.char_box.left)
937 .unwrap_or(std::cmp::Ordering::Equal)
938 });
939
940 let owned: Vec<TextCharacter> = sorted_line.iter().map(|&c| c.clone()).collect();
942 let words = segment_words(&owned);
943
944 let y = sorted_line[0].char_box.bottom;
945 let height = sorted_line
946 .iter()
947 .map(|c| c.char_box.top - c.char_box.bottom)
948 .fold(0.0f32, f32::max);
949 let text = words
950 .iter()
951 .map(|w| w.text.as_str())
952 .collect::<Vec<_>>()
953 .join(" ");
954
955 TextLine {
956 text,
957 words,
958 y,
959 height,
960 }
961 })
962 .collect()
963}
964
965pub fn segment_paragraphs(lines: &[TextLine]) -> Vec<Vec<TextLine>> {
971 if lines.is_empty() {
972 return Vec::new();
973 }
974 if lines.len() == 1 {
975 return vec![lines.to_vec()];
976 }
977
978 let mut gaps: Vec<f32> = Vec::new();
980 for i in 0..lines.len() - 1 {
981 let gap = (lines[i].y - lines[i + 1].y).abs();
982 gaps.push(gap);
983 }
984 let avg_gap = if gaps.is_empty() {
985 0.0
986 } else {
987 gaps.iter().sum::<f32>() / gaps.len() as f32
988 };
989
990 let para_threshold = avg_gap * 1.5;
991
992 let mut paragraphs = Vec::new();
993 let mut current_para = vec![lines[0].clone()];
994
995 for i in 1..lines.len() {
996 let gap = (lines[i - 1].y - lines[i].y).abs();
997 if para_threshold > 0.0 && gap > para_threshold {
998 paragraphs.push(current_para);
999 current_para = vec![lines[i].clone()];
1000 } else {
1001 current_para.push(lines[i].clone());
1002 }
1003 }
1004 paragraphs.push(current_para);
1005
1006 paragraphs
1007}
1008
/// Returns `true` when `c` lies in one of the code point ranges
/// U+3400..=U+4DBF, U+4E00..=U+9FFF or U+F900..=U+FAFF.
fn is_cjk_ideograph(c: char) -> bool {
    const RANGES: [(u32, u32); 3] = [
        (0x3400, 0x4DBF),
        (0x4E00, 0x9FFF),
        (0xF900, 0xFAFF),
    ];
    let code_point = u32::from(c);
    RANGES
        .iter()
        .any(|&(lo, hi)| lo <= code_point && code_point <= hi)
}
1020
1021fn build_word(chars: &[&TextCharacter]) -> TextWord {
1022 let text: String = chars.iter().map(|c| c.unicode).collect();
1023 let x = chars[0].char_box.left;
1024 let y = chars[0].char_box.bottom;
1025 let last = chars.last().unwrap();
1026 let width = last.char_box.right - x;
1027 let height = chars
1028 .iter()
1029 .map(|c| c.char_box.top - c.char_box.bottom)
1030 .fold(0.0f32, f32::max);
1031
1032 TextWord {
1033 text,
1034 x,
1035 y,
1036 width,
1037 height,
1038 }
1039}
1040
1041pub use rpdfium_core::fx_bidi::mirror_char;
1042
/// Horizontal extent of a detected text column.
#[derive(Debug, Clone)]
pub struct ColumnBound {
    /// Left edge of the column.
    pub x_start: f32,
    /// Right edge of the column.
    pub x_end: f32,
}
1051
/// Dominant direction in which text flows on a page.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TextFlowOrientation {
    /// Text runs in horizontal lines.
    Horizontal,
    /// Text runs in vertical columns.
    Vertical,
    /// The orientation could not be determined.
    Unknown,
}
1062
/// Minimum width of a column candidate, as a fraction of the overall text
/// width, for `detect_columns` to accept a split.
const MIN_COLUMN_WIDTH_FRACTION: f32 = 0.10;
1066
/// Minimum number of lines each column candidate must contain for
/// `detect_columns` to accept a split.
const MIN_LINES_PER_COLUMN: usize = 2;
1070
1071pub fn sort_reading_order(lines: &mut [TextLine]) {
1081 if lines.len() <= 1 {
1082 return;
1083 }
1084
1085 let columns = detect_columns(lines);
1086 let orientation = detect_orientation_from_lines(lines);
1087
1088 if columns.len() <= 1 {
1089 lines.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal));
1091 } else if orientation == TextFlowOrientation::Vertical {
1092 lines.sort_by(|a, b| {
1094 let col_a = column_index_for_line(a, &columns);
1095 let col_b = column_index_for_line(b, &columns);
1096 col_b
1098 .cmp(&col_a)
1099 .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal))
1100 });
1101 } else {
1102 lines.sort_by(|a, b| {
1105 let col_a = column_index_for_line(a, &columns);
1106 let col_b = column_index_for_line(b, &columns);
1107 col_a
1108 .cmp(&col_b)
1109 .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(std::cmp::Ordering::Equal))
1110 });
1111 }
1112}
1113
1114pub fn detect_columns(lines: &[TextLine]) -> Vec<ColumnBound> {
1123 if lines.is_empty() {
1124 return Vec::new();
1125 }
1126
1127 let mut line_spans: Vec<(f32, f32)> = lines
1129 .iter()
1130 .filter_map(|line| {
1131 if line.words.is_empty() {
1132 None
1133 } else {
1134 let x_start = line.words[0].x;
1135 let last_word = line.words.last().unwrap();
1136 let x_end = last_word.x + last_word.width;
1137 Some((x_start, x_end))
1138 }
1139 })
1140 .collect();
1141
1142 if line_spans.is_empty() {
1143 return Vec::new();
1144 }
1145
1146 let overall_left = line_spans
1148 .iter()
1149 .map(|(s, _)| *s)
1150 .fold(f32::INFINITY, f32::min);
1151 let overall_right = line_spans
1152 .iter()
1153 .map(|(_, e)| *e)
1154 .fold(f32::NEG_INFINITY, f32::max);
1155
1156 let page_width = overall_right - overall_left;
1157 if page_width < 1.0 {
1158 return vec![ColumnBound {
1159 x_start: overall_left,
1160 x_end: overall_right,
1161 }];
1162 }
1163
1164 let min_column_width = page_width * MIN_COLUMN_WIDTH_FRACTION;
1165
1166 line_spans.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));
1168
1169 let mut merged: Vec<(f32, f32)> = Vec::new();
1172 for &(start, end) in &line_spans {
1173 if let Some(last) = merged.last_mut() {
1174 if start <= last.1 {
1176 last.1 = last.1.max(end);
1177 } else {
1178 merged.push((start, end));
1179 }
1180 } else {
1181 merged.push((start, end));
1182 }
1183 }
1184
1185 if merged.len() <= 1 {
1186 return vec![ColumnBound {
1187 x_start: overall_left,
1188 x_end: overall_right,
1189 }];
1190 }
1191
1192 let mut gaps: Vec<(f32, usize)> = Vec::new(); for i in 0..merged.len() - 1 {
1195 let gap = merged[i + 1].0 - merged[i].1;
1196 if gap > 0.0 {
1197 gaps.push((gap, i));
1198 }
1199 }
1200
1201 if gaps.is_empty() {
1202 return vec![ColumnBound {
1203 x_start: overall_left,
1204 x_end: overall_right,
1205 }];
1206 }
1207
1208 gaps.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1211
1212 for &(_gap_size, gap_idx) in &gaps {
1214 let left_col = ColumnBound {
1215 x_start: merged[0].0,
1216 x_end: merged[gap_idx].1,
1217 };
1218 let right_col = ColumnBound {
1219 x_start: merged[gap_idx + 1].0,
1220 x_end: merged.last().unwrap().1,
1221 };
1222
1223 let left_width = left_col.x_end - left_col.x_start;
1225 let right_width = right_col.x_end - right_col.x_start;
1226 if left_width < min_column_width || right_width < min_column_width {
1227 continue;
1228 }
1229
1230 let left_count = count_lines_in_column(lines, &left_col);
1232 let right_count = count_lines_in_column(lines, &right_col);
1233 if left_count < MIN_LINES_PER_COLUMN || right_count < MIN_LINES_PER_COLUMN {
1234 continue;
1235 }
1236
1237 let has_spanning_line = lines.iter().any(|line| {
1239 if line.words.is_empty() {
1240 return false;
1241 }
1242 let lx = line.words[0].x;
1243 let last_w = line.words.last().unwrap();
1244 let rx = last_w.x + last_w.width;
1245 lx < left_col.x_end && rx > right_col.x_start
1246 });
1247 if has_spanning_line {
1248 continue;
1249 }
1250
1251 return vec![left_col, right_col];
1252 }
1253
1254 vec![ColumnBound {
1256 x_start: overall_left,
1257 x_end: overall_right,
1258 }]
1259}
1260
1261fn count_lines_in_column(lines: &[TextLine], col: &ColumnBound) -> usize {
1263 lines
1264 .iter()
1265 .filter(|line| {
1266 if line.words.is_empty() {
1267 return false;
1268 }
1269 let first = &line.words[0];
1270 let last = line.words.last().unwrap();
1271 let center = (first.x + last.x + last.width) / 2.0;
1272 center >= col.x_start && center <= col.x_end
1273 })
1274 .count()
1275}
1276
1277pub fn detect_orientation(
1283 chars: &[TextCharacter],
1284 page_width: f32,
1285 page_height: f32,
1286) -> TextFlowOrientation {
1287 let pw = page_width as usize;
1288 let ph = page_height as usize;
1289 if pw == 0 || ph == 0 || chars.is_empty() {
1290 return TextFlowOrientation::Unknown;
1291 }
1292
1293 let pw = pw.min(8192);
1295 let ph = ph.min(8192);
1296
1297 let mut h_mask = vec![false; pw];
1298 let mut v_mask = vec![false; ph];
1299 let mut line_height: f32 = 0.0;
1300 let (mut start_h, mut end_h) = (pw, 0usize);
1301 let (mut start_v, mut end_v) = (ph, 0usize);
1302
1303 for ch in chars {
1304 if ch.char_type == CharType::Generated {
1305 continue;
1306 }
1307 let min_h = (ch.char_box.left.max(0.0) as usize).min(pw);
1308 let max_h = (ch.char_box.right.max(0.0) as usize).min(pw);
1309 let min_v = (ch.char_box.bottom.max(0.0) as usize).min(ph);
1310 let max_v = (ch.char_box.top.max(0.0) as usize).min(ph);
1311 if min_h >= max_h || min_v >= max_v {
1312 continue;
1313 }
1314
1315 for cell in &mut h_mask[min_h..max_h] {
1316 *cell = true;
1317 }
1318 for cell in &mut v_mask[min_v..max_v] {
1319 *cell = true;
1320 }
1321
1322 start_h = start_h.min(min_h);
1323 end_h = end_h.max(max_h);
1324 start_v = start_v.min(min_v);
1325 end_v = end_v.max(max_v);
1326
1327 if line_height <= 0.0 {
1328 line_height = ch.char_box.top - ch.char_box.bottom;
1329 }
1330 }
1331
1332 let double_lh = (2.0 * line_height) as usize;
1333 if end_v.saturating_sub(start_v) < double_lh {
1334 return TextFlowOrientation::Horizontal;
1335 }
1336 if end_h.saturating_sub(start_h) < double_lh {
1337 return TextFlowOrientation::Vertical;
1338 }
1339
1340 let sum_h = mask_percent_filled(&h_mask, start_h, end_h);
1341 if sum_h > 0.8 {
1342 return TextFlowOrientation::Horizontal;
1343 }
1344 let sum_v = mask_percent_filled(&v_mask, start_v, end_v);
1345 if sum_h > sum_v {
1346 TextFlowOrientation::Horizontal
1347 } else if sum_h < sum_v {
1348 TextFlowOrientation::Vertical
1349 } else {
1350 TextFlowOrientation::Unknown
1351 }
1352}
1353
/// Returns the fraction (0.0..=1.0) of `true` cells in `mask[start..end]`,
/// or `0.0` when the range is empty or reversed.
///
/// # Panics
///
/// Panics when `start < end` and `end` exceeds `mask.len()`.
fn mask_percent_filled(mask: &[bool], start: usize, end: usize) -> f32 {
    if end <= start {
        return 0.0;
    }
    let window = &mask[start..end];
    let filled = window.iter().filter(|&&set| set).count();
    filled as f32 / window.len() as f32
}
1362
1363fn detect_orientation_from_lines(lines: &[TextLine]) -> TextFlowOrientation {
1368 if lines.is_empty() {
1369 return TextFlowOrientation::Unknown;
1370 }
1371
1372 let mut vertical_count = 0usize;
1373 let mut total_count = 0usize;
1374
1375 for line in lines {
1376 if line.words.is_empty() {
1377 continue;
1378 }
1379 total_count += 1;
1380 let first = &line.words[0];
1381 let last = line.words.last().unwrap();
1382 let line_width = (last.x + last.width) - first.x;
1383 if line_width > 0.0 && line_width < line.height * 1.5 {
1384 vertical_count += 1;
1385 }
1386 }
1387
1388 if total_count > 0 && vertical_count * 2 > total_count {
1389 TextFlowOrientation::Vertical
1390 } else {
1391 TextFlowOrientation::Horizontal
1392 }
1393}
1394
1395pub use self::fx_bidi::reorder_bidi;
1396#[cfg(feature = "icu")]
1397use self::fx_bidi::reorder_bidi_with_direction;
1398
1399fn column_index_for_line(line: &TextLine, columns: &[ColumnBound]) -> usize {
1400 if line.words.is_empty() {
1401 return 0;
1402 }
1403 let line_center = {
1404 let first = &line.words[0];
1405 let last = line.words.last().unwrap();
1406 (first.x + last.x + last.width) / 2.0
1407 };
1408
1409 columns
1410 .iter()
1411 .enumerate()
1412 .min_by(|(_, a), (_, b)| {
1413 let mid_a = (a.x_start + a.x_end) / 2.0;
1414 let mid_b = (b.x_start + b.x_end) / 2.0;
1415 let dist_a = (line_center - mid_a).abs();
1416 let dist_b = (line_center - mid_b).abs();
1417 dist_a
1418 .partial_cmp(&dist_b)
1419 .unwrap_or(std::cmp::Ordering::Equal)
1420 })
1421 .map(|(idx, _)| idx)
1422 .unwrap_or(0)
1423}
1424
/// Axis-aligned rectangle described by its four edges.
#[derive(Debug, Clone, PartialEq)]
pub struct CharRect {
    /// Left edge (x).
    pub left: f32,
    /// Bottom edge (y).
    pub bottom: f32,
    /// Right edge (x).
    pub right: f32,
    /// Top edge (y).
    pub top: f32,
}
1433
/// A point given by its `x` and `y` coordinates.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct CharOrigin {
    /// Horizontal coordinate.
    pub x: f32,
    /// Vertical coordinate.
    pub y: f32,
}
1442
/// Extracted text of one page: the characters after bidi reordering plus
/// the page text built from them.
#[derive(Debug, Clone)]
pub struct TextPage {
    /// Characters in their final (reordered) sequence.
    characters: Vec<TextCharacter>,
    /// Page text: the characters' Unicode values concatenated and passed
    /// through `normalize_text`.
    text: String,
    /// Run id of each character (parallel to `characters`).
    run_ids: Vec<Option<u32>>,
}
1452
1453impl TextPage {
1454 pub fn new(characters: Vec<TextCharacter>) -> Self {
1459 let run_ids = vec![None; characters.len()];
1460 Self::new_with_run_ids(characters, run_ids, false)
1461 }
1462
    /// Builds a text page from `characters` without run ids, passing `rtl`
    /// on to `new_with_run_ids`.
    pub fn new_with_direction(characters: Vec<TextCharacter>, rtl: bool) -> Self {
        // No run information is available: every character gets `None`.
        let run_ids = vec![None; characters.len()];
        Self::new_with_run_ids(characters, run_ids, rtl)
    }
1471
    /// Builds a text page from characters and their parallel run ids.
    ///
    /// The characters (and run ids) are bidi-reordered first. With the
    /// `icu` feature and `rtl == true` the direction-aware reordering is
    /// used; in every other case the plain `reorder_bidi` is applied. The
    /// page text is then assembled from the reordered characters and
    /// normalized.
    ///
    /// `characters` and `run_ids` must have the same length (checked in
    /// debug builds).
    pub fn new_with_run_ids(
        characters: Vec<TextCharacter>,
        run_ids: Vec<Option<u32>>,
        rtl: bool,
    ) -> Self {
        debug_assert_eq!(characters.len(), run_ids.len());
        let mut characters = characters;
        let mut run_ids = run_ids;
        if rtl {
            #[cfg(feature = "icu")]
            reorder_bidi_with_direction(&mut characters, &mut run_ids, true);
            // Without `icu` the direction hint cannot be honoured.
            #[cfg(not(feature = "icu"))]
            reorder_bidi(&mut characters, &mut run_ids);
        } else {
            reorder_bidi(&mut characters, &mut run_ids);
        }
        let text: String = characters.iter().map(|c| c.unicode).collect();
        let text = normalize_text(&text);
        Self {
            characters,
            text,
            run_ids,
        }
    }
1501
    /// Returns the text of the whole page.
    pub fn all_page_text(&self) -> &str {
        &self.text
    }
1506
    /// Alias of `all_page_text`.
    #[inline]
    pub fn get_all_page_text(&self) -> &str {
        self.all_page_text()
    }
1512
    /// Returns all characters of the page in their final order.
    pub fn characters(&self) -> &[TextCharacter] {
        &self.characters
    }
1517
    /// Returns the number of characters on the page.
    pub fn char_count(&self) -> usize {
        self.characters.len()
    }
1522
    /// Alias of `char_count`.
    #[inline]
    pub fn text_count_chars(&self) -> usize {
        self.char_count()
    }
1530
    /// Deprecated alias of `char_count`; use `text_count_chars` instead.
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_count_chars()` (upstream `FPDFText_CountChars`)"
    )]
    #[inline]
    pub fn count_chars(&self) -> usize {
        self.char_count()
    }
1540
    /// Alias of `char_count`.
    #[inline]
    pub fn size(&self) -> usize {
        self.char_count()
    }
1550
    /// Private bounds-checked lookup used by the per-character accessors.
    ///
    /// Returns `None` when `index` is out of range.
    fn get_char(&self, index: usize) -> Option<&TextCharacter> {
        self.characters.get(index)
    }

    /// Returns the character record at `index`, or `None` when `index` is out
    /// of range.
    pub fn char_info(&self, index: usize) -> Option<&TextCharacter> {
        self.characters.get(index)
    }

    /// Alias of [`char_info`](Self::char_info).
    #[inline]
    pub fn get_char_info(&self, index: usize) -> Option<&TextCharacter> {
        self.char_info(index)
    }
1574
    /// Returns the `unicode` value of the character at `index`, or `None` when
    /// `index` is out of range.
    pub fn unicode(&self, index: usize) -> Option<char> {
        self.get_char(index).map(|c| c.unicode)
    }

    /// Alias of [`unicode`](Self::unicode).
    #[inline]
    pub fn text_get_unicode(&self, index: usize) -> Option<char> {
        self.unicode(index)
    }

    /// Deprecated alias of [`unicode`](Self::unicode).
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_get_unicode()` (upstream `FPDFText_GetUnicode`)"
    )]
    #[inline]
    pub fn get_unicode(&self, index: usize) -> Option<char> {
        self.unicode(index)
    }
1597
    /// Returns the `char_code` stored for the character at `index`, or `None`
    /// when `index` is out of range.
    pub fn char_code(&self, index: usize) -> Option<u32> {
        self.get_char(index).map(|c| c.char_code)
    }

    /// Deprecated alias of [`char_code`](Self::char_code).
    #[deprecated(
        since = "0.1.0",
        note = "Use `char_code()` directly; no upstream GetCharCode() method exists"
    )]
    #[inline]
    pub fn get_char_code(&self, index: usize) -> Option<u32> {
        self.char_code(index)
    }
1619
1620 pub fn is_generated(&self, index: usize) -> Option<bool> {
1622 self.get_char(index)
1623 .map(|c| c.char_type == CharType::Generated)
1624 }
1625
1626 #[inline]
1630 pub fn text_is_generated(&self, index: usize) -> Option<bool> {
1631 self.is_generated(index)
1632 }
1633
1634 pub fn is_hyphen(&self, index: usize) -> Option<bool> {
1636 self.get_char(index)
1637 .map(|c| c.char_type == CharType::Hyphen || c.is_soft_hyphen)
1638 }
1639
1640 #[inline]
1644 pub fn text_is_hyphen(&self, index: usize) -> Option<bool> {
1645 self.is_hyphen(index)
1646 }
1647
1648 pub fn has_unicode_map_error(&self, index: usize) -> Option<bool> {
1651 self.get_char(index)
1652 .map(|c| c.char_type == CharType::NotUnicode)
1653 }
1654
1655 #[inline]
1659 pub fn text_has_unicode_map_error(&self, index: usize) -> Option<bool> {
1660 self.has_unicode_map_error(index)
1661 }
1662
    /// Returns the `font_size` recorded for the character at `index`, or
    /// `None` when `index` is out of range.
    pub fn font_size(&self, index: usize) -> Option<f32> {
        self.get_char(index).map(|c| c.font_size)
    }

    /// Alias of [`font_size`](Self::font_size).
    #[inline]
    pub fn get_char_font_size(&self, index: usize) -> Option<f32> {
        self.font_size(index)
    }

    /// Alias of [`font_size`](Self::font_size).
    #[inline]
    pub fn text_get_font_size(&self, index: usize) -> Option<f32> {
        self.font_size(index)
    }

    /// Deprecated alias of [`font_size`](Self::font_size).
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_get_font_size()` (upstream `FPDFText_GetFontSize`)"
    )]
    #[inline]
    pub fn get_font_size(&self, index: usize) -> Option<f32> {
        self.font_size(index)
    }
1694
    /// Returns the `font_name` recorded for the character at `index`, or
    /// `None` when `index` is out of range.
    pub fn font_info(&self, index: usize) -> Option<&str> {
        self.get_char(index).map(|c| c.font_name.as_str())
    }

    /// Alias of [`font_info`](Self::font_info).
    #[inline]
    pub fn text_get_font_info(&self, index: usize) -> Option<&str> {
        self.font_info(index)
    }

    /// Deprecated alias of [`font_info`](Self::font_info).
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_get_font_info()` (upstream `FPDFText_GetFontInfo`)"
    )]
    #[inline]
    pub fn get_font_info(&self, index: usize) -> Option<&str> {
        self.font_info(index)
    }
1717
1718 pub fn char_angle(&self, index: usize) -> Option<f32> {
1721 self.get_char(index).map(|c| {
1722 let angle = c.matrix[1].atan2(c.matrix[0]);
1723 if angle < 0.0 {
1724 angle + std::f32::consts::TAU
1725 } else {
1726 angle
1727 }
1728 })
1729 }
1730
1731 #[inline]
1735 pub fn text_get_char_angle(&self, index: usize) -> Option<f32> {
1736 self.char_angle(index)
1737 }
1738
1739 #[deprecated(
1741 since = "0.1.0",
1742 note = "Use `text_get_char_angle()` (upstream `FPDFText_GetCharAngle`)"
1743 )]
1744 #[inline]
1745 pub fn get_char_angle(&self, index: usize) -> Option<f32> {
1746 self.char_angle(index)
1747 }
1748
    /// Returns a copy of the `char_box` of the character at `index`, or `None`
    /// when `index` is out of range.
    pub fn char_box(&self, index: usize) -> Option<CharRect> {
        self.get_char(index).map(|c| c.char_box.clone())
    }

    /// Alias of [`char_box`](Self::char_box).
    #[inline]
    pub fn text_get_char_box(&self, index: usize) -> Option<CharRect> {
        self.char_box(index)
    }

    /// Deprecated alias of [`char_box`](Self::char_box).
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_get_char_box()` (upstream `FPDFText_GetCharBox`)"
    )]
    #[inline]
    pub fn get_char_box(&self, index: usize) -> Option<CharRect> {
        self.char_box(index)
    }
1772
    /// Returns a copy of the `loose_char_box` of the character at `index`.
    ///
    /// Returns `None` both when `index` is out of range and when the character
    /// has no loose box recorded.
    pub fn loose_char_box(&self, index: usize) -> Option<CharRect> {
        self.get_char(index).and_then(|c| c.loose_char_box.clone())
    }

    /// Deprecated alias of [`loose_char_box`](Self::loose_char_box).
    #[deprecated(
        note = "use `text_get_loose_char_box()` — matches upstream `FPDFText_GetLooseCharBox`"
    )]
    #[inline]
    pub fn get_char_loose_bounds(&self, index: usize) -> Option<CharRect> {
        self.loose_char_box(index)
    }

    /// Alias of [`loose_char_box`](Self::loose_char_box).
    #[inline]
    pub fn text_get_loose_char_box(&self, index: usize) -> Option<CharRect> {
        self.loose_char_box(index)
    }

    /// Deprecated alias of [`loose_char_box`](Self::loose_char_box).
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_get_loose_char_box()` (upstream `FPDFText_GetLooseCharBox`)"
    )]
    #[inline]
    pub fn get_loose_char_box(&self, index: usize) -> Option<CharRect> {
        self.loose_char_box(index)
    }
1811
    /// Returns the six-element `matrix` of the character at `index`, or `None`
    /// when `index` is out of range.
    pub fn matrix(&self, index: usize) -> Option<[f32; 6]> {
        self.get_char(index).map(|c| c.matrix)
    }

    /// Alias of [`matrix`](Self::matrix).
    #[inline]
    pub fn text_get_matrix(&self, index: usize) -> Option<[f32; 6]> {
        self.matrix(index)
    }

    /// Deprecated alias of [`matrix`](Self::matrix).
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_get_matrix()` (upstream `FPDFText_GetMatrix`)"
    )]
    #[inline]
    pub fn get_matrix(&self, index: usize) -> Option<[f32; 6]> {
        self.matrix(index)
    }
1835
1836 pub fn char_origin(&self, index: usize) -> Option<CharOrigin> {
1839 self.get_char(index).map(|c| CharOrigin {
1840 x: c.matrix[4],
1841 y: c.matrix[5],
1842 })
1843 }
1844
1845 #[inline]
1849 pub fn text_get_char_origin(&self, index: usize) -> Option<CharOrigin> {
1850 self.char_origin(index)
1851 }
1852
1853 #[deprecated(
1855 since = "0.1.0",
1856 note = "Use `text_get_char_origin()` (upstream `FPDFText_GetCharOrigin`)"
1857 )]
1858 #[inline]
1859 pub fn get_char_origin(&self, index: usize) -> Option<CharOrigin> {
1860 self.char_origin(index)
1861 }
1862
    /// Returns the run id associated with the character at `index`.
    ///
    /// Returns `None` both when `index` is out of range and when the character
    /// has no run id.
    pub fn text_object(&self, index: usize) -> Option<u32> {
        self.run_ids.get(index).copied().flatten()
    }

    /// Alias of [`text_object`](Self::text_object).
    #[inline]
    pub fn text_get_text_object(&self, index: usize) -> Option<u32> {
        self.text_object(index)
    }

    /// Deprecated alias of [`text_object`](Self::text_object).
    #[deprecated(note = "use `text_get_text_object()` — matches upstream `FPDFText_GetTextObject`")]
    #[inline]
    pub fn get_text_object(&self, index: usize) -> Option<u32> {
        self.text_object(index)
    }
1883
    /// Returns the fill color recorded for the character at `index`.
    ///
    /// Returns `None` both when `index` is out of range and when no fill color
    /// was recorded.
    pub fn fill_color(&self, index: usize) -> Option<&Color> {
        self.get_char(index).and_then(|c| c.fill_color.as_ref())
    }

    /// Alias of [`fill_color`](Self::fill_color).
    #[inline]
    pub fn text_get_fill_color(&self, index: usize) -> Option<&Color> {
        self.fill_color(index)
    }

    /// Deprecated alias of [`fill_color`](Self::fill_color).
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_get_fill_color()` (upstream `FPDFText_GetFillColor`)"
    )]
    #[inline]
    pub fn get_fill_color(&self, index: usize) -> Option<&Color> {
        self.fill_color(index)
    }
1907
    /// Returns the stroke color recorded for the character at `index`.
    ///
    /// Returns `None` both when `index` is out of range and when no stroke
    /// color was recorded.
    pub fn stroke_color(&self, index: usize) -> Option<&Color> {
        self.get_char(index).and_then(|c| c.stroke_color.as_ref())
    }

    /// Alias of [`stroke_color`](Self::stroke_color).
    #[inline]
    pub fn text_get_stroke_color(&self, index: usize) -> Option<&Color> {
        self.stroke_color(index)
    }

    /// Deprecated alias of [`stroke_color`](Self::stroke_color).
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_get_stroke_color()` (upstream `FPDFText_GetStrokeColor`)"
    )]
    #[inline]
    pub fn get_stroke_color(&self, index: usize) -> Option<&Color> {
        self.stroke_color(index)
    }
1931
    /// Returns the font weight recorded for the character at `index`.
    ///
    /// Returns `None` both when `index` is out of range and when no weight was
    /// recorded.
    pub fn font_weight(&self, index: usize) -> Option<i32> {
        self.get_char(index).and_then(|c| c.font_weight)
    }

    /// Alias of [`font_weight`](Self::font_weight).
    #[inline]
    pub fn text_get_font_weight(&self, index: usize) -> Option<i32> {
        self.font_weight(index)
    }

    /// Deprecated alias of [`font_weight`](Self::font_weight).
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_get_font_weight()` (upstream `FPDFText_GetFontWeight`)"
    )]
    #[inline]
    pub fn get_font_weight(&self, index: usize) -> Option<i32> {
        self.font_weight(index)
    }
1955
    /// Returns the font flags recorded for the character at `index`.
    ///
    /// Returns `None` both when `index` is out of range and when no flags were
    /// recorded.
    pub fn font_flags(&self, index: usize) -> Option<u32> {
        self.get_char(index).and_then(|c| c.font_flags)
    }

    /// Deprecated alias of [`font_flags`](Self::font_flags).
    #[deprecated(
        note = "use `font_flags()` directly; no upstream `FPDFText_GetFontFlags()` method exists"
    )]
    #[inline]
    pub fn get_font_flags(&self, index: usize) -> Option<u32> {
        self.font_flags(index)
    }
1971
    /// Returns the text rendering mode recorded for the character at `index`,
    /// or `None` when `index` is out of range.
    pub fn char_render_mode(&self, index: usize) -> Option<TextRenderingMode> {
        self.get_char(index).map(|c| c.rendering_mode)
    }

    /// Deprecated alias of [`char_render_mode`](Self::char_render_mode).
    #[deprecated(
        note = "use `char_render_mode()` — no public `FPDFText_GetTextRenderMode`; for page-object API see `text_obj_get_text_render_mode()` on TextObject"
    )]
    #[inline]
    pub fn text_get_text_render_mode(&self, index: usize) -> Option<TextRenderingMode> {
        self.char_render_mode(index)
    }

    /// Deprecated alias of [`char_render_mode`](Self::char_render_mode).
    #[deprecated(
        note = "use `char_render_mode()` — no public `FPDFText_GetTextRenderMode`; for page-object API see `text_obj_get_text_render_mode()` on TextObject"
    )]
    #[inline]
    pub fn get_text_render_mode(&self, index: usize) -> Option<TextRenderingMode> {
        self.char_render_mode(index)
    }
1999
2000 pub fn fill_color_rgba(&self, index: usize) -> Option<(u8, u8, u8, u8)> {
2006 self.fill_color(index).map(|c| c.to_rgba_u8())
2007 }
2008
2009 pub fn stroke_color_rgba(&self, index: usize) -> Option<(u8, u8, u8, u8)> {
2015 self.stroke_color(index).map(|c| c.to_rgba_u8())
2016 }
2017
    /// Collects the text of every character whose run id equals `run_id`.
    ///
    /// Characters are visited in page order together with their run ids:
    ///
    /// * a character of the run is appended (NUL characters are skipped);
    /// * a space that is *not* part of the run, directly following a character
    ///   of the run, is appended as a single space;
    /// * any other character outside the run arms a pending line break; when
    ///   the next character of the run has a different `char_box.bottom` than
    ///   the previous one and some text was already collected, `"\r\n"` is
    ///   inserted before it.
    pub fn text_by_object(&self, run_id: u32) -> String {
        let mut result = String::new();
        // `bottom` of the most recent character that belonged to the run.
        let mut prev_y: f32 = 0.0;
        // True while the previously visited character belonged to the run.
        let mut has_prev = false;
        // True once a non-space character outside the run has been seen since
        // the last character of the run.
        let mut need_line_feed = false;

        for (ch, rid) in self.characters.iter().zip(self.run_ids.iter()) {
            if *rid == Some(run_id) {
                if need_line_feed
                    && !has_prev
                    && (prev_y - ch.char_box.bottom).abs() > 0.0
                    && !result.is_empty()
                {
                    result.push_str("\r\n");
                }
                prev_y = ch.char_box.bottom;
                has_prev = true;
                need_line_feed = false;
                if ch.unicode != '\0' {
                    result.push(ch.unicode);
                }
            } else if ch.unicode == ' ' {
                // Only the first foreign space after a run character is kept.
                if has_prev {
                    result.push(' ');
                    has_prev = false;
                    need_line_feed = false;
                }
            } else {
                has_prev = false;
                need_line_feed = true;
            }
        }
        result
    }

    /// Alias of [`text_by_object`](Self::text_by_object).
    #[inline]
    pub fn get_text_by_object(&self, run_id: u32) -> String {
        self.text_by_object(run_id)
    }

    /// Deprecated alias of [`text_by_object`](Self::text_by_object).
    #[deprecated(since = "0.1.0", note = "Use `text_by_object()` instead")]
    #[inline]
    pub fn text_by_run(&self, run_id: u32) -> String {
        self.text_by_object(run_id)
    }
2078
2079 pub fn is_same_text_object(&self, a: usize, b: usize) -> bool {
2085 if let (Some(Some(id_a)), Some(Some(id_b))) = (self.run_ids.get(a), self.run_ids.get(b)) {
2086 return id_a == id_b;
2087 }
2088 if let (Some(ca), Some(cb)) = (self.get_char(a), self.get_char(b)) {
2090 let tolerance = ca.font_size * 0.01;
2091 ca.unicode == cb.unicode
2092 && ca.font_name == cb.font_name
2093 && (ca.char_box.left - cb.char_box.left).abs() < tolerance
2094 && (ca.char_box.bottom - cb.char_box.bottom).abs() < tolerance
2095 } else {
2096 false
2097 }
2098 }
2099
    /// Runs `crate::linkextract::extract_links` over the page text and returns
    /// the links it finds.
    pub fn extract_links(&self) -> Vec<crate::linkextract::Link> {
        crate::linkextract::extract_links(&self.text)
    }

    /// Alias of [`extract_links`](Self::extract_links).
    #[inline]
    pub fn link_load_web_links(&self) -> Vec<crate::linkextract::Link> {
        self.extract_links()
    }

    /// Deprecated alias of [`extract_links`](Self::extract_links).
    #[deprecated(
        since = "0.1.0",
        note = "Use `link_load_web_links()` (upstream `FPDFLink_LoadWebLinks`)"
    )]
    #[inline]
    pub fn load_web_links(&self) -> Vec<crate::linkextract::Link> {
        self.extract_links()
    }
2122
2123 pub fn text_without_soft_hyphens(&self) -> String {
2126 self.characters
2127 .iter()
2128 .filter(|c| !c.is_soft_hyphen)
2129 .map(|c| c.unicode)
2130 .collect()
2131 }
2132
2133 pub fn index_at_pos(&self, x: f32, y: f32, x_tol: f32, y_tol: f32) -> Option<usize> {
2140 let mut nearest: Option<(usize, f32)> = None;
2141 for (i, ch) in self.characters.iter().enumerate() {
2142 let r = &ch.char_box;
2143 if x >= r.left && x <= r.right && y >= r.bottom && y <= r.top {
2145 return Some(i);
2146 }
2147 if x_tol <= 0.0 && y_tol <= 0.0 {
2149 continue;
2150 }
2151 let ext_left = r.left - x_tol / 2.0;
2153 let ext_right = r.right + x_tol / 2.0;
2154 let ext_bottom = r.bottom - y_tol / 2.0;
2155 let ext_top = r.top + y_tol / 2.0;
2156 if x < ext_left || x > ext_right || y < ext_bottom || y > ext_top {
2157 continue;
2158 }
2159 let dx = (x - r.left).abs().min((x - r.right).abs());
2161 let dy = (y - r.bottom).abs().min((y - r.top).abs());
2162 let dist = dx + dy;
2163 if nearest.is_none_or(|(_, d)| dist < d) {
2164 nearest = Some((i, dist));
2165 }
2166 }
2167 nearest.map(|(i, _)| i)
2168 }
2169
2170 #[inline]
2172 pub fn get_index_at_pos(&self, x: f32, y: f32, x_tol: f32, y_tol: f32) -> Option<usize> {
2173 self.index_at_pos(x, y, x_tol, y_tol)
2174 }
2175
2176 pub fn char_index_at_pos(&self, x: f64, y: f64, x_tol: f64, y_tol: f64) -> Option<usize> {
2182 self.index_at_pos(x as f32, y as f32, x_tol as f32, y_tol as f32)
2183 }
2184
2185 #[inline]
2189 pub fn text_get_char_index_at_pos(
2190 &self,
2191 x: f64,
2192 y: f64,
2193 x_tol: f64,
2194 y_tol: f64,
2195 ) -> Option<usize> {
2196 self.char_index_at_pos(x, y, x_tol, y_tol)
2197 }
2198
2199 #[deprecated(
2201 since = "0.1.0",
2202 note = "Use `text_get_char_index_at_pos()` (upstream `FPDFText_GetCharIndexAtPos`)"
2203 )]
2204 #[inline]
2205 pub fn get_char_index_at_pos(&self, x: f64, y: f64, x_tol: f64, y_tol: f64) -> Option<usize> {
2206 self.char_index_at_pos(x, y, x_tol, y_tol)
2207 }
2208
2209 pub fn char_rects(&self, start: usize, end: usize) -> Vec<CharRect> {
2211 let end = end.min(self.characters.len());
2212 let start = start.min(end);
2213 self.characters[start..end]
2214 .iter()
2215 .map(|ch| ch.char_box.clone())
2216 .collect()
2217 }
2218
2219 pub fn rect_array(&self, start: usize, end: usize) -> Vec<CharRect> {
2228 let end_clamped = end.min(self.characters.len());
2229 let start_clamped = start.min(end_clamped);
2230 if start_clamped >= end_clamped {
2231 return Vec::new();
2232 }
2233
2234 let chars = &self.characters[start_clamped..end_clamped];
2235 let run_ids = &self.run_ids[start_clamped..end_clamped];
2236
2237 const EPSILON: f32 = 1e-6;
2238 let mut merged: Vec<CharRect> = Vec::new();
2239 let mut current_run_id: Option<Option<u32>> = None;
2240
2241 for (ch, &rid) in chars.iter().zip(run_ids.iter()) {
2242 if ch.char_type == CharType::Generated {
2244 continue;
2245 }
2246 let r = &ch.char_box;
2247 let w = r.right - r.left;
2249 let h = r.top - r.bottom;
2250 if w < EPSILON || h < EPSILON {
2251 continue;
2252 }
2253
2254 let same_run = match (current_run_id, rid) {
2258 (Some(Some(prev)), Some(curr)) => prev == curr,
2259 _ => false,
2260 };
2261
2262 if same_run {
2263 let last = merged.last_mut().unwrap();
2264 last.left = last.left.min(r.left);
2265 last.bottom = last.bottom.min(r.bottom);
2266 last.right = last.right.max(r.right);
2267 last.top = last.top.max(r.top);
2268 } else {
2269 merged.push(r.clone());
2270 current_run_id = Some(rid);
2271 }
2272 }
2273
2274 merged
2275 }
2276
2277 #[inline]
2279 pub fn get_rect_array(&self, start: usize, end: usize) -> Vec<CharRect> {
2280 self.rect_array(start, end)
2281 }
2282
    /// Returns how many rectangles [`rect_array`](Self::rect_array) produces
    /// for the `count` characters starting at `start_index`.
    ///
    /// The end index is computed with a saturating addition, so a very large
    /// `count` simply means "to the end of the page".
    pub fn rect_count(&self, start_index: usize, count: usize) -> usize {
        self.rect_array(start_index, start_index.saturating_add(count))
            .len()
    }

    /// Alias of [`rect_count`](Self::rect_count).
    #[inline]
    pub fn text_count_rects(&self, start_index: usize, count: usize) -> usize {
        self.rect_count(start_index, count)
    }

    /// Deprecated alias of [`rect_count`](Self::rect_count).
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_count_rects()` (upstream `FPDFText_CountRects`)"
    )]
    #[inline]
    pub fn count_rects(&self, start_index: usize, count: usize) -> usize {
        self.rect_count(start_index, count)
    }
2312
    /// Returns the `rect_index`-th rectangle that
    /// [`rect_array`](Self::rect_array) produces for the `total_count`
    /// characters starting at `start_index`.
    ///
    /// The rectangle list is recomputed on every call. Returns `None` when
    /// `rect_index` is past the end of that list.
    pub fn rect_at(
        &self,
        start_index: usize,
        total_count: usize,
        rect_index: usize,
    ) -> Option<CharRect> {
        self.rect_array(start_index, start_index.saturating_add(total_count))
            .into_iter()
            .nth(rect_index)
    }

    /// Alias of [`rect_at`](Self::rect_at).
    #[inline]
    pub fn text_get_rect(
        &self,
        start_index: usize,
        total_count: usize,
        rect_index: usize,
    ) -> Option<CharRect> {
        self.rect_at(start_index, total_count, rect_index)
    }

    /// Deprecated alias of [`rect_at`](Self::rect_at).
    #[deprecated(
        since = "0.1.0",
        note = "Use `text_get_rect()` (upstream `FPDFText_GetRect`)"
    )]
    #[inline]
    pub fn get_rect(
        &self,
        start_index: usize,
        total_count: usize,
        rect_index: usize,
    ) -> Option<CharRect> {
        self.rect_at(start_index, total_count, rect_index)
    }
2359
2360 pub fn text_with_line_breaks(&self) -> String {
2365 let lines = segment_lines(&self.characters);
2366 if lines.is_empty() {
2367 return String::new();
2368 }
2369 lines
2370 .iter()
2371 .map(|l| l.text.as_str())
2372 .collect::<Vec<_>>()
2373 .join("\r\n")
2374 }
2375
2376 pub fn text_by_rect(&self, x1: f32, y1: f32, x2: f32, y2: f32) -> String {
2382 self.characters
2383 .iter()
2384 .filter(|c| {
2385 c.char_box.left >= x1
2386 && c.char_box.left <= x2
2387 && c.char_box.bottom >= y1
2388 && c.char_box.bottom <= y2
2389 })
2390 .map(|c| c.unicode)
2391 .collect()
2392 }
2393
2394 #[inline]
2398 pub fn get_text_by_rect(&self, x1: f32, y1: f32, x2: f32, y2: f32) -> String {
2399 self.text_by_rect(x1, y1, x2, y2)
2400 }
2401
2402 pub fn text_in_rect(&self, left: f64, top: f64, right: f64, bottom: f64) -> String {
2409 self.text_by_rect(left as f32, bottom as f32, right as f32, top as f32)
2410 }
2411
2412 #[deprecated(
2418 since = "0.1.0",
2419 note = "Use `text_in_rect()` or `get_bounded_text()` (upstream `FPDFText_GetBoundedText`) instead"
2420 )]
2421 #[inline]
2422 pub fn get_text_in_rect(&self, left: f64, top: f64, right: f64, bottom: f64) -> String {
2423 self.text_in_rect(left, top, right, bottom)
2424 }
2425
2426 #[inline]
2430 pub fn text_get_bounded_text(&self, left: f64, top: f64, right: f64, bottom: f64) -> String {
2431 self.text_in_rect(left, top, right, bottom)
2432 }
2433
2434 #[deprecated(
2436 since = "0.1.0",
2437 note = "Use `text_get_bounded_text()` (upstream `FPDFText_GetBoundedText`)"
2438 )]
2439 #[inline]
2440 pub fn get_bounded_text(&self, left: f64, top: f64, right: f64, bottom: f64) -> String {
2441 self.text_in_rect(left, top, right, bottom)
2442 }
2443
2444 pub fn page_text(&self, start: usize, count: usize) -> String {
2449 let len = self.characters.len();
2450 let s = start.min(len);
2451 let e = (s + count).min(len);
2452 self.characters[s..e].iter().map(|c| c.unicode).collect()
2453 }
2454
2455 #[inline]
2457 pub fn get_page_text(&self, start: usize, count: usize) -> String {
2458 self.page_text(start, count)
2459 }
2460
2461 #[inline]
2465 pub fn text_get_text(&self, start: usize, count: usize) -> String {
2466 self.page_text(start, count)
2467 }
2468
2469 #[deprecated(
2471 since = "0.1.0",
2472 note = "Use `text_get_text()` (upstream `FPDFText_GetText`)"
2473 )]
2474 #[inline]
2475 pub fn get_text(&self, start: usize, count: usize) -> String {
2476 self.page_text(start, count)
2477 }
2478
2479 pub fn char_index_from_text_index(&self, text_index: usize) -> Option<usize> {
2494 if text_index < self.characters.len() {
2495 Some(text_index)
2496 } else {
2497 None
2498 }
2499 }
2500
2501 #[inline]
2505 pub fn text_get_char_index_from_text_index(&self, text_index: usize) -> Option<usize> {
2506 self.char_index_from_text_index(text_index)
2507 }
2508
2509 #[deprecated(
2511 note = "use `text_get_char_index_from_text_index()` — matches upstream `FPDFText_GetCharIndexFromTextIndex`"
2512 )]
2513 #[inline]
2514 pub fn get_char_index_from_text_index(&self, text_index: usize) -> Option<usize> {
2515 self.char_index_from_text_index(text_index)
2516 }
2517
2518 pub fn text_index_from_char_index(&self, char_index: usize) -> Option<usize> {
2530 if char_index < self.characters.len() {
2531 Some(char_index)
2532 } else {
2533 None
2534 }
2535 }
2536
2537 #[inline]
2541 pub fn text_get_text_index_from_char_index(&self, char_index: usize) -> Option<usize> {
2542 self.text_index_from_char_index(char_index)
2543 }
2544
2545 #[deprecated(
2547 note = "use `text_get_text_index_from_char_index()` — matches upstream `FPDFText_GetTextIndexFromCharIndex`"
2548 )]
2549 #[inline]
2550 pub fn get_text_index_from_char_index(&self, char_index: usize) -> Option<usize> {
2551 self.text_index_from_char_index(char_index)
2552 }
2553
    /// Placeholder lookup of the page-object index for a character.
    ///
    /// The argument is ignored and the result is always `None`.
    pub fn char_object_index(&self, _char_index: usize) -> Option<usize> {
        None
    }

    /// Alias of [`char_object_index`](Self::char_object_index).
    #[inline]
    pub fn text_get_char_object(&self, char_index: usize) -> Option<usize> {
        self.char_object_index(char_index)
    }

    /// Deprecated alias of [`char_object_index`](Self::char_object_index).
    #[deprecated(note = "use `text_get_char_object()` — matches upstream `FPDFText_GetCharObject`")]
    #[inline]
    pub fn get_char_object(&self, char_index: usize) -> Option<usize> {
        self.char_object_index(char_index)
    }

    /// Reports whether [`char_object_index`](Self::char_object_index) yields a
    /// value for `char_index`; since that lookup always returns `None`, this is
    /// currently always `false`.
    pub fn has_text_object_for_char(&self, char_index: usize) -> bool {
        self.char_object_index(char_index).is_some()
    }
2589
2590 pub fn link_rects(&self, link: &crate::linkextract::Link) -> Vec<CharRect> {
2595 let start_char = self.text[..link.start_index.min(self.text.len())]
2598 .chars()
2599 .count();
2600 let end_char = start_char
2601 + self.text[link.start_index.min(self.text.len())..link.end_index.min(self.text.len())]
2602 .chars()
2603 .count();
2604 self.rect_array(start_char, end_char)
2605 }
2606}
2607
2608#[cfg(test)]
2609mod tests {
2610 use super::*;
2611
    /// Test fixture: a `Normal` character `unicode` whose box starts at
    /// (`x`, `y`) and measures 10 by 12, using a 12-unit "TestFont", an
    /// identity matrix translated to (`x`, `y`), and no optional attributes.
    fn make_char(unicode: char, x: f32, y: f32) -> TextCharacter {
        TextCharacter {
            unicode,
            char_code: unicode as u32,
            char_box: CharRect {
                left: x,
                bottom: y,
                right: x + 10.0,
                top: y + 12.0,
            },
            font_size: 12.0,
            font_name: "TestFont".to_string(),
            space_width: Some(4.0),
            is_soft_hyphen: false,
            char_type: CharType::Normal,
            matrix: [1.0, 0.0, 0.0, 1.0, x, y],
            loose_char_box: None,
            fill_color: None,
            stroke_color: None,
            font_weight: None,
            font_flags: None,
            rendering_mode: TextRenderingMode::Fill,
        }
    }
2637
    /// The fixture stores the scalar value of the character as its char code.
    #[test]
    fn test_text_character_has_char_code_field() {
        let ch = make_char('A', 0.0, 0.0);
        assert_eq!(ch.char_code, 65);
    }

    /// The fixture's matrix is the identity translated to the given position.
    #[test]
    fn test_text_character_has_matrix_field() {
        let ch = make_char('B', 10.0, 20.0);
        assert_eq!(ch.matrix, [1.0, 0.0, 0.0, 1.0, 10.0, 20.0]);
    }

    /// A run id handed to the constructor is reported by `text_object`.
    #[test]
    fn test_get_text_object_via_run_ids() {
        let chars = vec![make_char('C', 0.0, 0.0)];
        let run_ids = vec![Some(42)];
        let page = TextPage::new_with_run_ids(chars, run_ids, false);
        assert_eq!(page.text_object(0), Some(42));
    }

    /// `CharType::NotUnicode` is a distinct variant from `CharType::Normal`.
    #[test]
    fn test_char_type_not_unicode_variant() {
        let ct = CharType::NotUnicode;
        assert_ne!(ct, CharType::Normal);
        assert_eq!(ct, CharType::NotUnicode);
    }
2666
    /// With an identity matrix the loose box reaches at least as far as the
    /// asserted bounds on every side.
    #[test]
    fn test_compute_loose_char_box_identity_matrix() {
        let rect = compute_loose_char_box(
            10.0, 20.0, 8.0, 12.0, 750.0, -250.0, 1.0, 0.0, 0.0, 1.0,
        );
        assert!(rect.left <= 10.0);
        assert!(rect.bottom < 20.0);
        assert!(rect.right >= 18.0);
        assert!(rect.top > 20.0);
    }

    /// With a uniformly scaled matrix the loose box still straddles zero
    /// vertically.
    #[test]
    fn test_compute_loose_char_box_scaled_matrix() {
        let rect = compute_loose_char_box(0.0, 0.0, 16.0, 12.0, 750.0, -250.0, 2.0, 0.0, 0.0, 2.0);
        assert!(rect.top > 0.0);
        assert!(rect.bottom < 0.0);
    }

    /// Vertical variant with an identity matrix: checks the sign of the
    /// vertical bounds, a non-empty width, and the exact expected top/bottom.
    #[test]
    fn test_compute_loose_char_box_vertical_identity_matrix() {
        let rect = compute_loose_char_box_vertical(
            0.0, 0.0, 10.0, 12.0, 880.0, -1000.0, 1.0, 0.0, 0.0, 1.0,
        );
        assert!(rect.top > 0.0, "top={}", rect.top);
        assert!(rect.bottom < 0.0, "bottom={}", rect.bottom);
        assert!(rect.right > rect.left);
        assert!((rect.top - 10.56).abs() < 0.01, "top={}", rect.top);
        assert!(
            (rect.bottom - (-1.44)).abs() < 0.01,
            "bottom={}",
            rect.bottom
        );
    }

    /// Vertical variant with a non-zero starting position: checks the exact
    /// expected top/bottom.
    #[test]
    fn test_compute_loose_char_box_vertical_with_offset() {
        let rect = compute_loose_char_box_vertical(
            5.0, 100.0, 10.0, 10.0, 880.0, -1000.0, 1.0, 0.0, 0.0, 1.0,
        );
        assert!((rect.top - 108.8).abs() < 0.01, "top={}", rect.top);
        assert!((rect.bottom - 98.8).abs() < 0.01, "bottom={}", rect.bottom);
    }

    /// A generated character built by hand carries no loose box.
    // NOTE(review): this only reads back the value assigned two lines above,
    // so it does not exercise the extractor.
    #[test]
    fn test_loose_char_box_is_none_for_generated_chars() {
        let mut ch = make_char(' ', 0.0, 0.0);
        ch.char_type = CharType::Generated;
        ch.loose_char_box = None;
        assert!(ch.loose_char_box.is_none());
    }
2740
    /// `page_text` returns exactly the requested window of characters.
    #[test]
    fn test_get_page_text_basic() {
        let chars = vec![make_char('H', 0.0, 0.0), make_char('i', 10.0, 0.0)];
        let page = TextPage::new(chars);
        assert_eq!(page.page_text(0, 2), "Hi");
        assert_eq!(page.page_text(0, 1), "H");
        assert_eq!(page.page_text(1, 1), "i");
    }

    /// Oversized counts and out-of-range starts are clamped rather than
    /// rejected.
    #[test]
    fn test_get_page_text_clamped() {
        let chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
        let page = TextPage::new(chars);
        assert_eq!(page.page_text(0, 100), "AB");
        assert_eq!(page.page_text(5, 10), "");
    }

    /// Text index to character index is the identity within range and `None`
    /// past the end.
    #[test]
    fn test_char_index_from_text_index_identity() {
        let chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
        let page = TextPage::new(chars);
        assert_eq!(page.char_index_from_text_index(0), Some(0));
        assert_eq!(page.char_index_from_text_index(1), Some(1));
        assert_eq!(page.char_index_from_text_index(2), None);
    }

    /// Character index to text index is the identity within range and `None`
    /// past the end.
    #[test]
    fn test_text_index_from_char_index_identity() {
        let chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
        let page = TextPage::new(chars);
        assert_eq!(page.text_index_from_char_index(0), Some(0));
        assert_eq!(page.text_index_from_char_index(1), Some(1));
        assert_eq!(page.text_index_from_char_index(2), None);
    }
2777
    /// Two characters sharing a run id belong to the same text object.
    #[test]
    fn test_is_same_text_object_same_run() {
        let chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
        let run_ids = vec![Some(5), Some(5)];
        let page = TextPage::new_with_run_ids(chars, run_ids, false);
        assert!(page.is_same_text_object(0, 1));
    }

    /// Two characters with different run ids do not.
    #[test]
    fn test_is_same_text_object_different_run() {
        let chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
        let run_ids = vec![Some(5), Some(6)];
        let page = TextPage::new_with_run_ids(chars, run_ids, false);
        assert!(!page.is_same_text_object(0, 1));
    }

    /// Without run ids, identical glyphs whose positions differ by less than
    /// the tolerance (1% of the 12-unit font size) match via the fallback.
    #[test]
    fn test_is_same_text_object_fallback_no_run_id() {
        let chars = vec![make_char('A', 10.0, 20.0), make_char('A', 10.05, 20.05)];
        let run_ids = vec![None, None];
        let page = TextPage::new_with_run_ids(chars, run_ids, false);
        assert!(page.is_same_text_object(0, 1));
    }

    /// Without run ids, identical glyphs far apart do not match.
    #[test]
    fn test_is_same_text_object_fallback_different_position() {
        let chars = vec![make_char('A', 10.0, 20.0), make_char('A', 50.0, 20.0)];
        let run_ids = vec![None, None];
        let page = TextPage::new_with_run_ids(chars, run_ids, false);
        assert!(!page.is_same_text_object(0, 1));
    }
2810
    /// `TextExtractor::new` leaves the right-to-left flag off.
    #[test]
    fn test_text_extractor_default_not_rtl() {
        let ext = TextExtractor::new();
        assert!(!ext.is_rtl());
    }

    /// `with_rtl(true)` turns the flag on.
    #[test]
    fn test_text_extractor_with_rtl() {
        let ext = TextExtractor::with_rtl(true);
        assert!(ext.is_rtl());
    }

    /// `with_rtl(false)` leaves the flag off.
    #[test]
    fn test_text_extractor_with_rtl_false() {
        let ext = TextExtractor::with_rtl(false);
        assert!(!ext.is_rtl());
    }

    /// Building a page with `rtl = false` keeps Latin text in input order.
    #[test]
    fn test_text_page_new_with_direction_false() {
        let chars = vec![make_char('H', 0.0, 0.0), make_char('i', 10.0, 0.0)];
        let page = TextPage::new_with_direction(chars, false);
        assert_eq!(page.all_page_text(), "Hi");
    }

    /// Building a page with `rtl = true` keeps every character; only the count
    /// is checked here, not the resulting order.
    #[test]
    fn test_text_page_new_with_direction_true() {
        let chars = vec![make_char('H', 0.0, 0.0), make_char('i', 10.0, 0.0)];
        let page = TextPage::new_with_direction(chars, true);
        assert_eq!(page.char_count(), 2);
    }
2848
    /// `unicode` returns each character in range and `None` past the end.
    #[test]
    fn test_get_unicode_returns_char() {
        let chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
        let page = TextPage::new(chars);
        assert_eq!(page.unicode(0), Some('A'));
        assert_eq!(page.unicode(1), Some('B'));
        assert_eq!(page.unicode(2), None);
    }

    /// A `Normal` character is not reported as generated.
    #[test]
    fn test_is_generated_false_for_normal() {
        let chars = vec![make_char('A', 0.0, 0.0)];
        let page = TextPage::new(chars);
        assert_eq!(page.is_generated(0), Some(false));
    }

    /// A `Generated` character is reported as generated.
    #[test]
    fn test_is_generated_true_for_generated() {
        let mut ch = make_char(' ', 0.0, 0.0);
        ch.char_type = CharType::Generated;
        let page = TextPage::new(vec![ch]);
        assert_eq!(page.is_generated(0), Some(true));
    }

    /// An out-of-range index yields `None` rather than `Some(false)`.
    #[test]
    fn test_is_generated_out_of_bounds() {
        let page = TextPage::new(vec![]);
        assert_eq!(page.is_generated(0), None);
    }

    /// A character typed `Hyphen` counts as a hyphen.
    #[test]
    fn test_is_hyphen_true_for_hyphen_char_type() {
        let mut ch = make_char('-', 0.0, 0.0);
        ch.char_type = CharType::Hyphen;
        let page = TextPage::new(vec![ch]);
        assert_eq!(page.is_hyphen(0), Some(true));
    }

    /// A character flagged as a soft hyphen counts as a hyphen even though its
    /// type stays `Normal`.
    #[test]
    fn test_is_hyphen_true_for_soft_hyphen() {
        let mut ch = make_char('\u{00AD}', 0.0, 0.0);
        ch.is_soft_hyphen = true;
        let page = TextPage::new(vec![ch]);
        assert_eq!(page.is_hyphen(0), Some(true));
    }

    /// An ordinary character is not a hyphen.
    #[test]
    fn test_is_hyphen_false_for_normal() {
        let chars = vec![make_char('A', 0.0, 0.0)];
        let page = TextPage::new(chars);
        assert_eq!(page.is_hyphen(0), Some(false));
    }

    /// A `NotUnicode` character reports a Unicode mapping error.
    #[test]
    fn test_has_unicode_map_error_true_for_not_unicode() {
        let mut ch = make_char('\u{FFFD}', 0.0, 0.0);
        ch.char_type = CharType::NotUnicode;
        let page = TextPage::new(vec![ch]);
        assert_eq!(page.has_unicode_map_error(0), Some(true));
    }

    /// A `Normal` character reports no Unicode mapping error.
    #[test]
    fn test_has_unicode_map_error_false_for_normal() {
        let chars = vec![make_char('A', 0.0, 0.0)];
        let page = TextPage::new(chars);
        assert_eq!(page.has_unicode_map_error(0), Some(false));
    }
2920
    /// `font_size` returns the fixture's size and `None` past the end.
    #[test]
    fn test_get_font_size_returns_size() {
        let chars = vec![make_char('A', 0.0, 0.0)];
        let page = TextPage::new(chars);
        assert_eq!(page.font_size(0), Some(12.0));
        assert_eq!(page.font_size(1), None);
    }

    /// `font_info` returns the fixture's font name and `None` past the end.
    #[test]
    fn test_get_font_info_returns_name() {
        let chars = vec![make_char('A', 0.0, 0.0)];
        let page = TextPage::new(chars);
        assert_eq!(page.font_info(0), Some("TestFont"));
        assert_eq!(page.font_info(1), None);
    }

    /// An identity matrix means an angle of zero.
    #[test]
    fn test_get_char_angle_identity_matrix() {
        let chars = vec![make_char('A', 0.0, 0.0)];
        let page = TextPage::new(chars);
        let angle = page.char_angle(0).unwrap();
        assert!(angle.abs() < 0.001);
    }

    /// A matrix with `a = 0`, `b = 1` yields an angle of pi/2.
    #[test]
    fn test_get_char_angle_rotated() {
        let mut ch = make_char('A', 0.0, 0.0);
        ch.matrix = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0];
        let page = TextPage::new(vec![ch]);
        let angle = page.char_angle(0).unwrap();
        assert!((angle - std::f32::consts::FRAC_PI_2).abs() < 0.001);
    }

    /// A matrix with `a = 0`, `b = -1` is reported as 3*pi/2 rather than as a
    /// negative angle.
    #[test]
    fn test_get_char_angle_negative_normalized() {
        let mut ch = make_char('A', 0.0, 0.0);
        ch.matrix = [0.0, -1.0, 1.0, 0.0, 0.0, 0.0];
        let page = TextPage::new(vec![ch]);
        let angle = page.char_angle(0).unwrap();
        assert!(angle >= 0.0);
        assert!((angle - 3.0 * std::f32::consts::FRAC_PI_2).abs() < 0.001);
    }
2967
    /// `char_box` returns the fixture's 10-by-12 box at the given position.
    #[test]
    fn test_get_char_box_basic() {
        let chars = vec![make_char('A', 10.0, 20.0)];
        let page = TextPage::new(chars);
        let rect = page.char_box(0).unwrap();
        assert_eq!(rect.left, 10.0);
        assert_eq!(rect.bottom, 20.0);
        assert_eq!(rect.right, 20.0);
        assert_eq!(rect.top, 32.0);
    }

    /// `char_box` on an empty page yields `None`.
    #[test]
    fn test_get_char_box_out_of_bounds() {
        let page = TextPage::new(vec![]);
        assert!(page.char_box(0).is_none());
    }

    /// A character without a loose box yields `None` even though the index is
    /// valid.
    #[test]
    fn test_get_loose_char_box_none_when_absent() {
        let chars = vec![make_char('A', 0.0, 0.0)];
        let page = TextPage::new(chars);
        assert!(page.loose_char_box(0).is_none());
    }

    /// A loose box set on the character is returned unchanged.
    #[test]
    fn test_get_loose_char_box_some_when_present() {
        let mut ch = make_char('A', 0.0, 0.0);
        ch.loose_char_box = Some(CharRect {
            left: -1.0,
            bottom: -3.0,
            right: 11.0,
            top: 9.0,
        });
        let page = TextPage::new(vec![ch]);
        let rect = page.loose_char_box(0).unwrap();
        assert_eq!(rect.left, -1.0);
        assert_eq!(rect.top, 9.0);
    }

    /// `matrix` returns the stored matrix and `None` past the end.
    #[test]
    fn test_get_matrix_returns_matrix() {
        let chars = vec![make_char('A', 5.0, 10.0)];
        let page = TextPage::new(chars);
        let m = page.matrix(0).unwrap();
        assert_eq!(m, [1.0, 0.0, 0.0, 1.0, 5.0, 10.0]);
        assert!(page.matrix(1).is_none());
    }

    /// `char_origin` reports the last two matrix entries as `x` and `y`.
    #[test]
    fn test_get_char_origin_returns_ef() {
        let chars = vec![make_char('A', 100.0, 200.0)];
        let page = TextPage::new(chars);
        let o = page.char_origin(0).unwrap();
        assert_eq!(o.x, 100.0);
        assert_eq!(o.y, 200.0);
    }

    /// `text_object` returns the run id supplied at construction.
    #[test]
    fn test_get_text_object_returns_run_id() {
        let chars = vec![make_char('A', 0.0, 0.0)];
        let run_ids = vec![Some(42)];
        let page = TextPage::new_with_run_ids(chars, run_ids, false);
        assert_eq!(page.text_object(0), Some(42));
    }
3033
3034 #[test]
3035 fn test_get_text_object_none_for_generated() {
3036 let mut ch = make_char(' ', 0.0, 0.0);
3037 ch.char_type = CharType::Generated;
3038 let run_ids = vec![None];
3039 let page = TextPage::new_with_run_ids(vec![ch], run_ids, false);
3040 assert_eq!(page.text_object(0), None);
3041 }
3042
3043 #[test]
3044 fn test_get_fill_color_none_by_default() {
3045 let chars = vec![make_char('A', 0.0, 0.0)];
3046 let page = TextPage::new(chars);
3047 assert!(page.fill_color(0).is_none());
3048 }
3049
3050 #[test]
3051 fn test_get_fill_color_some_when_set() {
3052 let mut ch = make_char('A', 0.0, 0.0);
3053 ch.fill_color = Some(Color::rgb(1.0, 0.0, 0.0));
3054 let page = TextPage::new(vec![ch]);
3055 let color = page.fill_color(0).unwrap();
3056 assert_eq!(color.components[0], 1.0);
3057 assert_eq!(color.components[1], 0.0);
3058 }
3059
3060 #[test]
3061 fn test_get_stroke_color_none_by_default() {
3062 let chars = vec![make_char('A', 0.0, 0.0)];
3063 let page = TextPage::new(chars);
3064 assert!(page.stroke_color(0).is_none());
3065 }
3066
3067 #[test]
3068 fn test_get_stroke_color_some_when_set() {
3069 let mut ch = make_char('A', 0.0, 0.0);
3070 ch.stroke_color = Some(Color::gray(0.5));
3071 let page = TextPage::new(vec![ch]);
3072 let color = page.stroke_color(0).unwrap();
3073 assert_eq!(color.components[0], 0.5);
3074 }
3075
3076 #[test]
3077 fn test_get_font_weight_none_by_default() {
3078 let chars = vec![make_char('A', 0.0, 0.0)];
3079 let page = TextPage::new(chars);
3080 assert_eq!(page.font_weight(0), None);
3081 }
3082
3083 #[test]
3084 fn test_get_font_weight_some_when_set() {
3085 let mut ch = make_char('A', 0.0, 0.0);
3086 ch.font_weight = Some(700);
3087 let page = TextPage::new(vec![ch]);
3088 assert_eq!(page.font_weight(0), Some(700));
3089 }
3090
3091 #[test]
3092 fn test_get_font_weight_out_of_bounds() {
3093 let page = TextPage::new(vec![]);
3094 assert_eq!(page.font_weight(0), None);
3095 }
3096
3097 #[test]
3098 fn test_link_rects_basic() {
3099 let chars: Vec<TextCharacter> = "Hello"
3100 .chars()
3101 .enumerate()
3102 .map(|(i, c)| make_char(c, i as f32 * 10.0, 0.0))
3103 .collect();
3104 let run_ids: Vec<Option<u32>> = vec![Some(1); 5];
3106 let page = TextPage::new_with_run_ids(chars, run_ids, false);
3107 let link = crate::linkextract::Link {
3108 url: "Hello".to_string(),
3109 start_index: 0,
3110 end_index: 5,
3111 kind: crate::linkextract::LinkKind::WebUrl,
3112 };
3113 let rects = page.link_rects(&link);
3114 assert!(!rects.is_empty());
3115 assert_eq!(rects.len(), 1);
3117 }
3118
3119 #[test]
3122 fn test_get_font_flags_none_by_default() {
3123 let chars = vec![make_char('A', 0.0, 0.0)];
3124 let page = TextPage::new(chars);
3125 assert_eq!(page.font_flags(0), None);
3126 }
3127
3128 #[test]
3129 fn test_get_font_flags_returns_value() {
3130 let mut ch = make_char('A', 0.0, 0.0);
3131 ch.font_flags = Some(0x42); let page = TextPage::new(vec![ch]);
3133 assert_eq!(page.font_flags(0), Some(0x42));
3134 }
3135
3136 #[test]
3137 fn test_get_font_flags_out_of_bounds() {
3138 let page = TextPage::new(vec![]);
3139 assert_eq!(page.font_flags(0), None);
3140 }
3141
3142 #[test]
3145 fn test_get_fill_color_rgba_returns_tuple() {
3146 let mut ch = make_char('A', 0.0, 0.0);
3147 ch.fill_color = Some(Color::rgb(1.0, 0.0, 0.0));
3148 let page = TextPage::new(vec![ch]);
3149 assert_eq!(page.fill_color_rgba(0), Some((255, 0, 0, 255)));
3150 }
3151
3152 #[test]
3153 fn test_get_stroke_color_rgba_returns_tuple() {
3154 let mut ch = make_char('A', 0.0, 0.0);
3155 ch.stroke_color = Some(Color::rgb(0.0, 1.0, 0.0));
3156 let page = TextPage::new(vec![ch]);
3157 assert_eq!(page.stroke_color_rgba(0), Some((0, 255, 0, 255)));
3158 }
3159
3160 #[test]
3161 fn test_get_fill_color_rgba_none_when_no_color() {
3162 let chars = vec![make_char('A', 0.0, 0.0)];
3163 let page = TextPage::new(chars);
3164 assert_eq!(page.fill_color_rgba(0), None);
3165 }
3166
3167 #[test]
3170 fn test_text_by_object_single_run() {
3171 let chars = vec![make_char('H', 0.0, 0.0), make_char('i', 10.0, 0.0)];
3172 let run_ids = vec![Some(1), Some(1)];
3173 let page = TextPage::new_with_run_ids(chars, run_ids, false);
3174 assert_eq!(page.text_by_object(1), "Hi");
3175 }
3176
3177 #[test]
3178 fn test_text_by_object_nonexistent() {
3179 let chars = vec![make_char('A', 0.0, 0.0)];
3180 let run_ids = vec![Some(1)];
3181 let page = TextPage::new_with_run_ids(chars, run_ids, false);
3182 assert_eq!(page.text_by_object(99), "");
3183 }
3184
3185 #[test]
3186 fn test_text_by_object_with_trailing_space() {
3187 let mut chars = vec![make_char('A', 0.0, 0.0), make_char('B', 10.0, 0.0)];
3188 let mut run_ids: Vec<Option<u32>> = vec![Some(1), Some(1)];
3189 let mut sp = make_char(' ', 20.0, 0.0);
3191 sp.char_type = CharType::Generated;
3192 chars.push(sp);
3193 run_ids.push(None);
3194 chars.push(make_char('X', 30.0, 0.0));
3196 run_ids.push(Some(2));
3197 let page = TextPage::new_with_run_ids(chars, run_ids, false);
3198 assert_eq!(page.text_by_object(1), "AB ");
3200 }
3201
3202 #[test]
3203 fn test_text_by_object_multi_line() {
3204 let mut chars = vec![make_char('A', 0.0, 100.0), make_char('B', 10.0, 100.0)];
3205 let mut run_ids: Vec<Option<u32>> = vec![Some(1), Some(1)];
3206 chars.push(make_char('C', 0.0, 80.0));
3208 run_ids.push(Some(2));
3209 chars.push(make_char('D', 0.0, 60.0));
3211 run_ids.push(Some(1));
3212 let page = TextPage::new_with_run_ids(chars, run_ids, false);
3213 let text = page.text_by_object(1);
3214 assert!(text.contains("AB"));
3215 assert!(text.contains("D"));
3216 assert!(text.contains("\r\n"));
3218 }
3219
3220 #[test]
3223 fn test_normalize_threshold_below_t1() {
3224 let result = normalize_threshold(200.0, 300, 500, 700);
3226 assert!((result - 100.0).abs() < 0.01);
3227 }
3228
3229 #[test]
3230 fn test_normalize_threshold_between_t1_t2() {
3231 let result = normalize_threshold(400.0, 300, 500, 700);
3233 assert!((result - 100.0).abs() < 0.01);
3234 }
3235
3236 #[test]
3237 fn test_normalize_threshold_between_t2_t3() {
3238 let result = normalize_threshold(600.0, 300, 500, 700);
3240 assert!((result - 120.0).abs() < 0.01);
3241 }
3242
3243 #[test]
3244 fn test_normalize_threshold_above_t3() {
3245 let result = normalize_threshold(900.0, 300, 500, 700);
3247 assert!((result - 150.0).abs() < 0.01);
3248 }
3249
3250 #[test]
3251 fn test_normalize_threshold_zero() {
3252 let result = normalize_threshold(0.0, 300, 500, 700);
3253 assert!((result - 0.0).abs() < 0.01);
3254 }
3255
3256 #[test]
3259 fn test_detect_orientation_empty() {
3260 let result = detect_orientation(&[], 612.0, 792.0);
3261 assert_eq!(result, TextFlowOrientation::Unknown);
3262 }
3263
3264 #[test]
3265 fn test_detect_orientation_horizontal_text() {
3266 let chars: Vec<TextCharacter> = (0..20)
3268 .map(|i| {
3269 let mut c = make_char('A', i as f32 * 30.0, 700.0);
3270 c.char_box = CharRect {
3271 left: c.char_box.left,
3272 bottom: c.char_box.bottom,
3273 right: c.char_box.left + 25.0,
3274 top: c.char_box.bottom + 12.0,
3275 };
3276 c
3277 })
3278 .collect();
3279 let result = detect_orientation(&chars, 612.0, 792.0);
3280 assert_eq!(result, TextFlowOrientation::Horizontal);
3281 }
3282
3283 #[test]
3284 fn test_detect_orientation_vertical_text() {
3285 let chars: Vec<TextCharacter> = (0..20)
3287 .map(|i| {
3288 let mut c = make_char('A', 500.0, 700.0 - i as f32 * 30.0);
3289 c.char_box = CharRect {
3290 left: c.char_box.left,
3291 bottom: c.char_box.bottom,
3292 right: c.char_box.left + 12.0,
3293 top: c.char_box.bottom + 25.0,
3294 };
3295 c
3296 })
3297 .collect();
3298 let result = detect_orientation(&chars, 612.0, 792.0);
3299 assert_eq!(result, TextFlowOrientation::Vertical);
3300 }
3301
3302 #[test]
3303 fn test_detect_orientation_zero_dimensions() {
3304 let chars = vec![make_char('A', 0.0, 0.0)];
3305 let result = detect_orientation(&chars, 0.0, 0.0);
3306 assert_eq!(result, TextFlowOrientation::Unknown);
3307 }
3308
3309 #[test]
3312 fn test_bidi_reordering_rtl_chars_reordered() {
3313 let chars = vec![
3317 make_char('\u{0628}', 10.0, 0.0), make_char('\u{0627}', 0.0, 0.0), ];
3320 let page = TextPage::new(chars);
3321 assert_eq!(
3322 page.char_count(),
3323 2,
3324 "bidi reordering must not lose Arabic characters"
3325 );
3326 let text = page.all_page_text();
3327 assert!(
3328 text.contains('\u{0628}') && text.contains('\u{0627}'),
3329 "both Arabic characters must be present after bidi reordering, got: {:?}",
3330 text
3331 );
3332 }
3333
3334 #[test]
3335 fn test_bidi_reordering_mixed_ltr_rtl() {
3336 let chars = vec![
3339 make_char('H', 0.0, 0.0),
3340 make_char('i', 10.0, 0.0),
3341 make_char('\u{0627}', 20.0, 0.0), make_char('\u{0628}', 30.0, 0.0), ];
3344 let page = TextPage::new(chars);
3345 assert_eq!(
3346 page.char_count(),
3347 4,
3348 "mixed bidi must preserve all 4 characters"
3349 );
3350 let text = page.all_page_text();
3351 assert!(
3352 text.contains('H') && text.contains('i'),
3353 "LTR characters must survive bidi reordering: {:?}",
3354 text
3355 );
3356 assert!(
3357 text.contains('\u{0627}') && text.contains('\u{0628}'),
3358 "RTL characters must survive bidi reordering: {:?}",
3359 text
3360 );
3361 }
3362
3363 #[test]
3366 fn test_large_page_count_chars_500() {
3367 let chars: Vec<TextCharacter> = (0..500u32)
3370 .map(|i| {
3371 let col = (i % 50) as f32;
3372 let row = (i / 50) as f32;
3373 let ch = (b'A' + (i % 26) as u8) as char;
3374 make_char(ch, col * 10.0, 700.0 - row * 20.0)
3375 })
3376 .collect();
3377 let page = TextPage::new(chars);
3378 assert_eq!(page.char_count(), 500);
3379 }
3380
3381 #[test]
3382 fn test_large_page_get_page_text_complete() {
3383 let chars: Vec<TextCharacter> = (0..500u32)
3387 .map(|i| {
3388 let col = (i % 50) as f32;
3389 let row = (i / 50) as f32;
3390 let ch = (b'A' + (i % 26) as u8) as char;
3391 make_char(ch, col * 10.0, 700.0 - row * 20.0)
3392 })
3393 .collect();
3394 let expected: String = (0..500u32)
3395 .map(|i| (b'A' + (i % 26) as u8) as char)
3396 .collect();
3397 let page = TextPage::new(chars);
3398 assert_eq!(page.all_page_text(), expected);
3399 let sub: String = (10..20u32)
3401 .map(|i| (b'A' + (i % 26) as u8) as char)
3402 .collect();
3403 assert_eq!(page.page_text(10, 10), sub);
3404 }
3405
3406 #[test]
3407 fn test_large_page_all_char_getters_return_some() {
3408 let chars: Vec<TextCharacter> = (0..300u32)
3413 .map(|i| make_char((b'A' + (i % 26) as u8) as char, i as f32 * 10.0, 700.0))
3414 .collect();
3415 let page = TextPage::new(chars);
3416 for i in 0..300 {
3417 assert!(page.unicode(i).is_some(), "get_unicode({i})");
3418 assert!(page.char_code(i).is_some(), "get_char_code({i})");
3419 assert!(page.font_size(i).is_some(), "get_font_size({i})");
3420 assert!(page.font_info(i).is_some(), "get_font_info({i})");
3421 assert!(page.char_angle(i).is_some(), "get_char_angle({i})");
3422 assert!(page.char_box(i).is_some(), "get_char_box({i})");
3423 assert!(page.matrix(i).is_some(), "get_matrix({i})");
3424 assert!(page.char_origin(i).is_some(), "get_char_origin({i})");
3425 assert!(page.is_generated(i).is_some(), "is_generated({i})");
3426 assert!(page.is_hyphen(i).is_some(), "is_hyphen({i})");
3427 assert!(
3428 page.has_unicode_map_error(i).is_some(),
3429 "has_unicode_map_error({i})"
3430 );
3431 }
3432 assert!(page.unicode(300).is_none());
3434 assert!(page.char_box(300).is_none());
3435 assert!(page.char_origin(300).is_none());
3436 assert!(page.matrix(300).is_none());
3437 }
3438
3439 #[test]
3440 fn test_large_page_char_box_valid_bounds() {
3441 let chars: Vec<TextCharacter> = (0..200u32)
3445 .map(|i| make_char('A', i as f32 * 12.0, 500.0))
3446 .collect();
3447 let page = TextPage::new(chars);
3448 for i in 0..200 {
3449 let rect = page.char_box(i).expect("get_char_box should be Some");
3450 assert!(
3451 rect.right >= rect.left,
3452 "rect.right >= rect.left violated at index {i}"
3453 );
3454 assert!(
3455 rect.top >= rect.bottom,
3456 "rect.top >= rect.bottom violated at index {i}"
3457 );
3458 }
3459 }
3460
3461 #[test]
3462 fn test_large_page_char_origin_matches_matrix_ef() {
3463 let chars: Vec<TextCharacter> = (0..200u32)
3467 .map(|i| make_char('X', i as f32 * 7.5, 300.0 + (i % 10) as f32 * 15.0))
3468 .collect();
3469 let page = TextPage::new(chars);
3470 for i in 0..200 {
3471 let origin = page.char_origin(i).expect("get_char_origin should be Some");
3472 let matrix = page.matrix(i).expect("get_matrix should be Some");
3473 assert_eq!(
3474 origin.x, matrix[4],
3475 "CharOrigin.x != matrix[4] at index {i}"
3476 );
3477 assert_eq!(
3478 origin.y, matrix[5],
3479 "CharOrigin.y != matrix[5] at index {i}"
3480 );
3481 }
3482 }
3483
3484 #[test]
3485 fn test_large_page_segmentation_scales_to_10_lines() {
3486 let chars: Vec<TextCharacter> = (0..200u32)
3489 .map(|i| {
3490 let col = (i % 20) as f32;
3491 let row = (i / 20) as f32;
3492 make_char(
3493 (b'A' + (i % 26) as u8) as char,
3494 col * 10.0,
3495 700.0 - row * 20.0,
3496 )
3497 })
3498 .collect();
3499 let lines = segment_lines(&chars);
3500 assert_eq!(
3501 lines.len(),
3502 10,
3503 "expected 10 lines from 200 chars at 20/line"
3504 );
3505 for (idx, line) in lines.iter().enumerate() {
3506 assert!(
3507 !line.words.is_empty(),
3508 "line {idx} should have at least one word"
3509 );
3510 }
3511 }
3512
3513 #[test]
3514 fn test_large_page_search_finds_repeated_pattern() {
3515 let chars: Vec<TextCharacter> = (0..300u32)
3519 .map(|i| {
3520 let ch = match i % 3 {
3521 0 => 'A',
3522 1 => 'B',
3523 _ => 'C',
3524 };
3525 make_char(ch, i as f32 * 8.0, 500.0)
3526 })
3527 .collect();
3528 let page = TextPage::new(chars);
3529 let text = page.all_page_text().to_string();
3530 let mut finder = crate::textpagefind::TextPageFind::new(
3531 &text,
3532 "ABC",
3533 crate::textpagefind::SearchOptions::default(),
3534 );
3535 let mut count = 0usize;
3536 while finder.find_next().is_some() {
3537 count += 1;
3538 }
3539 assert_eq!(
3540 count, 100,
3541 "expected 100 occurrences of 'ABC' in 300-char page"
3542 );
3543 }
3544
3545 #[test]
3546 fn test_large_page_rect_array_single_line_merges_to_one() {
3547 let chars: Vec<TextCharacter> = (0..50u32)
3550 .map(|i| make_char('A', i as f32 * 10.0, 500.0))
3551 .collect();
3552 let run_ids: Vec<Option<u32>> = (0..50).map(|_| Some(1)).collect();
3554 let page = TextPage::new_with_run_ids(chars, run_ids, false);
3555 let rects = page.rect_array(0, 50);
3556 assert_eq!(
3557 rects.len(),
3558 1,
3559 "50 chars with same run ID should merge into 1 rect, got {}",
3560 rects.len()
3561 );
3562 assert_eq!(rects[0].left, 0.0);
3563 assert!(rects[0].right > rects[0].left);
3564 }
3565
3566 #[test]
3569 fn test_duplicate_same_position_same_run_deduplicated() {
3570 let ch1 = make_char('A', 72.0, 700.0);
3574 let ch2 = make_char('A', 72.0, 700.0);
3575 let mut extractor = TextExtractor::new();
3576 extractor.try_add_character(ch1, Some(1));
3577 extractor.try_add_character(ch2, Some(1));
3578 let (chars, run_ids) = extractor.into_characters();
3579 let page = TextPage::new_with_run_ids(chars, run_ids, false);
3580 assert!(
3582 page.char_count() < 2,
3583 "expected duplicate 'A' at same position to be removed, got {} chars",
3584 page.char_count()
3585 );
3586 }
3587
3588 #[test]
3589 fn test_duplicate_overlapping_position_no_dedup_different_content() {
3590 let ch1 = make_char('A', 72.0, 700.0);
3593 let ch2 = make_char('B', 72.0, 700.0);
3594 let mut extractor = TextExtractor::new();
3595 extractor.try_add_character(ch1, Some(1));
3596 extractor.try_add_character(ch2, Some(2));
3597 let (chars, run_ids) = extractor.into_characters();
3598 let page = TextPage::new_with_run_ids(chars, run_ids, false);
3599 assert_eq!(
3601 page.char_count(),
3602 2,
3603 "expected both 'A' and 'B' to be preserved (different unicode)"
3604 );
3605 }
3606
3607 #[test]
3624 fn test_char_index_from_text_index_first_is_zero() {
3625 let chars = vec![make_char('H', 0.0, 0.0), make_char('i', 10.0, 0.0)];
3626 let page = TextPage::new(chars);
3627 assert_eq!(
3628 page.char_index_from_text_index(0),
3629 Some(0),
3630 "first text index must map to first char index"
3631 );
3632 }
3633
3634 #[test]
3635 fn test_char_text_index_round_trip_all_positions() {
3636 let chars = vec![
3637 make_char('H', 0.0, 0.0),
3638 make_char('e', 10.0, 0.0),
3639 make_char('l', 20.0, 0.0),
3640 make_char('l', 30.0, 0.0),
3641 make_char('o', 40.0, 0.0),
3642 ];
3643 let page = TextPage::new(chars);
3644 for i in 0..5 {
3645 let char_idx = page.char_index_from_text_index(i);
3646 assert!(char_idx.is_some(), "text index {} must be in bounds", i);
3647 let round_trip = page.text_index_from_char_index(char_idx.unwrap());
3648 assert_eq!(
3649 round_trip,
3650 Some(i),
3651 "round-trip text->char->text failed at index {}",
3652 i
3653 );
3654 }
3655 }
3656
3657 #[test]
3658 fn test_char_index_from_text_index_out_of_bounds_returns_none() {
3659 let chars = vec![make_char('X', 0.0, 0.0)];
3660 let page = TextPage::new(chars);
3661 assert_eq!(page.char_index_from_text_index(1), None);
3663 assert_eq!(page.char_index_from_text_index(999), None);
3665 }
3666
3667 #[test]
3668 fn test_text_index_from_char_index_out_of_bounds_returns_none() {
3669 let chars = vec![make_char('Y', 0.0, 0.0)];
3670 let page = TextPage::new(chars);
3671 assert_eq!(page.text_index_from_char_index(1), None);
3672 assert_eq!(page.text_index_from_char_index(usize::MAX), None);
3673 }
3674
3675 #[test]
3676 fn test_char_text_index_mapping_consistent_with_char_count() {
3677 let chars = vec![
3678 make_char('A', 0.0, 0.0),
3679 make_char('B', 10.0, 0.0),
3680 make_char('C', 20.0, 0.0),
3681 ];
3682 let page = TextPage::new(chars);
3683 let n = page.char_count();
3684 for i in 0..n {
3686 assert!(
3687 page.char_index_from_text_index(i).is_some(),
3688 "text index {} should be valid (char_count={})",
3689 i,
3690 n
3691 );
3692 assert!(
3693 page.text_index_from_char_index(i).is_some(),
3694 "char index {} should be valid (char_count={})",
3695 i,
3696 n
3697 );
3698 }
3699 assert!(page.char_index_from_text_index(n).is_none());
3701 assert!(page.text_index_from_char_index(n).is_none());
3702 }
3703
3704 #[test]
3705 fn test_char_text_index_empty_page_both_out_of_bounds() {
3706 let page = TextPage::new(vec![]);
3707 assert_eq!(page.char_index_from_text_index(0), None);
3708 assert_eq!(page.text_index_from_char_index(0), None);
3709 }
3710
3711 #[test]
3714 fn test_char_render_mode_default_is_fill() {
3715 let chars = vec![make_char('A', 0.0, 0.0)];
3716 let page = TextPage::new(chars);
3717 assert_eq!(page.char_render_mode(0), Some(TextRenderingMode::Fill));
3718 }
3719
3720 #[test]
3721 fn test_char_render_mode_stroke() {
3722 let mut ch = make_char('A', 0.0, 0.0);
3723 ch.rendering_mode = TextRenderingMode::Stroke;
3724 let page = TextPage::new(vec![ch]);
3725 assert_eq!(page.char_render_mode(0), Some(TextRenderingMode::Stroke));
3726 }
3727
3728 #[test]
3729 fn test_char_render_mode_fill_stroke_clip() {
3730 let mut ch = make_char('A', 0.0, 0.0);
3731 ch.rendering_mode = TextRenderingMode::FillStrokeClip;
3732 let page = TextPage::new(vec![ch]);
3733 assert_eq!(
3734 page.char_render_mode(0),
3735 Some(TextRenderingMode::FillStrokeClip)
3736 );
3737 }
3738
3739 #[test]
3740 fn test_char_render_mode_out_of_bounds() {
3741 let page = TextPage::new(vec![]);
3742 assert_eq!(page.char_render_mode(0), None);
3743 }
3744
3745 #[test]
3746 fn test_char_render_mode_returns_mode() {
3747 let mut ch = make_char('B', 0.0, 0.0);
3748 ch.rendering_mode = TextRenderingMode::Invisible;
3749 let page = TextPage::new(vec![ch]);
3750 assert_eq!(page.char_render_mode(0), Some(TextRenderingMode::Invisible));
3751 }
3752
3753 #[test]
3756 fn test_char_index_at_pos_f64_basic() {
3757 let chars = vec![make_char('A', 10.0, 20.0), make_char('B', 30.0, 20.0)];
3758 let page = TextPage::new(chars);
3759 assert_eq!(page.char_index_at_pos(15.0, 26.0, 0.0, 0.0), Some(0));
3761 }
3762
3763 #[test]
3764 fn test_char_index_at_pos_f64_with_tolerance() {
3765 let chars = vec![make_char('A', 10.0, 20.0)];
3770 let page = TextPage::new(chars);
3771 assert_eq!(page.char_index_at_pos(8.0, 26.0, 6.0, 2.0), Some(0));
3773 assert_eq!(page.char_index_at_pos(4.0, 26.0, 6.0, 2.0), None);
3775 }
3776
3777 #[test]
3778 fn test_char_index_at_pos_f64_no_match() {
3779 let chars = vec![make_char('A', 10.0, 20.0)];
3780 let page = TextPage::new(chars);
3781 assert_eq!(page.char_index_at_pos(100.0, 100.0, 1.0, 1.0), None);
3782 }
3783
3784 #[test]
3787 fn test_text_in_rect_basic() {
3788 let chars = vec![
3789 make_char('A', 10.0, 20.0),
3790 make_char('B', 30.0, 20.0),
3791 make_char('C', 50.0, 50.0),
3792 ];
3793 let page = TextPage::new(chars);
3794 assert_eq!(page.text_in_rect(0.0, 40.0, 40.0, 10.0), "AB");
3796 }
3797
3798 #[test]
3799 fn test_text_in_rect_empty() {
3800 let chars = vec![make_char('A', 10.0, 20.0)];
3801 let page = TextPage::new(chars);
3802 assert_eq!(page.text_in_rect(100.0, 200.0, 200.0, 100.0), "");
3803 }
3804
3805 #[allow(deprecated)]
3806 #[test]
3807 fn test_text_in_rect_alias() {
3808 let chars = vec![make_char('X', 5.0, 5.0)];
3809 let page = TextPage::new(chars);
3810 assert_eq!(page.get_text_in_rect(0.0, 20.0, 20.0, 0.0), "X");
3811 }
3812
3813 #[test]
3818 fn test_text_page_char_object_index_returns_none() {
3819 let chars = vec![make_char('A', 0.0, 0.0)];
3820 let page = TextPage::new(chars);
3821 assert!(page.char_object_index(0).is_none());
3823 assert!(page.text_get_char_object(0).is_none());
3824 }
3825
3826 #[test]
3827 fn test_text_page_has_text_object_for_char_false() {
3828 let chars = vec![make_char('B', 10.0, 10.0)];
3829 let page = TextPage::new(chars);
3830 assert!(!page.has_text_object_for_char(0));
3832 assert!(!page.has_text_object_for_char(99));
3833 }
3834}