1use crate::clustering::cluster_items;
2use crate::geometry::objects_to_bbox;
3use crate::types::{BBox, Char, Direction, SearchMatch, TextLine, Word};
4
5#[derive(Debug, Clone)]
6pub struct TextOptions {
7 pub x_tolerance: f64,
8 pub y_tolerance: f64,
9 pub x_tolerance_ratio: Option<f64>,
10 pub y_tolerance_ratio: Option<f64>,
11 pub layout: bool,
12 pub layout_width: Option<f64>,
13 pub layout_height: Option<f64>,
14 pub layout_width_chars: Option<usize>,
15 pub layout_height_chars: Option<usize>,
16 pub layout_bbox: Option<BBox>,
17 pub x_density: f64,
18 pub y_density: f64,
19 pub x_shift: f64,
20 pub y_shift: f64,
21 pub line_dir: Direction,
22 pub char_dir: Direction,
23 pub line_dir_rotated: Option<Direction>,
24 pub char_dir_rotated: Option<Direction>,
25 pub line_dir_render: Option<Direction>,
26 pub char_dir_render: Option<Direction>,
27 pub keep_blank_chars: bool,
28 pub use_text_flow: bool,
29 pub split_at_punctuation: Option<String>,
30 pub expand_ligatures: bool,
31}
32
33impl Default for TextOptions {
34 fn default() -> Self {
35 Self {
36 x_tolerance: 3.0,
37 y_tolerance: 3.0,
38 x_tolerance_ratio: None,
39 y_tolerance_ratio: None,
40 layout: false,
41 layout_width: None,
42 layout_height: None,
43 layout_width_chars: None,
44 layout_height_chars: None,
45 layout_bbox: None,
46 x_density: 7.25,
47 y_density: 13.0,
48 x_shift: 0.0,
49 y_shift: 0.0,
50 line_dir: Direction::Ttb,
51 char_dir: Direction::Ltr,
52 line_dir_rotated: None,
53 char_dir_rotated: None,
54 line_dir_render: None,
55 char_dir_render: None,
56 keep_blank_chars: false,
57 use_text_flow: false,
58 split_at_punctuation: None,
59 expand_ligatures: true,
60 }
61 }
62}
63
64impl TextOptions {
65 pub fn resolved_line_dir_rotated(&self) -> Direction {
66 self.line_dir_rotated.unwrap_or(self.char_dir)
67 }
68
69 pub fn resolved_char_dir_rotated(&self) -> Direction {
70 self.char_dir_rotated.unwrap_or(self.line_dir)
71 }
72
73 pub fn resolved_line_dir_render(&self) -> Direction {
74 self.line_dir_render.unwrap_or(self.line_dir)
75 }
76
77 pub fn resolved_char_dir_render(&self) -> Direction {
78 self.char_dir_render.unwrap_or(self.char_dir)
79 }
80}
81
/// Settings for character de-duplication.
#[derive(Debug, Clone)]
pub struct DedupeOptions {
    /// Clustering tolerance applied to both the vertical and the horizontal
    /// position when looking for duplicates.
    pub tolerance: f64,
    /// Names of extra attributes that must agree for two characters to be
    /// considered duplicates.
    pub extra_attrs: Vec<String>,
}

impl Default for DedupeOptions {
    /// Tolerance of 1.0, comparing `fontname` and `size` in addition to the
    /// built-in keys.
    fn default() -> Self {
        let extra_attrs = ["fontname", "size"]
            .iter()
            .map(|name| name.to_string())
            .collect();
        DedupeOptions {
            tolerance: 1.0,
            extra_attrs,
        }
    }
}
96
/// Settings for searching a text map.
#[derive(Debug, Clone)]
pub struct SearchOptions {
    /// Interpret the pattern as a regular expression (otherwise it is escaped
    /// and matched literally).
    pub regex: bool,
    /// Match case-sensitively.
    pub case_sensitive: bool,
    /// Index of the capture group whose span defines each match (0 = whole match).
    pub main_group: usize,
    /// Include the text of capture groups 1.. in each result.
    pub return_groups: bool,
    /// Include the matched characters in each result.
    pub return_chars: bool,
}

impl Default for SearchOptions {
    /// Case-sensitive regex search on the whole match, returning both groups
    /// and characters.
    fn default() -> Self {
        SearchOptions {
            main_group: 0,
            regex: true,
            case_sensitive: true,
            return_groups: true,
            return_chars: true,
        }
    }
}
117
118#[derive(Debug, Clone)]
119pub struct WordMap {
120 pub tuples: Vec<(Word, Vec<Char>)>,
121}
122
123impl WordMap {
124 pub fn to_textmap(&self, options: &TextOptions) -> TextMap {
125 let mut tuples: Vec<(char, Option<Char>)> = Vec::new();
126 if self.tuples.is_empty() {
127 return TextMap {
128 tuples,
129 line_dir_render: options.resolved_line_dir_render(),
130 char_dir_render: options.resolved_char_dir_render(),
131 };
132 }
133
134 let expansions = |text: &str| -> String {
135 if !options.expand_ligatures {
136 return text.to_string();
137 }
138 match text {
139 "ff" => "ff".to_string(),
140 "ffi" => "ffi".to_string(),
141 "ffl" => "ffl".to_string(),
142 "fi" => "fi".to_string(),
143 "fl" => "fl".to_string(),
144 "st" => "st".to_string(),
145 "ſt" => "st".to_string(),
146 _ => text.to_string(),
147 }
148 };
149
150 let mut width_chars = options.layout_width_chars.unwrap_or(0);
151 if width_chars == 0 {
152 if let Some(width) = options.layout_width {
153 width_chars = (width / options.x_density).round() as usize;
154 }
155 }
156
157 let mut height_chars = options.layout_height_chars.unwrap_or(0);
158 if height_chars == 0 {
159 if let Some(height) = options.layout_height {
160 height_chars = (height / options.y_density).round() as usize;
161 }
162 }
163
164 let layout_bbox = options.layout_bbox.unwrap_or_else(|| {
165 let words: Vec<Word> = self.tuples.iter().map(|(word, _)| word.clone()).collect();
166 objects_to_bbox(&words).unwrap_or_default()
167 });
168
169 let blank_line: Vec<(char, Option<Char>)> = if options.layout {
170 vec![(' ', None); width_chars]
171 } else {
172 Vec::new()
173 };
174
175 let words_sorted = {
176 let mut items = self.tuples.clone();
177 items.sort_by(|a, b| {
178 let va = line_cluster_value(&a.0, options.line_dir);
179 let vb = line_cluster_value(&b.0, options.line_dir);
180 va.total_cmp(&vb)
181 });
182 items
183 };
184
185 let line_tuples = cluster_items(
186 &words_sorted,
187 |pair| line_cluster_value(&pair.0, options.line_dir),
188 options.y_tolerance,
189 );
190
191 let line_position_key = position_key_from_bbox(layout_bbox, options.line_dir);
192 let char_position_origin = position_key_from_bbox(layout_bbox, options.char_dir);
193
194 let mut num_newlines = 0isize;
195
196 for (line_index, mut line) in line_tuples.into_iter().enumerate() {
197 if !options.use_text_flow {
198 line.sort_by(|a, b| {
199 let ka = sort_key(&a.0, options.char_dir);
200 let kb = sort_key(&b.0, options.char_dir);
201 ka.0.total_cmp(&kb.0).then_with(|| ka.1.total_cmp(&kb.1))
202 });
203 }
204
205 let y_dist = if options.layout {
206 let line_position = position_value(&line[0].0, options.line_dir);
207 let raw = line_position - (line_position_key + options.y_shift);
208 let adj = if matches!(options.line_dir, Direction::Btt | Direction::Rtl) {
209 -1.0
210 } else {
211 1.0
212 };
213 raw * adj / options.y_density
214 } else {
215 0.0
216 };
217
218 let target_newlines = if line_index > 0 { 1 } else { 0 };
219 let prepend = std::cmp::max(target_newlines, (y_dist.round() as isize) - num_newlines);
220
221 for _ in 0..prepend.max(0) as usize {
222 if tuples.is_empty() || tuples.last().map(|(c, _)| *c == '\n').unwrap_or(false) {
223 tuples.extend(blank_line.clone());
224 }
225 tuples.push(('\n', None));
226 }
227 num_newlines += prepend.max(0);
228
229 let mut line_len: isize = 0;
230 for (word, chars) in line {
231 let x_dist = if options.layout {
232 let char_position = position_value(&word, options.char_dir);
233 let raw = char_position - (char_position_origin + options.x_shift);
234 let adj = if matches!(options.char_dir, Direction::Btt | Direction::Rtl) {
235 -1.0
236 } else {
237 1.0
238 };
239 raw * adj / options.x_density
240 } else {
241 0.0
242 };
243
244 let prepend_spaces = std::cmp::max(std::cmp::min(1, line_len), (x_dist.round() as isize) - line_len);
245 for _ in 0..prepend_spaces.max(0) as usize {
246 tuples.push((' ', None));
247 }
248 line_len += prepend_spaces.max(0);
249
250 for ch in chars {
251 let expanded = expansions(&ch.text);
252 for letter in expanded.chars() {
253 tuples.push((letter, Some(ch.clone())));
254 line_len += 1;
255 }
256 }
257 }
258
259 if options.layout && width_chars > 0 && line_len < width_chars as isize {
260 for _ in 0..(width_chars as isize - line_len) as usize {
261 tuples.push((' ', None));
262 }
263 }
264 }
265
266 if options.layout && height_chars > 0 {
267 let append = height_chars as isize - (num_newlines + 1);
268 for i in 0..append.max(0) as usize {
269 if i > 0 {
270 tuples.extend(blank_line.clone());
271 }
272 tuples.push(('\n', None));
273 }
274 if tuples.last().map(|(c, _)| *c == '\n').unwrap_or(false) {
275 tuples.pop();
276 }
277 }
278
279 TextMap {
280 tuples,
281 line_dir_render: options.resolved_line_dir_render(),
282 char_dir_render: options.resolved_char_dir_render(),
283 }
284 }
285}
286
287#[derive(Debug, Clone)]
288pub struct TextMap {
289 pub tuples: Vec<(char, Option<Char>)>,
290 pub line_dir_render: Direction,
291 pub char_dir_render: Direction,
292}
293
294impl TextMap {
295 pub fn as_string(&self) -> String {
296 let base: String = self.tuples.iter().map(|(c, _)| *c).collect();
297 if self.char_dir_render == Direction::Ltr && self.line_dir_render == Direction::Ttb {
298 return base;
299 }
300
301 let mut lines: Vec<String> = base.lines().map(|line| line.to_string()).collect();
302
303 if matches!(self.line_dir_render, Direction::Btt | Direction::Rtl) {
304 lines.reverse();
305 }
306
307 if self.char_dir_render == Direction::Rtl {
308 lines = lines.into_iter().map(|line| line.chars().rev().collect()).collect();
309 }
310
311 if matches!(self.line_dir_render, Direction::Rtl | Direction::Ltr) {
312 let max_line_len = lines.iter().map(|line| line.chars().count()).max().unwrap_or(0);
313 let padded: Vec<Vec<char>> = lines
314 .iter()
315 .map(|line| {
316 let mut chars: Vec<char> = line.chars().collect();
317 while chars.len() < max_line_len {
318 if self.char_dir_render == Direction::Btt {
319 chars.insert(0, ' ');
320 } else {
321 chars.push(' ');
322 }
323 }
324 chars
325 })
326 .collect();
327
328 let mut out = String::new();
329 for idx in 0..max_line_len {
330 for row in &padded {
331 out.push(row[idx]);
332 }
333 if idx + 1 != max_line_len {
334 out.push('\n');
335 }
336 }
337 return out;
338 }
339
340 lines.join("\n")
341 }
342
343 pub fn extract_text_lines(&self, strip: bool, return_chars: bool) -> Vec<TextLine> {
344 let text: String = self.tuples.iter().map(|(c, _)| *c).collect();
346 let mut out = Vec::new();
347 let mut offset = 0usize;
348 for raw_line in text.split('\n') {
349 let line = if strip { raw_line.trim() } else { raw_line };
350 let char_count = raw_line.chars().count();
351 if line.is_empty() {
352 offset += char_count + 1;
353 continue;
354 }
355
356 let chars: Vec<Char> = self
357 .slice_chars(offset, offset + char_count)
358 .into_iter()
359 .collect();
360
361 if let Some(bbox) = objects_to_bbox(&chars) {
362 out.push(TextLine {
363 text: line.to_string(),
364 x0: bbox.x0,
365 top: bbox.top,
366 x1: bbox.x1,
367 bottom: bbox.bottom,
368 chars: if return_chars { Some(chars) } else { None },
369 });
370 }
371 offset += char_count + 1;
372 }
373 out
374 }
375
376 pub fn search(&self, pattern: &str, options: &SearchOptions) -> crate::Result<Vec<SearchMatch>> {
377 let regex = if options.regex {
378 regex::RegexBuilder::new(pattern)
379 .case_insensitive(!options.case_sensitive)
380 .build()?
381 } else {
382 regex::RegexBuilder::new(®ex::escape(pattern))
383 .case_insensitive(!options.case_sensitive)
384 .build()?
385 };
386
387 let haystack: String = self.tuples.iter().map(|(c, _)| *c).collect();
391 let mut out = Vec::new();
392
393 for captures in regex.captures_iter(&haystack) {
394 let Some(main) = captures.get(options.main_group) else {
395 continue;
396 };
397 if main.as_str().trim().is_empty() {
398 continue;
399 }
400
401 let start = byte_to_char_index(&haystack, main.start());
402 let end = byte_to_char_index(&haystack, main.end());
403
404 let chars = self.slice_chars(start, end);
405 if chars.is_empty() {
406 continue;
407 }
408 let Some(bbox) = objects_to_bbox(&chars) else {
409 continue;
410 };
411
412 let groups = if options.return_groups {
413 let mut gs = Vec::new();
414 for idx in 1..captures.len() {
415 gs.push(captures.get(idx).map(|m| m.as_str().to_string()));
416 }
417 Some(gs)
418 } else {
419 None
420 };
421
422 out.push(SearchMatch {
423 text: main.as_str().to_string(),
424 x0: bbox.x0,
425 top: bbox.top,
426 x1: bbox.x1,
427 bottom: bbox.bottom,
428 groups,
429 chars: if options.return_chars { Some(chars) } else { None },
430 });
431 }
432
433 Ok(out)
434 }
435
436 fn slice_chars(&self, start: usize, end: usize) -> Vec<Char> {
437 let start = start.min(self.tuples.len());
438 let end = end.min(self.tuples.len());
439 if start >= end {
440 return Vec::new();
441 }
442 self.tuples[start..end]
443 .iter()
444 .filter_map(|(_, ch)| ch.clone())
445 .collect()
446 }
447}
448
449#[derive(Debug, Clone)]
450pub struct WordExtractor {
451 pub options: TextOptions,
452}
453
454impl WordExtractor {
455 pub fn new(options: TextOptions) -> Self {
456 Self { options }
457 }
458
459 pub fn extract_wordmap(&self, chars: &[Char], return_chars: bool) -> WordMap {
460 let mut tuples = Vec::new();
461 for (word, group) in self.iter_extract_tuples(chars, return_chars) {
462 tuples.push((word, group));
463 }
464 WordMap { tuples }
465 }
466
467 pub fn extract_words(&self, chars: &[Char], return_chars: bool) -> Vec<Word> {
468 self.iter_extract_tuples(chars, return_chars)
469 .into_iter()
470 .map(|(word, _)| word)
471 .collect()
472 }
473
474 fn iter_extract_tuples(&self, chars: &[Char], return_chars: bool) -> Vec<(Word, Vec<Char>)> {
475 let mut sorted = chars.to_vec();
476 if !self.options.use_text_flow {
477 sorted.sort_by(|a, b| {
478 a.upright
479 .cmp(&b.upright)
480 .then_with(|| a.doctop.total_cmp(&b.doctop))
481 .then_with(|| a.x0.total_cmp(&b.x0))
482 });
483 }
484
485 let mut groups: Vec<Vec<Char>> = Vec::new();
486 for ch in sorted {
487 if let Some(last_group) = groups.last_mut() {
488 let same_upright = last_group.last().map(|item| item.upright == ch.upright).unwrap_or(false);
489 if same_upright {
490 last_group.push(ch);
491 } else {
492 groups.push(vec![ch]);
493 }
494 } else {
495 groups.push(vec![ch]);
496 }
497 }
498
499 let mut out = Vec::new();
500 for group in groups {
501 for (chars_in_line, direction) in self.iter_chars_to_lines(&group) {
502 for word_chars in self.iter_chars_to_words(&chars_in_line, direction) {
503 let word = self.merge_chars(&word_chars, direction, return_chars);
504 out.push((word, word_chars));
505 }
506 }
507 }
508 out
509 }
510
511 fn merge_chars(&self, ordered_chars: &[Char], direction: Direction, return_chars: bool) -> Word {
512 let bbox = objects_to_bbox(ordered_chars).unwrap_or_default();
513 let doctop_adj = ordered_chars.first().map(|item| item.doctop - item.top).unwrap_or(0.0);
514 Word {
515 text: ordered_chars
516 .iter()
517 .map(|ch| {
518 if self.options.expand_ligatures {
519 match ch.text.as_str() {
520 "ff" => "ff",
521 "ffi" => "ffi",
522 "ffl" => "ffl",
523 "fi" => "fi",
524 "fl" => "fl",
525 "st" => "st",
526 "ſt" => "st",
527 _ => ch.text.as_str(),
528 }
529 } else {
530 ch.text.as_str()
531 }
532 })
533 .collect(),
534 x0: bbox.x0,
535 top: bbox.top,
536 x1: bbox.x1,
537 bottom: bbox.bottom,
538 doctop: bbox.top + doctop_adj,
539 width: bbox.width(),
540 height: bbox.height(),
541 upright: ordered_chars.first().map(|item| item.upright).unwrap_or(true),
542 direction,
543 chars: if return_chars { Some(ordered_chars.to_vec()) } else { None },
544 }
545 }
546
547 fn char_dir(&self, upright: bool) -> Direction {
548 if upright {
549 self.options.char_dir
550 } else {
551 self.options.resolved_char_dir_rotated()
552 }
553 }
554
555 fn line_dir(&self, upright: bool) -> Direction {
556 if upright {
557 self.options.line_dir
558 } else {
559 self.options.resolved_line_dir_rotated()
560 }
561 }
562
563 fn iter_chars_to_lines(&self, chars: &[Char]) -> Vec<(Vec<Char>, Direction)> {
564 if chars.is_empty() {
565 return Vec::new();
566 }
567 let upright = chars[0].upright;
568 let line_dir = self.line_dir(upright);
569 let char_dir = self.char_dir(upright);
570
571 let tol = if matches!(line_dir, Direction::Ttb | Direction::Btt) {
572 self.options.y_tolerance
573 } else {
574 self.options.x_tolerance
575 };
576
577 let mut line_groups = cluster_items(chars, |ch| line_cluster_value(ch, line_dir), tol);
578
579 for group in &mut line_groups {
580 group.sort_by(|a, b| {
581 let ka = sort_key(a, char_dir);
582 let kb = sort_key(b, char_dir);
583 ka.0.total_cmp(&kb.0).then_with(|| ka.1.total_cmp(&kb.1))
584 });
585 }
586
587 line_groups.into_iter().map(|group| (group, char_dir)).collect()
588 }
589
590 fn iter_chars_to_words(&self, ordered_chars: &[Char], direction: Direction) -> Vec<Vec<Char>> {
591 let mut words: Vec<Vec<Char>> = Vec::new();
592 let punctuation = self.options.split_at_punctuation.clone().unwrap_or_default();
593 let mut saw_space = false;
594
595 for ch in ordered_chars.iter().cloned() {
596 if !self.options.keep_blank_chars && ch.text.chars().all(|c| c.is_whitespace()) {
597 saw_space = true;
598 continue;
599 }
600
601 if !punctuation.is_empty() && ch.text.chars().all(|c| punctuation.contains(c)) {
602 words.push(vec![ch]);
603 continue;
604 }
605
606 let should_start_new = saw_space
607 || words
608 .last()
609 .and_then(|word| word.last())
610 .map(|prev| {
611 let x_tol = self
612 .options
613 .x_tolerance_ratio
614 .map(|ratio| ratio * prev.size)
615 .unwrap_or(self.options.x_tolerance);
616
617 let y_tol = self
618 .options
619 .y_tolerance_ratio
620 .map(|ratio| ratio * prev.size)
621 .unwrap_or(self.options.y_tolerance);
622
623 char_begins_new_word(prev, &ch, direction, x_tol, y_tol)
624 })
625 .unwrap_or(false);
626 saw_space = false;
627
628 if should_start_new {
629 words.push(vec![ch]);
630 } else if let Some(last) = words.last_mut() {
631 last.push(ch);
632 } else {
633 words.push(vec![ch]);
634 }
635 }
636
637 words.into_iter().filter(|word| !word.is_empty()).collect()
638 }
639}
640
641pub fn chars_to_textmap(chars: &[Char], options: &TextOptions) -> TextMap {
642 let mut opts = options.clone();
643 if opts.layout_bbox.is_none() {
644 opts.layout_bbox = objects_to_bbox(chars);
645 }
646 if opts.layout_width.is_none() {
647 if let Some(bbox) = opts.layout_bbox {
648 opts.layout_width = Some(bbox.width());
649 }
650 }
651 if opts.layout_height.is_none() {
652 if let Some(bbox) = opts.layout_bbox {
653 opts.layout_height = Some(bbox.height());
654 }
655 }
656
657 let extractor = WordExtractor::new(opts.clone());
658 extractor.extract_wordmap(chars, true).to_textmap(&opts)
659}
660
661pub fn extract_text(chars: &[Char], options: &TextOptions) -> String {
662 chars_to_textmap(chars, options).as_string()
663}
664
665pub fn extract_words(chars: &[Char], options: &TextOptions, return_chars: bool) -> Vec<Word> {
666 WordExtractor::new(options.clone()).extract_words(chars, return_chars)
667}
668
669pub fn extract_text_lines(chars: &[Char], options: &TextOptions, strip: bool, return_chars: bool) -> Vec<TextLine> {
670 chars_to_textmap(chars, options).extract_text_lines(strip, return_chars)
671}
672
673pub fn extract_text_simple(chars: &[Char], x_tolerance: f64, y_tolerance: f64) -> String {
674 let clustered = cluster_items(chars, |ch| ch.doctop, y_tolerance);
675 clustered
676 .into_iter()
677 .map(|mut line| {
678 line.sort_by(|a, b| a.x0.total_cmp(&b.x0));
679 collate_line(&line, x_tolerance)
680 })
681 .collect::<Vec<String>>()
682 .join("\n")
683}
684
685pub fn collate_line(line_chars: &[Char], tolerance: f64) -> String {
686 let mut line = String::new();
687 let mut last_x1: Option<f64> = None;
688 for ch in line_chars {
689 if let Some(prev_x1) = last_x1 {
690 if ch.x0 > prev_x1 + tolerance {
691 line.push(' ');
692 }
693 }
694 line.push_str(&ch.text);
695 last_x1 = Some(ch.x1);
696 }
697 line
698}
699
700pub fn dedupe_chars(chars: &[Char], options: &DedupeOptions) -> Vec<Char> {
701 if chars.is_empty() {
702 return Vec::new();
703 }
704
705 let mut indexed: Vec<(usize, Char)> = chars.iter().cloned().enumerate().collect();
706 indexed.sort_by(|a, b| dedupe_cmp(&a.1, &b.1, &options.extra_attrs));
707
708 let mut kept: Vec<(usize, Char)> = Vec::new();
709 let mut start = 0usize;
710 while start < indexed.len() {
711 let mut end = start + 1;
712 while end < indexed.len()
713 && dedupe_same_key(&indexed[start].1, &indexed[end].1, &options.extra_attrs)
714 {
715 end += 1;
716 }
717
718 let group: Vec<(usize, Char)> = indexed[start..end].to_vec();
719 let y_clusters = cluster_items(&group, |(_, ch)| ch.doctop, options.tolerance);
720 for y_cluster in y_clusters {
721 let x_clusters = cluster_items(&y_cluster, |(_, ch)| ch.x0, options.tolerance);
722 for x_cluster in x_clusters {
723 let mut cluster = x_cluster;
724 cluster.sort_by(|a, b| {
725 a.1.doctop
726 .total_cmp(&b.1.doctop)
727 .then_with(|| a.1.x0.total_cmp(&b.1.x0))
728 });
729 kept.push(cluster[0].clone());
730 }
731 }
732
733 start = end;
734 }
735
736 kept.sort_by(|a, b| a.0.cmp(&b.0));
737 kept.into_iter().map(|(_, ch)| ch).collect()
738}
739
740fn dedupe_cmp(a: &Char, b: &Char, extra_attrs: &[String]) -> std::cmp::Ordering {
741 a.upright
742 .cmp(&b.upright)
743 .then_with(|| a.text.cmp(&b.text))
744 .then_with(|| extra_attr_cmp(a, b, extra_attrs))
745 .then_with(|| a.doctop.total_cmp(&b.doctop))
746 .then_with(|| a.x0.total_cmp(&b.x0))
747}
748
749fn extra_attr_cmp(a: &Char, b: &Char, extra_attrs: &[String]) -> std::cmp::Ordering {
750 for attr in extra_attrs {
751 let ord = match attr.as_str() {
752 "fontname" => a.fontname.cmp(&b.fontname),
753 "size" => a.size.total_cmp(&b.size),
754 _ => std::cmp::Ordering::Equal,
755 };
756 if ord != std::cmp::Ordering::Equal {
757 return ord;
758 }
759 }
760 std::cmp::Ordering::Equal
761}
762
763fn dedupe_same_key(a: &Char, b: &Char, extra_attrs: &[String]) -> bool {
764 if a.upright != b.upright || a.text != b.text {
765 return false;
766 }
767 extra_attr_cmp(a, b, extra_attrs) == std::cmp::Ordering::Equal
768}
769
/// Converts a byte offset into `s` to the number of characters preceding it.
///
/// # Panics
///
/// Panics if `byte_idx` is past the end of `s` or not on a character boundary.
fn byte_to_char_index(s: &str, byte_idx: usize) -> usize {
    let (prefix, _) = s.split_at(byte_idx);
    prefix.chars().count()
}
773
774fn position_key_from_bbox(bbox: BBox, direction: Direction) -> f64 {
775 match direction {
776 Direction::Ttb => bbox.top,
777 Direction::Btt => bbox.bottom,
778 Direction::Ltr => bbox.x0,
779 Direction::Rtl => bbox.x1,
780 }
781}
782
783fn position_value<T: TextObject>(obj: &T, direction: Direction) -> f64 {
784 match direction {
785 Direction::Ttb => obj.top(),
786 Direction::Btt => obj.bottom(),
787 Direction::Ltr => obj.x0(),
788 Direction::Rtl => obj.x1(),
789 }
790}
791
792fn line_cluster_value<T: TextObject>(obj: &T, direction: Direction) -> f64 {
793 match direction {
794 Direction::Ttb => obj.top(),
795 Direction::Btt => -obj.bottom(),
796 Direction::Ltr => obj.x0(),
797 Direction::Rtl => -obj.x1(),
798 }
799}
800
801fn sort_key<T: TextObject>(obj: &T, direction: Direction) -> (f64, f64) {
802 match direction {
803 Direction::Ttb => (obj.top(), obj.bottom()),
804 Direction::Btt => (-(obj.top() + obj.height()), -obj.top()),
805 Direction::Ltr => (obj.x0(), obj.x0()),
806 Direction::Rtl => (-obj.x1(), -obj.x0()),
807 }
808}
809
810fn char_begins_new_word(prev: &Char, curr: &Char, direction: Direction, x_tolerance: f64, y_tolerance: f64) -> bool {
811 let (ax, bx, cx, ay, cy, x, y) = match direction {
812 Direction::Ltr => (
813 prev.x0,
814 prev.x1,
815 curr.x0,
816 prev.top,
817 curr.top,
818 x_tolerance,
819 y_tolerance,
820 ),
821 Direction::Rtl => (
822 -prev.x1,
823 -prev.x0,
824 -curr.x1,
825 prev.top,
826 curr.top,
827 x_tolerance,
828 y_tolerance,
829 ),
830 Direction::Ttb => (
831 prev.top,
832 prev.bottom,
833 curr.top,
834 prev.x0,
835 curr.x0,
836 y_tolerance,
837 x_tolerance,
838 ),
839 Direction::Btt => (
840 -prev.bottom,
841 -prev.top,
842 -curr.bottom,
843 prev.x0,
844 curr.x0,
845 y_tolerance,
846 x_tolerance,
847 ),
848 };
849
850 (cx < ax) || (cx > bx + x) || (cy - ay).abs() > y
851}
852
853trait TextObject {
854 fn x0(&self) -> f64;
855 fn x1(&self) -> f64;
856 fn top(&self) -> f64;
857 fn bottom(&self) -> f64;
858 fn height(&self) -> f64;
859}
860
861impl TextObject for Char {
862 fn x0(&self) -> f64 { self.x0 }
863 fn x1(&self) -> f64 { self.x1 }
864 fn top(&self) -> f64 { self.top }
865 fn bottom(&self) -> f64 { self.bottom }
866 fn height(&self) -> f64 { self.height }
867}
868
869impl TextObject for Word {
870 fn x0(&self) -> f64 { self.x0 }
871 fn x1(&self) -> f64 { self.x1 }
872 fn top(&self) -> f64 { self.top }
873 fn bottom(&self) -> f64 { self.bottom }
874 fn height(&self) -> f64 { self.height }
875}