// pdfsink_rs/text.rs

1use crate::clustering::cluster_items;
2use crate::geometry::objects_to_bbox;
3use crate::types::{BBox, Char, Direction, SearchMatch, TextLine, Word};
4
5#[derive(Debug, Clone)]
6pub struct TextOptions {
7    pub x_tolerance: f64,
8    pub y_tolerance: f64,
9    pub x_tolerance_ratio: Option<f64>,
10    pub y_tolerance_ratio: Option<f64>,
11    pub layout: bool,
12    pub layout_width: Option<f64>,
13    pub layout_height: Option<f64>,
14    pub layout_width_chars: Option<usize>,
15    pub layout_height_chars: Option<usize>,
16    pub layout_bbox: Option<BBox>,
17    pub x_density: f64,
18    pub y_density: f64,
19    pub x_shift: f64,
20    pub y_shift: f64,
21    pub line_dir: Direction,
22    pub char_dir: Direction,
23    pub line_dir_rotated: Option<Direction>,
24    pub char_dir_rotated: Option<Direction>,
25    pub line_dir_render: Option<Direction>,
26    pub char_dir_render: Option<Direction>,
27    pub keep_blank_chars: bool,
28    pub use_text_flow: bool,
29    pub split_at_punctuation: Option<String>,
30    pub expand_ligatures: bool,
31}
32
33impl Default for TextOptions {
34    fn default() -> Self {
35        Self {
36            x_tolerance: 3.0,
37            y_tolerance: 3.0,
38            x_tolerance_ratio: None,
39            y_tolerance_ratio: None,
40            layout: false,
41            layout_width: None,
42            layout_height: None,
43            layout_width_chars: None,
44            layout_height_chars: None,
45            layout_bbox: None,
46            x_density: 7.25,
47            y_density: 13.0,
48            x_shift: 0.0,
49            y_shift: 0.0,
50            line_dir: Direction::Ttb,
51            char_dir: Direction::Ltr,
52            line_dir_rotated: None,
53            char_dir_rotated: None,
54            line_dir_render: None,
55            char_dir_render: None,
56            keep_blank_chars: false,
57            use_text_flow: false,
58            split_at_punctuation: None,
59            expand_ligatures: true,
60        }
61    }
62}
63
64impl TextOptions {
65    pub fn resolved_line_dir_rotated(&self) -> Direction {
66        self.line_dir_rotated.unwrap_or(self.char_dir)
67    }
68
69    pub fn resolved_char_dir_rotated(&self) -> Direction {
70        self.char_dir_rotated.unwrap_or(self.line_dir)
71    }
72
73    pub fn resolved_line_dir_render(&self) -> Direction {
74        self.line_dir_render.unwrap_or(self.line_dir)
75    }
76
77    pub fn resolved_char_dir_render(&self) -> Direction {
78        self.char_dir_render.unwrap_or(self.char_dir)
79    }
80}
81
/// Settings for removing duplicated characters.
#[derive(Debug, Clone)]
pub struct DedupeOptions {
    /// Tolerance handed to the clustering of `doctop` and `x0` positions when
    /// looking for duplicates.
    pub tolerance: f64,
    /// Names of additional character attributes that must match for two
    /// characters to count as duplicates. `"fontname"` and `"size"` are
    /// recognised; other names are ignored by the comparison.
    pub extra_attrs: Vec<String>,
}
87
88impl Default for DedupeOptions {
89    fn default() -> Self {
90        Self {
91            tolerance: 1.0,
92            extra_attrs: vec!["fontname".to_string(), "size".to_string()],
93        }
94    }
95}
96
/// Settings for searching a text map.
#[derive(Debug, Clone)]
pub struct SearchOptions {
    /// Treat the pattern as a regular expression; when `false` the pattern is
    /// escaped and matched literally.
    pub regex: bool,
    /// Match case-sensitively.
    pub case_sensitive: bool,
    /// Index of the capture group whose span supplies the text and position
    /// of each match.
    pub main_group: usize,
    /// Include the text of capture groups 1.. in every match.
    pub return_groups: bool,
    /// Include the matched source characters in every match.
    pub return_chars: bool,
}
105
106impl Default for SearchOptions {
107    fn default() -> Self {
108        Self {
109            regex: true,
110            case_sensitive: true,
111            main_group: 0,
112            return_groups: true,
113            return_chars: true,
114        }
115    }
116}
117
118#[derive(Debug, Clone)]
119pub struct WordMap {
120    pub tuples: Vec<(Word, Vec<Char>)>,
121}
122
123impl WordMap {
124    pub fn to_textmap(&self, options: &TextOptions) -> TextMap {
125        let mut tuples: Vec<(char, Option<Char>)> = Vec::new();
126        if self.tuples.is_empty() {
127            return TextMap {
128                tuples,
129                line_dir_render: options.resolved_line_dir_render(),
130                char_dir_render: options.resolved_char_dir_render(),
131            };
132        }
133
134        let expansions = |text: &str| -> String {
135            if !options.expand_ligatures {
136                return text.to_string();
137            }
138            match text {
139                "ff" => "ff".to_string(),
140                "ffi" => "ffi".to_string(),
141                "ffl" => "ffl".to_string(),
142                "fi" => "fi".to_string(),
143                "fl" => "fl".to_string(),
144                "st" => "st".to_string(),
145                "ſt" => "st".to_string(),
146                _ => text.to_string(),
147            }
148        };
149
150        let mut width_chars = options.layout_width_chars.unwrap_or(0);
151        if width_chars == 0 {
152            if let Some(width) = options.layout_width {
153                width_chars = (width / options.x_density).round() as usize;
154            }
155        }
156
157        let mut height_chars = options.layout_height_chars.unwrap_or(0);
158        if height_chars == 0 {
159            if let Some(height) = options.layout_height {
160                height_chars = (height / options.y_density).round() as usize;
161            }
162        }
163
164        let layout_bbox = options.layout_bbox.unwrap_or_else(|| {
165            let words: Vec<Word> = self.tuples.iter().map(|(word, _)| word.clone()).collect();
166            objects_to_bbox(&words).unwrap_or_default()
167        });
168
169        let blank_line: Vec<(char, Option<Char>)> = if options.layout {
170            vec![(' ', None); width_chars]
171        } else {
172            Vec::new()
173        };
174
175        let words_sorted = {
176            let mut items = self.tuples.clone();
177            items.sort_by(|a, b| {
178                let va = line_cluster_value(&a.0, options.line_dir);
179                let vb = line_cluster_value(&b.0, options.line_dir);
180                va.total_cmp(&vb)
181            });
182            items
183        };
184
185        let line_tuples = cluster_items(
186            &words_sorted,
187            |pair| line_cluster_value(&pair.0, options.line_dir),
188            options.y_tolerance,
189        );
190
191        let line_position_key = position_key_from_bbox(layout_bbox, options.line_dir);
192        let char_position_origin = position_key_from_bbox(layout_bbox, options.char_dir);
193
194        let mut num_newlines = 0isize;
195
196        for (line_index, mut line) in line_tuples.into_iter().enumerate() {
197            if !options.use_text_flow {
198                line.sort_by(|a, b| {
199                    let ka = sort_key(&a.0, options.char_dir);
200                    let kb = sort_key(&b.0, options.char_dir);
201                    ka.0.total_cmp(&kb.0).then_with(|| ka.1.total_cmp(&kb.1))
202                });
203            }
204
205            let y_dist = if options.layout {
206                let line_position = position_value(&line[0].0, options.line_dir);
207                let raw = line_position - (line_position_key + options.y_shift);
208                let adj = if matches!(options.line_dir, Direction::Btt | Direction::Rtl) {
209                    -1.0
210                } else {
211                    1.0
212                };
213                raw * adj / options.y_density
214            } else {
215                0.0
216            };
217
218            let target_newlines = if line_index > 0 { 1 } else { 0 };
219            let prepend = std::cmp::max(target_newlines, (y_dist.round() as isize) - num_newlines);
220
221            for _ in 0..prepend.max(0) as usize {
222                if tuples.is_empty() || tuples.last().map(|(c, _)| *c == '\n').unwrap_or(false) {
223                    tuples.extend(blank_line.clone());
224                }
225                tuples.push(('\n', None));
226            }
227            num_newlines += prepend.max(0);
228
229            let mut line_len: isize = 0;
230            for (word, chars) in line {
231                let x_dist = if options.layout {
232                    let char_position = position_value(&word, options.char_dir);
233                    let raw = char_position - (char_position_origin + options.x_shift);
234                    let adj = if matches!(options.char_dir, Direction::Btt | Direction::Rtl) {
235                        -1.0
236                    } else {
237                        1.0
238                    };
239                    raw * adj / options.x_density
240                } else {
241                    0.0
242                };
243
244                let prepend_spaces = std::cmp::max(std::cmp::min(1, line_len), (x_dist.round() as isize) - line_len);
245                for _ in 0..prepend_spaces.max(0) as usize {
246                    tuples.push((' ', None));
247                }
248                line_len += prepend_spaces.max(0);
249
250                for ch in chars {
251                    let expanded = expansions(&ch.text);
252                    for letter in expanded.chars() {
253                        tuples.push((letter, Some(ch.clone())));
254                        line_len += 1;
255                    }
256                }
257            }
258
259            if options.layout && width_chars > 0 && line_len < width_chars as isize {
260                for _ in 0..(width_chars as isize - line_len) as usize {
261                    tuples.push((' ', None));
262                }
263            }
264        }
265
266        if options.layout && height_chars > 0 {
267            let append = height_chars as isize - (num_newlines + 1);
268            for i in 0..append.max(0) as usize {
269                if i > 0 {
270                    tuples.extend(blank_line.clone());
271                }
272                tuples.push(('\n', None));
273            }
274            if tuples.last().map(|(c, _)| *c == '\n').unwrap_or(false) {
275                tuples.pop();
276            }
277        }
278
279        TextMap {
280            tuples,
281            line_dir_render: options.resolved_line_dir_render(),
282            char_dir_render: options.resolved_char_dir_render(),
283        }
284    }
285}
286
287#[derive(Debug, Clone)]
288pub struct TextMap {
289    pub tuples: Vec<(char, Option<Char>)>,
290    pub line_dir_render: Direction,
291    pub char_dir_render: Direction,
292}
293
294impl TextMap {
295    pub fn as_string(&self) -> String {
296        let base: String = self.tuples.iter().map(|(c, _)| *c).collect();
297        if self.char_dir_render == Direction::Ltr && self.line_dir_render == Direction::Ttb {
298            return base;
299        }
300
301        let mut lines: Vec<String> = base.lines().map(|line| line.to_string()).collect();
302
303        if matches!(self.line_dir_render, Direction::Btt | Direction::Rtl) {
304            lines.reverse();
305        }
306
307        if self.char_dir_render == Direction::Rtl {
308            lines = lines.into_iter().map(|line| line.chars().rev().collect()).collect();
309        }
310
311        if matches!(self.line_dir_render, Direction::Rtl | Direction::Ltr) {
312            let max_line_len = lines.iter().map(|line| line.chars().count()).max().unwrap_or(0);
313            let padded: Vec<Vec<char>> = lines
314                .iter()
315                .map(|line| {
316                    let mut chars: Vec<char> = line.chars().collect();
317                    while chars.len() < max_line_len {
318                        if self.char_dir_render == Direction::Btt {
319                            chars.insert(0, ' ');
320                        } else {
321                            chars.push(' ');
322                        }
323                    }
324                    chars
325                })
326                .collect();
327
328            let mut out = String::new();
329            for idx in 0..max_line_len {
330                for row in &padded {
331                    out.push(row[idx]);
332                }
333                if idx + 1 != max_line_len {
334                    out.push('\n');
335                }
336            }
337            return out;
338        }
339
340        lines.join("\n")
341    }
342
343    pub fn extract_text_lines(&self, strip: bool, return_chars: bool) -> Vec<TextLine> {
344        // Use the base string (1:1 char-to-tuple mapping) for offset tracking.
345        let text: String = self.tuples.iter().map(|(c, _)| *c).collect();
346        let mut out = Vec::new();
347        let mut offset = 0usize;
348        for raw_line in text.split('\n') {
349            let line = if strip { raw_line.trim() } else { raw_line };
350            let char_count = raw_line.chars().count();
351            if line.is_empty() {
352                offset += char_count + 1;
353                continue;
354            }
355
356            let chars: Vec<Char> = self
357                .slice_chars(offset, offset + char_count)
358                .into_iter()
359                .collect();
360
361            if let Some(bbox) = objects_to_bbox(&chars) {
362                out.push(TextLine {
363                    text: line.to_string(),
364                    x0: bbox.x0,
365                    top: bbox.top,
366                    x1: bbox.x1,
367                    bottom: bbox.bottom,
368                    chars: if return_chars { Some(chars) } else { None },
369                });
370            }
371            offset += char_count + 1;
372        }
373        out
374    }
375
376    pub fn search(&self, pattern: &str, options: &SearchOptions) -> crate::Result<Vec<SearchMatch>> {
377        let regex = if options.regex {
378            regex::RegexBuilder::new(pattern)
379                .case_insensitive(!options.case_sensitive)
380                .build()?
381        } else {
382            regex::RegexBuilder::new(&regex::escape(pattern))
383                .case_insensitive(!options.case_sensitive)
384                .build()?
385        };
386
387        // Use the base string (1:1 char-to-tuple mapping) so that byte/char
388        // indices produced by the regex correspond directly to tuple positions.
389        // as_string() may reorder lines and add padding, breaking the mapping.
390        let haystack: String = self.tuples.iter().map(|(c, _)| *c).collect();
391        let mut out = Vec::new();
392
393        for captures in regex.captures_iter(&haystack) {
394            let Some(main) = captures.get(options.main_group) else {
395                continue;
396            };
397            if main.as_str().trim().is_empty() {
398                continue;
399            }
400
401            let start = byte_to_char_index(&haystack, main.start());
402            let end = byte_to_char_index(&haystack, main.end());
403
404            let chars = self.slice_chars(start, end);
405            if chars.is_empty() {
406                continue;
407            }
408            let Some(bbox) = objects_to_bbox(&chars) else {
409                continue;
410            };
411
412            let groups = if options.return_groups {
413                let mut gs = Vec::new();
414                for idx in 1..captures.len() {
415                    gs.push(captures.get(idx).map(|m| m.as_str().to_string()));
416                }
417                Some(gs)
418            } else {
419                None
420            };
421
422            out.push(SearchMatch {
423                text: main.as_str().to_string(),
424                x0: bbox.x0,
425                top: bbox.top,
426                x1: bbox.x1,
427                bottom: bbox.bottom,
428                groups,
429                chars: if options.return_chars { Some(chars) } else { None },
430            });
431        }
432
433        Ok(out)
434    }
435
436    fn slice_chars(&self, start: usize, end: usize) -> Vec<Char> {
437        let start = start.min(self.tuples.len());
438        let end = end.min(self.tuples.len());
439        if start >= end {
440            return Vec::new();
441        }
442        self.tuples[start..end]
443            .iter()
444            .filter_map(|(_, ch)| ch.clone())
445            .collect()
446    }
447}
448
449#[derive(Debug, Clone)]
450pub struct WordExtractor {
451    pub options: TextOptions,
452}
453
454impl WordExtractor {
455    pub fn new(options: TextOptions) -> Self {
456        Self { options }
457    }
458
459    pub fn extract_wordmap(&self, chars: &[Char], return_chars: bool) -> WordMap {
460        let mut tuples = Vec::new();
461        for (word, group) in self.iter_extract_tuples(chars, return_chars) {
462            tuples.push((word, group));
463        }
464        WordMap { tuples }
465    }
466
467    pub fn extract_words(&self, chars: &[Char], return_chars: bool) -> Vec<Word> {
468        self.iter_extract_tuples(chars, return_chars)
469            .into_iter()
470            .map(|(word, _)| word)
471            .collect()
472    }
473
474    fn iter_extract_tuples(&self, chars: &[Char], return_chars: bool) -> Vec<(Word, Vec<Char>)> {
475        let mut sorted = chars.to_vec();
476        if !self.options.use_text_flow {
477            sorted.sort_by(|a, b| {
478                a.upright
479                    .cmp(&b.upright)
480                    .then_with(|| a.doctop.total_cmp(&b.doctop))
481                    .then_with(|| a.x0.total_cmp(&b.x0))
482            });
483        }
484
485        let mut groups: Vec<Vec<Char>> = Vec::new();
486        for ch in sorted {
487            if let Some(last_group) = groups.last_mut() {
488                let same_upright = last_group.last().map(|item| item.upright == ch.upright).unwrap_or(false);
489                if same_upright {
490                    last_group.push(ch);
491                } else {
492                    groups.push(vec![ch]);
493                }
494            } else {
495                groups.push(vec![ch]);
496            }
497        }
498
499        let mut out = Vec::new();
500        for group in groups {
501            for (chars_in_line, direction) in self.iter_chars_to_lines(&group) {
502                for word_chars in self.iter_chars_to_words(&chars_in_line, direction) {
503                    let word = self.merge_chars(&word_chars, direction, return_chars);
504                    out.push((word, word_chars));
505                }
506            }
507        }
508        out
509    }
510
511    fn merge_chars(&self, ordered_chars: &[Char], direction: Direction, return_chars: bool) -> Word {
512        let bbox = objects_to_bbox(ordered_chars).unwrap_or_default();
513        let doctop_adj = ordered_chars.first().map(|item| item.doctop - item.top).unwrap_or(0.0);
514        Word {
515            text: ordered_chars
516                .iter()
517                .map(|ch| {
518                    if self.options.expand_ligatures {
519                        match ch.text.as_str() {
520                            "ff" => "ff",
521                            "ffi" => "ffi",
522                            "ffl" => "ffl",
523                            "fi" => "fi",
524                            "fl" => "fl",
525                            "st" => "st",
526                            "ſt" => "st",
527                            _ => ch.text.as_str(),
528                        }
529                    } else {
530                        ch.text.as_str()
531                    }
532                })
533                .collect(),
534            x0: bbox.x0,
535            top: bbox.top,
536            x1: bbox.x1,
537            bottom: bbox.bottom,
538            doctop: bbox.top + doctop_adj,
539            width: bbox.width(),
540            height: bbox.height(),
541            upright: ordered_chars.first().map(|item| item.upright).unwrap_or(true),
542            direction,
543            chars: if return_chars { Some(ordered_chars.to_vec()) } else { None },
544        }
545    }
546
547    fn char_dir(&self, upright: bool) -> Direction {
548        if upright {
549            self.options.char_dir
550        } else {
551            self.options.resolved_char_dir_rotated()
552        }
553    }
554
555    fn line_dir(&self, upright: bool) -> Direction {
556        if upright {
557            self.options.line_dir
558        } else {
559            self.options.resolved_line_dir_rotated()
560        }
561    }
562
563    fn iter_chars_to_lines(&self, chars: &[Char]) -> Vec<(Vec<Char>, Direction)> {
564        if chars.is_empty() {
565            return Vec::new();
566        }
567        let upright = chars[0].upright;
568        let line_dir = self.line_dir(upright);
569        let char_dir = self.char_dir(upright);
570
571        let tol = if matches!(line_dir, Direction::Ttb | Direction::Btt) {
572            self.options.y_tolerance
573        } else {
574            self.options.x_tolerance
575        };
576
577        let mut line_groups = cluster_items(chars, |ch| line_cluster_value(ch, line_dir), tol);
578
579        for group in &mut line_groups {
580            group.sort_by(|a, b| {
581                let ka = sort_key(a, char_dir);
582                let kb = sort_key(b, char_dir);
583                ka.0.total_cmp(&kb.0).then_with(|| ka.1.total_cmp(&kb.1))
584            });
585        }
586
587        line_groups.into_iter().map(|group| (group, char_dir)).collect()
588    }
589
590    fn iter_chars_to_words(&self, ordered_chars: &[Char], direction: Direction) -> Vec<Vec<Char>> {
591        let mut words: Vec<Vec<Char>> = Vec::new();
592        let punctuation = self.options.split_at_punctuation.clone().unwrap_or_default();
593        let mut saw_space = false;
594
595        for ch in ordered_chars.iter().cloned() {
596            if !self.options.keep_blank_chars && ch.text.chars().all(|c| c.is_whitespace()) {
597                saw_space = true;
598                continue;
599            }
600
601            if !punctuation.is_empty() && ch.text.chars().all(|c| punctuation.contains(c)) {
602                words.push(vec![ch]);
603                continue;
604            }
605
606            let should_start_new = saw_space
607                || words
608                    .last()
609                    .and_then(|word| word.last())
610                    .map(|prev| {
611                        let x_tol = self
612                            .options
613                            .x_tolerance_ratio
614                            .map(|ratio| ratio * prev.size)
615                            .unwrap_or(self.options.x_tolerance);
616
617                        let y_tol = self
618                            .options
619                            .y_tolerance_ratio
620                            .map(|ratio| ratio * prev.size)
621                            .unwrap_or(self.options.y_tolerance);
622
623                        char_begins_new_word(prev, &ch, direction, x_tol, y_tol)
624                    })
625                    .unwrap_or(false);
626            saw_space = false;
627
628            if should_start_new {
629                words.push(vec![ch]);
630            } else if let Some(last) = words.last_mut() {
631                last.push(ch);
632            } else {
633                words.push(vec![ch]);
634            }
635        }
636
637        words.into_iter().filter(|word| !word.is_empty()).collect()
638    }
639}
640
641pub fn chars_to_textmap(chars: &[Char], options: &TextOptions) -> TextMap {
642    let mut opts = options.clone();
643    if opts.layout_bbox.is_none() {
644        opts.layout_bbox = objects_to_bbox(chars);
645    }
646    if opts.layout_width.is_none() {
647        if let Some(bbox) = opts.layout_bbox {
648            opts.layout_width = Some(bbox.width());
649        }
650    }
651    if opts.layout_height.is_none() {
652        if let Some(bbox) = opts.layout_bbox {
653            opts.layout_height = Some(bbox.height());
654        }
655    }
656
657    let extractor = WordExtractor::new(opts.clone());
658    extractor.extract_wordmap(chars, true).to_textmap(&opts)
659}
660
661pub fn extract_text(chars: &[Char], options: &TextOptions) -> String {
662    chars_to_textmap(chars, options).as_string()
663}
664
665pub fn extract_words(chars: &[Char], options: &TextOptions, return_chars: bool) -> Vec<Word> {
666    WordExtractor::new(options.clone()).extract_words(chars, return_chars)
667}
668
669pub fn extract_text_lines(chars: &[Char], options: &TextOptions, strip: bool, return_chars: bool) -> Vec<TextLine> {
670    chars_to_textmap(chars, options).extract_text_lines(strip, return_chars)
671}
672
673pub fn extract_text_simple(chars: &[Char], x_tolerance: f64, y_tolerance: f64) -> String {
674    let clustered = cluster_items(chars, |ch| ch.doctop, y_tolerance);
675    clustered
676        .into_iter()
677        .map(|mut line| {
678            line.sort_by(|a, b| a.x0.total_cmp(&b.x0));
679            collate_line(&line, x_tolerance)
680        })
681        .collect::<Vec<String>>()
682        .join("\n")
683}
684
685pub fn collate_line(line_chars: &[Char], tolerance: f64) -> String {
686    let mut line = String::new();
687    let mut last_x1: Option<f64> = None;
688    for ch in line_chars {
689        if let Some(prev_x1) = last_x1 {
690            if ch.x0 > prev_x1 + tolerance {
691                line.push(' ');
692            }
693        }
694        line.push_str(&ch.text);
695        last_x1 = Some(ch.x1);
696    }
697    line
698}
699
700pub fn dedupe_chars(chars: &[Char], options: &DedupeOptions) -> Vec<Char> {
701    if chars.is_empty() {
702        return Vec::new();
703    }
704
705    let mut indexed: Vec<(usize, Char)> = chars.iter().cloned().enumerate().collect();
706    indexed.sort_by(|a, b| dedupe_cmp(&a.1, &b.1, &options.extra_attrs));
707
708    let mut kept: Vec<(usize, Char)> = Vec::new();
709    let mut start = 0usize;
710    while start < indexed.len() {
711        let mut end = start + 1;
712        while end < indexed.len()
713            && dedupe_same_key(&indexed[start].1, &indexed[end].1, &options.extra_attrs)
714        {
715            end += 1;
716        }
717
718        let group: Vec<(usize, Char)> = indexed[start..end].to_vec();
719        let y_clusters = cluster_items(&group, |(_, ch)| ch.doctop, options.tolerance);
720        for y_cluster in y_clusters {
721            let x_clusters = cluster_items(&y_cluster, |(_, ch)| ch.x0, options.tolerance);
722            for x_cluster in x_clusters {
723                let mut cluster = x_cluster;
724                cluster.sort_by(|a, b| {
725                    a.1.doctop
726                        .total_cmp(&b.1.doctop)
727                        .then_with(|| a.1.x0.total_cmp(&b.1.x0))
728                });
729                kept.push(cluster[0].clone());
730            }
731        }
732
733        start = end;
734    }
735
736    kept.sort_by(|a, b| a.0.cmp(&b.0));
737    kept.into_iter().map(|(_, ch)| ch).collect()
738}
739
740fn dedupe_cmp(a: &Char, b: &Char, extra_attrs: &[String]) -> std::cmp::Ordering {
741    a.upright
742        .cmp(&b.upright)
743        .then_with(|| a.text.cmp(&b.text))
744        .then_with(|| extra_attr_cmp(a, b, extra_attrs))
745        .then_with(|| a.doctop.total_cmp(&b.doctop))
746        .then_with(|| a.x0.total_cmp(&b.x0))
747}
748
749fn extra_attr_cmp(a: &Char, b: &Char, extra_attrs: &[String]) -> std::cmp::Ordering {
750    for attr in extra_attrs {
751        let ord = match attr.as_str() {
752            "fontname" => a.fontname.cmp(&b.fontname),
753            "size" => a.size.total_cmp(&b.size),
754            _ => std::cmp::Ordering::Equal,
755        };
756        if ord != std::cmp::Ordering::Equal {
757            return ord;
758        }
759    }
760    std::cmp::Ordering::Equal
761}
762
763fn dedupe_same_key(a: &Char, b: &Char, extra_attrs: &[String]) -> bool {
764    if a.upright != b.upright || a.text != b.text {
765        return false;
766    }
767    extra_attr_cmp(a, b, extra_attrs) == std::cmp::Ordering::Equal
768}
769
/// Converts a byte offset into `s` to the number of characters before it.
///
/// # Panics
///
/// Panics if `byte_idx` is past the end of `s` or not on a character boundary.
fn byte_to_char_index(s: &str, byte_idx: usize) -> usize {
    let prefix = &s[..byte_idx];
    prefix.chars().count()
}
773
774fn position_key_from_bbox(bbox: BBox, direction: Direction) -> f64 {
775    match direction {
776        Direction::Ttb => bbox.top,
777        Direction::Btt => bbox.bottom,
778        Direction::Ltr => bbox.x0,
779        Direction::Rtl => bbox.x1,
780    }
781}
782
783fn position_value<T: TextObject>(obj: &T, direction: Direction) -> f64 {
784    match direction {
785        Direction::Ttb => obj.top(),
786        Direction::Btt => obj.bottom(),
787        Direction::Ltr => obj.x0(),
788        Direction::Rtl => obj.x1(),
789    }
790}
791
792fn line_cluster_value<T: TextObject>(obj: &T, direction: Direction) -> f64 {
793    match direction {
794        Direction::Ttb => obj.top(),
795        Direction::Btt => -obj.bottom(),
796        Direction::Ltr => obj.x0(),
797        Direction::Rtl => -obj.x1(),
798    }
799}
800
801fn sort_key<T: TextObject>(obj: &T, direction: Direction) -> (f64, f64) {
802    match direction {
803        Direction::Ttb => (obj.top(), obj.bottom()),
804        Direction::Btt => (-(obj.top() + obj.height()), -obj.top()),
805        Direction::Ltr => (obj.x0(), obj.x0()),
806        Direction::Rtl => (-obj.x1(), -obj.x0()),
807    }
808}
809
810fn char_begins_new_word(prev: &Char, curr: &Char, direction: Direction, x_tolerance: f64, y_tolerance: f64) -> bool {
811    let (ax, bx, cx, ay, cy, x, y) = match direction {
812        Direction::Ltr => (
813            prev.x0,
814            prev.x1,
815            curr.x0,
816            prev.top,
817            curr.top,
818            x_tolerance,
819            y_tolerance,
820        ),
821        Direction::Rtl => (
822            -prev.x1,
823            -prev.x0,
824            -curr.x1,
825            prev.top,
826            curr.top,
827            x_tolerance,
828            y_tolerance,
829        ),
830        Direction::Ttb => (
831            prev.top,
832            prev.bottom,
833            curr.top,
834            prev.x0,
835            curr.x0,
836            y_tolerance,
837            x_tolerance,
838        ),
839        Direction::Btt => (
840            -prev.bottom,
841            -prev.top,
842            -curr.bottom,
843            prev.x0,
844            curr.x0,
845            y_tolerance,
846            x_tolerance,
847        ),
848    };
849
850    (cx < ax) || (cx > bx + x) || (cy - ay).abs() > y
851}
852
/// Read-only access to the box geometry shared by the positioned text
/// objects in this module, so the ordering helpers can be generic over them.
trait TextObject {
    /// Left edge.
    fn x0(&self) -> f64;
    /// Right edge.
    fn x1(&self) -> f64;
    /// Top edge.
    fn top(&self) -> f64;
    /// Bottom edge.
    fn bottom(&self) -> f64;
    /// Height of the object.
    fn height(&self) -> f64;
}
860
861impl TextObject for Char {
862    fn x0(&self) -> f64 { self.x0 }
863    fn x1(&self) -> f64 { self.x1 }
864    fn top(&self) -> f64 { self.top }
865    fn bottom(&self) -> f64 { self.bottom }
866    fn height(&self) -> f64 { self.height }
867}
868
869impl TextObject for Word {
870    fn x0(&self) -> f64 { self.x0 }
871    fn x1(&self) -> f64 { self.x1 }
872    fn top(&self) -> f64 { self.top }
873    fn bottom(&self) -> f64 { self.bottom }
874    fn height(&self) -> f64 { self.height }
875}