pdfplumber_core/
words.rs

1use crate::geometry::BBox;
2use crate::text::{Char, TextDirection, is_cjk_text};
3
4/// Options for word extraction, matching pdfplumber defaults.
5#[derive(Debug, Clone)]
6pub struct WordOptions {
7    /// Maximum horizontal distance between characters to group into a word.
8    pub x_tolerance: f64,
9    /// Maximum vertical distance between characters to group into a word.
10    pub y_tolerance: f64,
11    /// If true, include blank/space characters in words instead of splitting on them.
12    pub keep_blank_chars: bool,
13    /// If true, use the text flow order from the PDF content stream instead of spatial ordering.
14    pub use_text_flow: bool,
15    /// Text direction for grouping characters.
16    pub text_direction: TextDirection,
17}
18
19impl Default for WordOptions {
20    fn default() -> Self {
21        Self {
22            x_tolerance: 3.0,
23            y_tolerance: 3.0,
24            keep_blank_chars: false,
25            use_text_flow: false,
26            text_direction: TextDirection::default(),
27        }
28    }
29}
30
31/// A word extracted from a PDF page.
32#[derive(Debug, Clone, PartialEq)]
33#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
34pub struct Word {
35    /// The text content of this word.
36    pub text: String,
37    /// Bounding box encompassing all constituent characters.
38    pub bbox: BBox,
39    /// Distance from the top of the first page (minimum doctop of constituent chars).
40    pub doctop: f64,
41    /// Text direction for this word.
42    pub direction: TextDirection,
43    /// The characters that make up this word.
44    pub chars: Vec<Char>,
45}
46
47/// Extracts words from a sequence of characters based on spatial proximity.
48pub struct WordExtractor;
49
50impl WordExtractor {
51    /// Extract words from the given characters using the specified options.
52    ///
53    /// Characters are grouped into words based on spatial proximity:
54    /// - Characters within `x_tolerance` horizontally and `y_tolerance` vertically
55    ///   are grouped together.
56    /// - For CJK characters, character width (or height for vertical text) is used
57    ///   as the tolerance instead of the fixed `x_tolerance`/`y_tolerance`.
58    /// - By default, whitespace characters split words. Set `keep_blank_chars`
59    ///   to include them.
60    /// - By default, characters are sorted spatially. Set `use_text_flow` to
61    ///   preserve PDF content stream order.
62    /// - `text_direction` controls sorting and gap logic for vertical text.
63    pub fn extract(chars: &[Char], options: &WordOptions) -> Vec<Word> {
64        if chars.is_empty() {
65            return Vec::new();
66        }
67
68        let mut sorted_chars: Vec<&Char> = chars.iter().collect();
69        if !options.use_text_flow {
70            match options.text_direction {
71                TextDirection::Ttb => {
72                    // Vertical: columns right-to-left, top-to-bottom within column
73                    sorted_chars.sort_by(|a, b| {
74                        b.bbox
75                            .x0
76                            .partial_cmp(&a.bbox.x0)
77                            .unwrap()
78                            .then(a.bbox.top.partial_cmp(&b.bbox.top).unwrap())
79                    });
80                }
81                TextDirection::Btt => {
82                    // Vertical bottom-to-top: columns right-to-left, bottom-to-top
83                    sorted_chars.sort_by(|a, b| {
84                        b.bbox
85                            .x0
86                            .partial_cmp(&a.bbox.x0)
87                            .unwrap()
88                            .then(b.bbox.bottom.partial_cmp(&a.bbox.bottom).unwrap())
89                    });
90                }
91                _ => {
92                    // Horizontal (Ltr/Rtl): top-to-bottom, left-to-right
93                    sorted_chars.sort_by(|a, b| {
94                        a.bbox
95                            .top
96                            .partial_cmp(&b.bbox.top)
97                            .unwrap()
98                            .then(a.bbox.x0.partial_cmp(&b.bbox.x0).unwrap())
99                    });
100                }
101            }
102        }
103
104        let is_vertical = matches!(
105            options.text_direction,
106            TextDirection::Ttb | TextDirection::Btt
107        );
108
109        let mut words = Vec::new();
110        let mut current_chars: Vec<Char> = Vec::new();
111
112        for &ch in &sorted_chars {
113            let is_blank = ch.text.chars().all(|c| c.is_whitespace());
114
115            // If this is a blank and we're not keeping blanks, finish current word
116            if is_blank && !options.keep_blank_chars {
117                if !current_chars.is_empty() {
118                    words.push(Self::make_word(&current_chars));
119                    current_chars.clear();
120                }
121                continue;
122            }
123
124            if current_chars.is_empty() {
125                current_chars.push(ch.clone());
126                continue;
127            }
128
129            let last = current_chars.last().unwrap();
130
131            let should_split = if is_vertical {
132                Self::should_split_vertical(last, ch, options)
133            } else {
134                Self::should_split_horizontal(last, ch, options)
135            };
136
137            if should_split {
138                words.push(Self::make_word(&current_chars));
139                current_chars.clear();
140            }
141
142            current_chars.push(ch.clone());
143        }
144
145        if !current_chars.is_empty() {
146            words.push(Self::make_word(&current_chars));
147        }
148
149        words
150    }
151
152    /// Determine the effective x-tolerance between two characters.
153    ///
154    /// For CJK characters, uses the previous character's width as tolerance,
155    /// which accounts for the wider spacing of full-width characters.
156    fn effective_x_tolerance(last: &Char, current: &Char, base: f64) -> f64 {
157        if is_cjk_text(&last.text) || is_cjk_text(&current.text) {
158            last.bbox.width().max(base)
159        } else {
160            base
161        }
162    }
163
164    /// Determine the effective y-tolerance between two characters (for vertical text).
165    fn effective_y_tolerance(last: &Char, current: &Char, base: f64) -> f64 {
166        if is_cjk_text(&last.text) || is_cjk_text(&current.text) {
167            last.bbox.height().max(base)
168        } else {
169            base
170        }
171    }
172
173    /// Check if two horizontally-adjacent chars should be split into separate words.
174    fn should_split_horizontal(last: &Char, current: &Char, options: &WordOptions) -> bool {
175        let x_gap = current.bbox.x0 - last.bbox.x1;
176        let y_diff = (current.bbox.top - last.bbox.top).abs();
177        let x_tol = Self::effective_x_tolerance(last, current, options.x_tolerance);
178        x_gap > x_tol || y_diff > options.y_tolerance
179    }
180
181    /// Check if two vertically-adjacent chars should be split into separate words.
182    fn should_split_vertical(last: &Char, current: &Char, options: &WordOptions) -> bool {
183        let y_gap = current.bbox.top - last.bbox.bottom;
184        let x_diff = (current.bbox.x0 - last.bbox.x0).abs();
185        let y_tol = Self::effective_y_tolerance(last, current, options.y_tolerance);
186        y_gap > y_tol || x_diff > options.x_tolerance
187    }
188
189    fn make_word(chars: &[Char]) -> Word {
190        let text: String = chars.iter().map(|c| c.text.as_str()).collect();
191        let bbox = chars
192            .iter()
193            .map(|c| c.bbox)
194            .reduce(|a, b| a.union(&b))
195            .expect("make_word called with non-empty chars");
196        let doctop = chars.iter().map(|c| c.doctop).fold(f64::INFINITY, f64::min);
197        let direction = chars[0].direction;
198        Word {
199            text,
200            bbox,
201            doctop,
202            direction,
203            chars: chars.to_vec(),
204        }
205    }
206}
207
#[cfg(test)]
mod tests {
    use super::*;

    // --- Fixture helpers ---

    /// Builds a Latin test character with the given bbox and default metadata.
    fn make_char(text: &str, x0: f64, top: f64, x1: f64, bottom: f64) -> Char {
        Char {
            text: text.to_string(),
            bbox: BBox::new(x0, top, x1, bottom),
            fontname: "TestFont".to_string(),
            size: 12.0,
            doctop: top,
            upright: true,
            direction: TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: None,
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            char_code: 0,
            mcid: None,
            tag: None,
        }
    }

    /// Helper to create a CJK character (full-width, typically 12pt wide).
    /// Identical to `make_char` apart from the font name and the
    /// width/height-based bbox.
    fn make_cjk_char(text: &str, x0: f64, top: f64, width: f64, height: f64) -> Char {
        Char {
            fontname: "SimSun".to_string(),
            ..make_char(text, x0, top, x0 + width, top + height)
        }
    }

    /// Latin character on the standard test line (top = 100, bottom = 112).
    fn on_line(text: &str, x0: f64, x1: f64) -> Char {
        make_char(text, x0, 100.0, x1, 112.0)
    }

    /// 12x12 CJK character whose top-left corner is at (`x0`, `top`).
    fn cjk(text: &str, x0: f64, top: f64) -> Char {
        make_cjk_char(text, x0, top, 12.0, 12.0)
    }

    /// Runs the extractor with default options.
    fn extract_default(chars: &[Char]) -> Vec<Word> {
        WordExtractor::extract(chars, &WordOptions::default())
    }

    /// Default options switched to top-to-bottom vertical text.
    fn ttb_options() -> WordOptions {
        WordOptions {
            text_direction: TextDirection::Ttb,
            ..WordOptions::default()
        }
    }

    /// The text of every word, in output order.
    fn texts(words: &[Word]) -> Vec<&str> {
        words.iter().map(|w| w.text.as_str()).collect()
    }

    // --- Basic word properties ---

    #[test]
    fn test_word_has_doctop_and_direction() {
        let words = extract_default(&[on_line("A", 10.0, 20.0), on_line("B", 20.0, 30.0)]);
        assert_eq!(words.len(), 1);
        let word = &words[0];
        assert_eq!(word.doctop, 100.0);
        assert_eq!(word.direction, TextDirection::Ltr);
    }

    #[test]
    fn test_word_doctop_uses_min_char_doctop() {
        // Two chars whose doctop values differ: the word reports the smaller one.
        let first = Char {
            doctop: 900.0,
            ..on_line("X", 10.0, 20.0)
        };
        let second = Char {
            doctop: 892.0,
            ..on_line("Y", 20.0, 30.0)
        };
        let words = extract_default(&[first, second]);
        assert_eq!(words[0].doctop, 892.0);
    }

    #[test]
    fn test_default_options() {
        let WordOptions {
            x_tolerance,
            y_tolerance,
            keep_blank_chars,
            use_text_flow,
            ..
        } = WordOptions::default();
        assert_eq!(x_tolerance, 3.0);
        assert_eq!(y_tolerance, 3.0);
        assert!(!keep_blank_chars);
        assert!(!use_text_flow);
    }

    #[test]
    fn test_empty_chars() {
        assert!(extract_default(&[]).is_empty());
    }

    #[test]
    fn test_single_char() {
        let words = extract_default(&[on_line("A", 10.0, 20.0)]);
        assert_eq!(texts(&words), ["A"]);
        assert_eq!(words[0].chars.len(), 1);
    }

    #[test]
    fn test_simple_horizontal_text() {
        // Five touching glyphs on one line spell "Hello".
        let words = extract_default(&[
            on_line("H", 10.0, 20.0),
            on_line("e", 20.0, 30.0),
            on_line("l", 30.0, 35.0),
            on_line("l", 35.0, 40.0),
            on_line("o", 40.0, 50.0),
        ]);
        assert_eq!(texts(&words), ["Hello"]);
        let word = &words[0];
        assert_eq!(word.bbox, BBox::new(10.0, 100.0, 50.0, 112.0));
        assert_eq!(word.chars.len(), 5);
    }

    #[test]
    fn test_multi_line_text() {
        // First line (top = 100) reads "Hi", second line (top = 120) reads "Lo".
        let words = extract_default(&[
            make_char("H", 10.0, 100.0, 20.0, 112.0),
            make_char("i", 20.0, 100.0, 30.0, 112.0),
            make_char("L", 10.0, 120.0, 20.0, 132.0),
            make_char("o", 20.0, 120.0, 30.0, 132.0),
        ]);
        assert_eq!(texts(&words), ["Hi", "Lo"]);
    }

    // --- Gaps and tolerances ---

    #[test]
    fn test_text_with_large_gap() {
        // A 20pt hole between "B" (x1 = 30) and "C" (x0 = 50) exceeds the tolerance of 3.
        let words = extract_default(&[
            on_line("A", 10.0, 20.0),
            on_line("B", 20.0, 30.0),
            on_line("C", 50.0, 60.0),
            on_line("D", 60.0, 70.0),
        ]);
        assert_eq!(texts(&words), ["AB", "CD"]);
    }

    #[test]
    fn test_text_with_small_gap_within_tolerance() {
        // A 2pt hole (20 -> 22) stays inside the default tolerance of 3.
        let words = extract_default(&[on_line("A", 10.0, 20.0), on_line("B", 22.0, 32.0)]);
        assert_eq!(texts(&words), ["AB"]);
    }

    #[test]
    fn test_split_on_space_char() {
        // An explicit space glyph separates "A" from "B".
        let words = extract_default(&[
            on_line("A", 10.0, 20.0),
            on_line(" ", 20.0, 25.0),
            on_line("B", 25.0, 35.0),
        ]);
        assert_eq!(texts(&words), ["A", "B"]);
    }

    #[test]
    fn test_keep_blank_chars_true() {
        // With keep_blank_chars the space glyph becomes part of the word.
        let chars = [
            on_line("A", 10.0, 20.0),
            on_line(" ", 20.0, 25.0),
            on_line("B", 25.0, 35.0),
        ];
        let opts = WordOptions {
            keep_blank_chars: true,
            ..WordOptions::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        assert_eq!(texts(&words), ["A B"]);
    }

    #[test]
    fn test_configurable_x_tolerance() {
        // "A" ends at 20 and "B" starts at 30: a 10pt hole.
        let chars = [on_line("A", 10.0, 20.0), on_line("B", 30.0, 40.0)];

        // The default tolerance of 3 is too small to bridge it.
        assert_eq!(extract_default(&chars).len(), 2);

        // A tolerance of 15 bridges it.
        let wide = WordOptions {
            x_tolerance: 15.0,
            ..WordOptions::default()
        };
        let words = WordExtractor::extract(&chars, &wide);
        assert_eq!(texts(&words), ["AB"]);
    }

    #[test]
    fn test_configurable_y_tolerance() {
        // "B" sits 5pt lower than "A".
        let chars = [
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 20.0, 105.0, 30.0, 117.0),
        ];

        // The default y_tolerance of 3 treats them as different lines.
        assert_eq!(extract_default(&chars).len(), 2);

        // A y_tolerance of 10 keeps them together.
        let tall = WordOptions {
            y_tolerance: 10.0,
            ..WordOptions::default()
        };
        let words = WordExtractor::extract(&chars, &tall);
        assert_eq!(texts(&words), ["AB"]);
    }

    #[test]
    fn test_word_bbox_is_union_of_char_bboxes() {
        // Glyphs of differing heights: the word bbox must enclose all of them.
        let words = extract_default(&[
            make_char("A", 10.0, 98.0, 20.0, 112.0),
            make_char("b", 20.0, 100.0, 28.0, 110.0),
            make_char("C", 28.0, 97.0, 38.0, 113.0),
        ]);
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].bbox, BBox::new(10.0, 97.0, 38.0, 113.0));
    }

    // --- Ordering ---

    #[test]
    fn test_unsorted_chars_are_sorted_spatially() {
        // Input arrives right-to-left; output must read left-to-right.
        let words = extract_default(&[on_line("B", 20.0, 30.0), on_line("A", 10.0, 20.0)]);
        assert_eq!(texts(&words), ["AB"]);
    }

    #[test]
    fn test_use_text_flow_preserves_order() {
        // Content stream order is the reverse of spatial order.
        let chars = [on_line("B", 20.0, 30.0), on_line("A", 10.0, 20.0)];
        let opts = WordOptions {
            use_text_flow: true,
            ..WordOptions::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        // Stream order is kept ("B" then "A"); the gap A.x0(10) - B.x1(30) = -20
        // is below the tolerance, so both land in one word.
        assert_eq!(texts(&words), ["BA"]);
    }

    // --- Whitespace handling ---

    #[test]
    fn test_multiple_spaces_between_words() {
        // Two consecutive space glyphs still yield exactly two words.
        let words = extract_default(&[
            on_line("A", 10.0, 20.0),
            on_line(" ", 20.0, 25.0),
            on_line(" ", 25.0, 30.0),
            on_line("B", 30.0, 40.0),
        ]);
        assert_eq!(texts(&words), ["A", "B"]);
    }

    #[test]
    fn test_leading_spaces_ignored() {
        let words = extract_default(&[on_line(" ", 5.0, 10.0), on_line("A", 10.0, 20.0)]);
        assert_eq!(texts(&words), ["A"]);
    }

    #[test]
    fn test_trailing_spaces_ignored() {
        let words = extract_default(&[on_line("A", 10.0, 20.0), on_line(" ", 20.0, 25.0)]);
        assert_eq!(texts(&words), ["A"]);
    }

    #[test]
    fn test_overlapping_chars_grouped() {
        // "i" starts 2pt before "f" ends (negative gap); they still form one word.
        let words = extract_default(&[on_line("f", 10.0, 20.0), on_line("i", 18.0, 25.0)]);
        assert_eq!(texts(&words), ["fi"]);
    }

    #[test]
    fn test_three_words_on_one_line() {
        // "The quick fox", with space glyphs between the words.
        let words = extract_default(&[
            on_line("T", 10.0, 20.0),
            on_line("h", 20.0, 28.0),
            on_line("e", 28.0, 36.0),
            on_line(" ", 36.0, 40.0),
            on_line("q", 40.0, 48.0),
            on_line("u", 48.0, 56.0),
            on_line("i", 56.0, 60.0),
            on_line("c", 60.0, 68.0),
            on_line("k", 68.0, 76.0),
            on_line(" ", 76.0, 80.0),
            on_line("f", 80.0, 88.0),
            on_line("o", 88.0, 96.0),
            on_line("x", 96.0, 104.0),
        ]);
        assert_eq!(texts(&words), ["The", "quick", "fox"]);
    }

    #[test]
    fn test_multiline_sorting() {
        // Two lines supplied interleaved; expect ordering by top, then x0.
        let words = extract_default(&[
            make_char("C", 10.0, 120.0, 20.0, 132.0), // second line
            make_char("A", 10.0, 100.0, 20.0, 112.0), // first line
            make_char("D", 20.0, 120.0, 30.0, 132.0), // second line
            make_char("B", 20.0, 100.0, 30.0, 112.0), // first line
        ]);
        assert_eq!(texts(&words), ["AB", "CD"]);
    }

    // --- CJK word grouping tests (US-020) ---

    #[test]
    fn test_chinese_text_grouping() {
        // Three 12pt-wide CJK glyphs separated by 1pt holes form one word.
        let words = extract_default(&[
            cjk("中", 10.0, 100.0),
            cjk("国", 23.0, 100.0), // hole: 23 - 22 = 1
            cjk("人", 36.0, 100.0), // hole: 36 - 35 = 1
        ]);
        assert_eq!(texts(&words), ["中国人"]);
    }

    #[test]
    fn test_chinese_text_with_larger_gap_uses_char_width_tolerance() {
        // An 8pt hole (30 - 22) exceeds x_tolerance = 3 but not the glyph width
        // of 12, which is the tolerance applied to CJK text.
        let words = extract_default(&[cjk("中", 10.0, 100.0), cjk("国", 30.0, 100.0)]);
        assert_eq!(
            words.len(),
            1,
            "CJK chars within char-width tolerance should group"
        );
        assert_eq!(words[0].text, "中国");
    }

    #[test]
    fn test_chinese_text_large_gap_splits() {
        // A 15pt hole (37 - 22) exceeds even the glyph width of 12.
        let words = extract_default(&[cjk("中", 10.0, 100.0), cjk("国", 37.0, 100.0)]);
        assert_eq!(
            words.len(),
            2,
            "CJK chars beyond char-width tolerance should split"
        );
        assert_eq!(words[0].text, "中");
        assert_eq!(words[1].text, "国");
    }

    #[test]
    fn test_japanese_mixed_text() {
        // CJK run immediately followed by a Latin run: "日本語abc".
        let words = extract_default(&[
            cjk("日", 10.0, 100.0),
            cjk("本", 23.0, 100.0),   // hole of 1
            cjk("語", 36.0, 100.0),   // hole of 1
            on_line("a", 49.0, 55.0), // hole of 1
            on_line("b", 55.0, 61.0), // touching
            on_line("c", 61.0, 67.0), // touching
        ]);
        assert_eq!(texts(&words), ["日本語abc"]);
    }

    #[test]
    fn test_korean_text_grouping() {
        // Two Hangul syllables with a 1pt hole: "한글".
        let words = extract_default(&[cjk("한", 10.0, 100.0), cjk("글", 23.0, 100.0)]);
        assert_eq!(texts(&words), ["한글"]);
    }

    #[test]
    fn test_mixed_cjk_latin_with_gap() {
        // "Hello", a 20pt hole, then "中国": two separate words.
        let words = extract_default(&[
            on_line("H", 10.0, 18.0),
            on_line("e", 18.0, 24.0),
            on_line("l", 24.0, 28.0),
            on_line("l", 28.0, 32.0),
            on_line("o", 32.0, 38.0),
            // 20pt hole here, larger than any tolerance in play
            cjk("中", 58.0, 100.0),
            cjk("国", 71.0, 100.0), // hole of 1
        ]);
        assert_eq!(texts(&words), ["Hello", "中国"]);
    }

    #[test]
    fn test_cjk_transition_to_latin_uses_cjk_tolerance() {
        // A 5pt hole (27 - 22) after a CJK glyph: above the default 3 but
        // below the CJK glyph width of 12.
        let words = extract_default(&[cjk("中", 10.0, 100.0), on_line("A", 27.0, 33.0)]);
        assert_eq!(
            words.len(),
            1,
            "CJK-to-Latin transition should use CJK tolerance"
        );
        assert_eq!(words[0].text, "中A");
    }

    // --- Vertical text ---

    #[test]
    fn test_vertical_text_chinese() {
        // One column at x = 100, glyphs stacked downward with 1pt holes.
        let chars = [
            cjk("中", 100.0, 10.0),
            cjk("国", 100.0, 23.0), // vertical hole: 23 - 22 = 1
            cjk("人", 100.0, 36.0), // vertical hole: 36 - 35 = 1
        ];
        let words = WordExtractor::extract(&chars, &ttb_options());
        assert_eq!(texts(&words), ["中国人"]);
    }

    #[test]
    fn test_vertical_text_two_columns() {
        // Columns at x = 100 (right) and x = 70 (left); vertical text reads
        // the right-hand column first.
        let chars = [
            // right-hand column
            cjk("一", 100.0, 10.0),
            cjk("二", 100.0, 23.0),
            // left-hand column
            cjk("三", 70.0, 10.0),
            cjk("四", 70.0, 23.0),
        ];
        let words = WordExtractor::extract(&chars, &ttb_options());
        // Reading order is right-to-left across columns.
        assert_eq!(texts(&words), ["一二", "三四"]);
    }

    #[test]
    fn test_vertical_text_with_gap() {
        // An 18pt vertical hole (40 - 22) exceeds the glyph height of 12.
        let chars = [cjk("上", 100.0, 10.0), cjk("下", 100.0, 40.0)];
        let words = WordExtractor::extract(&chars, &ttb_options());
        assert_eq!(
            words.len(),
            2,
            "Vertical CJK chars with large gap should split"
        );
        assert_eq!(words[0].text, "上");
        assert_eq!(words[1].text, "下");
    }

    #[test]
    fn test_cjk_with_space_splits() {
        // A space glyph between CJK glyphs splits them like any other text.
        let space = Char {
            fontname: "SimSun".to_string(),
            char_code: 32,
            ..make_char(" ", 22.0, 100.0, 25.0, 112.0)
        };
        let words = extract_default(&[cjk("中", 10.0, 100.0), space, cjk("国", 25.0, 100.0)]);
        assert_eq!(texts(&words), ["中", "国"]);
    }
}
pdfplumber_core/words.rs

pdfplumber_core/
words.rs