pdfplumber_core/
words.rs

1use crate::geometry::BBox;
2use crate::text::{Char, TextDirection, is_cjk_text};
3
4/// Options for word extraction, matching pdfplumber defaults.
5#[derive(Debug, Clone)]
6pub struct WordOptions {
7    /// Maximum horizontal distance between characters to group into a word.
8    pub x_tolerance: f64,
9    /// Maximum vertical distance between characters to group into a word.
10    pub y_tolerance: f64,
11    /// If true, include blank/space characters in words instead of splitting on them.
12    pub keep_blank_chars: bool,
13    /// If true, use the text flow order from the PDF content stream instead of spatial ordering.
14    pub use_text_flow: bool,
15    /// Text direction for grouping characters.
16    pub text_direction: TextDirection,
17}
18
19impl Default for WordOptions {
20    fn default() -> Self {
21        Self {
22            x_tolerance: 3.0,
23            y_tolerance: 3.0,
24            keep_blank_chars: false,
25            use_text_flow: false,
26            text_direction: TextDirection::default(),
27        }
28    }
29}
30
31/// A word extracted from a PDF page.
32#[derive(Debug, Clone, PartialEq)]
33#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
34pub struct Word {
35    /// The text content of this word.
36    pub text: String,
37    /// Bounding box encompassing all constituent characters.
38    pub bbox: BBox,
39    /// Distance from the top of the first page (minimum doctop of constituent chars).
40    pub doctop: f64,
41    /// Text direction for this word.
42    pub direction: TextDirection,
43    /// The characters that make up this word.
44    pub chars: Vec<Char>,
45}
46
/// Extracts words from a sequence of characters based on spatial proximity.
///
/// This is a stateless marker type: all functionality is exposed through
/// associated functions, so no instance is needed to use it.
#[derive(Debug, Clone, Copy, Default)]
pub struct WordExtractor;
49
50impl WordExtractor {
51    /// Extract words from the given characters using the specified options.
52    ///
53    /// Characters are grouped into words based on spatial proximity:
54    /// - Characters within `x_tolerance` horizontally and `y_tolerance` vertically
55    ///   are grouped together.
56    /// - For CJK characters, character width (or height for vertical text) is used
57    ///   as the tolerance instead of the fixed `x_tolerance`/`y_tolerance`.
58    /// - By default, whitespace characters split words. Set `keep_blank_chars`
59    ///   to include them.
60    /// - By default, characters are sorted spatially. Set `use_text_flow` to
61    ///   preserve PDF content stream order.
62    /// - `text_direction` controls sorting and gap logic for vertical text.
63    pub fn extract(chars: &[Char], options: &WordOptions) -> Vec<Word> {
64        if chars.is_empty() {
65            return Vec::new();
66        }
67
68        let mut sorted_chars: Vec<&Char> = chars.iter().collect();
69        if !options.use_text_flow {
70            match options.text_direction {
71                TextDirection::Ttb => {
72                    // Vertical: columns right-to-left, top-to-bottom within column
73                    sorted_chars.sort_by(|a, b| {
74                        b.bbox
75                            .x0
76                            .partial_cmp(&a.bbox.x0)
77                            .unwrap()
78                            .then(a.bbox.top.partial_cmp(&b.bbox.top).unwrap())
79                    });
80                }
81                TextDirection::Btt => {
82                    // Vertical bottom-to-top: columns right-to-left, bottom-to-top
83                    sorted_chars.sort_by(|a, b| {
84                        b.bbox
85                            .x0
86                            .partial_cmp(&a.bbox.x0)
87                            .unwrap()
88                            .then(b.bbox.bottom.partial_cmp(&a.bbox.bottom).unwrap())
89                    });
90                }
91                _ => {
92                    // Horizontal (Ltr/Rtl): top-to-bottom, left-to-right
93                    sorted_chars.sort_by(|a, b| {
94                        a.bbox
95                            .top
96                            .partial_cmp(&b.bbox.top)
97                            .unwrap()
98                            .then(a.bbox.x0.partial_cmp(&b.bbox.x0).unwrap())
99                    });
100                }
101            }
102        }
103
104        let is_vertical = matches!(
105            options.text_direction,
106            TextDirection::Ttb | TextDirection::Btt
107        );
108
109        let mut words = Vec::new();
110        let mut current_chars: Vec<Char> = Vec::new();
111
112        for &ch in &sorted_chars {
113            let is_blank = ch.text.chars().all(|c| c.is_whitespace());
114
115            // If this is a blank and we're not keeping blanks, finish current word
116            if is_blank && !options.keep_blank_chars {
117                if !current_chars.is_empty() {
118                    words.push(Self::make_word(&current_chars));
119                    current_chars.clear();
120                }
121                continue;
122            }
123
124            if current_chars.is_empty() {
125                current_chars.push(ch.clone());
126                continue;
127            }
128
129            let last = current_chars.last().unwrap();
130
131            let should_split = if is_vertical {
132                Self::should_split_vertical(last, ch, options)
133            } else {
134                Self::should_split_horizontal(last, ch, options)
135            };
136
137            if should_split {
138                words.push(Self::make_word(&current_chars));
139                current_chars.clear();
140            }
141
142            current_chars.push(ch.clone());
143        }
144
145        if !current_chars.is_empty() {
146            words.push(Self::make_word(&current_chars));
147        }
148
149        words
150    }
151
152    /// Determine the effective x-tolerance between two characters.
153    ///
154    /// For CJK characters, uses the previous character's width as tolerance,
155    /// which accounts for the wider spacing of full-width characters.
156    fn effective_x_tolerance(last: &Char, current: &Char, base: f64) -> f64 {
157        if is_cjk_text(&last.text) || is_cjk_text(&current.text) {
158            last.bbox.width().max(base)
159        } else {
160            base
161        }
162    }
163
164    /// Determine the effective y-tolerance between two characters (for vertical text).
165    fn effective_y_tolerance(last: &Char, current: &Char, base: f64) -> f64 {
166        if is_cjk_text(&last.text) || is_cjk_text(&current.text) {
167            last.bbox.height().max(base)
168        } else {
169            base
170        }
171    }
172
173    /// Check if two horizontally-adjacent chars should be split into separate words.
174    fn should_split_horizontal(last: &Char, current: &Char, options: &WordOptions) -> bool {
175        let x_gap = current.bbox.x0 - last.bbox.x1;
176        let y_diff = (current.bbox.top - last.bbox.top).abs();
177        let x_tol = Self::effective_x_tolerance(last, current, options.x_tolerance);
178        x_gap > x_tol || y_diff > options.y_tolerance
179    }
180
181    /// Check if two vertically-adjacent chars should be split into separate words.
182    fn should_split_vertical(last: &Char, current: &Char, options: &WordOptions) -> bool {
183        let y_gap = current.bbox.top - last.bbox.bottom;
184        let x_diff = (current.bbox.x0 - last.bbox.x0).abs();
185        let y_tol = Self::effective_y_tolerance(last, current, options.y_tolerance);
186        y_gap > y_tol || x_diff > options.x_tolerance
187    }
188
189    fn make_word(chars: &[Char]) -> Word {
190        let text: String = chars.iter().map(|c| c.text.as_str()).collect();
191        let bbox = chars
192            .iter()
193            .map(|c| c.bbox)
194            .reduce(|a, b| a.union(&b))
195            .expect("make_word called with non-empty chars");
196        let doctop = chars.iter().map(|c| c.doctop).fold(f64::INFINITY, f64::min);
197        let direction = chars[0].direction;
198        Word {
199            text,
200            bbox,
201            doctop,
202            direction,
203            chars: chars.to_vec(),
204        }
205    }
206}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an upright, left-to-right test character with the given text and
    /// bounding box; `doctop` is set to `top`.
    fn make_char(text: &str, x0: f64, top: f64, x1: f64, bottom: f64) -> Char {
        Char {
            text: text.to_string(),
            bbox: BBox::new(x0, top, x1, bottom),
            fontname: "TestFont".to_string(),
            size: 12.0,
            doctop: top,
            upright: true,
            direction: TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: None,
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            char_code: 0,
        }
    }

    #[test]
    fn test_word_has_doctop_and_direction() {
        let chars = vec![
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 20.0, 100.0, 30.0, 112.0),
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].doctop, 100.0);
        assert_eq!(words[0].direction, TextDirection::Ltr);
    }

    #[test]
    fn test_word_doctop_uses_min_char_doctop() {
        // Characters with different doctop values - word should use minimum
        let mut chars = vec![
            make_char("X", 10.0, 100.0, 20.0, 112.0),
            make_char("Y", 20.0, 100.0, 30.0, 112.0),
        ];
        chars[0].doctop = 900.0;
        chars[1].doctop = 892.0;
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words[0].doctop, 892.0);
    }

    #[test]
    fn test_default_options() {
        let opts = WordOptions::default();
        assert_eq!(opts.x_tolerance, 3.0);
        assert_eq!(opts.y_tolerance, 3.0);
        assert!(!opts.keep_blank_chars);
        assert!(!opts.use_text_flow);
        assert_eq!(opts.text_direction, TextDirection::default());
    }

    #[test]
    fn test_empty_chars() {
        let words = WordExtractor::extract(&[], &WordOptions::default());
        assert!(words.is_empty());
    }

    #[test]
    fn test_whitespace_only_chars_yield_no_words() {
        let chars = vec![
            make_char(" ", 10.0, 100.0, 15.0, 112.0),
            make_char(" ", 15.0, 100.0, 20.0, 112.0),
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert!(words.is_empty());
    }

    #[test]
    fn test_single_char() {
        let chars = vec![make_char("A", 10.0, 100.0, 20.0, 112.0)];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "A");
        assert_eq!(words[0].chars.len(), 1);
    }

    #[test]
    fn test_simple_horizontal_text() {
        // "Hello" — 5 consecutive touching chars on one line
        let chars = vec![
            make_char("H", 10.0, 100.0, 20.0, 112.0),
            make_char("e", 20.0, 100.0, 30.0, 112.0),
            make_char("l", 30.0, 100.0, 35.0, 112.0),
            make_char("l", 35.0, 100.0, 40.0, 112.0),
            make_char("o", 40.0, 100.0, 50.0, 112.0),
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "Hello");
        assert_eq!(words[0].bbox, BBox::new(10.0, 100.0, 50.0, 112.0));
        assert_eq!(words[0].chars.len(), 5);
    }

    #[test]
    fn test_multi_line_text() {
        // "Hi" on line 1 (top=100), "Lo" on line 2 (top=120)
        let chars = vec![
            make_char("H", 10.0, 100.0, 20.0, 112.0),
            make_char("i", 20.0, 100.0, 30.0, 112.0),
            make_char("L", 10.0, 120.0, 20.0, 132.0),
            make_char("o", 20.0, 120.0, 30.0, 132.0),
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "Hi");
        assert_eq!(words[1].text, "Lo");
    }

    #[test]
    fn test_text_with_large_gap() {
        // "AB" then gap of 20 then "CD" — should be separate words
        let chars = vec![
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 20.0, 100.0, 30.0, 112.0),
            make_char("C", 50.0, 100.0, 60.0, 112.0), // gap = 50-30 = 20 > 3
            make_char("D", 60.0, 100.0, 70.0, 112.0),
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "AB");
        assert_eq!(words[1].text, "CD");
    }

    #[test]
    fn test_text_with_small_gap_within_tolerance() {
        // Gap of 2 which is within default tolerance of 3
        let chars = vec![
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 22.0, 100.0, 32.0, 112.0), // gap = 22-20 = 2 <= 3
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "AB");
    }

    #[test]
    fn test_split_on_space_char() {
        // "A B" with an explicit space character
        let chars = vec![
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char(" ", 20.0, 100.0, 25.0, 112.0),
            make_char("B", 25.0, 100.0, 35.0, 112.0),
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "A");
        assert_eq!(words[1].text, "B");
    }

    #[test]
    fn test_keep_blank_chars_true() {
        // "A B" with space — keep_blank_chars groups them as one word
        let chars = vec![
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char(" ", 20.0, 100.0, 25.0, 112.0),
            make_char("B", 25.0, 100.0, 35.0, 112.0),
        ];
        let opts = WordOptions {
            keep_blank_chars: true,
            ..WordOptions::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "A B");
    }

    #[test]
    fn test_configurable_x_tolerance() {
        // Gap of 10 between A and B
        let chars = vec![
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 30.0, 100.0, 40.0, 112.0), // gap = 10
        ];

        // Default tolerance (3) — two words
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 2);

        // Larger tolerance (15) — one word
        let opts = WordOptions {
            x_tolerance: 15.0,
            ..WordOptions::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "AB");
    }

    #[test]
    fn test_configurable_y_tolerance() {
        // Chars on slightly different vertical positions (y_diff = 5)
        let chars = vec![
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 20.0, 105.0, 30.0, 117.0), // y_diff = 5
        ];

        // Default y_tolerance (3) — two words
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 2);

        // Larger y_tolerance (10) — one word
        let opts = WordOptions {
            y_tolerance: 10.0,
            ..WordOptions::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "AB");
    }

    #[test]
    fn test_word_bbox_is_union_of_char_bboxes() {
        // Characters with varying heights
        let chars = vec![
            make_char("A", 10.0, 98.0, 20.0, 112.0),
            make_char("b", 20.0, 100.0, 28.0, 110.0),
            make_char("C", 28.0, 97.0, 38.0, 113.0),
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].bbox, BBox::new(10.0, 97.0, 38.0, 113.0));
    }

    #[test]
    fn test_unsorted_chars_are_sorted_spatially() {
        // Chars given in reverse spatial order
        let chars = vec![
            make_char("B", 20.0, 100.0, 30.0, 112.0),
            make_char("A", 10.0, 100.0, 20.0, 112.0),
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "AB");
    }

    #[test]
    fn test_use_text_flow_preserves_order() {
        // Chars in PDF content stream order (reverse of spatial)
        let chars = vec![
            make_char("B", 20.0, 100.0, 30.0, 112.0),
            make_char("A", 10.0, 100.0, 20.0, 112.0),
        ];
        let opts = WordOptions {
            use_text_flow: true,
            ..WordOptions::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        // With text_flow, order preserved: "B" first, "A" second
        // x_gap = A.x0(10) - B.x1(30) = -20 <= 3, so they group
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "BA");
    }

    #[test]
    fn test_multiple_spaces_between_words() {
        // "A" then multiple spaces then "B"
        let chars = vec![
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char(" ", 20.0, 100.0, 25.0, 112.0),
            make_char(" ", 25.0, 100.0, 30.0, 112.0),
            make_char("B", 30.0, 100.0, 40.0, 112.0),
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "A");
        assert_eq!(words[1].text, "B");
    }

    #[test]
    fn test_leading_spaces_ignored() {
        let chars = vec![
            make_char(" ", 5.0, 100.0, 10.0, 112.0),
            make_char("A", 10.0, 100.0, 20.0, 112.0),
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "A");
    }

    #[test]
    fn test_trailing_spaces_ignored() {
        let chars = vec![
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char(" ", 20.0, 100.0, 25.0, 112.0),
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "A");
    }

    #[test]
    fn test_overlapping_chars_grouped() {
        // Overlapping characters (negative gap) should still group
        let chars = vec![
            make_char("f", 10.0, 100.0, 20.0, 112.0),
            make_char("i", 18.0, 100.0, 25.0, 112.0), // gap = 18-20 = -2 (overlap)
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "fi");
    }

    #[test]
    fn test_three_words_on_one_line() {
        // "The quick fox" — three words separated by spaces
        let chars = vec![
            make_char("T", 10.0, 100.0, 20.0, 112.0),
            make_char("h", 20.0, 100.0, 28.0, 112.0),
            make_char("e", 28.0, 100.0, 36.0, 112.0),
            make_char(" ", 36.0, 100.0, 40.0, 112.0),
            make_char("q", 40.0, 100.0, 48.0, 112.0),
            make_char("u", 48.0, 100.0, 56.0, 112.0),
            make_char("i", 56.0, 100.0, 60.0, 112.0),
            make_char("c", 60.0, 100.0, 68.0, 112.0),
            make_char("k", 68.0, 100.0, 76.0, 112.0),
            make_char(" ", 76.0, 100.0, 80.0, 112.0),
            make_char("f", 80.0, 100.0, 88.0, 112.0),
            make_char("o", 88.0, 100.0, 96.0, 112.0),
            make_char("x", 96.0, 100.0, 104.0, 112.0),
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 3);
        assert_eq!(words[0].text, "The");
        assert_eq!(words[1].text, "quick");
        assert_eq!(words[2].text, "fox");
    }

    #[test]
    fn test_multiline_sorting() {
        // Chars from two lines given interleaved — should sort by top then x0
        let chars = vec![
            make_char("C", 10.0, 120.0, 20.0, 132.0), // line 2
            make_char("A", 10.0, 100.0, 20.0, 112.0), // line 1
            make_char("D", 20.0, 120.0, 30.0, 132.0), // line 2
            make_char("B", 20.0, 100.0, 30.0, 112.0), // line 1
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "AB");
        assert_eq!(words[1].text, "CD");
    }

    // --- CJK word grouping tests (US-020) ---

    /// Helper to create a CJK character (full-width, typically 12pt wide).
    ///
    /// Identical to `make_char` except that the box is given as origin plus
    /// size and the font name is "SimSun".
    fn make_cjk_char(text: &str, x0: f64, top: f64, width: f64, height: f64) -> Char {
        Char {
            fontname: "SimSun".to_string(),
            ..make_char(text, x0, top, x0 + width, top + height)
        }
    }

    #[test]
    fn test_chinese_text_grouping() {
        // "中国人" — 3 consecutive CJK characters, each 12pt wide with small gaps
        // With default x_tolerance=3, a gap of 1 between 12pt-wide chars should group
        let chars = vec![
            make_cjk_char("中", 10.0, 100.0, 12.0, 12.0),
            make_cjk_char("国", 23.0, 100.0, 12.0, 12.0), // gap = 23-22 = 1
            make_cjk_char("人", 36.0, 100.0, 12.0, 12.0), // gap = 36-35 = 1
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "中国人");
    }

    #[test]
    fn test_chinese_text_with_larger_gap_uses_char_width_tolerance() {
        // CJK chars with gap=8, which exceeds default x_tolerance=3
        // but CJK-aware logic should use char width (12) as tolerance
        let chars = vec![
            make_cjk_char("中", 10.0, 100.0, 12.0, 12.0),
            make_cjk_char("国", 30.0, 100.0, 12.0, 12.0), // gap = 30-22 = 8 > 3 but < 12
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(
            words.len(),
            1,
            "CJK chars within char-width tolerance should group"
        );
        assert_eq!(words[0].text, "中国");
    }

    #[test]
    fn test_chinese_text_large_gap_splits() {
        // CJK chars with gap=15, exceeding char width (12)
        let chars = vec![
            make_cjk_char("中", 10.0, 100.0, 12.0, 12.0),
            make_cjk_char("国", 37.0, 100.0, 12.0, 12.0), // gap = 37-22 = 15 > 12
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(
            words.len(),
            2,
            "CJK chars beyond char-width tolerance should split"
        );
        assert_eq!(words[0].text, "中");
        assert_eq!(words[1].text, "国");
    }

    #[test]
    fn test_japanese_mixed_text() {
        // "日本語abc" — CJK followed by Latin
        let chars = vec![
            make_cjk_char("日", 10.0, 100.0, 12.0, 12.0),
            make_cjk_char("本", 23.0, 100.0, 12.0, 12.0), // gap=1
            make_cjk_char("語", 36.0, 100.0, 12.0, 12.0), // gap=1
            make_char("a", 49.0, 100.0, 55.0, 112.0),     // gap=1
            make_char("b", 55.0, 100.0, 61.0, 112.0),     // gap=0
            make_char("c", 61.0, 100.0, 67.0, 112.0),     // gap=0
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "日本語abc");
    }

    #[test]
    fn test_korean_text_grouping() {
        // "한글" — 2 Korean characters
        let chars = vec![
            make_cjk_char("한", 10.0, 100.0, 12.0, 12.0),
            make_cjk_char("글", 23.0, 100.0, 12.0, 12.0), // gap=1
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "한글");
    }

    #[test]
    fn test_mixed_cjk_latin_with_gap() {
        // "Hello" then gap then "中国" — should be two words
        let chars = vec![
            make_char("H", 10.0, 100.0, 18.0, 112.0),
            make_char("e", 18.0, 100.0, 24.0, 112.0),
            make_char("l", 24.0, 100.0, 28.0, 112.0),
            make_char("l", 28.0, 100.0, 32.0, 112.0),
            make_char("o", 32.0, 100.0, 38.0, 112.0),
            // gap of 20 (well beyond any tolerance)
            make_cjk_char("中", 58.0, 100.0, 12.0, 12.0),
            make_cjk_char("国", 71.0, 100.0, 12.0, 12.0), // gap=1
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "Hello");
        assert_eq!(words[1].text, "中国");
    }

    #[test]
    fn test_cjk_transition_to_latin_uses_cjk_tolerance() {
        // CJK char followed by Latin char with gap=5 (> default 3, but < CJK width 12)
        let chars = vec![
            make_cjk_char("中", 10.0, 100.0, 12.0, 12.0),
            make_char("A", 27.0, 100.0, 33.0, 112.0), // gap = 27-22 = 5
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(
            words.len(),
            1,
            "CJK-to-Latin transition should use CJK tolerance"
        );
        assert_eq!(words[0].text, "中A");
    }

    #[test]
    fn test_vertical_text_chinese() {
        // Vertical text: chars stacked top-to-bottom in a column
        // "中国人" flowing vertically at x=100
        let chars = vec![
            make_cjk_char("中", 100.0, 10.0, 12.0, 12.0),
            make_cjk_char("国", 100.0, 23.0, 12.0, 12.0), // y_gap = 23-22 = 1
            make_cjk_char("人", 100.0, 36.0, 12.0, 12.0), // y_gap = 36-35 = 1
        ];
        let opts = WordOptions {
            text_direction: TextDirection::Ttb,
            ..WordOptions::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "中国人");
    }

    #[test]
    fn test_vertical_text_btt_reads_bottom_to_top() {
        // Same column as above but read bottom-to-top: the lower char comes first
        let chars = vec![
            make_cjk_char("上", 100.0, 10.0, 12.0, 12.0),
            make_cjk_char("下", 100.0, 23.0, 12.0, 12.0), // 1pt above-gap to "上"
        ];
        let opts = WordOptions {
            text_direction: TextDirection::Btt,
            ..WordOptions::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].text, "下上");
    }

    #[test]
    fn test_vertical_text_two_columns() {
        // Two vertical columns: column 1 at x=100, column 2 at x=70
        // Vertical text reads right-to-left (column1 first, column2 second)
        let chars = vec![
            // Column 1 (right side, x=100)
            make_cjk_char("一", 100.0, 10.0, 12.0, 12.0),
            make_cjk_char("二", 100.0, 23.0, 12.0, 12.0),
            // Column 2 (left side, x=70)
            make_cjk_char("三", 70.0, 10.0, 12.0, 12.0),
            make_cjk_char("四", 70.0, 23.0, 12.0, 12.0),
        ];
        let opts = WordOptions {
            text_direction: TextDirection::Ttb,
            ..WordOptions::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        assert_eq!(words.len(), 2);
        // Right column first in reading order (right-to-left)
        assert_eq!(words[0].text, "一二");
        assert_eq!(words[1].text, "三四");
    }

    #[test]
    fn test_vertical_text_with_gap() {
        // Vertical CJK chars with large vertical gap
        let chars = vec![
            make_cjk_char("上", 100.0, 10.0, 12.0, 12.0),
            make_cjk_char("下", 100.0, 40.0, 12.0, 12.0), // y_gap = 40-22 = 18 > 12
        ];
        let opts = WordOptions {
            text_direction: TextDirection::Ttb,
            ..WordOptions::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        assert_eq!(
            words.len(),
            2,
            "Vertical CJK chars with large gap should split"
        );
        assert_eq!(words[0].text, "上");
        assert_eq!(words[1].text, "下");
    }

    #[test]
    fn test_cjk_with_space_splits() {
        // CJK chars separated by a space character should still split on the space
        let space = Char {
            fontname: "SimSun".to_string(),
            char_code: 32,
            ..make_char(" ", 22.0, 100.0, 25.0, 112.0)
        };
        let chars = vec![
            make_cjk_char("中", 10.0, 100.0, 12.0, 12.0),
            space,
            make_cjk_char("国", 25.0, 100.0, 12.0, 12.0),
        ];
        let words = WordExtractor::extract(&chars, &WordOptions::default());
        assert_eq!(words.len(), 2);
        assert_eq!(words[0].text, "中");
        assert_eq!(words[1].text, "国");
    }
}
pdfplumber_core/words.rs

pdfplumber_core/
words.rs