1use crate::geometry::BBox;
2use crate::text::{Char, TextDirection, is_cjk_text};
3
4#[derive(Debug, Clone)]
6pub struct WordOptions {
7 pub x_tolerance: f64,
9 pub y_tolerance: f64,
11 pub keep_blank_chars: bool,
13 pub use_text_flow: bool,
15 pub text_direction: TextDirection,
17}
18
19impl Default for WordOptions {
20 fn default() -> Self {
21 Self {
22 x_tolerance: 3.0,
23 y_tolerance: 3.0,
24 keep_blank_chars: false,
25 use_text_flow: false,
26 text_direction: TextDirection::default(),
27 }
28 }
29}
30
31#[derive(Debug, Clone, PartialEq)]
33#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
34pub struct Word {
35 pub text: String,
37 pub bbox: BBox,
39 pub doctop: f64,
41 pub direction: TextDirection,
43 pub chars: Vec<Char>,
45}
46
/// Stateless namespace for the word-grouping routines; see
/// `WordExtractor::extract`.
pub struct WordExtractor;
49
50impl WordExtractor {
51 pub fn extract(chars: &[Char], options: &WordOptions) -> Vec<Word> {
64 if chars.is_empty() {
65 return Vec::new();
66 }
67
68 let mut sorted_chars: Vec<&Char> = chars.iter().collect();
69 if !options.use_text_flow {
70 match options.text_direction {
71 TextDirection::Ttb => {
72 sorted_chars.sort_by(|a, b| {
74 b.bbox
75 .x0
76 .partial_cmp(&a.bbox.x0)
77 .unwrap()
78 .then(a.bbox.top.partial_cmp(&b.bbox.top).unwrap())
79 });
80 }
81 TextDirection::Btt => {
82 sorted_chars.sort_by(|a, b| {
84 b.bbox
85 .x0
86 .partial_cmp(&a.bbox.x0)
87 .unwrap()
88 .then(b.bbox.bottom.partial_cmp(&a.bbox.bottom).unwrap())
89 });
90 }
91 _ => {
92 sorted_chars.sort_by(|a, b| {
94 a.bbox
95 .top
96 .partial_cmp(&b.bbox.top)
97 .unwrap()
98 .then(a.bbox.x0.partial_cmp(&b.bbox.x0).unwrap())
99 });
100 }
101 }
102 }
103
104 let is_vertical = matches!(
105 options.text_direction,
106 TextDirection::Ttb | TextDirection::Btt
107 );
108
109 let mut words = Vec::new();
110 let mut current_chars: Vec<Char> = Vec::new();
111
112 for &ch in &sorted_chars {
113 let is_blank = ch.text.chars().all(|c| c.is_whitespace());
114
115 if is_blank && !options.keep_blank_chars {
117 if !current_chars.is_empty() {
118 words.push(Self::make_word(¤t_chars));
119 current_chars.clear();
120 }
121 continue;
122 }
123
124 if current_chars.is_empty() {
125 current_chars.push(ch.clone());
126 continue;
127 }
128
129 let last = current_chars.last().unwrap();
130
131 let should_split = if is_vertical {
132 Self::should_split_vertical(last, ch, options)
133 } else {
134 Self::should_split_horizontal(last, ch, options)
135 };
136
137 if should_split {
138 words.push(Self::make_word(¤t_chars));
139 current_chars.clear();
140 }
141
142 current_chars.push(ch.clone());
143 }
144
145 if !current_chars.is_empty() {
146 words.push(Self::make_word(¤t_chars));
147 }
148
149 words
150 }
151
152 fn effective_x_tolerance(last: &Char, current: &Char, base: f64) -> f64 {
157 if is_cjk_text(&last.text) || is_cjk_text(¤t.text) {
158 last.bbox.width().max(base)
159 } else {
160 base
161 }
162 }
163
164 fn effective_y_tolerance(last: &Char, current: &Char, base: f64) -> f64 {
166 if is_cjk_text(&last.text) || is_cjk_text(¤t.text) {
167 last.bbox.height().max(base)
168 } else {
169 base
170 }
171 }
172
173 fn should_split_horizontal(last: &Char, current: &Char, options: &WordOptions) -> bool {
175 let x_gap = current.bbox.x0 - last.bbox.x1;
176 let y_diff = (current.bbox.top - last.bbox.top).abs();
177 let x_tol = Self::effective_x_tolerance(last, current, options.x_tolerance);
178 x_gap > x_tol || y_diff > options.y_tolerance
179 }
180
181 fn should_split_vertical(last: &Char, current: &Char, options: &WordOptions) -> bool {
183 let y_gap = current.bbox.top - last.bbox.bottom;
184 let x_diff = (current.bbox.x0 - last.bbox.x0).abs();
185 let y_tol = Self::effective_y_tolerance(last, current, options.y_tolerance);
186 y_gap > y_tol || x_diff > options.x_tolerance
187 }
188
189 fn make_word(chars: &[Char]) -> Word {
190 let text: String = chars.iter().map(|c| c.text.as_str()).collect();
191 let bbox = chars
192 .iter()
193 .map(|c| c.bbox)
194 .reduce(|a, b| a.union(&b))
195 .expect("make_word called with non-empty chars");
196 let doctop = chars.iter().map(|c| c.doctop).fold(f64::INFINITY, f64::min);
197 let direction = chars[0].direction;
198 Word {
199 text,
200 bbox,
201 doctop,
202 direction,
203 chars: chars.to_vec(),
204 }
205 }
206}
207
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a character with fixed font metadata; only the text, font name
    /// and geometry vary. `doctop` is set to `top`.
    fn glyph(text: &str, fontname: &str, x0: f64, top: f64, x1: f64, bottom: f64) -> Char {
        Char {
            text: text.to_string(),
            bbox: BBox::new(x0, top, x1, bottom),
            fontname: fontname.to_string(),
            size: 12.0,
            doctop: top,
            upright: true,
            direction: TextDirection::Ltr,
            stroking_color: None,
            non_stroking_color: None,
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            char_code: 0,
            mcid: None,
            tag: None,
        }
    }

    /// Character in "TestFont" given by its bounding-box edges.
    fn make_char(text: &str, x0: f64, top: f64, x1: f64, bottom: f64) -> Char {
        glyph(text, "TestFont", x0, top, x1, bottom)
    }

    /// Character in "SimSun" given by its top-left corner and its size.
    fn make_cjk_char(text: &str, x0: f64, top: f64, width: f64, height: f64) -> Char {
        glyph(text, "SimSun", x0, top, x0 + width, top + height)
    }

    /// Runs the extractor with default options.
    fn extract_default(chars: &[Char]) -> Vec<Word> {
        WordExtractor::extract(chars, &WordOptions::default())
    }

    /// Default options with the text direction set to top-to-bottom.
    fn ttb_options() -> WordOptions {
        WordOptions {
            text_direction: TextDirection::Ttb,
            ..Default::default()
        }
    }

    /// Text of every word, in order; comparing against an array checks both
    /// the number of words and each word's text.
    fn texts(words: &[Word]) -> Vec<&str> {
        words.iter().map(|word| word.text.as_str()).collect()
    }

    #[test]
    fn test_word_has_doctop_and_direction() {
        let chars = [
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 20.0, 100.0, 30.0, 112.0),
        ];
        let words = extract_default(&chars);
        assert_eq!(words.len(), 1);
        let word = &words[0];
        assert_eq!(word.doctop, 100.0);
        assert_eq!(word.direction, TextDirection::Ltr);
    }

    #[test]
    fn test_word_doctop_uses_min_char_doctop() {
        let mut chars = [
            make_char("X", 10.0, 100.0, 20.0, 112.0),
            make_char("Y", 20.0, 100.0, 30.0, 112.0),
        ];
        chars[0].doctop = 900.0;
        chars[1].doctop = 892.0;
        let words = extract_default(&chars);
        assert_eq!(words[0].doctop, 892.0);
    }

    #[test]
    fn test_default_options() {
        let WordOptions {
            x_tolerance,
            y_tolerance,
            keep_blank_chars,
            use_text_flow,
            ..
        } = WordOptions::default();
        assert_eq!(x_tolerance, 3.0);
        assert_eq!(y_tolerance, 3.0);
        assert!(!keep_blank_chars);
        assert!(!use_text_flow);
    }

    #[test]
    fn test_empty_chars() {
        assert!(extract_default(&[]).is_empty());
    }

    #[test]
    fn test_single_char() {
        let chars = [make_char("A", 10.0, 100.0, 20.0, 112.0)];
        let words = extract_default(&chars);
        assert_eq!(texts(&words), ["A"]);
        assert_eq!(words[0].chars.len(), 1);
    }

    #[test]
    fn test_simple_horizontal_text() {
        let chars = [
            make_char("H", 10.0, 100.0, 20.0, 112.0),
            make_char("e", 20.0, 100.0, 30.0, 112.0),
            make_char("l", 30.0, 100.0, 35.0, 112.0),
            make_char("l", 35.0, 100.0, 40.0, 112.0),
            make_char("o", 40.0, 100.0, 50.0, 112.0),
        ];
        let words = extract_default(&chars);
        assert_eq!(texts(&words), ["Hello"]);
        assert_eq!(words[0].bbox, BBox::new(10.0, 100.0, 50.0, 112.0));
        assert_eq!(words[0].chars.len(), 5);
    }

    #[test]
    fn test_multi_line_text() {
        let chars = [
            make_char("H", 10.0, 100.0, 20.0, 112.0),
            make_char("i", 20.0, 100.0, 30.0, 112.0),
            make_char("L", 10.0, 120.0, 20.0, 132.0),
            make_char("o", 20.0, 120.0, 30.0, 132.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["Hi", "Lo"]);
    }

    #[test]
    fn test_text_with_large_gap() {
        let chars = [
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 20.0, 100.0, 30.0, 112.0),
            make_char("C", 50.0, 100.0, 60.0, 112.0),
            make_char("D", 60.0, 100.0, 70.0, 112.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["AB", "CD"]);
    }

    #[test]
    fn test_text_with_small_gap_within_tolerance() {
        let chars = [
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 22.0, 100.0, 32.0, 112.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["AB"]);
    }

    #[test]
    fn test_split_on_space_char() {
        let chars = [
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char(" ", 20.0, 100.0, 25.0, 112.0),
            make_char("B", 25.0, 100.0, 35.0, 112.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["A", "B"]);
    }

    #[test]
    fn test_keep_blank_chars_true() {
        let chars = [
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char(" ", 20.0, 100.0, 25.0, 112.0),
            make_char("B", 25.0, 100.0, 35.0, 112.0),
        ];
        let opts = WordOptions {
            keep_blank_chars: true,
            ..Default::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        assert_eq!(texts(&words), ["A B"]);
    }

    #[test]
    fn test_configurable_x_tolerance() {
        let chars = [
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 30.0, 100.0, 40.0, 112.0),
        ];

        // The default tolerance is too small for this gap.
        assert_eq!(extract_default(&chars).len(), 2);

        let opts = WordOptions {
            x_tolerance: 15.0,
            ..Default::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        assert_eq!(texts(&words), ["AB"]);
    }

    #[test]
    fn test_configurable_y_tolerance() {
        let chars = [
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("B", 20.0, 105.0, 30.0, 117.0),
        ];

        // The default tolerance is too small for this vertical offset.
        assert_eq!(extract_default(&chars).len(), 2);

        let opts = WordOptions {
            y_tolerance: 10.0,
            ..Default::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        assert_eq!(texts(&words), ["AB"]);
    }

    #[test]
    fn test_word_bbox_is_union_of_char_bboxes() {
        let chars = [
            make_char("A", 10.0, 98.0, 20.0, 112.0),
            make_char("b", 20.0, 100.0, 28.0, 110.0),
            make_char("C", 28.0, 97.0, 38.0, 113.0),
        ];
        let words = extract_default(&chars);
        assert_eq!(words.len(), 1);
        assert_eq!(words[0].bbox, BBox::new(10.0, 97.0, 38.0, 113.0));
    }

    #[test]
    fn test_unsorted_chars_are_sorted_spatially() {
        let chars = [
            make_char("B", 20.0, 100.0, 30.0, 112.0),
            make_char("A", 10.0, 100.0, 20.0, 112.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["AB"]);
    }

    #[test]
    fn test_use_text_flow_preserves_order() {
        let chars = [
            make_char("B", 20.0, 100.0, 30.0, 112.0),
            make_char("A", 10.0, 100.0, 20.0, 112.0),
        ];
        let opts = WordOptions {
            use_text_flow: true,
            ..Default::default()
        };
        let words = WordExtractor::extract(&chars, &opts);
        assert_eq!(texts(&words), ["BA"]);
    }

    #[test]
    fn test_multiple_spaces_between_words() {
        let chars = [
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char(" ", 20.0, 100.0, 25.0, 112.0),
            make_char(" ", 25.0, 100.0, 30.0, 112.0),
            make_char("B", 30.0, 100.0, 40.0, 112.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["A", "B"]);
    }

    #[test]
    fn test_leading_spaces_ignored() {
        let chars = [
            make_char(" ", 5.0, 100.0, 10.0, 112.0),
            make_char("A", 10.0, 100.0, 20.0, 112.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["A"]);
    }

    #[test]
    fn test_trailing_spaces_ignored() {
        let chars = [
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char(" ", 20.0, 100.0, 25.0, 112.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["A"]);
    }

    #[test]
    fn test_overlapping_chars_grouped() {
        let chars = [
            make_char("f", 10.0, 100.0, 20.0, 112.0),
            make_char("i", 18.0, 100.0, 25.0, 112.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["fi"]);
    }

    #[test]
    fn test_three_words_on_one_line() {
        let chars = [
            make_char("T", 10.0, 100.0, 20.0, 112.0),
            make_char("h", 20.0, 100.0, 28.0, 112.0),
            make_char("e", 28.0, 100.0, 36.0, 112.0),
            make_char(" ", 36.0, 100.0, 40.0, 112.0),
            make_char("q", 40.0, 100.0, 48.0, 112.0),
            make_char("u", 48.0, 100.0, 56.0, 112.0),
            make_char("i", 56.0, 100.0, 60.0, 112.0),
            make_char("c", 60.0, 100.0, 68.0, 112.0),
            make_char("k", 68.0, 100.0, 76.0, 112.0),
            make_char(" ", 76.0, 100.0, 80.0, 112.0),
            make_char("f", 80.0, 100.0, 88.0, 112.0),
            make_char("o", 88.0, 100.0, 96.0, 112.0),
            make_char("x", 96.0, 100.0, 104.0, 112.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["The", "quick", "fox"]);
    }

    #[test]
    fn test_multiline_sorting() {
        let chars = [
            make_char("C", 10.0, 120.0, 20.0, 132.0),
            make_char("A", 10.0, 100.0, 20.0, 112.0),
            make_char("D", 20.0, 120.0, 30.0, 132.0),
            make_char("B", 20.0, 100.0, 30.0, 112.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["AB", "CD"]);
    }

    #[test]
    fn test_chinese_text_grouping() {
        let chars = [
            make_cjk_char("中", 10.0, 100.0, 12.0, 12.0),
            make_cjk_char("国", 23.0, 100.0, 12.0, 12.0),
            make_cjk_char("人", 36.0, 100.0, 12.0, 12.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["中国人"]);
    }

    #[test]
    fn test_chinese_text_with_larger_gap_uses_char_width_tolerance() {
        let chars = [
            make_cjk_char("中", 10.0, 100.0, 12.0, 12.0),
            make_cjk_char("国", 30.0, 100.0, 12.0, 12.0),
        ];
        let words = extract_default(&chars);
        assert_eq!(
            words.len(),
            1,
            "CJK chars within char-width tolerance should group"
        );
        assert_eq!(words[0].text, "中国");
    }

    #[test]
    fn test_chinese_text_large_gap_splits() {
        let chars = [
            make_cjk_char("中", 10.0, 100.0, 12.0, 12.0),
            make_cjk_char("国", 37.0, 100.0, 12.0, 12.0),
        ];
        let words = extract_default(&chars);
        assert_eq!(
            words.len(),
            2,
            "CJK chars beyond char-width tolerance should split"
        );
        assert_eq!(words[0].text, "中");
        assert_eq!(words[1].text, "国");
    }

    #[test]
    fn test_japanese_mixed_text() {
        let chars = [
            make_cjk_char("日", 10.0, 100.0, 12.0, 12.0),
            make_cjk_char("本", 23.0, 100.0, 12.0, 12.0),
            make_cjk_char("語", 36.0, 100.0, 12.0, 12.0),
            make_char("a", 49.0, 100.0, 55.0, 112.0),
            make_char("b", 55.0, 100.0, 61.0, 112.0),
            make_char("c", 61.0, 100.0, 67.0, 112.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["日本語abc"]);
    }

    #[test]
    fn test_korean_text_grouping() {
        let chars = [
            make_cjk_char("한", 10.0, 100.0, 12.0, 12.0),
            make_cjk_char("글", 23.0, 100.0, 12.0, 12.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["한글"]);
    }

    #[test]
    fn test_mixed_cjk_latin_with_gap() {
        let chars = [
            make_char("H", 10.0, 100.0, 18.0, 112.0),
            make_char("e", 18.0, 100.0, 24.0, 112.0),
            make_char("l", 24.0, 100.0, 28.0, 112.0),
            make_char("l", 28.0, 100.0, 32.0, 112.0),
            make_char("o", 32.0, 100.0, 38.0, 112.0),
            make_cjk_char("中", 58.0, 100.0, 12.0, 12.0),
            make_cjk_char("国", 71.0, 100.0, 12.0, 12.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["Hello", "中国"]);
    }

    #[test]
    fn test_cjk_transition_to_latin_uses_cjk_tolerance() {
        let chars = [
            make_cjk_char("中", 10.0, 100.0, 12.0, 12.0),
            make_char("A", 27.0, 100.0, 33.0, 112.0),
        ];
        let words = extract_default(&chars);
        assert_eq!(
            words.len(),
            1,
            "CJK-to-Latin transition should use CJK tolerance"
        );
        assert_eq!(words[0].text, "中A");
    }

    #[test]
    fn test_vertical_text_chinese() {
        let chars = [
            make_cjk_char("中", 100.0, 10.0, 12.0, 12.0),
            make_cjk_char("国", 100.0, 23.0, 12.0, 12.0),
            make_cjk_char("人", 100.0, 36.0, 12.0, 12.0),
        ];
        let words = WordExtractor::extract(&chars, &ttb_options());
        assert_eq!(texts(&words), ["中国人"]);
    }

    #[test]
    fn test_vertical_text_two_columns() {
        let chars = [
            make_cjk_char("一", 100.0, 10.0, 12.0, 12.0),
            make_cjk_char("二", 100.0, 23.0, 12.0, 12.0),
            make_cjk_char("三", 70.0, 10.0, 12.0, 12.0),
            make_cjk_char("四", 70.0, 23.0, 12.0, 12.0),
        ];
        let words = WordExtractor::extract(&chars, &ttb_options());
        assert_eq!(texts(&words), ["一二", "三四"]);
    }

    #[test]
    fn test_vertical_text_with_gap() {
        let chars = [
            make_cjk_char("上", 100.0, 10.0, 12.0, 12.0),
            make_cjk_char("下", 100.0, 40.0, 12.0, 12.0),
        ];
        let words = WordExtractor::extract(&chars, &ttb_options());
        assert_eq!(
            words.len(),
            2,
            "Vertical CJK chars with large gap should split"
        );
        assert_eq!(words[0].text, "上");
        assert_eq!(words[1].text, "下");
    }

    #[test]
    fn test_cjk_with_space_splits() {
        // A space in the same font, differing from the helper output only in
        // its character code.
        let space = Char {
            char_code: 32,
            ..glyph(" ", "SimSun", 22.0, 100.0, 25.0, 112.0)
        };
        let chars = [
            make_cjk_char("中", 10.0, 100.0, 12.0, 12.0),
            space,
            make_cjk_char("国", 25.0, 100.0, 12.0, 12.0),
        ];
        assert_eq!(texts(&extract_default(&chars)), ["中", "国"]);
    }
}