oxitext-icu 0.1.2

ICU4X-backed CLDR segmentation and locale-aware collation for OxiText
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
//! CLDR-based text segmentation via `icu_segmenter`.
//!
//! Wraps the four ICU4X segmenters (line, word, grapheme-cluster, sentence)
//! into a single [`IcuSegmenter`] struct, using compiled CLDR data so no
//! external data provider is required at runtime.

use std::collections::HashMap;
use std::sync::Mutex;

use icu_segmenter::options::{
    LineBreakOptions, SentenceBreakInvariantOptions, WordBreakInvariantOptions,
};
use icu_segmenter::{
    GraphemeClusterSegmenter, GraphemeClusterSegmenterBorrowed, LineSegmenter,
    LineSegmenterBorrowed, SentenceSegmenter, SentenceSegmenterBorrowed, WordSegmenter,
    WordSegmenterBorrowed,
};

/// The kind of text boundary to locate.
///
/// Passed to the [`IcuSegmenter`] query methods to select which of the
/// underlying segmenters is run over the input text.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SegmentKind {
    /// UAX #14 line-break opportunities (suitable for word-wrapping).
    Line,
    /// UAX #29 word boundaries.
    Word,
    /// UAX #29 grapheme-cluster boundaries (user-perceived characters).
    GraphemeCluster,
    /// Alias for [`SegmentKind::GraphemeCluster`] — UAX #29 grapheme-cluster
    /// boundaries (user-perceived characters).
    ///
    /// This is a separate variant rather than a true alias: it is routed to
    /// the same grapheme segmenter, but it compares unequal to
    /// `GraphemeCluster` and is treated as a different key wherever the kind
    /// is used for equality or hashing.
    Grapheme,
    /// UAX #29 sentence boundaries.
    Sentence,
}

/// A text segment with position information and kind metadata.
///
/// Produced by [`IcuSegmenter::segments`] and [`IcuSegmenter::iter_segments`];
/// `byte_start` and `byte_end` are UTF-8 byte offsets into the analysed string,
/// so `&input[byte_start..byte_end]` equals `text`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Segment {
    /// The text content of the segment (owned copy).
    pub text: String,
    /// UTF-8 byte offset of the start of the segment (inclusive).
    pub byte_start: usize,
    /// UTF-8 byte offset of the end of the segment (exclusive).
    pub byte_end: usize,
    /// The kind of boundary used to produce this segment.
    ///
    /// This is the exact [`SegmentKind`] the caller requested, so
    /// `Grapheme` and `GraphemeCluster` requests are reported as-is.
    pub kind: SegmentKind,
}

/// Multi-kind text segmenter backed by ICU4X CLDR compiled data.
///
/// Holds borrowed references into static CLDR tables; constructing one is
/// essentially free (no allocation, no I/O).
///
/// Results of [`Self::break_points`] and [`Self::segments`] are memoised per
/// `(text, kind)` pair behind a [`Mutex`].  No eviction logic is visible in
/// this module, so a long-lived instance fed many distinct strings keeps one
/// cache entry per distinct input.
///
/// # Examples
///
/// ```rust
/// use oxitext_icu::{IcuSegmenter, SegmentKind};
///
/// let seg = IcuSegmenter::new();
/// let breaks = seg.break_points("Hello world", SegmentKind::Word);
/// // "Hello world" has break opportunities at [0, 5, 6, 11]
/// assert!(breaks.len() >= 2);
/// ```
pub struct IcuSegmenter {
    /// UAX #14 line-break segmenter (used for [`SegmentKind::Line`]).
    line: LineSegmenterBorrowed<'static>,
    /// UAX #29 word segmenter (used for [`SegmentKind::Word`]).
    word: WordSegmenterBorrowed<'static>,
    /// UAX #29 grapheme-cluster segmenter (used for both
    /// [`SegmentKind::GraphemeCluster`] and [`SegmentKind::Grapheme`]).
    grapheme: GraphemeClusterSegmenterBorrowed<'static>,
    /// UAX #29 sentence segmenter (used for [`SegmentKind::Sentence`]).
    sentence: SentenceSegmenterBorrowed<'static>,
    /// Memoisation cache for [`Self::segments`].
    ///
    /// Uses interior mutability so the public API remains `&self`, which is
    /// important for GUI render-loops that hold a shared reference.
    segment_cache: Mutex<HashMap<(String, SegmentKind), Vec<Segment>>>,
    /// Memoisation cache for [`Self::break_points`].
    ///
    /// Keyed by an owned copy of the input text plus the requested kind.
    break_cache: Mutex<HashMap<(String, SegmentKind), Vec<usize>>>,
}

impl IcuSegmenter {
    /// Creates a new locale-invariant [`IcuSegmenter`] using compiled CLDR data.
    ///
    /// All four segmenters are initialised with default options and the best
    /// available algorithm for complex scripts (LSTM model for line and word
    /// breaking in South-East Asian scripts; dictionary for Japanese).
    ///
    /// For locale-aware construction (currently only affects the invariant
    /// options) use [`Self::new_with_locale`].
    pub fn new() -> Self {
        Self {
            line: LineSegmenter::new_auto(LineBreakOptions::default()),
            word: WordSegmenter::new_auto(WordBreakInvariantOptions::default()),
            grapheme: GraphemeClusterSegmenter::new(),
            sentence: SentenceSegmenter::new(SentenceBreakInvariantOptions::default()),
            segment_cache: Mutex::new(HashMap::new()),
            break_cache: Mutex::new(HashMap::new()),
        }
    }

    /// Builds an [`IcuSegmenter`] for a given locale.
    ///
    /// `locale_id` is expected to be a BCP-47 locale string (e.g. `"en"`,
    /// `"ja"`, `"th"`).  The argument is currently ignored: the compiled CLDR
    /// data already carries the dictionary/LSTM models for every supported
    /// script, so the result is the same segmenter that [`Self::new`] builds.
    /// The constructor exists for API symmetry with `IcuCollator::new`.
    ///
    /// # Errors
    ///
    /// Never fails at present; the value is always `Ok`.  The locale string
    /// is not parsed or validated.
    pub fn new_with_locale(_locale_id: &str) -> Result<Self, crate::CollateError> {
        let segmenter = Self::new();
        Ok(segmenter)
    }

    /// Returns the byte-offset break-points in `text` for the given [`SegmentKind`].
    ///
    /// The offsets are exactly what the selected ICU4X segmenter yields for
    /// `text`.  The returned `Vec` always includes the length of `text` as the
    /// final element (the boundary at end-of-string), and may include `0` as the
    /// first element depending on the segmenter.  Callers typically use adjacent
    /// pairs `[breaks[i]..breaks[i+1]]` to iterate over segments.
    ///
    /// [`SegmentKind::GraphemeCluster`] and [`SegmentKind::Grapheme`] run the
    /// same grapheme segmenter.
    ///
    /// Results are memoised: repeated calls with the same `(text, kind)` pair return
    /// a clone of the cached result without re-running the segmenter.  The cache
    /// lock is not held while the segmenter runs, so concurrent callers may
    /// compute the same entry twice; the results are identical.
    pub fn break_points(&self, text: &str, kind: SegmentKind) -> Vec<usize> {
        // Build the owned key once: it serves the lookup and, on a miss, is
        // moved into the map instead of allocating a second copy of `text`.
        let key = (text.to_owned(), kind);

        // Fast path: return a clone of the cached result if present.
        {
            // A poisoned lock only means another thread panicked while holding
            // it; the cached data is still usable, so recover the guard.
            let cache = self.break_cache.lock().unwrap_or_else(|p| p.into_inner());
            if let Some(cached) = cache.get(&key) {
                return cached.clone();
            }
        }

        // Slow path: run the segmenter without holding the lock.
        let result: Vec<usize> = match kind {
            SegmentKind::Line => self.line.segment_str(text).collect(),
            SegmentKind::Word => self.word.segment_str(text).collect(),
            SegmentKind::GraphemeCluster | SegmentKind::Grapheme => {
                self.grapheme.segment_str(text).collect()
            }
            SegmentKind::Sentence => self.sentence.segment_str(text).collect(),
        };

        self.break_cache
            .lock()
            .unwrap_or_else(|p| p.into_inner())
            .insert(key, result.clone());
        result
    }

    /// Splits `text` at the boundaries of the given [`SegmentKind`] and returns
    /// the pieces as borrowed slices of `text`.
    ///
    /// Use this when only the segment text is needed and no ownership is
    /// required.  [`Self::break_points`] gives the raw byte offsets instead, and
    /// [`Self::segments`] gives owned [`Segment`] values with position metadata.
    ///
    /// Zero-length pieces are never emitted.  If the last reported boundary
    /// falls short of `text.len()`, the remaining tail is returned as a final
    /// piece.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use oxitext_icu::{IcuSegmenter, SegmentKind};
    ///
    /// let seg = IcuSegmenter::new();
    /// let words = seg.segment_strs("Hello world", SegmentKind::Word);
    /// // "Hello", " ", "world" (word segmenter includes the space segment).
    /// assert!(words.contains(&"Hello"));
    /// assert!(words.contains(&"world"));
    /// ```
    pub fn segment_strs<'a>(&self, text: &'a str, kind: SegmentKind) -> Vec<&'a str> {
        let len = text.len();
        let mut pieces = Vec::new();
        let mut start = 0usize;
        // Appending `len` as a final candidate boundary covers the tail when
        // the segmenter did not already end on `text.len()`; otherwise the
        // zero-length guard below discards it.
        let boundaries = self
            .break_points(text, kind)
            .into_iter()
            .chain(std::iter::once(len));
        for end in boundaries {
            // Ignore zero-length, out-of-order, and out-of-range boundaries.
            if end <= start || end > len {
                continue;
            }
            if let Some(piece) = text.get(start..end) {
                pieces.push(piece);
            }
            start = end;
        }
        pieces
    }

    /// Returns rich [`Segment`] structs for the given [`SegmentKind`].
    ///
    /// Each segment carries an owned copy of the text, its UTF-8 byte offsets,
    /// and the [`SegmentKind`] that produced it.  This is the primary API for
    /// consumers that need byte-position metadata alongside the segment text.
    ///
    /// Zero-length segments are never emitted.  If the last boundary reported
    /// by [`Self::break_points`] falls short of `text.len()`, the remaining tail
    /// is returned as a final segment.
    ///
    /// Results are memoised: repeated calls with the same `(text, kind)` pair return
    /// a clone of the cached result without re-running the segmenter.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use oxitext_icu::{IcuSegmenter, SegmentKind};
    ///
    /// let seg = IcuSegmenter::new();
    /// let segs = seg.segments("hello world", SegmentKind::Word);
    /// let words: Vec<&str> = segs.iter()
    ///     .filter(|s| !s.text.trim().is_empty())
    ///     .map(|s| s.text.as_str())
    ///     .collect();
    /// assert!(words.contains(&"hello"));
    /// assert!(words.contains(&"world"));
    /// // Byte offsets are valid and match the text.
    /// for s in &segs {
    ///     assert_eq!(&"hello world"[s.byte_start..s.byte_end], s.text.as_str());
    /// }
    /// ```
    pub fn segments(&self, text: &str, kind: SegmentKind) -> Vec<Segment> {
        // Build the owned key once: it serves the lookup and, on a miss, is
        // moved into the map instead of allocating a second copy of `text`.
        let key = (text.to_owned(), kind);

        // Fast path: return a clone of the cached result if present.
        {
            let cache = self.segment_cache.lock().unwrap_or_else(|p| p.into_inner());
            if let Some(cached) = cache.get(&key) {
                return cached.clone();
            }
        }

        // Slow path: derive segments from the (memoised) break offsets.
        let breaks = self.break_points(text, kind);
        let len = text.len();
        let mut out = Vec::with_capacity(breaks.len());
        let mut prev = 0usize;
        // Chaining `len` as a last candidate boundary emits the tail when the
        // segmenter did not end on `text.len()`; otherwise the guard skips it.
        for end in breaks.into_iter().chain(std::iter::once(len)) {
            if end <= prev || end > len {
                continue;
            }
            // `get` returns `None` if an offset is not on a char boundary; such
            // a span is dropped but the cursor still advances past it.
            if let Some(slice) = text.get(prev..end) {
                out.push(Segment {
                    text: slice.to_owned(),
                    byte_start: prev,
                    byte_end: end,
                    kind,
                });
            }
            prev = end;
        }

        self.segment_cache
            .lock()
            .unwrap_or_else(|p| p.into_inner())
            .insert(key, out.clone());
        out
    }

    /// Builds an [`IcuSegmenter`] for a given locale.
    ///
    /// Shorter-named alias that forwards to [`Self::new_with_locale`].  The
    /// locale string is accepted for API symmetry and documentation; because
    /// the compiled CLDR data already covers all supported scripts, the result
    /// is equivalent to [`Self::new`] for every locale.
    ///
    /// # Errors
    ///
    /// Propagates whatever [`Self::new_with_locale`] returns, which currently
    /// never fails; the `Result` wrapper exists for forwards compatibility.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use oxitext_icu::{IcuSegmenter, SegmentKind};
    ///
    /// let seg = IcuSegmenter::with_locale("ja").expect("Japanese segmenter");
    /// let words = seg.segment_strs("東京都は日本の首都です", SegmentKind::Word);
    /// assert!(words.len() >= 2);
    /// ```
    pub fn with_locale(locale: &str) -> Result<Self, crate::CollateError> {
        let segmenter = Self::new_with_locale(locale)?;
        Ok(segmenter)
    }

    /// Lists the byte offsets of every word boundary in `text`.
    ///
    /// Shorthand for [`Self::break_points`] with [`SegmentKind::Word`], i.e. the
    /// CLDR UAX #29 word-break opportunities.  Handy for text selection,
    /// double-click selection, and word-wrap layout.
    ///
    /// The result comes from (and populates) the internal break-point cache.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use oxitext_icu::IcuSegmenter;
    ///
    /// let seg = IcuSegmenter::new();
    /// let bps = seg.word_boundaries("Hello world");
    /// assert!(!bps.is_empty());
    /// ```
    pub fn word_boundaries(&self, text: &str) -> Vec<usize> {
        let kind = SegmentKind::Word;
        self.break_points(text, kind)
    }

    /// Lists the byte offsets in `text` at which a line break is permitted.
    ///
    /// Shorthand for [`Self::break_points`] with [`SegmentKind::Line`], i.e. the
    /// CLDR UAX #14 line-break opportunities.  Intended as a replacement for
    /// the `unicode-linebreak` crate in the layout engine.
    ///
    /// The result comes from (and populates) the internal break-point cache.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use oxitext_icu::IcuSegmenter;
    ///
    /// let seg = IcuSegmenter::new();
    /// let ops = seg.line_break_opportunities("Hello world test");
    /// assert!(!ops.is_empty());
    /// ```
    pub fn line_break_opportunities(&self, text: &str) -> Vec<usize> {
        let kind = SegmentKind::Line;
        self.break_points(text, kind)
    }

    /// Returns word boundary offsets suitable for line-breaking in CJK/Thai text.
    ///
    /// For CJK and Thai, dictionary-based word segmentation determines where
    /// lines can break. This method returns the byte offsets reported by
    /// [`Self::break_points`] for [`SegmentKind::Word`], whose segmenter uses
    /// LSTM/dictionary models for languages that require them.
    ///
    /// For CJK text, every word boundary is a valid line break position. The
    /// result has consecutive duplicate offsets collapsed, so no zero-length
    /// span appears between adjacent entries.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use oxitext_icu::IcuSegmenter;
    ///
    /// let seg = IcuSegmenter::new();
    /// let breaks = seg.cjk_line_break_opportunities("東京都は日本の首都");
    /// assert!(!breaks.is_empty());
    /// ```
    pub fn cjk_line_break_opportunities(&self, text: &str) -> Vec<usize> {
        let mut offsets = self.break_points(text, SegmentKind::Word);
        // `Vec::dedup` removes consecutive repeats in place, replacing the
        // hand-rolled loop and its `usize::MAX` sentinel.
        offsets.dedup();
        offsets
    }

    /// Determine if `text` contains characters that require dictionary-based
    /// segmentation (Thai, Khmer, Lao, Myanmar, Japanese, Chinese without
    /// spaces, Korean Hangul).
    ///
    /// Returns `true` if any character in the text belongs to one of these
    /// script ranges and therefore benefits from LSTM/dictionary segmentation
    /// rather than rule-based word breaking.  Returns `false` for an empty
    /// string.
    ///
    /// Besides the Basic Multilingual Plane blocks, the check covers CJK
    /// Compatibility Ideographs, halfwidth Katakana, the Myanmar Extended-A/B
    /// blocks, and the supplementary-plane CJK ideograph extensions, so text
    /// made up only of rarer characters such as U+20BB7 "𠮷" is detected too.
    ///
    /// This is a pure code-point range test; it does not run a segmenter.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use oxitext_icu::IcuSegmenter;
    ///
    /// assert!(IcuSegmenter::needs_dictionary_segmentation("สวัสดี"));
    /// assert!(IcuSegmenter::needs_dictionary_segmentation("東京"));
    /// assert!(IcuSegmenter::needs_dictionary_segmentation("\u{20BB7}"));
    /// assert!(!IcuSegmenter::needs_dictionary_segmentation("Hello world"));
    /// ```
    pub fn needs_dictionary_segmentation(text: &str) -> bool {
        text.chars().any(|c| {
            matches!(
                u32::from(c),
                0x0E00..=0x0E7F       // Thai
                | 0x0E80..=0x0EFF     // Lao
                | 0x1000..=0x109F     // Myanmar
                | 0x1780..=0x17FF     // Khmer
                | 0x3000..=0x9FFF     // CJK + Hiragana + Katakana
                | 0xA9E0..=0xA9FF     // Myanmar Extended-B
                | 0xAA60..=0xAA7F     // Myanmar Extended-A
                | 0xAC00..=0xD7AF     // Korean Hangul
                | 0xF900..=0xFAFF     // CJK Compatibility Ideographs
                | 0xFF66..=0xFF9F     // Halfwidth Katakana
                | 0x20000..=0x2FA1F   // CJK Extensions B–F, I + Compatibility Supplement
                | 0x30000..=0x323AF   // CJK Extensions G–H
            )
        })
    }
}

/// A lazy iterator over [`Segment`] values produced from a fixed input string.
///
/// Produced by [`IcuSegmenter::iter_segments`]. Avoids collecting all segments
/// into a `Vec` when only a subset of the output is needed.
///
/// The break offsets themselves are held as an already-computed list; what is
/// deferred is building each owned [`Segment`] (including its `String` copy),
/// which happens one item at a time as the iterator is advanced.
///
/// # Examples
///
/// ```rust
/// use oxitext_icu::{IcuSegmenter, SegmentKind};
///
/// let seg = IcuSegmenter::new();
/// let words: Vec<_> = seg
///     .iter_segments("hello world", SegmentKind::Word)
///     .filter(|s| !s.text.trim().is_empty())
///     .collect();
/// assert!(words.iter().any(|s| s.text == "hello"));
/// ```
pub struct SegmentIter {
    /// Owned copy of the text being segmented; segment offsets index into it.
    text: String,
    /// The kind stamped onto every yielded [`Segment`].
    kind: SegmentKind,
    /// Remaining break offsets (UTF-8 byte positions) still to be consumed.
    breaks: std::vec::IntoIter<usize>,
    /// Byte offset where the next segment starts (end of the previous one).
    prev: usize,
}

impl Iterator for SegmentIter {
    type Item = Segment;

    /// Yields the next non-empty segment, or `None` once the text is exhausted.
    ///
    /// Mirrors the eager `IcuSegmenter::segments` loop:
    ///
    /// * zero-length, out-of-order, and out-of-range boundaries are skipped;
    /// * a span that cannot be sliced (offset not on a UTF-8 char boundary) is
    ///   skipped while the cursor still advances, instead of ending iteration;
    /// * if the break list ends before `text.len()`, the remaining tail is
    ///   yielded as a final segment.
    fn next(&mut self) -> Option<Self::Item> {
        let len = self.text.len();
        loop {
            let end = match self.breaks.next() {
                Some(end) => end,
                // Break list exhausted but text remains: emit the tail once.
                None if self.prev < len => len,
                None => return None,
            };
            // Skip degenerate boundaries (zero-length, out-of-order, out-of-range).
            if end <= self.prev || end > len {
                continue;
            }
            // Advance the cursor unconditionally so a bad span cannot stall us.
            let start = std::mem::replace(&mut self.prev, end);
            // Only slice at valid UTF-8 boundaries.
            if let Some(slice) = self.text.get(start..end) {
                return Some(Segment {
                    text: slice.to_owned(),
                    byte_start: start,
                    byte_end: end,
                    kind: self.kind,
                });
            }
        }
    }
}

impl IcuSegmenter {
    /// Creates a [`SegmentIter`] that walks the segments of `text` for the
    /// given [`SegmentKind`].
    ///
    /// In contrast to [`Self::segments`], which builds every [`Segment`] up
    /// front and returns them in a `Vec`, the iterator constructs each segment
    /// only when it is requested.  Prefer it when just part of the output is
    /// needed (e.g. stopping at the first word boundary).
    ///
    /// The break offsets are obtained through [`Self::break_points`] when the
    /// iterator is created, and the iterator keeps its own copy of `text`.
    ///
    /// # Examples
    ///
    /// ```rust
    /// use oxitext_icu::{IcuSegmenter, SegmentKind};
    ///
    /// let seg = IcuSegmenter::new();
    /// let mut iter = seg.iter_segments("hello world", SegmentKind::Word);
    /// // Consume only as many segments as needed.
    /// let first = iter.next();
    /// assert!(first.is_some());
    /// ```
    pub fn iter_segments(&self, text: &str, kind: SegmentKind) -> SegmentIter {
        let offsets = self.break_points(text, kind).into_iter();
        SegmentIter {
            prev: 0,
            breaks: offsets,
            kind,
            text: String::from(text),
        }
    }
}

impl Default for IcuSegmenter {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    //! Unit tests for `IcuSegmenter`: string and rich segment output, the lazy
    //! `SegmentIter`, locale construction, repeated (cached) calls, the
    //! boundary convenience wrappers, and two ignored timing benchmarks.

    use super::*;

    // ── segment_strs (former `segments`) ──────────────────────────────────────

    #[test]
    fn grapheme_cluster_combining_char() {
        // "e\u{0301}" is e + combining acute accent = 1 grapheme cluster.
        let seg = IcuSegmenter::new();
        let clusters = seg.segment_strs("e\u{0301}", SegmentKind::GraphemeCluster);
        assert_eq!(
            clusters.len(),
            1,
            "e + combining accent should be 1 grapheme cluster, got: {clusters:?}"
        );
    }

    #[test]
    fn word_segmentation_basic() {
        let seg = IcuSegmenter::new();
        // "hello world" should produce at least 2 word segments.
        let words = seg.segment_strs("hello world", SegmentKind::Word);
        let non_space: Vec<&&str> = words.iter().filter(|s| !s.trim().is_empty()).collect();
        assert!(
            non_space.len() >= 2,
            "expected at least 2 words, got: {words:?}"
        );
    }

    #[test]
    fn sentence_segmentation_basic() {
        let seg = IcuSegmenter::new();
        let sentences = seg.segment_strs("Hello world. Goodbye world.", SegmentKind::Sentence);
        assert!(
            sentences.len() >= 2,
            "expected at least 2 sentences, got: {sentences:?}"
        );
    }

    #[test]
    fn line_break_cjk() {
        // "東京" is 2 CJK characters. How many line segments come back depends
        // on the segmenter configuration, so assert only what must always
        // hold: the output is non-empty and covers the whole input.
        let seg = IcuSegmenter::new();
        let text = "東京";
        let pieces = seg.segment_strs(text, SegmentKind::Line);
        assert!(
            !pieces.is_empty(),
            "CJK text should have at least one line segment"
        );
        assert_eq!(
            pieces.concat(),
            text,
            "line segments must cover the whole CJK string: {pieces:?}"
        );
    }

    #[test]
    fn grapheme_cluster_emoji_zwj() {
        // A family emoji sequence joined by ZWJ should be 1 grapheme cluster:
        // man + ZWJ + woman + ZWJ + girl.
        let seg = IcuSegmenter::new();
        let text = "👨\u{200D}👩\u{200D}👧";
        let clusters = seg.segment_strs(text, SegmentKind::GraphemeCluster);
        // ICU4X should recognize this as a single extended grapheme cluster.
        assert_eq!(
            clusters.len(),
            1,
            "ZWJ family emoji should be 1 grapheme cluster: {clusters:?}"
        );
    }

    #[test]
    fn break_points_includes_end() {
        // break_points should always include text.len() as the last element.
        let seg = IcuSegmenter::new();
        let text = "hello";
        let breaks = seg.break_points(text, SegmentKind::GraphemeCluster);
        assert!(
            breaks.contains(&text.len()),
            "break_points must include end offset: {breaks:?}"
        );
    }

    #[test]
    fn segment_strs_cover_full_text() {
        // The concatenation of all segment_strs must equal the input string.
        let seg = IcuSegmenter::new();
        let text = "Hello, 世界! Goodbye.";
        for kind in [
            SegmentKind::GraphemeCluster,
            SegmentKind::Line,
            SegmentKind::Word,
        ] {
            let segs = seg.segment_strs(text, kind);
            let rejoined: String = segs.concat();
            assert_eq!(
                rejoined, text,
                "segment_strs for {kind:?} do not cover full text"
            );
        }
    }

    #[test]
    fn word_segmentation_japanese() {
        // Japanese text: the word segmenter (auto/LSTM) should split this
        // into at least 2 segments.
        let seg = IcuSegmenter::new();
        let text = "東京都は日本の首都です";
        let words = seg.segment_strs(text, SegmentKind::Word);
        assert!(
            words.len() >= 2,
            "Japanese word segmentation should produce ≥2 segments, got: {words:?}"
        );
    }

    // ── segments() → Vec<Segment> ─────────────────────────────────────────────

    #[test]
    fn word_segments_english() {
        // One binding for the input so the call and the offset checks below
        // cannot drift apart.
        let input = "hello world";
        let seg = IcuSegmenter::new();
        let segs = seg.segments(input, SegmentKind::Word);
        // Should produce segments covering "hello" and "world".
        let words: Vec<&str> = segs
            .iter()
            .filter(|s| !s.text.trim().is_empty())
            .map(|s| s.text.as_str())
            .collect();
        assert!(words.contains(&"hello"), "should segment 'hello'");
        assert!(words.contains(&"world"), "should segment 'world'");

        // Verify byte offsets are valid, non-overlapping, and match text.
        let mut prev_end = 0;
        for s in &segs {
            assert!(
                s.byte_start >= prev_end,
                "overlapping segments: byte_start {} < prev_end {}",
                s.byte_start,
                prev_end
            );
            assert!(
                s.byte_end <= input.len(),
                "byte_end {} exceeds text length {}",
                s.byte_end,
                input.len()
            );
            assert_eq!(
                &input[s.byte_start..s.byte_end],
                s.text.as_str(),
                "byte range does not match text"
            );
            prev_end = s.byte_end;
        }
    }

    #[test]
    fn thai_word_segmentation() {
        // Thai text: "สวัสดีชาวโลก" (sawatdee chao lok — Hello world)
        let seg = IcuSegmenter::new();
        let text = "สวัสดีชาวโลก";
        let segs = seg.segments(text, SegmentKind::Word);
        assert!(
            !segs.is_empty(),
            "Thai text should segment into at least one segment"
        );
        // All byte spans must cover the full string.
        let total: usize = segs.iter().map(|s| s.byte_end - s.byte_start).sum();
        assert_eq!(total, text.len(), "segments must cover entire Thai string");
    }

    #[test]
    fn japanese_word_segmentation_rich() {
        let seg = IcuSegmenter::new();
        let text = "日本語テスト";
        let segs = seg.segments(text, SegmentKind::Word);
        assert!(!segs.is_empty());
        let total: usize = segs.iter().map(|s| s.byte_end - s.byte_start).sum();
        assert_eq!(
            total,
            text.len(),
            "segments must cover entire Japanese string"
        );
    }

    #[test]
    fn sentence_segmentation_with_abbreviation() {
        let seg = IcuSegmenter::new();
        let text = "Dr. Smith went home. He was tired.";
        let segs = seg.segments(text, SegmentKind::Sentence);
        assert!(!segs.is_empty(), "should produce at least one sentence");
        let total: usize = segs.iter().map(|s| s.byte_end - s.byte_start).sum();
        assert_eq!(total, text.len(), "sentences must cover entire string");
        // ICU should produce 1–3 sentences (Dr. abbreviation handling varies by data version).
        // The lower bound is already asserted above; only the upper bound is new here.
        assert!(
            segs.len() <= 3,
            "expected 1–3 sentences, got {}",
            segs.len()
        );
    }

    #[test]
    fn segments_byte_offsets_are_valid() {
        // Rich Segment::byte_start/byte_end must index valid UTF-8 boundaries.
        let seg = IcuSegmenter::new();
        let text = "Hello, 世界! Goodbye.";
        for kind in [
            SegmentKind::GraphemeCluster,
            SegmentKind::Line,
            SegmentKind::Word,
        ] {
            let segs = seg.segments(text, kind);
            let rejoined: String = segs.iter().map(|s| s.text.as_str()).collect();
            assert_eq!(
                rejoined, text,
                "segments for {kind:?} do not cover full text"
            );
            for s in &segs {
                assert_eq!(
                    &text[s.byte_start..s.byte_end],
                    s.text.as_str(),
                    "byte range mismatch in {kind:?} segment"
                );
            }
        }
    }

    #[test]
    fn grapheme_kind_alias_matches_grapheme_cluster() {
        // SegmentKind::Grapheme must produce the same text / byte offsets as
        // SegmentKind::GraphemeCluster (only the stored `kind` field differs).
        let seg = IcuSegmenter::new();
        let text = "e\u{0301}á";
        let gc = seg.segments(text, SegmentKind::GraphemeCluster);
        let g = seg.segments(text, SegmentKind::Grapheme);
        assert_eq!(
            gc.len(),
            g.len(),
            "Grapheme and GraphemeCluster must produce the same number of segments"
        );
        for (a, b) in gc.iter().zip(g.iter()) {
            assert_eq!(
                a.text, b.text,
                "Grapheme and GraphemeCluster segment text must match"
            );
            assert_eq!(
                a.byte_start, b.byte_start,
                "Grapheme and GraphemeCluster byte_start must match"
            );
            assert_eq!(
                a.byte_end, b.byte_end,
                "Grapheme and GraphemeCluster byte_end must match"
            );
        }
    }

    // ── SegmentIter (lazy iteration) ──────────────────────────────────────────

    #[test]
    fn iter_segments_matches_segments_eager() {
        // iter_segments must produce the same output as the eager segments() call.
        let seg = IcuSegmenter::new();
        let text = "hello world";
        let eager = seg.segments(text, SegmentKind::Word);
        let lazy: Vec<_> = seg.iter_segments(text, SegmentKind::Word).collect();
        assert_eq!(
            eager.len(),
            lazy.len(),
            "iter_segments and segments must produce the same number of segments"
        );
        for (a, b) in eager.iter().zip(lazy.iter()) {
            assert_eq!(a.text, b.text, "text mismatch");
            assert_eq!(a.byte_start, b.byte_start, "byte_start mismatch");
            assert_eq!(a.byte_end, b.byte_end, "byte_end mismatch");
        }
    }

    #[test]
    fn iter_segments_lazy_stops_early() {
        // Consume only a prefix of the iterator and check that what was
        // consumed is correct; the remaining segments are never built.
        let seg = IcuSegmenter::new();
        let text = "one two three four five";
        let mut iter = seg.iter_segments(text, SegmentKind::Word);
        let first = iter.next().expect("should yield at least one segment");
        assert_eq!(first.text, "one", "first word segment should be 'one'");
        assert_eq!(
            (first.byte_start, first.byte_end),
            (0, 3),
            "first segment should span bytes 0..3"
        );
        // The next segment must pick up exactly where the first one ended.
        let second = iter.next().expect("should yield a second segment");
        assert_eq!(
            second.byte_start, first.byte_end,
            "second segment must start where the first ended"
        );
    }

    #[test]
    fn iter_segments_grapheme_clusters() {
        let seg = IcuSegmenter::new();
        let text = "e\u{0301}á";
        let lazy: Vec<_> = seg
            .iter_segments(text, SegmentKind::GraphemeCluster)
            .collect();
        // Should be 2 grapheme clusters.
        assert_eq!(lazy.len(), 2, "expected 2 grapheme clusters: {lazy:?}");
        // Total byte coverage must equal string length.
        let total: usize = lazy.iter().map(|s| s.byte_end - s.byte_start).sum();
        assert_eq!(total, text.len(), "iter_segments must cover full string");
    }

    #[test]
    fn iter_segments_empty_text() {
        let seg = IcuSegmenter::new();
        let mut iter = seg.iter_segments("", SegmentKind::Word);
        assert!(iter.next().is_none(), "empty text should yield no segments");
    }

    // ── with_locale ──────────────────────────────────────────────────────────

    #[test]
    fn with_locale_succeeds_for_common_locales() {
        for locale in &["en", "ja", "th", "zh", "ar", "ko"] {
            IcuSegmenter::with_locale(locale)
                .unwrap_or_else(|e| panic!("with_locale({locale}) failed: {e}"));
        }
    }

    #[test]
    fn with_locale_japanese_word_segmentation() {
        let seg = IcuSegmenter::with_locale("ja").expect("Japanese segmenter");
        let text = "東京都は日本の首都です";
        let words = seg.segment_strs(text, SegmentKind::Word);
        assert!(
            words.len() >= 2,
            "Japanese should produce ≥2 segments: {words:?}"
        );
    }

    // ── Cache ────────────────────────────────────────────────────────────────

    #[test]
    fn test_segmenter_cache_hit() {
        // Two identical calls must agree, whether or not the second is served
        // from a cache.
        let seg = IcuSegmenter::new();
        let text = "Hello world test";
        let r1 = seg.break_points(text, SegmentKind::Word);
        let r2 = seg.break_points(text, SegmentKind::Word);
        assert_eq!(r1, r2, "cached result should match original");
    }

    #[test]
    fn cache_hit_immutable_ref() {
        // Verify the cache works via &self (not &mut self): every call goes
        // through an explicitly shared reference.
        let seg = IcuSegmenter::new();
        let shared: &IcuSegmenter = &seg;
        let text = "Hello world test";
        let r1 = shared.break_points(text, SegmentKind::Word);
        let r2 = shared.break_points(text, SegmentKind::Word);
        assert_eq!(r1, r2, "immutable-ref cache hit should match");
    }

    #[test]
    fn segment_cache_hit() {
        let seg = IcuSegmenter::new();
        let text = "Hello world test";
        let r1 = seg.segments(text, SegmentKind::Word);
        let r2 = seg.segments(text, SegmentKind::Word);
        assert_eq!(r1, r2, "segment cached result should match original");
    }

    // ── word_boundaries / line_break_opportunities ───────────────────────────

    #[test]
    fn test_word_boundaries() {
        let seg = IcuSegmenter::new();
        let bps = seg.word_boundaries("Hello world");
        // Should have at least one boundary.
        assert!(!bps.is_empty());
    }

    #[test]
    fn test_line_break_opportunities() {
        let seg = IcuSegmenter::new();
        let ops = seg.line_break_opportunities("Hello world test");
        // Should have break opportunities at word boundaries.
        assert!(!ops.is_empty());
    }

    #[test]
    fn word_boundaries_matches_break_points() {
        let seg = IcuSegmenter::new();
        let text = "one two three";
        let wb = seg.word_boundaries(text);
        let bp = seg.break_points(text, SegmentKind::Word);
        assert_eq!(
            wb, bp,
            "word_boundaries should delegate to break_points(Word)"
        );
    }

    #[test]
    fn line_break_opportunities_matches_break_points() {
        let seg = IcuSegmenter::new();
        let text = "one two three";
        let lbo = seg.line_break_opportunities(text);
        let bp = seg.break_points(text, SegmentKind::Line);
        assert_eq!(
            lbo, bp,
            "line_break_opportunities should delegate to break_points(Line)"
        );
    }

    // ── Thai word segmentation ───────────────────────────────────────────────

    #[test]
    fn test_thai_word_segmentation() {
        // Thai text: "สวัสดีชาวโลก" (hello world in Thai)
        // This tests that dictionary-based word breaking works.
        let seg = IcuSegmenter::new();
        let text = "สวัสดีชาวโลก";
        let words = seg.segments(text, SegmentKind::Word);
        let non_space: Vec<_> = words.iter().filter(|w| !w.text.trim().is_empty()).collect();
        assert!(
            non_space.len() >= 2,
            "Thai should produce at least 2 word segments, got: {words:?}"
        );
    }

    // ── cjk_line_break_opportunities / needs_dictionary_segmentation ─────────

    #[test]
    fn test_cjk_line_break_opportunities_japanese() {
        let seg = IcuSegmenter::new();
        let breaks = seg.cjk_line_break_opportunities("東京都は日本の首都");
        assert!(!breaks.is_empty(), "CJK text must have break opportunities");
    }

    #[test]
    fn test_cjk_line_break_opportunities_deduplication() {
        let seg = IcuSegmenter::new();
        let breaks = seg.cjk_line_break_opportunities("hello world");
        // Verify no adjacent duplicates.
        let has_dup = breaks.windows(2).any(|w| w[0] == w[1]);
        assert!(
            !has_dup,
            "cjk_line_break_opportunities must not contain adjacent duplicates"
        );
    }

    #[test]
    fn test_needs_dictionary_segmentation_thai() {
        assert!(IcuSegmenter::needs_dictionary_segmentation("สวัสดี"));
    }

    #[test]
    fn test_needs_dictionary_segmentation_cjk() {
        assert!(IcuSegmenter::needs_dictionary_segmentation("東京"));
    }

    #[test]
    fn test_needs_dictionary_segmentation_latin_false() {
        assert!(!IcuSegmenter::needs_dictionary_segmentation("Hello world"));
    }

    // ── Performance benchmarks (ignored by default) ──────────────────────────

    #[test]
    #[ignore = "benchmark — run with: cargo test -p oxitext-icu -- --ignored bench_segmentation"]
    fn bench_segmentation_100k_chars() {
        let text = "The quick brown fox jumps over the lazy dog. ".repeat(2300); // ~100K chars
        let seg = IcuSegmenter::new();
        // NOTE(review): if break_points caches per (text, kind), iterations
        // 2..10 may be cache hits and the per-call figure will understate the
        // cost of a cold call — confirm against the cache implementation.
        let start = std::time::Instant::now();
        for _ in 0..10 {
            let _ = seg.break_points(&text, SegmentKind::Word);
        }
        let elapsed = start.elapsed();
        println!("10× 100K-char word segmentation: {elapsed:?}");
        println!("Per-call: {:?}", elapsed / 10);
    }

    #[test]
    #[ignore = "benchmark — run with: cargo test -p oxitext-icu -- --ignored bench_collation"]
    fn bench_collation_sort_10k_strings() {
        use crate::IcuCollator;
        let mut strings: Vec<String> = (0..10_000).map(|i| format!("item_{i:05}")).collect();
        let collator = IcuCollator::new("en").expect("collator");
        // Sort in place so the timed region contains only the sort itself,
        // not a 10K-string clone.
        let start = std::time::Instant::now();
        strings.sort_by(|a, b| collator.compare(a.as_str(), b.as_str()));
        let elapsed = start.elapsed();
        println!("Sort 10K strings: {elapsed:?}");
        assert!(strings[0] <= strings[strings.len() - 1]);
    }
}