1use icu_segmenter::{WordSegmenter, WordSegmenterBorrowed, options::WordBreakInvariantOptions};
5
6use crate::{TextAnalyzer, TextAnalyzerBuilder, Token, TokenStream, Tokenizer};
7
8#[derive(Clone)]
9pub struct IcuTokenizer {
10 segmenter: WordSegmenterBorrowed<'static>,
11}
12
13impl Default for IcuTokenizer {
14 fn default() -> Self {
15 Self {
16 segmenter: WordSegmenter::new_dictionary(WordBreakInvariantOptions::default()),
17 }
18 }
19}
20
21impl IcuTokenizer {
22 pub fn analyzer(self) -> TextAnalyzer {
23 TextAnalyzer::builder(self).build()
24 }
25
26 pub fn analyzer_builder(self) -> TextAnalyzerBuilder {
27 TextAnalyzer::builder(self).dynamic()
28 }
29}
30
31pub struct IcuTokenStream {
32 tokens: Vec<Token>,
33 index: usize,
34}
35
36impl TokenStream for IcuTokenStream {
37 fn advance(&mut self) -> bool {
38 if self.index < self.tokens.len() {
39 self.index += 1;
40 true
41 } else {
42 false
43 }
44 }
45
46 fn token(&self) -> &Token {
47 &self.tokens[self.index - 1]
48 }
49
50 fn token_mut(&mut self) -> &mut Token {
51 &mut self.tokens[self.index - 1]
52 }
53}
54
55impl Tokenizer for IcuTokenizer {
56 type TokenStream<'a> = IcuTokenStream;
57
58 fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
59 let mut boundaries = self.segmenter.segment_str(text);
60 let mut tokens = Vec::new();
61 let Some(mut offset_from) = boundaries.next() else {
62 return IcuTokenStream { tokens, index: 0 };
63 };
64
65 for offset_to in boundaries {
66 let token_text = &text[offset_from..offset_to];
67 if token_text.chars().any(char::is_alphanumeric) {
68 tokens.push(Token {
69 offset_from,
70 offset_to,
71 position: tokens.len(),
72 text: token_text.to_owned(),
73 position_length: 1,
74 });
75 }
76 offset_from = offset_to;
77 }
78
79 IcuTokenStream { tokens, index: 0 }
80 }
81}
82
#[cfg(test)]
mod tests {
    use crate::{IcuTokenizer, Token, TokenStream, Tokenizer};

    /// Tokenizes `text` with the default tokenizer and returns clones of every emitted token.
    fn collect_tokens(text: &str) -> Vec<Token> {
        let mut collected = Vec::new();
        let mut tokenizer = IcuTokenizer::default();
        let mut token_stream = tokenizer.token_stream(text);
        token_stream.process(&mut |token| collected.push(token.clone()));
        collected
    }

    /// Returns the text of each token, in order.
    fn texts(tokens: &[Token]) -> Vec<&str> {
        tokens.iter().map(|token| token.text.as_str()).collect()
    }

    #[test]
    fn test_icu_tokenizer_segments_mixed_text() {
        let tokens = collect_tokens("Hello, こんにちは世界!");

        // Punctuation and whitespace segments are dropped; only the words remain.
        assert_eq!(texts(&tokens), vec!["Hello", "こんにちは", "世界"]);

        // Offsets are byte offsets, so each CJK character accounts for three bytes.
        let spans: Vec<(usize, usize, usize)> = tokens
            .iter()
            .map(|token| (token.offset_from, token.offset_to, token.position))
            .collect();
        assert_eq!(spans, vec![(0, 5, 0), (7, 22, 1), (22, 28, 2)]);
    }

    #[test]
    fn test_icu_tokenizer_skips_non_word_segments() {
        let tokens = collect_tokens("Mark'd ye his words?");

        // The apostrophe stays inside its word; spaces and the question mark are skipped.
        assert_eq!(texts(&tokens), vec!["Mark'd", "ye", "his", "words"]);
    }
}