/// Marker trait for per-token metadata attached to a [`Token`].
///
/// Requires [`dyn_clone::DynClone`] so that boxed trait objects
/// (`Box<dyn Metadata>`) remain clonable, which in turn lets `Token`
/// derive `Clone`.
pub trait Metadata
where
    Self: dyn_clone::DynClone,
{
}

// Generates `impl Clone for Box<dyn Metadata>` on top of the `DynClone`
// bound declared above.
dyn_clone::clone_trait_object!(Metadata);

// Unit metadata: the "no metadata" marker used by the built-in tokenizers.
impl Metadata for () {}
13
/// A byte range within a source string.
#[derive(Clone, Copy, Debug)]
pub struct Span {
    /// Byte offset where the span begins.
    pub start: usize,

    /// Length of the span in bytes.
    pub length: usize,
}

impl Span {
    /// Returns the slice of `s` covered by this span.
    ///
    /// # Panics
    /// Panics if the range is out of bounds for `s` or does not lie on
    /// UTF-8 character boundaries.
    pub fn index<'a>(&self, s: &'a str) -> &'a str {
        let range = self.start..self.end();
        &s[range]
    }

    /// Exclusive end offset of the span (`start + length`).
    pub const fn end(&self) -> usize {
        self.start + self.length
    }
}
35
/// A single token produced by a [`Tokenizer`].
#[derive(Clone)]
pub struct Token<'a> {
    /// Byte range in the original source string this token came from.
    /// Filters may transform `text` without updating this span.
    pub src_span: Span,

    /// The token text: borrowed from the source where possible, owned
    /// once a filter has transformed it.
    pub text: std::borrow::Cow<'a, str>,

    /// Tokenizer-specific metadata; clonable as a trait object via
    /// `dyn_clone`. `Box::new(())` means "no metadata".
    pub metadata: Box<dyn Metadata + 'a>,
}
48
49impl<'a> Token<'a> {
50 pub fn map_text(self, f: impl for<'b> Fn(&'b str) -> std::borrow::Cow<'b, str>) -> Self {
52 Self {
53 text: match &self.text {
54 std::borrow::Cow::Borrowed(text) => f(text),
55 std::borrow::Cow::Owned(text) => f(text).into_owned().into(),
56 },
57 ..self
58 }
59 }
60}
61
/// A strategy for splitting a source string into [`Token`]s.
pub trait Tokenizer {
    /// Tokenizes `s`, yielding tokens that borrow from `s`.
    fn tokenize<'a>(&'a self, s: &'a str) -> impl Iterator<Item = Token<'a>>;

    /// Wraps this tokenizer so that every produced token stream is
    /// passed through `filter` (e.g. case folding or stemming).
    fn filter<F>(self, filter: F) -> impl Tokenizer
    where
        Self: Sized,
        F: Filter,
    {
        FilterTokenizer {
            tokenizer: self,
            filter,
        }
    }

    /// Type-erases this tokenizer behind a [`DynTokenizer`] trait object.
    fn boxed(self) -> Box<dyn DynTokenizer>
    where
        Self: Sized + 'static,
    {
        Box::new(self)
    }
}
87
/// Object-safe counterpart of [`Tokenizer`]: returns a boxed iterator so
/// the trait can be used as a trait object (see [`Tokenizer::boxed`]).
pub trait DynTokenizer {
    fn tokenize<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Token<'a>> + 'a>;
}
92
// Every `Tokenizer` is automatically a `DynTokenizer` by boxing its
// iterator.
impl<T> DynTokenizer for T
where
    T: Tokenizer,
{
    fn tokenize<'a>(&'a self, s: &'a str) -> Box<dyn Iterator<Item = Token<'a>> + 'a> {
        // Fully-qualified call so this resolves to `Tokenizer::tokenize`
        // rather than recursing into this method.
        Box::new(Tokenizer::tokenize(self, s))
    }
}
101
/// A tokenizer that post-processes another tokenizer's output through a
/// [`Filter`]; created by [`Tokenizer::filter`].
struct FilterTokenizer<T, F> {
    tokenizer: T,
    filter: F,
}

impl<T, F> Tokenizer for FilterTokenizer<T, F>
where
    T: Tokenizer,
    F: Filter,
{
    fn tokenize<'a>(&'a self, s: &'a str) -> impl Iterator<Item = Token<'a>> {
        self.filter.apply(self.tokenizer.tokenize(s))
    }
}
116
117pub struct UnicodeWordsTokenizer;
119
120impl Tokenizer for UnicodeWordsTokenizer {
121 fn tokenize<'a>(&'a self, s: &'a str) -> impl Iterator<Item = Token<'a>> {
122 use unicode_segmentation::UnicodeSegmentation as _;
123
124 s.unicode_word_indices().map(|(start, word)| Token {
125 src_span: Span {
126 start,
127 length: word.len(),
128 },
129 text: std::borrow::Cow::Borrowed(word),
130 metadata: Box::new(()),
131 })
132 }
133}
134
135pub struct WhitespaceTokenizer;
137
138impl Tokenizer for WhitespaceTokenizer {
139 fn tokenize<'a>(&'a self, s: &'a str) -> impl Iterator<Item = Token<'a>> {
140 s.split_whitespace().map(move |word| Token {
141 src_span: Span {
142 start: word.as_ptr() as usize - s.as_ptr() as usize,
143 length: word.len(),
144 },
145 text: std::borrow::Cow::Borrowed(word),
146 metadata: Box::new(()),
147 })
148 }
149}
150
#[cfg(feature = "chinese")]
mod chinese {
    use super::*;

    /// Tokenizer for Chinese text backed by [`jieba_rs`].
    pub struct ChineseTokenizer {
        jieba: jieba_rs::Jieba,
    }

    impl ChineseTokenizer {
        /// Creates a tokenizer with jieba's default dictionary.
        pub fn new() -> Self {
            Self::from_jieba(jieba_rs::Jieba::new())
        }

        /// Creates a tokenizer from a pre-configured jieba instance
        /// (e.g. one loaded with user dictionary entries).
        pub fn from_jieba(jieba: jieba_rs::Jieba) -> Self {
            Self { jieba }
        }
    }

    // `new()` takes no arguments, so also provide `Default`
    // (clippy::new_without_default).
    impl Default for ChineseTokenizer {
        fn default() -> Self {
            Self::new()
        }
    }

    impl Tokenizer for ChineseTokenizer {
        fn tokenize<'a>(&'a self, s: &'a str) -> impl Iterator<Item = Token<'a>> {
            // `hmm = false`: dictionary-based segmentation only.
            //
            // NOTE(review): verify whether jieba-rs `Token::start`/`end`
            // are byte offsets or char offsets — the other tokenizers in
            // this file produce byte-based spans (`word.len()`), and
            // `token.end - token.start` must match that convention for
            // `Span::index` to be correct on non-ASCII input.
            self.jieba
                .tokenize(s, jieba_rs::TokenizeMode::Default, false)
                .into_iter()
                .map(|token| Token {
                    src_span: Span {
                        start: token.start,
                        length: token.end - token.start,
                    },
                    text: std::borrow::Cow::Borrowed(token.word),
                    metadata: Box::new(()),
                })
        }
    }
}
190
191#[cfg(feature = "chinese")]
192pub use chinese::*;
193
#[cfg(feature = "japanese-korean")]
mod jako {
    use super::*;

    /// Tokenizer for Japanese/Korean text backed by [`lindera`].
    pub struct JaKoTokenizer {
        segmenter: lindera::segmenter::Segmenter,
    }

    impl JaKoTokenizer {
        /// Builds a tokenizer in `Normal` mode from a lindera dictionary,
        /// plus an optional user dictionary.
        pub fn from_dictionary(
            dictionary: lindera::dictionary::Dictionary,
            user_dictionary: Option<lindera::dictionary::UserDictionary>,
        ) -> Self {
            Self::from_segmenter(lindera::segmenter::Segmenter::new(
                lindera::mode::Mode::Normal,
                dictionary,
                user_dictionary,
            ))
        }

        /// Builds a tokenizer from a pre-configured segmenter.
        pub fn from_segmenter(segmenter: lindera::segmenter::Segmenter) -> Self {
            Self { segmenter }
        }
    }

    /// Metadata carrying lindera's per-token dictionary details (e.g.
    /// part-of-speech fields), when available.
    #[derive(Clone)]
    pub struct JaKoTokenMetadata<'a> {
        pub details: Option<Vec<std::borrow::Cow<'a, str>>>,
    }

    impl<'a> Metadata for JaKoTokenMetadata<'a> {}

    impl Tokenizer for JaKoTokenizer {
        fn tokenize<'a>(&'a self, s: &'a str) -> impl Iterator<Item = Token<'a>> {
            // The return type is `impl Iterator`, so the chain is returned
            // directly; the previous `Box::new(..)` wrapper was a needless
            // heap allocation and indirection.
            self.segmenter
                .segment(std::borrow::Cow::Borrowed(s))
                .into_iter()
                .flat_map(|tokens| {
                    tokens.into_iter().map(|token| Token {
                        src_span: Span {
                            start: token.byte_start,
                            length: token.byte_end - token.byte_start,
                        },
                        text: token.text,
                        metadata: Box::new(JaKoTokenMetadata {
                            details: token.details,
                        }),
                    })
                })
        }
    }
}
257
258#[cfg(feature = "japanese-korean")]
259pub use jako::*;
260
/// A token-stream transformation (case folding, stemming, metadata
/// stripping, …) applied lazily over a tokenizer's output.
pub trait Filter {
    /// Transforms `tokens`, yielding the filtered stream.
    fn apply<'a>(&self, tokens: impl Iterator<Item = Token<'a>>)
        -> impl Iterator<Item = Token<'a>>;
}
267
#[cfg(feature = "stemming")]
mod stemming {
    use super::*;

    /// Filter that stems token text using [`rust_stemmers`] (Snowball
    /// stemming algorithms).
    pub struct StemmingFilter {
        stemmer: rust_stemmers::Stemmer,
    }

    impl StemmingFilter {
        /// Creates a filter for the given stemming algorithm (i.e. the
        /// target language).
        pub fn new(algorithm: rust_stemmers::Algorithm) -> Self {
            Self {
                stemmer: rust_stemmers::Stemmer::create(algorithm),
            }
        }
    }

    impl Filter for StemmingFilter {
        fn apply<'a>(
            &self,
            tokens: impl Iterator<Item = Token<'a>>,
        ) -> impl Iterator<Item = Token<'a>> {
            tokens.map(|token| token.map_text(|text| self.stemmer.stem(text)))
        }
    }
}
295
296#[cfg(feature = "stemming")]
297pub use stemming::*;
298
299pub struct LowercaseFilter;
301
302impl Filter for LowercaseFilter {
303 fn apply<'a>(
304 &self,
305 tokens: impl Iterator<Item = Token<'a>>,
306 ) -> impl Iterator<Item = Token<'a>> {
307 tokens.map(|token| token.map_text(|text| text.to_lowercase().into()))
308 }
309}
310
311pub struct TrAzLowercaseFilter;
315
316impl Filter for TrAzLowercaseFilter {
317 fn apply<'a>(
318 &self,
319 tokens: impl Iterator<Item = Token<'a>>,
320 ) -> impl Iterator<Item = Token<'a>> {
321 tokens.map(|token| {
322 token.map_text(|text| {
323 text.chars()
324 .map(|c| match c {
325 'İ' => "i".to_string(),
326 'I' => "ı".to_string(),
327 c => c.to_lowercase().to_string(),
328 })
329 .collect::<String>()
330 .into()
331 })
332 })
333 }
334}
335
336pub struct UppercaseFilter;
338
339impl Filter for UppercaseFilter {
340 fn apply<'a>(
341 &self,
342 tokens: impl Iterator<Item = Token<'a>>,
343 ) -> impl Iterator<Item = Token<'a>> {
344 tokens.map(|token| token.map_text(|text| text.to_uppercase().into()))
345 }
346}
347
348pub struct TrAzUppercaseFilter;
352
353impl Filter for TrAzUppercaseFilter {
354 fn apply<'a>(
355 &self,
356 tokens: impl Iterator<Item = Token<'a>>,
357 ) -> impl Iterator<Item = Token<'a>> {
358 tokens.map(|token| {
359 token.map_text(|text| {
360 text.chars()
361 .map(|c| match c {
362 'i' => "İ".to_string(),
363 c => c.to_uppercase().to_string(),
364 })
365 .collect::<String>()
366 .into()
367 })
368 })
369 }
370}
371
372pub struct StripMetadataFilter;
374
375impl Filter for StripMetadataFilter {
376 fn apply<'a>(
377 &self,
378 tokens: impl Iterator<Item = Token<'a>>,
379 ) -> impl Iterator<Item = Token<'a>> {
380 tokens.map(|token| Token {
381 metadata: Box::new(()),
382 ..token
383 })
384 }
385}
386
#[cfg(test)]
mod test {
    use super::*;

    /// Builds a single-token stream over `text` for exercising filters.
    fn one_token(text: &str) -> std::vec::IntoIter<Token<'_>> {
        let token = Token {
            src_span: Span {
                start: 0,
                length: 1,
            },
            text: text.into(),
            metadata: Box::new(()),
        };
        vec![token].into_iter()
    }

    #[test]
    fn test_tr_az_lowercase_filter() {
        let texts: Vec<_> = TrAzLowercaseFilter
            .apply(one_token("MEKSİKALI"))
            .map(|token| token.text)
            .collect();
        assert_eq!(texts, &["meksikalı"]);
    }

    #[test]
    fn test_tr_az_uppercase_filter() {
        let texts: Vec<_> = TrAzUppercaseFilter
            .apply(one_token("meksikalı"))
            .map(|token| token.text)
            .collect();
        assert_eq!(texts, &["MEKSİKALI"]);
    }
}