1use crate::analysis::token::Token;
9
/// Splits a piece of text into [`Token`]s.
///
/// Implementors must be `Send + Sync`, as required by the supertrait bounds.
pub trait Tokenizer: Send + Sync {
    /// Tokenizes `text` and appends the resulting tokens to `output`.
    ///
    /// `output` is an out-parameter: tokens are pushed onto the existing
    /// vector rather than returned.
    fn tokenize(&self, text: &str, output: &mut Vec<Token>);
}
21
22pub struct StandardTokenizer;
31
32impl Tokenizer for StandardTokenizer {
33 fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
34 use unicode_segmentation::UnicodeSegmentation;
35
36 let mut position = output.last().map_or(0, |t| t.position + 1);
37
38 for (byte_offset, word) in text.unicode_word_indices() {
39 output.push(Token::new(
40 word,
41 byte_offset,
42 byte_offset + word.len(),
43 position,
44 ));
45 position += 1;
46 }
47 }
48}
49
50pub struct WhitespaceTokenizer;
57
58impl Tokenizer for WhitespaceTokenizer {
59 fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
60 let mut position = output.last().map_or(0, |t| t.position + 1);
61
62 for token_text in text.split_whitespace() {
63 let byte_offset = token_text.as_ptr() as usize - text.as_ptr() as usize;
65 output.push(Token::new(
66 token_text,
67 byte_offset,
68 byte_offset + token_text.len(),
69 position,
70 ));
71 position += 1;
72 }
73 }
74}
75
76pub struct LetterTokenizer;
83
84impl Tokenizer for LetterTokenizer {
85 fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
86 let mut position = output.last().map_or(0, |t| t.position + 1);
87 let mut start = None;
88
89 for (i, ch) in text.char_indices() {
90 if ch.is_alphabetic() {
91 if start.is_none() {
92 start = Some(i);
93 }
94 } else if let Some(s) = start.take() {
95 output.push(Token::new(&text[s..i], s, i, position));
96 position += 1;
97 }
98 }
99
100 if let Some(s) = start {
102 output.push(Token::new(&text[s..], s, text.len(), position));
103 }
104 }
105}
106
107pub struct KeywordTokenizer;
114
115impl Tokenizer for KeywordTokenizer {
116 fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
117 if text.is_empty() {
118 return;
119 }
120 let position = output.last().map_or(0, |t| t.position + 1);
121 output.push(Token::new(text, 0, text.len(), position));
122 }
123}
124
/// Character classes that can be selected as "part of a token".
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum TokenChar {
    /// Alphabetic characters (`char::is_alphabetic`).
    Letter,
    /// ASCII digits only (`char::is_ascii_digit`).
    Digit,
    /// Whitespace (`char::is_whitespace`).
    Whitespace,
    /// ASCII punctuation only (`char::is_ascii_punctuation`).
    Punctuation,
    /// Anything that is not alphanumeric, whitespace, or ASCII punctuation.
    Symbol,
}

impl TokenChar {
    /// Returns `true` when `ch` belongs to this character class.
    fn matches(&self, ch: char) -> bool {
        match self {
            Self::Letter => ch.is_alphabetic(),
            Self::Digit => ch.is_ascii_digit(),
            Self::Whitespace => ch.is_whitespace(),
            Self::Punctuation => ch.is_ascii_punctuation(),
            // "None of the above", expressed as a single negation.
            Self::Symbol => {
                !(ch.is_alphanumeric() || ch.is_whitespace() || ch.is_ascii_punctuation())
            }
        }
    }

    /// Parses a lowercase class name (`"letter"`, `"digit"`, `"whitespace"`,
    /// `"punctuation"`, `"symbol"`).
    ///
    /// Matching is exact and case-sensitive; any other string yields `None`.
    /// This is an inherent method, not an implementation of `std::str::FromStr`.
    pub fn from_str(s: &str) -> Option<Self> {
        let class = match s {
            "letter" => Self::Letter,
            "digit" => Self::Digit,
            "whitespace" => Self::Whitespace,
            "punctuation" => Self::Punctuation,
            "symbol" => Self::Symbol,
            _ => return None,
        };
        Some(class)
    }
}
162
163fn is_token_char(ch: char, token_chars: &[TokenChar]) -> bool {
164 if token_chars.is_empty() {
165 return true; }
167 token_chars.iter().any(|tc| tc.matches(ch))
168}
169
170pub struct NGramTokenizer {
180 pub min_gram: usize,
181 pub max_gram: usize,
182 pub token_chars: Vec<TokenChar>,
183}
184
185impl NGramTokenizer {
186 pub fn new(min_gram: usize, max_gram: usize, token_chars: Vec<TokenChar>) -> Self {
187 Self {
188 min_gram,
189 max_gram,
190 token_chars,
191 }
192 }
193}
194
195impl Tokenizer for NGramTokenizer {
196 fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
197 let mut position = output.last().map_or(0, |t| t.position + 1);
198
199 let words = split_by_token_chars(text, &self.token_chars);
200
201 for (word, word_offset) in words {
202 let chars: Vec<(usize, char)> = word.char_indices().collect();
203 for n in self.min_gram..=self.max_gram {
204 if n > chars.len() {
205 break;
206 }
207 for i in 0..=chars.len() - n {
208 let start = chars[i].0;
209 let end = if i + n < chars.len() {
210 chars[i + n].0
211 } else {
212 word.len()
213 };
214 let gram = &word[start..end];
215 output.push(Token::new(
216 gram,
217 word_offset + start,
218 word_offset + end,
219 position,
220 ));
221 position += 1;
222 }
223 }
224 }
225 }
226}
227
228pub struct EdgeNGramTokenizer {
237 pub min_gram: usize,
238 pub max_gram: usize,
239 pub token_chars: Vec<TokenChar>,
240}
241
242impl EdgeNGramTokenizer {
243 pub fn new(min_gram: usize, max_gram: usize, token_chars: Vec<TokenChar>) -> Self {
244 Self {
245 min_gram,
246 max_gram,
247 token_chars,
248 }
249 }
250}
251
252impl Tokenizer for EdgeNGramTokenizer {
253 fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
254 let mut position = output.last().map_or(0, |t| t.position + 1);
255
256 let words = split_by_token_chars(text, &self.token_chars);
257
258 for (word, word_offset) in words {
259 let chars: Vec<(usize, char)> = word.char_indices().collect();
260 for n in self.min_gram..=self.max_gram.min(chars.len()) {
261 let end = if n < chars.len() {
262 chars[n].0
263 } else {
264 word.len()
265 };
266 let gram = &word[..end];
267 output.push(Token::new(gram, word_offset, word_offset + end, position));
268 position += 1;
269 }
270 }
271 }
272}
273
274fn split_by_token_chars<'a>(text: &'a str, token_chars: &[TokenChar]) -> Vec<(&'a str, usize)> {
276 if token_chars.is_empty() {
277 if text.is_empty() {
278 return Vec::new();
279 }
280 return vec![(text, 0)];
281 }
282
283 let mut words = Vec::new();
284 let mut start = None;
285
286 for (i, ch) in text.char_indices() {
287 if is_token_char(ch, token_chars) {
288 if start.is_none() {
289 start = Some(i);
290 }
291 } else if let Some(s) = start.take() {
292 words.push((&text[s..i], s));
293 }
294 }
295
296 if let Some(s) = start {
297 words.push((&text[s..], s));
298 }
299
300 words
301}
302
303pub struct PatternTokenizer {
310 pattern: regex::Regex,
311}
312
313impl PatternTokenizer {
314 pub fn new(pattern: &str) -> Result<Self, regex::Error> {
315 Ok(Self {
316 pattern: regex::Regex::new(pattern)?,
317 })
318 }
319}
320
321impl Tokenizer for PatternTokenizer {
322 fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
323 let mut position = output.last().map_or(0, |t| t.position + 1);
324 let mut last_end = 0;
325
326 for m in self.pattern.find_iter(text) {
327 if m.start() > last_end {
328 let token_text = &text[last_end..m.start()];
329 if !token_text.is_empty() {
330 output.push(Token::new(token_text, last_end, m.start(), position));
331 position += 1;
332 }
333 }
334 last_end = m.end();
335 }
336
337 if last_end < text.len() {
339 let token_text = &text[last_end..];
340 if !token_text.is_empty() {
341 output.push(Token::new(token_text, last_end, text.len(), position));
342 }
343 }
344 }
345}
346
347pub struct PathHierarchyTokenizer {
355 pub separator: char,
356 pub replacement: Option<char>,
357}
358
359impl PathHierarchyTokenizer {
360 pub fn new(separator: char, replacement: Option<char>) -> Self {
361 Self {
362 separator,
363 replacement,
364 }
365 }
366}
367
368impl Default for PathHierarchyTokenizer {
369 fn default() -> Self {
370 Self {
371 separator: '/',
372 replacement: None,
373 }
374 }
375}
376
377impl Tokenizer for PathHierarchyTokenizer {
378 fn tokenize(&self, text: &str, output: &mut Vec<Token>) {
379 if text.is_empty() {
380 return;
381 }
382
383 let mut position = output.last().map_or(0, |t| t.position + 1);
384 let replacement = self.replacement.unwrap_or(self.separator);
385
386 let mut sep_positions: Vec<usize> = Vec::new();
388 for (i, ch) in text.char_indices() {
389 if ch == self.separator {
390 sep_positions.push(i);
391 }
392 }
393
394 if sep_positions.is_empty() {
395 let token_text = if self.replacement.is_some() {
397 text.to_string()
398 } else {
399 text.to_string()
400 };
401 output.push(Token::new(token_text, 0, text.len(), position));
402 return;
403 }
404
405 for &sep_pos in &sep_positions {
407 let end = sep_pos;
408 if end == 0 {
409 continue; }
411 let segment = &text[..end];
412 let token_text = if replacement != self.separator {
413 segment.replace(self.separator, &replacement.to_string())
414 } else {
415 segment.to_string()
416 };
417 output.push(Token::new(token_text, 0, end, position));
418 position += 1;
419 }
420
421 let token_text = if replacement != self.separator {
423 text.replace(self.separator, &replacement.to_string())
424 } else {
425 text.to_string()
426 };
427 output.push(Token::new(token_text, 0, text.len(), position));
428 }
429}
430
// Unit tests for the tokenizers defined in this file, grouped by tokenizer.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- StandardTokenizer ----

    // Punctuation is dropped; only the two words remain.
    #[test]
    fn standard_basic() {
        let mut tokens = Vec::new();
        StandardTokenizer.tokenize("Hello, world!", &mut tokens);
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "Hello");
        assert_eq!(tokens[1].text, "world");
    }

    // Positions are consecutive and start at 0 for an empty output vector.
    #[test]
    fn standard_positions() {
        let mut tokens = Vec::new();
        StandardTokenizer.tokenize("the quick brown fox", &mut tokens);
        assert_eq!(tokens.len(), 4);
        for (i, token) in tokens.iter().enumerate() {
            assert_eq!(token.position, i as u32);
        }
    }

    // Offsets are byte ranges into the input.
    #[test]
    fn standard_offsets() {
        let mut tokens = Vec::new();
        StandardTokenizer.tokenize("Hello world", &mut tokens);
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 5);
        assert_eq!(tokens[1].offset_from, 6);
        assert_eq!(tokens[1].offset_to, 11);
    }

    #[test]
    fn standard_empty() {
        let mut tokens = Vec::new();
        StandardTokenizer.tokenize("", &mut tokens);
        assert!(tokens.is_empty());
    }

    #[test]
    fn standard_punctuation_only() {
        let mut tokens = Vec::new();
        StandardTokenizer.tokenize("!!! ... ???", &mut tokens);
        assert!(tokens.is_empty());
    }

    // Only a lower bound on the token count is asserted here; the exact
    // segmentation of mixed letter/digit words is left to the segmenter.
    #[test]
    fn standard_numbers() {
        let mut tokens = Vec::new();
        StandardTokenizer.tokenize("test123 456abc", &mut tokens);
        assert!(tokens.len() >= 2);
    }

    // Deliberately loose: only checks that some token contains "it".
    #[test]
    fn standard_apostrophe() {
        let mut tokens = Vec::new();
        StandardTokenizer.tokenize("it's a test", &mut tokens);
        assert!(tokens.iter().any(|t| t.text.contains("it")));
    }

    // ---- WhitespaceTokenizer ----

    // Punctuation stays attached to the surrounding word.
    #[test]
    fn whitespace_basic() {
        let mut tokens = Vec::new();
        WhitespaceTokenizer.tokenize("Hello, world!", &mut tokens);
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "Hello,");
        assert_eq!(tokens[1].text, "world!");
    }

    #[test]
    fn whitespace_preserves_punctuation() {
        let mut tokens = Vec::new();
        WhitespaceTokenizer.tokenize("price=$100.00", &mut tokens);
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "price=$100.00");
    }

    // Spaces, tabs and newlines all separate tokens; runs produce no empties.
    #[test]
    fn whitespace_multiple_spaces() {
        let mut tokens = Vec::new();
        WhitespaceTokenizer.tokenize("a b\t\nc", &mut tokens);
        assert_eq!(tokens.len(), 3);
        assert_eq!(tokens[0].text, "a");
        assert_eq!(tokens[1].text, "b");
        assert_eq!(tokens[2].text, "c");
    }

    #[test]
    fn whitespace_offsets() {
        let mut tokens = Vec::new();
        WhitespaceTokenizer.tokenize("Hello world", &mut tokens);
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 5);
        assert_eq!(tokens[1].offset_from, 6);
        assert_eq!(tokens[1].offset_to, 11);
    }

    #[test]
    fn whitespace_empty() {
        let mut tokens = Vec::new();
        WhitespaceTokenizer.tokenize("", &mut tokens);
        assert!(tokens.is_empty());
    }

    // ---- LetterTokenizer ----

    #[test]
    fn letter_basic() {
        let mut tokens = Vec::new();
        LetterTokenizer.tokenize("Hello, world!", &mut tokens);
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "Hello");
        assert_eq!(tokens[1].text, "world");
    }

    // Digits act as separators, splitting the input into two tokens.
    #[test]
    fn letter_strips_numbers() {
        let mut tokens = Vec::new();
        LetterTokenizer.tokenize("test123data", &mut tokens);
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "test");
        assert_eq!(tokens[1].text, "data");
    }

    // Non-ASCII letters are kept inside tokens.
    #[test]
    fn letter_unicode() {
        let mut tokens = Vec::new();
        LetterTokenizer.tokenize("café résumé", &mut tokens);
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens[0].text, "café");
        assert_eq!(tokens[1].text, "résumé");
    }

    #[test]
    fn letter_offsets() {
        let mut tokens = Vec::new();
        LetterTokenizer.tokenize("abc 123 def", &mut tokens);
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 3);
        assert_eq!(tokens[1].offset_from, 8);
        assert_eq!(tokens[1].offset_to, 11);
    }

    #[test]
    fn letter_empty() {
        let mut tokens = Vec::new();
        LetterTokenizer.tokenize("", &mut tokens);
        assert!(tokens.is_empty());
    }

    #[test]
    fn letter_no_letters() {
        let mut tokens = Vec::new();
        LetterTokenizer.tokenize("12345 !@#$%", &mut tokens);
        assert!(tokens.is_empty());
    }

    // ---- KeywordTokenizer ----

    // The whole input, punctuation and spaces included, is one token.
    #[test]
    fn keyword_basic() {
        let mut tokens = Vec::new();
        KeywordTokenizer.tokenize("Hello, world!", &mut tokens);
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "Hello, world!");
    }

    #[test]
    fn keyword_offsets() {
        let mut tokens = Vec::new();
        KeywordTokenizer.tokenize("test value", &mut tokens);
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 10);
        assert_eq!(tokens[0].position, 0);
    }

    #[test]
    fn keyword_empty() {
        let mut tokens = Vec::new();
        KeywordTokenizer.tokenize("", &mut tokens);
        assert!(tokens.is_empty());
    }

    // ---- NGramTokenizer ----

    // Grams come out grouped by length: all 2-grams first, then all 3-grams.
    #[test]
    fn ngram_basic() {
        let tok = NGramTokenizer::new(2, 3, vec![]);
        let mut tokens = Vec::new();
        tok.tokenize("Quick", &mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(texts, vec!["Qu", "ui", "ic", "ck", "Qui", "uic", "ick"]);
    }

    // "2" is too short for a 3-gram, and grams never span the spaces.
    #[test]
    fn ngram_with_token_chars() {
        let tok = NGramTokenizer::new(3, 3, vec![TokenChar::Letter, TokenChar::Digit]);
        let mut tokens = Vec::new();
        tok.tokenize("2 Quick Foxes", &mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(texts, vec!["Qui", "uic", "ick", "Fox", "oxe", "xes"]);
    }

    #[test]
    fn ngram_empty() {
        let tok = NGramTokenizer::new(1, 2, vec![]);
        let mut tokens = Vec::new();
        tok.tokenize("", &mut tokens);
        assert!(tokens.is_empty());
    }

    #[test]
    fn ngram_offsets() {
        let tok = NGramTokenizer::new(2, 2, vec![]);
        let mut tokens = Vec::new();
        tok.tokenize("abc", &mut tokens);
        assert_eq!(tokens[0].text, "ab");
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 2);
        assert_eq!(tokens[1].text, "bc");
        assert_eq!(tokens[1].offset_from, 1);
        assert_eq!(tokens[1].offset_to, 3);
    }

    // ---- EdgeNGramTokenizer ----

    // Prefixes grow from `min_gram` up to the word length (below `max_gram`).
    #[test]
    fn edge_ngram_basic() {
        let tok = EdgeNGramTokenizer::new(2, 5, vec![TokenChar::Letter]);
        let mut tokens = Vec::new();
        tok.tokenize("Quick", &mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(texts, vec!["Qu", "Qui", "Quic", "Quick"]);
    }

    // Each word gets its own prefixes; "Quick" is capped at 4 characters.
    #[test]
    fn edge_ngram_multiple_words() {
        let tok = EdgeNGramTokenizer::new(2, 4, vec![TokenChar::Letter]);
        let mut tokens = Vec::new();
        tok.tokenize("Quick Fox", &mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(texts, vec!["Qu", "Qui", "Quic", "Fo", "Fox"]);
    }

    // A word shorter than `min_gram` yields no tokens at all.
    #[test]
    fn edge_ngram_min_larger_than_word() {
        let tok = EdgeNGramTokenizer::new(5, 10, vec![TokenChar::Letter]);
        let mut tokens = Vec::new();
        tok.tokenize("Hi", &mut tokens);
        assert!(tokens.is_empty());
    }

    // Every prefix starts at the word's first byte.
    #[test]
    fn edge_ngram_offsets() {
        let tok = EdgeNGramTokenizer::new(2, 3, vec![TokenChar::Letter]);
        let mut tokens = Vec::new();
        tok.tokenize("Hello", &mut tokens);
        assert_eq!(tokens[0].text, "He");
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 2);
        assert_eq!(tokens[1].text, "Hel");
        assert_eq!(tokens[1].offset_from, 0);
        assert_eq!(tokens[1].offset_to, 3);
    }

    // ---- PatternTokenizer ----

    // Consecutive separators (", " and "! ") do not produce empty tokens.
    #[test]
    fn pattern_basic() {
        let tok = PatternTokenizer::new(r"[ .,!?]").unwrap();
        let mut tokens = Vec::new();
        tok.tokenize("Hello, World! Test.", &mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(texts, vec!["Hello", "World", "Test"]);
    }

    // With no separator match the whole input is a single token.
    #[test]
    fn pattern_no_match() {
        let tok = PatternTokenizer::new(r"\d+").unwrap();
        let mut tokens = Vec::new();
        tok.tokenize("hello world", &mut tokens);
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "hello world");
    }

    #[test]
    fn pattern_offsets() {
        let tok = PatternTokenizer::new(r"\s+").unwrap();
        let mut tokens = Vec::new();
        tok.tokenize("hello world", &mut tokens);
        assert_eq!(tokens[0].offset_from, 0);
        assert_eq!(tokens[0].offset_to, 5);
        assert_eq!(tokens[1].offset_from, 6);
        assert_eq!(tokens[1].offset_to, 11);
    }

    // ---- PathHierarchyTokenizer ----

    // A leading separator is kept in the tokens but yields no empty prefix.
    #[test]
    fn path_hierarchy_basic() {
        let tok = PathHierarchyTokenizer::default();
        let mut tokens = Vec::new();
        tok.tokenize("/a/b/c", &mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(texts, vec!["/a", "/a/b", "/a/b/c"]);
    }

    #[test]
    fn path_hierarchy_no_leading_sep() {
        let tok = PathHierarchyTokenizer::default();
        let mut tokens = Vec::new();
        tok.tokenize("a/b/c", &mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(texts, vec!["a", "a/b", "a/b/c"]);
    }

    // Without any separator the input is emitted as one token.
    #[test]
    fn path_hierarchy_no_sep() {
        let tok = PathHierarchyTokenizer::default();
        let mut tokens = Vec::new();
        tok.tokenize("filename", &mut tokens);
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "filename");
    }

    #[test]
    fn path_hierarchy_custom_sep() {
        let tok = PathHierarchyTokenizer::new('.', None);
        let mut tokens = Vec::new();
        tok.tokenize("com.example.app", &mut tokens);
        let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
        assert_eq!(texts, vec!["com", "com.example", "com.example.app"]);
    }
}