1use std::borrow::Cow;
2use std::iter::Filter;
3
4use lazy_static::lazy_static;
5use regex::Regex;
6
7use crate::utils::split_in_three;
8
/// Iterator type returned by [`tokenize`]: the raw `Tokenizer` stream with
/// empty tokens filtered out.
pub(crate) type Tokens<'a> = Filter<Tokenizer<'a>, fn(&Token) -> bool>;
10
11pub(crate) fn tokenize(phrase: &str) -> Tokens {
12 let tokens = Tokenizer {
13 string: phrase,
14 next_token: None,
15 };
16 tokens.filter(|t| !t.is_empty())
17}
18
// Word separators: common punctuation, bracket/quote characters (including
// «», „“”), the em dash, and whitespace. The trailing `+` collapses a run of
// consecutive separator characters into a single Separator token.
lazy_static! {
    static ref WORD_SEPARATORS: Regex = Regex::new(r#"[,.?!:;()«»„“”"—\s]+"#).unwrap();
}
22
/// Streaming tokenizer over a borrowed phrase; yields `Token`s lazily.
/// Construct via [`tokenize`] rather than directly.
pub(crate) struct Tokenizer<'a> {
    // Separator token buffered by `next()` so that the word preceding it is
    // emitted first and each call yields exactly one token.
    next_token: Option<Token<'a>>,
    // Unconsumed remainder of the input phrase.
    string: &'a str,
}
27
28impl<'a> Iterator for Tokenizer<'a> {
29 type Item = Token<'a>;
30
31 fn next(&mut self) -> Option<Self::Item> {
32 if let Some(token) = self.next_token.take() {
33 return Some(token);
34 }
35
36 if self.string.is_empty() {
37 return None;
38 }
39
40 if let Some(mat) = WORD_SEPARATORS.find(self.string) {
41 let (before, matching_part, rest) = split_in_three(self.string, mat.start(), mat.end());
42 self.string = rest;
43 self.next_token = Some(Token::new(matching_part, TokenKind::Separator));
44 return Some(Token::new(before, TokenKind::Word));
45 } else {
46 let rest = self.string;
47 self.string = "";
48 return Some(Token::new(rest, TokenKind::Word));
49 }
50 }
51}
52
/// Classification of a [`Token`]: a word, or the separator text between words.
//
// `Eq` is derived alongside `PartialEq`: equality on this fieldless enum is
// total, and omitting `Eq` trips clippy's `derive_partial_eq_without_eq`.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub(crate) enum TokenKind {
    /// A run of non-separator characters.
    Word,
    /// One or more consecutive separator characters (punctuation/whitespace).
    Separator,
}
58
/// A single token sliced out of the input phrase.
#[derive(Debug, Clone)]
pub(crate) struct Token<'a> {
    /// The exact slice of the source phrase this token covers.
    pub original: &'a str,
    /// Lowercased copy of `original` for Word tokens that contain uppercase
    /// characters; otherwise a borrow of `original` (no allocation).
    pub normalized: Cow<'a, str>,
    /// Whether this token is a word or a separator.
    pub kind: TokenKind,
}
65
66impl<'a> Token<'a> {
67 pub fn new(original: &'a str, kind: TokenKind) -> Self {
68 let normalized = if kind == TokenKind::Word && original.contains(char::is_uppercase) {
69 Cow::Owned(original.to_lowercase())
70 } else {
71 Cow::Borrowed(original)
72 };
73
74 Self {
75 original,
76 normalized,
77 kind,
78 }
79 }
80
81 pub fn is_meaningful(&self) -> bool {
82 self.kind == TokenKind::Word
83 }
84
85 pub fn is_empty(&self) -> bool {
86 self.original.is_empty()
87 }
88
89 pub fn get_normalized(&self) -> &str {
92 self.normalized.as_ref()
93 }
94
95 pub fn get_original(&self) -> &str {
96 self.original
97 }
98}