// truecase/tokenizer.rs

1use std::borrow::Cow;
2use std::iter::Filter;
3
4use lazy_static::lazy_static;
5use regex::Regex;
6
7use crate::utils::split_in_three;
8
/// Iterator of non-empty `Token`s returned by `tokenize`. The predicate is a
/// plain `fn` pointer (not a closure type) so the filtered iterator type can
/// be named here and stays cloneable (see the note in `impl Token`).
pub(crate) type Tokens<'a> = Filter<Tokenizer<'a>, fn(&Token) -> bool>;
10
11pub(crate) fn tokenize(phrase: &str) -> Tokens {
12    let tokens = Tokenizer {
13        string: phrase,
14        next_token: None,
15    };
16    tokens.filter(|t| !t.is_empty())
17}
18
lazy_static! {
    // One-or-more run of characters treated as token separators:
    // common punctuation, several quote styles, the em dash, and
    // any whitespace (`\s`).
    static ref WORD_SEPARATORS: Regex = Regex::new(r#"[,.?!:;()«»„“”"—\s]+"#).unwrap();
}
22
/// Streaming tokenizer over a borrowed string; yields `Token`s via `Iterator`.
pub(crate) struct Tokenizer<'a> {
    // Separator token buffered by `next()` when it finds a word/separator
    // pair in one scan; it is yielded on the following call.
    next_token: Option<Token<'a>>,
    // Unconsumed remainder of the input string.
    string: &'a str,
}
27
28impl<'a> Iterator for Tokenizer<'a> {
29    type Item = Token<'a>;
30
31    fn next(&mut self) -> Option<Self::Item> {
32        if let Some(token) = self.next_token.take() {
33            return Some(token);
34        }
35
36        if self.string.is_empty() {
37            return None;
38        }
39
40        if let Some(mat) = WORD_SEPARATORS.find(self.string) {
41            let (before, matching_part, rest) = split_in_three(self.string, mat.start(), mat.end());
42            self.string = rest;
43            self.next_token = Some(Token::new(matching_part, TokenKind::Separator));
44            return Some(Token::new(before, TokenKind::Word));
45        } else {
46            let rest = self.string;
47            self.string = "";
48            return Some(Token::new(rest, TokenKind::Word));
49        }
50    }
51}
52
/// Classification of a token produced by `Tokenizer`.
///
/// Fieldless enum, so full `Eq`/`Hash` (in addition to the original
/// `PartialEq`) is free and lets `TokenKind` be used as a map/set key.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub(crate) enum TokenKind {
    /// Text between separator runs; subject to case normalization.
    Word,
    /// A run of punctuation/whitespace matched by `WORD_SEPARATORS`.
    Separator,
}
58
/// A single word or separator slice of the input string.
#[derive(Debug, Clone)]
pub(crate) struct Token<'a> {
    // The exact slice of the input this token covers.
    pub original: &'a str,
    // Lowercased form for words containing uppercase characters;
    // otherwise borrows `original` unchanged (no allocation).
    pub normalized: Cow<'a, str>,
    // Whether this token is a word or a separator run.
    pub kind: TokenKind,
}
65
66impl<'a> Token<'a> {
67    pub fn new(original: &'a str, kind: TokenKind) -> Self {
68        let normalized = if kind == TokenKind::Word && original.contains(char::is_uppercase) {
69            Cow::Owned(original.to_lowercase())
70        } else {
71            Cow::Borrowed(original)
72        };
73
74        Self {
75            original,
76            normalized,
77            kind,
78        }
79    }
80
81    pub fn is_meaningful(&self) -> bool {
82        self.kind == TokenKind::Word
83    }
84
85    pub fn is_empty(&self) -> bool {
86        self.original.is_empty()
87    }
88
89    // these functions are only necessary because closures can't be cloned and
90    // `join_with_spaces` requires a cloneable iterator
91    pub fn get_normalized(&self) -> &str {
92        self.normalized.as_ref()
93    }
94
95    pub fn get_original(&self) -> &str {
96        self.original
97    }
98}