trie/
tokenizer.rs

1use std::sync::Arc;
2
3use unicode_segmentation::UnicodeSegmentation;
4
pub enum Tokenizer {
    /// A slice based `Tokenizer`
    ///
    /// Will tokenize `String` and detokenize `Vec<String>` by `usize` length.
    /// NOTE(review): the length is compared against UTF-8 *byte* length of the
    /// accumulated slice (see `tokenize`), not grapheme count — confirm intent.
    Slice(usize),
    /// A delimiter based `Tokenizer`
    ///
    /// Will tokenize `String` and detokenize `Vec<String>` using a `String` delimiter.
    Delimiter(String),
    /// A custom user defined `Tokenizer`
    ///
    /// Arguments
    ///
    /// `Arc<dyn Fn(String) -> Vec<String>>` - A function that will be used to `tokenize` a key into tokens.
    /// `Arc<dyn Fn(Vec<String>) -> String>` - A function that will be used to `detokenize` a `Vec<String>` of tokens into a `String`.
    ///
    /// Will tokenize and detokenize in a user defined way.
    Custom(Arc<dyn Fn(String) -> Vec<String>>, Arc<dyn Fn(Vec<String>) -> String>)
}
24
25impl Tokenizer {
26    /// Breaks a `String` into pieces based on `Tokenizer` type.
27    ///
28    /// * A `Tokenizer::Slice(length)` will split the String by `length`.
29    /// * A `Tokenizer::Delimiter(characters)` will split the String by `characters`.
30    ///
31    /// Arguments
32    /// `key` - A `String` that you want to be broken into pieces.
33    ///
34    /// Returns
35    /// `Vec<String>`
36    pub fn tokenize(&self, key: String) -> Vec<String> {
37        match self {
38            Self::Slice(length) => {
39                let mut slices = Vec::new();
40                let mut current_slice = String::new();
41
42                for grapheme in key.graphemes(true) {
43                    if current_slice.len() + grapheme.len() <= *length {
44                        current_slice.push_str(grapheme);
45                    } else {
46                        slices.push(current_slice.clone());
47                        current_slice.clear();
48                        current_slice.push_str(grapheme);
49                    }
50                }
51
52                if !current_slice.is_empty() {
53                    slices.push(current_slice);
54                }
55                slices
56            }
57            Self::Delimiter(delimiter) => {
58                key.split(delimiter).map(|s| s.to_string()).collect()
59            }
60            Self::Custom(tokenize_fn, _) => tokenize_fn(key)
61        }
62    }
63
64    /// Joins pieces of a `String` together based on `Tokenizer` type.
65    ///
66    /// * A `Tokenizer::Slice` will join elements together without a delimiter.
67    /// * A `Tokenizer::Delimiter` will join elements together with a delimiter.
68    /// Arguments
69    /// `tokens` - A `Vec<String>` that you'd like to be a single String.
70    ///
71    /// Returns
72    /// `String`
73    pub fn detokenize(&self, tokens: Vec<String>) -> String {
74        match self {
75            Self::Slice(_) => {
76                tokens.join("")
77            }
78            Self::Delimiter(delimiter) => {
79                tokens.join(delimiter)
80            }
81            Self::Custom(_, detokenize_fn) => detokenize_fn(tokens)
82        }
83    }
84}
85