// trie/tokenizer.rs
1use std::sync::Arc;
2
3use unicode_segmentation::UnicodeSegmentation;
4
/// Strategy for splitting a key `String` into tokens and joining tokens back
/// into a key. Cloneable so a configured `Tokenizer` can be shared cheaply
/// (the `Custom` closures are behind `Arc`, so cloning is a refcount bump).
#[derive(Clone)]
pub enum Tokenizer {
    /// A slice based `Tokenizer`
    ///
    /// Will tokenize `String` and detokenize `Vec<String>` by `usize` length.
    Slice(usize),
    /// A delimiter based `Tokenizer`
    ///
    /// Will tokenize `String` and detokenize `Vec<String>` using a `String` delimiter.
    Delimiter(String),
    /// A custom user defined `Tokenizer`
    ///
    /// Arguments
    ///
    /// `Arc<dyn Fn(String) -> Vec<String>>` - A function that will be used to `tokenize` a key into tokens.
    /// `Arc<dyn Fn(Vec<String>) -> String>` - A function that will be used to `detokenize` a `Vec<String>` of tokens into a `String`.
    ///
    /// Will tokenize and detokenize in a user defined way.
    Custom(Arc<dyn Fn(String) -> Vec<String>>, Arc<dyn Fn(Vec<String>) -> String>)
}
24
25impl Tokenizer {
26 /// Breaks a `String` into pieces based on `Tokenizer` type.
27 ///
28 /// * A `Tokenizer::Slice(length)` will split the String by `length`.
29 /// * A `Tokenizer::Delimiter(characters)` will split the String by `characters`.
30 ///
31 /// Arguments
32 /// `key` - A `String` that you want to be broken into pieces.
33 ///
34 /// Returns
35 /// `Vec<String>`
36 pub fn tokenize(&self, key: String) -> Vec<String> {
37 match self {
38 Self::Slice(length) => {
39 let mut slices = Vec::new();
40 let mut current_slice = String::new();
41
42 for grapheme in key.graphemes(true) {
43 if current_slice.len() + grapheme.len() <= *length {
44 current_slice.push_str(grapheme);
45 } else {
46 slices.push(current_slice.clone());
47 current_slice.clear();
48 current_slice.push_str(grapheme);
49 }
50 }
51
52 if !current_slice.is_empty() {
53 slices.push(current_slice);
54 }
55 slices
56 }
57 Self::Delimiter(delimiter) => {
58 key.split(delimiter).map(|s| s.to_string()).collect()
59 }
60 Self::Custom(tokenize_fn, _) => tokenize_fn(key)
61 }
62 }
63
64 /// Joins pieces of a `String` together based on `Tokenizer` type.
65 ///
66 /// * A `Tokenizer::Slice` will join elements together without a delimiter.
67 /// * A `Tokenizer::Delimiter` will join elements together with a delimiter.
68 /// Arguments
69 /// `tokens` - A `Vec<String>` that you'd like to be a single String.
70 ///
71 /// Returns
72 /// `String`
73 pub fn detokenize(&self, tokens: Vec<String>) -> String {
74 match self {
75 Self::Slice(_) => {
76 tokens.join("")
77 }
78 Self::Delimiter(delimiter) => {
79 tokens.join(delimiter)
80 }
81 Self::Custom(_, detokenize_fn) => detokenize_fn(tokens)
82 }
83 }
84}
85