1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
use super::Tokenizer;
use serde::{Deserialize, Serialize};
/// Stateless tokenizer that splits text on single space characters
/// (`' '`) and rejoins tokens with single spaces.
///
/// Carries no configuration, so `Default` is derived alongside the
/// existing serde / `Debug` / `Clone` derives.
#[derive(Serialize, Deserialize, Debug, Clone, Default)]
pub struct WhitespaceTokenizer {}
impl Tokenizer for WhitespaceTokenizer {
    /// Construct the tokenizer. No state or resources are loaded.
    fn load() -> Self {
        WhitespaceTokenizer {}
    }

    /// Split `string` on single space characters into owned tokens.
    ///
    /// Note: `split(' ')` semantics — consecutive spaces (and leading or
    /// trailing spaces) yield empty-string tokens, which keeps
    /// `untokenize` a faithful inverse.
    fn tokenize(&self, string: &str) -> Vec<String> {
        string.split(' ').map(str::to_string).collect()
    }

    /// Tokenize every string in the batch.
    ///
    /// Bug fix: this previously split on `""`, which splits between every
    /// character (a character tokenizer), inconsistent with `tokenize`.
    /// It now delegates to `tokenize` so single and batch paths agree.
    fn batch_tokenize(&self, strings: Vec<String>) -> Vec<Vec<String>> {
        strings.iter().map(|s| self.tokenize(s)).collect()
    }

    /// Rejoin tokens with single spaces; inverse of `tokenize`.
    fn untokenize(&self, tokens: Vec<String>) -> String {
        tokens.join(" ")
    }

    /// Untokenize every token sequence in the batch, delegating to
    /// `untokenize` so single and batch paths stay consistent.
    fn batch_untokenize(&self, tokens: Vec<Vec<String>>) -> Vec<String> {
        tokens.into_iter().map(|toks| self.untokenize(toks)).collect()
    }
}