use super::Tokenizer;
use serde::{Deserialize, Serialize};

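/// A tokenizer that splits input on single space characters (`' '`) and
/// rejoins tokens with single spaces, so `untokenize(tokenize(s))` returns
/// `s` unchanged. Runs of spaces yield empty tokens; tabs and newlines are
/// not treated as separators.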
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct WhitespaceTokenizer {}

impl Tokenizer for WhitespaceTokenizer {
    fn load() -> Self {
        WhitespaceTokenizer {}
    }

    fn tokenize(&self, string: &str) -> Vec<String> {
        // Split on single spaces (not all whitespace) so that `untokenize`
        // is an exact inverse; runs of spaces produce empty tokens.
        string.split(' ').map(|token| token.to_string()).collect()
    }

    fn batch_tokenize(&self, strings: Vec<String>) -> Vec<Vec<String>> {
        // Delegate to `tokenize` so batch and single-string tokenization agree.
        strings.iter().map(|string| self.tokenize(string)).collect()
    }

    fn untokenize(&self, tokens: Vec<String>) -> String {
        tokens.join(" ")
    }

    fn batch_untokenize(&self, tokens: Vec<Vec<String>>) -> Vec<String> {
        tokens.iter().map(|sentence| sentence.join(" ")).collect()
    }
}
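
// A minimal sketch of round-trip checks for the tokenizer above; the test
// module and test names are illustrative assumptions, relying only on the
// `Tokenizer` methods defined in this file.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenize_untokenize_round_trip() {
        let tokenizer = WhitespaceTokenizer::load();
        let input = "hello world".to_string();
        let tokens = tokenizer.tokenize(&input);
        assert_eq!(tokens, vec!["hello", "world"]);
        // Joining with single spaces exactly inverts the split.
        assert_eq!(tokenizer.untokenize(tokens), input);
    }

    #[test]
    fn batch_methods_match_single_string_methods() {
        let tokenizer = WhitespaceTokenizer::load();
        let strings = vec!["a b".to_string(), "c d e".to_string()];
        let batched = tokenizer.batch_tokenize(strings.clone());
        let singles: Vec<Vec<String>> =
            strings.iter().map(|s| tokenizer.tokenize(s)).collect();
        assert_eq!(batched, singles);
        assert_eq!(tokenizer.batch_untokenize(batched), strings);
    }
}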