use super::{BasicVocab, TokenNotFoundError, Vocab};
use serde::{Deserialize, Serialize};

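/// A WordPiece vocabulary wrapping a [`BasicVocab`] built from the token
/// list embedded at `resources/wordpiece_vocab.txt` (one token per line).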
#[derive(Serialize, Deserialize, Default, Clone)]
pub struct WordPieceVocab {
    vocab: BasicVocab,
}

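// Every lookup method delegates to the wrapped `BasicVocab`; this impl only
// fixes the token source to the embedded WordPiece vocab file.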
impl Vocab for WordPieceVocab {
    fn load() -> Self {
        // Read the embedded vocab file: one token per line. `lines()` avoids
        // pushing a spurious empty token when the file ends with a trailing
        // newline (and also handles `\r\n` endings), unlike split('\n').
        let tokens = include_str!("../resources/wordpiece_vocab.txt").lines();

        // Build the vocab, inserting tokens in file order.
        let mut vocab = BasicVocab::new();
        for token in tokens {
            vocab.add_token(String::from(token));
        }
        WordPieceVocab { vocab }
    }

    fn len(&self) -> usize {
        self.vocab.len()
    }

    fn tokens_from_indexes(&self, indexes: &[usize]) -> Result<Vec<String>, TokenNotFoundError> {
        self.vocab.tokens_from_indexes(indexes)
    }

    fn batch_tokens_from_indexes(
        &self,
        indexes: &[Vec<usize>],
    ) -> Result<Vec<Vec<String>>, TokenNotFoundError> {
        self.vocab.batch_tokens_from_indexes(indexes)
    }

    fn indexes_from_tokens(&self, tokens: &[String]) -> Result<Vec<usize>, TokenNotFoundError> {
        self.vocab.indexes_from_tokens(tokens)
    }

    fn batch_indexes_from_tokens(
        &self,
        tokens: &[Vec<String>],
    ) -> Result<Vec<Vec<usize>>, TokenNotFoundError> {
        self.vocab.batch_indexes_from_tokens(tokens)
    }
}
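
// A minimal usage sketch. It assumes only that the embedded vocab file is
// non-empty; the index -> token -> index round trip should hold for any
// vocab without duplicate tokens.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn load_and_round_trip() {
        let vocab = WordPieceVocab::load();
        assert!(vocab.len() > 0, "embedded vocab should not be empty");

        // Indexes -> tokens -> indexes should round-trip for valid indexes.
        let tokens = vocab
            .tokens_from_indexes(&[0])
            .expect("index 0 should be in range");
        let indexes = vocab
            .indexes_from_tokens(&tokens)
            .expect("a token from the vocab should map back to an index");
        assert_eq!(indexes, vec![0]);
    }
}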