syntaxdot_tokenizers/albert.rs

use std::path::Path;

use sentencepiece::SentencePieceProcessor;
use udgraph::graph::{Node, Sentence};

use super::{SentenceWithPieces, Tokenize};
use crate::TokenizerError;

/// Tokenizer for ALBERT models.
///
/// ALBERT uses the sentencepiece tokenizer. However, we cannot use it
/// in the intended way: we would have to detokenize sentences, and it
/// is not guaranteed that each token has a unique piece, which is
/// required in sequence labeling. So instead, we use sentencepiece as
/// a subword tokenizer, encoding each token separately.
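///
/// # Example
///
/// A minimal usage sketch, not taken from the original source: the
/// model path is hypothetical, and it assumes that `AlbertTokenizer`
/// and `Tokenize` are re-exported at the crate root and that the
/// fields of `SentenceWithPieces` are public.
///
/// ```no_run
/// use std::iter::FromIterator;
///
/// use syntaxdot_tokenizers::{AlbertTokenizer, Tokenize};
/// use udgraph::graph::Sentence;
/// use udgraph::token::Token;
///
/// let tokenizer = AlbertTokenizer::open("albert-base-v2.spm").unwrap();
/// let sentence = Sentence::from_iter(vec![Token::new("hello"), Token::new("world")]);
/// let with_pieces = tokenizer.tokenize(sentence);
///
/// // One offset into `pieces` per token; [CLS] and [SEP] are added
/// // around the token pieces.
/// assert_eq!(with_pieces.token_offsets.len(), 2);
/// ```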
pub struct AlbertTokenizer {
    spp: SentencePieceProcessor,
}

impl AlbertTokenizer {
    /// Construct a tokenizer from a sentencepiece processor.
    pub fn new(spp: SentencePieceProcessor) -> Self {
        AlbertTokenizer { spp }
    }

    /// Load a tokenizer from a sentencepiece model file.
    pub fn open<P>(model: P) -> Result<Self, TokenizerError>
    where
        P: AsRef<Path>,
    {
        let spp = SentencePieceProcessor::open(model)?;
        Ok(Self::new(spp))
    }
}

impl From<SentencePieceProcessor> for AlbertTokenizer {
    fn from(spp: SentencePieceProcessor) -> Self {
        AlbertTokenizer::new(spp)
    }
}

impl Tokenize for AlbertTokenizer {
    fn tokenize(&self, sentence: Sentence) -> SentenceWithPieces {
        // An average of three pieces per token ought to be enough for
        // everyone ;).
        let mut pieces = Vec::with_capacity((sentence.len() + 1) * 3);
        let mut token_offsets = Vec::with_capacity(sentence.len());

        // `piece_to_id` can fail (sentencepiece error) or return `None`
        // (piece not in the model), hence the two `expect`s.
        pieces.push(
            self.spp
                .piece_to_id("[CLS]")
                .expect("sentencepiece failed to look up the [CLS] token")
                .expect("ALBERT model does not have a [CLS] token") as i64,
        );

        for token in sentence.iter().filter_map(Node::token) {
            // Record where this token's pieces start.
            token_offsets.push(pieces.len());

            let token_pieces = self
                .spp
                .encode(token.form())
                .expect("The sentencepiece tokenizer failed");

            if !token_pieces.is_empty() {
                pieces.extend(token_pieces.into_iter().map(|piece| piece.id as i64));
            } else {
                // Use the unknown token id if sentencepiece does not
                // give an output for the token. This should not
                // happen under normal circumstances, since
                // sentencepiece does return this id for unknown
                // tokens. However, the input may be corrupt and use
                // some form of non-tab whitespace as a form, for which
                // sentencepiece does not return any identifier.
                pieces.push(self.spp.unk_id() as i64);
            }
        }

        pieces.push(
            self.spp
                .piece_to_id("[SEP]")
                .expect("sentencepiece failed to look up the [SEP] token")
                .expect("ALBERT model does not have a [SEP] token") as i64,
        );

        SentenceWithPieces {
            pieces: pieces.into(),
            sentence,
            token_offsets,
        }
    }
}

#[cfg(feature = "model-tests")]
#[cfg(test)]
mod tests {
    use std::iter::FromIterator;

    use ndarray::array;
    use sentencepiece::SentencePieceProcessor;
    use udgraph::graph::Sentence;
    use udgraph::token::Token;

    use super::AlbertTokenizer;
    use crate::Tokenize;

    fn sentence_from_forms(forms: &[&str]) -> Sentence {
        Sentence::from_iter(forms.iter().map(|&f| Token::new(f)))
    }

    fn albert_tokenizer() -> AlbertTokenizer {
        let spp = SentencePieceProcessor::open(env!("ALBERT_BASE_V2_SENTENCEPIECE")).unwrap();
        AlbertTokenizer::new(spp)
    }

    #[test]
    fn tokenizer_gives_expected_output() {
        let tokenizer = albert_tokenizer();
        let sent = sentence_from_forms(&["pierre", "vinken", "will", "join", "the", "board", "."]);
        let pieces = tokenizer.tokenize(sent);
        assert_eq!(
            pieces.pieces,
            array![2, 5399, 9730, 2853, 129, 1865, 14, 686, 13, 9, 3]
        );
    }

    #[test]
    fn handles_missing_sentence_pieces() {
        let tokenizer = albert_tokenizer();
        let sent = sentence_from_forms(&["pierre", " ", "vinken"]);
        let pieces = tokenizer.tokenize(sent);
        assert_eq!(pieces.pieces, array![2, 5399, 1, 9730, 2853, 3]);
    }
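
    // Added sketch, not in the original tests: checks the alignment
    // contract that `token_offsets` index into `pieces`, skipping the
    // leading [CLS]. The expected offsets follow from the piece ids in
    // the tests above: "pierre" encodes to a single piece (5399), so
    // "vinken" starts at offset 2.
    #[test]
    fn token_offsets_point_past_cls() {
        let tokenizer = albert_tokenizer();
        let sent = sentence_from_forms(&["pierre", "vinken"]);
        let pieces = tokenizer.tokenize(sent);
        assert_eq!(pieces.token_offsets, vec![1, 2]);
    }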
}