syntaxdot_tokenizers/albert.rs

use std::path::Path;

use sentencepiece::SentencePieceProcessor;
use udgraph::graph::{Node, Sentence};

use super::{SentenceWithPieces, Tokenize};
use crate::TokenizerError;

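/// Tokenizer for ALBERT models.
///
/// ALBERT models use a sentencepiece model for tokenization. The
/// sentencepiece model is applied to every token separately, so that the
/// resulting pieces can be mapped back to the tokens that produced them.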
pub struct AlbertTokenizer {
    spp: SentencePieceProcessor,
}

impl AlbertTokenizer {
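    /// Construct a tokenizer from an existing sentencepiece processor.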
    pub fn new(spp: SentencePieceProcessor) -> Self {
        AlbertTokenizer { spp }
    }

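    /// Open the sentencepiece model at the given path and construct a
    /// tokenizer from it.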
    pub fn open<P>(model: P) -> Result<Self, TokenizerError>
    where
        P: AsRef<Path>,
    {
        let spp = SentencePieceProcessor::open(model)?;
        Ok(Self::new(spp))
    }
}

impl From<SentencePieceProcessor> for AlbertTokenizer {
    fn from(spp: SentencePieceProcessor) -> Self {
        AlbertTokenizer::new(spp)
    }
}

impl Tokenize for AlbertTokenizer {
    fn tokenize(&self, sentence: Sentence) -> SentenceWithPieces {
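        // Reserve space for roughly three pieces per token, plus the
        // [CLS]/[SEP] marker pieces.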
        let mut pieces = Vec::with_capacity((sentence.len() + 1) * 3);
        let mut token_offsets = Vec::with_capacity(sentence.len());

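        // Piece sequences start with the [CLS] marker piece.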
        pieces.push(
            self.spp
                .piece_to_id("[CLS]")
                .expect("ALBERT model does not have a [CLS] token")
                .expect("ALBERT model does not have a [CLS] token") as i64,
        );

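        // Encode each token separately, recording at which piece offset
        // the token starts.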
        for token in sentence.iter().filter_map(Node::token) {
            token_offsets.push(pieces.len());

            let token_pieces = self
                .spp
                .encode(token.form())
                .expect("The sentencepiece tokenizer failed");

            if !token_pieces.is_empty() {
                pieces.extend(token_pieces.into_iter().map(|piece| piece.id as i64));
            } else {
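                // Some forms (e.g. pure whitespace) produce no pieces at
                // all. Fall back to the unknown piece, so that every token
                // is represented by at least one piece.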
                pieces.push(self.spp.unk_id() as i64);
            }
        }

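        // Piece sequences end with the [SEP] marker piece.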
        pieces.push(
            self.spp
                .piece_to_id("[SEP]")
                .expect("ALBERT model does not have a [SEP] token")
                .expect("ALBERT model does not have a [SEP] token") as i64,
        );

        SentenceWithPieces {
            pieces: pieces.into(),
            sentence,
            token_offsets,
        }
    }
}

#[cfg(feature = "model-tests")]
#[cfg(test)]
mod tests {
    use std::iter::FromIterator;

    use ndarray::array;
    use sentencepiece::SentencePieceProcessor;
    use udgraph::graph::Sentence;
    use udgraph::token::Token;

    use super::AlbertTokenizer;
    use crate::Tokenize;

    fn sentence_from_forms(forms: &[&str]) -> Sentence {
        Sentence::from_iter(forms.iter().map(|&f| Token::new(f)))
    }

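    // These tests load the sentencepiece model whose path is given by the
    // ALBERT_BASE_V2_SENTENCEPIECE environment variable at compile time.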
    fn albert_tokenizer() -> AlbertTokenizer {
        let spp = SentencePieceProcessor::open(env!("ALBERT_BASE_V2_SENTENCEPIECE")).unwrap();
        AlbertTokenizer::new(spp)
    }

    #[test]
    fn tokenizer_gives_expected_output() {
        let tokenizer = albert_tokenizer();
        let sent = sentence_from_forms(&["pierre", "vinken", "will", "join", "the", "board", "."]);
        let pieces = tokenizer.tokenize(sent);
        assert_eq!(
            pieces.pieces,
            array![2, 5399, 9730, 2853, 129, 1865, 14, 686, 13, 9, 3]
        );
    }

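    // A bare space produces no sentence pieces, so the tokenizer should
    // fall back to the unknown piece (id 1 in this model).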
    #[test]
    fn handles_missing_sentence_pieces() {
        let tokenizer = albert_tokenizer();
        let sent = sentence_from_forms(&["pierre", " ", "vinken"]);
        let pieces = tokenizer.tokenize(sent);
        assert_eq!(pieces.pieces, array![2, 5399, 1, 9730, 2853, 3]);
    }
}