oxidized_transformers/tokenizers/
hf_tokenizer.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
use std::fs::File;
use std::path::{Path, PathBuf};

use snafu::{ensure, OptionExt, ResultExt, Snafu};
use tokenizers::tokenizer::Tokenizer as HuggingFaceTokenizer;

use super::hf_hub::FromHFHub;
use super::pieces::PiecesWithIds;
use super::tokenizer::{FromRepo, Tokenizer, TokenizerEncodeInput};
use crate::error::BoxedError;
use crate::repository::repo::Repo;

/// `HfTokenizer` errors.
#[derive(Debug, Snafu)]
pub enum HfTokenizerError {
    /// Batch encoding failed inside the wrapped Hugging Face tokenizer.
    #[snafu(display("Couldn't encode tokenizer inputs into pieces and ids"))]
    Encode { source: tokenizers::Error },

    /// Batch decoding failed inside the wrapped Hugging Face tokenizer.
    #[snafu(display("Couldn't decode piece identifiers into strings"))]
    Decode { source: tokenizers::Error },

    /// The repository lookup for `tokenizer.json` itself failed (e.g. I/O).
    #[snafu(display("Couldn't open 'tokenizer.json'"))]
    OpenTokenizerJSON { source: BoxedError },

    /// The repository has no `tokenizer.json`; this file is mandatory.
    #[snafu(display("'tokenizer.json' file is missing"))]
    MissingTokenizerJSON,

    /// The repository lookup for the optional `tokenizer_config.json` failed.
    #[snafu(display("Couldn't open 'tokenizer_config.json'"))]
    OpenTokenizerConfigJSON { source: BoxedError },

    /// The repository lookup for the optional `special_tokens_map.json` failed.
    #[snafu(display("Couldn't open 'special_tokens_map.json'"))]
    OpenSpecialTokensMapJSON { source: BoxedError },

    /// A config file existed but could not be opened for reading.
    #[snafu(display("Couldn't open JSON file at {}", path.to_string_lossy()))]
    OpenJSON {
        path: PathBuf,
        source: std::io::Error,
    },

    /// A config file was opened but its contents were not valid JSON of the
    /// expected shape.
    #[snafu(display("Cannot deserialize JSON file at {}", path.to_string_lossy()))]
    DeserializeJSON {
        path: PathBuf,
        source: serde_json::Error,
    },

    /// `tokenizer.json` was found but the `tokenizers` crate rejected it.
    #[snafu(display("Couldn't load Hugging Face tokenizer from config"))]
    LoadHFTokenizer { source: BoxedError },
}

/// Wraps the tokenizers from the HuggingFace `tokenizers` package. It supports a
/// wide range of piece tokenizers, including word piece, byte pair encoding, and
/// sentencepiece unigram tokenizers. This is the tokenizer that should be used
/// in the majority of cases.
pub struct HfTokenizer {
    // The wrapped Hugging Face tokenizer that does the actual work.
    tokenizer: HuggingFaceTokenizer,
    // End-of-sequence piece, if one was found in `tokenizer_config.json` or
    // `special_tokens_map.json` at construction time.
    eos_piece: Option<String>,
}

impl HfTokenizer {
    fn new(
        tokenizer: HuggingFaceTokenizer,
        config: Option<&config::ConfigWithEosToken>,
        special_tokens_map: Option<&config::ConfigWithEosToken>,
    ) -> Self {
        let eos_piece = config
            .and_then(|e| e.eos_token())
            .or_else(|| special_tokens_map.and_then(|e| e.eos_token()));

        Self {
            tokenizer,
            eos_piece: eos_piece.cloned(),
        }
    }

    fn try_parse_json_config(
        path: &impl AsRef<Path>,
    ) -> Result<Option<config::ConfigWithEosToken>, BoxedError> {
        let file = File::open(path.as_ref()).context(OpenJSONSnafu {
            path: path.as_ref(),
        })?;

        let deserialized: Option<config::ConfigWithEosToken> = serde_json::from_reader(file)
            .context(DeserializeJSONSnafu {
                path: path.as_ref().to_owned(),
            })
            .boxed()?;

        Ok(deserialized)
    }
}

impl Tokenizer for HfTokenizer {
    fn encode<V, I>(&self, input: V) -> Result<PiecesWithIds, BoxedError>
    where
        V: AsRef<[TokenizerEncodeInput<I>]>,
        I: AsRef<str>,
    {
        // Translate our input representation into the one the `tokenizers`
        // crate expects.
        let mut batch = Vec::with_capacity(input.as_ref().len());
        for item in input.as_ref() {
            match item {
                TokenizerEncodeInput::RawString(text) => {
                    batch.push(tokenizers::EncodeInput::Single(text.as_ref().into()));
                }
            }
        }

        // Encode the whole batch at once, adding special pieces.
        let encodings = self
            .tokenizer
            .encode_batch(batch, true)
            .context(EncodeSnafu)?;

        // Split each encoding into its id and piece sequences.
        let mut ids = Vec::with_capacity(encodings.len());
        let mut pieces = Vec::with_capacity(encodings.len());
        for encoding in &encodings {
            ids.push(encoding.get_ids().to_owned());
            pieces.push(encoding.get_tokens().to_owned());
        }

        Ok(PiecesWithIds { ids, pieces })
    }

    fn decode<V, I>(&self, input: V, skip_special_pieces: bool) -> Result<Vec<String>, BoxedError>
    where
        V: AsRef<[I]>,
        I: AsRef<[u32]>,
    {
        // Borrow every id sequence as a plain slice for the batch decoder.
        let id_slices: Vec<&[u32]> = input.as_ref().iter().map(|ids| ids.as_ref()).collect();

        self.tokenizer
            .decode_batch(&id_slices, skip_special_pieces)
            .context(DecodeSnafu)
            .boxed()
    }

    fn piece_to_id(&self, piece: impl AsRef<str>) -> Option<u32> {
        self.tokenizer.token_to_id(piece.as_ref())
    }

    fn eos_piece(&self) -> Option<&str> {
        self.eos_piece.as_deref()
    }
}

impl FromRepo for HfTokenizer {
    fn from_repo(repo: &impl Repo) -> Result<Self, BoxedError> {
        let tokenizer_json = repo
            .file("tokenizer.json")
            .context(OpenTokenizerJSONSnafu)
            .boxed()?;
        let tokenizer_config_json = repo
            .file("tokenizer_config.json")
            .context(OpenTokenizerConfigJSONSnafu)
            .boxed()?;
        let special_tokens_map_json = repo
            .file("special_tokens_map.json")
            .context(OpenSpecialTokensMapJSONSnafu)
            .boxed()?;

        ensure!(tokenizer_json.is_some(), MissingTokenizerJSONSnafu);
        let tokenizer = HuggingFaceTokenizer::from_file(tokenizer_json.unwrap())
            .context(LoadHFTokenizerSnafu)?;

        let tokenizer_config = tokenizer_config_json
            .map(|p| Self::try_parse_json_config(&p))
            .transpose()?
            .flatten();

        let special_tokens_map = special_tokens_map_json
            .map(|p| Self::try_parse_json_config(&p))
            .transpose()?
            .flatten();

        Ok(Self::new(
            tokenizer,
            tokenizer_config.as_ref(),
            special_tokens_map.as_ref(),
        ))
    }
}

impl FromHFHub for HfTokenizer {}

mod config {
    use std::collections::HashMap;

    use serde::{Deserialize, Serialize};
    use serde_json::Value;

    /// Represents an EOS token in the tokenizer configuration.
    ///
    /// The token is stored either as a bare string or as an object with a
    /// `content` field; `#[serde(untagged)]` lets serde accept both shapes.
    #[derive(Debug, Clone, Serialize, Deserialize)]
    #[serde(untagged)]
    pub(super) enum EosTokenInConfig {
        Default(String),
        Wrapped { content: Option<String> },
    }

    /// Represents a tokenizer configuration that includes an EOS token.
    /// Primarily used with `tokenizer_config.json` and `special_tokens_map.json` files.
    #[derive(Debug, Clone, Serialize, Deserialize, Default)]
    pub(super) struct ConfigWithEosToken {
        // Optional so that configurations without an EOS token still deserialize.
        #[serde(default)]
        eos_token: Option<EosTokenInConfig>,
        // Captures all unrecognized keys so round-tripping is lossless.
        #[serde(flatten)]
        _extra: HashMap<String, Value>,
    }

    impl ConfigWithEosToken {
        /// Returns the EOS token string, whichever configuration shape it
        /// was stored in.
        // `pub(super)` for consistency with the types above; the module is
        // private to the parent, so the previous `pub(crate)` had no wider
        // effect anyway.
        pub(super) fn eos_token(&self) -> Option<&String> {
            self.eos_token.as_ref().and_then(|e| match e {
                EosTokenInConfig::Default(s) => Some(s),
                EosTokenInConfig::Wrapped { content } => content.as_ref(),
            })
        }
    }
}

#[cfg(test)]
mod tests {
    use candle_core::Device;
    use rstest::{fixture, rstest};
    use tokenizers::{tokenizer::Tokenizer as HuggingFaceTokenizer, EncodeInput, PaddingParams};

    use super::*;

    #[fixture]
    fn short_sample_texts() -> &'static [&'static str] {
        &[
            "I saw a girl with a telescope.",
            "Today we will eat poké bowl, lots of it!",
            "Tokens which are unknown inペ mostで latinが alphabet際 vocabularies.",
        ]
    }

    #[fixture]
    fn long_sample_texts() -> &'static [&'static str] {
        // Two short Wikipedia fragments from:
        // https://en.wikipedia.org/wiki/Kinesis_(keyboard)#Contoured_/_Advantage
        // https://en.wikipedia.org/wiki/Doom_(1993_video_game)#Engine
        // NOTE(review): the stray quote characters inside the fragments look
        // like leftovers from a line-concatenated source, but they are
        // harmless here because both tokenizers receive identical input.
        &[
            r#"The original Model 100, released in 1992, featured a single-piece 
        "contoured design similar to the Maltron keyboard, with the keys laid 
        "out in a traditional QWERTY arrangement, separated into two clusters
        "for the left and right hands.[2] A 1993 article in PC Magazine 
        "described the US$690 (equivalent to $1,300 in 2021) keyboard's
        'arrangement as having "the alphabet keys in precisely vertical
        "(not diagonal) columns in two concave depressions. The Kinesis
        "Keyboard also puts the Backspace, Delete, Enter, Space, Ctrl, Alt,
        "Home, End, Page Up, and Page Down keys under your thumbs in the 
        'middle.[23]"#,
            r#"Doom was programmed largely in the ANSI C programming language, with "
        "a few elements in assembly language. Development was done on NeXT "
        "computers running the NeXTSTEP operating system.[35] The data used by "
        "the game engine, including level designs and graphics files, are "
        'stored in WAD files, short for "Where\'s All the Data?"."#,
        ]
    }

    /// Loads `model_name` both through our `HfTokenizer` wrapper and directly
    /// through the `tokenizers` crate, then checks that the EOS piece,
    /// right/left-padded encodings, attention masks, and decoded strings all
    /// agree between the two.
    fn compare_tokenizer_outputs_with_hf_tokenizer(
        model_name: &str,
        pad_token: Option<&str>,
        eos_piece: Option<&str>,
        texts: &[&str],
    ) {
        let tokenizer = HfTokenizer::from_hf_hub(model_name, None)
            .expect("Failed to load tokenizer from HF Hub");
        let mut hf_tokenizer = HuggingFaceTokenizer::from_pretrained(model_name, None)
            .expect("Failed to load HF tokenizer from HF Hub");

        // Both sides are already `Option<&str>`; no `as_deref` needed.
        assert_eq!(tokenizer.eos_piece(), eos_piece);

        let our_input: Vec<TokenizerEncodeInput<_>> = texts.iter().map(|s| (*s).into()).collect();
        let hf_input: Vec<EncodeInput> = texts.iter().map(|s| (*s).into()).collect();

        // When no pad token is given, fall back to the crate's default one.
        let mut right_padding = PaddingParams::default();
        right_padding.pad_token = pad_token
            .unwrap_or(right_padding.pad_token.as_ref())
            .to_string();
        let mut left_padding = PaddingParams::default();
        left_padding.direction = tokenizers::PaddingDirection::Left;
        left_padding.pad_token = pad_token
            .unwrap_or(left_padding.pad_token.as_ref())
            .to_string();

        let our_encoded = tokenizer.encode(our_input).expect("Failed to encode input");

        // Right padding.
        let our_encoded_padded_right = our_encoded
            .padded_tensor(right_padding.pad_id, false, &Device::Cpu)
            .expect("Failed to pad tensor")
            .to_vec2::<u32>()
            .expect("Failed to convert tensor to vec2");
        let our_encoded_attn_mask_padded_right = our_encoded
            .attention_mask(false, &Device::Cpu)
            .expect("Cannot create attention mask");
        // The boolean mask may come back rank-2 or with two extra singleton
        // dimensions; squeeze the latter down before comparing.
        let our_encoded_attn_mask_padded_right =
            match our_encoded_attn_mask_padded_right.bool_mask().dims2() {
                Ok((_, _)) => our_encoded_attn_mask_padded_right
                    .bool_mask()
                    .to_vec2::<u32>()
                    .expect("Cannot convert mask to vec2"),
                _ => our_encoded_attn_mask_padded_right
                    .bool_mask()
                    .squeeze(1)
                    .expect("Failed to squeeze attn mask")
                    .squeeze(1)
                    .expect("Failed to squeeze attn mask")
                    .to_vec2::<u32>()
                    .expect("Cannot convert mask to vec2"),
            };
        let hf_encoded_padded_right = hf_tokenizer
            .with_padding(Some(right_padding.clone()))
            .encode_batch(hf_input.clone(), true)
            .expect("Failed to encode input");

        for (ours, hf) in our_encoded_padded_right
            .iter()
            .zip(hf_encoded_padded_right.iter())
        {
            assert_eq!(ours.as_slice(), hf.get_ids());
        }

        for (ours, hf) in our_encoded_attn_mask_padded_right
            .iter()
            .zip(hf_encoded_padded_right.iter())
        {
            assert_eq!(ours.as_slice(), hf.get_attention_mask());
        }

        // Left padding.
        let our_encoded_padded_left = our_encoded
            .padded_tensor(left_padding.pad_id, true, &Device::Cpu)
            .expect("Failed to pad tensor")
            .to_vec2::<u32>()
            .expect("Failed to convert tensor to vec2");
        let our_encoded_attn_mask_padded_left = our_encoded
            .attention_mask(true, &Device::Cpu)
            .expect("Cannot create attention mask");
        let our_encoded_attn_mask_padded_left =
            match our_encoded_attn_mask_padded_left.bool_mask().dims2() {
                Ok((_, _)) => our_encoded_attn_mask_padded_left
                    .bool_mask()
                    .to_vec2::<u32>()
                    .expect("Cannot convert mask to vec2"),
                _ => our_encoded_attn_mask_padded_left
                    .bool_mask()
                    .squeeze(1)
                    .expect("Failed to squeeze attn mask")
                    .squeeze(1)
                    .expect("Failed to squeeze attn mask")
                    .to_vec2::<u32>()
                    .expect("Cannot convert mask to vec2"),
            };
        let hf_encoded_padded_left = hf_tokenizer
            .with_padding(Some(left_padding.clone()))
            .encode_batch(hf_input.clone(), true)
            .expect("Failed to encode input");

        for (ours, hf) in our_encoded_padded_left
            .iter()
            .zip(hf_encoded_padded_left.iter())
        {
            assert_eq!(ours.as_slice(), hf.get_ids());
        }

        for (ours, hf) in our_encoded_attn_mask_padded_left
            .iter()
            .zip(hf_encoded_padded_left.iter())
        {
            assert_eq!(ours.as_slice(), hf.get_attention_mask());
        }

        // Decoding. Special pieces (including padding) are skipped on both
        // sides, so decoding the padded HF ids matches our unpadded ids.
        let our_decoded = tokenizer
            .decode(our_encoded.ids.iter(), true)
            .expect("Failed to decode input");
        let hf_decoded = hf_tokenizer
            .with_padding(Some(right_padding.clone()))
            .decode_batch(
                hf_encoded_padded_right
                    .iter()
                    .map(|v| v.get_ids())
                    .collect::<Vec<_>>()
                    .as_slice(),
                true,
            )
            .expect("Failed to decode input");

        assert_eq!(our_decoded, hf_decoded);
    }

    #[rstest]
    #[case("bert-base-cased", None, None)]
    #[case("camembert-base", None, None)]
    #[case("roberta-base", None, None)]
    #[case("xlm-roberta-base", None, None)]
    #[case("EleutherAI/gpt-neox-20b", Some("[PAD]"), Some("<|endoftext|>"))]
    #[case("ausboss/llama-30b-supercot", Some("</s>"), Some("</s>"))]
    #[case("tiiuae/falcon-7b", Some("<|endoftext|>"), Some("<|endoftext|>"))]
    fn tokenizer_test_against_hugging_face_short(
        #[case] model_name: &str,
        #[case] pad_token: Option<&str>,
        #[case] eos_piece: Option<&str>,
        short_sample_texts: &[&str],
    ) {
        compare_tokenizer_outputs_with_hf_tokenizer(
            model_name,
            pad_token,
            eos_piece,
            short_sample_texts,
        );
    }

    // Bug fix: this test previously requested the `short_sample_texts`
    // fixture, so the long fragments above were never exercised.
    #[rstest]
    #[case("bert-base-cased", None, None)]
    #[case("camembert-base", None, None)]
    #[case("roberta-base", None, None)]
    #[case("xlm-roberta-base", None, None)]
    #[case("EleutherAI/gpt-neox-20b", Some("[PAD]"), Some("<|endoftext|>"))]
    #[case("ausboss/llama-30b-supercot", Some("</s>"), Some("</s>"))]
    #[case("tiiuae/falcon-7b", Some("<|endoftext|>"), Some("<|endoftext|>"))]
    fn tokenizer_test_against_hugging_face_long(
        #[case] model_name: &str,
        #[case] pad_token: Option<&str>,
        #[case] eos_piece: Option<&str>,
        long_sample_texts: &[&str],
    ) {
        compare_tokenizer_outputs_with_hf_tokenizer(
            model_name,
            pad_token,
            eos_piece,
            long_sample_texts,
        );
    }
}