tokenizers/models/bpe/
mod.rs

1//! [Byte Pair Encoding](https://www.aclweb.org/anthology/P16-1162/) model.
2use std::{iter, mem};
3
4mod model;
5mod serialization;
6pub mod trainer;
7mod word;
8
/// Two `u32` values grouped as a tuple.
///
/// NOTE(review): presumably the ids of two adjacent tokens that are candidates
/// for a BPE merge — confirm against the usages in `model`, `trainer` and `word`.
type Pair = (u32, u32);
10
11/// Errors that can be encountered while using or constructing a `BPE` model.
12#[derive(thiserror::Error, Debug)]
13pub enum Error {
14    /// An error encountered while reading files mainly.
15    #[error("IoError: {0}")]
16    Io(#[from] std::io::Error),
17    /// An error forwarded from Serde, while parsing JSON
18    #[error("JsonError: {0}")]
19    JsonError(#[from] serde_json::Error),
20    /// When the vocab.json file is in the wrong format
21    #[error("Bad vocabulary json file")]
22    BadVocabulary,
23    /// When the merges.txt file is in the wrong format. This error holds the line
24    /// number of the line that caused the error.
25    #[error("Merges text file invalid at line {0}")]
26    BadMerges(usize),
27    /// If a token found in merges, is not in the vocab
28    #[error("Token `{0}` out of vocabulary")]
29    MergeTokenOutOfVocabulary(String),
30    /// If the provided unk token is out of vocabulary
31    #[error("Unk token `{0}` not found in the vocabulary")]
32    UnkTokenOutOfVocabulary(String),
33    /// Dropout not between 0 and 1.
34    #[error("Dropout should be between 0 and 1, inclusive")]
35    InvalidDropout,
36}
37
38/// Provides access to the `FirstLastIterator` to any Iterator
39pub(crate) trait WithFirstLastIterator: Iterator + Sized {
40    fn with_first_and_last(self) -> FirstLastIterator<Self>;
41}
42
43impl<I> WithFirstLastIterator for I
44where
45    I: Iterator,
46{
47    fn with_first_and_last(self) -> FirstLastIterator<Self> {
48        FirstLastIterator {
49            first: true,
50            iter: self.peekable(),
51        }
52    }
53}
54
/// Iterator adaptor that tells, for every item, whether it is the first
/// and/or the last one produced by the wrapped iterator.
///
/// Items are yielded as `(is_first, is_last, item)`. For a single-element
/// iterator both flags are `true` on that element.
pub(crate) struct FirstLastIterator<I>
where
    I: Iterator,
{
    /// Flag handed out with the next item; cleared by every call to `next`.
    first: bool,
    /// Wrapped iterator, peekable so the end can be detected one item ahead.
    iter: iter::Peekable<I>,
}

impl<I> Iterator for FirstLastIterator<I>
where
    I: Iterator,
{
    /// (is_first, is_last, item)
    type Item = (bool, bool, I::Item);

    /// Yields the next item together with its first/last flags, or `None`
    /// once the wrapped iterator is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        // Read and clear the flag in one step (`bool::default()` is `false`).
        // This happens before pulling the item, so the flag is cleared even
        // when the wrapped iterator turns out to be empty.
        let first = mem::take(&mut self.first);
        let item = self.iter.next()?;
        // Nothing left to peek at means `item` was the last one.
        let last = self.iter.peek().is_none();
        Some((first, last, item))
    }

    /// Forwards the wrapped iterator's bounds: this adaptor yields exactly one
    /// item per wrapped item, so consumers such as `collect` can preallocate.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
78
79// Re-export
80pub use model::*;
81pub use trainer::*;
82use word::*;