tokenizers/models/bpe/
mod.rs1use std::{iter, mem};
3
4mod model;
5mod serialization;
6pub mod trainer;
7mod word;
8
/// A pair of `u32` values.
///
/// NOTE(review): presumably the ids of two adjacent tokens considered for a
/// merge — confirm against the `model`, `trainer` and `word` submodules.
type Pair = (u32, u32);
10
11#[derive(thiserror::Error, Debug)]
13pub enum Error {
14 #[error("IoError: {0}")]
16 Io(#[from] std::io::Error),
17 #[error("JsonError: {0}")]
19 JsonError(#[from] serde_json::Error),
20 #[error("Bad vocabulary json file")]
22 BadVocabulary,
23 #[error("Merges text file invalid at line {0}")]
26 BadMerges(usize),
27 #[error("Token `{0}` out of vocabulary")]
29 MergeTokenOutOfVocabulary(String),
30 #[error("Unk token `{0}` not found in the vocabulary")]
32 UnkTokenOutOfVocabulary(String),
33 #[error("Dropout should be between 0 and 1, inclusive")]
35 InvalidDropout,
36}
37
/// Extension trait that adds [`with_first_and_last`] to every iterator.
///
/// [`with_first_and_last`]: WithFirstLastIterator::with_first_and_last
pub(crate) trait WithFirstLastIterator: Iterator + Sized {
    /// Wraps `self` in a [`FirstLastIterator`], which yields
    /// `(is_first, is_last, item)` for every item of the original iterator.
    fn with_first_and_last(self) -> FirstLastIterator<Self>;
}

impl<I> WithFirstLastIterator for I
where
    I: Iterator,
{
    fn with_first_and_last(self) -> FirstLastIterator<Self> {
        FirstLastIterator {
            first: true,
            // Peeking is what lets `next` tell whether another item follows.
            iter: self.peekable(),
        }
    }
}

/// Iterator adapter that tags each item with two flags: whether it is the
/// first item yielded and whether it is the last one.
///
/// A single-item iterator yields `(true, true, item)`; an empty iterator
/// yields nothing. To compute the "last" flag the adapter peeks at the wrapped
/// iterator, so it reads one item ahead of what it has handed out.
pub(crate) struct FirstLastIterator<I>
where
    I: Iterator,
{
    /// `true` until `next` has been called for the first time.
    first: bool,
    /// The wrapped iterator, made peekable to detect the final item.
    iter: iter::Peekable<I>,
}

impl<I> Iterator for FirstLastIterator<I>
where
    I: Iterator,
{
    /// `(is_first, is_last, item)`.
    type Item = (bool, bool, I::Item);

    fn next(&mut self) -> Option<Self::Item> {
        // Read and clear the flag before pulling the item, so it is reset by
        // the first call even when that call yields nothing.
        let is_first = mem::take(&mut self.first);
        let item = self.iter.next()?;
        // No further item available means the current one is the last.
        let is_last = self.iter.peek().is_none();
        Some((is_first, is_last, item))
    }

    /// Forwards the wrapped iterator's bounds (the adapter yields exactly one
    /// tuple per wrapped item), which lets `collect`/`extend` preallocate.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
78
79pub use model::*;
81pub use trainer::*;
82use word::*;