1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
//! # Token Vocabulary Index
use crate::{
alloc::vec::Vec,
types::{
TokenType,
WCHashSet,
},
};
/// Shared interface implemented by token vocabularies.
///
/// The type parameter `T` is the token type the vocabulary hands out.
pub trait VocabIndex<T: TokenType>: Clone + Send + Sync {
    /// The token type: `T`.
    type Token: TokenType;

    /// Collects every token of the vocabulary into a set.
    fn tokens(&self) -> WCHashSet<T>;

    /// Reports how many tokens the vocabulary holds.
    fn len(&self) -> usize;

    /// Reports whether the vocabulary holds no tokens, i.e. [`Self::len`] is zero.
    fn is_empty(&self) -> bool {
        matches!(self.len(), 0)
    }

    /// Finds the highest ranked token.
    ///
    /// ## Returns
    /// The largest value found in [`Self::tokens`], or `None` when that set is empty.
    fn max_token(&self) -> Option<T> {
        // Copy each token out of the temporary set first, then pick the largest.
        self.tokens().iter().copied().max()
    }

    /// Produces all `(Vec<u8>, T)` pairs in the vocabulary.
    ///
    /// ## Returns
    /// An iterator over byte vectors paired with their corresponding tokens.
    fn span_pairs(&self) -> impl Iterator<Item = (Vec<u8>, T)>;
}