pub struct AlephAlphaTokenizer { /* private fields */ }
The Tokenizer. Use AlephAlphaTokenizer::from_vocab to create an instance.
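A typical end-to-end use, as a minimal sketch (this assumes a wordpiece-style vocab.txt file is available; see from_vocab below):

use aleph_alpha_tokenizer::AlephAlphaTokenizer;

// load the vocabulary (one token per line, wordpiece layout)
let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap();

// tokenize a text, collecting token ids and source byte ranges
let mut ids: Vec<i64> = Vec::new();
let mut ranges = Vec::new();
tokenizer.tokens_into("Ein Beispiel", &mut ids, &mut ranges, None);

// derive the attention mask for the ids
let mut attns: Vec<i64> = Vec::new();
AlephAlphaTokenizer::attentions_into(&ids, &mut attns);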
Implementations
impl AlephAlphaTokenizer
pub fn from_vocab(path: &str) -> Result<Self, Box<dyn Error + Send + Sync>>
Creates a tokenizer from the vocabulary.
For now, we assume the following tokens / IDs:

[CLS] is the classification token (and, if present, is used as prefix)
[SEP] is the separator (and, if present, is used as suffix)
[PAD] is the padding token and is in position 0
[UNK] is the unknown token specifier
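For instance, a minimal sketch assuming a vocab.txt laid out as above ([PAD] at position 0):

use aleph_alpha_tokenizer::AlephAlphaTokenizer;
use std::error::Error;

fn main() -> Result<(), Box<dyn Error + Send + Sync>> {
    let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt")?;
    assert!(tokenizer.is_special(0i32)); // [PAD] sits at position 0
    Ok(())
}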
pub fn char_ranges<'i>(
    text: &'i str,
    ranges: impl Iterator<Item = &'i Range<usize>> + 'i,
) -> impl Iterator<Item = (Range<usize>, Range<usize>)> + 'i
Wraps a UTF-8 byte-range iterator to produce an iterator of (byte-range, character-range) tuples.
§Examples
use aleph_alpha_tokenizer::AlephAlphaTokenizer;

let text = "äußerst";
let ranges = &[0usize..3, 3..7, 7..9];
assert_eq!(
    &[(0..3, 0..2), (3..7, 2..5), (7..9, 5..7)],
    &AlephAlphaTokenizer::char_ranges(text, ranges.iter()).collect::<Vec<_>>()[..]
);
pub fn tokens_into<T: TokenID>(
    &self,
    text: &str,
    token_ids: &mut Vec<T>,
    token_ranges: &mut Vec<Range<usize>>,
    words: Option<&mut Vec<Range<usize>>>,
)
Tokenizes the given text into a &mut Vec<T> for the ids and a &mut Vec<Range<usize>> for the source ranges, respectively, optionally filling a words &mut Vec<Range<usize>> with ranges into the tokens array denoting the words' token indices.
This works by first splitting by whitespace, then gathering the longest prefix in our token tree (first the starters, then the followers) until the word is complete, or inserting a [UNK] token if the word couldn't be fully tokenized. This is what wordpiece does, too.
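To make the longest-prefix idea concrete, here is a standalone sketch of wordpiece-style greedy matching. It uses plain hash sets instead of the crate's token tree, so it illustrates the behavior, not the actual implementation:

use std::collections::HashSet;

// Greedily split one whitespace-delimited word into the longest matching
// pieces: `starters` may begin a word, `followers` continue it (stored here
// without the "##" marker). Returns None if the word cannot be fully
// tokenized, in which case the caller emits a [UNK] token instead.
fn wordpiece_split<'v>(
    word: &str,
    starters: &'v HashSet<String>,
    followers: &'v HashSet<String>,
) -> Option<Vec<&'v str>> {
    let mut pieces = Vec::new();
    let mut rest = word;
    while !rest.is_empty() {
        let vocab = if pieces.is_empty() { starters } else { followers };
        // take the longest prefix of `rest` found in the vocabulary
        let tok = (1..=rest.len())
            .rev()
            .filter(|&i| rest.is_char_boundary(i))
            .find_map(|i| vocab.get(&rest[..i]))?;
        rest = &rest[tok.len()..];
        pieces.push(tok.as_str());
    }
    Some(pieces)
}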
Note: The output Vecs will be cleared before appending tokens.
§Examples
use aleph_alpha_tokenizer::AlephAlphaTokenizer;
let source_text = "Ein interessantes Beispiel";
let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap();
let mut ids: Vec<i32> = Vec::new();
let mut ranges = Vec::new();
tokenizer.tokens_into(source_text, &mut ids, &mut ranges, None);
assert_eq!(&[3, 198, 23181, 26902, 2249, 4], &ids[..]);
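To also recover which tokens belong to which word, pass Some(&mut words). A sketch (the exact ranges, e.g. around special tokens, depend on the vocabulary):

use aleph_alpha_tokenizer::AlephAlphaTokenizer;

let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap();
let mut ids: Vec<i32> = Vec::new();
let mut ranges = Vec::new();
let mut words = Vec::new();
tokenizer.tokens_into("Ein interessantes Beispiel", &mut ids, &mut ranges, Some(&mut words));
for word in &words {
    // each entry is a range of indices into `ids` covering one word's tokens
    println!("token ids for this word: {:?}", &ids[word.clone()]);
}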
pub fn text_of<T: TokenID>(&self, token_id: T) -> &str
Gets the text of this token.
§Examples
use aleph_alpha_tokenizer::AlephAlphaTokenizer;
let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap();
assert_eq!("[PAD]", tokenizer.text_of(0));
pub fn texts_of<'t, T: TokenID>(&'t self, token_ids: &[T]) -> Vec<&'t str>
Gets the texts of the tokens.
§Examples
use aleph_alpha_tokenizer::AlephAlphaTokenizer;
let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap();
assert_eq!(
vec!["[CLS]", "Super", "[SEP]"],
tokenizer.texts_of(&[3, 4285, 4])
);
pub fn is_special<T: TokenID>(&self, token_id: T) -> bool
Determines whether this token is a special token.
Special tokens are e.g. [CLS], [SEP], [PAD] or [UNK].
§Examples
use aleph_alpha_tokenizer::AlephAlphaTokenizer;
let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap();
assert!(tokenizer.is_special(0i32)); // [PAD]
assert!(tokenizer.is_special(3i32)); // [CLS]
assert!(tokenizer.is_special(4i32)); // [SEP]
assert!(!tokenizer.is_special(42i32));
pub fn attention<T: TokenID, U: TokenID>(token_id: T) -> U
Calculates the required attention for this token: 0 for the padding token, 1 for everything else.
§Examples
use aleph_alpha_tokenizer::AlephAlphaTokenizer;
let pad_attention: i64 = AlephAlphaTokenizer::attention(0u64);
let token_attention: f64 = AlephAlphaTokenizer::attention(99i32);
assert_eq!(pad_attention, 0);
assert_eq!(token_attention, 1.0f64);
pub fn attentions_into<T: TokenID, U: TokenID>(
    token_ids: &[T],
    attns: &mut Vec<U>,
)
Given a slice of token IDs, appends the corresponding attentions to the given Vec.
§Examples
use aleph_alpha_tokenizer::AlephAlphaTokenizer;
let mut attns: Vec<i32> = Vec::new();
AlephAlphaTokenizer::attentions_into(&[3, 4285, 4, 0, 0], &mut attns);
assert_eq!(&attns[..], &[1, 1, 1, 0, 0]);