[−][src]Struct aleph_alpha_tokenizer::AlephAlphaTokenizer
The Tokenizer. Use AlephAlphaTokenizer::from_vocab
to create an
instance.
Implementations
impl AlephAlphaTokenizer
[src]
pub fn from_vocab(path: &str) -> Result<Self, Box<dyn Error + Send + Sync>>
[src]
Creates a tokenizer from the vocabulary.
For now, we assume the following tokens / IDs:
[CLS] is the classification token (and if present is used as prefix)
[SEP] is the separator token (and if present is used as suffix)
[PAD] is the padding token and is in position 0
[UNK] is the unknown token specifier
pub fn char_ranges<'i>(
text: &'i str,
ranges: impl Iterator<Item = &'i Range<usize>> + 'i
) -> impl Iterator<Item = (Range<usize>, Range<usize>)> + 'i
[src]
text: &'i str,
ranges: impl Iterator<Item = &'i Range<usize>> + 'i
) -> impl Iterator<Item = (Range<usize>, Range<usize>)> + 'i
Wraps a UTF-8 byte range iterator to produce a tuple of (byte-range, character-range).
Examples
let text = "äußerst"; let ranges = &[0usize..3, 3..7, 7..9]; assert_eq!(&[(0..3, 0..2), (3..7, 2..5), (7..9, 5..7)], &AlephAlphaTokenizer::char_ranges(text, ranges.iter()).collect::<Vec<_>>()[..]);
pub fn tokens_into<T: TokenID>(
&self,
text: &str,
token_ids: &mut Vec<T>,
token_ranges: &mut Vec<Range<usize>>,
words: Option<&mut Vec<Range<usize>>>
)
[src]
&self,
text: &str,
token_ids: &mut Vec<T>,
token_ranges: &mut Vec<Range<usize>>,
words: Option<&mut Vec<Range<usize>>>
)
Tokenize the given text into a &mut Vec&lt;u64&gt;
for ids and
&mut Vec<Range<usize>>
for source ranges respectively, optionally
filling a words
&mut Vec<Range>
with ranges into the tokens array
with the words' token indices.
This works by first splitting by whitespace, then gathering the longest
prefix in our token tree (first the starters, then the followers) until
the word is complete, or inserting a [UNK]
token if the word couldn't
fully be tokenized. This is what WordPiece does, too.
Note: The output Vec
s will be cleared before appending tokens.
Examples
use aleph_alpha_tokenizer::AlephAlphaTokenizer; let source_text = "Ein interessantes Beispiel"; let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap(); let mut ids: Vec<i32> = Vec::new(); let mut ranges = Vec::new(); tokenizer.tokens_into(source_text, &mut ids, &mut ranges, None); assert_eq!(&[3, 198, 23181, 26902, 2249, 4], &ids[..]);
pub fn text_of<T: TokenID>(&self, token_id: T) -> &str
[src]
Gets the text of this token.
Examples
use aleph_alpha_tokenizer::AlephAlphaTokenizer; let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap(); assert_eq!("[PAD]", tokenizer.text_of(0));
pub fn texts_of<'t, T: TokenID>(&'t self, token_ids: &[T]) -> Vec<&'t str>
[src]
Gets the texts of the tokens.
Examples
use aleph_alpha_tokenizer::AlephAlphaTokenizer; let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap(); assert_eq!( vec!["[CLS]", "Super", "[SEP]"], tokenizer.texts_of(&[3, 4285, 4]) );
pub fn is_special<T: TokenID>(&self, token_id: T) -> bool
[src]
Determines whether this token is a special token.
Special tokens are e.g. [CLS]
, [SEP]
, [PAD]
or [UNK]
.
Examples
use aleph_alpha_tokenizer::AlephAlphaTokenizer; let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap(); assert!(tokenizer.is_special(0i32)); // [PAD] assert!(tokenizer.is_special(3i32)); // [CLS] assert!(tokenizer.is_special(4i32)); // [SEP] assert!(!tokenizer.is_special(42i32));
pub fn attention<T: TokenID, U: TokenID>(token_id: T) -> U
[src]
Calculates the required attention for this token.
Examples
use aleph_alpha_tokenizer::AlephAlphaTokenizer; let pad_attention: i64 = AlephAlphaTokenizer::attention(0u64); let token_attention: f64 = AlephAlphaTokenizer::attention(99i32); assert_eq!(pad_attention, 0); assert_eq!(token_attention, 1.0f64);
pub fn attentions_into<T: TokenID, U: TokenID>(
token_ids: &[T],
attns: &mut Vec<U>
)
[src]
token_ids: &[T],
attns: &mut Vec<U>
)
Given a slice of [u64]
s, appends the attentions to the given Vec
.
Examples
use aleph_alpha_tokenizer::AlephAlphaTokenizer; let mut attns: Vec<i32> = Vec::new(); AlephAlphaTokenizer::attentions_into(&[3, 4285, 4, 0, 0], &mut attns); assert_eq!(&attns[..], &[1, 1, 1, 0, 0]);
pub fn save_vocab(
&self,
vocab_path: PathBuf
) -> Result<PathBuf, Box<dyn Error + Send + Sync>>
[src]
&self,
vocab_path: PathBuf
) -> Result<PathBuf, Box<dyn Error + Send + Sync>>
Save the vocabulary back to a file
Auto Trait Implementations
impl RefUnwindSafe for AlephAlphaTokenizer
impl Send for AlephAlphaTokenizer
impl Sync for AlephAlphaTokenizer
impl Unpin for AlephAlphaTokenizer
impl UnwindSafe for AlephAlphaTokenizer
Blanket Implementations
impl<T> Any for T where
T: 'static + ?Sized,
[src]
T: 'static + ?Sized,
impl<T> Borrow<T> for T where
T: ?Sized,
[src]
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
[src]
T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
[src]
impl<T> From<T> for T
[src]
impl<T, U> Into<U> for T where
U: From<T>,
[src]
U: From<T>,
impl<T, U> TryFrom<U> for T where
U: Into<T>,
[src]
U: Into<T>,
type Error = Infallible
The type returned in the event of a conversion error.
fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>
[src]
impl<T, U> TryInto<U> for T where
U: TryFrom<T>,
[src]
U: TryFrom<T>,