Struct aleph_alpha_tokenizer::AlephAlphaTokenizer

pub struct AlephAlphaTokenizer { /* fields omitted */ }

The Tokenizer. Use AlephAlphaTokenizer::from_vocab to create an instance.

Implementations

impl AlephAlphaTokenizer

pub fn from_vocab(path: &str) -> Result<Self, Box<dyn Error + Send + Sync>>

Creates a tokenizer from the vocabulary file at the given path.

For now, we assume the following tokens / IDs:

  • [CLS] is classification (and if present is used as prefix)
  • [SEP] is separator (and if present is used as suffix)
  • [PAD] is padding and is in position 0
  • [UNK] is the unknown token specifier
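
For illustration, here is a minimal sketch of loading a vocabulary and checking the special tokens listed above; the file name vocab.txt is illustrative and assumes the usual one-token-per-line layout:

use aleph_alpha_tokenizer::AlephAlphaTokenizer;

let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap();
// [PAD] is expected at ID 0, and the special tokens are recognized as such.
assert_eq!("[PAD]", tokenizer.text_of(0));
assert!(tokenizer.is_special(0i32));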

pub fn char_ranges<'i>(
    text: &'i str,
    ranges: impl Iterator<Item = &'i Range<usize>> + 'i
) -> impl Iterator<Item = (Range<usize>, Range<usize>)> + 'i

Wraps a UTF-8 byte-range iterator, producing an iterator over (byte range, character range) tuples.

Examples

use aleph_alpha_tokenizer::AlephAlphaTokenizer;

let text = "äußerst";
let ranges = &[0usize..3, 3..7, 7..9];
assert_eq!(&[(0..3, 0..2), (3..7, 2..5), (7..9, 5..7)],
    &AlephAlphaTokenizer::char_ranges(text, ranges.iter()).collect::<Vec<_>>()[..]);

pub fn tokens_into<T: TokenID>(
    &self,
    text: &str,
    token_ids: &mut Vec<T>,
    token_ranges: &mut Vec<Range<usize>>,
    words: Option<&mut Vec<Range<usize>>>
)

Tokenizes the given text, appending the token IDs to the given &mut Vec<T> and the corresponding source byte ranges to the &mut Vec<Range<usize>>; optionally, a words &mut Vec<Range<usize>> is filled with ranges into the token arrays, one per word, giving each word's span of token indices.

This works by first splitting on whitespace, then repeatedly taking the longest prefix found in our token tree (first among the starters, then among the followers) until the word is complete, or inserting an [UNK] token if the word could not be fully tokenized. This is what WordPiece does, too.
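
The following is a minimal, self-contained sketch of that greedy longest-prefix strategy; the function name, the plain HashMap vocabulary, and the unk_id parameter are illustrative stand-ins for the crate's internal token tree and special-token handling:

use std::collections::HashMap;

// Greedily match the longest known prefix of a word, marking word-internal
// pieces ("followers") with the "##" prefix, and fall back to a single
// unknown token if no prefix matches at all.
fn wordpiece_word(word: &str, vocab: &HashMap<String, u32>, unk_id: u32) -> Vec<u32> {
    let mut ids = Vec::new();
    let mut start = 0;
    while start < word.len() {
        let mut end = word.len();
        let mut found = None;
        while end > start {
            let piece = if start == 0 {
                word[start..end].to_string()
            } else {
                format!("##{}", &word[start..end])
            };
            if let Some(&id) = vocab.get(&piece) {
                found = Some((id, end));
                break;
            }
            // Step back one character (not one byte) to stay on a char boundary.
            end = word[start..end]
                .char_indices()
                .next_back()
                .map(|(i, _)| start + i)
                .unwrap_or(start);
        }
        match found {
            Some((id, matched_end)) => {
                ids.push(id);
                start = matched_end;
            }
            // No prefix of the remaining word is in the vocabulary: emit [UNK].
            None => return vec![unk_id],
        }
    }
    ids
}

The crate's own implementation keeps starters and followers in a token tree rather than a HashMap, but the matching strategy is the same.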

Note: The output Vecs will be cleared before appending tokens.

Examples

use aleph_alpha_tokenizer::AlephAlphaTokenizer;

let source_text = "Ein interessantes Beispiel";
let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap();
let mut ids: Vec<i32> = Vec::new();
let mut ranges = Vec::new();
tokenizer.tokens_into(source_text, &mut ids, &mut ranges, None);
assert_eq!(&[3, 198, 23181, 26902, 2249, 4], &ids[..]);

pub fn text_of<T: TokenID>(&self, token_id: T) -> &str

Gets the text of this token.

Examples

use aleph_alpha_tokenizer::AlephAlphaTokenizer;
let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap();

assert_eq!("[PAD]", tokenizer.text_of(0));

pub fn texts_of<'t, T: TokenID>(&'t self, token_ids: &[T]) -> Vec<&'t str>

Gets the texts of the tokens.

Examples

use aleph_alpha_tokenizer::AlephAlphaTokenizer;
let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap();

assert_eq!(
    vec!["[CLS]", "Super", "[SEP]"],
    tokenizer.texts_of(&[3, 4285, 4])
);

pub fn is_special<T: TokenID>(&self, token_id: T) -> bool

Determines whether this token is a special token.

Special tokens are e.g. [CLS], [SEP], [PAD] or [UNK].

Examples

use aleph_alpha_tokenizer::AlephAlphaTokenizer;
let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap();

assert!(tokenizer.is_special(0i32)); // [PAD]
assert!(tokenizer.is_special(3i32));  // [CLS]
assert!(tokenizer.is_special(4i32));  // [SEP]
assert!(!tokenizer.is_special(42i32));

pub fn attention<T: TokenID, U: TokenID>(token_id: T) -> U

Calculates the attention value for this token: 0 for padding (token ID 0), 1 for any other token, as the example below shows.

Examples

use aleph_alpha_tokenizer::AlephAlphaTokenizer;

let pad_attention: i64 = AlephAlphaTokenizer::attention(0u64);
let token_attention: f64 = AlephAlphaTokenizer::attention(99i32);
assert_eq!(pad_attention, 0);
assert_eq!(token_attention, 1.0f64);

pub fn attentions_into<T: TokenID, U: TokenID>(
    token_ids: &[T],
    attns: &mut Vec<U>
)

Given a slice of token IDs, appends the corresponding attention values to the given Vec.

Examples

use aleph_alpha_tokenizer::AlephAlphaTokenizer;

let mut attns: Vec<i32> = Vec::new();
AlephAlphaTokenizer::attentions_into(&[3, 4285, 4, 0, 0], &mut attns);
assert_eq!(&attns[..], &[1, 1, 1, 0, 0]);

pub fn save_vocab(
    &self,
    vocab_path: PathBuf
) -> Result<PathBuf, Box<dyn Error + Send + Sync>>

Saves the vocabulary back to a file, returning the written path on success.
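
A minimal usage sketch; the file names are illustrative:

use std::path::PathBuf;
use aleph_alpha_tokenizer::AlephAlphaTokenizer;

let tokenizer = AlephAlphaTokenizer::from_vocab("vocab.txt").unwrap();
// Write the vocabulary to a new (hypothetical) location; the written path is
// returned on success.
let written: PathBuf = tokenizer.save_vocab(PathBuf::from("vocab-copy.txt")).unwrap();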

Auto Trait Implementations

Blanket Implementations

impl<T> Any for T where
    T: 'static + ?Sized

impl<T> Borrow<T> for T where
    T: ?Sized

impl<T> BorrowMut<T> for T where
    T: ?Sized

impl<T> From<T> for T

impl<T, U> Into<U> for T where
    U: From<T>,

impl<T, U> TryFrom<U> for T where
    U: Into<T>,

type Error = Infallible

The type returned in the event of a conversion error.

impl<T, U> TryInto<U> for T where
    U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.