Skip to main content

TokenDict

Struct TokenDict 

Source
pub struct TokenDict { /* private fields */ }
Expand description

a simple dictionary for dictionary-based tokenization, in which single bytes implicitly form the lower 256 ids. It is reference-counted and cheap to clone.

Implementations§

Source§

impl TokenDict

Source

pub fn detokenize<I: IntoIterator>( &self, tokens: I, ) -> Detokenization<I::IntoIter>
where I::Item: Val<u32>,

decodes the tokens into bytes

Source

pub fn detoken_iter<I: IntoIterator>( &self, tokens: I, ) -> impl Iterator<Item = Token>
where I::Item: Val<u32>,

creates an iterator over the tokens that the given token ids refer to

Source

pub fn detokenize_str<I: IntoIterator>( &self, tokens: I, ) -> impl Iterator<Item = char>
where I::Item: Val<u32>,

decodes the tokens into chars, replacing invalid Unicode with the replacement character

Examples found in repository?
examples/japanese.rs (line 4)
1pub fn main(){
2	let tokenizer:TokenDict=BufReader::new(File::open("words.txt").unwrap()).lines().filter_map(Result::ok).collect();
3	let tokens:Vec<u32>=tokenizer.tokenize_str("スペースは不要です").collect();
4	let detokens:String=tokenizer.detokenize_str(&tokens).collect();
5
6	print!("[");
7	for id in tokens.iter().take(tokens.len().saturating_sub(1)){print!("{id}, ")}
8	if let Some(id)=tokens.last(){print!("{id}")}
9	println!("]");
10	println!("\"{detokens}\"");
11}
More examples
Hide additional examples
examples/token.rs (line 4)
1pub fn main(){
2	let tokenizer:TokenDict=BufReader::new(File::open("words.txt").unwrap()).lines().filter_map(Result::ok).collect();
3	let tokens:Vec<u32>=tokenizer.tokenize_str("some text to tokenize").collect();
4	let detokens:String=tokenizer.detokenize_str(&tokens).collect();
5
6	print!("[");
7	for id in tokens.iter().take(tokens.len().saturating_sub(1)){print!("{id}, ")}
8	if let Some(id)=tokens.last(){print!("{id}")}
9	println!("]");
10	println!("\"{detokens}\"");
11}
Source

pub fn detokenize_string<I: IntoIterator>(&self, tokens: I) -> String
where I::Item: Val<u32>,

decodes the tokens into a string, replacing invalid Unicode with the replacement character

Source

pub fn frequencies<I: IntoIterator, O: Into<Option<Vec<usize>>>>( &self, data: I, freq: O, ) -> Vec<usize>
where I::Item: Val<u8>,

accumulates the frequency of each token in the text as if the text were tokenized by this tokenizer

Source

pub fn get_id(&self, token: &[u8]) -> Option<u32>

gets an id for the token if it is in the dictionary

Source

pub fn iter(&self) -> DictIter<'_>

returns an iterator over the possible tokens generated by this tokenizer

Source

pub fn len(&self) -> usize

returns the number of possible token ids generated by this tokenizer

Source

pub fn pairs<I: IntoIterator, O: Into<Option<HashMap<Token, usize>>>>( &self, data: I, freq: O, ) -> HashMap<Token, usize>
where I::Item: Val<u8>,

finds token pairs in the data and returns the new tokens formed from them, mapped to their frequencies, with ids assigned as if they were added to this dictionary. Tokens whose ids are already within the current dictionary keep those ids

Source

pub fn push<A: AsRef<[u8]>>(&mut self, token: A)

adds the token to the dictionary

Source

pub fn string_to_tokens<S: ?Sized + AsRef<str>>(&self, input: &S) -> Vec<u32>

converts the string to a token vec

Source

pub fn token_iter<I: IntoIterator>( &self, bytes: I, ) -> impl Iterator<Item = Token>
where I::Item: Val<u8>,

creates an iterator over the tokens produced by tokenizing the bytes

Source

pub fn tokenize<I: IntoIterator>(&self, bytes: I) -> Tokenization<I::IntoIter>
where I::Item: Val<u8>,

converts the bytes to tokens

Source

pub fn tokenize_str<'a, S: ?Sized + AsRef<str>>( &self, input: &'a S, ) -> Tokenization<SliceIter<'a, u8>>

converts the string to tokens

Examples found in repository?
examples/japanese.rs (line 3)
1pub fn main(){
2	let tokenizer:TokenDict=BufReader::new(File::open("words.txt").unwrap()).lines().filter_map(Result::ok).collect();
3	let tokens:Vec<u32>=tokenizer.tokenize_str("スペースは不要です").collect();
4	let detokens:String=tokenizer.detokenize_str(&tokens).collect();
5
6	print!("[");
7	for id in tokens.iter().take(tokens.len().saturating_sub(1)){print!("{id}, ")}
8	if let Some(id)=tokens.last(){print!("{id}")}
9	println!("]");
10	println!("\"{detokens}\"");
11}
More examples
Hide additional examples
examples/token.rs (line 3)
1pub fn main(){
2	let tokenizer:TokenDict=BufReader::new(File::open("words.txt").unwrap()).lines().filter_map(Result::ok).collect();
3	let tokens:Vec<u32>=tokenizer.tokenize_str("some text to tokenize").collect();
4	let detokens:String=tokenizer.detokenize_str(&tokens).collect();
5
6	print!("[");
7	for id in tokens.iter().take(tokens.len().saturating_sub(1)){print!("{id}, ")}
8	if let Some(id)=tokens.last(){print!("{id}")}
9	println!("]");
10	println!("\"{detokens}\"");
11}
Source

pub fn tokenize_string(&self, input: String) -> Tokenization<VecIntoIter<u8>>

converts the string to tokens

Source

pub fn tokens_to_string<V: ?Sized + AsRef<[u32]>>(&self, input: &V) -> String

converts the token vec to a string

Trait Implementations§

Source§

impl AsMut<TokenDict> for TokenDict

Source§

fn as_mut(&mut self) -> &mut Self

Converts this type into a mutable reference of the (usually inferred) input type.
Source§

impl AsRef<TokenDict> for TokenDict

Source§

fn as_ref(&self) -> &Self

Converts this type into a shared reference of the (usually inferred) input type.
Source§

impl Clone for TokenDict

Source§

fn clone(&self) -> TokenDict

Returns a duplicate of the value. Read more
1.0.0 · Source§

fn clone_from(&mut self, source: &Self)

Performs copy-assignment from source. Read more
Source§

impl Debug for TokenDict

Source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more
Source§

impl Default for TokenDict

Source§

fn default() -> Self

Returns the “default value” for a type. Read more
Source§

impl<A: AsRef<[u8]>> Extend<A> for TokenDict

Source§

fn extend<I: IntoIterator<Item = A>>(&mut self, iter: I)

Extends a collection with the contents of an iterator. Read more
Source§

fn extend_one(&mut self, item: A)

🔬This is a nightly-only experimental API. (extend_one)
Extends a collection with exactly one element.
Source§

fn extend_reserve(&mut self, additional: usize)

🔬This is a nightly-only experimental API. (extend_one)
Reserves capacity in a collection for the given number of additional elements. Read more
Source§

impl<A: AsRef<[u8]>> FromIterator<A> for TokenDict

Source§

fn from_iter<I: IntoIterator<Item = A>>(iter: I) -> Self

Creates a value from an iterator. Read more
Source§

impl Index<u32> for TokenDict

Source§

type Output = [u8]

The returned type after indexing.
Source§

fn index(&self, ix: u32) -> &Self::Output

Performs the indexing (container[index]) operation. Read more
Source§

impl Index<usize> for TokenDict

Source§

type Output = Token

The returned type after indexing.
Source§

fn index(&self, ix: usize) -> &Self::Output

Performs the indexing (container[index]) operation. Read more
Source§

impl IntoIterator for TokenDict

Source§

type IntoIter = DictIntoIter

Which kind of iterator are we turning this into?
Source§

type Item = Token

The type of the elements being iterated over.
Source§

fn into_iter(self) -> Self::IntoIter

Creates an iterator from a value. Read more

Auto Trait Implementations§

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> CloneToUninit for T
where T: Clone,

Source§

unsafe fn clone_to_uninit(&self, dest: *mut u8)

🔬This is a nightly-only experimental API. (clone_to_uninit)
Performs copy-assignment from self to dest. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> ToOwned for T
where T: Clone,

Source§

type Owned = T

The resulting type after obtaining ownership.
Source§

fn to_owned(&self) -> T

Creates owned data from borrowed data, usually by cloning. Read more
Source§

fn clone_into(&self, target: &mut T)

Uses borrowed data to replace owned data, usually by cloning. Read more
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<T> Val<T> for T

Source§

fn val(self) -> T

gets the value