Struct TokenDict

Source

pub struct TokenDict { /* private fields */ }

Expand description

a simple dictionary-based tokenizer in which single bytes implicitly form the lowest 256 ids. It is reference counted and cheap to clone

Implementations§

Source §

impl TokenDict

Source

pub fn detokenize<I: IntoIterator>( &self, tokens: I, ) -> Detokenization<I::IntoIter> ⓘ
where I::Item: Val<u32>,

decodes the tokens into bytes

Source

pub fn detoken_iter<I: IntoIterator>( &self, tokens: I, ) -> impl Iterator<Item = Token>
where I::Item: Val<u32>,

creates an iterator over the tokens corresponding to the given token ids

Source

pub fn detokenize_str<I: IntoIterator>( &self, tokens: I, ) -> impl Iterator<Item = char>
where I::Item: Val<u32>,

decodes the tokens into chars, replacing invalid Unicode with the replacement character

Examples found in repository ?

examples/japanese.rs (line 4)

1pub fn main(){
2	let tokenizer:TokenDict=BufReader::new(File::open("words.txt").unwrap()).lines().filter_map(Result::ok).collect();
3	let tokens:Vec<u32>=tokenizer.tokenize_str("スペースは不要です").collect();
4	let detokens:String=tokenizer.detokenize_str(&tokens).collect();
5
6	print!("[");
7	for id in tokens.iter().take(tokens.len().saturating_sub(1)){print!("{id}, ")}
8	if let Some(id)=tokens.last(){print!("{id}")}
9	println!("]");
10	println!("\"{detokens}\"");
11}

More examples

Hide additional examples

examples/token.rs (line 4)

1pub fn main(){
2	let tokenizer:TokenDict=BufReader::new(File::open("words.txt").unwrap()).lines().filter_map(Result::ok).collect();
3	let tokens:Vec<u32>=tokenizer.tokenize_str("some text to tokenize").collect();
4	let detokens:String=tokenizer.detokenize_str(&tokens).collect();
5
6	print!("[");
7	for id in tokens.iter().take(tokens.len().saturating_sub(1)){print!("{id}, ")}
8	if let Some(id)=tokens.last(){print!("{id}")}
9	println!("]");
10	println!("\"{detokens}\"");
11}

Source

pub fn detokenize_string<I: IntoIterator>(&self, tokens: I) -> String
where I::Item: Val<u32>,

decodes the tokens into a String, replacing invalid Unicode with the replacement character

Source

pub fn frequencies<I: IntoIterator, O: Into<Option<Vec<usize>>>>( &self, data: I, freq: O, ) -> Vec<usize>
where I::Item: Val<u8>,

accumulates the frequency of each token in the data as if the data were tokenized by this tokenizer

Source

pub fn get_id(&self, token: &[u8]) -> Option<u32>

gets an id for the token if it is in the dictionary

Source

pub fn iter(&self) -> DictIter<'_> ⓘ

returns an iterator over the possible tokens generated by this tokenizer

Source

pub fn len(&self) -> usize

returns the number of possible token ids generated by this tokenizer

Source

pub fn pairs<I: IntoIterator, O: Into<Option<HashMap<Token, usize>>>>( &self, data: I, freq: O, ) -> HashMap<Token, usize>
where I::Item: Val<u8>,

finds token pairs in the data and returns the new tokens formed from them, mapped to their frequencies. The new tokens are given ids as if they had been added to this dictionary; tokens that are already in the current dictionary keep their existing ids

Source

pub fn push<A: AsRef<[u8]>>(&mut self, token: A)

adds the token to the dictionary

Source

pub fn string_to_tokens<S: ?Sized + AsRef<str>>(&self, input: &S) -> Vec<u32>

converts the string to a token vec

Source

pub fn token_iter<I: IntoIterator>( &self, bytes: I, ) -> impl Iterator<Item = Token>
where I::Item: Val<u8>,

creates an iterator over the tokens produced by tokenizing the given bytes

Source

pub fn tokenize<I: IntoIterator>(&self, bytes: I) -> Tokenization<I::IntoIter> ⓘ
where I::Item: Val<u8>,

converts the bytes to tokens

Source

pub fn tokenize_str<'a, S: ?Sized + AsRef<str>>( &self, input: &'a S, ) -> Tokenization<SliceIter<'a, u8>> ⓘ

converts the string to tokens

Examples found in repository ?

examples/japanese.rs (line 3)

1pub fn main(){
2	let tokenizer:TokenDict=BufReader::new(File::open("words.txt").unwrap()).lines().filter_map(Result::ok).collect();
3	let tokens:Vec<u32>=tokenizer.tokenize_str("スペースは不要です").collect();
4	let detokens:String=tokenizer.detokenize_str(&tokens).collect();
5
6	print!("[");
7	for id in tokens.iter().take(tokens.len().saturating_sub(1)){print!("{id}, ")}
8	if let Some(id)=tokens.last(){print!("{id}")}
9	println!("]");
10	println!("\"{detokens}\"");
11}

More examples

Hide additional examples

examples/token.rs (line 3)

1pub fn main(){
2	let tokenizer:TokenDict=BufReader::new(File::open("words.txt").unwrap()).lines().filter_map(Result::ok).collect();
3	let tokens:Vec<u32>=tokenizer.tokenize_str("some text to tokenize").collect();
4	let detokens:String=tokenizer.detokenize_str(&tokens).collect();
5
6	print!("[");
7	for id in tokens.iter().take(tokens.len().saturating_sub(1)){print!("{id}, ")}
8	if let Some(id)=tokens.last(){print!("{id}")}
9	println!("]");
10	println!("\"{detokens}\"");
11}