token-dict 1.0.2

Basic dictionary-based tokenization.
Documentation
impl AsMut<Self> for Token{
	/// Identity conversion: hands back a mutable reference to the token itself.
	fn as_mut(&mut self)->&mut Self{
		self
	}
}
impl AsRef<[u8]> for Token{
	/// Borrows the token's underlying bytes.
	fn as_ref(&self)->&[u8]{
		self.data()
	}
}
impl AsRef<Self> for Token{
	/// Identity conversion: hands back a shared reference to the token itself.
	fn as_ref(&self)->&Self{
		self
	}
}
impl Borrow<[u8]> for Token{
	/// Borrows the token's bytes. Consistent with `Hash`/`Eq`, both of which
	/// also operate on the byte data, as `Borrow` requires.
	fn borrow(&self)->&[u8]{
		self.data()
	}
}
impl Deref for Token{
	type Target=[u8];

	/// Dereferences straight to the token's byte slice, so slice methods are
	/// available on `Token` directly.
	fn deref(&self)->&[u8]{
		self.data()
	}
}
impl DoubleEndedIterator for TokenIntoIter{
	/// Yields the next byte from the back of the remaining range.
	fn next_back(&mut self)->Option<Self::Item>{
		let idx=self.range.next_back()?;
		Some(self.token[idx])
	}

	/// Skips `n` bytes from the back, then yields the next one.
	fn nth_back(&mut self,n:usize)->Option<Self::Item>{
		let idx=self.range.nth_back(n)?;
		Some(self.token[idx])
	}

	/// Folds over the remaining bytes back-to-front, delegating to the
	/// slice iterator for efficiency.
	fn rfold<B,F:FnMut(B,Self::Item)->B>(self,init:B,f:F)->B{
		let remaining=&self.token.data()[self.range];
		remaining.iter().copied().rfold(init,f)
	}
}
// Sound: `PartialEq` compares the raw byte data, and byte-slice equality is
// reflexive, so the `Eq` contract holds.
impl Eq for Token{}
impl ExactSizeIterator for TokenIntoIter{
	/// Number of bytes not yet yielded, taken from the remaining index range.
	fn len(&self)->usize{
		self.range.len()
	}
}
impl Hash for Token{
	/// Hashes the token's bytes only, keeping `Hash` consistent with
	/// `PartialEq`/`Eq` (which also ignore the numeric id).
	fn hash<H:Hasher>(&self,state:&mut H){
		self.data().hash(state)
	}
}
impl IntoIterator for Token{
	type Item=u8;
	type IntoIter=TokenIntoIter;

	/// Consumes the token and yields its bytes one at a time.
	fn into_iter(self)->Self::IntoIter{
		let len=self.data().len();
		TokenIntoIter{token:self,range:0..len}
	}
}
impl Iterator for TokenIntoIter{
	type Item=u8;

	/// Yields the next byte from the front of the remaining range.
	fn next(&mut self)->Option<Self::Item>{
		let idx=self.range.next()?;
		Some(self.token[idx])
	}

	/// Skips `n` bytes, then yields the next one.
	fn nth(&mut self,n:usize)->Option<Self::Item>{
		let idx=self.range.nth(n)?;
		Some(self.token[idx])
	}

	/// Exact bounds, forwarded from the index range.
	fn size_hint(&self)->(usize,Option<usize>){
		self.range.size_hint()
	}

	/// Remaining byte count without touching the token data.
	fn count(self)->usize{
		self.range.count()
	}

	/// The last remaining byte; O(1) because the iterator is double ended.
	fn last(mut self)->Option<Self::Item>{
		self.next_back()
	}

	/// Folds over the remaining bytes front-to-back, delegating to the
	/// slice iterator for efficiency.
	fn fold<B,F:FnMut(B,Self::Item)->B>(self,init:B,f:F)->B{
		let remaining=&self.token.data()[self.range];
		remaining.iter().copied().fold(init,f)
	}
}
impl Token{
	/// creates a token from a single byte
	///
	/// Single-byte tokens never allocate: their id is the byte value and
	/// their data is served from the shared `SINGLE_BYTES` table.
	pub (crate) const fn single(n:u8)->Token{
		Self{id:n as u32,token:None}
	}
	/// creates a token with the given id and optional byte payload
	///
	/// Invariant upheld by callers: an id of 256 or above must come with a
	/// `Some` payload, otherwise `data` will panic.
	pub (crate) fn new(id:u32,token:Option<Arc<[u8]>>)->Self{
		Self{id,token}
	}
	/// returns the token bytes
	pub fn data(&self)->&[u8]{
		let idx=self.id as usize;
		if idx<256{
			// Ids below 256 are the implicit single-byte tokens.
			&SINGLE_BYTES[idx..=idx]
		}else{
			// Panics only if the `new` invariant was violated (a bug).
			self.token.as_deref().unwrap()
		}
	}
	/// returns the token id
	pub fn id(&self)->u32{
		self.id
	}
}
impl<'a> IntoIterator for &'a Token{
	type Item=&'a u8;
	type IntoIter=SliceIter<'a,u8>;

	/// Iterates over the token's bytes by reference without consuming it.
	fn into_iter(self)->Self::IntoIter{
		self.data().iter()
	}
}
impl<T:?Sized+AsRef<[u8]>> PartialEq<T> for Token{
	/// Tokens compare by byte content alone; the numeric id plays no part.
	fn eq(&self,other:&T)->bool{
		self.data()==other.as_ref()
	}
}

#[cfg(test)]
mod tests{
	use super::*;

	#[test]
	fn test_token_into_iter_single_byte() {
		// Use a Token with id < 256 and no custom data; its byte comes from
		// the implicit SINGLE_BYTES table.
		let token = Token { id: 5, token: None };
		let v: Vec<u8> = token.into_iter().collect();
		// SINGLE_BYTES[5] == 5
		assert_eq!(v, vec![5]);
	}
	#[test]
	fn test_token_into_iter_multi_byte() {
		// Create a custom token of two bytes [10, 20]
		let data:Arc<[u8]> = Arc::from([10u8, 20u8].as_ref());
		let token = Token { id: 256, token: Some(data) };
		let mut iter = token.into_iter();
		// next should yield first element
		assert_eq!(iter.next(), Some(10));
		// next_back should yield last element
		assert_eq!(iter.next_back(), Some(20));
		// both ends consumed: the iterator is now exhausted
		assert_eq!(iter.next(), None);
	}
	#[test]
	fn test_exact_size_iterator_len() {
		// Custom two-byte token
		let data:Arc<[u8]> = Arc::from([30u8, 40u8].as_ref());
		let token = Token { id: 300, token: Some(data) };
		let iter = token.into_iter();
		assert_eq!(iter.len(), 2, "ExactSizeIterator.len() should reflect number of bytes");
	}
}

/// Lookup table holding every byte value `0..=255` at its own index, so a
/// single-byte token can borrow its data without allocating. Built at
/// compile time instead of spelling out 256 literals by hand.
pub (crate) const SINGLE_BYTES:&[u8;256]={
	const BYTES:[u8;256]={
		let mut bytes=[0u8;256];
		let mut i=0;
		// const-compatible loop: each slot holds its own index as a byte
		while i<256{
			bytes[i]=i as u8;
			i+=1;
		}
		bytes
	};
	&BYTES
};
#[cfg_attr(feature="serial",derive(Deserialize,Serialize))]
#[derive(Clone,Debug,Default)]
/// a reference counted structure with a token and its id
///
/// Ids below 256 denote single bytes and carry no payload (`token` is
/// `None`); larger ids hold their bytes in a shared `Arc`, so cloning a
/// token is cheap.
pub struct Token{id:u32,token:Option<Arc<[u8]>>}
#[derive(Clone,Debug)]
/// an owning iterator over the bytes of a [`Token`], produced by
/// [`Token::into_iter`]; double ended and exact sized
pub struct TokenIntoIter{range:Range<usize>,token:Token}
#[cfg(feature="serial")]
use serde::{Deserialize,Serialize};
use std::{
   borrow::Borrow,cmp::{Eq,PartialEq},hash::{Hash,Hasher},ops::{Deref,Range},slice::Iter as SliceIter,sync::Arc,
};