//! token_dict/token.rs — the `Token` type: a cheap-to-clone, reference-counted
//! token whose ids below 256 implicitly denote single bytes.
1impl AsMut<Self> for Token{
2	fn as_mut(&mut self)->&mut Self{self}
3}
4impl AsRef<[u8]> for Token{
5	fn as_ref(&self)->&[u8]{self.data()}
6}
7impl AsRef<Self> for Token{
8	fn as_ref(&self)->&Self{self}
9}
10impl Borrow<[u8]> for Token{
11	fn borrow(&self)->&[u8]{self.data()}
12}
13impl Deref for Token{
14	fn deref(&self)->&Self::Target{self.data()}
15	type Target=[u8];
16}
17impl DoubleEndedIterator for TokenIntoIter{
18	fn next_back(&mut self)->Option<Self::Item>{self.range.next_back().map(|n|self.token[n])}
19	fn nth_back(&mut self,n:usize)->Option<Self::Item>{self.range.nth_back(n).map(|n|self.token[n])}
20	fn rfold<B,F:FnMut(B,Self::Item)->B>(self,init:B,f:F)->B{self.token.data()[self.range].iter().copied().rfold(init,f)}
21}
22impl Eq for Token{}
23impl ExactSizeIterator for TokenIntoIter{
24	fn len(&self)->usize{self.range.len()}
25}
26impl Hash for Token{
27	fn hash<H:Hasher>(&self,hasher:&mut H){self.data().hash(hasher)}
28}
29impl IntoIterator for Token{
30	fn into_iter(self)->Self::IntoIter{
31		TokenIntoIter{range:0..self.data().len(),token:self}
32	}
33	type IntoIter=TokenIntoIter;
34	type Item=u8;
35}
36impl Iterator for TokenIntoIter{
37	fn count(self)->usize{self.range.count()}
38	fn fold<B,F:FnMut(B,Self::Item)->B>(self,init:B,f:F)->B{self.token.data()[self.range].iter().copied().fold(init,f)}
39	fn last(mut self)->Option<Self::Item>{self.next_back()}
40	fn next(&mut self)->Option<Self::Item>{self.range.next().map(|n|self.token[n])}
41	fn nth(&mut self,n:usize)->Option<Self::Item>{self.range.nth(n).map(|n|self.token[n])}
42	fn size_hint(&self)->(usize,Option<usize>){self.range.size_hint()}
43	type Item=u8;
44}
45impl Token{
46	/// creates a token from a single byte
47	pub (crate) const fn single(n:u8)->Token{
48		Self{id:n as u32,token:None}
49	}
50	pub (crate) fn new(id:u32,token:Option<Arc<[u8]>>)->Self{
51        Self{id,token}
52    }
53	/// returns the token bytes
54	pub fn data(&self)->&[u8]{
55		let id=self.id as usize;
56		let token=&self.token;
57
58		if id<256{&SINGLE_BYTES[id..id+1]}else{token.as_deref().unwrap()}
59	}
60	/// returns the token id
61	pub fn id(&self)->u32{self.id}
62}
63impl<'a> IntoIterator for &'a Token{
64    fn into_iter(self)->Self::IntoIter{self.deref().iter()}
65    type IntoIter=SliceIter<'a,u8>;
66    type Item=&'a u8;
67}
68impl<T:?Sized+AsRef<[u8]>> PartialEq<T> for Token{
69	fn eq(&self,other:&T)->bool{self.data()==other.as_ref()}
70}
#[cfg(test)]
mod tests {
	use super::*;

	#[test]
	fn test_token_into_iter_single_byte() {
		// An id below 256 needs no payload; it maps to SINGLE_BYTES[id].
		let token = Token::single(5);
		let bytes: Vec<u8> = token.clone().into_iter().collect();
		assert_eq!(bytes, vec![5]);
	}

	#[test]
	fn test_token_into_iter_multi_byte() {
		// A two-byte custom token must iterate correctly from both ends.
		let payload: Arc<[u8]> = Arc::from([10u8, 20u8].as_ref());
		let token = Token::new(256, Some(payload.clone()));
		let mut iter = token.clone().into_iter();
		assert_eq!(iter.next(), Some(10));
		assert_eq!(iter.next_back(), Some(20));
	}

	#[test]
	fn test_exact_size_iterator_len() {
		// len() must report the number of bytes in the payload.
		let payload: Arc<[u8]> = Arc::from([30u8, 40u8].as_ref());
		let token = Token::new(300, Some(payload.clone()));
		let iter = token.into_iter();
		assert_eq!(iter.len(), 2, "ExactSizeIterator.len() should reflect number of bytes");
	}
}
102pub (crate) const SINGLE_BYTES:&[u8;256]=&[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255];
#[cfg_attr(feature="serial",derive(Deserialize,Serialize))]
#[derive(Clone,Debug,Default)]
/// A reference-counted token paired with its dictionary id.
///
/// Ids below 256 implicitly denote single bytes and carry no payload
/// (`token` is `None`); larger ids keep their byte sequence in a shared
/// `Arc<[u8]>`, so cloning a `Token` is cheap.
pub struct Token{id:u32,token:Option<Arc<[u8]>>}
#[derive(Clone,Debug)]
/// Owning by-value iterator over a `Token`'s bytes, created by
/// `Token::into_iter`; `range` tracks the indices not yet yielded.
pub struct TokenIntoIter{range:Range<usize>,token:Token}
#[cfg(feature="serial")]
use serde::{Deserialize,Serialize};
use std::{
	borrow::Borrow,
	cmp::{Eq,PartialEq},
	hash::{Hash,Hasher},
	ops::{Deref,Range},
	slice::Iter as SliceIter,
	sync::Arc,
};