Skip to main content

token_dict/
lib.rs

1impl<I:IntoIterator> From<I> for UTF8CharIter<I::IntoIter> where I::Item:Val<u8>{
2	fn from(value:I)->Self{
3		Self{inner:value.into_iter()}
4	}
5}
6impl<I:Iterator> Iterator for UTF8CharIter<I> where I::Item:Val<u8>{
7	fn next(&mut self)->Option<Result<char,[u8;4]>>{
8		let inner=&mut self.inner;
9
10		let firstbyte=inner.next()?.val();
11		let mut bytes=[firstbyte,0,0,0];
12		let (charlen,value)=if firstbyte&0b10000000==0b00000000{(1,firstbyte)}
13		else if firstbyte&0b11000000==0b10000000{return Some(Err(bytes))}
14		else if firstbyte&0b11100000==0b11000000{(2,firstbyte&0b00011111)}
15		else if firstbyte&0b11110000==0b11100000{(3,firstbyte&0b00001111)}
16		else if firstbyte&0b11111000==0b11110000{(4,firstbyte&0b00000111)}
17		else                                    {return Some(Err(bytes))};
18		let mut value=value as u32;
19
20		for n in 1..charlen{
21			let nextbyte=if let Some(b)=inner.next(){b.val()}else{return Some(Err(bytes))};
22			bytes[n]=nextbyte;
23			if nextbyte&0b11000000==0b10000000{value=(value<<6)+((nextbyte&0b00111111) as u32)}else{return Some(Err(bytes))}
24		}
25		char::from_u32(value).map(|c|Ok(c)).or(Some(Err(bytes)))
26	}
27	fn size_hint(&self)->(usize,Option<usize>){
28		let (lowerbytes,upperbytes)=self.inner.size_hint();
29
30		(lowerbytes/4,upperbytes)
31	}
32	type Item=Result<char,[u8;4]>;
33}
34impl<I:Iterator> UTF8CharIter<I> where I::Item:Val<u8>{
35	/// converts into the inner value
36	pub fn into_inner(self)->I{self.inner}
37}
38impl<T:Copy> Val<T> for &T{
39	fn val(self)->T{*self}
40}
41impl<T> Val<T> for T{
42	fn val(self)->T{self}
43}
#[cfg(test)]
mod tests{
	use super::*;

	/// dropping the error items must leave exactly the valid text, for ascii and multi byte input alike
	#[test]
	fn utf8_test_1(){
		let clean:String=UTF8CharIter::from("correct string".as_bytes()).filter_map(Result::ok).collect();
		assert_eq!(clean,"correct string");

		let noisy="incorrect string".bytes().chain([255,255]).chain(" incorrectness removed".bytes());
		let recovered:String=UTF8CharIter::from(noisy).filter_map(Result::ok).collect();
		assert_eq!(recovered,"incorrect string incorrectness removed");

		// the injected 0x20 is a valid space, which is why the expected text has two spaces in the middle
		let noisy="正しくない文字列".bytes().chain([128,255,0x20,255,0b10000000]).chain(" 不正確さは削除された".bytes());
		let recovered:String=UTF8CharIter::from(noisy).filter_map(Result::ok).collect();
		assert_eq!(recovered,"正しくない文字列  不正確さは削除された");
	}
	/// a single three byte character decodes to one item and then the iterator ends
	#[test]
	fn test_utf8_char_iter_valid() {
		// the utf8 encoding of '€' (euro sign)
		let euro = vec![0xE2u8, 0x82u8, 0xACu8];
		let mut decoder: UTF8CharIter<_> = euro.into_iter().into();
		let first = decoder.next();
		if let Some(Ok(c)) = first {
			assert_eq!(c, '€');
		} else {
			panic!("Expected Ok('€'), got {:?}", first);
		}
		// nothing may follow the only character
		assert!(decoder.next().is_none());
	}
	/// a byte that can never start a character is reported as an error carrying that byte
	#[test]
	fn test_utf8_char_iter_error() {
		// 0xFF is not a valid utf8 start byte
		let invalid = vec![0xFFu8];
		let mut decoder: UTF8CharIter<_> = invalid.into_iter().into();
		let first = decoder.next();
		if let Some(Err(buf)) = first {
			assert_eq!(buf[0], 0xFF);
		} else {
			panic!("Expected Err with first byte 0xFF, got {:?}", first);
		}
	}
}
83/// module for token dictionary
84pub mod dict;
85/// module for Token type
86pub mod token;
87#[derive(Clone,Debug)]
88/// iterator for live converting utf8 to chars, returning errors for all bytes that aren't part of a valid character. useful for lazily detokenizing into a string
89pub struct UTF8CharIter<I:Iterator> where I::Item:Val<u8>{inner:I}
/// lets generic code accept a value and a reference to that value through one bound
pub trait Val<T>{
	/// consumes `self` and produces the underlying `T`
	fn val(self)->T;
}
95pub use {dict::TokenDict,token::Token};