char_from_utf8/
lib.rs

/// Decoding of a value from a sequence of UTF-8 code units (bytes).
pub trait FromUtf8 {
    /// The type produced by a successful decode.
    type Output;

    /// Attempts to decode a value from `code_units`.
    ///
    /// Returns `None` when the bytes cannot be decoded.
    fn from_utf8(code_units: &[u8]) -> Option<Self::Output>;
}

impl FromUtf8 for char {
    type Output = char;

    /// Decodes the Unicode scalar value encoded at the start of `code_units`.
    ///
    /// The lead byte determines the length of the sequence (one to four
    /// bytes). Any bytes following that sequence are ignored.
    ///
    /// Returns `None` when:
    /// - `code_units` is empty,
    /// - the first byte cannot start a UTF-8 sequence (a continuation byte,
    ///   `0xC0`/`0xC1`, or anything above `0xF4`),
    /// - the slice is shorter than the length announced by the lead byte, or
    /// - the sequence is not well-formed UTF-8 (bad continuation byte,
    ///   overlong encoding, surrogate, or a value above U+10FFFF).
    ///
    /// This function never panics.
    fn from_utf8(code_units: &[u8]) -> Option<Self::Output> {
        let lead = *code_units.first()?;

        // Sequence length implied by the lead byte. Lead bytes that can never
        // appear in well-formed UTF-8 are rejected here.
        let len = match lead {
            0x00..=0x7F => 1,
            0xC2..=0xDF => 2,
            0xE0..=0xEF => 3,
            0xF0..=0xF4 => 4,
            _ => return None,
        };

        // Checked slicing: a truncated sequence yields `None`, not a panic.
        let sequence = code_units.get(..len)?;

        // The standard-library validator enforces the continuation-byte form
        // and rejects overlong encodings, surrogates and out-of-range values.
        core::str::from_utf8(sequence).ok()?.chars().next()
    }
}
50
#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes one sample for each encoded length, from one to four bytes.
    #[test]
    fn it_works() {
        let samples: [(&[u8], char); 4] = [
            (&[0x61], 'a'),
            (&[0xC3, 0x80], 'À'),
            (&[0xE6, 0x88, 0x91], '我'),
            (&[0xF0, 0x93, 0x83, 0xB0], '𓃰'),
        ];

        for (code_units, expected) in samples {
            assert_eq!(char::from_utf8(code_units), Some(expected));
        }
    }
}