sj/parser/
unicode_char.rs1use {
34 alloc::vec::Vec,
35 crate::Result,
36};
37
38#[derive(Debug)]
40pub (super) struct UnicodeChar {
41 bytes: [u8; 4],
42 idx: usize,
43}
44
45impl UnicodeChar {
46
47 #[inline(always)]
49 pub fn new() -> Self {
50 Self {
51 bytes: [0; 4],
52 idx: 0,
53 }
54 }
55
56 #[inline(always)]
58 pub fn add_hex(&mut self, hex: &u8) -> Result<()> {
59 match self.bytes.get_mut(self.idx) {
60 Some(item) => {
61 let byte = match hex {
62 b'0'..=b'9' => hex - b'0',
63 b'a'..=b'f' => hex - b'a' + 10,
64 b'A'..=b'F' => hex - b'A' + 10,
65 _ => return Err(err!("Not a hexadecimal: {:?}", char::from(*hex))),
66 };
67 match self.idx % 2 {
68 0 => *item = byte * 16,
69 _ => *item = byte,
70 };
71 self.idx += 1;
72 Ok(())
73 },
74 None => return Err(e!("Bytes are full")),
75 }
76 }
77
78 #[inline(always)]
80 pub fn is_full(&self) -> bool {
81 self.idx >= self.bytes.len()
82 }
83
84 #[inline(always)]
86 pub fn encode_as_utf8_bytes(self, out: &mut Vec<u8>) -> Result<()> {
87 const MARKER: u8 = 0b_1000_0000;
88
89 if self.is_full() {
90 let first = self.bytes[0] + self.bytes[1];
91 let last = self.bytes[2] + self.bytes[3];
92 let byte_count = match first {
93 0x00 => match last {
94 0x00..=0x7f => ByteCount::One,
95 _ => ByteCount::Two,
96 },
97 0x01..=0x07 => ByteCount::Two,
98 _ => ByteCount::Three,
99 };
100 match byte_count {
101 ByteCount::One => out.push(last),
102 ByteCount::Two => {
103 const HEADER_OF_TWO: u8 = 0b_1100_0000;
104 out.push(HEADER_OF_TWO | (first << 5 >> 3) | (last >> 6));
105 out.push(MARKER | (last << 2 >> 2));
106 },
107 ByteCount::Three => {
108 const HEADER_OF_THREE: u8 = 0b_1110_0000;
109 out.push(HEADER_OF_THREE | (first >> 4));
110 out.push(MARKER | (first << 4 >> 2) | (last >> 6));
111 out.push(MARKER | (last << 2 >> 2));
112 },
113 };
114 return Ok(());
115 }
116
117 Err(e!("Bytes are not full"))
118 }
119
120}
121
/// Length, in bytes, of the UTF-8 sequence chosen for a 16-bit code point.
#[derive(Debug)]
enum ByteCount {
    /// Single-byte sequence.
    One,

    /// Two-byte sequence.
    Two,

    /// Three-byte sequence.
    Three,
}
129
/// Feeds four hexadecimal digits (in mixed case) into `UnicodeChar` and checks that the
/// produced bytes equal what `char::encode_utf8` yields for the same code point.
#[test]
fn tests() -> Result<()> {
    const CASES: &[(&str, char)] = &[
        // One and two bytes
        ("0024", '\u{0024}'),
        ("00A2", '\u{00a2}'),
        // Upper bound of two bytes
        ("07fF", '\u{07ff}'),
        // Three bytes
        ("0939", '\u{0939}'),
        ("20AC", '\u{20AC}'),
        ("d55c", '\u{d55c}'),
        ("1d2D", '\u{1d2d}'),
        ("0800", '\u{0800}'),
        ("fFFf", '\u{FFff}'),
    ];

    for &(hex, chr) in CASES {
        // Reference encoding from the standard library
        let mut utf8 = [0; 4];
        let expected_len = chr.encode_utf8(&mut utf8).len();
        let expected = &utf8[..expected_len];

        let mut uc = UnicodeChar::new();
        for digit in hex.bytes() {
            uc.add_hex(&digit)?;
        }

        let mut out = Vec::with_capacity(4);
        uc.encode_as_utf8_bytes(&mut out)?;
        assert_eq!(out, expected);
    }

    Ok(())
}