1use crate::{Bytes, Error, UnsafeWriter};
2
/// Per-lead-byte width table used when scanning UTF-8 text for MUTF-8 compatibility.
///
/// A non-zero entry is the length in bytes of a UTF-8 sequence that is already
/// valid Modified UTF-8 and can be copied unchanged:
///
/// * `0x01..=0x7F` -> 1
/// * `0xC2..=0xDF` -> 2
/// * `0xE0..=0xEF` -> 3
///
/// A zero entry marks a byte that must not be copied as-is: NUL (`0x00`), which
/// MUTF-8 stores as `C0 80`; the four-byte lead bytes `0xF0..=0xF4`, which MUTF-8
/// stores as a surrogate pair; and every byte that cannot start a UTF-8 sequence.
///
/// The four-byte leads are deliberately 0 rather than 4: the scanners in this
/// module treat "width 0 and byte != 0" as "supplementary character".
const CHAR_WIDTH: &[u8; 256] = {
    // Built at compile time so the ranges above are the single source of truth.
    const TABLE: [u8; 256] = {
        let mut table = [0u8; 256];
        let mut byte = 0usize;
        while byte < 256 {
            table[byte] = match byte {
                0x01..=0x7F => 1,
                0xC2..=0xDF => 2,
                0xE0..=0xEF => 3,
                _ => 0,
            };
            byte += 1;
        }
        table
    };
    &TABLE
};
22
23#[must_use]
24pub const fn is_ascii_mutf8(bytes: &[u8]) -> bool {
25 bytes.is_ascii() && !has_zero(bytes)
26}
27
28#[must_use]
29pub const fn is_mutf8(bytes: &[u8]) -> bool {
30 let mut index = 0;
31 while index < bytes.len() {
32 let byte = bytes[index];
33 let w = unsafe { *CHAR_WIDTH.as_ptr().add(byte as usize) };
34 if w == 0 {
35 return false;
36 }
37 index += w as usize;
38 }
39
40 true
41}
42
/// Returns `true` if any byte of `bytes` is `0`.
///
/// The input is consumed in fixed 16-byte groups; within a group the result is
/// accumulated without branching and checked once per group, then the leftover
/// tail (fewer than 16 bytes) is handled the same way.
#[inline]
const fn has_zero(bytes: &[u8]) -> bool {
    const LANES: usize = 16;

    let len = bytes.len();
    let mut pos = 0;

    // Full groups: `pos <= len` always holds, so the subtraction cannot underflow.
    while len - pos >= LANES {
        let mut found = false;
        let mut lane = 0;
        while lane < LANES {
            found = found | (bytes[pos + lane] == 0);
            lane += 1;
        }
        if found {
            return true;
        }
        pos += LANES;
    }

    // Remaining tail.
    let mut found = false;
    while pos < len {
        found = found | (bytes[pos] == 0);
        pos += 1;
    }
    found
}
71
/// Returns the number of bytes `bytes` occupies once encoded as Modified UTF-8.
///
/// Compared with UTF-8, only two kinds of characters change size:
///
/// * NUL (`U+0000`) grows from 1 byte to 2 (`C0 80`);
/// * a supplementary character (four-byte UTF-8, lead byte `0xF0..=0xF4`) grows
///   from 4 bytes to 6 (two three-byte surrogates).
///
/// Everything else is copied unchanged, so the result is the UTF-8 length plus
/// one per NUL and two per four-byte lead byte. Because the input is a `str`,
/// it is guaranteed well-formed and continuation bytes can never be mistaken
/// for either of those markers.
pub const fn encode_mutf8_len(bytes: &str) -> usize {
    let bytes = bytes.as_bytes();
    let mut len = bytes.len();
    let mut index = 0;
    while index < bytes.len() {
        match bytes[index] {
            // 1 byte -> 2 bytes.
            0x00 => len += 2 - 1,
            // 4 bytes -> 6 bytes.
            0xF0..=0xF4 => len += 6 - 4,
            _ => {}
        }
        index += 1;
    }
    len
}
92
93pub unsafe fn encode_mutf8(bytes: &[u8], w: &mut UnsafeWriter) {
97 let mut index = 0;
98 let mut start = 0;
99
100 while let Some(&byte) = bytes.get(index) {
101 let x = unsafe { *CHAR_WIDTH.get_unchecked(byte as usize) };
102 index += x as usize;
103 if x != 0 {
104 continue;
105 }
106 if byte == 0 {
107 unsafe {
108 w.write(bytes.get_unchecked(start..index));
109 w.write(&[0xc0, 0x80]);
110 }
111 index += 1;
112 start = index;
113 } else {
114 let code_point = unsafe {
115 core::str::from_utf8_unchecked(&bytes[index..index + 4])
116 .chars()
117 .next()
118 .unwrap_unchecked() as u32
119 };
120 let code_point = code_point - 0x10000;
121 let first = ((code_point >> 10) as u16) | 0xD800;
122 let second = ((code_point & 0x3FF) as u16) | 0xDC00;
123
124 unsafe {
125 w.write(bytes.get_unchecked(start..index));
126 w.write(&[
127 0xE0 | ((first & 0xF000) >> 12) as u8,
128 0x80 | ((first & 0xFC0) >> 6) as u8,
129 0x80 | ((first & 0x3F) as u8),
130 0xE0 | ((second & 0xF000) >> 12) as u8,
131 0x80 | ((second & 0xFC0) >> 6) as u8,
132 0x80 | (second & 0x3F) as u8,
133 ]);
134 }
135 index += 4;
136 start = index;
137 }
138 }
139 unsafe { w.write(bytes.get_unchecked(start..index)) }
140}
141
142pub fn decode_mutf8_len(mut bytes: &[u8]) -> Result<usize, Error> {
143 let mut len = 0usize;
144
145 while let Ok(byte) = bytes.u8() {
146 match byte {
147 0x01..=0x7F => len += 1,
148 0xC2..=0xDF => {
149 let sec = bytes.u8()?;
150 if !(byte == 0xC0 && sec == 0x80) {
151 len += 2;
152 } else {
153 len += 1;
154 }
155 }
156 0xE0..=0xEF => {
157 let sec = bytes.u8()?;
158 let third = bytes.u8()?;
159 if sec & 0xC0 != 0x80 || third & 0xC0 != 0x80 {
160 return Err(Error);
161 }
162 match (byte, sec) {
163 (0xE0, 0xA0..=0xBF)
164 | (0xE1..=0xEC | 0xEE | 0xEF, 0x80..=0xBF)
165 | (0xED, 0x80..=0x9F) => {
166 len += 3;
167 }
168 (0xED, 0xA0..=0xAF) => {
169 if bytes.u8()? != 0xED {
170 return Err(Error);
171 }
172 match bytes.u8()? {
173 0xB0..=0xBF => (),
174 _ => return Err(Error),
175 }
176 if bytes.u8()? & 0xC0 != 0x80 {
177 return Err(Error);
178 }
179 len += 4;
180 }
181 _ => return Err(Error),
182 }
183 }
184 _ => return Err(Error),
185 }
186 }
187 Ok(len)
188}
189
190pub unsafe fn decode_mutf8(bytes: &[u8], w: &mut UnsafeWriter) -> Result<(), Error> {
192 let mut index = 0;
193 let mut start = 0;
194
195 while let Some(&byte) = bytes.get(index) {
196 match byte {
197 0x01..=0x7F => index += 1,
198 0xC2..=0xDF => unsafe {
199 let sec = match bytes.get(index + 1) {
200 Some(&byte) => byte,
201 _ => return Err(Error),
202 };
203 index += 2;
204 if !(byte == 0xC0 && sec == 0x80) {
205 } else {
206 w.write(bytes.get_unchecked(start..index));
207 w.write_byte(b'\0');
208 start = index;
209 }
210 },
211 0xE0..=0xEF => unsafe {
212 let sec = match bytes.get(index + 1) {
213 Some(&byte) if byte & 0xC0 == 0x80 => byte,
214 _ => return Err(Error),
215 };
216 let third = match bytes.get(index + 2) {
217 Some(&byte) if byte & 0xC0 == 0x80 => byte,
218 _ => return Err(Error),
219 };
220 match (byte, sec) {
221 (0xE0, 0xA0..=0xBF)
222 | (0xE1..=0xEC | 0xEE | 0xEF, 0x80..=0xBF)
223 | (0xED, 0x80..=0x9F) => {
224 index += 3;
225 }
226 (0xED, 0xA0..=0xAF) => {
227 match bytes.get(index + 3) {
228 Some(0xED) => (),
229 _ => return Err(Error),
230 };
231 let fifth = match bytes.get(index + 4) {
232 Some(&x @ 0xB0..=0xBF) => x & 0x3F,
233 _ => return Err(Error),
234 };
235 let sixth = match bytes.get(index + 5) {
236 Some(&x) if x & 0xC0 == 0x80 => x & 0x3F,
237 _ => return Err(Error),
238 };
239 let s1 = 0xD000 | (u32::from(sec & 0x3F) << 6) | u32::from(third & 0x3F);
240 let s2 = 0xD000 | (u32::from(fifth) << 6) | u32::from(sixth);
241 let point = 0x10000 + (((s1 - 0xD800) << 10) | (s2 - 0xDC00));
242 w.write(&[
243 0xF0 | ((point & 0x1C0000) >> 18) as u8,
244 0x80 | ((point & 0x3F000) >> 12) as u8,
245 0x80 | ((point & 0xFC0) >> 6) as u8,
246 0x80 | (point & 0x3F) as u8,
247 ]);
248 }
249 _ => return Err(Error),
250 }
251 },
252 _ => return Err(Error),
253 }
254 }
255
256 unsafe { w.write(bytes.get_unchecked(start..index)) }
257
258 Ok(())
259}