mser/
mutf8.rs

1use crate::{Bytes, Error, UnsafeWriter};
2
/// Per-lead-byte width table used to scan UTF-8 text for sequences that are byte-for-byte
/// identical in Modified UTF-8.
///
/// Indexed by the first byte of a sequence; the entry is the number of bytes that can be skipped
/// (or copied through) unchanged. An entry of `0` means the byte cannot start such a sequence:
///
/// * `0x00` — NUL, which Modified UTF-8 spells as the two bytes `0xC0 0x80`;
/// * `0x80..=0xC1` and `0xF5..=0xFF` — never the first byte of a well-formed UTF-8 sequence;
/// * `0xF0..=0xF4` — first byte of a 4-byte sequence, which Modified UTF-8 spells as a
///   surrogate pair (two 3-byte sequences).
///
/// The functions in this file that consult the table treat "width 0 and not NUL" as "4-byte
/// sequence that must be rewritten", so the `F` row has to be all zeros for that path to be
/// reachable.
const CHAR_WIDTH: &[u8; 256] = &[
    // 1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F
];
22
23#[must_use]
24pub const fn is_ascii_mutf8(bytes: &[u8]) -> bool {
25    bytes.is_ascii() && !has_zero(bytes)
26}
27
28#[must_use]
29pub const fn is_mutf8(bytes: &[u8]) -> bool {
30    let mut index = 0;
31    while index < bytes.len() {
32        let byte = bytes[index];
33        let w = unsafe { *CHAR_WIDTH.as_ptr().add(byte as usize) };
34        if w == 0 {
35            return false;
36        }
37        index += w as usize;
38    }
39
40    true
41}
42
/// Returns `true` if `bytes` contains at least one `0x00` byte.
///
/// The slice is processed in fixed 16-byte chunks. Inside a chunk every byte is examined and
/// the results are OR-ed together without branching; the early exit happens only between
/// chunks. Whatever is left after the last full chunk is handled by a plain byte loop.
#[inline]
const fn has_zero(bytes: &[u8]) -> bool {
    const CHUNK_SIZE: usize = 16;

    let total = bytes.len();
    let mut base = 0;

    // Full chunks: `base <= total` always holds, so the subtraction cannot underflow.
    while total - base >= CHUNK_SIZE {
        let mut seen = false;
        let mut offset = 0;
        while offset < CHUNK_SIZE {
            seen |= bytes[base + offset] == 0;
            offset += 1;
        }
        if seen {
            return true;
        }
        base += CHUNK_SIZE;
    }

    // Tail: fewer than `CHUNK_SIZE` bytes remain.
    let mut seen = false;
    while base < total {
        seen |= bytes[base] == 0;
        base += 1;
    }
    seen
}
71
72pub const fn encode_mutf8_len(bytes: &str) -> usize {
73    let mut l = 0;
74    let mut index = 0;
75    let bytes = bytes.as_bytes();
76    while index < bytes.len() {
77        let byte = bytes[index];
78        let w = unsafe { *CHAR_WIDTH.as_ptr().add(byte as usize) };
79        index += w as usize;
80        if w == 0 {
81            if byte == 0 {
82                l += 2 - 1;
83                index += 1;
84            } else {
85                l += 6 - 4;
86                index += 4;
87            }
88        }
89    }
90    l + index
91}
92
93/// # Safety
94///
95/// `bytes` is UTF-8
96pub unsafe fn encode_mutf8(bytes: &[u8], w: &mut UnsafeWriter) {
97    let mut index = 0;
98    let mut start = 0;
99
100    while let Some(&byte) = bytes.get(index) {
101        let x = unsafe { *CHAR_WIDTH.get_unchecked(byte as usize) };
102        index += x as usize;
103        if x != 0 {
104            continue;
105        }
106        if byte == 0 {
107            unsafe {
108                w.write(bytes.get_unchecked(start..index));
109                w.write(&[0xc0, 0x80]);
110            }
111            index += 1;
112            start = index;
113        } else {
114            let code_point = unsafe {
115                core::str::from_utf8_unchecked(&bytes[index..index + 4])
116                    .chars()
117                    .next()
118                    .unwrap_unchecked() as u32
119            };
120            let code_point = code_point - 0x10000;
121            let first = ((code_point >> 10) as u16) | 0xD800;
122            let second = ((code_point & 0x3FF) as u16) | 0xDC00;
123
124            unsafe {
125                w.write(bytes.get_unchecked(start..index));
126                w.write(&[
127                    0xE0 | ((first & 0xF000) >> 12) as u8,
128                    0x80 | ((first & 0xFC0) >> 6) as u8,
129                    0x80 | ((first & 0x3F) as u8),
130                    0xE0 | ((second & 0xF000) >> 12) as u8,
131                    0x80 | ((second & 0xFC0) >> 6) as u8,
132                    0x80 | (second & 0x3F) as u8,
133                ]);
134            }
135            index += 4;
136            start = index;
137        }
138    }
139    unsafe { w.write(bytes.get_unchecked(start..index)) }
140}
141
142pub fn decode_mutf8_len(mut bytes: &[u8]) -> Result<usize, Error> {
143    let mut len = 0usize;
144
145    while let Ok(byte) = bytes.u8() {
146        match byte {
147            0x01..=0x7F => len += 1,
148            0xC2..=0xDF => {
149                let sec = bytes.u8()?;
150                if !(byte == 0xC0 && sec == 0x80) {
151                    len += 2;
152                } else {
153                    len += 1;
154                }
155            }
156            0xE0..=0xEF => {
157                let sec = bytes.u8()?;
158                let third = bytes.u8()?;
159                if sec & 0xC0 != 0x80 || third & 0xC0 != 0x80 {
160                    return Err(Error);
161                }
162                match (byte, sec) {
163                    (0xE0, 0xA0..=0xBF)
164                    | (0xE1..=0xEC | 0xEE | 0xEF, 0x80..=0xBF)
165                    | (0xED, 0x80..=0x9F) => {
166                        len += 3;
167                    }
168                    (0xED, 0xA0..=0xAF) => {
169                        if bytes.u8()? != 0xED {
170                            return Err(Error);
171                        }
172                        match bytes.u8()? {
173                            0xB0..=0xBF => (),
174                            _ => return Err(Error),
175                        }
176                        if bytes.u8()? & 0xC0 != 0x80 {
177                            return Err(Error);
178                        }
179                        len += 4;
180                    }
181                    _ => return Err(Error),
182                }
183            }
184            _ => return Err(Error),
185        }
186    }
187    Ok(len)
188}
189
190/// # Safety
191pub unsafe fn decode_mutf8(bytes: &[u8], w: &mut UnsafeWriter) -> Result<(), Error> {
192    let mut index = 0;
193    let mut start = 0;
194
195    while let Some(&byte) = bytes.get(index) {
196        match byte {
197            0x01..=0x7F => index += 1,
198            0xC2..=0xDF => unsafe {
199                let sec = match bytes.get(index + 1) {
200                    Some(&byte) => byte,
201                    _ => return Err(Error),
202                };
203                index += 2;
204                if !(byte == 0xC0 && sec == 0x80) {
205                } else {
206                    w.write(bytes.get_unchecked(start..index));
207                    w.write_byte(b'\0');
208                    start = index;
209                }
210            },
211            0xE0..=0xEF => unsafe {
212                let sec = match bytes.get(index + 1) {
213                    Some(&byte) if byte & 0xC0 == 0x80 => byte,
214                    _ => return Err(Error),
215                };
216                let third = match bytes.get(index + 2) {
217                    Some(&byte) if byte & 0xC0 == 0x80 => byte,
218                    _ => return Err(Error),
219                };
220                match (byte, sec) {
221                    (0xE0, 0xA0..=0xBF)
222                    | (0xE1..=0xEC | 0xEE | 0xEF, 0x80..=0xBF)
223                    | (0xED, 0x80..=0x9F) => {
224                        index += 3;
225                    }
226                    (0xED, 0xA0..=0xAF) => {
227                        match bytes.get(index + 3) {
228                            Some(0xED) => (),
229                            _ => return Err(Error),
230                        };
231                        let fifth = match bytes.get(index + 4) {
232                            Some(&x @ 0xB0..=0xBF) => x & 0x3F,
233                            _ => return Err(Error),
234                        };
235                        let sixth = match bytes.get(index + 5) {
236                            Some(&x) if x & 0xC0 == 0x80 => x & 0x3F,
237                            _ => return Err(Error),
238                        };
239                        let s1 = 0xD000 | (u32::from(sec & 0x3F) << 6) | u32::from(third & 0x3F);
240                        let s2 = 0xD000 | (u32::from(fifth) << 6) | u32::from(sixth);
241                        let point = 0x10000 + (((s1 - 0xD800) << 10) | (s2 - 0xDC00));
242                        w.write(&[
243                            0xF0 | ((point & 0x1C0000) >> 18) as u8,
244                            0x80 | ((point & 0x3F000) >> 12) as u8,
245                            0x80 | ((point & 0xFC0) >> 6) as u8,
246                            0x80 | (point & 0x3F) as u8,
247                        ]);
248                    }
249                    _ => return Err(Error),
250                }
251            },
252            _ => return Err(Error),
253        }
254    }
255
256    unsafe { w.write(bytes.get_unchecked(start..index)) }
257
258    Ok(())
259}