1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
/// Incremental UTF-8 decoder that assembles characters from bytes fed one at a time.
///
/// Feed each incoming byte to [`Utf8Accum::push_byte`]. Whenever a byte completes a
/// well-formed UTF-8 sequence, the decoded character is handed back as a one-character
/// `&str` borrowed from the internal buffer. Ill-formed input is dropped silently.
#[derive(Debug, Default)]
pub struct Utf8Accum {
    /// Buffer for UTF-8 octets aggregation until a full UTF-8 char is received
    buffer: [u8; 4],
    /// How many more UTF-8 octets are expected
    expected: u8,
    /// How many UTF-8 octets are in the buffer
    partial: u8,
}

impl Utf8Accum {
    /// Feeds one byte into the decoder.
    ///
    /// Returns `Some(s)` when `byte` completes a well-formed UTF-8 sequence; `s` then
    /// contains exactly one character and borrows the internal buffer, so it must be
    /// consumed before the next call. Returns `None` while a multi-byte sequence is
    /// still incomplete and whenever the input is ill-formed.
    ///
    /// Ill-formed input never reaches the returned `&str`:
    /// * bytes that cannot appear in UTF-8 (`0xC0`, `0xC1`, `0xF5..=0xFF`) and
    ///   continuation bytes without a pending lead byte discard any partial sequence;
    /// * completed sequences that are overlong, encode a UTF-16 surrogate or exceed
    ///   U+10FFFF are discarded;
    /// * a new lead byte or an ASCII byte arriving mid-sequence abandons the
    ///   unfinished sequence and starts over with the new byte.
    pub fn push_byte(&mut self, byte: u8) -> Option<&str> {
        // Bytes are supposed to be human input so it's okay to be not blazing fast:
        // the cheap structural checks happen here, the exact well-formedness check is
        // delegated to `core::str::from_utf8` once a sequence is complete.
        let len = match byte {
            0x00..=0x7F => {
                // ASCII is a complete character on its own and cancels anything pending.
                self.reset();
                self.buffer[0] = byte;
                1
            }
            // 0xC0 and 0xC1 are deliberately excluded: they can only start overlong
            // encodings of ASCII.
            0xC2..=0xDF => {
                self.start_sequence(byte, 1);
                return None;
            }
            0xE0..=0xEF => {
                self.start_sequence(byte, 2);
                return None;
            }
            // 0xF5..=0xF7 are excluded: they would encode values above U+10FFFF.
            0xF0..=0xF4 => {
                self.start_sequence(byte, 3);
                return None;
            }
            0x80..=0xBF if self.expected > 0 => {
                // `partial` is at most 3 here: a lead byte sets it to 1 and at most
                // three continuation bytes follow, so the index stays in bounds.
                self.buffer[usize::from(self.partial)] = byte;
                self.partial += 1;
                self.expected -= 1;
                if self.expected > 0 {
                    return None;
                }
                let len = usize::from(self.partial);
                self.partial = 0;
                len
            }
            _ => {
                // Stray continuation byte or a byte that is never valid in UTF-8.
                // Drop the partial sequence so that later continuation bytes are not
                // glued onto a lead byte they do not belong to.
                self.reset();
                return None;
            }
        };

        // Safe validation rejects overlong forms, surrogates and out-of-range values
        // that the range checks above cannot detect.
        core::str::from_utf8(&self.buffer[..len]).ok()
    }

    /// Stores `lead` as the first octet of a new sequence that needs `continuations`
    /// more octets, replacing any unfinished sequence.
    fn start_sequence(&mut self, lead: u8, continuations: u8) {
        self.buffer[0] = lead;
        self.partial = 1;
        self.expected = continuations;
    }

    /// Forgets any partially accumulated sequence.
    fn reset(&mut self) {
        self.partial = 0;
        self.expected = 0;
    }
}
#[cfg(test)]
mod tests {
    use std::string::String;

    use crate::utf8::Utf8Accum;

    /// Feeds a string mixing 1-, 2-, 3- and 4-byte characters through the accumulator
    /// byte by byte and checks that the emitted pieces reassemble the original text.
    #[test]
    fn utf8_support() {
        let sample = "abcdабвг佐佗佟𑿁𑿆𑿌";
        let mut decoder = Utf8Accum::default();
        let mut collected = String::new();

        for octet in sample.bytes() {
            // `Option<&str>` yields zero or one item, so `extend` appends the decoded
            // character only when one has just been completed.
            collected.extend(decoder.push_byte(octet));
        }

        assert_eq!(collected, sample);
    }
}