1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
// devela::text::unicode::scalar::namespace::byte
use crate::{Char, is};
/// # Methods over `u8`.
#[rustfmt::skip]
impl Char<u8> {
    /* private helpers */

    /// Lookup table from a leading byte to the length, in bytes, of the UTF-8
    /// sequence it introduces. A `0` entry marks a byte that cannot start a sequence.
    ///
    /// Rows are selected by the high nibble of the byte, columns by the low nibble.
    //
    // References:
    // - https://tools.ietf.org/html/rfc3629
    // - https://github.com/rust-lang/rust/blob/master/library/core/src/str/validations.rs
    pub(crate) const UTF8_CHAR_LEN: &[u8; 256] = &[
    //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0  0x00..=0x7F => 1
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8  0x80..=0xBF => 0 (continuation bytes)
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
        0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C  0xC0, 0xC1 => 0; 0xC2..=0xDF => 2
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E  0xE0..=0xEF => 3
        4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // F  0xF0..=0xF4 => 4; 0xF5..=0xFF => 0
    ];

    /* public methods */

    /// Returns the byte length of the UTF-8 sequence that starts with this byte,
    /// or `None` if this byte cannot start a sequence.
    ///
    /// LUT based (256-byte array).
    #[must_use]
    pub const fn len_utf8(self) -> Option<usize> {
        let len = self.len_utf8_unchecked();
        // A zero entry in the table is the "invalid leading byte" marker.
        is![len != 0, Some(len), None]
    }

    /// Returns the byte length of the UTF-8 sequence that starts with this byte,
    /// or `0` if this byte cannot start a sequence.
    ///
    /// LUT based (256-byte array).
    #[must_use]
    pub const fn len_utf8_unchecked(self) -> usize {
        // A `u8` index can never exceed the 256 entries of the table.
        let index = self.0 as usize;
        let width: u8 = Self::UTF8_CHAR_LEN[index];
        width as usize
    }

    /// Returns the byte length of the UTF-8 sequence that starts with this byte,
    /// or `None` if this byte cannot start a sequence.
    ///
    /// Match based, for when memory accesses are more expensive than branches.
    #[must_use]
    pub const fn len_utf8_match(self) -> Option<usize> {
        // Same mapping as Self::UTF8_CHAR_LEN, spelled out exhaustively.
        match self.0 {
            0x00..=0x7F => Some(1),
            0x80..=0xC1 => None, // continuation bytes, plus the invalid 0xC0 and 0xC1
            0xC2..=0xDF => Some(2),
            0xE0..=0xEF => Some(3),
            0xF0..=0xF4 => Some(4),
            0xF5..=0xFF => None, // invalid leading bytes
        }
    }

    /// Returns the byte length suggested by the bit pattern of this leading byte,
    /// or `0` if the pattern is not that of a leading byte.
    ///
    /// Match based, for when memory accesses are more expensive than branches.
    ///
    /// This function does **not** validate UTF-8 but determines how many bytes
    /// a sequence **should** occupy based on the leading byte alone. Unlike
    /// [`len_utf8_match`][Self::len_utf8_match] it also reports a length for
    /// `0xC0`, `0xC1` and `0xF5..=0xF7`.
    ///
    /// ### Caveat
    /// - If used on malformed UTF-8, it may suggest a length longer than the actual valid sequence.
    /// - Always use in conjunction with proper UTF-8 validation if handling untrusted input.
    #[must_use]
    pub const fn len_utf8_match_naive(self) -> usize {
        match self.0 {
            0x00..=0x7F => 1, // 0xxxxxxx: 1-byte ASCII
            0x80..=0xBF => 0, // 10xxxxxx: not a leading byte
            0xC0..=0xDF => 2, // 110xxxxx: 2-byte sequence
            0xE0..=0xEF => 3, // 1110xxxx: 3-byte sequence
            0xF0..=0xF7 => 4, // 11110xxx: 4-byte sequence
            0xF8..=0xFF => 0, // 11111xxx: not a leading byte
        }
    }

    /// Returns `true` if this byte is not a UTF-8 continuation byte.
    ///
    /// Every byte whose two highest bits are not `10` is accepted. That covers
    /// ASCII and all leading-byte patterns, including the bytes for which
    /// [`len_utf8`][Self::len_utf8] returns `None` (`0xC0`, `0xC1`, `0xF5..=0xFF`).
    #[must_use] #[inline(always)]
    pub const fn is_utf8_boundary(self) -> bool {
        // Equivalent to: b < 128 || b >= 192.
        (self.0 & 0b1100_0000) != 0b1000_0000
    }

    /// Returns `true` if this byte is a UTF-8 continuation byte.
    ///
    /// Continuation bytes have the bit pattern `10xxxxxx`.
    #[must_use] #[inline(always)]
    pub const fn is_utf8_continuation(self) -> bool {
        (self.0 & 0b1100_0000) == 0b1000_0000
    }

    /// Converts this byte to the `char` with the same numeric value.
    ///
    /// See [`char::from(u8)`][char#impl-From<u8>-for-char].
    #[must_use] #[inline(always)]
    pub const fn as_char(self) -> char {
        let byte: u8 = self.0;
        byte as char
    }
}