avr_progmem/string/
validations.rs

// This file is a partial copy of the str/validations.rs of the Rust core lib.
//
// A copy was needed, because the original `next_code_point` takes an iterator
// of `&u8`, which is not an option for us, because we only have `u8` by-value.
//
// Source:
// https://github.com/rust-lang/rust/blob/03b17b181af4945fa24e0df79676e89454546440/library/core/src/str/validations.rs
8
9
/// Mask of the value bits of a continuation byte.
///
/// AND-ing a byte with this mask keeps its low six bits and clears the two
/// high bits.
const CONT_MASK: u8 = 0b0011_1111;
12
/// Returns the initial codepoint accumulator for the first byte.
///
/// The first byte is special: only its bottom 5 bits are wanted for width 2,
/// 4 bits for width 3, and 3 bits for width 4. The mask is obtained by
/// shifting `0x7F` right by `width`, so each extra unit of `width` drops one
/// more high bit from `byte`.
///
/// `width` is expected to be in `2..=4`; the mask is a `u8`, so a shift of 8
/// or more would overflow it.
#[inline]
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
	(byte & (0x7F >> width)) as u32
}
20
21/// Returns the value of `ch` updated with continuation byte `byte`.
22#[inline]
23const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
24	(ch << 6) | (byte & CONT_MASK) as u32
25}
26
27
28/// Reads the next code point out of a byte iterator (assuming a
29/// UTF-8-like encoding).
30///
31/// # Safety
32///
33/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
34#[inline]
35pub(super) unsafe fn next_code_point<I: Iterator<Item = u8>>(bytes: &mut I) -> Option<u32> {
36	// Decode UTF-8
37	let x = bytes.next()?;
38	if x < 128 {
39		return Some(x as u32);
40	}
41
42	// Multibyte case follows
43	// Decode from a byte combination out of: [[[x y] z] w]
44	// NOTE: Performance is sensitive to the exact formulation here
45	let init = utf8_first_byte(x, 2);
46	// SAFETY: `bytes` produces an UTF-8-like string,
47	// so the iterator must produce a value here.
48	let y = bytes.next().unwrap();
49	let mut ch = utf8_acc_cont_byte(init, y);
50	if x >= 0xE0 {
51		// [[x y z] w] case
52		// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
53		// SAFETY: `bytes` produces an UTF-8-like string,
54		// so the iterator must produce a value here.
55		let z = bytes.next().unwrap();
56		let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
57		ch = init << 12 | y_z;
58		if x >= 0xF0 {
59			// [x y z w] case
60			// use only the lower 3 bits of `init`
61			// SAFETY: `bytes` produces an UTF-8-like string,
62			// so the iterator must produce a value here.
63			let w = bytes.next().unwrap();
64			ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
65		}
66	}
67
68	Some(ch)
69}