ggstd/unicode/utf8/
utf8.rs

1// Copyright 2023 The rust-ggstd authors. All rights reserved.
2// Copyright 2009 The Go Authors. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
6// // The conditions RUNE_ERROR==unicode:REPLACEMENT_CHAR and
7// // MAX_RUNE==unicode.MAX_RUNE are verified in the tests.
8// // Defining them locally avoids this package depending on package unicode.
9
// Numbers fundamental to the encoding.

/// The "error" rune, also known as the Unicode replacement character.
pub const RUNE_ERROR: char = '\u{FFFD}';

/// Characters below RUNE_SELF are represented as themselves in a single byte.
pub const RUNE_SELF: char = '\u{80}';

/// Maximum valid Unicode code point.
pub const MAX_RUNE: char = '\u{10FFFF}';

/// Maximum number of bytes of a UTF-8 encoded Unicode character.
pub const UTFMAX: usize = 4;

/// First code point of the surrogate range; surrogates are not valid for UTF-8.
pub(crate) const SURROGATE_MIN: u32 = 0xD800;
/// Last code point of the surrogate range; surrogates are not valid for UTF-8.
pub(crate) const SURROGATE_MAX: u32 = 0xDFFF;

// Tag bits OR-ed into encoded bytes: TX marks a continuation byte, T2/T3/T4
// mark the leading byte of a 2-, 3- and 4-byte sequence (see encode_rune).
#[allow(unused)]
const T1: u8 = 0x00; // 0b0000_0000
const TX: u8 = 0x80; // 0b1000_0000
const T2: u8 = 0xC0; // 0b1100_0000
const T3: u8 = 0xE0; // 0b1110_0000
const T4: u8 = 0xF0; // 0b1111_0000
#[allow(unused)]
const T5: u8 = 0xF8; // 0b1111_1000

// Masks selecting the payload bits of a continuation byte (MASKX) and of the
// leading byte of a 2-, 3- and 4-byte sequence (see decode_rune).
const MASKX: u8 = 0x3F; // 0b0011_1111
const MASK2: u8 = 0x1F; // 0b0001_1111
const MASK3: u8 = 0x0F; // 0b0000_1111
const MASK4: u8 = 0x07; // 0b0000_0111

/// Largest code point encoded in one byte (7 payload bits).
pub(crate) const RUNE1_MAX: u32 = 0x7F;
/// Largest code point encoded in two bytes (11 payload bits).
pub(crate) const RUNE2_MAX: u32 = 0x7FF;
/// Largest code point encoded in three bytes (16 payload bits).
pub(crate) const RUNE3_MAX: u32 = 0xFFFF;

// The default lowest and highest continuation byte.
const LOCB: u8 = 0x80; // 0b1000_0000
const HICB: u8 = 0xBF; // 0b1011_1111

// The names of the constants below are chosen to give nice alignment in the
// FIRST table. The high nibble is an index into ACCEPT_RANGES, or 0xF for the
// special one-byte cases. The low nibble is the rune length, or the status
// for the special one-byte cases.

/// Invalid first byte: size 1.
const XX: u8 = 0xF1;
/// ASCII first byte: size 1.
const AS: u8 = 0xF0;
/// Accept range 0, size 2.
const S1: u8 = 0x02;
/// Accept range 1, size 3.
const S2: u8 = 0x13;
/// Accept range 0, size 3.
const S3: u8 = 0x03;
/// Accept range 2, size 3.
const S4: u8 = 0x23;
/// Accept range 3, size 4.
const S5: u8 = 0x34;
/// Accept range 0, size 4.
const S6: u8 = 0x04;
/// Accept range 4, size 4.
const S7: u8 = 0x44;
70
71/// first is information about the first byte in a UTF-8 sequence.
72const FIRST: [u8; 256] = [
73    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
74    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
75    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
76    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
77    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
78    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
79    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
80    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
81    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
82    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
83    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
84    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
85    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
86    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
87    XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
88    S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
89    S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
90    S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
91];
92
93/// acceptRange gives the range of valid values for the second byte in a UTF-8
94/// sequence.
95struct AcceptRange {
96    lo: u8, // lowest value for second byte.
97    hi: u8, // highest value for second byte.
98}
99
100/// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
101const ACCEPT_RANGES: [AcceptRange; 16] = [
102    AcceptRange { lo: LOCB, hi: HICB },
103    AcceptRange { lo: 0xA0, hi: HICB },
104    AcceptRange { lo: LOCB, hi: 0x9F },
105    AcceptRange { lo: 0x90, hi: HICB },
106    AcceptRange { lo: LOCB, hi: 0x8F },
107    AcceptRange { lo: 0, hi: 0 },
108    AcceptRange { lo: 0, hi: 0 },
109    AcceptRange { lo: 0, hi: 0 },
110    AcceptRange { lo: 0, hi: 0 },
111    AcceptRange { lo: 0, hi: 0 },
112    AcceptRange { lo: 0, hi: 0 },
113    AcceptRange { lo: 0, hi: 0 },
114    AcceptRange { lo: 0, hi: 0 },
115    AcceptRange { lo: 0, hi: 0 },
116    AcceptRange { lo: 0, hi: 0 },
117    AcceptRange { lo: 0, hi: 0 },
118];
119
120/// full_rune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
121/// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
122pub fn full_rune(p: &[u8]) -> bool {
123    let n = p.len();
124    if n == 0 {
125        return false;
126    }
127    let x = FIRST[p[0] as usize];
128    if n >= (x & 7) as usize {
129        return true; // ASCII, invalid or valid.
130    }
131    // Must be short or invalid.
132    let accept = &ACCEPT_RANGES[(x >> 4) as usize];
133    #[allow(clippy::if_same_then_else)]
134    if n > 1 && (p[1] < accept.lo || accept.hi < p[1]) {
135        return true;
136    } else if n > 2 && (p[2] < LOCB || HICB < p[2]) {
137        return true;
138    }
139    false
140}
141
142// // FullRuneInString is like full_rune but its input is a string.
143// fn FullRuneInString(s string) -> bool {
144// 	let n = len(s);
145// 	if n == 0 {
146// 		return false
147// 	}
148// 	x := first[s[0]]
149// 	if n >= int(x&7) {
150// 		return true // ASCII, invalid, or valid.
151// 	}
152// 	// Must be short or invalid.
153// 	accept := acceptRanges[x>>4]
154// 	if n > 1 && (s[1] < accept.lo || accept.hi < s[1]) {
155// 		return true
156// 	} else if n > 2 && (s[2] < locb || hicb < s[2]) {
157// 		return true
158// 	}
159// 	return false
160// }
161
162/// decode_rune unpacks the first UTF-8 encoding in p and returns the rune and
163/// its width in bytes. If p is empty it returns (RUNE_ERROR, 0). Otherwise, if
164/// the encoding is invalid, it returns (RUNE_ERROR, 1). Both are impossible
165/// results for correct, non-empty UTF-8.
166///
167/// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
168/// out of range, or is not the shortest possible UTF-8 encoding for the
169/// value. No other validation is performed.
170pub fn decode_rune(p: &[u8]) -> (char, usize) {
171    // fn decode_rune(p: &[u8]) (r rune, size int) {
172    let n = p.len();
173    if n < 1 {
174        return (RUNE_ERROR, 0);
175    }
176    let p0 = p[0];
177    let x = FIRST[p0 as usize];
178    if x >= AS {
179        // The following code simulates an additional check for x == XX and
180        // handling the ASCII and invalid cases accordingly. This mask-and-or
181        // approach prevents an additional branch.
182        let mask = (((x as i32) << 31) >> 31) as u32; // Create 0x0000 or 0xFFFF.
183        unsafe {
184            return (
185                char::from_u32_unchecked(((p[0] as u32) & !mask) | (RUNE_ERROR as u32 & mask)),
186                1,
187            );
188        }
189    }
190    let sz = (x & 7) as usize;
191    let accept = &ACCEPT_RANGES[(x >> 4) as usize];
192    if n < sz {
193        return (RUNE_ERROR, 1);
194    }
195    let b1 = p[1];
196    if b1 < accept.lo || accept.hi < b1 {
197        return (RUNE_ERROR, 1);
198    }
199    if sz <= 2 {
200        // <= instead of == to help the compiler eliminate some bounds checks
201        unsafe {
202            return (
203                char::from_u32_unchecked((((p0 & MASK2) as u32) << 6) | (b1 & MASKX) as u32),
204                2,
205            );
206        }
207    }
208    let b2 = p[2];
209    if !(LOCB..=HICB).contains(&b2) {
210        return (RUNE_ERROR, 1);
211    }
212    if sz <= 3 {
213        unsafe {
214            return (
215                char::from_u32_unchecked(
216                    (((p0 & MASK3) as u32) << 12)
217                        | (((b1 & MASKX) as u32) << 6)
218                        | (b2 & MASKX) as u32,
219                ),
220                3,
221            );
222        }
223    }
224    let b3 = p[3];
225    if !(LOCB..=HICB).contains(&b3) {
226        return (RUNE_ERROR, 1);
227    }
228    unsafe {
229        (
230            char::from_u32_unchecked(
231                (((p0 & MASK4) as u32) << 18)
232                    | (((b1 & MASKX) as u32) << 12)
233                    | (((b2 & MASKX) as u32) << 6)
234                    | ((b3 & MASKX) as u32),
235            ),
236            4,
237        )
238    }
239}
240
241/// decode_rune_in_string is like decode_rune but its input is a string. If s is
242/// empty it returns (RUNE_ERROR, 0). Otherwise, if the encoding is invalid, it
243/// returns (RUNE_ERROR, 1). Both are impossible results for correct, non-empty
244/// UTF-8.
245///
246/// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
247/// out of range, or is not the shortest possible UTF-8 encoding for the
248/// value. No other validation is performed.
249pub fn decode_rune_in_string(s: &str) -> (char, usize) {
250    decode_rune(s.as_bytes())
251    // 	let n = len(s);
252    // 	if n < 1 {
253    // 		return RUNE_ERROR, 0
254    // 	}
255    // 	s0 := s[0]
256    // 	x := first[s0]
257    // 	if x >= as {
258    // 		// The following code simulates an additional check for x == XX and
259    // 		// handling the ASCII and invalid cases accordingly. This mask-and-or
260    // 		// approach prevents an additional branch.
261    // 		mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
262    // 		return rune(s[0])&^mask | RUNE_ERROR&mask, 1
263    // 	}
264    // 	sz := int(x & 7)
265    // 	accept := acceptRanges[x>>4]
266    // 	if n < sz {
267    // 		return RUNE_ERROR, 1
268    // 	}
269    // 	s1 := s[1]
270    // 	if s1 < accept.lo || accept.hi < s1 {
271    // 		return RUNE_ERROR, 1
272    // 	}
273    // 	if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
274    // 		return rune(s0&mask2)<<6 | rune(s1&maskx), 2
275    // 	}
276    // 	s2 := s[2]
277    // 	if s2 < locb || hicb < s2 {
278    // 		return RUNE_ERROR, 1
279    // 	}
280    // 	if sz <= 3 {
281    // 		return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
282    // 	}
283    // 	s3 := s[3]
284    // 	if s3 < locb || hicb < s3 {
285    // 		return RUNE_ERROR, 1
286    // 	}
287    // 	return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
288}
289
290// // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
291// // its width in bytes. If p is empty it returns (RUNE_ERROR, 0). Otherwise, if
292// // the encoding is invalid, it returns (RUNE_ERROR, 1). Both are impossible
293// // results for correct, non-empty UTF-8.
294// //
295// // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
296// // out of range, or is not the shortest possible UTF-8 encoding for the
297// // value. No other validation is performed.
298// fn DecodeLastRune(p: &[u8]) (r rune, size int) {
299// 	end := p.len()
300// 	if end == 0 {
301// 		return RUNE_ERROR, 0
302// 	}
303// 	start := end - 1
304// 	r = rune(p[start])
305// 	if r < RUNE_SELF {
306// 		return r, 1
307// 	}
308// 	// guard against O(n^2) behavior when traversing
309// 	// backwards through strings with long sequences of
310// 	// invalid UTF-8.
311// 	lim := end - UTFMAX
312// 	if lim < 0 {
313// 		lim = 0
314// 	}
315// 	for start--; start >= lim; start-- {
316// 		if RuneStart(p[start]) {
317// 			break
318// 		}
319// 	}
320// 	if start < 0 {
321// 		start = 0
322// 	}
323// 	r, size = decode_rune(p[start:end])
324// 	if start+size != end {
325// 		return RUNE_ERROR, 1
326// 	}
327// 	return r, size
328// }
329
330// // DecodeLastRuneInString is like DecodeLastRune but its input is a string. If
331// // s is empty it returns (RUNE_ERROR, 0). Otherwise, if the encoding is invalid,
332// // it returns (RUNE_ERROR, 1). Both are impossible results for correct,
333// // non-empty UTF-8.
334// //
335// // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
336// // out of range, or is not the shortest possible UTF-8 encoding for the
337// // value. No other validation is performed.
338// fn DecodeLastRuneInString(s string) (r rune, size int) {
339// 	end := len(s)
340// 	if end == 0 {
341// 		return RUNE_ERROR, 0
342// 	}
343// 	start := end - 1
344// 	r = rune(s[start])
345// 	if r < RUNE_SELF {
346// 		return r, 1
347// 	}
348// 	// guard against O(n^2) behavior when traversing
349// 	// backwards through strings with long sequences of
350// 	// invalid UTF-8.
351// 	lim := end - UTFMAX
352// 	if lim < 0 {
353// 		lim = 0
354// 	}
355// 	for start--; start >= lim; start-- {
356// 		if RuneStart(s[start]) {
357// 			break
358// 		}
359// 	}
360// 	if start < 0 {
361// 		start = 0
362// 	}
363// 	r, size = decode_rune_in_string(s[start:end])
364// 	if start+size != end {
365// 		return RUNE_ERROR, 1
366// 	}
367// 	return r, size
368// }
369
370/// rune_len returns the number of bytes required to encode the rune.
371/// It returns -1 if the rune is not a valid value to encode in UTF-8.
372pub fn rune_len(r: u32) -> isize {
373    if r <= RUNE1_MAX {
374        return 1;
375    } else if r <= RUNE2_MAX {
376        return 2;
377    } else if (SURROGATE_MIN..=SURROGATE_MAX).contains(&r) {
378        return -1;
379    } else if r <= RUNE3_MAX {
380        return 3;
381    } else if r <= MAX_RUNE as u32 {
382        return 4;
383    }
384    -1
385}
386
387/// encode_rune writes into p (which must be large enough) the UTF-8 encoding of the rune.
388/// If the rune is out of range, it writes the encoding of RUNE_ERROR.
389/// It returns the number of bytes written.
390pub fn encode_rune(p: &mut [u8], r: u32) -> usize {
391    let mut r = r;
392    if r <= RUNE1_MAX {
393        p[0] = r as u8;
394        1
395    } else if r <= RUNE2_MAX {
396        // 		_ = p[1] // eliminate bounds checks
397        p[0] = T2 | (r >> 6) as u8;
398        p[1] = TX | (r as u8) & MASKX;
399        return 2;
400    } else {
401        if (r > (MAX_RUNE as u32)) || (SURROGATE_MIN..=SURROGATE_MAX).contains(&r) {
402            r = RUNE_ERROR as u32;
403        }
404        if r <= RUNE3_MAX {
405            // 		_ = p[2] // eliminate bounds checks
406            p[0] = T3 | (r >> 12) as u8;
407            p[1] = TX | (r >> 6) as u8 & MASKX;
408            p[2] = TX | (r as u8) & MASKX;
409            return 3;
410        } else {
411            // 		_ = p[3] // eliminate bounds checks
412            p[0] = T4 | (r >> 18) as u8;
413            p[1] = TX | (r >> 12) as u8 & MASKX;
414            p[2] = TX | (r >> 6) as u8 & MASKX;
415            p[3] = TX | (r as u8) & MASKX;
416            return 4;
417        }
418    }
419}
420
421// // AppendRune appends the UTF-8 encoding of r to the end of p and
422// // returns the extended buffer. If the rune is out of range,
423// // it appends the encoding of RUNE_ERROR.
424// fn AppendRune(p []byte, r rune) []byte {
425// 	// This function is inlineable for fast handling of ASCII.
426// 	if uint32(r) <= RUNE1_MAX {
427// 		return append(p, byte(r))
428// 	}
429// 	return appendRuneNonASCII(p, r)
430// }
431
432// fn appendRuneNonASCII(p []byte, r rune) []byte {
433// 	// Negative values are erroneous. Making it unsigned addresses the problem.
434// 	switch i := uint32(r); {
435// 	case i <= RUNE2_MAX:
436// 		return append(p, t2|byte(r>>6), tx|byte(r)&maskx)
437// 	case i > MAX_RUNE, SURROGATE_MIN <= i && i <= SURROGATE_MAX:
438// 		r = RUNE_ERROR
439// 		fallthrough
440// 	case i <= rune3Max:
441// 		return append(p, t3|byte(r>>12), tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
442// 	default:
443// 		return append(p, t4|byte(r>>18), tx|byte(r>>12)&maskx, tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
444// 	}
445// }
446
447/// rune_count returns the number of runes in p. Erroneous and short
448/// encodings are treated as single runes of width 1 byte.
449pub fn rune_count(p: &[u8]) -> usize {
450    let np = p.len();
451    let mut n = 0;
452    let mut i = 0;
453    while i < np {
454        n += 1;
455        let c = p[i];
456        if (c as u32) < (RUNE_SELF as u32) {
457            // ASCII fast path
458            i += 1;
459            continue;
460        }
461        let x = FIRST[c as usize];
462        if x == XX {
463            i += 1; // invalid.
464            continue;
465        }
466        let mut size = (x & 7) as usize;
467        if i + size > np {
468            i += 1; // Short or invalid.
469            continue;
470        }
471        let accept = &ACCEPT_RANGES[(x >> 4) as usize];
472        let c = p[i + 1];
473        if c < accept.lo || accept.hi < c {
474            size = 1
475        } else if size == 2 {
476        } else {
477            let c = p[i + 2];
478            if !(LOCB..=HICB).contains(&c) {
479                size = 1
480            } else if size == 3 {
481            } else {
482                let c = p[i + 3];
483                if !(LOCB..=HICB).contains(&c) {
484                    size = 1;
485                }
486            }
487        }
488        i += size;
489    }
490    n
491}
492
493// // RuneCountInString is like rune_count but its input is a string.
494// fn RuneCountInString(s string) (n int) {
495// 	ns := len(s)
496// 	for i := 0; i < ns; n++ {
497// 		c := s[i]
498// 		if c < RUNE_SELF {
499// 			// ASCII fast path
500// 			i++
501// 			continue
502// 		}
503// 		x := FIRST[c]
504// 		if x == XX {
505// 			i++ // invalid.
506// 			continue
507// 		}
508// 		size := int(x & 7)
509// 		if i+size > ns {
510// 			i++ // Short or invalid.
511// 			continue
512// 		}
513// 		accept := acceptRanges[x>>4]
514// 		if c := s[i+1]; c < accept.lo || accept.hi < c {
515// 			size = 1
516// 		} else if size == 2 {
517// 		} else if c := s[i+2]; c < locb || hicb < c {
518// 			size = 1
519// 		} else if size == 3 {
520// 		} else if c := s[i+3]; c < locb || hicb < c {
521// 			size = 1
522// 		}
523// 		i += size
524// 	}
525// 	return n
526// }
527
528// // RuneStart reports whether the byte could be the first byte of an encoded,
529// // possibly invalid rune. Second and subsequent bytes always have the top two
530// // bits set to 10.
531// fn RuneStart(b byte) -> bool { return b&0xC0 != 0x80 }
532
533// // Valid reports whether p consists entirely of valid UTF-8-encoded runes.
534// fn Valid(p: &[u8]) -> bool {
535// 	// This optimization avoids the need to recompute the capacity
536// 	// when generating code for p[8:], bringing it to parity with
537// 	// ValidString, which was 20% faster on long ASCII strings.
538// 	p = p[:p.len():p.len()]
539
540// 	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
541// 	for p.len() >= 8 {
542// 		// Combining two 32 bit loads allows the same code to be used
543// 		// for 32 and 64 bit platforms.
544// 		// The compiler can generate a 32bit load for first32 and second32
545// 		// on many platforms. See test/codegen/memcombine.go.
546// 		first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
547// 		second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
548// 		if (first32|second32)&0x80808080 != 0 {
549// 			// Found a non ASCII byte (>= RUNE_SELF).
550// 			break
551// 		}
552// 		p = p[8:]
553// 	}
554// 	let n = p.len();
555// 	for i := 0; i < n; {
556// 		pi := p[i]
557// 		if pi < RUNE_SELF {
558// 			i++
559// 			continue
560// 		}
561// 		x := first[pi]
562// 		if x == XX {
563// 			return false // Illegal starter byte.
564// 		}
565// 		size := int(x & 7)
566// 		if i+size > n {
567// 			return false // Short or invalid.
568// 		}
569// 		accept := acceptRanges[x>>4]
570// 		if c := p[i+1]; c < accept.lo || accept.hi < c {
571// 			return false
572// 		} else if size == 2 {
573// 		} else if c := p[i+2]; c < locb || hicb < c {
574// 			return false
575// 		} else if size == 3 {
576// 		} else if c := p[i+3]; c < locb || hicb < c {
577// 			return false
578// 		}
579// 		i += size
580// 	}
581// 	return true
582// }
583
584// // ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
585// fn ValidString(s string) -> bool {
586// 	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
587// 	for len(s) >= 8 {
588// 		// Combining two 32 bit loads allows the same code to be used
589// 		// for 32 and 64 bit platforms.
590// 		// The compiler can generate a 32bit load for first32 and second32
591// 		// on many platforms. See test/codegen/memcombine.go.
592// 		first32 := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
593// 		second32 := uint32(s[4]) | uint32(s[5])<<8 | uint32(s[6])<<16 | uint32(s[7])<<24
594// 		if (first32|second32)&0x80808080 != 0 {
595// 			// Found a non ASCII byte (>= RUNE_SELF).
596// 			break
597// 		}
598// 		s = s[8:]
599// 	}
600// 	let n = len(s);
601// 	for i := 0; i < n; {
602// 		si := s[i]
603// 		if si < RUNE_SELF {
604// 			i++
605// 			continue
606// 		}
607// 		x := first[si]
608// 		if x == XX {
609// 			return false // Illegal starter byte.
610// 		}
611// 		size := int(x & 7)
612// 		if i+size > n {
613// 			return false // Short or invalid.
614// 		}
615// 		accept := acceptRanges[x>>4]
616// 		if c := s[i+1]; c < accept.lo || accept.hi < c {
617// 			return false
618// 		} else if size == 2 {
619// 		} else if c := s[i+2]; c < locb || hicb < c {
620// 			return false
621// 		} else if size == 3 {
622// 		} else if c := s[i+3]; c < locb || hicb < c {
623// 			return false
624// 		}
625// 		i += size
626// 	}
627// 	return true
628// }
629
630/// ValidRune reports whether r can be legally encoded as UTF-8.
631/// Code points that are out of range or a surrogate half are illegal.
632pub fn valid_rune(r: u32) -> bool {
633    (r < SURROGATE_MIN) || (SURROGATE_MAX < r && r <= (MAX_RUNE as u32))
634}
ggstd/unicode/utf8/utf8.rs

ggstd/unicode/utf8/
utf8.rs