ggstd/unicode/utf8/utf8.rs
1// Copyright 2023 The rust-ggstd authors. All rights reserved.
2// Copyright 2009 The Go Authors. All rights reserved.
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file.
5
// // The conditions RUNE_ERROR == unicode::REPLACEMENT_CHAR and
// // MAX_RUNE == unicode::MAX_RUNE are verified in the tests.
// // Defining them locally avoids this package depending on package unicode.
9
10/// Numbers fundamental to the encoding.
11pub const RUNE_ERROR: char = '\u{FFFD}'; // the "error" Rune or "Unicode replacement character"
12
13/// Characters below RUNE_SELF are represented as themselves in a single byte.
14pub const RUNE_SELF: char = 0x80 as char;
15
16/// Maximum valid Unicode code point.
17pub const MAX_RUNE: char = '\u{10FFFF}';
18
19/// Maximum number of bytes of a UTF-8 encoded Unicode character.
20pub const UTFMAX: usize = 4;
21
22/// Code points in the surrogate range are not valid for UTF-8.
23pub(crate) const SURROGATE_MIN: u32 = 0xD800;
24pub(crate) const SURROGATE_MAX: u32 = 0xDFFF;
25
26#[allow(unused)]
27const T1: u8 = 0b00000000;
28const TX: u8 = 0b10000000;
29const T2: u8 = 0b11000000;
30const T3: u8 = 0b11100000;
31const T4: u8 = 0b11110000;
32#[allow(unused)]
33const T5: u8 = 0b11111000;
34
35const MASKX: u8 = 0b00111111;
36const MASK2: u8 = 0b00011111;
37const MASK3: u8 = 0b00001111;
38const MASK4: u8 = 0b00000111;
39
40pub(crate) const RUNE1_MAX: u32 = (1 << 7) - 1;
41pub(crate) const RUNE2_MAX: u32 = (1 << 11) - 1;
42pub(crate) const RUNE3_MAX: u32 = 0xffff;
43
44// The default lowest and highest continuation byte.
45const LOCB: u8 = 0b10000000;
46const HICB: u8 = 0b10111111;
47
48// These names of these constants are chosen to give nice alignment in the
49// table below. The first nibble is an index into acceptRanges or F for
50// special one-byte cases. The second nibble is the Rune length or the
51// Status for the special one-byte case.
52/// invalid: size 1
53const XX: u8 = 0xF1;
54/// ASCII: size 1
55const AS: u8 = 0xF0;
56/// accept 0, size 2
57const S1: u8 = 0x02;
58/// accept 1, size 3
59const S2: u8 = 0x13;
60/// accept 0, size 3
61const S3: u8 = 0x03;
62/// accept 2, size 3
63const S4: u8 = 0x23;
64/// accept 3, size 4
65const S5: u8 = 0x34;
66/// accept 0, size 4
67const S6: u8 = 0x04;
68/// accept 4, size 4
69const S7: u8 = 0x44;
70
/// FIRST is information about the first byte in a UTF-8 sequence.
///
/// The table is indexed by the value of the leading byte. For each entry the
/// high nibble is used as an index into ACCEPT_RANGES (`x >> 4`) and the low
/// bits give the sequence length (`x & 7`); AS and XX mark the one-byte
/// ASCII and invalid cases.
const FIRST: [u8; 256] = [
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
    AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
    XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
    XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
    S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
    S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
    S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
];
92
/// AcceptRange gives the range of valid values for the second byte in a UTF-8
/// sequence. Both bounds are inclusive: callers reject a byte `b` only when
/// `b < lo` or `hi < b`.
struct AcceptRange {
    /// Lowest value for second byte.
    lo: u8,
    /// Highest value for second byte.
    hi: u8,
}
99
100/// acceptRanges has size 16 to avoid bounds checks in the code that uses it.
101const ACCEPT_RANGES: [AcceptRange; 16] = [
102 AcceptRange { lo: LOCB, hi: HICB },
103 AcceptRange { lo: 0xA0, hi: HICB },
104 AcceptRange { lo: LOCB, hi: 0x9F },
105 AcceptRange { lo: 0x90, hi: HICB },
106 AcceptRange { lo: LOCB, hi: 0x8F },
107 AcceptRange { lo: 0, hi: 0 },
108 AcceptRange { lo: 0, hi: 0 },
109 AcceptRange { lo: 0, hi: 0 },
110 AcceptRange { lo: 0, hi: 0 },
111 AcceptRange { lo: 0, hi: 0 },
112 AcceptRange { lo: 0, hi: 0 },
113 AcceptRange { lo: 0, hi: 0 },
114 AcceptRange { lo: 0, hi: 0 },
115 AcceptRange { lo: 0, hi: 0 },
116 AcceptRange { lo: 0, hi: 0 },
117 AcceptRange { lo: 0, hi: 0 },
118];
119
120/// full_rune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
121/// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
122pub fn full_rune(p: &[u8]) -> bool {
123 let n = p.len();
124 if n == 0 {
125 return false;
126 }
127 let x = FIRST[p[0] as usize];
128 if n >= (x & 7) as usize {
129 return true; // ASCII, invalid or valid.
130 }
131 // Must be short or invalid.
132 let accept = &ACCEPT_RANGES[(x >> 4) as usize];
133 #[allow(clippy::if_same_then_else)]
134 if n > 1 && (p[1] < accept.lo || accept.hi < p[1]) {
135 return true;
136 } else if n > 2 && (p[2] < LOCB || HICB < p[2]) {
137 return true;
138 }
139 false
140}
141
142// // FullRuneInString is like full_rune but its input is a string.
143// fn FullRuneInString(s string) -> bool {
144// let n = len(s);
145// if n == 0 {
146// return false
147// }
148// x := first[s[0]]
149// if n >= int(x&7) {
150// return true // ASCII, invalid, or valid.
151// }
152// // Must be short or invalid.
153// accept := acceptRanges[x>>4]
154// if n > 1 && (s[1] < accept.lo || accept.hi < s[1]) {
155// return true
156// } else if n > 2 && (s[2] < locb || hicb < s[2]) {
157// return true
158// }
159// return false
160// }
161
162/// decode_rune unpacks the first UTF-8 encoding in p and returns the rune and
163/// its width in bytes. If p is empty it returns (RUNE_ERROR, 0). Otherwise, if
164/// the encoding is invalid, it returns (RUNE_ERROR, 1). Both are impossible
165/// results for correct, non-empty UTF-8.
166///
167/// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
168/// out of range, or is not the shortest possible UTF-8 encoding for the
169/// value. No other validation is performed.
170pub fn decode_rune(p: &[u8]) -> (char, usize) {
171 // fn decode_rune(p: &[u8]) (r rune, size int) {
172 let n = p.len();
173 if n < 1 {
174 return (RUNE_ERROR, 0);
175 }
176 let p0 = p[0];
177 let x = FIRST[p0 as usize];
178 if x >= AS {
179 // The following code simulates an additional check for x == XX and
180 // handling the ASCII and invalid cases accordingly. This mask-and-or
181 // approach prevents an additional branch.
182 let mask = (((x as i32) << 31) >> 31) as u32; // Create 0x0000 or 0xFFFF.
183 unsafe {
184 return (
185 char::from_u32_unchecked(((p[0] as u32) & !mask) | (RUNE_ERROR as u32 & mask)),
186 1,
187 );
188 }
189 }
190 let sz = (x & 7) as usize;
191 let accept = &ACCEPT_RANGES[(x >> 4) as usize];
192 if n < sz {
193 return (RUNE_ERROR, 1);
194 }
195 let b1 = p[1];
196 if b1 < accept.lo || accept.hi < b1 {
197 return (RUNE_ERROR, 1);
198 }
199 if sz <= 2 {
200 // <= instead of == to help the compiler eliminate some bounds checks
201 unsafe {
202 return (
203 char::from_u32_unchecked((((p0 & MASK2) as u32) << 6) | (b1 & MASKX) as u32),
204 2,
205 );
206 }
207 }
208 let b2 = p[2];
209 if !(LOCB..=HICB).contains(&b2) {
210 return (RUNE_ERROR, 1);
211 }
212 if sz <= 3 {
213 unsafe {
214 return (
215 char::from_u32_unchecked(
216 (((p0 & MASK3) as u32) << 12)
217 | (((b1 & MASKX) as u32) << 6)
218 | (b2 & MASKX) as u32,
219 ),
220 3,
221 );
222 }
223 }
224 let b3 = p[3];
225 if !(LOCB..=HICB).contains(&b3) {
226 return (RUNE_ERROR, 1);
227 }
228 unsafe {
229 (
230 char::from_u32_unchecked(
231 (((p0 & MASK4) as u32) << 18)
232 | (((b1 & MASKX) as u32) << 12)
233 | (((b2 & MASKX) as u32) << 6)
234 | ((b3 & MASKX) as u32),
235 ),
236 4,
237 )
238 }
239}
240
241/// decode_rune_in_string is like decode_rune but its input is a string. If s is
242/// empty it returns (RUNE_ERROR, 0). Otherwise, if the encoding is invalid, it
243/// returns (RUNE_ERROR, 1). Both are impossible results for correct, non-empty
244/// UTF-8.
245///
246/// An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
247/// out of range, or is not the shortest possible UTF-8 encoding for the
248/// value. No other validation is performed.
249pub fn decode_rune_in_string(s: &str) -> (char, usize) {
250 decode_rune(s.as_bytes())
251 // let n = len(s);
252 // if n < 1 {
253 // return RUNE_ERROR, 0
254 // }
255 // s0 := s[0]
256 // x := first[s0]
257 // if x >= as {
258 // // The following code simulates an additional check for x == XX and
259 // // handling the ASCII and invalid cases accordingly. This mask-and-or
260 // // approach prevents an additional branch.
261 // mask := rune(x) << 31 >> 31 // Create 0x0000 or 0xFFFF.
262 // return rune(s[0])&^mask | RUNE_ERROR&mask, 1
263 // }
264 // sz := int(x & 7)
265 // accept := acceptRanges[x>>4]
266 // if n < sz {
267 // return RUNE_ERROR, 1
268 // }
269 // s1 := s[1]
270 // if s1 < accept.lo || accept.hi < s1 {
271 // return RUNE_ERROR, 1
272 // }
273 // if sz <= 2 { // <= instead of == to help the compiler eliminate some bounds checks
274 // return rune(s0&mask2)<<6 | rune(s1&maskx), 2
275 // }
276 // s2 := s[2]
277 // if s2 < locb || hicb < s2 {
278 // return RUNE_ERROR, 1
279 // }
280 // if sz <= 3 {
281 // return rune(s0&mask3)<<12 | rune(s1&maskx)<<6 | rune(s2&maskx), 3
282 // }
283 // s3 := s[3]
284 // if s3 < locb || hicb < s3 {
285 // return RUNE_ERROR, 1
286 // }
287 // return rune(s0&mask4)<<18 | rune(s1&maskx)<<12 | rune(s2&maskx)<<6 | rune(s3&maskx), 4
288}
289
290// // DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and
291// // its width in bytes. If p is empty it returns (RUNE_ERROR, 0). Otherwise, if
292// // the encoding is invalid, it returns (RUNE_ERROR, 1). Both are impossible
293// // results for correct, non-empty UTF-8.
294// //
295// // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
296// // out of range, or is not the shortest possible UTF-8 encoding for the
297// // value. No other validation is performed.
298// fn DecodeLastRune(p: &[u8]) (r rune, size int) {
299// end := p.len()
300// if end == 0 {
301// return RUNE_ERROR, 0
302// }
303// start := end - 1
304// r = rune(p[start])
305// if r < RUNE_SELF {
306// return r, 1
307// }
308// // guard against O(n^2) behavior when traversing
309// // backwards through strings with long sequences of
310// // invalid UTF-8.
311// lim := end - UTFMAX
312// if lim < 0 {
313// lim = 0
314// }
315// for start--; start >= lim; start-- {
316// if RuneStart(p[start]) {
317// break
318// }
319// }
320// if start < 0 {
321// start = 0
322// }
323// r, size = decode_rune(p[start:end])
324// if start+size != end {
325// return RUNE_ERROR, 1
326// }
327// return r, size
328// }
329
330// // DecodeLastRuneInString is like DecodeLastRune but its input is a string. If
331// // s is empty it returns (RUNE_ERROR, 0). Otherwise, if the encoding is invalid,
332// // it returns (RUNE_ERROR, 1). Both are impossible results for correct,
333// // non-empty UTF-8.
334// //
335// // An encoding is invalid if it is incorrect UTF-8, encodes a rune that is
336// // out of range, or is not the shortest possible UTF-8 encoding for the
337// // value. No other validation is performed.
338// fn DecodeLastRuneInString(s string) (r rune, size int) {
339// end := len(s)
340// if end == 0 {
341// return RUNE_ERROR, 0
342// }
343// start := end - 1
344// r = rune(s[start])
345// if r < RUNE_SELF {
346// return r, 1
347// }
348// // guard against O(n^2) behavior when traversing
349// // backwards through strings with long sequences of
350// // invalid UTF-8.
351// lim := end - UTFMAX
352// if lim < 0 {
353// lim = 0
354// }
355// for start--; start >= lim; start-- {
356// if RuneStart(s[start]) {
357// break
358// }
359// }
360// if start < 0 {
361// start = 0
362// }
363// r, size = decode_rune_in_string(s[start:end])
364// if start+size != end {
365// return RUNE_ERROR, 1
366// }
367// return r, size
368// }
369
370/// rune_len returns the number of bytes required to encode the rune.
371/// It returns -1 if the rune is not a valid value to encode in UTF-8.
372pub fn rune_len(r: u32) -> isize {
373 if r <= RUNE1_MAX {
374 return 1;
375 } else if r <= RUNE2_MAX {
376 return 2;
377 } else if (SURROGATE_MIN..=SURROGATE_MAX).contains(&r) {
378 return -1;
379 } else if r <= RUNE3_MAX {
380 return 3;
381 } else if r <= MAX_RUNE as u32 {
382 return 4;
383 }
384 -1
385}
386
387/// encode_rune writes into p (which must be large enough) the UTF-8 encoding of the rune.
388/// If the rune is out of range, it writes the encoding of RUNE_ERROR.
389/// It returns the number of bytes written.
390pub fn encode_rune(p: &mut [u8], r: u32) -> usize {
391 let mut r = r;
392 if r <= RUNE1_MAX {
393 p[0] = r as u8;
394 1
395 } else if r <= RUNE2_MAX {
396 // _ = p[1] // eliminate bounds checks
397 p[0] = T2 | (r >> 6) as u8;
398 p[1] = TX | (r as u8) & MASKX;
399 return 2;
400 } else {
401 if (r > (MAX_RUNE as u32)) || (SURROGATE_MIN..=SURROGATE_MAX).contains(&r) {
402 r = RUNE_ERROR as u32;
403 }
404 if r <= RUNE3_MAX {
405 // _ = p[2] // eliminate bounds checks
406 p[0] = T3 | (r >> 12) as u8;
407 p[1] = TX | (r >> 6) as u8 & MASKX;
408 p[2] = TX | (r as u8) & MASKX;
409 return 3;
410 } else {
411 // _ = p[3] // eliminate bounds checks
412 p[0] = T4 | (r >> 18) as u8;
413 p[1] = TX | (r >> 12) as u8 & MASKX;
414 p[2] = TX | (r >> 6) as u8 & MASKX;
415 p[3] = TX | (r as u8) & MASKX;
416 return 4;
417 }
418 }
419}
420
421// // AppendRune appends the UTF-8 encoding of r to the end of p and
422// // returns the extended buffer. If the rune is out of range,
423// // it appends the encoding of RUNE_ERROR.
424// fn AppendRune(p []byte, r rune) []byte {
425// // This function is inlineable for fast handling of ASCII.
426// if uint32(r) <= RUNE1_MAX {
427// return append(p, byte(r))
428// }
429// return appendRuneNonASCII(p, r)
430// }
431
432// fn appendRuneNonASCII(p []byte, r rune) []byte {
433// // Negative values are erroneous. Making it unsigned addresses the problem.
434// switch i := uint32(r); {
435// case i <= RUNE2_MAX:
436// return append(p, t2|byte(r>>6), tx|byte(r)&maskx)
437// case i > MAX_RUNE, SURROGATE_MIN <= i && i <= SURROGATE_MAX:
438// r = RUNE_ERROR
439// fallthrough
440// case i <= rune3Max:
441// return append(p, t3|byte(r>>12), tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
442// default:
443// return append(p, t4|byte(r>>18), tx|byte(r>>12)&maskx, tx|byte(r>>6)&maskx, tx|byte(r)&maskx)
444// }
445// }
446
447/// rune_count returns the number of runes in p. Erroneous and short
448/// encodings are treated as single runes of width 1 byte.
449pub fn rune_count(p: &[u8]) -> usize {
450 let np = p.len();
451 let mut n = 0;
452 let mut i = 0;
453 while i < np {
454 n += 1;
455 let c = p[i];
456 if (c as u32) < (RUNE_SELF as u32) {
457 // ASCII fast path
458 i += 1;
459 continue;
460 }
461 let x = FIRST[c as usize];
462 if x == XX {
463 i += 1; // invalid.
464 continue;
465 }
466 let mut size = (x & 7) as usize;
467 if i + size > np {
468 i += 1; // Short or invalid.
469 continue;
470 }
471 let accept = &ACCEPT_RANGES[(x >> 4) as usize];
472 let c = p[i + 1];
473 if c < accept.lo || accept.hi < c {
474 size = 1
475 } else if size == 2 {
476 } else {
477 let c = p[i + 2];
478 if !(LOCB..=HICB).contains(&c) {
479 size = 1
480 } else if size == 3 {
481 } else {
482 let c = p[i + 3];
483 if !(LOCB..=HICB).contains(&c) {
484 size = 1;
485 }
486 }
487 }
488 i += size;
489 }
490 n
491}
492
493// // RuneCountInString is like rune_count but its input is a string.
494// fn RuneCountInString(s string) (n int) {
495// ns := len(s)
496// for i := 0; i < ns; n++ {
497// c := s[i]
498// if c < RUNE_SELF {
499// // ASCII fast path
500// i++
501// continue
502// }
503// x := FIRST[c]
504// if x == XX {
505// i++ // invalid.
506// continue
507// }
508// size := int(x & 7)
509// if i+size > ns {
510// i++ // Short or invalid.
511// continue
512// }
513// accept := acceptRanges[x>>4]
514// if c := s[i+1]; c < accept.lo || accept.hi < c {
515// size = 1
516// } else if size == 2 {
517// } else if c := s[i+2]; c < locb || hicb < c {
518// size = 1
519// } else if size == 3 {
520// } else if c := s[i+3]; c < locb || hicb < c {
521// size = 1
522// }
523// i += size
524// }
525// return n
526// }
527
528// // RuneStart reports whether the byte could be the first byte of an encoded,
529// // possibly invalid rune. Second and subsequent bytes always have the top two
530// // bits set to 10.
531// fn RuneStart(b byte) -> bool { return b&0xC0 != 0x80 }
532
533// // Valid reports whether p consists entirely of valid UTF-8-encoded runes.
534// fn Valid(p: &[u8]) -> bool {
535// // This optimization avoids the need to recompute the capacity
536// // when generating code for p[8:], bringing it to parity with
537// // ValidString, which was 20% faster on long ASCII strings.
538// p = p[:p.len():p.len()]
539
540// // Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
541// for p.len() >= 8 {
542// // Combining two 32 bit loads allows the same code to be used
543// // for 32 and 64 bit platforms.
544// // The compiler can generate a 32bit load for first32 and second32
545// // on many platforms. See test/codegen/memcombine.go.
546// first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
547// second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
548// if (first32|second32)&0x80808080 != 0 {
549// // Found a non ASCII byte (>= RUNE_SELF).
550// break
551// }
552// p = p[8:]
553// }
554// let n = p.len();
555// for i := 0; i < n; {
556// pi := p[i]
557// if pi < RUNE_SELF {
558// i++
559// continue
560// }
561// x := first[pi]
562// if x == XX {
563// return false // Illegal starter byte.
564// }
565// size := int(x & 7)
566// if i+size > n {
567// return false // Short or invalid.
568// }
569// accept := acceptRanges[x>>4]
570// if c := p[i+1]; c < accept.lo || accept.hi < c {
571// return false
572// } else if size == 2 {
573// } else if c := p[i+2]; c < locb || hicb < c {
574// return false
575// } else if size == 3 {
576// } else if c := p[i+3]; c < locb || hicb < c {
577// return false
578// }
579// i += size
580// }
581// return true
582// }
583
584// // ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
585// fn ValidString(s string) -> bool {
586// // Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
587// for len(s) >= 8 {
588// // Combining two 32 bit loads allows the same code to be used
589// // for 32 and 64 bit platforms.
590// // The compiler can generate a 32bit load for first32 and second32
591// // on many platforms. See test/codegen/memcombine.go.
592// first32 := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
593// second32 := uint32(s[4]) | uint32(s[5])<<8 | uint32(s[6])<<16 | uint32(s[7])<<24
594// if (first32|second32)&0x80808080 != 0 {
595// // Found a non ASCII byte (>= RUNE_SELF).
596// break
597// }
598// s = s[8:]
599// }
600// let n = len(s);
601// for i := 0; i < n; {
602// si := s[i]
603// if si < RUNE_SELF {
604// i++
605// continue
606// }
607// x := first[si]
608// if x == XX {
609// return false // Illegal starter byte.
610// }
611// size := int(x & 7)
612// if i+size > n {
613// return false // Short or invalid.
614// }
615// accept := acceptRanges[x>>4]
616// if c := s[i+1]; c < accept.lo || accept.hi < c {
617// return false
618// } else if size == 2 {
619// } else if c := s[i+2]; c < locb || hicb < c {
620// return false
621// } else if size == 3 {
622// } else if c := s[i+3]; c < locb || hicb < c {
623// return false
624// }
625// i += size
626// }
627// return true
628// }
629
630/// ValidRune reports whether r can be legally encoded as UTF-8.
631/// Code points that are out of range or a surrogate half are illegal.
632pub fn valid_rune(r: u32) -> bool {
633 (r < SURROGATE_MIN) || (SURROGATE_MAX < r && r <= (MAX_RUNE as u32))
634}