simdutf8_cli/validate.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: 2025,2026 ndaal Gesellschaft für Sicherheit in der Informationstechnik mbH & Co KG, Cologne
3// SPDX-FileCopyrightText: Author: Pierre Gronau <Pierre.Gronau@ndaal.eu>
4
5//! UTF-8 validation built on top of [`simdutf8`].
6//!
7//! Two flavours are exposed, mirroring the upstream crate:
8//!
9//! * [`is_valid`] uses `simdutf8::basic` for the fastest possible yes/no answer.
10//! * [`validate`] uses `simdutf8::compat` to additionally report *where*
11//! validation failed, matching the semantics of [`std::str::from_utf8`].
12
/// Verdict produced when a byte slice is checked for UTF-8 well-formedness.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Validity {
    /// Every byte of the input belongs to well-formed UTF-8.
    Valid,
    /// The input contains a sequence that is not valid UTF-8.
    Invalid {
        /// How many bytes at the start of the input were valid UTF-8 before
        /// the error was encountered.
        valid_up_to: usize,
        /// Length in bytes of the offending sequence. `None` means the input
        /// stopped in the middle of a multi-byte sequence that was valid up
        /// to that point, i.e. it is incomplete rather than malformed.
        error_len: Option<usize>,
    },
}
27
28impl Validity {
29 /// Returns `true` if the verdict is [`Validity::Valid`].
30 #[must_use]
31 pub const fn is_valid(&self) -> bool {
32 matches!(self, Self::Valid)
33 }
34}
35
36/// Fast yes/no UTF-8 check using `simdutf8::basic::from_utf8`.
37///
38/// Prefer this when the location of any error is not needed: the `basic` flavour
39/// has a zero-sized error type and is the fastest option.
40#[must_use]
41pub fn is_valid(bytes: &[u8]) -> bool {
42 simdutf8::basic::from_utf8(bytes).is_ok()
43}
44
45/// Validate `bytes` as UTF-8, reporting the failure location on error.
46///
47/// Uses `simdutf8::compat::from_utf8`, whose [`std`]-compatible error type
48/// carries `valid_up_to()` and `error_len()`.
49#[must_use]
50pub fn validate(bytes: &[u8]) -> Validity {
51 match simdutf8::compat::from_utf8(bytes) {
52 Ok(_) => Validity::Valid,
53 Err(error) => Validity::Invalid {
54 valid_up_to: error.valid_up_to(),
55 error_len: error.error_len(),
56 },
57 }
58}
59
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the failure verdict a test expects to see.
    fn invalid_at(valid_up_to: usize, error_len: Option<usize>) -> Validity {
        Validity::Invalid {
            valid_up_to,
            error_len,
        }
    }

    #[test]
    fn empty_input_is_valid() {
        let nothing: &[u8] = b"";
        assert!(is_valid(nothing));
        assert_eq!(validate(nothing), Validity::Valid);
    }

    #[test]
    fn ascii_is_valid() {
        let text: &[u8] = b"The quick brown fox";
        assert!(is_valid(text));
        assert_eq!(validate(text), Validity::Valid);
    }

    #[test]
    fn multilingual_utf8_is_valid() {
        let text = "Grüße — 日本語 — 😊 — Здравствуйте";
        assert!(is_valid(text.as_bytes()));
        assert_eq!(validate(text.as_bytes()), Validity::Valid);
    }

    #[test]
    fn lone_continuation_byte_is_invalid() {
        // A continuation byte (0x80) that no leading byte introduced.
        let orphan: &[u8] = b"\x80";
        assert!(!is_valid(orphan));
        assert_eq!(validate(orphan), invalid_at(0, Some(1)));
    }

    #[test]
    fn lone_ff_byte_is_invalid() {
        let verdict = validate(b"a\xFFb");
        assert_eq!(verdict, invalid_at(1, Some(1)));
    }

    #[test]
    fn truncated_multibyte_reports_no_error_len() {
        // Three ASCII bytes, then only the opening byte of a 4-byte sequence:
        // the input is cut short rather than malformed.
        let verdict = validate(b"abc\xF0");
        assert_eq!(verdict, invalid_at(3, None));
    }

    #[test]
    fn utf16le_bytes_are_not_valid_utf8() {
        // UTF-16LE encoding of "Hi" preceded by a BOM: FF FE 48 00 69 00
        let encoded: &[u8] = b"\xFF\xFE\x48\x00\x69\x00";
        assert!(!is_valid(encoded));
        assert!(!validate(encoded).is_valid());
    }
}
126}