Skip to main content

simdutf8_cli/
validate.rs

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: 2025,2026 ndaal Gesellschaft für Sicherheit in der Informationstechnik mbH & Co KG, Cologne
// SPDX-FileCopyrightText: Author: Pierre Gronau <Pierre.Gronau@ndaal.eu>

//! UTF-8 validation built on top of [`simdutf8`].
//!
//! Two flavours are exposed, mirroring the upstream crate:
//!
//! * [`is_valid`] uses `simdutf8::basic` for the fastest possible yes/no answer.
//! * [`validate`] uses `simdutf8::compat` to additionally report *where*
//!   validation failed, matching the semantics of [`std::str::from_utf8`].
12
/// Verdict produced when a byte slice is checked for UTF-8 well-formedness.
///
/// The type is a small `Copy` value, so it can be passed around and compared
/// freely.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Validity {
    /// Every byte of the input belongs to well-formed UTF-8.
    Valid,
    /// The input contains a sequence that is not well-formed UTF-8.
    Invalid {
        /// Count of bytes, measured from the start of the input, that were
        /// valid UTF-8 before the error was encountered.
        valid_up_to: usize,
        /// Byte length of the offending sequence. `None` means the input
        /// stopped in the middle of a multi-byte sequence that was valid up
        /// to that point (i.e. it is incomplete rather than malformed).
        error_len: Option<usize>,
    },
}
27
28impl Validity {
29    /// Returns `true` if the verdict is [`Validity::Valid`].
30    #[must_use]
31    pub const fn is_valid(&self) -> bool {
32        matches!(self, Self::Valid)
33    }
34}
35
36/// Fast yes/no UTF-8 check using `simdutf8::basic::from_utf8`.
37///
38/// Prefer this when the location of any error is not needed: the `basic` flavour
39/// has a zero-sized error type and is the fastest option.
40#[must_use]
41pub fn is_valid(bytes: &[u8]) -> bool {
42    simdutf8::basic::from_utf8(bytes).is_ok()
43}
44
45/// Validate `bytes` as UTF-8, reporting the failure location on error.
46///
47/// Uses `simdutf8::compat::from_utf8`, whose [`std`]-compatible error type
48/// carries `valid_up_to()` and `error_len()`.
49#[must_use]
50pub fn validate(bytes: &[u8]) -> Validity {
51    match simdutf8::compat::from_utf8(bytes) {
52        Ok(_) => Validity::Valid,
53        Err(error) => Validity::Invalid {
54            valid_up_to: error.valid_up_to(),
55            error_len: error.error_len(),
56        },
57    }
58}
59
#[cfg(test)]
mod tests {
    use super::{is_valid, validate, Validity};

    /// Shorthand for building the expected [`Validity::Invalid`] verdict.
    fn invalid(valid_up_to: usize, error_len: Option<usize>) -> Validity {
        Validity::Invalid {
            valid_up_to,
            error_len,
        }
    }

    #[test]
    fn empty_input_is_valid() {
        let input = b"";
        assert!(is_valid(input));
        assert_eq!(validate(input), Validity::Valid);
    }

    #[test]
    fn ascii_is_valid() {
        let input = b"The quick brown fox";
        assert!(is_valid(input));
        assert_eq!(validate(input), Validity::Valid);
    }

    #[test]
    fn multilingual_utf8_is_valid() {
        // Mixes 2-, 3- and 4-byte encoded characters with plain ASCII.
        let sample = "Grüße — 日本語 — 😊 — Здравствуйте".as_bytes();
        assert!(is_valid(sample));
        assert_eq!(validate(sample), Validity::Valid);
    }

    #[test]
    fn lone_continuation_byte_is_invalid() {
        // 0x80 is a continuation byte that has no preceding lead byte.
        let input = b"\x80";
        assert!(!is_valid(input));
        assert_eq!(validate(input), invalid(0, Some(1)));
    }

    #[test]
    fn lone_ff_byte_is_invalid() {
        // The 0xFF byte sits after one valid ASCII byte.
        assert_eq!(validate(b"a\xFFb"), invalid(1, Some(1)));
    }

    #[test]
    fn truncated_multibyte_reports_no_error_len() {
        // "abc" followed by only the first byte of a 4-byte sequence: the
        // input is incomplete, so no error length is reported.
        assert_eq!(validate(b"abc\xF0"), invalid(3, None));
    }

    #[test]
    fn utf16le_bytes_are_not_valid_utf8() {
        // "Hi" encoded as UTF-16LE with BOM: FF FE 48 00 69 00
        let utf16 = b"\xFF\xFE\x48\x00\x69\x00";
        assert!(!is_valid(utf16));
        assert!(!validate(utf16).is_valid());
    }
}