// simdutf8-cli 0.1.6

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: 2025,2026 ndaal Gesellschaft für Sicherheit in der Informationstechnik mbH & Co KG, Cologne
// SPDX-FileCopyrightText: Author: Pierre Gronau <Pierre.Gronau@ndaal.eu>

//! UTF-8 validation built on top of [`simdutf8`].
//!
//! Two flavours are exposed, mirroring the upstream crate:
//!
//! * [`is_valid`] uses `simdutf8::basic` for the fastest possible yes/no answer.
//! * [`validate`] uses `simdutf8::compat` to additionally report *where*
//!   validation failed, matching the semantics of [`std::str::from_utf8`].

/// Verdict produced by checking a byte slice for UTF-8 well-formedness.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Validity {
    /// Every byte of the input belongs to well-formed UTF-8.
    Valid,
    /// The input contains a sequence that is not well-formed UTF-8.
    Invalid {
        /// Count of bytes at the start of the input that were valid UTF-8
        /// before the error was encountered.
        valid_up_to: usize,
        /// Byte length of the offending sequence, or `None` when the input
        /// stopped in the middle of a multi-byte sequence that was valid so far.
        error_len: Option<usize>,
    },
}

impl Validity {
    /// Tells whether this verdict is [`Validity::Valid`].
    #[must_use]
    pub const fn is_valid(&self) -> bool {
        // Both variants are spelled out so that adding a variant later
        // forces this predicate to be revisited.
        match self {
            Self::Valid => true,
            Self::Invalid { .. } => false,
        }
    }
}

/// Reports whether `bytes` is accepted as UTF-8 by `simdutf8::basic::from_utf8`.
///
/// Only a boolean comes back. The `basic` flavour has a zero-sized error type
/// and is the fastest option, so choose this function whenever the position
/// of an error is irrelevant.
#[must_use]
pub fn is_valid(bytes: &[u8]) -> bool {
    // The decoded `&str` is not needed, only whether decoding succeeded.
    let decoded = simdutf8::basic::from_utf8(bytes);
    decoded.is_ok()
}

/// Checks `bytes` for UTF-8 well-formedness and, on failure, says where it broke.
///
/// Delegates to `simdutf8::compat::from_utf8`, whose [`std`]-compatible error
/// exposes `valid_up_to()` and `error_len()`; both values are copied into
/// [`Validity::Invalid`].
#[must_use]
pub fn validate(bytes: &[u8]) -> Validity {
    // The decoded `&str` is discarded; only a possible error is of interest.
    simdutf8::compat::from_utf8(bytes)
        .err()
        .map_or(Validity::Valid, |failure| Validity::Invalid {
            valid_up_to: failure.valid_up_to(),
            error_len: failure.error_len(),
        })
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the `Invalid` verdict a test expects to receive.
    fn invalid(valid_up_to: usize, error_len: Option<usize>) -> Validity {
        Validity::Invalid {
            valid_up_to,
            error_len,
        }
    }

    #[test]
    fn empty_input_is_valid() {
        let input: &[u8] = b"";
        assert!(is_valid(input));
        assert_eq!(validate(input), Validity::Valid);
    }

    #[test]
    fn ascii_is_valid() {
        let input: &[u8] = b"The quick brown fox";
        assert!(is_valid(input));
        assert_eq!(validate(input), Validity::Valid);
    }

    #[test]
    fn multilingual_utf8_is_valid() {
        // Mixes 2-, 3- and 4-byte encoded characters.
        let sample = "Grüße — 日本語 — 😊 — Здравствуйте".as_bytes();
        assert!(is_valid(sample));
        assert_eq!(validate(sample), Validity::Valid);
    }

    #[test]
    fn lone_continuation_byte_is_invalid() {
        // 0x80 can only follow a leading byte; here it stands alone.
        let input: &[u8] = b"\x80";
        assert!(!is_valid(input));
        assert_eq!(validate(input), invalid(0, Some(1)));
    }

    #[test]
    fn lone_ff_byte_is_invalid() {
        // The error sits after one valid ASCII byte.
        let input: &[u8] = b"a\xFFb";
        assert_eq!(validate(input), invalid(1, Some(1)));
    }

    #[test]
    fn truncated_multibyte_reports_no_error_len() {
        // "abc" followed by the first byte of a 4-byte sequence: incomplete.
        let input: &[u8] = b"abc\xF0";
        assert_eq!(validate(input), invalid(3, None));
    }

    #[test]
    fn utf16le_bytes_are_not_valid_utf8() {
        // "Hi" encoded as UTF-16LE with BOM: FF FE 48 00 69 00
        let utf16: &[u8] = b"\xFF\xFE\x48\x00\x69\x00";
        assert!(!is_valid(utf16));
        assert!(!validate(utf16).is_valid());
    }
}