// simdutf8-cli 0.2.7
//
// SIMD-accelerated UTF-8 validation CLI built on the simdutf8 crate, with hardened path handling.
// Documentation
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: 2025,2026 ndaal Gesellschaft für Sicherheit in der Informationstechnik mbH & Co KG, Cologne
// SPDX-FileCopyrightText: Author: Pierre Gronau <Pierre.Gronau@ndaal.eu>

//! Generate a set of example files in a variety of text encodings.
//!
//! Only the UTF-8 files are well-formed UTF-8; the UTF-16/UTF-32, Latin-1 and
//! deliberately-corrupt files are *not* and are useful for demonstrating that
//! `simdutf8-cli` correctly rejects non-UTF-8 input.
//!
//! Usage:
//!
//! ```text
//! cargo run --example generate_fixtures            # writes ./tests/fixtures
//! cargo run --example generate_fixtures -- /tmp/x  # writes /tmp/x
//! ```
//!
//! The files this produces are also committed under `tests/fixtures/` and are
//! checked by the integration tests, so regenerating them must stay byte-stable.

#![forbid(unsafe_code)]

use std::fs::File;
use std::io::{self, Write};
use std::path::{Path, PathBuf};

/// The three-byte UTF-8 encoding of U+FEFF, used as a byte-order mark.
///
/// The sequence is itself well-formed UTF-8, so a file that starts with it
/// still validates as UTF-8.
const UTF8_BOM: [u8; 3] = *b"\xEF\xBB\xBF";

/// Write every example fixture into the output directory.
///
/// The output directory is the first command-line argument when one is given
/// and `tests/fixtures` otherwise; it is created if it does not exist. One log
/// line per file, followed by a summary line, is written to stdout.
///
/// # Errors
///
/// Returns the first I/O error raised while creating the directory, writing a
/// fixture, or writing to stdout.
fn main() -> io::Result<()> {
    let out_dir = match std::env::args_os().nth(1) {
        Some(arg) => PathBuf::from(arg),
        None => PathBuf::from("tests/fixtures"),
    };

    // This dev-only example legitimately creates its output directory; the
    // disallowed-methods policy targets first-party library/binary code.
    #[allow(clippy::disallowed_methods)]
    std::fs::create_dir_all(&out_dir)?;

    let console = io::stdout();
    let mut log = console.lock();

    // Payloads that have to be assembled at run time. They are bound to
    // locals so the fixture table below can borrow them as plain slices.
    let utf8_with_bom: Vec<u8> = UTF8_BOM
        .iter()
        .copied()
        .chain("UTF-8 with BOM: grüße 😊\n".bytes())
        .collect();
    let utf16_text = "UTF-16 sample: grüße 😊\n";
    let utf16_le = utf16_bytes(utf16_text, Endian::Little);
    let utf16_be = utf16_bytes(utf16_text, Endian::Big);
    let utf32_le = utf32le_bytes("UTF-32 😊\n");

    // (file name, contents) pairs, written in exactly this order.
    let fixtures: [(&str, &[u8]); 9] = [
        // Well-formed UTF-8: plain ASCII, multilingual text, and text behind a BOM.
        (
            "ascii.txt",
            b"The quick brown fox jumps over the lazy dog.\n",
        ),
        (
            "utf8_multilingual.txt",
            "Grüße — 日本語 — 😊 — Здравствуйте — 한국어\n".as_bytes(),
        ),
        ("utf8_bom.txt", &utf8_with_bom),
        // UTF-16 in both byte orders (not valid UTF-8).
        ("utf16le_bom.txt", &utf16_le),
        ("utf16be_bom.txt", &utf16_be),
        // Little-endian UTF-32 (not valid UTF-8).
        ("utf32le_bom.bin", &utf32_le),
        // Latin-1 / ISO-8859-1 "Café déjà vu\n": each accented letter is a
        // single byte above 0x7F with no continuation bytes after it.
        ("latin1.txt", b"Caf\xE9 d\xE9j\xE0 vu\n"),
        // Deliberately corrupt UTF-8: an ASCII prefix followed by only the
        // first byte of a 4-byte sequence...
        ("truncated_utf8.bin", b"abc\xF0"),
        // ...and a continuation byte with no leading byte before it.
        ("lone_continuation.bin", b"\x80"),
    ];

    for &(name, payload) in &fixtures {
        write_file(&out_dir, name, payload, &mut log)?;
    }

    writeln!(log, "Done. Fixtures written to {}", out_dir.display())
}

/// Byte order used when serialising 16-bit code units.
#[derive(Clone, Copy)]
enum Endian {
    /// Least-significant byte first.
    Little,
    /// Most-significant byte first.
    Big,
}

/// Serialise `text` as UTF-16, prefixed with a U+FEFF byte-order mark.
///
/// Every 16-bit code unit (the mark first, then the units produced by
/// `str::encode_utf16`) is emitted as two bytes in the order selected by
/// `endian`. An empty `text` therefore yields just the two-byte mark.
fn utf16_bytes(text: &str, endian: Endian) -> Vec<u8> {
    std::iter::once(0xFEFF_u16)
        .chain(text.encode_utf16())
        .flat_map(|unit| match endian {
            Endian::Little => unit.to_le_bytes(),
            Endian::Big => unit.to_be_bytes(),
        })
        .collect()
}

/// Serialise `text` as little-endian UTF-32, prefixed with a U+FEFF
/// byte-order mark.
///
/// The mark and then each `char` of `text` are written as four bytes,
/// least-significant byte first. An empty `text` yields just the four-byte
/// mark.
fn utf32le_bytes(text: &str) -> Vec<u8> {
    std::iter::once(0xFEFF_u32)
        .chain(text.chars().map(u32::from))
        .flat_map(|scalar| scalar.to_le_bytes())
        .collect()
}

/// Create (or truncate) the file `name` inside `dir`, fill it with `bytes`,
/// and append one line describing the write to `log`.
///
/// The log line has the form `wrote <name>  (<len> bytes)`, with the name
/// right-aligned in a 22-character column.
///
/// # Errors
///
/// Returns the I/O error from creating the file, writing to it, or writing
/// the log line. No log line is written when the file write fails.
fn write_file<W: Write>(dir: &Path, name: &str, bytes: &[u8], log: &mut W) -> io::Result<()> {
    File::create(dir.join(name)).and_then(|mut out| {
        out.write_all(bytes)?;
        out.flush()
    })?;
    writeln!(log, "wrote {name:>22}  ({} bytes)", bytes.len())
}