static-automata 1.0.2

Derive macro for static regular grammar
Documentation
//! This framework helps you define validation functions based on deterministic
//! finite automata (DFAs) generated from ABNF grammars.
//!
//! It works as follows:
//! - Declare a `mod` item annotated with the `#[grammar]` macro specifying an
//!   ABNF grammar (either from a file, or in the doc comments).
//! - Use the `cargo build-automata` command line interface to generate the
//!   declared module file, containing the grammar compiled into a deterministic
//!   finite automaton. Alternatively you can call the [`build-automata`]
//!   library directly from a `build.rs` script.
//! - Use the validation functions provided in the generated module directly,
//!   or use the `Validate` derive macro to bind them to custom types.
//!
//! [`build-automata`]: https://crates.io/crates/build-automata
//!
//! ## Example
//!
//! ```ignore
//! /// Automata module.
//! ///
//! /// This module file is generated by the command line interface or builder
//! /// library. It contains an `Iri` type definition for the `IRI` production
//! /// of the `iri.abnf` grammar that we exported here, with a `validate_bytes`
//! /// and a `validate_str` const function.
//! ///
//! /// The attribute macro itself doesn't generate anything, but replaces this
//! /// item with an external module import `mod automata;`.
//! use static_automata::{grammar, Validate};
//!
//! #[grammar(file = "iri.abnf", export("IRI"))]
//! mod automata {}
//!
//! /// Derive the `validate_bytes` and `validate_str` methods from the
//! /// `automata::Iri` automaton.
//! #[derive(Validate)]
//! #[automaton(automata::Iri)]
//! struct Foo;
//!
//! fn main() {
//!     // Byte string validation.
//!     assert!(Foo::validate_bytes(b"https://example.com").is_ok());
//!
//!     // String validation.
//!     assert!(Foo::validate_str("https://example.com").is_ok());
//!
//!     // The validation functions are `const` compatible.
//!     const _: () = {
//!         assert!(Foo::validate_bytes(b"https://example.com").is_ok());
//!         assert!(Foo::validate_str("https://example.com").is_ok());
//!     };
//! }
//! ```
//!
//! ## Why not compile the grammars through the attribute macro?
//!
//! Compiling a grammar requires determinizing a potentially large automaton,
//! which is computationally very expensive.
//!
//! ## Command line interface
//!
//! You can install it with `cargo install cargo-build-automata` then use it
//! when you need to re-generate the automata (e.g. when the associated grammar
//! changes):
//!
//! ```bash
//! cargo build-automata
//! ```
//!
//! Be careful, this will overwrite the content of the modules annotated with
//! the `#[grammar]` attribute macro. If you're not sure which files will be
//! overwritten you can run the cli with the `-d`/`--dry-run` flag:
//!
//! ```bash
//! cargo build-automata -d
//! ```
//!
//! It will compile the grammars, but not write anything.
//!
//! ## Build script
//!
//! The advantage of the command line interface is that it allows you to ship
//! the automata already compiled with your library/application. However you
//! might prefer to compile the automata on the user machine, using a `build.rs`
//! script. To do that you can use the [`build-automata`] library (the cli is
//! basically a wrapper around this library).
//!
//! ```ignore
//! use build_automata::build_automata;
//!
//! fn main() {
//!   build_automata();
//! }
//! ```
use core::fmt;
pub use static_automata_macros::{Validate, grammar};

/// UTF-8 decoding error.
///
/// Returned by [`decode_utf8_char`] when no `char` can be decoded at the
/// requested offset. This is a unit type: it carries no detail about the
/// position or the cause of the failure.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Utf8Error;

impl fmt::Display for Utf8Error {
	/// Writes the fixed, human-readable description of the error.
	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
		f.write_str("Invalid UTF-8")
	}
}

// Makes `Utf8Error` usable as a standard error type. The body is empty, so
// only the trait's provided (default) methods are used.
impl core::error::Error for Utf8Error {}

/// Decode a UTF-8 encoded `char` from the given bytes at offset `i`.
///
/// This function is used by the generated `validate_bytes` functions.
///
/// On success, returns the decoded `char` together with the number of bytes
/// (1 to 4) its encoding occupies in `bytes`, starting at offset `i`.
///
/// # Errors
///
/// Returns [`Utf8Error`] if:
/// - `i` is out of bounds (this includes empty input),
/// - the byte at offset `i` is not a valid leading byte,
/// - the sequence is truncated or contains an invalid continuation byte,
/// - the sequence is an overlong encoding, i.e. the code point could have
///   been encoded with fewer bytes,
/// - the decoded value is a surrogate (`0xD800..=0xDFFF`) or is greater than
///   `0x10FFFF`.
pub const fn decode_utf8_char(bytes: &[u8], i: usize) -> Result<(char, usize), Utf8Error> {
	if bytes.len() <= i {
		return Err(Utf8Error);
	}

	// From here on `i < bytes.len()`, and a slice is never longer than
	// `isize::MAX`, so the `i + k` offsets below cannot overflow.
	let a = bytes[i];

	// `min` is the smallest code point that genuinely needs `len` bytes. Any
	// smaller decoded value means the sequence is an overlong encoding.
	let (code_point, len, min) = if a & 0x80 == 0 {
		// 1 byte.
		(a as u32 & 0x7f, 1, 0)
	} else if a & 0xe0 == 0xc0 {
		// 2 bytes.
		if bytes.len() <= i + 1 {
			return Err(Utf8Error);
		}

		let Ok(b) = read_extended(bytes, i + 1) else {
			return Err(Utf8Error);
		};

		let code_point = (a as u32 & 0x1f) << 6 | b as u32;

		(code_point, 2, 0x80)
	} else if a & 0xf0 == 0xe0 {
		// 3 bytes.
		if bytes.len() <= i + 2 {
			return Err(Utf8Error);
		}

		let Ok(b) = read_extended(bytes, i + 1) else {
			return Err(Utf8Error);
		};

		let Ok(c) = read_extended(bytes, i + 2) else {
			return Err(Utf8Error);
		};

		let code_point = (a as u32 & 0x0f) << 12 | (b as u32) << 6 | c as u32;

		(code_point, 3, 0x800)
	} else if a & 0xf8 == 0xf0 {
		// 4 bytes.
		if bytes.len() <= i + 3 {
			return Err(Utf8Error);
		}

		let Ok(b) = read_extended(bytes, i + 1) else {
			return Err(Utf8Error);
		};

		let Ok(c) = read_extended(bytes, i + 2) else {
			return Err(Utf8Error);
		};

		let Ok(d) = read_extended(bytes, i + 3) else {
			return Err(Utf8Error);
		};

		let code_point = (a as u32 & 0x07) << 18 | (b as u32) << 12 | (c as u32) << 6 | d as u32;

		(code_point, 4, 0x10000)
	} else {
		// Continuation byte in leading position, or an invalid leading byte.
		return Err(Utf8Error);
	};

	// Overlong encoding: UTF-8 only allows the shortest form.
	if code_point < min {
		return Err(Utf8Error);
	}

	// `char::from_u32` rejects surrogate code points (`0xD800..=0xDFFF`) and
	// values above `0x10FFFF`, so no `unsafe` conversion is needed.
	match char::from_u32(code_point) {
		Some(c) => Ok((c, len)),
		None => Err(Utf8Error),
	}
}

/// Reads the continuation byte located at offset `i` and returns its six
/// payload bits.
///
/// Fails with `Utf8Error` when the byte is not of the form `10xxxxxx`.
/// `i` must be in bounds: the byte is read by plain indexing.
const fn read_extended(bytes: &[u8], i: usize) -> Result<u8, Utf8Error> {
	match bytes[i] {
		// `0x80..=0xbf` is exactly the set of bytes whose two high bits are `10`.
		byte @ 0x80..=0xbf => Ok(byte & 0x3f),
		_ => Err(Utf8Error),
	}
}

#[cfg(test)]
mod tests {
	use super::*;

	/// Decodes `bytes` entirely, one `char` at a time, panicking on the first
	/// decoding error.
	fn decode_utf8(bytes: &[u8]) -> String {
		let mut i = 0;

		let mut result = String::new();
		while i < bytes.len() {
			let (c, len) = decode_utf8_char(bytes, i).unwrap();
			i += len;
			result.push(c);
		}

		result
	}

	const VALID: [&str; 13] = [
		// Basic Latin.
		"Hello, World!",
		// Greek and Coptic.
		"Γειά σου Κόσμε",
		// Cyrillic.
		"Привет, мир!",
		// Arabic.
		"مرحبا بالعالم",
		// Devanagari.
		"नमस्ते दुनिया",
		// Hebrew.
		"שלום עולם",
		// Thai.
		"สวัสดีโลก",
		// Hiragana (Japanese).
		"こんにちは",
		// Katakana (Japanese).
		"コンニチハ",
		// CJK Unified Ideographs (Chinese).
		"你好,世界",
		// Mathematical Operators.
		"∀x ∈ ℝ: ∃y ≥ 0",
		// Emoji.
		"😀🚀✨",
		// Boundaries.
		"\0\u{10FFFF}",
	];

	/// Every valid string must round-trip through the decoder unchanged.
	#[test]
	fn valid() {
		for string in VALID {
			let decoded = decode_utf8(string.as_bytes());
			assert_eq!(string, decoded);
		}
	}

	const INVALID: [&[u8]; 10] = [
		// Out-of-range scalar values.
		&[0b1111_0100, 0b10_010000, 0b10_000000, 0b10_000000], // 0x110000
		&[0b1110_1101, 0b10_100000, 0b10_000000],              // 0xD800
		&[0b1110_1101, 0b10_111111, 0b10_111111],              // 0xDFFF
		// Empty input.
		&[],
		// Lone continuation byte.
		&[0b10_000000],
		// Invalid leading byte.
		&[0b1111_1000, 0b10_000000, 0b10_000000, 0b10_000000, 0b10_000000],
		// Truncated 2, 3 and 4 bytes sequences.
		&[0b110_00011],
		&[0b1110_0010, 0b10_000010],
		&[0b1111_0000, 0b10_011111, 0b10_011000],
		// Leading byte followed by a non-continuation byte.
		&[0b110_00011, 0b0100_0001],
	];

	/// Every malformed sequence must be rejected when decoded from offset 0.
	#[test]
	fn invalid() {
		for string in INVALID {
			assert!(decode_utf8_char(string, 0).is_err());
		}
	}

	/// Decoding starts at the requested offset, and out-of-bounds offsets are
	/// reported as errors.
	#[test]
	fn offset() {
		let bytes = "a\u{e9}z".as_bytes();
		assert_eq!(decode_utf8_char(bytes, 0).unwrap(), ('a', 1));
		assert_eq!(decode_utf8_char(bytes, 1).unwrap(), ('\u{e9}', 2));
		assert_eq!(decode_utf8_char(bytes, 3).unwrap(), ('z', 1));

		// Offset 2 points in the middle of the 2 bytes sequence.
		assert!(decode_utf8_char(bytes, 2).is_err());
		// Offsets at or past the end of the input.
		assert!(decode_utf8_char(bytes, bytes.len()).is_err());
		assert!(decode_utf8_char(bytes, usize::MAX).is_err());
	}
}