Skip to main content

static_automata/
lib.rs

//! This framework helps you define validation functions based on deterministic
2//! finite automata (DFAs) generated from ABNF grammars.
3//!
4//! It works as follows:
5//! - Declare a `mod` item annotated with the `#[grammar]` macro specifying an
6//!   ABNF grammar (either from a file, or in the doc comments).
7//! - Use the `cargo build-automata` command line interface to generate the
8//!   declared module file, containing the grammar compiled into a deterministic
9//!   finite automaton. Alternatively you can call the [`build-automata`]
10//!   library directly from a `build.rs` script.
11//! - Use the validation functions provided in the generated module directly,
12//!   or use the `Validate` derive macro to bind them to custom types.
13//!
14//! [`build-automata`]: https://crates.io/crates/build-automata
15//!
16//! ## Example
17//!
18//! ```ignore
19//! /// Automata module.
20//! ///
21//! /// This module file is generated by the command line interface or builder
22//! /// library. It contains an `Iri` type definition for the `IRI` production
//! /// of the `iri.abnf` that we exported here, with a `validate_bytes` and a
//! /// `validate_str` const function.
25//! ///
26//! /// The attribute macro itself doesn't generate anything, but replaces this
27//! /// item with an external module import `mod automata;`.
28//! use static_automata::{grammar, Validate};
29//!
30//! #[grammar(file = "iri.abnf", export("IRI"))]
31//! mod automata {}
32//!
33//! /// Derive the `validate_bytes` and `validate_str` methods from the
34//! /// `automata::Iri` automaton.
35//! #[derive(Validate)]
36//! #[automaton(automata::Iri)]
37//! struct Foo;
38//!
39//! fn main() {
40//!     // Byte string validation.
41//!     assert!(Foo::validate_bytes(b"https://example.com").is_ok());
42//!
43//!     // String validation.
44//!     assert!(Foo::validate_str("https://example.com").is_ok());
45//!
46//!     // The validation functions are `const` compatible.
47//!     const _: () = {
48//!         assert!(Foo::validate_bytes(b"https://example.com").is_ok());
49//!         assert!(Foo::validate_str("https://example.com").is_ok());
50//!     };
51//! }
52//! ```
53//!
54//! ## Why not compile the grammars through the attribute macro?
55//!
56//! Compiling a grammar requires determinizing a potentially large automaton,
57//! which is computationally very expensive.
58//!
59//! ## Command line interface
60//!
61//! You can install it with `cargo install cargo-build-automata` then use it
//! when you need to re-generate the automata (e.g. when the associated grammar
63//! changes):
64//!
65//! ```bash
66//! cargo build-automata
67//! ```
68//!
//! Be careful, this will overwrite the content of the modules annotated with the
//! `#[grammar]` attribute macro. If you're not sure which files will be
//! overwritten, you can run the CLI with the `-d`/`--dry-run` flag:
72//!
73//! ```bash
74//! cargo build-automata -d
75//! ```
76//!
77//! It will compile the grammars, but not write anything.
78//!
79//! ## Build script
80//!
81//! The advantage of the command line interface is that it allows you to ship
82//! the automata already compiled with your library/application. However you
83//! might prefer to compile the automata on the user machine, using a `build.rs`
84//! script. To do that you can use the [`build-automata`] library (the cli is
85//! basically a wrapper around this library).
86//!
87//! ```ignore
88//! use build_automata::build_automata;
89//!
90//! fn main() {
91//!   build_automata();
92//! }
93//! ```
94use core::fmt;
95pub use static_automata_macros::{Validate, grammar};
96
/// UTF-8 decoding error.
///
/// Returned by [`decode_utf8_char`] when the bytes at the requested offset do
/// not form a valid UTF-8 encoded `char`. The error is a unit value: it
/// carries no information about where or why decoding failed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Utf8Error;

impl fmt::Display for Utf8Error {
	// Uses the `core::fmt` import (`fmt`) rather than `std::fmt`, so this impl
	// agrees with the `core` paths used by the rest of the file.
	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
		// Constant message: no formatting machinery needed.
		f.write_str("Invalid UTF-8")
	}
}

impl core::error::Error for Utf8Error {}
108
109/// Decode an UTF-8 encoded `char` from the given bytes at offset `i`.
110///
111/// This function is used by the generated `validate_bytes` functions.
112pub const fn decode_utf8_char(bytes: &[u8], i: usize) -> Result<(char, usize), Utf8Error> {
113	if bytes.len() <= i {
114		return Err(Utf8Error);
115	}
116
117	let a = bytes[i];
118	let (code_point, len) = if a & 0x80 == 0 {
119		// 1 byte
120		(a as u32 & 0x7f, 1)
121	} else if a & 0xe0 == 0xc0 {
122		// 2 bytes.
123		if bytes.len() <= i + 1 {
124			return Err(Utf8Error);
125		}
126
127		let Ok(b) = read_extended(bytes, i + 1) else {
128			return Err(Utf8Error);
129		};
130
131		let code_point = (a as u32 & 0x1f) << 6 | b as u32;
132
133		(code_point, 2)
134	} else if a & 0xf0 == 0xe0 {
135		// 3 bytes.
136		if bytes.len() <= i + 2 {
137			return Err(Utf8Error);
138		}
139
140		let Ok(b) = read_extended(bytes, i + 1) else {
141			return Err(Utf8Error);
142		};
143
144		let Ok(c) = read_extended(bytes, i + 2) else {
145			return Err(Utf8Error);
146		};
147
148		let code_point = (a as u32 & 0x0f) << 12 | (b as u32) << 6 | c as u32;
149
150		(code_point, 3)
151	} else if a & 0xf8 == 0xf0 {
152		// 4 bytes.
153		if bytes.len() <= i + 3 {
154			return Err(Utf8Error);
155		}
156
157		let Ok(b) = read_extended(bytes, i + 1) else {
158			return Err(Utf8Error);
159		};
160
161		let Ok(c) = read_extended(bytes, i + 2) else {
162			return Err(Utf8Error);
163		};
164
165		let Ok(d) = read_extended(bytes, i + 3) else {
166			return Err(Utf8Error);
167		};
168
169		let code_point = (a as u32 & 0x07) << 18 | (b as u32) << 12 | (c as u32) << 6 | d as u32;
170
171		(code_point, 4)
172	} else {
173		return Err(Utf8Error);
174	};
175
176	// Surrogate code point.
177	if code_point >= 0xD800 && code_point <= 0xDFFF {
178		return Err(Utf8Error);
179	}
180
181	// Overflow.
182	if code_point > 0x10FFFF {
183		return Err(Utf8Error);
184	}
185
186	Ok((unsafe { char::from_u32_unchecked(code_point) }, len))
187}
188
189const fn read_extended(bytes: &[u8], i: usize) -> Result<u8, Utf8Error> {
190	let b = bytes[i];
191
192	if b & 0xc0 != 0x80 {
193		return Err(Utf8Error);
194	}
195
196	Ok(b & 0x3f)
197}
198
#[cfg(test)]
mod tests {
	use super::*;

	/// Decode a whole byte string by calling `decode_utf8_char` repeatedly,
	/// advancing by the reported length and panicking on the first error.
	fn decode_utf8(bytes: &[u8]) -> String {
		let mut i = 0;

		let mut result = String::new();
		while i < bytes.len() {
			let (c, len) = decode_utf8_char(bytes, i).unwrap();
			i += len;
			result.push(c);
		}

		result
	}

	const VALID: [&str; 13] = [
		// Basic Latin.
		"Hello, World!",
		// Greek and Coptic.
		"Γειά σου Κόσμε",
		// Cyrillic.
		"Привет, мир!",
		// Arabic.
		"مرحبا بالعالم",
		// Devanagari.
		"नमस्ते दुनिया",
		// Hebrew.
		"שלום עולם",
		// Thai.
		"สวัสดีโลก",
		// Hiragana (Japanese).
		"こんにちは",
		// Katakana (Japanese).
		"コンニチハ",
		// CJK Unified Ideographs (Chinese).
		"你好,世界",
		// Mathematical Operators.
		"∀x ∈ ℝ: ∃y ≥ 0",
		// Emoji.
		"😀🚀✨",
		// Boundaries.
		"\0\u{10FFFF}",
	];

	#[test]
	fn valid() {
		for string in VALID {
			let decoded = decode_utf8(string.as_bytes());
			assert_eq!(string, decoded);
		}
	}

	/// At every `char` boundary, the decoder must return the same `char` as
	/// the standard library and report the byte length of its encoding.
	#[test]
	fn reported_length() {
		for string in VALID {
			let bytes = string.as_bytes();
			for (i, expected) in string.char_indices() {
				let (c, len) = decode_utf8_char(bytes, i).unwrap();
				assert_eq!(c, expected);
				assert_eq!(len, expected.len_utf8());
			}
		}
	}

	const INVALID: [&[u8]; 12] = [
		&[0b1111_0100, 0b10_010000, 0b10_000000, 0b10_000000], // 0x110000
		&[0b1110_1101, 0b10_100000, 0b10_000000],              // 0xD800
		&[0b1110_1101, 0b10_111111, 0b10_111111],              // 0xDFFF
		&[],                                                   // Empty input.
		&[0x80],                                               // Stray continuation byte.
		&[0xFF],                                               // Invalid leading byte.
		&[0xC3],                                               // Truncated 2-byte sequence.
		&[0xE2, 0x82],                                         // Truncated 3-byte sequence.
		&[0xF0, 0x9F, 0x98],                                   // Truncated 4-byte sequence.
		&[0xC3, 0x28],                                         // Bad continuation (2 bytes).
		&[0xE2, 0x28, 0xA1],                                   // Bad continuation (3 bytes).
		&[0xF0, 0x9F, 0x98, 0x28],                             // Bad continuation (4 bytes).
	];

	#[test]
	fn invalid() {
		for string in INVALID {
			assert!(decode_utf8_char(string, 0).is_err());
		}
	}

	/// An offset at or past the end of the input is an error, not a panic.
	#[test]
	fn offset_out_of_bounds() {
		assert!(decode_utf8_char(b"a", 1).is_err());
		assert!(decode_utf8_char(b"a", usize::MAX).is_err());
	}
}