static_automata/lib.rs
1//! This framework helps you define validation functions based on deterministic
2//! finite automata (DFAs) generated from ABNF grammars.
3//!
4//! It works as follows:
5//! - Declare a `mod` item annotated with the `#[grammar]` macro specifying an
6//! ABNF grammar (either from a file, or in the doc comments).
7//! - Use the `cargo build-automata` command line interface to generate the
8//! declared module file, containing the grammar compiled into a deterministic
9//! finite automaton. Alternatively you can call the [`build-automata`]
10//! library directly from a `build.rs` script.
11//! - Use the validation functions provided in the generated module directly,
12//! or use the `Validate` derive macro to bind them to custom types.
13//!
14//! [`build-automata`]: https://crates.io/crates/build-automata
15//!
16//! ## Example
17//!
18//! ```ignore
19//! /// Automata module.
20//! ///
21//! /// This module file is generated by the command line interface or builder
22//! /// library. It contains an `Iri` type definition for the `IRI` production
23//! /// of the `iri.abnf` that we exported here, with a `validate_bytes` and a
24//! /// `validate_str` const function.
25//! ///
26//! /// The attribute macro itself doesn't generate anything, but replaces this
27//! /// item with an external module import `mod automata;`.
28//! use static_automata::{grammar, Validate};
29//!
30//! #[grammar(file = "iri.abnf", export("IRI"))]
31//! mod automata {}
32//!
33//! /// Derive the `validate_bytes` and `validate_str` methods from the
34//! /// `automata::Iri` automaton.
35//! #[derive(Validate)]
36//! #[automaton(automata::Iri)]
37//! struct Foo;
38//!
39//! fn main() {
40//! // Byte string validation.
41//! assert!(Foo::validate_bytes(b"https://example.com").is_ok());
42//!
43//! // String validation.
44//! assert!(Foo::validate_str("https://example.com").is_ok());
45//!
46//! // The validation functions are `const` compatible.
47//! const _: () = {
48//! assert!(Foo::validate_bytes(b"https://example.com").is_ok());
49//! assert!(Foo::validate_str("https://example.com").is_ok());
50//! };
51//! }
52//! ```
53//!
54//! ## Why not compile the grammars through the attribute macro?
55//!
56//! Compiling a grammar requires determinizing a potentially large automaton,
57//! which is computationally very expensive.
58//!
59//! ## Command line interface
60//!
61//! You can install it with `cargo install cargo-build-automata` then use it
62//! when you need to re-generate the automata (e.g. when the associated grammar
63//! changes):
64//!
65//! ```bash
66//! cargo build-automata
67//! ```
68//!
69//! Be careful, this will overwrite the content of the modules annotated with the
70//! `#[grammar]` attribute macro. If you're not sure which files will be
71//! overwritten, you can run the CLI with the `-d`/`--dry-run` flag:
72//!
73//! ```bash
74//! cargo build-automata -d
75//! ```
76//!
77//! It will compile the grammars, but not write anything.
78//!
79//! ## Build script
80//!
81//! The advantage of the command line interface is that it allows you to ship
82//! the automata already compiled with your library/application. However you
83//! might prefer to compile the automata on the user machine, using a `build.rs`
84//! script. To do that you can use the [`build-automata`] library (the cli is
85//! basically a wrapper around this library).
86//!
87//! ```ignore
88//! use build_automata::build_automata;
89//!
90//! fn main() {
91//! build_automata();
92//! }
93//! ```
94use core::fmt;
95pub use static_automata_macros::{Validate, grammar};
96
/// UTF-8 decoding error.
///
/// This is a unit type: it only signals that decoding failed and carries no
/// information about the position or the cause of the failure.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Utf8Error;

impl fmt::Display for Utf8Error {
    /// Writes the fixed message `Invalid UTF-8`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Only `core::fmt` items are used here, matching the file's
        // `use core::fmt` import and the `core::error::Error` impl below.
        f.write_str("Invalid UTF-8")
    }
}

impl core::error::Error for Utf8Error {}
108
109/// Decode an UTF-8 encoded `char` from the given bytes at offset `i`.
110///
111/// This function is used by the generated `validate_bytes` functions.
112pub const fn decode_utf8_char(bytes: &[u8], i: usize) -> Result<(char, usize), Utf8Error> {
113 if bytes.len() <= i {
114 return Err(Utf8Error);
115 }
116
117 let a = bytes[i];
118 let (code_point, len) = if a & 0x80 == 0 {
119 // 1 byte
120 (a as u32 & 0x7f, 1)
121 } else if a & 0xe0 == 0xc0 {
122 // 2 bytes.
123 if bytes.len() <= i + 1 {
124 return Err(Utf8Error);
125 }
126
127 let Ok(b) = read_extended(bytes, i + 1) else {
128 return Err(Utf8Error);
129 };
130
131 let code_point = (a as u32 & 0x1f) << 6 | b as u32;
132
133 (code_point, 2)
134 } else if a & 0xf0 == 0xe0 {
135 // 3 bytes.
136 if bytes.len() <= i + 2 {
137 return Err(Utf8Error);
138 }
139
140 let Ok(b) = read_extended(bytes, i + 1) else {
141 return Err(Utf8Error);
142 };
143
144 let Ok(c) = read_extended(bytes, i + 2) else {
145 return Err(Utf8Error);
146 };
147
148 let code_point = (a as u32 & 0x0f) << 12 | (b as u32) << 6 | c as u32;
149
150 (code_point, 3)
151 } else if a & 0xf8 == 0xf0 {
152 // 4 bytes.
153 if bytes.len() <= i + 3 {
154 return Err(Utf8Error);
155 }
156
157 let Ok(b) = read_extended(bytes, i + 1) else {
158 return Err(Utf8Error);
159 };
160
161 let Ok(c) = read_extended(bytes, i + 2) else {
162 return Err(Utf8Error);
163 };
164
165 let Ok(d) = read_extended(bytes, i + 3) else {
166 return Err(Utf8Error);
167 };
168
169 let code_point = (a as u32 & 0x07) << 18 | (b as u32) << 12 | (c as u32) << 6 | d as u32;
170
171 (code_point, 4)
172 } else {
173 return Err(Utf8Error);
174 };
175
176 // Surrogate code point.
177 if code_point >= 0xD800 && code_point <= 0xDFFF {
178 return Err(Utf8Error);
179 }
180
181 // Overflow.
182 if code_point > 0x10FFFF {
183 return Err(Utf8Error);
184 }
185
186 Ok((unsafe { char::from_u32_unchecked(code_point) }, len))
187}
188
189const fn read_extended(bytes: &[u8], i: usize) -> Result<u8, Utf8Error> {
190 let b = bytes[i];
191
192 if b & 0xc0 != 0x80 {
193 return Err(Utf8Error);
194 }
195
196 Ok(b & 0x3f)
197}
198
#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes `bytes` one `char` at a time using `decode_utf8_char`,
    /// panicking on the first decoding error.
    fn decode_utf8(bytes: &[u8]) -> String {
        let mut decoded = String::new();
        let mut offset = 0;

        while offset < bytes.len() {
            let (c, width) = decode_utf8_char(bytes, offset).unwrap();
            decoded.push(c);
            offset += width;
        }

        decoded
    }

    /// Well-formed strings that must survive a decoding round trip.
    const VALID: &[&str] = &[
        // Basic Latin.
        "Hello, World!",
        // Greek and Coptic.
        "Γειά σου Κόσμε",
        // Cyrillic.
        "Привет, мир!",
        // Arabic.
        "مرحبا بالعالم",
        // Devanagari.
        "नमस्ते दुनिया",
        // Hebrew.
        "שלום עולם",
        // Thai.
        "สวัสดีโลก",
        // Hiragana (Japanese).
        "こんにちは",
        // Katakana (Japanese).
        "コンニチハ",
        // CJK Unified Ideographs (Chinese).
        "你好,世界",
        // Mathematical Operators.
        "∀x ∈ ℝ: ∃y ≥ 0",
        // Emoji.
        "😀🚀✨",
        // Boundaries.
        "\0\u{10FFFF}",
    ];

    /// Every valid sample decodes back to itself.
    #[test]
    fn valid() {
        for &expected in VALID {
            assert_eq!(expected, decode_utf8(expected.as_bytes()));
        }
    }

    /// Byte sequences that encode values which are not Unicode scalar values.
    const INVALID: &[&[u8]] = &[
        // 0x110000: above the last code point.
        &[0b1111_0100, 0b10_010000, 0b10_000000, 0b10_000000],
        // 0xD800: first surrogate.
        &[0b1110_1101, 0b10_100000, 0b10_000000],
        // 0xDFFF: last surrogate.
        &[0b1110_1101, 0b10_111111, 0b10_111111],
    ];

    /// Every invalid sample is rejected when decoded from its first byte.
    #[test]
    fn invalid() {
        for &string in INVALID {
            assert!(decode_utf8_char(string, 0).is_err());
        }
    }
}