bset/lib.rs
1//! Fast and compact sets of bytes or ASCII characters,
2//! useful for searching, parsing and determining membership of a given byte
3//! in the given set.
4//! They don't use any allocation, nor even any `std` features.
5//! In fact, all of the provided functions are `const`, so they can be freely
6//! constructed at the compile time
7//!
8//! # Sets
9//! This crate exports two set types - `ByteSet` for a set of general bytes
10//! and `AsciiSet` for a set restricted to the range of ASCII characters.
11//! The is two times smaller in the memory, but comes with a slight
12//! performance trade-off in having to check whether the given character
13//! belongs to the ASCII range.
14//! ```
15//! use bset::AsciiSet;
16//!
17//! const OP: AsciiSet = AsciiSet::new().add_bytes(b"+-*/%&|^");
18//! assert!(OP.contains(b'%'));
19//! ```
20//! The sets are implemented as an array of pointer-sized bit masks.
21//!
22//! # Stack of sets
23//! Inspired by [this article](https://maciej.codes/2020-04-19-stacking-luts-in-logos.html)
24//! by Maciej Hirsz, this crate provides a way to stack multiple sets into one
25//! structure to gain even more performance.
26//! To not slow it down, sets are "obtained" from the given stack at the type
27//! level. For this, the module `bits` contains types `B0`, `B1`, ..., `B7`
28//! representing indices of a set in the stack.
29//! Because `const fn`s currently don't support generic functions, the sets
30//! are indexed by the order they were added to the stack.
31//! Type aliases can be used to identify the sets within the stack:
32//! ```
33//! use bset::{bits::*, ByteSet, ByteStack};
34//!
35//! const BYTE_STACK: ByteStack<B3> = ByteStack::new()
36//! .add_set(ByteSet::DIGITS)
37//! .add_set(ByteSet::ALPHABETIC)
38//! .add_set(ByteSet::new().add_bytes(b"+-*/%&|^"));
39//! type Digits = B0;
40//! type Alphabetic = B1;
41//! type Operations = B2;
42//! assert!(BYTE_STACK.contains::<Operations>(b'%'));
43//! ```
44//! Again, there are two versions, `ByteStack` for all bytes and `AsciiStack`
45//! restricted to the ASCII range. Benchmarks show that testing the set membership
46//! is about 20% faster with stacked sets. They come with 8 times larger
47//! memory size (128/256 bytes vs. 16/32), which does not increase with the stacks
48//! added, so when 8 sets (the maximum number) are used in one stack,
49//! the memory size is equivalent.
50#![no_std]
51#![warn(missing_docs)]
52mod bit;
53/// Types that denote the position of a byte set within a byte stack.
54pub mod bits {
55 pub use crate::bit::{B0, B1, B2, B3, B4, B5, B6, B7};
56}
57use bit::Bit;
58use bits::*;
59use core::marker::PhantomData;
60use core::ops::RangeInclusive;
61
62type Chunk = usize;
63/// Range of ASCII characters.
64pub const ASCII_RANGE_LEN: usize = 0x80;
65/// Size of one chunk of the mask in the implementation of byte sets.
66pub const CHUNK_SIZE: usize = core::mem::size_of::<Chunk>();
67/// Number of bytes in one chunk of the mask.
68pub const BITS_PER_CHUNK: usize = 8 * CHUNK_SIZE;
69/// Number of chunks in the ASCII set.
70pub const CHUNKS: usize = ASCII_RANGE_LEN / BITS_PER_CHUNK;
71
72/// A compact set of bytes.
73/// Only particular instances - `AsciiSet` and `ByteSet` can be constructed.
74#[derive(Clone, Copy, Debug, Eq, PartialEq)]
75pub struct AnyByteSet<const N: usize> {
76 mask: [Chunk; N],
77}
78
79/// A compact set of ASCII bytes. Spans only 16 bytes.
80pub type AsciiSet = AnyByteSet<CHUNKS>;
81/// A compact set of all bytes. Spans only 32 bytes.
82pub type ByteSet = AnyByteSet<{ 2 * CHUNKS }>;
83
84/// A compact stack of up to 8 byte sets for fast lookup.
85/// Only particular instances - `AsciiStack` and `ByteStack` can be constructed.
86#[derive(Clone, Copy, Debug, Eq, PartialEq)]
87pub struct AnyByteStack<B, const N: usize> {
88 masks: [u8; N],
89 current: PhantomData<B>,
90}
91
92/// A compact stack of up to 8 ASCII sets for fast lookup.
93pub type AsciiStack<B = ()> = AnyByteStack<B, ASCII_RANGE_LEN>;
94/// A compact stack of up to 8 full byte sets for fast lookup.
95pub type ByteStack<B = ()> = AnyByteStack<B, { 2 * ASCII_RANGE_LEN }>;
96
97impl AsciiSet {
98 /// Creates a new, empty, `AsciiSet`.
99 pub const fn new() -> Self {
100 Self { mask: [0; CHUNKS] }
101 }
102
103 /// Tests whether this set contains the `byte`.
104 #[inline]
105 pub const fn contains(&self, byte: u8) -> bool {
106 if byte >= ASCII_RANGE_LEN as u8 {
107 return false;
108 };
109 let chunk = self.mask[byte as usize / BITS_PER_CHUNK];
110 let mask = 1 << (byte as usize % BITS_PER_CHUNK);
111 (chunk & mask) != 0
112 }
113}
114
115impl ByteSet {
116 /// Creates a new, empty, `ByteSet`.
117 pub const fn new() -> Self {
118 Self {
119 mask: [0; 2 * CHUNKS],
120 }
121 }
122
123 /// Tests whether this set contains the `byte`.
124 #[inline]
125 pub const fn contains(&self, byte: u8) -> bool {
126 let chunk = self.mask[byte as usize / BITS_PER_CHUNK];
127 let mask = 1 << (byte as usize % BITS_PER_CHUNK);
128 (chunk & mask) != 0
129 }
130}
131
132impl<const N: usize> AnyByteSet<N> {
133 /// Lowercase letters (`a` - `z`)
134 pub const LOWERCASE: Self = Self::blank().add_range(b'a'..=b'z');
135 /// Uppercase letters (`A` - `Z`)
136 pub const UPPERCASE: Self = Self::blank().add_range(b'A'..=b'Z');
137 /// Numerical digits (`0` - `9`)
138 pub const DIGITS: Self = Self::blank().add_range(b'0'..=b'9');
139 /// Uppercase and lowercase letters
140 pub const ALPHABETIC: Self = Self::LOWERCASE.union(Self::UPPERCASE);
141 /// Uppercase and lowercase letters and digits
142 pub const ALPHANUMERIC: Self = Self::ALPHABETIC.union(Self::DIGITS);
143
144 /// Space and tab
145 pub const SPACE_TAB: Self = Self::blank().add_bytes(b" \t");
146 /// Line feed and carriage return
147 pub const NEWLINE: Self = Self::blank().add_bytes(b"\r\n");
148 /// Space, tab, line feed and carriage return
149 pub const WHITESPACE: Self = Self::SPACE_TAB.union(Self::NEWLINE);
150
151 /// ASCII graphic characters
152 pub const GRAPHIC: Self = Self::blank().add_range(b'!'..=b'~');
153 /// Reserved URI characters (per RFC 3986, section 2.2)
154 pub const URI_RESERVED: Self = Self::blank().add_bytes(b"!#$&'()*+,/:;=?@[]");
155
156 const fn blank() -> Self {
157 Self { mask: [0; N] }
158 }
159
160 /// Adds the `byte` to the set.
161 pub const fn add(&self, byte: u8) -> Self {
162 let mut mask = self.mask;
163 mask[byte as usize / BITS_PER_CHUNK] |= 1 << (byte as usize % BITS_PER_CHUNK);
164 Self { mask }
165 }
166
167 /// Removes the `byte` from the set.
168 pub const fn remove(&self, byte: u8) -> Self {
169 let mut mask = self.mask;
170 mask[byte as usize / BITS_PER_CHUNK] &= !(1 << (byte as usize % BITS_PER_CHUNK));
171 Self { mask }
172 }
173
174 /// Adds every byte from the slice to the set.
175 pub const fn add_bytes(&self, bytes: &[u8]) -> Self {
176 let mut aset = *self;
177 let mut i = 0;
178 while i < bytes.len() {
179 aset = aset.add(bytes[i]);
180 i += 1;
181 }
182 aset
183 }
184
185 /// Removes every byte from the slice from the set.
186 pub const fn remove_bytes(&self, bytes: &[u8]) -> Self {
187 let mut aset = *self;
188 let mut i = 0;
189 while i < bytes.len() {
190 aset = aset.remove(bytes[i]);
191 i += 1;
192 }
193 aset
194 }
195
196 /// Adds every byte from the inclusive range to the set.
197 pub const fn add_range(&self, range: RangeInclusive<u8>) -> Self {
198 let mut aset = *self;
199 let mut c = *range.start();
200 while c <= *range.end() {
201 aset = aset.add(c);
202 c += 1;
203 }
204 aset
205 }
206
207 /// Removes every byte from the inclusive range from the set.
208 pub const fn remove_range(&self, range: RangeInclusive<u8>) -> Self {
209 let mut aset = *self;
210 let mut c = *range.start();
211 while c <= *range.end() {
212 aset = aset.remove(c);
213 c += 1;
214 }
215 aset
216 }
217
218 /// Returns the union of this set and `other`.
219 ///
220 /// #Panics
221 /// Panics if the size of `other` is bigger than the size of `self`.
222 ///
223 /// # Examples
224 /// ```
225 /// use bset::AsciiSet;
226 /// assert_eq!(AsciiSet::ALPHABETIC, AsciiSet::UPPERCASE.union(AsciiSet::LOWERCASE));
227 /// ```
228 pub const fn union<const M: usize>(&self, other: AnyByteSet<M>) -> Self {
229 let mut mask = [0; N];
230 let mut i = 0;
231 while i < N {
232 mask[i] = self.mask[i] | other.mask[i];
233 i += 1;
234 }
235 Self { mask }
236 }
237
238 /// Returns the intersection of this set and `other`.
239 ///
240 /// #Panics
241 /// Panics if the size of `other` is bigger than the size of `self`.
242 ///
243 /// # Examples
244 /// ```
245 /// use bset::AsciiSet;
246 /// assert_eq!(AsciiSet::LOWERCASE, AsciiSet::ALPHABETIC.intersection(AsciiSet::LOWERCASE));
247 /// ```
248 pub const fn intersection<const M: usize>(&self, other: AnyByteSet<M>) -> Self {
249 let mut mask = [0; N];
250 let mut i = 0;
251 while i < N {
252 mask[i] = self.mask[i] & other.mask[i];
253 i += 1;
254 }
255 Self { mask }
256 }
257
258 /// Returns the set of all ASCII chars not in `self`.
259 pub const fn complement(&self) -> Self {
260 let mut mask = self.mask;
261 let mut i = 0;
262 while i < N {
263 mask[i] = !mask[i];
264 i += 1;
265 }
266 Self { mask }
267 }
268
269 /// Returns the set of chars in `self` but not `other`.
270 ///
271 /// #Panics
272 /// Panics if the size of `other` is bigger than the size of `self`.
273 ///
274 /// # Examples
275 /// ```
276 /// use bset::AsciiSet;
277 /// assert_eq!(AsciiSet::LOWERCASE, AsciiSet::ALPHABETIC.difference(AsciiSet::UPPERCASE));
278 /// ```
279 pub const fn difference<const M: usize>(&self, other: AnyByteSet<M>) -> Self {
280 self.intersection(other.complement())
281 }
282}
283
284impl<T> AsciiStack<T> {
285 /// Tests whether the set at the position `B` in the stack contains the `byte`.
286 #[inline]
287 pub fn contains<B: Bit>(&self, byte: u8) -> bool {
288 byte < ASCII_RANGE_LEN as u8 && self.masks[byte as usize] & (1 << B::NUMBER) != 0
289 }
290}
291
292impl AsciiStack<B0> {
293 /// Creates a new `AsciiStack`
294 pub const fn new() -> Self {
295 Self {
296 masks: [0; ASCII_RANGE_LEN],
297 current: PhantomData,
298 }
299 }
300}
301
302impl<T> ByteStack<T> {
303 /// Tests whether the set at the position `B` in the stack contains the `byte`.
304 #[inline]
305 pub fn contains<B: Bit>(&self, byte: u8) -> bool {
306 self.masks[byte as usize] & (1 << B::NUMBER) != 0
307 }
308}
309
310impl ByteStack<B0> {
311 /// Creates a new `ByteStack`
312 pub const fn new() -> Self {
313 Self {
314 masks: [0; 2 * ASCII_RANGE_LEN],
315 current: PhantomData,
316 }
317 }
318}
319
320// TODO: Implement this generically once generic bounds are stable for const fns.
321macro_rules! implement_add_set {
322 ($($ty:ty),*) => {
323 $(impl<const N: usize> AnyByteStack<$ty, N> {
324 /// Add this byte set to the next available position in this stack.
325 pub const fn add_set<const M: usize>(
326 &self,
327 aset: AnyByteSet<M>,
328 ) -> AnyByteStack<<$ty as Bit>::Successor, N> {
329 let mut masks = self.masks;
330 let mask = aset.mask;
331 let mut i = 0;
332 while i < M {
333 let mut j = 0;
334 while j < BITS_PER_CHUNK {
335 if mask[i] & (1 << j) != 0 {
336 masks[i * BITS_PER_CHUNK + j] |= 1 << <$ty>::NUMBER;
337 }
338 j += 1;
339 }
340 i += 1;
341 }
342
343 AnyByteStack {
344 masks,
345 current: PhantomData,
346 }
347 }
348 })*
349 }
350}
351implement_add_set!(B0, B1, B2, B3, B4, B5, B6, B7);