bset/
lib.rs

1//! Fast and compact sets of bytes or ASCII characters,
2//! useful for searching, parsing and determining membership of a given byte
3//! in the given set.
4//! They don't use any allocation, nor even any `std` features.
5//! In fact, all of the provided functions are `const`, so they can be freely
6//! constructed at the compile time
7//!
8//! # Sets
9//! This crate exports two set types - `ByteSet` for a set of general bytes
10//! and `AsciiSet` for a set restricted to the range of ASCII characters.
11//! The is two times smaller in the memory, but comes with a slight
12//! performance trade-off in having to check whether the given character
13//! belongs to the ASCII range.
14//! ```
15//! use bset::AsciiSet;
16//!
17//! const OP: AsciiSet = AsciiSet::new().add_bytes(b"+-*/%&|^");
18//! assert!(OP.contains(b'%'));
19//! ```
20//! The sets are implemented as an array of pointer-sized bit masks.
21//!
22//! # Stack of sets
23//! Inspired by [this article](https://maciej.codes/2020-04-19-stacking-luts-in-logos.html)
24//! by Maciej Hirsz, this crate provides a way to stack multiple sets into one
25//! structure to gain even more performance.
26//! To not slow it down, sets are "obtained" from the given stack at the type
27//! level. For this, the module `bits` contains types `B0`, `B1`, ..., `B7`
28//! representing indices of a set in the stack.
29//! Because `const fn`s currently don't support generic functions, the sets
30//! are indexed by the order they were added to the stack.
31//! Type aliases can be used to identify the sets within the stack:
32//! ```
33//! use bset::{bits::*, ByteSet, ByteStack};
34//! 
35//! const BYTE_STACK: ByteStack<B3> = ByteStack::new()
36//!     .add_set(ByteSet::DIGITS)
37//!     .add_set(ByteSet::ALPHABETIC)
38//!     .add_set(ByteSet::new().add_bytes(b"+-*/%&|^"));
39//! type Digits = B0;
40//! type Alphabetic = B1;
41//! type Operations = B2;
42//! assert!(BYTE_STACK.contains::<Operations>(b'%'));
43//! ```
44//! Again, there are two versions, `ByteStack` for all bytes and `AsciiStack`
45//! restricted to the ASCII range. Benchmarks show that testing the set membership
46//! is about 20% faster with stacked sets. They come with 8 times larger
47//! memory size (128/256 bytes vs. 16/32), which does not increase with the stacks
48//! added, so when 8 sets (the maximum number) are used in one stack,
49//! the memory size is equivalent.
50#![no_std]
51#![warn(missing_docs)]
52mod bit;
53/// Types that denote the position of a byte set within a byte stack.
54pub mod bits {
55    pub use crate::bit::{B0, B1, B2, B3, B4, B5, B6, B7};
56}
57use bit::Bit;
58use bits::*;
59use core::marker::PhantomData;
60use core::ops::RangeInclusive;
61
62type Chunk = usize;
63/// Range of ASCII characters.
64pub const ASCII_RANGE_LEN: usize = 0x80;
65/// Size of one chunk of the mask in the implementation of byte sets.
66pub const CHUNK_SIZE: usize = core::mem::size_of::<Chunk>();
67/// Number of bytes in one chunk of the mask.
68pub const BITS_PER_CHUNK: usize = 8 * CHUNK_SIZE;
69/// Number of chunks in the ASCII set.
70pub const CHUNKS: usize = ASCII_RANGE_LEN / BITS_PER_CHUNK;
71
72/// A compact set of bytes.
73/// Only particular instances - `AsciiSet` and `ByteSet` can be constructed.
74#[derive(Clone, Copy, Debug, Eq, PartialEq)]
75pub struct AnyByteSet<const N: usize> {
76    mask: [Chunk; N],
77}
78
79/// A compact set of ASCII bytes. Spans only 16 bytes.
80pub type AsciiSet = AnyByteSet<CHUNKS>;
81/// A compact set of all bytes. Spans only 32 bytes.
82pub type ByteSet = AnyByteSet<{ 2 * CHUNKS }>;
83
84/// A compact stack of up to 8 byte sets for fast lookup.
85/// Only particular instances - `AsciiStack` and `ByteStack` can be constructed.
86#[derive(Clone, Copy, Debug, Eq, PartialEq)]
87pub struct AnyByteStack<B, const N: usize> {
88    masks: [u8; N],
89    current: PhantomData<B>,
90}
91
92/// A compact stack of up to 8 ASCII sets for fast lookup.
93pub type AsciiStack<B = ()> = AnyByteStack<B, ASCII_RANGE_LEN>;
94/// A compact stack of up to 8 full byte sets for fast lookup.
95pub type ByteStack<B = ()> = AnyByteStack<B, { 2 * ASCII_RANGE_LEN }>;
96
97impl AsciiSet {
98    /// Creates a new, empty, `AsciiSet`.
99    pub const fn new() -> Self {
100        Self { mask: [0; CHUNKS] }
101    }
102
103    /// Tests whether this set contains the `byte`.
104    #[inline]
105    pub const fn contains(&self, byte: u8) -> bool {
106        if byte >= ASCII_RANGE_LEN as u8 {
107            return false;
108        };
109        let chunk = self.mask[byte as usize / BITS_PER_CHUNK];
110        let mask = 1 << (byte as usize % BITS_PER_CHUNK);
111        (chunk & mask) != 0
112    }
113}
114
115impl ByteSet {
116    /// Creates a new, empty, `ByteSet`.
117    pub const fn new() -> Self {
118        Self {
119            mask: [0; 2 * CHUNKS],
120        }
121    }
122
123    /// Tests whether this set contains the `byte`.
124    #[inline]
125    pub const fn contains(&self, byte: u8) -> bool {
126        let chunk = self.mask[byte as usize / BITS_PER_CHUNK];
127        let mask = 1 << (byte as usize % BITS_PER_CHUNK);
128        (chunk & mask) != 0
129    }
130}
131
132impl<const N: usize> AnyByteSet<N> {
133    /// Lowercase letters (`a` - `z`)
134    pub const LOWERCASE: Self = Self::blank().add_range(b'a'..=b'z');
135    /// Uppercase letters (`A` - `Z`)
136    pub const UPPERCASE: Self = Self::blank().add_range(b'A'..=b'Z');
137    /// Numerical digits (`0` - `9`)
138    pub const DIGITS: Self = Self::blank().add_range(b'0'..=b'9');
139    /// Uppercase and lowercase letters
140    pub const ALPHABETIC: Self = Self::LOWERCASE.union(Self::UPPERCASE);
141    /// Uppercase and lowercase letters and digits
142    pub const ALPHANUMERIC: Self = Self::ALPHABETIC.union(Self::DIGITS);
143
144    /// Space and tab
145    pub const SPACE_TAB: Self = Self::blank().add_bytes(b" \t");
146    /// Line feed and carriage return
147    pub const NEWLINE: Self = Self::blank().add_bytes(b"\r\n");
148    /// Space, tab, line feed and carriage return
149    pub const WHITESPACE: Self = Self::SPACE_TAB.union(Self::NEWLINE);
150
151    /// ASCII graphic characters
152    pub const GRAPHIC: Self = Self::blank().add_range(b'!'..=b'~');
153    /// Reserved URI characters (per RFC 3986, section 2.2)
154    pub const URI_RESERVED: Self = Self::blank().add_bytes(b"!#$&'()*+,/:;=?@[]");
155
156    const fn blank() -> Self {
157        Self { mask: [0; N] }
158    }
159
160    /// Adds the `byte` to the set.
161    pub const fn add(&self, byte: u8) -> Self {
162        let mut mask = self.mask;
163        mask[byte as usize / BITS_PER_CHUNK] |= 1 << (byte as usize % BITS_PER_CHUNK);
164        Self { mask }
165    }
166
167    /// Removes the `byte` from the set.
168    pub const fn remove(&self, byte: u8) -> Self {
169        let mut mask = self.mask;
170        mask[byte as usize / BITS_PER_CHUNK] &= !(1 << (byte as usize % BITS_PER_CHUNK));
171        Self { mask }
172    }
173
174    /// Adds every byte from the slice to the set.
175    pub const fn add_bytes(&self, bytes: &[u8]) -> Self {
176        let mut aset = *self;
177        let mut i = 0;
178        while i < bytes.len() {
179            aset = aset.add(bytes[i]);
180            i += 1;
181        }
182        aset
183    }
184
185    /// Removes every byte from the slice from the set.
186    pub const fn remove_bytes(&self, bytes: &[u8]) -> Self {
187        let mut aset = *self;
188        let mut i = 0;
189        while i < bytes.len() {
190            aset = aset.remove(bytes[i]);
191            i += 1;
192        }
193        aset
194    }
195
196    /// Adds every byte from the inclusive range to the set.
197    pub const fn add_range(&self, range: RangeInclusive<u8>) -> Self {
198        let mut aset = *self;
199        let mut c = *range.start();
200        while c <= *range.end() {
201            aset = aset.add(c);
202            c += 1;
203        }
204        aset
205    }
206
207    /// Removes every byte from the inclusive range from the set.
208    pub const fn remove_range(&self, range: RangeInclusive<u8>) -> Self {
209        let mut aset = *self;
210        let mut c = *range.start();
211        while c <= *range.end() {
212            aset = aset.remove(c);
213            c += 1;
214        }
215        aset
216    }
217
218    /// Returns the union of this set and `other`.
219    /// 
220    /// #Panics
221    /// Panics if the size of `other` is bigger than the size of `self`.
222    ///
223    /// # Examples
224    /// ```
225    /// use bset::AsciiSet;
226    /// assert_eq!(AsciiSet::ALPHABETIC, AsciiSet::UPPERCASE.union(AsciiSet::LOWERCASE));
227    /// ```
228    pub const fn union<const M: usize>(&self, other: AnyByteSet<M>) -> Self {
229        let mut mask = [0; N];
230        let mut i = 0;
231        while i < N {
232            mask[i] = self.mask[i] | other.mask[i];
233            i += 1;
234        }
235        Self { mask }
236    }
237
238    /// Returns the intersection of this set and `other`.
239    /// 
240    /// #Panics
241    /// Panics if the size of `other` is bigger than the size of `self`.
242    ///
243    /// # Examples
244    /// ```
245    /// use bset::AsciiSet;
246    /// assert_eq!(AsciiSet::LOWERCASE, AsciiSet::ALPHABETIC.intersection(AsciiSet::LOWERCASE));
247    /// ```
248    pub const fn intersection<const M: usize>(&self, other: AnyByteSet<M>) -> Self {
249        let mut mask = [0; N];
250        let mut i = 0;
251        while i < N {
252            mask[i] = self.mask[i] & other.mask[i];
253            i += 1;
254        }
255        Self { mask }
256    }
257
258    /// Returns the set of all ASCII chars not in `self`.
259    pub const fn complement(&self) -> Self {
260        let mut mask = self.mask;
261        let mut i = 0;
262        while i < N {
263            mask[i] = !mask[i];
264            i += 1;
265        }
266        Self { mask }
267    }
268
269    /// Returns the set of chars in `self` but not `other`.
270    /// 
271    /// #Panics
272    /// Panics if the size of `other` is bigger than the size of `self`.
273    ///
274    /// # Examples
275    /// ```
276    /// use bset::AsciiSet;
277    /// assert_eq!(AsciiSet::LOWERCASE, AsciiSet::ALPHABETIC.difference(AsciiSet::UPPERCASE));
278    /// ```
279    pub const fn difference<const M: usize>(&self, other: AnyByteSet<M>) -> Self {
280        self.intersection(other.complement())
281    }
282}
283
284impl<T> AsciiStack<T> {
285    /// Tests whether the set at the position `B` in the stack contains the `byte`.
286    #[inline]
287    pub fn contains<B: Bit>(&self, byte: u8) -> bool {
288        byte < ASCII_RANGE_LEN as u8 && self.masks[byte as usize] & (1 << B::NUMBER) != 0
289    }
290}
291
292impl AsciiStack<B0> {
293    /// Creates a new `AsciiStack`
294    pub const fn new() -> Self {
295        Self {
296            masks: [0; ASCII_RANGE_LEN],
297            current: PhantomData,
298        }
299    }
300}
301
302impl<T> ByteStack<T> {
303    /// Tests whether the set at the position `B` in the stack contains the `byte`.
304    #[inline]
305    pub fn contains<B: Bit>(&self, byte: u8) -> bool {
306        self.masks[byte as usize] & (1 << B::NUMBER) != 0
307    }
308}
309
310impl ByteStack<B0> {
311    /// Creates a new `ByteStack`
312    pub const fn new() -> Self {
313        Self {
314            masks: [0; 2 * ASCII_RANGE_LEN],
315            current: PhantomData,
316        }
317    }
318}
319
320// TODO: Implement this generically once generic bounds are stable for const fns.
321macro_rules! implement_add_set {
322    ($($ty:ty),*) => {
323        $(impl<const N: usize> AnyByteStack<$ty, N> {
324            /// Add this byte set to the next available position in this stack.
325            pub const fn add_set<const M: usize>(
326                &self,
327                aset: AnyByteSet<M>,
328            ) -> AnyByteStack<<$ty as Bit>::Successor, N> {
329                let mut masks = self.masks;
330                let mask = aset.mask;
331                let mut i = 0;
332                while i < M {
333                    let mut j = 0;
334                    while j < BITS_PER_CHUNK {
335                        if mask[i] & (1 << j) != 0 {
336                            masks[i * BITS_PER_CHUNK + j] |= 1 << <$ty>::NUMBER;
337                        }
338                        j += 1;
339                    }
340                    i += 1;
341                }
342
343                AnyByteStack {
344                    masks,
345                    current: PhantomData,
346                }
347            }
348        })*
349    }
350}
351implement_add_set!(B0, B1, B2, B3, B4, B5, B6, B7);