intern_str/
lib.rs

1//! Simple, fast and allocation-free string interning.
2//!
3//! `intern-str` is a library for interning strings in Rust. It allows
4//! you to build a graph that can convert strings (and byte strings)
5//! into arbitrary values. In addition, these graphs are `no_std` and
6//! can be embedded into projects at compile time.
7//!
8//! The goal is to allow for low-cost, efficient string interning that
9//!  does not require an allocator or the standard library. The intended
10//! use case is [MIME Types], but it is likely to be useful in many
11//! other cases as well.
12//!
13//! [MIME Types]: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types
14//!
15//! `intern-str` is `no_std`, no `extern crate alloc`,
16//! `forbid(unsafe_code)`, and has no dependencies aside from the Rust
17//! `core` library.
18//!
19//! To generate `intern-str` graphs as code, see the [`builder`] module
20//! and the [`intern-str-codegen`] crate.
21//!
22//! [`intern-str-codegen`]: https://crates.io/crates/intern-str-codegen
23//!
24//! ## Implementation
25//!
26//! `intern-str` generates a DFA consisting of all possible options for
27//! a given set of strings. This DFA is then used to convert a string
28//! into a value. The DFA is generated at compile time, and is embedded
29//! into the binary. This allows for efficient string interning without
30//! the need for an allocator or the standard library.
31//!
32//! In many cases, this approach is significantly faster than converting
33//! the string to lowercase and matching on it. When matching on
34//! `/usr/share/dict/words`, `intern-str` usually completes queries in
35//! 50 ns.
36//!
37//! The main advantage of this approach is that it can be used to create
38//! case-insensitive matching, which is significantly more difficult to
39//! do with other libraries like [`phf`]. When compared against [`phf`]
40//! when [`phf`] has to convert strings to lower case before running,
41//! `intern-str` is usually as fast as [`phf`] if not faster.
42//!
43//! [`phf`]: https://crates.io/crates/phf
44
45#![no_std]
46#![forbid(
47    unsafe_code,
48    missing_docs,
49    missing_debug_implementations,
50    missing_copy_implementations,
51    future_incompatible,
52    rust_2018_idioms
53)]
54
55#[cfg(feature = "builder")]
56pub mod builder;
57
58#[cfg(all(feature = "builder", not(intern_str_no_alloc)))]
59extern crate alloc;
60#[cfg(all(feature = "builder", intern_str_no_alloc))]
61extern crate std as alloc;
62
63#[cfg(feature = "std")]
64extern crate std;
65
66#[cfg(feature = "builder")]
67use alloc::vec::Vec;
68
69use core::{cmp, hash, ops};
70
71/// A node in a DFA.
72#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
73pub struct Node<'inst, Input, Output> {
74    /// The slice of values that this node accepts, combined with the index of the
75    /// next node.
76    ///
77    /// The slice is sorted by the input value.
78    inputs: MaybeSlice<'inst, (Input, usize)>,
79
80    /// The output resulting from the DFA halting on this node.
81    output: Output,
82
83    /// The index of the default node to go to if no input matches.
84    default: usize,
85
86    /// The "slice" of the input that we need to match on.
87    amount: usize,
88}
89
90/// A deterministic finite automaton (DFA) that can be used to process sequential
91/// input to produce an output.
92#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
93pub struct Graph<'inst, 'nodes, Input, Output> {
94    /// The nodes in the graph.
95    nodes: &'nodes [Node<'inst, Input, Output>],
96
97    /// The index of the start node.
98    start: usize,
99}
100
101impl<'inst, Input, Output> Node<'inst, Input, Output> {
102    /// Create a new node from its parts.
103    pub const fn new(
104        inputs: &'inst [(Input, usize)],
105        output: Output,
106        default: usize,
107        amount: usize,
108    ) -> Self {
109        Self {
110            inputs: MaybeSlice::Slice(inputs),
111            output,
112            default,
113            amount,
114        }
115    }
116}
117
118impl<'inst, Input: Segmentable, Output> Node<'inst, Input, Output> {
119    /// Determine the next index to go to based on the input.
120    fn next(&self, input: &Input) -> usize {
121        // Use a binary search, since the input is sorted.
122        match self.inputs.binary_search_by(|(i, _)| i.cmp(input)) {
123            Ok(i) => self.inputs[i].1,
124            Err(_) => self.default,
125        }
126    }
127
128    /// Get the inputs of this node.
129    pub fn inputs(&self) -> &[(Input, usize)] {
130        match &self.inputs {
131            MaybeSlice::Slice(s) => s,
132            #[cfg(feature = "builder")]
133            MaybeSlice::Vec(v) => v,
134        }
135    }
136
137    /// Get the output of this node.
138    pub fn output(&self) -> &Output {
139        &self.output
140    }
141
142    /// Get the default node index.
143    pub fn default(&self) -> usize {
144        self.default
145    }
146
147    /// Get the amount of input to match on.
148    pub fn amount(&self) -> usize {
149        self.amount
150    }
151}
152
153impl<'nodes, 'inst, Input, Output> Graph<'inst, 'nodes, Input, Output> {
154    /// Create a new graph from a set of nodes.
155    pub const fn new(nodes: &'nodes [Node<'inst, Input, Output>], start: usize) -> Self {
156        Self { nodes, start }
157    }
158}
159
160impl<'nodes, 'inst, Input: Segmentable, Output> Graph<'inst, 'nodes, Input, Output> {
161    /// Get the nodes of this graph.
162    pub fn nodes(&self) -> &'nodes [Node<'inst, Input, Output>] {
163        self.nodes
164    }
165
166    /// Get the start node index.
167    pub fn start(&self) -> usize {
168        self.start
169    }
170
171    /// Process the input and return the output.
172    pub fn process(&self, mut input: Input) -> &Output {
173        let mut node = &self.nodes[self.start];
174
175        // Process the input in chunks.
176        loop {
177            // Get the next input chunk.
178            let (chunk, rest) = match input.split(node.amount) {
179                Some(result) => result,
180                None => {
181                    // Return the value of the current node.
182                    return &node.output;
183                }
184            };
185
186            // Get the next node.
187            node = &self.nodes[node.next(&chunk)];
188            input = rest;
189        }
190    }
191}
192
193/// An item that can be segmented into parts.
194pub trait Segmentable: Ord + Sized {
195    /// Split the item into two parts.
196    fn split(self, at: usize) -> Option<(Self, Self)>;
197
198    /// Get the length of the item.
199    fn len(&self) -> usize;
200
201    /// Tell if the item is empty.
202    fn is_empty(&self) -> bool {
203        self.len() == 0
204    }
205}
206
207impl<'a> Segmentable for &'a str {
208    fn split(self, at: usize) -> Option<(Self, Self)> {
209        if at > self.len() {
210            return None;
211        }
212
213        let (left, right) = self.split_at(at);
214        Some((left, right))
215    }
216
217    fn len(&self) -> usize {
218        str::len(self)
219    }
220}
221
222impl<'a, T: Ord> Segmentable for &'a [T] {
223    fn split(self, at: usize) -> Option<(Self, Self)> {
224        if at > self.len() {
225            return None;
226        }
227
228        let (left, right) = self.split_at(at);
229        Some((left, right))
230    }
231
232    fn len(&self) -> usize {
233        <[T]>::len(self)
234    }
235}
236
237/// The wrapper type for a string that is compared case-insensitively.
238///
239/// The inner string is implied to be ASCII.
240#[derive(Debug, Clone, Copy, Default)]
241pub struct CaseInsensitive<T>(pub T);
242
243impl<T> ops::Deref for CaseInsensitive<T> {
244    type Target = T;
245
246    fn deref(&self) -> &T {
247        &self.0
248    }
249}
250
251impl<T> ops::DerefMut for CaseInsensitive<T> {
252    fn deref_mut(&mut self) -> &mut T {
253        &mut self.0
254    }
255}
256
257impl<T> From<T> for CaseInsensitive<T> {
258    fn from(value: T) -> Self {
259        CaseInsensitive(value)
260    }
261}
262
263impl<T: AsRef<[u8]>> PartialEq for CaseInsensitive<T> {
264    fn eq(&self, other: &Self) -> bool {
265        self.0.as_ref().eq_ignore_ascii_case(other.0.as_ref())
266    }
267}
268
269impl<T: AsRef<[u8]>> Eq for CaseInsensitive<T> {}
270
271impl<T: AsRef<[u8]>> PartialOrd for CaseInsensitive<T> {
272    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
273        Some(self.cmp(other))
274    }
275}
276
277impl<T: AsRef<[u8]>> Ord for CaseInsensitive<T> {
278    fn cmp(&self, other: &Self) -> cmp::Ordering {
279        let this = self.0.as_ref();
280        let other = other.0.as_ref();
281        let common_len = cmp::min(this.len(), other.len());
282
283        let this_seg = &this[..common_len];
284        let other_seg = &other[..common_len];
285
286        // Compare the common segment.
287        for (a, b) in this_seg.iter().zip(other_seg.iter()) {
288            let a = a.to_ascii_lowercase();
289            let b = b.to_ascii_lowercase();
290
291            match a.cmp(&b) {
292                cmp::Ordering::Equal => continue,
293                other => return other,
294            }
295        }
296
297        // Compare the lengths.
298        this.len().cmp(&other.len())
299    }
300}
301
302impl<T: AsRef<[u8]>> hash::Hash for CaseInsensitive<T> {
303    fn hash<H: hash::Hasher>(&self, state: &mut H) {
304        for byte in self.0.as_ref() {
305            state.write_u8(byte.to_ascii_lowercase());
306        }
307    }
308}
309
310impl<T: Segmentable + AsRef<[u8]>> Segmentable for CaseInsensitive<T> {
311    fn split(self, at: usize) -> Option<(Self, Self)> {
312        T::split(self.0, at).map(|(left, right)| (left.into(), right.into()))
313    }
314
315    fn len(&self) -> usize {
316        T::len(&self.0)
317    }
318}
319
320#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
321enum MaybeSlice<'a, T> {
322    Slice(&'a [T]),
323    #[cfg(feature = "builder")]
324    Vec(Vec<T>),
325}
326
327impl<'a, T> core::ops::Deref for MaybeSlice<'a, T> {
328    type Target = [T];
329
330    fn deref(&self) -> &Self::Target {
331        match self {
332            MaybeSlice::Slice(slice) => slice,
333            #[cfg(feature = "builder")]
334            MaybeSlice::Vec(vec) => vec,
335        }
336    }
337}