fst_no_std/
set.rs

1#[cfg(feature = "alloc")]
2use crate::automaton::{AlwaysMatch, Automaton};
3use crate::raw;
4#[cfg(feature = "alloc")]
5use crate::stream::IntoStreamer;
6use crate::stream::Streamer;
7use crate::Result;
8#[cfg(feature = "alloc")]
9use alloc::{string::String, vec::Vec};
10#[cfg(feature = "alloc")]
11use core::fmt;
12#[cfg(feature = "std")]
13use core::iter;
14#[cfg(feature = "alloc")]
15use core::iter::FromIterator;
16#[cfg(feature = "std")]
17use std::io;
18
19/// Set is a lexicographically ordered set of byte strings.
20///
21/// A `Set` is constructed with the `SetBuilder` type. Alternatively, a `Set`
22/// can be constructed in memory from a lexicographically ordered iterator
23/// of byte strings (`Set::from_iter`).
24///
25/// A key feature of `Set` is that it can be serialized to disk compactly. Its
26/// underlying representation is built such that the `Set` can be memory mapped
27/// and searched without necessarily loading the entire set into memory.
28///
29/// It supports most common operations associated with sets, such as
30/// membership, union, intersection, subset/superset, etc. It also supports
31/// range queries and automata based searches (e.g. a regular expression).
32///
33/// Sets are represented by a finite state transducer where output values are
34/// always zero. As such, sets have the following invariants:
35///
36/// 1. Once constructed, a `Set` can never be modified.
37/// 2. Sets must be constructed with lexicographically ordered byte sequences.
38#[derive(Clone)]
39pub struct Set<D>(raw::Fst<D>);
40
41#[cfg(feature = "alloc")]
42impl Set<Vec<u8>> {
43    /// Create a `Set` from an iterator of lexicographically ordered byte
44    /// strings.
45    ///
46    /// If the iterator does not yield values in lexicographic order, then an
47    /// error is returned.
48    ///
49    /// Note that this is a convenience function to build a set in memory.
50    /// To build a set that streams to an arbitrary `io::Write`, use
51    /// `SetBuilder`.
52    #[cfg(feature = "std")]
53    pub fn from_iter<T, I>(iter: I) -> Result<Set<Vec<u8>>>
54    where
55        T: AsRef<[u8]>,
56        I: IntoIterator<Item = T>,
57    {
58        let mut builder = SetBuilder::memory();
59        builder.extend_iter(iter)?;
60        Set::new(builder.into_inner()?)
61    }
62}
63
64impl<D: AsRef<[u8]>> Set<D> {
65    /// Creates a set from its representation as a raw byte sequence.
66    ///
67    /// This accepts anything that can be cheaply converted to a `&[u8]`. The
68    /// caller is responsible for guaranteeing that the given bytes refer to
69    /// a valid FST. While memory safety will not be violated by invalid input,
70    /// a panic could occur while reading the FST at any point.
71    ///
72    /// # Example
73    ///
74    /// ```no_run
75    /// use fst_no_std::Set;
76    ///
77    /// // File written from a build script using SetBuilder.
78    /// # const IGNORE: &str = stringify! {
79    /// static FST: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/set.fst"));
80    /// # };
81    /// # static FST: &[u8] = &[];
82    ///
83    /// let set = Set::new(FST).unwrap();
84    /// ```
85    pub fn new(data: D) -> Result<Set<D>> {
86        raw::Fst::new(data).map(Set)
87    }
88
89    /// Tests the membership of a single key.
90    ///
91    /// # Example
92    ///
93    /// ```rust
94    /// use fst_no_std::Set;
95    ///
96    /// let set = Set::from_iter(&["a", "b", "c"]).unwrap();
97    ///
98    /// assert_eq!(set.contains("b"), true);
99    /// assert_eq!(set.contains("z"), false);
100    /// ```
101    pub fn contains<K: AsRef<[u8]>>(&self, key: K) -> bool {
102        self.0.contains_key(key)
103    }
104
105    /// Return a lexicographically ordered stream of all keys in this set.
106    ///
107    /// While this is a stream, it does require heap space proportional to the
108    /// longest key in the set.
109    ///
110    /// If the set is memory mapped, then no further heap space is needed.
111    /// Note though that your operating system may fill your page cache
112    /// (which will cause the resident memory usage of the process to go up
113    /// correspondingly).
114    ///
115    /// # Example
116    ///
117    /// Since streams are not iterators, the traditional `for` loop cannot be
118    /// used. `while let` is useful instead:
119    ///
120    /// ```rust
121    /// use fst_no_std::{IntoStreamer, Streamer, Set};
122    ///
123    /// let set = Set::from_iter(&["a", "b", "c"]).unwrap();
124    /// let mut stream = set.stream();
125    ///
126    /// let mut keys = vec![];
127    /// while let Some(key) = stream.next() {
128    ///     keys.push(key.to_vec());
129    /// }
130    /// assert_eq!(keys, vec![b"a", b"b", b"c"]);
131    /// ```
132    #[inline]
133    #[cfg(feature = "alloc")]
134    pub fn stream(&self) -> Stream<'_> {
135        Stream(self.0.stream())
136    }
137
138    /// Return a builder for range queries.
139    ///
140    /// A range query returns a subset of keys in this set in a range given in
141    /// lexicographic order.
142    ///
143    /// Memory requirements are the same as described on `Set::stream`.
144    /// Notably, only the keys in the range are read; keys outside the range
145    /// are not.
146    ///
147    /// # Example
148    ///
149    /// Returns only the keys in the range given.
150    ///
151    /// ```rust
152    /// use fst_no_std::{IntoStreamer, Streamer, Set};
153    ///
154    /// let set = Set::from_iter(&["a", "b", "c", "d", "e"]).unwrap();
155    /// let mut stream = set.range().ge("b").lt("e").into_stream();
156    ///
157    /// let mut keys = vec![];
158    /// while let Some(key) = stream.next() {
159    ///     keys.push(key.to_vec());
160    /// }
161    /// assert_eq!(keys, vec![b"b", b"c", b"d"]);
162    /// ```
163    #[inline]
164    #[cfg(feature = "alloc")]
165    pub fn range(&self) -> StreamBuilder<'_> {
166        StreamBuilder(self.0.range())
167    }
168
169    /// Executes an automaton on the keys of this set.
170    ///
171    /// Note that this returns a `StreamBuilder`, which can be used to
172    /// add a range query to the search (see the `range` method).
173    ///
174    /// Memory requirements are the same as described on `Set::stream`.
175    ///
176    /// # Example
177    ///
178    /// An implementation of subsequence search for `Automaton` can be used
179    /// to search sets:
180    ///
181    /// ```rust
182    /// use fst_no_std::automaton::Subsequence;
183    /// use fst_no_std::{IntoStreamer, Streamer, Set};
184    ///
185    /// # fn main() { example().unwrap(); }
186    /// fn example() -> Result<(), Box<dyn std::error::Error>> {
187    ///     let set = Set::from_iter(&[
188    ///         "a foo bar", "foo", "foo1", "foo2", "foo3", "foobar",
189    ///     ]).unwrap();
190    ///
191    ///     let matcher = Subsequence::new("for");
192    ///     let mut stream = set.search(&matcher).into_stream();
193    ///
194    ///     let mut keys = vec![];
195    ///     while let Some(key) = stream.next() {
196    ///         keys.push(String::from_utf8(key.to_vec())?);
197    ///     }
198    ///     assert_eq!(keys, vec![
199    ///         "a foo bar", "foobar",
200    ///     ]);
201    ///
202    ///     Ok(())
203    /// }
204    /// ```
205    #[cfg(feature = "alloc")]
206    pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<'_, A> {
207        StreamBuilder(self.0.search(aut))
208    }
209
210    /// Executes an automaton on the values of this set and yields matching
211    /// values along with the corresponding matching states in the given
212    /// automaton.
213    ///
214    /// Note that this returns a `StreamWithStateBuilder`, which can be used to
215    /// add a range query to the search (see the `range` method).
216    ///
217    /// Memory requirements are the same as described on `Map::stream`.
218    ///
219    #[cfg_attr(
220        feature = "levenshtein",
221        doc = r##"
222# Example
223
224An implementation of fuzzy search using Levenshtein automata can be used
225to search sets:
226
227```rust
228use fst_no_std::automaton::Levenshtein;
229use fst_no_std::{IntoStreamer, Streamer, Set};
230
231# fn main() { example().unwrap(); }
232fn example() -> Result<(), Box<dyn std::error::Error>> {
233    let set = Set::from_iter(vec![
234        "foo",
235        "foob",
236        "foobar",
237        "fozb",
238    ]).unwrap();
239
240    let query = Levenshtein::new("foo", 2)?;
241    let mut stream = set.search_with_state(&query).into_stream();
242
243    let mut vs = vec![];
244    while let Some((v, s)) = stream.next() {
245        vs.push((String::from_utf8(v.to_vec())?, s));
246    }
247    // Currently, there isn't much interesting that you can do with the states.
248    assert_eq!(vs, vec![
249        ("foo".to_string(), Some(183)),
250        ("foob".to_string(), Some(123)),
251        ("fozb".to_string(), Some(83)),
252    ]);
253
254    Ok(())
255}
256```
257"##
258    )]
259    #[cfg(feature = "alloc")]
260    pub fn search_with_state<A: Automaton>(
261        &self,
262        aut: A,
263    ) -> StreamWithStateBuilder<'_, A> {
264        StreamWithStateBuilder(self.0.search_with_state(aut))
265    }
266
267    /// Returns the number of elements in this set.
268    #[inline]
269    pub fn len(&self) -> usize {
270        self.0.len()
271    }
272
273    /// Returns true if and only if this set is empty.
274    #[inline]
275    pub fn is_empty(&self) -> bool {
276        self.0.is_empty()
277    }
278
279    /// Creates a new set operation with this set added to it.
280    ///
281    /// The `OpBuilder` type can be used to add additional set streams
282    /// and perform set operations like union, intersection, difference and
283    /// symmetric difference.
284    ///
285    /// # Example
286    ///
287    /// ```rust
288    /// use fst_no_std::{IntoStreamer, Streamer, Set};
289    ///
290    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
291    /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
292    ///
293    /// let mut union = set1.op().add(&set2).union();
294    ///
295    /// let mut keys = vec![];
296    /// while let Some(key) = union.next() {
297    ///     keys.push(key.to_vec());
298    /// }
299    /// assert_eq!(keys, vec![b"a", b"b", b"c", b"y", b"z"]);
300    /// ```
301    #[inline]
302    #[cfg(feature = "alloc")]
303    pub fn op(&self) -> OpBuilder<'_> {
304        OpBuilder::new().add(self)
305    }
306
307    /// Returns true if and only if the `self` set is disjoint with the set
308    /// `stream`.
309    ///
310    /// `stream` must be a lexicographically ordered sequence of byte strings.
311    ///
312    /// # Example
313    ///
314    /// ```rust
315    /// use fst_no_std::{IntoStreamer, Streamer, Set};
316    ///
317    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
318    /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap();
319    ///
320    /// assert_eq!(set1.is_disjoint(&set2), true);
321    ///
322    /// let set3 = Set::from_iter(&["a", "c"]).unwrap();
323    ///
324    /// assert_eq!(set1.is_disjoint(&set3), false);
325    /// ```
326    #[cfg(feature = "alloc")]
327    pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool
328    where
329        I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
330        S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
331    {
332        self.0.is_disjoint(StreamZeroOutput(stream.into_stream()))
333    }
334
335    /// Returns true if and only if the `self` set is a subset of `stream`.
336    ///
337    /// `stream` must be a lexicographically ordered sequence of byte strings.
338    ///
339    /// # Example
340    ///
341    /// ```rust
342    /// use fst_no_std::Set;
343    ///
344    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
345    /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap();
346    ///
347    /// assert_eq!(set1.is_subset(&set2), false);
348    ///
349    /// let set3 = Set::from_iter(&["a", "c"]).unwrap();
350    ///
351    /// assert_eq!(set1.is_subset(&set3), false);
352    /// assert_eq!(set3.is_subset(&set1), true);
353    /// ```
354    #[cfg(feature = "alloc")]
355    pub fn is_subset<'f, I, S>(&self, stream: I) -> bool
356    where
357        I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
358        S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
359    {
360        self.0.is_subset(StreamZeroOutput(stream.into_stream()))
361    }
362
363    /// Returns true if and only if the `self` set is a superset of `stream`.
364    ///
365    /// `stream` must be a lexicographically ordered sequence of byte strings.
366    ///
367    /// # Example
368    ///
369    /// ```rust
370    /// use fst_no_std::Set;
371    ///
372    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
373    /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap();
374    ///
375    /// assert_eq!(set1.is_superset(&set2), false);
376    ///
377    /// let set3 = Set::from_iter(&["a", "c"]).unwrap();
378    ///
379    /// assert_eq!(set1.is_superset(&set3), true);
380    /// assert_eq!(set3.is_superset(&set1), false);
381    /// ```
382    #[cfg(feature = "alloc")]
383    pub fn is_superset<'f, I, S>(&self, stream: I) -> bool
384    where
385        I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
386        S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
387    {
388        self.0.is_superset(StreamZeroOutput(stream.into_stream()))
389    }
390
391    /// Returns a reference to the underlying raw finite state transducer.
392    #[inline]
393    pub fn as_fst(&self) -> &raw::Fst<D> {
394        &self.0
395    }
396
397    /// Returns the underlying raw finite state transducer.
398    #[inline]
399    pub fn into_fst(self) -> raw::Fst<D> {
400        self.0
401    }
402
403    /// Maps the underlying data of the fst Set to another data type.
404    ///
405    /// # Example
406    ///
407    /// This example shows that you can map an fst Set based on a `Vec<u8>`
408    /// into an fst Set based on a `Cow<[u8]>`, it can also work with a
409    /// reference counted type (e.g. `Arc`, `Rc`).
410    ///
411    /// ```
412    /// use std::borrow::Cow;
413    ///
414    /// use fst_no_std::Set;
415    ///
416    /// let set: Set<Vec<u8>> = Set::from_iter(
417    ///     &["hello", "world"],
418    /// ).unwrap();
419    ///
420    /// let set_on_cow: Set<Cow<[u8]>> = set.map_data(Cow::Owned).unwrap();
421    /// ```
422    #[inline]
423    pub fn map_data<F, T>(self, f: F) -> Result<Set<T>>
424    where
425        F: FnMut(D) -> T,
426        T: AsRef<[u8]>,
427    {
428        self.into_fst().map_data(f).map(Set::from)
429    }
430}
431
432#[cfg(feature = "std")]
433impl Default for Set<Vec<u8>> {
434    #[inline]
435    fn default() -> Set<Vec<u8>> {
436        Set::from_iter(iter::empty::<&[u8]>()).unwrap()
437    }
438}
439
440#[cfg(feature = "alloc")]
441impl<D: AsRef<[u8]>> fmt::Debug for Set<D> {
442    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
443        write!(f, "Set([")?;
444        let mut stream = self.stream();
445        let mut first = true;
446        while let Some(key) = stream.next() {
447            if !first {
448                write!(f, ", ")?;
449            }
450            first = false;
451            write!(f, "{}", String::from_utf8_lossy(key))?;
452        }
453        write!(f, "])")
454    }
455}
456
457/// Returns the underlying finite state transducer.
458impl<D: AsRef<[u8]>> AsRef<raw::Fst<D>> for Set<D> {
459    #[inline]
460    fn as_ref(&self) -> &raw::Fst<D> {
461        &self.0
462    }
463}
464
465#[cfg(feature = "alloc")]
466impl<'s, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'s Set<D> {
467    type Item = &'a [u8];
468    type Into = Stream<'s>;
469
470    #[inline]
471    fn into_stream(self) -> Stream<'s> {
472        Stream(self.0.stream())
473    }
474}
475
476// Construct a set from an Fst object.
477impl<D: AsRef<[u8]>> From<raw::Fst<D>> for Set<D> {
478    #[inline]
479    fn from(fst: raw::Fst<D>) -> Set<D> {
480        Set(fst)
481    }
482}
483
484/// A builder for creating a set.
485///
486/// This is not your average everyday builder. It has two important qualities
487/// that make it a bit unique from what you might expect:
488///
489/// 1. All keys must be added in lexicographic order. Adding a key out of order
490///    will result in an error.
491/// 2. The representation of a set is streamed to *any* `io::Write` as it is
492///    built. For an in memory representation, this can be a `Vec<u8>`.
493///
494/// Point (2) is especially important because it means that a set can be
495/// constructed *without storing the entire set in memory*. Namely, since it
496/// works with any `io::Write`, it can be streamed directly to a file.
497///
498/// With that said, the builder does use memory, but **memory usage is bounded
499/// to a constant size**. The amount of memory used trades off with the
500/// compression ratio. Currently, the implementation hard codes this trade off
501/// which can result in about 5-20MB of heap usage during construction. (N.B.
502/// Guaranteeing a maximal compression ratio requires memory proportional to
503/// the size of the set, which defeats the benefit of streaming it to disk.
504/// In practice, a small bounded amount of memory achieves close-to-minimal
505/// compression ratios.)
506///
507/// The algorithmic complexity of set construction is `O(n)` where `n` is the
508/// number of elements added to the set.
509///
510/// # Example: build in memory
511///
512/// This shows how to use the builder to construct a set in memory. Note that
513/// `Set::from_iter` provides a convenience function that achieves this same
514/// goal without needing to explicitly use `SetBuilder`.
515///
516/// ```rust
517/// use fst_no_std::{IntoStreamer, Streamer, Set, SetBuilder};
518///
519/// let mut build = SetBuilder::memory();
520/// build.insert("bruce").unwrap();
521/// build.insert("clarence").unwrap();
522/// build.insert("stevie").unwrap();
523///
524/// // You could also call `finish()` here, but since we're building the set in
525/// // memory, there would be no way to get the `Vec<u8>` back.
526/// let bytes = build.into_inner().unwrap();
527///
528/// // At this point, the set has been constructed, but here's how to read it.
529/// let set = Set::new(bytes).unwrap();
530/// let mut stream = set.into_stream();
531/// let mut keys = vec![];
532/// while let Some(key) = stream.next() {
533///     keys.push(key.to_vec());
534/// }
535/// assert_eq!(keys, vec![
536///     "bruce".as_bytes(), "clarence".as_bytes(), "stevie".as_bytes(),
537/// ]);
538/// ```
539///
540/// # Example: stream to file
541///
542/// This shows how to stream construction of a set to a file.
543///
544/// ```rust,no_run
545/// use std::fs::File;
546/// use std::io;
547///
548/// use fst_no_std::{IntoStreamer, Streamer, Set, SetBuilder};
549///
550/// let mut wtr = io::BufWriter::new(File::create("set.fst").unwrap());
551/// let mut build = SetBuilder::new(wtr).unwrap();
552/// build.insert("bruce").unwrap();
553/// build.insert("clarence").unwrap();
554/// build.insert("stevie").unwrap();
555///
556/// // If you want the writer back, then call `into_inner`. Otherwise, this
557/// // will finish construction and call `flush`.
558/// build.finish().unwrap();
559///
560/// // At this point, the set has been constructed, but here's how to read it.
561/// // NOTE: Normally, one would memory map a file instead of reading its
562/// // entire contents on to the heap.
563/// let set = Set::new(std::fs::read("set.fst").unwrap()).unwrap();
564/// let mut stream = set.into_stream();
565/// let mut keys = vec![];
566/// while let Some(key) = stream.next() {
567///     keys.push(key.to_vec());
568/// }
569/// assert_eq!(keys, vec![
570///     "bruce".as_bytes(), "clarence".as_bytes(), "stevie".as_bytes(),
571/// ]);
572/// ```
573#[cfg(feature = "std")]
574pub struct SetBuilder<W>(raw::Builder<W>);
575
576#[cfg(feature = "std")]
577impl SetBuilder<Vec<u8>> {
578    /// Create a builder that builds a set in memory.
579    #[inline]
580    #[must_use]
581    pub fn memory() -> SetBuilder<Vec<u8>> {
582        SetBuilder(raw::Builder::memory())
583    }
584
585    /// Finishes the construction of the set and returns it.
586    #[inline]
587    #[must_use]
588    pub fn into_set(self) -> Set<Vec<u8>> {
589        Set(self.0.into_fst())
590    }
591}
592
593#[cfg(feature = "std")]
594impl<W: io::Write> SetBuilder<W> {
595    /// Create a builder that builds a set by writing it to `wtr` in a
596    /// streaming fashion.
597    pub fn new(wtr: W) -> Result<SetBuilder<W>> {
598        raw::Builder::new_type(wtr, 0).map(SetBuilder)
599    }
600
601    /// Insert a new key into the set.
602    ///
603    /// If a key is inserted that is less than any previous key added, then
604    /// an error is returned. Similarly, if there was a problem writing to
605    /// the underlying writer, an error is returned.
606    pub fn insert<K: AsRef<[u8]>>(&mut self, key: K) -> Result<()> {
607        self.0.add(key)
608    }
609
610    /// Calls insert on each item in the iterator.
611    ///
612    /// If an error occurred while adding an element, processing is stopped
613    /// and the error is returned.
614    pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()>
615    where
616        T: AsRef<[u8]>,
617        I: IntoIterator<Item = T>,
618    {
619        for key in iter {
620            self.0.add(key)?;
621        }
622        Ok(())
623    }
624
625    /// Calls insert on each item in the stream.
626    ///
627    /// Note that unlike `extend_iter`, this is not generic on the items in
628    /// the stream.
629    pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
630    where
631        I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
632        S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
633    {
634        self.0.extend_stream(StreamZeroOutput(stream.into_stream()))
635    }
636
637    /// Finishes the construction of the set and flushes the underlying
638    /// writer. After completion, the data written to `W` may be read using
639    /// one of `Set`'s constructor methods.
640    pub fn finish(self) -> Result<()> {
641        self.0.finish()
642    }
643
644    /// Just like `finish`, except it returns the underlying writer after
645    /// flushing it.
646    pub fn into_inner(self) -> Result<W> {
647        self.0.into_inner()
648    }
649
650    /// Gets a reference to the underlying writer.
651    pub fn get_ref(&self) -> &W {
652        self.0.get_ref()
653    }
654
655    /// Returns the number of bytes written to the underlying writer
656    pub fn bytes_written(&self) -> u64 {
657        self.0.bytes_written()
658    }
659}
660
661/// A lexicographically ordered stream of keys from a set.
662///
663/// The `A` type parameter corresponds to an optional automaton to filter
664/// the stream. By default, no filtering is done.
665///
666/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
667#[cfg(feature = "alloc")]
668pub struct Stream<'s, A = AlwaysMatch>(raw::Stream<'s, A>)
669where
670    A: Automaton;
671
672#[cfg(feature = "alloc")]
673impl<'s, A: Automaton> Stream<'s, A> {
674    /// Convert this stream into a vector of Unicode strings.
675    ///
676    /// If any key is not valid UTF-8, then iteration on the stream is stopped
677    /// and a UTF-8 decoding error is returned.
678    ///
679    /// Note that this creates a new allocation for every key in the stream.
680    pub fn into_strs(self) -> Result<Vec<String>> {
681        self.0.into_str_keys()
682    }
683
684    /// Convert this stream into a vector of byte strings.
685    ///
686    /// Note that this creates a new allocation for every key in the stream.
687    pub fn into_bytes(self) -> Vec<Vec<u8>> {
688        self.0.into_byte_keys()
689    }
690}
691
692#[cfg(feature = "alloc")]
693impl<'a, 's, A: Automaton> Streamer<'a> for Stream<'s, A> {
694    type Item = &'a [u8];
695
696    fn next(&'a mut self) -> Option<&'a [u8]> {
697        self.0.next().map(|(key, _)| key)
698    }
699}
700
701/// A lexicographically ordered stream of key-state pairs from a set and
702/// an automaton.
703///
704/// The keys are from the set while the states are from the automaton.
705///
706/// The `A` type parameter corresponds to an optional automaton to filter
707/// the stream. By default, no filtering is done.
708///
709/// The `'m` lifetime parameter refers to the lifetime of the underlying set.
710#[cfg(feature = "alloc")]
711pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>)
712where
713    A: Automaton;
714
715#[cfg(feature = "alloc")]
716impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A>
717where
718    A::State: Clone,
719{
720    type Item = (&'a [u8], A::State);
721
722    fn next(&'a mut self) -> Option<(&'a [u8], A::State)> {
723        self.0.next().map(|(key, _, state)| (key, state))
724    }
725}
726
727/// A builder for constructing range queries on streams.
728///
729/// Once all bounds are set, one should call `into_stream` to get a
730/// `Stream`.
731///
732/// Bounds are not additive. That is, if `ge` is called twice on the same
733/// builder, then the second setting wins.
734///
735/// The `A` type parameter corresponds to an optional automaton to filter
736/// the stream. By default, no filtering is done.
737///
738/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
739#[cfg(feature = "alloc")]
740pub struct StreamBuilder<'s, A = AlwaysMatch>(raw::StreamBuilder<'s, A>);
741
742#[cfg(feature = "alloc")]
743impl<'s, A: Automaton> StreamBuilder<'s, A> {
744    /// Specify a greater-than-or-equal-to bound.
745    pub fn ge<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
746        StreamBuilder(self.0.ge(bound))
747    }
748
749    /// Specify a greater-than bound.
750    pub fn gt<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
751        StreamBuilder(self.0.gt(bound))
752    }
753
754    /// Specify a less-than-or-equal-to bound.
755    pub fn le<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
756        StreamBuilder(self.0.le(bound))
757    }
758
759    /// Specify a less-than bound.
760    pub fn lt<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
761        StreamBuilder(self.0.lt(bound))
762    }
763}
764
765#[cfg(feature = "alloc")]
766impl<'s, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'s, A> {
767    type Item = &'a [u8];
768    type Into = Stream<'s, A>;
769
770    fn into_stream(self) -> Stream<'s, A> {
771        Stream(self.0.into_stream())
772    }
773}
774
775/// A builder for constructing range queries on streams that include automaton
776/// states.
777///
778/// In general, one should use `StreamBuilder` unless you have a specific need
779/// for accessing the states of the underlying automaton that is being used to
780/// filter this stream.
781///
782/// Once all bounds are set, one should call `into_stream` to get a
783/// `Stream`.
784///
785/// Bounds are not additive. That is, if `ge` is called twice on the same
786/// builder, then the second setting wins.
787///
788/// The `A` type parameter corresponds to an optional automaton to filter
789/// the stream. By default, no filtering is done.
790///
791/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
792#[cfg(feature = "alloc")]
793pub struct StreamWithStateBuilder<'s, A = AlwaysMatch>(
794    raw::StreamWithStateBuilder<'s, A>,
795);
796
797#[cfg(feature = "alloc")]
798impl<'s, A: Automaton> StreamWithStateBuilder<'s, A> {
799    /// Specify a greater-than-or-equal-to bound.
800    pub fn ge<T: AsRef<[u8]>>(
801        self,
802        bound: T,
803    ) -> StreamWithStateBuilder<'s, A> {
804        StreamWithStateBuilder(self.0.ge(bound))
805    }
806
807    /// Specify a greater-than bound.
808    pub fn gt<T: AsRef<[u8]>>(
809        self,
810        bound: T,
811    ) -> StreamWithStateBuilder<'s, A> {
812        StreamWithStateBuilder(self.0.gt(bound))
813    }
814
815    /// Specify a less-than-or-equal-to bound.
816    pub fn le<T: AsRef<[u8]>>(
817        self,
818        bound: T,
819    ) -> StreamWithStateBuilder<'s, A> {
820        StreamWithStateBuilder(self.0.le(bound))
821    }
822
823    /// Specify a less-than bound.
824    pub fn lt<T: AsRef<[u8]>>(
825        self,
826        bound: T,
827    ) -> StreamWithStateBuilder<'s, A> {
828        StreamWithStateBuilder(self.0.lt(bound))
829    }
830}
831
832#[cfg(feature = "alloc")]
833impl<'s, 'a, A: 'a + Automaton> IntoStreamer<'a>
834    for StreamWithStateBuilder<'s, A>
835where
836    A::State: Clone,
837{
838    type Item = (&'a [u8], A::State);
839    type Into = StreamWithState<'s, A>;
840
841    fn into_stream(self) -> StreamWithState<'s, A> {
842        StreamWithState(self.0.into_stream())
843    }
844}
845
846/// A builder for collecting set streams on which to perform set operations.
847///
848/// Set operations include intersection, union, difference and symmetric
849/// difference. The result of each set operation is itself a stream that emits
850/// keys in lexicographic order.
851///
852/// All set operations work efficiently on an arbitrary number of
853/// streams with memory proportional to the number of streams.
854///
855/// The algorithmic complexity of all set operations is `O(n1 + n2 + n3 + ...)`
856/// where `n1, n2, n3, ...` correspond to the number of elements in each
857/// stream.
858///
859/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
860#[cfg(feature = "alloc")]
861pub struct OpBuilder<'s>(raw::OpBuilder<'s>);
862
863#[cfg(feature = "alloc")]
864impl<'s> Default for OpBuilder<'s> {
865    fn default() -> Self {
866        Self::new()
867    }
868}
869
870#[cfg(feature = "alloc")]
871impl<'s> OpBuilder<'s> {
872    /// Create a new set operation builder.
873    #[inline]
874    #[must_use]
875    pub fn new() -> OpBuilder<'s> {
876        OpBuilder(raw::OpBuilder::new())
877    }
878
879    /// Add a stream to this set operation.
880    ///
881    /// This is useful for a chaining style pattern, e.g.,
882    /// `builder.add(stream1).add(stream2).union()`.
883    ///
884    /// The stream must emit a lexicographically ordered sequence of byte
885    /// strings.
886    pub fn add<I, S>(mut self, streamable: I) -> OpBuilder<'s>
887    where
888        I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
889        S: 's + for<'a> Streamer<'a, Item = &'a [u8]>,
890    {
891        self.push(streamable);
892        self
893    }
894
895    /// Add a stream to this set operation.
896    ///
897    /// The stream must emit a lexicographically ordered sequence of byte
898    /// strings.
899    pub fn push<I, S>(&mut self, streamable: I)
900    where
901        I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
902        S: 's + for<'a> Streamer<'a, Item = &'a [u8]>,
903    {
904        self.0.push(StreamZeroOutput(streamable.into_stream()));
905    }
906
907    /// Performs a union operation on all streams that have been added.
908    ///
909    /// # Example
910    ///
911    /// ```rust
912    /// use fst_no_std::{IntoStreamer, Streamer, Set};
913    ///
914    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
915    /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
916    ///
917    /// let mut union = set1.op().add(&set2).union();
918    ///
919    /// let mut keys = vec![];
920    /// while let Some(key) = union.next() {
921    ///     keys.push(key.to_vec());
922    /// }
923    /// assert_eq!(keys, vec![b"a", b"b", b"c", b"y", b"z"]);
924    /// ```
925    #[inline]
926    #[must_use]
927    pub fn union(self) -> Union<'s> {
928        Union(self.0.union())
929    }
930
931    /// Performs an intersection operation on all streams that have been added.
932    ///
933    /// # Example
934    ///
935    /// ```rust
936    /// use fst_no_std::{IntoStreamer, Streamer, Set};
937    ///
938    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
939    /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
940    ///
941    /// let mut intersection = set1.op().add(&set2).intersection();
942    ///
943    /// let mut keys = vec![];
944    /// while let Some(key) = intersection.next() {
945    ///     keys.push(key.to_vec());
946    /// }
947    /// assert_eq!(keys, vec![b"a"]);
948    /// ```
949    #[inline]
950    #[must_use]
951    pub fn intersection(self) -> Intersection<'s> {
952        Intersection(self.0.intersection())
953    }
954
955    /// Performs a difference operation with respect to the first stream added.
956    /// That is, this returns a stream of all elements in the first stream
957    /// that don't exist in any other stream that has been added.
958    ///
959    /// # Example
960    ///
961    /// ```rust
962    /// use fst_no_std::{IntoStreamer, Streamer, Set};
963    ///
964    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
965    /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
966    ///
967    /// let mut difference = set1.op().add(&set2).difference();
968    ///
969    /// let mut keys = vec![];
970    /// while let Some(key) = difference.next() {
971    ///     keys.push(key.to_vec());
972    /// }
973    /// assert_eq!(keys, vec![b"b", b"c"]);
974    /// ```
975    #[inline]
976    #[must_use]
977    pub fn difference(self) -> Difference<'s> {
978        Difference(self.0.difference())
979    }
980
981    /// Performs a symmetric difference operation on all of the streams that
982    /// have been added.
983    ///
984    /// When there are only two streams, then the keys returned correspond to
985    /// keys that are in either stream but *not* in both streams.
986    ///
987    /// More generally, for any number of streams, keys that occur in an odd
988    /// number of streams are returned.
989    ///
990    /// # Example
991    ///
992    /// ```rust
993    /// use fst_no_std::{IntoStreamer, Streamer, Set};
994    ///
995    /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
996    /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
997    ///
998    /// let mut sym_difference = set1.op().add(&set2).symmetric_difference();
999    ///
1000    /// let mut keys = vec![];
1001    /// while let Some(key) = sym_difference.next() {
1002    ///     keys.push(key.to_vec());
1003    /// }
1004    /// assert_eq!(keys, vec![b"b", b"c", b"y", b"z"]);
1005    /// ```
1006    #[inline]
1007    #[must_use]
1008    pub fn symmetric_difference(self) -> SymmetricDifference<'s> {
1009        SymmetricDifference(self.0.symmetric_difference())
1010    }
1011}
1012
1013#[cfg(feature = "alloc")]
1014impl<'f, I, S> Extend<I> for OpBuilder<'f>
1015where
1016    I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
1017    S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
1018{
1019    fn extend<T>(&mut self, it: T)
1020    where
1021        T: IntoIterator<Item = I>,
1022    {
1023        for stream in it {
1024            self.push(stream);
1025        }
1026    }
1027}
1028
1029#[cfg(feature = "alloc")]
1030impl<'f, I, S> FromIterator<I> for OpBuilder<'f>
1031where
1032    I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
1033    S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
1034{
1035    fn from_iter<T>(it: T) -> OpBuilder<'f>
1036    where
1037        T: IntoIterator<Item = I>,
1038    {
1039        let mut op = OpBuilder::new();
1040        op.extend(it);
1041        op
1042    }
1043}
1044
1045/// A stream of set union over multiple streams in lexicographic order.
1046///
1047/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1048#[cfg(feature = "alloc")]
1049pub struct Union<'s>(raw::Union<'s>);
1050
1051#[cfg(feature = "alloc")]
1052impl<'a, 's> Streamer<'a> for Union<'s> {
1053    type Item = &'a [u8];
1054
1055    #[inline]
1056    fn next(&'a mut self) -> Option<&'a [u8]> {
1057        self.0.next().map(|(key, _)| key)
1058    }
1059}
1060
1061/// A stream of set intersection over multiple streams in lexicographic order.
1062///
1063/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1064#[cfg(feature = "alloc")]
1065pub struct Intersection<'s>(raw::Intersection<'s>);
1066
1067#[cfg(feature = "alloc")]
1068impl<'a, 's> Streamer<'a> for Intersection<'s> {
1069    type Item = &'a [u8];
1070
1071    #[inline]
1072    fn next(&'a mut self) -> Option<&'a [u8]> {
1073        self.0.next().map(|(key, _)| key)
1074    }
1075}
1076
1077/// A stream of set difference over multiple streams in lexicographic order.
1078///
1079/// The difference operation is taken with respect to the first stream and the
1080/// rest of the streams. i.e., All elements in the first stream that do not
1081/// appear in any other streams.
1082///
1083/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1084#[cfg(feature = "alloc")]
1085pub struct Difference<'s>(raw::Difference<'s>);
1086
1087#[cfg(feature = "alloc")]
1088impl<'a, 's> Streamer<'a> for Difference<'s> {
1089    type Item = &'a [u8];
1090
1091    #[inline]
1092    fn next(&'a mut self) -> Option<&'a [u8]> {
1093        self.0.next().map(|(key, _)| key)
1094    }
1095}
1096
1097/// A stream of set symmetric difference over multiple streams in lexicographic
1098/// order.
1099///
1100/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1101#[cfg(feature = "alloc")]
1102pub struct SymmetricDifference<'s>(raw::SymmetricDifference<'s>);
1103
1104#[cfg(feature = "alloc")]
1105impl<'a, 's> Streamer<'a> for SymmetricDifference<'s> {
1106    type Item = &'a [u8];
1107
1108    #[inline]
1109    fn next(&'a mut self) -> Option<&'a [u8]> {
1110        self.0.next().map(|(key, _)| key)
1111    }
1112}
1113
1114/// A specialized stream for mapping set streams (`&[u8]`) to streams used
1115/// by raw fsts (`(&[u8], Output)`).
1116///
1117/// If this were iterators, we could use `iter::Map`, but doing this on streams
1118/// requires HKT, so we need to write out the monomorphization ourselves.
1119struct StreamZeroOutput<S>(S);
1120
1121impl<'a, S: Streamer<'a>> Streamer<'a> for StreamZeroOutput<S> {
1122    type Item = (S::Item, raw::Output);
1123
1124    fn next(&'a mut self) -> Option<(S::Item, raw::Output)> {
1125        self.0.next().map(|key| (key, raw::Output::zero()))
1126    }
1127}
1128
1129#[cfg(test)]
1130mod tests {
1131    use super::OpBuilder;
1132    use crate::Streamer;
1133
1134    #[test]
1135    fn no_fsts() {
1136        struct Iter<'a> {
1137            i: usize,
1138            xs: Vec<&'a [u8]>,
1139        }
1140
1141        impl<'a> Iter<'a> {
1142            fn new(xs: Vec<&'a [u8]>) -> Iter<'a> {
1143                Iter { i: 0, xs }
1144            }
1145        }
1146
1147        impl<'a, 's> Streamer<'a> for Iter<'s> {
1148            type Item = &'a [u8];
1149            fn next(&'a mut self) -> Option<&'a [u8]> {
1150                if self.i >= self.xs.len() {
1151                    None
1152                } else {
1153                    let i = self.i;
1154                    self.i += 1;
1155                    Some(self.xs[i])
1156                }
1157            }
1158        }
1159
1160        let mut stream = OpBuilder::new()
1161            .add(Iter::new(vec![
1162                &b"bar"[..],
1163                &b"baz"[..],
1164                &b"foo"[..],
1165                &b"fubar"[..],
1166                &b"quux"[..],
1167            ]))
1168            .add(Iter::new(vec![&b"bar"[..], &b"foofoo"[..], &b"fubar"[..]]))
1169            .intersection();
1170
1171        let mut got = vec![];
1172        while let Some(x) = stream.next() {
1173            got.push(x.to_vec());
1174        }
1175        assert_eq!(got, vec![&b"bar"[..], &b"fubar"[..]]);
1176    }
1177}