fst_no_std/set.rs
1#[cfg(feature = "alloc")]
2use crate::automaton::{AlwaysMatch, Automaton};
3use crate::raw;
4#[cfg(feature = "alloc")]
5use crate::stream::IntoStreamer;
6use crate::stream::Streamer;
7use crate::Result;
8#[cfg(feature = "alloc")]
9use alloc::{string::String, vec::Vec};
10#[cfg(feature = "alloc")]
11use core::fmt;
12#[cfg(feature = "std")]
13use core::iter;
14#[cfg(feature = "alloc")]
15use core::iter::FromIterator;
16#[cfg(feature = "std")]
17use std::io;
18
19/// Set is a lexicographically ordered set of byte strings.
20///
21/// A `Set` is constructed with the `SetBuilder` type. Alternatively, a `Set`
22/// can be constructed in memory from a lexicographically ordered iterator
23/// of byte strings (`Set::from_iter`).
24///
25/// A key feature of `Set` is that it can be serialized to disk compactly. Its
26/// underlying representation is built such that the `Set` can be memory mapped
27/// and searched without necessarily loading the entire set into memory.
28///
29/// It supports most common operations associated with sets, such as
30/// membership, union, intersection, subset/superset, etc. It also supports
31/// range queries and automata based searches (e.g. a regular expression).
32///
33/// Sets are represented by a finite state transducer where output values are
34/// always zero. As such, sets have the following invariants:
35///
36/// 1. Once constructed, a `Set` can never be modified.
37/// 2. Sets must be constructed with lexicographically ordered byte sequences.
38#[derive(Clone)]
39pub struct Set<D>(raw::Fst<D>);
40
41#[cfg(feature = "alloc")]
42impl Set<Vec<u8>> {
43 /// Create a `Set` from an iterator of lexicographically ordered byte
44 /// strings.
45 ///
46 /// If the iterator does not yield values in lexicographic order, then an
47 /// error is returned.
48 ///
49 /// Note that this is a convenience function to build a set in memory.
50 /// To build a set that streams to an arbitrary `io::Write`, use
51 /// `SetBuilder`.
52 #[cfg(feature = "std")]
53 pub fn from_iter<T, I>(iter: I) -> Result<Set<Vec<u8>>>
54 where
55 T: AsRef<[u8]>,
56 I: IntoIterator<Item = T>,
57 {
58 let mut builder = SetBuilder::memory();
59 builder.extend_iter(iter)?;
60 Set::new(builder.into_inner()?)
61 }
62}
63
64impl<D: AsRef<[u8]>> Set<D> {
65 /// Creates a set from its representation as a raw byte sequence.
66 ///
67 /// This accepts anything that can be cheaply converted to a `&[u8]`. The
68 /// caller is responsible for guaranteeing that the given bytes refer to
69 /// a valid FST. While memory safety will not be violated by invalid input,
70 /// a panic could occur while reading the FST at any point.
71 ///
72 /// # Example
73 ///
74 /// ```no_run
75 /// use fst_no_std::Set;
76 ///
77 /// // File written from a build script using SetBuilder.
78 /// # const IGNORE: &str = stringify! {
79 /// static FST: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/set.fst"));
80 /// # };
81 /// # static FST: &[u8] = &[];
82 ///
83 /// let set = Set::new(FST).unwrap();
84 /// ```
85 pub fn new(data: D) -> Result<Set<D>> {
86 raw::Fst::new(data).map(Set)
87 }
88
89 /// Tests the membership of a single key.
90 ///
91 /// # Example
92 ///
93 /// ```rust
94 /// use fst_no_std::Set;
95 ///
96 /// let set = Set::from_iter(&["a", "b", "c"]).unwrap();
97 ///
98 /// assert_eq!(set.contains("b"), true);
99 /// assert_eq!(set.contains("z"), false);
100 /// ```
101 pub fn contains<K: AsRef<[u8]>>(&self, key: K) -> bool {
102 self.0.contains_key(key)
103 }
104
105 /// Return a lexicographically ordered stream of all keys in this set.
106 ///
107 /// While this is a stream, it does require heap space proportional to the
108 /// longest key in the set.
109 ///
110 /// If the set is memory mapped, then no further heap space is needed.
111 /// Note though that your operating system may fill your page cache
112 /// (which will cause the resident memory usage of the process to go up
113 /// correspondingly).
114 ///
115 /// # Example
116 ///
117 /// Since streams are not iterators, the traditional `for` loop cannot be
118 /// used. `while let` is useful instead:
119 ///
120 /// ```rust
121 /// use fst_no_std::{IntoStreamer, Streamer, Set};
122 ///
123 /// let set = Set::from_iter(&["a", "b", "c"]).unwrap();
124 /// let mut stream = set.stream();
125 ///
126 /// let mut keys = vec![];
127 /// while let Some(key) = stream.next() {
128 /// keys.push(key.to_vec());
129 /// }
130 /// assert_eq!(keys, vec![b"a", b"b", b"c"]);
131 /// ```
132 #[inline]
133 #[cfg(feature = "alloc")]
134 pub fn stream(&self) -> Stream<'_> {
135 Stream(self.0.stream())
136 }
137
138 /// Return a builder for range queries.
139 ///
140 /// A range query returns a subset of keys in this set in a range given in
141 /// lexicographic order.
142 ///
143 /// Memory requirements are the same as described on `Set::stream`.
144 /// Notably, only the keys in the range are read; keys outside the range
145 /// are not.
146 ///
147 /// # Example
148 ///
149 /// Returns only the keys in the range given.
150 ///
151 /// ```rust
152 /// use fst_no_std::{IntoStreamer, Streamer, Set};
153 ///
154 /// let set = Set::from_iter(&["a", "b", "c", "d", "e"]).unwrap();
155 /// let mut stream = set.range().ge("b").lt("e").into_stream();
156 ///
157 /// let mut keys = vec![];
158 /// while let Some(key) = stream.next() {
159 /// keys.push(key.to_vec());
160 /// }
161 /// assert_eq!(keys, vec![b"b", b"c", b"d"]);
162 /// ```
163 #[inline]
164 #[cfg(feature = "alloc")]
165 pub fn range(&self) -> StreamBuilder<'_> {
166 StreamBuilder(self.0.range())
167 }
168
169 /// Executes an automaton on the keys of this set.
170 ///
171 /// Note that this returns a `StreamBuilder`, which can be used to
172 /// add a range query to the search (see the `range` method).
173 ///
174 /// Memory requirements are the same as described on `Set::stream`.
175 ///
176 /// # Example
177 ///
178 /// An implementation of subsequence search for `Automaton` can be used
179 /// to search sets:
180 ///
181 /// ```rust
182 /// use fst_no_std::automaton::Subsequence;
183 /// use fst_no_std::{IntoStreamer, Streamer, Set};
184 ///
185 /// # fn main() { example().unwrap(); }
186 /// fn example() -> Result<(), Box<dyn std::error::Error>> {
187 /// let set = Set::from_iter(&[
188 /// "a foo bar", "foo", "foo1", "foo2", "foo3", "foobar",
189 /// ]).unwrap();
190 ///
191 /// let matcher = Subsequence::new("for");
192 /// let mut stream = set.search(&matcher).into_stream();
193 ///
194 /// let mut keys = vec![];
195 /// while let Some(key) = stream.next() {
196 /// keys.push(String::from_utf8(key.to_vec())?);
197 /// }
198 /// assert_eq!(keys, vec![
199 /// "a foo bar", "foobar",
200 /// ]);
201 ///
202 /// Ok(())
203 /// }
204 /// ```
205 #[cfg(feature = "alloc")]
206 pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<'_, A> {
207 StreamBuilder(self.0.search(aut))
208 }
209
210 /// Executes an automaton on the values of this set and yields matching
211 /// values along with the corresponding matching states in the given
212 /// automaton.
213 ///
214 /// Note that this returns a `StreamWithStateBuilder`, which can be used to
215 /// add a range query to the search (see the `range` method).
216 ///
217 /// Memory requirements are the same as described on `Map::stream`.
218 ///
219 #[cfg_attr(
220 feature = "levenshtein",
221 doc = r##"
222# Example
223
224An implementation of fuzzy search using Levenshtein automata can be used
225to search sets:
226
227```rust
228use fst_no_std::automaton::Levenshtein;
229use fst_no_std::{IntoStreamer, Streamer, Set};
230
231# fn main() { example().unwrap(); }
232fn example() -> Result<(), Box<dyn std::error::Error>> {
233 let set = Set::from_iter(vec![
234 "foo",
235 "foob",
236 "foobar",
237 "fozb",
238 ]).unwrap();
239
240 let query = Levenshtein::new("foo", 2)?;
241 let mut stream = set.search_with_state(&query).into_stream();
242
243 let mut vs = vec![];
244 while let Some((v, s)) = stream.next() {
245 vs.push((String::from_utf8(v.to_vec())?, s));
246 }
247 // Currently, there isn't much interesting that you can do with the states.
248 assert_eq!(vs, vec![
249 ("foo".to_string(), Some(183)),
250 ("foob".to_string(), Some(123)),
251 ("fozb".to_string(), Some(83)),
252 ]);
253
254 Ok(())
255}
256```
257"##
258 )]
259 #[cfg(feature = "alloc")]
260 pub fn search_with_state<A: Automaton>(
261 &self,
262 aut: A,
263 ) -> StreamWithStateBuilder<'_, A> {
264 StreamWithStateBuilder(self.0.search_with_state(aut))
265 }
266
267 /// Returns the number of elements in this set.
268 #[inline]
269 pub fn len(&self) -> usize {
270 self.0.len()
271 }
272
273 /// Returns true if and only if this set is empty.
274 #[inline]
275 pub fn is_empty(&self) -> bool {
276 self.0.is_empty()
277 }
278
279 /// Creates a new set operation with this set added to it.
280 ///
281 /// The `OpBuilder` type can be used to add additional set streams
282 /// and perform set operations like union, intersection, difference and
283 /// symmetric difference.
284 ///
285 /// # Example
286 ///
287 /// ```rust
288 /// use fst_no_std::{IntoStreamer, Streamer, Set};
289 ///
290 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
291 /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
292 ///
293 /// let mut union = set1.op().add(&set2).union();
294 ///
295 /// let mut keys = vec![];
296 /// while let Some(key) = union.next() {
297 /// keys.push(key.to_vec());
298 /// }
299 /// assert_eq!(keys, vec![b"a", b"b", b"c", b"y", b"z"]);
300 /// ```
301 #[inline]
302 #[cfg(feature = "alloc")]
303 pub fn op(&self) -> OpBuilder<'_> {
304 OpBuilder::new().add(self)
305 }
306
307 /// Returns true if and only if the `self` set is disjoint with the set
308 /// `stream`.
309 ///
310 /// `stream` must be a lexicographically ordered sequence of byte strings.
311 ///
312 /// # Example
313 ///
314 /// ```rust
315 /// use fst_no_std::{IntoStreamer, Streamer, Set};
316 ///
317 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
318 /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap();
319 ///
320 /// assert_eq!(set1.is_disjoint(&set2), true);
321 ///
322 /// let set3 = Set::from_iter(&["a", "c"]).unwrap();
323 ///
324 /// assert_eq!(set1.is_disjoint(&set3), false);
325 /// ```
326 #[cfg(feature = "alloc")]
327 pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool
328 where
329 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
330 S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
331 {
332 self.0.is_disjoint(StreamZeroOutput(stream.into_stream()))
333 }
334
335 /// Returns true if and only if the `self` set is a subset of `stream`.
336 ///
337 /// `stream` must be a lexicographically ordered sequence of byte strings.
338 ///
339 /// # Example
340 ///
341 /// ```rust
342 /// use fst_no_std::Set;
343 ///
344 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
345 /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap();
346 ///
347 /// assert_eq!(set1.is_subset(&set2), false);
348 ///
349 /// let set3 = Set::from_iter(&["a", "c"]).unwrap();
350 ///
351 /// assert_eq!(set1.is_subset(&set3), false);
352 /// assert_eq!(set3.is_subset(&set1), true);
353 /// ```
354 #[cfg(feature = "alloc")]
355 pub fn is_subset<'f, I, S>(&self, stream: I) -> bool
356 where
357 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
358 S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
359 {
360 self.0.is_subset(StreamZeroOutput(stream.into_stream()))
361 }
362
363 /// Returns true if and only if the `self` set is a superset of `stream`.
364 ///
365 /// `stream` must be a lexicographically ordered sequence of byte strings.
366 ///
367 /// # Example
368 ///
369 /// ```rust
370 /// use fst_no_std::Set;
371 ///
372 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
373 /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap();
374 ///
375 /// assert_eq!(set1.is_superset(&set2), false);
376 ///
377 /// let set3 = Set::from_iter(&["a", "c"]).unwrap();
378 ///
379 /// assert_eq!(set1.is_superset(&set3), true);
380 /// assert_eq!(set3.is_superset(&set1), false);
381 /// ```
382 #[cfg(feature = "alloc")]
383 pub fn is_superset<'f, I, S>(&self, stream: I) -> bool
384 where
385 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
386 S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
387 {
388 self.0.is_superset(StreamZeroOutput(stream.into_stream()))
389 }
390
391 /// Returns a reference to the underlying raw finite state transducer.
392 #[inline]
393 pub fn as_fst(&self) -> &raw::Fst<D> {
394 &self.0
395 }
396
397 /// Returns the underlying raw finite state transducer.
398 #[inline]
399 pub fn into_fst(self) -> raw::Fst<D> {
400 self.0
401 }
402
403 /// Maps the underlying data of the fst Set to another data type.
404 ///
405 /// # Example
406 ///
407 /// This example shows that you can map an fst Set based on a `Vec<u8>`
408 /// into an fst Set based on a `Cow<[u8]>`, it can also work with a
409 /// reference counted type (e.g. `Arc`, `Rc`).
410 ///
411 /// ```
412 /// use std::borrow::Cow;
413 ///
414 /// use fst_no_std::Set;
415 ///
416 /// let set: Set<Vec<u8>> = Set::from_iter(
417 /// &["hello", "world"],
418 /// ).unwrap();
419 ///
420 /// let set_on_cow: Set<Cow<[u8]>> = set.map_data(Cow::Owned).unwrap();
421 /// ```
422 #[inline]
423 pub fn map_data<F, T>(self, f: F) -> Result<Set<T>>
424 where
425 F: FnMut(D) -> T,
426 T: AsRef<[u8]>,
427 {
428 self.into_fst().map_data(f).map(Set::from)
429 }
430}
431
432#[cfg(feature = "std")]
433impl Default for Set<Vec<u8>> {
434 #[inline]
435 fn default() -> Set<Vec<u8>> {
436 Set::from_iter(iter::empty::<&[u8]>()).unwrap()
437 }
438}
439
440#[cfg(feature = "alloc")]
441impl<D: AsRef<[u8]>> fmt::Debug for Set<D> {
442 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
443 write!(f, "Set([")?;
444 let mut stream = self.stream();
445 let mut first = true;
446 while let Some(key) = stream.next() {
447 if !first {
448 write!(f, ", ")?;
449 }
450 first = false;
451 write!(f, "{}", String::from_utf8_lossy(key))?;
452 }
453 write!(f, "])")
454 }
455}
456
457/// Returns the underlying finite state transducer.
458impl<D: AsRef<[u8]>> AsRef<raw::Fst<D>> for Set<D> {
459 #[inline]
460 fn as_ref(&self) -> &raw::Fst<D> {
461 &self.0
462 }
463}
464
465#[cfg(feature = "alloc")]
466impl<'s, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'s Set<D> {
467 type Item = &'a [u8];
468 type Into = Stream<'s>;
469
470 #[inline]
471 fn into_stream(self) -> Stream<'s> {
472 Stream(self.0.stream())
473 }
474}
475
476// Construct a set from an Fst object.
477impl<D: AsRef<[u8]>> From<raw::Fst<D>> for Set<D> {
478 #[inline]
479 fn from(fst: raw::Fst<D>) -> Set<D> {
480 Set(fst)
481 }
482}
483
484/// A builder for creating a set.
485///
486/// This is not your average everyday builder. It has two important qualities
487/// that make it a bit unique from what you might expect:
488///
489/// 1. All keys must be added in lexicographic order. Adding a key out of order
490/// will result in an error.
491/// 2. The representation of a set is streamed to *any* `io::Write` as it is
492/// built. For an in memory representation, this can be a `Vec<u8>`.
493///
494/// Point (2) is especially important because it means that a set can be
495/// constructed *without storing the entire set in memory*. Namely, since it
496/// works with any `io::Write`, it can be streamed directly to a file.
497///
498/// With that said, the builder does use memory, but **memory usage is bounded
499/// to a constant size**. The amount of memory used trades off with the
500/// compression ratio. Currently, the implementation hard codes this trade off
501/// which can result in about 5-20MB of heap usage during construction. (N.B.
502/// Guaranteeing a maximal compression ratio requires memory proportional to
503/// the size of the set, which defeats the benefit of streaming it to disk.
504/// In practice, a small bounded amount of memory achieves close-to-minimal
505/// compression ratios.)
506///
507/// The algorithmic complexity of set construction is `O(n)` where `n` is the
508/// number of elements added to the set.
509///
510/// # Example: build in memory
511///
512/// This shows how to use the builder to construct a set in memory. Note that
513/// `Set::from_iter` provides a convenience function that achieves this same
514/// goal without needing to explicitly use `SetBuilder`.
515///
516/// ```rust
517/// use fst_no_std::{IntoStreamer, Streamer, Set, SetBuilder};
518///
519/// let mut build = SetBuilder::memory();
520/// build.insert("bruce").unwrap();
521/// build.insert("clarence").unwrap();
522/// build.insert("stevie").unwrap();
523///
524/// // You could also call `finish()` here, but since we're building the set in
525/// // memory, there would be no way to get the `Vec<u8>` back.
526/// let bytes = build.into_inner().unwrap();
527///
528/// // At this point, the set has been constructed, but here's how to read it.
529/// let set = Set::new(bytes).unwrap();
530/// let mut stream = set.into_stream();
531/// let mut keys = vec![];
532/// while let Some(key) = stream.next() {
533/// keys.push(key.to_vec());
534/// }
535/// assert_eq!(keys, vec![
536/// "bruce".as_bytes(), "clarence".as_bytes(), "stevie".as_bytes(),
537/// ]);
538/// ```
539///
540/// # Example: stream to file
541///
542/// This shows how to stream construction of a set to a file.
543///
544/// ```rust,no_run
545/// use std::fs::File;
546/// use std::io;
547///
548/// use fst_no_std::{IntoStreamer, Streamer, Set, SetBuilder};
549///
550/// let mut wtr = io::BufWriter::new(File::create("set.fst").unwrap());
551/// let mut build = SetBuilder::new(wtr).unwrap();
552/// build.insert("bruce").unwrap();
553/// build.insert("clarence").unwrap();
554/// build.insert("stevie").unwrap();
555///
556/// // If you want the writer back, then call `into_inner`. Otherwise, this
557/// // will finish construction and call `flush`.
558/// build.finish().unwrap();
559///
560/// // At this point, the set has been constructed, but here's how to read it.
561/// // NOTE: Normally, one would memory map a file instead of reading its
562/// // entire contents on to the heap.
563/// let set = Set::new(std::fs::read("set.fst").unwrap()).unwrap();
564/// let mut stream = set.into_stream();
565/// let mut keys = vec![];
566/// while let Some(key) = stream.next() {
567/// keys.push(key.to_vec());
568/// }
569/// assert_eq!(keys, vec![
570/// "bruce".as_bytes(), "clarence".as_bytes(), "stevie".as_bytes(),
571/// ]);
572/// ```
573#[cfg(feature = "std")]
574pub struct SetBuilder<W>(raw::Builder<W>);
575
576#[cfg(feature = "std")]
577impl SetBuilder<Vec<u8>> {
578 /// Create a builder that builds a set in memory.
579 #[inline]
580 #[must_use]
581 pub fn memory() -> SetBuilder<Vec<u8>> {
582 SetBuilder(raw::Builder::memory())
583 }
584
585 /// Finishes the construction of the set and returns it.
586 #[inline]
587 #[must_use]
588 pub fn into_set(self) -> Set<Vec<u8>> {
589 Set(self.0.into_fst())
590 }
591}
592
593#[cfg(feature = "std")]
594impl<W: io::Write> SetBuilder<W> {
595 /// Create a builder that builds a set by writing it to `wtr` in a
596 /// streaming fashion.
597 pub fn new(wtr: W) -> Result<SetBuilder<W>> {
598 raw::Builder::new_type(wtr, 0).map(SetBuilder)
599 }
600
601 /// Insert a new key into the set.
602 ///
603 /// If a key is inserted that is less than any previous key added, then
604 /// an error is returned. Similarly, if there was a problem writing to
605 /// the underlying writer, an error is returned.
606 pub fn insert<K: AsRef<[u8]>>(&mut self, key: K) -> Result<()> {
607 self.0.add(key)
608 }
609
610 /// Calls insert on each item in the iterator.
611 ///
612 /// If an error occurred while adding an element, processing is stopped
613 /// and the error is returned.
614 pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()>
615 where
616 T: AsRef<[u8]>,
617 I: IntoIterator<Item = T>,
618 {
619 for key in iter {
620 self.0.add(key)?;
621 }
622 Ok(())
623 }
624
625 /// Calls insert on each item in the stream.
626 ///
627 /// Note that unlike `extend_iter`, this is not generic on the items in
628 /// the stream.
629 pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
630 where
631 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
632 S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
633 {
634 self.0.extend_stream(StreamZeroOutput(stream.into_stream()))
635 }
636
637 /// Finishes the construction of the set and flushes the underlying
638 /// writer. After completion, the data written to `W` may be read using
639 /// one of `Set`'s constructor methods.
640 pub fn finish(self) -> Result<()> {
641 self.0.finish()
642 }
643
644 /// Just like `finish`, except it returns the underlying writer after
645 /// flushing it.
646 pub fn into_inner(self) -> Result<W> {
647 self.0.into_inner()
648 }
649
650 /// Gets a reference to the underlying writer.
651 pub fn get_ref(&self) -> &W {
652 self.0.get_ref()
653 }
654
655 /// Returns the number of bytes written to the underlying writer
656 pub fn bytes_written(&self) -> u64 {
657 self.0.bytes_written()
658 }
659}
660
661/// A lexicographically ordered stream of keys from a set.
662///
663/// The `A` type parameter corresponds to an optional automaton to filter
664/// the stream. By default, no filtering is done.
665///
666/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
667#[cfg(feature = "alloc")]
668pub struct Stream<'s, A = AlwaysMatch>(raw::Stream<'s, A>)
669where
670 A: Automaton;
671
672#[cfg(feature = "alloc")]
673impl<'s, A: Automaton> Stream<'s, A> {
674 /// Convert this stream into a vector of Unicode strings.
675 ///
676 /// If any key is not valid UTF-8, then iteration on the stream is stopped
677 /// and a UTF-8 decoding error is returned.
678 ///
679 /// Note that this creates a new allocation for every key in the stream.
680 pub fn into_strs(self) -> Result<Vec<String>> {
681 self.0.into_str_keys()
682 }
683
684 /// Convert this stream into a vector of byte strings.
685 ///
686 /// Note that this creates a new allocation for every key in the stream.
687 pub fn into_bytes(self) -> Vec<Vec<u8>> {
688 self.0.into_byte_keys()
689 }
690}
691
692#[cfg(feature = "alloc")]
693impl<'a, 's, A: Automaton> Streamer<'a> for Stream<'s, A> {
694 type Item = &'a [u8];
695
696 fn next(&'a mut self) -> Option<&'a [u8]> {
697 self.0.next().map(|(key, _)| key)
698 }
699}
700
701/// A lexicographically ordered stream of key-state pairs from a set and
702/// an automaton.
703///
704/// The keys are from the set while the states are from the automaton.
705///
706/// The `A` type parameter corresponds to an optional automaton to filter
707/// the stream. By default, no filtering is done.
708///
709/// The `'m` lifetime parameter refers to the lifetime of the underlying set.
710#[cfg(feature = "alloc")]
711pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>)
712where
713 A: Automaton;
714
715#[cfg(feature = "alloc")]
716impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A>
717where
718 A::State: Clone,
719{
720 type Item = (&'a [u8], A::State);
721
722 fn next(&'a mut self) -> Option<(&'a [u8], A::State)> {
723 self.0.next().map(|(key, _, state)| (key, state))
724 }
725}
726
727/// A builder for constructing range queries on streams.
728///
729/// Once all bounds are set, one should call `into_stream` to get a
730/// `Stream`.
731///
732/// Bounds are not additive. That is, if `ge` is called twice on the same
733/// builder, then the second setting wins.
734///
735/// The `A` type parameter corresponds to an optional automaton to filter
736/// the stream. By default, no filtering is done.
737///
738/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
739#[cfg(feature = "alloc")]
740pub struct StreamBuilder<'s, A = AlwaysMatch>(raw::StreamBuilder<'s, A>);
741
742#[cfg(feature = "alloc")]
743impl<'s, A: Automaton> StreamBuilder<'s, A> {
744 /// Specify a greater-than-or-equal-to bound.
745 pub fn ge<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
746 StreamBuilder(self.0.ge(bound))
747 }
748
749 /// Specify a greater-than bound.
750 pub fn gt<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
751 StreamBuilder(self.0.gt(bound))
752 }
753
754 /// Specify a less-than-or-equal-to bound.
755 pub fn le<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
756 StreamBuilder(self.0.le(bound))
757 }
758
759 /// Specify a less-than bound.
760 pub fn lt<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'s, A> {
761 StreamBuilder(self.0.lt(bound))
762 }
763}
764
765#[cfg(feature = "alloc")]
766impl<'s, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'s, A> {
767 type Item = &'a [u8];
768 type Into = Stream<'s, A>;
769
770 fn into_stream(self) -> Stream<'s, A> {
771 Stream(self.0.into_stream())
772 }
773}
774
775/// A builder for constructing range queries on streams that include automaton
776/// states.
777///
778/// In general, one should use `StreamBuilder` unless you have a specific need
779/// for accessing the states of the underlying automaton that is being used to
780/// filter this stream.
781///
782/// Once all bounds are set, one should call `into_stream` to get a
783/// `Stream`.
784///
785/// Bounds are not additive. That is, if `ge` is called twice on the same
786/// builder, then the second setting wins.
787///
788/// The `A` type parameter corresponds to an optional automaton to filter
789/// the stream. By default, no filtering is done.
790///
791/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
792#[cfg(feature = "alloc")]
793pub struct StreamWithStateBuilder<'s, A = AlwaysMatch>(
794 raw::StreamWithStateBuilder<'s, A>,
795);
796
797#[cfg(feature = "alloc")]
798impl<'s, A: Automaton> StreamWithStateBuilder<'s, A> {
799 /// Specify a greater-than-or-equal-to bound.
800 pub fn ge<T: AsRef<[u8]>>(
801 self,
802 bound: T,
803 ) -> StreamWithStateBuilder<'s, A> {
804 StreamWithStateBuilder(self.0.ge(bound))
805 }
806
807 /// Specify a greater-than bound.
808 pub fn gt<T: AsRef<[u8]>>(
809 self,
810 bound: T,
811 ) -> StreamWithStateBuilder<'s, A> {
812 StreamWithStateBuilder(self.0.gt(bound))
813 }
814
815 /// Specify a less-than-or-equal-to bound.
816 pub fn le<T: AsRef<[u8]>>(
817 self,
818 bound: T,
819 ) -> StreamWithStateBuilder<'s, A> {
820 StreamWithStateBuilder(self.0.le(bound))
821 }
822
823 /// Specify a less-than bound.
824 pub fn lt<T: AsRef<[u8]>>(
825 self,
826 bound: T,
827 ) -> StreamWithStateBuilder<'s, A> {
828 StreamWithStateBuilder(self.0.lt(bound))
829 }
830}
831
832#[cfg(feature = "alloc")]
833impl<'s, 'a, A: 'a + Automaton> IntoStreamer<'a>
834 for StreamWithStateBuilder<'s, A>
835where
836 A::State: Clone,
837{
838 type Item = (&'a [u8], A::State);
839 type Into = StreamWithState<'s, A>;
840
841 fn into_stream(self) -> StreamWithState<'s, A> {
842 StreamWithState(self.0.into_stream())
843 }
844}
845
846/// A builder for collecting set streams on which to perform set operations.
847///
848/// Set operations include intersection, union, difference and symmetric
849/// difference. The result of each set operation is itself a stream that emits
850/// keys in lexicographic order.
851///
852/// All set operations work efficiently on an arbitrary number of
853/// streams with memory proportional to the number of streams.
854///
855/// The algorithmic complexity of all set operations is `O(n1 + n2 + n3 + ...)`
856/// where `n1, n2, n3, ...` correspond to the number of elements in each
857/// stream.
858///
859/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
860#[cfg(feature = "alloc")]
861pub struct OpBuilder<'s>(raw::OpBuilder<'s>);
862
863#[cfg(feature = "alloc")]
864impl<'s> Default for OpBuilder<'s> {
865 fn default() -> Self {
866 Self::new()
867 }
868}
869
870#[cfg(feature = "alloc")]
871impl<'s> OpBuilder<'s> {
872 /// Create a new set operation builder.
873 #[inline]
874 #[must_use]
875 pub fn new() -> OpBuilder<'s> {
876 OpBuilder(raw::OpBuilder::new())
877 }
878
879 /// Add a stream to this set operation.
880 ///
881 /// This is useful for a chaining style pattern, e.g.,
882 /// `builder.add(stream1).add(stream2).union()`.
883 ///
884 /// The stream must emit a lexicographically ordered sequence of byte
885 /// strings.
886 pub fn add<I, S>(mut self, streamable: I) -> OpBuilder<'s>
887 where
888 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
889 S: 's + for<'a> Streamer<'a, Item = &'a [u8]>,
890 {
891 self.push(streamable);
892 self
893 }
894
895 /// Add a stream to this set operation.
896 ///
897 /// The stream must emit a lexicographically ordered sequence of byte
898 /// strings.
899 pub fn push<I, S>(&mut self, streamable: I)
900 where
901 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
902 S: 's + for<'a> Streamer<'a, Item = &'a [u8]>,
903 {
904 self.0.push(StreamZeroOutput(streamable.into_stream()));
905 }
906
907 /// Performs a union operation on all streams that have been added.
908 ///
909 /// # Example
910 ///
911 /// ```rust
912 /// use fst_no_std::{IntoStreamer, Streamer, Set};
913 ///
914 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
915 /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
916 ///
917 /// let mut union = set1.op().add(&set2).union();
918 ///
919 /// let mut keys = vec![];
920 /// while let Some(key) = union.next() {
921 /// keys.push(key.to_vec());
922 /// }
923 /// assert_eq!(keys, vec![b"a", b"b", b"c", b"y", b"z"]);
924 /// ```
925 #[inline]
926 #[must_use]
927 pub fn union(self) -> Union<'s> {
928 Union(self.0.union())
929 }
930
931 /// Performs an intersection operation on all streams that have been added.
932 ///
933 /// # Example
934 ///
935 /// ```rust
936 /// use fst_no_std::{IntoStreamer, Streamer, Set};
937 ///
938 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
939 /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
940 ///
941 /// let mut intersection = set1.op().add(&set2).intersection();
942 ///
943 /// let mut keys = vec![];
944 /// while let Some(key) = intersection.next() {
945 /// keys.push(key.to_vec());
946 /// }
947 /// assert_eq!(keys, vec![b"a"]);
948 /// ```
949 #[inline]
950 #[must_use]
951 pub fn intersection(self) -> Intersection<'s> {
952 Intersection(self.0.intersection())
953 }
954
955 /// Performs a difference operation with respect to the first stream added.
956 /// That is, this returns a stream of all elements in the first stream
957 /// that don't exist in any other stream that has been added.
958 ///
959 /// # Example
960 ///
961 /// ```rust
962 /// use fst_no_std::{IntoStreamer, Streamer, Set};
963 ///
964 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
965 /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
966 ///
967 /// let mut difference = set1.op().add(&set2).difference();
968 ///
969 /// let mut keys = vec![];
970 /// while let Some(key) = difference.next() {
971 /// keys.push(key.to_vec());
972 /// }
973 /// assert_eq!(keys, vec![b"b", b"c"]);
974 /// ```
975 #[inline]
976 #[must_use]
977 pub fn difference(self) -> Difference<'s> {
978 Difference(self.0.difference())
979 }
980
981 /// Performs a symmetric difference operation on all of the streams that
982 /// have been added.
983 ///
984 /// When there are only two streams, then the keys returned correspond to
985 /// keys that are in either stream but *not* in both streams.
986 ///
987 /// More generally, for any number of streams, keys that occur in an odd
988 /// number of streams are returned.
989 ///
990 /// # Example
991 ///
992 /// ```rust
993 /// use fst_no_std::{IntoStreamer, Streamer, Set};
994 ///
995 /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap();
996 /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap();
997 ///
998 /// let mut sym_difference = set1.op().add(&set2).symmetric_difference();
999 ///
1000 /// let mut keys = vec![];
1001 /// while let Some(key) = sym_difference.next() {
1002 /// keys.push(key.to_vec());
1003 /// }
1004 /// assert_eq!(keys, vec![b"b", b"c", b"y", b"z"]);
1005 /// ```
1006 #[inline]
1007 #[must_use]
1008 pub fn symmetric_difference(self) -> SymmetricDifference<'s> {
1009 SymmetricDifference(self.0.symmetric_difference())
1010 }
1011}
1012
1013#[cfg(feature = "alloc")]
1014impl<'f, I, S> Extend<I> for OpBuilder<'f>
1015where
1016 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
1017 S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
1018{
1019 fn extend<T>(&mut self, it: T)
1020 where
1021 T: IntoIterator<Item = I>,
1022 {
1023 for stream in it {
1024 self.push(stream);
1025 }
1026 }
1027}
1028
1029#[cfg(feature = "alloc")]
1030impl<'f, I, S> FromIterator<I> for OpBuilder<'f>
1031where
1032 I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
1033 S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
1034{
1035 fn from_iter<T>(it: T) -> OpBuilder<'f>
1036 where
1037 T: IntoIterator<Item = I>,
1038 {
1039 let mut op = OpBuilder::new();
1040 op.extend(it);
1041 op
1042 }
1043}
1044
1045/// A stream of set union over multiple streams in lexicographic order.
1046///
1047/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1048#[cfg(feature = "alloc")]
1049pub struct Union<'s>(raw::Union<'s>);
1050
1051#[cfg(feature = "alloc")]
1052impl<'a, 's> Streamer<'a> for Union<'s> {
1053 type Item = &'a [u8];
1054
1055 #[inline]
1056 fn next(&'a mut self) -> Option<&'a [u8]> {
1057 self.0.next().map(|(key, _)| key)
1058 }
1059}
1060
1061/// A stream of set intersection over multiple streams in lexicographic order.
1062///
1063/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1064#[cfg(feature = "alloc")]
1065pub struct Intersection<'s>(raw::Intersection<'s>);
1066
1067#[cfg(feature = "alloc")]
1068impl<'a, 's> Streamer<'a> for Intersection<'s> {
1069 type Item = &'a [u8];
1070
1071 #[inline]
1072 fn next(&'a mut self) -> Option<&'a [u8]> {
1073 self.0.next().map(|(key, _)| key)
1074 }
1075}
1076
1077/// A stream of set difference over multiple streams in lexicographic order.
1078///
1079/// The difference operation is taken with respect to the first stream and the
1080/// rest of the streams. i.e., All elements in the first stream that do not
1081/// appear in any other streams.
1082///
1083/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1084#[cfg(feature = "alloc")]
1085pub struct Difference<'s>(raw::Difference<'s>);
1086
1087#[cfg(feature = "alloc")]
1088impl<'a, 's> Streamer<'a> for Difference<'s> {
1089 type Item = &'a [u8];
1090
1091 #[inline]
1092 fn next(&'a mut self) -> Option<&'a [u8]> {
1093 self.0.next().map(|(key, _)| key)
1094 }
1095}
1096
1097/// A stream of set symmetric difference over multiple streams in lexicographic
1098/// order.
1099///
1100/// The `'s` lifetime parameter refers to the lifetime of the underlying set.
1101#[cfg(feature = "alloc")]
1102pub struct SymmetricDifference<'s>(raw::SymmetricDifference<'s>);
1103
1104#[cfg(feature = "alloc")]
1105impl<'a, 's> Streamer<'a> for SymmetricDifference<'s> {
1106 type Item = &'a [u8];
1107
1108 #[inline]
1109 fn next(&'a mut self) -> Option<&'a [u8]> {
1110 self.0.next().map(|(key, _)| key)
1111 }
1112}
1113
1114/// A specialized stream for mapping set streams (`&[u8]`) to streams used
1115/// by raw fsts (`(&[u8], Output)`).
1116///
1117/// If this were iterators, we could use `iter::Map`, but doing this on streams
1118/// requires HKT, so we need to write out the monomorphization ourselves.
1119struct StreamZeroOutput<S>(S);
1120
1121impl<'a, S: Streamer<'a>> Streamer<'a> for StreamZeroOutput<S> {
1122 type Item = (S::Item, raw::Output);
1123
1124 fn next(&'a mut self) -> Option<(S::Item, raw::Output)> {
1125 self.0.next().map(|key| (key, raw::Output::zero()))
1126 }
1127}
1128
1129#[cfg(test)]
1130mod tests {
1131 use super::OpBuilder;
1132 use crate::Streamer;
1133
1134 #[test]
1135 fn no_fsts() {
1136 struct Iter<'a> {
1137 i: usize,
1138 xs: Vec<&'a [u8]>,
1139 }
1140
1141 impl<'a> Iter<'a> {
1142 fn new(xs: Vec<&'a [u8]>) -> Iter<'a> {
1143 Iter { i: 0, xs }
1144 }
1145 }
1146
1147 impl<'a, 's> Streamer<'a> for Iter<'s> {
1148 type Item = &'a [u8];
1149 fn next(&'a mut self) -> Option<&'a [u8]> {
1150 if self.i >= self.xs.len() {
1151 None
1152 } else {
1153 let i = self.i;
1154 self.i += 1;
1155 Some(self.xs[i])
1156 }
1157 }
1158 }
1159
1160 let mut stream = OpBuilder::new()
1161 .add(Iter::new(vec![
1162 &b"bar"[..],
1163 &b"baz"[..],
1164 &b"foo"[..],
1165 &b"fubar"[..],
1166 &b"quux"[..],
1167 ]))
1168 .add(Iter::new(vec![&b"bar"[..], &b"foofoo"[..], &b"fubar"[..]]))
1169 .intersection();
1170
1171 let mut got = vec![];
1172 while let Some(x) = stream.next() {
1173 got.push(x.to_vec());
1174 }
1175 assert_eq!(got, vec![&b"bar"[..], &b"fubar"[..]]);
1176 }
1177}