lindera_fst/
map.rs

1use std::fmt;
2use std::io;
3use std::iter::FromIterator;
4
5use crate::automaton::{AlwaysMatch, Automaton};
6use crate::raw;
7pub use crate::raw::IndexedValue;
8use crate::stream::{IntoStreamer, Streamer};
9use crate::Result;
10use std::ops::Deref;
11
12/// Map is a lexicographically ordered map from byte strings to integers.
13///
14/// A `Map` is constructed with the `MapBuilder` type. Alternatively, a `Map`
15/// can be constructed in memory from a lexicographically ordered iterator
16/// of key-value pairs (`Map::from_iter`).
17///
18/// A key feature of `Map` is that it can be serialized to disk compactly. Its
19/// underlying representation is built such that the `Map` can be memory mapped
20/// (`Map::from_path`) and searched without necessarily loading the entire
21/// map into memory.
22///
23/// It supports most common operations associated with maps, such as key
24/// lookup and search. It also supports set operations on its keys along with
25/// the ability to specify how conflicting values are merged together. Maps
26/// also support range queries and automata based searches (e.g. a regular
27/// expression).
28///
29/// Maps are represented by a finite state transducer where inputs are the keys
30/// and outputs are the values. As such, maps have the following invariants:
31///
32/// 1. Once constructed, a `Map` can never be modified.
33/// 2. Maps must be constructed with lexicographically ordered byte sequences.
34///    There is no restricting on the ordering of values.
35///
36/// # Differences with sets
37///
38/// Maps and sets are represented by the same underlying data structure: the
39/// finite state transducer. The principal difference between them is that
40/// sets always have their output values set to `0`. This has an impact on the
41/// representation size and is reflected in the type system for convenience.
42/// A secondary but subtle difference is that duplicate values can be added
43/// to a set, but it is an error to do so with maps. That is, a set can have
44/// the same key added sequentially, but a map can't.
45///
46/// # The future
47///
48/// It is regrettable that the output value is fixed to `u64`. Indeed, it is
49/// not necessary, but it was a major simplification in the implementation.
50/// In the future, the value type may become generic to an extent (outputs must
51/// satisfy a basic algebra).
52///
53/// Keys will always be byte strings; however, we may grow more conveniences
54/// around dealing with them (such as a serialization/deserialization step,
55/// although it isn't clear where exactly this should live).
56pub struct Map<Data>(raw::Fst<Data>);
57
58impl Map<Vec<u8>> {
59    /// Creates a map from its representation as a raw byte sequence.
60    ///
61    /// Note that this operation is very cheap (no allocations and no copies).
62    ///
63    /// The map must have been written with a compatible finite state
64    /// transducer builder (`MapBuilder` qualifies). If the format is invalid
65    /// or if there is a mismatch between the API version of this library
66    /// and the map, then an error is returned.
67    pub fn from_bytes(bytes: Vec<u8>) -> Result<Map<Vec<u8>>> {
68        raw::Fst::new(bytes).map(Map)
69    }
70
71    /// Create a `Map` from an iterator of lexicographically ordered byte
72    /// strings and associated values.
73    ///
74    /// If the iterator does not yield unique keys in lexicographic order, then
75    /// an error is returned.
76    ///
77    /// Note that this is a convenience function to build a map in memory.
78    /// To build a map that streams to an arbitrary `io::Write`, use
79    /// `MapBuilder`.
80    pub fn from_iter<K, I>(iter: I) -> Result<Self>
81    where
82        K: AsRef<[u8]>,
83        I: IntoIterator<Item = (K, u64)>,
84    {
85        let mut builder = MapBuilder::memory();
86        builder.extend_iter(iter)?;
87        Map::from_bytes(builder.into_inner()?)
88    }
89}
90
91impl<Data: Deref<Target = [u8]>> Map<Data> {
92    /// Tests the membership of a single key.
93    ///
94    /// # Example
95    ///
96    /// ```rust
97    /// use lindera_fst::Map;
98    ///
99    /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
100    ///
101    /// assert_eq!(map.contains_key("b"), true);
102    /// assert_eq!(map.contains_key("z"), false);
103    /// ```
104    pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool {
105        self.0.contains_key(key)
106    }
107
108    /// Retrieves the value associated with a key.
109    ///
110    /// If the key does not exist, then `None` is returned.
111    ///
112    /// # Example
113    ///
114    /// ```rust
115    /// use lindera_fst::Map;
116    ///
117    /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
118    ///
119    /// assert_eq!(map.get("b"), Some(2));
120    /// assert_eq!(map.get("z"), None);
121    /// ```
122    pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<u64> {
123        self.0.get(key).map(|output| output.value())
124    }
125
126    /// Return a lexicographically ordered stream of all key-value pairs in
127    /// this map.
128    ///
129    /// While this is a stream, it does require heap space proportional to the
130    /// longest key in the map.
131    ///
132    /// If the map is memory mapped, then no further heap space is needed.
133    /// Note though that your operating system may fill your page cache
134    /// (which will cause the resident memory usage of the process to go up
135    /// correspondingly).
136    ///
137    /// # Example
138    ///
139    /// Since streams are not iterators, the traditional `for` loop cannot be
140    /// used. `while let` is useful instead:
141    ///
142    /// ```rust
143    /// use lindera_fst::{IntoStreamer, Streamer, Map};
144    ///
145    /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
146    /// let mut stream = map.stream();
147    ///
148    /// let mut kvs = vec![];
149    /// while let Some((k, v)) = stream.next() {
150    ///     kvs.push((k.to_vec(), v));
151    /// }
152    /// assert_eq!(kvs, vec![
153    ///     (b"a".to_vec(), 1),
154    ///     (b"b".to_vec(), 2),
155    ///     (b"c".to_vec(), 3),
156    /// ]);
157    /// ```
158    #[inline]
159    pub fn stream(&self) -> Stream {
160        Stream(self.0.stream())
161    }
162
163    /// Return a lexicographically ordered stream of all keys in this map.
164    ///
165    /// Memory requirements are the same as described on `Map::stream`.
166    ///
167    /// # Example
168    ///
169    /// ```rust
170    /// use lindera_fst::{IntoStreamer, Streamer, Map};
171    ///
172    /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
173    /// let mut stream = map.keys();
174    ///
175    /// let mut keys = vec![];
176    /// while let Some(k) = stream.next() {
177    ///     keys.push(k.to_vec());
178    /// }
179    /// assert_eq!(keys, vec![b"a", b"b", b"c"]);
180    /// ```
181    #[inline]
182    pub fn keys(&self) -> Keys {
183        Keys(self.0.stream())
184    }
185
186    /// Return a stream of all values in this map ordered lexicographically
187    /// by each value's corresponding key.
188    ///
189    /// Memory requirements are the same as described on `Map::stream`.
190    ///
191    /// # Example
192    ///
193    /// ```rust
194    /// use lindera_fst::{IntoStreamer, Streamer, Map};
195    ///
196    /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
197    /// let mut stream = map.values();
198    ///
199    /// let mut values = vec![];
200    /// while let Some(v) = stream.next() {
201    ///     values.push(v);
202    /// }
203    /// assert_eq!(values, vec![1, 2, 3]);
204    /// ```
205    #[inline]
206    pub fn values(&self) -> Values {
207        Values(self.0.stream())
208    }
209
210    /// Return a builder for range queries.
211    ///
212    /// A range query returns a subset of key-value pairs in this map in a
213    /// range given in lexicographic order.
214    ///
215    /// Memory requirements are the same as described on `Map::stream`.
216    /// Notably, only the keys in the range are read; keys outside the range
217    /// are not.
218    ///
219    /// # Example
220    ///
221    /// Returns only the key-value pairs in the range given.
222    ///
223    /// ```rust
224    /// use lindera_fst::{IntoStreamer, Streamer, Map};
225    ///
226    /// let map = Map::from_iter(vec![
227    ///     ("a", 1), ("b", 2), ("c", 3), ("d", 4), ("e", 5),
228    /// ]).unwrap();
229    /// let mut stream = map.range().ge("b").lt("e").into_stream();
230    ///
231    /// let mut kvs = vec![];
232    /// while let Some((k, v)) = stream.next() {
233    ///     kvs.push((k.to_vec(), v));
234    /// }
235    /// assert_eq!(kvs, vec![
236    ///     (b"b".to_vec(), 2),
237    ///     (b"c".to_vec(), 3),
238    ///     (b"d".to_vec(), 4),
239    /// ]);
240    /// ```
241    #[inline]
242    pub fn range(&self) -> StreamBuilder {
243        StreamBuilder(self.0.range())
244    }
245
246    /// Executes an automaton on the keys of this map.
247    ///
248    /// Note that this returns a `StreamBuilder`, which can be used to
249    /// add a range query to the search (see the `range` method).
250    ///
251    /// Memory requirements are the same as described on `Map::stream`.
252    ///
253    /// # Example
254    ///
255    /// An implementation of regular expressions for `Automaton` is available
256    /// in the `fst-regex` crate, which can be used to search maps.
257    ///
258    /// ```rust
259    ///
260    /// use std::error::Error;
261    ///
262    /// use lindera_fst::{IntoStreamer, Streamer, Map};
263    /// use lindera_fst::Regex;
264    ///
265    /// # fn main() { example().unwrap(); }
266    /// fn example() -> Result<(), Box<Error>> {
267    ///     let map = Map::from_iter(vec![
268    ///         ("foo", 1), ("foo1", 2), ("foo2", 3), ("foo3", 4), ("foobar", 5),
269    ///     ]).unwrap();
270    ///
271    ///     let re = Regex::new("f[a-z]+3?").unwrap();
272    ///     let mut stream = map.search(&re).into_stream();
273    ///
274    ///     let mut kvs = vec![];
275    ///     while let Some((k, v)) = stream.next() {
276    ///         kvs.push((k.to_vec(), v));
277    ///     }
278    ///     assert_eq!(kvs, vec![
279    ///         (b"foo".to_vec(), 1),
280    ///         (b"foo3".to_vec(), 4),
281    ///         (b"foobar".to_vec(), 5),
282    ///     ]);
283    ///
284    ///     Ok(())
285    /// }
286    /// ```
287    pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<A> {
288        StreamBuilder(self.0.search(aut))
289    }
290
291    /// Returns the number of elements in this map.
292    #[inline]
293    pub fn len(&self) -> usize {
294        self.0.len()
295    }
296
297    /// Returns true if and only if this map is empty.
298    #[inline]
299    pub fn is_empty(&self) -> bool {
300        self.0.is_empty()
301    }
302
303    /// Creates a new map operation with this map added to it.
304    ///
305    /// The `OpBuilder` type can be used to add additional map streams
306    /// and perform set operations like union, intersection, difference and
307    /// symmetric difference on the keys of the map. These set operations also
308    /// allow one to specify how conflicting values are merged in the stream.
309    ///
310    /// # Example
311    ///
312    /// This example demonstrates a union on multiple map streams. Notice that
313    /// the stream returned from the union is not a sequence of key-value
314    /// pairs, but rather a sequence of keys associated with one or more
315    /// values. Namely, a key is associated with each value associated with
316    /// that same key in the all of the streams.
317    ///
318    /// ```rust
319    /// use lindera_fst::{Streamer, Map};
320    /// use lindera_fst::{map::IndexedValue};
321    ///
322    /// let map1 = Map::from_iter(vec![
323    ///     ("a", 1), ("b", 2), ("c", 3),
324    /// ]).unwrap();
325    /// let map2 = Map::from_iter(vec![
326    ///     ("a", 10), ("y", 11), ("z", 12),
327    /// ]).unwrap();
328    ///
329    /// let mut union = map1.op().add(&map2).union();
330    ///
331    /// let mut kvs = vec![];
332    /// while let Some((k, vs)) = union.next() {
333    ///     kvs.push((k.to_vec(), vs.to_vec()));
334    /// }
335    /// assert_eq!(kvs, vec![
336    ///     (b"a".to_vec(), vec![
337    ///         IndexedValue { index: 0, value: 1 },
338    ///         IndexedValue { index: 1, value: 10 },
339    ///     ]),
340    ///     (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
341    ///     (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
342    ///     (b"y".to_vec(), vec![IndexedValue { index: 1, value: 11 }]),
343    ///     (b"z".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
344    /// ]);
345    /// ```
346    #[inline]
347    pub fn op(&self) -> OpBuilder {
348        OpBuilder::new().add(self)
349    }
350
351    /// Returns a reference to the underlying raw finite state transducer.
352    #[inline]
353    pub fn as_fst(&self) -> &raw::Fst<Data> {
354        &self.0
355    }
356}
357
358impl<Data: Deref<Target = [u8]>> fmt::Debug for Map<Data> {
359    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
360        write!(f, "Map([")?;
361        let mut stream = self.stream();
362        let mut first = true;
363        while let Some((k, v)) = stream.next() {
364            if !first {
365                write!(f, ", ")?;
366            }
367            first = false;
368            write!(f, "({}, {})", String::from_utf8_lossy(k), v)?;
369        }
370        write!(f, "])")
371    }
372}
373
374// Construct a map from an Fst object.
375impl<Data> From<raw::Fst<Data>> for Map<Data> {
376    #[inline]
377    fn from(fst: raw::Fst<Data>) -> Self {
378        Map(fst)
379    }
380}
381
382/// Returns the underlying finite state transducer.
383impl<Data> AsRef<raw::Fst<Data>> for Map<Data> {
384    #[inline]
385    fn as_ref(&self) -> &raw::Fst<Data> {
386        &self.0
387    }
388}
389
390impl<'m, 'a, Data: Deref<Target = [u8]>> IntoStreamer<'a> for &'m Map<Data> {
391    type Item = (&'a [u8], u64);
392    type Into = Stream<'m>;
393
394    #[inline]
395    fn into_stream(self) -> Self::Into {
396        Stream(self.0.stream())
397    }
398}
399
400/// A builder for creating a map.
401///
402/// This is not your average everyday builder. It has two important qualities
403/// that make it a bit unique from what you might expect:
404///
405/// 1. All keys must be added in lexicographic order. Adding a key out of order
406///    will result in an error. Additionally, adding a duplicate key will also
407///    result in an error. That is, once a key is associated with a value,
408///    that association can never be modified or deleted.
409/// 2. The representation of a map is streamed to *any* `io::Write` as it is
410///    built. For an in memory representation, this can be a `Vec<u8>`.
411///
412/// Point (2) is especially important because it means that a map can be
413/// constructed *without storing the entire map in memory*. Namely, since it
414/// works with any `io::Write`, it can be streamed directly to a file.
415///
416/// With that said, the builder does use memory, but **memory usage is bounded
417/// to a constant size**. The amount of memory used trades off with the
418/// compression ratio. Currently, the implementation hard codes this trade off
419/// which can result in about 5-20MB of heap usage during construction. (N.B.
420/// Guaranteeing a maximal compression ratio requires memory proportional to
421/// the size of the map, which defeats some of the benefit of streaming
422/// it to disk. In practice, a small bounded amount of memory achieves
423/// close-to-minimal compression ratios.)
424///
425/// The algorithmic complexity of map construction is `O(n)` where `n` is the
426/// number of elements added to the map.
427///
428/// # Example: build in memory
429///
430/// This shows how to use the builder to construct a map in memory. Note that
431/// `Map::from_iter` provides a convenience function that achieves this same
432/// goal without needing to explicitly use `MapBuilder`.
433///
434/// ```rust
435/// use lindera_fst::{IntoStreamer, Streamer, Map, MapBuilder};
436///
437/// let mut build = MapBuilder::memory();
438/// build.insert("bruce", 1).unwrap();
439/// build.insert("clarence", 2).unwrap();
440/// build.insert("stevie", 3).unwrap();
441///
442/// // You could also call `finish()` here, but since we're building the map in
443/// // memory, there would be no way to get the `Vec<u8>` back.
444/// let bytes = build.into_inner().unwrap();
445///
446/// // At this point, the map has been constructed, but here's how to read it.
447/// let map = Map::from_bytes(bytes).unwrap();
448/// let mut stream = map.into_stream();
449/// let mut kvs = vec![];
450/// while let Some((k, v)) = stream.next() {
451///     kvs.push((k.to_vec(), v));
452/// }
453/// assert_eq!(kvs, vec![
454///     (b"bruce".to_vec(), 1),
455///     (b"clarence".to_vec(), 2),
456///     (b"stevie".to_vec(), 3),
457/// ]);
458/// ```
459pub struct MapBuilder<W>(raw::Builder<W>);
460
461impl MapBuilder<Vec<u8>> {
462    /// Create a builder that builds a map in memory.
463    #[inline]
464    pub fn memory() -> Self {
465        MapBuilder(raw::Builder::memory())
466    }
467}
468
469impl<W: io::Write> MapBuilder<W> {
470    /// Create a builder that builds a map by writing it to `wtr` in a
471    /// streaming fashion.
472    pub fn new(wtr: W) -> Result<MapBuilder<W>> {
473        raw::Builder::new_type(wtr, 0).map(MapBuilder)
474    }
475
476    /// Insert a new key-value pair into the map.
477    ///
478    /// Keys must be convertible to byte strings. Values must be a `u64`, which
479    /// is a restriction of the current implementation of finite state
480    /// transducers. (Values may one day be expanded to other types.)
481    ///
482    /// If a key is inserted that is less than or equal to any previous key
483    /// added, then an error is returned. Similarly, if there was a problem
484    /// writing to the underlying writer, an error is returned.
485    pub fn insert<K: AsRef<[u8]>>(&mut self, key: K, val: u64) -> Result<()> {
486        self.0.insert(key, val)
487    }
488
489    /// Calls insert on each item in the iterator.
490    ///
491    /// If an error occurred while adding an element, processing is stopped
492    /// and the error is returned.
493    ///
494    /// If a key is inserted that is less than or equal to any previous key
495    /// added, then an error is returned. Similarly, if there was a problem
496    /// writing to the underlying writer, an error is returned.
497    pub fn extend_iter<K, I>(&mut self, iter: I) -> Result<()>
498    where
499        K: AsRef<[u8]>,
500        I: IntoIterator<Item = (K, u64)>,
501    {
502        self.0
503            .extend_iter(iter.into_iter().map(|(k, v)| (k, raw::Output::new(v))))
504    }
505
506    /// Calls insert on each item in the stream.
507    ///
508    /// Note that unlike `extend_iter`, this is not generic on the items in
509    /// the stream.
510    ///
511    /// If a key is inserted that is less than or equal to any previous key
512    /// added, then an error is returned. Similarly, if there was a problem
513    /// writing to the underlying writer, an error is returned.
514    pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
515    where
516        I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
517        S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
518    {
519        self.0.extend_stream(StreamOutput(stream.into_stream()))
520    }
521
522    /// Finishes the construction of the map and flushes the underlying
523    /// writer. After completion, the data written to `W` may be read using
524    /// one of `Map`'s constructor methods.
525    pub fn finish(self) -> Result<()> {
526        self.0.finish()
527    }
528
529    /// Just like `finish`, except it returns the underlying writer after
530    /// flushing it.
531    pub fn into_inner(self) -> Result<W> {
532        self.0.into_inner()
533    }
534
535    /// Gets a reference to the underlying writer.
536    pub fn get_ref(&self) -> &W {
537        self.0.get_ref()
538    }
539
540    /// Returns the number of bytes written to the underlying writer
541    pub fn bytes_written(&self) -> u64 {
542        self.0.bytes_written()
543    }
544}
545
546/// A lexicographically ordered stream of key-value pairs from a map.
547///
548/// The `A` type parameter corresponds to an optional automaton to filter
549/// the stream. By default, no filtering is done.
550///
551/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
552pub struct Stream<'m, A = AlwaysMatch>(raw::Stream<'m, A>)
553where
554    A: Automaton;
555
556impl<'a, 'm, A: Automaton> Streamer<'a> for Stream<'m, A> {
557    type Item = (&'a [u8], u64);
558
559    fn next(&'a mut self) -> Option<Self::Item> {
560        self.0.next().map(|(key, out)| (key, out.value()))
561    }
562}
563
564impl<'m, A: Automaton> Stream<'m, A> {
565    /// Convert this stream into a vector of byte strings and outputs.
566    ///
567    /// Note that this creates a new allocation for every key in the stream.
568    pub fn into_byte_vec(self) -> Vec<(Vec<u8>, u64)> {
569        self.0.into_byte_vec()
570    }
571
572    /// Convert this stream into a vector of Unicode strings and outputs.
573    ///
574    /// If any key is not valid UTF-8, then iteration on the stream is stopped
575    /// and a UTF-8 decoding error is returned.
576    ///
577    /// Note that this creates a new allocation for every key in the stream.
578    pub fn into_str_vec(self) -> Result<Vec<(String, u64)>> {
579        self.0.into_str_vec()
580    }
581
582    /// Convert this stream into a vector of byte strings.
583    ///
584    /// Note that this creates a new allocation for every key in the stream.
585    pub fn into_byte_keys(self) -> Vec<Vec<u8>> {
586        self.0.into_byte_keys()
587    }
588
589    /// Convert this stream into a vector of Unicode strings.
590    ///
591    /// If any key is not valid UTF-8, then iteration on the stream is stopped
592    /// and a UTF-8 decoding error is returned.
593    ///
594    /// Note that this creates a new allocation for every key in the stream.
595    pub fn into_str_keys(self) -> Result<Vec<String>> {
596        self.0.into_str_keys()
597    }
598
599    /// Convert this stream into a vector of outputs.
600    pub fn into_values(self) -> Vec<u64> {
601        self.0.into_values()
602    }
603}
604
605/// A lexicographically ordered stream of keys from a map.
606///
607/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
608pub struct Keys<'m>(raw::Stream<'m>);
609
610impl<'a, 'm> Streamer<'a> for Keys<'m> {
611    type Item = &'a [u8];
612
613    #[inline]
614    fn next(&'a mut self) -> Option<Self::Item> {
615        self.0.next().map(|(key, _)| key)
616    }
617}
618
619/// A stream of values from a map, lexicographically ordered by each value's
620/// corresponding key.
621///
622/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
623pub struct Values<'m>(raw::Stream<'m>);
624
625impl<'a, 'm> Streamer<'a> for Values<'m> {
626    type Item = u64;
627
628    #[inline]
629    fn next(&'a mut self) -> Option<Self::Item> {
630        self.0.next().map(|(_, out)| out.value())
631    }
632}
633
634/// A builder for constructing range queries on streams.
635///
636/// Once all bounds are set, one should call `into_stream` to get a
637/// `Stream`.
638///
639/// Bounds are not additive. That is, if `ge` is called twice on the same
640/// builder, then the second setting wins.
641///
642/// The `A` type parameter corresponds to an optional automaton to filter
643/// the stream. By default, no filtering is done.
644///
645/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
646pub struct StreamBuilder<'m, A = AlwaysMatch>(raw::StreamBuilder<'m, A>);
647
648impl<'m, A: Automaton> StreamBuilder<'m, A> {
649    /// Specify a greater-than-or-equal-to bound.
650    pub fn ge<T: AsRef<[u8]>>(self, bound: T) -> Self {
651        StreamBuilder(self.0.ge(bound))
652    }
653
654    /// Specify a greater-than bound.
655    pub fn gt<T: AsRef<[u8]>>(self, bound: T) -> Self {
656        StreamBuilder(self.0.gt(bound))
657    }
658
659    /// Specify a less-than-or-equal-to bound.
660    pub fn le<T: AsRef<[u8]>>(self, bound: T) -> Self {
661        StreamBuilder(self.0.le(bound))
662    }
663
664    /// Specify a less-than bound.
665    pub fn lt<T: AsRef<[u8]>>(self, bound: T) -> Self {
666        StreamBuilder(self.0.lt(bound))
667    }
668
669    /// Make it iterate backwards.
670    pub fn backward(self) -> Self {
671        StreamBuilder(self.0.backward())
672    }
673
674    /// Return this builder and gives the automaton states
675    /// along with the results.
676    pub fn with_state(self) -> StreamWithStateBuilder<'m, A> {
677        StreamWithStateBuilder(self.0.with_state())
678    }
679}
680
681impl<'m, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'m, A> {
682    type Item = (&'a [u8], u64);
683    type Into = Stream<'m, A>;
684
685    fn into_stream(self) -> Self::Into {
686        Stream(self.0.into_stream())
687    }
688}
689
690/// A builder for constructing range queries of streams
691/// that returns results along with automaton states.
692///
693/// Once all bounds are set, one should call `into_stream` to get a
694/// `StreamWithState`.
695///
696/// Bounds are not additive. That is, if `ge` is called twice on the same
697/// builder, then the second setting wins.
698///
699/// The `A` type parameter corresponds to an optional automaton to filter
700/// the stream. By default, no filtering is done.
701///
702/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
703pub struct StreamWithStateBuilder<'m, A = AlwaysMatch>(raw::StreamWithStateBuilder<'m, A>);
704
705impl<'m, 'a, A: 'a + Automaton> IntoStreamer<'a> for StreamWithStateBuilder<'m, A>
706where
707    A::State: Clone,
708{
709    type Item = (&'a [u8], u64, A::State);
710    type Into = StreamWithState<'m, A>;
711
712    fn into_stream(self) -> Self::Into {
713        StreamWithState(self.0.into_stream())
714    }
715}
716
717/// A builder for collecting map streams on which to perform set operations
718/// on the keys of maps.
719///
720/// Set operations include intersection, union, difference and symmetric
721/// difference. The result of each set operation is itself a stream that emits
722/// pairs of keys and a sequence of each occurrence of that key in the
723/// participating streams. This information allows one to perform set
724/// operations on maps and customize how conflicting output values are handled.
725///
726/// All set operations work efficiently on an arbitrary number of
727/// streams with memory proportional to the number of streams.
728///
729/// The algorithmic complexity of all set operations is `O(n1 + n2 + n3 + ...)`
730/// where `n1, n2, n3, ...` correspond to the number of elements in each
731/// stream.
732///
733/// The `'m` lifetime parameter refers to the lifetime of the underlying set.
734pub struct OpBuilder<'m>(raw::OpBuilder<'m>);
735
736impl<'m> OpBuilder<'m> {
737    /// Create a new set operation builder.
738    #[inline]
739    pub fn new() -> Self {
740        OpBuilder(raw::OpBuilder::default())
741    }
742
743    /// Add a stream to this set operation.
744    ///
745    /// This is useful for a chaining style pattern, e.g.,
746    /// `builder.add(stream1).add(stream2).union()`.
747    ///
748    /// The stream must emit a lexicographically ordered sequence of key-value
749    /// pairs.
750    pub fn add<I, S>(mut self, streamable: I) -> Self
751    where
752        I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
753        S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
754    {
755        self.push(streamable);
756        self
757    }
758
759    /// Add a stream to this set operation.
760    ///
761    /// The stream must emit a lexicographically ordered sequence of key-value
762    /// pairs.
763    pub fn push<I, S>(&mut self, streamable: I)
764    where
765        I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
766        S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
767    {
768        self.0.push(StreamOutput(streamable.into_stream()));
769    }
770
771    /// Performs a union operation on all streams that have been added.
772    ///
773    /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
774    /// first element of the tuple is the byte string key. The second element
775    /// of the tuple is a list of all occurrences of that key in participating
776    /// streams. The `IndexedValue` contains an index and the value associated
777    /// with that key in that stream. The index uniquely identifies each
778    /// stream, which is an integer that is auto-incremented when a stream
779    /// is added to this operation (starting at `0`).
780    ///
781    /// # Example
782    ///
783    /// ```rust
784    /// use lindera_fst::{IntoStreamer, Streamer, Map};
785    /// use lindera_fst::map::IndexedValue;
786    ///
787    /// let map1 = Map::from_iter(vec![
788    ///     ("a", 1), ("b", 2), ("c", 3),
789    /// ]).unwrap();
790    /// let map2 = Map::from_iter(vec![
791    ///     ("a", 11), ("y", 12), ("z", 13),
792    /// ]).unwrap();
793    ///
794    /// let mut union = map1.op().add(&map2).union();
795    ///
796    /// let mut kvs = vec![];
797    /// while let Some((k, vs)) = union.next() {
798    ///     kvs.push((k.to_vec(), vs.to_vec()));
799    /// }
800    /// assert_eq!(kvs, vec![
801    ///     (b"a".to_vec(), vec![
802    ///         IndexedValue { index: 0, value: 1 },
803    ///         IndexedValue { index: 1, value: 11 },
804    ///     ]),
805    ///     (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
806    ///     (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
807    ///     (b"y".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
808    ///     (b"z".to_vec(), vec![IndexedValue { index: 1, value: 13 }]),
809    /// ]);
810    /// ```
811    #[inline]
812    pub fn union(self) -> Union<'m> {
813        Union(self.0.union())
814    }
815
816    /// Performs an intersection operation on all streams that have been added.
817    ///
818    /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
819    /// first element of the tuple is the byte string key. The second element
820    /// of the tuple is a list of all occurrences of that key in participating
821    /// streams. The `IndexedValue` contains an index and the value associated
822    /// with that key in that stream. The index uniquely identifies each
823    /// stream, which is an integer that is auto-incremented when a stream
824    /// is added to this operation (starting at `0`).
825    ///
826    /// # Example
827    ///
828    /// ```rust
829    /// use lindera_fst::{IntoStreamer, Streamer, Map};
830    /// use lindera_fst::map::IndexedValue;
831    ///
832    /// let map1 = Map::from_iter(vec![
833    ///     ("a", 1), ("b", 2), ("c", 3),
834    /// ]).unwrap();
835    /// let map2 = Map::from_iter(vec![
836    ///     ("a", 11), ("y", 12), ("z", 13),
837    /// ]).unwrap();
838    ///
839    /// let mut intersection = map1.op().add(&map2).intersection();
840    ///
841    /// let mut kvs = vec![];
842    /// while let Some((k, vs)) = intersection.next() {
843    ///     kvs.push((k.to_vec(), vs.to_vec()));
844    /// }
845    /// assert_eq!(kvs, vec![
846    ///     (b"a".to_vec(), vec![
847    ///         IndexedValue { index: 0, value: 1 },
848    ///         IndexedValue { index: 1, value: 11 },
849    ///     ]),
850    /// ]);
851    /// ```
852    #[inline]
853    pub fn intersection(self) -> Intersection<'m> {
854        Intersection(self.0.intersection())
855    }
856
857    /// Performs a difference operation with respect to the first stream added.
858    /// That is, this returns a stream of all elements in the first stream
859    /// that don't exist in any other stream that has been added.
860    ///
861    /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
862    /// first element of the tuple is the byte string key. The second element
863    /// of the tuple is a list of all occurrences of that key in participating
864    /// streams. The `IndexedValue` contains an index and the value associated
865    /// with that key in that stream. The index uniquely identifies each
866    /// stream, which is an integer that is auto-incremented when a stream
867    /// is added to this operation (starting at `0`).
868    ///
869    /// # Example
870    ///
871    /// ```rust
872    /// use lindera_fst::{Streamer, Map};
873    /// use lindera_fst::map::IndexedValue;
874    ///
875    /// let map1 = Map::from_iter(vec![
876    ///     ("a", 1), ("b", 2), ("c", 3),
877    /// ]).unwrap();
878    /// let map2 = Map::from_iter(vec![
879    ///     ("a", 11), ("y", 12), ("z", 13),
880    /// ]).unwrap();
881    ///
882    /// let mut difference = map1.op().add(&map2).difference();
883    ///
884    /// let mut kvs = vec![];
885    /// while let Some((k, vs)) = difference.next() {
886    ///     kvs.push((k.to_vec(), vs.to_vec()));
887    /// }
888    /// assert_eq!(kvs, vec![
889    ///     (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
890    ///     (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
891    /// ]);
892    /// ```
893    #[inline]
894    pub fn difference(self) -> Difference<'m> {
895        Difference(self.0.difference())
896    }
897
898    /// Performs a symmetric difference operation on all of the streams that
899    /// have been added.
900    ///
901    /// When there are only two streams, then the keys returned correspond to
902    /// keys that are in either stream but *not* in both streams.
903    ///
904    /// More generally, for any number of streams, keys that occur in an odd
905    /// number of streams are returned.
906    ///
907    /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
908    /// first element of the tuple is the byte string key. The second element
909    /// of the tuple is a list of all occurrences of that key in participating
910    /// streams. The `IndexedValue` contains an index and the value associated
911    /// with that key in that stream. The index uniquely identifies each
912    /// stream, which is an integer that is auto-incremented when a stream
913    /// is added to this operation (starting at `0`).
914    ///
915    /// # Example
916    ///
917    /// ```rust
918    /// use lindera_fst::{IntoStreamer, Streamer, Map};
919    /// use lindera_fst::map::IndexedValue;
920    ///
921    /// let map1 = Map::from_iter(vec![
922    ///     ("a", 1), ("b", 2), ("c", 3),
923    /// ]).unwrap();
924    /// let map2 = Map::from_iter(vec![
925    ///     ("a", 11), ("y", 12), ("z", 13),
926    /// ]).unwrap();
927    ///
928    /// let mut sym_difference = map1.op().add(&map2).symmetric_difference();
929    ///
930    /// let mut kvs = vec![];
931    /// while let Some((k, vs)) = sym_difference.next() {
932    ///     kvs.push((k.to_vec(), vs.to_vec()));
933    /// }
934    /// assert_eq!(kvs, vec![
935    ///     (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
936    ///     (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
937    ///     (b"y".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
938    ///     (b"z".to_vec(), vec![IndexedValue { index: 1, value: 13 }]),
939    /// ]);
940    /// ```
941    #[inline]
942    pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
943        SymmetricDifference(self.0.symmetric_difference())
944    }
945}
946
947impl<'f, I, S> Extend<I> for OpBuilder<'f>
948where
949    I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
950    S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
951{
952    fn extend<T>(&mut self, it: T)
953    where
954        T: IntoIterator<Item = I>,
955    {
956        for stream in it {
957            self.push(stream);
958        }
959    }
960}
961
962impl<'f, I, S> FromIterator<I> for OpBuilder<'f>
963where
964    I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
965    S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
966{
967    fn from_iter<T>(it: T) -> Self
968    where
969        T: IntoIterator<Item = I>,
970    {
971        let mut op = OpBuilder::new();
972        op.extend(it);
973        op
974    }
975}
976
977/// A stream of set union over multiple map streams in lexicographic order.
978///
979/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
980pub struct Union<'m>(raw::Union<'m>);
981
982impl<'a, 'm> Streamer<'a> for Union<'m> {
983    type Item = (&'a [u8], &'a [IndexedValue]);
984
985    #[inline]
986    fn next(&'a mut self) -> Option<Self::Item> {
987        self.0.next()
988    }
989}
990
991/// A stream of set intersection over multiple map streams in lexicographic
992/// order.
993///
994/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
995pub struct Intersection<'m>(raw::Intersection<'m>);
996
997impl<'a, 'm> Streamer<'a> for Intersection<'m> {
998    type Item = (&'a [u8], &'a [IndexedValue]);
999
1000    #[inline]
1001    fn next(&'a mut self) -> Option<Self::Item> {
1002        self.0.next()
1003    }
1004}
1005
1006/// A stream of set difference over multiple map streams in lexicographic
1007/// order.
1008///
1009/// The difference operation is taken with respect to the first stream and the
1010/// rest of the streams. i.e., All elements in the first stream that do not
1011/// appear in any other streams.
1012///
1013/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1014pub struct Difference<'m>(raw::Difference<'m>);
1015
1016impl<'a, 'm> Streamer<'a> for Difference<'m> {
1017    type Item = (&'a [u8], &'a [IndexedValue]);
1018
1019    #[inline]
1020    fn next(&'a mut self) -> Option<Self::Item> {
1021        self.0.next()
1022    }
1023}
1024
1025/// A stream of set symmetric difference over multiple map streams in
1026/// lexicographic order.
1027///
1028/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1029pub struct SymmetricDifference<'m>(raw::SymmetricDifference<'m>);
1030
1031impl<'a, 'm> Streamer<'a> for SymmetricDifference<'m> {
1032    type Item = (&'a [u8], &'a [IndexedValue]);
1033
1034    #[inline]
1035    fn next(&'a mut self) -> Option<Self::Item> {
1036        self.0.next()
1037    }
1038}
1039
1040/// A specialized stream for mapping map streams (`(&[u8], u64)`) to streams
1041/// used by raw fsts (`(&[u8], Output)`).
1042///
1043/// If this were iterators, we could use `iter::Map`, but doing this on streams
1044/// requires HKT, so we need to write out the monomorphization ourselves.
1045struct StreamOutput<S>(S);
1046
1047impl<'a, S> Streamer<'a> for StreamOutput<S>
1048where
1049    S: Streamer<'a, Item = (&'a [u8], u64)>,
1050{
1051    type Item = (&'a [u8], raw::Output);
1052
1053    fn next(&'a mut self) -> Option<Self::Item> {
1054        self.0.next().map(|(k, v)| (k, raw::Output::new(v)))
1055    }
1056}
1057
1058/// A lexicographically ordered stream of key-value from a map
1059/// along with the states of the automaton.
1060///
1061/// The `Stream` type is based on the `StreamWithState`.
1062pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>)
1063where
1064    A: Automaton;
1065
1066impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A>
1067where
1068    A::State: Clone,
1069{
1070    type Item = (&'a [u8], u64, A::State);
1071
1072    fn next(&'a mut self) -> Option<Self::Item> {
1073        self.0
1074            .next()
1075            .map(|(key, out, state)| (key, out.value(), state))
1076    }
1077}